[Pkg-ceph-commits] [ceph] 01/02: Imported Upstream version 10.0.1

James Downing Page jamespage at moszumanska.debian.org
Tue Feb 9 11:05:09 UTC 2016


This is an automated email from the git hooks/post-receive script.

jamespage pushed a commit to branch upstream
in repository ceph.

commit 398a4cbf907ee608f9bfc7e0beaf60435a7e4b24
Author: James Page <james.page at ubuntu.com>
Date:   Mon Jan 25 15:11:54 2016 +0000

    Imported Upstream version 10.0.1
---
 AUTHORS                                            |    68 +-
 COPYING                                            |     5 +
 ChangeLog                                          |   803 +-
 Makefile.am                                        |     5 +
 Makefile.in                                        |   414 +-
 autogen.sh                                         |     1 -
 ceph.spec                                          |    20 +-
 ceph.spec.in                                       |    18 +-
 configure                                          |   178 +-
 configure.ac                                       |    44 +-
 doc/Makefile.in                                    |     1 +
 doc/man/8/ceph-disk.rst                            |    81 +-
 doc/man/8/ceph.rst                                 |    13 +-
 doc/man/8/rbd.rst                                  |    17 +-
 install-deps.sh                                    |     5 +-
 man/Makefile.in                                    |     1 +
 man/ceph-authtool.8                                |     2 +-
 man/ceph-clsinfo.8                                 |     2 +-
 man/ceph-conf.8                                    |     2 +-
 man/ceph-create-keys.8                             |     2 +-
 man/ceph-debugpack.8                               |     2 +-
 man/ceph-dencoder.8                                |     2 +-
 man/ceph-deploy.8                                  |     2 +-
 man/ceph-detect-init.8                             |     2 +-
 man/ceph-disk.8                                    |   143 +-
 man/ceph-fuse.8                                    |     2 +-
 man/ceph-mds.8                                     |     2 +-
 man/ceph-mon.8                                     |     2 +-
 man/ceph-osd.8                                     |     2 +-
 man/ceph-post-file.8                               |     2 +-
 man/ceph-rbdnamer.8                                |     2 +-
 man/ceph-rest-api.8                                |     2 +-
 man/ceph-run.8                                     |     2 +-
 man/ceph-syn.8                                     |     2 +-
 man/ceph.8                                         |    23 +-
 man/cephfs.8                                       |     2 +-
 man/crushtool.8                                    |     2 +-
 man/librados-config.8                              |     2 +-
 man/monmaptool.8                                   |     2 +-
 man/mount.ceph.8                                   |     2 +-
 man/osdmaptool.8                                   |     2 +-
 man/rados.8                                        |     2 +-
 man/radosgw-admin.8                                |     2 +-
 man/radosgw.8                                      |     2 +-
 man/rbd-fuse.8                                     |     2 +-
 man/rbd-replay-many.8                              |     2 +-
 man/rbd-replay-prep.8                              |     2 +-
 man/rbd-replay.8                                   |     2 +-
 man/rbd.8                                          |    24 +-
 selinux/Makefile.in                                |     1 +
 src/.git_version                                   |     4 +-
 src/Makefile-client.am                             |    10 +-
 src/Makefile-env.am                                |    56 +-
 src/Makefile-rocksdb.am                            |  1217 +-
 src/Makefile-server.am                             |    10 +-
 src/Makefile.am                                    |     5 +-
 src/Makefile.in                                    |  5721 +++---
 src/acconfig.h.in                                  |     9 +
 src/auth/cephx/CephxSessionHandler.cc              |     5 +-
 src/bash_completion/rbd                            |    97 +-
 src/ceph-disk                                      |   478 +-
 src/ceph.in                                        |    27 +-
 src/ceph_fuse.cc                                   |     9 +-
 src/ceph_mds.cc                                    |     2 +-
 src/ceph_mon.cc                                    |     6 +-
 src/ceph_osd.cc                                    |     2 +-
 src/ceph_syn.cc                                    |     4 -
 src/client/Client.cc                               |   172 +-
 src/client/Client.h                                |    26 +-
 src/client/Inode.h                                 |     3 +
 src/client/ObjecterWriteback.h                     |     7 +-
 src/client/SyntheticClient.cc                      |    20 +-
 src/client/Trace.h                                 |     2 +-
 src/client/fuse_ll.cc                              |     1 +
 src/cls/Makefile-client.am                         |    10 +-
 src/cls/Makefile-server.am                         |    10 +-
 src/cls/journal/cls_journal.cc                     |   619 +
 src/cls/journal/cls_journal_client.cc              |   274 +
 src/cls/journal/cls_journal_client.h               |    51 +
 src/cls/journal/cls_journal_types.cc               |   155 +
 src/cls/journal/cls_journal_types.h                |   108 +
 src/cls/rbd/cls_rbd.cc                             |   193 +-
 src/cls/rbd/cls_rbd_client.cc                      |    27 +
 src/cls/rbd/cls_rbd_client.h                       |     9 +
 src/common/Makefile.am                             |    29 +-
 src/common/MemoryModel.cc                          |     3 +-
 src/common/SubProcess.h                            |    25 +-
 src/common/Thread.h                                |     4 +-
 src/common/Throttle.cc                             |     4 +-
 src/common/TrackedOp.cc                            |    45 +-
 src/common/TrackedOp.h                             |    21 +-
 src/common/WorkQueue.h                             |   130 +-
 src/common/buffer.cc                               |     2 +-
 src/common/ceph_context.cc                         |     8 +-
 src/common/ceph_context.h                          |     6 +-
 src/common/common_init.cc                          |     6 +-
 src/common/common_init.h                           |     2 +-
 src/common/config.cc                               |    16 +
 src/common/config.h                                |     2 +
 src/common/config_opts.h                           |    41 +-
 src/common/obj_bencher.cc                          |    24 +-
 src/common/obj_bencher.h                           |     6 +-
 src/common/run_cmd.cc                              |     3 +
 src/common/sctp_crc32.c                            |     3 +-
 src/common/solaris_errno.cc                        |   227 +
 src/crush/CrushTester.cc                           |     4 +-
 src/crush/mapper.c                                 |    15 +-
 src/erasure-code/isa/Makefile.am                   |     2 +-
 src/erasure-code/jerasure/Makefile.am              |    10 +-
 src/erasure-code/lrc/Makefile.am                   |     2 +-
 src/erasure-code/shec/Makefile.am                  |    10 +-
 src/{rbdmap => etc-rbdmap}                         |     0
 src/global/Makefile.am                             |     3 +-
 src/global/global_init.cc                          |    12 +-
 src/global/global_init.h                           |     6 +-
 src/global/signal_handler.cc                       |    15 +
 src/include/assert.h                               |     8 +
 src/include/buffer.h                               |     2 +-
 src/include/byteorder.h                            |    11 +
 src/include/ceph_features.h                        |     6 +
 src/include/ceph_fs.h                              |     3 +-
 src/include/cephfs/libcephfs.h                     |    18 +-
 src/include/compat.h                               |     9 +-
 src/include/encoding.h                             |     2 +
 src/include/rados.h                                |     4 +
 src/include/rados/buffer.h                         |     2 +-
 src/include/rados/librados.h                       |    58 +-
 src/include/rados/librados.hpp                     |    25 +
 src/include/rbd/features.h                         |    13 +-
 src/include/rbd/librbd.h                           |    39 +
 src/include/rbd/librbd.hpp                         |    30 +
 src/include/sock_compat.h                          |    12 -
 src/include/types.h                                |    35 +
 src/init-ceph.in                                   |    42 +-
 src/init-radosgw                                   |     8 +-
 src/init-rbdmap                                    |   105 +-
 src/java/Makefile.in                               |     1 +
 src/journal/AsyncOpTracker.cc                      |    39 +
 src/journal/AsyncOpTracker.h                       |    32 +
 src/journal/Entry.cc                               |   156 +
 src/journal/Entry.h                                |    62 +
 src/journal/Future.cc                              |    40 +
 src/journal/Future.h                               |    58 +
 src/journal/FutureImpl.cc                          |   149 +
 src/journal/FutureImpl.h                           |   128 +
 src/journal/JournalMetadata.cc                     |   493 +
 src/journal/JournalMetadata.h                      |   324 +
 src/journal/JournalPlayer.cc                       |   407 +
 src/journal/JournalPlayer.h                        |   136 +
 src/journal/JournalRecorder.cc                     |   182 +
 src/journal/JournalRecorder.h                      |   117 +
 src/journal/JournalTrimmer.cc                      |   204 +
 src/journal/JournalTrimmer.h                       |    83 +
 src/journal/Journaler.cc                           |   230 +
 src/journal/Journaler.h                            |    94 +
 src/journal/Makefile.am                            |    37 +
 src/journal/ObjectPlayer.cc                        |   248 +
 src/journal/ObjectPlayer.h                         |   133 +
 src/journal/ObjectRecorder.cc                      |   310 +
 src/journal/ObjectRecorder.h                       |   149 +
 src/journal/ReplayEntry.h                          |    34 +
 src/journal/ReplayHandler.h                        |    21 +
 src/journal/Utils.cc                               |    25 +
 src/journal/Utils.h                                |    23 +
 src/krbd.cc                                        |    13 +-
 src/{os => kv}/KeyValueDB.cc                       |     0
 src/kv/KeyValueDB.h                                |   277 +
 src/kv/KineticStore.cc                             |   348 +
 src/kv/KineticStore.h                              |   161 +
 src/kv/LevelDBStore.cc                             |   349 +
 src/kv/LevelDBStore.h                              |   415 +
 src/kv/Makefile.am                                 |    40 +
 src/kv/RocksDBStore.cc                             |   587 +
 src/kv/RocksDBStore.h                              |   285 +
 src/libcephfs.cc                                   |     4 +-
 src/librados/IoCtxImpl.cc                          |   242 +-
 src/librados/IoCtxImpl.h                           |     6 +
 src/librados/RadosClient.cc                        |    54 +-
 src/librados/RadosClient.h                         |     2 +
 src/librados/RadosXattrIter.cc                     |     2 +
 src/librados/librados.cc                           |   118 +-
 src/librbd/AioCompletion.cc                        |    35 +-
 src/librbd/AioCompletion.h                         |    73 +-
 src/librbd/AioImageRequest.cc                      |   445 +
 src/librbd/AioImageRequest.h                       |   211 +
 src/librbd/AioImageRequestWQ.cc                    |   303 +
 src/librbd/AioImageRequestWQ.h                     |    90 +
 src/librbd/AioObjectRequest.cc                     |   565 +
 src/librbd/AioObjectRequest.h                      |   349 +
 src/librbd/AioRequest.cc                           |   526 -
 src/librbd/AioRequest.h                            |   347 -
 src/librbd/AsyncFlattenRequest.cc                  |     6 +-
 src/librbd/AsyncOperation.cc                       |    40 +-
 src/librbd/AsyncResizeRequest.cc                   |     8 +-
 src/librbd/AsyncTrimRequest.cc                     |    16 +-
 src/librbd/CopyupRequest.cc                        |    16 +-
 src/librbd/CopyupRequest.h                         |     4 +-
 src/librbd/ImageCtx.cc                             |   233 +-
 src/librbd/ImageCtx.h                              |    26 +-
 src/librbd/ImageWatcher.cc                         |   452 +-
 src/librbd/ImageWatcher.h                          |   507 +-
 src/librbd/Journal.cc                              |   635 +
 src/librbd/Journal.h                               |   209 +
 src/librbd/JournalReplay.cc                        |   120 +
 src/librbd/JournalReplay.h                         |    66 +
 src/librbd/JournalTypes.cc                         |   192 +
 src/librbd/JournalTypes.h                          |   107 +
 src/librbd/LibrbdAdminSocketHook.cc                |    97 +
 src/librbd/LibrbdAdminSocketHook.h                 |    31 +
 src/librbd/LibrbdWriteback.cc                      |   117 +-
 src/librbd/LibrbdWriteback.h                       |    11 +-
 src/librbd/Makefile.am                             |    19 +-
 src/librbd/ObjectMap.cc                            |     4 +
 src/librbd/ObjectMap.h                             |     1 +
 src/librbd/WatchNotifyTypes.cc                     |    45 +-
 src/librbd/WatchNotifyTypes.h                      |    65 +-
 src/librbd/internal.cc                             |  1342 +-
 src/librbd/internal.h                              |    66 +-
 src/librbd/librbd.cc                               |   418 +-
 src/mds/CDentry.cc                                 |    53 +
 src/mds/CDentry.h                                  |    51 +
 src/mds/CDir.cc                                    |   261 +-
 src/mds/CDir.h                                     |   103 +
 src/mds/CInode.cc                                  |   263 +-
 src/mds/CInode.h                                   |    91 +-
 src/mds/Locker.cc                                  |     6 +
 src/mds/MDCache.cc                                 |   122 +-
 src/mds/MDCache.h                                  |    21 +
 src/mds/MDSAuthCaps.cc                             |   152 +-
 src/mds/MDSAuthCaps.h                              |   102 +-
 src/mds/MDSContinuation.h                          |    12 +-
 src/mds/MDSDaemon.cc                               |    38 +-
 src/mds/MDSRank.cc                                 |   185 +-
 src/mds/MDSRank.h                                  |    44 +-
 src/mds/Makefile-server.am                         |     2 +
 src/mds/Makefile.am                                |     4 +-
 src/mds/ScrubHeader.h                              |    23 +
 src/mds/ScrubStack.cc                              |   447 +
 src/mds/ScrubStack.h                               |   201 +
 src/mds/Server.cc                                  |   115 +-
 src/mds/Server.h                                   |     2 +
 src/mds/SessionMap.cc                              |   350 +-
 src/mds/SessionMap.h                               |    37 +-
 src/mds/mdstypes.cc                                |    40 +-
 src/mds/mdstypes.h                                 |    22 +-
 src/messages/MAuthReply.h                          |     2 +-
 src/messages/MClientCaps.h                         |    22 +-
 src/messages/MCommandReply.h                       |     2 +-
 src/messages/MLog.h                                |     3 +-
 src/messages/MMonCommandAck.h                      |     2 +-
 src/messages/MOSDOp.h                              |   276 +-
 src/messages/MOSDOpReply.h                         |     4 +-
 src/messages/MOSDPGCreate.h                        |     8 +-
 src/messages/MRoute.h                              |    41 +-
 src/messages/MWatchNotify.h                        |     2 +-
 src/mon/Makefile.am                                |     6 +-
 src/mon/MonCap.cc                                  |     2 +
 src/mon/MonClient.cc                               |    37 +-
 src/mon/MonClient.h                                |    55 +-
 src/mon/MonCommands.h                              |     4 +-
 src/mon/Monitor.cc                                 |    74 +-
 src/mon/MonitorDBStore.h                           |     3 +-
 src/mon/OSDMonitor.cc                              |   114 +-
 src/mon/OSDMonitor.h                               |    21 +-
 src/mon/PGMap.cc                                   |    18 +-
 src/mon/PGMap.h                                    |     4 +-
 src/mon/PGMonitor.cc                               |   369 +-
 src/mon/PGMonitor.h                                |    22 +-
 src/mon/Session.h                                  |     2 +-
 src/msg/Connection.h                               |    18 +-
 src/msg/async/AsyncConnection.cc                   |   109 +-
 src/msg/async/AsyncConnection.h                    |    11 +
 src/msg/async/Event.cc                             |     2 +-
 src/msg/async/net_handler.cc                       |    16 +-
 src/msg/async/net_handler.h                        |     9 +
 src/msg/msg_types.h                                |    47 +-
 src/msg/simple/Pipe.cc                             |   107 +-
 src/msg/simple/Pipe.h                              |     9 +
 src/ocf/Makefile.in                                |     1 +
 src/os/DBObjectMap.cc                              |    15 +-
 src/os/DBObjectMap.h                               |     6 +-
 src/os/FileJournal.cc                              |   141 +-
 src/os/FileJournal.h                               |    22 +-
 src/os/FileStore.cc                                |   101 +-
 src/os/FileStore.h                                 |    14 +-
 src/os/GenericObjectMap.cc                         |     2 +-
 src/os/GenericObjectMap.h                          |     6 +-
 src/os/IndexManager.cc                             |     2 +-
 src/os/Journal.h                                   |     5 +-
 src/os/JournalingObjectStore.cc                    |    22 +-
 src/os/JournalingObjectStore.h                     |     9 +-
 src/os/KeyValueDB.h                                |   220 -
 src/os/KeyValueStore.cc                            |     4 +-
 src/os/KeyValueStore.h                             |     2 +-
 src/os/KineticStore.cc                             |   329 -
 src/os/KineticStore.h                              |   160 -
 src/os/LFNIndex.cc                                 |     1 -
 src/os/LevelDBStore.cc                             |   306 -
 src/os/LevelDBStore.h                              |   402 -
 src/os/Makefile.am                                 |    55 +-
 src/os/MemStore.cc                                 |    36 +-
 src/os/MemStore.h                                  |     7 +-
 src/os/ObjectMap.h                                 |    14 +-
 src/os/ObjectStore.cc                              |    35 +
 src/os/ObjectStore.h                               |    57 +-
 src/os/RocksDBStore.cc                             |   518 -
 src/os/RocksDBStore.h                              |   280 -
 src/os/chain_xattr.cc                              |     8 +-
 src/os/chain_xattr.h                               |     6 +-
 src/os/fs/FS.cc                                    |    41 +-
 src/os/newstore/NewStore.cc                        |    73 +-
 src/os/newstore/NewStore.h                         |    11 +-
 src/osd/ClassHandler.h                             |     2 +-
 src/osd/ECBackend.cc                               |    22 +-
 src/osd/ECBackend.h                                |     1 +
 src/osd/Makefile.am                                |    12 +-
 src/osd/OSD.cc                                     |   737 +-
 src/osd/OSD.h                                      |   100 +-
 src/osd/OSDMap.cc                                  |    34 +-
 src/osd/OSDMap.h                                   |    17 +-
 src/osd/OpRequest.cc                               |     1 +
 src/osd/PG.cc                                      |    84 +-
 src/osd/PG.h                                       |    10 +-
 src/osd/PGBackend.cc                               |     5 +-
 src/osd/PGBackend.h                                |     9 +
 src/osd/PGLog.cc                                   |     2 +-
 src/osd/PGLog.h                                    |    15 +-
 src/osd/ReplicatedBackend.cc                       |    28 +-
 src/osd/ReplicatedPG.cc                            |   750 +-
 src/osd/ReplicatedPG.h                             |    25 +-
 src/osd/TierAgentState.h                           |     4 -
 src/osd/osd_types.cc                               |    62 +-
 src/osd/osd_types.h                                |    45 +-
 src/osdc/ObjectCacher.cc                           |    66 +-
 src/osdc/ObjectCacher.h                            |    38 +-
 src/osdc/Objecter.cc                               |    54 +-
 src/osdc/Objecter.h                                |    20 +-
 src/osdc/WritebackHandler.h                        |     5 +-
 src/pybind/rados.py                                |   302 +-
 src/pybind/rbd.py                                  |   299 +-
 src/rbd.cc                                         |  4115 ----
 src/rbdmap                                         |   117 +-
 src/rgw/rgw_acl_s3.cc                              |     2 +-
 src/rgw/rgw_admin.cc                               |    60 +-
 src/rgw/rgw_auth_s3.cc                             |     3 +
 src/rgw/rgw_bucket.cc                              |    21 +-
 src/rgw/rgw_civetweb.cc                            |     7 +
 src/rgw/rgw_common.cc                              |    31 +
 src/rgw/rgw_common.h                               |     9 +-
 src/rgw/rgw_http_client.cc                         |     2 +-
 src/rgw/rgw_json_enc.cc                            |     2 +
 src/rgw/rgw_main.cc                                |     3 +-
 src/rgw/rgw_metadata.cc                            |     6 +-
 src/rgw/rgw_object_expirer.cc                      |     2 +-
 src/rgw/rgw_op.cc                                  |    88 +-
 src/rgw/rgw_op.h                                   |    42 +
 src/rgw/rgw_quota.cc                               |    46 +-
 src/rgw/rgw_rados.cc                               |    31 +-
 src/rgw/rgw_rest.cc                                |    49 +-
 src/rgw/rgw_rest_s3.cc                             |   123 +-
 src/rgw/rgw_rest_s3.h                              |    21 +
 src/rgw/rgw_rest_swift.cc                          |    45 +-
 src/rgw/rgw_rest_swift.h                           |     3 +
 src/rgw/rgw_tools.cc                               |     5 +-
 src/rgw/rgw_user.cc                                |    37 +-
 src/rgw/rgw_user.h                                 |     8 +-
 src/rocksdb/.arcconfig                             |    17 +
 src/rocksdb/.clang-format                          |     5 +
 src/rocksdb/.gitignore                             |    39 +-
 src/rocksdb/.travis.yml                            |    43 +
 src/rocksdb/CMakeLists.txt                         |   386 +
 src/rocksdb/DUMP_FORMAT.md                         |    16 +
 src/rocksdb/HISTORY.md                             |    82 +-
 src/rocksdb/INSTALL.md                             |     7 +-
 src/rocksdb/Makefile                               |  1165 ++
 src/rocksdb/Makefile.am                            |   383 -
 src/rocksdb/ROCKSDB_LITE.md                        |     1 +
 src/rocksdb/USERS.md                               |     5 +-
 src/rocksdb/Vagrantfile                            |    33 +
 src/rocksdb/WINDOWS_PORT.md                        |   228 +
 src/rocksdb/appveyor.yml                           |    11 +
 src/rocksdb/appveyordailytests.yml                 |    22 +
 .../arcanist_util/__phutil_library_init__.php      |     3 +
 .../arcanist_util/__phutil_library_map__.php       |    38 +
 .../config/FacebookArcanistConfiguration.php       |    35 +
 .../cpp_linter/ArcanistCpplintLinter.php           |    88 +
 .../cpp_linter/BaseDirectoryScopedFormatLinter.php |    74 +
 .../cpp_linter/FacebookHowtoevenLinter.php         |   223 +
 .../cpp_linter/FbcodeClangFormatLinter.php         |    58 +
 .../arcanist_util/cpp_linter/FbcodeCppLinter.php   |   123 +
 src/rocksdb/arcanist_util/cpp_linter/cpplint.py    |  4767 +++++
 .../lint_engine/FacebookFbcodeLintEngine.php       |   140 +
 .../lint_engine/FacebookHowtoevenLintEngine.php    |    27 +
 .../unit_engine/FacebookFbcodeUnitTestEngine.php   |    21 +
 src/rocksdb/build_tools/amalgamate.py              |   110 +
 src/rocksdb/build_tools/build_detect_platform      |   391 +
 src/rocksdb/build_tools/dockerbuild.sh             |     2 +
 src/rocksdb/build_tools/fb_compile_mongo.sh        |    55 +
 src/rocksdb/build_tools/fbcode_config.sh           |   133 +
 src/rocksdb/build_tools/fbcode_config4.8.1.sh      |   110 +
 src/rocksdb/build_tools/format-diff.sh             |   113 +
 src/rocksdb/build_tools/make_new_version.sh        |    46 +
 src/rocksdb/build_tools/make_package.sh            |   116 +
 src/rocksdb/build_tools/regression_build_test.sh   |   428 +
 src/rocksdb/build_tools/rocksdb-lego-determinator  |   587 +
 src/rocksdb/build_tools/run_ci_db_test.ps1         |   252 +
 src/rocksdb/build_tools/version.sh                 |    14 +
 src/rocksdb/configure.ac                           |    87 -
 src/rocksdb/coverage/coverage_test.sh              |    78 +
 src/rocksdb/coverage/parse_gcov_output.py          |   118 +
 src/rocksdb/db/builder.cc                          |   238 +-
 src/rocksdb/db/builder.h                           |    10 +-
 src/rocksdb/db/c.cc                                |   406 +-
 src/rocksdb/db/c_test.c                            |   224 +-
 src/rocksdb/db/column_family.cc                    |   231 +-
 src/rocksdb/db/column_family.h                     |    26 +-
 src/rocksdb/db/column_family_test.cc               |   324 +-
 src/rocksdb/db/compact_files_test.cc               |    28 +-
 src/rocksdb/db/compacted_db_impl.cc                |   163 +
 src/rocksdb/db/compacted_db_impl.h                 |    95 +
 src/rocksdb/db/compaction.cc                       |   144 +-
 src/rocksdb/db/compaction.h                        |    66 +-
 src/rocksdb/db/compaction_iterator.cc              |   338 +
 src/rocksdb/db/compaction_iterator.h               |   138 +
 src/rocksdb/db/compaction_iterator_test.cc         |    71 +
 src/rocksdb/db/compaction_job.cc                   |  1597 +-
 src/rocksdb/db/compaction_job.h                    |    94 +-
 src/rocksdb/db/compaction_job_stats_test.cc        |  1045 +
 src/rocksdb/db/compaction_job_test.cc              |   654 +-
 src/rocksdb/db/compaction_picker.cc                |   171 +-
 src/rocksdb/db/compaction_picker.h                 |    30 +-
 src/rocksdb/db/compaction_picker_test.cc           |   407 +-
 src/rocksdb/db/comparator_db_test.cc               |    24 +-
 src/rocksdb/db/convenience.cc                      |    23 +
 src/rocksdb/db/corruption_test.cc                  |    59 +-
 src/rocksdb/db/cuckoo_table_db_test.cc             |    16 +-
 src/rocksdb/db/db_bench.cc                         |   998 +-
 src/rocksdb/db/db_compaction_filter_test.cc        |   586 +
 src/rocksdb/db/db_compaction_test.cc               |  1858 ++
 src/rocksdb/db/db_dynamic_level_test.cc            |   497 +
 src/rocksdb/db/db_filesnapshot.cc                  |     2 +
 src/rocksdb/db/db_impl.cc                          |  1857 +-
 src/rocksdb/db/db_impl.h                           |   250 +-
 src/rocksdb/db/db_impl_debug.cc                    |    18 +-
 src/rocksdb/db/db_impl_experimental.cc             |     7 +-
 src/rocksdb/db/db_impl_readonly.cc                 |     3 +-
 src/rocksdb/db/db_impl_readonly.h                  |    18 +-
 src/rocksdb/db/db_inplace_update_test.cc           |   171 +
 src/rocksdb/db/db_iter.cc                          |   188 +-
 src/rocksdb/db/db_iter_test.cc                     |   906 +-
 src/rocksdb/db/db_log_iter_test.cc                 |   290 +
 src/rocksdb/db/db_tailing_iter_test.cc             |   659 +
 src/rocksdb/db/db_test.cc                          | 19658 ++++++++-----------
 src/rocksdb/db/db_universal_compaction_test.cc     |  1223 ++
 src/rocksdb/db/db_wal_test.cc                      |   144 +
 src/rocksdb/db/dbformat.cc                         |    20 +-
 src/rocksdb/db/dbformat.h                          |    80 +-
 src/rocksdb/db/dbformat_test.cc                    |    19 +
 src/rocksdb/db/deletefile_test.cc                  |    36 +-
 src/rocksdb/db/event_helpers.cc                    |   108 +
 src/rocksdb/db/event_helpers.h                     |    33 +
 src/rocksdb/db/event_logger_helpers.cc             |    46 -
 src/rocksdb/db/event_logger_helpers.h              |    18 -
 src/rocksdb/db/fault_injection_test.cc             |   206 +-
 src/rocksdb/db/file_indexer.h                      |     4 +-
 src/rocksdb/db/filename.cc                         |    43 +-
 src/rocksdb/db/filename.h                          |    16 +-
 src/rocksdb/db/flush_job.cc                        |   123 +-
 src/rocksdb/db/flush_job.h                         |    21 +-
 src/rocksdb/db/flush_job_test.cc                   |    98 +-
 src/rocksdb/db/forward_iterator.cc                 |   188 +-
 src/rocksdb/db/forward_iterator.h                  |    26 +-
 src/rocksdb/db/internal_stats.cc                   |   240 +-
 src/rocksdb/db/internal_stats.h                    |   127 +-
 src/rocksdb/db/job_context.h                       |     5 +
 src/rocksdb/db/listener_test.cc                    |   133 +-
 src/rocksdb/db/log_reader.cc                       |    27 +-
 src/rocksdb/db/log_reader.h                        |    14 +-
 src/rocksdb/db/log_test.cc                         |   136 +-
 src/rocksdb/db/log_writer.cc                       |     6 +-
 src/rocksdb/db/log_writer.h                        |    16 +-
 src/rocksdb/db/managed_iterator.cc                 |     1 +
 src/rocksdb/db/memtable.cc                         |   115 +-
 src/rocksdb/db/memtable.h                          |    45 +-
 src/rocksdb/db/memtable_list.cc                    |   163 +-
 src/rocksdb/db/memtable_list.h                     |   105 +-
 src/rocksdb/db/memtable_list_test.cc               |   295 +-
 src/rocksdb/db/memtablerep_bench.cc                |     9 +-
 src/rocksdb/db/merge_helper.cc                     |   259 +-
 src/rocksdb/db/merge_helper.h                      |   104 +-
 src/rocksdb/db/merge_helper_test.cc                |   289 +
 src/rocksdb/db/merge_operator.cc                   |     4 +-
 src/rocksdb/db/merge_test.cc                       |    21 +-
 src/rocksdb/db/plain_table_db_test.cc              |   108 +-
 src/rocksdb/db/prefix_test.cc                      |    18 +-
 src/rocksdb/db/repair.cc                           |    29 +-
 src/rocksdb/db/skiplist.h                          |   134 +-
 src/rocksdb/db/snapshot.h                          |   111 -
 src/rocksdb/db/snapshot_impl.cc                    |    23 +
 src/rocksdb/db/snapshot_impl.h                     |   111 +
 src/rocksdb/db/table_cache.cc                      |   205 +-
 src/rocksdb/db/table_cache.h                       |    17 +-
 src/rocksdb/db/table_properties_collector.cc       |     8 +-
 src/rocksdb/db/table_properties_collector.h        |     8 +-
 src/rocksdb/db/table_properties_collector_test.cc  |   197 +-
 src/rocksdb/db/transaction_log_impl.cc             |    19 +-
 src/rocksdb/db/transaction_log_impl.h              |     7 +-
 src/rocksdb/db/version_builder.cc                  |    48 +-
 src/rocksdb/db/version_builder.h                   |     3 +-
 src/rocksdb/db/version_builder_test.cc             |    33 +-
 src/rocksdb/db/version_edit.cc                     |    74 +-
 src/rocksdb/db/version_edit.h                      |    25 +-
 src/rocksdb/db/version_edit_test.cc                |     7 +-
 src/rocksdb/db/version_set.cc                      |   598 +-
 src/rocksdb/db/version_set.h                       |    82 +-
 src/rocksdb/db/version_set_test.cc                 |    23 +
 src/rocksdb/db/wal_manager.cc                      |    19 +-
 src/rocksdb/db/wal_manager_test.cc                 |    21 +-
 src/rocksdb/db/write_batch.cc                      |   231 +-
 src/rocksdb/db/write_batch_base.cc                 |    30 +
 src/rocksdb/db/write_batch_internal.h              |    13 +
 src/rocksdb/db/write_batch_test.cc                 |   294 +-
 src/rocksdb/db/write_callback.h                    |    24 +
 src/rocksdb/db/write_callback_test.cc              |   129 +
 src/rocksdb/db/write_controller.cc                 |    86 +-
 src/rocksdb/db/write_controller.h                  |    41 +-
 src/rocksdb/db/write_controller_test.cc            |    73 +-
 src/rocksdb/db/write_thread.cc                     |   227 +-
 src/rocksdb/db/write_thread.h                      |   146 +-
 src/rocksdb/examples/.gitignore                    |     7 +-
 src/rocksdb/examples/Makefile                      |    26 +-
 src/rocksdb/examples/c_simple_example.c            |     2 +-
 src/rocksdb/examples/compact_files_example.cc      |    48 +-
 src/rocksdb/examples/compaction_filter_example.cc  |    84 +
 .../examples/optimistic_transaction_example.cc     |   142 +
 .../examples/rocksdb_option_file_example.ini       |    53 +
 src/rocksdb/examples/transaction_example.cc        |   144 +
 src/rocksdb/hdfs/env_hdfs.h                        |    26 +-
 src/rocksdb/include/rocksdb/c.h                    |  1173 +-
 src/rocksdb/include/rocksdb/cache.h                |     6 +
 src/rocksdb/include/rocksdb/compaction_filter.h    |   116 +-
 src/rocksdb/include/rocksdb/compaction_job_stats.h |    85 +
 src/rocksdb/include/rocksdb/comparator.h           |     9 +
 src/rocksdb/include/rocksdb/convenience.h          |    83 +
 src/rocksdb/include/rocksdb/db.h                   |   219 +-
 src/rocksdb/include/rocksdb/db_dump_tool.h         |    45 +
 src/rocksdb/include/rocksdb/delete_scheduler.h     |    66 +
 src/rocksdb/include/rocksdb/env.h                  |   238 +-
 src/rocksdb/include/rocksdb/immutable_options.h    |    10 +-
 src/rocksdb/include/rocksdb/iostats_context.h      |    31 +-
 src/rocksdb/include/rocksdb/listener.h             |   130 +-
 src/rocksdb/include/rocksdb/memtablerep.h          |     5 +
 src/rocksdb/include/rocksdb/merge_operator.h       |    13 +-
 src/rocksdb/include/rocksdb/metadata.h             |     6 +-
 src/rocksdb/include/rocksdb/options.h              |   259 +-
 src/rocksdb/include/rocksdb/perf_context.h         |    51 +-
 src/rocksdb/include/rocksdb/perf_level.h           |    30 +
 src/rocksdb/include/rocksdb/rate_limiter.h         |     2 +-
 src/rocksdb/include/rocksdb/slice.h                |    26 +-
 src/rocksdb/include/rocksdb/snapshot.h             |    45 +
 src/rocksdb/include/rocksdb/sst_file_writer.h      |    77 +
 src/rocksdb/include/rocksdb/statistics.h           |    88 +-
 src/rocksdb/include/rocksdb/status.h               |   136 +-
 src/rocksdb/include/rocksdb/table.h                |    31 +-
 src/rocksdb/include/rocksdb/table_properties.h     |    21 +-
 src/rocksdb/include/rocksdb/thread_status.h        |    10 +-
 src/rocksdb/include/rocksdb/transaction_log.h      |    21 +
 src/rocksdb/include/rocksdb/universal_compaction.h |     8 +-
 .../include/rocksdb/utilities/backupable_db.h      |    54 +-
 src/rocksdb/include/rocksdb/utilities/checkpoint.h |     2 +
 .../include/rocksdb/utilities/convenience.h        |    57 +-
 .../include/rocksdb/utilities/info_log_finder.h    |    19 +
 .../rocksdb/utilities/optimistic_transaction_db.h  |    72 +
 src/rocksdb/include/rocksdb/utilities/spatial_db.h |    49 +-
 .../include/rocksdb/utilities/stackable_db.h       |    54 +-
 .../utilities/table_properties_collectors.h        |    29 +
 .../include/rocksdb/utilities/transaction.h        |   307 +
 .../include/rocksdb/utilities/transaction_db.h     |   137 +
 .../rocksdb/utilities/transaction_db_mutex.h       |    92 +
 src/rocksdb/include/rocksdb/utilities/utility_db.h |     7 +-
 .../rocksdb/utilities/write_batch_with_index.h     |    60 +-
 src/rocksdb/include/rocksdb/version.h              |     6 +-
 src/rocksdb/include/rocksdb/write_batch.h          |    84 +-
 src/rocksdb/include/rocksdb/write_batch_base.h     |    27 +
 src/rocksdb/include/utilities/backupable_db.h      |    12 -
 src/rocksdb/include/utilities/db_ttl.h             |     8 -
 src/rocksdb/include/utilities/document_db.h        |     8 -
 src/rocksdb/include/utilities/geo_db.h             |     8 -
 src/rocksdb/include/utilities/json_document.h      |     7 -
 src/rocksdb/include/utilities/stackable_db.h       |     7 -
 src/rocksdb/include/utilities/utility_db.h         |     7 -
 src/rocksdb/java/HISTORY-JAVA.md                   |    86 +
 src/rocksdb/java/Makefile                          |   190 +
 src/rocksdb/java/RELEASE.md                        |    54 +
 .../java/org/rocksdb/benchmark/DbBenchmark.java    |  1624 ++
 src/rocksdb/java/crossbuild/Vagrantfile            |    26 +
 src/rocksdb/java/crossbuild/build-linux-centos.sh  |    24 +
 src/rocksdb/java/crossbuild/build-linux.sh         |    14 +
 src/rocksdb/java/jdb_bench.sh                      |    10 +
 src/rocksdb/java/rocksjni.pom                      |   145 +
 src/rocksdb/java/rocksjni/backupablejni.cc         |   330 +
 src/rocksdb/java/rocksjni/backupenginejni.cc       |   216 +
 src/rocksdb/java/rocksjni/checkpoint.cc            |    61 +
 src/rocksdb/java/rocksjni/columnfamilyhandle.cc    |    25 +
 src/rocksdb/java/rocksjni/compaction_filter.cc     |    24 +
 src/rocksdb/java/rocksjni/comparator.cc            |    66 +
 src/rocksdb/java/rocksjni/comparatorjnicallback.cc |   176 +
 src/rocksdb/java/rocksjni/comparatorjnicallback.h  |    95 +
 src/rocksdb/java/rocksjni/env.cc                   |    79 +
 src/rocksdb/java/rocksjni/filter.cc                |    46 +
 src/rocksdb/java/rocksjni/iterator.cc              |   144 +
 src/rocksdb/java/rocksjni/loggerjnicallback.cc     |   195 +
 src/rocksdb/java/rocksjni/loggerjnicallback.h      |    44 +
 src/rocksdb/java/rocksjni/memtablejni.cc           |    90 +
 src/rocksdb/java/rocksjni/merge_operator.cc        |    37 +
 src/rocksdb/java/rocksjni/options.cc               |  4089 ++++
 src/rocksdb/java/rocksjni/portal.h                 |   833 +
 src/rocksdb/java/rocksjni/ratelimiterjni.cc        |    24 +
 .../remove_emptyvalue_compactionfilterjni.cc       |    27 +
 src/rocksdb/java/rocksjni/restorejni.cc            |   203 +
 src/rocksdb/java/rocksjni/rocksjni.cc              |  1653 ++
 src/rocksdb/java/rocksjni/slice.cc                 |   259 +
 src/rocksdb/java/rocksjni/snapshot.cc              |    26 +
 src/rocksdb/java/rocksjni/statistics.cc            |    50 +
 src/rocksdb/java/rocksjni/table.cc                 |    89 +
 src/rocksdb/java/rocksjni/transaction_log.cc       |    78 +
 src/rocksdb/java/rocksjni/ttl.cc                   |   183 +
 src/rocksdb/java/rocksjni/write_batch.cc           |   238 +
 src/rocksdb/java/rocksjni/write_batch_test.cc      |   148 +
 .../java/rocksjni/write_batch_with_index.cc        |   386 +
 .../java/rocksjni/writebatchhandlerjnicallback.cc  |   104 +
 .../java/rocksjni/writebatchhandlerjnicallback.h   |    46 +
 .../src/main/java/RocksDBColumnFamilySample.java   |    95 +
 .../java/samples/src/main/java/RocksDBSample.java  |   312 +
 .../java/org/rocksdb/AbstractCompactionFilter.java |    29 +
 .../main/java/org/rocksdb/AbstractComparator.java  |   100 +
 .../java/org/rocksdb/AbstractRocksIterator.java    |   106 +
 .../src/main/java/org/rocksdb/AbstractSlice.java   |   171 +
 .../main/java/org/rocksdb/AbstractWriteBatch.java  |    92 +
 .../src/main/java/org/rocksdb/BackupEngine.java    |   222 +
 .../java/src/main/java/org/rocksdb/BackupInfo.java |    67 +
 .../src/main/java/org/rocksdb/BackupableDB.java    |   166 +
 .../main/java/org/rocksdb/BackupableDBOptions.java |   271 +
 .../java/org/rocksdb/BlockBasedTableConfig.java    |   425 +
 .../src/main/java/org/rocksdb/BloomFilter.java     |    89 +
 .../main/java/org/rocksdb/BuiltinComparator.java   |    20 +
 .../java/src/main/java/org/rocksdb/Checkpoint.java |    72 +
 .../src/main/java/org/rocksdb/ChecksumType.java    |    39 +
 .../java/org/rocksdb/ColumnFamilyDescriptor.java   |    61 +
 .../main/java/org/rocksdb/ColumnFamilyHandle.java  |    45 +
 .../main/java/org/rocksdb/ColumnFamilyOptions.java |   820 +
 .../org/rocksdb/ColumnFamilyOptionsInterface.java  |  1182 ++
 .../src/main/java/org/rocksdb/CompactionStyle.java |    52 +
 .../java/src/main/java/org/rocksdb/Comparator.java |    24 +
 .../main/java/org/rocksdb/ComparatorOptions.java   |    57 +
 .../src/main/java/org/rocksdb/CompressionType.java |    94 +
 .../java/src/main/java/org/rocksdb/DBOptions.java  |   655 +
 .../main/java/org/rocksdb/DBOptionsInterface.java  |   764 +
 .../main/java/org/rocksdb/DirectComparator.java    |    24 +
 .../src/main/java/org/rocksdb/DirectSlice.java     |   118 +
 .../src/main/java/org/rocksdb/EncodingType.java    |    55 +
 .../java/src/main/java/org/rocksdb/Env.java        |    92 +
 .../java/src/main/java/org/rocksdb/Filter.java     |    31 +
 .../src/main/java/org/rocksdb/FlushOptions.java    |    51 +
 .../java/org/rocksdb/GenericRateLimiterConfig.java |    66 +
 .../org/rocksdb/HashLinkedListMemTableConfig.java  |   173 +
 .../org/rocksdb/HashSkipListMemTableConfig.java    |   105 +
 .../src/main/java/org/rocksdb/HistogramData.java   |    44 +
 .../src/main/java/org/rocksdb/HistogramType.java   |    40 +
 .../java/src/main/java/org/rocksdb/IndexType.java  |    37 +
 .../src/main/java/org/rocksdb/InfoLogLevel.java    |    47 +
 .../java/src/main/java/org/rocksdb/Logger.java     |   108 +
 .../src/main/java/org/rocksdb/MemTableConfig.java  |    29 +
 .../src/main/java/org/rocksdb/MergeOperator.java   |    15 +
 .../main/java/org/rocksdb/NativeLibraryLoader.java |   114 +
 .../java/src/main/java/org/rocksdb/Options.java    |  1328 ++
 .../main/java/org/rocksdb/PlainTableConfig.java    |   251 +
 .../main/java/org/rocksdb/RateLimiterConfig.java   |    23 +
 .../src/main/java/org/rocksdb/ReadOptions.java     |   163 +
 .../rocksdb/RemoveEmptyValueCompactionFilter.java  |    18 +
 .../main/java/org/rocksdb/RestoreBackupableDB.java |   166 +
 .../src/main/java/org/rocksdb/RestoreOptions.java  |    41 +
 .../java/src/main/java/org/rocksdb/RocksDB.java    |  1824 ++
 .../main/java/org/rocksdb/RocksDBException.java    |    21 +
 .../java/src/main/java/org/rocksdb/RocksEnv.java   |    43 +
 .../src/main/java/org/rocksdb/RocksIterator.java   |    64 +
 .../java/org/rocksdb/RocksIteratorInterface.java   |    80 +
 .../src/main/java/org/rocksdb/RocksMemEnv.java     |    33 +
 .../src/main/java/org/rocksdb/RocksObject.java     |   125 +
 .../java/org/rocksdb/SkipListMemTableConfig.java   |    50 +
 .../java/src/main/java/org/rocksdb/Slice.java      |    88 +
 .../java/src/main/java/org/rocksdb/Snapshot.java   |    37 +
 .../java/src/main/java/org/rocksdb/Statistics.java |    37 +
 .../main/java/org/rocksdb/StatisticsCollector.java |   107 +
 .../org/rocksdb/StatisticsCollectorCallback.java   |    32 +
 .../main/java/org/rocksdb/StatsCollectorInput.java |    35 +
 .../java/org/rocksdb/StringAppendOperator.java     |    17 +
 .../main/java/org/rocksdb/TableFormatConfig.java   |    22 +
 .../java/src/main/java/org/rocksdb/TickerType.java |   137 +
 .../java/org/rocksdb/TransactionLogIterator.java   |   116 +
 .../java/src/main/java/org/rocksdb/TtlDB.java      |   197 +
 .../java/org/rocksdb/VectorMemTableConfig.java     |    45 +
 .../main/java/org/rocksdb/WBWIRocksIterator.java   |   149 +
 .../java/src/main/java/org/rocksdb/WriteBatch.java |   126 +
 .../main/java/org/rocksdb/WriteBatchInterface.java |    98 +
 .../main/java/org/rocksdb/WriteBatchWithIndex.java |   149 +
 .../src/main/java/org/rocksdb/WriteOptions.java    |   106 +
 .../main/java/org/rocksdb/util/Environment.java    |    59 +
 .../src/main/java/org/rocksdb/util/SizeUnit.java   |    16 +
 .../java/org/rocksdb/AbstractComparatorTest.java   |   217 +
 .../test/java/org/rocksdb/BackupEngineTest.java    |   305 +
 .../java/org/rocksdb/BackupableDBOptionsTest.java  |   283 +
 .../test/java/org/rocksdb/BackupableDBTest.java    |   425 +
 .../org/rocksdb/BlockBasedTableConfigTest.java     |   185 +
 .../src/test/java/org/rocksdb/CheckPointTest.java  |    97 +
 .../java/org/rocksdb/ColumnFamilyOptionsTest.java  |   745 +
 .../test/java/org/rocksdb/ColumnFamilyTest.java    |   746 +
 .../java/org/rocksdb/ComparatorOptionsTest.java    |    35 +
 .../src/test/java/org/rocksdb/ComparatorTest.java  |   227 +
 .../java/org/rocksdb/CompressionOptionsTest.java   |    21 +
 .../src/test/java/org/rocksdb/DBOptionsTest.java   |   570 +
 .../java/org/rocksdb/DirectComparatorTest.java     |    52 +
 .../src/test/java/org/rocksdb/DirectSliceTest.java |   106 +
 .../java/src/test/java/org/rocksdb/FilterTest.java |    47 +
 .../java/src/test/java/org/rocksdb/FlushTest.java  |    65 +
 .../test/java/org/rocksdb/InfoLogLevelTest.java    |   134 +
 .../src/test/java/org/rocksdb/KeyMayExistTest.java |    95 +
 .../java/src/test/java/org/rocksdb/LoggerTest.java |   220 +
 .../src/test/java/org/rocksdb/MemTableTest.java    |   137 +
 .../java/src/test/java/org/rocksdb/MergeTest.java  |   302 +
 .../test/java/org/rocksdb/MixedOptionsTest.java    |    56 +
 .../java/org/rocksdb/NativeLibraryLoaderTest.java  |    31 +
 .../src/test/java/org/rocksdb/OptionsTest.java     |  1208 ++
 .../java/org/rocksdb/PlainTableConfigTest.java     |    95 +
 .../java/org/rocksdb/PlatformRandomHelper.java     |    58 +
 .../src/test/java/org/rocksdb/ReadOnlyTest.java    |   365 +
 .../src/test/java/org/rocksdb/ReadOptionsTest.java |   151 +
 .../src/test/java/org/rocksdb/RocksDBTest.java     |   809 +
 .../src/test/java/org/rocksdb/RocksEnvTest.java    |    38 +
 .../test/java/org/rocksdb/RocksIteratorTest.java   |    72 +
 .../src/test/java/org/rocksdb/RocksMemEnvTest.java |   196 +
 .../test/java/org/rocksdb/RocksMemoryResource.java |    20 +
 .../java/src/test/java/org/rocksdb/SliceTest.java  |   105 +
 .../src/test/java/org/rocksdb/SnapshotTest.java    |   217 +
 .../java/org/rocksdb/StatisticsCollectorTest.java  |    60 +
 .../test/java/org/rocksdb/StatsCallbackMock.java   |    20 +
 .../org/rocksdb/TransactionLogIteratorTest.java    |   182 +
 .../java/src/test/java/org/rocksdb/TtlDBTest.java  |   166 +
 .../java/src/test/java/org/rocksdb/Types.java      |    43 +
 .../java/org/rocksdb/WriteBatchHandlerTest.java    |   170 +
 .../src/test/java/org/rocksdb/WriteBatchTest.java  |   123 +
 .../java/org/rocksdb/WriteBatchWithIndexTest.java  |   268 +
 .../test/java/org/rocksdb/WriteOptionsTest.java    |    31 +
 .../java/org/rocksdb/test/RocksJunitRunner.java    |    68 +
 .../java/org/rocksdb/util/EnvironmentTest.java     |   171 +
 .../test/java/org/rocksdb/util/SizeUnitTest.java   |    27 +
 src/rocksdb/m4/libtool.m4                          |  7997 --------
 src/rocksdb/m4/ltoptions.m4                        |   384 -
 src/rocksdb/m4/ltsugar.m4                          |   123 -
 src/rocksdb/m4/ltversion.m4                        |    23 -
 src/rocksdb/m4/lt~obsolete.m4                      |    98 -
 src/rocksdb/port/dirent.h                          |    47 +
 src/rocksdb/port/port.h                            |     2 +
 src/rocksdb/port/port_posix.cc                     |    28 +-
 src/rocksdb/port/port_posix.h                      |    20 +-
 src/rocksdb/port/sys_time.h                        |    48 +
 src/rocksdb/port/util_logger.h                     |    23 +
 src/rocksdb/port/win/env_win.cc                    |  2099 ++
 src/rocksdb/port/win/port_win.cc                   |   315 +
 src/rocksdb/port/win/port_win.h                    |   250 +
 src/rocksdb/port/win/stdint.h                      |    24 -
 src/rocksdb/port/win/win_logger.cc                 |   154 +
 src/rocksdb/port/win/win_logger.h                  |    57 +
 src/rocksdb/src.mk                                 |   312 +
 src/rocksdb/table/adaptive_table_factory.cc        |    16 +-
 src/rocksdb/table/adaptive_table_factory.h         |    15 +-
 src/rocksdb/table/block.cc                         |     4 +-
 src/rocksdb/table/block.h                          |    11 +
 src/rocksdb/table/block_based_filter_block.cc      |    12 +-
 src/rocksdb/table/block_based_table_builder.cc     |    39 +-
 src/rocksdb/table/block_based_table_builder.h      |     7 +-
 src/rocksdb/table/block_based_table_factory.cc     |    30 +-
 src/rocksdb/table/block_based_table_factory.h      |    22 +-
 src/rocksdb/table/block_based_table_reader.cc      |   178 +-
 src/rocksdb/table/block_based_table_reader.h       |    10 +-
 src/rocksdb/table/block_hash_index.cc              |     6 +-
 src/rocksdb/table/block_prefix_index.h             |     1 +
 src/rocksdb/table/cuckoo_table_builder.cc          |     3 +-
 src/rocksdb/table/cuckoo_table_builder.h           |    18 +-
 src/rocksdb/table/cuckoo_table_builder_test.cc     |   267 +-
 src/rocksdb/table/cuckoo_table_factory.cc          |    13 +-
 src/rocksdb/table/cuckoo_table_factory.h           |    17 +-
 src/rocksdb/table/cuckoo_table_reader.cc           |     9 +-
 src/rocksdb/table/cuckoo_table_reader.h            |    14 +-
 src/rocksdb/table/cuckoo_table_reader_test.cc      |    85 +-
 src/rocksdb/table/format.cc                        |    22 +-
 src/rocksdb/table/format.h                         |    15 +-
 src/rocksdb/table/full_filter_block.cc             |     9 +-
 src/rocksdb/table/get_context.cc                   |    54 +-
 src/rocksdb/table/get_context.h                    |     9 +
 src/rocksdb/table/iter_heap.h                      |    16 +-
 src/rocksdb/table/merger.cc                        |   204 +-
 src/rocksdb/table/merger_test.cc                   |    36 +-
 src/rocksdb/table/meta_blocks.cc                   |    14 +-
 src/rocksdb/table/meta_blocks.h                    |    36 +-
 src/rocksdb/table/mock_table.cc                    |    49 +-
 src/rocksdb/table/mock_table.h                     |    57 +-
 src/rocksdb/table/plain_table_builder.cc           |    10 +-
 src/rocksdb/table/plain_table_builder.h            |    11 +-
 src/rocksdb/table/plain_table_factory.cc           |    25 +-
 src/rocksdb/table/plain_table_factory.h            |    15 +-
 src/rocksdb/table/plain_table_index.cc             |     4 +-
 src/rocksdb/table/plain_table_key_coding.cc        |   309 +-
 src/rocksdb/table/plain_table_key_coding.h         |    72 +-
 src/rocksdb/table/plain_table_reader.cc            |   157 +-
 src/rocksdb/table/plain_table_reader.h             |    29 +-
 src/rocksdb/table/sst_file_writer.cc               |   188 +
 src/rocksdb/table/table_builder.h                  |    19 +
 src/rocksdb/table/table_properties.cc              |    10 +
 src/rocksdb/table/table_reader_bench.cc            |    25 +-
 src/rocksdb/table/table_test.cc                    |   399 +-
 src/rocksdb/table/two_level_iterator.cc            |    29 +-
 src/rocksdb/table/two_level_iterator.h             |     5 +-
 src/rocksdb/third-party/fbson/COMMIT.md            |     3 +
 src/rocksdb/third-party/fbson/FbsonDocument.h      |    17 +-
 src/rocksdb/third-party/fbson/FbsonStream.h        |     4 +
 .../gtest-1.7.0/fused-src/gtest/CMakeLists.txt     |     1 +
 src/rocksdb/thirdparty.inc                         |   169 +
 src/rocksdb/tools/Dockerfile                       |     5 +
 src/rocksdb/tools/auto_sanity_test.sh              |    91 +
 src/rocksdb/tools/benchmark.sh                     |   361 +
 src/rocksdb/tools/benchmark_leveldb.sh             |   185 +
 src/rocksdb/tools/check_format_compatible.sh       |   115 +
 src/rocksdb/tools/db_crashtest.py                  |   203 +
 src/rocksdb/tools/db_crashtest2.py                 |   231 +
 src/rocksdb/tools/db_repl_stress.cc                |   158 +
 src/rocksdb/tools/db_sanity_test.cc                |   294 +
 src/rocksdb/tools/db_stress.cc                     |  2197 +++
 src/rocksdb/tools/dbench_monitor                   |   102 +
 src/rocksdb/tools/dump/db_dump_tool.cc             |   261 +
 src/rocksdb/tools/dump/rocksdb_dump.cc             |    63 +
 src/rocksdb/tools/dump/rocksdb_undump.cc           |    62 +
 src/rocksdb/tools/generate_random_db.sh            |    30 +
 src/rocksdb/tools/ldb.cc                           |    21 +
 src/rocksdb/tools/ldb_test.py                      |   456 +
 src/rocksdb/tools/pflag                            |   217 +
 src/rocksdb/tools/reduce_levels_test.cc            |   217 +
 src/rocksdb/tools/rocksdb_dump_test.sh             |     7 +
 src/rocksdb/tools/run_flash_bench.sh               |   282 +
 src/rocksdb/tools/run_leveldb.sh                   |   174 +
 src/rocksdb/tools/sample-dump.dmp                  |   Bin 0 -> 100 bytes
 src/rocksdb/tools/sst_dump.cc                      |    21 +
 src/rocksdb/tools/verify_random_db.sh              |    27 +
 src/rocksdb/util/aligned_buffer.h                  |   154 +
 src/rocksdb/util/arena.cc                          |    37 +-
 src/rocksdb/util/arena.h                           |     5 +
 src/rocksdb/util/arena_test.cc                     |    53 +-
 src/rocksdb/util/auto_roll_logger.cc               |     2 +-
 src/rocksdb/util/auto_roll_logger.h                |     2 +-
 src/rocksdb/util/auto_roll_logger_test.cc          |   110 +-
 src/rocksdb/util/autovector.h                      |     8 +-
 src/rocksdb/util/autovector_test.cc                |    29 +-
 src/rocksdb/util/bloom_test.cc                     |     4 +-
 src/rocksdb/util/cache.cc                          |    39 +-
 src/rocksdb/util/cache_test.cc                     |    51 +
 src/rocksdb/util/channel.h                         |    67 +
 src/rocksdb/util/compaction_job_stats_impl.cc      |    80 +
 src/rocksdb/util/comparator.cc                     |    22 +-
 src/rocksdb/util/compression.h                     |    99 +
 src/rocksdb/util/crc32c.cc                         |     8 +
 src/rocksdb/util/crc32c.h                          |     2 +
 src/rocksdb/util/db_info_dumper.cc                 |    40 +-
 src/rocksdb/util/db_test_util.cc                   |   981 +
 src/rocksdb/util/db_test_util.h                    |   669 +
 src/rocksdb/util/delete_scheduler_impl.cc          |   231 +
 src/rocksdb/util/delete_scheduler_impl.h           |    81 +
 src/rocksdb/util/delete_scheduler_test.cc          |   469 +
 src/rocksdb/util/dynamic_bloom.h                   |     2 +-
 src/rocksdb/util/dynamic_bloom_test.cc             |     4 +-
 src/rocksdb/util/env.cc                            |    42 +-
 src/rocksdb/util/env_hdfs.cc                       |    26 +-
 src/rocksdb/util/env_posix.cc                      |   549 +-
 src/rocksdb/util/env_test.cc                       |   179 +-
 src/rocksdb/util/event_logger.cc                   |    40 +-
 src/rocksdb/util/event_logger.h                    |    60 +-
 src/rocksdb/util/file_reader_writer.cc             |   471 +
 src/rocksdb/util/file_reader_writer.h              |   166 +
 src/rocksdb/util/file_reader_writer_test.cc        |    92 +
 src/rocksdb/util/file_util.cc                      |    29 +-
 src/rocksdb/util/file_util.h                       |     6 +-
 src/rocksdb/util/hash_cuckoo_rep.cc                |    51 +-
 src/rocksdb/util/hash_linklist_rep.cc              |    15 +-
 src/rocksdb/util/heap.h                            |   140 +
 src/rocksdb/util/heap_test.cc                      |   139 +
 src/rocksdb/util/histogram.h                       |     5 +-
 src/rocksdb/util/iostats_context.cc                |    20 +-
 src/rocksdb/util/iostats_context_imp.h             |     8 +
 src/rocksdb/util/ldb_cmd.cc                        |    83 +-
 src/rocksdb/util/ldb_cmd.h                         |    38 +-
 src/rocksdb/util/ldb_cmd_execute_result.h          |     4 +
 src/rocksdb/util/ldb_cmd_test.cc                   |    44 +
 src/rocksdb/util/log_buffer.cc                     |    12 +-
 src/rocksdb/util/log_buffer.h                      |     2 +-
 src/rocksdb/util/manual_compaction_test.cc         |     5 +-
 src/rocksdb/util/memenv.cc                         |    12 +-
 src/rocksdb/util/memenv_test.cc                    |    22 +-
 src/rocksdb/util/mock_env.cc                       |    20 +-
 src/rocksdb/util/mock_env.h                        |     6 +-
 src/rocksdb/util/mock_env_test.cc                  |    13 +-
 src/rocksdb/util/mutable_cf_options.cc             |    20 +-
 src/rocksdb/util/mutable_cf_options.h              |   127 +-
 src/rocksdb/util/options.cc                        |   390 +-
 src/rocksdb/util/options_helper.cc                 |   774 +-
 src/rocksdb/util/options_helper.h                  |   388 +-
 src/rocksdb/util/options_parser.cc                 |   612 +
 src/rocksdb/util/options_parser.h                  |   124 +
 src/rocksdb/util/options_test.cc                   |  1055 +-
 src/rocksdb/util/perf_context.cc                   |    35 +-
 src/rocksdb/util/perf_context_imp.h                |    48 +-
 src/rocksdb/util/perf_level.cc                     |    27 +
 src/rocksdb/util/perf_level_imp.h                  |    18 +
 src/rocksdb/util/perf_step_timer.h                 |    54 +
 src/rocksdb/util/posix_logger.h                    |     6 +-
 src/rocksdb/util/rate_limiter.cc                   |     4 +-
 src/rocksdb/util/rate_limiter.h                    |     4 +-
 src/rocksdb/util/skiplistrep.cc                    |     9 +
 src/rocksdb/util/slice.cc                          |    33 +
 src/rocksdb/util/sst_dump_test.cc                  |    42 +-
 src/rocksdb/util/sst_dump_tool.cc                  |   149 +-
 src/rocksdb/util/sst_dump_tool_imp.h               |    16 +-
 src/rocksdb/util/statistics.cc                     |    10 +-
 src/rocksdb/util/statistics.h                      |     1 +
 src/rocksdb/util/status.cc                         |    18 +-
 src/rocksdb/util/status_message.cc                 |    17 +
 src/rocksdb/util/stl_wrappers.h                    |    48 +-
 src/rocksdb/util/stop_watch.h                      |     4 +
 src/rocksdb/util/string_util.h                     |     4 +-
 src/rocksdb/util/sync_point.cc                     |    15 +
 src/rocksdb/util/sync_point.h                      |    28 +
 src/rocksdb/util/testutil.cc                       |    31 +
 src/rocksdb/util/testutil.h                        |   258 +
 src/rocksdb/util/thread_local.cc                   |   121 +-
 src/rocksdb/util/thread_local.h                    |    11 +-
 src/rocksdb/util/thread_local_test.cc              |     2 +-
 src/rocksdb/util/thread_operation.h                |     4 +-
 src/rocksdb/util/thread_status_impl.cc             |     2 +-
 src/rocksdb/util/thread_status_updater.cc          |   104 +-
 src/rocksdb/util/thread_status_updater.h           |    29 +-
 src/rocksdb/util/thread_status_util.cc             |     5 +-
 src/rocksdb/util/thread_status_util.h              |     4 +-
 src/rocksdb/util/vectorrep.cc                      |     2 +-
 src/rocksdb/util/xfunc.cc                          |   114 +
 src/rocksdb/util/xfunc.h                           |    10 +
 src/rocksdb/utilities/backupable/backupable_db.cc  |   821 +-
 .../utilities/backupable/backupable_db_test.cc     |   628 +-
 src/rocksdb/utilities/checkpoint/checkpoint.cc     |    53 +-
 .../utilities/checkpoint/checkpoint_test.cc        |   373 +
 .../utilities/compacted_db/compacted_db_impl.cc    |   163 -
 .../utilities/compacted_db/compacted_db_impl.h     |    96 -
 .../remove_emptyvalue_compactionfilter.cc          |    30 +
 .../remove_emptyvalue_compactionfilter.h           |    27 +
 src/rocksdb/utilities/convenience/convenience.cc   |    23 -
 .../utilities/convenience/info_log_finder.cc       |    48 +
 src/rocksdb/utilities/document/document_db_test.cc |    12 +
 src/rocksdb/utilities/document/json_document.cc    |     2 +-
 .../utilities/document/json_document_test.cc       |    12 +
 src/rocksdb/utilities/geodb/geodb_impl.cc          |     7 +
 src/rocksdb/utilities/geodb/geodb_impl.h           |    14 +-
 src/rocksdb/utilities/geodb/geodb_test.cc          |    14 +-
 .../merge_operators/string_append/stringappend2.cc |     2 +-
 .../string_append/stringappend_test.cc             |    12 +-
 src/rocksdb/utilities/merge_operators/uint64add.cc |     4 +-
 src/rocksdb/utilities/redis/redis_lists_test.cc    |    13 +-
 src/rocksdb/utilities/spatialdb/spatial_db.cc      |    46 +-
 src/rocksdb/utilities/spatialdb/spatial_db_test.cc |    28 +
 .../compact_on_deletion_collector.cc               |    93 +
 .../compact_on_deletion_collector.h                |   101 +
 .../compact_on_deletion_collector_test.cc          |   177 +
 .../transactions/optimistic_transaction_db_impl.cc |    80 +
 .../transactions/optimistic_transaction_db_impl.h  |    33 +
 .../transactions/optimistic_transaction_impl.cc    |   109 +
 .../transactions/optimistic_transaction_impl.h     |    80 +
 .../transactions/optimistic_transaction_test.cc    |  1134 ++
 .../utilities/transactions/transaction_base.cc     |   385 +
 .../utilities/transactions/transaction_base.h      |   250 +
 .../utilities/transactions/transaction_db_impl.cc  |   260 +
 .../utilities/transactions/transaction_db_impl.h   |    80 +
 .../transactions/transaction_db_mutex_impl.cc      |   121 +
 .../transactions/transaction_db_mutex_impl.h       |    26 +
 .../utilities/transactions/transaction_impl.cc     |   320 +
 .../utilities/transactions/transaction_impl.h      |   124 +
 .../utilities/transactions/transaction_lock_mgr.cc |   460 +
 .../utilities/transactions/transaction_lock_mgr.h  |    94 +
 .../utilities/transactions/transaction_test.cc     |  1902 ++
 .../utilities/transactions/transaction_util.cc     |   147 +
 .../utilities/transactions/transaction_util.h      |    60 +
 src/rocksdb/utilities/ttl/db_ttl_impl.cc           |     6 +-
 src/rocksdb/utilities/ttl/db_ttl_impl.h            |    16 +-
 src/rocksdb/utilities/ttl/ttl_test.cc              |    42 +-
 .../write_batch_with_index.cc                      |   264 +-
 .../write_batch_with_index_internal.cc             |    33 +-
 .../write_batch_with_index_internal.h              |     9 +-
 .../write_batch_with_index_test.cc                 |   636 +-
 src/test/Makefile-client.am                        |    36 +-
 src/test/Makefile-server.am                        |     6 +-
 src/test/Makefile.am                               |    10 +-
 src/test/ObjectMap/KeyValueDBMemory.cc             |     4 +
 src/test/ObjectMap/KeyValueDBMemory.h              |     2 +-
 src/test/ObjectMap/test_keyvaluedb_atomicity.cc    |     8 +-
 src/test/ObjectMap/test_keyvaluedb_iterators.cc    |    20 +-
 src/test/ObjectMap/test_object_map.cc              |     5 +-
 src/test/centos-6/Dockerfile.in                    |     2 +-
 src/test/centos-6/ceph.spec.in                     |    18 +-
 src/test/centos-6/install-deps.sh                  |     5 +-
 src/test/centos-7/Dockerfile.in                    |     4 +-
 src/test/centos-7/ceph.spec.in                     |    18 +-
 src/test/centos-7/install-deps.sh                  |     5 +-
 src/test/ceph_objectstore_tool.py                  |   380 +-
 src/test/cli/crushtool/check-names.empty.t         |     3 +-
 src/test/cli/crushtool/check-names.max-id.t        |     1 +
 src/test/cli/crushtool/help.t                      |     2 +-
 src/test/cli/osdmaptool/pool.t                     |     2 +
 src/test/cli/radosgw-admin/help.t                  |     8 +-
 src/test/cli/rbd/help.t                            |   926 +-
 src/test/cli/rbd/invalid-snap-usage.t              |    72 +-
 src/test/cli/rbd/not-enough-args.t                 |   128 +-
 src/test/cli/rbd/too-many-args.t                   |    33 +
 src/test/cls_journal/test_cls_journal.cc           |   380 +
 src/test/cls_rbd/test_cls_rbd.cc                   |     9 +
 src/test/debian-jessie/Dockerfile.in               |     2 +-
 src/test/debian-jessie/install-deps.sh             |     5 +-
 src/test/encoding/types.h                          |     9 +
 src/test/erasure-code/test-erasure-code.sh         |     2 +-
 src/test/erasure-code/test-erasure-eio.sh          |     2 +-
 src/test/fedora-21/Dockerfile.in                   |     2 +-
 src/test/fedora-21/ceph.spec.in                    |    18 +-
 src/test/fedora-21/install-deps.sh                 |     5 +-
 src/test/journal/RadosTestFixture.cc               |    93 +
 src/test/journal/RadosTestFixture.h                |    62 +
 src/test/journal/test_Entry.cc                     |    96 +
 src/test/journal/test_FutureImpl.cc                |   206 +
 src/test/journal/test_JournalMetadata.cc           |   101 +
 src/test/journal/test_JournalPlayer.cc             |   354 +
 src/test/journal/test_JournalRecorder.cc           |   148 +
 src/test/journal/test_JournalTrimmer.cc            |   188 +
 src/test/journal/test_Journaler.cc                 |    84 +
 src/test/journal/test_ObjectPlayer.cc              |   275 +
 src/test/journal/test_ObjectRecorder.cc            |   329 +
 src/test/journal/test_main.cc                      |    26 +
 src/test/libcephfs/access.cc                       |   358 +
 src/test/libcephfs/flock.cc                        |     3 +-
 src/test/libcephfs/multiclient.cc                  |     8 +-
 src/test/libcephfs/test.cc                         |    94 +-
 src/test/librados/aio.cc                           |     8 +-
 src/test/librados/test.cc                          |    64 +-
 src/test/librados/tier.cc                          |   663 +-
 src/test/librados/watch_notify.cc                  |   102 +-
 src/test/librados_test_stub/LibradosTestStub.cc    |    15 +
 src/test/librados_test_stub/MockTestMemIoCtxImpl.h |   101 +
 .../librados_test_stub/MockTestMemRadosClient.h    |    36 +
 src/test/librados_test_stub/TestIoCtxImpl.cc       |    21 +
 src/test/librados_test_stub/TestIoCtxImpl.h        |    17 +
 src/test/librados_test_stub/TestMemIoCtxImpl.cc    |    17 +
 src/test/librados_test_stub/TestMemIoCtxImpl.h     |     2 +
 src/test/librados_test_stub/TestRadosClient.cc     |     4 +
 src/test/librados_test_stub/TestRadosClient.h      |     2 +
 src/test/librados_test_stub/TestWatchNotify.cc     |    68 +-
 src/test/librados_test_stub/TestWatchNotify.h      |     4 +-
 src/test/libradosstriper/rados-striper.sh          |     2 +-
 src/test/librbd/fsx.cc                             |     2 +-
 src/test/librbd/test_ImageWatcher.cc               |   450 +-
 src/test/librbd/test_JournalEntries.cc             |   217 +
 src/test/librbd/test_JournalReplay.cc              |   209 +
 src/test/librbd/test_internal.cc                   |   108 +-
 src/test/librbd/test_librbd.cc                     |   220 +-
 src/test/librbd/test_main.cc                       |     4 +
 src/test/mds/TestMDSAuthCaps.cc                    |   121 +-
 src/test/mon/misc.sh                               |     6 +-
 src/test/mon/mkfs.sh                               |     2 +-
 src/test/mon/mon-ping.sh                           |     2 +-
 src/test/mon/mon-scrub.sh                          |     2 +-
 src/test/mon/osd-crush.sh                          |    12 +-
 src/test/mon/osd-erasure-code-profile.sh           |     2 +-
 src/test/mon/osd-pool-create.sh                    |     2 +-
 src/test/msgr/test_msgr.cc                         |     4 +-
 src/test/objectstore/FileStoreTracker.h            |     2 +-
 src/test/objectstore/TestRocksdbOptionParse.cc     |     2 +-
 src/test/objectstore/test_idempotent.cc            |     5 +-
 src/test/objectstore/test_kv.cc                    |     2 +-
 src/test/opensuse-13.2/Dockerfile.in               |     2 +-
 src/test/opensuse-13.2/ceph.spec.in                |    18 +-
 src/test/opensuse-13.2/install-deps.sh             |     5 +-
 src/test/osd/osd-bench.sh                          |     2 +-
 src/test/osd/osd-config.sh                         |     2 +-
 src/test/osd/osd-copy-from.sh                      |     2 +-
 src/test/osd/osd-reactivate.sh                     |    55 +
 src/test/osd/osd-scrub-repair.sh                   |    45 +-
 src/test/osd/osd-scrub-snaps.sh                    |   227 +
 src/test/osdc/FakeWriteback.cc                     |     2 +-
 src/test/osdc/FakeWriteback.h                      |     3 +-
 src/test/osdc/object_cacher_stress.cc              |     4 +-
 src/test/pybind/test_ceph_argparse.py              |     3 -
 src/test/python/ceph-disk/tests/test_ceph_disk.py  |   765 +-
 src/test/run-rbd-unit-tests.sh                     |     2 +-
 src/test/test_filejournal.cc                       |    81 +-
 src/test/test_rgw_admin_log.cc                     |     4 +-
 src/test/test_rgw_admin_meta.cc                    |     4 +-
 src/test/test_rgw_admin_opstate.cc                 |     4 +-
 src/test/test_subprocess.cc                        |    44 +-
 src/test/ubuntu-12.04/Dockerfile.in                |     2 +-
 src/test/ubuntu-12.04/install-deps.sh              |     5 +-
 src/test/ubuntu-14.04/Dockerfile.in                |     2 +-
 src/test/ubuntu-14.04/install-deps.sh              |     5 +-
 src/tools/Makefile-client.am                       |    49 +
 src/tools/Makefile.am                              |     6 +-
 src/tools/ceph-monstore-update-crush.sh            |    13 +-
 src/tools/ceph_kvstore_tool.cc                     |     6 +-
 src/tools/ceph_objectstore_tool.cc                 |   544 +-
 src/tools/ceph_osdomap_tool.cc                     |     8 +-
 src/tools/cephfs/Dumper.cc                         |     7 +-
 src/tools/rados/RadosImport.cc                     |     4 +-
 src/tools/rados/rados.cc                           |   121 +-
 src/tools/rbd/ArgumentTypes.cc                     |   342 +
 src/tools/rbd/ArgumentTypes.h                      |   157 +
 src/tools/rbd/IndentStream.cc                      |    59 +
 src/tools/rbd/IndentStream.h                       |    60 +
 src/tools/rbd/OptionPrinter.cc                     |   107 +
 src/tools/rbd/OptionPrinter.h                      |    40 +
 src/tools/rbd/Shell.cc                             |   401 +
 src/tools/rbd/Shell.h                              |    76 +
 src/tools/rbd/Utils.cc                             |   431 +
 src/tools/rbd/Utils.h                              |    94 +
 src/tools/rbd/action/BenchWrite.cc                 |   310 +
 src/tools/rbd/action/Children.cc                   |    98 +
 src/tools/rbd/action/Clone.cc                      |   100 +
 src/tools/rbd/action/Copy.cc                       |    90 +
 src/tools/rbd/action/Create.cc                     |    94 +
 src/tools/rbd/action/Diff.cc                       |   140 +
 src/tools/rbd/action/DiskUsage.cc                  |   268 +
 src/tools/rbd/action/Export.cc                     |   196 +
 src/tools/rbd/action/ExportDiff.cc                 |   260 +
 src/tools/rbd/action/Feature.cc                    |    86 +
 src/tools/rbd/action/Flatten.cc                    |    71 +
 src/tools/rbd/action/ImageMeta.cc                  |   313 +
 src/tools/rbd/action/Import.cc                     |   319 +
 src/tools/rbd/action/ImportDiff.cc                 |   223 +
 src/tools/rbd/action/Info.cc                       |   232 +
 src/tools/rbd/action/Kernel.cc                     |   360 +
 src/tools/rbd/action/List.cc                       |   235 +
 src/tools/rbd/action/Lock.cc                       |   266 +
 src/tools/rbd/action/MergeDiff.cc                  |   436 +
 src/tools/rbd/action/ObjectMap.cc                  |    73 +
 src/tools/rbd/action/Remove.cc                     |    84 +
 src/tools/rbd/action/Rename.cc                     |    84 +
 src/tools/rbd/action/Resize.cc                     |    94 +
 src/tools/rbd/action/Snap.cc                       |   495 +
 src/tools/rbd/action/Status.cc                     |   133 +
 src/tools/rbd/action/Watch.cc                      |   137 +
 src/tools/rbd/rbd.cc                               |    20 +
 src/tracing/librados.tp                            |    99 +-
 src/tracing/librbd.tp                              |   109 +
 src/tracing/osd.tp                                 |    28 +-
 src/tracing/tracing-common.h                       |     2 +-
 src/vstart.sh                                      |     4 +-
 systemd/Makefile.am                                |     3 +-
 systemd/Makefile.in                                |     4 +-
 systemd/ceph                                       |     4 +-
 systemd/rbdmap.service                             |    12 +
 1168 files changed, 146659 insertions(+), 44508 deletions(-)
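
Much of the bulk above comes from refreshing the bundled RocksDB sources, which among other things pulls in RocksDB's then-new transactions utility (the src/rocksdb/utilities/transactions/ files, several thousand added lines counting tests). For orientation only, the following is a minimal sketch of the upstream pessimistic TransactionDB API that those files provide; the database path and keys are invented for the example, and none of this is code Ceph itself runs:

    #include <cassert>

    #include <rocksdb/options.h>
    #include <rocksdb/utilities/transaction.h>
    #include <rocksdb/utilities/transaction_db.h>

    int main() {
      rocksdb::Options options;
      options.create_if_missing = true;
      rocksdb::TransactionDBOptions txn_db_options;
      rocksdb::TransactionDB* db = nullptr;

      // Open a transactional wrapper around a normal RocksDB instance.
      // "/tmp/txn_example" is a placeholder path for this sketch.
      rocksdb::Status s = rocksdb::TransactionDB::Open(
          options, txn_db_options, "/tmp/txn_example", &db);
      assert(s.ok());

      // Writes staged in a Transaction stay invisible to other readers
      // until Commit(); conflicting writers wait on per-key locks.
      rocksdb::Transaction* txn = db->BeginTransaction(rocksdb::WriteOptions());
      assert(txn != nullptr);
      s = txn->Put("key1", "value1");
      assert(s.ok());
      s = txn->Commit();
      assert(s.ok());

      delete txn;
      delete db;
      return 0;
    }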

diff --git a/AUTHORS b/AUTHORS
index d6cbb94..b02281b 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -3,6 +3,7 @@ Abhishek Lekshmanan <abhishek.lekshmanan at ril.com>
 Accela Zhao <accelazh at gmail.com>
 Adam C. Emerson <aemerson at linuxbox.com>
 Adam Crume <adamcrume at gmail.com>
+Adam Kupczyk <akupczyk at mirantis.com>
 Adam Manzanares <nmtadam at gmail.com>
 Adam Spiers <aspiers at suse.com>
 Adam Twardowski <adam.twardowski at gmail.com>
@@ -10,7 +11,7 @@ Ahoussi Armand <ahoussi.say at telecom-bretagne.eu>
 Ailing Zhang <zhangal1992 at gmail.com>
 Alan Grosskurth <code at alan.grosskurth.ca>
 Alan Somers <asomers at gmail.com>
-Alexander Chuzhoy <schuzhoy at users.noreply.github.com>
+Alexander Chuzhoy <achuzhoy at redhat.com>
 Alexandre Marangone <alexandre.marangone at inktank.com>
 Alexandre Marangone <amarango at redhat.com>
 Alexandre Oliva <oliva at gnu.org>
@@ -55,34 +56,38 @@ Billy Olsen <billy.olsen at canonical.com>
 BJ Lougee <almightybeeij at gmail.com>
 Bjørnar Ness <bjornar.ness at gmail.com>
 Blaine Gardner <blaine.gardner at hp.com>
-blinke <Burkhard.Linke at computational.bio.uni-giessen.de>
 Bo Cai <cai.bo at h3c.com>
 Boris Ranto <branto at redhat.com>
 Bosse Klykken <larkly at gmail.com>
 Brad Hubbard <bhubbard at redhat.com>
 Brandon Seibel <brandon at seibelnet.ca>
-Brian Andrus <bandrus+github at gmail.com>
+Brian Andrus <bandrus at redhat.com>
+brian <bjfelton at gmail.com>
 Brian Chrisman <brchrisman at gmail.com>
 Brian Rak <dn at devicenull.org>
 Brown, David M JR <david.brown at pnl.gov>
-caibo <cai.bo at h3c.com>
+Burkhard Linke <Burkhard.Linke at computational.bio.uni-giessen.de>
 Caleb Miles <caleb.miles at inktank.com>
 Carlos Maltzahn <carlosm at cs.ucsc.edu>
 carsonoid <ca at carsonoid.net>
 Casey Bodley <casey at cohortfs.com>
 Casey Bodley <casey at linuxbox.com>
 Casey Bodley <cbodley at redhat.com>
+Casey Bodley <cbodley at users.noreply.github.com>
 Casey Marshall <csm at soe.ucsc.edu>
 CC Lien <cc_lien at tcloudcomputing.com>
 Ce Gu <guce at h3c.com>
+ceph <zhuang.zeqiang at h3c.com>
 Cesar Mello <cesar at d1.(none)>
 Chen Baozi <baozich at gmail.com>
+Chen Dihao <tobeg3oogle at gmail.com>
 Chendi Xue <chendi.xue at intel.com>
 Cheng Cheng <ccheng.leo at gmail.com>
-chenji <insomnia at 139.com>
+Chengyuan Li <chengyli at ebay.com>
 Chris Dunlop <chris at onthe.net.au>
 Chris Glass <tribaal at gmail.com>
 Chris Holcombe <chris.holcombe at nebula.com>
+Chris Holcombe <xfactor973 at gmail.com>
 Christian Brunner <christian at brunner-muc.de>
 Christian Marie <pingu at anchor.net.au>
 Christian Theune <ct at gocept.com>
@@ -96,18 +101,19 @@ Colin Mattson <colinmattson at gmail.com>
 Colin P. McCabe <colinm at hq.newdream.net>
 Dan Chai <tengweicai at gmail.com>
 Daniel Gollub <d.gollub at telekom.de>
-Daniel Gryniewicz <dang at fprintf.net>
+Daniel Gryniewicz <dang at redhat.com>
 Daniel J. Hofmann <daniel at trvx.org>
 Dan Mick <dan.mick at inktank.com>
 Dan Mick <dmick at redhat.com>
 Danny Al-Gaaf <danny.al-gaaf at bisect.de>
 Dan van der Ster <daniel.vanderster at cern.ch>
 David Anderson <dave at natulte.net>
+David Coles <dcoles at gaikai.com>
 David Disseldorp <ddiss at suse.de>
 David Moreau Simard <dmsimard at iweb.com>
 David Zafman <david.zafman at inktank.com>
 David Zafman <dzafman at redhat.com>
-Dennis Schafroth <dennis at schafroth.dk>
+Dennis Schafroth <dennis at schafroth.com>
 Derek Yarnell <derek at umiacs.umd.edu>
 Derrick Schneider <derrick.schneider at opower.com>
 Ding Dinghua <dingdinghua85 at gmail.com>
@@ -156,18 +162,19 @@ Greg Farnum <gfarnum at redhat.com>
 Greg Farnum <greg at inktank.com>
 Gregory Meno <gmeno at redhat.com>
 Guangliang Zhao <guangliang at unitedstack.com>
+Guang Yang <yguang at yahoo-inc>
 Guang Yang <yguang at yahoo-inc.com>
 Guilhem Lettron <guilhem at lettron.fr>
 Haifeng Liu <haifeng at yahoo-inc.com>
 Hannes Reinecke <hare at suse.de>
 Hannu Valtonen <hannu.valtonen at ormod.com>
 Haomai Wang <haomai at xsky.com>
-Haomai Wang <haomai at xsky.io>
 Harpreet Dhillon <harpreet at ironsystems.com>
 Hazem Amara <hazem.amara at telecom-bretagne.eu>
 Henry C Chang <henry_c_chang at tcloudcomputing.com>
 Henry Chang <henry at bigtera.com>
 Herb Shiu <herb_shiu at tcloudcomputing.com>
+Herve Rousseau <hroussea at cern.ch>
 Hervé Rousseau <hroussea at cern.ch>
 Holger Macht <hmacht at suse.de>
 Huamin Chen <hchen at redhat.com>
@@ -179,25 +186,28 @@ Ilya Dryomov <idryomov at redhat.com>
 Ilya Dryomov <ilya.dryomov at inktank.com>
 Ira Cooper <ira at samba.org>
 Ismael Serrano <ismael.serrano at gmail.com>
+Jacek J. Lakis <jacek.lakis at intel.com>
 James Page <james.page at ubuntu.com>
 James Ryan Cresawn <jrcresawn at gmail.com>
 Jan Harkes <jaharkes at cs.cmu.edu>
 Janne Grunau <j at jannau.net>
+Jashan Kamboj <jashank42 at gmail.com>
 Jason Dillaman <dillaman at redhat.com>
 Javier Guerra <javier at guerrag.com>
 Javier M. Mellid <jmunhoz at igalia.com>
 Jean-Rémi Deveaux <jeanremi.deveaux at gmail.com>
+Jeff Epstein <jepst79 at gmail.com>
 Jeff Weber <jweber at cofront.net>
-Jenkins Build Slave User <jenkins-build at jenkins-slave-wheezy.localdomain>
 Jenkins <jenkins at ceph.com>
-Jenkins <jenkins at inktank.com>
 Jens-Christian Fischer <jens-christian.fischer at switch.ch>
-jepst <jepst79 at gmail.com>
 Jevon Qiao <qiaojianfeng at unitedstack.com>
 Jiang Heng <jiangheng0511 at gmail.com>
+Jianhui Yuan <zuiwanyuan at gmail.com>
 Jiantao He <hejiantao5 at gmail.com>
 Jian Wen <wenjian at letv.com>
 Jiaying Ren <mikulely at gmail.com>
+Ji Chen <insomnia at 139.com>
+Jie Wang <jie.wang at kylin-cloud.com>
 Jim Schutt <jaschut at sandia.gov>
 João Eduardo Luís <joao.luis at inktank.com>
 João Eduardo Luís <joao at redhat.com>
@@ -205,7 +215,9 @@ Joao Eduardo Luis <joao at suse.de>
 Joaquim Rocha <joaquim.rocha at cern.ch>
 Joe Buck <jbbuck at gmail.com>
 Joe Handzik <joseph.t.handzik at hp.com>
+Joe Julian <jjulian at io.com>
 Johannes Erdfelt <johannes at erdfelt.com>
+John Coyle <dx9err at gmail.com>
 John Spray <john.spray at inktank.com>
 John Spray <jspray at redhat.com>
 Johnu George <johnugeo at cisco.com>
@@ -251,8 +263,7 @@ Lee Revell <rlrevell at gmail.com>
 Lei Dong <leidong at yahoo-inc.com>
 Liam Monahan <liam at umiacs.umd.edu>
 Li Peng <lip at dtdream.com>
-liumingxin <mingxinliu at ubuntukylin.com>
-Li Wang <liwang at ubuntukylin.com>
+Li Wang <li.wang at kylin-cloud.com>
 Lluis Pamies-Juarez <lluis.pamies-juarez at hgst.com>
 Loic Dachary <ldachary at redhat.com>
 Loic Dachary <loic-201408 at dachary.org>
@@ -286,10 +297,8 @@ Mike Kelly <pioto at pioto.org>
 Mike Lundy <mike at fluffypenguin.org>
 Mike Ryan <mike.ryan at inktank.com>
 Milan Broz <mbroz at redhat.com>
-minchen <minchen at ubuntukylin.com>
 Min Chen <minchen at ubuntukylin.com>
-Mingxin Liu <mingxinliu at ubuntukylin.com>
-MingXin Liu <mingxinliu at ubuntukylin.com>
+MingXin Liu <mingxin.liu at kylin-cloud.com>
 Mingyue Zhao <zhao.mingyue at h3c.com>
 Mohammad Salehe <salehe+dev at gmail.com>
 Moritz Möller <mm at mxs.de>
@@ -304,9 +313,9 @@ Nicolas Yong <nicolas.yong93 at gmail.com>
 Nikola Kotur <kotnick at gmail.com>
 Nilamdyuti Goswami <ngoswami at redhat.com>
 Ning Yao <yaoning at ruijie.com.cn>
+Nishtha Rai <nishtha3rai at gmail.com>
 Noah Watkins <nwatkins at redhat.com>
 (no author) <(no author)@29311d96-e01e-0410-9327-a35deaab8ce9>
-oddomatik <bandrus+github at gmail.com>
 Orit Wasserman <owasserm at redhat.com>
 Owen Synge <osynge at suse.com>
 Padraig O'Sullivan <posulliv at umd.edu>
@@ -327,13 +336,12 @@ Pierre Chaumont <pierre.chaumont31 at gmail.com>
 Pierre Rognant <prognant at oodrive.com>
 Piotr Dałek <piotr.dalek at ts.fujitsu.com>
 Qiankun Zheng <zheng.qiankun at h3c.com>
-Radoslaw Zarzynski <rzarzynski at github.com>
 Radoslaw Zarzynski <rzarzynski at mirantis.com>
 Rajesh Nambiar <rajesh.n at msystechnologies.com>
 Raju Kurunkad <raju.kurunkad at sandisk.com>
 Ray Lv <xiangyulv at gmail.com>
 rca <bertosmailbox at gmail.com>
-renhwztetecs <rhwlyw at 163.com>
+Ren Huanwen <ren.huanwen at zte.com.cn>
 riccardo80 <riccardo80 at 29311d96-e01e-0410-9327-a35deaab8ce9>
 Riccardo Ferretti <rferrett at soe.ucsc.edu>
 ritz303 <ritz_303 at yahoo.com>
@@ -347,12 +355,13 @@ Rohan Mars <code at rohanmars.com>
 Roman Haritonov <reclosedev at gmail.com>
 Ron Allred <rallred at itrefined.com>
 Rongze Zhu <zrzhit at gmail.com>
+root <rahul.1aggarwal at gmail.com>
 root <root at phenom.dyweni.com>
-root <root at ubuntu1.com>
 Ross Turk <ross.turk at inktank.com>
 Ross Turk <rturk at redhat.com>
 Ruben Kerkhof <ruben at rubenkerkhof.com>
 Ruifeng Yang <yangruifeng.09209 at h3c.com>
+runsisi <runsisi at hust.edu.cn>
 Rutger ter Borg <rutger at terborg.net>
 Sage Weil <sage at inktank.com>
 Sage Weil <sweil at redhat.com>
@@ -362,7 +371,6 @@ Samuel Just <sam.just at inktank.com>
 Samuel Just <sjust at redhat.com>
 Sandon Van Ness <sandon at inktank.com>
 Sandon Van Ness <svanness at redhat.com>
-Sangdi <xu.sangdi at h3c>
 Sangdi Xu <xu.sangdi at h3c.com>
 Scott A. Brandt <scott at cs.ucsc.edu>
 Scott Devoid <devoid at anl.gov>
@@ -372,7 +380,6 @@ Sebastien Ponce <sebastien.ponce at cern.ch>
 Sergey Arkhipov <nineseconds at yandex.ru>
 Shanggao Qiu <qiushanggao at qq.com>
 Sharif Olorin <sio at tesser.org>
-shawn <chen.xiaowei at h3c.com>
 Shawn Edwards <lesser.evil at gmail.com>
 shishir gowda <shishir.gowda at sandisk.com>
 Shotaro Kawaguchi <kawaguchi.s at jp.fujitsu.com>
@@ -393,6 +400,7 @@ Steve MacGregor <grape at lapgoat-0.(none)>
 Steve Stock <steve at technolope.org>
 Stratos Psomadakis <psomas at grnet.gr>
 Stuart Longland <stuartl at vrt.com.au>
+suckowbiz <tobias at suckow.biz>
 Sushma Gurram <sushma.gurram at sandisk.com>
 Swami Reddy <swami.reddy at ril.com>
 Sylvain Baubeau <sbaubeau at redhat.com>
@@ -412,7 +420,6 @@ Tianshan Qu <tianshan at xsky.com>
 Tim Freund <tim at freunds.net>
 Tim Serong <tserong at suse.com>
 tmuthamizhan <tamil.muthamizhan at inktank.com>
-tobe <tobeg3oogle at gmail.com>
 Tobias Florek <tobias.florek at bytesandbutter.de>
 Tomasz Paskowski <ss7pro at gmail.com>
 Tom Callaway <spot at redhat.com>
@@ -425,6 +432,7 @@ Valentin Arshanes Thomas <valentin.arshanes.thomas at gmail.com>
 Vangelis Koukis <vkoukis at cslab.ece.ntua.gr>
 Varada Kari <varada.kari at sandisk.com>
 Vartika Rai <vartikarai17 at gmail.com>
+vasukulkarni <vasu.kulkarni at gmail.com>
 Vasu Kulkarni <vasu at redhat.com>
 Venky Shankar <vshankar at redhat.com>
 Vicente Cheng <freeze.bilsted at gmail.com>
@@ -437,8 +445,10 @@ Walter Huf <hufman at gmail.com>
 Wang, Yaguang <yaguang.wang at intel.com>
 Warren Usui <warren.usui at inktank.com>
 Weijun Duan <duanweijun at h3c.com>
+weill <weilluo at tencent.com>
 Wei Luo <luowei at yahoo-inc.com>
-weiqian <weiq at dtdream.com>
+Wei Qian <weiq at dtdream.com>
+wenjunhuang <wenjunhuang at tencent.com>
 Wesley Spikes <wesley.spikes at dreamhost.com>
 Wido den Hollander <wido at 42on.com>
 William A. Kennington III <william at wkennington.com>
@@ -449,26 +459,28 @@ Xavier Roche <roche+git at exalead.com>
 Xiaowei Chen <chen.xiaowei at h3c.com>
 Xiaoxi Chen <xiaoxi.chen at intel.com>
 Xie Rui <875016668 at qq.com>
-Xie Rui <jerry.xr86 at gmail.com>
-xiexingguo <258156334 at qq.com>
+Xie Xingguo <xie.xingguo at zte.com.cn>
 Xihui He <xihuihe at gmail.com>
 Xing Lin <xinglin at cs.utah.edu>
 Xingyi Wu <wuxingyi2015 at outlook.com>
-Xinze Chi <xinze at xksy.com>
 Xinze Chi <xinze at xsky.com>
 Xiong Yiliang <xiongyiliang at xunlei.com>
 Xuan Liu <liu.xuan at h3c.com>
+YankunLi <YankunLi at users.noreply.github.com>
 Yann Dupont <yann at objoo.org>
 Yannick Atchy Dalama <yannick.atchy.dalama at gmail.com>
 Yan, Zheng <zheng.z.yan at intel.com>
 Yan, Zheng <zyan at redhat.com>
 Yazen Ghannam <yazen.ghannam at linaro.org>
+Yehua Chen <chen.yehua at h3c.com>
 Yehuda Sadeh <yehuda at inktank.com>
 Yehuda Sadeh <ysadehwe at redhat.com>
 Yongyue Sun <abioy.sun at gmail.com>
+youji <youji at ebay.com>
 Yuan Zhou <yuan.zhou at intel.com>
-Yunchuan Wen <yunchuanwen at ubuntukylin.com>
+Yunchuan Wen <yunchuan.wen at kylin-cloud.com>
 Yuri Weinstein <yuri.weinstein at inktank.com>
+Zengran Zhang <zhangzengran at h3c.com>
 Zhe Zhang <zzxuanyuan at gmail.com>
 Zhicheng Wei <zhicheng at opensourceforge.net>
 Zhi (David) Zhang <zhangz at yahoo-inc.com>
diff --git a/COPYING b/COPYING
index 5efc838..189b1ac 100644
--- a/COPYING
+++ b/COPYING
@@ -149,3 +149,8 @@ Files: src/include/timegm.h
   Copyright (C) Copyright Howard Hinnant
   Copyright (C) Copyright 2010-2011 Vicente J. Botet Escriba
   License: Boost Software License, Version 1.0
+
+Files: src/msg/async/AsyncConnection.cc, src/msg/simple/Pipe.cc (sigpipe suppression)
+  Copyright (C) 2010 Tomash Brechko.  All rights reserved.
+  License: GPL3
+
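
The stanza just added to COPYING covers the SIGPIPE-suppression code this release introduces for platforms that have neither SO_NOSIGPIPE nor MSG_NOSIGNAL (commit 131deb3 in the ChangeLog below). The underlying idea is to block SIGPIPE around the socket write and then reap any signal that write generated before the old mask is restored. Here is a rough sketch of that pattern, assuming a plain POSIX socket; the real code in Pipe.cc and AsyncConnection.cc differs in detail:

    #include <signal.h>
    #include <sys/socket.h>
    #include <sys/types.h>
    #include <time.h>

    // Send on a socket without letting a broken pipe kill the process,
    // even where SO_NOSIGPIPE/MSG_NOSIGNAL are unavailable.
    static ssize_t send_no_sigpipe(int fd, const void* buf, size_t len) {
      sigset_t pipe_mask, old_mask, pending;
      sigemptyset(&pipe_mask);
      sigaddset(&pipe_mask, SIGPIPE);

      // Remember whether a SIGPIPE was already pending so we do not
      // swallow a signal that belongs to someone else.
      sigpending(&pending);
      bool already_pending = sigismember(&pending, SIGPIPE);

      pthread_sigmask(SIG_BLOCK, &pipe_mask, &old_mask);
      ssize_t r = send(fd, buf, len, 0);

      if (r < 0 && !already_pending) {
        // Consume the SIGPIPE this send() may have raised, if any,
        // before restoring the mask lets it be delivered.  (A production
        // version would save and restore errno around sigtimedwait.)
        struct timespec zero = {0, 0};
        sigtimedwait(&pipe_mask, nullptr, &zero);
      }
      pthread_sigmask(SIG_SETMASK, &old_mask, nullptr);
      return r;
    }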
diff --git a/ChangeLog b/ChangeLog
index dd6c2a5..05771ab 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,12 +1,620 @@
-bb2ecea (HEAD, tag: v9.2.0, origin/infernalis) 9.2.0
+9180a92 (HEAD, tag: v10.0.1, origin/jewel) 10.0.1
+247ee60 build/ops: enable CR in CentOS 7
+b47eeae tests: centos7 needs the Continuous Release (CR) Repository enabled for libunwind
+1adf306 SubmittingPatches: there is no next; only jewel
+73aab5e test: use sequential journal_tid for object cacher test
+fb120d7 osd: call on_new_interval on newly split child PG
+e9daed2 rgw: use smart pointer for C_Reinitwatch
+c4fbec7 rgw: fix partial read mime map issue
+f914b8d rgw: fix rgw_admin partial read issue
+88e6694 osd: fix ClassHandler::ClassData::get_filter()
+9d06041 rbd: bail if too many arguments provided
+d133f42 rbd: don't append an extra newline after some errors
+1c84681 tests: update unmap.t CLI test
+5ce663f cmake: librbd needs libjournal and libcls_journal_client
+06b3b47 mon/PGMonitor: MAX AVAIL is 0 if some OSDs' weight is 0
+b2eefca os: FileStore::_destroy_collection may hide the real mistake.
+04c0360 Fix mon routed_request_tids leak
+baf9da3 pybind: decode empty string in conf_parse_argv() correctly
+903350c ceph_test_keyvaluedb_iterators: Fix broken test
+0b474c5 mon: don't require OSD W for MRemoveSnaps
+3680dc3 mon/OSDMonitor: block 'ceph osd pg-temp ...' if update is pending
+095c29c ceph.spec.in: make --with lowmem_builder limit _smp_mflags
+1509ada mailmap: Jenkins affiliation
+d92f611 mailmap: Burkhard Linke affiliation
+27f81d4 mailmap: Chen Dihao affiliation
+8dc6748 mailmap: Wei Qian affiliation Signed-off-by: Yann Dupont <yann at objoo.org>
+f86eb3f mds: fix scrub_path
+4025f75 doc/release-notes: fix typo
+efbcd12 doc/release-notes: final v10.0.0 notes
+5972a44 doc: fix message typos in systemd
+9aabc8a test/mon/osd-crush.sh: escape ceph tell mon.*
+72edab2 osd: make some of the pg_temp methods/fields private
+987f68a osdc/Objecter: call notify completion only once
+d201c6d mon: change mon_osd_min_down_reporters from 1 -> 2
+0269a0c mon/OSDMonitor: simplify failure reporters vs reports logic
+53f2c7f osd: simplify pg creation
+57121db mon/MonClient: make _sub_got behave if we "got" old stuff
+ca75e37 mon/OSDMonitor: fix oldest_map in send_incremental
+9864a79 mon/PGMonitor: avoid useless pg gets when pool is deleted
+1f4b714 mon/PGMonitor: revamp how pg creates are tracked
+3ad0c92 mon/PGMonitor: only send pg create messages to up osds
+23d4df3 mon/PGMonitor: only churn mapping_epoch if the primary changes
+5912382 mon/PGMonitor: a bunch of cosmetic cleanup
+0389763 mon/PGMonitor: drop old creating_pgs_by_osd
+160a020 osd: reduce mon_subscribe messages
+7fcffe3 mon/MonClient: only send new subscriptions
+c85b152 mon/PGMonitor: send pg creates via persistent subscriptions, not spam
+0938bf0 mon/PGMonitor: only map and send pg creates post paxos update
+6cbdd67 mon/PGMonitor: remove map_pg_creates, send_pg_creates commands
+c1f6eec messages/MOSDPGCreate: make it more readable
+d3eba9b osd: subscribe to all pg creates, not just once on start
+dd91837 mon/PGMonitor: track creating_pgs_by_osd_epoch
+2754007 mon/PGMap: assert our pg counts don't go negative
+b3b0a95 mon/OSDMonitor: do not prime pg_temp for creating pgs
+242bf50 mon/PGMonitor: note mapping_epoch for creating pgs
+39e06ef mon: let peon mons send the osdmap replies
+05aaa60 msg/simple/Pipe: show keepalives at level 2
+6557b76 mon: set mon_subscribe_interval to a day
+26496b9 mon: only ack subscriptions (and renew) if client or mon is old
+ae9d5ee mon: remove old subscribe renewal-based timeouts
+6f30002 mon: small cleanup in _ms_dispatch
+e5fc790 mon: new session_timeout mechanism that is not subscribe-based
+536c702 msg: make last_keepalive[_ack] lock safe
+fb9dfad msg: track stamp of last keepalive[2] received
+d781f48 common: mirror leveldb default tuning w/ rocksdb
+73bdf0f mon/MonClient: don't send log if we're reconnecting
+a12dd1b mon: disabled rocksdb compression when used as the backend
+7489ec4 osd: cap adjusted max mon report interval at 2/3 of timeout
+39c1495 osd: protect mon reporting with mon_report_lock
+e31b695 osd: fix reconnect behavior from booting state
+8b5b6c8 osd: move the monitor report to OSD::tick_without_osd_lock
+7bc4763 osd: _got_mon_epochs - refactor the lock scope to avoid a race (which fails make check)
+21ca0b5 osd: don't send dup subscribes so much
+d4f813b osd: introduce explicit preboot stage
+2af422a osd: skip osdmap version query if we can
+605e188 osd: make [_]maybe_boot lockless variant
+21e95c2 osd: only send boot if booting on getversion completion
+894eb2a osd: do not resend pg_temp requests
+c9534df osd: do not send dup failure reports
+865ddca osd: resend pending failure reports with a new mon session
+b3ca828 osd: fix send_failures() locking
+5e10de4 osd: backoff the max reporting interval, too
+12c7e54 osd: no need for regular send_pg_temps
+19b714f osd: just send alive when it is queued
+d5a2f9a osd: fix pg stat reporting
+f74e310 osd: Only add random deep scrubs when NOT user initiated scrub
+4c19abd Revert "test: osd-scrub-snaps.sh: Randomized deep-scrubs can now happen during a scrub"
+0fe26c2 test: osd-scrub-snaps.sh: Randomized deep-scrubs can now happen during a scrub
+07f68b5 Typo in the apt-get command. Signed-off-by: Chris Holcombe <xfactor973 at gmail.com>
+3193ee1 scripts: ceph-release-notes for development versions
+c44ab62 release-notes: draft v10.0.0 release notes
+1420a1f doc: add v0.80.11 to the release timeline
+9e9b03e doc/releases: add v0.80.11 to release table
+4b5afe5 doc/release-notes: final v0.80.11 notes
+6316ff8 10.0.0
+99ba661 13207: Rados Gateway: Anonymous user is able to read bucket with authenticated read ACL
+1536cb0 osd: note down the number of missing clones
+3b146f5 RadosClient: reimplement the pool alignment methods using the new ones
+1633d3e doc: Update ceph-disk manual page to remove some option description.
+d3d139b doc: Update ceph-disk manual page with new feature deactivate/destroy.
+9cbe132 pep8 changes
+cb18a10 Add test cases to validate symlinks pointing to devs
+b3c7cb0 Compare parted output with the dereferenced path
+7d6002b Cephfs will crash if async msg is enabled, because of an assertion
+04e3810 osd: partial revert of "ReplicatedPG: result code not correctly set in some cases."
+f92f741 librbd: copy operation needs to use AIO work queue for writes
+ee7c6f7 librbd: simplify IO flush handling
+cb634df librbd: possible deadlock attempting to drain parent image WQs
+b118d7d WorkQueue: PointerWQ drain no longer waits for other queues
+5875345 tools/cephfs: use snprintf in Dumper
+5c2815e tools/cephfs: enlarge dump header
+2a3040b ceph-disk: remove the redundant try except and minor nits
+b954c51 tests: ceph-disk: add wait_for_osd_down() in ceph-disk-test.py of qa
+0f892e6 tests: ceph-disk: modify the ceph-disk qa test cases
+c110a63 tests: ceph-disk: improve the unit test to cover all deactivate/destroy functions.
+3823c31 ceph-disk: improve the device query stage of the deactivate/destroy feature.
+7e88cf0 tests: ceph-disk: add deactivate/reactivate/destroy test cases.
+f51dd57 tests: ceph-disk: add some unittest functions to cover the destroy/deactivate feature.
+56b3bbc ceph-disk: modify the destroy/deactivate behavior to handle the dmcrypt/mpath feature.
+d490fe9 tests: ceph-disk: Make unit tests cover the whole ceph-disk destroy/deactivate feature
+06aeec9 tests: ceph-disk: modified the destroy_osd test function.
+7922041 ceph-disk: Implement unittest for ceph-disk deactivate/destroy feature
+7b5151a ceph-disk: add --reactivate option, modify parameter about deactivate and destroy
+7ca8d1d ceph-disk: add `--mark-out` option on deactivate feature.
+be471a2 ceph-disk: use `ceph osd dump` to check osd status
+f064622 ceph-disk: add destroy feature
+3fcdf41 ceph-disk: add deactivate feature
+00a9ce7 tests: fix typo in TestClsRbd.snapshots test case
+2622993 (tag: v10.0.0) 10.0.0
+5aa840a rbd: support negative boolean command-line optionals
+6df48f8 os/newstore: disable rocksdb compression
+d885489 common: mirror leveldb default tuning w/ rocksdb
+ae516d7 mon: disabled rocksdb compression when used as the backend
+ba60bf0 os/fs/FS: fix zero()'s PUNCH_HOLE incantation
+2d92113 os/fs/FS: fix zero() return value on fallback
+eddb00b os/RocksDBStore: set up $path.wal -> $path symlink
+0dac747 os/newstore: distinguish between db open and create
+22c9310 os/newstore: remove newstore_db_path option
+9c0ae4b os/newstore: newstore_backend_options -> newstore_rocksdb_options
+7f07e1e os/newstore: set rocksdb default options
+f58ffdc tests: new rbd CLI command aliases
+1ff6889 rbd: add missing command aliases to refactored CLI
+37642a7 osd: scrub if load below daily avg and decreasing
+cf408a3 ceph_test_msgr: Use send_message instead of keepalive to wakeup connection
+b7df772 osd: randomize deep scrubbing
+511435f client: avoid creating orphan object in Client::check_pool_perm()
+f02a51f scrub: do not assign value if read error for ECBackend
+8bb61d3 scrub: do not assign value if read error for ReplicatedBackend
+a8b7464 osdservice: state changed to atomic_t to decrease thread context switches.
+b3a1290 osd: change mutex to spinlock to optimize thread context switching.
+7fc87e3 Update .mailmap
+08fd09a rbd: stripe unit/count set incorrectly from config
+481bb2c rbd: accept --user, refuse -i
+8629b61 tests: ignore test-suite.log
+f2432e0 rbd: recognize cephx_sign_messages option
+34b2b8f rbd: unbreak rbd map CLI
+c6a2ec2 tests: fix test case using new api
+2d2e6b2 osd: inline do_mon_report
+c131c81 osd: limit number of pg stat updates in flight
+093478a osd: fix pg_stats_queue lock protection
+e2756f9 osd: scale mon report interval with timeout backoff
+ae1cae0 osd: keep count of outstanding pg stat updates to mon
+75e28c4 osd: no stats outstanding when we reset the session
+17d2429 osd: remove old stats backoff mechanism
+facd36f osd: exponential backoff on pg stats ack timeout
+25888bb message/MLog: include seq in print
+56dbf7a osd/OSDMap: cache values for in, up osds
+28138c6 mon/PGMonitor: avoid iterating over all pgs to find stale
+a0204dc build/ops: rbd-replay moved from ceph-test-dbg to ceph-common-dbg
+fe012a2 tests: avoid bashism
+5281bdd rbd: hardcode application name into help
+e89f0e1 ceph-objectstore-tool: Add dry-run checking to ops missing it
+4666d5f ceph-objectstore-tool: Remove unused function (cleanup)
+e6f1bdc test: Remove redundant test output
+9a29c59 test: Verify replicated PG beyond just data after vstart
+92d00f2 test: Fix verify() used after import to also check xattr and omap
+cbaed6a test: Add test cases for xattr and omap ceph-objectstore-tool operations
+c549a8c kv/KineticStore: Fix broken split_key
+2b8c30d Update infernalis release notes
+d0e9c40 tests: restore run-cli-tests
+131deb3 SIGPIPE suppression for platforms without SO_NOSIGPIPE or MSG_NOSIGNAL
+c73e96a radosgw-admin: fix cli tests
+4ff0368 osdmaptool: fix cli tests
+a5b0465 crushtool: fix cli tests
+b7bb216 crushtool: fix cli test help
+0533cf9 osd: fix wrong use of right parenthesis
+ef011da Update .organizationmap
+0fd8de3 msg/async: support of non-block connect in async messenger
+785e58e scrub: clarify the result report
+a3aa565 journal: avoid holding lock while marking ops are complete
+4719696 cmake: updates for refactored librbd IO path
+10deea8 librbd: flush journal entries prior to releasing lock
+b515314 librbd: only erase IO events after they are marked safe
+5cf282a tests: librbd updates for C++11
+25072e4 tests: verify librbd journal entries are written to disk
+107bca7 librbd: treat flush op as a write op
+c67a85e tests: verify proper handling of librbd replay of IO journal events
+a1affd8 tests: updated librbd ImageWatcher listener signature
+b2eb5ab librbd: addressed possible race conditions / deadlocks from unit testing
+e34cd80 librbd: add replay support for IO events
+9259476 librbd: fix integration with journal library
+fc8b8d6 librbd: inform the journal when pending IO safely commits
+0039041 librbd: integrate cache with journal
+b10398b osdc: track journal commit tid within ObjectCacher for writes
+d480ad7 librbd: add support for removing the journal from disk
+a39fcf9 tests: add journal support to RBD test cases
+03636c9 librbd: initial interface with journal library
+3515e22 pybind/rados: return pool_reverse_lookup() result as a string
+763ca2a pybind/test_rbd: convert a few more str to bytes for py3
+d5bec87 pybind/rbd: encode snap_rename args for py3
+ddb1ee9 pybind/rbd: decode stat() and list_children() results for py3
+3c839b4 pybind/rbd: decode parent_info() to str types for py3
+1af2674 pybind/test_rbd: fix map() usage for py3 compat
+ea2a654 pybind/test_rbd: use // for division for py3
+f83f10a solaris big endian fixes
+6a3aedd error code translation logic
+b7cc9d4 Solaris compilation/build changes
+2a1b9f9 updated common library linked objects
+ecab8e1 cross platform pthreads
+41eff6b fix sem_open() call to be compliant
+4512ed5 import missing free function definition
+671ab30 stdin/stdout/stderr clash with standard definition, renamed to getter/setter
+615f8f4 ceph: Make stdout/stderr always output Unicode (UTF-8)
+0b3435f pybind: Add decode_cstr helper function
+801ea73 pybind: Add test for creating pool by raw UTF-8
+63f5462 pybind: Import cstr from the rados module
+47d9ca7 pybind: Don't encode str on Python 2
+be9d26f librbd: prevent concurrent image refreshes
+dcd3a65 librbd: added journal::EventEntry::get_event_type() helper
+50e2a6c librbd: removed ImageWatcher header namespace indentation
+44a49aa librbd: re-use common logic between aio_write/discard
+4966416 osdc: improved discard_set signature const correctness
+eb8d37a test: encoding: add new librbd journal event type
+7d5d1ae librbd: simplify watch/notify type encoding
+6f18f04 librbd: initial journal entry structures
+cc0ed06 tests: update librbd tests to support new AIO / lock handling
+75e26b2 librbd: ImageWatcher no longer maintains pending AIO op list
+75240f1 tests: updates for AIO locking changes
+ca83e4b librbd: consistent owner_lock handling for AIO paths
+6be6c3d librbd: simplify AioImageRequestWQ function signatures
+eea016d librbd: cleanup ictx_check's handling of owner_lock
+e0c9b88 tests: aio_write/aio_discard have been refactored
+9117778 librbd: refactor AIO request handling
+8aae868 librbd: AIO submissions processed by new AioImageRequestWQ
+b85a5fe librbd: rename AioRequest classes to AioObjectRequest
+4c33afb librbd: rename C_AioWrite to C_AioRequest
+3710c43 librbd: removed unused method declarations and definitions
+7d5323b librbd: add support for dynamically enabling/disabling journaling
+3a9b869 pybind/rbd.py: add new journaling feature code
+750771c librbd: add new RBD_FEATURE_JOURNALING feature code
+eb020b6 os: write file journal optimization
+102539e librbd: API: options on image create: update tests
+c3be44e librbd: API: options on image create
+4052282 cmake: add nss as a suffix for pk11pub.h
+0e5a794 librbd: provide an out-of-class definition for MAX_DESCRIPTION_OFFSET
+6c5d601 cmake: fix librbd and add src/journal
+460c74a mds: properly set STATE_STRAY/STATE_ORPHAN for stray dentry/inode
+1ce364b mailmap: Ubuntu Kylin name changed to Kylin Cloud
+0d684ad mailmap: sort files
+d911641 journal: update allocated tid when skipping committed entry in player
+0669cba use new API and fix some wrong flag callers
+628f69f save init flags to CephContext
+925596a osd: check do_shutdown before do_restart
+016ed34 rados: Minor output changes for consistency across operations
+3ea903e cmake: fix files list
+a1b690d cls::journal: fixup: constify dump functions
+0b261e2 journal: call metadata shutdown on journal remove
+0dd6e0f journal: don't use object_number when comparing positions
+b9c6ae8 journal: make commit and flush params configurable
+13bc7be journal: allow alternate pool for journal objects
+d4a14e0 journal: output operators for journal types
+a6d354c src/init-ceph.in: remove unused variables
+d4c0969 src/init-ceph.in: improve usage message
+c994c0a src/init-ceph.in: process command-line options using getopt
+ff8b06f rbd: RBD::clone2: fix tracepoint
+3f749b7 rbd: make config changes actually apply
+738d64c Doc: add temperature related stuff in documents and test scripts
+e578926 Mon: add temperature support for existing cache related commands
+ed7d879 Osd: add a temperature based object eviction policy for cache tiering
+595c1e5 tests: rbd/admin_socket: use xmlstarlet when parsing perf dump
+fbe5526 librbd: perf section name: use hyphen to separate components
+9d5bb61 Mon: expose commands for temperature related setting
+e629094 Osd: add two fields to pg_pool_t
+1f85545 RadosClient: result code overflowed
+b30d1b8 doc: Adding --cluster option to rbd man page.
+9ebea48 rbd: dynamically generated bash completion
+01c720b rbd: hidden 'bash-completion' command dumps all available commands
+6daa1eb tests: updated rbd CLI --image-feature optional
+c12f7d2 ceph_test_rados_api_tier: fix PromoteOn2ndRead for EC case
+1be02b1 rbd: corrected handling of '--image-feature' optional
+3e78b18 WorkQueue: new PointerWQ base class for ContextWQ
+b65ace2 mds: properly update scrubs_in_progress
+48186fc mds: re-use C_KickOffScrubs context
+afef58b mds: avoid potential double get CDentry::PIN_SCRUBQUEUE
+6e370bb mds: call CDentry::scrub_finished() when skipping special dentry
+b65345e mds: remove dir dentry from scrubstack after finishing scrubbing dirfrags
+50c088b mds: properly call CDir::scrub_finished()
+dd5a263 librbd: perf counters might not be initialized on error
+08b4c29 PendingReleaseNotes: document updated rbd CLI options
+f4f1f57 mds: skip scrubbing remote linkage
+d548b5f mon: revert MonitorDBStore's WholeStoreIteratorImpl::get
+f018928 revise organization
+d290b27 osd: trivial optimization
+d28698b osd: fix trivial bug
+f7f5a08 internal: remove unused local variables
+c8fe5ae librados: cast oid to object explicitly before calling ioctx methods. Signed-off-by: xie xingguo <xie.xingguo at zte.com.cn>
+e986ade IoCtxImpl: remove unused variable sName
+a5651b8 Revert 0374bb4a2f5054d606e4aba2d97b5e6765e781b0
+7496741 rgw: fix modification to index attrs when setting acls
+9689fe0 kv: fix string ctor usage
+bfeb90e librbd: fixed deadlock while attempting to flush AIO requests
+a9729d9 tests: new test case to catch deadlock on RBD image refresh
+d33842d tests: librbd: admin socket commands to flush and invalidate cache
+0996f9d librbd: flush and invalidate cache via admin socket
+39503f5 librbd: perf counter for cache invalidates
+3b39226 tests: fix typo in TestClsRbd.snapshots test case
+8ad594f tracing: fix librados signed/unsigned warnings
+057d39a os/osd: disable extra iterator validation
+117f40c os/KeyValueDB: don't call self.valid() from next() and prev()
+66b7b92 mon/MonitorDBStore: use single-key LevelDB::get() method
+a3f8891 os/DBObjectMap: use single-key LevelDB::get method
+76eb04a kv/LevelDBStore: simpler code for single-key fetches
+e184ca2 os/LevelDBStore: faster LevelDBTransactionImpl::set
+48ceaaf kv/RocksDBStore: do not Delete before Put
+1e3c2fa kv/LevelDBStore: do not Delete before Put
+338b4ed osd/ReplicatedPG: use bl-based setkeys/rmkeys
+402d181 tests: fix test_rados_tools.sh rados lookup
+fad3772 client: use null snapc to check pool permission
+f33dd76 librbd: start perf counters after id is initialized
+db85bdd FileStore: support multiple ondisk finish and apply finisher
+26befe1 cls_rbd: change object_map_update to return 0 on success, add logging
+f33282e doc/releases-notes: fix build error
+9224ac2 rbdmap: systemd support
+1b000ab rgw: fix reload on non Debian systems.
+78dbd13 ceph.spec.in: add new cls_journal RADOS class
+ea4971c journal: FutureImpl shouldn't hold lock while invoking callbacks
+ea275cc tests: journal updates to support C++11
+3f3be14 journal: updates to support C++11
+2097b4d tests: journal test no longer explicitly shuts down crypto
+87e3e05 journal: cleanup shutdown handling
+1acc688 librados: don't provide an invalid watch handle on error
+a68bf81 journal: fix race condition with unwatch on shutdown
+2d7f4d0 librados_test_stub: assert that no AIO ops are outstanding on destruction
+1faf96b tests: updated journal tests to support new commit API
+352194d journal: simplified commit position tracking
+bd9880a cls_journal: EntryPositions is now represented as a list
+5c2e9ea tests: fix journal test issues discovered via valgrind
+4d3969d journal: fix issues discovered via valgrind
+d85ea50 tests: update journal tests based on API changes
+6c28467 journal: JournalPlayer::process_state should support positive result
+651e4f7 journal: add method to remove journal from disk
+b97bc2b journal: signal playback complete via finisher thread
+a51d9d8 journal: complete flush context w/o holding locks
+4740a66 journal: Journaler::stop_append should be async
+0e1accd journal: avoid reentrant callbacks on Future
+09beb8e journal: add default constructor to Payload
+c1dd24e journal: new async flush method for whole journal
+cd3c378 journal: playback should notify when complete
+0b20876 journal: add default constructor to Future
+d8f7e96 journal: added additional callbacks for async ops
+8fcc38c tests: cls_journal: get_immutable_metadata is now async
+5f2ab50 cls_journal_client: get_immutable_metadata is now asynchronous
+e7d4f7b cls_journal: permit empty client id string
+00c123c tests: updated tests for journal trimmer modifications
+d2a9875 cls_journal: new convenience functions for ObjectSetPosition
+65f86fe journal: add support for trimming committed journal objects
+6bd8c0a journal: move rados_ctx_callback to utils namespace
+7233b8b tests: initial unit tests for new generic journal library
+adfc3c7 test/encoding/types.h: add journal::Entry to types
+debe172 journal: new generic journal implementation
+c11d71c librados_test_stub: added support for append op
+8270170 librados_test_stub: add support for new aio_notify API
+312d4e2 tests: new test for librados AIO notify API
+c9b873a librados: new AIO version of notify API
+8cee9a2 test/encoding/types.h: add cls::journal types
+050763f tests: new cls_journal_client test cases
+0f8d722 cls_journal_client: initial implementation of the cls journal client library
+dc19d57 cls_journal: initial implementation of journal class methods
+33aabf2 skip copyup if write is write_full
+40dcfe2 optimize clone write path if object-map is enabled
+8451522 tests: additional updates to cli/rbd cram test for updated CLI help
+610ddc1 rbd: add support for new 'snap rename' command
+a3b8e2d rbd: additional common Ceph command-line arguments
+b2f8363 rbd: don't print command help on error
+7ef01dc tests: update cli/rbd cram tests for refactored rbd
+b838ed1 qa/workunits/rbd: fixed incorrect stripe settings
+f955ee5 tests: fixed deprecated option in run-rbd-tests
+9652721 rbd: strictly enforce all command-line options
+038ed52 common: added getter for retrieving all configuration keys
+1b9661e rbd: switched rbd CLI over to refactored codebase
+c7f71d1 rbd: migrated existing command logic to new namespaces
+fa4e00f rbd: stub versions of all existing CLI commands
+c4b219a configure: check for boost regex library support
+77937ed rbd: support libraries for switching CLI processing to boost
+ba39d33 rbd: move rbd to tools/rbd subdirectory
+c0980af rbdmap: Move do_map and do_unmap shell functions to rbdmap script
+88e0b2c AsyncConnection: Let receiver ack message ASAP
+508bd87 librados: wrongly passed in argument for stat command
+619d804 FileStore::_check_replay_guard avoids double check on replaying and can_checkpoint(): already checked in _check_replay_guard, avoid the double check in the inner function _check_global_replay_guard
+c228bd2 [mailmap] add member info. Signed-off-by: Xiaowei Chen <chen.xiaowei at h3c.com>
+b0536eb librbd: fix enable objectmap feature issue
+2ac35be doc/release-notes: edits from Nathan Cutler
+6e87d23 doc/release-notes: final infernalis notes
+78c5b9a radosgw-admin: metadata list user should return an empty list when pool is empty
+e8fe4bc tests: concatenate test_rados_test_tool from src and qa
+da6825d test/test_rados_tool.sh: Add tests for the new bench's write options
+9259e6e tools/rados/rados.cc: Add options to choose the benchmark's write destination
+7524e16 tools/rados/rados.cc: Write to different destinations
+00c6fa9 Objecter: pool_op callback may hang forever.
+400b0f4 Build internal plugins and classes as modules
+d457fc2 mds: apply validate_disk_state to dirs too
+6ba5bef mds: tidy up cdir scrub_initialize in scrubstack
+1930083 mds: write scrub tag during validation
+ee82243 mds: Hook ScrubStack into CInode::validate_disk_state
+3aeda58 mds: implement ScrubHeader
+3eceaf8 mds: add "tag path" command in MDSRank
+5a3f3d0 mds: implement enqueue scrub in MDCache
+069cd09 CMake: update for ScrubStack
+be04a90 mds: initial ScrubStack implementation
+4a6e799 mds: CInode: create scrub_info_t and surrounding infrastructure
+c5934c1 mds: CDentry: create scrub_info_t and surrounding infrastructure
+a2e3657 mds: CDir: implement scrub_local() and call it in CInode::validate_disk_state()
+fc6bd33 mds: CDir: Implement scrub_dentries_scrubbing()
+23c38c8 mds: CDir: create scrub_info_t and surrounding scrub infrastructure
+2cbf07b mds: frag_t: add scrub stamp and version for recursive and local scrubs
+f8b2fb9 mds: inode_t: add scrub stamp and version for latest complete scrub
+1627b45 MDSContinuation: remove expectation that it's using an MDR
+b789edd mdstypes: dentry_key_t: add an is_valid() function
+e09e548 mds: CDir: rearrange constructor
+bb2ecea (tag: v9.2.0, origin/infernalis) 9.2.0
+da48dbb rbd: fix clone issue when we specify image feature
+a603429 tests: test/librados/test.cc must create profile
+d5be20b librbd: resize should only update image size within header
+47abab9 tests: destroy testprofile before creating one
+ab46d79 tests: add destroy_ec_profile{,_pp} helpers
+629c41f ceph.spec.in: We no longer need redhat-lsb-core
+e382c67 init-rbdmap: Rewrite to use logger + clean-up
+5a6117e Objecter: remove redundant result-check of _calc_target in _map_session.
+8655416 Objecter: potential null pointer access when do pool_snap_list.
 b9ac90d osd/PG: tolerate missing epoch key
+c9681fd osd: merge local_t and op_t tnx to single one
+43ba820 mon:honour last seen election epoch in win_standalone_election()
+a341d97 ceph.in: Notify user that 'tell' can't be used in interactive mode
+92e0201 doc: update rpm links to download.ceph.com
+caa4780 doc: update debian links to download.ceph.com
+d712737 test: osd-scrub-snaps.sh uses ceph-helpers.sh and added to make check
+310bf78 osd: Use boost::optional instead of snap 0 for "all_clones"
+f508ddc osd, test: When head missing a snapset, clones not an error
+2e0bb0a osd, test: Keep missing count and log number of missing clones
+8227b4b test: Eliminate check for bogus "obj13/head snaps empty" error
+c6d283f ceph-objectstore-tool: Add new remove-clone-metadata object op
+eb0ca42 osd: Fix trim_object() to not crash on corrupt snapset
+caf2d59 ceph-objectstore-tool: Improve object spec error handling
+b6302ac ceph-objectstore-tool: Add undocumented clear-snapset command for testing
+138a33b ceph-objectstore-tool: Add set-size command for objects
+1688deb ceph-objectstore-tool: Enhanced dump command replaces dump-info
+b4ba3e6 test: Add some clones to ceph-objectstore-tool test
+d276d32 ceph-objectstore-tool: For corrupt objectstores, don't abort listing on errors
+b0c884b ceph-objectstore-tool: Improve some error messages
+0564f39 ceph-objectstore-tool: White space fixes
+9222f56 tools/rados: Improve xattr import handling so future internal xattrs ignored
+e5ad33e test: Test scrubbing of snapshot problems
+e0b3965 osd: Don't crash if OI_ATTR attribute is missing or corrupt
+9e48e18 osd: Additional _scrub() check for snapset inconsistency
+3b381ca osd: Better SnapSet scrub checking (find issues instead of asserting)
+a23036c osd: Make the _scrub routine produce good output and detect errors properly
+e0fd540 rgw:swift use Civetweb ssl can not get right url
+b698a76 rgw: Fix typo in RGWHTTPClient::process error message
+173bfd0 rgw: link against system openssl (instead of dlopen at runtime)
+8160af6 tools: ceph-monstore-update-crush: add "--test" to crushtool
+83afe15 test: ceph-disk: coverage list_format_dev_plain() new behavior.
+6253aea FileJournal:_fdump wrongly returns if journal is currently unreadable.
+39fb7f1 messages/MOSDOp: Cast in assert to eliminate warnings
+3047b56 rgw: Add default quota config
+570285b ceph-disk: get NoneType when ceph-disk list with --format plain on a single device.
+f22f4ac mailmap: Xie Xingguo affiliation
+93ec538 crush/mapper: ensure take bucket value is valid
+976a24a crush/mapper: ensure bucket id is valid before indexing buckets array
+4300f2a krbd: remove deprecated --quiet param from udevadm
+f46f7dc run_cmd: close parent process console file descriptors
+6f960fd rgw: add x-amz-request-charged header
+7fcd423 osd: check OSDSuperblock in mkfs() when it already has a superblock
+e684e42 osd: test mkfs failure when the osd tries to do mkfs again.
+f141444 tests: port uniqueness reminder
+eea4026 common: fix reset max in Throttle using perf reset command
+deb41b3 FileStore: remove unused local variable 'handle'
+3b5d54b ReplicatedPG: result code not correctly set in some cases.
+ace7dd0 FileStore: potential memory leak if _fgetattrs fails
+f119fb5 doc: download GPG key from download.ceph.com
+ed88d88 doc/release-notes: v0.94.5
+284f4df release-notes: draft v0.94.5 release notes
+dd31d4a rocksdb: remove rdb source files from dist tarball
+1e4b37d release-notes: draft v0.80.11 release notes
+ce95ce1 tools: ceph-release-notes support multiple issues
+a704c5d vstart.sh: grant full access to Swift testing account
+56d6929 KeyValueStore: fix the name's typo of keyvaluestore_default_strip_size
+545e4b2 osd: Fix log message name of ceph-objectstore-tool
+631469c Revert "Speed optimizations. Merged 3 writes into 1."
+45ab728 osd: only calculate op crush mapping if we don't have the PG
+56ba90f osd: move misdirected op check from OSD thread to PG thread
+6528563 osd: ensure op rwm flags are checked before they are initialized
 5c49192 osd: fix OSDService vs Objecter init order
 1560057 ceph.spec.in: We no longer need redhat-lsb-core
 c567341 init-rbdmap: Rewrite to use logger + clean-up
+ebfd750 ReplicatedPG: remove unused local variables
+f4906a1 tests: ceph-disk workunit uses configobj
+163de5b tests: ceph-disk workunit uses the ceph task
+c4fdbdd cmake: Use uname instead of arch. arch is deprecated in util-linux and coreutils does not install it by default.
+03e556b doc: Removed the NOTE section about non-LTS supported distributions
 58414c5 librbd: potential assertion failure during cache read
 011e9e5 tests: reproduce crash during read-induced CoW
 2a6b90f doc/release-notes.rst: recovery isn't in the unified queue yet
+9bf21ee doc: Updated the OS recommendations for newer Ceph releases
+ea52014 rgw: support core file limit for radosgw daemon
+ee4db81 mailmap: Jason Dillaman name normalization
+5449b3d mailmap: Joao Eduardo Luis name normalization
+759f3e8 mailmap: Min Chen name normalization
+4026efb mailmap: James Page name normalization
+4ca04a0 mailmap: Ken Dreyer name normalization
+7b6958b mailmap: tobe affiliation
+2a926de mailmap: Ruben Kerkhof affiliation
+cf45031 mailmap: Brian Andrus affiliation
+a2b33d0 mailmap: Kadu Ribeiro affiliation
+6e8e90e mailmap: Jeff Epstein affiliation
+f68d554 mailmap: Jeff Weber affiliation
+38cc18e mailmap: ritz303 affiliation
+dd24707 mailmap: chenji affiliation
+249828a mailmap: Burkhard Linke affiliation
+ed56b73 mailmap: Sylvain Baubeau affiliation
+fbcec33 mailmap: Siddharth Sharma affiliation
+d531dd7 mailmap: Milan Broz affiliation
+602c9ef mailmap: Matt Benjamin affiliation
+f07575b mailmap: Casey Bodley affiliation
+30bf4dd mailmap: Ali Maredia affiliation
+0ba0c0d mailmap: Brad Hubbard affiliation
+38b29e1 mailmap: Juan A. Suarez Romero affiliation
+d54b918 mailmap: Xiaowei Chen name normalization
+c905cc1 mailmap: Wei Qian affiliation
+c70fe11 mailmap: Li Peng affiliation
+ca6f4e9 mailmap: Robin H. Johnson affiliation
+51440f7 mailmap: Ren Huanwen affiliation
+3d36df3 mailmap: Yehua Chen name normalization
+ca1ba57 mailmap: Xinze Chi name normalization
+bec8bc7 mailmap: Xie Xiexingguo affiliation
+b66f53f mailmap: Xie Rui name normalization
+470158e mailmap: Sangdi Xu name normalization
+cb64c9f mailmap: Radoslaw Zarzynski name normalization
+220fcef mailmap: Haomai Wang name normalization
+96106b5 os/chain_xattr: On linux use linux/limits.h for XATTR_NAME_MAX.
+57af631 LFNIndex: remove redundant local variable 'obj'.
+dc21d8e rgw: add explicit success/error paths in RGWGetObj::execute()
+3d2ed6f mailmap: Dennis Schafroth affiliation
+ffd4f2a mailmap: Daniel Gryniewicz affiliation
+16e90c5 mailmap: Bo Cai name normalization
+7b2e9fc ceph.in: Remove unused variable
+113d727 ceph.in: Don't drop out of command mode on certain kinds of errors
+bb5bcab makefile: For ceph command generation don't append another copy of ceph.in
+597c43e tracing: add tracepoints for cache pin/unpin
+4783899 osd: return ENOENT when object doesn't exist for cache pin/unpin
+59d0de7 test/tier: add test for cache_pin
+36b2d3d test/tier: test flush/evict on pinned object
+c184218 osd/ReplicatedPG: reject flush/evict ops on pinned objects
+c502ea1 librados: add RADOS C interfaces for pin/unpin object
+a07d4c9 osd: only allow pin/unpin op on the cache tier
+0643845 osd: add num_objects_pinned in object_stat_sum_t
+f824c93 osd/ReplicatePG: skip flush/evict pinned objects
+5cd10e4 osd: force promotion when pinning an object in cache tier
+af8d6ec osd: add support of pin/unpin objects in cache tier
+28b7205 rados: add the support of pin/unpin object in cache tier
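The pin/unpin series above adds cache-tier object pinning end to end, from the OSD op to the rados CLI. A minimal usage sketch; pool and object names are examples:

    # pin an object in the cache tier so flush/evict are refused for it
    rados -p hot-pool cache-pin myobject
    # unpin to let the tiering agent flush/evict it again
    rados -p hot-pool cache-unpin myobject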
+19d0a59 Fix Makefile in example/librados file.
+904c0e9 pybind: Use basestring as string type for Python 2
+ab6b923 pybind: Add Python 3 support for rados and rbd modules
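A quick smoke test for the Python 3 bindings work above, assuming python3 is on PATH:

    # both modules should now import cleanly under Python 3
    python3 -c 'import rados, rbd; print(rados.__name__, rbd.__name__)'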
+0278f5f doc/release-notes: drop 0.94.4 plaintext
+5f8ba74 doc/releases: fix 0.94.4 link
+49d3367 doc/release-notes: final v0.94.4 notes
+cdcdd78 osd/ReplicatedBackend: add bl-based setkeys/rmkeys
+126ba59 os/MemStore: avoid STL map/set for omap_{setkeys,rmkeys}
+332481e os/newstore: avoid STL map/set for omap_{setkeys,rmkeys}
+7fc05b4 os/ObjectStore: helpers for validating map<string,string> and set<string> to bl
+6c50b33 os/ObjectStore: add bufferlist-based omap_setkeys() and omap_rmkeys()
+1019ec1 kv/KeyValueDB: add bufferlist-based set() and rmkeys() interface
+ca72d50 kv/LevelDBStore: make set() avoid bufferlist copy most of the time
+c9c9618 kv/RocksDBStore: make get() avoid bufferlist copy most of the time
+1b25ef8 buffer: make is_contiguous() const
+1f3c01b kv/RocksDBStore: implement single-item get()
+8874249 test/ObjectMap: add test for raw_key_is_prefixed
+709b111 os/KeyValueDB: reduce malloc/free/string copy count
+e1783d2 kv: move KeyValueDB from os/ to kv/, libos.a to libkv.a
+b271b25 break KeyValueDB dependency on ObjectMap
+79c43b9 test: use KeyValueDB directly (not LevelDBStore!)
+d1646e0 Makefile: link TrackedOp in libglobal
+b22690e os/Makefile: build rocksdb with PORTABLE=1
+d613590 test/libcephfs/flock: add sys/file.h include for flock operations
+c7f75b8 mon: should not set isvalid = true when cephx_verify_authorizer returns false
+91497e4 Speed optimizations. Merged 3 writes into 1. Got rid of std::string construction. More unification on syslog, stderr, fd.
+f2a2345 osd: remove useless code.
+92454c2 osd: optimize get_object_context.
+cc0fcba test_rgw_admin: musl libc defines stdout as read-only. Use freopen for output redirection.
 c7d96a5 osd: init objecter after we authenticate
+d6803b8 drop envz.h includes
+5e81140 client: sys/file.h includes for flock operations
+e138e78 common/MemoryModel: Alpine is a linux variant but does not implement mallinfo(). Added explicit feature check.
+c40754b compat: use prefixed typeof to support stricter environments
+4f7bcab assert: __STRING is not defined by musl libc. Define __STRING when it is missing.
+1ff3870 libcephfs: Improve portability by replacing loff_t type usage with off_t. 64-bit behavior on glibc is enforced with __USE_FILE_OFFSET64 compiler error.
+abde034 rocksdb: 4.1
+ba0d2c3 rocksdb: build rocksdb with its own Makefile
+c1e4429 Makefile: link libos.a statically (no .la)
+e8614f8 Makefile: link mon statically (not .la)
+f86fbdb Makefile: make libosd.a static (not .la)
+e10301b librados_test_stub: add missing headers
+a099270 osd/tools: new and delete ObjectStore::Transaction in a function is not necessary
+212157a doc: Renamed the "Create a Ceph User" section and added verbiage about the "ceph" user
+e826213 osd: off-by-one when checking deep scrubbing
+fde458a test: add integration test for the auto repair feature
+8c8e1b7 pg: add auto-repair for EC pool
+1079636 pg: only queue for recovery if there are any objects to repair after scrubbing
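The three scrub commits above add auto-repair plumbing for EC pools; a hedged sketch of flipping it on at runtime (the option name is an assumption based on the feature, not confirmed by this log):

    # assumed config switch; off by default so scrub never repairs silently
    ceph tell osd.* injectargs '--osd_scrub_auto_repair=true'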
 7673845 osd/PG: make upgrade() use sequencer
 52b79f7 Revert "os/FileStore: require upgrade to hammer before moving beyond"
 41c9466 Revert "osd: require an upgrade to hammer first"
@@ -15,9 +623,18 @@ de97840 Revert "osd: drop support for pre-hammer pg metadata"
 dff5783 Revert "ceph-objectstore-tool: drop support for pre-pgmeta PGs"
 9446770 Revert "os: drop deprecated collection_* attr methods"
 0f1b1f0 Revert "os/FileStore: fix version check"
+c560020 rgw/rgw_main: Added compat header for TEMP_FAILURE_RETRY
+b15c541 libcephfs: only check file offset on glibc platforms
+07e7496 osd: Add config option osd_read_ec_check_for_errors for testing
 661e2a0 qa: remove legacy OS support from rbd/qemu-iotests
+6a91101 doc/release-notes: v9.1.0
+7f337a7 osd/PGLog.h: reorder bool fields in PGLog struct
+e4b8600 rgw: Handle x-amz-request-payer in pre-signed urls
+f9c44ef osd: drop the interim set from load_pgs()
 1fb9fc9 librbd: fix rebuild_object_map() when no object map exists
 fb62c78 ceph_context: remove unsafe cast for singletons
+24740a7 client: drop prefix from int types
+65d0fc4 doc: fix outdated content in cache tier
 477bb06 ceph.spec.in: only run systemd-tmpfiles on ceph run directory
 40336fa CMake: fix rbd_replay error
 0009f34 osd: conditionally initialize the tracepoint provider
@@ -29,7 +646,13 @@ b3d02cc tracing: dynamic tracepoint provider helper
 a7ed8e1 packaging: add new tracepoint probe shared libraries
 f4feee2 ceph.spec.in: add new tracepoint probe shared libraries
 4a5305e lttng: move tracepoint probes to dynamic libraries
+7cd12cd CMake: add vstart convenience targets
+e6c7eb7 CMake: make mon and osd depend on EC plugins
 b61f3e4 osd: fix the snapshot reads of evicted tiering pool
+e26469e mailmap: Alexander Chuzhoy affiliation
+fee7144 rgw: fix response of delete expired objects
+2cf8d20 update radosgw-admin command
+4a3f375 vstart: set cephfs root uid/gid to caller
 7060a3b doc/infernalis: hate hate
 e6a9e62 doc/release-notes: i hate rst
 e98408d doc/release-notes: final infernalis notes
@@ -37,14 +660,23 @@ b105449 doc/release-notes: fix some attributions
 e9f200c doc/release-notes: infernalis notable changes
 638738f Revert "common, global: use lttng ust functions for handling fork-like calls"
 fca97db rgw, doc: remove remark for lack of custom account metadata of Swift.
+b4c5620 doc: remove toctree items under Create CephFS
 3be81ae 9.1.0
 036d36f debian/control: python-setuptools is a build dependency
 8e59595 doc/release-notes: 9.1.0
+1deb31d Init crush_location in Objecter from config file.
+303263d os: add a field indicating an xattr has only one chunk for set xattr.
+65064ca OSD: reset primary and up_primary fields when beginning a new past_interval.
 8855e60 ReplicatedPG::maybe_handle_cache_detail: always populate missing_oid
 da4803e ReplicatedPG::_rollback_to: handle block on full correctly
+be35ea9 release-notes: draft v0.94.4 release notes
 2b7ddde osd: Correct the object_info_t::decode() version
 03078ba rgw: location constraints should return api name
 a077301 mon/OSDMonitor: put crushtool error in log
+0bf2a79 messages/MOSDOp: fix reqid encoding/decoding
+6f6fe39 messages/MOSDOp: decode complete message for v6, too.
+e0cf25f Fix debug message in osd::is_healthy
+f276308 ceph-fuse.cc: While starting ceph-fuse, start the log thread first
 d36d7f2 ReplicatedPG: allow maybe_handle_cache to return status detailing what happened
 68c722c pybind/rados, get_omap_vals: Fix pydoc type.
 5a6e762 test: pybind/test_rados: add binary data.
@@ -56,12 +688,22 @@ d689db8 cls: new force-promotion flag for class methods
 6eca7d0 librados: restored pre-infernalis API compatibility
 cac1d6f buffer: restored pre-infernalis API compatibility
 030f697 rgw: orphan tool shouldn't clean up head objects
+453698f messages/MOSDOp: cleanup
+d09cdae rgw: Check request-payer configuration
+520c4bd rgw: Allow to set the requestPayment configuration
+f2a31ab rgw: Add requestPayment retrieval
 8f28913 rgw, doc: mention that Swift objexp is supported now.
 7250db6 CephxServiceHandler.cc: fix get_auth conditional
 1a2689f ReplicatedPG::maybe_handle_cache: do not promote before checking full
 e0d8cb1 tests: removed obsolete rbd_replay test cases
 c2a83d0 ceph-dencoder: new rbd_replay trace file types
 3ecdae8 rbd-replay: added version control to trace output file
+e692773 rgw: add support for skipping manifest parsing during GET on Swift object.
+a52383d client: don't mark_down on command reply
+1e57e6d mds/Session: use projected parent for auth path check
+116bc83 ceph_test_libcephfs: parse env properly (access)
+9489359 ceph_test_libcephfs: parse CEPH_ARGS properly
+21236ac release-notes: draft v0.94.4 release notes
 646e50a rbd-replay-prep: added --verbose command line option
 98f513a rbd-replay-prep: stream events to the prep file
 65fb1b8 rbd-replay-prep: simplify IO dependency calculation
@@ -75,10 +717,15 @@ e049de3 os/FileStore: kludge sloppy hammer temp objects into temp collection
 d258bf5 ceph.spec.in: drop MY_CONF_OPTS
 468c2dd doc: remove mention of --lazy-remove from radosgw-admin manpage
 98cbf03 osd/PG: fix generate_past_intervals
+5e9cf8e doc/release-notes: fix math error in firefly notes
 e675400 librbd: invalidate object map on error even w/o holding lock
 bc48ef0 selinux: Fix man page location
 378d56d man/Makefile-server.am: conditionalize make ceph_selinux manpage
 fb50ff6 mon: do not remove proxied sessions
+0d1cab4 test: add TestSessionFilter
+be3c4a8 mds: implement filtered "session ls" tell command
+47a1816 mds: call through to MDSRank in handle_command
+4af1764 mds: add auth_name to session_info_t
 1045291 ceph.spec.in: remove comments regarding ceph UID/GID in SUSE
 800d974 ceph.spec.in: enable OBS post-build-checks to find systemd-tmpfiles
 498578d etc/sysconfig/ceph: add CEPH_AUTO_RESTART_ON_UPGRADE
@@ -95,6 +742,8 @@ bf9ca1e ceph.spec.in: lttng in SLES12 only
 50567b4 ceph.spec.in: drop %insserv_prereq (obsoleted by systemd)
 c84722a ceph.spec.in: fix boost-random build dependency for SLE/openSUSE
 d0ecb0a doc/release-notes: initial v9.1.0 notes
+ff6223f CMake - Fix perftools checks
+fe54ef8 CMake - fix yasm check
 a6a6923 osdc/Objecter: send FULL_TRY and FULL_FORCE ops despite full flag
 8201f0e mon: allow ping through despite synch/quorum status, with session
 5008da2 mon: drop ops on closed sessions early
@@ -103,12 +752,17 @@ a6a6923 osdc/Objecter: send FULL_TRY and FULL_FORCE ops despite full flag
 6cf34a3 mon: drop any ops from closed sessions in dispatch_op
 a875826 mon: always set up session; move waitlist logic
 e2e1bd9 mds: avoid emitting cap warnings before evicting session
+8e930e3 messages/MOSDOp: avoid uninit/undecoded fields in print()
 362b18a mon: fix msg leak in resend_routed_requests
 c9dad52 Mon: Fix decoded message leak when this monitor is leader
+4698e24 tests: allow docker-test.sh to run under root
+a1f76ee tests: remove fedora docker files
 3ed25c1 librados: document new flag
 929e5d0 ceph.spec.in: correctly declare systemd dependency for SLE/openSUSE
 8d8fcee osd/ReplicatedPG: exempt MDS from the failsafe check, too
 81c2374 rgw: improve handling of already removed buckets in object expirer.
+662ad52 release-notes: draft v0.94.4 release notes
+2228c22 tools: ceph-release-notes handle multiple issues
 b915952 ceph.spec.in: Do not always restart the daemons on removal
 c95c14b ceph.spec.in: Do not always restart the daemons on upgrades
 b20a1ba ReplicatedPG: consider IGNORE_CACHE for all maybe_handle_cache calls
@@ -119,13 +773,124 @@ ea93ead osd: return -EDQUOT instead of -ENOSPC if it is a pool quota
 e86d033 osdc/Objecter: distinguish between multiple notify completions
 049ea70 osd: reply to notify request with our unique notify_id
 0f9dca4 install-deps.sh: openSUSE-release/sles-release/sled-release are always present
+b6b9e85 ceph_test_libcephfs: remove remaining pool name assumptions
+ac03144 client: clarify setattr forced sync behavior
+5272416 qa/workunits/fs/test_auth_caps: superseded by ceph_test_libcephfs
+283be86 ceph_test_libcephfs: fix LibCephFS.OpenLayout test
+9b48e24 unittest_mds_authcap: improve user tests
+e54ff35 mds: drop MAY_CREATE
+96ee6c9 mds/MDSAuthCap: verify the caller_gid is valid
+614e4cf mds: fix chown/chgrp check and tests
+d473732 mds/MDSAuthCaps: rename args for is_capable
+a6fe4ae ceph_test_libcephfs: add AccessTest.User
+371ed4b mds/SessionMap: fix MAY_CREATE check
+c77450b ceph_test_libcephfs: make foo, path tests use unique paths, users
+7569239 qa/workunits: drop bash path tests
+aed5cad mds/SessionMap: fix check_access for stray inodes
+fa43c6f mds/SessionMap: move Session method definitions together
+d76093b mds/Locker: do not ack from do_cap_update
+269e2de ceph_test_libcephfs: fix update-after-unlink test
+a43f5c7 mds/Server: skip auth check on session-less mdr's
+1957aed client: do sync setattr when caller != last cap dirtier
+2df9bfb client: consolidate client_mount_{uid,gid} and client_{user,group}_id options
+4867ef0 client: add get_{uid,gid} helpers for consistent uid/gids
+43f50c7 add caps_dirty to setattr
+c5e9d69 unittest_mds_authcaps: fix a few unit tests
+56bece2 fix test_path_caps
+10295e9 doc:fix path-based restriction
+548e34c test/libcephfs/access: add update_after_unlink test
+58a6f7c test/libcephfs/access: add ReadOnly restriction test
+073e92a test/libcephfs/access: add Path restriction test
+ddc69bb client/Client: added client_mount_uid and gid as parameters to getattr call
+7b7e2c2 mds/MDSAuthCap: fix creation ownership check
+868a871 mds/MDSAuthCaps: whitespace
+5ed6625 mds/MDSAuthCaps: only verify create gid when not AUTH_UID_ANY
+9056a48 mds: send cap flush ack even when access check failed.
+0cb3616 client: force setattr to MDS when caller's {uid,gid} are not the specified ones
+74e05c6 client: allow specifying default caller_{uid,gid} of MClientRequest
+c6ab8de client: add options to specify caller_{uid,gid} of MClientCaps
+785b21c messages: add caller_{uid,gid} to cap msgs
+35f039e test/libcephfs/access: expand example test a bit
+f0a418d mds/Server: fix check_access
+8d78d5c client: properly set caller_{uid,gid} of readdir request
+aea8a0e ceph_test_libcephfs: skeleton for access tests
+b71a9c4 mds: fix Server::check_access
+6a6c068 doc/cephfs: path-based restriction
+b70d70b add check_access in _do_cap_update
+d0b5a33 move _check_access to SessionMap
+6f2ac9c add _check_access for async cap updates
+6cd7306 add stray_prior_path for is_stray
+4c896c7 add stray_prior_path to store path before rename
+d12a388 mds/MDSAuthCaps: add test cases for is_capable
+b3fdbb6 mds/Server: add chown and chgrp check access to setattr
+be7eb67 mds/Server: add create access check for openc
+62b9502 Server: add create access check for mknod and symlink
+a4d5c3b test_auth_caps: add mkdir check with mode 557
+88d7478 test_auth_caps: resolve bug with other bits test case
+00d7480 test_auth_caps: remove grp mount
+4f71b11 MDSAuthCaps: validate create access
+17c758b add stray_prior_path to store path before unlink
+e33cd74 add open check_access
+ea94bc4 add link check_access
+8a29c4e add rename check_access
+991d340 add snaps(ls,mk,rm,rename) check_access
+fb9c379 add setlayout, setdirlayout, setxattr check_access
+74b140a add readdir check_access
+9aa6128 mds/Server: clean up check_access a bit
+5b318aa MDSAuthCaps: add logic for group bits check
+1aaee87 test_auth_caps: add test for user bits
+7293540 mds/MDSAuthCaps: add permissions for user bits
+d5ebb02 Makefile: include ceph-fuse in base target
+807d369 client: behave if we can't getattr parents of mount point
+4ce4b58 test_auth_caps: Move trap and cleanup to the top
+3c4eb6a test_auth_caps: Added test logic for world bits
+5f5cf95 MDSAuthCaps: add world bits check logic
+0bb8210 client: pass uid, gid to lookup
+9bc2a01 add unlink, rmdir check_access test
+3b42d57 add unlink, rmdir check_access
+3d2e604 add symlink test
+7f35ae4 add symlink check_access
+5a19886 Add mknod check_access
+74b8f0c qa/workunits/fs/test_path_caps: tolerate existing directories
+eef2028 functional test of mds cap path restriction
+dc199f4 mds/Server.cc: drop leading / from path in is_capable check
+b07e015 unittest_mds_authcap: test lists of allow grants
+3e781f2 mds: mkdir check_access
+f5cb4e0 mds/MDSAuthCaps: fix parse error message
+6c0ebae mds: whitespace
+425ff66 mds: log to cluster log if mds cap parse fails
+a375834 mds/MDSAuthCaps: debug is_capable
+7c1614a mds/MDSAuthCap: fix debug prefix
+622fe9f mds/MDSAuthCap: drop leading / in paths
+0b557d5 mds/MDSAuthCaps: fix allow_all
+6f60c6d mds/MDSAuthCaps: add cct for debug context
+51f1028 mds: calculate path in check_access()
+e24a9cb mds/MDSAuthCaps: normalize path, drop useless constant.
+65eaf84 mds/MDSAuthCaps: pass down inode uid.gid and mode
+f8d4d80 mds/Server: add a few access checks
+a3f7f5a mds/Server: add check_access() hook
+c320bde mds/MDSAuthCap: fix uid and gid types
+68621e9 mds/MDSAuthCaps: cosmetic
+d0e4fae mds/MDSAuthCaps: use bitmask for is_capable()
+1b0a82b mds/MDSAuthCap: fix path match
+63c29ad mds/MDSAuthCaps: move allows() into MDSCapSpec
+1d82ec4 mds/MDSAuthCaps: parse optional gid list
+57a1860 mds/MDSAuthCaps: whitespace
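The long MDSAuthCaps series above implements path-based cap restriction for CephFS; a minimal cap sketch in the 'path=' syntax the series introduces (client name and path are examples):

    # grant a client read/write access restricted to the /projects subtree
    ceph auth get-or-create client.restricted \
        mon 'allow r' osd 'allow rw' \
        mds 'allow rw path=/projects'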
+02113ac init-rbdmap: fix CMDPARAMS
 04c09ac mds: fix SnapServer crash on deleted pool
+2bb3d4b docs: Fix styling of newly added mirror docs
 0ce7491 bugfix: should call md_config_t::remove_observer on shutdown
+6b1e4a6 cleanup: make the pool setting GET description point to SET description
+ef59f7c doc: update doc with new pool settings
 a965378 ReplicatedPG: clearing a whiteout should create the object
 47f4a03 ceph-objectstore-tool: delete ObjectStore::Sequencer after umount
 f20f67e pybind/cephfs: fix DirEntry helpers
+7b1882f ceph.spec.in: correctly declare systemd dependency for SLE/openSUSE
 3f00042 rgw: set default value for env->get() call
+469d35f osd: init started to 0
 bba3ab3 mon: combine _ms_dispatch and dispatch
+612480b test/test_rados_tool.sh: implement regression test for bench verify crash
+0c8faf7 common/obj_bencher.cc: fix verification crashing when there's no objects
 e42c9aa ceph.spec.in: re-re-drop fdupes
 566c872 os/fs: fix aio submit method
 d7b620f ECBackend::handle_recovery_read_complete: do not expose the hash_info when getting the obc
@@ -133,6 +898,8 @@ d7b620f ECBackend::handle_recovery_read_complete: do not expose the hash_info wh
 80b7237 qa/workunits/cephtool/test.sh: don't assume crash_replay_interval=45
 c5a9275 osd/ReplicatedPG: preserve (some) flags when proxying reads
 994ec60 mds: respect max_entries/max_bytes of lssnap request
+818d790 MOSDOp::decode: splitting message decoding, new version
+afcfb05 handle_op/do_op: Moving a couple of checks from dispatcher to parallelized workers
 2fea3a5 examples/librados/hello_world.cc:missing semicolon
 216eef5 Revert "osd: new pool settings: scrub intervals"
 04679c5 OSDMap: fill in known encode_features where possible
@@ -140,6 +907,7 @@ c7e905e ceph-create-keys: set mds "allow *"
 f1f14f1 erasure-code: shec must compare for equality with epsilon
 e52204c client: fix quota enforcement on subdir mounts
 15e19a4 client: refactor quota check functions
+e7f277b rgw/rgw_admin: check the legality of params; previously there were no messages when params were invalid, so add a check that reports invalid params.
 f1d8a8f Objecter: repeated free of op->ontimeout.
 0635b13 Objecter: may access wild pointer (op) in _op_submit_with_budget.
 482d4e5 AsyncConnection: Add new debug log
@@ -166,6 +934,7 @@ efdaa93 mds: fix error reformatting subtreemap_test events
 9c8200b librbd:reads larger than cache size hang.
 396702a build/ops: make dist needs files with names > 99 characters
 5f7b3f5 filestore: fix peek_queue for OpSequencer
+6334d64 rgw: mdlog trim add usage prompt
 c053499 osd/: eliminate unnecessary pg_hit_set_history_t::current_info
 f5359f2 osd: print min_last_epoch_clean along with pg dump
 ef909cc mon/Elector: do a trivial write on every election cycle
@@ -186,6 +955,8 @@ a7ce8f5 CMake - Add check for keyutils
 dcf647e CMake - fix check for NSS
 b02e0f9 CMake - fix libatomic_ops and gperftools checks
 3123b2c arch/arm: s/false/0/
+66d19c7 rgw: fix swift API returning incorrect account metadata
+cb2fc29 rgw: refuse to calculate digest when the s3 secret key is empty
 7e5980b rgw: improve convenience for key operate.
 36e4a80 ReplicatedPG::hit_set_setup: fix hit_set_remove_all call
 8e5a801 osd/: assert in HitSet constructor if type is TYPE_NONE as well
@@ -193,8 +964,10 @@ ef97305 cls_rgw: fix bucket listing when dealing with invisible entries
 d422f28 OSDService::agent_entry: don't use PG::operator<< without pg lock
 e17c8e1 init-radosgw: specify pid file to start-stop-daemon
 d18cf51 osd: fix requeue of replay requests during activating
+88cffd8 rgw: don't treat Content-Type as automatically dumpable metadata.
 4264358 erasure-code: workaround i386 optimization bug with SHEC
 f4b55f4 journaler: detect unexpected holes in journal objects
+742906a rgw: fix wrong etag calculation during POST on S3 bucket.
 182676d tests: ceph-disk: workunit must fail when test fail
 0cf0e88 tests: ceph-disk: only install multipath on CentOS
 fb4dd7d tests: ceph-disk: inline run_osd
@@ -227,14 +1000,17 @@ c27b73f mon/OSDMonitor: respect NODOWN on osd failure checks
 ea97761 systemd: increase nproc ulimit
 9f89ae7 mon/PGMonitor: avoid useless register_new_pgs work
 2a01bbc mon: make all paxos-related timeouts relative to mon_lease
+5e2c665 scripts: release_notes can track original issue
 fd9ce66 osd/ReplicatedPG: tolerate promotion completion with stopped agent
 e65fb1b mds: adjust MDSRank::incarnation according to mdsmap
+d6b30de osd/ReplicatedPG: If the object exists and is not a whiteout, don't do touch for create.
 30810da osd: new pool settings: scrub intervals
 48db7b1 osd: new pool flags: noscrub, nodeep-scrub
 b97ae76 osd: make 'ceph osd pool get' work for all settable pool flags
 10235e3 osd: refactor setting write_fadvise_dontneed pool flag
 b41f574 Fix unnecessary at/near max target warning in ceph -s when using an EC pool. When calculating objects needing eviction, we use objects - hitset_achieve, so with max objects = 30000, ceph -s will warn at/near if hitset_achieve objects exist
 21a1e75 tests: update to match crushmap validation message
+937e4f8 rhel 5.9 librados fix, removed blkid from compilation, Fixes #13177
 4da6793 install-deps: enable python3
 170f9ad doc: do not promise backports to Dumpling
 a6f07e9 doc: remove mention of ceph-extra as a requirement
@@ -250,6 +1026,10 @@ e44d1e0 ceph.spec.in: Fix up (/var)/run/ceph creation
 aa238e5 crush/CrushTester: allow testing by ruleset
 4f553b0 Librbd: Fix incorrect metadata filter behavior
 3971274 mon: return size_t from MonitorDBStore::Transaction::size()
+52bbeb1 mds: allow the mds to dynamically set the optracker via asok.
+4aaa123 osd: allow the osd to dynamically set the optracker via asok.
+01f816a common/TrackedOp: make the Tracker dynamically controllable.
+7fe72a2 common/TrackedOp: should take ops_history_lock when accessing shutdown.
 26fee81 osd/ReplicatedPG: using hobject_t::get_snapdir.
 fd0a384 osd/ReplicatedPG: using hobject_t::get_head.
 a326bd9 osd/ReplicatedPG: remove the useless check in get_object_context.
@@ -262,10 +1042,15 @@ c1afc38 unsigned type is short for journal max_size,use uint64_t instead.
 f25b67f rgw: add a new error message for user conflict when using Admin Ops API
 490938e osd/: find_object_context: return obc for head for snapdir if non null
 bdb2fa2 mds: Make sure wanted_state of first MDSBeacon is MDSMap::STATE_BOOT
+c6e681a rgw: improve documentation for rgw_extended_http_attrs config opt.
+eb7c21f rgw: add support for printing generic attrs on Swift container.
+8c2b8b7 rgw: add support for printing generic attrs on Swift account.
 85bece7 new release key
+9d8fb1c rgw: camelcase also after dash in rgw_extended_http_attrs.
 6a24d31 libcephfs: fix calling init() then mount()
 e017aab CMake: fix libcephfs shared lib generation
 7182499 install-deps.sh: disable python3
+b8c5d5b rgw: clarify the error message when trying to create an existing user
 a825f68 client/MetaRequest: optimize func can_forward/auth_is_best.
 a195928 unify order limit
 f51afa6 client/MetaRequest: open w/ O_CREAT|O_TRUNC is write.
@@ -318,6 +1103,7 @@ adb8478 include/inline_memory: out-of-bounds read on unaligned memory
 0b03b32 tools: remove the local file when getting the map failed.
 c2a9764 mon: do not return ref to MonOpRequest::get_session() caller
 d99e689 mon: fix MonSession leak when waitlisting op
+d903df5 tools: removing duplicate references
 2f663d9 rgw: make radosgw-admin user rm idempotent
 fef7142 ceph: fix rename into sub-directory check
 40c3c85 mon: debug MonSession refs
@@ -361,6 +1147,7 @@ fa6e4ff osdc/Journaler: add write_ihont filed which record the write fadvise fla
 4b45e6d osdc/Objecter: pass extra_ops of read_full into read func.
 9b44fab osdc/Filer: make read/write support iohint flags.
 7c09e50 osdc/Objecter: make sg_read/write support iohint_flags.
+ffad282 rgw: modify the conditional statement in parse_metadata_key method.
 f4498f5 osd: the function osd::shutdown failed to lock.
 d7f1d70 memstore: don't encode/decode 'size_t'
 6359f3a cls_numops: don't include asm-generic/errno.h directly
@@ -395,9 +1182,11 @@ c4401ad test/Makefile-client: ship LibradosTestStub.h in tarball
 c503e97 rgw: include RequestId as part of the Error response
 94d84cc test: mon/mon-ping.sh: make sure 'ceph mon ping' works as expected
 6907778 ceph-objectstore-tool: add mark-complete operation
+567dd1e common: OpTracker age histogram calculation is not correct
 06147dd rgw: preserve all attrs if intra-zone copy
 293d12a test/Makefile.am: run mon/mon-scrub.sh as part of checks
 6ceb37d test: mon/mon-scrub.sh: port clashed with other tests
+8fd40e1 librbd: remove duplicate read_only test in librbd::async_flatten
 897f074 test_async_compressor.cc: prefer ++operator for non-primitive iterators
 9d9b305 os/KeyValueStore.cc: prefer ++operator for non-primitive iterators
 8810f8f SnappyCompressor.h: prefer ++operator for non-primitive iterators
@@ -409,6 +1198,7 @@ cddca59 interval_set: add lower_bound(T k) member function
 95bd3c2 test: Fix failure test to find message anywhere in stderr
 b968fb3 rados: Fix usage for "notify" command
 d741352 AsyncMessenger: add instance name in debug log when processing msg
+19a191e objects whose name does not contain '.' cannot be displayed when getting the bucket index.
 95685c1 rgw: add --reset-regions for regionmap update
 3ccc3bb librbd: diff_iterate needs to handle holes in parent images
 d5650c9 tests: new test case for librbd diff_iterate over discard extents
@@ -430,6 +1220,7 @@ e6fbe53 improve error handle of rbd metadata operation & format output
 82b0243 qa/workunits/post-file.sh: sudo
 bfe359a osd: dump full map bl at 20 when crc doesn't match
 351d957 doc: fix the typo in command example
+7080e0f Thread.h: disable copy constr and assignment op
 7d781f7 doc: 'ceph --admin-daemon ...' -> 'ceph daemon ...'
 404dd16 tests: base gmock class support for librbd
 e8749b2 librbd: support templating of ImageCtx for async state machines
@@ -491,6 +1282,13 @@ eb2993a osd/ReplicatedPG: create apply_ctx_stats() helper
 3626db4 rgw: don't copy delete_at attr, unless it's intra region copy
 a69a989 rgw: objexp shards index by key
 fa347d8 rgw: delete-at and delete-after also on obj put / copy
+14c400f add test for python binding
+8ca0f10 add snap rename in python wrapper
+a73ac73 add snap rename test in cls_rbd
+98a483c add snap rename unit test in test_librbd.cc
+2865de4 add unit test for snap rename in imagewatcher
+d139f35 add snapshot rename CLI
+6ce8b2a handle snap rename notify
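The snap-rename series above lands cls_rbd methods, ImageWatcher notify handling, a Python wrapper, and a CLI; a usage sketch with example names:

    # rename a snapshot in place; watchers of the image are notified
    rbd snap rename mypool/myimage@old-snap mypool/myimage@new-snap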
 65dcc2d osd: When generating past intervals due to an import end at pg epoch
 cabfe13 osd: check the length of the map before accessing the first element
 ddca321 rbd: add verbose error reporting to merge-diff tool
@@ -619,6 +1417,7 @@ ee20404 osdc/Objecter: optimize Objecter::tick.
 08296dc rados: make 'rados bench' support JSON format output. Fixes: #12864. rados bench adds '[--format json]' and '[-o | --output outfile]' support; the output option only takes effect in JSON format, so bench results can easily be used to draw performance graphs.
 f420fe4 mds: fix shutdown while in standby
 80f10e3 osdc/Objecter: remove useless code.
+bd80473 add snapshot rename methods in cls_rbd
 7cc963b osdc/Objecter: Don't forget call _op_cancel_map_check when cancel linger op.
 36b6271 osdc/Objecter: In _cancel_linger_op, it should make num_unacked/num_committed decrease.
 076bad9 ceph_test_rados_api_aio: add a test for aio_sparse_read
@@ -1246,6 +2045,7 @@ d171537 os/Memstore:Refactor collection_list_range and collection_list_partial
 9471bb8 Common/Thread: pthread_attr_destroy(thread_attr) when done with it. When a thread attributes object is no longer required, it should be destroyed using the pthread_attr_destroy() function. Destroying a thread attributes object has no effect on threads that were created using that object.
 e3147b8 rgw: segmentation fault when rgw_gc_max_objs > HASH_PRIME
 9420d24 rgw: the argument 'domain' should not be assigned when returning false
+2938fdd radosgw-admin: Create --secret-key alias for --secret
 cd4ac1c rbd: support size suffixes for size-based options
 d1735a4 rgw: rework X-Trans-Id header to be conform with Swift API.
 278a6ae qa: add fs layout case for stripe_size decrease
@@ -1670,6 +2470,7 @@ b7b1bf2 rgw: add minimum support for copy multipart part
 16ead95 qa: update pool quota test for internal retries
 dbcf2e4 Fixes: #12018
 67de12b Fixes: #12018 osd/OSD.cc: drop write if pool is full
+50b4bdc ceph.spec.in: fix libs-compat / devel-compat conditional
 6849274 osd: pg_interval_t::check_new_interval should not rely on pool.min_size to determine if the PG was active
 466b083 osd: Move IsRecoverablePredicate/IsReadablePredicate to osd_types.h
 53072b9 ceph.spec.in: do not run fdupes, even on SLE/openSUSE
diff --git a/Makefile.am b/Makefile.am
index 7ff3cf7..d572e14 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -57,6 +57,11 @@ check-local:: all
 		exit 1 ; \
 	fi
 
+# display the output of failed check_SCRIPTS after a failed make check
+export VERBOSE = true
+
+TESTS = $(check_SCRIPTS)
+
 check_SCRIPTS = \
 	src/test/run-cli-tests
 
diff --git a/Makefile.in b/Makefile.in
index 812100b..4419415 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -82,8 +82,8 @@ subdir = .
 DIST_COMMON = INSTALL NEWS README AUTHORS ChangeLog \
 	$(srcdir)/Makefile.in $(srcdir)/Makefile.am \
 	$(top_srcdir)/configure $(am__configure_deps) \
-	$(srcdir)/ceph.spec.in COPYING ar-lib compile config.guess \
-	config.sub install-sh missing ltmain.sh
+	$(srcdir)/ceph.spec.in test-driver COPYING ar-lib compile \
+	config.guess config.sub install-sh missing ltmain.sh
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
 am__aclocal_m4_deps = $(top_srcdir)/m4/ac_check_classpath.m4 \
 	$(top_srcdir)/m4/ac_prog_jar.m4 \
@@ -142,7 +142,7 @@ am__recursive_targets = \
   $(RECURSIVE_CLEAN_TARGETS) \
   $(am__extra_recursive_targets)
 AM_RECURSIVE_TARGETS = $(am__recursive_targets:-recursive=) TAGS CTAGS \
-	cscope distdir dist dist-all distcheck
+	cscope check recheck distdir dist dist-all distcheck
 am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP)
 # Read a list of newline-separated strings from the standard input,
 # and print each of them once, without duplicates.  Input order is
@@ -163,6 +163,209 @@ am__define_uniq_tagged_files = \
 ETAGS = etags
 CTAGS = ctags
 CSCOPE = cscope
+am__tty_colors_dummy = \
+  mgn= red= grn= lgn= blu= brg= std=; \
+  am__color_tests=no
+am__tty_colors = { \
+  $(am__tty_colors_dummy); \
+  if test "X$(AM_COLOR_TESTS)" = Xno; then \
+    am__color_tests=no; \
+  elif test "X$(AM_COLOR_TESTS)" = Xalways; then \
+    am__color_tests=yes; \
+  elif test "X$$TERM" != Xdumb && { test -t 1; } 2>/dev/null; then \
+    am__color_tests=yes; \
+  fi; \
+  if test $$am__color_tests = yes; then \
+    red=''; \
+    grn=''; \
+    lgn=''; \
+    blu=''; \
+    mgn=''; \
+    brg=''; \
+    std=''; \
+  fi; \
+}
+am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`;
+am__vpath_adj = case $$p in \
+    $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \
+    *) f=$$p;; \
+  esac;
+am__strip_dir = f=`echo $$p | sed -e 's|^.*/||'`;
+am__install_max = 40
+am__nobase_strip_setup = \
+  srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*|]/\\\\&/g'`
+am__nobase_strip = \
+  for p in $$list; do echo "$$p"; done | sed -e "s|$$srcdirstrip/||"
+am__nobase_list = $(am__nobase_strip_setup); \
+  for p in $$list; do echo "$$p $$p"; done | \
+  sed "s| $$srcdirstrip/| |;"' / .*\//!s/ .*/ ./; s,\( .*\)/[^/]*$$,\1,' | \
+  $(AWK) 'BEGIN { files["."] = "" } { files[$$2] = files[$$2] " " $$1; \
+    if (++n[$$2] == $(am__install_max)) \
+      { print $$2, files[$$2]; n[$$2] = 0; files[$$2] = "" } } \
+    END { for (dir in files) print dir, files[dir] }'
+am__base_list = \
+  sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \
+  sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g'
+am__uninstall_files_from_dir = { \
+  test -z "$$files" \
+    || { test ! -d "$$dir" && test ! -f "$$dir" && test ! -r "$$dir"; } \
+    || { echo " ( cd '$$dir' && rm -f" $$files ")"; \
+         $(am__cd) "$$dir" && rm -f $$files; }; \
+  }
+am__recheck_rx = ^[ 	]*:recheck:[ 	]*
+am__global_test_result_rx = ^[ 	]*:global-test-result:[ 	]*
+am__copy_in_global_log_rx = ^[ 	]*:copy-in-global-log:[ 	]*
+# A command that, given a newline-separated list of test names on the
+# standard input, print the name of the tests that are to be re-run
+# upon "make recheck".
+am__list_recheck_tests = $(AWK) '{ \
+  recheck = 1; \
+  while ((rc = (getline line < ($$0 ".trs"))) != 0) \
+    { \
+      if (rc < 0) \
+        { \
+          if ((getline line2 < ($$0 ".log")) < 0) \
+	    recheck = 0; \
+          break; \
+        } \
+      else if (line ~ /$(am__recheck_rx)[nN][Oo]/) \
+        { \
+          recheck = 0; \
+          break; \
+        } \
+      else if (line ~ /$(am__recheck_rx)[yY][eE][sS]/) \
+        { \
+          break; \
+        } \
+    }; \
+  if (recheck) \
+    print $$0; \
+  close ($$0 ".trs"); \
+  close ($$0 ".log"); \
+}'
+# A command that, given a newline-separated list of test names on the
+# standard input, create the global log from their .trs and .log files.
+am__create_global_log = $(AWK) ' \
+function fatal(msg) \
+{ \
+  print "fatal: making $@: " msg | "cat >&2"; \
+  exit 1; \
+} \
+function rst_section(header) \
+{ \
+  print header; \
+  len = length(header); \
+  for (i = 1; i <= len; i = i + 1) \
+    printf "="; \
+  printf "\n\n"; \
+} \
+{ \
+  copy_in_global_log = 1; \
+  global_test_result = "RUN"; \
+  while ((rc = (getline line < ($$0 ".trs"))) != 0) \
+    { \
+      if (rc < 0) \
+         fatal("failed to read from " $$0 ".trs"); \
+      if (line ~ /$(am__global_test_result_rx)/) \
+        { \
+          sub("$(am__global_test_result_rx)", "", line); \
+          sub("[ 	]*$$", "", line); \
+          global_test_result = line; \
+        } \
+      else if (line ~ /$(am__copy_in_global_log_rx)[nN][oO]/) \
+        copy_in_global_log = 0; \
+    }; \
+  if (copy_in_global_log) \
+    { \
+      rst_section(global_test_result ": " $$0); \
+      while ((rc = (getline line < ($$0 ".log"))) != 0) \
+      { \
+        if (rc < 0) \
+          fatal("failed to read from " $$0 ".log"); \
+        print line; \
+      }; \
+      printf "\n"; \
+    }; \
+  close ($$0 ".trs"); \
+  close ($$0 ".log"); \
+}'
+# Restructured Text title.
+am__rst_title = { sed 's/.*/   &   /;h;s/./=/g;p;x;s/ *$$//;p;g' && echo; }
+# Solaris 10 'make', and several other traditional 'make' implementations,
+# pass "-e" to $(SHELL), and POSIX 2008 even requires this.  Work around it
+# by disabling -e (using the XSI extension "set +e") if it's set.
+am__sh_e_setup = case $$- in *e*) set +e;; esac
+# Default flags passed to test drivers.
+am__common_driver_flags = \
+  --color-tests "$$am__color_tests" \
+  --enable-hard-errors "$$am__enable_hard_errors" \
+  --expect-failure "$$am__expect_failure"
+# To be inserted before the command running the test.  Creates the
+# directory for the log if needed.  Stores in $dir the directory
+# containing $f, in $tst the test, in $log the log.  Executes the
+# developer-defined test setup AM_TESTS_ENVIRONMENT (if any), and
+# passes TESTS_ENVIRONMENT.  Set up options for the wrapper that
+# will run the test scripts (or their associated LOG_COMPILER, if
+# they have one).
+am__check_pre = \
+$(am__sh_e_setup);					\
+$(am__vpath_adj_setup) $(am__vpath_adj)			\
+$(am__tty_colors);					\
+srcdir=$(srcdir); export srcdir;			\
+case "$@" in						\
+  */*) am__odir=`echo "./$@" | sed 's|/[^/]*$$||'`;;	\
+    *) am__odir=.;; 					\
+esac;							\
+test "x$$am__odir" = x"." || test -d "$$am__odir" 	\
+  || $(MKDIR_P) "$$am__odir" || exit $$?;		\
+if test -f "./$$f"; then dir=./;			\
+elif test -f "$$f"; then dir=;				\
+else dir="$(srcdir)/"; fi;				\
+tst=$$dir$$f; log='$@'; 				\
+if test -n '$(DISABLE_HARD_ERRORS)'; then		\
+  am__enable_hard_errors=no; 				\
+else							\
+  am__enable_hard_errors=yes; 				\
+fi; 							\
+case " $(XFAIL_TESTS) " in				\
+  *[\ \	]$$f[\ \	]* | *[\ \	]$$dir$$f[\ \	]*) \
+    am__expect_failure=yes;;				\
+  *)							\
+    am__expect_failure=no;;				\
+esac; 							\
+$(AM_TESTS_ENVIRONMENT) $(TESTS_ENVIRONMENT)
+# A shell command to get the names of the tests scripts with any registered
+# extension removed (i.e., equivalently, the names of the test logs, with
+# the '.log' extension removed).  The result is saved in the shell variable
+# '$bases'.  This honors runtime overriding of TESTS and TEST_LOGS.  Sadly,
+# we cannot use something simpler, involving e.g., "$(TEST_LOGS:.log=)",
+# since that might cause problem with VPATH rewrites for suffix-less tests.
+# See also 'test-harness-vpath-rewrite.sh' and 'test-trs-basic.sh'.
+am__set_TESTS_bases = \
+  bases='$(TEST_LOGS)'; \
+  bases=`for i in $$bases; do echo $$i; done | sed 's/\.log$$//'`; \
+  bases=`echo $$bases`
+RECHECK_LOGS = $(TEST_LOGS)
+TEST_SUITE_LOG = test-suite.log
+TEST_EXTENSIONS = @EXEEXT@ .test
+LOG_DRIVER = $(SHELL) $(top_srcdir)/test-driver
+LOG_COMPILE = $(LOG_COMPILER) $(AM_LOG_FLAGS) $(LOG_FLAGS)
+am__set_b = \
+  case '$@' in \
+    */*) \
+      case '$*' in \
+        */*) b='$*';; \
+          *) b=`echo '$@' | sed 's/\.log$$//'`; \
+       esac;; \
+    *) \
+      b='$*';; \
+  esac
+am__test_logs1 = $(TESTS:=.log)
+am__test_logs2 = $(am__test_logs1:@EXEEXT@.log=.log)
+TEST_LOGS = $(am__test_logs2:.test.log=.log)
+TEST_LOG_DRIVER = $(SHELL) $(top_srcdir)/test-driver
+TEST_LOG_COMPILE = $(TEST_LOG_COMPILER) $(AM_TEST_LOG_FLAGS) \
+	$(TEST_LOG_FLAGS)
 DIST_SUBDIRS = $(SUBDIRS)
 DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
 distdir = $(PACKAGE)-$(VERSION)
@@ -220,6 +423,7 @@ AUTOMAKE = @AUTOMAKE@
 AWK = @AWK@
 BOOST_PROGRAM_OPTIONS_LIBS = @BOOST_PROGRAM_OPTIONS_LIBS@
 BOOST_RANDOM_LIBS = @BOOST_RANDOM_LIBS@
+BOOST_REGEX_LIBS = @BOOST_REGEX_LIBS@
 BOOST_THREAD_LIBS = @BOOST_THREAD_LIBS@
 CC = @CC@
 CCAS = @CCAS@
@@ -420,12 +624,14 @@ EXTRA_DIST = autogen.sh ceph.spec.in ceph.spec install-deps.sh \
 # the "." here makes sure check-local builds gtest and gmock before they are used
 SUBDIRS = . src man doc systemd selinux
 CHECK_ULIMIT := true
+TESTS = $(check_SCRIPTS)
 check_SCRIPTS = \
 	src/test/run-cli-tests
 
 all: all-recursive
 
 .SUFFIXES:
+.SUFFIXES: .log .test .test$(EXEEXT) .trs
 am--refresh: Makefile
 	@:
 $(srcdir)/Makefile.in:  $(srcdir)/Makefile.am  $(am__configure_deps)
@@ -578,6 +784,169 @@ distclean-tags:
 	-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
 	-rm -f cscope.out cscope.in.out cscope.po.out cscope.files
 
+# Recover from deleted '.trs' file; this should ensure that
+# "rm -f foo.log; make foo.trs" re-run 'foo.test', and re-create
+# both 'foo.log' and 'foo.trs'.  Break the recipe in two subshells
+# to avoid problems with "make -n".
+.log.trs:
+	rm -f $< $@
+	$(MAKE) $(AM_MAKEFLAGS) $<
+
+# Leading 'am--fnord' is there to ensure the list of targets does not
+# expand to empty, as could happen e.g. with make check TESTS=''.
+am--fnord $(TEST_LOGS) $(TEST_LOGS:.log=.trs): $(am__force_recheck)
+am--force-recheck:
+	@:
+
+$(TEST_SUITE_LOG): $(TEST_LOGS)
+	@$(am__set_TESTS_bases); \
+	am__f_ok () { test -f "$$1" && test -r "$$1"; }; \
+	redo_bases=`for i in $$bases; do \
+	              am__f_ok $$i.trs && am__f_ok $$i.log || echo $$i; \
+	            done`; \
+	if test -n "$$redo_bases"; then \
+	  redo_logs=`for i in $$redo_bases; do echo $$i.log; done`; \
+	  redo_results=`for i in $$redo_bases; do echo $$i.trs; done`; \
+	  if $(am__make_dryrun); then :; else \
+	    rm -f $$redo_logs && rm -f $$redo_results || exit 1; \
+	  fi; \
+	fi; \
+	if test -n "$$am__remaking_logs"; then \
+	  echo "fatal: making $(TEST_SUITE_LOG): possible infinite" \
+	       "recursion detected" >&2; \
+	else \
+	  am__remaking_logs=yes $(MAKE) $(AM_MAKEFLAGS) $$redo_logs; \
+	fi; \
+	if $(am__make_dryrun); then :; else \
+	  st=0;  \
+	  errmsg="fatal: making $(TEST_SUITE_LOG): failed to create"; \
+	  for i in $$redo_bases; do \
+	    test -f $$i.trs && test -r $$i.trs \
+	      || { echo "$$errmsg $$i.trs" >&2; st=1; }; \
+	    test -f $$i.log && test -r $$i.log \
+	      || { echo "$$errmsg $$i.log" >&2; st=1; }; \
+	  done; \
+	  test $$st -eq 0 || exit 1; \
+	fi
+	@$(am__sh_e_setup); $(am__tty_colors); $(am__set_TESTS_bases); \
+	ws='[ 	]'; \
+	results=`for b in $$bases; do echo $$b.trs; done`; \
+	test -n "$$results" || results=/dev/null; \
+	all=`  grep "^$$ws*:test-result:"           $$results | wc -l`; \
+	pass=` grep "^$$ws*:test-result:$$ws*PASS"  $$results | wc -l`; \
+	fail=` grep "^$$ws*:test-result:$$ws*FAIL"  $$results | wc -l`; \
+	skip=` grep "^$$ws*:test-result:$$ws*SKIP"  $$results | wc -l`; \
+	xfail=`grep "^$$ws*:test-result:$$ws*XFAIL" $$results | wc -l`; \
+	xpass=`grep "^$$ws*:test-result:$$ws*XPASS" $$results | wc -l`; \
+	error=`grep "^$$ws*:test-result:$$ws*ERROR" $$results | wc -l`; \
+	if test `expr $$fail + $$xpass + $$error` -eq 0; then \
+	  success=true; \
+	else \
+	  success=false; \
+	fi; \
+	br='==================='; br=$$br$$br$$br$$br; \
+	result_count () \
+	{ \
+	    if test x"$$1" = x"--maybe-color"; then \
+	      maybe_colorize=yes; \
+	    elif test x"$$1" = x"--no-color"; then \
+	      maybe_colorize=no; \
+	    else \
+	      echo "$@: invalid 'result_count' usage" >&2; exit 4; \
+	    fi; \
+	    shift; \
+	    desc=$$1 count=$$2; \
+	    if test $$maybe_colorize = yes && test $$count -gt 0; then \
+	      color_start=$$3 color_end=$$std; \
+	    else \
+	      color_start= color_end=; \
+	    fi; \
+	    echo "$${color_start}# $$desc $$count$${color_end}"; \
+	}; \
+	create_testsuite_report () \
+	{ \
+	  result_count $$1 "TOTAL:" $$all   "$$brg"; \
+	  result_count $$1 "PASS: " $$pass  "$$grn"; \
+	  result_count $$1 "SKIP: " $$skip  "$$blu"; \
+	  result_count $$1 "XFAIL:" $$xfail "$$lgn"; \
+	  result_count $$1 "FAIL: " $$fail  "$$red"; \
+	  result_count $$1 "XPASS:" $$xpass "$$red"; \
+	  result_count $$1 "ERROR:" $$error "$$mgn"; \
+	}; \
+	{								\
+	  echo "$(PACKAGE_STRING): $(subdir)/$(TEST_SUITE_LOG)" |	\
+	    $(am__rst_title);						\
+	  create_testsuite_report --no-color;				\
+	  echo;								\
+	  echo ".. contents:: :depth: 2";				\
+	  echo;								\
+	  for b in $$bases; do echo $$b; done				\
+	    | $(am__create_global_log);					\
+	} >$(TEST_SUITE_LOG).tmp || exit 1;				\
+	mv $(TEST_SUITE_LOG).tmp $(TEST_SUITE_LOG);			\
+	if $$success; then						\
+	  col="$$grn";							\
+	 else								\
+	  col="$$red";							\
+	  test x"$$VERBOSE" = x || cat $(TEST_SUITE_LOG);		\
+	fi;								\
+	echo "$${col}$$br$${std}"; 					\
+	echo "$${col}Testsuite summary for $(PACKAGE_STRING)$${std}";	\
+	echo "$${col}$$br$${std}"; 					\
+	create_testsuite_report --maybe-color;				\
+	echo "$$col$$br$$std";						\
+	if $$success; then :; else					\
+	  echo "$${col}See $(subdir)/$(TEST_SUITE_LOG)$${std}";		\
+	  if test -n "$(PACKAGE_BUGREPORT)"; then			\
+	    echo "$${col}Please report to $(PACKAGE_BUGREPORT)$${std}";	\
+	  fi;								\
+	  echo "$$col$$br$$std";					\
+	fi;								\
+	$$success || exit 1
+
+check-TESTS:
+	@list='$(RECHECK_LOGS)';           test -z "$$list" || rm -f $$list
+	@list='$(RECHECK_LOGS:.log=.trs)'; test -z "$$list" || rm -f $$list
+	@test -z "$(TEST_SUITE_LOG)" || rm -f $(TEST_SUITE_LOG)
+	@set +e; $(am__set_TESTS_bases); \
+	log_list=`for i in $$bases; do echo $$i.log; done`; \
+	trs_list=`for i in $$bases; do echo $$i.trs; done`; \
+	log_list=`echo $$log_list`; trs_list=`echo $$trs_list`; \
+	$(MAKE) $(AM_MAKEFLAGS) $(TEST_SUITE_LOG) TEST_LOGS="$$log_list"; \
+	exit $$?;
+recheck: all $(check_SCRIPTS)
+	@test -z "$(TEST_SUITE_LOG)" || rm -f $(TEST_SUITE_LOG)
+	@set +e; $(am__set_TESTS_bases); \
+	bases=`for i in $$bases; do echo $$i; done \
+	         | $(am__list_recheck_tests)` || exit 1; \
+	log_list=`for i in $$bases; do echo $$i.log; done`; \
+	log_list=`echo $$log_list`; \
+	$(MAKE) $(AM_MAKEFLAGS) $(TEST_SUITE_LOG) \
+	        am__force_recheck=am--force-recheck \
+	        TEST_LOGS="$$log_list"; \
+	exit $$?
+src/test/run-cli-tests.log: src/test/run-cli-tests
+	@p='src/test/run-cli-tests'; \
+	b='src/test/run-cli-tests'; \
+	$(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \
+	--log-file $$b.log --trs-file $$b.trs \
+	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
+	"$$tst" $(AM_TESTS_FD_REDIRECT)
+.test.log:
+	@p='$<'; \
+	$(am__set_b); \
+	$(am__check_pre) $(TEST_LOG_DRIVER) --test-name "$$f" \
+	--log-file $$b.log --trs-file $$b.trs \
+	$(am__common_driver_flags) $(AM_TEST_LOG_DRIVER_FLAGS) $(TEST_LOG_DRIVER_FLAGS) -- $(TEST_LOG_COMPILE) \
+	"$$tst" $(AM_TESTS_FD_REDIRECT)
+@am__EXEEXT_TRUE@.test$(EXEEXT).log:
+@am__EXEEXT_TRUE@	@p='$<'; \
+@am__EXEEXT_TRUE@	$(am__set_b); \
+@am__EXEEXT_TRUE@	$(am__check_pre) $(TEST_LOG_DRIVER) --test-name "$$f" \
+@am__EXEEXT_TRUE@	--log-file $$b.log --trs-file $$b.trs \
+@am__EXEEXT_TRUE@	$(am__common_driver_flags) $(AM_TEST_LOG_DRIVER_FLAGS) $(TEST_LOG_DRIVER_FLAGS) -- $(TEST_LOG_COMPILE) \
+@am__EXEEXT_TRUE@	"$$tst" $(AM_TESTS_FD_REDIRECT)
+
 distdir: $(DISTFILES)
 	$(am__remove_distdir)
 	test -d "$(distdir)" || mkdir "$(distdir)"
@@ -771,7 +1140,7 @@ distcleancheck: distclean
 	       exit 1; } >&2
 check-am: all-am
 	$(MAKE) $(AM_MAKEFLAGS) $(check_SCRIPTS)
-	$(MAKE) $(AM_MAKEFLAGS) check-local
+	$(MAKE) $(AM_MAKEFLAGS) check-TESTS check-local
 check: check-recursive
 all-am: Makefile all-local
 installdirs: installdirs-recursive
@@ -796,6 +1165,9 @@ install-strip:
 	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
 	fi
 mostlyclean-generic:
+	-test -z "$(TEST_LOGS)" || rm -f $(TEST_LOGS)
+	-test -z "$(TEST_LOGS:.log=.trs)" || rm -f $(TEST_LOGS:.log=.trs)
+	-test -z "$(TEST_SUITE_LOG)" || rm -f $(TEST_SUITE_LOG)
 
 clean-generic:
 
@@ -879,21 +1251,22 @@ uninstall-am:
 .MAKE: $(am__recursive_targets) check-am install-am install-strip
 
 .PHONY: $(am__recursive_targets) CTAGS GTAGS TAGS all all-am all-local \
-	am--refresh check check-am check-local clean clean-cscope \
-	clean-generic clean-libtool clean-local cscope cscopelist-am \
-	ctags ctags-am dist dist-all dist-bzip2 dist-gzip dist-hook \
-	dist-lzip dist-shar dist-tarZ dist-xz dist-zip distcheck \
-	distclean distclean-generic distclean-libtool distclean-tags \
-	distcleancheck distdir distuninstallcheck dvi dvi-am html \
-	html-am info info-am install install-am install-data \
-	install-data-am install-data-local install-dvi install-dvi-am \
-	install-exec install-exec-am install-html install-html-am \
-	install-info install-info-am install-man install-pdf \
-	install-pdf-am install-ps install-ps-am install-strip \
-	installcheck installcheck-am installdirs installdirs-am \
-	maintainer-clean maintainer-clean-generic mostlyclean \
-	mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \
-	tags tags-am uninstall uninstall-am
+	am--refresh check check-TESTS check-am check-local clean \
+	clean-cscope clean-generic clean-libtool clean-local cscope \
+	cscopelist-am ctags ctags-am dist dist-all dist-bzip2 \
+	dist-gzip dist-hook dist-lzip dist-shar dist-tarZ dist-xz \
+	dist-zip distcheck distclean distclean-generic \
+	distclean-libtool distclean-tags distcleancheck distdir \
+	distuninstallcheck dvi dvi-am html html-am info info-am \
+	install install-am install-data install-data-am \
+	install-data-local install-dvi install-dvi-am install-exec \
+	install-exec-am install-html install-html-am install-info \
+	install-info-am install-man install-pdf install-pdf-am \
+	install-ps install-ps-am install-strip installcheck \
+	installcheck-am installdirs installdirs-am maintainer-clean \
+	maintainer-clean-generic mostlyclean mostlyclean-generic \
+	mostlyclean-libtool pdf pdf-am ps ps-am recheck tags tags-am \
+	uninstall uninstall-am
 
 
 # why is it so hard to make autotools to this?
@@ -927,6 +1300,9 @@ check-local:: all
 		exit 1 ; \
 	fi
 
+# display the output of failed check_SCRIPTS after a failed make check
+export VERBOSE = true
+
 # "make distclean" both runs this and recurses into src/gtest, if
 # gtest is in DIST_SUBDIRS. Take extra care to not fail when
 # effectively cleaned twice.
diff --git a/autogen.sh b/autogen.sh
index 99d4f7b..b6f5029 100755
--- a/autogen.sh
+++ b/autogen.sh
@@ -46,5 +46,4 @@ autoconf
 autoheader
 automake -a --add-missing -Wall
 ( cd src/gmock && autoreconf -fvi; )
-( cd src/rocksdb && autoreconf -fvi; )
 exit
diff --git a/ceph.spec b/ceph.spec
index fee4bbb..c6c5e9d 100644
--- a/ceph.spec
+++ b/ceph.spec
@@ -54,7 +54,7 @@ restorecon -R /var/log/ceph > /dev/null 2>&1;
 # common
 #################################################################################
 Name:		ceph
-Version:	9.2.0
+Version:	10.0.1
 Release:	0%{?dist}
 Epoch:		1
 Summary:	User space components of the Ceph file system
@@ -590,6 +590,11 @@ export RPM_OPT_FLAGS=`echo $RPM_OPT_FLAGS | sed -e 's/i386/i486/'`
 		%{?_with_tcmalloc} \
 		CFLAGS="$RPM_OPT_FLAGS" CXXFLAGS="$RPM_OPT_FLAGS"
 
+%if %{with lowmem_builder}
+%if 0%{?jobs} > 8
+%define _smp_mflags -j8
+%endif
+%endif
 
 make %{?_smp_mflags}
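The lowmem_builder conditional above caps parallelism at eight jobs when a higher job count is requested; a hedged sketch of its effect, assuming the spec carries the usual %bcond for lowmem_builder:

    # jobs > 8 gets clamped on low-memory builders:
    rpmbuild -ba ceph.spec --with lowmem_builder --define 'jobs 16'
    # the %build section then effectively runs: make -j8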
 
@@ -607,8 +612,7 @@ make %{?_smp_mflags} check-local
 make DESTDIR=$RPM_BUILD_ROOT install
 find $RPM_BUILD_ROOT -type f -name "*.la" -exec rm -f {} ';'
 find $RPM_BUILD_ROOT -type f -name "*.a" -exec rm -f {} ';'
-install -D src/rbdmap $RPM_BUILD_ROOT%{_sysconfdir}/ceph/rbdmap
-install -D src/init-rbdmap $RPM_BUILD_ROOT%{_initrddir}/rbdmap
+install -D src/etc-rbdmap $RPM_BUILD_ROOT%{_sysconfdir}/ceph/rbdmap
 %if 0%{?fedora} || 0%{?rhel}
 install -m 0644 -D etc/sysconfig/ceph $RPM_BUILD_ROOT%{_sysconfdir}/sysconfig/ceph
 %endif
@@ -617,6 +621,7 @@ install -m 0644 -D etc/sysconfig/ceph $RPM_BUILD_ROOT%{_localstatedir}/adm/fillu
 %endif
 %if 0%{?_with_systemd}
   install -m 0644 -D systemd/ceph.tmpfiles.d $RPM_BUILD_ROOT%{_tmpfilesdir}/ceph-common.conf
+  install -m 0644 -D systemd/rbdmap.service $RPM_BUILD_ROOT%{_unitdir}/rbdmap.service
   install -m 0644 -D systemd/ceph-osd@.service $RPM_BUILD_ROOT%{_unitdir}/ceph-osd@.service
   install -m 0644 -D systemd/ceph-mon@.service $RPM_BUILD_ROOT%{_unitdir}/ceph-mon@.service
   install -m 0644 -D systemd/ceph-create-keys@.service $RPM_BUILD_ROOT%{_unitdir}/ceph-create-keys@.service
@@ -626,6 +631,7 @@ install -m 0644 -D etc/sysconfig/ceph $RPM_BUILD_ROOT%{_localstatedir}/adm/fillu
   install -m 0644 -D systemd/ceph-disk at .service $RPM_BUILD_ROOT%{_unitdir}/ceph-disk at .service
   install -m 0755 -D systemd/ceph $RPM_BUILD_ROOT%{_sbindir}/rcceph
 %else
+  install -D src/init-rbdmap $RPM_BUILD_ROOT%{_initrddir}/rbdmap
   install -D src/init-ceph $RPM_BUILD_ROOT%{_initrddir}/ceph
   install -D src/init-radosgw $RPM_BUILD_ROOT%{_initrddir}/ceph-radosgw
   ln -sf ../../etc/init.d/ceph %{buildroot}/%{_sbindir}/rcceph
@@ -810,6 +816,7 @@ rm -rf $RPM_BUILD_ROOT
 %{_libdir}/rados-classes/libcls_timeindex.so*
 %{_libdir}/rados-classes/libcls_user.so*
 %{_libdir}/rados-classes/libcls_version.so*
+%{_libdir}/rados-classes/libcls_journal.so*
 %dir %{_libdir}/ceph/erasure-code
 %{_libdir}/ceph/erasure-code/libec_*.so*
 %if 0%{?_with_lttng}
@@ -872,6 +879,7 @@ rm -rf $RPM_BUILD_ROOT
 %{_bindir}/rbd
 %{_bindir}/rbd-replay
 %{_bindir}/rbd-replay-many
+%{_bindir}/rbdmap
 %if 0%{?_with_lttng}
 %{_bindir}/rbd-replay-prep
 %endif
@@ -901,7 +909,11 @@ rm -rf $RPM_BUILD_ROOT
 %config %{_sysconfdir}/bash_completion.d/rados
 %config %{_sysconfdir}/bash_completion.d/rbd
 %config(noreplace) %{_sysconfdir}/ceph/rbdmap
+%if 0%{?_with_systemd}
+%{_unitdir}/rbdmap.service
+%else
 %{_initrddir}/rbdmap
+%endif
 %{python_sitelib}/ceph_argparse.py*
 %{python_sitelib}/ceph_daemon.py*
 %{_udevrulesdir}/50-rbd.rules
@@ -1302,12 +1314,12 @@ exit 0
 %files libs-compat
 # We need an empty %%files list for ceph-libs-compat, to tell rpmbuild to actually
 # build this meta package.
+%endif
 
 #################################################################################
 %files devel-compat
 # We need an empty %%files list for ceph-devel-compat, to tell rpmbuild to
 # actually build this meta package.
-%endif
 
 #################################################################################
 %files -n python-ceph-compat
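
The rbdmap pieces above are split by init system: on systemd hosts the spec
installs rbdmap.service, otherwise it keeps the sysvinit script. A hedged
usage sketch, assuming a build with systemd support:

    # have RBD maps re-established at boot (systemd hosts)
    sudo systemctl enable rbdmap.service
    # sysvinit hosts use the init script installed in the %else branch
    sudo chkconfig rbdmap on
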
diff --git a/ceph.spec.in b/ceph.spec.in
index 8f2a6fc..2939fef 100644
--- a/ceph.spec.in
+++ b/ceph.spec.in
@@ -590,6 +590,11 @@ export RPM_OPT_FLAGS=`echo $RPM_OPT_FLAGS | sed -e 's/i386/i486/'`
 		%{?_with_tcmalloc} \
 		CFLAGS="$RPM_OPT_FLAGS" CXXFLAGS="$RPM_OPT_FLAGS"
 
+%if %{with lowmem_builder}
+%if 0%{?jobs} > 8
+%define _smp_mflags -j8
+%endif
+%endif
 
 make %{?_smp_mflags}
 
@@ -607,8 +612,7 @@ make %{?_smp_mflags} check-local
 make DESTDIR=$RPM_BUILD_ROOT install
 find $RPM_BUILD_ROOT -type f -name "*.la" -exec rm -f {} ';'
 find $RPM_BUILD_ROOT -type f -name "*.a" -exec rm -f {} ';'
-install -D src/rbdmap $RPM_BUILD_ROOT%{_sysconfdir}/ceph/rbdmap
-install -D src/init-rbdmap $RPM_BUILD_ROOT%{_initrddir}/rbdmap
+install -D src/etc-rbdmap $RPM_BUILD_ROOT%{_sysconfdir}/ceph/rbdmap
 %if 0%{?fedora} || 0%{?rhel}
 install -m 0644 -D etc/sysconfig/ceph $RPM_BUILD_ROOT%{_sysconfdir}/sysconfig/ceph
 %endif
@@ -617,6 +621,7 @@ install -m 0644 -D etc/sysconfig/ceph $RPM_BUILD_ROOT%{_localstatedir}/adm/fillu
 %endif
 %if 0%{?_with_systemd}
   install -m 0644 -D systemd/ceph.tmpfiles.d $RPM_BUILD_ROOT%{_tmpfilesdir}/ceph-common.conf
+  install -m 0644 -D systemd/rbdmap.service $RPM_BUILD_ROOT%{_unitdir}/rbdmap.service
   install -m 0644 -D systemd/ceph-osd at .service $RPM_BUILD_ROOT%{_unitdir}/ceph-osd at .service
   install -m 0644 -D systemd/ceph-mon at .service $RPM_BUILD_ROOT%{_unitdir}/ceph-mon at .service
   install -m 0644 -D systemd/ceph-create-keys at .service $RPM_BUILD_ROOT%{_unitdir}/ceph-create-keys at .service
@@ -626,6 +631,7 @@ install -m 0644 -D etc/sysconfig/ceph $RPM_BUILD_ROOT%{_localstatedir}/adm/fillu
   install -m 0644 -D systemd/ceph-disk at .service $RPM_BUILD_ROOT%{_unitdir}/ceph-disk at .service
   install -m 0755 -D systemd/ceph $RPM_BUILD_ROOT%{_sbindir}/rcceph
 %else
+  install -D src/init-rbdmap $RPM_BUILD_ROOT%{_initrddir}/rbdmap
   install -D src/init-ceph $RPM_BUILD_ROOT%{_initrddir}/ceph
   install -D src/init-radosgw $RPM_BUILD_ROOT%{_initrddir}/ceph-radosgw
   ln -sf ../../etc/init.d/ceph %{buildroot}/%{_sbindir}/rcceph
@@ -810,6 +816,7 @@ rm -rf $RPM_BUILD_ROOT
 %{_libdir}/rados-classes/libcls_timeindex.so*
 %{_libdir}/rados-classes/libcls_user.so*
 %{_libdir}/rados-classes/libcls_version.so*
+%{_libdir}/rados-classes/libcls_journal.so*
 %dir %{_libdir}/ceph/erasure-code
 %{_libdir}/ceph/erasure-code/libec_*.so*
 %if 0%{?_with_lttng}
@@ -872,6 +879,7 @@ rm -rf $RPM_BUILD_ROOT
 %{_bindir}/rbd
 %{_bindir}/rbd-replay
 %{_bindir}/rbd-replay-many
+%{_bindir}/rbdmap
 %if 0%{?_with_lttng}
 %{_bindir}/rbd-replay-prep
 %endif
@@ -901,7 +909,11 @@ rm -rf $RPM_BUILD_ROOT
 %config %{_sysconfdir}/bash_completion.d/rados
 %config %{_sysconfdir}/bash_completion.d/rbd
 %config(noreplace) %{_sysconfdir}/ceph/rbdmap
+%if 0%{?_with_systemd}
+%{_unitdir}/rbdmap.service
+%else
 %{_initrddir}/rbdmap
+%endif
 %{python_sitelib}/ceph_argparse.py*
 %{python_sitelib}/ceph_daemon.py*
 %{_udevrulesdir}/50-rbd.rules
@@ -1302,12 +1314,12 @@ exit 0
 %files libs-compat
 # We need an empty %%files list for ceph-libs-compat, to tell rpmbuild to actually
 # build this meta package.
+%endif
 
 #################################################################################
 %files devel-compat
 # We need an empty %%files list for ceph-devel-compat, to tell rpmbuild to
 # actually build this meta package.
-%endif
 
 #################################################################################
 %files -n python-ceph-compat
diff --git a/configure b/configure
index 4b93790..00b462c 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for ceph 9.2.0.
+# Generated by GNU Autoconf 2.69 for ceph 10.0.1.
 #
 # Report bugs to <ceph-devel at vger.kernel.org>.
 #
@@ -590,8 +590,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='ceph'
 PACKAGE_TARNAME='ceph'
-PACKAGE_VERSION='9.2.0'
-PACKAGE_STRING='ceph 9.2.0'
+PACKAGE_VERSION='10.0.1'
+PACKAGE_STRING='ceph 10.0.1'
 PACKAGE_BUGREPORT='ceph-devel at vger.kernel.org'
 PACKAGE_URL=''
 
@@ -661,6 +661,7 @@ LTTNG_GEN_TP_CHECK
 WITH_LTTNG_FALSE
 WITH_LTTNG_TRUE
 BOOST_PROGRAM_OPTIONS_LIBS
+BOOST_REGEX_LIBS
 BOOST_RANDOM_LIBS
 BOOST_THREAD_LIBS
 USE_BOOST_SPIRIT_OLD_HDR_FALSE
@@ -791,6 +792,8 @@ WITH_RADOS_TRUE
 AM_CXXFLAGS
 CLANG_FALSE
 CLANG_TRUE
+SOLARIS_FALSE
+SOLARIS_TRUE
 DARWIN_FALSE
 DARWIN_TRUE
 FREEBSD_FALSE
@@ -1017,8 +1020,7 @@ LIBROCKSDB_LIBS
 LIBZFS_CFLAGS
 LIBZFS_LIBS
 PYTHON'
-ac_subdirs_all='src/gmock
-src/rocksdb'
+ac_subdirs_all='src/gmock'
 
 # Initialize some variables set by options.
 ac_init_help=
@@ -1558,7 +1560,7 @@ if test "$ac_init_help" = "long"; then
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-\`configure' configures ceph 9.2.0 to adapt to many kinds of systems.
+\`configure' configures ceph 10.0.1 to adapt to many kinds of systems.
 
 Usage: $0 [OPTION]... [VAR=VALUE]...
 
@@ -1629,7 +1631,7 @@ fi
 
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-     short | recursive ) echo "Configuration of ceph 9.2.0:";;
+     short | recursive ) echo "Configuration of ceph 10.0.1:";;
    esac
   cat <<\_ACEOF
 
@@ -1815,7 +1817,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
   cat <<\_ACEOF
-ceph configure 9.2.0
+ceph configure 10.0.1
 generated by GNU Autoconf 2.69
 
 Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2891,7 +2893,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
 
-It was created by ceph $as_me 9.2.0, which was
+It was created by ceph $as_me 10.0.1, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
 
   $ $0 $@
@@ -16386,7 +16388,7 @@ fi
 
 # Define the identity of the package.
  PACKAGE='ceph'
- VERSION='9.2.0'
+ VERSION='10.0.1'
 
 
 cat >>confdefs.h <<_ACEOF
@@ -16905,6 +16907,9 @@ linux*)
 freebsd*)
 	freebsd="yes"
 	;;
+solaris*)
+	solaris="yes"
+	;;
 esac
  if test x"$linux" = x"yes"; then
   LINUX_TRUE=
@@ -16930,6 +16935,14 @@ else
   DARWIN_FALSE=
 fi
 
+ if test x"$solaris" = x"yes"; then
+  SOLARIS_TRUE=
+  SOLARIS_FALSE='#'
+else
+  SOLARIS_TRUE='#'
+  SOLARIS_FALSE=
+fi
+
 
 # Checks for programs.
 ac_ext=cpp
@@ -18609,8 +18622,12 @@ ac_compiler_gnu=$ac_cv_c_compiler_gnu
 
 #Linux only dependencies
 if test x"$linux" = x"yes"; then
-  # libblkid
-  ac_fn_c_check_header_mongrel "$LINENO" "blkid/blkid.h" "ac_cv_header_blkid_blkid_h" "$ac_includes_default"
+
+  # rbd {map,unmap,showmapped} dependencies, Linux only
+  if test x"$with_rbd" = x"yes"; then
+
+    # libblkid
+    ac_fn_c_check_header_mongrel "$LINENO" "blkid/blkid.h" "ac_cv_header_blkid_blkid_h" "$ac_includes_default"
 if test "x$ac_cv_header_blkid_blkid_h" = xyes; then :
 
 else
@@ -18618,7 +18635,7 @@ else
 fi
 
 
-  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for blkid_get_cache in -lblkid" >&5
+    { $as_echo "$as_me:${as_lineno-$LINENO}: checking for blkid_get_cache in -lblkid" >&5
 $as_echo_n "checking for blkid_get_cache in -lblkid... " >&6; }
 if ${ac_cv_lib_blkid_blkid_get_cache+:} false; then :
   $as_echo_n "(cached) " >&6
@@ -18663,7 +18680,7 @@ as_fn_error $? "libblkid not found
 See \`config.log' for more details" "$LINENO" 5; }
 fi
 
-  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for blkid_find_dev_with_tag in -lblkid" >&5
+    { $as_echo "$as_me:${as_lineno-$LINENO}: checking for blkid_find_dev_with_tag in -lblkid" >&5
 $as_echo_n "checking for blkid_find_dev_with_tag in -lblkid... " >&6; }
 if ${ac_cv_lib_blkid_blkid_find_dev_with_tag+:} false; then :
   $as_echo_n "(cached) " >&6
@@ -18708,7 +18725,7 @@ as_fn_error $? "libblkid not found
 See \`config.log' for more details" "$LINENO" 5; }
 fi
 
-  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for blkid_dev_devname in -lblkid" >&5
+    { $as_echo "$as_me:${as_lineno-$LINENO}: checking for blkid_dev_devname in -lblkid" >&5
 $as_echo_n "checking for blkid_dev_devname in -lblkid... " >&6; }
 if ${ac_cv_lib_blkid_blkid_dev_devname+:} false; then :
   $as_echo_n "(cached) " >&6
@@ -18753,10 +18770,6 @@ as_fn_error $? "libblkid not found
 See \`config.log' for more details" "$LINENO" 5; }
 fi
 
-
-  # rbd {map,unmap,showmapped} dependencies, Linux only
-  if test x"$with_rbd" = x"yes"; then
-    # libblkid
     { $as_echo "$as_me:${as_lineno-$LINENO}: checking for blkid_devno_to_wholedisk in -lblkid" >&5
 $as_echo_n "checking for blkid_devno_to_wholedisk in -lblkid... " >&6; }
 if ${ac_cv_lib_blkid_blkid_devno_to_wholedisk+:} false; then :
@@ -20552,7 +20565,7 @@ else
 JAVA_TEST=Test.java
 CLASS_TEST=Test.class
 cat << \EOF > $JAVA_TEST
-/* #line 20555 "configure" */
+/* #line 20568 "configure" */
 public class Test {
 }
 EOF
@@ -22227,10 +22240,6 @@ if test "x$with_librocksdb_static" = "xcheck" -a "x$HAVE_CXX11" = "x1" ; then :
   with_librocksdb_static="yes"
 fi
 if test "x$with_librocksdb_static" = "xyes"; then :
-  subdirs="$subdirs src/rocksdb"
-
-fi
-if test "x$with_librocksdb_static" = "xyes"; then :
 
 $as_echo "#define HAVE_LIBROCKSDB 1" >>confdefs.h
 
@@ -22964,6 +22973,17 @@ See \`config.log' for more details" "$LINENO" 5; }
 fi
 
 
+ac_fn_cxx_check_header_mongrel "$LINENO" "boost/regex.hpp" "ac_cv_header_boost_regex_hpp" "$ac_includes_default"
+if test "x$ac_cv_header_boost_regex_hpp" = xyes; then :
+
+else
+  { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error $? "\"Can't find boost regex headers\"
+See \`config.log' for more details" "$LINENO" 5; }
+fi
+
+
 ac_fn_cxx_check_header_mongrel "$LINENO" "boost/program_options/option.hpp" "ac_cv_header_boost_program_options_option_hpp" "$ac_includes_default"
 if test "x$ac_cv_header_boost_program_options_option_hpp" = xyes; then :
 
@@ -23245,6 +23265,98 @@ BOOST_RANDOM_LIBS="${LIBS}"
 LIBS="${saved_LIBS}"
 
 
+# boost-regex
+BOOST_REGEX_LIBS=""
+saved_LIBS="${LIBS}"
+LIBS=""
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for main in -lboost_regex-mt" >&5
+$as_echo_n "checking for main in -lboost_regex-mt... " >&6; }
+if ${ac_cv_lib_boost_regex_mt_main+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  ac_check_lib_save_LIBS=$LIBS
+LIBS="-lboost_regex-mt  $LIBS"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+
+int
+main ()
+{
+return main ();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_cxx_try_link "$LINENO"; then :
+  ac_cv_lib_boost_regex_mt_main=yes
+else
+  ac_cv_lib_boost_regex_mt_main=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_boost_regex_mt_main" >&5
+$as_echo "$ac_cv_lib_boost_regex_mt_main" >&6; }
+if test "x$ac_cv_lib_boost_regex_mt_main" = xyes; then :
+  cat >>confdefs.h <<_ACEOF
+#define HAVE_LIBBOOST_REGEX_MT 1
+_ACEOF
+
+  LIBS="-lboost_regex-mt $LIBS"
+
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for main in -lboost_regex" >&5
+$as_echo_n "checking for main in -lboost_regex... " >&6; }
+if ${ac_cv_lib_boost_regex_main+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  ac_check_lib_save_LIBS=$LIBS
+LIBS="-lboost_regex  $LIBS"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+
+int
+main ()
+{
+return main ();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_cxx_try_link "$LINENO"; then :
+  ac_cv_lib_boost_regex_main=yes
+else
+  ac_cv_lib_boost_regex_main=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_boost_regex_main" >&5
+$as_echo "$ac_cv_lib_boost_regex_main" >&6; }
+if test "x$ac_cv_lib_boost_regex_main" = xyes; then :
+  cat >>confdefs.h <<_ACEOF
+#define HAVE_LIBBOOST_REGEX 1
+_ACEOF
+
+  LIBS="-lboost_regex $LIBS"
+
+else
+  { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error $? "\"Boost regex library not found.\"
+See \`config.log' for more details" "$LINENO" 5; }
+fi
+
+fi
+
+BOOST_REGEX_LIBS="${LIBS}"
+LIBS="${saved_LIBS}"
+
+
 #
 # Check for boost_program_options library (defines BOOST_PROGRAM_OPTIONS_LIBS).
 #
@@ -23373,6 +23485,7 @@ for ac_header in  \
 	sys/time.h \
 	sys/vfs.h \
 	sys/xattr.h \
+	sys/cdefs.h \
 	syslog.h \
 	utime.h \
 
@@ -23416,6 +23529,15 @@ $as_echo "#define CEPH_HAVE_FALLOCATE /**/" >>confdefs.h
 fi
 
 
+# mallinfo
+ac_fn_c_check_func "$LINENO" "mallinfo" "ac_cv_func_mallinfo"
+if test "x$ac_cv_func_mallinfo" = xyes; then :
+
+$as_echo "#define HAVE_MALLINFO 1" >>confdefs.h
+
+fi
+
+
 # getgrouplist
 for ac_func in getgrouplist
 do :
@@ -24898,6 +25020,10 @@ if test -z "${DARWIN_TRUE}" && test -z "${DARWIN_FALSE}"; then
   as_fn_error $? "conditional \"DARWIN\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
 fi
+if test -z "${SOLARIS_TRUE}" && test -z "${SOLARIS_FALSE}"; then
+  as_fn_error $? "conditional \"SOLARIS\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
 if test -z "${CLANG_TRUE}" && test -z "${CLANG_FALSE}"; then
   as_fn_error $? "conditional \"CLANG\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
@@ -25471,7 +25597,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by ceph $as_me 9.2.0, which was
+This file was extended by ceph $as_me 10.0.1, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
 
   CONFIG_FILES    = $CONFIG_FILES
@@ -25537,7 +25663,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-ceph config.status 9.2.0
+ceph config.status 10.0.1
 configured by $0, generated by GNU Autoconf 2.69,
   with options \\"\$ac_cs_config\\"
 
diff --git a/configure.ac b/configure.ac
index 3ed4b04..5ba7d42 100644
--- a/configure.ac
+++ b/configure.ac
@@ -8,7 +8,7 @@ AC_PREREQ(2.59)
 # VERSION define is not used by the code.  It gets a version string
 # from 'git describe'; see src/ceph_ver.[ch]
 
-AC_INIT([ceph], [9.2.0], [ceph-devel at vger.kernel.org])
+AC_INIT([ceph], [10.0.1], [ceph-devel at vger.kernel.org])
 
 AX_CXX_COMPILE_STDCXX_11(, mandatory)
 
@@ -67,10 +67,14 @@ linux*)
 freebsd*)
 	freebsd="yes"
 	;;
+solaris*)
+	solaris="yes"
+	;;
 esac
 AM_CONDITIONAL(LINUX, test x"$linux" = x"yes")
 AM_CONDITIONAL(FREEBSD, test x"$freebsd" = x"yes")
 AM_CONDITIONAL(DARWIN, test x"$darwin" = x"yes")
+AM_CONDITIONAL(SOLARIS, test x"$solaris" = x"yes")
 
 # Checks for programs.
 AC_PROG_CXX
@@ -276,19 +280,19 @@ ACX_PTHREAD
 
 #Linux only dependencies
 if test x"$linux" = x"yes"; then
-  # libblkid
-  AC_CHECK_HEADER([blkid/blkid.h], [],
-    AC_MSG_ERROR([blkid/blkid.h not found (libblkid-dev, libblkid-devel)]))
-  AC_CHECK_LIB([blkid], [blkid_get_cache], [true],
-    AC_MSG_FAILURE([libblkid not found]))
-  AC_CHECK_LIB([blkid], [blkid_find_dev_with_tag], [true],
-    AC_MSG_FAILURE([libblkid not found]))
-  AC_CHECK_LIB([blkid], [blkid_dev_devname], [true],
-    AC_MSG_FAILURE([libblkid not found]))
 
   # rbd {map,unmap,showmapped} dependencies, Linux only
   if test x"$with_rbd" = x"yes"; then
+
     # libblkid
+    AC_CHECK_HEADER([blkid/blkid.h], [],
+      AC_MSG_ERROR([blkid/blkid.h not found (libblkid-dev, libblkid-devel)]))
+    AC_CHECK_LIB([blkid], [blkid_get_cache], [true],
+      AC_MSG_FAILURE([libblkid not found]))
+    AC_CHECK_LIB([blkid], [blkid_find_dev_with_tag], [true],
+      AC_MSG_FAILURE([libblkid not found]))
+    AC_CHECK_LIB([blkid], [blkid_dev_devname], [true],
+      AC_MSG_FAILURE([libblkid not found]))
     AC_CHECK_LIB([blkid], [blkid_devno_to_wholedisk], [true],
       AC_MSG_FAILURE([libblkid not found]))
 
@@ -790,8 +794,6 @@ AC_ARG_WITH([librocksdb-static],
 AS_IF([test "x$with_librocksdb_static" = "xcheck" -a "x$HAVE_CXX11" = "x1" ],
             [with_librocksdb_static="yes"])
 AS_IF([test "x$with_librocksdb_static" = "xyes"],
-            [AC_CONFIG_SUBDIRS([src/rocksdb])])
-AS_IF([test "x$with_librocksdb_static" = "xyes"],
             [AC_DEFINE([HAVE_LIBROCKSDB], [1], [Defined if you have librocksdb enabled])])
 AM_CONDITIONAL(WITH_SLIBROCKSDB, [ test "x$with_librocksdb_static" = "xyes" ])
 AM_CONDITIONAL(WITH_LIBROCKSDB, [ test "x$with_librocksdb_static" = "xyes" -o "x$with_librocksdb" = "xyes" ])
@@ -879,6 +881,8 @@ AC_CHECK_HEADER([boost/random/discrete_distribution.hpp],
 
 AC_CHECK_HEADER([boost/statechart/state.hpp], [],
     AC_MSG_FAILURE(["Can't find boost statechart headers; need 1.34 or later"]))
+AC_CHECK_HEADER([boost/regex.hpp], [],
+    AC_MSG_FAILURE(["Can't find boost regex headers"]))
 AC_CHECK_HEADER([boost/program_options/option.hpp], [],
     AC_MSG_FAILURE(["Can't find boost program_options headers"]))
 
@@ -910,6 +914,17 @@ BOOST_RANDOM_LIBS="${LIBS}"
 LIBS="${saved_LIBS}"
 AC_SUBST(BOOST_RANDOM_LIBS)
 
+# boost-regex
+BOOST_REGEX_LIBS=""
+saved_LIBS="${LIBS}"
+LIBS=""
+AC_CHECK_LIB(boost_regex-mt, main, [],
+    [AC_CHECK_LIB(boost_regex, main, [],
+        AC_MSG_FAILURE(["Boost regex library not found."]))])
+BOOST_REGEX_LIBS="${LIBS}"
+LIBS="${saved_LIBS}"
+AC_SUBST(BOOST_REGEX_LIBS)
+
 #
 # Check for boost_program_options library (defines BOOST_PROGRAM_OPTIONS_LIBS).
 #
@@ -945,6 +960,7 @@ AC_CHECK_HEADERS([ \
 	sys/time.h \
 	sys/vfs.h \
 	sys/xattr.h \
+	sys/cdefs.h \
 	syslog.h \
 	utime.h \
 ])
@@ -964,6 +980,10 @@ AC_CHECK_FUNC([fallocate],
 	[AC_DEFINE([CEPH_HAVE_FALLOCATE], [], [fallocate(2) is supported])],
 	[])
 
+# mallinfo
+AC_CHECK_FUNC([mallinfo],
+  [AC_DEFINE(HAVE_MALLINFO, 1, [Define if you have mallinfo])])
+
 # getgrouplist
 AC_CHECK_FUNCS([getgrouplist])
 
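
configure now aborts outright when the Boost.Regex headers or library are
missing. A short pre-flight sketch; the package names are
distribution-specific assumptions:

    sudo apt-get install libboost-regex-dev   # Debian/Ubuntu (assumed name)
    sudo yum install boost-devel              # Fedora/RHEL (assumed name)
    ./configure
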
diff --git a/doc/Makefile.in b/doc/Makefile.in
index b8b4876..950177c 100644
--- a/doc/Makefile.in
+++ b/doc/Makefile.in
@@ -137,6 +137,7 @@ AUTOMAKE = @AUTOMAKE@
 AWK = @AWK@
 BOOST_PROGRAM_OPTIONS_LIBS = @BOOST_PROGRAM_OPTIONS_LIBS@
 BOOST_RANDOM_LIBS = @BOOST_RANDOM_LIBS@
+BOOST_REGEX_LIBS = @BOOST_REGEX_LIBS@
 BOOST_THREAD_LIBS = @BOOST_THREAD_LIBS@
 CC = @CC@
 CCAS = @CCAS@
diff --git a/doc/man/8/ceph-disk.rst b/doc/man/8/ceph-disk.rst
index bb67163..8313f5c 100644
--- a/doc/man/8/ceph-disk.rst
+++ b/doc/man/8/ceph-disk.rst
@@ -1,7 +1,7 @@
 :orphan:
 
 ===================================================================
- ceph-disk -- Ceph disk preparation and activation utility for OSD
+ ceph-disk -- Ceph disk utility for OSD
 ===================================================================
 
 .. program:: ceph-disk
@@ -14,12 +14,18 @@ Synopsis
 
 | **ceph-disk** **activate** [*data-path*] [--activate-key *path*]
         [--mark-init *sysvinit|upstart|systemd|auto|none*]
-        [--no-start-daemon]
+        [--no-start-daemon] [--reactivate]
 
 | **ceph-disk** **activate-all**
 
 | **ceph-disk** **list**
 
+| **ceph-disk** **deactivate** [--cluster *clustername*] [*device-path*]
+        [--deactivate-by-id *id*] [--mark-out]
+
+| **ceph-disk** **destroy** [--cluster *clustername*] [*device-path*]
+        [--destroy-by-id *id*] [--dmcrypt-key-dir *KEYDIR*] [--zap]
+
 Description
 ===========
 
@@ -32,6 +38,10 @@ It actually automates the multiple steps involved in manual creation and start
 of an OSD into two steps of preparing and activating the OSD by using the
 subcommands ``prepare`` and ``activate``.
 
+:program:`ceph-disk` likewise condenses the multiple manual steps needed to
+stop and remove an OSD into two steps, deactivating and destroying the OSD,
+by using the subcommands ``deactivate`` and ``destroy``.
+
 Subcommands
 ============
 
@@ -97,6 +107,13 @@ Usage::
 If the option :option:`--no-start-daemon` is given, the activation
 steps are performed but the OSD daemon is not started.
 
+The option :option:`--reactivate` re-activates an OSD that has been
+deactivated with the ``deactivate`` subcommand.
+
+Usage::
+
+	ceph-disk activate [PATH] [--reactivate]
+
 activate-journal
 ----------------
 
@@ -176,6 +193,57 @@ Usage::
 
 Here, [PATH] is the path to a block device or a directory.
 
+deactivate
+----------
+Deactivate a Ceph OSD. It stops the OSD daemon and optionally marks the OSD
+out. The content of the OSD is left untouched, but the *ready*, *active* and
+*INIT-specific* files are removed (so that the ``udev`` rules do not
+automatically re-activate it) and a ``deactive`` file records that the OSD is
+deactivated. If the OSD uses dm-crypt, the data dm-crypt map is removed. When
+``deactivate`` finishes, the OSD is ``down``. A deactivated OSD can later be
+re-activated with the :option:`--reactivate` option of the ``activate`` subcommand.
+
+Usage::
+
+	ceph-disk deactivate [PATH]
+
+Here, [PATH] is a path to a block device or a directory.
+
+The option :option:`--mark-out` can also be used with this subcommand.
+``--mark-out`` marks the OSD out, and the objects it contains will be
+remapped. Do not use this option unless you intend to destroy the OSD.
+
+You can also deactivate an OSD by its ``osd-id`` with the option :option:`--deactivate-by-id`.
+
+Usage::
+
+	ceph-disk deactivate --deactivate-by-id [OSD-ID]
+
+destroy
+-------
+Destroy a Ceph OSD. It removes the OSD from the cluster and the crushmap, and
+deallocates the OSD ID. It can only destroy an OSD that is *down*.
+
+Usage::
+
+	ceph-disk destroy [PATH]
+
+Here, [PATH] is a path to a block device or a directory.
+
+The option :option:`--zap` can also be used with this subcommand.
+``--zap`` destroys the partition table and the content of the disk.
+
+Usage::
+
+	ceph-disk destroy [PATH] [--zap]
+
+You can also use the id of an OSD instead of the path with the option
+:option:`--destroy-by-id`.
+
+Usage::
+
+	ceph-disk destroy --destroy-by-id [OSD-ID]
+
 zap
 ---
 
@@ -255,15 +323,6 @@ Options
 
 	Directory where ``dm-crypt`` keys are stored.
 
-.. option:: --activate-key
-
-   Use when a copy of ``/var/lib/ceph/bootstrap-osd/{cluster}.keyring`` isn't
-   present in the OSD node. Suffix the option by the path to the keyring.
-
-.. option:: --mark-init
-
-   Provide init system to manage the OSD directory.
-
 Availability
 ============
 
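
The new subcommands give a two-step teardown that mirrors prepare/activate.
A hypothetical end-to-end example (the OSD id 2 is illustrative only):

    # stop the daemon and mark the OSD out so its objects are remapped
    ceph-disk deactivate --deactivate-by-id 2 --mark-out
    # once the OSD is down, remove it from the cluster and wipe the disk
    ceph-disk destroy --destroy-by-id 2 --zap
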
diff --git a/doc/man/8/ceph.rst b/doc/man/8/ceph.rst
index 37bb897..1f7cbfe 100644
--- a/doc/man/8/ceph.rst
+++ b/doc/man/8/ceph.rst
@@ -47,7 +47,7 @@ Synopsis
 
 | **ceph** **osd** **tier** [ *add* \| *add-cache* \| *cache-mode* \| *remove* \| *remove-overlay* \| *set-overlay* ] ...
 
-| **ceph** **pg** [ *debug* \| *deep-scrub* \| *dump* \| *dump_json* \| *dump_pools_json* \| *dump_stuck* \| *force_create_pg* \| *getmap* \| *ls* \| *ls-by-osd* \| *ls-by-pool* \| *ls-by-primary* \| *map* \| *repair* \| *scrub* \| *send_pg_creates* \| *set_full_ratio* \| *set_nearfull_ratio* \| *stat* ] ...
+| **ceph** **pg** [ *debug* \| *deep-scrub* \| *dump* \| *dump_json* \| *dump_pools_json* \| *dump_stuck* \| *force_create_pg* \| *getmap* \| *ls* \| *ls-by-osd* \| *ls-by-pool* \| *ls-by-primary* \| *map* \| *repair* \| *scrub* \| *set_full_ratio* \| *set_nearfull_ratio* \| *stat* ] ...
 
 | **ceph** **quorum** [ *enter* \| *exit* ]
 
@@ -884,7 +884,7 @@ Only for tiered pools::
 	ceph osd pool get <poolname> hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|
 	target_max_objects|target_max_bytes|cache_target_dirty_ratio|cache_target_dirty_high_ratio|
 	cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|
-	min_read_recency_for_promote
+	min_read_recency_for_promote|hit_set_grade_decay_rate|hit_set_search_last_n
 
 Only for erasure coded pools::
 
@@ -934,7 +934,8 @@ Usage::
 	target_max_bytes|target_max_objects|cache_target_dirty_ratio|
 	cache_target_dirty_high_ratio|
 	cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|auid|
-	min_read_recency_for_promote|write_fadvise_dontneed
+	min_read_recency_for_promote|write_fadvise_dontneed|hit_set_grade_decay_rate|
+	hit_set_search_last_n
 	<val> {--yes-i-really-mean-it}
 
 Subcommand ``set-quota`` sets object or byte limit on pool.
@@ -1229,12 +1230,6 @@ Usage::
 
 	ceph pg scrub <pgid>
 
-Subcommand ``send_pg_creates`` triggers pg creates to be issued.
-
-Usage::
-
-	ceph pg send_pg_creates
-
 Subcommand ``set_full_ratio`` sets ratio at which pgs are considered full.
 
 Usage::
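
The two hit_set options added above are set and read like any other pool
option. An illustrative sketch (pool name and values are assumptions):

    ceph osd pool set hot-pool hit_set_grade_decay_rate 20
    ceph osd pool set hot-pool hit_set_search_last_n 1
    ceph osd pool get hot-pool hit_set_grade_decay_rate
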
diff --git a/doc/man/8/rbd.rst b/doc/man/8/rbd.rst
index d7eb72d..cd14180 100644
--- a/doc/man/8/rbd.rst
+++ b/doc/man/8/rbd.rst
@@ -9,8 +9,8 @@
 Synopsis
 ========
 
-| **rbd** [ -c *ceph.conf* ] [ -m *monaddr* ] [ -p | --pool *pool* ] [
-  --size *size* ] [ --order *bits* ] [ *command* ... ]
+| **rbd** [ -c *ceph.conf* ] [ -m *monaddr* ] [ --cluster *cluster-name* ]
+  [ -p | --pool *pool* ] [ --size *size* ] [ --order *bits* ] [ *command* ... ]
 
 
 Description
@@ -35,6 +35,10 @@ Options
 
    Connect to specified monitor (instead of looking through ceph.conf).
 
+.. option:: --cluster cluster-name
+
+   Use a cluster name other than the default *ceph*.
+
 .. option:: -p pool-name, --pool pool-name
 
    Interact with the given pool. Required by most commands.
@@ -137,6 +141,7 @@ Parameters
    * object-map: object map support (requires exclusive-lock)
    * fast-diff: fast diff calculations (requires object-map)
    * deep-flatten: snapshot flatten support
+   * journaling: journaled IO support (requires exclusive-lock)
 
 .. option:: --image-shared
 
@@ -425,6 +430,10 @@ libceph (per client instance) options:
 
 * notcp_nodelay - Enable Nagle's algorithm on client sockets (since 4.0).
 
+* cephx_sign_messages - Enable message signing (since 4.4, default).
+
+* nocephx_sign_messages - Disable message signing (since 4.4).
+
 * mount_timeout=x - A timeout on various steps in `rbd map` and `rbd unmap`
   sequences (default is 60 seconds).  In particular, since 4.2 this can be used
   to ensure that `rbd unmap` eventually times out when there is no network
@@ -478,6 +487,10 @@ To map an image via the kernel with cephx enabled::
 
        rbd map mypool/myimage --id admin --keyfile secretfile
 
+To map an image via the kernel using a cluster name other than the default *ceph*::
+
+       rbd map mypool/myimage --cluster cluster-name
+
 To unmap an image::
 
        rbd unmap /dev/rbd0
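
A hedged sketch of the new --cluster flag: the name selects the matching
/etc/ceph/<name>.conf, and the cluster name "backup" here is an assumption:

    # operate on a second cluster whose conf is /etc/ceph/backup.conf
    rbd --cluster backup create mypool/myimage --size 1024
    sudo rbd map mypool/myimage --cluster backup
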
diff --git a/install-deps.sh b/install-deps.sh
index 1bebf09..8249ea3 100755
--- a/install-deps.sh
+++ b/install-deps.sh
@@ -62,7 +62,7 @@ CentOS|Fedora|RedHatEnterpriseServer)
             CentOS|RedHatEnterpriseServer)
                 $SUDO yum install -y yum-utils
                 MAJOR_VERSION=$(lsb_release -rs | cut -f1 -d.)
-                if test $(lsb_release -si) == RedHatEnterpriseServer ; then
+                if test $(lsb_release -si) = RedHatEnterpriseServer ; then
                     $SUDO yum install subscription-manager
                     $SUDO subscription-manager repos --enable=rhel-$MAJOR_VERSION-server-optional-rpms
                 fi
@@ -70,6 +70,9 @@ CentOS|Fedora|RedHatEnterpriseServer)
                 $SUDO yum install --nogpgcheck -y epel-release
                 $SUDO rpm --import /etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-$MAJOR_VERSION
                 $SUDO rm -f /etc/yum.repos.d/dl.fedoraproject.org*
+                if test $(lsb_release -si) = CentOS -a $MAJOR_VERSION = 7 ; then
+                    $SUDO yum-config-manager --enable cr
+                fi
                 ;;
         esac
         sed -e 's/@//g' < ceph.spec.in > $DIR/ceph.spec
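
The new branch enables the CentOS 7 continuous-release (cr) repository,
which this script needs for newer build dependencies. The manual
equivalent, for reference:

    sudo yum install -y yum-utils
    sudo yum-config-manager --enable cr
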
diff --git a/man/Makefile.in b/man/Makefile.in
index 2bf8c61..577ef31 100644
--- a/man/Makefile.in
+++ b/man/Makefile.in
@@ -226,6 +226,7 @@ AUTOMAKE = @AUTOMAKE@
 AWK = @AWK@
 BOOST_PROGRAM_OPTIONS_LIBS = @BOOST_PROGRAM_OPTIONS_LIBS@
 BOOST_RANDOM_LIBS = @BOOST_RANDOM_LIBS@
+BOOST_REGEX_LIBS = @BOOST_REGEX_LIBS@
 BOOST_THREAD_LIBS = @BOOST_THREAD_LIBS@
 CC = @CC@
 CCAS = @CCAS@
diff --git a/man/ceph-authtool.8 b/man/ceph-authtool.8
index 7dc4922..e9e09e8 100644
--- a/man/ceph-authtool.8
+++ b/man/ceph-authtool.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-AUTHTOOL" "8" "November 03, 2015" "dev" "Ceph"
+.TH "CEPH-AUTHTOOL" "8" "December 14, 2015" "dev" "Ceph"
 .SH NAME
 ceph-authtool \- ceph keyring manipulation tool
 .
diff --git a/man/ceph-clsinfo.8 b/man/ceph-clsinfo.8
index bf86fe9..8791e10 100644
--- a/man/ceph-clsinfo.8
+++ b/man/ceph-clsinfo.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-CLSINFO" "8" "November 03, 2015" "dev" "Ceph"
+.TH "CEPH-CLSINFO" "8" "December 14, 2015" "dev" "Ceph"
 .SH NAME
 ceph-clsinfo \- show class object information
 .
diff --git a/man/ceph-conf.8 b/man/ceph-conf.8
index c297fdf..e4675ae 100644
--- a/man/ceph-conf.8
+++ b/man/ceph-conf.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-CONF" "8" "November 03, 2015" "dev" "Ceph"
+.TH "CEPH-CONF" "8" "December 14, 2015" "dev" "Ceph"
 .SH NAME
 ceph-conf \- ceph conf file tool
 .
diff --git a/man/ceph-create-keys.8 b/man/ceph-create-keys.8
index 5c6ebc1..82ec005 100644
--- a/man/ceph-create-keys.8
+++ b/man/ceph-create-keys.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-CREATE-KEYS" "8" "November 03, 2015" "dev" "Ceph"
+.TH "CEPH-CREATE-KEYS" "8" "December 14, 2015" "dev" "Ceph"
 .SH NAME
 ceph-create-keys \- ceph keyring generate tool
 .
diff --git a/man/ceph-debugpack.8 b/man/ceph-debugpack.8
index 57cb828..6bb32d4 100644
--- a/man/ceph-debugpack.8
+++ b/man/ceph-debugpack.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-DEBUGPACK" "8" "November 03, 2015" "dev" "Ceph"
+.TH "CEPH-DEBUGPACK" "8" "December 14, 2015" "dev" "Ceph"
 .SH NAME
 ceph-debugpack \- ceph debug packer utility
 .
diff --git a/man/ceph-dencoder.8 b/man/ceph-dencoder.8
index e798079..0becaa1 100644
--- a/man/ceph-dencoder.8
+++ b/man/ceph-dencoder.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-DENCODER" "8" "November 03, 2015" "dev" "Ceph"
+.TH "CEPH-DENCODER" "8" "December 14, 2015" "dev" "Ceph"
 .SH NAME
 ceph-dencoder \- ceph encoder/decoder utility
 .
diff --git a/man/ceph-deploy.8 b/man/ceph-deploy.8
index 1559dbf..389d2b1 100644
--- a/man/ceph-deploy.8
+++ b/man/ceph-deploy.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-DEPLOY" "8" "November 03, 2015" "dev" "Ceph"
+.TH "CEPH-DEPLOY" "8" "December 14, 2015" "dev" "Ceph"
 .SH NAME
 ceph-deploy \- Ceph deployment tool
 .
diff --git a/man/ceph-detect-init.8 b/man/ceph-detect-init.8
index 5645738..e643f16 100644
--- a/man/ceph-detect-init.8
+++ b/man/ceph-detect-init.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-DETECT-INIT" "8" "November 03, 2015" "dev" "Ceph"
+.TH "CEPH-DETECT-INIT" "8" "December 14, 2015" "dev" "Ceph"
 .SH NAME
 ceph-detect-init \- display the init system Ceph should use
 .
diff --git a/man/ceph-disk.8 b/man/ceph-disk.8
index 2dd7a3e..267237e 100644
--- a/man/ceph-disk.8
+++ b/man/ceph-disk.8
@@ -1,8 +1,8 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-DISK" "8" "November 03, 2015" "dev" "Ceph"
+.TH "CEPH-DISK" "8" "December 14, 2015" "dev" "Ceph"
 .SH NAME
-ceph-disk \- Ceph disk preparation and activation utility for OSD
+ceph-disk \- Ceph disk utility for OSD
 .
 .nr rst2man-indent-level 0
 .
@@ -39,7 +39,7 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .nf
 \fBceph\-disk\fP \fBactivate\fP [\fIdata\-path\fP] [\-\-activate\-key \fIpath\fP]
 [\-\-mark\-init \fIsysvinit|upstart|systemd|auto|none\fP]
-[\-\-no\-start\-daemon]
+[\-\-no\-start\-daemon] [\-\-reactivate]
 .fi
 .sp
 .nf
@@ -50,6 +50,16 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
 \fBceph\-disk\fP \fBlist\fP
 .fi
 .sp
+.nf
+\fBceph\-disk\fP \fBdeactivate\fP [\-\-cluster \fIclustername\fP] [\fIdevice\-path\fP]
+[\-\-deactivate\-by\-id \fIid\fP] [\-\-mark\-out]
+.fi
+.sp
+.nf
+\fBceph\-disk\fP \fBdestroy\fP [\-\-cluster \fIclustername\fP] [\fIdevice\-path\fP]
+[\-\-destroy\-by\-id \fIid\fP] [\-\-dmcrypt\-key\-dir \fIKEYDIR\fP] [\-\-zap]
+.fi
+.sp
 .SH DESCRIPTION
 .sp
 \fBceph\-disk\fP is a utility that can prepare and activate a disk, partition or
@@ -60,6 +70,10 @@ or \fBudev\fP\&. It can also be triggered by other deployment utilities like \fB
 It actually automates the multiple steps involved in manual creation and start
 of an OSD into two steps of preparing and activating the OSD by using the
 subcommands \fBprepare\fP and \fBactivate\fP\&.
+.sp
+\fBceph\-disk\fP likewise condenses the multiple manual steps needed to
+stop and remove an OSD into two steps, deactivating and destroying the OSD,
+by using the subcommands \fBdeactivate\fP and \fBdestroy\fP\&.
 .SH SUBCOMMANDS
 .SS prepare
 .sp
@@ -109,7 +123,7 @@ ceph\-disk activate [PATH]
 .sp
 Here, [PATH] is the path to a block device or a directory.
 .sp
-An additional option \fI\%\-\-activate\-key\fP has to be used with this
+An additional option \fI\-\-activate\-key\fP has to be used with this
 subcommand when a copy of \fB/var/lib/ceph/bootstrap\-osd/{cluster}.keyring\fP
 isn\(aqt present in the OSD node.
 .sp
@@ -125,7 +139,7 @@ ceph\-disk activate [PATH] [\-\-activate\-key PATH]
 .UNINDENT
 .UNINDENT
 .sp
-Another option \fI\%\-\-mark\-init\fP can also be used with this
+Another option \fI\-\-mark\-init\fP can also be used with this
 subcommand.  \fB\-\-mark\-init\fP provides init system to manage the OSD
 directory. It defaults to \fBauto\fP which detects the init system
 suitable for ceph (either \fBsysvinit\fP, \fBsystemd\fP or
@@ -150,6 +164,21 @@ ceph\-disk activate [PATH] [\-\-mark\-init *sysvinit|upstart|systemd|auto|none*]
 .sp
 If the option \fI\-\-no\-start\-daemon\fP is given, the activation
 steps are performed but the OSD daemon is not started.
+.sp
+The option \fI\-\-reactivate\fP re\-activates an OSD that has been
+deactivated with the \fBdeactivate\fP subcommand.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph\-disk activate [PATH] [\-\-reactivate]
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
 .SS activate\-journal
 .sp
 Activate an OSD via its journal device. \fBudev\fP triggers
@@ -169,7 +198,7 @@ ceph\-disk activate\-journal [DEV]
 .sp
 Here, [DEV] is the path to a journal block device.
 .sp
-Others options like \fI\%\-\-activate\-key\fP and \fI\%\-\-mark\-init\fP can also
+Others options like \fI\-\-activate\-key\fP and \fI\-\-mark\-init\fP can also
 be used with this subcommand.
 .sp
 \fB\-\-mark\-init\fP provides init system to manage the OSD directory.
@@ -204,7 +233,7 @@ ceph\-disk activate\-all
 .UNINDENT
 .UNINDENT
 .sp
-Others options like \fI\%\-\-activate\-key\fP and \fI\%\-\-mark\-init\fP can
+Others options like \fI\-\-activate\-key\fP and \fI\-\-mark\-init\fP can
 also be used with this subcommand.
 .sp
 \fB\-\-mark\-init\fP provides init system to manage the OSD directory.
@@ -275,6 +304,95 @@ ceph\-disk unsuppress\-activate [PATH]
 .UNINDENT
 .sp
 Here, [PATH] is the path to a block device or a directory.
+.SS deactivate
+.sp
+Deactivate a Ceph OSD. It stops the OSD daemon and optionally marks the OSD
+out. The content of the OSD is left untouched, but the \fIready\fP, \fIactive\fP
+and \fIINIT\-specific\fP files are removed (so that the \fBudev\fP rules do not
+automatically re\-activate it) and a \fBdeactive\fP file records that the OSD is
+deactivated. If the OSD uses dm\-crypt, the data dm\-crypt map is removed. When
+\fBdeactivate\fP finishes, the OSD is \fBdown\fP\&. A deactivated OSD can later be
+re\-activated with the \fI\-\-reactivate\fP option of the \fBactivate\fP subcommand.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph\-disk deactivate [PATH]
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Here, [PATH] is a path to a block device or a directory.
+.sp
+The option \fI\-\-mark\-out\fP can also be used with this subcommand.
+\fB\-\-mark\-out\fP marks the OSD out, and the objects it contains will be
+remapped. Do not use this option unless you intend to destroy the OSD.
+.sp
+You can also deactivate an OSD by its \fBosd\-id\fP with the option \fI\-\-deactivate\-by\-id\fP\&.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph\-disk deactivate \-\-deactivate\-by\-id [OSD\-ID]
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.SS destroy
+.sp
+Destroy a Ceph OSD. It removes the OSD from the cluster and the crushmap, and
+deallocates the OSD ID. It can only destroy an OSD that is \fIdown\fP\&.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph\-disk destroy [PATH]
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+Here, [PATH] is a path to a block device or a directory.
+.sp
+The option \fI\-\-zap\fP can also be used with this subcommand.
+\fB\-\-zap\fP destroys the partition table and the content of the disk.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph\-disk destroy [PATH] [\-\-zap]
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
+.sp
+You can also use the id of an OSD instead of the path with the option
+\fI\-\-destroy\-by\-id\fP\&.
+.sp
+Usage:
+.INDENT 0.0
+.INDENT 3.5
+.sp
+.nf
+.ft C
+ceph\-disk destroy \-\-destroy\-by\-id [OSD\-ID]
+.ft P
+.fi
+.UNINDENT
+.UNINDENT
 .SS zap
 .sp
 Zap/erase/destroy a device\(aqs partition table and contents. It actually uses
@@ -373,17 +491,6 @@ Encrypt \fB[data\-path]\fP and/or journal devices with \fBdm\-crypt\fP\&.
 .B \-\-dmcrypt\-key\-dir
 Directory where \fBdm\-crypt\fP keys are stored.
 .UNINDENT
-.INDENT 0.0
-.TP
-.B \-\-activate\-key
-Use when a copy of \fB/var/lib/ceph/bootstrap\-osd/{cluster}.keyring\fP isn\(aqt
-present in the OSD node. Suffix the option by the path to the keyring.
-.UNINDENT
-.INDENT 0.0
-.TP
-.B \-\-mark\-init
-Provide init system to manage the OSD directory.
-.UNINDENT
 .SH AVAILABILITY
 .sp
 \fBceph\-disk\fP is part of Ceph, a massively scalable, open\-source, distributed storage system. Please refer to
diff --git a/man/ceph-fuse.8 b/man/ceph-fuse.8
index 0303b14..cdaaa3c 100644
--- a/man/ceph-fuse.8
+++ b/man/ceph-fuse.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-FUSE" "8" "November 03, 2015" "dev" "Ceph"
+.TH "CEPH-FUSE" "8" "December 14, 2015" "dev" "Ceph"
 .SH NAME
 ceph-fuse \- FUSE-based client for ceph
 .
diff --git a/man/ceph-mds.8 b/man/ceph-mds.8
index b4d3009..1a644e1 100644
--- a/man/ceph-mds.8
+++ b/man/ceph-mds.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-MDS" "8" "November 03, 2015" "dev" "Ceph"
+.TH "CEPH-MDS" "8" "December 14, 2015" "dev" "Ceph"
 .SH NAME
 ceph-mds \- ceph metadata server daemon
 .
diff --git a/man/ceph-mon.8 b/man/ceph-mon.8
index 86c1a59..1c67e88 100644
--- a/man/ceph-mon.8
+++ b/man/ceph-mon.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-MON" "8" "November 03, 2015" "dev" "Ceph"
+.TH "CEPH-MON" "8" "December 14, 2015" "dev" "Ceph"
 .SH NAME
 ceph-mon \- ceph monitor daemon
 .
diff --git a/man/ceph-osd.8 b/man/ceph-osd.8
index 764900e..8265af1 100644
--- a/man/ceph-osd.8
+++ b/man/ceph-osd.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-OSD" "8" "November 03, 2015" "dev" "Ceph"
+.TH "CEPH-OSD" "8" "December 14, 2015" "dev" "Ceph"
 .SH NAME
 ceph-osd \- ceph object storage daemon
 .
diff --git a/man/ceph-post-file.8 b/man/ceph-post-file.8
index 83dea9e..7031b47 100644
--- a/man/ceph-post-file.8
+++ b/man/ceph-post-file.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-POST-FILE" "8" "November 03, 2015" "dev" "Ceph"
+.TH "CEPH-POST-FILE" "8" "December 14, 2015" "dev" "Ceph"
 .SH NAME
 ceph-post-file \- post files for ceph developers
 .
diff --git a/man/ceph-rbdnamer.8 b/man/ceph-rbdnamer.8
index a32bdbd..b190aef 100644
--- a/man/ceph-rbdnamer.8
+++ b/man/ceph-rbdnamer.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-RBDNAMER" "8" "November 03, 2015" "dev" "Ceph"
+.TH "CEPH-RBDNAMER" "8" "December 14, 2015" "dev" "Ceph"
 .SH NAME
 ceph-rbdnamer \- udev helper to name RBD devices
 .
diff --git a/man/ceph-rest-api.8 b/man/ceph-rest-api.8
index f34fbb8..db7f2f5 100644
--- a/man/ceph-rest-api.8
+++ b/man/ceph-rest-api.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-REST-API" "8" "November 03, 2015" "dev" "Ceph"
+.TH "CEPH-REST-API" "8" "December 14, 2015" "dev" "Ceph"
 .SH NAME
 ceph-rest-api \- ceph RESTlike administration server
 .
diff --git a/man/ceph-run.8 b/man/ceph-run.8
index caeab9b..a8bbd7d 100644
--- a/man/ceph-run.8
+++ b/man/ceph-run.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-RUN" "8" "November 03, 2015" "dev" "Ceph"
+.TH "CEPH-RUN" "8" "December 14, 2015" "dev" "Ceph"
 .SH NAME
 ceph-run \- restart daemon on core dump
 .
diff --git a/man/ceph-syn.8 b/man/ceph-syn.8
index 1e59f85..b9e53d3 100644
--- a/man/ceph-syn.8
+++ b/man/ceph-syn.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH-SYN" "8" "November 03, 2015" "dev" "Ceph"
+.TH "CEPH-SYN" "8" "December 14, 2015" "dev" "Ceph"
 .SH NAME
 ceph-syn \- ceph synthetic workload generator
 .
diff --git a/man/ceph.8 b/man/ceph.8
index 7a77e4f..4c15931 100644
--- a/man/ceph.8
+++ b/man/ceph.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPH" "8" "November 03, 2015" "dev" "Ceph"
+.TH "CEPH" "8" "December 14, 2015" "dev" "Ceph"
 .SH NAME
 ceph \- ceph administration tool
 .
@@ -108,7 +108,7 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
 .fi
 .sp
 .nf
-\fBceph\fP \fBpg\fP [ \fIdebug\fP | \fIdeep\-scrub\fP | \fIdump\fP | \fIdump_json\fP | \fIdump_pools_json\fP | \fIdump_stuck\fP | \fIforce_create_pg\fP | \fIgetmap\fP | \fIls\fP | \fIls\-by\-osd\fP | \fIls\-by\-pool\fP | \fIls\-by\-primary\fP | \fImap\fP | \fIrepair\fP | \fIscrub\fP | \fIsend_pg_creates\fP | \fIset_full_ratio\fP | \fIset_nearfull_ratio\fP | \fIstat\fP ] ...
+\fBceph\fP \fBpg\fP [ \fIdebug\fP | \fIdeep\-scrub\fP | \fIdump\fP | \fIdump_json\fP | \fIdump_pools_json\fP | \fIdump_stuck\fP | \fIforce_create_pg\fP | \fIgetmap\fP | \fIls\fP | \fIls\-by\-osd\fP | \fIls\-by\-pool\fP | \fIls\-by\-primary\fP | \fImap\fP | \fIrepair\fP | \fIscrub\fP | \fIset_full_ratio\fP | \fIset_nearfull_ratio\fP | \fIstat\fP ] ...
 .fi
 .sp
 .nf
@@ -1831,7 +1831,7 @@ Only for tiered pools:
 ceph osd pool get <poolname> hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|
 target_max_objects|target_max_bytes|cache_target_dirty_ratio|cache_target_dirty_high_ratio|
 cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|
-min_read_recency_for_promote
+min_read_recency_for_promote|hit_set_grade_decay_rate|hit_set_search_last_n
 .ft P
 .fi
 .UNINDENT
@@ -1945,7 +1945,8 @@ hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|debug_fake_ec_pool|
 target_max_bytes|target_max_objects|cache_target_dirty_ratio|
 cache_target_dirty_high_ratio|
 cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|auid|
-min_read_recency_for_promote|write_fadvise_dontneed
+min_read_recency_for_promote|write_fadvise_dontneed|hit_set_grade_decay_rate|
+hit_set_search_last_n
 <val> {\-\-yes\-i\-really\-mean\-it}
 .ft P
 .fi
@@ -2553,20 +2554,6 @@ ceph pg scrub <pgid>
 .UNINDENT
 .UNINDENT
 .sp
-Subcommand \fBsend_pg_creates\fP triggers pg creates to be issued.
-.sp
-Usage:
-.INDENT 0.0
-.INDENT 3.5
-.sp
-.nf
-.ft C
-ceph pg send_pg_creates
-.ft P
-.fi
-.UNINDENT
-.UNINDENT
-.sp
 Subcommand \fBset_full_ratio\fP sets ratio at which pgs are considered full.
 .sp
 Usage:
diff --git a/man/cephfs.8 b/man/cephfs.8
index 078795b..c0624ec 100644
--- a/man/cephfs.8
+++ b/man/cephfs.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CEPHFS" "8" "November 03, 2015" "dev" "Ceph"
+.TH "CEPHFS" "8" "December 14, 2015" "dev" "Ceph"
 .SH NAME
 cephfs \- ceph file system options utility
 .
diff --git a/man/crushtool.8 b/man/crushtool.8
index 190ae6e..a60e430 100644
--- a/man/crushtool.8
+++ b/man/crushtool.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "CRUSHTOOL" "8" "November 03, 2015" "dev" "Ceph"
+.TH "CRUSHTOOL" "8" "December 14, 2015" "dev" "Ceph"
 .SH NAME
 crushtool \- CRUSH map manipulation tool
 .
diff --git a/man/librados-config.8 b/man/librados-config.8
index 731888c..fb153bb 100644
--- a/man/librados-config.8
+++ b/man/librados-config.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "LIBRADOS-CONFIG" "8" "November 03, 2015" "dev" "Ceph"
+.TH "LIBRADOS-CONFIG" "8" "December 14, 2015" "dev" "Ceph"
 .SH NAME
 librados-config \- display information about librados
 .
diff --git a/man/monmaptool.8 b/man/monmaptool.8
index 95ed4c4..b6a8334 100644
--- a/man/monmaptool.8
+++ b/man/monmaptool.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "MONMAPTOOL" "8" "November 03, 2015" "dev" "Ceph"
+.TH "MONMAPTOOL" "8" "December 14, 2015" "dev" "Ceph"
 .SH NAME
 monmaptool \- ceph monitor cluster map manipulation tool
 .
diff --git a/man/mount.ceph.8 b/man/mount.ceph.8
index c5de08a..3a96d7a 100644
--- a/man/mount.ceph.8
+++ b/man/mount.ceph.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "MOUNT.CEPH" "8" "November 03, 2015" "dev" "Ceph"
+.TH "MOUNT.CEPH" "8" "December 14, 2015" "dev" "Ceph"
 .SH NAME
 mount.ceph \- mount a ceph file system
 .
diff --git a/man/osdmaptool.8 b/man/osdmaptool.8
index 7810e08..a6708bc 100644
--- a/man/osdmaptool.8
+++ b/man/osdmaptool.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "OSDMAPTOOL" "8" "November 03, 2015" "dev" "Ceph"
+.TH "OSDMAPTOOL" "8" "December 14, 2015" "dev" "Ceph"
 .SH NAME
 osdmaptool \- ceph osd cluster map manipulation tool
 .
diff --git a/man/rados.8 b/man/rados.8
index a9a0d96..83fb1f0 100644
--- a/man/rados.8
+++ b/man/rados.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "RADOS" "8" "November 03, 2015" "dev" "Ceph"
+.TH "RADOS" "8" "December 14, 2015" "dev" "Ceph"
 .SH NAME
 rados \- rados object storage utility
 .
diff --git a/man/radosgw-admin.8 b/man/radosgw-admin.8
index d3c33e1..b50171d 100644
--- a/man/radosgw-admin.8
+++ b/man/radosgw-admin.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "RADOSGW-ADMIN" "8" "November 03, 2015" "dev" "Ceph"
+.TH "RADOSGW-ADMIN" "8" "December 14, 2015" "dev" "Ceph"
 .SH NAME
 radosgw-admin \- rados REST gateway user administration utility
 .
diff --git a/man/radosgw.8 b/man/radosgw.8
index 2f077ad..a02e218 100644
--- a/man/radosgw.8
+++ b/man/radosgw.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "RADOSGW" "8" "November 03, 2015" "dev" "Ceph"
+.TH "RADOSGW" "8" "December 14, 2015" "dev" "Ceph"
 .SH NAME
 radosgw \- rados REST gateway
 .
diff --git a/man/rbd-fuse.8 b/man/rbd-fuse.8
index e6a1a6d..4cab0d3 100644
--- a/man/rbd-fuse.8
+++ b/man/rbd-fuse.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "RBD-FUSE" "8" "November 03, 2015" "dev" "Ceph"
+.TH "RBD-FUSE" "8" "December 14, 2015" "dev" "Ceph"
 .SH NAME
 rbd-fuse \- expose rbd images as files
 .
diff --git a/man/rbd-replay-many.8 b/man/rbd-replay-many.8
index 929d92d..840831f 100644
--- a/man/rbd-replay-many.8
+++ b/man/rbd-replay-many.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "RBD-REPLAY-MANY" "8" "November 03, 2015" "dev" "Ceph"
+.TH "RBD-REPLAY-MANY" "8" "December 14, 2015" "dev" "Ceph"
 .SH NAME
 rbd-replay-many \- replay a rados block device (RBD) workload on several clients
 .
diff --git a/man/rbd-replay-prep.8 b/man/rbd-replay-prep.8
index 102c2b2..91e63ab 100644
--- a/man/rbd-replay-prep.8
+++ b/man/rbd-replay-prep.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "RBD-REPLAY-PREP" "8" "November 03, 2015" "dev" "Ceph"
+.TH "RBD-REPLAY-PREP" "8" "December 14, 2015" "dev" "Ceph"
 .SH NAME
 rbd-replay-prep \- prepare captured rados block device (RBD) workloads for replay
 .
diff --git a/man/rbd-replay.8 b/man/rbd-replay.8
index 5368d18..c007cfa 100644
--- a/man/rbd-replay.8
+++ b/man/rbd-replay.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "RBD-REPLAY" "8" "November 03, 2015" "dev" "Ceph"
+.TH "RBD-REPLAY" "8" "December 14, 2015" "dev" "Ceph"
 .SH NAME
 rbd-replay \- replay rados block device (RBD) workloads
 .
diff --git a/man/rbd.8 b/man/rbd.8
index 60fe011..6258df6 100644
--- a/man/rbd.8
+++ b/man/rbd.8
@@ -1,6 +1,6 @@
 .\" Man page generated from reStructuredText.
 .
-.TH "RBD" "8" "November 03, 2015" "dev" "Ceph"
+.TH "RBD" "8" "December 14, 2015" "dev" "Ceph"
 .SH NAME
 rbd \- manage rados block device (RBD) images
 .
@@ -32,8 +32,8 @@ level margin: \\n[rst2man-indent\\n[rst2man-indent-level]]
 ..
 .SH SYNOPSIS
 .nf
-\fBrbd\fP [ \-c \fIceph.conf\fP ] [ \-m \fImonaddr\fP ] [ \-p | \-\-pool \fIpool\fP ] [
-\-\-size \fIsize\fP ] [ \-\-order \fIbits\fP ] [ \fIcommand\fP ... ]
+\fBrbd\fP [ \-c \fIceph.conf\fP ] [ \-m \fImonaddr\fP ] [ \-\-cluster \fIcluster\-name\fP ]
+[ \-p | \-\-pool \fIpool\fP ] [ \-\-size \fIsize\fP ] [ \-\-order \fIbits\fP ] [ \fIcommand\fP ... ]
 .fi
 .sp
 .SH DESCRIPTION
@@ -57,6 +57,11 @@ Connect to specified monitor (instead of looking through ceph.conf).
 .UNINDENT
 .INDENT 0.0
 .TP
+.B \-\-cluster cluster\-name
+Use a cluster name other than the default \fIceph\fP\&.
+.UNINDENT
+.INDENT 0.0
+.TP
 .B \-p pool\-name, \-\-pool pool\-name
 Interact with the given pool. Required by most commands.
 .UNINDENT
@@ -179,6 +184,8 @@ object\-map: object map support (requires exclusive\-lock)
 fast\-diff: fast diff calculations (requires object\-map)
 .IP \(bu 2
 deep\-flatten: snapshot flatten support
+.IP \(bu 2
+journaling: journaled IO support (requires exclusive\-lock)
 .UNINDENT
 .UNINDENT
 .INDENT 0.0
@@ -468,6 +475,10 @@ default).
 .IP \(bu 2
 notcp_nodelay \- Enable Nagle\(aqs algorithm on client sockets (since 4.0).
 .IP \(bu 2
+cephx_sign_messages \- Enable message signing (since 4.4, default).
+.IP \(bu 2
+nocephx_sign_messages \- Disable message signing (since 4.4).
+.IP \(bu 2
 mount_timeout=x \- A timeout on various steps in \fIrbd map\fP and \fIrbd unmap\fP
 sequences (default is 60 seconds).  In particular, since 4.2 this can be used
 to ensure that \fIrbd unmap\fP eventually times out when there is no network
@@ -585,6 +596,13 @@ rbd map mypool/myimage \-\-id admin \-\-keyfile secretfile
 .UNINDENT
 .UNINDENT
 .sp
+To map an image via the kernel using a cluster name other than the default \fIceph\fP:
+.INDENT 0.0
+.INDENT 3.5
+rbd map mypool/myimage \-\-cluster \fIcluster\-name\fP
+.UNINDENT
+.UNINDENT
+.sp
 To unmap an image:
 .INDENT 0.0
 .INDENT 3.5
diff --git a/selinux/Makefile.in b/selinux/Makefile.in
index 0192ca9..36a629e 100644
--- a/selinux/Makefile.in
+++ b/selinux/Makefile.in
@@ -137,6 +137,7 @@ AUTOMAKE = @AUTOMAKE@
 AWK = @AWK@
 BOOST_PROGRAM_OPTIONS_LIBS = @BOOST_PROGRAM_OPTIONS_LIBS@
 BOOST_RANDOM_LIBS = @BOOST_RANDOM_LIBS@
+BOOST_REGEX_LIBS = @BOOST_REGEX_LIBS@
 BOOST_THREAD_LIBS = @BOOST_THREAD_LIBS@
 CC = @CC@
 CCAS = @CCAS@
diff --git a/src/.git_version b/src/.git_version
index 3717ae8..f39fca9 100644
--- a/src/.git_version
+++ b/src/.git_version
@@ -1,2 +1,2 @@
-bb2ecea240f3a1d525bcb35670cb07bd1f0ca299
-v9.2.0
+9180a926a4450179534bc419d306f423670174c9
+v10.0.1
diff --git a/src/Makefile-client.am b/src/Makefile-client.am
index dcd1835..ff7638b 100644
--- a/src/Makefile-client.am
+++ b/src/Makefile-client.am
@@ -23,7 +23,6 @@ ceph: ceph.in ./ceph_ver.h Makefile
 	sed -ie "s|@PYTHON_EXECUTABLE@|/usr/bin/env python|" $@.tmp
 	grep CEPH_GIT_NICE_VER ./ceph_ver.h | cut -f 3 -d " " | sed s/\"//g | xargs -I "{}" sed -ie "s/@CEPH_GIT_NICE_VER@/{}/g" $@.tmp
 	grep CEPH_GIT_VER ./ceph_ver.h | cut -f 3 -d " " | sed s/\"//g | xargs -I "{}" sed -ie "s/@CEPH_GIT_VER@/{}/g" $@.tmp
-	cat $(srcdir)/$@.in >>$@.tmp
 	chmod a+x $@.tmp
 	chmod a-w $@.tmp
 	mv $@.tmp $@
@@ -48,7 +47,8 @@ bash_completion_DATA += \
 
 bin_SCRIPTS += \
 	ceph-rbdnamer \
-	rbd-replay-many
+	rbd-replay-many \
+	rbdmap
 
 python_PYTHON += pybind/rbd.py
 
@@ -58,12 +58,6 @@ if LINUX
 noinst_LTLIBRARIES += libkrbd.la
 endif # LINUX
 
-rbd_SOURCES = rbd.cc
-rbd_LDADD = $(LIBKRBD) $(LIBRBD) $(LIBRADOS) $(CEPH_GLOBAL)
-if LINUX
-bin_PROGRAMS += rbd
-endif # LINUX
-
 endif # WITH_RBD
 
 # Fuse targets
diff --git a/src/Makefile-env.am b/src/Makefile-env.am
index 63ea49f..e3b6935 100644
--- a/src/Makefile-env.am
+++ b/src/Makefile-env.am
@@ -80,7 +80,6 @@ HARDENING_LDFLAGS =  \
 AM_COMMON_CPPFLAGS = \
 	-D__CEPH__ \
 	-D_FILE_OFFSET_BITS=64 \
-	-D_REENTRANT \
 	-D_THREAD_SAFE \
 	-D__STDC_FORMAT_MACROS \
 	-D_GNU_SOURCE \
@@ -88,6 +87,27 @@ AM_COMMON_CPPFLAGS = \
 	-DCEPH_PKGLIBDIR=\"${pkglibdir}\" \
 	-DGTEST_USE_OWN_TR1_TUPLE=0
 
+if LINUX
+AM_COMMON_CPPFLAGS += \
+	-D_REENTRANT
+endif
+
+if FREEBSD
+AM_COMMON_CPPFLAGS += \
+	-D_REENTRANT
+endif
+
+if DARWIN
+AM_COMMON_CPPFLAGS += \
+	-D_REENTRANT
+endif
+
+if SOLARIS
+AM_COMMON_CPPFLAGS += \
+	-D_PTHREADS \
+	-D_POSIX_C_SOURCE
+endif
+
 AM_COMMON_CFLAGS = \
 	-Wall \
 	${WARN_TYPE_LIMITS} \
@@ -100,6 +120,9 @@ AM_COMMON_CFLAGS = \
 if !CLANG
 	AM_COMMON_CFLAGS += -rdynamic
 endif
+if SOLARIS
+	AM_COMMON_CFLAGS += -Wno-unused-local-typedefs
+endif
 
 AM_CFLAGS = $(AM_COMMON_CFLAGS) $(HARDENING_CFLAGS)
 AM_CPPFLAGS = $(AM_COMMON_CPPFLAGS)
@@ -113,6 +136,11 @@ if !CLANG
 	AM_CXXFLAGS += -Wstrict-null-sentinel
 endif
 
+# Solaris hardening
+if SOLARIS
+	AM_CXXFLAGS += -lssp_nonshared
+endif
+
 # note: this is position dependant, it affects the -l options that
 # come after it on the command line. when you use ${AM_LDFLAGS} in
 # later rules, take care where you place it. for more information, see
@@ -169,13 +197,14 @@ LIBMSG = libmsg.la
 LIBCRUSH = libcrush.la
 LIBCOMPRESSOR = libcompressor.la -lsnappy
 LIBJSON_SPIRIT = libjson_spirit.la
+LIBKV = libkv.a
 LIBLOG = liblog.la
-LIBOS = libos.la
-LIBOS_TYPES = libos_types.la
-LIBOSD = libosd.la
+LIBOS = libos.a
+LIBOS_TYPES = libos_types.a
+LIBOSD = libosd.a
 LIBOSD_TYPES = libosd_types.la
 LIBOSDC = libosdc.la
-LIBMON = libmon.la
+LIBMON = libmon.a
 LIBMON_TYPES = libmon_types.la
 LIBMDS = libmds.la
 LIBCLIENT = libclient.la
@@ -198,10 +227,6 @@ if WITH_LIBZFS
 LIBOS += libos_zfs.a -lzfs
 endif # WITH_LIBZFS
 
-if WITH_LIBROCKSDB
-LIBOS += libos_rocksdb.la
-endif # WITH_LIBROCKSDB
-
 if WITH_TCMALLOC_MINIMAL
 LIBPERFGLUE += -ltcmalloc_minimal
 endif # WITH_TCMALLOC_MINIMAL
@@ -229,8 +254,17 @@ LIBMON += $(LIBPERFGLUE)
 LIBOSD += $(LIBPERFGLUE)
 LIBMDS += $(LIBPERFGLUE)
 
-# Always use system leveldb
-LIBOS += -lleveldb -lsnappy
+# OSD needs types
+LIBOSD += $(LIBOSD_TYPES) $(LIBOS_TYPES)
+
+# libkv/libos linking order is ornery
+if WITH_SLIBROCKSDB
+LIBKV += rocksdb/librocksdb.a
+endif
+LIBKV += -lbz2 -lz -lleveldb -lsnappy
+LIBOS += $(LIBOS_TYPES) $(LIBKV)
+
+LIBMON += $(LIBMON_TYPES)
 
 # Use this for binaries requiring libglobal
 CEPH_GLOBAL = $(LIBGLOBAL) $(LIBCOMMON) $(PTHREAD_LIBS) -lm $(CRYPTO_LIBS) $(EXTRALIBS)
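Note on the Makefile-env.am hunks: several libtool convenience libraries
(libos.la, libosd.la, libmon.la) become plain static archives (.a), and
the key/value store backends are grouped into a new LIBKV. Static
archives make link order significant: the linker scans each archive once,
left to right, keeping only objects that resolve symbols already
referenced, so every archive must precede the archives and -l libraries
it depends on. A hedged sketch of how the variables expand after this
patch (WITH_SLIBROCKSDB case shown; the exact contents are illustrative):

    # LIBKV  = libkv.a rocksdb/librocksdb.a -lbz2 -lz -lleveldb -lsnappy
    # LIBOS  = libos.a libos_types.a $(LIBKV)
    # libkv.a calls into RocksDB, so librocksdb.a must follow it; RocksDB
    # in turn pulls in bz2/zlib/leveldb/snappy, so the -l flags come last.
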
diff --git a/src/Makefile-rocksdb.am b/src/Makefile-rocksdb.am
index 9d45f48..677829d 100644
--- a/src/Makefile-rocksdb.am
+++ b/src/Makefile-rocksdb.am
@@ -1,463 +1,754 @@
-if WITH_SLIBROCKSDB
-  SUBDIRS += rocksdb
-else
-  EXTRA_DIST += \
-	rocksdb/.gitignore \
-        rocksdb/CONTRIBUTING.md \
-        rocksdb/HISTORY.md \
-        rocksdb/INSTALL.md \
-        rocksdb/LICENSE \
-        rocksdb/Makefile.am \
-        rocksdb/PATENTS \
-        rocksdb/README.md \
-        rocksdb/ROCKSDB_LITE.md \
-	rocksdb/AUTHORS \
-	rocksdb/configure.ac \
-	rocksdb/CONTRIBUTING.md \
-	rocksdb/db/builder.cc \
-	rocksdb/db/builder.h \
-	rocksdb/db/c.cc \
-	rocksdb/db/column_family.cc \
-	rocksdb/db/column_family.h \
-	rocksdb/db/column_family_test.cc \
-	rocksdb/db/compact_files_test.cc \
-	rocksdb/db/compaction.cc \
-	rocksdb/db/compaction.h \
-	rocksdb/db/compaction_job.cc \
-	rocksdb/db/compaction_job.h \
-	rocksdb/db/compaction_job_test.cc \
-	rocksdb/db/compaction_picker.cc \
-	rocksdb/db/compaction_picker.h \
-	rocksdb/db/compaction_picker_test.cc \
-	rocksdb/db/comparator_db_test.cc \
-	rocksdb/db/corruption_test.cc \
-	rocksdb/db/c_test.c \
-	rocksdb/db/cuckoo_table_db_test.cc \
-	rocksdb/db/db_bench.cc \
-	rocksdb/db/db_filesnapshot.cc \
-	rocksdb/db/dbformat.cc \
-	rocksdb/db/dbformat.h \
-	rocksdb/db/dbformat_test.cc \
-	rocksdb/db/db_impl.cc \
-	rocksdb/db/db_impl_debug.cc \
-	rocksdb/db/db_impl_experimental.cc \
-	rocksdb/db/db_impl.h \
-	rocksdb/db/db_impl_readonly.cc \
-	rocksdb/db/db_impl_readonly.h \
-	rocksdb/db/db_iter.cc \
-	rocksdb/db/db_iter.h \
-	rocksdb/db/db_iter_test.cc \
-	rocksdb/db/db_test.cc \
-	rocksdb/db/deletefile_test.cc \
-	rocksdb/db/event_logger_helpers.cc \
-	rocksdb/db/event_logger_helpers.h \
-	rocksdb/db/experimental.cc \
-	rocksdb/db/fault_injection_test.cc \
-	rocksdb/db/file_indexer.cc \
-	rocksdb/db/file_indexer.h \
-	rocksdb/db/file_indexer_test.cc \
-	rocksdb/db/filename.cc \
-	rocksdb/db/filename.h \
-	rocksdb/db/filename_test.cc \
-	rocksdb/db/flush_job.cc \
-	rocksdb/db/flush_job.h \
-	rocksdb/db/flush_job_test.cc \
-	rocksdb/db/flush_scheduler.cc \
-	rocksdb/db/flush_scheduler.h \
-	rocksdb/db/forward_iterator.cc \
-	rocksdb/db/forward_iterator.h \
-	rocksdb/db/internal_stats.cc \
-	rocksdb/db/internal_stats.h \
-	rocksdb/db/job_context.h \
-	rocksdb/db/listener_test.cc \
-	rocksdb/db/log_format.h \
-	rocksdb/db/log_reader.cc \
-	rocksdb/db/log_reader.h \
-	rocksdb/db/log_test.cc \
-	rocksdb/db/log_writer.cc \
-	rocksdb/db/log_writer.h \
-	rocksdb/db/managed_iterator.cc \
-	rocksdb/db/managed_iterator.h \
-	rocksdb/db/memtable_allocator.cc \
-	rocksdb/db/memtable_allocator.h \
-	rocksdb/db/memtable.cc \
-	rocksdb/db/memtable.h \
-	rocksdb/db/memtable_list.cc \
-	rocksdb/db/memtable_list.h \
-	rocksdb/db/memtable_list_test.cc \
-	rocksdb/db/memtablerep_bench.cc \
-	rocksdb/db/merge_context.h \
-	rocksdb/db/merge_helper.cc \
-	rocksdb/db/merge_helper.h \
-	rocksdb/db/merge_operator.cc \
-	rocksdb/db/merge_test.cc \
-	rocksdb/db/perf_context_test.cc \
-	rocksdb/db/plain_table_db_test.cc \
-	rocksdb/db/prefix_test.cc \
-	rocksdb/db/repair.cc \
-	rocksdb/db/skiplist.h \
-	rocksdb/db/skiplist_test.cc \
-	rocksdb/db/slice.cc \
-	rocksdb/db/snapshot.h \
-	rocksdb/db/table_cache.cc \
-	rocksdb/db/table_cache.h \
-	rocksdb/db/table_properties_collector.cc \
-	rocksdb/db/table_properties_collector.h \
-	rocksdb/db/table_properties_collector_test.cc \
-	rocksdb/db/transaction_log_impl.cc \
-	rocksdb/db/transaction_log_impl.h \
-	rocksdb/db/version_builder.cc \
-	rocksdb/db/version_builder.h \
-	rocksdb/db/version_builder_test.cc \
-	rocksdb/db/version_edit.cc \
-	rocksdb/db/version_edit.h \
-	rocksdb/db/version_edit_test.cc \
-	rocksdb/db/version_set.cc \
-	rocksdb/db/version_set.h \
-	rocksdb/db/version_set_test.cc \
-	rocksdb/db/wal_manager.cc \
-	rocksdb/db/wal_manager.h \
-	rocksdb/db/wal_manager_test.cc \
-	rocksdb/db/write_batch_base.cc \
-	rocksdb/db/write_batch.cc \
-	rocksdb/db/write_batch_internal.h \
-	rocksdb/db/write_batch_test.cc \
-	rocksdb/db/writebuffer.h \
-	rocksdb/db/write_controller.cc \
-	rocksdb/db/write_controller.h \
-	rocksdb/db/write_controller_test.cc \
-	rocksdb/db/write_thread.cc \
-	rocksdb/db/write_thread.h \
-	rocksdb/doc/doc.css \
-	rocksdb/doc/index.html \
-	rocksdb/doc/log_format.txt \
-	rocksdb/doc/rockslogo.jpg \
-	rocksdb/doc/rockslogo.png \
-	rocksdb/examples/column_families_example.cc \
-	rocksdb/examples/compact_files_example.cc \
-	rocksdb/examples/c_simple_example.c \
-	rocksdb/examples/.gitignore \
-	rocksdb/examples/Makefile \
-	rocksdb/examples/README.md \
-	rocksdb/examples/simple_example.cc \
-	rocksdb/hdfs/env_hdfs.h \
-	rocksdb/hdfs/README \
-	rocksdb/hdfs/setup.sh \
-	rocksdb/HISTORY.md \
-	rocksdb/include/rocksdb/cache.h \
-	rocksdb/include/rocksdb/c.h \
-	rocksdb/include/rocksdb/compaction_filter.h \
-	rocksdb/include/rocksdb/comparator.h \
-	rocksdb/include/rocksdb/db.h \
-	rocksdb/include/rocksdb/env.h \
-	rocksdb/include/rocksdb/experimental.h \
-	rocksdb/include/rocksdb/filter_policy.h \
-	rocksdb/include/rocksdb/flush_block_policy.h \
-	rocksdb/include/rocksdb/immutable_options.h \
-	rocksdb/include/rocksdb/iostats_context.h \
-	rocksdb/include/rocksdb/iterator.h \
-	rocksdb/include/rocksdb/ldb_tool.h \
-	rocksdb/include/rocksdb/listener.h \
-	rocksdb/include/rocksdb/memtablerep.h \
-	rocksdb/include/rocksdb/merge_operator.h \
-	rocksdb/include/rocksdb/metadata.h \
-	rocksdb/include/rocksdb/options.h \
-	rocksdb/include/rocksdb/perf_context.h \
-	rocksdb/include/rocksdb/rate_limiter.h \
-	rocksdb/include/rocksdb/slice.h \
-	rocksdb/include/rocksdb/slice_transform.h \
-	rocksdb/include/rocksdb/sst_dump_tool.h \
-	rocksdb/include/rocksdb/statistics.h \
-	rocksdb/include/rocksdb/status.h \
-	rocksdb/include/rocksdb/table.h \
-	rocksdb/include/rocksdb/table_properties.h \
-	rocksdb/include/rocksdb/thread_status.h \
-	rocksdb/include/rocksdb/transaction_log.h \
-	rocksdb/include/rocksdb/types.h \
-	rocksdb/include/rocksdb/universal_compaction.h \
-	rocksdb/include/rocksdb/utilities/backupable_db.h \
-	rocksdb/include/rocksdb/utilities/checkpoint.h \
-	rocksdb/include/rocksdb/utilities/convenience.h \
-	rocksdb/include/rocksdb/utilities/db_ttl.h \
-	rocksdb/include/rocksdb/utilities/document_db.h \
-	rocksdb/include/rocksdb/utilities/flashcache.h \
-	rocksdb/include/rocksdb/utilities/geo_db.h \
-	rocksdb/include/rocksdb/utilities/json_document.h \
-	rocksdb/include/rocksdb/utilities/leveldb_options.h \
-	rocksdb/include/rocksdb/utilities/spatial_db.h \
-	rocksdb/include/rocksdb/utilities/stackable_db.h \
-	rocksdb/include/rocksdb/utilities/utility_db.h \
-	rocksdb/include/rocksdb/utilities/write_batch_with_index.h \
-	rocksdb/include/rocksdb/version.h \
-	rocksdb/include/rocksdb/write_batch_base.h \
-	rocksdb/include/rocksdb/write_batch.h \
-	rocksdb/include/utilities/backupable_db.h \
-	rocksdb/include/utilities/db_ttl.h \
-	rocksdb/include/utilities/document_db.h \
-	rocksdb/include/utilities/geo_db.h \
-	rocksdb/include/utilities/json_document.h \
-	rocksdb/include/utilities/stackable_db.h \
-	rocksdb/include/utilities/utility_db.h \
-	rocksdb/INSTALL.md \
-	rocksdb/LICENSE \
-	rocksdb/m4/libtool.m4 \
-	rocksdb/m4/lt~obsolete.m4 \
-	rocksdb/m4/ltoptions.m4 \
-	rocksdb/m4/ltsugar.m4 \
-	rocksdb/m4/ltversion.m4 \
-	rocksdb/Makefile.am \
-	rocksdb/PATENTS \
-	rocksdb/port/likely.h \
-	rocksdb/port/port_example.h \
-	rocksdb/port/port.h \
-	rocksdb/port/port_posix.cc \
-	rocksdb/port/port_posix.h \
-	rocksdb/port/README \
-	rocksdb/port/stack_trace.cc \
-	rocksdb/port/stack_trace.h \
-	rocksdb/port/win/stdint.h \
-	rocksdb/README.md \
-	rocksdb/ROCKSDB_LITE.md \
-	rocksdb/table/adaptive_table_factory.cc \
-	rocksdb/table/adaptive_table_factory.h \
-	rocksdb/table/block_based_filter_block.cc \
-	rocksdb/table/block_based_filter_block.h \
-	rocksdb/table/block_based_filter_block_test.cc \
-	rocksdb/table/block_based_table_builder.cc \
-	rocksdb/table/block_based_table_builder.h \
-	rocksdb/table/block_based_table_factory.cc \
-	rocksdb/table/block_based_table_factory.h \
-	rocksdb/table/block_based_table_reader.cc \
-	rocksdb/table/block_based_table_reader.h \
-	rocksdb/table/block_builder.cc \
-	rocksdb/table/block_builder.h \
-	rocksdb/table/block.cc \
-	rocksdb/table/block.h \
-	rocksdb/table/block_hash_index.cc \
-	rocksdb/table/block_hash_index.h \
-	rocksdb/table/block_hash_index_test.cc \
-	rocksdb/table/block_prefix_index.cc \
-	rocksdb/table/block_prefix_index.h \
-	rocksdb/table/block_test.cc \
-	rocksdb/table/bloom_block.cc \
-	rocksdb/table/bloom_block.h \
-	rocksdb/table/cuckoo_table_builder.cc \
-	rocksdb/table/cuckoo_table_builder.h \
-	rocksdb/table/cuckoo_table_builder_test.cc \
-	rocksdb/table/cuckoo_table_factory.cc \
-	rocksdb/table/cuckoo_table_factory.h \
-	rocksdb/table/cuckoo_table_reader.cc \
-	rocksdb/table/cuckoo_table_reader.h \
-	rocksdb/table/cuckoo_table_reader_test.cc \
-	rocksdb/table/filter_block.h \
-	rocksdb/table/flush_block_policy.cc \
-	rocksdb/table/format.cc \
-	rocksdb/table/format.h \
-	rocksdb/table/full_filter_block.cc \
-	rocksdb/table/full_filter_block.h \
-	rocksdb/table/full_filter_block_test.cc \
-	rocksdb/table/get_context.cc \
-	rocksdb/table/get_context.h \
-	rocksdb/table/iterator.cc \
-	rocksdb/table/iterator_wrapper.h \
-	rocksdb/table/iter_heap.h \
-	rocksdb/table/merger.cc \
-	rocksdb/table/merger.h \
-	rocksdb/table/merger_test.cc \
-	rocksdb/table/meta_blocks.cc \
-	rocksdb/table/meta_blocks.h \
-	rocksdb/table/mock_table.cc \
-	rocksdb/table/mock_table.h \
-	rocksdb/table/plain_table_builder.cc \
-	rocksdb/table/plain_table_builder.h \
-	rocksdb/table/plain_table_factory.cc \
-	rocksdb/table/plain_table_factory.h \
-	rocksdb/table/plain_table_index.cc \
-	rocksdb/table/plain_table_index.h \
-	rocksdb/table/plain_table_key_coding.cc \
-	rocksdb/table/plain_table_key_coding.h \
-	rocksdb/table/plain_table_reader.cc \
-	rocksdb/table/plain_table_reader.h \
-	rocksdb/table/table_builder.h \
-	rocksdb/table/table_properties.cc \
-	rocksdb/table/table_properties_internal.h \
-	rocksdb/table/table_reader_bench.cc \
-	rocksdb/table/table_reader.h \
-	rocksdb/table/table_test.cc \
-	rocksdb/table/two_level_iterator.cc \
-	rocksdb/table/two_level_iterator.h \
-	rocksdb/third-party/fbson/COMMIT.md \
-	rocksdb/third-party/fbson/FbsonDocument.h \
-	rocksdb/third-party/fbson/FbsonJsonParser.h \
-	rocksdb/third-party/fbson/FbsonStream.h \
-	rocksdb/third-party/fbson/FbsonUtil.h \
-	rocksdb/third-party/fbson/FbsonWriter.h \
-	rocksdb/third-party/flashcache/flashcache_ioctl.h \
-	rocksdb/third-party/gtest-1.7.0/fused-src/gtest/gtest-all.cc \
-	rocksdb/third-party/gtest-1.7.0/fused-src/gtest/gtest.h \
-	rocksdb/USERS.md \
-	rocksdb/util/allocator.h \
-	rocksdb/util/arena.cc \
-	rocksdb/util/arena.h \
-	rocksdb/util/arena_test.cc \
-	rocksdb/util/auto_roll_logger.cc \
-	rocksdb/util/auto_roll_logger.h \
-	rocksdb/util/auto_roll_logger_test.cc \
-	rocksdb/util/autovector.h \
-	rocksdb/util/autovector_test.cc \
-	rocksdb/util/bloom.cc \
-	rocksdb/util/bloom_test.cc \
-	rocksdb/util/build_version.h \
-	rocksdb/util/cache_bench.cc \
-	rocksdb/util/cache.cc \
-	rocksdb/util/cache_test.cc \
-	rocksdb/util/coding.cc \
-	rocksdb/util/coding.h \
-	rocksdb/util/coding_test.cc \
-	rocksdb/util/comparator.cc \
-	rocksdb/util/compression.h \
-	rocksdb/util/crc32c.cc \
-	rocksdb/util/crc32c.h \
-	rocksdb/util/crc32c_test.cc \
-	rocksdb/util/db_info_dumper.cc \
-	rocksdb/util/db_info_dumper.h \
-	rocksdb/util/dynamic_bloom.cc \
-	rocksdb/util/dynamic_bloom.h \
-	rocksdb/util/dynamic_bloom_test.cc \
-	rocksdb/util/env.cc \
-	rocksdb/util/env_hdfs.cc \
-	rocksdb/util/env_posix.cc \
-	rocksdb/util/env_test.cc \
-	rocksdb/util/event_logger.cc \
-	rocksdb/util/event_logger.h \
-	rocksdb/util/event_logger_test.cc \
-	rocksdb/util/filelock_test.cc \
-	rocksdb/util/file_util.cc \
-	rocksdb/util/file_util.h \
-	rocksdb/util/filter_policy.cc \
-	rocksdb/util/hash.cc \
-	rocksdb/util/hash_cuckoo_rep.cc \
-	rocksdb/util/hash_cuckoo_rep.h \
-	rocksdb/util/hash.h \
-	rocksdb/util/hash_linklist_rep.cc \
-	rocksdb/util/hash_linklist_rep.h \
-	rocksdb/util/hash_skiplist_rep.cc \
-	rocksdb/util/hash_skiplist_rep.h \
-	rocksdb/util/histogram.cc \
-	rocksdb/util/histogram.h \
-	rocksdb/util/histogram_test.cc \
-	rocksdb/util/instrumented_mutex.cc \
-	rocksdb/util/instrumented_mutex.h \
-	rocksdb/util/iostats_context.cc \
-	rocksdb/util/iostats_context_imp.h \
-	rocksdb/utilities/backupable/backupable_db.cc \
-	rocksdb/utilities/backupable/backupable_db_test.cc \
-	rocksdb/utilities/checkpoint/checkpoint.cc \
-	rocksdb/utilities/compacted_db/compacted_db_impl.cc \
-	rocksdb/utilities/compacted_db/compacted_db_impl.h \
-	rocksdb/utilities/convenience/convenience.cc \
-	rocksdb/utilities/document/document_db.cc \
-	rocksdb/utilities/document/document_db_test.cc \
-	rocksdb/utilities/document/json_document_builder.cc \
-	rocksdb/utilities/document/json_document.cc \
-	rocksdb/utilities/document/json_document_test.cc \
-	rocksdb/utilities/flashcache/flashcache.cc \
-	rocksdb/utilities/flashcache/flashcache.h \
-	rocksdb/utilities/geodb/geodb_impl.cc \
-	rocksdb/utilities/geodb/geodb_impl.h \
-	rocksdb/utilities/geodb/geodb_test.cc \
-	rocksdb/utilities/leveldb_options/leveldb_options.cc \
-	rocksdb/utilities/merge_operators.h \
-	rocksdb/utilities/merge_operators/put.cc \
-	rocksdb/utilities/merge_operators/string_append/stringappend2.cc \
-	rocksdb/utilities/merge_operators/string_append/stringappend2.h \
-	rocksdb/utilities/merge_operators/string_append/stringappend.cc \
-	rocksdb/utilities/merge_operators/string_append/stringappend.h \
-	rocksdb/utilities/merge_operators/string_append/stringappend_test.cc \
-	rocksdb/utilities/merge_operators/uint64add.cc \
-	rocksdb/utilities/redis/README \
-	rocksdb/utilities/redis/redis_list_exception.h \
-	rocksdb/utilities/redis/redis_list_iterator.h \
-	rocksdb/utilities/redis/redis_lists.cc \
-	rocksdb/utilities/redis/redis_lists.h \
-	rocksdb/utilities/redis/redis_lists_test.cc \
-	rocksdb/utilities/spatialdb/spatial_db.cc \
-	rocksdb/utilities/spatialdb/spatial_db_test.cc \
-	rocksdb/utilities/spatialdb/utils.h \
-	rocksdb/utilities/ttl/db_ttl_impl.cc \
-	rocksdb/utilities/ttl/db_ttl_impl.h \
-	rocksdb/utilities/ttl/ttl_test.cc \
-	rocksdb/utilities/write_batch_with_index/write_batch_with_index.cc \
-	rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.cc \
-	rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h \
-	rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc \
-	rocksdb/util/ldb_cmd.cc \
-	rocksdb/util/ldb_cmd_execute_result.h \
-	rocksdb/util/ldb_cmd.h \
-	rocksdb/util/ldb_tool.cc \
-	rocksdb/util/log_buffer.cc \
-	rocksdb/util/log_buffer.h \
-	rocksdb/util/logging.cc \
-	rocksdb/util/logging.h \
-	rocksdb/util/log_write_bench.cc \
-	rocksdb/util/manual_compaction_test.cc \
-	rocksdb/util/memenv.cc \
-	rocksdb/util/memenv_test.cc \
-	rocksdb/util/mock_env.cc \
-	rocksdb/util/mock_env.h \
-	rocksdb/util/mock_env_test.cc \
-	rocksdb/util/murmurhash.cc \
-	rocksdb/util/murmurhash.h \
-	rocksdb/util/mutable_cf_options.cc \
-	rocksdb/util/mutable_cf_options.h \
-	rocksdb/util/mutexlock.h \
-	rocksdb/util/options_builder.cc \
-	rocksdb/util/options.cc \
-	rocksdb/util/options_helper.cc \
-	rocksdb/util/options_helper.h \
-	rocksdb/util/options_test.cc \
-	rocksdb/util/perf_context.cc \
-	rocksdb/util/perf_context_imp.h \
-	rocksdb/util/posix_logger.h \
-	rocksdb/util/random.h \
-	rocksdb/util/rate_limiter.cc \
-	rocksdb/util/rate_limiter.h \
-	rocksdb/util/rate_limiter_test.cc \
-	rocksdb/util/scoped_arena_iterator.h \
-	rocksdb/util/skiplistrep.cc \
-	rocksdb/util/slice.cc \
-	rocksdb/util/slice_transform_test.cc \
-	rocksdb/util/sst_dump_test.cc \
-	rocksdb/util/sst_dump_tool.cc \
-	rocksdb/util/sst_dump_tool_imp.h \
-	rocksdb/util/statistics.cc \
-	rocksdb/util/statistics.h \
-	rocksdb/util/status.cc \
-	rocksdb/util/stl_wrappers.h \
-	rocksdb/util/stop_watch.h \
-	rocksdb/util/string_util.cc \
-	rocksdb/util/string_util.h \
-	rocksdb/util/sync_point.cc \
-	rocksdb/util/sync_point.h \
-	rocksdb/util/testharness.cc \
-	rocksdb/util/testharness.h \
-	rocksdb/util/testutil.cc \
-	rocksdb/util/testutil.h \
-	rocksdb/util/thread_list_test.cc \
-	rocksdb/util/thread_local.cc \
-	rocksdb/util/thread_local.h \
-	rocksdb/util/thread_local_test.cc \
-	rocksdb/util/thread_operation.h \
-	rocksdb/util/thread_status_impl.cc \
-	rocksdb/util/thread_status_updater.cc \
-	rocksdb/util/thread_status_updater_debug.cc \
-	rocksdb/util/thread_status_updater.h \
-	rocksdb/util/thread_status_util.cc \
-	rocksdb/util/thread_status_util_debug.cc \
-	rocksdb/util/thread_status_util.h \
-	rocksdb/util/vectorrep.cc \
-	rocksdb/util/xfunc.cc \
-	rocksdb/util/xfunc.h \
-	rocksdb/util/xxhash.cc \
-	rocksdb/util/xxhash.h
-endif # WITH_SLIBROCKSDB
+EXTRA_DIST += \
+  rocksdb/.arcconfig \
+  rocksdb/PATENTS \
+  rocksdb/.clang-format \
+  rocksdb/AUTHORS \
+  rocksdb/CONTRIBUTING.md \
+  rocksdb/LICENSE \
+  rocksdb/README.md \
+  rocksdb/Vagrantfile \
+  rocksdb/arcanist_util/__phutil_library_init__.php \
+  rocksdb/arcanist_util/config/FacebookArcanistConfiguration.php \
+  rocksdb/arcanist_util/cpp_linter/ArcanistCpplintLinter.php \
+  rocksdb/arcanist_util/cpp_linter/cpplint.py \
+  rocksdb/arcanist_util/cpp_linter/BaseDirectoryScopedFormatLinter.php \
+  rocksdb/arcanist_util/cpp_linter/FacebookHowtoevenLinter.php \
+  rocksdb/arcanist_util/cpp_linter/FbcodeClangFormatLinter.php \
+  rocksdb/arcanist_util/cpp_linter/FbcodeCppLinter.php \
+  rocksdb/arcanist_util/lint_engine/FacebookFbcodeLintEngine.php \
+  rocksdb/arcanist_util/lint_engine/FacebookHowtoevenLintEngine.php \
+  rocksdb/arcanist_util/unit_engine/FacebookFbcodeUnitTestEngine.php \
+  rocksdb/arcanist_util/__phutil_library_map__.php \
+  rocksdb/build_tools/make_new_version.sh \
+  rocksdb/build_tools/make_package.sh \
+  rocksdb/build_tools/regression_build_test.sh \
+  rocksdb/build_tools/version.sh \
+  rocksdb/build_tools/amalgamate.py \
+  rocksdb/build_tools/build_detect_platform \
+  rocksdb/build_tools/dockerbuild.sh \
+  rocksdb/build_tools/fb_compile_mongo.sh \
+  rocksdb/build_tools/fbcode_config.sh \
+  rocksdb/build_tools/fbcode_config4.8.1.sh \
+  rocksdb/build_tools/format-diff.sh \
+  rocksdb/build_tools/rocksdb-lego-determinator \
+  rocksdb/build_tools/run_ci_db_test.ps1 \
+  rocksdb/coverage/coverage_test.sh \
+  rocksdb/coverage/parse_gcov_output.py \
+  rocksdb/db/compaction_picker.h \
+  rocksdb/db/compaction_picker_test.cc \
+  rocksdb/db/comparator_db_test.cc \
+  rocksdb/db/convenience.cc \
+  rocksdb/db/cuckoo_table_db_test.cc \
+  rocksdb/db/db_bench.cc \
+  rocksdb/db/db_filesnapshot.cc \
+  rocksdb/db/db_impl.cc \
+  rocksdb/db/db_impl.h \
+  rocksdb/db/db_impl_debug.cc \
+  rocksdb/db/db_impl_readonly.cc \
+  rocksdb/db/db_impl_readonly.h \
+  rocksdb/db/db_iter.cc \
+  rocksdb/db/db_iter_test.cc \
+  rocksdb/db/db_log_iter_test.cc \
+  rocksdb/db/dbformat.cc \
+  rocksdb/db/dbformat_test.cc \
+  rocksdb/db/deletefile_test.cc \
+  rocksdb/db/filename.cc \
+  rocksdb/db/log_reader.h \
+  rocksdb/db/file_indexer.h \
+  rocksdb/db/flush_job_test.cc \
+  rocksdb/db/plain_table_db_test.cc \
+  rocksdb/db/db_test.cc \
+  rocksdb/db/db_iter.h \
+  rocksdb/db/fault_injection_test.cc \
+  rocksdb/db/filename.h \
+  rocksdb/db/forward_iterator.cc \
+  rocksdb/db/forward_iterator.h \
+  rocksdb/db/job_context.h \
+  rocksdb/db/compaction_job.h \
+  rocksdb/db/memtable.cc \
+  rocksdb/db/file_indexer.cc \
+  rocksdb/db/flush_job.cc \
+  rocksdb/db/file_indexer_test.cc \
+  rocksdb/db/log_reader.cc \
+  rocksdb/db/table_cache.h \
+  rocksdb/db/filename_test.cc \
+  rocksdb/db/memtable_list_test.cc \
+  rocksdb/db/merge_helper.cc \
+  rocksdb/db/flush_scheduler.cc \
+  rocksdb/db/flush_scheduler.h \
+  rocksdb/db/internal_stats.h \
+  rocksdb/db/listener_test.cc \
+  rocksdb/db/log_writer.cc \
+  rocksdb/db/log_writer.h \
+  rocksdb/db/merge_helper.h \
+  rocksdb/db/merge_operator.cc \
+  rocksdb/db/merge_test.cc \
+  rocksdb/db/version_set.h \
+  rocksdb/db/log_format.h \
+  rocksdb/db/memtable.h \
+  rocksdb/db/memtable_list.cc \
+  rocksdb/db/skiplist.h \
+  rocksdb/db/c_test.c \
+  rocksdb/db/managed_iterator.h \
+  rocksdb/db/wal_manager_test.cc \
+  rocksdb/db/memtable_allocator.cc \
+  rocksdb/db/memtable_allocator.h \
+  rocksdb/db/memtablerep_bench.cc \
+  rocksdb/db/repair.cc \
+  rocksdb/db/internal_stats.cc \
+  rocksdb/db/merge_context.h \
+  rocksdb/db/managed_iterator.cc \
+  rocksdb/db/compacted_db_impl.h \
+  rocksdb/db/memtable_list.h \
+  rocksdb/db/perf_context_test.cc \
+  rocksdb/db/table_cache.cc \
+  rocksdb/db/db_impl_experimental.cc \
+  rocksdb/db/skiplist_test.cc \
+  rocksdb/db/slice.cc \
+  rocksdb/db/table_properties_collector.cc \
+  rocksdb/db/table_properties_collector.h \
+  rocksdb/db/table_properties_collector_test.cc \
+  rocksdb/db/transaction_log_impl.cc \
+  rocksdb/db/transaction_log_impl.h \
+  rocksdb/db/version_builder.cc \
+  rocksdb/db/version_builder.h \
+  rocksdb/db/version_builder_test.cc \
+  rocksdb/db/version_edit.cc \
+  rocksdb/db/version_edit.h \
+  rocksdb/db/version_edit_test.cc \
+  rocksdb/db/version_set.cc \
+  rocksdb/db/version_set_test.cc \
+  rocksdb/db/wal_manager.cc \
+  rocksdb/db/write_batch.cc \
+  rocksdb/db/write_batch_base.cc \
+  rocksdb/db/wal_manager.h \
+  rocksdb/db/write_batch_internal.h \
+  rocksdb/db/write_batch_test.cc \
+  rocksdb/db/write_callback.h \
+  rocksdb/db/write_controller.cc \
+  rocksdb/db/write_controller.h \
+  rocksdb/db/write_controller_test.cc \
+  rocksdb/db/write_thread.cc \
+  rocksdb/db/write_thread.h \
+  rocksdb/db/builder.cc \
+  rocksdb/db/c.cc \
+  rocksdb/db/writebuffer.h \
+  rocksdb/db/compaction_iterator.h \
+  rocksdb/db/experimental.cc \
+  rocksdb/db/column_family.h \
+  rocksdb/db/column_family_test.cc \
+  rocksdb/db/compact_files_test.cc \
+  rocksdb/db/compaction.cc \
+  rocksdb/db/compaction.h \
+  rocksdb/db/compaction_job.cc \
+  rocksdb/db/compaction_job_test.cc \
+  rocksdb/db/compaction_picker.cc \
+  rocksdb/db/column_family.cc \
+  rocksdb/db/dbformat.h \
+  rocksdb/db/builder.h \
+  rocksdb/db/compacted_db_impl.cc \
+  rocksdb/db/flush_job.h \
+  rocksdb/db/log_test.cc \
+  rocksdb/db/prefix_test.cc \
+  rocksdb/db/corruption_test.cc \
+  rocksdb/db/db_compaction_test.cc \
+  rocksdb/db/compaction_iterator.cc \
+  rocksdb/db/compaction_iterator_test.cc \
+  rocksdb/db/compaction_job_stats_test.cc \
+  rocksdb/db/db_compaction_filter_test.cc \
+  rocksdb/db/db_dynamic_level_test.cc \
+  rocksdb/db/db_inplace_update_test.cc \
+  rocksdb/db/db_tailing_iter_test.cc \
+  rocksdb/db/db_universal_compaction_test.cc \
+  rocksdb/db/db_wal_test.cc \
+  rocksdb/db/event_helpers.cc \
+  rocksdb/db/event_helpers.h \
+  rocksdb/db/merge_helper_test.cc \
+  rocksdb/db/snapshot_impl.cc \
+  rocksdb/db/snapshot_impl.h \
+  rocksdb/db/write_callback_test.cc \
+  rocksdb/doc/doc.css \
+  rocksdb/doc/index.html \
+  rocksdb/doc/log_format.txt \
+  rocksdb/doc/rockslogo.jpg \
+  rocksdb/doc/rockslogo.png \
+  rocksdb/examples/README.md \
+  rocksdb/examples/column_families_example.cc \
+  rocksdb/examples/simple_example.cc \
+  rocksdb/examples/.gitignore \
+  rocksdb/examples/Makefile \
+  rocksdb/examples/c_simple_example.c \
+  rocksdb/examples/compact_files_example.cc \
+  rocksdb/examples/compaction_filter_example.cc \
+  rocksdb/examples/optimistic_transaction_example.cc \
+  rocksdb/examples/rocksdb_option_file_example.ini \
+  rocksdb/examples/transaction_example.cc \
+  rocksdb/hdfs/README \
+  rocksdb/hdfs/setup.sh \
+  rocksdb/hdfs/env_hdfs.h \
+  rocksdb/include/rocksdb/filter_policy.h \
+  rocksdb/include/rocksdb/flush_block_policy.h \
+  rocksdb/include/rocksdb/iterator.h \
+  rocksdb/include/rocksdb/ldb_tool.h \
+  rocksdb/include/rocksdb/slice_transform.h \
+  rocksdb/include/rocksdb/sst_dump_tool.h \
+  rocksdb/include/rocksdb/types.h \
+  rocksdb/include/rocksdb/utilities/db_ttl.h \
+  rocksdb/include/rocksdb/utilities/document_db.h \
+  rocksdb/include/rocksdb/utilities/geo_db.h \
+  rocksdb/include/rocksdb/utilities/json_document.h \
+  rocksdb/include/rocksdb/utilities/leveldb_options.h \
+  rocksdb/include/rocksdb/utilities/flashcache.h \
+  rocksdb/include/rocksdb/utilities/backupable_db.h \
+  rocksdb/include/rocksdb/utilities/checkpoint.h \
+  rocksdb/include/rocksdb/utilities/convenience.h \
+  rocksdb/include/rocksdb/utilities/info_log_finder.h \
+  rocksdb/include/rocksdb/utilities/optimistic_transaction_db.h \
+  rocksdb/include/rocksdb/utilities/spatial_db.h \
+  rocksdb/include/rocksdb/utilities/stackable_db.h \
+  rocksdb/include/rocksdb/utilities/table_properties_collectors.h \
+  rocksdb/include/rocksdb/utilities/transaction.h \
+  rocksdb/include/rocksdb/utilities/transaction_db.h \
+  rocksdb/include/rocksdb/utilities/transaction_db_mutex.h \
+  rocksdb/include/rocksdb/utilities/utility_db.h \
+  rocksdb/include/rocksdb/utilities/write_batch_with_index.h \
+  rocksdb/include/rocksdb/experimental.h \
+  rocksdb/include/rocksdb/convenience.h \
+  rocksdb/include/rocksdb/db_dump_tool.h \
+  rocksdb/include/rocksdb/immutable_options.h \
+  rocksdb/include/rocksdb/iostats_context.h \
+  rocksdb/include/rocksdb/listener.h \
+  rocksdb/include/rocksdb/memtablerep.h \
+  rocksdb/include/rocksdb/merge_operator.h \
+  rocksdb/include/rocksdb/metadata.h \
+  rocksdb/include/rocksdb/perf_context.h \
+  rocksdb/include/rocksdb/perf_level.h \
+  rocksdb/include/rocksdb/slice.h \
+  rocksdb/include/rocksdb/status.h \
+  rocksdb/include/rocksdb/table_properties.h \
+  rocksdb/include/rocksdb/transaction_log.h \
+  rocksdb/include/rocksdb/version.h \
+  rocksdb/include/rocksdb/write_batch_base.h \
+  rocksdb/include/rocksdb/c.h \
+  rocksdb/include/rocksdb/compaction_filter.h \
+  rocksdb/include/rocksdb/comparator.h \
+  rocksdb/include/rocksdb/db.h \
+  rocksdb/include/rocksdb/env.h \
+  rocksdb/include/rocksdb/options.h \
+  rocksdb/include/rocksdb/rate_limiter.h \
+  rocksdb/include/rocksdb/snapshot.h \
+  rocksdb/include/rocksdb/statistics.h \
+  rocksdb/include/rocksdb/table.h \
+  rocksdb/include/rocksdb/thread_status.h \
+  rocksdb/include/rocksdb/universal_compaction.h \
+  rocksdb/include/rocksdb/write_batch.h \
+  rocksdb/include/rocksdb/cache.h \
+  rocksdb/include/rocksdb/compaction_job_stats.h \
+  rocksdb/include/rocksdb/delete_scheduler.h \
+  rocksdb/include/rocksdb/sst_file_writer.h \
+  rocksdb/java/RELEASE.md \
+  rocksdb/java/benchmark/src/main/java/org/rocksdb/benchmark/DbBenchmark.java \
+  rocksdb/java/crossbuild/build-linux.sh \
+  rocksdb/java/crossbuild/Vagrantfile \
+  rocksdb/java/crossbuild/build-linux-centos.sh \
+  rocksdb/java/jdb_bench.sh \
+  rocksdb/java/rocksjni.pom \
+  rocksdb/java/rocksjni/backupablejni.cc \
+  rocksdb/java/rocksjni/checkpoint.cc \
+  rocksdb/java/rocksjni/columnfamilyhandle.cc \
+  rocksdb/java/rocksjni/comparator.cc \
+  rocksdb/java/rocksjni/comparatorjnicallback.h \
+  rocksdb/java/rocksjni/env.cc \
+  rocksdb/java/rocksjni/filter.cc \
+  rocksdb/java/rocksjni/iterator.cc \
+  rocksdb/java/rocksjni/loggerjnicallback.h \
+  rocksdb/java/rocksjni/memtablejni.cc \
+  rocksdb/java/rocksjni/merge_operator.cc \
+  rocksdb/java/rocksjni/ratelimiterjni.cc \
+  rocksdb/java/rocksjni/restorejni.cc \
+  rocksdb/java/rocksjni/slice.cc \
+  rocksdb/java/rocksjni/snapshot.cc \
+  rocksdb/java/rocksjni/statistics.cc \
+  rocksdb/java/rocksjni/table.cc \
+  rocksdb/java/rocksjni/transaction_log.cc \
+  rocksdb/java/rocksjni/ttl.cc \
+  rocksdb/java/rocksjni/write_batch.cc \
+  rocksdb/java/rocksjni/writebatchhandlerjnicallback.cc \
+  rocksdb/java/rocksjni/writebatchhandlerjnicallback.h \
+  rocksdb/java/rocksjni/backupenginejni.cc \
+  rocksdb/java/rocksjni/compaction_filter.cc \
+  rocksdb/java/rocksjni/comparatorjnicallback.cc \
+  rocksdb/java/rocksjni/loggerjnicallback.cc \
+  rocksdb/java/rocksjni/options.cc \
+  rocksdb/java/rocksjni/portal.h \
+  rocksdb/java/rocksjni/remove_emptyvalue_compactionfilterjni.cc \
+  rocksdb/java/rocksjni/rocksjni.cc \
+  rocksdb/java/rocksjni/write_batch_test.cc \
+  rocksdb/java/rocksjni/write_batch_with_index.cc \
+  rocksdb/java/samples/src/main/java/RocksDBColumnFamilySample.java \
+  rocksdb/java/samples/src/main/java/RocksDBSample.java \
+  rocksdb/java/src/main/java/org/rocksdb/AbstractComparator.java \
+  rocksdb/java/src/main/java/org/rocksdb/AbstractRocksIterator.java \
+  rocksdb/java/src/main/java/org/rocksdb/AbstractWriteBatch.java \
+  rocksdb/java/src/main/java/org/rocksdb/BackupInfo.java \
+  rocksdb/java/src/main/java/org/rocksdb/BackupableDB.java \
+  rocksdb/java/src/main/java/org/rocksdb/BackupableDBOptions.java \
+  rocksdb/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java \
+  rocksdb/java/src/main/java/org/rocksdb/BloomFilter.java \
+  rocksdb/java/src/main/java/org/rocksdb/BuiltinComparator.java \
+  rocksdb/java/src/main/java/org/rocksdb/Checkpoint.java \
+  rocksdb/java/src/main/java/org/rocksdb/ChecksumType.java \
+  rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyDescriptor.java \
+  rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyHandle.java \
+  rocksdb/java/src/main/java/org/rocksdb/CompactionStyle.java \
+  rocksdb/java/src/main/java/org/rocksdb/Comparator.java \
+  rocksdb/java/src/main/java/org/rocksdb/ComparatorOptions.java \
+  rocksdb/java/src/main/java/org/rocksdb/CompressionType.java \
+  rocksdb/java/src/main/java/org/rocksdb/DBOptions.java \
+  rocksdb/java/src/main/java/org/rocksdb/DBOptionsInterface.java \
+  rocksdb/java/src/main/java/org/rocksdb/DirectComparator.java \
+  rocksdb/java/src/main/java/org/rocksdb/DirectSlice.java \
+  rocksdb/java/src/main/java/org/rocksdb/EncodingType.java \
+  rocksdb/java/src/main/java/org/rocksdb/Filter.java \
+  rocksdb/java/src/main/java/org/rocksdb/FlushOptions.java \
+  rocksdb/java/src/main/java/org/rocksdb/GenericRateLimiterConfig.java \
+  rocksdb/java/src/main/java/org/rocksdb/HashLinkedListMemTableConfig.java \
+  rocksdb/java/src/main/java/org/rocksdb/HashSkipListMemTableConfig.java \
+  rocksdb/java/src/main/java/org/rocksdb/HistogramData.java \
+  rocksdb/java/src/main/java/org/rocksdb/HistogramType.java \
+  rocksdb/java/src/main/java/org/rocksdb/IndexType.java \
+  rocksdb/java/src/main/java/org/rocksdb/InfoLogLevel.java \
+  rocksdb/java/src/main/java/org/rocksdb/Logger.java \
+  rocksdb/java/src/main/java/org/rocksdb/MemTableConfig.java \
+  rocksdb/java/src/main/java/org/rocksdb/MergeOperator.java \
+  rocksdb/java/src/main/java/org/rocksdb/NativeLibraryLoader.java \
+  rocksdb/java/src/main/java/org/rocksdb/PlainTableConfig.java \
+  rocksdb/java/src/main/java/org/rocksdb/RateLimiterConfig.java \
+  rocksdb/java/src/main/java/org/rocksdb/ReadOptions.java \
+  rocksdb/java/src/main/java/org/rocksdb/RestoreBackupableDB.java \
+  rocksdb/java/src/main/java/org/rocksdb/RestoreOptions.java \
+  rocksdb/java/src/main/java/org/rocksdb/RocksDB.java \
+  rocksdb/java/src/main/java/org/rocksdb/RocksDBException.java \
+  rocksdb/java/src/main/java/org/rocksdb/RocksEnv.java \
+  rocksdb/java/src/main/java/org/rocksdb/RocksIterator.java \
+  rocksdb/java/src/main/java/org/rocksdb/RocksIteratorInterface.java \
+  rocksdb/java/src/main/java/org/rocksdb/RocksObject.java \
+  rocksdb/java/src/main/java/org/rocksdb/SkipListMemTableConfig.java \
+  rocksdb/java/src/main/java/org/rocksdb/Slice.java \
+  rocksdb/java/src/main/java/org/rocksdb/Snapshot.java \
+  rocksdb/java/src/main/java/org/rocksdb/Statistics.java \
+  rocksdb/java/src/main/java/org/rocksdb/StatisticsCollector.java \
+  rocksdb/java/src/main/java/org/rocksdb/StatisticsCollectorCallback.java \
+  rocksdb/java/src/main/java/org/rocksdb/StatsCollectorInput.java \
+  rocksdb/java/src/main/java/org/rocksdb/StringAppendOperator.java \
+  rocksdb/java/src/main/java/org/rocksdb/TableFormatConfig.java \
+  rocksdb/java/src/main/java/org/rocksdb/TickerType.java \
+  rocksdb/java/src/main/java/org/rocksdb/TransactionLogIterator.java \
+  rocksdb/java/src/main/java/org/rocksdb/TtlDB.java \
+  rocksdb/java/src/main/java/org/rocksdb/VectorMemTableConfig.java \
+  rocksdb/java/src/main/java/org/rocksdb/WriteBatch.java \
+  rocksdb/java/src/main/java/org/rocksdb/WriteBatchInterface.java \
+  rocksdb/java/src/main/java/org/rocksdb/WriteBatchWithIndex.java \
+  rocksdb/java/src/main/java/org/rocksdb/WriteOptions.java \
+  rocksdb/java/src/main/java/org/rocksdb/util/Environment.java \
+  rocksdb/java/src/main/java/org/rocksdb/util/SizeUnit.java \
+  rocksdb/java/src/main/java/org/rocksdb/Env.java \
+  rocksdb/java/src/main/java/org/rocksdb/RocksMemEnv.java \
+  rocksdb/java/src/main/java/org/rocksdb/AbstractCompactionFilter.java \
+  rocksdb/java/src/main/java/org/rocksdb/AbstractSlice.java \
+  rocksdb/java/src/main/java/org/rocksdb/BackupEngine.java \
+  rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java \
+  rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyOptionsInterface.java \
+  rocksdb/java/src/main/java/org/rocksdb/Options.java \
+  rocksdb/java/src/main/java/org/rocksdb/RemoveEmptyValueCompactionFilter.java \
+  rocksdb/java/src/main/java/org/rocksdb/WBWIRocksIterator.java \
+  rocksdb/java/src/test/java/org/rocksdb/AbstractComparatorTest.java \
+  rocksdb/java/src/test/java/org/rocksdb/BackupableDBOptionsTest.java \
+  rocksdb/java/src/test/java/org/rocksdb/BackupableDBTest.java \
+  rocksdb/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java \
+  rocksdb/java/src/test/java/org/rocksdb/CheckPointTest.java \
+  rocksdb/java/src/test/java/org/rocksdb/ColumnFamilyTest.java \
+  rocksdb/java/src/test/java/org/rocksdb/ComparatorOptionsTest.java \
+  rocksdb/java/src/test/java/org/rocksdb/ComparatorTest.java \
+  rocksdb/java/src/test/java/org/rocksdb/CompressionOptionsTest.java \
+  rocksdb/java/src/test/java/org/rocksdb/DBOptionsTest.java \
+  rocksdb/java/src/test/java/org/rocksdb/DirectComparatorTest.java \
+  rocksdb/java/src/test/java/org/rocksdb/DirectSliceTest.java \
+  rocksdb/java/src/test/java/org/rocksdb/FilterTest.java \
+  rocksdb/java/src/test/java/org/rocksdb/FlushTest.java \
+  rocksdb/java/src/test/java/org/rocksdb/KeyMayExistTest.java \
+  rocksdb/java/src/test/java/org/rocksdb/LoggerTest.java \
+  rocksdb/java/src/test/java/org/rocksdb/MemTableTest.java \
+  rocksdb/java/src/test/java/org/rocksdb/MergeTest.java \
+  rocksdb/java/src/test/java/org/rocksdb/MixedOptionsTest.java \
+  rocksdb/java/src/test/java/org/rocksdb/NativeLibraryLoaderTest.java \
+  rocksdb/java/src/test/java/org/rocksdb/PlainTableConfigTest.java \
+  rocksdb/java/src/test/java/org/rocksdb/PlatformRandomHelper.java \
+  rocksdb/java/src/test/java/org/rocksdb/ReadOnlyTest.java \
+  rocksdb/java/src/test/java/org/rocksdb/ReadOptionsTest.java \
+  rocksdb/java/src/test/java/org/rocksdb/RocksEnvTest.java \
+  rocksdb/java/src/test/java/org/rocksdb/RocksIteratorTest.java \
+  rocksdb/java/src/test/java/org/rocksdb/RocksMemoryResource.java \
+  rocksdb/java/src/test/java/org/rocksdb/SnapshotTest.java \
+  rocksdb/java/src/test/java/org/rocksdb/StatisticsCollectorTest.java \
+  rocksdb/java/src/test/java/org/rocksdb/StatsCallbackMock.java \
+  rocksdb/java/src/test/java/org/rocksdb/TransactionLogIteratorTest.java \
+  rocksdb/java/src/test/java/org/rocksdb/Types.java \
+  rocksdb/java/src/test/java/org/rocksdb/WriteBatchHandlerTest.java \
+  rocksdb/java/src/test/java/org/rocksdb/WriteBatchTest.java \
+  rocksdb/java/src/test/java/org/rocksdb/WriteOptionsTest.java \
+  rocksdb/java/src/test/java/org/rocksdb/test/RocksJunitRunner.java \
+  rocksdb/java/src/test/java/org/rocksdb/util/EnvironmentTest.java \
+  rocksdb/java/src/test/java/org/rocksdb/util/SizeUnitTest.java \
+  rocksdb/java/src/test/java/org/rocksdb/RocksMemEnvTest.java \
+  rocksdb/java/src/test/java/org/rocksdb/SliceTest.java \
+  rocksdb/java/src/test/java/org/rocksdb/TtlDBTest.java \
+  rocksdb/java/src/test/java/org/rocksdb/WriteBatchWithIndexTest.java \
+  rocksdb/java/src/test/java/org/rocksdb/BackupEngineTest.java \
+  rocksdb/java/src/test/java/org/rocksdb/ColumnFamilyOptionsTest.java \
+  rocksdb/java/src/test/java/org/rocksdb/InfoLogLevelTest.java \
+  rocksdb/java/src/test/java/org/rocksdb/OptionsTest.java \
+  rocksdb/java/src/test/java/org/rocksdb/RocksDBTest.java \
+  rocksdb/java/HISTORY-JAVA.md \
+  rocksdb/java/Makefile \
+  rocksdb/port/stack_trace.cc \
+  rocksdb/port/README \
+  rocksdb/port/likely.h \
+  rocksdb/port/port_example.h \
+  rocksdb/port/stack_trace.h \
+  rocksdb/port/dirent.h \
+  rocksdb/port/port.h \
+  rocksdb/port/port_posix.cc \
+  rocksdb/port/port_posix.h \
+  rocksdb/port/sys_time.h \
+  rocksdb/port/util_logger.h \
+  rocksdb/port/win/env_win.cc \
+  rocksdb/port/win/port_win.cc \
+  rocksdb/port/win/port_win.h \
+  rocksdb/port/win/win_logger.cc \
+  rocksdb/port/win/win_logger.h \
+  rocksdb/table/block_based_filter_block.cc \
+  rocksdb/table/mock_table.cc \
+  rocksdb/table/plain_table_builder.cc \
+  rocksdb/table/plain_table_factory.cc \
+  rocksdb/table/plain_table_key_coding.h \
+  rocksdb/table/table_builder.h \
+  rocksdb/table/two_level_iterator.cc \
+  rocksdb/table/two_level_iterator.h \
+  rocksdb/table/full_filter_block.cc \
+  rocksdb/table/block_based_filter_block.h \
+  rocksdb/table/block_based_filter_block_test.cc \
+  rocksdb/table/block.cc \
+  rocksdb/table/block_builder.cc \
+  rocksdb/table/block_builder.h \
+  rocksdb/table/block_hash_index.h \
+  rocksdb/table/block_hash_index_test.cc \
+  rocksdb/table/block_prefix_index.cc \
+  rocksdb/table/block_test.cc \
+  rocksdb/table/bloom_block.cc \
+  rocksdb/table/bloom_block.h \
+  rocksdb/table/table_reader_bench.cc \
+  rocksdb/table/table_test.cc \
+  rocksdb/table/meta_blocks.h \
+  rocksdb/table/plain_table_factory.h \
+  rocksdb/table/filter_block.h \
+  rocksdb/table/flush_block_policy.cc \
+  rocksdb/table/get_context.cc \
+  rocksdb/table/get_context.h \
+  rocksdb/table/sst_file_writer.cc \
+  rocksdb/table/full_filter_block.h \
+  rocksdb/table/full_filter_block_test.cc \
+  rocksdb/table/merger.cc \
+  rocksdb/table/iterator.cc \
+  rocksdb/table/iterator_wrapper.h \
+  rocksdb/table/merger.h \
+  rocksdb/table/block.h \
+  rocksdb/table/plain_table_index.h \
+  rocksdb/table/plain_table_key_coding.cc \
+  rocksdb/table/table_properties_internal.h \
+  rocksdb/table/table_reader.h \
+  rocksdb/table/block_based_table_builder.cc \
+  rocksdb/table/block_based_table_builder.h \
+  rocksdb/table/block_based_table_factory.cc \
+  rocksdb/table/block_based_table_factory.h \
+  rocksdb/table/block_based_table_reader.cc \
+  rocksdb/table/block_based_table_reader.h \
+  rocksdb/table/block_hash_index.cc \
+  rocksdb/table/block_prefix_index.h \
+  rocksdb/table/cuckoo_table_builder.cc \
+  rocksdb/table/cuckoo_table_builder.h \
+  rocksdb/table/cuckoo_table_builder_test.cc \
+  rocksdb/table/cuckoo_table_factory.cc \
+  rocksdb/table/cuckoo_table_factory.h \
+  rocksdb/table/cuckoo_table_reader.cc \
+  rocksdb/table/cuckoo_table_reader.h \
+  rocksdb/table/cuckoo_table_reader_test.cc \
+  rocksdb/table/format.cc \
+  rocksdb/table/format.h \
+  rocksdb/table/iter_heap.h \
+  rocksdb/table/merger_test.cc \
+  rocksdb/table/meta_blocks.cc \
+  rocksdb/table/mock_table.h \
+  rocksdb/table/plain_table_builder.h \
+  rocksdb/table/plain_table_index.cc \
+  rocksdb/table/plain_table_reader.cc \
+  rocksdb/table/plain_table_reader.h \
+  rocksdb/table/table_properties.cc \
+  rocksdb/table/adaptive_table_factory.h \
+  rocksdb/table/adaptive_table_factory.cc \
+  rocksdb/third-party/fbson/FbsonJsonParser.h \
+  rocksdb/third-party/fbson/FbsonUtil.h \
+  rocksdb/third-party/fbson/FbsonWriter.h \
+  rocksdb/third-party/fbson/COMMIT.md \
+  rocksdb/third-party/fbson/FbsonDocument.h \
+  rocksdb/third-party/fbson/FbsonStream.h \
+  rocksdb/third-party/gtest-1.7.0/fused-src/gtest/gtest-all.cc \
+  rocksdb/third-party/gtest-1.7.0/fused-src/gtest/gtest.h \
+  rocksdb/third-party/gtest-1.7.0/fused-src/gtest/CMakeLists.txt \
+  rocksdb/third-party/flashcache/flashcache_ioctl.h \
+  rocksdb/tools/auto_sanity_test.sh \
+  rocksdb/tools/benchmark_leveldb.sh \
+  rocksdb/tools/check_format_compatible.sh \
+  rocksdb/tools/generate_random_db.sh \
+  rocksdb/tools/run_leveldb.sh \
+  rocksdb/tools/verify_random_db.sh \
+  rocksdb/tools/dbench_monitor \
+  rocksdb/tools/ldb.cc \
+  rocksdb/tools/pflag \
+  rocksdb/tools/sst_dump.cc \
+  rocksdb/tools/dump/db_dump_tool.cc \
+  rocksdb/tools/dump/rocksdb_dump.cc \
+  rocksdb/tools/dump/rocksdb_undump.cc \
+  rocksdb/tools/Dockerfile \
+  rocksdb/tools/benchmark.sh \
+  rocksdb/tools/db_crashtest.py \
+  rocksdb/tools/db_crashtest2.py \
+  rocksdb/tools/db_repl_stress.cc \
+  rocksdb/tools/db_sanity_test.cc \
+  rocksdb/tools/db_stress.cc \
+  rocksdb/tools/ldb_test.py \
+  rocksdb/tools/reduce_levels_test.cc \
+  rocksdb/tools/rocksdb_dump_test.sh \
+  rocksdb/tools/run_flash_bench.sh \
+  rocksdb/tools/sample-dump.dmp \
+  rocksdb/util/ldb_cmd_execute_result.h \
+  rocksdb/util/bloom.cc \
+  rocksdb/util/allocator.h \
+  rocksdb/util/crc32c.h \
+  rocksdb/util/db_info_dumper.cc \
+  rocksdb/util/dynamic_bloom.h \
+  rocksdb/util/env_hdfs.cc \
+  rocksdb/util/env_posix.cc \
+  rocksdb/util/env_test.cc \
+  rocksdb/util/event_logger.cc \
+  rocksdb/util/file_util.cc \
+  rocksdb/util/file_util.h \
+  rocksdb/util/heap.h \
+  rocksdb/util/ldb_cmd.h \
+  rocksdb/util/mutable_cf_options.h \
+  rocksdb/util/build_version.h \
+  rocksdb/util/env.cc \
+  rocksdb/util/cache_bench.cc \
+  rocksdb/util/options.cc \
+  rocksdb/util/coding.cc \
+  rocksdb/util/coding.h \
+  rocksdb/util/coding_test.cc \
+  rocksdb/util/event_logger.h \
+  rocksdb/util/log_buffer.cc \
+  rocksdb/util/log_buffer.h \
+  rocksdb/util/memenv.cc \
+  rocksdb/util/crc32c_test.cc \
+  rocksdb/util/options_helper.cc \
+  rocksdb/util/db_info_dumper.h \
+  rocksdb/util/dynamic_bloom.cc \
+  rocksdb/util/hash_cuckoo_rep.cc \
+  rocksdb/util/options_helper.h \
+  rocksdb/util/histogram.cc \
+  rocksdb/util/histogram_test.cc \
+  rocksdb/util/mock_env.cc \
+  rocksdb/util/logging.cc \
+  rocksdb/util/logging.h \
+  rocksdb/util/statistics.cc \
+  rocksdb/util/event_logger_test.cc \
+  rocksdb/util/perf_level.cc \
+  rocksdb/util/status.cc \
+  rocksdb/util/filelock_test.cc \
+  rocksdb/util/filter_policy.cc \
+  rocksdb/util/hash.cc \
+  rocksdb/util/hash.h \
+  rocksdb/util/arena.h \
+  rocksdb/util/hash_cuckoo_rep.h \
+  rocksdb/util/perf_context_imp.h \
+  rocksdb/util/hash_linklist_rep.h \
+  rocksdb/util/hash_skiplist_rep.cc \
+  rocksdb/util/hash_skiplist_rep.h \
+  rocksdb/util/mock_env_test.cc \
+  rocksdb/util/mutable_cf_options.cc \
+  rocksdb/util/instrumented_mutex.cc \
+  rocksdb/util/instrumented_mutex.h \
+  rocksdb/util/ldb_cmd.cc \
+  rocksdb/util/autovector.h \
+  rocksdb/util/skiplistrep.cc \
+  rocksdb/util/manual_compaction_test.cc \
+  rocksdb/util/sync_point.cc \
+  rocksdb/util/ldb_tool.cc \
+  rocksdb/util/statistics.h \
+  rocksdb/util/xfunc.cc \
+  rocksdb/util/log_write_bench.cc \
+  rocksdb/util/xfunc.h \
+  rocksdb/util/memenv_test.cc \
+  rocksdb/util/mock_env.h \
+  rocksdb/util/options_test.cc \
+  rocksdb/util/perf_context.cc \
+  rocksdb/util/posix_logger.h \
+  rocksdb/util/rate_limiter.cc \
+  rocksdb/util/rate_limiter.h \
+  rocksdb/util/murmurhash.cc \
+  rocksdb/util/murmurhash.h \
+  rocksdb/util/sst_dump_test.cc \
+  rocksdb/util/sst_dump_tool.cc \
+  rocksdb/util/mutexlock.h \
+  rocksdb/util/sst_dump_tool_imp.h \
+  rocksdb/util/options_builder.cc \
+  rocksdb/util/testutil.cc \
+  rocksdb/util/thread_local.cc \
+  rocksdb/util/thread_operation.h \
+  rocksdb/util/thread_status_impl.cc \
+  rocksdb/util/arena_test.cc \
+  rocksdb/util/random.h \
+  rocksdb/util/slice.cc \
+  rocksdb/util/thread_status_util.cc \
+  rocksdb/util/rate_limiter_test.cc \
+  rocksdb/util/scoped_arena_iterator.h \
+  rocksdb/util/thread_status_util.h \
+  rocksdb/util/channel.h \
+  rocksdb/util/slice_transform_test.cc \
+  rocksdb/util/thread_status_updater.cc \
+  rocksdb/util/thread_status_updater.h \
+  rocksdb/util/stl_wrappers.h \
+  rocksdb/util/stop_watch.h \
+  rocksdb/util/sync_point.h \
+  rocksdb/util/compression.h \
+  rocksdb/util/string_util.h \
+  rocksdb/util/string_util.cc \
+  rocksdb/util/vectorrep.cc \
+  rocksdb/util/thread_status_util_debug.cc \
+  rocksdb/util/testharness.cc \
+  rocksdb/util/testharness.h \
+  rocksdb/util/heap_test.cc \
+  rocksdb/util/thread_list_test.cc \
+  rocksdb/util/thread_local.h \
+  rocksdb/util/thread_local_test.cc \
+  rocksdb/util/histogram.h \
+  rocksdb/util/cache_test.cc \
+  rocksdb/util/thread_status_updater_debug.cc \
+  rocksdb/util/xxhash.cc \
+  rocksdb/util/xxhash.h \
+  rocksdb/util/auto_roll_logger.cc \
+  rocksdb/util/auto_roll_logger.h \
+  rocksdb/util/auto_roll_logger_test.cc \
+  rocksdb/util/autovector_test.cc \
+  rocksdb/util/bloom_test.cc \
+  rocksdb/util/cache.cc \
+  rocksdb/util/comparator.cc \
+  rocksdb/util/crc32c.cc \
+  rocksdb/util/dynamic_bloom_test.cc \
+  rocksdb/util/iostats_context.cc \
+  rocksdb/util/iostats_context_imp.h \
+  rocksdb/util/hash_linklist_rep.cc \
+  rocksdb/util/testutil.h \
+  rocksdb/util/arena.cc \
+  rocksdb/util/aligned_buffer.h \
+  rocksdb/util/db_test_util.cc \
+  rocksdb/util/delete_scheduler_impl.h \
+  rocksdb/util/file_reader_writer.h \
+  rocksdb/util/options_parser.cc \
+  rocksdb/util/perf_step_timer.h \
+  rocksdb/util/db_test_util.h \
+  rocksdb/util/delete_scheduler_test.cc \
+  rocksdb/util/file_reader_writer_test.cc \
+  rocksdb/util/options_parser.h \
+  rocksdb/util/status_message.cc \
+  rocksdb/util/compaction_job_stats_impl.cc \
+  rocksdb/util/delete_scheduler_impl.cc \
+  rocksdb/util/file_reader_writer.cc \
+  rocksdb/util/ldb_cmd_test.cc \
+  rocksdb/util/perf_level_imp.h \
+  rocksdb/utilities/backupable/backupable_db.cc \
+  rocksdb/utilities/backupable/backupable_db_test.cc \
+  rocksdb/utilities/checkpoint/checkpoint.cc \
+  rocksdb/utilities/checkpoint/checkpoint_test.cc \
+  rocksdb/utilities/document/document_db.cc \
+  rocksdb/utilities/document/json_document_builder.cc \
+  rocksdb/utilities/document/document_db_test.cc \
+  rocksdb/utilities/document/json_document.cc \
+  rocksdb/utilities/document/json_document_test.cc \
+  rocksdb/utilities/geodb/geodb_impl.cc \
+  rocksdb/utilities/geodb/geodb_impl.h \
+  rocksdb/utilities/geodb/geodb_test.cc \
+  rocksdb/utilities/leveldb_options/leveldb_options.cc \
+  rocksdb/utilities/merge_operators.h \
+  rocksdb/utilities/merge_operators/put.cc \
+  rocksdb/utilities/merge_operators/string_append/stringappend.cc \
+  rocksdb/utilities/merge_operators/string_append/stringappend.h \
+  rocksdb/utilities/merge_operators/string_append/stringappend2.h \
+  rocksdb/utilities/merge_operators/string_append/stringappend2.cc \
+  rocksdb/utilities/merge_operators/string_append/stringappend_test.cc \
+  rocksdb/utilities/merge_operators/uint64add.cc \
+  rocksdb/utilities/redis/README \
+  rocksdb/utilities/redis/redis_list_exception.h \
+  rocksdb/utilities/redis/redis_list_iterator.h \
+  rocksdb/utilities/redis/redis_lists.cc \
+  rocksdb/utilities/redis/redis_lists.h \
+  rocksdb/utilities/redis/redis_lists_test.cc \
+  rocksdb/utilities/spatialdb/utils.h \
+  rocksdb/utilities/spatialdb/spatial_db.cc \
+  rocksdb/utilities/spatialdb/spatial_db_test.cc \
+  rocksdb/utilities/ttl/db_ttl_impl.cc \
+  rocksdb/utilities/ttl/db_ttl_impl.h \
+  rocksdb/utilities/ttl/ttl_test.cc \
+  rocksdb/utilities/write_batch_with_index/write_batch_with_index.cc \
+  rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.cc \
+  rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h \
+  rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc \
+  rocksdb/utilities/flashcache/flashcache.cc \
+  rocksdb/utilities/flashcache/flashcache.h \
+  rocksdb/utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc \
+  rocksdb/utilities/compaction_filters/remove_emptyvalue_compactionfilter.h \
+  rocksdb/utilities/convenience/info_log_finder.cc \
+  rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector.cc \
+  rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector.h \
+  rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc \
+  rocksdb/utilities/transactions/optimistic_transaction_db_impl.cc \
+  rocksdb/utilities/transactions/optimistic_transaction_db_impl.h \
+  rocksdb/utilities/transactions/optimistic_transaction_impl.cc \
+  rocksdb/utilities/transactions/optimistic_transaction_test.cc \
+  rocksdb/utilities/transactions/transaction_base.h \
+  rocksdb/utilities/transactions/transaction_db_impl.h \
+  rocksdb/utilities/transactions/transaction_db_mutex_impl.h \
+  rocksdb/utilities/transactions/transaction_impl.h \
+  rocksdb/utilities/transactions/transaction_lock_mgr.h \
+  rocksdb/utilities/transactions/transaction_util.cc \
+  rocksdb/utilities/transactions/optimistic_transaction_impl.h \
+  rocksdb/utilities/transactions/transaction_base.cc \
+  rocksdb/utilities/transactions/transaction_db_impl.cc \
+  rocksdb/utilities/transactions/transaction_db_mutex_impl.cc \
+  rocksdb/utilities/transactions/transaction_impl.cc \
+  rocksdb/utilities/transactions/transaction_lock_mgr.cc \
+  rocksdb/utilities/transactions/transaction_test.cc \
+  rocksdb/utilities/transactions/transaction_util.h \
+  rocksdb/.gitignore \
+  rocksdb/CMakeLists.txt \
+  rocksdb/HISTORY.md \
+  rocksdb/Makefile \
+  rocksdb/USERS.md \
+  rocksdb/appveyor.yml \
+  rocksdb/src.mk \
+  rocksdb/thirdparty.inc \
+  rocksdb/.travis.yml \
+  rocksdb/DUMP_FORMAT.md \
+  rocksdb/INSTALL.md \
+  rocksdb/ROCKSDB_LITE.md \
+  rocksdb/WINDOWS_PORT.md \
+  rocksdb/appveyordailytests.yml \
+  rocksdb/AUTHORS
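Note on the Makefile-rocksdb.am rewrite: the old `if WITH_SLIBROCKSDB` /
`SUBDIRS += rocksdb` / `else EXTRA_DIST += ...` split is gone, and the
bundled RocksDB tree is now shipped in the dist tarball unconditionally.
Whether the bundled copy is actually built and linked is decided in
Makefile-env.am instead, roughly as in this sketch of the fragment seen
earlier in the patch:

    # link the in-tree static RocksDB into the key/value layer on demand
    if WITH_SLIBROCKSDB
    LIBKV += rocksdb/librocksdb.a
    endif
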
diff --git a/src/Makefile-server.am b/src/Makefile-server.am
index 689b5c4..01ef492 100644
--- a/src/Makefile-server.am
+++ b/src/Makefile-server.am
@@ -27,7 +27,7 @@ su_sbin_SCRIPTS += mount.fuse.ceph
 if WITH_MON
 
 ceph_mon_SOURCES = ceph_mon.cc
-ceph_mon_LDADD = $(LIBMON) $(LIBOS) $(CEPH_GLOBAL) $(LIBCOMMON)
+ceph_mon_LDADD = $(LIBMON) $(LIBOS) $(CEPH_GLOBAL) $(LIBCOMMON) $(LIBAUTH) $(LIBCOMMON) $(LIBMON_TYPES)
 bin_PROGRAMS += ceph-mon
 
 endif # WITH_MON
@@ -43,7 +43,13 @@ bin_SCRIPTS += \
 	ceph-clsinfo
 
 ceph_osd_SOURCES = ceph_osd.cc
-ceph_osd_LDADD = $(LIBOSD) $(CEPH_GLOBAL) $(LIBCOMMON)
+ceph_osd_LDADD = \
+	$(LIBOSDC) $(LIBOSD) $(LIBOSD_TYPES) $(LIBOS_TYPES) \
+	$(LIBOS) \
+	$(CEPH_GLOBAL) $(LIBCOMMON)
+if WITH_LTTNG
+ceph_osd_LDADD += $(LIBOSD_TP)
+endif
 bin_PROGRAMS += ceph-osd
 
 endif # WITH_OSD
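Note on the Makefile-server.am hunks: with libmon, libosd and libos now
static archives (see Makefile-env.am above), each daemon has to list its
type libraries and libos explicitly, dependents before dependencies; the
repeated $(LIBCOMMON) on the ceph-mon line is presumably there to satisfy
back-references during the single left-to-right archive scan. An
approximate expansion of the new ceph-osd link line (illustrative only;
LIBOSD_TP is the LTTng tracepoint provider, added under WITH_LTTNG):

    # libosdc.la libosd.a libosd_types.la libos_types.a \
    # libos.a libos_types.a libkv.a ... -lleveldb -lsnappy \
    # libglobal.la libcommon.la $(PTHREAD_LIBS) -lm $(CRYPTO_LIBS)
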
diff --git a/src/Makefile.am b/src/Makefile.am
index 90ec3f1..8085dce 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -14,6 +14,7 @@ include auth/Makefile.am
 include brag/Makefile.am
 include ceph-detect-init/Makefile.am
 include crush/Makefile.am
+include kv/Makefile.am
 include mon/Makefile.am
 include mds/Makefile.am
 include os/Makefile.am
@@ -31,6 +32,7 @@ include messages/Makefile.am
 include include/Makefile.am
 include librados/Makefile.am
 include libradosstriper/Makefile.am
+include journal/Makefile.am
 include librbd/Makefile.am
 include rgw/Makefile.am
 include cls/Makefile.am
@@ -106,6 +108,7 @@ EXTRA_DIST += \
 	mount.fuse.ceph \
 	rbd-replay-many \
 	rbdmap \
+	etc-rbdmap \
 	yasm-wrapper
 
 EXTRA_DIST += \
@@ -151,7 +154,7 @@ endif
 core-daemons: ceph-mon ceph-osd ceph-mds radosgw
 admin-tools: monmaptool osdmaptool crushtool ceph-authtool
 base: core-daemons admin-tools \
-	cephfs ceph-syn ceph-conf \
+	cephfs ceph-fuse ceph-syn ceph-conf \
 	rados radosgw-admin librados-config \
 	init-ceph ceph-post-file \
 	ceph
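Note on the Makefile.am hunks: the new kv/ and journal/ subdirectories
are pulled into the build, the etc-rbdmap configuration is shipped next
to the rbdmap script, and ceph-fuse joins the `base` convenience target.
For example (usage sketch; target name from the hunk above):

    make base    # now also builds ceph-fuse alongside cephfs, ceph-syn, ...
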
diff --git a/src/Makefile.in b/src/Makefile.in
index 15b44e7..ad24a20 100644
--- a/src/Makefile.in
+++ b/src/Makefile.in
@@ -87,8 +87,9 @@ target_triplet = @target@
 DIST_COMMON = $(srcdir)/Makefile-env.am $(srcdir)/arch/Makefile.am \
 	$(srcdir)/auth/Makefile.am $(srcdir)/brag/Makefile.am \
 	$(srcdir)/ceph-detect-init/Makefile.am \
-	$(srcdir)/crush/Makefile.am $(srcdir)/mon/Makefile.am \
-	$(srcdir)/mds/Makefile.am $(srcdir)/mds/Makefile-client.am \
+	$(srcdir)/crush/Makefile.am $(srcdir)/kv/Makefile.am \
+	$(srcdir)/mon/Makefile.am $(srcdir)/mds/Makefile.am \
+	$(srcdir)/mds/Makefile-client.am \
 	$(srcdir)/mds/Makefile-server.am $(srcdir)/os/Makefile.am \
 	$(srcdir)/osd/Makefile.am $(srcdir)/erasure-code/Makefile.am \
 	$(srcdir)/erasure-code/jerasure/Makefile.am \
@@ -102,8 +103,9 @@ DIST_COMMON = $(srcdir)/Makefile-env.am $(srcdir)/arch/Makefile.am \
 	$(srcdir)/messages/Makefile.am $(srcdir)/include/Makefile.am \
 	$(srcdir)/librados/Makefile.am \
 	$(srcdir)/libradosstriper/Makefile.am \
-	$(srcdir)/librbd/Makefile.am $(srcdir)/rgw/Makefile.am \
-	$(srcdir)/cls/Makefile.am $(srcdir)/cls/Makefile-client.am \
+	$(srcdir)/journal/Makefile.am $(srcdir)/librbd/Makefile.am \
+	$(srcdir)/rgw/Makefile.am $(srcdir)/cls/Makefile.am \
+	$(srcdir)/cls/Makefile-client.am \
 	$(srcdir)/cls/Makefile-server.am \
 	$(srcdir)/key_value_store/Makefile.am \
 	$(srcdir)/rbd_replay/Makefile.am $(srcdir)/test/Makefile.am \
@@ -123,9 +125,9 @@ DIST_COMMON = $(srcdir)/Makefile-env.am $(srcdir)/arch/Makefile.am \
 	$(top_srcdir)/test-driver README TODO
 bin_PROGRAMS = $(am__EXEEXT_27) $(am__EXEEXT_28) $(am__EXEEXT_29) \
 	$(am__EXEEXT_30) $(am__EXEEXT_31) $(am__EXEEXT_32) \
-	$(am__EXEEXT_33) $(am__EXEEXT_34) monmaptool$(EXEEXT) \
-	crushtool$(EXEEXT) osdmaptool$(EXEEXT) ceph-conf$(EXEEXT) \
-	ceph-authtool$(EXEEXT) $(am__EXEEXT_35) $(am__EXEEXT_36) \
+	$(am__EXEEXT_33) $(am__EXEEXT_34) $(am__EXEEXT_35) \
+	monmaptool$(EXEEXT) crushtool$(EXEEXT) osdmaptool$(EXEEXT) \
+	ceph-conf$(EXEEXT) ceph-authtool$(EXEEXT) $(am__EXEEXT_36) \
 	$(am__EXEEXT_37) $(am__EXEEXT_38) $(am__EXEEXT_39) \
 	$(am__EXEEXT_40) $(am__EXEEXT_41) $(am__EXEEXT_42) \
 	$(am__EXEEXT_43)
@@ -138,29 +140,60 @@ check_PROGRAMS = $(am__EXEEXT_57) $(am__EXEEXT_58) \
 
 # when doing a debug build, make sure to make the targets
 @WITH_DEBUG_TRUE@am__append_1 = $(bin_DEBUGPROGRAMS)
-@LINUX_TRUE@am__append_2 = -Wl,--as-needed $(HARDENING_LDFLAGS)
-@USE_BOOST_SPIRIT_OLD_HDR_TRUE@am__append_3 = -DUSE_BOOST_SPIRIT_OLD_HDR
-@WITH_LIBATOMIC_TRUE@am__append_4 = -latomic_ops
-@ENABLE_COVERAGE_TRUE@am__append_5 = -fprofile-arcs -ftest-coverage
-@ENABLE_COVERAGE_TRUE@am__append_6 = -fprofile-arcs -ftest-coverage -O0
-@FREEBSD_TRUE@am__append_7 = -lexecinfo
-@LINUX_TRUE@am__append_8 = -lrt
-@WITH_PROFILER_TRUE@am__append_9 = -lprofiler
-@WITH_LIBAIO_TRUE@am__append_10 = -laio
-@WITH_LIBZFS_TRUE@am__append_11 = libos_zfs.a -lzfs
-@WITH_LIBROCKSDB_TRUE@am__append_12 = libos_rocksdb.la
-@WITH_TCMALLOC_MINIMAL_TRUE@am__append_13 = -ltcmalloc_minimal
-@WITH_TCMALLOC_TRUE@am__append_14 = -ltcmalloc
-@WITH_JEMALLOC_TRUE@am__append_15 = -ljemalloc
-@WITH_JEMALLOC_TRUE@am__append_16 = -ljemalloc
-@WITH_JEMALLOC_TRUE@am__append_17 = -ljemalloc
+@LINUX_TRUE@am__append_2 = \
+@LINUX_TRUE@	-D_REENTRANT
+
+@FREEBSD_TRUE@am__append_3 = \
+@FREEBSD_TRUE@	-D_REENTRANT
+
+@DARWIN_TRUE@am__append_4 = \
+@DARWIN_TRUE@	-D_REENTRANT
+
+@SOLARIS_TRUE@am__append_5 = \
+@SOLARIS_TRUE@       -D_PTHREADS \
+@SOLARIS_TRUE@       -D_POSIX_C_SOURCE
+
+@LINUX_TRUE@am__append_6 = -Wl,--as-needed $(HARDENING_LDFLAGS)
+@USE_BOOST_SPIRIT_OLD_HDR_TRUE@am__append_7 = -DUSE_BOOST_SPIRIT_OLD_HDR
+@WITH_LIBATOMIC_TRUE@am__append_8 = -latomic_ops
+@ENABLE_COVERAGE_TRUE@am__append_9 = -fprofile-arcs -ftest-coverage
+@ENABLE_COVERAGE_TRUE@am__append_10 = -fprofile-arcs -ftest-coverage -O0
+@FREEBSD_TRUE@am__append_11 = -lexecinfo
+@LINUX_TRUE@am__append_12 = -lrt
+@WITH_PROFILER_TRUE@am__append_13 = -lprofiler
+@WITH_LIBAIO_TRUE@am__append_14 = -laio
+@WITH_LIBZFS_TRUE@am__append_15 = libos_zfs.a -lzfs
+@WITH_TCMALLOC_MINIMAL_TRUE@am__append_16 = -ltcmalloc_minimal
+@WITH_TCMALLOC_TRUE@am__append_17 = -ltcmalloc
 @WITH_JEMALLOC_TRUE@am__append_18 = -ljemalloc
-@ENABLE_COVERAGE_TRUE@am__append_19 = -lgcov
-@ENABLE_CLIENT_TRUE@am__append_20 = brag/client/ceph-brag ceph \
+@WITH_JEMALLOC_TRUE@am__append_19 = -ljemalloc
+@WITH_JEMALLOC_TRUE@am__append_20 = -ljemalloc
+@WITH_JEMALLOC_TRUE@am__append_21 = -ljemalloc
+@ENABLE_COVERAGE_TRUE@am__append_22 = -lgcov
+
+# libkv/libos linking order is ornery
+ at WITH_SLIBROCKSDB_TRUE@am__append_23 = rocksdb/librocksdb.a
+ at ENABLE_CLIENT_TRUE@am__append_24 = brag/client/ceph-brag ceph \
 @ENABLE_CLIENT_TRUE@	ceph-post-file
- at ENABLE_CLIENT_TRUE@am__append_21 = brag/server brag/README.md brag/client
- at ENABLE_SERVER_TRUE@@WITH_MON_TRUE at am__append_22 = libmon.la
- at ENABLE_SERVER_TRUE@@WITH_MON_TRUE at am__append_23 = \
+ at ENABLE_CLIENT_TRUE@am__append_25 = brag/server brag/README.md brag/client
+ at ENABLE_SERVER_TRUE@am__append_26 = libkv.a
+ at ENABLE_SERVER_TRUE@am__append_27 = \
+ at ENABLE_SERVER_TRUE@	kv/KeyValueDB.h \
+ at ENABLE_SERVER_TRUE@	kv/LevelDBStore.h
+
+ at ENABLE_SERVER_TRUE@@WITH_SLIBROCKSDB_TRUE at am__append_28 = -I rocksdb/include -fPIC
+ at ENABLE_SERVER_TRUE@@WITH_SLIBROCKSDB_TRUE at am__append_29 = kv/RocksDBStore.cc
+ at ENABLE_SERVER_TRUE@@WITH_SLIBROCKSDB_TRUE at am__append_30 = rocksdb/librocksdb.a
+ at ENABLE_SERVER_TRUE@@WITH_SLIBROCKSDB_TRUE at am__append_31 = kv/RocksDBStore.h
+ at ENABLE_SERVER_TRUE@@WITH_DLIBROCKSDB_TRUE at am__append_32 = kv/RocksDBStore.cc
+ at ENABLE_SERVER_TRUE@@WITH_DLIBROCKSDB_TRUE at am__append_33 = -lrocksdb
+ at ENABLE_SERVER_TRUE@@WITH_DLIBROCKSDB_TRUE at am__append_34 = kv/RocksDBStore.h
+ at ENABLE_SERVER_TRUE@@WITH_KINETIC_TRUE at am__append_35 = kv/KineticStore.cc
+ at ENABLE_SERVER_TRUE@@WITH_KINETIC_TRUE at am__append_36 = -std=gnu++11
+ at ENABLE_SERVER_TRUE@@WITH_KINETIC_TRUE at am__append_37 = -lkinetic_client -lprotobuf -lglog -lgflags libcrypto.a
+ at ENABLE_SERVER_TRUE@@WITH_KINETIC_TRUE at am__append_38 = kv/KineticStore.h
+ at ENABLE_SERVER_TRUE@@WITH_MON_TRUE at am__append_39 = libmon.a
+ at ENABLE_SERVER_TRUE@@WITH_MON_TRUE at am__append_40 = \
 @ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	mon/AuthMonitor.h \
 @ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	mon/DataHealthService.h \
 @ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	mon/Elector.h \
@@ -189,15 +222,10 @@ check_PROGRAMS = $(am__EXEEXT_57) $(am__EXEEXT_58) \
 
 
 # There are no libmds_types so use the full mds library for dencoder for now
-@ENABLE_CLIENT_TRUE@am__append_24 = $(LIBMDS_SOURCES)
-@ENABLE_CLIENT_TRUE@am__append_25 = $(LIBMDS_DEPS) \
-@ENABLE_CLIENT_TRUE@	libcls_lock_client.la \
-@ENABLE_CLIENT_TRUE@	libcls_refcount_client.la \
-@ENABLE_CLIENT_TRUE@	libcls_replica_log_client.a \
-@ENABLE_CLIENT_TRUE@	libcls_rgw_client.la libcls_user_client.a \
-@ENABLE_CLIENT_TRUE@	libcls_numops_client.la
-@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@am__append_26 = libmds.la
-@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@am__append_27 =  \
+@ENABLE_CLIENT_TRUE@am__append_41 = $(LIBMDS_SOURCES)
+@ENABLE_CLIENT_TRUE@am__append_42 = $(LIBMDS_DEPS)
+@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@am__append_43 = libmds.la
+@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@am__append_44 =  \
 @ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@	mds/inode_backtrace.h \
 @ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@	mds/flock.h mds/locks.c \
 @ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@	mds/locks.h mds/CDentry.h \
@@ -223,6 +251,8 @@ check_PROGRAMS = $(am__EXEEXT_57) $(am__EXEEXT_58) \
 @ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@	mds/Mutation.h \
 @ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@	mds/Migrator.h \
 @ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@	mds/ScatterLock.h \
+@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@	mds/ScrubStack.h \
+@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@	mds/ScrubHeader.h \
 @ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@	mds/Server.h \
 @ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@	mds/SessionMap.h \
 @ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@	mds/SimpleLock.h \
@@ -249,16 +279,17 @@ check_PROGRAMS = $(am__EXEEXT_57) $(am__EXEEXT_58) \
 @ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@	mds/events/ETableClient.h \
 @ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@	mds/events/ETableServer.h \
 @ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@	mds/events/EUpdate.h
-@ENABLE_SERVER_TRUE@@LINUX_TRUE@am__append_28 = os/BtrfsFileStoreBackend.cc
-@ENABLE_SERVER_TRUE@@WITH_LIBAIO_TRUE@am__append_29 = os/newstore/newstore_types.cc
-@ENABLE_SERVER_TRUE@@WITH_LIBAIO_TRUE@am__append_30 = os/newstore/NewStore.cc
-@ENABLE_SERVER_TRUE@@WITH_LIBXFS_TRUE@am__append_31 = \
+@ENABLE_SERVER_TRUE@@LINUX_TRUE@am__append_45 = os/BtrfsFileStoreBackend.cc
+@ENABLE_SERVER_TRUE@@WITH_LIBAIO_TRUE@am__append_46 = os/newstore/newstore_types.cc
+@ENABLE_SERVER_TRUE@@WITH_LIBAIO_TRUE@am__append_47 = os/newstore/NewStore.cc
+@ENABLE_SERVER_TRUE@@WITH_LIBXFS_TRUE@am__append_48 = \
 @ENABLE_SERVER_TRUE@@WITH_LIBXFS_TRUE@    os/fs/XFS.cc \
 @ENABLE_SERVER_TRUE@@WITH_LIBXFS_TRUE@    os/XfsFileStoreBackend.cc
 
-@ENABLE_SERVER_TRUE@@WITH_LIBZFS_TRUE@am__append_32 = os/ZFSFileStoreBackend.cc
-@ENABLE_SERVER_TRUE@am__append_33 = libos.la
-@ENABLE_SERVER_TRUE@am__append_34 = \
+@ENABLE_SERVER_TRUE@@WITH_LIBZFS_TRUE@am__append_49 = os/ZFSFileStoreBackend.cc
+@ENABLE_SERVER_TRUE@@WITH_LTTNG_TRUE@am__append_50 = $(LIBOS_TP)
+@ENABLE_SERVER_TRUE@am__append_51 = libos.a
+@ENABLE_SERVER_TRUE@am__append_52 = \
 @ENABLE_SERVER_TRUE@	os/btrfs_ioctl.h \
 @ENABLE_SERVER_TRUE@	os/chain_xattr.h \
 @ENABLE_SERVER_TRUE@	os/newstore/newstore_types.h \
@@ -277,8 +308,6 @@ check_PROGRAMS = $(am__EXEEXT_57) $(am__EXEEXT_58) \
 @ENABLE_SERVER_TRUE@	os/IndexManager.h \
 @ENABLE_SERVER_TRUE@	os/Journal.h \
 @ENABLE_SERVER_TRUE@	os/JournalingObjectStore.h \
-@ENABLE_SERVER_TRUE@	os/KeyValueDB.h \
-@ENABLE_SERVER_TRUE@	os/LevelDBStore.h \
 @ENABLE_SERVER_TRUE@	os/LFNIndex.h \
 @ENABLE_SERVER_TRUE@	os/MemStore.h \
 @ENABLE_SERVER_TRUE@	os/KeyValueStore.h \
@@ -290,19 +319,10 @@ check_PROGRAMS = $(am__EXEEXT_57) $(am__EXEEXT_58) \
 @ENABLE_SERVER_TRUE@	os/XfsFileStoreBackend.h \
 @ENABLE_SERVER_TRUE@	os/ZFSFileStoreBackend.h
 
-@ENABLE_SERVER_TRUE@@WITH_SLIBROCKSDB_TRUE@am__append_35 = libos_rocksdb.la
-@ENABLE_SERVER_TRUE@@WITH_SLIBROCKSDB_TRUE@am__append_36 = os/RocksDBStore.h
-@ENABLE_SERVER_TRUE@@WITH_DLIBROCKSDB_TRUE@am__append_37 = libos_rocksdb.la
-@ENABLE_SERVER_TRUE@@WITH_DLIBROCKSDB_TRUE@am__append_38 = os/RocksDBStore.h
-@ENABLE_SERVER_TRUE@@WITH_LIBZFS_TRUE@am__append_39 = libos_zfs.a
-@ENABLE_SERVER_TRUE@@WITH_LIBZFS_TRUE@am__append_40 = os/ZFS.h
-@ENABLE_SERVER_TRUE@@WITH_KINETIC_TRUE@am__append_41 = os/KineticStore.cc
-@ENABLE_SERVER_TRUE@@WITH_KINETIC_TRUE@am__append_42 = -std=gnu++11
-@ENABLE_SERVER_TRUE@@WITH_KINETIC_TRUE@am__append_43 = -lkinetic_client -lprotobuf -lglog -lgflags libcrypto.a
-@ENABLE_SERVER_TRUE@@WITH_KINETIC_TRUE@am__append_44 = os/KineticStore.h
-@ENABLE_SERVER_TRUE@@WITH_KINETIC_TRUE@@WITH_OSD_TRUE@am__append_45 = -std=gnu++11
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am__append_46 = libosd.la
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am__append_47 = \
+@ENABLE_SERVER_TRUE@@WITH_LIBZFS_TRUE@am__append_53 = libos_zfs.a
+@ENABLE_SERVER_TRUE@@WITH_LIBZFS_TRUE@am__append_54 = os/ZFS.h
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am__append_55 = libosd.a
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am__append_56 = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/ClassHandler.h \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/HitSet.h \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/OSD.h \
@@ -324,26 +344,26 @@ check_PROGRAMS = $(am__EXEEXT_57) $(am__EXEEXT_58) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/Watch.h \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/osd_types.h
 
-@LINUX_TRUE@am__append_48 = -export-symbols-regex '.*__erasure_code_.*'
-@LINUX_TRUE@am__append_49 = -export-symbols-regex '.*__erasure_code_.*'
-@HAVE_NEON_TRUE@am__append_50 = libec_jerasure_neon.la
-@LINUX_TRUE@am__append_51 = -export-symbols-regex '.*__erasure_code_.*'
-@HAVE_SSSE3_TRUE@am__append_52 = libec_jerasure_sse3.la
-@LINUX_TRUE@am__append_53 = -export-symbols-regex '.*__erasure_code_.*'
-@HAVE_SSE4_PCLMUL_TRUE@am__append_54 = libec_jerasure_sse4.la
-@LINUX_TRUE@am__append_55 = -export-symbols-regex '.*__erasure_code_.*'
-@LINUX_TRUE@am__append_56 = -export-symbols-regex '.*__erasure_code_.*'
 @LINUX_TRUE@am__append_57 = -export-symbols-regex '.*__erasure_code_.*'
 @LINUX_TRUE@am__append_58 = -export-symbols-regex '.*__erasure_code_.*'
-@HAVE_NEON_TRUE@am__append_59 = libec_shec_neon.la
+@HAVE_NEON_TRUE@am__append_59 = libec_jerasure_neon.la
 @LINUX_TRUE@am__append_60 = -export-symbols-regex '.*__erasure_code_.*'
-@HAVE_SSSE3_TRUE@am__append_61 = libec_shec_sse3.la
+@HAVE_SSSE3_TRUE@am__append_61 = libec_jerasure_sse3.la
 @LINUX_TRUE@am__append_62 = -export-symbols-regex '.*__erasure_code_.*'
-@HAVE_SSE4_PCLMUL_TRUE@am__append_63 = libec_shec_sse4.la
+@HAVE_SSE4_PCLMUL_TRUE@am__append_63 = libec_jerasure_sse4.la
 @LINUX_TRUE@am__append_64 = -export-symbols-regex '.*__erasure_code_.*'
+@LINUX_TRUE@am__append_65 = -export-symbols-regex '.*__erasure_code_.*'
+@LINUX_TRUE@am__append_66 = -export-symbols-regex '.*__erasure_code_.*'
+@LINUX_TRUE@am__append_67 = -export-symbols-regex '.*__erasure_code_.*'
+@HAVE_NEON_TRUE@am__append_68 = libec_shec_neon.la
+@LINUX_TRUE@am__append_69 = -export-symbols-regex '.*__erasure_code_.*'
+@HAVE_SSSE3_TRUE@am__append_70 = libec_shec_sse3.la
+@LINUX_TRUE@am__append_71 = -export-symbols-regex '.*__erasure_code_.*'
+@HAVE_SSE4_PCLMUL_TRUE@am__append_72 = libec_shec_sse4.la
+@LINUX_TRUE@am__append_73 = -export-symbols-regex '.*__erasure_code_.*'
 
 # ISA
-@WITH_BETTER_YASM_ELF64_TRUE@am__append_65 = \
+@WITH_BETTER_YASM_ELF64_TRUE@am__append_74 = \
 @WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/ErasureCodeIsa.h \
 @WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/ErasureCodeIsaTableCache.h \
 @WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/xor_op.h \
@@ -354,10 +374,10 @@ check_PROGRAMS = $(am__EXEEXT_57) $(am__EXEEXT_58) \
 @WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/include/gf_vect_mul.h \
 @WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/include/types.h
 
-@LINUX_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@am__append_66 = -export-symbols-regex '.*__erasure_code_.*'
-@WITH_BETTER_YASM_ELF64_TRUE@am__append_67 = libec_isa.la
-@ENABLE_CLIENT_TRUE@am__append_68 = libclient.la
-@ENABLE_CLIENT_TRUE@am__append_69 = \
+@LINUX_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@am__append_75 = -export-symbols-regex '.*__erasure_code_.*'
+@WITH_BETTER_YASM_ELF64_TRUE@am__append_76 = libec_isa.la
+@ENABLE_CLIENT_TRUE@am__append_77 = libclient.la
+@ENABLE_CLIENT_TRUE@am__append_78 = \
 @ENABLE_CLIENT_TRUE@	client/Client.h \
 @ENABLE_CLIENT_TRUE@	client/Dentry.h \
 @ENABLE_CLIENT_TRUE@	client/Dir.h \
@@ -372,36 +392,52 @@ check_PROGRAMS = $(am__EXEEXT_57) $(am__EXEEXT_58) \
 @ENABLE_CLIENT_TRUE@	client/ioctl.h \
 @ENABLE_CLIENT_TRUE@	client/ObjecterWriteback.h
 
-@ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@am__append_70 = libclient_fuse.la
-@ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@am__append_71 = client/fuse_ll.h
-@ENABLE_CLIENT_TRUE@am__append_72 = ceph_test_ioctls
-@WITH_TCMALLOC_TRUE@am__append_73 = perfglue/heap_profiler.cc
-@WITH_TCMALLOC_TRUE@am__append_74 = -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free
-@WITH_TCMALLOC_TRUE@am__append_75 = -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free
-@WITH_TCMALLOC_FALSE@@WITH_TCMALLOC_MINIMAL_TRUE@am__append_76 = perfglue/heap_profiler.cc
-@WITH_TCMALLOC_FALSE@@WITH_TCMALLOC_MINIMAL_TRUE@am__append_77 = -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free
-@WITH_TCMALLOC_FALSE@@WITH_TCMALLOC_MINIMAL_TRUE@am__append_78 = -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free
-@WITH_TCMALLOC_FALSE@@WITH_TCMALLOC_MINIMAL_FALSE@am__append_79 = perfglue/disabled_heap_profiler.cc
-@WITH_PROFILER_TRUE@am__append_80 = perfglue/cpu_profiler.cc
-@WITH_PROFILER_FALSE@am__append_81 = perfglue/disabled_stubs.cc
-@ENABLE_XIO_TRUE@am__append_82 = \
+@ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@am__append_79 = libclient_fuse.la
+@ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@am__append_80 = client/fuse_ll.h
+@ENABLE_CLIENT_TRUE@am__append_81 = ceph_test_ioctls
+@WITH_TCMALLOC_TRUE@am__append_82 = perfglue/heap_profiler.cc
+@WITH_TCMALLOC_TRUE@am__append_83 = -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free
+@WITH_TCMALLOC_TRUE@am__append_84 = -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free
+@WITH_TCMALLOC_FALSE@@WITH_TCMALLOC_MINIMAL_TRUE@am__append_85 = perfglue/heap_profiler.cc
+@WITH_TCMALLOC_FALSE@@WITH_TCMALLOC_MINIMAL_TRUE@am__append_86 = -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free
+@WITH_TCMALLOC_FALSE@@WITH_TCMALLOC_MINIMAL_TRUE@am__append_87 = -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free
+@WITH_TCMALLOC_FALSE@@WITH_TCMALLOC_MINIMAL_FALSE@am__append_88 = perfglue/disabled_heap_profiler.cc
+@WITH_PROFILER_TRUE@am__append_89 = perfglue/cpu_profiler.cc
+@WITH_PROFILER_FALSE@am__append_90 = perfglue/disabled_stubs.cc
+@ENABLE_SERVER_TRUE@am__append_91 = \
+@ENABLE_SERVER_TRUE@	common/xattr.c \
+@ENABLE_SERVER_TRUE@	common/ipaddr.cc \
+@ENABLE_SERVER_TRUE@	common/ceph_json.cc \
+@ENABLE_SERVER_TRUE@	common/util.cc \
+@ENABLE_SERVER_TRUE@	common/pick_address.cc
+
+@LINUX_TRUE@am__append_92 = \
+@LINUX_TRUE@	common/linux_version.c 
+
+@SOLARIS_TRUE@am__append_93 = \
+@SOLARIS_TRUE@        common/solaris_errno.cc
+
+@LINUX_TRUE@@WITH_RBD_TRUE@am__append_94 = \
+@LINUX_TRUE@@WITH_RBD_TRUE@	common/blkdev.cc
+
+@ENABLE_XIO_TRUE@am__append_95 = \
 @ENABLE_XIO_TRUE@	common/address_helper.cc
 
-@WITH_GOOD_YASM_ELF64_TRUE@am__append_83 = common/crc32c_intel_fast_asm.S common/crc32c_intel_fast_zero_asm.S
-@HAVE_ARMV8_CRC_TRUE@am__append_84 = libcommon_crc_aarch64.la
-@HAVE_ARMV8_CRC_TRUE@am__append_85 = libcommon_crc_aarch64.la
-@LINUX_TRUE@am__append_86 = -lrt -lblkid
-@ENABLE_XIO_TRUE@am__append_87 = \
+@WITH_GOOD_YASM_ELF64_TRUE@am__append_96 = common/crc32c_intel_fast_asm.S common/crc32c_intel_fast_zero_asm.S
+@HAVE_ARMV8_CRC_TRUE@am__append_97 = libcommon_crc_aarch64.la
+@HAVE_ARMV8_CRC_TRUE@am__append_98 = libcommon_crc_aarch64.la
+@LINUX_TRUE@am__append_99 = -lrt -lblkid
+@ENABLE_XIO_TRUE@am__append_100 = \
 @ENABLE_XIO_TRUE@	common/address_helper.h
 
-@LINUX_TRUE@am__append_88 = libsecret.la
-@LINUX_TRUE@am__append_89 = msg/async/EventEpoll.cc
-@DARWIN_TRUE@am__append_90 = msg/async/EventKqueue.cc
-@FREEBSD_TRUE@am__append_91 = msg/async/EventKqueue.cc
-@LINUX_TRUE@am__append_92 = msg/async/EventEpoll.h
-@DARWIN_TRUE@am__append_93 = msg/async/EventKqueue.h
-@FREEBSD_TRUE@am__append_94 = msg/async/EventKqueue.h
-@ENABLE_XIO_TRUE@am__append_95 = \
+@LINUX_TRUE@am__append_101 = libsecret.la
+@LINUX_TRUE@am__append_102 = msg/async/EventEpoll.cc
+@DARWIN_TRUE@am__append_103 = msg/async/EventKqueue.cc
+@FREEBSD_TRUE@am__append_104 = msg/async/EventKqueue.cc
+@LINUX_TRUE@am__append_105 = msg/async/EventEpoll.h
+@DARWIN_TRUE@am__append_106 = msg/async/EventKqueue.h
+@FREEBSD_TRUE@am__append_107 = msg/async/EventKqueue.h
+@ENABLE_XIO_TRUE@am__append_108 = \
 @ENABLE_XIO_TRUE@	msg/xio/QueueStrategy.cc \
 @ENABLE_XIO_TRUE@	msg/xio/XioConnection.cc \
 @ENABLE_XIO_TRUE@	msg/xio/XioMessenger.cc \
@@ -409,7 +445,7 @@ check_PROGRAMS = $(am__EXEEXT_57) $(am__EXEEXT_58) \
 @ENABLE_XIO_TRUE@	msg/xio/XioPortal.cc \
 @ENABLE_XIO_TRUE@	msg/xio/XioPool.cc
 
-@ENABLE_XIO_TRUE@am__append_96 = \
+@ENABLE_XIO_TRUE@am__append_109 = \
 @ENABLE_XIO_TRUE@	msg/xio/DispatchStrategy.h \
 @ENABLE_XIO_TRUE@	msg/xio/FastStrategy.h \
 @ENABLE_XIO_TRUE@	msg/xio/QueueStrategy.h \
@@ -421,17 +457,18 @@ check_PROGRAMS = $(am__EXEEXT_57) $(am__EXEEXT_58) \
 @ENABLE_XIO_TRUE@	msg/xio/XioPortal.h \
 @ENABLE_XIO_TRUE@	msg/xio/XioSubmit.h
 
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@am__append_97 =  \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@am__append_110 =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	librados_internal.la \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	librados_api.la
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@am__append_98 = \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	librados_api.la \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	libjournal.la
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@am__append_111 = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	librados_internal.la libcls_lock_client.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(LIBOSDC) $(LIBCOMMON_DEPS)
 
-@ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@am__append_99 = -fvisibility=hidden -fvisibility-inlines-hidden
-@ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@am__append_100 = -Xcompiler -Xlinker -Xcompiler '--exclude-libs=ALL'
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@am__append_101 = librados.la
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@am__append_102 = \
+@ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@am__append_112 = -fvisibility=hidden -fvisibility-inlines-hidden
+@ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@am__append_113 = -Xcompiler -Xlinker -Xcompiler '--exclude-libs=ALL'
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@am__append_114 = librados.la
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@am__append_115 = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	librados/snap_set_diff.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	librados/AioCompletionImpl.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	librados/IoCtxImpl.h \
@@ -440,19 +477,38 @@ check_PROGRAMS = $(am__EXEEXT_57) $(am__EXEEXT_58) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	librados/RadosXattrIter.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	librados/ListObjectImpl.h
 
-@ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@am__append_103 = -export-symbols-regex '^radosstriper_.*'
-@ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@am__append_104 = libradosstriper.la
-@ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@am__append_105 = \
+@ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@am__append_116 = -export-symbols-regex '^radosstriper_.*'
+@ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@am__append_117 = libradosstriper.la
+@ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@am__append_118 = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@	libradosstriper/RadosStriperImpl.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@	libradosstriper/MultiAioCompletionImpl.h
 
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am__append_106 = librbd_internal.la \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@am__append_119 = \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	journal/AsyncOpTracker.h \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	journal/Entry.h \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	journal/Future.h \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	journal/FutureImpl.h \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	journal/Journaler.h \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	journal/JournalMetadata.h \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	journal/JournalPlayer.h \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	journal/JournalRecorder.h \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	journal/JournalTrimmer.h \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	journal/ObjectPlayer.h \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	journal/ObjectRecorder.h \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	journal/ReplayEntry.h \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	journal/ReplayHandler.h \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	journal/Utils.h
+
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@am__append_120 = libjournal.la
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am__append_121 = librbd_internal.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd_api.la
-@ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am__append_107 = -Xcompiler -Xlinker -Xcompiler '--exclude-libs=ALL'
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am__append_108 = librbd.la
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am__append_109 = \
+@ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am__append_122 = -Xcompiler -Xlinker -Xcompiler '--exclude-libs=ALL'
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am__append_123 = librbd.la
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am__append_124 = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AioCompletion.h \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AioRequest.h \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AioImageRequest.h \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AioImageRequestWQ.h \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AioObjectRequest.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AsyncFlattenRequest.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AsyncObjectThrottle.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AsyncOperation.h \
@@ -464,6 +520,10 @@ check_PROGRAMS = $(am__EXEEXT_57) $(am__EXEEXT_58) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/ImageCtx.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/ImageWatcher.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/internal.h \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/Journal.h \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/JournalReplay.h \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/JournalTypes.h \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/LibrbdAdminSocketHook.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/LibrbdWriteback.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/ObjectMap.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/parent_types.h \
@@ -474,16 +534,16 @@ check_PROGRAMS = $(am__EXEEXT_57) $(am__EXEEXT_58) \
 
 
 # inject rgw stuff in the decoder testcase
-@ENABLE_CLIENT_TRUE@am__append_110 = \
+@ENABLE_CLIENT_TRUE@am__append_125 = \
 @ENABLE_CLIENT_TRUE@	rgw/rgw_dencoder.cc \
 @ENABLE_CLIENT_TRUE@	rgw/rgw_acl.cc \
 @ENABLE_CLIENT_TRUE@	rgw/rgw_common.cc \
 @ENABLE_CLIENT_TRUE@	rgw/rgw_env.cc \
 @ENABLE_CLIENT_TRUE@	rgw/rgw_json_enc.cc
 
-@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@am__append_126 = librgw.la \
+@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@am__append_126 = librgw.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	libcivetweb.la
-@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@am__append_112 = \
+@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@am__append_127 = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	libcls_rgw_client.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	libcls_log_client.a \
@@ -500,12 +560,12 @@ check_PROGRAMS = $(am__EXEEXT_57) $(am__EXEEXT_58) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	-lfcgi \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	-ldl
 
-@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@am__append_113 = radosgw \
+@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@am__append_128 = radosgw \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	radosgw-admin \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	radosgw-object-expirer
-@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@am__append_114 = ceph_rgw_multiparser \
+@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@am__append_129 = ceph_rgw_multiparser \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	ceph_rgw_jsonparser
-@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@am__append_115 = \
+@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@am__append_130 = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/rgw_acl.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/rgw_acl_s3.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/rgw_acl_swift.h \
@@ -564,18 +624,25 @@ check_PROGRAMS = $(am__EXEEXT_57) $(am__EXEEXT_58) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	civetweb/include/civetweb_conf.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	civetweb/src/md5.h
 
-@ENABLE_CLIENT_TRUE@am__append_116 = libcls_lock_client.la \
+@ENABLE_CLIENT_TRUE@am__append_131 = libcls_lock_client.la \
 @ENABLE_CLIENT_TRUE@	libcls_refcount_client.la \
 @ENABLE_CLIENT_TRUE@	libcls_rgw_client.la libcls_rbd_client.la \
 @ENABLE_CLIENT_TRUE@	libcls_cephfs_client.la \
-@ENABLE_CLIENT_TRUE@	libcls_numops_client.la
-@ENABLE_CLIENT_TRUE@am__append_117 = libcls_version_client.a \
+@ENABLE_CLIENT_TRUE@	libcls_numops_client.la \
+@ENABLE_CLIENT_TRUE@	libcls_journal_client.la
+@ENABLE_CLIENT_TRUE@am__append_132 = libcls_lock_client.la \
+@ENABLE_CLIENT_TRUE@	libcls_refcount_client.la \
+@ENABLE_CLIENT_TRUE@	libcls_replica_log_client.a \
+@ENABLE_CLIENT_TRUE@	libcls_rgw_client.la libcls_user_client.a \
+@ENABLE_CLIENT_TRUE@	libcls_numops_client.la \
+@ENABLE_CLIENT_TRUE@	libcls_journal_client.la
+@ENABLE_CLIENT_TRUE@am__append_133 = libcls_version_client.a \
 @ENABLE_CLIENT_TRUE@	libcls_log_client.a \
 @ENABLE_CLIENT_TRUE@	libcls_statelog_client.a \
 @ENABLE_CLIENT_TRUE@	libcls_timeindex_client.a \
 @ENABLE_CLIENT_TRUE@	libcls_replica_log_client.a \
 @ENABLE_CLIENT_TRUE@	libcls_user_client.a
-@ENABLE_CLIENT_TRUE@am__append_118 = \
+@ENABLE_CLIENT_TRUE@am__append_134 = \
 @ENABLE_CLIENT_TRUE@	cls/lock/cls_lock_types.h \
 @ENABLE_CLIENT_TRUE@	cls/lock/cls_lock_ops.h \
 @ENABLE_CLIENT_TRUE@	cls/lock/cls_lock_client.h \
@@ -606,9 +673,11 @@ check_PROGRAMS = $(am__EXEEXT_57) $(am__EXEEXT_58) \
 @ENABLE_CLIENT_TRUE@	cls/user/cls_user_ops.h \
 @ENABLE_CLIENT_TRUE@	cls/user/cls_user_types.h \
 @ENABLE_CLIENT_TRUE@	cls/cephfs/cls_cephfs.h \
-@ENABLE_CLIENT_TRUE@	cls/cephfs/cls_cephfs_client.h
+@ENABLE_CLIENT_TRUE@	cls/cephfs/cls_cephfs_client.h \
+@ENABLE_CLIENT_TRUE@	cls/journal/cls_journal_client.h \
+@ENABLE_CLIENT_TRUE@	cls/journal/cls_journal_types.h
 
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am__append_119 = libcls_hello.la \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am__append_135 = libcls_hello.la \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	libcls_numops.la \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	libcls_rbd.la \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	libcls_lock.la \
@@ -620,14 +689,15 @@ check_PROGRAMS = $(am__EXEEXT_57) $(am__EXEEXT_58) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	libcls_replica_log.la \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	libcls_user.la \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	libcls_rgw.la \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	libcls_cephfs.la
-@ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE@am__append_120 = libcls_kvs.la
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am__append_121 = \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	libcls_cephfs.la \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	libcls_journal.la
+@ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE@am__append_136 = libcls_kvs.la
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am__append_137 = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	key_value_store/key_value_structure.h \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	key_value_store/kv_flat_btree_async.h \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	key_value_store/kvs_arg_types.h
 
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am__append_122 = rbd_replay/ActionTypes.h \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am__append_138 = rbd_replay/ActionTypes.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	rbd_replay/actions.hpp \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	rbd_replay/BoundedBuffer.hpp \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	rbd_replay/BufferReader.h \
@@ -637,26 +707,26 @@ check_PROGRAMS = $(am__EXEEXT_57) $(am__EXEEXT_58) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	rbd_replay/rbd_loc.hpp \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	rbd_replay/rbd_replay_debug.hpp \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	rbd_replay/Replayer.hpp
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am__append_123 = librbd_replay_types.la \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am__append_139 = librbd_replay_types.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd_replay.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd_replay_ios.la
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am__append_124 = librbd_replay_types.la
-@ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am__append_125 = rbd-replay
-@ENABLE_CLIENT_TRUE@@WITH_BABELTRACE_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am__append_126 = rbd-replay-prep
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am__append_127 = \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am__append_140 = librbd_replay_types.la
+@ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am__append_141 = rbd-replay
+@ENABLE_CLIENT_TRUE@@WITH_BABELTRACE_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am__append_142 = rbd-replay-prep
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am__append_143 = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	test/erasure-code/test-erasure-code.sh \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	test/erasure-code/test-erasure-eio.sh
 
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am__append_128 = test/erasure-code/ceph_erasure_code_benchmark.h \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am__append_144 = test/erasure-code/ceph_erasure_code_benchmark.h \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	test/erasure-code/ceph_erasure_code_benchmark.h \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	test/erasure-code/ErasureCodeExample.h
-@ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE@am__append_129 = -ldl
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am__append_130 = ceph_erasure_code_benchmark \
+@ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE@am__append_145 = -ldl
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am__append_146 = ceph_erasure_code_benchmark \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	ceph_erasure_code
-@ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE@am__append_131 = -ldl
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am__append_132 = ceph_erasure_code_non_regression
-@ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE@am__append_133 = -ldl
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am__append_134 = libec_example.la \
+@ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE@am__append_147 = -ldl
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am__append_148 = ceph_erasure_code_non_regression
+@ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE@am__append_149 = -ldl
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am__append_150 = libec_example.la \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	libec_missing_entry_point.la \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	libec_missing_version.la \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	libec_hangs.la \
@@ -670,19 +740,19 @@ check_PROGRAMS = $(am__EXEEXT_57) $(am__EXEEXT_58) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	libec_test_shec_sse4.la \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	libec_test_shec_sse3.la \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	libec_test_shec_generic.la
-@ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE@am__append_151 = -ldl
+@ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE@am__append_151 = -ldl
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am__append_152 = unittest_erasure_code_plugin \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_erasure_code \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_erasure_code_jerasure \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_erasure_code_plugin_jerasure
-@ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE@am__append_137 = -ldl
-@ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE@am__append_138 = -ldl
-@ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@am__append_139 = -ldl
-@ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@am__append_140 = unittest_erasure_code_isa \
+@ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE@am__append_153 = -ldl
+@ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE@am__append_154 = -ldl
+@ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@am__append_155 = -ldl
+@ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@am__append_156 = unittest_erasure_code_isa \
 @ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	unittest_erasure_code_plugin_isa
-@ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@am__append_141 = -ldl
-@ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE@am__append_142 = -ldl
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am__append_143 =  \
+@ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@am__append_157 = -ldl
+@ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE@am__append_158 = -ldl
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am__append_159 =  \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_erasure_code_lrc \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_erasure_code_plugin_lrc \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_erasure_code_shec \
@@ -691,43 +761,44 @@ check_PROGRAMS = $(am__EXEEXT_57) $(am__EXEEXT_58) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_erasure_code_shec_arguments \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_erasure_code_plugin_shec \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_erasure_code_example
-@ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE@am__append_144 = -ldl
-@ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE@am__append_145 = -ldl
-@ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE@am__append_146 = -ldl
-@ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE@am__append_147 = -ldl
-@ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE@am__append_148 = -ldl
-@ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE@am__append_149 = -ldl
-@ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@am__append_150 = test/messenger/message_helper.h \
+@ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE@am__append_160 = -ldl
+@ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE@am__append_161 = -ldl
+@ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE@am__append_162 = -ldl
+@ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE@am__append_163 = -ldl
+@ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE@am__append_164 = -ldl
+@ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE@am__append_165 = -ldl
+@ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@am__append_166 = test/messenger/message_helper.h \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	test/messenger/simple_dispatcher.h \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	test/messenger/xio_dispatcher.h
-@ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@@LINUX_TRUE@am__append_151 = -ldl
-@ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@@LINUX_TRUE@am__append_152 = -ldl
-@ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@am__append_153 = simple_server \
+@ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@@LINUX_TRUE@am__append_167 = -ldl
+@ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@@LINUX_TRUE@am__append_168 = -ldl
+@ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@am__append_169 = simple_server \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	simple_client xio_server \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	xio_client
-@ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@@LINUX_TRUE@am__append_154 = -ldl
-@ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@@LINUX_TRUE@am__append_155 = -ldl
-@COMPILER_HAS_VTA_TRUE@@ENABLE_CLIENT_TRUE@am__append_156 = -fno-var-tracking-assignments
-@COMPILER_HAS_VTA_TRUE@@ENABLE_CLIENT_TRUE@am__append_157 = -fno-var-tracking-assignments
-@ENABLE_CLIENT_TRUE@am__append_158 = ceph-dencoder
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@am__append_159 = libradostest.la \
+@ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@@LINUX_TRUE@am__append_170 = -ldl
+@ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@@LINUX_TRUE@am__append_171 = -ldl
+@COMPILER_HAS_VTA_TRUE@@ENABLE_CLIENT_TRUE@am__append_172 = -fno-var-tracking-assignments
+@COMPILER_HAS_VTA_TRUE@@ENABLE_CLIENT_TRUE@am__append_173 = -fno-var-tracking-assignments
+@ENABLE_CLIENT_TRUE@am__append_174 = ceph-dencoder
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@am__append_175 = libradostest.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	librados_test_stub.la
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@am__append_160 = ceph_test_rados \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@am__append_176 = ceph_test_rados \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_test_mutate
-@ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOS_TRUE@am__append_161 = test_build_librados
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@am__append_162 =  \
+@ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOS_TRUE@am__append_177 = test_build_librados
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@am__append_178 =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_smalliobench \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_omapbench \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_objectstore_bench
-@ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@am__append_163 = ceph_kvstorebench \
+@ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@am__append_179 = ceph_kvstorebench \
 @ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@	ceph_test_rados_list_parallel \
 @ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@	ceph_test_rados_open_pools_parallel \
 @ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@	ceph_test_rados_delete_pools_parallel \
 @ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@	ceph_test_rados_watch_notify
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@am__append_164 =  \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@am__append_180 =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	unittest_librados \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	unittest_librados_config
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@am__append_165 =  \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	unittest_librados_config \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	unittest_journal
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@am__append_181 =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_multi_stress_watch \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_test_cls_rbd \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_test_cls_refcount \
@@ -738,6 +809,7 @@ check_PROGRAMS = $(am__EXEEXT_57) $(am__EXEEXT_58) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_test_cls_lock \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_test_cls_hello \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_test_cls_numops \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_test_cls_journal \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_test_rados_api_cmd \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_test_rados_api_io \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_test_rados_api_c_write_operations \
@@ -754,8 +826,10 @@ check_PROGRAMS = $(am__EXEEXT_57) $(am__EXEEXT_58) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_test_rados_api_tier \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_test_rados_api_lock \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_test_stress_watch
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@am__append_166 = \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@am__append_182 = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	test/librados_test_stub/LibradosTestStub.h \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	test/librados_test_stub/MockTestMemIoCtxImpl.h \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	test/librados_test_stub/MockTestMemRadosClient.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	test/librados_test_stub/TestClassHandler.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	test/librados_test_stub/TestRadosClient.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	test/librados_test_stub/TestMemRadosClient.h \
@@ -763,14 +837,14 @@ check_PROGRAMS = $(am__EXEEXT_57) $(am__EXEEXT_58) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	test/librados_test_stub/TestMemIoCtxImpl.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	test/librados_test_stub/TestIoCtxImpl.h
 
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am__append_167 = ceph_smalliobenchrbd \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am__append_183 = ceph_smalliobenchrbd \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	ceph_test_librbd \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	ceph_test_librbd_api
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am__append_168 = unittest_rbd_replay
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am__append_169 = librbd_test.la
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am__append_170 = unittest_librbd
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am__append_171 = test/run-rbd-unit-tests.sh
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am__append_172 = \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am__append_184 = unittest_rbd_replay
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am__append_185 = librbd_test.la
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am__append_186 = unittest_librbd
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am__append_187 = test/run-rbd-unit-tests.sh
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am__append_188 = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/test_fixture.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/test_mock_fixture.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/test_support.h \
@@ -779,41 +853,41 @@ check_PROGRAMS = $(am__EXEEXT_57) $(am__EXEEXT_58) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/mock/MockImageWatcher.h \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/mock/MockObjectMap.h
 
-@ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am__append_173 = ceph_test_librbd_fsx
-@ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@am__append_174 = libradosstripertest.la
-@ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@am__append_175 = ceph_test_rados_striper_api_io \
+@ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am__append_189 = ceph_test_librbd_fsx
+@ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@am__append_190 = libradosstripertest.la
+@ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@am__append_191 = ceph_test_rados_striper_api_io \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@	ceph_test_rados_striper_api_aio \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@	ceph_test_rados_striper_api_striping
-@ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@am__append_176 = test_build_libcephfs
-@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@am__append_177 = unittest_encoding \
+@ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@am__append_192 = test_build_libcephfs
+@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@am__append_193 = unittest_encoding \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	unittest_base64 \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	unittest_run_cmd \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	unittest_simple_spin \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	unittest_libcephfs_config
-@ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@am__append_178 = test/libcephfs/flock.cc
-@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@am__append_179 = ceph_test_libcephfs \
+@ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@am__append_194 = test/libcephfs/flock.cc
+@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@am__append_195 = ceph_test_libcephfs \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	ceph_test_c_headers
-@CLANG_FALSE@@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@am__append_180 = -Werror -Wold-style-declaration
-@ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@am__append_181 = test_build_librgw
-@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@am__append_182 = ceph_test_cors \
+@CLANG_FALSE@@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@am__append_196 = -Werror -Wold-style-declaration
+@ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@am__append_197 = test_build_librgw
+@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@am__append_198 = ceph_test_cors \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	ceph_test_rgw_manifest \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	ceph_test_rgw_obj \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	ceph_test_cls_rgw_meta \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	ceph_test_cls_rgw_log \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	ceph_test_cls_rgw_opstate \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	ceph_test_cls_rgw
-@ENABLE_SERVER_TRUE@am__append_183 = ceph_test_async_driver \
+@ENABLE_SERVER_TRUE@am__append_199 = ceph_test_async_driver \
 @ENABLE_SERVER_TRUE@	ceph_test_msgr ceph_streamtest \
 @ENABLE_SERVER_TRUE@	ceph_test_trans ceph_test_mon_workloadgen \
 @ENABLE_SERVER_TRUE@	ceph_test_mon_msg ceph_perf_objectstore \
 @ENABLE_SERVER_TRUE@	ceph_perf_local ceph_perf_msgr_server \
 @ENABLE_SERVER_TRUE@	ceph_perf_msgr_client
-@ENABLE_SERVER_TRUE@am__append_184 = test/perf_helper.h
-@ENABLE_SERVER_TRUE@@LINUX_TRUE@am__append_185 =  \
+@ENABLE_SERVER_TRUE@am__append_200 = test/perf_helper.h
+@ENABLE_SERVER_TRUE@@LINUX_TRUE@am__append_201 =  \
 @ENABLE_SERVER_TRUE@@LINUX_TRUE@	ceph_test_objectstore \
 @ENABLE_SERVER_TRUE@@LINUX_TRUE@	ceph_test_keyvaluedb \
 @ENABLE_SERVER_TRUE@@LINUX_TRUE@	ceph_test_filestore
-@ENABLE_SERVER_TRUE@am__append_186 =  \
+@ENABLE_SERVER_TRUE@am__append_202 =  \
 @ENABLE_SERVER_TRUE@	ceph_test_objectstore_workloadgen \
 @ENABLE_SERVER_TRUE@	ceph_test_filestore_idempotent \
 @ENABLE_SERVER_TRUE@	ceph_test_filestore_idempotent_sequence \
@@ -821,513 +895,63 @@ check_PROGRAMS = $(am__EXEEXT_57) $(am__EXEEXT_58) \
 @ENABLE_SERVER_TRUE@	ceph_test_object_map \
 @ENABLE_SERVER_TRUE@	ceph_test_keyvaluedb_atomicity \
 @ENABLE_SERVER_TRUE@	ceph_test_keyvaluedb_iterators
-@ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_RADOS_TRUE@am__append_187 = ceph_smalliobenchfs \
+@ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_RADOS_TRUE@am__append_203 = ceph_smalliobenchfs \
 @ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_RADOS_TRUE@	ceph_smalliobenchdumb \
 @ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_RADOS_TRUE@	ceph_tpbench
-@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@am__append_188 = ceph_test_keys
-@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@am__append_189 = get_command_descriptions
-@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@am__append_190 =  \
+@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@am__append_204 = ceph_test_keys
+@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@am__append_205 = get_command_descriptions
+@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@am__append_206 =  \
 @ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	unittest_mon_moncap \
 @ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	unittest_mon_pgmap
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am__append_191 =  \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am__append_207 =  \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_ecbackend \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_osdscrub \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_pglog \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_hitset \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_osd_osdcap \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_pageset
-@ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE@am__append_192 = -ldl
-@ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE@am__append_193 = -ldl
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am__append_194 = ceph_test_snap_mapper
-@ENABLE_SERVER_TRUE@@WITH_SLIBROCKSDB_TRUE@am__append_195 = unittest_rocksdb_option_static
-@ENABLE_SERVER_TRUE@@WITH_DLIBROCKSDB_TRUE@am__append_196 = unittest_rocksdb_option
-@ENABLE_SERVER_TRUE@am__append_197 = unittest_chain_xattr \
+@ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE@am__append_208 = -ldl
+@ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE@am__append_209 = -ldl
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am__append_210 = ceph_test_snap_mapper
+@ENABLE_SERVER_TRUE@@WITH_SLIBROCKSDB_TRUE@am__append_211 = unittest_rocksdb_option_static
+@ENABLE_SERVER_TRUE@@WITH_DLIBROCKSDB_TRUE@am__append_212 = unittest_rocksdb_option
+@ENABLE_SERVER_TRUE@am__append_213 = unittest_chain_xattr \
 @ENABLE_SERVER_TRUE@	unittest_lfnindex
-@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@am__append_198 = unittest_mds_authcap
-@WITH_BUILD_TESTS_TRUE@am__append_199 = test_build_libcommon
-@LINUX_TRUE@am__append_200 = libsystest.la
-@LINUX_TRUE@am__append_201 = unittest_blkdev
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@am__append_202 =  \
+@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@am__append_214 = unittest_mds_authcap
+@WITH_BUILD_TESTS_TRUE@am__append_215 = test_build_libcommon
+@LINUX_TRUE@am__append_216 = libsystest.la
+@SOLARIS_TRUE@am__append_217 = \
+@SOLARIS_TRUE@	-lsocket -lnsl
+
+@LINUX_TRUE@am__append_218 = unittest_blkdev
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@am__append_219 =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_scratchtool \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_scratchtoolpp \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_radosacl
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@am__append_203 = rados
-@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@am__append_204 = ceph-client-debug
-@ENABLE_SERVER_TRUE@am__append_205 = ceph-osdomap-tool \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@am__append_220 = rados
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am__append_221 = \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/ArgumentTypes.h \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/IndentStream.h \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/OptionPrinter.h \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/Shell.h \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/Utils.h
+
+@ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am__append_222 = rbd
+@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@am__append_223 = ceph-client-debug
+@ENABLE_SERVER_TRUE@am__append_224 = ceph-osdomap-tool \
 @ENABLE_SERVER_TRUE@	ceph-monstore-tool ceph-kvstore-tool
-@ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE@am__append_206 = -ldl
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am__append_207 = ceph-objectstore-tool
-@ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE@am__append_208 = cephfs-journal-tool \
+@ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE@am__append_225 = -ldl
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am__append_226 = ceph-objectstore-tool
+@ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE@am__append_227 = cephfs-journal-tool \
 @ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE@	cephfs-table-tool \
 @ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE@	cephfs-data-scan
-@WITH_SLIBROCKSDB_TRUE@am__append_209 = rocksdb
-@WITH_SLIBROCKSDB_FALSE@am__append_210 = \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/.gitignore \
-@WITH_SLIBROCKSDB_FALSE@        rocksdb/CONTRIBUTING.md \
-@WITH_SLIBROCKSDB_FALSE@        rocksdb/HISTORY.md \
-@WITH_SLIBROCKSDB_FALSE@        rocksdb/INSTALL.md \
-@WITH_SLIBROCKSDB_FALSE@        rocksdb/LICENSE \
-@WITH_SLIBROCKSDB_FALSE@        rocksdb/Makefile.am \
-@WITH_SLIBROCKSDB_FALSE@        rocksdb/PATENTS \
-@WITH_SLIBROCKSDB_FALSE@        rocksdb/README.md \
-@WITH_SLIBROCKSDB_FALSE@        rocksdb/ROCKSDB_LITE.md \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/AUTHORS \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/configure.ac \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/CONTRIBUTING.md \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/builder.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/builder.h \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/c.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/column_family.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/column_family.h \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/column_family_test.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/compact_files_test.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/compaction.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/compaction.h \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/compaction_job.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/compaction_job.h \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/compaction_job_test.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/compaction_picker.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/compaction_picker.h \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/compaction_picker_test.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/comparator_db_test.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/corruption_test.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/c_test.c \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/cuckoo_table_db_test.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/db_bench.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/db_filesnapshot.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/dbformat.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/dbformat.h \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/dbformat_test.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/db_impl.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/db_impl_debug.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/db_impl_experimental.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/db_impl.h \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/db_impl_readonly.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/db_impl_readonly.h \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/db_iter.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/db_iter.h \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/db_iter_test.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/db_test.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/deletefile_test.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/event_logger_helpers.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/event_logger_helpers.h \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/experimental.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/fault_injection_test.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/file_indexer.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/file_indexer.h \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/file_indexer_test.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/filename.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/filename.h \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/filename_test.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/flush_job.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/flush_job.h \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/flush_job_test.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/flush_scheduler.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/flush_scheduler.h \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/forward_iterator.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/forward_iterator.h \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/internal_stats.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/internal_stats.h \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/job_context.h \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/listener_test.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/log_format.h \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/log_reader.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/log_reader.h \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/log_test.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/log_writer.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/log_writer.h \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/managed_iterator.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/managed_iterator.h \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/memtable_allocator.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/memtable_allocator.h \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/memtable.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/memtable.h \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/memtable_list.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/memtable_list.h \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/memtable_list_test.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/memtablerep_bench.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/merge_context.h \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/merge_helper.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/merge_helper.h \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/merge_operator.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/merge_test.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/perf_context_test.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/plain_table_db_test.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/prefix_test.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/repair.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/skiplist.h \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/skiplist_test.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/slice.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/snapshot.h \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/table_cache.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/table_cache.h \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/table_properties_collector.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/table_properties_collector.h \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/table_properties_collector_test.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/transaction_log_impl.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/transaction_log_impl.h \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/version_builder.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/version_builder.h \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/version_builder_test.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/version_edit.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/version_edit.h \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/version_edit_test.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/version_set.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/version_set.h \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/version_set_test.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/wal_manager.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/wal_manager.h \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/wal_manager_test.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/write_batch_base.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/write_batch.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/write_batch_internal.h \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/write_batch_test.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/writebuffer.h \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/write_controller.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/write_controller.h \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/write_controller_test.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/write_thread.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/db/write_thread.h \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/doc/doc.css \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/doc/index.html \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/doc/log_format.txt \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/doc/rockslogo.jpg \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/doc/rockslogo.png \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/examples/column_families_example.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/examples/compact_files_example.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/examples/c_simple_example.c \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/examples/.gitignore \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/examples/Makefile \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/examples/README.md \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/examples/simple_example.cc \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/hdfs/env_hdfs.h \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/hdfs/README \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/hdfs/setup.sh \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/HISTORY.md \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/cache.h \
-@WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/c.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/compaction_filter.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/comparator.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/db.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/env.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/experimental.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/filter_policy.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/flush_block_policy.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/immutable_options.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/iostats_context.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/iterator.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/ldb_tool.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/listener.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/memtablerep.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/merge_operator.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/metadata.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/options.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/perf_context.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/rate_limiter.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/slice.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/slice_transform.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/sst_dump_tool.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/statistics.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/status.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/table.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/table_properties.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/thread_status.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/transaction_log.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/types.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/universal_compaction.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/utilities/backupable_db.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/utilities/checkpoint.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/utilities/convenience.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/utilities/db_ttl.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/utilities/document_db.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/utilities/flashcache.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/utilities/geo_db.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/utilities/json_document.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/utilities/leveldb_options.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/utilities/spatial_db.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/utilities/stackable_db.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/utilities/utility_db.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/utilities/write_batch_with_index.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/version.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/write_batch_base.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/rocksdb/write_batch.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/utilities/backupable_db.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/utilities/db_ttl.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/utilities/document_db.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/utilities/geo_db.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/utilities/json_document.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/utilities/stackable_db.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/include/utilities/utility_db.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/INSTALL.md \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/LICENSE \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/m4/libtool.m4 \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/m4/lt~obsolete.m4 \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/m4/ltoptions.m4 \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/m4/ltsugar.m4 \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/m4/ltversion.m4 \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/Makefile.am \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/PATENTS \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/port/likely.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/port/port_example.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/port/port.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/port/port_posix.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/port/port_posix.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/port/README \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/port/stack_trace.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/port/stack_trace.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/port/win/stdint.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/README.md \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/ROCKSDB_LITE.md \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/adaptive_table_factory.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/adaptive_table_factory.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/block_based_filter_block.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/block_based_filter_block.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/block_based_filter_block_test.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/block_based_table_builder.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/block_based_table_builder.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/block_based_table_factory.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/block_based_table_factory.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/block_based_table_reader.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/block_based_table_reader.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/block_builder.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/block_builder.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/block.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/block.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/block_hash_index.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/block_hash_index.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/block_hash_index_test.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/block_prefix_index.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/block_prefix_index.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/block_test.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/bloom_block.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/bloom_block.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/cuckoo_table_builder.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/cuckoo_table_builder.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/cuckoo_table_builder_test.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/cuckoo_table_factory.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/cuckoo_table_factory.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/cuckoo_table_reader.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/cuckoo_table_reader.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/cuckoo_table_reader_test.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/filter_block.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/flush_block_policy.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/format.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/format.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/full_filter_block.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/full_filter_block.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/full_filter_block_test.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/get_context.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/get_context.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/iterator.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/iterator_wrapper.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/iter_heap.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/merger.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/merger.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/merger_test.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/meta_blocks.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/meta_blocks.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/mock_table.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/mock_table.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/plain_table_builder.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/plain_table_builder.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/plain_table_factory.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/plain_table_factory.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/plain_table_index.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/plain_table_index.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/plain_table_key_coding.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/plain_table_key_coding.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/plain_table_reader.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/plain_table_reader.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/table_builder.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/table_properties.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/table_properties_internal.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/table_reader_bench.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/table_reader.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/table_test.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/two_level_iterator.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/table/two_level_iterator.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/third-party/fbson/COMMIT.md \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/third-party/fbson/FbsonDocument.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/third-party/fbson/FbsonJsonParser.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/third-party/fbson/FbsonStream.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/third-party/fbson/FbsonUtil.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/third-party/fbson/FbsonWriter.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/third-party/flashcache/flashcache_ioctl.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/third-party/gtest-1.7.0/fused-src/gtest/gtest-all.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/third-party/gtest-1.7.0/fused-src/gtest/gtest.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/USERS.md \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/allocator.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/arena.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/arena.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/arena_test.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/auto_roll_logger.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/auto_roll_logger.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/auto_roll_logger_test.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/autovector.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/autovector_test.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/bloom.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/bloom_test.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/build_version.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/cache_bench.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/cache.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/cache_test.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/coding.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/coding.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/coding_test.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/comparator.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/compression.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/crc32c.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/crc32c.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/crc32c_test.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/db_info_dumper.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/db_info_dumper.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/dynamic_bloom.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/dynamic_bloom.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/dynamic_bloom_test.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/env.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/env_hdfs.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/env_posix.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/env_test.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/event_logger.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/event_logger.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/event_logger_test.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/filelock_test.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/file_util.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/file_util.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/filter_policy.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/hash.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/hash_cuckoo_rep.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/hash_cuckoo_rep.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/hash.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/hash_linklist_rep.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/hash_linklist_rep.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/hash_skiplist_rep.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/hash_skiplist_rep.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/histogram.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/histogram.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/histogram_test.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/instrumented_mutex.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/instrumented_mutex.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/iostats_context.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/iostats_context_imp.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/backupable/backupable_db.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/backupable/backupable_db_test.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/checkpoint/checkpoint.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/compacted_db/compacted_db_impl.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/compacted_db/compacted_db_impl.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/convenience/convenience.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/document/document_db.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/document/document_db_test.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/document/json_document_builder.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/document/json_document.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/document/json_document_test.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/flashcache/flashcache.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/flashcache/flashcache.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/geodb/geodb_impl.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/geodb/geodb_impl.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/geodb/geodb_test.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/leveldb_options/leveldb_options.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/merge_operators.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/merge_operators/put.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/merge_operators/string_append/stringappend2.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/merge_operators/string_append/stringappend2.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/merge_operators/string_append/stringappend.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/merge_operators/string_append/stringappend.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/merge_operators/string_append/stringappend_test.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/merge_operators/uint64add.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/redis/README \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/redis/redis_list_exception.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/redis/redis_list_iterator.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/redis/redis_lists.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/redis/redis_lists.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/redis/redis_lists_test.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/spatialdb/spatial_db.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/spatialdb/spatial_db_test.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/spatialdb/utils.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/ttl/db_ttl_impl.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/ttl/db_ttl_impl.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/ttl/ttl_test.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/write_batch_with_index/write_batch_with_index.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/ldb_cmd.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/ldb_cmd_execute_result.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/ldb_cmd.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/ldb_tool.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/log_buffer.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/log_buffer.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/logging.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/logging.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/log_write_bench.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/manual_compaction_test.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/memenv.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/memenv_test.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/mock_env.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/mock_env.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/mock_env_test.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/murmurhash.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/murmurhash.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/mutable_cf_options.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/mutable_cf_options.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/mutexlock.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/options_builder.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/options.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/options_helper.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/options_helper.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/options_test.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/perf_context.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/perf_context_imp.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/posix_logger.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/random.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/rate_limiter.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/rate_limiter.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/rate_limiter_test.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/scoped_arena_iterator.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/skiplistrep.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/slice.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/slice_transform_test.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/sst_dump_test.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/sst_dump_tool.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/sst_dump_tool_imp.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/statistics.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/statistics.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/status.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/stl_wrappers.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/stop_watch.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/string_util.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/string_util.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/sync_point.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/sync_point.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/testharness.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/testharness.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/testutil.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/testutil.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/thread_list_test.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/thread_local.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/thread_local.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/thread_local_test.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/thread_operation.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/thread_status_impl.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/thread_status_updater.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/thread_status_updater_debug.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/thread_status_updater.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/thread_status_util.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/thread_status_util_debug.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/thread_status_util.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/vectorrep.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/xfunc.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/xfunc.h \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/xxhash.cc \
- at WITH_SLIBROCKSDB_FALSE@	rocksdb/util/xxhash.h
-
-@WITH_LTTNG_TRUE@am__append_211 = \
+@WITH_LTTNG_TRUE@am__append_228 = \
 @WITH_LTTNG_TRUE@	libosd_tp.la \
 @WITH_LTTNG_TRUE@	libos_tp.la \
 @WITH_LTTNG_TRUE@	librados_tp.la \
 @WITH_LTTNG_TRUE@	librbd_tp.la
 
-@WITH_LTTNG_TRUE@am__append_212 = \
+@WITH_LTTNG_TRUE@am__append_229 = \
 @WITH_LTTNG_TRUE@	tracing/librados.h \
 @WITH_LTTNG_TRUE@	tracing/librbd.h \
 @WITH_LTTNG_TRUE@	tracing/objectstore.h \
@@ -1336,52 +960,53 @@ check_PROGRAMS = $(am__EXEEXT_57) $(am__EXEEXT_58) \
 @WITH_LTTNG_TRUE@	tracing/pg.h
 
 TESTS = $(am__EXEEXT_57) $(check_SCRIPTS)
-@ENABLE_CLIENT_TRUE@am__append_213 = \
+@ENABLE_CLIENT_TRUE@am__append_230 = \
 @ENABLE_CLIENT_TRUE@	pybind/ceph_argparse.py \
 @ENABLE_CLIENT_TRUE@	pybind/ceph_daemon.py
 
-@ENABLE_CLIENT_TRUE@am__append_214 = ceph-syn
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@am__append_215 = \
+@ENABLE_CLIENT_TRUE@am__append_231 = ceph-syn
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@am__append_232 = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(srcdir)/bash_completion/rados \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(srcdir)/bash_completion/radosgw-admin
 
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@am__append_216 = pybind/rados.py
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@am__append_217 = librados-config
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am__append_218 = \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@am__append_233 = pybind/rados.py
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@am__append_234 = librados-config
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am__append_235 = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(srcdir)/bash_completion/rbd
 
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am__append_219 = \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am__append_236 = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	ceph-rbdnamer \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	rbd-replay-many
-
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am__append_220 = pybind/rbd.py
-@ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am__append_221 = libkrbd.la
-@ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am__append_222 = rbd
-@ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@@WITH_RADOS_TRUE@am__append_223 = ceph-fuse
-@ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am__append_224 = rbd-fuse
-@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@am__append_225 = cephfs
-@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@am__append_226 = pybind/cephfs.py
-@ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@am__append_227 = -Xcompiler -Xlinker -Xcompiler '--exclude-libs=libcommon.a'
-@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@am__append_228 = libcephfs.la
-@ENABLE_CEPHFS_JAVA_TRUE@@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@am__append_229 = libcephfs_jni.la
-@ENABLE_SERVER_TRUE@am__append_230 = ceph-run ceph-rest-api \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	rbd-replay-many \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@        rbdmap
+
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am__append_237 = pybind/rbd.py
+@ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am__append_238 = libkrbd.la
+@ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@@WITH_RADOS_TRUE@am__append_239 = ceph-fuse
+@ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am__append_240 = rbd-fuse
+@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@am__append_241 = cephfs
+@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@am__append_242 = pybind/cephfs.py
+@ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@am__append_243 = -Xcompiler -Xlinker -Xcompiler '--exclude-libs=libcommon.a'
+@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@am__append_244 = libcephfs.la
+@ENABLE_CEPHFS_JAVA_TRUE@@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@am__append_245 = libcephfs_jni.la
+@ENABLE_SERVER_TRUE@am__append_246 = ceph-run ceph-rest-api \
 @ENABLE_SERVER_TRUE@	ceph-debugpack ceph-crush-location \
 @ENABLE_SERVER_TRUE@	ceph-coverage
-@ENABLE_SERVER_TRUE@am__append_231 = pybind/ceph_rest_api.py
-@ENABLE_SERVER_TRUE@am__append_232 = ceph-coverage init-ceph
-@ENABLE_SERVER_TRUE@am__append_233 = init-ceph
-@ENABLE_SERVER_TRUE@@LINUX_TRUE@am__append_234 = mount.ceph
-@ENABLE_SERVER_TRUE@am__append_235 = mount.fuse.ceph
-@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@am__append_236 = ceph-mon
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am__append_237 = \
+@ENABLE_SERVER_TRUE@am__append_247 = pybind/ceph_rest_api.py
+@ENABLE_SERVER_TRUE@am__append_248 = ceph-coverage init-ceph
+@ENABLE_SERVER_TRUE@am__append_249 = init-ceph
+@ENABLE_SERVER_TRUE@@LINUX_TRUE@am__append_250 = mount.ceph
+@ENABLE_SERVER_TRUE@am__append_251 = mount.fuse.ceph
+@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@am__append_252 = ceph-mon
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am__append_253 = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	ceph-disk \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	ceph-disk-udev
 
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am__append_238 = \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am__append_254 = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	ceph-clsinfo
 
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am__append_239 = ceph-osd
-@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@am__append_240 = ceph-mds
+@ENABLE_SERVER_TRUE@@WITH_LTTNG_TRUE@@WITH_OSD_TRUE@am__append_255 = $(LIBOSD_TP)
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am__append_256 = ceph-osd
+@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@am__append_257 = ceph-mds
 subdir = src
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
 am__aclocal_m4_deps = $(top_srcdir)/m4/ac_check_classpath.m4 \
@@ -1462,11 +1087,118 @@ am__libcls_version_client_a_SOURCES_DIST =  \
 @ENABLE_CLIENT_TRUE@	cls/version/cls_version_types.$(OBJEXT)
 libcls_version_client_a_OBJECTS =  \
 	$(am_libcls_version_client_a_OBJECTS)
+libkv_a_AR = $(AR) $(ARFLAGS)
+am__DEPENDENCIES_1 =
+@ENABLE_SERVER_TRUE@@WITH_KINETIC_TRUE@am__DEPENDENCIES_2 =  \
+@ENABLE_SERVER_TRUE@@WITH_KINETIC_TRUE@	libcrypto.a
+@ENABLE_SERVER_TRUE@libkv_a_DEPENDENCIES = $(am__append_30) \
+@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_1) \
+@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_2)
+am__libkv_a_SOURCES_DIST = kv/KeyValueDB.cc kv/LevelDBStore.cc \
+	kv/RocksDBStore.cc kv/KineticStore.cc
+@ENABLE_SERVER_TRUE@@WITH_SLIBROCKSDB_TRUE@am__objects_1 = kv/libkv_a-RocksDBStore.$(OBJEXT)
+@ENABLE_SERVER_TRUE@@WITH_DLIBROCKSDB_TRUE@am__objects_2 = kv/libkv_a-RocksDBStore.$(OBJEXT)
+@ENABLE_SERVER_TRUE@@WITH_KINETIC_TRUE@am__objects_3 = kv/libkv_a-KineticStore.$(OBJEXT)
+@ENABLE_SERVER_TRUE@am_libkv_a_OBJECTS =  \
+@ENABLE_SERVER_TRUE@	kv/libkv_a-KeyValueDB.$(OBJEXT) \
+@ENABLE_SERVER_TRUE@	kv/libkv_a-LevelDBStore.$(OBJEXT) \
+@ENABLE_SERVER_TRUE@	$(am__objects_1) $(am__objects_2) \
+@ENABLE_SERVER_TRUE@	$(am__objects_3)
+libkv_a_OBJECTS = $(am_libkv_a_OBJECTS)
+libmon_a_AR = $(AR) $(ARFLAGS)
+libmon_a_DEPENDENCIES =
+am__libmon_a_SOURCES_DIST = mon/Monitor.cc mon/Paxos.cc \
+	mon/PaxosService.cc mon/OSDMonitor.cc mon/MDSMonitor.cc \
+	mon/MonmapMonitor.cc mon/PGMonitor.cc mon/LogMonitor.cc \
+	mon/AuthMonitor.cc mon/Elector.cc mon/HealthMonitor.cc \
+	mon/DataHealthService.cc mon/ConfigKeyService.cc
+@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@am_libmon_a_OBJECTS =  \
+@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	mon/Monitor.$(OBJEXT) \
+@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	mon/Paxos.$(OBJEXT) \
+@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	mon/PaxosService.$(OBJEXT) \
+@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	mon/OSDMonitor.$(OBJEXT) \
+@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	mon/MDSMonitor.$(OBJEXT) \
+@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	mon/MonmapMonitor.$(OBJEXT) \
+@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	mon/PGMonitor.$(OBJEXT) \
+@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	mon/LogMonitor.$(OBJEXT) \
+@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	mon/AuthMonitor.$(OBJEXT) \
+@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	mon/Elector.$(OBJEXT) \
+@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	mon/HealthMonitor.$(OBJEXT) \
+@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	mon/DataHealthService.$(OBJEXT) \
+@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	mon/ConfigKeyService.$(OBJEXT)
+libmon_a_OBJECTS = $(am_libmon_a_OBJECTS)
+libos_a_AR = $(AR) $(ARFLAGS)
+@ENABLE_SERVER_TRUE@libos_a_DEPENDENCIES = libos_types.a libkv.a \
+@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_1)
+am__libos_a_SOURCES_DIST = os/chain_xattr.cc os/fs/FS.cc \
+	os/DBObjectMap.cc os/GenericObjectMap.cc os/FileJournal.cc \
+	os/FileStore.cc os/GenericFileStoreBackend.cc os/HashIndex.cc \
+	os/IndexManager.cc os/JournalingObjectStore.cc os/LFNIndex.cc \
+	os/MemStore.cc os/KeyValueStore.cc os/ObjectStore.cc \
+	os/WBThrottle.cc os/BtrfsFileStoreBackend.cc \
+	os/newstore/NewStore.cc os/fs/XFS.cc os/XfsFileStoreBackend.cc \
+	os/ZFSFileStoreBackend.cc
+@ENABLE_SERVER_TRUE@@LINUX_TRUE@am__objects_4 = os/BtrfsFileStoreBackend.$(OBJEXT)
+@ENABLE_SERVER_TRUE@@WITH_LIBAIO_TRUE@am__objects_5 = os/newstore/NewStore.$(OBJEXT)
+@ENABLE_SERVER_TRUE@@WITH_LIBXFS_TRUE@am__objects_6 =  \
+@ENABLE_SERVER_TRUE@@WITH_LIBXFS_TRUE@	os/fs/XFS.$(OBJEXT) \
+@ENABLE_SERVER_TRUE@@WITH_LIBXFS_TRUE@	os/XfsFileStoreBackend.$(OBJEXT)
+@ENABLE_SERVER_TRUE@@WITH_LIBZFS_TRUE@am__objects_7 = os/ZFSFileStoreBackend.$(OBJEXT)
+@ENABLE_SERVER_TRUE@am_libos_a_OBJECTS = os/chain_xattr.$(OBJEXT) \
+@ENABLE_SERVER_TRUE@	os/fs/FS.$(OBJEXT) \
+@ENABLE_SERVER_TRUE@	os/DBObjectMap.$(OBJEXT) \
+@ENABLE_SERVER_TRUE@	os/GenericObjectMap.$(OBJEXT) \
+@ENABLE_SERVER_TRUE@	os/FileJournal.$(OBJEXT) \
+@ENABLE_SERVER_TRUE@	os/FileStore.$(OBJEXT) \
+@ENABLE_SERVER_TRUE@	os/GenericFileStoreBackend.$(OBJEXT) \
+@ENABLE_SERVER_TRUE@	os/HashIndex.$(OBJEXT) \
+@ENABLE_SERVER_TRUE@	os/IndexManager.$(OBJEXT) \
+@ENABLE_SERVER_TRUE@	os/JournalingObjectStore.$(OBJEXT) \
+@ENABLE_SERVER_TRUE@	os/LFNIndex.$(OBJEXT) \
+@ENABLE_SERVER_TRUE@	os/MemStore.$(OBJEXT) \
+@ENABLE_SERVER_TRUE@	os/KeyValueStore.$(OBJEXT) \
+@ENABLE_SERVER_TRUE@	os/ObjectStore.$(OBJEXT) \
+@ENABLE_SERVER_TRUE@	os/WBThrottle.$(OBJEXT) $(am__objects_4) \
+@ENABLE_SERVER_TRUE@	$(am__objects_5) $(am__objects_6) \
+@ENABLE_SERVER_TRUE@	$(am__objects_7)
+libos_a_OBJECTS = $(am_libos_a_OBJECTS)
+libos_types_a_AR = $(AR) $(ARFLAGS)
+libos_types_a_LIBADD =
+am__libos_types_a_SOURCES_DIST = os/Transaction.cc \
+	os/newstore/newstore_types.cc
+@ENABLE_SERVER_TRUE@@WITH_LIBAIO_TRUE@am__objects_8 = os/newstore/libos_types_a-newstore_types.$(OBJEXT)
+am_libos_types_a_OBJECTS = os/libos_types_a-Transaction.$(OBJEXT) \
+	$(am__objects_8)
+libos_types_a_OBJECTS = $(am_libos_types_a_OBJECTS)
 libos_zfs_a_AR = $(AR) $(ARFLAGS)
 libos_zfs_a_LIBADD =
 am__libos_zfs_a_SOURCES_DIST = os/ZFS.cc
 @ENABLE_SERVER_TRUE@@WITH_LIBZFS_TRUE@am_libos_zfs_a_OBJECTS = os/libos_zfs_a-ZFS.$(OBJEXT)
 libos_zfs_a_OBJECTS = $(am_libos_zfs_a_OBJECTS)
+libosd_a_AR = $(AR) $(ARFLAGS)
+libosd_a_DEPENDENCIES =
+am__libosd_a_SOURCES_DIST = osd/PG.cc osd/ReplicatedPG.cc \
+	osd/ReplicatedBackend.cc osd/ECBackend.cc osd/ECMsgTypes.cc \
+	osd/ECTransaction.cc osd/PGBackend.cc osd/HitSet.cc osd/OSD.cc \
+	osd/OSDCap.cc osd/Watch.cc osd/ClassHandler.cc \
+	osd/OpRequest.cc osd/SnapMapper.cc objclass/class_api.cc
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am_libosd_a_OBJECTS =  \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/libosd_a-PG.$(OBJEXT) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/libosd_a-ReplicatedPG.$(OBJEXT) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/libosd_a-ReplicatedBackend.$(OBJEXT) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/libosd_a-ECBackend.$(OBJEXT) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/libosd_a-ECMsgTypes.$(OBJEXT) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/libosd_a-ECTransaction.$(OBJEXT) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/libosd_a-PGBackend.$(OBJEXT) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/libosd_a-HitSet.$(OBJEXT) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/libosd_a-OSD.$(OBJEXT) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/libosd_a-OSDCap.$(OBJEXT) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/libosd_a-Watch.$(OBJEXT) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/libosd_a-ClassHandler.$(OBJEXT) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/libosd_a-OpRequest.$(OBJEXT) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/libosd_a-SnapMapper.$(OBJEXT) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	objclass/libosd_a-class_api.$(OBJEXT)
+libosd_a_OBJECTS = $(am_libosd_a_OBJECTS)
 am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`;
 am__vpath_adj = case $$p in \
     $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \
@@ -1529,14 +1261,13 @@ am_libauth_la_OBJECTS = auth/AuthAuthorizeHandler.lo \
 	auth/unknown/AuthUnknownAuthorizeHandler.lo auth/Crypto.lo \
 	auth/KeyRing.lo auth/RotatingKeyRing.lo
 libauth_la_OBJECTS = $(am_libauth_la_OBJECTS)
-am__DEPENDENCIES_1 =
-am__DEPENDENCIES_2 = $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) \
+am__DEPENDENCIES_3 = $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) \
 	$(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1)
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@libcephfs_la_DEPENDENCIES = $(LIBCLIENT) \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(LIBCOMMON) \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_1) \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_1) \
-@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_2)
+@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_3)
 am__libcephfs_la_SOURCES_DIST = libcephfs.cc
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@am_libcephfs_la_OBJECTS = libcephfs.lo
 libcephfs_la_OBJECTS = $(am_libcephfs_la_OBJECTS)
@@ -1548,7 +1279,7 @@ libcephfs_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(libdir)
 @ENABLE_CEPHFS_JAVA_TRUE@@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@libcephfs_jni_la_DEPENDENCIES = $(LIBCEPHFS) \
 @ENABLE_CEPHFS_JAVA_TRUE@@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(LIBCOMMON) \
-@ENABLE_CEPHFS_JAVA_TRUE@@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_2)
+@ENABLE_CEPHFS_JAVA_TRUE@@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_3)
 am__libcephfs_jni_la_SOURCES_DIST = java/native/libcephfs_jni.cc \
 	java/native/ScopedLocalRef.h java/native/JniConstants.cpp \
 	java/native/JniConstants.h
@@ -1599,7 +1330,7 @@ libclient_fuse_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 @ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@am_libclient_fuse_la_rpath =
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@libcls_cephfs_la_DEPENDENCIES =  \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_2)
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_3)
 am__libcls_cephfs_la_SOURCES_DIST = cls/cephfs/cls_cephfs.cc
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am_libcls_cephfs_la_OBJECTS =  \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	cls/cephfs/cls_cephfs.lo
@@ -1620,7 +1351,7 @@ libcls_cephfs_client_la_OBJECTS =  \
 @ENABLE_CLIENT_TRUE@am_libcls_cephfs_client_la_rpath =
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@libcls_hello_la_DEPENDENCIES =  \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_2)
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_3)
 am__libcls_hello_la_SOURCES_DIST = cls/hello/cls_hello.cc
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am_libcls_hello_la_OBJECTS =  \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	cls/hello/cls_hello.lo
@@ -1631,8 +1362,33 @@ libcls_hello_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(LDFLAGS) -o $@
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am_libcls_hello_la_rpath = -rpath \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(radoslibdir)
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@libcls_journal_la_DEPENDENCIES =  \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_3)
+am__libcls_journal_la_SOURCES_DIST = cls/journal/cls_journal.cc \
+	cls/journal/cls_journal_types.cc
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am_libcls_journal_la_OBJECTS =  \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	cls/journal/cls_journal.lo \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	cls/journal/cls_journal_types.lo
+libcls_journal_la_OBJECTS = $(am_libcls_journal_la_OBJECTS)
+libcls_journal_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
+	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
+	$(AM_CXXFLAGS) $(CXXFLAGS) $(libcls_journal_la_LDFLAGS) \
+	$(LDFLAGS) -o $@
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am_libcls_journal_la_rpath =  \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	-rpath $(radoslibdir)
+libcls_journal_client_la_LIBADD =
+am__libcls_journal_client_la_SOURCES_DIST =  \
+	cls/journal/cls_journal_client.cc \
+	cls/journal/cls_journal_types.cc
+@ENABLE_CLIENT_TRUE@am_libcls_journal_client_la_OBJECTS =  \
+@ENABLE_CLIENT_TRUE@	cls/journal/cls_journal_client.lo \
+@ENABLE_CLIENT_TRUE@	cls/journal/cls_journal_types.lo
+libcls_journal_client_la_OBJECTS =  \
+	$(am_libcls_journal_client_la_OBJECTS)
+@ENABLE_CLIENT_TRUE@am_libcls_journal_client_la_rpath =
 @ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE@libcls_kvs_la_DEPENDENCIES = $(am__DEPENDENCIES_1) \
-@ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_2)
+@ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_3)
 am__libcls_kvs_la_SOURCES_DIST = key_value_store/cls_kvs.cc
 @ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE@am_libcls_kvs_la_OBJECTS = key_value_store/cls_kvs.lo
 libcls_kvs_la_OBJECTS = $(am_libcls_kvs_la_OBJECTS)
@@ -1645,7 +1401,7 @@ libcls_kvs_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 @ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE@	$(radoslibdir)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@libcls_lock_la_DEPENDENCIES =  \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_2)
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_3)
 am__libcls_lock_la_SOURCES_DIST = cls/lock/cls_lock.cc
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am_libcls_lock_la_OBJECTS =  \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	cls/lock/cls_lock.lo
@@ -1667,7 +1423,7 @@ libcls_lock_client_la_OBJECTS = $(am_libcls_lock_client_la_OBJECTS)
 @ENABLE_CLIENT_TRUE@am_libcls_lock_client_la_rpath =
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@libcls_log_la_DEPENDENCIES =  \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_2)
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_3)
 am__libcls_log_la_SOURCES_DIST = cls/log/cls_log.cc
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am_libcls_log_la_OBJECTS =  \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	cls/log/cls_log.lo
@@ -1699,7 +1455,7 @@ libcls_numops_client_la_OBJECTS =  \
 @ENABLE_CLIENT_TRUE@am_libcls_numops_client_la_rpath =
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@libcls_rbd_la_DEPENDENCIES =  \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_2)
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_3)
 am__libcls_rbd_la_SOURCES_DIST = cls/rbd/cls_rbd.cc
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am_libcls_rbd_la_OBJECTS =  \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	cls/rbd/cls_rbd.lo
@@ -1719,7 +1475,7 @@ libcls_rbd_client_la_OBJECTS = $(am_libcls_rbd_client_la_OBJECTS)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@libcls_refcount_la_DEPENDENCIES =  \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	libjson_spirit.la \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_2)
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_3)
 am__libcls_refcount_la_SOURCES_DIST = cls/refcount/cls_refcount.cc \
 	cls/refcount/cls_refcount_ops.cc common/ceph_json.cc
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am_libcls_refcount_la_OBJECTS = cls/refcount/cls_refcount.lo \
@@ -1744,7 +1500,7 @@ libcls_refcount_client_la_OBJECTS =  \
 @ENABLE_CLIENT_TRUE@am_libcls_refcount_client_la_rpath =
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@libcls_replica_log_la_DEPENDENCIES =  \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_2)
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_3)
 am__libcls_replica_log_la_SOURCES_DIST =  \
 	cls/replica_log/cls_replica_log.cc
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am_libcls_replica_log_la_OBJECTS = cls/replica_log/cls_replica_log.lo
@@ -1758,7 +1514,7 @@ libcls_replica_log_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@libcls_rgw_la_DEPENDENCIES =  \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	libjson_spirit.la \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_2)
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_3)
 am__libcls_rgw_la_SOURCES_DIST = cls/rgw/cls_rgw.cc \
 	cls/rgw/cls_rgw_ops.cc cls/rgw/cls_rgw_types.cc \
 	common/ceph_json.cc
@@ -1785,7 +1541,7 @@ libcls_rgw_client_la_OBJECTS = $(am_libcls_rgw_client_la_OBJECTS)
 @ENABLE_CLIENT_TRUE@am_libcls_rgw_client_la_rpath =
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@libcls_statelog_la_DEPENDENCIES =  \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_2)
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_3)
 am__libcls_statelog_la_SOURCES_DIST = cls/statelog/cls_statelog.cc
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am_libcls_statelog_la_OBJECTS = cls/statelog/cls_statelog.lo
 libcls_statelog_la_OBJECTS = $(am_libcls_statelog_la_OBJECTS)
@@ -1797,7 +1553,7 @@ libcls_statelog_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	-rpath $(radoslibdir)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@libcls_timeindex_la_DEPENDENCIES =  \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_2)
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_3)
 am__libcls_timeindex_la_SOURCES_DIST = cls/timeindex/cls_timeindex.cc
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am_libcls_timeindex_la_OBJECTS = cls/timeindex/cls_timeindex.lo
 libcls_timeindex_la_OBJECTS = $(am_libcls_timeindex_la_OBJECTS)
@@ -1809,7 +1565,7 @@ libcls_timeindex_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	-rpath $(radoslibdir)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@libcls_user_la_DEPENDENCIES =  \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_2)
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_3)
 am__libcls_user_la_SOURCES_DIST = cls/user/cls_user.cc
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am_libcls_user_la_OBJECTS =  \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	cls/user/cls_user.lo
@@ -1822,7 +1578,7 @@ libcls_user_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(radoslibdir)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@libcls_version_la_DEPENDENCIES =  \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_2)
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_3)
 am__libcls_version_la_SOURCES_DIST = cls/version/cls_version.cc
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am_libcls_version_la_OBJECTS =  \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	cls/version/cls_version.lo
@@ -1833,11 +1589,11 @@ libcls_version_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(LDFLAGS) -o $@
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am_libcls_version_la_rpath =  \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	-rpath $(radoslibdir)
-am__DEPENDENCIES_3 = libcommon_internal.la libcommon_crc.la \
-	$(am__append_84) $(LIBERASURE_CODE) $(LIBMSG) $(LIBAUTH) \
+am__DEPENDENCIES_4 = libcommon_internal.la libcommon_crc.la \
+	$(am__append_97) $(LIBERASURE_CODE) $(LIBMSG) $(LIBAUTH) \
 	$(LIBCRUSH) $(LIBJSON_SPIRIT) $(LIBLOG) $(LIBARCH) \
 	$(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1)
-libcommon_la_DEPENDENCIES = $(am__DEPENDENCIES_3)
+libcommon_la_DEPENDENCIES = $(am__DEPENDENCIES_4)
 am_libcommon_la_OBJECTS = common/buffer.lo
 libcommon_la_OBJECTS = $(am_libcommon_la_OBJECTS)
 libcommon_crc_la_LIBADD =
@@ -1845,12 +1601,12 @@ am__libcommon_crc_la_SOURCES_DIST = common/sctp_crc32.c \
 	common/crc32c.cc common/crc32c_intel_baseline.c \
 	common/crc32c_intel_fast.c common/crc32c_intel_fast_asm.S \
 	common/crc32c_intel_fast_zero_asm.S
-@WITH_GOOD_YASM_ELF64_TRUE@am__objects_1 = common/libcommon_crc_la-crc32c_intel_fast_asm.lo \
+@WITH_GOOD_YASM_ELF64_TRUE@am__objects_9 = common/libcommon_crc_la-crc32c_intel_fast_asm.lo \
 @WITH_GOOD_YASM_ELF64_TRUE@	common/libcommon_crc_la-crc32c_intel_fast_zero_asm.lo
 am_libcommon_crc_la_OBJECTS = common/libcommon_crc_la-sctp_crc32.lo \
 	common/libcommon_crc_la-crc32c.lo \
 	common/libcommon_crc_la-crc32c_intel_baseline.lo \
-	common/libcommon_crc_la-crc32c_intel_fast.lo $(am__objects_1)
+	common/libcommon_crc_la-crc32c_intel_fast.lo $(am__objects_9)
 libcommon_crc_la_OBJECTS = $(am_libcommon_crc_la_OBJECTS)
 libcommon_crc_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(libcommon_crc_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link \
@@ -1877,7 +1633,7 @@ am__libcommon_internal_la_SOURCES_DIST = ceph_ver.c \
 	common/Throttle.cc common/Timer.cc common/Finisher.cc \
 	common/environment.cc common/assert.cc common/run_cmd.cc \
 	common/WorkQueue.cc common/ConfUtils.cc common/MemoryModel.cc \
-	common/armor.c common/fd.cc common/xattr.c common/safe_io.c \
+	common/armor.c common/fd.cc common/safe_io.c \
 	common/snap_types.cc common/str_list.cc common/str_map.cc \
 	common/errno.cc common/RefCountedObj.cc common/common_init.cc \
 	common/pipe.c common/ceph_argparse.cc common/ceph_context.cc \
@@ -1887,18 +1643,25 @@ am__libcommon_internal_la_SOURCES_DIST = ceph_ver.c \
 	common/config.cc common/utf8.c common/mime.c common/strtol.cc \
 	common/page.cc common/lockdep.cc common/version.cc \
 	common/hex.cc common/entity_name.cc common/ceph_crypto.cc \
-	common/ceph_crypto_cms.cc common/ceph_json.cc common/ipaddr.cc \
-	common/pick_address.cc common/util.cc common/TextTable.cc \
+	common/ceph_crypto_cms.cc common/TextTable.cc \
 	common/ceph_fs.cc common/ceph_hash.cc common/ceph_strings.cc \
 	common/ceph_frag.cc common/addr_parsing.c common/hobject.cc \
-	common/bloom_filter.cc common/linux_version.c common/module.c \
-	common/Readahead.cc common/Cycles.cc \
-	common/ContextCompletion.cc common/TracepointProvider.cc \
+	common/bloom_filter.cc common/module.c common/Readahead.cc \
+	common/Cycles.cc common/ContextCompletion.cc \
+	common/TracepointProvider.cc common/xattr.c common/ipaddr.cc \
+	common/ceph_json.cc common/util.cc common/pick_address.cc \
+	common/linux_version.c common/solaris_errno.cc \
 	common/blkdev.cc common/address_helper.cc mon/MonCap.cc \
 	mon/MonClient.cc mon/MonMap.cc osd/OSDMap.cc osd/osd_types.cc \
 	osd/ECMsgTypes.cc osd/HitSet.cc mds/MDSMap.cc \
 	mds/inode_backtrace.cc mds/mdstypes.cc mds/flock.cc
-@ENABLE_XIO_TRUE@am__objects_2 = common/address_helper.lo
+@ENABLE_SERVER_TRUE@am__objects_10 = common/xattr.lo common/ipaddr.lo \
+@ENABLE_SERVER_TRUE@	common/ceph_json.lo common/util.lo \
+@ENABLE_SERVER_TRUE@	common/pick_address.lo
+@LINUX_TRUE@am__objects_11 = common/linux_version.lo
+@SOLARIS_TRUE@am__objects_12 = common/solaris_errno.lo
+@LINUX_TRUE@@WITH_RBD_TRUE@am__objects_13 = common/blkdev.lo
+@ENABLE_XIO_TRUE@am__objects_14 = common/address_helper.lo
 am_libcommon_internal_la_OBJECTS = ceph_ver.lo common/DecayCounter.lo \
 	common/LogClient.lo common/LogEntry.lo \
 	common/PrebufferedStreambuf.lo common/SloppyCRCMap.lo \
@@ -1909,7 +1672,7 @@ am_libcommon_internal_la_OBJECTS = ceph_ver.lo common/DecayCounter.lo \
 	common/Throttle.lo common/Timer.lo common/Finisher.lo \
 	common/environment.lo common/assert.lo common/run_cmd.lo \
 	common/WorkQueue.lo common/ConfUtils.lo common/MemoryModel.lo \
-	common/armor.lo common/fd.lo common/xattr.lo common/safe_io.lo \
+	common/armor.lo common/fd.lo common/safe_io.lo \
 	common/snap_types.lo common/str_list.lo common/str_map.lo \
 	common/errno.lo common/RefCountedObj.lo common/common_init.lo \
 	common/pipe.lo common/ceph_argparse.lo common/ceph_context.lo \
@@ -1920,17 +1683,17 @@ am_libcommon_internal_la_OBJECTS = ceph_ver.lo common/DecayCounter.lo \
 	common/strtol.lo common/page.lo common/lockdep.lo \
 	common/version.lo common/hex.lo common/entity_name.lo \
 	common/ceph_crypto.lo common/ceph_crypto_cms.lo \
-	common/ceph_json.lo common/ipaddr.lo common/pick_address.lo \
-	common/util.lo common/TextTable.lo common/ceph_fs.lo \
-	common/ceph_hash.lo common/ceph_strings.lo common/ceph_frag.lo \
+	common/TextTable.lo common/ceph_fs.lo common/ceph_hash.lo \
+	common/ceph_strings.lo common/ceph_frag.lo \
 	common/addr_parsing.lo common/hobject.lo \
-	common/bloom_filter.lo common/linux_version.lo \
-	common/module.lo common/Readahead.lo common/Cycles.lo \
-	common/ContextCompletion.lo common/TracepointProvider.lo \
-	common/blkdev.lo $(am__objects_2) mon/MonCap.lo \
-	mon/MonClient.lo mon/MonMap.lo osd/OSDMap.lo osd/osd_types.lo \
-	osd/ECMsgTypes.lo osd/HitSet.lo mds/MDSMap.lo \
-	mds/inode_backtrace.lo mds/mdstypes.lo mds/flock.lo
+	common/bloom_filter.lo common/module.lo common/Readahead.lo \
+	common/Cycles.lo common/ContextCompletion.lo \
+	common/TracepointProvider.lo $(am__objects_10) \
+	$(am__objects_11) $(am__objects_12) $(am__objects_13) \
+	$(am__objects_14) mon/MonCap.lo mon/MonClient.lo mon/MonMap.lo \
+	osd/OSDMap.lo osd/osd_types.lo osd/ECMsgTypes.lo osd/HitSet.lo \
+	mds/MDSMap.lo mds/inode_backtrace.lo mds/mdstypes.lo \
+	mds/flock.lo
 libcommon_internal_la_OBJECTS = $(am_libcommon_internal_la_OBJECTS)
 libcompressor_la_DEPENDENCIES = $(LIBCOMMON)
 am_libcompressor_la_OBJECTS = compressor/Compressor.lo \
@@ -1944,7 +1707,7 @@ libcrush_la_OBJECTS = $(am_libcrush_la_OBJECTS)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@libec_example_la_DEPENDENCIES =  \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBCRUSH) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_2)
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_3)
 am__libec_example_la_SOURCES_DIST = erasure-code/ErasureCode.cc \
 	test/erasure-code/ErasureCodePluginExample.cc
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am_libec_example_la_OBJECTS = erasure-code/libec_example_la-ErasureCode.lo \
@@ -1958,7 +1721,7 @@ libec_example_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(erasure_codelibdir)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@libec_fail_to_initialize_la_DEPENDENCIES =  \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_2)
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_3)
 am__libec_fail_to_initialize_la_SOURCES_DIST =  \
 	test/erasure-code/ErasureCodePluginFailToInitialize.cc
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am_libec_fail_to_initialize_la_OBJECTS = test/erasure-code/libec_fail_to_initialize_la-ErasureCodePluginFailToInitialize.lo
@@ -1973,7 +1736,7 @@ libec_fail_to_initialize_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(erasure_codelibdir)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@libec_fail_to_register_la_DEPENDENCIES =  \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_2)
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_3)
 am__libec_fail_to_register_la_SOURCES_DIST =  \
 	test/erasure-code/ErasureCodePluginFailToRegister.cc
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am_libec_fail_to_register_la_OBJECTS = test/erasure-code/libec_fail_to_register_la-ErasureCodePluginFailToRegister.lo
@@ -1988,7 +1751,7 @@ libec_fail_to_register_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(erasure_codelibdir)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@libec_hangs_la_DEPENDENCIES =  \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_2)
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_3)
 am__libec_hangs_la_SOURCES_DIST =  \
 	test/erasure-code/ErasureCodePluginHangs.cc
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am_libec_hangs_la_OBJECTS = test/erasure-code/libec_hangs_la-ErasureCodePluginHangs.lo
@@ -2001,7 +1764,7 @@ libec_hangs_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(erasure_codelibdir)
 @WITH_BETTER_YASM_ELF64_TRUE@libec_isa_la_DEPENDENCIES = $(LIBCRUSH) \
 @WITH_BETTER_YASM_ELF64_TRUE@	$(am__DEPENDENCIES_1) \
-@WITH_BETTER_YASM_ELF64_TRUE@	$(am__DEPENDENCIES_2)
+@WITH_BETTER_YASM_ELF64_TRUE@	$(am__DEPENDENCIES_3)
 am__libec_isa_la_SOURCES_DIST = erasure-code/ErasureCode.cc \
 	erasure-code/isa/isa-l/erasure_code/ec_base.c \
 	erasure-code/isa/isa-l/erasure_code/ec_highlevel_func.c \
@@ -2048,7 +1811,7 @@ am__libec_isa_la_SOURCES_DIST = erasure-code/ErasureCode.cc \
 	erasure-code/isa/ErasureCodeIsaTableCache.cc \
 	erasure-code/isa/ErasureCodePluginIsa.cc \
 	erasure-code/isa/xor_op.cc
-@WITH_BETTER_YASM_ELF64_TRUE@am__objects_3 = erasure-code/libec_isa_la-ErasureCode.lo \
+@WITH_BETTER_YASM_ELF64_TRUE@am__objects_15 = erasure-code/libec_isa_la-ErasureCode.lo \
 @WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/erasure_code/libec_isa_la-ec_base.lo \
 @WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/erasure_code/libec_isa_la-ec_highlevel_func.lo \
 @WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/erasure_code/libec_isa_la-ec_multibinary.asm.lo \
@@ -2095,7 +1858,7 @@ am__libec_isa_la_SOURCES_DIST = erasure-code/ErasureCode.cc \
 @WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/libec_isa_la-ErasureCodePluginIsa.lo \
 @WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/libec_isa_la-xor_op.lo
 @WITH_BETTER_YASM_ELF64_TRUE@am_libec_isa_la_OBJECTS =  \
-@WITH_BETTER_YASM_ELF64_TRUE@	$(am__objects_3)
+@WITH_BETTER_YASM_ELF64_TRUE@	$(am__objects_15)
 libec_isa_la_OBJECTS = $(am_libec_isa_la_OBJECTS)
 libec_isa_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(libec_isa_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link \
@@ -2104,7 +1867,7 @@ libec_isa_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 @WITH_BETTER_YASM_ELF64_TRUE@am_libec_isa_la_rpath = -rpath \
 @WITH_BETTER_YASM_ELF64_TRUE@	$(erasure_codelibdir)
 libec_jerasure_la_DEPENDENCIES = $(LIBCRUSH) $(am__DEPENDENCIES_1) \
-	$(am__DEPENDENCIES_2)
+	$(am__DEPENDENCIES_3)
 am_libec_jerasure_la_OBJECTS = erasure-code/jerasure/libec_jerasure_la-ErasureCodePluginSelectJerasure.lo
 libec_jerasure_la_OBJECTS = $(am_libec_jerasure_la_OBJECTS)
 libec_jerasure_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
@@ -2112,8 +1875,9 @@ libec_jerasure_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(libec_jerasure_la_CXXFLAGS) $(CXXFLAGS) \
 	$(libec_jerasure_la_LDFLAGS) $(LDFLAGS) -o $@
 libec_jerasure_generic_la_DEPENDENCIES = $(LIBCRUSH) \
-	$(am__DEPENDENCIES_1) $(am__DEPENDENCIES_2)
-am__objects_4 = erasure-code/libec_jerasure_generic_la-ErasureCode.lo \
+	$(am__DEPENDENCIES_1) $(am__DEPENDENCIES_3)
+am__objects_16 =  \
+	erasure-code/libec_jerasure_generic_la-ErasureCode.lo \
 	erasure-code/jerasure/jerasure/src/libec_jerasure_generic_la-cauchy.lo \
 	erasure-code/jerasure/jerasure/src/libec_jerasure_generic_la-galois.lo \
 	erasure-code/jerasure/jerasure/src/libec_jerasure_generic_la-jerasure.lo \
@@ -2132,7 +1896,7 @@ am__objects_4 = erasure-code/libec_jerasure_generic_la-ErasureCode.lo \
 	erasure-code/jerasure/gf-complete/src/libec_jerasure_generic_la-gf_w8.lo \
 	erasure-code/jerasure/libec_jerasure_generic_la-ErasureCodePluginJerasure.lo \
 	erasure-code/jerasure/libec_jerasure_generic_la-ErasureCodeJerasure.lo
-am_libec_jerasure_generic_la_OBJECTS = $(am__objects_4)
+am_libec_jerasure_generic_la_OBJECTS = $(am__objects_16)
 libec_jerasure_generic_la_OBJECTS =  \
 	$(am_libec_jerasure_generic_la_OBJECTS)
 libec_jerasure_generic_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
@@ -2140,8 +1904,8 @@ libec_jerasure_generic_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(libec_jerasure_generic_la_CXXFLAGS) $(CXXFLAGS) \
 	$(libec_jerasure_generic_la_LDFLAGS) $(LDFLAGS) -o $@
 libec_jerasure_neon_la_DEPENDENCIES = $(LIBCRUSH) \
-	$(am__DEPENDENCIES_1) $(am__DEPENDENCIES_2)
-am__objects_5 = erasure-code/libec_jerasure_neon_la-ErasureCode.lo \
+	$(am__DEPENDENCIES_1) $(am__DEPENDENCIES_3)
+am__objects_17 = erasure-code/libec_jerasure_neon_la-ErasureCode.lo \
 	erasure-code/jerasure/jerasure/src/libec_jerasure_neon_la-cauchy.lo \
 	erasure-code/jerasure/jerasure/src/libec_jerasure_neon_la-galois.lo \
 	erasure-code/jerasure/jerasure/src/libec_jerasure_neon_la-jerasure.lo \
@@ -2160,7 +1924,7 @@ am__objects_5 = erasure-code/libec_jerasure_neon_la-ErasureCode.lo \
 	erasure-code/jerasure/gf-complete/src/libec_jerasure_neon_la-gf_w8.lo \
 	erasure-code/jerasure/libec_jerasure_neon_la-ErasureCodePluginJerasure.lo \
 	erasure-code/jerasure/libec_jerasure_neon_la-ErasureCodeJerasure.lo
-am_libec_jerasure_neon_la_OBJECTS = $(am__objects_5) \
+am_libec_jerasure_neon_la_OBJECTS = $(am__objects_17) \
 	erasure-code/jerasure/gf-complete/src/neon/libec_jerasure_neon_la-gf_w4_neon.lo \
 	erasure-code/jerasure/gf-complete/src/neon/libec_jerasure_neon_la-gf_w8_neon.lo \
 	erasure-code/jerasure/gf-complete/src/neon/libec_jerasure_neon_la-gf_w16_neon.lo \
@@ -2174,8 +1938,8 @@ libec_jerasure_neon_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 @HAVE_NEON_TRUE@am_libec_jerasure_neon_la_rpath = -rpath \
 @HAVE_NEON_TRUE@	$(erasure_codelibdir)
 libec_jerasure_sse3_la_DEPENDENCIES = $(LIBCRUSH) \
-	$(am__DEPENDENCIES_1) $(am__DEPENDENCIES_2)
-am__objects_6 = erasure-code/libec_jerasure_sse3_la-ErasureCode.lo \
+	$(am__DEPENDENCIES_1) $(am__DEPENDENCIES_3)
+am__objects_18 = erasure-code/libec_jerasure_sse3_la-ErasureCode.lo \
 	erasure-code/jerasure/jerasure/src/libec_jerasure_sse3_la-cauchy.lo \
 	erasure-code/jerasure/jerasure/src/libec_jerasure_sse3_la-galois.lo \
 	erasure-code/jerasure/jerasure/src/libec_jerasure_sse3_la-jerasure.lo \
@@ -2194,7 +1958,7 @@ am__objects_6 = erasure-code/libec_jerasure_sse3_la-ErasureCode.lo \
 	erasure-code/jerasure/gf-complete/src/libec_jerasure_sse3_la-gf_w8.lo \
 	erasure-code/jerasure/libec_jerasure_sse3_la-ErasureCodePluginJerasure.lo \
 	erasure-code/jerasure/libec_jerasure_sse3_la-ErasureCodeJerasure.lo
-am_libec_jerasure_sse3_la_OBJECTS = $(am__objects_6)
+am_libec_jerasure_sse3_la_OBJECTS = $(am__objects_18)
 libec_jerasure_sse3_la_OBJECTS = $(am_libec_jerasure_sse3_la_OBJECTS)
 libec_jerasure_sse3_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
@@ -2203,8 +1967,8 @@ libec_jerasure_sse3_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 @HAVE_SSSE3_TRUE@am_libec_jerasure_sse3_la_rpath = -rpath \
 @HAVE_SSSE3_TRUE@	$(erasure_codelibdir)
 libec_jerasure_sse4_la_DEPENDENCIES = $(LIBCRUSH) \
-	$(am__DEPENDENCIES_1) $(am__DEPENDENCIES_2)
-am__objects_7 = erasure-code/libec_jerasure_sse4_la-ErasureCode.lo \
+	$(am__DEPENDENCIES_1) $(am__DEPENDENCIES_3)
+am__objects_19 = erasure-code/libec_jerasure_sse4_la-ErasureCode.lo \
 	erasure-code/jerasure/jerasure/src/libec_jerasure_sse4_la-cauchy.lo \
 	erasure-code/jerasure/jerasure/src/libec_jerasure_sse4_la-galois.lo \
 	erasure-code/jerasure/jerasure/src/libec_jerasure_sse4_la-jerasure.lo \
@@ -2223,7 +1987,7 @@ am__objects_7 = erasure-code/libec_jerasure_sse4_la-ErasureCode.lo \
 	erasure-code/jerasure/gf-complete/src/libec_jerasure_sse4_la-gf_w8.lo \
 	erasure-code/jerasure/libec_jerasure_sse4_la-ErasureCodePluginJerasure.lo \
 	erasure-code/jerasure/libec_jerasure_sse4_la-ErasureCodeJerasure.lo
-am_libec_jerasure_sse4_la_OBJECTS = $(am__objects_7)
+am_libec_jerasure_sse4_la_OBJECTS = $(am__objects_19)
 libec_jerasure_sse4_la_OBJECTS = $(am_libec_jerasure_sse4_la_OBJECTS)
 libec_jerasure_sse4_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
@@ -2233,10 +1997,10 @@ libec_jerasure_sse4_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 @HAVE_SSE4_PCLMUL_TRUE@	$(erasure_codelibdir)
 libec_lrc_la_DEPENDENCIES = $(LIBCRUSH) $(am__DEPENDENCIES_1) \
 	$(LIBJSON_SPIRIT)
-am__objects_8 = erasure-code/libec_lrc_la-ErasureCode.lo \
+am__objects_20 = erasure-code/libec_lrc_la-ErasureCode.lo \
 	erasure-code/lrc/libec_lrc_la-ErasureCodePluginLrc.lo \
 	erasure-code/lrc/libec_lrc_la-ErasureCodeLrc.lo
-am_libec_lrc_la_OBJECTS = $(am__objects_8) \
+am_libec_lrc_la_OBJECTS = $(am__objects_20) \
 	common/libec_lrc_la-str_map.lo
 libec_lrc_la_OBJECTS = $(am_libec_lrc_la_OBJECTS)
 libec_lrc_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \
@@ -2244,7 +2008,7 @@ libec_lrc_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \
 	$(CXXFLAGS) $(libec_lrc_la_LDFLAGS) $(LDFLAGS) -o $@
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@libec_missing_entry_point_la_DEPENDENCIES =  \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_2)
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_3)
 am__libec_missing_entry_point_la_SOURCES_DIST =  \
 	test/erasure-code/ErasureCodePluginMissingEntryPoint.cc
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am_libec_missing_entry_point_la_OBJECTS = test/erasure-code/libec_missing_entry_point_la-ErasureCodePluginMissingEntryPoint.lo
@@ -2259,7 +2023,7 @@ libec_missing_entry_point_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(erasure_codelibdir)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@libec_missing_version_la_DEPENDENCIES =  \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_2)
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_3)
 am__libec_missing_version_la_SOURCES_DIST =  \
 	test/erasure-code/ErasureCodePluginMissingVersion.cc
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am_libec_missing_version_la_OBJECTS = test/erasure-code/libec_missing_version_la-ErasureCodePluginMissingVersion.lo
@@ -2273,7 +2037,7 @@ libec_missing_version_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	-rpath \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(erasure_codelibdir)
 libec_shec_la_DEPENDENCIES = $(LIBCRUSH) $(am__DEPENDENCIES_1) \
-	$(am__DEPENDENCIES_2)
+	$(am__DEPENDENCIES_3)
 am_libec_shec_la_OBJECTS = erasure-code/shec/libec_shec_la-ErasureCodePluginSelectShec.lo
 libec_shec_la_OBJECTS = $(am_libec_shec_la_OBJECTS)
 libec_shec_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
@@ -2281,8 +2045,8 @@ libec_shec_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(libec_shec_la_CXXFLAGS) $(CXXFLAGS) $(libec_shec_la_LDFLAGS) \
 	$(LDFLAGS) -o $@
 libec_shec_generic_la_DEPENDENCIES = $(LIBCRUSH) $(am__DEPENDENCIES_1) \
-	$(am__DEPENDENCIES_2)
-am__objects_9 = erasure-code/libec_shec_generic_la-ErasureCode.lo \
+	$(am__DEPENDENCIES_3)
+am__objects_21 = erasure-code/libec_shec_generic_la-ErasureCode.lo \
 	erasure-code/shec/libec_shec_generic_la-ErasureCodePluginShec.lo \
 	erasure-code/shec/libec_shec_generic_la-ErasureCodeShec.lo \
 	erasure-code/shec/libec_shec_generic_la-ErasureCodeShecTableCache.lo \
@@ -2303,15 +2067,15 @@ am__objects_9 = erasure-code/libec_shec_generic_la-ErasureCode.lo \
 	erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_w4.lo \
 	erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_rand.lo \
 	erasure-code/jerasure/gf-complete/src/libec_shec_generic_la-gf_w8.lo
-am_libec_shec_generic_la_OBJECTS = $(am__objects_9)
+am_libec_shec_generic_la_OBJECTS = $(am__objects_21)
 libec_shec_generic_la_OBJECTS = $(am_libec_shec_generic_la_OBJECTS)
 libec_shec_generic_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(libec_shec_generic_la_CXXFLAGS) $(CXXFLAGS) \
 	$(libec_shec_generic_la_LDFLAGS) $(LDFLAGS) -o $@
 libec_shec_neon_la_DEPENDENCIES = $(LIBCRUSH) $(am__DEPENDENCIES_1) \
-	$(am__DEPENDENCIES_2)
-am__objects_10 = erasure-code/libec_shec_neon_la-ErasureCode.lo \
+	$(am__DEPENDENCIES_3)
+am__objects_22 = erasure-code/libec_shec_neon_la-ErasureCode.lo \
 	erasure-code/shec/libec_shec_neon_la-ErasureCodePluginShec.lo \
 	erasure-code/shec/libec_shec_neon_la-ErasureCodeShec.lo \
 	erasure-code/shec/libec_shec_neon_la-ErasureCodeShecTableCache.lo \
@@ -2332,7 +2096,7 @@ am__objects_10 = erasure-code/libec_shec_neon_la-ErasureCode.lo \
 	erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_w4.lo \
 	erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_rand.lo \
 	erasure-code/jerasure/gf-complete/src/libec_shec_neon_la-gf_w8.lo
-am_libec_shec_neon_la_OBJECTS = $(am__objects_10) \
+am_libec_shec_neon_la_OBJECTS = $(am__objects_22) \
 	erasure-code/jerasure/gf-complete/src/neon/libec_shec_neon_la-gf_w4_neon.lo \
 	erasure-code/jerasure/gf-complete/src/neon/libec_shec_neon_la-gf_w8_neon.lo \
 	erasure-code/jerasure/gf-complete/src/neon/libec_shec_neon_la-gf_w16_neon.lo \
@@ -2346,8 +2110,8 @@ libec_shec_neon_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 @HAVE_NEON_TRUE@am_libec_shec_neon_la_rpath = -rpath \
 @HAVE_NEON_TRUE@	$(erasure_codelibdir)
 libec_shec_sse3_la_DEPENDENCIES = $(LIBCRUSH) $(am__DEPENDENCIES_1) \
-	$(am__DEPENDENCIES_2)
-am__objects_11 = erasure-code/libec_shec_sse3_la-ErasureCode.lo \
+	$(am__DEPENDENCIES_3)
+am__objects_23 = erasure-code/libec_shec_sse3_la-ErasureCode.lo \
 	erasure-code/shec/libec_shec_sse3_la-ErasureCodePluginShec.lo \
 	erasure-code/shec/libec_shec_sse3_la-ErasureCodeShec.lo \
 	erasure-code/shec/libec_shec_sse3_la-ErasureCodeShecTableCache.lo \
@@ -2368,7 +2132,7 @@ am__objects_11 = erasure-code/libec_shec_sse3_la-ErasureCode.lo \
 	erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_w4.lo \
 	erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_rand.lo \
 	erasure-code/jerasure/gf-complete/src/libec_shec_sse3_la-gf_w8.lo
-am_libec_shec_sse3_la_OBJECTS = $(am__objects_11)
+am_libec_shec_sse3_la_OBJECTS = $(am__objects_23)
 libec_shec_sse3_la_OBJECTS = $(am_libec_shec_sse3_la_OBJECTS)
 libec_shec_sse3_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
@@ -2377,8 +2141,8 @@ libec_shec_sse3_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 @HAVE_SSSE3_TRUE@am_libec_shec_sse3_la_rpath = -rpath \
 @HAVE_SSSE3_TRUE@	$(erasure_codelibdir)
 libec_shec_sse4_la_DEPENDENCIES = $(LIBCRUSH) $(am__DEPENDENCIES_1) \
-	$(am__DEPENDENCIES_2)
-am__objects_12 = erasure-code/libec_shec_sse4_la-ErasureCode.lo \
+	$(am__DEPENDENCIES_3)
+am__objects_24 = erasure-code/libec_shec_sse4_la-ErasureCode.lo \
 	erasure-code/shec/libec_shec_sse4_la-ErasureCodePluginShec.lo \
 	erasure-code/shec/libec_shec_sse4_la-ErasureCodeShec.lo \
 	erasure-code/shec/libec_shec_sse4_la-ErasureCodeShecTableCache.lo \
@@ -2399,7 +2163,7 @@ am__objects_12 = erasure-code/libec_shec_sse4_la-ErasureCode.lo \
 	erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_w4.lo \
 	erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_rand.lo \
 	erasure-code/jerasure/gf-complete/src/libec_shec_sse4_la-gf_w8.lo
-am_libec_shec_sse4_la_OBJECTS = $(am__objects_12)
+am_libec_shec_sse4_la_OBJECTS = $(am__objects_24)
 libec_shec_sse4_la_OBJECTS = $(am_libec_shec_sse4_la_OBJECTS)
 libec_shec_sse4_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
@@ -2409,7 +2173,7 @@ libec_shec_sse4_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 @HAVE_SSE4_PCLMUL_TRUE@	$(erasure_codelibdir)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@libec_test_jerasure_generic_la_DEPENDENCIES =  \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_2)
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_3)
 am__libec_test_jerasure_generic_la_SOURCES_DIST =  \
 	test/erasure-code/TestJerasurePluginGeneric.cc
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am_libec_test_jerasure_generic_la_OBJECTS = test/erasure-code/libec_test_jerasure_generic_la-TestJerasurePluginGeneric.lo
@@ -2424,7 +2188,7 @@ libec_test_jerasure_generic_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(erasure_codelibdir)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@libec_test_jerasure_neon_la_DEPENDENCIES =  \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_2)
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_3)
 am__libec_test_jerasure_neon_la_SOURCES_DIST =  \
 	test/erasure-code/TestJerasurePluginNEON.cc
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am_libec_test_jerasure_neon_la_OBJECTS = test/erasure-code/libec_test_jerasure_neon_la-TestJerasurePluginNEON.lo
@@ -2439,7 +2203,7 @@ libec_test_jerasure_neon_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(erasure_codelibdir)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@libec_test_jerasure_sse3_la_DEPENDENCIES =  \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_2)
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_3)
 am__libec_test_jerasure_sse3_la_SOURCES_DIST =  \
 	test/erasure-code/TestJerasurePluginSSE3.cc
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am_libec_test_jerasure_sse3_la_OBJECTS = test/erasure-code/libec_test_jerasure_sse3_la-TestJerasurePluginSSE3.lo
@@ -2454,7 +2218,7 @@ libec_test_jerasure_sse3_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(erasure_codelibdir)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@libec_test_jerasure_sse4_la_DEPENDENCIES =  \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_2)
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_3)
 am__libec_test_jerasure_sse4_la_SOURCES_DIST =  \
 	test/erasure-code/TestJerasurePluginSSE4.cc
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am_libec_test_jerasure_sse4_la_OBJECTS = test/erasure-code/libec_test_jerasure_sse4_la-TestJerasurePluginSSE4.lo
@@ -2469,7 +2233,7 @@ libec_test_jerasure_sse4_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(erasure_codelibdir)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@libec_test_shec_generic_la_DEPENDENCIES =  \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_2)
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_3)
 am__libec_test_shec_generic_la_SOURCES_DIST =  \
 	test/erasure-code/TestShecPluginGeneric.cc
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am_libec_test_shec_generic_la_OBJECTS = test/erasure-code/libec_test_shec_generic_la-TestShecPluginGeneric.lo
@@ -2484,7 +2248,7 @@ libec_test_shec_generic_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(erasure_codelibdir)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@libec_test_shec_neon_la_DEPENDENCIES =  \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_2)
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_3)
 am__libec_test_shec_neon_la_SOURCES_DIST =  \
 	test/erasure-code/TestShecPluginNEON.cc
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am_libec_test_shec_neon_la_OBJECTS = test/erasure-code/libec_test_shec_neon_la-TestShecPluginNEON.lo
@@ -2499,7 +2263,7 @@ libec_test_shec_neon_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(erasure_codelibdir)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@libec_test_shec_sse3_la_DEPENDENCIES =  \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_2)
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_3)
 am__libec_test_shec_sse3_la_SOURCES_DIST =  \
 	test/erasure-code/TestShecPluginSSE3.cc
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am_libec_test_shec_sse3_la_OBJECTS = test/erasure-code/libec_test_shec_sse3_la-TestShecPluginSSE3.lo
@@ -2514,7 +2278,7 @@ libec_test_shec_sse3_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(erasure_codelibdir)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@libec_test_shec_sse4_la_DEPENDENCIES =  \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_2)
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_3)
 am__libec_test_shec_sse4_la_SOURCES_DIST =  \
 	test/erasure-code/TestShecPluginSSE4.cc
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am_libec_test_shec_sse4_la_OBJECTS = test/erasure-code/libec_test_shec_sse4_la-TestShecPluginSSE4.lo
@@ -2532,8 +2296,29 @@ liberasure_code_la_OBJECTS = $(am_liberasure_code_la_OBJECTS)
 libglobal_la_DEPENDENCIES = $(LIBCOMMON)
 am_libglobal_la_OBJECTS = global/global_context.lo \
 	global/global_init.lo global/pidfile.lo \
-	global/signal_handler.lo
+	global/signal_handler.lo common/TrackedOp.lo
 libglobal_la_OBJECTS = $(am_libglobal_la_OBJECTS)
+libjournal_la_LIBADD =
+am__libjournal_la_SOURCES_DIST = journal/AsyncOpTracker.cc \
+	journal/Entry.cc journal/Future.cc journal/FutureImpl.cc \
+	journal/Journaler.cc journal/JournalMetadata.cc \
+	journal/JournalPlayer.cc journal/JournalRecorder.cc \
+	journal/JournalTrimmer.cc journal/ObjectPlayer.cc \
+	journal/ObjectRecorder.cc journal/Utils.cc
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@am_libjournal_la_OBJECTS = journal/AsyncOpTracker.lo \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	journal/Entry.lo \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	journal/Future.lo \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	journal/FutureImpl.lo \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	journal/Journaler.lo \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	journal/JournalMetadata.lo \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	journal/JournalPlayer.lo \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	journal/JournalRecorder.lo \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	journal/JournalTrimmer.lo \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	journal/ObjectPlayer.lo \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	journal/ObjectRecorder.lo \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	journal/Utils.lo
+libjournal_la_OBJECTS = $(am_libjournal_la_OBJECTS)
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@am_libjournal_la_rpath =
 libjson_spirit_la_DEPENDENCIES = $(am__DEPENDENCIES_1)
 am_libjson_spirit_la_OBJECTS = json_spirit/json_spirit_reader.lo \
 	json_spirit/json_spirit_writer.lo
@@ -2557,52 +2342,25 @@ am__libmds_la_SOURCES_DIST = mds/Capability.cc mds/MDSDaemon.cc \
 	mds/Migrator.cc mds/MDBalancer.cc mds/CDentry.cc mds/CDir.cc \
 	mds/CInode.cc mds/LogEvent.cc mds/MDSTable.cc mds/InoTable.cc \
 	mds/JournalPointer.cc mds/MDSTableClient.cc \
-	mds/MDSTableServer.cc mds/SimpleLock.cc mds/SnapRealm.cc \
-	mds/SnapServer.cc mds/snap.cc mds/SessionMap.cc \
-	mds/MDSContext.cc mds/MDSAuthCaps.cc mds/MDLog.cc \
-	common/TrackedOp.cc
-am__objects_13 = mds/Capability.lo mds/MDSDaemon.lo mds/MDSRank.lo \
+	mds/MDSTableServer.cc mds/SimpleLock.cc mds/ScrubStack.cc \
+	mds/SnapRealm.cc mds/SnapServer.cc mds/snap.cc \
+	mds/SessionMap.cc mds/MDSContext.cc mds/MDSAuthCaps.cc \
+	mds/MDLog.cc
+am__objects_25 = mds/Capability.lo mds/MDSDaemon.lo mds/MDSRank.lo \
 	mds/Beacon.lo mds/locks.lo mds/journal.lo mds/Server.lo \
 	mds/Mutation.lo mds/MDCache.lo mds/RecoveryQueue.lo \
 	mds/StrayManager.lo mds/Locker.lo mds/Migrator.lo \
 	mds/MDBalancer.lo mds/CDentry.lo mds/CDir.lo mds/CInode.lo \
 	mds/LogEvent.lo mds/MDSTable.lo mds/InoTable.lo \
 	mds/JournalPointer.lo mds/MDSTableClient.lo \
-	mds/MDSTableServer.lo mds/SimpleLock.lo mds/SnapRealm.lo \
-	mds/SnapServer.lo mds/snap.lo mds/SessionMap.lo \
-	mds/MDSContext.lo mds/MDSAuthCaps.lo mds/MDLog.lo \
-	common/TrackedOp.lo
+	mds/MDSTableServer.lo mds/SimpleLock.lo mds/ScrubStack.lo \
+	mds/SnapRealm.lo mds/SnapServer.lo mds/snap.lo \
+	mds/SessionMap.lo mds/MDSContext.lo mds/MDSAuthCaps.lo \
+	mds/MDLog.lo
 @ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@am_libmds_la_OBJECTS =  \
-@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@	$(am__objects_13)
+@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@	$(am__objects_25)
 libmds_la_OBJECTS = $(am_libmds_la_OBJECTS)
 @ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@am_libmds_la_rpath =
-@WITH_LIBZFS_TRUE@am__DEPENDENCIES_4 = libos_zfs.a
-am__DEPENDENCIES_5 = libos.la $(am__DEPENDENCIES_1) \
-	$(am__DEPENDENCIES_4) $(am__append_12)
-@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@libmon_la_DEPENDENCIES =  \
-@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	$(LIBAUTH) $(LIBCOMMON) \
-@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	$(am__DEPENDENCIES_5) \
-@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	$(LIBMON_TYPES)
-am__libmon_la_SOURCES_DIST = mon/Monitor.cc mon/Paxos.cc \
-	mon/PaxosService.cc mon/OSDMonitor.cc mon/MDSMonitor.cc \
-	mon/MonmapMonitor.cc mon/PGMonitor.cc mon/LogMonitor.cc \
-	mon/AuthMonitor.cc mon/Elector.cc mon/HealthMonitor.cc \
-	mon/DataHealthService.cc mon/ConfigKeyService.cc
-@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@am_libmon_la_OBJECTS =  \
-@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	mon/Monitor.lo mon/Paxos.lo \
-@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	mon/PaxosService.lo \
-@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	mon/OSDMonitor.lo \
-@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	mon/MDSMonitor.lo \
-@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	mon/MonmapMonitor.lo \
-@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	mon/PGMonitor.lo \
-@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	mon/LogMonitor.lo \
-@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	mon/AuthMonitor.lo \
-@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	mon/Elector.lo \
-@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	mon/HealthMonitor.lo \
-@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	mon/DataHealthService.lo \
-@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	mon/ConfigKeyService.lo
-libmon_la_OBJECTS = $(am_libmon_la_OBJECTS)
-@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@am_libmon_la_rpath =
 libmon_types_la_LIBADD =
 am_libmon_types_la_OBJECTS = mon/PGMap.lo
 libmon_types_la_OBJECTS = $(am_libmon_types_la_OBJECTS)
@@ -2618,11 +2376,11 @@ am__libmsg_la_SOURCES_DIST = msg/Message.cc msg/Messenger.cc \
 	msg/async/EventKqueue.h msg/xio/QueueStrategy.cc \
 	msg/xio/XioConnection.cc msg/xio/XioMessenger.cc \
 	msg/xio/XioMsg.cc msg/xio/XioPortal.cc msg/xio/XioPool.cc
-@LINUX_TRUE@am__objects_14 = msg/async/EventEpoll.lo
-@DARWIN_TRUE@am__objects_15 = msg/async/EventKqueue.lo
-@FREEBSD_TRUE@am__objects_16 = msg/async/EventKqueue.lo
-am__objects_17 =
-@ENABLE_XIO_TRUE@am__objects_18 = msg/xio/QueueStrategy.lo \
+@LINUX_TRUE@am__objects_26 = msg/async/EventEpoll.lo
+@DARWIN_TRUE@am__objects_27 = msg/async/EventKqueue.lo
+@FREEBSD_TRUE@am__objects_28 = msg/async/EventKqueue.lo
+am__objects_29 =
+@ENABLE_XIO_TRUE@am__objects_30 = msg/xio/QueueStrategy.lo \
 @ENABLE_XIO_TRUE@	msg/xio/XioConnection.lo \
 @ENABLE_XIO_TRUE@	msg/xio/XioMessenger.lo msg/xio/XioMsg.lo \
 @ENABLE_XIO_TRUE@	msg/xio/XioPortal.lo msg/xio/XioPool.lo
@@ -2632,68 +2390,10 @@ am_libmsg_la_OBJECTS = msg/Message.lo msg/Messenger.lo \
 	msg/simple/PipeConnection.lo msg/simple/SimpleMessenger.lo \
 	msg/async/AsyncConnection.lo msg/async/AsyncMessenger.lo \
 	msg/async/Event.lo msg/async/net_handler.lo \
-	msg/async/EventSelect.lo $(am__objects_14) $(am__objects_15) \
-	$(am__objects_16) $(am__objects_17) $(am__objects_17) \
-	$(am__objects_17) $(am__objects_18)
+	msg/async/EventSelect.lo $(am__objects_26) $(am__objects_27) \
+	$(am__objects_28) $(am__objects_29) $(am__objects_29) \
+	$(am__objects_29) $(am__objects_30)
 libmsg_la_OBJECTS = $(am_libmsg_la_OBJECTS)
-@ENABLE_SERVER_TRUE@@WITH_KINETIC_TRUE@am__DEPENDENCIES_6 =  \
-@ENABLE_SERVER_TRUE@@WITH_KINETIC_TRUE@	libcrypto.a
-@ENABLE_SERVER_TRUE@libos_la_DEPENDENCIES = $(LIBOS_TYPES) \
-@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_6)
-am__libos_la_SOURCES_DIST = os/chain_xattr.cc os/fs/FS.cc \
-	os/DBObjectMap.cc os/GenericObjectMap.cc os/FileJournal.cc \
-	os/FileStore.cc os/GenericFileStoreBackend.cc os/HashIndex.cc \
-	os/IndexManager.cc os/JournalingObjectStore.cc \
-	os/LevelDBStore.cc os/LFNIndex.cc os/MemStore.cc \
-	os/KeyValueDB.cc os/KeyValueStore.cc os/ObjectStore.cc \
-	os/WBThrottle.cc common/TrackedOp.cc \
-	os/BtrfsFileStoreBackend.cc os/newstore/NewStore.cc \
-	os/fs/XFS.cc os/XfsFileStoreBackend.cc \
-	os/ZFSFileStoreBackend.cc os/KineticStore.cc
-@ENABLE_SERVER_TRUE@@LINUX_TRUE@am__objects_19 = os/libos_la-BtrfsFileStoreBackend.lo
-@ENABLE_SERVER_TRUE@@WITH_LIBAIO_TRUE@am__objects_20 = os/newstore/libos_la-NewStore.lo
-@ENABLE_SERVER_TRUE@@WITH_LIBXFS_TRUE@am__objects_21 =  \
-@ENABLE_SERVER_TRUE@@WITH_LIBXFS_TRUE@	os/fs/libos_la-XFS.lo \
-@ENABLE_SERVER_TRUE@@WITH_LIBXFS_TRUE@	os/libos_la-XfsFileStoreBackend.lo
-@ENABLE_SERVER_TRUE@@WITH_LIBZFS_TRUE@am__objects_22 = os/libos_la-ZFSFileStoreBackend.lo
-@ENABLE_SERVER_TRUE@@WITH_KINETIC_TRUE@am__objects_23 = os/libos_la-KineticStore.lo
-@ENABLE_SERVER_TRUE@am_libos_la_OBJECTS = os/libos_la-chain_xattr.lo \
-@ENABLE_SERVER_TRUE@	os/fs/libos_la-FS.lo \
-@ENABLE_SERVER_TRUE@	os/libos_la-DBObjectMap.lo \
-@ENABLE_SERVER_TRUE@	os/libos_la-GenericObjectMap.lo \
-@ENABLE_SERVER_TRUE@	os/libos_la-FileJournal.lo \
-@ENABLE_SERVER_TRUE@	os/libos_la-FileStore.lo \
-@ENABLE_SERVER_TRUE@	os/libos_la-GenericFileStoreBackend.lo \
-@ENABLE_SERVER_TRUE@	os/libos_la-HashIndex.lo \
-@ENABLE_SERVER_TRUE@	os/libos_la-IndexManager.lo \
-@ENABLE_SERVER_TRUE@	os/libos_la-JournalingObjectStore.lo \
-@ENABLE_SERVER_TRUE@	os/libos_la-LevelDBStore.lo \
-@ENABLE_SERVER_TRUE@	os/libos_la-LFNIndex.lo \
-@ENABLE_SERVER_TRUE@	os/libos_la-MemStore.lo \
-@ENABLE_SERVER_TRUE@	os/libos_la-KeyValueDB.lo \
-@ENABLE_SERVER_TRUE@	os/libos_la-KeyValueStore.lo \
-@ENABLE_SERVER_TRUE@	os/libos_la-ObjectStore.lo \
-@ENABLE_SERVER_TRUE@	os/libos_la-WBThrottle.lo \
-@ENABLE_SERVER_TRUE@	common/libos_la-TrackedOp.lo \
-@ENABLE_SERVER_TRUE@	$(am__objects_19) $(am__objects_20) \
-@ENABLE_SERVER_TRUE@	$(am__objects_21) $(am__objects_22) \
-@ENABLE_SERVER_TRUE@	$(am__objects_23)
-libos_la_OBJECTS = $(am_libos_la_OBJECTS)
-libos_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \
-	$(LIBTOOLFLAGS) --mode=link $(CXXLD) $(libos_la_CXXFLAGS) \
-	$(CXXFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@
-@ENABLE_SERVER_TRUE@am_libos_la_rpath =
-@ENABLE_SERVER_TRUE@@WITH_SLIBROCKSDB_TRUE@libos_rocksdb_la_DEPENDENCIES = rocksdb/librocksdb.la
-am__libos_rocksdb_la_SOURCES_DIST = os/RocksDBStore.cc
-@ENABLE_SERVER_TRUE@@WITH_DLIBROCKSDB_FALSE@@WITH_SLIBROCKSDB_TRUE@am_libos_rocksdb_la_OBJECTS = os/libos_rocksdb_la-RocksDBStore.lo
-@ENABLE_SERVER_TRUE@@WITH_DLIBROCKSDB_TRUE@am_libos_rocksdb_la_OBJECTS = os/libos_rocksdb_la-RocksDBStore.lo
-libos_rocksdb_la_OBJECTS = $(am_libos_rocksdb_la_OBJECTS)
-libos_rocksdb_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
-	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
-	$(libos_rocksdb_la_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
-	$(LDFLAGS) -o $@
-@ENABLE_SERVER_TRUE@@WITH_DLIBROCKSDB_TRUE@am_libos_rocksdb_la_rpath =
-@ENABLE_SERVER_TRUE@@WITH_SLIBROCKSDB_TRUE@am_libos_rocksdb_la_rpath =
 libos_tp_la_DEPENDENCIES =
 am__libos_tp_la_SOURCES_DIST = tracing/objectstore.c
 @WITH_LTTNG_TRUE@am_libos_tp_la_OBJECTS =  \
@@ -2705,50 +2405,6 @@ libos_tp_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
 	$(LIBTOOLFLAGS) --mode=link $(CCLD) $(libos_tp_la_CFLAGS) \
 	$(CFLAGS) $(libos_tp_la_LDFLAGS) $(LDFLAGS) -o $@
 @WITH_LTTNG_TRUE@am_libos_tp_la_rpath = -rpath $(libdir)
-libos_types_la_LIBADD =
-am__libos_types_la_SOURCES_DIST = os/Transaction.cc \
-	os/newstore/newstore_types.cc
-@ENABLE_SERVER_TRUE@@WITH_LIBAIO_TRUE@am__objects_24 = os/newstore/libos_types_la-newstore_types.lo
-am_libos_types_la_OBJECTS = os/libos_types_la-Transaction.lo \
-	$(am__objects_24)
-libos_types_la_OBJECTS = $(am_libos_types_la_OBJECTS)
-libos_types_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
-	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
-	$(libos_types_la_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
-	$(LDFLAGS) -o $@
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@libosd_la_DEPENDENCIES =  \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBOSDC) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_5) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBOSD_TYPES) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBOS_TYPES)
-am__libosd_la_SOURCES_DIST = osd/PG.cc osd/ReplicatedPG.cc \
-	osd/ReplicatedBackend.cc osd/ECBackend.cc osd/ECMsgTypes.cc \
-	osd/ECTransaction.cc osd/PGBackend.cc osd/HitSet.cc osd/OSD.cc \
-	osd/OSDCap.cc osd/Watch.cc osd/ClassHandler.cc \
-	osd/OpRequest.cc common/TrackedOp.cc osd/SnapMapper.cc \
-	objclass/class_api.cc
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am_libosd_la_OBJECTS =  \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/libosd_la-PG.lo \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/libosd_la-ReplicatedPG.lo \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/libosd_la-ReplicatedBackend.lo \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/libosd_la-ECBackend.lo \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/libosd_la-ECMsgTypes.lo \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/libosd_la-ECTransaction.lo \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/libosd_la-PGBackend.lo \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/libosd_la-HitSet.lo \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/libosd_la-OSD.lo \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/libosd_la-OSDCap.lo \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/libosd_la-Watch.lo \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/libosd_la-ClassHandler.lo \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/libosd_la-OpRequest.lo \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	common/libosd_la-TrackedOp.lo \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/libosd_la-SnapMapper.lo \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	objclass/libosd_la-class_api.lo
-libosd_la_OBJECTS = $(am_libosd_la_OBJECTS)
-libosd_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \
-	$(LIBTOOLFLAGS) --mode=link $(CXXLD) $(libosd_la_CXXFLAGS) \
-	$(CXXFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am_libosd_la_rpath =
 libosd_tp_la_DEPENDENCIES =
 am__libosd_tp_la_SOURCES_DIST = tracing/oprequest.c tracing/osd.c \
 	tracing/pg.c
@@ -2779,25 +2435,25 @@ libperfglue_la_DEPENDENCIES =
 am__libperfglue_la_SOURCES_DIST = perfglue/heap_profiler.cc \
 	perfglue/disabled_heap_profiler.cc perfglue/cpu_profiler.cc \
 	perfglue/disabled_stubs.cc
-@WITH_TCMALLOC_TRUE@am__objects_25 = perfglue/heap_profiler.lo
-@WITH_TCMALLOC_FALSE@@WITH_TCMALLOC_MINIMAL_TRUE@am__objects_26 = perfglue/heap_profiler.lo
-@WITH_TCMALLOC_FALSE@@WITH_TCMALLOC_MINIMAL_FALSE@am__objects_27 = perfglue/disabled_heap_profiler.lo
-@WITH_PROFILER_TRUE@am__objects_28 = perfglue/cpu_profiler.lo
-@WITH_PROFILER_FALSE@am__objects_29 = perfglue/disabled_stubs.lo
-am_libperfglue_la_OBJECTS = $(am__objects_25) $(am__objects_26) \
-	$(am__objects_27) $(am__objects_28) $(am__objects_29)
+@WITH_TCMALLOC_TRUE@am__objects_31 = perfglue/heap_profiler.lo
+@WITH_TCMALLOC_FALSE@@WITH_TCMALLOC_MINIMAL_TRUE@am__objects_32 = perfglue/heap_profiler.lo
+@WITH_TCMALLOC_FALSE@@WITH_TCMALLOC_MINIMAL_FALSE@am__objects_33 = perfglue/disabled_heap_profiler.lo
+@WITH_PROFILER_TRUE@am__objects_34 = perfglue/cpu_profiler.lo
+@WITH_PROFILER_FALSE@am__objects_35 = perfglue/disabled_stubs.lo
+am_libperfglue_la_OBJECTS = $(am__objects_31) $(am__objects_32) \
+	$(am__objects_33) $(am__objects_34) $(am__objects_35)
 libperfglue_la_OBJECTS = $(am_libperfglue_la_OBJECTS)
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@am__DEPENDENCIES_7 =  \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@am__DEPENDENCIES_5 =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	librados_internal.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	libcls_lock_client.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(LIBOSDC) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_3)
-am__DEPENDENCIES_8 = $(am__DEPENDENCIES_7)
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_4)
+am__DEPENDENCIES_6 = $(am__DEPENDENCIES_5)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@librados_la_DEPENDENCIES =  \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_8) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_6) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_1) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_1) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_2)
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_3)
 am__librados_la_SOURCES_DIST = common/buffer.cc librados/librados.cc
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@am_librados_la_OBJECTS = common/librados_la-buffer.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	librados/librados_la-librados.lo
@@ -2856,15 +2512,15 @@ librados_tp_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC \
 	$(librados_tp_la_CFLAGS) $(CFLAGS) $(librados_tp_la_LDFLAGS) \
 	$(LDFLAGS) -o $@
 @WITH_LTTNG_TRUE@am_librados_tp_la_rpath = -rpath $(libdir)
-@ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@am__DEPENDENCIES_9 = librados_internal.la \
+@ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@am__DEPENDENCIES_7 = librados_internal.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@	libcls_lock_client.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@	$(LIBOSDC) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_3)
-@ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@libradosstriper_la_DEPENDENCIES = $(am__DEPENDENCIES_9) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_4)
+@ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@libradosstriper_la_DEPENDENCIES = $(am__DEPENDENCIES_7) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_1) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_1) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_2)
+@ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_3)
 am__libradosstriper_la_SOURCES_DIST =  \
 	libradosstriper/libradosstriper.cc \
 	libradosstriper/RadosStriperImpl.cc \
@@ -2902,14 +2558,16 @@ libradostest_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@am_libradostest_la_rpath =
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@librbd_la_DEPENDENCIES = librbd_internal.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRBD_TYPES) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	libjournal.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRADOS) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBCOMMON) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBOSDC) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librados_internal.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	libcls_rbd_client.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	libcls_lock_client.la \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	libcls_journal_client.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__DEPENDENCIES_1) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__DEPENDENCIES_2)
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__DEPENDENCIES_3)
 am__librbd_la_SOURCES_DIST = librbd/librbd.cc
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am_librbd_la_OBJECTS = librbd/librbd_la-librbd.lo
 librbd_la_OBJECTS = $(am_librbd_la_OBJECTS)
@@ -2926,16 +2584,20 @@ librbd_api_la_OBJECTS = $(am_librbd_api_la_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am_librbd_api_la_rpath =
 librbd_internal_la_LIBADD =
 am__librbd_internal_la_SOURCES_DIST = librbd/AioCompletion.cc \
-	librbd/AioRequest.cc librbd/AsyncFlattenRequest.cc \
+	librbd/AioImageRequest.cc librbd/AioImageRequestWQ.cc \
+	librbd/AioObjectRequest.cc librbd/AsyncFlattenRequest.cc \
 	librbd/AsyncObjectThrottle.cc librbd/AsyncOperation.cc \
 	librbd/AsyncRequest.cc librbd/AsyncResizeRequest.cc \
 	librbd/AsyncTrimRequest.cc librbd/CopyupRequest.cc \
 	librbd/DiffIterate.cc librbd/ImageCtx.cc \
-	librbd/ImageWatcher.cc librbd/internal.cc \
+	librbd/ImageWatcher.cc librbd/internal.cc librbd/Journal.cc \
+	librbd/JournalReplay.cc librbd/LibrbdAdminSocketHook.cc \
 	librbd/LibrbdWriteback.cc librbd/ObjectMap.cc \
 	librbd/RebuildObjectMapRequest.cc
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am_librbd_internal_la_OBJECTS = librbd/AioCompletion.lo \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AioRequest.lo \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AioImageRequest.lo \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AioImageRequestWQ.lo \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AioObjectRequest.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AsyncFlattenRequest.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AsyncObjectThrottle.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AsyncOperation.lo \
@@ -2947,17 +2609,20 @@ am__librbd_internal_la_SOURCES_DIST = librbd/AioCompletion.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/ImageCtx.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/ImageWatcher.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/internal.lo \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/Journal.lo \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/JournalReplay.lo \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/LibrbdAdminSocketHook.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/LibrbdWriteback.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/ObjectMap.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/RebuildObjectMapRequest.lo
 librbd_internal_la_OBJECTS = $(am_librbd_internal_la_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am_librbd_internal_la_rpath =
-am__DEPENDENCIES_10 = $(LIBGLOBAL) $(LIBCOMMON) $(am__DEPENDENCIES_1) \
-	$(am__DEPENDENCIES_1) $(am__DEPENDENCIES_2)
+am__DEPENDENCIES_8 = $(LIBGLOBAL) $(LIBCOMMON) $(am__DEPENDENCIES_1) \
+	$(am__DEPENDENCIES_1) $(am__DEPENDENCIES_3)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@librbd_replay_la_DEPENDENCIES =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRBD) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRADOS) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__DEPENDENCIES_8)
 am__librbd_replay_la_SOURCES_DIST = rbd_replay/actions.cc \
 	rbd_replay/BufferReader.cc rbd_replay/ImageNameMap.cc \
 	rbd_replay/PendingIO.cc rbd_replay/rbd_loc.cc \
@@ -2973,7 +2638,7 @@ librbd_replay_la_OBJECTS = $(am_librbd_replay_la_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@librbd_replay_ios_la_DEPENDENCIES =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRBD) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRADOS) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__DEPENDENCIES_10) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__DEPENDENCIES_8) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd_replay.la
 am__librbd_replay_ios_la_SOURCES_DIST = rbd_replay/ios.cc
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am_librbd_replay_ios_la_OBJECTS = rbd_replay/ios.lo
@@ -2988,12 +2653,16 @@ librbd_test_la_LIBADD =
 am__librbd_test_la_SOURCES_DIST = test/librbd/test_fixture.cc \
 	test/librbd/test_support.cc test/librbd/test_librbd.cc \
 	test/librbd/test_ImageWatcher.cc test/librbd/test_internal.cc \
+	test/librbd/test_JournalEntries.cc \
+	test/librbd/test_JournalReplay.cc \
 	test/librbd/test_ObjectMap.cc
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am_librbd_test_la_OBJECTS = test/librbd/librbd_test_la-test_fixture.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/librbd_test_la-test_support.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/librbd_test_la-test_librbd.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/librbd_test_la-test_ImageWatcher.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/librbd_test_la-test_internal.lo \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/librbd_test_la-test_JournalEntries.lo \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/librbd_test_la-test_JournalReplay.lo \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/librbd_test_la-test_ObjectMap.lo
 librbd_test_la_OBJECTS = $(am_librbd_test_la_OBJECTS)
 librbd_test_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
@@ -3013,7 +2682,8 @@ librbd_tp_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
 	$(CFLAGS) $(librbd_tp_la_LDFLAGS) $(LDFLAGS) -o $@
 @WITH_LTTNG_TRUE@am_librbd_tp_la_rpath = -rpath $(libdir)
 librbd_types_la_LIBADD =
-am_librbd_types_la_OBJECTS = librbd/WatchNotifyTypes.lo
+am_librbd_types_la_OBJECTS = librbd/JournalTypes.lo \
+	librbd/WatchNotifyTypes.lo
 librbd_types_la_OBJECTS = $(am_librbd_types_la_OBJECTS)
 librgw_la_LIBADD =
 am__librgw_la_SOURCES_DIST = rgw/librgw.cc rgw/rgw_acl.cc \
@@ -3073,7 +2743,7 @@ libsecret_la_DEPENDENCIES = $(am__DEPENDENCIES_1)
 am_libsecret_la_OBJECTS = common/secret.lo
 libsecret_la_OBJECTS = $(am_libsecret_la_OBJECTS)
 @LINUX_TRUE@am_libsecret_la_rpath =
-@LINUX_TRUE@libsystest_la_DEPENDENCIES = $(am__DEPENDENCIES_10)
+@LINUX_TRUE@libsystest_la_DEPENDENCIES = $(am__DEPENDENCIES_8)
 am__libsystest_la_SOURCES_DIST = test/system/cross_process_sem.cc \
 	test/system/systest_runnable.cc \
 	test/system/systest_settings.cc
@@ -3110,6 +2780,7 @@ libsystest_la_OBJECTS = $(am_libsystest_la_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_test_cls_lock$(EXEEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_test_cls_hello$(EXEEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_test_cls_numops$(EXEEXT) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_test_cls_journal$(EXEEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_test_rados_api_cmd$(EXEEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_test_rados_api_io$(EXEEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	ceph_test_rados_api_c_write_operations$(EXEEXT) \
@@ -3201,14 +2872,14 @@ am__EXEEXT_26 = $(am__EXEEXT_1) $(am__EXEEXT_2) $(am__EXEEXT_3) \
 @ENABLE_CLIENT_TRUE@@WITH_BABELTRACE_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am__EXEEXT_30 = rbd-replay-prep$(EXEEXT)
 @ENABLE_CLIENT_TRUE@am__EXEEXT_31 = ceph-dencoder$(EXEEXT)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@am__EXEEXT_32 = rados$(EXEEXT)
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am__EXEEXT_33 = ceph-objectstore-tool$(EXEEXT)
-@ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE@am__EXEEXT_34 = cephfs-journal-tool$(EXEEXT) \
+@ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am__EXEEXT_33 = rbd$(EXEEXT)
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am__EXEEXT_34 = ceph-objectstore-tool$(EXEEXT)
+@ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE@am__EXEEXT_35 = cephfs-journal-tool$(EXEEXT) \
 @ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE@	cephfs-table-tool$(EXEEXT) \
 @ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE@	cephfs-data-scan$(EXEEXT)
-@ENABLE_CLIENT_TRUE@am__EXEEXT_35 = ceph-syn$(EXEEXT)
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@am__EXEEXT_36 =  \
+@ENABLE_CLIENT_TRUE@am__EXEEXT_36 = ceph-syn$(EXEEXT)
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@am__EXEEXT_37 =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	librados-config$(EXEEXT)
-@ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am__EXEEXT_37 = rbd$(EXEEXT)
 @ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@@WITH_RADOS_TRUE@am__EXEEXT_38 = ceph-fuse$(EXEEXT)
 @ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am__EXEEXT_39 = rbd-fuse$(EXEEXT)
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@am__EXEEXT_40 = cephfs$(EXEEXT)
@@ -3230,7 +2901,8 @@ am__EXEEXT_26 = $(am__EXEEXT_1) $(am__EXEEXT_2) $(am__EXEEXT_3) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_erasure_code_plugin_shec$(EXEEXT) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	unittest_erasure_code_example$(EXEEXT)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@am__EXEEXT_47 = unittest_librados$(EXEEXT) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	unittest_librados_config$(EXEEXT)
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	unittest_librados_config$(EXEEXT) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	unittest_journal$(EXEEXT)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am__EXEEXT_48 = unittest_rbd_replay$(EXEEXT)
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@am__EXEEXT_49 = unittest_encoding$(EXEEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	unittest_base64$(EXEEXT) \
@@ -3297,17 +2969,17 @@ PROGRAMS = $(bin_PROGRAMS) $(noinst_PROGRAMS) $(sbin_PROGRAMS) \
 	$(su_sbin_PROGRAMS)
 am_ceph_authtool_OBJECTS = tools/ceph_authtool.$(OBJEXT)
 ceph_authtool_OBJECTS = $(am_ceph_authtool_OBJECTS)
-ceph_authtool_DEPENDENCIES = $(am__DEPENDENCIES_10) $(LIBCOMMON)
+ceph_authtool_DEPENDENCIES = $(am__DEPENDENCIES_8)
 am__ceph_client_debug_SOURCES_DIST = tools/ceph-client-debug.cc
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@am_ceph_client_debug_OBJECTS = tools/ceph-client-debug.$(OBJEXT)
 ceph_client_debug_OBJECTS = $(am_ceph_client_debug_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@ceph_client_debug_DEPENDENCIES = $(LIBCEPHFS) \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(LIBCLIENT) \
-@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_10) \
+@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_8) \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(LIBCOMMON)
 am_ceph_conf_OBJECTS = tools/ceph_conf.$(OBJEXT)
 ceph_conf_OBJECTS = $(am_ceph_conf_OBJECTS)
-ceph_conf_DEPENDENCIES = $(am__DEPENDENCIES_10) $(LIBCOMMON)
+ceph_conf_DEPENDENCIES = $(am__DEPENDENCIES_8)
 am__ceph_dencoder_SOURCES_DIST = test/encoding/ceph_dencoder.cc \
 	mds/Capability.cc mds/MDSDaemon.cc mds/MDSRank.cc \
 	mds/Beacon.cc mds/locks.c mds/journal.cc mds/Server.cc \
@@ -3316,13 +2988,13 @@ am__ceph_dencoder_SOURCES_DIST = test/encoding/ceph_dencoder.cc \
 	mds/MDBalancer.cc mds/CDentry.cc mds/CDir.cc mds/CInode.cc \
 	mds/LogEvent.cc mds/MDSTable.cc mds/InoTable.cc \
 	mds/JournalPointer.cc mds/MDSTableClient.cc \
-	mds/MDSTableServer.cc mds/SimpleLock.cc mds/SnapRealm.cc \
-	mds/SnapServer.cc mds/snap.cc mds/SessionMap.cc \
-	mds/MDSContext.cc mds/MDSAuthCaps.cc mds/MDLog.cc \
-	common/TrackedOp.cc perfglue/disabled_heap_profiler.cc \
+	mds/MDSTableServer.cc mds/SimpleLock.cc mds/ScrubStack.cc \
+	mds/SnapRealm.cc mds/SnapServer.cc mds/snap.cc \
+	mds/SessionMap.cc mds/MDSContext.cc mds/MDSAuthCaps.cc \
+	mds/MDLog.cc perfglue/disabled_heap_profiler.cc \
 	perfglue/disabled_stubs.cc rgw/rgw_dencoder.cc rgw/rgw_acl.cc \
 	rgw/rgw_common.cc rgw/rgw_env.cc rgw/rgw_json_enc.cc
-am__objects_30 = mds/ceph_dencoder-Capability.$(OBJEXT) \
+am__objects_36 = mds/ceph_dencoder-Capability.$(OBJEXT) \
 	mds/ceph_dencoder-MDSDaemon.$(OBJEXT) \
 	mds/ceph_dencoder-MDSRank.$(OBJEXT) \
 	mds/ceph_dencoder-Beacon.$(OBJEXT) \
@@ -3346,32 +3018,32 @@ am__objects_30 = mds/ceph_dencoder-Capability.$(OBJEXT) \
 	mds/ceph_dencoder-MDSTableClient.$(OBJEXT) \
 	mds/ceph_dencoder-MDSTableServer.$(OBJEXT) \
 	mds/ceph_dencoder-SimpleLock.$(OBJEXT) \
+	mds/ceph_dencoder-ScrubStack.$(OBJEXT) \
 	mds/ceph_dencoder-SnapRealm.$(OBJEXT) \
 	mds/ceph_dencoder-SnapServer.$(OBJEXT) \
 	mds/ceph_dencoder-snap.$(OBJEXT) \
 	mds/ceph_dencoder-SessionMap.$(OBJEXT) \
 	mds/ceph_dencoder-MDSContext.$(OBJEXT) \
 	mds/ceph_dencoder-MDSAuthCaps.$(OBJEXT) \
-	mds/ceph_dencoder-MDLog.$(OBJEXT) \
-	common/ceph_dencoder-TrackedOp.$(OBJEXT)
-@ENABLE_CLIENT_TRUE@am__objects_31 = $(am__objects_30)
-@ENABLE_CLIENT_TRUE@am__objects_32 =  \
+	mds/ceph_dencoder-MDLog.$(OBJEXT)
+@ENABLE_CLIENT_TRUE@am__objects_37 = $(am__objects_36)
+@ENABLE_CLIENT_TRUE@am__objects_38 =  \
 @ENABLE_CLIENT_TRUE@	rgw/ceph_dencoder-rgw_dencoder.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@	rgw/ceph_dencoder-rgw_acl.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@	rgw/ceph_dencoder-rgw_common.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@	rgw/ceph_dencoder-rgw_env.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@	rgw/ceph_dencoder-rgw_json_enc.$(OBJEXT)
-am__objects_33 = $(am__objects_31) \
+am__objects_39 = $(am__objects_37) \
 	perfglue/ceph_dencoder-disabled_heap_profiler.$(OBJEXT) \
 	perfglue/ceph_dencoder-disabled_stubs.$(OBJEXT) \
-	$(am__objects_32)
+	$(am__objects_38)
 @ENABLE_CLIENT_TRUE@am_ceph_dencoder_OBJECTS = test/encoding/ceph_dencoder-ceph_dencoder.$(OBJEXT) \
-@ENABLE_CLIENT_TRUE@	$(am__objects_33)
+@ENABLE_CLIENT_TRUE@	$(am__objects_39)
 ceph_dencoder_OBJECTS = $(am_ceph_dencoder_OBJECTS)
 @ENABLE_CLIENT_TRUE@ceph_dencoder_DEPENDENCIES = $(LIBRBD_TYPES) \
 @ENABLE_CLIENT_TRUE@	$(LIBOSD_TYPES) $(LIBOS_TYPES) \
 @ENABLE_CLIENT_TRUE@	$(LIBMON_TYPES) $(DENCODER_DEPS) \
-@ENABLE_CLIENT_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_CLIENT_TRUE@	$(am__DEPENDENCIES_8)
 ceph_dencoder_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(ceph_dencoder_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) $(LDFLAGS) \
@@ -3380,13 +3052,17 @@ am__ceph_fuse_SOURCES_DIST = ceph_fuse.cc
 @ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@@WITH_RADOS_TRUE@am_ceph_fuse_OBJECTS = ceph_fuse.$(OBJEXT)
 ceph_fuse_OBJECTS = $(am_ceph_fuse_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@@WITH_RADOS_TRUE@ceph_fuse_DEPENDENCIES = $(LIBCLIENT_FUSE) \
-@ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_8)
 am__ceph_kvstore_tool_SOURCES_DIST = tools/ceph_kvstore_tool.cc
 @ENABLE_SERVER_TRUE@am_ceph_kvstore_tool_OBJECTS = tools/ceph_kvstore_tool-ceph_kvstore_tool.$(OBJEXT)
 ceph_kvstore_tool_OBJECTS = $(am_ceph_kvstore_tool_OBJECTS)
+@WITH_LIBZFS_TRUE@am__DEPENDENCIES_9 = libos_zfs.a
+am__DEPENDENCIES_10 = libkv.a $(am__append_23)
+am__DEPENDENCIES_11 = libos.a $(am__DEPENDENCIES_1) \
+	$(am__DEPENDENCIES_9) $(LIBOS_TYPES) $(am__DEPENDENCIES_10)
 @ENABLE_SERVER_TRUE@ceph_kvstore_tool_DEPENDENCIES =  \
-@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_5) \
-@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_11) \
+@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_8)
 ceph_kvstore_tool_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(ceph_kvstore_tool_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -3395,81 +3071,88 @@ am__ceph_mds_SOURCES_DIST = ceph_mds.cc
 @ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@am_ceph_mds_OBJECTS =  \
 @ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@	ceph_mds.$(OBJEXT)
 ceph_mds_OBJECTS = $(am_ceph_mds_OBJECTS)
-am__DEPENDENCIES_11 = libperfglue.la $(am__DEPENDENCIES_1) \
+am__DEPENDENCIES_12 = libperfglue.la $(am__DEPENDENCIES_1) \
 	$(am__DEPENDENCIES_1)
-am__DEPENDENCIES_12 = libmds.la $(am__DEPENDENCIES_1) \
-	$(am__DEPENDENCIES_11)
+am__DEPENDENCIES_13 = libmds.la $(am__DEPENDENCIES_1) \
+	$(am__DEPENDENCIES_12)
 @ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@ceph_mds_DEPENDENCIES =  \
-@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@	$(am__DEPENDENCIES_12) \
+@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@	$(am__DEPENDENCIES_13) \
 @ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@	$(LIBOSDC) \
-@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@	$(am__DEPENDENCIES_10) \
+@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@	$(am__DEPENDENCIES_8) \
 @ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@	$(LIBCOMMON)
 am__ceph_mon_SOURCES_DIST = ceph_mon.cc
 @ENABLE_SERVER_TRUE@@WITH_MON_TRUE@am_ceph_mon_OBJECTS =  \
 @ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	ceph_mon.$(OBJEXT)
 ceph_mon_OBJECTS = $(am_ceph_mon_OBJECTS)
-am__DEPENDENCIES_13 = libmon.la $(am__DEPENDENCIES_1) \
-	$(am__DEPENDENCIES_11)
+am__DEPENDENCIES_14 = libmon.a $(am__DEPENDENCIES_1) \
+	$(am__DEPENDENCIES_12) $(LIBMON_TYPES)
 @ENABLE_SERVER_TRUE@@WITH_MON_TRUE@ceph_mon_DEPENDENCIES =  \
-@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	$(am__DEPENDENCIES_13) \
-@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	$(am__DEPENDENCIES_5) \
-@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	$(am__DEPENDENCIES_10) \
-@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	$(LIBCOMMON)
+@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	$(am__DEPENDENCIES_14) \
+@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	$(am__DEPENDENCIES_11) \
+@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	$(am__DEPENDENCIES_8) \
+@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	$(LIBCOMMON) $(LIBAUTH) \
+@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	$(LIBCOMMON) \
+@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	$(LIBMON_TYPES)
 am__ceph_monstore_tool_SOURCES_DIST = tools/ceph_monstore_tool.cc
 @ENABLE_SERVER_TRUE at am_ceph_monstore_tool_OBJECTS =  \
 @ENABLE_SERVER_TRUE@	tools/ceph_monstore_tool.$(OBJEXT)
 ceph_monstore_tool_OBJECTS = $(am_ceph_monstore_tool_OBJECTS)
 @ENABLE_SERVER_TRUE at ceph_monstore_tool_DEPENDENCIES =  \
- at ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_5) \
- at ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_10) \
+ at ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_11) \
+ at ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_8) \
 @ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_1)
 am__ceph_objectstore_tool_SOURCES_DIST =  \
 	tools/ceph_objectstore_tool.cc tools/RadosDump.cc
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am_ceph_objectstore_tool_OBJECTS = tools/ceph_objectstore_tool.$(OBJEXT) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	tools/RadosDump.$(OBJEXT)
 ceph_objectstore_tool_OBJECTS = $(am_ceph_objectstore_tool_OBJECTS)
-am__DEPENDENCIES_14 = libosd.la $(am__DEPENDENCIES_1) $(LIBOSDC) \
-	$(am__DEPENDENCIES_5) $(am__DEPENDENCIES_11)
+am__DEPENDENCIES_15 = libosd.a $(am__DEPENDENCIES_1) $(LIBOSDC) \
+	$(am__DEPENDENCIES_11) $(am__DEPENDENCIES_12) $(LIBOSD_TYPES) \
+	$(LIBOS_TYPES)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at ceph_objectstore_tool_DEPENDENCIES =  \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_14) \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_5) \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_10) \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_15) \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_11) \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_8) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1)
 am__ceph_osd_SOURCES_DIST = ceph_osd.cc
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at am_ceph_osd_OBJECTS =  \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	ceph_osd.$(OBJEXT)
 ceph_osd_OBJECTS = $(am_ceph_osd_OBJECTS)
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at ceph_osd_DEPENDENCIES =  \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_14) \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_10) \
- at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBCOMMON)
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE at ceph_osd_DEPENDENCIES = $(LIBOSDC) \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_15) \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBOSD_TYPES) \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBOS_TYPES) \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_11) \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_8) \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBCOMMON) \
+ at ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1)
 am__ceph_osdomap_tool_SOURCES_DIST = tools/ceph_osdomap_tool.cc
 @ENABLE_SERVER_TRUE at am_ceph_osdomap_tool_OBJECTS =  \
 @ENABLE_SERVER_TRUE@	tools/ceph_osdomap_tool.$(OBJEXT)
 ceph_osdomap_tool_OBJECTS = $(am_ceph_osdomap_tool_OBJECTS)
 @ENABLE_SERVER_TRUE at ceph_osdomap_tool_DEPENDENCIES =  \
- at ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_5) \
- at ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_10) \
+ at ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_11) \
+ at ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_8) \
 @ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_1)
 am__ceph_syn_SOURCES_DIST = ceph_syn.cc client/SyntheticClient.cc
 @ENABLE_CLIENT_TRUE at am_ceph_syn_OBJECTS = ceph_syn.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@	client/SyntheticClient.$(OBJEXT)
 ceph_syn_OBJECTS = $(am_ceph_syn_OBJECTS)
 @ENABLE_CLIENT_TRUE at ceph_syn_DEPENDENCIES = $(LIBCLIENT) \
- at ENABLE_CLIENT_TRUE@	$(am__DEPENDENCIES_10)
+ at ENABLE_CLIENT_TRUE@	$(am__DEPENDENCIES_8)
 am_ceph_bench_log_OBJECTS = test/bench_log.$(OBJEXT)
 ceph_bench_log_OBJECTS = $(am_ceph_bench_log_OBJECTS)
-ceph_bench_log_DEPENDENCIES = $(am__DEPENDENCIES_10)
+ceph_bench_log_DEPENDENCIES = $(am__DEPENDENCIES_8)
 am__ceph_erasure_code_SOURCES_DIST =  \
 	test/erasure-code/ceph_erasure_code.cc
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am_ceph_erasure_code_OBJECTS = test/erasure-code/ceph_erasure_code.$(OBJEXT)
 ceph_erasure_code_OBJECTS = $(am_ceph_erasure_code_OBJECTS)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@ceph_erasure_code_DEPENDENCIES =  \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_14) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_15) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBCOMMON) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_10) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_8) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1)
 am__ceph_erasure_code_benchmark_SOURCES_DIST =  \
 	erasure-code/ErasureCode.cc \
@@ -3479,10 +3162,10 @@ am__ceph_erasure_code_benchmark_SOURCES_DIST =  \
 ceph_erasure_code_benchmark_OBJECTS =  \
 	$(am_ceph_erasure_code_benchmark_OBJECTS)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@ceph_erasure_code_benchmark_DEPENDENCIES =  \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_14) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_15) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBCOMMON) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_10) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_8) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1)
 am__ceph_erasure_code_non_regression_SOURCES_DIST =  \
 	test/erasure-code/ceph_erasure_code_non_regression.cc
@@ -3490,10 +3173,10 @@ am__ceph_erasure_code_non_regression_SOURCES_DIST =  \
 ceph_erasure_code_non_regression_OBJECTS =  \
 	$(am_ceph_erasure_code_non_regression_OBJECTS)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@ceph_erasure_code_non_regression_DEPENDENCIES =  \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_14) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_15) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBCOMMON) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_10) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_8) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1)
 am__ceph_kvstorebench_SOURCES_DIST = test/kv_store_bench.cc \
 	key_value_store/kv_flat_btree_async.cc
@@ -3502,27 +3185,27 @@ am__ceph_kvstorebench_SOURCES_DIST = test/kv_store_bench.cc \
 ceph_kvstorebench_OBJECTS = $(am_ceph_kvstorebench_OBJECTS)
 @ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@ceph_kvstorebench_DEPENDENCIES =  \
 @ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS) \
-@ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_8)
 am__ceph_multi_stress_watch_SOURCES_DIST = test/multi_stress_watch.cc
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@am_ceph_multi_stress_watch_OBJECTS = test/multi_stress_watch.$(OBJEXT)
 ceph_multi_stress_watch_OBJECTS =  \
 	$(am_ceph_multi_stress_watch_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@ceph_multi_stress_watch_DEPENDENCIES =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_10) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_8) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(RADOS_TEST_LDADD)
 am__ceph_objectstore_bench_SOURCES_DIST = test/objectstore_bench.cc
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@am_ceph_objectstore_bench_OBJECTS = test/objectstore_bench.$(OBJEXT)
 ceph_objectstore_bench_OBJECTS = $(am_ceph_objectstore_bench_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@ceph_objectstore_bench_DEPENDENCIES =  \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_5) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_11) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_8)
 am__ceph_omapbench_SOURCES_DIST = test/omap_bench.cc
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@am_ceph_omapbench_OBJECTS = test/omap_bench.$(OBJEXT)
 ceph_omapbench_OBJECTS = $(am_ceph_omapbench_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@ceph_omapbench_DEPENDENCIES =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_8)
 am__ceph_perf_local_SOURCES_DIST = test/perf_local.cc \
 	test/perf_helper.cc
 @ENABLE_SERVER_TRUE@am_ceph_perf_local_OBJECTS =  \
@@ -3530,8 +3213,8 @@ am__ceph_perf_local_SOURCES_DIST = test/perf_local.cc \
 @ENABLE_SERVER_TRUE@	test/ceph_perf_local-perf_helper.$(OBJEXT)
 ceph_perf_local_OBJECTS = $(am_ceph_perf_local_OBJECTS)
 @ENABLE_SERVER_TRUE@ceph_perf_local_DEPENDENCIES =  \
-@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_5) \
-@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_11) \
+@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_8)
 ceph_perf_local_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(ceph_perf_local_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -3540,14 +3223,14 @@ am__ceph_perf_msgr_client_SOURCES_DIST =  \
 	test/msgr/perf_msgr_client.cc
 @ENABLE_SERVER_TRUE@am_ceph_perf_msgr_client_OBJECTS = test/msgr/ceph_perf_msgr_client-perf_msgr_client.$(OBJEXT)
 ceph_perf_msgr_client_OBJECTS = $(am_ceph_perf_msgr_client_OBJECTS)
-am__DEPENDENCIES_15 = $(top_builddir)/src/gmock/lib/libgmock_main.la \
+am__DEPENDENCIES_16 = $(top_builddir)/src/gmock/lib/libgmock_main.la \
 	$(top_builddir)/src/gmock/lib/libgmock.la \
 	$(top_builddir)/src/gmock/gtest/lib/libgtest.la \
-	$(am__DEPENDENCIES_1)
+	$(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1)
 @ENABLE_SERVER_TRUE@ceph_perf_msgr_client_DEPENDENCIES =  \
-@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_5) \
-@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_15) \
-@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_11) \
+@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_16) \
+@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_8)
 ceph_perf_msgr_client_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(ceph_perf_msgr_client_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -3557,9 +3240,9 @@ am__ceph_perf_msgr_server_SOURCES_DIST =  \
 @ENABLE_SERVER_TRUE@am_ceph_perf_msgr_server_OBJECTS = test/msgr/ceph_perf_msgr_server-perf_msgr_server.$(OBJEXT)
 ceph_perf_msgr_server_OBJECTS = $(am_ceph_perf_msgr_server_OBJECTS)
 @ENABLE_SERVER_TRUE@ceph_perf_msgr_server_DEPENDENCIES =  \
-@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_5) \
-@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_15) \
-@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_11) \
+@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_16) \
+@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_8)
 ceph_perf_msgr_server_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(ceph_perf_msgr_server_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -3569,23 +3252,23 @@ am__ceph_perf_objectstore_SOURCES_DIST =  \
 @ENABLE_SERVER_TRUE@am_ceph_perf_objectstore_OBJECTS = test/objectstore/ceph_perf_objectstore-ObjectStoreTransactionBenchmark.$(OBJEXT)
 ceph_perf_objectstore_OBJECTS = $(am_ceph_perf_objectstore_OBJECTS)
 @ENABLE_SERVER_TRUE@ceph_perf_objectstore_DEPENDENCIES =  \
-@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_5) \
-@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_15) \
-@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_11) \
+@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_16) \
+@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_8)
 ceph_perf_objectstore_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(ceph_perf_objectstore_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
 	$(LDFLAGS) -o $@
 am_ceph_psim_OBJECTS = tools/psim.$(OBJEXT)
 ceph_psim_OBJECTS = $(am_ceph_psim_OBJECTS)
-ceph_psim_DEPENDENCIES = $(am__DEPENDENCIES_10)
+ceph_psim_DEPENDENCIES = $(am__DEPENDENCIES_8)
 am__ceph_radosacl_SOURCES_DIST = tools/radosacl.cc
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@am_ceph_radosacl_OBJECTS =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	tools/radosacl.$(OBJEXT)
 ceph_radosacl_OBJECTS = $(am_ceph_radosacl_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@ceph_radosacl_DEPENDENCIES =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_8)
 am__ceph_rgw_jsonparser_SOURCES_DIST = rgw/rgw_jsonparser.cc \
 	rgw/rgw_common.cc rgw/rgw_env.cc rgw/rgw_json_enc.cc
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@am_ceph_rgw_jsonparser_OBJECTS = rgw/rgw_jsonparser.$(OBJEXT) \
@@ -3593,8 +3276,8 @@ am__ceph_rgw_jsonparser_SOURCES_DIST = rgw/rgw_jsonparser.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/rgw_env.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/rgw_json_enc.$(OBJEXT)
 ceph_rgw_jsonparser_OBJECTS = $(am_ceph_rgw_jsonparser_OBJECTS)
-am__DEPENDENCIES_16 = librgw.la $(am__DEPENDENCIES_1)
-@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@am__DEPENDENCIES_17 = $(LIBRADOS) \
+am__DEPENDENCIES_17 = librgw.la $(am__DEPENDENCIES_1)
+@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@am__DEPENDENCIES_18 = $(LIBRADOS) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	libcls_rgw_client.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	libcls_log_client.a \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	libcls_statelog_client.a \
@@ -3604,28 +3287,28 @@ am__DEPENDENCIES_16 = librgw.la $(am__DEPENDENCIES_1)
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	libcls_lock_client.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	libcls_refcount_client.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	libcls_version_client.a
-am__DEPENDENCIES_18 = $(am__DEPENDENCIES_17)
-@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@ceph_rgw_jsonparser_DEPENDENCIES = $(am__DEPENDENCIES_16) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_18) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_10)
+am__DEPENDENCIES_19 = $(am__DEPENDENCIES_18)
+@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@ceph_rgw_jsonparser_DEPENDENCIES = $(am__DEPENDENCIES_17) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_19) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_8)
 am__ceph_rgw_multiparser_SOURCES_DIST = rgw/rgw_multiparser.cc
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@am_ceph_rgw_multiparser_OBJECTS = rgw/rgw_multiparser.$(OBJEXT)
 ceph_rgw_multiparser_OBJECTS = $(am_ceph_rgw_multiparser_OBJECTS)
-@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@ceph_rgw_multiparser_DEPENDENCIES = $(am__DEPENDENCIES_16) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_18) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@ceph_rgw_multiparser_DEPENDENCIES = $(am__DEPENDENCIES_17) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_19) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_8)
 am__ceph_scratchtool_SOURCES_DIST = tools/scratchtool.c
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@am_ceph_scratchtool_OBJECTS = tools/scratchtool.$(OBJEXT)
 ceph_scratchtool_OBJECTS = $(am_ceph_scratchtool_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@ceph_scratchtool_DEPENDENCIES =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_8)
 am__ceph_scratchtoolpp_SOURCES_DIST = tools/scratchtoolpp.cc
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@am_ceph_scratchtoolpp_OBJECTS = tools/scratchtoolpp.$(OBJEXT)
 ceph_scratchtoolpp_OBJECTS = $(am_ceph_scratchtoolpp_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@ceph_scratchtoolpp_DEPENDENCIES =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_8)
 am__ceph_smalliobench_SOURCES_DIST = test/bench/small_io_bench.cc \
 	test/bench/rados_backend.cc \
 	test/bench/detailed_stat_collector.cc test/bench/bencher.cc
@@ -3637,7 +3320,7 @@ ceph_smalliobench_OBJECTS = $(am_ceph_smalliobench_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@ceph_smalliobench_DEPENDENCIES =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_1) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_8)
 am__ceph_smalliobenchdumb_SOURCES_DIST =  \
 	test/bench/small_io_bench_dumb.cc test/bench/dumb_backend.cc \
 	test/bench/detailed_stat_collector.cc test/bench/bencher.cc
@@ -3648,8 +3331,8 @@ am__ceph_smalliobenchdumb_SOURCES_DIST =  \
 ceph_smalliobenchdumb_OBJECTS = $(am_ceph_smalliobenchdumb_OBJECTS)
 @ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_RADOS_TRUE@ceph_smalliobenchdumb_DEPENDENCIES = $(LIBRADOS) \
 @ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_1) \
-@ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_5) \
-@ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_11) \
+@ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_8)
 am__ceph_smalliobenchfs_SOURCES_DIST =  \
 	test/bench/small_io_bench_fs.cc \
 	test/bench/testfilestore_backend.cc \
@@ -3661,8 +3344,8 @@ am__ceph_smalliobenchfs_SOURCES_DIST =  \
 ceph_smalliobenchfs_OBJECTS = $(am_ceph_smalliobenchfs_OBJECTS)
 @ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_RADOS_TRUE@ceph_smalliobenchfs_DEPENDENCIES = $(LIBRADOS) \
 @ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_1) \
-@ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_5) \
-@ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_11) \
+@ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_8)
 am__ceph_smalliobenchrbd_SOURCES_DIST =  \
 	test/bench/small_io_bench_rbd.cc test/bench/rbd_backend.cc \
 	test/bench/detailed_stat_collector.cc test/bench/bencher.cc
@@ -3675,22 +3358,22 @@ ceph_smalliobenchrbd_OBJECTS = $(am_ceph_smalliobenchrbd_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRBD) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRADOS) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__DEPENDENCIES_1) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__DEPENDENCIES_8)
 am__ceph_streamtest_SOURCES_DIST = test/streamtest.cc
 @ENABLE_SERVER_TRUE@am_ceph_streamtest_OBJECTS =  \
 @ENABLE_SERVER_TRUE@	test/streamtest.$(OBJEXT)
 ceph_streamtest_OBJECTS = $(am_ceph_streamtest_OBJECTS)
 @ENABLE_SERVER_TRUE@ceph_streamtest_DEPENDENCIES =  \
-@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_5) \
-@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_11) \
+@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_8)
 am__ceph_test_async_driver_SOURCES_DIST =  \
 	test/msgr/test_async_driver.cc
 @ENABLE_SERVER_TRUE@am_ceph_test_async_driver_OBJECTS = test/msgr/ceph_test_async_driver-test_async_driver.$(OBJEXT)
 ceph_test_async_driver_OBJECTS = $(am_ceph_test_async_driver_OBJECTS)
 @ENABLE_SERVER_TRUE@ceph_test_async_driver_DEPENDENCIES =  \
-@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_5) \
-@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_15) \
-@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_11) \
+@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_16) \
+@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_8)
 ceph_test_async_driver_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(ceph_test_async_driver_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -3716,13 +3399,27 @@ ceph_test_cls_hello_OBJECTS = $(am_ceph_test_cls_hello_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@ceph_test_cls_hello_DEPENDENCIES =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_1) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_15) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_10) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_16) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_8) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(RADOS_TEST_LDADD)
 ceph_test_cls_hello_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(ceph_test_cls_hello_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
 	$(LDFLAGS) -o $@
+am__ceph_test_cls_journal_SOURCES_DIST =  \
+	test/cls_journal/test_cls_journal.cc
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@am_ceph_test_cls_journal_OBJECTS = test/cls_journal/ceph_test_cls_journal-test_cls_journal.$(OBJEXT)
+ceph_test_cls_journal_OBJECTS = $(am_ceph_test_cls_journal_OBJECTS)
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@ceph_test_cls_journal_DEPENDENCIES =  \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	libcls_journal_client.la \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS) $(LIBCOMMON) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_1) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_16) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(RADOS_TEST_LDADD)
+ceph_test_cls_journal_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
+	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
+	$(ceph_test_cls_journal_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
+	$(LDFLAGS) -o $@
 am__ceph_test_cls_lock_SOURCES_DIST = test/cls_lock/test_cls_lock.cc
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@am_ceph_test_cls_lock_OBJECTS = test/cls_lock/ceph_test_cls_lock-test_cls_lock.$(OBJEXT)
 ceph_test_cls_lock_OBJECTS = $(am_ceph_test_cls_lock_OBJECTS)
@@ -3730,7 +3427,7 @@ ceph_test_cls_lock_OBJECTS = $(am_ceph_test_cls_lock_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	libcls_lock_client.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(LIBCOMMON) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_15) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_16) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(RADOS_TEST_LDADD)
 ceph_test_cls_lock_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
@@ -3742,8 +3439,8 @@ ceph_test_cls_log_OBJECTS = $(am_ceph_test_cls_log_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@ceph_test_cls_log_DEPENDENCIES =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	libcls_log_client.a \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_15) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_10) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_16) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_8) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(RADOS_TEST_LDADD)
 ceph_test_cls_log_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
@@ -3756,8 +3453,8 @@ ceph_test_cls_numops_OBJECTS = $(am_ceph_test_cls_numops_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@ceph_test_cls_numops_DEPENDENCIES =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	libcls_numops_client.la \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_15) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_10) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_16) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_8) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(RADOS_TEST_LDADD)
 ceph_test_cls_numops_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
@@ -3771,10 +3468,10 @@ ceph_test_cls_rbd_OBJECTS = $(am_ceph_test_cls_rbd_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	libcls_rbd_client.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	libcls_lock_client.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(LIBCOMMON) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_15) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_16) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(RADOS_TEST_LDADD) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_1) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_2)
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_3)
 ceph_test_cls_rbd_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(ceph_test_cls_rbd_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -3786,7 +3483,7 @@ ceph_test_cls_refcount_OBJECTS = $(am_ceph_test_cls_refcount_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@ceph_test_cls_refcount_DEPENDENCIES =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	libcls_refcount_client.la \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_15) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_16) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(RADOS_TEST_LDADD)
 ceph_test_cls_refcount_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
@@ -3800,8 +3497,8 @@ ceph_test_cls_replica_log_OBJECTS =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@ceph_test_cls_replica_log_DEPENDENCIES =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	libcls_replica_log_client.a \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_15) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_10) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_16) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_8) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(RADOS_TEST_LDADD)
 ceph_test_cls_replica_log_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
@@ -3814,8 +3511,8 @@ ceph_test_cls_rgw_OBJECTS = $(am_ceph_test_cls_rgw_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_1) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	libcls_rgw_client.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(LIBCOMMON) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_15) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_10) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_16) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_8) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(RADOS_TEST_LDADD)
 ceph_test_cls_rgw_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
@@ -3825,9 +3522,9 @@ am__ceph_test_cls_rgw_log_SOURCES_DIST = test/test_rgw_admin_log.cc
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@am_ceph_test_cls_rgw_log_OBJECTS = test/ceph_test_cls_rgw_log-test_rgw_admin_log.$(OBJEXT)
 ceph_test_cls_rgw_log_OBJECTS = $(am_ceph_test_cls_rgw_log_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@ceph_test_cls_rgw_log_DEPENDENCIES = $(LIBRADOS) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_17) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_8) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_16) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_10) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_15) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_1) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	libcls_version_client.a \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	libcls_log_client.a \
@@ -3844,9 +3541,9 @@ am__ceph_test_cls_rgw_meta_SOURCES_DIST = test/test_rgw_admin_meta.cc
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@am_ceph_test_cls_rgw_meta_OBJECTS = test/ceph_test_cls_rgw_meta-test_rgw_admin_meta.$(OBJEXT)
 ceph_test_cls_rgw_meta_OBJECTS = $(am_ceph_test_cls_rgw_meta_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@ceph_test_cls_rgw_meta_DEPENDENCIES = $(LIBRADOS) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_17) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_8) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_16) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_10) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_15) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_1) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	libcls_version_client.a \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	libcls_log_client.a \
@@ -3865,9 +3562,9 @@ am__ceph_test_cls_rgw_opstate_SOURCES_DIST =  \
 ceph_test_cls_rgw_opstate_OBJECTS =  \
 	$(am_ceph_test_cls_rgw_opstate_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@ceph_test_cls_rgw_opstate_DEPENDENCIES = $(LIBRADOS) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_17) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_8) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_16) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_10) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_15) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_1) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	libcls_version_client.a \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	libcls_log_client.a \
@@ -3889,8 +3586,8 @@ ceph_test_cls_statelog_OBJECTS = $(am_ceph_test_cls_statelog_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@ceph_test_cls_statelog_DEPENDENCIES =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	libcls_statelog_client.a \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_15) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_10) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_16) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_8) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(RADOS_TEST_LDADD)
 ceph_test_cls_statelog_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
@@ -3903,7 +3600,7 @@ ceph_test_cls_version_OBJECTS = $(am_ceph_test_cls_version_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@ceph_test_cls_version_DEPENDENCIES =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	libcls_version_client.a \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_15) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_16) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(RADOS_TEST_LDADD)
 ceph_test_cls_version_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
@@ -3913,23 +3610,23 @@ am__ceph_test_cors_SOURCES_DIST = test/test_cors.cc
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@am_ceph_test_cors_OBJECTS = test/ceph_test_cors-test_cors.$(OBJEXT)
 ceph_test_cors_OBJECTS = $(am_ceph_test_cors_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@ceph_test_cors_DEPENDENCIES = $(LIBRADOS) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_16) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_10) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_15)
+@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_17) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_8) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_16)
 ceph_test_cors_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(ceph_test_cors_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
 	$(LDFLAGS) -o $@
 am_ceph_test_crypto_OBJECTS = test/testcrypto.$(OBJEXT)
 ceph_test_crypto_OBJECTS = $(am_ceph_test_crypto_OBJECTS)
-ceph_test_crypto_DEPENDENCIES = $(am__DEPENDENCIES_10)
+ceph_test_crypto_DEPENDENCIES = $(am__DEPENDENCIES_8)
 am__ceph_test_filejournal_SOURCES_DIST = test/test_filejournal.cc
 @ENABLE_SERVER_TRUE@am_ceph_test_filejournal_OBJECTS = test/ceph_test_filejournal-test_filejournal.$(OBJEXT)
 ceph_test_filejournal_OBJECTS = $(am_ceph_test_filejournal_OBJECTS)
 @ENABLE_SERVER_TRUE@ceph_test_filejournal_DEPENDENCIES =  \
-@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_5) \
-@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_15) \
-@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_11) \
+@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_16) \
+@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_8)
 ceph_test_filejournal_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(ceph_test_filejournal_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -3939,9 +3636,9 @@ am__ceph_test_filestore_SOURCES_DIST =  \
 @ENABLE_SERVER_TRUE@@LINUX_TRUE@am_ceph_test_filestore_OBJECTS = test/filestore/ceph_test_filestore-TestFileStore.$(OBJEXT)
 ceph_test_filestore_OBJECTS = $(am_ceph_test_filestore_OBJECTS)
 @ENABLE_SERVER_TRUE@@LINUX_TRUE@ceph_test_filestore_DEPENDENCIES =  \
-@ENABLE_SERVER_TRUE@@LINUX_TRUE@	$(am__DEPENDENCIES_5) \
-@ENABLE_SERVER_TRUE@@LINUX_TRUE@	$(am__DEPENDENCIES_15) \
-@ENABLE_SERVER_TRUE@@LINUX_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_SERVER_TRUE@@LINUX_TRUE@	$(am__DEPENDENCIES_11) \
+@ENABLE_SERVER_TRUE@@LINUX_TRUE@	$(am__DEPENDENCIES_16) \
+@ENABLE_SERVER_TRUE@@LINUX_TRUE@	$(am__DEPENDENCIES_8)
 ceph_test_filestore_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(ceph_test_filestore_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -3956,8 +3653,8 @@ am__ceph_test_filestore_idempotent_SOURCES_DIST =  \
 ceph_test_filestore_idempotent_OBJECTS =  \
 	$(am_ceph_test_filestore_idempotent_OBJECTS)
 @ENABLE_SERVER_TRUE@ceph_test_filestore_idempotent_DEPENDENCIES =  \
-@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_5) \
-@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_11) \
+@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_8)
 am__ceph_test_filestore_idempotent_sequence_SOURCES_DIST =  \
 	test/objectstore/test_idempotent_sequence.cc \
 	test/objectstore/DeterministicOpSequence.cc \
@@ -3970,8 +3667,8 @@ am__ceph_test_filestore_idempotent_sequence_SOURCES_DIST =  \
 ceph_test_filestore_idempotent_sequence_OBJECTS =  \
 	$(am_ceph_test_filestore_idempotent_sequence_OBJECTS)
 @ENABLE_SERVER_TRUE@ceph_test_filestore_idempotent_sequence_DEPENDENCIES =  \
-@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_5) \
-@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_11) \
+@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_8)
 am_ceph_test_get_blkdev_size_OBJECTS =  \
 	test/test_get_blkdev_size.$(OBJEXT)
 ceph_test_get_blkdev_size_OBJECTS =  \
@@ -3987,15 +3684,15 @@ am__ceph_test_keys_SOURCES_DIST = test/testkeys.cc
 @ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	test/testkeys.$(OBJEXT)
 ceph_test_keys_OBJECTS = $(am_ceph_test_keys_OBJECTS)
 @ENABLE_SERVER_TRUE@@WITH_MON_TRUE@ceph_test_keys_DEPENDENCIES =  \
-@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	$(am__DEPENDENCIES_13) \
-@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	$(am__DEPENDENCIES_14) \
+@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	$(am__DEPENDENCIES_8)
 am__ceph_test_keyvaluedb_SOURCES_DIST = test/objectstore/test_kv.cc
 @ENABLE_SERVER_TRUE@@LINUX_TRUE@am_ceph_test_keyvaluedb_OBJECTS = test/objectstore/ceph_test_keyvaluedb-test_kv.$(OBJEXT)
 ceph_test_keyvaluedb_OBJECTS = $(am_ceph_test_keyvaluedb_OBJECTS)
 @ENABLE_SERVER_TRUE@@LINUX_TRUE@ceph_test_keyvaluedb_DEPENDENCIES =  \
-@ENABLE_SERVER_TRUE@@LINUX_TRUE@	$(am__DEPENDENCIES_5) \
-@ENABLE_SERVER_TRUE@@LINUX_TRUE@	$(am__DEPENDENCIES_15) \
-@ENABLE_SERVER_TRUE@@LINUX_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_SERVER_TRUE@@LINUX_TRUE@	$(am__DEPENDENCIES_11) \
+@ENABLE_SERVER_TRUE@@LINUX_TRUE@	$(am__DEPENDENCIES_16) \
+@ENABLE_SERVER_TRUE@@LINUX_TRUE@	$(am__DEPENDENCIES_8)
 ceph_test_keyvaluedb_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(ceph_test_keyvaluedb_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -4006,9 +3703,9 @@ am__ceph_test_keyvaluedb_atomicity_SOURCES_DIST =  \
 ceph_test_keyvaluedb_atomicity_OBJECTS =  \
 	$(am_ceph_test_keyvaluedb_atomicity_OBJECTS)
 @ENABLE_SERVER_TRUE@ceph_test_keyvaluedb_atomicity_DEPENDENCIES =  \
-@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_5) \
-@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_15) \
-@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_11) \
+@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_16) \
+@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_8)
 ceph_test_keyvaluedb_atomicity_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(ceph_test_keyvaluedb_atomicity_CXXFLAGS) $(CXXFLAGS) \
@@ -4021,25 +3718,29 @@ am__ceph_test_keyvaluedb_iterators_SOURCES_DIST =  \
 ceph_test_keyvaluedb_iterators_OBJECTS =  \
 	$(am_ceph_test_keyvaluedb_iterators_OBJECTS)
 @ENABLE_SERVER_TRUE@ceph_test_keyvaluedb_iterators_DEPENDENCIES =  \
-@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_5) \
-@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_15) \
-@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_11) \
+@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_16) \
+@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_8)
 ceph_test_keyvaluedb_iterators_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(ceph_test_keyvaluedb_iterators_CXXFLAGS) $(CXXFLAGS) \
 	$(AM_LDFLAGS) $(LDFLAGS) -o $@
 am__ceph_test_libcephfs_SOURCES_DIST = test/libcephfs/test.cc \
 	test/libcephfs/readdir_r_cb.cc test/libcephfs/caps.cc \
-	test/libcephfs/multiclient.cc test/libcephfs/flock.cc
-@ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@am__objects_34 = test/libcephfs/ceph_test_libcephfs-flock.$(OBJEXT)
+	test/libcephfs/multiclient.cc test/libcephfs/access.cc \
+	test/libcephfs/flock.cc
+@ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@am__objects_40 = test/libcephfs/ceph_test_libcephfs-flock.$(OBJEXT)
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@am_ceph_test_libcephfs_OBJECTS = test/libcephfs/ceph_test_libcephfs-test.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	test/libcephfs/ceph_test_libcephfs-readdir_r_cb.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	test/libcephfs/ceph_test_libcephfs-caps.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	test/libcephfs/ceph_test_libcephfs-multiclient.$(OBJEXT) \
-@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__objects_34)
+@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	test/libcephfs/ceph_test_libcephfs-access.$(OBJEXT) \
+@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__objects_40)
 ceph_test_libcephfs_OBJECTS = $(am_ceph_test_libcephfs_OBJECTS)
-@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@ceph_test_libcephfs_DEPENDENCIES = $(LIBCEPHFS) \
-@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_15)
+@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@ceph_test_libcephfs_DEPENDENCIES = $(LIBRADOS) \
+@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(LIBCEPHFS) \
+@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(LIBCOMMON) \
+@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_16)
 ceph_test_libcephfs_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(ceph_test_libcephfs_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -4053,10 +3754,12 @@ ceph_test_librbd_OBJECTS = $(am_ceph_test_librbd_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRBD_TYPES) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	libcls_rbd_client.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	libcls_lock_client.la \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	libjournal.la \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	libcls_journal_client.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librados_api.la \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__DEPENDENCIES_6) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__DEPENDENCIES_16) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__DEPENDENCIES_8) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__DEPENDENCIES_15) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__DEPENDENCIES_10) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(RADOS_TEST_LDADD)
 ceph_test_librbd_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
@@ -4072,7 +3775,7 @@ ceph_test_librbd_api_OBJECTS = $(am_ceph_test_librbd_api_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRBD) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRADOS) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBCOMMON) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__DEPENDENCIES_15) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__DEPENDENCIES_16) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(RADOS_TEST_LDADD)
 ceph_test_librbd_api_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
@@ -4094,9 +3797,9 @@ am__ceph_test_mon_msg_SOURCES_DIST = test/mon/test-mon-msg.cc
 @ENABLE_SERVER_TRUE@am_ceph_test_mon_msg_OBJECTS = test/mon/ceph_test_mon_msg-test-mon-msg.$(OBJEXT)
 ceph_test_mon_msg_OBJECTS = $(am_ceph_test_mon_msg_OBJECTS)
 @ENABLE_SERVER_TRUE@ceph_test_mon_msg_DEPENDENCIES =  \
-@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_5) $(LIBOSDC) \
-@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_10) \
-@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_15)
+@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_11) $(LIBOSDC) \
+@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_8) \
+@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_16)
 ceph_test_mon_msg_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(ceph_test_mon_msg_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -4108,15 +3811,15 @@ am__ceph_test_mon_workloadgen_SOURCES_DIST =  \
 ceph_test_mon_workloadgen_OBJECTS =  \
 	$(am_ceph_test_mon_workloadgen_OBJECTS)
 @ENABLE_SERVER_TRUE@ceph_test_mon_workloadgen_DEPENDENCIES =  \
-@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_5) $(LIBOSDC) \
-@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_11) $(LIBOSDC) \
+@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_8)
 am__ceph_test_msgr_SOURCES_DIST = test/msgr/test_msgr.cc
 @ENABLE_SERVER_TRUE@am_ceph_test_msgr_OBJECTS = test/msgr/ceph_test_msgr-test_msgr.$(OBJEXT)
 ceph_test_msgr_OBJECTS = $(am_ceph_test_msgr_OBJECTS)
 @ENABLE_SERVER_TRUE@ceph_test_msgr_DEPENDENCIES =  \
-@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_5) \
-@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_15) \
-@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_11) \
+@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_16) \
+@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_8)
 ceph_test_msgr_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(ceph_test_msgr_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -4126,7 +3829,7 @@ am__ceph_test_mutate_SOURCES_DIST = test/test_mutate.cc
 ceph_test_mutate_OBJECTS = $(am_ceph_test_mutate_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@ceph_test_mutate_DEPENDENCIES =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_8)
 am__ceph_test_object_map_SOURCES_DIST =  \
 	test/ObjectMap/test_object_map.cc \
 	test/ObjectMap/KeyValueDBMemory.cc
@@ -4134,9 +3837,9 @@ am__ceph_test_object_map_SOURCES_DIST =  \
 @ENABLE_SERVER_TRUE@	test/ObjectMap/ceph_test_object_map-KeyValueDBMemory.$(OBJEXT)
 ceph_test_object_map_OBJECTS = $(am_ceph_test_object_map_OBJECTS)
 @ENABLE_SERVER_TRUE@ceph_test_object_map_DEPENDENCIES =  \
-@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_5) \
-@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_15) \
-@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_11) \
+@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_16) \
+@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_8)
 ceph_test_object_map_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(ceph_test_object_map_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -4147,15 +3850,15 @@ am_ceph_test_objectcacher_stress_OBJECTS =  \
 ceph_test_objectcacher_stress_OBJECTS =  \
 	$(am_ceph_test_objectcacher_stress_OBJECTS)
 ceph_test_objectcacher_stress_DEPENDENCIES = $(LIBOSDC) \
-	$(am__DEPENDENCIES_10)
+	$(am__DEPENDENCIES_8)
 am__ceph_test_objectstore_SOURCES_DIST =  \
 	test/objectstore/store_test.cc
 @ENABLE_SERVER_TRUE@@LINUX_TRUE@am_ceph_test_objectstore_OBJECTS = test/objectstore/ceph_test_objectstore-store_test.$(OBJEXT)
 ceph_test_objectstore_OBJECTS = $(am_ceph_test_objectstore_OBJECTS)
 @ENABLE_SERVER_TRUE@@LINUX_TRUE@ceph_test_objectstore_DEPENDENCIES =  \
-@ENABLE_SERVER_TRUE@@LINUX_TRUE@	$(am__DEPENDENCIES_5) \
-@ENABLE_SERVER_TRUE@@LINUX_TRUE@	$(am__DEPENDENCIES_15) \
-@ENABLE_SERVER_TRUE@@LINUX_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_SERVER_TRUE@@LINUX_TRUE@	$(am__DEPENDENCIES_11) \
+@ENABLE_SERVER_TRUE@@LINUX_TRUE@	$(am__DEPENDENCIES_16) \
+@ENABLE_SERVER_TRUE@@LINUX_TRUE@	$(am__DEPENDENCIES_8)
 ceph_test_objectstore_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(ceph_test_objectstore_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -4168,8 +3871,8 @@ am__ceph_test_objectstore_workloadgen_SOURCES_DIST =  \
 ceph_test_objectstore_workloadgen_OBJECTS =  \
 	$(am_ceph_test_objectstore_workloadgen_OBJECTS)
 @ENABLE_SERVER_TRUE@ceph_test_objectstore_workloadgen_DEPENDENCIES =  \
-@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_5) \
-@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_11) \
+@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_8)
 am__ceph_test_rados_SOURCES_DIST = test/osd/TestRados.cc \
 	test/osd/TestOpStat.cc test/osd/Object.cc \
 	test/osd/RadosModel.cc
@@ -4180,14 +3883,14 @@ am__ceph_test_rados_SOURCES_DIST = test/osd/TestRados.cc \
 ceph_test_rados_OBJECTS = $(am_ceph_test_rados_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@ceph_test_rados_DEPENDENCIES =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_8)
 am__ceph_test_rados_api_aio_SOURCES_DIST = test/librados/aio.cc
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@am_ceph_test_rados_api_aio_OBJECTS = test/librados/ceph_test_rados_api_aio-aio.$(OBJEXT)
 ceph_test_rados_api_aio_OBJECTS =  \
 	$(am_ceph_test_rados_api_aio_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@ceph_test_rados_api_aio_DEPENDENCIES =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS) $(LIBCOMMON) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_15) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_16) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(RADOS_TEST_LDADD)
 ceph_test_rados_api_aio_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
@@ -4200,7 +3903,7 @@ ceph_test_rados_api_c_read_operations_OBJECTS =  \
 	$(am_ceph_test_rados_api_c_read_operations_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@ceph_test_rados_api_c_read_operations_DEPENDENCIES =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_15) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_16) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(RADOS_TEST_LDADD)
 ceph_test_rados_api_c_read_operations_LINK = $(LIBTOOL) $(AM_V_lt) \
 	--tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link \
@@ -4213,7 +3916,7 @@ ceph_test_rados_api_c_write_operations_OBJECTS =  \
 	$(am_ceph_test_rados_api_c_write_operations_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@ceph_test_rados_api_c_write_operations_DEPENDENCIES =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_15) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_16) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(RADOS_TEST_LDADD)
 ceph_test_rados_api_c_write_operations_LINK = $(LIBTOOL) $(AM_V_lt) \
 	--tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link \
@@ -4225,7 +3928,7 @@ ceph_test_rados_api_cls_OBJECTS =  \
 	$(am_ceph_test_rados_api_cls_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@ceph_test_rados_api_cls_DEPENDENCIES =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_15) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_16) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(RADOS_TEST_LDADD)
 ceph_test_rados_api_cls_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
@@ -4238,7 +3941,7 @@ ceph_test_rados_api_cmd_OBJECTS =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@ceph_test_rados_api_cmd_DEPENDENCIES =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(LIBCOMMON) $(LIBRADOS) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_1) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_15) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_16) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(RADOS_TEST_LDADD)
 ceph_test_rados_api_cmd_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
@@ -4249,7 +3952,7 @@ am__ceph_test_rados_api_io_SOURCES_DIST = test/librados/io.cc
 ceph_test_rados_api_io_OBJECTS = $(am_ceph_test_rados_api_io_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@ceph_test_rados_api_io_DEPENDENCIES =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_15) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_16) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(RADOS_TEST_LDADD)
 ceph_test_rados_api_io_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
@@ -4261,7 +3964,7 @@ ceph_test_rados_api_list_OBJECTS =  \
 	$(am_ceph_test_rados_api_list_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@ceph_test_rados_api_list_DEPENDENCIES =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_15) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_16) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(RADOS_TEST_LDADD)
 ceph_test_rados_api_list_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
@@ -4273,7 +3976,7 @@ ceph_test_rados_api_lock_OBJECTS =  \
 	$(am_ceph_test_rados_api_lock_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@ceph_test_rados_api_lock_DEPENDENCIES =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_15) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_16) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(RADOS_TEST_LDADD)
 ceph_test_rados_api_lock_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
@@ -4285,8 +3988,8 @@ ceph_test_rados_api_misc_OBJECTS =  \
 	$(am_ceph_test_rados_api_misc_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@ceph_test_rados_api_misc_DEPENDENCIES =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_15) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_10) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_16) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_8) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(RADOS_TEST_LDADD)
 ceph_test_rados_api_misc_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
@@ -4298,7 +4001,7 @@ ceph_test_rados_api_nlist_OBJECTS =  \
 	$(am_ceph_test_rados_api_nlist_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at ceph_test_rados_api_nlist_DEPENDENCIES =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS) \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_15) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_16) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(RADOS_TEST_LDADD)
 ceph_test_rados_api_nlist_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
@@ -4310,7 +4013,7 @@ ceph_test_rados_api_pool_OBJECTS =  \
 	$(am_ceph_test_rados_api_pool_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at ceph_test_rados_api_pool_DEPENDENCIES =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS) \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_15) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_16) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(RADOS_TEST_LDADD)
 ceph_test_rados_api_pool_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
@@ -4323,7 +4026,7 @@ ceph_test_rados_api_snapshots_OBJECTS =  \
 	$(am_ceph_test_rados_api_snapshots_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at ceph_test_rados_api_snapshots_DEPENDENCIES =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS) \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_15) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_16) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(RADOS_TEST_LDADD)
 ceph_test_rados_api_snapshots_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
@@ -4335,7 +4038,7 @@ ceph_test_rados_api_stat_OBJECTS =  \
 	$(am_ceph_test_rados_api_stat_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at ceph_test_rados_api_stat_DEPENDENCIES =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS) \
- at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_15) \
+ at ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_16) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(RADOS_TEST_LDADD)
 ceph_test_rados_api_stat_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
@@ -4349,8 +4052,8 @@ ceph_test_rados_api_tier_OBJECTS =  \
 	$(am_ceph_test_rados_api_tier_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@ceph_test_rados_api_tier_DEPENDENCIES =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_15) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_10) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_16) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_8) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(RADOS_TEST_LDADD)
 ceph_test_rados_api_tier_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
@@ -4363,7 +4066,7 @@ ceph_test_rados_api_watch_notify_OBJECTS =  \
 	$(am_ceph_test_rados_api_watch_notify_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@ceph_test_rados_api_watch_notify_DEPENDENCIES =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_15) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_16) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(RADOS_TEST_LDADD)
 ceph_test_rados_api_watch_notify_LINK = $(LIBTOOL) $(AM_V_lt) \
 	--tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link \
@@ -4415,7 +4118,7 @@ ceph_test_rados_striper_api_aio_OBJECTS =  \
 	$(am_ceph_test_rados_striper_api_aio_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@ceph_test_rados_striper_api_aio_DEPENDENCIES = $(LIBRADOS) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOSSTRIPER) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_15) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_16) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@	$(RADOS_STRIPER_TEST_LDADD)
 ceph_test_rados_striper_api_aio_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
@@ -4429,7 +4132,7 @@ ceph_test_rados_striper_api_io_OBJECTS =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@ceph_test_rados_striper_api_io_DEPENDENCIES = $(LIBRADOS) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOSSTRIPER) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@	$(LIBCOMMON) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_15) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_16) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@	$(RADOS_STRIPER_TEST_LDADD)
 ceph_test_rados_striper_api_io_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
@@ -4442,7 +4145,7 @@ ceph_test_rados_striper_api_striping_OBJECTS =  \
 	$(am_ceph_test_rados_striper_api_striping_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@ceph_test_rados_striper_api_striping_DEPENDENCIES = $(LIBRADOS) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOSSTRIPER) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_15) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_16) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@	$(RADOS_STRIPER_TEST_LDADD)
 ceph_test_rados_striper_api_striping_LINK = $(LIBTOOL) $(AM_V_lt) \
 	--tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link \
@@ -4472,16 +4175,16 @@ ceph_test_rewrite_latency_OBJECTS =  \
 	$(am_ceph_test_rewrite_latency_OBJECTS)
 ceph_test_rewrite_latency_DEPENDENCIES = $(LIBCOMMON) \
 	$(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) \
-	$(am__DEPENDENCIES_2)
+	$(am__DEPENDENCIES_3)
 am__ceph_test_rgw_manifest_SOURCES_DIST =  \
 	test/rgw/test_rgw_manifest.cc
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@am_ceph_test_rgw_manifest_OBJECTS = test/rgw/ceph_test_rgw_manifest-test_rgw_manifest.$(OBJEXT)
 ceph_test_rgw_manifest_OBJECTS = $(am_ceph_test_rgw_manifest_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@ceph_test_rgw_manifest_DEPENDENCIES = $(LIBRADOS) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_17) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_19) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_8) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_16) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_18) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_10) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_15) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_1)
 ceph_test_rgw_manifest_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
@@ -4491,10 +4194,10 @@ am__ceph_test_rgw_obj_SOURCES_DIST = test/rgw/test_rgw_obj.cc
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@am_ceph_test_rgw_obj_OBJECTS = test/rgw/ceph_test_rgw_obj-test_rgw_obj.$(OBJEXT)
 ceph_test_rgw_obj_OBJECTS = $(am_ceph_test_rgw_obj_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@ceph_test_rgw_obj_DEPENDENCIES = $(LIBRADOS) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_17) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_19) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_8) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_16) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_18) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_10) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_15) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_1)
 ceph_test_rgw_obj_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
@@ -4504,14 +4207,14 @@ am_ceph_test_signal_handlers_OBJECTS =  \
 	test/TestSignalHandlers.$(OBJEXT)
 ceph_test_signal_handlers_OBJECTS =  \
 	$(am_ceph_test_signal_handlers_OBJECTS)
-ceph_test_signal_handlers_DEPENDENCIES = $(am__DEPENDENCIES_10)
+ceph_test_signal_handlers_DEPENDENCIES = $(am__DEPENDENCIES_8)
 am__ceph_test_snap_mapper_SOURCES_DIST = test/test_snap_mapper.cc
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am_ceph_test_snap_mapper_OBJECTS = test/ceph_test_snap_mapper-test_snap_mapper.$(OBJEXT)
 ceph_test_snap_mapper_OBJECTS = $(am_ceph_test_snap_mapper_OBJECTS)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@ceph_test_snap_mapper_DEPENDENCIES =  \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_14) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_15) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_16) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_8)
 ceph_test_snap_mapper_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(ceph_test_snap_mapper_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -4521,7 +4224,7 @@ am__ceph_test_stress_watch_SOURCES_DIST = test/test_stress_watch.cc
 ceph_test_stress_watch_OBJECTS = $(am_ceph_test_stress_watch_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@ceph_test_stress_watch_DEPENDENCIES =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS) $(LIBCOMMON) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_15) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_16) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(RADOS_TEST_LDADD)
 ceph_test_stress_watch_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
@@ -4529,14 +4232,14 @@ ceph_test_stress_watch_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(LDFLAGS) -o $@
 am_ceph_test_timers_OBJECTS = test/TestTimers.$(OBJEXT)
 ceph_test_timers_OBJECTS = $(am_ceph_test_timers_OBJECTS)
-ceph_test_timers_DEPENDENCIES = $(am__DEPENDENCIES_10)
+ceph_test_timers_DEPENDENCIES = $(am__DEPENDENCIES_8)
 am__ceph_test_trans_SOURCES_DIST = test/test_trans.cc
 @ENABLE_SERVER_TRUE@am_ceph_test_trans_OBJECTS =  \
 @ENABLE_SERVER_TRUE@	test/test_trans.$(OBJEXT)
 ceph_test_trans_OBJECTS = $(am_ceph_test_trans_OBJECTS)
 @ENABLE_SERVER_TRUE@ceph_test_trans_DEPENDENCIES =  \
-@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_5) \
-@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_11) \
+@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_8)
 am__ceph_tpbench_SOURCES_DIST = test/bench/tp_bench.cc \
 	test/bench/detailed_stat_collector.cc
 @ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_RADOS_TRUE@am_ceph_tpbench_OBJECTS = test/bench/tp_bench.$(OBJEXT) \
@@ -4544,15 +4247,15 @@ am__ceph_tpbench_SOURCES_DIST = test/bench/tp_bench.cc \
 ceph_tpbench_OBJECTS = $(am_ceph_tpbench_OBJECTS)
 @ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_RADOS_TRUE@ceph_tpbench_DEPENDENCIES = $(LIBRADOS) \
 @ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_1) \
-@ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_5) \
-@ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_11) \
+@ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_8)
 am__ceph_xattr_bench_SOURCES_DIST = test/xattr_bench.cc
 @ENABLE_SERVER_TRUE@am_ceph_xattr_bench_OBJECTS = test/ceph_xattr_bench-xattr_bench.$(OBJEXT)
 ceph_xattr_bench_OBJECTS = $(am_ceph_xattr_bench_OBJECTS)
 @ENABLE_SERVER_TRUE@ceph_xattr_bench_DEPENDENCIES =  \
-@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_5) \
-@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_15) \
-@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_11) \
+@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_16) \
+@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_8)
 ceph_xattr_bench_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(ceph_xattr_bench_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -4567,10 +4270,10 @@ am__cephfs_data_scan_SOURCES_DIST = tools/cephfs/cephfs-data-scan.cc \
 @ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE@	tools/cephfs/DataScan.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE@	tools/cephfs/MDSUtility.$(OBJEXT)
 cephfs_data_scan_OBJECTS = $(am_cephfs_data_scan_OBJECTS)
-@ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE@cephfs_data_scan_DEPENDENCIES = $(am__DEPENDENCIES_12) \
+@ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE@cephfs_data_scan_DEPENDENCIES = $(am__DEPENDENCIES_13) \
 @ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE@	libcls_cephfs_client.la \
 @ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS) \
-@ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_8)
 am__cephfs_journal_tool_SOURCES_DIST =  \
 	tools/cephfs/cephfs-journal-tool.cc \
 	tools/cephfs/JournalTool.cc tools/cephfs/JournalFilter.cc \
@@ -4586,9 +4289,9 @@ am__cephfs_journal_tool_SOURCES_DIST =  \
 @ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE@	tools/cephfs/Resetter.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE@	tools/cephfs/MDSUtility.$(OBJEXT)
 cephfs_journal_tool_OBJECTS = $(am_cephfs_journal_tool_OBJECTS)
-@ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE@cephfs_journal_tool_DEPENDENCIES = $(am__DEPENDENCIES_12) \
+@ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE@cephfs_journal_tool_DEPENDENCIES = $(am__DEPENDENCIES_13) \
 @ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS) \
-@ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_8)
 am__cephfs_table_tool_SOURCES_DIST =  \
 	tools/cephfs/cephfs-table-tool.cc tools/cephfs/TableTool.cc \
 	tools/cephfs/MDSUtility.cc
@@ -4596,30 +4299,32 @@ am__cephfs_table_tool_SOURCES_DIST =  \
 @ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE@	tools/cephfs/TableTool.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE@	tools/cephfs/MDSUtility.$(OBJEXT)
 cephfs_table_tool_OBJECTS = $(am_cephfs_table_tool_OBJECTS)
-@ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE@cephfs_table_tool_DEPENDENCIES = $(am__DEPENDENCIES_12) \
+@ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE@cephfs_table_tool_DEPENDENCIES = $(am__DEPENDENCIES_13) \
 @ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS) \
-@ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_8)
 am_crushtool_OBJECTS = tools/crushtool.$(OBJEXT)
 crushtool_OBJECTS = $(am_crushtool_OBJECTS)
-crushtool_DEPENDENCIES = $(am__DEPENDENCIES_10)
+crushtool_DEPENDENCIES = $(am__DEPENDENCIES_8)
 am__get_command_descriptions_SOURCES_DIST =  \
 	test/common/get_command_descriptions.cc
 @ENABLE_SERVER_TRUE@@WITH_MON_TRUE@am_get_command_descriptions_OBJECTS = test/common/get_command_descriptions.$(OBJEXT)
 get_command_descriptions_OBJECTS =  \
 	$(am_get_command_descriptions_OBJECTS)
 @ENABLE_SERVER_TRUE@@WITH_MON_TRUE@get_command_descriptions_DEPENDENCIES =  \
-@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	$(am__DEPENDENCIES_13) \
+@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	$(am__DEPENDENCIES_14) \
+@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	$(LIBMON_TYPES) \
+@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	$(am__DEPENDENCIES_11) \
 @ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	$(LIBCOMMON) \
-@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	$(am__DEPENDENCIES_8)
 am__librados_config_SOURCES_DIST = librados-config.cc
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@am_librados_config_OBJECTS = librados-config.$(OBJEXT)
 librados_config_OBJECTS = $(am_librados_config_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@librados_config_DEPENDENCIES =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_8)
 am_monmaptool_OBJECTS = tools/monmaptool.$(OBJEXT)
 monmaptool_OBJECTS = $(am_monmaptool_OBJECTS)
-monmaptool_DEPENDENCIES = $(am__DEPENDENCIES_10) $(LIBCOMMON)
+monmaptool_DEPENDENCIES = $(am__DEPENDENCIES_8)
 am__mount_ceph_SOURCES_DIST = mount/mount.ceph.c
 @ENABLE_SERVER_TRUE@am_mount_ceph_OBJECTS =  \
 @ENABLE_SERVER_TRUE@	mount/mount.ceph.$(OBJEXT)
@@ -4628,7 +4333,7 @@ mount_ceph_OBJECTS = $(am_mount_ceph_OBJECTS)
 @ENABLE_SERVER_TRUE@	$(LIBCOMMON)
 am_osdmaptool_OBJECTS = tools/osdmaptool.$(OBJEXT)
 osdmaptool_OBJECTS = $(am_osdmaptool_OBJECTS)
-osdmaptool_DEPENDENCIES = $(am__DEPENDENCIES_10)
+osdmaptool_DEPENDENCIES = $(am__DEPENDENCIES_8)
 am__rados_SOURCES_DIST = tools/rados/rados.cc tools/RadosDump.cc \
 	tools/rados/RadosImport.cc tools/rados/PoolDump.cc \
 	common/obj_bencher.cc
@@ -4642,7 +4347,7 @@ rados_OBJECTS = $(am_rados_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	libcls_lock_client.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOSSTRIPER) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_8)
 am__radosgw_SOURCES_DIST = rgw/rgw_resolve.cc rgw/rgw_rest.cc \
 	rgw/rgw_rest_swift.cc rgw/rgw_rest_s3.cc rgw/rgw_rest_usage.cc \
 	rgw/rgw_rest_user.cc rgw/rgw_rest_bucket.cc \
@@ -4670,38 +4375,86 @@ am__radosgw_SOURCES_DIST = rgw/rgw_resolve.cc rgw/rgw_rest.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/rgw_loadgen.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/rgw_main.$(OBJEXT)
 radosgw_OBJECTS = $(am_radosgw_OBJECTS)
-@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@radosgw_DEPENDENCIES = $(am__DEPENDENCIES_16) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@radosgw_DEPENDENCIES = $(am__DEPENDENCIES_17) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(LIBCIVETWEB) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_18) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_19) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_1) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_8)
 am__radosgw_admin_SOURCES_DIST = rgw/rgw_admin.cc rgw/rgw_orphan.cc
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@am_radosgw_admin_OBJECTS = rgw/rgw_admin.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/rgw_orphan.$(OBJEXT)
 radosgw_admin_OBJECTS = $(am_radosgw_admin_OBJECTS)
-@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@radosgw_admin_DEPENDENCIES = $(am__DEPENDENCIES_16) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_18) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@radosgw_admin_DEPENDENCIES = $(am__DEPENDENCIES_17) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_19) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_8)
 am__radosgw_object_expirer_SOURCES_DIST = rgw/rgw_object_expirer.cc
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@am_radosgw_object_expirer_OBJECTS = rgw/rgw_object_expirer.$(OBJEXT)
 radosgw_object_expirer_OBJECTS = $(am_radosgw_object_expirer_OBJECTS)
-@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@radosgw_object_expirer_DEPENDENCIES = $(am__DEPENDENCIES_16) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_18) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_10)
-am__rbd_SOURCES_DIST = rbd.cc
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am_rbd_OBJECTS = rbd.$(OBJEXT)
+@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@radosgw_object_expirer_DEPENDENCIES = $(am__DEPENDENCIES_17) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_19) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_8)
+am__rbd_SOURCES_DIST = tools/rbd/rbd.cc tools/rbd/ArgumentTypes.cc \
+	tools/rbd/IndentStream.cc tools/rbd/OptionPrinter.cc \
+	tools/rbd/Shell.cc tools/rbd/Utils.cc \
+	tools/rbd/action/BenchWrite.cc tools/rbd/action/Children.cc \
+	tools/rbd/action/Clone.cc tools/rbd/action/Copy.cc \
+	tools/rbd/action/Create.cc tools/rbd/action/Diff.cc \
+	tools/rbd/action/DiskUsage.cc tools/rbd/action/Export.cc \
+	tools/rbd/action/ExportDiff.cc tools/rbd/action/Feature.cc \
+	tools/rbd/action/Flatten.cc tools/rbd/action/ImageMeta.cc \
+	tools/rbd/action/Import.cc tools/rbd/action/ImportDiff.cc \
+	tools/rbd/action/Info.cc tools/rbd/action/Kernel.cc \
+	tools/rbd/action/List.cc tools/rbd/action/Lock.cc \
+	tools/rbd/action/MergeDiff.cc tools/rbd/action/ObjectMap.cc \
+	tools/rbd/action/Remove.cc tools/rbd/action/Rename.cc \
+	tools/rbd/action/Resize.cc tools/rbd/action/Snap.cc \
+	tools/rbd/action/Status.cc tools/rbd/action/Watch.cc
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am_rbd_OBJECTS = tools/rbd/rbd.$(OBJEXT) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/ArgumentTypes.$(OBJEXT) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/IndentStream.$(OBJEXT) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/OptionPrinter.$(OBJEXT) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/Shell.$(OBJEXT) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/Utils.$(OBJEXT) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/BenchWrite.$(OBJEXT) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/Children.$(OBJEXT) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/Clone.$(OBJEXT) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/Copy.$(OBJEXT) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/Create.$(OBJEXT) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/Diff.$(OBJEXT) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/DiskUsage.$(OBJEXT) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/Export.$(OBJEXT) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/ExportDiff.$(OBJEXT) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/Feature.$(OBJEXT) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/Flatten.$(OBJEXT) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/ImageMeta.$(OBJEXT) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/Import.$(OBJEXT) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/ImportDiff.$(OBJEXT) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/Info.$(OBJEXT) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/Kernel.$(OBJEXT) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/List.$(OBJEXT) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/Lock.$(OBJEXT) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/MergeDiff.$(OBJEXT) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/ObjectMap.$(OBJEXT) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/Remove.$(OBJEXT) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/Rename.$(OBJEXT) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/Resize.$(OBJEXT) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/Snap.$(OBJEXT) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/Status.$(OBJEXT) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/Watch.$(OBJEXT)
 rbd_OBJECTS = $(am_rbd_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@rbd_DEPENDENCIES = $(LIBKRBD) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRBD) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRADOS) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__DEPENDENCIES_8) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__DEPENDENCIES_1) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__DEPENDENCIES_1)
 am__rbd_fuse_SOURCES_DIST = rbd_fuse/rbd-fuse.cc
 @ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am_rbd_fuse_OBJECTS = rbd_fuse/rbd_fuse-rbd-fuse.$(OBJEXT)
 rbd_fuse_OBJECTS = $(am_rbd_fuse_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@rbd_fuse_DEPENDENCIES = $(am__DEPENDENCIES_1) \
 @ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRBD) \
 @ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRADOS) \
-@ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__DEPENDENCIES_8)
 rbd_fuse_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \
 	$(LIBTOOLFLAGS) --mode=link $(CXXLD) $(rbd_fuse_CXXFLAGS) \
 	$(CXXFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@
@@ -4712,7 +4465,7 @@ rbd_replay_OBJECTS = $(am_rbd_replay_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd_replay_types.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRBD) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRADOS) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__DEPENDENCIES_10) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__DEPENDENCIES_8) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBCOMMON)
 am__rbd_replay_prep_SOURCES_DIST = rbd_replay/rbd-replay-prep.cc
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@am_rbd_replay_prep_OBJECTS = rbd_replay/rbd-replay-prep.$(OBJEXT)
@@ -4722,7 +4475,7 @@ rbd_replay_prep_OBJECTS = $(am_rbd_replay_prep_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd_replay_types.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRBD) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRADOS) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__DEPENDENCIES_10) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__DEPENDENCIES_8) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBCOMMON)
 am__simple_client_SOURCES_DIST = test/messenger/simple_client.cc \
 	test/messenger/simple_dispatcher.cc
@@ -4730,11 +4483,11 @@ am__simple_client_SOURCES_DIST = test/messenger/simple_client.cc \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	test/messenger/simple_client-simple_dispatcher.$(OBJEXT)
 simple_client_OBJECTS = $(am_simple_client_OBJECTS)
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@simple_client_DEPENDENCIES =  \
-@ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(am__DEPENDENCIES_5) \
+@ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(am__DEPENDENCIES_11) \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(LIBCOMMON) \
-@ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(am__DEPENDENCIES_10) \
+@ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(am__DEPENDENCIES_8) \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(am__DEPENDENCIES_1) \
-@ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(am__DEPENDENCIES_2) \
+@ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(am__DEPENDENCIES_3) \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(am__DEPENDENCIES_1)
 simple_client_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
@@ -4746,11 +4499,11 @@ am__simple_server_SOURCES_DIST = test/messenger/simple_server.cc \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	test/messenger/simple_server-simple_dispatcher.$(OBJEXT)
 simple_server_OBJECTS = $(am_simple_server_OBJECTS)
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@simple_server_DEPENDENCIES =  \
-@ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(am__DEPENDENCIES_5) \
+@ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(am__DEPENDENCIES_11) \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(LIBCOMMON) \
-@ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(am__DEPENDENCIES_10) \
+@ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(am__DEPENDENCIES_8) \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(am__DEPENDENCIES_1) \
-@ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(am__DEPENDENCIES_2) \
+@ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(am__DEPENDENCIES_3) \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(am__DEPENDENCIES_1)
 simple_server_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
@@ -4759,48 +4512,48 @@ simple_server_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 am__test_build_libcephfs_SOURCES_DIST = test/buildtest_skeleton.cc \
 	osdc/Objecter.cc osdc/ObjectCacher.cc osdc/Filer.cc \
 	osdc/Striper.cc osdc/Journaler.cc
-am__objects_35 = osdc/test_build_libcephfs-Objecter.$(OBJEXT) \
+am__objects_41 = osdc/test_build_libcephfs-Objecter.$(OBJEXT) \
 	osdc/test_build_libcephfs-ObjectCacher.$(OBJEXT) \
 	osdc/test_build_libcephfs-Filer.$(OBJEXT) \
 	osdc/test_build_libcephfs-Striper.$(OBJEXT) \
 	osdc/test_build_libcephfs-Journaler.$(OBJEXT)
 @ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@am_test_build_libcephfs_OBJECTS = test/test_build_libcephfs-buildtest_skeleton.$(OBJEXT) \
-@ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__objects_35)
+@ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__objects_41)
 test_build_libcephfs_OBJECTS = $(am_test_build_libcephfs_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@test_build_libcephfs_DEPENDENCIES = $(LIBCEPHFS) \
 @ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_1) \
 @ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_1) \
-@ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_2)
+@ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_3)
 test_build_libcephfs_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(test_build_libcephfs_CXXFLAGS) $(CXXFLAGS) \
 	$(test_build_libcephfs_LDFLAGS) $(LDFLAGS) -o $@
 am__test_build_libcommon_SOURCES_DIST = test/buildtest_skeleton.cc \
 	common/buffer.cc
-am__objects_36 = common/test_build_libcommon-buffer.$(OBJEXT)
+am__objects_42 = common/test_build_libcommon-buffer.$(OBJEXT)
 @WITH_BUILD_TESTS_TRUE@am_test_build_libcommon_OBJECTS = test/test_build_libcommon-buildtest_skeleton.$(OBJEXT) \
-@WITH_BUILD_TESTS_TRUE@	$(am__objects_36)
+@WITH_BUILD_TESTS_TRUE@	$(am__objects_42)
 test_build_libcommon_OBJECTS = $(am_test_build_libcommon_OBJECTS)
 @WITH_BUILD_TESTS_TRUE@test_build_libcommon_DEPENDENCIES =  \
-@WITH_BUILD_TESTS_TRUE@	$(am__DEPENDENCIES_3) \
+@WITH_BUILD_TESTS_TRUE@	$(am__DEPENDENCIES_4) \
 @WITH_BUILD_TESTS_TRUE@	$(am__DEPENDENCIES_1) \
 @WITH_BUILD_TESTS_TRUE@	$(am__DEPENDENCIES_1) \
-@WITH_BUILD_TESTS_TRUE@	$(am__DEPENDENCIES_2)
+@WITH_BUILD_TESTS_TRUE@	$(am__DEPENDENCIES_3)
 test_build_libcommon_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(test_build_libcommon_CXXFLAGS) $(CXXFLAGS) \
 	$(test_build_libcommon_LDFLAGS) $(LDFLAGS) -o $@
 am__test_build_librados_SOURCES_DIST = test/buildtest_skeleton.cc \
 	common/buffer.cc librados/librados.cc
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@am__objects_37 = common/test_build_librados-buffer.$(OBJEXT) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@am__objects_43 = common/test_build_librados-buffer.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	librados/test_build_librados-librados.$(OBJEXT)
 @ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOS_TRUE@am_test_build_librados_OBJECTS = test/test_build_librados-buildtest_skeleton.$(OBJEXT) \
-@ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOS_TRUE@	$(am__objects_37)
+@ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOS_TRUE@	$(am__objects_43)
 test_build_librados_OBJECTS = $(am_test_build_librados_OBJECTS)
-@ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOS_TRUE@test_build_librados_DEPENDENCIES = $(am__DEPENDENCIES_8) \
+@ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOS_TRUE@test_build_librados_DEPENDENCIES = $(am__DEPENDENCIES_6) \
 @ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_1) \
 @ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_1) \
-@ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_2)
+@ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_3)
 test_build_librados_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(test_build_librados_CXXFLAGS) $(CXXFLAGS) \
@@ -4818,7 +4571,7 @@ am__test_build_librgw_SOURCES_DIST = test/buildtest_skeleton.cc \
 	rgw/rgw_cors_s3.cc rgw/rgw_auth_s3.cc rgw/rgw_metadata.cc \
 	rgw/rgw_replica_log.cc rgw/rgw_keystone.cc rgw/rgw_quota.cc \
 	rgw/rgw_dencoder.cc rgw/rgw_object_expirer_core.cc
-@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@am__objects_38 = rgw/test_build_librgw-librgw.$(OBJEXT) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@am__objects_44 = rgw/test_build_librgw-librgw.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/test_build_librgw-rgw_acl.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/test_build_librgw-rgw_acl_s3.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/test_build_librgw-rgw_acl_swift.$(OBJEXT) \
@@ -4854,21 +4607,21 @@ am__test_build_librgw_SOURCES_DIST = test/buildtest_skeleton.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/test_build_librgw-rgw_dencoder.$(OBJEXT) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/test_build_librgw-rgw_object_expirer_core.$(OBJEXT)
 @ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@am_test_build_librgw_OBJECTS = test/test_build_librgw-buildtest_skeleton.$(OBJEXT) \
-@ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__objects_38)
+@ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__objects_44)
 test_build_librgw_OBJECTS = $(am_test_build_librgw_OBJECTS)
-@ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@test_build_librgw_DEPENDENCIES = $(am__DEPENDENCIES_18) \
+@ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@test_build_librgw_DEPENDENCIES = $(am__DEPENDENCIES_19) \
 @ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_1) \
 @ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_1) \
-@ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_2) \
-@ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_3) \
+@ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_8)
 test_build_librgw_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(test_build_librgw_CXXFLAGS) $(CXXFLAGS) \
 	$(test_build_librgw_LDFLAGS) $(LDFLAGS) -o $@
 am_unittest_addrs_OBJECTS = test/unittest_addrs-test_addrs.$(OBJEXT)
 unittest_addrs_OBJECTS = $(am_unittest_addrs_OBJECTS)
-unittest_addrs_DEPENDENCIES = $(am__DEPENDENCIES_15) \
-	$(am__DEPENDENCIES_10)
+unittest_addrs_DEPENDENCIES = $(am__DEPENDENCIES_16) \
+	$(am__DEPENDENCIES_8)
 unittest_addrs_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_addrs_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -4876,16 +4629,16 @@ unittest_addrs_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 am_unittest_admin_socket_OBJECTS =  \
 	test/unittest_admin_socket-admin_socket.$(OBJEXT)
 unittest_admin_socket_OBJECTS = $(am_unittest_admin_socket_OBJECTS)
-unittest_admin_socket_DEPENDENCIES = $(am__DEPENDENCIES_15) \
-	$(am__DEPENDENCIES_10)
+unittest_admin_socket_DEPENDENCIES = $(am__DEPENDENCIES_16) \
+	$(am__DEPENDENCIES_8)
 unittest_admin_socket_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_admin_socket_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
 	$(LDFLAGS) -o $@
 am_unittest_arch_OBJECTS = test/unittest_arch-test_arch.$(OBJEXT)
 unittest_arch_OBJECTS = $(am_unittest_arch_OBJECTS)
-unittest_arch_DEPENDENCIES = $(am__DEPENDENCIES_15) \
-	$(am__DEPENDENCIES_10)
+unittest_arch_DEPENDENCIES = $(am__DEPENDENCIES_16) \
+	$(am__DEPENDENCIES_8)
 unittest_arch_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_arch_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) $(LDFLAGS) \
@@ -4893,9 +4646,9 @@ unittest_arch_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 am_unittest_async_compressor_OBJECTS = test/common/unittest_async_compressor-test_async_compressor.$(OBJEXT)
 unittest_async_compressor_OBJECTS =  \
 	$(am_unittest_async_compressor_OBJECTS)
-am__DEPENDENCIES_19 = libcompressor.la
-unittest_async_compressor_DEPENDENCIES = $(am__DEPENDENCIES_15) \
-	$(am__DEPENDENCIES_10) $(am__DEPENDENCIES_19)
+am__DEPENDENCIES_20 = libcompressor.la
+unittest_async_compressor_DEPENDENCIES = $(am__DEPENDENCIES_16) \
+	$(am__DEPENDENCIES_8) $(am__DEPENDENCIES_20)
 unittest_async_compressor_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_async_compressor_CXXFLAGS) $(CXXFLAGS) \
@@ -4904,8 +4657,8 @@ am__unittest_base64_SOURCES_DIST = test/base64.cc
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@am_unittest_base64_OBJECTS = test/unittest_base64-base64.$(OBJEXT)
 unittest_base64_OBJECTS = $(am_unittest_base64_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@unittest_base64_DEPENDENCIES = $(LIBCEPHFS) \
-@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_10) \
-@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_15)
+@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_8) \
+@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_16)
 unittest_base64_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_base64_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -4913,8 +4666,8 @@ unittest_base64_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 am_unittest_bit_vector_OBJECTS =  \
 	test/common/unittest_bit_vector-test_bit_vector.$(OBJEXT)
 unittest_bit_vector_OBJECTS = $(am_unittest_bit_vector_OBJECTS)
-unittest_bit_vector_DEPENDENCIES = $(am__DEPENDENCIES_15) \
-	$(am__DEPENDENCIES_10)
+unittest_bit_vector_DEPENDENCIES = $(am__DEPENDENCIES_16) \
+	$(am__DEPENDENCIES_8)
 unittest_bit_vector_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_bit_vector_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -4922,8 +4675,8 @@ unittest_bit_vector_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 am_unittest_blkdev_OBJECTS =  \
 	test/common/unittest_blkdev-test_blkdev.$(OBJEXT)
 unittest_blkdev_OBJECTS = $(am_unittest_blkdev_OBJECTS)
-unittest_blkdev_DEPENDENCIES = $(am__DEPENDENCIES_15) \
-	$(am__DEPENDENCIES_10)
+unittest_blkdev_DEPENDENCIES = $(am__DEPENDENCIES_16) \
+	$(am__DEPENDENCIES_8)
 unittest_blkdev_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_blkdev_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -4931,8 +4684,8 @@ unittest_blkdev_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 am_unittest_bloom_filter_OBJECTS =  \
 	test/common/unittest_bloom_filter-test_bloom_filter.$(OBJEXT)
 unittest_bloom_filter_OBJECTS = $(am_unittest_bloom_filter_OBJECTS)
-unittest_bloom_filter_DEPENDENCIES = $(am__DEPENDENCIES_15) \
-	$(am__DEPENDENCIES_10)
+unittest_bloom_filter_DEPENDENCIES = $(am__DEPENDENCIES_16) \
+	$(am__DEPENDENCIES_8)
 unittest_bloom_filter_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_bloom_filter_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -4940,8 +4693,8 @@ unittest_bloom_filter_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 am_unittest_bufferlist_OBJECTS =  \
 	test/unittest_bufferlist-bufferlist.$(OBJEXT)
 unittest_bufferlist_OBJECTS = $(am_unittest_bufferlist_OBJECTS)
-unittest_bufferlist_DEPENDENCIES = $(am__DEPENDENCIES_15) \
-	$(am__DEPENDENCIES_10)
+unittest_bufferlist_DEPENDENCIES = $(am__DEPENDENCIES_16) \
+	$(am__DEPENDENCIES_8)
 unittest_bufferlist_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_bufferlist_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -4949,8 +4702,8 @@ unittest_bufferlist_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 am_unittest_ceph_argparse_OBJECTS =  \
 	test/unittest_ceph_argparse-ceph_argparse.$(OBJEXT)
 unittest_ceph_argparse_OBJECTS = $(am_unittest_ceph_argparse_OBJECTS)
-unittest_ceph_argparse_DEPENDENCIES = $(am__DEPENDENCIES_15) \
-	$(am__DEPENDENCIES_10)
+unittest_ceph_argparse_DEPENDENCIES = $(am__DEPENDENCIES_16) \
+	$(am__DEPENDENCIES_8)
 unittest_ceph_argparse_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_ceph_argparse_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -4959,8 +4712,8 @@ am_unittest_ceph_compatset_OBJECTS =  \
 	test/unittest_ceph_compatset-ceph_compatset.$(OBJEXT)
 unittest_ceph_compatset_OBJECTS =  \
 	$(am_unittest_ceph_compatset_OBJECTS)
-unittest_ceph_compatset_DEPENDENCIES = $(am__DEPENDENCIES_15) \
-	$(am__DEPENDENCIES_10)
+unittest_ceph_compatset_DEPENDENCIES = $(am__DEPENDENCIES_16) \
+	$(am__DEPENDENCIES_8)
 unittest_ceph_compatset_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_ceph_compatset_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -4968,8 +4721,8 @@ unittest_ceph_compatset_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 am_unittest_ceph_crypto_OBJECTS =  \
 	test/unittest_ceph_crypto-ceph_crypto.$(OBJEXT)
 unittest_ceph_crypto_OBJECTS = $(am_unittest_ceph_crypto_OBJECTS)
-unittest_ceph_crypto_DEPENDENCIES = $(am__DEPENDENCIES_15) \
-	$(am__DEPENDENCIES_10)
+unittest_ceph_crypto_DEPENDENCIES = $(am__DEPENDENCIES_16) \
+	$(am__DEPENDENCIES_8)
 unittest_ceph_crypto_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_ceph_crypto_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -4979,9 +4732,9 @@ am__unittest_chain_xattr_SOURCES_DIST =  \
 @ENABLE_SERVER_TRUE@am_unittest_chain_xattr_OBJECTS = test/objectstore/unittest_chain_xattr-chain_xattr.$(OBJEXT)
 unittest_chain_xattr_OBJECTS = $(am_unittest_chain_xattr_OBJECTS)
 @ENABLE_SERVER_TRUE@unittest_chain_xattr_DEPENDENCIES =  \
-@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_5) \
-@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_15) \
-@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_11) \
+@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_16) \
+@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_8)
 unittest_chain_xattr_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_chain_xattr_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -4989,8 +4742,8 @@ unittest_chain_xattr_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 am_unittest_config_OBJECTS =  \
 	test/common/unittest_config-test_config.$(OBJEXT)
 unittest_config_OBJECTS = $(am_unittest_config_OBJECTS)
-unittest_config_DEPENDENCIES = $(am__DEPENDENCIES_15) \
-	$(am__DEPENDENCIES_10)
+unittest_config_DEPENDENCIES = $(am__DEPENDENCIES_16) \
+	$(am__DEPENDENCIES_8)
 unittest_config_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_config_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -4998,8 +4751,8 @@ unittest_config_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 am_unittest_confutils_OBJECTS =  \
 	test/unittest_confutils-confutils.$(OBJEXT)
 unittest_confutils_OBJECTS = $(am_unittest_confutils_OBJECTS)
-unittest_confutils_DEPENDENCIES = $(am__DEPENDENCIES_15) \
-	$(am__DEPENDENCIES_10)
+unittest_confutils_DEPENDENCIES = $(am__DEPENDENCIES_16) \
+	$(am__DEPENDENCIES_8)
 unittest_confutils_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_confutils_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -5007,8 +4760,8 @@ unittest_confutils_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 am_unittest_context_OBJECTS =  \
 	test/common/unittest_context-test_context.$(OBJEXT)
 unittest_context_OBJECTS = $(am_unittest_context_OBJECTS)
-unittest_context_DEPENDENCIES = $(am__DEPENDENCIES_15) \
-	$(am__DEPENDENCIES_10)
+unittest_context_DEPENDENCIES = $(am__DEPENDENCIES_16) \
+	$(am__DEPENDENCIES_8)
 unittest_context_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_context_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -5016,16 +4769,16 @@ unittest_context_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 am_unittest_crc32c_OBJECTS =  \
 	test/common/unittest_crc32c-test_crc32c.$(OBJEXT)
 unittest_crc32c_OBJECTS = $(am_unittest_crc32c_OBJECTS)
-unittest_crc32c_DEPENDENCIES = $(am__DEPENDENCIES_15) \
-	$(am__DEPENDENCIES_10)
+unittest_crc32c_DEPENDENCIES = $(am__DEPENDENCIES_16) \
+	$(am__DEPENDENCIES_8)
 unittest_crc32c_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_crc32c_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
 	$(LDFLAGS) -o $@
 am_unittest_crush_OBJECTS = test/crush/unittest_crush-crush.$(OBJEXT)
 unittest_crush_OBJECTS = $(am_unittest_crush_OBJECTS)
-unittest_crush_DEPENDENCIES = $(LIBCOMMON) $(am__DEPENDENCIES_15) \
-	$(am__DEPENDENCIES_2) $(am__DEPENDENCIES_10)
+unittest_crush_DEPENDENCIES = $(LIBCOMMON) $(am__DEPENDENCIES_16) \
+	$(am__DEPENDENCIES_3) $(am__DEPENDENCIES_8)
 unittest_crush_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_crush_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -5033,16 +4786,16 @@ unittest_crush_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 am_unittest_crush_wrapper_OBJECTS =  \
 	test/crush/unittest_crush_wrapper-CrushWrapper.$(OBJEXT)
 unittest_crush_wrapper_OBJECTS = $(am_unittest_crush_wrapper_OBJECTS)
-unittest_crush_wrapper_DEPENDENCIES = $(am__DEPENDENCIES_15) \
-	$(am__DEPENDENCIES_10) $(LIBCRUSH)
+unittest_crush_wrapper_DEPENDENCIES = $(am__DEPENDENCIES_16) \
+	$(am__DEPENDENCIES_8) $(LIBCRUSH)
 unittest_crush_wrapper_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_crush_wrapper_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
 	$(LDFLAGS) -o $@
 am_unittest_crypto_OBJECTS = test/unittest_crypto-crypto.$(OBJEXT)
 unittest_crypto_OBJECTS = $(am_unittest_crypto_OBJECTS)
-unittest_crypto_DEPENDENCIES = $(am__DEPENDENCIES_15) \
-	$(am__DEPENDENCIES_10)
+unittest_crypto_DEPENDENCIES = $(am__DEPENDENCIES_16) \
+	$(am__DEPENDENCIES_8)
 unittest_crypto_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_crypto_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -5050,8 +4803,8 @@ unittest_crypto_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 am_unittest_crypto_init_OBJECTS =  \
 	test/unittest_crypto_init-crypto_init.$(OBJEXT)
 unittest_crypto_init_OBJECTS = $(am_unittest_crypto_init_OBJECTS)
-unittest_crypto_init_DEPENDENCIES = $(am__DEPENDENCIES_15) \
-	$(am__DEPENDENCIES_10)
+unittest_crypto_init_DEPENDENCIES = $(am__DEPENDENCIES_16) \
+	$(am__DEPENDENCIES_8)
 unittest_crypto_init_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_crypto_init_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -5059,8 +4812,8 @@ unittest_crypto_init_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 am_unittest_daemon_config_OBJECTS =  \
 	test/unittest_daemon_config-daemon_config.$(OBJEXT)
 unittest_daemon_config_OBJECTS = $(am_unittest_daemon_config_OBJECTS)
-unittest_daemon_config_DEPENDENCIES = $(am__DEPENDENCIES_15) \
-	$(am__DEPENDENCIES_10)
+unittest_daemon_config_DEPENDENCIES = $(am__DEPENDENCIES_16) \
+	$(am__DEPENDENCIES_8)
 unittest_daemon_config_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_daemon_config_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -5069,9 +4822,9 @@ am__unittest_ecbackend_SOURCES_DIST = test/osd/TestECBackend.cc
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am_unittest_ecbackend_OBJECTS = test/osd/unittest_ecbackend-TestECBackend.$(OBJEXT)
 unittest_ecbackend_OBJECTS = $(am_unittest_ecbackend_OBJECTS)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@unittest_ecbackend_DEPENDENCIES =  \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_14) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_15) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_16) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_8)
 unittest_ecbackend_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_ecbackend_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -5081,8 +4834,8 @@ am__unittest_encoding_SOURCES_DIST = test/encoding.cc
 unittest_encoding_OBJECTS = $(am_unittest_encoding_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@unittest_encoding_DEPENDENCIES = $(LIBCEPHFS) \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS) \
-@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_10) \
-@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_15)
+@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_8) \
+@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_16)
 unittest_encoding_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_encoding_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -5093,10 +4846,10 @@ am__unittest_erasure_code_SOURCES_DIST = erasure-code/ErasureCode.cc \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	test/erasure-code/unittest_erasure_code-TestErasureCode.$(OBJEXT)
 unittest_erasure_code_OBJECTS = $(am_unittest_erasure_code_OBJECTS)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@unittest_erasure_code_DEPENDENCIES =  \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_14) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBCOMMON) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_15) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBCOMMON) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_16) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_8)
 unittest_erasure_code_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_erasure_code_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -5109,10 +4862,10 @@ am__unittest_erasure_code_example_SOURCES_DIST =  \
 unittest_erasure_code_example_OBJECTS =  \
 	$(am_unittest_erasure_code_example_OBJECTS)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@unittest_erasure_code_example_DEPENDENCIES =  \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_14) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBCOMMON) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_15) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBCOMMON) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_16) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_8)
 unittest_erasure_code_example_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_erasure_code_example_CXXFLAGS) $(CXXFLAGS) \
@@ -5124,10 +4877,10 @@ am__unittest_erasure_code_isa_SOURCES_DIST =  \
 @ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	test/erasure-code/unittest_erasure_code_isa-TestErasureCodeIsa.$(OBJEXT)
 unittest_erasure_code_isa_OBJECTS =  \
 	$(am_unittest_erasure_code_isa_OBJECTS)
-@ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@unittest_erasure_code_isa_DEPENDENCIES = $(am__DEPENDENCIES_14) \
+@ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@unittest_erasure_code_isa_DEPENDENCIES = $(am__DEPENDENCIES_15) \
 @ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	$(LIBCOMMON) \
-@ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_15) \
-@ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_10) \
+@ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_16) \
+@ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_8) \
 @ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	.libs/libec_isa.la \
 @ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	$(LIBERASURE_CODE) \
 @ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1)
@@ -5156,7 +4909,7 @@ am__unittest_erasure_code_jerasure_SOURCES_DIST =  \
 	erasure-code/jerasure/gf-complete/src/gf_w8.c \
 	erasure-code/jerasure/ErasureCodePluginJerasure.cc \
 	erasure-code/jerasure/ErasureCodeJerasure.cc
-am__objects_39 = erasure-code/unittest_erasure_code_jerasure-ErasureCode.$(OBJEXT) \
+am__objects_45 = erasure-code/unittest_erasure_code_jerasure-ErasureCode.$(OBJEXT) \
 	erasure-code/jerasure/jerasure/src/unittest_erasure_code_jerasure-cauchy.$(OBJEXT) \
 	erasure-code/jerasure/jerasure/src/unittest_erasure_code_jerasure-galois.$(OBJEXT) \
 	erasure-code/jerasure/jerasure/src/unittest_erasure_code_jerasure-jerasure.$(OBJEXT) \
@@ -5176,14 +4929,14 @@ am__objects_39 = erasure-code/unittest_erasure_code_jerasure-ErasureCode.$(OBJEX
 	erasure-code/jerasure/unittest_erasure_code_jerasure-ErasureCodePluginJerasure.$(OBJEXT) \
 	erasure-code/jerasure/unittest_erasure_code_jerasure-ErasureCodeJerasure.$(OBJEXT)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am_unittest_erasure_code_jerasure_OBJECTS = test/erasure-code/unittest_erasure_code_jerasure-TestErasureCodeJerasure.$(OBJEXT) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__objects_39)
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__objects_45)
 unittest_erasure_code_jerasure_OBJECTS =  \
 	$(am_unittest_erasure_code_jerasure_OBJECTS)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@unittest_erasure_code_jerasure_DEPENDENCIES =  \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_14) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBCOMMON) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_15) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_10) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBCOMMON) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_16) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_8) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1)
 unittest_erasure_code_jerasure_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
@@ -5194,19 +4947,19 @@ am__unittest_erasure_code_lrc_SOURCES_DIST =  \
 	erasure-code/ErasureCode.cc \
 	erasure-code/lrc/ErasureCodePluginLrc.cc \
 	erasure-code/lrc/ErasureCodeLrc.cc
-am__objects_40 =  \
+am__objects_46 =  \
 	erasure-code/unittest_erasure_code_lrc-ErasureCode.$(OBJEXT) \
 	erasure-code/lrc/unittest_erasure_code_lrc-ErasureCodePluginLrc.$(OBJEXT) \
 	erasure-code/lrc/unittest_erasure_code_lrc-ErasureCodeLrc.$(OBJEXT)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am_unittest_erasure_code_lrc_OBJECTS = test/erasure-code/unittest_erasure_code_lrc-TestErasureCodeLrc.$(OBJEXT) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__objects_40)
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__objects_46)
 unittest_erasure_code_lrc_OBJECTS =  \
 	$(am_unittest_erasure_code_lrc_OBJECTS)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@unittest_erasure_code_lrc_DEPENDENCIES =  \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_14) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBCOMMON) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_15) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_10) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBCOMMON) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_16) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_8) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1)
 unittest_erasure_code_lrc_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
@@ -5220,10 +4973,10 @@ am__unittest_erasure_code_plugin_SOURCES_DIST =  \
 unittest_erasure_code_plugin_OBJECTS =  \
 	$(am_unittest_erasure_code_plugin_OBJECTS)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@unittest_erasure_code_plugin_DEPENDENCIES =  \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_14) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBCOMMON) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_15) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_10) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBCOMMON) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_16) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_8) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1)
 unittest_erasure_code_plugin_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
@@ -5236,10 +4989,10 @@ am__unittest_erasure_code_plugin_isa_SOURCES_DIST =  \
 @ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	test/erasure-code/unittest_erasure_code_plugin_isa-TestErasureCodePluginIsa.$(OBJEXT)
 unittest_erasure_code_plugin_isa_OBJECTS =  \
 	$(am_unittest_erasure_code_plugin_isa_OBJECTS)
-@ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@unittest_erasure_code_plugin_isa_DEPENDENCIES = $(am__DEPENDENCIES_14) \
+@ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@unittest_erasure_code_plugin_isa_DEPENDENCIES = $(am__DEPENDENCIES_15) \
 @ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	$(LIBCOMMON) \
-@ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_15) \
-@ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_10) \
+@ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_16) \
+@ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_8) \
 @ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	.libs/libec_isa.la \
 @ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	$(LIBERASURE_CODE) \
 @ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1)
@@ -5253,10 +5006,10 @@ am__unittest_erasure_code_plugin_jerasure_SOURCES_DIST =  \
 unittest_erasure_code_plugin_jerasure_OBJECTS =  \
 	$(am_unittest_erasure_code_plugin_jerasure_OBJECTS)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@unittest_erasure_code_plugin_jerasure_DEPENDENCIES =  \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_14) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBCOMMON) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_15) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_10) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBCOMMON) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_16) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_8) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1)
 unittest_erasure_code_plugin_jerasure_LINK = $(LIBTOOL) $(AM_V_lt) \
 	--tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link \
@@ -5268,10 +5021,10 @@ am__unittest_erasure_code_plugin_lrc_SOURCES_DIST =  \
 unittest_erasure_code_plugin_lrc_OBJECTS =  \
 	$(am_unittest_erasure_code_plugin_lrc_OBJECTS)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@unittest_erasure_code_plugin_lrc_DEPENDENCIES =  \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_14) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBCOMMON) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_15) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_10) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBCOMMON) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_16) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_8) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1)
 unittest_erasure_code_plugin_lrc_LINK = $(LIBTOOL) $(AM_V_lt) \
 	--tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link \
@@ -5283,10 +5036,10 @@ am__unittest_erasure_code_plugin_shec_SOURCES_DIST =  \
 unittest_erasure_code_plugin_shec_OBJECTS =  \
 	$(am_unittest_erasure_code_plugin_shec_OBJECTS)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@unittest_erasure_code_plugin_shec_DEPENDENCIES =  \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_14) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBCOMMON) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_15) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_10) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBCOMMON) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_16) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_8) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1)
 unittest_erasure_code_plugin_shec_LINK = $(LIBTOOL) $(AM_V_lt) \
 	--tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link \
@@ -5315,7 +5068,7 @@ am__unittest_erasure_code_shec_SOURCES_DIST =  \
 	erasure-code/jerasure/gf-complete/src/gf_w4.c \
 	erasure-code/jerasure/gf-complete/src/gf_rand.c \
 	erasure-code/jerasure/gf-complete/src/gf_w8.c
-am__objects_41 =  \
+am__objects_47 =  \
 	erasure-code/unittest_erasure_code_shec-ErasureCode.$(OBJEXT) \
 	erasure-code/shec/unittest_erasure_code_shec-ErasureCodePluginShec.$(OBJEXT) \
 	erasure-code/shec/unittest_erasure_code_shec-ErasureCodeShec.$(OBJEXT) \
@@ -5338,14 +5091,14 @@ am__objects_41 =  \
 	erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec-gf_rand.$(OBJEXT) \
 	erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec-gf_w8.$(OBJEXT)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am_unittest_erasure_code_shec_OBJECTS = test/erasure-code/unittest_erasure_code_shec-TestErasureCodeShec.$(OBJEXT) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__objects_41)
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__objects_47)
 unittest_erasure_code_shec_OBJECTS =  \
 	$(am_unittest_erasure_code_shec_OBJECTS)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@unittest_erasure_code_shec_DEPENDENCIES =  \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_14) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBCOMMON) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_15) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_10) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBCOMMON) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_16) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_8) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1)
 unittest_erasure_code_shec_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
@@ -5374,7 +5127,7 @@ am__unittest_erasure_code_shec_all_SOURCES_DIST =  \
 	erasure-code/jerasure/gf-complete/src/gf_w4.c \
 	erasure-code/jerasure/gf-complete/src/gf_rand.c \
 	erasure-code/jerasure/gf-complete/src/gf_w8.c
-am__objects_42 = erasure-code/unittest_erasure_code_shec_all-ErasureCode.$(OBJEXT) \
+am__objects_48 = erasure-code/unittest_erasure_code_shec_all-ErasureCode.$(OBJEXT) \
 	erasure-code/shec/unittest_erasure_code_shec_all-ErasureCodePluginShec.$(OBJEXT) \
 	erasure-code/shec/unittest_erasure_code_shec_all-ErasureCodeShec.$(OBJEXT) \
 	erasure-code/shec/unittest_erasure_code_shec_all-ErasureCodeShecTableCache.$(OBJEXT) \
@@ -5396,14 +5149,14 @@ am__objects_42 = erasure-code/unittest_erasure_code_shec_all-ErasureCode.$(OBJEX
 	erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_all-gf_rand.$(OBJEXT) \
 	erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_all-gf_w8.$(OBJEXT)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am_unittest_erasure_code_shec_all_OBJECTS = test/erasure-code/unittest_erasure_code_shec_all-TestErasureCodeShec_all.$(OBJEXT) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__objects_42)
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__objects_48)
 unittest_erasure_code_shec_all_OBJECTS =  \
 	$(am_unittest_erasure_code_shec_all_OBJECTS)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@unittest_erasure_code_shec_all_DEPENDENCIES =  \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_14) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBCOMMON) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_15) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_10) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBCOMMON) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_16) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_8) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1)
 unittest_erasure_code_shec_all_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
@@ -5432,7 +5185,7 @@ am__unittest_erasure_code_shec_arguments_SOURCES_DIST =  \
 	erasure-code/jerasure/gf-complete/src/gf_w4.c \
 	erasure-code/jerasure/gf-complete/src/gf_rand.c \
 	erasure-code/jerasure/gf-complete/src/gf_w8.c
-am__objects_43 = erasure-code/unittest_erasure_code_shec_arguments-ErasureCode.$(OBJEXT) \
+am__objects_49 = erasure-code/unittest_erasure_code_shec_arguments-ErasureCode.$(OBJEXT) \
 	erasure-code/shec/unittest_erasure_code_shec_arguments-ErasureCodePluginShec.$(OBJEXT) \
 	erasure-code/shec/unittest_erasure_code_shec_arguments-ErasureCodeShec.$(OBJEXT) \
 	erasure-code/shec/unittest_erasure_code_shec_arguments-ErasureCodeShecTableCache.$(OBJEXT) \
@@ -5454,14 +5207,14 @@ am__objects_43 = erasure-code/unittest_erasure_code_shec_arguments-ErasureCode.$
 	erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_rand.$(OBJEXT) \
 	erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_arguments-gf_w8.$(OBJEXT)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am_unittest_erasure_code_shec_arguments_OBJECTS = test/erasure-code/unittest_erasure_code_shec_arguments-TestErasureCodeShec_arguments.$(OBJEXT) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__objects_43)
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__objects_49)
 unittest_erasure_code_shec_arguments_OBJECTS =  \
 	$(am_unittest_erasure_code_shec_arguments_OBJECTS)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@unittest_erasure_code_shec_arguments_DEPENDENCIES =  \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_14) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBCOMMON) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_15) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_10) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBCOMMON) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_16) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_8) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1)
 unittest_erasure_code_shec_arguments_LINK = $(LIBTOOL) $(AM_V_lt) \
 	--tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link \
@@ -5490,7 +5243,7 @@ am__unittest_erasure_code_shec_thread_SOURCES_DIST =  \
 	erasure-code/jerasure/gf-complete/src/gf_w4.c \
 	erasure-code/jerasure/gf-complete/src/gf_rand.c \
 	erasure-code/jerasure/gf-complete/src/gf_w8.c
-am__objects_44 = erasure-code/unittest_erasure_code_shec_thread-ErasureCode.$(OBJEXT) \
+am__objects_50 = erasure-code/unittest_erasure_code_shec_thread-ErasureCode.$(OBJEXT) \
 	erasure-code/shec/unittest_erasure_code_shec_thread-ErasureCodePluginShec.$(OBJEXT) \
 	erasure-code/shec/unittest_erasure_code_shec_thread-ErasureCodeShec.$(OBJEXT) \
 	erasure-code/shec/unittest_erasure_code_shec_thread-ErasureCodeShecTableCache.$(OBJEXT) \
@@ -5512,14 +5265,14 @@ am__objects_44 = erasure-code/unittest_erasure_code_shec_thread-ErasureCode.$(OB
 	erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_thread-gf_rand.$(OBJEXT) \
 	erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_thread-gf_w8.$(OBJEXT)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am_unittest_erasure_code_shec_thread_OBJECTS = test/erasure-code/unittest_erasure_code_shec_thread-TestErasureCodeShec_thread.$(OBJEXT) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__objects_44)
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__objects_50)
 unittest_erasure_code_shec_thread_OBJECTS =  \
 	$(am_unittest_erasure_code_shec_thread_OBJECTS)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@unittest_erasure_code_shec_thread_DEPENDENCIES =  \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_14) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBCOMMON) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_15) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_10) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBCOMMON) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_16) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_8) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1)
 unittest_erasure_code_shec_thread_LINK = $(LIBTOOL) $(AM_V_lt) \
 	--tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link \
@@ -5527,8 +5280,8 @@ unittest_erasure_code_shec_thread_LINK = $(LIBTOOL) $(AM_V_lt) \
 	$(CXXFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@
 am_unittest_escape_OBJECTS = test/unittest_escape-escape.$(OBJEXT)
 unittest_escape_OBJECTS = $(am_unittest_escape_OBJECTS)
-unittest_escape_DEPENDENCIES = $(am__DEPENDENCIES_15) \
-	$(am__DEPENDENCIES_10)
+unittest_escape_DEPENDENCIES = $(am__DEPENDENCIES_16) \
+	$(am__DEPENDENCIES_8)
 unittest_escape_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_escape_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -5537,16 +5290,16 @@ am_unittest_formatter_OBJECTS =  \
 	test/unittest_formatter-formatter.$(OBJEXT) \
 	rgw/unittest_formatter-rgw_formats.$(OBJEXT)
 unittest_formatter_OBJECTS = $(am_unittest_formatter_OBJECTS)
-unittest_formatter_DEPENDENCIES = $(am__DEPENDENCIES_15) \
-	$(am__DEPENDENCIES_10)
+unittest_formatter_DEPENDENCIES = $(am__DEPENDENCIES_16) \
+	$(am__DEPENDENCIES_8)
 unittest_formatter_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_formatter_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
 	$(LDFLAGS) -o $@
 am_unittest_gather_OBJECTS = test/unittest_gather-gather.$(OBJEXT)
 unittest_gather_OBJECTS = $(am_unittest_gather_OBJECTS)
-unittest_gather_DEPENDENCIES = $(am__DEPENDENCIES_15) \
-	$(am__DEPENDENCIES_10)
+unittest_gather_DEPENDENCIES = $(am__DEPENDENCIES_16) \
+	$(am__DEPENDENCIES_8)
 unittest_gather_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_gather_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -5555,7 +5308,7 @@ am_unittest_heartbeatmap_OBJECTS =  \
 	test/unittest_heartbeatmap-heartbeat_map.$(OBJEXT)
 unittest_heartbeatmap_OBJECTS = $(am_unittest_heartbeatmap_OBJECTS)
 unittest_heartbeatmap_DEPENDENCIES = $(LIBCOMMON) \
-	$(am__DEPENDENCIES_15) $(am__DEPENDENCIES_10)
+	$(am__DEPENDENCIES_16) $(am__DEPENDENCIES_8)
 unittest_heartbeatmap_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_heartbeatmap_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -5563,8 +5316,8 @@ unittest_heartbeatmap_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 am_unittest_histogram_OBJECTS =  \
 	test/common/unittest_histogram-histogram.$(OBJEXT)
 unittest_histogram_OBJECTS = $(am_unittest_histogram_OBJECTS)
-unittest_histogram_DEPENDENCIES = $(am__DEPENDENCIES_15) \
-	$(am__DEPENDENCIES_10)
+unittest_histogram_DEPENDENCIES = $(am__DEPENDENCIES_16) \
+	$(am__DEPENDENCIES_8)
 unittest_histogram_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_histogram_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -5573,9 +5326,9 @@ am__unittest_hitset_SOURCES_DIST = test/osd/hitset.cc
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am_unittest_hitset_OBJECTS = test/osd/unittest_hitset-hitset.$(OBJEXT)
 unittest_hitset_OBJECTS = $(am_unittest_hitset_OBJECTS)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@unittest_hitset_DEPENDENCIES =  \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_14) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_15) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_16) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_8)
 unittest_hitset_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_hitset_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -5583,8 +5336,8 @@ unittest_hitset_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 am_unittest_io_priority_OBJECTS =  \
 	test/common/unittest_io_priority-test_io_priority.$(OBJEXT)
 unittest_io_priority_OBJECTS = $(am_unittest_io_priority_OBJECTS)
-unittest_io_priority_DEPENDENCIES = $(am__DEPENDENCIES_15) \
-	$(am__DEPENDENCIES_10)
+unittest_io_priority_DEPENDENCIES = $(am__DEPENDENCIES_16) \
+	$(am__DEPENDENCIES_8)
 unittest_io_priority_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_io_priority_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -5592,19 +5345,53 @@ unittest_io_priority_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 am_unittest_ipaddr_OBJECTS =  \
 	test/unittest_ipaddr-test_ipaddr.$(OBJEXT)
 unittest_ipaddr_OBJECTS = $(am_unittest_ipaddr_OBJECTS)
-unittest_ipaddr_DEPENDENCIES = $(am__DEPENDENCIES_15) \
-	$(am__DEPENDENCIES_10)
+unittest_ipaddr_DEPENDENCIES = $(am__DEPENDENCIES_16) \
+	$(am__DEPENDENCIES_8)
 unittest_ipaddr_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_ipaddr_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
 	$(LDFLAGS) -o $@
+am__unittest_journal_SOURCES_DIST = test/journal/test_main.cc \
+	test/journal/test_Entry.cc test/journal/test_FutureImpl.cc \
+	test/journal/test_Journaler.cc \
+	test/journal/test_JournalMetadata.cc \
+	test/journal/test_JournalPlayer.cc \
+	test/journal/test_JournalRecorder.cc \
+	test/journal/test_JournalTrimmer.cc \
+	test/journal/test_ObjectPlayer.cc \
+	test/journal/test_ObjectRecorder.cc \
+	test/journal/RadosTestFixture.cc
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@am_unittest_journal_OBJECTS = test/journal/unittest_journal-test_main.$(OBJEXT) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	test/journal/unittest_journal-test_Entry.$(OBJEXT) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	test/journal/unittest_journal-test_FutureImpl.$(OBJEXT) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	test/journal/unittest_journal-test_Journaler.$(OBJEXT) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	test/journal/unittest_journal-test_JournalMetadata.$(OBJEXT) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	test/journal/unittest_journal-test_JournalPlayer.$(OBJEXT) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	test/journal/unittest_journal-test_JournalRecorder.$(OBJEXT) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	test/journal/unittest_journal-test_JournalTrimmer.$(OBJEXT) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	test/journal/unittest_journal-test_ObjectPlayer.$(OBJEXT) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	test/journal/unittest_journal-test_ObjectRecorder.$(OBJEXT) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	test/journal/unittest_journal-RadosTestFixture.$(OBJEXT)
+unittest_journal_OBJECTS = $(am_unittest_journal_OBJECTS)
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@unittest_journal_DEPENDENCIES =  \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	libjournal.la \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	libcls_journal_client.la \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	librados_test_stub.la \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	librados_internal.la \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_16) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_8) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(RADOS_TEST_LDADD)
+unittest_journal_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
+	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
+	$(unittest_journal_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
+	$(LDFLAGS) -o $@
 am__unittest_lfnindex_SOURCES_DIST = test/os/TestLFNIndex.cc
 @ENABLE_SERVER_TRUE@am_unittest_lfnindex_OBJECTS = test/os/unittest_lfnindex-TestLFNIndex.$(OBJEXT)
 unittest_lfnindex_OBJECTS = $(am_unittest_lfnindex_OBJECTS)
 @ENABLE_SERVER_TRUE@unittest_lfnindex_DEPENDENCIES =  \
-@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_5) \
-@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_15) \
-@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_11) \
+@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_16) \
+@ENABLE_SERVER_TRUE@	$(am__DEPENDENCIES_8)
 unittest_lfnindex_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_lfnindex_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -5614,8 +5401,8 @@ am__unittest_libcephfs_config_SOURCES_DIST = test/libcephfs_config.cc
 unittest_libcephfs_config_OBJECTS =  \
 	$(am_unittest_libcephfs_config_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@unittest_libcephfs_config_DEPENDENCIES = $(LIBCEPHFS) \
-@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_10) \
-@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_15)
+@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_8) \
+@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_16)
 unittest_libcephfs_config_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_libcephfs_config_CXXFLAGS) $(CXXFLAGS) \
@@ -5625,8 +5412,8 @@ am__unittest_librados_SOURCES_DIST = test/librados/librados.cc
 unittest_librados_OBJECTS = $(am_unittest_librados_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@unittest_librados_DEPENDENCIES =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_10) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_15)
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_8) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_16)
 unittest_librados_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_librados_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -5638,8 +5425,8 @@ unittest_librados_config_OBJECTS =  \
 	$(am_unittest_librados_config_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@unittest_librados_config_DEPENDENCIES =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(LIBRADOS) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_10) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_15)
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_8) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_16)
 unittest_librados_config_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_librados_config_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -5655,11 +5442,13 @@ unittest_librbd_OBJECTS = $(am_unittest_librbd_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRBD_TYPES) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	libcls_rbd_client.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	libcls_lock_client.la \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	libjournal.la \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	libcls_journal_client.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librados_test_stub.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librados_internal.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBOSDC) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__DEPENDENCIES_15) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__DEPENDENCIES_10) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__DEPENDENCIES_16) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__DEPENDENCIES_8) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(RADOS_TEST_LDADD)
 unittest_librbd_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
@@ -5667,14 +5456,14 @@ unittest_librbd_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(LDFLAGS) -o $@
 am_unittest_log_OBJECTS = log/unittest_log-test.$(OBJEXT)
 unittest_log_OBJECTS = $(am_unittest_log_OBJECTS)
-unittest_log_DEPENDENCIES = $(LIBCOMMON) $(am__DEPENDENCIES_15)
+unittest_log_DEPENDENCIES = $(LIBCOMMON) $(am__DEPENDENCIES_16)
 unittest_log_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \
 	$(LIBTOOLFLAGS) --mode=link $(CXXLD) $(unittest_log_CXXFLAGS) \
 	$(CXXFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@
 am_unittest_lru_OBJECTS = test/common/unittest_lru-test_lru.$(OBJEXT)
 unittest_lru_OBJECTS = $(am_unittest_lru_OBJECTS)
-unittest_lru_DEPENDENCIES = $(am__DEPENDENCIES_15) \
-	$(am__DEPENDENCIES_10)
+unittest_lru_DEPENDENCIES = $(am__DEPENDENCIES_16) \
+	$(am__DEPENDENCIES_8)
 unittest_lru_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \
 	$(LIBTOOLFLAGS) --mode=link $(CXXLD) $(unittest_lru_CXXFLAGS) \
 	$(CXXFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@
@@ -5682,9 +5471,9 @@ am__unittest_mds_authcap_SOURCES_DIST = test/mds/TestMDSAuthCaps.cc
 @ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@am_unittest_mds_authcap_OBJECTS = test/mds/unittest_mds_authcap-TestMDSAuthCaps.$(OBJEXT)
 unittest_mds_authcap_OBJECTS = $(am_unittest_mds_authcap_OBJECTS)
 @ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@unittest_mds_authcap_DEPENDENCIES =  \
-@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@	$(am__DEPENDENCIES_12) \
-@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@	$(am__DEPENDENCIES_15) \
-@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@	$(am__DEPENDENCIES_13) \
+@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@	$(am__DEPENDENCIES_16) \
+@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@	$(am__DEPENDENCIES_8)
 unittest_mds_authcap_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_mds_authcap_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -5692,16 +5481,16 @@ unittest_mds_authcap_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 am_unittest_mds_types_OBJECTS =  \
 	test/fs/unittest_mds_types-mds_types.$(OBJEXT)
 unittest_mds_types_OBJECTS = $(am_unittest_mds_types_OBJECTS)
-unittest_mds_types_DEPENDENCIES = $(am__DEPENDENCIES_15) \
-	$(am__DEPENDENCIES_10)
+unittest_mds_types_DEPENDENCIES = $(am__DEPENDENCIES_16) \
+	$(am__DEPENDENCIES_8)
 unittest_mds_types_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_mds_types_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
 	$(LDFLAGS) -o $@
 am_unittest_mime_OBJECTS = test/unittest_mime-mime.$(OBJEXT)
 unittest_mime_OBJECTS = $(am_unittest_mime_OBJECTS)
-unittest_mime_DEPENDENCIES = $(am__DEPENDENCIES_15) \
-	$(am__DEPENDENCIES_10)
+unittest_mime_DEPENDENCIES = $(am__DEPENDENCIES_16) \
+	$(am__DEPENDENCIES_8)
 unittest_mime_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_mime_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) $(LDFLAGS) \
@@ -5710,9 +5499,9 @@ am__unittest_mon_moncap_SOURCES_DIST = test/mon/moncap.cc
 @ENABLE_SERVER_TRUE@@WITH_MON_TRUE@am_unittest_mon_moncap_OBJECTS = test/mon/unittest_mon_moncap-moncap.$(OBJEXT)
 unittest_mon_moncap_OBJECTS = $(am_unittest_mon_moncap_OBJECTS)
 @ENABLE_SERVER_TRUE@@WITH_MON_TRUE@unittest_mon_moncap_DEPENDENCIES =  \
-@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	$(am__DEPENDENCIES_13) \
-@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	$(am__DEPENDENCIES_15) \
-@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	$(am__DEPENDENCIES_14) \
+@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	$(am__DEPENDENCIES_16) \
+@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	$(am__DEPENDENCIES_8)
 unittest_mon_moncap_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_mon_moncap_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -5721,9 +5510,9 @@ am__unittest_mon_pgmap_SOURCES_DIST = test/mon/PGMap.cc
 @ENABLE_SERVER_TRUE@@WITH_MON_TRUE@am_unittest_mon_pgmap_OBJECTS = test/mon/unittest_mon_pgmap-PGMap.$(OBJEXT)
 unittest_mon_pgmap_OBJECTS = $(am_unittest_mon_pgmap_OBJECTS)
 @ENABLE_SERVER_TRUE@@WITH_MON_TRUE@unittest_mon_pgmap_DEPENDENCIES =  \
-@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	$(am__DEPENDENCIES_13) \
-@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	$(am__DEPENDENCIES_15) \
-@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	$(am__DEPENDENCIES_14) \
+@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	$(am__DEPENDENCIES_16) \
+@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	$(am__DEPENDENCIES_8)
 unittest_mon_pgmap_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_mon_pgmap_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -5735,9 +5524,9 @@ am__unittest_osd_osdcap_SOURCES_DIST = test/osd/osdcap.cc
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am_unittest_osd_osdcap_OBJECTS = test/osd/unittest_osd_osdcap-osdcap.$(OBJEXT)
 unittest_osd_osdcap_OBJECTS = $(am_unittest_osd_osdcap_OBJECTS)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@unittest_osd_osdcap_DEPENDENCIES =  \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_14) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_15) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_16) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_8)
 unittest_osd_osdcap_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_osd_osdcap_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -5745,8 +5534,8 @@ unittest_osd_osdcap_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 am_unittest_osd_types_OBJECTS =  \
 	test/osd/unittest_osd_types-types.$(OBJEXT)
 unittest_osd_types_OBJECTS = $(am_unittest_osd_types_OBJECTS)
-unittest_osd_types_DEPENDENCIES = $(am__DEPENDENCIES_15) \
-	$(am__DEPENDENCIES_10)
+unittest_osd_types_DEPENDENCIES = $(am__DEPENDENCIES_16) \
+	$(am__DEPENDENCIES_8)
 unittest_osd_types_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_osd_types_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -5754,8 +5543,8 @@ unittest_osd_types_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 am_unittest_osdmap_OBJECTS =  \
 	test/osd/unittest_osdmap-TestOSDMap.$(OBJEXT)
 unittest_osdmap_OBJECTS = $(am_unittest_osdmap_OBJECTS)
-unittest_osdmap_DEPENDENCIES = $(am__DEPENDENCIES_15) $(LIBCOMMON) \
-	$(am__DEPENDENCIES_10)
+unittest_osdmap_DEPENDENCIES = $(am__DEPENDENCIES_16) $(LIBCOMMON) \
+	$(am__DEPENDENCIES_8)
 unittest_osdmap_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_osdmap_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -5764,9 +5553,9 @@ am__unittest_osdscrub_SOURCES_DIST = test/osd/TestOSDScrub.cc
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am_unittest_osdscrub_OBJECTS = test/osd/unittest_osdscrub-TestOSDScrub.$(OBJEXT)
 unittest_osdscrub_OBJECTS = $(am_unittest_osdscrub_OBJECTS)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@unittest_osdscrub_DEPENDENCIES =  \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_14) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_15) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_10) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_16) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_8) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1)
 unittest_osdscrub_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
@@ -5776,7 +5565,7 @@ am__unittest_pageset_SOURCES_DIST = test/test_pageset.cc
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am_unittest_pageset_OBJECTS = test/unittest_pageset-test_pageset.$(OBJEXT)
 unittest_pageset_OBJECTS = $(am_unittest_pageset_OBJECTS)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@unittest_pageset_DEPENDENCIES =  \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_15)
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_16)
 unittest_pageset_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_pageset_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -5784,8 +5573,8 @@ unittest_pageset_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 am_unittest_perf_counters_OBJECTS =  \
 	test/unittest_perf_counters-perf_counters.$(OBJEXT)
 unittest_perf_counters_OBJECTS = $(am_unittest_perf_counters_OBJECTS)
-unittest_perf_counters_DEPENDENCIES = $(am__DEPENDENCIES_15) \
-	$(am__DEPENDENCIES_10)
+unittest_perf_counters_DEPENDENCIES = $(am__DEPENDENCIES_16) \
+	$(am__DEPENDENCIES_8)
 unittest_perf_counters_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_perf_counters_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -5794,9 +5583,9 @@ am__unittest_pglog_SOURCES_DIST = test/osd/TestPGLog.cc
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@am_unittest_pglog_OBJECTS = test/osd/unittest_pglog-TestPGLog.$(OBJEXT)
 unittest_pglog_OBJECTS = $(am_unittest_pglog_OBJECTS)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@unittest_pglog_DEPENDENCIES =  \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_14) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_15) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_10) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_16) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_8) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__DEPENDENCIES_1)
 unittest_pglog_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
@@ -5806,7 +5595,7 @@ am_unittest_prebufferedstreambuf_OBJECTS = test/unittest_prebufferedstreambuf-te
 unittest_prebufferedstreambuf_OBJECTS =  \
 	$(am_unittest_prebufferedstreambuf_OBJECTS)
 unittest_prebufferedstreambuf_DEPENDENCIES = $(LIBCOMMON) \
-	$(am__DEPENDENCIES_15) $(am__DEPENDENCIES_2)
+	$(am__DEPENDENCIES_16) $(am__DEPENDENCIES_3)
 unittest_prebufferedstreambuf_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_prebufferedstreambuf_CXXFLAGS) $(CXXFLAGS) \
@@ -5814,8 +5603,8 @@ unittest_prebufferedstreambuf_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 am_unittest_prioritized_queue_OBJECTS = test/common/unittest_prioritized_queue-test_prioritized_queue.$(OBJEXT)
 unittest_prioritized_queue_OBJECTS =  \
 	$(am_unittest_prioritized_queue_OBJECTS)
-unittest_prioritized_queue_DEPENDENCIES = $(am__DEPENDENCIES_15) \
-	$(am__DEPENDENCIES_10)
+unittest_prioritized_queue_DEPENDENCIES = $(am__DEPENDENCIES_16) \
+	$(am__DEPENDENCIES_8)
 unittest_prioritized_queue_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_prioritized_queue_CXXFLAGS) $(CXXFLAGS) \
@@ -5826,10 +5615,10 @@ unittest_rbd_replay_OBJECTS = $(am_unittest_rbd_replay_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@unittest_rbd_replay_DEPENDENCIES =  \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRBD) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRADOS) \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__DEPENDENCIES_10) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__DEPENDENCIES_8) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd_replay.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd_replay_ios.la \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__DEPENDENCIES_15)
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__DEPENDENCIES_16)
 unittest_rbd_replay_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_rbd_replay_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -5837,8 +5626,8 @@ unittest_rbd_replay_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 am_unittest_readahead_OBJECTS =  \
 	test/common/unittest_readahead-Readahead.$(OBJEXT)
 unittest_readahead_OBJECTS = $(am_unittest_readahead_OBJECTS)
-unittest_readahead_DEPENDENCIES = $(am__DEPENDENCIES_15) \
-	$(am__DEPENDENCIES_10)
+unittest_readahead_DEPENDENCIES = $(am__DEPENDENCIES_16) \
+	$(am__DEPENDENCIES_8)
 unittest_readahead_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_readahead_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -5848,9 +5637,9 @@ am__unittest_rocksdb_option_SOURCES_DIST =  \
 @ENABLE_SERVER_TRUE@@WITH_DLIBROCKSDB_TRUE@am_unittest_rocksdb_option_OBJECTS = test/objectstore/unittest_rocksdb_option-TestRocksdbOptionParse.$(OBJEXT)
 unittest_rocksdb_option_OBJECTS =  \
 	$(am_unittest_rocksdb_option_OBJECTS)
-@ENABLE_SERVER_TRUE@@WITH_DLIBROCKSDB_TRUE@unittest_rocksdb_option_DEPENDENCIES = $(am__DEPENDENCIES_5) \
-@ENABLE_SERVER_TRUE@@WITH_DLIBROCKSDB_TRUE@	$(am__DEPENDENCIES_15) \
-@ENABLE_SERVER_TRUE@@WITH_DLIBROCKSDB_TRUE@	$(am__DEPENDENCIES_10)
+@ENABLE_SERVER_TRUE@@WITH_DLIBROCKSDB_TRUE@unittest_rocksdb_option_DEPENDENCIES = $(am__DEPENDENCIES_11) \
+@ENABLE_SERVER_TRUE@@WITH_DLIBROCKSDB_TRUE@	$(am__DEPENDENCIES_16) \
+@ENABLE_SERVER_TRUE@@WITH_DLIBROCKSDB_TRUE@	$(am__DEPENDENCIES_8)
 unittest_rocksdb_option_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_rocksdb_option_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -5860,10 +5649,9 @@ am__unittest_rocksdb_option_static_SOURCES_DIST =  \
 @ENABLE_SERVER_TRUE@@WITH_SLIBROCKSDB_TRUE@am_unittest_rocksdb_option_static_OBJECTS = test/objectstore/unittest_rocksdb_option_static-TestRocksdbOptionParse.$(OBJEXT)
 unittest_rocksdb_option_static_OBJECTS =  \
 	$(am_unittest_rocksdb_option_static_OBJECTS)
-@ENABLE_SERVER_TRUE@@WITH_SLIBROCKSDB_TRUE@unittest_rocksdb_option_static_DEPENDENCIES = $(am__DEPENDENCIES_5) \
-@ENABLE_SERVER_TRUE@@WITH_SLIBROCKSDB_TRUE@	$(am__DEPENDENCIES_15) \
-@ENABLE_SERVER_TRUE@@WITH_SLIBROCKSDB_TRUE@	$(am__DEPENDENCIES_10) \
-@ENABLE_SERVER_TRUE@@WITH_SLIBROCKSDB_TRUE@	rocksdb/librocksdb.la
+@ENABLE_SERVER_TRUE@@WITH_SLIBROCKSDB_TRUE@unittest_rocksdb_option_static_DEPENDENCIES = $(am__DEPENDENCIES_11) \
+@ENABLE_SERVER_TRUE@@WITH_SLIBROCKSDB_TRUE@	$(am__DEPENDENCIES_16) \
+@ENABLE_SERVER_TRUE@@WITH_SLIBROCKSDB_TRUE@	$(am__DEPENDENCIES_8)
 unittest_rocksdb_option_static_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_rocksdb_option_static_CXXFLAGS) $(CXXFLAGS) \
@@ -5872,8 +5660,8 @@ am__unittest_run_cmd_SOURCES_DIST = test/run_cmd.cc
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@am_unittest_run_cmd_OBJECTS = test/unittest_run_cmd-run_cmd.$(OBJEXT)
 unittest_run_cmd_OBJECTS = $(am_unittest_run_cmd_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@unittest_run_cmd_DEPENDENCIES = $(LIBCEPHFS) \
-@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_10) \
-@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_15)
+@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_8) \
+@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_16)
 unittest_run_cmd_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_run_cmd_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -5881,8 +5669,8 @@ unittest_run_cmd_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 am_unittest_safe_io_OBJECTS =  \
 	test/common/unittest_safe_io-test_safe_io.$(OBJEXT)
 unittest_safe_io_OBJECTS = $(am_unittest_safe_io_OBJECTS)
-unittest_safe_io_DEPENDENCIES = $(am__DEPENDENCIES_15) \
-	$(am__DEPENDENCIES_10)
+unittest_safe_io_DEPENDENCIES = $(am__DEPENDENCIES_16) \
+	$(am__DEPENDENCIES_8)
 unittest_safe_io_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_safe_io_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -5890,8 +5678,8 @@ unittest_safe_io_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 am_unittest_shared_cache_OBJECTS =  \
 	test/common/unittest_shared_cache-test_shared_cache.$(OBJEXT)
 unittest_shared_cache_OBJECTS = $(am_unittest_shared_cache_OBJECTS)
-unittest_shared_cache_DEPENDENCIES = $(am__DEPENDENCIES_15) \
-	$(am__DEPENDENCIES_10)
+unittest_shared_cache_DEPENDENCIES = $(am__DEPENDENCIES_16) \
+	$(am__DEPENDENCIES_8)
 unittest_shared_cache_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_shared_cache_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -5899,16 +5687,16 @@ unittest_shared_cache_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 am_unittest_sharedptr_registry_OBJECTS = test/common/unittest_sharedptr_registry-test_sharedptr_registry.$(OBJEXT)
 unittest_sharedptr_registry_OBJECTS =  \
 	$(am_unittest_sharedptr_registry_OBJECTS)
-unittest_sharedptr_registry_DEPENDENCIES = $(am__DEPENDENCIES_15) \
-	$(am__DEPENDENCIES_10)
+unittest_sharedptr_registry_DEPENDENCIES = $(am__DEPENDENCIES_16) \
+	$(am__DEPENDENCIES_8)
 unittest_sharedptr_registry_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_sharedptr_registry_CXXFLAGS) $(CXXFLAGS) \
 	$(AM_LDFLAGS) $(LDFLAGS) -o $@
 am_unittest_signals_OBJECTS = test/unittest_signals-signals.$(OBJEXT)
 unittest_signals_OBJECTS = $(am_unittest_signals_OBJECTS)
-unittest_signals_DEPENDENCIES = $(am__DEPENDENCIES_15) \
-	$(am__DEPENDENCIES_10)
+unittest_signals_DEPENDENCIES = $(am__DEPENDENCIES_16) \
+	$(am__DEPENDENCIES_8)
 unittest_signals_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_signals_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -5917,8 +5705,8 @@ am__unittest_simple_spin_SOURCES_DIST = test/simple_spin.cc
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@am_unittest_simple_spin_OBJECTS = test/unittest_simple_spin-simple_spin.$(OBJEXT)
 unittest_simple_spin_OBJECTS = $(am_unittest_simple_spin_OBJECTS)
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@unittest_simple_spin_DEPENDENCIES = $(LIBCEPHFS) \
-@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_10) \
-@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_15)
+@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_8) \
+@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__DEPENDENCIES_16)
 unittest_simple_spin_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_simple_spin_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -5926,8 +5714,8 @@ unittest_simple_spin_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 am_unittest_sloppy_crc_map_OBJECTS = test/common/unittest_sloppy_crc_map-test_sloppy_crc_map.$(OBJEXT)
 unittest_sloppy_crc_map_OBJECTS =  \
 	$(am_unittest_sloppy_crc_map_OBJECTS)
-unittest_sloppy_crc_map_DEPENDENCIES = $(am__DEPENDENCIES_15) \
-	$(am__DEPENDENCIES_10)
+unittest_sloppy_crc_map_DEPENDENCIES = $(am__DEPENDENCIES_16) \
+	$(am__DEPENDENCIES_8)
 unittest_sloppy_crc_map_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_sloppy_crc_map_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -5935,8 +5723,8 @@ unittest_sloppy_crc_map_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 am_unittest_str_list_OBJECTS =  \
 	test/unittest_str_list-test_str_list.$(OBJEXT)
 unittest_str_list_OBJECTS = $(am_unittest_str_list_OBJECTS)
-unittest_str_list_DEPENDENCIES = $(am__DEPENDENCIES_15) \
-	$(am__DEPENDENCIES_10)
+unittest_str_list_DEPENDENCIES = $(am__DEPENDENCIES_16) \
+	$(am__DEPENDENCIES_8)
 unittest_str_list_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_str_list_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -5944,8 +5732,8 @@ unittest_str_list_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 am_unittest_str_map_OBJECTS =  \
 	test/common/unittest_str_map-test_str_map.$(OBJEXT)
 unittest_str_map_OBJECTS = $(am_unittest_str_map_OBJECTS)
-unittest_str_map_DEPENDENCIES = $(am__DEPENDENCIES_15) \
-	$(am__DEPENDENCIES_10)
+unittest_str_map_DEPENDENCIES = $(am__DEPENDENCIES_16) \
+	$(am__DEPENDENCIES_8)
 unittest_str_map_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_str_map_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -5953,16 +5741,16 @@ unittest_str_map_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 am_unittest_striper_OBJECTS =  \
 	test/unittest_striper-test_striper.$(OBJEXT)
 unittest_striper_OBJECTS = $(am_unittest_striper_OBJECTS)
-unittest_striper_DEPENDENCIES = $(LIBOSDC) $(am__DEPENDENCIES_15) \
-	$(am__DEPENDENCIES_10)
+unittest_striper_DEPENDENCIES = $(LIBOSDC) $(am__DEPENDENCIES_16) \
+	$(am__DEPENDENCIES_8)
 unittest_striper_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_striper_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
 	$(LDFLAGS) -o $@
 am_unittest_strtol_OBJECTS = test/unittest_strtol-strtol.$(OBJEXT)
 unittest_strtol_OBJECTS = $(am_unittest_strtol_OBJECTS)
-unittest_strtol_DEPENDENCIES = $(am__DEPENDENCIES_15) \
-	$(am__DEPENDENCIES_10)
+unittest_strtol_DEPENDENCIES = $(am__DEPENDENCIES_16) \
+	$(am__DEPENDENCIES_8)
 unittest_strtol_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_strtol_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -5970,7 +5758,7 @@ unittest_strtol_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 am_unittest_subprocess_OBJECTS =  \
 	test/unittest_subprocess-test_subprocess.$(OBJEXT)
 unittest_subprocess_OBJECTS = $(am_unittest_subprocess_OBJECTS)
-unittest_subprocess_DEPENDENCIES = $(LIBCOMMON) $(am__DEPENDENCIES_15)
+unittest_subprocess_DEPENDENCIES = $(LIBCOMMON) $(am__DEPENDENCIES_16)
 unittest_subprocess_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_subprocess_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -5978,8 +5766,8 @@ unittest_subprocess_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 am_unittest_tableformatter_OBJECTS = test/common/unittest_tableformatter-test_tableformatter.$(OBJEXT)
 unittest_tableformatter_OBJECTS =  \
 	$(am_unittest_tableformatter_OBJECTS)
-unittest_tableformatter_DEPENDENCIES = $(am__DEPENDENCIES_15) \
-	$(am__DEPENDENCIES_10)
+unittest_tableformatter_DEPENDENCIES = $(am__DEPENDENCIES_16) \
+	$(am__DEPENDENCIES_8)
 unittest_tableformatter_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_tableformatter_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -5987,7 +5775,7 @@ unittest_tableformatter_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 am_unittest_texttable_OBJECTS =  \
 	test/unittest_texttable-test_texttable.$(OBJEXT)
 unittest_texttable_OBJECTS = $(am_unittest_texttable_OBJECTS)
-unittest_texttable_DEPENDENCIES = $(LIBCOMMON) $(am__DEPENDENCIES_15)
+unittest_texttable_DEPENDENCIES = $(LIBCOMMON) $(am__DEPENDENCIES_16)
 unittest_texttable_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_texttable_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -5995,16 +5783,16 @@ unittest_texttable_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 am_unittest_throttle_OBJECTS =  \
 	test/common/unittest_throttle-Throttle.$(OBJEXT)
 unittest_throttle_OBJECTS = $(am_unittest_throttle_OBJECTS)
-unittest_throttle_DEPENDENCIES = $(am__DEPENDENCIES_15) \
-	$(am__DEPENDENCIES_10)
+unittest_throttle_DEPENDENCIES = $(am__DEPENDENCIES_16) \
+	$(am__DEPENDENCIES_8)
 unittest_throttle_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_throttle_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
 	$(LDFLAGS) -o $@
 am_unittest_utf8_OBJECTS = test/unittest_utf8-utf8.$(OBJEXT)
 unittest_utf8_OBJECTS = $(am_unittest_utf8_OBJECTS)
-unittest_utf8_DEPENDENCIES = $(am__DEPENDENCIES_15) \
-	$(am__DEPENDENCIES_10)
+unittest_utf8_DEPENDENCIES = $(am__DEPENDENCIES_16) \
+	$(am__DEPENDENCIES_8)
 unittest_utf8_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_utf8_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) $(LDFLAGS) \
@@ -6012,8 +5800,8 @@ unittest_utf8_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 am_unittest_util_OBJECTS =  \
 	test/common/unittest_util-test_util.$(OBJEXT)
 unittest_util_OBJECTS = $(am_unittest_util_OBJECTS)
-unittest_util_DEPENDENCIES = $(LIBCOMMON) $(am__DEPENDENCIES_15) \
-	$(am__DEPENDENCIES_1) $(am__DEPENDENCIES_2)
+unittest_util_DEPENDENCIES = $(LIBCOMMON) $(am__DEPENDENCIES_16) \
+	$(am__DEPENDENCIES_1) $(am__DEPENDENCIES_3)
 unittest_util_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_util_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) $(LDFLAGS) \
@@ -6021,15 +5809,15 @@ unittest_util_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 am_unittest_workqueue_OBJECTS =  \
 	test/unittest_workqueue-test_workqueue.$(OBJEXT)
 unittest_workqueue_OBJECTS = $(am_unittest_workqueue_OBJECTS)
-unittest_workqueue_DEPENDENCIES = $(am__DEPENDENCIES_15) \
-	$(am__DEPENDENCIES_10)
+unittest_workqueue_DEPENDENCIES = $(am__DEPENDENCIES_16) \
+	$(am__DEPENDENCIES_8)
 unittest_workqueue_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_workqueue_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
 	$(LDFLAGS) -o $@
 am_unittest_xlist_OBJECTS = test/unittest_xlist-test_xlist.$(OBJEXT)
 unittest_xlist_OBJECTS = $(am_unittest_xlist_OBJECTS)
-unittest_xlist_DEPENDENCIES = $(am__DEPENDENCIES_15) $(LIBCOMMON)
+unittest_xlist_DEPENDENCIES = $(am__DEPENDENCIES_16) $(LIBCOMMON)
 unittest_xlist_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX \
 	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CXXLD) \
 	$(unittest_xlist_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) \
@@ -6040,11 +5828,11 @@ am__xio_client_SOURCES_DIST = test/messenger/xio_client.cc \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	test/messenger/xio_client-xio_dispatcher.$(OBJEXT)
 xio_client_OBJECTS = $(am_xio_client_OBJECTS)
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@xio_client_DEPENDENCIES =  \
-@ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(am__DEPENDENCIES_5) \
-@ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(am__DEPENDENCIES_10) \
+@ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(am__DEPENDENCIES_11) \
+@ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(am__DEPENDENCIES_8) \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(LIBCOMMON) \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(am__DEPENDENCIES_1) \
-@ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(am__DEPENDENCIES_2) \
+@ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(am__DEPENDENCIES_3) \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(am__DEPENDENCIES_1)
 xio_client_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \
 	$(LIBTOOLFLAGS) --mode=link $(CXXLD) $(xio_client_CXXFLAGS) \
@@ -6055,11 +5843,11 @@ am__xio_server_SOURCES_DIST = test/messenger/xio_server.cc \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	test/messenger/xio_server-xio_dispatcher.$(OBJEXT)
 xio_server_OBJECTS = $(am_xio_server_OBJECTS)
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@xio_server_DEPENDENCIES =  \
-@ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(am__DEPENDENCIES_5) \
-@ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(am__DEPENDENCIES_10) \
+@ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(am__DEPENDENCIES_11) \
+@ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(am__DEPENDENCIES_8) \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(LIBCOMMON) \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(am__DEPENDENCIES_1) \
-@ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(am__DEPENDENCIES_2) \
+@ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(am__DEPENDENCIES_3) \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(am__DEPENDENCIES_1)
 xio_server_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \
 	$(LIBTOOLFLAGS) --mode=link $(CXXLD) $(xio_server_CXXFLAGS) \
@@ -6143,17 +5931,20 @@ SOURCES = $(libcls_log_client_a_SOURCES) \
 	$(libcls_statelog_client_a_SOURCES) \
 	$(libcls_timeindex_client_a_SOURCES) \
 	$(libcls_user_client_a_SOURCES) \
-	$(libcls_version_client_a_SOURCES) $(libos_zfs_a_SOURCES) \
-	$(libarch_la_SOURCES) $(libauth_la_SOURCES) \
-	$(libcephfs_la_SOURCES) $(libcephfs_jni_la_SOURCES) \
-	$(libcivetweb_la_SOURCES) $(libclient_la_SOURCES) \
-	$(libclient_fuse_la_SOURCES) $(libcls_cephfs_la_SOURCES) \
-	$(libcls_cephfs_client_la_SOURCES) $(libcls_hello_la_SOURCES) \
-	$(libcls_kvs_la_SOURCES) $(libcls_lock_la_SOURCES) \
-	$(libcls_lock_client_la_SOURCES) $(libcls_log_la_SOURCES) \
-	$(libcls_numops_la_SOURCES) $(libcls_numops_client_la_SOURCES) \
-	$(libcls_rbd_la_SOURCES) $(libcls_rbd_client_la_SOURCES) \
-	$(libcls_refcount_la_SOURCES) \
+	$(libcls_version_client_a_SOURCES) $(libkv_a_SOURCES) \
+	$(libmon_a_SOURCES) $(libos_a_SOURCES) \
+	$(libos_types_a_SOURCES) $(libos_zfs_a_SOURCES) \
+	$(libosd_a_SOURCES) $(libarch_la_SOURCES) \
+	$(libauth_la_SOURCES) $(libcephfs_la_SOURCES) \
+	$(libcephfs_jni_la_SOURCES) $(libcivetweb_la_SOURCES) \
+	$(libclient_la_SOURCES) $(libclient_fuse_la_SOURCES) \
+	$(libcls_cephfs_la_SOURCES) $(libcls_cephfs_client_la_SOURCES) \
+	$(libcls_hello_la_SOURCES) $(libcls_journal_la_SOURCES) \
+	$(libcls_journal_client_la_SOURCES) $(libcls_kvs_la_SOURCES) \
+	$(libcls_lock_la_SOURCES) $(libcls_lock_client_la_SOURCES) \
+	$(libcls_log_la_SOURCES) $(libcls_numops_la_SOURCES) \
+	$(libcls_numops_client_la_SOURCES) $(libcls_rbd_la_SOURCES) \
+	$(libcls_rbd_client_la_SOURCES) $(libcls_refcount_la_SOURCES) \
 	$(libcls_refcount_client_la_SOURCES) \
 	$(libcls_replica_log_la_SOURCES) $(libcls_rgw_la_SOURCES) \
 	$(libcls_rgw_client_la_SOURCES) $(libcls_statelog_la_SOURCES) \
@@ -6183,16 +5974,15 @@ SOURCES = $(libcls_log_client_a_SOURCES) \
 	$(libec_test_shec_sse3_la_SOURCES) \
 	$(libec_test_shec_sse4_la_SOURCES) \
 	$(liberasure_code_la_SOURCES) $(libglobal_la_SOURCES) \
-	$(libjson_spirit_la_SOURCES) $(libkrbd_la_SOURCES) \
-	$(liblog_la_SOURCES) $(libmds_la_SOURCES) $(libmon_la_SOURCES) \
-	$(libmon_types_la_SOURCES) $(libmsg_la_SOURCES) \
-	$(libos_la_SOURCES) $(libos_rocksdb_la_SOURCES) \
-	$(libos_tp_la_SOURCES) $(nodist_libos_tp_la_SOURCES) \
-	$(libos_types_la_SOURCES) $(libosd_la_SOURCES) \
-	$(libosd_tp_la_SOURCES) $(nodist_libosd_tp_la_SOURCES) \
-	$(libosd_types_la_SOURCES) $(libosdc_la_SOURCES) \
-	$(libperfglue_la_SOURCES) $(librados_la_SOURCES) \
-	$(librados_api_la_SOURCES) $(librados_internal_la_SOURCES) \
+	$(libjournal_la_SOURCES) $(libjson_spirit_la_SOURCES) \
+	$(libkrbd_la_SOURCES) $(liblog_la_SOURCES) \
+	$(libmds_la_SOURCES) $(libmon_types_la_SOURCES) \
+	$(libmsg_la_SOURCES) $(libos_tp_la_SOURCES) \
+	$(nodist_libos_tp_la_SOURCES) $(libosd_tp_la_SOURCES) \
+	$(nodist_libosd_tp_la_SOURCES) $(libosd_types_la_SOURCES) \
+	$(libosdc_la_SOURCES) $(libperfglue_la_SOURCES) \
+	$(librados_la_SOURCES) $(librados_api_la_SOURCES) \
+	$(librados_internal_la_SOURCES) \
 	$(librados_test_stub_la_SOURCES) $(librados_tp_la_SOURCES) \
 	$(nodist_librados_tp_la_SOURCES) $(libradosstriper_la_SOURCES) \
 	$(libradosstripertest_la_SOURCES) $(libradostest_la_SOURCES) \
@@ -6227,7 +6017,8 @@ SOURCES = $(libcls_log_client_a_SOURCES) \
 	$(ceph_streamtest_SOURCES) $(ceph_test_async_driver_SOURCES) \
 	$(ceph_test_c_headers_SOURCES) \
 	$(ceph_test_cfuse_cache_invalidate_SOURCES) \
-	$(ceph_test_cls_hello_SOURCES) $(ceph_test_cls_lock_SOURCES) \
+	$(ceph_test_cls_hello_SOURCES) \
+	$(ceph_test_cls_journal_SOURCES) $(ceph_test_cls_lock_SOURCES) \
 	$(ceph_test_cls_log_SOURCES) $(ceph_test_cls_numops_SOURCES) \
 	$(ceph_test_cls_rbd_SOURCES) $(ceph_test_cls_refcount_SOURCES) \
 	$(ceph_test_cls_replica_log_SOURCES) \
@@ -6326,7 +6117,7 @@ SOURCES = $(libcls_log_client_a_SOURCES) \
 	$(unittest_gather_SOURCES) $(unittest_heartbeatmap_SOURCES) \
 	$(unittest_histogram_SOURCES) $(unittest_hitset_SOURCES) \
 	$(unittest_io_priority_SOURCES) $(unittest_ipaddr_SOURCES) \
-	$(unittest_lfnindex_SOURCES) \
+	$(unittest_journal_SOURCES) $(unittest_lfnindex_SOURCES) \
 	$(unittest_libcephfs_config_SOURCES) \
 	$(unittest_librados_SOURCES) \
 	$(unittest_librados_config_SOURCES) $(unittest_librbd_SOURCES) \
@@ -6362,8 +6153,11 @@ DIST_SOURCES = $(am__libcls_log_client_a_SOURCES_DIST) \
 	$(am__libcls_timeindex_client_a_SOURCES_DIST) \
 	$(am__libcls_user_client_a_SOURCES_DIST) \
 	$(am__libcls_version_client_a_SOURCES_DIST) \
-	$(am__libos_zfs_a_SOURCES_DIST) $(libarch_la_SOURCES) \
-	$(libauth_la_SOURCES) $(am__libcephfs_la_SOURCES_DIST) \
+	$(am__libkv_a_SOURCES_DIST) $(am__libmon_a_SOURCES_DIST) \
+	$(am__libos_a_SOURCES_DIST) $(am__libos_types_a_SOURCES_DIST) \
+	$(am__libos_zfs_a_SOURCES_DIST) $(am__libosd_a_SOURCES_DIST) \
+	$(libarch_la_SOURCES) $(libauth_la_SOURCES) \
+	$(am__libcephfs_la_SOURCES_DIST) \
 	$(am__libcephfs_jni_la_SOURCES_DIST) \
 	$(am__libcivetweb_la_SOURCES_DIST) \
 	$(am__libclient_la_SOURCES_DIST) \
@@ -6371,6 +6165,8 @@ DIST_SOURCES = $(am__libcls_log_client_a_SOURCES_DIST) \
 	$(am__libcls_cephfs_la_SOURCES_DIST) \
 	$(am__libcls_cephfs_client_la_SOURCES_DIST) \
 	$(am__libcls_hello_la_SOURCES_DIST) \
+	$(am__libcls_journal_la_SOURCES_DIST) \
+	$(am__libcls_journal_client_la_SOURCES_DIST) \
 	$(am__libcls_kvs_la_SOURCES_DIST) \
 	$(am__libcls_lock_la_SOURCES_DIST) \
 	$(am__libcls_lock_client_la_SOURCES_DIST) \
@@ -6415,16 +6211,12 @@ DIST_SOURCES = $(am__libcls_log_client_a_SOURCES_DIST) \
 	$(am__libec_test_shec_sse3_la_SOURCES_DIST) \
 	$(am__libec_test_shec_sse4_la_SOURCES_DIST) \
 	$(liberasure_code_la_SOURCES) $(libglobal_la_SOURCES) \
-	$(libjson_spirit_la_SOURCES) $(am__libkrbd_la_SOURCES_DIST) \
-	$(liblog_la_SOURCES) $(am__libmds_la_SOURCES_DIST) \
-	$(am__libmon_la_SOURCES_DIST) $(libmon_types_la_SOURCES) \
-	$(am__libmsg_la_SOURCES_DIST) $(am__libos_la_SOURCES_DIST) \
-	$(am__libos_rocksdb_la_SOURCES_DIST) \
-	$(am__libos_tp_la_SOURCES_DIST) \
-	$(am__libos_types_la_SOURCES_DIST) \
-	$(am__libosd_la_SOURCES_DIST) $(am__libosd_tp_la_SOURCES_DIST) \
-	$(libosd_types_la_SOURCES) $(libosdc_la_SOURCES) \
-	$(am__libperfglue_la_SOURCES_DIST) \
+	$(am__libjournal_la_SOURCES_DIST) $(libjson_spirit_la_SOURCES) \
+	$(am__libkrbd_la_SOURCES_DIST) $(liblog_la_SOURCES) \
+	$(am__libmds_la_SOURCES_DIST) $(libmon_types_la_SOURCES) \
+	$(am__libmsg_la_SOURCES_DIST) $(am__libos_tp_la_SOURCES_DIST) \
+	$(am__libosd_tp_la_SOURCES_DIST) $(libosd_types_la_SOURCES) \
+	$(libosdc_la_SOURCES) $(am__libperfglue_la_SOURCES_DIST) \
 	$(am__librados_la_SOURCES_DIST) \
 	$(am__librados_api_la_SOURCES_DIST) \
 	$(am__librados_internal_la_SOURCES_DIST) \
@@ -6478,6 +6270,7 @@ DIST_SOURCES = $(am__libcls_log_client_a_SOURCES_DIST) \
 	$(am__ceph_test_c_headers_SOURCES_DIST) \
 	$(ceph_test_cfuse_cache_invalidate_SOURCES) \
 	$(am__ceph_test_cls_hello_SOURCES_DIST) \
+	$(am__ceph_test_cls_journal_SOURCES_DIST) \
 	$(am__ceph_test_cls_lock_SOURCES_DIST) \
 	$(am__ceph_test_cls_log_SOURCES_DIST) \
 	$(am__ceph_test_cls_numops_SOURCES_DIST) \
@@ -6601,6 +6394,7 @@ DIST_SOURCES = $(am__libcls_log_client_a_SOURCES_DIST) \
 	$(unittest_histogram_SOURCES) \
 	$(am__unittest_hitset_SOURCES_DIST) \
 	$(unittest_io_priority_SOURCES) $(unittest_ipaddr_SOURCES) \
+	$(am__unittest_journal_SOURCES_DIST) \
 	$(am__unittest_lfnindex_SOURCES_DIST) \
 	$(am__unittest_libcephfs_config_SOURCES_DIST) \
 	$(am__unittest_librados_SOURCES_DIST) \
@@ -6684,30 +6478,32 @@ am__noinst_HEADERS_DIST = arch/intel.h arch/arm.h arch/probe.h \
 	crush/CrushWrapper.h crush/CrushWrapper.i crush/builder.h \
 	crush/crush.h crush/crush_compat.h crush/crush_ln_table.h \
 	crush/grammar.h crush/hash.h crush/mapper.h crush/sample.txt \
-	crush/types.h mon/AuthMonitor.h mon/DataHealthService.h \
-	mon/Elector.h mon/LogMonitor.h mon/ConfigKeyService.h \
-	mon/HealthMonitor.h mon/HealthService.h mon/MDSMonitor.h \
-	mon/MonmapMonitor.h mon/MonCap.h mon/MonClient.h \
-	mon/MonCommands.h mon/DumplingMonCommands.h mon/MonMap.h \
-	mon/Monitor.h mon/MonitorDBStore.h mon/MonOpRequest.h \
-	mon/OSDMonitor.h mon/PGMap.h mon/PGMonitor.h mon/Paxos.h \
-	mon/PaxosService.h mon/QuorumService.h mon/Session.h \
-	mon/mon_types.h mds/inode_backtrace.h mds/flock.h mds/locks.c \
-	mds/locks.h mds/CDentry.h mds/CDir.h mds/CInode.h \
-	mds/Capability.h mds/InoTable.h mds/JournalPointer.h \
-	mds/LocalLock.h mds/Locker.h mds/LogEvent.h mds/LogSegment.h \
-	mds/MDBalancer.h mds/MDCache.h mds/RecoveryQueue.h \
-	mds/StrayManager.h mds/MDLog.h mds/MDSRank.h mds/MDSDaemon.h \
-	mds/Beacon.h mds/MDSContext.h mds/MDSAuthCaps.h mds/MDSMap.h \
-	mds/MDSTable.h mds/MDSTableServer.h mds/MDSTableClient.h \
-	mds/Mutation.h mds/Migrator.h mds/ScatterLock.h mds/Server.h \
-	mds/SessionMap.h mds/SimpleLock.h mds/SnapClient.h \
-	mds/SnapRealm.h mds/SnapServer.h mds/mds_table_types.h \
-	mds/mdstypes.h mds/snap.h mds/MDSContinuation.h \
-	mds/events/ECommitted.h mds/events/EExport.h \
-	mds/events/EFragment.h mds/events/EImportFinish.h \
-	mds/events/EImportStart.h mds/events/EMetaBlob.h \
-	mds/events/ENoOp.h mds/events/EOpen.h \
+	crush/types.h kv/KeyValueDB.h kv/LevelDBStore.h \
+	kv/RocksDBStore.h kv/KineticStore.h mon/AuthMonitor.h \
+	mon/DataHealthService.h mon/Elector.h mon/LogMonitor.h \
+	mon/ConfigKeyService.h mon/HealthMonitor.h mon/HealthService.h \
+	mon/MDSMonitor.h mon/MonmapMonitor.h mon/MonCap.h \
+	mon/MonClient.h mon/MonCommands.h mon/DumplingMonCommands.h \
+	mon/MonMap.h mon/Monitor.h mon/MonitorDBStore.h \
+	mon/MonOpRequest.h mon/OSDMonitor.h mon/PGMap.h \
+	mon/PGMonitor.h mon/Paxos.h mon/PaxosService.h \
+	mon/QuorumService.h mon/Session.h mon/mon_types.h \
+	mds/inode_backtrace.h mds/flock.h mds/locks.c mds/locks.h \
+	mds/CDentry.h mds/CDir.h mds/CInode.h mds/Capability.h \
+	mds/InoTable.h mds/JournalPointer.h mds/LocalLock.h \
+	mds/Locker.h mds/LogEvent.h mds/LogSegment.h mds/MDBalancer.h \
+	mds/MDCache.h mds/RecoveryQueue.h mds/StrayManager.h \
+	mds/MDLog.h mds/MDSRank.h mds/MDSDaemon.h mds/Beacon.h \
+	mds/MDSContext.h mds/MDSAuthCaps.h mds/MDSMap.h mds/MDSTable.h \
+	mds/MDSTableServer.h mds/MDSTableClient.h mds/Mutation.h \
+	mds/Migrator.h mds/ScatterLock.h mds/ScrubStack.h \
+	mds/ScrubHeader.h mds/Server.h mds/SessionMap.h \
+	mds/SimpleLock.h mds/SnapClient.h mds/SnapRealm.h \
+	mds/SnapServer.h mds/mds_table_types.h mds/mdstypes.h \
+	mds/snap.h mds/MDSContinuation.h mds/events/ECommitted.h \
+	mds/events/EExport.h mds/events/EFragment.h \
+	mds/events/EImportFinish.h mds/events/EImportStart.h \
+	mds/events/EMetaBlob.h mds/events/ENoOp.h mds/events/EOpen.h \
 	mds/events/EResetJournal.h mds/events/ESession.h \
 	mds/events/ESessions.h mds/events/ESlaveUpdate.h \
 	mds/events/ESubtreeMap.h mds/events/ETableClient.h \
@@ -6718,17 +6514,16 @@ am__noinst_HEADERS_DIST = arch/intel.h arch/arm.h arch/probe.h \
 	os/FileJournal.h os/FileStore.h os/FDCache.h os/fs/FS.h \
 	os/fs/XFS.h os/GenericFileStoreBackend.h os/HashIndex.h \
 	os/IndexManager.h os/Journal.h os/JournalingObjectStore.h \
-	os/KeyValueDB.h os/LevelDBStore.h os/LFNIndex.h os/MemStore.h \
-	os/KeyValueStore.h os/ObjectMap.h os/ObjectStore.h \
-	os/PageSet.h os/SequencerPosition.h os/WBThrottle.h \
-	os/XfsFileStoreBackend.h os/ZFSFileStoreBackend.h \
-	os/RocksDBStore.h os/ZFS.h os/KineticStore.h \
-	osd/ClassHandler.h osd/HitSet.h osd/OSD.h osd/OSDCap.h \
-	osd/OSDMap.h osd/ObjectVersioner.h osd/OpRequest.h \
-	osd/SnapMapper.h osd/PG.h osd/PGLog.h osd/ReplicatedPG.h \
-	osd/PGBackend.h osd/ReplicatedBackend.h osd/TierAgentState.h \
-	osd/ECBackend.h osd/ECUtil.h osd/ECMsgTypes.h \
-	osd/ECTransaction.h osd/Watch.h osd/osd_types.h \
+	os/LFNIndex.h os/MemStore.h os/KeyValueStore.h os/ObjectMap.h \
+	os/ObjectStore.h os/PageSet.h os/SequencerPosition.h \
+	os/WBThrottle.h os/XfsFileStoreBackend.h \
+	os/ZFSFileStoreBackend.h os/ZFS.h osd/ClassHandler.h \
+	osd/HitSet.h osd/OSD.h osd/OSDCap.h osd/OSDMap.h \
+	osd/ObjectVersioner.h osd/OpRequest.h osd/SnapMapper.h \
+	osd/PG.h osd/PGLog.h osd/ReplicatedPG.h osd/PGBackend.h \
+	osd/ReplicatedBackend.h osd/TierAgentState.h osd/ECBackend.h \
+	osd/ECUtil.h osd/ECMsgTypes.h osd/ECTransaction.h osd/Watch.h \
+	osd/osd_types.h \
 	erasure-code/jerasure/gf-complete/include/gf_complete.h \
 	erasure-code/jerasure/gf-complete/include/gf_general.h \
 	erasure-code/jerasure/gf-complete/include/gf_int.h \
@@ -6919,39 +6714,48 @@ am__noinst_HEADERS_DIST = arch/intel.h arch/arm.h arch/probe.h \
 	librados/RadosXattrIter.h librados/ListObjectImpl.h \
 	libradosstriper/RadosStriperImpl.h \
 	libradosstriper/MultiAioCompletionImpl.h \
-	librbd/AioCompletion.h librbd/AioRequest.h \
+	journal/AsyncOpTracker.h journal/Entry.h journal/Future.h \
+	journal/FutureImpl.h journal/Journaler.h \
+	journal/JournalMetadata.h journal/JournalPlayer.h \
+	journal/JournalRecorder.h journal/JournalTrimmer.h \
+	journal/ObjectPlayer.h journal/ObjectRecorder.h \
+	journal/ReplayEntry.h journal/ReplayHandler.h journal/Utils.h \
+	librbd/AioCompletion.h librbd/AioImageRequest.h \
+	librbd/AioImageRequestWQ.h librbd/AioObjectRequest.h \
 	librbd/AsyncFlattenRequest.h librbd/AsyncObjectThrottle.h \
 	librbd/AsyncOperation.h librbd/AsyncRequest.h \
 	librbd/AsyncResizeRequest.h librbd/AsyncTrimRequest.h \
 	librbd/CopyupRequest.h librbd/DiffIterate.h librbd/ImageCtx.h \
-	librbd/ImageWatcher.h librbd/internal.h \
-	librbd/LibrbdWriteback.h librbd/ObjectMap.h \
-	librbd/parent_types.h librbd/RebuildObjectMapRequest.h \
-	librbd/SnapInfo.h librbd/TaskFinisher.h \
-	librbd/WatchNotifyTypes.h rgw/rgw_acl.h rgw/rgw_acl_s3.h \
-	rgw/rgw_acl_swift.h rgw/rgw_client_io.h rgw/rgw_fcgi.h \
-	rgw/rgw_xml.h rgw/rgw_cache.h rgw/rgw_common.h rgw/rgw_cors.h \
-	rgw/rgw_cors_s3.h rgw/rgw_cors_swift.h rgw/rgw_string.h \
-	rgw/rgw_formats.h rgw/rgw_http_errors.h rgw/rgw_log.h \
-	rgw/rgw_loadgen.h rgw/rgw_multi.h rgw/rgw_policy_s3.h \
-	rgw/rgw_gc.h rgw/rgw_metadata.h rgw/rgw_multi_del.h \
-	rgw/rgw_object_expirer_core.h rgw/rgw_op.h rgw/rgw_orphan.h \
-	rgw/rgw_http_client.h rgw/rgw_swift.h rgw/rgw_swift_auth.h \
-	rgw/rgw_quota.h rgw/rgw_rados.h rgw/rgw_replica_log.h \
-	rgw/rgw_resolve.h rgw/rgw_rest.h rgw/rgw_rest_swift.h \
-	rgw/rgw_rest_s3.h rgw/rgw_auth_s3.h rgw/rgw_rest_admin.h \
-	rgw/rgw_rest_usage.h rgw/rgw_rest_user.h rgw/rgw_rest_bucket.h \
-	rgw/rgw_rest_client.h rgw/rgw_rest_conn.h rgw/rgw_tools.h \
-	rgw/rgw_rest_metadata.h rgw/rgw_rest_log.h \
-	rgw/rgw_rest_opstate.h rgw/rgw_rest_replica_log.h \
-	rgw/rgw_rest_config.h rgw/rgw_usage.h rgw/rgw_user.h \
-	rgw/rgw_bucket.h rgw/rgw_keystone.h rgw/rgw_civetweb.h \
-	rgw/rgw_civetweb_log.h civetweb/civetweb.h \
-	civetweb/include/civetweb.h civetweb/include/civetweb_conf.h \
-	civetweb/src/md5.h cls/lock/cls_lock_types.h \
-	cls/lock/cls_lock_ops.h cls/lock/cls_lock_client.h \
-	cls/numops/cls_numops_client.h cls/rbd/cls_rbd.h \
-	cls/rbd/cls_rbd_client.h cls/refcount/cls_refcount_ops.h \
+	librbd/ImageWatcher.h librbd/internal.h librbd/Journal.h \
+	librbd/JournalReplay.h librbd/JournalTypes.h \
+	librbd/LibrbdAdminSocketHook.h librbd/LibrbdWriteback.h \
+	librbd/ObjectMap.h librbd/parent_types.h \
+	librbd/RebuildObjectMapRequest.h librbd/SnapInfo.h \
+	librbd/TaskFinisher.h librbd/WatchNotifyTypes.h rgw/rgw_acl.h \
+	rgw/rgw_acl_s3.h rgw/rgw_acl_swift.h rgw/rgw_client_io.h \
+	rgw/rgw_fcgi.h rgw/rgw_xml.h rgw/rgw_cache.h rgw/rgw_common.h \
+	rgw/rgw_cors.h rgw/rgw_cors_s3.h rgw/rgw_cors_swift.h \
+	rgw/rgw_string.h rgw/rgw_formats.h rgw/rgw_http_errors.h \
+	rgw/rgw_log.h rgw/rgw_loadgen.h rgw/rgw_multi.h \
+	rgw/rgw_policy_s3.h rgw/rgw_gc.h rgw/rgw_metadata.h \
+	rgw/rgw_multi_del.h rgw/rgw_object_expirer_core.h rgw/rgw_op.h \
+	rgw/rgw_orphan.h rgw/rgw_http_client.h rgw/rgw_swift.h \
+	rgw/rgw_swift_auth.h rgw/rgw_quota.h rgw/rgw_rados.h \
+	rgw/rgw_replica_log.h rgw/rgw_resolve.h rgw/rgw_rest.h \
+	rgw/rgw_rest_swift.h rgw/rgw_rest_s3.h rgw/rgw_auth_s3.h \
+	rgw/rgw_rest_admin.h rgw/rgw_rest_usage.h rgw/rgw_rest_user.h \
+	rgw/rgw_rest_bucket.h rgw/rgw_rest_client.h \
+	rgw/rgw_rest_conn.h rgw/rgw_tools.h rgw/rgw_rest_metadata.h \
+	rgw/rgw_rest_log.h rgw/rgw_rest_opstate.h \
+	rgw/rgw_rest_replica_log.h rgw/rgw_rest_config.h \
+	rgw/rgw_usage.h rgw/rgw_user.h rgw/rgw_bucket.h \
+	rgw/rgw_keystone.h rgw/rgw_civetweb.h rgw/rgw_civetweb_log.h \
+	civetweb/civetweb.h civetweb/include/civetweb.h \
+	civetweb/include/civetweb_conf.h civetweb/src/md5.h \
+	cls/lock/cls_lock_types.h cls/lock/cls_lock_ops.h \
+	cls/lock/cls_lock_client.h cls/numops/cls_numops_client.h \
+	cls/rbd/cls_rbd.h cls/rbd/cls_rbd_client.h \
+	cls/refcount/cls_refcount_ops.h \
 	cls/refcount/cls_refcount_client.h \
 	cls/version/cls_version_types.h cls/version/cls_version_ops.h \
 	cls/version/cls_version_client.h cls/log/cls_log_types.h \
@@ -6969,6 +6773,8 @@ am__noinst_HEADERS_DIST = arch/intel.h arch/arm.h arch/probe.h \
 	cls/rgw/cls_rgw_types.h cls/user/cls_user_client.h \
 	cls/user/cls_user_ops.h cls/user/cls_user_types.h \
 	cls/cephfs/cls_cephfs.h cls/cephfs/cls_cephfs_client.h \
+	cls/journal/cls_journal_client.h \
+	cls/journal/cls_journal_types.h \
 	key_value_store/key_value_structure.h \
 	key_value_store/kv_flat_btree_async.h \
 	key_value_store/kvs_arg_types.h rbd_replay/ActionTypes.h \
@@ -6983,6 +6789,8 @@ am__noinst_HEADERS_DIST = arch/intel.h arch/arm.h arch/probe.h \
 	test/messenger/simple_dispatcher.h \
 	test/messenger/xio_dispatcher.h \
 	test/librados_test_stub/LibradosTestStub.h \
+	test/librados_test_stub/MockTestMemIoCtxImpl.h \
+	test/librados_test_stub/MockTestMemRadosClient.h \
 	test/librados_test_stub/TestClassHandler.h \
 	test/librados_test_stub/TestRadosClient.h \
 	test/librados_test_stub/TestMemRadosClient.h \
@@ -7017,19 +6825,21 @@ am__noinst_HEADERS_DIST = arch/intel.h arch/arm.h arch/probe.h \
 	test/system/st_rados_list_objects.h \
 	test/system/st_rados_notify.h test/system/st_rados_watch.h \
 	test/system/systest_runnable.h test/system/systest_settings.h \
-	test/unit.h tools/cephfs/JournalTool.h \
-	tools/cephfs/JournalScanner.h tools/cephfs/JournalFilter.h \
-	tools/cephfs/EventOutput.h tools/cephfs/Resetter.h \
-	tools/cephfs/Dumper.h tools/cephfs/TableTool.h \
-	tools/cephfs/MDSUtility.h tools/RadosDump.h \
-	tools/rados/RadosImport.h tools/ceph_objectstore_tool.h \
-	tools/rados/PoolDump.h tools/cephfs/DataScan.h \
-	compressor/Compressor.h compressor/AsyncCompressor.h \
-	compressor/SnappyCompressor.h cls_acl.cc cls_crypto.cc \
-	fetch_config logrotate.conf sample.ceph.conf \
-	bash_completion/ceph bash_completion/rados bash_completion/rbd \
-	bash_completion/radosgw-admin mount/canonicalize.c \
-	mount/mtab.c objclass/objclass.h
+	test/unit.h test/journal/RadosTestFixture.h \
+	tools/rbd/ArgumentTypes.h tools/rbd/IndentStream.h \
+	tools/rbd/OptionPrinter.h tools/rbd/Shell.h tools/rbd/Utils.h \
+	tools/cephfs/JournalTool.h tools/cephfs/JournalScanner.h \
+	tools/cephfs/JournalFilter.h tools/cephfs/EventOutput.h \
+	tools/cephfs/Resetter.h tools/cephfs/Dumper.h \
+	tools/cephfs/TableTool.h tools/cephfs/MDSUtility.h \
+	tools/RadosDump.h tools/rados/RadosImport.h \
+	tools/ceph_objectstore_tool.h tools/rados/PoolDump.h \
+	tools/cephfs/DataScan.h compressor/Compressor.h \
+	compressor/AsyncCompressor.h compressor/SnappyCompressor.h \
+	cls_acl.cc cls_crypto.cc fetch_config logrotate.conf \
+	sample.ceph.conf bash_completion/ceph bash_completion/rados \
+	bash_completion/rbd bash_completion/radosgw-admin \
+	mount/canonicalize.c mount/mtab.c objclass/objclass.h
 HEADERS = $(noinst_HEADERS)
 RECURSIVE_CLEAN_TARGETS = mostlyclean-recursive clean-recursive	\
   distclean-recursive maintainer-clean-recursive
@@ -7265,8 +7075,8 @@ ACLOCAL = @ACLOCAL@
 AMTAR = @AMTAR@
 AM_CXXFLAGS = @AM_CXXFLAGS@ $(AM_COMMON_CFLAGS) -ftemplate-depth-1024 \
 	-Wnon-virtual-dtor -Wno-invalid-offsetof $(HARDENING_CFLAGS) \
-	$(am__append_3) $(am__append_6) $(am__append_75) \
-	$(am__append_78)
+	$(am__append_7) $(am__append_10) $(am__append_84) \
+	$(am__append_87)
 AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
 AR = @AR@
 ARM_CRC_FLAGS = @ARM_CRC_FLAGS@
@@ -7278,6 +7088,7 @@ AUTOMAKE = @AUTOMAKE@
 AWK = @AWK@
 BOOST_PROGRAM_OPTIONS_LIBS = @BOOST_PROGRAM_OPTIONS_LIBS@
 BOOST_RANDOM_LIBS = @BOOST_RANDOM_LIBS@
+BOOST_REGEX_LIBS = @BOOST_REGEX_LIBS@
 BOOST_THREAD_LIBS = @BOOST_THREAD_LIBS@
 CC = @CC@
 CCAS = ${srcdir}/yasm-wrapper
@@ -7463,12 +7274,12 @@ top_builddir = @top_builddir@
 top_srcdir = @top_srcdir@
 user_rgw = @user_rgw@
 AUTOMAKE_OPTIONS = gnu subdir-objects
-SUBDIRS = ocf java $(am__append_209)
+SUBDIRS = ocf java
 DIST_SUBDIRS = gmock ocf java
-BUILT_SOURCES = $(am__append_212) $(am__append_233)
+BUILT_SOURCES = $(am__append_229) $(am__append_249)
 
 # extra bits
-EXTRA_DIST = $(am__append_21) ceph-detect-init/AUTHORS.rst \
+EXTRA_DIST = $(am__append_25) ceph-detect-init/AUTHORS.rst \
 	ceph-detect-init/ceph_detect_init/centos/__init__.py \
 	ceph-detect-init/ceph_detect_init/exc.py \
 	ceph-detect-init/ceph_detect_init/main.py \
@@ -7523,7 +7334,594 @@ EXTRA_DIST = $(am__append_21) ceph-detect-init/AUTHORS.rst \
 	$(srcdir)/test/opensuse-13.2/install-deps.sh \
 	$(srcdir)/test/opensuse-13.2/ceph.spec.in \
 	$(srcdir)/test/coverage.sh $(patsubst \
-	%,$(srcdir)/%,$(check_SCRIPTS)) $(am__append_210) \
+	%,$(srcdir)/%,$(check_SCRIPTS)) rocksdb/.arcconfig \
+	rocksdb/PATENTS rocksdb/.clang-format rocksdb/AUTHORS \
+	rocksdb/CONTRIBUTING.md rocksdb/LICENSE rocksdb/README.md \
+	rocksdb/Vagrantfile \
+	rocksdb/arcanist_util/__phutil_library_init__.php \
+	rocksdb/arcanist_util/config/FacebookArcanistConfiguration.php \
+	rocksdb/arcanist_util/cpp_linter/ArcanistCpplintLinter.php \
+	rocksdb/arcanist_util/cpp_linter/cpplint.py \
+	rocksdb/arcanist_util/cpp_linter/BaseDirectoryScopedFormatLinter.php \
+	rocksdb/arcanist_util/cpp_linter/FacebookHowtoevenLinter.php \
+	rocksdb/arcanist_util/cpp_linter/FbcodeClangFormatLinter.php \
+	rocksdb/arcanist_util/cpp_linter/FbcodeCppLinter.php \
+	rocksdb/arcanist_util/lint_engine/FacebookFbcodeLintEngine.php \
+	rocksdb/arcanist_util/lint_engine/FacebookHowtoevenLintEngine.php \
+	rocksdb/arcanist_util/unit_engine/FacebookFbcodeUnitTestEngine.php \
+	rocksdb/arcanist_util/__phutil_library_map__.php \
+	rocksdb/build_tools/make_new_version.sh \
+	rocksdb/build_tools/make_package.sh \
+	rocksdb/build_tools/regression_build_test.sh \
+	rocksdb/build_tools/version.sh \
+	rocksdb/build_tools/amalgamate.py \
+	rocksdb/build_tools/build_detect_platform \
+	rocksdb/build_tools/dockerbuild.sh \
+	rocksdb/build_tools/fb_compile_mongo.sh \
+	rocksdb/build_tools/fbcode_config.sh \
+	rocksdb/build_tools/fbcode_config4.8.1.sh \
+	rocksdb/build_tools/format-diff.sh \
+	rocksdb/build_tools/rocksdb-lego-determinator \
+	rocksdb/build_tools/run_ci_db_test.ps1 \
+	rocksdb/coverage/coverage_test.sh \
+	rocksdb/coverage/parse_gcov_output.py \
+	rocksdb/db/compaction_picker.h \
+	rocksdb/db/compaction_picker_test.cc \
+	rocksdb/db/comparator_db_test.cc rocksdb/db/convenience.cc \
+	rocksdb/db/cuckoo_table_db_test.cc rocksdb/db/db_bench.cc \
+	rocksdb/db/db_filesnapshot.cc rocksdb/db/db_impl.cc \
+	rocksdb/db/db_impl.h rocksdb/db/db_impl_debug.cc \
+	rocksdb/db/db_impl_readonly.cc rocksdb/db/db_impl_readonly.h \
+	rocksdb/db/db_iter.cc rocksdb/db/db_iter_test.cc \
+	rocksdb/db/db_log_iter_test.cc rocksdb/db/dbformat.cc \
+	rocksdb/db/dbformat_test.cc rocksdb/db/deletefile_test.cc \
+	rocksdb/db/filename.cc rocksdb/db/log_reader.h \
+	rocksdb/db/file_indexer.h rocksdb/db/flush_job_test.cc \
+	rocksdb/db/plain_table_db_test.cc rocksdb/db/db_test.cc \
+	rocksdb/db/db_iter.h rocksdb/db/fault_injection_test.cc \
+	rocksdb/db/filename.h rocksdb/db/forward_iterator.cc \
+	rocksdb/db/forward_iterator.h rocksdb/db/job_context.h \
+	rocksdb/db/compaction_job.h rocksdb/db/memtable.cc \
+	rocksdb/db/file_indexer.cc rocksdb/db/flush_job.cc \
+	rocksdb/db/file_indexer_test.cc rocksdb/db/log_reader.cc \
+	rocksdb/db/table_cache.h rocksdb/db/filename_test.cc \
+	rocksdb/db/memtable_list_test.cc rocksdb/db/merge_helper.cc \
+	rocksdb/db/flush_scheduler.cc rocksdb/db/flush_scheduler.h \
+	rocksdb/db/internal_stats.h rocksdb/db/listener_test.cc \
+	rocksdb/db/log_writer.cc rocksdb/db/log_writer.h \
+	rocksdb/db/merge_helper.h rocksdb/db/merge_operator.cc \
+	rocksdb/db/merge_test.cc rocksdb/db/version_set.h \
+	rocksdb/db/log_format.h rocksdb/db/memtable.h \
+	rocksdb/db/memtable_list.cc rocksdb/db/skiplist.h \
+	rocksdb/db/c_test.c rocksdb/db/managed_iterator.h \
+	rocksdb/db/wal_manager_test.cc \
+	rocksdb/db/memtable_allocator.cc \
+	rocksdb/db/memtable_allocator.h \
+	rocksdb/db/memtablerep_bench.cc rocksdb/db/repair.cc \
+	rocksdb/db/internal_stats.cc rocksdb/db/merge_context.h \
+	rocksdb/db/managed_iterator.cc rocksdb/db/compacted_db_impl.h \
+	rocksdb/db/memtable_list.h rocksdb/db/perf_context_test.cc \
+	rocksdb/db/table_cache.cc rocksdb/db/db_impl_experimental.cc \
+	rocksdb/db/skiplist_test.cc rocksdb/db/slice.cc \
+	rocksdb/db/table_properties_collector.cc \
+	rocksdb/db/table_properties_collector.h \
+	rocksdb/db/table_properties_collector_test.cc \
+	rocksdb/db/transaction_log_impl.cc \
+	rocksdb/db/transaction_log_impl.h \
+	rocksdb/db/version_builder.cc rocksdb/db/version_builder.h \
+	rocksdb/db/version_builder_test.cc rocksdb/db/version_edit.cc \
+	rocksdb/db/version_edit.h rocksdb/db/version_edit_test.cc \
+	rocksdb/db/version_set.cc rocksdb/db/version_set_test.cc \
+	rocksdb/db/wal_manager.cc rocksdb/db/write_batch.cc \
+	rocksdb/db/write_batch_base.cc rocksdb/db/wal_manager.h \
+	rocksdb/db/write_batch_internal.h \
+	rocksdb/db/write_batch_test.cc rocksdb/db/write_callback.h \
+	rocksdb/db/write_controller.cc rocksdb/db/write_controller.h \
+	rocksdb/db/write_controller_test.cc rocksdb/db/write_thread.cc \
+	rocksdb/db/write_thread.h rocksdb/db/builder.cc \
+	rocksdb/db/c.cc rocksdb/db/writebuffer.h \
+	rocksdb/db/compaction_iterator.h rocksdb/db/experimental.cc \
+	rocksdb/db/column_family.h rocksdb/db/column_family_test.cc \
+	rocksdb/db/compact_files_test.cc rocksdb/db/compaction.cc \
+	rocksdb/db/compaction.h rocksdb/db/compaction_job.cc \
+	rocksdb/db/compaction_job_test.cc \
+	rocksdb/db/compaction_picker.cc rocksdb/db/column_family.cc \
+	rocksdb/db/dbformat.h rocksdb/db/builder.h \
+	rocksdb/db/compacted_db_impl.cc rocksdb/db/flush_job.h \
+	rocksdb/db/log_test.cc rocksdb/db/prefix_test.cc \
+	rocksdb/db/corruption_test.cc rocksdb/db/db_compaction_test.cc \
+	rocksdb/db/compaction_iterator.cc \
+	rocksdb/db/compaction_iterator_test.cc \
+	rocksdb/db/compaction_job_stats_test.cc \
+	rocksdb/db/db_compaction_filter_test.cc \
+	rocksdb/db/db_dynamic_level_test.cc \
+	rocksdb/db/db_inplace_update_test.cc \
+	rocksdb/db/db_tailing_iter_test.cc \
+	rocksdb/db/db_universal_compaction_test.cc \
+	rocksdb/db/db_wal_test.cc rocksdb/db/event_helpers.cc \
+	rocksdb/db/event_helpers.h rocksdb/db/merge_helper_test.cc \
+	rocksdb/db/snapshot_impl.cc rocksdb/db/snapshot_impl.h \
+	rocksdb/db/write_callback_test.cc rocksdb/doc/doc.css \
+	rocksdb/doc/index.html rocksdb/doc/log_format.txt \
+	rocksdb/doc/rockslogo.jpg rocksdb/doc/rockslogo.png \
+	rocksdb/examples/README.md \
+	rocksdb/examples/column_families_example.cc \
+	rocksdb/examples/simple_example.cc rocksdb/examples/.gitignore \
+	rocksdb/examples/Makefile rocksdb/examples/c_simple_example.c \
+	rocksdb/examples/compact_files_example.cc \
+	rocksdb/examples/compaction_filter_example.cc \
+	rocksdb/examples/optimistic_transaction_example.cc \
+	rocksdb/examples/rocksdb_option_file_example.ini \
+	rocksdb/examples/transaction_example.cc rocksdb/hdfs/README \
+	rocksdb/hdfs/setup.sh rocksdb/hdfs/env_hdfs.h \
+	rocksdb/include/rocksdb/filter_policy.h \
+	rocksdb/include/rocksdb/flush_block_policy.h \
+	rocksdb/include/rocksdb/iterator.h \
+	rocksdb/include/rocksdb/ldb_tool.h \
+	rocksdb/include/rocksdb/slice_transform.h \
+	rocksdb/include/rocksdb/sst_dump_tool.h \
+	rocksdb/include/rocksdb/types.h \
+	rocksdb/include/rocksdb/utilities/db_ttl.h \
+	rocksdb/include/rocksdb/utilities/document_db.h \
+	rocksdb/include/rocksdb/utilities/geo_db.h \
+	rocksdb/include/rocksdb/utilities/json_document.h \
+	rocksdb/include/rocksdb/utilities/leveldb_options.h \
+	rocksdb/include/rocksdb/utilities/flashcache.h \
+	rocksdb/include/rocksdb/utilities/backupable_db.h \
+	rocksdb/include/rocksdb/utilities/checkpoint.h \
+	rocksdb/include/rocksdb/utilities/convenience.h \
+	rocksdb/include/rocksdb/utilities/info_log_finder.h \
+	rocksdb/include/rocksdb/utilities/optimistic_transaction_db.h \
+	rocksdb/include/rocksdb/utilities/spatial_db.h \
+	rocksdb/include/rocksdb/utilities/stackable_db.h \
+	rocksdb/include/rocksdb/utilities/table_properties_collectors.h \
+	rocksdb/include/rocksdb/utilities/transaction.h \
+	rocksdb/include/rocksdb/utilities/transaction_db.h \
+	rocksdb/include/rocksdb/utilities/transaction_db_mutex.h \
+	rocksdb/include/rocksdb/utilities/utility_db.h \
+	rocksdb/include/rocksdb/utilities/write_batch_with_index.h \
+	rocksdb/include/rocksdb/experimental.h \
+	rocksdb/include/rocksdb/convenience.h \
+	rocksdb/include/rocksdb/db_dump_tool.h \
+	rocksdb/include/rocksdb/immutable_options.h \
+	rocksdb/include/rocksdb/iostats_context.h \
+	rocksdb/include/rocksdb/listener.h \
+	rocksdb/include/rocksdb/memtablerep.h \
+	rocksdb/include/rocksdb/merge_operator.h \
+	rocksdb/include/rocksdb/metadata.h \
+	rocksdb/include/rocksdb/perf_context.h \
+	rocksdb/include/rocksdb/perf_level.h \
+	rocksdb/include/rocksdb/slice.h \
+	rocksdb/include/rocksdb/status.h \
+	rocksdb/include/rocksdb/table_properties.h \
+	rocksdb/include/rocksdb/transaction_log.h \
+	rocksdb/include/rocksdb/version.h \
+	rocksdb/include/rocksdb/write_batch_base.h \
+	rocksdb/include/rocksdb/c.h \
+	rocksdb/include/rocksdb/compaction_filter.h \
+	rocksdb/include/rocksdb/comparator.h \
+	rocksdb/include/rocksdb/db.h rocksdb/include/rocksdb/env.h \
+	rocksdb/include/rocksdb/options.h \
+	rocksdb/include/rocksdb/rate_limiter.h \
+	rocksdb/include/rocksdb/snapshot.h \
+	rocksdb/include/rocksdb/statistics.h \
+	rocksdb/include/rocksdb/table.h \
+	rocksdb/include/rocksdb/thread_status.h \
+	rocksdb/include/rocksdb/universal_compaction.h \
+	rocksdb/include/rocksdb/write_batch.h \
+	rocksdb/include/rocksdb/cache.h \
+	rocksdb/include/rocksdb/compaction_job_stats.h \
+	rocksdb/include/rocksdb/delete_scheduler.h \
+	rocksdb/include/rocksdb/sst_file_writer.h \
+	rocksdb/java/RELEASE.md \
+	rocksdb/java/benchmark/src/main/java/org/rocksdb/benchmark/DbBenchmark.java \
+	rocksdb/java/crossbuild/build-linux.sh \
+	rocksdb/java/crossbuild/Vagrantfile \
+	rocksdb/java/crossbuild/build-linux-centos.sh \
+	rocksdb/java/jdb_bench.sh rocksdb/java/rocksjni.pom \
+	rocksdb/java/rocksjni/backupablejni.cc \
+	rocksdb/java/rocksjni/checkpoint.cc \
+	rocksdb/java/rocksjni/columnfamilyhandle.cc \
+	rocksdb/java/rocksjni/comparator.cc \
+	rocksdb/java/rocksjni/comparatorjnicallback.h \
+	rocksdb/java/rocksjni/env.cc rocksdb/java/rocksjni/filter.cc \
+	rocksdb/java/rocksjni/iterator.cc \
+	rocksdb/java/rocksjni/loggerjnicallback.h \
+	rocksdb/java/rocksjni/memtablejni.cc \
+	rocksdb/java/rocksjni/merge_operator.cc \
+	rocksdb/java/rocksjni/ratelimiterjni.cc \
+	rocksdb/java/rocksjni/restorejni.cc \
+	rocksdb/java/rocksjni/slice.cc \
+	rocksdb/java/rocksjni/snapshot.cc \
+	rocksdb/java/rocksjni/statistics.cc \
+	rocksdb/java/rocksjni/table.cc \
+	rocksdb/java/rocksjni/transaction_log.cc \
+	rocksdb/java/rocksjni/ttl.cc \
+	rocksdb/java/rocksjni/write_batch.cc \
+	rocksdb/java/rocksjni/writebatchhandlerjnicallback.cc \
+	rocksdb/java/rocksjni/writebatchhandlerjnicallback.h \
+	rocksdb/java/rocksjni/backupenginejni.cc \
+	rocksdb/java/rocksjni/compaction_filter.cc \
+	rocksdb/java/rocksjni/comparatorjnicallback.cc \
+	rocksdb/java/rocksjni/loggerjnicallback.cc \
+	rocksdb/java/rocksjni/options.cc \
+	rocksdb/java/rocksjni/portal.h \
+	rocksdb/java/rocksjni/remove_emptyvalue_compactionfilterjni.cc \
+	rocksdb/java/rocksjni/rocksjni.cc \
+	rocksdb/java/rocksjni/write_batch_test.cc \
+	rocksdb/java/rocksjni/write_batch_with_index.cc \
+	rocksdb/java/samples/src/main/java/RocksDBColumnFamilySample.java \
+	rocksdb/java/samples/src/main/java/RocksDBSample.java \
+	rocksdb/java/src/main/java/org/rocksdb/AbstractComparator.java \
+	rocksdb/java/src/main/java/org/rocksdb/AbstractRocksIterator.java \
+	rocksdb/java/src/main/java/org/rocksdb/AbstractWriteBatch.java \
+	rocksdb/java/src/main/java/org/rocksdb/BackupInfo.java \
+	rocksdb/java/src/main/java/org/rocksdb/BackupableDB.java \
+	rocksdb/java/src/main/java/org/rocksdb/BackupableDBOptions.java \
+	rocksdb/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java \
+	rocksdb/java/src/main/java/org/rocksdb/BloomFilter.java \
+	rocksdb/java/src/main/java/org/rocksdb/BuiltinComparator.java \
+	rocksdb/java/src/main/java/org/rocksdb/Checkpoint.java \
+	rocksdb/java/src/main/java/org/rocksdb/ChecksumType.java \
+	rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyDescriptor.java \
+	rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyHandle.java \
+	rocksdb/java/src/main/java/org/rocksdb/CompactionStyle.java \
+	rocksdb/java/src/main/java/org/rocksdb/Comparator.java \
+	rocksdb/java/src/main/java/org/rocksdb/ComparatorOptions.java \
+	rocksdb/java/src/main/java/org/rocksdb/CompressionType.java \
+	rocksdb/java/src/main/java/org/rocksdb/DBOptions.java \
+	rocksdb/java/src/main/java/org/rocksdb/DBOptionsInterface.java \
+	rocksdb/java/src/main/java/org/rocksdb/DirectComparator.java \
+	rocksdb/java/src/main/java/org/rocksdb/DirectSlice.java \
+	rocksdb/java/src/main/java/org/rocksdb/EncodingType.java \
+	rocksdb/java/src/main/java/org/rocksdb/Filter.java \
+	rocksdb/java/src/main/java/org/rocksdb/FlushOptions.java \
+	rocksdb/java/src/main/java/org/rocksdb/GenericRateLimiterConfig.java \
+	rocksdb/java/src/main/java/org/rocksdb/HashLinkedListMemTableConfig.java \
+	rocksdb/java/src/main/java/org/rocksdb/HashSkipListMemTableConfig.java \
+	rocksdb/java/src/main/java/org/rocksdb/HistogramData.java \
+	rocksdb/java/src/main/java/org/rocksdb/HistogramType.java \
+	rocksdb/java/src/main/java/org/rocksdb/IndexType.java \
+	rocksdb/java/src/main/java/org/rocksdb/InfoLogLevel.java \
+	rocksdb/java/src/main/java/org/rocksdb/Logger.java \
+	rocksdb/java/src/main/java/org/rocksdb/MemTableConfig.java \
+	rocksdb/java/src/main/java/org/rocksdb/MergeOperator.java \
+	rocksdb/java/src/main/java/org/rocksdb/NativeLibraryLoader.java \
+	rocksdb/java/src/main/java/org/rocksdb/PlainTableConfig.java \
+	rocksdb/java/src/main/java/org/rocksdb/RateLimiterConfig.java \
+	rocksdb/java/src/main/java/org/rocksdb/ReadOptions.java \
+	rocksdb/java/src/main/java/org/rocksdb/RestoreBackupableDB.java \
+	rocksdb/java/src/main/java/org/rocksdb/RestoreOptions.java \
+	rocksdb/java/src/main/java/org/rocksdb/RocksDB.java \
+	rocksdb/java/src/main/java/org/rocksdb/RocksDBException.java \
+	rocksdb/java/src/main/java/org/rocksdb/RocksEnv.java \
+	rocksdb/java/src/main/java/org/rocksdb/RocksIterator.java \
+	rocksdb/java/src/main/java/org/rocksdb/RocksIteratorInterface.java \
+	rocksdb/java/src/main/java/org/rocksdb/RocksObject.java \
+	rocksdb/java/src/main/java/org/rocksdb/SkipListMemTableConfig.java \
+	rocksdb/java/src/main/java/org/rocksdb/Slice.java \
+	rocksdb/java/src/main/java/org/rocksdb/Snapshot.java \
+	rocksdb/java/src/main/java/org/rocksdb/Statistics.java \
+	rocksdb/java/src/main/java/org/rocksdb/StatisticsCollector.java \
+	rocksdb/java/src/main/java/org/rocksdb/StatisticsCollectorCallback.java \
+	rocksdb/java/src/main/java/org/rocksdb/StatsCollectorInput.java \
+	rocksdb/java/src/main/java/org/rocksdb/StringAppendOperator.java \
+	rocksdb/java/src/main/java/org/rocksdb/TableFormatConfig.java \
+	rocksdb/java/src/main/java/org/rocksdb/TickerType.java \
+	rocksdb/java/src/main/java/org/rocksdb/TransactionLogIterator.java \
+	rocksdb/java/src/main/java/org/rocksdb/TtlDB.java \
+	rocksdb/java/src/main/java/org/rocksdb/VectorMemTableConfig.java \
+	rocksdb/java/src/main/java/org/rocksdb/WriteBatch.java \
+	rocksdb/java/src/main/java/org/rocksdb/WriteBatchInterface.java \
+	rocksdb/java/src/main/java/org/rocksdb/WriteBatchWithIndex.java \
+	rocksdb/java/src/main/java/org/rocksdb/WriteOptions.java \
+	rocksdb/java/src/main/java/org/rocksdb/util/Environment.java \
+	rocksdb/java/src/main/java/org/rocksdb/util/SizeUnit.java \
+	rocksdb/java/src/main/java/org/rocksdb/Env.java \
+	rocksdb/java/src/main/java/org/rocksdb/RocksMemEnv.java \
+	rocksdb/java/src/main/java/org/rocksdb/AbstractCompactionFilter.java \
+	rocksdb/java/src/main/java/org/rocksdb/AbstractSlice.java \
+	rocksdb/java/src/main/java/org/rocksdb/BackupEngine.java \
+	rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java \
+	rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyOptionsInterface.java \
+	rocksdb/java/src/main/java/org/rocksdb/Options.java \
+	rocksdb/java/src/main/java/org/rocksdb/RemoveEmptyValueCompactionFilter.java \
+	rocksdb/java/src/main/java/org/rocksdb/WBWIRocksIterator.java \
+	rocksdb/java/src/test/java/org/rocksdb/AbstractComparatorTest.java \
+	rocksdb/java/src/test/java/org/rocksdb/BackupableDBOptionsTest.java \
+	rocksdb/java/src/test/java/org/rocksdb/BackupableDBTest.java \
+	rocksdb/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java \
+	rocksdb/java/src/test/java/org/rocksdb/CheckPointTest.java \
+	rocksdb/java/src/test/java/org/rocksdb/ColumnFamilyTest.java \
+	rocksdb/java/src/test/java/org/rocksdb/ComparatorOptionsTest.java \
+	rocksdb/java/src/test/java/org/rocksdb/ComparatorTest.java \
+	rocksdb/java/src/test/java/org/rocksdb/CompressionOptionsTest.java \
+	rocksdb/java/src/test/java/org/rocksdb/DBOptionsTest.java \
+	rocksdb/java/src/test/java/org/rocksdb/DirectComparatorTest.java \
+	rocksdb/java/src/test/java/org/rocksdb/DirectSliceTest.java \
+	rocksdb/java/src/test/java/org/rocksdb/FilterTest.java \
+	rocksdb/java/src/test/java/org/rocksdb/FlushTest.java \
+	rocksdb/java/src/test/java/org/rocksdb/KeyMayExistTest.java \
+	rocksdb/java/src/test/java/org/rocksdb/LoggerTest.java \
+	rocksdb/java/src/test/java/org/rocksdb/MemTableTest.java \
+	rocksdb/java/src/test/java/org/rocksdb/MergeTest.java \
+	rocksdb/java/src/test/java/org/rocksdb/MixedOptionsTest.java \
+	rocksdb/java/src/test/java/org/rocksdb/NativeLibraryLoaderTest.java \
+	rocksdb/java/src/test/java/org/rocksdb/PlainTableConfigTest.java \
+	rocksdb/java/src/test/java/org/rocksdb/PlatformRandomHelper.java \
+	rocksdb/java/src/test/java/org/rocksdb/ReadOnlyTest.java \
+	rocksdb/java/src/test/java/org/rocksdb/ReadOptionsTest.java \
+	rocksdb/java/src/test/java/org/rocksdb/RocksEnvTest.java \
+	rocksdb/java/src/test/java/org/rocksdb/RocksIteratorTest.java \
+	rocksdb/java/src/test/java/org/rocksdb/RocksMemoryResource.java \
+	rocksdb/java/src/test/java/org/rocksdb/SnapshotTest.java \
+	rocksdb/java/src/test/java/org/rocksdb/StatisticsCollectorTest.java \
+	rocksdb/java/src/test/java/org/rocksdb/StatsCallbackMock.java \
+	rocksdb/java/src/test/java/org/rocksdb/TransactionLogIteratorTest.java \
+	rocksdb/java/src/test/java/org/rocksdb/Types.java \
+	rocksdb/java/src/test/java/org/rocksdb/WriteBatchHandlerTest.java \
+	rocksdb/java/src/test/java/org/rocksdb/WriteBatchTest.java \
+	rocksdb/java/src/test/java/org/rocksdb/WriteOptionsTest.java \
+	rocksdb/java/src/test/java/org/rocksdb/test/RocksJunitRunner.java \
+	rocksdb/java/src/test/java/org/rocksdb/util/EnvironmentTest.java \
+	rocksdb/java/src/test/java/org/rocksdb/util/SizeUnitTest.java \
+	rocksdb/java/src/test/java/org/rocksdb/RocksMemEnvTest.java \
+	rocksdb/java/src/test/java/org/rocksdb/SliceTest.java \
+	rocksdb/java/src/test/java/org/rocksdb/TtlDBTest.java \
+	rocksdb/java/src/test/java/org/rocksdb/WriteBatchWithIndexTest.java \
+	rocksdb/java/src/test/java/org/rocksdb/BackupEngineTest.java \
+	rocksdb/java/src/test/java/org/rocksdb/ColumnFamilyOptionsTest.java \
+	rocksdb/java/src/test/java/org/rocksdb/InfoLogLevelTest.java \
+	rocksdb/java/src/test/java/org/rocksdb/OptionsTest.java \
+	rocksdb/java/src/test/java/org/rocksdb/RocksDBTest.java \
+	rocksdb/java/HISTORY-JAVA.md rocksdb/java/Makefile \
+	rocksdb/port/stack_trace.cc rocksdb/port/README \
+	rocksdb/port/likely.h rocksdb/port/port_example.h \
+	rocksdb/port/stack_trace.h rocksdb/port/dirent.h \
+	rocksdb/port/port.h rocksdb/port/port_posix.cc \
+	rocksdb/port/port_posix.h rocksdb/port/sys_time.h \
+	rocksdb/port/util_logger.h rocksdb/port/win/env_win.cc \
+	rocksdb/port/win/port_win.cc rocksdb/port/win/port_win.h \
+	rocksdb/port/win/win_logger.cc rocksdb/port/win/win_logger.h \
+	rocksdb/table/block_based_filter_block.cc \
+	rocksdb/table/mock_table.cc \
+	rocksdb/table/plain_table_builder.cc \
+	rocksdb/table/plain_table_factory.cc \
+	rocksdb/table/plain_table_key_coding.h \
+	rocksdb/table/table_builder.h \
+	rocksdb/table/two_level_iterator.cc \
+	rocksdb/table/two_level_iterator.h \
+	rocksdb/table/full_filter_block.cc \
+	rocksdb/table/block_based_filter_block.h \
+	rocksdb/table/block_based_filter_block_test.cc \
+	rocksdb/table/block.cc rocksdb/table/block_builder.cc \
+	rocksdb/table/block_builder.h rocksdb/table/block_hash_index.h \
+	rocksdb/table/block_hash_index_test.cc \
+	rocksdb/table/block_prefix_index.cc \
+	rocksdb/table/block_test.cc rocksdb/table/bloom_block.cc \
+	rocksdb/table/bloom_block.h \
+	rocksdb/table/table_reader_bench.cc \
+	rocksdb/table/table_test.cc rocksdb/table/meta_blocks.h \
+	rocksdb/table/plain_table_factory.h \
+	rocksdb/table/filter_block.h \
+	rocksdb/table/flush_block_policy.cc \
+	rocksdb/table/get_context.cc rocksdb/table/get_context.h \
+	rocksdb/table/sst_file_writer.cc \
+	rocksdb/table/full_filter_block.h \
+	rocksdb/table/full_filter_block_test.cc \
+	rocksdb/table/merger.cc rocksdb/table/iterator.cc \
+	rocksdb/table/iterator_wrapper.h rocksdb/table/merger.h \
+	rocksdb/table/block.h rocksdb/table/plain_table_index.h \
+	rocksdb/table/plain_table_key_coding.cc \
+	rocksdb/table/table_properties_internal.h \
+	rocksdb/table/table_reader.h \
+	rocksdb/table/block_based_table_builder.cc \
+	rocksdb/table/block_based_table_builder.h \
+	rocksdb/table/block_based_table_factory.cc \
+	rocksdb/table/block_based_table_factory.h \
+	rocksdb/table/block_based_table_reader.cc \
+	rocksdb/table/block_based_table_reader.h \
+	rocksdb/table/block_hash_index.cc \
+	rocksdb/table/block_prefix_index.h \
+	rocksdb/table/cuckoo_table_builder.cc \
+	rocksdb/table/cuckoo_table_builder.h \
+	rocksdb/table/cuckoo_table_builder_test.cc \
+	rocksdb/table/cuckoo_table_factory.cc \
+	rocksdb/table/cuckoo_table_factory.h \
+	rocksdb/table/cuckoo_table_reader.cc \
+	rocksdb/table/cuckoo_table_reader.h \
+	rocksdb/table/cuckoo_table_reader_test.cc \
+	rocksdb/table/format.cc rocksdb/table/format.h \
+	rocksdb/table/iter_heap.h rocksdb/table/merger_test.cc \
+	rocksdb/table/meta_blocks.cc rocksdb/table/mock_table.h \
+	rocksdb/table/plain_table_builder.h \
+	rocksdb/table/plain_table_index.cc \
+	rocksdb/table/plain_table_reader.cc \
+	rocksdb/table/plain_table_reader.h \
+	rocksdb/table/table_properties.cc \
+	rocksdb/table/adaptive_table_factory.h \
+	rocksdb/table/adaptive_table_factory.cc \
+	rocksdb/third-party/fbson/FbsonJsonParser.h \
+	rocksdb/third-party/fbson/FbsonUtil.h \
+	rocksdb/third-party/fbson/FbsonWriter.h \
+	rocksdb/third-party/fbson/COMMIT.md \
+	rocksdb/third-party/fbson/FbsonDocument.h \
+	rocksdb/third-party/fbson/FbsonStream.h \
+	rocksdb/third-party/gtest-1.7.0/fused-src/gtest/gtest-all.cc \
+	rocksdb/third-party/gtest-1.7.0/fused-src/gtest/gtest.h \
+	rocksdb/third-party/gtest-1.7.0/fused-src/gtest/CMakeLists.txt \
+	rocksdb/third-party/flashcache/flashcache_ioctl.h \
+	rocksdb/tools/auto_sanity_test.sh \
+	rocksdb/tools/benchmark_leveldb.sh \
+	rocksdb/tools/check_format_compatible.sh \
+	rocksdb/tools/generate_random_db.sh \
+	rocksdb/tools/run_leveldb.sh rocksdb/tools/verify_random_db.sh \
+	rocksdb/tools/dbench_monitor rocksdb/tools/ldb.cc \
+	rocksdb/tools/pflag rocksdb/tools/sst_dump.cc \
+	rocksdb/tools/dump/db_dump_tool.cc \
+	rocksdb/tools/dump/rocksdb_dump.cc \
+	rocksdb/tools/dump/rocksdb_undump.cc rocksdb/tools/Dockerfile \
+	rocksdb/tools/benchmark.sh rocksdb/tools/db_crashtest.py \
+	rocksdb/tools/db_crashtest2.py rocksdb/tools/db_repl_stress.cc \
+	rocksdb/tools/db_sanity_test.cc rocksdb/tools/db_stress.cc \
+	rocksdb/tools/ldb_test.py rocksdb/tools/reduce_levels_test.cc \
+	rocksdb/tools/rocksdb_dump_test.sh \
+	rocksdb/tools/run_flash_bench.sh rocksdb/tools/sample-dump.dmp \
+	rocksdb/util/ldb_cmd_execute_result.h rocksdb/util/bloom.cc \
+	rocksdb/util/allocator.h rocksdb/util/crc32c.h \
+	rocksdb/util/db_info_dumper.cc rocksdb/util/dynamic_bloom.h \
+	rocksdb/util/env_hdfs.cc rocksdb/util/env_posix.cc \
+	rocksdb/util/env_test.cc rocksdb/util/event_logger.cc \
+	rocksdb/util/file_util.cc rocksdb/util/file_util.h \
+	rocksdb/util/heap.h rocksdb/util/ldb_cmd.h \
+	rocksdb/util/mutable_cf_options.h rocksdb/util/build_version.h \
+	rocksdb/util/env.cc rocksdb/util/cache_bench.cc \
+	rocksdb/util/options.cc rocksdb/util/coding.cc \
+	rocksdb/util/coding.h rocksdb/util/coding_test.cc \
+	rocksdb/util/event_logger.h rocksdb/util/log_buffer.cc \
+	rocksdb/util/log_buffer.h rocksdb/util/memenv.cc \
+	rocksdb/util/crc32c_test.cc rocksdb/util/options_helper.cc \
+	rocksdb/util/db_info_dumper.h rocksdb/util/dynamic_bloom.cc \
+	rocksdb/util/hash_cuckoo_rep.cc rocksdb/util/options_helper.h \
+	rocksdb/util/histogram.cc rocksdb/util/histogram_test.cc \
+	rocksdb/util/mock_env.cc rocksdb/util/logging.cc \
+	rocksdb/util/logging.h rocksdb/util/statistics.cc \
+	rocksdb/util/event_logger_test.cc rocksdb/util/perf_level.cc \
+	rocksdb/util/status.cc rocksdb/util/filelock_test.cc \
+	rocksdb/util/filter_policy.cc rocksdb/util/hash.cc \
+	rocksdb/util/hash.h rocksdb/util/arena.h \
+	rocksdb/util/hash_cuckoo_rep.h rocksdb/util/perf_context_imp.h \
+	rocksdb/util/hash_linklist_rep.h \
+	rocksdb/util/hash_skiplist_rep.cc \
+	rocksdb/util/hash_skiplist_rep.h rocksdb/util/mock_env_test.cc \
+	rocksdb/util/mutable_cf_options.cc \
+	rocksdb/util/instrumented_mutex.cc \
+	rocksdb/util/instrumented_mutex.h rocksdb/util/ldb_cmd.cc \
+	rocksdb/util/autovector.h rocksdb/util/skiplistrep.cc \
+	rocksdb/util/manual_compaction_test.cc \
+	rocksdb/util/sync_point.cc rocksdb/util/ldb_tool.cc \
+	rocksdb/util/statistics.h rocksdb/util/xfunc.cc \
+	rocksdb/util/log_write_bench.cc rocksdb/util/xfunc.h \
+	rocksdb/util/memenv_test.cc rocksdb/util/mock_env.h \
+	rocksdb/util/options_test.cc rocksdb/util/perf_context.cc \
+	rocksdb/util/posix_logger.h rocksdb/util/rate_limiter.cc \
+	rocksdb/util/rate_limiter.h rocksdb/util/murmurhash.cc \
+	rocksdb/util/murmurhash.h rocksdb/util/sst_dump_test.cc \
+	rocksdb/util/sst_dump_tool.cc rocksdb/util/mutexlock.h \
+	rocksdb/util/sst_dump_tool_imp.h \
+	rocksdb/util/options_builder.cc rocksdb/util/testutil.cc \
+	rocksdb/util/thread_local.cc rocksdb/util/thread_operation.h \
+	rocksdb/util/thread_status_impl.cc rocksdb/util/arena_test.cc \
+	rocksdb/util/random.h rocksdb/util/slice.cc \
+	rocksdb/util/thread_status_util.cc \
+	rocksdb/util/rate_limiter_test.cc \
+	rocksdb/util/scoped_arena_iterator.h \
+	rocksdb/util/thread_status_util.h rocksdb/util/channel.h \
+	rocksdb/util/slice_transform_test.cc \
+	rocksdb/util/thread_status_updater.cc \
+	rocksdb/util/thread_status_updater.h \
+	rocksdb/util/stl_wrappers.h rocksdb/util/stop_watch.h \
+	rocksdb/util/sync_point.h rocksdb/util/compression.h \
+	rocksdb/util/string_util.h rocksdb/util/string_util.cc \
+	rocksdb/util/vectorrep.cc \
+	rocksdb/util/thread_status_util_debug.cc \
+	rocksdb/util/testharness.cc rocksdb/util/testharness.h \
+	rocksdb/util/heap_test.cc rocksdb/util/thread_list_test.cc \
+	rocksdb/util/thread_local.h rocksdb/util/thread_local_test.cc \
+	rocksdb/util/histogram.h rocksdb/util/cache_test.cc \
+	rocksdb/util/thread_status_updater_debug.cc \
+	rocksdb/util/xxhash.cc rocksdb/util/xxhash.h \
+	rocksdb/util/auto_roll_logger.cc \
+	rocksdb/util/auto_roll_logger.h \
+	rocksdb/util/auto_roll_logger_test.cc \
+	rocksdb/util/autovector_test.cc rocksdb/util/bloom_test.cc \
+	rocksdb/util/cache.cc rocksdb/util/comparator.cc \
+	rocksdb/util/crc32c.cc rocksdb/util/dynamic_bloom_test.cc \
+	rocksdb/util/iostats_context.cc \
+	rocksdb/util/iostats_context_imp.h \
+	rocksdb/util/hash_linklist_rep.cc rocksdb/util/testutil.h \
+	rocksdb/util/arena.cc rocksdb/util/aligned_buffer.h \
+	rocksdb/util/db_test_util.cc \
+	rocksdb/util/delete_scheduler_impl.h \
+	rocksdb/util/file_reader_writer.h \
+	rocksdb/util/options_parser.cc rocksdb/util/perf_step_timer.h \
+	rocksdb/util/db_test_util.h \
+	rocksdb/util/delete_scheduler_test.cc \
+	rocksdb/util/file_reader_writer_test.cc \
+	rocksdb/util/options_parser.h rocksdb/util/status_message.cc \
+	rocksdb/util/compaction_job_stats_impl.cc \
+	rocksdb/util/delete_scheduler_impl.cc \
+	rocksdb/util/file_reader_writer.cc \
+	rocksdb/util/ldb_cmd_test.cc rocksdb/util/perf_level_imp.h \
+	rocksdb/utilities/backupable/backupable_db.cc \
+	rocksdb/utilities/backupable/backupable_db_test.cc \
+	rocksdb/utilities/checkpoint/checkpoint.cc \
+	rocksdb/utilities/checkpoint/checkpoint_test.cc \
+	rocksdb/utilities/document/document_db.cc \
+	rocksdb/utilities/document/json_document_builder.cc \
+	rocksdb/utilities/document/document_db_test.cc \
+	rocksdb/utilities/document/json_document.cc \
+	rocksdb/utilities/document/json_document_test.cc \
+	rocksdb/utilities/geodb/geodb_impl.cc \
+	rocksdb/utilities/geodb/geodb_impl.h \
+	rocksdb/utilities/geodb/geodb_test.cc \
+	rocksdb/utilities/leveldb_options/leveldb_options.cc \
+	rocksdb/utilities/merge_operators.h \
+	rocksdb/utilities/merge_operators/put.cc \
+	rocksdb/utilities/merge_operators/string_append/stringappend.cc \
+	rocksdb/utilities/merge_operators/string_append/stringappend.h \
+	rocksdb/utilities/merge_operators/string_append/stringappend2.h \
+	rocksdb/utilities/merge_operators/string_append/stringappend2.cc \
+	rocksdb/utilities/merge_operators/string_append/stringappend_test.cc \
+	rocksdb/utilities/merge_operators/uint64add.cc \
+	rocksdb/utilities/redis/README \
+	rocksdb/utilities/redis/redis_list_exception.h \
+	rocksdb/utilities/redis/redis_list_iterator.h \
+	rocksdb/utilities/redis/redis_lists.cc \
+	rocksdb/utilities/redis/redis_lists.h \
+	rocksdb/utilities/redis/redis_lists_test.cc \
+	rocksdb/utilities/spatialdb/utils.h \
+	rocksdb/utilities/spatialdb/spatial_db.cc \
+	rocksdb/utilities/spatialdb/spatial_db_test.cc \
+	rocksdb/utilities/ttl/db_ttl_impl.cc \
+	rocksdb/utilities/ttl/db_ttl_impl.h \
+	rocksdb/utilities/ttl/ttl_test.cc \
+	rocksdb/utilities/write_batch_with_index/write_batch_with_index.cc \
+	rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.cc \
+	rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h \
+	rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc \
+	rocksdb/utilities/flashcache/flashcache.cc \
+	rocksdb/utilities/flashcache/flashcache.h \
+	rocksdb/utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc \
+	rocksdb/utilities/compaction_filters/remove_emptyvalue_compactionfilter.h \
+	rocksdb/utilities/convenience/info_log_finder.cc \
+	rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector.cc \
+	rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector.h \
+	rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc \
+	rocksdb/utilities/transactions/optimistic_transaction_db_impl.cc \
+	rocksdb/utilities/transactions/optimistic_transaction_db_impl.h \
+	rocksdb/utilities/transactions/optimistic_transaction_impl.cc \
+	rocksdb/utilities/transactions/optimistic_transaction_test.cc \
+	rocksdb/utilities/transactions/transaction_base.h \
+	rocksdb/utilities/transactions/transaction_db_impl.h \
+	rocksdb/utilities/transactions/transaction_db_mutex_impl.h \
+	rocksdb/utilities/transactions/transaction_impl.h \
+	rocksdb/utilities/transactions/transaction_lock_mgr.h \
+	rocksdb/utilities/transactions/transaction_util.cc \
+	rocksdb/utilities/transactions/optimistic_transaction_impl.h \
+	rocksdb/utilities/transactions/transaction_base.cc \
+	rocksdb/utilities/transactions/transaction_db_impl.cc \
+	rocksdb/utilities/transactions/transaction_db_mutex_impl.cc \
+	rocksdb/utilities/transactions/transaction_impl.cc \
+	rocksdb/utilities/transactions/transaction_lock_mgr.cc \
+	rocksdb/utilities/transactions/transaction_test.cc \
+	rocksdb/utilities/transactions/transaction_util.h \
+	rocksdb/.gitignore rocksdb/CMakeLists.txt rocksdb/HISTORY.md \
+	rocksdb/Makefile rocksdb/USERS.md rocksdb/appveyor.yml \
+	rocksdb/src.mk rocksdb/thirdparty.inc rocksdb/.travis.yml \
+	rocksdb/DUMP_FORMAT.md rocksdb/INSTALL.md \
+	rocksdb/ROCKSDB_LITE.md rocksdb/WINDOWS_PORT.md \
+	rocksdb/appveyordailytests.yml rocksdb/AUTHORS \
 	tracing/tracing-common.h $(srcdir)/$(shell_scripts:%=%.in) \
 	$(srcdir)/vstart.sh $(srcdir)/stop.sh ceph-run \
 	$(srcdir)/ceph-osd-prestart.sh $(srcdir)/ceph_common.sh \
@@ -7548,7 +7946,7 @@ EXTRA_DIST = $(am__append_21) ceph-detect-init/AUTHORS.rst \
 	$(srcdir)/upstart/radosgw-all-starter.conf \
 	$(srcdir)/upstart/rbdmap.conf ceph.in ceph-disk ceph-disk-udev \
 	ceph-create-keys ceph-rest-api ceph-crush-location \
-	mount.fuse.ceph rbd-replay-many rbdmap yasm-wrapper \
+	mount.fuse.ceph rbd-replay-many rbdmap etc-rbdmap yasm-wrapper \
 	unittest_bufferlist.sh
 CLEANFILES = $(BUILT_SOURCES) $(shell_scripts) ceph_ver.h \
 	sample.fetch_config
@@ -7584,9 +7982,10 @@ noinst_HEADERS = arch/intel.h arch/arm.h arch/probe.h \
 	crush/CrushWrapper.h crush/CrushWrapper.i crush/builder.h \
 	crush/crush.h crush/crush_compat.h crush/crush_ln_table.h \
 	crush/grammar.h crush/hash.h crush/mapper.h crush/sample.txt \
-	crush/types.h $(am__append_23) $(am__append_27) \
-	$(am__append_34) $(am__append_36) $(am__append_38) \
-	$(am__append_40) $(am__append_44) $(am__append_47) \
+	crush/types.h $(am__append_27) $(am__append_31) \
+	$(am__append_34) $(am__append_38) $(am__append_40) \
+	$(am__append_44) $(am__append_52) $(am__append_54) \
+	$(am__append_56) \
 	erasure-code/jerasure/gf-complete/include/gf_complete.h \
 	erasure-code/jerasure/gf-complete/include/gf_general.h \
 	erasure-code/jerasure/gf-complete/include/gf_int.h \
@@ -7616,11 +8015,11 @@ noinst_HEADERS = arch/intel.h arch/arm.h arch/probe.h \
 	erasure-code/jerasure/gf-complete/include/gf_rand.h \
 	erasure-code/jerasure/gf-complete/include/gf_method.h \
 	erasure-code/jerasure/gf-complete/include/gf_general.h \
-	$(am__append_65) erasure-code/ErasureCode.h \
+	$(am__append_74) erasure-code/ErasureCode.h \
 	erasure-code/ErasureCodeInterface.h \
 	erasure-code/ErasureCodePlugin.h osdc/Filer.h osdc/Journaler.h \
 	osdc/ObjectCacher.h osdc/Objecter.h osdc/Striper.h \
-	osdc/WritebackHandler.h $(am__append_69) $(am__append_71) \
+	osdc/WritebackHandler.h $(am__append_78) $(am__append_80) \
 	global/pidfile.h global/global_init.h global/global_context.h \
 	global/signal_handler.h json_spirit/json_spirit.h \
 	json_spirit/json_spirit_error_position.h \
@@ -7670,7 +8069,7 @@ noinst_HEADERS = arch/intel.h arch/arm.h arch/probe.h \
 	common/Readahead.h common/Cycles.h common/Initialize.h \
 	common/ContextCompletion.h common/bit_vector.hpp \
 	common/SubProcess.h common/valgrind.h \
-	common/TracepointProvider.h $(am__append_87) common/secret.h \
+	common/TracepointProvider.h $(am__append_100) common/secret.h \
 	msg/Connection.h msg/Dispatcher.h msg/Message.h \
 	msg/Messenger.h msg/SimplePolicyMessenger.h msg/msg_types.h \
 	msg/simple/Accepter.h msg/simple/DispatchQueue.h \
@@ -7678,7 +8077,7 @@ noinst_HEADERS = arch/intel.h arch/arm.h arch/probe.h \
 	msg/simple/SimpleMessenger.h msg/async/AsyncConnection.h \
 	msg/async/AsyncMessenger.h msg/async/Event.h \
 	msg/async/EventEpoll.h msg/async/EventSelect.h \
-	msg/async/net_handler.h $(am__append_96) messages/MAuth.h \
+	msg/async/net_handler.h $(am__append_109) messages/MAuth.h \
 	messages/MAuthReply.h messages/MCacheExpire.h \
 	messages/MClientCaps.h messages/MClientCapRelease.h \
 	messages/MClientLease.h messages/MClientReconnect.h \
@@ -7765,15 +8164,15 @@ noinst_HEADERS = arch/intel.h arch/arm.h arch/probe.h \
 	include/util.h include/stat.h include/on_exit.h \
 	include/memory.h include/rados/memory.h \
 	include/unordered_set.h include/unordered_map.h \
-	include/timegm.h $(am__append_102) $(am__append_105) \
-	$(am__append_109) $(am__append_115) $(am__append_118) \
-	$(am__append_121) $(am__append_122) $(am__append_128) \
-	$(am__append_150) $(am__append_166) $(am__append_172) \
-	$(am__append_184) test/bench/backend.h test/bench/bencher.h \
-	test/bench/detailed_stat_collector.h test/bench/distribution.h \
-	test/bench/dumb_backend.h test/bench/rados_backend.h \
-	test/bench/rbd_backend.h test/bench/stat_collector.h \
-	test/bench/testfilestore_backend.h \
+	include/timegm.h $(am__append_115) $(am__append_118) \
+	$(am__append_119) $(am__append_124) $(am__append_130) \
+	$(am__append_134) $(am__append_137) $(am__append_138) \
+	$(am__append_144) $(am__append_166) $(am__append_182) \
+	$(am__append_188) $(am__append_200) test/bench/backend.h \
+	test/bench/bencher.h test/bench/detailed_stat_collector.h \
+	test/bench/distribution.h test/bench/dumb_backend.h \
+	test/bench/rados_backend.h test/bench/rbd_backend.h \
+	test/bench/stat_collector.h test/bench/testfilestore_backend.h \
 	test/common/ObjectContents.h test/encoding/types.h \
 	test/objectstore/DeterministicOpSequence.h \
 	test/objectstore/FileStoreDiff.h \
@@ -7792,55 +8191,55 @@ noinst_HEADERS = arch/intel.h arch/arm.h arch/probe.h \
 	test/system/st_rados_list_objects.h \
 	test/system/st_rados_notify.h test/system/st_rados_watch.h \
 	test/system/systest_runnable.h test/system/systest_settings.h \
-	test/unit.h tools/cephfs/JournalTool.h \
-	tools/cephfs/JournalScanner.h tools/cephfs/JournalFilter.h \
-	tools/cephfs/EventOutput.h tools/cephfs/Resetter.h \
-	tools/cephfs/Dumper.h tools/cephfs/TableTool.h \
-	tools/cephfs/MDSUtility.h tools/RadosDump.h \
-	tools/rados/RadosImport.h tools/ceph_objectstore_tool.h \
-	tools/rados/PoolDump.h tools/cephfs/DataScan.h \
-	compressor/Compressor.h compressor/AsyncCompressor.h \
-	compressor/SnappyCompressor.h cls_acl.cc cls_crypto.cc \
-	fetch_config logrotate.conf sample.ceph.conf \
-	bash_completion/ceph bash_completion/rados bash_completion/rbd \
-	bash_completion/radosgw-admin mount/canonicalize.c \
-	mount/mtab.c objclass/objclass.h
-bin_SCRIPTS = $(am__append_20) $(am__append_219) $(am__append_230) \
-	$(am__append_238)
+	test/unit.h test/journal/RadosTestFixture.h $(am__append_221) \
+	tools/cephfs/JournalTool.h tools/cephfs/JournalScanner.h \
+	tools/cephfs/JournalFilter.h tools/cephfs/EventOutput.h \
+	tools/cephfs/Resetter.h tools/cephfs/Dumper.h \
+	tools/cephfs/TableTool.h tools/cephfs/MDSUtility.h \
+	tools/RadosDump.h tools/rados/RadosImport.h \
+	tools/ceph_objectstore_tool.h tools/rados/PoolDump.h \
+	tools/cephfs/DataScan.h compressor/Compressor.h \
+	compressor/AsyncCompressor.h compressor/SnappyCompressor.h \
+	cls_acl.cc cls_crypto.cc fetch_config logrotate.conf \
+	sample.ceph.conf bash_completion/ceph bash_completion/rados \
+	bash_completion/rbd bash_completion/radosgw-admin \
+	mount/canonicalize.c mount/mtab.c objclass/objclass.h
+bin_SCRIPTS = $(am__append_24) $(am__append_236) $(am__append_246) \
+	$(am__append_254)
 sbin_SCRIPTS = 
-su_sbin_SCRIPTS = $(am__append_235)
+su_sbin_SCRIPTS = $(am__append_251)
 dist_bin_SCRIPTS = 
-lib_LTLIBRARIES = $(am__append_101) $(am__append_104) \
-	$(am__append_108) $(am__append_211) $(am__append_228) \
-	$(am__append_229)
+lib_LTLIBRARIES = $(am__append_114) $(am__append_117) \
+	$(am__append_123) $(am__append_228) $(am__append_244) \
+	$(am__append_245)
 noinst_LTLIBRARIES = libarch.la libauth.la libcrush.la libmon_types.la \
-	$(am__append_22) $(am__append_26) libos_types.la \
-	$(am__append_33) $(am__append_35) $(am__append_37) \
-	libosd_types.la $(am__append_46) liberasure_code.la libosdc.la \
-	$(am__append_68) $(am__append_70) libglobal.la \
+	$(am__append_43) libosd_types.la liberasure_code.la libosdc.la \
+	$(am__append_77) $(am__append_79) libglobal.la \
 	libjson_spirit.la liblog.la libperfglue.la \
-	libcommon_internal.la libcommon_crc.la $(am__append_85) \
-	libcommon.la $(am__append_88) libmsg.la $(am__append_97) \
-	librbd_types.la $(am__append_106) $(am__append_111) \
-	$(am__append_116) $(am__append_123) $(am__append_159) \
-	$(am__append_169) $(am__append_174) $(am__append_200) \
-	libcompressor.la $(am__append_221)
-noinst_LIBRARIES = $(am__append_39) $(am__append_117)
-radoslib_LTLIBRARIES = $(am__append_119) $(am__append_120)
+	libcommon_internal.la libcommon_crc.la $(am__append_98) \
+	libcommon.la $(am__append_101) libmsg.la $(am__append_110) \
+	librbd_types.la $(am__append_121) $(am__append_126) \
+	$(am__append_131) $(am__append_139) $(am__append_175) \
+	$(am__append_185) $(am__append_190) $(am__append_216) \
+	libcompressor.la $(am__append_238)
+noinst_LIBRARIES = $(am__append_26) $(am__append_39) libos_types.a \
+	$(am__append_51) $(am__append_53) $(am__append_55) \
+	$(am__append_133)
+radoslib_LTLIBRARIES = $(am__append_135) $(am__append_136)
 
 # like bin_PROGRAMS, but these targets are only built for debug builds
-bin_DEBUGPROGRAMS = $(am__append_72) $(am__append_114) \
-	$(am__append_130) $(am__append_160) $(am__append_161) \
-	$(am__append_162) $(am__append_163) $(am__append_165) \
-	$(am__append_167) $(am__append_173) $(am__append_175) \
-	$(am__append_176) $(am__append_179) $(am__append_181) \
-	$(am__append_182) $(am__append_183) $(am__append_185) \
-	$(am__append_186) $(am__append_187) $(am__append_188) \
-	$(am__append_194) ceph_test_timers ceph_test_signal_handlers \
-	ceph_test_rewrite_latency ceph_test_crypto $(am__append_199) \
+bin_DEBUGPROGRAMS = $(am__append_81) $(am__append_129) \
+	$(am__append_146) $(am__append_176) $(am__append_177) \
+	$(am__append_178) $(am__append_179) $(am__append_181) \
+	$(am__append_183) $(am__append_189) $(am__append_191) \
+	$(am__append_192) $(am__append_195) $(am__append_197) \
+	$(am__append_198) $(am__append_199) $(am__append_201) \
+	$(am__append_202) $(am__append_203) $(am__append_204) \
+	$(am__append_210) ceph_test_timers ceph_test_signal_handlers \
+	ceph_test_rewrite_latency ceph_test_crypto $(am__append_215) \
 	ceph_bench_log ceph_test_objectcacher_stress \
 	ceph_test_cfuse_cache_invalidate ceph_test_get_blkdev_size \
-	$(am__append_202) $(am__append_204) $(am__append_205) \
+	$(am__append_219) $(am__append_223) $(am__append_224) \
 	ceph_psim
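
As the comment above notes, targets listed in bin_DEBUGPROGRAMS are only built
for debug builds. A purely illustrative Makefile.am sketch of how such a target
is registered (the target name is hypothetical, not taken from this diff):

	# hypothetical debug-only helper; built only when debug builds are enabled
	ceph_test_example_SOURCES = test/test_example.cc
	ceph_test_example_LDADD = $(CEPH_GLOBAL)
	bin_DEBUGPROGRAMS += ceph_test_example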
 
 # like sbin_SCRIPTS but can be used to install to e.g. /usr/sbin
@@ -7850,11 +8249,11 @@ ceph_sbindir = $(sbindir)
 su_sbindir = /sbin
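
su_sbindir is hard-wired to /sbin so that anything in su_sbin_SCRIPTS lands in
the root sbin directory instead of under $(prefix). A hedged sketch of the
Makefile.am usage (the script name is an assumption, not read from this hunk):

	# assumed example: helper that must live in /sbin
	su_sbin_SCRIPTS += mount.fuse.ceph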
 
# C/C++ tests to build and execute will be appended to this
-check_TESTPROGRAMS = $(am__append_136) $(am__append_140) \
-	$(am__append_143) $(am__append_164) $(am__append_168) \
-	$(am__append_177) $(am__append_190) $(am__append_191) \
-	$(am__append_195) $(am__append_196) $(am__append_197) \
-	$(am__append_198) unittest_addrs $(am__append_201) \
+check_TESTPROGRAMS = $(am__append_152) $(am__append_156) \
+	$(am__append_159) $(am__append_180) $(am__append_184) \
+	$(am__append_193) $(am__append_206) $(am__append_207) \
+	$(am__append_211) $(am__append_212) $(am__append_213) \
+	$(am__append_214) unittest_addrs $(am__append_218) \
 	unittest_bloom_filter unittest_histogram \
 	unittest_prioritized_queue unittest_str_map \
 	unittest_sharedptr_registry unittest_shared_cache \
@@ -7898,8 +8297,8 @@ check_TESTPROGRAMS = $(am__append_136) $(am__append_140) \
 # You should have received a copy of the GNU Affero General Public License
 # along with this program.  If not, see `<http://www.gnu.org/licenses/>`.
 #
-check_SCRIPTS = ceph-detect-init/run-tox.sh $(am__append_127) \
-	$(am__append_171) test/ceph_objectstore_tool.py \
+check_SCRIPTS = ceph-detect-init/run-tox.sh $(am__append_143) \
+	$(am__append_187) test/ceph_objectstore_tool.py \
 	test/test-ceph-helpers.sh test/cephtool-test-osd.sh \
 	test/cephtool-test-mon.sh test/cephtool-test-mds.sh \
 	test/cephtool-test-rados.sh unittest_bufferlist.sh \
@@ -7907,7 +8306,8 @@ check_SCRIPTS = ceph-detect-init/run-tox.sh $(am__append_127) \
 	test/mon/misc.sh test/mon/osd-crush.sh test/mon/mon-ping.sh \
 	test/mon/osd-erasure-code-profile.sh test/mon/mkfs.sh \
 	test/mon/mon-scrub.sh test/osd/osd-scrub-repair.sh \
-	test/osd/osd-config.sh test/osd/osd-bench.sh \
+	test/osd/osd-scrub-snaps.sh test/osd/osd-config.sh \
+	test/osd/osd-bench.sh test/osd/osd-reactivate.sh \
 	test/osd/osd-copy-from.sh test/mon/mon-handle-forward.sh \
 	test/libradosstriper/rados-striper.sh \
 	test/test_objectstore_memstore.sh test/ceph-disk.sh \
@@ -7933,17 +8333,11 @@ HARDENING_LDFLAGS = \
                      -Wl,-z,relro \
                      -Wl,-z,now
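
Together these request full RELRO hardening: -z relro maps the relocated data
segment (including the GOT) read-only after startup, and -z now forces eager
symbol binding so the protection can cover the PLT as well. A hypothetical
helper target to spot-check a built binary (target and binary names are
illustrative; exact readelf wording varies by toolchain):

	check-relro: ceph-osd
		readelf -l -d $< | grep -E 'GNU_RELRO|BIND_NOW'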
 
-AM_COMMON_CPPFLAGS = \
-	-D__CEPH__ \
-	-D_FILE_OFFSET_BITS=64 \
-	-D_REENTRANT \
-	-D_THREAD_SAFE \
-	-D__STDC_FORMAT_MACROS \
-	-D_GNU_SOURCE \
-	-DCEPH_LIBDIR=\"${libdir}\" \
-	-DCEPH_PKGLIBDIR=\"${pkglibdir}\" \
-	-DGTEST_USE_OWN_TR1_TUPLE=0
-
+AM_COMMON_CPPFLAGS = -D__CEPH__ -D_FILE_OFFSET_BITS=64 -D_THREAD_SAFE \
+	-D__STDC_FORMAT_MACROS -D_GNU_SOURCE \
+	-DCEPH_LIBDIR=\"${libdir}\" -DCEPH_PKGLIBDIR=\"${pkglibdir}\" \
+	-DGTEST_USE_OWN_TR1_TUPLE=0 $(am__append_2) $(am__append_3) \
+	$(am__append_4) $(am__append_5)
 AM_COMMON_CFLAGS = \
 	-Wall \
 	${WARN_TYPE_LIMITS} \
@@ -7954,8 +8348,8 @@ AM_COMMON_CFLAGS = \
 	-fno-strict-aliasing \
 	-fsigned-char
 
-AM_CFLAGS = $(AM_COMMON_CFLAGS) $(HARDENING_CFLAGS) $(am__append_5) \
-	$(am__append_74) $(am__append_77)
+AM_CFLAGS = $(AM_COMMON_CFLAGS) $(HARDENING_CFLAGS) $(am__append_9) \
+	$(am__append_83) $(am__append_86)
 AM_CPPFLAGS = $(AM_COMMON_CPPFLAGS)
 
# note: this is position dependent, it affects the -l options that
@@ -7966,43 +8360,45 @@ AM_CPPFLAGS = $(AM_COMMON_CPPFLAGS)
 # http://www.gentoo.org/proj/en/qa/asneeded.xml
 # http://gcc.gnu.org/ml/gcc-help/2010-12/msg00338.html
 # http://sigquit.wordpress.com/2011/02/16/why-asneeded-doesnt-work-as-expected-for-your-libraries-on-your-autotools-project/
-AM_LDFLAGS = $(am__append_2) $(am__append_4)
+AM_LDFLAGS = $(am__append_6) $(am__append_8)
 AM_CCASFLAGS = -f elf64
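
The position dependence noted above comes from -Wl,--as-needed: the linker only
keeps a library if objects appearing before it on the command line reference
it. Automake emits AM_LDFLAGS before the object files and LDADD after them,
hence the rule of thumb sketched here (target name is illustrative):

	# libraries in LDADD follow the objects, so --as-needed retains them
	demo_LDADD = -luuid $(LIBCOMMON)
	# the same -luuid placed in AM_LDFLAGS would precede the objects
	# and could be dropped by --as-needed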
 
 #####################
-EXTRALIBS = -lm $(am__append_7) $(am__append_8) $(am__append_9) \
-	$(am__append_19)
+EXTRALIBS = -lm $(am__append_11) $(am__append_12) $(am__append_13) \
+	$(am__append_22)
 LIBGLOBAL = libglobal.la
 LIBCOMMON = libcommon.la
 LIBSECRET = libsecret.la
 LIBARCH = libarch.la
-LIBPERFGLUE = libperfglue.la $(am__append_13) $(am__append_14)
+LIBPERFGLUE = libperfglue.la $(am__append_16) $(am__append_17)
 LIBAUTH = libauth.la
 LIBMSG = libmsg.la
 LIBCRUSH = libcrush.la
 LIBCOMPRESSOR = libcompressor.la -lsnappy
 LIBJSON_SPIRIT = libjson_spirit.la
+LIBKV = libkv.a $(am__append_23) -lbz2 -lz -lleveldb -lsnappy
 LIBLOG = liblog.la
-
-# Always use system leveldb
-LIBOS = libos.la $(am__append_10) $(am__append_11) $(am__append_12) \
-	-lleveldb -lsnappy
-LIBOS_TYPES = libos_types.la
+LIBOS = libos.a $(am__append_14) $(am__append_15) $(LIBOS_TYPES) \
+	$(LIBKV)
+LIBOS_TYPES = libos_types.a
 
 # Libosd always needs osdc and os
-LIBOSD = libosd.la $(am__append_16) $(LIBOSDC) $(LIBOS) $(LIBPERFGLUE)
+
+# OSD needs types
+LIBOSD = libosd.a $(am__append_19) $(LIBOSDC) $(LIBOS) $(LIBPERFGLUE) \
+	$(LIBOSD_TYPES) $(LIBOS_TYPES)
 LIBOSD_TYPES = libosd_types.la
 LIBOSDC = libosdc.la
 
 # These have references to syms like ceph_using_tcmalloc(), glue libperfglue to them
-LIBMON = libmon.la $(am__append_15) $(LIBPERFGLUE)
+LIBMON = libmon.a $(am__append_18) $(LIBPERFGLUE) $(LIBMON_TYPES)
 LIBMON_TYPES = libmon_types.la
-LIBMDS = libmds.la $(am__append_17) $(LIBPERFGLUE)
+LIBMDS = libmds.la $(am__append_20) $(LIBPERFGLUE)
 LIBCLIENT = libclient.la
 LIBCLIENT_FUSE = libclient_fuse.la
 LIBRADOS = librados.la
 LIBRADOSSTRIPER = libradosstriper.la
-LIBRGW = librgw.la $(am__append_18)
+LIBRGW = librgw.la $(am__append_21)
 LIBCIVETWEB = libcivetweb.la
 LIBRBD = librbd.la
 LIBRBD_TYPES = librbd_types.la
@@ -8017,18 +8413,19 @@ CEPH_GLOBAL = $(LIBGLOBAL) $(LIBCOMMON) $(PTHREAD_LIBS) -lm $(CRYPTO_LIBS) $(EXT
 
 # important; libmsg before libauth!
 LIBCOMMON_DEPS = libcommon_internal.la libcommon_crc.la \
-	$(am__append_84) $(LIBERASURE_CODE) $(LIBMSG) $(LIBAUTH) \
+	$(am__append_97) $(LIBERASURE_CODE) $(LIBMSG) $(LIBAUTH) \
 	$(LIBCRUSH) $(LIBJSON_SPIRIT) $(LIBLOG) $(LIBARCH) \
-	$(BOOST_RANDOM_LIBS) $(am__append_86)
-LIBRADOS_DEPS = $(am__append_98)
-LIBRGW_DEPS = $(am__append_112)
+	$(BOOST_RANDOM_LIBS) $(am__append_99)
+LIBRADOS_DEPS = $(am__append_111)
+LIBRGW_DEPS = $(am__append_127)
 
 # This is used by the dencoder test
 
 # Do not use TCMALLOC with dencoder
-DENCODER_SOURCES = $(am__append_24) perfglue/disabled_heap_profiler.cc \
-	perfglue/disabled_stubs.cc $(am__append_110)
-DENCODER_DEPS = $(am__append_25) $(am__append_124)
+DENCODER_SOURCES = $(am__append_41) perfglue/disabled_heap_profiler.cc \
+	perfglue/disabled_stubs.cc $(am__append_125)
+DENCODER_DEPS = $(am__append_42) $(am__append_120) $(am__append_132) \
+	$(am__append_140)
 radoslibdir = $(libdir)/rados-classes
 libarch_la_SOURCES = \
 	arch/intel.c \
@@ -8062,10 +8459,18 @@ libcrush_la_SOURCES = \
 	crush/CrushCompiler.cc \
 	crush/CrushTester.cc
 
+@ENABLE_SERVER_TRUE@libkv_a_SOURCES = kv/KeyValueDB.cc \
+@ENABLE_SERVER_TRUE@	kv/LevelDBStore.cc $(am__append_29) \
+@ENABLE_SERVER_TRUE@	$(am__append_32) $(am__append_35)
+@ENABLE_SERVER_TRUE@libkv_a_CXXFLAGS = ${AM_CXXFLAGS} -I \
+@ENABLE_SERVER_TRUE@	rocksdb/include $(am__append_28) \
+@ENABLE_SERVER_TRUE@	$(am__append_36)
+@ENABLE_SERVER_TRUE@libkv_a_LIBADD = $(am__append_30) $(am__append_33) \
+@ENABLE_SERVER_TRUE@	$(am__append_37)
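
Reading the appends above: libkv.a always compiles kv/KeyValueDB.cc and
kv/LevelDBStore.cc, while optional backends arrive through configure-time
conditionals. In Makefile.am terms the pattern is presumably along these lines
(conditional and path assumed from context, since this change moves the
RocksDB store out of os/):

	if WITH_SLIBROCKSDB
	libkv_a_SOURCES += kv/RocksDBStore.cc
	libkv_a_LIBADD += rocksdb/librocksdb.la
	endif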
 libmon_types_la_SOURCES = \
 	mon/PGMap.cc
 
-@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@libmon_la_SOURCES = \
+@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@libmon_a_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	mon/Monitor.cc \
 @ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	mon/Paxos.cc \
 @ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	mon/PaxosService.cc \
@@ -8080,7 +8485,7 @@ libmon_types_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	mon/DataHealthService.cc \
 @ENABLE_SERVER_TRUE@@WITH_MON_TRUE@	mon/ConfigKeyService.cc
 
-@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@libmon_la_LIBADD = $(LIBAUTH) $(LIBCOMMON) $(LIBOS) $(LIBMON_TYPES)
+@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@libmon_a_LIBADD = 
 LIBMDS_SOURCES = \
 	mds/Capability.cc \
 	mds/MDSDaemon.cc \
@@ -8106,42 +8511,32 @@ LIBMDS_SOURCES = \
 	mds/MDSTableClient.cc \
 	mds/MDSTableServer.cc \
 	mds/SimpleLock.cc \
+	mds/ScrubStack.cc \
 	mds/SnapRealm.cc \
 	mds/SnapServer.cc \
 	mds/snap.cc \
 	mds/SessionMap.cc \
 	mds/MDSContext.cc \
 	mds/MDSAuthCaps.cc \
-	mds/MDLog.cc \
-	common/TrackedOp.cc
+	mds/MDLog.cc
 
 LIBMDS_DEPS = $(LIBOSDC)
@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@libmds_la_SOURCES = $(LIBMDS_SOURCES)
@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@libmds_la_LIBADD = $(LIBMDS_DEPS)
-libos_types_la_SOURCES = os/Transaction.cc $(am__append_29)
-libos_types_la_CXXFLAGS = ${AM_CXXFLAGS}
-@ENABLE_SERVER_TRUE@libos_la_SOURCES = os/chain_xattr.cc os/fs/FS.cc \
+libos_types_a_SOURCES = os/Transaction.cc $(am__append_46)
+libos_types_a_CXXFLAGS = ${AM_CXXFLAGS}
+@ENABLE_SERVER_TRUE@libos_a_SOURCES = os/chain_xattr.cc os/fs/FS.cc \
 @ENABLE_SERVER_TRUE@	os/DBObjectMap.cc os/GenericObjectMap.cc \
 @ENABLE_SERVER_TRUE@	os/FileJournal.cc os/FileStore.cc \
 @ENABLE_SERVER_TRUE@	os/GenericFileStoreBackend.cc \
 @ENABLE_SERVER_TRUE@	os/HashIndex.cc os/IndexManager.cc \
-@ENABLE_SERVER_TRUE@	os/JournalingObjectStore.cc \
-@ENABLE_SERVER_TRUE@	os/LevelDBStore.cc os/LFNIndex.cc \
-@ENABLE_SERVER_TRUE@	os/MemStore.cc os/KeyValueDB.cc \
-@ENABLE_SERVER_TRUE@	os/KeyValueStore.cc os/ObjectStore.cc \
-@ENABLE_SERVER_TRUE@	os/WBThrottle.cc common/TrackedOp.cc \
-@ENABLE_SERVER_TRUE@	$(am__append_28) $(am__append_30) \
-@ENABLE_SERVER_TRUE@	$(am__append_31) $(am__append_32) \
-@ENABLE_SERVER_TRUE@	$(am__append_41)
-@ENABLE_SERVER_TRUE@libos_la_CXXFLAGS = ${AM_CXXFLAGS} \
-@ENABLE_SERVER_TRUE@	$(am__append_42)
-@ENABLE_SERVER_TRUE@libos_la_LIBADD = $(LIBOS_TYPES) $(am__append_43)
-@ENABLE_SERVER_TRUE@@WITH_DLIBROCKSDB_TRUE@libos_rocksdb_la_SOURCES = os/RocksDBStore.cc
-@ENABLE_SERVER_TRUE@@WITH_SLIBROCKSDB_TRUE@libos_rocksdb_la_SOURCES = os/RocksDBStore.cc
-@ENABLE_SERVER_TRUE@@WITH_DLIBROCKSDB_TRUE@libos_rocksdb_la_CXXFLAGS = ${AM_CXXFLAGS} ${LIBROCKSDB_CFLAGS} -std=gnu++11
-@ENABLE_SERVER_TRUE@@WITH_SLIBROCKSDB_TRUE@libos_rocksdb_la_CXXFLAGS = ${AM_CXXFLAGS} ${LIBROCKSDB_CFLAGS} -std=gnu++11 -I rocksdb/include
-@ENABLE_SERVER_TRUE@@WITH_DLIBROCKSDB_TRUE@libos_rocksdb_la_LIBADD = -lrocksdb
-@ENABLE_SERVER_TRUE@@WITH_SLIBROCKSDB_TRUE@libos_rocksdb_la_LIBADD = rocksdb/librocksdb.la
+@ENABLE_SERVER_TRUE@	os/JournalingObjectStore.cc os/LFNIndex.cc \
+@ENABLE_SERVER_TRUE@	os/MemStore.cc os/KeyValueStore.cc \
+@ENABLE_SERVER_TRUE@	os/ObjectStore.cc os/WBThrottle.cc \
+@ENABLE_SERVER_TRUE@	$(am__append_45) $(am__append_47) \
+@ENABLE_SERVER_TRUE@	$(am__append_48) $(am__append_49)
+@ENABLE_SERVER_TRUE@libos_a_LIBADD = libos_types.a libkv.a \
+@ENABLE_SERVER_TRUE@	$(am__append_50)
@ENABLE_SERVER_TRUE@@WITH_LIBZFS_TRUE@libos_zfs_a_SOURCES = os/ZFS.cc
@ENABLE_SERVER_TRUE@@WITH_LIBZFS_TRUE@libos_zfs_a_CXXFLAGS = ${AM_CXXFLAGS} ${LIBZFS_CFLAGS}
 libosd_types_la_SOURCES = \
@@ -8150,7 +8545,7 @@ libosd_types_la_SOURCES = \
 	osd/ECUtil.cc
 
 libosd_types_la_CXXFLAGS = ${AM_CXXFLAGS}
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@libosd_la_SOURCES = \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@libosd_a_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/PG.cc \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/ReplicatedPG.cc \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/ReplicatedBackend.cc \
@@ -8164,20 +8559,17 @@ libosd_types_la_CXXFLAGS = ${AM_CXXFLAGS}
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/Watch.cc \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/ClassHandler.cc \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/OpRequest.cc \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	common/TrackedOp.cc \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	osd/SnapMapper.cc \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	objclass/class_api.cc
 
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@libosd_la_CXXFLAGS =  \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	${AM_CXXFLAGS} \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_45)
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@libosd_la_LIBADD = $(LIBOSDC) $(LIBOS) $(LIBOSD_TYPES) $(LIBOS_TYPES)
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@libosd_a_CXXFLAGS = ${AM_CXXFLAGS}
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@libosd_a_LIBADD = 
 erasure_codelibdir = $(pkglibdir)/erasure-code
 erasure_codelib_LTLIBRARIES = libec_jerasure_generic.la \
-	$(am__append_50) $(am__append_52) $(am__append_54) \
-	libec_jerasure.la libec_lrc.la libec_shec_generic.la \
 	$(am__append_59) $(am__append_61) $(am__append_63) \
-	libec_shec.la $(am__append_67) $(am__append_134)
+	libec_jerasure.la libec_lrc.la libec_shec_generic.la \
+	$(am__append_68) $(am__append_70) $(am__append_72) \
+	libec_shec.la $(am__append_76) $(am__append_150)
 jerasure_sources = \
   erasure-code/ErasureCode.cc \
   erasure-code/jerasure/jerasure/src/cauchy.c \
@@ -8209,8 +8601,8 @@ libec_jerasure_generic_la_CXXFLAGS = ${AM_CXXFLAGS} \
 	-I$(srcdir)/erasure-code/jerasure/jerasure/include
 
 libec_jerasure_generic_la_LIBADD = $(LIBCRUSH) $(PTHREAD_LIBS) $(EXTRALIBS)
-libec_jerasure_generic_la_LDFLAGS = ${AM_LDFLAGS} -version-info 2:0:0 \
-	$(am__append_48)
+libec_jerasure_generic_la_LDFLAGS = ${AM_LDFLAGS} -module \
+	-avoid-version -shared $(am__append_57)
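
This hunk is the first of a series applying the same change to every
erasure-code plugin below: the versioned-library flags (-version-info) give way
to -module -avoid-version -shared, libtool's spelling for a dlopen-style plugin
that installs a single unversioned libec_<name>.so. A hedged sketch of the
resulting pattern for a hypothetical plugin:

	# illustrative plugin, not part of this change
	libec_demo_la_SOURCES = erasure-code/demo/ErasureCodePluginDemo.cc
	libec_demo_la_LDFLAGS = $(AM_LDFLAGS) -module -avoid-version -shared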
 libec_jerasure_neon_la_SOURCES = ${jerasure_sources}                                       \
                                   erasure-code/jerasure/gf-complete/src/neon/gf_w4_neon.c  \
                                   erasure-code/jerasure/gf-complete/src/neon/gf_w8_neon.c  \
@@ -8229,8 +8621,8 @@ libec_jerasure_neon_la_CXXFLAGS = ${AM_CXXFLAGS} \
 	-I$(srcdir)/erasure-code/jerasure/jerasure/include
 
 libec_jerasure_neon_la_LIBADD = $(LIBCRUSH) $(PTHREAD_LIBS) $(EXTRALIBS)
-libec_jerasure_neon_la_LDFLAGS = ${AM_LDFLAGS} -version-info 2:0:0 \
-	$(am__append_49)
+libec_jerasure_neon_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version \
+	-shared $(am__append_58)
 libec_jerasure_sse3_la_SOURCES = ${jerasure_sources}
 libec_jerasure_sse3_la_CFLAGS = ${AM_CFLAGS}  \
 	${INTEL_SSE_FLAGS} \
@@ -8249,8 +8641,8 @@ libec_jerasure_sse3_la_CXXFLAGS = ${AM_CXXFLAGS} \
 	-I$(srcdir)/erasure-code/jerasure/jerasure/include
 
 libec_jerasure_sse3_la_LIBADD = $(LIBCRUSH) $(PTHREAD_LIBS) $(EXTRALIBS)
-libec_jerasure_sse3_la_LDFLAGS = ${AM_LDFLAGS} -version-info 2:0:0 \
-	$(am__append_51)
+libec_jerasure_sse3_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version \
+	-shared $(am__append_60)
 libec_jerasure_sse4_la_SOURCES = ${jerasure_sources}
 libec_jerasure_sse4_la_CFLAGS = ${AM_CFLAGS}  \
 	${INTEL_SSE_FLAGS} \
@@ -8273,16 +8665,16 @@ libec_jerasure_sse4_la_CXXFLAGS = ${AM_CXXFLAGS} \
 	-I$(srcdir)/erasure-code/jerasure/jerasure/include
 
 libec_jerasure_sse4_la_LIBADD = $(LIBCRUSH) $(PTHREAD_LIBS) $(EXTRALIBS)
-libec_jerasure_sse4_la_LDFLAGS = ${AM_LDFLAGS} -version-info 2:0:0 \
-	$(am__append_53)
+libec_jerasure_sse4_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version \
+	-shared $(am__append_62)
 libec_jerasure_la_SOURCES = \
 	erasure-code/jerasure/ErasureCodePluginSelectJerasure.cc
 
 libec_jerasure_la_CFLAGS = ${AM_CFLAGS}
 libec_jerasure_la_CXXFLAGS = ${AM_CXXFLAGS}
 libec_jerasure_la_LIBADD = $(LIBCRUSH) $(PTHREAD_LIBS) $(EXTRALIBS)
-libec_jerasure_la_LDFLAGS = ${AM_LDFLAGS} -version-info 2:0:0 \
-	$(am__append_55)
+libec_jerasure_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version \
+	-shared $(am__append_64)
 lrc_sources = \
   erasure-code/ErasureCode.cc \
   erasure-code/lrc/ErasureCodePluginLrc.cc \
@@ -8292,8 +8684,8 @@ libec_lrc_la_SOURCES = ${lrc_sources} common/str_map.cc
 libec_lrc_la_CFLAGS = ${AM_CFLAGS}
 libec_lrc_la_CXXFLAGS = ${AM_CXXFLAGS}
 libec_lrc_la_LIBADD = $(LIBCRUSH) $(PTHREAD_LIBS) $(LIBJSON_SPIRIT)
-libec_lrc_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0 \
-	$(am__append_56)
+libec_lrc_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared \
+	$(am__append_65)
 
 # SHEC plugin
 shec_sources = \
@@ -8333,8 +8725,8 @@ libec_shec_generic_la_CXXFLAGS = ${AM_CXXFLAGS} \
 	-I$(srcdir)/erasure-code/shec
 
 libec_shec_generic_la_LIBADD = $(LIBCRUSH) $(PTHREAD_LIBS) $(EXTRALIBS)
-libec_shec_generic_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0 \
-	$(am__append_57)
+libec_shec_generic_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version \
+	-shared $(am__append_66)
 libec_shec_neon_la_SOURCES = ${shec_sources} \
 	erasure-code/jerasure/gf-complete/src/neon/gf_w4_neon.c \
 	erasure-code/jerasure/gf-complete/src/neon/gf_w8_neon.c \
@@ -8357,8 +8749,8 @@ libec_shec_neon_la_CXXFLAGS = ${AM_CXXFLAGS} \
 	-I$(srcdir)/erasure-code/shec
 
 libec_shec_neon_la_LIBADD = $(LIBCRUSH) $(PTHREAD_LIBS) $(EXTRALIBS)
-libec_shec_neon_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0 \
-	$(am__append_58)
+libec_shec_neon_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version \
+	-shared $(am__append_67)
 libec_shec_sse3_la_SOURCES = ${shec_sources}
 libec_shec_sse3_la_CFLAGS = ${AM_CFLAGS}  \
 	${INTEL_SSE_FLAGS} \
@@ -8381,8 +8773,8 @@ libec_shec_sse3_la_CXXFLAGS = ${AM_CXXFLAGS} \
 	-I$(srcdir)/erasure-code/shec
 
 libec_shec_sse3_la_LIBADD = $(LIBCRUSH) $(PTHREAD_LIBS) $(EXTRALIBS)
-libec_shec_sse3_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0 \
-	$(am__append_60)
+libec_shec_sse3_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version \
+	-shared $(am__append_69)
 libec_shec_sse4_la_SOURCES = ${shec_sources}
 libec_shec_sse4_la_CFLAGS = ${AM_CFLAGS}  \
 	${INTEL_SSE_FLAGS} \
@@ -8409,16 +8801,16 @@ libec_shec_sse4_la_CXXFLAGS = ${AM_CXXFLAGS} \
 	-I$(srcdir)/erasure-code/shec
 
 libec_shec_sse4_la_LIBADD = $(LIBCRUSH) $(PTHREAD_LIBS) $(EXTRALIBS)
-libec_shec_sse4_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0 \
-	$(am__append_62)
+libec_shec_sse4_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version \
+	-shared $(am__append_71)
 libec_shec_la_SOURCES = \
 	erasure-code/shec/ErasureCodePluginSelectShec.cc
 
 libec_shec_la_CFLAGS = ${AM_CFLAGS}
 libec_shec_la_CXXFLAGS = ${AM_CXXFLAGS}
 libec_shec_la_LIBADD = $(LIBCRUSH) $(PTHREAD_LIBS) $(EXTRALIBS)
-libec_shec_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0 \
-	$(am__append_64)
+libec_shec_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared \
+	$(am__append_73)
@WITH_BETTER_YASM_ELF64_TRUE@isa_sources = \
 @WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/ErasureCode.cc \
 @WITH_BETTER_YASM_ELF64_TRUE@	erasure-code/isa/isa-l/erasure_code/ec_base.c \
@@ -8473,8 +8865,8 @@ libec_shec_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0 \
@WITH_BETTER_YASM_ELF64_TRUE@libec_isa_la_CCASFLAGS = ${AM_CCASFLAGS} -I $(abs_srcdir)/erasure-code/isa/isa-l/include/
@WITH_BETTER_YASM_ELF64_TRUE@libec_isa_la_LIBADD = $(LIBCRUSH) $(PTHREAD_LIBS) $(EXTRALIBS)
@WITH_BETTER_YASM_ELF64_TRUE@libec_isa_la_LDFLAGS = ${AM_LDFLAGS} \
-@WITH_BETTER_YASM_ELF64_TRUE@	-version-info 2:14:0 \
-@WITH_BETTER_YASM_ELF64_TRUE@	$(am__append_66)
+@WITH_BETTER_YASM_ELF64_TRUE@	-module -avoid-version -shared \
+@WITH_BETTER_YASM_ELF64_TRUE@	$(am__append_75)
@WITH_BETTER_YASM_ELF64_TRUE@libec_isa_la_LIBTOOLFLAGS = --tag=CC
 liberasure_code_la_SOURCES = \
 	erasure-code/ErasureCodePlugin.cc
@@ -8506,7 +8898,8 @@ libglobal_la_SOURCES = \
 	global/global_context.cc \
 	global/global_init.cc \
 	global/pidfile.cc \
-	global/signal_handler.cc
+	global/signal_handler.cc \
+	common/TrackedOp.cc
 
 libglobal_la_LIBADD = $(LIBCOMMON)
 libjson_spirit_la_SOURCES = \
@@ -8518,8 +8911,8 @@ liblog_la_SOURCES = \
 	log/Log.cc \
 	log/SubsystemMap.cc
 
-libperfglue_la_SOURCES = $(am__append_73) $(am__append_76) \
-	$(am__append_79) $(am__append_80) $(am__append_81)
+libperfglue_la_SOURCES = $(am__append_82) $(am__append_85) \
+	$(am__append_88) $(am__append_89) $(am__append_90)
@WITH_TCMALLOC_FALSE@@WITH_TCMALLOC_MINIMAL_TRUE@libperfglue_la_LIBADD = -ltcmalloc_minimal
@WITH_TCMALLOC_TRUE@libperfglue_la_LIBADD = -ltcmalloc
 
@@ -8534,7 +8927,7 @@ libcommon_internal_la_SOURCES = ceph_ver.c common/DecayCounter.cc \
 	common/Throttle.cc common/Timer.cc common/Finisher.cc \
 	common/environment.cc common/assert.cc common/run_cmd.cc \
 	common/WorkQueue.cc common/ConfUtils.cc common/MemoryModel.cc \
-	common/armor.c common/fd.cc common/xattr.c common/safe_io.c \
+	common/armor.c common/fd.cc common/safe_io.c \
 	common/snap_types.cc common/str_list.cc common/str_map.cc \
 	common/errno.cc common/RefCountedObj.cc common/common_init.cc \
 	common/pipe.c common/ceph_argparse.cc common/ceph_context.cc \
@@ -8544,22 +8937,21 @@ libcommon_internal_la_SOURCES = ceph_ver.c common/DecayCounter.cc \
 	common/config.cc common/utf8.c common/mime.c common/strtol.cc \
 	common/page.cc common/lockdep.cc common/version.cc \
 	common/hex.cc common/entity_name.cc common/ceph_crypto.cc \
-	common/ceph_crypto_cms.cc common/ceph_json.cc common/ipaddr.cc \
-	common/pick_address.cc common/util.cc common/TextTable.cc \
+	common/ceph_crypto_cms.cc common/TextTable.cc \
 	common/ceph_fs.cc common/ceph_hash.cc common/ceph_strings.cc \
 	common/ceph_frag.cc common/addr_parsing.c common/hobject.cc \
-	common/bloom_filter.cc common/linux_version.c common/module.c \
-	common/Readahead.cc common/Cycles.cc \
-	common/ContextCompletion.cc common/TracepointProvider.cc \
-	common/blkdev.cc $(am__append_82) mon/MonCap.cc \
-	mon/MonClient.cc mon/MonMap.cc osd/OSDMap.cc osd/osd_types.cc \
-	osd/ECMsgTypes.cc osd/HitSet.cc mds/MDSMap.cc \
+	common/bloom_filter.cc common/module.c common/Readahead.cc \
+	common/Cycles.cc common/ContextCompletion.cc \
+	common/TracepointProvider.cc $(am__append_91) $(am__append_92) \
+	$(am__append_93) $(am__append_94) $(am__append_95) \
+	mon/MonCap.cc mon/MonClient.cc mon/MonMap.cc osd/OSDMap.cc \
+	osd/osd_types.cc osd/ECMsgTypes.cc osd/HitSet.cc mds/MDSMap.cc \
 	mds/inode_backtrace.cc mds/mdstypes.cc mds/flock.cc
 
 # inject crc in common
 libcommon_crc_la_SOURCES = common/sctp_crc32.c common/crc32c.cc \
 	common/crc32c_intel_baseline.c common/crc32c_intel_fast.c \
-	$(am__append_83)
+	$(am__append_96)
@WITH_GOOD_YASM_ELF64_TRUE@libcommon_crc_la_LIBTOOLFLAGS = --tag=CC
@HAVE_ARMV8_CRC_TRUE@libcommon_crc_aarch64_la_SOURCES = common/crc32c_aarch64.c
@HAVE_ARMV8_CRC_TRUE@libcommon_crc_aarch64_la_CFLAGS = $(AM_CFLAGS) $(ARM_CRC_FLAGS)
@@ -8573,9 +8965,9 @@ libmsg_la_SOURCES = msg/Message.cc msg/Messenger.cc msg/msg_types.cc \
 	msg/simple/SimpleMessenger.cc msg/async/AsyncConnection.cc \
 	msg/async/AsyncMessenger.cc msg/async/Event.cc \
 	msg/async/net_handler.cc msg/async/EventSelect.cc \
-	$(am__append_89) $(am__append_90) $(am__append_91) \
-	$(am__append_92) $(am__append_93) $(am__append_94) \
-	$(am__append_95)
+	$(am__append_102) $(am__append_103) $(am__append_104) \
+	$(am__append_105) $(am__append_106) $(am__append_107) \
+	$(am__append_108)
@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@rados_includedir = $(includedir)/rados
@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@rados_include_DATA = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(srcdir)/include/rados/librados.h \
@@ -8618,12 +9010,12 @@ libmsg_la_SOURCES = msg/Message.cc msg/Messenger.cc msg/msg_types.cc \
 # We need this to avoid basename conflicts with the librados build tests in test/Makefile.am
@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@librados_la_CXXFLAGS =  \
@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	${AM_CXXFLAGS} \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__append_99)
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__append_112)
@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@librados_la_LIBADD = $(LIBRADOS_DEPS) $(PTHREAD_LIBS) $(CRYPTO_LIBS) $(EXTRALIBS)
@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@librados_la_LDFLAGS =  \
@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	${AM_LDFLAGS} \
@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	-version-info 2:0:0 \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__append_100)
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(am__append_113)
@ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@libradosstriper_la_SOURCES = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@	libradosstriper/libradosstriper.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@	libradosstriper/RadosStriperImpl.cc \
@@ -8637,13 +9029,30 @@ libmsg_la_SOURCES = msg/Message.cc msg/Messenger.cc msg/msg_types.cc \
@ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@libradosstriper_la_LDFLAGS = ${AM_LDFLAGS} \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@	-version-info \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@	1:0:0 \
-@ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@	$(am__append_103)
+@ENABLE_CLIENT_TRUE@@WITH_RADOSSTRIPER_TRUE@@WITH_RADOS_TRUE@	$(am__append_116)
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@libjournal_la_SOURCES = \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	journal/AsyncOpTracker.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	journal/Entry.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	journal/Future.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	journal/FutureImpl.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	journal/Journaler.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	journal/JournalMetadata.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	journal/JournalPlayer.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	journal/JournalRecorder.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	journal/JournalTrimmer.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	journal/ObjectPlayer.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	journal/ObjectRecorder.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	journal/Utils.cc
+
 librbd_types_la_SOURCES = \
+	librbd/JournalTypes.cc \
 	librbd/WatchNotifyTypes.cc
 
@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@librbd_internal_la_SOURCES = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AioCompletion.cc \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AioRequest.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AioImageRequest.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AioImageRequestWQ.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AioObjectRequest.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AsyncFlattenRequest.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AsyncObjectThrottle.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/AsyncOperation.cc \
@@ -8655,6 +9064,9 @@ librbd_types_la_SOURCES = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/ImageCtx.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/ImageWatcher.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/internal.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/Journal.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/JournalReplay.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/LibrbdAdminSocketHook.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/LibrbdWriteback.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/ObjectMap.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/RebuildObjectMapRequest.cc
@@ -8666,17 +9078,18 @@ librbd_types_la_SOURCES = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd/librbd.cc
 
@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@librbd_la_LIBADD = \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd_internal.la $(LIBRBD_TYPES) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd_internal.la $(LIBRBD_TYPES) libjournal.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBRADOS) $(LIBCOMMON) $(LIBOSDC) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librados_internal.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	libcls_rbd_client.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	libcls_lock_client.la \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	libcls_journal_client.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(PTHREAD_LIBS) $(EXTRALIBS)
 
@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@librbd_la_LDFLAGS = ${AM_LDFLAGS} \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	-version-info \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	1:0:0 \
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__append_107)
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(am__append_122)
@ENABLE_CLIENT_TRUE@@LINUX_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@librbd_la_CXXFLAGS = -fvisibility=hidden -fvisibility-inlines-hidden
@ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@librgw_la_SOURCES = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	rgw/librgw.cc \
@@ -8792,6 +9205,10 @@ librbd_types_la_SOURCES = \
 
@ENABLE_CLIENT_TRUE@libcls_cephfs_client_la_SOURCES = cls/cephfs/cls_cephfs_client.cc
@ENABLE_CLIENT_TRUE@libcls_numops_client_la_SOURCES = cls/numops/cls_numops_client.cc
+@ENABLE_CLIENT_TRUE@libcls_journal_client_la_SOURCES = \
+@ENABLE_CLIENT_TRUE@	cls/journal/cls_journal_client.cc \
+@ENABLE_CLIENT_TRUE@	cls/journal/cls_journal_types.cc
+
@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@libcls_hello_la_SOURCES = cls/hello/cls_hello.cc
@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@libcls_hello_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@libcls_hello_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared -export-symbols-regex '.*__cls_.*'
@@ -8827,7 +9244,7 @@ librbd_types_la_SOURCES = \
@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@libcls_replica_log_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared -export-symbols-regex '.*__cls_.*'
@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@libcls_user_la_SOURCES = cls/user/cls_user.cc
@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@libcls_user_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@libcls_user_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0 -export-symbols-regex '.*__cls_.*'
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@libcls_user_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared -export-symbols-regex '.*__cls_.*'
@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@libcls_rgw_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	cls/rgw/cls_rgw.cc \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	cls/rgw/cls_rgw_ops.cc \
@@ -8839,6 +9256,12 @@ librbd_types_la_SOURCES = \
@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@libcls_cephfs_la_SOURCES = cls/cephfs/cls_cephfs.cc
@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@libcls_cephfs_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@libcls_cephfs_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared -export-symbols-regex '.*__cls_.*'
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@libcls_journal_la_SOURCES = \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	cls/journal/cls_journal.cc \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	cls/journal/cls_journal_types.cc
+
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@libcls_journal_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@libcls_journal_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared -export-symbols-regex '.*__cls_.*'
@ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE@libcls_kvs_la_SOURCES = key_value_store/cls_kvs.cc
@ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE@libcls_kvs_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
@ENABLE_SERVER_TRUE@@LINUX_TRUE@@WITH_OSD_TRUE@libcls_kvs_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared -export-symbols-regex '.*__cls_.*'
@@ -8900,7 +9323,7 @@ librbd_types_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBOSD) $(LIBCOMMON) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(BOOST_PROGRAM_OPTIONS_LIBS) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(CEPH_GLOBAL) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_129)
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_145)
@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@ceph_erasure_code_non_regression_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	test/erasure-code/ceph_erasure_code_non_regression.cc
 
@@ -8908,7 +9331,7 @@ librbd_types_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBOSD) $(LIBCOMMON) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(BOOST_PROGRAM_OPTIONS_LIBS) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(CEPH_GLOBAL) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_131)
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_147)
@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@ceph_erasure_code_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	test/erasure-code/ceph_erasure_code.cc
 
@@ -8916,7 +9339,7 @@ librbd_types_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBOSD) $(LIBCOMMON) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(BOOST_PROGRAM_OPTIONS_LIBS) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(CEPH_GLOBAL) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_133)
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_149)
@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@libec_example_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	erasure-code/ErasureCode.cc \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	test/erasure-code/ErasureCodePluginExample.cc
@@ -8979,7 +9402,7 @@ librbd_types_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBOSD) $(LIBCOMMON) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(UNITTEST_LDADD) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(CEPH_GLOBAL) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_135)
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_151)
@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@unittest_erasure_code_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	erasure-code/ErasureCode.cc \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	test/erasure-code/TestErasureCode.cc
@@ -9002,7 +9425,7 @@ librbd_types_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBOSD) $(LIBCOMMON) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(UNITTEST_LDADD) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(CEPH_GLOBAL) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_137)
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_153)
@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@unittest_erasure_code_plugin_jerasure_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	test/erasure-code/TestErasureCodePluginJerasure.cc
 
@@ -9011,7 +9434,7 @@ librbd_types_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBOSD) $(LIBCOMMON) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(UNITTEST_LDADD) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(CEPH_GLOBAL) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_138)
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_154)
@ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@unittest_erasure_code_isa_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	erasure-code/ErasureCode.cc \
 @ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	test/erasure-code/TestErasureCodeIsa.cc
@@ -9023,7 +9446,7 @@ librbd_types_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	$(CEPH_GLOBAL) \
 @ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	.libs/libec_isa.la \
 @ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	$(LIBERASURE_CODE) \
-@ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	$(am__append_139)
+@ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	$(am__append_155)
@ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@unittest_erasure_code_plugin_isa_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	erasure-code/ErasureCode.cc \
 @ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	test/erasure-code/TestErasureCodePluginIsa.cc
@@ -9035,7 +9458,7 @@ librbd_types_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	$(CEPH_GLOBAL) \
 @ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	.libs/libec_isa.la \
 @ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	$(LIBERASURE_CODE) \
-@ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	$(am__append_141)
+@ENABLE_SERVER_TRUE@@WITH_BETTER_YASM_ELF64_TRUE@@WITH_OSD_TRUE@	$(am__append_157)
@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@unittest_erasure_code_lrc_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	test/erasure-code/TestErasureCodeLrc.cc \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	${lrc_sources}
@@ -9045,7 +9468,7 @@ librbd_types_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBOSD) $(LIBCOMMON) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(UNITTEST_LDADD) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(CEPH_GLOBAL) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_142)
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_158)
@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@unittest_erasure_code_plugin_lrc_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	test/erasure-code/TestErasureCodePluginLrc.cc
 
@@ -9054,7 +9477,7 @@ librbd_types_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBOSD) $(LIBCOMMON) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(UNITTEST_LDADD) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(CEPH_GLOBAL) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_144)
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_160)
@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@unittest_erasure_code_shec_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	test/erasure-code/TestErasureCodeShec.cc \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	${shec_sources}
@@ -9075,7 +9498,7 @@ librbd_types_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBOSD) $(LIBCOMMON) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(UNITTEST_LDADD) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(CEPH_GLOBAL) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_145)
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_161)
@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@unittest_erasure_code_shec_all_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	test/erasure-code/TestErasureCodeShec_all.cc \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	${shec_sources}
@@ -9096,7 +9519,7 @@ librbd_types_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBOSD) $(LIBCOMMON) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(UNITTEST_LDADD) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(CEPH_GLOBAL) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_146)
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_162)
@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@unittest_erasure_code_shec_thread_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	test/erasure-code/TestErasureCodeShec_thread.cc \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	${shec_sources}
@@ -9117,7 +9540,7 @@ librbd_types_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBOSD) $(LIBCOMMON) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(UNITTEST_LDADD) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(CEPH_GLOBAL) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_147)
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_163)
@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@unittest_erasure_code_shec_arguments_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	test/erasure-code/TestErasureCodeShec_arguments.cc \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	${shec_sources}
@@ -9138,7 +9561,7 @@ librbd_types_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBOSD) $(LIBCOMMON) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(UNITTEST_LDADD) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(CEPH_GLOBAL) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_148)
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_164)
@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@unittest_erasure_code_plugin_shec_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@        test/erasure-code/TestErasureCodePluginShec.cc
 
@@ -9147,7 +9570,7 @@ librbd_types_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBOSD) $(LIBCOMMON) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(UNITTEST_LDADD) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(CEPH_GLOBAL) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_149)
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_165)
@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@libec_test_shec_neon_la_SOURCES = test/erasure-code/TestShecPluginNEON.cc
@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@libec_test_shec_neon_la_CFLAGS = ${AM_CFLAGS}
@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@libec_test_shec_neon_la_CXXFLAGS = ${AM_CXXFLAGS}
@@ -9185,7 +9608,7 @@ librbd_types_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(CEPH_GLOBAL) \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(PTHREAD_LIBS) \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(EXTRALIBS) \
-@ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(am__append_151)
+@ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(am__append_167)
@ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@simple_client_SOURCES = \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	test/messenger/simple_client.cc \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	test/messenger/simple_dispatcher.cc
@@ -9197,7 +9620,7 @@ librbd_types_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(CEPH_GLOBAL) \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(PTHREAD_LIBS) \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(EXTRALIBS) \
-@ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(am__append_152)
+@ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(am__append_168)
@ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@xio_server_SOURCES = \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	test/messenger/xio_server.cc \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	test/messenger/xio_dispatcher.cc
@@ -9209,7 +9632,7 @@ librbd_types_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(LIBCOMMON) \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(PTHREAD_LIBS) \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(EXTRALIBS) \
-@ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(am__append_154)
+@ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(am__append_170)
@ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@xio_client_SOURCES = \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	test/messenger/xio_client.cc \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	test/messenger/xio_dispatcher.cc
@@ -9221,7 +9644,7 @@ librbd_types_la_SOURCES = \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(LIBCOMMON) \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(PTHREAD_LIBS) \
 @ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(EXTRALIBS) \
-@ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(am__append_155)
+@ENABLE_SERVER_TRUE@@ENABLE_XIO_TRUE@	$(am__append_171)
 
 # This should use LIBMDS_TYPES once it exists
@ENABLE_CLIENT_TRUE@ceph_dencoder_SOURCES = \
@@ -9239,9 +9662,9 @@ librbd_types_la_SOURCES = \
 
 # These should always use explicit _CFLAGS/_CXXFLAGS so avoid basename conflicts
@ENABLE_CLIENT_TRUE@ceph_dencoder_CFLAGS = ${AM_CFLAGS} \
-@ENABLE_CLIENT_TRUE@	$(am__append_156)
+@ENABLE_CLIENT_TRUE@	$(am__append_172)
@ENABLE_CLIENT_TRUE@ceph_dencoder_CXXFLAGS = ${AM_CXXFLAGS} \
-@ENABLE_CLIENT_TRUE@	$(am__append_157)
+@ENABLE_CLIENT_TRUE@	$(am__append_173)
@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@libradostest_la_SOURCES = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	test/librados/test.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	test/librados/TestCase.cc
@@ -9362,6 +9785,12 @@ librbd_types_la_SOURCES = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@    $(UNITTEST_LDADD) $(CEPH_GLOBAL) $(RADOS_TEST_LDADD)
 
@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@ceph_test_cls_numops_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@ceph_test_cls_journal_SOURCES = test/cls_journal/test_cls_journal.cc
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@ceph_test_cls_journal_LDADD = \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@        libcls_journal_client.la $(LIBRADOS) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@        $(LIBCOMMON) $(CRYPTO_LIBS) $(UNITTEST_LDADD) $(RADOS_TEST_LDADD) -luuid
+
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@ceph_test_cls_journal_CXXFLAGS = $(UNITTEST_CXXFLAGS)
@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@ceph_test_rados_api_cmd_SOURCES = test/librados/cmd.cc
@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@ceph_test_rados_api_cmd_LDADD = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(LIBCOMMON) $(LIBRADOS) $(CRYPTO_LIBS) \
@@ -9433,6 +9862,25 @@ librbd_types_la_SOURCES = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	test/librados_test_stub/TestRadosClient.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	test/librados_test_stub/TestWatchNotify.cc
 
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@unittest_journal_SOURCES = \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	test/journal/test_main.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@        test/journal/test_Entry.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	test/journal/test_FutureImpl.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	test/journal/test_Journaler.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	test/journal/test_JournalMetadata.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	test/journal/test_JournalPlayer.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	test/journal/test_JournalRecorder.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	test/journal/test_JournalTrimmer.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	test/journal/test_ObjectPlayer.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	test/journal/test_ObjectRecorder.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	test/journal/RadosTestFixture.cc
+
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@unittest_journal_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@unittest_journal_LDADD = \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	libjournal.la libcls_journal_client.la \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	librados_test_stub.la librados_internal.la \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	$(UNITTEST_LDADD) $(CEPH_GLOBAL) $(RADOS_TEST_LDADD)
+
@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@ceph_smalliobenchrbd_SOURCES = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/bench/small_io_bench_rbd.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/bench/rbd_backend.cc \
@@ -9455,6 +9903,8 @@ librbd_types_la_SOURCES = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/test_librbd.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/test_ImageWatcher.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/test_internal.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/test_JournalEntries.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/test_JournalReplay.cc \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	test/librbd/test_ObjectMap.cc
 
@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@librbd_test_la_CXXFLAGS = $(UNITTEST_CXXFLAGS)
@@ -9466,6 +9916,7 @@ librbd_types_la_SOURCES = \
@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@unittest_librbd_LDADD = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd_test.la librbd_api.la librbd_internal.la $(LIBRBD_TYPES) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	libcls_rbd_client.la libcls_lock_client.la \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	libjournal.la libcls_journal_client.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librados_test_stub.la librados_internal.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBOSDC) $(UNITTEST_LDADD) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(CEPH_GLOBAL) $(RADOS_TEST_LDADD)
@@ -9477,6 +9928,7 @@ librbd_types_la_SOURCES = \
@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@ceph_test_librbd_LDADD = \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librbd_test.la librbd_api.la librbd_internal.la $(LIBRBD_TYPES) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	libcls_rbd_client.la libcls_lock_client.la \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	libjournal.la libcls_journal_client.la \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	librados_api.la $(LIBRADOS_DEPS) $(UNITTEST_LDADD) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(CEPH_GLOBAL) $(RADOS_TEST_LDADD)
 
@@ -9542,8 +9994,9 @@ librbd_types_la_SOURCES = \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	test/libcephfs/readdir_r_cb.cc \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	test/libcephfs/caps.cc \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	test/libcephfs/multiclient.cc \
-@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__append_178)
-@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@ceph_test_libcephfs_LDADD = $(LIBCEPHFS) $(UNITTEST_LDADD)
+@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	test/libcephfs/access.cc \
+@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__append_194)
+@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@ceph_test_libcephfs_LDADD = $(LIBRADOS) $(LIBCEPHFS) $(LIBCOMMON) $(UNITTEST_LDADD)
@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@ceph_test_libcephfs_CXXFLAGS = $(UNITTEST_CXXFLAGS)
@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@unittest_encoding_SOURCES = test/encoding.cc
@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@ceph_test_c_headers_SOURCES = test/test_c_headers.c
@@ -9565,7 +10018,7 @@ librbd_types_la_SOURCES = \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	-Wignored-qualifiers \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	-Wold-style-definition \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	-Wtype-limits \
-@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__append_180)
+@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__append_196)
@ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@test_build_librgw_SOURCES = \
 @ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	test/buildtest_skeleton.cc \
 @ENABLE_CLIENT_TRUE@@WITH_BUILD_TESTS_TRUE@@WITH_RADOSGW_TRUE@@WITH_RADOS_TRUE@	$(librgw_la_SOURCES)
@@ -9748,7 +10201,7 @@ librbd_types_la_SOURCES = \
@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@ceph_test_keys_SOURCES = test/testkeys.cc
@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@ceph_test_keys_LDADD = $(LIBMON) $(CEPH_GLOBAL) 
@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@get_command_descriptions_SOURCES = test/common/get_command_descriptions.cc
-@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@get_command_descriptions_LDADD = $(LIBMON) $(LIBCOMMON) $(CEPH_GLOBAL)
+@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@get_command_descriptions_LDADD = $(LIBMON) $(LIBMON_TYPES) $(LIBOS) $(LIBCOMMON) $(CEPH_GLOBAL)
@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@unittest_mon_moncap_SOURCES = test/mon/moncap.cc
@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@unittest_mon_moncap_LDADD = $(LIBMON) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@unittest_mon_moncap_CXXFLAGS = $(UNITTEST_CXXFLAGS)
@@ -9763,13 +10216,13 @@ librbd_types_la_SOURCES = \
@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@unittest_osdscrub_LDADD =  \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBOSD) $(UNITTEST_LDADD) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(CEPH_GLOBAL) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_192)
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_208)
@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@unittest_pglog_SOURCES = test/osd/TestPGLog.cc
@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@unittest_pglog_CXXFLAGS = $(UNITTEST_CXXFLAGS)
@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@unittest_pglog_LDADD = $(LIBOSD) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(UNITTEST_LDADD) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(CEPH_GLOBAL) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_193)
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_209)
@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@unittest_hitset_SOURCES = test/osd/hitset.cc
@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@unittest_hitset_CXXFLAGS = $(UNITTEST_CXXFLAGS)
@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@unittest_hitset_LDADD = $(LIBOSD) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
@@ -9783,8 +10236,8 @@ librbd_types_la_SOURCES = \
@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@unittest_pageset_LDADD = $(UNITTEST_LDADD)
@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@unittest_pageset_CXXFLAGS = $(UNITTEST_CXXFLAGS)
@ENABLE_SERVER_TRUE@@WITH_SLIBROCKSDB_TRUE@unittest_rocksdb_option_static_SOURCES = test/objectstore/TestRocksdbOptionParse.cc
-@ENABLE_SERVER_TRUE@@WITH_SLIBROCKSDB_TRUE@unittest_rocksdb_option_static_LDADD = $(LIBOS) $(UNITTEST_LDADD) $(CEPH_GLOBAL) rocksdb/librocksdb.la
-@ENABLE_SERVER_TRUE@@WITH_SLIBROCKSDB_TRUE@unittest_rocksdb_option_static_CXXFLAGS = $(UNITTEST_CXXFLAGS) ${AM_CXXFLAGS} ${LIBROCKSDB_CFLAGS} -std=gnu++11 -I rocksdb/include
+@ENABLE_SERVER_TRUE@@WITH_SLIBROCKSDB_TRUE@unittest_rocksdb_option_static_LDADD = $(LIBOS) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+@ENABLE_SERVER_TRUE@@WITH_SLIBROCKSDB_TRUE@unittest_rocksdb_option_static_CXXFLAGS = $(UNITTEST_CXXFLAGS) ${AM_CXXFLAGS} ${LIBROCKSDB_CFLAGS} -I rocksdb/include
@ENABLE_SERVER_TRUE@@WITH_DLIBROCKSDB_TRUE@unittest_rocksdb_option_SOURCES = test/objectstore/TestRocksdbOptionParse.cc
@ENABLE_SERVER_TRUE@@WITH_DLIBROCKSDB_TRUE@unittest_rocksdb_option_LDADD = $(LIBOS) $(UNITTEST_LDADD) $(CEPH_GLOBAL) -lrocksdb
@ENABLE_SERVER_TRUE@@WITH_DLIBROCKSDB_TRUE@unittest_rocksdb_option_CXXFLAGS = $(UNITTEST_CXXFLAGS) ${AM_CXXFLAGS} ${LIBROCKSDB_CFLAGS} -std=gnu++11
@@ -9833,12 +10286,10 @@ UNITTEST_CXXFLAGS = \
 	-I$(top_srcdir)/src/gmock/gtest/include \
 	-I$(top_builddir)/src/gmock/gtest/include
 
-UNITTEST_LDADD = \
-	$(top_builddir)/src/gmock/lib/libgmock_main.la \
+UNITTEST_LDADD = $(top_builddir)/src/gmock/lib/libgmock_main.la \
 	$(top_builddir)/src/gmock/lib/libgmock.la \
 	$(top_builddir)/src/gmock/gtest/lib/libgtest.la \
-	$(PTHREAD_LIBS)
-
+	$(PTHREAD_LIBS) $(am__append_217)
 unittest_addrs_SOURCES = test/test_addrs.cc
 unittest_addrs_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 unittest_addrs_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
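
unittest_addrs just above shows the recurring recipe for gtest binaries in this
file: compile with UNITTEST_CXXFLAGS so the gmock/gtest headers resolve, and
link UNITTEST_LDADD to pull in libgmock_main and friends. A new test would be
wired the same way (the name is illustrative):

	unittest_demo_SOURCES = test/test_demo.cc
	unittest_demo_CXXFLAGS = $(UNITTEST_CXXFLAGS)
	unittest_demo_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
	check_TESTPROGRAMS += unittest_demo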
@@ -10031,6 +10482,44 @@ ceph_test_get_blkdev_size_LDADD = $(LIBCOMMON)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	needs cleanup so it can \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@	go in libcommon.la
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE at rados_LDADD = libcls_lock_client.la $(LIBRADOS) $(LIBRADOSSTRIPER) $(CEPH_GLOBAL)
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@rbd_SOURCES = \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/rbd.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/ArgumentTypes.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/IndentStream.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/OptionPrinter.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/Shell.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/Utils.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/BenchWrite.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/Children.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/Clone.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/Copy.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/Create.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/Diff.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/DiskUsage.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/Export.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/ExportDiff.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/Feature.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/Flatten.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/ImageMeta.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/Import.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/ImportDiff.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/Info.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/Kernel.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/List.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/Lock.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/MergeDiff.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/ObjectMap.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/Remove.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/Rename.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/Resize.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/Snap.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/Status.cc \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	tools/rbd/action/Watch.cc
+
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@rbd_LDADD = \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(LIBKRBD) $(LIBRBD) $(LIBRADOS) $(CEPH_GLOBAL) \
+@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@	$(BOOST_REGEX_LIBS) $(BOOST_PROGRAM_OPTIONS_LIBS)
+
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@ceph_client_debug_SOURCES = tools/ceph-client-debug.cc
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@ceph_client_debug_LDADD = $(LIBCEPHFS) $(LIBCLIENT) $(CEPH_GLOBAL) $(LIBCOMMON)
 @ENABLE_SERVER_TRUE@ceph_osdomap_tool_SOURCES = tools/ceph_osdomap_tool.cc
@@ -10047,7 +10536,7 @@ ceph_test_get_blkdev_size_LDADD = $(LIBCOMMON)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBOSD) $(LIBOS) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(CEPH_GLOBAL) \
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(BOOST_PROGRAM_OPTIONS_LIBS) \
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_206)
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_225)
 @ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE@cephfs_journal_tool_SOURCES = \
 @ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE@	tools/cephfs/cephfs-journal-tool.cc \
 @ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE@	tools/cephfs/JournalTool.cc \
@@ -10072,7 +10561,7 @@ ceph_test_get_blkdev_size_LDADD = $(LIBCOMMON)
 
 @ENABLE_CLIENT_TRUE@@ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@@WITH_RADOS_TRUE@cephfs_data_scan_LDADD = $(LIBMDS) libcls_cephfs_client.la $(LIBRADOS) $(CEPH_GLOBAL)
 monmaptool_SOURCES = tools/monmaptool.cc
-monmaptool_LDADD = $(CEPH_GLOBAL) $(LIBCOMMON)
+monmaptool_LDADD = $(CEPH_GLOBAL)
 crushtool_SOURCES = tools/crushtool.cc
 crushtool_LDADD = $(CEPH_GLOBAL)
 osdmaptool_SOURCES = tools/osdmaptool.cc
@@ -10080,9 +10569,9 @@ osdmaptool_LDADD = $(CEPH_GLOBAL)
 ceph_psim_SOURCES = tools/psim.cc
 ceph_psim_LDADD = $(CEPH_GLOBAL)
 ceph_conf_SOURCES = tools/ceph_conf.cc
-ceph_conf_LDADD = $(CEPH_GLOBAL) $(LIBCOMMON)
+ceph_conf_LDADD = $(CEPH_GLOBAL)
 ceph_authtool_SOURCES = tools/ceph_authtool.cc
-ceph_authtool_LDADD = $(CEPH_GLOBAL) $(LIBCOMMON)
+ceph_authtool_LDADD = $(CEPH_GLOBAL)
 libcompressor_la_SOURCES = \
 	compressor/Compressor.cc \
 	compressor/AsyncCompressor.cc
@@ -10142,7 +10631,7 @@ editpaths = sed \
 	-e 's|@@GCOV_PREFIX_STRIP[@][@]|$(GCOV_PREFIX_STRIP)|g'
 
 shell_scripts = ceph-debugpack ceph-post-file ceph-crush-location \
-	$(am__append_232)
+	$(am__append_248)
 doc_DATA = $(srcdir)/sample.ceph.conf sample.fetch_config
 
 # various scripts
@@ -10157,12 +10646,12 @@ ceph_libexec_SCRIPTS = ceph-osd-prestart.sh
 @WITH_LTTNG_TRUE@TESTS_ENVIRONMENT = LD_PRELOAD=liblttng-ust-fork.so; export LD_PRELOAD; echo "LD_PRELOAD=$${LD_PRELOAD}";
 
 # pybind
-python_PYTHON = $(am__append_213) $(am__append_216) $(am__append_220) \
-	$(am__append_226) $(am__append_231)
+python_PYTHON = $(am__append_230) $(am__append_233) $(am__append_237) \
+	$(am__append_242) $(am__append_247)
 @ENABLE_CLIENT_TRUE@bash_completiondir = $(sysconfdir)/bash_completion.d
 @ENABLE_CLIENT_TRUE@bash_completion_DATA =  \
 @ENABLE_CLIENT_TRUE@	$(srcdir)/bash_completion/ceph \
-@ENABLE_CLIENT_TRUE@	$(am__append_215) $(am__append_218)
+@ENABLE_CLIENT_TRUE@	$(am__append_232) $(am__append_235)
 @ENABLE_CLIENT_TRUE@ceph_syn_SOURCES = ceph_syn.cc \
 @ENABLE_CLIENT_TRUE@	client/SyntheticClient.cc # uses g_conf.. \
 @ENABLE_CLIENT_TRUE@	needs cleanup
@@ -10171,8 +10660,6 @@ python_PYTHON = $(am__append_213) $(am__append_216) $(am__append_220) \
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@librados_config_LDADD = $(LIBRADOS) $(CEPH_GLOBAL)
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@libkrbd_la_SOURCES = krbd.cc
 @ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@libkrbd_la_LIBADD = $(LIBSECRET) $(LIBCOMMON) -lblkid -ludev
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@rbd_SOURCES = rbd.cc
-@ENABLE_CLIENT_TRUE@@WITH_RADOS_TRUE@@WITH_RBD_TRUE@rbd_LDADD = $(LIBKRBD) $(LIBRBD) $(LIBRADOS) $(CEPH_GLOBAL)
 
 # Fuse targets
 @ENABLE_CLIENT_TRUE@@WITH_FUSE_TRUE@@WITH_RADOS_TRUE@ceph_fuse_SOURCES = ceph_fuse.cc
@@ -10191,7 +10678,7 @@ python_PYTHON = $(am__append_213) $(am__append_216) $(am__append_220) \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	1:0:0 \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	-export-symbols-regex \
 @ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	'^ceph_.*' \
-@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__append_227)
+@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@	$(am__append_243)
 
 # jni library (java source is in src/java)
 @ENABLE_CEPHFS_JAVA_TRUE@@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@libcephfs_jni_la_SOURCES = \
@@ -10204,13 +10691,17 @@ python_PYTHON = $(am__append_213) $(am__append_216) $(am__append_220) \
 @ENABLE_CEPHFS_JAVA_TRUE@@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@libcephfs_jni_la_CPPFLAGS = $(JDK_CPPFLAGS) $(AM_CPPFLAGS)
 @ENABLE_CEPHFS_JAVA_TRUE@@ENABLE_CLIENT_TRUE@@WITH_CEPHFS_TRUE@@WITH_RADOS_TRUE@libcephfs_jni_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0
 @ENABLE_SERVER_TRUE@ceph_sbin_SCRIPTS = ceph-create-keys \
-@ENABLE_SERVER_TRUE@	$(am__append_237)
+@ENABLE_SERVER_TRUE@	$(am__append_253)
 @ENABLE_SERVER_TRUE@mount_ceph_SOURCES = mount/mount.ceph.c
 @ENABLE_SERVER_TRUE@mount_ceph_LDADD = $(LIBSECRET) $(LIBCOMMON)
 @ENABLE_SERVER_TRUE@@WITH_MON_TRUE@ceph_mon_SOURCES = ceph_mon.cc
-@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@ceph_mon_LDADD = $(LIBMON) $(LIBOS) $(CEPH_GLOBAL) $(LIBCOMMON)
+@ENABLE_SERVER_TRUE@@WITH_MON_TRUE@ceph_mon_LDADD = $(LIBMON) $(LIBOS) $(CEPH_GLOBAL) $(LIBCOMMON) $(LIBAUTH) $(LIBCOMMON) $(LIBMON_TYPES)
 @ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@ceph_osd_SOURCES = ceph_osd.cc
-@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@ceph_osd_LDADD = $(LIBOSD) $(CEPH_GLOBAL) $(LIBCOMMON)
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@ceph_osd_LDADD = $(LIBOSDC) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBOSD) $(LIBOSD_TYPES) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(LIBOS_TYPES) $(LIBOS) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(CEPH_GLOBAL) $(LIBCOMMON) \
+@ENABLE_SERVER_TRUE@@WITH_OSD_TRUE@	$(am__append_255)
 @ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@ceph_mds_SOURCES = ceph_mds.cc
 @ENABLE_SERVER_TRUE@@WITH_MDS_TRUE@ceph_mds_LDADD = $(LIBMDS) $(LIBOSDC) $(CEPH_GLOBAL) $(LIBCOMMON)
 @ENABLE_COVERAGE_TRUE@@ENABLE_SERVER_TRUE@COV_DIR = $(DESTDIR)$(libdir)/ceph/coverage
@@ -10221,7 +10712,7 @@ all: $(BUILT_SOURCES) acconfig.h
 
 .SUFFIXES:
 .SUFFIXES: .S .c .cc .cpp .lo .log .o .obj .s .test .test$(EXEEXT) .trs
-$(srcdir)/Makefile.in:  $(srcdir)/Makefile.am $(srcdir)/Makefile-env.am $(srcdir)/arch/Makefile.am $(srcdir)/auth/Makefile.am $(srcdir)/brag/Makefile.am $(srcdir)/ceph-detect-init/Makefile.am $(srcdir)/crush/Makefile.am $(srcdir)/mon/Makefile.am $(srcdir)/mds/Makefile.am $(srcdir)/mds/Makefile-client.am $(srcdir)/mds/Makefile-server.am $(srcdir)/os/Makefile.am $(srcdir)/osd/Makefile.am $(srcdir)/erasure-code/Makefile.am $(srcdir)/erasure-code/jerasure/Makefile.am $(srcdir)/erasure-code/l [...]
+$(srcdir)/Makefile.in:  $(srcdir)/Makefile.am $(srcdir)/Makefile-env.am $(srcdir)/arch/Makefile.am $(srcdir)/auth/Makefile.am $(srcdir)/brag/Makefile.am $(srcdir)/ceph-detect-init/Makefile.am $(srcdir)/crush/Makefile.am $(srcdir)/kv/Makefile.am $(srcdir)/mon/Makefile.am $(srcdir)/mds/Makefile.am $(srcdir)/mds/Makefile-client.am $(srcdir)/mds/Makefile-server.am $(srcdir)/os/Makefile.am $(srcdir)/osd/Makefile.am $(srcdir)/erasure-code/Makefile.am $(srcdir)/erasure-code/jerasure/Makefile.am [...]
 	@for dep in $?; do \
 	  case '$(am__configure_deps)' in \
 	    *$$dep*) \
@@ -10242,7 +10733,7 @@ Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
 	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
 	    cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
 	esac;
-$(srcdir)/Makefile-env.am $(srcdir)/arch/Makefile.am $(srcdir)/auth/Makefile.am $(srcdir)/brag/Makefile.am $(srcdir)/ceph-detect-init/Makefile.am $(srcdir)/crush/Makefile.am $(srcdir)/mon/Makefile.am $(srcdir)/mds/Makefile.am $(srcdir)/mds/Makefile-client.am $(srcdir)/mds/Makefile-server.am $(srcdir)/os/Makefile.am $(srcdir)/osd/Makefile.am $(srcdir)/erasure-code/Makefile.am $(srcdir)/erasure-code/jerasure/Makefile.am $(srcdir)/erasure-code/lrc/Makefile.am $(srcdir)/erasure-code/shec/Mak [...]
+$(srcdir)/Makefile-env.am $(srcdir)/arch/Makefile.am $(srcdir)/auth/Makefile.am $(srcdir)/brag/Makefile.am $(srcdir)/ceph-detect-init/Makefile.am $(srcdir)/crush/Makefile.am $(srcdir)/kv/Makefile.am $(srcdir)/mon/Makefile.am $(srcdir)/mds/Makefile.am $(srcdir)/mds/Makefile-client.am $(srcdir)/mds/Makefile-server.am $(srcdir)/os/Makefile.am $(srcdir)/osd/Makefile.am $(srcdir)/erasure-code/Makefile.am $(srcdir)/erasure-code/jerasure/Makefile.am $(srcdir)/erasure-code/lrc/Makefile.am $(srcd [...]
 
 $(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
 	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
@@ -10363,12 +10854,132 @@ libcls_version_client.a: $(libcls_version_client_a_OBJECTS) $(libcls_version_cli
 	$(AM_V_at)-rm -f libcls_version_client.a
 	$(AM_V_AR)$(libcls_version_client_a_AR) libcls_version_client.a $(libcls_version_client_a_OBJECTS) $(libcls_version_client_a_LIBADD)
 	$(AM_V_at)$(RANLIB) libcls_version_client.a
+kv/$(am__dirstamp):
+	@$(MKDIR_P) kv
+	@: > kv/$(am__dirstamp)
+kv/$(DEPDIR)/$(am__dirstamp):
+	@$(MKDIR_P) kv/$(DEPDIR)
+	@: > kv/$(DEPDIR)/$(am__dirstamp)
+kv/libkv_a-KeyValueDB.$(OBJEXT): kv/$(am__dirstamp) \
+	kv/$(DEPDIR)/$(am__dirstamp)
+kv/libkv_a-LevelDBStore.$(OBJEXT): kv/$(am__dirstamp) \
+	kv/$(DEPDIR)/$(am__dirstamp)
+kv/libkv_a-RocksDBStore.$(OBJEXT): kv/$(am__dirstamp) \
+	kv/$(DEPDIR)/$(am__dirstamp)
+kv/libkv_a-KineticStore.$(OBJEXT): kv/$(am__dirstamp) \
+	kv/$(DEPDIR)/$(am__dirstamp)
+
+libkv.a: $(libkv_a_OBJECTS) $(libkv_a_DEPENDENCIES) $(EXTRA_libkv_a_DEPENDENCIES) 
+	$(AM_V_at)-rm -f libkv.a
+	$(AM_V_AR)$(libkv_a_AR) libkv.a $(libkv_a_OBJECTS) $(libkv_a_LIBADD)
+	$(AM_V_at)$(RANLIB) libkv.a
+mon/$(am__dirstamp):
+	@$(MKDIR_P) mon
+	@: > mon/$(am__dirstamp)
+mon/$(DEPDIR)/$(am__dirstamp):
+	@$(MKDIR_P) mon/$(DEPDIR)
+	@: > mon/$(DEPDIR)/$(am__dirstamp)
+mon/Monitor.$(OBJEXT): mon/$(am__dirstamp) \
+	mon/$(DEPDIR)/$(am__dirstamp)
+mon/Paxos.$(OBJEXT): mon/$(am__dirstamp) mon/$(DEPDIR)/$(am__dirstamp)
+mon/PaxosService.$(OBJEXT): mon/$(am__dirstamp) \
+	mon/$(DEPDIR)/$(am__dirstamp)
+mon/OSDMonitor.$(OBJEXT): mon/$(am__dirstamp) \
+	mon/$(DEPDIR)/$(am__dirstamp)
+mon/MDSMonitor.$(OBJEXT): mon/$(am__dirstamp) \
+	mon/$(DEPDIR)/$(am__dirstamp)
+mon/MonmapMonitor.$(OBJEXT): mon/$(am__dirstamp) \
+	mon/$(DEPDIR)/$(am__dirstamp)
+mon/PGMonitor.$(OBJEXT): mon/$(am__dirstamp) \
+	mon/$(DEPDIR)/$(am__dirstamp)
+mon/LogMonitor.$(OBJEXT): mon/$(am__dirstamp) \
+	mon/$(DEPDIR)/$(am__dirstamp)
+mon/AuthMonitor.$(OBJEXT): mon/$(am__dirstamp) \
+	mon/$(DEPDIR)/$(am__dirstamp)
+mon/Elector.$(OBJEXT): mon/$(am__dirstamp) \
+	mon/$(DEPDIR)/$(am__dirstamp)
+mon/HealthMonitor.$(OBJEXT): mon/$(am__dirstamp) \
+	mon/$(DEPDIR)/$(am__dirstamp)
+mon/DataHealthService.$(OBJEXT): mon/$(am__dirstamp) \
+	mon/$(DEPDIR)/$(am__dirstamp)
+mon/ConfigKeyService.$(OBJEXT): mon/$(am__dirstamp) \
+	mon/$(DEPDIR)/$(am__dirstamp)
+
+libmon.a: $(libmon_a_OBJECTS) $(libmon_a_DEPENDENCIES) $(EXTRA_libmon_a_DEPENDENCIES) 
+	$(AM_V_at)-rm -f libmon.a
+	$(AM_V_AR)$(libmon_a_AR) libmon.a $(libmon_a_OBJECTS) $(libmon_a_LIBADD)
+	$(AM_V_at)$(RANLIB) libmon.a
 os/$(am__dirstamp):
 	@$(MKDIR_P) os
 	@: > os/$(am__dirstamp)
 os/$(DEPDIR)/$(am__dirstamp):
 	@$(MKDIR_P) os/$(DEPDIR)
 	@: > os/$(DEPDIR)/$(am__dirstamp)
+os/chain_xattr.$(OBJEXT): os/$(am__dirstamp) \
+	os/$(DEPDIR)/$(am__dirstamp)
+os/fs/$(am__dirstamp):
+	@$(MKDIR_P) os/fs
+	@: > os/fs/$(am__dirstamp)
+os/fs/$(DEPDIR)/$(am__dirstamp):
+	@$(MKDIR_P) os/fs/$(DEPDIR)
+	@: > os/fs/$(DEPDIR)/$(am__dirstamp)
+os/fs/FS.$(OBJEXT): os/fs/$(am__dirstamp) \
+	os/fs/$(DEPDIR)/$(am__dirstamp)
+os/DBObjectMap.$(OBJEXT): os/$(am__dirstamp) \
+	os/$(DEPDIR)/$(am__dirstamp)
+os/GenericObjectMap.$(OBJEXT): os/$(am__dirstamp) \
+	os/$(DEPDIR)/$(am__dirstamp)
+os/FileJournal.$(OBJEXT): os/$(am__dirstamp) \
+	os/$(DEPDIR)/$(am__dirstamp)
+os/FileStore.$(OBJEXT): os/$(am__dirstamp) \
+	os/$(DEPDIR)/$(am__dirstamp)
+os/GenericFileStoreBackend.$(OBJEXT): os/$(am__dirstamp) \
+	os/$(DEPDIR)/$(am__dirstamp)
+os/HashIndex.$(OBJEXT): os/$(am__dirstamp) \
+	os/$(DEPDIR)/$(am__dirstamp)
+os/IndexManager.$(OBJEXT): os/$(am__dirstamp) \
+	os/$(DEPDIR)/$(am__dirstamp)
+os/JournalingObjectStore.$(OBJEXT): os/$(am__dirstamp) \
+	os/$(DEPDIR)/$(am__dirstamp)
+os/LFNIndex.$(OBJEXT): os/$(am__dirstamp) os/$(DEPDIR)/$(am__dirstamp)
+os/MemStore.$(OBJEXT): os/$(am__dirstamp) os/$(DEPDIR)/$(am__dirstamp)
+os/KeyValueStore.$(OBJEXT): os/$(am__dirstamp) \
+	os/$(DEPDIR)/$(am__dirstamp)
+os/ObjectStore.$(OBJEXT): os/$(am__dirstamp) \
+	os/$(DEPDIR)/$(am__dirstamp)
+os/WBThrottle.$(OBJEXT): os/$(am__dirstamp) \
+	os/$(DEPDIR)/$(am__dirstamp)
+os/BtrfsFileStoreBackend.$(OBJEXT): os/$(am__dirstamp) \
+	os/$(DEPDIR)/$(am__dirstamp)
+os/newstore/$(am__dirstamp):
+	@$(MKDIR_P) os/newstore
+	@: > os/newstore/$(am__dirstamp)
+os/newstore/$(DEPDIR)/$(am__dirstamp):
+	@$(MKDIR_P) os/newstore/$(DEPDIR)
+	@: > os/newstore/$(DEPDIR)/$(am__dirstamp)
+os/newstore/NewStore.$(OBJEXT): os/newstore/$(am__dirstamp) \
+	os/newstore/$(DEPDIR)/$(am__dirstamp)
+os/fs/XFS.$(OBJEXT): os/fs/$(am__dirstamp) \
+	os/fs/$(DEPDIR)/$(am__dirstamp)
+os/XfsFileStoreBackend.$(OBJEXT): os/$(am__dirstamp) \
+	os/$(DEPDIR)/$(am__dirstamp)
+os/ZFSFileStoreBackend.$(OBJEXT): os/$(am__dirstamp) \
+	os/$(DEPDIR)/$(am__dirstamp)
+
+libos.a: $(libos_a_OBJECTS) $(libos_a_DEPENDENCIES) $(EXTRA_libos_a_DEPENDENCIES) 
+	$(AM_V_at)-rm -f libos.a
+	$(AM_V_AR)$(libos_a_AR) libos.a $(libos_a_OBJECTS) $(libos_a_LIBADD)
+	$(AM_V_at)$(RANLIB) libos.a
+os/libos_types_a-Transaction.$(OBJEXT): os/$(am__dirstamp) \
+	os/$(DEPDIR)/$(am__dirstamp)
+os/newstore/libos_types_a-newstore_types.$(OBJEXT):  \
+	os/newstore/$(am__dirstamp) \
+	os/newstore/$(DEPDIR)/$(am__dirstamp)
+
+libos_types.a: $(libos_types_a_OBJECTS) $(libos_types_a_DEPENDENCIES) $(EXTRA_libos_types_a_DEPENDENCIES) 
+	$(AM_V_at)-rm -f libos_types.a
+	$(AM_V_AR)$(libos_types_a_AR) libos_types.a $(libos_types_a_OBJECTS) $(libos_types_a_LIBADD)
+	$(AM_V_at)$(RANLIB) libos_types.a
 os/libos_zfs_a-ZFS.$(OBJEXT): os/$(am__dirstamp) \
 	os/$(DEPDIR)/$(am__dirstamp)
 
@@ -10376,6 +10987,53 @@ libos_zfs.a: $(libos_zfs_a_OBJECTS) $(libos_zfs_a_DEPENDENCIES) $(EXTRA_libos_zf
 	$(AM_V_at)-rm -f libos_zfs.a
 	$(AM_V_AR)$(libos_zfs_a_AR) libos_zfs.a $(libos_zfs_a_OBJECTS) $(libos_zfs_a_LIBADD)
 	$(AM_V_at)$(RANLIB) libos_zfs.a
+osd/$(am__dirstamp):
+	@$(MKDIR_P) osd
+	@: > osd/$(am__dirstamp)
+osd/$(DEPDIR)/$(am__dirstamp):
+	@$(MKDIR_P) osd/$(DEPDIR)
+	@: > osd/$(DEPDIR)/$(am__dirstamp)
+osd/libosd_a-PG.$(OBJEXT): osd/$(am__dirstamp) \
+	osd/$(DEPDIR)/$(am__dirstamp)
+osd/libosd_a-ReplicatedPG.$(OBJEXT): osd/$(am__dirstamp) \
+	osd/$(DEPDIR)/$(am__dirstamp)
+osd/libosd_a-ReplicatedBackend.$(OBJEXT): osd/$(am__dirstamp) \
+	osd/$(DEPDIR)/$(am__dirstamp)
+osd/libosd_a-ECBackend.$(OBJEXT): osd/$(am__dirstamp) \
+	osd/$(DEPDIR)/$(am__dirstamp)
+osd/libosd_a-ECMsgTypes.$(OBJEXT): osd/$(am__dirstamp) \
+	osd/$(DEPDIR)/$(am__dirstamp)
+osd/libosd_a-ECTransaction.$(OBJEXT): osd/$(am__dirstamp) \
+	osd/$(DEPDIR)/$(am__dirstamp)
+osd/libosd_a-PGBackend.$(OBJEXT): osd/$(am__dirstamp) \
+	osd/$(DEPDIR)/$(am__dirstamp)
+osd/libosd_a-HitSet.$(OBJEXT): osd/$(am__dirstamp) \
+	osd/$(DEPDIR)/$(am__dirstamp)
+osd/libosd_a-OSD.$(OBJEXT): osd/$(am__dirstamp) \
+	osd/$(DEPDIR)/$(am__dirstamp)
+osd/libosd_a-OSDCap.$(OBJEXT): osd/$(am__dirstamp) \
+	osd/$(DEPDIR)/$(am__dirstamp)
+osd/libosd_a-Watch.$(OBJEXT): osd/$(am__dirstamp) \
+	osd/$(DEPDIR)/$(am__dirstamp)
+osd/libosd_a-ClassHandler.$(OBJEXT): osd/$(am__dirstamp) \
+	osd/$(DEPDIR)/$(am__dirstamp)
+osd/libosd_a-OpRequest.$(OBJEXT): osd/$(am__dirstamp) \
+	osd/$(DEPDIR)/$(am__dirstamp)
+osd/libosd_a-SnapMapper.$(OBJEXT): osd/$(am__dirstamp) \
+	osd/$(DEPDIR)/$(am__dirstamp)
+objclass/$(am__dirstamp):
+	@$(MKDIR_P) objclass
+	@: > objclass/$(am__dirstamp)
+objclass/$(DEPDIR)/$(am__dirstamp):
+	@$(MKDIR_P) objclass/$(DEPDIR)
+	@: > objclass/$(DEPDIR)/$(am__dirstamp)
+objclass/libosd_a-class_api.$(OBJEXT): objclass/$(am__dirstamp) \
+	objclass/$(DEPDIR)/$(am__dirstamp)
+
+libosd.a: $(libosd_a_OBJECTS) $(libosd_a_DEPENDENCIES) $(EXTRA_libosd_a_DEPENDENCIES) 
+	$(AM_V_at)-rm -f libosd.a
+	$(AM_V_AR)$(libosd_a_AR) libosd.a $(libosd_a_OBJECTS) $(libosd_a_LIBADD)
+	$(AM_V_at)$(RANLIB) libosd.a
 
 install-erasure_codelibLTLIBRARIES: $(erasure_codelib_LTLIBRARIES)
 	@$(NORMAL_INSTALL)
@@ -10656,6 +11314,24 @@ cls/hello/cls_hello.lo: cls/hello/$(am__dirstamp) \
 
 libcls_hello.la: $(libcls_hello_la_OBJECTS) $(libcls_hello_la_DEPENDENCIES) $(EXTRA_libcls_hello_la_DEPENDENCIES) 
 	$(AM_V_CXXLD)$(libcls_hello_la_LINK) $(am_libcls_hello_la_rpath) $(libcls_hello_la_OBJECTS) $(libcls_hello_la_LIBADD) $(LIBS)
+cls/journal/$(am__dirstamp):
+	@$(MKDIR_P) cls/journal
+	@: > cls/journal/$(am__dirstamp)
+cls/journal/$(DEPDIR)/$(am__dirstamp):
+	@$(MKDIR_P) cls/journal/$(DEPDIR)
+	@: > cls/journal/$(DEPDIR)/$(am__dirstamp)
+cls/journal/cls_journal.lo: cls/journal/$(am__dirstamp) \
+	cls/journal/$(DEPDIR)/$(am__dirstamp)
+cls/journal/cls_journal_types.lo: cls/journal/$(am__dirstamp) \
+	cls/journal/$(DEPDIR)/$(am__dirstamp)
+
+libcls_journal.la: $(libcls_journal_la_OBJECTS) $(libcls_journal_la_DEPENDENCIES) $(EXTRA_libcls_journal_la_DEPENDENCIES) 
+	$(AM_V_CXXLD)$(libcls_journal_la_LINK) $(am_libcls_journal_la_rpath) $(libcls_journal_la_OBJECTS) $(libcls_journal_la_LIBADD) $(LIBS)
+cls/journal/cls_journal_client.lo: cls/journal/$(am__dirstamp) \
+	cls/journal/$(DEPDIR)/$(am__dirstamp)
+
+libcls_journal_client.la: $(libcls_journal_client_la_OBJECTS) $(libcls_journal_client_la_DEPENDENCIES) $(EXTRA_libcls_journal_client_la_DEPENDENCIES) 
+	$(AM_V_CXXLD)$(CXXLINK) $(am_libcls_journal_client_la_rpath) $(libcls_journal_client_la_OBJECTS) $(libcls_journal_client_la_LIBADD) $(LIBS)
 key_value_store/$(am__dirstamp):
 	@$(MKDIR_P) key_value_store
 	@: > key_value_store/$(am__dirstamp)
@@ -10871,8 +11547,6 @@ common/MemoryModel.lo: common/$(am__dirstamp) \
 common/armor.lo: common/$(am__dirstamp) \
 	common/$(DEPDIR)/$(am__dirstamp)
 common/fd.lo: common/$(am__dirstamp) common/$(DEPDIR)/$(am__dirstamp)
-common/xattr.lo: common/$(am__dirstamp) \
-	common/$(DEPDIR)/$(am__dirstamp)
 common/safe_io.lo: common/$(am__dirstamp) \
 	common/$(DEPDIR)/$(am__dirstamp)
 common/snap_types.lo: common/$(am__dirstamp) \
@@ -10932,12 +11606,6 @@ common/ceph_crypto.lo: common/$(am__dirstamp) \
 	common/$(DEPDIR)/$(am__dirstamp)
 common/ceph_crypto_cms.lo: common/$(am__dirstamp) \
 	common/$(DEPDIR)/$(am__dirstamp)
-common/ipaddr.lo: common/$(am__dirstamp) \
-	common/$(DEPDIR)/$(am__dirstamp)
-common/pick_address.lo: common/$(am__dirstamp) \
-	common/$(DEPDIR)/$(am__dirstamp)
-common/util.lo: common/$(am__dirstamp) \
-	common/$(DEPDIR)/$(am__dirstamp)
 common/TextTable.lo: common/$(am__dirstamp) \
 	common/$(DEPDIR)/$(am__dirstamp)
 common/ceph_fs.lo: common/$(am__dirstamp) \
@@ -10954,8 +11622,6 @@ common/hobject.lo: common/$(am__dirstamp) \
 	common/$(DEPDIR)/$(am__dirstamp)
 common/bloom_filter.lo: common/$(am__dirstamp) \
 	common/$(DEPDIR)/$(am__dirstamp)
-common/linux_version.lo: common/$(am__dirstamp) \
-	common/$(DEPDIR)/$(am__dirstamp)
 common/module.lo: common/$(am__dirstamp) \
 	common/$(DEPDIR)/$(am__dirstamp)
 common/Readahead.lo: common/$(am__dirstamp) \
@@ -10966,25 +11632,25 @@ common/ContextCompletion.lo: common/$(am__dirstamp) \
 	common/$(DEPDIR)/$(am__dirstamp)
 common/TracepointProvider.lo: common/$(am__dirstamp) \
 	common/$(DEPDIR)/$(am__dirstamp)
+common/xattr.lo: common/$(am__dirstamp) \
+	common/$(DEPDIR)/$(am__dirstamp)
+common/ipaddr.lo: common/$(am__dirstamp) \
+	common/$(DEPDIR)/$(am__dirstamp)
+common/util.lo: common/$(am__dirstamp) \
+	common/$(DEPDIR)/$(am__dirstamp)
+common/pick_address.lo: common/$(am__dirstamp) \
+	common/$(DEPDIR)/$(am__dirstamp)
+common/linux_version.lo: common/$(am__dirstamp) \
+	common/$(DEPDIR)/$(am__dirstamp)
+common/solaris_errno.lo: common/$(am__dirstamp) \
+	common/$(DEPDIR)/$(am__dirstamp)
 common/blkdev.lo: common/$(am__dirstamp) \
 	common/$(DEPDIR)/$(am__dirstamp)
 common/address_helper.lo: common/$(am__dirstamp) \
 	common/$(DEPDIR)/$(am__dirstamp)
-mon/$(am__dirstamp):
-	@$(MKDIR_P) mon
-	@: > mon/$(am__dirstamp)
-mon/$(DEPDIR)/$(am__dirstamp):
-	@$(MKDIR_P) mon/$(DEPDIR)
-	@: > mon/$(DEPDIR)/$(am__dirstamp)
 mon/MonCap.lo: mon/$(am__dirstamp) mon/$(DEPDIR)/$(am__dirstamp)
 mon/MonClient.lo: mon/$(am__dirstamp) mon/$(DEPDIR)/$(am__dirstamp)
 mon/MonMap.lo: mon/$(am__dirstamp) mon/$(DEPDIR)/$(am__dirstamp)
-osd/$(am__dirstamp):
-	@$(MKDIR_P) osd
-	@: > osd/$(am__dirstamp)
-osd/$(DEPDIR)/$(am__dirstamp):
-	@$(MKDIR_P) osd/$(DEPDIR)
-	@: > osd/$(DEPDIR)/$(am__dirstamp)
 osd/OSDMap.lo: osd/$(am__dirstamp) osd/$(DEPDIR)/$(am__dirstamp)
 osd/osd_types.lo: osd/$(am__dirstamp) osd/$(DEPDIR)/$(am__dirstamp)
 osd/ECMsgTypes.lo: osd/$(am__dirstamp) osd/$(DEPDIR)/$(am__dirstamp)
@@ -11903,9 +12569,44 @@ global/pidfile.lo: global/$(am__dirstamp) \
 	global/$(DEPDIR)/$(am__dirstamp)
 global/signal_handler.lo: global/$(am__dirstamp) \
 	global/$(DEPDIR)/$(am__dirstamp)
+common/TrackedOp.lo: common/$(am__dirstamp) \
+	common/$(DEPDIR)/$(am__dirstamp)
 
 libglobal.la: $(libglobal_la_OBJECTS) $(libglobal_la_DEPENDENCIES) $(EXTRA_libglobal_la_DEPENDENCIES) 
 	$(AM_V_CXXLD)$(CXXLINK)  $(libglobal_la_OBJECTS) $(libglobal_la_LIBADD) $(LIBS)
+journal/$(am__dirstamp):
+	@$(MKDIR_P) journal
+	@: > journal/$(am__dirstamp)
+journal/$(DEPDIR)/$(am__dirstamp):
+	@$(MKDIR_P) journal/$(DEPDIR)
+	@: > journal/$(DEPDIR)/$(am__dirstamp)
+journal/AsyncOpTracker.lo: journal/$(am__dirstamp) \
+	journal/$(DEPDIR)/$(am__dirstamp)
+journal/Entry.lo: journal/$(am__dirstamp) \
+	journal/$(DEPDIR)/$(am__dirstamp)
+journal/Future.lo: journal/$(am__dirstamp) \
+	journal/$(DEPDIR)/$(am__dirstamp)
+journal/FutureImpl.lo: journal/$(am__dirstamp) \
+	journal/$(DEPDIR)/$(am__dirstamp)
+journal/Journaler.lo: journal/$(am__dirstamp) \
+	journal/$(DEPDIR)/$(am__dirstamp)
+journal/JournalMetadata.lo: journal/$(am__dirstamp) \
+	journal/$(DEPDIR)/$(am__dirstamp)
+journal/JournalPlayer.lo: journal/$(am__dirstamp) \
+	journal/$(DEPDIR)/$(am__dirstamp)
+journal/JournalRecorder.lo: journal/$(am__dirstamp) \
+	journal/$(DEPDIR)/$(am__dirstamp)
+journal/JournalTrimmer.lo: journal/$(am__dirstamp) \
+	journal/$(DEPDIR)/$(am__dirstamp)
+journal/ObjectPlayer.lo: journal/$(am__dirstamp) \
+	journal/$(DEPDIR)/$(am__dirstamp)
+journal/ObjectRecorder.lo: journal/$(am__dirstamp) \
+	journal/$(DEPDIR)/$(am__dirstamp)
+journal/Utils.lo: journal/$(am__dirstamp) \
+	journal/$(DEPDIR)/$(am__dirstamp)
+
+libjournal.la: $(libjournal_la_OBJECTS) $(libjournal_la_DEPENDENCIES) $(EXTRA_libjournal_la_DEPENDENCIES) 
+	$(AM_V_CXXLD)$(CXXLINK) $(am_libjournal_la_rpath) $(libjournal_la_OBJECTS) $(libjournal_la_LIBADD) $(LIBS)
 json_spirit/$(am__dirstamp):
 	@$(MKDIR_P) json_spirit
 	@: > json_spirit/$(am__dirstamp)
@@ -11961,6 +12662,7 @@ mds/MDSTableClient.lo: mds/$(am__dirstamp) \
 mds/MDSTableServer.lo: mds/$(am__dirstamp) \
 	mds/$(DEPDIR)/$(am__dirstamp)
 mds/SimpleLock.lo: mds/$(am__dirstamp) mds/$(DEPDIR)/$(am__dirstamp)
+mds/ScrubStack.lo: mds/$(am__dirstamp) mds/$(DEPDIR)/$(am__dirstamp)
 mds/SnapRealm.lo: mds/$(am__dirstamp) mds/$(DEPDIR)/$(am__dirstamp)
 mds/SnapServer.lo: mds/$(am__dirstamp) mds/$(DEPDIR)/$(am__dirstamp)
 mds/snap.lo: mds/$(am__dirstamp) mds/$(DEPDIR)/$(am__dirstamp)
@@ -11968,31 +12670,9 @@ mds/SessionMap.lo: mds/$(am__dirstamp) mds/$(DEPDIR)/$(am__dirstamp)
 mds/MDSContext.lo: mds/$(am__dirstamp) mds/$(DEPDIR)/$(am__dirstamp)
 mds/MDSAuthCaps.lo: mds/$(am__dirstamp) mds/$(DEPDIR)/$(am__dirstamp)
 mds/MDLog.lo: mds/$(am__dirstamp) mds/$(DEPDIR)/$(am__dirstamp)
-common/TrackedOp.lo: common/$(am__dirstamp) \
-	common/$(DEPDIR)/$(am__dirstamp)
 
 libmds.la: $(libmds_la_OBJECTS) $(libmds_la_DEPENDENCIES) $(EXTRA_libmds_la_DEPENDENCIES) 
 	$(AM_V_CXXLD)$(CXXLINK) $(am_libmds_la_rpath) $(libmds_la_OBJECTS) $(libmds_la_LIBADD) $(LIBS)
-mon/Monitor.lo: mon/$(am__dirstamp) mon/$(DEPDIR)/$(am__dirstamp)
-mon/Paxos.lo: mon/$(am__dirstamp) mon/$(DEPDIR)/$(am__dirstamp)
-mon/PaxosService.lo: mon/$(am__dirstamp) mon/$(DEPDIR)/$(am__dirstamp)
-mon/OSDMonitor.lo: mon/$(am__dirstamp) mon/$(DEPDIR)/$(am__dirstamp)
-mon/MDSMonitor.lo: mon/$(am__dirstamp) mon/$(DEPDIR)/$(am__dirstamp)
-mon/MonmapMonitor.lo: mon/$(am__dirstamp) \
-	mon/$(DEPDIR)/$(am__dirstamp)
-mon/PGMonitor.lo: mon/$(am__dirstamp) mon/$(DEPDIR)/$(am__dirstamp)
-mon/LogMonitor.lo: mon/$(am__dirstamp) mon/$(DEPDIR)/$(am__dirstamp)
-mon/AuthMonitor.lo: mon/$(am__dirstamp) mon/$(DEPDIR)/$(am__dirstamp)
-mon/Elector.lo: mon/$(am__dirstamp) mon/$(DEPDIR)/$(am__dirstamp)
-mon/HealthMonitor.lo: mon/$(am__dirstamp) \
-	mon/$(DEPDIR)/$(am__dirstamp)
-mon/DataHealthService.lo: mon/$(am__dirstamp) \
-	mon/$(DEPDIR)/$(am__dirstamp)
-mon/ConfigKeyService.lo: mon/$(am__dirstamp) \
-	mon/$(DEPDIR)/$(am__dirstamp)
-
-libmon.la: $(libmon_la_OBJECTS) $(libmon_la_DEPENDENCIES) $(EXTRA_libmon_la_DEPENDENCIES) 
-	$(AM_V_CXXLD)$(CXXLINK) $(am_libmon_la_rpath) $(libmon_la_OBJECTS) $(libmon_la_LIBADD) $(LIBS)
 mon/PGMap.lo: mon/$(am__dirstamp) mon/$(DEPDIR)/$(am__dirstamp)
 
 libmon_types.la: $(libmon_types_la_OBJECTS) $(libmon_types_la_DEPENDENCIES) $(EXTRA_libmon_types_la_DEPENDENCIES) 
@@ -12063,74 +12743,6 @@ msg/xio/XioPool.lo: msg/xio/$(am__dirstamp) \
 
 libmsg.la: $(libmsg_la_OBJECTS) $(libmsg_la_DEPENDENCIES) $(EXTRA_libmsg_la_DEPENDENCIES) 
 	$(AM_V_CXXLD)$(CXXLINK)  $(libmsg_la_OBJECTS) $(libmsg_la_LIBADD) $(LIBS)
-os/libos_la-chain_xattr.lo: os/$(am__dirstamp) \
-	os/$(DEPDIR)/$(am__dirstamp)
-os/fs/$(am__dirstamp):
-	@$(MKDIR_P) os/fs
-	@: > os/fs/$(am__dirstamp)
-os/fs/$(DEPDIR)/$(am__dirstamp):
-	@$(MKDIR_P) os/fs/$(DEPDIR)
-	@: > os/fs/$(DEPDIR)/$(am__dirstamp)
-os/fs/libos_la-FS.lo: os/fs/$(am__dirstamp) \
-	os/fs/$(DEPDIR)/$(am__dirstamp)
-os/libos_la-DBObjectMap.lo: os/$(am__dirstamp) \
-	os/$(DEPDIR)/$(am__dirstamp)
-os/libos_la-GenericObjectMap.lo: os/$(am__dirstamp) \
-	os/$(DEPDIR)/$(am__dirstamp)
-os/libos_la-FileJournal.lo: os/$(am__dirstamp) \
-	os/$(DEPDIR)/$(am__dirstamp)
-os/libos_la-FileStore.lo: os/$(am__dirstamp) \
-	os/$(DEPDIR)/$(am__dirstamp)
-os/libos_la-GenericFileStoreBackend.lo: os/$(am__dirstamp) \
-	os/$(DEPDIR)/$(am__dirstamp)
-os/libos_la-HashIndex.lo: os/$(am__dirstamp) \
-	os/$(DEPDIR)/$(am__dirstamp)
-os/libos_la-IndexManager.lo: os/$(am__dirstamp) \
-	os/$(DEPDIR)/$(am__dirstamp)
-os/libos_la-JournalingObjectStore.lo: os/$(am__dirstamp) \
-	os/$(DEPDIR)/$(am__dirstamp)
-os/libos_la-LevelDBStore.lo: os/$(am__dirstamp) \
-	os/$(DEPDIR)/$(am__dirstamp)
-os/libos_la-LFNIndex.lo: os/$(am__dirstamp) \
-	os/$(DEPDIR)/$(am__dirstamp)
-os/libos_la-MemStore.lo: os/$(am__dirstamp) \
-	os/$(DEPDIR)/$(am__dirstamp)
-os/libos_la-KeyValueDB.lo: os/$(am__dirstamp) \
-	os/$(DEPDIR)/$(am__dirstamp)
-os/libos_la-KeyValueStore.lo: os/$(am__dirstamp) \
-	os/$(DEPDIR)/$(am__dirstamp)
-os/libos_la-ObjectStore.lo: os/$(am__dirstamp) \
-	os/$(DEPDIR)/$(am__dirstamp)
-os/libos_la-WBThrottle.lo: os/$(am__dirstamp) \
-	os/$(DEPDIR)/$(am__dirstamp)
-common/libos_la-TrackedOp.lo: common/$(am__dirstamp) \
-	common/$(DEPDIR)/$(am__dirstamp)
-os/libos_la-BtrfsFileStoreBackend.lo: os/$(am__dirstamp) \
-	os/$(DEPDIR)/$(am__dirstamp)
-os/newstore/$(am__dirstamp):
-	@$(MKDIR_P) os/newstore
-	@: > os/newstore/$(am__dirstamp)
-os/newstore/$(DEPDIR)/$(am__dirstamp):
-	@$(MKDIR_P) os/newstore/$(DEPDIR)
-	@: > os/newstore/$(DEPDIR)/$(am__dirstamp)
-os/newstore/libos_la-NewStore.lo: os/newstore/$(am__dirstamp) \
-	os/newstore/$(DEPDIR)/$(am__dirstamp)
-os/fs/libos_la-XFS.lo: os/fs/$(am__dirstamp) \
-	os/fs/$(DEPDIR)/$(am__dirstamp)
-os/libos_la-XfsFileStoreBackend.lo: os/$(am__dirstamp) \
-	os/$(DEPDIR)/$(am__dirstamp)
-os/libos_la-ZFSFileStoreBackend.lo: os/$(am__dirstamp) \
-	os/$(DEPDIR)/$(am__dirstamp)
-os/libos_la-KineticStore.lo: os/$(am__dirstamp) \
-	os/$(DEPDIR)/$(am__dirstamp)
-
-libos.la: $(libos_la_OBJECTS) $(libos_la_DEPENDENCIES) $(EXTRA_libos_la_DEPENDENCIES) 
-	$(AM_V_CXXLD)$(libos_la_LINK) $(am_libos_la_rpath) $(libos_la_OBJECTS) $(libos_la_LIBADD) $(LIBS)
-os/libos_rocksdb_la-RocksDBStore.lo: os/$(am__dirstamp) \
-	os/$(DEPDIR)/$(am__dirstamp)
-
-libos_rocksdb.la: $(libos_rocksdb_la_OBJECTS) $(libos_rocksdb_la_DEPENDENCIES) $(EXTRA_libos_rocksdb_la_DEPENDENCIES) 
-	$(AM_V_CXXLD)$(libos_rocksdb_la_LINK) $(am_libos_rocksdb_la_rpath) $(libos_rocksdb_la_OBJECTS) $(libos_rocksdb_la_LIBADD) $(LIBS)
 tracing/$(am__dirstamp):
 	@$(MKDIR_P) tracing
 	@: > tracing/$(am__dirstamp)
@@ -12142,54 +12754,6 @@ tracing/libos_tp_la-objectstore.lo: tracing/$(am__dirstamp) \
 
 libos_tp.la: $(libos_tp_la_OBJECTS) $(libos_tp_la_DEPENDENCIES) $(EXTRA_libos_tp_la_DEPENDENCIES) 
 	$(AM_V_CCLD)$(libos_tp_la_LINK) $(am_libos_tp_la_rpath) $(libos_tp_la_OBJECTS) $(libos_tp_la_LIBADD) $(LIBS)
-os/libos_types_la-Transaction.lo: os/$(am__dirstamp) \
-	os/$(DEPDIR)/$(am__dirstamp)
-os/newstore/libos_types_la-newstore_types.lo:  \
-	os/newstore/$(am__dirstamp) \
-	os/newstore/$(DEPDIR)/$(am__dirstamp)
-
-libos_types.la: $(libos_types_la_OBJECTS) $(libos_types_la_DEPENDENCIES) $(EXTRA_libos_types_la_DEPENDENCIES) 
-	$(AM_V_CXXLD)$(libos_types_la_LINK)  $(libos_types_la_OBJECTS) $(libos_types_la_LIBADD) $(LIBS)
-osd/libosd_la-PG.lo: osd/$(am__dirstamp) osd/$(DEPDIR)/$(am__dirstamp)
-osd/libosd_la-ReplicatedPG.lo: osd/$(am__dirstamp) \
-	osd/$(DEPDIR)/$(am__dirstamp)
-osd/libosd_la-ReplicatedBackend.lo: osd/$(am__dirstamp) \
-	osd/$(DEPDIR)/$(am__dirstamp)
-osd/libosd_la-ECBackend.lo: osd/$(am__dirstamp) \
-	osd/$(DEPDIR)/$(am__dirstamp)
-osd/libosd_la-ECMsgTypes.lo: osd/$(am__dirstamp) \
-	osd/$(DEPDIR)/$(am__dirstamp)
-osd/libosd_la-ECTransaction.lo: osd/$(am__dirstamp) \
-	osd/$(DEPDIR)/$(am__dirstamp)
-osd/libosd_la-PGBackend.lo: osd/$(am__dirstamp) \
-	osd/$(DEPDIR)/$(am__dirstamp)
-osd/libosd_la-HitSet.lo: osd/$(am__dirstamp) \
-	osd/$(DEPDIR)/$(am__dirstamp)
-osd/libosd_la-OSD.lo: osd/$(am__dirstamp) \
-	osd/$(DEPDIR)/$(am__dirstamp)
-osd/libosd_la-OSDCap.lo: osd/$(am__dirstamp) \
-	osd/$(DEPDIR)/$(am__dirstamp)
-osd/libosd_la-Watch.lo: osd/$(am__dirstamp) \
-	osd/$(DEPDIR)/$(am__dirstamp)
-osd/libosd_la-ClassHandler.lo: osd/$(am__dirstamp) \
-	osd/$(DEPDIR)/$(am__dirstamp)
-osd/libosd_la-OpRequest.lo: osd/$(am__dirstamp) \
-	osd/$(DEPDIR)/$(am__dirstamp)
-common/libosd_la-TrackedOp.lo: common/$(am__dirstamp) \
-	common/$(DEPDIR)/$(am__dirstamp)
-osd/libosd_la-SnapMapper.lo: osd/$(am__dirstamp) \
-	osd/$(DEPDIR)/$(am__dirstamp)
-objclass/$(am__dirstamp):
-	@$(MKDIR_P) objclass
-	@: > objclass/$(am__dirstamp)
-objclass/$(DEPDIR)/$(am__dirstamp):
-	@$(MKDIR_P) objclass/$(DEPDIR)
-	@: > objclass/$(DEPDIR)/$(am__dirstamp)
-objclass/libosd_la-class_api.lo: objclass/$(am__dirstamp) \
-	objclass/$(DEPDIR)/$(am__dirstamp)
-
-libosd.la: $(libosd_la_OBJECTS) $(libosd_la_DEPENDENCIES) $(EXTRA_libosd_la_DEPENDENCIES) 
-	$(AM_V_CXXLD)$(libosd_la_LINK) $(am_libosd_la_rpath) $(libosd_la_OBJECTS) $(libosd_la_LIBADD) $(LIBS)
 tracing/libosd_tp_la-oprequest.lo: tracing/$(am__dirstamp) \
 	tracing/$(DEPDIR)/$(am__dirstamp)
 tracing/libosd_tp_la-osd.lo: tracing/$(am__dirstamp) \
@@ -12366,7 +12930,11 @@ librbd_api.la: $(librbd_api_la_OBJECTS) $(librbd_api_la_DEPENDENCIES) $(EXTRA_li
 	$(AM_V_CXXLD)$(CXXLINK) $(am_librbd_api_la_rpath) $(librbd_api_la_OBJECTS) $(librbd_api_la_LIBADD) $(LIBS)
 librbd/AioCompletion.lo: librbd/$(am__dirstamp) \
 	librbd/$(DEPDIR)/$(am__dirstamp)
-librbd/AioRequest.lo: librbd/$(am__dirstamp) \
+librbd/AioImageRequest.lo: librbd/$(am__dirstamp) \
+	librbd/$(DEPDIR)/$(am__dirstamp)
+librbd/AioImageRequestWQ.lo: librbd/$(am__dirstamp) \
+	librbd/$(DEPDIR)/$(am__dirstamp)
+librbd/AioObjectRequest.lo: librbd/$(am__dirstamp) \
 	librbd/$(DEPDIR)/$(am__dirstamp)
 librbd/AsyncFlattenRequest.lo: librbd/$(am__dirstamp) \
 	librbd/$(DEPDIR)/$(am__dirstamp)
@@ -12390,6 +12958,12 @@ librbd/ImageWatcher.lo: librbd/$(am__dirstamp) \
 	librbd/$(DEPDIR)/$(am__dirstamp)
 librbd/internal.lo: librbd/$(am__dirstamp) \
 	librbd/$(DEPDIR)/$(am__dirstamp)
+librbd/Journal.lo: librbd/$(am__dirstamp) \
+	librbd/$(DEPDIR)/$(am__dirstamp)
+librbd/JournalReplay.lo: librbd/$(am__dirstamp) \
+	librbd/$(DEPDIR)/$(am__dirstamp)
+librbd/LibrbdAdminSocketHook.lo: librbd/$(am__dirstamp) \
+	librbd/$(DEPDIR)/$(am__dirstamp)
 librbd/LibrbdWriteback.lo: librbd/$(am__dirstamp) \
 	librbd/$(DEPDIR)/$(am__dirstamp)
 librbd/ObjectMap.lo: librbd/$(am__dirstamp) \
@@ -12445,10 +13019,16 @@ test/librbd/librbd_test_la-test_support.lo:  \
 test/librbd/librbd_test_la-test_librbd.lo:  \
 	test/librbd/$(am__dirstamp) \
 	test/librbd/$(DEPDIR)/$(am__dirstamp)
-test/librbd/librbd_test_la-test_ImageWatcher.lo:  \
+test/librbd/librbd_test_la-test_ImageWatcher.lo:  \
+	test/librbd/$(am__dirstamp) \
+	test/librbd/$(DEPDIR)/$(am__dirstamp)
+test/librbd/librbd_test_la-test_internal.lo:  \
+	test/librbd/$(am__dirstamp) \
+	test/librbd/$(DEPDIR)/$(am__dirstamp)
+test/librbd/librbd_test_la-test_JournalEntries.lo:  \
 	test/librbd/$(am__dirstamp) \
 	test/librbd/$(DEPDIR)/$(am__dirstamp)
-test/librbd/librbd_test_la-test_internal.lo:  \
+test/librbd/librbd_test_la-test_JournalReplay.lo:  \
 	test/librbd/$(am__dirstamp) \
 	test/librbd/$(DEPDIR)/$(am__dirstamp)
 test/librbd/librbd_test_la-test_ObjectMap.lo:  \
@@ -12462,6 +13042,8 @@ tracing/librbd_tp_la-librbd.lo: tracing/$(am__dirstamp) \
 
 librbd_tp.la: $(librbd_tp_la_OBJECTS) $(librbd_tp_la_DEPENDENCIES) $(EXTRA_librbd_tp_la_DEPENDENCIES) 
 	$(AM_V_CCLD)$(librbd_tp_la_LINK) $(am_librbd_tp_la_rpath) $(librbd_tp_la_OBJECTS) $(librbd_tp_la_LIBADD) $(LIBS)
+librbd/JournalTypes.lo: librbd/$(am__dirstamp) \
+	librbd/$(DEPDIR)/$(am__dirstamp)
 librbd/WatchNotifyTypes.lo: librbd/$(am__dirstamp) \
 	librbd/$(DEPDIR)/$(am__dirstamp)
 
@@ -12806,6 +13388,8 @@ mds/ceph_dencoder-MDSTableServer.$(OBJEXT): mds/$(am__dirstamp) \
 	mds/$(DEPDIR)/$(am__dirstamp)
 mds/ceph_dencoder-SimpleLock.$(OBJEXT): mds/$(am__dirstamp) \
 	mds/$(DEPDIR)/$(am__dirstamp)
+mds/ceph_dencoder-ScrubStack.$(OBJEXT): mds/$(am__dirstamp) \
+	mds/$(DEPDIR)/$(am__dirstamp)
 mds/ceph_dencoder-SnapRealm.$(OBJEXT): mds/$(am__dirstamp) \
 	mds/$(DEPDIR)/$(am__dirstamp)
 mds/ceph_dencoder-SnapServer.$(OBJEXT): mds/$(am__dirstamp) \
@@ -12820,8 +13404,6 @@ mds/ceph_dencoder-MDSAuthCaps.$(OBJEXT): mds/$(am__dirstamp) \
 	mds/$(DEPDIR)/$(am__dirstamp)
 mds/ceph_dencoder-MDLog.$(OBJEXT): mds/$(am__dirstamp) \
 	mds/$(DEPDIR)/$(am__dirstamp)
-common/ceph_dencoder-TrackedOp.$(OBJEXT): common/$(am__dirstamp) \
-	common/$(DEPDIR)/$(am__dirstamp)
 perfglue/ceph_dencoder-disabled_heap_profiler.$(OBJEXT):  \
 	perfglue/$(am__dirstamp) perfglue/$(DEPDIR)/$(am__dirstamp)
 perfglue/ceph_dencoder-disabled_stubs.$(OBJEXT):  \
@@ -13112,6 +13694,19 @@ test/cls_hello/ceph_test_cls_hello-test_cls_hello.$(OBJEXT):  \
 ceph_test_cls_hello$(EXEEXT): $(ceph_test_cls_hello_OBJECTS) $(ceph_test_cls_hello_DEPENDENCIES) $(EXTRA_ceph_test_cls_hello_DEPENDENCIES) 
 	@rm -f ceph_test_cls_hello$(EXEEXT)
 	$(AM_V_CXXLD)$(ceph_test_cls_hello_LINK) $(ceph_test_cls_hello_OBJECTS) $(ceph_test_cls_hello_LDADD) $(LIBS)
+test/cls_journal/$(am__dirstamp):
+	@$(MKDIR_P) test/cls_journal
+	@: > test/cls_journal/$(am__dirstamp)
+test/cls_journal/$(DEPDIR)/$(am__dirstamp):
+	@$(MKDIR_P) test/cls_journal/$(DEPDIR)
+	@: > test/cls_journal/$(DEPDIR)/$(am__dirstamp)
+test/cls_journal/ceph_test_cls_journal-test_cls_journal.$(OBJEXT):  \
+	test/cls_journal/$(am__dirstamp) \
+	test/cls_journal/$(DEPDIR)/$(am__dirstamp)
+
+ceph_test_cls_journal$(EXEEXT): $(ceph_test_cls_journal_OBJECTS) $(ceph_test_cls_journal_DEPENDENCIES) $(EXTRA_ceph_test_cls_journal_DEPENDENCIES) 
+	@rm -f ceph_test_cls_journal$(EXEEXT)
+	$(AM_V_CXXLD)$(ceph_test_cls_journal_LINK) $(ceph_test_cls_journal_OBJECTS) $(ceph_test_cls_journal_LDADD) $(LIBS)
 test/cls_lock/$(am__dirstamp):
 	@$(MKDIR_P) test/cls_lock
 	@: > test/cls_lock/$(am__dirstamp)
@@ -13378,6 +13973,9 @@ test/libcephfs/ceph_test_libcephfs-caps.$(OBJEXT):  \
 test/libcephfs/ceph_test_libcephfs-multiclient.$(OBJEXT):  \
 	test/libcephfs/$(am__dirstamp) \
 	test/libcephfs/$(DEPDIR)/$(am__dirstamp)
+test/libcephfs/ceph_test_libcephfs-access.$(OBJEXT):  \
+	test/libcephfs/$(am__dirstamp) \
+	test/libcephfs/$(DEPDIR)/$(am__dirstamp)
 test/libcephfs/ceph_test_libcephfs-flock.$(OBJEXT):  \
 	test/libcephfs/$(am__dirstamp) \
 	test/libcephfs/$(DEPDIR)/$(am__dirstamp)
@@ -13896,6 +14494,89 @@ rgw/rgw_object_expirer.$(OBJEXT): rgw/$(am__dirstamp) \
 radosgw-object-expirer$(EXEEXT): $(radosgw_object_expirer_OBJECTS) $(radosgw_object_expirer_DEPENDENCIES) $(EXTRA_radosgw_object_expirer_DEPENDENCIES) 
 	@rm -f radosgw-object-expirer$(EXEEXT)
 	$(AM_V_CXXLD)$(CXXLINK) $(radosgw_object_expirer_OBJECTS) $(radosgw_object_expirer_LDADD) $(LIBS)
+tools/rbd/$(am__dirstamp):
+	@$(MKDIR_P) tools/rbd
+	@: > tools/rbd/$(am__dirstamp)
+tools/rbd/$(DEPDIR)/$(am__dirstamp):
+	@$(MKDIR_P) tools/rbd/$(DEPDIR)
+	@: > tools/rbd/$(DEPDIR)/$(am__dirstamp)
+tools/rbd/rbd.$(OBJEXT): tools/rbd/$(am__dirstamp) \
+	tools/rbd/$(DEPDIR)/$(am__dirstamp)
+tools/rbd/ArgumentTypes.$(OBJEXT): tools/rbd/$(am__dirstamp) \
+	tools/rbd/$(DEPDIR)/$(am__dirstamp)
+tools/rbd/IndentStream.$(OBJEXT): tools/rbd/$(am__dirstamp) \
+	tools/rbd/$(DEPDIR)/$(am__dirstamp)
+tools/rbd/OptionPrinter.$(OBJEXT): tools/rbd/$(am__dirstamp) \
+	tools/rbd/$(DEPDIR)/$(am__dirstamp)
+tools/rbd/Shell.$(OBJEXT): tools/rbd/$(am__dirstamp) \
+	tools/rbd/$(DEPDIR)/$(am__dirstamp)
+tools/rbd/Utils.$(OBJEXT): tools/rbd/$(am__dirstamp) \
+	tools/rbd/$(DEPDIR)/$(am__dirstamp)
+tools/rbd/action/$(am__dirstamp):
+	@$(MKDIR_P) tools/rbd/action
+	@: > tools/rbd/action/$(am__dirstamp)
+tools/rbd/action/$(DEPDIR)/$(am__dirstamp):
+	@$(MKDIR_P) tools/rbd/action/$(DEPDIR)
+	@: > tools/rbd/action/$(DEPDIR)/$(am__dirstamp)
+tools/rbd/action/BenchWrite.$(OBJEXT):  \
+	tools/rbd/action/$(am__dirstamp) \
+	tools/rbd/action/$(DEPDIR)/$(am__dirstamp)
+tools/rbd/action/Children.$(OBJEXT): tools/rbd/action/$(am__dirstamp) \
+	tools/rbd/action/$(DEPDIR)/$(am__dirstamp)
+tools/rbd/action/Clone.$(OBJEXT): tools/rbd/action/$(am__dirstamp) \
+	tools/rbd/action/$(DEPDIR)/$(am__dirstamp)
+tools/rbd/action/Copy.$(OBJEXT): tools/rbd/action/$(am__dirstamp) \
+	tools/rbd/action/$(DEPDIR)/$(am__dirstamp)
+tools/rbd/action/Create.$(OBJEXT): tools/rbd/action/$(am__dirstamp) \
+	tools/rbd/action/$(DEPDIR)/$(am__dirstamp)
+tools/rbd/action/Diff.$(OBJEXT): tools/rbd/action/$(am__dirstamp) \
+	tools/rbd/action/$(DEPDIR)/$(am__dirstamp)
+tools/rbd/action/DiskUsage.$(OBJEXT):  \
+	tools/rbd/action/$(am__dirstamp) \
+	tools/rbd/action/$(DEPDIR)/$(am__dirstamp)
+tools/rbd/action/Export.$(OBJEXT): tools/rbd/action/$(am__dirstamp) \
+	tools/rbd/action/$(DEPDIR)/$(am__dirstamp)
+tools/rbd/action/ExportDiff.$(OBJEXT):  \
+	tools/rbd/action/$(am__dirstamp) \
+	tools/rbd/action/$(DEPDIR)/$(am__dirstamp)
+tools/rbd/action/Feature.$(OBJEXT): tools/rbd/action/$(am__dirstamp) \
+	tools/rbd/action/$(DEPDIR)/$(am__dirstamp)
+tools/rbd/action/Flatten.$(OBJEXT): tools/rbd/action/$(am__dirstamp) \
+	tools/rbd/action/$(DEPDIR)/$(am__dirstamp)
+tools/rbd/action/ImageMeta.$(OBJEXT):  \
+	tools/rbd/action/$(am__dirstamp) \
+	tools/rbd/action/$(DEPDIR)/$(am__dirstamp)
+tools/rbd/action/Import.$(OBJEXT): tools/rbd/action/$(am__dirstamp) \
+	tools/rbd/action/$(DEPDIR)/$(am__dirstamp)
+tools/rbd/action/ImportDiff.$(OBJEXT):  \
+	tools/rbd/action/$(am__dirstamp) \
+	tools/rbd/action/$(DEPDIR)/$(am__dirstamp)
+tools/rbd/action/Info.$(OBJEXT): tools/rbd/action/$(am__dirstamp) \
+	tools/rbd/action/$(DEPDIR)/$(am__dirstamp)
+tools/rbd/action/Kernel.$(OBJEXT): tools/rbd/action/$(am__dirstamp) \
+	tools/rbd/action/$(DEPDIR)/$(am__dirstamp)
+tools/rbd/action/List.$(OBJEXT): tools/rbd/action/$(am__dirstamp) \
+	tools/rbd/action/$(DEPDIR)/$(am__dirstamp)
+tools/rbd/action/Lock.$(OBJEXT): tools/rbd/action/$(am__dirstamp) \
+	tools/rbd/action/$(DEPDIR)/$(am__dirstamp)
+tools/rbd/action/MergeDiff.$(OBJEXT):  \
+	tools/rbd/action/$(am__dirstamp) \
+	tools/rbd/action/$(DEPDIR)/$(am__dirstamp)
+tools/rbd/action/ObjectMap.$(OBJEXT):  \
+	tools/rbd/action/$(am__dirstamp) \
+	tools/rbd/action/$(DEPDIR)/$(am__dirstamp)
+tools/rbd/action/Remove.$(OBJEXT): tools/rbd/action/$(am__dirstamp) \
+	tools/rbd/action/$(DEPDIR)/$(am__dirstamp)
+tools/rbd/action/Rename.$(OBJEXT): tools/rbd/action/$(am__dirstamp) \
+	tools/rbd/action/$(DEPDIR)/$(am__dirstamp)
+tools/rbd/action/Resize.$(OBJEXT): tools/rbd/action/$(am__dirstamp) \
+	tools/rbd/action/$(DEPDIR)/$(am__dirstamp)
+tools/rbd/action/Snap.$(OBJEXT): tools/rbd/action/$(am__dirstamp) \
+	tools/rbd/action/$(DEPDIR)/$(am__dirstamp)
+tools/rbd/action/Status.$(OBJEXT): tools/rbd/action/$(am__dirstamp) \
+	tools/rbd/action/$(DEPDIR)/$(am__dirstamp)
+tools/rbd/action/Watch.$(OBJEXT): tools/rbd/action/$(am__dirstamp) \
+	tools/rbd/action/$(DEPDIR)/$(am__dirstamp)
 
 rbd$(EXEEXT): $(rbd_OBJECTS) $(rbd_DEPENDENCIES) $(EXTRA_rbd_DEPENDENCIES) 
 	@rm -f rbd$(EXEEXT)
@@ -14702,6 +15383,49 @@ test/unittest_ipaddr-test_ipaddr.$(OBJEXT): test/$(am__dirstamp) \
 unittest_ipaddr$(EXEEXT): $(unittest_ipaddr_OBJECTS) $(unittest_ipaddr_DEPENDENCIES) $(EXTRA_unittest_ipaddr_DEPENDENCIES) 
 	@rm -f unittest_ipaddr$(EXEEXT)
 	$(AM_V_CXXLD)$(unittest_ipaddr_LINK) $(unittest_ipaddr_OBJECTS) $(unittest_ipaddr_LDADD) $(LIBS)
+test/journal/$(am__dirstamp):
+	@$(MKDIR_P) test/journal
+	@: > test/journal/$(am__dirstamp)
+test/journal/$(DEPDIR)/$(am__dirstamp):
+	@$(MKDIR_P) test/journal/$(DEPDIR)
+	@: > test/journal/$(DEPDIR)/$(am__dirstamp)
+test/journal/unittest_journal-test_main.$(OBJEXT):  \
+	test/journal/$(am__dirstamp) \
+	test/journal/$(DEPDIR)/$(am__dirstamp)
+test/journal/unittest_journal-test_Entry.$(OBJEXT):  \
+	test/journal/$(am__dirstamp) \
+	test/journal/$(DEPDIR)/$(am__dirstamp)
+test/journal/unittest_journal-test_FutureImpl.$(OBJEXT):  \
+	test/journal/$(am__dirstamp) \
+	test/journal/$(DEPDIR)/$(am__dirstamp)
+test/journal/unittest_journal-test_Journaler.$(OBJEXT):  \
+	test/journal/$(am__dirstamp) \
+	test/journal/$(DEPDIR)/$(am__dirstamp)
+test/journal/unittest_journal-test_JournalMetadata.$(OBJEXT):  \
+	test/journal/$(am__dirstamp) \
+	test/journal/$(DEPDIR)/$(am__dirstamp)
+test/journal/unittest_journal-test_JournalPlayer.$(OBJEXT):  \
+	test/journal/$(am__dirstamp) \
+	test/journal/$(DEPDIR)/$(am__dirstamp)
+test/journal/unittest_journal-test_JournalRecorder.$(OBJEXT):  \
+	test/journal/$(am__dirstamp) \
+	test/journal/$(DEPDIR)/$(am__dirstamp)
+test/journal/unittest_journal-test_JournalTrimmer.$(OBJEXT):  \
+	test/journal/$(am__dirstamp) \
+	test/journal/$(DEPDIR)/$(am__dirstamp)
+test/journal/unittest_journal-test_ObjectPlayer.$(OBJEXT):  \
+	test/journal/$(am__dirstamp) \
+	test/journal/$(DEPDIR)/$(am__dirstamp)
+test/journal/unittest_journal-test_ObjectRecorder.$(OBJEXT):  \
+	test/journal/$(am__dirstamp) \
+	test/journal/$(DEPDIR)/$(am__dirstamp)
+test/journal/unittest_journal-RadosTestFixture.$(OBJEXT):  \
+	test/journal/$(am__dirstamp) \
+	test/journal/$(DEPDIR)/$(am__dirstamp)
+
+unittest_journal$(EXEEXT): $(unittest_journal_OBJECTS) $(unittest_journal_DEPENDENCIES) $(EXTRA_unittest_journal_DEPENDENCIES) 
+	@rm -f unittest_journal$(EXEEXT)
+	$(AM_V_CXXLD)$(unittest_journal_LINK) $(unittest_journal_OBJECTS) $(unittest_journal_LDADD) $(LIBS)
 test/os/$(am__dirstamp):
 	@$(MKDIR_P) test/os
 	@: > test/os/$(am__dirstamp)
@@ -15330,6 +16054,8 @@ mostlyclean-compile:
 	-rm -f cls/cephfs/*.lo
 	-rm -f cls/hello/*.$(OBJEXT)
 	-rm -f cls/hello/*.lo
+	-rm -f cls/journal/*.$(OBJEXT)
+	-rm -f cls/journal/*.lo
 	-rm -f cls/lock/*.$(OBJEXT)
 	-rm -f cls/lock/*.lo
 	-rm -f cls/log/*.$(OBJEXT)
@@ -15380,10 +16106,13 @@ mostlyclean-compile:
 	-rm -f global/*.lo
 	-rm -f java/native/*.$(OBJEXT)
 	-rm -f java/native/*.lo
+	-rm -f journal/*.$(OBJEXT)
+	-rm -f journal/*.lo
 	-rm -f json_spirit/*.$(OBJEXT)
 	-rm -f json_spirit/*.lo
 	-rm -f key_value_store/*.$(OBJEXT)
 	-rm -f key_value_store/*.lo
+	-rm -f kv/*.$(OBJEXT)
 	-rm -f librados/*.$(OBJEXT)
 	-rm -f librados/*.lo
 	-rm -f libradosstriper/*.$(OBJEXT)
@@ -15406,13 +16135,9 @@ mostlyclean-compile:
 	-rm -f msg/xio/*.$(OBJEXT)
 	-rm -f msg/xio/*.lo
 	-rm -f objclass/*.$(OBJEXT)
-	-rm -f objclass/*.lo
 	-rm -f os/*.$(OBJEXT)
-	-rm -f os/*.lo
 	-rm -f os/fs/*.$(OBJEXT)
-	-rm -f os/fs/*.lo
 	-rm -f os/newstore/*.$(OBJEXT)
-	-rm -f os/newstore/*.lo
 	-rm -f osd/*.$(OBJEXT)
 	-rm -f osd/*.lo
 	-rm -f osdc/*.$(OBJEXT)
@@ -15428,6 +16153,7 @@ mostlyclean-compile:
 	-rm -f test/ObjectMap/*.$(OBJEXT)
 	-rm -f test/bench/*.$(OBJEXT)
 	-rm -f test/cls_hello/*.$(OBJEXT)
+	-rm -f test/cls_journal/*.$(OBJEXT)
 	-rm -f test/cls_lock/*.$(OBJEXT)
 	-rm -f test/cls_log/*.$(OBJEXT)
 	-rm -f test/cls_numops/*.$(OBJEXT)
@@ -15444,6 +16170,7 @@ mostlyclean-compile:
 	-rm -f test/erasure-code/*.lo
 	-rm -f test/filestore/*.$(OBJEXT)
 	-rm -f test/fs/*.$(OBJEXT)
+	-rm -f test/journal/*.$(OBJEXT)
 	-rm -f test/libcephfs/*.$(OBJEXT)
 	-rm -f test/librados/*.$(OBJEXT)
 	-rm -f test/librados/*.lo
@@ -15467,6 +16194,8 @@ mostlyclean-compile:
 	-rm -f tools/*.$(OBJEXT)
 	-rm -f tools/cephfs/*.$(OBJEXT)
 	-rm -f tools/rados/*.$(OBJEXT)
+	-rm -f tools/rbd/*.$(OBJEXT)
+	-rm -f tools/rbd/action/*.$(OBJEXT)
 	-rm -f tracing/*.$(OBJEXT)
 	-rm -f tracing/*.lo
 
@@ -15483,7 +16212,6 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/krbd.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libcephfs.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/librados-config.Po@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rbd.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@arch/$(DEPDIR)/arm.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@arch/$(DEPDIR)/intel.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@arch/$(DEPDIR)/probe.Plo@am__quote@
@@ -15517,6 +16245,9 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@cls/cephfs/$(DEPDIR)/cls_cephfs.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@cls/cephfs/$(DEPDIR)/cls_cephfs_client.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@cls/hello/$(DEPDIR)/cls_hello.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@cls/journal/$(DEPDIR)/cls_journal.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@cls/journal/$(DEPDIR)/cls_journal_client.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@cls/journal/$(DEPDIR)/cls_journal_types.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@cls/lock/$(DEPDIR)/cls_lock.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@cls/lock/$(DEPDIR)/cls_lock_client.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@cls/lock/$(DEPDIR)/cls_lock_ops.Plo@am__quote@
@@ -15587,7 +16318,6 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@common/$(DEPDIR)/ceph_context.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@common/$(DEPDIR)/ceph_crypto.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@common/$(DEPDIR)/ceph_crypto_cms.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@common/$(DEPDIR)/ceph_dencoder-TrackedOp.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@common/$(DEPDIR)/ceph_frag.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@common/$(DEPDIR)/ceph_fs.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@common/$(DEPDIR)/ceph_hash.Plo@am__quote@
@@ -15616,8 +16346,6 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@common/$(DEPDIR)/libcommon_crc_la-crc32c_intel_fast_zero_asm.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@common/$(DEPDIR)/libcommon_crc_la-sctp_crc32.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@common/$(DEPDIR)/libec_lrc_la-str_map.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@common/$(DEPDIR)/libos_la-TrackedOp.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@common/$(DEPDIR)/libosd_la-TrackedOp.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@common/$(DEPDIR)/librados_la-buffer.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@common/$(DEPDIR)/linux_version.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@common/$(DEPDIR)/lockdep.Plo@am__quote@
@@ -15634,6 +16362,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@common/$(DEPDIR)/signal.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@common/$(DEPDIR)/simple_spin.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@common/$(DEPDIR)/snap_types.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@common/$(DEPDIR)/solaris_errno.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@common/$(DEPDIR)/str_list.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@common/$(DEPDIR)/str_map.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@common/$(DEPDIR)/strtol.Plo@am__quote@
@@ -15955,10 +16684,26 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@global/$(DEPDIR)/signal_handler.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@java/native/$(DEPDIR)/libcephfs_jni_la-JniConstants.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@java/native/$(DEPDIR)/libcephfs_jni_la-libcephfs_jni.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@journal/$(DEPDIR)/AsyncOpTracker.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@journal/$(DEPDIR)/Entry.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@journal/$(DEPDIR)/Future.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@journal/$(DEPDIR)/FutureImpl.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@journal/$(DEPDIR)/JournalMetadata.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@journal/$(DEPDIR)/JournalPlayer.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@journal/$(DEPDIR)/JournalRecorder.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@journal/$(DEPDIR)/JournalTrimmer.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@journal/$(DEPDIR)/Journaler.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@journal/$(DEPDIR)/ObjectPlayer.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@journal/$(DEPDIR)/ObjectRecorder.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@journal/$(DEPDIR)/Utils.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@json_spirit/$(DEPDIR)/json_spirit_reader.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@json_spirit/$(DEPDIR)/json_spirit_writer.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@key_value_store/$(DEPDIR)/cls_kvs.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@key_value_store/$(DEPDIR)/kv_flat_btree_async.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@kv/$(DEPDIR)/libkv_a-KeyValueDB.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@kv/$(DEPDIR)/libkv_a-KineticStore.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@kv/$(DEPDIR)/libkv_a-LevelDBStore.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@kv/$(DEPDIR)/libkv_a-RocksDBStore.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@librados/$(DEPDIR)/IoCtxImpl.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@librados/$(DEPDIR)/RadosClient.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@librados/$(DEPDIR)/RadosXattrIter.Plo@am__quote@
@@ -15970,7 +16715,9 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@libradosstriper/$(DEPDIR)/libradosstriper_la-RadosStriperImpl.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@libradosstriper/$(DEPDIR)/libradosstriper_la-libradosstriper.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@librbd/$(DEPDIR)/AioCompletion.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@librbd/$(DEPDIR)/AioRequest.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@librbd/$(DEPDIR)/AioImageRequest.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@librbd/$(DEPDIR)/AioImageRequestWQ.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@librbd/$(DEPDIR)/AioObjectRequest.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@librbd/$(DEPDIR)/AsyncFlattenRequest.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@librbd/$(DEPDIR)/AsyncObjectThrottle.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@librbd/$(DEPDIR)/AsyncOperation.Plo@am__quote@
@@ -15981,6 +16728,10 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@librbd/$(DEPDIR)/DiffIterate.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@librbd/$(DEPDIR)/ImageCtx.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@librbd/$(DEPDIR)/ImageWatcher.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@librbd/$(DEPDIR)/Journal.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@librbd/$(DEPDIR)/JournalReplay.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@librbd/$(DEPDIR)/JournalTypes.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@librbd/$(DEPDIR)/LibrbdAdminSocketHook.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@librbd/$(DEPDIR)/LibrbdWriteback.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@librbd/$(DEPDIR)/ObjectMap.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@librbd/$(DEPDIR)/RebuildObjectMapRequest.Plo@am__quote@
@@ -16014,6 +16765,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@mds/$(DEPDIR)/Migrator.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@mds/$(DEPDIR)/Mutation.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@mds/$(DEPDIR)/RecoveryQueue.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@mds/$(DEPDIR)/ScrubStack.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@mds/$(DEPDIR)/Server.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@mds/$(DEPDIR)/SessionMap.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@mds/$(DEPDIR)/SimpleLock.Plo@am__quote@
@@ -16042,6 +16794,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@mds/$(DEPDIR)/ceph_dencoder-Migrator.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@mds/$(DEPDIR)/ceph_dencoder-Mutation.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@mds/$(DEPDIR)/ceph_dencoder-RecoveryQueue.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@mds/$(DEPDIR)/ceph_dencoder-ScrubStack.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@mds/$(DEPDIR)/ceph_dencoder-Server.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@mds/$(DEPDIR)/ceph_dencoder-SessionMap.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@mds/$(DEPDIR)/ceph_dencoder-SimpleLock.Po@am__quote@
@@ -16057,23 +16810,23 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@mds/$(DEPDIR)/locks.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@mds/$(DEPDIR)/mdstypes.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@mds/$(DEPDIR)/snap.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@mon/$(DEPDIR)/AuthMonitor.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@mon/$(DEPDIR)/ConfigKeyService.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@mon/$(DEPDIR)/DataHealthService.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@mon/$(DEPDIR)/Elector.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@mon/$(DEPDIR)/HealthMonitor.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@mon/$(DEPDIR)/LogMonitor.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@mon/$(DEPDIR)/MDSMonitor.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@mon/$(DEPDIR)/AuthMonitor.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@mon/$(DEPDIR)/ConfigKeyService.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@mon/$(DEPDIR)/DataHealthService.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@mon/$(DEPDIR)/Elector.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@mon/$(DEPDIR)/HealthMonitor.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@mon/$(DEPDIR)/LogMonitor.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@mon/$(DEPDIR)/MDSMonitor.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@mon/$(DEPDIR)/MonCap.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@mon/$(DEPDIR)/MonClient.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@mon/$(DEPDIR)/MonMap.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@mon/$(DEPDIR)/Monitor.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@mon/$(DEPDIR)/MonmapMonitor.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@mon/$(DEPDIR)/OSDMonitor.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@mon/$(DEPDIR)/Monitor.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@mon/$(DEPDIR)/MonmapMonitor.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@mon/$(DEPDIR)/OSDMonitor.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@mon/$(DEPDIR)/PGMap.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@mon/$(DEPDIR)/PGMonitor.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@mon/$(DEPDIR)/Paxos.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@mon/$(DEPDIR)/PaxosService.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@mon/$(DEPDIR)/PGMonitor.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@mon/$(DEPDIR)/Paxos.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@mon/$(DEPDIR)/PaxosService.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@mount/$(DEPDIR)/mount.ceph.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@msg/$(DEPDIR)/Message.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@msg/$(DEPDIR)/Messenger.Plo@am__quote@
@@ -16096,52 +16849,48 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@msg/xio/$(DEPDIR)/XioMsg.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@msg/xio/$(DEPDIR)/XioPool.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@msg/xio/$(DEPDIR)/XioPortal.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@objclass/$(DEPDIR)/libosd_la-class_api.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@os/$(DEPDIR)/libos_la-BtrfsFileStoreBackend.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@os/$(DEPDIR)/libos_la-DBObjectMap.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@os/$(DEPDIR)/libos_la-FileJournal.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@os/$(DEPDIR)/libos_la-FileStore.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@os/$(DEPDIR)/libos_la-GenericFileStoreBackend.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@os/$(DEPDIR)/libos_la-GenericObjectMap.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@os/$(DEPDIR)/libos_la-HashIndex.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@os/$(DEPDIR)/libos_la-IndexManager.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@os/$(DEPDIR)/libos_la-JournalingObjectStore.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@os/$(DEPDIR)/libos_la-KeyValueDB.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@os/$(DEPDIR)/libos_la-KeyValueStore.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@os/$(DEPDIR)/libos_la-KineticStore.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@os/$(DEPDIR)/libos_la-LFNIndex.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@os/$(DEPDIR)/libos_la-LevelDBStore.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@os/$(DEPDIR)/libos_la-MemStore.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@os/$(DEPDIR)/libos_la-ObjectStore.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@os/$(DEPDIR)/libos_la-WBThrottle.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@os/$(DEPDIR)/libos_la-XfsFileStoreBackend.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@os/$(DEPDIR)/libos_la-ZFSFileStoreBackend.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@os/$(DEPDIR)/libos_la-chain_xattr.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@os/$(DEPDIR)/libos_rocksdb_la-RocksDBStore.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@os/$(DEPDIR)/libos_types_la-Transaction.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@objclass/$(DEPDIR)/libosd_a-class_api.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@os/$(DEPDIR)/BtrfsFileStoreBackend.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@os/$(DEPDIR)/DBObjectMap.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@os/$(DEPDIR)/FileJournal.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@os/$(DEPDIR)/FileStore.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@os/$(DEPDIR)/GenericFileStoreBackend.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@os/$(DEPDIR)/GenericObjectMap.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@os/$(DEPDIR)/HashIndex.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@os/$(DEPDIR)/IndexManager.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@os/$(DEPDIR)/JournalingObjectStore.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@os/$(DEPDIR)/KeyValueStore.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@os/$(DEPDIR)/LFNIndex.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@os/$(DEPDIR)/MemStore.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@os/$(DEPDIR)/ObjectStore.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@os/$(DEPDIR)/WBThrottle.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@os/$(DEPDIR)/XfsFileStoreBackend.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@os/$(DEPDIR)/ZFSFileStoreBackend.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@os/$(DEPDIR)/chain_xattr.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@os/$(DEPDIR)/libos_types_a-Transaction.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@os/$(DEPDIR)/libos_zfs_a-ZFS.Po@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@os/fs/$(DEPDIR)/libos_la-FS.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@os/fs/$(DEPDIR)/libos_la-XFS.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@os/newstore/$(DEPDIR)/libos_la-NewStore.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@os/newstore/$(DEPDIR)/libos_types_la-newstore_types.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@os/fs/$(DEPDIR)/FS.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@os/fs/$(DEPDIR)/XFS.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@os/newstore/$(DEPDIR)/NewStore.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@os/newstore/$(DEPDIR)/libos_types_a-newstore_types.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@osd/$(DEPDIR)/ECMsgTypes.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@osd/$(DEPDIR)/HitSet.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@osd/$(DEPDIR)/OSDMap.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@osd/$(DEPDIR)/ceph_test_rados_api_tier-HitSet.Po@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@osd/$(DEPDIR)/libosd_la-ClassHandler.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@osd/$(DEPDIR)/libosd_la-ECBackend.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@osd/$(DEPDIR)/libosd_la-ECMsgTypes.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@osd/$(DEPDIR)/libosd_la-ECTransaction.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@osd/$(DEPDIR)/libosd_la-HitSet.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@osd/$(DEPDIR)/libosd_la-OSD.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@osd/$(DEPDIR)/libosd_la-OSDCap.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@osd/$(DEPDIR)/libosd_la-OpRequest.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@osd/$(DEPDIR)/libosd_la-PG.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@osd/$(DEPDIR)/libosd_la-PGBackend.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@osd/$(DEPDIR)/libosd_la-ReplicatedBackend.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@osd/$(DEPDIR)/libosd_la-ReplicatedPG.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@osd/$(DEPDIR)/libosd_la-SnapMapper.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@osd/$(DEPDIR)/libosd_la-Watch.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@osd/$(DEPDIR)/libosd_a-ClassHandler.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@osd/$(DEPDIR)/libosd_a-ECBackend.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@osd/$(DEPDIR)/libosd_a-ECMsgTypes.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@osd/$(DEPDIR)/libosd_a-ECTransaction.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@osd/$(DEPDIR)/libosd_a-HitSet.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@osd/$(DEPDIR)/libosd_a-OSD.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@osd/$(DEPDIR)/libosd_a-OSDCap.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@osd/$(DEPDIR)/libosd_a-OpRequest.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@osd/$(DEPDIR)/libosd_a-PG.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@osd/$(DEPDIR)/libosd_a-PGBackend.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@osd/$(DEPDIR)/libosd_a-ReplicatedBackend.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@osd/$(DEPDIR)/libosd_a-ReplicatedPG.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@osd/$(DEPDIR)/libosd_a-SnapMapper.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@osd/$(DEPDIR)/libosd_a-Watch.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@osd/$(DEPDIR)/libosd_types_la-ECUtil.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@osd/$(DEPDIR)/libosd_types_la-PGLog.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@osd/$(DEPDIR)/libosd_types_la-osd_types.Plo@am__quote@
@@ -16360,6 +17109,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@test/bench/$(DEPDIR)/testfilestore_backend.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@test/bench/$(DEPDIR)/tp_bench.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@test/cls_hello/$(DEPDIR)/ceph_test_cls_hello-test_cls_hello.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@test/cls_journal/$(DEPDIR)/ceph_test_cls_journal-test_cls_journal.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@test/cls_lock/$(DEPDIR)/ceph_test_cls_lock-test_cls_lock.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@test/cls_log/$(DEPDIR)/ceph_test_cls_log-test_cls_log.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@test/cls_numops/$(DEPDIR)/ceph_test_cls_numops-test_cls_numops.Po@am__quote@
@@ -16427,6 +17177,18 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@test/erasure-code/$(DEPDIR)/unittest_erasure_code_shec_thread-TestErasureCodeShec_thread.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@test/filestore/$(DEPDIR)/ceph_test_filestore-TestFileStore.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@test/fs/$(DEPDIR)/unittest_mds_types-mds_types.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@test/journal/$(DEPDIR)/unittest_journal-RadosTestFixture.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@test/journal/$(DEPDIR)/unittest_journal-test_Entry.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@test/journal/$(DEPDIR)/unittest_journal-test_FutureImpl.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@test/journal/$(DEPDIR)/unittest_journal-test_JournalMetadata.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@test/journal/$(DEPDIR)/unittest_journal-test_JournalPlayer.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@test/journal/$(DEPDIR)/unittest_journal-test_JournalRecorder.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@test/journal/$(DEPDIR)/unittest_journal-test_JournalTrimmer.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@test/journal/$(DEPDIR)/unittest_journal-test_Journaler.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@test/journal/$(DEPDIR)/unittest_journal-test_ObjectPlayer.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@test/journal/$(DEPDIR)/unittest_journal-test_ObjectRecorder.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@test/journal/$(DEPDIR)/unittest_journal-test_main.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@test/libcephfs/$(DEPDIR)/ceph_test_libcephfs-access.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@test/libcephfs/$(DEPDIR)/ceph_test_libcephfs-caps.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@test/libcephfs/$(DEPDIR)/ceph_test_libcephfs-flock.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@test/libcephfs/$(DEPDIR)/ceph_test_libcephfs-multiclient.Po@am__quote@
@@ -16468,6 +17230,8 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@test/librbd/$(DEPDIR)/ceph_test_librbd_api-test_support.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@test/librbd/$(DEPDIR)/ceph_test_librbd_fsx-fsx.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@test/librbd/$(DEPDIR)/librbd_test_la-test_ImageWatcher.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@test/librbd/$(DEPDIR)/librbd_test_la-test_JournalEntries.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@test/librbd/$(DEPDIR)/librbd_test_la-test_JournalReplay.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@test/librbd/$(DEPDIR)/librbd_test_la-test_ObjectMap.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@test/librbd/$(DEPDIR)/librbd_test_la-test_fixture.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@test/librbd/$(DEPDIR)/librbd_test_la-test_internal.Plo@am__quote@
@@ -16564,6 +17328,38 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@tools/rados/$(DEPDIR)/PoolDump.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@tools/rados/$(DEPDIR)/RadosImport.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@tools/rados/$(DEPDIR)/rados.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@tools/rbd/$(DEPDIR)/ArgumentTypes.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@tools/rbd/$(DEPDIR)/IndentStream.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@tools/rbd/$(DEPDIR)/OptionPrinter.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@tools/rbd/$(DEPDIR)/Shell.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@tools/rbd/$(DEPDIR)/Utils.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@tools/rbd/$(DEPDIR)/rbd.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@tools/rbd/action/$(DEPDIR)/BenchWrite.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@tools/rbd/action/$(DEPDIR)/Children.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@tools/rbd/action/$(DEPDIR)/Clone.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@tools/rbd/action/$(DEPDIR)/Copy.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@tools/rbd/action/$(DEPDIR)/Create.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@tools/rbd/action/$(DEPDIR)/Diff.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@tools/rbd/action/$(DEPDIR)/DiskUsage.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@tools/rbd/action/$(DEPDIR)/Export.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@tools/rbd/action/$(DEPDIR)/ExportDiff.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@tools/rbd/action/$(DEPDIR)/Feature.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@tools/rbd/action/$(DEPDIR)/Flatten.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@tools/rbd/action/$(DEPDIR)/ImageMeta.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@tools/rbd/action/$(DEPDIR)/Import.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@tools/rbd/action/$(DEPDIR)/ImportDiff.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@tools/rbd/action/$(DEPDIR)/Info.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@tools/rbd/action/$(DEPDIR)/Kernel.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@tools/rbd/action/$(DEPDIR)/List.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@tools/rbd/action/$(DEPDIR)/Lock.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@tools/rbd/action/$(DEPDIR)/MergeDiff.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@tools/rbd/action/$(DEPDIR)/ObjectMap.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@tools/rbd/action/$(DEPDIR)/Remove.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@tools/rbd/action/$(DEPDIR)/Rename.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@tools/rbd/action/$(DEPDIR)/Resize.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@tools/rbd/action/$(DEPDIR)/Snap.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@tools/rbd/action/$(DEPDIR)/Status.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@tools/rbd/action/$(DEPDIR)/Watch.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@tracing/$(DEPDIR)/libos_tp_la-objectstore.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@tracing/$(DEPDIR)/libosd_tp_la-oprequest.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@tracing/$(DEPDIR)/libosd_tp_la-osd.Plo@am__quote@
@@ -18946,6 +19742,90 @@ erasure-code/jerasure/gf-complete/src/unittest_erasure_code_shec_thread-gf_w8.ob
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LTCXXCOMPILE) -c -o $@ $<
 
+kv/libkv_a-KeyValueDB.o: kv/KeyValueDB.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libkv_a_CXXFLAGS) $(CXXFLAGS) -MT kv/libkv_a-KeyValueDB.o -MD -MP -MF kv/$(DEPDIR)/libkv_a-KeyValueDB.Tpo -c -o kv/libkv_a-KeyValueDB.o `test -f 'kv/KeyValueDB.cc' || echo '$(srcdir)/'`kv/KeyValueDB.cc
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) kv/$(DEPDIR)/libkv_a-KeyValueDB.Tpo kv/$(DEPDIR)/libkv_a-KeyValueDB.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='kv/KeyValueDB.cc' object='kv/libkv_a-KeyValueDB.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libkv_a_CXXFLAGS) $(CXXFLAGS) -c -o kv/libkv_a-KeyValueDB.o `test -f 'kv/KeyValueDB.cc' || echo '$(srcdir)/'`kv/KeyValueDB.cc
+
+kv/libkv_a-KeyValueDB.obj: kv/KeyValueDB.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libkv_a_CXXFLAGS) $(CXXFLAGS) -MT kv/libkv_a-KeyValueDB.obj -MD -MP -MF kv/$(DEPDIR)/libkv_a-KeyValueDB.Tpo -c -o kv/libkv_a-KeyValueDB.obj `if test -f 'kv/KeyValueDB.cc'; then $(CYGPATH_W) 'kv/KeyValueDB.cc'; else $(CYGPATH_W) '$(srcdir)/kv/KeyValueDB.cc'; fi`
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) kv/$(DEPDIR)/libkv_a-KeyValueDB.Tpo kv/$(DEPDIR)/libkv_a-KeyValueDB.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='kv/KeyValueDB.cc' object='kv/libkv_a-KeyValueDB.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libkv_a_CXXFLAGS) $(CXXFLAGS) -c -o kv/libkv_a-KeyValueDB.obj `if test -f 'kv/KeyValueDB.cc'; then $(CYGPATH_W) 'kv/KeyValueDB.cc'; else $(CYGPATH_W) '$(srcdir)/kv/KeyValueDB.cc'; fi`
+
+kv/libkv_a-LevelDBStore.o: kv/LevelDBStore.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libkv_a_CXXFLAGS) $(CXXFLAGS) -MT kv/libkv_a-LevelDBStore.o -MD -MP -MF kv/$(DEPDIR)/libkv_a-LevelDBStore.Tpo -c -o kv/libkv_a-LevelDBStore.o `test -f 'kv/LevelDBStore.cc' || echo '$(srcdir)/'`kv/LevelDBStore.cc
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) kv/$(DEPDIR)/libkv_a-LevelDBStore.Tpo kv/$(DEPDIR)/libkv_a-LevelDBStore.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='kv/LevelDBStore.cc' object='kv/libkv_a-LevelDBStore.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libkv_a_CXXFLAGS) $(CXXFLAGS) -c -o kv/libkv_a-LevelDBStore.o `test -f 'kv/LevelDBStore.cc' || echo '$(srcdir)/'`kv/LevelDBStore.cc
+
+kv/libkv_a-LevelDBStore.obj: kv/LevelDBStore.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libkv_a_CXXFLAGS) $(CXXFLAGS) -MT kv/libkv_a-LevelDBStore.obj -MD -MP -MF kv/$(DEPDIR)/libkv_a-LevelDBStore.Tpo -c -o kv/libkv_a-LevelDBStore.obj `if test -f 'kv/LevelDBStore.cc'; then $(CYGPATH_W) 'kv/LevelDBStore.cc'; else $(CYGPATH_W) '$(srcdir)/kv/LevelDBStore.cc'; fi`
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) kv/$(DEPDIR)/libkv_a-LevelDBStore.Tpo kv/$(DEPDIR)/libkv_a-LevelDBStore.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='kv/LevelDBStore.cc' object='kv/libkv_a-LevelDBStore.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libkv_a_CXXFLAGS) $(CXXFLAGS) -c -o kv/libkv_a-LevelDBStore.obj `if test -f 'kv/LevelDBStore.cc'; then $(CYGPATH_W) 'kv/LevelDBStore.cc'; else $(CYGPATH_W) '$(srcdir)/kv/LevelDBStore.cc'; fi`
+
+kv/libkv_a-RocksDBStore.o: kv/RocksDBStore.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libkv_a_CXXFLAGS) $(CXXFLAGS) -MT kv/libkv_a-RocksDBStore.o -MD -MP -MF kv/$(DEPDIR)/libkv_a-RocksDBStore.Tpo -c -o kv/libkv_a-RocksDBStore.o `test -f 'kv/RocksDBStore.cc' || echo '$(srcdir)/'`kv/RocksDBStore.cc
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) kv/$(DEPDIR)/libkv_a-RocksDBStore.Tpo kv/$(DEPDIR)/libkv_a-RocksDBStore.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='kv/RocksDBStore.cc' object='kv/libkv_a-RocksDBStore.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libkv_a_CXXFLAGS) $(CXXFLAGS) -c -o kv/libkv_a-RocksDBStore.o `test -f 'kv/RocksDBStore.cc' || echo '$(srcdir)/'`kv/RocksDBStore.cc
+
+kv/libkv_a-RocksDBStore.obj: kv/RocksDBStore.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libkv_a_CXXFLAGS) $(CXXFLAGS) -MT kv/libkv_a-RocksDBStore.obj -MD -MP -MF kv/$(DEPDIR)/libkv_a-RocksDBStore.Tpo -c -o kv/libkv_a-RocksDBStore.obj `if test -f 'kv/RocksDBStore.cc'; then $(CYGPATH_W) 'kv/RocksDBStore.cc'; else $(CYGPATH_W) '$(srcdir)/kv/RocksDBStore.cc'; fi`
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) kv/$(DEPDIR)/libkv_a-RocksDBStore.Tpo kv/$(DEPDIR)/libkv_a-RocksDBStore.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='kv/RocksDBStore.cc' object='kv/libkv_a-RocksDBStore.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libkv_a_CXXFLAGS) $(CXXFLAGS) -c -o kv/libkv_a-RocksDBStore.obj `if test -f 'kv/RocksDBStore.cc'; then $(CYGPATH_W) 'kv/RocksDBStore.cc'; else $(CYGPATH_W) '$(srcdir)/kv/RocksDBStore.cc'; fi`
+
+kv/libkv_a-KineticStore.o: kv/KineticStore.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libkv_a_CXXFLAGS) $(CXXFLAGS) -MT kv/libkv_a-KineticStore.o -MD -MP -MF kv/$(DEPDIR)/libkv_a-KineticStore.Tpo -c -o kv/libkv_a-KineticStore.o `test -f 'kv/KineticStore.cc' || echo '$(srcdir)/'`kv/KineticStore.cc
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) kv/$(DEPDIR)/libkv_a-KineticStore.Tpo kv/$(DEPDIR)/libkv_a-KineticStore.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='kv/KineticStore.cc' object='kv/libkv_a-KineticStore.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libkv_a_CXXFLAGS) $(CXXFLAGS) -c -o kv/libkv_a-KineticStore.o `test -f 'kv/KineticStore.cc' || echo '$(srcdir)/'`kv/KineticStore.cc
+
+kv/libkv_a-KineticStore.obj: kv/KineticStore.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libkv_a_CXXFLAGS) $(CXXFLAGS) -MT kv/libkv_a-KineticStore.obj -MD -MP -MF kv/$(DEPDIR)/libkv_a-KineticStore.Tpo -c -o kv/libkv_a-KineticStore.obj `if test -f 'kv/KineticStore.cc'; then $(CYGPATH_W) 'kv/KineticStore.cc'; else $(CYGPATH_W) '$(srcdir)/kv/KineticStore.cc'; fi`
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) kv/$(DEPDIR)/libkv_a-KineticStore.Tpo kv/$(DEPDIR)/libkv_a-KineticStore.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='kv/KineticStore.cc' object='kv/libkv_a-KineticStore.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libkv_a_CXXFLAGS) $(CXXFLAGS) -c -o kv/libkv_a-KineticStore.obj `if test -f 'kv/KineticStore.cc'; then $(CYGPATH_W) 'kv/KineticStore.cc'; else $(CYGPATH_W) '$(srcdir)/kv/KineticStore.cc'; fi`
+
+os/libos_types_a-Transaction.o: os/Transaction.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_types_a_CXXFLAGS) $(CXXFLAGS) -MT os/libos_types_a-Transaction.o -MD -MP -MF os/$(DEPDIR)/libos_types_a-Transaction.Tpo -c -o os/libos_types_a-Transaction.o `test -f 'os/Transaction.cc' || echo '$(srcdir)/'`os/Transaction.cc
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) os/$(DEPDIR)/libos_types_a-Transaction.Tpo os/$(DEPDIR)/libos_types_a-Transaction.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='os/Transaction.cc' object='os/libos_types_a-Transaction.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_types_a_CXXFLAGS) $(CXXFLAGS) -c -o os/libos_types_a-Transaction.o `test -f 'os/Transaction.cc' || echo '$(srcdir)/'`os/Transaction.cc
+
+os/libos_types_a-Transaction.obj: os/Transaction.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_types_a_CXXFLAGS) $(CXXFLAGS) -MT os/libos_types_a-Transaction.obj -MD -MP -MF os/$(DEPDIR)/libos_types_a-Transaction.Tpo -c -o os/libos_types_a-Transaction.obj `if test -f 'os/Transaction.cc'; then $(CYGPATH_W) 'os/Transaction.cc'; else $(CYGPATH_W) '$(srcdir)/os/Transaction.cc'; fi`
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) os/$(DEPDIR)/libos_types_a-Transaction.Tpo os/$(DEPDIR)/libos_types_a-Transaction.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='os/Transaction.cc' object='os/libos_types_a-Transaction.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_types_a_CXXFLAGS) $(CXXFLAGS) -c -o os/libos_types_a-Transaction.obj `if test -f 'os/Transaction.cc'; then $(CYGPATH_W) 'os/Transaction.cc'; else $(CYGPATH_W) '$(srcdir)/os/Transaction.cc'; fi`
+
+os/newstore/libos_types_a-newstore_types.o: os/newstore/newstore_types.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_types_a_CXXFLAGS) $(CXXFLAGS) -MT os/newstore/libos_types_a-newstore_types.o -MD -MP -MF os/newstore/$(DEPDIR)/libos_types_a-newstore_types.Tpo -c -o os/newstore/libos_types_a-newstore_types.o `test -f 'os/newstore/newstore_types.cc' || echo '$(srcdir)/'`os/newstore/newstore_types.cc
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) os/newstore/$(DEPDIR)/libos_types_a-newstore_types.Tpo os/newstore/$(DEPDIR)/libos_types_a-newstore_types.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='os/newstore/newstore_types.cc' object='os/newstore/libos_types_a-newstore_types.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_types_a_CXXFLAGS) $(CXXFLAGS) -c -o os/newstore/libos_types_a-newstore_types.o `test -f 'os/newstore/newstore_types.cc' || echo '$(srcdir)/'`os/newstore/newstore_types.cc
+
+os/newstore/libos_types_a-newstore_types.obj: os/newstore/newstore_types.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_types_a_CXXFLAGS) $(CXXFLAGS) -MT os/newstore/libos_types_a-newstore_types.obj -MD -MP -MF os/newstore/$(DEPDIR)/libos_types_a-newstore_types.Tpo -c -o os/newstore/libos_types_a-newstore_types.obj `if test -f 'os/newstore/newstore_types.cc'; then $(CYGPATH_W) 'os/newstore/newstore_types.cc'; else $(CYGPATH_W) '$(srcdir)/os/newstore/newstore_types.cc'; fi`
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) os/newstore/$(DEPDIR)/libos_types_a-newstore_types.Tpo os/newstore/$(DEPDIR)/libos_types_a-newstore_types.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='os/newstore/newstore_types.cc' object='os/newstore/libos_types_a-newstore_types.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_types_a_CXXFLAGS) $(CXXFLAGS) -c -o os/newstore/libos_types_a-newstore_types.obj `if test -f 'os/newstore/newstore_types.cc'; then $(CYGPATH_W) 'os/newstore/newstore_types.cc'; else $(CYGPATH_W) '$(srcdir)/os/newstore/newstore_types.cc'; fi`
+
 os/libos_zfs_a-ZFS.o: os/ZFS.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_zfs_a_CXXFLAGS) $(CXXFLAGS) -MT os/libos_zfs_a-ZFS.o -MD -MP -MF os/$(DEPDIR)/libos_zfs_a-ZFS.Tpo -c -o os/libos_zfs_a-ZFS.o `test -f 'os/ZFS.cc' || echo '$(srcdir)/'`os/ZFS.cc
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) os/$(DEPDIR)/libos_zfs_a-ZFS.Tpo os/$(DEPDIR)/libos_zfs_a-ZFS.Po
@@ -18960,6 +19840,216 @@ os/libos_zfs_a-ZFS.obj: os/ZFS.cc
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_zfs_a_CXXFLAGS) $(CXXFLAGS) -c -o os/libos_zfs_a-ZFS.obj `if test -f 'os/ZFS.cc'; then $(CYGPATH_W) 'os/ZFS.cc'; else $(CYGPATH_W) '$(srcdir)/os/ZFS.cc'; fi`
 
+osd/libosd_a-PG.o: osd/PG.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_a_CXXFLAGS) $(CXXFLAGS) -MT osd/libosd_a-PG.o -MD -MP -MF osd/$(DEPDIR)/libosd_a-PG.Tpo -c -o osd/libosd_a-PG.o `test -f 'osd/PG.cc' || echo '$(srcdir)/'`osd/PG.cc
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) osd/$(DEPDIR)/libosd_a-PG.Tpo osd/$(DEPDIR)/libosd_a-PG.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='osd/PG.cc' object='osd/libosd_a-PG.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_a_CXXFLAGS) $(CXXFLAGS) -c -o osd/libosd_a-PG.o `test -f 'osd/PG.cc' || echo '$(srcdir)/'`osd/PG.cc
+
+osd/libosd_a-PG.obj: osd/PG.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_a_CXXFLAGS) $(CXXFLAGS) -MT osd/libosd_a-PG.obj -MD -MP -MF osd/$(DEPDIR)/libosd_a-PG.Tpo -c -o osd/libosd_a-PG.obj `if test -f 'osd/PG.cc'; then $(CYGPATH_W) 'osd/PG.cc'; else $(CYGPATH_W) '$(srcdir)/osd/PG.cc'; fi`
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) osd/$(DEPDIR)/libosd_a-PG.Tpo osd/$(DEPDIR)/libosd_a-PG.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='osd/PG.cc' object='osd/libosd_a-PG.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_a_CXXFLAGS) $(CXXFLAGS) -c -o osd/libosd_a-PG.obj `if test -f 'osd/PG.cc'; then $(CYGPATH_W) 'osd/PG.cc'; else $(CYGPATH_W) '$(srcdir)/osd/PG.cc'; fi`
+
+osd/libosd_a-ReplicatedPG.o: osd/ReplicatedPG.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_a_CXXFLAGS) $(CXXFLAGS) -MT osd/libosd_a-ReplicatedPG.o -MD -MP -MF osd/$(DEPDIR)/libosd_a-ReplicatedPG.Tpo -c -o osd/libosd_a-ReplicatedPG.o `test -f 'osd/ReplicatedPG.cc' || echo '$(srcdir)/'`osd/ReplicatedPG.cc
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) osd/$(DEPDIR)/libosd_a-ReplicatedPG.Tpo osd/$(DEPDIR)/libosd_a-ReplicatedPG.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='osd/ReplicatedPG.cc' object='osd/libosd_a-ReplicatedPG.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_a_CXXFLAGS) $(CXXFLAGS) -c -o osd/libosd_a-ReplicatedPG.o `test -f 'osd/ReplicatedPG.cc' || echo '$(srcdir)/'`osd/ReplicatedPG.cc
+
+osd/libosd_a-ReplicatedPG.obj: osd/ReplicatedPG.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_a_CXXFLAGS) $(CXXFLAGS) -MT osd/libosd_a-ReplicatedPG.obj -MD -MP -MF osd/$(DEPDIR)/libosd_a-ReplicatedPG.Tpo -c -o osd/libosd_a-ReplicatedPG.obj `if test -f 'osd/ReplicatedPG.cc'; then $(CYGPATH_W) 'osd/ReplicatedPG.cc'; else $(CYGPATH_W) '$(srcdir)/osd/ReplicatedPG.cc'; fi`
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) osd/$(DEPDIR)/libosd_a-ReplicatedPG.Tpo osd/$(DEPDIR)/libosd_a-ReplicatedPG.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='osd/ReplicatedPG.cc' object='osd/libosd_a-ReplicatedPG.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_a_CXXFLAGS) $(CXXFLAGS) -c -o osd/libosd_a-ReplicatedPG.obj `if test -f 'osd/ReplicatedPG.cc'; then $(CYGPATH_W) 'osd/ReplicatedPG.cc'; else $(CYGPATH_W) '$(srcdir)/osd/ReplicatedPG.cc'; fi`
+
+osd/libosd_a-ReplicatedBackend.o: osd/ReplicatedBackend.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_a_CXXFLAGS) $(CXXFLAGS) -MT osd/libosd_a-ReplicatedBackend.o -MD -MP -MF osd/$(DEPDIR)/libosd_a-ReplicatedBackend.Tpo -c -o osd/libosd_a-ReplicatedBackend.o `test -f 'osd/ReplicatedBackend.cc' || echo '$(srcdir)/'`osd/ReplicatedBackend.cc
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) osd/$(DEPDIR)/libosd_a-ReplicatedBackend.Tpo osd/$(DEPDIR)/libosd_a-ReplicatedBackend.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='osd/ReplicatedBackend.cc' object='osd/libosd_a-ReplicatedBackend.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_a_CXXFLAGS) $(CXXFLAGS) -c -o osd/libosd_a-ReplicatedBackend.o `test -f 'osd/ReplicatedBackend.cc' || echo '$(srcdir)/'`osd/ReplicatedBackend.cc
+
+osd/libosd_a-ReplicatedBackend.obj: osd/ReplicatedBackend.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_a_CXXFLAGS) $(CXXFLAGS) -MT osd/libosd_a-ReplicatedBackend.obj -MD -MP -MF osd/$(DEPDIR)/libosd_a-ReplicatedBackend.Tpo -c -o osd/libosd_a-ReplicatedBackend.obj `if test -f 'osd/ReplicatedBackend.cc'; then $(CYGPATH_W) 'osd/ReplicatedBackend.cc'; else $(CYGPATH_W) '$(srcdir)/osd/ReplicatedBackend.cc'; fi`
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) osd/$(DEPDIR)/libosd_a-ReplicatedBackend.Tpo osd/$(DEPDIR)/libosd_a-ReplicatedBackend.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='osd/ReplicatedBackend.cc' object='osd/libosd_a-ReplicatedBackend.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_a_CXXFLAGS) $(CXXFLAGS) -c -o osd/libosd_a-ReplicatedBackend.obj `if test -f 'osd/ReplicatedBackend.cc'; then $(CYGPATH_W) 'osd/ReplicatedBackend.cc'; else $(CYGPATH_W) '$(srcdir)/osd/ReplicatedBackend.cc'; fi`
+
+osd/libosd_a-ECBackend.o: osd/ECBackend.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_a_CXXFLAGS) $(CXXFLAGS) -MT osd/libosd_a-ECBackend.o -MD -MP -MF osd/$(DEPDIR)/libosd_a-ECBackend.Tpo -c -o osd/libosd_a-ECBackend.o `test -f 'osd/ECBackend.cc' || echo '$(srcdir)/'`osd/ECBackend.cc
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) osd/$(DEPDIR)/libosd_a-ECBackend.Tpo osd/$(DEPDIR)/libosd_a-ECBackend.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='osd/ECBackend.cc' object='osd/libosd_a-ECBackend.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_a_CXXFLAGS) $(CXXFLAGS) -c -o osd/libosd_a-ECBackend.o `test -f 'osd/ECBackend.cc' || echo '$(srcdir)/'`osd/ECBackend.cc
+
+osd/libosd_a-ECBackend.obj: osd/ECBackend.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_a_CXXFLAGS) $(CXXFLAGS) -MT osd/libosd_a-ECBackend.obj -MD -MP -MF osd/$(DEPDIR)/libosd_a-ECBackend.Tpo -c -o osd/libosd_a-ECBackend.obj `if test -f 'osd/ECBackend.cc'; then $(CYGPATH_W) 'osd/ECBackend.cc'; else $(CYGPATH_W) '$(srcdir)/osd/ECBackend.cc'; fi`
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) osd/$(DEPDIR)/libosd_a-ECBackend.Tpo osd/$(DEPDIR)/libosd_a-ECBackend.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='osd/ECBackend.cc' object='osd/libosd_a-ECBackend.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_a_CXXFLAGS) $(CXXFLAGS) -c -o osd/libosd_a-ECBackend.obj `if test -f 'osd/ECBackend.cc'; then $(CYGPATH_W) 'osd/ECBackend.cc'; else $(CYGPATH_W) '$(srcdir)/osd/ECBackend.cc'; fi`
+
+osd/libosd_a-ECMsgTypes.o: osd/ECMsgTypes.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_a_CXXFLAGS) $(CXXFLAGS) -MT osd/libosd_a-ECMsgTypes.o -MD -MP -MF osd/$(DEPDIR)/libosd_a-ECMsgTypes.Tpo -c -o osd/libosd_a-ECMsgTypes.o `test -f 'osd/ECMsgTypes.cc' || echo '$(srcdir)/'`osd/ECMsgTypes.cc
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) osd/$(DEPDIR)/libosd_a-ECMsgTypes.Tpo osd/$(DEPDIR)/libosd_a-ECMsgTypes.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='osd/ECMsgTypes.cc' object='osd/libosd_a-ECMsgTypes.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_a_CXXFLAGS) $(CXXFLAGS) -c -o osd/libosd_a-ECMsgTypes.o `test -f 'osd/ECMsgTypes.cc' || echo '$(srcdir)/'`osd/ECMsgTypes.cc
+
+osd/libosd_a-ECMsgTypes.obj: osd/ECMsgTypes.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_a_CXXFLAGS) $(CXXFLAGS) -MT osd/libosd_a-ECMsgTypes.obj -MD -MP -MF osd/$(DEPDIR)/libosd_a-ECMsgTypes.Tpo -c -o osd/libosd_a-ECMsgTypes.obj `if test -f 'osd/ECMsgTypes.cc'; then $(CYGPATH_W) 'osd/ECMsgTypes.cc'; else $(CYGPATH_W) '$(srcdir)/osd/ECMsgTypes.cc'; fi`
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) osd/$(DEPDIR)/libosd_a-ECMsgTypes.Tpo osd/$(DEPDIR)/libosd_a-ECMsgTypes.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='osd/ECMsgTypes.cc' object='osd/libosd_a-ECMsgTypes.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_a_CXXFLAGS) $(CXXFLAGS) -c -o osd/libosd_a-ECMsgTypes.obj `if test -f 'osd/ECMsgTypes.cc'; then $(CYGPATH_W) 'osd/ECMsgTypes.cc'; else $(CYGPATH_W) '$(srcdir)/osd/ECMsgTypes.cc'; fi`
+
+osd/libosd_a-ECTransaction.o: osd/ECTransaction.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_a_CXXFLAGS) $(CXXFLAGS) -MT osd/libosd_a-ECTransaction.o -MD -MP -MF osd/$(DEPDIR)/libosd_a-ECTransaction.Tpo -c -o osd/libosd_a-ECTransaction.o `test -f 'osd/ECTransaction.cc' || echo '$(srcdir)/'`osd/ECTransaction.cc
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) osd/$(DEPDIR)/libosd_a-ECTransaction.Tpo osd/$(DEPDIR)/libosd_a-ECTransaction.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='osd/ECTransaction.cc' object='osd/libosd_a-ECTransaction.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_a_CXXFLAGS) $(CXXFLAGS) -c -o osd/libosd_a-ECTransaction.o `test -f 'osd/ECTransaction.cc' || echo '$(srcdir)/'`osd/ECTransaction.cc
+
+osd/libosd_a-ECTransaction.obj: osd/ECTransaction.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_a_CXXFLAGS) $(CXXFLAGS) -MT osd/libosd_a-ECTransaction.obj -MD -MP -MF osd/$(DEPDIR)/libosd_a-ECTransaction.Tpo -c -o osd/libosd_a-ECTransaction.obj `if test -f 'osd/ECTransaction.cc'; then $(CYGPATH_W) 'osd/ECTransaction.cc'; else $(CYGPATH_W) '$(srcdir)/osd/ECTransaction.cc'; fi`
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) osd/$(DEPDIR)/libosd_a-ECTransaction.Tpo osd/$(DEPDIR)/libosd_a-ECTransaction.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='osd/ECTransaction.cc' object='osd/libosd_a-ECTransaction.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_a_CXXFLAGS) $(CXXFLAGS) -c -o osd/libosd_a-ECTransaction.obj `if test -f 'osd/ECTransaction.cc'; then $(CYGPATH_W) 'osd/ECTransaction.cc'; else $(CYGPATH_W) '$(srcdir)/osd/ECTransaction.cc'; fi`
+
+osd/libosd_a-PGBackend.o: osd/PGBackend.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_a_CXXFLAGS) $(CXXFLAGS) -MT osd/libosd_a-PGBackend.o -MD -MP -MF osd/$(DEPDIR)/libosd_a-PGBackend.Tpo -c -o osd/libosd_a-PGBackend.o `test -f 'osd/PGBackend.cc' || echo '$(srcdir)/'`osd/PGBackend.cc
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) osd/$(DEPDIR)/libosd_a-PGBackend.Tpo osd/$(DEPDIR)/libosd_a-PGBackend.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='osd/PGBackend.cc' object='osd/libosd_a-PGBackend.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_a_CXXFLAGS) $(CXXFLAGS) -c -o osd/libosd_a-PGBackend.o `test -f 'osd/PGBackend.cc' || echo '$(srcdir)/'`osd/PGBackend.cc
+
+osd/libosd_a-PGBackend.obj: osd/PGBackend.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_a_CXXFLAGS) $(CXXFLAGS) -MT osd/libosd_a-PGBackend.obj -MD -MP -MF osd/$(DEPDIR)/libosd_a-PGBackend.Tpo -c -o osd/libosd_a-PGBackend.obj `if test -f 'osd/PGBackend.cc'; then $(CYGPATH_W) 'osd/PGBackend.cc'; else $(CYGPATH_W) '$(srcdir)/osd/PGBackend.cc'; fi`
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) osd/$(DEPDIR)/libosd_a-PGBackend.Tpo osd/$(DEPDIR)/libosd_a-PGBackend.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='osd/PGBackend.cc' object='osd/libosd_a-PGBackend.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_a_CXXFLAGS) $(CXXFLAGS) -c -o osd/libosd_a-PGBackend.obj `if test -f 'osd/PGBackend.cc'; then $(CYGPATH_W) 'osd/PGBackend.cc'; else $(CYGPATH_W) '$(srcdir)/osd/PGBackend.cc'; fi`
+
+osd/libosd_a-HitSet.o: osd/HitSet.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_a_CXXFLAGS) $(CXXFLAGS) -MT osd/libosd_a-HitSet.o -MD -MP -MF osd/$(DEPDIR)/libosd_a-HitSet.Tpo -c -o osd/libosd_a-HitSet.o `test -f 'osd/HitSet.cc' || echo '$(srcdir)/'`osd/HitSet.cc
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) osd/$(DEPDIR)/libosd_a-HitSet.Tpo osd/$(DEPDIR)/libosd_a-HitSet.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='osd/HitSet.cc' object='osd/libosd_a-HitSet.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_a_CXXFLAGS) $(CXXFLAGS) -c -o osd/libosd_a-HitSet.o `test -f 'osd/HitSet.cc' || echo '$(srcdir)/'`osd/HitSet.cc
+
+osd/libosd_a-HitSet.obj: osd/HitSet.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_a_CXXFLAGS) $(CXXFLAGS) -MT osd/libosd_a-HitSet.obj -MD -MP -MF osd/$(DEPDIR)/libosd_a-HitSet.Tpo -c -o osd/libosd_a-HitSet.obj `if test -f 'osd/HitSet.cc'; then $(CYGPATH_W) 'osd/HitSet.cc'; else $(CYGPATH_W) '$(srcdir)/osd/HitSet.cc'; fi`
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) osd/$(DEPDIR)/libosd_a-HitSet.Tpo osd/$(DEPDIR)/libosd_a-HitSet.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='osd/HitSet.cc' object='osd/libosd_a-HitSet.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_a_CXXFLAGS) $(CXXFLAGS) -c -o osd/libosd_a-HitSet.obj `if test -f 'osd/HitSet.cc'; then $(CYGPATH_W) 'osd/HitSet.cc'; else $(CYGPATH_W) '$(srcdir)/osd/HitSet.cc'; fi`
+
+osd/libosd_a-OSD.o: osd/OSD.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_a_CXXFLAGS) $(CXXFLAGS) -MT osd/libosd_a-OSD.o -MD -MP -MF osd/$(DEPDIR)/libosd_a-OSD.Tpo -c -o osd/libosd_a-OSD.o `test -f 'osd/OSD.cc' || echo '$(srcdir)/'`osd/OSD.cc
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) osd/$(DEPDIR)/libosd_a-OSD.Tpo osd/$(DEPDIR)/libosd_a-OSD.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='osd/OSD.cc' object='osd/libosd_a-OSD.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_a_CXXFLAGS) $(CXXFLAGS) -c -o osd/libosd_a-OSD.o `test -f 'osd/OSD.cc' || echo '$(srcdir)/'`osd/OSD.cc
+
+osd/libosd_a-OSD.obj: osd/OSD.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_a_CXXFLAGS) $(CXXFLAGS) -MT osd/libosd_a-OSD.obj -MD -MP -MF osd/$(DEPDIR)/libosd_a-OSD.Tpo -c -o osd/libosd_a-OSD.obj `if test -f 'osd/OSD.cc'; then $(CYGPATH_W) 'osd/OSD.cc'; else $(CYGPATH_W) '$(srcdir)/osd/OSD.cc'; fi`
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) osd/$(DEPDIR)/libosd_a-OSD.Tpo osd/$(DEPDIR)/libosd_a-OSD.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='osd/OSD.cc' object='osd/libosd_a-OSD.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_a_CXXFLAGS) $(CXXFLAGS) -c -o osd/libosd_a-OSD.obj `if test -f 'osd/OSD.cc'; then $(CYGPATH_W) 'osd/OSD.cc'; else $(CYGPATH_W) '$(srcdir)/osd/OSD.cc'; fi`
+
+osd/libosd_a-OSDCap.o: osd/OSDCap.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_a_CXXFLAGS) $(CXXFLAGS) -MT osd/libosd_a-OSDCap.o -MD -MP -MF osd/$(DEPDIR)/libosd_a-OSDCap.Tpo -c -o osd/libosd_a-OSDCap.o `test -f 'osd/OSDCap.cc' || echo '$(srcdir)/'`osd/OSDCap.cc
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) osd/$(DEPDIR)/libosd_a-OSDCap.Tpo osd/$(DEPDIR)/libosd_a-OSDCap.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='osd/OSDCap.cc' object='osd/libosd_a-OSDCap.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_a_CXXFLAGS) $(CXXFLAGS) -c -o osd/libosd_a-OSDCap.o `test -f 'osd/OSDCap.cc' || echo '$(srcdir)/'`osd/OSDCap.cc
+
+osd/libosd_a-OSDCap.obj: osd/OSDCap.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_a_CXXFLAGS) $(CXXFLAGS) -MT osd/libosd_a-OSDCap.obj -MD -MP -MF osd/$(DEPDIR)/libosd_a-OSDCap.Tpo -c -o osd/libosd_a-OSDCap.obj `if test -f 'osd/OSDCap.cc'; then $(CYGPATH_W) 'osd/OSDCap.cc'; else $(CYGPATH_W) '$(srcdir)/osd/OSDCap.cc'; fi`
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) osd/$(DEPDIR)/libosd_a-OSDCap.Tpo osd/$(DEPDIR)/libosd_a-OSDCap.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='osd/OSDCap.cc' object='osd/libosd_a-OSDCap.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_a_CXXFLAGS) $(CXXFLAGS) -c -o osd/libosd_a-OSDCap.obj `if test -f 'osd/OSDCap.cc'; then $(CYGPATH_W) 'osd/OSDCap.cc'; else $(CYGPATH_W) '$(srcdir)/osd/OSDCap.cc'; fi`
+
+osd/libosd_a-Watch.o: osd/Watch.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_a_CXXFLAGS) $(CXXFLAGS) -MT osd/libosd_a-Watch.o -MD -MP -MF osd/$(DEPDIR)/libosd_a-Watch.Tpo -c -o osd/libosd_a-Watch.o `test -f 'osd/Watch.cc' || echo '$(srcdir)/'`osd/Watch.cc
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) osd/$(DEPDIR)/libosd_a-Watch.Tpo osd/$(DEPDIR)/libosd_a-Watch.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='osd/Watch.cc' object='osd/libosd_a-Watch.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_a_CXXFLAGS) $(CXXFLAGS) -c -o osd/libosd_a-Watch.o `test -f 'osd/Watch.cc' || echo '$(srcdir)/'`osd/Watch.cc
+
+osd/libosd_a-Watch.obj: osd/Watch.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_a_CXXFLAGS) $(CXXFLAGS) -MT osd/libosd_a-Watch.obj -MD -MP -MF osd/$(DEPDIR)/libosd_a-Watch.Tpo -c -o osd/libosd_a-Watch.obj `if test -f 'osd/Watch.cc'; then $(CYGPATH_W) 'osd/Watch.cc'; else $(CYGPATH_W) '$(srcdir)/osd/Watch.cc'; fi`
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) osd/$(DEPDIR)/libosd_a-Watch.Tpo osd/$(DEPDIR)/libosd_a-Watch.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='osd/Watch.cc' object='osd/libosd_a-Watch.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_a_CXXFLAGS) $(CXXFLAGS) -c -o osd/libosd_a-Watch.obj `if test -f 'osd/Watch.cc'; then $(CYGPATH_W) 'osd/Watch.cc'; else $(CYGPATH_W) '$(srcdir)/osd/Watch.cc'; fi`
+
+osd/libosd_a-ClassHandler.o: osd/ClassHandler.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_a_CXXFLAGS) $(CXXFLAGS) -MT osd/libosd_a-ClassHandler.o -MD -MP -MF osd/$(DEPDIR)/libosd_a-ClassHandler.Tpo -c -o osd/libosd_a-ClassHandler.o `test -f 'osd/ClassHandler.cc' || echo '$(srcdir)/'`osd/ClassHandler.cc
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) osd/$(DEPDIR)/libosd_a-ClassHandler.Tpo osd/$(DEPDIR)/libosd_a-ClassHandler.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='osd/ClassHandler.cc' object='osd/libosd_a-ClassHandler.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_a_CXXFLAGS) $(CXXFLAGS) -c -o osd/libosd_a-ClassHandler.o `test -f 'osd/ClassHandler.cc' || echo '$(srcdir)/'`osd/ClassHandler.cc
+
+osd/libosd_a-ClassHandler.obj: osd/ClassHandler.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_a_CXXFLAGS) $(CXXFLAGS) -MT osd/libosd_a-ClassHandler.obj -MD -MP -MF osd/$(DEPDIR)/libosd_a-ClassHandler.Tpo -c -o osd/libosd_a-ClassHandler.obj `if test -f 'osd/ClassHandler.cc'; then $(CYGPATH_W) 'osd/ClassHandler.cc'; else $(CYGPATH_W) '$(srcdir)/osd/ClassHandler.cc'; fi`
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) osd/$(DEPDIR)/libosd_a-ClassHandler.Tpo osd/$(DEPDIR)/libosd_a-ClassHandler.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='osd/ClassHandler.cc' object='osd/libosd_a-ClassHandler.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_a_CXXFLAGS) $(CXXFLAGS) -c -o osd/libosd_a-ClassHandler.obj `if test -f 'osd/ClassHandler.cc'; then $(CYGPATH_W) 'osd/ClassHandler.cc'; else $(CYGPATH_W) '$(srcdir)/osd/ClassHandler.cc'; fi`
+
+osd/libosd_a-OpRequest.o: osd/OpRequest.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_a_CXXFLAGS) $(CXXFLAGS) -MT osd/libosd_a-OpRequest.o -MD -MP -MF osd/$(DEPDIR)/libosd_a-OpRequest.Tpo -c -o osd/libosd_a-OpRequest.o `test -f 'osd/OpRequest.cc' || echo '$(srcdir)/'`osd/OpRequest.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) osd/$(DEPDIR)/libosd_a-OpRequest.Tpo osd/$(DEPDIR)/libosd_a-OpRequest.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='osd/OpRequest.cc' object='osd/libosd_a-OpRequest.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_a_CXXFLAGS) $(CXXFLAGS) -c -o osd/libosd_a-OpRequest.o `test -f 'osd/OpRequest.cc' || echo '$(srcdir)/'`osd/OpRequest.cc
+
+osd/libosd_a-OpRequest.obj: osd/OpRequest.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_a_CXXFLAGS) $(CXXFLAGS) -MT osd/libosd_a-OpRequest.obj -MD -MP -MF osd/$(DEPDIR)/libosd_a-OpRequest.Tpo -c -o osd/libosd_a-OpRequest.obj `if test -f 'osd/OpRequest.cc'; then $(CYGPATH_W) 'osd/OpRequest.cc'; else $(CYGPATH_W) '$(srcdir)/osd/OpRequest.cc'; fi`
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) osd/$(DEPDIR)/libosd_a-OpRequest.Tpo osd/$(DEPDIR)/libosd_a-OpRequest.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='osd/OpRequest.cc' object='osd/libosd_a-OpRequest.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_a_CXXFLAGS) $(CXXFLAGS) -c -o osd/libosd_a-OpRequest.obj `if test -f 'osd/OpRequest.cc'; then $(CYGPATH_W) 'osd/OpRequest.cc'; else $(CYGPATH_W) '$(srcdir)/osd/OpRequest.cc'; fi`
+
+osd/libosd_a-SnapMapper.o: osd/SnapMapper.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_a_CXXFLAGS) $(CXXFLAGS) -MT osd/libosd_a-SnapMapper.o -MD -MP -MF osd/$(DEPDIR)/libosd_a-SnapMapper.Tpo -c -o osd/libosd_a-SnapMapper.o `test -f 'osd/SnapMapper.cc' || echo '$(srcdir)/'`osd/SnapMapper.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) osd/$(DEPDIR)/libosd_a-SnapMapper.Tpo osd/$(DEPDIR)/libosd_a-SnapMapper.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='osd/SnapMapper.cc' object='osd/libosd_a-SnapMapper.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_a_CXXFLAGS) $(CXXFLAGS) -c -o osd/libosd_a-SnapMapper.o `test -f 'osd/SnapMapper.cc' || echo '$(srcdir)/'`osd/SnapMapper.cc
+
+osd/libosd_a-SnapMapper.obj: osd/SnapMapper.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_a_CXXFLAGS) $(CXXFLAGS) -MT osd/libosd_a-SnapMapper.obj -MD -MP -MF osd/$(DEPDIR)/libosd_a-SnapMapper.Tpo -c -o osd/libosd_a-SnapMapper.obj `if test -f 'osd/SnapMapper.cc'; then $(CYGPATH_W) 'osd/SnapMapper.cc'; else $(CYGPATH_W) '$(srcdir)/osd/SnapMapper.cc'; fi`
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) osd/$(DEPDIR)/libosd_a-SnapMapper.Tpo osd/$(DEPDIR)/libosd_a-SnapMapper.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='osd/SnapMapper.cc' object='osd/libosd_a-SnapMapper.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_a_CXXFLAGS) $(CXXFLAGS) -c -o osd/libosd_a-SnapMapper.obj `if test -f 'osd/SnapMapper.cc'; then $(CYGPATH_W) 'osd/SnapMapper.cc'; else $(CYGPATH_W) '$(srcdir)/osd/SnapMapper.cc'; fi`
+
+objclass/libosd_a-class_api.o: objclass/class_api.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_a_CXXFLAGS) $(CXXFLAGS) -MT objclass/libosd_a-class_api.o -MD -MP -MF objclass/$(DEPDIR)/libosd_a-class_api.Tpo -c -o objclass/libosd_a-class_api.o `test -f 'objclass/class_api.cc' || echo '$(srcdir)/'`objclass/class_api.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) objclass/$(DEPDIR)/libosd_a-class_api.Tpo objclass/$(DEPDIR)/libosd_a-class_api.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='objclass/class_api.cc' object='objclass/libosd_a-class_api.o' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_a_CXXFLAGS) $(CXXFLAGS) -c -o objclass/libosd_a-class_api.o `test -f 'objclass/class_api.cc' || echo '$(srcdir)/'`objclass/class_api.cc
+
+objclass/libosd_a-class_api.obj: objclass/class_api.cc
+ at am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_a_CXXFLAGS) $(CXXFLAGS) -MT objclass/libosd_a-class_api.obj -MD -MP -MF objclass/$(DEPDIR)/libosd_a-class_api.Tpo -c -o objclass/libosd_a-class_api.obj `if test -f 'objclass/class_api.cc'; then $(CYGPATH_W) 'objclass/class_api.cc'; else $(CYGPATH_W) '$(srcdir)/objclass/class_api.cc'; fi`
+ at am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) objclass/$(DEPDIR)/libosd_a-class_api.Tpo objclass/$(DEPDIR)/libosd_a-class_api.Po
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='objclass/class_api.cc' object='objclass/libosd_a-class_api.obj' libtool=no @AMDEPBACKSLASH@
+ at AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+ at am__fastdepCXX_FALSE@	$(AM_V_CXX at am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_a_CXXFLAGS) $(CXXFLAGS) -c -o objclass/libosd_a-class_api.obj `if test -f 'objclass/class_api.cc'; then $(CYGPATH_W) 'objclass/class_api.cc'; else $(CYGPATH_W) '$(srcdir)/objclass/class_api.cc'; fi`
+
 java/native/libcephfs_jni_la-libcephfs_jni.lo: java/native/libcephfs_jni.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libcephfs_jni_la_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT java/native/libcephfs_jni_la-libcephfs_jni.lo -MD -MP -MF java/native/$(DEPDIR)/libcephfs_jni_la-libcephfs_jni.Tpo -c -o java/native/libcephfs_jni_la-libcephfs_jni.lo `test -f 'java/native/libcephfs_jni.cc' || echo '$(srcdir)/'`java/native/libcephfs_jni.cc
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) java/native/$(DEPDIR)/libcephfs_jni_la-libcephfs_jni.Tpo java/native/$(DEPDIR)/libcephfs_jni_la-libcephfs_jni.Plo
@@ -19380,307 +20470,6 @@ test/erasure-code/libec_test_shec_sse4_la-TestShecPluginSSE4.lo: test/erasure-co
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libec_test_shec_sse4_la_CXXFLAGS) $(CXXFLAGS) -c -o test/erasure-code/libec_test_shec_sse4_la-TestShecPluginSSE4.lo `test -f 'test/erasure-code/TestShecPluginSSE4.cc' || echo '$(srcdir)/'`test/erasure-code/TestShecPluginSSE4.cc
 
-os/libos_la-chain_xattr.lo: os/chain_xattr.cc
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -MT os/libos_la-chain_xattr.lo -MD -MP -MF os/$(DEPDIR)/libos_la-chain_xattr.Tpo -c -o os/libos_la-chain_xattr.lo `test -f 'os/chain_xattr.cc' || echo '$(srcdir)/'`os/chain_xattr.cc
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) os/$(DEPDIR)/libos_la-chain_xattr.Tpo os/$(DEPDIR)/libos_la-chain_xattr.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='os/chain_xattr.cc' object='os/libos_la-chain_xattr.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -c -o os/libos_la-chain_xattr.lo `test -f 'os/chain_xattr.cc' || echo '$(srcdir)/'`os/chain_xattr.cc
-
-os/fs/libos_la-FS.lo: os/fs/FS.cc
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -MT os/fs/libos_la-FS.lo -MD -MP -MF os/fs/$(DEPDIR)/libos_la-FS.Tpo -c -o os/fs/libos_la-FS.lo `test -f 'os/fs/FS.cc' || echo '$(srcdir)/'`os/fs/FS.cc
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) os/fs/$(DEPDIR)/libos_la-FS.Tpo os/fs/$(DEPDIR)/libos_la-FS.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='os/fs/FS.cc' object='os/fs/libos_la-FS.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -c -o os/fs/libos_la-FS.lo `test -f 'os/fs/FS.cc' || echo '$(srcdir)/'`os/fs/FS.cc
-
-os/libos_la-DBObjectMap.lo: os/DBObjectMap.cc
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -MT os/libos_la-DBObjectMap.lo -MD -MP -MF os/$(DEPDIR)/libos_la-DBObjectMap.Tpo -c -o os/libos_la-DBObjectMap.lo `test -f 'os/DBObjectMap.cc' || echo '$(srcdir)/'`os/DBObjectMap.cc
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) os/$(DEPDIR)/libos_la-DBObjectMap.Tpo os/$(DEPDIR)/libos_la-DBObjectMap.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='os/DBObjectMap.cc' object='os/libos_la-DBObjectMap.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -c -o os/libos_la-DBObjectMap.lo `test -f 'os/DBObjectMap.cc' || echo '$(srcdir)/'`os/DBObjectMap.cc
-
-os/libos_la-GenericObjectMap.lo: os/GenericObjectMap.cc
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -MT os/libos_la-GenericObjectMap.lo -MD -MP -MF os/$(DEPDIR)/libos_la-GenericObjectMap.Tpo -c -o os/libos_la-GenericObjectMap.lo `test -f 'os/GenericObjectMap.cc' || echo '$(srcdir)/'`os/GenericObjectMap.cc
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) os/$(DEPDIR)/libos_la-GenericObjectMap.Tpo os/$(DEPDIR)/libos_la-GenericObjectMap.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='os/GenericObjectMap.cc' object='os/libos_la-GenericObjectMap.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -c -o os/libos_la-GenericObjectMap.lo `test -f 'os/GenericObjectMap.cc' || echo '$(srcdir)/'`os/GenericObjectMap.cc
-
-os/libos_la-FileJournal.lo: os/FileJournal.cc
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -MT os/libos_la-FileJournal.lo -MD -MP -MF os/$(DEPDIR)/libos_la-FileJournal.Tpo -c -o os/libos_la-FileJournal.lo `test -f 'os/FileJournal.cc' || echo '$(srcdir)/'`os/FileJournal.cc
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) os/$(DEPDIR)/libos_la-FileJournal.Tpo os/$(DEPDIR)/libos_la-FileJournal.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='os/FileJournal.cc' object='os/libos_la-FileJournal.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -c -o os/libos_la-FileJournal.lo `test -f 'os/FileJournal.cc' || echo '$(srcdir)/'`os/FileJournal.cc
-
-os/libos_la-FileStore.lo: os/FileStore.cc
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -MT os/libos_la-FileStore.lo -MD -MP -MF os/$(DEPDIR)/libos_la-FileStore.Tpo -c -o os/libos_la-FileStore.lo `test -f 'os/FileStore.cc' || echo '$(srcdir)/'`os/FileStore.cc
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) os/$(DEPDIR)/libos_la-FileStore.Tpo os/$(DEPDIR)/libos_la-FileStore.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='os/FileStore.cc' object='os/libos_la-FileStore.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -c -o os/libos_la-FileStore.lo `test -f 'os/FileStore.cc' || echo '$(srcdir)/'`os/FileStore.cc
-
-os/libos_la-GenericFileStoreBackend.lo: os/GenericFileStoreBackend.cc
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -MT os/libos_la-GenericFileStoreBackend.lo -MD -MP -MF os/$(DEPDIR)/libos_la-GenericFileStoreBackend.Tpo -c -o os/libos_la-GenericFileStoreBackend.lo `test -f 'os/GenericFileStoreBackend.cc' || echo '$(srcdir)/'`os/GenericFileStoreBackend.cc
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) os/$(DEPDIR)/libos_la-GenericFileStoreBackend.Tpo os/$(DEPDIR)/libos_la-GenericFileStoreBackend.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='os/GenericFileStoreBackend.cc' object='os/libos_la-GenericFileStoreBackend.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -c -o os/libos_la-GenericFileStoreBackend.lo `test -f 'os/GenericFileStoreBackend.cc' || echo '$(srcdir)/'`os/GenericFileStoreBackend.cc
-
-os/libos_la-HashIndex.lo: os/HashIndex.cc
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -MT os/libos_la-HashIndex.lo -MD -MP -MF os/$(DEPDIR)/libos_la-HashIndex.Tpo -c -o os/libos_la-HashIndex.lo `test -f 'os/HashIndex.cc' || echo '$(srcdir)/'`os/HashIndex.cc
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) os/$(DEPDIR)/libos_la-HashIndex.Tpo os/$(DEPDIR)/libos_la-HashIndex.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='os/HashIndex.cc' object='os/libos_la-HashIndex.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -c -o os/libos_la-HashIndex.lo `test -f 'os/HashIndex.cc' || echo '$(srcdir)/'`os/HashIndex.cc
-
-os/libos_la-IndexManager.lo: os/IndexManager.cc
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -MT os/libos_la-IndexManager.lo -MD -MP -MF os/$(DEPDIR)/libos_la-IndexManager.Tpo -c -o os/libos_la-IndexManager.lo `test -f 'os/IndexManager.cc' || echo '$(srcdir)/'`os/IndexManager.cc
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) os/$(DEPDIR)/libos_la-IndexManager.Tpo os/$(DEPDIR)/libos_la-IndexManager.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='os/IndexManager.cc' object='os/libos_la-IndexManager.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -c -o os/libos_la-IndexManager.lo `test -f 'os/IndexManager.cc' || echo '$(srcdir)/'`os/IndexManager.cc
-
-os/libos_la-JournalingObjectStore.lo: os/JournalingObjectStore.cc
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -MT os/libos_la-JournalingObjectStore.lo -MD -MP -MF os/$(DEPDIR)/libos_la-JournalingObjectStore.Tpo -c -o os/libos_la-JournalingObjectStore.lo `test -f 'os/JournalingObjectStore.cc' || echo '$(srcdir)/'`os/JournalingObjectStore.cc
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) os/$(DEPDIR)/libos_la-JournalingObjectStore.Tpo os/$(DEPDIR)/libos_la-JournalingObjectStore.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='os/JournalingObjectStore.cc' object='os/libos_la-JournalingObjectStore.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -c -o os/libos_la-JournalingObjectStore.lo `test -f 'os/JournalingObjectStore.cc' || echo '$(srcdir)/'`os/JournalingObjectStore.cc
-
-os/libos_la-LevelDBStore.lo: os/LevelDBStore.cc
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -MT os/libos_la-LevelDBStore.lo -MD -MP -MF os/$(DEPDIR)/libos_la-LevelDBStore.Tpo -c -o os/libos_la-LevelDBStore.lo `test -f 'os/LevelDBStore.cc' || echo '$(srcdir)/'`os/LevelDBStore.cc
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) os/$(DEPDIR)/libos_la-LevelDBStore.Tpo os/$(DEPDIR)/libos_la-LevelDBStore.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='os/LevelDBStore.cc' object='os/libos_la-LevelDBStore.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -c -o os/libos_la-LevelDBStore.lo `test -f 'os/LevelDBStore.cc' || echo '$(srcdir)/'`os/LevelDBStore.cc
-
-os/libos_la-LFNIndex.lo: os/LFNIndex.cc
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -MT os/libos_la-LFNIndex.lo -MD -MP -MF os/$(DEPDIR)/libos_la-LFNIndex.Tpo -c -o os/libos_la-LFNIndex.lo `test -f 'os/LFNIndex.cc' || echo '$(srcdir)/'`os/LFNIndex.cc
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) os/$(DEPDIR)/libos_la-LFNIndex.Tpo os/$(DEPDIR)/libos_la-LFNIndex.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='os/LFNIndex.cc' object='os/libos_la-LFNIndex.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -c -o os/libos_la-LFNIndex.lo `test -f 'os/LFNIndex.cc' || echo '$(srcdir)/'`os/LFNIndex.cc
-
-os/libos_la-MemStore.lo: os/MemStore.cc
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -MT os/libos_la-MemStore.lo -MD -MP -MF os/$(DEPDIR)/libos_la-MemStore.Tpo -c -o os/libos_la-MemStore.lo `test -f 'os/MemStore.cc' || echo '$(srcdir)/'`os/MemStore.cc
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) os/$(DEPDIR)/libos_la-MemStore.Tpo os/$(DEPDIR)/libos_la-MemStore.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='os/MemStore.cc' object='os/libos_la-MemStore.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -c -o os/libos_la-MemStore.lo `test -f 'os/MemStore.cc' || echo '$(srcdir)/'`os/MemStore.cc
-
-os/libos_la-KeyValueDB.lo: os/KeyValueDB.cc
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -MT os/libos_la-KeyValueDB.lo -MD -MP -MF os/$(DEPDIR)/libos_la-KeyValueDB.Tpo -c -o os/libos_la-KeyValueDB.lo `test -f 'os/KeyValueDB.cc' || echo '$(srcdir)/'`os/KeyValueDB.cc
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) os/$(DEPDIR)/libos_la-KeyValueDB.Tpo os/$(DEPDIR)/libos_la-KeyValueDB.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='os/KeyValueDB.cc' object='os/libos_la-KeyValueDB.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -c -o os/libos_la-KeyValueDB.lo `test -f 'os/KeyValueDB.cc' || echo '$(srcdir)/'`os/KeyValueDB.cc
-
-os/libos_la-KeyValueStore.lo: os/KeyValueStore.cc
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -MT os/libos_la-KeyValueStore.lo -MD -MP -MF os/$(DEPDIR)/libos_la-KeyValueStore.Tpo -c -o os/libos_la-KeyValueStore.lo `test -f 'os/KeyValueStore.cc' || echo '$(srcdir)/'`os/KeyValueStore.cc
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) os/$(DEPDIR)/libos_la-KeyValueStore.Tpo os/$(DEPDIR)/libos_la-KeyValueStore.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='os/KeyValueStore.cc' object='os/libos_la-KeyValueStore.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -c -o os/libos_la-KeyValueStore.lo `test -f 'os/KeyValueStore.cc' || echo '$(srcdir)/'`os/KeyValueStore.cc
-
-os/libos_la-ObjectStore.lo: os/ObjectStore.cc
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -MT os/libos_la-ObjectStore.lo -MD -MP -MF os/$(DEPDIR)/libos_la-ObjectStore.Tpo -c -o os/libos_la-ObjectStore.lo `test -f 'os/ObjectStore.cc' || echo '$(srcdir)/'`os/ObjectStore.cc
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) os/$(DEPDIR)/libos_la-ObjectStore.Tpo os/$(DEPDIR)/libos_la-ObjectStore.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='os/ObjectStore.cc' object='os/libos_la-ObjectStore.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -c -o os/libos_la-ObjectStore.lo `test -f 'os/ObjectStore.cc' || echo '$(srcdir)/'`os/ObjectStore.cc
-
-os/libos_la-WBThrottle.lo: os/WBThrottle.cc
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -MT os/libos_la-WBThrottle.lo -MD -MP -MF os/$(DEPDIR)/libos_la-WBThrottle.Tpo -c -o os/libos_la-WBThrottle.lo `test -f 'os/WBThrottle.cc' || echo '$(srcdir)/'`os/WBThrottle.cc
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) os/$(DEPDIR)/libos_la-WBThrottle.Tpo os/$(DEPDIR)/libos_la-WBThrottle.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='os/WBThrottle.cc' object='os/libos_la-WBThrottle.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -c -o os/libos_la-WBThrottle.lo `test -f 'os/WBThrottle.cc' || echo '$(srcdir)/'`os/WBThrottle.cc
-
-common/libos_la-TrackedOp.lo: common/TrackedOp.cc
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -MT common/libos_la-TrackedOp.lo -MD -MP -MF common/$(DEPDIR)/libos_la-TrackedOp.Tpo -c -o common/libos_la-TrackedOp.lo `test -f 'common/TrackedOp.cc' || echo '$(srcdir)/'`common/TrackedOp.cc
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) common/$(DEPDIR)/libos_la-TrackedOp.Tpo common/$(DEPDIR)/libos_la-TrackedOp.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='common/TrackedOp.cc' object='common/libos_la-TrackedOp.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -c -o common/libos_la-TrackedOp.lo `test -f 'common/TrackedOp.cc' || echo '$(srcdir)/'`common/TrackedOp.cc
-
-os/libos_la-BtrfsFileStoreBackend.lo: os/BtrfsFileStoreBackend.cc
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -MT os/libos_la-BtrfsFileStoreBackend.lo -MD -MP -MF os/$(DEPDIR)/libos_la-BtrfsFileStoreBackend.Tpo -c -o os/libos_la-BtrfsFileStoreBackend.lo `test -f 'os/BtrfsFileStoreBackend.cc' || echo '$(srcdir)/'`os/BtrfsFileStoreBackend.cc
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) os/$(DEPDIR)/libos_la-BtrfsFileStoreBackend.Tpo os/$(DEPDIR)/libos_la-BtrfsFileStoreBackend.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='os/BtrfsFileStoreBackend.cc' object='os/libos_la-BtrfsFileStoreBackend.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -c -o os/libos_la-BtrfsFileStoreBackend.lo `test -f 'os/BtrfsFileStoreBackend.cc' || echo '$(srcdir)/'`os/BtrfsFileStoreBackend.cc
-
-os/newstore/libos_la-NewStore.lo: os/newstore/NewStore.cc
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -MT os/newstore/libos_la-NewStore.lo -MD -MP -MF os/newstore/$(DEPDIR)/libos_la-NewStore.Tpo -c -o os/newstore/libos_la-NewStore.lo `test -f 'os/newstore/NewStore.cc' || echo '$(srcdir)/'`os/newstore/NewStore.cc
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) os/newstore/$(DEPDIR)/libos_la-NewStore.Tpo os/newstore/$(DEPDIR)/libos_la-NewStore.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='os/newstore/NewStore.cc' object='os/newstore/libos_la-NewStore.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -c -o os/newstore/libos_la-NewStore.lo `test -f 'os/newstore/NewStore.cc' || echo '$(srcdir)/'`os/newstore/NewStore.cc
-
-os/fs/libos_la-XFS.lo: os/fs/XFS.cc
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -MT os/fs/libos_la-XFS.lo -MD -MP -MF os/fs/$(DEPDIR)/libos_la-XFS.Tpo -c -o os/fs/libos_la-XFS.lo `test -f 'os/fs/XFS.cc' || echo '$(srcdir)/'`os/fs/XFS.cc
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) os/fs/$(DEPDIR)/libos_la-XFS.Tpo os/fs/$(DEPDIR)/libos_la-XFS.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='os/fs/XFS.cc' object='os/fs/libos_la-XFS.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -c -o os/fs/libos_la-XFS.lo `test -f 'os/fs/XFS.cc' || echo '$(srcdir)/'`os/fs/XFS.cc
-
-os/libos_la-XfsFileStoreBackend.lo: os/XfsFileStoreBackend.cc
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -MT os/libos_la-XfsFileStoreBackend.lo -MD -MP -MF os/$(DEPDIR)/libos_la-XfsFileStoreBackend.Tpo -c -o os/libos_la-XfsFileStoreBackend.lo `test -f 'os/XfsFileStoreBackend.cc' || echo '$(srcdir)/'`os/XfsFileStoreBackend.cc
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) os/$(DEPDIR)/libos_la-XfsFileStoreBackend.Tpo os/$(DEPDIR)/libos_la-XfsFileStoreBackend.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='os/XfsFileStoreBackend.cc' object='os/libos_la-XfsFileStoreBackend.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -c -o os/libos_la-XfsFileStoreBackend.lo `test -f 'os/XfsFileStoreBackend.cc' || echo '$(srcdir)/'`os/XfsFileStoreBackend.cc
-
-os/libos_la-ZFSFileStoreBackend.lo: os/ZFSFileStoreBackend.cc
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -MT os/libos_la-ZFSFileStoreBackend.lo -MD -MP -MF os/$(DEPDIR)/libos_la-ZFSFileStoreBackend.Tpo -c -o os/libos_la-ZFSFileStoreBackend.lo `test -f 'os/ZFSFileStoreBackend.cc' || echo '$(srcdir)/'`os/ZFSFileStoreBackend.cc
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) os/$(DEPDIR)/libos_la-ZFSFileStoreBackend.Tpo os/$(DEPDIR)/libos_la-ZFSFileStoreBackend.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='os/ZFSFileStoreBackend.cc' object='os/libos_la-ZFSFileStoreBackend.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -c -o os/libos_la-ZFSFileStoreBackend.lo `test -f 'os/ZFSFileStoreBackend.cc' || echo '$(srcdir)/'`os/ZFSFileStoreBackend.cc
-
-os/libos_la-KineticStore.lo: os/KineticStore.cc
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -MT os/libos_la-KineticStore.lo -MD -MP -MF os/$(DEPDIR)/libos_la-KineticStore.Tpo -c -o os/libos_la-KineticStore.lo `test -f 'os/KineticStore.cc' || echo '$(srcdir)/'`os/KineticStore.cc
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) os/$(DEPDIR)/libos_la-KineticStore.Tpo os/$(DEPDIR)/libos_la-KineticStore.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='os/KineticStore.cc' object='os/libos_la-KineticStore.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_la_CXXFLAGS) $(CXXFLAGS) -c -o os/libos_la-KineticStore.lo `test -f 'os/KineticStore.cc' || echo '$(srcdir)/'`os/KineticStore.cc
-
-os/libos_rocksdb_la-RocksDBStore.lo: os/RocksDBStore.cc
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_rocksdb_la_CXXFLAGS) $(CXXFLAGS) -MT os/libos_rocksdb_la-RocksDBStore.lo -MD -MP -MF os/$(DEPDIR)/libos_rocksdb_la-RocksDBStore.Tpo -c -o os/libos_rocksdb_la-RocksDBStore.lo `test -f 'os/RocksDBStore.cc' || echo '$(srcdir)/'`os/RocksDBStore.cc
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) os/$(DEPDIR)/libos_rocksdb_la-RocksDBStore.Tpo os/$(DEPDIR)/libos_rocksdb_la-RocksDBStore.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='os/RocksDBStore.cc' object='os/libos_rocksdb_la-RocksDBStore.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_rocksdb_la_CXXFLAGS) $(CXXFLAGS) -c -o os/libos_rocksdb_la-RocksDBStore.lo `test -f 'os/RocksDBStore.cc' || echo '$(srcdir)/'`os/RocksDBStore.cc
-
-os/libos_types_la-Transaction.lo: os/Transaction.cc
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_types_la_CXXFLAGS) $(CXXFLAGS) -MT os/libos_types_la-Transaction.lo -MD -MP -MF os/$(DEPDIR)/libos_types_la-Transaction.Tpo -c -o os/libos_types_la-Transaction.lo `test -f 'os/Transaction.cc' || echo '$(srcdir)/'`os/Transaction.cc
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) os/$(DEPDIR)/libos_types_la-Transaction.Tpo os/$(DEPDIR)/libos_types_la-Transaction.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='os/Transaction.cc' object='os/libos_types_la-Transaction.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_types_la_CXXFLAGS) $(CXXFLAGS) -c -o os/libos_types_la-Transaction.lo `test -f 'os/Transaction.cc' || echo '$(srcdir)/'`os/Transaction.cc
-
-os/newstore/libos_types_la-newstore_types.lo: os/newstore/newstore_types.cc
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_types_la_CXXFLAGS) $(CXXFLAGS) -MT os/newstore/libos_types_la-newstore_types.lo -MD -MP -MF os/newstore/$(DEPDIR)/libos_types_la-newstore_types.Tpo -c -o os/newstore/libos_types_la-newstore_types.lo `test -f 'os/newstore/newstore_types.cc' || echo '$(srcdir)/'`os/newstore/newstore_types.cc
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) os/newstore/$(DEPDIR)/libos_types_la-newstore_types.Tpo os/newstore/$(DEPDIR)/libos_types_la-newstore_types.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='os/newstore/newstore_types.cc' object='os/newstore/libos_types_la-newstore_types.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libos_types_la_CXXFLAGS) $(CXXFLAGS) -c -o os/newstore/libos_types_la-newstore_types.lo `test -f 'os/newstore/newstore_types.cc' || echo '$(srcdir)/'`os/newstore/newstore_types.cc
-
-osd/libosd_la-PG.lo: osd/PG.cc
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_la_CXXFLAGS) $(CXXFLAGS) -MT osd/libosd_la-PG.lo -MD -MP -MF osd/$(DEPDIR)/libosd_la-PG.Tpo -c -o osd/libosd_la-PG.lo `test -f 'osd/PG.cc' || echo '$(srcdir)/'`osd/PG.cc
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) osd/$(DEPDIR)/libosd_la-PG.Tpo osd/$(DEPDIR)/libosd_la-PG.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='osd/PG.cc' object='osd/libosd_la-PG.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_la_CXXFLAGS) $(CXXFLAGS) -c -o osd/libosd_la-PG.lo `test -f 'osd/PG.cc' || echo '$(srcdir)/'`osd/PG.cc
-
-osd/libosd_la-ReplicatedPG.lo: osd/ReplicatedPG.cc
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_la_CXXFLAGS) $(CXXFLAGS) -MT osd/libosd_la-ReplicatedPG.lo -MD -MP -MF osd/$(DEPDIR)/libosd_la-ReplicatedPG.Tpo -c -o osd/libosd_la-ReplicatedPG.lo `test -f 'osd/ReplicatedPG.cc' || echo '$(srcdir)/'`osd/ReplicatedPG.cc
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) osd/$(DEPDIR)/libosd_la-ReplicatedPG.Tpo osd/$(DEPDIR)/libosd_la-ReplicatedPG.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='osd/ReplicatedPG.cc' object='osd/libosd_la-ReplicatedPG.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_la_CXXFLAGS) $(CXXFLAGS) -c -o osd/libosd_la-ReplicatedPG.lo `test -f 'osd/ReplicatedPG.cc' || echo '$(srcdir)/'`osd/ReplicatedPG.cc
-
-osd/libosd_la-ReplicatedBackend.lo: osd/ReplicatedBackend.cc
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_la_CXXFLAGS) $(CXXFLAGS) -MT osd/libosd_la-ReplicatedBackend.lo -MD -MP -MF osd/$(DEPDIR)/libosd_la-ReplicatedBackend.Tpo -c -o osd/libosd_la-ReplicatedBackend.lo `test -f 'osd/ReplicatedBackend.cc' || echo '$(srcdir)/'`osd/ReplicatedBackend.cc
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) osd/$(DEPDIR)/libosd_la-ReplicatedBackend.Tpo osd/$(DEPDIR)/libosd_la-ReplicatedBackend.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='osd/ReplicatedBackend.cc' object='osd/libosd_la-ReplicatedBackend.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_la_CXXFLAGS) $(CXXFLAGS) -c -o osd/libosd_la-ReplicatedBackend.lo `test -f 'osd/ReplicatedBackend.cc' || echo '$(srcdir)/'`osd/ReplicatedBackend.cc
-
-osd/libosd_la-ECBackend.lo: osd/ECBackend.cc
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_la_CXXFLAGS) $(CXXFLAGS) -MT osd/libosd_la-ECBackend.lo -MD -MP -MF osd/$(DEPDIR)/libosd_la-ECBackend.Tpo -c -o osd/libosd_la-ECBackend.lo `test -f 'osd/ECBackend.cc' || echo '$(srcdir)/'`osd/ECBackend.cc
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) osd/$(DEPDIR)/libosd_la-ECBackend.Tpo osd/$(DEPDIR)/libosd_la-ECBackend.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='osd/ECBackend.cc' object='osd/libosd_la-ECBackend.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_la_CXXFLAGS) $(CXXFLAGS) -c -o osd/libosd_la-ECBackend.lo `test -f 'osd/ECBackend.cc' || echo '$(srcdir)/'`osd/ECBackend.cc
-
-osd/libosd_la-ECMsgTypes.lo: osd/ECMsgTypes.cc
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_la_CXXFLAGS) $(CXXFLAGS) -MT osd/libosd_la-ECMsgTypes.lo -MD -MP -MF osd/$(DEPDIR)/libosd_la-ECMsgTypes.Tpo -c -o osd/libosd_la-ECMsgTypes.lo `test -f 'osd/ECMsgTypes.cc' || echo '$(srcdir)/'`osd/ECMsgTypes.cc
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) osd/$(DEPDIR)/libosd_la-ECMsgTypes.Tpo osd/$(DEPDIR)/libosd_la-ECMsgTypes.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='osd/ECMsgTypes.cc' object='osd/libosd_la-ECMsgTypes.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_la_CXXFLAGS) $(CXXFLAGS) -c -o osd/libosd_la-ECMsgTypes.lo `test -f 'osd/ECMsgTypes.cc' || echo '$(srcdir)/'`osd/ECMsgTypes.cc
-
-osd/libosd_la-ECTransaction.lo: osd/ECTransaction.cc
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_la_CXXFLAGS) $(CXXFLAGS) -MT osd/libosd_la-ECTransaction.lo -MD -MP -MF osd/$(DEPDIR)/libosd_la-ECTransaction.Tpo -c -o osd/libosd_la-ECTransaction.lo `test -f 'osd/ECTransaction.cc' || echo '$(srcdir)/'`osd/ECTransaction.cc
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) osd/$(DEPDIR)/libosd_la-ECTransaction.Tpo osd/$(DEPDIR)/libosd_la-ECTransaction.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='osd/ECTransaction.cc' object='osd/libosd_la-ECTransaction.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_la_CXXFLAGS) $(CXXFLAGS) -c -o osd/libosd_la-ECTransaction.lo `test -f 'osd/ECTransaction.cc' || echo '$(srcdir)/'`osd/ECTransaction.cc
-
-osd/libosd_la-PGBackend.lo: osd/PGBackend.cc
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_la_CXXFLAGS) $(CXXFLAGS) -MT osd/libosd_la-PGBackend.lo -MD -MP -MF osd/$(DEPDIR)/libosd_la-PGBackend.Tpo -c -o osd/libosd_la-PGBackend.lo `test -f 'osd/PGBackend.cc' || echo '$(srcdir)/'`osd/PGBackend.cc
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) osd/$(DEPDIR)/libosd_la-PGBackend.Tpo osd/$(DEPDIR)/libosd_la-PGBackend.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='osd/PGBackend.cc' object='osd/libosd_la-PGBackend.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_la_CXXFLAGS) $(CXXFLAGS) -c -o osd/libosd_la-PGBackend.lo `test -f 'osd/PGBackend.cc' || echo '$(srcdir)/'`osd/PGBackend.cc
-
-osd/libosd_la-HitSet.lo: osd/HitSet.cc
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_la_CXXFLAGS) $(CXXFLAGS) -MT osd/libosd_la-HitSet.lo -MD -MP -MF osd/$(DEPDIR)/libosd_la-HitSet.Tpo -c -o osd/libosd_la-HitSet.lo `test -f 'osd/HitSet.cc' || echo '$(srcdir)/'`osd/HitSet.cc
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) osd/$(DEPDIR)/libosd_la-HitSet.Tpo osd/$(DEPDIR)/libosd_la-HitSet.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='osd/HitSet.cc' object='osd/libosd_la-HitSet.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_la_CXXFLAGS) $(CXXFLAGS) -c -o osd/libosd_la-HitSet.lo `test -f 'osd/HitSet.cc' || echo '$(srcdir)/'`osd/HitSet.cc
-
-osd/libosd_la-OSD.lo: osd/OSD.cc
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_la_CXXFLAGS) $(CXXFLAGS) -MT osd/libosd_la-OSD.lo -MD -MP -MF osd/$(DEPDIR)/libosd_la-OSD.Tpo -c -o osd/libosd_la-OSD.lo `test -f 'osd/OSD.cc' || echo '$(srcdir)/'`osd/OSD.cc
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) osd/$(DEPDIR)/libosd_la-OSD.Tpo osd/$(DEPDIR)/libosd_la-OSD.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='osd/OSD.cc' object='osd/libosd_la-OSD.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_la_CXXFLAGS) $(CXXFLAGS) -c -o osd/libosd_la-OSD.lo `test -f 'osd/OSD.cc' || echo '$(srcdir)/'`osd/OSD.cc
-
-osd/libosd_la-OSDCap.lo: osd/OSDCap.cc
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_la_CXXFLAGS) $(CXXFLAGS) -MT osd/libosd_la-OSDCap.lo -MD -MP -MF osd/$(DEPDIR)/libosd_la-OSDCap.Tpo -c -o osd/libosd_la-OSDCap.lo `test -f 'osd/OSDCap.cc' || echo '$(srcdir)/'`osd/OSDCap.cc
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) osd/$(DEPDIR)/libosd_la-OSDCap.Tpo osd/$(DEPDIR)/libosd_la-OSDCap.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='osd/OSDCap.cc' object='osd/libosd_la-OSDCap.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_la_CXXFLAGS) $(CXXFLAGS) -c -o osd/libosd_la-OSDCap.lo `test -f 'osd/OSDCap.cc' || echo '$(srcdir)/'`osd/OSDCap.cc
-
-osd/libosd_la-Watch.lo: osd/Watch.cc
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_la_CXXFLAGS) $(CXXFLAGS) -MT osd/libosd_la-Watch.lo -MD -MP -MF osd/$(DEPDIR)/libosd_la-Watch.Tpo -c -o osd/libosd_la-Watch.lo `test -f 'osd/Watch.cc' || echo '$(srcdir)/'`osd/Watch.cc
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) osd/$(DEPDIR)/libosd_la-Watch.Tpo osd/$(DEPDIR)/libosd_la-Watch.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='osd/Watch.cc' object='osd/libosd_la-Watch.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_la_CXXFLAGS) $(CXXFLAGS) -c -o osd/libosd_la-Watch.lo `test -f 'osd/Watch.cc' || echo '$(srcdir)/'`osd/Watch.cc
-
-osd/libosd_la-ClassHandler.lo: osd/ClassHandler.cc
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_la_CXXFLAGS) $(CXXFLAGS) -MT osd/libosd_la-ClassHandler.lo -MD -MP -MF osd/$(DEPDIR)/libosd_la-ClassHandler.Tpo -c -o osd/libosd_la-ClassHandler.lo `test -f 'osd/ClassHandler.cc' || echo '$(srcdir)/'`osd/ClassHandler.cc
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) osd/$(DEPDIR)/libosd_la-ClassHandler.Tpo osd/$(DEPDIR)/libosd_la-ClassHandler.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='osd/ClassHandler.cc' object='osd/libosd_la-ClassHandler.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_la_CXXFLAGS) $(CXXFLAGS) -c -o osd/libosd_la-ClassHandler.lo `test -f 'osd/ClassHandler.cc' || echo '$(srcdir)/'`osd/ClassHandler.cc
-
-osd/libosd_la-OpRequest.lo: osd/OpRequest.cc
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_la_CXXFLAGS) $(CXXFLAGS) -MT osd/libosd_la-OpRequest.lo -MD -MP -MF osd/$(DEPDIR)/libosd_la-OpRequest.Tpo -c -o osd/libosd_la-OpRequest.lo `test -f 'osd/OpRequest.cc' || echo '$(srcdir)/'`osd/OpRequest.cc
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) osd/$(DEPDIR)/libosd_la-OpRequest.Tpo osd/$(DEPDIR)/libosd_la-OpRequest.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='osd/OpRequest.cc' object='osd/libosd_la-OpRequest.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_la_CXXFLAGS) $(CXXFLAGS) -c -o osd/libosd_la-OpRequest.lo `test -f 'osd/OpRequest.cc' || echo '$(srcdir)/'`osd/OpRequest.cc
-
-common/libosd_la-TrackedOp.lo: common/TrackedOp.cc
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_la_CXXFLAGS) $(CXXFLAGS) -MT common/libosd_la-TrackedOp.lo -MD -MP -MF common/$(DEPDIR)/libosd_la-TrackedOp.Tpo -c -o common/libosd_la-TrackedOp.lo `test -f 'common/TrackedOp.cc' || echo '$(srcdir)/'`common/TrackedOp.cc
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) common/$(DEPDIR)/libosd_la-TrackedOp.Tpo common/$(DEPDIR)/libosd_la-TrackedOp.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='common/TrackedOp.cc' object='common/libosd_la-TrackedOp.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_la_CXXFLAGS) $(CXXFLAGS) -c -o common/libosd_la-TrackedOp.lo `test -f 'common/TrackedOp.cc' || echo '$(srcdir)/'`common/TrackedOp.cc
-
-osd/libosd_la-SnapMapper.lo: osd/SnapMapper.cc
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_la_CXXFLAGS) $(CXXFLAGS) -MT osd/libosd_la-SnapMapper.lo -MD -MP -MF osd/$(DEPDIR)/libosd_la-SnapMapper.Tpo -c -o osd/libosd_la-SnapMapper.lo `test -f 'osd/SnapMapper.cc' || echo '$(srcdir)/'`osd/SnapMapper.cc
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) osd/$(DEPDIR)/libosd_la-SnapMapper.Tpo osd/$(DEPDIR)/libosd_la-SnapMapper.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='osd/SnapMapper.cc' object='osd/libosd_la-SnapMapper.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_la_CXXFLAGS) $(CXXFLAGS) -c -o osd/libosd_la-SnapMapper.lo `test -f 'osd/SnapMapper.cc' || echo '$(srcdir)/'`osd/SnapMapper.cc
-
-objclass/libosd_la-class_api.lo: objclass/class_api.cc
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_la_CXXFLAGS) $(CXXFLAGS) -MT objclass/libosd_la-class_api.lo -MD -MP -MF objclass/$(DEPDIR)/libosd_la-class_api.Tpo -c -o objclass/libosd_la-class_api.lo `test -f 'objclass/class_api.cc' || echo '$(srcdir)/'`objclass/class_api.cc
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) objclass/$(DEPDIR)/libosd_la-class_api.Tpo objclass/$(DEPDIR)/libosd_la-class_api.Plo
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='objclass/class_api.cc' object='objclass/libosd_la-class_api.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_la_CXXFLAGS) $(CXXFLAGS) -c -o objclass/libosd_la-class_api.lo `test -f 'objclass/class_api.cc' || echo '$(srcdir)/'`objclass/class_api.cc
-
 osd/libosd_types_la-PGLog.lo: osd/PGLog.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libosd_types_la_CXXFLAGS) $(CXXFLAGS) -MT osd/libosd_types_la-PGLog.lo -MD -MP -MF osd/$(DEPDIR)/libosd_types_la-PGLog.Tpo -c -o osd/libosd_types_la-PGLog.lo `test -f 'osd/PGLog.cc' || echo '$(srcdir)/'`osd/PGLog.cc
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) osd/$(DEPDIR)/libosd_types_la-PGLog.Tpo osd/$(DEPDIR)/libosd_types_la-PGLog.Plo
@@ -19800,6 +20589,20 @@ test/librbd/librbd_test_la-test_internal.lo: test/librbd/test_internal.cc
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(librbd_test_la_CXXFLAGS) $(CXXFLAGS) -c -o test/librbd/librbd_test_la-test_internal.lo `test -f 'test/librbd/test_internal.cc' || echo '$(srcdir)/'`test/librbd/test_internal.cc
 
+test/librbd/librbd_test_la-test_JournalEntries.lo: test/librbd/test_JournalEntries.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(librbd_test_la_CXXFLAGS) $(CXXFLAGS) -MT test/librbd/librbd_test_la-test_JournalEntries.lo -MD -MP -MF test/librbd/$(DEPDIR)/librbd_test_la-test_JournalEntries.Tpo -c -o test/librbd/librbd_test_la-test_JournalEntries.lo `test -f 'test/librbd/test_JournalEntries.cc' || echo '$(srcdir)/'`test/librbd/te [...]
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/librbd/$(DEPDIR)/librbd_test_la-test_JournalEntries.Tpo test/librbd/$(DEPDIR)/librbd_test_la-test_JournalEntries.Plo
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/librbd/test_JournalEntries.cc' object='test/librbd/librbd_test_la-test_JournalEntries.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(librbd_test_la_CXXFLAGS) $(CXXFLAGS) -c -o test/librbd/librbd_test_la-test_JournalEntries.lo `test -f 'test/librbd/test_JournalEntries.cc' || echo '$(srcdir)/'`test/librbd/test_JournalEntries.cc
+
+test/librbd/librbd_test_la-test_JournalReplay.lo: test/librbd/test_JournalReplay.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(librbd_test_la_CXXFLAGS) $(CXXFLAGS) -MT test/librbd/librbd_test_la-test_JournalReplay.lo -MD -MP -MF test/librbd/$(DEPDIR)/librbd_test_la-test_JournalReplay.Tpo -c -o test/librbd/librbd_test_la-test_JournalReplay.lo `test -f 'test/librbd/test_JournalReplay.cc' || echo '$(srcdir)/'`test/librbd/test_J [...]
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/librbd/$(DEPDIR)/librbd_test_la-test_JournalReplay.Tpo test/librbd/$(DEPDIR)/librbd_test_la-test_JournalReplay.Plo
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/librbd/test_JournalReplay.cc' object='test/librbd/librbd_test_la-test_JournalReplay.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(librbd_test_la_CXXFLAGS) $(CXXFLAGS) -c -o test/librbd/librbd_test_la-test_JournalReplay.lo `test -f 'test/librbd/test_JournalReplay.cc' || echo '$(srcdir)/'`test/librbd/test_JournalReplay.cc
+
 test/librbd/librbd_test_la-test_ObjectMap.lo: test/librbd/test_ObjectMap.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(librbd_test_la_CXXFLAGS) $(CXXFLAGS) -MT test/librbd/librbd_test_la-test_ObjectMap.lo -MD -MP -MF test/librbd/$(DEPDIR)/librbd_test_la-test_ObjectMap.Tpo -c -o test/librbd/librbd_test_la-test_ObjectMap.lo `test -f 'test/librbd/test_ObjectMap.cc' || echo '$(srcdir)/'`test/librbd/test_ObjectMap.cc
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/librbd/$(DEPDIR)/librbd_test_la-test_ObjectMap.Tpo test/librbd/$(DEPDIR)/librbd_test_la-test_ObjectMap.Plo
@@ -20388,6 +21191,20 @@ mds/ceph_dencoder-SimpleLock.obj: mds/SimpleLock.cc
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_dencoder_CXXFLAGS) $(CXXFLAGS) -c -o mds/ceph_dencoder-SimpleLock.obj `if test -f 'mds/SimpleLock.cc'; then $(CYGPATH_W) 'mds/SimpleLock.cc'; else $(CYGPATH_W) '$(srcdir)/mds/SimpleLock.cc'; fi`
 
+mds/ceph_dencoder-ScrubStack.o: mds/ScrubStack.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_dencoder_CXXFLAGS) $(CXXFLAGS) -MT mds/ceph_dencoder-ScrubStack.o -MD -MP -MF mds/$(DEPDIR)/ceph_dencoder-ScrubStack.Tpo -c -o mds/ceph_dencoder-ScrubStack.o `test -f 'mds/ScrubStack.cc' || echo '$(srcdir)/'`mds/ScrubStack.cc
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) mds/$(DEPDIR)/ceph_dencoder-ScrubStack.Tpo mds/$(DEPDIR)/ceph_dencoder-ScrubStack.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='mds/ScrubStack.cc' object='mds/ceph_dencoder-ScrubStack.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_dencoder_CXXFLAGS) $(CXXFLAGS) -c -o mds/ceph_dencoder-ScrubStack.o `test -f 'mds/ScrubStack.cc' || echo '$(srcdir)/'`mds/ScrubStack.cc
+
+mds/ceph_dencoder-ScrubStack.obj: mds/ScrubStack.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_dencoder_CXXFLAGS) $(CXXFLAGS) -MT mds/ceph_dencoder-ScrubStack.obj -MD -MP -MF mds/$(DEPDIR)/ceph_dencoder-ScrubStack.Tpo -c -o mds/ceph_dencoder-ScrubStack.obj `if test -f 'mds/ScrubStack.cc'; then $(CYGPATH_W) 'mds/ScrubStack.cc'; else $(CYGPATH_W) '$(srcdir)/mds/ScrubStack.cc'; fi`
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) mds/$(DEPDIR)/ceph_dencoder-ScrubStack.Tpo mds/$(DEPDIR)/ceph_dencoder-ScrubStack.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='mds/ScrubStack.cc' object='mds/ceph_dencoder-ScrubStack.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_dencoder_CXXFLAGS) $(CXXFLAGS) -c -o mds/ceph_dencoder-ScrubStack.obj `if test -f 'mds/ScrubStack.cc'; then $(CYGPATH_W) 'mds/ScrubStack.cc'; else $(CYGPATH_W) '$(srcdir)/mds/ScrubStack.cc'; fi`
+
 mds/ceph_dencoder-SnapRealm.o: mds/SnapRealm.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_dencoder_CXXFLAGS) $(CXXFLAGS) -MT mds/ceph_dencoder-SnapRealm.o -MD -MP -MF mds/$(DEPDIR)/ceph_dencoder-SnapRealm.Tpo -c -o mds/ceph_dencoder-SnapRealm.o `test -f 'mds/SnapRealm.cc' || echo '$(srcdir)/'`mds/SnapRealm.cc
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) mds/$(DEPDIR)/ceph_dencoder-SnapRealm.Tpo mds/$(DEPDIR)/ceph_dencoder-SnapRealm.Po
@@ -20486,20 +21303,6 @@ mds/ceph_dencoder-MDLog.obj: mds/MDLog.cc
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_dencoder_CXXFLAGS) $(CXXFLAGS) -c -o mds/ceph_dencoder-MDLog.obj `if test -f 'mds/MDLog.cc'; then $(CYGPATH_W) 'mds/MDLog.cc'; else $(CYGPATH_W) '$(srcdir)/mds/MDLog.cc'; fi`
 
-common/ceph_dencoder-TrackedOp.o: common/TrackedOp.cc
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_dencoder_CXXFLAGS) $(CXXFLAGS) -MT common/ceph_dencoder-TrackedOp.o -MD -MP -MF common/$(DEPDIR)/ceph_dencoder-TrackedOp.Tpo -c -o common/ceph_dencoder-TrackedOp.o `test -f 'common/TrackedOp.cc' || echo '$(srcdir)/'`common/TrackedOp.cc
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) common/$(DEPDIR)/ceph_dencoder-TrackedOp.Tpo common/$(DEPDIR)/ceph_dencoder-TrackedOp.Po
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='common/TrackedOp.cc' object='common/ceph_dencoder-TrackedOp.o' libtool=no @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_dencoder_CXXFLAGS) $(CXXFLAGS) -c -o common/ceph_dencoder-TrackedOp.o `test -f 'common/TrackedOp.cc' || echo '$(srcdir)/'`common/TrackedOp.cc
-
-common/ceph_dencoder-TrackedOp.obj: common/TrackedOp.cc
-@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_dencoder_CXXFLAGS) $(CXXFLAGS) -MT common/ceph_dencoder-TrackedOp.obj -MD -MP -MF common/$(DEPDIR)/ceph_dencoder-TrackedOp.Tpo -c -o common/ceph_dencoder-TrackedOp.obj `if test -f 'common/TrackedOp.cc'; then $(CYGPATH_W) 'common/TrackedOp.cc'; else $(CYGPATH_W) '$(srcdir)/common/TrackedOp.cc'; fi`
-@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) common/$(DEPDIR)/ceph_dencoder-TrackedOp.Tpo common/$(DEPDIR)/ceph_dencoder-TrackedOp.Po
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='common/TrackedOp.cc' object='common/ceph_dencoder-TrackedOp.obj' libtool=no @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_dencoder_CXXFLAGS) $(CXXFLAGS) -c -o common/ceph_dencoder-TrackedOp.obj `if test -f 'common/TrackedOp.cc'; then $(CYGPATH_W) 'common/TrackedOp.cc'; else $(CYGPATH_W) '$(srcdir)/common/TrackedOp.cc'; fi`
-
 perfglue/ceph_dencoder-disabled_heap_profiler.o: perfglue/disabled_heap_profiler.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_dencoder_CXXFLAGS) $(CXXFLAGS) -MT perfglue/ceph_dencoder-disabled_heap_profiler.o -MD -MP -MF perfglue/$(DEPDIR)/ceph_dencoder-disabled_heap_profiler.Tpo -c -o perfglue/ceph_dencoder-disabled_heap_profiler.o `test -f 'perfglue/disabled_heap_profiler.cc' || echo '$(srcdir)/'`perfglue/disabled_heap_profiler.cc
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) perfglue/$(DEPDIR)/ceph_dencoder-disabled_heap_profiler.Tpo perfglue/$(DEPDIR)/ceph_dencoder-disabled_heap_profiler.Po
@@ -20710,6 +21513,20 @@ test/cls_hello/ceph_test_cls_hello-test_cls_hello.obj: test/cls_hello/test_cls_h
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_test_cls_hello_CXXFLAGS) $(CXXFLAGS) -c -o test/cls_hello/ceph_test_cls_hello-test_cls_hello.obj `if test -f 'test/cls_hello/test_cls_hello.cc'; then $(CYGPATH_W) 'test/cls_hello/test_cls_hello.cc'; else $(CYGPATH_W) '$(srcdir)/test/cls_hello/test_cls_hello.cc'; fi`
 
+test/cls_journal/ceph_test_cls_journal-test_cls_journal.o: test/cls_journal/test_cls_journal.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_test_cls_journal_CXXFLAGS) $(CXXFLAGS) -MT test/cls_journal/ceph_test_cls_journal-test_cls_journal.o -MD -MP -MF test/cls_journal/$(DEPDIR)/ceph_test_cls_journal-test_cls_journal.Tpo -c -o test/cls_journal/ceph_test_cls_journal-test_cls_journal.o `test -f 'test/cls_journal/test_cls_journal.cc' || echo '$(srcdir)/'`test/cls_journal/test_cls_journal.cc
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/cls_journal/$(DEPDIR)/ceph_test_cls_journal-test_cls_journal.Tpo test/cls_journal/$(DEPDIR)/ceph_test_cls_journal-test_cls_journal.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/cls_journal/test_cls_journal.cc' object='test/cls_journal/ceph_test_cls_journal-test_cls_journal.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_test_cls_journal_CXXFLAGS) $(CXXFLAGS) -c -o test/cls_journal/ceph_test_cls_journal-test_cls_journal.o `test -f 'test/cls_journal/test_cls_journal.cc' || echo '$(srcdir)/'`test/cls_journal/test_cls_journal.cc
+
+test/cls_journal/ceph_test_cls_journal-test_cls_journal.obj: test/cls_journal/test_cls_journal.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_test_cls_journal_CXXFLAGS) $(CXXFLAGS) -MT test/cls_journal/ceph_test_cls_journal-test_cls_journal.obj -MD -MP -MF test/cls_journal/$(DEPDIR)/ceph_test_cls_journal-test_cls_journal.Tpo -c -o test/cls_journal/ceph_test_cls_journal-test_cls_journal.obj `if test -f 'test/cls_journal/test_cls_journal.cc'; then $(CYGPATH_W) 'test/cls_journal/test_cls_journal.cc'; else $(CYGPATH_W) [...]
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/cls_journal/$(DEPDIR)/ceph_test_cls_journal-test_cls_journal.Tpo test/cls_journal/$(DEPDIR)/ceph_test_cls_journal-test_cls_journal.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/cls_journal/test_cls_journal.cc' object='test/cls_journal/ceph_test_cls_journal-test_cls_journal.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_test_cls_journal_CXXFLAGS) $(CXXFLAGS) -c -o test/cls_journal/ceph_test_cls_journal-test_cls_journal.obj `if test -f 'test/cls_journal/test_cls_journal.cc'; then $(CYGPATH_W) 'test/cls_journal/test_cls_journal.cc'; else $(CYGPATH_W) '$(srcdir)/test/cls_journal/test_cls_journal.cc'; fi`
+
 test/cls_lock/ceph_test_cls_lock-test_cls_lock.o: test/cls_lock/test_cls_lock.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_test_cls_lock_CXXFLAGS) $(CXXFLAGS) -MT test/cls_lock/ceph_test_cls_lock-test_cls_lock.o -MD -MP -MF test/cls_lock/$(DEPDIR)/ceph_test_cls_lock-test_cls_lock.Tpo -c -o test/cls_lock/ceph_test_cls_lock-test_cls_lock.o `test -f 'test/cls_lock/test_cls_lock.cc' || echo '$(srcdir)/'`test/cls_lock/test_cls_lock.cc
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/cls_lock/$(DEPDIR)/ceph_test_cls_lock-test_cls_lock.Tpo test/cls_lock/$(DEPDIR)/ceph_test_cls_lock-test_cls_lock.Po
@@ -21032,6 +21849,20 @@ test/libcephfs/ceph_test_libcephfs-multiclient.obj: test/libcephfs/multiclient.c
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_test_libcephfs_CXXFLAGS) $(CXXFLAGS) -c -o test/libcephfs/ceph_test_libcephfs-multiclient.obj `if test -f 'test/libcephfs/multiclient.cc'; then $(CYGPATH_W) 'test/libcephfs/multiclient.cc'; else $(CYGPATH_W) '$(srcdir)/test/libcephfs/multiclient.cc'; fi`
 
+test/libcephfs/ceph_test_libcephfs-access.o: test/libcephfs/access.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_test_libcephfs_CXXFLAGS) $(CXXFLAGS) -MT test/libcephfs/ceph_test_libcephfs-access.o -MD -MP -MF test/libcephfs/$(DEPDIR)/ceph_test_libcephfs-access.Tpo -c -o test/libcephfs/ceph_test_libcephfs-access.o `test -f 'test/libcephfs/access.cc' || echo '$(srcdir)/'`test/libcephfs/access.cc
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/libcephfs/$(DEPDIR)/ceph_test_libcephfs-access.Tpo test/libcephfs/$(DEPDIR)/ceph_test_libcephfs-access.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/libcephfs/access.cc' object='test/libcephfs/ceph_test_libcephfs-access.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_test_libcephfs_CXXFLAGS) $(CXXFLAGS) -c -o test/libcephfs/ceph_test_libcephfs-access.o `test -f 'test/libcephfs/access.cc' || echo '$(srcdir)/'`test/libcephfs/access.cc
+
+test/libcephfs/ceph_test_libcephfs-access.obj: test/libcephfs/access.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_test_libcephfs_CXXFLAGS) $(CXXFLAGS) -MT test/libcephfs/ceph_test_libcephfs-access.obj -MD -MP -MF test/libcephfs/$(DEPDIR)/ceph_test_libcephfs-access.Tpo -c -o test/libcephfs/ceph_test_libcephfs-access.obj `if test -f 'test/libcephfs/access.cc'; then $(CYGPATH_W) 'test/libcephfs/access.cc'; else $(CYGPATH_W) '$(srcdir)/test/libcephfs/access.cc'; fi`
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/libcephfs/$(DEPDIR)/ceph_test_libcephfs-access.Tpo test/libcephfs/$(DEPDIR)/ceph_test_libcephfs-access.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/libcephfs/access.cc' object='test/libcephfs/ceph_test_libcephfs-access.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_test_libcephfs_CXXFLAGS) $(CXXFLAGS) -c -o test/libcephfs/ceph_test_libcephfs-access.obj `if test -f 'test/libcephfs/access.cc'; then $(CYGPATH_W) 'test/libcephfs/access.cc'; else $(CYGPATH_W) '$(srcdir)/test/libcephfs/access.cc'; fi`
+
 test/libcephfs/ceph_test_libcephfs-flock.o: test/libcephfs/flock.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(ceph_test_libcephfs_CXXFLAGS) $(CXXFLAGS) -MT test/libcephfs/ceph_test_libcephfs-flock.o -MD -MP -MF test/libcephfs/$(DEPDIR)/ceph_test_libcephfs-flock.Tpo -c -o test/libcephfs/ceph_test_libcephfs-flock.o `test -f 'test/libcephfs/flock.cc' || echo '$(srcdir)/'`test/libcephfs/flock.cc
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/libcephfs/$(DEPDIR)/ceph_test_libcephfs-flock.Tpo test/libcephfs/$(DEPDIR)/ceph_test_libcephfs-flock.Po
@@ -23286,6 +24117,160 @@ test/unittest_ipaddr-test_ipaddr.obj: test/test_ipaddr.cc
 @AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_ipaddr_CXXFLAGS) $(CXXFLAGS) -c -o test/unittest_ipaddr-test_ipaddr.obj `if test -f 'test/test_ipaddr.cc'; then $(CYGPATH_W) 'test/test_ipaddr.cc'; else $(CYGPATH_W) '$(srcdir)/test/test_ipaddr.cc'; fi`
 
+test/journal/unittest_journal-test_main.o: test/journal/test_main.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_journal_CXXFLAGS) $(CXXFLAGS) -MT test/journal/unittest_journal-test_main.o -MD -MP -MF test/journal/$(DEPDIR)/unittest_journal-test_main.Tpo -c -o test/journal/unittest_journal-test_main.o `test -f 'test/journal/test_main.cc' || echo '$(srcdir)/'`test/journal/test_main.cc
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/journal/$(DEPDIR)/unittest_journal-test_main.Tpo test/journal/$(DEPDIR)/unittest_journal-test_main.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/journal/test_main.cc' object='test/journal/unittest_journal-test_main.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_journal_CXXFLAGS) $(CXXFLAGS) -c -o test/journal/unittest_journal-test_main.o `test -f 'test/journal/test_main.cc' || echo '$(srcdir)/'`test/journal/test_main.cc
+
+test/journal/unittest_journal-test_main.obj: test/journal/test_main.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_journal_CXXFLAGS) $(CXXFLAGS) -MT test/journal/unittest_journal-test_main.obj -MD -MP -MF test/journal/$(DEPDIR)/unittest_journal-test_main.Tpo -c -o test/journal/unittest_journal-test_main.obj `if test -f 'test/journal/test_main.cc'; then $(CYGPATH_W) 'test/journal/test_main.cc'; else $(CYGPATH_W) '$(srcdir)/test/journal/test_main.cc'; fi`
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/journal/$(DEPDIR)/unittest_journal-test_main.Tpo test/journal/$(DEPDIR)/unittest_journal-test_main.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/journal/test_main.cc' object='test/journal/unittest_journal-test_main.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_journal_CXXFLAGS) $(CXXFLAGS) -c -o test/journal/unittest_journal-test_main.obj `if test -f 'test/journal/test_main.cc'; then $(CYGPATH_W) 'test/journal/test_main.cc'; else $(CYGPATH_W) '$(srcdir)/test/journal/test_main.cc'; fi`
+
+test/journal/unittest_journal-test_Entry.o: test/journal/test_Entry.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_journal_CXXFLAGS) $(CXXFLAGS) -MT test/journal/unittest_journal-test_Entry.o -MD -MP -MF test/journal/$(DEPDIR)/unittest_journal-test_Entry.Tpo -c -o test/journal/unittest_journal-test_Entry.o `test -f 'test/journal/test_Entry.cc' || echo '$(srcdir)/'`test/journal/test_Entry.cc
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/journal/$(DEPDIR)/unittest_journal-test_Entry.Tpo test/journal/$(DEPDIR)/unittest_journal-test_Entry.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/journal/test_Entry.cc' object='test/journal/unittest_journal-test_Entry.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_journal_CXXFLAGS) $(CXXFLAGS) -c -o test/journal/unittest_journal-test_Entry.o `test -f 'test/journal/test_Entry.cc' || echo '$(srcdir)/'`test/journal/test_Entry.cc
+
+test/journal/unittest_journal-test_Entry.obj: test/journal/test_Entry.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_journal_CXXFLAGS) $(CXXFLAGS) -MT test/journal/unittest_journal-test_Entry.obj -MD -MP -MF test/journal/$(DEPDIR)/unittest_journal-test_Entry.Tpo -c -o test/journal/unittest_journal-test_Entry.obj `if test -f 'test/journal/test_Entry.cc'; then $(CYGPATH_W) 'test/journal/test_Entry.cc'; else $(CYGPATH_W) '$(srcdir)/test/journal/test_Entry.cc'; fi`
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/journal/$(DEPDIR)/unittest_journal-test_Entry.Tpo test/journal/$(DEPDIR)/unittest_journal-test_Entry.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/journal/test_Entry.cc' object='test/journal/unittest_journal-test_Entry.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_journal_CXXFLAGS) $(CXXFLAGS) -c -o test/journal/unittest_journal-test_Entry.obj `if test -f 'test/journal/test_Entry.cc'; then $(CYGPATH_W) 'test/journal/test_Entry.cc'; else $(CYGPATH_W) '$(srcdir)/test/journal/test_Entry.cc'; fi`
+
+test/journal/unittest_journal-test_FutureImpl.o: test/journal/test_FutureImpl.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_journal_CXXFLAGS) $(CXXFLAGS) -MT test/journal/unittest_journal-test_FutureImpl.o -MD -MP -MF test/journal/$(DEPDIR)/unittest_journal-test_FutureImpl.Tpo -c -o test/journal/unittest_journal-test_FutureImpl.o `test -f 'test/journal/test_FutureImpl.cc' || echo '$(srcdir)/'`test/journal/test_FutureImpl.cc
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/journal/$(DEPDIR)/unittest_journal-test_FutureImpl.Tpo test/journal/$(DEPDIR)/unittest_journal-test_FutureImpl.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/journal/test_FutureImpl.cc' object='test/journal/unittest_journal-test_FutureImpl.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_journal_CXXFLAGS) $(CXXFLAGS) -c -o test/journal/unittest_journal-test_FutureImpl.o `test -f 'test/journal/test_FutureImpl.cc' || echo '$(srcdir)/'`test/journal/test_FutureImpl.cc
+
+test/journal/unittest_journal-test_FutureImpl.obj: test/journal/test_FutureImpl.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_journal_CXXFLAGS) $(CXXFLAGS) -MT test/journal/unittest_journal-test_FutureImpl.obj -MD -MP -MF test/journal/$(DEPDIR)/unittest_journal-test_FutureImpl.Tpo -c -o test/journal/unittest_journal-test_FutureImpl.obj `if test -f 'test/journal/test_FutureImpl.cc'; then $(CYGPATH_W) 'test/journal/test_FutureImpl.cc'; else $(CYGPATH_W) '$(srcdir)/test/journal/test_FutureImpl.cc'; fi`
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/journal/$(DEPDIR)/unittest_journal-test_FutureImpl.Tpo test/journal/$(DEPDIR)/unittest_journal-test_FutureImpl.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/journal/test_FutureImpl.cc' object='test/journal/unittest_journal-test_FutureImpl.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_journal_CXXFLAGS) $(CXXFLAGS) -c -o test/journal/unittest_journal-test_FutureImpl.obj `if test -f 'test/journal/test_FutureImpl.cc'; then $(CYGPATH_W) 'test/journal/test_FutureImpl.cc'; else $(CYGPATH_W) '$(srcdir)/test/journal/test_FutureImpl.cc'; fi`
+
+test/journal/unittest_journal-test_Journaler.o: test/journal/test_Journaler.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_journal_CXXFLAGS) $(CXXFLAGS) -MT test/journal/unittest_journal-test_Journaler.o -MD -MP -MF test/journal/$(DEPDIR)/unittest_journal-test_Journaler.Tpo -c -o test/journal/unittest_journal-test_Journaler.o `test -f 'test/journal/test_Journaler.cc' || echo '$(srcdir)/'`test/journal/test_Journaler.cc
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/journal/$(DEPDIR)/unittest_journal-test_Journaler.Tpo test/journal/$(DEPDIR)/unittest_journal-test_Journaler.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/journal/test_Journaler.cc' object='test/journal/unittest_journal-test_Journaler.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_journal_CXXFLAGS) $(CXXFLAGS) -c -o test/journal/unittest_journal-test_Journaler.o `test -f 'test/journal/test_Journaler.cc' || echo '$(srcdir)/'`test/journal/test_Journaler.cc
+
+test/journal/unittest_journal-test_Journaler.obj: test/journal/test_Journaler.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_journal_CXXFLAGS) $(CXXFLAGS) -MT test/journal/unittest_journal-test_Journaler.obj -MD -MP -MF test/journal/$(DEPDIR)/unittest_journal-test_Journaler.Tpo -c -o test/journal/unittest_journal-test_Journaler.obj `if test -f 'test/journal/test_Journaler.cc'; then $(CYGPATH_W) 'test/journal/test_Journaler.cc'; else $(CYGPATH_W) '$(srcdir)/test/journal/test_Journaler.cc'; fi`
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/journal/$(DEPDIR)/unittest_journal-test_Journaler.Tpo test/journal/$(DEPDIR)/unittest_journal-test_Journaler.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/journal/test_Journaler.cc' object='test/journal/unittest_journal-test_Journaler.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_journal_CXXFLAGS) $(CXXFLAGS) -c -o test/journal/unittest_journal-test_Journaler.obj `if test -f 'test/journal/test_Journaler.cc'; then $(CYGPATH_W) 'test/journal/test_Journaler.cc'; else $(CYGPATH_W) '$(srcdir)/test/journal/test_Journaler.cc'; fi`
+
+test/journal/unittest_journal-test_JournalMetadata.o: test/journal/test_JournalMetadata.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_journal_CXXFLAGS) $(CXXFLAGS) -MT test/journal/unittest_journal-test_JournalMetadata.o -MD -MP -MF test/journal/$(DEPDIR)/unittest_journal-test_JournalMetadata.Tpo -c -o test/journal/unittest_journal-test_JournalMetadata.o `test -f 'test/journal/test_JournalMetadata.cc' || echo '$(srcdir)/'`test/journal/test_JournalMetadata.cc
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/journal/$(DEPDIR)/unittest_journal-test_JournalMetadata.Tpo test/journal/$(DEPDIR)/unittest_journal-test_JournalMetadata.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/journal/test_JournalMetadata.cc' object='test/journal/unittest_journal-test_JournalMetadata.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_journal_CXXFLAGS) $(CXXFLAGS) -c -o test/journal/unittest_journal-test_JournalMetadata.o `test -f 'test/journal/test_JournalMetadata.cc' || echo '$(srcdir)/'`test/journal/test_JournalMetadata.cc
+
+test/journal/unittest_journal-test_JournalMetadata.obj: test/journal/test_JournalMetadata.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_journal_CXXFLAGS) $(CXXFLAGS) -MT test/journal/unittest_journal-test_JournalMetadata.obj -MD -MP -MF test/journal/$(DEPDIR)/unittest_journal-test_JournalMetadata.Tpo -c -o test/journal/unittest_journal-test_JournalMetadata.obj `if test -f 'test/journal/test_JournalMetadata.cc'; then $(CYGPATH_W) 'test/journal/test_JournalMetadata.cc'; else $(CYGPATH_W) '$(srcdir)/test/jou [...]
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/journal/$(DEPDIR)/unittest_journal-test_JournalMetadata.Tpo test/journal/$(DEPDIR)/unittest_journal-test_JournalMetadata.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/journal/test_JournalMetadata.cc' object='test/journal/unittest_journal-test_JournalMetadata.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_journal_CXXFLAGS) $(CXXFLAGS) -c -o test/journal/unittest_journal-test_JournalMetadata.obj `if test -f 'test/journal/test_JournalMetadata.cc'; then $(CYGPATH_W) 'test/journal/test_JournalMetadata.cc'; else $(CYGPATH_W) '$(srcdir)/test/journal/test_JournalMetadata.cc'; fi`
+
+test/journal/unittest_journal-test_JournalPlayer.o: test/journal/test_JournalPlayer.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_journal_CXXFLAGS) $(CXXFLAGS) -MT test/journal/unittest_journal-test_JournalPlayer.o -MD -MP -MF test/journal/$(DEPDIR)/unittest_journal-test_JournalPlayer.Tpo -c -o test/journal/unittest_journal-test_JournalPlayer.o `test -f 'test/journal/test_JournalPlayer.cc' || echo '$(srcdir)/'`test/journal/test_JournalPlayer.cc
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/journal/$(DEPDIR)/unittest_journal-test_JournalPlayer.Tpo test/journal/$(DEPDIR)/unittest_journal-test_JournalPlayer.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/journal/test_JournalPlayer.cc' object='test/journal/unittest_journal-test_JournalPlayer.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_journal_CXXFLAGS) $(CXXFLAGS) -c -o test/journal/unittest_journal-test_JournalPlayer.o `test -f 'test/journal/test_JournalPlayer.cc' || echo '$(srcdir)/'`test/journal/test_JournalPlayer.cc
+
+test/journal/unittest_journal-test_JournalPlayer.obj: test/journal/test_JournalPlayer.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_journal_CXXFLAGS) $(CXXFLAGS) -MT test/journal/unittest_journal-test_JournalPlayer.obj -MD -MP -MF test/journal/$(DEPDIR)/unittest_journal-test_JournalPlayer.Tpo -c -o test/journal/unittest_journal-test_JournalPlayer.obj `if test -f 'test/journal/test_JournalPlayer.cc'; then $(CYGPATH_W) 'test/journal/test_JournalPlayer.cc'; else $(CYGPATH_W) '$(srcdir)/test/journal/test_ [...]
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/journal/$(DEPDIR)/unittest_journal-test_JournalPlayer.Tpo test/journal/$(DEPDIR)/unittest_journal-test_JournalPlayer.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/journal/test_JournalPlayer.cc' object='test/journal/unittest_journal-test_JournalPlayer.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_journal_CXXFLAGS) $(CXXFLAGS) -c -o test/journal/unittest_journal-test_JournalPlayer.obj `if test -f 'test/journal/test_JournalPlayer.cc'; then $(CYGPATH_W) 'test/journal/test_JournalPlayer.cc'; else $(CYGPATH_W) '$(srcdir)/test/journal/test_JournalPlayer.cc'; fi`
+
+test/journal/unittest_journal-test_JournalRecorder.o: test/journal/test_JournalRecorder.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_journal_CXXFLAGS) $(CXXFLAGS) -MT test/journal/unittest_journal-test_JournalRecorder.o -MD -MP -MF test/journal/$(DEPDIR)/unittest_journal-test_JournalRecorder.Tpo -c -o test/journal/unittest_journal-test_JournalRecorder.o `test -f 'test/journal/test_JournalRecorder.cc' || echo '$(srcdir)/'`test/journal/test_JournalRecorder.cc
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/journal/$(DEPDIR)/unittest_journal-test_JournalRecorder.Tpo test/journal/$(DEPDIR)/unittest_journal-test_JournalRecorder.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/journal/test_JournalRecorder.cc' object='test/journal/unittest_journal-test_JournalRecorder.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_journal_CXXFLAGS) $(CXXFLAGS) -c -o test/journal/unittest_journal-test_JournalRecorder.o `test -f 'test/journal/test_JournalRecorder.cc' || echo '$(srcdir)/'`test/journal/test_JournalRecorder.cc
+
+test/journal/unittest_journal-test_JournalRecorder.obj: test/journal/test_JournalRecorder.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_journal_CXXFLAGS) $(CXXFLAGS) -MT test/journal/unittest_journal-test_JournalRecorder.obj -MD -MP -MF test/journal/$(DEPDIR)/unittest_journal-test_JournalRecorder.Tpo -c -o test/journal/unittest_journal-test_JournalRecorder.obj `if test -f 'test/journal/test_JournalRecorder.cc'; then $(CYGPATH_W) 'test/journal/test_JournalRecorder.cc'; else $(CYGPATH_W) '$(srcdir)/test/jou [...]
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/journal/$(DEPDIR)/unittest_journal-test_JournalRecorder.Tpo test/journal/$(DEPDIR)/unittest_journal-test_JournalRecorder.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/journal/test_JournalRecorder.cc' object='test/journal/unittest_journal-test_JournalRecorder.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_journal_CXXFLAGS) $(CXXFLAGS) -c -o test/journal/unittest_journal-test_JournalRecorder.obj `if test -f 'test/journal/test_JournalRecorder.cc'; then $(CYGPATH_W) 'test/journal/test_JournalRecorder.cc'; else $(CYGPATH_W) '$(srcdir)/test/journal/test_JournalRecorder.cc'; fi`
+
+test/journal/unittest_journal-test_JournalTrimmer.o: test/journal/test_JournalTrimmer.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_journal_CXXFLAGS) $(CXXFLAGS) -MT test/journal/unittest_journal-test_JournalTrimmer.o -MD -MP -MF test/journal/$(DEPDIR)/unittest_journal-test_JournalTrimmer.Tpo -c -o test/journal/unittest_journal-test_JournalTrimmer.o `test -f 'test/journal/test_JournalTrimmer.cc' || echo '$(srcdir)/'`test/journal/test_JournalTrimmer.cc
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/journal/$(DEPDIR)/unittest_journal-test_JournalTrimmer.Tpo test/journal/$(DEPDIR)/unittest_journal-test_JournalTrimmer.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/journal/test_JournalTrimmer.cc' object='test/journal/unittest_journal-test_JournalTrimmer.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_journal_CXXFLAGS) $(CXXFLAGS) -c -o test/journal/unittest_journal-test_JournalTrimmer.o `test -f 'test/journal/test_JournalTrimmer.cc' || echo '$(srcdir)/'`test/journal/test_JournalTrimmer.cc
+
+test/journal/unittest_journal-test_JournalTrimmer.obj: test/journal/test_JournalTrimmer.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_journal_CXXFLAGS) $(CXXFLAGS) -MT test/journal/unittest_journal-test_JournalTrimmer.obj -MD -MP -MF test/journal/$(DEPDIR)/unittest_journal-test_JournalTrimmer.Tpo -c -o test/journal/unittest_journal-test_JournalTrimmer.obj `if test -f 'test/journal/test_JournalTrimmer.cc'; then $(CYGPATH_W) 'test/journal/test_JournalTrimmer.cc'; else $(CYGPATH_W) '$(srcdir)/test/journal/ [...]
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/journal/$(DEPDIR)/unittest_journal-test_JournalTrimmer.Tpo test/journal/$(DEPDIR)/unittest_journal-test_JournalTrimmer.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/journal/test_JournalTrimmer.cc' object='test/journal/unittest_journal-test_JournalTrimmer.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_journal_CXXFLAGS) $(CXXFLAGS) -c -o test/journal/unittest_journal-test_JournalTrimmer.obj `if test -f 'test/journal/test_JournalTrimmer.cc'; then $(CYGPATH_W) 'test/journal/test_JournalTrimmer.cc'; else $(CYGPATH_W) '$(srcdir)/test/journal/test_JournalTrimmer.cc'; fi`
+
+test/journal/unittest_journal-test_ObjectPlayer.o: test/journal/test_ObjectPlayer.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_journal_CXXFLAGS) $(CXXFLAGS) -MT test/journal/unittest_journal-test_ObjectPlayer.o -MD -MP -MF test/journal/$(DEPDIR)/unittest_journal-test_ObjectPlayer.Tpo -c -o test/journal/unittest_journal-test_ObjectPlayer.o `test -f 'test/journal/test_ObjectPlayer.cc' || echo '$(srcdir)/'`test/journal/test_ObjectPlayer.cc
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/journal/$(DEPDIR)/unittest_journal-test_ObjectPlayer.Tpo test/journal/$(DEPDIR)/unittest_journal-test_ObjectPlayer.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/journal/test_ObjectPlayer.cc' object='test/journal/unittest_journal-test_ObjectPlayer.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_journal_CXXFLAGS) $(CXXFLAGS) -c -o test/journal/unittest_journal-test_ObjectPlayer.o `test -f 'test/journal/test_ObjectPlayer.cc' || echo '$(srcdir)/'`test/journal/test_ObjectPlayer.cc
+
+test/journal/unittest_journal-test_ObjectPlayer.obj: test/journal/test_ObjectPlayer.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_journal_CXXFLAGS) $(CXXFLAGS) -MT test/journal/unittest_journal-test_ObjectPlayer.obj -MD -MP -MF test/journal/$(DEPDIR)/unittest_journal-test_ObjectPlayer.Tpo -c -o test/journal/unittest_journal-test_ObjectPlayer.obj `if test -f 'test/journal/test_ObjectPlayer.cc'; then $(CYGPATH_W) 'test/journal/test_ObjectPlayer.cc'; else $(CYGPATH_W) '$(srcdir)/test/journal/test_Objec [...]
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/journal/$(DEPDIR)/unittest_journal-test_ObjectPlayer.Tpo test/journal/$(DEPDIR)/unittest_journal-test_ObjectPlayer.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/journal/test_ObjectPlayer.cc' object='test/journal/unittest_journal-test_ObjectPlayer.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_journal_CXXFLAGS) $(CXXFLAGS) -c -o test/journal/unittest_journal-test_ObjectPlayer.obj `if test -f 'test/journal/test_ObjectPlayer.cc'; then $(CYGPATH_W) 'test/journal/test_ObjectPlayer.cc'; else $(CYGPATH_W) '$(srcdir)/test/journal/test_ObjectPlayer.cc'; fi`
+
+test/journal/unittest_journal-test_ObjectRecorder.o: test/journal/test_ObjectRecorder.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_journal_CXXFLAGS) $(CXXFLAGS) -MT test/journal/unittest_journal-test_ObjectRecorder.o -MD -MP -MF test/journal/$(DEPDIR)/unittest_journal-test_ObjectRecorder.Tpo -c -o test/journal/unittest_journal-test_ObjectRecorder.o `test -f 'test/journal/test_ObjectRecorder.cc' || echo '$(srcdir)/'`test/journal/test_ObjectRecorder.cc
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/journal/$(DEPDIR)/unittest_journal-test_ObjectRecorder.Tpo test/journal/$(DEPDIR)/unittest_journal-test_ObjectRecorder.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/journal/test_ObjectRecorder.cc' object='test/journal/unittest_journal-test_ObjectRecorder.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_journal_CXXFLAGS) $(CXXFLAGS) -c -o test/journal/unittest_journal-test_ObjectRecorder.o `test -f 'test/journal/test_ObjectRecorder.cc' || echo '$(srcdir)/'`test/journal/test_ObjectRecorder.cc
+
+test/journal/unittest_journal-test_ObjectRecorder.obj: test/journal/test_ObjectRecorder.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_journal_CXXFLAGS) $(CXXFLAGS) -MT test/journal/unittest_journal-test_ObjectRecorder.obj -MD -MP -MF test/journal/$(DEPDIR)/unittest_journal-test_ObjectRecorder.Tpo -c -o test/journal/unittest_journal-test_ObjectRecorder.obj `if test -f 'test/journal/test_ObjectRecorder.cc'; then $(CYGPATH_W) 'test/journal/test_ObjectRecorder.cc'; else $(CYGPATH_W) '$(srcdir)/test/journal/ [...]
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/journal/$(DEPDIR)/unittest_journal-test_ObjectRecorder.Tpo test/journal/$(DEPDIR)/unittest_journal-test_ObjectRecorder.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/journal/test_ObjectRecorder.cc' object='test/journal/unittest_journal-test_ObjectRecorder.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_journal_CXXFLAGS) $(CXXFLAGS) -c -o test/journal/unittest_journal-test_ObjectRecorder.obj `if test -f 'test/journal/test_ObjectRecorder.cc'; then $(CYGPATH_W) 'test/journal/test_ObjectRecorder.cc'; else $(CYGPATH_W) '$(srcdir)/test/journal/test_ObjectRecorder.cc'; fi`
+
+test/journal/unittest_journal-RadosTestFixture.o: test/journal/RadosTestFixture.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_journal_CXXFLAGS) $(CXXFLAGS) -MT test/journal/unittest_journal-RadosTestFixture.o -MD -MP -MF test/journal/$(DEPDIR)/unittest_journal-RadosTestFixture.Tpo -c -o test/journal/unittest_journal-RadosTestFixture.o `test -f 'test/journal/RadosTestFixture.cc' || echo '$(srcdir)/'`test/journal/RadosTestFixture.cc
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/journal/$(DEPDIR)/unittest_journal-RadosTestFixture.Tpo test/journal/$(DEPDIR)/unittest_journal-RadosTestFixture.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/journal/RadosTestFixture.cc' object='test/journal/unittest_journal-RadosTestFixture.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_journal_CXXFLAGS) $(CXXFLAGS) -c -o test/journal/unittest_journal-RadosTestFixture.o `test -f 'test/journal/RadosTestFixture.cc' || echo '$(srcdir)/'`test/journal/RadosTestFixture.cc
+
+test/journal/unittest_journal-RadosTestFixture.obj: test/journal/RadosTestFixture.cc
+@am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_journal_CXXFLAGS) $(CXXFLAGS) -MT test/journal/unittest_journal-RadosTestFixture.obj -MD -MP -MF test/journal/$(DEPDIR)/unittest_journal-RadosTestFixture.Tpo -c -o test/journal/unittest_journal-RadosTestFixture.obj `if test -f 'test/journal/RadosTestFixture.cc'; then $(CYGPATH_W) 'test/journal/RadosTestFixture.cc'; else $(CYGPATH_W) '$(srcdir)/test/journal/RadosTestFixtur [...]
+@am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/journal/$(DEPDIR)/unittest_journal-RadosTestFixture.Tpo test/journal/$(DEPDIR)/unittest_journal-RadosTestFixture.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	$(AM_V_CXX)source='test/journal/RadosTestFixture.cc' object='test/journal/unittest_journal-RadosTestFixture.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_journal_CXXFLAGS) $(CXXFLAGS) -c -o test/journal/unittest_journal-RadosTestFixture.obj `if test -f 'test/journal/RadosTestFixture.cc'; then $(CYGPATH_W) 'test/journal/RadosTestFixture.cc'; else $(CYGPATH_W) '$(srcdir)/test/journal/RadosTestFixture.cc'; fi`
+
 test/os/unittest_lfnindex-TestLFNIndex.o: test/os/TestLFNIndex.cc
 @am__fastdepCXX_TRUE@	$(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(unittest_lfnindex_CXXFLAGS) $(CXXFLAGS) -MT test/os/unittest_lfnindex-TestLFNIndex.o -MD -MP -MF test/os/$(DEPDIR)/unittest_lfnindex-TestLFNIndex.Tpo -c -o test/os/unittest_lfnindex-TestLFNIndex.o `test -f 'test/os/TestLFNIndex.cc' || echo '$(srcdir)/'`test/os/TestLFNIndex.cc
 @am__fastdepCXX_TRUE@	$(AM_V_at)$(am__mv) test/os/$(DEPDIR)/unittest_lfnindex-TestLFNIndex.Tpo test/os/$(DEPDIR)/unittest_lfnindex-TestLFNIndex.Po
@@ -24136,6 +25121,7 @@ clean-libtool:
 	-rm -rf client/.libs client/_libs
 	-rm -rf cls/cephfs/.libs cls/cephfs/_libs
 	-rm -rf cls/hello/.libs cls/hello/_libs
+	-rm -rf cls/journal/.libs cls/journal/_libs
 	-rm -rf cls/lock/.libs cls/lock/_libs
 	-rm -rf cls/log/.libs cls/log/_libs
 	-rm -rf cls/numops/.libs cls/numops/_libs
@@ -24161,6 +25147,7 @@ clean-libtool:
 	-rm -rf erasure-code/shec/.libs erasure-code/shec/_libs
 	-rm -rf global/.libs global/_libs
 	-rm -rf java/native/.libs java/native/_libs
+	-rm -rf journal/.libs journal/_libs
 	-rm -rf json_spirit/.libs json_spirit/_libs
 	-rm -rf key_value_store/.libs key_value_store/_libs
 	-rm -rf librados/.libs librados/_libs
@@ -24173,10 +25160,6 @@ clean-libtool:
 	-rm -rf msg/async/.libs msg/async/_libs
 	-rm -rf msg/simple/.libs msg/simple/_libs
 	-rm -rf msg/xio/.libs msg/xio/_libs
-	-rm -rf objclass/.libs objclass/_libs
-	-rm -rf os/.libs os/_libs
-	-rm -rf os/fs/.libs os/fs/_libs
-	-rm -rf os/newstore/.libs os/newstore/_libs
 	-rm -rf osd/.libs osd/_libs
 	-rm -rf osdc/.libs osdc/_libs
 	-rm -rf perfglue/.libs perfglue/_libs
@@ -24716,6 +25699,13 @@ unittest_librados_config.log: unittest_librados_config$(EXEEXT)
 	--log-file $$b.log --trs-file $$b.trs \
 	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
 	"$$tst" $(AM_TESTS_FD_REDIRECT)
+unittest_journal.log: unittest_journal$(EXEEXT)
+	@p='unittest_journal$(EXEEXT)'; \
+	b='unittest_journal'; \
+	$(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \
+	--log-file $$b.log --trs-file $$b.trs \
+	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
+	"$$tst" $(AM_TESTS_FD_REDIRECT)
 unittest_rbd_replay.log: unittest_rbd_replay$(EXEEXT)
 	@p='unittest_rbd_replay$(EXEEXT)'; \
 	b='unittest_rbd_replay'; \
@@ -25360,6 +26350,13 @@ test/osd/osd-scrub-repair.sh.log: test/osd/osd-scrub-repair.sh
 	--log-file $$b.log --trs-file $$b.trs \
 	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
 	"$$tst" $(AM_TESTS_FD_REDIRECT)
+test/osd/osd-scrub-snaps.sh.log: test/osd/osd-scrub-snaps.sh
+	@p='test/osd/osd-scrub-snaps.sh'; \
+	b='test/osd/osd-scrub-snaps.sh'; \
+	$(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \
+	--log-file $$b.log --trs-file $$b.trs \
+	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
+	"$$tst" $(AM_TESTS_FD_REDIRECT)
 test/osd/osd-config.sh.log: test/osd/osd-config.sh
 	@p='test/osd/osd-config.sh'; \
 	b='test/osd/osd-config.sh'; \
@@ -25374,6 +26371,13 @@ test/osd/osd-bench.sh.log: test/osd/osd-bench.sh
 	--log-file $$b.log --trs-file $$b.trs \
 	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
 	"$$tst" $(AM_TESTS_FD_REDIRECT)
+test/osd/osd-reactivate.sh.log: test/osd/osd-reactivate.sh
+	@p='test/osd/osd-reactivate.sh'; \
+	b='test/osd/osd-reactivate.sh'; \
+	$(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \
+	--log-file $$b.log --trs-file $$b.trs \
+	$(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \
+	"$$tst" $(AM_TESTS_FD_REDIRECT)
 test/osd/osd-copy-from.sh.log: test/osd/osd-copy-from.sh
 	@p='test/osd/osd-copy-from.sh'; \
 	b='test/osd/osd-copy-from.sh'; \
@@ -25573,6 +26577,8 @@ distclean-generic:
 	-rm -f cls/cephfs/$(am__dirstamp)
 	-rm -f cls/hello/$(DEPDIR)/$(am__dirstamp)
 	-rm -f cls/hello/$(am__dirstamp)
+	-rm -f cls/journal/$(DEPDIR)/$(am__dirstamp)
+	-rm -f cls/journal/$(am__dirstamp)
 	-rm -f cls/lock/$(DEPDIR)/$(am__dirstamp)
 	-rm -f cls/lock/$(am__dirstamp)
 	-rm -f cls/log/$(DEPDIR)/$(am__dirstamp)
@@ -25623,10 +26629,14 @@ distclean-generic:
 	-rm -f global/$(am__dirstamp)
 	-rm -f java/native/$(DEPDIR)/$(am__dirstamp)
 	-rm -f java/native/$(am__dirstamp)
+	-rm -f journal/$(DEPDIR)/$(am__dirstamp)
+	-rm -f journal/$(am__dirstamp)
 	-rm -f json_spirit/$(DEPDIR)/$(am__dirstamp)
 	-rm -f json_spirit/$(am__dirstamp)
 	-rm -f key_value_store/$(DEPDIR)/$(am__dirstamp)
 	-rm -f key_value_store/$(am__dirstamp)
+	-rm -f kv/$(DEPDIR)/$(am__dirstamp)
+	-rm -f kv/$(am__dirstamp)
 	-rm -f librados/$(DEPDIR)/$(am__dirstamp)
 	-rm -f librados/$(am__dirstamp)
 	-rm -f libradosstriper/$(DEPDIR)/$(am__dirstamp)
@@ -25677,6 +26687,8 @@ distclean-generic:
 	-rm -f test/bench/$(am__dirstamp)
 	-rm -f test/cls_hello/$(DEPDIR)/$(am__dirstamp)
 	-rm -f test/cls_hello/$(am__dirstamp)
+	-rm -f test/cls_journal/$(DEPDIR)/$(am__dirstamp)
+	-rm -f test/cls_journal/$(am__dirstamp)
 	-rm -f test/cls_lock/$(DEPDIR)/$(am__dirstamp)
 	-rm -f test/cls_lock/$(am__dirstamp)
 	-rm -f test/cls_log/$(DEPDIR)/$(am__dirstamp)
@@ -25707,6 +26719,8 @@ distclean-generic:
 	-rm -f test/filestore/$(am__dirstamp)
 	-rm -f test/fs/$(DEPDIR)/$(am__dirstamp)
 	-rm -f test/fs/$(am__dirstamp)
+	-rm -f test/journal/$(DEPDIR)/$(am__dirstamp)
+	-rm -f test/journal/$(am__dirstamp)
 	-rm -f test/libcephfs/$(DEPDIR)/$(am__dirstamp)
 	-rm -f test/libcephfs/$(am__dirstamp)
 	-rm -f test/librados/$(DEPDIR)/$(am__dirstamp)
@@ -25743,6 +26757,10 @@ distclean-generic:
 	-rm -f tools/cephfs/$(am__dirstamp)
 	-rm -f tools/rados/$(DEPDIR)/$(am__dirstamp)
 	-rm -f tools/rados/$(am__dirstamp)
+	-rm -f tools/rbd/$(DEPDIR)/$(am__dirstamp)
+	-rm -f tools/rbd/$(am__dirstamp)
+	-rm -f tools/rbd/action/$(DEPDIR)/$(am__dirstamp)
+	-rm -f tools/rbd/action/$(am__dirstamp)
 	-rm -f tracing/$(DEPDIR)/$(am__dirstamp)
 	-rm -f tracing/$(am__dirstamp)
 
@@ -25760,7 +26778,7 @@ clean-am: clean-binPROGRAMS clean-checkPROGRAMS \
 	clean-sbinPROGRAMS clean-su_sbinPROGRAMS mostlyclean-am
 
 distclean: distclean-recursive
-	-rm -rf ./$(DEPDIR) arch/$(DEPDIR) auth/$(DEPDIR) auth/cephx/$(DEPDIR) auth/none/$(DEPDIR) auth/unknown/$(DEPDIR) civetweb/src/$(DEPDIR) client/$(DEPDIR) cls/cephfs/$(DEPDIR) cls/hello/$(DEPDIR) cls/lock/$(DEPDIR) cls/log/$(DEPDIR) cls/numops/$(DEPDIR) cls/rbd/$(DEPDIR) cls/refcount/$(DEPDIR) cls/replica_log/$(DEPDIR) cls/rgw/$(DEPDIR) cls/statelog/$(DEPDIR) cls/timeindex/$(DEPDIR) cls/user/$(DEPDIR) cls/version/$(DEPDIR) common/$(DEPDIR) compressor/$(DEPDIR) crush/$(DEPDIR) erasure-cod [...]
+	-rm -rf ./$(DEPDIR) arch/$(DEPDIR) auth/$(DEPDIR) auth/cephx/$(DEPDIR) auth/none/$(DEPDIR) auth/unknown/$(DEPDIR) civetweb/src/$(DEPDIR) client/$(DEPDIR) cls/cephfs/$(DEPDIR) cls/hello/$(DEPDIR) cls/journal/$(DEPDIR) cls/lock/$(DEPDIR) cls/log/$(DEPDIR) cls/numops/$(DEPDIR) cls/rbd/$(DEPDIR) cls/refcount/$(DEPDIR) cls/replica_log/$(DEPDIR) cls/rgw/$(DEPDIR) cls/statelog/$(DEPDIR) cls/timeindex/$(DEPDIR) cls/user/$(DEPDIR) cls/version/$(DEPDIR) common/$(DEPDIR) compressor/$(DEPDIR) crush [...]
 	-rm -f Makefile
 distclean-am: clean-am distclean-compile distclean-generic \
 	distclean-hdr distclean-tags
@@ -25817,7 +26835,7 @@ install-ps-am:
 installcheck-am:
 
 maintainer-clean: maintainer-clean-recursive
-	-rm -rf ./$(DEPDIR) arch/$(DEPDIR) auth/$(DEPDIR) auth/cephx/$(DEPDIR) auth/none/$(DEPDIR) auth/unknown/$(DEPDIR) civetweb/src/$(DEPDIR) client/$(DEPDIR) cls/cephfs/$(DEPDIR) cls/hello/$(DEPDIR) cls/lock/$(DEPDIR) cls/log/$(DEPDIR) cls/numops/$(DEPDIR) cls/rbd/$(DEPDIR) cls/refcount/$(DEPDIR) cls/replica_log/$(DEPDIR) cls/rgw/$(DEPDIR) cls/statelog/$(DEPDIR) cls/timeindex/$(DEPDIR) cls/user/$(DEPDIR) cls/version/$(DEPDIR) common/$(DEPDIR) compressor/$(DEPDIR) crush/$(DEPDIR) erasure-cod [...]
+	-rm -rf ./$(DEPDIR) arch/$(DEPDIR) auth/$(DEPDIR) auth/cephx/$(DEPDIR) auth/none/$(DEPDIR) auth/unknown/$(DEPDIR) civetweb/src/$(DEPDIR) client/$(DEPDIR) cls/cephfs/$(DEPDIR) cls/hello/$(DEPDIR) cls/journal/$(DEPDIR) cls/lock/$(DEPDIR) cls/log/$(DEPDIR) cls/numops/$(DEPDIR) cls/rbd/$(DEPDIR) cls/refcount/$(DEPDIR) cls/replica_log/$(DEPDIR) cls/rgw/$(DEPDIR) cls/statelog/$(DEPDIR) cls/timeindex/$(DEPDIR) cls/user/$(DEPDIR) cls/version/$(DEPDIR) common/$(DEPDIR) compressor/$(DEPDIR) crush [...]
 	-rm -f Makefile
 maintainer-clean-am: distclean-am maintainer-clean-generic
 
@@ -25906,8 +26924,12 @@ export PYTHONPATH=$(top_srcdir)/src/pybind
 				HARDENING_CFLAGS += -fstack-protector
 		endif
 @CLANG_FALSE@	AM_COMMON_CFLAGS += -rdynamic
+@SOLARIS_TRUE@	AM_COMMON_CFLAGS += -Wno-unused-local-typedefs
 @CLANG_FALSE@	AM_CXXFLAGS += -Wstrict-null-sentinel
 
+# solaris hardening
+@SOLARIS_TRUE@	AM_CXXFLAGS += -lssp_nonshared
+
 @NO_GIT_VERSION_TRUE@export NO_VERSION="yes"
 
 all-local::
@@ -25935,6 +26957,12 @@ install-data-local::
 #	$(srcdir)/crush/mapper.h \
 #	$(srcdir)/crush/types.h
 
+# build rocksdb with its own makefile
+# for some stupid reason this needs -fPIC...
+# PORTABLE=1 fixes the aarch64 build (-march=native doesn't work there)
+@ENABLE_SERVER_TRUE@@WITH_SLIBROCKSDB_TRUE@rocksdb/librocksdb.a:
+@ENABLE_SERVER_TRUE@@WITH_SLIBROCKSDB_TRUE@	cd rocksdb && EXTRA_CXXFLAGS=-fPIC PORTABLE=1 make -j$(shell nproc) static_lib
+
 erasure-code/jerasure/ErasureCodePluginJerasure.cc: ./ceph_ver.h
 
 erasure-code/jerasure/ErasureCodePluginSelectJerasure.cc: ./ceph_ver.h
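
For readers skimming the generated Makefile: the new rule above builds the bundled rocksdb out of tree as a static archive. A minimal Python sketch of what the recipe does, assuming it runs from the source directory with the rocksdb submodule checked out:

    import multiprocessing
    import os
    import subprocess

    # mirror the Makefile recipe: -fPIC so the static archive can be linked
    # into shared objects, PORTABLE=1 to avoid -march=native on aarch64
    subprocess.check_call(
        ['make', '-j%d' % multiprocessing.cpu_count(), 'static_lib'],
        cwd='rocksdb',
        env=dict(os.environ, EXTRA_CXXFLAGS='-fPIC', PORTABLE='1'),
    )
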
@@ -25983,7 +27011,7 @@ docdir ?= ${datadir}/doc/ceph
 core-daemons: ceph-mon ceph-osd ceph-mds radosgw
 admin-tools: monmaptool osdmaptool crushtool ceph-authtool
 base: core-daemons admin-tools \
-	cephfs ceph-syn ceph-conf \
+	cephfs ceph-fuse ceph-syn ceph-conf \
 	rados radosgw-admin librados-config \
 	init-ceph ceph-post-file \
 	ceph
@@ -26074,7 +27102,6 @@ coverity-submit:
 @ENABLE_CLIENT_TRUE@	sed -ie "s|@PYTHON_EXECUTABLE@|/usr/bin/env python|" $@.tmp
 @ENABLE_CLIENT_TRUE@	grep CEPH_GIT_NICE_VER ./ceph_ver.h | cut -f 3 -d " " | sed s/\"//g | xargs -I "{}" sed -ie "s/@CEPH_GIT_NICE_VER@/{}/g" $@.tmp
 @ENABLE_CLIENT_TRUE@	grep CEPH_GIT_VER ./ceph_ver.h | cut -f 3 -d " " | sed s/\"//g | xargs -I "{}" sed -ie "s/@CEPH_GIT_VER@/{}/g" $@.tmp
- at ENABLE_CLIENT_TRUE@	cat $(srcdir)/$@.in >>$@.tmp
 @ENABLE_CLIENT_TRUE@	chmod a+x $@.tmp
 @ENABLE_CLIENT_TRUE@	chmod a-w $@.tmp
 @ENABLE_CLIENT_TRUE@	mv $@.tmp $@
diff --git a/src/acconfig.h.in b/src/acconfig.h.in
index 8b74b6c..0090f4f 100644
--- a/src/acconfig.h.in
+++ b/src/acconfig.h.in
@@ -125,6 +125,9 @@
 /* Define to 1 if you have the `boost_regex' library (-lboost_regex). */
 #undef HAVE_LIBBOOST_REGEX
 
+/* Define to 1 if you have the `boost_regex-mt' library (-lboost_regex-mt). */
+#undef HAVE_LIBBOOST_REGEX_MT
+
 /* Define to 1 if you have the `boost_system' library (-lboost_system). */
 #undef HAVE_LIBBOOST_SYSTEM
 
@@ -181,6 +184,9 @@
 /* Define to 1 if you have the <linux/version.h> header file. */
 #undef HAVE_LINUX_VERSION_H
 
+/* Define if you have mallinfo */
+#undef HAVE_MALLINFO
+
 /* Define to 1 if you have the <memory.h> header file. */
 #undef HAVE_MEMORY_H
 
@@ -280,6 +286,9 @@
 /* Define to 1 if you have the <syslog.h> header file. */
 #undef HAVE_SYSLOG_H
 
+/* Define to 1 if you have the <sys/cdefs.h> header file. */
+#undef HAVE_SYS_CDEFS_H
+
 /* Define to 1 if you have the <sys/dir.h> header file, and it defines `DIR'.
    */
 #undef HAVE_SYS_DIR_H
diff --git a/src/auth/cephx/CephxSessionHandler.cc b/src/auth/cephx/CephxSessionHandler.cc
index eaebd15..30ed852 100644
--- a/src/auth/cephx/CephxSessionHandler.cc
+++ b/src/auth/cephx/CephxSessionHandler.cc
@@ -41,8 +41,9 @@ int CephxSessionHandler::_calc_signature(Message *m, uint64_t *psig)
     __le32 middle_crc;
     __le32 data_crc;
   } __attribute__ ((packed)) sigblock = {
-    1, AUTH_ENC_MAGIC, 4*4,
-    header.crc, footer.front_crc, footer.middle_crc, footer.data_crc
+    1, mswab64(AUTH_ENC_MAGIC), mswab32(4*4),
+    mswab32(header.crc), mswab32(footer.front_crc),
+    mswab32(footer.middle_crc), mswab32(footer.data_crc)
   };
   bufferlist bl_plaintext;
   bl_plaintext.append(buffer::create_static(sizeof(sigblock), (char*)&sigblock));
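
The mswab*() calls above make the signature block byte-order independent: each field is stored little-endian regardless of the host CPU. A hedged Python illustration of the intended layout (the magic value and CRCs here are placeholders for illustration only):

    import struct

    AUTH_ENC_MAGIC = 0xff009cad8826aa55  # assumed value, for illustration
    header_crc, front_crc, middle_crc, data_crc = 1, 2, 3, 4  # placeholders

    # u8 v, le64 magic, le32 len, then four le32 CRCs -- packed, no padding
    sigblock = struct.pack('<BQ5I', 1, AUTH_ENC_MAGIC, 4 * 4,
                           header_crc, front_crc, middle_crc, data_crc)
    assert len(sigblock) == 29
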
diff --git a/src/bash_completion/rbd b/src/bash_completion/rbd
index 5e9a733..bef1670 100644
--- a/src/bash_completion/rbd
+++ b/src/bash_completion/rbd
@@ -8,41 +8,76 @@
 # License version 2.1, as published by the Free Software
 # Foundation.  See file COPYING.
 #
+_rbd_commands="" # lazy init
+_rbd_global_options="" # lazy init
 
 _rbd()
 {
-        local cur prev
+	if [ "x${_rbd_commands}" == "x" ]; then
+		local rbc="$(rbd bash-completion 2>/dev/null)"
+		_rbd_commands="$(echo ${rbc} | sed -e 's/|-[^|]*//g' -e 's/||*/|/g')"
+		_rbd_global_options="$(echo ${rbc} | sed -e 's/|[^-][^\|]*//g' -e 's/||*/|/g')"
+	fi
 
         COMPREPLY=()
-        cur="${COMP_WORDS[COMP_CWORD]}"
-        prev="${COMP_WORDS[COMP_CWORD-1]}"
-
-        if [[ ${cur} == -* ]] ; then
-            COMPREPLY=( $(compgen -W "-c --conf -m -d -f -p --pool --snap -i -o --image --dest --dest-pool --path --size --id --keyfile" -- ${cur}) )
-            return 0
-        fi
-
-        case "${prev}" in
-            --conf | -c | --path | --keyfile | --keyring)
-                COMPREPLY=( $(compgen -f ${cur}) )
-                return 0
-                ;;
-            -m)
-                COMPREPLY=( $(compgen -A hostname ${cur}) )
-                return 0
-                ;;
-            snap)
-                COMPREPLY=( $(compgen -W "ls create rollback rm purge protect unprotect" -- ${cur}) )
-                return 0
-                ;;
-            lock)
-                COMPREPLY=( $(compgen -W "list add remove" -- ${cur}) )
-                return 0
-                ;;
-            rbd)
-                COMPREPLY=( $(compgen -W "ls list info create clone flatten resize rm export import diff export-diff import-diff cp copy mv rename snap watch lock bench-write map unmap showmapped" -- ${cur}) )
-                return 0
-            ;;
-        esac
+
+	local arg_count=${#COMP_WORDS[@]}
+	local args=()
+	local help_request="false"
+	for (( i=1; i<arg_count; i++ )); do
+		word="${COMP_WORDS[i]}"
+		if [[ "x${word}" == "xhelp" && ${#args} == 0 ]]; then
+			# treat help request as a special case
+			help_request="true"
+			continue
+		elif [[ $(echo ${_rbd_global_options} | grep "|${word}|") ]]; then
+			# skip flag option
+			continue
+		elif [[ "x${word:0:1}" == "x-" ]]; then
+			# skip option with argument
+			let i=$i+1
+			continue
+		elif [[ "x${word}" == "x" ]]; then
+			# skip blank arguments
+			continue
+		fi
+
+		args+=("${word}")
+	done
+
+        local cur="${COMP_WORDS[COMP_CWORD]}"
+        local prev="${COMP_WORDS[COMP_CWORD-1]}"
+
+	local options_exp=${_rbd_global_options}
+	local command="${args[@]}"
+	local valid_command="false"
+	if [[ ${#args[@]} != 0 && "${args[-1]}" != "${cur}" &&
+	      $(echo "${_rbd_commands}" | grep -c "|${command}|") == 1 ]]; then
+		# combine global and command-specific options
+                local rbd_command_options="$(rbd bash-completion ${args[@]} 2>/dev/null)"
+                options_exp="${options_exp} ${rbd_command_options}"
+		valid_command="true"
+	fi
+
+	if [[ "x${cur}" == "xhelp" ]]; then
+		COMPREPLY=()
+	elif [[ "${options_exp}}" =~ "|${prev} path|" ]];  then
+		# perform path completion for path argument
+		COMPREPLY=($(compgen -f ${cur}))
+	elif [[ "${options_exp}}" =~ "|${prev} host|" ]];  then
+		# perform host completion for host argument
+		COMPREPLY=($(compgen -A hostname ${cur}))
+	elif [[ "${help_request}" == "false" && ( "x${cur:0:1}" == "x-" ||
+		( "x${cur}" == "x" && "${valid_command}" == "true" ) ) ]]; then
+		# all valid options for current command
+		options="$(echo ${options_exp} | sed -e 's/||*/ /g' -r -e 's/ (arg|path|host)//g')"
+		COMPREPLY=($(compgen -W "${options}" -- ${cur}))
+	elif [[ "${valid_command}" == "false" ]]; then
+		# search for valid command
+		[[ "x${command}" != "x" && "x${cur}" == "x" ]] && command="${command} "
+		COMPREPLY=($(echo ${_rbd_commands} | grep -o "|${command}[^ |]*" | \
+			uniq | sed -e 's/|//g' | awk -F' ' '{print $(NF)}'))
+	fi
 }
+
 complete -F _rbd rbd
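
The rewritten completion no longer hard-codes command and option lists: it asks `rbd bash-completion` for a pipe-delimited spec and splits it with the two sed expressions above. A rough Python equivalent of that split, assuming a spec of the form `|ls|snap create|--pool|-p|`:

    def split_spec(spec):
        # entries starting with '-' are global options; the rest are
        # (possibly multi-word) subcommands
        tokens = [t for t in spec.split('|') if t]
        commands = [t for t in tokens if not t.startswith('-')]
        options = [t for t in tokens if t.startswith('-')]
        return commands, options

    commands, options = split_spec('|ls|snap create|--pool|-p|')
    assert commands == ['ls', 'snap create']
    assert options == ['--pool', '-p']
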
diff --git a/src/ceph-disk b/src/ceph-disk
index 3f00951..7f4a009 100755
--- a/src/ceph-disk
+++ b/src/ceph-disk
@@ -51,13 +51,31 @@ Activate:
  - if encrypted, map the dmcrypt volume
  - mount the volume in a temp location
  - allocate an osd id (if needed)
+ - if deactivated, no-op (activate again with the --reactivate flag)
  - remount in the correct location /var/lib/ceph/osd/$cluster-$id
+ - remove the 'deactive' flag (with the --reactivate flag)
  - start ceph-osd
 
  - triggered by udev when it sees the OSD gpt partition type
  - triggered by admin 'ceph-disk activate <path>'
  - triggered on ceph service startup with 'ceph-disk activate-all'
 
+Deactivate:
+ - check partition type (support dmcrypt, mpath, normal)
+ - stop the ceph-osd service if needed (mark the OSD out with --mark-out)
+ - remove 'ready', 'active', and INIT-specific files
+ - create the 'deactive' flag
+ - unmount the device and remove the mount point
+ - if the partition type is dmcrypt, remove the data dmcrypt map.
+
+Destroy:
+ - check partition type (support dmcrypt, mpath, normal)
+ - remove OSD from CRUSH map
+ - remove OSD cephx key
+ - deallocate OSD ID
+ - if the partition type is dmcrypt, remove the journal dmcrypt map.
+ - destroy data (with --zap option)
+
 We rely on /dev/disk/by-partuuid to find partitions by their UUID;
 this is what the journal symlink inside the osd data volume normally
 points to.
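
The Deactivate flow above leaves a 'deactive' marker file in the OSD data directory; while it exists, plain activation refuses to proceed and Destroy becomes safe to run. A minimal sketch of that activation gate (path handling simplified):

    import os

    def check_activation_gate(osd_path, reactivate=False):
        # a 'deactive' marker blocks normal activation until the admin
        # re-runs activate with --reactivate
        if os.path.exists(os.path.join(osd_path, 'deactive')) and not reactivate:
            raise RuntimeError('OSD deactivated! reactivate with: --reactivate')
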
@@ -96,6 +114,14 @@ DMCRYPT_JOURNAL_TOBE_UUID = '89c57f98-2fe5-4dc0-89c1-35865ceff2be'
 DEFAULT_FS_TYPE = 'xfs'
 SYSFS = '/sys'
 
+"""
+OSD STATUS Definition
+"""
+OSD_STATUS_OUT_DOWN = 	0
+OSD_STATUS_OUT_UP = 	1
+OSD_STATUS_IN_DOWN =	2
+OSD_STATUS_IN_UP =	3
+
 MOUNT_OPTIONS = dict(
     btrfs='noatime,user_subvol_rm_allowed',
     # user_xattr is default ever since linux 2.6.39 / 3.0, but we'll
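
These four constants encode the OSD state in two bits: bit 1 (value 2) is set when the OSD is in, bit 0 (value 1) when it is up, which is how _check_osd_status() later in this patch builds its return value. Decoding them, for illustration:

    OSD_STATUS_OUT_DOWN, OSD_STATUS_OUT_UP, \
        OSD_STATUS_IN_DOWN, OSD_STATUS_IN_UP = range(4)

    def describe_status(code):
        # bit 1 -> in/out, bit 0 -> up/down
        return '%s/%s' % ('in' if code & 2 else 'out',
                          'up' if code & 1 else 'down')

    assert describe_status(OSD_STATUS_IN_UP) == 'in/up'
    assert describe_status(OSD_STATUS_OUT_DOWN) == 'out/down'
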
@@ -1194,9 +1220,9 @@ def get_free_partition_index(dev):
         'BYT;' not in lines):
         raise Error('parted output expected to contain one of ' +
                     'CHH; CYL; or BYT; : ' + lines)
-    if dev not in lines:
+    if os.path.realpath(dev) not in lines:
         raise Error('parted output expected to contain ' + dev + ': ' + lines)
-    _, partitions = lines.split(dev)
+    _, partitions = lines.split(os.path.realpath(dev))
     partition_numbers = extract_parted_partition_numbers(partitions)
     if partition_numbers:
         return max(partition_numbers) + 1
@@ -1896,7 +1922,7 @@ def main_prepare(args):
             raise Error('not a dir or block device', args.data)
         prepare_lock.release()  # noqa
 
-    except Error as e:
+    except Error:
         if journal_dm_keypath:
             try:
                 os.unlink(journal_dm_keypath)
@@ -2108,6 +2134,60 @@ def start_daemon(
         raise Error('ceph osd start failed', e)
 
 
+def stop_daemon(
+    cluster,
+    osd_id,
+    ):
+    LOG.debug('Stopping %s osd.%s...', cluster, osd_id)
+
+    path = (STATEDIR + '/osd/{cluster}-{osd_id}').format(
+        cluster=cluster, osd_id=osd_id)
+
+    try:
+        if os.path.exists(os.path.join(path,'upstart')):
+            command_check_call(
+                [
+                    '/sbin/initctl',
+                    'stop',
+                    'ceph-osd',
+                    'cluster={cluster}'.format(cluster=cluster),
+                    'id={osd_id}'.format(osd_id=osd_id),
+                    ],
+                )
+        elif os.path.exists(os.path.join(path, 'sysvinit')):
+            svc = which('service')
+            command_check_call(
+                [
+                    svc,
+                    'ceph',
+                    '--cluster',
+                    '{cluster}'.format(cluster=cluster),
+                    'stop',
+                    'osd.{osd_id}'.format(osd_id=osd_id),
+                    ],
+                )
+        elif os.path.exists(os.path.join(path, 'systemd')):
+            command_check_call(
+                [
+                    'systemctl',
+                    'disable',
+                    'ceph-osd@{osd_id}'.format(osd_id=osd_id),
+                    ],
+                )
+            command_check_call(
+                [
+                    'systemctl',
+                    'stop',
+                    'ceph-osd@{osd_id}'.format(osd_id=osd_id),
+                    ],
+                )
+        else:
+            raise Error('{cluster} osd.{osd_id} is not tagged with an '
+                        'init system'.format(cluster=cluster, osd_id=osd_id))
+    except subprocess.CalledProcessError as e:
+        raise Error('ceph osd stop failed', e)
+
+
 def detect_fstype(
     dev,
     ):
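
stop_daemon() above dispatches on a marker file that was left in the OSD data directory when the OSD was set up. A condensed sketch of that dispatch, with the actual stop commands elided:

    import os

    def init_tag(osd_path):
        # exactly one of these marker files is expected in the data dir
        for tag in ('upstart', 'sysvinit', 'systemd'):
            if os.path.exists(os.path.join(osd_path, tag)):
                return tag
        raise RuntimeError('%s is not tagged with an init system' % osd_path)
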
@@ -2117,7 +2197,7 @@ def detect_fstype(
             # we don't want stale cached results
             '-p',
             '-s', 'TYPE',
-            '-o' 'value',
+            '-o', 'value',
             '--',
             dev,
             ],
@@ -2132,6 +2212,7 @@ def mount_activate(
     init,
     dmcrypt,
     dmcrypt_key_dir,
+    reactivate=False,
     ):
 
     if dmcrypt:
@@ -2191,11 +2272,35 @@ def mount_activate(
 
     path = mount(dev=dev, fstype=fstype, options=mount_options)
 
+    # check whether the OSD was deactivated: a 'deactive' marker file
+    # blocks normal activation unless --reactivate was given.
+    if os.path.exists(os.path.join(path, 'deactive')):
+        # log before raising so that udev-triggered failures are easy to spot
+        if not reactivate:
+            unmount(path)
+            # unmap again: the dmcrypt mapping is recreated at boot even
+            # though the OSD is deactivated
+            if '/dev/mapper/' in dev:
+                part_uuid = dev.replace('/dev/mapper/', '')
+                dmcrypt_unmap(part_uuid)
+            LOG.info('OSD deactivated! reactivate with: --reactivate')
+            raise Error('OSD deactivated! reactivate with: --reactivate')
+        # remember that this OSD carries the 'deactive' marker
+        deactive = True
+    else:
+        deactive = False
+
     osd_id = None
     cluster = None
     try:
         (osd_id, cluster) = activate(path, activate_key_template, init)
 
+        # activation succeeded; if the OSD was deactivated and we were
+        # asked to reactivate it, remove the 'deactive' marker file
+        if deactive and reactivate:
+            os.remove(os.path.join(path, 'deactive'))
+            LOG.info('Removed `deactive` file.')
+
         # check if the disk is already active, or if something else is already
         # mounted there
         active = False
@@ -2422,6 +2527,7 @@ def main_activate(args):
                 init=args.mark_init,
                 dmcrypt=args.dmcrypt,
                 dmcrypt_key_dir=args.dmcrypt_key_dir,
+                reactivate=args.reactivate,
                 )
             osd_data = get_mount_point(cluster, osd_id)
 
@@ -2461,6 +2567,291 @@ def main_activate(args):
 
 ###########################
 
+def _mark_osd_out(cluster, osd_id):
+    LOG.info('Preparing to mark osd.%d out...', osd_id)
+    command(
+            [
+                'ceph',
+                'osd',
+                'out',
+                'osd.%d' % osd_id,
+                ],
+            )
+
+
+def _check_osd_status(cluster, osd_id):
+    """
+    report the osd status as a two-bit code (bit 1: in, bit 0: up):
+    00(0) : OSD OUT AND DOWN
+    01(1) : OSD OUT AND UP
+    10(2) : OSD IN AND DOWN
+    11(3) : OSD IN AND UP
+    """
+    LOG.info("Checking osd id: %s ..." % osd_id)
+    found = False
+    status_code = 0
+    out, ret = command(
+            [
+                'ceph',
+                'osd',
+                'dump',
+                '--cluster={cluster}'.format(
+                    cluster=cluster,
+                    ),
+                '--format',
+                'json',
+                ],
+            )
+    out_json = json.loads(out)
+    for item in out_json[u'osds']:
+        if item.get(u'osd') == int(osd_id):
+            found = True
+            if item.get(u'in') == 1:
+                status_code += 2
+            if item.get(u'up') == 1:
+                status_code += 1
+    if not found:
+        raise Error('Could not find osd.%s in the osd tree!' % osd_id)
+    return status_code
+
+
+def _remove_osd_directory_files(mounted_path, cluster):
+    """
+    Remove the 'ready', 'active', and INIT-specific files.
+    """
+    if os.path.exists(os.path.join(mounted_path, 'ready')):
+        os.remove(os.path.join(mounted_path, 'ready'))
+        LOG.info('Remove `ready` file.')
+    else:
+        LOG.info('`ready` file is already removed.')
+
+    if os.path.exists(os.path.join(mounted_path, 'active')):
+        os.remove(os.path.join(mounted_path, 'active'))
+        LOG.info('Removed `active` file.')
+    else:
+        LOG.info('`active` file is already removed.')
+
+    # remove the init-specific marker file (e.g. `upstart` or `sysvinit`)
+    conf_val = get_conf(
+        cluster=cluster,
+        variable='init'
+        )
+    if conf_val is not None:
+        init = conf_val
+    else:
+        init = init_get()
+    os.remove(os.path.join(mounted_path, init))
+    LOG.info('Removed `%s` file.', init)
+    return
+
+
+def main_deactivate(args):
+    osd_id = args.deactivate_by_id
+    path = args.path
+    target_dev = None
+    dmcrypt = False
+    devices = list_devices([])
+
+    # list all devices and find the one we need
+    for device in devices:
+        if 'partitions' in device:
+            for dev_part in device.get('partitions'):
+                if osd_id and \
+                   'whoami' in dev_part and \
+                   dev_part['whoami'] == osd_id:
+                    target_dev = dev_part
+                elif path and \
+                   'path' in dev_part and \
+                   dev_part['path'] == path:
+                    target_dev = dev_part
+    if not target_dev:
+        raise Error('Cannot find any matching device!')
+
+    # set up the variables we need
+    osd_id = target_dev['whoami']
+    part_type = target_dev['ptype']
+    mounted_path = target_dev['mount']
+    part_uuid = target_dev['uuid']
+    if part_type == DMCRYPT_OSD_UUID or \
+       part_type == DMCRYPT_LUKS_OSD_UUID:
+        dmcrypt = True
+
+    # Do not do anything if osd is already down.
+    status_code = _check_osd_status(args.cluster, osd_id)
+    if status_code == OSD_STATUS_IN_UP:
+        if args.mark_out is True:
+            _mark_osd_out(args.cluster, int(osd_id))
+        stop_daemon(args.cluster, osd_id)
+    elif status_code == OSD_STATUS_IN_DOWN:
+        if args.mark_out is True:
+            _mark_osd_out(args.cluster, int(osd_id))
+        LOG.info("OSD already out/down. Do not do anything now.")
+        return
+    elif status_code == OSD_STATUS_OUT_UP:
+        stop_daemon(args.cluster, osd_id)
+    elif status_code == OSD_STATUS_OUT_DOWN:
+        LOG.info("OSD already out/down. Do not do anything now.")
+        return
+
+    # remove 'ready', 'active', and INIT-specific files.
+    _remove_osd_directory_files(mounted_path, args.cluster)
+
+    # write the 'deactive' marker into the osd directory
+    with open(os.path.join(mounted_path, 'deactive'), 'w'):
+        path_set_context(os.path.join(mounted_path, 'deactive'))
+
+    unmount(mounted_path)
+    LOG.info("Umount `%s` successfully.", mounted_path)
+
+    # remove the dmcrypt mapping (if dmcrypt is in use)
+    if dmcrypt:
+        dmcrypt_unmap(part_uuid)
+
+    return
+
+###########################
+
+def _remove_from_crush_map(cluster, osd_id):
+    LOG.info("Prepare to remove osd.%s from crush map..." % osd_id)
+    command(
+            [
+                'ceph',
+                'osd',
+                'crush',
+                'remove',
+                'osd.%s' % osd_id,
+                ],
+            )
+
+def _delete_osd_auth_key(cluster, osd_id):
+    LOG.info("Prepare to delete osd.%s cephx key..." % osd_id)
+    command(
+            [
+                'ceph',
+                'auth',
+                'del',
+                'osd.%s' % osd_id,
+                ],
+            )
+
+def _deallocate_osd_id(cluster, osd_id):
+    LOG.info("Prepare to deallocate the osd-id: %s..." % osd_id)
+    command(
+            [
+                'ceph',
+                'osd',
+                'rm',
+                '%s' % osd_id,
+                ],
+            )
+
+def main_destroy(args):
+    osd_id = args.destroy_by_id
+    path = args.path
+    dmcrypt_key_dir = args.dmcrypt_key_dir
+    dmcrypt = False
+    target_dev = None
+
+    if path and not is_partition(path):
+        raise Error("It should input the partition dev!!")
+
+    devices = list_devices([])
+    for device in devices:
+        if 'partitions' in device:
+            for dev_part in device.get('partitions'):
+                """
+                re-map the unmapped device for check device information
+                we need more overhead if user pass the osd_id
+
+                the reason is we must re-map the dmcrypt map that we can
+                confirm the osd_id match with whoami
+                """
+                if path and 'path' in dev_part and \
+                   dev_part['path'] != path:
+                    continue
+                elif osd_id and 'whoami' in dev_part and \
+                     dev_part['whoami'] != osd_id:
+                    continue
+                elif path and dev_part['path'] == path and \
+                   not dev_part['dmcrypt']:
+                    target_dev = dev_part
+                    break
+                elif osd_id and 'whoami' in dev_part and \
+                     dev_part['whoami'] == osd_id and not dev_part['dmcrypt']:
+                    target_dev = dev_part
+                    break
+                elif dev_part['dmcrypt'] and \
+                     not dev_part['dmcrypt']['holders']:
+                    rawdev = dev_part['path']
+                    ptype = dev_part['ptype']
+                    if ptype in [DMCRYPT_OSD_UUID]:
+                        luks = False
+                        cryptsetup_parameters = ['--key-size', '256']
+                    elif ptype in [DMCRYPT_LUKS_OSD_UUID]:
+                        luks = True
+                        cryptsetup_parameters = []
+                    else:
+                        raise Error('Cannot identify the device partition type!')
+                    part_uuid = dev_part['uuid']
+                    dmcrypt_key_path = get_dmcrypt_key_path(part_uuid, dmcrypt_key_dir, luks)
+                    dev_path = dmcrypt_map(
+                            rawdev=rawdev,
+                            keypath=dmcrypt_key_path,
+                            _uuid=part_uuid,
+                            cryptsetup_parameters=cryptsetup_parameters,
+                            luks=luks,
+                            format_dev=False,
+                            )
+                    devices = list_devices([rawdev])
+                    for dev in devices:
+                        if (path and 'path' in dev and dev['path'] == path) or \
+                           (osd_id and 'whoami' in dev and dev['whoami'] == osd_id):
+                            dmcrypt = True
+                            target_dev = dev
+                            break
+                    dmcrypt_unmap(part_uuid)
+    if not target_dev:
+        raise Error('Cannot find any matching device!')
+    osd_id = target_dev['whoami']
+    dev_path = target_dev['path']
+    journal_part_uuid = target_dev['journal_uuid']
+    if target_dev['ptype'] == MPATH_OSD_UUID:
+        base_dev = get_partition_base_mpath(dev_path)
+    else:
+        base_dev = get_partition_base(dev_path)
+
+    # an OSD cannot be destroyed until it has been deactivated (down)
+    status_code = _check_osd_status(args.cluster, osd_id)
+    if status_code != OSD_STATUS_OUT_DOWN and \
+       status_code != OSD_STATUS_IN_DOWN:
+        raise Error("Could not destroy the active osd. (osd-id: %s)" % \
+                    osd_id)
+
+    # Remove OSD from crush map
+    _remove_from_crush_map(args.cluster, osd_id)
+
+    # Remove OSD cephx key
+    _delete_osd_auth_key(args.cluster, osd_id)
+
+    # Deallocate OSD ID
+    _deallocate_osd_id(args.cluster, osd_id)
+
+    # remove the journal dmcrypt mapping (if dmcrypt is in use)
+    if dmcrypt:
+        if journal_part_uuid:
+            dmcrypt_unmap(journal_part_uuid)
+
+    # with the --zap flag, erase the data and partitions on the
+    # underlying device
+    if args.zap is True:
+        # erase the osd data
+        LOG.info("Prepare to zap the device %s" % base_dev)
+        zap(base_dev)
+
+    return
+
+###########################
+
 def get_journal_osd_uuid(path):
     if not os.path.exists(path):
         raise Error('%s does not exist' % path)
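
Putting the two new subcommands together, a hypothetical decommissioning run against an OSD data partition (the device name here is assumed) would look like:

    import subprocess

    # stop the OSD, mark it out, and leave the 'deactive' marker behind
    subprocess.check_call(
        ['ceph-disk', 'deactivate', '--mark-out', '/dev/sdb1'])

    # once the OSD is down: remove it from CRUSH, delete its cephx key,
    # free the OSD id, and wipe the device
    subprocess.check_call(
        ['ceph-disk', 'destroy', '--zap', '/dev/sdb1'])
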
@@ -2738,6 +3129,8 @@ def list_format_dev_plain(dev, devices=[], prefix=''):
             desc = ['ceph data (dmcrypt %s)' % dmcrypt['type'], 'not currently mapped']
         elif len(dmcrypt['holders']) == 1:
             holder = '/dev/' + dmcrypt['holders'][0]
+            # re-list with the dm-x path
+            devices = list_devices([holder])
             def lookup_dev(devices, path):
                 for device in devices:
                     if device['path'] == path:
@@ -2851,8 +3244,8 @@ def list_dev(dev, uuid_map, journal_map):
 
     return info
 
-def list_devices(args):
-    partmap = list_all_partitions(args.path)
+def list_devices(path):
+    partmap = list_all_partitions(path)
 
     uuid_map = {}
     journal_map = {}
@@ -2912,7 +3305,7 @@ def list_devices(args):
     return devices
 
 def main_list(args):
-    devices = list_devices(args)
+    devices = list_devices(args.path)
     if args.format == 'json':
         print json.dumps(devices)
     else:
@@ -3250,6 +3643,8 @@ def parse_args(argv):
     make_activate_all_parser(subparsers)
     make_list_parser(subparsers)
     make_suppress_parser(subparsers)
+    make_deactivate_parser(subparsers)
+    make_destroy_parser(subparsers)
     make_zap_parser(subparsers)
     make_trigger_parser(subparsers)
 
@@ -3394,6 +3789,11 @@ def make_activate_parser(subparsers):
         default='/etc/ceph/dmcrypt-keys',
         help='directory where dm-crypt keys are stored',
         )
+    activate_parser.add_argument(
+        '--reactivate',
+        action='store_true', default=False,
+        help='activate a deactivated OSD',
+        )
     activate_parser.set_defaults(
         activate_key_template='{statedir}/bootstrap-osd/{cluster}.keyring',
         func=main_activate,
@@ -3501,6 +3901,68 @@ def make_suppress_parser(subparsers):
         )
     return suppress_parser
 
+def make_deactivate_parser(subparsers):
+    deactivate_parser = subparsers.add_parser('deactivate', help='Deactivate a Ceph OSD')
+    deactivate_parser.add_argument(
+        '--cluster',
+        metavar='NAME',
+        default='ceph',
+        help='cluster name to assign this disk to',
+        )
+    deactivate_parser.add_argument(
+        'path',
+        metavar='PATH',
+        nargs='?',
+        help='path to block device or directory',
+        )
+    deactivate_parser.add_argument(
+        '--deactivate-by-id',
+        metavar='<id>',
+        help='ID of OSD to deactivate'
+        )
+    deactivate_parser.add_argument(
+        '--mark-out',
+        action='store_true', default=False,
+        help='option to mark the osd out',
+        )
+    deactivate_parser.set_defaults(
+        func=main_deactivate,
+        )
+
+def make_destroy_parser(subparsers):
+    destroy_parser = subparsers.add_parser('destroy', help='Destroy a Ceph OSD')
+    destroy_parser.add_argument(
+        '--cluster',
+        metavar='NAME',
+        default='ceph',
+        help='cluster name to assign this disk to',
+        )
+    destroy_parser.add_argument(
+        'path',
+        metavar='PATH',
+        nargs='?',
+        help='path to block device or directory',
+        )
+    destroy_parser.add_argument(
+        '--destroy-by-id',
+        metavar='<id>',
+        help='ID of OSD to destroy'
+        )
+    destroy_parser.add_argument(
+        '--dmcrypt-key-dir',
+        metavar='KEYDIR',
+        default='/etc/ceph/dmcrypt-keys',
+        help='directory where dm-crypt keys are stored (leave at the default unless you know what you are doing)',
+        )
+    destroy_parser.add_argument(
+        '--zap',
+        action='store_true', default=False,
+        help='option to erase data and partition',
+        )
+    destroy_parser.set_defaults(
+        func=main_destroy,
+        )
+
 def make_zap_parser(subparsers):
     zap_parser = subparsers.add_parser('zap', help='Zap/erase/destroy a device\'s partition table (and contents)')
     zap_parser.add_argument(
@@ -3539,7 +4001,7 @@ def setup_logging(verbose, log_stdout):
     if log_stdout:
         ch = logging.StreamHandler(stream=sys.stdout)
         ch.setLevel(loglevel)
-        formatter = logging.Formatter('%(filename): %(message)s')
+        formatter = logging.Formatter('%(filename)s: %(message)s')
         ch.setFormatter(formatter)
         LOG.addHandler(ch)
         LOG.setLevel(loglevel)
diff --git a/src/ceph.in b/src/ceph.in
index c6c7c49..bd2a6dd 100755
--- a/src/ceph.in
+++ b/src/ceph.in
@@ -19,6 +19,7 @@ License version 2, as published by the Free Software
 Foundation.  See file COPYING.
 """
 
+import codecs
 import os
 import sys
 import platform
@@ -109,6 +110,12 @@ from ceph_daemon import DaemonWatcher, admin_socket
 verbose = False
 cluster_handle = None
 
+# Always use Unicode (UTF-8) for stdout
+raw_stdout = sys.__stdout__
+raw_stderr = sys.__stderr__
+sys.stdout = codecs.getwriter('utf-8')(raw_stdout)
+sys.stderr = codecs.getwriter('utf-8')(raw_stderr)
+
 ############################################################################
 
 def osdids():
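
The wrapper above routes all text output through a UTF-8 codecs writer while keeping the raw handles for binary payloads (used further down when outbuf is written). A self-contained sketch of the pattern on Python 2:

    import codecs
    import sys

    raw_stdout = sys.__stdout__
    sys.stdout = codecs.getwriter('utf-8')(raw_stdout)

    print u'sn\xe9l'           # unicode text is encoded by the writer
    raw_stdout.write('\xff')   # raw bytes bypass the codec entirely
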
@@ -394,14 +401,11 @@ def new_style_command(parsed_args, cmdargs, target, sigdict, inbuf, verbose):
             sig = cmd['sig']
             print '{0}: {1}'.format(cmdtag, concise_sig(sig))
 
-    got_command = False
-
-    if not got_command:
+    if True:
         if cmdargs:
             # Validate input args against list of sigs
             valid_dict = validate_command(sigdict, cmdargs, verbose)
             if valid_dict:
-                got_command = True
                 if parsed_args.output_format:
                     valid_dict['format'] = parsed_args.output_format
             else:
@@ -422,7 +426,11 @@ def new_style_command(parsed_args, cmdargs, target, sigdict, inbuf, verbose):
                 except Exception as e:
                     print >> sys.stderr, \
                             'error handling command target: {0}'.format(e)
-                    return 1, '', ''
+                    continue
+                if len(cmdargs) and cmdargs[0] == 'tell':
+                    print >> sys.stderr, \
+                          'Cannot use \'tell\' in interactive mode.'
+                    continue
                 valid_dict = validate_command(sigdict, cmdargs, verbose)
                 if valid_dict:
                     if parsed_args.output_format:
@@ -887,6 +895,8 @@ def main():
             if outs:
                 print >> sys.stderr, prefix + outs
 
+        sys.stdout.flush()
+
         if (parsed_args.output_file):
             outf.write(outbuf)
         else:
@@ -897,13 +907,16 @@ def main():
             if parsed_args.output_format and \
                parsed_args.output_format.startswith('json') and \
                not compat:
-                sys.stdout.write('\n')
+                raw_stdout.write('\n')
 
             # if we are prettifying things, normalize newlines.  sigh.
             if suffix != '':
                 outbuf = outbuf.rstrip()
             if outbuf != '':
-                sys.stdout.write(prefix + outbuf + suffix)
+                # Write directly to binary stdout
+                raw_stdout.write(prefix)
+                raw_stdout.write(outbuf)
+                raw_stdout.write(suffix)
 
         sys.stdout.flush()
 
diff --git a/src/ceph_fuse.cc b/src/ceph_fuse.cc
index 9a00c29..4d166eb 100644
--- a/src/ceph_fuse.cc
+++ b/src/ceph_fuse.cc
@@ -34,10 +34,6 @@ using namespace std;
 #include "global/global_init.h"
 #include "common/safe_io.h"
        
-#if !defined(DARWIN) && !defined(__FreeBSD__)
-#include <envz.h>
-#endif // DARWIN
-
 #include <sys/types.h>
 #include <fcntl.h>
 
@@ -103,14 +99,13 @@ int main(int argc, const char **argv, const char *envp[]) {
   }
 
   if (childpid == 0) {
+    if (restart_log)
+      g_ceph_context->_log->start();
     common_init_finish(g_ceph_context);
 
     //cout << "child, mounting" << std::endl;
     ::close(fd[0]);
 
-    if (restart_log)
-      g_ceph_context->_log->start();
-
     class RemountTest : public Thread {
     public:
       CephFuse *cfuse;
diff --git a/src/ceph_mds.cc b/src/ceph_mds.cc
index 91ff002..8b88f58 100644
--- a/src/ceph_mds.cc
+++ b/src/ceph_mds.cc
@@ -187,7 +187,7 @@ int main(int argc, const char **argv)
     exit(1);
 
   if (shadow != MDSMap::STATE_ONESHOT_REPLAY)
-    global_init_daemonize(g_ceph_context, 0);
+    global_init_daemonize(g_ceph_context);
   common_init_finish(g_ceph_context);
 
   // get monmap
diff --git a/src/ceph_mon.cc b/src/ceph_mon.cc
index 3180b65..edec5d9 100644
--- a/src/ceph_mon.cc
+++ b/src/ceph_mon.cc
@@ -339,7 +339,7 @@ int main(int argc, const char **argv)
     // resolve public_network -> public_addr
     pick_addresses(g_ceph_context, CEPH_PICK_ADDRESS_PUBLIC);
 
-    common_init_finish(g_ceph_context, flags);
+    common_init_finish(g_ceph_context);
 
     bufferlist monmapbl, osdmapbl;
     std::string error;
@@ -496,7 +496,7 @@ int main(int argc, const char **argv)
   // screwing us over
   Preforker prefork;
   if (!(flags & CINIT_FLAG_NO_DAEMON_ACTIONS)) {
-    if (global_init_prefork(g_ceph_context, 0) >= 0) {
+    if (global_init_prefork(g_ceph_context) >= 0) {
       string err_msg;
       err = prefork.prefork(err_msg);
       if (err < 0) {
@@ -749,7 +749,7 @@ int main(int argc, const char **argv)
   }
 
   if (g_conf->daemonize) {
-    global_init_postfork_finish(g_ceph_context, 0);
+    global_init_postfork_finish(g_ceph_context);
     prefork.daemonize();
   }
 
diff --git a/src/ceph_osd.cc b/src/ceph_osd.cc
index 68d2c91..7a429ff 100644
--- a/src/ceph_osd.cc
+++ b/src/ceph_osd.cc
@@ -535,7 +535,7 @@ int main(int argc, const char **argv)
     exit(1);
 
   // Set up crypto, daemonize, etc.
-  global_init_daemonize(g_ceph_context, 0);
+  global_init_daemonize(g_ceph_context);
   common_init_finish(g_ceph_context);
 
   TracepointProvider::initialize<osd_tracepoint_traits>(g_ceph_context);
diff --git a/src/ceph_syn.cc b/src/ceph_syn.cc
index 1d10fa2..b872b82 100644
--- a/src/ceph_syn.cc
+++ b/src/ceph_syn.cc
@@ -31,10 +31,6 @@ using namespace std;
 #include "common/ceph_argparse.h"
 #include "common/pick_address.h"
 
-#if !defined(DARWIN) && !defined(__FreeBSD__)
-#include <envz.h>
-#endif // DARWIN || __FreeBSD__
-
 #include <sys/types.h>
 #include <fcntl.h>
 
diff --git a/src/client/Client.cc b/src/client/Client.cc
index 546066a..aab3052 100644
--- a/src/client/Client.cc
+++ b/src/client/Client.cc
@@ -21,6 +21,7 @@
 #include <sys/stat.h>
 #include <sys/param.h>
 #include <fcntl.h>
+#include <sys/file.h>
 #include <sys/utsname.h>
 #include <sys/uio.h>
 
@@ -266,6 +267,9 @@ Client::Client(Messenger *m, MonClient *mc)
   _dir_vxattrs_name_size = _vxattrs_calcu_name_size(_dir_vxattrs);
   _file_vxattrs_name_size = _vxattrs_calcu_name_size(_file_vxattrs);
 
+  user_id = cct->_conf->client_mount_uid;
+  group_id = cct->_conf->client_mount_gid;
+
   lru.lru_set_max(cct->_conf->client_cache_size);
   lru.lru_set_midpoint(cct->_conf->client_cache_mid);
 
@@ -1471,7 +1475,7 @@ int Client::verify_reply_trace(int r,
 			 << " got_ino " << got_created_ino
 			 << " ino " << created_ino
 			 << dendl;
-	  r = _do_lookup(d->dir->parent_inode, d->name, &target);
+	  r = _do_lookup(d->dir->parent_inode, d->name, &target, uid, gid);
 	} else {
 	  // if the dentry is not linked, just do our best. see #5021.
 	  assert(0 == "how did this happen?  i want logs!");
@@ -1540,10 +1544,11 @@ int Client::make_request(MetaRequest *request,
   if (oldest_tid == 0 && request->get_op() != CEPH_MDS_OP_SETFILELOCK)\
     oldest_tid = tid;
 
-  if (uid < 0) {
-    uid = geteuid();
-    gid = getegid();
-  }
+  if (uid < 0)
+    uid = get_uid();
+  if (gid < 0)
+    gid = get_gid();
+
   request->set_caller_uid(uid);
   request->set_caller_gid(gid);
 
@@ -3021,6 +3026,9 @@ void Client::send_cap(Inode *in, MetaSession *session, Cap *cap,
 				   flush,
 				   cap->mseq,
                                    cap_epoch_barrier);
+  m->caller_uid = in->cap_dirtier_uid;
+  m->caller_gid = in->cap_dirtier_gid;
+
   m->head.issue_seq = cap->issue_seq;
   m->set_tid(flush_tid);
 
@@ -3291,7 +3299,12 @@ void Client::flush_snaps(Inode *in, bool all_again, CapSnap *again)
 
     capsnap->flush_tid = ++last_flush_tid;
     MClientCaps *m = new MClientCaps(CEPH_CAP_OP_FLUSHSNAP, in->ino, in->snaprealm->ino, 0, mseq,
-        cap_epoch_barrier);
+				     cap_epoch_barrier);
+    if (user_id >= 0)
+      m->caller_uid = user_id;
+    if (group_id >= 0)
+      m->caller_gid = group_id;
+
     m->set_client_tid(capsnap->flush_tid);
     m->head.snap_follows = p->first;
 
@@ -4484,6 +4497,11 @@ void Client::handle_cap_flush_ack(MetaSession *session, Inode *in, Cap *cap, MCl
       sync_cond.Signal();
   }
 
+  if (!dirty) {
+    in->cap_dirtier_uid = -1;
+    in->cap_dirtier_gid = -1;
+  }
+
   if (!cleaned) {
     ldout(cct, 10) << " tid " << m->get_client_tid() << " != any cap bit tids" << dendl;
   } else {
@@ -4988,8 +5006,6 @@ void Client::handle_command_reply(MCommandReply *m)
     *op.outs = m->rs;
   }
 
-  op.con->mark_down();
-
   if (op.on_finish) {
     op.on_finish->complete(m->r);
   }
@@ -5032,8 +5048,6 @@ int Client::mount(const std::string &mount_root, bool require_mds)
     }
   }
 
-  // hack: get+pin root inode.
-  //  fuse assumes it's always there.
   filepath fp(CEPH_INO_ROOT);
   if (!mount_root.empty())
     fp = filepath(mount_root.c_str());
@@ -5042,9 +5056,13 @@ int Client::mount(const std::string &mount_root, bool require_mds)
     req->set_filepath(fp);
     req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
     int res = make_request(req, -1, -1);
-    ldout(cct, 10) << "root getattr result=" << res << dendl;
-    if (res < 0)
+    if (res < 0) {
+      if (res == -EACCES && root) {
+	ldout(cct, 1) << __func__ << " EACCES on parent of mount point; quotas may not work" << dendl;
+	break;
+      }
       return res;
+    }
 
     if (fp.depth())
       fp.pop_dentry();
@@ -5052,7 +5070,6 @@ int Client::mount(const std::string &mount_root, bool require_mds)
       break;
   }
 
-  assert(root_ancestor->is_root());
   assert(root);
   _ll_get(root);
 
@@ -5299,7 +5316,8 @@ void Client::renew_caps(MetaSession *session)
 // ===============================================================
 // high level (POSIXy) interface
 
-int Client::_do_lookup(Inode *dir, const string& name, InodeRef *target)
+int Client::_do_lookup(Inode *dir, const string& name, InodeRef *target,
+		       int uid, int gid)
 {
   int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
   MetaRequest *req = new MetaRequest(op);
@@ -5311,12 +5329,13 @@ int Client::_do_lookup(Inode *dir, const string& name, InodeRef *target)
   req->head.args.getattr.mask = 0;
   ldout(cct, 10) << "_do_lookup on " << path << dendl;
 
-  int r = make_request(req, 0, 0, target);
+  int r = make_request(req, uid, gid, target);
   ldout(cct, 10) << "_do_lookup res is " << r << dendl;
   return r;
 }
 
-int Client::_lookup(Inode *dir, const string& dname, InodeRef *target)
+int Client::_lookup(Inode *dir, const string& dname, InodeRef *target,
+		    int uid, int gid)
 {
   int r = 0;
   Dentry *dn = NULL;
@@ -5397,7 +5416,7 @@ int Client::_lookup(Inode *dir, const string& dname, InodeRef *target)
     }
   }
 
-  r = _do_lookup(dir, dname, target);
+  r = _do_lookup(dir, dname, target, uid, gid);
   goto done;
 
  hit_dn:
@@ -5448,7 +5467,8 @@ int Client::get_or_create(Inode *dir, const char* name,
   return 0;
 }
 
-int Client::path_walk(const filepath& origpath, InodeRef *end, bool followsym)
+int Client::path_walk(const filepath& origpath, InodeRef *end, bool followsym,
+		      int uid, int gid)
 {
   filepath path = origpath;
   InodeRef cur;
@@ -5468,7 +5488,7 @@ int Client::path_walk(const filepath& origpath, InodeRef *end, bool followsym)
     ldout(cct, 10) << " " << i << " " << *cur << " " << dname << dendl;
     ldout(cct, 20) << "  (path is " << path << ")" << dendl;
     InodeRef next;
-    int r = _lookup(cur.get(), dname, &next);
+    int r = _lookup(cur.get(), dname, &next, uid, gid);
     if (r < 0)
       return r;
     // only follow trailing symlink if followsym.  always follow
@@ -5611,7 +5631,7 @@ int Client::mkdir(const char *relpath, mode_t mode)
   return _mkdir(dir.get(), name.c_str(), mode);
 }
 
-int Client::mkdirs(const char *relpath, mode_t mode)
+int Client::mkdirs(const char *relpath, mode_t mode, int uid, int gid)
 {
   Mutex::Locker lock(client_lock);
   ldout(cct, 10) << "Client::mkdirs " << relpath << dendl;
@@ -5626,8 +5646,9 @@ int Client::mkdirs(const char *relpath, mode_t mode)
   InodeRef cur, next;
   cur = cwd;
   for (i=0; i<path.depth(); ++i) {
-    r=_lookup(cur.get(), path[i].c_str(), &next);
-    if (r < 0) break;
+    r = _lookup(cur.get(), path[i].c_str(), &next, uid, gid);
+    if (r < 0)
+      break;
     cur.swap(next);
   }
   //check that we have work left to do
@@ -5640,8 +5661,8 @@ int Client::mkdirs(const char *relpath, mode_t mode)
     r = _mkdir(cur.get(), path[i].c_str(), mode);
     //check proper creation/existence
     if (r < 0) return r;
-    r = _lookup(cur.get(), path[i], &next);
-    if(r < 0) {
+    r = _lookup(cur.get(), path[i], &next, uid, gid);
+    if (r < 0) {
       ldout(cct, 0) << "mkdirs: successfully created new directory " << path[i]
 	      << " but can't _lookup it!" << dendl;
       return r;
@@ -5772,11 +5793,40 @@ int Client::_setattr(Inode *in, struct stat *attr, int mask, int uid, int gid,
       is_quota_bytes_exceeded(in, (unsigned long)attr->st_size - in->size)) {
     return -EDQUOT;
   }
+
+  if (uid < 0) {
+    uid = get_uid();
+    gid = get_gid();
+  }
+
   // make the change locally?
+  if ((in->cap_dirtier_uid >= 0 && uid != in->cap_dirtier_uid) ||
+      (in->cap_dirtier_gid >= 0 && gid != in->cap_dirtier_gid)) {
+    ldout(cct, 10) << __func__ << " caller " << uid << ":" << gid
+		   << " != cap dirtier " << in->cap_dirtier_uid << ":"
+		   << in->cap_dirtier_gid << ", forcing sync setattr"
+		   << dendl;
+    /*
+     * This works because we implicitly flush the caps as part of the
+     * request, so the cap update check will happen with the writeback
+     * cap context, and then the setattr check will happen with the
+     * caller's context.
+     *
+     * In reality this pattern is likely pretty rare (different users
+     * setattr'ing the same file).  If that turns out not to be the
+     * case later, we can build a more complex pipelined cap writeback
+     * infrastructure...
+     */
+    if (!mask)
+      mask |= CEPH_SETATTR_CTIME;
+    goto force_request;
+  }
 
   if (!mask) {
     // caller just needs us to bump the ctime
     in->ctime = ceph_clock_now(cct);
+    in->cap_dirtier_uid = uid;
+    in->cap_dirtier_gid = gid;
     if (issued & CEPH_CAP_AUTH_EXCL)
       mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
     else if (issued & CEPH_CAP_FILE_EXCL)
@@ -5790,6 +5840,8 @@ int Client::_setattr(Inode *in, struct stat *attr, int mask, int uid, int gid,
   if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
     if (mask & CEPH_SETATTR_MODE) {
       in->ctime = ceph_clock_now(cct);
+      in->cap_dirtier_uid = uid;
+      in->cap_dirtier_gid = gid;
       in->mode = (in->mode & ~07777) | (attr->st_mode & 07777);
       mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
       mask &= ~CEPH_SETATTR_MODE;
@@ -5797,6 +5849,8 @@ int Client::_setattr(Inode *in, struct stat *attr, int mask, int uid, int gid,
     }
     if (mask & CEPH_SETATTR_UID) {
       in->ctime = ceph_clock_now(cct);
+      in->cap_dirtier_uid = uid;
+      in->cap_dirtier_gid = gid;
       in->uid = attr->st_uid;
       mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
       mask &= ~CEPH_SETATTR_UID;
@@ -5804,6 +5858,8 @@ int Client::_setattr(Inode *in, struct stat *attr, int mask, int uid, int gid,
     }
     if (mask & CEPH_SETATTR_GID) {
       in->ctime = ceph_clock_now(cct);
+      in->cap_dirtier_uid = uid;
+      in->cap_dirtier_gid = gid;
       in->gid = attr->st_gid;
       mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
       mask &= ~CEPH_SETATTR_GID;
@@ -5817,6 +5873,8 @@ int Client::_setattr(Inode *in, struct stat *attr, int mask, int uid, int gid,
       if (mask & CEPH_SETATTR_ATIME)
         in->atime = utime_t(stat_get_atime_sec(attr), stat_get_atime_nsec(attr));
       in->ctime = ceph_clock_now(cct);
+      in->cap_dirtier_uid = uid;
+      in->cap_dirtier_gid = gid;
       in->time_warp_seq++;
       mark_caps_dirty(in, CEPH_CAP_FILE_EXCL);
       mask &= ~(CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME);
@@ -5825,6 +5883,7 @@ int Client::_setattr(Inode *in, struct stat *attr, int mask, int uid, int gid,
   if (!mask)
     return 0;
 
+force_request:
   MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETATTR);
 
   filepath path;
@@ -6203,6 +6262,8 @@ int Client::_opendir(Inode *in, dir_result_t **dirpp, int uid, int gid)
     (*dirpp)->ordered_count = in->dir->ordered_count;
   }
   (*dirpp)->start_shared_gen = in->shared_gen;
+  (*dirpp)->owner_uid = uid;
+  (*dirpp)->owner_gid = gid;
   ldout(cct, 10) << "_opendir " << in->ino << ", our cache says the first dirfrag is " << (*dirpp)->frag() << dendl;
   ldout(cct, 3) << "_opendir(" << in->ino << ") = " << 0 << " (" << *dirpp << ")" << dendl;
   return 0;
@@ -6364,7 +6425,7 @@ int Client::_readdir_get_frag(dir_result_t *dirp)
   
   
   bufferlist dirbl;
-  int res = make_request(req, -1, -1, NULL, NULL, -1, &dirbl);
+  int res = make_request(req, dirp->owner_uid, dirp->owner_gid, NULL, NULL, -1, &dirbl);
   
   if (res == -EAGAIN) {
     ldout(cct, 10) << "_readdir_get_frag got EAGAIN, retrying" << dendl;
@@ -6882,8 +6943,8 @@ int Client::open(const char *relpath, int flags, mode_t mode, int stripe_unit,
 
   if (!created) {
     // posix says we can only check permissions of existing files
-    uid_t uid = geteuid();
-    gid_t gid = getegid();
+    uid_t uid = get_uid();
+    gid_t gid = get_gid();
     r = check_permissions(in.get(), flags, uid, gid);
     if (r < 0)
       goto out;
@@ -8694,7 +8755,7 @@ int Client::ll_lookup(Inode *parent, const char *name, struct stat *attr,
   InodeRef in;
   int r = 0;
 
-  r = _lookup(parent, dname, &in);
+  r = _lookup(parent, dname, &in, uid, gid);
   if (r < 0) {
     attr->st_ino = 0;
     goto out;
@@ -8911,7 +8972,7 @@ int Client::getxattr(const char *path, const char *name, void *value, size_t siz
   int r = Client::path_walk(path, &in, true);
   if (r < 0)
     return r;
-  return Client::_getxattr(in.get(), name, value, size, getuid(), getgid());
+  return Client::_getxattr(in.get(), name, value, size);
 }
 
 int Client::lgetxattr(const char *path, const char *name, void *value, size_t size)
@@ -8921,7 +8982,7 @@ int Client::lgetxattr(const char *path, const char *name, void *value, size_t si
   int r = Client::path_walk(path, &in, false);
   if (r < 0)
     return r;
-  return Client::_getxattr(in.get(), name, value, size, getuid(), getgid());
+  return Client::_getxattr(in.get(), name, value, size);
 }
 
 int Client::fgetxattr(int fd, const char *name, void *value, size_t size)
@@ -8930,7 +8991,7 @@ int Client::fgetxattr(int fd, const char *name, void *value, size_t size)
   Fh *f = get_filehandle(fd);
   if (!f)
     return -EBADF;
-  return Client::_getxattr(f->inode.get(), name, value, size, getuid(), getgid());
+  return Client::_getxattr(f->inode.get(), name, value, size);
 }
 
 int Client::listxattr(const char *path, char *list, size_t size)
@@ -8940,7 +9001,7 @@ int Client::listxattr(const char *path, char *list, size_t size)
   int r = Client::path_walk(path, &in, true);
   if (r < 0)
     return r;
-  return Client::_listxattr(in.get(), list, size, getuid(), getgid());
+  return Client::_listxattr(in.get(), list, size);
 }
 
 int Client::llistxattr(const char *path, char *list, size_t size)
@@ -8950,7 +9011,7 @@ int Client::llistxattr(const char *path, char *list, size_t size)
   int r = Client::path_walk(path, &in, false);
   if (r < 0)
     return r;
-  return Client::_listxattr(in.get(), list, size, getuid(), getgid());
+  return Client::_listxattr(in.get(), list, size);
 }
 
 int Client::flistxattr(int fd, char *list, size_t size)
@@ -8959,7 +9020,7 @@ int Client::flistxattr(int fd, char *list, size_t size)
   Fh *f = get_filehandle(fd);
   if (!f)
     return -EBADF;
-  return Client::_listxattr(f->inode.get(), list, size, getuid(), getgid());
+  return Client::_listxattr(f->inode.get(), list, size);
 }
 
 int Client::removexattr(const char *path, const char *name)
@@ -8969,7 +9030,7 @@ int Client::removexattr(const char *path, const char *name)
   int r = Client::path_walk(path, &in, true);
   if (r < 0)
     return r;
-  return Client::_removexattr(in.get(), name, getuid(), getgid());
+  return Client::_removexattr(in.get(), name);
 }
 
 int Client::lremovexattr(const char *path, const char *name)
@@ -8979,7 +9040,7 @@ int Client::lremovexattr(const char *path, const char *name)
   int r = Client::path_walk(path, &in, false);
   if (r < 0)
     return r;
-  return Client::_removexattr(in.get(), name, getuid(), getgid());
+  return Client::_removexattr(in.get(), name);
 }
 
 int Client::fremovexattr(int fd, const char *name)
@@ -8988,7 +9049,7 @@ int Client::fremovexattr(int fd, const char *name)
   Fh *f = get_filehandle(fd);
   if (!f)
     return -EBADF;
-  return Client::_removexattr(f->inode.get(), name, getuid(), getgid());
+  return Client::_removexattr(f->inode.get(), name);
 }
 
 int Client::setxattr(const char *path, const char *name, const void *value, size_t size, int flags)
@@ -8998,7 +9059,7 @@ int Client::setxattr(const char *path, const char *name, const void *value, size
   int r = Client::path_walk(path, &in, true);
   if (r < 0)
     return r;
-  return Client::_setxattr(in.get(), name, value, size, flags, getuid(), getgid());
+  return Client::_setxattr(in.get(), name, value, size, flags);
 }
 
 int Client::lsetxattr(const char *path, const char *name, const void *value, size_t size, int flags)
@@ -9008,7 +9069,7 @@ int Client::lsetxattr(const char *path, const char *name, const void *value, siz
   int r = Client::path_walk(path, &in, false);
   if (r < 0)
     return r;
-  return Client::_setxattr(in.get(), name, value, size, flags, getuid(), getgid());
+  return Client::_setxattr(in.get(), name, value, size, flags);
 }
 
 int Client::fsetxattr(int fd, const char *name, const void *value, size_t size, int flags)
@@ -9017,7 +9078,7 @@ int Client::fsetxattr(int fd, const char *name, const void *value, size_t size,
   Fh *f = get_filehandle(fd);
   if (!f)
     return -EBADF;
-  return Client::_setxattr(f->inode.get(), name, value, size, flags, getuid(), getgid());
+  return Client::_setxattr(f->inode.get(), name, value, size, flags);
 }
 
 int Client::_getxattr(Inode *in, const char *name, void *value, size_t size,
@@ -9866,7 +9927,7 @@ int Client::_unlink(Inode *dir, const char *name, int uid, int gid)
   req->dentry_drop = CEPH_CAP_FILE_SHARED;
   req->dentry_unless = CEPH_CAP_FILE_EXCL;
 
-  res = _lookup(dir, name, &otherin);
+  res = _lookup(dir, name, &otherin, uid, gid);
   if (res < 0)
     goto fail;
   req->set_other_inode(otherin.get());
@@ -9924,7 +9985,7 @@ int Client::_rmdir(Inode *dir, const char *name, int uid, int gid)
   int res = get_or_create(dir, name, &de);
   if (res < 0)
     goto fail;
-  res = _lookup(dir, name, &in);
+  res = _lookup(dir, name, &in, uid, gid);
   if (res < 0)
     goto fail;
   if (req->get_op() == CEPH_MDS_OP_RMDIR) {
@@ -10014,13 +10075,13 @@ int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const ch
     req->dentry_unless = CEPH_CAP_FILE_EXCL;
 
     InodeRef oldin, otherin;
-    res = _lookup(fromdir, fromname, &oldin);
+    res = _lookup(fromdir, fromname, &oldin, uid, gid);
     if (res < 0)
       goto fail;
     req->set_old_inode(oldin.get());
     req->old_inode_drop = CEPH_CAP_LINK_SHARED;
 
-    res = _lookup(todir, toname, &otherin);
+    res = _lookup(todir, toname, &otherin, uid, gid);
     if (res != 0 && res != -ENOENT) {
       goto fail;
     } else if (res == 0) {
@@ -10249,7 +10310,7 @@ int Client::ll_opendir(Inode *in, dir_result_t** dirpp, int uid, int gid)
   if (vino.snapid == CEPH_SNAPDIR) {
     *dirpp = new dir_result_t(in);
   } else {
-    r = _opendir(in, dirpp);
+    r = _opendir(in, dirpp, uid, gid);
   }
 
   tout(cct) << (unsigned long)*dirpp << std::endl;
@@ -10294,8 +10355,8 @@ int Client::ll_open(Inode *in, int flags, Fh **fhp, int uid, int gid)
 
   int r;
   if (uid < 0) {
-    uid = geteuid();
-    gid = getegid();
+    uid = get_uid();
+    gid = get_gid();
   }
   if (!cct->_conf->fuse_default_permissions) {
     r = check_permissions(in, flags, uid, gid);
@@ -10331,7 +10392,7 @@ int Client::ll_create(Inode *parent, const char *name, mode_t mode,
 
   bool created = false;
   InodeRef in;
-  int r = _lookup(parent, name, &in);
+  int r = _lookup(parent, name, &in, uid, gid);
 
   if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
     return -EEXIST;
@@ -11283,27 +11344,34 @@ int Client::check_pool_perm(Inode *in, int need)
   }
 
   if (!have) {
+    if (in->snapid != CEPH_NOSNAP) {
+      // the pool permission check needs to write to the first object, but
+      // for a snapshot the head of the first object may already have been
+      // deleted. To avoid creating an orphan object, skip the check for now.
+      return 0;
+    }
+
     pool_perms[pool] = POOL_CHECKING;
 
     char oid_buf[32];
     snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (unsigned long long)in->ino);
     object_t oid = oid_buf;
 
+    SnapContext nullsnapc;
+
     C_SaferCond rd_cond;
     ObjectOperation rd_op;
     rd_op.stat(NULL, (utime_t*)NULL, NULL);
 
     objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), rd_op,
-		     in->snaprealm->get_snap_context(), ceph_clock_now(cct), 0,
-		     &rd_cond, NULL);
+		     nullsnapc, ceph_clock_now(cct), 0, &rd_cond, NULL);
 
     C_SaferCond wr_cond;
     ObjectOperation wr_op;
     wr_op.create(true);
 
     objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), wr_op,
-		     in->snaprealm->get_snap_context(), ceph_clock_now(cct), 0,
-		     &wr_cond, NULL);
+		     nullsnapc, ceph_clock_now(cct), 0, &wr_cond, NULL);
 
     client_lock.Unlock();
     int rd_ret = rd_cond.wait();
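
The change above decouples the pool-permission probe from the inode's snap
context: both the stat and the create are now issued against an empty
SnapContext, so the probe can never write a snapshotted clone, and snapshot
inodes skip the check entirely. A minimal sketch of the empty-context idea,
assuming Ceph's common SnapContext type:

    #include <cassert>
    #include "common/snap_types.h"

    // Sketch: a default-constructed SnapContext is the empty one (seq == 0,
    // no snaps), so writes guarded by it touch only the object HEAD and
    // cannot instantiate snapshotted clones.
    void empty_snap_context() {
      SnapContext nullsnapc;
      assert(nullsnapc.is_valid());   // the empty context is always valid
    }
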
diff --git a/src/client/Client.h b/src/client/Client.h
index d92609f..0482360 100644
--- a/src/client/Client.h
+++ b/src/client/Client.h
@@ -171,6 +171,8 @@ struct dir_result_t {
 
 
   InodeRef inode;
+  int owner_uid;
+  int owner_gid;
 
   int64_t offset;        // high bits: frag_t, low bits: an offset
 
@@ -260,11 +262,24 @@ class Client : public Dispatcher, public md_config_obs_t {
 public:
   void tick();
 
- protected:
+protected:
   MonClient *monclient;
   Messenger *messenger;  
   client_t whoami;
 
+  int user_id, group_id;
+
+  int get_uid() {
+    if (user_id >= 0)
+      return user_id;
+    return ::geteuid();
+  }
+  int get_gid() {
+    if (group_id >= 0)
+      return group_id;
+    return ::getegid();
+  }
+
   void set_cap_epoch_barrier(epoch_t e);
   epoch_t cap_epoch_barrier;
 
@@ -455,7 +470,8 @@ protected:
 
   // path traversal for high-level interface
   InodeRef cwd;
-  int path_walk(const filepath& fp, InodeRef *end, bool followsym=true);
+  int path_walk(const filepath& fp, InodeRef *end, bool followsym=true,
+		int uid=-1, int gid=-1);
   int fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat=0, nest_info_t *rstat=0);
   int fill_stat(InodeRef& in, struct stat *st, frag_info_t *dirstat=0, nest_info_t *rstat=0) {
     return fill_stat(in.get(), st, dirstat, rstat);
@@ -683,8 +699,8 @@ private:
 
   // internal interface
   //   call these with client_lock held!
-  int _do_lookup(Inode *dir, const string& name, InodeRef *target);
-  int _lookup(Inode *dir, const string& dname, InodeRef *target);
+  int _do_lookup(Inode *dir, const string& name, InodeRef *target, int uid, int gid);
+  int _lookup(Inode *dir, const string& dname, InodeRef *target, int uid, int gid);
 
   int _link(Inode *in, Inode *dir, const char *name, int uid=-1, int gid=-1, InodeRef *inp = 0);
   int _unlink(Inode *dir, const char *name, int uid=-1, int gid=-1);
@@ -849,7 +865,7 @@ public:
 
   // dirs
   int mkdir(const char *path, mode_t mode);
-  int mkdirs(const char *path, mode_t mode);
+  int mkdirs(const char *path, mode_t mode, int uid=-1, int gid=-1);
   int rmdir(const char *path);
 
   // symlinks
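
The header changes follow one convention throughout this patch: an identity
parameter of -1 means "not supplied", and the new get_uid()/get_gid() helpers
fall back to the process's effective ids when no override was configured. A
standalone sketch of the pattern (the struct and names are illustrative, not
the actual Client members):

    #include <unistd.h>

    // Illustrative sketch of the -1-means-default convention used by
    // get_uid()/get_gid() and the int uid=-1, gid=-1 parameters above.
    struct IdOverride {
      int user_id = -1;    // -1: not configured
      int group_id = -1;

      int uid() const { return user_id >= 0 ? user_id : int(::geteuid()); }
      int gid() const { return group_id >= 0 ? group_id : int(::getegid()); }
    };
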
diff --git a/src/client/Inode.h b/src/client/Inode.h
index 07061e2..a13fd98 100644
--- a/src/client/Inode.h
+++ b/src/client/Inode.h
@@ -234,6 +234,8 @@ struct Inode {
   // per-mds caps
   map<mds_rank_t, Cap*> caps;            // mds -> Cap
   Cap *auth_cap;
+  int64_t cap_dirtier_uid;
+  int64_t cap_dirtier_gid;
   unsigned dirty_caps, flushing_caps;
   std::map<ceph_tid_t, int> flushing_cap_tids;
   int shared_gen, cache_gen;
@@ -304,6 +306,7 @@ struct Inode {
       flags(0),
       qtree(NULL),
       dir_hashed(false), dir_replicated(false), auth_cap(NULL),
+      cap_dirtier_uid(-1), cap_dirtier_gid(-1),
       dirty_caps(0), flushing_caps(0), shared_gen(0), cache_gen(0),
       snap_caps(0), snap_cap_refs(0),
       cap_item(this), flushing_cap_item(this),
diff --git a/src/client/ObjecterWriteback.h b/src/client/ObjecterWriteback.h
index 8c05e96..69a9806 100644
--- a/src/client/ObjecterWriteback.h
+++ b/src/client/ObjecterWriteback.h
@@ -30,9 +30,10 @@ class ObjecterWriteback : public WritebackHandler {
   }
 
   virtual ceph_tid_t write(const object_t& oid, const object_locator_t& oloc,
-		      uint64_t off, uint64_t len, const SnapContext& snapc,
-		      const bufferlist &bl, utime_t mtime, uint64_t trunc_size,
-		      __u32 trunc_seq, Context *oncommit) {
+                           uint64_t off, uint64_t len, const SnapContext& snapc,
+                           const bufferlist &bl, utime_t mtime,
+                           uint64_t trunc_size, __u32 trunc_seq,
+                           ceph_tid_t journal_tid, Context *oncommit) {
     return m_objecter->write_trunc(oid, oloc, off, len, snapc, bl, mtime, 0,
 				   trunc_size, trunc_seq, NULL,
 				   new C_OnFinisher(new C_Lock(m_lock, oncommit),
diff --git a/src/client/SyntheticClient.cc b/src/client/SyntheticClient.cc
index fa92f62..736448c 100644
--- a/src/client/SyntheticClient.cc
+++ b/src/client/SyntheticClient.cc
@@ -2400,7 +2400,7 @@ int SyntheticClient::object_rw(int nobj, int osize, int wrpc,
 
 int SyntheticClient::read_random(string& fn, int size, int rdsize)   // size is in MB, rdsize in bytes
 {
-  __uint64_t chunks = (__uint64_t)size * (__uint64_t)(1024*1024) / (__uint64_t)rdsize;
+  uint64_t chunks = (uint64_t)size * (uint64_t)(1024*1024) / (uint64_t)rdsize;
 
   int fd = client->open(fn.c_str(), O_RDWR);
   dout(5) << "reading from " << fn << " fd " << fd << dendl;
@@ -2478,7 +2478,7 @@ int SyntheticClient::read_random(string& fn, int size, int rdsize)   // size is
       //{
 
       offset=(rand())%(chunks+1);
-    __uint64_t *p = (__uint64_t*)buf;
+    uint64_t *p = (uint64_t*)buf;
     while ((char*)p < buf + rdsize) {
       *p = offset*rdsize + (char*)p - buf;      
       p++;
@@ -2496,11 +2496,11 @@ int SyntheticClient::read_random(string& fn, int size, int rdsize)   // size is
     if ( read )
     {
     int bad = 0;
-    __int64_t *p = (__int64_t*)buf;
-    __int64_t readoff, readclient;
+    int64_t *p = (int64_t*)buf;
+    int64_t readoff, readclient;
     while ((char*)p + 32 < buf + rdsize) {
       readoff = *p;
-      __int64_t wantoff = offset*rdsize + (__int64_t)((char*)p - buf);
+      int64_t wantoff = offset*rdsize + (int64_t)((char*)p - buf);
       p++;
       readclient = *p;
       p++;
@@ -2573,7 +2573,7 @@ int normdist(int min, int max, int stdev) /* specifies input values */
 
 int SyntheticClient::read_random_ex(string& fn, int size, int rdsize)   // size is in MB, rdsize in bytes
 {
-  __uint64_t chunks = (__uint64_t)size * (__uint64_t)(1024*1024) / (__uint64_t)rdsize;
+  uint64_t chunks = (uint64_t)size * (uint64_t)(1024*1024) / (uint64_t)rdsize;
   
   int fd = client->open(fn.c_str(), O_RDWR);
   dout(5) << "reading from " << fn << " fd " << fd << dendl;
@@ -2660,7 +2660,7 @@ int SyntheticClient::read_random_ex(string& fn, int size, int rdsize)   // size
 	  {
 	    
 	    offset=(rand())%(chunks+1);
-	    __uint64_t *p = (__uint64_t*)buf;
+	    uint64_t *p = (uint64_t*)buf;
 	    while ((char*)p < buf + rdsize) {
 	      *p = offset*rdsize + (char*)p - buf;      
 	      p++;
@@ -2678,11 +2678,11 @@ int SyntheticClient::read_random_ex(string& fn, int size, int rdsize)   // size
     if ( read )
       {
 	int bad = 0;
-	__int64_t *p = (__int64_t*)buf;
-	__int64_t readoff, readclient;
+	int64_t *p = (int64_t*)buf;
+	int64_t readoff, readclient;
 	while ((char*)p + 32 < buf + rdsize) {
 	  readoff = *p;
-	  __int64_t wantoff = offset*rdsize + (__int64_t)((char*)p - buf);
+	  int64_t wantoff = offset*rdsize + (int64_t)((char*)p - buf);
 	  p++;
 	  readclient = *p;
 	  p++;
diff --git a/src/client/Trace.h b/src/client/Trace.h
index c4a0f9e..24145a8 100644
--- a/src/client/Trace.h
+++ b/src/client/Trace.h
@@ -54,7 +54,7 @@ class Trace {
   const char *peek_string(char *buf, const char *prefix);
   const char *get_string(char *buf, const char *prefix);
 
-  __int64_t get_int() {
+  int64_t get_int() {
     char buf[20];
     return atoll(get_string(buf, 0));
   }
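
Here and in SyntheticClient.cc above, the glibc-internal __int64_t/__uint64_t
aliases give way to the standard <stdint.h> fixed-width types, which exist on
every supported libc — groundwork for the Solaris sources added to
common/Makefile.am later in this patch. The portable spelling of the same
accessor, as a sketch:

    #include <cstdint>
    #include <cstdlib>

    // Portable equivalent of Trace::get_int(): standard int64_t instead of
    // the glibc-only __int64_t.
    int64_t get_int(const char *s) {
      return static_cast<int64_t>(atoll(s));
    }
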
diff --git a/src/client/fuse_ll.cc b/src/client/fuse_ll.cc
index 219cf0e..e46d6ac 100644
--- a/src/client/fuse_ll.cc
+++ b/src/client/fuse_ll.cc
@@ -14,6 +14,7 @@
 
 #define FUSE_USE_VERSION 30
 
+#include <sys/file.h>
 #include <sys/types.h>
 #include <sys/wait.h>
 #include <signal.h>
diff --git a/src/cls/Makefile-client.am b/src/cls/Makefile-client.am
index 134fb0c..f1f7983 100644
--- a/src/cls/Makefile-client.am
+++ b/src/cls/Makefile-client.am
@@ -58,6 +58,12 @@ libcls_numops_client_la_SOURCES = cls/numops/cls_numops_client.cc
 noinst_LTLIBRARIES += libcls_numops_client.la
 DENCODER_DEPS += libcls_numops_client.la
 
+libcls_journal_client_la_SOURCES = \
+	cls/journal/cls_journal_client.cc \
+	cls/journal/cls_journal_types.cc
+noinst_LTLIBRARIES += libcls_journal_client.la
+DENCODER_DEPS += libcls_journal_client.la
+
 noinst_HEADERS += \
 	cls/lock/cls_lock_types.h \
 	cls/lock/cls_lock_ops.h \
@@ -89,4 +95,6 @@ noinst_HEADERS += \
 	cls/user/cls_user_ops.h \
 	cls/user/cls_user_types.h \
 	cls/cephfs/cls_cephfs.h \
-	cls/cephfs/cls_cephfs_client.h
+	cls/cephfs/cls_cephfs_client.h \
+	cls/journal/cls_journal_client.h \
+	cls/journal/cls_journal_types.h
diff --git a/src/cls/Makefile-server.am b/src/cls/Makefile-server.am
index 9b719e3..cf0b26a 100644
--- a/src/cls/Makefile-server.am
+++ b/src/cls/Makefile-server.am
@@ -55,7 +55,7 @@ radoslib_LTLIBRARIES += libcls_replica_log.la
 
 libcls_user_la_SOURCES = cls/user/cls_user.cc
 libcls_user_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
-libcls_user_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0 -export-symbols-regex '.*__cls_.*'
+libcls_user_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared -export-symbols-regex '.*__cls_.*'
 radoslib_LTLIBRARIES += libcls_user.la
 
 libcls_rgw_la_SOURCES = \
@@ -71,4 +71,12 @@ libcls_cephfs_la_SOURCES = cls/cephfs/cls_cephfs.cc
 libcls_cephfs_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
 libcls_cephfs_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared -export-symbols-regex '.*__cls_.*'
 radoslib_LTLIBRARIES += libcls_cephfs.la
+
+libcls_journal_la_SOURCES = \
+	cls/journal/cls_journal.cc \
+	cls/journal/cls_journal_types.cc
+libcls_journal_la_LIBADD = $(PTHREAD_LIBS) $(EXTRALIBS)
+libcls_journal_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared -export-symbols-regex '.*__cls_.*'
+radoslib_LTLIBRARIES += libcls_journal.la
+
 endif # WITH_OSD
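
The switch to -module -avoid-version reflects how object classes are
consumed: the OSD dlopen()s each libcls_* library and calls its __cls_init()
hook, so SONAME versioning buys nothing. A rough sketch of that loading
pattern (not the OSD's actual loader):

    #include <dlfcn.h>

    // Hypothetical loader sketch: open a cls plugin and run its
    // __cls_init() registration hook, as the exported-symbol regex
    // '.*__cls_.*' above anticipates.
    typedef void (*cls_init_fn)(void);

    int load_cls_plugin(const char *path) {
      void *handle = dlopen(path, RTLD_NOW);
      if (!handle)
        return -1;
      cls_init_fn init = (cls_init_fn)dlsym(handle, "__cls_init");
      if (!init)
        return -1;
      init();   // registers the class and its methods
      return 0;
    }
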
diff --git a/src/cls/journal/cls_journal.cc b/src/cls/journal/cls_journal.cc
new file mode 100644
index 0000000..0f2f3e4
--- /dev/null
+++ b/src/cls/journal/cls_journal.cc
@@ -0,0 +1,619 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "include/encoding.h"
+#include "common/errno.h"
+#include "objclass/objclass.h"
+#include "cls/journal/cls_journal_types.h"
+#include <errno.h>
+#include <map>
+#include <string>
+#include <sstream>
+
+CLS_VER(1, 0)
+CLS_NAME(journal)
+
+cls_handle_t h_class;
+cls_method_handle_t h_journal_create;
+cls_method_handle_t h_journal_get_order;
+cls_method_handle_t h_journal_get_splay_width;
+cls_method_handle_t h_journal_get_pool_id;
+cls_method_handle_t h_journal_get_minimum_set;
+cls_method_handle_t h_journal_set_minimum_set;
+cls_method_handle_t h_journal_get_active_set;
+cls_method_handle_t h_journal_set_active_set;
+cls_method_handle_t h_journal_client_register;
+cls_method_handle_t h_journal_client_unregister;
+cls_method_handle_t h_journal_client_commit;
+cls_method_handle_t h_journal_client_list;
+cls_method_handle_t h_journal_object_guard_append;
+
+namespace {
+
+static const uint64_t MAX_KEYS_READ = 64;
+
+static const std::string HEADER_KEY_ORDER         = "order";
+static const std::string HEADER_KEY_SPLAY_WIDTH   = "splay_width";
+static const std::string HEADER_KEY_POOL_ID       = "pool_id";
+static const std::string HEADER_KEY_MINIMUM_SET   = "minimum_set";
+static const std::string HEADER_KEY_ACTIVE_SET    = "active_set";
+static const std::string HEADER_KEY_CLIENT_PREFIX = "client_";
+
+static void key_from_client_id(const std::string &client_id, string *key) {
+  *key = HEADER_KEY_CLIENT_PREFIX + client_id;
+}
+
+template <typename T>
+int read_key(cls_method_context_t hctx, const string &key, T *t) {
+  bufferlist bl;
+  int r = cls_cxx_map_get_val(hctx, key, &bl);
+  if (r < 0) {
+    CLS_ERR("failed to get omap key: %s", key.c_str());
+    return r;
+  }
+
+  try {
+    bufferlist::iterator iter = bl.begin();
+    ::decode(*t, iter);
+  } catch (const buffer::error &err) {
+    CLS_ERR("failed to decode input parameters: %s", err.what());
+    return -EINVAL;
+  }
+  return 0;
+}
+
+template <typename T>
+int write_key(cls_method_context_t hctx, const string &key, const T &t) {
+  bufferlist bl;
+  ::encode(t, bl);
+
+  int r = cls_cxx_map_set_val(hctx, key, &bl);
+  if (r < 0) {
+    CLS_ERR("failed to set omap key: %s", key.c_str());
+    return r;
+  }
+  return 0;
+}
+
+} // anonymous namespace
+
+/**
+ * Input:
+ * @param order (uint8_t) - bits to shift to compute the object max size
+ * @param splay_width (uint8_t) - number of active journal objects
+ *
+ * Output:
+ * @returns 0 on success, negative error code on failure
+ */
+int journal_create(cls_method_context_t hctx, bufferlist *in, bufferlist *out) {
+  uint8_t order;
+  uint8_t splay_width;
+  int64_t pool_id;
+  try {
+    bufferlist::iterator iter = in->begin();
+    ::decode(order, iter);
+    ::decode(splay_width, iter);
+    ::decode(pool_id, iter);
+  } catch (const buffer::error &err) {
+    CLS_ERR("failed to decode input parameters: %s", err.what());
+    return -EINVAL;
+  }
+
+  bufferlist stored_orderbl;
+  int r = cls_cxx_map_get_val(hctx, HEADER_KEY_ORDER, &stored_orderbl);
+  if (r != -ENOENT) {
+    CLS_ERR("journal already exists");
+    return -EEXIST;
+  }
+
+  r = write_key(hctx, HEADER_KEY_ORDER, order);
+  if (r < 0) {
+    return r;
+  }
+
+  r = write_key(hctx, HEADER_KEY_SPLAY_WIDTH, splay_width);
+  if (r < 0) {
+    return r;
+  }
+
+  r = write_key(hctx, HEADER_KEY_POOL_ID, pool_id);
+  if (r < 0) {
+    return r;
+  }
+
+  uint64_t object_set = 0;
+  r = write_key(hctx, HEADER_KEY_ACTIVE_SET, object_set);
+  if (r < 0) {
+    return r;
+  }
+
+  r = write_key(hctx, HEADER_KEY_MINIMUM_SET, object_set);
+  if (r < 0) {
+    return r;
+  }
+  return 0;
+}
+
+/**
+ * Input:
+ * none
+ *
+ * Output:
+ * order (uint8_t)
+ * @returns 0 on success, negative error code on failure
+ */
+int journal_get_order(cls_method_context_t hctx, bufferlist *in,
+                      bufferlist *out) {
+  uint8_t order;
+  int r = read_key(hctx, HEADER_KEY_ORDER, &order);
+  if (r < 0) {
+    return r;
+  }
+
+  ::encode(order, *out);
+  return 0;
+}
+
+/**
+ * Input:
+ * none
+ *
+ * Output:
+ * splay width (uint8_t)
+ * @returns 0 on success, negative error code on failure
+ */
+int journal_get_splay_width(cls_method_context_t hctx, bufferlist *in,
+                            bufferlist *out) {
+  uint8_t splay_width;
+  int r = read_key(hctx, HEADER_KEY_SPLAY_WIDTH, &splay_width);
+  if (r < 0) {
+    return r;
+  }
+
+  ::encode(splay_width, *out);
+  return 0;
+}
+
+/**
+ * Input:
+ * none
+ *
+ * Output:
+ * pool_id (int64_t)
+ * @returns 0 on success, negative error code on failure
+ */
+int journal_get_pool_id(cls_method_context_t hctx, bufferlist *in,
+                            bufferlist *out) {
+  int64_t pool_id;
+  int r = read_key(hctx, HEADER_KEY_POOL_ID, &pool_id);
+  if (r < 0) {
+    return r;
+  }
+
+  ::encode(pool_id, *out);
+  return 0;
+}
+
+/**
+ * Input:
+ * none
+ *
+ * Output:
+ * object set (uint64_t)
+ * @returns 0 on success, negative error code on failure
+ */
+int journal_get_minimum_set(cls_method_context_t hctx, bufferlist *in,
+                            bufferlist *out) {
+  uint64_t minimum_set;
+  int r = read_key(hctx, HEADER_KEY_MINIMUM_SET, &minimum_set);
+  if (r < 0) {
+    return r;
+  }
+
+  ::encode(minimum_set, *out);
+  return 0;
+}
+
+/**
+ * Input:
+ * @param object_set (uint64_t) - new minimum object set
+ *
+ * Output:
+ * @returns 0 on success, negative error code on failure
+ */
+int journal_set_minimum_set(cls_method_context_t hctx, bufferlist *in,
+                            bufferlist *out) {
+  uint64_t object_set;
+  try {
+    bufferlist::iterator iter = in->begin();
+    ::decode(object_set, iter);
+  } catch (const buffer::error &err) {
+    CLS_ERR("failed to decode input parameters: %s", err.what());
+    return -EINVAL;
+  }
+
+  uint64_t current_active_set;
+  int r = read_key(hctx, HEADER_KEY_ACTIVE_SET, &current_active_set);
+  if (r < 0) {
+    return r;
+  }
+
+  if (current_active_set < object_set) {
+    CLS_ERR("active object set earlier than minimum: %" PRIu64
+            " < %" PRIu64, current_active_set, object_set);
+    return -EINVAL;
+  }
+
+  uint64_t current_minimum_set;
+  r = read_key(hctx, HEADER_KEY_MINIMUM_SET, &current_minimum_set);
+  if (r < 0) {
+    return r;
+  }
+
+  if (object_set == current_minimum_set) {
+    return 0;
+  } else if (object_set < current_minimum_set) {
+    CLS_ERR("object number earlier than current object: %" PRIu64 " < %" PRIu64,
+            object_set, current_minimum_set);
+    return -ESTALE;
+  }
+
+  r = write_key(hctx, HEADER_KEY_MINIMUM_SET, object_set);
+  if (r < 0) {
+    return r;
+  }
+  return 0;
+}
+
+/**
+ * Input:
+ * none
+ *
+ * Output:
+ * object set (uint64_t)
+ * @returns 0 on success, negative error code on failure
+ */
+int journal_get_active_set(cls_method_context_t hctx, bufferlist *in,
+                           bufferlist *out) {
+  uint64_t active_set;
+  int r = read_key(hctx, HEADER_KEY_ACTIVE_SET, &active_set);
+  if (r < 0) {
+    return r;
+  }
+
+  ::encode(active_set, *out);
+  return 0;
+}
+
+/**
+ * Input:
+ * @param object_set (uint64_t) - new active object set
+ *
+ * Output:
+ * @returns 0 on success, negative error code on failure
+ */
+int journal_set_active_set(cls_method_context_t hctx, bufferlist *in,
+                           bufferlist *out) {
+  uint64_t object_set;
+  try {
+    bufferlist::iterator iter = in->begin();
+    ::decode(object_set, iter);
+  } catch (const buffer::error &err) {
+    CLS_ERR("failed to decode input parameters: %s", err.what());
+    return -EINVAL;
+  }
+
+  uint64_t current_minimum_set;
+  int r = read_key(hctx, HEADER_KEY_MINIMUM_SET, &current_minimum_set);
+  if (r < 0) {
+    return r;
+  }
+
+  if (current_minimum_set > object_set) {
+    CLS_ERR("minimum object set later than active: %" PRIu64
+            " > %" PRIu64, current_minimum_set, object_set);
+    return -EINVAL;
+  }
+
+  uint64_t current_active_set;
+  r = read_key(hctx, HEADER_KEY_ACTIVE_SET, &current_active_set);
+  if (r < 0) {
+    return r;
+  }
+
+  if (object_set == current_active_set) {
+    return 0;
+  } else if (object_set < current_active_set) {
+    CLS_ERR("object number earlier than current object: %" PRIu64 " < %" PRIu64,
+            object_set, current_active_set);
+    return -ESTALE;
+  }
+
+  r = write_key(hctx, HEADER_KEY_ACTIVE_SET, object_set);
+  if (r < 0) {
+    return r;
+  }
+  return 0;
+}
+
+/**
+ * Input:
+ * @param id (string) - unique client id
+ * @param description (string) - human-readable description of the client
+ *
+ * Output:
+ * @returns 0 on success, negative error code on failure
+ */
+int journal_client_register(cls_method_context_t hctx, bufferlist *in,
+                            bufferlist *out) {
+  std::string id;
+  std::string description;
+  try {
+    bufferlist::iterator iter = in->begin();
+    ::decode(id, iter);
+    ::decode(description, iter);
+  } catch (const buffer::error &err) {
+    CLS_ERR("failed to decode input parameters: %s", err.what());
+    return -EINVAL;
+  }
+
+  std::string key;
+  key_from_client_id(id, &key);
+
+  bufferlist stored_clientbl;
+  int r = cls_cxx_map_get_val(hctx, key, &stored_clientbl);
+  if (r != -ENOENT) {
+    CLS_ERR("duplicate client id: %s", id.c_str());
+    return -EEXIST;
+  }
+
+  cls::journal::Client client(id, description);
+  r = write_key(hctx, key, client);
+  if (r < 0) {
+    return r;
+  }
+  return 0;
+}
+
+/**
+ * Input:
+ * @param id (string) - unique client id
+ *
+ * Output:
+ * @returns 0 on success, negative error code on failure
+ */
+int journal_client_unregister(cls_method_context_t hctx, bufferlist *in,
+                              bufferlist *out) {
+  std::string id;
+  try {
+    bufferlist::iterator iter = in->begin();
+    ::decode(id, iter);
+  } catch (const buffer::error &err) {
+    CLS_ERR("failed to decode input parameters: %s", err.what());
+    return -EINVAL;
+  }
+
+  std::string key;
+  key_from_client_id(id, &key);
+
+  bufferlist bl;
+  int r = cls_cxx_map_get_val(hctx, key, &bl);
+  if (r < 0) {
+    CLS_ERR("client is not registered: %s", id.c_str());
+    return r;
+  }
+
+  r = cls_cxx_map_remove_key(hctx, key);
+  if (r < 0) {
+    CLS_ERR("failed to remove omap key: %s", key.c_str());
+    return r;
+  }
+  return 0;
+}
+
+/**
+ * Input:
+ * @param id (string) - unique client id
+ * @param commit_position (ObjectSetPosition)
+ *
+ * Output:
+ * @returns 0 on success, negative error code on failure
+ */
+int journal_client_commit(cls_method_context_t hctx, bufferlist *in,
+                          bufferlist *out) {
+  std::string id;
+  cls::journal::ObjectSetPosition commit_position;
+  try {
+    bufferlist::iterator iter = in->begin();
+    ::decode(id, iter);
+    ::decode(commit_position, iter);
+  } catch (const buffer::error &err) {
+    CLS_ERR("failed to decode input parameters: %s", err.what());
+    return -EINVAL;
+  }
+
+  uint8_t splay_width;
+  int r = read_key(hctx, HEADER_KEY_SPLAY_WIDTH, &splay_width);
+  if (r < 0) {
+    return r;
+  }
+  if (commit_position.entry_positions.size() > splay_width) {
+    CLS_ERR("too many entry positions");
+    return -EINVAL;
+  }
+
+  std::string key;
+  key_from_client_id(id, &key);
+
+  cls::journal::Client client;
+  r = read_key(hctx, key, &client);
+  if (r < 0) {
+    return r;
+  }
+
+  if (client.commit_position == commit_position) {
+    return 0;
+  }
+
+  client.commit_position = commit_position;
+  r = write_key(hctx, key, client);
+  if (r < 0) {
+    return r;
+  }
+  return 0;
+}
+
+/**
+ * Input:
+ * @param start_after (string)
+ * @param max_return (uint64_t)
+ *
+ * Output:
+ * clients (set<cls::journal::Client>) - collection of registered clients
+ * @returns 0 on success, negative error code on failure
+ */
+int journal_client_list(cls_method_context_t hctx, bufferlist *in,
+                        bufferlist *out) {
+  std::string start_after;
+  uint64_t max_return;
+  try {
+    bufferlist::iterator iter = in->begin();
+    ::decode(start_after, iter);
+    ::decode(max_return, iter);
+  } catch (const buffer::error &err) {
+    CLS_ERR("failed to decode input parameters: %s", err.what());
+    return -EINVAL;
+  }
+
+  std::string last_read;
+  if (!start_after.empty()) {
+    key_from_client_id(start_after, &last_read);
+  }
+
+  std::map<std::string, bufferlist> vals;
+  int r = cls_cxx_map_get_vals(hctx, last_read, HEADER_KEY_CLIENT_PREFIX,
+                               max_return, &vals);
+  if (r < 0) {
+    CLS_ERR("failed to retrieve omap values: %s", cpp_strerror(r).c_str());
+    return r;
+  }
+
+  std::set<cls::journal::Client> clients;
+  for (std::map<std::string, bufferlist>::iterator it = vals.begin();
+       it != vals.end(); ++it) {
+    try {
+      bufferlist::iterator iter = it->second.begin();
+
+      cls::journal::Client client;
+      ::decode(client, iter);
+      clients.insert(client);
+    } catch (const buffer::error &err) {
+      CLS_ERR("could not decode client '%s': %s", it->first.c_str(),
+              err.what());
+      return -EIO;
+    }
+  }
+
+  ::encode(clients, *out);
+  return 0;
+}
+
+/**
+ * Input:
+ * @param soft_max_size (uint64_t)
+ *
+ * Output:
+ * @returns 0 if the object size is less than the soft max, negative error code otherwise
+ */
+int journal_object_guard_append(cls_method_context_t hctx, bufferlist *in,
+                                bufferlist *out) {
+  uint64_t soft_max_size;
+  try {
+    bufferlist::iterator iter = in->begin();
+    ::decode(soft_max_size, iter);
+  } catch (const buffer::error &err) {
+    CLS_ERR("failed to decode input parameters: %s", err.what());
+    return -EINVAL;
+  }
+
+  uint64_t size;
+  time_t mtime;
+  int r = cls_cxx_stat(hctx, &size, &mtime);
+  if (r == -ENOENT) {
+    return 0;
+  } else if (r < 0) {
+    CLS_ERR("failed to stat object: %s", cpp_strerror(r).c_str());
+    return r;
+  }
+
+  if (size >= soft_max_size) {
+    CLS_LOG(5, "journal object full: %" PRIu64 " >= %" PRIu64,
+            size, soft_max_size);
+    return -EOVERFLOW;
+  }
+  return 0;
+}
+
+#if __GNUC__ >= 4
+  #define CEPH_CLS_API    __attribute__ ((visibility ("default")))
+#else
+  #define CEPH_CLS_API
+#endif
+
+void CEPH_CLS_API __cls_init()
+{
+  CLS_LOG(20, "Loaded journal class!");
+
+  cls_register("journal", &h_class);
+
+  /// methods for journal.$journal_id objects
+  cls_register_cxx_method(h_class, "create",
+                          CLS_METHOD_RD | CLS_METHOD_WR,
+                          journal_create, &h_journal_create);
+  cls_register_cxx_method(h_class, "get_order",
+                          CLS_METHOD_RD,
+                          journal_get_order, &h_journal_get_order);
+  cls_register_cxx_method(h_class, "get_splay_width",
+                          CLS_METHOD_RD,
+                          journal_get_splay_width, &h_journal_get_splay_width);
+  cls_register_cxx_method(h_class, "get_pool_id",
+                          CLS_METHOD_RD,
+                          journal_get_pool_id, &h_journal_get_pool_id);
+  cls_register_cxx_method(h_class, "get_minimum_set",
+                          CLS_METHOD_RD,
+                          journal_get_minimum_set,
+                          &h_journal_get_minimum_set);
+  cls_register_cxx_method(h_class, "set_minimum_set",
+                          CLS_METHOD_RD | CLS_METHOD_WR,
+                          journal_set_minimum_set,
+                          &h_journal_set_minimum_set);
+  cls_register_cxx_method(h_class, "get_active_set",
+                          CLS_METHOD_RD,
+                          journal_get_active_set,
+                          &h_journal_get_active_set);
+  cls_register_cxx_method(h_class, "set_active_set",
+                          CLS_METHOD_RD | CLS_METHOD_WR,
+                          journal_set_active_set,
+                          &h_journal_set_active_set);
+  cls_register_cxx_method(h_class, "client_register",
+                          CLS_METHOD_RD | CLS_METHOD_WR,
+                          journal_client_register, &h_journal_client_register);
+  cls_register_cxx_method(h_class, "client_unregister",
+                          CLS_METHOD_RD | CLS_METHOD_WR,
+                          journal_client_unregister,
+                          &h_journal_client_unregister);
+  cls_register_cxx_method(h_class, "client_commit",
+                          CLS_METHOD_RD | CLS_METHOD_WR,
+                          journal_client_commit, &h_journal_client_commit);
+  cls_register_cxx_method(h_class, "client_list",
+                          CLS_METHOD_RD,
+                          journal_client_list, &h_journal_client_list);
+
+  /// methods for journal_data.$journal_id.$object_id objects
+  cls_register_cxx_method(h_class, "guard_append",
+                          CLS_METHOD_RD | CLS_METHOD_WR,
+                          journal_object_guard_append,
+                          &h_journal_object_guard_append);
+}
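
__cls_init() registers everything under the class name "journal", so once an
OSD has loaded the plugin any RADOS client can invoke the methods with a cls
exec call. A hedged sketch of reading back the order field via librados (the
oid is illustrative):

    #include "include/rados/librados.hpp"
    #include "include/encoding.h"

    // Sketch only: call the new "journal" class directly. get_order takes
    // no input and encodes a uint8_t into the output bufferlist.
    int probe_journal_order(librados::IoCtx &ioctx, const std::string &oid,
                            uint8_t *order) {
      librados::bufferlist inbl, outbl;
      int r = ioctx.exec(oid, "journal", "get_order", inbl, outbl);
      if (r < 0)
        return r;
      librados::bufferlist::iterator it = outbl.begin();
      ::decode(*order, it);
      return 0;
    }
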
diff --git a/src/cls/journal/cls_journal_client.cc b/src/cls/journal/cls_journal_client.cc
new file mode 100644
index 0000000..a4a268d
--- /dev/null
+++ b/src/cls/journal/cls_journal_client.cc
@@ -0,0 +1,274 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "cls/journal/cls_journal_client.h"
+#include "include/buffer.h"
+#include "include/Context.h"
+#include "common/Cond.h"
+#include <errno.h>
+#include <map>
+
+namespace cls {
+namespace journal {
+namespace client {
+
+namespace {
+
+static const uint64_t JOURNAL_MAX_RETURN = 256;
+
+struct C_AioExec : public Context {
+  librados::IoCtx ioctx;
+  std::string oid;
+
+  C_AioExec(librados::IoCtx &_ioctx, const std::string &_oid) : oid(_oid) {
+    ioctx.dup(_ioctx);
+  }
+
+  static void rados_callback(rados_completion_t c, void *arg) {
+    Context *ctx = reinterpret_cast<Context *>(arg);
+    ctx->complete(rados_aio_get_return_value(c));
+  }
+};
+
+struct C_ClientList : public C_AioExec {
+  std::set<cls::journal::Client> *clients;
+  Context *on_finish;
+  bufferlist outbl;
+
+  C_ClientList(librados::IoCtx &_ioctx, const std::string &_oid,
+               std::set<cls::journal::Client> *_clients,
+               Context *_on_finish)
+    : C_AioExec(_ioctx, _oid), clients(_clients), on_finish(_on_finish) {}
+
+  void send(const std::string &start_after) {
+    bufferlist inbl;
+    ::encode(start_after, inbl);
+    ::encode(JOURNAL_MAX_RETURN, inbl);
+
+    librados::ObjectReadOperation op;
+    op.exec("journal", "client_list", inbl);
+
+    outbl.clear();
+    librados::AioCompletion *rados_completion =
+       librados::Rados::aio_create_completion(this, rados_callback, NULL);
+    int r = ioctx.aio_operate(oid, rados_completion, &op, &outbl);
+    assert(r == 0);
+    rados_completion->release();
+  }
+
+  virtual void complete(int r) {
+    if (r < 0) {
+      finish(r);
+      return;
+    }
+
+    try {
+      bufferlist::iterator iter = outbl.begin();
+      std::set<cls::journal::Client> partial_clients;
+      ::decode(partial_clients, iter);
+
+      std::string start_after;
+      if (!partial_clients.empty()) {
+        start_after = partial_clients.rbegin()->id;
+        clients->insert(partial_clients.begin(), partial_clients.end());
+      }
+
+      if (partial_clients.size() < JOURNAL_MAX_RETURN) {
+        finish(0);
+      } else {
+        send(start_after);
+      }
+    } catch (const buffer::error &err) {
+      finish(-EBADMSG);
+    }
+  }
+
+  virtual void finish(int r) {
+    on_finish->complete(r);
+    delete this;
+  }
+};
+
+struct C_ImmutableMetadata : public C_AioExec {
+  uint8_t *order;
+  uint8_t *splay_width;
+  int64_t *pool_id;
+  Context *on_finish;
+  bufferlist outbl;
+
+  C_ImmutableMetadata(librados::IoCtx &_ioctx, const std::string &_oid,
+                      uint8_t *_order, uint8_t *_splay_width,
+		      int64_t *_pool_id, Context *_on_finish)
+    : C_AioExec(_ioctx, _oid), order(_order), splay_width(_splay_width),
+      pool_id(_pool_id), on_finish(_on_finish) {
+  }
+
+  void send() {
+    librados::ObjectReadOperation op;
+    bufferlist inbl;
+    op.exec("journal", "get_order", inbl);
+    op.exec("journal", "get_splay_width", inbl);
+    op.exec("journal", "get_pool_id", inbl);
+
+    librados::AioCompletion *rados_completion =
+      librados::Rados::aio_create_completion(this, rados_callback, NULL);
+    int r = ioctx.aio_operate(oid, rados_completion, &op, &outbl);
+    assert(r == 0);
+    rados_completion->release();
+  }
+
+  virtual void finish(int r) {
+    if (r == 0) {
+      try {
+        bufferlist::iterator iter = outbl.begin();
+        ::decode(*order, iter);
+        ::decode(*splay_width, iter);
+        ::decode(*pool_id, iter);
+      } catch (const buffer::error &err) {
+        r = -EBADMSG;
+      }
+    }
+    on_finish->complete(r);
+  }
+};
+
+struct C_MutableMetadata : public C_AioExec {
+  uint64_t *minimum_set;
+  uint64_t *active_set;
+  C_ClientList *client_list;
+  bufferlist outbl;
+
+  C_MutableMetadata(librados::IoCtx &_ioctx, const std::string &_oid,
+                    uint64_t *_minimum_set, uint64_t *_active_set,
+                    C_ClientList *_client_list)
+    : C_AioExec(_ioctx, _oid), minimum_set(_minimum_set),
+      active_set(_active_set), client_list(_client_list) {}
+
+  void send() {
+    librados::ObjectReadOperation op;
+    bufferlist inbl;
+    op.exec("journal", "get_minimum_set", inbl);
+    op.exec("journal", "get_active_set", inbl);
+
+    librados::AioCompletion *rados_completion =
+      librados::Rados::aio_create_completion(this, rados_callback, NULL);
+    int r = ioctx.aio_operate(oid, rados_completion, &op, &outbl);
+    assert(r == 0);
+    rados_completion->release();
+  }
+
+  virtual void finish(int r) {
+    if (r == 0) {
+      try {
+        bufferlist::iterator iter = outbl.begin();
+        ::decode(*minimum_set, iter);
+        ::decode(*active_set, iter);
+        client_list->send("");
+      } catch (const buffer::error &err) {
+        r = -EBADMSG;
+      }
+    }
+    if (r < 0) {
+      client_list->complete(r);
+    }
+  }
+};
+
+
+} // anonymous namespace
+
+int create(librados::IoCtx &ioctx, const std::string &oid, uint8_t order,
+           uint8_t splay, int64_t pool_id) {
+  bufferlist inbl;
+  ::encode(order, inbl);
+  ::encode(splay, inbl);
+  ::encode(pool_id, inbl);
+
+  bufferlist outbl;
+  int r = ioctx.exec(oid, "journal", "create", inbl, outbl);
+  if (r < 0) {
+    return r;
+  }
+  return 0;
+}
+
+void get_immutable_metadata(librados::IoCtx &ioctx, const std::string &oid,
+                            uint8_t *order, uint8_t *splay_width,
+                            int64_t *pool_id, Context *on_finish) {
+  C_ImmutableMetadata *metadata = new C_ImmutableMetadata(ioctx, oid, order,
+                                                          splay_width, pool_id,
+                                                          on_finish);
+  metadata->send();
+}
+
+void get_mutable_metadata(librados::IoCtx &ioctx, const std::string &oid,
+                          uint64_t *minimum_set, uint64_t *active_set,
+                          std::set<cls::journal::Client> *clients,
+                          Context *on_finish) {
+  C_ClientList *client_list = new C_ClientList(ioctx, oid, clients, on_finish);
+  C_MutableMetadata *metadata = new C_MutableMetadata(
+    ioctx, oid, minimum_set, active_set, client_list);
+  metadata->send();
+}
+
+void set_minimum_set(librados::ObjectWriteOperation *op, uint64_t object_set) {
+  bufferlist bl;
+  ::encode(object_set, bl);
+  op->exec("journal", "set_minimum_set", bl);
+}
+
+void set_active_set(librados::ObjectWriteOperation *op, uint64_t object_set) {
+  bufferlist bl;
+  ::encode(object_set, bl);
+  op->exec("journal", "set_active_set", bl);
+}
+
+void client_register(librados::ObjectWriteOperation *op,
+                     const std::string &id, const std::string &description) {
+  bufferlist bl;
+  ::encode(id, bl);
+  ::encode(description, bl);
+  op->exec("journal", "client_register", bl);
+}
+
+int client_register(librados::IoCtx &ioctx, const std::string &oid,
+                    const std::string &id, const std::string &description) {
+  librados::ObjectWriteOperation op;
+  client_register(&op, id, description);
+  return ioctx.operate(oid, &op);
+}
+
+int client_unregister(librados::IoCtx &ioctx, const std::string &oid,
+                       const std::string &id) {
+  bufferlist inbl;
+  ::encode(id, inbl);
+
+  bufferlist outbl;
+  return ioctx.exec(oid, "journal", "client_unregister", inbl, outbl);
+}
+
+void client_commit(librados::ObjectWriteOperation *op, const std::string &id,
+                   const cls::journal::ObjectSetPosition &commit_position) {
+  bufferlist bl;
+  ::encode(id, bl);
+  ::encode(commit_position, bl);
+  op->exec("journal", "client_commit", bl);
+}
+
+int client_list(librados::IoCtx &ioctx, const std::string &oid,
+                std::set<cls::journal::Client> *clients) {
+  C_SaferCond cond;
+  C_ClientList *client_list = new C_ClientList(ioctx, oid, clients, &cond);
+  client_list->send("");
+  return cond.wait();
+}
+
+void guard_append(librados::ObjectWriteOperation *op, uint64_t soft_max_size) {
+  bufferlist bl;
+  ::encode(soft_max_size, bl);
+  op->exec("journal", "guard_append", bl);
+}
+
+} // namespace client
+} // namespace journal
+} // namespace cls
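
The synchronous wrappers above make bootstrap a two-step affair: create the
header object, then register a client against it. A usage sketch with
invented oid, ids, and parameter values:

    #include "cls/journal/cls_journal_client.h"

    // Hedged usage sketch: create a journal header object and register a
    // client against it. Values are illustrative, not taken from the patch.
    int bootstrap_journal(librados::IoCtx &ioctx) {
      int r = cls::journal::client::create(ioctx, "journal.1234",
                                           24 /* order */, 4 /* splay */,
                                           -1 /* pool_id placeholder */);
      if (r < 0 && r != -EEXIST)
        return r;
      return cls::journal::client::client_register(ioctx, "journal.1234",
                                                   "client-a",
                                                   "example consumer");
    }
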
diff --git a/src/cls/journal/cls_journal_client.h b/src/cls/journal/cls_journal_client.h
new file mode 100644
index 0000000..18ccf7b
--- /dev/null
+++ b/src/cls/journal/cls_journal_client.h
@@ -0,0 +1,51 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_CLS_JOURNAL_CLIENT_H
+#define CEPH_CLS_JOURNAL_CLIENT_H
+
+#include "include/int_types.h"
+#include "include/rados/librados.hpp"
+#include "cls/journal/cls_journal_types.h"
+#include <map>
+#include <set>
+#include <string>
+
+class Context;
+
+namespace cls {
+namespace journal {
+namespace client {
+
+int create(librados::IoCtx &ioctx, const std::string &oid, uint8_t order,
+           uint8_t splay, int64_t pool_id);
+
+void get_immutable_metadata(librados::IoCtx &ioctx, const std::string &oid,
+                            uint8_t *order, uint8_t *splay_width,
+			    int64_t *pool_id, Context *on_finish);
+void get_mutable_metadata(librados::IoCtx &ioctx, const std::string &oid,
+                          uint64_t *minimum_set, uint64_t *active_set,
+                          std::set<cls::journal::Client> *clients,
+                          Context *on_finish);
+
+void set_minimum_set(librados::ObjectWriteOperation *op, uint64_t object_set);
+void set_active_set(librados::ObjectWriteOperation *op, uint64_t object_set);
+
+void client_register(librados::ObjectWriteOperation *op,
+                     const std::string &id, const std::string &description);
+int client_register(librados::IoCtx &ioctx, const std::string &oid,
+                    const std::string &id, const std::string &description);
+int client_unregister(librados::IoCtx &ioctx, const std::string &oid,
+                      const std::string &id);
+void client_commit(librados::ObjectWriteOperation *op, const std::string &id,
+                   const cls::journal::ObjectSetPosition &commit_position);
+int client_list(librados::IoCtx &ioctx, const std::string &oid,
+                std::set<cls::journal::Client> *clients);
+
+void guard_append(librados::ObjectWriteOperation *op, uint64_t soft_max_size);
+
+} // namespace client
+} // namespace journal
+} // namespace cls
+
+#endif // CEPH_CLS_JOURNAL_CLIENT_H
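
The metadata getters are asynchronous; client_list() shows the synchronous
recipe of parking on a C_SaferCond. The same recipe applied to
get_mutable_metadata, sketched with illustrative names:

    #include "cls/journal/cls_journal_client.h"
    #include "common/Cond.h"

    // Sketch: fetch the minimum/active sets and registered clients
    // synchronously by blocking on a C_SaferCond.
    int fetch_journal_state(librados::IoCtx &ioctx, const std::string &oid) {
      uint64_t minimum_set = 0, active_set = 0;
      std::set<cls::journal::Client> clients;
      C_SaferCond cond;
      cls::journal::client::get_mutable_metadata(ioctx, oid, &minimum_set,
                                                 &active_set, &clients, &cond);
      return cond.wait();
    }
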
diff --git a/src/cls/journal/cls_journal_types.cc b/src/cls/journal/cls_journal_types.cc
new file mode 100644
index 0000000..3084d10
--- /dev/null
+++ b/src/cls/journal/cls_journal_types.cc
@@ -0,0 +1,155 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "cls/journal/cls_journal_types.h"
+#include "common/Formatter.h"
+#include <set>
+
+namespace cls {
+namespace journal {
+
+void EntryPosition::encode(bufferlist& bl) const {
+  ENCODE_START(1, 1, bl);
+  ::encode(tag, bl);
+  ::encode(tid, bl);
+  ENCODE_FINISH(bl);
+}
+
+void EntryPosition::decode(bufferlist::iterator& iter) {
+  DECODE_START(1, iter);
+  ::decode(tag, iter);
+  ::decode(tid, iter);
+  DECODE_FINISH(iter);
+}
+
+void EntryPosition::dump(Formatter *f) const {
+  f->dump_string("tag", tag);
+  f->dump_unsigned("tid", tid);
+}
+
+void EntryPosition::generate_test_instances(std::list<EntryPosition *> &o) {
+  o.push_back(new EntryPosition());
+  o.push_back(new EntryPosition("id", 2));
+}
+
+bool ObjectSetPosition::operator<(const ObjectSetPosition& rhs) const {
+  if (entry_positions.size() < rhs.entry_positions.size()) {
+    return true;
+  } else if (entry_positions.size() > rhs.entry_positions.size()) {
+    return false;
+  }
+
+  std::map<std::string, uint64_t> rhs_tids;
+  for (EntryPositions::const_iterator it = rhs.entry_positions.begin();
+       it != rhs.entry_positions.end(); ++it) {
+    rhs_tids[it->tag] = it->tid;
+  }
+
+  for (EntryPositions::const_iterator it = entry_positions.begin();
+       it != entry_positions.end(); ++it) {
+    const EntryPosition &entry_position = *it;
+    if (entry_position.tid < rhs_tids[entry_position.tag]) {
+      return true;
+    }
+  }
+  return false;
+}
+
+void ObjectSetPosition::encode(bufferlist& bl) const {
+  ENCODE_START(1, 1, bl);
+  ::encode(object_number, bl);
+  ::encode(entry_positions, bl);
+  ENCODE_FINISH(bl);
+}
+
+void ObjectSetPosition::decode(bufferlist::iterator& iter) {
+  DECODE_START(1, iter);
+  ::decode(object_number, iter);
+  ::decode(entry_positions, iter);
+  DECODE_FINISH(iter);
+}
+
+void ObjectSetPosition::dump(Formatter *f) const {
+  f->dump_unsigned("object_number", object_number);
+  f->open_array_section("entry_positions");
+  for (EntryPositions::const_iterator it = entry_positions.begin();
+       it != entry_positions.end(); ++it) {
+    f->open_object_section("entry_position");
+    it->dump(f);
+    f->close_section();
+  }
+  f->close_section();
+}
+
+void ObjectSetPosition::generate_test_instances(
+    std::list<ObjectSetPosition *> &o) {
+  o.push_back(new ObjectSetPosition());
+
+  EntryPositions entry_positions;
+  entry_positions.push_back(EntryPosition("tag1", 120));
+  entry_positions.push_back(EntryPosition("tag2", 121));
+  o.push_back(new ObjectSetPosition(1, entry_positions));
+}
+
+void Client::encode(bufferlist& bl) const {
+  ENCODE_START(1, 1, bl);
+  ::encode(id, bl);
+  ::encode(description, bl);
+  ::encode(commit_position, bl);
+  ENCODE_FINISH(bl);
+}
+
+void Client::decode(bufferlist::iterator& iter) {
+  DECODE_START(1, iter);
+  ::decode(id, iter);
+  ::decode(description, iter);
+  ::decode(commit_position, iter);
+  DECODE_FINISH(iter);
+}
+
+void Client::dump(Formatter *f) const {
+  f->dump_string("id", id);
+  f->dump_string("description", description);
+  f->open_object_section("commit_position");
+  commit_position.dump(f);
+  f->close_section();
+}
+
+void Client::generate_test_instances(std::list<Client *> &o) {
+  o.push_back(new Client());
+  o.push_back(new Client("id", "desc"));
+
+  EntryPositions entry_positions;
+  entry_positions.push_back(EntryPosition("tag1", 120));
+  entry_positions.push_back(EntryPosition("tag1", 121));
+  o.push_back(new Client("id", "desc", ObjectSetPosition(1, entry_positions)));
+}
+
+std::ostream &operator<<(std::ostream &os,
+                         const EntryPosition &entry_position) {
+  os << "[tag=" << entry_position.tag << ", tid="
+     << entry_position.tid << "]";
+  return os;
+}
+
+std::ostream &operator<<(std::ostream &os,
+                         const ObjectSetPosition &object_set_position) {
+  os << "[object_number=" << object_set_position.object_number << ", "
+     << "positions=[";
+  for (EntryPositions::const_iterator it =
+         object_set_position.entry_positions.begin();
+       it != object_set_position.entry_positions.end(); ++it) {
+    os << *it;
+  }
+  os << "]]";
+  return os;
+}
+
+std::ostream &operator<<(std::ostream &os, const Client &client) {
+  os << "[id=" << client.id << ", description=" << client.description
+     << ", commit_position=" << client.commit_position << "]";
+  return os;
+}
+
+} // namespace journal
+} // namespace cls
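
Note that ObjectSetPosition::operator< compares per-tag tids rather than
object numbers: a position sorts earlier if any tag it tracks lags the same
tag on the right-hand side. A small illustration with invented values:

    #include "cls/journal/cls_journal_types.h"

    // Sketch: p1 sorts before p2 because tag1 lags (120 < 121), even
    // though both positions track the same tags.
    void compare_positions() {
      using namespace cls::journal;
      EntryPositions a, b;
      a.push_back(EntryPosition("tag1", 120));
      a.push_back(EntryPosition("tag2", 121));
      b.push_back(EntryPosition("tag1", 121));
      b.push_back(EntryPosition("tag2", 121));
      ObjectSetPosition p1(0, a), p2(1, b);
      bool earlier = p1 < p2;   // true
      (void)earlier;
    }
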
diff --git a/src/cls/journal/cls_journal_types.h b/src/cls/journal/cls_journal_types.h
new file mode 100644
index 0000000..dd38b0d
--- /dev/null
+++ b/src/cls/journal/cls_journal_types.h
@@ -0,0 +1,108 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_CLS_JOURNAL_TYPES_H
+#define CEPH_CLS_JOURNAL_TYPES_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "include/encoding.h"
+#include <iosfwd>
+#include <list>
+#include <string>
+
+namespace ceph {
+class Formatter;
+}
+
+namespace cls {
+namespace journal {
+
+struct EntryPosition {
+  std::string tag;
+  uint64_t tid;
+
+  EntryPosition() : tid(0) {}
+  EntryPosition(const std::string& _tag, uint64_t _tid)
+    : tag(_tag), tid(_tid) {}
+
+  inline bool operator==(const EntryPosition& rhs) const {
+    return (tag == rhs.tag && tid == rhs.tid);
+  }
+
+  void encode(bufferlist& bl) const;
+  void decode(bufferlist::iterator& iter);
+  void dump(Formatter *f) const;
+
+  static void generate_test_instances(std::list<EntryPosition *> &o);
+};
+
+typedef std::list<EntryPosition> EntryPositions;
+
+struct ObjectSetPosition {
+  uint64_t object_number;
+  EntryPositions entry_positions;
+
+  ObjectSetPosition() : object_number(0) {}
+  ObjectSetPosition(uint64_t _object_number,
+                    const EntryPositions &_entry_positions)
+    : object_number(_object_number), entry_positions(_entry_positions) {}
+
+  bool operator<(const ObjectSetPosition& rhs) const;
+  inline bool operator<=(const ObjectSetPosition& rhs) const {
+    return (*this == rhs || *this < rhs);
+  }
+  inline bool operator==(const ObjectSetPosition &rhs) const {
+    return (entry_positions == rhs.entry_positions);
+  }
+
+  void encode(bufferlist& bl) const;
+  void decode(bufferlist::iterator& iter);
+  void dump(Formatter *f) const;
+
+  static void generate_test_instances(std::list<ObjectSetPosition *> &o);
+};
+
+struct Client {
+  std::string id;
+  std::string description;
+  ObjectSetPosition commit_position;
+
+  Client() {}
+  Client(const std::string& _id, const std::string& _description,
+         const ObjectSetPosition &_commit_position = ObjectSetPosition())
+    : id(_id), description(_description), commit_position(_commit_position) {}
+
+  inline bool operator==(const Client &rhs) const {
+    return (id == rhs.id && description == rhs.description &&
+            commit_position == rhs.commit_position);
+  }
+  inline bool operator<(const Client &rhs) const {
+    return (id < rhs.id);
+  }
+
+  void encode(bufferlist& bl) const;
+  void decode(bufferlist::iterator& iter);
+  void dump(Formatter *f) const;
+
+  static void generate_test_instances(std::list<Client *> &o);
+};
+
+WRITE_CLASS_ENCODER(EntryPosition);
+WRITE_CLASS_ENCODER(ObjectSetPosition);
+WRITE_CLASS_ENCODER(Client);
+
+std::ostream &operator<<(std::ostream &os,
+                         const EntryPosition &entry_position);
+std::ostream &operator<<(std::ostream &os,
+                         const ObjectSetPosition &object_set_position);
+std::ostream &operator<<(std::ostream &os,
+			 const Client &client);
+
+} // namespace journal
+} // namespace cls
+
+using cls::journal::encode;
+using cls::journal::decode;
+
+#endif // CEPH_CLS_JOURNAL_TYPES_H
diff --git a/src/cls/rbd/cls_rbd.cc b/src/cls/rbd/cls_rbd.cc
index 74af0a2..f83353b 100644
--- a/src/cls/rbd/cls_rbd.cc
+++ b/src/cls/rbd/cls_rbd.cc
@@ -85,6 +85,7 @@ cls_method_handle_t h_get_object_prefix;
 cls_method_handle_t h_get_snapshot_name;
 cls_method_handle_t h_snapshot_add;
 cls_method_handle_t h_snapshot_remove;
+cls_method_handle_t h_snapshot_rename;
 cls_method_handle_t h_get_all_features;
 cls_method_handle_t h_copyup;
 cls_method_handle_t h_get_id;
@@ -108,6 +109,7 @@ cls_method_handle_t h_metadata_get;
 cls_method_handle_t h_old_snapshots_list;
 cls_method_handle_t h_old_snapshot_add;
 cls_method_handle_t h_old_snapshot_remove;
+cls_method_handle_t h_old_snapshot_rename;
 
 #define RBD_MAX_KEYS_READ 64
 #define RBD_SNAP_KEY_PREFIX "snapshot_"
@@ -1560,6 +1562,83 @@ int snapshot_add(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
   return 0;
 }
 
+
+/**
+ * Rename a snapshot.
+ *
+ * Input:
+ * @param src_snap_id old snap id of the snapshot (snapid_t)
+ * @param dst_snap_name new name of the snapshot (string)
+ *
+ * Output:
+ * @returns 0 on success, negative error code on failure.
+ */
+int snapshot_rename(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
+{
+  bufferlist snap_namebl, snap_idbl;
+  snapid_t src_snap_id;
+  string src_snap_key, dst_snap_name;
+  cls_rbd_snap snap_meta;
+  int r;
+
+  try {
+    bufferlist::iterator iter = in->begin();
+    ::decode(src_snap_id, iter);
+    ::decode(dst_snap_name, iter);
+  } catch (const buffer::error &err) {
+    return -EINVAL;
+  }
+  
+  CLS_LOG(20, "snapshot_rename id=%llu dst_name=%s", (unsigned long long)src_snap_id.val,
+	 dst_snap_name.c_str());
+
+  int max_read = RBD_MAX_KEYS_READ;
+  string last_read = RBD_SNAP_KEY_PREFIX;
+  do {
+    map<string, bufferlist> vals;
+    r = cls_cxx_map_get_vals(hctx, last_read, RBD_SNAP_KEY_PREFIX,
+			     max_read, &vals);
+    if (r < 0)
+      return r;
+
+    for (map<string, bufferlist>::iterator it = vals.begin();
+	 it != vals.end(); ++it) {
+      bufferlist::iterator iter = it->second.begin();
+      try {
+	::decode(snap_meta, iter);
+      } catch (const buffer::error &err) {
+	CLS_ERR("error decoding snapshot metadata for snap: %s",
+	        dst_snap_name.c_str());
+	return -EIO;
+      }
+      if (dst_snap_name == snap_meta.name) {
+	CLS_LOG(20, "snap_name %s matches existing snap with snap id = %llu",
+		dst_snap_name.c_str(), (unsigned long long)snap_meta.id.val);
+        return -EEXIST;
+      }
+    }
+    if (!vals.empty())
+      last_read = vals.rbegin()->first;
+  } while (r == RBD_MAX_KEYS_READ);
+
+  key_from_snap_id(src_snap_id, &src_snap_key);
+  r = read_key(hctx, src_snap_key, &snap_meta); 
+  if (r == -ENOENT) {
+    CLS_LOG(20, "cannot find existing snap with snap id = %llu", (unsigned long long)src_snap_id);
+    return r;
+  }
+  snap_meta.name = dst_snap_name;
+  bufferlist snap_metabl;
+  ::encode(snap_meta, snap_metabl);
+
+  r = cls_cxx_map_set_val(hctx, src_snap_key, &snap_metabl);
+  if (r < 0) {
+    CLS_ERR("error writing snapshot metadata: %d", r);
+    return r;
+  }
+
+  return 0;
+}
 /**
  * Removes a snapshot from an rbd header.
  *
@@ -2199,6 +2278,7 @@ int object_map_update(cls_method_context_t hctx, bufferlist *in, bufferlist *out
     ::decode(new_object_state, iter);
     ::decode(current_object_state, iter);
   } catch (const buffer::error &err) {
+    CLS_ERR("failed to decode message");
     return -EINVAL;
   }
 
@@ -2212,6 +2292,7 @@ int object_map_update(cls_method_context_t hctx, bufferlist *in, bufferlist *out
   bufferlist header_bl;
   r = cls_cxx_read(hctx, 0, object_map.get_header_length(), &header_bl);
   if (r < 0) {
+    CLS_ERR("object map header read failed");
     return r;
   }
 
@@ -2247,6 +2328,7 @@ int object_map_update(cls_method_context_t hctx, bufferlist *in, bufferlist *out
   r = cls_cxx_read(hctx, object_map.get_header_length() + byte_offset,
 		   byte_length, &data_bl); 
   if (r < 0) {
+    CLS_ERR("object map data read failed");
     return r;
   }
 
@@ -2285,8 +2367,15 @@ int object_map_update(cls_method_context_t hctx, bufferlist *in, bufferlist *out
     object_map.encode_footer(footer_bl);
     r = cls_cxx_write(hctx, object_map.get_footer_offset(), footer_bl.length(),
 		      &footer_bl);
+  } else {
+    CLS_LOG(20, "object_map_update: no update necessary");
   }
-  return r;
+
+  if (r < 0) {
+    return r;
+  }
+
+  return 0;
 }
 
 /**
@@ -2721,6 +2810,102 @@ int old_snapshot_remove(cls_method_context_t hctx, bufferlist *in, bufferlist *o
   return 0;
 }
 
+/**
+ * Rename a snapshot of an old-format image.
+ *
+ * Input:
+ * @param src_snap_id old snap id of the snapshot (snapid_t)
+ * @param dst_snap_name new name of the snapshot (string)
+ *
+ * Output:
+ * @returns 0 on success, negative error code on failure.
+ */
+int old_snapshot_rename(cls_method_context_t hctx, bufferlist *in, bufferlist *out)
+{
+  bufferlist bl;
+  struct rbd_obj_header_ondisk *header;
+  bufferlist newbl;
+  bufferptr header_bp(sizeof(*header));
+  snapid_t src_snap_id;
+  const char *dst_snap_name;
+  string dst;
+
+  int rc = snap_read_header(hctx, bl);
+  if (rc < 0)
+    return rc;
+
+  header = (struct rbd_obj_header_ondisk *)bl.c_str();
+
+  int snaps_id_ofs = sizeof(*header);
+  int names_ofs = snaps_id_ofs + sizeof(rbd_obj_snap_ondisk) * header->snap_count;
+  const char *snap_names = ((char *)header) + names_ofs;
+  const char *orig_names = snap_names;
+  const char *end = snap_names + header->snap_names_len;
+  bufferlist::iterator iter = in->begin();
+  unsigned i;
+  bool found = false;
+
+  try {
+    ::decode(src_snap_id, iter);
+    ::decode(dst, iter);
+  } catch (const buffer::error &err) {
+    return -EINVAL;
+  }
+  dst_snap_name = dst.c_str();
+
+  const char *cur_snap_name;
+  for (cur_snap_name = snap_names; cur_snap_name < end; 
+    cur_snap_name += strlen(cur_snap_name) + 1) {
+    if (strcmp(cur_snap_name, dst_snap_name) == 0)
+      return -EEXIST;
+  }
+  if (cur_snap_name > end)
+    return -EIO;
+  for (i = 0; i < header->snap_count; i++) {
+    if (src_snap_id == header->snaps[i].id) {
+      found = true;
+      break;
+    }
+    snap_names += strlen(snap_names) + 1;
+  }
+  if (!found) {
+    CLS_ERR("couldn't find snap %llu\n", (unsigned long long)src_snap_id.val);
+    return -ENOENT;
+  }
+  
+  CLS_LOG(20, "rename snap with snap id %llu to dest name %s", (unsigned long long)src_snap_id.val, dst_snap_name);
+  header->snap_names_len = header->snap_names_len - strlen(snap_names) + dst.length();
+
+  bufferptr new_names_bp(header->snap_names_len);
+  bufferptr new_snaps_bp(sizeof(header->snaps[0]) * header->snap_count);
+
+  if (header->snap_count) {
+    int names_len = 0;
+    CLS_LOG(20, "i=%d\n", i);
+    if (i > 0) {
+      names_len =  snap_names - orig_names;
+      memcpy(new_names_bp.c_str(), orig_names, names_len);
+    }
+    strcpy(new_names_bp.c_str() + names_len, dst_snap_name);
+    names_len += strlen(dst_snap_name) + 1;
+    snap_names += strlen(snap_names) + 1;
+    if (i < header->snap_count) {
+      memcpy(new_names_bp.c_str() + names_len, snap_names , end - snap_names);
+    }
+    memcpy(new_snaps_bp.c_str(), header->snaps, sizeof(header->snaps[0]) * header->snap_count);
+  }
+
+  memcpy(header_bp.c_str(), header, sizeof(*header));
+  newbl.push_back(header_bp);
+  newbl.push_back(new_snaps_bp);
+  newbl.push_back(new_names_bp);
+
+  rc = cls_cxx_write_full(hctx, &newbl);
+  if (rc < 0)
+    return rc;
+
+  return 0;
+}
 
 void __cls_init()
 {
@@ -2757,6 +2942,9 @@ void __cls_init()
   cls_register_cxx_method(h_class, "snapshot_remove",
 			  CLS_METHOD_RD | CLS_METHOD_WR,
 			  snapshot_remove, &h_snapshot_remove);
+  cls_register_cxx_method(h_class, "snapshot_rename",
+			  CLS_METHOD_RD | CLS_METHOD_WR,
+			  snapshot_rename, &h_snapshot_rename);
   cls_register_cxx_method(h_class, "get_all_features",
 			  CLS_METHOD_RD,
 			  get_all_features, &h_get_all_features);
@@ -2872,6 +3060,9 @@ void __cls_init()
   cls_register_cxx_method(h_class, "snap_remove",
 			  CLS_METHOD_RD | CLS_METHOD_WR,
 			  old_snapshot_remove, &h_old_snapshot_remove);
+  cls_register_cxx_method(h_class, "snap_rename",
+			  CLS_METHOD_RD | CLS_METHOD_WR,
+			  old_snapshot_rename, &h_old_snapshot_rename);
 
   return;
 }
diff --git a/src/cls/rbd/cls_rbd_client.cc b/src/cls/rbd/cls_rbd_client.cc
index 0385ec9..eec813d 100644
--- a/src/cls/rbd/cls_rbd_client.cc
+++ b/src/cls/rbd/cls_rbd_client.cc
@@ -392,6 +392,23 @@ namespace librbd {
       return ioctx->exec(oid, "rbd", "snapshot_remove", bl, bl2);
     }
 
+    int snapshot_rename(librados::IoCtx *ioctx, const std::string &oid,
+			 snapid_t src_snap_id,
+		         const std::string &dst_name)
+    {
+      librados::ObjectWriteOperation op;
+      snapshot_rename(&op, src_snap_id, dst_name);
+      return ioctx->operate(oid, &op);
+    }
+    void snapshot_rename(librados::ObjectWriteOperation *op,
+			 snapid_t src_snap_id,
+		         const std::string &dst_name)
+    {
+      bufferlist bl;
+      ::encode(src_snap_id, bl);
+      ::encode(dst_name, bl);
+      op->exec("rbd", "snapshot_rename", bl);
+    }
     int get_snapcontext(librados::IoCtx *ioctx, const std::string &oid,
 			::SnapContext *snapc)
     {
@@ -484,6 +501,16 @@ namespace librbd {
       return ioctx->exec(oid, "rbd", "snap_add", bl, bl2);
     }
 
+    int old_snapshot_rename(librados::IoCtx *ioctx, const std::string &oid,
+			    snapid_t src_snap_id,
+			    const std::string &dst_name)
+    {
+      bufferlist bl, bl2;
+      ::encode(src_snap_id, bl);
+      ::encode(dst_name, bl);
+
+      return ioctx->exec(oid, "rbd", "snap_rename", bl, bl2);
+    }
     int old_snapshot_remove(librados::IoCtx *ioctx, const std::string &oid,
 			    const std::string &snap_name)
     {
diff --git a/src/cls/rbd/cls_rbd_client.h b/src/cls/rbd/cls_rbd_client.h
index 486d17f..6235fea 100644
--- a/src/cls/rbd/cls_rbd_client.h
+++ b/src/cls/rbd/cls_rbd_client.h
@@ -70,6 +70,12 @@ namespace librbd {
 		      const std::string &snap_name);
     int snapshot_remove(librados::IoCtx *ioctx, const std::string &oid,
 			snapid_t snap_id);
+    int snapshot_rename(librados::IoCtx *ioctx, const std::string &oid,
+			snapid_t src_snap_id,
+			const std::string &dst_name);
+    void snapshot_rename(librados::ObjectWriteOperation *op,
+			snapid_t src_snap_id,
+			const std::string &dst_name);
     int get_snapcontext(librados::IoCtx *ioctx, const std::string &oid,
 			::SnapContext *snapc);
     int snapshot_list(librados::IoCtx *ioctx, const std::string &oid,
@@ -144,6 +150,9 @@ namespace librbd {
 			  std::vector<string> *names,
 			  std::vector<uint64_t> *sizes,
 			  ::SnapContext *snapc);
+    int old_snapshot_rename(librados::IoCtx *ioctx, const std::string &oid,
+			    snapid_t src_snap_id,
+			    const std::string &dst_name);
   } // namespace cls_client
 } // namespace librbd
 #endif // CEPH_LIBRBD_CLS_RBD_CLIENT_H
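
The new client wrappers follow the pattern of the existing snapshot calls: encode the source snap id and the destination name into a bufferlist, then invoke the matching "rbd" object-class method. A minimal usage sketch, assuming an already-open IoCtx; the header object name and snap id below are hypothetical:

    // one-shot form: builds and submits its own write operation
    librados::IoCtx ioctx;                              // assumed open on the image's pool
    const std::string header_oid = "rbd_header.1234";   // hypothetical header object
    int r = librbd::cls_client::snapshot_rename(&ioctx, header_oid,
                                                snapid_t(5), "snap-new");

    // composable form: queue the rename inside a caller-owned operation
    librados::ObjectWriteOperation op;
    librbd::cls_client::snapshot_rename(&op, snapid_t(5), "snap-new");
    r = ioctx.operate(header_oid, &op);

The two-overload shape (an IoCtx convenience wrapper over an ObjectWriteOperation builder) lets callers batch the rename with other header updates in a single atomic op.
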
diff --git a/src/common/Makefile.am b/src/common/Makefile.am
index f98cd18..12572e6 100644
--- a/src/common/Makefile.am
+++ b/src/common/Makefile.am
@@ -26,7 +26,6 @@ libcommon_internal_la_SOURCES = \
 	common/MemoryModel.cc \
 	common/armor.c \
 	common/fd.cc \
-	common/xattr.c \
 	common/safe_io.c \
 	common/snap_types.cc \
 	common/str_list.cc \
@@ -57,10 +56,6 @@ libcommon_internal_la_SOURCES = \
 	common/entity_name.cc \
 	common/ceph_crypto.cc \
 	common/ceph_crypto_cms.cc \
-	common/ceph_json.cc \
-	common/ipaddr.cc \
-	common/pick_address.cc \
-	common/util.cc \
 	common/TextTable.cc \
 	common/ceph_fs.cc \
 	common/ceph_hash.cc \
@@ -69,15 +64,37 @@ libcommon_internal_la_SOURCES = \
 	common/addr_parsing.c \
 	common/hobject.cc \
 	common/bloom_filter.cc \
-	common/linux_version.c \
 	common/module.c \
 	common/Readahead.cc \
 	common/Cycles.cc \
 	common/ContextCompletion.cc \
 	common/TracepointProvider.cc
 
+if ENABLE_SERVER
+libcommon_internal_la_SOURCES += \
+	common/xattr.c \
+	common/ipaddr.cc \
+	common/ceph_json.cc \
+	common/util.cc \
+	common/pick_address.cc
+endif
+
+if LINUX
+libcommon_internal_la_SOURCES += \
+	common/linux_version.c 
+endif
+
+if SOLARIS
+libcommon_internal_la_SOURCES += \
+        common/solaris_errno.cc
+endif
+
+if WITH_RBD
+if LINUX
 libcommon_internal_la_SOURCES += \
 	common/blkdev.cc
+endif
+endif
 
 if ENABLE_XIO
 libcommon_internal_la_SOURCES += \
diff --git a/src/common/MemoryModel.cc b/src/common/MemoryModel.cc
index f737a7b..ddc7fa9 100644
--- a/src/common/MemoryModel.cc
+++ b/src/common/MemoryModel.cc
@@ -1,4 +1,5 @@
 
+#include "acconfig.h"
 #include "include/types.h"
 #include "MemoryModel.h"
 #include "common/config.h"
@@ -92,7 +93,7 @@ void MemoryModel::_sample(snap *psnap)
   psnap->heap = heap >> 10;
 
   // ...
-#if defined(__linux__)
+#if defined(HAVE_MALLINFO)
   struct mallinfo mi = mallinfo();
   
   psnap->malloc = mi.uordblks >> 10;
diff --git a/src/common/SubProcess.h b/src/common/SubProcess.h
index 18d6e92..8b8ffc7 100644
--- a/src/common/SubProcess.h
+++ b/src/common/SubProcess.h
@@ -26,6 +26,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
+#include <signal.h>
 
 #include <sstream>
 #include <vector>
@@ -44,9 +45,9 @@
  *     std::cerr << "cat failed: " << cat.err() << std::endl;
  *     return false;
  *   }
- *   write_to_fd(cat.stdout(), "hello world!\n");
+ *   write_to_fd(cat.get_stdout(), "hello world!\n");
  *   cat.close_stdout();
- *   read_from_fd(cat.stdin(), buf);
+ *   read_from_fd(cat.get_stdin(), buf);
  *   if (cat.join() != 0) {
  *     std::cerr << cat.err() << std::endl;
  *     return false;
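
For context on the renames in the next hunk: some C libraries define stdin, stdout, and stderr as object-like macros, so member functions carrying those names can fail to preprocess; get_stdin() and friends sidestep that. A hedged illustration (the macro body is representative, not any particular libc's):

    #include <cstdio>
    // suppose the libc does:  #define stderr (&__iob[2])
    struct Demo {
      int stderr() const;  // expands to "int (&__iob[2])() const" -- a syntax error
    };
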
@@ -67,9 +68,9 @@ public:
 
   bool is_spawned() const { return pid > 0; }
 
-  int stdin() const;
-  int stdout() const;
-  int stderr() const;
+  int get_stdin() const;
+  int get_stdout() const;
+  int get_stderr() const;
 
   void close_stdin();
   void close_stdout();
@@ -113,12 +114,12 @@ private:
   int sigkill;
 };
 
-SubProcess::SubProcess(const char *cmd_, bool stdin, bool stdout, bool stderr) :
+SubProcess::SubProcess(const char *cmd_, bool use_stdin, bool use_stdout, bool use_stderr) :
   cmd(cmd_),
   cmd_args(),
-  pipe_stdin(stdin),
-  pipe_stdout(stdout),
-  pipe_stderr(stderr),
+  pipe_stdin(use_stdin),
+  pipe_stdout(use_stdout),
+  pipe_stderr(use_stderr),
   stdin_pipe_out_fd(-1),
   stdout_pipe_in_fd(-1),
   stderr_pipe_in_fd(-1),
@@ -152,21 +153,21 @@ void SubProcess::add_cmd_arg(const char *arg) {
   cmd_args.push_back(arg);
 }
 
-int SubProcess::stdin() const {
+int SubProcess::get_stdin() const {
   assert(is_spawned());
   assert(pipe_stdin);
 
   return stdin_pipe_out_fd;
 }
 
-int SubProcess::stdout() const {
+int SubProcess::get_stdout() const {
   assert(is_spawned());
   assert(pipe_stdout);
 
   return stdout_pipe_in_fd;
 }
 
-int SubProcess::stderr() const {
+int SubProcess::get_stderr() const {
   assert(is_spawned());
   assert(pipe_stderr);
 
diff --git a/src/common/Thread.h b/src/common/Thread.h
index e284bda..98b76fd 100644
--- a/src/common/Thread.h
+++ b/src/common/Thread.h
@@ -29,8 +29,8 @@ class Thread {
   void *entry_wrapper();
 
  public:
-  explicit Thread(const Thread& other);
-  const Thread& operator=(const Thread& other);
+  Thread(const Thread&) = delete;
+  Thread& operator=(const Thread&) = delete;
 
   Thread();
   virtual ~Thread();
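
A note on what `= delete` buys over the old declared-but-undefined copy operations: copying now fails at compile time with a clear diagnostic instead of at link time. A minimal sketch, assuming Thread's usual pure-virtual entry():

    struct MyThread : public Thread {
      virtual void *entry() { return NULL; }  // satisfy the pure virtual
    };
    MyThread a;
    MyThread b(a);  // error: use of deleted function 'Thread::Thread(const Thread&)'
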
diff --git a/src/common/Throttle.cc b/src/common/Throttle.cc
index d117794..469c808 100644
--- a/src/common/Throttle.cc
+++ b/src/common/Throttle.cc
@@ -42,8 +42,8 @@ Throttle::Throttle(CephContext *cct, const std::string& n, int64_t m, bool _use_
 
   if (cct->_conf->throttler_perf_counter) {
     PerfCountersBuilder b(cct, string("throttle-") + name, l_throttle_first, l_throttle_last);
-    b.add_u64_counter(l_throttle_val, "val", "Currently available throttle");
-    b.add_u64_counter(l_throttle_max, "max", "Max value for throttle");
+    b.add_u64(l_throttle_val, "val", "Currently available throttle");
+    b.add_u64(l_throttle_max, "max", "Max value for throttle");
     b.add_u64_counter(l_throttle_get, "get", "Gets");
     b.add_u64_counter(l_throttle_get_sum, "get_sum", "Got data");
     b.add_u64_counter(l_throttle_get_or_fail_fail, "get_or_fail_fail", "Get blocked during get_or_fail");
diff --git a/src/common/TrackedOp.cc b/src/common/TrackedOp.cc
index f759894..1be8f5c 100644
--- a/src/common/TrackedOp.cc
+++ b/src/common/TrackedOp.cc
@@ -38,10 +38,9 @@ void OpHistory::on_shutdown()
 
 void OpHistory::insert(utime_t now, TrackedOpRef op)
 {
+  Mutex::Locker history_lock(ops_history_lock);
   if (shutdown)
     return;
-
-  Mutex::Locker history_lock(ops_history_lock);
   duration.insert(make_pair(op->get_duration(), op));
   arrived.insert(make_pair(op->get_initiated(), op));
   cleanup(now);
@@ -118,8 +117,8 @@ void OpTracker::dump_ops_in_flight(Formatter *f)
 
 void OpTracker::register_inflight_op(xlist<TrackedOp*>::item *i)
 {
-  if (!tracking_enabled)
-    return;
+  // caller checks;
+  assert(tracking_enabled);
 
   uint64_t current_seq = seq.inc();
   uint32_t shard_index = current_seq % num_optracker_shards;
@@ -135,7 +134,7 @@ void OpTracker::register_inflight_op(xlist<TrackedOp*>::item *i)
 void OpTracker::unregister_inflight_op(TrackedOp *i)
 {
   // caller checks;
-  assert(tracking_enabled);
+  assert(i->is_tracked);
 
   uint32_t shard_index = i->seq % num_optracker_shards;
   ShardedTrackingData* sdata = sharded_in_flight_list[shard_index];
@@ -146,12 +145,18 @@ void OpTracker::unregister_inflight_op(TrackedOp *i)
     i->xitem.remove_myself();
   }
   i->_unregistered();
-  utime_t now = ceph_clock_now(cct);
-  history.insert(now, TrackedOpRef(i));
+
+  if (!tracking_enabled)
+    delete i;
+  else {
+    utime_t now = ceph_clock_now(cct);
+    history.insert(now, TrackedOpRef(i));
+  }
 }
 
 bool OpTracker::check_ops_in_flight(std::vector<string> &warning_vector)
 {
+  RWLock::RLocker l(lock);
   if (!tracking_enabled)
     return false;
 
@@ -242,12 +247,8 @@ bool OpTracker::check_ops_in_flight(std::vector<string> &warning_vector)
 void OpTracker::get_age_ms_histogram(pow2_hist_t *h)
 {
   h->clear();
-
   utime_t now = ceph_clock_now(NULL);
-  unsigned bin = 30;
-  uint32_t lb = 1 << (bin-1);  // lower bound for this bin
-  int count = 0;
-  
+
   for (uint32_t iter = 0; iter < num_optracker_shards; iter++) {
     ShardedTrackingData* sdata = sharded_in_flight_list[iter];
     assert(NULL != sdata);
@@ -257,26 +258,14 @@ void OpTracker::get_age_ms_histogram(pow2_hist_t *h)
                                                                !i.end(); ++i) {
       utime_t age = now - (*i)->get_initiated();
       uint32_t ms = (long)(age * 1000.0);
-      if (ms >= lb) {
-        count++;
-        continue;
-      }
-      if (count)
-        h->set_bin(bin, count);
-      while (lb > ms) {
-        bin--;
-        lb >>= 1;
-      }
-      count = 1;
+      h->add(ms);
     }
   }
-  if (count)
-    h->set_bin(bin, count);
 }
 
 void OpTracker::mark_event(TrackedOp *op, const string &dest, utime_t time)
 {
-  if (!tracking_enabled)
+  if (!op->is_tracked)
     return;
   return _mark_event(op, dest, time);
 }
@@ -294,7 +283,7 @@ void OpTracker::_mark_event(TrackedOp *op, const string &evt,
 }
 
 void OpTracker::RemoveOnDelete::operator()(TrackedOp *op) {
-  if (!tracker->tracking_enabled) {
+  if (!op->is_tracked) {
     op->_unregistered();
     delete op;
     return;
@@ -306,7 +295,7 @@ void OpTracker::RemoveOnDelete::operator()(TrackedOp *op) {
 
 void TrackedOp::mark_event(const string &event)
 {
-  if (!tracker->tracking_enabled)
+  if (!is_tracked)
     return;
 
   utime_t now = ceph_clock_now(g_ceph_context);
diff --git a/src/common/TrackedOp.h b/src/common/TrackedOp.h
index 2f656ca..89b990a 100644
--- a/src/common/TrackedOp.h
+++ b/src/common/TrackedOp.h
@@ -21,6 +21,7 @@
 #include "include/xlist.h"
 #include "msg/Message.h"
 #include "include/memory.h"
+#include "common/RWLock.h"
 
 class TrackedOp;
 typedef ceph::shared_ptr<TrackedOp> TrackedOpRef;
@@ -77,10 +78,12 @@ class OpTracker {
 public:
   bool tracking_enabled;
   CephContext *cct;
+  RWLock       lock;
   OpTracker(CephContext *cct_, bool tracking, uint32_t num_shards) : seq(0), 
                                      num_optracker_shards(num_shards),
 				     complaint_time(0), log_threshold(0),
-				     tracking_enabled(tracking), cct(cct_) {
+				     tracking_enabled(tracking), cct(cct_),
+				     lock("OpTracker::lock") {
 
     for (uint32_t i = 0; i < num_optracker_shards; i++) {
       char lock_name[32] = {0};
@@ -97,6 +100,10 @@ public:
   void set_history_size_and_duration(uint32_t new_size, uint32_t new_duration) {
     history.set_size_and_duration(new_size, new_duration);
   }
+  void set_tracking(bool enable) {
+    RWLock::WLocker l(lock);
+    tracking_enabled = enable;
+  }
   void dump_ops_in_flight(Formatter *f);
   void dump_historic_ops(Formatter *f);
   void register_inflight_op(xlist<TrackedOp*>::item *i);
@@ -151,18 +158,22 @@ protected:
   uint64_t seq; /// a unique value set by the OpTracker
 
   uint32_t warn_interval_multiplier; // limits output of a given op warning
-
+  bool is_tracked; // whether this op is registered in the tracker
   TrackedOp(OpTracker *_tracker, const utime_t& initiated) :
     xitem(this),
     tracker(_tracker),
     initiated_at(initiated),
     lock("TrackedOp::lock"),
     seq(0),
-    warn_interval_multiplier(1)
+    warn_interval_multiplier(1),
+    is_tracked(false)
   {
-    tracker->register_inflight_op(&xitem);
-    if (tracker->tracking_enabled)
+    RWLock::RLocker l(tracker->lock);
+    if (tracker->tracking_enabled) {
+      tracker->register_inflight_op(&xitem);
       events.push_back(make_pair(initiated_at, "initiated"));
+      is_tracked = true;
+    }
   }
 
   /// output any type-specific data you want to get when dump() is called
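
The rework above moves the enable/disable decision to construction time: each op samples tracking_enabled under the new reader lock and remembers the answer in is_tracked, so flipping tracking later cannot strand a registered op. A hedged sketch of the intended runtime toggle (constructor arguments are illustrative):

    OpTracker tracker(g_ceph_context, true /*tracking*/, 8 /*shards*/);
    tracker.set_tracking(false);  // takes the writer side of OpTracker::lock

    // Ops created from here on keep is_tracked == false: they skip
    // registration, and RemoveOnDelete frees them directly instead of
    // moving them into the op history.
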
diff --git a/src/common/WorkQueue.h b/src/common/WorkQueue.h
index f0754de..7c3ccb5 100644
--- a/src/common/WorkQueue.h
+++ b/src/common/WorkQueue.h
@@ -18,6 +18,7 @@
 #include "Mutex.h"
 #include "Cond.h"
 #include "Thread.h"
+#include "include/unordered_map.h"
 #include "common/config_obs.h"
 #include "common/HeartbeatMap.h"
 
@@ -349,6 +350,84 @@ public:
 
   };
 
+  template<typename T>
+  class PointerWQ : public WorkQueue_ {
+  public:
+    PointerWQ(string n, time_t ti, time_t sti, ThreadPool* p)
+      : WorkQueue_(n, ti, sti), m_pool(p), m_processing(0) {
+      m_pool->add_work_queue(this);
+    }
+    ~PointerWQ() {
+      m_pool->remove_work_queue(this);
+      assert(m_processing == 0);
+    }
+    void drain() {
+      {
+        // if this queue is empty and not processing, don't wait for other
+        // queues to finish processing
+        Mutex::Locker l(m_pool->_lock);
+        if (m_processing == 0 && m_items.empty()) {
+          return;
+        }
+      }
+      m_pool->drain(this);
+    }
+    void queue(T *item) {
+      Mutex::Locker l(m_pool->_lock);
+      m_items.push_back(item);
+      m_pool->_cond.SignalOne();
+    }
+    bool empty() {
+      Mutex::Locker l(m_pool->_lock);
+      return _empty();
+    }
+  protected:
+    virtual void _clear() {
+      assert(m_pool->_lock.is_locked());
+      m_items.clear();
+    }
+    virtual bool _empty() {
+      assert(m_pool->_lock.is_locked());
+      return m_items.empty();
+    }
+    virtual void *_void_dequeue() {
+      assert(m_pool->_lock.is_locked());
+      if (m_items.empty()) {
+        return NULL;
+      }
+
+      ++m_processing;
+      T *item = m_items.front();
+      m_items.pop_front();
+      return item;
+    }
+    virtual void _void_process(void *item, ThreadPool::TPHandle &handle) {
+      process(reinterpret_cast<T *>(item));
+    }
+    virtual void _void_process_finish(void *item) {
+      assert(m_pool->_lock.is_locked());
+      assert(m_processing > 0);
+      --m_processing;
+    }
+
+    virtual void process(T *item) = 0;
+
+    T *front() {
+      assert(m_pool->_lock.is_locked());
+      if (m_items.empty()) {
+        return NULL;
+      }
+      return m_items.front();
+    }
+    void signal() {
+      Mutex::Locker pool_locker(m_pool->_lock);
+      m_pool->_cond.SignalOne();
+    }
+  private:
+    ThreadPool *m_pool;
+    std::list<T *> m_items;
+    uint32_t m_processing;
+  };
 private:
   vector<WorkQueue_*> work_queues;
   int last_work_queue;
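
PointerWQ is a leaner alternative to WorkQueueVal for pointer-sized items: a single std::list guarded by the pool lock, with m_processing letting drain() return immediately when nothing is queued or running. A minimal consumer sketch; Request and RequestWQ are hypothetical names:

    struct Request { int id; };

    class RequestWQ : public ThreadPool::PointerWQ<Request> {
    public:
      RequestWQ(ThreadPool *tp)
        : ThreadPool::PointerWQ<Request>("RequestWQ", 60, 0, tp) {}
    protected:
      virtual void process(Request *req) {
        // runs in a thread-pool worker; the queue hands us ownership
        delete req;
      }
    };

Typical use would be wq.queue(new Request()) from a producer and wq.drain() at shutdown.
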
@@ -488,37 +567,44 @@ public:
 
 /// Work queue that asynchronously completes contexts (executes callbacks).
 /// @see Finisher
-class ContextWQ : public ThreadPool::WorkQueueVal<std::pair<Context *, int> > {
+class ContextWQ : public ThreadPool::PointerWQ<Context> {
 public:
   ContextWQ(const string &name, time_t ti, ThreadPool *tp)
-    : ThreadPool::WorkQueueVal<std::pair<Context *, int> >(name, ti, 0, tp) {}
+    : ThreadPool::PointerWQ<Context>(name, ti, 0, tp),
+      m_lock("ContextWQ::m_lock") {
+  }
 
   void queue(Context *ctx, int result = 0) {
-    ThreadPool::WorkQueueVal<std::pair<Context *, int> >::queue(
-      std::make_pair(ctx, result));
+    if (result != 0) {
+      Mutex::Locker locker(m_lock);
+      m_context_results[ctx] = result;
+    }
+    ThreadPool::PointerWQ<Context>::queue(ctx);
   }
-
 protected:
-  virtual void _enqueue(std::pair<Context *, int> item) {
-    _queue.push_back(item);
-  }
-  virtual void _enqueue_front(std::pair<Context *, int> item) {
-    _queue.push_front(item);
-  }
-  virtual bool _empty() {
-    return _queue.empty();
-  }
-  virtual std::pair<Context *, int> _dequeue() {
-    std::pair<Context *, int> item = _queue.front();
-    _queue.pop_front();
-    return item;
+  virtual void _clear() {
+    ThreadPool::PointerWQ<Context>::_clear();
+
+    Mutex::Locker locker(m_lock);
+    m_context_results.clear();
   }
-  virtual void _process(std::pair<Context *, int> item) {
-    item.first->complete(item.second);
+
+  virtual void process(Context *ctx) {
+    int result = 0;
+    {
+      Mutex::Locker locker(m_lock);
+      ceph::unordered_map<Context *, int>::iterator it =
+        m_context_results.find(ctx);
+      if (it != m_context_results.end()) {
+        result = it->second;
+        m_context_results.erase(it);
+      }
+    }
+    ctx->complete(result);
   }
-  using ThreadPool::WorkQueueVal<std::pair<Context *, int> >::_process;
 private:
-  list<std::pair<Context *, int> > _queue;
+  Mutex m_lock;
+  ceph::unordered_map<Context*, int> m_context_results;
 };
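
ContextWQ now rides on PointerWQ: only the Context pointer is queued, and non-zero results are parked in a side map that process() consults (and erases) before calling complete(). Since complete() deletes the context, each pointer can be queued at most once, so keying the map by Context* is safe. A hedged usage sketch; PrintResult and the pool object are illustrative:

    struct PrintResult : public Context {
      virtual void finish(int r) { std::cout << "completed: " << r << std::endl; }
    };

    ContextWQ wq("sketch-wq", 60, &tp);   // "tp" is an assumed ThreadPool
    wq.queue(new PrintResult, -ENOENT);   // result stashed in m_context_results
    wq.queue(new PrintResult);            // result 0 needs no map entry
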
 
 class ShardedThreadPool {
diff --git a/src/common/buffer.cc b/src/common/buffer.cc
index bca14d1..6f200bc 100644
--- a/src/common/buffer.cc
+++ b/src/common/buffer.cc
@@ -1333,7 +1333,7 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
     }
   }
   
-  bool buffer::list::is_contiguous()
+  bool buffer::list::is_contiguous() const
   {
     return &(*_buffers.begin()) == &(*_buffers.rbegin());
   }
diff --git a/src/common/ceph_context.cc b/src/common/ceph_context.cc
index 7383ed7..6e60cf7 100644
--- a/src/common/ceph_context.cc
+++ b/src/common/ceph_context.cc
@@ -398,11 +398,12 @@ void CephContext::do_command(std::string command, cmdmap_t& cmdmap,
 }
 
 
-CephContext::CephContext(uint32_t module_type_)
+CephContext::CephContext(uint32_t module_type_, int init_flags_)
   : nref(1),
     _conf(new md_config_t()),
     _log(NULL),
     _module_type(module_type_),
+    _init_flags(init_flags_),
     _crypto_inited(false),
     _service_thread(NULL),
     _log_obs(NULL),
@@ -586,6 +587,11 @@ uint32_t CephContext::get_module_type() const
   return _module_type;
 }
 
+int CephContext::get_init_flags() const
+{
+  return _init_flags;
+}
+
 PerfCountersCollection *CephContext::get_perfcounters_collection()
 {
   return _perf_counters_collection;
diff --git a/src/common/ceph_context.h b/src/common/ceph_context.h
index 037f2d8..3820a23 100644
--- a/src/common/ceph_context.h
+++ b/src/common/ceph_context.h
@@ -55,7 +55,7 @@ using ceph::bufferlist;
  */
 class CephContext {
 public:
-  CephContext(uint32_t module_type_);
+  CephContext(uint32_t module_type_, int init_flags_ = 0);
 
   // ref count!
 private:
@@ -86,6 +86,8 @@ public:
   /* Get the module type (client, mon, osd, mds, etc.) */
   uint32_t get_module_type() const;
 
+  int get_init_flags() const;
+
   /* Get the PerfCountersCollection of this CephContext */
   PerfCountersCollection *get_perfcounters_collection();
 
@@ -173,6 +175,8 @@ private:
 
   uint32_t _module_type;
 
+  int _init_flags;
+
   bool _crypto_inited;
 
   /* libcommon service thread.
diff --git a/src/common/common_init.cc b/src/common/common_init.cc
index a580309..23c2e7c 100644
--- a/src/common/common_init.cc
+++ b/src/common/common_init.cc
@@ -39,7 +39,7 @@ CephContext *common_preinit(const CephInitParameters &iparams,
   g_code_env = code_env;
 
   // Create a configuration object
-  CephContext *cct = new CephContext(iparams.module_type);
+  CephContext *cct = new CephContext(iparams.module_type, flags);
 
   md_config_t *conf = cct->_conf;
   // add config observers here
@@ -113,10 +113,10 @@ void complain_about_parse_errors(CephContext *cct,
 
 /* Please be sure that this can safely be called multiple times by the
  * same application. */
-void common_init_finish(CephContext *cct, int flags)
+void common_init_finish(CephContext *cct)
 {
   cct->init_crypto();
 
-  if (!(flags & CINIT_FLAG_NO_DAEMON_ACTIONS))
+  if (!(cct->get_init_flags() & CINIT_FLAG_NO_DAEMON_ACTIONS))
     cct->start_service_thread();
 }
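
With the flags captured at construction, later phases read them back from the context instead of having every caller re-thread the same value. A minimal sketch; the entity type and flag are chosen for illustration:

    CephContext *cct = new CephContext(CEPH_ENTITY_TYPE_CLIENT,
                                       CINIT_FLAG_NO_DAEMON_ACTIONS);
    // ... argument and config parsing ...
    common_init_finish(cct);  // consults cct->get_init_flags() internally
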
diff --git a/src/common/common_init.h b/src/common/common_init.h
index d6aa9fa..f48b349 100644
--- a/src/common/common_init.h
+++ b/src/common/common_init.h
@@ -75,6 +75,6 @@ void complain_about_parse_errors(CephContext *cct,
  * libraries. The most obvious reason for this is that the threads started by
  * the Ceph libraries would be destroyed by a fork().
  */
-void common_init_finish(CephContext *cct, int flags = 0);
+void common_init_finish(CephContext *cct);
 
 #endif
diff --git a/src/common/config.cc b/src/common/config.cc
index c26b826..4ee54bb 100644
--- a/src/common/config.cc
+++ b/src/common/config.cc
@@ -811,6 +811,22 @@ int md_config_t::_get_val(const char *key, char **buf, int len) const
   return -ENOENT;
 }
 
+void md_config_t::get_all_keys(std::vector<std::string> *keys) const {
+  const std::string negative_flag_prefix("no_");
+
+  keys->clear();
+  keys->reserve(NUM_CONFIG_OPTIONS);
+  for (size_t i = 0; i < NUM_CONFIG_OPTIONS; ++i) {
+    keys->push_back(config_optionsp[i].name);
+    if (config_optionsp[i].type == OPT_BOOL) {
+      keys->push_back(negative_flag_prefix + config_optionsp[i].name);
+    }
+  }
+  for (int i = 0; i < subsys.get_num(); ++i) {
+    keys->push_back("debug_" + subsys.get_name(i));
+  }
+}
+
 /* The order of the sections here is important.  The first section in the
  * vector is the "highest priority" section; if we find it there, we'll stop
  * looking. The lowest priority section is the one we look in only if all
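
get_all_keys enumerates every settable name: one entry per option, a synthetic no_-prefixed negation for each boolean, and a debug_<subsys> key per logging subsystem. A short usage sketch, e.g. for command-line completion:

    std::vector<std::string> keys;
    g_ceph_context->_conf->get_all_keys(&keys);
    for (size_t i = 0; i < keys.size(); ++i)
      std::cout << keys[i] << "\n";  // includes e.g. "no_fuse_debug", "debug_mon"
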
diff --git a/src/common/config.h b/src/common/config.h
index 41c999d..262dc71 100644
--- a/src/common/config.h
+++ b/src/common/config.h
@@ -140,6 +140,8 @@ public:
   int get_val(const char *key, char **buf, int len) const;
   int _get_val(const char *key, char **buf, int len) const;
 
+  void get_all_keys(std::vector<std::string> *keys) const;
+
   // Return a list of all the sections that the current entity is a member of.
   void get_my_sections(std::vector <std::string> &sections) const;
 
diff --git a/src/common/config_opts.h b/src/common/config_opts.h
index ab52520..a19cc5d 100644
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -194,7 +194,8 @@ OPTION(mon_compact_on_trim, OPT_BOOL, true)       // compact (a prefix) when we
 OPTION(mon_osd_cache_size, OPT_INT, 10)  // the size of osdmaps cache, not to rely on underlying store's cache
 
 OPTION(mon_tick_interval, OPT_INT, 5)
-OPTION(mon_subscribe_interval, OPT_DOUBLE, 300)
+OPTION(mon_session_timeout, OPT_INT, 300)    // must send keepalive or subscribe
+OPTION(mon_subscribe_interval, OPT_DOUBLE, 24*3600)  // for legacy clients only
 OPTION(mon_delta_reset_interval, OPT_DOUBLE, 10)   // seconds of inactivity before we reset the pg delta to 0
 OPTION(mon_osd_laggy_halflife, OPT_INT, 60*60)        // (seconds) how quickly our laggy estimations decay
 OPTION(mon_osd_laggy_weight, OPT_DOUBLE, .3)          // weight for new samples in laggy estimations
@@ -230,6 +231,7 @@ OPTION(mon_pg_warn_max_per_osd, OPT_INT, 300)  // max # pgs per (in) osd before
 OPTION(mon_pg_warn_max_object_skew, OPT_FLOAT, 10.0) // max skew from average in objects per pg
 OPTION(mon_pg_warn_min_objects, OPT_INT, 10000)  // do not warn below this object #
 OPTION(mon_pg_warn_min_pool_objects, OPT_INT, 1000)  // do not warn on pools below this object #
+OPTION(mon_pg_check_down_all_threshold, OPT_FLOAT, .5) // threshold of down osds after which we check all pgs
 OPTION(mon_cache_target_full_warn_ratio, OPT_FLOAT, .66) // position between pool cache_target_full and max where we start warning
 OPTION(mon_osd_full_ratio, OPT_FLOAT, .95) // what % full makes an OSD "full"
 OPTION(mon_osd_nearfull_ratio, OPT_FLOAT, .85) // what % full makes an OSD near full
@@ -274,8 +276,7 @@ OPTION(mon_sync_debug_leader, OPT_INT, -1) // monitor to be used as the sync lea
 OPTION(mon_sync_debug_provider, OPT_INT, -1) // monitor to be used as the sync provider
 OPTION(mon_sync_debug_provider_fallback, OPT_INT, -1) // monitor to be used as fallback if sync provider fails
 OPTION(mon_inject_sync_get_chunk_delay, OPT_DOUBLE, 0)  // inject N second delay on each get_chunk request
-OPTION(mon_osd_min_down_reporters, OPT_INT, 1)   // number of OSDs who need to report a down OSD for it to count
-OPTION(mon_osd_min_down_reports, OPT_INT, 3)     // number of times a down OSD must be reported for it to count
+OPTION(mon_osd_min_down_reporters, OPT_INT, 2)   // number of OSDs who need to report a down OSD for it to count
 OPTION(mon_osd_force_trim_to, OPT_INT, 0)   // force mon to trim maps to this point, regardless of min_last_epoch_clean (dangerous, use with care)
 OPTION(mon_mds_force_trim_to, OPT_INT, 0)   // force mon to trim mdsmaps to this point (dangerous, use with care)
 
@@ -339,6 +340,8 @@ OPTION(client_readahead_max_bytes, OPT_LONGLONG, 0)  //8 * 1024*1024
 OPTION(client_readahead_max_periods, OPT_LONGLONG, 4)  // as multiple of file layout period (object size * num stripes)
 OPTION(client_snapdir, OPT_STR, ".snap")
 OPTION(client_mountpoint, OPT_STR, "/")
+OPTION(client_mount_uid, OPT_INT, -1)
+OPTION(client_mount_gid, OPT_INT, -1)
 OPTION(client_notify_timeout, OPT_INT, 10) // in seconds
 OPTION(osd_client_watch_timeout, OPT_INT, 30) // in seconds
 OPTION(client_caps_release_delay, OPT_INT, 5) // in seconds
@@ -354,6 +357,7 @@ OPTION(client_debug_inject_tick_delay, OPT_INT, 0) // delay the client tick for
 OPTION(client_max_inline_size, OPT_U64, 4096)
 OPTION(client_inject_release_failure, OPT_BOOL, false)  // synthetic client bug for testing
 OPTION(client_inject_fixed_oldest_tid, OPT_BOOL, false)  // synthetic client bug for testing
+
 // note: the max amount of "in flight" dirty data is roughly (max - target)
 OPTION(fuse_use_invalidate_cb, OPT_BOOL, false) // use fuse 2.8+ invalidate callback to keep page cache consistent
 OPTION(fuse_allow_other, OPT_BOOL, true)
@@ -363,6 +367,7 @@ OPTION(fuse_atomic_o_trunc, OPT_BOOL, true)
 OPTION(fuse_debug, OPT_BOOL, false)
 OPTION(fuse_multithreaded, OPT_BOOL, true)
 OPTION(fuse_require_active_mds, OPT_BOOL, true) // if ceph_fuse requires active mds server
+
 OPTION(client_try_dentry_invalidate, OPT_BOOL, true) // the client should try dentry invalidation instead of remounting, on kernels where it believes that will work
 OPTION(client_die_on_failed_remount, OPT_BOOL, true)
 OPTION(client_check_pool_perm, OPT_BOOL, true)
@@ -503,6 +508,8 @@ OPTION(mds_max_purge_ops_per_pg, OPT_FLOAT, 0.5)
 OPTION(mds_root_ino_uid, OPT_INT, 0) // The UID of / on new filesystems
 OPTION(mds_root_ino_gid, OPT_INT, 0) // The GID of / on new filesystems
 
+OPTION(mds_max_scrub_ops_in_progress, OPT_INT, 5) // the number of simultaneous scrubs allowed
+
 // If true, compact leveldb store on mount
 OPTION(osd_compact_leveldb_on_mount, OPT_BOOL, false)
 
@@ -593,6 +600,8 @@ OPTION(osd_tier_default_cache_hit_set_period, OPT_INT, 1200)
 OPTION(osd_tier_default_cache_hit_set_type, OPT_STR, "bloom")
 OPTION(osd_tier_default_cache_min_read_recency_for_promote, OPT_INT, 1) // number of recent HitSets the object must appear in to be promoted (on read)
 OPTION(osd_tier_default_cache_min_write_recency_for_promote, OPT_INT, 1) // number of recent HitSets the object must appear in to be promoted (on write)
+OPTION(osd_tier_default_cache_hit_set_grade_decay_rate, OPT_INT, 20)
+OPTION(osd_tier_default_cache_hit_set_search_last_n, OPT_INT, 1)
 
 OPTION(osd_map_dedup, OPT_BOOL, true)
 OPTION(osd_map_max_advance, OPT_INT, 150) // make this < cache_size!
@@ -613,6 +622,11 @@ OPTION(osd_recover_clone_overlap, OPT_BOOL, true)   // preserve clone_overlap du
 OPTION(osd_op_num_threads_per_shard, OPT_INT, 2)
 OPTION(osd_op_num_shards, OPT_INT, 5)
 
+// Set to true for testing.  Users should NOT set this.
+// If set to true, any error will be reported even after
+// enough shards have been read to decode the object.
+OPTION(osd_read_ec_check_for_errors, OPT_BOOL, false) // return error if any ec shard has an error
+
 // Only use clone_overlap for recovery if there are fewer than
 // osd_recover_clone_overlap_limit entries in the overlap set
 OPTION(osd_recover_clone_overlap_limit, OPT_INT, 10)
@@ -644,10 +658,13 @@ OPTION(osd_pg_max_concurrent_snap_trims, OPT_U64, 2)
 OPTION(osd_heartbeat_min_healthy_ratio, OPT_FLOAT, .33)
 
 OPTION(osd_mon_heartbeat_interval, OPT_INT, 30)  // (seconds) how often to ping monitor if no peers
-OPTION(osd_mon_report_interval_max, OPT_INT, 120)
+OPTION(osd_mon_report_interval_max, OPT_INT, 600)
 OPTION(osd_mon_report_interval_min, OPT_INT, 5)  // pg stats, failures, up_thru, boot.
+OPTION(osd_mon_report_max_in_flight, OPT_INT, 2)  // max updates in flight
 OPTION(osd_pg_stat_report_interval_max, OPT_INT, 500)  // report pg stats for any given pg at least this often
 OPTION(osd_mon_ack_timeout, OPT_INT, 30) // time out a mon if it doesn't ack stats
+OPTION(osd_stats_ack_timeout_factor, OPT_DOUBLE, 2.0) // multiples of mon_ack_timeout
+OPTION(osd_stats_ack_timeout_decay, OPT_DOUBLE, .9)
 OPTION(osd_default_data_pool_replay_window, OPT_INT, 45)
 OPTION(osd_preserve_trimmed_log, OPT_BOOL, false)
 OPTION(osd_auto_mark_unfound_lost, OPT_BOOL, false)
@@ -670,7 +687,10 @@ OPTION(osd_scrub_interval_randomize_ratio, OPT_FLOAT, 0.5) // randomize the sche
 OPTION(osd_scrub_chunk_min, OPT_INT, 5)
 OPTION(osd_scrub_chunk_max, OPT_INT, 25)
 OPTION(osd_scrub_sleep, OPT_FLOAT, 0)   // sleep between [deep]scrub ops
+OPTION(osd_scrub_auto_repair, OPT_BOOL, false)   // whether to auto-repair inconsistencies found during deep scrub
+OPTION(osd_scrub_auto_repair_num_errors, OPT_U32, 5)   // only auto-repair when number of errors is below this threshold
 OPTION(osd_deep_scrub_interval, OPT_FLOAT, 60*60*24*7) // once a week
+OPTION(osd_deep_scrub_randomize_ratio, OPT_FLOAT, 0.15) // scrubs will randomly become deep scrubs at this rate (0.15 -> 15% of scrubs are deep)
 OPTION(osd_deep_scrub_stride, OPT_INT, 524288)
 OPTION(osd_deep_scrub_update_digest_min_age, OPT_INT, 2*60*60)   // objects must be this old (seconds) before we update the whole-object digest on scrub
 OPTION(osd_scan_list_ping_tp_interval, OPT_U64, 100)
@@ -695,8 +715,6 @@ OPTION(osd_op_log_threshold, OPT_INT, 5) // how many op log messages to show in
 OPTION(osd_verify_sparse_read_holes, OPT_BOOL, false)  // read fiemap-reported holes and verify they are zeros
 OPTION(osd_debug_drop_ping_probability, OPT_DOUBLE, 0)
 OPTION(osd_debug_drop_ping_duration, OPT_INT, 0)
-OPTION(osd_debug_drop_pg_create_probability, OPT_DOUBLE, 0)
-OPTION(osd_debug_drop_pg_create_duration, OPT_INT, 1)
 OPTION(osd_debug_drop_op_probability, OPT_DOUBLE, 0)   // probability of stalling/dropping a client op
 OPTION(osd_debug_op_order, OPT_BOOL, false)
 OPTION(osd_debug_scrub_chance_rewrite_digest, OPT_U64, 0)
@@ -747,7 +765,7 @@ OPTION(keyvaluestore_rocksdb_options, OPT_STR, "")
 // rocksdb options that will be used for omap(if omap_backend is rocksdb)
 OPTION(filestore_rocksdb_options, OPT_STR, "")
 // rocksdb options that will be used in monstore
-OPTION(mon_rocksdb_options, OPT_STR, "")
+OPTION(mon_rocksdb_options, OPT_STR, "cache_size=536870912,write_buffer_size=33554432,block_size=65536,compression=kNoCompression")
 
 /**
  * osd_*_priority adjust the relative priority of client io, recovery io,
@@ -799,7 +817,7 @@ OPTION(memstore_page_size, OPT_U64, 64 << 10)
 OPTION(newstore_max_dir_size, OPT_U32, 1000000)
 OPTION(newstore_onode_map_size, OPT_U32, 1024)   // onodes per collection
 OPTION(newstore_backend, OPT_STR, "rocksdb")
-OPTION(newstore_backend_options, OPT_STR, "")
+OPTION(newstore_rocksdb_options, OPT_STR, "compression=kNoCompression,max_write_buffer_number=16,min_write_buffer_number_to_merge=6")
 OPTION(newstore_fail_eio, OPT_BOOL, true)
 OPTION(newstore_sync_io, OPT_BOOL, false)  // perform initial io synchronously
 OPTION(newstore_sync_transaction, OPT_BOOL, false)  // perform kv txn synchronously
@@ -821,7 +839,6 @@ OPTION(newstore_overlay_max_length, OPT_INT, 65536)
 OPTION(newstore_overlay_max, OPT_INT, 32)
 OPTION(newstore_open_by_handle, OPT_BOOL, true)
 OPTION(newstore_o_direct, OPT_BOOL, true)
-OPTION(newstore_db_path, OPT_STR, "")
 OPTION(newstore_aio, OPT_BOOL, true)
 OPTION(newstore_aio_poll_ms, OPT_INT, 250)  // milliseconds
 OPTION(newstore_aio_max_queue_depth, OPT_INT, 4096)
@@ -908,6 +925,8 @@ OPTION(filestore_update_to, OPT_INT, 1000)
 OPTION(filestore_blackhole, OPT_BOOL, false)     // drop any new transactions on the floor
 OPTION(filestore_fd_cache_size, OPT_INT, 128)    // FD lru size
 OPTION(filestore_fd_cache_shards, OPT_INT, 16)   // FD number of shards
+OPTION(filestore_ondisk_finisher_threads, OPT_INT, 1)
+OPTION(filestore_apply_finisher_threads, OPT_INT, 1)
 OPTION(filestore_dump_file, OPT_STR, "")         // file onto which store transaction dumps
 OPTION(filestore_kill_at, OPT_INT, 0)            // inject a failure at the n'th opportunity
 OPTION(filestore_inject_stall, OPT_INT, 0)       // artificially stall for N seconds in op queue thread
@@ -1117,6 +1136,8 @@ OPTION(rgw_replica_log_obj_prefix, OPT_STR, "replica_log") //
 OPTION(rgw_bucket_quota_ttl, OPT_INT, 600) // time for cached bucket stats to be cached within rgw instance
 OPTION(rgw_bucket_quota_soft_threshold, OPT_DOUBLE, 0.95) // threshold from which we don't rely on cached info for quota decisions
 OPTION(rgw_bucket_quota_cache_size, OPT_INT, 10000) // number of entries in bucket quota cache
+OPTION(rgw_bucket_default_quota_max_objects, OPT_INT, -1) // number of objects allowed
+OPTION(rgw_bucket_default_quota_max_size, OPT_LONGLONG, -1) // default max total size per bucket, in kB
 
 OPTION(rgw_expose_bucket, OPT_BOOL, false) // Return the bucket name in the 'Bucket' response header
 
@@ -1126,6 +1147,8 @@ OPTION(rgw_user_quota_bucket_sync_interval, OPT_INT, 180) // time period for acc
 OPTION(rgw_user_quota_sync_interval, OPT_INT, 3600 * 24) // time period for accumulating modified buckets before syncing entire user stats
 OPTION(rgw_user_quota_sync_idle_users, OPT_BOOL, false) // whether stats for idle users be fully synced
 OPTION(rgw_user_quota_sync_wait_time, OPT_INT, 3600 * 24) // min time between two full stats sync for non-idle users
+OPTION(rgw_user_default_quota_max_objects, OPT_INT, -1) // number of objects allowed
+OPTION(rgw_user_default_quota_max_size, OPT_LONGLONG, -1) // default max total size per user, in kB
 
 OPTION(rgw_multipart_min_part_size, OPT_INT, 5 * 1024 * 1024) // min size for each part (except for last one) in multipart upload
 OPTION(rgw_multipart_part_upload_limit, OPT_INT, 10000) // parts limit in multipart upload
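
Several defaults above changed in behavior-relevant ways (for example mon_osd_min_down_reporters went from 1 to 2, and osd_mon_report_interval_max from 120 to 600). A site that depends on the old values can pin them in ceph.conf; a hedged sketch:

    [global]
        ; restore the pre-change values shown in this diff (verify before use)
        mon osd min down reporters = 1
        osd mon report interval max = 120
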
diff --git a/src/common/obj_bencher.cc b/src/common/obj_bencher.cc
index 3772eb8..9ecfbec 100644
--- a/src/common/obj_bencher.cc
+++ b/src/common/obj_bencher.cc
@@ -52,7 +52,7 @@ static std::string generate_object_name(int objnum, int pid = 0)
   return oss.str();
 }
 
-static void sanitize_object_contents (bench_data *data, int length) {
+static void sanitize_object_contents (bench_data *data, size_t length) {
   memset(data->object_contents, 'z', length);
 }
 
@@ -204,7 +204,7 @@ void *ObjBencher::status_printer(void *_bencher) {
 
 int ObjBencher::aio_bench(
   int operation, int secondsToRun,
-  int concurrentios, int object_size, bool cleanup, const std::string& run_name, bool no_verify) {
+  int concurrentios, size_t object_size, bool cleanup, const std::string& run_name, bool no_verify) {
 
   if (concurrentios <= 0) 
     return -EINVAL;
@@ -322,11 +322,11 @@ static T vec_stddev(vector<T>& v)
   return sqrt(stddev);
 }
 
-int ObjBencher::fetch_bench_metadata(const std::string& metadata_file, int* object_size, int* num_objects, int* prevPid) {
+int ObjBencher::fetch_bench_metadata(const std::string& metadata_file, size_t* object_size, int* num_objects, int* prevPid) {
   int r = 0;
   bufferlist object_data;
 
-  r = sync_read(metadata_file, object_data, sizeof(int)*3);
+  r = sync_read(metadata_file, object_data, sizeof(int) * 2 + sizeof(size_t));
   if (r <= 0) {
     // treat an empty file as a file that does not exist
     if (r == 0) {
@@ -666,7 +666,8 @@ int ObjBencher::seq_read_bench(int seconds_to_run, int num_objects, int concurre
   
     if (!no_verify) {
       snprintf(data.object_contents, data.object_size, "I'm the %16dth object!", current_index);
-      if (memcmp(data.object_contents, cur_contents->c_str(), data.object_size) != 0) {
+      if ((cur_contents->length() != data.object_size) ||
+          (memcmp(data.object_contents, cur_contents->c_str(), data.object_size) != 0)) {
         cerr << name[slot] << " is not correct!" << std::endl;
         ++errors;
       }
@@ -676,13 +677,13 @@ int ObjBencher::seq_read_bench(int seconds_to_run, int num_objects, int concurre
     index[slot] = data.started;
     lock.Unlock();
     completion_wait(slot);
+    lock.Lock();
     r = completion_ret(slot);
     if (r < 0) {
       cerr << "read got " << r << std::endl;
       lock.Unlock();
       goto ERR;
     }
-    lock.Lock();
     total_latency += data.cur_latency;
     if (data.cur_latency > data.max_latency) data.max_latency = data.cur_latency;
     if (data.cur_latency < data.min_latency) data.min_latency = data.cur_latency;
@@ -728,7 +729,8 @@ int ObjBencher::seq_read_bench(int seconds_to_run, int num_objects, int concurre
     if (!no_verify) {
       snprintf(data.object_contents, data.object_size, "I'm the %16dth object!", index[slot]);
       lock.Unlock();
-      if (memcmp(data.object_contents, contents[slot]->c_str(), data.object_size) != 0) {
+      if ((contents[slot]->length() != data.object_size) ||
+          (memcmp(data.object_contents, contents[slot]->c_str(), data.object_size) != 0)) {
         cerr << name[slot] << " is not correct!" << std::endl;
         ++errors;
       }
@@ -899,7 +901,8 @@ int ObjBencher::rand_read_bench(int seconds_to_run, int num_objects, int concurr
     
     if (!no_verify) {
       snprintf(data.object_contents, data.object_size, "I'm the %16dth object!", current_index);
-      if (memcmp(data.object_contents, cur_contents->c_str(), data.object_size) != 0) {
+      if ((cur_contents->length() != data.object_size) || 
+          (memcmp(data.object_contents, cur_contents->c_str(), data.object_size) != 0)) {
         cerr << name[slot] << " is not correct!" << std::endl;
         ++errors;
       }
@@ -950,7 +953,8 @@ int ObjBencher::rand_read_bench(int seconds_to_run, int num_objects, int concurr
     if (!no_verify) {
       snprintf(data.object_contents, data.object_size, "I'm the %16dth object!", index[slot]);
       lock.Unlock();
-      if (memcmp(data.object_contents, contents[slot]->c_str(), data.object_size) != 0) {
+      if ((contents[slot]->length() != data.object_size) || 
+          (memcmp(data.object_contents, contents[slot]->c_str(), data.object_size) != 0)) {
         cerr << name[slot] << " is not correct!" << std::endl;
         ++errors;
       }
@@ -1010,7 +1014,7 @@ int ObjBencher::rand_read_bench(int seconds_to_run, int num_objects, int concurr
 
 int ObjBencher::clean_up(const std::string& prefix, int concurrentios, const std::string& run_name) {
   int r = 0;
-  int object_size;
+  size_t object_size;
   int num_objects;
   int prevPid;
 
diff --git a/src/common/obj_bencher.h b/src/common/obj_bencher.h
index 34e22c2..1c96815 100644
--- a/src/common/obj_bencher.h
+++ b/src/common/obj_bencher.h
@@ -36,7 +36,7 @@ struct bench_history {
 
 struct bench_data {
 bool done; // is the benchmark done
-  int object_size; //the size of the objects
+  size_t object_size; // the size of the objects
   // same as object_size for write tests
   int in_flight; //number of reads/writes being waited on
   int started;
@@ -71,7 +71,7 @@ protected:
 
   struct bench_data data;
 
-  int fetch_bench_metadata(const std::string& metadata_file, int* object_size, int* num_objects, int* prevPid);
+  int fetch_bench_metadata(const std::string& metadata_file, size_t* object_size, int* num_objects, int* prevPid);
 
   int write_bench(int secondsToRun, int concurrentios, const string& run_name_meta);
   int seq_read_bench(int secondsToRun, int num_objects, int concurrentios, int writePid, bool no_verify=false);
@@ -107,7 +107,7 @@ public:
   virtual ~ObjBencher() {}
   int aio_bench(
     int operation, int secondsToRun,
-    int concurrentios, int op_size, bool cleanup, const std::string& run_name, bool no_verify=false);
+    int concurrentios, size_t op_size, bool cleanup, const std::string& run_name, bool no_verify=false);
   int clean_up(const std::string& prefix, int concurrentios, const std::string& run_name);
 
   void set_show_time(bool dt) {
diff --git a/src/common/run_cmd.cc b/src/common/run_cmd.cc
index 5f5cc3c..81e4171 100644
--- a/src/common/run_cmd.cc
+++ b/src/common/run_cmd.cc
@@ -47,6 +47,9 @@ std::string run_cmd(const char *cmd, ...)
   }
   else if (fret == 0) {
     // execvp doesn't modify its arguments, so the const-cast here is safe.
+    close(STDIN_FILENO);
+    close(STDOUT_FILENO);
+    close(STDERR_FILENO);
     execvp(cmd, (char * const*)&arr[0]);
     _exit(127);
   }
diff --git a/src/common/sctp_crc32.c b/src/common/sctp_crc32.c
index 4acf529..2fa26af 100644
--- a/src/common/sctp_crc32.c
+++ b/src/common/sctp_crc32.c
@@ -30,8 +30,9 @@
 
 /* $KAME: sctp_crc32.c,v 1.12 2005/03/06 16:04:17 itojun Exp $	 */
 
-
+#ifdef HAVE_SYS_TYPES_H
 #include <sys/cdefs.h>
+#endif
 #if 0
 __FBSDID("$FreeBSD: src/sys/netinet/sctp_crc32.c,v 1.8 2007/05/08 17:01:10 rrs Exp $");
 
diff --git a/src/common/solaris_errno.cc b/src/common/solaris_errno.cc
new file mode 100644
index 0000000..0ce23c7
--- /dev/null
+++ b/src/common/solaris_errno.cc
@@ -0,0 +1,227 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <errno.h>
+#include "include/types.h"
+
+
+// converts from Linux errno values to host values
+__s32 ceph_to_host_errno(__s32 r) 
+{
+  if (r < -34) {
+    switch (r) {
+      case -35:
+        return -EDEADLK;
+      case -36:
+        return -ENAMETOOLONG;
+      case -37:
+        return -ENOLCK;
+      case -38:
+        return -ENOSYS;
+      case -39:
+        return -ENOTEMPTY;
+      case -40:
+        return -ELOOP;
+      case -42:
+        return -ENOMSG;
+      case -43:
+        return -EIDRM;
+      case -44:
+        return -ECHRNG;
+      case -45:
+        return -EL2NSYNC;
+      case -46:
+        return -EL3HLT;
+      case -47:
+        return -EL3RST;
+      case -48:
+        return -ELNRNG;
+      case -49:
+        return -EUNATCH;
+      case -50:
+        return -ENOCSI;
+      case -51:
+        return -EL2HLT;
+      case -52:
+        return -EBADE;
+      case -53:
+        return -EBADR;
+      case -54:
+        return -EXFULL;
+      case -55:
+        return -ENOANO;
+      case -56:
+        return -EBADRQC;
+      case -57:
+        return -EBADSLT;
+      case -59:
+        return -EBFONT;
+      case -60:
+        return -ENOSTR;
+      case -61:
+        return -ENODATA;
+      case -62:
+        return -ETIME;
+      case -63:
+        return -ENOSR;
+      //case -64:
+      //  return -EPERM; //TODO ENONET
+      //case -65:
+      //  return -EPERM; //TODO ENOPKG
+      //case -66:
+      //  return -EREMOTE;
+      //case -67:
+      //  return -ENOLINK;
+      //case -68:
+      //  return -EPERM; //TODO EADV 
+      //case -69:
+      //  return -EPERM; //TODO ESRMNT 
+      //case -70:
+      //  return -EPERM; //TODO ECOMM
+      case -71:
+        return -EPROTO;
+      case -72:
+        return -EMULTIHOP;
+      case -73:
+        return -EPERM; //TODO EDOTDOT 
+      case -74:
+        return -EBADMSG;
+      case -75:
+        return -EOVERFLOW;
+      case -76:
+        return -ENOTUNIQ;
+      case -77:
+        return -EBADFD;
+      case -78:
+        return -EREMCHG;
+      case -79:
+        return -ELIBACC;
+      case -80:
+        return -ELIBBAD;
+      case -81:
+        return -ELIBSCN;
+      case -82:
+        return -ELIBMAX;
+      case -83:
+	return -ELIBEXEC;
+      case -84:
+        return -EILSEQ;
+      case -85:
+        return -ERESTART;
+      case -86:
+        return -ESTRPIPE; 
+      case -87:
+        return -EUSERS;
+      case -88:
+        return -ENOTSOCK;
+      case -89:
+        return -EDESTADDRREQ;
+      case -90:
+        return -EMSGSIZE;
+      case -91:
+        return -EPROTOTYPE;
+      case -92:
+        return -ENOPROTOOPT;
+      case -93:
+        return -EPROTONOSUPPORT;
+      case -94:
+        return -ESOCKTNOSUPPORT;
+      case -95:
+        return -EOPNOTSUPP;
+      case -96:
+        return -EPFNOSUPPORT;
+      case -97:
+        return -EAFNOSUPPORT;
+      case -98:
+        return -EADDRINUSE;
+      case -99:
+        return -EADDRNOTAVAIL;
+      case -100:
+        return -ENETDOWN;
+      case -101:
+        return -ENETUNREACH;
+      case -102:
+        return -ENETRESET;
+      case -103:
+        return -ECONNABORTED;
+      case -104:
+        return -ECONNRESET;
+      case -105:
+        return -ENOBUFS;
+      case -106:
+        return -EISCONN;
+      case -107:
+        return -ENOTCONN;
+      case -108:
+        return -ESHUTDOWN;
+      case -109:
+        return -ETOOMANYREFS;
+      case -110:
+        return -ETIMEDOUT;
+      case -111:
+        return -ECONNREFUSED;
+      case -112:
+        return -EHOSTDOWN;
+      case -113:
+        return -EHOSTUNREACH;
+      case -114:
+        return -EALREADY;
+      case -115:
+        return -EINPROGRESS;
+      case -116:
+        return -ESTALE;
+      case -117:
+        return -EPERM; //TODO EUCLEAN 
+      case -118:
+        return -EPERM; //TODO ENOTNAM
+      case -119:
+        return -EPERM; //TODO ENAVAIL
+      case -120:
+        return -EPERM; //TODO EISNAM
+      case -121:
+        return -EPERM; //TODO EREMOTEIO
+      case -122:
+        return -EDQUOT;
+      case -123:
+        return -EPERM; //TODO ENOMEDIUM
+      case -124:
+        return -EPERM; //TODO EMEDIUMTYPE - not used
+      case -125:
+        return -ECANCELED;
+      case -126:
+        return -EPERM; //TODO ENOKEY
+      case -127:
+        return -EPERM; //TODO EKEYEXPIRED
+      case -128:
+        return -EPERM; //TODO EKEYREVOKED
+      case -129:
+        return -EPERM; //TODO EKEYREJECTED
+      case -130:
+        return -EOWNERDEAD;
+      case -131:
+        return -ENOTRECOVERABLE;
+      case -132:
+        return -EPERM; //TODO ERFKILL
+      case -133:
+        return -EPERM; //TODO EHWPOISON
+
+      default: { 
+        break;
+      }
+    }
+  } 
+  return r; // otherwise return original value
+}
+
+
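
ceph_to_host_errno translates errno values that cross the wire with Linux numbering into the local (Solaris) constants; values at or above -34 agree across the platforms and pass through unchanged. A small sketch:

    __s32 wire_err = -108;                          // Linux ESHUTDOWN
    __s32 host_err = ceph_to_host_errno(wire_err);  // -ESHUTDOWN in host numbering
    __s32 common   = ceph_to_host_errno(-2);        // -ENOENT: shared range, unchanged
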
diff --git a/src/crush/CrushTester.cc b/src/crush/CrushTester.cc
index 0863833..11b536c 100644
--- a/src/crush/CrushTester.cc
+++ b/src/crush/CrushTester.cc
@@ -381,10 +381,10 @@ int CrushTester::test_with_crushtool(const char *crushtool_cmd,
 
   bufferlist bl;
   ::encode(crush, bl);
-  bl.write_fd(crushtool.stdin());
+  bl.write_fd(crushtool.get_stdin());
   crushtool.close_stdin();
   bl.clear();
-  ret = bl.read_fd(crushtool.stderr(), 100 * 1024);
+  ret = bl.read_fd(crushtool.get_stderr(), 100 * 1024);
   if (ret < 0) {
     err << "failed read from crushtool: " << cpp_strerror(-ret);
     return ret;
diff --git a/src/crush/mapper.c b/src/crush/mapper.c
index 393bfb2..412e7e8 100644
--- a/src/crush/mapper.c
+++ b/src/crush/mapper.c
@@ -835,7 +835,8 @@ int crush_do_rule(const struct crush_map *map,
 		case CRUSH_RULE_TAKE:
 			if ((curstep->arg1 >= 0 &&
 			     curstep->arg1 < map->max_devices) ||
-			    (-1-curstep->arg1 < map->max_buckets &&
+			    (-1-curstep->arg1 >= 0 &&
+			     -1-curstep->arg1 < map->max_buckets &&
 			     map->buckets[-1-curstep->arg1])) {
 				w[0] = curstep->arg1;
 				wsize = 1;
@@ -888,6 +889,7 @@ int crush_do_rule(const struct crush_map *map,
 			osize = 0;
 
 			for (i = 0; i < wsize; i++) {
+				int bno;
 				/*
 				 * see CRUSH_N, CRUSH_N_MINUS macros.
 				 * basically, numrep <= 0 means relative to
@@ -900,6 +902,13 @@ int crush_do_rule(const struct crush_map *map,
 						continue;
 				}
 				j = 0;
+				/* make sure bucket id is valid */
+				bno = -1 - w[i];
+				if (bno < 0 || bno >= map->max_buckets) {
+					// w[i] is probably CRUSH_ITEM_NONE
+					dprintk("  bad w[i] %d\n", w[i]);
+					continue;
+				}
 				if (firstn) {
 					int recurse_tries;
 					if (choose_leaf_tries)
@@ -911,7 +920,7 @@ int crush_do_rule(const struct crush_map *map,
 						recurse_tries = choose_tries;
 					osize += crush_choose_firstn(
 						map,
-						map->buckets[-1-w[i]],
+						map->buckets[bno],
 						weight, weight_max,
 						x, numrep,
 						curstep->arg2,
@@ -930,7 +939,7 @@ int crush_do_rule(const struct crush_map *map,
 						    numrep : (result_max-osize));
 					crush_choose_indep(
 						map,
-						map->buckets[-1-w[i]],
+						map->buckets[bno],
 						weight, weight_max,
 						x, out_size, numrep,
 						curstep->arg2,
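
The new bno check closes an out-of-bounds read: bucket ids are stored negated (bucket b lives at map->buckets[-1-b]), and a prior choose step can leave CRUSH_ITEM_NONE in the working vector. A hedged illustration, with the constant as defined in crush.h:

    int w_i = 0x7fffffff;   /* CRUSH_ITEM_NONE left in w[] by a failed choose */
    int bno = -1 - w_i;     /* == INT_MIN: negative, so the guard skips it
                               instead of indexing map->buckets[] */
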
diff --git a/src/erasure-code/isa/Makefile.am b/src/erasure-code/isa/Makefile.am
index 67725dd..7b60562 100644
--- a/src/erasure-code/isa/Makefile.am
+++ b/src/erasure-code/isa/Makefile.am
@@ -67,7 +67,7 @@ libec_isa_la_CXXFLAGS = ${AM_CXXFLAGS} -I $(srcdir)/erasure-code/isa/isa-l/inclu
 libec_isa_la_CCASFLAGS = ${AM_CCASFLAGS} -I $(abs_srcdir)/erasure-code/isa/isa-l/include/
 
 libec_isa_la_LIBADD = $(LIBCRUSH) $(PTHREAD_LIBS) $(EXTRALIBS)
-libec_isa_la_LDFLAGS = ${AM_LDFLAGS} -version-info 2:14:0
+libec_isa_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared
 if LINUX
 libec_isa_la_LDFLAGS += -export-symbols-regex '.*__erasure_code_.*'
 endif
diff --git a/src/erasure-code/jerasure/Makefile.am b/src/erasure-code/jerasure/Makefile.am
index 9ca4fbb..adcb95d 100644
--- a/src/erasure-code/jerasure/Makefile.am
+++ b/src/erasure-code/jerasure/Makefile.am
@@ -48,7 +48,7 @@ libec_jerasure_generic_la_CXXFLAGS= ${AM_CXXFLAGS} \
 	-I$(srcdir)/erasure-code/jerasure/gf-complete/include \
 	-I$(srcdir)/erasure-code/jerasure/jerasure/include
 libec_jerasure_generic_la_LIBADD = $(LIBCRUSH) $(PTHREAD_LIBS) $(EXTRALIBS)
-libec_jerasure_generic_la_LDFLAGS = ${AM_LDFLAGS} -version-info 2:0:0
+libec_jerasure_generic_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared
 if LINUX
 libec_jerasure_generic_la_LDFLAGS += -export-symbols-regex '.*__erasure_code_.*'
 endif
@@ -70,7 +70,7 @@ libec_jerasure_neon_la_CXXFLAGS= ${AM_CXXFLAGS} \
 	-I$(srcdir)/erasure-code/jerasure/gf-complete/include \
 	-I$(srcdir)/erasure-code/jerasure/jerasure/include
 libec_jerasure_neon_la_LIBADD = $(LIBCRUSH) $(PTHREAD_LIBS) $(EXTRALIBS)
-libec_jerasure_neon_la_LDFLAGS = ${AM_LDFLAGS} -version-info 2:0:0
+libec_jerasure_neon_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared
 if LINUX
 libec_jerasure_neon_la_LDFLAGS += -export-symbols-regex '.*__erasure_code_.*'
 endif
@@ -95,7 +95,7 @@ libec_jerasure_sse3_la_CXXFLAGS= ${AM_CXXFLAGS} \
 	-I$(srcdir)/erasure-code/jerasure/gf-complete/include \
 	-I$(srcdir)/erasure-code/jerasure/jerasure/include
 libec_jerasure_sse3_la_LIBADD = $(LIBCRUSH) $(PTHREAD_LIBS) $(EXTRALIBS)
-libec_jerasure_sse3_la_LDFLAGS = ${AM_LDFLAGS} -version-info 2:0:0
+libec_jerasure_sse3_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared
 if LINUX
 libec_jerasure_sse3_la_LDFLAGS += -export-symbols-regex '.*__erasure_code_.*'
 endif
@@ -124,7 +124,7 @@ libec_jerasure_sse4_la_CXXFLAGS= ${AM_CXXFLAGS} \
 	-I$(srcdir)/erasure-code/jerasure/gf-complete/include \
 	-I$(srcdir)/erasure-code/jerasure/jerasure/include
 libec_jerasure_sse4_la_LIBADD = $(LIBCRUSH) $(PTHREAD_LIBS) $(EXTRALIBS)
-libec_jerasure_sse4_la_LDFLAGS = ${AM_LDFLAGS} -version-info 2:0:0
+libec_jerasure_sse4_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared
 if LINUX
 libec_jerasure_sse4_la_LDFLAGS += -export-symbols-regex '.*__erasure_code_.*'
 endif
@@ -138,7 +138,7 @@ libec_jerasure_la_SOURCES = \
 libec_jerasure_la_CFLAGS = ${AM_CFLAGS}
 libec_jerasure_la_CXXFLAGS= ${AM_CXXFLAGS}
 libec_jerasure_la_LIBADD = $(LIBCRUSH) $(PTHREAD_LIBS) $(EXTRALIBS)
-libec_jerasure_la_LDFLAGS = ${AM_LDFLAGS} -version-info 2:0:0
+libec_jerasure_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared
 if LINUX
 libec_jerasure_la_LDFLAGS += -export-symbols-regex '.*__erasure_code_.*'
 endif
diff --git a/src/erasure-code/lrc/Makefile.am b/src/erasure-code/lrc/Makefile.am
index c6547a3..44c47d5 100644
--- a/src/erasure-code/lrc/Makefile.am
+++ b/src/erasure-code/lrc/Makefile.am
@@ -13,7 +13,7 @@ libec_lrc_la_SOURCES = ${lrc_sources} common/str_map.cc
 libec_lrc_la_CFLAGS = ${AM_CFLAGS}
 libec_lrc_la_CXXFLAGS= ${AM_CXXFLAGS}
 libec_lrc_la_LIBADD = $(LIBCRUSH) $(PTHREAD_LIBS) $(LIBJSON_SPIRIT)
-libec_lrc_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0
+libec_lrc_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared
 if LINUX
 libec_lrc_la_LDFLAGS += -export-symbols-regex '.*__erasure_code_.*'
 endif
diff --git a/src/erasure-code/shec/Makefile.am b/src/erasure-code/shec/Makefile.am
index 1449a41..6b658d5 100644
--- a/src/erasure-code/shec/Makefile.am
+++ b/src/erasure-code/shec/Makefile.am
@@ -50,7 +50,7 @@ libec_shec_generic_la_CXXFLAGS= ${AM_CXXFLAGS} \
 	-I$(srcdir)/erasure-code/jerasure \
 	-I$(srcdir)/erasure-code/shec
 libec_shec_generic_la_LIBADD = $(LIBCRUSH) $(PTHREAD_LIBS) $(EXTRALIBS)
-libec_shec_generic_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0
+libec_shec_generic_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared
 if LINUX
 libec_shec_generic_la_LDFLAGS += -export-symbols-regex '.*__erasure_code_.*'
 endif
@@ -76,7 +76,7 @@ libec_shec_neon_la_CXXFLAGS= ${AM_CXXFLAGS} \
 	-I$(srcdir)/erasure-code/jerasure \
 	-I$(srcdir)/erasure-code/shec
 libec_shec_neon_la_LIBADD = $(LIBCRUSH) $(PTHREAD_LIBS) $(EXTRALIBS)
-libec_shec_neon_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0
+libec_shec_neon_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared
 if LINUX
 libec_shec_neon_la_LDFLAGS += -export-symbols-regex '.*__erasure_code_.*'
 endif
@@ -105,7 +105,7 @@ libec_shec_sse3_la_CXXFLAGS= ${AM_CXXFLAGS} \
 	-I$(srcdir)/erasure-code/jerasure \
 	-I$(srcdir)/erasure-code/shec
 libec_shec_sse3_la_LIBADD = $(LIBCRUSH) $(PTHREAD_LIBS) $(EXTRALIBS)
-libec_shec_sse3_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0
+libec_shec_sse3_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared
 if LINUX
 libec_shec_sse3_la_LDFLAGS += -export-symbols-regex '.*__erasure_code_.*'
 endif
@@ -138,7 +138,7 @@ libec_shec_sse4_la_CXXFLAGS= ${AM_CXXFLAGS} \
 	-I$(srcdir)/erasure-code/jerasure \
 	-I$(srcdir)/erasure-code/shec
 libec_shec_sse4_la_LIBADD = $(LIBCRUSH) $(PTHREAD_LIBS) $(EXTRALIBS)
-libec_shec_sse4_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0
+libec_shec_sse4_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared
 if LINUX
 libec_shec_sse4_la_LDFLAGS += -export-symbols-regex '.*__erasure_code_.*'
 endif
@@ -152,7 +152,7 @@ libec_shec_la_SOURCES = \
 libec_shec_la_CFLAGS = ${AM_CFLAGS}
 libec_shec_la_CXXFLAGS= ${AM_CXXFLAGS}
 libec_shec_la_LIBADD = $(LIBCRUSH) $(PTHREAD_LIBS) $(EXTRALIBS)
-libec_shec_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0
+libec_shec_la_LDFLAGS = ${AM_LDFLAGS} -module -avoid-version -shared
 if LINUX
 libec_shec_la_LDFLAGS += -export-symbols-regex '.*__erasure_code_.*'
 endif
diff --git a/src/rbdmap b/src/etc-rbdmap
similarity index 100%
copy from src/rbdmap
copy to src/etc-rbdmap
diff --git a/src/global/Makefile.am b/src/global/Makefile.am
index 79a7fff..4738b37 100644
--- a/src/global/Makefile.am
+++ b/src/global/Makefile.am
@@ -2,7 +2,8 @@ libglobal_la_SOURCES = \
 	global/global_context.cc \
 	global/global_init.cc \
 	global/pidfile.cc \
-	global/signal_handler.cc
+	global/signal_handler.cc \
+	common/TrackedOp.cc
 libglobal_la_LIBADD = $(LIBCOMMON)
 noinst_LTLIBRARIES += libglobal.la
 
diff --git a/src/global/global_init.cc b/src/global/global_init.cc
index ed5d186..609c7ea 100644
--- a/src/global/global_init.cc
+++ b/src/global/global_init.cc
@@ -257,7 +257,7 @@ static void pidfile_remove_void(void)
   pidfile_remove();
 }
 
-int global_init_prefork(CephContext *cct, int flags)
+int global_init_prefork(CephContext *cct)
 {
   if (g_code_env != CODE_ENVIRONMENT_DAEMON)
     return -1;
@@ -279,9 +279,9 @@ int global_init_prefork(CephContext *cct, int flags)
   return 0;
 }
 
-void global_init_daemonize(CephContext *cct, int flags)
+void global_init_daemonize(CephContext *cct)
 {
-  if (global_init_prefork(cct, flags) < 0)
+  if (global_init_prefork(cct) < 0)
     return;
 
   int ret = daemon(1, 1);
@@ -293,7 +293,7 @@ void global_init_daemonize(CephContext *cct, int flags)
   }
 
   global_init_postfork_start(cct);
-  global_init_postfork_finish(cct, flags);
+  global_init_postfork_finish(cct);
 }
 
 void global_init_postfork_start(CephContext *cct)
@@ -332,13 +332,13 @@ void global_init_postfork_start(CephContext *cct)
   pidfile_write(g_conf);
 }
 
-void global_init_postfork_finish(CephContext *cct, int flags)
+void global_init_postfork_finish(CephContext *cct)
 {
   /* We only close stderr once the caller decides the daemonization
    * process is finished.  This way we can allow error messages to be
    * propagated in a manner that the user is able to see.
    */
-  if (!(flags & CINIT_FLAG_NO_CLOSE_STDERR)) {
+  if (!(cct->get_init_flags() & CINIT_FLAG_NO_CLOSE_STDERR)) {
     int ret = global_init_shutdown_stderr(cct);
     if (ret) {
       derr << "global_init_daemonize: global_init_shutdown_stderr failed with "
diff --git a/src/global/global_init.h b/src/global/global_init.h
index 54c8d3d..5e934a7 100644
--- a/src/global/global_init.h
+++ b/src/global/global_init.h
@@ -46,7 +46,7 @@ void global_pre_init(std::vector < const char * > *alt_def_args,
  * to actually forking (via daemon(3)).  return 0 if we are going to proceed
  * with the fork, or -1 otherwise.
  */
-int global_init_prefork(CephContext *cct, int flags);
+int global_init_prefork(CephContext *cct);
 
 /*
  * perform all the steps that global_init_daemonize performs just after
@@ -57,7 +57,7 @@ void global_init_postfork_start(CephContext *cct);
 /*
  * close stderr, thus completing the postfork.
  */
-void global_init_postfork_finish(CephContext *cct, int flags);
+void global_init_postfork_finish(CephContext *cct);
 
 
 /*
@@ -67,7 +67,7 @@ void global_init_postfork_finish(CephContext *cct, int flags);
  * Note that this is equivalent to calling _prefork(), daemon(), and
  * _postfork.
  */
-void global_init_daemonize(CephContext *cct, int flags);
+void global_init_daemonize(CephContext *cct);
 
 /*
  * global_init_chdir changes the process directory.
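
For reference, with the flags argument dropped, a caller that splits the fork itself follows the sequence below; a minimal sketch, mirroring the global_init_daemonize() body above and assuming cct is a fully initialized CephContext whose init flags were passed at global_init() time:

    // Sketch only: equivalent to global_init_daemonize(cct) --
    // _prefork(), daemon(3), then the two _postfork steps.
    if (global_init_prefork(cct) >= 0) {
      if (daemon(1, 1) < 0) {
        exit(1);
      }
      global_init_postfork_start(cct);
      // stderr is closed here unless CINIT_FLAG_NO_CLOSE_STDERR was set
      // on the context (now read via cct->get_init_flags()).
      global_init_postfork_finish(cct);
    }
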
diff --git a/src/global/signal_handler.cc b/src/global/signal_handler.cc
index edca694..9e699ad 100644
--- a/src/global/signal_handler.cc
+++ b/src/global/signal_handler.cc
@@ -40,9 +40,17 @@ void install_sighandler(int signum, signal_handler_t handler, int flags)
   ret = sigaction(signum, &act, &oldact);
   if (ret != 0) {
     char buf[1024];
+#if defined(__sun) 
+    char message[SIG2STR_MAX];
+    sig2str(signum,message);
+    snprintf(buf, sizeof(buf), "install_sighandler: sigaction returned "
+	    "%d when trying to install a signal handler for %s\n",
+	     ret, message);
+#else
     snprintf(buf, sizeof(buf), "install_sighandler: sigaction returned "
 	    "%d when trying to install a signal handler for %s\n",
 	     ret, sys_siglist[signum]);
+#endif
     dout_emergency(buf);
     exit(1);
   }
@@ -79,8 +87,15 @@ static void handle_fatal_signal(int signum)
   // case, SA_RESETHAND specifies that the default signal handler--
   // presumably dump core-- will handle it.
   char buf[1024];
+#if defined(__sun)
+  char message[SIG2STR_MAX];
+  sig2str(signum,message);
+  snprintf(buf, sizeof(buf), "*** Caught signal (%s) **\n "
+	    "in thread %llx\n", message, (unsigned long long)pthread_self());
+#else
   snprintf(buf, sizeof(buf), "*** Caught signal (%s) **\n "
 	    "in thread %llx\n", sys_siglist[signum], (unsigned long long)pthread_self());
+#endif
   dout_emergency(buf);
   pidfile_remove();
 
diff --git a/src/include/assert.h b/src/include/assert.h
index ec0aa34..89e8753 100644
--- a/src/include/assert.h
+++ b/src/include/assert.h
@@ -3,9 +3,17 @@
 
 #if defined(__linux__)
 #include <features.h>
+
+#ifndef __STRING
+# define __STRING(x) #x
+#endif
+
 #elif defined(__FreeBSD__)
 #include <sys/cdefs.h>
 #define	__GNUC_PREREQ(minor, major)	__GNUC_PREREQ__(minor, major)
+#elif defined(__sun)
+#include "include/compat.h"
+#include <assert.h>
 #endif
 
 #ifdef __CEPH__
diff --git a/src/include/buffer.h b/src/include/buffer.h
index f28bc5e..5f90d7b 100644
--- a/src/include/buffer.h
+++ b/src/include/buffer.h
@@ -434,7 +434,7 @@ public:
     void zero();
     void zero(unsigned o, unsigned l);
 
-    bool is_contiguous();
+    bool is_contiguous() const;
     void rebuild();
     void rebuild(ptr& nb);
     void rebuild_aligned(unsigned align);
diff --git a/src/include/byteorder.h b/src/include/byteorder.h
index 191d242..d6ff8b6 100644
--- a/src/include/byteorder.h
+++ b/src/include/byteorder.h
@@ -34,6 +34,17 @@
 # endif
 #endif
 
+#if defined(__sun) && defined(__SVR4)
+# if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#  define CEPH_BIG_ENDIAN
+# elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#  define CEPH_LITTLE_ENDIAN
+# endif
+#endif
+
+
+
+
 static __inline__ __u16 swab16(__u16 val) 
 {
   return (val >> 8) | (val << 8);
diff --git a/src/include/ceph_features.h b/src/include/ceph_features.h
old mode 100644
new mode 100755
index 4857b0a..4e15563
--- a/src/include/ceph_features.h
+++ b/src/include/ceph_features.h
@@ -69,6 +69,9 @@
 #define CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3 (1ULL<<53)
 #define CEPH_FEATURE_OSD_HITSET_GMT (1ULL<<54)
 #define CEPH_FEATURE_HAMMER_0_94_4 (1ULL<<55)
+#define CEPH_FEATURE_NEW_OSDOP_ENCODING   (1ULL<<56) /* New, v7 encoding */
+#define CEPH_FEATURE_MON_STATEFUL_SUB (1ULL<<57) /* stateful mon subscription */
+#define CEPH_FEATURE_MON_ROUTE_OSDMAP (1ULL<<57) /* peon sends osdmaps */
 
 #define CEPH_FEATURE_RESERVED2 (1ULL<<61)  /* slow down, we are almost out... */
 #define CEPH_FEATURE_RESERVED  (1ULL<<62)  /* DO NOT USE THIS ... last bit! */
@@ -131,6 +134,7 @@ static inline unsigned long long ceph_sanitize_features(unsigned long long f) {
 	 CEPH_FEATURE_OSD_HBMSGS |		\
 	 CEPH_FEATURE_MDSENC |			\
 	 CEPH_FEATURE_OSDHASHPSPOOL |       \
+	 CEPH_FEATURE_NEW_OSDOP_ENCODING |        \
 	 CEPH_FEATURE_MON_SINGLE_PAXOS |    \
 	 CEPH_FEATURE_OSD_SNAPMAPPER |	    \
 	 CEPH_FEATURE_MON_SCRUB	|	    \
@@ -160,6 +164,8 @@ static inline unsigned long long ceph_sanitize_features(unsigned long long f) {
          CEPH_FEATURE_OSD_PROXY_WRITE_FEATURES |         \
 	 CEPH_FEATURE_OSD_HITSET_GMT |			 \
 	 CEPH_FEATURE_HAMMER_0_94_4 |		 \
+	 CEPH_FEATURE_MON_STATEFUL_SUB |	 \
+	 CEPH_FEATURE_MON_ROUTE_OSDMAP |	 \
 	 0ULL)
 
 #define CEPH_FEATURES_SUPPORTED_DEFAULT  CEPH_FEATURES_ALL
diff --git a/src/include/ceph_fs.h b/src/include/ceph_fs.h
index 08ef460..1dd333e 100644
--- a/src/include/ceph_fs.h
+++ b/src/include/ceph_fs.h
@@ -351,7 +351,8 @@ enum {
 	CEPH_MDS_OP_FRAGMENTDIR= 0x01500,
 	CEPH_MDS_OP_EXPORTDIR  = 0x01501,
 	CEPH_MDS_OP_VALIDATE   = 0x01502,
-	CEPH_MDS_OP_FLUSH      = 0x01503
+	CEPH_MDS_OP_FLUSH      = 0x01503,
+	CEPH_MDS_OP_ENQUEUE_SCRUB = 0x01504
 };
 
 extern const char *ceph_mds_op_name(int op);
diff --git a/src/include/cephfs/libcephfs.h b/src/include/cephfs/libcephfs.h
index 2083093..acdfa13 100644
--- a/src/include/cephfs/libcephfs.h
+++ b/src/include/cephfs/libcephfs.h
@@ -15,6 +15,7 @@
 #ifndef CEPH_LIB_H
 #define CEPH_LIB_H
 
+#include <features.h>
 #include <utime.h>
 #include <sys/stat.h>
 #include <sys/types.h>
@@ -23,12 +24,6 @@
 #include <stdint.h>
 #include <stdbool.h>
 
-// FreeBSD compatibility
-#if defined(__FreeBSD__) || defined(__APPLE__)
-typedef off_t loff_t;
-typedef off_t off64_t;
-#endif
-
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -41,11 +36,10 @@ extern "C" {
 #define LIBCEPHFS_VERSION_CODE LIBCEPHFS_VERSION(LIBCEPHFS_VER_MAJOR, LIBCEPHFS_VER_MINOR, LIBCEPHFS_VER_EXTRA)
 
 /*
- * On FreeBSD and Apple the offset is 64 bit, but libc doesn't announce it in
- * the way glibc does.
+ * If using glibc, check that the file offset is 64-bit.
  */
-#if !defined(__FreeBSD__) && !defined(__APPLE__) && !defined(__USE_FILE_OFFSET64)
-# error libceph: must define __USE_FILE_OFFSET64 or readdir results will be corrupted
+#if defined(__GLIBC__) && !defined(__USE_FILE_OFFSET64)
+# error libceph: glibc must define __USE_FILE_OFFSET64 or readdir results will be corrupted
 #endif
 
 /*
@@ -1387,8 +1381,8 @@ int ceph_ll_setattr(struct ceph_mount_info *cmount, struct Inode *in,
 		    struct stat *st, int mask, int uid, int gid);
 int ceph_ll_open(struct ceph_mount_info *cmount, struct Inode *in, int flags,
 		 struct Fh **fh, int uid, int gid);
-loff_t ceph_ll_lseek(struct ceph_mount_info *cmount, struct Fh* filehandle,
-		     loff_t offset, int whence);
+off_t ceph_ll_lseek(struct ceph_mount_info *cmount, struct Fh* filehandle,
+		     off_t offset, int whence);
 int ceph_ll_read(struct ceph_mount_info *cmount, struct Fh* filehandle,
 		 int64_t off, uint64_t len, char* buf);
 int ceph_ll_fsync(struct ceph_mount_info *cmount, struct Fh *fh,
diff --git a/src/include/compat.h b/src/include/compat.h
index 885b9c1..dcb5f6f 100644
--- a/src/include/compat.h
+++ b/src/include/compat.h
@@ -34,7 +34,7 @@
 
 #ifndef TEMP_FAILURE_RETRY
 #define TEMP_FAILURE_RETRY(expression) ({     \
-  typeof(expression) __result;                \
+  __typeof(expression) __result;              \
   do {                                        \
     __result = (expression);                  \
   } while (__result == -1 && errno == EINTR); \
@@ -53,4 +53,11 @@
 #define lseek64(fd, offset, whence) lseek(fd, offset, whence)
 #endif
 
+#if defined(__sun)
+#define LOG_AUTHPRIV    (10<<3)
+#define LOG_FTP         (11<<3)
+#define __STRING(x)     "x"
+#define IFTODT(mode)   (((mode) & 0170000) >> 12)
+#endif
+
 #endif /* !CEPH_COMPAT_H */
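
For reference, TEMP_FAILURE_RETRY restarts a syscall that fails with EINTR; a minimal usage sketch (fd and buf are hypothetical):

    // Retries read(2) while it is interrupted by a signal; __typeof
    // preserves the expression's result type.
    ssize_t n = TEMP_FAILURE_RETRY(read(fd, buf, sizeof(buf)));
    if (n < 0) { /* a real error, not EINTR */ }
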
diff --git a/src/include/encoding.h b/src/include/encoding.h
index 6fa12f9..3ba5327 100644
--- a/src/include/encoding.h
+++ b/src/include/encoding.h
@@ -62,7 +62,9 @@ inline void decode_raw(T& t, bufferlist::iterator &p)
   inline void decode(type &v, bufferlist::iterator& p) { __ASSERT_FUNCTION decode_raw(v, p); }
 
 WRITE_RAW_ENCODER(__u8)
+#ifndef _CHAR_IS_SIGNED
 WRITE_RAW_ENCODER(__s8)
+#endif
 WRITE_RAW_ENCODER(char)
 WRITE_RAW_ENCODER(ceph_le64)
 WRITE_RAW_ENCODER(ceph_le32)
diff --git a/src/include/rados.h b/src/include/rados.h
index 59d3225..dde4d9e 100644
--- a/src/include/rados.h
+++ b/src/include/rados.h
@@ -250,6 +250,10 @@ extern const char *ceph_osd_state_name(int s);
 									    \
 	/* hints */							    \
 	f(SETALLOCHINT,	__CEPH_OSD_OP(WR, DATA, 35),	"set-alloc-hint")   \
+                                                                            \
+	/* cache pin/unpin */						    \
+	f(CACHE_PIN,	__CEPH_OSD_OP(WR, DATA, 36),	"cache-pin")        \
+	f(CACHE_UNPIN,	__CEPH_OSD_OP(WR, DATA, 37),	"cache-unpin")      \
 									    \
 	/** multi **/							    \
 	f(CLONERANGE,	__CEPH_OSD_OP(WR, MULTI, 1),	"clonerange")	    \
diff --git a/src/include/rados/buffer.h b/src/include/rados/buffer.h
index f28bc5e..5f90d7b 100644
--- a/src/include/rados/buffer.h
+++ b/src/include/rados/buffer.h
@@ -434,7 +434,7 @@ public:
     void zero();
     void zero(unsigned o, unsigned l);
 
-    bool is_contiguous();
+    bool is_contiguous() const;
     void rebuild();
     void rebuild(ptr& nb);
     void rebuild_aligned(unsigned align);
diff --git a/src/include/rados/librados.h b/src/include/rados/librados.h
index 3aebf20..f13737b 100644
--- a/src/include/rados/librados.h
+++ b/src/include/rados/librados.h
@@ -809,8 +809,33 @@ CEPH_RADOS_API int rados_ioctx_pool_set_auid(rados_ioctx_t io, uint64_t auid);
  */
 CEPH_RADOS_API int rados_ioctx_pool_get_auid(rados_ioctx_t io, uint64_t *auid);
 
-CEPH_RADOS_API int rados_ioctx_pool_requires_alignment(rados_ioctx_t io);
-CEPH_RADOS_API uint64_t rados_ioctx_pool_required_alignment(rados_ioctx_t io);
+/* deprecated, use rados_ioctx_pool_requires_alignment2 instead */
+CEPH_RADOS_API int rados_ioctx_pool_requires_alignment(rados_ioctx_t io)
+  __attribute__((deprecated));
+
+/**
+ * Test whether the specified pool requires alignment or not.
+ *
+ * @param io pool to query
+ * @param requires 1 if alignment is required, 0 if not.
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_pool_requires_alignment2(rados_ioctx_t io,
+  int *requires);
+
+/* deprecated, use rados_ioctx_pool_required_alignment2 instead */
+CEPH_RADOS_API uint64_t rados_ioctx_pool_required_alignment(rados_ioctx_t io)
+  __attribute__((deprecated));
+
+/**
+ * Get the alignment flavor of a pool
+ *
+ * @param io pool to query
+ * @param alignment where to store the alignment flavor
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_ioctx_pool_required_alignment2(rados_ioctx_t io,
+  uint64_t *alignment);
 
 /**
  * Get the pool id of the io context
@@ -2092,6 +2117,7 @@ CEPH_RADOS_API int rados_notify(rados_ioctx_t io, const char *o, uint64_t ver,
  * -ETIMEDOUT).
  *
  * @param io the pool the object is in
+ * @param completion what to do when the operation has been attempted
  * @param o the name of the object
  * @param buf data to send to watchers
  * @param buf_len length of buf in bytes
@@ -2104,6 +2130,11 @@ CEPH_RADOS_API int rados_notify2(rados_ioctx_t io, const char *o,
 				 const char *buf, int buf_len,
 				 uint64_t timeout_ms,
 				 char **reply_buffer, size_t *reply_buffer_len);
+CEPH_RADOS_API int rados_aio_notify(rados_ioctx_t io, const char *o,
+                                    rados_completion_t completion,
+                                    const char *buf, int buf_len,
+                                    uint64_t timeout_ms, char **reply_buffer,
+                                    size_t *reply_buffer_len);
 
 /**
 * Acknowledge receipt of a notify
@@ -2137,6 +2168,29 @@ CEPH_RADOS_API int rados_watch_flush(rados_t cluster);
 /** @} Watch/Notify */
 
 /**
+ * Pin an object in the cache tier
+ *
+ * When an object is pinned in the cache tier, it stays in the cache
+ * tier, and won't be flushed out.
+ *
+ * @param io the pool the object is in
+ * @param o the object id
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_cache_pin(rados_ioctx_t io, const char *o);
+
+/**
+ * Unpin an object in the cache tier
+ *
+ * After an object is unpinned in the cache tier, it can be flushed out.
+ *
+ * @param io the pool the object is in
+ * @param o the object id
+ * @returns 0 on success, negative error code on failure
+ */
+CEPH_RADOS_API int rados_cache_unpin(rados_ioctx_t io, const char *o);
+
+/**
  * @name Hints
  *
  * @{
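
A minimal sketch of the new C calls, assuming io is an open rados_ioctx_t on a cache-tier pool and "myobject" exists:

    // Query the pool's alignment via the new non-deprecated variants.
    int req = 0;
    uint64_t alignment = 0;
    rados_ioctx_pool_requires_alignment2(io, &req);
    if (req)
      rados_ioctx_pool_required_alignment2(io, &alignment);

    // Pin the object so the cache tier won't flush it out.
    int r = rados_cache_pin(io, "myobject");
    if (r < 0) { /* handle error */ }

    // Later, allow it to be flushed again.
    r = rados_cache_unpin(io, "myobject");
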
diff --git a/src/include/rados/librados.hpp b/src/include/rados/librados.hpp
index b92a94f..a184fe1 100644
--- a/src/include/rados/librados.hpp
+++ b/src/include/rados/librados.hpp
@@ -442,6 +442,14 @@ namespace librados
     void set_alloc_hint(uint64_t expected_object_size,
                         uint64_t expected_write_size);
 
+    /**
+     * Pin/unpin an object in the cache tier
+     *
+     * Errors are reported when the enclosing operation completes.
+     */
+    void cache_pin();
+    void cache_unpin();
+
     friend class IoCtx;
   };
 
@@ -639,7 +647,9 @@ namespace librados
     std::string get_pool_name();
 
     bool pool_requires_alignment();
+    int pool_requires_alignment2(bool * requires);
     uint64_t pool_required_alignment();
+    int pool_required_alignment2(uint64_t * alignment);
 
     // create an object
     int create(const std::string& oid, bool exclusive);
@@ -980,6 +990,12 @@ namespace librados
 		bufferlist& bl,         ///< optional broadcast payload
 		uint64_t timeout_ms,    ///< timeout (in ms)
 		bufferlist *pbl);       ///< reply buffer
+    int aio_notify(const std::string& o,   ///< object
+                   AioCompletion *c,       ///< completion when notify completes
+                   bufferlist& bl,         ///< optional broadcast payload
+                   uint64_t timeout_ms,    ///< timeout (in ms)
+                   bufferlist *pbl);       ///< reply buffer
+
     int list_watchers(const std::string& o, std::list<obj_watch_t> *out_watchers);
     int list_snaps(const std::string& o, snap_set_t *out_snaps);
     void set_notify_timeout(uint32_t timeout);
@@ -1034,6 +1050,15 @@ namespace librados
     void set_assert_version(uint64_t ver);
     void set_assert_src_version(const std::string& o, uint64_t ver);
 
+    /**
+     * Pin/unpin an object in cache tier
+     *
+     * @param o the name of the object
+     * @returns 0 on success, negative error code on failure
+     */
+    int cache_pin(const std::string& o);
+    int cache_unpin(const std::string& o);
+
     const std::string& get_pool_name() const;
 
     void locator_set_key(const std::string& key);
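
The C++ equivalents on librados::IoCtx follow the same pattern; a minimal sketch, assuming ioctx is open on a cache-tier pool:

    int r = ioctx.cache_pin("myobject");
    if (r == 0) {
      // ... object stays resident in the cache tier ...
      r = ioctx.cache_unpin("myobject");   // eligible for flushing again
    }
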
diff --git a/src/include/rbd/features.h b/src/include/rbd/features.h
index ac7b558..a78e6fc 100644
--- a/src/include/rbd/features.h
+++ b/src/include/rbd/features.h
@@ -7,6 +7,7 @@
 #define RBD_FEATURE_OBJECT_MAP		(1<<3)
 #define RBD_FEATURE_FAST_DIFF           (1<<4)
 #define RBD_FEATURE_DEEP_FLATTEN        (1<<5)
+#define RBD_FEATURE_JOURNALING          (1<<6)
 
 /// features that make an image inaccessible for read or write by
 /// clients that don't understand them
@@ -18,24 +19,28 @@
 					 RBD_FEATURE_EXCLUSIVE_LOCK | \
 					 RBD_FEATURE_OBJECT_MAP     | \
                                          RBD_FEATURE_FAST_DIFF      | \
-                                         RBD_FEATURE_DEEP_FLATTEN)
+                                         RBD_FEATURE_DEEP_FLATTEN   | \
+                                         RBD_FEATURE_JOURNALING)
 
 #define RBD_FEATURES_ALL          	(RBD_FEATURE_LAYERING       | \
 					 RBD_FEATURE_STRIPINGV2     | \
                                    	 RBD_FEATURE_EXCLUSIVE_LOCK | \
                                          RBD_FEATURE_OBJECT_MAP     | \
                                          RBD_FEATURE_FAST_DIFF      | \
-                                         RBD_FEATURE_DEEP_FLATTEN)
+                                         RBD_FEATURE_DEEP_FLATTEN   | \
+                                         RBD_FEATURE_JOURNALING)
 
 /// features that may be dynamically enabled or disabled
 #define RBD_FEATURES_MUTABLE            (RBD_FEATURE_EXCLUSIVE_LOCK | \
                                          RBD_FEATURE_OBJECT_MAP     | \
-                                         RBD_FEATURE_FAST_DIFF)
+                                         RBD_FEATURE_FAST_DIFF      | \
+                                         RBD_FEATURE_JOURNALING)
 
 /// features that only work when used with a single client
 /// using the image for writes
 #define RBD_FEATURES_SINGLE_CLIENT (RBD_FEATURE_EXCLUSIVE_LOCK | \
                                     RBD_FEATURE_OBJECT_MAP     | \
-                                    RBD_FEATURE_FAST_DIFF)
+                                    RBD_FEATURE_FAST_DIFF      | \
+                                    RBD_FEATURE_JOURNALING)
 
 #endif
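
Since the new journaling bit is included in RBD_FEATURES_MUTABLE, it can be toggled on an existing image; a minimal sketch of checking it, assuming features was read from an opened image:

    if (features & RBD_FEATURE_JOURNALING) {
      // image records all modifications to its journal
    }
    // only mutable features may be enabled/disabled at runtime:
    bool can_toggle = (RBD_FEATURE_JOURNALING & RBD_FEATURES_MUTABLE) != 0;
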
diff --git a/src/include/rbd/librbd.h b/src/include/rbd/librbd.h
index 690dbbd..ba64219 100644
--- a/src/include/rbd/librbd.h
+++ b/src/include/rbd/librbd.h
@@ -52,6 +52,7 @@ extern "C" {
 
 typedef void *rbd_snap_t;
 typedef void *rbd_image_t;
+typedef void *rbd_image_options_t;
 
 typedef int (*librbd_progress_fn_t)(uint64_t offset, uint64_t total, void *ptr);
 
@@ -76,6 +77,30 @@ typedef struct {
 
 CEPH_RBD_API void rbd_version(int *major, int *minor, int *extra);
 
+/* image options */
+enum {
+  RBD_IMAGE_OPTION_FORMAT = 0,
+  RBD_IMAGE_OPTION_FEATURES = 1,
+  RBD_IMAGE_OPTION_ORDER = 2,
+  RBD_IMAGE_OPTION_STRIPE_UNIT = 3,
+  RBD_IMAGE_OPTION_STRIPE_COUNT = 4,
+};
+
+CEPH_RBD_API void rbd_image_options_create(rbd_image_options_t* opts);
+CEPH_RBD_API void rbd_image_options_destroy(rbd_image_options_t opts);
+CEPH_RBD_API int rbd_image_options_set_string(rbd_image_options_t opts,
+					      int optname, const char* optval);
+CEPH_RBD_API int rbd_image_options_set_uint64(rbd_image_options_t opts,
+					      int optname, uint64_t optval);
+CEPH_RBD_API int rbd_image_options_get_string(rbd_image_options_t opts,
+					      int optname, char* optval,
+					      size_t maxlen);
+CEPH_RBD_API int rbd_image_options_get_uint64(rbd_image_options_t opts,
+					      int optname, uint64_t* optval);
+CEPH_RBD_API int rbd_image_options_unset(rbd_image_options_t opts, int optname);
+CEPH_RBD_API void rbd_image_options_clear(rbd_image_options_t opts);
+CEPH_RBD_API int rbd_image_options_is_empty(rbd_image_options_t opts);
+
 /* images */
 CEPH_RBD_API int rbd_list(rados_ioctx_t io, char *names, size_t *size);
 CEPH_RBD_API int rbd_create(rados_ioctx_t io, const char *name, uint64_t size,
@@ -102,6 +127,8 @@ CEPH_RBD_API int rbd_create2(rados_ioctx_t io, const char *name, uint64_t size,
 CEPH_RBD_API int rbd_create3(rados_ioctx_t io, const char *name, uint64_t size,
 		             uint64_t features, int *order,
 		             uint64_t stripe_unit, uint64_t stripe_count);
+CEPH_RBD_API int rbd_create4(rados_ioctx_t io, const char *name, uint64_t size,
+			     rbd_image_options_t opts);
 CEPH_RBD_API int rbd_clone(rados_ioctx_t p_ioctx, const char *p_name,
 	                   const char *p_snapname, rados_ioctx_t c_ioctx,
 	                   const char *c_name, uint64_t features, int *c_order);
@@ -109,6 +136,9 @@ CEPH_RBD_API int rbd_clone2(rados_ioctx_t p_ioctx, const char *p_name,
 	                    const char *p_snapname, rados_ioctx_t c_ioctx,
 	                    const char *c_name, uint64_t features, int *c_order,
 	                    uint64_t stripe_unit, int stripe_count);
+CEPH_RBD_API int rbd_clone3(rados_ioctx_t p_ioctx, const char *p_name,
+	                    const char *p_snapname, rados_ioctx_t c_ioctx,
+	                    const char *c_name, rbd_image_options_t c_opts);
 CEPH_RBD_API int rbd_remove(rados_ioctx_t io, const char *name);
 CEPH_RBD_API int rbd_remove_with_progress(rados_ioctx_t io, const char *name,
 			                  librbd_progress_fn_t cb,
@@ -172,11 +202,18 @@ CEPH_RBD_API int rbd_rebuild_object_map(rbd_image_t image,
 CEPH_RBD_API int rbd_copy(rbd_image_t image, rados_ioctx_t dest_io_ctx,
                           const char *destname);
 CEPH_RBD_API int rbd_copy2(rbd_image_t src, rbd_image_t dest);
+CEPH_RBD_API int rbd_copy3(rbd_image_t src, rados_ioctx_t dest_io_ctx,
+			   const char *destname, rbd_image_options_t dest_opts);
 CEPH_RBD_API int rbd_copy_with_progress(rbd_image_t image, rados_ioctx_t dest_p,
                                         const char *destname,
                                         librbd_progress_fn_t cb, void *cbdata);
 CEPH_RBD_API int rbd_copy_with_progress2(rbd_image_t src, rbd_image_t dest,
 			                 librbd_progress_fn_t cb, void *cbdata);
+CEPH_RBD_API int rbd_copy_with_progress3(rbd_image_t image,
+					 rados_ioctx_t dest_p,
+					 const char *destname,
+					 rbd_image_options_t dest_opts,
+					 librbd_progress_fn_t cb, void *cbdata);
 
 /* snapshots */
 CEPH_RBD_API int rbd_snap_list(rbd_image_t image, rbd_snap_info_t *snaps,
@@ -189,6 +226,8 @@ CEPH_RBD_API int rbd_snap_rollback_with_progress(rbd_image_t image,
                                                  const char *snapname,
 				                 librbd_progress_fn_t cb,
                                                  void *cbdata);
+CEPH_RBD_API int rbd_snap_rename(rbd_image_t image, const char *snapname,
+				 const char* dstsnapsname);
 /**
  * Prevent a snapshot from being deleted until it is unprotected.
  *
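
A minimal sketch of creating an image through the new options API, assuming io is an open rados_ioctx_t (the option values are illustrative):

    rbd_image_options_t opts;
    rbd_image_options_create(&opts);
    rbd_image_options_set_uint64(opts, RBD_IMAGE_OPTION_FORMAT, 2);
    rbd_image_options_set_uint64(opts, RBD_IMAGE_OPTION_ORDER, 22);  // 4 MiB objects
    int r = rbd_create4(io, "myimage", 1ULL << 30, opts);            // 1 GiB image
    rbd_image_options_destroy(opts);
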
diff --git a/src/include/rbd/librbd.hpp b/src/include/rbd/librbd.hpp
index e587916..a64ffa6 100644
--- a/src/include/rbd/librbd.hpp
+++ b/src/include/rbd/librbd.hpp
@@ -29,6 +29,7 @@ namespace librbd {
   using librados::IoCtx;
 
   class Image;
+  class ImageOptions;
   typedef void *image_ctx_t;
   typedef void *completion_t;
   typedef void (*callback_t)(completion_t cb, void *arg);
@@ -86,12 +87,16 @@ public:
   int create3(IoCtx& io_ctx, const char *name, uint64_t size,
 	      uint64_t features, int *order,
 	      uint64_t stripe_unit, uint64_t stripe_count);
+  int create4(IoCtx& io_ctx, const char *name, uint64_t size,
+	      ImageOptions& opts);
   int clone(IoCtx& p_ioctx, const char *p_name, const char *p_snapname,
 	       IoCtx& c_ioctx, const char *c_name, uint64_t features,
 	       int *c_order);
   int clone2(IoCtx& p_ioctx, const char *p_name, const char *p_snapname,
 	     IoCtx& c_ioctx, const char *c_name, uint64_t features,
 	     int *c_order, uint64_t stripe_unit, int stripe_count);
+  int clone3(IoCtx& p_ioctx, const char *p_name, const char *p_snapname,
+	     IoCtx& c_ioctx, const char *c_name, ImageOptions& opts);
   int remove(IoCtx& io_ctx, const char *name);
   int remove_with_progress(IoCtx& io_ctx, const char *name, ProgressContext& pctx);
   int rename(IoCtx& src_io_ctx, const char *srcname, const char *destname);
@@ -102,6 +107,27 @@ private:
   const RBD& operator=(const RBD& rhs);
 };
 
+class CEPH_RBD_API ImageOptions {
+public:
+  ImageOptions();
+  ImageOptions(rbd_image_options_t opts);
+  ~ImageOptions();
+
+  int set(int optname, const std::string& optval);
+  int set(int optname, uint64_t optval);
+  int get(int optname, std::string* optval) const;
+  int get(int optname, uint64_t* optval) const;
+  int unset(int optname);
+  void clear();
+  bool empty() const;
+
+private:
+  friend class RBD;
+  friend class Image;
+
+  rbd_image_options_t opts;
+};
+
 class CEPH_RBD_API Image
 {
 public:
@@ -130,9 +156,12 @@ public:
 
   int copy(IoCtx& dest_io_ctx, const char *destname);
   int copy2(Image& dest);
+  int copy3(IoCtx& dest_io_ctx, const char *destname, ImageOptions& opts);
   int copy_with_progress(IoCtx& dest_io_ctx, const char *destname,
 			 ProgressContext &prog_ctx);
   int copy_with_progress2(Image& dest, ProgressContext &prog_ctx);
+  int copy_with_progress3(IoCtx& dest_io_ctx, const char *destname,
+			  ImageOptions& opts, ProgressContext &prog_ctx);
 
   /* striping */
   uint64_t get_stripe_unit() const;
@@ -165,6 +194,7 @@ public:
   int snap_unprotect(const char *snap_name);
   int snap_is_protected(const char *snap_name, bool *is_protected);
   int snap_set(const char *snap_name);
+  int snap_rename(const char *srcname, const char *dstname);
 
   /* I/O */
   ssize_t read(uint64_t ofs, size_t len, ceph::bufferlist& bl);
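
And the C++ form; a minimal sketch assuming rbd is a librbd::RBD instance and ioctx an open IoCtx:

    librbd::ImageOptions opts;
    opts.set(RBD_IMAGE_OPTION_FEATURES,
             static_cast<uint64_t>(RBD_FEATURE_LAYERING));
    int r = rbd.create4(ioctx, "myimage", 1ULL << 30, opts);
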
diff --git a/src/include/sock_compat.h b/src/include/sock_compat.h
index 5faacc3..56eb92b 100644
--- a/src/include/sock_compat.h
+++ b/src/include/sock_compat.h
@@ -11,16 +11,4 @@
 # define MSG_MORE 0
 #endif
 
-/*
- * On BSD SO_NOSIGPIPE can be set via setsockopt to block SIGPIPE.
- */
-#ifndef MSG_NOSIGNAL
-# define MSG_NOSIGNAL 0
-# ifdef SO_NOSIGPIPE
-#  define CEPH_USE_SO_NOSIGPIPE
-# else
-#  error "Cannot block SIGPIPE!"
-# endif
-#endif
-
 #endif
diff --git a/src/include/types.h b/src/include/types.h
index f913af6..bf369f3 100644
--- a/src/include/types.h
+++ b/src/include/types.h
@@ -83,6 +83,11 @@ typedef off_t loff_t;
 typedef off_t off64_t;
 #endif
 
+#ifdef __sun
+typedef off_t loff_t;
+#endif
+
+
 // -- io helpers --
 
 template<class A, class B>
@@ -515,4 +520,34 @@ WRITE_EQ_OPERATORS_1(shard_id_t, id)
 WRITE_CMP_OPERATORS_1(shard_id_t, id)
 ostream &operator<<(ostream &lhs, const shard_id_t &rhs);
 
+#if defined(__sun)
+__s32  ceph_to_host_errno(__s32 e);
+#else
+#define  ceph_to_host_errno(e) (e)
+#endif
+
+struct errorcode32_t {
+  int32_t code;
+
+  errorcode32_t() {}
+  errorcode32_t(int32_t i) : code(i) {}
+
+  operator int() const { return code; }
+  int operator==(int i) {
+    return code==i;
+  }
+
+  void encode(bufferlist &bl) const {
+    ::encode(code, bl);
+  }
+  void decode(bufferlist::iterator &bl) {
+    ::decode(code, bl);
+    code = ceph_to_host_errno(code);
+  }
+};
+WRITE_CLASS_ENCODER(errorcode32_t)
+WRITE_EQ_OPERATORS_1(errorcode32_t, code)
+WRITE_CMP_OPERATORS_1(errorcode32_t, code)
+
+
 #endif
diff --git a/src/init-ceph.in b/src/init-ceph.in
index faeb7bd..a8f7a99 100755
--- a/src/init-ceph.in
+++ b/src/init-ceph.in
@@ -40,10 +40,26 @@ fi
 
 usage_exit() {
     echo "usage: $0 [options] {start|stop|restart|condrestart} [mon|osd|mds]..."
-    printf "\t-c ceph.conf\n"
-    printf "\t--cluster [cluster name]\tdefine the cluster name\n"
-    printf "\t--valgrind\trun via valgrind\n"
-    printf "\t--hostname [hostname]\toverride hostname lookup\n"
+    printf "Core options:\n"
+    printf "\t--allhosts / -a           execute (via ssh) on all hosts in conf file\n"
+    printf "\t--cluster [cluster name]  define the cluster name\n"
+    printf "\t--conf / -c [conf file]   use [conf file] instead of default\n"
+    printf "\t--help / -h               show this usage message\n"
+    printf "\t--hostname [hostname]     override hostname lookup\n"
+    printf "\t-m [mon addr]             mon address\n"
+    printf "\n"
+    printf "Other options:\n"
+    printf "\t--btrfs                   btrfs\n"
+    printf "\t--nobtrfs                 no btrfs\n"
+    printf "\t--btrfsumount             btrfs umount\n"
+    printf "\t--fsmount                 fsmount\n"
+    printf "\t--nofsmount               no fsmount\n"
+    printf "\t--fsumount                fsumount\n"
+    printf "\t--restart                 restart on core dump\n"
+    printf "\t--norestart               do not restart on core dump\n"
+    printf "\t--valgrind                run via valgrind\n"
+    printf "\t--novalgrind              do not run via valgrind\n"
+    printf "\t--verbose / -v            be verbose\n"
     exit
 }
 
@@ -109,11 +125,17 @@ stop_daemon() {
 ## command line options
 options=
 
-version=0
+OPTS=$(getopt -n 'init-ceph' -o 'hvam:c:' -l 'help,verbose,valgrind,novalgrind,allhosts,restart,norestart,btrfs,nobtrfs,fsmount,nofsmount,btrfsumount,fsumount,conf:,cluster:,hostname:' -- "$@")
+if [ $? != 0 ]
+then
+    exit 1
+fi
+
+eval set -- "$OPTS"
+
 dovalgrind=
 docrun=
 allhosts=0
-debug=0
 monaddr=
 dofsmount=1
 dofsumount=0
@@ -141,6 +163,9 @@ case $1 in
     --norestart)
 	    docrun=0
 	    ;;
+    -h | --help)
+            usage_exit
+            ;;
     -m )
 	    [ -z "$2" ] && usage_exit
 	    options="$options $1"
@@ -175,6 +200,10 @@ case $1 in
 	    shift
 	    hostname=$1
             ;;
+    -- )
+            shift
+            break
+            ;;
     *)
 	    echo unrecognized option \'$1\'
 	    usage_exit
@@ -184,7 +213,6 @@ options="$options $1"
 shift
 done
 
-
 # if `--cluster` was not passed in, fallback to looking at the config name
 if [ -z "$cluster" ]; then
     cluster=`echo $conf | awk -F'/' '{print $(NF)}' | cut -d'.' -f 1`
diff --git a/src/init-radosgw b/src/init-radosgw
index b7569a0..f18a761 100644
--- a/src/init-radosgw
+++ b/src/init-radosgw
@@ -104,7 +104,11 @@ case "$1" in
 	    elif [ -n "$SYSTEMD_RUN" ]; then
                 $SYSTEMD_RUN -r su "$user" -c "ulimit -n 32768; $RADOSGW -n $name"
             else
-		ulimit -n 32768
+                ulimit -n 32768
+                core_limit=`ceph-conf -n $name 'core file limit'`
+                if [ -n "$core_limit" ]; then
+                    DAEMON_COREFILE_LIMIT="$core_limit"
+                fi
                 daemon --user="$user" "$RADOSGW -n $name"
             fi
         done
@@ -114,7 +118,7 @@ case "$1" in
 	if [ $DEBIAN -eq 1 ]; then
             start-stop-daemon --stop --signal HUP -x $RADOSGW --oknodo
 	else
-            killproc $RADOSGW -SIGHUP
+            killproc $RADOSGW -HUP
 	fi
 	;;
     restart|force-reload)
diff --git a/src/init-rbdmap b/src/init-rbdmap
index bad2754..cc8b7f5 100755
--- a/src/init-rbdmap
+++ b/src/init-rbdmap
@@ -24,115 +24,16 @@ if [ -e /lib/lsb/init-functions ]; then
     . /lib/lsb/init-functions
 fi
 
-do_map() {
-	if [ ! -f "$RBDMAPFILE" ]; then
-		logger -p "daemon.warning" -t init-rbdmap "No $RBDMAPFILE found."
-		exit 0
-	fi
 
-	# Read /etc/rbdtab to create non-existant mapping
-	RET=0
-	while read DEV PARAMS; do
-		case "$DEV" in
-		  ""|\#*)
-			continue
-			;;
-		  */*)
-			;;
-		  *)
-			DEV=rbd/$DEV
-			;;
-		esac
-		logger -p "daemon.debug" -t init-rbdmap "Mapping '${DEV}'"
-		newrbd=""
-		MAP_RV=""
-		OIFS=$IFS
-		IFS=','
-		for PARAM in ${PARAMS[@]}; do
-			CMDPARAMS="$CMDPARAMS --$(echo $PARAM | tr '=' ' ')"
-		done
-		IFS=$OIFS
-		if [ -b /dev/rbd/$DEV ]; then
-			MAP_RV="$(readlink -f /dev/rbd/$DEV)"
-		else
-			MAP_RV="$(rbd map $DEV $CMDPARAMS 2>&1)"
-			if [ $? -eq 0 ]; then
-			    newrbd="yes"
-			else
-			    RET=$((${RET}+$?))
-			    logger -p "daemon.warning" -t init-rbdmap "Failed to map '${DEV}"
-			    continue
-			fi
-		fi
-		logger -p "daemon.debug" -t init-rbdmap "Mapped '${DEV}' to '${MAP_RV}'"
-
-		if [ "$newrbd" ]; then
-			## Mount new rbd
-			MNT_RV=""
-			mount --fake /dev/rbd/$DEV >>/dev/null 2>&1 \
-			&& MNT_RV=$(mount -vn /dev/rbd/$DEV 2>&1)
-			[ -n "${MNT_RV}" ] && logger -p "daemon.debug" -t init-rbdmap "Mounted '${MAP_RV}' to '${MNT_RV}'"
-
-			## post-mapping
-			if [ -x "/etc/ceph/rbd.d/${DEV}" ]; then
-			    logger -p "daemon.debug" -t init-rbdmap "Running post-map hook '/etc/ceph/rbd.d/${DEV}'"
-			    /etc/ceph/rbd.d/${DEV} map "/dev/rbd/${DEV}"
-			fi
-		fi
-	done < $RBDMAPFILE
-	exit ${RET}
-
-}
-
-do_unmap() {
-	RET=0
-	## Unmount and unmap all rbd devices
-	if ls /dev/rbd[0-9]* >/dev/null 2>&1; then
-		for DEV in /dev/rbd[0-9]*; do
-			## pre-unmapping
-			for L in $(find /dev/rbd -type l); do
-			    LL="${L##/dev/rbd/}"
-			    if [ "$(readlink -f $L)" = "${DEV}" ] \
-			    && [ -x "/etc/ceph/rbd.d/${LL}" ]; then
-			        logger -p "daemon.debug" -t init-rbdmap "Running pre-unmap hook for '${DEV}': '/etc/ceph/rbd.d/${LL}'"
-			        /etc/ceph/rbd.d/${LL} unmap "$L"
-			        break
-			    fi
-			done
-
-			logger -p "daemon.debug" -t init-rbdmap "Unmapping '${DEV}'"
-			MNT=$(findmnt --mtab --source ${DEV} --noheadings | awk '{print $1'})
-			if [ -n "${MNT}" ]; then
-			    logger -p "daemon.debug" -t init-rbdmap "Unmounting '${MNT}'"
-			    umount "${MNT}" >>/dev/null 2>&1
-			fi
-			if mountpoint -q "${MNT}"; then
-			    ## Un-mounting failed.
-			    logger -p "daemon.warning" -t init-rbdmap "Failed to unmount '${MNT}'"
-			    RET=$((${RET}+1))
-			    continue
-			fi
-			## Un-mapping.
-			rbd unmap $DEV >>/dev/null 2>&1
-			if [ $? -ne 0 ]; then
-			    logger -p "daemon.warning" -t init-rbdmap "Failed to unmap '${MNT}'"
-			    RET=$((${RET}+$?))
-			    continue
-			fi
-			logger -p "daemon.debug" -t init-rbdmap "Unmapped '${DEV}'"
-		done
-	fi
-	exit ${RET}
-}
 
 
 case "$1" in
   start)
-	do_map
+	rbdmap map
 	;;
 
   stop)
-	do_unmap
+	rbdmap unmap
 	;;
 
   restart|force-reload)
@@ -141,7 +42,7 @@ case "$1" in
 	;;
 
   reload)
-	do_map
+	rbdmap map
 	;;
 
   status)
diff --git a/src/java/Makefile.in b/src/java/Makefile.in
index 39e879b..3a6408e 100644
--- a/src/java/Makefile.in
+++ b/src/java/Makefile.in
@@ -171,6 +171,7 @@ AUTOMAKE = @AUTOMAKE@
 AWK = @AWK@
 BOOST_PROGRAM_OPTIONS_LIBS = @BOOST_PROGRAM_OPTIONS_LIBS@
 BOOST_RANDOM_LIBS = @BOOST_RANDOM_LIBS@
+BOOST_REGEX_LIBS = @BOOST_REGEX_LIBS@
 BOOST_THREAD_LIBS = @BOOST_THREAD_LIBS@
 CC = @CC@
 CCAS = @CCAS@
diff --git a/src/journal/AsyncOpTracker.cc b/src/journal/AsyncOpTracker.cc
new file mode 100644
index 0000000..8c24088
--- /dev/null
+++ b/src/journal/AsyncOpTracker.cc
@@ -0,0 +1,39 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "journal/AsyncOpTracker.h"
+#include "journal/Utils.h"
+#include "include/assert.h"
+
+namespace journal {
+
+AsyncOpTracker::AsyncOpTracker()
+  : m_lock(utils::unique_lock_name("AsyncOpTracker::m_lock", this)),
+    m_pending_ops(0) {
+}
+
+AsyncOpTracker::~AsyncOpTracker() {
+  wait_for_ops();
+}
+
+void AsyncOpTracker::start_op() {
+  Mutex::Locker locker(m_lock);
+  ++m_pending_ops;
+}
+
+void AsyncOpTracker::finish_op() {
+  Mutex::Locker locker(m_lock);
+  assert(m_pending_ops > 0);
+  if (--m_pending_ops == 0) {
+    m_cond.Signal();
+  }
+}
+
+void AsyncOpTracker::wait_for_ops() {
+  Mutex::Locker locker(m_lock);
+  while (m_pending_ops > 0) {
+    m_cond.Wait(m_lock);
+  }
+}
+
+} // namespace journal
diff --git a/src/journal/AsyncOpTracker.h b/src/journal/AsyncOpTracker.h
new file mode 100644
index 0000000..cec332f
--- /dev/null
+++ b/src/journal/AsyncOpTracker.h
@@ -0,0 +1,32 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_JOURNAL_ASYNC_OP_TRACKER_H
+#define CEPH_JOURNAL_ASYNC_OP_TRACKER_H
+
+#include "include/int_types.h"
+#include "common/Cond.h"
+#include "common/Mutex.h"
+
+namespace journal {
+
+class AsyncOpTracker {
+public:
+  AsyncOpTracker();
+  ~AsyncOpTracker();
+
+  void start_op();
+  void finish_op();
+
+  void wait_for_ops();
+
+private:
+  Mutex m_lock;
+  Cond m_cond;
+  uint32_t m_pending_ops;
+
+};
+
+} // namespace journal
+
+#endif // CEPH_JOURNAL_ASYNC_OP_TRACKER_H
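
A minimal usage sketch: each in-flight async operation brackets itself with start_op()/finish_op(), and shutdown paths block in wait_for_ops() until the count drains (launch_async_work is hypothetical):

    journal::AsyncOpTracker tracker;

    tracker.start_op();
    launch_async_work([&tracker](int r) {  // hypothetical async callback
      // ... handle completion ...
      tracker.finish_op();                 // signals waiters at zero
    });

    tracker.wait_for_ops();                // blocks until all ops finish
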
diff --git a/src/journal/Entry.cc b/src/journal/Entry.cc
new file mode 100644
index 0000000..bd26ebe
--- /dev/null
+++ b/src/journal/Entry.cc
@@ -0,0 +1,156 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "journal/Entry.h"
+#include "include/encoding.h"
+#include "include/stringify.h"
+#include "common/Formatter.h"
+#include <sstream>
+
+#define dout_subsys ceph_subsys_journaler
+#undef dout_prefix
+#define dout_prefix *_dout << "Entry: "
+
+namespace journal {
+
+namespace {
+
+const uint32_t HEADER_FIXED_SIZE = 17; /// preamble, version, tid
+
+} // anonymous namespace
+
+void Entry::encode(bufferlist &bl) const {
+  bufferlist data_bl;
+  ::encode(preamble, data_bl);
+  ::encode(static_cast<uint8_t>(1), data_bl);
+  ::encode(m_tid, data_bl);
+  assert(HEADER_FIXED_SIZE == data_bl.length());
+
+  ::encode(m_tag, data_bl);
+  ::encode(m_data, data_bl);
+
+  uint32_t crc = data_bl.crc32c(0);
+  bl.claim_append(data_bl);
+  ::encode(crc, bl);
+}
+
+void Entry::decode(bufferlist::iterator &iter) {
+  uint32_t start_offset = iter.get_off();
+  uint64_t bl_preamble;
+  ::decode(bl_preamble, iter);
+  if (bl_preamble != preamble) {
+    throw buffer::malformed_input("incorrect preamble: " +
+                                  stringify(bl_preamble));
+  }
+
+  uint8_t version;
+  ::decode(version, iter);
+  if (version != 1) {
+    throw buffer::malformed_input("unknown version: " + stringify(version));
+  }
+
+  ::decode(m_tid, iter);
+  ::decode(m_tag, iter);
+  ::decode(m_data, iter);
+  uint32_t end_offset = iter.get_off();
+
+  uint32_t crc;
+  ::decode(crc, iter);
+
+  bufferlist data_bl;
+  data_bl.substr_of(iter.get_bl(), start_offset, end_offset - start_offset);
+  uint32_t actual_crc = data_bl.crc32c(0);
+  if (crc != actual_crc) {
+    throw buffer::malformed_input("crc mismatch: " + stringify(crc) +
+                                  " != " + stringify(actual_crc));
+  }
+}
+
+void Entry::dump(Formatter *f) const {
+  f->dump_string("tag", m_tag);
+  f->dump_unsigned("tid", m_tid);
+
+  std::stringstream data;
+  m_data.hexdump(data);
+  f->dump_string("data", data.str());
+}
+
+bool Entry::is_readable(bufferlist::iterator iter, uint32_t *bytes_needed) {
+  uint32_t start_off = iter.get_off();
+  if (iter.get_remaining() < HEADER_FIXED_SIZE) {
+    *bytes_needed = HEADER_FIXED_SIZE - iter.get_remaining();
+    return false;
+  }
+  uint64_t bl_preamble;
+  ::decode(bl_preamble, iter);
+  if (bl_preamble != preamble) {
+    *bytes_needed = 0;
+    return false;
+  }
+  iter.advance(HEADER_FIXED_SIZE - sizeof(bl_preamble));
+
+  if (iter.get_remaining() < sizeof(uint32_t)) {
+    *bytes_needed = sizeof(uint32_t) - iter.get_remaining();
+    return false;
+  }
+  uint32_t tag_size;
+  ::decode(tag_size, iter);
+
+  if (iter.get_remaining() < tag_size) {
+    *bytes_needed = tag_size - iter.get_remaining();
+    return false;
+  }
+  iter.advance(tag_size);
+
+  if (iter.get_remaining() < sizeof(uint32_t)) {
+    *bytes_needed = sizeof(uint32_t) - iter.get_remaining();
+    return false;
+  }
+  uint32_t data_size;
+  ::decode(data_size, iter);
+
+  if (iter.get_remaining() < data_size) {
+    *bytes_needed = data_size - iter.get_remaining();
+    return false;
+  }
+  iter.advance(data_size);
+  uint32_t end_off = iter.get_off();
+
+  if (iter.get_remaining() < sizeof(uint32_t)) {
+    *bytes_needed = sizeof(uint32_t) - iter.get_remaining();
+    return false;
+  }
+
+  bufferlist crc_bl;
+  crc_bl.substr_of(iter.get_bl(), start_off, end_off - start_off);
+
+  *bytes_needed = 0;
+  uint32_t crc;
+  ::decode(crc, iter);
+  if (crc != crc_bl.crc32c(0)) {
+    return false;
+  }
+  return true;
+}
+
+void Entry::generate_test_instances(std::list<Entry *> &o) {
+  o.push_back(new Entry("tag1", 123, bufferlist()));
+
+  bufferlist bl;
+  bl.append("data");
+  o.push_back(new Entry("tag2", 123, bl));
+}
+
+bool Entry::operator==(const Entry& rhs) const {
+  return (m_tag == rhs.m_tag && m_tid == rhs.m_tid &&
+          const_cast<bufferlist&>(m_data).contents_equal(
+            const_cast<bufferlist&>(rhs.m_data)));
+}
+
+std::ostream &operator<<(std::ostream &os, const Entry &entry) {
+  os << "Entry[tag=" << entry.get_tag() << ", tid=" << entry.get_tid() << ", "
+     << "data size=" << entry.get_data().length() << "]";
+  return os;
+}
+
+} // namespace journal
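
From encode() above, the on-wire entry layout is (sizes in bytes; the CRC covers everything before it):

    preamble   u64      8    fixed 0x3141592653589793
    version    u8       1    currently 1
    tid        u64      8    (HEADER_FIXED_SIZE = 17 ends here)
    tag        string        u32 length prefix + bytes
    data       bufferlist    u32 length prefix + bytes
    crc        u32      4    crc32c over all preceding bytes
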
diff --git a/src/journal/Entry.h b/src/journal/Entry.h
new file mode 100644
index 0000000..9e85df4
--- /dev/null
+++ b/src/journal/Entry.h
@@ -0,0 +1,62 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_JOURNAL_ENTRY_H
+#define CEPH_JOURNAL_ENTRY_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "include/encoding.h"
+#include <iosfwd>
+#include <string>
+
+namespace ceph {
+class Formatter;
+}
+
+namespace journal {
+
+class Entry {
+public:
+  Entry() : m_tid() {}
+  Entry(const std::string &tag, uint64_t tid, const bufferlist &data)
+    : m_tag(tag), m_tid(tid), m_data(data)
+  {
+  }
+
+  inline const std::string &get_tag() const {
+    return m_tag;
+  }
+  inline uint64_t get_tid() const {
+    return m_tid;
+  }
+  inline const bufferlist &get_data() const {
+    return m_data;
+  }
+
+  void encode(bufferlist &bl) const;
+  void decode(bufferlist::iterator &iter);
+  void dump(ceph::Formatter *f) const;
+
+  bool operator==(const Entry& rhs) const;
+
+  static bool is_readable(bufferlist::iterator iter, uint32_t *bytes_needed);
+  static void generate_test_instances(std::list<Entry *> &o);
+
+private:
+  static const uint64_t preamble = 0x3141592653589793;
+
+  std::string m_tag;
+  uint64_t m_tid;
+  bufferlist m_data;
+};
+
+std::ostream &operator<<(std::ostream &os, const Entry &entry);
+
+} // namespace journal
+
+using journal::operator<<;
+
+WRITE_CLASS_ENCODER(journal::Entry)
+
+#endif // CEPH_JOURNAL_ENTRY_H
diff --git a/src/journal/Future.cc b/src/journal/Future.cc
new file mode 100644
index 0000000..144f320
--- /dev/null
+++ b/src/journal/Future.cc
@@ -0,0 +1,40 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "journal/Future.h"
+#include "journal/FutureImpl.h"
+#include "include/assert.h"
+
+namespace journal {
+
+void Future::flush(Context *on_safe) {
+  m_future_impl->flush(on_safe);
+}
+
+void Future::wait(Context *on_safe) {
+  assert(on_safe != NULL);
+  m_future_impl->wait(on_safe);
+}
+
+bool Future::is_complete() const {
+  return m_future_impl->is_complete();
+}
+
+int Future::get_return_value() const {
+  return m_future_impl->get_return_value();
+}
+
+void intrusive_ptr_add_ref(FutureImpl *p) {
+  p->get();
+}
+
+void intrusive_ptr_release(FutureImpl *p) {
+  p->put();
+}
+
+std::ostream &operator<<(std::ostream &os, const Future &future) {
+  return os << *future.m_future_impl.get();
+}
+
+} // namespace journal
+
diff --git a/src/journal/Future.h b/src/journal/Future.h
new file mode 100644
index 0000000..1ddb606
--- /dev/null
+++ b/src/journal/Future.h
@@ -0,0 +1,58 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_JOURNAL_FUTURE_H
+#define CEPH_JOURNAL_FUTURE_H
+
+#include "include/int_types.h"
+#include <string>
+#include <iosfwd>
+#include <boost/intrusive_ptr.hpp>
+#include "include/assert.h"
+
+class Context;
+
+namespace journal {
+
+class FutureImpl;
+
+class Future {
+public:
+  typedef boost::intrusive_ptr<FutureImpl> FutureImplPtr;
+
+  Future() {}
+  Future(const FutureImplPtr &future_impl) : m_future_impl(future_impl) {}
+
+  inline bool is_valid() const {
+    return m_future_impl.get() != nullptr;
+  }
+
+  void flush(Context *on_safe);
+  void wait(Context *on_safe);
+
+  bool is_complete() const;
+  int get_return_value() const;
+
+private:
+  friend class Journaler;
+  friend std::ostream& operator<<(std::ostream&, const Future&);
+
+  inline FutureImplPtr get_future_impl() const {
+    return m_future_impl;
+  }
+
+  FutureImplPtr m_future_impl;
+};
+
+void intrusive_ptr_add_ref(FutureImpl *p);
+void intrusive_ptr_release(FutureImpl *p);
+
+std::ostream &operator<<(std::ostream &os, const Future &future);
+
+} // namespace journal
+
+using journal::intrusive_ptr_add_ref;
+using journal::intrusive_ptr_release;
+using journal::operator<<;
+
+#endif // CEPH_JOURNAL_FUTURE_H
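
A minimal sketch of consuming a Future synchronously, assuming future came from a Journaler append and C_SaferCond (common/Cond.h) behaves as elsewhere in the tree, i.e. a stack-allocatable Context you can block on:

    C_SaferCond on_safe;
    future.wait(&on_safe);   // on_safe completes once the entry is safe
    int r = on_safe.wait();  // block until then; r is the return value
    if (r < 0) { /* append failed */ }
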
diff --git a/src/journal/FutureImpl.cc b/src/journal/FutureImpl.cc
new file mode 100644
index 0000000..8365733
--- /dev/null
+++ b/src/journal/FutureImpl.cc
@@ -0,0 +1,149 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "journal/FutureImpl.h"
+#include "common/Finisher.h"
+#include "journal/Utils.h"
+
+namespace journal {
+
+FutureImpl::FutureImpl(Finisher &finisher, const std::string &tag, uint64_t tid,
+                       uint64_t commit_tid)
+  : RefCountedObject(NULL, 0), m_finisher(finisher), m_tag(tag), m_tid(tid),
+    m_commit_tid(commit_tid),
+    m_lock(utils::unique_lock_name("FutureImpl::m_lock", this)), m_safe(false),
+    m_consistent(false), m_return_value(0), m_flush_state(FLUSH_STATE_NONE),
+    m_consistent_ack(this) {
+}
+
+void FutureImpl::init(const FutureImplPtr &prev_future) {
+  // chain ourselves to the prior future (if any) so that we know when the
+  // journal is consistent
+  if (prev_future) {
+    m_prev_future = prev_future;
+    m_prev_future->wait(&m_consistent_ack);
+  } else {
+    m_consistent_ack.complete(0);
+  }
+}
+
+void FutureImpl::flush(Context *on_safe) {
+  bool complete;
+  FlushHandlerPtr flush_handler;
+  {
+    Mutex::Locker locker(m_lock);
+    complete = (m_safe && m_consistent);
+    if (!complete) {
+      if (on_safe != NULL) {
+        m_contexts.push_back(on_safe);
+      }
+
+      if (m_flush_state == FLUSH_STATE_NONE) {
+        m_flush_state = FLUSH_STATE_REQUESTED;
+        flush_handler = m_flush_handler;
+
+        // walk the chain backwards up to <splay width> futures
+        if (m_prev_future) {
+          m_prev_future->flush();
+        }
+      }
+    }
+  }
+
+  if (complete && on_safe != NULL) {
+    m_finisher.queue(on_safe, m_return_value);
+  } else if (flush_handler) {
+    // attached to journal object -- instruct it to flush all entries through
+    // this one.  possible to become detached while lock is released, so flush
+    // will be re-requested by the object if it doesn't own the future
+    flush_handler->flush(this);
+  }
+}
+
+void FutureImpl::wait(Context *on_safe) {
+  assert(on_safe != NULL);
+  {
+    Mutex::Locker locker(m_lock);
+    if (!m_safe || !m_consistent) {
+      m_contexts.push_back(on_safe);
+      return;
+    }
+  }
+  m_finisher.queue(on_safe, m_return_value);
+}
+
+bool FutureImpl::is_complete() const {
+  Mutex::Locker locker(m_lock);
+  return m_safe && m_consistent;
+}
+
+int FutureImpl::get_return_value() const {
+  Mutex::Locker locker(m_lock);
+  assert(m_safe && m_consistent);
+  return m_return_value;
+}
+
+bool FutureImpl::attach(const FlushHandlerPtr &flush_handler) {
+  Mutex::Locker locker(m_lock);
+  assert(!m_flush_handler);
+  m_flush_handler = flush_handler;
+  return m_flush_state != FLUSH_STATE_NONE;
+}
+
+void FutureImpl::safe(int r) {
+  Mutex::Locker locker(m_lock);
+  assert(!m_safe);
+  m_safe = true;
+  if (m_return_value == 0) {
+    m_return_value = r;
+  }
+
+  m_flush_handler.reset();
+  if (m_consistent) {
+    finish();
+  }
+}
+
+void FutureImpl::consistent(int r) {
+  Mutex::Locker locker(m_lock);
+  assert(!m_consistent);
+  m_consistent = true;
+  m_prev_future.reset();
+  if (m_return_value == 0) {
+    m_return_value = r;
+  }
+
+  if (m_safe) {
+    finish();
+  }
+}
+
+void FutureImpl::finish() {
+  assert(m_lock.is_locked());
+  assert(m_safe && m_consistent);
+
+  Contexts contexts;
+  contexts.swap(m_contexts);
+
+  m_lock.Unlock();
+  for (Contexts::iterator it = contexts.begin();
+       it != contexts.end(); ++it) {
+    (*it)->complete(m_return_value);
+  }
+  m_lock.Lock();
+}
+
+std::ostream &operator<<(std::ostream &os, const FutureImpl &future) {
+  os << "Future[tag=" << future.m_tag << ", tid=" << future.m_tid << "]";
+  return os;
+}
+
+void intrusive_ptr_add_ref(FutureImpl::FlushHandler *p) {
+  p->get();
+}
+
+void intrusive_ptr_release(FutureImpl::FlushHandler *p) {
+  p->put();
+}
+
+} // namespace journal
diff --git a/src/journal/FutureImpl.h b/src/journal/FutureImpl.h
new file mode 100644
index 0000000..d936805
--- /dev/null
+++ b/src/journal/FutureImpl.h
@@ -0,0 +1,128 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_JOURNAL_FUTURE_IMPL_H
+#define CEPH_JOURNAL_FUTURE_IMPL_H
+
+#include "include/int_types.h"
+#include "common/Mutex.h"
+#include "common/RefCountedObj.h"
+#include "journal/Future.h"
+#include <list>
+#include <boost/noncopyable.hpp>
+#include <boost/intrusive_ptr.hpp>
+#include "include/assert.h"
+
+class Context;
+class Finisher;
+
+namespace journal {
+
+class FutureImpl;
+typedef boost::intrusive_ptr<FutureImpl> FutureImplPtr;
+
+class FutureImpl : public RefCountedObject, boost::noncopyable {
+public:
+  struct FlushHandler {
+    virtual ~FlushHandler() {}
+    virtual void flush(const FutureImplPtr &future) = 0;
+    virtual void get() = 0;
+    virtual void put() = 0;
+  };
+  typedef boost::intrusive_ptr<FlushHandler> FlushHandlerPtr;
+
+  FutureImpl(Finisher &finisher, const std::string &tag, uint64_t tid,
+             uint64_t commit_tid);
+
+  void init(const FutureImplPtr &prev_future);
+
+  inline const std::string &get_tag() const {
+    return m_tag;
+  }
+  inline uint64_t get_tid() const {
+    return m_tid;
+  }
+  inline uint64_t get_commit_tid() const {
+    return m_commit_tid;
+  }
+
+  void flush(Context *on_safe = NULL);
+  void wait(Context *on_safe);
+
+  bool is_complete() const;
+  int get_return_value() const;
+
+  inline bool is_flush_in_progress() const {
+    Mutex::Locker locker(m_lock);
+    return (m_flush_state == FLUSH_STATE_IN_PROGRESS);
+  }
+  inline void set_flush_in_progress() {
+    Mutex::Locker locker(m_lock);
+    m_flush_state = FLUSH_STATE_IN_PROGRESS;
+  }
+
+  bool attach(const FlushHandlerPtr &flush_handler);
+  inline void detach() {
+    Mutex::Locker locker(m_lock);
+    assert(m_flush_handler);
+    m_flush_handler.reset();
+  }
+  inline FlushHandlerPtr get_flush_handler() const {
+    Mutex::Locker locker(m_lock);
+    return m_flush_handler;
+  }
+
+  void safe(int r);
+
+private:
+  friend std::ostream &operator<<(std::ostream &, const FutureImpl &);
+
+  typedef std::list<Context *> Contexts;
+
+  enum FlushState {
+    FLUSH_STATE_NONE,
+    FLUSH_STATE_REQUESTED,
+    FLUSH_STATE_IN_PROGRESS
+  };
+
+  struct C_ConsistentAck : public Context {
+    FutureImplPtr future;
+    C_ConsistentAck(FutureImpl *_future) : future(_future) {}
+    virtual void complete(int r) {
+      future->consistent(r);
+      future.reset();
+    }
+    virtual void finish(int r) {}
+  };
+
+  Finisher &m_finisher;
+  std::string m_tag;
+  uint64_t m_tid;
+  uint64_t m_commit_tid;
+
+  mutable Mutex m_lock;
+  FutureImplPtr m_prev_future;
+  bool m_safe;
+  bool m_consistent;
+  int m_return_value;
+
+  FlushHandlerPtr m_flush_handler;
+  FlushState m_flush_state;
+
+  C_ConsistentAck m_consistent_ack;
+  Contexts m_contexts;
+
+  void consistent(int r);
+  void finish();
+};
+
+void intrusive_ptr_add_ref(FutureImpl::FlushHandler *p);
+void intrusive_ptr_release(FutureImpl::FlushHandler *p);
+
+std::ostream &operator<<(std::ostream &os, const FutureImpl &future);
+
+} // namespace journal
+
+using journal::operator<<;
+
+#endif // CEPH_JOURNAL_FUTURE_IMPL_H
diff --git a/src/journal/JournalMetadata.cc b/src/journal/JournalMetadata.cc
new file mode 100644
index 0000000..409b4b9
--- /dev/null
+++ b/src/journal/JournalMetadata.cc
@@ -0,0 +1,493 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "journal/JournalMetadata.h"
+#include "journal/Utils.h"
+#include "common/errno.h"
+#include "common/Finisher.h"
+#include "common/Timer.h"
+#include "cls/journal/cls_journal_client.h"
+#include <set>
+
+#define dout_subsys ceph_subsys_journaler
+#undef dout_prefix
+#define dout_prefix *_dout << "JournalMetadata: "
+
+namespace journal {
+
+using namespace cls::journal;
+
+JournalMetadata::JournalMetadata(librados::IoCtx &ioctx,
+                                 const std::string &oid,
+                                 const std::string &client_id,
+                                 double commit_interval)
+    : RefCountedObject(NULL, 0), m_cct(NULL), m_oid(oid),
+      m_client_id(client_id), m_commit_interval(commit_interval), m_order(0),
+      m_splay_width(0), m_pool_id(-1), m_initialized(false), m_finisher(NULL),
+      m_timer(NULL), m_timer_lock("JournalMetadata::m_timer_lock"),
+      m_lock("JournalMetadata::m_lock"), m_commit_tid(0), m_watch_ctx(this),
+      m_watch_handle(0), m_minimum_set(0), m_active_set(0),
+      m_update_notifications(0), m_commit_position_ctx(NULL),
+      m_commit_position_task_ctx(NULL) {
+  m_ioctx.dup(ioctx);
+  m_cct = reinterpret_cast<CephContext*>(m_ioctx.cct());
+}
+
+JournalMetadata::~JournalMetadata() {
+  if (m_initialized) {
+    shutdown();
+  }
+}
+
+void JournalMetadata::init(Context *on_init) {
+  assert(!m_initialized);
+  m_initialized = true;
+
+  m_finisher = new Finisher(m_cct);
+  m_finisher->start();
+
+  m_timer = new SafeTimer(m_cct, m_timer_lock, true);
+  m_timer->init();
+
+  int r = m_ioctx.watch2(m_oid, &m_watch_handle, &m_watch_ctx);
+  if (r < 0) {
+    lderr(m_cct) << __func__ << ": failed to watch journal"
+                 << cpp_strerror(r) << dendl;
+    on_init->complete(r);
+    return;
+  }
+
+  C_ImmutableMetadata *ctx = new C_ImmutableMetadata(this, on_init);
+  client::get_immutable_metadata(m_ioctx, m_oid, &m_order, &m_splay_width,
+                                 &m_pool_id, ctx);
+}
+
+void JournalMetadata::shutdown() {
+  assert(m_initialized);
+  {
+    Mutex::Locker locker(m_lock);
+    m_initialized = false;
+
+    if (m_watch_handle != 0) {
+      m_ioctx.unwatch2(m_watch_handle);
+      m_watch_handle = 0;
+    }
+  }
+
+  if (m_timer != NULL) {
+    Mutex::Locker locker(m_timer_lock);
+    m_timer->shutdown();
+    delete m_timer;
+    m_timer = NULL;
+  }
+
+  if (m_finisher != NULL) {
+    m_finisher->stop();
+    delete m_finisher;
+    m_finisher = NULL;
+  }
+
+  librados::Rados rados(m_ioctx);
+  rados.watch_flush();
+
+  m_async_op_tracker.wait_for_ops();
+  m_ioctx.aio_flush();
+}
+
+int JournalMetadata::register_client(const std::string &description) {
+  ldout(m_cct, 10) << __func__ << ": " << m_client_id << dendl;
+  int r = client::client_register(m_ioctx, m_oid, m_client_id, description);
+  if (r < 0) {
+    lderr(m_cct) << "failed to register journal client '" << m_client_id
+                 << "': " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  notify_update();
+  return 0;
+}
+
+int JournalMetadata::unregister_client() {
+  assert(!m_client_id.empty());
+
+  ldout(m_cct, 10) << __func__ << ": " << m_client_id << dendl;
+  int r = client::client_unregister(m_ioctx, m_oid, m_client_id);
+  if (r < 0) {
+    lderr(m_cct) << "failed to unregister journal client '" << m_client_id
+                 << "': " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  notify_update();
+  return 0;
+}
+
+void JournalMetadata::add_listener(Listener *listener) {
+  Mutex::Locker locker(m_lock);
+  while (m_update_notifications > 0) {
+    m_update_cond.Wait(m_lock);
+  }
+  m_listeners.push_back(listener);
+}
+
+void JournalMetadata::remove_listener(Listener *listener) {
+  Mutex::Locker locker(m_lock);
+  while (m_update_notifications > 0) {
+    m_update_cond.Wait(m_lock);
+  }
+  m_listeners.remove(listener);
+}
+
+void JournalMetadata::set_minimum_set(uint64_t object_set) {
+  Mutex::Locker locker(m_lock);
+
+  ldout(m_cct, 20) << __func__ << ": current=" << m_minimum_set
+                   << ", new=" << object_set << dendl;
+  if (m_minimum_set >= object_set) {
+    return;
+  }
+
+  librados::ObjectWriteOperation op;
+  client::set_minimum_set(&op, object_set);
+
+  C_NotifyUpdate *ctx = new C_NotifyUpdate(this);
+  librados::AioCompletion *comp =
+    librados::Rados::aio_create_completion(ctx, NULL,
+                                           utils::rados_ctx_callback);
+  int r = m_ioctx.aio_operate(m_oid, comp, &op);
+  assert(r == 0);
+  comp->release();
+
+  m_minimum_set = object_set;
+}
+
+void JournalMetadata::set_active_set(uint64_t object_set) {
+  Mutex::Locker locker(m_lock);
+
+  ldout(m_cct, 20) << __func__ << ": current=" << m_active_set
+                   << ", new=" << object_set << dendl;
+  if (m_active_set >= object_set) {
+    return;
+  }
+
+  librados::ObjectWriteOperation op;
+  client::set_active_set(&op, object_set);
+
+  C_NotifyUpdate *ctx = new C_NotifyUpdate(this);
+  librados::AioCompletion *comp =
+    librados::Rados::aio_create_completion(ctx, NULL,
+                                           utils::rados_ctx_callback);
+  int r = m_ioctx.aio_operate(m_oid, comp, &op);
+  assert(r == 0);
+  comp->release();
+
+  m_active_set = object_set;
+}
+
+void JournalMetadata::flush_commit_position() {
+  {
+    Mutex::Locker locker(m_lock);
+    if (m_commit_position_task_ctx == NULL) {
+      return;
+    }
+
+    Mutex::Locker timer_locker(m_timer_lock);
+    m_timer->cancel_event(m_commit_position_task_ctx);
+    m_commit_position_task_ctx = NULL;
+  }
+  handle_commit_position_task();
+}
+
+void JournalMetadata::set_commit_position(
+    const ObjectSetPosition &commit_position, Context *on_safe) {
+  assert(on_safe != NULL);
+
+  Mutex::Locker locker(m_lock);
+  ldout(m_cct, 20) << __func__ << ": current=" << m_client.commit_position
+                   << ", new=" << commit_position << dendl;
+  if (commit_position <= m_client.commit_position ||
+      commit_position <= m_commit_position) {
+    on_safe->complete(-ESTALE);
+    return;
+  }
+
+  if (m_commit_position_ctx != NULL) {
+    m_commit_position_ctx->complete(-ESTALE);
+  }
+
+  m_client.commit_position = commit_position;
+  m_commit_position = commit_position;
+  m_commit_position_ctx = on_safe;
+  schedule_commit_task();
+}
+
+void JournalMetadata::reserve_tid(const std::string &tag, uint64_t tid) {
+  Mutex::Locker locker(m_lock);
+  uint64_t &allocated_tid = m_allocated_tids[tag];
+  if (allocated_tid <= tid) {
+    allocated_tid = tid + 1;
+  }
+}
+
+bool JournalMetadata::get_last_allocated_tid(const std::string &tag,
+                                             uint64_t *tid) const {
+  Mutex::Locker locker(m_lock);
+
+  AllocatedTids::const_iterator it = m_allocated_tids.find(tag);
+  if (it == m_allocated_tids.end()) {
+    return false;
+  }
+
+  assert(it->second > 0);
+  *tid = it->second - 1;
+  return true;
+}
+
+void JournalMetadata::handle_immutable_metadata(int r, Context *on_init) {
+  if (r < 0) {
+    lderr(m_cct) << "failed to initialize immutable metadata: "
+                 << cpp_strerror(r) << dendl;
+    on_init->complete(r);
+    return;
+  }
+
+  ldout(m_cct, 10) << "initialized immutable metadata" << dendl;
+  refresh(on_init);
+}
+
+void JournalMetadata::refresh(Context *on_complete) {
+  ldout(m_cct, 10) << "refreshing mutable metadata" << dendl;
+  C_Refresh *refresh = new C_Refresh(this, on_complete);
+  client::get_mutable_metadata(m_ioctx, m_oid, &refresh->minimum_set,
+                               &refresh->active_set,
+                               &refresh->registered_clients, refresh);
+}
+
+void JournalMetadata::handle_refresh_complete(C_Refresh *refresh, int r) {
+  ldout(m_cct, 10) << "refreshed mutable metadata: r=" << r << dendl;
+  if (r == 0) {
+    Mutex::Locker locker(m_lock);
+
+    Client client(m_client_id, "");
+    RegisteredClients::iterator it = refresh->registered_clients.find(client);
+    if (it != refresh->registered_clients.end()) {
+      m_minimum_set = refresh->minimum_set;
+      m_active_set = refresh->active_set;
+      m_registered_clients = refresh->registered_clients;
+      m_client = *it;
+
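+      // bump the notification count and drop m_lock while invoking
+      // listeners; add/remove_listener block until the count hits zero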
+      ++m_update_notifications;
+      m_lock.Unlock();
+      for (Listeners::iterator it = m_listeners.begin();
+           it != m_listeners.end(); ++it) {
+        (*it)->handle_update(this);
+      }
+      m_lock.Lock();
+      if (--m_update_notifications == 0) {
+        m_update_cond.Signal();
+      }
+    } else {
+      lderr(m_cct) << "failed to locate client: " << m_client_id << dendl;
+      r = -ENOENT;
+    }
+  }
+
+  if (refresh->on_finish != NULL) {
+    refresh->on_finish->complete(r);
+  }
+}
+
+void JournalMetadata::schedule_commit_task() {
+  assert(m_lock.is_locked());
+
+  Mutex::Locker timer_locker(m_timer_lock);
+  if (m_commit_position_task_ctx == NULL) {
+    m_commit_position_task_ctx = new C_CommitPositionTask(this);
+    m_timer->add_event_after(m_commit_interval, m_commit_position_task_ctx);
+  }
+}
+
+void JournalMetadata::handle_commit_position_task() {
+  Mutex::Locker locker(m_lock);
+
+  librados::ObjectWriteOperation op;
+  client::client_commit(&op, m_client_id, m_commit_position);
+
+  C_NotifyUpdate *ctx = new C_NotifyUpdate(this, m_commit_position_ctx);
+  m_commit_position_ctx = NULL;
+
+  librados::AioCompletion *comp =
+    librados::Rados::aio_create_completion(ctx, NULL,
+                                           utils::rados_ctx_callback);
+  int r = m_ioctx.aio_operate(m_oid, comp, &op);
+  assert(r == 0);
+  comp->release();
+}
+
+void JournalMetadata::schedule_watch_reset() {
+  assert(m_timer_lock.is_locked());
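+  // re-arm the header watch after a short (0.1s) delay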
+  m_timer->add_event_after(0.1, new C_WatchReset(this));
+}
+
+void JournalMetadata::handle_watch_reset() {
+  assert(m_timer_lock.is_locked());
+  if (!m_initialized) {
+    return;
+  }
+
+  int r = m_ioctx.watch2(m_oid, &m_watch_handle, &m_watch_ctx);
+  if (r < 0) {
+    lderr(m_cct) << __func__ << ": failed to watch journal: "
+                 << cpp_strerror(r) << dendl;
+    schedule_watch_reset();
+  } else {
+    ldout(m_cct, 10) << __func__ << ": reset journal watch" << dendl;
+    refresh(NULL);
+  }
+}
+
+void JournalMetadata::handle_watch_notify(uint64_t notify_id, uint64_t cookie) {
+  ldout(m_cct, 10) << "journal header updated" << dendl;
+
+  bufferlist bl;
+  m_ioctx.notify_ack(m_oid, notify_id, cookie, bl);
+
+  refresh(NULL);
+}
+
+void JournalMetadata::handle_watch_error(int err) {
+  lderr(m_cct) << "journal watch error: " << cpp_strerror(err) << dendl;
+  Mutex::Locker locker(m_lock);
+  Mutex::Locker timer_locker(m_timer_lock);
+  if (m_initialized && err != -ENOENT) {
+    schedule_watch_reset();
+  }
+}
+
+uint64_t JournalMetadata::allocate_commit_tid(uint64_t object_num,
+                                              const std::string &tag,
+                                              uint64_t tid) {
+  Mutex::Locker locker(m_lock);
+  uint64_t commit_tid = ++m_commit_tid;
+  m_pending_commit_tids[commit_tid] = CommitEntry(object_num, tag, tid);
+
+  ldout(m_cct, 20) << "allocated commit tid: commit_tid=" << commit_tid << " ["
+                   << "object_num=" << object_num << ", "
+                   << "tag=" << tag << ", tid=" << tid << "]" << dendl;
+  return commit_tid;
+}
+
+bool JournalMetadata::committed(uint64_t commit_tid,
+                                ObjectSetPosition *object_set_position) {
+  ldout(m_cct, 20) << "committed tid=" << commit_tid << dendl;
+
+  Mutex::Locker locker(m_lock);
+  {
+    CommitTids::iterator it = m_pending_commit_tids.find(commit_tid);
+    assert(it != m_pending_commit_tids.end());
+
+    CommitEntry &commit_entry = it->second;
+    commit_entry.committed = true;
+  }
+
+  if (!m_commit_position.entry_positions.empty()) {
+    *object_set_position = m_commit_position;
+  } else {
+    *object_set_position = m_client.commit_position;
+  }
+
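+  // walk the contiguous run of committed entries at the head of the
+  // pending list; the position can only advance through committed tids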
+  bool update_commit_position = false;
+  while (!m_pending_commit_tids.empty()) {
+    CommitTids::iterator it = m_pending_commit_tids.begin();
+    CommitEntry &commit_entry = it->second;
+    if (!commit_entry.committed) {
+      break;
+    }
+
+    object_set_position->object_number = commit_entry.object_num;
+    if (!object_set_position->entry_positions.empty() &&
+        object_set_position->entry_positions.front().tag == commit_entry.tag) {
+      object_set_position->entry_positions.front() = EntryPosition(
+        commit_entry.tag, commit_entry.tid);
+    } else {
+      object_set_position->entry_positions.push_front(EntryPosition(
+        commit_entry.tag, commit_entry.tid));
+    }
+    m_pending_commit_tids.erase(it);
+    update_commit_position = true;
+  }
+
+  if (update_commit_position) {
+    // prune the position to have unique tags in commit-order
+    std::set<std::string> in_use_tags;
+    EntryPositions::iterator it = object_set_position->entry_positions.begin();
+    while (it != object_set_position->entry_positions.end()) {
+      if (!in_use_tags.insert(it->tag).second) {
+        it = object_set_position->entry_positions.erase(it);
+      } else {
+        ++it;
+      }
+    }
+
+    ldout(m_cct, 20) << "updated object set position: " << *object_set_position
+                     << dendl;
+  }
+  return update_commit_position;
+}
+
+void JournalMetadata::notify_update() {
+  ldout(m_cct, 10) << "notifying journal header update" << dendl;
+
+  bufferlist bl;
+  m_ioctx.notify2(m_oid, bl, 5000, NULL);
+}
+
+void JournalMetadata::async_notify_update() {
+  ldout(m_cct, 10) << "async notifying journal header update" << dendl;
+
+  C_AioNotify *ctx = new C_AioNotify(this);
+  librados::AioCompletion *comp =
+    librados::Rados::aio_create_completion(ctx, NULL,
+                                           utils::rados_ctx_callback);
+
+  bufferlist bl;
+  int r = m_ioctx.aio_notify(m_oid, comp, bl, 5000, NULL);
+  assert(r == 0);
+
+  comp->release();
+}
+
+void JournalMetadata::handle_notified(int r) {
+  ldout(m_cct, 10) << "notified journal header update: r=" << r << dendl;
+}
+
+std::ostream &operator<<(std::ostream &os,
+			 const JournalMetadata::RegisteredClients &clients) {
+  os << "[";
+  for (JournalMetadata::RegisteredClients::const_iterator c = clients.begin();
+       c != clients.end(); ++c) {
+    os << (c == clients.begin() ? "" : ", ") << *c;
+  }
+  os << "]";
+  return os;
+}
+
+std::ostream &operator<<(std::ostream &os,
+			 const JournalMetadata &jm) {
+  Mutex::Locker locker(jm.m_lock);
+  os << "[oid=" << jm.m_oid << ", "
+     << "initialized=" << jm.m_initialized << ", "
+     << "order=" << (int)jm.m_order << ", "
+     << "splay_width=" << (int)jm.m_splay_width << ", "
+     << "pool_id=" << jm.m_pool_id << ", "
+     << "minimum_set=" << jm.m_minimum_set << ", "
+     << "active_set=" << jm.m_active_set << ", "
+     << "client_id=" << jm.m_client_id << ", "
+     << "commit_tid=" << jm.m_commit_tid << ", "
+     << "commit_interval=" << jm.m_commit_interval << ", "
+     << "commit_position=" << jm.m_commit_position << ", "
+     << "registered_clients=" << jm.m_registered_clients << "]";
+  return os;
+}
+
+} // namespace journal
diff --git a/src/journal/JournalMetadata.h b/src/journal/JournalMetadata.h
new file mode 100644
index 0000000..d15bbca
--- /dev/null
+++ b/src/journal/JournalMetadata.h
@@ -0,0 +1,324 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_JOURNAL_JOURNAL_METADATA_H
+#define CEPH_JOURNAL_JOURNAL_METADATA_H
+
+#include "include/int_types.h"
+#include "include/Context.h"
+#include "include/rados/librados.hpp"
+#include "common/Cond.h"
+#include "common/Mutex.h"
+#include "common/RefCountedObj.h"
+#include "cls/journal/cls_journal_types.h"
+#include "journal/AsyncOpTracker.h"
+#include <boost/intrusive_ptr.hpp>
+#include <boost/noncopyable.hpp>
+#include <list>
+#include <map>
+#include <string>
+#include "include/assert.h"
+
+class Finisher;
+class SafeTimer;
+
+namespace journal {
+
+class JournalMetadata;
+typedef boost::intrusive_ptr<JournalMetadata> JournalMetadataPtr;
+
+class JournalMetadata : public RefCountedObject, boost::noncopyable {
+public:
+  typedef cls::journal::EntryPosition EntryPosition;
+  typedef cls::journal::EntryPositions EntryPositions;
+  typedef cls::journal::ObjectSetPosition ObjectSetPosition;
+  typedef cls::journal::Client Client;
+
+  typedef std::set<Client> RegisteredClients;
+
+  struct Listener {
+    virtual ~Listener() {}
+    virtual void handle_update(JournalMetadata *) = 0;
+  };
+
+  JournalMetadata(librados::IoCtx &ioctx, const std::string &oid,
+                  const std::string &client_id, double commit_interval);
+  ~JournalMetadata();
+
+  void init(Context *on_init);
+  void shutdown();
+
+  void add_listener(Listener *listener);
+  void remove_listener(Listener *listener);
+
+  int register_client(const std::string &description);
+  int unregister_client();
+
+  inline const std::string &get_client_id() const {
+    return m_client_id;
+  }
+  inline uint8_t get_order() const {
+    return m_order;
+  }
+  inline uint8_t get_splay_width() const {
+    return m_splay_width;
+  }
+  inline int64_t get_pool_id() const {
+    return m_pool_id;
+  }
+
+  inline Finisher &get_finisher() {
+    return *m_finisher;
+  }
+
+  inline SafeTimer &get_timer() {
+    return *m_timer;
+  }
+  inline Mutex &get_timer_lock() {
+    return m_timer_lock;
+  }
+
+  void set_minimum_set(uint64_t object_set);
+  inline uint64_t get_minimum_set() const {
+    Mutex::Locker locker(m_lock);
+    return m_minimum_set;
+  }
+
+  void set_active_set(uint64_t object_set);
+  inline uint64_t get_active_set() const {
+    Mutex::Locker locker(m_lock);
+    return m_active_set;
+  }
+
+  void flush_commit_position();
+  void set_commit_position(const ObjectSetPosition &commit_position,
+                           Context *on_safe);
+  void get_commit_position(ObjectSetPosition *commit_position) const {
+    Mutex::Locker locker(m_lock);
+    *commit_position = m_client.commit_position;
+  }
+
+  void get_registered_clients(RegisteredClients *registered_clients) {
+    Mutex::Locker locker(m_lock);
+    *registered_clients = m_registered_clients;
+  }
+
+  inline uint64_t allocate_tid(const std::string &tag) {
+    Mutex::Locker locker(m_lock);
+    return m_allocated_tids[tag]++;
+  }
+  void reserve_tid(const std::string &tag, uint64_t tid);
+  bool get_last_allocated_tid(const std::string &tag, uint64_t *tid) const;
+
+  uint64_t allocate_commit_tid(uint64_t object_num, const std::string &tag,
+                               uint64_t tid);
+  bool committed(uint64_t commit_tid, ObjectSetPosition *object_set_position);
+
+  void notify_update();
+  void async_notify_update();
+
+private:
+  typedef std::map<std::string, uint64_t> AllocatedTids;
+  typedef std::list<Listener*> Listeners;
+
+  struct CommitEntry {
+    uint64_t object_num;
+    std::string tag;
+    uint64_t tid;
+    bool committed;
+
+    CommitEntry() : object_num(0), tid(0), committed(false) {
+    }
+    CommitEntry(uint64_t _object_num, const std::string &_tag, uint64_t _tid)
+      : object_num(_object_num), tag(_tag), tid(_tid), committed(false) {
+    }
+  };
+  typedef std::map<uint64_t, CommitEntry> CommitTids;
+
+  struct C_WatchCtx : public librados::WatchCtx2 {
+    JournalMetadata *journal_metadata;
+
+    C_WatchCtx(JournalMetadata *_journal_metadata)
+      : journal_metadata(_journal_metadata) {}
+
+    virtual void handle_notify(uint64_t notify_id, uint64_t cookie,
+                               uint64_t notifier_id, bufferlist& bl) {
+      journal_metadata->handle_watch_notify(notify_id, cookie);
+    }
+    virtual void handle_error(uint64_t cookie, int err) {
+      journal_metadata->handle_watch_error(err);
+    }
+  };
+
+  struct C_WatchReset : public Context {
+    JournalMetadata *journal_metadata;
+
+    C_WatchReset(JournalMetadata *_journal_metadata)
+      : journal_metadata(_journal_metadata) {
+      journal_metadata->m_async_op_tracker.start_op();
+    }
+    virtual ~C_WatchReset() {
+      journal_metadata->m_async_op_tracker.finish_op();
+    }
+    virtual void finish(int r) {
+      journal_metadata->handle_watch_reset();
+    }
+  };
+
+  struct C_CommitPositionTask : public Context {
+    JournalMetadata *journal_metadata;
+
+    C_CommitPositionTask(JournalMetadata *_journal_metadata)
+      : journal_metadata(_journal_metadata) {
+      journal_metadata->m_async_op_tracker.start_op();
+    }
+    virtual ~C_CommitPositionTask() {
+      journal_metadata->m_async_op_tracker.finish_op();
+    }
+    virtual void finish(int r) {
+      journal_metadata->handle_commit_position_task();
+    }
+  };
+
+  struct C_AioNotify : public Context {
+    JournalMetadata* journal_metadata;
+
+    C_AioNotify(JournalMetadata *_journal_metadata)
+      : journal_metadata(_journal_metadata) {
+      journal_metadata->m_async_op_tracker.start_op();
+    }
+    virtual ~C_AioNotify() {
+      journal_metadata->m_async_op_tracker.finish_op();
+    }
+    virtual void finish(int r) {
+      journal_metadata->handle_notified(r);
+    }
+  };
+
+  struct C_NotifyUpdate : public Context {
+    JournalMetadata* journal_metadata;
+    Context *on_safe;
+
+    C_NotifyUpdate(JournalMetadata *_journal_metadata, Context *_on_safe = NULL)
+      : journal_metadata(_journal_metadata), on_safe(_on_safe) {
+      journal_metadata->m_async_op_tracker.start_op();
+    }
+    virtual ~C_NotifyUpdate() {
+      journal_metadata->m_async_op_tracker.finish_op();
+    }
+    virtual void finish(int r) {
+      if (r == 0) {
+        journal_metadata->async_notify_update();
+      }
+      if (on_safe != NULL) {
+        on_safe->complete(r);
+      }
+    }
+  };
+
+  struct C_ImmutableMetadata : public Context {
+    JournalMetadata* journal_metadata;
+    Context *on_finish;
+
+    C_ImmutableMetadata(JournalMetadata *_journal_metadata, Context *_on_finish)
+      : journal_metadata(_journal_metadata), on_finish(_on_finish) {
+      Mutex::Locker locker(journal_metadata->m_lock);
+      journal_metadata->m_async_op_tracker.start_op();
+    }
+    virtual ~C_ImmutableMetadata() {
+      journal_metadata->m_async_op_tracker.finish_op();
+    }
+    virtual void finish(int r) {
+      journal_metadata->handle_immutable_metadata(r, on_finish);
+    }
+  };
+
+  struct C_Refresh : public Context {
+    JournalMetadata* journal_metadata;
+    uint64_t minimum_set;
+    uint64_t active_set;
+    RegisteredClients registered_clients;
+    Context *on_finish;
+
+    C_Refresh(JournalMetadata *_journal_metadata, Context *_on_finish)
+      : journal_metadata(_journal_metadata), minimum_set(0), active_set(0),
+        on_finish(_on_finish) {
+      Mutex::Locker locker(journal_metadata->m_lock);
+      journal_metadata->m_async_op_tracker.start_op();
+    }
+    virtual ~C_Refresh() {
+      journal_metadata->m_async_op_tracker.finish_op();
+    }
+    virtual void finish(int r) {
+      journal_metadata->handle_refresh_complete(this, r);
+    }
+  };
+
+  librados::IoCtx m_ioctx;
+  CephContext *m_cct;
+  std::string m_oid;
+  std::string m_client_id;
+  double m_commit_interval;
+
+  uint8_t m_order;
+  uint8_t m_splay_width;
+  int64_t m_pool_id;
+  bool m_initialized;
+
+  Finisher *m_finisher;
+  SafeTimer *m_timer;
+  Mutex m_timer_lock;
+
+  mutable Mutex m_lock;
+
+  uint64_t m_commit_tid;
+  CommitTids m_pending_commit_tids;
+
+  Listeners m_listeners;
+
+  C_WatchCtx m_watch_ctx;
+  uint64_t m_watch_handle;
+
+  uint64_t m_minimum_set;
+  uint64_t m_active_set;
+  RegisteredClients m_registered_clients;
+  Client m_client;
+
+  AllocatedTids m_allocated_tids;
+
+  size_t m_update_notifications;
+  Cond m_update_cond;
+
+  ObjectSetPosition m_commit_position;
+  Context *m_commit_position_ctx;
+  Context *m_commit_position_task_ctx;
+
+  AsyncOpTracker m_async_op_tracker;
+
+  void handle_immutable_metadata(int r, Context *on_init);
+
+  void refresh(Context *on_finish);
+  void handle_refresh_complete(C_Refresh *refresh, int r);
+
+  void schedule_commit_task();
+  void handle_commit_position_task();
+
+  void schedule_watch_reset();
+  void handle_watch_reset();
+  void handle_watch_notify(uint64_t notify_id, uint64_t cookie);
+  void handle_watch_error(int err);
+  void handle_notified(int r);
+
+  friend std::ostream &operator<<(std::ostream &os,
+				  const JournalMetadata &journal_metadata);
+};
+
+std::ostream &operator<<(std::ostream &os,
+			 const JournalMetadata::RegisteredClients &clients);
+
+std::ostream &operator<<(std::ostream &os,
+			 const JournalMetadata &journal_metadata);
+
+} // namespace journal
+
+#endif // CEPH_JOURNAL_JOURNAL_METADATA_H
diff --git a/src/journal/JournalPlayer.cc b/src/journal/JournalPlayer.cc
new file mode 100644
index 0000000..2f97158
--- /dev/null
+++ b/src/journal/JournalPlayer.cc
@@ -0,0 +1,407 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "journal/JournalPlayer.h"
+#include "common/Finisher.h"
+#include "journal/Entry.h"
+#include "journal/ReplayHandler.h"
+#include "journal/Utils.h"
+
+#define dout_subsys ceph_subsys_journaler
+#undef dout_prefix
+#define dout_prefix *_dout << "JournalPlayer: "
+
+namespace journal {
+
+namespace {
+
+struct C_HandleComplete : public Context {
+  ReplayHandler *replay_handler;
+
+  C_HandleComplete(ReplayHandler *_replay_handler)
+    : replay_handler(_replay_handler) {
+    replay_handler->get();
+  }
+  virtual ~C_HandleComplete() {
+    replay_handler->put();
+  }
+  virtual void finish(int r) {
+    replay_handler->handle_complete(r);
+  }
+};
+
+struct C_HandleEntriesAvailable : public Context {
+  ReplayHandler *replay_handler;
+
+  C_HandleEntriesAvailable(ReplayHandler *_replay_handler)
+      : replay_handler(_replay_handler) {
+    replay_handler->get();
+  }
+  virtual ~C_HandleEntriesAvailable() {
+    replay_handler->put();
+  }
+  virtual void finish(int r) {
+    replay_handler->handle_entries_available();
+  }
+};
+
+} // anonymous namespace
+
+JournalPlayer::JournalPlayer(librados::IoCtx &ioctx,
+                             const std::string &object_oid_prefix,
+                             const JournalMetadataPtr& journal_metadata,
+                             ReplayHandler *replay_handler)
+  : m_cct(NULL), m_object_oid_prefix(object_oid_prefix),
+    m_journal_metadata(journal_metadata), m_replay_handler(replay_handler),
+    m_process_state(this), m_lock("JournalPlayer::m_lock"), m_state(STATE_INIT),
+    m_splay_offset(0), m_watch_enabled(false), m_watch_scheduled(false),
+    m_watch_interval(0), m_commit_object(0) {
+  m_replay_handler->get();
+  m_ioctx.dup(ioctx);
+  m_cct = reinterpret_cast<CephContext *>(m_ioctx.cct());
+
+  ObjectSetPosition commit_position;
+  m_journal_metadata->get_commit_position(&commit_position);
+  if (!commit_position.entry_positions.empty()) {
+    uint8_t splay_width = m_journal_metadata->get_splay_width();
+    m_splay_offset = commit_position.object_number % splay_width;
+    m_commit_object = commit_position.object_number;
+    m_commit_tag = commit_position.entry_positions.front().tag;
+
+    for (EntryPositions::const_iterator it =
+           commit_position.entry_positions.begin();
+         it != commit_position.entry_positions.end(); ++it) {
+      const EntryPosition &entry_position = *it;
+      m_commit_tids[entry_position.tag] = entry_position.tid;
+    }
+  }
+}
+
+JournalPlayer::~JournalPlayer() {
+  m_async_op_tracker.wait_for_ops();
+  m_replay_handler->put();
+}
+
+void JournalPlayer::prefetch() {
+  m_lock.Lock();
+  assert(m_state == STATE_INIT);
+  m_state = STATE_PREFETCH;
+
+  uint8_t splay_width = m_journal_metadata->get_splay_width();
+  ldout(m_cct, 10) << __func__ << ": prefetching " << (2 * splay_width) << " "
+                   << "objects" << dendl;
+
+  // prefetch starting from the last known commit set
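+  // fetch two objects per splay offset: the set containing the commit
+  // position plus the following set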
+  C_PrefetchBatch *ctx = new C_PrefetchBatch(this);
+  uint64_t start_object = (m_commit_object / splay_width) * splay_width;
+  for (uint64_t object_number = start_object;
+       object_number < start_object + (2 * splay_width); ++object_number) {
+    ctx->add_fetch();
+    fetch(object_number, ctx);
+  }
+  m_lock.Unlock();
+
+  ctx->complete(0);
+}
+
+void JournalPlayer::prefetch_and_watch(double interval) {
+  {
+    Mutex::Locker locker(m_lock);
+    m_watch_enabled = true;
+    m_watch_interval = interval;
+  }
+  prefetch();
+}
+
+void JournalPlayer::unwatch() {
+  Mutex::Locker locker(m_lock);
+  m_watch_enabled = false;
+  if (m_watch_scheduled) {
+    ObjectPlayerPtr object_player = get_object_player();
+    assert(object_player);
+
+    object_player->unwatch();
+    m_watch_scheduled = false;
+  }
+}
+
+bool JournalPlayer::try_pop_front(Entry *entry, uint64_t *commit_tid) {
+  Mutex::Locker locker(m_lock);
+  if (m_state != STATE_PLAYBACK) {
+    return false;
+  }
+
+  ObjectPlayerPtr object_player = get_object_player();
+  assert(object_player);
+
+  if (object_player->empty()) {
+    if (m_watch_enabled && !m_watch_scheduled) {
+      object_player->watch(&m_process_state, m_watch_interval);
+      m_watch_scheduled = true;
+    } else if (!m_watch_enabled && !object_player->is_fetch_in_progress()) {
+      m_journal_metadata->get_finisher().queue(new C_HandleComplete(
+        m_replay_handler), 0);
+    }
+    return false;
+  }
+
+  object_player->front(entry);
+  object_player->pop_front();
+
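+  // entries within a tag must have sequential tids; a gap implies a lost
+  // journal entry and playback must stop with an error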
+  uint64_t last_tid;
+  if (m_journal_metadata->get_last_allocated_tid(entry->get_tag(), &last_tid) &&
+      entry->get_tid() != last_tid + 1) {
+    lderr(m_cct) << "missing prior journal entry: " << *entry << dendl;
+
+    m_state = STATE_ERROR;
+    m_journal_metadata->get_finisher().queue(new C_HandleComplete(
+      m_replay_handler), -EINVAL);
+    return false;
+  }
+
+  // skip to next splay offset if we cannot apply the next entry in-sequence
+  if (!object_player->empty()) {
+    Entry peek_entry;
+    object_player->front(&peek_entry);
+    if (peek_entry.get_tag() == entry->get_tag() ||
+        (m_journal_metadata->get_last_allocated_tid(peek_entry.get_tag(),
+                                                    &last_tid) &&
+         last_tid + 1 != peek_entry.get_tid())) {
+      advance_splay_object();
+    }
+  } else {
+    advance_splay_object();
+
+    ObjectPlayerPtr next_set_object_player = get_next_set_object_player();
+    if (!next_set_object_player->empty()) {
+      remove_object_player(object_player, &m_process_state);
+    }
+  }
+
+  m_journal_metadata->reserve_tid(entry->get_tag(), entry->get_tid());
+  *commit_tid = m_journal_metadata->allocate_commit_tid(
+    object_player->get_object_number(), entry->get_tag(), entry->get_tid());
+  return true;
+}
+
+void JournalPlayer::process_state(int r) {
+  ldout(m_cct, 10) << __func__ << ": r=" << r << dendl;
+  if (r >= 0) {
+    Mutex::Locker locker(m_lock);
+    switch (m_state) {
+    case STATE_PREFETCH:
+      ldout(m_cct, 10) << "PREFETCH" << dendl;
+      r = process_prefetch();
+      break;
+    case STATE_PLAYBACK:
+      ldout(m_cct, 10) << "PLAYBACK" << dendl;
+      r = process_playback();
+      break;
+    case STATE_ERROR:
+      ldout(m_cct, 10) << "ERROR" << dendl;
+      break;
+    default:
+      lderr(m_cct) << "UNEXPECTED STATE (" << m_state << ")" << dendl;
+      assert(false);
+      break;
+    }
+  }
+
+  if (r < 0) {
+    {
+      Mutex::Locker locker(m_lock);
+      m_state = STATE_ERROR;
+    }
+    m_replay_handler->handle_complete(r);
+  }
+}
+
+int JournalPlayer::process_prefetch() {
+  ldout(m_cct, 10) << __func__ << dendl;
+  assert(m_lock.is_locked());
+
+  uint8_t splay_width = m_journal_metadata->get_splay_width();
+  for (uint8_t splay_offset = 0; splay_offset < splay_width; ++splay_offset) {
+    assert(m_object_players.count(splay_offset) == 1);
+
+    ObjectPlayers &object_players = m_object_players[splay_offset];
+    assert(object_players.size() == 2);
+
+    ObjectPlayerPtr object_player = object_players.begin()->second;
+    assert(!object_player->is_fetch_in_progress());
+
+    ldout(m_cct, 15) << "seeking known commit position in "
+                     << object_player->get_oid() << dendl;
+    Entry entry;
+    while (!m_commit_tids.empty() && !object_player->empty()) {
+      object_player->front(&entry);
+      if (entry.get_tid() > m_commit_tids[entry.get_tag()]) {
+        ldout(m_cct, 10) << "located next uncommitted entry: " << entry
+                         << dendl;
+        break;
+      }
+
+      ldout(m_cct, 20) << "skipping committed entry: " << entry << dendl;
+      m_journal_metadata->reserve_tid(entry.get_tag(), entry.get_tid());
+      object_player->pop_front();
+    }
+
+    // if this object contains the commit position, our read should start with
+    // the next consistent journal entry in the sequence
+    if (!m_commit_tids.empty() &&
+        object_player->get_object_number() == m_commit_object) {
+      if (object_player->empty()) {
+        advance_splay_object();
+      } else {
+        Entry entry;
+        object_player->front(&entry);
+        if (entry.get_tag() == m_commit_tag) {
+          advance_splay_object();
+        }
+      }
+    }
+
+    ObjectPlayerPtr next_set_object_player = get_next_set_object_player();
+    if (object_player->empty() && !next_set_object_player->empty()) {
+      ldout(m_cct, 15) << object_player->get_oid() << " empty" << dendl;
+      remove_object_player(object_player, &m_process_state);
+    }
+  }
+
+  m_state = STATE_PLAYBACK;
+  ObjectPlayerPtr object_player = get_object_player();
+  if (!object_player->empty()) {
+    ldout(m_cct, 10) << __func__ << ": entries available" << dendl;
+    m_journal_metadata->get_finisher().queue(new C_HandleEntriesAvailable(
+      m_replay_handler), 0);
+  } else if (m_watch_enabled) {
+    object_player->watch(&m_process_state, m_watch_interval);
+    m_watch_scheduled = true;
+  } else {
+    ldout(m_cct, 10) << __func__ << ": no uncommitted entries available"
+                     << dendl;
+    m_journal_metadata->get_finisher().queue(new C_HandleComplete(
+      m_replay_handler), 0);
+  }
+  return 0;
+}
+
+int JournalPlayer::process_playback() {
+  ldout(m_cct, 10) << __func__ << dendl;
+  assert(m_lock.is_locked());
+
+  m_watch_scheduled = false;
+
+  ObjectPlayerPtr object_player = get_object_player();
+  if (!object_player->empty()) {
+    ldout(m_cct, 10) << __func__ << ": entries available" << dendl;
+    m_journal_metadata->get_finisher().queue(new C_HandleEntriesAvailable(
+      m_replay_handler), 0);
+  }
+  return 0;
+}
+
+const JournalPlayer::ObjectPlayers &JournalPlayer::get_object_players() const {
+  assert(m_lock.is_locked());
+
+  assert(m_object_players.count(m_splay_offset) == 1);
+  SplayedObjectPlayers::const_iterator it = m_object_players.find(
+    m_splay_offset);
+  assert(it != m_object_players.end());
+
+  const ObjectPlayers &object_players = it->second;
+  assert(object_players.size() == 2);
+  return object_players;
+}
+
+ObjectPlayerPtr JournalPlayer::get_object_player() const {
+  assert(m_lock.is_locked());
+
+  const ObjectPlayers &object_players = get_object_players();
+  return object_players.begin()->second;
+}
+
+ObjectPlayerPtr JournalPlayer::get_next_set_object_player() const {
+  assert(m_lock.is_locked());
+
+  const ObjectPlayers &object_players = get_object_players();
+  return object_players.rbegin()->second;
+}
+
+void JournalPlayer::advance_splay_object() {
+  assert(m_lock.is_locked());
+  ++m_splay_offset;
+  m_splay_offset %= m_journal_metadata->get_splay_width();
+  ldout(m_cct, 20) << __func__ << ": new offset "
+                   << static_cast<uint32_t>(m_splay_offset) << dendl;
+}
+
+void JournalPlayer::remove_object_player(const ObjectPlayerPtr &object_player,
+                                         Context *on_fetch) {
+  assert(m_lock.is_locked());
+
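+  // erase the consumed player, then prefetch the same splay offset two
+  // sets ahead to keep the two-object window full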
+  uint8_t splay_width = m_journal_metadata->get_splay_width();
+  ObjectPlayers &object_players = m_object_players[
+    object_player->get_object_number() % splay_width];
+  assert(!object_players.empty());
+  assert(object_players.begin()->second == object_player);
+  object_players.erase(object_players.begin());
+
+  fetch(object_player->get_object_number() + (2 * splay_width), on_fetch);
+}
+
+void JournalPlayer::fetch(uint64_t object_num, Context *ctx) {
+  assert(m_lock.is_locked());
+
+  std::string oid = utils::get_object_name(m_object_oid_prefix, object_num);
+
+  ldout(m_cct, 10) << __func__ << ": " << oid << dendl;
+  C_Fetch *fetch_ctx = new C_Fetch(this, object_num, ctx);
+  ObjectPlayerPtr object_player(new ObjectPlayer(
+    m_ioctx, m_object_oid_prefix, object_num, m_journal_metadata->get_timer(),
+    m_journal_metadata->get_timer_lock(), m_journal_metadata->get_order()));
+
+  uint8_t splay_width = m_journal_metadata->get_splay_width();
+  m_object_players[object_num % splay_width][object_num] = object_player;
+  object_player->fetch(fetch_ctx);
+}
+
+int JournalPlayer::handle_fetched(int r, uint64_t object_num) {
+  std::string oid = utils::get_object_name(m_object_oid_prefix, object_num);
+
+  ldout(m_cct, 10) << __func__ << ": fetched " << oid << ": r=" << r << dendl;
+  if (r < 0 && r != -ENOENT) {
+    return r;
+  }
+  return 0;
+}
+
+JournalPlayer::C_PrefetchBatch::C_PrefetchBatch(JournalPlayer *p)
+  : player(p), lock("JournalPlayer::C_PrefetchBatch::lock"), refs(1),
+    return_value(0) {
+  player->m_async_op_tracker.start_op();
+}
+
+void JournalPlayer::C_PrefetchBatch::add_fetch() {
+  Mutex::Locker locker(lock);
+  ++refs;
+}
+
+void JournalPlayer::C_PrefetchBatch::complete(int r) {
+  bool last_ref;
+  {
+    Mutex::Locker locker(lock);
+    if (r < 0 && return_value == 0) {
+      return_value = r;
+    }
+    // snapshot the decremented count while still holding the lock;
+    // reading refs after unlocking races with concurrent completions
+    last_ref = (--refs == 0);
+  }
+
+  if (last_ref) {
+    player->process_state(return_value);
+    delete this;
+  }
+}
+
+} // namespace journal
diff --git a/src/journal/JournalPlayer.h b/src/journal/JournalPlayer.h
new file mode 100644
index 0000000..7d48559
--- /dev/null
+++ b/src/journal/JournalPlayer.h
@@ -0,0 +1,136 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_JOURNAL_JOURNAL_PLAYER_H
+#define CEPH_JOURNAL_JOURNAL_PLAYER_H
+
+#include "include/int_types.h"
+#include "include/Context.h"
+#include "include/rados/librados.hpp"
+#include "common/Mutex.h"
+#include "journal/AsyncOpTracker.h"
+#include "journal/JournalMetadata.h"
+#include "journal/ObjectPlayer.h"
+#include "cls/journal/cls_journal_types.h"
+#include <map>
+
+class SafeTimer;
+
+namespace journal {
+
+class Entry;
+class ReplayHandler;
+
+class JournalPlayer {
+public:
+  typedef cls::journal::EntryPosition EntryPosition;
+  typedef cls::journal::EntryPositions EntryPositions;
+  typedef cls::journal::ObjectSetPosition ObjectSetPosition;
+
+  JournalPlayer(librados::IoCtx &ioctx, const std::string &object_oid_prefix,
+                const JournalMetadataPtr& journal_metadata,
+                ReplayHandler *replay_handler);
+  ~JournalPlayer();
+
+  void prefetch();
+  void prefetch_and_watch(double interval);
+  void unwatch();
+
+  bool try_pop_front(Entry *entry, uint64_t *commit_tid);
+
+private:
+  typedef std::map<std::string, uint64_t> AllocatedTids;
+  typedef std::map<uint64_t, ObjectPlayerPtr> ObjectPlayers;
+  typedef std::map<uint8_t, ObjectPlayers> SplayedObjectPlayers;
+
+  enum State {
+    STATE_INIT,
+    STATE_PREFETCH,
+    STATE_PLAYBACK,
+    STATE_ERROR
+  };
+
+  struct C_ProcessState : public Context {
+    JournalPlayer *player;
+    C_ProcessState(JournalPlayer *p) : player(p) {}
+    virtual void complete(int r) {
+      player->process_state(r);
+    }
+    virtual void finish(int r) {}
+  };
+
+  struct C_PrefetchBatch : public Context {
+    JournalPlayer *player;
+    Mutex lock;
+    uint32_t refs;
+    int return_value;
+
+    C_PrefetchBatch(JournalPlayer *p);
+    virtual ~C_PrefetchBatch() {
+      player->m_async_op_tracker.finish_op();
+    }
+    void add_fetch();
+    virtual void complete(int r);
+    virtual void finish(int r) {}
+  };
+
+  struct C_Fetch : public Context {
+    JournalPlayer *player;
+    uint64_t object_num;
+    Context *on_fetch;
+    C_Fetch(JournalPlayer *p, uint64_t o, Context *c)
+      : player(p), object_num(o), on_fetch(c) {
+      player->m_async_op_tracker.start_op();
+    }
+    virtual ~C_Fetch() {
+      player->m_async_op_tracker.finish_op();
+    }
+    virtual void finish(int r) {
+      r = player->handle_fetched(r, object_num);
+      on_fetch->complete(r);
+    }
+  };
+
+  librados::IoCtx m_ioctx;
+  CephContext *m_cct;
+  std::string m_object_oid_prefix;
+  JournalMetadataPtr m_journal_metadata;
+
+  ReplayHandler *m_replay_handler;
+
+  C_ProcessState m_process_state;
+
+  AsyncOpTracker m_async_op_tracker;
+
+  mutable Mutex m_lock;
+  State m_state;
+  uint8_t m_splay_offset;
+
+  bool m_watch_enabled;
+  bool m_watch_scheduled;
+  double m_watch_interval;
+
+  SplayedObjectPlayers m_object_players;
+  uint64_t m_commit_object;
+  std::string m_commit_tag;
+  AllocatedTids m_commit_tids;
+
+  void advance_splay_object();
+
+  const ObjectPlayers &get_object_players() const;
+  ObjectPlayerPtr get_object_player() const;
+  ObjectPlayerPtr get_next_set_object_player() const;
+  void remove_object_player(const ObjectPlayerPtr &object_player,
+                            Context *on_fetch);
+
+  void process_state(int r);
+  int process_prefetch();
+  int process_playback();
+
+  void fetch(uint64_t object_num, Context *ctx);
+  int handle_fetched(int r, uint64_t object_num);
+};
+
+} // namespace journal
+
+#endif // CEPH_JOURNAL_JOURNAL_PLAYER_H
diff --git a/src/journal/JournalRecorder.cc b/src/journal/JournalRecorder.cc
new file mode 100644
index 0000000..4fb7765
--- /dev/null
+++ b/src/journal/JournalRecorder.cc
@@ -0,0 +1,182 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "journal/JournalRecorder.h"
+#include "journal/Entry.h"
+#include "journal/Utils.h"
+
+#define dout_subsys ceph_subsys_journaler
+#undef dout_prefix
+#define dout_prefix *_dout << "JournalRecorder: "
+
+namespace journal {
+
+JournalRecorder::JournalRecorder(librados::IoCtx &ioctx,
+                                 const std::string &object_oid_prefix,
+                                 const JournalMetadataPtr& journal_metadata,
+                                 uint32_t flush_interval, uint64_t flush_bytes,
+                                 double flush_age)
+  : m_cct(NULL), m_object_oid_prefix(object_oid_prefix),
+    m_journal_metadata(journal_metadata), m_flush_interval(flush_interval),
+    m_flush_bytes(flush_bytes), m_flush_age(flush_age), m_listener(this),
+    m_overflow_handler(this), m_lock("JournalRecorder::m_lock"),
+    m_current_set(m_journal_metadata->get_active_set()) {
+
+  Mutex::Locker locker(m_lock);
+  m_ioctx.dup(ioctx);
+  m_cct = reinterpret_cast<CephContext*>(m_ioctx.cct());
+
+  uint8_t splay_width = m_journal_metadata->get_splay_width();
+  for (uint8_t splay_offset = 0; splay_offset < splay_width; ++splay_offset) {
+    uint64_t object_number = splay_offset + (m_current_set * splay_width);
+    m_object_ptrs[splay_offset] = create_object_recorder(object_number);
+  }
+
+  m_journal_metadata->add_listener(&m_listener);
+}
+
+JournalRecorder::~JournalRecorder() {
+  m_journal_metadata->remove_listener(&m_listener);
+}
+
+Future JournalRecorder::append(const std::string &tag,
+                               const bufferlist &payload_bl) {
+  Mutex::Locker locker(m_lock);
+
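+  // stripe appends across the splay objects: each tag's tids map
+  // round-robin onto splay offsets (splay_offset = tid % splay_width)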
+  uint64_t tid = m_journal_metadata->allocate_tid(tag);
+  uint8_t splay_width = m_journal_metadata->get_splay_width();
+  uint8_t splay_offset = tid % splay_width;
+
+  ObjectRecorderPtr object_ptr = get_object(splay_offset);
+  uint64_t commit_tid = m_journal_metadata->allocate_commit_tid(
+    object_ptr->get_object_number(), tag, tid);
+  FutureImplPtr future(new FutureImpl(m_journal_metadata->get_finisher(),
+                                      tag, tid, commit_tid));
+  future->init(m_prev_future);
+  m_prev_future = future;
+
+  bufferlist entry_bl;
+  ::encode(Entry(future->get_tag(), future->get_tid(), payload_bl), entry_bl);
+
+  AppendBuffers append_buffers;
+  append_buffers.push_back(std::make_pair(future, entry_bl));
+  bool object_full = object_ptr->append(append_buffers);
+
+  if (object_full) {
+    ldout(m_cct, 10) << "object " << object_ptr->get_oid() << " now full"
+                     << dendl;
+    close_object_set(object_ptr->get_object_number() / splay_width);
+  }
+
+  return Future(future);
+}
+
+void JournalRecorder::flush(Context *on_safe) {
+  C_Flush *ctx;
+  {
+    Mutex::Locker locker(m_lock);
+
+    ctx = new C_Flush(on_safe, m_object_ptrs.size());
+    for (ObjectRecorderPtrs::iterator it = m_object_ptrs.begin();
+         it != m_object_ptrs.end(); ++it) {
+      it->second->flush(ctx);
+    }
+  }
+
+  ctx->unblock();
+}
+
+ObjectRecorderPtr JournalRecorder::get_object(uint8_t splay_offset) {
+  assert(m_lock.is_locked());
+
+  ObjectRecorderPtr object_recorder = m_object_ptrs[splay_offset];
+  assert(object_recorder != NULL);
+  return object_recorder;
+}
+
+void JournalRecorder::close_object_set(uint64_t object_set) {
+  assert(m_lock.is_locked());
+
+  uint8_t splay_width = m_journal_metadata->get_splay_width();
+  if (object_set != m_current_set) {
+    return;
+  }
+
+  uint64_t active_set = m_journal_metadata->get_active_set();
+  if (active_set < m_current_set + 1) {
+    m_journal_metadata->set_active_set(m_current_set + 1);
+  }
+  m_current_set = m_journal_metadata->get_active_set();
+
+  ldout(m_cct, 10) << __func__ << ": advancing to object set "
+                   << m_current_set << dendl;
+
+  // object recorders will invoke the overflow handler as they finish
+  // closing their objects, ensuring the correct order of future appends
+  for (ObjectRecorderPtrs::iterator it = m_object_ptrs.begin();
+       it != m_object_ptrs.end(); ++it) {
+    ObjectRecorderPtr object_recorder = it->second;
+    if (object_recorder != NULL &&
+        object_recorder->get_object_number() / splay_width == m_current_set) {
+      if (object_recorder->close_object()) {
+        // no in-flight ops, immediately create new recorder
+        create_next_object_recorder(object_recorder);
+      }
+    }
+  }
+}
+
+ObjectRecorderPtr JournalRecorder::create_object_recorder(
+    uint64_t object_number) {
+  ObjectRecorderPtr object_recorder(new ObjectRecorder(
+    m_ioctx, utils::get_object_name(m_object_oid_prefix, object_number),
+    object_number, m_journal_metadata->get_timer(),
+    m_journal_metadata->get_timer_lock(), &m_overflow_handler,
+    m_journal_metadata->get_order(), m_flush_interval, m_flush_bytes,
+    m_flush_age));
+  return object_recorder;
+}
+
+void JournalRecorder::create_next_object_recorder(
+    ObjectRecorderPtr object_recorder) {
+  assert(m_lock.is_locked());
+
+  uint64_t object_number = object_recorder->get_object_number();
+  uint8_t splay_width = m_journal_metadata->get_splay_width();
+  uint8_t splay_offset = object_number % splay_width;
+
+  ObjectRecorderPtr new_object_recorder = create_object_recorder(
+     (m_current_set * splay_width) + splay_offset);
+
+  AppendBuffers append_buffers;
+  object_recorder->claim_append_buffers(&append_buffers);
+  new_object_recorder->append(append_buffers);
+
+  m_object_ptrs[splay_offset] = new_object_recorder;
+}
+
+void JournalRecorder::handle_update() {
+  Mutex::Locker locker(m_lock);
+
+  uint64_t active_set = m_journal_metadata->get_active_set();
+  if (active_set > m_current_set) {
+    close_object_set(m_current_set);
+  }
+}
+
+void JournalRecorder::handle_overflow(ObjectRecorder *object_recorder) {
+  ldout(m_cct, 10) << __func__ << ": " << object_recorder->get_oid() << dendl;
+
+  Mutex::Locker locker(m_lock);
+
+  uint64_t object_number = object_recorder->get_object_number();
+  uint8_t splay_width = m_journal_metadata->get_splay_width();
+  uint8_t splay_offset = object_number % splay_width;
+  ObjectRecorderPtr active_object_recorder = m_object_ptrs[splay_offset];
+  assert(active_object_recorder->get_object_number() == object_number);
+
+  close_object_set(object_number / splay_width);
+  create_next_object_recorder(active_object_recorder);
+}
+
+} // namespace journal
diff --git a/src/journal/JournalRecorder.h b/src/journal/JournalRecorder.h
new file mode 100644
index 0000000..4c3489f
--- /dev/null
+++ b/src/journal/JournalRecorder.h
@@ -0,0 +1,117 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_JOURNAL_JOURNAL_RECORDER_H
+#define CEPH_JOURNAL_JOURNAL_RECORDER_H
+
+#include "include/int_types.h"
+#include "include/atomic.h"
+#include "include/Context.h"
+#include "include/rados/librados.hpp"
+#include "common/Mutex.h"
+#include "journal/Future.h"
+#include "journal/FutureImpl.h"
+#include "journal/JournalMetadata.h"
+#include "journal/ObjectRecorder.h"
+#include <map>
+#include <string>
+
+class SafeTimer;
+
+namespace journal {
+
+class JournalRecorder {
+public:
+  JournalRecorder(librados::IoCtx &ioctx, const std::string &object_oid_prefix,
+                  const JournalMetadataPtr &journal_metadata,
+                  uint32_t flush_interval, uint64_t flush_bytes,
+                  double flush_age);
+  ~JournalRecorder();
+
+  Future append(const std::string &tag, const bufferlist &bl);
+  void flush(Context *on_safe);
+
+  ObjectRecorderPtr get_object(uint8_t splay_offset);
+
+private:
+  typedef std::map<uint8_t, ObjectRecorderPtr> ObjectRecorderPtrs;
+
+  struct Listener : public JournalMetadata::Listener {
+    JournalRecorder *journal_recorder;
+
+    Listener(JournalRecorder *_journal_recorder)
+      : journal_recorder(_journal_recorder) {}
+
+    virtual void handle_update(JournalMetadata *) {
+      journal_recorder->handle_update();
+    }
+  };
+
+  struct OverflowHandler : public ObjectRecorder::OverflowHandler {
+    JournalRecorder *journal_recorder;
+
+    OverflowHandler(JournalRecorder *_journal_recorder)
+      : journal_recorder(_journal_recorder) {}
+
+    virtual void overflow(ObjectRecorder *object_recorder) {
+      journal_recorder->handle_overflow(object_recorder);
+    }
+  };
+
+  struct C_Flush : public Context {
+    Context *on_finish;
+    atomic_t pending_flushes;
+    int ret_val;
+
+    C_Flush(Context *_on_finish, size_t _pending_flushes)
+      : on_finish(_on_finish), pending_flushes(_pending_flushes + 1),
+        ret_val(0) {
+    }
+
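+    // pending_flushes is primed one high; unblock() drops the extra
+    // reference so on_finish cannot fire before all flushes are queued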
+    void unblock() {
+      complete(0);
+    }
+    virtual void complete(int r) {
+      if (r < 0 && ret_val == 0) {
+        ret_val = r;
+      }
+      if (pending_flushes.dec() == 0) {
+        on_finish->complete(ret_val);
+        delete this;
+      }
+    }
+    virtual void finish(int r) {
+    }
+  };
+
+  librados::IoCtx m_ioctx;
+  CephContext *m_cct;
+  std::string m_object_oid_prefix;
+
+  JournalMetadataPtr m_journal_metadata;
+
+  uint32_t m_flush_interval;
+  uint64_t m_flush_bytes;
+  double m_flush_age;
+
+  Listener m_listener;
+  OverflowHandler m_overflow_handler;
+
+  Mutex m_lock;
+
+  uint64_t m_current_set;
+  ObjectRecorderPtrs m_object_ptrs;
+
+  FutureImplPtr m_prev_future;
+
+  void close_object_set(uint64_t object_set);
+  ObjectRecorderPtr create_object_recorder(uint64_t object_number);
+  void create_next_object_recorder(ObjectRecorderPtr object_recorder);
+
+  void handle_update();
+  void handle_overflow(ObjectRecorder *object_recorder);
+};
+
+} // namespace journal
+
+#endif // CEPH_JOURNAL_JOURNAL_RECORDER_H
diff --git a/src/journal/JournalTrimmer.cc b/src/journal/JournalTrimmer.cc
new file mode 100644
index 0000000..33e3be2
--- /dev/null
+++ b/src/journal/JournalTrimmer.cc
@@ -0,0 +1,204 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "journal/JournalTrimmer.h"
+#include "journal/Utils.h"
+#include "common/Cond.h"
+#include "common/errno.h"
+#include "common/Finisher.h"
+#include <limits>
+
+#define dout_subsys ceph_subsys_journaler
+#undef dout_prefix
+#define dout_prefix *_dout << "JournalTrimmer: "
+
+namespace journal {
+
+JournalTrimmer::JournalTrimmer(librados::IoCtx &ioctx,
+                               const std::string &object_oid_prefix,
+                               const JournalMetadataPtr &journal_metadata)
+    : m_cct(NULL), m_object_oid_prefix(object_oid_prefix),
+      m_journal_metadata(journal_metadata), m_lock("JournalTrimmer::m_lock"),
+      m_remove_set_pending(false), m_remove_set(0), m_remove_set_ctx(NULL) {
+  m_ioctx.dup(ioctx);
+  m_cct = reinterpret_cast<CephContext *>(m_ioctx.cct());
+}
+
+JournalTrimmer::~JournalTrimmer() {
+  m_journal_metadata->flush_commit_position();
+  m_async_op_tracker.wait_for_ops();
+}
+
+int JournalTrimmer::remove_objects() {
+  ldout(m_cct, 20) << __func__ << dendl;
+  m_async_op_tracker.wait_for_ops();
+
+  C_SaferCond ctx;
+  {
+    Mutex::Locker locker(m_lock);
+    JournalMetadata::RegisteredClients registered_clients;
+    m_journal_metadata->get_registered_clients(&registered_clients);
+
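+    // only proceed when this is the sole registered client and no
+    // object-set removal is already in flight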
+    if (registered_clients.size() == 0) {
+      return -EINVAL;
+    } else if (registered_clients.size() > 1 || m_remove_set_pending) {
+      return -EBUSY;
+    }
+
+    m_remove_set = std::numeric_limits<uint64_t>::max();
+    m_remove_set_pending = true;
+    m_remove_set_ctx = &ctx;
+
+    remove_set(m_journal_metadata->get_minimum_set());
+  }
+  return ctx.wait();
+}
+
+void JournalTrimmer::committed(uint64_t commit_tid) {
+  ldout(m_cct, 20) << __func__ << ": commit_tid=" << commit_tid << dendl;
+
+  ObjectSetPosition object_set_position;
+  if (!m_journal_metadata->committed(commit_tid, &object_set_position)) {
+    return;
+  }
+
+  {
+    Mutex::Locker locker(m_lock);
+    m_async_op_tracker.start_op();
+  }
+
+  Context *ctx = new C_CommitPositionSafe(this, object_set_position);
+  m_journal_metadata->set_commit_position(object_set_position, ctx);
+}
+
+void JournalTrimmer::trim_objects(uint64_t minimum_set) {
+  assert(m_lock.is_locked());
+
+  ldout(m_cct, 20) << __func__ << ": min_set=" << minimum_set << dendl;
+  if (minimum_set <= m_journal_metadata->get_minimum_set()) {
+    return;
+  }
+
+  if (m_remove_set_pending) {
+    m_remove_set = MAX(m_remove_set, minimum_set);
+    return;
+  }
+
+  m_remove_set = minimum_set;
+  m_remove_set_pending = true;
+  remove_set(m_journal_metadata->get_minimum_set());
+}
+
+void JournalTrimmer::remove_set(uint64_t object_set) {
+  assert(m_lock.is_locked());
+
+  m_async_op_tracker.start_op();
+  uint8_t splay_width = m_journal_metadata->get_splay_width();
+  C_RemoveSet *ctx = new C_RemoveSet(this, object_set, splay_width);
+
+  ldout(m_cct, 20) << __func__ << ": removing object set " << object_set
+                   << dendl;
+  for (uint64_t object_number = object_set * splay_width;
+       object_number < (object_set + 1) * splay_width;
+       ++object_number) {
+    std::string oid = utils::get_object_name(m_object_oid_prefix,
+                                             object_number);
+
+    ldout(m_cct, 20) << "removing journal object " << oid << dendl;
+    librados::AioCompletion *comp =
+      librados::Rados::aio_create_completion(ctx, NULL,
+                                             utils::rados_ctx_callback);
+    int r = m_ioctx.aio_remove(oid, comp);
+    assert(r == 0);
+    comp->release();
+  }
+}
+
+void JournalTrimmer::handle_commit_position_safe(
+    int r, const ObjectSetPosition &object_set_position) {
+  ldout(m_cct, 20) << __func__ << ": r=" << r << ", pos="
+                   << object_set_position << dendl;
+
+  Mutex::Locker locker(m_lock);
+  if (r == 0) {
+    uint8_t splay_width = m_journal_metadata->get_splay_width();
+    uint64_t object_set = object_set_position.object_number / splay_width;
+
+    JournalMetadata::RegisteredClients registered_clients;
+    m_journal_metadata->get_registered_clients(&registered_clients);
+
+    bool trim_permitted = true;
+    for (JournalMetadata::RegisteredClients::iterator it =
+           registered_clients.begin();
+         it != registered_clients.end(); ++it) {
+      const JournalMetadata::Client &client = *it;
+      uint64_t client_object_set = client.commit_position.object_number /
+                                   splay_width;
+      if (client.id != m_journal_metadata->get_client_id() &&
+          client_object_set < object_set) {
+        ldout(m_cct, 20) << "object set " << client_object_set << " still "
+                         << "in-use by client " << client.id << dendl;
+        trim_permitted = false;
+        break;
+      }
+    }
+
+    if (trim_permitted) {
+      trim_objects(object_set_position.object_number / splay_width);
+    }
+  }
+}
+
+void JournalTrimmer::handle_set_removed(int r, uint64_t object_set) {
+  ldout(m_cct, 20) << __func__ << ": r=" << r << ", set=" << object_set << ", "
+                   << "trim=" << m_remove_set << dendl;
+
+  Mutex::Locker locker(m_lock);
+  m_remove_set_pending = false;
+
+  if (r == 0 || (r == -ENOENT && m_remove_set_ctx == NULL)) {
+    // advance the minimum set to the next set
+    m_journal_metadata->set_minimum_set(object_set + 1);
+    uint64_t minimum_set = m_journal_metadata->get_minimum_set();
+
+    if (m_remove_set > minimum_set) {
+      m_remove_set_pending = true;
+      remove_set(minimum_set);
+    }
+  } else if (r == -ENOENT) {
+    // no objects within the set existed
+    r = 0;
+  }
+
+  if (m_remove_set_ctx != NULL && !m_remove_set_pending) {
+    ldout(m_cct, 20) << "completing remove set context" << dendl;
+    m_remove_set_ctx->complete(r);
+  }
+}
+
+JournalTrimmer::C_RemoveSet::C_RemoveSet(JournalTrimmer *_journal_trimmer,
+                                         uint64_t _object_set,
+                                         uint8_t _splay_width)
+  : journal_trimmer(_journal_trimmer), object_set(_object_set),
+    lock(utils::unique_lock_name("C_RemoveSet::lock", this)),
+    refs(_splay_width), return_value(-ENOENT) {
+}
+
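+// aggregate per-object removal results: -ENOENT is reported only when
+// every object in the set was already absent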
+void JournalTrimmer::C_RemoveSet::complete(int r) {
+  lock.Lock();
+  if (r < 0 && r != -ENOENT && return_value == -ENOENT) {
+    return_value = r;
+  } else if (r == 0 && return_value == -ENOENT) {
+    return_value = 0;
+  }
+
+  if (--refs == 0) {
+    finish(return_value);
+    lock.Unlock();
+    delete this;
+  } else {
+    lock.Unlock();
+  }
+}
+
+} // namespace journal
diff --git a/src/journal/JournalTrimmer.h b/src/journal/JournalTrimmer.h
new file mode 100644
index 0000000..9f557a7
--- /dev/null
+++ b/src/journal/JournalTrimmer.h
@@ -0,0 +1,83 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_JOURNAL_JOURNAL_TRIMMER_H
+#define CEPH_JOURNAL_JOURNAL_TRIMMER_H
+
+#include "include/int_types.h"
+#include "include/rados/librados.hpp"
+#include "include/Context.h"
+#include "common/Mutex.h"
+#include "journal/AsyncOpTracker.h"
+#include "journal/JournalMetadata.h"
+#include "cls/journal/cls_journal_types.h"
+
+namespace journal {
+
+class JournalTrimmer {
+public:
+  typedef cls::journal::ObjectSetPosition ObjectSetPosition;
+
+  JournalTrimmer(librados::IoCtx &ioctx, const std::string &object_oid_prefix,
+                 const JournalMetadataPtr &journal_metadata);
+  ~JournalTrimmer();
+
+  int remove_objects();
+  void committed(uint64_t commit_tid);
+
+private:
+  struct C_CommitPositionSafe : public Context {
+    JournalTrimmer *journal_trimmer;
+    ObjectSetPosition object_set_position;
+
+    C_CommitPositionSafe(JournalTrimmer *_journal_trimmer,
+                         const ObjectSetPosition &_object_set_position)
+      : journal_trimmer(_journal_trimmer),
+        object_set_position(_object_set_position) {}
+
+    virtual void finish(int r) {
+      journal_trimmer->handle_commit_position_safe(r, object_set_position);
+      journal_trimmer->m_async_op_tracker.finish_op();
+    }
+  };
+  struct C_RemoveSet : public Context {
+    JournalTrimmer *journal_trimmer;
+    uint64_t object_set;
+    Mutex lock;
+    uint32_t refs;
+    int return_value;
+
+    C_RemoveSet(JournalTrimmer *_journal_trimmer, uint64_t _object_set,
+                uint8_t _splay_width);
+    virtual void complete(int r);
+    virtual void finish(int r) {
+      journal_trimmer->handle_set_removed(r, object_set);
+      journal_trimmer->m_async_op_tracker.finish_op();
+    }
+  };
+
+  librados::IoCtx m_ioctx;
+  CephContext *m_cct;
+  std::string m_object_oid_prefix;
+
+  JournalMetadataPtr m_journal_metadata;
+
+  AsyncOpTracker m_async_op_tracker;
+
+  Mutex m_lock;
+
+  bool m_remove_set_pending;
+  uint64_t m_remove_set;
+  Context *m_remove_set_ctx;
+
+  void trim_objects(uint64_t minimum_set);
+  void remove_set(uint64_t object_set);
+
+  void handle_commit_position_safe(int r, const ObjectSetPosition &position);
+
+  void handle_set_removed(int r, uint64_t object_set);
+};
+
+} // namespace journal
+
+#endif // CEPH_JOURNAL_JOURNAL_TRIMMER_H
diff --git a/src/journal/Journaler.cc b/src/journal/Journaler.cc
new file mode 100644
index 0000000..83862fc
--- /dev/null
+++ b/src/journal/Journaler.cc
@@ -0,0 +1,230 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "journal/Journaler.h"
+#include "include/stringify.h"
+#include "common/errno.h"
+#include "journal/Entry.h"
+#include "journal/FutureImpl.h"
+#include "journal/JournalMetadata.h"
+#include "journal/JournalPlayer.h"
+#include "journal/JournalRecorder.h"
+#include "journal/JournalTrimmer.h"
+#include "journal/ReplayEntry.h"
+#include "journal/ReplayHandler.h"
+#include "cls/journal/cls_journal_client.h"
+#include "cls/journal/cls_journal_types.h"
+
+#define dout_subsys ceph_subsys_journaler
+#undef dout_prefix
+#define dout_prefix *_dout << "Journaler: "
+
+namespace journal {
+
+namespace {
+
+static const std::string JOURNAL_HEADER_PREFIX = "journal.";
+static const std::string JOURNAL_OBJECT_PREFIX = "journal_data.";
+
+struct C_DeleteRecorder : public Context {
+  JournalRecorder *recorder;
+  Context *on_safe;
+  C_DeleteRecorder(JournalRecorder *_recorder, Context *_on_safe)
+    : recorder(_recorder), on_safe(_on_safe) {
+  }
+  virtual void finish(int r) {
+    delete recorder;
+    on_safe->complete(r);
+  }
+};
+
+} // anonymous namespace
+
+using namespace cls::journal;
+
+Journaler::Journaler(librados::IoCtx &header_ioctx,
+		     const std::string &journal_id,
+		     const std::string &client_id, double commit_interval)
+  : m_client_id(client_id), m_metadata(NULL), m_player(NULL), m_recorder(NULL),
+    m_trimmer(NULL)
+{
+  m_header_ioctx.dup(header_ioctx);
+  m_cct = reinterpret_cast<CephContext *>(m_header_ioctx.cct());
+
+  m_header_oid = JOURNAL_HEADER_PREFIX + journal_id;
+  m_object_oid_prefix = JOURNAL_OBJECT_PREFIX + journal_id + ".";
+
+  m_metadata = new JournalMetadata(m_header_ioctx, m_header_oid, m_client_id,
+                                   commit_interval);
+  m_metadata->get();
+}
+
+Journaler::~Journaler() {
+  if (m_metadata != NULL) {
+    m_metadata->put();
+    m_metadata = NULL;
+  }
+  delete m_trimmer;
+  assert(m_player == NULL);
+  assert(m_recorder == NULL);
+}
+
+void Journaler::init(Context *on_init) {
+  m_metadata->init(new C_InitJournaler(this, on_init));
+}
+
+int Journaler::init_complete() {
+  int64_t pool_id = m_metadata->get_pool_id();
+
+  if (pool_id < 0 || pool_id == m_header_ioctx.get_id()) {
+    ldout(m_cct, 20) << "using image pool for journal data" << dendl;
+    m_data_ioctx.dup(m_header_ioctx);
+  } else {
+    ldout(m_cct, 20) << "using pool id=" << pool_id << " for journal data"
+		     << dendl;
+    librados::Rados rados(m_header_ioctx);
+    int r = rados.ioctx_create2(pool_id, m_data_ioctx);
+    if (r < 0) {
+      if (r == -ENOENT) {
+	ldout(m_cct, 1) << "pool id=" << pool_id << " no longer exists"
+			<< dendl;
+      }
+      return r;
+    }
+  }
+  m_trimmer = new JournalTrimmer(m_data_ioctx, m_object_oid_prefix,
+                                 m_metadata);
+  return 0;
+}
+
+int Journaler::create(uint8_t order, uint8_t splay_width, int64_t pool_id) {
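+  // editor's note: 'order' selects the size of each journal data object,
+  // 1 << order bytes (cf. m_soft_max_size in ObjectRecorder)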
+  if (order > 64 || order < 12) {
+    lderr(m_cct) << "order must be in the range [12, 64]" << dendl;
+    return -EDOM;
+  }
+  if (splay_width == 0) {
+    return -EINVAL;
+  }
+
+  ldout(m_cct, 5) << "creating new journal: " << m_header_oid << dendl;
+  int r = client::create(m_header_ioctx, m_header_oid, order, splay_width,
+			 pool_id);
+  if (r < 0) {
+    lderr(m_cct) << "failed to create journal: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+  return 0;
+}
+
+int Journaler::remove() {
+  m_metadata->shutdown();
+
+  int r = m_trimmer->remove_objects();
+  if (r < 0) {
+    lderr(m_cct) << "failed to remove journal objects: " << cpp_strerror(r)
+                 << dendl;
+    return r;
+  }
+
+  r = m_header_ioctx.remove(m_header_oid);
+  if (r < 0) {
+    lderr(m_cct) << "failed to remove journal header: " << cpp_strerror(r)
+                 << dendl;
+    return r;
+  }
+  return 0;
+}
+
+int Journaler::register_client(const std::string &description) {
+  return m_metadata->register_client(description);
+}
+
+int Journaler::unregister_client() {
+  return m_metadata->unregister_client();
+}
+
+void Journaler::start_replay(ReplayHandler *replay_handler) {
+  create_player(replay_handler);
+  m_player->prefetch();
+}
+
+void Journaler::start_live_replay(ReplayHandler *replay_handler,
+                                  double interval) {
+  create_player(replay_handler);
+  m_player->prefetch_and_watch(interval);
+}
+
+bool Journaler::try_pop_front(ReplayEntry *replay_entry) {
+  assert(m_player != NULL);
+
+  Entry entry;
+  uint64_t commit_tid;
+  if (!m_player->try_pop_front(&entry, &commit_tid)) {
+    return false;
+  }
+
+  *replay_entry = ReplayEntry(entry.get_data(), commit_tid);
+  return true;
+}
+
+void Journaler::stop_replay() {
+  assert(m_player != NULL);
+  m_player->unwatch();
+  delete m_player;
+  m_player = NULL;
+}
+
+void Journaler::committed(const ReplayEntry &replay_entry) {
+  m_trimmer->committed(replay_entry.get_commit_tid());
+}
+
+void Journaler::committed(const Future &future) {
+  FutureImplPtr future_impl = future.get_future_impl();
+  m_trimmer->committed(future_impl->get_commit_tid());
+}
+
+void Journaler::start_append(int flush_interval, uint64_t flush_bytes,
+			     double flush_age) {
+  assert(m_recorder == NULL);
+
+  // TODO verify active object set >= current replay object set
+
+  m_recorder = new JournalRecorder(m_data_ioctx, m_object_oid_prefix,
+				   m_metadata, flush_interval, flush_bytes,
+				   flush_age);
+}
+
+void Journaler::stop_append(Context *on_safe) {
+  assert(m_recorder != NULL);
+
+  flush(new C_DeleteRecorder(m_recorder, on_safe));
+  m_recorder = NULL;
+}
+
+Future Journaler::append(const std::string &tag, const bufferlist &payload_bl) {
+  return m_recorder->append(tag, payload_bl);
+}
+
+void Journaler::flush(Context *on_safe) {
+  m_recorder->flush(on_safe);
+}
+
+void Journaler::create_player(ReplayHandler *replay_handler) {
+  assert(m_player == NULL);
+  m_player = new JournalPlayer(m_data_ioctx, m_object_oid_prefix, m_metadata,
+                               replay_handler);
+}
+
+std::ostream &operator<<(std::ostream &os,
+			 const Journaler &journaler) {
+  os << "[metadata=";
+  if (journaler.m_metadata != NULL) {
+    os << *journaler.m_metadata;
+  } else {
+    os << "NULL";
+  }
+  os << "]";
+  return os;
+}
+
+} // namespace journal
diff --git a/src/journal/Journaler.h b/src/journal/Journaler.h
new file mode 100644
index 0000000..d358218
--- /dev/null
+++ b/src/journal/Journaler.h
@@ -0,0 +1,94 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_JOURNAL_JOURNALER_H
+#define CEPH_JOURNAL_JOURNALER_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "include/Context.h"
+#include "include/rados/librados.hpp"
+#include "journal/Future.h"
+#include <string>
+#include <map>
+#include "include/assert.h"
+
+class SafeTimer;
+
+namespace journal {
+
+class JournalMetadata;
+class JournalPlayer;
+class JournalRecorder;
+class JournalTrimmer;
+class ReplayEntry;
+class ReplayHandler;
+
+class Journaler {
+public:
+  Journaler(librados::IoCtx &header_ioctx, const std::string &journal_id,
+	    const std::string &client_id, double commit_interval);
+  ~Journaler();
+
+  int create(uint8_t order, uint8_t splay_width, int64_t pool_id);
+  int remove();
+
+  void init(Context *on_init);
+
+  int register_client(const std::string &description);
+  int unregister_client();
+
+  void start_replay(ReplayHandler *replay_handler);
+  void start_live_replay(ReplayHandler *replay_handler, double interval);
+  bool try_pop_front(ReplayEntry *replay_entry);
+  void stop_replay();
+
+  void start_append(int flush_interval, uint64_t flush_bytes, double flush_age);
+  Future append(const std::string &tag, const bufferlist &bl);
+  void flush(Context *on_safe);
+  void stop_append(Context *on_safe);
+
+  void committed(const ReplayEntry &replay_entry);
+  void committed(const Future &future);
+
+private:
+  struct C_InitJournaler : public Context {
+    Journaler *journaler;
+    Context *on_safe;
+    C_InitJournaler(Journaler *_journaler, Context *_on_safe)
+      : journaler(_journaler), on_safe(_on_safe) {
+    }
+    virtual void finish(int r) {
+      if (r == 0) {
+	r = journaler->init_complete();
+      }
+      on_safe->complete(r);
+    }
+  };
+
+  librados::IoCtx m_header_ioctx;
+  librados::IoCtx m_data_ioctx;
+  CephContext *m_cct;
+  std::string m_client_id;
+
+  std::string m_header_oid;
+  std::string m_object_oid_prefix;
+
+  JournalMetadata *m_metadata;
+  JournalPlayer *m_player;
+  JournalRecorder *m_recorder;
+  JournalTrimmer *m_trimmer;
+
+  int init_complete();
+  void create_player(ReplayHandler *replay_handler);
+
+  friend std::ostream &operator<<(std::ostream &os,
+				  const Journaler &journaler);
+};
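+
+// Typical lifecycle, sketched for illustration (editor's example based on the
+// declarations above; flush parameters and handler objects are placeholders):
+//
+//   journal::Journaler journaler(ioctx, "journal-id", "client-id", 5.0);
+//   journaler.create(24, 4, -1);      // 16MiB objects, splay width 4,
+//                                     // journal data in the header pool
+//   journaler.register_client("description");
+//   journaler.init(on_init);          // async; on_init completes when ready
+//
+//   journaler.start_replay(&replay_handler);
+//   // ... pop and commit replay entries ...
+//   journaler.stop_replay();
+//
+//   journaler.start_append(flush_interval, flush_bytes, flush_age);
+//   journal::Future future = journaler.append("tag", payload_bl);
+//   journaler.flush(on_flush_safe);
+//   journaler.committed(future);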
+
+std::ostream &operator<<(std::ostream &os,
+			 const Journaler &journaler);
+
+} // namespace journal
+
+#endif // CEPH_JOURNAL_JOURNALER_H
diff --git a/src/journal/Makefile.am b/src/journal/Makefile.am
new file mode 100644
index 0000000..4f0cd18
--- /dev/null
+++ b/src/journal/Makefile.am
@@ -0,0 +1,37 @@
+if ENABLE_CLIENT
+if WITH_RADOS
+
+libjournal_la_SOURCES = \
+	journal/AsyncOpTracker.cc \
+	journal/Entry.cc \
+	journal/Future.cc \
+	journal/FutureImpl.cc \
+	journal/Journaler.cc \
+	journal/JournalMetadata.cc \
+	journal/JournalPlayer.cc \
+	journal/JournalRecorder.cc \
+	journal/JournalTrimmer.cc \
+	journal/ObjectPlayer.cc \
+	journal/ObjectRecorder.cc \
+	journal/Utils.cc
+
+noinst_LTLIBRARIES += libjournal.la
+noinst_HEADERS += \
+	journal/AsyncOpTracker.h \
+	journal/Entry.h \
+	journal/Future.h \
+	journal/FutureImpl.h \
+	journal/Journaler.h \
+	journal/JournalMetadata.h \
+	journal/JournalPlayer.h \
+	journal/JournalRecorder.h \
+	journal/JournalTrimmer.h \
+	journal/ObjectPlayer.h \
+	journal/ObjectRecorder.h \
+	journal/ReplayEntry.h \
+	journal/ReplayHandler.h \
+	journal/Utils.h
+DENCODER_DEPS += libjournal.la
+
+endif # WITH_RADOS
+endif # ENABLE_CLIENT
diff --git a/src/journal/ObjectPlayer.cc b/src/journal/ObjectPlayer.cc
new file mode 100644
index 0000000..939722e
--- /dev/null
+++ b/src/journal/ObjectPlayer.cc
@@ -0,0 +1,248 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "journal/ObjectPlayer.h"
+#include "journal/Utils.h"
+#include "common/Timer.h"
+#include <limits>
+
+#define dout_subsys ceph_subsys_journaler
+#undef dout_prefix
+#define dout_prefix *_dout << "ObjectPlayer: "
+
+namespace journal {
+
+ObjectPlayer::ObjectPlayer(librados::IoCtx &ioctx,
+                           const std::string &object_oid_prefix,
+                           uint64_t object_num, SafeTimer &timer,
+                           Mutex &timer_lock, uint8_t order)
+  : RefCountedObject(NULL, 0), m_object_num(object_num),
+    m_oid(utils::get_object_name(object_oid_prefix, m_object_num)),
+    m_cct(NULL), m_timer(timer), m_timer_lock(timer_lock), m_order(order),
+    m_watch_interval(0), m_watch_task(NULL),
+    m_lock(utils::unique_lock_name("ObjectPlayer::m_lock", this)),
+    m_fetch_in_progress(false), m_read_off(0), m_watch_ctx(NULL),
+    m_watch_in_progress(false) {
+  m_ioctx.dup(ioctx);
+  m_cct = reinterpret_cast<CephContext*>(m_ioctx.cct());
+}
+
+ObjectPlayer::~ObjectPlayer() {
+  {
+    Mutex::Locker locker(m_lock);
+    assert(!m_fetch_in_progress);
+    assert(m_watch_ctx == NULL);
+  }
+}
+
+void ObjectPlayer::fetch(Context *on_finish) {
+  ldout(m_cct, 10) << __func__ << ": " << m_oid << dendl;
+
+  Mutex::Locker locker(m_lock);
+  m_fetch_in_progress = true;
+
+  C_Fetch *context = new C_Fetch(this, on_finish);
+  librados::ObjectReadOperation op;
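+  // fetch up to twice the nominal object size: guard_append() only checks
+  // the soft limit before an append, so an object can grow past 1 << m_order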
+  op.read(m_read_off, 2ULL << m_order, &context->read_bl, NULL);
+
+  librados::AioCompletion *rados_completion =
+    librados::Rados::aio_create_completion(context, utils::rados_ctx_callback,
+                                           NULL);
+  int r = m_ioctx.aio_operate(m_oid, rados_completion, &op, 0, NULL);
+  assert(r == 0);
+  rados_completion->release();
+}
+
+void ObjectPlayer::watch(Context *on_fetch, double interval) {
+  ldout(m_cct, 20) << __func__ << ": " << m_oid << " watch" << dendl;
+
+  Mutex::Locker timer_locker(m_timer_lock);
+  m_watch_interval = interval;
+
+  Mutex::Locker locker(m_lock);
+  assert(m_watch_ctx == NULL);
+  m_watch_ctx = on_fetch;
+
+  schedule_watch();
+}
+
+void ObjectPlayer::unwatch() {
+  ldout(m_cct, 20) << __func__ << ": " << m_oid << " unwatch" << dendl;
+  Mutex::Locker timer_locker(m_timer_lock);
+  Mutex::Locker locker(m_lock);
+
+  cancel_watch();
+
+  m_watch_ctx = NULL;
+  m_timer_lock.Unlock();
+  while (m_watch_in_progress) {
+    m_watch_in_progress_cond.Wait(m_lock);
+  }
+  m_timer_lock.Lock();
+}
+
+void ObjectPlayer::front(Entry *entry) const {
+  Mutex::Locker locker(m_lock);
+  assert(!m_entries.empty());
+  *entry = m_entries.front();
+}
+
+void ObjectPlayer::pop_front() {
+  Mutex::Locker locker(m_lock);
+  assert(!m_entries.empty());
+  m_entries.pop_front();
+}
+
+int ObjectPlayer::handle_fetch_complete(int r, const bufferlist &bl) {
+  ldout(m_cct, 10) << __func__ << ": " << m_oid << ", r=" << r << ", len="
+                   << bl.length() << dendl;
+
+  m_fetch_in_progress = false;
+  if (r < 0) {
+    return r;
+  }
+  if (bl.length() == 0) {
+    return -ENOENT;
+  }
+
+  Mutex::Locker locker(m_lock);
+  m_read_bl.append(bl);
+
+  bool invalid = false;
+  uint32_t invalid_start_off = 0;
+
+  bufferlist::iterator iter(&m_read_bl, m_read_off);
+  while (!iter.end()) {
+    uint32_t bytes_needed;
+    if (!Entry::is_readable(iter, &bytes_needed)) {
+      if (bytes_needed != 0) {
+        invalid_start_off = iter.get_off();
+        invalid = true;
+        lderr(m_cct) << ": partial record at offset " << iter.get_off()
+                     << dendl;
+        break;
+      }
+
+      if (!invalid) {
+        invalid_start_off = iter.get_off();
+        invalid = true;
+        lderr(m_cct) << ": detected corrupt journal entry at offset "
+                     << invalid_start_off << dendl;
+      }
+      ++iter;
+      continue;
+    }
+
+    if (invalid) {
+      uint32_t invalid_end_off = iter.get_off();
+      lderr(m_cct) << ": corruption range [" << invalid_start_off
+                   << ", " << invalid_end_off << ")" << dendl;
+      m_invalid_ranges.insert(invalid_start_off, invalid_end_off);
+      invalid = false;
+    }
+
+    Entry entry;
+    ::decode(entry, iter);
+    ldout(m_cct, 20) << ": " << entry << " decoded" << dendl;
+
+    EntryKey entry_key(std::make_pair(entry.get_tag(), entry.get_tid()));
+    if (m_entry_keys.find(entry_key) == m_entry_keys.end()) {
+      m_entry_keys[entry_key] = m_entries.insert(m_entries.end(), entry);
+    } else {
+      ldout(m_cct, 10) << ": " << entry << " is duplicate, replacing" << dendl;
+      *m_entry_keys[entry_key] = entry;
+    }
+  }
+
+  m_read_off = m_read_bl.length();
+  if (invalid) {
+    uint32_t invalid_end_off = m_read_bl.length();
+    lderr(m_cct) << ": corruption range [" << invalid_start_off
+                 << ", " << invalid_end_off << ")" << dendl;
+    m_invalid_ranges.insert(invalid_start_off, invalid_end_off);
+  }
+
+  if (!m_invalid_ranges.empty()) {
+    r = -EINVAL;
+  }
+  return r;
+}
+
+void ObjectPlayer::schedule_watch() {
+  assert(m_timer_lock.is_locked());
+  assert(m_lock.is_locked());
+  if (m_watch_ctx == NULL) {
+    return;
+  }
+
+  ldout(m_cct, 20) << __func__ << ": " << m_oid << " scheduling watch" << dendl;
+  assert(m_watch_task == NULL);
+  m_watch_task = new C_WatchTask(this);
+  m_timer.add_event_after(m_watch_interval, m_watch_task);
+}
+
+void ObjectPlayer::cancel_watch() {
+  assert(m_timer_lock.is_locked());
+  ldout(m_cct, 20) << __func__ << ": " << m_oid << " cancelling watch" << dendl;
+  if (m_watch_task != NULL) {
+    m_timer.cancel_event(m_watch_task);
+    m_watch_task = NULL;
+  }
+}
+
+void ObjectPlayer::handle_watch_task() {
+  assert(m_timer_lock.is_locked());
+
+  ldout(m_cct, 10) << __func__ << ": " << m_oid << " polling" << dendl;
+  {
+    Mutex::Locker locker(m_lock);
+    assert(m_watch_ctx != NULL);
+
+    m_watch_in_progress = true;
+    m_watch_task = NULL;
+  }
+  fetch(new C_WatchFetch(this));
+}
+
+void ObjectPlayer::handle_watch_fetched(int r) {
+  ldout(m_cct, 10) << __func__ << ": " << m_oid << " poll complete, r=" << r
+                   << dendl;
+
+  Context *on_finish = NULL;
+  {
+    Mutex::Locker timer_locker(m_timer_lock);
+    Mutex::Locker locker(m_lock);
+    assert(m_watch_in_progress);
+    if (r == -ENOENT) {
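+      // nothing new was appended to the object; re-arm the poll timer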
+      schedule_watch();
+    } else {
+      on_finish = m_watch_ctx;
+      m_watch_ctx = NULL;
+    }
+  }
+
+  if (on_finish != NULL) {
+    on_finish->complete(r);
+  }
+
+  {
+    Mutex::Locker locker(m_lock);
+    m_watch_in_progress = false;
+    m_watch_in_progress_cond.Signal();
+  }
+}
+
+void ObjectPlayer::C_Fetch::finish(int r) {
+  r = object_player->handle_fetch_complete(r, read_bl);
+  on_finish->complete(r);
+}
+
+void ObjectPlayer::C_WatchTask::finish(int r) {
+  object_player->handle_watch_task();
+}
+
+void ObjectPlayer::C_WatchFetch::finish(int r) {
+  object_player->handle_watch_fetched(r);
+}
+
+} // namespace journal
diff --git a/src/journal/ObjectPlayer.h b/src/journal/ObjectPlayer.h
new file mode 100644
index 0000000..5fb9c27
--- /dev/null
+++ b/src/journal/ObjectPlayer.h
@@ -0,0 +1,133 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_JOURNAL_OBJECT_PLAYER_H
+#define CEPH_JOURNAL_OBJECT_PLAYER_H
+
+#include "include/Context.h"
+#include "include/interval_set.h"
+#include "include/rados/librados.hpp"
+#include "common/Cond.h"
+#include "common/Mutex.h"
+#include "common/RefCountedObj.h"
+#include "journal/Entry.h"
+#include <list>
+#include <string>
+#include <boost/intrusive_ptr.hpp>
+#include <boost/noncopyable.hpp>
+#include <boost/unordered_map.hpp>
+#include "include/assert.h"
+
+class SafeTimer;
+
+namespace journal {
+
+class ObjectPlayer;
+typedef boost::intrusive_ptr<ObjectPlayer> ObjectPlayerPtr;
+
+class ObjectPlayer : public RefCountedObject {
+public:
+  typedef std::list<Entry> Entries;
+  typedef interval_set<uint64_t> InvalidRanges;
+
+  ObjectPlayer(librados::IoCtx &ioctx, const std::string &object_oid_prefix,
+               uint64_t object_num, SafeTimer &timer, Mutex &timer_lock,
+               uint8_t order);
+  ~ObjectPlayer();
+
+  inline const std::string &get_oid() const {
+    return m_oid;
+  }
+  inline uint64_t get_object_number() const {
+    return m_object_num;
+  }
+
+  void fetch(Context *on_finish);
+  void watch(Context *on_fetch, double interval);
+  void unwatch();
+
+  inline bool is_fetch_in_progress() const {
+    Mutex::Locker locker(m_lock);
+    return m_fetch_in_progress;
+  }
+
+  void front(Entry *entry) const;
+  void pop_front();
+  inline bool empty() const {
+    Mutex::Locker locker(m_lock);
+    return m_entries.empty();
+  }
+
+  inline void get_entries(Entries *entries) {
+    Mutex::Locker locker(m_lock);
+    *entries = m_entries;
+  }
+  inline void get_invalid_ranges(InvalidRanges *invalid_ranges) {
+    Mutex::Locker locker(m_lock);
+    *invalid_ranges = m_invalid_ranges;
+  }
+
+private:
+  typedef std::pair<std::string, uint64_t> EntryKey;
+  typedef boost::unordered_map<EntryKey, Entries::iterator> EntryKeys;
+
+  struct C_Fetch : public Context {
+    ObjectPlayerPtr object_player;
+    Context *on_finish;
+    bufferlist read_bl;
+    C_Fetch(ObjectPlayer *o, Context *ctx)
+      : object_player(o), on_finish(ctx) {
+    }
+    virtual void finish(int r);
+  };
+  struct C_WatchTask : public Context {
+    ObjectPlayerPtr object_player;
+    C_WatchTask(ObjectPlayer *o) : object_player(o) {
+    }
+    virtual void finish(int r);
+  };
+  struct C_WatchFetch : public Context {
+    ObjectPlayerPtr object_player;
+    C_WatchFetch(ObjectPlayer *o) : object_player(o) {
+    }
+    virtual void finish(int r);
+  };
+
+  librados::IoCtx m_ioctx;
+  uint64_t m_object_num;
+  std::string m_oid;
+  CephContext *m_cct;
+
+  SafeTimer &m_timer;
+  Mutex &m_timer_lock;
+
+  double m_fetch_interval;
+  uint8_t m_order;
+
+  double m_watch_interval;
+  Context *m_watch_task;
+
+  mutable Mutex m_lock;
+  bool m_fetch_in_progress;
+  bufferlist m_read_bl;
+  uint32_t m_read_off;
+
+  Entries m_entries;
+  EntryKeys m_entry_keys;
+  InvalidRanges m_invalid_ranges;
+
+  Context *m_watch_ctx;
+  Cond m_watch_in_progress_cond;
+  bool m_watch_in_progress;
+
+  int handle_fetch_complete(int r, const bufferlist &bl);
+
+  void schedule_watch();
+  void cancel_watch();
+  void handle_watch_task();
+  void handle_watch_fetched(int r);
+};
+
+} // namespace journal
+
+#endif // CEPH_JOURNAL_OBJECT_PLAYER_H
diff --git a/src/journal/ObjectRecorder.cc b/src/journal/ObjectRecorder.cc
new file mode 100644
index 0000000..cf96b94
--- /dev/null
+++ b/src/journal/ObjectRecorder.cc
@@ -0,0 +1,310 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "journal/ObjectRecorder.h"
+#include "journal/Future.h"
+#include "journal/Utils.h"
+#include "include/assert.h"
+#include "common/Timer.h"
+#include "cls/journal/cls_journal_client.h"
+
+#define dout_subsys ceph_subsys_journaler
+#undef dout_prefix
+#define dout_prefix *_dout << "ObjectRecorder: "
+
+using namespace cls::journal;
+
+namespace journal {
+
+ObjectRecorder::ObjectRecorder(librados::IoCtx &ioctx, const std::string &oid,
+                               uint64_t object_number,
+                               SafeTimer &timer, Mutex &timer_lock,
+                               OverflowHandler *overflow_handler, uint8_t order,
+                               uint32_t flush_interval, uint64_t flush_bytes,
+                               double flush_age)
+  : RefCountedObject(NULL, 0), m_oid(oid), m_object_number(object_number),
+    m_cct(NULL), m_timer(timer), m_timer_lock(timer_lock),
+    m_overflow_handler(overflow_handler), m_order(order),
+    m_soft_max_size(1ULL << m_order), m_flush_interval(flush_interval),
+    m_flush_bytes(flush_bytes), m_flush_age(flush_age), m_flush_handler(this),
+    m_append_task(NULL),
+    m_lock(utils::unique_lock_name("ObjectRecorder::m_lock", this)),
+    m_append_tid(0), m_pending_bytes(0), m_size(0), m_overflowed(false),
+    m_object_closed(false) {
+  m_ioctx.dup(ioctx);
+  m_cct = reinterpret_cast<CephContext*>(m_ioctx.cct());
+  assert(m_overflow_handler != NULL);
+}
+
+ObjectRecorder::~ObjectRecorder() {
+  assert(m_append_task == NULL);
+  assert(m_append_buffers.empty());
+  assert(m_in_flight_appends.empty());
+}
+
+bool ObjectRecorder::append(const AppendBuffers &append_buffers) {
+  FutureImplPtr last_flushed_future;
+  {
+    Mutex::Locker locker(m_lock);
+    for (AppendBuffers::const_iterator iter = append_buffers.begin();
+         iter != append_buffers.end(); ++iter) {
+      if (append(*iter)) {
+        last_flushed_future = iter->first;
+      }
+    }
+  }
+
+  if (last_flushed_future) {
+    flush(last_flushed_future);
+  }
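+  // true indicates the object has reached its soft size limit (the caller is
+  // expected to close it and direct subsequent appends at a new object)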
+  return (m_size + m_pending_bytes >= m_soft_max_size);
+}
+
+void ObjectRecorder::flush(Context *on_safe) {
+  ldout(m_cct, 20) << __func__ << ": " << m_oid << dendl;
+
+  Future future;
+  {
+    Mutex::Locker locker(m_lock);
+
+    // attach the flush to the most recent append
+    if (!m_append_buffers.empty()) {
+      future = Future(m_append_buffers.rbegin()->first);
+
+      flush_appends(true);
+    } else if (!m_in_flight_appends.empty()) {
+      AppendBuffers &append_buffers = m_in_flight_appends.rbegin()->second;
+      assert(!append_buffers.empty());
+      future = Future(append_buffers.rbegin()->first);
+    }
+    cancel_append_task();
+  }
+
+  if (future.is_valid()) {
+    future.flush(on_safe);
+  } else {
+    on_safe->complete(0);
+  }
+}
+
+void ObjectRecorder::flush(const FutureImplPtr &future) {
+  ldout(m_cct, 20) << __func__ << ": " << m_oid << " flushing " << *future
+                   << dendl;
+
+  Mutex::Locker locker(m_lock);
+  if (future->get_flush_handler().get() != &m_flush_handler) {
+    // if we don't own this future, re-issue the flush so that it hits the
+    // correct journal object owner
+    future->flush();
+    return;
+  } else if (future->is_flush_in_progress()) {
+    return;
+  }
+
+  assert(!m_object_closed);
+  AppendBuffers::iterator it;
+  for (it = m_append_buffers.begin(); it != m_append_buffers.end(); ++it) {
+    if (it->first == future) {
+      break;
+    }
+  }
+  assert(it != m_append_buffers.end());
+  ++it;
+
+  AppendBuffers flush_buffers;
+  flush_buffers.splice(flush_buffers.end(), m_append_buffers,
+                       m_append_buffers.begin(), it);
+  send_appends(&flush_buffers);
+}
+
+void ObjectRecorder::claim_append_buffers(AppendBuffers *append_buffers) {
+  ldout(m_cct, 20) << __func__ << ": " << m_oid << dendl;
+
+  Mutex::Locker locker(m_lock);
+  assert(m_in_flight_appends.empty());
+  assert(m_object_closed || m_overflowed);
+  append_buffers->splice(append_buffers->end(), m_append_buffers,
+                         m_append_buffers.begin(), m_append_buffers.end());
+}
+
+bool ObjectRecorder::close_object() {
+  ldout(m_cct, 20) << __func__ << ": " << m_oid << dendl;
+
+  Mutex::Locker locker(m_lock);
+  m_object_closed = true;
+  if (flush_appends(true)) {
+    cancel_append_task();
+  }
+  return m_in_flight_appends.empty();
+}
+
+void ObjectRecorder::handle_append_task() {
+  assert(m_timer_lock.is_locked());
+  m_append_task = NULL;
+
+  Mutex::Locker locker(m_lock);
+  flush_appends(true);
+}
+
+void ObjectRecorder::cancel_append_task() {
+  Mutex::Locker locker(m_timer_lock);
+  if (m_append_task != NULL) {
+    m_timer.cancel_event(m_append_task);
+    m_append_task = NULL;
+  }
+}
+
+void ObjectRecorder::schedule_append_task() {
+  Mutex::Locker locker(m_timer_lock);
+  if (m_append_task == NULL && m_flush_age > 0) {
+    m_append_task = new C_AppendTask(this);
+    m_timer.add_event_after(m_flush_age, m_append_task);
+  }
+}
+
+bool ObjectRecorder::append(const AppendBuffer &append_buffer) {
+  assert(m_lock.is_locked());
+
+  bool flush_requested = append_buffer.first->attach(&m_flush_handler);
+  m_append_buffers.push_back(append_buffer);
+  m_pending_bytes += append_buffer.second.length();
+
+  if (flush_appends(false)) {
+    cancel_append_task();
+  } else {
+    schedule_append_task();
+  }
+  return flush_requested;
+}
+
+bool ObjectRecorder::flush_appends(bool force) {
+  assert(m_lock.is_locked());
+  if (m_object_closed || m_overflowed) {
+    return true;
+  }
+
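+  // skip the flush only when not forced, the object is still below its soft
+  // size limit, and both flush thresholds (append count, pending bytes) are
+  // configured but not yet reached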
+  if (m_append_buffers.empty() ||
+      (!force &&
+       m_size + m_pending_bytes < m_soft_max_size &&
+       (m_flush_interval > 0 && m_append_buffers.size() < m_flush_interval) &&
+       (m_flush_bytes > 0 && m_pending_bytes < m_flush_bytes))) {
+    return false;
+  }
+
+  m_pending_bytes = 0;
+  AppendBuffers append_buffers;
+  append_buffers.swap(m_append_buffers);
+  send_appends(&append_buffers);
+  return true;
+}
+
+void ObjectRecorder::handle_append_flushed(uint64_t tid, int r) {
+  ldout(m_cct, 10) << __func__ << ": " << m_oid << " tid=" << tid
+                   << ", r=" << r << dendl;
+
+  Mutex::Locker locker(m_lock);
+  InFlightAppends::iterator iter = m_in_flight_appends.find(tid);
+  if (iter == m_in_flight_appends.end()) {
+    // must have seen an overflow on a previous append op
+    assert(m_overflowed);
+    return;
+  } else if (r == -EOVERFLOW) {
+    m_overflowed = true;
+    append_overflowed(tid);
+    return;
+  }
+
+  assert(!m_overflowed || r != 0);
+  AppendBuffers &append_buffers = iter->second;
+  assert(!append_buffers.empty());
+
+  // Flag the associated futures as complete.
+  for (AppendBuffers::iterator buf_it = append_buffers.begin();
+       buf_it != append_buffers.end(); ++buf_it) {
+    ldout(m_cct, 20) << __func__ << ": " << *buf_it->first << " marked safe"
+                     << dendl;
+    buf_it->first->safe(r);
+  }
+  m_in_flight_appends.erase(iter);
+
+  if (m_in_flight_appends.empty() && m_object_closed) {
+    // all remaining unsent appends should be redirected to new object
+    notify_overflow();
+  }
+}
+
+void ObjectRecorder::append_overflowed(uint64_t tid) {
+  ldout(m_cct, 10) << __func__ << ": " << m_oid << " append overflowed"
+                   << dendl;
+
+  assert(m_lock.is_locked());
+  assert(!m_in_flight_appends.empty());
+  assert(m_in_flight_appends.begin()->first == tid);
+
+  cancel_append_task();
+
+  InFlightAppends in_flight_appends;
+  in_flight_appends.swap(m_in_flight_appends);
+
+  AppendBuffers restart_append_buffers;
+  for (InFlightAppends::iterator it = in_flight_appends.begin();
+       it != in_flight_appends.end(); ++it) {
+    restart_append_buffers.insert(restart_append_buffers.end(),
+                                  it->second.begin(), it->second.end());
+  }
+
+  restart_append_buffers.splice(restart_append_buffers.end(),
+                                m_append_buffers,
+                                m_append_buffers.begin(),
+                                m_append_buffers.end());
+  restart_append_buffers.swap(m_append_buffers);
+  notify_overflow();
+}
+
+void ObjectRecorder::send_appends(AppendBuffers *append_buffers) {
+  assert(m_lock.is_locked());
+  assert(!append_buffers->empty());
+
+  uint64_t append_tid = m_append_tid++;
+  ldout(m_cct, 10) << __func__ << ": " << m_oid << " flushing journal tid="
+                   << append_tid << dendl;
+  C_AppendFlush *append_flush = new C_AppendFlush(this, append_tid);
+
+  librados::ObjectWriteOperation op;
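+  // guard_append() fails the whole write with -EOVERFLOW if the object has
+  // already reached its soft size limit (handled in handle_append_flushed)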
+  client::guard_append(&op, m_soft_max_size);
+
+  for (AppendBuffers::iterator it = append_buffers->begin();
+       it != append_buffers->end(); ++it) {
+    ldout(m_cct, 20) << __func__ << ": flushing " << *it->first
+                     << dendl;
+    it->first->set_flush_in_progress();
+    op.append(it->second);
+    m_size += it->second.length();
+  }
+  m_in_flight_appends[append_tid].swap(*append_buffers);
+
+  librados::AioCompletion *rados_completion =
+    librados::Rados::aio_create_completion(append_flush, NULL,
+                                           utils::rados_ctx_callback);
+  int r = m_ioctx.aio_operate(m_oid, rados_completion, &op);
+  assert(r == 0);
+  rados_completion->release();
+}
+
+void ObjectRecorder::notify_overflow() {
+  assert(m_lock.is_locked());
+
+  for (AppendBuffers::const_iterator it = m_append_buffers.begin();
+       it != m_append_buffers.end(); ++it) {
+    ldout(m_cct, 20) << __func__ << ": overflowed " << *it->first
+                     << dendl;
+    it->first->detach();
+  }
+
+  // TODO need to delay completion until after aio_notify completes
+  m_lock.Unlock();
+  m_overflow_handler->overflow(this);
+  m_lock.Lock();
+}
+
+} // namespace journal
diff --git a/src/journal/ObjectRecorder.h b/src/journal/ObjectRecorder.h
new file mode 100644
index 0000000..566c41f
--- /dev/null
+++ b/src/journal/ObjectRecorder.h
@@ -0,0 +1,149 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_JOURNAL_OBJECT_RECORDER_H
+#define CEPH_JOURNAL_OBJECT_RECORDER_H
+
+#include "include/Context.h"
+#include "include/rados/librados.hpp"
+#include "common/Mutex.h"
+#include "common/RefCountedObj.h"
+#include "journal/FutureImpl.h"
+#include <list>
+#include <boost/intrusive_ptr.hpp>
+#include <boost/noncopyable.hpp>
+#include "include/assert.h"
+
+class SafeTimer;
+
+namespace journal {
+
+class ObjectRecorder;
+typedef boost::intrusive_ptr<ObjectRecorder> ObjectRecorderPtr;
+
+typedef std::pair<FutureImplPtr, bufferlist> AppendBuffer;
+typedef std::list<AppendBuffer> AppendBuffers;
+
+class ObjectRecorder : public RefCountedObject, boost::noncopyable {
+public:
+  struct OverflowHandler {
+    virtual ~OverflowHandler() {}
+    virtual void overflow(ObjectRecorder *object_recorder) = 0;
+  };
+
+  ObjectRecorder(librados::IoCtx &ioctx, const std::string &oid,
+                 uint64_t object_number, SafeTimer &timer, Mutex &timer_lock,
+                 OverflowHandler *overflow_handler, uint8_t order,
+                 uint32_t flush_interval, uint64_t flush_bytes,
+                 double flush_age);
+  ~ObjectRecorder();
+
+  inline uint64_t get_object_number() const {
+    return m_object_number;
+  }
+  inline const std::string &get_oid() const {
+    return m_oid;
+  }
+
+  bool append(const AppendBuffers &append_buffers);
+  void flush(Context *on_safe);
+  void flush(const FutureImplPtr &future);
+
+  void claim_append_buffers(AppendBuffers *append_buffers);
+  bool close_object();
+
+  inline CephContext *cct() const {
+    return m_cct;
+  }
+
+  inline size_t get_pending_appends() const {
+    Mutex::Locker locker(m_lock);
+    return m_append_buffers.size();
+  }
+
+private:
+  typedef std::map<uint64_t, AppendBuffers> InFlightAppends;
+
+  struct FlushHandler : public FutureImpl::FlushHandler {
+    ObjectRecorder *object_recorder;
+    FlushHandler(ObjectRecorder *o) : object_recorder(o) {}
+    virtual void get() {
+      object_recorder->get();
+    }
+    virtual void put() {
+      object_recorder->put();
+    }
+    virtual void flush(const FutureImplPtr &future) {
+      object_recorder->flush(future);
+    }
+  };
+  struct C_AppendTask : public Context {
+    ObjectRecorder *object_recorder;
+    C_AppendTask(ObjectRecorder *o) : object_recorder(o) {
+    }
+    virtual void finish(int r) {
+      object_recorder->handle_append_task();
+    }
+  };
+  struct C_AppendFlush : public Context {
+    ObjectRecorder *object_recorder;
+    uint64_t tid;
+    C_AppendFlush(ObjectRecorder *o, uint64_t _tid)
+        : object_recorder(o), tid(_tid) {
+      object_recorder->get();
+    }
+    virtual void finish(int r) {
+      object_recorder->handle_append_flushed(tid, r);
+      object_recorder->put();
+    }
+  };
+
+  librados::IoCtx m_ioctx;
+  std::string m_oid;
+  uint64_t m_object_number;
+  CephContext *m_cct;
+
+  SafeTimer &m_timer;
+  Mutex &m_timer_lock;
+
+  OverflowHandler *m_overflow_handler;
+
+  uint8_t m_order;
+  uint64_t m_soft_max_size;
+
+  uint32_t m_flush_interval;
+  uint64_t m_flush_bytes;
+  double m_flush_age;
+
+  FlushHandler m_flush_handler;
+
+  C_AppendTask *m_append_task;
+
+  mutable Mutex m_lock;
+  AppendBuffers m_append_buffers;
+  uint64_t m_append_tid;
+  uint32_t m_pending_bytes;
+
+  InFlightAppends m_in_flight_appends;
+  uint64_t m_size;
+  bool m_overflowed;
+  bool m_object_closed;
+
+  bufferlist m_prefetch_bl;
+
+  void handle_append_task();
+  void cancel_append_task();
+  void schedule_append_task();
+
+  bool append(const AppendBuffer &append_buffer);
+  bool flush_appends(bool force);
+  void handle_append_flushed(uint64_t tid, int r);
+  void append_overflowed(uint64_t tid);
+  void send_appends(AppendBuffers *append_buffers);
+
+  void notify_overflow();
+};
+
+} // namespace journal
+
+#endif // CEPH_JOURNAL_OBJECT_RECORDER_H
diff --git a/src/journal/ReplayEntry.h b/src/journal/ReplayEntry.h
new file mode 100644
index 0000000..4dd3ba4
--- /dev/null
+++ b/src/journal/ReplayEntry.h
@@ -0,0 +1,34 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_JOURNAL_REPLAY_ENTRY_H
+#define CEPH_JOURNAL_REPLAY_ENTRY_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+
+namespace journal {
+
+class ReplayEntry {
+public:
+  ReplayEntry() : m_commit_tid(0) {
+  }
+  ReplayEntry(const bufferlist &data, uint64_t commit_tid)
+    : m_data(data), m_commit_tid(commit_tid) {
+  }
+
+  inline const bufferlist &get_data() const {
+    return m_data;
+  }
+  inline uint64_t get_commit_tid() const {
+    return m_commit_tid;
+  }
+
+private:
+  bufferlist m_data;
+  uint64_t m_commit_tid;
+};
+
+} // namespace journal
+
+#endif // CEPH_JOURNAL_REPLAY_ENTRY_H
diff --git a/src/journal/ReplayHandler.h b/src/journal/ReplayHandler.h
new file mode 100644
index 0000000..e61240d
--- /dev/null
+++ b/src/journal/ReplayHandler.h
@@ -0,0 +1,21 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_JOURNAL_REPLAY_HANDLER_H
+#define CEPH_JOURNAL_REPLAY_HANDLER_H
+
+namespace journal {
+
+struct ReplayHandler {
+  virtual ~ReplayHandler() {}
+
+  virtual void get() = 0;
+  virtual void put() = 0;
+
+  virtual void handle_entries_available() = 0;
+  virtual void handle_complete(int r) = 0;
+};
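+
+// Minimal implementation sketch (editor's illustration; MyReplayHandler is a
+// hypothetical name): get()/put() exist so the journal can pin the handler
+// while callbacks are in flight.
+//
+//   struct MyReplayHandler : public journal::ReplayHandler {
+//     virtual void get() {}   // no-op if the handler outlives the journal
+//     virtual void put() {}
+//     virtual void handle_entries_available() { /* pop replay entries */ }
+//     virtual void handle_complete(int r) { /* replay finished */ }
+//   };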
+
+} // namespace journal
+
+#endif // CEPH_JOURNAL_REPLAY_HANDLER_H
diff --git a/src/journal/Utils.cc b/src/journal/Utils.cc
new file mode 100644
index 0000000..2a8d945
--- /dev/null
+++ b/src/journal/Utils.cc
@@ -0,0 +1,25 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "journal/Utils.h"
+#include "include/Context.h"
+#include "include/stringify.h"
+
+namespace journal {
+namespace utils {
+
+std::string get_object_name(const std::string &prefix, uint64_t number) {
+  return prefix + stringify(number);
+}
+
+std::string unique_lock_name(const std::string &name, void *address) {
+  return name + " (" + stringify(address) + ")";
+}
+
+void rados_ctx_callback(rados_completion_t c, void *arg) {
+  Context *comp = reinterpret_cast<Context *>(arg);
+  comp->complete(rados_aio_get_return_value(c));
+}
+
+} // namespace utils
+} // namespace journal
diff --git a/src/journal/Utils.h b/src/journal/Utils.h
new file mode 100644
index 0000000..1169ac9
--- /dev/null
+++ b/src/journal/Utils.h
@@ -0,0 +1,23 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_JOURNAL_UTILS_H
+#define CEPH_JOURNAL_UTILS_H
+
+#include "include/int_types.h"
+#include "include/rados/librados.hpp"
+#include <string>
+
+namespace journal {
+namespace utils {
+
+std::string get_object_name(const std::string &prefix, uint64_t number);
+
+std::string unique_lock_name(const std::string &name, void *address);
+
+void rados_ctx_callback(rados_completion_t c, void *arg);
+
+} // namespace utils
+} // namespace journal
+
+#endif // CEPH_JOURNAL_UTILS_H
diff --git a/src/krbd.cc b/src/krbd.cc
index 9901edb..53f64bb 100644
--- a/src/krbd.cc
+++ b/src/krbd.cc
@@ -107,9 +107,6 @@ static int have_minor_attr(void)
   return access("/sys/module/rbd/parameters/single_major", F_OK) == 0;
 }
 
-/*
- * options can be NULL
- */
 static int build_map_buf(CephContext *cct, const char *pool, const char *image,
                          const char *snap, const char *options, string *pbuf)
 {
@@ -164,7 +161,7 @@ static int build_map_buf(CephContext *cct, const char *pool, const char *image,
     oss << ",key=" << key_name;
   }
 
-  if (options && strcmp(options, "") != 0)
+  if (strcmp(options, "") != 0)
     oss << "," << options;
 
   oss << " " << pool << " " << image << " " << snap;
@@ -283,16 +280,13 @@ out_mon:
   return r;
 }
 
-/*
- * snap and options can be NULL
- */
 static int map_image(struct krbd_ctx *ctx, const char *pool, const char *image,
                      const char *snap, const char *options, string *pname)
 {
   string buf;
   int r;
 
-  if (!snap)
+  if (strcmp(snap, "") == 0)
     snap = "-";
 
   r = build_map_buf(ctx->cct, pool, image, snap, options, &buf);
@@ -521,8 +515,7 @@ static int do_unmap(struct udev *udev, dev_t devno, const string& id)
          * libudev does not provide the "wait until the queue is empty"
          * API or the sufficient amount of primitives to build it from.
          */
-        string err = run_cmd("udevadm", "settle", "--timeout", "10", "--quiet",
-                             NULL);
+        string err = run_cmd("udevadm", "settle", "--timeout", "10", NULL);
         if (!err.empty())
           cerr << "rbd: " << err << std::endl;
       }
diff --git a/src/os/KeyValueDB.cc b/src/kv/KeyValueDB.cc
similarity index 100%
rename from src/os/KeyValueDB.cc
rename to src/kv/KeyValueDB.cc
diff --git a/src/kv/KeyValueDB.h b/src/kv/KeyValueDB.h
new file mode 100644
index 0000000..347169a
--- /dev/null
+++ b/src/kv/KeyValueDB.h
@@ -0,0 +1,277 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef KEY_VALUE_DB_H
+#define KEY_VALUE_DB_H
+
+#include "include/buffer.h"
+#include <ostream>
+#include <set>
+#include <map>
+#include <string>
+#include "include/memory.h"
+#include <boost/scoped_ptr.hpp>
+#include "include/encoding.h"
+
+using std::string;
+/**
+ * Defines the virtual interface to be implemented by a key/value store
+ * backend; Kyoto Cabinet or LevelDB, for example, could implement it.
+ */
+class KeyValueDB {
+public:
+  class TransactionImpl {
+  public:
+    /// Set Keys
+    void set(
+      const std::string &prefix,                 ///< [in] Prefix for keys
+      const std::map<std::string, bufferlist> &to_set ///< [in] keys/values to set
+    ) {
+      std::map<std::string, bufferlist>::const_iterator it;
+      for (it = to_set.begin(); it != to_set.end(); ++it)
+	set(prefix, it->first, it->second);
+    }
+
+    /// Set Keys (via encoded bufferlist)
+    void set(
+      const std::string &prefix,      ///< [in] prefix
+      bufferlist& to_set_bl     ///< [in] encoded key/values to set
+      ) {
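+      // to_set_bl layout: a u32 count followed by that many encoded
+      // (key, value) pairs, matching the ::decode calls below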
+      bufferlist::iterator p = to_set_bl.begin();
+      uint32_t num;
+      ::decode(num, p);
+      while (num--) {
+	string key;
+	bufferlist value;
+	::decode(key, p);
+	::decode(value, p);
+	set(prefix, key, value);
+      }
+    }
+
+    /// Set Key
+    virtual void set(
+      const std::string &prefix,   ///< [in] Prefix for the key
+      const std::string &k,	      ///< [in] Key to set
+      const bufferlist &bl    ///< [in] Value to set
+      ) = 0;
+
+
+    /// Removes Keys (via encoded bufferlist)
+    void rmkeys(
+      const std::string &prefix,   ///< [in] Prefix to search for
+      bufferlist &keys_bl ///< [in] Keys to remove
+    ) {
+      bufferlist::iterator p = keys_bl.begin();
+      uint32_t num;
+      ::decode(num, p);
+      while (num--) {
+	string key;
+	::decode(key, p);
+	rmkey(prefix, key);
+      }
+    }
+
+    /// Removes Keys
+    void rmkeys(
+      const std::string &prefix,   ///< [in] Prefix to search for
+      const std::set<std::string> &keys ///< [in] Keys to remove
+    ) {
+      std::set<std::string>::const_iterator it;
+      for (it = keys.begin(); it != keys.end(); ++it)
+	rmkey(prefix, *it);
+    }
+
+    /// Remove Key
+    virtual void rmkey(
+      const std::string &prefix,   ///< [in] Prefix to search for
+      const std::string &k	      ///< [in] Key to remove
+      ) = 0;
+
+    /// Removes keys beginning with prefix
+    virtual void rmkeys_by_prefix(
+      const std::string &prefix ///< [in] Prefix by which to remove keys
+      ) = 0;
+
+    virtual ~TransactionImpl() {}
+  };
+  typedef ceph::shared_ptr< TransactionImpl > Transaction;
+
+  /// create a new instance
+  static KeyValueDB *create(CephContext *cct, const std::string& type,
+			    const std::string& dir);
+
+  /// test whether we can successfully initialize; may have side effects (e.g., create)
+  static int test_init(const std::string& type, const std::string& dir);
+  virtual int init(string option_str="") = 0;
+  virtual int open(std::ostream &out) = 0;
+  virtual int create_and_open(std::ostream &out) = 0;
+
+  virtual Transaction get_transaction() = 0;
+  virtual int submit_transaction(Transaction) = 0;
+  virtual int submit_transaction_sync(Transaction t) {
+    return submit_transaction(t);
+  }
+
+  /// Retrieve Keys
+  virtual int get(
+    const std::string &prefix,        ///< [in] Prefix for key
+    const std::set<std::string> &key,      ///< [in] Key to retrieve
+    std::map<std::string, bufferlist> *out ///< [out] Key value retrieved
+    ) = 0;
+  virtual int get(const std::string &prefix, ///< [in] prefix
+		  const std::string &key,    ///< [in] key
+		  bufferlist *value) {  ///< [out] value
+    std::set<std::string> ks;
+    ks.insert(key);
+    std::map<std::string,bufferlist> om;
+    int r = get(prefix, ks, &om);
+    if (om.find(key) != om.end()) {
+      *value = om[key];
+    } else {
+      *value = bufferlist();
+      r = -ENOENT;
+    }
+    return r;
+  }
+
+  class GenericIteratorImpl {
+  public:
+    virtual int seek_to_first() = 0;
+    virtual int upper_bound(const std::string &after) = 0;
+    virtual int lower_bound(const std::string &to) = 0;
+    virtual bool valid() = 0;
+    virtual int next(bool validate=true) = 0;
+    virtual std::string key() = 0;
+    virtual bufferlist value() = 0;
+    virtual int status() = 0;
+    virtual ~GenericIteratorImpl() {}
+  };
+
+  class WholeSpaceIteratorImpl {
+  public:
+    virtual int seek_to_first() = 0;
+    virtual int seek_to_first(const std::string &prefix) = 0;
+    virtual int seek_to_last() = 0;
+    virtual int seek_to_last(const std::string &prefix) = 0;
+    virtual int upper_bound(const std::string &prefix, const std::string &after) = 0;
+    virtual int lower_bound(const std::string &prefix, const std::string &to) = 0;
+    virtual bool valid() = 0;
+    virtual int next() = 0;
+    virtual int prev() = 0;
+    virtual std::string key() = 0;
+    virtual std::pair<std::string,std::string> raw_key() = 0;
+    virtual bool raw_key_is_prefixed(const std::string &prefix) = 0;
+    virtual bufferlist value() = 0;
+    virtual int status() = 0;
+    virtual ~WholeSpaceIteratorImpl() { }
+  };
+  typedef ceph::shared_ptr< WholeSpaceIteratorImpl > WholeSpaceIterator;
+
+  class IteratorImpl : public GenericIteratorImpl {
+    const std::string prefix;
+    WholeSpaceIterator generic_iter;
+  public:
+    IteratorImpl(const std::string &prefix, WholeSpaceIterator iter) :
+      prefix(prefix), generic_iter(iter) { }
+    virtual ~IteratorImpl() { }
+
+    int seek_to_first() {
+      return generic_iter->seek_to_first(prefix);
+    }
+    int seek_to_last() {
+      return generic_iter->seek_to_last(prefix);
+    }
+    int upper_bound(const std::string &after) {
+      return generic_iter->upper_bound(prefix, after);
+    }
+    int lower_bound(const std::string &to) {
+      return generic_iter->lower_bound(prefix, to);
+    }
+    bool valid() {
+      if (!generic_iter->valid())
+	return false;
+      return generic_iter->raw_key_is_prefixed(prefix);
+    }
+    // With validate=false, next() and prev() do not check the iterator;
+    // it is the caller's responsibility to ensure it is valid.
+    int next(bool validate=true) {
+      if (validate) {
+        if (valid())
+          return generic_iter->next();
+        return status();
+      } else {
+        return generic_iter->next();
+      }
+    }
+
+    int prev(bool validate=true) {
+      if (validate) {
+        if (valid())
+          return generic_iter->prev();
+        return status();
+      } else {
+        return generic_iter->prev();
+      }
+    }
+    std::string key() {
+      return generic_iter->key();
+    }
+    std::pair<std::string, std::string> raw_key() {
+      return generic_iter->raw_key();
+    }
+    bufferlist value() {
+      return generic_iter->value();
+    }
+    int status() {
+      return generic_iter->status();
+    }
+  };
+
+  typedef ceph::shared_ptr< IteratorImpl > Iterator;
+
+  WholeSpaceIterator get_iterator() {
+    return _get_iterator();
+  }
+
+  Iterator get_iterator(const std::string &prefix) {
+    return ceph::shared_ptr<IteratorImpl>(
+      new IteratorImpl(prefix, get_iterator())
+    );
+  }
+
+  WholeSpaceIterator get_snapshot_iterator() {
+    return _get_snapshot_iterator();
+  }
+
+  Iterator get_snapshot_iterator(const std::string &prefix) {
+    return ceph::shared_ptr<IteratorImpl>(
+      new IteratorImpl(prefix, get_snapshot_iterator())
+    );
+  }
+
+  virtual uint64_t get_estimated_size(std::map<std::string,uint64_t> &extra) = 0;
+  virtual int get_statfs(struct statfs *buf) {
+    return -EOPNOTSUPP;
+  }
+
+  virtual ~KeyValueDB() {}
+
+  /// compact the underlying store
+  virtual void compact() {}
+
+  /// compact db for all keys with a given prefix
+  virtual void compact_prefix(const std::string& prefix) {}
+  /// compact db for all keys with a given prefix, async
+  virtual void compact_prefix_async(const std::string& prefix) {}
+  virtual void compact_range(const std::string& prefix,
+			     const std::string& start, const std::string& end) {}
+  virtual void compact_range_async(const std::string& prefix,
+				   const std::string& start, const std::string& end) {}
+
+protected:
+  virtual WholeSpaceIterator _get_iterator() = 0;
+  virtual WholeSpaceIterator _get_snapshot_iterator() = 0;
+};
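+
+// Illustrative usage sketch (editor's example, not upstream documentation;
+// "leveldb" stands in for whichever backend type is registered):
+//
+//   KeyValueDB *db = KeyValueDB::create(cct, "leveldb", "/path/to/db");
+//   ostringstream err;
+//   if (db && db->create_and_open(err) == 0) {
+//     KeyValueDB::Transaction t = db->get_transaction();
+//     bufferlist bl;
+//     bl.append("value");
+//     t->set("myprefix", "mykey", bl);
+//     db->submit_transaction_sync(t);
+//
+//     KeyValueDB::Iterator it = db->get_iterator("myprefix");
+//     for (it->seek_to_first(); it->valid(); it->next()) {
+//       // it->key() == "mykey"; it->value() holds the stored bufferlist
+//     }
+//   }
+//   delete db;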
+
+#endif
diff --git a/src/kv/KineticStore.cc b/src/kv/KineticStore.cc
new file mode 100644
index 0000000..71559f0
--- /dev/null
+++ b/src/kv/KineticStore.cc
@@ -0,0 +1,348 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#include "KineticStore.h"
+#include "common/ceph_crypto.h"
+
+#include <set>
+#include <map>
+#include <string>
+#include "include/memory.h"
+#include <errno.h>
+#include "common/perf_counters.h"
+
+using std::string;
+
+#define dout_subsys ceph_subsys_keyvaluestore
+
+int KineticStore::init()
+{
+  // initialize defaults; the caller can override these
+  // before calling open()
+  host = cct->_conf->kinetic_host;
+  port = cct->_conf->kinetic_port;
+  user_id = cct->_conf->kinetic_user_id;
+  hmac_key = cct->_conf->kinetic_hmac_key;
+  use_ssl = cct->_conf->kinetic_use_ssl;
+  return 0;
+}
+
+int KineticStore::_test_init(CephContext *c)
+{
+  kinetic::KineticConnectionFactory conn_factory =
+    kinetic::NewKineticConnectionFactory();
+
+  kinetic::ConnectionOptions options;
+  options.host = c->_conf->kinetic_host;
+  options.port = c->_conf->kinetic_port;
+  options.user_id = c->_conf->kinetic_user_id;
+  options.hmac_key = c->_conf->kinetic_hmac_key;
+  options.use_ssl = c->_conf->kinetic_use_ssl;
+
+  // _test_init() is static, so it cannot touch the cct or kinetic_conn
+  // members; use the passed-in context and a throw-away connection
+  std::unique_ptr<kinetic::BlockingKineticConnection> conn;
+  kinetic::Status status =
+    conn_factory.NewThreadsafeBlockingConnection(options, conn, 10);
+  conn.reset();
+  if (!status.ok())
+    derr << __func__ << ": unable to connect to kinetic store " << options.host
+         << ":" << options.port << " : " << status.ToString() << dendl;
+  return status.ok() ? 0 : -EIO;
+}
+
+int KineticStore::do_open(ostream &out, bool create_if_missing)
+{
+  kinetic::KineticConnectionFactory conn_factory =
+    kinetic::NewKineticConnectionFactory();
+  kinetic::ConnectionOptions options;
+  options.host = host;
+  options.port = port;
+  options.user_id = user_id;
+  options.hmac_key = hmac_key;
+  options.use_ssl = use_ssl;
+  kinetic::Status status = conn_factory.NewThreadsafeBlockingConnection(options, kinetic_conn, 10);
+  if (!status.ok()) {
+    derr << "Unable to connect to kinetic store " << host << ":" << port
+	 << " : " << status.ToString() << dendl;
+    return -EINVAL;
+  }
+
+  PerfCountersBuilder plb(g_ceph_context, "kinetic", l_kinetic_first, l_kinetic_last);
+  plb.add_u64_counter(l_kinetic_gets, "kinetic_get", "Gets");
+  plb.add_u64_counter(l_kinetic_txns, "kinetic_transaction", "Transactions");
+  logger = plb.create_perf_counters();
+  cct->get_perfcounters_collection()->add(logger);
+  return 0;
+}
+
+KineticStore::KineticStore(CephContext *c) :
+  cct(c),
+  logger(NULL)
+{
+  host = c->_conf->kinetic_host;
+  port = c->_conf->kinetic_port;
+  user_id = c->_conf->kinetic_user_id;
+  hmac_key = c->_conf->kinetic_hmac_key;
+  use_ssl = c->_conf->kinetic_use_ssl;
+}
+
+KineticStore::~KineticStore()
+{
+  close();
+  delete logger;
+}
+
+void KineticStore::close()
+{
+  kinetic_conn.reset();
+  if (logger)
+    cct->get_perfcounters_collection()->remove(logger);
+}
+
+int KineticStore::submit_transaction(KeyValueDB::Transaction t)
+{
+  KineticTransactionImpl * _t =
+    static_cast<KineticTransactionImpl *>(t.get());
+
+  dout(20) << "kinetic submit_transaction" << dendl;
+
+  for (vector<KineticOp>::iterator it = _t->ops.begin();
+       it != _t->ops.end(); ++it) {
+    kinetic::KineticStatus status(kinetic::StatusCode::OK, "");
+    if (it->type == KINETIC_OP_WRITE) {
+      string data(it->data.c_str(), it->data.length());
+      kinetic::KineticRecord record(data, "", "",
+				    com::seagate::kinetic::client::proto::Message_Algorithm_SHA1);
+      dout(30) << "kinetic before put of " << it->key << " (" << data.length() << " bytes)" << dendl;
+      status = kinetic_conn->Put(it->key, "", kinetic::WriteMode::IGNORE_VERSION,
+				 record);
+      dout(30) << "kinetic after put of " << it->key << dendl;
+    } else {
+      assert(it->type == KINETIC_OP_DELETE);
+      dout(30) << "kinetic before delete" << dendl;
+      status = kinetic_conn->Delete(it->key, "",
+				    kinetic::WriteMode::IGNORE_VERSION);
+      dout(30) << "kinetic after delete" << dendl;
+    }
+    if (!status.ok()) {
+      derr << "kinetic error submitting transaction: "
+	   << status.message() << dendl;
+      return -1;
+    }
+  }
+
+  logger->inc(l_kinetic_txns);
+  return 0;
+}
+
+int KineticStore::submit_transaction_sync(KeyValueDB::Transaction t)
+{
+  return submit_transaction(t);
+}
+
+void KineticStore::KineticTransactionImpl::set(
+  const string &prefix,
+  const string &k,
+  const bufferlist &to_set_bl)
+{
+  string key = combine_strings(prefix, k);
+  dout(30) << "kinetic set key " << key << dendl;
+  ops.push_back(KineticOp(KINETIC_OP_WRITE, key, to_set_bl));
+}
+
+void KineticStore::KineticTransactionImpl::rmkey(const string &prefix,
+					         const string &k)
+{
+  string key = combine_strings(prefix, k);
+  dout(30) << "kinetic rm key " << key << dendl;
+  ops.push_back(KineticOp(KINETIC_OP_DELETE, key));
+}
+
+void KineticStore::KineticTransactionImpl::rmkeys_by_prefix(const string &prefix)
+{
+  dout(20) << "kinetic rmkeys_by_prefix " << prefix << dendl;
+  KeyValueDB::Iterator it = db->get_iterator(prefix);
+  for (it->seek_to_first();
+       it->valid();
+       it->next()) {
+    string key = combine_strings(prefix, it->key());
+    ops.push_back(KineticOp(KINETIC_OP_DELETE, key));
+    dout(30) << "kinetic rm key by prefix: " << key << dendl;
+  }
+}
+
+int KineticStore::get(
+    const string &prefix,
+    const std::set<string> &keys,
+    std::map<string, bufferlist> *out)
+{
+  dout(30) << "kinetic get prefix: " << prefix << " keys: " << keys << dendl;
+  for (std::set<string>::const_iterator i = keys.begin();
+       i != keys.end();
+       ++i) {
+    unique_ptr<kinetic::KineticRecord> record;
+    string key = combine_strings(prefix, *i);
+    dout(30) << "before get key " << key << dendl;
+    kinetic::KineticStatus status = kinetic_conn->Get(key, record);
+    if (!status.ok())
+      break;
+    dout(30) << "kinetic get got key: " << key << dendl;
+    out->insert(make_pair(key, to_bufferlist(*record.get())));
+  }
+  logger->inc(l_kinetic_gets);
+  return 0;
+}
+
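+// Keys live flat in the kinetic namespace as "<prefix>\x01<key>";
+// combine_strings() builds that form and split_key() reverses it, e.g.
+// combine_strings("myprefix", "mykey") -> "myprefix\x01mykey".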
+string KineticStore::combine_strings(const string &prefix, const string &value)
+{
+  string out = prefix;
+  out.push_back(1);
+  out.append(value);
+  return out;
+}
+
+bufferlist KineticStore::to_bufferlist(const kinetic::KineticRecord &record)
+{
+  bufferlist bl;
+  bl.append(*(record.value()));
+  return bl;
+}
+
+int KineticStore::split_key(string &in, string *prefix, string *key)
+{
+  size_t prefix_len = 0;
+  const char *in_data = in.c_str();
+
+  // find the 0x01 separator inside the combined "<prefix>\x01<key>" string
+  const char *separator = (const char *)memchr(in_data, 1, in.size());
+  if (separator == NULL)
+    return -EINVAL;
+  prefix_len = size_t(separator - in_data);
+  if (prefix_len >= in.size())
+    return -EINVAL;
+
+  // fetch prefix and/or key directly from the combined string
+  if (prefix)
+    *prefix = string(in_data, prefix_len);
+  if (key)
+    *key = string(separator+1, in.size()-prefix_len-1);
+  return 0;
+}
+
+KineticStore::KineticWholeSpaceIteratorImpl::KineticWholeSpaceIteratorImpl(
+    kinetic::BlockingKineticConnection *conn)
+  : kinetic_conn(conn), kinetic_status(kinetic::StatusCode::OK, "")
+{
+  dout(30) << "kinetic iterator constructor()" << dendl;
+  const static string last_key = "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF";
+  kinetic::KeyRangeIterator it =
+    kinetic_conn->IterateKeyRange("", true, last_key, true, 1024);
+  while (it != kinetic::KeyRangeEnd()) {
+    try {
+      keys.insert(*it);
+      dout(30) << "kinetic iterator added " << *it << dendl;
+    } catch (std::runtime_error &e) {
+      kinetic_status = kinetic::KineticStatus(kinetic::StatusCode::CLIENT_INTERNAL_ERROR, e.what());
+      return;
+    }
+    ++it;
+  }
+  keys_iter = keys.begin();
+}
+
+int KineticStore::KineticWholeSpaceIteratorImpl::seek_to_first(const string &prefix)
+{
+  dout(30) << "kinetic iterator seek_to_first(prefix): " << prefix << dendl;
+  keys_iter = keys.lower_bound(prefix);
+  return 0;
+}
+
+int KineticStore::KineticWholeSpaceIteratorImpl::seek_to_last()
+{
+  dout(30) << "kinetic iterator seek_to_last()" << dendl;
+  keys_iter = keys.end();
+  if (keys.begin() != keys_iter)
+    --keys_iter;
+  return 0;
+}
+
+int KineticStore::KineticWholeSpaceIteratorImpl::seek_to_last(const string &prefix)
+{
+  dout(30) << "kinetic iterator seek_to_last(prefix): " << prefix << dendl;
+  keys_iter = keys.upper_bound(prefix + "\2");
+  if (keys.begin() == keys_iter) {
+    keys_iter = keys.end();
+  } else {
+    --keys_iter;
+  }
+  return 0;
+}
+
+int KineticStore::KineticWholeSpaceIteratorImpl::upper_bound(const string &prefix, const string &after) {
+  dout(30) << "kinetic iterator upper_bound()" << dendl;
+  string bound = combine_strings(prefix, after);
+  keys_iter = keys.upper_bound(bound);
+  return 0;
+}
+
+int KineticStore::KineticWholeSpaceIteratorImpl::lower_bound(const string &prefix, const string &to) {
+  dout(30) << "kinetic iterator lower_bound()" << dendl;
+  string bound = combine_strings(prefix, to);
+  keys_iter = keys.lower_bound(bound);
+  return 0;
+}
+
+bool KineticStore::KineticWholeSpaceIteratorImpl::valid() {
+  dout(30) << "kinetic iterator valid()" << dendl;
+  return keys_iter != keys.end();
+}
+
+int KineticStore::KineticWholeSpaceIteratorImpl::next() {
+  dout(30) << "kinetic iterator next()" << dendl;
+  if (keys_iter != keys.end()) {
+      ++keys_iter;
+      return 0;
+  }
+  return -1;
+}
+
+int KineticStore::KineticWholeSpaceIteratorImpl::prev() {
+  dout(30) << "kinetic iterator prev()" << dendl;
+  if (keys_iter != keys.begin()) {
+      --keys_iter;
+      return 0;
+  }
+  keys_iter = keys.end();
+  return -1;
+}
+
+string KineticStore::KineticWholeSpaceIteratorImpl::key() {
+  dout(30) << "kinetic iterator key()" << dendl;
+  string out_key;
+  split_key(*keys_iter, NULL, &out_key);
+  return out_key;
+}
+
+pair<string,string> KineticStore::KineticWholeSpaceIteratorImpl::raw_key() {
+  dout(30) << "kinetic iterator raw_key()" << dendl;
+  string prefix, key;
+  split_key(*keys_iter, &prefix, &key);
+  return make_pair(prefix, key);
+}
+
+bool KineticStore::KineticWholeSpaceIteratorImpl::raw_key_is_prefixed(const string &prefix) {
+  // Look for "prefix\1" right in *keys_iter without making a copy
+  string key = *keys_iter;
+  if ((key.size() > prefix.length()) && (key[prefix.length()] == '\1')) {
+    return memcmp(key.c_str(), prefix.c_str(), prefix.length()) == 0;
+  } else {
+    return false;
+  }
+}
+
+
+bufferlist KineticStore::KineticWholeSpaceIteratorImpl::value() {
+  dout(30) << "kinetic iterator value()" << dendl;
+  unique_ptr<kinetic::KineticRecord> record;
+  kinetic_status = kinetic_conn->Get(*keys_iter, record);
+  if (!kinetic_status.ok() || !record)
+    return bufferlist();  // don't dereference a null record on failure
+  return to_bufferlist(*record.get());
+}
+
+int KineticStore::KineticWholeSpaceIteratorImpl::status() {
+  dout(30) << "kinetic iterator status()" << dendl;
+  return kinetic_status.ok() ? 0 : -1;
+}
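
The whole-space iterator above snapshots the device's entire key range into an
in-memory std::set at construction, so seek_to_first/next/prev are ordinary
ordered-set moves over combined "prefix\1key" strings. A minimal standalone
sketch of that traversal pattern (the sample keys are hypothetical; no kinetic
client involved):

    #include <iostream>
    #include <set>
    #include <string>

    int main() {
      // '\1' is the separator byte combine_strings() uses
      std::set<std::string> keys = {"p\1a", "p\1b", "q\1a"};
      std::set<std::string>::iterator it = keys.lower_bound("p"); // seek_to_first("p")
      while (it != keys.end() && it->compare(0, 2, "p\1") == 0) {
        std::cout << *it << '\n';  // visits "p\1a", then "p\1b"
        ++it;                      // next()
      }
      return 0;
    }
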
diff --git a/src/kv/KineticStore.h b/src/kv/KineticStore.h
new file mode 100644
index 0000000..657dfeb
--- /dev/null
+++ b/src/kv/KineticStore.h
@@ -0,0 +1,161 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef KINETIC_STORE_H
+#define KINETIC_STORE_H
+
+#include "include/types.h"
+#include "include/buffer.h"
+#include "KeyValueDB.h"
+#include <set>
+#include <map>
+#include <string>
+#include "include/memory.h"
+#include <kinetic/kinetic.h>
+
+#include <errno.h>
+#include "common/errno.h"
+#include "common/dout.h"
+#include "include/assert.h"
+#include "common/Formatter.h"
+
+#include "common/ceph_context.h"
+
+class PerfCounters;
+
+enum {
+  l_kinetic_first = 34400,
+  l_kinetic_gets,
+  l_kinetic_txns,
+  l_kinetic_last,
+};
+
+/**
+ * Uses Kinetic to implement the KeyValueDB interface
+ */
+class KineticStore : public KeyValueDB {
+  CephContext *cct;
+  PerfCounters *logger;
+  string host;
+  int port;
+  int user_id;
+  string hmac_key;
+  bool use_ssl;
+  std::unique_ptr<kinetic::BlockingKineticConnection> kinetic_conn;
+
+  int do_open(ostream &out, bool create_if_missing);
+
+public:
+  KineticStore(CephContext *c);
+  ~KineticStore();
+
+  static int _test_init(CephContext *c);
+  int init();
+
+  /// Opens underlying db
+  int open(ostream &out) {
+    return do_open(out, false);
+  }
+  /// Creates underlying db if missing and opens it
+  int create_and_open(ostream &out) {
+    return do_open(out, true);
+  }
+
+  void close();
+
+  enum KineticOpType {
+    KINETIC_OP_WRITE,
+    KINETIC_OP_DELETE,
+  };
+
+  struct KineticOp {
+    KineticOpType type;
+    std::string key;
+    bufferlist data;
+    KineticOp(KineticOpType type, const string &key) : type(type), key(key) {}
+    KineticOp(KineticOpType type, const string &key, const bufferlist &data)
+      : type(type), key(key), data(data) {}
+  };
+
+  class KineticTransactionImpl : public KeyValueDB::TransactionImpl {
+  public:
+    vector<KineticOp> ops;
+    KineticStore *db;
+
+    KineticTransactionImpl(KineticStore *db) : db(db) {}
+    void set(
+      const string &prefix,
+      const string &k,
+      const bufferlist &bl);
+    void rmkey(
+      const string &prefix,
+      const string &k);
+    void rmkeys_by_prefix(
+      const string &prefix
+      );
+  };
+
+  KeyValueDB::Transaction get_transaction() {
+    return ceph::shared_ptr< KineticTransactionImpl >(
+      new KineticTransactionImpl(this));
+  }
+
+  int submit_transaction(KeyValueDB::Transaction t);
+  int submit_transaction_sync(KeyValueDB::Transaction t);
+  int get(
+    const string &prefix,
+    const std::set<string> &keys,
+    std::map<string, bufferlist> *out
+    );
+
+  class KineticWholeSpaceIteratorImpl :
+    public KeyValueDB::WholeSpaceIteratorImpl {
+    std::set<std::string> keys;
+    std::set<std::string>::iterator keys_iter;
+    kinetic::BlockingKineticConnection *kinetic_conn;
+    kinetic::KineticStatus kinetic_status;
+  public:
+    KineticWholeSpaceIteratorImpl(kinetic::BlockingKineticConnection *conn);
+    virtual ~KineticWholeSpaceIteratorImpl() { }
+
+    int seek_to_first() {
+      return seek_to_first("");
+    }
+    int seek_to_first(const string &prefix);
+    int seek_to_last();
+    int seek_to_last(const string &prefix);
+    int upper_bound(const string &prefix, const string &after);
+    int lower_bound(const string &prefix, const string &to);
+    bool valid();
+    int next();
+    int prev();
+    string key();
+    pair<string,string> raw_key();
+    bool raw_key_is_prefixed(const string &prefix);
+    bufferlist value();
+    int status();
+  };
+
+  /// Utility
+  static string combine_strings(const string &prefix, const string &value);
+  static int split_key(const string &in, string *prefix, string *key);
+  static bufferlist to_bufferlist(const kinetic::KineticRecord &record);
+  virtual uint64_t get_estimated_size(map<string,uint64_t> &extra) {
+    // not used by the osd
+    return 0;
+  }
+
+
+protected:
+  WholeSpaceIterator _get_iterator() {
+    return ceph::shared_ptr<KeyValueDB::WholeSpaceIteratorImpl>(
+      new KineticWholeSpaceIteratorImpl(kinetic_conn.get()));
+  }
+
+  // TODO: remove snapshots from interface
+  WholeSpaceIterator _get_snapshot_iterator() {
+    return _get_iterator();
+  }
+
+};
+
+#endif
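
The class above exposes the generic KeyValueDB transaction surface. A hedged
usage sketch against that interface (the prefix, keys, and function name are
made up for illustration; error handling elided):

    #include "kv/KeyValueDB.h"

    // Sketch: batching writes through the KeyValueDB interface.
    // Assumes 'db' is an already-opened KeyValueDB* (e.g. a KineticStore).
    int write_example(KeyValueDB *db)
    {
      KeyValueDB::Transaction t = db->get_transaction();
      bufferlist bl;
      bl.append("value-bytes");
      t->set("myprefix", "mykey", bl);        // queued, not applied yet
      t->rmkey("myprefix", "stale-key");      // also queued
      return db->submit_transaction_sync(t);  // applies the whole batch
    }
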
diff --git a/src/kv/LevelDBStore.cc b/src/kv/LevelDBStore.cc
new file mode 100644
index 0000000..2db6dc9
--- /dev/null
+++ b/src/kv/LevelDBStore.cc
@@ -0,0 +1,349 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#include "LevelDBStore.h"
+
+#include <set>
+#include <map>
+#include <string>
+#include "include/memory.h"
+#include <errno.h>
+using std::string;
+#include "common/debug.h"
+#include "common/perf_counters.h"
+
+int LevelDBStore::init(string option_str)
+{
+  // init defaults.  caller can override these if they want
+  // prior to calling open.
+  options.write_buffer_size = g_conf->leveldb_write_buffer_size;
+  options.cache_size = g_conf->leveldb_cache_size;
+  options.block_size = g_conf->leveldb_block_size;
+  options.bloom_size = g_conf->leveldb_bloom_size;
+  options.compression_enabled = g_conf->leveldb_compression;
+  options.paranoid_checks = g_conf->leveldb_paranoid;
+  options.max_open_files = g_conf->leveldb_max_open_files;
+  options.log_file = g_conf->leveldb_log;
+  return 0;
+}
+
+int LevelDBStore::do_open(ostream &out, bool create_if_missing)
+{
+  leveldb::Options ldoptions;
+
+  if (options.write_buffer_size)
+    ldoptions.write_buffer_size = options.write_buffer_size;
+  if (options.max_open_files)
+    ldoptions.max_open_files = options.max_open_files;
+  if (options.cache_size) {
+    leveldb::Cache *_db_cache = leveldb::NewLRUCache(options.cache_size);
+    db_cache.reset(_db_cache);
+    ldoptions.block_cache = db_cache.get();
+  }
+  if (options.block_size)
+    ldoptions.block_size = options.block_size;
+  if (options.bloom_size) {
+#ifdef HAVE_LEVELDB_FILTER_POLICY
+    const leveldb::FilterPolicy *_filterpolicy =
+	leveldb::NewBloomFilterPolicy(options.bloom_size);
+    filterpolicy.reset(_filterpolicy);
+    ldoptions.filter_policy = filterpolicy.get();
+#else
+    assert(0 == "bloom size set but installed leveldb doesn't support bloom filters");
+#endif
+  }
+  if (options.compression_enabled)
+    ldoptions.compression = leveldb::kSnappyCompression;
+  else
+    ldoptions.compression = leveldb::kNoCompression;
+  if (options.block_restart_interval)
+    ldoptions.block_restart_interval = options.block_restart_interval;
+
+  ldoptions.error_if_exists = options.error_if_exists;
+  ldoptions.paranoid_checks = options.paranoid_checks;
+  ldoptions.create_if_missing = create_if_missing;
+
+  if (options.log_file.length()) {
+    leveldb::Env *env = leveldb::Env::Default();
+    env->NewLogger(options.log_file, &ldoptions.info_log);
+  }
+
+  leveldb::DB *_db;
+  leveldb::Status status = leveldb::DB::Open(ldoptions, path, &_db);
+  db.reset(_db);
+  if (!status.ok()) {
+    out << status.ToString() << std::endl;
+    return -EINVAL;
+  }
+
+  PerfCountersBuilder plb(g_ceph_context, "leveldb", l_leveldb_first, l_leveldb_last);
+  plb.add_u64_counter(l_leveldb_gets, "leveldb_get", "Gets");
+  plb.add_u64_counter(l_leveldb_txns, "leveldb_transaction", "Transactions");
+  plb.add_time_avg(l_leveldb_get_latency, "leveldb_get_latency", "Get Latency");
+  plb.add_time_avg(l_leveldb_submit_latency, "leveldb_submit_latency", "Submit Latency");
+  plb.add_time_avg(l_leveldb_submit_sync_latency, "leveldb_submit_sync_latency", "Submit Sync Latency");
+  plb.add_u64_counter(l_leveldb_compact, "leveldb_compact", "Compactions");
+  plb.add_u64_counter(l_leveldb_compact_range, "leveldb_compact_range", "Compactions by range");
+  plb.add_u64_counter(l_leveldb_compact_queue_merge, "leveldb_compact_queue_merge", "Mergings of ranges in compaction queue");
+  plb.add_u64(l_leveldb_compact_queue_len, "leveldb_compact_queue_len", "Length of compaction queue");
+  logger = plb.create_perf_counters();
+  cct->get_perfcounters_collection()->add(logger);
+
+  if (g_conf->leveldb_compact_on_mount) {
+    derr << "Compacting leveldb store..." << dendl;
+    compact();
+    derr << "Finished compacting leveldb store" << dendl;
+  }
+  return 0;
+}
+
+int LevelDBStore::_test_init(const string& dir)
+{
+  leveldb::Options options;
+  options.create_if_missing = true;
+  leveldb::DB *db;
+  leveldb::Status status = leveldb::DB::Open(options, dir, &db);
+  delete db;
+  return status.ok() ? 0 : -EIO;
+}
+
+LevelDBStore::~LevelDBStore()
+{
+  close();
+  delete logger;
+
+  // Ensure db is destroyed before dependent db_cache and filterpolicy
+  db.reset();
+}
+
+void LevelDBStore::close()
+{
+  // stop compaction thread
+  compact_queue_lock.Lock();
+  if (compact_thread.is_started()) {
+    compact_queue_stop = true;
+    compact_queue_cond.Signal();
+    compact_queue_lock.Unlock();
+    compact_thread.join();
+  } else {
+    compact_queue_lock.Unlock();
+  }
+
+  if (logger)
+    cct->get_perfcounters_collection()->remove(logger);
+}
+
+int LevelDBStore::submit_transaction(KeyValueDB::Transaction t)
+{
+  utime_t start = ceph_clock_now(g_ceph_context);
+  LevelDBTransactionImpl * _t =
+    static_cast<LevelDBTransactionImpl *>(t.get());
+  leveldb::Status s = db->Write(leveldb::WriteOptions(), &(_t->bat));
+  utime_t lat = ceph_clock_now(g_ceph_context) - start;
+  logger->inc(l_leveldb_txns);
+  logger->tinc(l_leveldb_submit_latency, lat);
+  return s.ok() ? 0 : -1;
+}
+
+int LevelDBStore::submit_transaction_sync(KeyValueDB::Transaction t)
+{
+  utime_t start = ceph_clock_now(g_ceph_context);
+  LevelDBTransactionImpl * _t =
+    static_cast<LevelDBTransactionImpl *>(t.get());
+  leveldb::WriteOptions options;
+  options.sync = true;
+  leveldb::Status s = db->Write(options, &(_t->bat));
+  utime_t lat = ceph_clock_now(g_ceph_context) - start;
+  logger->inc(l_leveldb_txns);
+  logger->tinc(l_leveldb_submit_sync_latency, lat);
+  return s.ok() ? 0 : -1;
+}
+
+void LevelDBStore::LevelDBTransactionImpl::set(
+  const string &prefix,
+  const string &k,
+  const bufferlist &to_set_bl)
+{
+  string key = combine_strings(prefix, k);
+  size_t bllen = to_set_bl.length();
+  // bufferlist::c_str() is non-const, so we can't call it on to_set_bl itself
+  if (to_set_bl.is_contiguous() && bllen > 0) {
+    // bufferlist contains just one ptr, or its ptrs are contiguous
+    bat.Put(leveldb::Slice(key), leveldb::Slice(to_set_bl.buffers().front().c_str(), bllen));
+  } else if ((bllen <= 32 * 1024) && (bllen > 0)) {
+    // 2+ bufferptrs that are not contiguous:
+    // allocate a buffer on the stack and copy the bl contents into it,
+    // keeping the size bounded so alloca() can't overflow the stack
+    char* slicebuf = (char*) alloca(bllen);
+    leveldb::Slice newslice(slicebuf, bllen);
+    std::list<buffer::ptr>::const_iterator pb;
+    for (pb = to_set_bl.buffers().begin(); pb != to_set_bl.buffers().end(); ++pb) {
+      size_t ptrlen = (*pb).length();
+      memcpy((void*)slicebuf, (*pb).c_str(), ptrlen);
+      slicebuf += ptrlen;
+    }
+    bat.Put(leveldb::Slice(key), newslice);
+  } else {
+    // 2+ bufferptrs that are not contiguous and too large for the stack:
+    // fall back to copying into a fresh bufferlist whose c_str() is contiguous
+    bufferlist val = to_set_bl;
+    bat.Put(leveldb::Slice(key), leveldb::Slice(val.c_str(), val.length()));
+  }
+}
+
+void LevelDBStore::LevelDBTransactionImpl::rmkey(const string &prefix,
+					         const string &k)
+{
+  string key = combine_strings(prefix, k);
+  bat.Delete(leveldb::Slice(key));
+}
+
+void LevelDBStore::LevelDBTransactionImpl::rmkeys_by_prefix(const string &prefix)
+{
+  KeyValueDB::Iterator it = db->get_iterator(prefix);
+  for (it->seek_to_first();
+       it->valid();
+       it->next()) {
+    string key = combine_strings(prefix, it->key());
+    bat.Delete(key);
+  }
+}
+
+int LevelDBStore::get(
+    const string &prefix,
+    const std::set<string> &keys,
+    std::map<string, bufferlist> *out)
+{
+  utime_t start = ceph_clock_now(g_ceph_context);
+  KeyValueDB::Iterator it = get_iterator(prefix);
+  for (std::set<string>::const_iterator i = keys.begin();
+       i != keys.end();
+       ++i) {
+    it->lower_bound(*i);
+    if (it->valid() && it->key() == *i) {
+      out->insert(make_pair(*i, it->value()));
+    } else if (!it->valid())
+      break;
+  }
+  utime_t lat = ceph_clock_now(g_ceph_context) - start;
+  logger->inc(l_leveldb_gets);
+  logger->tinc(l_leveldb_get_latency, lat);
+  return 0;
+}
+
+int LevelDBStore::get(const string &prefix,
+		      const string &key,
+		      bufferlist *value)
+{
+  utime_t start = ceph_clock_now(g_ceph_context);
+  int r = 0;
+  KeyValueDB::Iterator it = get_iterator(prefix);
+  it->lower_bound(key);
+  if (it->valid() && it->key() == key) {
+    *value = it->value();
+  } else {
+    r = -ENOENT;
+  }
+  utime_t lat = ceph_clock_now(g_ceph_context) - start;
+  logger->inc(l_leveldb_gets);
+  logger->tinc(l_leveldb_get_latency, lat);
+  return r;
+}
+
+string LevelDBStore::combine_strings(const string &prefix, const string &value)
+{
+  string out = prefix;
+  out.push_back(0);
+  out.append(value);
+  return out;
+}
+
+bufferlist LevelDBStore::to_bufferlist(leveldb::Slice in)
+{
+  bufferlist bl;
+  bl.append(bufferptr(in.data(), in.size()));
+  return bl;
+}
+
+int LevelDBStore::split_key(leveldb::Slice in, string *prefix, string *key)
+{
+  size_t prefix_len = 0;
+
+  // Find the '\0' separator inside the Slice
+  const char* separator = (const char*) memchr(in.data(), 0, in.size());
+  if (separator == NULL)
+    return -EINVAL;
+  prefix_len = size_t(separator - in.data());
+  if (prefix_len >= in.size())
+    return -EINVAL;
+
+  // Fetch prefix and/or key directly from the Slice
+  if (prefix)
+    *prefix = string(in.data(), prefix_len);
+  if (key)
+    *key = string(separator+1, in.size() - prefix_len - 1);
+  return 0;
+}
+
+void LevelDBStore::compact()
+{
+  logger->inc(l_leveldb_compact);
+  db->CompactRange(NULL, NULL);
+}
+
+
+void LevelDBStore::compact_thread_entry()
+{
+  compact_queue_lock.Lock();
+  while (!compact_queue_stop) {
+    while (!compact_queue.empty()) {
+      pair<string,string> range = compact_queue.front();
+      compact_queue.pop_front();
+      logger->set(l_leveldb_compact_queue_len, compact_queue.size());
+      compact_queue_lock.Unlock();
+      logger->inc(l_leveldb_compact_range);
+      compact_range(range.first, range.second);
+      compact_queue_lock.Lock();
+      continue;
+    }
+    compact_queue_cond.Wait(compact_queue_lock);
+  }
+  compact_queue_lock.Unlock();
+}
+
+void LevelDBStore::compact_range_async(const string& start, const string& end)
+{
+  Mutex::Locker l(compact_queue_lock);
+
+  // try to merge adjacent ranges.  this is O(n), but the queue should
+  // be short.  note that we do not cover all overlap cases and merge
+  // opportunities here, but we capture the ones we currently need.
+  list< pair<string,string> >::iterator p = compact_queue.begin();
+  while (p != compact_queue.end()) {
+    if (p->first == start && p->second == end) {
+      // dup; no-op
+      return;
+    }
+    if (p->first <= end && p->first > start) {
+      // merge with existing range to the right
+      compact_queue.push_back(make_pair(start, p->second));
+      compact_queue.erase(p);
+      logger->inc(l_leveldb_compact_queue_merge);
+      break;
+    }
+    if (p->second >= start && p->second < end) {
+      // merge with existing range to the left
+      compact_queue.push_back(make_pair(p->first, end));
+      compact_queue.erase(p);
+      logger->inc(l_leveldb_compact_queue_merge);
+      break;
+    }
+    ++p;
+  }
+  if (p == compact_queue.end()) {
+    // no merge, new entry.
+    compact_queue.push_back(make_pair(start, end));
+    logger->set(l_leveldb_compact_queue_len, compact_queue.size());
+  }
+  compact_queue_cond.Signal();
+  if (!compact_thread.is_started()) {
+    compact_thread.create();
+  }
+}
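
The merge loop in compact_range_async() above coalesces a new range with at
most one overlapping queued range. A small self-contained sketch of the
right-overlap case (a queued range starting inside the new one), using the
same comparisons:

    #include <cassert>
    #include <list>
    #include <string>
    #include <utility>

    int main() {
      // queue holds ("f","m"); a request for ("a","g") arrives
      std::list<std::pair<std::string,std::string> > q;
      q.push_back(std::make_pair(std::string("f"), std::string("m")));
      std::string start = "a", end = "g";
      for (std::list<std::pair<std::string,std::string> >::iterator p = q.begin();
           p != q.end(); ++p) {
        if (p->first <= end && p->first > start) {  // queued start inside new range
          q.push_back(std::make_pair(start, p->second));  // merged: ("a","m")
          q.erase(p);
          break;
        }
      }
      assert(q.front() == std::make_pair(std::string("a"), std::string("m")));
      return 0;
    }
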
diff --git a/src/kv/LevelDBStore.h b/src/kv/LevelDBStore.h
new file mode 100644
index 0000000..c269601
--- /dev/null
+++ b/src/kv/LevelDBStore.h
@@ -0,0 +1,415 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef LEVEL_DB_STORE_H
+#define LEVEL_DB_STORE_H
+
+#include "include/types.h"
+#include "include/buffer.h"
+#include "KeyValueDB.h"
+#include <set>
+#include <map>
+#include <string>
+#include "include/memory.h"
+#include <boost/scoped_ptr.hpp>
+#include "leveldb/db.h"
+#include "leveldb/env.h"
+#include "leveldb/write_batch.h"
+#include "leveldb/slice.h"
+#include "leveldb/cache.h"
+#ifdef HAVE_LEVELDB_FILTER_POLICY
+#include "leveldb/filter_policy.h"
+#endif
+
+#include <errno.h>
+#include "common/errno.h"
+#include "common/dout.h"
+#include "include/assert.h"
+#include "common/Formatter.h"
+#include "common/Cond.h"
+
+#include "common/ceph_context.h"
+
+class PerfCounters;
+
+enum {
+  l_leveldb_first = 34300,
+  l_leveldb_gets,
+  l_leveldb_txns,
+  l_leveldb_get_latency,
+  l_leveldb_submit_latency,
+  l_leveldb_submit_sync_latency,
+  l_leveldb_compact,
+  l_leveldb_compact_range,
+  l_leveldb_compact_queue_merge,
+  l_leveldb_compact_queue_len,
+  l_leveldb_last,
+};
+
+/**
+ * Uses LevelDB to implement the KeyValueDB interface
+ */
+class LevelDBStore : public KeyValueDB {
+  CephContext *cct;
+  PerfCounters *logger;
+  string path;
+  boost::scoped_ptr<leveldb::Cache> db_cache;
+#ifdef HAVE_LEVELDB_FILTER_POLICY
+  boost::scoped_ptr<const leveldb::FilterPolicy> filterpolicy;
+#endif
+  boost::scoped_ptr<leveldb::DB> db;
+
+  int do_open(ostream &out, bool create_if_missing);
+
+  // manage async compactions
+  Mutex compact_queue_lock;
+  Cond compact_queue_cond;
+  list< pair<string,string> > compact_queue;
+  bool compact_queue_stop;
+  class CompactThread : public Thread {
+    LevelDBStore *db;
+  public:
+    CompactThread(LevelDBStore *d) : db(d) {}
+    void *entry() {
+      db->compact_thread_entry();
+      return NULL;
+    }
+    friend class LevelDBStore;
+  } compact_thread;
+
+  void compact_thread_entry();
+
+  void compact_range(const string& start, const string& end) {
+    leveldb::Slice cstart(start);
+    leveldb::Slice cend(end);
+    db->CompactRange(&cstart, &cend);
+  }
+  void compact_range_async(const string& start, const string& end);
+
+public:
+  /// compact the underlying leveldb store
+  void compact();
+
+  /// compact db for all keys with a given prefix
+  void compact_prefix(const string& prefix) {
+    compact_range(prefix, past_prefix(prefix));
+  }
+  void compact_prefix_async(const string& prefix) {
+    compact_range_async(prefix, past_prefix(prefix));
+  }
+  void compact_range(const string& prefix,
+		     const string& start, const string& end) {
+    compact_range(combine_strings(prefix, start), combine_strings(prefix, end));
+  }
+  void compact_range_async(const string& prefix,
+			   const string& start, const string& end) {
+    compact_range_async(combine_strings(prefix, start),
+			combine_strings(prefix, end));
+  }
+
+
+  /**
+   * options_t: Holds options which are minimally interpreted
+   * on initialization and then passed through to LevelDB.
+   * We transform a couple of these into actual LevelDB
+   * structures, but the rest are simply passed through unchanged. See
+   * leveldb/options.h for more precise details on each.
+   *
+   * Set them after constructing the LevelDBStore, but before calling
+   * open() or create_and_open().
+   */
+  struct options_t {
+    uint64_t write_buffer_size; ///< in-memory write buffer size
+    int max_open_files; ///< maximum number of files LevelDB can open at once
+    uint64_t cache_size; ///< size of extra decompressed cache to use
+    uint64_t block_size; ///< user data per block
+    int bloom_size; ///< number of bits per entry to put in a bloom filter
+    bool compression_enabled; ///< whether to use libsnappy compression or not
+
+    // don't change these ones. No, seriously
+    int block_restart_interval;
+    bool error_if_exists;
+    bool paranoid_checks;
+
+    string log_file;
+
+    options_t() :
+      write_buffer_size(0), // 0 means default
+      max_open_files(0), // 0 means default
+      cache_size(0), // 0 means no cache (default)
+      block_size(0), // 0 means default
+      bloom_size(0), // 0 means no bloom filter (default)
+      compression_enabled(true), // set to false for no compression
+      block_restart_interval(0), // 0 means default
+      error_if_exists(false), // set to true if you want to check nonexistence
+      paranoid_checks(false) // set to true if you want paranoid checks
+    {}
+  } options;
+
+  LevelDBStore(CephContext *c, const string &path) :
+    cct(c),
+    logger(NULL),
+    path(path),
+    db_cache(NULL),
+#ifdef HAVE_LEVELDB_FILTER_POLICY
+    filterpolicy(NULL),
+#endif
+    compact_queue_lock("LevelDBStore::compact_thread_lock"),
+    compact_queue_stop(false),
+    compact_thread(this),
+    options()
+  {}
+
+  ~LevelDBStore();
+
+  static int _test_init(const string& dir);
+  int init(string option_str="");
+
+  /// Opens underlying db
+  int open(ostream &out) {
+    return do_open(out, false);
+  }
+  /// Creates underlying db if missing and opens it
+  int create_and_open(ostream &out) {
+    return do_open(out, true);
+  }
+
+  void close();
+
+  class LevelDBTransactionImpl : public KeyValueDB::TransactionImpl {
+  public:
+    leveldb::WriteBatch bat;
+    LevelDBStore *db;
+    LevelDBTransactionImpl(LevelDBStore *db) : db(db) {}
+    void set(
+      const string &prefix,
+      const string &k,
+      const bufferlist &bl);
+    void rmkey(
+      const string &prefix,
+      const string &k);
+    void rmkeys_by_prefix(
+      const string &prefix
+      );
+  };
+
+  KeyValueDB::Transaction get_transaction() {
+    return ceph::shared_ptr< LevelDBTransactionImpl >(
+      new LevelDBTransactionImpl(this));
+  }
+
+  int submit_transaction(KeyValueDB::Transaction t);
+  int submit_transaction_sync(KeyValueDB::Transaction t);
+  int get(
+    const string &prefix,
+    const std::set<string> &keys,
+    std::map<string, bufferlist> *out
+    );
+
+  int get(const string &prefix,
+	  const string &key,
+	  bufferlist *value);
+
+  class LevelDBWholeSpaceIteratorImpl :
+    public KeyValueDB::WholeSpaceIteratorImpl {
+  protected:
+    boost::scoped_ptr<leveldb::Iterator> dbiter;
+  public:
+    LevelDBWholeSpaceIteratorImpl(leveldb::Iterator *iter) :
+      dbiter(iter) { }
+    virtual ~LevelDBWholeSpaceIteratorImpl() { }
+
+    int seek_to_first() {
+      dbiter->SeekToFirst();
+      return dbiter->status().ok() ? 0 : -1;
+    }
+    int seek_to_first(const string &prefix) {
+      leveldb::Slice slice_prefix(prefix);
+      dbiter->Seek(slice_prefix);
+      return dbiter->status().ok() ? 0 : -1;
+    }
+    int seek_to_last() {
+      dbiter->SeekToLast();
+      return dbiter->status().ok() ? 0 : -1;
+    }
+    int seek_to_last(const string &prefix) {
+      string limit = past_prefix(prefix);
+      leveldb::Slice slice_limit(limit);
+      dbiter->Seek(slice_limit);
+
+      if (!dbiter->Valid()) {
+        dbiter->SeekToLast();
+      } else {
+        dbiter->Prev();
+      }
+      return dbiter->status().ok() ? 0 : -1;
+    }
+    int upper_bound(const string &prefix, const string &after) {
+      lower_bound(prefix, after);
+      if (valid()) {
+	pair<string,string> key = raw_key();
+	if (key.first == prefix && key.second == after)
+	  next();
+      }
+      return dbiter->status().ok() ? 0 : -1;
+    }
+    int lower_bound(const string &prefix, const string &to) {
+      string bound = combine_strings(prefix, to);
+      leveldb::Slice slice_bound(bound);
+      dbiter->Seek(slice_bound);
+      return dbiter->status().ok() ? 0 : -1;
+    }
+    bool valid() {
+      return dbiter->Valid();
+    }
+    int next() {
+      if (valid())
+	dbiter->Next();
+      return dbiter->status().ok() ? 0 : -1;
+    }
+    int prev() {
+      if (valid())
+	dbiter->Prev();
+      return dbiter->status().ok() ? 0 : -1;
+    }
+    string key() {
+      string out_key;
+      split_key(dbiter->key(), 0, &out_key);
+      return out_key;
+    }
+    pair<string,string> raw_key() {
+      string prefix, key;
+      split_key(dbiter->key(), &prefix, &key);
+      return make_pair(prefix, key);
+    }
+    bool raw_key_is_prefixed(const string &prefix) {
+      leveldb::Slice key = dbiter->key();
+      if ((key.size() > prefix.length()) && (key[prefix.length()] == '\0')) {
+        return memcmp(key.data(), prefix.c_str(), prefix.length()) == 0;
+      } else {
+        return false;
+      }
+    }
+    bufferlist value() {
+      return to_bufferlist(dbiter->value());
+    }
+    int status() {
+      return dbiter->status().ok() ? 0 : -1;
+    }
+  };
+
+  class LevelDBSnapshotIteratorImpl : public LevelDBWholeSpaceIteratorImpl {
+    leveldb::DB *db;
+    const leveldb::Snapshot *snapshot;
+  public:
+    LevelDBSnapshotIteratorImpl(leveldb::DB *db, const leveldb::Snapshot *s,
+				leveldb::Iterator *iter) :
+      LevelDBWholeSpaceIteratorImpl(iter), db(db), snapshot(s) { }
+
+    ~LevelDBSnapshotIteratorImpl() {
+      assert(snapshot != NULL);
+      db->ReleaseSnapshot(snapshot);
+    }
+  };
+
+  /// Utility
+  static string combine_strings(const string &prefix, const string &value);
+  static int split_key(leveldb::Slice in, string *prefix, string *key);
+  static bufferlist to_bufferlist(leveldb::Slice in);
+  static string past_prefix(const string &prefix) {
+    string limit = prefix;
+    limit.push_back(1);
+    return limit;
+  }
+
+  virtual uint64_t get_estimated_size(map<string,uint64_t> &extra) {
+    DIR *store_dir = opendir(path.c_str());
+    if (!store_dir) {
+      lderr(cct) << __func__ << " something happened opening the store: "
+                 << cpp_strerror(errno) << dendl;
+      return 0;
+    }
+
+    uint64_t total_size = 0;
+    uint64_t sst_size = 0;
+    uint64_t log_size = 0;
+    uint64_t misc_size = 0;
+
+    struct dirent *entry = NULL;
+    while ((entry = readdir(store_dir)) != NULL) {
+      string n(entry->d_name);
+
+      if (n == "." || n == "..")
+        continue;
+
+      string fpath = path + '/' + n;
+      struct stat s;
+      int err = stat(fpath.c_str(), &s);
+      if (err < 0)
+	err = -errno;
+      // we may race against leveldb while reading files; this should only
+      // happen when those files are being updated, data is being shuffled
+      // and files get removed, in which case there's not much of a problem
+      // as we'll get to them next time around.
+      if (err == -ENOENT) {
+	continue;
+      }
+      if (err < 0) {
+        lderr(cct) << __func__ << " error obtaining stats for " << fpath
+                   << ": " << cpp_strerror(err) << dendl;
+        goto err;
+      }
+
+      size_t pos = n.find_last_of('.');
+      if (pos == string::npos) {
+        misc_size += s.st_size;
+        continue;
+      }
+
+      string ext = n.substr(pos+1);
+      if (ext == "sst") {
+        sst_size += s.st_size;
+      } else if (ext == "log") {
+        log_size += s.st_size;
+      } else {
+        misc_size += s.st_size;
+      }
+    }
+
+    total_size = sst_size + log_size + misc_size;
+
+    extra["sst"] = sst_size;
+    extra["log"] = log_size;
+    extra["misc"] = misc_size;
+    extra["total"] = total_size;
+
+err:
+    closedir(store_dir);
+    return total_size;
+  }
+
+
+protected:
+  WholeSpaceIterator _get_iterator() {
+    return ceph::shared_ptr<KeyValueDB::WholeSpaceIteratorImpl>(
+      new LevelDBWholeSpaceIteratorImpl(
+	db->NewIterator(leveldb::ReadOptions())
+      )
+    );
+  }
+
+  WholeSpaceIterator _get_snapshot_iterator() {
+    const leveldb::Snapshot *snapshot;
+    leveldb::ReadOptions options;
+
+    snapshot = db->GetSnapshot();
+    options.snapshot = snapshot;
+
+    return ceph::shared_ptr<KeyValueDB::WholeSpaceIteratorImpl>(
+      new LevelDBSnapshotIteratorImpl(db.get(), snapshot,
+	db->NewIterator(options))
+    );
+  }
+
+};
+
+#endif
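
past_prefix() above works because combine_strings() joins prefix and key with
a '\0' byte while past_prefix() appends '\1': every combined key under a
prefix therefore sorts strictly below the prefix plus '\1'. A tiny sketch of
that ordering argument:

    #include <cassert>
    #include <string>

    int main() {
      // combine_strings("p", k) == "p" + '\0' + k, and '\0' < '\1',
      // so "p\0<anything>" < "p\1" == past_prefix("p")
      std::string combined = std::string("p") + '\0' + "zzz";
      std::string limit    = std::string("p") + '\1';
      assert(combined < limit);
      return 0;
    }
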
diff --git a/src/kv/Makefile.am b/src/kv/Makefile.am
new file mode 100644
index 0000000..e5e4878
--- /dev/null
+++ b/src/kv/Makefile.am
@@ -0,0 +1,40 @@
+if ENABLE_SERVER
+
+libkv_a_SOURCES = \
+	kv/KeyValueDB.cc \
+	kv/LevelDBStore.cc
+libkv_a_CXXFLAGS = ${AM_CXXFLAGS} -I rocksdb/include
+libkv_a_LIBADD =
+
+noinst_LIBRARIES += libkv.a
+
+noinst_HEADERS += \
+	kv/KeyValueDB.h \
+	kv/LevelDBStore.h
+
+if WITH_SLIBROCKSDB
+# build rocksdb with its own makefile
+# -fPIC is needed so the static library can be linked into shared objects
+# PORTABLE=1 fixes the aarch64 build (-march=native doesn't work there)
+rocksdb/librocksdb.a:
+	cd rocksdb && EXTRA_CXXFLAGS=-fPIC PORTABLE=1 make -j$(shell nproc) static_lib
+libkv_a_CXXFLAGS +=  -I rocksdb/include -fPIC
+libkv_a_SOURCES += kv/RocksDBStore.cc
+libkv_a_LIBADD += rocksdb/librocksdb.a
+noinst_HEADERS += kv/RocksDBStore.h
+endif
+
+if WITH_DLIBROCKSDB
+libkv_a_SOURCES += kv/RocksDBStore.cc
+libkv_a_LIBADD += -lrocksdb
+noinst_HEADERS += kv/RocksDBStore.h
+endif
+
+if WITH_KINETIC
+libkv_a_SOURCES += kv/KineticStore.cc
+libkv_a_CXXFLAGS += -std=gnu++11
+libkv_a_LIBADD += -lkinetic_client -lprotobuf -lglog -lgflags libcrypto.a
+noinst_HEADERS += kv/KineticStore.h
+endif
+
+endif # ENABLE_SERVER
diff --git a/src/kv/RocksDBStore.cc b/src/kv/RocksDBStore.cc
new file mode 100644
index 0000000..a6d071e
--- /dev/null
+++ b/src/kv/RocksDBStore.cc
@@ -0,0 +1,587 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <set>
+#include <map>
+#include <string>
+#include <memory>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include "rocksdb/db.h"
+#include "rocksdb/table.h"
+#include "rocksdb/env.h"
+#include "rocksdb/write_batch.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/utilities/convenience.h"
+using std::string;
+#include "common/perf_counters.h"
+#include "common/debug.h"
+#include "include/str_map.h"
+#include "KeyValueDB.h"
+#include "RocksDBStore.h"
+
+int string2bool(string val, bool &b_val)
+{
+  if (strcasecmp(val.c_str(), "false") == 0) {
+    b_val = false;
+    return 0;
+  } else if (strcasecmp(val.c_str(), "true") == 0) {
+    b_val = true;
+    return 0;
+  } else {
+    std::string err;
+    int b = strict_strtol(val.c_str(), 10, &err);
+    if (!err.empty())
+      return -EINVAL;
+    b_val = !!b;
+    return 0;
+  }
+}
+  
+int RocksDBStore::tryInterpret(const string key, const string val, rocksdb::Options &opt)
+{
+  if (key == "compaction_threads") {
+    std::string err;
+    int f = strict_sistrtoll(val.c_str(), &err);
+    if (!err.empty())
+      return -EINVAL;
+    // the low-priority threadpool is used for compaction
+    opt.env->SetBackgroundThreads(f, rocksdb::Env::Priority::LOW);
+  } else if (key == "flusher_threads") {
+    std::string err;
+    int f = strict_sistrtoll(val.c_str(), &err);
+    if (!err.empty())
+      return -EINVAL;
+    // the high-priority threadpool is used for flushing
+    opt.env->SetBackgroundThreads(f, rocksdb::Env::Priority::HIGH);
+  } else if (key == "compact_on_mount") {
+    int ret = string2bool(val, compact_on_mount);
+    if (ret != 0)
+      return ret;
+  } else if (key == "disableWAL") {
+    int ret = string2bool(val, disableWAL);
+    if (ret != 0)
+      return ret;
+  } else {
+    // unrecognized config option
+    return -EINVAL;
+  }
+  return 0;
+}
+
+int RocksDBStore::ParseOptionsFromString(const string opt_str, rocksdb::Options &opt)
+{
+  map<string, string> str_map;
+  int r = get_str_map(opt_str, ",\n;", &str_map);
+  if (r < 0)
+    return r;
+  map<string, string>::iterator it;
+  for(it = str_map.begin(); it != str_map.end(); ++it) {
+    string this_opt = it->first + "=" + it->second;
+    rocksdb::Status status = rocksdb::GetOptionsFromString(opt, this_opt, &opt);
+    if (!status.ok()) {
+      // unrecognized by rocksdb; try to interpret it ourselves
+      r = tryInterpret(it->first, it->second, opt);
+      if (r < 0) {
+	derr << status.ToString() << dendl;
+	return -EINVAL;
+      }
+    }
+    lgeneric_dout(cct, 0) << " set rocksdb option " << it->first
+			  << " = " << it->second << dendl;
+  }
+  return 0;
+}
+
+int RocksDBStore::init(string _options_str)
+{
+  options_str = _options_str;
+  rocksdb::Options opt;
+  // try to parse the options string
+  int r = ParseOptionsFromString(options_str, opt);
+  if (r != 0) {
+    return -EINVAL;
+  }
+  return 0;
+}
+
+int RocksDBStore::create_and_open(ostream &out)
+{
+  // create tertiary paths
+  string wal_path = path + ".wal";
+  struct stat st;
+  int r = ::stat(wal_path.c_str(), &st);
+  if (r < 0)
+    r = -errno;
+  if (r == -ENOENT) {
+    size_t slashoff = path.rfind('/');
+    string target = path.substr(slashoff + 1);
+    r = ::symlink(target.c_str(), wal_path.c_str());
+    if (r < 0) {
+      out << "failed to symlink " << wal_path << " to " << target;
+      return -errno;
+    }
+  }
+  return do_open(out, true);
+}
+
+int RocksDBStore::do_open(ostream &out, bool create_if_missing)
+{
+  rocksdb::Options opt;
+  rocksdb::Status status;
+
+  int r = ParseOptionsFromString(options_str, opt);
+  if (r != 0) {
+    return -EINVAL;
+  }
+  opt.create_if_missing = create_if_missing;
+  opt.wal_dir = path + ".wal";
+
+  status = rocksdb::DB::Open(opt, path, &db);
+  if (!status.ok()) {
+    derr << status.ToString() << dendl;
+    return -EINVAL;
+  }
+
+  PerfCountersBuilder plb(g_ceph_context, "rocksdb", l_rocksdb_first, l_rocksdb_last);
+  plb.add_u64_counter(l_rocksdb_gets, "rocksdb_get", "Gets");
+  plb.add_u64_counter(l_rocksdb_txns, "rocksdb_transaction", "Transactions");
+  plb.add_time_avg(l_rocksdb_get_latency, "rocksdb_get_latency", "Get latency");
+  plb.add_time_avg(l_rocksdb_submit_latency, "rocksdb_submit_latency", "Submit Latency");
+  plb.add_time_avg(l_rocksdb_submit_sync_latency, "rocksdb_submit_sync_latency", "Submit Sync Latency");
+  plb.add_u64_counter(l_rocksdb_compact, "rocksdb_compact", "Compactions");
+  plb.add_u64_counter(l_rocksdb_compact_range, "rocksdb_compact_range", "Compactions by range");
+  plb.add_u64_counter(l_rocksdb_compact_queue_merge, "rocksdb_compact_queue_merge", "Mergings of ranges in compaction queue");
+  plb.add_u64(l_rocksdb_compact_queue_len, "rocksdb_compact_queue_len", "Length of compaction queue");
+  logger = plb.create_perf_counters();
+  cct->get_perfcounters_collection()->add(logger);
+
+  if (compact_on_mount) {
+    derr << "Compacting rocksdb store..." << dendl;
+    compact();
+    derr << "Finished compacting rocksdb store" << dendl;
+  }
+  return 0;
+}
+
+int RocksDBStore::_test_init(const string& dir)
+{
+  rocksdb::Options options;
+  options.create_if_missing = true;
+  rocksdb::DB *db;
+  rocksdb::Status status = rocksdb::DB::Open(options, dir, &db);
+  delete db;
+  return status.ok() ? 0 : -EIO;
+}
+
+RocksDBStore::~RocksDBStore()
+{
+  close();
+  delete logger;
+
+  // Ensure db is destroyed before dependent db_cache and filterpolicy
+  delete db;
+}
+
+void RocksDBStore::close()
+{
+  // stop compaction thread
+  compact_queue_lock.Lock();
+  if (compact_thread.is_started()) {
+    compact_queue_stop = true;
+    compact_queue_cond.Signal();
+    compact_queue_lock.Unlock();
+    compact_thread.join();
+  } else {
+    compact_queue_lock.Unlock();
+  }
+
+  if (logger)
+    cct->get_perfcounters_collection()->remove(logger);
+}
+
+int RocksDBStore::submit_transaction(KeyValueDB::Transaction t)
+{
+  utime_t start = ceph_clock_now(g_ceph_context);
+  RocksDBTransactionImpl * _t =
+    static_cast<RocksDBTransactionImpl *>(t.get());
+  rocksdb::WriteOptions woptions;
+  woptions.disableWAL = disableWAL;
+  rocksdb::Status s = db->Write(woptions, _t->bat);
+  utime_t lat = ceph_clock_now(g_ceph_context) - start;
+  logger->inc(l_rocksdb_txns);
+  logger->tinc(l_rocksdb_submit_latency, lat);
+  return s.ok() ? 0 : -1;
+}
+
+int RocksDBStore::submit_transaction_sync(KeyValueDB::Transaction t)
+{
+  utime_t start = ceph_clock_now(g_ceph_context);
+  RocksDBTransactionImpl * _t =
+    static_cast<RocksDBTransactionImpl *>(t.get());
+  rocksdb::WriteOptions woptions;
+  woptions.sync = true;
+  woptions.disableWAL = disableWAL;
+  rocksdb::Status s = db->Write(woptions, _t->bat);
+  utime_t lat = ceph_clock_now(g_ceph_context) - start;
+  logger->inc(l_rocksdb_txns);
+  logger->tinc(l_rocksdb_submit_sync_latency, lat);
+  return s.ok() ? 0 : -1;
+}
+
+int RocksDBStore::get_info_log_level(string info_log_level)
+{
+  if (info_log_level == "debug") {
+    return 0;
+  } else if (info_log_level == "info") {
+    return 1;
+  } else if (info_log_level == "warn") {
+    return 2;
+  } else if (info_log_level == "error") {
+    return 3;
+  } else if (info_log_level == "fatal") {
+    return 4;
+  } else {
+    return 1;
+  }
+}
+
+RocksDBStore::RocksDBTransactionImpl::RocksDBTransactionImpl(RocksDBStore *_db)
+{
+  db = _db;
+  bat = new rocksdb::WriteBatch();
+}
+
+RocksDBStore::RocksDBTransactionImpl::~RocksDBTransactionImpl()
+{
+  delete bat;
+}
+
+void RocksDBStore::RocksDBTransactionImpl::set(
+  const string &prefix,
+  const string &k,
+  const bufferlist &to_set_bl)
+{
+  string key = combine_strings(prefix, k);
+
+  // bufferlist::c_str() is non-constant, so we can't call c_str()
+  if (to_set_bl.is_contiguous() && to_set_bl.length() > 0) {
+    bat->Put(rocksdb::Slice(key),
+	     rocksdb::Slice(to_set_bl.buffers().front().c_str(),
+			    to_set_bl.length()));
+  } else {
+    // make a copy
+    bufferlist val = to_set_bl;
+    bat->Put(rocksdb::Slice(key),
+	     rocksdb::Slice(val.c_str(), val.length()));
+  }
+}
+
+void RocksDBStore::RocksDBTransactionImpl::rmkey(const string &prefix,
+					         const string &k)
+{
+  bat->Delete(combine_strings(prefix, k));
+}
+
+void RocksDBStore::RocksDBTransactionImpl::rmkeys_by_prefix(const string &prefix)
+{
+  KeyValueDB::Iterator it = db->get_iterator(prefix);
+  for (it->seek_to_first();
+       it->valid();
+       it->next()) {
+    bat->Delete(combine_strings(prefix, it->key()));
+  }
+}
+
+int RocksDBStore::get(
+    const string &prefix,
+    const std::set<string> &keys,
+    std::map<string, bufferlist> *out)
+{
+  utime_t start = ceph_clock_now(g_ceph_context);
+  KeyValueDB::Iterator it = get_iterator(prefix);
+  for (std::set<string>::const_iterator i = keys.begin();
+       i != keys.end();
+       ++i) {
+    it->lower_bound(*i);
+    if (it->valid() && it->key() == *i) {
+      out->insert(make_pair(*i, it->value()));
+    } else if (!it->valid())
+      break;
+  }
+  utime_t lat = ceph_clock_now(g_ceph_context) - start;
+  logger->inc(l_rocksdb_gets);
+  logger->tinc(l_rocksdb_get_latency, lat);
+  return 0;
+}
+
+int RocksDBStore::get(
+    const string &prefix,
+    const string &key,
+    bufferlist *out)
+{
+  utime_t start = ceph_clock_now(g_ceph_context);
+  int r = 0;
+  KeyValueDB::Iterator it = get_iterator(prefix);
+  it->lower_bound(key);
+  if (it->valid() && it->key() == key) {
+    *out = it->value();
+  } else {
+    r = -ENOENT;
+  }
+  utime_t lat = ceph_clock_now(g_ceph_context) - start;
+  logger->inc(l_rocksdb_gets);
+  logger->tinc(l_rocksdb_get_latency, lat);
+  return r;
+}
+
+string RocksDBStore::combine_strings(const string &prefix, const string &value)
+{
+  string out = prefix;
+  out.push_back(0);
+  out.append(value);
+  return out;
+}
+
+bufferlist RocksDBStore::to_bufferlist(rocksdb::Slice in)
+{
+  bufferlist bl;
+  bl.append(bufferptr(in.data(), in.size()));
+  return bl;
+}
+
+int RocksDBStore::split_key(rocksdb::Slice in, string *prefix, string *key)
+{
+  size_t prefix_len = 0;
+
+  // Find the '\0' separator inside the Slice
+  const char* separator = (const char*) memchr(in.data(), 0, in.size());
+  if (separator == NULL)
+    return -EINVAL;
+  prefix_len = size_t(separator - in.data());
+  if (prefix_len >= in.size())
+    return -EINVAL;
+
+  // Fetch prefix and/or key directly from Slice
+  if (prefix)
+    *prefix = string(in.data(), prefix_len);
+  if (key)
+    *key = string(separator+1, in.size()-prefix_len-1);
+  return 0;
+}
+
+void RocksDBStore::compact()
+{
+  logger->inc(l_rocksdb_compact);
+  db->CompactRange(NULL, NULL);
+}
+
+
+void RocksDBStore::compact_thread_entry()
+{
+  compact_queue_lock.Lock();
+  while (!compact_queue_stop) {
+    while (!compact_queue.empty()) {
+      pair<string,string> range = compact_queue.front();
+      compact_queue.pop_front();
+      logger->set(l_rocksdb_compact_queue_len, compact_queue.size());
+      compact_queue_lock.Unlock();
+      logger->inc(l_rocksdb_compact_range);
+      compact_range(range.first, range.second);
+      compact_queue_lock.Lock();
+      continue;
+    }
+    compact_queue_cond.Wait(compact_queue_lock);
+  }
+  compact_queue_lock.Unlock();
+}
+
+void RocksDBStore::compact_range_async(const string& start, const string& end)
+{
+  Mutex::Locker l(compact_queue_lock);
+
+  // try to merge adjacent ranges.  this is O(n), but the queue should
+  // be short.  note that we do not cover all overlap cases and merge
+  // opportunities here, but we capture the ones we currently need.
+  list< pair<string,string> >::iterator p = compact_queue.begin();
+  while (p != compact_queue.end()) {
+    if (p->first == start && p->second == end) {
+      // dup; no-op
+      return;
+    }
+    if (p->first <= end && p->first > start) {
+      // merge with existing range to the right
+      compact_queue.push_back(make_pair(start, p->second));
+      compact_queue.erase(p);
+      logger->inc(l_rocksdb_compact_queue_merge);
+      break;
+    }
+    if (p->second >= start && p->second < end) {
+      // merge with existing range to the left
+      compact_queue.push_back(make_pair(p->first, end));
+      compact_queue.erase(p);
+      logger->inc(l_rocksdb_compact_queue_merge);
+      break;
+    }
+    ++p;
+  }
+  if (p == compact_queue.end()) {
+    // no merge, new entry.
+    compact_queue.push_back(make_pair(start, end));
+    logger->set(l_rocksdb_compact_queue_len, compact_queue.size());
+  }
+  compact_queue_cond.Signal();
+  if (!compact_thread.is_started()) {
+    compact_thread.create();
+  }
+}
+
+bool RocksDBStore::check_omap_dir(string &omap_dir)
+{
+  rocksdb::Options options;
+  options.create_if_missing = true;
+  rocksdb::DB *db;
+  rocksdb::Status status = rocksdb::DB::Open(options, omap_dir, &db);
+  delete db;
+  return status.ok();
+}
+
+void RocksDBStore::compact_range(const string& start, const string& end)
+{
+  rocksdb::Slice cstart(start);
+  rocksdb::Slice cend(end);
+  db->CompactRange(&cstart, &cend);
+}
+
+RocksDBStore::RocksDBWholeSpaceIteratorImpl::~RocksDBWholeSpaceIteratorImpl()
+{
+  delete dbiter;
+}
+
+int RocksDBStore::RocksDBWholeSpaceIteratorImpl::seek_to_first()
+{
+  dbiter->SeekToFirst();
+  return dbiter->status().ok() ? 0 : -1;
+}
+
+int RocksDBStore::RocksDBWholeSpaceIteratorImpl::seek_to_first(const string &prefix)
+{
+  rocksdb::Slice slice_prefix(prefix);
+  dbiter->Seek(slice_prefix);
+  return dbiter->status().ok() ? 0 : -1;
+}
+
+int RocksDBStore::RocksDBWholeSpaceIteratorImpl::seek_to_last()
+{
+  dbiter->SeekToLast();
+  return dbiter->status().ok() ? 0 : -1;
+}
+
+int RocksDBStore::RocksDBWholeSpaceIteratorImpl::seek_to_last(const string &prefix)
+{
+  string limit = past_prefix(prefix);
+  rocksdb::Slice slice_limit(limit);
+  dbiter->Seek(slice_limit);
+
+  if (!dbiter->Valid()) {
+    dbiter->SeekToLast();
+  } else {
+    dbiter->Prev();
+  }
+  return dbiter->status().ok() ? 0 : -1;
+}
+
+int RocksDBStore::RocksDBWholeSpaceIteratorImpl::upper_bound(const string &prefix, const string &after)
+{
+  lower_bound(prefix, after);
+  if (valid()) {
+    pair<string,string> key = raw_key();
+    if (key.first == prefix && key.second == after)
+      next();
+  }
+  return dbiter->status().ok() ? 0 : -1;
+}
+
+int RocksDBStore::RocksDBWholeSpaceIteratorImpl::lower_bound(const string &prefix, const string &to)
+{
+  string bound = combine_strings(prefix, to);
+  rocksdb::Slice slice_bound(bound);
+  dbiter->Seek(slice_bound);
+  return dbiter->status().ok() ? 0 : -1;
+}
+
+bool RocksDBStore::RocksDBWholeSpaceIteratorImpl::valid()
+{
+  return dbiter->Valid();
+}
+
+int RocksDBStore::RocksDBWholeSpaceIteratorImpl::next()
+{
+  if (valid())
+    dbiter->Next();
+  return dbiter->status().ok() ? 0 : -1;
+}
+
+int RocksDBStore::RocksDBWholeSpaceIteratorImpl::prev()
+{
+  if (valid())
+    dbiter->Prev();
+  return dbiter->status().ok() ? 0 : -1;
+}
+string RocksDBStore::RocksDBWholeSpaceIteratorImpl::key()
+{
+  string out_key;
+  split_key(dbiter->key(), 0, &out_key);
+  return out_key;
+}
+pair<string,string> RocksDBStore::RocksDBWholeSpaceIteratorImpl::raw_key()
+{
+  string prefix, key;
+  split_key(dbiter->key(), &prefix, &key);
+  return make_pair(prefix, key);
+}
+
+bool RocksDBStore::RocksDBWholeSpaceIteratorImpl::raw_key_is_prefixed(const string &prefix) {
+  // Look for "prefix\0" right in rocksb::Slice
+  rocksdb::Slice key = dbiter->key();
+  if ((key.size() > prefix.length()) && (key[prefix.length()] == '\0')) {
+    return memcmp(key.data(), prefix.c_str(), prefix.length()) == 0;
+  } else {
+    return false;
+  }
+}
+
+bufferlist RocksDBStore::RocksDBWholeSpaceIteratorImpl::value()
+{
+  return to_bufferlist(dbiter->value());
+}
+int RocksDBStore::RocksDBWholeSpaceIteratorImpl::status()
+{
+  return dbiter->status().ok() ? 0 : -1;
+}
+
+string RocksDBStore::past_prefix(const string &prefix)
+{
+  string limit = prefix;
+  limit.push_back(1);
+  return limit;
+}
+
+
+RocksDBStore::WholeSpaceIterator RocksDBStore::_get_iterator()
+{
+  return std::shared_ptr<KeyValueDB::WholeSpaceIteratorImpl>(
+    new RocksDBWholeSpaceIteratorImpl(
+      db->NewIterator(rocksdb::ReadOptions())
+    )
+  );
+}
+
+RocksDBStore::WholeSpaceIterator RocksDBStore::_get_snapshot_iterator()
+{
+  const rocksdb::Snapshot *snapshot;
+  rocksdb::ReadOptions options;
+
+  snapshot = db->GetSnapshot();
+  options.snapshot = snapshot;
+
+  return std::shared_ptr<KeyValueDB::WholeSpaceIteratorImpl>(
+    new RocksDBSnapshotIteratorImpl(db, snapshot,
+      db->NewIterator(options))
+  );
+}
+
+RocksDBStore::RocksDBSnapshotIteratorImpl::~RocksDBSnapshotIteratorImpl()
+{
+  db->ReleaseSnapshot(snapshot);
+}
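
init() above takes a free-form option string; ParseOptionsFromString() splits
it on ',', ';' or newlines, offers each token to
rocksdb::GetOptionsFromString(), and falls back to tryInterpret() for the four
store-local keys (compaction_threads, flusher_threads, compact_on_mount,
disableWAL). A hedged usage sketch (the path is hypothetical; assumes a valid
CephContext*):

    #include "kv/RocksDBStore.h"
    #include <sstream>

    int open_example(CephContext *cct)
    {
      RocksDBStore store(cct, "/tmp/rocksdb-example");
      // one store-local key and one handled by tryInterpret(), ';'-separated
      int r = store.init("compaction_threads=2;compact_on_mount=true");
      if (r < 0)
        return r;
      std::ostringstream out;
      return store.create_and_open(out);  // also sets up the .wal symlink
    }
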
diff --git a/src/kv/RocksDBStore.h b/src/kv/RocksDBStore.h
new file mode 100644
index 0000000..90523c4
--- /dev/null
+++ b/src/kv/RocksDBStore.h
@@ -0,0 +1,285 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef ROCKS_DB_STORE_H
+#define ROCKS_DB_STORE_H
+
+#include "include/types.h"
+#include "include/buffer.h"
+#include "KeyValueDB.h"
+#include <set>
+#include <map>
+#include <string>
+#include <memory>
+#include <boost/scoped_ptr.hpp>
+
+#include <errno.h>
+#include "common/errno.h"
+#include "common/dout.h"
+#include "include/assert.h"
+#include "common/Formatter.h"
+#include "common/Cond.h"
+
+#include "common/ceph_context.h"
+class PerfCounters;
+
+enum {
+  l_rocksdb_first = 34300,
+  l_rocksdb_gets,
+  l_rocksdb_txns,
+  l_rocksdb_get_latency,
+  l_rocksdb_submit_latency,
+  l_rocksdb_submit_sync_latency,
+  l_rocksdb_compact,
+  l_rocksdb_compact_range,
+  l_rocksdb_compact_queue_merge,
+  l_rocksdb_compact_queue_len,
+  l_rocksdb_last,
+};
+
+namespace rocksdb {
+  class DB;
+  class Cache;
+  class FilterPolicy;
+  class Snapshot;
+  class Slice;
+  class WriteBatch;
+  class Iterator;
+  struct Options;
+}
+/**
+ * Uses RocksDB to implement the KeyValueDB interface
+ */
+class RocksDBStore : public KeyValueDB {
+  CephContext *cct;
+  PerfCounters *logger;
+  string path;
+  rocksdb::DB *db;
+  string options_str;
+  int do_open(ostream &out, bool create_if_missing);
+
+  // manage async compactions
+  Mutex compact_queue_lock;
+  Cond compact_queue_cond;
+  list< pair<string,string> > compact_queue;
+  bool compact_queue_stop;
+  class CompactThread : public Thread {
+    RocksDBStore *db;
+  public:
+    CompactThread(RocksDBStore *d) : db(d) {}
+    void *entry() {
+      db->compact_thread_entry();
+      return NULL;
+    }
+    friend class RocksDBStore;
+  } compact_thread;
+
+  void compact_thread_entry();
+
+  void compact_range(const string& start, const string& end);
+  void compact_range_async(const string& start, const string& end);
+
+public:
+  bool compact_on_mount;
+  bool disableWAL;
+  /// compact the underlying rocksdb store
+  void compact();
+
+  int tryInterpret(const string key, const string val, rocksdb::Options &opt);
+  int ParseOptionsFromString(const string opt_str, rocksdb::Options &opt);
+  static int _test_init(const string& dir);
+  int init(string options_str);
+  /// compact rocksdb for all keys with a given prefix
+  void compact_prefix(const string& prefix) {
+    compact_range(prefix, past_prefix(prefix));
+  }
+  void compact_prefix_async(const string& prefix) {
+    compact_range_async(prefix, past_prefix(prefix));
+  }
+
+  void compact_range(const string& prefix, const string& start, const string& end) {
+    compact_range(combine_strings(prefix, start), combine_strings(prefix, end));
+  }
+  void compact_range_async(const string& prefix, const string& start, const string& end) {
+    compact_range_async(combine_strings(prefix, start), combine_strings(prefix, end));
+  }
+  int get_info_log_level(string info_log_level);
+
+  RocksDBStore(CephContext *c, const string &path) :
+    cct(c),
+    logger(NULL),
+    path(path),
+    db(NULL),
+    compact_queue_lock("RocksDBStore::compact_thread_lock"),
+    compact_queue_stop(false),
+    compact_thread(this),
+    compact_on_mount(false),
+    disableWAL(false)
+  {}
+
+  ~RocksDBStore();
+
+  static bool check_omap_dir(string &omap_dir);
+  /// Opens underlying db
+  int open(ostream &out) {
+    return do_open(out, false);
+  }
+  /// Creates underlying db if missing and opens it
+  int create_and_open(ostream &out);
+
+  void close();
+
+  class RocksDBTransactionImpl : public KeyValueDB::TransactionImpl {
+  public:
+    rocksdb::WriteBatch *bat;
+    RocksDBStore *db;
+
+    RocksDBTransactionImpl(RocksDBStore *_db);
+    ~RocksDBTransactionImpl();
+    void set(
+      const string &prefix,
+      const string &k,
+      const bufferlist &bl);
+    void rmkey(
+      const string &prefix,
+      const string &k);
+    void rmkeys_by_prefix(
+      const string &prefix
+      );
+  };
+
+  KeyValueDB::Transaction get_transaction() {
+    return std::shared_ptr< RocksDBTransactionImpl >(
+      new RocksDBTransactionImpl(this));
+  }
+
+  int submit_transaction(KeyValueDB::Transaction t);
+  int submit_transaction_sync(KeyValueDB::Transaction t);
+  int get(
+    const string &prefix,
+    const std::set<string> &keys,
+    std::map<string, bufferlist> *out
+    );
+  int get(
+    const string &prefix,
+    const string &key,
+    bufferlist *out
+    );
+
+  class RocksDBWholeSpaceIteratorImpl :
+    public KeyValueDB::WholeSpaceIteratorImpl {
+  protected:
+    rocksdb::Iterator *dbiter;
+  public:
+    RocksDBWholeSpaceIteratorImpl(rocksdb::Iterator *iter) :
+      dbiter(iter) { }
+    ~RocksDBWholeSpaceIteratorImpl();
+
+    int seek_to_first();
+    int seek_to_first(const string &prefix);
+    int seek_to_last();
+    int seek_to_last(const string &prefix);
+    int upper_bound(const string &prefix, const string &after);
+    int lower_bound(const string &prefix, const string &to);
+    bool valid();
+    int next();
+    int prev();
+    string key();
+    pair<string,string> raw_key();
+    bool raw_key_is_prefixed(const string &prefix);
+    bufferlist value();
+    int status();
+  };
+
+  class RocksDBSnapshotIteratorImpl : public RocksDBWholeSpaceIteratorImpl {
+    rocksdb::DB *db;
+    const rocksdb::Snapshot *snapshot;
+  public:
+    RocksDBSnapshotIteratorImpl(rocksdb::DB *db, const rocksdb::Snapshot *s,
+				rocksdb::Iterator *iter) :
+      RocksDBWholeSpaceIteratorImpl(iter), db(db), snapshot(s) { }
+
+    ~RocksDBSnapshotIteratorImpl();
+  };
+
+  /// Utility
+  static string combine_strings(const string &prefix, const string &value);
+  static int split_key(rocksdb::Slice in, string *prefix, string *key);
+  static bufferlist to_bufferlist(rocksdb::Slice in);
+  static string past_prefix(const string &prefix);
+
+  virtual uint64_t get_estimated_size(map<string,uint64_t> &extra) {
+    DIR *store_dir = opendir(path.c_str());
+    if (!store_dir) {
+      lderr(cct) << __func__ << " something happened opening the store: "
+                 << cpp_strerror(errno) << dendl;
+      return 0;
+    }
+
+    uint64_t total_size = 0;
+    uint64_t sst_size = 0;
+    uint64_t log_size = 0;
+    uint64_t misc_size = 0;
+
+    struct dirent *entry = NULL;
+    while ((entry = readdir(store_dir)) != NULL) {
+      string n(entry->d_name);
+
+      if (n == "." || n == "..")
+        continue;
+
+      string fpath = path + '/' + n;
+      struct stat s;
+      int err = stat(fpath.c_str(), &s);
+      if (err < 0)
+	err = -errno;
+      // we may race against rocksdb while reading files; this should only
+      // happen when those files are being updated, data is being shuffled
+      // and files get removed, in which case there's not much of a problem
+      // as we'll get to them next time around.
+      if (err == -ENOENT) {
+	continue;
+      }
+      if (err < 0) {
+        lderr(cct) << __func__ << " error obtaining stats for " << fpath
+                   << ": " << cpp_strerror(err) << dendl;
+        goto err;
+      }
+
+      size_t pos = n.find_last_of('.');
+      if (pos == string::npos) {
+        misc_size += s.st_size;
+        continue;
+      }
+
+      string ext = n.substr(pos+1);
+      if (ext == "sst") {
+        sst_size += s.st_size;
+      } else if (ext == "log") {
+        log_size += s.st_size;
+      } else {
+        misc_size += s.st_size;
+      }
+    }
+
+    total_size = sst_size + log_size + misc_size;
+
+    extra["sst"] = sst_size;
+    extra["log"] = log_size;
+    extra["misc"] = misc_size;
+    extra["total"] = total_size;
+
+err:
+    closedir(store_dir);
+    return total_size;
+  }
+
+
+protected:
+  WholeSpaceIterator _get_iterator();
+
+  WholeSpaceIterator _get_snapshot_iterator();
+
+};
+
+#endif
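
RocksDBSnapshotIteratorImpl above ties the snapshot's lifetime to the
iterator: the snapshot taken in _get_snapshot_iterator() is released in the
destructor, so dropping the returned shared_ptr releases it automatically. A
generic sketch of this RAII pairing (the Db type here is invented, not the
rocksdb API):

    #include <iostream>
    #include <memory>

    struct Db {
      int get_snapshot() { std::cout << "snapshot taken\n"; return 42; }
      void release_snapshot(int) { std::cout << "snapshot released\n"; }
    };

    class SnapshotIter {
      Db *db;
      int snap;
    public:
      explicit SnapshotIter(Db *d) : db(d), snap(d->get_snapshot()) {}
      ~SnapshotIter() { db->release_snapshot(snap); }  // RAII release
    };

    int main() {
      Db db;
      std::shared_ptr<SnapshotIter> it = std::make_shared<SnapshotIter>(&db);
      it.reset();  // last reference gone -> snapshot released
      return 0;
    }
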
diff --git a/src/libcephfs.cc b/src/libcephfs.cc
index 40cd028..849b1c0 100644
--- a/src/libcephfs.cc
+++ b/src/libcephfs.cc
@@ -1453,8 +1453,8 @@ extern "C" int ceph_ll_fsync(class ceph_mount_info *cmount,
   return (cmount->get_client()->ll_fsync(fh, syncdataonly));
 }
 
-extern "C" loff_t ceph_ll_lseek(class ceph_mount_info *cmount,
-				Fh *fh, loff_t offset, int whence)
+extern "C" off_t ceph_ll_lseek(class ceph_mount_info *cmount,
+				Fh *fh, off_t offset, int whence)
 {
   return (cmount->get_client()->ll_lseek(fh, offset, whence));
 }
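
Switching ceph_ll_lseek() from loff_t to off_t keeps the C API portable:
loff_t is a Linux/glibc extension, while off_t is POSIX (and 64 bits wide
when built with _FILE_OFFSET_BITS=64). A hypothetical caller is unchanged
apart from the type (cmount and fh assumed to be an active mount and an
open handle):

    off_t end = ceph_ll_lseek(cmount, fh, 0, SEEK_END);  // seek to EOF
    if (end < 0) {
      // negative return carries the (negated) error code
    }
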
diff --git a/src/librados/IoCtxImpl.cc b/src/librados/IoCtxImpl.cc
index 945dbec..3e68212 100644
--- a/src/librados/IoCtxImpl.cc
+++ b/src/librados/IoCtxImpl.cc
@@ -25,6 +25,121 @@
 #undef dout_prefix
 #define dout_prefix *_dout << "librados: "
 
+namespace librados {
+namespace {
+
+struct C_notify_Finish : public Context {
+  CephContext *cct;
+  Context *ctx;
+  Objecter *objecter;
+  Objecter::LingerOp *linger_op;
+  bufferlist reply_bl;
+  bufferlist *preply_bl;
+  char **preply_buf;
+  size_t *preply_buf_len;
+
+  C_notify_Finish(CephContext *_cct, Context *_ctx, Objecter *_objecter,
+                  Objecter::LingerOp *_linger_op, bufferlist *_preply_bl,
+                  char **_preply_buf, size_t *_preply_buf_len)
+    : cct(_cct), ctx(_ctx), objecter(_objecter), linger_op(_linger_op),
+      preply_bl(_preply_bl), preply_buf(_preply_buf),
+      preply_buf_len(_preply_buf_len)
+  {
+    linger_op->on_notify_finish = this;
+    linger_op->notify_result_bl = &reply_bl;
+  }
+
+  virtual void finish(int r)
+  {
+    ldout(cct, 10) << __func__ << " completed notify (linger op "
+                   << linger_op << "), r = " << r << dendl;
+
+    // pass result back to user
+    // NOTE: we do this regardless of what error code we return
+    if (preply_buf) {
+      if (reply_bl.length()) {
+        *preply_buf = (char*)malloc(reply_bl.length());
+        memcpy(*preply_buf, reply_bl.c_str(), reply_bl.length());
+      } else {
+        *preply_buf = NULL;
+      }
+    }
+    if (preply_buf_len)
+      *preply_buf_len = reply_bl.length();
+    if (preply_bl)
+      preply_bl->claim(reply_bl);
+
+    ctx->complete(r);
+  }
+};
+
+struct C_aio_linger_cancel : public Context {
+  Objecter *objecter;
+  Objecter::LingerOp *linger_op;
+
+  C_aio_linger_cancel(Objecter *_objecter, Objecter::LingerOp *_linger_op)
+    : objecter(_objecter), linger_op(_linger_op)
+  {
+  }
+
+  virtual void finish(int r)
+  {
+    objecter->linger_cancel(linger_op);
+  }
+};
+
+struct C_aio_notify_Complete : public Context {
+  AioCompletionImpl *c;
+  Objecter::LingerOp *linger_op;
+
+  C_aio_notify_Complete(AioCompletionImpl *_c, Objecter::LingerOp *_linger_op)
+    : c(_c), linger_op(_linger_op)
+  {
+    c->get();
+  }
+
+  virtual void finish(int r) {
+    c->io->client->finisher.queue(new C_aio_linger_cancel(c->io->objecter,
+                                                          linger_op));
+
+    c->lock.Lock();
+    c->rval = r;
+    c->ack = true;
+    c->safe = true;
+    c->cond.Signal();
+
+    if (c->callback_complete) {
+      c->io->client->finisher.queue(new C_AioComplete(c));
+    }
+    if (c->callback_safe) {
+      c->io->client->finisher.queue(new C_AioSafe(c));
+    }
+    c->put_unlock();
+  }
+};
+
+struct C_aio_notify_Ack : public Context {
+  CephContext *cct;
+  C_notify_Finish *f;
+
+  C_aio_notify_Ack(CephContext *_cct, C_notify_Finish *_f)
+    : cct(_cct), f(_f)
+  {
+  }
+
+  virtual void finish(int r)
+  {
+    ldout(cct, 10) << __func__ << " linger op " << f->linger_op << " acked ("
+                   << r << ")" << dendl;
+    if (r < 0) {
+      f->complete(r);
+    }
+  }
+};
+
+} // anonymous namespace
+} // namespace librados
+
 librados::IoCtxImpl::IoCtxImpl() :
   ref_cnt(0), client(NULL), poolid(0), assert_ver(0), last_objver(0),
   notify_timeout(30), aio_write_list_lock("librados::IoCtxImpl::aio_write_list_lock"),
@@ -262,7 +377,6 @@ int librados::IoCtxImpl::rollback(const object_t& oid, const char *snapName)
   if (r < 0) {
     return r;
   }
-  string sName(snapName);
 
   return selfmanaged_snap_rollback_object(oid, snapc, snap);
 }
@@ -1120,6 +1234,7 @@ int librados::IoCtxImpl::watch(const object_t& oid,
 
   if (r < 0) {
     objecter->linger_cancel(linger_op);
+    *handle = 0;
   }
 
   return r;
@@ -1169,52 +1284,22 @@ int librados::IoCtxImpl::notify(const object_t& oid, bufferlist& bl,
 				bufferlist *preply_bl,
 				char **preply_buf, size_t *preply_buf_len)
 {
-  bufferlist inbl;
-
-  struct C_NotifyFinish : public Context {
-    Cond cond;
-    Mutex lock;
-    bool done;
-    int result;
-    bufferlist reply_bl;
-
-    C_NotifyFinish()
-      : lock("IoCtxImpl::notify::C_NotifyFinish::lock"),
-	done(false),
-	result(0) { }
-
-    void finish(int r) {}
-    void complete(int r) {
-      lock.Lock();
-      done = true;
-      result = r;
-      cond.Signal();
-      lock.Unlock();
-    }
-    void wait() {
-      lock.Lock();
-      while (!done)
-	cond.Wait(lock);
-      lock.Unlock();
-    }
-  } notify_private;
-
   Objecter::LingerOp *linger_op = objecter->linger_register(oid, oloc, 0);
-  linger_op->on_notify_finish = &notify_private;
-  linger_op->notify_result_bl = &notify_private.reply_bl;
 
-  uint32_t prot_ver = 1;
+  C_SaferCond notify_finish_cond;
+  Context *notify_finish = new C_notify_Finish(client->cct, &notify_finish_cond,
+                                               objecter, linger_op, preply_bl,
+                                               preply_buf, preply_buf_len);
+
   uint32_t timeout = notify_timeout;
   if (timeout_ms)
     timeout = timeout_ms / 1000;
-  ::encode(prot_ver, inbl);
-  ::encode(timeout, inbl);
-  ::encode(bl, inbl);
 
   // Construct RADOS op
   ::ObjectOperation rd;
   prepare_assert_ops(&rd);
-  rd.notify(linger_op->get_cookie(), inbl);
+  bufferlist inbl;
+  rd.notify(linger_op->get_cookie(), 1, timeout, bl, &inbl);
 
   // Issue RADOS op
   C_SaferCond onack;
@@ -1224,44 +1309,58 @@ int librados::IoCtxImpl::notify(const object_t& oid, bufferlist& bl,
 			  &onack, &objver);
 
   ldout(client->cct, 10) << __func__ << " issued linger op " << linger_op << dendl;
-  int r_issue = onack.wait();
+  int r = onack.wait();
   ldout(client->cct, 10) << __func__ << " linger op " << linger_op
-			 << " acked (" << r_issue << ")" << dendl;
+			 << " acked (" << r << ")" << dendl;
 
-  if (r_issue == 0) {
+  if (r == 0) {
     ldout(client->cct, 10) << __func__ << " waiting for watch_notify finish "
 			   << linger_op << dendl;
-    notify_private.wait();
+    r = notify_finish_cond.wait();
 
-    ldout(client->cct, 10) << __func__ << " completed notify (linger op "
-			   << linger_op << "), r = " << notify_private.result
-			   << dendl;
   } else {
     ldout(client->cct, 10) << __func__ << " failed to initiate notify, r = "
-			   << r_issue << dendl;
+			   << r << dendl;
+    notify_finish->complete(r);
   }
 
-  // pass result back to user
-  // NOTE: we do this regardless of what error code we return
-  if (preply_buf) {
-    if (notify_private.reply_bl.length()) {
-      *preply_buf = (char*)malloc(notify_private.reply_bl.length());
-      memcpy(*preply_buf, notify_private.reply_bl.c_str(),
-	     notify_private.reply_bl.length());
-    } else {
-      *preply_buf = NULL;
-    }
-  }
-  if (preply_buf_len)
-    *preply_buf_len = notify_private.reply_bl.length();
-  if (preply_bl)
-    preply_bl->claim(notify_private.reply_bl);
-
   objecter->linger_cancel(linger_op);
 
   set_sync_op_version(objver);
+  return r;
+}
 
-  return r_issue ? r_issue : notify_private.result;
+int librados::IoCtxImpl::aio_notify(const object_t& oid, AioCompletionImpl *c,
+                                    bufferlist& bl, uint64_t timeout_ms,
+                                    bufferlist *preply_bl, char **preply_buf,
+                                    size_t *preply_buf_len)
+{
+  Objecter::LingerOp *linger_op = objecter->linger_register(oid, oloc, 0);
+
+  c->io = this;
+
+  Context *oncomplete = new C_aio_notify_Complete(c, linger_op);
+  C_notify_Finish *onnotify = new C_notify_Finish(client->cct, oncomplete,
+                                                  objecter, linger_op,
+                                                  preply_bl, preply_buf,
+                                                  preply_buf_len);
+  Context *onack = new C_aio_notify_Ack(client->cct, onnotify);
+
+  uint32_t timeout = notify_timeout;
+  if (timeout_ms)
+    timeout = timeout_ms / 1000;
+
+  // Construct RADOS op
+  ::ObjectOperation rd;
+  prepare_assert_ops(&rd);
+  bufferlist inbl;
+  rd.notify(linger_op->get_cookie(), 1, timeout, bl, &inbl);
+
+  // Issue RADOS op
+  objecter->linger_notify(linger_op,
+			  rd, snap_seq, inbl, NULL,
+			  onack, NULL);
+  return 0;
 }
 
 int librados::IoCtxImpl::set_alloc_hint(const object_t& oid,
@@ -1294,6 +1393,22 @@ void librados::IoCtxImpl::set_notify_timeout(uint32_t timeout)
   notify_timeout = timeout;
 }
 
+int librados::IoCtxImpl::cache_pin(const object_t& oid)
+{
+  ::ObjectOperation wr;
+  prepare_assert_ops(&wr);
+  wr.cache_pin();
+  return operate(oid, &wr, NULL);
+}
+
+int librados::IoCtxImpl::cache_unpin(const object_t& oid)
+{
+  ::ObjectOperation wr;
+  prepare_assert_ops(&wr);
+  wr.cache_unpin();
+  return operate(oid, &wr, NULL);
+}
+
 
 ///////////////////////////// C_aio_Ack ////////////////////////////////
 
@@ -1379,4 +1494,3 @@ void librados::IoCtxImpl::C_aio_Safe::finish(int r)
 
   c->put_unlock();
 }
-
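
With the completion contexts factored out above, the synchronous notify()
and the new aio_notify() share one reply-marshalling path (C_notify_Finish);
the async variant chains C_aio_notify_Ack -> C_notify_Finish ->
C_aio_notify_Complete -> C_aio_linger_cancel so the linger op is torn down
off the completion thread. Through the public C++ API added later in this
patch, the async path looks roughly like this (sketch; object name and
timeout are illustrative):

    librados::bufferlist bl, reply;
    librados::AioCompletion *comp = librados::Rados::aio_create_completion();
    int r = io_ctx.aio_notify("myobject", comp, bl, 10000 /* ms */, &reply);
    if (r == 0) {
      comp->wait_for_complete();     // or keep working and poll later
      r = comp->get_return_value();  // result once watchers have replied
    }
    comp->release();
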
diff --git a/src/librados/IoCtxImpl.h b/src/librados/IoCtxImpl.h
index df73b03..b0a1b19 100644
--- a/src/librados/IoCtxImpl.h
+++ b/src/librados/IoCtxImpl.h
@@ -206,6 +206,9 @@ struct librados::IoCtxImpl {
 	     bufferlist *preplybl, char **preply_buf, size_t *preply_buf_len);
   int notify_ack(const object_t& oid, uint64_t notify_id, uint64_t cookie,
 		 bufferlist& bl);
+  int aio_notify(const object_t& oid, AioCompletionImpl *c, bufferlist& bl,
+                 uint64_t timeout_ms, bufferlist *preplybl, char **preply_buf,
+                 size_t *preply_buf_len);
 
   int set_alloc_hint(const object_t& oid,
                      uint64_t expected_object_size,
@@ -216,6 +219,9 @@ struct librados::IoCtxImpl {
   void set_assert_src_version(const object_t& oid, uint64_t ver);
   void set_notify_timeout(uint32_t timeout);
 
+  int cache_pin(const object_t& oid);
+  int cache_unpin(const object_t& oid);
+
 };
 
 #endif
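
cache_pin()/cache_unpin() are also surfaced as ObjectWriteOperation hooks
further down in librados.cc, so a pin can be batched with other mutations
in one compound op. A hypothetical caller (object name and pinner_bl are
illustrative):

    librados::ObjectWriteOperation op;
    op.cache_pin();                       // keep object in the cache tier
    op.setxattr("pinned_by", pinner_bl);  // other write ops batch with it
    int r = io_ctx.operate("hot_object", &op);
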
diff --git a/src/librados/RadosClient.cc b/src/librados/RadosClient.cc
index 08ed909..1acde03 100644
--- a/src/librados/RadosClient.cc
+++ b/src/librados/RadosClient.cc
@@ -94,30 +94,70 @@ int64_t librados::RadosClient::lookup_pool(const char *name)
 
 bool librados::RadosClient::pool_requires_alignment(int64_t pool_id)
 {
+  bool requires;
+  int r = pool_requires_alignment2(pool_id, &requires);
+  if (r < 0) {
+    // Fall back to false. This is somewhat problematic, since we
+    // do not actually know the answer when the lookup fails.
+    return false;
+  }
+
+  return requires;
+}
+
+// a safer version of pool_requires_alignment
+int librados::RadosClient::pool_requires_alignment2(int64_t pool_id,
+	bool *requires)
+{
+  if (!requires)
+    return -EINVAL;
+
   int r = wait_for_osdmap();
   if (r < 0) {
     return r;
   }
 
   const OSDMap *osdmap = objecter->get_osdmap_read();
-  bool ret = osdmap->have_pg_pool(pool_id) &&
-    osdmap->get_pg_pool(pool_id)->requires_aligned_append();
+  if (!osdmap->have_pg_pool(pool_id)) {
+    objecter->put_osdmap_read();
+    return -ENOENT;
+  }
+  *requires = osdmap->get_pg_pool(pool_id)->requires_aligned_append();
   objecter->put_osdmap_read();
-  return ret;
+  return 0;
 }
 
 uint64_t librados::RadosClient::pool_required_alignment(int64_t pool_id)
 {
+  uint64_t alignment;
+  int r = pool_required_alignment2(pool_id, &alignment);
+  if (r < 0) {
+    return 0;
+  }
+
+  return alignment;
+}
+
+// a safer version of pool_required_alignment
+int librados::RadosClient::pool_required_alignment2(int64_t pool_id,
+	uint64_t *alignment)
+{
+  if (!alignment)
+    return -EINVAL;
+
   int r = wait_for_osdmap();
   if (r < 0) {
     return r;
   }
 
   const OSDMap *osdmap = objecter->get_osdmap_read();
-  uint64_t ret = osdmap->have_pg_pool(pool_id) ?
-    osdmap->get_pg_pool(pool_id)->required_alignment() : 0;
+  if (!osdmap->have_pg_pool(pool_id)) {
+    objecter->put_osdmap_read();
+    return -ENOENT;
+  }
+  *alignment = osdmap->get_pg_pool(pool_id)->required_alignment();
   objecter->put_osdmap_read();
-  return ret;
+  return 0;
 }
 
 int librados::RadosClient::pool_get_auid(uint64_t pool_id, unsigned long long *auid)
@@ -266,8 +306,6 @@ int librados::RadosClient::connect()
 
   lock.Unlock();
 
-  cct->_conf->call_all_observers();
-
   ldout(cct, 1) << "init done" << dendl;
   err = 0;
 
diff --git a/src/librados/RadosClient.h b/src/librados/RadosClient.h
index d44336f..a26e46f 100644
--- a/src/librados/RadosClient.h
+++ b/src/librados/RadosClient.h
@@ -93,7 +93,9 @@ public:
   int get_fsid(std::string *s);
   int64_t lookup_pool(const char *name);
   bool pool_requires_alignment(int64_t pool_id);
+  int pool_requires_alignment2(int64_t pool_id, bool *requires);
   uint64_t pool_required_alignment(int64_t pool_id);
+  int pool_required_alignment2(int64_t pool_id, uint64_t *alignment);
   int pool_get_auid(uint64_t pool_id, unsigned long long *auid);
   int pool_get_name(uint64_t pool_id, std::string *auid);
 
diff --git a/src/librados/RadosXattrIter.cc b/src/librados/RadosXattrIter.cc
index c0f23f6..f4fb39d 100644
--- a/src/librados/RadosXattrIter.cc
+++ b/src/librados/RadosXattrIter.cc
@@ -12,6 +12,8 @@
  *
  */
 
+#include <stdlib.h>
+
 #include "RadosXattrIter.h"
 
 librados::RadosXattrsIter::RadosXattrsIter()
diff --git a/src/librados/librados.cc b/src/librados/librados.cc
index 7a193a3..403b5b0 100644
--- a/src/librados/librados.cc
+++ b/src/librados/librados.cc
@@ -513,6 +513,18 @@ void librados::ObjectWriteOperation::set_alloc_hint(
   o->set_alloc_hint(expected_object_size, expected_write_size);
 }
 
+void librados::ObjectWriteOperation::cache_pin()
+{
+  ::ObjectOperation *o = (::ObjectOperation *)impl;
+  o->cache_pin();
+}
+
+void librados::ObjectWriteOperation::cache_unpin()
+{
+  ::ObjectOperation *o = (::ObjectOperation *)impl;
+  o->cache_unpin();
+}
+
 librados::WatchCtx::
 ~WatchCtx()
 {
@@ -1074,11 +1086,21 @@ bool librados::IoCtx::pool_requires_alignment()
   return io_ctx_impl->client->pool_requires_alignment(get_id());
 }
 
+int librados::IoCtx::pool_requires_alignment2(bool *requires)
+{
+  return io_ctx_impl->client->pool_requires_alignment2(get_id(), requires);
+}
+
 uint64_t librados::IoCtx::pool_required_alignment()
 {
   return io_ctx_impl->client->pool_required_alignment(get_id());
 }
 
+int librados::IoCtx::pool_required_alignment2(uint64_t *alignment)
+{
+  return io_ctx_impl->client->pool_required_alignment2(get_id(), alignment);
+}
+
 std::string librados::IoCtx::get_pool_name()
 {
   std::string s;
@@ -1157,14 +1179,14 @@ int librados::IoCtx::mapext(const std::string& oid, uint64_t off, size_t len,
 			    std::map<uint64_t,uint64_t>& m)
 {
   object_t obj(oid);
-  return io_ctx_impl->mapext(oid, off, len, m);
+  return io_ctx_impl->mapext(obj, off, len, m);
 }
 
 int librados::IoCtx::sparse_read(const std::string& oid, std::map<uint64_t,uint64_t>& m,
 				 bufferlist& bl, size_t len, uint64_t off)
 {
   object_t obj(oid);
-  return io_ctx_impl->sparse_read(oid, m, bl, len, off);
+  return io_ctx_impl->sparse_read(obj, m, bl, len, off);
 }
 
 int librados::IoCtx::getxattr(const std::string& oid, const char *name, bufferlist& bl)
@@ -1194,7 +1216,7 @@ int librados::IoCtx::rmxattr(const std::string& oid, const char *name)
 int librados::IoCtx::stat(const std::string& oid, uint64_t *psize, time_t *pmtime)
 {
   object_t obj(oid);
-  return io_ctx_impl->stat(oid, psize, pmtime);
+  return io_ctx_impl->stat(obj, psize, pmtime);
 }
 
 int librados::IoCtx::exec(const std::string& oid, const char *cls, const char *method,
@@ -1748,7 +1770,7 @@ int librados::IoCtx::aio_stat(const std::string& oid, librados::AioCompletion *c
 			      uint64_t *psize, time_t *pmtime)
 {
   object_t obj(oid);
-  return io_ctx_impl->aio_stat(oid, c->pc, psize, pmtime);
+  return io_ctx_impl->aio_stat(obj, c->pc, psize, pmtime);
 }
 
 int librados::IoCtx::aio_cancel(librados::AioCompletion *c)
@@ -1772,7 +1794,6 @@ int librados::IoCtx::watch2(const string& oid, uint64_t *cookie,
 
 int librados::IoCtx::unwatch(const string& oid, uint64_t handle)
 {
-  object_t obj(oid);
   return io_ctx_impl->unwatch(handle);
 }
 
@@ -1799,6 +1820,15 @@ int librados::IoCtx::notify2(const string& oid, bufferlist& bl,
   return io_ctx_impl->notify(obj, bl, timeout_ms, preplybl, NULL, NULL);
 }
 
+int librados::IoCtx::aio_notify(const string& oid, AioCompletion *c,
+                                bufferlist& bl, uint64_t timeout_ms,
+                                bufferlist *preplybl)
+{
+  object_t obj(oid);
+  return io_ctx_impl->aio_notify(obj, c->pc, bl, timeout_ms, preplybl, NULL,
+                                 NULL);
+}
+
 void librados::IoCtx::notify_ack(const std::string& o,
 				 uint64_t notify_id, uint64_t handle,
 				 bufferlist& bl)
@@ -1861,6 +1891,18 @@ void librados::IoCtx::set_assert_src_version(const std::string& oid, uint64_t ve
   io_ctx_impl->set_assert_src_version(obj, ver);
 }
 
+int librados::IoCtx::cache_pin(const string& oid)
+{
+  object_t obj(oid);
+  return io_ctx_impl->cache_pin(obj);
+}
+
+int librados::IoCtx::cache_unpin(const string& oid)
+{
+  object_t obj(oid);
+  return io_ctx_impl->cache_unpin(obj);
+}
+
 void librados::IoCtx::locator_set_key(const string& key)
 {
   io_ctx_impl->oloc.key = key;
@@ -3111,6 +3153,18 @@ extern "C" int rados_ioctx_pool_requires_alignment(rados_ioctx_t io)
   return retval;
 }
 
+extern "C" int rados_ioctx_pool_requires_alignment2(rados_ioctx_t io,
+	int *requires)
+{
+  tracepoint(librados, rados_ioctx_pool_requires_alignment_enter2, io);
+  librados::IoCtxImpl *ctx = (librados::IoCtxImpl *)io;
+  int retval = ctx->client->pool_requires_alignment2(ctx->get_id(),
+                                                     (bool *)requires);
+  tracepoint(librados, rados_ioctx_pool_requires_alignment_exit2, retval,
+             *requires);
+  return retval;
+}
+
 extern "C" uint64_t rados_ioctx_pool_required_alignment(rados_ioctx_t io)
 {
   tracepoint(librados, rados_ioctx_pool_required_alignment_enter, io);
@@ -3120,6 +3174,18 @@ extern "C" uint64_t rados_ioctx_pool_required_alignment(rados_ioctx_t io)
   return retval;
 }
 
+extern "C" int rados_ioctx_pool_required_alignment2(rados_ioctx_t io,
+	uint64_t *alignment)
+{
+  tracepoint(librados, rados_ioctx_pool_required_alignment_enter2, io);
+  librados::IoCtxImpl *ctx = (librados::IoCtxImpl *)io;
+  int retval = ctx->client->pool_required_alignment2(ctx->get_id(),
+                                                     alignment);
+  tracepoint(librados, rados_ioctx_pool_required_alignment_exit2, retval,
+             *alignment);
+  return retval;
+}
+
 extern "C" void rados_ioctx_locator_set_key(rados_ioctx_t io, const char *key)
 {
   tracepoint(librados, rados_ioctx_locator_set_key_enter, io, key);
@@ -4048,6 +4114,28 @@ extern "C" int rados_notify2(rados_ioctx_t io, const char *o,
   return ret;
 }
 
+extern "C" int rados_aio_notify(rados_ioctx_t io, const char *o,
+                                rados_completion_t completion,
+                                const char *buf, int buf_len,
+                                uint64_t timeout_ms, char **reply_buffer,
+                                size_t *reply_buffer_len)
+{
+  tracepoint(librados, rados_aio_notify_enter, io, o, completion, buf, buf_len,
+             timeout_ms);
+  librados::IoCtxImpl *ctx = (librados::IoCtxImpl *)io;
+  object_t oid(o);
+  bufferlist bl;
+  if (buf) {
+    bl.push_back(buffer::copy(buf, buf_len));
+  }
+  librados::AioCompletionImpl *c =
+    reinterpret_cast<librados::AioCompletionImpl*>(completion);
+  int ret = ctx->aio_notify(oid, c, bl, timeout_ms, NULL, reply_buffer,
+                            reply_buffer_len);
+  tracepoint(librados, rados_aio_notify_exit, ret);
+  return ret;
+}
+
 extern "C" int rados_notify_ack(rados_ioctx_t io, const char *o,
 				uint64_t notify_id, uint64_t handle,
 				const char *buf, int buf_len)
@@ -4802,6 +4890,26 @@ extern "C" int rados_aio_read_op_operate(rados_read_op_t read_op,
   return retval;
 }
 
+extern "C" int rados_cache_pin(rados_ioctx_t io, const char *o)
+{
+  tracepoint(librados, rados_cache_pin_enter, io, o);
+  librados::IoCtxImpl *ctx = (librados::IoCtxImpl *)io;
+  object_t oid(o);
+  int retval = ctx->cache_pin(oid);
+  tracepoint(librados, rados_cache_pin_exit, retval);
+  return retval;
+}
+
+extern "C" int rados_cache_unpin(rados_ioctx_t io, const char *o)
+{
+  tracepoint(librados, rados_cache_unpin_enter, io, o);
+  librados::IoCtxImpl *ctx = (librados::IoCtxImpl *)io;
+  object_t oid(o);
+  int retval = ctx->cache_unpin(oid);
+  tracepoint(librados, rados_cache_unpin_exit, retval);
+  return retval;
+}
+
 
 ///////////////////////////// ListObject //////////////////////////////
 librados::ListObject::ListObject() : impl(NULL)
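
For the C API, rados_cache_pin()/rados_cache_unpin() mirror the C++
methods. A hypothetical caller, where io is an open rados_ioctx_t on a
cache-tier pool and the object name is illustrative:

    int r = rados_cache_pin(io, "hot_object");
    if (r < 0)
      fprintf(stderr, "pin failed: %s\n", strerror(-r));
    /* ... object now stays resident in the cache tier ... */
    r = rados_cache_unpin(io, "hot_object");
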
diff --git a/src/librbd/AioCompletion.cc b/src/librbd/AioCompletion.cc
index ec7f684..ac6754d 100644
--- a/src/librbd/AioCompletion.cc
+++ b/src/librbd/AioCompletion.cc
@@ -6,11 +6,15 @@
 #include "common/ceph_context.h"
 #include "common/dout.h"
 #include "common/errno.h"
+#include "common/perf_counters.h"
+#include "common/WorkQueue.h"
 
-#include "librbd/AioRequest.h"
+#include "librbd/AioObjectRequest.h"
+#include "librbd/ImageCtx.h"
 #include "librbd/internal.h"
 
 #include "librbd/AioCompletion.h"
+#include "librbd/Journal.h"
 
 #ifdef WITH_LTTNG
 #include "tracing/librbd.h"
@@ -83,6 +87,12 @@ namespace librbd {
       break;
     }
 
+    // inform the journal that the op has successfully committed
+    if (journal_tid != 0) {
+      assert(ictx->journal != NULL);
+      ictx->journal->commit_event(journal_tid, rval);
+    }
+
     // note: possible for image to be closed after op marked finished
     if (async_op.started()) {
       async_op.finish_op();
@@ -98,6 +108,21 @@ namespace librbd {
     tracepoint(librbd, aio_complete_exit);
   }
 
+  void AioCompletion::init_time(ImageCtx *i, aio_type_t t) {
+    if (ictx == NULL) {
+      ictx = i;
+      aio_type = t;
+      start_time = ceph_clock_now(ictx->cct);
+    }
+  }
+
+  void AioCompletion::start_op(ImageCtx *i, aio_type_t t) {
+    init_time(i, t);
+    if (!async_op.started()) {
+      async_op.start_op(*ictx);
+    }
+  }
+
   void AioCompletion::fail(CephContext *cct, int r)
   {
     lderr(cct) << "AioCompletion::fail() " << this << ": " << cpp_strerror(r)
@@ -130,6 +155,12 @@ namespace librbd {
     put_unlock();
   }
 
+  void AioCompletion::associate_journal_event(uint64_t tid) {
+    Mutex::Locker l(lock);
+    assert(!done);
+    journal_tid = tid;
+  }
+
   bool AioCompletion::is_complete() {
     tracepoint(librbd, aio_is_complete_enter, this);
     bool done;
@@ -170,7 +201,7 @@ namespace librbd {
       m_completion->lock.Unlock();
       r = m_req->m_object_len;
     }
-    m_completion->complete_request(m_cct, r);
+    C_AioRequest::finish(r);
   }
 
   void C_CacheRead::complete(int r) {
diff --git a/src/librbd/AioCompletion.h b/src/librbd/AioCompletion.h
index fddf5fb..532f7e2 100644
--- a/src/librbd/AioCompletion.h
+++ b/src/librbd/AioCompletion.h
@@ -5,21 +5,19 @@
 
 #include "common/Cond.h"
 #include "common/Mutex.h"
-#include "common/ceph_context.h"
-#include "common/perf_counters.h"
 #include "include/Context.h"
 #include "include/utime.h"
 #include "include/rbd/librbd.hpp"
 
 #include "librbd/AsyncOperation.h"
-#include "librbd/ImageCtx.h"
-#include "librbd/internal.h"
 
 #include "osdc/Striper.h"
 
+class CephContext;
+
 namespace librbd {
 
-  class AioRead;
+  class AioObjectRead;
 
   typedef enum {
     AIO_TYPE_READ = 0,
@@ -31,7 +29,7 @@ namespace librbd {
 
   /**
    * AioCompletion is the overall completion for a single
-   * rbd I/O request. It may be composed of many AioRequests,
+   * rbd I/O request. It may be composed of many AioObjectRequests,
    * which each go to a single object.
    *
    * The retrying of individual requests is handled at a lower level,
@@ -64,13 +62,16 @@ namespace librbd {
 
     AsyncOperation async_op;
 
+    uint64_t journal_tid;
+
     AioCompletion() : lock("AioCompletion::lock", true, false),
 		      done(false), rval(0), complete_cb(NULL),
 		      complete_arg(NULL), rbd_comp(NULL),
 		      pending_count(0), blockers(1),
 		      ref(1), released(false), ictx(NULL),
 		      aio_type(AIO_TYPE_NONE),
-		      read_bl(NULL), read_buf(NULL), read_buf_len(0) {
+		      read_bl(NULL), read_buf(NULL), read_buf_len(0),
+                      journal_tid(0) {
     }
     ~AioCompletion() {
     }
@@ -88,20 +89,8 @@ namespace librbd {
 
     void finish_adding_requests(CephContext *cct);
 
-    void init_time(ImageCtx *i, aio_type_t t) {
-      if (ictx == NULL) {
-        ictx = i;
-        aio_type = t;
-        start_time = ceph_clock_now(ictx->cct);
-      }
-    }
-    void start_op(ImageCtx *i, aio_type_t t) {
-      init_time(i, t);
-      if (!async_op.started()) {
-        async_op.start_op(*ictx);
-      }
-    }
-
+    void init_time(ImageCtx *i, aio_type_t t);
+    void start_op(ImageCtx *i, aio_type_t t);
     void fail(CephContext *cct, int r);
 
     void complete(CephContext *cct);
@@ -113,6 +102,8 @@ namespace librbd {
 
     void complete_request(CephContext *cct, ssize_t r);
 
+    void associate_journal_event(uint64_t tid);
+
     bool is_complete();
 
     ssize_t get_return_value();
@@ -156,45 +147,45 @@ namespace librbd {
     }
   };
 
-  class C_AioRead : public Context {
+  class C_AioRequest : public Context {
   public:
-    C_AioRead(CephContext *cct, AioCompletion *completion)
-      : m_cct(cct), m_completion(completion), m_req(NULL)
-    { }
-    virtual ~C_AioRead() {}
-    virtual void finish(int r);
-    void set_req(AioRead *req) {
-      m_req = req;
+    C_AioRequest(CephContext *cct, AioCompletion *completion)
+      : m_cct(cct), m_completion(completion) {
+      m_completion->add_request();
     }
-  private:
+    virtual ~C_AioRequest() {}
+    virtual void finish(int r) {
+      m_completion->complete_request(m_cct, r);
+    }
+  protected:
     CephContext *m_cct;
     AioCompletion *m_completion;
-    AioRead *m_req;
   };
 
-  class C_AioWrite : public Context {
+  class C_AioRead : public C_AioRequest {
   public:
-    C_AioWrite(CephContext *cct, AioCompletion *completion)
-      : m_cct(cct), m_completion(completion) {}
-    virtual ~C_AioWrite() {}
-    virtual void finish(int r) {
-      m_completion->complete_request(m_cct, r);
+    C_AioRead(CephContext *cct, AioCompletion *completion)
+      : C_AioRequest(cct, completion), m_req(NULL) {
+    }
+    virtual ~C_AioRead() {}
+    virtual void finish(int r);
+    void set_req(AioObjectRead *req) {
+      m_req = req;
     }
   private:
-    CephContext *m_cct;
-    AioCompletion *m_completion;
+    AioObjectRead *m_req;
   };
 
   class C_CacheRead : public Context {
   public:
-    explicit C_CacheRead(ImageCtx *ictx, AioRead *req)
+    explicit C_CacheRead(ImageCtx *ictx, AioObjectRead *req)
       : m_image_ctx(*ictx), m_req(req), m_enqueued(false) {}
     virtual void complete(int r);
   protected:
     virtual void finish(int r);
   private:
     ImageCtx &m_image_ctx;
-    AioRead *m_req;
+    AioObjectRead *m_req;
     bool m_enqueued;
   };
 }
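
The refactor above hoists the add_request()/complete_request() pairing into
a shared C_AioRequest base, removing the duplication between C_AioRead and
the former C_AioWrite. Any future per-object context can follow the same
shape (illustrative sketch, not part of the patch):

    struct C_MyObjectOp : public librbd::C_AioRequest {
      C_MyObjectOp(CephContext *cct, librbd::AioCompletion *comp)
        : C_AioRequest(cct, comp) {}   // ctor registers via add_request()
      virtual void finish(int r) {
        // ... op-specific bookkeeping ...
        C_AioRequest::finish(r);       // completes one pending request
      }
    };
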
diff --git a/src/librbd/AioImageRequest.cc b/src/librbd/AioImageRequest.cc
new file mode 100644
index 0000000..49632b6
--- /dev/null
+++ b/src/librbd/AioImageRequest.cc
@@ -0,0 +1,445 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/AioImageRequest.h"
+#include "librbd/AioCompletion.h"
+#include "librbd/AioObjectRequest.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageWatcher.h"
+#include "librbd/internal.h"
+#include "librbd/Journal.h"
+#include "librbd/JournalTypes.h"
+#include "include/rados/librados.hpp"
+#include "osdc/Striper.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::AioImageRequest: "
+
+namespace librbd {
+
+namespace {
+
+struct C_DiscardJournalCommit : public Context {
+  typedef std::vector<ObjectExtent> ObjectExtents;
+
+  ImageCtx &image_ctx;
+  AioCompletion *aio_comp;
+  ObjectExtents object_extents;
+
+  C_DiscardJournalCommit(ImageCtx &_image_ctx, AioCompletion *_aio_comp,
+                         const ObjectExtents &_object_extents, uint64_t tid)
+    : image_ctx(_image_ctx), aio_comp(_aio_comp),
+      object_extents(_object_extents) {
+    CephContext *cct = image_ctx.cct;
+    ldout(cct, 20) << this << " C_DiscardJournalCommit: "
+                   << "delaying cache discard until journal tid " << tid << " "
+                   << "safe" << dendl;
+
+    aio_comp->add_request();
+  }
+
+  virtual void finish(int r) {
+    CephContext *cct = image_ctx.cct;
+    ldout(cct, 20) << this << " C_DiscardJournalCommit: "
+                   << "journal committed: discarding from cache" << dendl;
+
+    Mutex::Locker cache_locker(image_ctx.cache_lock);
+    image_ctx.object_cacher->discard_set(image_ctx.object_set, object_extents);
+    aio_comp->complete_request(cct, r);
+  }
+};
+
+struct C_FlushJournalCommit : public Context {
+  ImageCtx &image_ctx;
+  AioCompletion *aio_comp;
+
+  C_FlushJournalCommit(ImageCtx &_image_ctx, AioCompletion *_aio_comp,
+                       uint64_t tid)
+    : image_ctx(_image_ctx), aio_comp(_aio_comp) {
+    CephContext *cct = image_ctx.cct;
+    ldout(cct, 20) << this << " C_FlushJournalCommit: "
+                   << "delaying flush until journal tid " << tid << " "
+                   << "safe" << dendl;
+
+    aio_comp->add_request();
+  }
+
+  virtual void finish(int r) {
+    CephContext *cct = image_ctx.cct;
+    ldout(cct, 20) << this << " C_FlushJournalCommit: journal committed"
+                   << dendl;
+    aio_comp->complete_request(cct, r);
+  }
+};
+
+} // anonymous namespace
+
+void AioImageRequest::aio_read(
+    ImageCtx *ictx, AioCompletion *c,
+    const std::vector<std::pair<uint64_t,uint64_t> > &extents,
+    char *buf, bufferlist *pbl, int op_flags) {
+  AioImageRead req(*ictx, c, extents, buf, pbl, op_flags);
+  req.send();
+}
+
+void AioImageRequest::aio_read(ImageCtx *ictx, AioCompletion *c, uint64_t off,
+                               size_t len, char *buf, bufferlist *pbl,
+                               int op_flags) {
+  AioImageRead req(*ictx, c, off, len, buf, pbl, op_flags);
+  req.send();
+}
+
+void AioImageRequest::aio_write(ImageCtx *ictx, AioCompletion *c, uint64_t off,
+                                size_t len, const char *buf, int op_flags) {
+  AioImageWrite req(*ictx, c, off, len, buf, op_flags);
+  req.send();
+}
+
+void AioImageRequest::aio_discard(ImageCtx *ictx, AioCompletion *c,
+                                  uint64_t off, uint64_t len) {
+  AioImageDiscard req(*ictx, c, off, len);
+  req.send();
+}
+
+void AioImageRequest::aio_flush(ImageCtx *ictx, AioCompletion *c) {
+  AioImageFlush req(*ictx, c);
+  req.send();
+}
+
+void AioImageRequest::send() {
+  assert(m_image_ctx.owner_lock.is_locked());
+
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << get_request_type() << ": ictx=" << &m_image_ctx << ", "
+                 << "completion=" << m_aio_comp <<  dendl;
+
+  m_aio_comp->get();
+  int r = ictx_check(&m_image_ctx, m_image_ctx.owner_lock);
+  if (r < 0) {
+    m_aio_comp->fail(cct, r);
+    return;
+  }
+
+  send_request();
+}
+
+void AioImageRead::send_request() {
+  CephContext *cct = m_image_ctx.cct;
+
+  if (m_image_ctx.object_cacher && m_image_ctx.readahead_max_bytes > 0 &&
+      !(m_op_flags & LIBRADOS_OP_FLAG_FADVISE_RANDOM)) {
+    readahead(&m_image_ctx, m_image_extents);
+  }
+
+  librados::snap_t snap_id;
+  map<object_t,vector<ObjectExtent> > object_extents;
+  uint64_t buffer_ofs = 0;
+  {
+    // prevent image size from changing between computing clip and recording
+    // pending async operation
+    RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+    snap_id = m_image_ctx.snap_id;
+
+    // map
+    for (vector<pair<uint64_t,uint64_t> >::const_iterator p =
+           m_image_extents.begin();
+         p != m_image_extents.end(); ++p) {
+      uint64_t len = p->second;
+      int r = clip_io(&m_image_ctx, p->first, &len);
+      if (r < 0) {
+        m_aio_comp->fail(cct, r);
+        return;
+      }
+      if (len == 0) {
+        continue;
+      }
+
+      Striper::file_to_extents(cct, m_image_ctx.format_string,
+                               &m_image_ctx.layout, p->first, len, 0,
+                               object_extents, buffer_ofs);
+      buffer_ofs += len;
+    }
+
+    m_aio_comp->start_op(&m_image_ctx, AIO_TYPE_READ);
+  }
+
+  m_aio_comp->read_buf = m_buf;
+  m_aio_comp->read_buf_len = buffer_ofs;
+  m_aio_comp->read_bl = m_pbl;
+
+  for (map<object_t,vector<ObjectExtent> >::iterator p = object_extents.begin();
+       p != object_extents.end(); ++p) {
+    for (vector<ObjectExtent>::iterator q = p->second.begin();
+         q != p->second.end(); ++q) {
+      ldout(cct, 20) << " oid " << q->oid << " " << q->offset << "~"
+                     << q->length << " from " << q->buffer_extents
+                     << dendl;
+
+      C_AioRead *req_comp = new C_AioRead(cct, m_aio_comp);
+      AioObjectRead *req = new AioObjectRead(&m_image_ctx, q->oid.name,
+                                             q->objectno, q->offset, q->length,
+                                             q->buffer_extents, snap_id, true,
+                                             req_comp, m_op_flags);
+      req_comp->set_req(req);
+
+      if (m_image_ctx.object_cacher) {
+        C_CacheRead *cache_comp = new C_CacheRead(&m_image_ctx, req);
+        m_image_ctx.aio_read_from_cache(q->oid, q->objectno, &req->data(),
+                                        q->length, q->offset,
+                                        cache_comp, m_op_flags);
+      } else {
+        req->send();
+      }
+    }
+  }
+
+  m_aio_comp->finish_adding_requests(cct);
+  m_aio_comp->put();
+
+  m_image_ctx.perfcounter->inc(l_librbd_rd);
+  m_image_ctx.perfcounter->inc(l_librbd_rd_bytes, buffer_ofs);
+}
+
+void AbstractAioImageWrite::send_request() {
+  CephContext *cct = m_image_ctx.cct;
+
+  RWLock::RLocker md_locker(m_image_ctx.md_lock);
+
+  bool journaling = false;
+  uint64_t journal_tid = 0;
+
+  uint64_t clip_len = m_len;
+  ObjectExtents object_extents;
+  ::SnapContext snapc;
+  {
+    // prevent image size from changing between computing clip and recording
+    // pending async operation
+    RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+    if (m_image_ctx.snap_id != CEPH_NOSNAP || m_image_ctx.read_only) {
+      m_aio_comp->fail(cct, -EROFS);
+      return;
+    }
+
+    int r = clip_io(&m_image_ctx, m_off, &clip_len);
+    if (r < 0) {
+      m_aio_comp->fail(cct, r);
+      return;
+    }
+
+    snapc = m_image_ctx.snapc;
+    m_aio_comp->start_op(&m_image_ctx, get_aio_type());
+
+    // map to object extents
+    if (clip_len > 0) {
+      Striper::file_to_extents(cct, m_image_ctx.format_string,
+                               &m_image_ctx.layout, m_off, clip_len, 0,
+                               object_extents);
+    }
+
+    journaling = (m_image_ctx.journal != NULL &&
+                  !m_image_ctx.journal->is_journal_replaying());
+  }
+
+  assert(!m_image_ctx.image_watcher->is_lock_supported() ||
+          m_image_ctx.image_watcher->is_lock_owner());
+
+  AioObjectRequests requests;
+  send_object_requests(object_extents, snapc, (journaling ? &requests : NULL));
+
+  if (journaling) {
+    // in-flight ops are flushed prior to closing the journal
+    assert(m_image_ctx.journal != NULL);
+    journal_tid = append_journal_event(requests, m_synchronous);
+  }
+
+  if (m_image_ctx.object_cacher != NULL) {
+    send_cache_requests(object_extents, journal_tid);
+  }
+  update_stats(clip_len);
+
+  m_aio_comp->finish_adding_requests(cct);
+  m_aio_comp->put();
+}
+
+void AbstractAioImageWrite::send_object_requests(
+    const ObjectExtents &object_extents, const ::SnapContext &snapc,
+    AioObjectRequests *aio_object_requests) {
+  CephContext *cct = m_image_ctx.cct;
+
+  for (ObjectExtents::const_iterator p = object_extents.begin();
+       p != object_extents.end(); ++p) {
+    ldout(cct, 20) << " oid " << p->oid << " " << p->offset << "~" << p->length
+                   << " from " << p->buffer_extents << dendl;
+    C_AioRequest *req_comp = new C_AioRequest(cct, m_aio_comp);
+    AioObjectRequest *request = create_object_request(*p, snapc, req_comp);
+
+    // if journaling, stash the request for later; otherwise send
+    if (request != NULL) {
+      if (aio_object_requests != NULL) {
+        aio_object_requests->push_back(request);
+      } else {
+        request->send();
+      }
+    }
+  }
+}
+
+void AioImageWrite::assemble_extent(const ObjectExtent &object_extent,
+                                    bufferlist *bl) {
+  for (Extents::const_iterator q = object_extent.buffer_extents.begin();
+       q != object_extent.buffer_extents.end(); ++q) {
+    bl->append(m_buf + q->first, q->second);
+  }
+}
+
+uint64_t AioImageWrite::append_journal_event(
+    const AioObjectRequests &requests, bool synchronous) {
+  bufferlist bl;
+  bl.append(m_buf, m_len);
+
+  journal::EventEntry event_entry(journal::AioWriteEvent(m_off, m_len, bl));
+  uint64_t tid = m_image_ctx.journal->append_event(m_aio_comp, event_entry,
+                                                   requests, m_off, m_len,
+                                                   synchronous);
+  if (m_image_ctx.object_cacher == NULL) {
+    m_aio_comp->associate_journal_event(tid);
+  }
+  return tid;
+}
+
+void AioImageWrite::send_cache_requests(const ObjectExtents &object_extents,
+                                        uint64_t journal_tid) {
+  CephContext *cct = m_image_ctx.cct;
+  for (ObjectExtents::const_iterator p = object_extents.begin();
+       p != object_extents.end(); ++p) {
+    const ObjectExtent &object_extent = *p;
+
+    bufferlist bl;
+    assemble_extent(object_extent, &bl);
+
+    C_AioRequest *req_comp = new C_AioRequest(cct, m_aio_comp);
+    m_image_ctx.write_to_cache(object_extent.oid, bl, object_extent.length,
+                               object_extent.offset, req_comp, m_op_flags,
+                               journal_tid);
+  }
+}
+
+void AioImageWrite::send_object_requests(
+    const ObjectExtents &object_extents, const ::SnapContext &snapc,
+    AioObjectRequests *aio_object_requests) {
+  // cache handles creating object requests during writeback
+  if (m_image_ctx.object_cacher == NULL) {
+    AbstractAioImageWrite::send_object_requests(object_extents, snapc,
+                                                aio_object_requests);
+  }
+}
+
+AioObjectRequest *AioImageWrite::create_object_request(
+    const ObjectExtent &object_extent, const ::SnapContext &snapc,
+    Context *on_finish) {
+  assert(m_image_ctx.object_cacher == NULL);
+
+  bufferlist bl;
+  assemble_extent(object_extent, &bl);
+  AioObjectWrite *req = new AioObjectWrite(&m_image_ctx,
+                                           object_extent.oid.name,
+                                           object_extent.objectno,
+                                           object_extent.offset, bl,
+                                           snapc, on_finish);
+  req->set_op_flags(m_op_flags);
+  return req;
+}
+
+void AioImageWrite::update_stats(size_t length) {
+  m_image_ctx.perfcounter->inc(l_librbd_wr);
+  m_image_ctx.perfcounter->inc(l_librbd_wr_bytes, length);
+}
+
+uint64_t AioImageDiscard::append_journal_event(
+    const AioObjectRequests &requests, bool synchronous) {
+  journal::EventEntry event_entry(journal::AioDiscardEvent(m_off, m_len));
+  uint64_t tid = m_image_ctx.journal->append_event(m_aio_comp, event_entry,
+                                                   requests, m_off, m_len,
+                                                   synchronous);
+  m_aio_comp->associate_journal_event(tid);
+  return tid;
+}
+
+void AioImageDiscard::send_cache_requests(const ObjectExtents &object_extents,
+                                          uint64_t journal_tid) {
+  if (journal_tid == 0) {
+    Mutex::Locker cache_locker(m_image_ctx.cache_lock);
+    m_image_ctx.object_cacher->discard_set(m_image_ctx.object_set,
+                                           object_extents);
+  } else {
+    // cannot discard from cache until journal has committed
+    assert(m_image_ctx.journal != NULL);
+    m_image_ctx.journal->wait_event(
+      journal_tid, new C_DiscardJournalCommit(m_image_ctx, m_aio_comp,
+                                              object_extents, journal_tid));
+  }
+}
+
+AioObjectRequest *AioImageDiscard::create_object_request(
+    const ObjectExtent &object_extent, const ::SnapContext &snapc,
+    Context *on_finish) {
+  CephContext *cct = m_image_ctx.cct;
+
+  AioObjectRequest *req;
+  if (object_extent.length == m_image_ctx.layout.fl_object_size) {
+    req = new AioObjectRemove(&m_image_ctx, object_extent.oid.name,
+                              object_extent.objectno, snapc, on_finish);
+  } else if (object_extent.offset + object_extent.length ==
+               m_image_ctx.layout.fl_object_size) {
+    req = new AioObjectTruncate(&m_image_ctx, object_extent.oid.name,
+                                object_extent.objectno, object_extent.offset,
+                                snapc, on_finish);
+  } else {
+    if (cct->_conf->rbd_skip_partial_discard) {
+      delete on_finish;
+      return NULL;
+    }
+    req = new AioObjectZero(&m_image_ctx, object_extent.oid.name,
+                            object_extent.objectno, object_extent.offset,
+                            object_extent.length, snapc, on_finish);
+  }
+  return req;
+}
+
+void AioImageDiscard::update_stats(size_t length) {
+  m_image_ctx.perfcounter->inc(l_librbd_discard);
+  m_image_ctx.perfcounter->inc(l_librbd_discard_bytes, length);
+}
+
+void AioImageFlush::send_request() {
+  CephContext *cct = m_image_ctx.cct;
+
+  {
+    // journal the flush event
+    RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+    if (m_image_ctx.journal != NULL &&
+        !m_image_ctx.journal->is_journal_replaying()) {
+      uint64_t journal_tid = m_image_ctx.journal->append_event(
+        m_aio_comp, journal::EventEntry(journal::AioFlushEvent()),
+        AioObjectRequests(), 0, 0, false);
+
+      C_FlushJournalCommit *ctx = new C_FlushJournalCommit(m_image_ctx,
+                                                           m_aio_comp,
+                                                           journal_tid);
+      m_image_ctx.journal->flush_event(journal_tid, ctx);
+      m_aio_comp->associate_journal_event(journal_tid);
+    }
+  }
+
+  C_AioRequest *req_comp = new C_AioRequest(cct, m_aio_comp);
+  m_image_ctx.flush(req_comp);
+
+  m_aio_comp->start_op(&m_image_ctx, AIO_TYPE_FLUSH);
+  m_aio_comp->finish_adding_requests(cct);
+  m_aio_comp->put();
+
+  m_image_ctx.perfcounter->inc(l_librbd_aio_flush);
+}
+
+} // namespace librbd
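
All image-level I/O now funnels through these request objects: each static
helper builds one on the stack and calls send(), which asserts that
owner_lock is already held. A hypothetical internal caller (ictx, comp,
off, len and buf assumed in scope) therefore looks like:

    RWLock::RLocker owner_locker(ictx->owner_lock);  // send() asserts this
    librbd::AioImageRequest::aio_write(ictx, comp, off, len, buf, 0);
    // comp fires once every per-object request has completed
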
diff --git a/src/librbd/AioImageRequest.h b/src/librbd/AioImageRequest.h
new file mode 100644
index 0000000..c6037e6
--- /dev/null
+++ b/src/librbd/AioImageRequest.h
@@ -0,0 +1,211 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_AIO_IMAGE_REQUEST_H
+#define CEPH_LIBRBD_AIO_IMAGE_REQUEST_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "common/snap_types.h"
+#include "osd/osd_types.h"
+#include "librbd/AioCompletion.h"
+#include <list>
+#include <utility>
+#include <vector>
+
+namespace librbd {
+
+class AioObjectRequest;
+class ImageCtx;
+
+class AioImageRequest {
+public:
+  typedef std::vector<std::pair<uint64_t,uint64_t> > Extents;
+
+  virtual ~AioImageRequest() {}
+
+  static void aio_read(ImageCtx *ictx, AioCompletion *c,
+                       const std::vector<std::pair<uint64_t,uint64_t> > &extents,
+                       char *buf, bufferlist *pbl, int op_flags);
+  static void aio_read(ImageCtx *ictx, AioCompletion *c, uint64_t off,
+                       size_t len, char *buf, bufferlist *pbl, int op_flags);
+  static void aio_write(ImageCtx *ictx, AioCompletion *c, uint64_t off,
+                        size_t len, const char *buf, int op_flags);
+  static void aio_discard(ImageCtx *ictx, AioCompletion *c, uint64_t off,
+                          uint64_t len);
+  static void aio_flush(ImageCtx *ictx, AioCompletion *c);
+
+  virtual bool is_write_op() const {
+    return false;
+  }
+
+  void send();
+
+protected:
+  typedef std::list<AioObjectRequest *> AioObjectRequests;
+
+  ImageCtx &m_image_ctx;
+  AioCompletion *m_aio_comp;
+
+  AioImageRequest(ImageCtx &image_ctx, AioCompletion *aio_comp)
+    : m_image_ctx(image_ctx), m_aio_comp(aio_comp) {}
+
+  virtual void send_request() = 0;
+  virtual const char *get_request_type() const = 0;
+};
+
+class AioImageRead : public AioImageRequest {
+public:
+  AioImageRead(ImageCtx &image_ctx, AioCompletion *aio_comp, uint64_t off,
+               size_t len, char *buf, bufferlist *pbl, int op_flags)
+    : AioImageRequest(image_ctx, aio_comp), m_buf(buf), m_pbl(pbl),
+      m_op_flags(op_flags) {
+    m_image_extents.push_back(std::make_pair(off, len));
+  }
+
+  AioImageRead(ImageCtx &image_ctx, AioCompletion *aio_comp,
+               const Extents &image_extents, char *buf, bufferlist *pbl,
+               int op_flags)
+    : AioImageRequest(image_ctx, aio_comp), m_image_extents(image_extents),
+      m_buf(buf), m_pbl(pbl), m_op_flags(op_flags) {
+  }
+
+protected:
+  virtual void send_request();
+  virtual const char *get_request_type() const {
+    return "aio_read";
+  }
+private:
+  Extents m_image_extents;
+  char *m_buf;
+  bufferlist *m_pbl;
+  int m_op_flags;
+};
+
+class AbstractAioImageWrite : public AioImageRequest {
+public:
+  virtual bool is_write_op() const {
+    return true;
+  }
+
+  inline void flag_synchronous() {
+    m_synchronous = true;
+  }
+
+protected:
+  typedef std::vector<ObjectExtent> ObjectExtents;
+
+  const uint64_t m_off;
+  const size_t m_len;
+
+  AbstractAioImageWrite(ImageCtx &image_ctx, AioCompletion *aio_comp,
+                        uint64_t off, size_t len)
+    : AioImageRequest(image_ctx, aio_comp), m_off(off), m_len(len),
+      m_synchronous(false) {
+  }
+
+  virtual aio_type_t get_aio_type() const = 0;
+
+  virtual void send_request();
+
+  virtual void send_cache_requests(const ObjectExtents &object_extents,
+                                   uint64_t journal_tid) = 0;
+
+  virtual void send_object_requests(const ObjectExtents &object_extents,
+                                    const ::SnapContext &snapc,
+                                    AioObjectRequests *aio_object_requests);
+  virtual AioObjectRequest *create_object_request(
+      const ObjectExtent &object_extent, const ::SnapContext &snapc,
+      Context *on_finish) = 0;
+
+  virtual uint64_t append_journal_event(const AioObjectRequests &requests,
+                                        bool synchronous) = 0;
+  virtual void update_stats(size_t length) = 0;
+
+private:
+  bool m_synchronous;
+};
+
+class AioImageWrite : public AbstractAioImageWrite {
+public:
+  AioImageWrite(ImageCtx &image_ctx, AioCompletion *aio_comp, uint64_t off,
+                size_t len, const char *buf, int op_flags)
+    : AbstractAioImageWrite(image_ctx, aio_comp, off, len), m_buf(buf),
+      m_op_flags(op_flags) {
+  }
+
+protected:
+  virtual aio_type_t get_aio_type() const {
+    return AIO_TYPE_WRITE;
+  }
+  virtual const char *get_request_type() const {
+    return "aio_write";
+  }
+
+  void assemble_extent(const ObjectExtent &object_extent, bufferlist *bl);
+
+  virtual void send_cache_requests(const ObjectExtents &object_extents,
+                                   uint64_t journal_tid);
+
+  virtual void send_object_requests(const ObjectExtents &object_extents,
+                                    const ::SnapContext &snapc,
+                                    AioObjectRequests *aio_object_requests);
+  virtual AioObjectRequest *create_object_request(
+      const ObjectExtent &object_extent, const ::SnapContext &snapc,
+      Context *on_finish);
+
+  virtual uint64_t append_journal_event(const AioObjectRequests &requests,
+                                        bool synchronous);
+  virtual void update_stats(size_t length);
+private:
+  const char *m_buf;
+  int m_op_flags;
+};
+
+class AioImageDiscard : public AbstractAioImageWrite {
+public:
+  AioImageDiscard(ImageCtx &image_ctx, AioCompletion *aio_comp, uint64_t off,
+                  uint64_t len)
+    : AbstractAioImageWrite(image_ctx, aio_comp, off, len) {
+  }
+
+protected:
+  virtual aio_type_t get_aio_type() const {
+    return AIO_TYPE_DISCARD;
+  }
+  virtual const char *get_request_type() const {
+    return "aio_discard";
+  }
+
+  virtual void send_cache_requests(const ObjectExtents &object_extents,
+                                   uint64_t journal_tid);
+
+  virtual AioObjectRequest *create_object_request(
+      const ObjectExtent &object_extent, const ::SnapContext &snapc,
+      Context *on_finish);
+
+  virtual uint64_t append_journal_event(const AioObjectRequests &requests,
+                                        bool synchronous);
+  virtual void update_stats(size_t length);
+};
+
+class AioImageFlush : public AioImageRequest {
+public:
+  AioImageFlush(ImageCtx &image_ctx, AioCompletion *aio_comp)
+    : AioImageRequest(image_ctx, aio_comp) {
+  }
+
+  virtual bool is_write_op() const {
+    return true;
+  }
+
+protected:
+  virtual void send_request();
+  virtual const char *get_request_type() const {
+    return "aio_flush";
+  }
+};
+
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_AIO_IMAGE_REQUEST_H
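
AbstractAioImageWrite is a template method: send_request() (in the .cc
above) owns clipping, extent mapping, journaling and cache dispatch, while
subclasses only fill in the hooks. A hypothetical new write-style op would
plug in like this (sketch only: class name invented, bodies stubbed):

    namespace librbd {

    class AioImageZeroOut : public AbstractAioImageWrite {
    public:
      AioImageZeroOut(ImageCtx &ictx, AioCompletion *c, uint64_t off,
                      uint64_t len)
        : AbstractAioImageWrite(ictx, c, off, len) {}
    protected:
      virtual aio_type_t get_aio_type() const { return AIO_TYPE_DISCARD; }
      virtual const char *get_request_type() const { return "aio_zeroout"; }
      virtual void send_cache_requests(const ObjectExtents &extents,
                                       uint64_t journal_tid) { /* stub */ }
      virtual AioObjectRequest *create_object_request(
          const ObjectExtent &extent, const ::SnapContext &snapc,
          Context *on_finish) { /* e.g. new AioObjectZero(...) */ return NULL; }
      virtual uint64_t append_journal_event(const AioObjectRequests &requests,
                                            bool synchronous) { return 0; }
      virtual void update_stats(size_t length) { /* perf counters */ }
    };

    } // namespace librbd
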
diff --git a/src/librbd/AioImageRequestWQ.cc b/src/librbd/AioImageRequestWQ.cc
new file mode 100644
index 0000000..7898653
--- /dev/null
+++ b/src/librbd/AioImageRequestWQ.cc
@@ -0,0 +1,303 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/AioImageRequestWQ.h"
+#include "librbd/AioCompletion.h"
+#include "librbd/AioImageRequest.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/internal.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::AioImageRequestWQ: "
+
+namespace librbd {
+
+AioImageRequestWQ::AioImageRequestWQ(ImageCtx *image_ctx, const string &name,
+                                     time_t ti, ThreadPool *tp)
+  : ThreadPool::PointerWQ<AioImageRequest>(name, ti, 0, tp),
+    m_image_ctx(*image_ctx), m_lock("AioImageRequestWQ::m_lock"),
+    m_write_blockers(0), m_in_progress_writes(0), m_queued_writes(0),
+    m_lock_listener(this), m_blocking_writes(false) {
+
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 5) << this << ": ictx=" << image_ctx << dendl;
+}
+
+ssize_t AioImageRequestWQ::read(uint64_t off, size_t len, char *buf,
+                                int op_flags) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << "read: ictx=" << &m_image_ctx << ", off=" << off << ", "
+                 << "len = " << len << dendl;
+
+  std::vector<std::pair<uint64_t,uint64_t> > image_extents;
+  image_extents.push_back(make_pair(off, len));
+
+  C_SaferCond cond;
+  AioCompletion *c = aio_create_completion_internal(&cond, rbd_ctx_cb);
+  aio_read(c, off, len, buf, NULL, op_flags);
+  return cond.wait();
+}
+
+ssize_t AioImageRequestWQ::write(uint64_t off, size_t len, const char *buf,
+                                 int op_flags) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << "write: ictx=" << &m_image_ctx << ", off=" << off << ", "
+                 << "len = " << len << dendl;
+
+  m_image_ctx.snap_lock.get_read();
+  int r = clip_io(&m_image_ctx, off, &len);
+  m_image_ctx.snap_lock.put_read();
+  if (r < 0) {
+    return r;
+  }
+
+  C_SaferCond cond;
+  AioCompletion *c = aio_create_completion_internal(&cond, rbd_ctx_cb);
+  aio_write(c, off, len, buf, op_flags);
+
+  r = cond.wait();
+  if (r < 0) {
+    return r;
+  }
+  return len;
+}
+
+int AioImageRequestWQ::discard(uint64_t off, uint64_t len) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << "discard: ictx=" << &m_image_ctx << ", off=" << off << ", "
+                 << "len = " << len << dendl;
+
+  m_image_ctx.snap_lock.get_read();
+  int r = clip_io(&m_image_ctx, off, &len);
+  m_image_ctx.snap_lock.put_read();
+  if (r < 0) {
+    return r;
+  }
+
+  C_SaferCond cond;
+  AioCompletion *c = aio_create_completion_internal(&cond, rbd_ctx_cb);
+  aio_discard(c, off, len);
+
+  r = cond.wait();
+  if (r < 0) {
+    return r;
+  }
+  return len;
+}
+
+void AioImageRequestWQ::aio_read(AioCompletion *c, uint64_t off, size_t len,
+                                 char *buf, bufferlist *pbl, int op_flags) {
+  c->init_time(&m_image_ctx, librbd::AIO_TYPE_READ);
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << "aio_read: ictx=" << &m_image_ctx << ", "
+                 << "completion=" << c << ", off=" << off << ", "
+                 << "len=" << len << ", " << "flags=" << op_flags << dendl;
+
+  RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+  if (m_image_ctx.non_blocking_aio) {
+    queue(new AioImageRead(m_image_ctx, c, off, len, buf, pbl, op_flags));
+  } else {
+    AioImageRequest::aio_read(&m_image_ctx, c, off, len, buf, pbl, op_flags);
+  }
+}
+
+void AioImageRequestWQ::aio_write(AioCompletion *c, uint64_t off, size_t len,
+                                  const char *buf, int op_flags) {
+  c->init_time(&m_image_ctx, librbd::AIO_TYPE_WRITE);
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << "aio_write: ictx=" << &m_image_ctx << ", "
+                 << "completion=" << c << ", off=" << off << ", "
+                 << "len=" << len << ", flags=" << op_flags << dendl;
+
+  RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+  if (m_image_ctx.non_blocking_aio || is_journal_required() ||
+      writes_blocked()) {
+    queue(new AioImageWrite(m_image_ctx, c, off, len, buf, op_flags));
+  } else {
+    AioImageRequest::aio_write(&m_image_ctx, c, off, len, buf, op_flags);
+  }
+}
+
+void AioImageRequestWQ::aio_discard(AioCompletion *c, uint64_t off,
+                                    uint64_t len) {
+  c->init_time(&m_image_ctx, librbd::AIO_TYPE_DISCARD);
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << "aio_discard: ictx=" << &m_image_ctx << ", "
+                 << "completion=" << c << ", off=" << off << ", len=" << len
+                 << dendl;
+
+  RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+  if (m_image_ctx.non_blocking_aio || is_journal_required() ||
+      writes_blocked()) {
+    queue(new AioImageDiscard(m_image_ctx, c, off, len));
+  } else {
+    AioImageRequest::aio_discard(&m_image_ctx, c, off, len);
+  }
+}
+
+void AioImageRequestWQ::aio_flush(AioCompletion *c) {
+  c->init_time(&m_image_ctx, librbd::AIO_TYPE_FLUSH);
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << "aio_flush: ictx=" << &m_image_ctx << ", "
+                 << "completion=" << c << dendl;
+
+  RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+  if (m_image_ctx.non_blocking_aio || is_journal_required() ||
+      writes_blocked() || !writes_empty()) {
+    queue(new AioImageFlush(m_image_ctx, c));
+  } else {
+    AioImageRequest::aio_flush(&m_image_ctx, c);
+  }
+}
+
+void AioImageRequestWQ::block_writes() {
+  CephContext *cct = m_image_ctx.cct;
+
+  Mutex::Locker locker(m_lock);
+  ++m_write_blockers;
+  ldout(cct, 5) << __func__ << ": " << &m_image_ctx << ", "
+                << "num=" << m_write_blockers << dendl;
+  if (m_write_blockers == 1) {
+    while (m_in_progress_writes > 0) {
+      m_cond.Wait(m_lock);
+    }
+  }
+}
+
+void AioImageRequestWQ::unblock_writes() {
+  CephContext *cct = m_image_ctx.cct;
+
+  bool wake_up = false;
+  {
+    Mutex::Locker locker(m_lock);
+    assert(m_write_blockers > 0);
+    --m_write_blockers;
+
+    ldout(cct, 5) << __func__ << ": " << &m_image_ctx << ", "
+                  << "num=" << m_write_blockers << dendl;
+    if (m_write_blockers == 0) {
+      wake_up = true;
+    }
+  }
+
+  if (wake_up) {
+    signal();
+  }
+}
+
+void AioImageRequestWQ::register_lock_listener() {
+  m_image_ctx.image_watcher->register_listener(&m_lock_listener);
+}
+
+void *AioImageRequestWQ::_void_dequeue() {
+  AioImageRequest *peek_item = front();
+  if (peek_item == NULL) {
+    return NULL;
+  }
+
+  {
+    if (peek_item->is_write_op()) {
+      Mutex::Locker locker(m_lock);
+      if (m_write_blockers > 0) {
+        return NULL;
+      }
+      ++m_in_progress_writes;
+    }
+  }
+
+  AioImageRequest *item = reinterpret_cast<AioImageRequest *>(
+    ThreadPool::PointerWQ<AioImageRequest>::_void_dequeue());
+  assert(peek_item == item);
+  return item;
+}
+
+void AioImageRequestWQ::process(AioImageRequest *req) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << __func__ << ": ictx=" << &m_image_ctx << ", "
+                 << "req=" << req << dendl;
+
+  {
+    RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+    req->send();
+  }
+
+  {
+    Mutex::Locker locker(m_lock);
+    if (req->is_write_op()) {
+      assert(m_queued_writes > 0);
+      --m_queued_writes;
+
+      assert(m_in_progress_writes > 0);
+      if (--m_in_progress_writes == 0) {
+        m_cond.Signal();
+      }
+    }
+  }
+  delete req;
+}
+
+bool AioImageRequestWQ::is_journal_required() const {
+  RWLock::RLocker snap_locker(m_image_ctx.snap_lock);
+  return (m_image_ctx.journal != NULL);
+}
+
+bool AioImageRequestWQ::is_lock_required() const {
+  assert(m_image_ctx.owner_lock.is_locked());
+  if (m_image_ctx.image_watcher == NULL) {
+    return false;
+  }
+
+  return (m_image_ctx.image_watcher->is_lock_supported() &&
+          !m_image_ctx.image_watcher->is_lock_owner());
+}
+
+void AioImageRequestWQ::queue(AioImageRequest *req) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << __func__ << ": ictx=" << &m_image_ctx << ", "
+                 << "req=" << req << dendl;
+
+  assert(m_image_ctx.owner_lock.is_locked());
+
+  bool first_write_op = false;
+  {
+    Mutex::Locker locker(m_lock);
+    if (req->is_write_op()) {
+      if (++m_queued_writes == 1) {
+        first_write_op = true;
+      }
+    }
+  }
+  ThreadPool::PointerWQ<AioImageRequest>::queue(req);
+
+  if (is_lock_required() && first_write_op) {
+    m_image_ctx.image_watcher->request_lock();
+  }
+}
+
+void AioImageRequestWQ::handle_lock_updated(
+    ImageWatcher::LockUpdateState state) {
+  assert(m_image_ctx.owner_lock.is_locked());
+
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << __func__ << ": ictx=" << &m_image_ctx << ", "
+                 << "state=" << state << dendl;
+
+  if ((state == ImageWatcher::LOCK_UPDATE_STATE_NOT_SUPPORTED ||
+       state == ImageWatcher::LOCK_UPDATE_STATE_LOCKED) && m_blocking_writes) {
+    m_blocking_writes = false;
+    unblock_writes();
+  } else if (state == ImageWatcher::LOCK_UPDATE_STATE_RELEASING &&
+             !m_blocking_writes) {
+    m_blocking_writes = true;
+    block_writes();
+  } else if (state == ImageWatcher::LOCK_UPDATE_STATE_UNLOCKED) {
+    assert(m_blocking_writes);
+    assert(writes_blocked());
+  } else if (state == ImageWatcher::LOCK_UPDATE_STATE_NOTIFICATION &&
+             !writes_empty()) {
+    m_image_ctx.image_watcher->request_lock();
+  }
+}
+
+} // namespace librbd
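A note on the dispatch pattern above: every entry point takes the owner lock
for reading and then either queues the request (when non_blocking_aio is set,
when journaling imposes ordering, or when writes are blocked) or issues it
inline through the static AioImageRequest helpers. A minimal sketch of that
queue-or-inline decision, using simplified stand-in types rather than the
librbd classes:

    // Minimal sketch of the dispatch pattern; Request and DispatchQueue
    // are illustrative stand-ins, not librbd classes.
    #include <deque>
    #include <functional>
    #include <mutex>

    struct Request {
      std::function<void()> work;  // the actual I/O
      bool is_write;
    };

    class DispatchQueue {
     public:
      void submit(Request req, bool non_blocking, bool writes_blocked) {
        // defer if the caller must not block, or if writes are currently
        // fenced off (e.g. while an exclusive lock is being released)
        if (non_blocking || (req.is_write && writes_blocked)) {
          std::lock_guard<std::mutex> lock(m_lock);
          m_queue.push_back(std::move(req));
        } else {
          req.work();  // fast path: execute in the caller's thread
        }
      }

      // called by a worker thread to drain deferred requests
      void process_one() {
        Request req;
        {
          std::lock_guard<std::mutex> lock(m_lock);
          if (m_queue.empty()) {
            return;
          }
          req = std::move(m_queue.front());
          m_queue.pop_front();
        }
        req.work();
      }

     private:
      std::mutex m_lock;
      std::deque<Request> m_queue;
    };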
diff --git a/src/librbd/AioImageRequestWQ.h b/src/librbd/AioImageRequestWQ.h
new file mode 100644
index 0000000..20169f5
--- /dev/null
+++ b/src/librbd/AioImageRequestWQ.h
@@ -0,0 +1,90 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_AIO_IMAGE_REQUEST_WQ_H
+#define CEPH_LIBRBD_AIO_IMAGE_REQUEST_WQ_H
+
+#include "include/Context.h"
+#include "common/WorkQueue.h"
+#include "common/Mutex.h"
+#include "librbd/ImageWatcher.h"
+
+namespace librbd {
+
+class AioCompletion;
+class AioImageRequest;
+class ImageCtx;
+
+class AioImageRequestWQ : protected ThreadPool::PointerWQ<AioImageRequest> {
+public:
+  AioImageRequestWQ(ImageCtx *image_ctx, const string &name, time_t ti,
+                    ThreadPool *tp);
+
+  ssize_t read(uint64_t off, size_t len, char *buf, int op_flags);
+  ssize_t write(uint64_t off, size_t len, const char *buf, int op_flags);
+  int discard(uint64_t off, uint64_t len);
+
+  void aio_read(AioCompletion *c, uint64_t off, size_t len, char *buf,
+                bufferlist *pbl, int op_flags);
+  void aio_write(AioCompletion *c, uint64_t off, size_t len, const char *buf,
+                 int op_flags);
+  void aio_discard(AioCompletion *c, uint64_t off, uint64_t len);
+  void aio_flush(AioCompletion *c);
+
+  using ThreadPool::PointerWQ<AioImageRequest>::drain;
+  using ThreadPool::PointerWQ<AioImageRequest>::empty;
+
+  inline bool writes_empty() const {
+    Mutex::Locker locker(m_lock);
+    return (m_queued_writes == 0);
+  }
+
+  inline bool writes_blocked() const {
+    Mutex::Locker locker(m_lock);
+    return (m_write_blockers > 0);
+  }
+
+  void block_writes();
+  void unblock_writes();
+
+  void register_lock_listener();
+
+protected:
+  virtual void *_void_dequeue();
+  virtual void process(AioImageRequest *req);
+
+private:
+  struct LockListener : public ImageWatcher::Listener {
+    AioImageRequestWQ *aio_work_queue;
+    LockListener(AioImageRequestWQ *_aio_work_queue)
+      : aio_work_queue(_aio_work_queue) {
+    }
+
+    virtual bool handle_requested_lock() {
+      return true;
+    }
+    virtual void handle_lock_updated(ImageWatcher::LockUpdateState state) {
+      aio_work_queue->handle_lock_updated(state);
+    }
+  };
+
+  ImageCtx &m_image_ctx;
+  mutable Mutex m_lock;
+  Cond m_cond;
+  uint32_t m_write_blockers;
+  uint32_t m_in_progress_writes;
+  uint32_t m_queued_writes;
+
+  LockListener m_lock_listener;
+  bool m_blocking_writes;
+
+  bool is_journal_required() const;
+  bool is_lock_required() const;
+  void queue(AioImageRequest *req);
+
+  void handle_lock_updated(ImageWatcher::LockUpdateState state);
+};
+
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_AIO_IMAGE_REQUEST_WQ_H
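block_writes() and unblock_writes() above form a counted fence: each call to
block_writes() bumps m_write_blockers and, on the first increment, waits for
in-flight writes to drain, while unblock_writes() decrements the count and
wakes the queue once it reaches zero. A hypothetical RAII wrapper (not part
of librbd) would make the pairing exception-safe; a sketch:

    // Hypothetical helper pairing block_writes() with unblock_writes(),
    // in the spirit of std::lock_guard. Because the blocker is a counter,
    // guards may nest safely.
    template <typename WQ>
    class WriteBlockGuard {
     public:
      explicit WriteBlockGuard(WQ &wq) : m_wq(wq) {
        m_wq.block_writes();    // waits until in-flight writes drain
      }
      ~WriteBlockGuard() {
        m_wq.unblock_writes();  // wakes queued writes at count zero
      }
      WriteBlockGuard(const WriteBlockGuard &) = delete;
      WriteBlockGuard &operator=(const WriteBlockGuard &) = delete;

     private:
      WQ &m_wq;
    };

    // Usage sketch:
    //   WriteBlockGuard<AioImageRequestWQ> guard(*aio_work_queue);
    //   ... mutate image state while writes are fenced ...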
diff --git a/src/librbd/AioObjectRequest.cc b/src/librbd/AioObjectRequest.cc
new file mode 100644
index 0000000..48e9837
--- /dev/null
+++ b/src/librbd/AioObjectRequest.cc
@@ -0,0 +1,565 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/ceph_context.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "common/Mutex.h"
+#include "common/RWLock.h"
+
+#include "librbd/AioCompletion.h"
+#include "librbd/AioImageRequest.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageWatcher.h"
+#include "librbd/internal.h"
+
+#include "librbd/AioObjectRequest.h"
+#include "librbd/CopyupRequest.h"
+
+#include <boost/bind.hpp>
+#include <boost/optional.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::AioObjectRequest: "
+
+namespace librbd {
+
+  AioObjectRequest::AioObjectRequest(ImageCtx *ictx, const std::string &oid,
+			             uint64_t objectno, uint64_t off,
+                                     uint64_t len, librados::snap_t snap_id,
+                                     Context *completion, bool hide_enoent)
+    : m_ictx(ictx), m_oid(oid), m_object_no(objectno), m_object_off(off),
+      m_object_len(len), m_snap_id(snap_id), m_completion(completion),
+      m_hide_enoent(hide_enoent) {
+
+    Striper::extent_to_file(m_ictx->cct, &m_ictx->layout, m_object_no,
+                            0, m_ictx->layout.fl_object_size, m_parent_extents);
+
+    RWLock::RLocker snap_locker(m_ictx->snap_lock);
+    RWLock::RLocker parent_locker(m_ictx->parent_lock);
+    compute_parent_extents();
+  }
+
+  void AioObjectRequest::complete(int r)
+  {
+    if (should_complete(r)) {
+      ldout(m_ictx->cct, 20) << "complete " << this << dendl;
+      if (m_hide_enoent && r == -ENOENT) {
+	r = 0;
+      }
+      m_completion->complete(r);
+      delete this;
+    }
+  }
+
+  bool AioObjectRequest::compute_parent_extents() {
+    assert(m_ictx->snap_lock.is_locked());
+    assert(m_ictx->parent_lock.is_locked());
+
+    uint64_t parent_overlap;
+    int r = m_ictx->get_parent_overlap(m_snap_id, &parent_overlap);
+    if (r < 0) {
+      // NOTE: it's possible for a snapshot to be deleted while we are
+      // still reading from it
+      lderr(m_ictx->cct) << this << " compute_parent_extents: failed to "
+                         << "retrieve parent overlap: " << cpp_strerror(r)
+                         << dendl;
+      m_parent_extents.clear();
+      return false;
+    }
+
+    uint64_t object_overlap =
+      m_ictx->prune_parent_extents(m_parent_extents, parent_overlap);
+    if (object_overlap > 0) {
+      ldout(m_ictx->cct, 20) << this << " compute_parent_extents: "
+                             << "overlap " << parent_overlap << " "
+                             << "extents " << m_parent_extents << dendl;
+      return true;
+    }
+    return false;
+  }
+
+  static inline bool is_copy_on_read(ImageCtx *ictx, librados::snap_t snap_id) {
+    assert(ictx->snap_lock.is_locked());
+    return (ictx->clone_copy_on_read) &&
+           (!ictx->read_only) && (snap_id == CEPH_NOSNAP);
+  }
+
+  /** read **/
+
+  AioObjectRead::AioObjectRead(ImageCtx *ictx, const std::string &oid,
+                               uint64_t objectno, uint64_t offset, uint64_t len,
+                               vector<pair<uint64_t,uint64_t> >& be,
+                               librados::snap_t snap_id, bool sparse,
+                               Context *completion, int op_flags)
+    : AioObjectRequest(ictx, oid, objectno, offset, len, snap_id, completion,
+                       false),
+      m_buffer_extents(be), m_tried_parent(false), m_sparse(sparse),
+      m_op_flags(op_flags), m_parent_completion(NULL),
+      m_state(LIBRBD_AIO_READ_FLAT) {
+
+    guard_read();
+  }
+
+  AioObjectRead::~AioObjectRead()
+  {
+    if (m_parent_completion) {
+      m_parent_completion->release();
+      m_parent_completion = NULL;
+    }
+  }
+
+  void AioObjectRead::guard_read()
+  {
+    RWLock::RLocker snap_locker(m_ictx->snap_lock);
+    RWLock::RLocker parent_locker(m_ictx->parent_lock);
+
+    if (has_parent()) {
+      ldout(m_ictx->cct, 20) << __func__ << " guarding read" << dendl;
+      m_state = LIBRBD_AIO_READ_GUARD;
+    }
+  }
+
+  bool AioObjectRead::should_complete(int r)
+  {
+    ldout(m_ictx->cct, 20) << "should_complete " << this << " " << m_oid << " "
+                           << m_object_off << "~" << m_object_len
+                           << " r = " << r << dendl;
+
+    bool finished = true;
+
+    switch (m_state) {
+    case LIBRBD_AIO_READ_GUARD:
+      ldout(m_ictx->cct, 20) << "should_complete " << this
+                             << " READ_CHECK_GUARD" << dendl;
+
+      // This is the step that falls back to reading from the parent
+      if (!m_tried_parent && r == -ENOENT) {
+        {
+          RWLock::RLocker snap_locker(m_ictx->snap_lock);
+          RWLock::RLocker parent_locker(m_ictx->parent_lock);
+          if (m_ictx->parent == NULL) {
+	    ldout(m_ictx->cct, 20) << "parent is gone; do nothing" << dendl;
+	    m_state = LIBRBD_AIO_READ_FLAT;
+	    finished = false;
+	    break;
+	  }
+
+          // calculate reverse mapping onto the image
+          vector<pair<uint64_t,uint64_t> > parent_extents;
+          Striper::extent_to_file(m_ictx->cct, &m_ictx->layout, m_object_no,
+                                  m_object_off, m_object_len, parent_extents);
+
+          uint64_t parent_overlap = 0;
+          uint64_t object_overlap = 0;
+          r = m_ictx->get_parent_overlap(m_snap_id, &parent_overlap);
+          if (r == 0) {
+            object_overlap = m_ictx->prune_parent_extents(parent_extents,
+                                                          parent_overlap);
+          }
+
+          if (object_overlap > 0) {
+            m_tried_parent = true;
+            if (is_copy_on_read(m_ictx, m_snap_id)) {
+              m_state = LIBRBD_AIO_READ_COPYUP;
+	    }
+
+            read_from_parent(parent_extents);
+            finished = false;
+          }
+        }
+
+        if (m_tried_parent) {
+          // release reference to the parent read completion; this request
+          // might be completed after unblock is invoked
+          AioCompletion *parent_completion = m_parent_completion;
+          parent_completion->unblock(m_ictx->cct);
+          parent_completion->put();
+        }
+      }
+      break;
+    case LIBRBD_AIO_READ_COPYUP:
+      ldout(m_ictx->cct, 20) << "should_complete " << this << " READ_COPYUP"
+                             << dendl;
+      // This is the extra step for copy-on-read: kick off an asynchronous
+      // copyup. Unlike copy-on-write, the asynchronous copyup completes on
+      // its own, so the state never returns to LIBRBD_AIO_READ_GUARD.
+
+      assert(m_tried_parent);
+      if (r > 0) {
+        // If the entire object was read from the parent successfully and
+        // CoR is possible, kick off an asynchronous copyup. This approach
+        // minimizes the latency impact.
+        send_copyup();
+      }
+      break;
+    case LIBRBD_AIO_READ_FLAT:
+      ldout(m_ictx->cct, 20) << "should_complete " << this << " READ_FLAT"
+                             << dendl;
+      // The read content should have been deposited in m_read_data
+      break;
+    default:
+      lderr(m_ictx->cct) << "invalid request state: " << m_state << dendl;
+      assert(0);
+    }
+
+    return finished;
+  }
+
+  void AioObjectRead::send() {
+    ldout(m_ictx->cct, 20) << "send " << this << " " << m_oid << " "
+                           << m_object_off << "~" << m_object_len << dendl;
+
+    // if the object can't exist locally, complete with -ENOENT so the
+    // guard path falls back to reading from the parent
+    if (!m_ictx->object_map.object_may_exist(m_object_no)) {
+      complete(-ENOENT);
+      return;
+    }
+
+    librados::AioCompletion *rados_completion =
+      librados::Rados::aio_create_completion(this, rados_req_cb, NULL);
+    int r;
+    librados::ObjectReadOperation op;
+    int flags = m_ictx->get_read_flags(m_snap_id);
+    if (m_sparse) {
+      op.sparse_read(m_object_off, m_object_len, &m_ext_map, &m_read_data,
+		     NULL);
+    } else {
+      op.read(m_object_off, m_object_len, &m_read_data, NULL);
+    }
+    op.set_op_flags2(m_op_flags);
+
+    r = m_ictx->data_ctx.aio_operate(m_oid, rados_completion, &op, flags, NULL);
+    assert(r == 0);
+
+    rados_completion->release();
+  }
+
+  void AioObjectRead::send_copyup()
+  {
+    {
+      RWLock::RLocker snap_locker(m_ictx->snap_lock);
+      RWLock::RLocker parent_locker(m_ictx->parent_lock);
+      if (!compute_parent_extents()) {
+        return;
+      }
+    }
+
+    Mutex::Locker copyup_locker(m_ictx->copyup_list_lock);
+    map<uint64_t, CopyupRequest*>::iterator it =
+      m_ictx->copyup_list.find(m_object_no);
+    if (it == m_ictx->copyup_list.end()) {
+      // create and kick off a CopyupRequest
+      CopyupRequest *new_req = new CopyupRequest(m_ictx, m_oid, m_object_no,
+    					         m_parent_extents);
+      m_ictx->copyup_list[m_object_no] = new_req;
+      new_req->queue_send();
+    }
+  }
+
+  void AioObjectRead::read_from_parent(const vector<pair<uint64_t,uint64_t> >& parent_extents)
+  {
+    assert(!m_parent_completion);
+    m_parent_completion = aio_create_completion_internal(this, rbd_req_cb);
+
+    // prevent the parent image from being deleted while this
+    // request is still in-progress
+    m_parent_completion->get();
+    m_parent_completion->block();
+
+    ldout(m_ictx->cct, 20) << "read_from_parent this = " << this
+			   << " parent completion " << m_parent_completion
+			   << " extents " << parent_extents
+			   << dendl;
+    RWLock::RLocker owner_locker(m_ictx->parent->owner_lock);
+    AioImageRequest::aio_read(m_ictx->parent, m_parent_completion,
+                              parent_extents, NULL, &m_read_data, 0);
+  }
+
+  /** write **/
+
+  AbstractAioObjectWrite::AbstractAioObjectWrite(ImageCtx *ictx,
+                                                 const std::string &oid,
+                                                 uint64_t object_no,
+                                                 uint64_t object_off,
+                                                 uint64_t len,
+                                                 const ::SnapContext &snapc,
+                                                 Context *completion,
+                                                 bool hide_enoent)
+    : AioObjectRequest(ictx, oid, object_no, object_off, len, CEPH_NOSNAP,
+                       completion, hide_enoent),
+      m_state(LIBRBD_AIO_WRITE_FLAT), m_snap_seq(snapc.seq.val)
+  {
+    m_snaps.insert(m_snaps.end(), snapc.snaps.begin(), snapc.snaps.end());
+  }
+
+  void AbstractAioObjectWrite::guard_write()
+  {
+    if (has_parent()) {
+      m_state = LIBRBD_AIO_WRITE_GUARD;
+      m_write.assert_exists();
+      ldout(m_ictx->cct, 20) << __func__ << " guarding write" << dendl;
+    }
+  }
+
+  bool AbstractAioObjectWrite::should_complete(int r)
+  {
+    ldout(m_ictx->cct, 20) << get_write_type() << " " << this << " " << m_oid
+                           << " " << m_object_off << "~" << m_object_len
+			   << " should_complete: r = " << r << dendl;
+
+    bool finished = true;
+    switch (m_state) {
+    case LIBRBD_AIO_WRITE_PRE:
+      ldout(m_ictx->cct, 20) << "WRITE_PRE" << dendl;
+      if (r < 0) {
+	return true;
+      }
+
+      send_write();
+      finished = false;
+      break;
+
+    case LIBRBD_AIO_WRITE_POST:
+      ldout(m_ictx->cct, 20) << "WRITE_POST" << dendl;
+      finished = true;
+      break;
+
+    case LIBRBD_AIO_WRITE_GUARD:
+      ldout(m_ictx->cct, 20) << "WRITE_CHECK_GUARD" << dendl;
+
+      if (r == -ENOENT) {
+        handle_write_guard();
+	finished = false;
+	break;
+      } else if (r < 0) {
+        // pass the error code to the finish context
+        m_state = LIBRBD_AIO_WRITE_ERROR;
+        complete(r);
+	finished = false;
+	break;
+      }
+
+      finished = send_post();
+      break;
+
+    case LIBRBD_AIO_WRITE_COPYUP:
+      ldout(m_ictx->cct, 20) << "WRITE_COPYUP" << dendl;
+      if (r < 0) {
+        m_state = LIBRBD_AIO_WRITE_ERROR;
+        complete(r);
+        finished = false;
+      } else {
+        finished = send_post();
+      }
+      break;
+
+    case LIBRBD_AIO_WRITE_FLAT:
+      ldout(m_ictx->cct, 20) << "WRITE_FLAT" << dendl;
+
+      finished = send_post();
+      break;
+
+    case LIBRBD_AIO_WRITE_ERROR:
+      assert(r < 0);
+      lderr(m_ictx->cct) << "WRITE_ERROR: " << cpp_strerror(r)
+			 << dendl;
+      break;
+
+    default:
+      lderr(m_ictx->cct) << "invalid request state: " << m_state << dendl;
+      assert(0);
+    }
+
+    return finished;
+  }
+
+  void AbstractAioObjectWrite::send() {
+    assert(m_ictx->owner_lock.is_locked());
+    ldout(m_ictx->cct, 20) << "send " << get_write_type() << " " << this <<" "
+                           << m_oid << " " << m_object_off << "~"
+                           << m_object_len << dendl;
+    send_pre();
+  }
+
+  void AbstractAioObjectWrite::send_pre() {
+    assert(m_ictx->owner_lock.is_locked());
+
+    m_object_exist = m_ictx->object_map.object_may_exist(m_object_no);
+    bool write = false;
+    {
+      RWLock::RLocker snap_lock(m_ictx->snap_lock);
+      if (!m_ictx->object_map.enabled()) {
+        write = true;
+      } else {
+        // should have been flushed prior to releasing lock
+        assert(m_ictx->image_watcher->is_lock_owner());
+
+        ldout(m_ictx->cct, 20) << "send_pre " << this << " " << m_oid << " "
+          		       << m_object_off << "~" << m_object_len << dendl;
+        m_state = LIBRBD_AIO_WRITE_PRE;
+
+        uint8_t new_state;
+        boost::optional<uint8_t> current_state;
+        pre_object_map_update(&new_state);
+
+        RWLock::WLocker object_map_locker(m_ictx->object_map_lock);
+        if (m_ictx->object_map[m_object_no] != new_state) {
+          FunctionContext *ctx = new FunctionContext(
+            boost::bind(&AioObjectRequest::complete, this, _1));
+          bool updated = m_ictx->object_map.aio_update(m_object_no, new_state,
+                                                       current_state, ctx);
+          assert(updated);
+        } else {
+          write = true;
+        }
+      }
+    }
+
+    // avoid possible recursive lock attempts
+    if (write) {
+      // no object map update required
+      send_write();
+    }
+  }
+
+  bool AbstractAioObjectWrite::send_post() {
+    RWLock::RLocker owner_locker(m_ictx->owner_lock);
+    RWLock::RLocker snap_locker(m_ictx->snap_lock);
+    if (!m_ictx->object_map.enabled() || !post_object_map_update()) {
+      return true;
+    }
+
+    // should have been flushed prior to releasing lock
+    assert(m_ictx->image_watcher->is_lock_owner());
+
+    ldout(m_ictx->cct, 20) << "send_post " << this << " " << m_oid << " "
+			   << m_object_off << "~" << m_object_len << dendl;
+    m_state = LIBRBD_AIO_WRITE_POST;
+
+    RWLock::WLocker object_map_locker(m_ictx->object_map_lock);
+    uint8_t current_state = m_ictx->object_map[m_object_no];
+    // any state other than OBJECT_PENDING (including OBJECT_NONEXISTENT)
+    // means no post object-map update is needed
+    if (current_state != OBJECT_PENDING) {
+      return true;
+    }
+
+    FunctionContext *ctx = new FunctionContext(
+      boost::bind(&AioObjectRequest::complete, this, _1));
+    bool updated = m_ictx->object_map.aio_update(m_object_no,
+                                                 OBJECT_NONEXISTENT,
+				                 OBJECT_PENDING, ctx);
+    assert(updated);
+    return false;
+  }
+
+  void AbstractAioObjectWrite::send_write() {
+    ldout(m_ictx->cct, 20) << "send_write " << this << " " << m_oid << " "
+			   << m_object_off << "~" << m_object_len 
+                           << " object exist " << m_object_exist << dendl;
+
+    if (!m_object_exist && has_parent()) {
+      m_state = LIBRBD_AIO_WRITE_GUARD;
+      handle_write_guard();
+    } else {
+      send_write_op(true);
+    }
+  }
+
+  void AbstractAioObjectWrite::send_copyup()
+  {
+    ldout(m_ictx->cct, 20) << "send_copyup " << this << " " << m_oid << " "
+                           << m_object_off << "~" << m_object_len << dendl;
+    m_state = LIBRBD_AIO_WRITE_COPYUP;
+
+    m_ictx->copyup_list_lock.Lock();
+    map<uint64_t, CopyupRequest*>::iterator it =
+      m_ictx->copyup_list.find(m_object_no);
+    if (it == m_ictx->copyup_list.end()) {
+      CopyupRequest *new_req = new CopyupRequest(m_ictx, m_oid,
+                                                 m_object_no,
+                                                 m_parent_extents);
+
+      // make sure to wait on this CopyupRequest
+      new_req->append_request(this);
+      m_ictx->copyup_list[m_object_no] = new_req;
+
+      m_ictx->copyup_list_lock.Unlock();
+      new_req->send();
+    } else {
+      it->second->append_request(this);
+      m_ictx->copyup_list_lock.Unlock();
+    }
+  }
+
+  void AbstractAioObjectWrite::send_write_op(bool write_guard)
+  {
+    m_state = LIBRBD_AIO_WRITE_FLAT;
+    if (write_guard)
+      guard_write();
+    add_write_ops(&m_write);
+    assert(m_write.size() != 0);
+
+    librados::AioCompletion *rados_completion =
+      librados::Rados::aio_create_completion(this, NULL, rados_req_cb);
+    int r = m_ictx->data_ctx.aio_operate(m_oid, rados_completion, &m_write,
+					 m_snap_seq, m_snaps);
+    assert(r == 0);
+    rados_completion->release();
+  }
+
+  void AbstractAioObjectWrite::handle_write_guard()
+  {
+    bool has_parent;
+    {
+      RWLock::RLocker snap_locker(m_ictx->snap_lock);
+      RWLock::RLocker parent_locker(m_ictx->parent_lock);
+      has_parent = compute_parent_extents();
+    }
+    // If parent still exists, overlap might also have changed.
+    if (has_parent) {
+      send_copyup();
+    } else {
+      // parent may have disappeared -- send original write again
+      ldout(m_ictx->cct, 20) << "should_complete(" << this
+        << "): parent overlap now 0" << dendl;
+      send_write();
+    }
+  }
+
+  void AioObjectWrite::add_write_ops(librados::ObjectWriteOperation *wr) {
+    if (m_ictx->enable_alloc_hint && !m_ictx->object_map.object_may_exist(m_object_no))
+      wr->set_alloc_hint(m_ictx->get_object_size(), m_ictx->get_object_size());
+    if (m_object_off == 0 && m_object_len == m_ictx->get_object_size()) {
+      wr->write_full(m_write_data);
+    } else {
+      wr->write(m_object_off, m_write_data);
+    }
+    wr->set_op_flags2(m_op_flags);
+  }
+
+  void AioObjectWrite::send_write() {
+    bool write_full = (m_object_off == 0 && m_object_len == m_ictx->get_object_size());
+    ldout(m_ictx->cct, 20) << "send_write " << this << " " << m_oid << " "
+			   << m_object_off << "~" << m_object_len
+                           << " object exist " << m_object_exist
+			   << " write_full " << write_full << dendl;
+    if (write_full) {
+      send_write_op(false);
+    } else {
+      AbstractAioObjectWrite::send_write();
+    }
+  }
+
+  void AioObjectRemove::guard_write() {
+    // only guard the write when snapshots exist, i.e. when a deep copy-up
+    // might be required; otherwise skip the guard
+    RWLock::RLocker snap_locker(m_ictx->snap_lock);
+    if (!m_ictx->snaps.empty()) {
+      AbstractAioObjectWrite::guard_write();
+    }
+  }
+
+  void AioObjectRemove::send_write() {
+    ldout(m_ictx->cct, 20) << "send_write " << this << " " << m_oid << " "
+			   << m_object_off << "~" << m_object_len << dendl;
+    send_write_op(true);
+  }
+} // namespace librbd
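Both send_copyup() implementations above funnel copy-ups through
m_ictx->copyup_list, so concurrent requests that touch the same object share
a single CopyupRequest instead of racing to copy the same parent data. A
sketch of that find-or-create idiom, with CopyupJob and CopyupRegistry as
simplified stand-ins for the librbd types:

    #include <cstdint>
    #include <map>
    #include <mutex>
    #include <vector>

    struct Waiter {};  // stand-in for a pending write request

    struct CopyupJob {
      std::vector<Waiter *> waiters;
      void append(Waiter *w) { waiters.push_back(w); }
      void send() { /* issue the parent read and copyup here */ }
    };

    class CopyupRegistry {
     public:
      // attach w to the in-flight job for object_no, creating and
      // launching the job only for the first request on the object
      void request_copyup(uint64_t object_no, Waiter *w) {
        CopyupJob *job = nullptr;
        {
          std::lock_guard<std::mutex> lock(m_lock);
          auto it = m_jobs.find(object_no);
          if (it != m_jobs.end()) {
            it->second->append(w);  // piggyback on the existing copyup
            return;
          }
          job = new CopyupJob();
          job->append(w);
          m_jobs[object_no] = job;
        }
        job->send();  // launched outside the lock, as in the code above
      }

     private:
      std::mutex m_lock;
      std::map<uint64_t, CopyupJob *> m_jobs;
    };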
diff --git a/src/librbd/AioObjectRequest.h b/src/librbd/AioObjectRequest.h
new file mode 100644
index 0000000..bcbaf6b
--- /dev/null
+++ b/src/librbd/AioObjectRequest.h
@@ -0,0 +1,349 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_LIBRBD_AIO_OBJECT_REQUEST_H
+#define CEPH_LIBRBD_AIO_OBJECT_REQUEST_H
+
+#include "include/int_types.h"
+
+#include <map>
+
+#include "common/snap_types.h"
+#include "include/buffer.h"
+#include "include/Context.h"
+#include "include/rados/librados.hpp"
+#include "librbd/ObjectMap.h"
+
+namespace librbd {
+
+  struct AioCompletion;
+  struct ImageCtx;
+  class CopyupRequest;
+
+  /**
+   * This class represents an I/O operation to a single RBD data object.
+   * Its subclasses encapsulate logic for dealing with special cases
+   * for I/O due to layering.
+   */
+  class AioObjectRequest
+  {
+  public:
+    AioObjectRequest(ImageCtx *ictx, const std::string &oid,
+                     uint64_t objectno, uint64_t off, uint64_t len,
+                     librados::snap_t snap_id,
+                     Context *completion, bool hide_enoent);
+    virtual ~AioObjectRequest() {}
+
+    virtual void add_copyup_ops(librados::ObjectWriteOperation *wr) {};
+
+    void complete(int r);
+
+    virtual bool should_complete(int r) = 0;
+    virtual void send() = 0;
+
+    bool has_parent() const {
+      return !m_parent_extents.empty();
+    }
+
+  protected:
+    bool compute_parent_extents();
+
+    ImageCtx *m_ictx;
+    std::string m_oid;
+    uint64_t m_object_no, m_object_off, m_object_len;
+    librados::snap_t m_snap_id;
+    Context *m_completion;
+    std::vector<std::pair<uint64_t,uint64_t> > m_parent_extents;
+    bool m_hide_enoent;
+  };
+
+  class AioObjectRead : public AioObjectRequest {
+  public:
+    AioObjectRead(ImageCtx *ictx, const std::string &oid,
+	          uint64_t objectno, uint64_t offset, uint64_t len,
+	          vector<pair<uint64_t,uint64_t> >& be,
+	          librados::snap_t snap_id, bool sparse,
+	          Context *completion, int op_flags);
+    virtual ~AioObjectRead();
+
+    virtual bool should_complete(int r);
+    virtual void send();
+    void guard_read();
+
+    ceph::bufferlist &data() {
+      return m_read_data;
+    }
+
+    std::map<uint64_t, uint64_t> m_ext_map;
+
+    friend class C_AioRead;
+
+  private:
+    vector<pair<uint64_t,uint64_t> > m_buffer_extents;
+    bool m_tried_parent;
+    bool m_sparse;
+    int m_op_flags;
+    ceph::bufferlist m_read_data;
+    AioCompletion *m_parent_completion;
+
+    /**
+     * Reads go through the following state machine to deal with
+     * layering:
+     *
+     *                          need copyup
+     * LIBRBD_AIO_READ_GUARD ---------------> LIBRBD_AIO_READ_COPYUP
+     *           |                                       |
+     *           v                                       |
+     *         done <------------------------------------/
+     *           ^
+     *           |
+     * LIBRBD_AIO_READ_FLAT
+     *
+     * Reads start in LIBRBD_AIO_READ_GUARD or _FLAT, depending on
+     * whether there is a parent or not.
+     */
+    enum read_state_d {
+      LIBRBD_AIO_READ_GUARD,
+      LIBRBD_AIO_READ_COPYUP,
+      LIBRBD_AIO_READ_FLAT
+    };
+
+    read_state_d m_state;
+
+    void send_copyup();
+    void read_from_parent(const vector<pair<uint64_t,uint64_t> >& image_extents);
+  };
+
+  class AbstractAioObjectWrite : public AioObjectRequest {
+  public:
+    AbstractAioObjectWrite(ImageCtx *ictx, const std::string &oid,
+                           uint64_t object_no, uint64_t object_off,
+                           uint64_t len, const ::SnapContext &snapc,
+                           Context *completion, bool hide_enoent);
+
+    virtual void add_copyup_ops(librados::ObjectWriteOperation *wr)
+    {
+      add_write_ops(wr);
+    }
+
+    virtual bool should_complete(int r);
+    virtual void send();
+
+    /**
+     * Writes go through the following state machine to deal with
+     * layering and the object map:
+     *
+     * <start>
+     *  .  |
+     *  .  |
+     *  .  \---> LIBRBD_AIO_WRITE_PRE
+     *  .           |         |
+     *  . . . . . . | . . . . | . . . . . . . . . . .
+     *      .       |   -or-  |                     .
+     *      .       |         |                     v
+     *      .       |         \----------------> LIBRBD_AIO_WRITE_FLAT . . .
+     *      .       |                                               |      .
+     *      v       v         need copyup                           |      .
+     * LIBRBD_AIO_WRITE_GUARD -----------> LIBRBD_AIO_WRITE_COPYUP  |      .
+     *  .       |                               |        .          |      .
+     *  .       |                               |        .          |      .
+     *  .       |                         /-----/        .          |      .
+     *  .       |                         |              .          |      .
+     *  .       \-------------------\     |     /-------------------/      .
+     *  .                           |     |     |        .                 .
+     *  .                           v     v     v        .                 .
+     *  .                       LIBRBD_AIO_WRITE_POST    .                 .
+     *  .                               |                .                 .
+     *  .                               |  . . . . . . . .                 .
+     *  .                               |  .                               .
+     *  .                               v  v                               .
+     *  . . . . . . . . . . . . . . > <finish> < . . . . . . . . . . . . . .
+     *
+     * The _PRE/_POST states are skipped if the object map is disabled.
+     * The write starts in _WRITE_GUARD or _FLAT depending on whether or not
+     * there is a parent overlap.
+     */
+  protected:
+    enum write_state_d {
+      LIBRBD_AIO_WRITE_GUARD,
+      LIBRBD_AIO_WRITE_COPYUP,
+      LIBRBD_AIO_WRITE_FLAT,
+      LIBRBD_AIO_WRITE_PRE,
+      LIBRBD_AIO_WRITE_POST,
+      LIBRBD_AIO_WRITE_ERROR
+    };
+
+    write_state_d m_state;
+    librados::ObjectWriteOperation m_write;
+    uint64_t m_snap_seq;
+    std::vector<librados::snap_t> m_snaps;
+    bool m_object_exist;
+
+    virtual void add_write_ops(librados::ObjectWriteOperation *wr) = 0;
+    virtual const char* get_write_type() const = 0;
+    virtual void guard_write();
+    virtual void pre_object_map_update(uint8_t *new_state) = 0;
+    virtual bool post_object_map_update() {
+      return false;
+    }
+    virtual void send_write();
+    virtual void send_write_op(bool write_guard);
+    virtual void handle_write_guard();
+
+  private:
+    void send_pre();
+    bool send_post();
+    void send_copyup();
+  };
+
+  class AioObjectWrite : public AbstractAioObjectWrite {
+  public:
+    AioObjectWrite(ImageCtx *ictx, const std::string &oid, uint64_t object_no,
+                   uint64_t object_off, const ceph::bufferlist &data,
+                   const ::SnapContext &snapc, Context *completion)
+      : AbstractAioObjectWrite(ictx, oid, object_no, object_off, data.length(),
+                               snapc, completion, false),
+	m_write_data(data), m_op_flags(0) {
+    }
+
+    void set_op_flags(int op_flags) {
+      m_op_flags = op_flags;
+    }
+  protected:
+    virtual void add_write_ops(librados::ObjectWriteOperation *wr);
+
+    virtual const char* get_write_type() const {
+      return "write";
+    }
+
+    virtual void pre_object_map_update(uint8_t *new_state) {
+      *new_state = OBJECT_EXISTS;
+    }
+    virtual void send_write();
+
+  private:
+    ceph::bufferlist m_write_data;
+    int m_op_flags;
+  };
+
+  class AioObjectRemove : public AbstractAioObjectWrite {
+  public:
+    AioObjectRemove(ImageCtx *ictx, const std::string &oid, uint64_t object_no,
+	            const ::SnapContext &snapc, Context *completion)
+      : AbstractAioObjectWrite(ictx, oid, object_no, 0, 0, snapc, completion,
+                               true),
+        m_object_state(OBJECT_NONEXISTENT) {
+    }
+
+  protected:
+    virtual void add_write_ops(librados::ObjectWriteOperation *wr) {
+      if (has_parent()) {
+	wr->truncate(0);
+      } else {
+	wr->remove();
+      }
+    }
+
+    virtual const char* get_write_type() const {
+      if (has_parent()) {
+        return "remove (trunc)";
+      }
+      return "remove";
+    }
+    virtual void pre_object_map_update(uint8_t *new_state) {
+      if (has_parent()) {
+	m_object_state = OBJECT_EXISTS;
+      } else {
+	m_object_state = OBJECT_PENDING;
+      }
+      *new_state = m_object_state;
+    }
+
+    virtual bool post_object_map_update() {
+      if (m_object_state == OBJECT_EXISTS) {
+	return false;
+      }
+      return true;
+    }
+
+    virtual void guard_write();
+    virtual void send_write();
+
+  private:
+    uint8_t m_object_state;
+  };
+
+  class AioObjectTrim : public AbstractAioObjectWrite {
+  public:
+    AioObjectTrim(ImageCtx *ictx, const std::string &oid, uint64_t object_no,
+                  const ::SnapContext &snapc, Context *completion)
+      : AbstractAioObjectWrite(ictx, oid, object_no, 0, 0, snapc, completion,
+                               true) {
+    }
+
+  protected:
+    virtual void add_write_ops(librados::ObjectWriteOperation *wr) {
+      wr->remove();
+    }
+
+    virtual const char* get_write_type() const {
+      return "remove (trim)";
+    }
+
+    virtual void pre_object_map_update(uint8_t *new_state) {
+      *new_state = OBJECT_PENDING;
+    }
+
+    virtual bool post_object_map_update() {
+      return true;
+    }
+  };
+
+  class AioObjectTruncate : public AbstractAioObjectWrite {
+  public:
+    AioObjectTruncate(ImageCtx *ictx, const std::string &oid,
+                      uint64_t object_no, uint64_t object_off,
+                      const ::SnapContext &snapc, Context *completion)
+      : AbstractAioObjectWrite(ictx, oid, object_no, object_off, 0, snapc,
+                               completion, true) {
+    }
+
+  protected:
+    virtual void add_write_ops(librados::ObjectWriteOperation *wr) {
+      wr->truncate(m_object_off);
+    }
+
+    virtual const char* get_write_type() const {
+      return "truncate";
+    }
+
+    virtual void pre_object_map_update(uint8_t *new_state) {
+      *new_state = OBJECT_EXISTS;
+    }
+  };
+
+  class AioObjectZero : public AbstractAioObjectWrite {
+  public:
+    AioObjectZero(ImageCtx *ictx, const std::string &oid, uint64_t object_no,
+                  uint64_t object_off, uint64_t object_len,
+                  const ::SnapContext &snapc, Context *completion)
+      : AbstractAioObjectWrite(ictx, oid, object_no, object_off, object_len,
+                               snapc, completion, true) {
+    }
+
+  protected:
+    virtual void add_write_ops(librados::ObjectWriteOperation *wr) {
+      wr->zero(m_object_off, m_object_len);
+    }
+
+    virtual const char* get_write_type() const {
+      return "zero";
+    }
+
+    virtual void pre_object_map_update(uint8_t *new_state) {
+      *new_state = OBJECT_EXISTS;
+    }
+  };
+
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_AIO_OBJECT_REQUEST_H
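The write hierarchy above is a textbook template method: AbstractAioObjectWrite
drives the shared pre/write/post state machine, and each subclass supplies only
the RADOS operation it enqueues (write, remove, truncate, zero) plus the
object-map states that operation implies. A compact sketch of that shape, with
the Op classes and state constants as simplified stand-ins:

    #include <cstdint>
    #include <iostream>

    enum ObjState : uint8_t { EXISTS, PENDING, NONEXISTENT };

    class AbstractWriteOp {
     public:
      virtual ~AbstractWriteOp() = default;

      // shared driver: object-map update first, then the per-type op
      void send() {
        uint8_t new_state;
        pre_object_map_update(&new_state);
        std::cout << get_write_type() << ": map -> "
                  << static_cast<int>(new_state) << "\n";
        add_write_ops();  // subclass fills in the actual operation
      }

     protected:
      virtual void add_write_ops() = 0;
      virtual const char *get_write_type() const = 0;
      virtual void pre_object_map_update(uint8_t *new_state) = 0;
    };

    class TruncateOp : public AbstractWriteOp {
     protected:
      void add_write_ops() override { /* e.g. wr->truncate(off) */ }
      const char *get_write_type() const override { return "truncate"; }
      void pre_object_map_update(uint8_t *new_state) override {
        *new_state = EXISTS;  // a truncated object still exists
      }
    };

    int main() {
      TruncateOp op;
      op.send();  // prints: truncate: map -> 0
    }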
diff --git a/src/librbd/AioRequest.cc b/src/librbd/AioRequest.cc
deleted file mode 100644
index b6fc1f9..0000000
--- a/src/librbd/AioRequest.cc
+++ /dev/null
@@ -1,526 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#include "common/ceph_context.h"
-#include "common/dout.h"
-#include "common/errno.h"
-#include "common/Mutex.h"
-#include "common/RWLock.h"
-
-#include "librbd/AioCompletion.h"
-#include "librbd/ImageCtx.h"
-#include "librbd/ImageWatcher.h"
-#include "librbd/internal.h"
-
-#include "librbd/AioRequest.h"
-#include "librbd/CopyupRequest.h"
-
-#include <boost/bind.hpp>
-#include <boost/optional.hpp>
-
-#define dout_subsys ceph_subsys_rbd
-#undef dout_prefix
-#define dout_prefix *_dout << "librbd::AioRequest: "
-
-namespace librbd {
-
-  AioRequest::AioRequest(ImageCtx *ictx, const std::string &oid,
-			 uint64_t objectno, uint64_t off, uint64_t len,
-			 librados::snap_t snap_id,
-			 Context *completion,
-			 bool hide_enoent)
-    : m_ictx(ictx), m_oid(oid), m_object_no(objectno), m_object_off(off),
-      m_object_len(len), m_snap_id(snap_id), m_completion(completion),
-      m_hide_enoent(hide_enoent) {
-
-    Striper::extent_to_file(m_ictx->cct, &m_ictx->layout, m_object_no,
-                            0, m_ictx->layout.fl_object_size, m_parent_extents);
-
-    RWLock::RLocker snap_locker(m_ictx->snap_lock);
-    RWLock::RLocker parent_locker(m_ictx->parent_lock);
-    compute_parent_extents();
-  }
-
-  void AioRequest::complete(int r)
-  {
-    if (should_complete(r)) {
-      ldout(m_ictx->cct, 20) << "complete " << this << dendl;
-      if (m_hide_enoent && r == -ENOENT) {
-	r = 0;
-      }
-      m_completion->complete(r);
-      delete this;
-    }
-  }
-
-  bool AioRequest::compute_parent_extents() {
-    assert(m_ictx->snap_lock.is_locked());
-    assert(m_ictx->parent_lock.is_locked());
-
-    uint64_t parent_overlap;
-    int r = m_ictx->get_parent_overlap(m_snap_id, &parent_overlap);
-    if (r < 0) {
-      // NOTE: it's possible for a snapshot to be deleted while we are
-      // still reading from it
-      lderr(m_ictx->cct) << this << " compute_parent_extents: failed to "
-                         << "retrieve parent overlap: " << cpp_strerror(r)
-                         << dendl;
-      m_parent_extents.clear();
-      return false;
-    }
-
-    uint64_t object_overlap =
-      m_ictx->prune_parent_extents(m_parent_extents, parent_overlap);
-    if (object_overlap > 0) {
-      ldout(m_ictx->cct, 20) << this << " compute_parent_extents: "
-                             << "overlap " << parent_overlap << " "
-                             << "extents " << m_parent_extents << dendl;
-      return true;
-    }
-    return false;
-  }
-
-  static inline bool is_copy_on_read(ImageCtx *ictx, librados::snap_t snap_id) {
-    assert(ictx->snap_lock.is_locked());
-    return (ictx->clone_copy_on_read) &&
-           (!ictx->read_only) && (snap_id == CEPH_NOSNAP);
-  }
-
-  /** read **/
-
-  AioRead::AioRead(ImageCtx *ictx, const std::string &oid,
-                   uint64_t objectno, uint64_t offset, uint64_t len,
-                   vector<pair<uint64_t,uint64_t> >& be,
-                   librados::snap_t snap_id, bool sparse,
-                   Context *completion, int op_flags)
-    : AioRequest(ictx, oid, objectno, offset, len, snap_id, completion, false),
-      m_buffer_extents(be), m_tried_parent(false), m_sparse(sparse),
-      m_op_flags(op_flags), m_parent_completion(NULL),
-      m_state(LIBRBD_AIO_READ_FLAT) {
-
-    guard_read();
-  }
-
-  AioRead::~AioRead()
-  {
-    if (m_parent_completion) {
-      m_parent_completion->release();
-      m_parent_completion = NULL;
-    }
-  }
-
-  void AioRead::guard_read()
-  {
-    RWLock::RLocker snap_locker(m_ictx->snap_lock);
-    RWLock::RLocker parent_locker(m_ictx->parent_lock);
-
-    if (has_parent()) {
-      ldout(m_ictx->cct, 20) << __func__ << " guarding read" << dendl;
-      m_state = LIBRBD_AIO_READ_GUARD;
-    }
-  }
-
-  bool AioRead::should_complete(int r)
-  {
-    ldout(m_ictx->cct, 20) << "should_complete " << this << " " << m_oid << " "
-                           << m_object_off << "~" << m_object_len
-                           << " r = " << r << dendl;
-
-    bool finished = true;
-
-    switch (m_state) {
-    case LIBRBD_AIO_READ_GUARD:
-      ldout(m_ictx->cct, 20) << "should_complete " << this
-                             << " READ_CHECK_GUARD" << dendl;
-
-      // This is the step to read from parent
-      if (!m_tried_parent && r == -ENOENT) {
-        {
-          RWLock::RLocker l(m_ictx->snap_lock);
-          RWLock::RLocker l2(m_ictx->parent_lock);
-          if (m_ictx->parent == NULL) {
-	    ldout(m_ictx->cct, 20) << "parent is gone; do nothing" << dendl;
-	    m_state = LIBRBD_AIO_READ_FLAT;
-	    finished = false;
-	    break;
-	  }
-
-          // calculate reverse mapping onto the image
-          vector<pair<uint64_t,uint64_t> > parent_extents;
-          Striper::extent_to_file(m_ictx->cct, &m_ictx->layout, m_object_no,
-                                  m_object_off, m_object_len, parent_extents);
-
-          uint64_t parent_overlap = 0;
-          uint64_t object_overlap = 0;
-          r = m_ictx->get_parent_overlap(m_snap_id, &parent_overlap);
-          if (r == 0) {
-            object_overlap = m_ictx->prune_parent_extents(parent_extents,
-                                                          parent_overlap);
-          }
-
-          if (object_overlap > 0) {
-            m_tried_parent = true;
-            if (is_copy_on_read(m_ictx, m_snap_id)) {
-              m_state = LIBRBD_AIO_READ_COPYUP;
-	    }
-
-            read_from_parent(parent_extents);
-            finished = false;
-          }
-        }
-
-        if (m_tried_parent) {
-          // release reference to the parent read completion.  this request
-          // might be completed after unblock is invoked.
-          AioCompletion *parent_completion = m_parent_completion;
-          parent_completion->unblock(m_ictx->cct);
-          parent_completion->put();
-        }
-      }
-      break;
-    case LIBRBD_AIO_READ_COPYUP:
-      ldout(m_ictx->cct, 20) << "should_complete " << this << " READ_COPYUP"
-                             << dendl;
-      // This is the extra step for copy-on-read: kick off an asynchronous copyup.
-      // It is different from copy-on-write as asynchronous copyup will finish
-      // by itself so state won't go back to LIBRBD_AIO_READ_GUARD.
-
-      assert(m_tried_parent);
-      if (r > 0) {
-        // If read entire object from parent success and CoR is possible, kick
-        // off a asynchronous copyup. This approach minimizes the latency
-        // impact.
-        send_copyup();
-      }
-      break;
-    case LIBRBD_AIO_READ_FLAT:
-      ldout(m_ictx->cct, 20) << "should_complete " << this << " READ_FLAT"
-                             << dendl;
-      // The read content should be deposit in m_read_data
-      break;
-    default:
-      lderr(m_ictx->cct) << "invalid request state: " << m_state << dendl;
-      assert(0);
-    }
-
-    return finished;
-  }
-
-  void AioRead::send() {
-    ldout(m_ictx->cct, 20) << "send " << this << " " << m_oid << " "
-                           << m_object_off << "~" << m_object_len << dendl;
-
-    // send read request to parent if the object doesn't exist locally
-    if (!m_ictx->object_map.object_may_exist(m_object_no)) {
-      complete(-ENOENT);
-      return;
-    }
-
-    librados::AioCompletion *rados_completion =
-      librados::Rados::aio_create_completion(this, rados_req_cb, NULL);
-    int r;
-    librados::ObjectReadOperation op;
-    int flags = m_ictx->get_read_flags(m_snap_id);
-    if (m_sparse) {
-      op.sparse_read(m_object_off, m_object_len, &m_ext_map, &m_read_data,
-		     NULL);
-    } else {
-      op.read(m_object_off, m_object_len, &m_read_data, NULL);
-    }
-    op.set_op_flags2(m_op_flags);
-
-    r = m_ictx->data_ctx.aio_operate(m_oid, rados_completion, &op, flags, NULL);
-    assert(r == 0);
-
-    rados_completion->release();
-  }
-
-  void AioRead::send_copyup()
-  {
-    {
-      RWLock::RLocker snap_locker(m_ictx->snap_lock);
-      RWLock::RLocker parent_locker(m_ictx->parent_lock);
-      if (!compute_parent_extents()) {
-        return;
-      }
-    }
-
-    Mutex::Locker copyup_locker(m_ictx->copyup_list_lock);
-    map<uint64_t, CopyupRequest*>::iterator it =
-      m_ictx->copyup_list.find(m_object_no);
-    if (it == m_ictx->copyup_list.end()) {
-      // create and kick off a CopyupRequest
-      CopyupRequest *new_req = new CopyupRequest(m_ictx, m_oid, m_object_no,
-    					         m_parent_extents);
-      m_ictx->copyup_list[m_object_no] = new_req;
-      new_req->queue_send();
-    }
-  }
-
-  void AioRead::read_from_parent(const vector<pair<uint64_t,uint64_t> >& parent_extents)
-  {
-    assert(!m_parent_completion);
-    m_parent_completion = aio_create_completion_internal(this, rbd_req_cb);
-
-    // prevent the parent image from being deleted while this
-    // request is still in-progress
-    m_parent_completion->get();
-    m_parent_completion->block();
-
-    ldout(m_ictx->cct, 20) << "read_from_parent this = " << this
-			   << " parent completion " << m_parent_completion
-			   << " extents " << parent_extents
-			   << dendl;
-    aio_read(m_ictx->parent, parent_extents, NULL, &m_read_data,
-             m_parent_completion, 0);
-  }
-
-  /** write **/
-
-  AbstractWrite::AbstractWrite(ImageCtx *ictx, const std::string &oid,
-                               uint64_t object_no, uint64_t object_off,
-                               uint64_t len, const ::SnapContext &snapc,
-                               Context *completion, bool hide_enoent)
-    : AioRequest(ictx, oid, object_no, object_off, len, CEPH_NOSNAP, completion,
-                 hide_enoent),
-      m_state(LIBRBD_AIO_WRITE_FLAT), m_snap_seq(snapc.seq.val)
-  {
-    m_snaps.insert(m_snaps.end(), snapc.snaps.begin(), snapc.snaps.end());
-  }
-
-  void AbstractWrite::guard_write()
-  {
-    if (has_parent()) {
-      m_state = LIBRBD_AIO_WRITE_GUARD;
-      m_write.assert_exists();
-      ldout(m_ictx->cct, 20) << __func__ << " guarding write" << dendl;
-    }
-  }
-
-  bool AbstractWrite::should_complete(int r)
-  {
-    ldout(m_ictx->cct, 20) << get_write_type() << " " << this << " " << m_oid
-                           << " " << m_object_off << "~" << m_object_len
-			   << " should_complete: r = " << r << dendl;
-
-    bool finished = true;
-    switch (m_state) {
-    case LIBRBD_AIO_WRITE_PRE:
-      ldout(m_ictx->cct, 20) << "WRITE_PRE" << dendl;
-      if (r < 0) {
-	return true;
-      }
-
-      send_write();
-      finished = false;
-      break;
-
-    case LIBRBD_AIO_WRITE_POST:
-      ldout(m_ictx->cct, 20) << "WRITE_POST" << dendl;
-      finished = true;
-      break;
-
-    case LIBRBD_AIO_WRITE_GUARD:
-      ldout(m_ictx->cct, 20) << "WRITE_CHECK_GUARD" << dendl;
-
-      if (r == -ENOENT) {
-        bool has_parent;
-        {
-	  RWLock::RLocker snap_locker(m_ictx->snap_lock);
-	  RWLock::RLocker parent_locker(m_ictx->parent_lock);
-          has_parent = compute_parent_extents();
-        }
-
-	// If parent still exists, overlap might also have changed.
-	if (has_parent) {
-          send_copyup();
-	} else {
-          // parent may have disappeared -- send original write again
-	  ldout(m_ictx->cct, 20) << "should_complete(" << this
-				 << "): parent overlap now 0" << dendl;
-          send_write();
-	}
-	finished = false;
-	break;
-      } else if (r < 0) {
-        // pass the error code to the finish context
-        m_state = LIBRBD_AIO_WRITE_ERROR;
-        complete(r);
-	finished = false;
-	break;
-      }
-
-      finished = send_post();
-      break;
-
-    case LIBRBD_AIO_WRITE_COPYUP:
-      ldout(m_ictx->cct, 20) << "WRITE_COPYUP" << dendl;
-      if (r < 0) {
-        m_state = LIBRBD_AIO_WRITE_ERROR;
-        complete(r);
-        finished = false;
-      } else {
-        finished = send_post();
-      }
-      break;
-
-    case LIBRBD_AIO_WRITE_FLAT:
-      ldout(m_ictx->cct, 20) << "WRITE_FLAT" << dendl;
-
-      finished = send_post();
-      break;
-
-    case LIBRBD_AIO_WRITE_ERROR:
-      assert(r < 0);
-      lderr(m_ictx->cct) << "WRITE_ERROR: " << cpp_strerror(r)
-			 << dendl;
-      break;
-
-    default:
-      lderr(m_ictx->cct) << "invalid request state: " << m_state << dendl;
-      assert(0);
-    }
-
-    return finished;
-  }
-
-  void AbstractWrite::send() {
-    assert(m_ictx->owner_lock.is_locked());
-    ldout(m_ictx->cct, 20) << "send " << get_write_type() << " " << this <<" "
-                           << m_oid << " " << m_object_off << "~"
-                           << m_object_len << dendl;
-    send_pre();
-  }
-
-  void AbstractWrite::send_pre() {
-    assert(m_ictx->owner_lock.is_locked());
-
-    bool write = false;
-    {
-      RWLock::RLocker snap_lock(m_ictx->snap_lock);
-      if (!m_ictx->object_map.enabled()) {
-        write = true;
-      } else {
-        // should have been flushed prior to releasing lock
-        assert(m_ictx->image_watcher->is_lock_owner());
-
-        ldout(m_ictx->cct, 20) << "send_pre " << this << " " << m_oid << " "
-          		       << m_object_off << "~" << m_object_len << dendl;
-        m_state = LIBRBD_AIO_WRITE_PRE;
-
-        uint8_t new_state;
-        boost::optional<uint8_t> current_state;
-        pre_object_map_update(&new_state);
-
-        RWLock::WLocker object_map_locker(m_ictx->object_map_lock);
-        if (m_ictx->object_map[m_object_no] != new_state) {
-          FunctionContext *ctx = new FunctionContext(
-            boost::bind(&AioRequest::complete, this, _1));
-          bool updated = m_ictx->object_map.aio_update(m_object_no, new_state,
-                                                       current_state, ctx);
-          assert(updated);
-        } else {
-          write = true;
-        }
-      }
-    }
-
-    // avoid possible recursive lock attempts
-    if (write) {
-      // no object map update required
-      send_write();
-    }
-  }
-
-  bool AbstractWrite::send_post() {
-    RWLock::RLocker owner_locker(m_ictx->owner_lock);
-    RWLock::RLocker snap_locker(m_ictx->snap_lock);
-    if (!m_ictx->object_map.enabled() || !post_object_map_update()) {
-      return true;
-    }
-
-    // should have been flushed prior to releasing lock
-    assert(m_ictx->image_watcher->is_lock_owner());
-
-    ldout(m_ictx->cct, 20) << "send_post " << this << " " << m_oid << " "
-			   << m_object_off << "~" << m_object_len << dendl;
-    m_state = LIBRBD_AIO_WRITE_POST;
-
-    RWLock::WLocker object_map_locker(m_ictx->object_map_lock);
-    uint8_t current_state = m_ictx->object_map[m_object_no];
-    if (current_state != OBJECT_PENDING ||
-        current_state == OBJECT_NONEXISTENT) {
-      return true;
-    }
-
-    FunctionContext *ctx = new FunctionContext(
-      boost::bind(&AioRequest::complete, this, _1));
-    bool updated = m_ictx->object_map.aio_update(m_object_no,
-                                                 OBJECT_NONEXISTENT,
-				                 OBJECT_PENDING, ctx);
-    assert(updated);
-    return false;
-  }
-
-  void AbstractWrite::send_write() {
-    ldout(m_ictx->cct, 20) << "send_write " << this << " " << m_oid << " "
-			   << m_object_off << "~" << m_object_len << dendl;
-
-    m_state = LIBRBD_AIO_WRITE_FLAT;
-    guard_write();
-    add_write_ops(&m_write);
-    assert(m_write.size() != 0);
-
-    librados::AioCompletion *rados_completion =
-      librados::Rados::aio_create_completion(this, NULL, rados_req_cb);
-    int r = m_ictx->data_ctx.aio_operate(m_oid, rados_completion, &m_write,
-					 m_snap_seq, m_snaps);
-    assert(r == 0);
-    rados_completion->release();
-  }
-
-  void AbstractWrite::send_copyup()
-  {
-    ldout(m_ictx->cct, 20) << "send_copyup " << this << " " << m_oid << " "
-                           << m_object_off << "~" << m_object_len << dendl;
-    m_state = LIBRBD_AIO_WRITE_COPYUP;
-
-    m_ictx->copyup_list_lock.Lock();
-    map<uint64_t, CopyupRequest*>::iterator it =
-      m_ictx->copyup_list.find(m_object_no);
-    if (it == m_ictx->copyup_list.end()) {
-      CopyupRequest *new_req = new CopyupRequest(m_ictx, m_oid,
-                                                 m_object_no,
-                                                 m_parent_extents);
-
-      // make sure to wait on this CopyupRequest
-      new_req->append_request(this);
-      m_ictx->copyup_list[m_object_no] = new_req;
-
-      m_ictx->copyup_list_lock.Unlock();
-      new_req->send();
-    } else {
-      it->second->append_request(this);
-      m_ictx->copyup_list_lock.Unlock();
-    }
-  }
-
-  void AioWrite::add_write_ops(librados::ObjectWriteOperation *wr) {
-    if (m_ictx->enable_alloc_hint && !m_ictx->object_map.object_may_exist(m_object_no))
-      wr->set_alloc_hint(m_ictx->get_object_size(), m_ictx->get_object_size());
-    if (m_object_off == 0 && m_object_len == m_ictx->get_object_size()) {
-      wr->write_full(m_write_data);
-    } else {
-      wr->write(m_object_off, m_write_data);
-    }
-    wr->set_op_flags2(m_op_flags);
-  }
-
-  void AioRemove::guard_write() {
-    // do nothing to disable write guard only if deep-copyup not required
-    RWLock::RLocker snap_locker(m_ictx->snap_lock);
-    if (!m_ictx->snaps.empty()) {
-      AbstractWrite::guard_write();
-    }
-  }
-}
diff --git a/src/librbd/AioRequest.h b/src/librbd/AioRequest.h
deleted file mode 100644
index 885cbce..0000000
--- a/src/librbd/AioRequest.h
+++ /dev/null
@@ -1,347 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-#ifndef CEPH_LIBRBD_AIOREQUEST_H
-#define CEPH_LIBRBD_AIOREQUEST_H
-
-#include "include/int_types.h"
-
-#include <map>
-
-#include "common/snap_types.h"
-#include "include/buffer.h"
-#include "include/Context.h"
-#include "include/rados/librados.hpp"
-#include "librbd/ObjectMap.h"
-
-namespace librbd {
-
-  struct AioCompletion;
-  struct ImageCtx;
-  class CopyupRequest;
-
-  /**
-   * This class represents an I/O operation to a single RBD data object.
-   * Its subclasses encapsulate logic for dealing with special cases
-   * for I/O due to layering.
-   */
-  class AioRequest
-  {
-  public:
-    AioRequest(ImageCtx *ictx, const std::string &oid,
-               uint64_t objectno, uint64_t off, uint64_t len,
-               librados::snap_t snap_id,
-               Context *completion, bool hide_enoent);
-    virtual ~AioRequest() {}
-
-    virtual void add_copyup_ops(librados::ObjectWriteOperation *wr) {};
-
-    void complete(int r);
-
-    virtual bool should_complete(int r) = 0;
-    virtual void send() = 0;
-
-    bool has_parent() const {
-      return !m_parent_extents.empty();
-    }
-
-  protected:
-    bool compute_parent_extents();
-
-    ImageCtx *m_ictx;
-    std::string m_oid;
-    uint64_t m_object_no, m_object_off, m_object_len;
-    librados::snap_t m_snap_id;
-    Context *m_completion;
-    std::vector<std::pair<uint64_t,uint64_t> > m_parent_extents;
-    bool m_hide_enoent;
-  };
-
-  class AioRead : public AioRequest {
-  public:
-    AioRead(ImageCtx *ictx, const std::string &oid,
-	    uint64_t objectno, uint64_t offset, uint64_t len,
-	    vector<pair<uint64_t,uint64_t> >& be,
-	    librados::snap_t snap_id, bool sparse,
-	    Context *completion, int op_flags);
-    virtual ~AioRead();
-
-    virtual bool should_complete(int r);
-    virtual void send();
-    void guard_read();
-
-    ceph::bufferlist &data() {
-      return m_read_data;
-    }
-
-    std::map<uint64_t, uint64_t> m_ext_map;
-
-    friend class C_AioRead;
-
-  private:
-    vector<pair<uint64_t,uint64_t> > m_buffer_extents;
-    bool m_tried_parent;
-    bool m_sparse;
-    int m_op_flags;
-    ceph::bufferlist m_read_data;
-    AioCompletion *m_parent_completion;
-
-    /**
-     * Reads go through the following state machine to deal with
-     * layering:
-     *
-     *                          need copyup
-     * LIBRBD_AIO_READ_GUARD ---------------> LIBRBD_AIO_READ_COPYUP
-     *           |                                       |
-     *           v                                       |
-     *         done <------------------------------------/
-     *           ^
-     *           |
-     * LIBRBD_AIO_READ_FLAT
-     *
-     * Reads start in LIBRBD_AIO_READ_GUARD or _FLAT, depending on
-     * whether there is a parent or not.
-     */
-    enum read_state_d {
-      LIBRBD_AIO_READ_GUARD,
-      LIBRBD_AIO_READ_COPYUP,
-      LIBRBD_AIO_READ_FLAT
-    };
-
-    read_state_d m_state;
-
-    void send_copyup();
-    void read_from_parent(const vector<pair<uint64_t,uint64_t> >& image_extents);
-  };
-
-  class AbstractWrite : public AioRequest {
-  public:
-    AbstractWrite(ImageCtx *ictx, const std::string &oid, uint64_t object_no,
-                  uint64_t object_off, uint64_t len, const ::SnapContext &snapc,
-		  Context *completion, bool hide_enoent);
-    virtual ~AbstractWrite() {}
-
-    virtual void add_copyup_ops(librados::ObjectWriteOperation *wr)
-    {
-      add_write_ops(wr);
-    }
-
-    virtual bool should_complete(int r);
-    virtual void send();
-
-  private:
-    /**
-     * Writes go through the following state machine to deal with
-     * layering and the object map:
-     *
-     * <start>
-     *  .  |
-     *  .  |
-     *  .  \---> LIBRBD_AIO_WRITE_PRE
-     *  .           |         |
-     *  . . . . . . | . . . . | . . . . . . . . . . .
-     *      .       |   -or-  |                     .
-     *      .       |         |                     v
-     *      .       |         \----------------> LIBRBD_AIO_WRITE_FLAT . . .
-     *      .       |                                               |      .
-     *      v       v         need copyup                           |      .
-     * LIBRBD_AIO_WRITE_GUARD -----------> LIBRBD_AIO_WRITE_COPYUP  |      .
-     *  .       |                               |        .          |      .
-     *  .       |                               |        .          |      .
-     *  .       |                         /-----/        .          |      .
-     *  .       |                         |              .          |      .
-     *  .       \-------------------\     |     /-------------------/      .
-     *  .                           |     |     |        .                 .
-     *  .                           v     v     v        .                 .
-     *  .                       LIBRBD_AIO_WRITE_POST    .                 .
-     *  .                               |                .                 .
-     *  .                               |  . . . . . . . .                 .
-     *  .                               |  .                               .
-     *  .                               v  v                               .
-     *  . . . . . . . . . . . . . . > <finish> < . . . . . . . . . . . . . .
-     *
-     * The _PRE/_POST states are skipped if the object map is disabled.
-     * The write starts in _WRITE_GUARD or _FLAT depending on whether or not
-     * there is a parent overlap.
-     */
-    enum write_state_d {
-      LIBRBD_AIO_WRITE_GUARD,
-      LIBRBD_AIO_WRITE_COPYUP,
-      LIBRBD_AIO_WRITE_FLAT,
-      LIBRBD_AIO_WRITE_PRE,
-      LIBRBD_AIO_WRITE_POST,
-      LIBRBD_AIO_WRITE_ERROR
-    };
-
-  protected:
-    write_state_d m_state;
-    librados::ObjectWriteOperation m_write;
-    uint64_t m_snap_seq;
-    std::vector<librados::snap_t> m_snaps;
-
-    virtual void add_write_ops(librados::ObjectWriteOperation *wr) = 0;
-    virtual const char* get_write_type() const = 0;
-    virtual void guard_write();
-    virtual void pre_object_map_update(uint8_t *new_state) = 0;
-    virtual bool post_object_map_update() {
-      return false;
-    }
-
-  private:
-    void send_pre();
-    bool send_post();
-    void send_write();
-    void send_copyup();
-  };
-
-  class AioWrite : public AbstractWrite {
-  public:
-    AioWrite(ImageCtx *ictx, const std::string &oid, uint64_t object_no,
-             uint64_t object_off, const ceph::bufferlist &data,
-             const ::SnapContext &snapc, Context *completion)
-      : AbstractWrite(ictx, oid, object_no, object_off, data.length(), snapc,
-		      completion, false),
-	m_write_data(data), m_op_flags(0) {
-    }
-    virtual ~AioWrite() {}
-
-    void set_op_flags(int op_flags) {
-      m_op_flags = op_flags;
-    }
-  protected:
-    virtual void add_write_ops(librados::ObjectWriteOperation *wr);
-
-    virtual const char* get_write_type() const {
-      return "write";
-    }
-
-    virtual void pre_object_map_update(uint8_t *new_state) {
-      *new_state = OBJECT_EXISTS;
-    }
-
-  private:
-    ceph::bufferlist m_write_data;
-    int m_op_flags;
-  };
-
-  class AioRemove : public AbstractWrite {
-  public:
-    AioRemove(ImageCtx *ictx, const std::string &oid, uint64_t object_no,
-	      const ::SnapContext &snapc, Context *completion)
-      : AbstractWrite(ictx, oid, object_no, 0, 0, snapc, completion, true),
-        m_object_state(OBJECT_NONEXISTENT) {
-    }
-    virtual ~AioRemove() {}
-
-  protected:
-    virtual void add_write_ops(librados::ObjectWriteOperation *wr) {
-      if (has_parent()) {
-	wr->truncate(0);
-      } else {
-	wr->remove();
-      }
-    }
-
-    virtual const char* get_write_type() const {
-      if (has_parent()) {
-        return "remove (trunc)";
-      }
-      return "remove";
-    }
-    virtual void pre_object_map_update(uint8_t *new_state) {
-      if (has_parent()) {
-	m_object_state = OBJECT_EXISTS;
-      } else {
-	m_object_state = OBJECT_PENDING;
-      }
-      *new_state = m_object_state;
-    }
-
-    virtual bool post_object_map_update() {
-      if (m_object_state == OBJECT_EXISTS) {
-	return false;
-      }
-      return true;
-    }
-
-    virtual void guard_write();
-
-  private:
-    uint8_t m_object_state;
-  };
-
-  class AioTrim : public AbstractWrite {
-  public:
-    AioTrim(ImageCtx *ictx, const std::string &oid, uint64_t object_no,
-            const ::SnapContext &snapc, Context *completion)
-      : AbstractWrite(ictx, oid, object_no, 0, 0, snapc, completion, true) {
-    }
-
-  protected:
-    virtual void add_write_ops(librados::ObjectWriteOperation *wr) {
-      wr->remove();
-    }
-
-    virtual const char* get_write_type() const {
-      return "remove (trim)";
-    }
-
-    virtual void pre_object_map_update(uint8_t *new_state) {
-      *new_state = OBJECT_PENDING;
-    }
-
-    virtual bool post_object_map_update() {
-      return true;
-    }
-  };
-
-  class AioTruncate : public AbstractWrite {
-  public:
-    AioTruncate(ImageCtx *ictx, const std::string &oid, uint64_t object_no,
-                uint64_t object_off, const ::SnapContext &snapc,
-                Context *completion)
-      : AbstractWrite(ictx, oid, object_no, object_off, 0, snapc, completion,
-                      true) {
-    }
-    virtual ~AioTruncate() {}
-
-  protected:
-    virtual void add_write_ops(librados::ObjectWriteOperation *wr) {
-      wr->truncate(m_object_off);
-    }
-
-    virtual const char* get_write_type() const {
-      return "truncate";
-    }
-
-    virtual void pre_object_map_update(uint8_t *new_state) {
-      *new_state = OBJECT_EXISTS;
-    }
-  };
-
-  class AioZero : public AbstractWrite {
-  public:
-    AioZero(ImageCtx *ictx, const std::string &oid, uint64_t object_no,
-            uint64_t object_off, uint64_t object_len,
-            const ::SnapContext &snapc, Context *completion)
-      : AbstractWrite(ictx, oid, object_no, object_off, object_len, snapc,
-                      completion, true) {
-    }
-    virtual ~AioZero() {}
-
-  protected:
-    virtual void add_write_ops(librados::ObjectWriteOperation *wr) {
-      wr->zero(m_object_off, m_object_len);
-    }
-
-    virtual const char* get_write_type() const {
-      return "zero";
-    }
-
-    virtual void pre_object_map_update(uint8_t *new_state) {
-      *new_state = OBJECT_EXISTS;
-    }
-  };
-
-}
-
-#endif
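
Note: AioRequest.{h,cc} are not dropped outright; this release renames them to
AioObjectRequest.{h,cc}, and the hunks below update includes and types to match
(AioRequest -> AioObjectRequest, AioWrite -> AioObjectWrite, AioTrim ->
AioObjectTrim, AioTruncate -> AioObjectTruncate). The AbstractWrite hierarchy
deleted above is a template-method design: the base class drives the
PRE/GUARD/COPYUP/POST state machine while each subclass supplies only its RADOS
ops and object-map states. A stripped-down, purely hypothetical subclass under
that contract (constructor omitted; the name and alloc-hint op are illustrative,
not part of this commit):

    class AioObjectAllocHint : public AbstractWrite {        // hypothetical
    protected:
      virtual void add_write_ops(librados::ObjectWriteOperation *wr) {
        wr->set_alloc_hint(m_object_len, m_object_len);      // the RADOS op(s)
      }
      virtual const char* get_write_type() const { return "alloc-hint"; }
      virtual void pre_object_map_update(uint8_t *new_state) {
        *new_state = OBJECT_EXISTS;    // state recorded before the write
      }
    };
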
diff --git a/src/librbd/AsyncFlattenRequest.cc b/src/librbd/AsyncFlattenRequest.cc
index 9136220..dbcf334 100644
--- a/src/librbd/AsyncFlattenRequest.cc
+++ b/src/librbd/AsyncFlattenRequest.cc
@@ -2,7 +2,7 @@
 // vim: ts=8 sw=2 smarttab
 
 #include "librbd/AsyncFlattenRequest.h"
-#include "librbd/AioRequest.h"
+#include "librbd/AioObjectRequest.h"
 #include "librbd/AsyncObjectThrottle.h"
 #include "librbd/ImageCtx.h"
 #include "librbd/ImageWatcher.h"
@@ -40,8 +40,8 @@ public:
 
     bufferlist bl;
     string oid = m_image_ctx.get_object_name(m_object_no);
-    AioWrite *req = new AioWrite(&m_image_ctx, oid, m_object_no, 0, bl, m_snapc,
-                                 this);
+    AioObjectWrite *req = new AioObjectWrite(&m_image_ctx, oid, m_object_no, 0,
+                                             bl, m_snapc, this);
     if (!req->has_parent()) {
       // stop early if the parent went away - it just means
       // another flatten finished first or the image was resized
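
Note: flatten drives a zero-length guarded write per object. The empty
bufferlist writes no data, but the guarded-write state machine still detects
the parent overlap and performs the copyup, which is what materializes the
child's objects; once has_parent() turns false the throttle stops early, as the
comment above explains. The call pattern in isolation:

    bufferlist bl;                      // intentionally empty
    AioObjectWrite *req = new AioObjectWrite(&m_image_ctx, oid, m_object_no,
                                             0, bl, m_snapc, this);
    req->send();                        // triggers copyup, writes no bytes
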
diff --git a/src/librbd/AsyncOperation.cc b/src/librbd/AsyncOperation.cc
index dfb1e61..7cfc2d7 100644
--- a/src/librbd/AsyncOperation.cc
+++ b/src/librbd/AsyncOperation.cc
@@ -3,6 +3,7 @@
 #include "librbd/AsyncOperation.h"
 #include "librbd/ImageCtx.h"
 #include "common/dout.h"
+#include "common/WorkQueue.h"
 #include "include/assert.h"
 
 #define dout_subsys ceph_subsys_rbd
@@ -11,11 +12,34 @@
 
 namespace librbd {
 
+namespace {
+
+struct C_CompleteFlushes : public Context {
+  ImageCtx *image_ctx;
+  std::list<Context *> flush_contexts;
+
+  C_CompleteFlushes(ImageCtx *image_ctx, std::list<Context *> &&flush_contexts)
+    : image_ctx(image_ctx), flush_contexts(std::move(flush_contexts)) {
+  }
+  virtual void finish(int r) {
+    RWLock::RLocker owner_locker(image_ctx->owner_lock);
+    while (!flush_contexts.empty()) {
+      Context *flush_ctx = flush_contexts.front();
+      flush_contexts.pop_front();
+
+      ldout(image_ctx->cct, 20) << "completed flush: " << flush_ctx << dendl;
+      flush_ctx->complete(0);
+    }
+  }
+};
+
+} // anonymous namespace
+
 void AsyncOperation::start_op(ImageCtx &image_ctx) {
   assert(m_image_ctx == NULL);
   m_image_ctx = &image_ctx;
 
-  ldout(m_image_ctx->cct, 20) << this << " " << __func__ << dendl; 
+  ldout(m_image_ctx->cct, 20) << this << " " << __func__ << dendl;
   Mutex::Locker l(m_image_ctx->async_ops_lock);
   m_image_ctx->async_ops.push_front(&m_xlist_item);
 }
@@ -39,18 +63,16 @@ void AsyncOperation::finish_op() {
     }
   }
 
-  while (!m_flush_contexts.empty()) {
-    Context *flush_ctx = m_flush_contexts.front();
-    m_flush_contexts.pop_front();
-
-    ldout(m_image_ctx->cct, 20) << "completed flush: " << flush_ctx << dendl;
-    flush_ctx->complete(0);
-  }
+  C_CompleteFlushes *ctx = new C_CompleteFlushes(m_image_ctx,
+                                                 std::move(m_flush_contexts));
+  m_image_ctx->op_work_queue->queue(ctx);
 }
 
 void AsyncOperation::add_flush_context(Context *on_finish) {
   assert(m_image_ctx->async_ops_lock.is_locked());
+  ldout(m_image_ctx->cct, 20) << this << " " << __func__ << ": "
+                              << "flush=" << on_finish << dendl;
   m_flush_contexts.push_back(on_finish);
-} 
+}
 
 } // namespace librbd
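
Note: finish_op() previously completed the queued flush contexts inline, on the
caller's stack and lock context; the hunk above defers them through
C_CompleteFlushes to op_work_queue so they run from a thread-pool thread with
owner_lock freshly acquired. Both ContextWQ::queue() forms used in this series,
for reference:

    image_ctx->op_work_queue->queue(ctx);           // later runs ctx->complete(0)
    image_ctx->op_work_queue->queue(on_finish, r);  // later runs on_finish->complete(r)
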
diff --git a/src/librbd/AsyncResizeRequest.cc b/src/librbd/AsyncResizeRequest.cc
index 732e3f7..9982492 100644
--- a/src/librbd/AsyncResizeRequest.cc
+++ b/src/librbd/AsyncResizeRequest.cc
@@ -243,11 +243,11 @@ void AsyncResizeRequest::send_update_header() {
 
   librados::ObjectWriteOperation op;
   if (m_image_ctx.old_format) {
-    // rewrite header
+    // rewrite only the size field of the header
+    // NOTE: format 1 image headers are not stored in fixed endian format
     bufferlist bl;
-    m_image_ctx.header.image_size = m_new_size;
-    bl.append((const char *)&m_image_ctx.header, sizeof(m_image_ctx.header));
-    op.write(0, bl);
+    bl.append(reinterpret_cast<const char*>(&m_new_size), sizeof(m_new_size));
+    op.write(offsetof(rbd_obj_header_ondisk, image_size), bl);
   } else {
     if (m_image_ctx.image_watcher->is_lock_supported()) {
       m_image_ctx.image_watcher->assert_header_locked(&op);
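
Note: the format 1 resize fix above deliberately narrows the header update. The
removed code serialized the entire in-memory rbd_obj_header_ondisk, clobbering
fields the resize does not own (and, as the new NOTE warns, format 1 headers
are not stored in a fixed endian format, so the raw byte copy is kept as-is).
The replacement writes only the 8-byte image_size field at its on-disk offset;
a worked sketch for a resize to 1 GiB:

    uint64_t new_size = 1ULL << 30;                    // example target size
    bufferlist bl;
    bl.append(reinterpret_cast<const char*>(&new_size), sizeof(new_size));
    op.write(offsetof(rbd_obj_header_ondisk, image_size), bl);  // 8 bytes only
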
diff --git a/src/librbd/AsyncTrimRequest.cc b/src/librbd/AsyncTrimRequest.cc
index 90668ce..6159ef5 100644
--- a/src/librbd/AsyncTrimRequest.cc
+++ b/src/librbd/AsyncTrimRequest.cc
@@ -2,7 +2,7 @@
 // vim: ts=8 sw=2 smarttab
 #include "librbd/AsyncTrimRequest.h"
 #include "librbd/AsyncObjectThrottle.h"
-#include "librbd/AioRequest.h"
+#include "librbd/AioObjectRequest.h"
 #include "librbd/ImageCtx.h"
 #include "librbd/ImageWatcher.h"
 #include "librbd/internal.h"
@@ -41,8 +41,8 @@ public:
     string oid = m_image_ctx.get_object_name(m_object_no);
     ldout(m_image_ctx.cct, 10) << "removing (with copyup) " << oid << dendl;
 
-    AbstractWrite *req = new AioTrim(&m_image_ctx, oid, m_object_no, m_snapc,
-                                     this);
+    AioObjectRequest *req = new AioObjectTrim(&m_image_ctx, oid, m_object_no,
+                                              m_snapc, this);
     req->send();
     return 0;
   }
@@ -340,13 +340,13 @@ void AsyncTrimRequest::send_clean_boundary() {
     ldout(cct, 20) << " ex " << *p << dendl;
     Context *req_comp = new C_ContextCompletion(*completion);
 
-    AbstractWrite *req;
+    AioObjectRequest *req;
     if (p->offset == 0) {
-      req = new AioTrim(&m_image_ctx, p->oid.name, p->objectno, snapc,
-                        req_comp);
+      req = new AioObjectTrim(&m_image_ctx, p->oid.name, p->objectno, snapc,
+                              req_comp);
     } else {
-      req = new AioTruncate(&m_image_ctx, p->oid.name, p->objectno,
-                            p->offset, snapc, req_comp);
+      req = new AioObjectTruncate(&m_image_ctx, p->oid.name, p->objectno,
+                                  p->offset, snapc, req_comp);
     }
     req->send();
   }
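
A worked example of the boundary split in send_clean_boundary() above, assuming
4 MiB objects and a shrink from 10 MiB to 6 MiB:

    //   object 1 covers [4 MiB, 8 MiB):  p->offset == 2 MiB, truncated in
    //                                    place via AioObjectTruncate
    //   object 2 covers [8 MiB, 10 MiB): p->offset == 0, removed outright
    //                                    via AioObjectTrim
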
diff --git a/src/librbd/CopyupRequest.cc b/src/librbd/CopyupRequest.cc
index 667d19d..5c3973a 100644
--- a/src/librbd/CopyupRequest.cc
+++ b/src/librbd/CopyupRequest.cc
@@ -7,11 +7,13 @@
 #include "common/Mutex.h"
 
 #include "librbd/AioCompletion.h"
-#include "librbd/AioRequest.h"
+#include "librbd/AioImageRequest.h"
+#include "librbd/AioObjectRequest.h"
 #include "librbd/AsyncObjectThrottle.h"
 #include "librbd/CopyupRequest.h"
 #include "librbd/ImageCtx.h"
 #include "librbd/ImageWatcher.h"
+#include "librbd/internal.h"
 #include "librbd/ObjectMap.h"
 
 #include <boost/bind.hpp>
@@ -84,15 +86,15 @@ private:
     m_async_op.finish_op();
   }
 
-  void CopyupRequest::append_request(AioRequest *req) {
+  void CopyupRequest::append_request(AioObjectRequest *req) {
     ldout(m_ictx->cct, 20) << __func__ << " " << this << ": " << req << dendl;
     m_pending_requests.push_back(req);
   }
 
   void CopyupRequest::complete_requests(int r) {
     while (!m_pending_requests.empty()) {
-      vector<AioRequest *>::iterator it = m_pending_requests.begin();
-      AioRequest *req = *it;
+      vector<AioObjectRequest *>::iterator it = m_pending_requests.begin();
+      AioObjectRequest *req = *it;
       ldout(m_ictx->cct, 20) << __func__ << " completing request " << req
 			     << dendl;
       req->complete(r);
@@ -156,7 +158,7 @@ private:
 
       // merge all pending write ops into this single RADOS op
       for (size_t i=0; i<m_pending_requests.size(); ++i) {
-        AioRequest *req = m_pending_requests[i];
+        AioObjectRequest *req = m_pending_requests[i];
         ldout(m_ictx->cct, 20) << __func__ << " add_copyup_ops " << req
                                << dendl;
         req->add_copyup_ops(&write_op);
@@ -185,7 +187,9 @@ private:
 			   << ", oid " << m_oid
                            << ", extents " << m_image_extents
                            << dendl;
-    aio_read(m_ictx->parent, m_image_extents, NULL, &m_copyup_data, comp, 0);
+    RWLock::RLocker owner_locker(m_ictx->parent->owner_lock);
+    AioImageRequest::aio_read(m_ictx->parent, comp, m_image_extents, NULL,
+                              &m_copyup_data, 0);
   }
 
   void CopyupRequest::queue_send()
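
Note: two behaviors above are worth calling out. Every AioObjectRequest that
arrives while the copyup is in flight is appended via append_request() and
later folded into the single copyup RADOS operation, so the copyup and the
piggybacked writes commit atomically on the object; and the parent read now
goes through AioImageRequest::aio_read(), which requires the parent image's
owner_lock held for read. The merge loop in isolation:

    librados::ObjectWriteOperation write_op;
    // (copyup op appended first, then the piggybacked writes)
    for (size_t i = 0; i < m_pending_requests.size(); ++i) {
      m_pending_requests[i]->add_copyup_ops(&write_op);
    }
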
diff --git a/src/librbd/CopyupRequest.h b/src/librbd/CopyupRequest.h
index fd1fd87..e3a7cdb 100644
--- a/src/librbd/CopyupRequest.h
+++ b/src/librbd/CopyupRequest.h
@@ -20,7 +20,7 @@ namespace librbd {
                   vector<pair<uint64_t,uint64_t> >& image_extents);
     ~CopyupRequest();
 
-    void append_request(AioRequest *req);
+    void append_request(AioObjectRequest *req);
 
     void send();
     void queue_send();
@@ -65,7 +65,7 @@ namespace librbd {
     vector<pair<uint64_t,uint64_t> > m_image_extents;
     State m_state;
     ceph::bufferlist m_copyup_data;
-    vector<AioRequest *> m_pending_requests;
+    vector<AioObjectRequest *> m_pending_requests;
     atomic_t m_pending_copyups;
 
     AsyncOperation m_async_op;
diff --git a/src/librbd/ImageCtx.cc b/src/librbd/ImageCtx.cc
index c68b45c..8fd13e4 100644
--- a/src/librbd/ImageCtx.cc
+++ b/src/librbd/ImageCtx.cc
@@ -8,13 +8,17 @@
 #include "common/dout.h"
 #include "common/errno.h"
 #include "common/perf_counters.h"
+#include "common/WorkQueue.h"
 
+#include "librbd/AioImageRequestWQ.h"
 #include "librbd/AsyncOperation.h"
 #include "librbd/AsyncRequest.h"
 #include "librbd/AsyncResizeRequest.h"
 #include "librbd/internal.h"
 #include "librbd/ImageCtx.h"
 #include "librbd/ImageWatcher.h"
+#include "librbd/Journal.h"
+#include "librbd/LibrbdAdminSocketHook.h"
 #include "librbd/ObjectMap.h"
 
 #include <boost/bind.hpp>
@@ -49,6 +53,65 @@ public:
   }
 };
 
+struct C_FlushCache : public Context {
+  ImageCtx *image_ctx;
+  Context *on_safe;
+
+  C_FlushCache(ImageCtx *_image_ctx, Context *_on_safe)
+    : image_ctx(_image_ctx), on_safe(_on_safe) {
+  }
+  virtual void finish(int r) {
+    // successful cache flush indicates all IO is now safe
+    assert(image_ctx->owner_lock.is_locked());
+    image_ctx->flush_cache(on_safe);
+  }
+};
+
+struct C_InvalidateCache : public Context {
+  ImageCtx *image_ctx;
+  bool purge_on_error;
+  bool reentrant_safe;
+  Context *on_finish;
+
+  C_InvalidateCache(ImageCtx *_image_ctx, bool _purge_on_error,
+                    bool _reentrant_safe, Context *_on_finish)
+    : image_ctx(_image_ctx), purge_on_error(_purge_on_error),
+      reentrant_safe(_reentrant_safe), on_finish(_on_finish) {
+  }
+  virtual void finish(int r) {
+    assert(image_ctx->cache_lock.is_locked());
+    CephContext *cct = image_ctx->cct;
+
+    if (r == -EBLACKLISTED) {
+      lderr(cct) << "Blacklisted during flush!  Purging cache..." << dendl;
+      image_ctx->object_cacher->purge_set(image_ctx->object_set);
+    } else if (r != 0 && purge_on_error) {
+      lderr(cct) << "invalidate cache encountered error "
+                 << cpp_strerror(r) << " !Purging cache..." << dendl;
+      image_ctx->object_cacher->purge_set(image_ctx->object_set);
+    } else if (r != 0) {
+      lderr(cct) << "flush_cache returned " << r << dendl;
+    }
+
+    loff_t unclean = image_ctx->object_cacher->release_set(
+      image_ctx->object_set);
+    if (unclean == 0) {
+      r = 0;
+    } else {
+      lderr(cct) << "could not release all objects from cache: "
+                 << unclean << " bytes remain" << dendl;
+      r = -EBUSY;
+    }
+
+    if (reentrant_safe) {
+      on_finish->complete(r);
+    } else {
+      image_ctx->op_work_queue->queue(on_finish, r);
+    }
+  }
+
+};
+
 } // anonymous namespace
 
   const string ImageCtx::METADATA_CONF_PREFIX = "conf_";
@@ -64,6 +127,7 @@ public:
       exclusive_locked(false),
       name(image_name),
       image_watcher(NULL),
+      journal(NULL),
       refresh_seq(0),
       last_refresh(0),
       owner_lock(unique_lock_name("librbd::ImageCtx::owner_lock", this)),
@@ -84,7 +148,8 @@ public:
       object_cacher(NULL), writeback_handler(NULL), object_set(NULL),
       readahead(),
       total_bytes_read(0), copyup_finisher(NULL),
-      object_map(*this), aio_work_queue(NULL), op_work_queue(NULL)
+      object_map(*this), aio_work_queue(NULL), op_work_queue(NULL),
+      refresh_in_progress(false), asok_hook(new LibrbdAdminSocketHook(this))
   {
     md_ctx.dup(p);
     data_ctx.dup(p);
@@ -97,16 +162,19 @@ public:
     ThreadPoolSingleton *thread_pool_singleton;
     cct->lookup_or_create_singleton_object<ThreadPoolSingleton>(
       thread_pool_singleton, "librbd::thread_pool");
-    aio_work_queue = new ContextWQ("librbd::aio_work_queue",
-                                   cct->_conf->rbd_op_thread_timeout,
-                                   thread_pool_singleton);
+    aio_work_queue = new AioImageRequestWQ(this, "librbd::aio_work_queue",
+                                           cct->_conf->rbd_op_thread_timeout,
+                                           thread_pool_singleton);
     op_work_queue = new ContextWQ("librbd::op_work_queue",
                                   cct->_conf->rbd_op_thread_timeout,
                                   thread_pool_singleton);
   }
 
   ImageCtx::~ImageCtx() {
-    perf_stop();
+    assert(journal == NULL);
+    if (perfcounter) {
+      perf_stop();
+    }
     if (object_cacher) {
       delete object_cacher;
       object_cacher = NULL;
@@ -127,18 +195,12 @@ public:
 
     delete op_work_queue;
     delete aio_work_queue;
+
+    delete asok_hook;
   }
 
   int ImageCtx::init() {
     int r;
-    string pname = string("librbd-") + id + string("-") +
-      data_ctx.get_pool_name() + string("/") + name;
-    if (!snap_name.empty()) {
-      pname += "@";
-      pname += snap_name;
-    }
-
-    perf_start(pname);
 
     if (id.length()) {
       old_format = false;
@@ -184,6 +246,15 @@ public:
       header_oid = old_header_name(name);
     }
 
+    string pname = string("librbd-") + id + string("-") +
+      data_ctx.get_pool_name() + string("-") + name;
+    if (!snap_name.empty()) {
+      pname += "-";
+      pname += snap_name;
+    }
+
+    perf_start(pname);
+
     if (cache) {
       Mutex::Locker l(cache_lock);
       ldout(cct, 20) << "enabling caching..." << dendl;
@@ -288,10 +359,12 @@ public:
     plb.add_u64_counter(l_librbd_snap_create, "snap_create", "Snap creations");
     plb.add_u64_counter(l_librbd_snap_remove, "snap_remove", "Snap removals");
     plb.add_u64_counter(l_librbd_snap_rollback, "snap_rollback", "Snap rollbacks");
+    plb.add_u64_counter(l_librbd_snap_rename, "snap_rename", "Snap rename");
     plb.add_u64_counter(l_librbd_notify, "notify", "Updated header notifications");
     plb.add_u64_counter(l_librbd_resize, "resize", "Resizes");
     plb.add_u64_counter(l_librbd_readahead, "readahead", "Read ahead");
     plb.add_u64_counter(l_librbd_readahead_bytes, "readahead_bytes", "Data size in read ahead");
+    plb.add_u64_counter(l_librbd_invalidate_cache, "invalidate_cache", "Cache invalidates");
 
     perfcounter = plb.create_perf_counters();
     cct->get_perfcounters_collection()->add(perfcounter);
@@ -611,10 +684,12 @@ public:
 
   void ImageCtx::write_to_cache(object_t o, const bufferlist& bl, size_t len,
 				uint64_t off, Context *onfinish,
-				int fadvise_flags) {
+				int fadvise_flags, uint64_t journal_tid) {
     snap_lock.get_read();
     ObjectCacher::OSDWrite *wr = object_cacher->prepare_write(snapc, bl,
-							      utime_t(), fadvise_flags);
+							      utime_t(),
+                                                              fadvise_flags,
+                                                              journal_tid);
     snap_lock.put_read();
     ObjectExtent extent(o, 0, off, len, 0);
     extent.oloc.pool = data_ctx.get_id();
@@ -647,30 +722,24 @@ public:
     }
   }
 
-  void ImageCtx::flush_cache_aio(Context *onfinish) {
+  int ImageCtx::flush_cache() {
+    C_SaferCond cond_ctx;
+    flush_cache(&cond_ctx);
+
+    ldout(cct, 20) << "waiting for cache to be flushed" << dendl;
+    int r = cond_ctx.wait();
+    ldout(cct, 20) << "finished flushing cache" << dendl;
+
+    return r;
+  }
+
+  void ImageCtx::flush_cache(Context *onfinish) {
     assert(owner_lock.is_locked());
     cache_lock.Lock();
     object_cacher->flush_set(object_set, onfinish);
     cache_lock.Unlock();
   }
 
-  int ImageCtx::flush_cache() {
-    int r = 0;
-    Mutex mylock("librbd::ImageCtx::flush_cache");
-    Cond cond;
-    bool done;
-    Context *onfinish = new C_SafeCond(&mylock, &cond, &done, &r);
-    flush_cache_aio(onfinish);
-    mylock.Lock();
-    while (!done) {
-      ldout(cct, 20) << "waiting for cache to be flushed" << dendl;
-      cond.Wait(mylock);
-    }
-    mylock.Unlock();
-    ldout(cct, 20) << "finished flushing cache" << dendl;
-    return r;
-  }
-
   int ImageCtx::shutdown_cache() {
     flush_async_operations();
 
@@ -681,20 +750,19 @@ public:
   }
 
   int ImageCtx::invalidate_cache(bool purge_on_error) {
-    int result;
-    C_SaferCond ctx;
-    invalidate_cache(&ctx);
-    result = ctx.wait();
-
-    if (result && purge_on_error) {
-      cache_lock.Lock();
-      if (object_cacher != NULL) {
-	lderr(cct) << "invalidate cache met error " << cpp_strerror(result) << " !Purging cache..." << dendl;
-	object_cacher->purge_set(object_set);
-      }
-      cache_lock.Unlock();
+    flush_async_operations();
+    if (object_cacher == NULL) {
+      return 0;
     }
 
+    cache_lock.Lock();
+    object_cacher->release_set(object_set);
+    cache_lock.Unlock();
+
+    C_SaferCond ctx;
+    flush_cache(new C_InvalidateCache(this, purge_on_error, true, &ctx));
+
+    int result = ctx.wait();
     return result;
   }
 
@@ -708,29 +776,7 @@ public:
     object_cacher->release_set(object_set);
     cache_lock.Unlock();
 
-    flush_cache_aio(new FunctionContext(boost::bind(
-      &ImageCtx::invalidate_cache_completion, this, _1, on_finish)));
-  }
-
-  void ImageCtx::invalidate_cache_completion(int r, Context *on_finish) {
-    assert(cache_lock.is_locked());
-    if (r == -EBLACKLISTED) {
-      lderr(cct) << "Blacklisted during flush!  Purging cache..." << dendl;
-      object_cacher->purge_set(object_set);
-    } else if (r != 0) {
-      lderr(cct) << "flush_cache returned " << r << dendl;
-    }
-
-    loff_t unclean = object_cacher->release_set(object_set);
-    if (unclean == 0) {
-      r = 0;
-    } else {
-      lderr(cct) << "could not release all objects from cache: "
-                 << unclean << " bytes remain" << dendl;
-      r = -EBUSY;
-    }
-
-    op_work_queue->queue(on_finish, r);
+    flush_cache(new C_InvalidateCache(this, false, false, on_finish));
   }
 
   void ImageCtx::clear_nonexistence_cache() {
@@ -743,6 +789,7 @@ public:
   int ImageCtx::register_watch() {
     assert(image_watcher == NULL);
     image_watcher = new ImageWatcher(*this);
+    aio_work_queue->register_lock_listener();
     return image_watcher->register_watch();
   }
 
@@ -782,15 +829,31 @@ public:
   }
 
   void ImageCtx::flush_async_operations(Context *on_finish) {
-    Mutex::Locker l(async_ops_lock);
-    if (async_ops.empty()) {
-      op_work_queue->queue(on_finish, 0);
-      return;
+    {
+      Mutex::Locker l(async_ops_lock);
+      if (!async_ops.empty()) {
+        ldout(cct, 20) << "flush async operations: " << on_finish << " "
+                       << "count=" << async_ops.size() << dendl;
+        async_ops.front()->add_flush_context(on_finish);
+        return;
+      }
     }
+    on_finish->complete(0);
+  }
 
-    ldout(cct, 20) << "flush async operations: " << on_finish << " "
-                   << "count=" << async_ops.size() << dendl;
-    async_ops.front()->add_flush_context(on_finish);
+  int ImageCtx::flush() {
+    C_SaferCond cond_ctx;
+    flush(&cond_ctx);
+    return cond_ctx.wait();
+  }
+
+  void ImageCtx::flush(Context *on_safe) {
+    assert(owner_lock.is_locked());
+    if (object_cacher != NULL) {
+      // flush cache after completing all in-flight AIO ops
+      on_safe = new C_FlushCache(this, on_safe);
+    }
+    flush_async_operations(on_safe);
   }
 
   void ImageCtx::cancel_async_requests() {
@@ -926,4 +989,24 @@ public:
     ASSIGN_OPTION(request_timed_out_seconds);
     ASSIGN_OPTION(enable_alloc_hint);
   }
+
+  void ImageCtx::open_journal() {
+    assert(journal == NULL);
+    journal = new Journal(*this);
+  }
+
+  int ImageCtx::close_journal(bool force) {
+    assert(journal != NULL);
+    int r = journal->close();
+    if (r < 0) {
+      lderr(cct) << "failed to flush journal: " << cpp_strerror(r) << dendl;
+      if (!force) {
+        return r;
+      }
+    }
+
+    delete journal;
+    journal = NULL;
+    return r;
+  }
 }
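
Note: the synchronous paths above (flush_cache(), flush(), invalidate_cache())
now all share one shape: wrap a C_SaferCond, invoke the asynchronous variant,
and wait, rather than hand-rolling a Mutex/Cond/done triple per call site.
Sketch, assuming the caller already holds owner_lock as flush(Context*)
asserts:

    C_SaferCond cond_ctx;
    flush(&cond_ctx);          // async: cache flush after in-flight AIO drains
    int r = cond_ctx.wait();   // block until everything is reported safe
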
diff --git a/src/librbd/ImageCtx.h b/src/librbd/ImageCtx.h
index 3c7f170..f61929c 100644
--- a/src/librbd/ImageCtx.h
+++ b/src/librbd/ImageCtx.h
@@ -16,7 +16,6 @@
 #include "common/Readahead.h"
 #include "common/RWLock.h"
 #include "common/snap_types.h"
-#include "common/WorkQueue.h"
 #include "include/atomic.h"
 #include "include/buffer.h"
 #include "include/rbd/librbd.hpp"
@@ -32,16 +31,20 @@
 #include "librbd/parent_types.h"
 
 class CephContext;
+class ContextWQ;
 class Finisher;
 class PerfCounters;
 
 namespace librbd {
 
+  class AioImageRequestWQ;
   class AsyncOperation;
   template <typename ImageCtxT> class AsyncRequest;
   class AsyncResizeRequest;
   class CopyupRequest;
+  class LibrbdAdminSocketHook;
   class ImageWatcher;
+  class Journal;
 
   struct ImageCtx {
     CephContext *cct;
@@ -67,6 +70,7 @@ namespace librbd {
     std::string snap_name;
     IoCtx data_ctx, md_ctx;
     ImageWatcher *image_watcher;
+    Journal *journal;
     int refresh_seq;    ///< sequence for refresh requests
     int last_refresh;   ///< last completed refresh
 
@@ -131,9 +135,12 @@ namespace librbd {
 
     xlist<AsyncResizeRequest*> async_resize_reqs;
 
-    ContextWQ *aio_work_queue;
+    AioImageRequestWQ *aio_work_queue;
     ContextWQ *op_work_queue;
 
+    Cond refresh_cond;
+    bool refresh_in_progress;
+
     // Configuration
     static const string METADATA_CONF_PREFIX;
     bool non_blocking_aio;
@@ -158,6 +165,9 @@ namespace librbd {
     uint32_t blacklist_expire_seconds;
     uint32_t request_timed_out_seconds;
     bool enable_alloc_hint;
+
+    LibrbdAdminSocketHook *asok_hook;
+
     static bool _filter_metadata_confs(const string &prefix, std::map<string, bool> &configs,
                                        map<string, bufferlist> &pairs, map<string, bufferlist> *res);
 
@@ -216,14 +226,14 @@ namespace librbd {
 			     size_t len, uint64_t off, Context *onfinish,
 			     int fadvise_flags);
     void write_to_cache(object_t o, const bufferlist& bl, size_t len,
-			uint64_t off, Context *onfinish, int fadvise_flags);
+			uint64_t off, Context *onfinish, int fadvise_flags,
+                        uint64_t journal_tid);
     void user_flushed();
-    void flush_cache_aio(Context *onfinish);
     int flush_cache();
+    void flush_cache(Context *onfinish);
     int shutdown_cache();
     int invalidate_cache(bool purge_on_error=false);
     void invalidate_cache(Context *on_finish);
-    void invalidate_cache_completion(int r, Context *on_finish);
     void clear_nonexistence_cache();
     int register_watch();
     void unregister_watch();
@@ -233,8 +243,14 @@ namespace librbd {
     void flush_async_operations();
     void flush_async_operations(Context *on_finish);
 
+    int flush();
+    void flush(Context *on_safe);
+
     void cancel_async_requests();
     void apply_metadata_confs();
+
+    void open_journal();
+    int close_journal(bool force);
   };
 }
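
Note: ImageCtx.h drops the common/WorkQueue.h include in favor of forward
declarations (ContextWQ, plus the new AioImageRequestWQ and Journal). The
struct only stores pointers to these types, so the full definitions are needed
solely in the .cc files, trimming rebuild fan-out for everything that includes
this widely-used header:

    class ContextWQ;                 // was: #include "common/WorkQueue.h"
    namespace librbd {
      class AioImageRequestWQ;
      class Journal;
      // pointer members only, so forward declarations suffice here
    }
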
 
diff --git a/src/librbd/ImageWatcher.cc b/src/librbd/ImageWatcher.cc
index 43644f8..014ce94 100644
--- a/src/librbd/ImageWatcher.cc
+++ b/src/librbd/ImageWatcher.cc
@@ -34,11 +34,12 @@ ImageWatcher::ImageWatcher(ImageCtx &image_ctx)
   : m_image_ctx(image_ctx),
     m_watch_lock(unique_lock_name("librbd::ImageWatcher::m_watch_lock", this)),
     m_watch_ctx(*this), m_watch_handle(0),
-    m_watch_state(WATCH_STATE_UNREGISTERED),
+    m_watch_state(WATCH_STATE_UNREGISTERED), m_lock_supported(false),
     m_lock_owner_state(LOCK_OWNER_STATE_NOT_LOCKED),
+    m_listeners_lock(unique_lock_name("librbd::ImageWatcher::m_listeners_lock", this)),
+    m_listeners_in_use(false),
     m_task_finisher(new TaskFinisher<Task>(*m_image_ctx.cct)),
     m_async_request_lock(unique_lock_name("librbd::ImageWatcher::m_async_request_lock", this)),
-    m_aio_request_lock(unique_lock_name("librbd::ImageWatcher::m_aio_request_lock", this)),
     m_owner_client_id_lock(unique_lock_name("librbd::ImageWatcher::m_owner_client_id_lock", this))
 {
 }
@@ -74,6 +75,20 @@ bool ImageWatcher::is_lock_owner() const {
           m_lock_owner_state == LOCK_OWNER_STATE_RELEASING);
 }
 
+void ImageWatcher::register_listener(Listener *listener) {
+  Mutex::Locker listeners_locker(m_listeners_lock);
+  m_listeners.push_back(listener);
+}
+
+void ImageWatcher::unregister_listener(Listener *listener) {
+  // TODO CoW listener list
+  Mutex::Locker listeners_locker(m_listeners_lock);
+  while (m_listeners_in_use) {
+    m_listeners_cond.Wait(m_listeners_lock);
+  }
+  m_listeners.remove(listener);
+}
+
 int ImageWatcher::register_watch() {
   ldout(m_image_ctx.cct, 10) << this << " registering image watcher" << dendl;
 
@@ -93,11 +108,6 @@ int ImageWatcher::register_watch() {
 int ImageWatcher::unregister_watch() {
   ldout(m_image_ctx.cct, 10) << this << " unregistering image watcher" << dendl;
 
-  {
-    Mutex::Locker l(m_aio_request_lock);
-    assert(m_aio_requests.empty());
-  }
-
   cancel_async_requests();
   m_task_finisher->cancel_all();
 
@@ -115,9 +125,44 @@ int ImageWatcher::unregister_watch() {
   return r;
 }
 
+int ImageWatcher::refresh() {
+  assert(m_image_ctx.owner_lock.is_locked());
+
+  bool lock_support_changed = false;
+  {
+    RWLock::WLocker watch_locker(m_watch_lock);
+    if (m_lock_supported != is_lock_supported()) {
+      m_lock_supported = is_lock_supported();
+      lock_support_changed = true;
+    }
+  }
+
+  int r = 0;
+  if (lock_support_changed) {
+    if (is_lock_supported()) {
+      // image opened, exclusive lock dynamically enabled, or now HEAD
+      notify_listeners_updated_lock(LOCK_UPDATE_STATE_RELEASING);
+      notify_listeners_updated_lock(LOCK_UPDATE_STATE_UNLOCKED);
+    } else if (!is_lock_supported()) {
+      if (is_lock_owner()) {
+        // exclusive lock dynamically disabled or now snapshot
+        m_image_ctx.owner_lock.put_read();
+        {
+          RWLock::WLocker owner_locker(m_image_ctx.owner_lock);
+          r = release_lock();
+        }
+        m_image_ctx.owner_lock.get_read();
+      }
+      notify_listeners_updated_lock(LOCK_UPDATE_STATE_NOT_SUPPORTED);
+    }
+  }
+  return r;
+}
+
 int ImageWatcher::try_lock() {
   assert(m_image_ctx.owner_lock.is_wlocked());
   assert(m_lock_owner_state == LOCK_OWNER_STATE_NOT_LOCKED);
+  assert(is_lock_supported());
 
   while (true) {
     int r = lock();
@@ -182,33 +227,8 @@ int ImageWatcher::try_lock() {
   return 0;
 }
 
-void ImageWatcher::request_lock(
-    const boost::function<void(AioCompletion*)>& restart_op, AioCompletion* c) {
-  assert(m_image_ctx.owner_lock.is_locked());
-  assert(m_lock_owner_state == LOCK_OWNER_STATE_NOT_LOCKED);
-
-  {
-    Mutex::Locker l(m_aio_request_lock);
-    bool request_pending = !m_aio_requests.empty();
-    ldout(m_image_ctx.cct, 15) << this << " queuing aio request: " << c
-			       << dendl;
-
-    c->get();
-    m_aio_requests.push_back(std::make_pair(restart_op, c));
-    if (request_pending) {
-      return;
-    }
-  }
-
-  RWLock::RLocker l(m_watch_lock);
-  if (m_watch_state == WATCH_STATE_REGISTERED) {
-    ldout(m_image_ctx.cct, 15) << this << " requesting exclusive lock" << dendl;
-
-    // run notify request in finisher to avoid blocking aio path
-    FunctionContext *ctx = new FunctionContext(
-      boost::bind(&ImageWatcher::notify_request_lock, this));
-    m_task_finisher->queue(TASK_CODE_REQUEST_LOCK, ctx);
-  }
+void ImageWatcher::request_lock() {
+  schedule_request_lock(false);
 }
 
 bool ImageWatcher::try_request_lock() {
@@ -291,6 +311,9 @@ int ImageWatcher::get_lock_owner_info(entity_name_t *locker, std::string *cookie
 }
 
 int ImageWatcher::lock() {
+  assert(m_image_ctx.owner_lock.is_wlocked());
+  assert(m_lock_owner_state == LOCK_OWNER_STATE_NOT_LOCKED);
+
   int r = rados::cls::lock::lock(&m_image_ctx.md_ctx, m_image_ctx.header_oid,
 				 RBD_LOCK_NAME, LOCK_EXCLUSIVE,
 				 encode_lock_cookie(), WATCHER_LOCK_TAG, "",
@@ -318,37 +341,16 @@ int ImageWatcher::lock() {
     m_image_ctx.object_map.refresh(CEPH_NOSNAP);
   }
 
-  bufferlist bl;
-  ::encode(NotifyMessage(AcquiredLockPayload(get_client_id())), bl);
-
   // send the notification when we aren't holding locks
   FunctionContext *ctx = new FunctionContext(
-    boost::bind(&IoCtx::notify2, &m_image_ctx.md_ctx, m_image_ctx.header_oid,
-		bl, NOTIFY_TIMEOUT, reinterpret_cast<bufferlist *>(0)));
+    boost::bind(&ImageWatcher::notify_acquired_lock, this));
   m_task_finisher->queue(TASK_CODE_ACQUIRED_LOCK, ctx);
   return 0;
 }
 
-void ImageWatcher::prepare_unlock() {
-  assert(m_image_ctx.owner_lock.is_wlocked());
-  if (m_lock_owner_state == LOCK_OWNER_STATE_LOCKED) {
-    m_lock_owner_state = LOCK_OWNER_STATE_RELEASING;
-  }
-}
-
-void ImageWatcher::cancel_unlock() {
-  assert(m_image_ctx.owner_lock.is_wlocked());
-  if (m_lock_owner_state == LOCK_OWNER_STATE_RELEASING) {
-    m_lock_owner_state = LOCK_OWNER_STATE_LOCKED;
-  }
-}
-
 int ImageWatcher::unlock()
 {
   assert(m_image_ctx.owner_lock.is_wlocked());
-  if (m_lock_owner_state == LOCK_OWNER_STATE_NOT_LOCKED) {
-    return 0;
-  }
 
   ldout(m_image_ctx.cct, 10) << this << " releasing exclusive lock" << dendl;
   m_lock_owner_state = LOCK_OWNER_STATE_NOT_LOCKED;
@@ -364,8 +366,10 @@ int ImageWatcher::unlock()
     m_image_ctx.object_map.unlock();
   }
 
-  Mutex::Locker l(m_owner_client_id_lock);
-  set_owner_client_id(ClientId());
+  {
+    Mutex::Locker l(m_owner_client_id_lock);
+    set_owner_client_id(ClientId());
+  }
 
   FunctionContext *ctx = new FunctionContext(
     boost::bind(&ImageWatcher::notify_released_lock, this));
@@ -373,32 +377,66 @@ int ImageWatcher::unlock()
   return 0;
 }
 
-bool ImageWatcher::release_lock()
+int ImageWatcher::release_lock()
 {
   assert(m_image_ctx.owner_lock.is_wlocked());
-  ldout(m_image_ctx.cct, 10) << this << " releasing exclusive lock by request"
-                             << dendl;
-  if (!is_lock_owner()) {
-    return false;
+
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 10) << this << " releasing exclusive lock by request" << dendl;
+  if (m_lock_owner_state != LOCK_OWNER_STATE_LOCKED) {
+    return 0;
   }
-  prepare_unlock();
+
+  m_lock_owner_state = LOCK_OWNER_STATE_RELEASING;
   m_image_ctx.owner_lock.put_write();
+
+  // ensure all maintenance operations are canceled
   m_image_ctx.cancel_async_requests();
   m_image_ctx.flush_async_operations();
 
+  int r;
   {
     RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+
+    // alert listeners that all incoming IO needs to be stopped since the
+    // lock is being released
+    notify_listeners_updated_lock(LOCK_UPDATE_STATE_RELEASING);
+
     RWLock::WLocker md_locker(m_image_ctx.md_lock);
-    librbd::_flush(&m_image_ctx);
+    r = m_image_ctx.flush();
+    if (r < 0) {
+      lderr(cct) << this << " failed to flush: " << cpp_strerror(r) << dendl;
+      goto err_cancel_unlock;
+    }
   }
 
   m_image_ctx.owner_lock.get_write();
-  if (!is_lock_owner()) {
-    return false;
+  assert(m_lock_owner_state == LOCK_OWNER_STATE_RELEASING);
+  r = unlock();
+
+  // notify listeners of the change w/ owner read locked
+  m_image_ctx.owner_lock.put_write();
+  {
+    RWLock::RLocker owner_lock(m_image_ctx.owner_lock);
+    if (m_lock_owner_state == LOCK_OWNER_STATE_NOT_LOCKED) {
+      notify_listeners_updated_lock(LOCK_UPDATE_STATE_UNLOCKED);
+    }
   }
+  m_image_ctx.owner_lock.get_write();
 
-  unlock();
-  return true;
+  if (r < 0) {
+    lderr(cct) << this << " failed to unlock: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  return 0;
+
+err_cancel_unlock:
+  m_image_ctx.owner_lock.get_write();
+  if (m_lock_owner_state == LOCK_OWNER_STATE_RELEASING) {
+    m_lock_owner_state = LOCK_OWNER_STATE_LOCKED;
+  }
+  return r;
 }
 
 void ImageWatcher::assert_header_locked(librados::ObjectWriteOperation *op) {
@@ -496,6 +534,16 @@ int ImageWatcher::notify_snap_create(const std::string &snap_name) {
   return notify_lock_owner(bl);
 }
 
+int ImageWatcher::notify_snap_rename(const snapid_t &src_snap_id,
+				     const std::string &dst_snap_name) {
+  assert(m_image_ctx.owner_lock.is_locked());
+  assert(!is_lock_owner());
+
+  bufferlist bl;
+  ::encode(NotifyMessage(SnapRenamePayload(src_snap_id, dst_snap_name)), bl);
+
+  return notify_lock_owner(bl);
+}
+
 int ImageWatcher::notify_snap_remove(const std::string &snap_name) {
   assert(m_image_ctx.owner_lock.is_locked());
   assert(!is_lock_owner());
@@ -519,6 +567,21 @@ int ImageWatcher::notify_rebuild_object_map(uint64_t request_id,
   return notify_async_request(async_request_id, bl, prog_ctx);
 }
 
+void ImageWatcher::notify_lock_state() {
+  RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+  if (m_lock_owner_state == LOCK_OWNER_STATE_LOCKED) {
+    // re-send the acquired lock notification so that peers know they can now
+    // request the lock
+    ldout(m_image_ctx.cct, 10) << this << " notify lock state" << dendl;
+
+    bufferlist bl;
+    ::encode(NotifyMessage(AcquiredLockPayload(get_client_id())), bl);
+
+    m_image_ctx.md_ctx.notify2(m_image_ctx.header_oid, bl, NOTIFY_TIMEOUT,
+                               NULL);
+  }
+}
+
 void ImageWatcher::notify_header_update(librados::IoCtx &io_ctx,
 				        const std::string &oid)
 {
@@ -546,37 +609,6 @@ bool ImageWatcher::decode_lock_cookie(const std::string &tag,
   return true;
 }
 
-void ImageWatcher::schedule_retry_aio_requests(bool use_timer) {
-  m_task_finisher->cancel(TASK_CODE_REQUEST_LOCK);
-  Context *ctx = new FunctionContext(boost::bind(
-    &ImageWatcher::retry_aio_requests, this));
-  if (use_timer) {
-    m_task_finisher->add_event_after(TASK_CODE_RETRY_AIO_REQUESTS,
-                                     RETRY_DELAY_SECONDS, ctx);
-  } else {
-    m_task_finisher->queue(TASK_CODE_RETRY_AIO_REQUESTS, ctx);
-  }
-}
-
-void ImageWatcher::retry_aio_requests() {
-  m_task_finisher->cancel(TASK_CODE_RETRY_AIO_REQUESTS);
-  std::vector<AioRequest> lock_request_restarts;
-  {
-    Mutex::Locker l(m_aio_request_lock);
-    lock_request_restarts.swap(m_aio_requests);
-  }
-
-  ldout(m_image_ctx.cct, 15) << this << " retrying pending aio requests"
-                             << dendl;
-  for (std::vector<AioRequest>::iterator iter = lock_request_restarts.begin();
-       iter != lock_request_restarts.end(); ++iter) {
-    ldout(m_image_ctx.cct, 20) << this << " retrying aio request: "
-                               << iter->second << dendl;
-    iter->first(iter->second);
-    iter->second->put();
-  }
-}
-
 void ImageWatcher::schedule_cancel_async_requests() {
   FunctionContext *ctx = new FunctionContext(
     boost::bind(&ImageWatcher::cancel_async_requests, this));
@@ -605,6 +637,21 @@ ClientId ImageWatcher::get_client_id() {
   return ClientId(m_image_ctx.md_ctx.get_instance_id(), m_watch_handle);
 }
 
+void ImageWatcher::notify_acquired_lock() {
+  ldout(m_image_ctx.cct, 10) << this << " notify acquired lock" << dendl;
+
+  RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+  if (m_lock_owner_state != LOCK_OWNER_STATE_LOCKED) {
+    return;
+  }
+
+  notify_listeners_updated_lock(LOCK_UPDATE_STATE_LOCKED);
+
+  bufferlist bl;
+  ::encode(NotifyMessage(AcquiredLockPayload(get_client_id())), bl);
+  m_image_ctx.md_ctx.notify2(m_image_ctx.header_oid, bl, NOTIFY_TIMEOUT, NULL);
+}
+
 void ImageWatcher::notify_release_lock() {
   RWLock::WLocker owner_locker(m_image_ctx.owner_lock);
   release_lock();
@@ -617,14 +664,33 @@ void ImageWatcher::notify_released_lock() {
   m_image_ctx.md_ctx.notify2(m_image_ctx.header_oid, bl, NOTIFY_TIMEOUT, NULL);
 }
 
+void ImageWatcher::schedule_request_lock(bool use_timer, int timer_delay) {
+  assert(m_image_ctx.owner_lock.is_locked());
+  assert(m_lock_owner_state == LOCK_OWNER_STATE_NOT_LOCKED);
+
+  RWLock::RLocker watch_locker(m_watch_lock);
+  if (m_watch_state == WATCH_STATE_REGISTERED) {
+    ldout(m_image_ctx.cct, 15) << this << " requesting exclusive lock" << dendl;
+
+    FunctionContext *ctx = new FunctionContext(
+      boost::bind(&ImageWatcher::notify_request_lock, this));
+    if (use_timer) {
+      if (timer_delay < 0) {
+        timer_delay = RETRY_DELAY_SECONDS;
+      }
+      m_task_finisher->add_event_after(TASK_CODE_REQUEST_LOCK, timer_delay,
+                                       ctx);
+    } else {
+      m_task_finisher->queue(TASK_CODE_REQUEST_LOCK, ctx);
+    }
+  }
+}
+
 void ImageWatcher::notify_request_lock() {
   ldout(m_image_ctx.cct, 10) << this << " notify request lock" << dendl;
-  m_task_finisher->cancel(TASK_CODE_RETRY_AIO_REQUESTS);
 
-  m_image_ctx.owner_lock.get_read();
+  RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
   if (try_request_lock()) {
-    m_image_ctx.owner_lock.put_read();
-    retry_aio_requests();
     return;
   }
 
@@ -632,23 +698,20 @@ void ImageWatcher::notify_request_lock() {
   ::encode(NotifyMessage(RequestLockPayload(get_client_id())), bl);
 
   int r = notify_lock_owner(bl);
-  m_image_ctx.owner_lock.put_read();
-
   if (r == -ETIMEDOUT) {
-    ldout(m_image_ctx.cct, 5) << this << "timed out requesting lock: retrying"
+    ldout(m_image_ctx.cct, 5) << this << " timed out requesting lock: retrying"
                               << dendl;
-    retry_aio_requests();
+    schedule_request_lock(false);
   } else if (r < 0) {
     lderr(m_image_ctx.cct) << this << " error requesting lock: "
                            << cpp_strerror(r) << dendl;
-    schedule_retry_aio_requests(true);
+    schedule_request_lock(true);
   } else {
     // lock owner acked -- but resend if we don't see them release the lock
     int retry_timeout = m_image_ctx.cct->_conf->client_notify_timeout;
-    FunctionContext *ctx = new FunctionContext(
-      boost::bind(&ImageWatcher::notify_request_lock, this));
-    m_task_finisher->add_event_after(TASK_CODE_REQUEST_LOCK,
-                                     retry_timeout, ctx);
+    ldout(m_image_ctx.cct, 15) << this << " will retry in " << retry_timeout
+                               << " seconds" << dendl;
+    schedule_request_lock(true, retry_timeout);
   }
 }
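
Note: the ad-hoc retry paths removed in this file are consolidated in
schedule_request_lock(bool use_timer, int timer_delay), giving
notify_request_lock() a single retry policy:

    //   -ETIMEDOUT from the owner -> re-queue immediately (use_timer = false)
    //   any other error           -> retry after RETRY_DELAY_SECONDS
    //   owner acked the request   -> re-arm after client_notify_timeout, in
    //                                case the release notification is missed
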
 
@@ -803,40 +866,48 @@ void ImageWatcher::handle_payload(const AcquiredLockPayload &payload,
                                   bufferlist *out) {
   ldout(m_image_ctx.cct, 10) << this << " image exclusively locked announcement"
                              << dendl;
+
+  bool cancel_async_requests = true;
   if (payload.client_id.is_valid()) {
     Mutex::Locker l(m_owner_client_id_lock);
     if (payload.client_id == m_owner_client_id) {
-      // we already know that the remote client is the owner
-      return;
+      cancel_async_requests = false;
     }
     set_owner_client_id(payload.client_id);
   }
 
-  RWLock::RLocker l(m_image_ctx.owner_lock);
+  RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
   if (m_lock_owner_state == LOCK_OWNER_STATE_NOT_LOCKED) {
-    schedule_cancel_async_requests();
-    schedule_retry_aio_requests(false);
+    if (cancel_async_requests) {
+      schedule_cancel_async_requests();
+    }
+    notify_listeners_updated_lock(LOCK_UPDATE_STATE_NOTIFICATION);
   }
 }
 
 void ImageWatcher::handle_payload(const ReleasedLockPayload &payload,
                                   bufferlist *out) {
   ldout(m_image_ctx.cct, 10) << this << " exclusive lock released" << dendl;
+
+  bool cancel_async_requests = true;
   if (payload.client_id.is_valid()) {
     Mutex::Locker l(m_owner_client_id_lock);
     if (payload.client_id != m_owner_client_id) {
       ldout(m_image_ctx.cct, 10) << this << " unexpected owner: "
                                  << payload.client_id << " != "
                                  << m_owner_client_id << dendl;
-      return;
+      cancel_async_requests = false;
+    } else {
+      set_owner_client_id(ClientId());
     }
-    set_owner_client_id(ClientId());
   }
 
-  RWLock::RLocker l(m_image_ctx.owner_lock);
+  RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
   if (m_lock_owner_state == LOCK_OWNER_STATE_NOT_LOCKED) {
-    schedule_cancel_async_requests();
-    schedule_retry_aio_requests(false);
+    if (cancel_async_requests) {
+      schedule_cancel_async_requests();
+    }
+    notify_listeners_updated_lock(LOCK_UPDATE_STATE_NOTIFICATION);
   }
 }
 
@@ -859,11 +930,25 @@ void ImageWatcher::handle_payload(const RequestLockPayload &payload,
       }
     }
 
-    ldout(m_image_ctx.cct, 10) << this << " queuing release of exclusive lock"
-                               << dendl;
-    FunctionContext *ctx = new FunctionContext(
-      boost::bind(&ImageWatcher::notify_release_lock, this));
-    m_task_finisher->queue(TASK_CODE_RELEASING_LOCK, ctx);
+    bool release_permitted = true;
+    {
+      Mutex::Locker listeners_locker(m_listeners_lock);
+      for (Listeners::iterator it = m_listeners.begin();
+           it != m_listeners.end(); ++it) {
+        if (!(*it)->handle_requested_lock()) {
+          release_permitted = false;
+          break;
+        }
+      }
+    }
+
+    if (release_permitted) {
+      ldout(m_image_ctx.cct, 10) << this << " queuing release of exclusive lock"
+                                 << dendl;
+      FunctionContext *ctx = new FunctionContext(
+        boost::bind(&ImageWatcher::notify_release_lock, this));
+      m_task_finisher->queue(TASK_CODE_RELEASING_LOCK, ctx);
+    }
   }
 }
 
@@ -958,6 +1043,20 @@ void ImageWatcher::handle_payload(const SnapCreatePayload &payload,
   }
 }
 
+void ImageWatcher::handle_payload(const SnapRenamePayload &payload,
+				  bufferlist *out) {
+  RWLock::RLocker l(m_image_ctx.owner_lock);
+  if (m_lock_owner_state == LOCK_OWNER_STATE_LOCKED) {
+    ldout(m_image_ctx.cct, 10) << this << " remote snap_rename request: "
+			       << payload.src_snap_id << " to " 
+			       << payload.dst_snap_name << dendl;
+    int r = librbd::snap_rename_helper(&m_image_ctx, NULL,
+                                       payload.src_snap_id,
+                                       payload.dst_snap_name.c_str());
+
+    ::encode(ResponseMessage(r), *out);
+  }
+}
+
 void ImageWatcher::handle_payload(const SnapRemovePayload &payload,
 				  bufferlist *out) {
   RWLock::RLocker l(m_image_ctx.owner_lock);
@@ -1054,53 +1153,53 @@ void ImageWatcher::acknowledge_notify(uint64_t notify_id, uint64_t handle,
 void ImageWatcher::reregister_watch() {
   ldout(m_image_ctx.cct, 10) << this << " re-registering image watch" << dendl;
 
+  RWLock::WLocker l(m_image_ctx.owner_lock);
+  bool was_lock_owner = false;
+  if (m_lock_owner_state == LOCK_OWNER_STATE_LOCKED) {
+    // ensure all async requests are canceled and IO is flushed
+    was_lock_owner = release_lock();
+  }
+
+  int r;
   {
-    RWLock::WLocker l(m_image_ctx.owner_lock);
-    bool was_lock_owner = false;
-    if (m_lock_owner_state == LOCK_OWNER_STATE_LOCKED) {
-      // ensure all async requests are canceled and IO is flushed
-      was_lock_owner = release_lock();
+    RWLock::WLocker l(m_watch_lock);
+    if (m_watch_state != WATCH_STATE_ERROR) {
+      return;
     }
 
-    int r;
-    {
-      RWLock::WLocker l(m_watch_lock);
-      if (m_watch_state != WATCH_STATE_ERROR) {
-	return;
-      }
-
-      r = m_image_ctx.md_ctx.watch2(m_image_ctx.header_oid,
-                                    &m_watch_handle, &m_watch_ctx);
-      if (r < 0) {
-        lderr(m_image_ctx.cct) << this << " failed to re-register image watch: "
-                               << cpp_strerror(r) << dendl;
-	if (r != -ESHUTDOWN) {
-	  FunctionContext *ctx = new FunctionContext(boost::bind(
-	    &ImageWatcher::reregister_watch, this));
-	  m_task_finisher->add_event_after(TASK_CODE_REREGISTER_WATCH,
-                                           RETRY_DELAY_SECONDS, ctx);
-	}
-        return;
+    r = m_image_ctx.md_ctx.watch2(m_image_ctx.header_oid,
+                                  &m_watch_handle, &m_watch_ctx);
+    if (r < 0) {
+      lderr(m_image_ctx.cct) << this << " failed to re-register image watch: "
+                             << cpp_strerror(r) << dendl;
+      if (r != -ESHUTDOWN) {
+        FunctionContext *ctx = new FunctionContext(boost::bind(
+          &ImageWatcher::reregister_watch, this));
+        m_task_finisher->add_event_after(TASK_CODE_REREGISTER_WATCH,
+                                         RETRY_DELAY_SECONDS, ctx);
       }
-
-      m_watch_state = WATCH_STATE_REGISTERED;
+      return;
     }
-    handle_payload(HeaderUpdatePayload(), NULL);
 
-    if (was_lock_owner) {
-      r = try_lock();
-      if (r == -EBUSY) {
-        ldout(m_image_ctx.cct, 5) << this << "lost image lock while "
-                                  << "re-registering image watch" << dendl;
-      } else if (r < 0) {
-        lderr(m_image_ctx.cct) << this
-                               << "failed to lock image while re-registering "
-                               << "image watch" << cpp_strerror(r) << dendl;
-      }
+    m_watch_state = WATCH_STATE_REGISTERED;
+  }
+  handle_payload(HeaderUpdatePayload(), NULL);
+
+  if (was_lock_owner) {
+    r = try_lock();
+    if (r == -EBUSY) {
+      ldout(m_image_ctx.cct, 5) << this << " lost image lock while "
+                                << "re-registering image watch" << dendl;
+    } else if (r < 0) {
+      lderr(m_image_ctx.cct) << this
+                             << " failed to lock image while re-registering "
+                             << "image watch: " << cpp_strerror(r) << dendl;
     }
   }
 
-  retry_aio_requests();
+  if (m_lock_owner_state == LOCK_OWNER_STATE_NOT_LOCKED) {
+    notify_listeners_updated_lock(LOCK_UPDATE_STATE_UNLOCKED);
+  }
 }
 
 void ImageWatcher::WatchCtx::handle_notify(uint64_t notify_id,
@@ -1118,4 +1217,25 @@ void ImageWatcher::RemoteContext::finish(int r) {
   m_image_watcher.schedule_async_complete(m_async_request_id, r);
 }
 
+void ImageWatcher::notify_listeners_updated_lock(
+    LockUpdateState lock_update_state) {
+  assert(m_image_ctx.owner_lock.is_locked());
+
+  Listeners listeners;
+  {
+    Mutex::Locker listeners_locker(m_listeners_lock);
+    m_listeners_in_use = true;
+    listeners = m_listeners;
+  }
+
+  for (Listeners::iterator it = listeners.begin();
+       it != listeners.end(); ++it) {
+    (*it)->handle_lock_updated(lock_update_state);
+  }
+
+  Mutex::Locker listeners_locker(m_listeners_lock);
+  m_listeners_in_use = false;
+  m_listeners_cond.Signal();
+}
+
 }
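
Note: the listener plumbing above replaces the old per-AIO request queue:
AioImageRequestWQ registers itself through register_lock_listener() (see the
ImageCtx::register_watch() hunk) and reacts to lock transitions, instead of
ImageWatcher replaying individual AIO completions. notify_listeners_updated_lock()
snapshots the list and invokes callbacks without m_listeners_lock held, and
unregister_listener() waits on m_listeners_cond, so a listener can never be
torn down mid-callback. A hypothetical listener against the interface declared
in ImageWatcher.h below (illustrative only):

    struct ExampleLockListener : public ImageWatcher::Listener {
      virtual bool handle_requested_lock() {
        return true;   // permit a remote client to take the exclusive lock
      }
      virtual void handle_lock_updated(ImageWatcher::LockUpdateState state) {
        // e.g. pause queued IO on LOCK_UPDATE_STATE_RELEASING and resume it
        // once LOCK_UPDATE_STATE_LOCKED / _UNLOCKED arrives
      }
    };
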
diff --git a/src/librbd/ImageWatcher.h b/src/librbd/ImageWatcher.h
index 6ebeb9d..2b3da27 100644
--- a/src/librbd/ImageWatcher.h
+++ b/src/librbd/ImageWatcher.h
@@ -3,6 +3,7 @@
 #ifndef CEPH_LIBRBD_IMAGE_WATCHER_H
 #define CEPH_LIBRBD_IMAGE_WATCHER_H
 
+#include "common/Cond.h"
 #include "common/Mutex.h"
 #include "common/RWLock.h"
 #include "include/Context.h"
@@ -20,270 +21,292 @@ class entity_name_t;
 
 namespace librbd {
 
-  class AioCompletion;
-  class ImageCtx;
-  template <typename T> class TaskFinisher;
+class ImageCtx;
+template <typename T> class TaskFinisher;
+
+class ImageWatcher {
+public:
+  enum LockUpdateState {
+    LOCK_UPDATE_STATE_NOT_SUPPORTED,
+    LOCK_UPDATE_STATE_LOCKED,
+    LOCK_UPDATE_STATE_RELEASING,
+    LOCK_UPDATE_STATE_UNLOCKED,
+    LOCK_UPDATE_STATE_NOTIFICATION
+  };
 
-  class ImageWatcher {
-  public:
+  struct Listener {
+    virtual ~Listener() {}
 
-    ImageWatcher(ImageCtx& image_ctx);
-    ~ImageWatcher();
+    virtual bool handle_requested_lock() = 0;
+    virtual void handle_lock_updated(LockUpdateState lock_update_state) = 0;
+  };
 
-    bool is_lock_supported() const;
-    bool is_lock_supported(const RWLock &snap_lock) const;
-    bool is_lock_owner() const;
+  ImageWatcher(ImageCtx& image_ctx);
+  ~ImageWatcher();
 
-    int register_watch();
-    int unregister_watch();
+  bool is_lock_supported() const;
+  bool is_lock_supported(const RWLock &snap_lock) const;
+  bool is_lock_owner() const;
 
-    int try_lock();
-    void request_lock(const boost::function<void(AioCompletion*)>& restart_op,
-		      AioCompletion* c);
-    void prepare_unlock();
-    void cancel_unlock();
-    int unlock();
+  void register_listener(Listener *listener);
+  void unregister_listener(Listener *listener);
 
-    void assert_header_locked(librados::ObjectWriteOperation *op);
+  int register_watch();
+  int unregister_watch();
 
-    int notify_flatten(uint64_t request_id, ProgressContext &prog_ctx);
-    int notify_resize(uint64_t request_id, uint64_t size,
-		      ProgressContext &prog_ctx);
-    int notify_snap_create(const std::string &snap_name);
-    int notify_snap_remove(const std::string &snap_name);
-    int notify_rebuild_object_map(uint64_t request_id,
-                                  ProgressContext &prog_ctx);
+  int refresh();
 
-    static void notify_header_update(librados::IoCtx &io_ctx,
-				     const std::string &oid);
+  int try_lock();
+  void request_lock();
+  int release_lock();
 
-  private:
+  void assert_header_locked(librados::ObjectWriteOperation *op);
 
-    enum LockOwnerState {
-      LOCK_OWNER_STATE_NOT_LOCKED,
-      LOCK_OWNER_STATE_LOCKED,
-      LOCK_OWNER_STATE_RELEASING
-    };
-
-    enum WatchState {
-      WATCH_STATE_UNREGISTERED,
-      WATCH_STATE_REGISTERED,
-      WATCH_STATE_ERROR
-    };
-
-    enum TaskCode {
-      TASK_CODE_ACQUIRED_LOCK,
-      TASK_CODE_REQUEST_LOCK,
-      TASK_CODE_RELEASING_LOCK,
-      TASK_CODE_RELEASED_LOCK,
-      TASK_CODE_RETRY_AIO_REQUESTS,
-      TASK_CODE_CANCEL_ASYNC_REQUESTS,
-      TASK_CODE_REREGISTER_WATCH,
-      TASK_CODE_ASYNC_REQUEST,
-      TASK_CODE_ASYNC_PROGRESS
-    };
-
-    typedef std::pair<Context *, ProgressContext *> AsyncRequest;
-    typedef std::pair<boost::function<void(AioCompletion *)>,
-		      AioCompletion *> AioRequest;
-
-    class Task {
-    public:
-      Task(TaskCode task_code) : m_task_code(task_code) {}
-      Task(TaskCode task_code, const WatchNotify::AsyncRequestId &id)
-        : m_task_code(task_code), m_async_request_id(id) {}
-
-      inline bool operator<(const Task& rhs) const {
-        if (m_task_code != rhs.m_task_code) {
-          return m_task_code < rhs.m_task_code;
-        } else if ((m_task_code == TASK_CODE_ASYNC_REQUEST ||
-                    m_task_code == TASK_CODE_ASYNC_PROGRESS) &&
-                   m_async_request_id != rhs.m_async_request_id) {
-          return m_async_request_id < rhs.m_async_request_id;
-        }
-        return false;
-      }
-    private:
-      TaskCode m_task_code;
-      WatchNotify::AsyncRequestId m_async_request_id;
-    };
-
-    struct WatchCtx : public librados::WatchCtx2 {
-      ImageWatcher &image_watcher;
-
-      WatchCtx(ImageWatcher &parent) : image_watcher(parent) {}
-
-      virtual void handle_notify(uint64_t notify_id,
-                                 uint64_t handle,
-				 uint64_t notifier_id,
-                                 bufferlist& bl);
-      virtual void handle_error(uint64_t handle, int err);
-    };
-
-    class RemoteProgressContext : public ProgressContext {
-    public:
-      RemoteProgressContext(ImageWatcher &image_watcher,
-			    const WatchNotify::AsyncRequestId &id)
-        : m_image_watcher(image_watcher), m_async_request_id(id)
-      {
-      }
+  int notify_flatten(uint64_t request_id, ProgressContext &prog_ctx);
+  int notify_resize(uint64_t request_id, uint64_t size,
+                    ProgressContext &prog_ctx);
+  int notify_snap_create(const std::string &snap_name);
+  int notify_snap_rename(const snapid_t &src_snap_id,
+                         const std::string &dst_snap_name);
+  int notify_snap_remove(const std::string &snap_name);
+  int notify_rebuild_object_map(uint64_t request_id,
+                                ProgressContext &prog_ctx);
 
-      virtual int update_progress(uint64_t offset, uint64_t total) {
-	m_image_watcher.schedule_async_progress(m_async_request_id, offset,
-						total);
-        return 0;
-      }
+  void notify_lock_state();
+  static void notify_header_update(librados::IoCtx &io_ctx,
+                                   const std::string &oid);
 
-    private:
-      ImageWatcher &m_image_watcher;
-      WatchNotify::AsyncRequestId m_async_request_id;
-    };
-
-    class RemoteContext : public Context {
-    public:
-      RemoteContext(ImageWatcher &image_watcher,
-		    const WatchNotify::AsyncRequestId &id,
-		    ProgressContext *prog_ctx)
-        : m_image_watcher(image_watcher), m_async_request_id(id),
-	  m_prog_ctx(prog_ctx)
-      {
-      }
+private:
 
-      ~RemoteContext() {
-        delete m_prog_ctx;
-      }
+  enum LockOwnerState {
+    LOCK_OWNER_STATE_NOT_LOCKED,
+    LOCK_OWNER_STATE_LOCKED,
+    LOCK_OWNER_STATE_RELEASING
+  };
 
-      virtual void finish(int r);
+  enum WatchState {
+    WATCH_STATE_UNREGISTERED,
+    WATCH_STATE_REGISTERED,
+    WATCH_STATE_ERROR
+  };
 
-    private:
-      ImageWatcher &m_image_watcher;
-      WatchNotify::AsyncRequestId m_async_request_id;
-      ProgressContext *m_prog_ctx;
-    };
+  enum TaskCode {
+    TASK_CODE_ACQUIRED_LOCK,
+    TASK_CODE_REQUEST_LOCK,
+    TASK_CODE_RELEASING_LOCK,
+    TASK_CODE_RELEASED_LOCK,
+    TASK_CODE_CANCEL_ASYNC_REQUESTS,
+    TASK_CODE_REREGISTER_WATCH,
+    TASK_CODE_ASYNC_REQUEST,
+    TASK_CODE_ASYNC_PROGRESS
+  };
 
-    struct HandlePayloadVisitor : public boost::static_visitor<void> {
-      ImageWatcher *image_watcher;
-      uint64_t notify_id;
-      uint64_t handle;
+  typedef std::list<Listener *> Listeners;
+  typedef std::pair<Context *, ProgressContext *> AsyncRequest;
 
-      HandlePayloadVisitor(ImageWatcher *image_watcher_, uint64_t notify_id_,
-			   uint64_t handle_)
-	: image_watcher(image_watcher_), notify_id(notify_id_), handle(handle_)
-      {
+  class Task {
+  public:
+    Task(TaskCode task_code) : m_task_code(task_code) {}
+    Task(TaskCode task_code, const WatchNotify::AsyncRequestId &id)
+      : m_task_code(task_code), m_async_request_id(id) {}
+
+    inline bool operator<(const Task& rhs) const {
+      if (m_task_code != rhs.m_task_code) {
+        return m_task_code < rhs.m_task_code;
+      } else if ((m_task_code == TASK_CODE_ASYNC_REQUEST ||
+                  m_task_code == TASK_CODE_ASYNC_PROGRESS) &&
+                 m_async_request_id != rhs.m_async_request_id) {
+        return m_async_request_id < rhs.m_async_request_id;
       }
+      return false;
+    }
+  private:
+    TaskCode m_task_code;
+    WatchNotify::AsyncRequestId m_async_request_id;
+  };
 
-      inline void operator()(const WatchNotify::HeaderUpdatePayload &payload) const {
-	bufferlist out;
-	image_watcher->handle_payload(payload, &out);
-	image_watcher->acknowledge_notify(notify_id, handle, out);
-      }
+  struct WatchCtx : public librados::WatchCtx2 {
+    ImageWatcher &image_watcher;
 
-      template <typename Payload>
-      inline void operator()(const Payload &payload) const {
-	bufferlist out;
-	image_watcher->handle_payload(payload, &out);
-	image_watcher->acknowledge_notify(notify_id, handle, out);
-      }
-    };
-
-    ImageCtx &m_image_ctx;
-
-    RWLock m_watch_lock;
-    WatchCtx m_watch_ctx;
-    uint64_t m_watch_handle;
-    WatchState m_watch_state;
-
-    LockOwnerState m_lock_owner_state;
-
-    TaskFinisher<Task> *m_task_finisher;
-
-    RWLock m_async_request_lock;
-    std::map<WatchNotify::AsyncRequestId, AsyncRequest> m_async_requests;
-    std::set<WatchNotify::AsyncRequestId> m_async_pending;
-
-    Mutex m_aio_request_lock;
-    std::vector<AioRequest> m_aio_requests;
-
-    Mutex m_owner_client_id_lock;
-    WatchNotify::ClientId m_owner_client_id;
-
-    std::string encode_lock_cookie() const;
-    static bool decode_lock_cookie(const std::string &cookie, uint64_t *handle);
-
-    int get_lock_owner_info(entity_name_t *locker, std::string *cookie,
-			    std::string *address, uint64_t *handle);
-    int lock();
-    bool release_lock();
-    bool try_request_lock();
-
-    void schedule_retry_aio_requests(bool use_timer);
-    void retry_aio_requests();
-
-    void schedule_cancel_async_requests();
-    void cancel_async_requests();
-
-    void set_owner_client_id(const WatchNotify::ClientId &client_id);
-    WatchNotify::ClientId get_client_id();
-
-    void notify_release_lock();
-    void notify_released_lock();
-    void notify_request_lock();
-    int notify_lock_owner(bufferlist &bl);
-
-    void schedule_async_request_timed_out(const WatchNotify::AsyncRequestId &id);
-    void async_request_timed_out(const WatchNotify::AsyncRequestId &id);
-    int notify_async_request(const WatchNotify::AsyncRequestId &id,
-			     bufferlist &in, ProgressContext& prog_ctx);
-    void notify_request_leadership();
-
-    void schedule_async_progress(const WatchNotify::AsyncRequestId &id,
-				 uint64_t offset, uint64_t total);
-    int notify_async_progress(const WatchNotify::AsyncRequestId &id,
-			      uint64_t offset, uint64_t total);
-    void schedule_async_complete(const WatchNotify::AsyncRequestId &id,
-				 int r);
-    int notify_async_complete(const WatchNotify::AsyncRequestId &id,
-			      int r);
-
-    int prepare_async_request(const WatchNotify::AsyncRequestId& id,
-                              bool* new_request, Context** ctx,
-                              ProgressContext** prog_ctx);
-    void cleanup_async_request(const WatchNotify::AsyncRequestId& id,
-                               Context *ctx);
-
-    void handle_payload(const WatchNotify::HeaderUpdatePayload& payload,
-		        bufferlist *out);
-    void handle_payload(const WatchNotify::AcquiredLockPayload& payload,
-		        bufferlist *out);
-    void handle_payload(const WatchNotify::ReleasedLockPayload& payload,
-		        bufferlist *out);
-    void handle_payload(const WatchNotify::RequestLockPayload& payload,
-		        bufferlist *out);
-    void handle_payload(const WatchNotify::AsyncProgressPayload& payload,
-		        bufferlist *out);
-    void handle_payload(const WatchNotify::AsyncCompletePayload& payload,
-		        bufferlist *out);
-    void handle_payload(const WatchNotify::FlattenPayload& payload,
-		        bufferlist *out);
-    void handle_payload(const WatchNotify::ResizePayload& payload,
-		        bufferlist *out);
-    void handle_payload(const WatchNotify::SnapCreatePayload& payload,
-		        bufferlist *out);
-    void handle_payload(const WatchNotify::SnapRemovePayload& payload,
-		        bufferlist *out);
-    void handle_payload(const WatchNotify::RebuildObjectMapPayload& payload,
-                        bufferlist *out);
-    void handle_payload(const WatchNotify::UnknownPayload& payload,
-		        bufferlist *out);
-
-    void handle_notify(uint64_t notify_id, uint64_t handle, bufferlist &bl);
-    void handle_error(uint64_t cookie, int err);
-    void acknowledge_notify(uint64_t notify_id, uint64_t handle,
-			    bufferlist &out);
-
-    void reregister_watch();
+    WatchCtx(ImageWatcher &parent) : image_watcher(parent) {}
+
+    virtual void handle_notify(uint64_t notify_id,
+                               uint64_t handle,
+                               uint64_t notifier_id,
+                               bufferlist& bl);
+    virtual void handle_error(uint64_t handle, int err);
+  };
+
+  class RemoteProgressContext : public ProgressContext {
+  public:
+    RemoteProgressContext(ImageWatcher &image_watcher,
+                          const WatchNotify::AsyncRequestId &id)
+      : m_image_watcher(image_watcher), m_async_request_id(id)
+    {
+    }
+
+    virtual int update_progress(uint64_t offset, uint64_t total) {
+      m_image_watcher.schedule_async_progress(m_async_request_id, offset,
+                                              total);
+      return 0;
+    }
+
+  private:
+    ImageWatcher &m_image_watcher;
+    WatchNotify::AsyncRequestId m_async_request_id;
+  };
+
+  class RemoteContext : public Context {
+  public:
+    RemoteContext(ImageWatcher &image_watcher,
+                  const WatchNotify::AsyncRequestId &id,
+                  ProgressContext *prog_ctx)
+      : m_image_watcher(image_watcher), m_async_request_id(id),
+        m_prog_ctx(prog_ctx)
+    {
+    }
+
+    virtual ~RemoteContext() {
+      delete m_prog_ctx;
+    }
+
+    virtual void finish(int r);
+
+  private:
+    ImageWatcher &m_image_watcher;
+    WatchNotify::AsyncRequestId m_async_request_id;
+    ProgressContext *m_prog_ctx;
+  };
+
+  struct HandlePayloadVisitor : public boost::static_visitor<void> {
+    ImageWatcher *image_watcher;
+    uint64_t notify_id;
+    uint64_t handle;
+
+    HandlePayloadVisitor(ImageWatcher *image_watcher_, uint64_t notify_id_,
+                         uint64_t handle_)
+      : image_watcher(image_watcher_), notify_id(notify_id_), handle(handle_)
+    {
+    }
+
+    inline void operator()(const WatchNotify::HeaderUpdatePayload &payload) const {
+      bufferlist out;
+      image_watcher->handle_payload(payload, &out);
+      image_watcher->acknowledge_notify(notify_id, handle, out);
+    }
+
+    template <typename Payload>
+    inline void operator()(const Payload &payload) const {
+      bufferlist out;
+      image_watcher->handle_payload(payload, &out);
+      image_watcher->acknowledge_notify(notify_id, handle, out);
+    }
   };
 
+  ImageCtx &m_image_ctx;
+
+  RWLock m_watch_lock;
+  WatchCtx m_watch_ctx;
+  uint64_t m_watch_handle;
+  WatchState m_watch_state;
+
+  bool m_lock_supported;
+
+  LockOwnerState m_lock_owner_state;
+
+  Mutex m_listeners_lock;
+  Cond m_listeners_cond;
+  Listeners m_listeners;
+  bool m_listeners_in_use;
+
+  TaskFinisher<Task> *m_task_finisher;
+
+  RWLock m_async_request_lock;
+  std::map<WatchNotify::AsyncRequestId, AsyncRequest> m_async_requests;
+  std::set<WatchNotify::AsyncRequestId> m_async_pending;
+
+  Mutex m_owner_client_id_lock;
+  WatchNotify::ClientId m_owner_client_id;
+
+  std::string encode_lock_cookie() const;
+  static bool decode_lock_cookie(const std::string &cookie, uint64_t *handle);
+
+  int get_lock_owner_info(entity_name_t *locker, std::string *cookie,
+                          std::string *address, uint64_t *handle);
+  int lock();
+  int unlock();
+  bool try_request_lock();
+
+  void schedule_cancel_async_requests();
+  void cancel_async_requests();
+
+  void set_owner_client_id(const WatchNotify::ClientId &client_id);
+  WatchNotify::ClientId get_client_id();
+
+  void notify_acquired_lock();
+  void notify_release_lock();
+  void notify_released_lock();
+
+  void schedule_request_lock(bool use_timer, int timer_delay = -1);
+  void notify_request_lock();
+
+  int notify_lock_owner(bufferlist &bl);
+
+  void schedule_async_request_timed_out(const WatchNotify::AsyncRequestId &id);
+  void async_request_timed_out(const WatchNotify::AsyncRequestId &id);
+  int notify_async_request(const WatchNotify::AsyncRequestId &id,
+                           bufferlist &in, ProgressContext& prog_ctx);
+  void notify_request_leadership();
+
+  void schedule_async_progress(const WatchNotify::AsyncRequestId &id,
+                               uint64_t offset, uint64_t total);
+  int notify_async_progress(const WatchNotify::AsyncRequestId &id,
+                            uint64_t offset, uint64_t total);
+  void schedule_async_complete(const WatchNotify::AsyncRequestId &id, int r);
+  int notify_async_complete(const WatchNotify::AsyncRequestId &id, int r);
+
+  int prepare_async_request(const WatchNotify::AsyncRequestId& id,
+                            bool* new_request, Context** ctx,
+                            ProgressContext** prog_ctx);
+  void cleanup_async_request(const WatchNotify::AsyncRequestId& id,
+                             Context *ctx);
+
+  void handle_payload(const WatchNotify::HeaderUpdatePayload& payload,
+                      bufferlist *out);
+  void handle_payload(const WatchNotify::AcquiredLockPayload& payload,
+                      bufferlist *out);
+  void handle_payload(const WatchNotify::ReleasedLockPayload& payload,
+                      bufferlist *out);
+  void handle_payload(const WatchNotify::RequestLockPayload& payload,
+                      bufferlist *out);
+  void handle_payload(const WatchNotify::AsyncProgressPayload& payload,
+                      bufferlist *out);
+  void handle_payload(const WatchNotify::AsyncCompletePayload& payload,
+                      bufferlist *out);
+  void handle_payload(const WatchNotify::FlattenPayload& payload,
+                      bufferlist *out);
+  void handle_payload(const WatchNotify::ResizePayload& payload,
+                      bufferlist *out);
+  void handle_payload(const WatchNotify::SnapCreatePayload& payload,
+                      bufferlist *out);
+  void handle_payload(const WatchNotify::SnapRenamePayload& payload,
+                      bufferlist *out);
+  void handle_payload(const WatchNotify::SnapRemovePayload& payload,
+                      bufferlist *out);
+  void handle_payload(const WatchNotify::RebuildObjectMapPayload& payload,
+                      bufferlist *out);
+  void handle_payload(const WatchNotify::UnknownPayload& payload,
+                      bufferlist *out);
+
+  void handle_notify(uint64_t notify_id, uint64_t handle, bufferlist &bl);
+  void handle_error(uint64_t cookie, int err);
+  void acknowledge_notify(uint64_t notify_id, uint64_t handle, bufferlist &out);
+
+  void reregister_watch();
+
+  void notify_listeners_updated_lock(LockUpdateState lock_update_state);
+};
+
 } // namespace librbd
 
 #endif // CEPH_LIBRBD_IMAGE_WATCHER_H
diff --git a/src/librbd/Journal.cc b/src/librbd/Journal.cc
new file mode 100644
index 0000000..37e311e
--- /dev/null
+++ b/src/librbd/Journal.cc
@@ -0,0 +1,635 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/Journal.h"
+#include "librbd/AioCompletion.h"
+#include "librbd/AioImageRequestWQ.h"
+#include "librbd/AioObjectRequest.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/JournalReplay.h"
+#include "librbd/JournalTypes.h"
+#include "journal/Journaler.h"
+#include "journal/ReplayEntry.h"
+#include "common/errno.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::Journal: "
+
+namespace librbd {
+
+namespace {
+
+const std::string CLIENT_DESCRIPTION = "master image";
+
+struct C_DestroyJournaler : public Context {
+  ::journal::Journaler *journaler;
+
+  C_DestroyJournaler(::journal::Journaler *_journaler) : journaler(_journaler) {
+  }
+  virtual void finish(int r) {
+    delete journaler;
+  }
+};
+
+} // anonymous namespace
+
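+// State machine: UNINITIALIZED -> INITIALIZING -> REPLAYING -> RECORDING,
+// with RECORDING -> STOPPING_RECORDING -> UNINITIALIZED on shutdown or lock
+// release; transition_state() signals m_cond so close() and the lock
+// listener can block in wait_for_state_transition().
+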
+Journal::Journal(ImageCtx &image_ctx)
+  : m_image_ctx(image_ctx), m_journaler(NULL),
+    m_lock("Journal::m_lock"), m_state(STATE_UNINITIALIZED),
+    m_lock_listener(this), m_replay_handler(this), m_close_pending(false),
+    m_event_lock("Journal::m_event_lock"), m_event_tid(0),
+    m_blocking_writes(false), m_journal_replay(NULL) {
+
+  ldout(m_image_ctx.cct, 5) << this << ": ictx=" << &m_image_ctx << dendl;
+
+  m_image_ctx.image_watcher->register_listener(&m_lock_listener);
+
+  Mutex::Locker locker(m_lock);
+  block_writes();
+}
+
+Journal::~Journal() {
+  m_image_ctx.op_work_queue->drain();
+  assert(m_journaler == NULL);
+  assert(m_journal_replay == NULL);
+
+  m_image_ctx.image_watcher->unregister_listener(&m_lock_listener);
+
+  Mutex::Locker locker(m_lock);
+  unblock_writes();
+}
+
+bool Journal::is_journal_supported(ImageCtx &image_ctx) {
+  assert(image_ctx.snap_lock.is_locked());
+  return ((image_ctx.features & RBD_FEATURE_JOURNALING) &&
+          !image_ctx.read_only && image_ctx.snap_id == CEPH_NOSNAP);
+}
+
+int Journal::create(librados::IoCtx &io_ctx, const std::string &image_id) {
+  CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+  ldout(cct, 5) << __func__ << ": image=" << image_id << dendl;
+
+  // TODO configurable commit flush interval
+  ::journal::Journaler journaler(io_ctx, image_id, "", 5);
+
+  // TODO order / splay width via config / image metadata / data pool
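+  // i.e. 2^24-byte journal objects, a splay width of 4, and journal data
+  // stored in the image's own pool (io_ctx.get_id()).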
+  int r = journaler.create(24, 4, io_ctx.get_id());
+  if (r < 0) {
+    lderr(cct) << "failed to create journal: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  r = journaler.register_client(CLIENT_DESCRIPTION);
+  if (r < 0) {
+    lderr(cct) << "failed to register client: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+  return 0;
+}
+
+int Journal::remove(librados::IoCtx &io_ctx, const std::string &image_id) {
+  CephContext *cct = reinterpret_cast<CephContext *>(io_ctx.cct());
+  ldout(cct, 5) << __func__ << ": image=" << image_id << dendl;
+
+  // TODO configurable commit flush interval
+  ::journal::Journaler journaler(io_ctx, image_id, "", 5);
+
+  C_SaferCond cond;
+  journaler.init(&cond);
+
+  int r = cond.wait();
+  if (r == -ENOENT) {
+    return 0;
+  } else if (r < 0) {
+    lderr(cct) << "failed to initialize journal: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  r = journaler.remove();
+  if (r < 0) {
+    lderr(cct) << "failed to remove journal: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+  return 0;
+}
+
+bool Journal::is_journal_ready() const {
+  Mutex::Locker locker(m_lock);
+  return (m_state == STATE_RECORDING);
+}
+
+bool Journal::is_journal_replaying() const {
+  Mutex::Locker locker(m_lock);
+  return (m_state == STATE_REPLAYING);
+}
+
+bool Journal::wait_for_journal_ready() {
+  Mutex::Locker locker(m_lock);
+  while (m_state != STATE_UNINITIALIZED && m_state != STATE_RECORDING) {
+    wait_for_state_transition();
+  }
+  return (m_state == STATE_RECORDING);
+}
+
+void Journal::open() {
+  Mutex::Locker locker(m_lock);
+  if (m_journaler != NULL) {
+    return;
+  }
+
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << dendl;
+  create_journaler();
+}
+
+int Journal::close() {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": state=" << m_state << dendl;
+
+  Mutex::Locker locker(m_lock);
+  if (m_state == STATE_UNINITIALIZED) {
+    return 0;
+  }
+
+  int r;
+  bool done = false;
+  while (!done) {
+    switch (m_state) {
+    case STATE_UNINITIALIZED:
+      done = true;
+      break;
+    case STATE_INITIALIZING:
+    case STATE_REPLAYING:
+      m_close_pending = true;
+      wait_for_state_transition();
+      break;
+    case STATE_STOPPING_RECORDING:
+      wait_for_state_transition();
+      break;
+    case STATE_RECORDING:
+      r = stop_recording();
+      if (r < 0) {
+        return r;
+      }
+      done = true;
+      break;
+    default:
+      assert(false);
+    }
+  }
+
+  destroy_journaler();
+  return 0;
+}
+
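+// Appends an AIO event to the journal and tracks it until both the journal
+// entry is safe (handle_event_safe) and the covered image extents have been
+// committed (commit_event / commit_event_extent); the returned tid ties the
+// two halves together.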
+uint64_t Journal::append_event(AioCompletion *aio_comp,
+                               const journal::EventEntry &event_entry,
+                               const AioObjectRequests &requests,
+                               uint64_t offset, size_t length,
+                               bool flush_entry) {
+  assert(m_image_ctx.owner_lock.is_locked());
+
+  bufferlist bl;
+  ::encode(event_entry, bl);
+
+  ::journal::Future future;
+  uint64_t tid;
+  {
+    Mutex::Locker locker(m_lock);
+    assert(m_state == STATE_RECORDING);
+
+    future = m_journaler->append("", bl);
+
+    Mutex::Locker event_locker(m_event_lock);
+    tid = ++m_event_tid;
+    assert(tid != 0);
+
+    m_events[tid] = Event(future, aio_comp, requests, offset, length);
+  }
+
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": "
+                 << "event=" << event_entry.get_event_type() << ", "
+                 << "new_reqs=" << requests.size() << ", "
+                 << "offset=" << offset << ", "
+                 << "length=" << length << ", "
+                 << "flush=" << flush_entry << ", tid=" << tid << dendl;
+
+  Context *on_safe = new C_EventSafe(this, tid);
+  if (flush_entry) {
+    future.flush(on_safe);
+  } else {
+    future.wait(on_safe);
+  }
+  return tid;
+}
+
+void Journal::commit_event(uint64_t tid, int r) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": tid=" << tid << ", "
+                 "r=" << r << dendl;
+
+  Mutex::Locker event_locker(m_event_lock);
+  Events::iterator it = m_events.find(tid);
+  if (it == m_events.end()) {
+    return;
+  }
+  complete_event(it, r);
+}
+
+void Journal::commit_event_extent(uint64_t tid, uint64_t offset,
+                                  uint64_t length, int r) {
+  assert(length > 0);
+
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": tid=" << tid << ", "
+                 << "offset=" << offset << ", "
+                 << "length=" << length << ", "
+                 << "r=" << r << dendl;
+
+  Mutex::Locker event_locker(m_event_lock);
+  Events::iterator it = m_events.find(tid);
+  if (it == m_events.end()) {
+    return;
+  }
+
+  Event &event = it->second;
+  if (event.ret_val == 0 && r < 0) {
+    event.ret_val = r;
+  }
+
+  ExtentInterval extent;
+  extent.insert(offset, length);
+
+  ExtentInterval intersect;
+  intersect.intersection_of(extent, event.pending_extents);
+
+  event.pending_extents.subtract(intersect);
+  if (!event.pending_extents.empty()) {
+    ldout(cct, 20) << "pending extents: " << event.pending_extents << dendl;
+    return;
+  }
+  complete_event(it, event.ret_val);
+}
+
+void Journal::flush_event(uint64_t tid, Context *on_safe) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": tid=" << tid << ", "
+                 << "on_safe=" << on_safe << dendl;
+
+  ::journal::Future future;
+  {
+    Mutex::Locker event_locker(m_event_lock);
+    future = wait_event(m_lock, tid, on_safe);
+  }
+
+  if (future.is_valid()) {
+    future.flush(NULL);
+  }
+}
+
+void Journal::wait_event(uint64_t tid, Context *on_safe) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": tid=" << tid << ", "
+                 << "on_safe=" << on_safe << dendl;
+
+  Mutex::Locker event_locker(m_event_lock);
+  wait_event(m_lock, tid, on_safe);
+}
+
+::journal::Future Journal::wait_event(Mutex &lock, uint64_t tid,
+                                      Context *on_safe) {
+  assert(m_event_lock.is_locked());
+  CephContext *cct = m_image_ctx.cct;
+
+  Events::iterator it = m_events.find(tid);
+  if (it == m_events.end() || it->second.safe) {
+    // journal entry already safe
+    ldout(cct, 20) << "journal entry already safe" << dendl;
+    m_image_ctx.op_work_queue->queue(on_safe, 0);
+    return ::journal::Future();
+  }
+
+  Event &event = it->second;
+  event.on_safe_contexts.push_back(on_safe);
+  return event.future;
+}
+
+void Journal::create_journaler() {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << dendl;
+
+  assert(m_lock.is_locked());
+  assert(m_state == STATE_UNINITIALIZED);
+
+  // TODO allow alternate pool for journal objects and commit flush interval
+  m_close_pending = false;
+  m_journaler = new ::journal::Journaler(m_image_ctx.md_ctx, m_image_ctx.id, "",
+                                         5);
+
+  m_journaler->init(new C_InitJournal(this));
+  transition_state(STATE_INITIALIZING);
+}
+
+void Journal::destroy_journaler() {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << dendl;
+
+  assert(m_lock.is_locked());
+
+  delete m_journal_replay;
+  m_journal_replay = NULL;
+
+  m_close_pending = false;
+  m_image_ctx.op_work_queue->queue(new C_DestroyJournaler(m_journaler), 0);
+  m_journaler = NULL;
+
+  transition_state(STATE_UNINITIALIZED);
+}
+
+void Journal::complete_event(Events::iterator it, int r) {
+  assert(m_event_lock.is_locked());
+  assert(m_state == STATE_RECORDING);
+
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": tid=" << it->first << " "
+                 << "r=" << r << dendl;
+
+  m_journaler->committed(it->second.future);
+  if (it->second.safe) {
+    m_events.erase(it);
+  }
+}
+
+void Journal::handle_initialized(int r) {
+  CephContext *cct = m_image_ctx.cct;
+  if (r < 0) {
+    lderr(cct) << this << " " << __func__ << ": r=" << r << dendl;
+    Mutex::Locker locker(m_lock);
+
+    // TODO: failed to open journal -- retry?
+    destroy_journaler();
+    create_journaler();
+    return;
+  }
+
+  ldout(cct, 20) << this << " " << __func__ << dendl;
+  Mutex::Locker locker(m_lock);
+  if (m_close_pending) {
+    destroy_journaler();
+    return;
+  }
+
+  m_journal_replay = new JournalReplay(m_image_ctx);
+
+  transition_state(STATE_REPLAYING);
+  m_journaler->start_replay(&m_replay_handler);
+}
+
+void Journal::handle_replay_ready() {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << dendl;
+
+  Mutex::Locker locker(m_lock);
+  if (m_state != STATE_REPLAYING) {
+    return;
+  }
+
+  while (true) {
+    if (m_close_pending) {
+      m_journaler->stop_replay();
+      destroy_journaler();
+      return;
+    }
+
+    ::journal::ReplayEntry replay_entry;
+    if (!m_journaler->try_pop_front(&replay_entry)) {
+      return;
+    }
+
+    m_lock.Unlock();
+    bufferlist data = replay_entry.get_data();
+    bufferlist::iterator it = data.begin();
+    int r = m_journal_replay->process(it);
+    m_lock.Lock();
+
+    if (r < 0) {
+      // TODO
+    }
+  }
+}
+
+void Journal::handle_replay_complete(int r) {
+  CephContext *cct = m_image_ctx.cct;
+
+  {
+    Mutex::Locker locker(m_lock);
+    if (m_state != STATE_REPLAYING) {
+      return;
+    }
+
+    if (r == 0) {
+      r = m_journal_replay->flush();
+    }
+    delete m_journal_replay;
+    m_journal_replay = NULL;
+
+    if (r < 0) {
+      lderr(cct) << this << " " << __func__ << ": r=" << r << dendl;
+
+      // TODO: failed to replay journal -- retry?
+      destroy_journaler();
+      create_journaler();
+      return;
+    }
+
+    ldout(cct, 20) << this << " " << __func__ << dendl;
+    m_journaler->stop_replay();
+
+    if (m_close_pending) {
+      destroy_journaler();
+      return;
+    }
+
+    // TODO configurable flush interval, flush bytes, and flush age
+    m_journaler->start_append(0, 0, 0);
+    transition_state(STATE_RECORDING);
+
+    unblock_writes();
+  }
+
+  // kick peers to let them know they can re-request the lock now
+  m_image_ctx.image_watcher->notify_lock_state();
+}
+
+void Journal::handle_event_safe(int r, uint64_t tid) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": r=" << r << ", "
+                 << "tid=" << tid << dendl;
+
+  // TODO: ensure this callback never sees a failure
+  AioCompletion *aio_comp;
+  AioObjectRequests aio_object_requests;
+  Contexts on_safe_contexts;
+  {
+    Mutex::Locker event_locker(m_event_lock);
+    Events::iterator it = m_events.find(tid);
+    assert(it != m_events.end());
+
+    Event &event = it->second;
+    aio_comp = event.aio_comp;
+    aio_object_requests.swap(event.aio_object_requests);
+    on_safe_contexts.swap(event.on_safe_contexts);
+
+    if (event.pending_extents.empty()) {
+      m_events.erase(it);
+    } else {
+      event.safe = true;
+    }
+  }
+
+  ldout(cct, 20) << "completing tid=" << tid << dendl;
+
+  if (r < 0) {
+    // don't send aio requests if the journal fails -- bubble error up
+    aio_comp->fail(cct, r);
+  } else {
+    // send any waiting aio requests now that journal entry is safe
+    RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
+    assert(m_image_ctx.image_watcher->is_lock_owner());
+
+    for (AioObjectRequests::iterator it = aio_object_requests.begin();
+         it != aio_object_requests.end(); ++it) {
+      (*it)->send();
+    }
+  }
+
+  // alert the cache about the journal event status
+  for (Contexts::iterator it = on_safe_contexts.begin();
+       it != on_safe_contexts.end(); ++it) {
+    (*it)->complete(r);
+  }
+}
+
+bool Journal::handle_requested_lock() {
+  Mutex::Locker locker(m_lock);
+
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": " << "state=" << m_state
+                 << dendl;
+
+  // prevent peers from taking our lock while we are replaying since that
+  // will stall forward progress
+  return (m_state != STATE_INITIALIZING && m_state != STATE_REPLAYING);
+}
+
+void Journal::handle_lock_updated(ImageWatcher::LockUpdateState state) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": "
+                 << "state=" << state << dendl;
+
+  Mutex::Locker locker(m_lock);
+  if (state == ImageWatcher::LOCK_UPDATE_STATE_LOCKED &&
+      m_state == STATE_UNINITIALIZED) {
+    create_journaler();
+  } else if (state == ImageWatcher::LOCK_UPDATE_STATE_RELEASING) {
+    if (m_state == STATE_INITIALIZING || m_state == STATE_REPLAYING) {
+      // wait for replay to successfully interrupt
+      m_close_pending = true;
+      wait_for_state_transition();
+    }
+
+    if (m_state == STATE_UNINITIALIZED || m_state == STATE_RECORDING) {
+      // prevent new write ops but allow pending ops to flush to the journal
+      block_writes();
+    }
+    if (m_state == STATE_RECORDING) {
+      flush_journal();
+    }
+  } else if ((state == ImageWatcher::LOCK_UPDATE_STATE_NOT_SUPPORTED ||
+              state == ImageWatcher::LOCK_UPDATE_STATE_UNLOCKED) &&
+             m_state != STATE_UNINITIALIZED &&
+             m_state != STATE_STOPPING_RECORDING) {
+    assert(m_state == STATE_RECORDING);
+    {
+      Mutex::Locker event_locker(m_event_lock);
+      assert(m_events.empty());
+    }
+
+    int r = stop_recording();
+    if (r < 0) {
+      // TODO handle failed journal writes
+      assert(false);
+    }
+  }
+}
+
+int Journal::stop_recording() {
+  assert(m_lock.is_locked());
+  assert(m_journaler != NULL);
+
+  transition_state(STATE_STOPPING_RECORDING);
+
+  C_SaferCond cond;
+  m_lock.Unlock();
+  m_journaler->stop_append(&cond);
+  int r = cond.wait();
+  m_lock.Lock();
+
+  destroy_journaler();
+  if (r < 0) {
+    lderr(m_image_ctx.cct) << "failed to flush journal: " << cpp_strerror(r)
+                           << dendl;
+    return r;
+  }
+  return 0;
+}
+
+void Journal::block_writes() {
+  assert(m_lock.is_locked());
+  if (!m_blocking_writes) {
+    m_blocking_writes = true;
+    m_image_ctx.aio_work_queue->block_writes();
+  }
+}
+
+void Journal::unblock_writes() {
+  assert(m_lock.is_locked());
+  if (m_blocking_writes) {
+    m_blocking_writes = false;
+    m_image_ctx.aio_work_queue->unblock_writes();
+  }
+}
+
+void Journal::flush_journal() {
+  assert(m_lock.is_locked());
+
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << dendl;
+
+  m_lock.Unlock();
+  C_SaferCond cond_ctx;
+  m_journaler->flush(&cond_ctx);
+  cond_ctx.wait();
+  m_lock.Lock();
+}
+
+void Journal::transition_state(State state) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": new state=" << state << dendl;
+  assert(m_lock.is_locked());
+  m_state = state;
+  m_cond.Signal();
+}
+
+void Journal::wait_for_state_transition() {
+  assert(m_lock.is_locked());
+  State state = m_state;
+  while (m_state == state) {
+    m_cond.Wait(m_lock);
+  }
+}
+
+} // namespace librbd
diff --git a/src/librbd/Journal.h b/src/librbd/Journal.h
new file mode 100644
index 0000000..f6368b2
--- /dev/null
+++ b/src/librbd/Journal.h
@@ -0,0 +1,209 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_JOURNAL_H
+#define CEPH_LIBRBD_JOURNAL_H
+
+#include "include/int_types.h"
+#include "include/Context.h"
+#include "include/interval_set.h"
+#include "include/unordered_map.h"
+#include "include/rados/librados.hpp"
+#include "common/Mutex.h"
+#include "common/Cond.h"
+#include "journal/Future.h"
+#include "journal/ReplayHandler.h"
+#include "librbd/ImageWatcher.h"
+#include <algorithm>
+#include <list>
+#include <string>
+
+class Context;
+namespace journal {
+class Journaler;
+}
+
+namespace librbd {
+
+class AioCompletion;
+class AioObjectRequest;
+class ImageCtx;
+class JournalReplay;
+
+namespace journal {
+class EventEntry;
+}
+
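+// Journal ties image journaling to exclusive-lock ownership: writes stay
+// blocked until the lock is held and any pending journal entries have been
+// replayed, after which new AIO events are appended and committed as their
+// object requests complete.
+//
+// Minimal usage sketch (illustrative only; assumes an open ImageCtx "ictx"
+// whose image has the journaling feature enabled):
+//
+//   librbd::Journal journal(*ictx);         // registers the lock listener
+//   journal.open();                         // initialize and replay
+//   if (journal.wait_for_journal_ready()) {
+//     // record events via append_event() / commit_event() ...
+//   }
+//   journal.close();                        // required before destruction
+//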
+class Journal {
+public:
+  typedef std::list<AioObjectRequest *> AioObjectRequests;
+
+  Journal(ImageCtx &image_ctx);
+  ~Journal();
+
+  static bool is_journal_supported(ImageCtx &image_ctx);
+  static int create(librados::IoCtx &io_ctx, const std::string &image_id);
+  static int remove(librados::IoCtx &io_ctx, const std::string &image_id);
+
+  bool is_journal_ready() const;
+  bool is_journal_replaying() const;
+
+  bool wait_for_journal_ready();
+
+  void open();
+  int close();
+
+  uint64_t append_event(AioCompletion *aio_comp,
+                        const journal::EventEntry &event_entry,
+                        const AioObjectRequests &requests,
+                        uint64_t offset, size_t length,
+                        bool flush_entry);
+
+  void commit_event(uint64_t tid, int r);
+  void commit_event_extent(uint64_t tid, uint64_t offset, uint64_t length,
+                           int r);
+
+  void flush_event(uint64_t tid, Context *on_safe);
+  void wait_event(uint64_t tid, Context *on_safe);
+
+private:
+  typedef std::list<Context *> Contexts;
+  typedef interval_set<uint64_t> ExtentInterval;
+
+  enum State {
+    STATE_UNINITIALIZED,
+    STATE_INITIALIZING,
+    STATE_REPLAYING,
+    STATE_RECORDING,
+    STATE_STOPPING_RECORDING
+  };
+
+  struct Event {
+    ::journal::Future future;
+    AioCompletion *aio_comp;
+    AioObjectRequests aio_object_requests;
+    Contexts on_safe_contexts;
+    ExtentInterval pending_extents;
+    bool safe;
+    int ret_val;
+
+    Event() : aio_comp(NULL), safe(false), ret_val(0) {
+    }
+    Event(const ::journal::Future &_future, AioCompletion *_aio_comp,
+          const AioObjectRequests &_requests, uint64_t offset, size_t length)
+      : future(_future), aio_comp(_aio_comp), aio_object_requests(_requests),
+        safe(false), ret_val(0) {
+      if (length > 0) {
+        pending_extents.insert(offset, length);
+      }
+    }
+  };
+  typedef ceph::unordered_map<uint64_t, Event> Events;
+
+  struct LockListener : public ImageWatcher::Listener {
+    Journal *journal;
+    LockListener(Journal *_journal) : journal(_journal) {
+    }
+
+    virtual bool handle_requested_lock() {
+      return journal->handle_requested_lock();
+    }
+    virtual void handle_lock_updated(ImageWatcher::LockUpdateState state) {
+      journal->handle_lock_updated(state);
+    }
+  };
+
+  struct C_InitJournal : public Context {
+    Journal *journal;
+
+    C_InitJournal(Journal *_journal) : journal(_journal) {
+    }
+
+    virtual void finish(int r) {
+      journal->handle_initialized(r);
+    }
+  };
+
+  struct C_EventSafe : public Context {
+    Journal *journal;
+    uint64_t tid;
+
+    C_EventSafe(Journal *_journal, uint64_t _tid)
+      : journal(_journal), tid(_tid) {
+    }
+
+    virtual void finish(int r) {
+      journal->handle_event_safe(r, tid);
+    }
+  };
+
+  struct ReplayHandler : public ::journal::ReplayHandler {
+    Journal *journal;
+    ReplayHandler(Journal *_journal) : journal(_journal) {
+    }
+
+    virtual void get() {
+      // TODO
+    }
+    virtual void put() {
+      // TODO
+    }
+
+    virtual void handle_entries_available() {
+      journal->handle_replay_ready();
+    }
+    virtual void handle_complete(int r) {
+      journal->handle_replay_complete(r);
+    }
+  };
+
+  ImageCtx &m_image_ctx;
+
+  ::journal::Journaler *m_journaler;
+  mutable Mutex m_lock;
+  Cond m_cond;
+  State m_state;
+
+  LockListener m_lock_listener;
+
+  ReplayHandler m_replay_handler;
+  bool m_close_pending;
+
+  Mutex m_event_lock;
+  uint64_t m_event_tid;
+  Events m_events;
+
+  bool m_blocking_writes;
+
+  JournalReplay *m_journal_replay;
+
+  ::journal::Future wait_event(Mutex &lock, uint64_t tid, Context *on_safe);
+
+  void create_journaler();
+  void destroy_journaler();
+
+  void complete_event(Events::iterator it, int r);
+
+  void handle_initialized(int r);
+
+  void handle_replay_ready();
+  void handle_replay_complete(int r);
+
+  void handle_event_safe(int r, uint64_t tid);
+
+  bool handle_requested_lock();
+  void handle_lock_updated(ImageWatcher::LockUpdateState state);
+
+  int stop_recording();
+
+  void block_writes();
+  void unblock_writes();
+
+  void flush_journal();
+  void transition_state(State state);
+  void wait_for_state_transition();
+};
+
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_JOURNAL_H
diff --git a/src/librbd/JournalReplay.cc b/src/librbd/JournalReplay.cc
new file mode 100644
index 0000000..7daf10c
--- /dev/null
+++ b/src/librbd/JournalReplay.cc
@@ -0,0 +1,120 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/JournalReplay.h"
+#include "librbd/AioCompletion.h"
+#include "librbd/AioImageRequest.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/internal.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbd::JournalReplay: "
+
+namespace librbd {
+
+JournalReplay::JournalReplay(ImageCtx &image_ctx)
+  : m_image_ctx(image_ctx), m_lock("JournalReplay::m_lock"), m_ret_val(0) {
+}
+
+JournalReplay::~JournalReplay() {
+  assert(m_aio_completions.empty());
+}
+
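+// Decode a single replay entry and re-issue it against the image as
+// asynchronous I/O; completions are tracked so flush() can wait for all
+// in-flight replayed requests and report the first error encountered.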
+int JournalReplay::process(bufferlist::iterator it) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << dendl;
+
+  RWLock::RLocker owner_lock(m_image_ctx.owner_lock);
+  journal::EventEntry event_entry;
+  try {
+    ::decode(event_entry, it);
+  } catch (const buffer::error &err) {
+    lderr(cct) << "failed to decode event entry: " << err.what() << dendl;
+    return -EINVAL;
+  }
+
+  boost::apply_visitor(EventVisitor(this), event_entry.event);
+  return 0;
+}
+
+int JournalReplay::flush() {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << dendl;
+
+  Mutex::Locker locker(m_lock);
+  while (!m_aio_completions.empty()) {
+    m_cond.Wait(m_lock);
+  }
+  return m_ret_val;
+}
+
+void JournalReplay::handle_event(const journal::AioDiscardEvent &event) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": AIO discard event" << dendl;
+
+  AioCompletion *aio_comp = create_aio_completion();
+  AioImageRequest::aio_discard(&m_image_ctx, aio_comp, event.offset,
+                               event.length);
+}
+
+void JournalReplay::handle_event(const journal::AioWriteEvent &event) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": AIO write event" << dendl;
+
+  bufferlist data = event.data;
+  AioCompletion *aio_comp = create_aio_completion();
+  AioImageRequest::aio_write(&m_image_ctx, aio_comp, event.offset, event.length,
+                             data.c_str(), 0);
+}
+
+void JournalReplay::handle_event(const journal::AioFlushEvent &event) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": AIO flush event" << dendl;
+
+  AioCompletion *aio_comp = create_aio_completion();
+  AioImageRequest::aio_flush(&m_image_ctx, aio_comp);
+}
+
+void JournalReplay::handle_event(const journal::UnknownEvent &event) {
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": unknown event" << dendl;
+}
+
+AioCompletion *JournalReplay::create_aio_completion() {
+  Mutex::Locker locker(m_lock);
+  AioCompletion *aio_comp = aio_create_completion_internal(
+    this, &aio_completion_callback);
+  m_aio_completions.insert(aio_comp);
+  return aio_comp;
+}
+
+void JournalReplay::handle_aio_completion(AioCompletion *aio_comp) {
+  Mutex::Locker locker(m_lock);
+
+  AioCompletions::iterator it = m_aio_completions.find(aio_comp);
+  assert(it != m_aio_completions.end());
+
+  int r = aio_comp->get_return_value();
+
+  CephContext *cct = m_image_ctx.cct;
+  ldout(cct, 20) << this << " " << __func__ << ": aio_comp=" << aio_comp << ", "
+                 << "r=" << r << dendl;
+
+  if (r < 0 && m_ret_val == 0) {
+    m_ret_val = r;
+  }
+
+  m_aio_completions.erase(it);
+  m_cond.Signal();
+}
+
+void JournalReplay::aio_completion_callback(completion_t cb, void *arg) {
+  JournalReplay *journal_replay = reinterpret_cast<JournalReplay *>(arg);
+  AioCompletion *aio_comp = reinterpret_cast<AioCompletion *>(cb);
+
+  journal_replay->handle_aio_completion(aio_comp);
+  aio_comp->release();
+}
+
+} // namespace librbd
diff --git a/src/librbd/JournalReplay.h b/src/librbd/JournalReplay.h
new file mode 100644
index 0000000..7b85713
--- /dev/null
+++ b/src/librbd/JournalReplay.h
@@ -0,0 +1,66 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_JOURNAL_REPLAY_H
+#define CEPH_LIBRBD_JOURNAL_REPLAY_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "include/rbd/librbd.hpp"
+#include "common/Cond.h"
+#include "common/Mutex.h"
+#include "librbd/JournalTypes.h"
+#include <boost/variant.hpp>
+#include <set>
+
+namespace librbd {
+
+class AioCompletion;
+class ImageCtx;
+
+class JournalReplay {
+public:
+  JournalReplay(ImageCtx &image_ctx);
+  ~JournalReplay();
+
+  int process(bufferlist::iterator it);
+  int flush();
+
+private:
+  typedef std::set<AioCompletion *> AioCompletions;
+
+  struct EventVisitor : public boost::static_visitor<void> {
+    JournalReplay *journal_replay;
+
+    EventVisitor(JournalReplay *_journal_replay)
+      : journal_replay(_journal_replay) {
+    }
+
+    template <typename Event>
+    inline void operator()(const Event &event) const {
+      journal_replay->handle_event(event);
+    }
+  };
+
+  ImageCtx &m_image_ctx;
+
+  Mutex m_lock;
+  Cond m_cond;
+
+  AioCompletions m_aio_completions;
+  int m_ret_val;
+
+  void handle_event(const journal::AioDiscardEvent &event);
+  void handle_event(const journal::AioWriteEvent &event);
+  void handle_event(const journal::AioFlushEvent &event);
+  void handle_event(const journal::UnknownEvent &event);
+
+  AioCompletion *create_aio_completion();
+  void handle_aio_completion(AioCompletion *aio_comp);
+
+  static void aio_completion_callback(completion_t cb, void *arg);
+};
+
+} // namespace librbd
+
+#endif // CEPH_LIBRBD_JOURNAL_REPLAY_H
diff --git a/src/librbd/JournalTypes.cc b/src/librbd/JournalTypes.cc
new file mode 100644
index 0000000..4dcd2f5
--- /dev/null
+++ b/src/librbd/JournalTypes.cc
@@ -0,0 +1,192 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "librbd/JournalTypes.h"
+#include "include/assert.h"
+#include "include/stringify.h"
+#include "common/Formatter.h"
+
+namespace librbd {
+namespace journal {
+
+namespace {
+
+class GetEventTypeVisitor : public boost::static_visitor<EventType> {
+public:
+  template <typename Event>
+  inline EventType operator()(const Event &event) const {
+    return Event::EVENT_TYPE;
+  }
+};
+
+class EncodeEventVisitor : public boost::static_visitor<void> {
+public:
+  EncodeEventVisitor(bufferlist &bl) : m_bl(bl) {
+  }
+
+  template <typename Event>
+  inline void operator()(const Event &event) const {
+    ::encode(static_cast<uint32_t>(Event::EVENT_TYPE), m_bl);
+    event.encode(m_bl);
+  }
+private:
+  bufferlist &m_bl;
+};
+
+class DecodeEventVisitor : public boost::static_visitor<void> {
+public:
+  DecodeEventVisitor(__u8 version, bufferlist::iterator &iter)
+    : m_version(version), m_iter(iter) {
+  }
+
+  template <typename Event>
+  inline void operator()(Event &event) const {
+    event.decode(m_version, m_iter);
+  }
+private:
+  __u8 m_version;
+  bufferlist::iterator &m_iter;
+};
+
+class DumpEventVisitor : public boost::static_visitor<void> {
+public:
+  DumpEventVisitor(Formatter *formatter) : m_formatter(formatter) {}
+
+  template <typename Event>
+  inline void operator()(const Event &event) const {
+    EventType event_type = Event::EVENT_TYPE;
+    m_formatter->dump_string("event_type", stringify(event_type));
+    event.dump(m_formatter);
+  }
+private:
+  ceph::Formatter *m_formatter;
+};
+
+} // anonymous namespace
+
+void AioDiscardEvent::encode(bufferlist& bl) const {
+  ::encode(offset, bl);
+  ::encode(length, bl);
+}
+
+void AioDiscardEvent::decode(__u8 version, bufferlist::iterator& it) {
+  ::decode(offset, it);
+  ::decode(length, it);
+}
+
+void AioDiscardEvent::dump(Formatter *f) const {
+  f->dump_unsigned("offset", offset);
+  f->dump_unsigned("length", length);
+}
+
+void AioWriteEvent::encode(bufferlist& bl) const {
+  ::encode(offset, bl);
+  ::encode(length, bl);
+  ::encode(data, bl);
+}
+
+void AioWriteEvent::decode(__u8 version, bufferlist::iterator& it) {
+  ::decode(offset, it);
+  ::decode(length, it);
+  ::decode(data, it);
+}
+
+void AioWriteEvent::dump(Formatter *f) const {
+  f->dump_unsigned("offset", offset);
+  f->dump_unsigned("length", length);
+}
+
+void AioFlushEvent::encode(bufferlist& bl) const {
+}
+
+void AioFlushEvent::decode(__u8 version, bufferlist::iterator& it) {
+}
+
+void AioFlushEvent::dump(Formatter *f) const {
+}
+
+void UnknownEvent::encode(bufferlist& bl) const {
+  assert(false);
+}
+
+void UnknownEvent::decode(__u8 version, bufferlist::iterator& it) {
+}
+
+void UnknownEvent::dump(Formatter *f) const {
+}
+
+EventType EventEntry::get_event_type() const {
+  return boost::apply_visitor(GetEventTypeVisitor(), event);
+}
+
+void EventEntry::encode(bufferlist& bl) const {
+  ENCODE_START(1, 1, bl);
+  boost::apply_visitor(EncodeEventVisitor(bl), event);
+  ENCODE_FINISH(bl);
+}
+
+void EventEntry::decode(bufferlist::iterator& it) {
+  DECODE_START(1, it);
+
+  uint32_t event_type;
+  ::decode(event_type, it);
+
+  // select the correct payload variant based upon the encoded op
+  switch (event_type) {
+  case EVENT_TYPE_AIO_DISCARD:
+    event = AioDiscardEvent();
+    break;
+  case EVENT_TYPE_AIO_WRITE:
+    event = AioWriteEvent();
+    break;
+  case EVENT_TYPE_AIO_FLUSH:
+    event = AioFlushEvent();
+    break;
+  default:
+    event = UnknownEvent();
+    break;
+  }
+
+  boost::apply_visitor(DecodeEventVisitor(struct_v, it), event);
+  DECODE_FINISH(it);
+}
+
+void EventEntry::dump(Formatter *f) const {
+  boost::apply_visitor(DumpEventVisitor(f), event);
+}
+
+void EventEntry::generate_test_instances(std::list<EventEntry *> &o) {
+  o.push_back(new EventEntry(AioDiscardEvent()));
+  o.push_back(new EventEntry(AioDiscardEvent(123, 345)));
+
+  bufferlist bl;
+  bl.append(std::string(32, '1'));
+  o.push_back(new EventEntry(AioWriteEvent()));
+  o.push_back(new EventEntry(AioWriteEvent(123, 456, bl)));
+
+  o.push_back(new EventEntry(AioFlushEvent()));
+}
+
+} // namespace journal
+} // namespace librbd
+
+std::ostream &operator<<(std::ostream &out,
+                         const librbd::journal::EventType &type) {
+  using namespace librbd::journal;
+
+  switch (type) {
+  case EVENT_TYPE_AIO_DISCARD:
+    out << "AioDiscard";
+    break;
+  case EVENT_TYPE_AIO_WRITE:
+    out << "AioWrite";
+    break;
+  case EVENT_TYPE_AIO_FLUSH:
+    out << "AioFlush";
+    break;
+  default:
+    out << "Unknown (" << static_cast<uint32_t>(type) << ")";
+    break;
+  }
+  return out;
+}
diff --git a/src/librbd/JournalTypes.h b/src/librbd/JournalTypes.h
new file mode 100644
index 0000000..59bd13f
--- /dev/null
+++ b/src/librbd/JournalTypes.h
@@ -0,0 +1,107 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_LIBRBD_JOURNAL_TYPES_H
+#define CEPH_LIBRBD_JOURNAL_TYPES_H
+
+#include "include/int_types.h"
+#include "include/buffer.h"
+#include "include/encoding.h"
+#include <iosfwd>
+#include <boost/variant.hpp>
+
+namespace ceph {
+class Formatter;
+}
+
+namespace librbd {
+namespace journal {
+
+enum EventType {
+  EVENT_TYPE_AIO_DISCARD = 0,
+  EVENT_TYPE_AIO_WRITE   = 1,
+  EVENT_TYPE_AIO_FLUSH   = 2
+};
+
+struct AioDiscardEvent {
+  static const EventType EVENT_TYPE = EVENT_TYPE_AIO_DISCARD;
+
+  uint64_t offset;
+  size_t length;
+
+  AioDiscardEvent() : offset(0), length(0) {
+  }
+  AioDiscardEvent(uint64_t _offset, size_t _length)
+    : offset(_offset), length(_length) {
+  }
+
+  void encode(bufferlist& bl) const;
+  void decode(__u8 version, bufferlist::iterator& it);
+  void dump(Formatter *f) const;
+};
+
+struct AioWriteEvent {
+  static const EventType EVENT_TYPE = EVENT_TYPE_AIO_WRITE;
+
+  uint64_t offset;
+  size_t length;
+  bufferlist data;
+
+  AioWriteEvent() : offset(0), length(0) {
+  }
+  AioWriteEvent(uint64_t _offset, size_t _length, const bufferlist &_data)
+    : offset(_offset), length(_length), data(_data) {
+  }
+
+  void encode(bufferlist& bl) const;
+  void decode(__u8 version, bufferlist::iterator& it);
+  void dump(Formatter *f) const;
+};
+
+struct UnknownEvent {
+  static const EventType EVENT_TYPE = static_cast<EventType>(-1);
+
+  void encode(bufferlist& bl) const;
+  void decode(__u8 version, bufferlist::iterator& it);
+  void dump(Formatter *f) const;
+};
+
+struct AioFlushEvent {
+  static const EventType EVENT_TYPE = EVENT_TYPE_AIO_FLUSH;
+
+  void encode(bufferlist& bl) const;
+  void decode(__u8 version, bufferlist::iterator& it);
+  void dump(Formatter *f) const;
+};
+
+typedef boost::variant<AioDiscardEvent,
+                       AioWriteEvent,
+                       AioFlushEvent,
+                       UnknownEvent> Event;
+
+struct EventEntry {
+  EventEntry() : event(UnknownEvent()) {
+  }
+  EventEntry(const Event &_event) : event(_event) {
+  }
+
+  Event event;
+
+  EventType get_event_type() const;
+
+  void encode(bufferlist& bl) const;
+  void decode(bufferlist::iterator& it);
+  void dump(Formatter *f) const;
+
+  static void generate_test_instances(std::list<EventEntry *> &o);
+};
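+
+// Encode/decode round trip (illustrative only): EventEntry prepends the
+// event type to the versioned payload so decode() can select the matching
+// variant alternative:
+//
+//   bufferlist bl;
+//   ::encode(EventEntry(AioDiscardEvent(0, 4096)), bl);
+//   EventEntry entry;
+//   bufferlist::iterator it = bl.begin();
+//   ::decode(entry, it);
+//   assert(entry.get_event_type() == EVENT_TYPE_AIO_DISCARD);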
+
+} // namespace journal
+} // namespace librbd
+
+std::ostream &operator<<(std::ostream &out,
+                         const librbd::journal::EventType &type);
+
+WRITE_CLASS_ENCODER(librbd::journal::EventEntry);
+
+#endif // CEPH_LIBRBD_JOURNAL_TYPES_H
diff --git a/src/librbd/LibrbdAdminSocketHook.cc b/src/librbd/LibrbdAdminSocketHook.cc
new file mode 100644
index 0000000..ca7d64a
--- /dev/null
+++ b/src/librbd/LibrbdAdminSocketHook.cc
@@ -0,0 +1,97 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "common/errno.h"
+
+#include "librbd/ImageCtx.h"
+#include "librbd/LibrbdAdminSocketHook.h"
+#include "librbd/internal.h"
+
+#define dout_subsys ceph_subsys_rbd
+#undef dout_prefix
+#define dout_prefix *_dout << "librbdadminsocket: "
+
+namespace librbd {
+
+class LibrbdAdminSocketCommand {
+public:
+  virtual ~LibrbdAdminSocketCommand() {}
+  virtual bool call(stringstream *ss) = 0;
+};
+
+class FlushCacheCommand : public LibrbdAdminSocketCommand {
+public:
+  FlushCacheCommand(ImageCtx *ictx) : ictx(ictx) {}
+
+  bool call(stringstream *ss) {
+    int r = flush(ictx);
+    if (r < 0) {
+      *ss << "flush: " << cpp_strerror(r);
+      return false;
+    }
+    return true;
+  }
+
+private:
+  ImageCtx *ictx;
+};
+
+struct InvalidateCacheCommand : public LibrbdAdminSocketCommand {
+public:
+  InvalidateCacheCommand(ImageCtx *ictx) : ictx(ictx) {}
+
+  bool call(stringstream *ss) {
+    int r = invalidate_cache(ictx);
+    if (r < 0) {
+      *ss << "invalidate_cache: " << cpp_strerror(r);
+      return false;
+    }
+    return true;
+  }
+
+private:
+  ImageCtx *ictx;
+};
+
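+// Registers per-image "rbd cache flush <name>" and "rbd cache invalidate
+// <name>" commands; they can be driven externally, e.g. via
+// "ceph --admin-daemon <client asok path> rbd cache flush <name>".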
+LibrbdAdminSocketHook::LibrbdAdminSocketHook(ImageCtx *ictx) :
+  admin_socket(ictx->cct->get_admin_socket()) {
+
+  std::string command;
+  int r;
+
+  command = "rbd cache flush " + ictx->name;
+  r = admin_socket->register_command(command, command, this,
+				     "flush rbd image " + ictx->name +
+				     " cache");
+  if (r == 0) {
+    commands[command] = new FlushCacheCommand(ictx);
+  }
+
+  command = "rbd cache invalidate " + ictx->name;
+  r = admin_socket->register_command(command, command, this,
+				     "invalidate rbd image " + ictx->name +
+				     " cache");
+  if (r == 0) {
+    commands[command] = new InvalidateCacheCommand(ictx);
+  }
+}
+
+LibrbdAdminSocketHook::~LibrbdAdminSocketHook() {
+  for (Commands::const_iterator i = commands.begin(); i != commands.end();
+       i++) {
+    (void)admin_socket->unregister_command(i->first);
+    delete i->second;
+  }
+}
+
+bool LibrbdAdminSocketHook::call(std::string command, cmdmap_t& cmdmap,
+				 std::string format, bufferlist& out) {
+  Commands::const_iterator i = commands.find(command);
+  assert(i != commands.end());
+  stringstream ss;
+  bool r = i->second->call(&ss);
+  out.append(ss);
+  return r;
+}
+
+} // namespace librbd
diff --git a/src/librbd/LibrbdAdminSocketHook.h b/src/librbd/LibrbdAdminSocketHook.h
new file mode 100644
index 0000000..ecd8988
--- /dev/null
+++ b/src/librbd/LibrbdAdminSocketHook.h
@@ -0,0 +1,31 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+#ifndef CEPH_LIBRBD_LIBRBDADMINSOCKETHOOK_H
+#define CEPH_LIBRBD_LIBRBDADMINSOCKETHOOK_H
+
+#include <map>
+
+#include "common/admin_socket.h"
+
+namespace librbd {
+
+  struct ImageCtx;
+  class LibrbdAdminSocketCommand;
+
+  class LibrbdAdminSocketHook : public AdminSocketHook {
+  public:
+    LibrbdAdminSocketHook(ImageCtx *ictx);
+    ~LibrbdAdminSocketHook();
+
+    bool call(std::string command, cmdmap_t& cmdmap, std::string format,
+	      bufferlist& out);
+
+  private:
+    typedef std::map<std::string,LibrbdAdminSocketCommand*> Commands;
+
+    AdminSocket *admin_socket;
+    Commands commands;
+  };
+}
+
+#endif
diff --git a/src/librbd/LibrbdWriteback.cc b/src/librbd/LibrbdWriteback.cc
index ac778ee..1e68214 100644
--- a/src/librbd/LibrbdWriteback.cc
+++ b/src/librbd/LibrbdWriteback.cc
@@ -11,12 +11,13 @@
 #include "include/rados/librados.hpp"
 #include "include/rbd/librbd.hpp"
 
-#include "librbd/AioRequest.h"
+#include "librbd/AioObjectRequest.h"
 #include "librbd/ImageCtx.h"
 #include "librbd/internal.h"
 #include "librbd/LibrbdWriteback.h"
 #include "librbd/AioCompletion.h"
 #include "librbd/ObjectMap.h"
+#include "librbd/Journal.h"
 
 #include "include/assert.h"
 
@@ -91,6 +92,79 @@ namespace librbd {
     LibrbdWriteback *m_wb_handler;
   };
 
+  struct C_WriteJournalCommit : public Context {
+    typedef std::vector<std::pair<uint64_t,uint64_t> > Extents;
+
+    ImageCtx *image_ctx;
+    std::string oid;
+    uint64_t object_no;
+    uint64_t off;
+    bufferlist bl;
+    SnapContext snapc;
+    Context *req_comp;
+    uint64_t journal_tid;
+    bool request_sent;
+
+    C_WriteJournalCommit(ImageCtx *_image_ctx, const std::string &_oid,
+                         uint64_t _object_no, uint64_t _off,
+                         const bufferlist &_bl, const SnapContext& _snapc,
+                         Context *_req_comp, uint64_t _journal_tid)
+      : image_ctx(_image_ctx), oid(_oid), object_no(_object_no), off(_off),
+        bl(_bl), snapc(_snapc), req_comp(_req_comp), journal_tid(_journal_tid),
+        request_sent(false) {
+      CephContext *cct = image_ctx->cct;
+      ldout(cct, 20) << this << " C_WriteJournalCommit: "
+                     << "delaying write until journal tid "
+                     << journal_tid << " safe" << dendl;
+    }
+
+    virtual void complete(int r) {
+      if (request_sent || r < 0) {
+        commit_event_extent(r);
+        req_comp->complete(r);
+        delete this;
+      } else {
+        send_request();
+      }
+    }
+
+    virtual void finish(int r) {
+    }
+
+    void commit_event_extent(int r) {
+      CephContext *cct = image_ctx->cct;
+      ldout(cct, 20) << this << " C_WriteJournalCommit: "
+                     << "write committed: updating journal commit position"
+                     << dendl;
+
+      // all IO operations are flushed prior to closing the journal
+      assert(image_ctx->journal != NULL);
+
+      Extents file_extents;
+      Striper::extent_to_file(cct, &image_ctx->layout, object_no, off,
+                              bl.length(), file_extents);
+      for (Extents::iterator it = file_extents.begin();
+           it != file_extents.end(); ++it) {
+        image_ctx->journal->commit_event_extent(journal_tid, it->first,
+                                                it->second, r);
+      }
+    }
+
+    void send_request() {
+      CephContext *cct = image_ctx->cct;
+      ldout(cct, 20) << this << " C_WriteJournalCommit: "
+                     << "journal committed: sending write request" << dendl;
+
+      RWLock::RLocker owner_locker(image_ctx->owner_lock);
+      assert(image_ctx->image_watcher->is_lock_owner());
+
+      request_sent = true;
+      AioObjectWrite *req = new AioObjectWrite(image_ctx, oid, object_no, off,
+                                               bl, snapc, this);
+      req->send();
+    }
+  };
+
   LibrbdWriteback::LibrbdWriteback(ImageCtx *ictx, Mutex& lock)
     : m_finisher(new Finisher(ictx->cct)), m_tid(0), m_lock(lock), m_ictx(ictx)
   {
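
C_WriteJournalCommit above is completed twice: the first complete()
fires once the journal entry is safe and re-sends the object write with
the context itself as the completion; the second complete() (or any
error) commits the journal event extents and finishes the original
request. A standalone model of that two-phase completion, with no
librbd types (purely illustrative):

    #include <iostream>

    struct TwoPhaseContext {
      bool request_sent = false;

      void complete(int r) {
        if (request_sent || r < 0) {
          std::cout << "phase 2: commit extents, finish with r=" << r << "\n";
          delete this;
        } else {
          request_sent = true;
          std::cout << "phase 1: journal safe, sending object write\n";
          complete(0);  // the real code passes 'this' as the write completion
        }
      }
    };

    int main() {
      (new TwoPhaseContext())->complete(0);  // journal-safe callback
      return 0;
    }
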
@@ -160,21 +234,52 @@ namespace librbd {
 			       const SnapContext& snapc,
 			       const bufferlist &bl, utime_t mtime,
 			       uint64_t trunc_size, __u32 trunc_seq,
-			       Context *oncommit)
+			       ceph_tid_t journal_tid, Context *oncommit)
   {
     assert(m_ictx->owner_lock.is_locked());
     uint64_t object_no = oid_to_object_no(oid.name, m_ictx->object_prefix);
-    
+
     write_result_d *result = new write_result_d(oid.name, oncommit);
     m_writes[oid.name].push(result);
     ldout(m_ictx->cct, 20) << "write will wait for result " << result << dendl;
     C_OrderedWrite *req_comp = new C_OrderedWrite(m_ictx->cct, result, this);
-    AioWrite *req = new AioWrite(m_ictx, oid.name, object_no, off, bl, snapc,
-                                 req_comp);
-    req->send();
+
+    // all IO operations are flushed prior to closing the journal
+    assert(journal_tid == 0 || m_ictx->journal != NULL);
+    if (journal_tid != 0) {
+      m_ictx->journal->flush_event(
+        journal_tid, new C_WriteJournalCommit(m_ictx, oid.name, object_no, off,
+                                              bl, snapc, req_comp,
+                                              journal_tid));
+    } else {
+      AioObjectWrite *req = new AioObjectWrite(m_ictx, oid.name, object_no, off,
+                                               bl, snapc, req_comp);
+      req->send();
+    }
     return ++m_tid;
   }
 
+
+  void LibrbdWriteback::overwrite_extent(const object_t& oid, uint64_t off,
+                                         uint64_t len, ceph_tid_t journal_tid) {
+    typedef std::vector<std::pair<uint64_t,uint64_t> > Extents;
+
+    assert(m_ictx->owner_lock.is_locked());
+    uint64_t object_no = oid_to_object_no(oid.name, m_ictx->object_prefix);
+
+    // all IO operations are flushed prior to closing the journal
+    assert(journal_tid != 0 && m_ictx->journal != NULL);
+
+    Extents file_extents;
+    Striper::extent_to_file(m_ictx->cct, &m_ictx->layout, object_no, off,
+                            len, file_extents);
+    for (Extents::iterator it = file_extents.begin();
+         it != file_extents.end(); ++it) {
+      m_ictx->journal->commit_event_extent(journal_tid, it->first, it->second,
+                                           0);
+    }
+  }
+
   void LibrbdWriteback::get_client_lock() {
     m_ictx->owner_lock.get_read();
   }
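
The rule implemented by LibrbdWriteback::write() above: a non-zero
journal_tid marks writeback that originated from a journaled write, so
the object write is parked behind Journal::flush_event() for that tid,
while a zero tid is sent immediately. overwrite_extent() covers dirty
data that is overwritten in the cache before it ever reaches an OSD:
the superseded journal event extents are committed directly with result
0. A minimal sketch of the dispatch rule (types and names illustrative):

    #include <cstdint>
    #include <functional>
    #include <iostream>

    using Callback = std::function<void()>;

    void dispatch_write(uint64_t journal_tid,
                        const std::function<void(Callback)> &flush_event,
                        const Callback &send_object_write) {
      if (journal_tid != 0) {
        flush_event(send_object_write);  // defer until the journal entry is safe
      } else {
        send_object_write();             // unjournaled write goes out directly
      }
    }

    int main() {
      auto flush = [](Callback cb) { std::cout << "journal safe\n"; cb(); };
      auto send = [] { std::cout << "object write sent\n"; };
      dispatch_write(7, flush, send);  // journaled: waits, then sends
      dispatch_write(0, flush, send);  // unjournaled: sends immediately
      return 0;
    }
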
diff --git a/src/librbd/LibrbdWriteback.h b/src/librbd/LibrbdWriteback.h
index b5578ae..b7574ae 100644
--- a/src/librbd/LibrbdWriteback.h
+++ b/src/librbd/LibrbdWriteback.h
@@ -34,9 +34,14 @@ namespace librbd {
 
     // Note that oloc, trunc_size, and trunc_seq are ignored
     virtual ceph_tid_t write(const object_t& oid, const object_locator_t& oloc,
-			uint64_t off, uint64_t len, const SnapContext& snapc,
-			const bufferlist &bl, utime_t mtime, uint64_t trunc_size,
-			__u32 trunc_seq, Context *oncommit);
+                             uint64_t off, uint64_t len,
+                             const SnapContext& snapc, const bufferlist &bl,
+                             utime_t mtime, uint64_t trunc_size,
+                             __u32 trunc_seq, ceph_tid_t journal_tid,
+                             Context *oncommit);
+
+    virtual void overwrite_extent(const object_t& oid, uint64_t off,
+                                  uint64_t len, ceph_tid_t journal_tid);
 
     virtual void get_client_lock();
     virtual void put_client_lock();
diff --git a/src/librbd/Makefile.am b/src/librbd/Makefile.am
index 4360497..72268a7 100644
--- a/src/librbd/Makefile.am
+++ b/src/librbd/Makefile.am
@@ -1,4 +1,5 @@
 librbd_types_la_SOURCES = \
+	librbd/JournalTypes.cc \
 	librbd/WatchNotifyTypes.cc
 noinst_LTLIBRARIES += librbd_types.la
 
@@ -8,7 +9,9 @@ if WITH_RBD
 
 librbd_internal_la_SOURCES = \
 	librbd/AioCompletion.cc \
-	librbd/AioRequest.cc \
+	librbd/AioImageRequest.cc \
+	librbd/AioImageRequestWQ.cc \
+	librbd/AioObjectRequest.cc \
 	librbd/AsyncFlattenRequest.cc \
 	librbd/AsyncObjectThrottle.cc \
 	librbd/AsyncOperation.cc \
@@ -20,6 +23,9 @@ librbd_internal_la_SOURCES = \
 	librbd/ImageCtx.cc \
 	librbd/ImageWatcher.cc \
 	librbd/internal.cc \
+	librbd/Journal.cc \
+	librbd/JournalReplay.cc \
+	librbd/LibrbdAdminSocketHook.cc \
 	librbd/LibrbdWriteback.cc \
 	librbd/ObjectMap.cc \
 	librbd/RebuildObjectMapRequest.cc
@@ -32,11 +38,12 @@ noinst_LTLIBRARIES += librbd_api.la
 librbd_la_SOURCES = \
 	librbd/librbd.cc
 librbd_la_LIBADD = \
-	librbd_internal.la $(LIBRBD_TYPES) \
+	librbd_internal.la $(LIBRBD_TYPES) libjournal.la \
 	$(LIBRADOS) $(LIBCOMMON) $(LIBOSDC) \
 	librados_internal.la \
 	libcls_rbd_client.la \
 	libcls_lock_client.la \
+	libcls_journal_client.la \
 	$(PTHREAD_LIBS) $(EXTRALIBS)
 
 librbd_la_LDFLAGS = ${AM_LDFLAGS} -version-info 1:0:0
@@ -48,7 +55,9 @@ lib_LTLIBRARIES += librbd.la
 
 noinst_HEADERS += \
 	librbd/AioCompletion.h \
-	librbd/AioRequest.h \
+	librbd/AioImageRequest.h \
+	librbd/AioImageRequestWQ.h \
+	librbd/AioObjectRequest.h \
 	librbd/AsyncFlattenRequest.h \
 	librbd/AsyncObjectThrottle.h \
 	librbd/AsyncOperation.h \
@@ -60,6 +69,10 @@ noinst_HEADERS += \
 	librbd/ImageCtx.h \
 	librbd/ImageWatcher.h \
 	librbd/internal.h \
+	librbd/Journal.h \
+	librbd/JournalReplay.h \
+	librbd/JournalTypes.h \
+	librbd/LibrbdAdminSocketHook.h \
 	librbd/LibrbdWriteback.h \
 	librbd/ObjectMap.h \
 	librbd/parent_types.h \
diff --git a/src/librbd/ObjectMap.cc b/src/librbd/ObjectMap.cc
index d947807..e1f2e17 100644
--- a/src/librbd/ObjectMap.cc
+++ b/src/librbd/ObjectMap.cc
@@ -21,6 +21,10 @@ ObjectMap::ObjectMap(ImageCtx &image_ctx)
 {
 }
 
+int ObjectMap::remove(librados::IoCtx &io_ctx, const std::string &image_id) {
+  return io_ctx.remove(object_map_name(image_id, CEPH_NOSNAP));
+}
+
 std::string ObjectMap::object_map_name(const std::string &image_id,
 				       uint64_t snap_id) {
   std::string oid(RBD_OBJECT_MAP_PREFIX + image_id);
diff --git a/src/librbd/ObjectMap.h b/src/librbd/ObjectMap.h
index 797307f..1737e12 100644
--- a/src/librbd/ObjectMap.h
+++ b/src/librbd/ObjectMap.h
@@ -21,6 +21,7 @@ public:
 
   ObjectMap(ImageCtx &image_ctx);
 
+  static int remove(librados::IoCtx &io_ctx, const std::string &image_id);
   static std::string object_map_name(const std::string &image_id,
 				     uint64_t snap_id);
 
diff --git a/src/librbd/WatchNotifyTypes.cc b/src/librbd/WatchNotifyTypes.cc
index 6785864..af2e434 100644
--- a/src/librbd/WatchNotifyTypes.cc
+++ b/src/librbd/WatchNotifyTypes.cc
@@ -17,6 +17,7 @@ public:
 
   template <typename Payload>
   inline void operator()(const Payload &payload) const {
+    ::encode(static_cast<uint32_t>(Payload::NOTIFY_OP), m_bl);
     payload.encode(m_bl);
   }
 
@@ -45,6 +46,8 @@ public:
 
   template <typename Payload>
   inline void operator()(const Payload &payload) const {
+    NotifyOp notify_op = Payload::NOTIFY_OP;
+    m_formatter->dump_string("notify_op", stringify(notify_op));
     payload.dump(m_formatter);
   }
 
@@ -87,7 +90,6 @@ void AsyncRequestId::dump(Formatter *f) const {
 }
 
 void AcquiredLockPayload::encode(bufferlist &bl) const {
-  ::encode(static_cast<uint32_t>(NOTIFY_OP_ACQUIRED_LOCK), bl);
   ::encode(client_id, bl);
 }
 
@@ -98,14 +100,12 @@ void AcquiredLockPayload::decode(__u8 version, bufferlist::iterator &iter) {
 }
 
 void AcquiredLockPayload::dump(Formatter *f) const {
-  f->dump_string("notify_op", stringify(NOTIFY_OP_ACQUIRED_LOCK));
   f->open_object_section("client_id");
   client_id.dump(f);
   f->close_section();
 }
 
 void ReleasedLockPayload::encode(bufferlist &bl) const {
-  ::encode(static_cast<uint32_t>(NOTIFY_OP_RELEASED_LOCK), bl);
   ::encode(client_id, bl);
 }
 
@@ -116,14 +116,12 @@ void ReleasedLockPayload::decode(__u8 version, bufferlist::iterator &iter) {
 }
 
 void ReleasedLockPayload::dump(Formatter *f) const {
-  f->dump_string("notify_op", stringify(NOTIFY_OP_RELEASED_LOCK));
   f->open_object_section("client_id");
   client_id.dump(f);
   f->close_section();
 }
 
 void RequestLockPayload::encode(bufferlist &bl) const {
-  ::encode(static_cast<uint32_t>(NOTIFY_OP_REQUEST_LOCK), bl);
   ::encode(client_id, bl);
 }
 
@@ -134,25 +132,21 @@ void RequestLockPayload::decode(__u8 version, bufferlist::iterator &iter) {
 }
 
 void RequestLockPayload::dump(Formatter *f) const {
-  f->dump_string("notify_op", stringify(NOTIFY_OP_REQUEST_LOCK));
   f->open_object_section("client_id");
   client_id.dump(f);
   f->close_section();
 }
 
 void HeaderUpdatePayload::encode(bufferlist &bl) const {
-  ::encode(static_cast<uint32_t>(NOTIFY_OP_HEADER_UPDATE), bl);
 }
 
 void HeaderUpdatePayload::decode(__u8 version, bufferlist::iterator &iter) {
 }
 
 void HeaderUpdatePayload::dump(Formatter *f) const {
-  f->dump_string("notify_op", stringify(NOTIFY_OP_HEADER_UPDATE));
 }
 
 void AsyncProgressPayload::encode(bufferlist &bl) const {
-  ::encode(static_cast<uint32_t>(NOTIFY_OP_ASYNC_PROGRESS), bl);
   ::encode(async_request_id, bl);
   ::encode(offset, bl);
   ::encode(total, bl);
@@ -165,7 +159,6 @@ void AsyncProgressPayload::decode(__u8 version, bufferlist::iterator &iter) {
 }
 
 void AsyncProgressPayload::dump(Formatter *f) const {
-  f->dump_string("notify_op", stringify(NOTIFY_OP_ASYNC_PROGRESS));
   f->open_object_section("async_request_id");
   async_request_id.dump(f);
   f->close_section();
@@ -174,7 +167,6 @@ void AsyncProgressPayload::dump(Formatter *f) const {
 }
 
 void AsyncCompletePayload::encode(bufferlist &bl) const {
-  ::encode(static_cast<uint32_t>(NOTIFY_OP_ASYNC_COMPLETE), bl);
   ::encode(async_request_id, bl);
   ::encode(result, bl);
 }
@@ -185,7 +177,6 @@ void AsyncCompletePayload::decode(__u8 version, bufferlist::iterator &iter) {
 }
 
 void AsyncCompletePayload::dump(Formatter *f) const {
-  f->dump_string("notify_op", stringify(NOTIFY_OP_ASYNC_COMPLETE));
   f->open_object_section("async_request_id");
   async_request_id.dump(f);
   f->close_section();
@@ -193,7 +184,6 @@ void AsyncCompletePayload::dump(Formatter *f) const {
 }
 
 void FlattenPayload::encode(bufferlist &bl) const {
-  ::encode(static_cast<uint32_t>(NOTIFY_OP_FLATTEN), bl);
   ::encode(async_request_id, bl);
 }
 
@@ -202,14 +192,12 @@ void FlattenPayload::decode(__u8 version, bufferlist::iterator &iter) {
 }
 
 void FlattenPayload::dump(Formatter *f) const {
-  f->dump_string("notify_op", stringify(NOTIFY_OP_FLATTEN));
   f->open_object_section("async_request_id");
   async_request_id.dump(f);
   f->close_section();
 }
 
 void ResizePayload::encode(bufferlist &bl) const {
-  ::encode(static_cast<uint32_t>(NOTIFY_OP_RESIZE), bl);
   ::encode(size, bl);
   ::encode(async_request_id, bl);
 }
@@ -220,7 +208,6 @@ void ResizePayload::decode(__u8 version, bufferlist::iterator &iter) {
 }
 
 void ResizePayload::dump(Formatter *f) const {
-  f->dump_string("notify_op", stringify(NOTIFY_OP_RESIZE));
   f->dump_unsigned("size", size);
   f->open_object_section("async_request_id");
   async_request_id.dump(f);
@@ -228,7 +215,6 @@ void ResizePayload::dump(Formatter *f) const {
 }
 
 void SnapCreatePayload::encode(bufferlist &bl) const {
-  ::encode(static_cast<uint32_t>(NOTIFY_OP_SNAP_CREATE), bl);
   ::encode(snap_name, bl);
 }
 
@@ -237,12 +223,24 @@ void SnapCreatePayload::decode(__u8 version, bufferlist::iterator &iter) {
 }
 
 void SnapCreatePayload::dump(Formatter *f) const {
-  f->dump_string("notify_op", stringify(NOTIFY_OP_SNAP_CREATE));
   f->dump_string("snap_name", snap_name);
 }
 
+void SnapRenamePayload::encode(bufferlist &bl) const {
+  ::encode(src_snap_id, bl);
+  ::encode(dst_snap_name, bl);
+}
+
+void SnapRenamePayload::decode(__u8 version, bufferlist::iterator &iter) {
+  ::decode(src_snap_id, iter);
+  ::decode(dst_snap_name, iter);
+}
+
+void SnapRenamePayload::dump(Formatter *f) const {
+  f->dump_unsigned("src_snap_id", src_snap_id);
+  f->dump_string("dst_snap_name", dst_snap_name);
+}
+
 void SnapRemovePayload::encode(bufferlist &bl) const {
-  ::encode(static_cast<uint32_t>(NOTIFY_OP_SNAP_REMOVE), bl);
   ::encode(snap_name, bl);
 }
 
@@ -251,12 +249,10 @@ void SnapRemovePayload::decode(__u8 version, bufferlist::iterator &iter) {
 }
 
 void SnapRemovePayload::dump(Formatter *f) const {
-  f->dump_string("notify_op", stringify(NOTIFY_OP_SNAP_REMOVE));
   f->dump_string("snap_name", snap_name);
 }
 
 void RebuildObjectMapPayload::encode(bufferlist &bl) const {
-  ::encode(static_cast<uint32_t>(NOTIFY_OP_REBUILD_OBJECT_MAP), bl);
   ::encode(async_request_id, bl);
 }
 
@@ -265,7 +261,6 @@ void RebuildObjectMapPayload::decode(__u8 version, bufferlist::iterator &iter) {
 }
 
 void RebuildObjectMapPayload::dump(Formatter *f) const {
-  f->dump_string("notify_op", stringify(NOTIFY_OP_REBUILD_OBJECT_MAP));
   f->open_object_section("async_request_id");
   async_request_id.dump(f);
   f->close_section();
@@ -325,6 +320,9 @@ void NotifyMessage::decode(bufferlist::iterator& iter) {
   case NOTIFY_OP_SNAP_REMOVE:
     payload = SnapRemovePayload();
     break;
+  case NOTIFY_OP_SNAP_RENAME:
+    payload = SnapRenamePayload();
+    break;
   case NOTIFY_OP_REBUILD_OBJECT_MAP:
     payload = RebuildObjectMapPayload();
     break;
@@ -413,6 +411,9 @@ std::ostream &operator<<(std::ostream &out,
   case NOTIFY_OP_SNAP_REMOVE:
     out << "SnapRemove";
     break;
+  case NOTIFY_OP_SNAP_RENAME:
+    out << "SnapRename";
+    break;
   case NOTIFY_OP_REBUILD_OBJECT_MAP:
     out << "RebuildObjectMap";
     break;
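
The refactor above removes the hand-written op-code prefix from every
payload's encode()/dump(): each payload now advertises a static
NOTIFY_OP constant, and the boost::variant visitors emit the
discriminant generically before delegating. A standalone model of the
pattern (std::ostream stands in for bufferlist/Formatter; names are
illustrative):

    #include <cstdint>
    #include <iostream>
    #include <boost/variant.hpp>

    struct FooPayload {
      static const uint32_t NOTIFY_OP = 1;
      void encode(std::ostream &os) const { os << " foo-fields"; }
    };
    struct BarPayload {
      static const uint32_t NOTIFY_OP = 2;
      void encode(std::ostream &os) const { os << " bar-fields"; }
    };

    struct EncodeVisitor : public boost::static_visitor<> {
      explicit EncodeVisitor(std::ostream &os) : m_os(os) {}
      template <typename Payload>
      void operator()(const Payload &payload) const {
        m_os << "op=" << Payload::NOTIFY_OP;  // discriminant, emitted once here
        payload.encode(m_os);                 // payload encodes only its fields
      }
      std::ostream &m_os;
    };

    int main() {
      boost::variant<FooPayload, BarPayload> payload = BarPayload();
      boost::apply_visitor(EncodeVisitor(std::cout), payload);
      std::cout << std::endl;
      return 0;
    }
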
diff --git a/src/librbd/WatchNotifyTypes.h b/src/librbd/WatchNotifyTypes.h
index 94a2836..eaa3305 100644
--- a/src/librbd/WatchNotifyTypes.h
+++ b/src/librbd/WatchNotifyTypes.h
@@ -83,10 +83,13 @@ enum NotifyOp {
   NOTIFY_OP_RESIZE             = 7,
   NOTIFY_OP_SNAP_CREATE        = 8,
   NOTIFY_OP_SNAP_REMOVE        = 9,
-  NOTIFY_OP_REBUILD_OBJECT_MAP = 10
+  NOTIFY_OP_REBUILD_OBJECT_MAP = 10,
+  NOTIFY_OP_SNAP_RENAME        = 11
 };
 
 struct AcquiredLockPayload {
+  static const NotifyOp NOTIFY_OP = NOTIFY_OP_ACQUIRED_LOCK;
+
   ClientId client_id;
 
   AcquiredLockPayload() {}
@@ -98,6 +101,8 @@ struct AcquiredLockPayload {
 };
 
 struct ReleasedLockPayload {
+  static const NotifyOp NOTIFY_OP = NOTIFY_OP_RELEASED_LOCK;
+
   ClientId client_id;
 
   ReleasedLockPayload() {}
@@ -109,6 +114,8 @@ struct ReleasedLockPayload {
 };
 
 struct RequestLockPayload {
+  static const NotifyOp NOTIFY_OP = NOTIFY_OP_REQUEST_LOCK;
+
   ClientId client_id;
 
   RequestLockPayload() {}
@@ -120,12 +127,16 @@ struct RequestLockPayload {
 };
 
 struct HeaderUpdatePayload {
+  static const NotifyOp NOTIFY_OP = NOTIFY_OP_HEADER_UPDATE;
+
   void encode(bufferlist &bl) const;
   void decode(__u8 version, bufferlist::iterator &iter);
   void dump(Formatter *f) const;
 };
 
 struct AsyncProgressPayload {
+  static const NotifyOp NOTIFY_OP = NOTIFY_OP_ASYNC_PROGRESS;
+
   AsyncProgressPayload() : offset(0), total(0) {}
   AsyncProgressPayload(const AsyncRequestId &id, uint64_t offset_, uint64_t total_)
     : async_request_id(id), offset(offset_), total(total_) {}
@@ -140,6 +151,8 @@ struct AsyncProgressPayload {
 };
 
 struct AsyncCompletePayload {
+  static const NotifyOp NOTIFY_OP = NOTIFY_OP_ASYNC_COMPLETE;
+
   AsyncCompletePayload() {}
   AsyncCompletePayload(const AsyncRequestId &id, int r)
     : async_request_id(id), result(r) {}
@@ -153,6 +166,8 @@ struct AsyncCompletePayload {
 };
 
 struct FlattenPayload {
+  static const NotifyOp NOTIFY_OP = NOTIFY_OP_FLATTEN;
+
   FlattenPayload() {}
   FlattenPayload(const AsyncRequestId &id) : async_request_id(id) {}
 
@@ -164,6 +179,8 @@ struct FlattenPayload {
 };
 
 struct ResizePayload {
+  static const NotifyOp NOTIFY_OP = NOTIFY_OP_RESIZE;
+
   ResizePayload() : size(0) {}
   ResizePayload(uint64_t size_, const AsyncRequestId &id)
     : size(size_), async_request_id(id) {}
@@ -177,6 +194,8 @@ struct ResizePayload {
 };
 
 struct SnapCreatePayload {
+  static const NotifyOp NOTIFY_OP = NOTIFY_OP_SNAP_CREATE;
+
   SnapCreatePayload() {}
   SnapCreatePayload(const std::string &name) : snap_name(name) {}
 
@@ -187,7 +206,24 @@ struct SnapCreatePayload {
   void dump(Formatter *f) const;
 };
 
+struct SnapRenamePayload {
+  static const NotifyOp NOTIFY_OP = NOTIFY_OP_SNAP_RENAME;
+
+  SnapRenamePayload() {}
+  SnapRenamePayload(const uint64_t &src_snap_id, const std::string &dst_name)
+    : src_snap_id(src_snap_id), dst_snap_name(dst_name) {}
+
+  uint64_t src_snap_id;
+  std::string dst_snap_name;
+
+  void encode(bufferlist &bl) const;
+  void decode(__u8 version, bufferlist::iterator &iter);
+  void dump(Formatter *f) const;
+};
+
 struct SnapRemovePayload {
+  static const NotifyOp NOTIFY_OP = NOTIFY_OP_SNAP_REMOVE;
+
   SnapRemovePayload() {}
   SnapRemovePayload(const std::string &name) : snap_name(name) {}
 
@@ -199,6 +235,8 @@ struct SnapRemovePayload {
 };
 
 struct RebuildObjectMapPayload {
+  static const NotifyOp NOTIFY_OP = NOTIFY_OP_REBUILD_OBJECT_MAP;
+
   RebuildObjectMapPayload() {}
   RebuildObjectMapPayload(const AsyncRequestId &id) : async_request_id(id) {}
 
@@ -210,23 +248,26 @@ struct RebuildObjectMapPayload {
 };
 
 struct UnknownPayload {
+  static const NotifyOp NOTIFY_OP = static_cast<NotifyOp>(-1);
+
   void encode(bufferlist &bl) const;
   void decode(__u8 version, bufferlist::iterator &iter);
   void dump(Formatter *f) const;
 };
 
 typedef boost::variant<AcquiredLockPayload,
-                 ReleasedLockPayload,
-                 RequestLockPayload,
-                 HeaderUpdatePayload,
-                 AsyncProgressPayload,
-                 AsyncCompletePayload,
-                 FlattenPayload,
-                 ResizePayload,
-                 SnapCreatePayload,
-                 SnapRemovePayload,
-                 RebuildObjectMapPayload,
-                 UnknownPayload> Payload;
+                       ReleasedLockPayload,
+                       RequestLockPayload,
+                       HeaderUpdatePayload,
+                       AsyncProgressPayload,
+                       AsyncCompletePayload,
+                       FlattenPayload,
+                       ResizePayload,
+                       SnapCreatePayload,
+                       SnapRemovePayload,
+                       SnapRenamePayload,
+                       RebuildObjectMapPayload,
+                       UnknownPayload> Payload;
 
 struct NotifyMessage {
   NotifyMessage() : payload(UnknownPayload()) {}
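
As the SnapRenamePayload addition above shows, wiring a new
notification touches four places: a new NOTIFY_OP enumerator, a payload
struct carrying its NOTIFY_OP constant plus encode/decode/dump, a slot
in the boost::variant Payload typedef, and matching cases in
NotifyMessage::decode() and in the NotifyOp operator<< used for
logging.
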
diff --git a/src/librbd/internal.cc b/src/librbd/internal.cc
index 3c6f740..d925b42 100644
--- a/src/librbd/internal.cc
+++ b/src/librbd/internal.cc
@@ -10,6 +10,7 @@
 #include "common/errno.h"
 #include "common/ContextCompletion.h"
 #include "common/Throttle.h"
+#include "common/WorkQueue.h"
 #include "cls/lock/cls_lock_client.h"
 #include "include/stringify.h"
 
@@ -17,7 +18,9 @@
 #include "cls/rbd/cls_rbd_client.h"
 
 #include "librbd/AioCompletion.h"
-#include "librbd/AioRequest.h"
+#include "librbd/AioImageRequest.h"
+#include "librbd/AioImageRequestWQ.h"
+#include "librbd/AioObjectRequest.h"
 #include "librbd/AsyncFlattenRequest.h"
 #include "librbd/AsyncResizeRequest.h"
 #include "librbd/AsyncTrimRequest.h"
@@ -26,6 +29,7 @@
 #include "librbd/ImageCtx.h"
 #include "librbd/ImageWatcher.h"
 #include "librbd/internal.h"
+#include "librbd/Journal.h"
 #include "librbd/ObjectMap.h"
 #include "librbd/parent_types.h"
 #include "librbd/RebuildObjectMapRequest.h"
@@ -33,6 +37,7 @@
 
 #include <boost/bind.hpp>
 #include <boost/scope_exit.hpp>
+#include <boost/variant.hpp>
 #include "include/assert.h"
 
 #define dout_subsys ceph_subsys_rbd
@@ -79,6 +84,36 @@ int remove_object_map(ImageCtx *ictx) {
   }
   return 0;
 }
+
+int create_object_map(ImageCtx *ictx) {
+  assert(ictx->snap_lock.is_locked());
+  CephContext *cct = ictx->cct;
+
+  int r;
+  std::vector<uint64_t> snap_ids;
+  snap_ids.push_back(CEPH_NOSNAP);
+  for (std::map<snap_t, SnapInfo>::iterator it = ictx->snap_info.begin();
+       it != ictx->snap_info.end(); ++it) {
+    snap_ids.push_back(it->first);
+  }
+
+  for (std::vector<uint64_t>::iterator it = snap_ids.begin();
+       it != snap_ids.end(); ++it) {
+    librados::ObjectWriteOperation op;
+    std::string oid(ObjectMap::object_map_name(ictx->id, *it));
+    uint64_t snap_size = ictx->get_image_size(*it);
+    cls_client::object_map_resize(
+      &op, Striper::get_num_objects(ictx->layout, snap_size),
+      OBJECT_NONEXISTENT);
+    r = ictx->md_ctx.operate(oid, &op);
+    if (r < 0) {
+      lderr(cct) << "failed to create object map " << oid << ": "
+                 << cpp_strerror(r) << dendl;
+      return r;
+    }
+  }
+
+  return 0;
+}
 
 int update_all_flags(ImageCtx *ictx, uint64_t flags, uint64_t mask) {
   assert(ictx->snap_lock.is_locked());
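
create_object_map() above builds one object map per image revision: the
HEAD map (CEPH_NOSNAP) plus one per existing snapshot, each sized from
that revision's image size via object_map_resize; it is called further
below from update_features() when the object-map feature is switched on
for an existing image. A standalone model of the loop (sizes and the
no-snap sentinel are illustrative):

    #include <cstdint>
    #include <iostream>
    #include <map>
    #include <vector>

    int main() {
      const uint64_t kNoSnap = UINT64_MAX;  // stand-in for CEPH_NOSNAP
      std::map<uint64_t, uint64_t> snap_sizes = {{4, 1u << 22}, {9, 1u << 23}};
      uint64_t head_size = 1u << 24;

      std::vector<uint64_t> snap_ids;
      snap_ids.push_back(kNoSnap);           // HEAD first, then each snapshot
      for (auto &s : snap_sizes)
        snap_ids.push_back(s.first);

      for (uint64_t snap_id : snap_ids) {
        uint64_t size = (snap_id == kNoSnap) ? head_size : snap_sizes[snap_id];
        std::cout << "object map for snap " << snap_id
                  << " sized for " << size << " bytes" << std::endl;
      }
      return 0;
    }
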
@@ -283,12 +318,6 @@ int invoke_async_request(ImageCtx *ictx, const std::string& request_type,
     return num;
   }
 
-  int init_rbd_info(struct rbd_info *info)
-  {
-    memset(info, 0, sizeof(*info));
-    return 0;
-  }
-
   void trim_image(ImageCtx *ictx, uint64_t newsize, ProgressContext& prog_ctx)
   {
     assert(ictx->owner_lock.is_locked());
@@ -309,25 +338,6 @@ int invoke_async_request(ImageCtx *ictx, const std::string& request_type,
     }
   }
 
-  int read_rbd_info(IoCtx& io_ctx, const string& info_oid,
-		    struct rbd_info *info)
-  {
-    int r;
-    bufferlist bl;
-    r = io_ctx.read(info_oid, bl, sizeof(*info), 0);
-    if (r < 0)
-      return r;
-    if (r == 0) {
-      return init_rbd_info(info);
-    }
-
-    if (r < (int)sizeof(*info))
-      return -EIO;
-
-    memcpy(info, bl.c_str(), r);
-    return 0;
-  }
-
   int read_header_bl(IoCtx& io_ctx, const string& header_oid,
 		     bufferlist& header, uint64_t *ver)
   {
@@ -383,16 +393,6 @@ int invoke_async_request(ImageCtx *ictx, const std::string& request_type,
     return 0;
   }
 
-  int write_header(IoCtx& io_ctx, const string& header_oid, bufferlist& header)
-  {
-    bufferlist bl;
-    int r = io_ctx.write(header_oid, header, header.length(), 0);
-
-    notify_change(io_ctx, header_oid, NULL);
-
-    return r;
-  }
-
   int tmap_set(IoCtx& io_ctx, const string& imgname)
   {
     bufferlist cmdbl, emptybl;
@@ -412,6 +412,156 @@ int invoke_async_request(ImageCtx *ictx, const std::string& request_type,
     return io_ctx.tmap_update(RBD_DIRECTORY, cmdbl);
   }
 
+  typedef boost::variant<std::string,uint64_t> image_option_value_t;
+  typedef std::map<int,image_option_value_t> image_options_t;
+  typedef std::shared_ptr<image_options_t> image_options_ref;
+
+  enum image_option_type_t {
+    STR,
+    UINT64,
+  };
+
+  const std::map<int, image_option_type_t> IMAGE_OPTIONS_TYPE_MAPPING = {
+    {RBD_IMAGE_OPTION_FORMAT, UINT64},
+    {RBD_IMAGE_OPTION_FEATURES, UINT64},
+    {RBD_IMAGE_OPTION_ORDER, UINT64},
+    {RBD_IMAGE_OPTION_STRIPE_UNIT, UINT64},
+    {RBD_IMAGE_OPTION_STRIPE_COUNT, UINT64},
+  };
+
+  void image_options_create(rbd_image_options_t* opts)
+  {
+    image_options_ref* opts_ = new image_options_ref(new image_options_t());
+
+    *opts = static_cast<rbd_image_options_t>(opts_);
+  }
+
+  void image_options_create_ref(rbd_image_options_t* opts,
+				rbd_image_options_t orig)
+  {
+    image_options_ref* orig_ = static_cast<image_options_ref*>(orig);
+    image_options_ref* opts_ = new image_options_ref(*orig_);
+
+    *opts = static_cast<rbd_image_options_t>(opts_);
+  }
+
+  void image_options_destroy(rbd_image_options_t opts)
+  {
+    image_options_ref* opts_ = static_cast<image_options_ref*>(opts);
+
+    delete opts_;
+  }
+
+  int image_options_set(rbd_image_options_t opts, int optname,
+			const std::string& optval)
+  {
+    image_options_ref* opts_ = static_cast<image_options_ref*>(opts);
+
+    std::map<int, image_option_type_t>::const_iterator i =
+      IMAGE_OPTIONS_TYPE_MAPPING.find(optname);
+
+    if (i == IMAGE_OPTIONS_TYPE_MAPPING.end() || i->second != STR) {
+      return -EINVAL;
+    }
+
+    (*opts_->get())[optname] = optval;
+    return 0;
+  }
+
+  int image_options_set(rbd_image_options_t opts, int optname, uint64_t optval)
+  {
+    image_options_ref* opts_ = static_cast<image_options_ref*>(opts);
+
+    std::map<int, image_option_type_t>::const_iterator i =
+      IMAGE_OPTIONS_TYPE_MAPPING.find(optname);
+
+    if (i == IMAGE_OPTIONS_TYPE_MAPPING.end() || i->second != UINT64) {
+      return -EINVAL;
+    }
+
+    (*opts_->get())[optname] = optval;
+    return 0;
+  }
+
+  int image_options_get(rbd_image_options_t opts, int optname,
+			std::string* optval)
+  {
+    image_options_ref* opts_ = static_cast<image_options_ref*>(opts);
+
+    std::map<int, image_option_type_t>::const_iterator i =
+      IMAGE_OPTIONS_TYPE_MAPPING.find(optname);
+
+    if (i == IMAGE_OPTIONS_TYPE_MAPPING.end() || i->second != STR) {
+      return -EINVAL;
+    }
+
+    image_options_t::const_iterator j = (*opts_)->find(optname);
+
+    if (j == (*opts_)->end()) {
+      return -ENOENT;
+    }
+
+    *optval = boost::get<std::string>(j->second);
+    return 0;
+  }
+
+  int image_options_get(rbd_image_options_t opts, int optname, uint64_t* optval)
+  {
+    image_options_ref* opts_ = static_cast<image_options_ref*>(opts);
+
+    std::map<int, image_option_type_t>::const_iterator i =
+      IMAGE_OPTIONS_TYPE_MAPPING.find(optname);
+
+    if (i == IMAGE_OPTIONS_TYPE_MAPPING.end() || i->second != UINT64) {
+      return -EINVAL;
+    }
+
+    image_options_t::const_iterator j = (*opts_)->find(optname);
+
+    if (j == (*opts_)->end()) {
+      return -ENOENT;
+    }
+
+    *optval = boost::get<uint64_t>(j->second);
+    return 0;
+  }
+
+  int image_options_unset(rbd_image_options_t opts, int optname)
+  {
+    image_options_ref* opts_ = static_cast<image_options_ref*>(opts);
+
+    std::map<int, image_option_type_t>::const_iterator i =
+      IMAGE_OPTIONS_TYPE_MAPPING.find(optname);
+
+    if (i == IMAGE_OPTIONS_TYPE_MAPPING.end()) {
+      assert((*opts_)->find(optname) == (*opts_)->end());
+      return -EINVAL;
+    }
+
+    image_options_t::const_iterator j = (*opts_)->find(optname);
+
+    if (j == (*opts_)->end()) {
+      return -ENOENT;
+    }
+
+    (*opts_)->erase(j);
+    return 0;
+  }
+
+  void image_options_clear(rbd_image_options_t opts)
+  {
+    image_options_ref* opts_ = static_cast<image_options_ref*>(opts);
+
+    (*opts_)->clear();
+  }
+
+  bool image_options_is_empty(rbd_image_options_t opts)
+  {
+    image_options_ref* opts_ = static_cast<image_options_ref*>(opts);
+
+    return (*opts_)->empty();
+  }
+
   void rollback_object(ImageCtx *ictx, uint64_t snap_id, const string& oid,
 		       SimpleThrottle& throttle)
   {
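
The image_options_* helpers above implement a typed option map: a
shared_ptr to a map from option id to a string/uint64_t boost::variant,
where every setter and getter first checks the option's declared type
in IMAGE_OPTIONS_TYPE_MAPPING and rejects mismatches with -EINVAL. The
new create()/clone()/copy() overloads below consume ImageOptions built
on top of these helpers. A standalone model of the type-checked setter
(ids and types illustrative):

    #include <cerrno>
    #include <cstdint>
    #include <iostream>
    #include <map>
    #include <string>
    #include <boost/variant.hpp>

    enum option_type_t { STR, UINT64 };
    typedef boost::variant<std::string, uint64_t> option_value_t;

    static const std::map<int, option_type_t> kOptionTypes = {
      {0, UINT64},  // e.g. an order-style numeric option
      {1, STR},     // e.g. a name-style string option
    };
    static std::map<int, option_value_t> g_options;

    int set_option(int optname, uint64_t optval) {
      std::map<int, option_type_t>::const_iterator i =
        kOptionTypes.find(optname);
      if (i == kOptionTypes.end() || i->second != UINT64)
        return -EINVAL;  // unknown option, or declared as a string
      g_options[optname] = optval;
      return 0;
    }

    int main() {
      std::cout << set_option(0, 22) << std::endl;  // 0: accepted
      std::cout << set_option(1, 22) << std::endl;  // -EINVAL: option 1 is STR
      return 0;
    }
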
@@ -619,13 +769,13 @@ int invoke_async_request(ImageCtx *ictx, const std::string& request_type,
     ldout(ictx->cct, 20) << "snap_create_helper " << ictx << " " << snap_name
                          << dendl;
 
-    int r = ictx_check(ictx, true);
+    int r = ictx_check(ictx, ictx->owner_lock);
     if (r < 0) {
       return r;
     }
 
     RWLock::WLocker md_locker(ictx->md_lock);
-    r = _flush(ictx);
+    r = ictx->flush();
     if (r < 0) {
       return r;
     }
@@ -644,6 +794,71 @@ int invoke_async_request(ImageCtx *ictx, const std::string& request_type,
     return 0;
   }
 
+  int snap_rename(ImageCtx *ictx, const char *srcname, const char *dstname)
+  {
+    ldout(ictx->cct, 20) << "snap_rename " << ictx << " from " << srcname << " to " << dstname << dendl;
+
+    snapid_t snap_id;
+    if (ictx->read_only) {
+      return -EROFS;
+    }
+
+    int r = ictx_check(ictx);
+    if (r < 0)
+      return r;
+
+    {
+      RWLock::RLocker l(ictx->snap_lock);
+      snap_id = ictx->get_snap_id(srcname);
+      if (snap_id == CEPH_NOSNAP) {
+        return -ENOENT;
+      }
+      if (ictx->get_snap_id(dstname) != CEPH_NOSNAP) {
+        return -EEXIST;
+      }
+    }
+
+    r = invoke_async_request(ictx, "snap_rename", true,
+                             boost::bind(&snap_rename_helper, ictx, _1,
+                                         snap_id, dstname),
+                             boost::bind(&ImageWatcher::notify_snap_rename,
+                                         ictx->image_watcher, snap_id,
+                                         dstname));
+    if (r < 0 && r != -EEXIST) {
+      return r;
+    }
+
+    ictx->perfcounter->inc(l_librbd_snap_rename);
+    notify_change(ictx->md_ctx, ictx->header_oid, ictx);
+    return 0;
+  }
+
+  int snap_rename_helper(ImageCtx* ictx, Context* ctx,
+                         const uint64_t src_snap_id,
+                         const char* dst_name) {
+    assert(ictx->owner_lock.is_locked());
+    assert(!ictx->image_watcher->is_lock_supported() ||
+	   ictx->image_watcher->is_lock_owner());
+
+    ldout(ictx->cct, 20) << __func__ << " " << ictx << " from "
+                         << src_snap_id << " to " << dst_name << dendl;
+
+    int r = ictx_check(ictx, ictx->owner_lock);
+    if (r < 0) {
+      return r;
+    }
+    r = rename_snap(ictx, src_snap_id, dst_name);
+
+    if (r < 0) {
+      return r;
+    }
+
+    if (ctx != NULL) {
+      ctx->complete(0);
+    }
+    return 0;
+  }
+
   static int scan_for_parents(ImageCtx *ictx, parent_spec &pspec,
 			      snapid_t oursnap_id)
   {
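
snap_rename() above follows the proxied-maintenance pattern used by the
other snapshot operations: both names are validated up front under
snap_lock (-ENOENT for a missing source, -EEXIST for an existing
destination), then invoke_async_request() either runs
snap_rename_helper locally when this client holds the exclusive lock or
forwards the request to the lock owner through
ImageWatcher::notify_snap_rename. An -EEXIST on that path is tolerated
because it indicates the rename already took effect.
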
@@ -719,7 +934,7 @@ int invoke_async_request(ImageCtx *ictx, const std::string& request_type,
     ldout(ictx->cct, 20) << "snap_remove_helper " << ictx << " " << snap_name
                          << dendl;
 
-    int r = ictx_check(ictx, true);
+    int r = ictx_check(ictx, ictx->owner_lock);
     if (r < 0) {
       return r;
     }
@@ -1087,13 +1302,37 @@ reprotect_and_return_err:
                                     OBJECT_NONEXISTENT);
       r = io_ctx.operate(ObjectMap::object_map_name(id, CEPH_NOSNAP), &op);
       if (r < 0) {
+        lderr(cct) << "error creating initial object map: "
+                   << cpp_strerror(r) << dendl;
         goto err_remove_header;
       }
     }
 
+    if ((features & RBD_FEATURE_JOURNALING) != 0) {
+      if ((features & RBD_FEATURE_EXCLUSIVE_LOCK) == 0) {
+        lderr(cct) << "cannot use journaling without exclusive lock" << dendl;
+        goto err_remove_object_map;
+      }
+
+      r = Journal::create(io_ctx, id);
+      if (r < 0) {
+        lderr(cct) << "error creating journal: " << cpp_strerror(r) << dendl;
+        goto err_remove_object_map;
+      }
+    }
+
     ldout(cct, 2) << "done." << dendl;
     return 0;
 
+  err_remove_object_map:
+    if ((features & RBD_FEATURE_OBJECT_MAP) != 0) {
+      remove_r = ObjectMap::remove(io_ctx, id);
+      if (remove_r < 0) {
+        lderr(cct) << "error cleaning up object map after creation failed: "
+                   << cpp_strerror(remove_r) << dendl;
+      }
+    }
+
   err_remove_header:
     remove_r = io_ctx.remove(header_oid);
     if (remove_r < 0) {
@@ -1134,15 +1373,59 @@ reprotect_and_return_err:
     if (!order)
       return -EINVAL;
 
+    uint64_t order_ = *order;
+    uint64_t format = old_format ? 1 : 2;
+    ImageOptions opts;
+    int r;
+
+    r = opts.set(RBD_IMAGE_OPTION_FORMAT, format);
+    assert(r == 0);
+    r = opts.set(RBD_IMAGE_OPTION_FEATURES, features);
+    assert(r == 0);
+    r = opts.set(RBD_IMAGE_OPTION_ORDER, order_);
+    assert(r == 0);
+    r = opts.set(RBD_IMAGE_OPTION_STRIPE_UNIT, stripe_unit);
+    assert(r == 0);
+    r = opts.set(RBD_IMAGE_OPTION_STRIPE_COUNT, stripe_count);
+    assert(r == 0);
+
+    r = create(io_ctx, imgname, size, opts);
+
+    int r1 = opts.get(RBD_IMAGE_OPTION_ORDER, &order_);
+    assert(r1 == 0);
+    *order = order_;
+
+    return r;
+  }
+
+  int create(IoCtx& io_ctx, const char *imgname, uint64_t size,
+	     ImageOptions& opts)
+  {
     CephContext *cct = (CephContext *)io_ctx.cct();
+
+    uint64_t format = cct->_conf->rbd_default_format;
+    opts.get(RBD_IMAGE_OPTION_FORMAT, &format);
+    bool old_format = format == 1;
+
+    uint64_t features;
+    if (opts.get(RBD_IMAGE_OPTION_FEATURES, &features) != 0) {
+      features = old_format ? 0 : cct->_conf->rbd_default_features;
+    }
+    uint64_t stripe_unit = 0;
+    uint64_t stripe_count = 0;
+    opts.get(RBD_IMAGE_OPTION_STRIPE_UNIT, &stripe_unit);
+    opts.get(RBD_IMAGE_OPTION_STRIPE_COUNT, &stripe_count);
+
+    uint64_t order = 0;
+    opts.get(RBD_IMAGE_OPTION_ORDER, &order);
+
     ldout(cct, 20) << "create " << &io_ctx << " name = " << imgname
 		   << " size = " << size << " old_format = " << old_format
-		   << " features = " << features << " order = " << *order
+		   << " features = " << features << " order = " << order
 		   << " stripe_unit = " << stripe_unit
 		   << " stripe_count = " << stripe_count
 		   << dendl;
 
-
     if (features & ~RBD_FEATURES_ALL) {
       lderr(cct) << "librbd does not support requested features." << dendl;
       return -ENOSYS;
@@ -1159,12 +1442,12 @@ reprotect_and_return_err:
       return -EEXIST;
     }
 
-    if (!*order)
-      *order = cct->_conf->rbd_default_order;
-    if (!*order)
-      *order = RBD_DEFAULT_OBJ_ORDER;
+    if (!order)
+      order = cct->_conf->rbd_default_order;
+    if (!order)
+      order = RBD_DEFAULT_OBJ_ORDER;
 
-    if (*order > 25 || *order < 12) {
+    if (order > 25 || order < 12) {
       lderr(cct) << "order must be in the range [12, 25]" << dendl;
       return -EDOM;
     }
@@ -1180,7 +1463,7 @@ reprotect_and_return_err:
     }
 
     // normalize for default striping
-    if (stripe_unit == (1ull << *order) && stripe_count == 1) {
+    if (stripe_unit == (1ull << order) && stripe_count == 1) {
       stripe_unit = 0;
       stripe_count = 0;
     }
@@ -1194,22 +1477,44 @@ reprotect_and_return_err:
       return -EINVAL;
 
     if (old_format) {
-      if (stripe_unit && stripe_unit != (1ull << *order))
+      if (stripe_unit && stripe_unit != (1ull << order))
 	return -EINVAL;
       if (stripe_count && stripe_count != 1)
 	return -EINVAL;
 
-      return create_v1(io_ctx, imgname, bid, size, *order);
+      r = create_v1(io_ctx, imgname, bid, size, order);
     } else {
-      return create_v2(io_ctx, imgname, bid, size, *order, features,
-		       stripe_unit, stripe_count);
+      r = create_v2(io_ctx, imgname, bid, size, order, features, stripe_unit,
+		    stripe_count);
     }
+
+    int r1 = opts.set(RBD_IMAGE_OPTION_ORDER, order);
+    assert(r1 == 0);
+
+    return r;
   }
 
   /*
    * Parent may be in different pool, hence different IoCtx
    */
   int clone(IoCtx& p_ioctx, const char *p_name, const char *p_snap_name,
+	    IoCtx& c_ioctx, const char *c_name, ImageOptions& c_opts)
+  {
+    int order = 0;
+    uint64_t features = 0;
+    uint64_t stripe_unit = 0;
+    uint64_t stripe_count = 0;
+    c_opts.get(RBD_IMAGE_OPTION_FEATURES, &features);
+    c_opts.get(RBD_IMAGE_OPTION_STRIPE_UNIT, &stripe_unit);
+    c_opts.get(RBD_IMAGE_OPTION_STRIPE_COUNT, &stripe_count);
+
+    int r = clone(p_ioctx, p_name, p_snap_name, c_ioctx, c_name, features,
+		  &order, stripe_unit, stripe_count);
+    c_opts.set(RBD_IMAGE_OPTION_ORDER, static_cast<uint64_t>(order));
+    return r;
+  }
+
+  int clone(IoCtx& p_ioctx, const char *p_name, const char *p_snap_name,
 	    IoCtx& c_ioctx, const char *c_name,
 	    uint64_t features, int *c_order,
 	    uint64_t stripe_unit, int stripe_count)
@@ -1536,6 +1841,13 @@ reprotect_and_return_err:
       return -EINVAL;
     }
 
+    RWLock::RLocker owner_locker(ictx->owner_lock);
+    RWLock::WLocker md_locker(ictx->md_lock);
+    r = ictx->flush();
+    if (r < 0) {
+      return r;
+    }
+
     if ((features & RBD_FEATURES_MUTABLE) != features) {
       lderr(cct) << "cannot update immutable features" << dendl;
       return -EINVAL;
@@ -1544,13 +1856,17 @@ reprotect_and_return_err:
       return -EINVAL;
     }
 
-    RWLock::RLocker l(ictx->snap_lock);
-    uint64_t new_features = ictx->features | features;
-    if (!enabled) {
+    RWLock::WLocker snap_locker(ictx->snap_lock);
+    uint64_t new_features;
+    if (enabled) {
+      features &= ~ictx->features;
+      new_features = ictx->features | features;
+    } else {
+      features &= ictx->features;
       new_features = ictx->features & ~features;
     }
 
-    if (ictx->features == new_features) {
+    if (features == 0) {
       return 0;
     }
 
@@ -1575,6 +1891,20 @@ reprotect_and_return_err:
         enable_flags |= RBD_FLAG_FAST_DIFF_INVALID;
         features_mask |= (RBD_FEATURE_OBJECT_MAP | RBD_FEATURE_EXCLUSIVE_LOCK);
       }
+      if ((features & RBD_FEATURE_JOURNALING) != 0) {
+        if ((new_features & RBD_FEATURE_EXCLUSIVE_LOCK) == 0) {
+          lderr(cct) << "cannot enable journaling: exclusive lock required"
+                     << dendl;
+          return -EINVAL;
+        }
+        features_mask |= RBD_FEATURE_EXCLUSIVE_LOCK;
+
+        r = Journal::create(ictx->md_ctx, ictx->id);
+        if (r < 0) {
+          lderr(cct) << "error creating image journal: " << cpp_strerror(r)
+                     << dendl;
+          return r;
+        }
+      }
 
       if (enable_flags != 0) {
         r = update_all_flags(ictx, enable_flags, enable_flags);
@@ -1584,7 +1914,8 @@ reprotect_and_return_err:
       }
     } else {
       if ((features & RBD_FEATURE_EXCLUSIVE_LOCK) != 0) {
-        if ((new_features & RBD_FEATURE_OBJECT_MAP) != 0) {
+        if ((new_features & RBD_FEATURE_OBJECT_MAP) != 0 ||
+            (new_features & RBD_FEATURE_JOURNALING) != 0) {
           lderr(cct) << "cannot disable exclusive lock" << dendl;
           return -EINVAL;
         }
@@ -1606,6 +1937,14 @@ reprotect_and_return_err:
       if ((features & RBD_FEATURE_FAST_DIFF) != 0) {
         disable_flags = RBD_FLAG_FAST_DIFF_INVALID;
       }
+      if ((features & RBD_FEATURE_JOURNALING) != 0) {
+        r = Journal::remove(ictx->md_ctx, ictx->id);
+        if (r < 0) {
+          lderr(cct) << "error removing image journal: " << cpp_strerror(r)
+                     << dendl;
+          return r;
+        }
+      }
     }
 
     ldout(cct, 10) << "update_features: features=" << new_features << ", mask="
@@ -1615,6 +1954,15 @@ reprotect_and_return_err:
     if (r < 0) {
       lderr(cct) << "failed to update features: " << cpp_strerror(r)
                  << dendl;
+      return r;
+    }
+    if (((ictx->features & RBD_FEATURE_OBJECT_MAP) == 0) &&
+        ((features & RBD_FEATURE_OBJECT_MAP) != 0)) {
+      r = create_object_map(ictx);
+      if (r < 0) {
+        lderr(cct) << "failed to create object map: " << cpp_strerror(r)
+                   << dendl;
+        return r;
+      }
     }
 
     if (disable_flags != 0) {
@@ -1694,8 +2042,7 @@ reprotect_and_return_err:
       lderr(ictx->cct) << "parent snapshot does not exist" << dendl;
       ictx->parent->snap_lock.put_write();
       ictx->parent->cache_lock.Unlock();
-      close_image(ictx->parent);
-      ictx->parent = NULL;
+      close_parent(ictx);
       return r;
     }
     ictx->parent->snap_set(ictx->parent->snap_name);
@@ -1708,8 +2055,7 @@ reprotect_and_return_err:
       ictx->parent->parent_lock.put_write();
       ictx->parent->snap_lock.put_write();
       ictx->parent->cache_lock.Unlock();
-      close_image(ictx->parent);
-      ictx->parent = NULL;
+      close_parent(ictx);
       return r;
     }
     ictx->parent->parent_lock.put_write();
@@ -1890,9 +2236,16 @@ reprotect_and_return_err:
       }
     }
     if (!old_format) {
-      r = io_ctx.remove(ObjectMap::object_map_name(id, CEPH_NOSNAP));
+      r = Journal::remove(io_ctx, id);
+      if (r < 0 && r != -ENOENT) {
+        lderr(cct) << "error removing image journal" << dendl;
+        return r;
+      }
+
+      r = ObjectMap::remove(io_ctx, id);
       if (r < 0 && r != -ENOENT) {
 	lderr(cct) << "error removing image object map" << dendl;
+        return r;
       }
 
       ldout(cct, 2) << "removing id object..." << dendl;
@@ -1962,7 +2315,7 @@ reprotect_and_return_err:
 		   << size << dendl;
     ictx->snap_lock.put_read();
 
-    int r = ictx_check(ictx, true);
+    int r = ictx_check(ictx, ictx->owner_lock);
     if (r < 0) {
       return r;
     }
@@ -1994,7 +2347,6 @@ reprotect_and_return_err:
     int r = ictx_check(ictx);
     if (r < 0)
       return r;
-    bufferlist bl, bl2;
 
     RWLock::RLocker l(ictx->snap_lock);
     for (map<snap_t, SnapInfo>::iterator it = ictx->snap_info.begin();
@@ -2102,29 +2454,100 @@ reprotect_and_return_err:
 
     return 0;
   }
-
-  int ictx_check(ImageCtx *ictx, bool owner_locked)
+
+  int rename_snap(ImageCtx *ictx, uint64_t src_snap_id, const char *dst_name)
   {
-    CephContext *cct = ictx->cct;
-    ldout(cct, 20) << "ictx_check " << ictx << dendl;
-
-    ictx->refresh_lock.Lock();
-    bool needs_refresh = ictx->last_refresh != ictx->refresh_seq;
-    ictx->refresh_lock.Unlock();
+    assert(ictx->owner_lock.is_locked());
 
-    if (needs_refresh) {
-      int r;
-      if (owner_locked) {
-        r = ictx_refresh(ictx);
-      } else {
-        RWLock::RLocker owner_lock(ictx->owner_lock);
-        r = ictx_refresh(ictx);
+    int r;
+    map<snap_t, SnapInfo>::iterator it;
+    {
+      RWLock::RLocker snap_locker(ictx->snap_lock);
+      it = ictx->snap_info.find(src_snap_id);
+      if (it == ictx->snap_info.end()) {
+        ldout(ictx->cct, 20) << __func__ << " cannot find snap with snap id "
+                             << src_snap_id << dendl;
+        return -ENOENT;
       }
-      if (r < 0) {
-	lderr(cct) << "Error re-reading rbd header: " << cpp_strerror(-r)
+    }
+    bool lock_owner = ictx->image_watcher->is_lock_owner();
+    if (ictx->image_watcher->is_lock_supported()) {
+      assert(lock_owner);
+    }
+
+    if (ictx->old_format) {
+      r = cls_client::old_snapshot_rename(&ictx->md_ctx, ictx->header_oid,
+				       src_snap_id, dst_name);
+    } else {
+      librados::ObjectWriteOperation op;
+      if (lock_owner) {
+	ictx->image_watcher->assert_header_locked(&op);
+      }
+      cls_client::snapshot_rename(&op, src_snap_id, dst_name);
+      r = ictx->md_ctx.operate(ictx->header_oid, &op);
+    }
+
+    if (r < 0) {
+      lderr(ictx->cct) << "failed to rename snapshot: "
+		       << cpp_strerror(r) << dendl;
+      return r;
+    }
+
+    RWLock::WLocker snap_locker(ictx->snap_lock);
+    if (!ictx->old_format) {
+      if (lock_owner) {
+        it = ictx->snap_info.find(src_snap_id);
+        if (it == ictx->snap_info.end())
+          return -ENOENT;
+        ictx->snap_ids.erase(it->second.name);
+        it->second.name = dst_name;
+        ictx->snap_ids.insert(make_pair(dst_name,it->first));
+        if (ictx->snap_id == src_snap_id)
+          ictx->snap_name = it->second.name;
+      }
+    }
+    return 0;
+  }
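
One construct in rename_snap() deserves a caution (corrected above):
writing RWLock::RLocker(ictx->snap_lock); without a variable name
constructs a temporary guard that unlocks again at the end of the same
statement, so the lookup that follows ran unprotected. The guard must
be named so it lives to the end of the block. A minimal standalone
illustration of the pitfall with std::mutex:

    #include <mutex>

    struct Ctx { std::mutex snap_lock; };

    void broken(Ctx *ictx) {
      // temporary guard: locked and unlocked within this one statement
      std::lock_guard<std::mutex>(ictx->snap_lock);
      // ...the "critical section" here runs without the lock held...
    }

    void fixed(Ctx *ictx) {
      // named guard: held until the end of the enclosing scope
      std::lock_guard<std::mutex> locker(ictx->snap_lock);
      // ...this critical section really is protected...
    }

    int main() {
      Ctx ctx;
      broken(&ctx);
      fixed(&ctx);
      return 0;
    }
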
+
+  int ictx_check(ImageCtx *ictx) {
+    RWLock::RLocker owner_locker(ictx->owner_lock);
+    return ictx_check(ictx, ictx->owner_lock);
+  }
+
+  int ictx_check(ImageCtx *ictx, const RWLock &owner_lock)
+  {
+    assert(ictx->owner_lock.is_locked());
+    CephContext *cct = ictx->cct;
+    ldout(cct, 20) << "ictx_check " << ictx << dendl;
+
+    bool needs_refresh = false;
+    int refresh_seq = 0;
+    {
+      Mutex::Locker refresh_locker(ictx->refresh_lock);
+      while (ictx->refresh_in_progress) {
+        ictx->refresh_cond.Wait(ictx->refresh_lock);
+      }
+
+      if (ictx->last_refresh != ictx->refresh_seq) {
+        ictx->refresh_in_progress = true;
+        needs_refresh = true;
+        refresh_seq = ictx->refresh_seq;
+      }
+    }
+
+    if (needs_refresh) {
+      int r = ictx_refresh(ictx);
+
+      Mutex::Locker refresh_locker(ictx->refresh_lock);
+      ictx->refresh_in_progress = false;
+      ictx->refresh_cond.Signal();
+
+      if (r < 0) {
+	lderr(cct) << "Error re-reading rbd header: " << cpp_strerror(-r)
 		   << dendl;
-	return r;
+        return r;
       }
+      ictx->last_refresh = refresh_seq;
     }
     return 0;
   }
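
The reworked ictx_check() serializes refreshes: refresh_in_progress
plus a condition variable makes concurrent callers wait for the
in-flight refresh rather than issuing their own, and last_refresh only
advances to the sequence number sampled before refreshing, so a
refresh_seq bump that races with the refresh still forces another pass.
A standalone model of the gate (std primitives standing in for
Mutex/Cond; illustrative only):

    #include <condition_variable>
    #include <iostream>
    #include <mutex>
    #include <thread>
    #include <vector>

    std::mutex refresh_lock;
    std::condition_variable refresh_cond;
    bool refresh_in_progress = false;
    int refresh_seq = 1, last_refresh = 0;

    void check() {
      bool needs_refresh = false;
      int seen_seq = 0;
      {
        std::unique_lock<std::mutex> l(refresh_lock);
        refresh_cond.wait(l, [] { return !refresh_in_progress; });
        if (last_refresh != refresh_seq) {
          refresh_in_progress = true;
          needs_refresh = true;
          seen_seq = refresh_seq;  // sample before dropping the lock
        }
      }
      if (needs_refresh) {
        // ...perform the actual refresh outside the lock...
        std::lock_guard<std::mutex> l(refresh_lock);
        last_refresh = seen_seq;   // only what was observed beforehand
        refresh_in_progress = false;
        refresh_cond.notify_all();
        std::cout << "refreshed to seq " << seen_seq << std::endl;
      }
    }

    int main() {
      std::vector<std::thread> threads;
      for (int i = 0; i < 4; ++i)
        threads.emplace_back(check);
      for (auto &t : threads)
        t.join();
      return 0;
    }
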
@@ -2149,8 +2572,7 @@ reprotect_and_return_err:
 	  ictx->parent->id != ictx->get_parent_image_id(ictx->snap_id) ||
 	  ictx->parent->snap_id != ictx->get_parent_snap_id(ictx->snap_id)) {
 	ictx->clear_nonexistence_cache();
-	close_image(ictx->parent);
-	ictx->parent = NULL;
+	close_parent(ictx);
       }
     }
 
@@ -2172,14 +2594,9 @@ reprotect_and_return_err:
     RWLock::WLocker md_locker(ictx->md_lock);
 
     CephContext *cct = ictx->cct;
-    bufferlist bl, bl2;
 
     ldout(cct, 20) << "ictx_refresh " << ictx << dendl;
 
-    ictx->refresh_lock.Lock();
-    int refresh_seq = ictx->refresh_seq;
-    ictx->refresh_lock.Unlock();
-
     ::SnapContext new_snapc;
     bool new_snap = false;
     vector<string> snap_names;
@@ -2190,6 +2607,7 @@ reprotect_and_return_err:
     {
       Mutex::Locker cache_locker(ictx->cache_lock);
       RWLock::WLocker snap_locker(ictx->snap_lock);
+
       {
 	int r;
 	RWLock::WLocker parent_locker(ictx->parent_lock);
@@ -2343,16 +2761,26 @@ reprotect_and_return_err:
       ictx->object_map.refresh(ictx->snap_id);
 
       ictx->data_ctx.selfmanaged_snap_set_write_ctx(ictx->snapc.seq, ictx->snaps);
+
+      // dynamically enable/disable journaling support
+      if ((ictx->features & RBD_FEATURE_JOURNALING) != 0 &&
+          ictx->image_watcher != NULL && ictx->journal == NULL &&
+          ictx->snap_name.empty()) {
+        ictx->open_journal();
+      } else if ((ictx->features & RBD_FEATURE_JOURNALING) == 0 &&
+                 ictx->journal != NULL) {
+        // TODO journal needs to be disabled via proxied request to avoid race
+        //      between deleting journal and appending journal events
+      }
     } // release snap_lock and cache_lock
 
-    if (new_snap) {
-      _flush(ictx);
+    if (ictx->image_watcher != NULL) {
+      ictx->image_watcher->refresh();
     }
 
-    ictx->refresh_lock.Lock();
-    ictx->last_refresh = refresh_seq;
-    ictx->refresh_lock.Unlock();
-
+    if (new_snap) {
+      ictx->flush();
+    }
     return 0;
   }
 
@@ -2406,7 +2834,6 @@ reprotect_and_return_err:
       // writes might create new snapshots. Rolling back will replace
       // the current version, so we have to invalidate that too.
       RWLock::WLocker md_locker(ictx->md_lock);
-      ictx->flush_async_operations();
       r = ictx->invalidate_cache();
       if (r < 0) {
 	return r;
@@ -2447,37 +2874,41 @@ reprotect_and_return_err:
     ProgressContext &prog_ctx;
   };
 
-  int do_copy_extent(uint64_t offset, size_t len, const char *buf, void *data)
-  {
-    CopyProgressCtx *cp = reinterpret_cast<CopyProgressCtx*>(data);
-    cp->prog_ctx.update_progress(offset, cp->src_size);
-    int ret = 0;
-    if (buf) {
-      ret = write(cp->destictx, offset, len, buf, 0);
-    }
-    return ret;
-  }
-
   int copy(ImageCtx *src, IoCtx& dest_md_ctx, const char *destname,
-	   ProgressContext &prog_ctx)
+	   ImageOptions& opts, ProgressContext &prog_ctx)
   {
     CephContext *cct = (CephContext *)dest_md_ctx.cct();
     ldout(cct, 20) << "copy " << src->name
 		   << (src->snap_name.length() ? "@" + src->snap_name : "")
 		   << " -> " << destname << dendl;
-    int order = src->order;
 
     src->snap_lock.get_read();
-    uint64_t src_features = src->features;
+    uint64_t features = src->features;
     uint64_t src_size = src->get_image_size(src->snap_id);
     src->snap_lock.put_read();
+    uint64_t stripe_unit = src->stripe_unit;
+    uint64_t stripe_count = src->stripe_count;
+    opts.get(RBD_IMAGE_OPTION_FEATURES, &features);
+    opts.get(RBD_IMAGE_OPTION_STRIPE_UNIT, &stripe_unit);
+    opts.get(RBD_IMAGE_OPTION_STRIPE_COUNT, &stripe_count);
+    int order = src->order;
+    uint64_t opt_order = 0;
+    if (opts.get(RBD_IMAGE_OPTION_ORDER, &opt_order) == 0) {
+      order = opt_order;
+    }
+
+    if (features & ~RBD_FEATURES_ALL) {
+      lderr(cct) << "librbd does not support requested features" << dendl;
+      return -ENOSYS;
+    }
 
     int r = create(dest_md_ctx, destname, src_size, src->old_format,
-		   src_features, &order, src->stripe_unit, src->stripe_count);
+		   features, &order, stripe_unit, stripe_count);
     if (r < 0) {
       lderr(cct) << "header creation failed" << dendl;
       return r;
     }
+    opts.set(RBD_IMAGE_OPTION_ORDER, static_cast<uint64_t>(order));
 
     ImageCtx *dest = new librbd::ImageCtx(destname, "", NULL,
 					  dest_md_ctx, false);
@@ -2533,8 +2964,13 @@ reprotect_and_return_err:
 
       Context *ctx = new C_CopyWrite(m_throttle, m_bl);
       AioCompletion *comp = aio_create_completion_internal(ctx, rbd_ctx_cb);
-      aio_write(m_dest, m_offset, m_bl->length(), m_bl->c_str(), comp, LIBRADOS_OP_FLAG_FADVISE_DONTNEED);
+
+      // coordinate through AIO WQ to ensure lock is acquired if needed
+      m_dest->aio_work_queue->aio_write(comp, m_offset, m_bl->length(),
+                                        m_bl->c_str(),
+                                        LIBRADOS_OP_FLAG_FADVISE_DONTNEED);
     }
+
   private:
     SimpleThrottle *m_throttle;
     ImageCtx *m_dest;
@@ -2573,6 +3009,7 @@ reprotect_and_return_err:
       }
     }
 
+    RWLock::RLocker owner_lock(src->owner_lock);
     SimpleThrottle throttle(src->concurrent_management_ops, false);
     uint64_t period = src->get_stripe_period();
     unsigned fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL | LIBRADOS_OP_FLAG_FADVISE_NOCACHE;
@@ -2585,7 +3022,8 @@ reprotect_and_return_err:
       bufferlist *bl = new bufferlist();
       Context *ctx = new C_CopyRead(&throttle, dest, offset, bl);
       AioCompletion *comp = aio_create_completion_internal(ctx, rbd_ctx_cb);
-      aio_read(src, offset, len, NULL, bl, comp, fadvise_flags);
+      AioImageRequest::aio_read(src, comp, offset, len, NULL, bl,
+                                fadvise_flags);
       prog_ctx.update_progress(offset, src_size);
     }
 
@@ -2627,45 +3065,54 @@ reprotect_and_return_err:
     // snapshot and the user is trying to fix that
     ictx_check(ictx);
 
-    bool unlocking = false;
-    {
-      RWLock::WLocker l(ictx->owner_lock);
-      if (ictx->image_watcher != NULL && ictx->image_watcher->is_lock_owner() &&
-          snap_name != NULL && strlen(snap_name) != 0) {
-        // stop incoming requests since we will release the lock
-        ictx->image_watcher->prepare_unlock();
-        unlocking = true;
+    int r;
+    bool snapshot_mode = (snap_name != NULL && strlen(snap_name) != 0);
+    if (snapshot_mode) {
+      {
+        RWLock::WLocker owner_locker(ictx->owner_lock);
+        if (ictx->image_watcher != NULL &&
+            ictx->image_watcher->is_lock_owner()) {
+          r = ictx->image_watcher->release_lock();
+          if (r < 0) {
+            return r;
+          }
+        }
       }
-    }
 
-    ictx->cancel_async_requests();
-    ictx->flush_async_operations();
-    if (ictx->object_cacher) {
-      // complete pending writes before we're set to a snapshot and
-      // get -EROFS for writes
-      RWLock::RLocker owner_locker(ictx->owner_lock);
-      RWLock::WLocker md_locker(ictx->md_lock);
-      ictx->flush_cache();
+      ictx->cancel_async_requests();
+      {
+        RWLock::RLocker owner_locker(ictx->owner_lock);
+        r = ictx->flush();
+      }
+
+      {
+        RWLock::WLocker snap_locker(ictx->snap_lock);
+        if (ictx->journal != NULL) {
+          r = ictx->close_journal(false);
+          if (r < 0) {
+            return r;
+          }
+        }
+      }
     }
-    int r = _snap_set(ictx, snap_name);
+
+    r = _snap_set(ictx, snap_name);
     if (r < 0) {
-      RWLock::WLocker l(ictx->owner_lock);
-      if (unlocking) {
-        ictx->image_watcher->cancel_unlock();
-      }
       return r;
     }
 
-    RWLock::WLocker l(ictx->owner_lock);
-    if (ictx->image_watcher != NULL) {
-      if (unlocking) {
-	r = ictx->image_watcher->unlock();
-	if (r < 0) {
-	  lderr(ictx->cct) << "error unlocking image: " << cpp_strerror(r)
-                           << dendl;
-	}
+    {
+      RWLock::WLocker snap_locker(ictx->snap_lock);
+      if ((ictx->features & RBD_FEATURE_JOURNALING) != 0 &&
+          ictx->journal == NULL && !snapshot_mode) {
+        ictx->open_journal();
       }
     }
+
+    RWLock::RLocker owner_locker(ictx->owner_lock);
+    if (ictx->image_watcher != NULL) {
+      ictx->image_watcher->refresh();
+    }
     return r;
   }
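
The snap_set() rework above reflects that journaling is only active
against HEAD: switching to a snapshot first releases the exclusive
lock, cancels and flushes outstanding I/O, and closes the journal,
while switching back to HEAD on a journaling-enabled image reopens it;
the trailing ImageWatcher::refresh() lets the watcher re-evaluate its
state under the new snapshot context.
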
 
@@ -2699,6 +3146,11 @@ reprotect_and_return_err:
     if ((r = _snap_set(ictx, ictx->snap_name.c_str())) < 0)
       goto err_close;
 
+    if (ictx->image_watcher != NULL) {
+      RWLock::RLocker owner_locker(ictx->owner_lock);
+      ictx->image_watcher->refresh();
+    }
+
     return 0;
 
   err_close:
@@ -2710,28 +3162,40 @@ reprotect_and_return_err:
   {
     ldout(ictx->cct, 20) << "close_image " << ictx << dendl;
 
+    if (!ictx->read_only) {
+      // finish all incoming IO operations
+      ictx->aio_work_queue->drain();
+    }
+
+    int r = 0;
     {
-      RWLock::WLocker l(ictx->owner_lock);
+      // release the lock (and flush all in-flight IO)
+      RWLock::WLocker owner_locker(ictx->owner_lock);
       if (ictx->image_watcher != NULL && ictx->image_watcher->is_lock_owner()) {
-        // stop incoming requests
-        ictx->image_watcher->prepare_unlock();
+        r = ictx->image_watcher->release_lock();
+        if (r < 0) {
+          lderr(ictx->cct) << "error releasing image lock: " << cpp_strerror(r)
+                           << dendl;
+        }
       }
     }
 
-    ictx->aio_work_queue->drain();
+    assert(!ictx->aio_work_queue->writes_blocked() ||
+           ictx->aio_work_queue->writes_empty());
+
     ictx->cancel_async_requests();
     ictx->flush_async_operations();
     ictx->readahead.wait_for_pending();
 
-    int r;
     if (ictx->object_cacher) {
-      r = ictx->shutdown_cache(); // implicitly flushes
-    } else {
-      r = flush(ictx);
-    }
-    if (r < 0) {
-      lderr(ictx->cct) << "error flushing IO: " << cpp_strerror(r)
-                       << dendl;
+      int flush_r = ictx->shutdown_cache(); // implicitly flushes
+      if (flush_r < 0) {
+        lderr(ictx->cct) << "error flushing IO: " << cpp_strerror(flush_r)
+                         << dendl;
+        if (r == 0) {
+          r = flush_r;
+        }
+      }
     }
 
     ictx->op_work_queue->drain();
@@ -2741,28 +3205,22 @@ reprotect_and_return_err:
       ictx->copyup_finisher->stop();
     }
 
+    if (ictx->journal != NULL) {
+      int close_r = ictx->close_journal(true);
+      if (close_r < 0 && r == 0) {
+        r = close_r;
+      }
+    }
+
     if (ictx->parent) {
-      int close_r = close_image(ictx->parent);
+      RWLock::WLocker parent_locker(ictx->parent_lock);
+      int close_r = close_parent(ictx);
       if (r == 0 && close_r < 0) {
         r = close_r;
       }
-      ictx->parent = NULL;
     }
 
     if (ictx->image_watcher) {
-      {
-	RWLock::WLocker l(ictx->owner_lock);
-	if (ictx->image_watcher->is_lock_owner()) {
-	  int unlock_r = ictx->image_watcher->unlock();
-	  if (unlock_r < 0) {
-	    lderr(ictx->cct) << "error unlocking image: "
-                             << cpp_strerror(unlock_r) << dendl;
-            if (r == 0) {
-              r = unlock_r;
-            }
-	  }
-	}
-      }
       ictx->unregister_watch();
     }
 
@@ -2770,6 +3228,28 @@ reprotect_and_return_err:
     return r;
   }
 
+  int close_parent(ImageCtx *ictx)
+  {
+    assert(ictx->parent_lock.is_wlocked());
+    ImageCtx *parent_ictx = ictx->parent;
+
+    // AIO to the parent must be complete before closing
+    parent_ictx->flush_async_operations();
+    parent_ictx->readahead.wait_for_pending();
+    {
+      Mutex::Locker async_ops_locker(parent_ictx->async_ops_lock);
+      assert(parent_ictx->async_ops.empty());
+    }
+
+    // attempting to drain the work queues might result in deadlock
+    assert(parent_ictx->aio_work_queue->empty());
+    assert(parent_ictx->op_work_queue->empty());
+
+    int r = close_image(parent_ictx);
+    ictx->parent = NULL;
+    return r;
+  }
+
   // 'flatten' child image by copying all parent's blocks
   int flatten(ImageCtx *ictx, ProgressContext &prog_ctx)
   {
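
Note that close_image() now runs every teardown step even after a failure, remembering only the first error for the return value. The pattern in isolation; the step functions below are hypothetical stand-ins for release_lock(), shutdown_cache(), close_journal() and close_parent():

    #include <initializer_list>
    #include <iostream>

    static int release_lock_step()  { return 0; }    // succeeds
    static int flush_cache_step()   { return -5; }   // first failure: -EIO
    static int close_journal_step() { return -22; }  // later failure is dropped

    int main() {
      int r = 0;
      for (int (*step)() : {release_lock_step, flush_cache_step,
                            close_journal_step}) {
        int step_r = step();
        if (step_r < 0 && r == 0)
          r = step_r;            // keep the first error, keep tearing down
      }
      std::cout << "close would return " << r << std::endl;  // prints -5
      return 0;
    }
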
@@ -2821,7 +3301,7 @@ reprotect_and_return_err:
 
     int r;
     // ictx_check also updates parent data
-    if ((r = ictx_check(ictx, true)) < 0) {
+    if ((r = ictx_check(ictx, ictx->owner_lock)) < 0) {
       lderr(cct) << "ictx_check failed" << dendl;
       return r;
     }
@@ -2844,7 +3324,7 @@ reprotect_and_return_err:
 	lderr(cct) << "image has no parent" << dendl;
 	return -EINVAL;
       }
-      if (ictx->snap_id != CEPH_NOSNAP || ictx->read_only) {
+      if (ictx->snap_id != CEPH_NOSNAP) {
 	lderr(cct) << "snapshots cannot be flattened" << dendl;
 	return -EROFS;
       }
@@ -2906,7 +3386,7 @@ reprotect_and_return_err:
       return -EINVAL;
     }
 
-    int r = ictx_check(ictx, true);
+    int r = ictx_check(ictx, ictx->owner_lock);
     if (r < 0) {
       return r;
     }
@@ -3089,6 +3569,7 @@ reprotect_and_return_err:
     uint64_t period = ictx->get_stripe_period();
     uint64_t left = mylen;
 
+    RWLock::RLocker owner_locker(ictx->owner_lock);
     start_time = ceph_clock_now(ictx->cct);
     while (left > 0) {
       uint64_t period_off = off - (off % period);
@@ -3103,7 +3584,7 @@ reprotect_and_return_err:
 
       Context *ctx = new C_SafeCond(&mylock, &cond, &done, &ret);
       AioCompletion *c = aio_create_completion_internal(ctx, rbd_ctx_cb);
-      aio_read(ictx, off, read_len, NULL, &bl, c, 0);
+      AioImageRequest::aio_read(ictx, c, off, read_len, NULL, &bl, 0);
 
       mylock.Lock();
       while (!done)
@@ -3139,7 +3620,7 @@ reprotect_and_return_err:
     // ensure previous writes are visible to listsnaps
     {
       RWLock::RLocker owner_locker(ictx->owner_lock);
-      _flush(ictx);
+      ictx->flush();
     }
 
     int r = ictx_check(ictx);
@@ -3160,184 +3641,9 @@ reprotect_and_return_err:
     return r;
   }
 
-  int simple_read_cb(uint64_t ofs, size_t len, const char *buf, void *arg)
-  {
-    char *dest_buf = (char *)arg;
-    if (buf)
-      memcpy(dest_buf + ofs, buf, len);
-    else
-      memset(dest_buf + ofs, 0, len);
-
-    return 0;
-  }
-
-  ssize_t read(ImageCtx *ictx, uint64_t ofs, size_t len, char *buf, int op_flags)
-  {
-    ssize_t ret;
-    ldout(ictx->cct, 20) << "read " << ictx << " off = " << ofs << " len = "
-			 << len << dendl;
-
-    vector<pair<uint64_t,uint64_t> > extents;
-    extents.push_back(make_pair(ofs, len));
-    ret = read(ictx, extents, buf, NULL, op_flags);
-    if (ret < 0)
-      return ret;
-
-    return ret;
-  }
-
-  ssize_t read(ImageCtx *ictx, const vector<pair<uint64_t,uint64_t> >& image_extents,
-		char *buf, bufferlist *pbl, int op_flags)
-  {
-    Mutex mylock("librbd::read::mylock");
-    Cond cond;
-    bool done;
-    int ret;
-
-    Context *ctx = new C_SafeCond(&mylock, &cond, &done, &ret);
-    AioCompletion *c = aio_create_completion_internal(ctx, rbd_ctx_cb);
-    aio_read(ictx, image_extents, buf, pbl, c, op_flags);
-
-    mylock.Lock();
-    while (!done)
-      cond.Wait(mylock);
-    mylock.Unlock();
-
-    return ret;
-  }
-
-  ssize_t write(ImageCtx *ictx, uint64_t off, size_t len, const char *buf, int op_flags)
-  {
-    ldout(ictx->cct, 20) << "write " << ictx << " off = " << off << " len = "
-			 << len << dendl;
-
-    Mutex mylock("librbd::write::mylock");
-    Cond cond;
-    bool done;
-    int ret;
-
-    uint64_t mylen = len;
-    ictx->snap_lock.get_read();
-    int r = clip_io(ictx, off, &mylen);
-    ictx->snap_lock.put_read();
-    if (r < 0) {
-      return r;
-    }
-
-    Context *ctx = new C_SafeCond(&mylock, &cond, &done, &ret);
-    AioCompletion *c = aio_create_completion_internal(ctx, rbd_ctx_cb);
-    aio_write(ictx, off, mylen, buf, c, op_flags);
-
-    mylock.Lock();
-    while (!done)
-      cond.Wait(mylock);
-    mylock.Unlock();
-
-    if (ret < 0) {
-      return ret;
-    }
-
-    return mylen;
-  }
-
-  int discard(ImageCtx *ictx, uint64_t off, uint64_t len)
-  {
-    ldout(ictx->cct, 20) << "discard " << ictx << " off = " << off << " len = "
-			 << len << dendl;
-
-    Mutex mylock("librbd::discard::mylock");
-    Cond cond;
-    bool done;
-    int ret;
-
-    uint64_t mylen = len;
-    ictx->snap_lock.get_read();
-    int r = clip_io(ictx, off, &mylen);
-    ictx->snap_lock.put_read();
-    if (r < 0) {
-      return r;
-    }
-
-    Context *ctx = new C_SafeCond(&mylock, &cond, &done, &ret);
-    AioCompletion *c = aio_create_completion_internal(ctx, rbd_ctx_cb);
-    aio_discard(ictx, off, mylen, c);
-
-    mylock.Lock();
-    while (!done)
-      cond.Wait(mylock);
-    mylock.Unlock();
-
-    if (ret < 0) {
-      return ret;
-    }
-
-    return mylen;
-  }
-
-  ssize_t handle_sparse_read(CephContext *cct,
-			     bufferlist data_bl,
-			     uint64_t block_ofs,
-			     const map<uint64_t, uint64_t> &data_map,
-			     uint64_t buf_ofs,   // offset into buffer
-			     size_t buf_len,     // length in buffer (not size of buffer!)
-			     char *dest_buf)
-  {
-    uint64_t bl_ofs = 0;
-    size_t buf_left = buf_len;
-
-    for (map<uint64_t, uint64_t>::const_iterator iter = data_map.begin();
-	 iter != data_map.end();
-	 ++iter) {
-      uint64_t extent_ofs = iter->first;
-      size_t extent_len = iter->second;
-
-      ldout(cct, 10) << "extent_ofs=" << extent_ofs
-		     << " extent_len=" << extent_len << dendl;
-      ldout(cct, 10) << "block_ofs=" << block_ofs << dendl;
-
-      /* a hole? */
-      if (extent_ofs > block_ofs) {
-	uint64_t gap = extent_ofs - block_ofs;
-	ldout(cct, 10) << "<1>zeroing " << buf_ofs << "~" << gap << dendl;
-	memset(dest_buf + buf_ofs, 0, gap);
-
-	buf_ofs += gap;
-	buf_left -= gap;
-	block_ofs = extent_ofs;
-      } else if (extent_ofs < block_ofs) {
-	assert(0 == "osd returned data prior to what we asked for");
-	return -EIO;
-      }
-
-      if (bl_ofs + extent_len > (buf_ofs + buf_left)) {
-	assert(0 == "osd returned more data than we asked for");
-	return -EIO;
-      }
-
-      /* data */
-      ldout(cct, 10) << "<2>copying " << buf_ofs << "~" << extent_len
-		     << " from ofs=" << bl_ofs << dendl;
-      memcpy(dest_buf + buf_ofs, data_bl.c_str() + bl_ofs, extent_len);
-
-      bl_ofs += extent_len;
-      buf_ofs += extent_len;
-      assert(buf_left >= extent_len);
-      buf_left -= extent_len;
-      block_ofs += extent_len;
-    }
-
-    /* last hole */
-    if (buf_left > 0) {
-      ldout(cct, 10) << "<3>zeroing " << buf_ofs << "~" << buf_left << dendl;
-      memset(dest_buf + buf_ofs, 0, buf_left);
-    }
-
-    return buf_len;
-  }
-
   void rados_req_cb(rados_completion_t c, void *arg)
   {
-    AioRequest *req = reinterpret_cast<AioRequest *>(arg);
+    AioObjectRequest *req = reinterpret_cast<AioObjectRequest *>(arg);
     req->complete(rados_aio_get_return_value(c));
   }
 
@@ -3372,41 +3678,6 @@ reprotect_and_return_err:
     return 0;
   }
 
-  void aio_flush(ImageCtx *ictx, AioCompletion *c)
-  {
-    CephContext *cct = ictx->cct;
-    ldout(cct, 20) << "aio_flush " << ictx << " completion " << c <<  dendl;
-
-    c->get();
-    int r = ictx_check(ictx);
-    if (r < 0) {
-      c->fail(cct, r);
-      return;
-    }
-
-    RWLock::RLocker owner_locker(ictx->owner_lock);
-    ictx->user_flushed();
-
-    C_AioWrite *flush_ctx = new C_AioWrite(cct, c);
-    c->add_request();
-    ictx->flush_async_operations(flush_ctx);
-
-    c->start_op(ictx, AIO_TYPE_FLUSH);
-    C_AioWrite *req_comp = new C_AioWrite(cct, c);
-    c->add_request();
-    if (ictx->object_cacher) {
-      ictx->flush_cache_aio(req_comp);
-    } else {
-      librados::AioCompletion *rados_completion =
-	librados::Rados::aio_create_completion(req_comp, NULL, rados_ctx_cb);
-      ictx->data_ctx.aio_flush_async(rados_completion);
-      rados_completion->release();
-    }
-    c->finish_adding_requests(cct);
-    c->put();
-    ictx->perfcounter->inc(l_librbd_aio_flush);
-  }
-
   int flush(ImageCtx *ictx)
   {
     CephContext *cct = ictx->cct;
@@ -3420,31 +3691,12 @@ reprotect_and_return_err:
     ictx->user_flushed();
     {
       RWLock::RLocker owner_locker(ictx->owner_lock);
-      r = _flush(ictx);
+      r = ictx->flush();
     }
     ictx->perfcounter->inc(l_librbd_flush);
     return r;
   }
 
-  int _flush(ImageCtx *ictx)
-  {
-    assert(ictx->owner_lock.is_locked());
-    CephContext *cct = ictx->cct;
-    int r;
-    // flush any outstanding writes
-    if (ictx->object_cacher) {
-      r = ictx->flush_cache();
-    } else {
-      r = ictx->data_ctx.aio_flush();
-      ictx->flush_async_operations();
-    }
-
-    if (r)
-      lderr(cct) << "_flush " << ictx << " r = " << r << dendl;
-
-    return r;
-  }
-
   int invalidate_cache(ImageCtx *ictx)
   {
     CephContext *cct = ictx->cct;
@@ -3455,99 +3707,13 @@ reprotect_and_return_err:
       return r;
     }
 
-    ictx->flush_async_operations();
-
     RWLock::RLocker owner_locker(ictx->owner_lock);
     RWLock::WLocker md_locker(ictx->md_lock);
     r = ictx->invalidate_cache();
+    ictx->perfcounter->inc(l_librbd_invalidate_cache);
     return r;
   }
 
-  void aio_write(ImageCtx *ictx, uint64_t off, size_t len, const char *buf,
-		 AioCompletion *c, int op_flags)
-  {
-    CephContext *cct = ictx->cct;
-    ldout(cct, 20) << "aio_write " << ictx << " off = " << off << " len = "
-		   << len << " buf = " << (void*)buf << dendl;
-
-    c->get();
-    int r = ictx_check(ictx);
-    if (r < 0) {
-      c->fail(cct, r);
-      return;
-    }
-
-    RWLock::RLocker owner_locker(ictx->owner_lock);
-    RWLock::RLocker md_locker(ictx->md_lock);
-
-    uint64_t clip_len = len;
-    ::SnapContext snapc;
-    {
-      // prevent image size from changing between computing clip and recording
-      // pending async operation
-      RWLock::RLocker snap_locker(ictx->snap_lock);
-      if (ictx->snap_id != CEPH_NOSNAP || ictx->read_only) {
-        c->fail(cct, -EROFS);
-        return;
-      }
-
-      r = clip_io(ictx, off, &clip_len);
-      if (r < 0) {
-        c->fail(cct, r);
-        return;
-      }
-
-      snapc = ictx->snapc;
-      c->start_op(ictx, AIO_TYPE_WRITE);
-    }
-
-    if (ictx->image_watcher->is_lock_supported() &&
-	!ictx->image_watcher->is_lock_owner()) {
-      c->put();
-      ictx->image_watcher->request_lock(
-	boost::bind(&librbd::aio_write, ictx, off, len, buf, _1, op_flags), c);
-      return;
-    }
-
-    // map
-    vector<ObjectExtent> extents;
-    if (len > 0) {
-      Striper::file_to_extents(ictx->cct, ictx->format_string,
-			       &ictx->layout, off, clip_len, 0, extents);
-    }
-
-    for (vector<ObjectExtent>::iterator p = extents.begin(); p != extents.end(); ++p) {
-      ldout(cct, 20) << " oid " << p->oid << " " << p->offset << "~" << p->length
-		     << " from " << p->buffer_extents << dendl;
-      // assemble extent
-      bufferlist bl;
-      for (vector<pair<uint64_t,uint64_t> >::iterator q = p->buffer_extents.begin();
-	   q != p->buffer_extents.end();
-	   ++q) {
-	bl.append(buf + q->first, q->second);
-      }
-
-      C_AioWrite *req_comp = new C_AioWrite(cct, c);
-      if (ictx->object_cacher) {
-	c->add_request();
-	ictx->write_to_cache(p->oid, bl, p->length, p->offset, req_comp, op_flags);
-      } else {
-	AioWrite *req = new AioWrite(ictx, p->oid.name, p->objectno, p->offset,
-				     bl, snapc, req_comp);
-	c->add_request();
-
-	req->set_op_flags(op_flags);
-	req->send();
-      }
-    }
-
-    c->finish_adding_requests(ictx->cct);
-    c->put();
-
-    ictx->perfcounter->inc(l_librbd_wr);
-    ictx->perfcounter->inc(l_librbd_wr_bytes, clip_len);
-  }
-
   int metadata_get(ImageCtx *ictx, const string &key, string *value)
   {
     CephContext *cct = ictx->cct;
@@ -3602,111 +3768,13 @@ reprotect_and_return_err:
     return cls_client::metadata_list(&ictx->md_ctx, ictx->header_oid, start, max, pairs);
   }
 
-  void aio_discard(ImageCtx *ictx, uint64_t off, uint64_t len, AioCompletion *c)
-  {
-    CephContext *cct = ictx->cct;
-    ldout(cct, 20) << "aio_discard " << ictx << " off = " << off << " len = "
-		   << len << dendl;
-
-    c->get();
-    int r = ictx_check(ictx);
-    if (r < 0) {
-      c->fail(cct, r);
-      return;
-    }
-
-    RWLock::RLocker owner_locker(ictx->owner_lock);
-    RWLock::RLocker md_locker(ictx->md_lock);
-
-    uint64_t clip_len = len;
-    ::SnapContext snapc;
-    {
-      // prevent image size from changing between computing clip and recording
-      // pending async operation
-      RWLock::RLocker snap_locker(ictx->snap_lock);
-      if (ictx->snap_id != CEPH_NOSNAP || ictx->read_only) {
-        c->fail(cct, -EROFS);
-        return;
-      }
-
-      r = clip_io(ictx, off, &clip_len);
-      if (r < 0) {
-        c->fail(cct, r);
-        return;
-      }
-
-      // TODO: check for snap
-      snapc = ictx->snapc;
-      c->start_op(ictx, AIO_TYPE_DISCARD);
-    }
-
-    if (ictx->image_watcher->is_lock_supported() &&
-	!ictx->image_watcher->is_lock_owner()) {
-      c->put();
-      ictx->image_watcher->request_lock(
-	boost::bind(&librbd::aio_discard, ictx, off, len, _1), c);
-      return;
-    }
-
-    // map
-    vector<ObjectExtent> extents;
-    if (len > 0) {
-      Striper::file_to_extents(ictx->cct, ictx->format_string,
-			       &ictx->layout, off, clip_len, 0, extents);
-    }
-
-    for (vector<ObjectExtent>::iterator p = extents.begin(); p != extents.end(); ++p) {
-      ldout(cct, 20) << " oid " << p->oid << " " << p->offset << "~" << p->length
-		     << " from " << p->buffer_extents << dendl;
-      C_AioWrite *req_comp = new C_AioWrite(cct, c);
-      AbstractWrite *req;
-      c->add_request();
-
-      if (p->length == ictx->layout.fl_object_size) {
-	req = new AioRemove(ictx, p->oid.name, p->objectno, snapc, req_comp);
-      } else if (p->offset + p->length == ictx->layout.fl_object_size) {
-	req = new AioTruncate(ictx, p->oid.name, p->objectno, p->offset, snapc,
-                              req_comp);
-      } else {
-	if(ictx->cct->_conf->rbd_skip_partial_discard) {
-	  delete req_comp;
-	  continue;
-	}
-	req = new AioZero(ictx, p->oid.name, p->objectno, p->offset, p->length,
-			  snapc, req_comp);
-      }
-
-      req->send();
-    }
-
-    if (ictx->object_cacher) {
-      Mutex::Locker l(ictx->cache_lock);
-      ictx->object_cacher->discard_set(ictx->object_set, extents);
-    }
-
-    c->finish_adding_requests(ictx->cct);
-    c->put();
-
-    ictx->perfcounter->inc(l_librbd_discard);
-    ictx->perfcounter->inc(l_librbd_discard_bytes, clip_len);
-  }
-
   void rbd_req_cb(completion_t cb, void *arg)
   {
-    AioRequest *req = reinterpret_cast<AioRequest *>(arg);
+    AioObjectRequest *req = reinterpret_cast<AioObjectRequest *>(arg);
     AioCompletion *comp = reinterpret_cast<AioCompletion *>(cb);
     req->complete(comp->get_return_value());
   }
 
-  void aio_read(ImageCtx *ictx, uint64_t off, size_t len,
-	       char *buf, bufferlist *bl,
-	       AioCompletion *c, int op_flags)
-  {
-    vector<pair<uint64_t,uint64_t> > image_extents(1);
-    image_extents[0] = make_pair(off, len);
-    aio_read(ictx, image_extents, buf, bl, c, op_flags);
-  }
-
   struct C_RBD_Readahead : public Context {
     ImageCtx *ictx;
     object_t oid;
@@ -3720,8 +3788,8 @@ reprotect_and_return_err:
     }
   };
 
-  static void readahead(ImageCtx *ictx,
-			const vector<pair<uint64_t,uint64_t> >& image_extents)
+  void readahead(ImageCtx *ictx,
+                 const vector<pair<uint64_t,uint64_t> >& image_extents)
   {
     uint64_t total_bytes = 0;
     for (vector<pair<uint64_t,uint64_t> >::const_iterator p = image_extents.begin();
@@ -3765,94 +3833,6 @@ reprotect_and_return_err:
     }
   }
 
-  void aio_read(ImageCtx *ictx, const vector<pair<uint64_t,uint64_t> >& image_extents,
-	        char *buf, bufferlist *pbl, AioCompletion *c, int op_flags)
-  {
-    CephContext *cct = ictx->cct;
-    ldout(cct, 20) << "aio_read " << ictx << " completion " << c << " " << image_extents << dendl;
-
-    c->get();
-    int r = ictx_check(ictx);
-    if (r < 0) {
-      c->fail(cct, r);
-      return;
-    }
-
-    RWLock::RLocker owner_locker(ictx->owner_lock);
-
-    // readahead
-    if (ictx->object_cacher && ictx->readahead_max_bytes > 0 &&
-	!(op_flags & LIBRADOS_OP_FLAG_FADVISE_RANDOM)) {
-      readahead(ictx, image_extents);
-    }
-
-    snap_t snap_id;
-    map<object_t,vector<ObjectExtent> > object_extents;
-    uint64_t buffer_ofs = 0;
-    {
-      // prevent image size from changing between computing clip and recording
-      // pending async operation
-      RWLock::RLocker snap_locker(ictx->snap_lock);
-      snap_id = ictx->snap_id;
-
-      // map
-      for (vector<pair<uint64_t,uint64_t> >::const_iterator p =
-             image_extents.begin();
-	   p != image_extents.end(); ++p) {
-        uint64_t len = p->second;
-        r = clip_io(ictx, p->first, &len);
-        if (r < 0) {
-          c->fail(cct, r);
-	  return;
-        }
-        if (len == 0) {
-	  continue;
-        }
-
-        Striper::file_to_extents(cct, ictx->format_string, &ictx->layout,
-			         p->first, len, 0, object_extents, buffer_ofs);
-        buffer_ofs += len;
-      }
-      c->start_op(ictx, AIO_TYPE_READ);
-    }
-
-    c->read_buf = buf;
-    c->read_buf_len = buffer_ofs;
-    c->read_bl = pbl;
-
-    for (map<object_t,vector<ObjectExtent> >::iterator p = object_extents.begin();
-         p != object_extents.end(); ++p) {
-      for (vector<ObjectExtent>::iterator q = p->second.begin();
-           q != p->second.end(); ++q) {
-	ldout(ictx->cct, 20) << " oid " << q->oid << " " << q->offset << "~"
-                             << q->length << " from " << q->buffer_extents
-                             << dendl;
-
-	C_AioRead *req_comp = new C_AioRead(ictx->cct, c);
-	AioRead *req = new AioRead(ictx, q->oid.name, q->objectno, q->offset,
-                                   q->length, q->buffer_extents, snap_id, true,
-                                   req_comp, op_flags);
-	req_comp->set_req(req);
-	c->add_request();
-
-	if (ictx->object_cacher) {
-	  C_CacheRead *cache_comp = new C_CacheRead(ictx, req);
-	  ictx->aio_read_from_cache(q->oid, q->objectno, &req->data(),
-				    q->length, q->offset,
-				    cache_comp, op_flags);
-	} else {
-	  req->send();
-	}
-      }
-    }
-
-    c->finish_adding_requests(cct);
-    c->put();
-
-    ictx->perfcounter->inc(l_librbd_rd);
-    ictx->perfcounter->inc(l_librbd_rd_bytes, buffer_ofs);
-  }
-
   AioCompletion *aio_create_completion() {
     AioCompletion *c = new AioCompletion();
     return c;
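
With the synchronous read/write/discard helpers gone from internal.cc, blocking calls are serviced through the image's AIO work queue (AioImageRequestWQ) instead; the public API contract is unchanged. A sketch of the unchanged caller-side view, using the pre-existing public librbd/librados API and assuming a reachable cluster with a pool "rbd" holding an image "img0" (both placeholders):

    #include <iostream>
    #include <rados/librados.hpp>
    #include <rbd/librbd.hpp>

    int main() {
      librados::Rados cluster;
      librados::IoCtx io_ctx;
      if (cluster.init(NULL) < 0 || cluster.conf_read_file(NULL) < 0 ||
          cluster.connect() < 0 || cluster.ioctx_create("rbd", io_ctx) < 0) {
        std::cerr << "cluster setup failed" << std::endl;
        return 1;
      }

      librbd::RBD rbd;
      librbd::Image image;
      if (rbd.open(io_ctx, image, "img0") < 0)
        return 1;

      librados::bufferlist out;
      ssize_t n = image.read(0, 4096, out);   // still blocks; internally it is
                                              // queued on the AIO work queue
      std::cout << "read returned " << n << std::endl;
      return n < 0 ? 1 : 0;
    }
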
diff --git a/src/librbd/internal.h b/src/librbd/internal.h
index 7eaa3c5..3de90a6 100644
--- a/src/librbd/internal.h
+++ b/src/librbd/internal.h
@@ -34,6 +34,7 @@ enum {
   l_librbd_snap_create,
   l_librbd_snap_remove,
   l_librbd_snap_rollback,
+  l_librbd_snap_rename,
 
   l_librbd_notify,
   l_librbd_resize,
@@ -41,6 +42,8 @@ enum {
   l_librbd_readahead,
   l_librbd_readahead_bytes,
 
+  l_librbd_invalidate_cache,
+
   l_librbd_last,
 };
 
@@ -74,6 +77,21 @@ namespace librbd {
 
   bool has_parent(int64_t parent_pool_id, uint64_t off, uint64_t overlap);
 
+  void image_options_create(rbd_image_options_t* opts);
+  void image_options_create_ref(rbd_image_options_t* opts,
+				rbd_image_options_t orig);
+  void image_options_destroy(rbd_image_options_t opts);
+  int image_options_set(rbd_image_options_t opts, int optname,
+			const std::string& optval);
+  int image_options_set(rbd_image_options_t opts, int optname, uint64_t optval);
+  int image_options_get(rbd_image_options_t opts, int optname,
+			std::string* optval);
+  int image_options_get(rbd_image_options_t opts, int optname,
+			uint64_t* optval);
+  int image_options_unset(rbd_image_options_t opts, int optname);
+  void image_options_clear(rbd_image_options_t opts);
+  bool image_options_is_empty(rbd_image_options_t opts);
+
   int snap_set(ImageCtx *ictx, const char *snap_name);
   int list(librados::IoCtx& io_ctx, std::vector<std::string>& names);
   int list_children(ImageCtx *ictx,
@@ -83,10 +101,14 @@ namespace librbd {
   int create(librados::IoCtx& io_ctx, const char *imgname, uint64_t size,
 	     bool old_format, uint64_t features, int *order,
 	     uint64_t stripe_unit, uint64_t stripe_count);
+  int create(IoCtx& io_ctx, const char *imgname, uint64_t size,
+	     ImageOptions& opts);
   int clone(IoCtx& p_ioctx, const char *p_name, const char *p_snap_name,
 	    IoCtx& c_ioctx, const char *c_name,
 	    uint64_t features, int *c_order,
 	    uint64_t stripe_unit, int stripe_count);
+  int clone(IoCtx& p_ioctx, const char *p_name, const char *p_snap_name,
+	    IoCtx& c_ioctx, const char *c_name, ImageOptions& c_opts);
   int rename(librados::IoCtx& io_ctx, const char *srcname, const char *dstname);
   int info(ImageCtx *ictx, image_info_t& info, size_t image_size);
   int get_old_format(ImageCtx *ictx, uint8_t *old);
@@ -110,25 +132,29 @@ namespace librbd {
 		    ProgressContext& prog_ctx);
   int snap_remove(ImageCtx *ictx, const char *snap_name);
   int snap_remove_helper(ImageCtx *ictx, Context* ctx, const char *snap_name);
+  int snap_rename_helper(ImageCtx *ictx, Context* ctx, const uint64_t src_snap_id,
+			 const char *dst_name);
+  int snap_rename(ImageCtx *ictx, const char *srcname, const char *dstname);
   int snap_protect(ImageCtx *ictx, const char *snap_name);
   int snap_unprotect(ImageCtx *ictx, const char *snap_name);
   int snap_is_protected(ImageCtx *ictx, const char *snap_name,
 			bool *is_protected);
   int add_snap(ImageCtx *ictx, const char *snap_name);
   int rm_snap(ImageCtx *ictx, const char *snap_name, uint64_t snap_id);
+  int rename_snap(ImageCtx *ictx, uint64_t src_snap_id, const char *dst_name);
   int refresh_parent(ImageCtx *ictx);
-  int ictx_check(ImageCtx *ictx, bool owner_locked=false);
+  int ictx_check(ImageCtx *ictx);
+  int ictx_check(ImageCtx *ictx, const RWLock &owner_lock);
   int ictx_refresh(ImageCtx *ictx);
   int copy(ImageCtx *ictx, IoCtx& dest_md_ctx, const char *destname,
-	   ProgressContext &prog_ctx);
+	   ImageOptions& opts, ProgressContext &prog_ctx);
   int copy(ImageCtx *src, ImageCtx *dest, ProgressContext &prog_ctx);
 
   int open_parent(ImageCtx *ictx);
   int open_image(ImageCtx *ictx);
   int close_image(ImageCtx *ictx);
+  int close_parent(ImageCtx *ictx);
 
-  int copyup_block(ImageCtx *ictx, uint64_t offset, size_t len,
-		   const char *buf);
   int flatten(ImageCtx *ictx, ProgressContext &prog_ctx);
 
   int rebuild_object_map(ImageCtx *ictx, ProgressContext &prog_ctx);
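
Note the ictx_check() signature change above: the old "owner_locked" bool is replaced by passing the held lock itself, so the precondition shows up in the signature and the callee can assert on it. A toy model of the idiom, with a stub RWLock standing in for ceph's common/RWLock.h:

    #include <cassert>

    struct RWLock {                  // stub; not ceph's RWLock
      bool locked = false;
      void get_read() { locked = true; }
      void put_read() { locked = false; }
      bool is_locked() const { return locked; }
    };

    // hypothetical callee mirroring ictx_check(ImageCtx*, const RWLock&)
    static int checked_operation(const RWLock &owner_lock) {
      assert(owner_lock.is_locked());  // misuse fails loudly in debug builds
      return 0;
    }

    int main() {
      RWLock owner_lock;
      owner_lock.get_read();
      int r = checked_operation(owner_lock);  // caller must hold a lock
      owner_lock.put_read();
      return r;
    }
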
@@ -147,8 +173,6 @@ namespace librbd {
 		 const std::string& cookie);
 
   void trim_image(ImageCtx *ictx, uint64_t newsize, ProgressContext& prog_ctx);
-  int read_rbd_info(librados::IoCtx& io_ctx, const std::string& info_oid,
-		    struct rbd_info *info);
 
   int read_header_bl(librados::IoCtx& io_ctx, const std::string& md_oid,
 		     ceph::bufferlist& header, uint64_t *ver);
@@ -156,8 +180,6 @@ namespace librbd {
 		    ImageCtx *ictx);
   int read_header(librados::IoCtx& io_ctx, const std::string& md_oid,
 		  struct rbd_obj_header_ondisk *header, uint64_t *ver);
-  int write_header(librados::IoCtx& io_ctx, const std::string& md_oid,
-		   ceph::bufferlist& header);
   int tmap_set(librados::IoCtx& io_ctx, const std::string& imgname);
   int tmap_rm(librados::IoCtx& io_ctx, const std::string& imgname);
   void rollback_object(ImageCtx *ictx, uint64_t snap_id, const string& oid,
@@ -165,12 +187,9 @@ namespace librbd {
   int rollback_image(ImageCtx *ictx, uint64_t snap_id,
 		     ProgressContext& prog_ctx);
   void image_info(const ImageCtx *ictx, image_info_t& info, size_t info_size);
-  std::string get_block_oid(const std::string &object_prefix, uint64_t num,
-			    bool old_format);
   uint64_t oid_to_object_no(const std::string& oid,
 			    const std::string& object_prefix);
   int clip_io(ImageCtx *ictx, uint64_t off, uint64_t *len);
-  int init_rbd_info(struct rbd_info *info);
   void init_rbd_header(struct rbd_obj_header_ondisk& ondisk,
 		       uint64_t size, int order, uint64_t bid);
 
@@ -181,11 +200,8 @@ namespace librbd {
                    uint64_t len, bool include_parent, bool whole_object,
 		   int (*cb)(uint64_t, size_t, int, void *),
 		   void *arg);
-  ssize_t read(ImageCtx *ictx, uint64_t off, size_t len, char *buf, int op_flags);
-  ssize_t read(ImageCtx *ictx, const vector<pair<uint64_t,uint64_t> >& image_extents,
-	       char *buf, bufferlist *pbl, int op_flags);
-  ssize_t write(ImageCtx *ictx, uint64_t off, size_t len, const char *buf, int op_flags);
-  int discard(ImageCtx *ictx, uint64_t off, uint64_t len);
+  void readahead(ImageCtx *ictx,
+                 const vector<pair<uint64_t,uint64_t> >& image_extents);
 
   int async_flatten(ImageCtx *ictx, Context *ctx, ProgressContext &prog_ctx);
   int async_resize(ImageCtx *ictx, Context *ctx, uint64_t size,
@@ -195,37 +211,19 @@ namespace librbd {
   int async_rebuild_object_map(ImageCtx *ictx, Context *ctx,
                                ProgressContext &prog_ctx);
 
-  void aio_write(ImageCtx *ictx, uint64_t off, size_t len, const char *buf,
-		 AioCompletion *c, int op_flags);
-  void aio_discard(ImageCtx *ictx, uint64_t off, uint64_t len, AioCompletion *c);
-  void aio_read(ImageCtx *ictx, uint64_t off, size_t len,
-	        char *buf, bufferlist *pbl, AioCompletion *c, int op_flags);
-  void aio_read(ImageCtx *ictx, const vector<pair<uint64_t,uint64_t> >& image_extents,
-	        char *buf, bufferlist *pbl, AioCompletion *c, int op_flags);
-  void aio_flush(ImageCtx *ictx, AioCompletion *c);
   int flush(ImageCtx *ictx);
-  int _flush(ImageCtx *ictx);
   int invalidate_cache(ImageCtx *ictx);
   int metadata_list(ImageCtx *ictx, const string &last, uint64_t max, map<string, bufferlist> *pairs);
   int metadata_get(ImageCtx *ictx, const std::string &key, std::string *value);
   int metadata_set(ImageCtx *ictx, const std::string &key, const std::string &value);
   int metadata_remove(ImageCtx *ictx, const std::string &key);
 
-  ssize_t handle_sparse_read(CephContext *cct,
-			     ceph::bufferlist data_bl,
-			     uint64_t block_ofs,
-			     const std::map<uint64_t, uint64_t> &data_map,
-			     uint64_t buf_ofs,
-			     size_t buf_len,
-			     char *dest_buf);
-
   AioCompletion *aio_create_completion();
   AioCompletion *aio_create_completion(void *cb_arg, callback_t cb_complete);
   AioCompletion *aio_create_completion_internal(void *cb_arg,
 						callback_t cb_complete);
 
   // raw callbacks
-  int simple_read_cb(uint64_t ofs, size_t len, const char *buf, void *arg);
   void rados_req_cb(rados_completion_t cb, void *arg);
   void rados_ctx_cb(rados_completion_t cb, void *arg);
   void rbd_req_cb(completion_t cb, void *arg);
diff --git a/src/librbd/librbd.cc b/src/librbd/librbd.cc
index 251b96e..e322ad5 100644
--- a/src/librbd/librbd.cc
+++ b/src/librbd/librbd.cc
@@ -26,6 +26,7 @@
 #include "osdc/ObjectCacher.h"
 
 #include "librbd/AioCompletion.h"
+#include "librbd/AioImageRequestWQ.h"
 #include "cls/rbd/cls_rbd_client.h"
 #include "librbd/ImageCtx.h"
 #include "librbd/internal.h"
@@ -60,121 +61,6 @@ namespace {
 
 TracepointProvider::Traits tracepoint_traits("librbd_tp.so", "rbd_tracing");
 
-class C_AioReadWQ : public Context {
-public:
-  C_AioReadWQ(librbd::ImageCtx *ictx, uint64_t off, size_t len,
-              char *buf, bufferlist *pbl, librbd::AioCompletion *c,
-              int op_flags)
-    : m_ictx(ictx), m_off(off), m_len(len), m_buf(buf), m_pbl(pbl), m_comp(c),
-      m_op_flags(op_flags) {
-  }
-protected:
-  virtual void finish(int r) {
-    librbd::aio_read(m_ictx, m_off, m_len, m_buf, m_pbl, m_comp, m_op_flags);
-  }
-private:
-  librbd::ImageCtx *m_ictx;
-  uint64_t m_off;
-  uint64_t m_len;
-  char *m_buf;
-  bufferlist *m_pbl;
-  librbd::AioCompletion *m_comp;
-  int m_op_flags;
-};
-
-class C_AioWriteWQ : public Context {
-public:
-  C_AioWriteWQ(librbd::ImageCtx *ictx, uint64_t off, size_t len,
-               const char *buf, librbd::AioCompletion *c, int op_flags)
-    : m_ictx(ictx), m_off(off), m_len(len), m_buf(buf), m_comp(c),
-      m_op_flags(op_flags) {
-  }
-protected:
-  virtual void finish(int r) {
-    librbd::aio_write(m_ictx, m_off, m_len, m_buf, m_comp, m_op_flags);
-  }
-private:
-  librbd::ImageCtx *m_ictx;
-  uint64_t m_off;
-  uint64_t m_len;
-  const char *m_buf;
-  librbd::AioCompletion *m_comp;
-  int m_op_flags;
-};
-
-class C_AioDiscardWQ : public Context {
-public:
-  C_AioDiscardWQ(librbd::ImageCtx *ictx, uint64_t off, uint64_t len,
-                 librbd::AioCompletion *c)
-    : m_ictx(ictx), m_off(off), m_len(len), m_comp(c) {
-  }
-protected:
-  virtual void finish(int r) {
-    librbd::aio_discard(m_ictx, m_off, m_len, m_comp);
-  }
-private:
-  librbd::ImageCtx *m_ictx;
-  uint64_t m_off;
-  uint64_t m_len;
-  librbd::AioCompletion *m_comp;
-};
-
-class C_AioFlushWQ : public Context {
-public:
-  C_AioFlushWQ(librbd::ImageCtx *ictx, librbd::AioCompletion *c)
-    : m_ictx(ictx), m_comp(c) {
-  }
-protected:
-  virtual void finish(int r) {
-    librbd::aio_flush(m_ictx, m_comp);
-  }
-private:
-  librbd::ImageCtx *m_ictx;
-  librbd::AioCompletion *m_comp;
-};
-
-void submit_aio_read(librbd::ImageCtx *ictx, uint64_t off, size_t len,
-                     char *buf, bufferlist *pbl, librbd::AioCompletion *c,
-                     int op_flags) {
-  c->init_time(ictx, librbd::AIO_TYPE_READ);
-  if (ictx->non_blocking_aio) {
-    ictx->aio_work_queue->queue(new C_AioReadWQ(ictx, off, len, buf, pbl, c,
-                                                op_flags));
-  } else {
-    librbd::aio_read(ictx, off, len, buf, pbl, c, op_flags);
-  }
-}
-
-void submit_aio_write(librbd::ImageCtx *ictx, uint64_t off, size_t len,
-                      const char *buf, librbd::AioCompletion *c, int op_flags) {
-  c->init_time(ictx, librbd::AIO_TYPE_WRITE);
-  if (ictx->non_blocking_aio) {
-    ictx->aio_work_queue->queue(new C_AioWriteWQ(ictx, off, len, buf, c,
-                                                 op_flags));
-  } else {
-    librbd::aio_write(ictx, off, len, buf, c, op_flags);
-  }
-}
-
-void submit_aio_discard(librbd::ImageCtx *ictx, uint64_t off, uint64_t len,
-                        librbd::AioCompletion *c) {
-  c->init_time(ictx, librbd::AIO_TYPE_DISCARD);
-  if (ictx->non_blocking_aio) {
-    ictx->aio_work_queue->queue(new C_AioDiscardWQ(ictx, off, len, c));
-  } else {
-    librbd::aio_discard(ictx, off, len, c);
-  }
-}
-
-void submit_aio_flush(librbd::ImageCtx *ictx, librbd::AioCompletion *c) {
-  c->init_time(ictx, librbd::AIO_TYPE_FLUSH);
-  if (ictx->non_blocking_aio) {
-    ictx->aio_work_queue->queue(new C_AioFlushWQ(ictx, c));
-  } else {
-    librbd::aio_flush(ictx, c);
-  }
-}
-
 CephContext* get_cct(IoCtx &io_ctx) {
   return reinterpret_cast<CephContext*>(io_ctx.cct());
 }
@@ -304,6 +190,16 @@ namespace librbd {
     return r;
   }
 
+  int RBD::create4(IoCtx& io_ctx, const char *name, uint64_t size,
+		   ImageOptions& opts)
+  {
+    TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+    tracepoint(librbd, create4_enter, io_ctx.get_pool_name().c_str(), io_ctx.get_id(), name, size, opts.opts);
+    int r = librbd::create(io_ctx, name, size, opts);
+    tracepoint(librbd, create4_exit, r);
+    return r;
+  }
+
   int RBD::clone(IoCtx& p_ioctx, const char *p_name, const char *p_snap_name,
 		 IoCtx& c_ioctx, const char *c_name, uint64_t features,
 		 int *c_order)
@@ -324,7 +220,18 @@ namespace librbd {
     tracepoint(librbd, clone2_enter, p_ioctx.get_pool_name().c_str(), p_ioctx.get_id(), p_name, p_snap_name, c_ioctx.get_pool_name().c_str(), c_ioctx.get_id(), c_name, features, stripe_unit, stripe_count);
     int r = librbd::clone(p_ioctx, p_name, p_snap_name, c_ioctx, c_name,
 			 features, c_order, stripe_unit, stripe_count);
-    tracepoint(librbd, clone_exit, r, *c_order);
+    tracepoint(librbd, clone2_exit, r, *c_order);
+    return r;
+  }
+
+  int RBD::clone3(IoCtx& p_ioctx, const char *p_name, const char *p_snap_name,
+		  IoCtx& c_ioctx, const char *c_name, ImageOptions& c_opts)
+  {
+    TracepointProvider::initialize<tracepoint_traits>(get_cct(p_ioctx));
+    tracepoint(librbd, clone3_enter, p_ioctx.get_pool_name().c_str(), p_ioctx.get_id(), p_name, p_snap_name, c_ioctx.get_pool_name().c_str(), c_ioctx.get_id(), c_name, c_opts.opts);
+    int r = librbd::clone(p_ioctx, p_name, p_snap_name, c_ioctx, c_name,
+			  c_opts);
+    tracepoint(librbd, clone3_exit, r);
     return r;
   }
 
@@ -405,6 +312,60 @@ namespace librbd {
   }
 
   /*
+    ImageOptions
+  */
+
+  ImageOptions::ImageOptions()
+  {
+    librbd::image_options_create(&opts);
+  }
+
+  ImageOptions::ImageOptions(rbd_image_options_t opts_)
+  {
+    librbd::image_options_create_ref(&opts, opts_);
+  }
+
+  ImageOptions::~ImageOptions()
+  {
+    librbd::image_options_destroy(opts);
+  }
+
+  int ImageOptions::set(int optname, const std::string& optval)
+  {
+    return librbd::image_options_set(opts, optname, optval);
+  }
+
+  int ImageOptions::set(int optname, uint64_t optval)
+  {
+    return librbd::image_options_set(opts, optname, optval);
+  }
+
+  int ImageOptions::get(int optname, std::string* optval) const
+  {
+    return librbd::image_options_get(opts, optname, optval);
+  }
+
+  int ImageOptions::get(int optname, uint64_t* optval) const
+  {
+    return librbd::image_options_get(opts, optname, optval);
+  }
+
+  int ImageOptions::unset(int optname)
+  {
+    return librbd::image_options_unset(opts, optname);
+  }
+
+  void ImageOptions::clear()
+  {
+    librbd::image_options_clear(opts);
+  }
+
+  bool ImageOptions::empty() const
+  {
+    return librbd::image_options_is_empty(opts);
+  }
+
+  /*
     Image
   */
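
For reference, the new options-based creation surface as a caller might exercise it. A hedged sketch only: the pool "rbd", image "img0", and protected snapshot "snap1" are placeholders, and the RBD_IMAGE_OPTION_* / RBD_FEATURE_LAYERING constants are assumed from include/rbd/librbd.h:

    #include <iostream>
    #include <rados/librados.hpp>
    #include <rbd/librbd.hpp>

    int main() {
      librados::Rados cluster;
      librados::IoCtx io_ctx;
      if (cluster.init(NULL) < 0 || cluster.conf_read_file(NULL) < 0 ||
          cluster.connect() < 0 || cluster.ioctx_create("rbd", io_ctx) < 0)
        return 1;

      librbd::RBD rbd;
      librbd::ImageOptions opts;   // replaces the feature/order/stripe args
      opts.set(RBD_IMAGE_OPTION_FORMAT, static_cast<uint64_t>(2));
      opts.set(RBD_IMAGE_OPTION_FEATURES,
               static_cast<uint64_t>(RBD_FEATURE_LAYERING));
      opts.set(RBD_IMAGE_OPTION_ORDER, static_cast<uint64_t>(22));  // 4 MB objects

      int r = rbd.create4(io_ctx, "img0", uint64_t(1) << 30, opts);  // 1 GiB
      if (r < 0) {
        std::cerr << "create4 failed: " << r << std::endl;
        return 1;
      }

      // a clone takes its own options; anything unset falls back to defaults
      librbd::ImageOptions clone_opts;
      r = rbd.clone3(io_ctx, "img0", "snap1", io_ctx, "img0-clone", clone_opts);
      return r < 0 ? 1 : 0;
    }
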
 
@@ -560,8 +521,9 @@ namespace librbd {
   {
     ImageCtx *ictx = (ImageCtx *)ctx;
     tracepoint(librbd, copy_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(), destname);
+    ImageOptions opts;
     librbd::NoOpProgressContext prog_ctx;
-    int r = librbd::copy(ictx, dest_io_ctx, destname, prog_ctx);
+    int r = librbd::copy(ictx, dest_io_ctx, destname, opts, prog_ctx);
     tracepoint(librbd, copy_exit, r);
     return r;
   }
@@ -577,12 +539,23 @@ namespace librbd {
     return r;
   }
 
+  int Image::copy3(IoCtx& dest_io_ctx, const char *destname, ImageOptions& opts)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, copy3_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(), destname, opts.opts);
+    librbd::NoOpProgressContext prog_ctx;
+    int r = librbd::copy(ictx, dest_io_ctx, destname, opts, prog_ctx);
+    tracepoint(librbd, copy3_exit, r);
+    return r;
+  }
+
   int Image::copy_with_progress(IoCtx& dest_io_ctx, const char *destname,
 				librbd::ProgressContext &pctx)
   {
     ImageCtx *ictx = (ImageCtx *)ctx;
     tracepoint(librbd, copy_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(), destname);
-    int r = librbd::copy(ictx, dest_io_ctx, destname, pctx);
+    ImageOptions opts;
+    int r = librbd::copy(ictx, dest_io_ctx, destname, opts, pctx);
     tracepoint(librbd, copy_exit, r);
     return r;
   }
@@ -597,6 +570,17 @@ namespace librbd {
     return r;
   }
 
+  int Image::copy_with_progress3(IoCtx& dest_io_ctx, const char *destname,
+				 ImageOptions& opts,
+				 librbd::ProgressContext &pctx)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, copy3_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(), destname, opts.opts);
+    int r = librbd::copy(ictx, dest_io_ctx, destname, opts, pctx);
+    tracepoint(librbd, copy3_exit, r);
+    return r;
+  }
+
   int Image::flatten()
   {
     ImageCtx *ictx = (ImageCtx *)ctx;
@@ -711,6 +695,15 @@ namespace librbd {
     return r;
   }
 
+  int Image::snap_rename(const char *srcname, const char *dstname)
+  {
+    ImageCtx *ictx = (ImageCtx *)ctx;
+    tracepoint(librbd, snap_rename_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, srcname, dstname);
+    int r = librbd::snap_rename(ictx, srcname, dstname);
+    tracepoint(librbd, snap_rename_exit, r);
+    return r;
+  }
+
   int Image::snap_rollback_with_progress(const char *snap_name,
 					 ProgressContext& prog_ctx)
   {
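
Image::snap_rename() above is a straight pass-through to librbd::snap_rename(). A minimal sketch; the snapshot names are placeholders:

    #include <rbd/librbd.hpp>

    // renames an existing snapshot; returns 0 or a negative errno
    int rename_snapshot(librbd::Image &image) {
      return image.snap_rename("daily-old", "daily-new");
    }
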
@@ -789,7 +782,7 @@ namespace librbd {
     tracepoint(librbd, read_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, ofs, len);
     bufferptr ptr(len);
     bl.push_back(ptr);
-    int r = librbd::read(ictx, ofs, len, bl.c_str(), 0);
+    int r = ictx->aio_work_queue->read(ofs, len, bl.c_str(), 0);
     tracepoint(librbd, read_exit, r);
     return r;
   }
@@ -801,7 +794,7 @@ namespace librbd {
 		ictx->read_only, ofs, len, op_flags);
     bufferptr ptr(len);
     bl.push_back(ptr);
-    int r = librbd::read(ictx, ofs, len, bl.c_str(), op_flags);
+    int r = ictx->aio_work_queue->read(ofs, len, bl.c_str(), op_flags);
     tracepoint(librbd, read_exit, r);
     return r;
   }
@@ -867,7 +860,7 @@ namespace librbd {
       tracepoint(librbd, write_exit, -EINVAL);
       return -EINVAL;
     }
-    int r = librbd::write(ictx, ofs, len, bl.c_str(), 0);
+    int r = ictx->aio_work_queue->write(ofs, len, bl.c_str(), 0);
     tracepoint(librbd, write_exit, r);
     return r;
   }
@@ -881,7 +874,7 @@ namespace librbd {
       tracepoint(librbd, write_exit, -EINVAL);
       return -EINVAL;
     }
-    int r = librbd::write(ictx, ofs, len, bl.c_str(), op_flags);
+    int r = ictx->aio_work_queue->write(ofs, len, bl.c_str(), op_flags);
     tracepoint(librbd, write_exit, r);
     return r;
   }
@@ -890,7 +883,7 @@ namespace librbd {
   {
     ImageCtx *ictx = (ImageCtx *)ctx;
     tracepoint(librbd, discard_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, ofs, len);
-    int r = librbd::discard(ictx, ofs, len);
+    int r = ictx->aio_work_queue->discard(ofs, len);
     tracepoint(librbd, discard_exit, r);
     return r;
   }
@@ -904,7 +897,8 @@ namespace librbd {
       tracepoint(librbd, aio_write_exit, -EINVAL);
       return -EINVAL;
     }
-    submit_aio_write(ictx, off, len, bl.c_str(), get_aio_completion(c), 0);
+    ictx->aio_work_queue->aio_write(get_aio_completion(c), off, len, bl.c_str(),
+                                    0);
     tracepoint(librbd, aio_write_exit, 0);
     return 0;
   }
@@ -919,8 +913,8 @@ namespace librbd {
       tracepoint(librbd, aio_write_exit, -EINVAL);
       return -EINVAL;
     }
-    submit_aio_write(ictx, off, len, bl.c_str(), get_aio_completion(c),
-                     op_flags);
+    ictx->aio_work_queue->aio_write(get_aio_completion(c), off, len, bl.c_str(),
+                                    op_flags);
     tracepoint(librbd, aio_write_exit, 0);
     return 0;
   }
@@ -929,7 +923,7 @@ namespace librbd {
   {
     ImageCtx *ictx = (ImageCtx *)ctx;
     tracepoint(librbd, aio_discard_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, off, len, c->pc);
-    submit_aio_discard(ictx, off, len, get_aio_completion(c));
+    ictx->aio_work_queue->aio_discard(get_aio_completion(c), off, len);
     tracepoint(librbd, aio_discard_exit, 0);
     return 0;
   }
@@ -941,7 +935,8 @@ namespace librbd {
     tracepoint(librbd, aio_read_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, off, len, bl.c_str(), c->pc);
     ldout(ictx->cct, 10) << "Image::aio_read() buf=" << (void *)bl.c_str() << "~"
 			 << (void *)(bl.c_str() + len - 1) << dendl;
-    submit_aio_read(ictx, off, len, NULL, &bl, get_aio_completion(c), 0);
+    ictx->aio_work_queue->aio_read(get_aio_completion(c), off, len, NULL, &bl,
+                                   0);
     tracepoint(librbd, aio_read_exit, 0);
     return 0;
   }
@@ -954,7 +949,8 @@ namespace librbd {
 		ictx->read_only, off, len, bl.c_str(), c->pc, op_flags);
     ldout(ictx->cct, 10) << "Image::aio_read() buf=" << (void *)bl.c_str() << "~"
 			 << (void *)(bl.c_str() + len - 1) << dendl;
-    submit_aio_read(ictx, off, len, NULL, &bl, get_aio_completion(c), op_flags);
+    ictx->aio_work_queue->aio_read(get_aio_completion(c), off, len, NULL, &bl,
+                                   op_flags);
     tracepoint(librbd, aio_read_exit, 0);
     return 0;
   }
@@ -972,7 +968,7 @@ namespace librbd {
   {
     ImageCtx *ictx = (ImageCtx *)ctx;
     tracepoint(librbd, aio_flush_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, c->pc);
-    submit_aio_flush(ictx, get_aio_completion(c));
+    ictx->aio_work_queue->aio_flush(get_aio_completion(c));
     tracepoint(librbd, aio_flush_exit, 0);
     return 0;
   }
@@ -1044,6 +1040,69 @@ extern "C" void rbd_version(int *major, int *minor, int *extra)
     *extra = LIBRBD_VER_EXTRA;
 }
 
+extern "C" void rbd_image_options_create(rbd_image_options_t* opts)
+{
+  librbd::image_options_create(opts);
+}
+
+extern "C" void rbd_image_options_destroy(rbd_image_options_t opts)
+{
+  librbd::image_options_destroy(opts);
+}
+
+extern "C" int rbd_image_options_set_string(rbd_image_options_t opts, int optname,
+					    const char* optval)
+{
+  return librbd::image_options_set(opts, optname, optval);
+}
+
+extern "C" int rbd_image_options_set_uint64(rbd_image_options_t opts, int optname,
+					    uint64_t optval)
+{
+  return librbd::image_options_set(opts, optname, optval);
+}
+
+extern "C" int rbd_image_options_get_string(rbd_image_options_t opts, int optname,
+					    char* optval, size_t maxlen)
+{
+  std::string optval_;
+
+  int r = librbd::image_options_get(opts, optname, &optval_);
+
+  if (r < 0) {
+    return r;
+  }
+
+  if (optval_.size() >= maxlen) {
+    return -E2BIG;
+  }
+
+  strncpy(optval, optval_.c_str(), maxlen);
+
+  return 0;
+}
+
+extern "C" int rbd_image_options_get_uint64(rbd_image_options_t opts, int optname,
+				 uint64_t* optval)
+{
+  return librbd::image_options_get(opts, optname, optval);
+}
+
+extern "C" int rbd_image_options_unset(rbd_image_options_t opts, int optname)
+{
+  return librbd::image_options_unset(opts, optname);
+}
+
+extern "C" void rbd_image_options_clear(rbd_image_options_t opts)
+{
+  librbd::image_options_clear(opts);
+}
+
+extern "C" int rbd_image_options_is_empty(rbd_image_options_t opts)
+{
+  return librbd::image_options_is_empty(opts);
+}
+
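
The C-level handle mirrors the C++ ImageOptions one-for-one. A sketch of its lifecycle, assuming the RBD_IMAGE_OPTION_ORDER constant from include/rbd/librbd.h; note that rbd_image_options_get_string() above returns -E2BIG unless the buffer can hold the value plus its terminating NUL:

    #include <cstdio>
    #include <rbd/librbd.h>

    int main() {
      rbd_image_options_t opts;
      rbd_image_options_create(&opts);

      rbd_image_options_set_uint64(opts, RBD_IMAGE_OPTION_ORDER, 22);

      uint64_t order = 0;
      if (rbd_image_options_get_uint64(opts, RBD_IMAGE_OPTION_ORDER, &order) == 0)
        printf("order=%llu\n", (unsigned long long)order);

      printf("empty=%d\n", rbd_image_options_is_empty(opts));  // 0: option set
      rbd_image_options_unset(opts, RBD_IMAGE_OPTION_ORDER);
      printf("empty=%d\n", rbd_image_options_is_empty(opts));  // 1: empty again

      rbd_image_options_destroy(opts);
      return 0;
    }
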
 /* images */
 extern "C" int rbd_list(rados_ioctx_t p, char *names, size_t *size)
 {
@@ -1126,6 +1185,19 @@ extern "C" int rbd_create3(rados_ioctx_t p, const char *name,
   return r;
 }
 
+extern "C" int rbd_create4(rados_ioctx_t p, const char *name,
+			   uint64_t size, rbd_image_options_t opts)
+{
+  librados::IoCtx io_ctx;
+  librados::IoCtx::from_rados_ioctx_t(p, io_ctx);
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(io_ctx));
+  tracepoint(librbd, create4_enter, io_ctx.get_pool_name().c_str(), io_ctx.get_id(), name, size, opts);
+  librbd::ImageOptions opts_(opts);
+  int r = librbd::create(io_ctx, name, size, opts_);
+  tracepoint(librbd, create4_exit, r);
+  return r;
+}
+
 extern "C" int rbd_clone(rados_ioctx_t p_ioctx, const char *p_name,
 			 const char *p_snap_name, rados_ioctx_t c_ioctx,
 			 const char *c_name, uint64_t features, int *c_order)
@@ -1157,6 +1229,21 @@ extern "C" int rbd_clone2(rados_ioctx_t p_ioctx, const char *p_name,
   return r;
 }
 
+extern "C" int rbd_clone3(rados_ioctx_t p_ioctx, const char *p_name,
+			  const char *p_snap_name, rados_ioctx_t c_ioctx,
+			  const char *c_name, rbd_image_options_t c_opts)
+{
+  librados::IoCtx p_ioc, c_ioc;
+  librados::IoCtx::from_rados_ioctx_t(p_ioctx, p_ioc);
+  librados::IoCtx::from_rados_ioctx_t(c_ioctx, c_ioc);
+  TracepointProvider::initialize<tracepoint_traits>(get_cct(p_ioc));
+  tracepoint(librbd, clone3_enter, p_ioc.get_pool_name().c_str(), p_ioc.get_id(), p_name, p_snap_name, c_ioc.get_pool_name().c_str(), c_ioc.get_id(), c_name, c_opts);
+  librbd::ImageOptions c_opts_(c_opts);
+  int r = librbd::clone(p_ioc, p_name, p_snap_name, c_ioc, c_name, c_opts_);
+  tracepoint(librbd, clone3_exit, r);
+  return r;
+}
+
 extern "C" int rbd_remove(rados_ioctx_t p, const char *name)
 {
   librados::IoCtx io_ctx;
@@ -1189,8 +1276,9 @@ extern "C" int rbd_copy(rbd_image_t image, rados_ioctx_t dest_p,
   librados::IoCtx dest_io_ctx;
   librados::IoCtx::from_rados_ioctx_t(dest_p, dest_io_ctx);
   tracepoint(librbd, copy_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(), destname);
+  librbd::ImageOptions opts;
   librbd::NoOpProgressContext prog_ctx;
-  int r = librbd::copy(ictx, dest_io_ctx, destname, prog_ctx);
+  int r = librbd::copy(ictx, dest_io_ctx, destname, opts, prog_ctx);
   tracepoint(librbd, copy_exit, r);
   return r;
 }
@@ -1206,6 +1294,20 @@ extern "C" int rbd_copy2(rbd_image_t srcp, rbd_image_t destp)
   return r;
 }
 
+extern "C" int rbd_copy3(rbd_image_t image, rados_ioctx_t dest_p,
+			 const char *destname, rbd_image_options_t c_opts)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  librados::IoCtx dest_io_ctx;
+  librados::IoCtx::from_rados_ioctx_t(dest_p, dest_io_ctx);
+  tracepoint(librbd, copy3_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(), destname, c_opts);
+  librbd::ImageOptions c_opts_(c_opts);
+  librbd::NoOpProgressContext prog_ctx;
+  int r = librbd::copy(ictx, dest_io_ctx, destname, c_opts_, prog_ctx);
+  tracepoint(librbd, copy3_exit, r);
+  return r;
+}
+
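
rbd_copy3() threads an options handle through to the destination image, so a copy can pick new format or feature bits rather than inheriting them. A hedged sketch; the destination name and feature choice are placeholders:

    #include <rbd/librbd.h>

    // copy an open image into dest_pool under a new name, enabling
    // layering on the destination
    int copy_with_options(rbd_image_t src, rados_ioctx_t dest_pool) {
      rbd_image_options_t opts;
      rbd_image_options_create(&opts);
      rbd_image_options_set_uint64(opts, RBD_IMAGE_OPTION_FEATURES,
                                   RBD_FEATURE_LAYERING);
      int r = rbd_copy3(src, dest_pool, "img0-copy", opts);
      rbd_image_options_destroy(opts);
      return r;
    }
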
 extern "C" int rbd_copy_with_progress(rbd_image_t image, rados_ioctx_t dest_p,
 				      const char *destname,
 				      librbd_progress_fn_t fn, void *data)
@@ -1214,8 +1316,9 @@ extern "C" int rbd_copy_with_progress(rbd_image_t image, rados_ioctx_t dest_p,
   librados::IoCtx dest_io_ctx;
   librados::IoCtx::from_rados_ioctx_t(dest_p, dest_io_ctx);
   tracepoint(librbd, copy_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(), destname);
+  librbd::ImageOptions opts;
   librbd::CProgressContext prog_ctx(fn, data);
-  int ret = librbd::copy(ictx, dest_io_ctx, destname, prog_ctx);
+  int ret = librbd::copy(ictx, dest_io_ctx, destname, opts, prog_ctx);
   tracepoint(librbd, copy_exit, ret);
   return ret;
 }
@@ -1232,6 +1335,22 @@ extern "C" int rbd_copy_with_progress2(rbd_image_t srcp, rbd_image_t destp,
   return ret;
 }
 
+extern "C" int rbd_copy_with_progress3(rbd_image_t image, rados_ioctx_t dest_p,
+				       const char *destname,
+				       rbd_image_options_t dest_opts,
+				       librbd_progress_fn_t fn, void *data)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  librados::IoCtx dest_io_ctx;
+  librados::IoCtx::from_rados_ioctx_t(dest_p, dest_io_ctx);
+  tracepoint(librbd, copy_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, dest_io_ctx.get_pool_name().c_str(), dest_io_ctx.get_id(), destname);
+  librbd::ImageOptions dest_opts_(dest_opts);
+  librbd::CProgressContext prog_ctx(fn, data);
+  int ret = librbd::copy(ictx, dest_io_ctx, destname, dest_opts_, prog_ctx);
+  tracepoint(librbd, copy_exit, ret);
+  return ret;
+}
+
 extern "C" int rbd_flatten(rbd_image_t image)
 {
   librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
@@ -1483,6 +1602,15 @@ extern "C" int rbd_snap_create(rbd_image_t image, const char *snap_name)
   return r;
 }
 
+extern "C" int rbd_snap_rename(rbd_image_t image, const char *srcname, const char *dstname)
+{
+  librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
+  tracepoint(librbd, snap_rename_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, srcname, dstname);
+  int r = librbd::snap_rename(ictx, srcname, dstname);
+  tracepoint(librbd, snap_rename_exit, r);
+  return r;
+}
+
 extern "C" int rbd_snap_remove(rbd_image_t image, const char *snap_name)
 {
   librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
@@ -1781,7 +1909,7 @@ extern "C" ssize_t rbd_read(rbd_image_t image, uint64_t ofs, size_t len,
 {
   librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
   tracepoint(librbd, read_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, ofs, len);
-  int r = librbd::read(ictx, ofs, len, buf, 0);
+  int r = ictx->aio_work_queue->read(ofs, len, buf, 0);
   tracepoint(librbd, read_exit, r);
   return r;
 }
@@ -1792,7 +1920,7 @@ extern "C" ssize_t rbd_read2(rbd_image_t image, uint64_t ofs, size_t len,
   librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
   tracepoint(librbd, read2_enter, ictx, ictx->name.c_str(),
 	      ictx->snap_name.c_str(), ictx->read_only, ofs, len, op_flags);
-  int r = librbd::read(ictx, ofs, len, buf, op_flags);
+  int r = ictx->aio_work_queue->read(ofs, len, buf, op_flags);
   tracepoint(librbd, read_exit, r);
   return r;
 }
@@ -1859,7 +1987,7 @@ extern "C" ssize_t rbd_write(rbd_image_t image, uint64_t ofs, size_t len,
 {
   librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
   tracepoint(librbd, write_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, ofs, len, buf);
-  int r = librbd::write(ictx, ofs, len, buf, 0);
+  int r = ictx->aio_work_queue->write(ofs, len, buf, 0);
   tracepoint(librbd, write_exit, r);
   return r;
 }
@@ -1870,7 +1998,7 @@ extern "C" ssize_t rbd_write2(rbd_image_t image, uint64_t ofs, size_t len,
   librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
   tracepoint(librbd, write2_enter, ictx, ictx->name.c_str(),
 	      ictx->snap_name.c_str(), ictx->read_only, ofs, len, buf, op_flags);
-  int r = librbd::write(ictx, ofs, len, buf, op_flags);
+  int r = ictx->aio_work_queue->write(ofs, len, buf, op_flags);
   tracepoint(librbd, write_exit, r);
   return r;
 }
@@ -1880,7 +2008,7 @@ extern "C" int rbd_discard(rbd_image_t image, uint64_t ofs, uint64_t len)
 {
   librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
   tracepoint(librbd, discard_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, ofs, len);
-  int r = librbd::discard(ictx, ofs, len);
+  int r = ictx->aio_work_queue->discard(ofs, len);
   tracepoint(librbd, discard_exit, r);
   return r;
 }
@@ -1901,7 +2029,7 @@ extern "C" int rbd_aio_write(rbd_image_t image, uint64_t off, size_t len,
   librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
   librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
   tracepoint(librbd, aio_write_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, off, len, buf, comp->pc);
-  submit_aio_write(ictx, off, len, buf, get_aio_completion(comp), 0);
+  ictx->aio_work_queue->aio_write(get_aio_completion(comp), off, len, buf, 0);
   tracepoint(librbd, aio_write_exit, 0);
   return 0;
 }
@@ -1913,7 +2041,8 @@ extern "C" int rbd_aio_write2(rbd_image_t image, uint64_t off, size_t len,
   librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
   tracepoint(librbd, aio_write2_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(),
 	      ictx->read_only, off, len, buf, comp->pc, op_flags);
-  submit_aio_write(ictx, off, len, buf, get_aio_completion(comp), op_flags);
+  ictx->aio_work_queue->aio_write(get_aio_completion(comp), off, len, buf,
+                                  op_flags);
   tracepoint(librbd, aio_write_exit, 0);
   return 0;
 }
@@ -1925,7 +2054,7 @@ extern "C" int rbd_aio_discard(rbd_image_t image, uint64_t off, uint64_t len,
   librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
   librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
   tracepoint(librbd, aio_discard_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, off, len, comp->pc);
-  submit_aio_discard(ictx, off, len, get_aio_completion(comp));
+  ictx->aio_work_queue->aio_discard(get_aio_completion(comp), off, len);
   tracepoint(librbd, aio_discard_exit, 0);
   return 0;
 }
@@ -1936,7 +2065,8 @@ extern "C" int rbd_aio_read(rbd_image_t image, uint64_t off, size_t len,
   librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
   librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
   tracepoint(librbd, aio_read_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, off, len, buf, comp->pc);
-  submit_aio_read(ictx, off, len, buf, NULL, get_aio_completion(comp), 0);
+  ictx->aio_work_queue->aio_read(get_aio_completion(comp), off, len, buf, NULL,
+                                 0);
   tracepoint(librbd, aio_read_exit, 0);
   return 0;
 }
@@ -1948,8 +2078,8 @@ extern "C" int rbd_aio_read2(rbd_image_t image, uint64_t off, size_t len,
   librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
   tracepoint(librbd, aio_read2_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(),
 	      ictx->read_only, off, len, buf, comp->pc, op_flags);
-  submit_aio_read(ictx, off, len, buf, NULL, get_aio_completion(comp),
-                  op_flags);
+  ictx->aio_work_queue->aio_read(get_aio_completion(comp), off, len, buf, NULL,
+                                 op_flags);
   tracepoint(librbd, aio_read_exit, 0);
   return 0;
 }
@@ -1968,7 +2098,7 @@ extern "C" int rbd_aio_flush(rbd_image_t image, rbd_completion_t c)
   librbd::ImageCtx *ictx = (librbd::ImageCtx *)image;
   librbd::RBD::AioCompletion *comp = (librbd::RBD::AioCompletion *)c;
   tracepoint(librbd, aio_flush_enter, ictx, ictx->name.c_str(), ictx->snap_name.c_str(), ictx->read_only, comp->pc);
-  submit_aio_flush(ictx, get_aio_completion(comp));
+  ictx->aio_work_queue->aio_flush(get_aio_completion(comp));
   tracepoint(librbd, aio_flush_exit, 0);
   return 0;
 }
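
All of the aio_* entry points above now enqueue onto the image's AioImageRequestWQ rather than building object requests inline; the completion-based calling convention is untouched. A sketch of that unchanged contract, assuming the long-standing rbd_aio_create_completion()/rbd_aio_wait_for_complete() helpers from librbd.h:

    #include <cstring>
    #include <rbd/librbd.h>

    int write_one_block(rbd_image_t image) {
      char buf[512];
      std::memset(buf, 0xab, sizeof(buf));

      rbd_completion_t c;
      int r = rbd_aio_create_completion(NULL, NULL, &c);
      if (r < 0)
        return r;

      r = rbd_aio_write(image, 0, sizeof(buf), buf, c);  // queued; returns at once
      if (r == 0) {
        rbd_aio_wait_for_complete(c);                    // block for completion
        r = (int)rbd_aio_get_return_value(c);
      }
      rbd_aio_release(c);
      return r;
    }
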
diff --git a/src/mds/CDentry.cc b/src/mds/CDentry.cc
index e56c66a..eb74332 100644
--- a/src/mds/CDentry.cc
+++ b/src/mds/CDentry.cc
@@ -622,3 +622,56 @@ std::string CDentry::linkage_t::get_remote_d_type_string() const
   }
 }
 
+void CDentry::scrub_initialize(CDir *parent, bool recurse, bool children,
+                               ScrubHeaderRefConst header,
+                               Context *f)
+{
+  if (!scrub_infop)
+    scrub_info_create();
+  else
+    assert(!scrub_infop->dentry_scrubbing);
+
+  scrub_infop->scrub_parent = parent;
+  scrub_infop->scrub_recursive = recurse;
+  scrub_infop->scrub_children = children;
+  scrub_infop->dentry_scrubbing = true;
+  scrub_infop->on_finish = f;
+  scrub_infop->header = header;
+
+  auth_pin(this);
+}
+
+void CDentry::scrub_finished(Context **c)
+{
+  dout(10) << __func__ << dendl;
+  assert(scrub_info()->dentry_scrubbing);
+
+  if (scrub_infop->scrub_parent) {
+    scrub_infop->scrub_parent->scrub_dentry_finished(this);
+  }
+
+  *c = scrub_infop->on_finish;
+
+  if (scrub_infop->header && scrub_infop->header->origin == this) {
+    // we have returned to the dentry where the tagging scrub was initiated
+    LogChannelRef clog = dir->cache->mds->clog;
+    clog->info() << "scrub complete with tag '"
+      << scrub_infop->header->tag << "'";
+  }
+
+  delete scrub_infop;
+  scrub_infop = NULL;
+
+  auth_unpin(this);
+}
+
+void CDentry::scrub_info_create() const
+{
+  assert(!scrub_infop);
+
+  // break out of const-land to set up implicit initial state
+  CDentry *me = const_cast<CDentry*>(this);
+
+  // we don't need to change or set up any default parameters; assign directly
+  me->scrub_infop = new scrub_info_t();
+}
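
scrub_info_create() lazily allocates per-dentry scrub state behind a const accessor, hence the const_cast; CDir and CInode below use the same idiom. A generic, hedged sketch of the pattern with made-up names (`mutable` is the usual alternative to the const_cast dance):

    struct Info { bool scrubbing = false; };

    class Node {
      mutable Info *infop = nullptr;   // logically mutable metadata
    public:
      const Info *info() const {
        if (!infop)
          infop = new Info();          // implicit initial state, created on demand
        return infop;
      }
      ~Node() { delete infop; }
    };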
diff --git a/src/mds/CDentry.h b/src/mds/CDentry.h
index 40a4723..171fb3b 100644
--- a/src/mds/CDentry.h
+++ b/src/mds/CDentry.h
@@ -29,6 +29,7 @@
 
 #include "SimpleLock.h"
 #include "LocalLock.h"
+#include "ScrubHeader.h"
 
 class CInode;
 class CDir;
@@ -80,11 +81,14 @@ public:
   static const int PIN_INODEPIN =     1;  // linked inode is pinned
   static const int PIN_FRAGMENTING = -2;  // containing dir is refragmenting
   static const int PIN_PURGING =      3;
+  static const int PIN_SCRUBQUEUE =   4; // TODO: negative value?
+
   const char *pin_name(int p) const {
     switch (p) {
     case PIN_INODEPIN: return "inodepin";
     case PIN_FRAGMENTING: return "fragmenting";
     case PIN_PURGING: return "purging";
+    case PIN_SCRUBQUEUE: return "scrub_enqueued";
     default: return generic_pin_name(p);
     }
   }
@@ -137,6 +141,25 @@ public:
     void link_remote(CInode *in);
   };
   
+  class scrub_info_t {
+  public:
+    CDir *scrub_parent; /// This either matches get_parent_dir() or NULL
+    bool scrub_recursive; /// true if we are scrubbing everything under this
+    bool scrub_children; /// true if we have to scrub all direct children
+    bool dentry_scrubbing; /// safety check
+    bool dentry_children_done; /// safety check
+    bool inode_validated;  /// Has our inode's validate_disk_state run?
+    Context *on_finish; /// called when we finish scrubbing
+    ScrubHeaderRefConst header;
+
+    scrub_info_t() :
+      scrub_parent(NULL), scrub_recursive(false),
+      scrub_children(false), dentry_scrubbing(false),
+      dentry_children_done(false), inode_validated(false),
+      on_finish(NULL)
+    {}
+  };
+
 protected:
   CDir *dir;     // containing dirfrag
   linkage_t linkage;
@@ -144,10 +167,34 @@ protected:
   
   version_t version;  // dir version when last touched.
   version_t projected_version;  // what it will be when i unlock/commit.
+  scrub_info_t* scrub_infop;
 
 public:
   elist<CDentry*>::item item_dirty;
   elist<CDentry*>::item item_stray;
+  elist<CDentry*>::item item_scrub;
+
+  const scrub_info_t *scrub_info() const {
+    if (!scrub_infop)
+      scrub_info_create();
+    return scrub_infop;
+  }
+  void scrub_initialize(CDir *parent, bool recurse, bool children,
+                        ScrubHeaderRefConst header,
+                        Context *f);
+  void scrub_finished(Context **c);
+  void scrub_children_finished() {
+    scrub_infop->dentry_children_done = true;
+  }
+  void scrub_set_finisher(Context *c) {
+    scrub_infop->on_finish = c;
+  }
+
+private:
+  /**
+   * Create a scrub_info_t struct for the scrub_infop pointer.
+   */
+  void scrub_info_create() const;
 
 protected:
   friend class Migrator;
@@ -174,6 +221,7 @@ public:
     first(f), last(l),
     dir(0),
     version(0), projected_version(0),
+    scrub_infop(NULL),
     item_dirty(this),
     lock(this, &lock_type),
     versionlock(this, &versionlock_type) {
@@ -186,6 +234,7 @@ public:
     first(f), last(l),
     dir(0),
     version(0), projected_version(0),
+    scrub_infop(NULL),
     item_dirty(this),
     lock(this, &lock_type),
     versionlock(this, &versionlock_type) {
@@ -195,6 +244,8 @@ public:
     linkage.remote_d_type = dt;
   }
   ~CDentry() {
+    assert(!scrub_infop);
+    assert(!item_scrub.is_on_list());
     g_num_dn--;
     g_num_dns++;
   }
diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc
index 9ecbf01..70a4c7f 100644
--- a/src/mds/CDir.cc
+++ b/src/mds/CDir.cc
@@ -181,51 +181,36 @@ ostream& CDir::print_db_line_prefix(ostream& out)
 // CDir
 
 CDir::CDir(CInode *in, frag_t fg, MDCache *mdcache, bool auth) :
+  cache(mdcache), inode(in), frag(fg),
+  first(2),
   dirty_rstat_inodes(member_offset(CInode, dirty_rstat_item)),
-  item_dirty(this), item_new(this),
+  projected_version(0),  item_dirty(this), item_new(this),
+  scrub_infop(NULL),
+  num_head_items(0), num_head_null(0),
+  num_snap_items(0), num_snap_null(0),
+  num_dirty(0), committing_version(0), committed_version(0),
+  dir_auth_pins(0), request_pins(0),
+  dir_rep(REP_NONE),
   pop_me(ceph_clock_now(g_ceph_context)),
   pop_nested(ceph_clock_now(g_ceph_context)),
   pop_auth_subtree(ceph_clock_now(g_ceph_context)),
   pop_auth_subtree_nested(ceph_clock_now(g_ceph_context)),
-  bloom(NULL)
+  num_dentries_nested(0), num_dentries_auth_subtree(0),
+  num_dentries_auth_subtree_nested(0),
+  bloom(NULL),
+  dir_auth(CDIR_AUTH_DEFAULT)
 {
   g_num_dir++;
   g_num_dira++;
 
-  inode = in;
-  frag = fg;
-  this->cache = mdcache;
-
-  first = 2;
-  
-  num_head_items = num_head_null = 0;
-  num_snap_items = num_snap_null = 0;
-  num_dirty = 0;
-
-  num_dentries_nested = 0;
-  num_dentries_auth_subtree = 0;
-  num_dentries_auth_subtree_nested = 0;
-
   state = STATE_INITIAL;
 
   memset(&fnode, 0, sizeof(fnode));
-  projected_version = 0;
-
-  committing_version = 0;
-  committed_version = 0;
-
-  // dir_auth
-  dir_auth = CDIR_AUTH_DEFAULT;
 
   // auth
   assert(in->is_dir());
   if (auth) 
     state |= STATE_AUTH;
- 
-  dir_auth_pins = 0;
-  request_pins = 0;
-
-  dir_rep = REP_NONE;
 }
 
 /**
@@ -1279,6 +1264,16 @@ fnode_t *CDir::project_fnode()
   fnode_t *p = new fnode_t;
   *p = *get_projected_fnode();
   projected_fnode.push_back(p);
+
+  if (scrub_infop && scrub_infop->last_scrub_dirty) {
+    p->localized_scrub_stamp = scrub_infop->last_local.time;
+    p->localized_scrub_version = scrub_infop->last_local.version;
+    p->recursive_scrub_stamp = scrub_infop->last_recursive.time;
+    p->recursive_scrub_version = scrub_infop->last_recursive.version;
+    scrub_infop->last_scrub_dirty = false;
+    scrub_maybe_delete_info();
+  }
+
   dout(10) << "project_fnode " << p << dendl;
   return p;
 }
@@ -2864,3 +2859,213 @@ void CDir::dump(Formatter *f) const
   MDSCacheObject::dump(f);
 }
 
+/****** Scrub Stuff *******/
+
+void CDir::scrub_info_create() const
+{
+  assert(!scrub_infop);
+
+  // break out of const-land to set up implicit initial state
+  CDir *me = const_cast<CDir*>(this);
+  fnode_t *fn = me->get_projected_fnode();
+
+  scrub_info_t *si = new scrub_info_t();
+
+  si->last_recursive.version = si->recursive_start.version =
+      fn->recursive_scrub_version;
+  si->last_recursive.time = si->recursive_start.time =
+      fn->recursive_scrub_stamp;
+
+  si->last_local.version = fn->localized_scrub_version;
+  si->last_local.time = fn->localized_scrub_stamp;
+
+  me->scrub_infop = si;
+}
+
+void CDir::scrub_initialize()
+{
+  dout(20) << __func__ << dendl;
+  assert(is_complete());
+
+  // FIXME: weird implicit construction, is someone else meant
+  // to be calling scrub_info_create first?
+  scrub_info();
+  assert(scrub_infop && !scrub_infop->directory_scrubbing);
+
+  scrub_infop->recursive_start.version = get_projected_version();
+  scrub_infop->recursive_start.time = ceph_clock_now(g_ceph_context);
+
+  scrub_infop->directories_to_scrub.clear();
+  scrub_infop->directories_scrubbing.clear();
+  scrub_infop->directories_scrubbed.clear();
+  scrub_infop->others_to_scrub.clear();
+  scrub_infop->others_scrubbing.clear();
+  scrub_infop->others_scrubbed.clear();
+
+  for (map_t::iterator i = items.begin();
+      i != items.end();
+      ++i) {
+    // TODO: handle snapshot scrubbing
+    if (i->first.snapid != CEPH_NOSNAP)
+      continue;
+
+    CDentry::linkage_t *dnl = i->second->get_projected_linkage();
+    if (dnl->is_primary()) {
+      if (dnl->get_inode()->is_dir())
+	scrub_infop->directories_to_scrub.insert(i->first);
+      else
+	scrub_infop->others_to_scrub.insert(i->first);
+    } else if (dnl->is_remote()) {
+      // TODO: check remote linkage
+    }
+  }
+  scrub_infop->directory_scrubbing = true;
+
+  assert(scrub_local()); // TODO: handle failure
+}
+
+void CDir::scrub_finished()
+{
+  dout(20) << __func__ << dendl;
+  assert(scrub_infop && scrub_infop->directory_scrubbing);
+
+  assert(scrub_infop->directories_to_scrub.empty());
+  assert(scrub_infop->directories_scrubbing.empty());
+  scrub_infop->directories_scrubbed.clear();
+  assert(scrub_infop->others_to_scrub.empty());
+  assert(scrub_infop->others_scrubbing.empty());
+  scrub_infop->others_scrubbed.clear();
+  scrub_infop->directory_scrubbing = false;
+
+  scrub_infop->last_recursive = scrub_infop->recursive_start;
+  scrub_infop->last_scrub_dirty = true;
+}
+
+int CDir::_next_dentry_on_set(set<dentry_key_t>& dns, bool missing_okay,
+                              MDSInternalContext *cb, CDentry **dnout)
+{
+  dentry_key_t dnkey;
+
+  while (!dns.empty()) {
+    set<dentry_key_t>::iterator front = dns.begin();
+    dnkey = *front;
+    *dnout = lookup(dnkey.name);
+    if (!*dnout) {
+      if (!is_complete() &&
+          (!has_bloom() || is_in_bloom(dnkey.name))) {
+        // need to re-read this dirfrag
+        fetch(cb);
+        return EAGAIN;
+      }
+      // okay, we lost it
+      if (missing_okay) {
+	dout(15) << " we no longer have directory dentry "
+		 << dnkey.name << ", assuming it got renamed" << dendl;
+	dns.erase(dnkey);
+	continue;
+      } else {
+	dout(5) << " we lost dentry " << dnkey.name
+		<< ", bailing out because that's impossible!" << dendl;
+	assert(0);
+      }
+    }
+    // okay, we got a dentry
+    dns.erase(dnkey);
+
+    return 0;
+  }
+  *dnout = NULL;
+  return ENOENT;
+}
+
+int CDir::scrub_dentry_next(MDSInternalContext *cb, CDentry **dnout)
+{
+  dout(20) << __func__ << dendl;
+  assert(scrub_infop && scrub_infop->directory_scrubbing);
+
+  dout(20) << "trying to scrub directories underneath us" << dendl;
+  int rval = _next_dentry_on_set(scrub_infop->directories_to_scrub, true,
+                                 cb, dnout);
+  if (rval == 0) {
+    dout(20) << __func__ << " inserted to directories scrubbing: "
+      << *dnout << dendl;
+    scrub_infop->directories_scrubbing.insert((*dnout)->key());
+  } else if (rval < 0 || rval == EAGAIN) {
+    // we don't need to do anything else
+  } else { // we emptied out the directory scrub set
+    assert(rval == ENOENT);
+    dout(20) << "no directories left, moving on to other kinds of dentries"
+             << dendl;
+    
+    rval = _next_dentry_on_set(scrub_infop->others_to_scrub, false, cb, dnout);
+    if (rval == 0) {
+      dout(20) << __func__ << " inserted to others scrubbing: "
+        << *dnout << dendl;
+      scrub_infop->others_scrubbing.insert((*dnout)->key());
+    }
+  }
+  dout(20) << " returning " << rval << " with dn=" << *dnout << dendl;
+  return rval;
+}
+
+void CDir::scrub_dentries_scrubbing(list<CDentry*> *out_dentries)
+{
+  dout(20) << __func__ << dendl;
+  assert(scrub_infop && scrub_infop->directory_scrubbing);
+
+  for (set<dentry_key_t>::iterator i =
+        scrub_infop->directories_scrubbing.begin();
+      i != scrub_infop->directories_scrubbing.end();
+      ++i) {
+    CDentry *d = lookup(i->name, i->snapid);
+    assert(d);
+    out_dentries->push_back(d);
+  }
+  for (set<dentry_key_t>::iterator i = scrub_infop->others_scrubbing.begin();
+      i != scrub_infop->others_scrubbing.end();
+      ++i) {
+    CDentry *d = lookup(i->name, i->snapid);
+    assert(d);
+    out_dentries->push_back(d);
+  }
+}
+
+void CDir::scrub_dentry_finished(CDentry *dn)
+{
+  dout(20) << __func__ << " on dn " << *dn << dendl;
+  assert(scrub_infop && scrub_infop->directory_scrubbing);
+  dentry_key_t dn_key = dn->key();
+  if (scrub_infop->directories_scrubbing.count(dn_key)) {
+    scrub_infop->directories_scrubbing.erase(dn_key);
+    scrub_infop->directories_scrubbed.insert(dn_key);
+  } else {
+    assert(scrub_infop->others_scrubbing.count(dn_key));
+    scrub_infop->others_scrubbing.erase(dn_key);
+    scrub_infop->others_scrubbed.insert(dn_key);
+  }
+}
+
+void CDir::scrub_maybe_delete_info()
+{
+  if (scrub_infop &&
+      !scrub_infop->directory_scrubbing &&
+      !scrub_infop->last_scrub_dirty &&
+      scrub_infop->dirty_scrub_stamps.empty()) {
+    delete scrub_infop;
+    scrub_infop = NULL;
+  }
+}
+
+bool CDir::scrub_local()
+{
+  assert(is_complete());
+  bool rval = check_rstats();
+
+  if (rval) {
+    scrub_info();
+    scrub_infop->last_local.time = ceph_clock_now(g_ceph_context);
+    scrub_infop->last_local.version = get_projected_version();
+    scrub_infop->last_scrub_dirty = true;
+  }
+  return rval;
+}
diff --git a/src/mds/CDir.h b/src/mds/CDir.h
index be0f10a..e161d5b 100644
--- a/src/mds/CDir.h
+++ b/src/mds/CDir.h
@@ -233,7 +233,103 @@ private:
 
 public:
   typedef std::map<dentry_key_t, CDentry*> map_t;
+
+  class scrub_info_t {
+  public:
+    /// inodes we contain with dirty scrub stamps
+    map<dentry_key_t,CInode*> dirty_scrub_stamps; // TODO: make use of this!
+    struct scrub_stamps {
+      version_t version;
+      utime_t time;
+      scrub_stamps() : version(0) {}
+      void operator=(const scrub_stamps &o) {
+        version = o.version;
+        time = o.time;
+      }
+    };
+
+    scrub_stamps recursive_start; // when we last started a recursive scrub
+    scrub_stamps last_recursive; // when we last finished a recursive scrub
+    scrub_stamps last_local; // when we last did a local scrub
+
+    bool directory_scrubbing; /// safety check
+    bool last_scrub_dirty; /// is scrub info dirty or is it flushed to fnode?
+
+    /// these are lists of children in each stage of scrubbing
+    set<dentry_key_t> directories_to_scrub;
+    set<dentry_key_t> directories_scrubbing;
+    set<dentry_key_t> directories_scrubbed;
+    set<dentry_key_t> others_to_scrub;
+    set<dentry_key_t> others_scrubbing;
+    set<dentry_key_t> others_scrubbed;
+
+    scrub_info_t() : directory_scrubbing(false), last_scrub_dirty(false) {}
+  };
+  /**
+   * Call to start this CDir on a new scrub.
+   * @pre It is not currently scrubbing
+   * @pre The CDir is marked complete.
+   * @post It has set up its internal scrubbing state.
+   */
+  void scrub_initialize();
+  /**
+   * Get the next dentry to scrub. Gives you a CDentry* and its meaning. This
+   * function will give you all directory-representing dentries before any
+   * others.
+   * 0: success, you should scrub this CDentry right now
+   * EAGAIN: the next CDentry is currently being fetched into memory.
+   *   Your callback will be activated when the fetch completes; try again then!
+   * ENOENT: there are no remaining dentries to scrub
+   * <0: There was an unexpected error
+   *
+   * @param cb An MDSInternalContext which will be activated only if
+   *   we return EAGAIN; otherwise it is ignored
+   * @param dnout CDentry * which you should next scrub, or NULL
+   * @returns a value as described above
+   */
+  int scrub_dentry_next(MDSInternalContext *cb, CDentry **dnout);
+  /**
+   * Get the currently scrubbing dentries. When returned, the passed-in
+   * list will be filled with all CDentry * which have been returned
+   * from scrub_dentry_next() but not sent back via scrub_dentry_finished().
+   */
+  void scrub_dentries_scrubbing(list<CDentry*> *out_dentries);
+  /**
+   * Report to the CDir that a CDentry has been scrubbed. Call this
+   * for every CDentry returned from scrub_dentry_next().
+   * @param dn The CDentry which has been scrubbed.
+   */
+  void scrub_dentry_finished(CDentry *dn);
+  /**
+   * Call this once all CDentries have been scrubbed, according to
+   * scrub_dentry_next's listing. It finalizes the scrub statistics.
+   */
+  void scrub_finished();
+  /**
+   * Tell the CDir to do a local scrub of itself.
+   * @pre The CDir is_complete().
+   * @returns true if the rstats and directory contents match, false otherwise.
+   */
+  bool scrub_local();
+private:
+  /**
+   * Create a scrub_info_t struct for the scrub_infop pointer.
+   */
+  void scrub_info_create() const;
+  /**
+   * Delete the scrub_infop if it's not got any useful data.
+   */
+  void scrub_maybe_delete_info();
+  /**
+   * Check the given set (presumably one of those in scrub_info_t) for the
+   * next key to scrub and look it up (or fail!).
+   */
+  int _next_dentry_on_set(set<dentry_key_t>& dns, bool missing_okay,
+                          MDSInternalContext *cb, CDentry **dnout);
+
+
 protected:
+  scrub_info_t *scrub_infop;
 
   // contents of this directory
   map_t items;       // non-null AND null
@@ -297,11 +393,18 @@ protected:
  public:
   CDir(CInode *in, frag_t fg, MDCache *mdcache, bool auth);
   ~CDir() {
+    delete scrub_infop;
     remove_bloom();
     g_num_dir--;
     g_num_dirs++;
   }
 
+  const scrub_info_t *scrub_info() const {
+    if (!scrub_infop) {
+      scrub_info_create();
+    }
+    return scrub_infop;
+  }
 
 
   // -- accessors --
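
The positive EAGAIN/ENOENT convention documented above implies a driver loop along these lines; this is a hedged sketch, and scrub_one_dentry/retry_cb are placeholders rather than code from this patch:

    // `dir` is a complete CDir*; `retry_cb` is an MDSInternalContext*.
    CDentry *dn = NULL;
    int r;
    while ((r = dir->scrub_dentry_next(retry_cb, &dn)) == 0) {
      scrub_one_dentry(dn);            // placeholder for the actual scrub work
      dir->scrub_dentry_finished(dn);  // report back so bookkeeping advances
    }
    if (r == EAGAIN) {
      // a dirfrag fetch is in flight; retry_cb fires when it lands
    } else if (r == ENOENT) {
      dir->scrub_finished();           // no dentries left under this dirfrag
    }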
diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc
index cb5d8f2..c3ee1e0 100644
--- a/src/mds/CInode.cc
+++ b/src/mds/CInode.cc
@@ -250,6 +250,16 @@ ostream& operator<<(ostream& out, const CInode& in)
   return out;
 }
 
+ostream& operator<<(ostream& out, const CInode::scrub_stamp_info_t& si)
+{
+  out << "{scrub_start_version: " << si.scrub_start_version
+      << ", scrub_start_stamp: " << si.scrub_start_stamp
+      << ", last_scrub_version: " << si.last_scrub_version
+      << ", last_scrub_stamp: " << si.last_scrub_stamp;
+  return out;
+}
+
+
 
 void CInode::print(ostream& out)
 {
@@ -352,12 +362,22 @@ inode_t *CInode::project_inode(map<string,bufferptr> *px)
     if (px)
       *px = *get_projected_xattrs();
   }
+
+  projected_inode_t &pi = *projected_nodes.back();
+
   if (px) {
-    projected_nodes.back()->xattrs = px;
+    pi.xattrs = px;
     ++num_projected_xattrs;
   }
-  dout(15) << "project_inode " << projected_nodes.back()->inode << dendl;
-  return projected_nodes.back()->inode;
+
+  if (scrub_infop && scrub_infop->last_scrub_dirty) {
+    pi.inode->last_scrub_stamp = scrub_infop->last_scrub_stamp;
+    pi.inode->last_scrub_version = scrub_infop->last_scrub_version;
+    scrub_infop->last_scrub_dirty = false;
+    scrub_maybe_delete_info();
+  }
+  dout(15) << "project_inode " << pi.inode << dendl;
+  return pi.inode;
 }
 
 void CInode::pop_and_dirty_projected_inode(LogSegment *ls) 
@@ -3622,10 +3642,12 @@ void InodeStore::generate_test_instances(list<InodeStore*> &ls)
 }
 
 void CInode::validate_disk_state(CInode::validated_data *results,
-                                 MDRequestRef &mdr)
+                                 MDRequestRef &mdr, MDSInternalContext *fin)
 {
   class ValidationContinuation : public MDSContinuation {
   public:
+    MDRequestRef mdr;
+    MDSInternalContext *fin;
     CInode *in;
     CInode::validated_data *results;
     bufferlist bl;
@@ -3638,10 +3660,18 @@ void CInode::validate_disk_state(CInode::validated_data *results,
       DIRFRAGS
     };
 
+    /**
+     * Sets either mdr or fin, depending on whether the caller is doing
+     * validation within a single MDRequest (e.g. the asok handler) or
+     * driving the process itself (e.g. the ScrubStack)
+     */
     ValidationContinuation(CInode *i,
                            CInode::validated_data *data_r,
-                           MDRequestRef &mdr) :
-                             MDSContinuation(mdr, i->mdcache->mds->server),
+                           MDRequestRef &_mdr,
+                           MDSInternalContext *fin_) :
+                             MDSContinuation(i->mdcache->mds->server),
+                             mdr(_mdr),
+                             fin(fin_),
                              in(i),
                              results(data_r),
                              shadow_in(NULL) {
@@ -3655,6 +3685,40 @@ void CInode::validate_disk_state(CInode::validated_data *results,
       delete shadow_in;
     }
 
+    /**
+     * Fetch backtrace and set tag if tag is non-empty
+     */
+    void fetch_backtrace_and_tag(CInode *in, std::string tag,
+                                 Context *fin, int *bt_r, bufferlist *bt)
+    {
+      int64_t pool;
+      if (in->is_dir())
+        pool = in->mdcache->mds->mdsmap->get_metadata_pool();
+      else
+        pool = in->inode.layout.fl_pg_pool;
+
+      object_t oid = CInode::get_object_name(in->ino(), frag_t(), "");
+
+      ObjectOperation fetch;
+
+      fetch.getxattr("parent", bt, bt_r);
+      // We want to tag even if we get ENODATA fetching the backtrace
+      fetch.set_last_op_flags(CEPH_OSD_OP_FLAG_FAILOK);
+      if (!tag.empty()) {
+        bufferlist tag_bl;
+        ::encode(tag, tag_bl);
+        fetch.setxattr("scrub_tag", tag_bl);
+      }
+      if (tag.empty()) {
+        in->mdcache->mds->objecter->read(oid, object_locator_t(pool), fetch, CEPH_NOSNAP,
+            NULL, 0, fin);
+      } else {
+        SnapContext snapc;
+        in->mdcache->mds->objecter->mutate(oid, object_locator_t(pool), fetch, snapc,
+            ceph_clock_now(g_ceph_context), 0, NULL, fin);
+      }
+    }
+
     bool _start(int rval) {
       if (in->is_dirty()) {
         MDCache *mdcache = in->mdcache;
@@ -3673,7 +3737,23 @@ void CInode::validate_disk_state(CInode::validated_data *results,
       C_OnFinisher *conf = new C_OnFinisher(get_io_callback(BACKTRACE),
                                             in->mdcache->mds->finisher);
 
-      in->fetch_backtrace(conf, &bl);
+      // Whether we have a tag to apply depends on ScrubHeader (if one is
+      // present)
+      if (in->get_parent_dn() != nullptr &&
+          in->get_parent_dn()->scrub_info()->header != nullptr) {
+        // I'm a non-orphan, so look up my ScrubHeader via my linkage
+        const std::string &tag = in->get_parent_dn()->scrub_info()->header->tag;
+        // Rather than using the usual CInode::fetch_backtrace,
+        // use a special variant that optionally writes a tag in the same
+        // operation.
+        fetch_backtrace_and_tag(in, tag, conf,
+                                &results->backtrace.ondisk_read_retval, &bl);
+      } else {
+        // When we're invoked outside of ScrubStack we might be called
+        // on an orphaned inode like /
+        fetch_backtrace_and_tag(in, {}, conf,
+                                &results->backtrace.ondisk_read_retval, &bl);
+      }
       return false;
     }
 
@@ -3681,9 +3761,12 @@ void CInode::validate_disk_state(CInode::validated_data *results,
       // set up basic result reporting and make sure we got the data
       results->performed_validation = true; // at least, some of it!
       results->backtrace.checked = true;
-      results->backtrace.ondisk_read_retval = rval;
       results->backtrace.passed = false; // we'll set it true if we make it
-      if (rval != 0) {
+
+      // Ignore rval because it's the result of a FAILOK operation
+      // from fetch_backtrace_and_tag: the real result is in
+      // backtrace.ondisk_read_retval
+      if (results->backtrace.ondisk_read_retval != 0) {
         results->backtrace.error_str << "failed to read off disk; see retval";
         return true;
       }
@@ -3692,9 +3775,15 @@ void CInode::validate_disk_state(CInode::validated_data *results,
       try {
         bufferlist::iterator p = bl.begin();
         ::decode(results->backtrace.ondisk_value, p);
-      } catch (buffer::malformed_input&) {
-        results->backtrace.passed = false;
-        results->backtrace.error_str << "failed to decode on-disk backtrace!";
+      } catch (buffer::error&) {
+        if (results->backtrace.ondisk_read_retval == 0 && rval != 0) {
+          // Cases where something has clearly gone wrong with the overall
+          // fetch op, though we didn't get a nonzero rc from the getxattr
+          // operation.  e.g. object missing.
+          results->backtrace.ondisk_read_retval = rval;
+        }
+        results->backtrace.error_str << "failed to decode on-disk backtrace ("
+                                     << bl.length() << " bytes)!";
         return true;
       }
       int64_t pool;
@@ -3810,7 +3899,8 @@ void CInode::validate_disk_state(CInode::validated_data *results,
           results->raw_rstats.error_str << "dirfrag is INCOMPLETE despite fetching; probably too large compared to MDS cache size?\n";
           return true;
         }
-        assert(p->second->check_rstats());
+        // FIXME!!! Don't assert out on damage!
+        assert(p->second->scrub_local());
         sub_info.add(p->second->fnode.accounted_rstat);
       }
       // ...and that their sum matches our inode settings
@@ -3826,12 +3916,22 @@ void CInode::validate_disk_state(CInode::validated_data *results,
       results->passed_validation = true;
       return true;
     }
+
+    void _done() {
+      if (mdr) {
+        server->respond_to_request(mdr, get_rval());
+      } else if (fin) {
+        fin->complete(get_rval());
+      }
+    }
   };
 
 
+  dout(10) << "scrub starting validate_disk_state on " << *this << dendl;
   ValidationContinuation *vc = new ValidationContinuation(this,
                                                           results,
-                                                          mdr);
+                                                          mdr,
+                                                          fin);
   vc->begin();
 }
 
@@ -3979,3 +4079,138 @@ void CInode::dump(Formatter *f) const
   f->close_section();
 }
 
+/****** Scrub Stuff *****/
+void CInode::scrub_info_create() const
+{
+  dout(25) << __func__ << dendl;
+  assert(!scrub_infop);
+
+  // break out of const-land to set up implicit initial state
+  CInode *me = const_cast<CInode*>(this);
+  inode_t *in = me->get_projected_inode();
+
+  scrub_info_t *si = new scrub_info_t();
+  si->scrub_start_stamp = si->last_scrub_stamp = in->last_scrub_stamp;
+  si->scrub_start_version = si->last_scrub_version = in->last_scrub_version;
+
+  me->scrub_infop = si;
+}
+
+void CInode::scrub_maybe_delete_info()
+{
+  if (scrub_infop &&
+      !scrub_infop->scrub_in_progress &&
+      !scrub_infop->last_scrub_dirty) {
+    delete scrub_infop;
+    scrub_infop = NULL;
+  }
+}
+
+void CInode::scrub_initialize(version_t scrub_version)
+{
+  dout(20) << __func__ << " with scrub_version "
+           << scrub_version << dendl;
+  assert(!scrub_infop || !scrub_infop->scrub_in_progress);
+  scrub_info();
+  // scrub_info() allocates scrub_infop on first use, so it is guaranteed
+  // to be non-NULL here; only the in-progress flag still needs checking
+  assert(scrub_infop);
+  assert(!scrub_infop->scrub_in_progress);
+
+  if (get_projected_inode()->is_dir()) {
+    // fill in dirfrag_stamps with initial state
+    std::list<frag_t> frags;
+    dirfragtree.get_leaves(frags);
+    for (std::list<frag_t>::iterator i = frags.begin();
+        i != frags.end();
+        ++i) {
+      scrub_infop->dirfrag_stamps[*i];
+    }
+  }
+  scrub_infop->scrub_in_progress = true;
+  scrub_infop->scrub_start_version = scrub_version;
+  scrub_infop->scrub_start_stamp = ceph_clock_now(g_ceph_context);
+  // right now we don't handle remote inodes
+}
+
+int CInode::scrub_dirfrag_next(frag_t* out_dirfrag)
+{
+  dout(20) << __func__ << dendl;
+  assert(scrub_infop && scrub_infop->scrub_in_progress);
+
+  if (!is_dir()) {
+    return -ENOTDIR;
+  }
+
+  std::map<frag_t, scrub_stamp_info_t>::iterator i =
+      scrub_infop->dirfrag_stamps.begin();
+
+  while (i != scrub_infop->dirfrag_stamps.end()) {
+    if (i->second.scrub_start_version < scrub_infop->scrub_start_version) {
+      i->second.scrub_start_version = get_projected_version();
+      i->second.scrub_start_stamp = ceph_clock_now(g_ceph_context);
+      *out_dirfrag = i->first;
+      dout(20) << " return frag " << *out_dirfrag << dendl;
+      return 0;
+    }
+    ++i;
+  }
+
+  dout(20) << " no frags left, ENOENT " << dendl;
+  return ENOENT;
+}
+
+void CInode::scrub_dirfrags_scrubbing(list<frag_t>* out_dirfrags)
+{
+  assert(out_dirfrags != NULL);
+  assert(scrub_infop != NULL);
+
+  out_dirfrags->clear();
+  std::map<frag_t, scrub_stamp_info_t>::iterator i =
+      scrub_infop->dirfrag_stamps.begin();
+
+  while (i != scrub_infop->dirfrag_stamps.end()) {
+    if (i->second.scrub_start_version >= scrub_infop->scrub_start_version) {
+      if (i->second.last_scrub_version < scrub_infop->scrub_start_version)
+        out_dirfrags->push_back(i->first);
+    } else {
+      return;
+    }
+
+    ++i;
+  }
+}
+
+void CInode::scrub_dirfrag_finished(frag_t dirfrag)
+{
+  dout(20) << __func__ << " on frag " << dirfrag << dendl;
+  assert(scrub_infop && scrub_infop->scrub_in_progress);
+
+  std::map<frag_t, scrub_stamp_info_t>::iterator i =
+      scrub_infop->dirfrag_stamps.find(dirfrag);
+  assert(i != scrub_infop->dirfrag_stamps.end());
+
+  scrub_stamp_info_t &si = i->second;
+  si.last_scrub_stamp = si.scrub_start_stamp;
+  si.last_scrub_version = si.scrub_start_version;
+}
+
+void CInode::scrub_finished(Context **c) {
+  dout(20) << __func__ << dendl;
+  assert(scrub_info()->scrub_in_progress);
+  for (std::map<frag_t, scrub_stamp_info_t>::iterator i =
+      scrub_infop->dirfrag_stamps.begin();
+      i != scrub_infop->dirfrag_stamps.end();
+      ++i) {
+    if (i->second.last_scrub_version != i->second.scrub_start_version) {
+      derr << i->second.last_scrub_version << " != "
+        << i->second.scrub_start_version << dendl;
+    }
+    assert(i->second.last_scrub_version == i->second.scrub_start_version);
+  }
+  scrub_infop->last_scrub_version = scrub_infop->scrub_start_version;
+  scrub_infop->last_scrub_stamp = scrub_infop->scrub_start_stamp;
+  scrub_infop->last_scrub_dirty = true;
+  scrub_infop->scrub_in_progress = false;
+  parent->scrub_finished(c);
+}
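
With the _done() dispatch above, validate_disk_state() can now complete either through an MDRequest (the asok scrub path) or through a bare MDSInternalContext (the ScrubStack path). A hedged sketch of the second mode; C_ScrubValidated is a hypothetical MDSInternalContext subclass, not part of this patch:

    CInode::validated_data *results = new CInode::validated_data();
    MDRequestRef null_mdr;  // an empty ref selects the `fin` completion path
    in->validate_disk_state(results, null_mdr,
                            new C_ScrubValidated(results));  // hypothetical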
diff --git a/src/mds/CInode.h b/src/mds/CInode.h
index a7abba4..f032e5f 100644
--- a/src/mds/CInode.h
+++ b/src/mds/CInode.h
@@ -242,6 +242,87 @@ public:
   snapid_t          first, last;
   compact_set<snapid_t> dirty_old_rstats;
 
+  class scrub_stamp_info_t {
+  public:
+    /// version we started our latest scrub (whether in-progress or finished)
+    version_t scrub_start_version;
+    /// time we started our latest scrub (whether in-progress or finished)
+    utime_t scrub_start_stamp;
+    /// version we started our most recent finished scrub
+    version_t last_scrub_version;
+    /// time we started our most recent finished scrub
+    utime_t last_scrub_stamp;
+    scrub_stamp_info_t() : scrub_start_version(0), last_scrub_version(0) {}
+  };
+
+  class scrub_info_t : public scrub_stamp_info_t {
+  public:
+    bool last_scrub_dirty; /// are our stamps dirty with respect to disk state?
+    bool scrub_in_progress; /// are we currently scrubbing?
+    /// my own (temporary) stamps and versions for each dirfrag we have
+    std::map<frag_t, scrub_stamp_info_t> dirfrag_stamps;
+
+    scrub_info_t() : scrub_stamp_info_t(), last_scrub_dirty(false),
+        scrub_in_progress(false) {}
+  };
+
+  const scrub_info_t *scrub_info() const {
+    if (!scrub_infop)
+      scrub_info_create();
+    return scrub_infop;
+  }
+  /**
+   * Start scrubbing on this inode. That could be very short if it's
+   * a file, or take a long time if we're recursively scrubbing a directory.
+   * @pre It is not currently scrubbing
+   * @post it has set up internal scrubbing state
+   * @param scrub_version What version are we scrubbing at (usually, parent
+   * directory's get_projected_version())
+   */
+  void scrub_initialize(version_t scrub_version);
+  /**
+   * Get the next dirfrag to scrub. Gives you a frag_t in output param which
+   * you must convert to a CDir (and possibly load off disk).
+   * @param dir A pointer to frag_t, will be filled in with the next dirfrag to
+   * scrub if there is one.
+   * @returns 0 on success, you should scrub the passed-out frag_t right now;
+   * ENOENT: There are no remaining dirfrags to scrub
+   * <0: There was some other error (it will return -ENOTDIR if not a directory)
+   */
+  int scrub_dirfrag_next(frag_t* out_dirfrag);
+  /**
+   * Get the currently scrubbing dirfrags. When returned, the
+   * passed-in list will be filled in with all frag_ts which have
+   * been returned from scrub_dirfrag_next but not sent back
+   * via scrub_dirfrag_finished.
+   */
+  void scrub_dirfrags_scrubbing(list<frag_t> *out_dirfrags);
+  /**
+   * Report to the CInode that a dirfrag it owns has been scrubbed. Call
+   * this for every frag_t returned from scrub_dirfrag_next().
+   * @param dirfrag The frag_t that was scrubbed
+   */
+  void scrub_dirfrag_finished(frag_t dirfrag);
+  /**
+   * Call this once the scrub has been completed, whether it's a full
+   * recursive scrub on a directory or simply the data on a file (or
+   * anything in between).
+   * @param c An out param which is filled in with a Context* that must
+   * be complete()ed.
+   */
+  void scrub_finished(Context **c);
+
+private:
+  /**
+   * Create a scrub_info_t struct for the scrub_infop pointer.
+   */
+  void scrub_info_create() const;
+  /**
+   * Delete the scrub_info_t struct if it's not got any useful data
+   */
+  void scrub_maybe_delete_info();
+public:
+
   bool is_multiversion() const {
     return snaprealm ||  // other snaprealms will link to me
       inode.is_dir() ||  // links to me in other snaps
@@ -401,6 +482,7 @@ public:
 private:
   compact_map<frag_t,CDir*> dirfrags; // cached dir fragments under this Inode
   int stickydir_ref;
+  scrub_info_t *scrub_infop;
 
 public:
   bool has_dirfrags() { return !dirfrags.empty(); }
@@ -539,6 +621,7 @@ public:
     num_projected_xattrs(0),
     num_projected_srnodes(0),
     stickydir_ref(0),
+    scrub_infop(NULL),
     parent(0),
     inode_auth(CDIR_AUTH_DEFAULT),
     replica_caps_wanted(0),
@@ -1012,10 +1095,12 @@ public:
    * @param results A freshly-created validated_data struct, with values set
    * as described in the struct documentation.
    * @param mdr The request to be responeded upon the completion of the
-   * validation.
+   * validation (or NULL)
+   * @param fin Context to call back on completion (or NULL)
    */
   void validate_disk_state(validated_data *results,
-                           MDRequestRef& mdr);
+                           MDRequestRef& mdr,
+                           MDSInternalContext *fin);
   static void dump_validation_results(const validated_data& results,
                                       Formatter *f);
 private:
@@ -1025,4 +1110,6 @@ private:
   /** @} Scrubbing and fsck */
 };
 
+ostream& operator<<(ostream& out, const CInode::scrub_stamp_info_t& si);
+
 #endif
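
Note the Context** out-parameter on scrub_finished(): the callee hands the completion back instead of firing it, leaving the caller to run it at a point (and under locks) of its own choosing. A minimal sketch:

    Context *fin = NULL;
    in->scrub_finished(&fin);
    if (fin)
      fin->complete(0);  // completed by the caller, not inside scrub_finished()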
diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc
index fb548a5..0cbb5e3 100644
--- a/src/mds/Locker.cc
+++ b/src/mds/Locker.cc
@@ -3131,6 +3131,13 @@ bool Locker::_do_cap_update(CInode *in, Capability *cap,
   if (!dirty && !change_max)
     return false;
 
+  Session *session = static_cast<Session *>(m->get_connection()->get_priv());
+  if (!session->check_access(in, MAY_WRITE, m->caller_uid, m->caller_gid, 0, 0)) {
+    dout(10) << "check_access failed, dropping cap update on " << *in << dendl;
+    return false;
+  }
+  session->put();
 
   // do the update.
   EUpdate *le = new EUpdate(mds->mdlog, "cap update");
diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index 3ec852a..4f99f7d 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -26,6 +26,7 @@
 #include "MDLog.h"
 #include "MDBalancer.h"
 #include "Migrator.h"
+#include "ScrubStack.h"
 
 #include "SnapClient.h"
 
@@ -6622,12 +6623,6 @@ void MDCache::trim_non_auth()
       if (dnl->is_remote() && dnl->get_inode() && !dnl->get_inode()->is_auth())
 	dn->unlink_remote(dnl);
 
-      if (dn->get_dir()->get_inode()->is_stray()) {
-	dn->state_set(CDentry::STATE_STRAY);
-	if (dnl->is_primary() && dnl->get_inode()->inode.nlink == 0)
-	  dnl->get_inode()->state_set(CInode::STATE_ORPHAN);
-      }
-
       if (!first_auth) {
 	first_auth = dn;
       } else {
@@ -8821,6 +8816,9 @@ void MDCache::dispatch_request(MDRequestRef& mdr)
     case CEPH_MDS_OP_VALIDATE:
       scrub_dentry_work(mdr);
       break;
+    case CEPH_MDS_OP_ENQUEUE_SCRUB:
+      enqueue_scrub_work(mdr);
+      break;
     case CEPH_MDS_OP_FLUSH:
       flush_dentry_work(mdr);
       break;
@@ -9185,10 +9183,14 @@ void MDCache::scan_stray_dir(dirfrag_t next)
     }
     for (CDir::map_t::iterator q = dir->items.begin(); q != dir->items.end(); ++q) {
       CDentry *dn = q->second;
+      dn->state_set(CDentry::STATE_STRAY);
       CDentry::linkage_t *dnl = dn->get_projected_linkage();
       stray_manager.notify_stray_created();
       if (dnl->is_primary()) {
-	maybe_eval_stray(dnl->get_inode());
+	CInode *in = dnl->get_inode();
+	if (in->inode.nlink == 0)
+	  in->state_set(CInode::STATE_ORPHAN);
+	maybe_eval_stray(in);
       }
     }
   }
@@ -11674,6 +11676,21 @@ void MDCache::scrub_dentry(const string& path, Formatter *f, Context *fin)
   scrub_dentry_work(mdr);
 }
 
+/**
+ * The private data for an OP_ENQUEUE_SCRUB MDRequest
+ */
+class EnqueueScrubParams
+{
+public:
+  const bool recursive;
+  const bool children;
+  const std::string tag;
+  EnqueueScrubParams(bool r, bool c, const std::string &tag_)
+    : recursive(r), children(c), tag(tag_)
+  {}
+};
+
+
 void MDCache::scrub_dentry_work(MDRequestRef& mdr)
 {
   set<SimpleLock*> rdlocks, wrlocks, xlocks;
@@ -11691,10 +11708,99 @@ void MDCache::scrub_dentry_work(MDRequestRef& mdr)
   CInode::validated_data *vr =
       static_cast<CInode::validated_data*>(mdr->internal_op_private);
 
-  in->validate_disk_state(vr, mdr);
+  in->validate_disk_state(vr, mdr, NULL);
   return;
 }
 
+
+class C_ScrubEnqueued : public Context
+{
+public:
+  MDRequestRef mdr;
+  Context *on_finish;
+  Formatter *formatter;
+  C_ScrubEnqueued(MDRequestRef& mdr,
+                  Context *fin, Formatter *f) :
+    mdr(mdr), on_finish(fin), formatter(f) {}
+
+  void finish(int r) {
+#if 0
+    if (r >= 0) { // we got into the scrubbing; dump it
+      results.dump(formatter);
+    } else { // we failed the lookup or something; dump ourselves
+      formatter->open_object_section("results");
+      formatter->dump_int("return_code", r);
+      formatter->close_section(); // results
+    }
+#endif
+    on_finish->complete(r);
+  }
+};
+
+void MDCache::enqueue_scrub(
+    const string& path,
+    const std::string &tag,
+    Formatter *f, Context *fin)
+{
+  dout(10) << "scrub_dentry " << path << dendl;
+  MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_ENQUEUE_SCRUB);
+  filepath fp(path.c_str());
+  mdr->set_filepath(fp);
+
+  C_ScrubEnqueued *se = new C_ScrubEnqueued(mdr, fin, f);
+  mdr->internal_op_finish = se;
+  // TODO pass through tag/args
+  mdr->internal_op_private = new EnqueueScrubParams(true, true, tag);
+  enqueue_scrub_work(mdr);
+}
+
+void MDCache::enqueue_scrub_work(MDRequestRef& mdr)
+{
+  set<SimpleLock*> rdlocks, wrlocks, xlocks;
+  CInode *in = mds->server->rdlock_path_pin_ref(mdr, 0, rdlocks, true);
+  if (NULL == in)
+    return;
+
+  // TODO: Remove this restriction
+  assert(in->is_auth());
+
+  bool locked = mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks);
+  if (!locked)
+    return;
+
+  CDentry *dn = in->get_parent_dn();
+
+  // We got to this inode by path, so it must have a parent
+  assert(dn != NULL);
+
+  // Not setting a completion context here because we don't
+  // want to block the asok caller on a long-running scrub
+  EnqueueScrubParams *args = static_cast<EnqueueScrubParams*>(
+      mdr->internal_op_private);
+  assert(args != NULL);
+
+  // Cannot scrub the same dentry twice at the same time
+  if (dn->scrub_info()->dentry_scrubbing) {
+    mds->server->respond_to_request(mdr, -EBUSY);
+    return;
+  }
+
+  ScrubHeaderRef header(new ScrubHeader());
+
+  header->tag = args->tag;
+  header->origin = dn;
+
+  mds->scrubstack->enqueue_dentry_bottom(dn, true, true, header, NULL);
+  delete args;
+  mdr->internal_op_private = NULL;
+
+  // Successfully enqueued
+  mds->server->respond_to_request(mdr, 0);
+  return;
+}
+
+
+
 void MDCache::flush_dentry(const string& path, Context *fin)
 {
   if (is_readonly()) {
diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h
index c369acd..93eb697 100644
--- a/src/mds/MDCache.h
+++ b/src/mds/MDCache.h
@@ -1115,10 +1115,31 @@ public:
     while (n--) ++p;
     return p->second;
   }
+
   void scrub_dentry(const string& path, Formatter *f, Context *fin);
+  /**
+   * Scrub the named dentry only (skip the scrubstack)
+   */
   void scrub_dentry_work(MDRequestRef& mdr);
+
   void flush_dentry(const string& path, Context *fin);
   void flush_dentry_work(MDRequestRef& mdr);
+
+  /**
+   * Create and start an OP_ENQUEUE_SCRUB
+   */
+  void enqueue_scrub(const string& path, const std::string &tag,
+                     Formatter *f, Context *fin);
+
+  /**
+   * Resolve path to a dentry and pass it onto the ScrubStack.
+   *
+   * TODO: return enough information to the original mdr formatter
+   * and completion that they can subsequeuntly check the progress of
+   * this scrub (we won't block them on a whole scrub as it can take a very
+   * long time)
+   */
+  void enqueue_scrub_work(MDRequestRef& mdr);
 };
 
 class C_MDS_RetryRequest : public MDSInternalContext {
diff --git a/src/mds/MDSAuthCaps.cc b/src/mds/MDSAuthCaps.cc
index a8eec5c..42d01af 100644
--- a/src/mds/MDSAuthCaps.cc
+++ b/src/mds/MDSAuthCaps.cc
@@ -14,21 +14,26 @@
 
 
 #include <errno.h>
+#include <fcntl.h>
 
 #include <boost/spirit/include/qi.hpp>
 #include <boost/spirit/include/phoenix_operator.hpp>
 #include <boost/spirit/include/phoenix.hpp>
 
+#include "common/debug.h"
 #include "MDSAuthCaps.h"
 
+#define dout_subsys ceph_subsys_mds
+
+#undef dout_prefix
+#define dout_prefix *_dout << "MDSAuthCap "
+
 using std::ostream;
 using std::string;
 namespace qi = boost::spirit::qi;
 namespace ascii = boost::spirit::ascii;
 namespace phoenix = boost::phoenix;
 
-const std::string MDSCapMatch::MDS_AUTH_PATH_ROOT = "/";
-
 template <typename Iterator>
 struct MDSCapParser : qi::grammar<Iterator, MDSAuthCaps()>
 {
@@ -36,6 +41,7 @@ struct MDSCapParser : qi::grammar<Iterator, MDSAuthCaps()>
   {
     using qi::char_;
     using qi::int_;
+    using qi::uint_;
     using qi::lexeme;
     using qi::alnum;
     using qi::_val;
@@ -52,12 +58,14 @@ struct MDSCapParser : qi::grammar<Iterator, MDSAuthCaps()>
       lexeme[lit("'") >> *(char_ - '\'') >> '\''];
     unquoted_path %= +char_("a-zA-Z0-9_.-/");
 
-    // match := [path=<path>] [uid=<uid>]
-    uid %= (spaces >> lit("uid") >> lit('=') >> int_);
+    // match := [path=<path>] [uid=<uid> [gids=<gid>[,<gid>...]]]
     path %= (spaces >> lit("path") >> lit('=') >> (quoted_path | unquoted_path));
+    uid %= (spaces >> lit("uid") >> lit('=') >> uint_);
+    uintlist %= (uint_ % lit(','));
+    gidlist %= -(spaces >> lit("gids") >> lit('=') >> uintlist);
     match = -(
-             (uid)[_val = phoenix::construct<MDSCapMatch>(_1)] |
-             (path >> uid)[_val = phoenix::construct<MDSCapMatch>(_1, _2)] | 
+	     (uid >> gidlist)[_val = phoenix::construct<MDSCapMatch>(_1, _2)] |
+	     (path >> uid >> gidlist)[_val = phoenix::construct<MDSCapMatch>(_1, _2, _3)] |
              (path)[_val = phoenix::construct<MDSCapMatch>(_1)]);
 
     // capspec = * | r[w]
@@ -77,13 +85,49 @@ struct MDSCapParser : qi::grammar<Iterator, MDSAuthCaps()>
   qi::rule<Iterator, string()> quoted_path, unquoted_path;
   qi::rule<Iterator, MDSCapSpec()> capspec;
   qi::rule<Iterator, string()> path;
-  qi::rule<Iterator, int()> uid;
+  qi::rule<Iterator, uint32_t()> uid;
+  qi::rule<Iterator, std::vector<uint32_t>() > uintlist;
+  qi::rule<Iterator, std::vector<uint32_t>() > gidlist;
   qi::rule<Iterator, MDSCapMatch()> match;
   qi::rule<Iterator, MDSCapGrant()> grant;
   qi::rule<Iterator, std::vector<MDSCapGrant>()> grants;
   qi::rule<Iterator, MDSAuthCaps()> mdscaps;
 };
 
+void MDSCapMatch::normalize_path()
+{
+  // drop any leading /
+  while (path.length() && path[0] == '/') {
+    path = path.substr(1);
+  }
+
+  // TODO: drop duplicate "//"
+  // TODO: drop "."
+  // TODO: drop ".."
+}
+
+bool MDSCapMatch::match(const std::string &target_path,
+			const int caller_uid,
+			const int caller_gid) const
+{
+  if (uid != MDS_AUTH_UID_ANY) {
+    if (uid != caller_uid)
+      return false;
+    if (std::find(gids.begin(), gids.end(), caller_gid) == gids.end())
+      return false;
+  }
+  if (path.length()) {
+    if (target_path.find(path) != 0)
+      return false;
+    // if path doesn't already have a trailing /, make sure the target
+    // does so that path=/foo doesn't match target_path=/food
+    if (target_path.length() > path.length() &&
+	path[path.length()-1] != '/' &&
+	target_path[path.length()] != '/')
+      return false;
+  }
+  return true;
+}
 
 /**
 * For a given filesystem path, query whether this capability carries
@@ -91,18 +135,71 @@ struct MDSCapParser : qi::grammar<Iterator, MDSAuthCaps()>
  *
  * This is true if any of the 'grant' clauses in the capability match the
  * requested path + op.
- *
  */
-bool MDSAuthCaps::is_capable(const std::string &path, int uid, bool may_read, bool may_write) const
+bool MDSAuthCaps::is_capable(const std::string &inode_path,
+			     uid_t inode_uid, gid_t inode_gid,
+			     unsigned inode_mode,
+			     uid_t caller_uid, gid_t caller_gid,
+			     unsigned mask,
+			     uid_t new_uid, gid_t new_gid) const
 {
-  for (std::vector<MDSCapGrant>::const_iterator i = grants.begin(); i != grants.end(); ++i) {
-    if (i->match.match(path, uid)) {
-      if ((may_read && !i->spec.read) ||
-          (may_write && !i->spec.write)) {
-        continue;
-      } else {
+  if (cct)
+    ldout(cct, 10) << __func__ << " inode(path /" << inode_path
+		   << " owner " << inode_uid << ":" << inode_gid
+		   << " mode 0" << std::oct << inode_mode << std::dec
+		   << ") by caller " << caller_uid << ":" << caller_gid
+		   << " mask " << mask
+		   << " new " << new_uid << ":" << new_gid
+		   << " cap: " << *this << dendl;
+
+  for (std::vector<MDSCapGrant>::const_iterator i = grants.begin();
+       i != grants.end();
+       ++i) {
+
+    if (i->match.match(inode_path, caller_uid, caller_gid) &&
+	i->spec.allows(mask & (MAY_READ|MAY_EXECUTE), mask & MAY_WRITE)) {
+
+      // check unix permissions?
+      if (i->match.uid == MDSCapMatch::MDS_AUTH_UID_ANY) {
         return true;
       }
+
+      // chown/chgrp
+      if (mask & MAY_CHOWN) {
+	if (new_uid != caller_uid ||   // you can't chown to someone else
+	    inode_uid != caller_uid) { // you can't chown from someone else
+	  continue;
+	}
+      }
+      if (mask & MAY_CHGRP) {
+	// you can only chgrp *to* one of your groups... if you own the file.
+	if (inode_uid != caller_uid ||
+	    std::find(i->match.gids.begin(), i->match.gids.end(), new_gid) ==
+	    i->match.gids.end()) {
+	  continue;
+	}
+      }
+
+      if (inode_uid == caller_uid) {
+        if ((!(mask & MAY_READ) || (inode_mode & S_IRUSR)) &&
+	    (!(mask & MAY_WRITE) || (inode_mode & S_IWUSR)) &&
+	    (!(mask & MAY_EXECUTE) || (inode_mode & S_IXUSR))) {
+          return true;
+        }
+      } else if (std::find(i->match.gids.begin(), i->match.gids.end(),
+			   inode_gid) != i->match.gids.end()) {
+        if ((!(mask & MAY_READ) || (inode_mode & S_IRGRP)) &&
+	    (!(mask & MAY_WRITE) || (inode_mode & S_IWGRP)) &&
+	    (!(mask & MAY_EXECUTE) || (inode_mode & S_IXGRP))) {
+          return true;
+        }
+      } else {
+        if ((!(mask & MAY_READ) || (inode_mode & S_IROTH)) &&
+	    (!(mask & MAY_WRITE) || (inode_mode & S_IWOTH)) &&
+	    (!(mask & MAY_EXECUTE) || (inode_mode & S_IXOTH))) {
+          return true;
+        }
+      }
     }
   }
 
@@ -115,7 +212,7 @@ void MDSAuthCaps::set_allow_all()
     grants.push_back(MDSCapGrant(MDSCapSpec(true, true, true), MDSCapMatch()));
 }
 
-bool MDSAuthCaps::parse(const std::string& str, ostream *err)
+bool MDSAuthCaps::parse(CephContext *c, const std::string& str, ostream *err)
 {
   // Special case for legacy caps
   if (str == "allow") {
@@ -129,6 +226,7 @@ bool MDSAuthCaps::parse(const std::string& str, ostream *err)
   std::string::const_iterator end = str.end();
 
   bool r = qi::phrase_parse(iter, end, g, ascii::space, *this);
+  cct = c;  // set after parser self-assignment
   if (r && iter == end) {
     return true;
   } else {
@@ -136,7 +234,7 @@ bool MDSAuthCaps::parse(const std::string& str, ostream *err)
     grants.clear();
 
     if (err)
-      *err << "osdcap parse failed, stopped at '" << std::string(iter, end)
+      *err << "MDSAuthCaps parse failed, stopped at '" << std::string(iter, end)
            << "' of '" << str << "'\n";
     return false; 
   }
@@ -157,14 +255,24 @@ bool MDSAuthCaps::allow_all() const
 
 ostream &operator<<(ostream &out, const MDSCapMatch &match)
 {
-  if (match.path != MDSCapMatch::MDS_AUTH_PATH_ROOT) {
-    out << "path=\"" << match.path << "\"";
-  }
-  if (match.path != MDSCapMatch::MDS_AUTH_PATH_ROOT && match.uid != MDSCapMatch::MDS_AUTH_UID_ANY) {
-    out << " ";
+  if (match.path.length()) {
+    out << "path=\"/" << match.path << "\"";
+    if (match.uid != MDSCapMatch::MDS_AUTH_UID_ANY) {
+      out << " ";
+    }
   }
   if (match.uid != MDSCapMatch::MDS_AUTH_UID_ANY) {
     out << "uid=" << match.uid;
+    if (!match.gids.empty()) {
+      out << " gids=";
+      for (std::vector<gid_t>::const_iterator p = match.gids.begin();
+	   p != match.gids.end();
+	   ++p) {
+	if (p != match.gids.begin())
+	  out << ',';
+	out << *p;
+      }
+    }
   }
 
   return out;
diff --git a/src/mds/MDSAuthCaps.h b/src/mds/MDSAuthCaps.h
index 2878c79..112a7fb 100644
--- a/src/mds/MDSAuthCaps.h
+++ b/src/mds/MDSAuthCaps.h
@@ -19,63 +19,105 @@
 #include <vector>
 #include <string>
 #include <sstream>
+#include "include/types.h"
+#include "common/debug.h"
+
+// unix-style capabilities
+enum {
+  MAY_READ = 1,
+  MAY_WRITE = 2,
+  MAY_EXECUTE = 4,
+  MAY_CHOWN = 16,
+  MAY_CHGRP = 32
+};
 
+class CephContext;
 
+// what we can do
 struct MDSCapSpec {
-  bool read;
-  bool write;
-  bool any;
+  bool read, write, any;
 
   MDSCapSpec() : read(false), write(false), any(false) {}
-  MDSCapSpec(bool r_, bool w_, bool a_) : read(r_), write(w_), any(a_) {}
+  MDSCapSpec(bool r, bool w, bool a) : read(r), write(w), any(a) {}
 
-  bool allow_all() const {return any;}
+  bool allow_all() const {
+    return any;
+  }
+  bool allows(bool r, bool w) const {
+    if (any)
+      return true;
+    if (r && !read)
+      return false;
+    if (w && !write)
+      return false;
+    return true;
+  }
 };
 
+// conditions before we are allowed to do it
 struct MDSCapMatch {
-  static const int MDS_AUTH_UID_ANY = -1;
-  static const std::string MDS_AUTH_PATH_ROOT;
+  static const int64_t MDS_AUTH_UID_ANY = -1;
 
-  int uid;  // Require UID to be equal to this, if !=MDS_AUTH_UID_ANY
-  std::string path;  // Require path to be child of this (may be "/" for any)
+  int64_t uid;       // Require UID to be equal to this, if !=MDS_AUTH_UID_ANY
+  std::vector<gid_t> gids;  // Use these GIDs
+  std::string path;  // Require path to be child of this (may be "" or "/" for any)
+
+  MDSCapMatch() : uid(MDS_AUTH_UID_ANY) {}
+  MDSCapMatch(int64_t uid_, std::vector<gid_t>& gids_) : uid(uid_), gids(gids_) {}
+  MDSCapMatch(std::string path_)
+    : uid(MDS_AUTH_UID_ANY), path(path_) {
+    normalize_path();
+  }
+  MDSCapMatch(std::string path_, int64_t uid_, std::vector<gid_t>& gids_)
+    : uid(uid_), gids(gids_), path(path_) {
+    normalize_path();
+  }
 
-  MDSCapMatch() : uid(MDS_AUTH_UID_ANY), path(MDS_AUTH_PATH_ROOT) {}
-  MDSCapMatch(int uid_) : uid(uid_), path(MDS_AUTH_PATH_ROOT) {}
-  MDSCapMatch(std::string path_) : uid(MDS_AUTH_UID_ANY), path(path_) {}
-  MDSCapMatch(std::string path_, int uid_) : uid(uid_), path(path_) {}
+  void normalize_path();
   
   bool is_match_all() const
   {
-    return uid == MDS_AUTH_UID_ANY && path == "/";
+    return uid == MDS_AUTH_UID_ANY && path == "";
   }
 
-  bool match(const std::string &target_path, const int target_uid) const {
-    return (target_path.find(path) == 0 && (target_uid == uid || uid == MDS_AUTH_UID_ANY)); 
-  }
+  // check whether this grant matches against a given file and caller uid:gid
+  bool match(const std::string &target_path,
+	     const int caller_uid,
+	     const int caller_gid) const;
 };
 
 struct MDSCapGrant {
   MDSCapSpec spec;
   MDSCapMatch match;
 
-  MDSCapGrant(const MDSCapSpec &spec_, const MDSCapMatch &match_) : spec(spec_), match(match_) {}
+  MDSCapGrant(const MDSCapSpec &spec_, const MDSCapMatch &match_)
+    : spec(spec_), match(match_) {}
   MDSCapGrant() {}
 };
 
 class MDSAuthCaps
 {
-    protected:
-    std::vector<MDSCapGrant> grants;
-
-    public:
-    void set_allow_all();
-    bool parse(const std::string &str, std::ostream *err);
-    MDSAuthCaps() {}
-    MDSAuthCaps(const std::vector<MDSCapGrant> &grants_) : grants(grants_) {}
-
-    bool allow_all() const;
-    bool is_capable(const std::string &path, int uid, bool may_read, bool may_write) const;
-    friend std::ostream &operator<<(std::ostream &out, const MDSAuthCaps &cap);
+  CephContext *cct;
+  std::vector<MDSCapGrant> grants;
+
+public:
+  MDSAuthCaps(CephContext *cct_=NULL)
+    : cct(cct_) { }
+
+  // this ctor is used by spirit/phoenix; doesn't need cct.
+  MDSAuthCaps(const std::vector<MDSCapGrant> &grants_)
+    : cct(NULL), grants(grants_) { }
+
+  void set_allow_all();
+  bool parse(CephContext *cct, const std::string &str, std::ostream *err);
+
+  bool allow_all() const;
+  bool is_capable(const std::string &inode_path,
+		  uid_t inode_uid, gid_t inode_gid, unsigned inode_mode,
+		  uid_t uid, gid_t gid, unsigned mask,
+		  uid_t new_uid, gid_t new_gid) const;
+
+  friend std::ostream &operator<<(std::ostream &out, const MDSAuthCaps &cap);
 };
 
 
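
Tying the new grammar to the unix-permission check, a hedged usage sketch (the ids and cap string are made up; note that normalize_path() strips leading slashes, so is_capable() receives the inode path in that normalized form):

    MDSAuthCaps caps(g_ceph_context);
    std::ostringstream err;
    bool ok = caps.parse(g_ceph_context,
                         "allow rw path=/home uid=1000 gids=1000,4242", &err);
    assert(ok);
    // caller 1000:1000 reading and writing a file it owns under /home
    bool allowed = caps.is_capable("home/alice/notes.txt",
                                   1000, 1000, 0644,   // inode uid/gid/mode
                                   1000, 1000,         // caller uid/gid
                                   MAY_READ | MAY_WRITE,
                                   1000, 1000);        // new uid/gid (no chown)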
diff --git a/src/mds/MDSContinuation.h b/src/mds/MDSContinuation.h
index 82d178f..0340a20 100644
--- a/src/mds/MDSContinuation.h
+++ b/src/mds/MDSContinuation.h
@@ -17,19 +17,15 @@
 #include "mds/Server.h"
  
 class MDSContinuation : public Continuation {
-  MDRequestRef mdr;
-  Server *server;
-public:
-  MDSContinuation(MDRequestRef& mdrequest, Server *s) :
-    Continuation(NULL), mdr(mdrequest), server(s) {}
 protected:
-  void _done() {
-    server->respond_to_request(mdr, get_rval());
-  }
+  Server *server;
   MDSInternalContextBase *get_internal_callback(int stage) {
     return new MDSInternalContextWrapper(server->mds, get_callback(stage));
   }
   MDSIOContextBase *get_io_callback(int stage) {
     return new MDSIOContextWrapper(server->mds, get_callback(stage));
   }
+public:
+  MDSContinuation(Server *s) :
+    Continuation(NULL), server(s) {}
 };
diff --git a/src/mds/MDSDaemon.cc b/src/mds/MDSDaemon.cc
index cf54e82..9fcff34 100644
--- a/src/mds/MDSDaemon.cc
+++ b/src/mds/MDSDaemon.cc
@@ -236,6 +236,12 @@ void MDSDaemon::set_up_admin_socket()
                                      asok_hook,
                                      "scrub an inode and output results");
   assert(r == 0);
+  r = admin_socket->register_command("tag path",
+                                     "tag path name=path,type=CephString"
+                                     " name=tag,type=CephString",
+                                     asok_hook,
+                                     "Apply scrub tag recursively");
  assert(r == 0);
   r = admin_socket->register_command("flush_path",
                                      "flush_path name=path,type=CephString",
                                      asok_hook,
@@ -328,6 +334,7 @@ const char** MDSDaemon::get_tracked_conf_keys() const
   static const char* KEYS[] = {
     "mds_op_complaint_time", "mds_op_log_threshold",
     "mds_op_history_size", "mds_op_history_duration",
+    "mds_enable_op_tracker",
     // clog & admin clog
     "clog_to_monitors",
     "clog_to_syslog",
@@ -357,6 +364,11 @@ void MDSDaemon::handle_conf_change(const struct md_config_t *conf,
                                                conf->mds_op_history_duration);
     }
   }
+  if (changed.count("mds_enable_op_tracker")) {
+    if (mds_rank) {
+      mds_rank->op_tracker.set_tracking(conf->mds_enable_op_tracker);
+    }
+  }
   if (changed.count("clog_to_monitors") ||
       changed.count("clog_to_syslog") ||
       changed.count("clog_to_syslog_level") ||
@@ -600,6 +612,12 @@ COMMAND("session kill " \
 COMMAND("cpu_profiler " \
 	"name=arg,type=CephChoices,strings=status|flush",
 	"run cpu profiling on daemon", "mds", "rw", "cli,rest")
+COMMAND("session ls " \
+	"name=filters,type=CephString,n=N,req=false",
+	"List client sessions", "mds", "r", "cli,rest")
+COMMAND("session evict " \
+	"name=filters,type=CephString,n=N,req=false",
+	"Evict client session(s)", "mds", "rw", "cli,rest")
 COMMAND("heap " \
 	"name=heapcmd,type=CephChoices,strings=dump|start_profiler|stop_profiler|release|stats", \
 	"show heap usage info (available only if compiled with tcmalloc)", \
@@ -737,6 +755,15 @@ int MDSDaemon::_handle_command(
     get_str_vec(arg, argvec);
     cpu_profiler_handle_command(argvec, ds);
   } else {
+    // Give MDSRank a shot at the command
+    if (mds_rank) {
+      bool handled = mds_rank->handle_command(cmdmap, inbl, &r, &ds, &ss);
+      if (handled) {
+        goto out;
+      }
+    }
+
+    // Neither MDSDaemon nor MDSRank knows this command
     std::ostringstream ss;
     ss << "unrecognized command! " << prefix;
     r = -EINVAL;
@@ -1260,6 +1287,7 @@ bool MDSDaemon::ms_verify_authorizer(Connection *con, int peer_type,
     // request to open a session (initial state of Session is `closed`)
     if (!s) {
       s = new Session;
+      s->info.auth_name = name;
       s->info.inst.addr = con->get_peer_addr();
       s->info.inst.name = n;
       dout(10) << " new session " << s << " for " << s->info.inst << " con " << con << dendl;
@@ -1286,8 +1314,8 @@ bool MDSDaemon::ms_verify_authorizer(Connection *con, int peer_type,
     }
 
     if (caps_info.allow_all) {
-        // Flag for auth providers that don't provide cap strings
-        s->auth_caps.set_allow_all();
+      // Flag for auth providers that don't provide cap strings
+      s->auth_caps.set_allow_all();
     }
 
     bufferlist::iterator p = caps_info.caps.begin();
@@ -1297,9 +1325,11 @@ bool MDSDaemon::ms_verify_authorizer(Connection *con, int peer_type,
 
       dout(10) << __func__ << ": parsing auth_cap_str='" << auth_cap_str << "'" << dendl;
       std::ostringstream errstr;
-      if (!s->auth_caps.parse(auth_cap_str, &errstr)) {
+      if (!s->auth_caps.parse(g_ceph_context, auth_cap_str, &errstr)) {
         dout(1) << __func__ << ": auth cap parse error: " << errstr.str()
-          << " parsing '" << auth_cap_str << "'" << dendl;
+		<< " parsing '" << auth_cap_str << "'" << dendl;
+	clog->warn() << name << " mds cap '" << auth_cap_str
+		     << "' does not parse: " << errstr.str() << "\n";
       }
     } catch (buffer::error& e) {
       // Assume legacy auth, defaults to:
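The _handle_command() change above establishes a two-level dispatch: the daemon tries its own table first, then gives MDSRank a shot, and only reports "unrecognized" if both decline. The shape of that pattern, reduced to stand-alone C++ (types, strings and the sample command are invented for illustration):

    #include <cerrno>
    #include <sstream>
    #include <string>

    struct Rank {
      // Returns true iff the command was recognized and handled.
      bool handle_command(const std::string &prefix,
                          int *r, std::stringstream *ss) {
        if (prefix == "session ls") {
          *r = 0;
          *ss << "[]";  // a real rank would dump the filtered sessions
          return true;
        }
        return false;   // unknown to the rank as well
      }
    };

    int daemon_handle_command(Rank *rank, const std::string &prefix,
                              std::stringstream *ss) {
      int r = 0;
      // Give the rank a shot at the command first...
      if (rank && rank->handle_command(prefix, &r, ss))
        return r;
      // ...otherwise neither layer knows it.
      *ss << "unrecognized command! " << prefix;
      return -EINVAL;
    }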
diff --git a/src/mds/MDSRank.cc b/src/mds/MDSRank.cc
index 7e2048d..fcb8efd 100644
--- a/src/mds/MDSRank.cc
+++ b/src/mds/MDSRank.cc
@@ -19,8 +19,6 @@
 #include "messages/MMDSMap.h"
 
 #include "MDSMap.h"
-//#include "MDS.h"
-#include "mds_table_types.h"
 #include "SnapClient.h"
 #include "SnapServer.h"
 #include "MDBalancer.h"
@@ -30,6 +28,7 @@
 #include "InoTable.h"
 #include "mon/MonClient.h"
 #include "common/HeartbeatMap.h"
+#include "ScrubStack.h"
 
 
 #include "MDSRank.h"
@@ -56,7 +55,8 @@ MDSRank::MDSRank(
     mdsmap(mdsmap_),
     objecter(objecter_),
     server(NULL), mdcache(NULL), locker(NULL), mdlog(NULL),
-    balancer(NULL), inotable(NULL), snapserver(NULL), snapclient(NULL),
+    balancer(NULL), scrubstack(NULL),
+    inotable(NULL), snapserver(NULL), snapclient(NULL),
     sessionmap(this), logger(NULL), mlogger(NULL),
     op_tracker(g_ceph_context, g_conf->mds_enable_op_tracker, 
                g_conf->osd_num_op_tracker_shard),
@@ -79,6 +79,8 @@ MDSRank::MDSRank(
   mdlog = new MDLog(this);
   balancer = new MDBalancer(this, messenger, monc);
 
+  scrubstack = new ScrubStack(mdcache, finisher);
+
   inotable = new InoTable(this);
   snapserver = new SnapServer(this, monc);
   snapclient = new SnapClient(this);
@@ -98,6 +100,7 @@ MDSRank::~MDSRank()
     g_ceph_context->get_heartbeat_map()->remove_worker(hb);
   }
 
+  if (scrubstack) { delete scrubstack; scrubstack = NULL; }
   if (mdcache) { delete mdcache; mdcache = NULL; }
   if (mdlog) { delete mdlog; mdlog = NULL; }
   if (balancer) { delete balancer; balancer = NULL; }
@@ -1651,12 +1654,14 @@ bool MDSRankDispatcher::handle_asok_command(
 {
   if (command == "dump_ops_in_flight" ||
              command == "ops") {
+    RWLock::RLocker l(op_tracker.lock);
     if (!op_tracker.tracking_enabled) {
       ss << "op_tracker tracking is not enabled";
     } else {
       op_tracker.dump_ops_in_flight(f);
     }
   } else if (command == "dump_historic_ops") {
+    RWLock::RLocker l(op_tracker.lock);
     if (!op_tracker.tracking_enabled) {
       ss << "op_tracker tracking is not enabled";
     } else {
@@ -1687,38 +1692,7 @@ bool MDSRankDispatcher::handle_asok_command(
     
     heartbeat_reset();
     
-    // Dump sessions, decorated with recovery/replay status
-    f->open_array_section("sessions");
-    const ceph::unordered_map<entity_name_t, Session*> session_map = sessionmap.get_sessions();
-    for (ceph::unordered_map<entity_name_t,Session*>::const_iterator p = session_map.begin();
-         p != session_map.end();
-         ++p)  {
-      if (!p->first.is_client()) {
-        continue;
-      }
-      
-      Session *s = p->second;
-      
-      f->open_object_section("session");
-      f->dump_int("id", p->first.num());
-      
-      f->dump_int("num_leases", s->leases.size());
-      f->dump_int("num_caps", s->caps.size());
-      
-      f->dump_string("state", s->get_state_name());
-      f->dump_int("replay_requests", is_clientreplay() ? s->get_request_count() : 0);
-      f->dump_unsigned("completed_requests", s->get_num_completed_requests());
-      f->dump_bool("reconnecting", server->waiting_for_reconnect(p->first.num()));
-      f->dump_stream("inst") << s->info.inst;
-      f->open_object_section("client_metadata");
-      for (map<string, string>::const_iterator i = s->info.client_metadata.begin();
-           i != s->info.client_metadata.end(); ++i) {
-        f->dump_string(i->first.c_str(), i->second);
-      }
-      f->close_section(); // client_metadata
-      f->close_section(); //session
-    }
-    f->close_section(); //sessions
+    dump_sessions(SessionFilter(), f);
     
     mds_lock.Unlock();
   } else if (command == "session evict") {
@@ -1743,6 +1717,12 @@ bool MDSRankDispatcher::handle_asok_command(
     string path;
     cmd_getval(g_ceph_context, cmdmap, "path", path);
     command_scrub_path(f, path);
+  } else if (command == "tag path") {
+    string path;
+    cmd_getval(g_ceph_context, cmdmap, "path", path);
+    string tag;
+    cmd_getval(g_ceph_context, cmdmap, "tag", tag);
+    command_tag_path(f, path, tag);
   } else if (command == "flush_path") {
     string path;
     cmd_getval(g_ceph_context, cmdmap, "path", path);
@@ -1792,7 +1772,84 @@ bool MDSRankDispatcher::handle_asok_command(
   return true;
 }
 
+/**
+ * This function drops the mds_lock, so don't do anything with
+ * MDSRank after calling it (we could have gone into shutdown): just
+ * send your result back to the calling client and finish.
+ */
+std::vector<entity_name_t> MDSRankDispatcher::evict_sessions(
+    const SessionFilter &filter)
+{
+  std::list<Session*> victims;
+
+  const auto sessions = sessionmap.get_sessions();
+  for (const auto &p : sessions) {
+    if (!p.first.is_client()) {
+      continue;
+    }
+
+    Session *s = p.second;
+
+    if (filter.match(*s, std::bind(&Server::waiting_for_reconnect, server, std::placeholders::_1))) {
+      victims.push_back(s);
+    }
+  }
+
+  std::vector<entity_name_t> result;
+
+  C_SaferCond on_safe;
+  C_GatherBuilder gather(g_ceph_context, &on_safe);
+  for (const auto s : victims) {
+    server->kill_session(s, gather.new_sub());
+    result.push_back(s->info.inst.name);
+  }
+  gather.activate();
+  mds_lock.Unlock();
+  on_safe.wait();
+  mds_lock.Lock();
+
+  return result;
+}
+
+void MDSRankDispatcher::dump_sessions(const SessionFilter &filter, Formatter *f) const
+{
+  // Dump sessions, decorated with recovery/replay status
+  f->open_array_section("sessions");
+  const ceph::unordered_map<entity_name_t, Session*> session_map = sessionmap.get_sessions();
+  for (ceph::unordered_map<entity_name_t,Session*>::const_iterator p = session_map.begin();
+       p != session_map.end();
+       ++p)  {
+    if (!p->first.is_client()) {
+      continue;
+    }
 
+    Session *s = p->second;
+
+    if (!filter.match(*s, std::bind(&Server::waiting_for_reconnect, server, std::placeholders::_1))) {
+      continue;
+    }
+    
+    f->open_object_section("session");
+    f->dump_int("id", p->first.num());
+    
+    f->dump_int("num_leases", s->leases.size());
+    f->dump_int("num_caps", s->caps.size());
+    
+    f->dump_string("state", s->get_state_name());
+    f->dump_int("replay_requests", is_clientreplay() ? s->get_request_count() : 0);
+    f->dump_unsigned("completed_requests", s->get_num_completed_requests());
+    f->dump_bool("reconnecting", server->waiting_for_reconnect(p->first.num()));
+    f->dump_stream("inst") << s->info.inst;
+    f->open_object_section("client_metadata");
+    for (map<string, string>::const_iterator i = s->info.client_metadata.begin();
+         i != s->info.client_metadata.end(); ++i) {
+      f->dump_string(i->first.c_str(), i->second);
+    }
+    f->close_section(); // client_metadata
+    f->close_section(); //session
+  }
+  f->close_section(); //sessions
+}
 
 void MDSRank::command_scrub_path(Formatter *f, const string& path)
 {
@@ -1805,6 +1862,17 @@ void MDSRank::command_scrub_path(Formatter *f, const string& path)
   // scrub_dentry() finishers will dump the data for us; we're done!
 }
 
+void MDSRank::command_tag_path(Formatter *f,
+    const string& path, const std::string &tag)
+{
+  C_SaferCond scond;
+  {
+    Mutex::Locker l(mds_lock);
+    mdcache->enqueue_scrub(path, tag, f, &scond);
+  }
+  scond.wait();
+}
+
 void MDSRank::command_flush_path(Formatter *f, const string& path)
 {
   C_SaferCond scond;
@@ -2403,3 +2471,50 @@ MDSRankDispatcher::MDSRankDispatcher(
       msgr, monc_, objecter_, respawn_hook_, suicide_hook_)
 {}
 
+bool MDSRankDispatcher::handle_command(
+  const cmdmap_t &cmdmap,
+  bufferlist const &inbl,
+  int *r,
+  std::stringstream *ds,
+  std::stringstream *ss)
+{
+  assert(r != nullptr);
+  assert(ds != nullptr);
+  assert(ss != nullptr);
+
+  std::string prefix;
+  cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
+
+  if (prefix == "session ls") {
+    std::vector<std::string> filter_args;
+    cmd_getval(g_ceph_context, cmdmap, "filters", filter_args);
+
+    SessionFilter filter;
+    *r = filter.parse(filter_args, ss);
+    if (*r != 0) {
+      return true;
+    }
+
+    Formatter *f = new JSONFormatter();
+    dump_sessions(filter, f);
+    f->flush(*ds);
+    delete f;
+    return true;
+  } else if (prefix == "session evict") {
+    std::vector<std::string> filter_args;
+    cmd_getval(g_ceph_context, cmdmap, "filters", filter_args);
+
+    SessionFilter filter;
+    *r = filter.parse(filter_args, ss);
+    if (*r != 0) {
+      return true;
+    }
+
+    evict_sessions(filter);
+
+    return true;
+  } else {
+    return false;
+  }
+}
+
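The evict_sessions() hunk above leans on Ceph's C_GatherBuilder/C_SaferCond pair to block until every targeted session is torn down: each kill_session() gets one sub-context, and activate() arms the barrier so the final completion fires exactly once. A stand-alone sketch of that barrier idiom in plain C++11 (names invented; this is not the Ceph implementation):

    #include <condition_variable>
    #include <functional>
    #include <mutex>

    class Gather {
      std::mutex m;
      std::condition_variable cv;
      int outstanding = 0;
      bool activated = false;
    public:
      // One callable per async op, like C_GatherBuilder::new_sub().
      std::function<void()> new_sub() {
        std::lock_guard<std::mutex> l(m);
        ++outstanding;
        return [this] {
          std::lock_guard<std::mutex> l(m);
          if (--outstanding == 0 && activated)
            cv.notify_all();
        };
      }
      // Like gather.activate() followed by on_safe.wait().
      void activate_and_wait() {
        std::unique_lock<std::mutex> l(m);
        activated = true;
        cv.wait(l, [this] { return outstanding == 0; });
      }
    };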
diff --git a/src/mds/MDSRank.h b/src/mds/MDSRank.h
index b999657..8d2667b 100644
--- a/src/mds/MDSRank.h
+++ b/src/mds/MDSRank.h
@@ -107,6 +107,7 @@ class Objecter;
 class MonClient;
 class Finisher;
 class MMDSMap;
+class ScrubStack;
 
 /**
  * The public part of this class's interface is what's exposed to all
@@ -151,6 +152,7 @@ class MDSRank {
     Locker       *locker;
     MDLog        *mdlog;
     MDBalancer   *balancer;
+    ScrubStack   *scrubstack;
 
     InoTable     *inotable;
 
@@ -176,21 +178,21 @@ class MDSRank {
     MDSMap::DaemonState get_state() const { return state; } 
     MDSMap::DaemonState get_want_state() const { return beacon.get_want_state(); } 
 
-    bool is_creating() { return state == MDSMap::STATE_CREATING; }
-    bool is_starting() { return state == MDSMap::STATE_STARTING; }
-    bool is_standby()  { return state == MDSMap::STATE_STANDBY; }
-    bool is_replay()   { return state == MDSMap::STATE_REPLAY; }
-    bool is_standby_replay() { return state == MDSMap::STATE_STANDBY_REPLAY; }
-    bool is_resolve()  { return state == MDSMap::STATE_RESOLVE; }
-    bool is_reconnect() { return state == MDSMap::STATE_RECONNECT; }
-    bool is_rejoin()   { return state == MDSMap::STATE_REJOIN; }
-    bool is_clientreplay()   { return state == MDSMap::STATE_CLIENTREPLAY; }
-    bool is_active()   { return state == MDSMap::STATE_ACTIVE; }
-    bool is_stopping() { return state == MDSMap::STATE_STOPPING; }
-    bool is_oneshot_replay()   { return state == MDSMap::STATE_ONESHOT_REPLAY; }
-    bool is_any_replay() { return (is_replay() || is_standby_replay() ||
+    bool is_creating() const { return state == MDSMap::STATE_CREATING; }
+    bool is_starting() const { return state == MDSMap::STATE_STARTING; }
+    bool is_standby() const { return state == MDSMap::STATE_STANDBY; }
+    bool is_replay() const { return state == MDSMap::STATE_REPLAY; }
+    bool is_standby_replay() const { return state == MDSMap::STATE_STANDBY_REPLAY; }
+    bool is_resolve() const { return state == MDSMap::STATE_RESOLVE; }
+    bool is_reconnect() const { return state == MDSMap::STATE_RECONNECT; }
+    bool is_rejoin() const { return state == MDSMap::STATE_REJOIN; }
+    bool is_clientreplay() const { return state == MDSMap::STATE_CLIENTREPLAY; }
+    bool is_active() const { return state == MDSMap::STATE_ACTIVE; }
+    bool is_stopping() const { return state == MDSMap::STATE_STOPPING; }
+    bool is_oneshot_replay() const { return state == MDSMap::STATE_ONESHOT_REPLAY; }
+    bool is_any_replay() const { return (is_replay() || is_standby_replay() ||
         is_oneshot_replay()); }
-    bool is_stopped()  { return mdsmap->is_stopped(whoami); }
+    bool is_stopped() const { return mdsmap->is_stopped(whoami); }
 
     void handle_write_error(int err);
 
@@ -366,6 +368,8 @@ class MDSRank {
 
   protected:
     void command_scrub_path(Formatter *f, const string& path);
+    void command_tag_path(Formatter *f, const string& path,
+                          const string &tag);
     void command_flush_path(Formatter *f, const string& path);
     void command_flush_journal(Formatter *f);
     void command_get_subtrees(Formatter *f);
@@ -490,6 +494,18 @@ public:
   void update_log_config();
   bool handle_command_legacy(std::vector<std::string> args);
 
+  bool handle_command(
+    const cmdmap_t &cmdmap,
+    bufferlist const &inbl,
+    int *r,
+    std::stringstream *ds,
+    std::stringstream *ss);
+
+  void dump_sessions(
+      const SessionFilter &filter, Formatter *f) const;
+  std::vector<entity_name_t> evict_sessions(
+      const SessionFilter &filter);
+
   // Call into me from MDS::ms_dispatch
   bool ms_dispatch(Message *m);
 
diff --git a/src/mds/Makefile-server.am b/src/mds/Makefile-server.am
index ee3daed..951bb89 100644
--- a/src/mds/Makefile-server.am
+++ b/src/mds/Makefile-server.am
@@ -36,6 +36,8 @@ noinst_HEADERS += \
 	mds/Mutation.h \
 	mds/Migrator.h \
 	mds/ScatterLock.h \
+	mds/ScrubStack.h \
+	mds/ScrubHeader.h \
 	mds/Server.h \
 	mds/SessionMap.h \
 	mds/SimpleLock.h \
diff --git a/src/mds/Makefile.am b/src/mds/Makefile.am
index c7b0307..f3904e1 100644
--- a/src/mds/Makefile.am
+++ b/src/mds/Makefile.am
@@ -23,14 +23,14 @@ LIBMDS_SOURCES = \
 	mds/MDSTableClient.cc \
 	mds/MDSTableServer.cc \
 	mds/SimpleLock.cc \
+	mds/ScrubStack.cc \
 	mds/SnapRealm.cc \
 	mds/SnapServer.cc \
 	mds/snap.cc \
 	mds/SessionMap.cc \
 	mds/MDSContext.cc \
 	mds/MDSAuthCaps.cc \
-	mds/MDLog.cc \
-	common/TrackedOp.cc
+	mds/MDLog.cc
 LIBMDS_DEPS = $(LIBOSDC)
 
 if ENABLE_CLIENT
diff --git a/src/mds/ScrubHeader.h b/src/mds/ScrubHeader.h
new file mode 100644
index 0000000..288f3a3
--- /dev/null
+++ b/src/mds/ScrubHeader.h
@@ -0,0 +1,23 @@
+
+#ifndef SCRUB_HEADER_H_
+#define SCRUB_HEADER_H_
+
+class CDentry;
+
+/**
+ * Externally input parameters for a scrub, associated with the root
+ * of where we are doing a recursive scrub
+ *
+ * TODO: swallow up 'recurse' and 'children' settings here instead of
+ * passing them down into every scrub_info structure
+ */
+class ScrubHeader {
+public:
+  std::string tag;
+  CDentry *origin;
+};
+typedef ceph::shared_ptr<ScrubHeader> ScrubHeaderRef;
+typedef ceph::shared_ptr<const ScrubHeader> ScrubHeaderRefConst;
+
+#endif // SCRUB_HEADER_H_
+
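The new ScrubHeader carries the user-supplied tag from the "tag path" command down through every dentry the recursive scrub visits, shared via ScrubHeaderRef. A hedged sketch of how an initiator might populate it, using the enqueue_dentry_top() API shown below in ScrubStack.h (MDCache::enqueue_scrub, where this actually happens, is outside the hunks shown):

    #include "mds/ScrubHeader.h"
    #include "mds/ScrubStack.h"

    void start_tagged_scrub(ScrubStack *scrubstack, CDentry *dn,
                            const std::string &tag,
                            MDSInternalContextBase *on_finish)
    {
      ScrubHeaderRef header(new ScrubHeader());
      header->tag = tag;       // e.g. the "tag path" admin-socket argument
      header->origin = dn;     // root of the recursive scrub
      scrubstack->enqueue_dentry_top(dn, true /*recursive*/,
                                     false /*children*/, header, on_finish);
    }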
diff --git a/src/mds/ScrubStack.cc b/src/mds/ScrubStack.cc
new file mode 100644
index 0000000..0d99beb
--- /dev/null
+++ b/src/mds/ScrubStack.cc
@@ -0,0 +1,447 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <iostream>
+
+#include "ScrubStack.h"
+#include "common/Finisher.h"
+#include "mds/MDSRank.h"
+#include "mds/MDCache.h"
+#include "mds/MDSContinuation.h"
+
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, scrubstack->mdcache->mds)
+static ostream& _prefix(std::ostream *_dout, MDSRank *mds) {
+  return *_dout << "mds." << mds->get_nodeid() << ".scrubstack ";
+}
+
+void ScrubStack::push_dentry(CDentry *dentry)
+{
+  dout(20) << "pushing " << *dentry << " on top of ScrubStack" << dendl;
+  if (!dentry->item_scrub.is_on_list()) {
+    dentry->get(CDentry::PIN_SCRUBQUEUE);
+    stack_size++;
+  }
+  dentry_stack.push_front(&dentry->item_scrub);
+}
+
+void ScrubStack::push_dentry_bottom(CDentry *dentry)
+{
+  dout(20) << "pushing " << *dentry << " on bottom of ScrubStack" << dendl;
+  if (!dentry->item_scrub.is_on_list()) {
+    dentry->get(CDentry::PIN_SCRUBQUEUE);
+    stack_size++;
+  }
+  dentry_stack.push_back(&dentry->item_scrub);
+}
+
+void ScrubStack::pop_dentry(CDentry *dn)
+{
+  dout(20) << "popping " << *dn
+          << " off of ScrubStack" << dendl;
+  assert(dn->item_scrub.is_on_list());
+  dn->put(CDentry::PIN_SCRUBQUEUE);
+  dn->item_scrub.remove_myself();
+  stack_size--;
+}
+
+void ScrubStack::_enqueue_dentry(CDentry *dn, CDir *parent, bool recursive,
+    bool children, ScrubHeaderRefConst header,
+    MDSInternalContextBase *on_finish, bool top)
+{
+  dout(10) << __func__ << " with {" << *dn << "}"
+           << ", recursive=" << recursive << ", children=" << children
+           << ", on_finish=" << on_finish << ", top=" << top << dendl;
+  assert(mdcache->mds->mds_lock.is_locked_by_me());
+  dn->scrub_initialize(parent, recursive, children, header, on_finish);
+  if (top)
+    push_dentry(dn);
+  else
+    push_dentry_bottom(dn);
+}
+
+void ScrubStack::enqueue_dentry(CDentry *dn, bool recursive, bool children,
+                                ScrubHeaderRefConst header,
+                                 MDSInternalContextBase *on_finish, bool top)
+{
+  _enqueue_dentry(dn, NULL, recursive, children, header, on_finish, top);
+  kick_off_scrubs();
+}
+
+void ScrubStack::kick_off_scrubs()
+{
+  dout(20) << __func__ << " entering with " << scrubs_in_progress << " in "
+              "progress and " << stack_size << " in the stack" << dendl;
+  bool can_continue = true;
+  elist<CDentry*>::iterator i = dentry_stack.begin();
+  while (g_conf->mds_max_scrub_ops_in_progress > scrubs_in_progress &&
+      can_continue && !i.end()) {
+    CDentry *cur = *i;
+
+    dout(20) << __func__ << " examining dentry " << *cur << dendl;
+
+    CInode *curi = cur->get_projected_inode();
+    ++i; // we have our reference, push iterator forward
+
+    if (!curi->is_dir()) {
+      // it's a regular file, symlink, or hard link
+      pop_dentry(cur); // we only touch it this once, so remove from stack
+
+      if (curi->is_file()) {
+	if (!cur->scrub_info()->on_finish) {
+	  scrubs_in_progress++;
+	  cur->scrub_set_finisher(&scrub_kick);
+	}
+        scrub_file_dentry(cur);
+        can_continue = true;
+      } else {
+        // drat, we don't do anything with these yet :(
+        dout(5) << "skipping scrub on non-dir, non-file dentry "
+                << *cur << dendl;
+	Context *c = NULL;
+	cur->scrub_finished(&c);
+	assert(c == NULL);
+      }
+    } else {
+      bool completed; // it's done, so pop it off the stack
+      bool terminal; // not done, but we can start ops on other directories
+      bool progress; // it added new dentries to the top of the stack
+      scrub_dir_dentry(cur, &progress, &terminal, &completed);
+      if (completed) {
+        dout(20) << __func__ << " dir completed" << dendl;
+        pop_dentry(cur);
+      } else if (progress) {
+        dout(20) << __func__ << " dir progressed" << dendl;
+        // we added new stuff to top of stack, so reset ourselves there
+        i = dentry_stack.begin();
+      } else {
+        dout(20) << __func__ << " dir no-op" << dendl;
+      }
+
+      can_continue = progress || terminal || completed;
+    }
+  }
+}
+
+void ScrubStack::scrub_dir_dentry(CDentry *dn,
+                                  bool *added_children,
+                                  bool *terminal,
+                                  bool *done)
+{
+  assert(dn != NULL);
+  dout(10) << __func__ << " " << *dn << dendl;
+
+  if (!dn->scrub_info()->scrub_children &&
+      !dn->scrub_info()->scrub_recursive) {
+    // TODO: we have to scrub the local dentry/inode, but nothing else
+  }
+
+  *added_children = false;
+  *terminal = false;
+  *done = false;
+
+  CInode *in = dn->get_projected_inode();
+  // FIXME: greg -- is get_version the appropriate version?  (i.e. is scrub_version
+  // meant to be an actual version that we're scrubbing, or something else?)
+  if (!in->scrub_info()->scrub_in_progress) {
+    // We may come through here more than once on our way up and down
+    // the stack... or actually is that right?  Should we perhaps
+    // only see ourselves once on the way down and once on the way
+    // back up again, and not do this?
+    in->scrub_initialize(in->get_version());
+  }
+
+  list<frag_t> scrubbing_frags;
+  list<CDir*> scrubbing_cdirs;
+  in->scrub_dirfrags_scrubbing(&scrubbing_frags);
+  dout(20) << __func__ << " iterating over " << scrubbing_frags.size()
+    << " scrubbing frags" << dendl;
+  for (list<frag_t>::iterator i = scrubbing_frags.begin();
+      i != scrubbing_frags.end();
+      ++i) {
+    // turn frags into CDir *
+    CDir *dir = in->get_dirfrag(*i);
+    scrubbing_cdirs.push_back(dir);
+    dout(25) << __func__ << " got CDir " << *dir << " presently scrubbing" << dendl;
+  }
+
+
+  dout(20) << __func__ << " consuming from " << scrubbing_cdirs.size()
+    << " scrubbing cdirs" << dendl;
+
+  list<CDir*>::iterator i = scrubbing_cdirs.begin();
+  bool all_frags_terminal = true;
+  bool all_frags_done = true;
+  while (g_conf->mds_max_scrub_ops_in_progress > scrubs_in_progress) {
+    // select next CDir
+    CDir *cur_dir = NULL;
+    if (i != scrubbing_cdirs.end()) {
+      cur_dir = *i;
+      ++i;
+      dout(20) << __func__ << " got cur_dir = " << *cur_dir << dendl;
+    } else {
+      bool ready = get_next_cdir(in, &cur_dir);
+      dout(20) << __func__ << " get_next_cdir ready=" << ready << dendl;
+      if (cur_dir) {
+        cur_dir->scrub_initialize();
+      }
+
+      if (ready && cur_dir) {
+        scrubbing_cdirs.push_back(cur_dir);
+      } else if (!ready) {
+        // We are waiting for load of a frag
+        all_frags_done = false;
+        all_frags_terminal = false;
+        break;
+      } else {
+        // Finished with all frags
+        break;
+      }
+    }
+    // scrub that CDir
+    bool frag_added_children = false;
+    bool frag_terminal = true;
+    bool frag_done = false;
+    scrub_dirfrag(cur_dir, &frag_added_children, &frag_terminal, &frag_done);
+    if (frag_done) {
+      // FIXME is this right?  Can we end up hitting this more than
+      // once and is that a problem?
+      cur_dir->inode->scrub_dirfrag_finished(cur_dir->frag);
+    }
+    *added_children |= frag_added_children;
+    all_frags_terminal = all_frags_terminal && frag_terminal;
+    all_frags_done = all_frags_done && frag_done;
+  }
+
+  dout(20) << "finished looping; all_frags_terminal=" << all_frags_terminal
+           << ", all_frags_done=" << all_frags_done << dendl;
+  if (all_frags_done) {
+    assert (!*added_children); // can't do this if children are still pending
+
+    if (!dn->scrub_info()->on_finish) {
+      scrubs_in_progress++;
+      dn->scrub_set_finisher(&scrub_kick);
+    }
+
+    // OK, so now I can... fire off a validate on the dir inode, and
+    // when it completes, come through here again, noticing that we've
+    // set a flag to indicate that the validate happened, and finish up.
+    scrub_dir_dentry_final(dn);
+  }
+
+  *terminal = all_frags_terminal;
+  *done = all_frags_done;
+  dout(10) << __func__ << " is exiting " << *terminal << " " << *done << dendl;
+  return;
+}
+
+bool ScrubStack::get_next_cdir(CInode *in, CDir **new_dir)
+{
+  dout(20) << __func__ << " on " << *in << dendl;
+  frag_t next_frag;
+  int r = in->scrub_dirfrag_next(&next_frag);
+  assert (r >= 0);
+
+  if (r == 0) {
+    // we got a frag to scrub, otherwise it would be ENOENT
+    dout(25) << "looking up new frag " << next_frag << dendl;
+    CDir *next_dir = in->get_or_open_dirfrag(mdcache, next_frag);
+    if (!next_dir->is_complete()) {
+      scrubs_in_progress++;
+      next_dir->fetch(&scrub_kick);
+      dout(25) << "fetching frag from RADOS" << dendl;
+      return false;
+    }
+    *new_dir = next_dir;
+    dout(25) << "returning dir " << *new_dir << dendl;
+    return true;
+  }
+  assert(r == ENOENT);
+  // there are no dirfrags left
+  *new_dir = NULL;
+  return true;
+}
+
+class C_InodeValidated : public MDSInternalContext
+{
+  public:
+    ScrubStack *stack;
+    CInode::validated_data result;
+    CDentry *target;
+
+    C_InodeValidated(MDSRank *mds, ScrubStack *stack_, CDentry *target_)
+      : MDSInternalContext(mds), stack(stack_), target(target_)
+    {}
+
+    void finish(int r)
+    {
+      stack->_validate_inode_done(target, r, result);
+    }
+};
+
+
+void ScrubStack::scrub_dir_dentry_final(CDentry *dn)
+{
+  dout(20) << __func__ << " " << *dn << dendl;
+
+  // Two passes through this function.  First one triggers inode validation,
+  // second one sets finally_done
+  // FIXME: kind of overloading scrub_in_progress here, using it while
+  // dentry is still on stack to indicate that we have finished
+  // doing our validate_disk_state on the inode
+  // FIXME: the magic-constructing scrub_info() is going to leave
+  // an unneeded scrub_infop lying around here
+  if (!dn->scrub_info()->dentry_children_done) {
+    dn->scrub_children_finished();
+    CInode *in = dn->get_projected_inode();
+    C_InodeValidated *fin = new C_InodeValidated(mdcache->mds, this, dn);
+    MDRequestRef null_mdr;
+    in->validate_disk_state(&fin->result, null_mdr, fin);
+  }
+
+  return;
+}
+
+void ScrubStack::scrub_dirfrag(CDir *dir, bool *added_children,
+                               bool *is_terminal, bool *done)
+{
+  assert(dir != NULL);
+
+  dout(20) << __func__ << " on " << *dir << dendl;
+  *added_children = false;
+  *is_terminal = false;
+  *done = false;
+
+  if (!dir->scrub_info()->directory_scrubbing) {
+    // Get the frag complete before calling
+    // scrub initialize, so that it can populate its lists
+    // of dentries.
+    if (!dir->is_complete()) {
+      scrubs_in_progress++;
+      dir->fetch(&scrub_kick);
+      return;
+    }
+
+    dir->scrub_initialize();
+  }
+
+  int r = 0;
+  while (r == 0) {
+    CDentry *dn = NULL;
+    scrubs_in_progress++;
+    r = dir->scrub_dentry_next(&scrub_kick, &dn);
+    if (r != EAGAIN) {
+      // ctx only used by scrub_dentry_next in EAGAIN case
+      // FIXME It's kind of annoying to keep allocating and deleting a ctx here
+      scrubs_in_progress--;
+    }
+
+    if (r == EAGAIN) {
+      // Drop out, CDir fetcher will call back our kicker context
+      dout(20) << __func__ << " waiting for fetch on " << *dir << dendl;
+      return;
+    }
+
+    if (r == ENOENT) {
+      // Nothing left to scrub, are we done?
+      std::list<CDentry*> scrubbing;
+      dir->scrub_dentries_scrubbing(&scrubbing);
+      if (scrubbing.empty()) {
+        dout(20) << __func__ << " dirfrag done: " << *dir << dendl;
+        // FIXME: greg: What's the diff meant to be between done and terminal
+	dir->scrub_finished();
+        *done = true;
+        *is_terminal = true;
+      } else {
+        dout(20) << __func__ << " " << scrubbing.size() << " dentries still "
+                   "scrubbing in " << *dir << dendl;
+      }
+      return;
+    }
+
+    if (r < 0) {
+      // FIXME: how can I handle an error here?  I can't hold someone up
+      // forever, but I can't say "sure you're scrubbed"
+      //  -- should change scrub_dentry_next definition to never
+      //  give out IO errors (handle them some other way)
+      derr << __func__ << " error from scrub_dentry_next: "
+           << r << dendl;
+      return;
+    }
+
+    // scrub_dentry_next is defined to return only a negative error,
+    // EAGAIN, ENOENT, or 0 -- we should never get random IO errors here.
+    assert(r == 0);
+
+    CDentry *parent_dn = dir->get_inode()->get_parent_dn();
+    ScrubHeaderRefConst header = parent_dn->scrub_info()->header;
+
+    // FIXME: Do I *really* need to construct a kick context for every
+    // single dentry I'm going to scrub?
+    _enqueue_dentry(dn,
+        dir,
+        parent_dn->scrub_info()->scrub_recursive,
+        false,  // We are already recursing so scrub_children not meaningful
+        header,
+	NULL,
+        true);
+
+    *added_children = true;
+  }
+}
+
+void ScrubStack::scrub_file_dentry(CDentry *dn)
+{
+  assert(dn->get_linkage()->get_inode() != NULL);
+
+  CInode *in = dn->get_projected_inode();
+  C_InodeValidated *fin = new C_InodeValidated(mdcache->mds, this, dn);
+
+  // At this stage the DN is already past scrub_initialize, so
+  // it's in the cache, it has PIN_SCRUBQUEUE and it is authpinned
+  MDRequestRef null_mdr;
+  in->validate_disk_state(&fin->result, null_mdr, fin);
+}
+
+void ScrubStack::_validate_inode_done(CDentry *dn, int r,
+    const CInode::validated_data &result)
+{
+  // FIXME: do something real with result!  DamageTable!  Spamming
+  // the cluster log for debugging purposes
+  LogChannelRef clog = mdcache->mds->clog;
+  clog->info() << __func__ << " " << *dn << " r=" << r;
+#if 0
+  assert(dn->scrub_info_p != NULL);
+  dn->scrub_info_p->inode_validated = true;
+#endif
+
+  Context *c = NULL;
+  CInode *in = dn->get_projected_inode();
+  if (in->is_dir()) {
+    // For directories, inodes undergo a scrub_init/scrub_finish cycle
+    in->scrub_finished(&c);
+  } else {
+    // For regular files, we never touch the scrub_info on the inode,
+    // just the dentry.
+    dn->scrub_finished(&c);
+  }
+  if (c) {
+    finisher->queue(new MDSIOContextWrapper(mdcache->mds, c), 0);
+  }
+}
+
+ScrubStack::C_KickOffScrubs::C_KickOffScrubs(MDCache *mdcache, ScrubStack *s)
+  : MDSInternalContext(mdcache->mds), stack(s) { }
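kick_off_scrubs() above is a budgeted loop: it keeps starting work while the number of scrubs in flight stays under mds_max_scrub_ops_in_progress and the last item reported progress. A stand-alone sketch of that shape in plain C++ (the real loop walks an intrusive elist of CDentry and resets its iterator when children are pushed; here, for simplicity, step() must not mutate the list):

    #include <functional>
    #include <list>

    struct WorkItem {
      std::function<bool()> step;  // returns true if it made progress
    };

    void kick(std::list<WorkItem> &stack, int in_progress, int max_ops)
    {
      bool can_continue = true;
      auto i = stack.begin();
      while (in_progress < max_ops && can_continue && i != stack.end()) {
        WorkItem &cur = *i;
        ++i;                        // advance before working, as the real
        can_continue = cur.step();  // loop does with its elist iterator
      }
    }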
diff --git a/src/mds/ScrubStack.h b/src/mds/ScrubStack.h
new file mode 100644
index 0000000..b01ee84
--- /dev/null
+++ b/src/mds/ScrubStack.h
@@ -0,0 +1,201 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef SCRUBSTACK_H_
+#define SCRUBSTACK_H_
+
+#include "CDir.h"
+#include "CDentry.h"
+#include "CInode.h"
+#include "MDSContext.h"
+#include "ScrubHeader.h"
+
+#include "include/elist.h"
+
+class MDCache;
+class Finisher;
+
+class ScrubStack {
+protected:
+  /// A finisher needed so that we don't re-enter kick_off_scrubs
+  Finisher *finisher;
+
+  /// The stack of dentries we want to scrub
+  elist<CDentry*> dentry_stack;
+  /// current number of dentries we're actually scrubbing
+  int scrubs_in_progress;
+  ScrubStack *scrubstack; // hack for dout
+  int stack_size;
+
+  class C_KickOffScrubs : public MDSInternalContext {
+    ScrubStack *stack;
+  public:
+    C_KickOffScrubs(MDCache *mdcache, ScrubStack *s);
+    void finish(int r) { }
+    void complete(int r) {
+      stack->scrubs_in_progress--;
+      stack->kick_off_scrubs();
+      // don't delete self
+    }
+  };
+  C_KickOffScrubs scrub_kick;
+
+public:
+  MDCache *mdcache;
+  ScrubStack(MDCache *mdc, Finisher *finisher_) :
+    finisher(finisher_),
+    dentry_stack(member_offset(CDentry, item_scrub)),
+    scrubs_in_progress(0),
+    scrubstack(this),
+    stack_size(0),
+    scrub_kick(mdc, this),
+    mdcache(mdc) {}
+  ~ScrubStack() {
+    assert(dentry_stack.empty());
+    assert(!scrubs_in_progress);
+  }
+  /**
+   * Put a dentry on the top of the scrub stack, so it is the highest priority.
+   * If there are other scrubs in progress, they will not continue scrubbing new
+   * entries until this one is completed.
+   * @param dn The dentry to scrub
+   * @param recursive True if we want to recursively scrub the
+   * entire hierarchy under dn.
+   * @param children True if we want to scrub the direct children of
+   * dn but aren't doing a recursive scrub. (Otherwise, all checks are
+   * local to dn's disk state.)
+   * @param header The ScrubHeader propagated from wherever this scrub
+   *               was initiated
+   */
+  void enqueue_dentry_top(CDentry *dn, bool recursive, bool children,
+                          ScrubHeaderRefConst header,
+                          MDSInternalContextBase *on_finish) {
+    enqueue_dentry(dn, recursive, children, header, on_finish, true);
+  }
+  /** Like enqueue_dentry_top, but we wait for all pending scrubs before
+   * starting this one.
+   */
+  void enqueue_dentry_bottom(CDentry *dn, bool recursive, bool children,
+                             ScrubHeaderRefConst header,
+                             MDSInternalContextBase *on_finish) {
+    enqueue_dentry(dn, recursive, children, header, on_finish, false);
+  }
+
+private:
+  /**
+   * Put the dentry at either the top or bottom of the stack, with
+   * the given scrub params, and then try and kick off more scrubbing.
+   */
+  void enqueue_dentry(CDentry *dn, bool recursive, bool children,
+                      ScrubHeaderRefConst header,
+                      MDSInternalContextBase *on_finish, bool top);
+  void _enqueue_dentry(CDentry *dn, CDir *parent, bool recursive, bool children,
+                      ScrubHeaderRefConst header,
+                       MDSInternalContextBase *on_finish, bool top);
+  /**
+   * Kick off as many scrubs as are appropriate, based on the current
+   * state of the stack.
+   */
+  void kick_off_scrubs();
+  /**
+   * Push a dentry on top of the stack.
+   */
+  inline void push_dentry(CDentry *dentry);
+  /**
+   * Push a dentry to the bottom of the stack.
+   */
+  inline void push_dentry_bottom(CDentry *dentry);
+  /**
+   * Pop the given dentry off the stack.
+   */
+  inline void pop_dentry(CDentry *dn);
+
+  /**
+   * Scrub a file-representing dentry.
+   * @param dn The dentry to scrub
+   * @pre dn->get_projected_inode()->is_file()==true;
+   */
+  void scrub_file_dentry(CDentry *dn);
+
+  /**
+   * Callback from completion of CInode::validate_disk_state
+   * @param dn The dentry owning the inode we were validating
+   * @param r The return status from validate_disk_state
+   * @param result Populated results from validate_disk_state
+   */
+  void _validate_inode_done(CDentry *dn, int r,
+    const CInode::validated_data &result);
+  friend class C_InodeValidated;
+
+  /**
+   * Make progress on scrubbing a directory-representing dirfrag and
+   * its children.
+   *
+   * 1) Select the next dirfrag which hasn't been scrubbed, and make progress
+   * on it if possible.
+   *
+   * 2) If not, move on to the next dirfrag and start it up, if any.
+   *
+   * 3) If waiting for results from dirfrag scrubs, do nothing.
+   *
+   * 4) If all dirfrags have been scrubbed, scrub my inode.
+   *
+   * @param dn The CDentry to scrub as a directory
+   * @param added_children set to true if we pushed some of our children
+   * onto the ScrubStack
+   * @param is_terminal set to true if there are no descendant dentries
+   * remaining to start scrubbing.
+   * @param done set to true if we and all our children have finished scrubbing
+   */
+  void scrub_dir_dentry(CDentry *dn, bool *added_children, bool *is_terminal,
+                        bool *done);
+  /**
+   * Make progress on scrubbing a dirfrag. It may return after each of the
+   * following steps, but will report making progress on each one.
+   *
+   * 1) enqueues the next unscrubbed child directory dentry at the
+   * top of the stack.
+   *
+   * 2) Initiates a scrub on the next unscrubbed file dentry
+   *
+   * If scrubs are currently in progress on child dentries and there are
+   * no more child dentries left to start, invoking this function will
+   * report no progress.  Try again later.
+   *
+   */
+  void scrub_dirfrag(CDir *dir, bool *added_children, bool *is_terminal,
+		     bool *done);
+  /**
+   * Scrub a directory-representing dentry.
+   *
+   * @param dn The CDentry of the directory we're doing final scrub on.
+   */
+  void scrub_dir_dentry_final(CDentry *dn);
+
+  /**
+   * Get a CDir into memory, and return it if it's already complete.
+   * Otherwise, fetch it and kick off scrubbing when done.
+   *
+   * @param in The Inode to get the next directory from
+   * @param new_dir The CDir we're returning to you. NULL if
+   * not ready yet or there aren't any.
+   * @returns false if you have to wait for a fetch; true if there is
+   * nothing to wait for (either *new_dir is set, or this inode has no
+   * dirfrags left).
+   */
+  bool get_next_cdir(CInode *in, CDir **new_dir);
+
+};
+
+#endif /* SCRUBSTACK_H_ */
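get_next_cdir() packs three outcomes into a bool plus an out-pointer, which is easy to misread. Restated as a stand-alone helper (names invented for illustration):

    enum class NextDir { Ready, Wait, Exhausted };

    template <typename Dir>
    NextDir classify_next(bool ready, Dir *out)
    {
      if (!ready)
        return NextDir::Wait;        // a fetch is in flight; the kicker
                                     // context will call back later
      if (out == nullptr)
        return NextDir::Exhausted;   // no dirfrags left in this inode
      return NextDir::Ready;         // *out can be scrubbed right away
    }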
diff --git a/src/mds/Server.cc b/src/mds/Server.cc
index eebb836..dbba8c6 100644
--- a/src/mds/Server.cc
+++ b/src/mds/Server.cc
@@ -2098,6 +2098,27 @@ void Server::handle_slave_auth_pin_ack(MDRequestRef& mdr, MMDSSlaveRequest *ack)
 // HELPERS
 
 
+/**
+ * check whether we are permitted to complete a request
+ *
+ * Check whether we have permission to perform the operation specified
+ * by mask on the given inode, based on the capability in the mdr's
+ * session.
+ */
+bool Server::check_access(MDRequestRef& mdr, CInode *in, unsigned mask)
+{
+  if (mdr->session && !mdr->session->check_access(
+       in, mask,
+       mdr->client_request->get_caller_uid(),
+       mdr->client_request->get_caller_gid(),
+       mdr->client_request->head.args.setattr.uid,
+       mdr->client_request->head.args.setattr.gid)) {
+    respond_to_request(mdr, -EACCES);
+    return false;
+  }
+  return true;
+}
+
 /** validate_dentry_dir
  *
  * verify that the dir exists and would own the dname.
@@ -2663,6 +2684,9 @@ void Server::handle_client_getattr(MDRequestRef& mdr, bool is_lookup)
   if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
     return;
 
+  if (!check_access(mdr, ref, MAY_READ))
+    return;
+
   // note which caps are requested, so we return at least a snapshot
   // value for them.  (currently this matters for xattrs and inline data)
   mdr->getattr_caps = mask;
@@ -2711,6 +2735,11 @@ void Server::handle_client_lookup_ino(MDRequestRef& mdr,
     return;
   }
 
+  // check for nothing (not read or write); this still applies the
+  // path check.
+  if (!check_access(mdr, in, 0))
+    return;
+
   CDentry *dn = in->get_projected_parent_dn();
   CInode *diri = dn ? dn->get_dir()->inode : NULL;
   if (dn && (want_parent || want_dentry)) {
@@ -2719,6 +2748,10 @@ void Server::handle_client_lookup_ino(MDRequestRef& mdr,
     rdlocks.insert(&dn->lock);
     if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
       return;
+
+    // need read access to directory inode
+    if (!check_access(mdr, diri, MAY_READ))
+      return;
   }
 
   if (want_parent) {
@@ -2857,6 +2890,9 @@ void Server::handle_client_open(MDRequestRef& mdr)
     if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
       return;
 
+    if (!check_access(mdr, cur, MAY_WRITE))
+      return;
+
     // wait for pending truncate?
     const inode_t *pi = cur->get_projected_inode();
     if (pi->is_truncating()) {
@@ -2882,6 +2918,12 @@ void Server::handle_client_open(MDRequestRef& mdr)
   if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
     return;
 
+  int mask = MAY_READ;
+  if (cmode & CEPH_FILE_MODE_WR)
+    mask |= MAY_WRITE;
+  if (!check_access(mdr, cur, mask))
+    return;
+
   if (cur->is_file() || cur->is_dir()) {
     if (mdr->snapid == CEPH_NOSNAP) {
       // register new cap
@@ -3055,6 +3097,9 @@ void Server::handle_client_openc(MDRequestRef& mdr)
   if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
     return;
 
+  if (!check_access(mdr, diri, MAY_WRITE))
+    return;
+
   CDentry::linkage_t *dnl = dn->get_projected_linkage();
 
   if (!dnl->is_null()) {
@@ -3148,6 +3193,9 @@ void Server::handle_client_readdir(MDRequestRef& mdr)
   if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
     return;
 
+  if (!check_access(mdr, diri, MAY_READ))
+    return;
+
   // which frag?
   frag_t fg = (__u32)req->head.args.readdir.frag;
   string offset_str = req->get_path2();
@@ -3537,6 +3585,7 @@ void Server::handle_client_setattr(MDRequestRef& mdr)
   }
 
   __u32 mask = req->head.args.setattr.mask;
+  __u32 access_mask = MAY_WRITE;
 
   // xlock inode
   if (mask & (CEPH_SETATTR_MODE|CEPH_SETATTR_UID|CEPH_SETATTR_GID))
@@ -3549,6 +3598,15 @@ void Server::handle_client_setattr(MDRequestRef& mdr)
   if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
     return;
 
+  if ((mask & CEPH_SETATTR_UID) && (cur->inode.uid != req->head.args.setattr.uid))
+    access_mask |= MAY_CHOWN;
+
+  if ((mask & CEPH_SETATTR_GID) && (cur->inode.gid != req->head.args.setattr.gid))
+    access_mask |= MAY_CHGRP;
+
+  if (!check_access(mdr, cur, access_mask))
+    return;
+
   // trunc from bigger -> smaller?
   inode_t *pi = cur->get_projected_inode();
 
@@ -3754,6 +3812,9 @@ void Server::handle_client_setlayout(MDRequestRef& mdr)
   if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
     return;
 
+  if (!check_access(mdr, cur, MAY_WRITE))
+    return;
+
   // project update
   inode_t *pi = cur->project_inode();
   pi->layout = layout;
@@ -3795,6 +3856,9 @@ void Server::handle_client_setdirlayout(MDRequestRef& mdr)
   if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
     return;
 
+  if (!check_access(mdr, cur, MAY_WRITE))
+    return;
+
   // validate layout
   const inode_t *old_pi = cur->get_projected_inode();
   ceph_file_layout layout;
@@ -4213,6 +4277,9 @@ void Server::handle_client_setxattr(MDRequestRef& mdr)
   if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
     return;
 
+  if (!check_access(mdr, cur, MAY_WRITE))
+    return;
+
   map<string, bufferptr> *pxattrs = cur->get_projected_xattrs();
   if ((flags & CEPH_XATTR_CREATE) && pxattrs->count(name)) {
     dout(10) << "setxattr '" << name << "' XATTR_CREATE and EEXIST on " << *cur << dendl;
@@ -4381,6 +4448,9 @@ void Server::handle_client_mknod(MDRequestRef& mdr)
   if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
     return;
 
+  if (!check_access(mdr, diri, MAY_WRITE))
+    return;
+
   unsigned mode = req->head.args.mknod.mode;
   if ((mode & S_IFMT) == 0)
     mode |= S_IFREG;
@@ -4469,6 +4539,10 @@ void Server::handle_client_mkdir(MDRequestRef& mdr)
   if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
     return;
 
+  // mkdir check access
+  if (!check_access(mdr, diri, MAY_WRITE))
+    return;
+
   // new inode
   SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
   snapid_t follows = realm->get_newest_seq();
@@ -4545,6 +4619,9 @@ void Server::handle_client_symlink(MDRequestRef& mdr)
   if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
     return;
 
+  if (!check_access(mdr, diri, MAY_WRITE))
+    return;
+
   unsigned mode = S_IFLNK | 0777;
   CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino), mode);
   assert(newi);
@@ -4612,6 +4689,12 @@ void Server::handle_client_link(MDRequestRef& mdr)
   if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
     return;
 
+  if (!check_access(mdr, targeti, MAY_WRITE))
+    return;
+
+  if (!check_access(mdr, dir->get_inode(), MAY_WRITE))
+    return;
+
   // go!
   assert(g_conf->mds_kill_link_at != 1);
 
@@ -5145,6 +5228,8 @@ void Server::handle_client_unlink(MDRequestRef& mdr)
     return;
   }
 
+  CInode *diri = dn->get_dir()->get_inode();
+
   CDentry::linkage_t *dnl = dn->get_linkage(client, mdr);
   assert(!dnl->is_null());
 
@@ -5193,8 +5278,8 @@ void Server::handle_client_unlink(MDRequestRef& mdr)
   for (int i=0; i<(int)trace.size()-1; i++)
     rdlocks.insert(&trace[i]->lock);
   xlocks.insert(&dn->lock);
-  wrlocks.insert(&dn->get_dir()->inode->filelock);
-  wrlocks.insert(&dn->get_dir()->inode->nestlock);
+  wrlocks.insert(&diri->filelock);
+  wrlocks.insert(&diri->nestlock);
   xlocks.insert(&in->linklock);
   if (straydn) {
     wrlocks.insert(&straydn->get_dir()->inode->filelock);
@@ -5208,6 +5293,9 @@ void Server::handle_client_unlink(MDRequestRef& mdr)
   if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
     return;
 
+  if (!check_access(mdr, diri, MAY_WRITE))
+    return;
+
   if (in->is_dir() &&
       _dir_is_nonempty(mdr, in)) {
     respond_to_request(mdr, -ENOTEMPTY);
@@ -5293,6 +5381,7 @@ void Server::_unlink_local(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
   dn->pre_dirty();
 
   inode_t *pi = in->project_inode();
+  dn->make_path_string(pi->stray_prior_path);
   mdr->add_projected_inode(in); // do this _after_ my dn->pre_dirty().. we apply that one manually.
   pi->version = in->pre_dirty();
   pi->ctime = mdr->get_op_stamp();
@@ -6060,6 +6149,15 @@ void Server::handle_client_rename(MDRequestRef& mdr)
 				  &remote_wrlocks, auth_pin_freeze))
     return;
 
+  if (!check_access(mdr, srcdn->get_dir()->get_inode(), MAY_WRITE))
+    return;
+
+  if (!check_access(mdr, destdn->get_dir()->get_inode(), MAY_WRITE))
+    return;
+
+  if (!check_access(mdr, srci, MAY_WRITE))
+    return;
+
   if (oldin &&
       oldin->is_dir() &&
       _dir_is_nonempty(mdr, oldin)) {
@@ -6466,6 +6564,7 @@ void Server::_rename_prepare(MDRequestRef& mdr,
     }
     if (tpi) {
       tpi->ctime = mdr->get_op_stamp();
+      destdn->make_path_string(tpi->stray_prior_path);
       tpi->nlink--;
       if (tpi->nlink == 0)
 	oldin->state_set(CInode::STATE_ORPHAN);
@@ -7637,6 +7736,9 @@ void Server::handle_client_lssnap(MDRequestRef& mdr)
   if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
     return;
 
+  if (!check_access(mdr, diri, MAY_READ))
+    return;
+
   SnapRealm *realm = diri->find_snaprealm();
   map<snapid_t,SnapInfo*> infomap;
   realm->get_snap_info(infomap, diri->get_oldest_snap());
@@ -7766,6 +7868,9 @@ void Server::handle_client_mksnap(MDRequestRef& mdr)
   if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
     return;
 
+  if (!check_access(mdr, diri, MAY_WRITE))
+    return;
+
   // make sure name is unique
   if (diri->snaprealm &&
       diri->snaprealm->exists(snapname)) {
@@ -7913,6 +8018,9 @@ void Server::handle_client_rmsnap(MDRequestRef& mdr)
   if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
     return;
 
+  if (!check_access(mdr, diri, MAY_WRITE))
+    return;
+
   // prepare
   if (!mdr->more()->stid) {
     mds->snapclient->prepare_destroy(diri->ino(), snapid,
@@ -8053,6 +8161,9 @@ void Server::handle_client_renamesnap(MDRequestRef& mdr)
   if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
     return;
 
+  if (!check_access(mdr, diri, MAY_WRITE))
+    return;
+
     // prepare
   if (!mdr->more()->stid) {
     mds->snapclient->prepare_update(diri->ino(), snapid, dstname, utime_t(),
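Earlier in this Server.cc diff, handle_client_setattr() widens the access mask only when ownership actually changes, so a chown to the same uid does not require MAY_CHOWN. The same logic as a pure function (the flag values here are placeholders, not Ceph's real constants):

    enum : unsigned { MAY_WRITE = 2, MAY_CHOWN = 16, MAY_CHGRP = 32 };

    unsigned setattr_access_mask(bool wants_uid, bool uid_differs,
                                 bool wants_gid, bool gid_differs)
    {
      unsigned mask = MAY_WRITE;     // every setattr needs write access
      if (wants_uid && uid_differs)
        mask |= MAY_CHOWN;           // only a *real* owner change
      if (wants_gid && gid_differs)
        mask |= MAY_CHGRP;           // only a *real* group change
      return mask;
    }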
diff --git a/src/mds/Server.h b/src/mds/Server.h
index c99f7ae..803dd32 100644
--- a/src/mds/Server.h
+++ b/src/mds/Server.h
@@ -133,6 +133,8 @@ public:
   void handle_slave_auth_pin_ack(MDRequestRef& mdr, MMDSSlaveRequest *ack);
 
   // some helpers
+  bool check_access(MDRequestRef& mdr, CInode *in, unsigned mask);
+  bool _check_access(Session *session, CInode *in, unsigned mask, int caller_uid, int caller_gid, int setattr_uid, int setattr_gid);
   CDir *validate_dentry_dir(MDRequestRef& mdr, CInode *diri, const string& dname);
   CDir *traverse_to_auth_dir(MDRequestRef& mdr, vector<CDentry*> &trace, filepath refpath);
   CDentry *prepare_null_dentry(MDRequestRef& mdr, CDir *dir, const string& dname, bool okexist=false);
diff --git a/src/mds/SessionMap.cc b/src/mds/SessionMap.cc
index dc0ba71..d6ca0af 100644
--- a/src/mds/SessionMap.cc
+++ b/src/mds/SessionMap.cc
@@ -545,27 +545,6 @@ void SessionMap::wipe_ino_prealloc()
   projected = ++version;
 }
 
-/**
- * Calculate the length of the `requests` member list,
- * because elist does not have a size() method.
- *
- * O(N) runtime.  This would be const, but elist doesn't
- * have const iterators.
- */
-size_t Session::get_request_count()
-{
-  size_t result = 0;
-
-  elist<MDRequestImpl*>::iterator p = requests.begin(
-      member_offset(MDRequestImpl, item_session_request));
-  while (!p.end()) {
-    ++result;
-    ++p;
-  }
-
-  return result;
-}
-
 void SessionMap::add_session(Session *s)
 {
   dout(10) << __func__ << " s=" << s << " name=" << s->info.inst.name << dendl;
@@ -606,85 +585,6 @@ void SessionMap::touch_session(Session *session)
   session->last_cap_renew = ceph_clock_now(g_ceph_context);
 }
 
-/**
- * Capped in response to a CEPH_MSG_CLIENT_CAPRELEASE message,
- * with n_caps equal to the number of caps that were released
- * in the message.  Used to update state about how many caps a
- * client has released since it was last instructed to RECALL_STATE.
- */
-void Session::notify_cap_release(size_t n_caps)
-{
-  if (!recalled_at.is_zero()) {
-    recall_release_count += n_caps;
-    if (recall_release_count >= recall_count) {
-      recalled_at = utime_t();
-      recall_count = 0;
-      recall_release_count = 0;
-    }
-  }
-}
-
-/**
- * Called when a CEPH_MSG_CLIENT_SESSION->CEPH_SESSION_RECALL_STATE
- * message is sent to the client.  Update our recall-related state
- * in order to generate health metrics if the session doesn't see
- * a commensurate number of calls to ::notify_cap_release
- */
-void Session::notify_recall_sent(int const new_limit)
-{
-  if (recalled_at.is_zero()) {
-    // Entering recall phase, set up counters so we can later
-    // judge whether the client has respected the recall request
-    recalled_at = ceph_clock_now(g_ceph_context);
-    assert (new_limit < caps.size());  // Behaviour of Server::recall_client_state
-    recall_count = caps.size() - new_limit;
-    recall_release_count = 0;
-  }
-}
-
-void Session::set_client_metadata(map<string, string> const &meta)
-{
-  info.client_metadata = meta;
-
-  _update_human_name();
-}
-
-/**
- * Use client metadata to generate a somewhat-friendlier
- * name for the client than its session ID.
- *
- * This is *not* guaranteed to be unique, and any machine
- * consumers of session-related output should always use
- * the session ID as a primary capacity and use this only
- * as a presentation hint.
- */
-void Session::_update_human_name()
-{
-  if (info.client_metadata.count("hostname")) {
-    // Happy path, refer to clients by hostname
-    human_name = info.client_metadata["hostname"];
-    if (info.client_metadata.count("entity_id")) {
-      EntityName entity;
-      entity.set_id(info.client_metadata["entity_id"]);
-      if (!entity.has_default_id()) {
-        // When a non-default entity ID is set by the user, assume they
-        // would like to see it in references to the client
-        human_name += std::string(":") + entity.get_id();
-      }
-    }
-  } else {
-    // Fallback, refer to clients by ID e.g. client.4567
-    human_name = stringify(info.inst.name.num());
-  }
-}
-
-void Session::decode(bufferlist::iterator &p)
-{
-  info.decode(p);
-
-  _update_human_name();
-}
-
 void SessionMap::_mark_dirty(Session *s)
 {
   if (dirty_sessions.size() >= g_conf->mds_sessionmap_keys_per_op) {
@@ -829,4 +729,254 @@ void SessionMap::save_if_dirty(const std::set<entity_name_t> &tgt_sessions,
   }
 }
 
+// =================
+// Session
+
+#undef dout_prefix
+#define dout_prefix *_dout << "Session "
+
+/**
+ * Calculate the length of the `requests` member list,
+ * because elist does not have a size() method.
+ *
+ * O(N) runtime.  This would be const, but elist doesn't
+ * have const iterators.
+ */
+size_t Session::get_request_count()
+{
+  size_t result = 0;
+
+  elist<MDRequestImpl*>::iterator p = requests.begin(
+      member_offset(MDRequestImpl, item_session_request));
+  while (!p.end()) {
+    ++result;
+    ++p;
+  }
+
+  return result;
+}
+
+/**
+ * Called in response to a CEPH_MSG_CLIENT_CAPRELEASE message,
+ * with n_caps equal to the number of caps that were released
+ * in the message.  Used to update state about how many caps a
+ * client has released since it was last instructed to RECALL_STATE.
+ */
+void Session::notify_cap_release(size_t n_caps)
+{
+  if (!recalled_at.is_zero()) {
+    recall_release_count += n_caps;
+    if (recall_release_count >= recall_count) {
+      recalled_at = utime_t();
+      recall_count = 0;
+      recall_release_count = 0;
+    }
+  }
+}
+
+/**
+ * Called when a CEPH_MSG_CLIENT_SESSION->CEPH_SESSION_RECALL_STATE
+ * message is sent to the client.  Update our recall-related state
+ * in order to generate health metrics if the session doesn't see
+ * a commensurate number of calls to ::notify_cap_release
+ */
+void Session::notify_recall_sent(int const new_limit)
+{
+  if (recalled_at.is_zero()) {
+    // Entering recall phase, set up counters so we can later
+    // judge whether the client has respected the recall request
+    recalled_at = ceph_clock_now(g_ceph_context);
+    assert (new_limit < caps.size());  // Behaviour of Server::recall_client_state
+    recall_count = caps.size() - new_limit;
+    recall_release_count = 0;
+  }
+}
+
+void Session::set_client_metadata(map<string, string> const &meta)
+{
+  info.client_metadata = meta;
+
+  _update_human_name();
+}
+
+/**
+ * Use client metadata to generate a somewhat-friendlier
+ * name for the client than its session ID.
+ *
+ * This is *not* guaranteed to be unique, and any machine
+ * consumers of session-related output should always use
+ * the session ID as the primary identifier and use this only
+ * as a presentation hint.
+ */
+void Session::_update_human_name()
+{
+  if (info.client_metadata.count("hostname")) {
+    // Happy path, refer to clients by hostname
+    human_name = info.client_metadata["hostname"];
+    if (!info.auth_name.has_default_id()) {
+      // When a non-default entity ID is set by the user, assume they
+      // would like to see it in references to the client, if it's
+      // reasonably short.  Limit the length because we don't want
+      // to put e.g. uuid-generated names into a "human readable"
+      // rendering.
+      const int arbitrarily_short = 16;
+      if (info.auth_name.get_id().size() < arbitrarily_short) {
+        human_name += std::string(":") + info.auth_name.get_id();
+      }
+    }
+  } else {
+    // Fallback, refer to clients by ID e.g. client.4567
+    human_name = stringify(info.inst.name.num());
+  }
+}
+
+void Session::decode(bufferlist::iterator &p)
+{
+  info.decode(p);
+
+  _update_human_name();
+}
+
+bool Session::check_access(CInode *in, unsigned mask,
+			   int caller_uid, int caller_gid,
+			   int new_uid, int new_gid)
+{
+  string path;
+  CInode *diri = NULL;
+  if (!in->is_base())
+    diri = in->get_projected_parent_dn()->get_dir()->get_inode();
+  if (diri && diri->is_stray()){
+    path = in->get_projected_inode()->stray_prior_path;
+    dout(20) << __func__ << " stray_prior_path " << path << dendl;
+  } else {
+    in->make_path_string(path, false, in->get_projected_parent_dn());
+    dout(20) << __func__ << " path " << path << dendl;
+  }
+  if (path.length())
+    path = path.substr(1);    // drop leading /
+
+  if (auth_caps.is_capable(path, in->inode.uid, in->inode.gid, in->inode.mode,
+			   caller_uid, caller_gid, mask,
+			   new_uid, new_gid)) {
+    return true;
+  }
+  return false;
+}
+
+int SessionFilter::parse(
+    const std::vector<std::string> &args,
+    std::stringstream *ss)
+{
+  assert(ss != NULL);
+
+  for (const auto &s : args) {
+    dout(20) << __func__ << " parsing filter '" << s << "'" << dendl;
+
+    auto eq = s.find("=");
+    if (eq == std::string::npos || eq == s.size() - 1) {
+      *ss << "Invalid filter '" << s << "'";
+      return -EINVAL;
+    }
+
+    // Keys that start with this are to be taken as referring
+    // to freeform client metadata fields.
+    const std::string metadata_prefix("client_metadata.");
+
+    auto k = s.substr(0, eq);
+    auto v = s.substr(eq + 1);
+
+    dout(20) << __func__ << " parsed k='" << k << "', v='" << v << "'" << dendl;
+
+    if (k.compare(0, metadata_prefix.size(), metadata_prefix) == 0
+        && k.size() > metadata_prefix.size()) {
+      // Filter on arbitrary metadata key (no fixed schema for this,
+      // so anything after the dot is a valid field to filter on)
+      auto metadata_key = k.substr(metadata_prefix.size());
+      metadata.insert(std::make_pair(metadata_key, v));
+    } else if (k == "auth_name") {
+      // Filter on client entity name
+      auth_name = v;
+    } else if (k == "state") {
+      state = v;
+    } else if (k == "id") {
+      std::string err;
+      id = strict_strtoll(v.c_str(), 10, &err);
+      if (!err.empty()) {
+        *ss << err;
+        return -EINVAL;
+      }
+    } else if (k == "reconnecting") {
+
+      /**
+       * Strict boolean parser.  Allow true/false/0/1.
+       * Anything else is -EINVAL.
+       */
+      auto is_true = [](const std::string &bstr, bool *out) -> int
+      {
+        assert(out != nullptr);
+
+        if (bstr == "true" || bstr == "1") {
+          *out = true;
+          return 0;
+        } else if (bstr == "false" || bstr == "0") {
+          *out = false;
+          return 0;
+        } else {
+          return -EINVAL;
+        }
+      };
+
+      bool bval;
+      int r = is_true(v, &bval);
+      if (r == 0) {
+        set_reconnecting(bval);
+      } else {
+        *ss << "Invalid boolean value '" << v << "'";
+        return -EINVAL;
+      }
+    } else {
+      *ss << "Invalid filter key '" << k << "'";
+      return -EINVAL;
+    }
+  }
+
+  return 0;
+}
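
For illustration, the grammar accepted by SessionFilter::parse() is a list
of key=value tokens.  A hypothetical caller (the argument strings are
invented; only SessionFilter itself comes from this patch) might drive it
like this:

    // Assumes <vector>, <string>, <sstream> and the SessionFilter above.
    std::stringstream ss;
    SessionFilter filter;
    std::vector<std::string> args = {
      "auth_name=admin",                 // entity name filter
      "state=open",                      // session state filter
      "client_metadata.hostname=node1",  // freeform metadata filter
      "reconnecting=false"               // strict boolean: true/false/0/1
    };
    int r = filter.parse(args, &ss);
    if (r < 0) {
      // ss.str() now holds the error, e.g. "Invalid filter key 'foo'"
    }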
+
+bool SessionFilter::match(
+    const Session &session,
+    std::function<bool(client_t)> is_reconnecting) const
+{
+  for (const auto &m : metadata) {
+    auto k = m.first;
+    auto v = m.second;
+    if (session.info.client_metadata.count(k) == 0) {
+      return false;
+    }
+    if (session.info.client_metadata.at(k) != v) {
+      return false;
+    }
+  }
+
+  if (!auth_name.empty() && auth_name != session.info.auth_name.get_id()) {
+    return false;
+  }
+
+  if (!state.empty() && state != session.get_state_name()) {
+    return false;
+  }
+
+  if (id != 0 && id != session.info.inst.name.num()) {
+    return false;
+  }
+
+  if (reconnecting.first) {
+    const bool am_reconnecting = is_reconnecting(session.info.inst.name.num());
+    if (reconnecting.second != am_reconnecting) {
+      return false;
+    }
+  }
+
+  return true;
+}
 
diff --git a/src/mds/SessionMap.h b/src/mds/SessionMap.h
index cfcfa04..b6a3792 100644
--- a/src/mds/SessionMap.h
+++ b/src/mds/SessionMap.h
@@ -64,7 +64,7 @@ public:
     STATE_KILLING = 5
   };
 
-  const char *get_state_name(int s) {
+  const char *get_state_name(int s) const {
     switch (s) {
     case STATE_CLOSED: return "closed";
     case STATE_OPENING: return "opening";
@@ -90,6 +90,8 @@ private:
   // that appropriate mark_dirty calls follow.
   std::deque<version_t> projected;
 
+
+
 public:
 
   void push_pv(version_t pv)
@@ -171,7 +173,7 @@ public:
   }
 
   int get_state() { return state; }
-  const char *get_state_name() { return get_state_name(state); }
+  const char *get_state_name() const { return get_state_name(state); }
   uint64_t get_state_seq() { return state_seq; }
   bool is_closed() const { return state == STATE_CLOSED; }
   bool is_opening() const { return state == STATE_OPENING; }
@@ -301,10 +303,14 @@ public:
     completed_requests_dirty = false;
   }
 
+  bool check_access(CInode *in, unsigned mask, int caller_uid, int caller_gid,
+		    int new_uid, int new_gid);
+
 
   Session() : 
     state(STATE_CLOSED), state_seq(0), importing_count(0),
     recalled_at(), recall_count(0), recall_release_count(0),
+    auth_caps(g_ceph_context),
     connection(NULL), item_session_list(this),
     requests(0),  // member_offset passed to front() manually
     cap_push_seq(0),
@@ -330,6 +336,33 @@ public:
   }
 };
 
+class SessionFilter
+{
+protected:
+  // First is whether to filter, second is filter value
+  std::pair<bool, bool> reconnecting;
+
+public:
+  std::map<std::string, std::string> metadata;
+  std::string auth_name;
+  std::string state;
+  int64_t id;
+
+  SessionFilter()
+    : reconnecting(false, false), id(0)
+  {}
+
+  bool match(
+      const Session &session,
+      std::function<bool(client_t)> is_reconnecting) const;
+  int parse(const std::vector<std::string> &args, std::stringstream *ss);
+  void set_reconnecting(bool v)
+  {
+    reconnecting.first = true;
+    reconnecting.second = v;
+  }
+};
+
 /*
  * session map
  */
diff --git a/src/mds/mdstypes.cc b/src/mds/mdstypes.cc
index d0218ec..08879c3 100644
--- a/src/mds/mdstypes.cc
+++ b/src/mds/mdstypes.cc
@@ -256,7 +256,7 @@ void inline_data_t::decode(bufferlist::iterator &p)
  */
 void inode_t::encode(bufferlist &bl) const
 {
-  ENCODE_START(11, 6, bl);
+  ENCODE_START(13, 6, bl);
 
   ::encode(ino, bl);
   ::encode(rdev, bl);
@@ -298,12 +298,17 @@ void inode_t::encode(bufferlist &bl) const
   ::encode(inline_data, bl);
   ::encode(quota, bl);
 
+  ::encode(stray_prior_path, bl);
+
+  ::encode(last_scrub_version, bl);
+  ::encode(last_scrub_stamp, bl);
+
   ENCODE_FINISH(bl);
 }
 
 void inode_t::decode(bufferlist::iterator &p)
 {
-  DECODE_START_LEGACY_COMPAT_LEN(11, 6, 6, p);
+  DECODE_START_LEGACY_COMPAT_LEN(13, 6, 6, p);
 
   ::decode(ino, p);
   ::decode(rdev, p);
@@ -368,6 +373,15 @@ void inode_t::decode(bufferlist::iterator &p)
   if (struct_v >= 11)
     ::decode(quota, p);
 
+  if (struct_v >= 12) {
+    ::decode(stray_prior_path, p);
+  }
+
+  if (struct_v >= 13) {
+    ::decode(last_scrub_version, p);
+    ::decode(last_scrub_stamp, p);
+  }
+
   DECODE_FINISH(p);
 }
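
The bump from version 11 to 13 above follows the usual encoding-compat
pattern (fnode_t and session_info_t below get the same treatment): new
fields are appended at the tail of encode(), and decode() only reads them
when struct_v says the encoder wrote them.  A minimal sketch of the pattern
(the type and fields are hypothetical; the macros are the real ones from
include/encoding.h):

    struct example_t {
      uint64_t old_field;   // present since v6
      uint64_t new_field;   // appended in v13

      void encode(bufferlist &bl) const {
        ENCODE_START(13, 6, bl);     // current v13, compat back to v6
        ::encode(old_field, bl);
        ::encode(new_field, bl);     // old decoders skip trailing bytes
        ENCODE_FINISH(bl);
      }
      void decode(bufferlist::iterator &p) {
        DECODE_START_LEGACY_COMPAT_LEN(13, 6, 6, p);
        ::decode(old_field, p);
        if (struct_v >= 13)          // only read what the encoder wrote
          ::decode(new_field, p);
        DECODE_FINISH(p);
      }
    };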
 
@@ -430,6 +444,8 @@ void inode_t::dump(Formatter *f) const
   f->dump_unsigned("file_data_version", file_data_version);
   f->dump_unsigned("xattr_version", xattr_version);
   f->dump_unsigned("backtrace_version", backtrace_version);
+
+  f->dump_string("stray_prior_path", stray_prior_path);
 }
 
 void inode_t::generate_test_instances(list<inode_t*>& ls)
@@ -552,7 +568,7 @@ void old_inode_t::generate_test_instances(list<old_inode_t*>& ls)
  */
 void fnode_t::encode(bufferlist &bl) const
 {
-  ENCODE_START(3, 3, bl);
+  ENCODE_START(4, 3, bl);
   ::encode(version, bl);
   ::encode(snap_purged_thru, bl);
   ::encode(fragstat, bl);
@@ -560,6 +576,10 @@ void fnode_t::encode(bufferlist &bl) const
   ::encode(rstat, bl);
   ::encode(accounted_rstat, bl);
   ::encode(damage_flags, bl);
+  ::encode(recursive_scrub_version, bl);
+  ::encode(recursive_scrub_stamp, bl);
+  ::encode(localized_scrub_version, bl);
+  ::encode(localized_scrub_stamp, bl);
   ENCODE_FINISH(bl);
 }
 
@@ -575,6 +595,12 @@ void fnode_t::decode(bufferlist::iterator &bl)
   if (struct_v >= 3) {
     ::decode(damage_flags, bl);
   }
+  if (struct_v >= 4) {
+    ::decode(recursive_scrub_version, bl);
+    ::decode(recursive_scrub_stamp, bl);
+    ::decode(localized_scrub_version, bl);
+    ::decode(localized_scrub_stamp, bl);
+  }
   DECODE_FINISH(bl);
 }
 
@@ -665,19 +691,20 @@ void old_rstat_t::generate_test_instances(list<old_rstat_t*>& ls)
  */
 void session_info_t::encode(bufferlist& bl) const
 {
-  ENCODE_START(5, 3, bl);
+  ENCODE_START(6, 3, bl);
   ::encode(inst, bl);
   ::encode(completed_requests, bl);
   ::encode(prealloc_inos, bl);   // hacky, see below.
   ::encode(used_inos, bl);
   ::encode(client_metadata, bl);
   ::encode(completed_flushes, bl);
+  ::encode(auth_name, bl);
   ENCODE_FINISH(bl);
 }
 
 void session_info_t::decode(bufferlist::iterator& p)
 {
-  DECODE_START_LEGACY_COMPAT_LEN(4, 2, 2, p);
+  DECODE_START_LEGACY_COMPAT_LEN(6, 2, 2, p);
   ::decode(inst, p);
   if (struct_v <= 2) {
     set<ceph_tid_t> s;
@@ -699,6 +726,9 @@ void session_info_t::decode(bufferlist::iterator& p)
   if (struct_v >= 5) {
     ::decode(completed_flushes, p);
   }
+  if (struct_v >= 6) {
+    ::decode(auth_name, p);
+  }
   DECODE_FINISH(p);
 }
 
diff --git a/src/mds/mdstypes.h b/src/mds/mdstypes.h
index 72ff4fc..669e8b9 100644
--- a/src/mds/mdstypes.h
+++ b/src/mds/mdstypes.h
@@ -13,6 +13,7 @@
 #include "common/config.h"
 #include "common/Clock.h"
 #include "common/DecayCounter.h"
+#include "common/entity_name.h"
 #include "MDSContext.h"
 
 #include "include/frag.h"
@@ -463,17 +464,23 @@ struct inode_t {
   version_t file_data_version; // auth only
   version_t xattr_version;
 
+  utime_t last_scrub_stamp;    // start time of last complete scrub
+  version_t last_scrub_version; // (parent) start version of last complete scrub
+
   version_t backtrace_version;
 
   snapid_t oldest_snap;
 
+  string stray_prior_path;  // stores path before unlink
+
   inode_t() : ino(0), rdev(0),
 	      mode(0), uid(0), gid(0), nlink(0),
 	      size(0), max_size_ever(0),
 	      truncate_seq(0), truncate_size(0), truncate_from(0),
 	      truncate_pending(0),
 	      time_warp_seq(0),
-	      version(0), file_data_version(0), xattr_version(0), backtrace_version(0) {
+	      version(0), file_data_version(0), xattr_version(0),
+	      last_scrub_version(0), backtrace_version(0) {
     clear_layout();
     memset(&dir_layout, 0, sizeof(dir_layout));
     memset(&quota, 0, sizeof(quota));
@@ -607,11 +614,19 @@ struct fnode_t {
   nest_info_t rstat, accounted_rstat;
   damage_flags_t damage_flags;
 
+  // we know we and all our descendants have been scrubbed since this version
+  version_t recursive_scrub_version;
+  utime_t recursive_scrub_stamp;
+  // version at which we last scrubbed our personal data structures
+  version_t localized_scrub_version;
+  utime_t localized_scrub_stamp;
+
   void encode(bufferlist &bl) const;
   void decode(bufferlist::iterator& bl);
   void dump(Formatter *f) const;
   static void generate_test_instances(list<fnode_t*>& ls);
-  fnode_t() : version(0) {}
+  fnode_t() : version(0),
+	      recursive_scrub_version(0), localized_scrub_version(0) {}
 };
 WRITE_CLASS_ENCODER(fnode_t)
 
@@ -643,6 +658,7 @@ struct session_info_t {
   interval_set<inodeno_t> used_inos;       // journaling use
   std::map<std::string, std::string> client_metadata;
   std::set<ceph_tid_t> completed_flushes;
+  EntityName auth_name;
 
   client_t get_client() const { return client_t(inst.name.num()); }
 
@@ -670,6 +686,8 @@ struct dentry_key_t {
   dentry_key_t() : snapid(0), name(0) {}
   dentry_key_t(snapid_t s, const char *n) : snapid(s), name(n) {}
 
+  bool is_valid() { return name || snapid; }
+
   // encode into something that can be decoded as a string.
   // name_ (head) or name_%x (!head)
   void encode(bufferlist& bl) const {
diff --git a/src/messages/MAuthReply.h b/src/messages/MAuthReply.h
index 5fea5a5..af9e884 100644
--- a/src/messages/MAuthReply.h
+++ b/src/messages/MAuthReply.h
@@ -20,7 +20,7 @@
 
 struct MAuthReply : public Message {
   __u32 protocol;
-  __s32 result;
+  errorcode32_t result;
   uint64_t global_id;      // if zero, meaningless
   string result_msg;
   bufferlist result_bl;
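
This hunk, and the matching ones in MCommandReply, MMonCommandAck,
MOSDOpReply and MWatchNotify below, swap a raw __s32 result for
errorcode32_t.  That type is defined elsewhere in the tree; the apparent
intent is a drop-in 32-bit error code whose encode/decode can normalize
errno conventions between peers on different platforms.  A purely
hypothetical sketch, just to show why the swap leaves call sites untouched:

    // Hypothetical sketch only -- the real errorcode32_t may differ.
    struct errorcode32_sketch_t {
      int32_t code;
      errorcode32_sketch_t(int32_t c = 0) : code(c) {}
      operator int32_t() const { return code; }     // reads unchanged
      errorcode32_sketch_t &operator=(int32_t c) {  // writes unchanged
        code = c;
        return *this;
      }
      // encode()/decode() would translate host <-> wire errno here.
    };
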
diff --git a/src/messages/MClientCaps.h b/src/messages/MClientCaps.h
index 609495b..c923dd1 100644
--- a/src/messages/MClientCaps.h
+++ b/src/messages/MClientCaps.h
@@ -20,7 +20,7 @@
 
 
 class MClientCaps : public Message {
-  static const int HEAD_VERSION = 6;
+  static const int HEAD_VERSION = 7;
   static const int COMPAT_VERSION = 1;
 
  public:
@@ -35,6 +35,8 @@ class MClientCaps : public Message {
   // Receivers may not use their new caps until they have this OSD map
   epoch_t osd_epoch_barrier;
   ceph_tid_t oldest_flush_tid;
+  uint32_t caller_uid;
+  uint32_t caller_gid;
 
   int      get_caps() { return head.caps; }
   int      get_wanted() { return head.wanted; }
@@ -92,7 +94,9 @@ class MClientCaps : public Message {
 
   MClientCaps()
     : Message(CEPH_MSG_CLIENT_CAPS, HEAD_VERSION, COMPAT_VERSION),
-      osd_epoch_barrier(0), oldest_flush_tid(0) {
+      osd_epoch_barrier(0),
+      oldest_flush_tid(0),
+      caller_uid(0), caller_gid(0) {
     inline_version = 0;
   }
   MClientCaps(int op,
@@ -106,7 +110,9 @@ class MClientCaps : public Message {
 	      int mseq,
               epoch_t oeb)
     : Message(CEPH_MSG_CLIENT_CAPS, HEAD_VERSION, COMPAT_VERSION),
-      osd_epoch_barrier(oeb), oldest_flush_tid(0) {
+      osd_epoch_barrier(oeb),
+      oldest_flush_tid(0),
+      caller_uid(0), caller_gid(0) {
     memset(&head, 0, sizeof(head));
     head.op = op;
     head.ino = ino;
@@ -124,7 +130,9 @@ class MClientCaps : public Message {
 	      inodeno_t ino, inodeno_t realm,
 	      uint64_t id, int mseq, epoch_t oeb)
     : Message(CEPH_MSG_CLIENT_CAPS, HEAD_VERSION, COMPAT_VERSION),
-      osd_epoch_barrier(oeb) {
+      osd_epoch_barrier(oeb),
+      oldest_flush_tid(0),
+      caller_uid(0), caller_gid(0) {
     memset(&head, 0, sizeof(head));
     head.op = op;
     head.ino = ino;
@@ -199,6 +207,10 @@ public:
     if (header.version >= 6) {
       ::decode(oldest_flush_tid, p);
     }
+    if (header.version >= 7) {
+      ::decode(caller_uid, p);
+      ::decode(caller_gid, p);
+    }
   }
   void encode_payload(uint64_t features) {
     header.version = HEAD_VERSION;
@@ -240,6 +252,8 @@ public:
 
     ::encode(osd_epoch_barrier, payload);
     ::encode(oldest_flush_tid, payload);
+    ::encode(caller_uid, payload);
+    ::encode(caller_gid, payload);
   }
 };
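
The MClientCaps change is the message-level twin of the struct_v pattern:
HEAD_VERSION goes from 6 to 7, caller_uid/caller_gid are appended at the
end of the payload, the decoder guards them with header.version >= 7, and
every constructor zero-initializes them so a decode from an old peer leaves
sane defaults.  Condensed into a sketch (hypothetical method; the fields
and version check come from the hunks above):

    void decode_payload_sketch() {
      bufferlist::iterator p = payload.begin();
      // ... fields present since older versions ...
      if (header.version >= 7) {    // sender included the new fields
        ::decode(caller_uid, p);
        ::decode(caller_gid, p);
      }                             // else the ctor defaults (0, 0) stand
    }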
 
diff --git a/src/messages/MCommandReply.h b/src/messages/MCommandReply.h
index eda184f..6abd6fe 100644
--- a/src/messages/MCommandReply.h
+++ b/src/messages/MCommandReply.h
@@ -20,7 +20,7 @@
 
 class MCommandReply : public Message {
  public:
-  __s32 r;
+  errorcode32_t r;
   string rs;
   
   MCommandReply()
diff --git a/src/messages/MLog.h b/src/messages/MLog.h
index 8b64e07..f2c04b2 100644
--- a/src/messages/MLog.h
+++ b/src/messages/MLog.h
@@ -37,7 +37,8 @@ public:
   void print(ostream& out) const {
     out << "log(";
     if (entries.size())
-      out << entries.size() << " entries";
+      out << entries.size() << " entries from seq " << entries.front().seq
+	  << " at " << entries.front().stamp;
     out << ")";
   }
 
diff --git a/src/messages/MMonCommandAck.h b/src/messages/MMonCommandAck.h
index bd3d62d..5ebecde 100644
--- a/src/messages/MMonCommandAck.h
+++ b/src/messages/MMonCommandAck.h
@@ -20,7 +20,7 @@
 class MMonCommandAck : public PaxosServiceMessage {
  public:
   vector<string> cmd;
-  __s32 r;
+  errorcode32_t r;
   string rs;
   
   MMonCommandAck() : PaxosServiceMessage(MSG_MON_COMMAND_ACK, 0) {}
diff --git a/src/messages/MOSDOp.h b/src/messages/MOSDOp.h
old mode 100644
new mode 100755
index 5d4ab62..259baa5
--- a/src/messages/MOSDOp.h
+++ b/src/messages/MOSDOp.h
@@ -32,7 +32,7 @@ class OSD;
 
 class MOSDOp : public Message {
 
-  static const int HEAD_VERSION = 6;
+  static const int HEAD_VERSION = 7;
   static const int COMPAT_VERSION = 3;
 
 private:
@@ -46,6 +46,11 @@ private:
   object_t oid;
   object_locator_t oloc;
   pg_t pgid;
+  bufferlist::iterator p;
+  // Decoding flags.  Decoding is only needed for messages caught by the pipe reader.
+  bool partial_decode_needed;
+  bool final_decode_needed;
+  //
 public:
   vector<OSDOp> ops;
 private:
@@ -61,12 +66,8 @@ private:
 public:
   friend class MOSDOpReply;
 
-  // read
-  const snapid_t& get_snapid() { return snapid; }
+  ceph_tid_t get_client_tid() { return header.tid; }
   void set_snapid(const snapid_t& s) { snapid = s; }
-  // writ
-  const snapid_t& get_snap_seq() const { return snap_seq; }
-  const vector<snapid_t> &get_snaps() const { return snaps; }
   void set_snaps(const vector<snapid_t>& i) {
     snaps = i;
   }
@@ -75,42 +76,102 @@ public:
     reqid = rid;
   }
 
+  // Fields decoded in partial decoding
+  const pg_t& get_pg() const {
+    assert(!partial_decode_needed);
+    return pgid;
+  }
+  epoch_t get_map_epoch() const {
+    assert(!partial_decode_needed);
+    return osdmap_epoch;
+  }
+  int get_flags() const {
+    assert(!partial_decode_needed);
+    return flags;
+  }
+  const eversion_t& get_version() const {
+    assert(!partial_decode_needed);
+    return reassert_version;
+  }
   osd_reqid_t get_reqid() const {
-    if (reqid != osd_reqid_t())
+    assert(!partial_decode_needed);
+    if (reqid.name != entity_name_t() || reqid.tid != 0) {
       return reqid;
-    else
+    } else {
+      if (!final_decode_needed)
+	assert(reqid.inc == (int32_t)client_inc);  // decode() should have done this
       return osd_reqid_t(get_orig_source(),
-                         client_inc,
+                         reqid.inc,
 			 header.tid);
+    }
   }
-  int get_client_inc() { return client_inc; }
-  ceph_tid_t get_client_tid() { return header.tid; }
-  
-  object_t& get_oid() { return oid; }
-
-  const pg_t&     get_pg() const { return pgid; }
 
+  // Fields decoded in final decoding
+  int get_client_inc() const {
+    assert(!final_decode_needed);
+    return client_inc;
+  }
+  utime_t get_mtime() const {
+    assert(!final_decode_needed);
+    return mtime;
+  }
   const object_locator_t& get_object_locator() const {
+    assert(!final_decode_needed);
     return oloc;
   }
+  object_t& get_oid() {
+    assert(!final_decode_needed);
+    return oid;
+  }
+  const snapid_t& get_snapid() {
+    assert(!final_decode_needed);
+    return snapid;
+  }
+  const snapid_t& get_snap_seq() const {
+    assert(!final_decode_needed);
+    return snap_seq;
+  }
+  const vector<snapid_t> &get_snaps() const {
+    assert(!final_decode_needed);
+    return snaps;
+  }
 
-  epoch_t  get_map_epoch() { return osdmap_epoch; }
-
-  const eversion_t& get_version() { return reassert_version; }
-  
-  utime_t get_mtime() { return mtime; }
+  /**
+   * get retry attempt
+   *
+   * 0 is the first attempt.
+   *
+   * @return retry attempt, or -1 if we don't know
+   */
+  int get_retry_attempt() const {
+    return retry_attempt;
+  }
+  uint64_t get_features() const {
+    if (features)
+      return features;
+    return get_connection()->get_features();
+  }
 
   MOSDOp()
-    : Message(CEPH_MSG_OSD_OP, HEAD_VERSION, COMPAT_VERSION) { }
+    : Message(CEPH_MSG_OSD_OP, HEAD_VERSION, COMPAT_VERSION),
+      partial_decode_needed(true),
+      final_decode_needed(true) { }
   MOSDOp(int inc, long tid,
-         object_t& _oid, object_locator_t& _oloc, pg_t& _pgid, epoch_t _osdmap_epoch,
+         object_t& _oid, object_locator_t& _oloc, pg_t& _pgid,
+	 epoch_t _osdmap_epoch,
 	 int _flags, uint64_t feat)
     : Message(CEPH_MSG_OSD_OP, HEAD_VERSION, COMPAT_VERSION),
       client_inc(inc),
       osdmap_epoch(_osdmap_epoch), flags(_flags), retry_attempt(-1),
       oid(_oid), oloc(_oloc), pgid(_pgid),
+      partial_decode_needed(false),
+      final_decode_needed(false),
       features(feat) {
     set_tid(tid);
+
+    // also put the client_inc in reqid.inc, so that get_reqid() can
+    // be used before the full message is decoded.
+    reqid.inc = inc;
   }
 private:
   ~MOSDOp() {}
@@ -154,16 +215,7 @@ public:
     add_simple_op(CEPH_OSD_OP_STAT, 0, 0);
   }
 
-  uint64_t get_features() const {
-    if (features)
-      return features;
-    return get_connection()->get_features();
-  }
-
-  // flags
-  int get_flags() const { return flags; }
   bool has_flag(__u32 flag) { return flags & flag; };
-
   bool wants_ack() const { return flags & CEPH_OSD_FLAG_ACK; }
   bool wants_ondisk() const { return flags & CEPH_OSD_FLAG_ONDISK; }
   bool wants_onnvram() const { return flags & CEPH_OSD_FLAG_ONNVRAM; }
@@ -181,17 +233,6 @@ public:
     retry_attempt = a;
   }
 
-  /**
-   * get retry attempt
-   *
-   * 0 is the first attempt.
-   *
-   * @return retry attempt, or -1 if we don't know
-   */
-  int get_retry_attempt() const {
-    return retry_attempt;
-  }
-
   // marshalling
   virtual void encode_payload(uint64_t features) {
 
@@ -248,22 +289,22 @@ struct ceph_osd_request_head {
 
       ::encode_nohead(oid.name, payload);
       ::encode_nohead(snaps, payload);
-    } else {
-      header.version = HEAD_VERSION;
+    } else if ((features & CEPH_FEATURE_NEW_OSDOP_ENCODING) == 0) {
+      header.version = 6;
       ::encode(client_inc, payload);
       ::encode(osdmap_epoch, payload);
       ::encode(flags, payload);
       ::encode(mtime, payload);
       ::encode(reassert_version, payload);
-
       ::encode(oloc, payload);
       ::encode(pgid, payload);
+
       ::encode(oid, payload);
 
       __u16 num_ops = ops.size();
       ::encode(num_ops, payload);
       for (unsigned i = 0; i < ops.size(); i++)
-	::encode(ops[i].op, payload);
+        ::encode(ops[i].op, payload);
 
       ::encode(snapid, payload);
       ::encode(snap_seq, payload);
@@ -272,11 +313,36 @@ struct ceph_osd_request_head {
       ::encode(retry_attempt, payload);
       ::encode(features, payload);
       ::encode(reqid, payload);
+    } else {
+      // new, reordered, v7 message encoding
+      header.version = HEAD_VERSION;
+      ::encode(pgid, payload);
+      ::encode(osdmap_epoch, payload);
+      ::encode(flags, payload);
+      ::encode(reassert_version, payload);
+      ::encode(reqid, payload);
+      ::encode(client_inc, payload);
+      ::encode(mtime, payload);
+      ::encode(oloc, payload);
+      ::encode(oid, payload);
+
+      __u16 num_ops = ops.size();
+      ::encode(num_ops, payload);
+      for (unsigned i = 0; i < ops.size(); i++)
+	::encode(ops[i].op, payload);
+
+      ::encode(snapid, payload);
+      ::encode(snap_seq, payload);
+      ::encode(snaps, payload);
+
+      ::encode(retry_attempt, payload);
+      ::encode(features, payload);
     }
   }
 
   virtual void decode_payload() {
-    bufferlist::iterator p = payload.begin();
+    assert(partial_decode_needed && final_decode_needed);
+    p = payload.begin();
 
     if (header.version < 2) {
       // old decode
@@ -319,9 +385,15 @@ struct ceph_osd_request_head {
 
       retry_attempt = -1;
       features = 0;
+      OSDOp::split_osd_op_vector_in_data(ops, data);
+
+      // we did the full decode
+      final_decode_needed = false;
+
+      // put client_inc in reqid.inc for get_reqid()'s benefit
       reqid = osd_reqid_t();
-    } else {
-      // new decode 
+      reqid.inc = client_inc;
+    } else if (header.version < 7) {
       ::decode(client_inc, p);
       ::decode(osdmap_epoch, p);
       ::decode(flags, p);
@@ -345,19 +417,19 @@ struct ceph_osd_request_head {
       ::decode(num_ops, p);
       ops.resize(num_ops);
       for (unsigned i = 0; i < num_ops; i++)
-	::decode(ops[i].op, p);
+        ::decode(ops[i].op, p);
 
       ::decode(snapid, p);
       ::decode(snap_seq, p);
       ::decode(snaps, p);
 
       if (header.version >= 4)
-	::decode(retry_attempt, p);
+        ::decode(retry_attempt, p);
       else
-	retry_attempt = -1;
+        retry_attempt = -1;
 
       if (header.version >= 5)
-	::decode(features, p);
+        ::decode(features, p);
       else
 	features = 0;
 
@@ -365,9 +437,55 @@ struct ceph_osd_request_head {
 	::decode(reqid, p);
       else
 	reqid = osd_reqid_t();
+
+      OSDOp::split_osd_op_vector_in_data(ops, data);
+
+      // we did the full decode
+      final_decode_needed = false;
+
+      // put client_inc in reqid.inc for get_reqid()'s benefit
+      if (reqid.name == entity_name_t() && reqid.tid == 0)
+	reqid.inc = client_inc;
+    } else {
+      // new, v7 decode, split into partial and final phases
+      ::decode(pgid, p);
+      ::decode(osdmap_epoch, p);
+      ::decode(flags, p);
+      ::decode(reassert_version, p);
+      ::decode(reqid, p);
     }
 
+    partial_decode_needed = false;
+  }
+
+  void finish_decode() {
+    assert(!partial_decode_needed); // the partial decode must happen first
+    if (!final_decode_needed)
+      return; // message is already fully decoded
+    assert(header.version >= 7);
+
+    ::decode(client_inc, p);
+    ::decode(mtime, p);
+    ::decode(oloc, p);
+    ::decode(oid, p);
+
+    __u16 num_ops;
+    ::decode(num_ops, p);
+    ops.resize(num_ops);
+    for (unsigned i = 0; i < num_ops; i++)
+      ::decode(ops[i].op, p);
+
+    ::decode(snapid, p);
+    ::decode(snap_seq, p);
+    ::decode(snaps, p);
+
+    ::decode(retry_attempt, p);
+
+    ::decode(features, p);
+
     OSDOp::split_osd_op_vector_in_data(ops, data);
+
+    final_decode_needed = false;
   }
 
   void clear_buffers() {
@@ -376,35 +494,29 @@ struct ceph_osd_request_head {
 
   const char *get_type_name() const { return "osd_op"; }
   void print(ostream& out) const {
-    out << "osd_op(" << get_reqid();
-    out << " ";
-    if (!oloc.nspace.empty())
-      out << oloc.nspace << "/";
-    out << oid;
-
-#if 0
-    out << " ";
-    if (may_read())
-      out << "r";
-    if (may_write())
-      out << "w";
-#endif
-    if (snapid != CEPH_NOSNAP)
-      out << "@" << snapid;
-
-    if (oloc.key.size())
-      out << " " << oloc;
-
-    out << " " << ops;
-    out << " " << pgid;
-    if (is_retry_attempt())
-      out << " RETRY=" << get_retry_attempt();
-    if (reassert_version != eversion_t())
-      out << " reassert_version=" << reassert_version;
-    if (get_snap_seq())
-      out << " snapc " << get_snap_seq() << "=" << snaps;
-    out << " " << ceph_osd_flag_string(get_flags());
-    out << " e" << osdmap_epoch;
+    out << "osd_op(";
+    if (!partial_decode_needed) {
+      out << get_reqid() << ' ';
+      out << pgid;
+      if (!final_decode_needed) {
+	out << ' ';
+	if (!oloc.nspace.empty())
+	  out << oloc.nspace << "/";
+	out << oid
+	    << " " << ops
+	    << " snapc " << get_snap_seq() << "=" << snaps;
+	if (oloc.key.size())
+	  out << " " << oloc;
+	if (is_retry_attempt())
+	  out << " RETRY=" << get_retry_attempt();
+      } else {
+	out << " (undecoded)";
+      }
+      out << " " << ceph_osd_flag_string(get_flags());
+      if (reassert_version != eversion_t())
+	out << " reassert_version=" << reassert_version;
+      out << " e" << osdmap_epoch;
+    }
     out << ")";
   }
 };
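
Taken together, the MOSDOp changes split decoding into two phases:
decode_payload() now stops after the routing fields for v7 encodings (pgid,
osdmap_epoch, flags, reassert_version, reqid), and finish_decode() completes
the rest on demand, with the partial/final flags asserting that accessors
are not used too early.  A hypothetical consumer of the split (the dispatch
functions are invented; only the MOSDOp methods are real):

    void queue_op(MOSDOp *m) {
      // decode_payload() already ran in the messenger, so the cheap
      // routing fields are valid without paying for the full decode.
      pg_t pgid = m->get_pg();
      epoch_t epoch = m->get_map_epoch();
      // ... look up the PG from (pgid, epoch) and enqueue ...
    }

    void execute_op(MOSDOp *m) {
      m->finish_decode();            // no-op if already fully decoded
      object_t &oid = m->get_oid();  // final-phase field: valid now
      // ... execute m->ops against oid ...
    }
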
diff --git a/src/messages/MOSDOpReply.h b/src/messages/MOSDOpReply.h
index 45ec3d0..087f165 100644
--- a/src/messages/MOSDOpReply.h
+++ b/src/messages/MOSDOpReply.h
@@ -39,7 +39,7 @@ class MOSDOpReply : public Message {
   pg_t pgid;
   vector<OSDOp> ops;
   int64_t flags;
-  int32_t result;
+  errorcode32_t result;
   eversion_t bad_replay_version;
   eversion_t replay_version;
   version_t user_version;
@@ -203,7 +203,7 @@ public:
       }
       ::decode_nohead(head.object_len, oid.name, p);
       pgid = pg_t(head.layout.ol_pgid);
-      result = head.result;
+      result = (int32_t)head.result;
       flags = head.flags;
       replay_version = head.reassert_version;
       user_version = replay_version.version;
diff --git a/src/messages/MOSDPGCreate.h b/src/messages/MOSDPGCreate.h
index a3dc3b7..c34f62a 100644
--- a/src/messages/MOSDPGCreate.h
+++ b/src/messages/MOSDPGCreate.h
@@ -81,13 +81,11 @@ public:
   }
 
   void print(ostream& out) const {
-    out << "osd_pg_create(";
-    map<pg_t,utime_t>::const_iterator ci = ctimes.begin();
+    out << "osd_pg_create(e" << epoch;
     for (map<pg_t,pg_create_t>::const_iterator i = mkpg.begin();
          i != mkpg.end();
-         ++i, ++ci) {
-      assert(ci != ctimes.end() && ci->first == i->first);
-      out << "pg" << i->first << "," << i->second.created << "@" << ci->second << "; ";
+         ++i) {
+      out << " " << i->first << ":" << i->second.created;
     }
     out << ")";
   }
diff --git a/src/messages/MRoute.h b/src/messages/MRoute.h
index 5282d39..109574e 100644
--- a/src/messages/MRoute.h
+++ b/src/messages/MRoute.h
@@ -22,24 +22,35 @@
 
 struct MRoute : public Message {
 
-  static const int HEAD_VERSION = 2;
+  static const int HEAD_VERSION = 3;
   static const int COMPAT_VERSION = 2;
 
   uint64_t session_mon_tid;
   Message *msg;
   entity_inst_t dest;
+  epoch_t send_osdmap_first;
   
-  MRoute() : Message(MSG_ROUTE, HEAD_VERSION, COMPAT_VERSION), msg(NULL) {}
+  MRoute() : Message(MSG_ROUTE, HEAD_VERSION, COMPAT_VERSION),
+	     session_mon_tid(0),
+	     msg(NULL),
+	     send_osdmap_first(0) {}
   MRoute(uint64_t t, Message *m)
-    : Message(MSG_ROUTE, HEAD_VERSION, COMPAT_VERSION), session_mon_tid(t), msg(m) {}
+    : Message(MSG_ROUTE, HEAD_VERSION, COMPAT_VERSION),
+      session_mon_tid(t),
+      msg(m),
+      send_osdmap_first(0) {}
   MRoute(bufferlist bl, const entity_inst_t& i)
-    : Message(MSG_ROUTE, HEAD_VERSION, COMPAT_VERSION), session_mon_tid(0), dest(i) {
+    : Message(MSG_ROUTE, HEAD_VERSION, COMPAT_VERSION),
+      session_mon_tid(0),
+      dest(i),
+      send_osdmap_first(0) {
     bufferlist::iterator p = bl.begin();
     msg = decode_message(NULL, 0, p);
   }
 private:
   ~MRoute() {
-    if (msg) msg->put();
+    if (msg)
+      msg->put();
   }
 
 public:
@@ -55,23 +66,25 @@ public:
     } else {
       msg = decode_message(NULL, 0, p);
     }
+    if (header.version >= 3) {
+      ::decode(send_osdmap_first, p);
+    }
   }
   void encode_payload(uint64_t features) {
     ::encode(session_mon_tid, payload);
     ::encode(dest, payload);
-    if (features & CEPH_FEATURE_MON_NULLROUTE) {
-      header.version = HEAD_VERSION;
-      header.compat_version = COMPAT_VERSION;
-      bool m = msg ? true : false;
-      ::encode(m, payload);
-      if (msg)
-	encode_message(msg, features, payload);
-    } else {
+    if ((features & CEPH_FEATURE_MON_NULLROUTE) == 0) {
       header.version = 1;
       header.compat_version = 1;
       assert(msg);
       encode_message(msg, features, payload);
+      return;
     }
+    bool m = msg ? true : false;
+    ::encode(m, payload);
+    if (msg)
+      encode_message(msg, features, payload);
+    ::encode(send_osdmap_first, payload);
   }
 
   const char *get_type_name() const { return "route"; }
@@ -80,6 +93,8 @@ public:
       o << "route(" << *msg;
     else
       o << "route(no-reply";
+    if (send_osdmap_first)
+      o << " send_osdmap_first " << send_osdmap_first;
     if (session_mon_tid)
       o << " tid " << session_mon_tid << ")";
     else
diff --git a/src/messages/MWatchNotify.h b/src/messages/MWatchNotify.h
index 50657ca..9f7d568 100644
--- a/src/messages/MWatchNotify.h
+++ b/src/messages/MWatchNotify.h
@@ -29,7 +29,7 @@ class MWatchNotify : public Message {
   uint64_t notify_id;  ///< osd unique id for a notify notification
   uint8_t opcode;      ///< CEPH_WATCH_EVENT_*
   bufferlist bl;       ///< notify payload (osd->client)
-  int32_t return_code; ///< notify result (osd->client)
+  errorcode32_t return_code; ///< notify result (osd->client)
   uint64_t notifier_gid; ///< who sent the notify
 
   MWatchNotify()
diff --git a/src/mon/Makefile.am b/src/mon/Makefile.am
index ee6542d..0835e6c 100644
--- a/src/mon/Makefile.am
+++ b/src/mon/Makefile.am
@@ -5,7 +5,7 @@ noinst_LTLIBRARIES += libmon_types.la
 if ENABLE_SERVER
 if WITH_MON
 
-libmon_la_SOURCES = \
+libmon_a_SOURCES = \
 	mon/Monitor.cc \
 	mon/Paxos.cc \
 	mon/PaxosService.cc \
@@ -19,8 +19,8 @@ libmon_la_SOURCES = \
 	mon/HealthMonitor.cc \
 	mon/DataHealthService.cc \
 	mon/ConfigKeyService.cc
-libmon_la_LIBADD = $(LIBAUTH) $(LIBCOMMON) $(LIBOS) $(LIBMON_TYPES)
-noinst_LTLIBRARIES += libmon.la
+libmon_a_LIBADD =
+noinst_LIBRARIES += libmon.a
 
 noinst_HEADERS += \
 	mon/AuthMonitor.h \
diff --git a/src/mon/MonCap.cc b/src/mon/MonCap.cc
index 989893b..a2540b5 100644
--- a/src/mon/MonCap.cc
+++ b/src/mon/MonCap.cc
@@ -134,6 +134,8 @@ void MonCapGrant::expand_profile(EntityName name) const
     profile_grants.push_back(MonCapGrant("mds", MON_CAP_ALL));
     profile_grants.push_back(MonCapGrant("mon", MON_CAP_R));
     profile_grants.push_back(MonCapGrant("osd", MON_CAP_R));
+    // This command grant is checked explicitly in MRemoveSnaps handling
+    profile_grants.push_back(MonCapGrant("osd pool rmsnap"));
     profile_grants.push_back(MonCapGrant("log", MON_CAP_W));
   }
   if (profile == "osd" || profile == "mds" || profile == "mon") {
diff --git a/src/mon/MonClient.cc b/src/mon/MonClient.cc
index 6e9843b..7183551 100644
--- a/src/mon/MonClient.cc
+++ b/src/mon/MonClient.cc
@@ -659,7 +659,13 @@ void MonClient::_reopen_session(int rank, string name)
   ::encode(global_id, m->auth_payload);
   _send_mon_message(m, true);
 
-  if (!sub_have.empty())
+  for (map<string,ceph_mon_subscribe_item>::iterator p = sub_sent.begin();
+       p != sub_sent.end();
+       ++p) {
+    if (sub_new.count(p->first) == 0)
+      sub_new[p->first] = p->second;
+  }
+  if (!sub_new.empty())
     _renew_subs();
 }
 
@@ -709,28 +715,30 @@ void MonClient::tick()
   } else if (!cur_mon.empty()) {
     // just renew as needed
     utime_t now = ceph_clock_now(cct);
-    ldout(cct, 10) << "renew subs? (now: " << now 
-		   << "; renew after: " << sub_renew_after << ") -- " 
-		   << (now > sub_renew_after ? "yes" : "no") 
-		   << dendl;
-    if (now > sub_renew_after)
-      _renew_subs();
+    if (!cur_con->has_feature(CEPH_FEATURE_MON_STATEFUL_SUB)) {
+      ldout(cct, 10) << "renew subs? (now: " << now
+		     << "; renew after: " << sub_renew_after << ") -- "
+		     << (now > sub_renew_after ? "yes" : "no")
+		     << dendl;
+      if (now > sub_renew_after)
+	_renew_subs();
+    }
 
     cur_con->send_keepalive();
 
     if (state == MC_STATE_HAVE_SESSION) {
-      send_log();
-
       if (cct->_conf->mon_client_ping_timeout > 0 &&
 	  cur_con->has_feature(CEPH_FEATURE_MSGR_KEEPALIVE2)) {
 	utime_t lk = cur_con->get_last_keepalive_ack();
-	utime_t interval = ceph_clock_now(cct) - lk;
+	utime_t interval = now - lk;
 	if (interval > cct->_conf->mon_client_ping_timeout) {
 	  ldout(cct, 1) << "no keepalive since " << lk << " (" << interval
 			<< " seconds), reconnecting" << dendl;
 	  _reopen_session();
 	}
       }
+
+      send_log();
     }
   }
 
@@ -751,7 +759,7 @@ void MonClient::schedule_tick()
 void MonClient::_renew_subs()
 {
   assert(monc_lock.is_locked());
-  if (sub_have.empty()) {
+  if (sub_new.empty()) {
     ldout(cct, 10) << "renew_subs - empty" << dendl;
     return;
   }
@@ -764,14 +772,19 @@ void MonClient::_renew_subs()
       sub_renew_sent = ceph_clock_now(cct);
 
     MMonSubscribe *m = new MMonSubscribe;
-    m->what = sub_have;
+    m->what = sub_new;
     _send_mon_message(m);
+
+    sub_sent.insert(sub_new.begin(), sub_new.end());
+    sub_new.clear();
   }
 }
 
 void MonClient::handle_subscribe_ack(MMonSubscribeAck *m)
 {
   if (sub_renew_sent != utime_t()) {
+    // NOTE: this is only needed for legacy (infernalis or older)
+    // mons; see tick().
     sub_renew_after = sub_renew_sent;
     sub_renew_after += m->interval / 2.0;
     ldout(cct, 10) << "handle_subscribe_ack sent " << sub_renew_sent << " renew after " << sub_renew_after << dendl;
diff --git a/src/mon/MonClient.h b/src/mon/MonClient.h
index a9761d1..c3efd78 100644
--- a/src/mon/MonClient.h
+++ b/src/mon/MonClient.h
@@ -209,31 +209,46 @@ public:
 
   // mon subscriptions
 private:
-  map<string,ceph_mon_subscribe_item> sub_have;  // my subs, and current versions
+  map<string,ceph_mon_subscribe_item> sub_sent; // my subs, and current versions
+  map<string,ceph_mon_subscribe_item> sub_new;  // unsent new subs
   utime_t sub_renew_sent, sub_renew_after;
 
   void _renew_subs();
   void handle_subscribe_ack(MMonSubscribeAck* m);
 
   bool _sub_want(string what, version_t start, unsigned flags) {
-    if (sub_have.count(what) &&
-	sub_have[what].start == start &&
-	sub_have[what].flags == flags)
+    if ((sub_new.count(what) == 0 &&
+	 sub_sent.count(what) &&
+	 sub_sent[what].start == start &&
+	 sub_sent[what].flags == flags) ||
+	(sub_new.count(what) &&
+	 sub_new[what].start == start &&
+	 sub_new[what].flags == flags))
       return false;
-    sub_have[what].start = start;
-    sub_have[what].flags = flags;
+    sub_new[what].start = start;
+    sub_new[what].flags = flags;
     return true;
   }
   void _sub_got(string what, version_t got) {
-    if (sub_have.count(what)) {
-      if (sub_have[what].flags & CEPH_SUBSCRIBE_ONETIME)
-	sub_have.erase(what);
-      else
-	sub_have[what].start = got + 1;
+    if (sub_new.count(what)) {
+      if (sub_new[what].start <= got) {
+	if (sub_new[what].flags & CEPH_SUBSCRIBE_ONETIME)
+	  sub_new.erase(what);
+	else
+	  sub_new[what].start = got + 1;
+      }
+    } else if (sub_sent.count(what)) {
+      if (sub_sent[what].start <= got) {
+	if (sub_sent[what].flags & CEPH_SUBSCRIBE_ONETIME)
+	  sub_sent.erase(what);
+	else
+	  sub_sent[what].start = got + 1;
+      }
     }
   }
   void _sub_unwant(string what) {
-    sub_have.erase(what);
+    sub_sent.erase(what);
+    sub_new.erase(what);
   }
 
   // auth tickets
@@ -262,10 +277,18 @@ public:
    */
   bool sub_want_increment(string what, version_t start, unsigned flags) {
     Mutex::Locker l(monc_lock);
-    map<string,ceph_mon_subscribe_item>::iterator i =
-            sub_have.find(what);
-    if (i == sub_have.end() || i->second.start < start) {
-      ceph_mon_subscribe_item& item = sub_have[what];
+    map<string,ceph_mon_subscribe_item>::iterator i = sub_new.find(what);
+    if (i != sub_new.end()) {
+      if (i->second.start >= start)
+	return false;
+      i->second.start = start;
+      i->second.flags = flags;
+      return true;
+    }
+
+    i = sub_sent.find(what);
+    if (i == sub_sent.end() || i->second.start < start) {
+      ceph_mon_subscribe_item& item = sub_new[what];
       item.start = start;
       item.flags = flags;
       return true;
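
The subscription rework replaces the single sub_have map with two: sub_new
holds wanted-but-unsent items and sub_sent holds items already on the wire,
so _reopen_session() can requeue everything while _renew_subs() sends only
deltas.  The lifecycle, condensed (the method calls are the real ones above;
the driver is illustrative and monc_lock handling is omitted):

    monc._sub_want("osdmap", 0, 0);  // lands in sub_new
    monc._renew_subs();              // sends MMonSubscribe{sub_new}, then
                                     // moves the entries into sub_sent
    monc._sub_got("osdmap", 42);     // bumps start to 43 (or erases a
                                     // ONETIME sub) in whichever map
                                     // currently holds the entry
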
diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h
index 0286b83..67936d5 100644
--- a/src/mon/MonCommands.h
+++ b/src/mon/MonCommands.h
@@ -674,11 +674,11 @@ COMMAND("osd pool rename " \
 	"rename <srcpool> to <destpool>", "osd", "rw", "cli,rest")
 COMMAND("osd pool get " \
 	"name=pool,type=CephPoolname " \
-	"name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hashpspool|nodelete|nopgchange|nosizechange|write_fadvise_dontneed|noscrub|nodeep-scrub|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|auid|target_max_objects|target_max_bytes|cache_target_dirty_ratio|cache_target_dirty_high_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|erasure_code_profile|min_read_recency_for_promote|all|min_write_recency_for_promote|fast_ [...]
+	"name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hashpspool|nodelete|nopgchange|nosizechange|write_fadvise_dontneed|noscrub|nodeep-scrub|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|auid|target_max_objects|target_max_bytes|cache_target_dirty_ratio|cache_target_dirty_high_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|erasure_code_profile|min_read_recency_for_promote|all|min_write_recency_for_promote|fast_ [...]
 	"get pool parameter <var>", "osd", "r", "cli,rest")
 COMMAND("osd pool set " \
 	"name=pool,type=CephPoolname " \
-	"name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hashpspool|nodelete|nopgchange|nosizechange|write_fadvise_dontneed|noscrub|nodeep-scrub|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|use_gmt_hitset|debug_fake_ec_pool|target_max_bytes|target_max_objects|cache_target_dirty_ratio|cache_target_dirty_high_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|auid|min_read_recency_for_promote|min_write_recency_for_prom [...]
+	"name=var,type=CephChoices,strings=size|min_size|crash_replay_interval|pg_num|pgp_num|crush_ruleset|hashpspool|nodelete|nopgchange|nosizechange|write_fadvise_dontneed|noscrub|nodeep-scrub|hit_set_type|hit_set_period|hit_set_count|hit_set_fpp|use_gmt_hitset|debug_fake_ec_pool|target_max_bytes|target_max_objects|cache_target_dirty_ratio|cache_target_dirty_high_ratio|cache_target_full_ratio|cache_min_flush_age|cache_min_evict_age|auid|min_read_recency_for_promote|min_write_recency_for_prom [...]
 	"name=val,type=CephString " \
 	"name=force,type=CephChoices,strings=--yes-i-really-mean-it,req=false", \
 	"set pool parameter <var> to <val>", "osd", "rw", "cli,rest")
diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc
index dcfd512..cbf37c2 100644
--- a/src/mon/Monitor.cc
+++ b/src/mon/Monitor.cc
@@ -1812,6 +1812,7 @@ void Monitor::win_standalone_election()
 
   // bump election epoch, in case the previous epoch included other
   // monitors; we need to be able to make the distinction.
+  elector.init();
   elector.advance_epoch();
 
   rank = monmap->get_rank(name);
@@ -1822,7 +1823,7 @@ void Monitor::win_standalone_election()
   const MonCommand *my_cmds;
   int cmdsize;
   get_locally_supported_monitor_commands(&my_cmds, &cmdsize);
-  win_election(1, q, CEPH_FEATURES_ALL, my_cmds, cmdsize, NULL);
+  win_election(elector.get_epoch(), q, CEPH_FEATURES_ALL, my_cmds, cmdsize, NULL);
 }
 
 const utime_t& Monitor::get_leader_since() const
@@ -3271,8 +3272,14 @@ void Monitor::handle_route(MonOpRequestRef op)
 	rr->con->send_message(m->msg);
 	m->msg = NULL;
       }
+      if (m->send_osdmap_first) {
+	dout(10) << " sending osdmaps from " << m->send_osdmap_first << dendl;
+	osdmon()->send_incremental(m->send_osdmap_first, rr->session,
+				   true, MonOpRequestRef());
+      }
+      assert(rr->tid == m->session_mon_tid && rr->session->routed_request_tids.count(m->session_mon_tid));
       routed_requests.erase(m->session_mon_tid);
-      rr->session->routed_request_tids.insert(rr->tid);
+      rr->session->routed_request_tids.erase(m->session_mon_tid);
       delete rr;
     } else {
       dout(10) << " don't have routed request tid " << m->session_mon_tid << dendl;
@@ -3300,6 +3307,10 @@ void Monitor::resend_routed_requests()
       dout(10) << " requeue for self tid " << rr->tid << dendl;
       rr->op->mark_event("retry routed request");
       retry.push_back(new C_RetryMessage(this, rr->op));
+      if (rr->session) {
+        assert(rr->session->routed_request_tids.count(p->first));
+        rr->session->routed_request_tids.erase(p->first);
+      }
       delete rr;
     } else {
       bufferlist::iterator q = rr->request_bl.begin();
@@ -3328,13 +3339,13 @@ void Monitor::remove_session(MonSession *s)
   for (set<uint64_t>::iterator p = s->routed_request_tids.begin();
        p != s->routed_request_tids.end();
        ++p) {
-    if (routed_requests.count(*p)) {
-      RoutedRequest *rr = routed_requests[*p];
-      dout(10) << " dropping routed request " << rr->tid << dendl;
-      delete rr;
-      routed_requests.erase(*p);
-    }
+    assert(routed_requests.count(*p));
+    RoutedRequest *rr = routed_requests[*p];
+    dout(10) << " dropping routed request " << rr->tid << dendl;
+    delete rr;
+    routed_requests.erase(*p);
   }
+  s->routed_request_tids.clear();
   s->con->set_priv(NULL);
   session_map.remove_session(s);
   logger->set(l_mon_num_sessions, session_map.get_size());
@@ -3426,22 +3437,17 @@ void Monitor::_ms_dispatch(Message *m)
       return;
     }
 
-    s = session_map.new_session(m->get_source_inst(), m->get_connection().get());
+    ConnectionRef con = m->get_connection();
+    s = session_map.new_session(m->get_source_inst(), con.get());
     assert(s);
-    m->get_connection()->set_priv(s->get());
+    con->set_priv(s->get());
     dout(10) << __func__ << " new session " << s << " " << *s << dendl;
     op->set_session(s);
 
     logger->set(l_mon_num_sessions, session_map.get_size());
     logger->inc(l_mon_session_add);
 
-    if (!src_is_mon) {
-      dout(30) << __func__ << "  setting timeout on session" << dendl;
-      // set an initial timeout here, so we will trim this session
-      // even if they don't do anything.
-      s->until = ceph_clock_now(g_ceph_context);
-      s->until += g_conf->mon_subscribe_interval;
-    } else {
+    if (src_is_mon) {
       // give it monitor caps; the peer type has been authenticated
       dout(5) << __func__ << " setting monitor caps on this connection" << dendl;
       if (!s->caps.is_allow_all()) // but no need to repeatedly copy
@@ -3454,6 +3460,10 @@ void Monitor::_ms_dispatch(Message *m)
   }
 
   assert(s);
+
+  s->session_timeout = ceph_clock_now(NULL);
+  s->session_timeout += g_conf->mon_session_timeout;
+
   if (s->auth_handler) {
     s->entity_name = s->auth_handler->get_entity_name();
   }
@@ -4125,8 +4135,6 @@ void Monitor::handle_subscribe(MonOpRequestRef op)
   MonSession *s = op->get_session();
   assert(s);
 
-  s->until = ceph_clock_now(g_ceph_context);
-  s->until += g_conf->mon_subscribe_interval;
   for (map<string,ceph_mon_subscribe_item>::iterator p = m->what.begin();
        p != m->what.end();
        ++p) {
@@ -4161,10 +4169,14 @@ void Monitor::handle_subscribe(MonOpRequestRef op)
     }
   }
 
-  // ???
-
-  if (reply)
-    m->get_connection()->send_message(new MMonSubscribeAck(monmap->get_fsid(), (int)g_conf->mon_subscribe_interval));
+  if (reply) {
+    // we only need to reply if the client is old enough to think it
+    // has to send renewals.
+    ConnectionRef con = m->get_connection();
+    if (!con->has_feature(CEPH_FEATURE_MON_STATEFUL_SUB))
+      m->get_connection()->send_message(new MMonSubscribeAck(
+	monmap->get_fsid(), (int)g_conf->mon_subscribe_interval));
+  }
 
 }
 
@@ -4697,11 +4709,17 @@ void Monitor::tick()
     
     // don't trim monitors
     if (s->inst.name.is_mon())
-      continue; 
+      continue;
 
-    if (!s->until.is_zero() && s->until < now) {
+    if (s->session_timeout < now && s->con) {
+      // check keepalive, too
+      s->session_timeout = s->con->get_last_keepalive();
+      s->session_timeout += g_conf->mon_session_timeout;
+    }
+    if (s->session_timeout < now) {
       dout(10) << " trimming session " << s->con << " " << s->inst
-	       << " (until " << s->until << " < now " << now << ")" << dendl;
+	       << " (timeout " << s->session_timeout
+	       << " < now " << now << ")" << dendl;
     } else if (out_for_too_long) {
       // boot the client Session because we've taken too long getting back in
       dout(10) << " trimming session " << s->con << " " << s->inst
@@ -5004,9 +5022,9 @@ bool Monitor::ms_verify_authorizer(Connection *con, int peer_type,
       CephXServiceTicketInfo auth_ticket_info;
       
       if (authorizer_data.length()) {
-	int ret = cephx_verify_authorizer(g_ceph_context, &keyring, iter,
+	bool ret = cephx_verify_authorizer(g_ceph_context, &keyring, iter,
 					  auth_ticket_info, authorizer_reply);
-	if (ret >= 0) {
+	if (ret) {
 	  session_key = auth_ticket_info.session_key;
 	  isvalid = true;
 	} else {
diff --git a/src/mon/MonitorDBStore.h b/src/mon/MonitorDBStore.h
index 6b1d6c8..15558e3 100644
--- a/src/mon/MonitorDBStore.h
+++ b/src/mon/MonitorDBStore.h
@@ -21,12 +21,13 @@
 #include <boost/scoped_ptr.hpp>
 #include <sstream>
 #include <fstream>
-#include "os/KeyValueDB.h"
+#include "kv/KeyValueDB.h"
 
 #include "include/assert.h"
 #include "common/Formatter.h"
 #include "common/Finisher.h"
 #include "common/errno.h"
+#include "common/debug.h"
 
 class MonitorDBStore
 {
diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc
index 9ebb349..040332c 100644
--- a/src/mon/OSDMonitor.cc
+++ b/src/mon/OSDMonitor.cc
@@ -43,6 +43,7 @@
 #include "messages/MMonCommand.h"
 #include "messages/MRemoveSnaps.h"
 #include "messages/MOSDScrub.h"
+#include "messages/MRoute.h"
 
 #include "common/TextTable.h"
 #include "common/Timer.h"
@@ -1023,6 +1024,9 @@ void OSDMonitor::maybe_prime_pg_temp()
 void OSDMonitor::prime_pg_temp(OSDMap& next,
 			       ceph::unordered_map<pg_t, pg_stat_t>::iterator pp)
 {
+  // do not prime creating pgs
+  if (pp->second.state & PG_STATE_CREATING)
+    return;
   // do not touch a mapping if a change is pending
   if (pending_inc.new_pg_temp.count(pp->first))
     return;
@@ -1669,9 +1673,9 @@ bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi)
   }
 
   dout(10) << " osd." << target_osd << " has "
-	   << fi.reporters.size() << " reporters and "
-	   << fi.num_reports << " reports, "
-	   << grace << " grace (" << orig_grace << " + " << my_grace << " + " << peer_grace << "), max_failed_since " << max_failed_since
+	   << fi.reporters.size() << " reporters, "
+	   << grace << " grace (" << orig_grace << " + " << my_grace
+	   << " + " << peer_grace << "), max_failed_since " << max_failed_since
 	   << dendl;
 
   // already pending failure?
@@ -1682,13 +1686,13 @@ bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi)
   }
 
   if (failed_for >= grace &&
-      ((int)fi.reporters.size() >= g_conf->mon_osd_min_down_reporters) &&
-      (fi.num_reports >= g_conf->mon_osd_min_down_reports)) {
-    dout(1) << " we have enough reports/reporters to mark osd." << target_osd << " down" << dendl;
+      ((int)fi.reporters.size() >= g_conf->mon_osd_min_down_reporters)) {
+    dout(1) << " we have enough reporters to mark osd." << target_osd
+	    << " down" << dendl;
     pending_inc.new_state[target_osd] = CEPH_OSD_UP;
 
     mon->clog->info() << osdmap.get_inst(target_osd) << " failed ("
-		     << fi.num_reports << " reports from " << (int)fi.reporters.size() << " peers after "
+		     << (int)fi.reporters.size() << " reporters after "
 		     << failed_for << " >= grace " << grace << ")\n";
     return true;
   }
@@ -1699,7 +1703,8 @@ bool OSDMonitor::prepare_failure(MonOpRequestRef op)
 {
   op->mark_osdmon_event(__func__);
   MOSDFailure *m = static_cast<MOSDFailure*>(op->get_req());
-  dout(1) << "prepare_failure " << m->get_target() << " from " << m->get_orig_source_inst()
+  dout(1) << "prepare_failure " << m->get_target()
+	  << " from " << m->get_orig_source_inst()
           << " is reporting failure:" << m->if_osd_failed() << dendl;
 
   int target_osd = m->get_target().name.num();
@@ -1709,7 +1714,9 @@ bool OSDMonitor::prepare_failure(MonOpRequestRef op)
 
   // calculate failure time
   utime_t now = ceph_clock_now(g_ceph_context);
-  utime_t failed_since = m->get_recv_stamp() - utime_t(m->failed_for ? m->failed_for : g_conf->osd_heartbeat_grace, 0);
+  utime_t failed_since =
+    m->get_recv_stamp() -
+    utime_t(m->failed_for ? m->failed_for : g_conf->osd_heartbeat_grace, 0);
 
   if (m->if_osd_failed()) {
     // add a report
@@ -1725,7 +1732,7 @@ bool OSDMonitor::prepare_failure(MonOpRequestRef op)
   } else {
     // remove the report
     mon->clog->debug() << m->get_target() << " failure report canceled by "
-		      << m->get_orig_source_inst() << "\n";
+		       << m->get_orig_source_inst() << "\n";
     if (failure_info.count(target_osd)) {
       failure_info_t& fi = failure_info[target_osd];
       list<MonOpRequestRef> ls;
@@ -1737,12 +1744,12 @@ bool OSDMonitor::prepare_failure(MonOpRequestRef op)
 	ls.pop_front();
       }
       if (fi.reporters.empty()) {
-	dout(10) << " removing last failure_info for osd." << target_osd << dendl;
+	dout(10) << " removing last failure_info for osd." << target_osd
+		 << dendl;
 	failure_info.erase(target_osd);
       } else {
 	dout(10) << " failure_info for osd." << target_osd << " now "
-		 << fi.reporters.size() << " reporters and "
-		 << fi.num_reports << " reports" << dendl;
+		 << fi.reporters.size() << " reporters" << dendl;
       }
     } else {
       dout(10) << " no failure_info for osd." << target_osd << dendl;
@@ -2271,7 +2278,8 @@ bool OSDMonitor::preprocess_remove_snaps(MonOpRequestRef op)
   MonSession *session = m->get_session();
   if (!session)
     goto ignore;
-  if (!session->is_capable("osd", MON_CAP_R | MON_CAP_W)) {
+  if (!session->caps.is_capable(g_ceph_context, session->entity_name,
+        "osd", "osd pool rmsnap", {}, true, true, false)) {
     dout(0) << "got preprocess_remove_snaps from entity with insufficient caps "
 	    << session->caps << dendl;
     goto ignore;
@@ -2400,7 +2408,20 @@ void OSDMonitor::send_incremental(MonOpRequestRef op, epoch_t first)
 
   MonSession *s = op->get_session();
   assert(s);
-  send_incremental(first, s, false, op);
+
+  if (s->proxy_con &&
+      s->proxy_con->has_feature(CEPH_FEATURE_MON_ROUTE_OSDMAP)) {
+    // oh, we can tell the other mon to do it
+    dout(10) << __func__ << " asking proxying mon to send_incremental from "
+	     << first << dendl;
+    MRoute *r = new MRoute(s->proxy_tid, NULL);
+    r->send_osdmap_first = first;
+    s->proxy_con->send_message(r);
+    op->mark_event("reply: send routed send_osdmap_first reply");
+  } else {
+    // do it ourselves
+    send_incremental(first, s, false, op);
+  }
 }
 
 void OSDMonitor::send_incremental(epoch_t first,
@@ -2428,7 +2449,7 @@ void OSDMonitor::send_incremental(epoch_t first,
 	     << first << " " << bl.length() << " bytes" << dendl;
 
     MOSDMap *m = new MOSDMap(osdmap.get_fsid());
-    m->oldest_map = first;
+    m->oldest_map = get_first_committed();
     m->newest_map = osdmap.get_epoch();
     m->maps[first] = bl;
 
@@ -2885,7 +2906,8 @@ namespace {
     CACHE_TARGET_FULL_RATIO,
     CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
     ERASURE_CODE_PROFILE, MIN_READ_RECENCY_FOR_PROMOTE,
-    MIN_WRITE_RECENCY_FOR_PROMOTE, FAST_READ};
+    MIN_WRITE_RECENCY_FOR_PROMOTE, FAST_READ,
+    HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N};
 
   std::set<osd_pool_get_choices>
     subtract_second_from_first(const std::set<osd_pool_get_choices>& first,
@@ -3359,16 +3381,18 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op)
       ("erasure_code_profile", ERASURE_CODE_PROFILE)
       ("min_read_recency_for_promote", MIN_READ_RECENCY_FOR_PROMOTE)
       ("min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE)
-      ("fast_read", FAST_READ);
+      ("fast_read", FAST_READ)
+      ("hit_set_grade_decay_rate", HIT_SET_GRADE_DECAY_RATE)
+      ("hit_set_search_last_n", HIT_SET_SEARCH_LAST_N);
 
     typedef std::set<osd_pool_get_choices> choices_set_t;
 
     const choices_set_t ONLY_TIER_CHOICES = boost::assign::list_of
       (HIT_SET_TYPE)(HIT_SET_PERIOD)(HIT_SET_COUNT)(HIT_SET_FPP)
       (TARGET_MAX_OBJECTS)(TARGET_MAX_BYTES)(CACHE_TARGET_FULL_RATIO)
-      (CACHE_TARGET_DIRTY_RATIO)(CACHE_TARGET_DIRTY_HIGH_RATIO)(CACHE_MIN_FLUSH_AGE)
-      (CACHE_MIN_EVICT_AGE)(MIN_READ_RECENCY_FOR_PROMOTE);
-
+      (CACHE_TARGET_DIRTY_RATIO)(CACHE_TARGET_DIRTY_HIGH_RATIO)
+      (CACHE_MIN_FLUSH_AGE)(CACHE_MIN_EVICT_AGE)(MIN_READ_RECENCY_FOR_PROMOTE)
+      (HIT_SET_GRADE_DECAY_RATE)(HIT_SET_SEARCH_LAST_N);
     const choices_set_t ONLY_ERASURE_CHOICES = boost::assign::list_of
       (ERASURE_CODE_PROFILE);
 
@@ -3530,6 +3554,14 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op)
           case FAST_READ:
             f->dump_int("fast_read", p->fast_read);
             break;
+	  case HIT_SET_GRADE_DECAY_RATE:
+	    f->dump_int("hit_set_grade_decay_rate",
+			p->hit_set_grade_decay_rate);
+	    break;
+	  case HIT_SET_SEARCH_LAST_N:
+	    f->dump_int("hit_set_search_last_n",
+			p->hit_set_search_last_n);
+	    break;
 	}
 	f->close_section();
 	f->flush(rdata);
@@ -3620,6 +3652,14 @@ bool OSDMonitor::preprocess_command(MonOpRequestRef op)
 	    ss << "min_read_recency_for_promote: " <<
 	      p->min_read_recency_for_promote << "\n";
 	    break;
+	  case HIT_SET_GRADE_DECAY_RATE:
+	    ss << "hit_set_grade_decay_rate: " <<
+	      p->hit_set_grade_decay_rate << "\n";
+	    break;
+	  case HIT_SET_SEARCH_LAST_N:
+	    ss << "hit_set_search_last_n: " <<
+	      p->hit_set_search_last_n << "\n";
+	    break;
 	  case HASHPSPOOL:
 	  case NODELETE:
 	  case NOPGCHANGE:
@@ -4734,8 +4774,8 @@ int OSDMonitor::prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
        var == "target_max_objects" || var == "target_max_bytes" ||
        var == "cache_target_full_ratio" || var == "cache_target_dirty_ratio" ||
        var == "cache_target_dirty_high_ratio" ||
-       var == "cache_min_flush_age" || var == "cache_min_evict_age")) {
-    ss << "pool '" << poolstr << "' is not a tier pool: variable not applicable";
+       var == "cache_min_flush_age" || var == "cache_min_evict_age" ||
+       var == "hit_set_grade_decay_rate" || var == "hit_set_search_last_n")) {
     return -EACCES;
   }
 
@@ -4921,7 +4961,6 @@ int OSDMonitor::prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
     }
     p.hit_set_period = n;
   } else if (var == "hit_set_count") {
-
     if (interr.length()) {
       ss << "error parsing integer value '" << val << "': " << interr;
       return -EINVAL;
@@ -5013,6 +5052,26 @@ int OSDMonitor::prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
       return -EINVAL;
     }
     p.min_read_recency_for_promote = n;
+  } else if (var == "hit_set_grade_decay_rate") {
+    if (interr.length()) {
+      ss << "error parsing integer value '" << val << "': " << interr;
+      return -EINVAL;
+    }
+    if (n > 100 || n < 0) {
+      ss << "value out of range,valid range is 0 - 100";
+      return -EINVAL;
+    }
+    p.hit_set_grade_decay_rate = n;
+  } else if (var == "hit_set_search_last_n") {
+    if (interr.length()) {
+      ss << "error parsing integer value '" << val << "': " << interr;
+      return -EINVAL;
+    }
+    if (n > p.hit_set_count || n < 0) {
+      ss << "value out of range,valid range is 0 - hit_set_count";
+      return -EINVAL;
+    }
+    p.hit_set_search_last_n = n;
   } else if (var == "min_write_recency_for_promote") {
     if (interr.length()) {
       ss << "error parsing integer value '" << val << "': " << interr;
@@ -6121,6 +6180,11 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
       err = -ENOENT;
       goto reply;
     }
+    if (pending_inc.new_pg_temp.count(pgid)) {
+      dout(10) << __func__ << " waiting for pending update on " << pgid << dendl;
+      wait_for_finished_proposal(op, new C_RetryMessage(this, op));
+      return true;
+    }
 
     vector<string> id_vec;
     vector<int32_t> new_pg_temp;
@@ -7167,6 +7231,8 @@ done:
     ntp->hit_set_period = g_conf->osd_tier_default_cache_hit_set_period;
     ntp->min_read_recency_for_promote = g_conf->osd_tier_default_cache_min_read_recency_for_promote;
     ntp->min_write_recency_for_promote = g_conf->osd_tier_default_cache_min_write_recency_for_promote;
+    ntp->hit_set_grade_decay_rate = g_conf->osd_tier_default_cache_hit_set_grade_decay_rate;
+    ntp->hit_set_search_last_n = g_conf->osd_tier_default_cache_hit_set_search_last_n;
     ntp->hit_set_params = hsp;
     ntp->target_max_bytes = size;
     ss << "pool '" << tierpoolstr << "' is now (or already was) a cache tier of '" << poolstr << "'";
diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h
index 78e00f9..7638b6a 100644
--- a/src/mon/OSDMonitor.h
+++ b/src/mon/OSDMonitor.h
@@ -51,22 +51,20 @@ class PGMap;
 
 /// information about a particular peer's failure reports for one osd
 struct failure_reporter_t {
-  int num_reports;          ///< reports from this reporter
   utime_t failed_since;     ///< when they think it failed
-  MonOpRequestRef op;       ///< most recent failure op request
+  MonOpRequestRef op;       ///< failure op request
 
-  failure_reporter_t() : num_reports(0) {}
-  failure_reporter_t(utime_t s) : num_reports(1), failed_since(s) {}
+  failure_reporter_t() {}
+  failure_reporter_t(utime_t s) : failed_since(s) {}
   ~failure_reporter_t() { }
 };
 
 /// information about all failure reports for one osd
 struct failure_info_t {
-  map<int, failure_reporter_t> reporters;  ///< reporter -> # reports
+  map<int, failure_reporter_t> reporters;  ///< reporter -> failed_since etc
   utime_t max_failed_since;                ///< most recent failed_since
-  int num_reports;
 
-  failure_info_t() : num_reports(0) {}
+  failure_info_t() {}
 
   utime_t get_failed_since() {
     if (max_failed_since == utime_t() && !reporters.empty()) {
@@ -83,7 +81,7 @@ struct failure_info_t {
   // set the message for the latest report.  return any old op request we had,
   // if any, so we can discard it.
   MonOpRequestRef add_report(int who, utime_t failed_since,
-                              MonOpRequestRef op) {
+			     MonOpRequestRef op) {
     map<int, failure_reporter_t>::iterator p = reporters.find(who);
     if (p == reporters.end()) {
       if (max_failed_since == utime_t())
@@ -91,10 +89,7 @@ struct failure_info_t {
       else if (max_failed_since < failed_since)
 	max_failed_since = failed_since;
       p = reporters.insert(map<int, failure_reporter_t>::value_type(who, failure_reporter_t(failed_since))).first;
-    } else {
-      p->second.num_reports++;
     }
-    num_reports++;
 
     MonOpRequestRef ret = p->second.op;
     p->second.op = op;
@@ -116,7 +111,6 @@ struct failure_info_t {
     map<int, failure_reporter_t>::iterator p = reporters.find(who);
     if (p == reporters.end())
       return;
-    num_reports -= p->second.num_reports;
     reporters.erase(p);
     if (reporters.empty())
       max_failed_since = utime_t();
@@ -227,11 +221,12 @@ private:
   MOSDMap *build_incremental(epoch_t first, epoch_t last);
   void send_full(MonOpRequestRef op);
   void send_incremental(MonOpRequestRef op, epoch_t first);
+public:
   // @param req an optional op request, if the osdmaps are replies to it. so
   //            @c Monitor::send_reply() can mark_event with it.
   void send_incremental(epoch_t first, MonSession *session, bool onetime,
 			MonOpRequestRef req = MonOpRequestRef());
-
+private:
   int reweight_by_utilization(int oload, std::string& out_str, bool by_pg,
 			      const set<int64_t> *pools);
 
diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc
index 1eee600..6eb8b33 100644
--- a/src/mon/PGMap.cc
+++ b/src/mon/PGMap.cc
@@ -471,8 +471,9 @@ void PGMap::stat_pg_add(const pg_t &pgid, const pg_stat_t &s, bool nocreating,
   if (!nocreating) {
     if (s.state & PG_STATE_CREATING) {
       creating_pgs.insert(pgid);
-      if (s.acting_primary >= 0)
-	creating_pgs_by_osd[s.acting_primary].insert(pgid);
+      if (s.acting_primary >= 0) {
+	creating_pgs_by_osd_epoch[s.acting_primary][s.mapping_epoch].insert(pgid);
+      }
     }
   }
 
@@ -501,16 +502,21 @@ void PGMap::stat_pg_sub(const pg_t &pgid, const pg_stat_t &s, bool nocreating,
   pg_sum.sub(s);
 
   num_pg--;
-  if (--num_pg_by_state[s.state] == 0)
+  int end = --num_pg_by_state[s.state];
+  assert(end >= 0);
+  if (end == 0)
     num_pg_by_state.erase(s.state);
 
   if (!nocreating) {
     if (s.state & PG_STATE_CREATING) {
       creating_pgs.erase(pgid);
       if (s.acting_primary >= 0) {
-	creating_pgs_by_osd[s.acting_primary].erase(pgid);
-	if (creating_pgs_by_osd[s.acting_primary].size() == 0)
-	  creating_pgs_by_osd.erase(s.acting_primary);
+	map<epoch_t,set<pg_t> >& r = creating_pgs_by_osd_epoch[s.acting_primary];
+	r[s.mapping_epoch].erase(pgid);
+	if (r[s.mapping_epoch].empty())
+	  r.erase(s.mapping_epoch);
+	if (r.empty())
+	  creating_pgs_by_osd_epoch.erase(s.acting_primary);
       }
     }
   }
diff --git a/src/mon/PGMap.h b/src/mon/PGMap.h
index e1cdf2d..d2b9e8a 100644
--- a/src/mon/PGMap.h
+++ b/src/mon/PGMap.h
@@ -181,8 +181,8 @@ public:
 
  public:
 
-  set<pg_t> creating_pgs;   // lru: front = new additions, back = recently pinged
-  map<int,set<pg_t> > creating_pgs_by_osd;
+  set<pg_t> creating_pgs;
+  map<int,map<epoch_t,set<pg_t> > > creating_pgs_by_osd_epoch;
 
   // Bits that use to be enum StuckPG
   static const int STUCK_INACTIVE = (1<<0);
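
The PGMap change above replaces the flat osd -> pgs map with a two-level
osd -> mapping-epoch -> pgs structure so that create messages can later be
replayed per epoch. A minimal sketch of the insert/erase bookkeeping, with
plain integers standing in for pg_t and epoch_t; the erase path has to
prune empty inner buckets so later lower_bound() scans only see real epochs:

    #include <map>
    #include <set>

    using pg_id = unsigned;   // stand-in for pg_t
    using epoch = unsigned;   // stand-in for epoch_t

    std::map<int, std::map<epoch, std::set<pg_id>>> creating_by_osd_epoch;

    void add_creating(int osd, epoch e, pg_id pg) {
      creating_by_osd_epoch[osd][e].insert(pg);
    }

    void sub_creating(int osd, epoch e, pg_id pg) {
      auto& by_epoch = creating_by_osd_epoch[osd];
      by_epoch[e].erase(pg);
      if (by_epoch[e].empty())
        by_epoch.erase(e);                     // drop the empty epoch bucket
      if (by_epoch.empty())
        creating_by_osd_epoch.erase(osd);      // drop the empty osd bucket
    }
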
diff --git a/src/mon/PGMonitor.cc b/src/mon/PGMonitor.cc
index 77a5af1..2f37a56 100644
--- a/src/mon/PGMonitor.cc
+++ b/src/mon/PGMonitor.cc
@@ -128,7 +128,8 @@ void PGMonitor::tick()
   if (mon->is_leader()) {
     bool propose = false;
     
-    if (need_check_down_pgs && check_down_pgs())
+    if ((need_check_down_pgs || !need_check_down_pg_osds.empty()) &&
+	check_down_pgs())
       propose = true;
     
     if (propose) {
@@ -275,11 +276,6 @@ void PGMonitor::update_from_paxos(bool *need_bootstrap)
 
   assert(version == pg_map.version);
 
-  if (mon->osdmon()->osdmap.get_epoch()) {
-    map_pg_creates();
-    send_pg_creates();
-  }
-
   update_logger();
 }
 
@@ -307,8 +303,8 @@ void PGMonitor::upgrade_format()
 
 void PGMonitor::post_paxos_update()
 {
+  dout(10) << __func__ << dendl;
   if (mon->osdmon()->osdmap.get_epoch()) {
-    map_pg_creates();
     send_pg_creates();
   }
 }
@@ -361,8 +357,6 @@ void PGMonitor::read_pgmap_meta()
 
   if (last_pg_scan != pg_map.get_last_pg_scan()) {
     pg_map.set_last_pg_scan(last_pg_scan);
-    // clear our osdmap epoch so that map_pg_creates() will re-run
-    last_map_pg_create_osd_epoch = 0;
   }
 
   float full_ratio, nearfull_ratio;
@@ -438,16 +432,22 @@ void PGMonitor::apply_pgmap_delta(bufferlist& bl)
   while (!p.end()) {
     pg_t pgid;
     ::decode(pgid, p);
-    bufferlist bl;
-    int r = mon->store->get(pgmap_pg_prefix, stringify(pgid), bl);
-    dout(20) << " refreshing pg " << pgid << " got " << r << " len "
-	     << bl.length() << dendl;
 
-    if (pg_pool_sum_old.count(pgid.pool()) == 0)
-      pg_pool_sum_old[pgid.pool()] = pg_map.pg_pool_sum[pgid.pool()];
+    int r;
+    bufferlist pgbl;
+    if (deleted_pools.count(pgid.pool())) {
+      r = -ENOENT;
+    } else {
+      r = mon->store->get(pgmap_pg_prefix, stringify(pgid), pgbl);
+      dout(20) << " refreshing pg " << pgid << " got " << r << " len "
+	       << pgbl.length() << dendl;
+
+      if (pg_pool_sum_old.count(pgid.pool()) == 0)
+	pg_pool_sum_old[pgid.pool()] = pg_map.pg_pool_sum[pgid.pool()];
+    }
 
     if (r >= 0) {
-      pg_map.update_pg(pgid, bl);
+      pg_map.update_pg(pgid, pgbl);
     } else {
       pg_map.remove_pg(pgid);
       if (pgid.ps() == 0)
@@ -829,13 +829,6 @@ bool PGMonitor::prepare_pg_stats(MonOpRequestRef op)
 	     << " -> " << pg_state_string(p->second.state)
 	     << dendl;
     pending_inc.pg_stat_updates[pgid] = p->second;
-
-    /*
-    // we don't care much about consistency, here; apply to live map.
-    pg_map.stat_pg_sub(pgid, pg_map.pg_stat[pgid]);
-    pg_map.pg_stat[pgid] = p->second;
-    pg_map.stat_pg_add(pgid, pg_map.pg_stat[pgid]);
-    */
   }
   
   wait_for_finished_proposal(op, new C_Stats(this, op, ack_op));
@@ -874,18 +867,19 @@ void PGMonitor::check_osd_map(epoch_t epoch)
     return; // whatever.
 
   if (pg_map.last_osdmap_epoch >= epoch) {
-    dout(10) << "check_osd_map already seen " << pg_map.last_osdmap_epoch << " >= " << epoch << dendl;
+    dout(10) << __func__ << " already seen " << pg_map.last_osdmap_epoch
+	     << " >= " << epoch << dendl;
     return;
   }
 
   if (!mon->osdmon()->is_readable()) {
-    dout(10) << "check_osd_map -- osdmap not readable, waiting" << dendl;
+    dout(10) << __func__ << " -- osdmap not readable, waiting" << dendl;
     mon->osdmon()->wait_for_readable_ctx(new RetryCheckOSDMap(this, epoch));
     return;
   }
 
   if (!is_writeable()) {
-    dout(10) << "check_osd_map -- pgmap not writeable, waiting" << dendl;
+    dout(10) << __func__ << " -- pgmap not writeable, waiting" << dendl;
     wait_for_writeable_ctx(new RetryCheckOSDMap(this, epoch));
     return;
   }
@@ -894,7 +888,7 @@ void PGMonitor::check_osd_map(epoch_t epoch)
   for (epoch_t e = pg_map.last_osdmap_epoch+1;
        e <= epoch;
        e++) {
-    dout(10) << "check_osd_map applying osdmap e" << e << " to pg_map" << dendl;
+    dout(10) << __func__ << " applying osdmap e" << e << " to pg_map" << dendl;
     bufferlist bl;
     int err = mon->osdmon()->get_version(e, bl);
     assert(err == 0);
@@ -905,7 +899,7 @@ void PGMonitor::check_osd_map(epoch_t epoch)
 	 p != inc.new_weight.end();
 	 ++p)
       if (p->second == CEPH_OSD_OUT) {
-	dout(10) << "check_osd_map  osd." << p->first << " went OUT" << dendl;
+	dout(10) << __func__ << "  osd." << p->first << " went OUT" << dendl;
 	pending_inc.stat_osd_out(p->first);
       }
 
@@ -913,8 +907,10 @@ void PGMonitor::check_osd_map(epoch_t epoch)
     for (map<int32_t,uint8_t>::iterator p = inc.new_state.begin();
 	 p != inc.new_state.end();
 	 ++p) {
-      if (p->second & CEPH_OSD_UP) {   // true if marked up OR down, but we're too lazy to check which
-	need_check_down_pgs = true;
+      if (p->second & CEPH_OSD_UP) {   // true if marked up OR down,
+				       // but we're too lazy to check
+				       // which
+	need_check_down_pg_osds.insert(p->first);
 
 	// clear out the last_osd_report for this OSD
         map<int, utime_t>::iterator report = last_osd_report.find(p->first);
@@ -931,7 +927,8 @@ void PGMonitor::check_osd_map(epoch_t epoch)
       if (p->second & CEPH_OSD_EXISTS) {
 	// whether it was created *or* destroyed, we can safely drop
 	// it's osd_stat_t record.
-	dout(10) << "check_osd_map  osd." << p->first << " created or destroyed" << dendl;
+	dout(10) << __func__ << "  osd." << p->first
+		 << " created or destroyed" << dendl;
 	pending_inc.rm_stat(p->first);
 
 	// and adjust full, nearfull set
@@ -947,23 +944,22 @@ void PGMonitor::check_osd_map(epoch_t epoch)
     propose = true;
   }
 
-  // scan pg space?
+  if (map_pg_creates())
+    propose = true;
   if (register_new_pgs())
     propose = true;
 
-  if (need_check_down_pgs && check_down_pgs())
+  if ((need_check_down_pgs || !need_check_down_pg_osds.empty()) &&
+      check_down_pgs())
     propose = true;
   
   if (propose)
     propose_pending();
-
-  if (mon->osdmon()->osdmap.get_epoch()) {
-    map_pg_creates();
-    send_pg_creates();
-  }
 }
 
-void PGMonitor::register_pg(pg_pool_t& pool, pg_t pgid, epoch_t epoch, bool new_pool)
+void PGMonitor::register_pg(OSDMap *osdmap,
+			    pg_pool_t& pool, pg_t pgid, epoch_t epoch,
+			    bool new_pool)
 {
   pg_t parent;
   int split_bits = 0;
@@ -973,11 +969,11 @@ void PGMonitor::register_pg(pg_pool_t& pool, pg_t pgid, epoch_t epoch, bool new_
     while (1) {
       // remove most significant bit
       int msb = pool.calc_bits_of(parent.ps());
-      if (!msb) break;
+      if (!msb)
+	break;
       parent.set_ps(parent.ps() & ~(1<<(msb-1)));
       split_bits++;
-      dout(10) << " is " << pgid << " parent " << parent << " ?" << dendl;
-      //if (parent.u.pg.ps < mon->osdmon->osdmap.get_pgp_num()) {
+      dout(30) << " is " << pgid << " parent " << parent << " ?" << dendl;
       if (pg_map.pg_stat.count(parent) &&
 	  pg_map.pg_stat[parent].state != PG_STATE_CREATING) {
 	dout(10) << "  parent is " << parent << dendl;
@@ -992,6 +988,7 @@ void PGMonitor::register_pg(pg_pool_t& pool, pg_t pgid, epoch_t epoch, bool new_
   stats.created = epoch;
   stats.parent = parent;
   stats.parent_split_bits = split_bits;
+  stats.mapping_epoch = epoch;
 
   if (parent_found) {
     stats.last_scrub_stamp = pg_map.pg_stat[parent].last_scrub_stamp;
@@ -1004,11 +1001,22 @@ void PGMonitor::register_pg(pg_pool_t& pool, pg_t pgid, epoch_t epoch, bool new_
     stats.last_clean_scrub_stamp = now;
   }
 
+  osdmap->pg_to_up_acting_osds(
+    pgid,
+    &stats.up,
+    &stats.up_primary,
+    &stats.acting,
+    &stats.acting_primary);
 
   if (split_bits == 0) {
-    dout(10) << "register_new_pgs  will create " << pgid << dendl;
+    dout(10) << __func__ << "  will create " << pgid
+	     << " primary " << stats.acting_primary
+	     << " acting " << stats.acting
+	     << dendl;
   } else {
-    dout(10) << "register_new_pgs  will create " << pgid
+    dout(10) << __func__ << "  will create " << pgid
+	     << " primary " << stats.acting_primary
+	     << " acting " << stats.acting
 	     << " parent " << parent
 	     << " by " << split_bits << " bits"
 	     << dendl;
@@ -1018,11 +1026,10 @@ void PGMonitor::register_pg(pg_pool_t& pool, pg_t pgid, epoch_t epoch, bool new_
 bool PGMonitor::register_new_pgs()
 {
   // iterate over crush mapspace
-  epoch_t epoch = mon->osdmon()->osdmap.get_epoch();
-  dout(10) << "register_new_pgs checking pg pools for osdmap epoch " << epoch
-	   << ", last_pg_scan " << pg_map.last_pg_scan << dendl;
-
   OSDMap *osdmap = &mon->osdmon()->osdmap;
+  epoch_t epoch = osdmap->get_epoch();
+  dout(10) << __func__ << " checking pg pools for osdmap epoch " << epoch
+	   << ", last_pg_scan " << pg_map.last_pg_scan << dendl;
 
   int created = 0;
   for (map<int64_t,pg_pool_t>::iterator p = osdmap->pools.begin();
@@ -1030,7 +1037,8 @@ bool PGMonitor::register_new_pgs()
        ++p) {
     int64_t poolid = p->first;
     pg_pool_t &pool = p->second;
-    int ruleno = osdmap->crush->find_rule(pool.get_crush_ruleset(), pool.get_type(), pool.get_size());
+    int ruleno = osdmap->crush->find_rule(pool.get_crush_ruleset(),
+					  pool.get_type(), pool.get_size());
     if (ruleno < 0 || !osdmap->crush->rule_exists(ruleno))
       continue;
 
@@ -1040,9 +1048,11 @@ bool PGMonitor::register_new_pgs()
       continue;
     }
 
-    dout(10) << "register_new_pgs scanning pool " << p->first << " " << pool << dendl;
+    dout(10) << __func__ << " scanning pool " << p->first
+	     << " " << pool << dendl;
 
-    bool new_pool = pg_map.pg_pool_sum.count(poolid) == 0;  // first pgs in this pool
+    // first pgs in this pool
+    bool new_pool = pg_map.pg_pool_sum.count(poolid) == 0;
 
     for (ps_t ps = 0; ps < pool.get_pg_num(); ps++) {
       pg_t pgid(ps, poolid, -1);
@@ -1051,7 +1061,7 @@ bool PGMonitor::register_new_pgs()
 	continue;
       }
       created++;
-      register_pg(pool, pgid, pool.get_last_change(), new_pool);
+      register_pg(osdmap, pool, pgid, pool.get_last_change(), new_pool);
     }
   }
 
@@ -1060,19 +1070,22 @@ bool PGMonitor::register_new_pgs()
        p != pg_map.creating_pgs.end();
        ++p) {
     if (p->preferred() >= 0) {
-      dout(20) << " removing creating_pg " << *p << " because it is localized and obsolete" << dendl;
+      dout(20) << " removing creating_pg " << *p
+	       << " because it is localized and obsolete" << dendl;
       pending_inc.pg_remove.insert(*p);
       removed++;
     }
     if (!osdmap->have_pg_pool(p->pool())) {
-      dout(20) << " removing creating_pg " << *p << " because containing pool deleted" << dendl;
+      dout(20) << " removing creating_pg " << *p
+	       << " because containing pool deleted" << dendl;
       pending_inc.pg_remove.insert(*p);
       ++removed;
     }
   }
 
   // deleted pools?
-  for (ceph::unordered_map<pg_t,pg_stat_t>::const_iterator p = pg_map.pg_stat.begin();
+  for (ceph::unordered_map<pg_t,pg_stat_t>::const_iterator p =
+	 pg_map.pg_stat.begin();
        p != pg_map.pg_stat.end(); ++p) {
     if (!osdmap->have_pg_pool(p->first.pool())) {
       dout(20) << " removing pg_stat " << p->first << " because "
@@ -1095,30 +1108,24 @@ bool PGMonitor::register_new_pgs()
   return (created || removed);
 }
 
-void PGMonitor::map_pg_creates()
+bool PGMonitor::map_pg_creates()
 {
   OSDMap *osdmap = &mon->osdmon()->osdmap;
-  if (osdmap->get_epoch() == last_map_pg_create_osd_epoch) {
-    dout(10) << "map_pg_creates to " << pg_map.creating_pgs.size() << " pgs -- no change" << dendl;
-    return;
-  }
 
-  dout(10) << "map_pg_creates to " << pg_map.creating_pgs.size() << " pgs osdmap epoch " << osdmap->get_epoch() << dendl;
-  last_map_pg_create_osd_epoch = osdmap->get_epoch();
+  dout(10) << __func__ << " to " << pg_map.creating_pgs.size()
+	   << " pgs, osdmap epoch " << osdmap->get_epoch()
+	   << dendl;
 
-  for (set<pg_t>::iterator p = pg_map.creating_pgs.begin();
+  unsigned changed = 0;
+  for (set<pg_t>::const_iterator p = pg_map.creating_pgs.begin();
        p != pg_map.creating_pgs.end();
        ++p) {
     pg_t pgid = *p;
     pg_t on = pgid;
-    pg_stat_t *s = NULL;
-    ceph::unordered_map<pg_t,pg_stat_t>::iterator q = pg_map.pg_stat.find(pgid);
-    if (q == pg_map.pg_stat.end()) {
-      s = &pg_map.pg_stat[pgid];
-    } else {
-      s = &q->second;
-      pg_map.stat_pg_sub(pgid, *s, true);
-    }
+    ceph::unordered_map<pg_t,pg_stat_t>::const_iterator q =
+      pg_map.pg_stat.find(pgid);
+    assert(q != pg_map.pg_stat.end());
+    const pg_stat_t *s = &q->second;
 
     if (s->parent_split_bits)
       on = s->parent;
@@ -1132,47 +1139,61 @@ void PGMonitor::map_pg_creates()
       &acting,
       &acting_primary);
 
-    if (s->acting_primary != -1) {
-      pg_map.creating_pgs_by_osd[s->acting_primary].erase(pgid);
-      if (pg_map.creating_pgs_by_osd[s->acting_primary].size() == 0)
-        pg_map.creating_pgs_by_osd.erase(s->acting_primary);
-    }
-    s->up = up;
-    s->up_primary = up_primary;
-    s->acting = acting;
-    s->acting_primary = acting_primary;
-    pg_map.stat_pg_add(pgid, *s, true);
+    if (up != s->up ||
+	up_primary != s->up_primary ||
+	acting !=  s->acting ||
+	acting_primary != s->acting_primary) {
+      dout(20) << __func__ << "  " << pgid << " "
+	       << " acting_primary: " << s->acting_primary
+	       << " -> " << acting_primary
+	       << " acting: " << s->acting << " -> " << acting
+	       << " up_primary: " << s->up_primary << " -> " << up_primary
+	       << " up: " << s->up << " -> " << up
+	       << dendl;
 
-    // don't send creates for localized pgs
-    if (pgid.preferred() >= 0)
-      continue;
+      pg_stat_t *ns = &pending_inc.pg_stat_updates[pgid];
+      *ns = *s;
 
-    // don't send creates for splits
-    if (s->parent_split_bits)
-      continue;
+      // note epoch if the target of the create message changed
+      if (acting_primary != ns->acting_primary)
+	ns->mapping_epoch = osdmap->get_epoch();
 
-    if (acting_primary != -1) {
-      pg_map.creating_pgs_by_osd[acting_primary].insert(pgid);
-    } else {
-      dout(20) << "map_pg_creates  " << pgid << " -> no osds in epoch "
-	       << mon->osdmon()->osdmap.get_epoch() << ", skipping" << dendl;
-      continue;  // blarney!
+      ns->up = up;
+      ns->up_primary = up_primary;
+      ns->acting = acting;
+      ns->acting_primary = acting_primary;
+
+      ++changed;
     }
   }
-  for (map<int, set<pg_t> >::iterator p = pg_map.creating_pgs_by_osd.begin();
-       p != pg_map.creating_pgs_by_osd.end();
-       ++p) {
-    dout(10) << "map_pg_creates osd." << p->first << " has " << p->second.size() << " pgs" << dendl;
+  if (changed) {
+    dout(10) << __func__ << " " << changed << " pgs changed primary" << dendl;
+    return true;
   }
+  return false;
 }
 
 void PGMonitor::send_pg_creates()
 {
-  dout(10) << "send_pg_creates to " << pg_map.creating_pgs.size() << " pgs" << dendl;
+  // We only need to do this old, spammy way of broadcasting create messages
+  // to every osd (even those that aren't connected) if there are old OSDs in
+  // the cluster. As soon as everybody has upgraded we can flip to the new
+  // behavior instead.
+  OSDMap& osdmap = mon->osdmon()->osdmap;
+  if (osdmap.get_num_up_osds() == 0)
+    return;
+  if (osdmap.get_up_osd_features() & CEPH_FEATURE_MON_STATEFUL_SUB) {
+    check_subs();
+    return;
+  }
+
+  dout(10) << "send_pg_creates to " << pg_map.creating_pgs.size()
+	   << " pgs" << dendl;
 
   utime_t now = ceph_clock_now(g_ceph_context);
-  for (map<int, set<pg_t> >::iterator p = pg_map.creating_pgs_by_osd.begin();
-       p != pg_map.creating_pgs_by_osd.end();
+  for (map<int, map<epoch_t, set<pg_t>> >::iterator p =
+	 pg_map.creating_pgs_by_osd_epoch.begin();
+       p != pg_map.creating_pgs_by_osd_epoch.end();
        ++p) {
     int osd = p->first;
 
@@ -1181,27 +1202,44 @@ void PGMonitor::send_pg_creates()
 	now - g_conf->mon_pg_create_interval < last_sent_pg_create[osd]) 
       continue;
       
-    if (mon->osdmon()->osdmap.is_up(osd))
-      send_pg_creates(osd, NULL);
+    if (osdmap.is_up(osd))
+      send_pg_creates(osd, NULL, 0);
   }
 }
 
-void PGMonitor::send_pg_creates(int osd, Connection *con)
+epoch_t PGMonitor::send_pg_creates(int osd, Connection *con, epoch_t next)
 {
-  map<int, set<pg_t> >::iterator p = pg_map.creating_pgs_by_osd.find(osd);
-  if (p == pg_map.creating_pgs_by_osd.end())
-    return;
+  dout(30) << __func__ << " " << pg_map.creating_pgs_by_osd_epoch << dendl;
+  map<int, map<epoch_t, set<pg_t> > >::iterator p =
+    pg_map.creating_pgs_by_osd_epoch.find(osd);
+  if (p == pg_map.creating_pgs_by_osd_epoch.end())
+    return next;
   assert(p->second.size() > 0);
 
-  dout(20) << "send_pg_creates osd." << osd << " pgs " << p->second << dendl;
-  MOSDPGCreate *m = new MOSDPGCreate(mon->osdmon()->osdmap.get_epoch());
-  for (set<pg_t>::iterator q = p->second.begin(); q != p->second.end(); ++q) {
-    m->mkpg[*q] = pg_create_t(pg_map.pg_stat[*q].created,
-			      pg_map.pg_stat[*q].parent,
-			      pg_map.pg_stat[*q].parent_split_bits);
-    // Need the create time from the monitor using its clock to set last_scrub_stamp
-    // upon pg creation.
-    m->ctimes[*q] = pg_map.pg_stat[*q].last_scrub_stamp;
+  MOSDPGCreate *m = NULL;
+  epoch_t last = 0;
+  for (map<epoch_t, set<pg_t> >::iterator q = p->second.lower_bound(next);
+       q != p->second.end();
+       ++q) {
+    dout(20) << __func__ << " osd." << osd << " from " << next
+	     << " : epoch " << q->first << " " << q->second.size() << " pgs"
+	     << dendl;
+    last = q->first;
+    for (set<pg_t>::iterator r = q->second.begin(); r != q->second.end(); ++r) {
+      if (!m)
+	m = new MOSDPGCreate(pg_map.last_osdmap_epoch);
+      m->mkpg[*r] = pg_create_t(pg_map.pg_stat[*r].created,
+				pg_map.pg_stat[*r].parent,
+				pg_map.pg_stat[*r].parent_split_bits);
+      // Need the create time from the monitor using its clock to set
+      // last_scrub_stamp upon pg creation.
+      m->ctimes[*r] = pg_map.pg_stat[*r].last_scrub_stamp;
+    }
+  }
+  if (!m) {
+    dout(20) << "send_pg_creates osd." << osd << " from " << next
+	     << " has nothing to send" << dendl;
+    return next;
   }
 
   if (con) {
@@ -1211,6 +1249,24 @@ void PGMonitor::send_pg_creates(int osd, Connection *con)
     mon->messenger->send_message(m, mon->osdmon()->osdmap.get_inst(osd));
   }
   last_sent_pg_create[osd] = ceph_clock_now(g_ceph_context);
+
+  // sub is current through last + 1
+  return last + 1;
+}
+
+void PGMonitor::_mark_pg_stale(pg_t pgid, const pg_stat_t& cur_stat)
+{
+  dout(10) << " marking pg " << pgid << " stale" << dendl;
+  map<pg_t,pg_stat_t>::iterator q = pending_inc.pg_stat_updates.find(pgid);
+  pg_stat_t *stat;
+  if (q == pending_inc.pg_stat_updates.end()) {
+    stat = &pending_inc.pg_stat_updates[pgid];
+    *stat = cur_stat;
+  } else {
+    stat = &q->second;
+  }
+  stat->state |= PG_STATE_STALE;
+  stat->last_unstale = ceph_clock_now(g_ceph_context);
 }
 
 bool PGMonitor::check_down_pgs()
@@ -1220,28 +1276,37 @@ bool PGMonitor::check_down_pgs()
   OSDMap *osdmap = &mon->osdmon()->osdmap;
   bool ret = false;
 
-  for (ceph::unordered_map<pg_t,pg_stat_t>::iterator p = pg_map.pg_stat.begin();
-       p != pg_map.pg_stat.end();
-       ++p) {
-    if ((p->second.state & PG_STATE_STALE) == 0 &&
-	p->second.acting_primary != -1 &&
-	osdmap->is_down(p->second.acting_primary)) {
-      dout(10) << " marking pg " << p->first << " stale with acting " << p->second.acting << dendl;
-
-      map<pg_t,pg_stat_t>::iterator q = pending_inc.pg_stat_updates.find(p->first);
-      pg_stat_t *stat;
-      if (q == pending_inc.pg_stat_updates.end()) {
-	stat = &pending_inc.pg_stat_updates[p->first];
-	*stat = p->second;
-      } else {
-	stat = &q->second;
+  // if a large number of osds changed state, just iterate over the whole
+  // pg map.
+  if (need_check_down_pg_osds.size() > (unsigned)osdmap->get_num_osds() *
+      g_conf->mon_pg_check_down_all_threshold)
+    need_check_down_pgs = true;
+
+  if (need_check_down_pgs) {
+    for (auto p : pg_map.pg_stat) {
+      if ((p.second.state & PG_STATE_STALE) == 0 &&
+	  p.second.acting_primary != -1 &&
+	  osdmap->is_down(p.second.acting_primary)) {
+	_mark_pg_stale(p.first, p.second);
+	ret = true;
+      }
+    }
+  } else {
+    for (auto osd : need_check_down_pg_osds) {
+      if (osdmap->is_down(osd)) {
+	for (auto pgid : pg_map.pg_by_osd[osd]) {
+	  const pg_stat_t &stat = pg_map.pg_stat[pgid];
+	  if ((stat.state & PG_STATE_STALE) == 0 &&
+	      stat.acting_primary != -1) {
+	    _mark_pg_stale(pgid, stat);
+	    ret = true;
+	  }
+	}
       }
-      stat->state |= PG_STATE_STALE;
-      stat->last_unstale = ceph_clock_now(g_ceph_context);
-      ret = true;
     }
   }
   need_check_down_pgs = false;
+  need_check_down_pg_osds.clear();
 
   return ret;
 }
@@ -1301,9 +1366,12 @@ int64_t PGMonitor::get_rule_avail(OSDMap& osdmap, int ruleno) const
   for (map<int,float>::iterator p = wm.begin(); p != wm.end(); ++p) {
     ceph::unordered_map<int32_t,osd_stat_t>::const_iterator osd_info = pg_map.osd_stat.find(p->first);
     if (osd_info != pg_map.osd_stat.end()) {
-      if (osd_info->second.kb == 0) {
+      if (osd_info->second.kb == 0 || p->second == 0) {
         // osd must be out, hence its stats have been zeroed
         // (unless we somehow managed to have a disk with size 0...)
+        //
+        // (p->second == 0): if the osd weight is 0, there is no need to
+        // calculate proj below.
         continue;
       }
       int64_t proj = (float)((osd_info->second).kb_avail * 1024ull) /
@@ -1541,14 +1609,6 @@ bool PGMonitor::preprocess_command(MonOpRequestRef op)
     pg_map.encode(rdata);
     ss << "got pgmap version " << pg_map.version;
     r = 0;
-  } else if (prefix == "pg map_pg_creates") {
-    map_pg_creates();
-    ss << "mapped pg creates ";
-    r = 0;
-  } else if (prefix == "pg send_pg_creates") {
-    send_pg_creates();
-    ss << "sent pg creates ";
-    r = 0;
   } else if (prefix == "pg dump") {
     string val;
     vector<string> dumpcontents;
@@ -2284,10 +2344,31 @@ int PGMonitor::dump_stuck_pg_stats(stringstream &ds,
   return 0;
 }
 
+void PGMonitor::check_subs()
+{
+  dout(10) << __func__ << dendl;
+  string type = "osd_pg_creates";
+  if (mon->session_map.subs.count(type) == 0)
+    return;
+  xlist<Subscription*>::iterator p = mon->session_map.subs[type]->begin();
+  while (!p.end()) {
+    Subscription *sub = *p;
+    ++p;
+    dout(20) << __func__ << " .. " << sub->session->inst << dendl;
+    check_sub(sub);
+  }
+}
+
 void PGMonitor::check_sub(Subscription *sub)
 {
   if (sub->type == "osd_pg_creates") {
-    send_pg_creates(sub->session->inst.name.num(),
-		    sub->session->con.get());
+    // only send these if the OSD is up.  we will check_subs() when they do
+    // come up so they will get the creates then.
+    if (sub->session->inst.name.is_osd() &&
+	mon->osdmon()->osdmap.is_up(sub->session->inst.name.num())) {
+      sub->next = send_pg_creates(sub->session->inst.name.num(),
+				  sub->session->con.get(),
+				  sub->next);
+    }
   }
 }
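
The reworked send_pg_creates(osd, con, next) above turns the osd_pg_creates
subscription into an epoch cursor: it sends every batch at or above the
subscriber's next epoch, remembers the last epoch it covered, and returns
last + 1 so the following call resumes from there. A simplified sketch of
that cursor walk, with the MOSDPGCreate construction elided:

    #include <map>
    #include <set>

    using epoch = unsigned;   // stand-in for epoch_t

    // by_epoch: the creating pgs for one osd, keyed by mapping epoch.
    epoch send_from(const std::map<epoch, std::set<int>>& by_epoch, epoch next) {
      epoch last = 0;
      bool sent = false;
      for (auto q = by_epoch.lower_bound(next); q != by_epoch.end(); ++q) {
        // ... accumulate q->second into one create message and send it ...
        last = q->first;
        sent = true;
      }
      // Sub is current through last + 1; unchanged if there was nothing to send.
      return sent ? last + 1 : next;
    }
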
diff --git a/src/mon/PGMonitor.h b/src/mon/PGMonitor.h
index cb725a6..29b2e03 100644
--- a/src/mon/PGMonitor.h
+++ b/src/mon/PGMonitor.h
@@ -47,9 +47,7 @@ public:
   PGMap pg_map;
 
   bool need_check_down_pgs;
-
-  epoch_t last_map_pg_create_osd_epoch;
-
+  set<int> need_check_down_pg_osds;
 
 private:
   PGMap::Incremental pending_inc;
@@ -114,7 +112,8 @@ private:
   // when we last received PG stats from each osd
   map<int,utime_t> last_osd_report;
 
-  void register_pg(pg_pool_t& pool, pg_t pgid, epoch_t epoch, bool new_pool);
+  void register_pg(OSDMap *osdmap, pg_pool_t& pool, pg_t pgid,
+		   epoch_t epoch, bool new_pool);
 
   /**
    * check latest osdmap for new pgs to register
@@ -123,18 +122,27 @@ private:
    */
   bool register_new_pgs();
 
-  void map_pg_creates();
+  /**
+   * recalculate creating pg mappings
+   *
+   * @return true if we updated pending_inc
+   */
+  bool map_pg_creates();
+
   void send_pg_creates();
-  void send_pg_creates(int osd, Connection *con);
+  epoch_t send_pg_creates(int osd, Connection *con, epoch_t next);
 
   /**
    * check pgs for down primary osds
    *
    * clears need_check_down_pgs
+   * clears need_check_down_pg_osds
    *
    * @return true if we updated pending_inc (and should propose)
    */
   bool check_down_pgs();
+  void _mark_pg_stale(pg_t pgid, const pg_stat_t& cur_stat);
+
 
   /**
    * Dump stats from pgs stuck in specified states.
@@ -156,7 +164,6 @@ public:
   PGMonitor(Monitor *mn, Paxos *p, const string& service_name)
     : PaxosService(mn, p, service_name),
       need_check_down_pgs(false),
-      last_map_pg_create_osd_epoch(0),
       pgmap_meta_prefix("pgmap_meta"),
       pgmap_pg_prefix("pgmap_pg"),
       pgmap_osd_prefix("pgmap_osd")
@@ -204,6 +211,7 @@ public:
 			     list<pair<health_status_t,string> > *detail,
 			     const set<int>& s, const char *desc, health_status_t sev) const;
 
+  void check_subs();
   void check_sub(Subscription *sub);
 
 private:
diff --git a/src/mon/Session.h b/src/mon/Session.h
index ff80730..d91de26 100644
--- a/src/mon/Session.h
+++ b/src/mon/Session.h
@@ -40,7 +40,7 @@ struct Subscription {
 struct MonSession : public RefCountedObject {
   ConnectionRef con;
   entity_inst_t inst;
-  utime_t until;
+  utime_t session_timeout;
   utime_t time_established;
   bool closed;
   xlist<MonSession*>::item item;
diff --git a/src/msg/Connection.h b/src/msg/Connection.h
index 1539b39..2362895 100644
--- a/src/msg/Connection.h
+++ b/src/msg/Connection.h
@@ -39,12 +39,12 @@ class Message;
 class Messenger;
 
 struct Connection : public RefCountedObject {
-  Mutex lock;
+  mutable Mutex lock;
   Messenger *msgr;
   RefCountedObject *priv;
   int peer_type;
   entity_addr_t peer_addr;
-  utime_t last_keepalive_ack;
+  utime_t last_keepalive, last_keepalive_ack;
 private:
   uint64_t features;
 public:
@@ -178,9 +178,23 @@ public:
     rx_buffers.erase(tid);
   }
 
+  utime_t get_last_keepalive() const {
+    Mutex::Locker l(lock);
+    return last_keepalive;
+  }
+  void set_last_keepalive(utime_t t) {
+    Mutex::Locker l(lock);
+    last_keepalive = t;
+  }
   utime_t get_last_keepalive_ack() const {
+    Mutex::Locker l(lock);
     return last_keepalive_ack;
   }
+  void set_last_keepalive_ack(utime_t t) {
+    Mutex::Locker l(lock);
+    last_keepalive_ack = t;
+  }
+
 };
 
 typedef boost::intrusive_ptr<Connection> ConnectionRef;
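
Declaring the lock mutable is what lets the new const getters take it while
the setters still write through it. A compact sketch of the pattern, with
std::mutex and a plain integer standing in for Ceph's Mutex and utime_t:

    #include <mutex>

    struct conn_sketch {
      mutable std::mutex lock;    // mutable: const members may still lock it
      long last_keepalive = 0;    // stand-in for utime_t
      long last_keepalive_ack = 0;

      long get_last_keepalive() const {
        std::lock_guard<std::mutex> l(lock);
        return last_keepalive;
      }
      void set_last_keepalive(long t) {
        std::lock_guard<std::mutex> l(lock);
        last_keepalive = t;
      }
    };
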
diff --git a/src/msg/async/AsyncConnection.cc b/src/msg/async/AsyncConnection.cc
index 42c55ad..83c36de 100644
--- a/src/msg/async/AsyncConnection.cc
+++ b/src/msg/async/AsyncConnection.cc
@@ -229,12 +229,82 @@ int AsyncConnection::read_bulk(int fd, char *buf, int len)
   return nread;
 }
 
+/* 
+ SIGPIPE suppression - for platforms without SO_NOSIGPIPE or MSG_NOSIGNAL
+  http://krokisplace.blogspot.in/2010/02/suppressing-sigpipe-in-library.html 
+  http://www.microhowto.info/howto/ignore_sigpipe_without_affecting_other_threads_in_a_process.html 
+*/
+void AsyncConnection::suppress_sigpipe()
+{
+#if !defined(MSG_NOSIGNAL) && !defined(SO_NOSIGPIPE)
+  /*
+    We want to ignore possible SIGPIPE that we can generate on write.
+    SIGPIPE is delivered *synchronously* and *only* to the thread
+    doing the write.  So if it is reported as already pending (which
+    means the thread blocks it), then we do nothing: if we generate
+    SIGPIPE, it will be merged with the pending one (there's no
+    queuing), and that suits us well.  If it is not pending, we block
+    it in this thread (and we avoid changing signal action, because it
+    is per-process).
+  */
+  sigset_t pending;
+  sigemptyset(&pending);
+  sigpending(&pending);
+  sigpipe_pending = sigismember(&pending, SIGPIPE);
+  if (!sigpipe_pending) {
+    sigset_t blocked;
+    sigemptyset(&blocked);
+    pthread_sigmask(SIG_BLOCK, &sigpipe_mask, &blocked);
+
+    /* Maybe it was blocked already?  */
+    sigpipe_unblock = ! sigismember(&blocked, SIGPIPE);
+  }
+#endif /* !defined(MSG_NOSIGNAL) && !defined(SO_NOSIGPIPE) */
+}
+
+
+void AsyncConnection::restore_sigpipe()
+{
+#if !defined(MSG_NOSIGNAL) && !defined(SO_NOSIGPIPE)
+  /*
+    If SIGPIPE was pending already we do nothing.  Otherwise, if it
+    became pending (i.e., we generated it), then we sigwait() it (thus
+    clearing the pending status).  Then we unblock SIGPIPE, but only if it
+    was us who blocked it.
+  */
+  if (!sigpipe_pending) {
+    sigset_t pending;
+    sigemptyset(&pending);
+    sigpending(&pending);
+    if (sigismember(&pending, SIGPIPE)) {
+      /*
+        Protect ourselves from a situation when SIGPIPE was sent
+        by the user to the whole process, and was delivered to
+        other thread before we had a chance to wait for it.
+      */
+      static const struct timespec nowait = { 0, 0 };
+      TEMP_FAILURE_RETRY(sigtimedwait(&sigpipe_mask, NULL, &nowait));
+    }
+
+    if (sigpipe_unblock)
+      pthread_sigmask(SIG_UNBLOCK, &sigpipe_mask, NULL);
+  }
+#endif  /* !defined(MSG_NOSIGNAL) && !defined(SO_NOSIGPIPE) */
+}
+
 // return the length of msg needed to be sent,
 // < 0 means error occurred
 int AsyncConnection::do_sendmsg(struct msghdr &msg, int len, bool more)
 {
+  suppress_sigpipe();
+
   while (len > 0) {
-    int r = ::sendmsg(sd, &msg, MSG_NOSIGNAL | (more ? MSG_MORE : 0));
+    int r;
+#if defined(MSG_NOSIGNAL)
+    r = ::sendmsg(sd, &msg, MSG_NOSIGNAL);
+#else
+    r = ::sendmsg(sd, &msg, 0);
+#endif /* defined(MSG_NOSIGNAL) */
 
     if (r == 0) {
       ldout(async_msgr->cct, 10) << __func__ << " sendmsg got r==0!" << dendl;
@@ -266,6 +336,7 @@ int AsyncConnection::do_sendmsg(struct msghdr &msg, int len, bool more)
         break;
       }
     }
+    restore_sigpipe();
   }
   return len;
 }
@@ -437,6 +508,7 @@ void AsyncConnection::process()
 {
   int r = 0;
   int prev_state = state;
+  bool already_dispatch_writer = false;
   Mutex::Locker l(lock);
   do {
     ldout(async_msgr->cct, 20) << __func__ << " state is " << get_state_name(state)
@@ -457,6 +529,7 @@ void AsyncConnection::process()
 
           if (tag == CEPH_MSGR_TAG_KEEPALIVE) {
             ldout(async_msgr->cct, 20) << __func__ << " got KEEPALIVE" << dendl;
+	    set_last_keepalive(ceph_clock_now(NULL));
           } else if (tag == CEPH_MSGR_TAG_KEEPALIVE2) {
             state = STATE_OPEN_KEEPALIVE2;
           } else if (tag == CEPH_MSGR_TAG_KEEPALIVE2_ACK) {
@@ -491,8 +564,9 @@ void AsyncConnection::process()
           utime_t kp_t = utime_t(*t);
           write_lock.Lock();
           _send_keepalive_or_ack(true, &kp_t);
-          write_lock.Unlock();
+	  write_lock.Unlock();
           ldout(async_msgr->cct, 20) << __func__ << " got KEEPALIVE2 " << kp_t << dendl;
+	  set_last_keepalive(ceph_clock_now(NULL));
           state = STATE_OPEN;
           break;
         }
@@ -509,7 +583,7 @@ void AsyncConnection::process()
           }
 
           t = (ceph_timespec*)state_buffer;
-          last_keepalive_ack = utime_t(*t);
+          set_last_keepalive_ack(utime_t(*t));
           ldout(async_msgr->cct, 20) << __func__ << " got KEEPALIVE_ACK" << dendl;
           state = STATE_OPEN;
           break;
@@ -842,10 +916,12 @@ void AsyncConnection::process()
 	  ldout(async_msgr->cct, 1) << " == rx == " << message->get_source() << " seq "
                                     << message->get_seq() << " " << message << " " << *message << dendl;
 
-          // if send_message always successfully send, it may have no
-          // opportunity to send seq ack. 10 is a experience value.
-          if (ack_left.inc() > 10) {
+          ack_left.inc();
+          // if send_message always send inline, it may have no
+          // opportunity to send seq ack.
+          if (!already_dispatch_writer) {
             center->dispatch_event_external(write_handler);
+            already_dispatch_writer = true;
           }
 
           state = STATE_OPEN;
@@ -969,16 +1045,28 @@ int AsyncConnection::_process_connection()
           ::close(sd);
         }
 
-        sd = net.connect(get_peer_addr());
+        sd = net.nonblock_connect(get_peer_addr());
         if (sd < 0) {
           goto fail;
         }
-        r = net.set_nonblock(sd);
+
+        center->create_file_event(sd, EVENT_READABLE, read_handler);
+        state = STATE_CONNECTING_RE;
+        break;
+      }
+
+    case STATE_CONNECTING_RE:
+      {
+        r = net.reconnect(get_peer_addr(), sd);
         if (r < 0) {
+          ldout(async_msgr->cct, 1) << __func__ << " reconnect failed " << dendl;
           goto fail;
+        } else if (r > 0) {
+          break;
         }
 
-        center->create_file_event(sd, EVENT_READABLE, read_handler);
+        net.set_socket_options(sd);
+
         state = STATE_CONNECTING_WAIT_BANNER;
         break;
       }
@@ -2131,6 +2219,7 @@ void AsyncConnection::fault()
       if (backoff > async_msgr->cct->_conf->ms_max_backoff)
         backoff.set_from_double(async_msgr->cct->_conf->ms_max_backoff);
     }
+
     state = STATE_CONNECTING;
     ldout(async_msgr->cct, 10) << __func__ << " waiting " << backoff << dendl;
   }
@@ -2423,7 +2512,7 @@ void AsyncConnection::handle_write()
       ldout(async_msgr->cct, 10) << __func__ << " state is " << get_state_name(state)
                                  << " policy.server is false" << dendl;
       _connect();
-    } else if (sd >= 0 && state != STATE_CONNECTING && state != STATE_CLOSED) {
+    } else if (sd >= 0 && state != STATE_CONNECTING && state != STATE_CONNECTING_RE && state != STATE_CLOSED) {
       r = _try_send(bl);
       if (r < 0) {
         ldout(async_msgr->cct, 1) << __func__ << " send outcoming bl failed" << dendl;
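
The suppress/restore pair added above brackets every send so that, on
platforms with neither MSG_NOSIGNAL nor SO_NOSIGPIPE, a peer hangup cannot
kill the process with SIGPIPE. A simplified sketch of that bracket around a
single write; the patch's version additionally checks whether SIGPIPE was
already pending or blocked before touching the mask:

    #include <cerrno>
    #include <csignal>
    #include <ctime>
    #include <pthread.h>
    #include <unistd.h>

    ssize_t write_nosigpipe(int fd, const void* buf, size_t len) {
      sigset_t pipe_only, old;
      sigemptyset(&pipe_only);
      sigaddset(&pipe_only, SIGPIPE);
      pthread_sigmask(SIG_BLOCK, &pipe_only, &old);   // suppress: block SIGPIPE
      ssize_t r = ::write(fd, buf, len);              // may make SIGPIPE pending
      if (r < 0 && errno == EPIPE) {
        static const struct timespec nowait = { 0, 0 };
        sigtimedwait(&pipe_only, nullptr, &nowait);   // eat the pending signal
      }
      pthread_sigmask(SIG_SETMASK, &old, nullptr);    // restore the old mask
      return r;
    }
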
diff --git a/src/msg/async/AsyncConnection.h b/src/msg/async/AsyncConnection.h
index 64c2921..c578a7a 100644
--- a/src/msg/async/AsyncConnection.h
+++ b/src/msg/async/AsyncConnection.h
@@ -18,6 +18,7 @@
 #define CEPH_MSG_ASYNCCONNECTION_H
 
 #include <pthread.h>
+#include <signal.h>
 #include <climits>
 #include <list>
 #include <map>
@@ -45,6 +46,8 @@ class AsyncMessenger;
 class AsyncConnection : public Connection {
 
   int read_bulk(int fd, char *buf, int len);
+  void suppress_sigpipe();
+  void restore_sigpipe();
   int do_sendmsg(struct msghdr &msg, int len, bool more);
   int try_send(bufferlist &bl, bool send=true) {
     Mutex::Locker l(write_lock);
@@ -160,6 +163,7 @@ class AsyncConnection : public Connection {
     STATE_OPEN_TAG_CLOSE,
     STATE_WAIT_SEND,
     STATE_CONNECTING,
+    STATE_CONNECTING_RE,
     STATE_CONNECTING_WAIT_BANNER,
     STATE_CONNECTING_WAIT_IDENTIFY_PEER,
     STATE_CONNECTING_SEND_CONNECT_MSG,
@@ -196,6 +200,7 @@ class AsyncConnection : public Connection {
                                         "STATE_OPEN_TAG_CLOSE",
                                         "STATE_WAIT_SEND",
                                         "STATE_CONNECTING",
+                                        "STATE_CONNECTING_RE",
                                         "STATE_CONNECTING_WAIT_BANNER",
                                         "STATE_CONNECTING_WAIT_IDENTIFY_PEER",
                                         "STATE_CONNECTING_SEND_CONNECT_MSG",
@@ -291,6 +296,12 @@ class AsyncConnection : public Connection {
   EventCenter *center;
   ceph::shared_ptr<AuthSessionHandler> session_security;
 
+#if !defined(MSG_NOSIGNAL) && !defined(SO_NOSIGPIPE)
+  sigset_t sigpipe_mask;
+  bool sigpipe_pending;
+  bool sigpipe_unblock;
+#endif
+
  public:
   // used by eventcallback
   void handle_write();
diff --git a/src/msg/async/Event.cc b/src/msg/async/Event.cc
index 03119de..2027a9f 100644
--- a/src/msg/async/Event.cc
+++ b/src/msg/async/Event.cc
@@ -179,7 +179,7 @@ int EventCenter::create_file_event(int fd, int mask, EventCallbackRef ctxt)
 
 void EventCenter::delete_file_event(int fd, int mask)
 {
-  assert(fd > 0);
+  assert(fd >= 0);
   Mutex::Locker l(file_lock);
   if (fd > nevent) {
     ldout(cct, 1) << __func__ << " delete event fd=" << fd << " exceed nevent=" << nevent
diff --git a/src/msg/async/net_handler.cc b/src/msg/async/net_handler.cc
index 2639fdc..ba63eec 100644
--- a/src/msg/async/net_handler.cc
+++ b/src/msg/async/net_handler.cc
@@ -92,7 +92,7 @@ void NetHandler::set_socket_options(int sd)
   }
 
   // block ESIGPIPE
-#ifdef CEPH_USE_SO_NOSIGPIPE
+#ifdef SO_NOSIGPIPE
   int val = 1;
   int r = ::setsockopt(sd, SOL_SOCKET, SO_NOSIGPIPE, (void*)&val, sizeof(val));
   if (r) {
@@ -132,6 +132,20 @@ int NetHandler::generic_connect(const entity_addr_t& addr, bool nonblock)
   return s;
 }
 
+int NetHandler::reconnect(const entity_addr_t &addr, int sd)
+{
+  int ret = ::connect(sd, (sockaddr*)&addr.addr, addr.addr_size());
+
+  if (ret < 0 && errno != EISCONN) {
+    ldout(cct, 10) << __func__ << " reconnect: " << strerror(errno) << dendl;
+    if (errno == EINPROGRESS || errno == EALREADY)
+      return 1;
+    return -errno;
+  }
+
+  return 0;
+}
+
 int NetHandler::connect(const entity_addr_t &addr)
 {
   return generic_connect(addr, false);
diff --git a/src/msg/async/net_handler.h b/src/msg/async/net_handler.h
index 0179dda..64423dc 100644
--- a/src/msg/async/net_handler.h
+++ b/src/msg/async/net_handler.h
@@ -30,6 +30,15 @@ namespace ceph {
     int set_nonblock(int sd);
     void set_socket_options(int sd);
     int connect(const entity_addr_t &addr);
+    
+    /**
+     * Try to reconnect the socket.
+     *
+     * @return    0         success
+     *            > 0       still in progress; wait for an event
+     *            < 0       failure; go to the fail path
+     */
+    int reconnect(const entity_addr_t &addr, int sd);
     int nonblock_connect(const entity_addr_t &addr);
   };
 }
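
NetHandler::reconnect() completes the nonblocking connect started in
STATE_CONNECTING: calling connect() again on the same socket yields EISCONN
once the TCP handshake has finished, and EINPROGRESS or EALREADY while it
is still pending. A sketch of the same check against the raw BSD socket
API, matching the 0 / >0 / <0 contract documented above:

    #include <cerrno>
    #include <sys/socket.h>

    // 0 = connected, 1 = still in progress (wait for an event), <0 = failure.
    int reconnect_sketch(int sd, const sockaddr* addr, socklen_t len) {
      int ret = ::connect(sd, addr, len);
      if (ret < 0 && errno != EISCONN) {
        if (errno == EINPROGRESS || errno == EALREADY)
          return 1;      // not ready yet; keep the readable event armed
        return -errno;   // hard failure; caller takes the fail path
      }
      return 0;          // connected (EISCONN counts as success)
    }
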
diff --git a/src/msg/msg_types.h b/src/msg/msg_types.h
index bf668e0..8f98b88 100644
--- a/src/msg/msg_types.h
+++ b/src/msg/msg_types.h
@@ -151,8 +151,9 @@ namespace std {
  * ipv4 for now.
  */
 
+#if defined(__linux__) || defined(DARWIN) || defined(__FreeBSD__)
 /*
- * encode sockaddr.ss_family in big endian
+ * encode sockaddr.ss_family in network byte order
  */
 static inline void encode(const sockaddr_storage& a, bufferlist& bl) {
   struct sockaddr_storage ss = a;
@@ -174,6 +175,28 @@ static inline void decode(sockaddr_storage& a, bufferlist::iterator& bl) {
   a.ss_family = ntohs(a.ss_family);
 #endif
 }
+#endif
+
+// define a wire format for sockaddr that matches Linux's.
+struct ceph_sockaddr_storage {
+  __le16 ss_family;
+  __u8 __ss_padding[128 - sizeof(__le16)];
+
+  void encode(bufferlist& bl) const {
+    struct ceph_sockaddr_storage ss = *this;
+    ss.ss_family = htons(ss.ss_family);
+    ::encode_raw(ss, bl);
+  }
+
+  void decode(bufferlist::iterator& bl) {
+    struct ceph_sockaddr_storage ss;
+    ::decode_raw(ss, bl);
+    ss.ss_family = ntohs(ss.ss_family);
+    *this = ss;
+  }
+} __attribute__ ((__packed__));
+
+WRITE_CLASS_ENCODER(ceph_sockaddr_storage)
 
 struct entity_addr_t {
   __u32 type;
@@ -330,15 +353,37 @@ struct entity_addr_t {
 
   bool parse(const char *s, const char **end = 0);
 
+  // Right now, these only deal with sockaddr_storage structs that carry just
+  // a family and the address content. Apparently on BSD there is also an
+  // ss_len field that we need to handle; this requires broader study.
+
+
   void encode(bufferlist& bl) const {
     ::encode(type, bl);
     ::encode(nonce, bl);
+#if defined(__linux__) || defined(DARWIN) || defined(__FreeBSD__)
     ::encode(addr, bl);
+#else
+    ceph_sockaddr_storage wireaddr;
+    ::memset(&wireaddr, '\0', sizeof(wireaddr));
+    unsigned copysize = MIN(sizeof(wireaddr), sizeof(addr));
+    // ceph_sockaddr_storage is in host byte order
+    ::memcpy(&wireaddr, &addr, copysize);
+    ::encode(wireaddr, bl);
+#endif
   }
   void decode(bufferlist::iterator& bl) {
     ::decode(type, bl);
     ::decode(nonce, bl);
+#if defined(__linux__) || defined(DARWIN) || defined(__FreeBSD__)
     ::decode(addr, bl);
+#else
+    ceph_sockaddr_storage wireaddr;
+    ::memset(&wireaddr, '\0', sizeof(wireaddr));
+    ::decode(wireaddr, bl);
+    unsigned copysize = MIN(sizeof(wireaddr), sizeof(addr));
+    ::memcpy(&addr, &wireaddr, copysize);
+#endif
   }
 
   void dump(Formatter *f) const;
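
The fallback path above pins the on-wire layout to Linux's 128-byte
sockaddr_storage, with ss_family first and in network byte order, and
memcpy()s through it on platforms whose native struct differs. A sketch of
the round trip with the bufferlist plumbing reduced to plain memcpy; it
assumes, as on Linux, that sa_family_t is a 16-bit field at offset 0:

    #include <arpa/inet.h>    // htons / ntohs
    #include <sys/socket.h>   // sockaddr_storage
    #include <cstdint>
    #include <cstring>

    struct wire_sockaddr {    // fixed 128-byte wire layout
      uint16_t ss_family;
      uint8_t  pad[128 - sizeof(uint16_t)];
    } __attribute__((__packed__));

    void encode_wire(const sockaddr_storage& a, wire_sockaddr& out) {
      std::memset(&out, 0, sizeof(out));
      size_t n = sizeof(out) < sizeof(a) ? sizeof(out) : sizeof(a);
      std::memcpy(&out, &a, n);               // host-order copy of the addr
      out.ss_family = htons(out.ss_family);   // family goes out big-endian
    }

    void decode_wire(const wire_sockaddr& in, sockaddr_storage& a) {
      wire_sockaddr tmp = in;
      tmp.ss_family = ntohs(tmp.ss_family);   // back to host byte order
      std::memset(&a, 0, sizeof(a));
      size_t n = sizeof(tmp) < sizeof(a) ? sizeof(tmp) : sizeof(a);
      std::memcpy(&a, &tmp, n);
    }
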
diff --git a/src/msg/simple/Pipe.cc b/src/msg/simple/Pipe.cc
index d148378..33884c8 100644
--- a/src/msg/simple/Pipe.cc
+++ b/src/msg/simple/Pipe.cc
@@ -827,7 +827,7 @@ void Pipe::set_socket_options()
   }
 
   // block ESIGPIPE
-#ifdef CEPH_USE_SO_NOSIGPIPE
+#if defined(SO_NOSIGPIPE)
   int val = 1;
   int r = ::setsockopt(sd, SOL_SOCKET, SO_NOSIGPIPE, (void*)&val, sizeof(val));
   if (r) {
@@ -847,6 +847,7 @@ void Pipe::set_socket_options()
                          << ": " << cpp_strerror(errno) << dendl;
     }
 #endif
+#if defined(SO_PRIORITY) 
     // setsockopt(IPTOS_CLASS_CS6) sets the priority of the socket as 0.
     // See http://goo.gl/QWhvsD and http://goo.gl/laTbjT
     // We need to call setsockopt(SO_PRIORITY) after it.
@@ -857,6 +858,7 @@ void Pipe::set_socket_options()
       ldout(msgr->cct,0) << "couldn't set SO_PRIORITY to " << prio
                          << ": " << cpp_strerror(errno) << dendl;
     }
+#endif
   }
 }
 
@@ -935,7 +937,12 @@ int Pipe::connect()
 
   // identify peer
   {
+#if defined(__linux__) || defined(DARWIN) || defined(__FreeBSD__)
     bufferptr p(sizeof(paddr) * 2);
+#else
+    int wirelen = sizeof(__u32) * 2 + sizeof(ceph_sockaddr_storage);
+    bufferptr p(wirelen * 2);
+#endif
     addrbl.push_back(p);
   }
   if (tcp_read(addrbl.c_str(), addrbl.length()) < 0) {
@@ -1515,8 +1522,9 @@ void Pipe::reader()
     }
 
     if (tag == CEPH_MSGR_TAG_KEEPALIVE) {
-      ldout(msgr->cct,20) << "reader got KEEPALIVE" << dendl;
+      ldout(msgr->cct,2) << "reader got KEEPALIVE" << dendl;
       pipe_lock.Lock();
+      connection_state->set_last_keepalive(ceph_clock_now(NULL));
       continue;
     }
     if (tag == CEPH_MSGR_TAG_KEEPALIVE2) {
@@ -1531,14 +1539,15 @@ void Pipe::reader()
       } else {
 	send_keepalive_ack = true;
 	keepalive_ack_stamp = utime_t(t);
-	ldout(msgr->cct,20) << "reader got KEEPALIVE2 " << keepalive_ack_stamp
-			    << dendl;
+	ldout(msgr->cct,2) << "reader got KEEPALIVE2 " << keepalive_ack_stamp
+			   << dendl;
+	connection_state->set_last_keepalive(ceph_clock_now(NULL));
 	cond.Signal();
       }
       continue;
     }
     if (tag == CEPH_MSGR_TAG_KEEPALIVE2_ACK) {
-      ldout(msgr->cct,20) << "reader got KEEPALIVE_ACK" << dendl;
+      ldout(msgr->cct,2) << "reader got KEEPALIVE_ACK" << dendl;
       struct ceph_timespec t;
       int rc = tcp_read((char*)&t, sizeof(t));
       pipe_lock.Lock();
@@ -1546,7 +1555,7 @@ void Pipe::reader()
 	ldout(msgr->cct,2) << "reader couldn't read KEEPALIVE2 stamp " << cpp_strerror(errno) << dendl;
 	fault(true);
       } else {
-	connection_state->last_keepalive_ack = utime_t(t);
+	connection_state->set_last_keepalive_ack(utime_t(t));
       }
       continue;
     }
@@ -2115,8 +2124,73 @@ int Pipe::read_message(Message **pm, AuthSessionHandler* auth_handler)
   return ret;
 }
 
+/* 
+ SIGPIPE suppression - for platforms without SO_NOSIGPIPE or MSG_NOSIGNAL
+  http://krokisplace.blogspot.in/2010/02/suppressing-sigpipe-in-library.html 
+  http://www.microhowto.info/howto/ignore_sigpipe_without_affecting_other_threads_in_a_process.html 
+*/
+void Pipe::suppress_sigpipe()
+{
+#if !defined(MSG_NOSIGNAL) && !defined(SO_NOSIGPIPE)
+  /*
+    We want to ignore possible SIGPIPE that we can generate on write.
+    SIGPIPE is delivered *synchronously* and *only* to the thread
+    doing the write.  So if it is reported as already pending (which
+    means the thread blocks it), then we do nothing: if we generate
+    SIGPIPE, it will be merged with the pending one (there's no
+    queuing), and that suits us well.  If it is not pending, we block
+    it in this thread (and we avoid changing signal action, because it
+    is per-process).
+  */
+  sigset_t pending;
+  sigemptyset(&pending);
+  sigpending(&pending);
+  sigpipe_pending = sigismember(&pending, SIGPIPE);
+  if (!sigpipe_pending) {
+    sigset_t blocked;
+    sigemptyset(&blocked);
+    pthread_sigmask(SIG_BLOCK, &sigpipe_mask, &blocked);
+
+    /* Maybe it was blocked already?  */
+    sigpipe_unblock = ! sigismember(&blocked, SIGPIPE);
+  }
+#endif  /* !defined(MSG_NOSIGNAL) && !defined(SO_NOSIGPIPE) */
+}
+
+
+void Pipe::restore_sigpipe()
+{
+#if !defined(MSG_NOSIGNAL) && !defined(SO_NOSIGPIPE)
+  /*
+    If SIGPIPE was pending already we do nothing.  Otherwise, if it
+    became pending (i.e., we generated it), then we sigwait() it (thus
+    clearing the pending status).  Then we unblock SIGPIPE, but only if it
+    was us who blocked it.
+  */
+  if (!sigpipe_pending) {
+    sigset_t pending;
+    sigemptyset(&pending);
+    sigpending(&pending);
+    if (sigismember(&pending, SIGPIPE)) {
+      /*
+        Protect ourselves from a situation when SIGPIPE was sent
+        by the user to the whole process, and was delivered to
+        other thread before we had a chance to wait for it.
+      */
+      static const struct timespec nowait = { 0, 0 };
+      TEMP_FAILURE_RETRY(sigtimedwait(&sigpipe_mask, NULL, &nowait));
+    }
+
+    if (sigpipe_unblock)
+      pthread_sigmask(SIG_UNBLOCK, &sigpipe_mask, NULL);
+  }
+#endif  /* !defined(MSG_NOSIGNAL) && !defined(SO_NOSIGPIPE) */
+}
+
+
 int Pipe::do_sendmsg(struct msghdr *msg, int len, bool more)
 {
+  suppress_sigpipe();
   while (len > 0) {
     if (0) { // sanity
       int l = 0;
@@ -2125,16 +2199,23 @@ int Pipe::do_sendmsg(struct msghdr *msg, int len, bool more)
       assert(l == len);
     }
 
-    int r = ::sendmsg(sd, msg, MSG_NOSIGNAL | (more ? MSG_MORE : 0));
+    int r;
+#if defined(MSG_NOSIGNAL)
+    r = ::sendmsg(sd, msg, MSG_NOSIGNAL | (more ? MSG_MORE : 0));
+#else
+    r = ::sendmsg(sd, msg, (more ? MSG_MORE : 0));
+#endif
     if (r == 0) 
       ldout(msgr->cct,10) << "do_sendmsg hmm do_sendmsg got r==0!" << dendl;
     if (r < 0) { 
       ldout(msgr->cct,1) << "do_sendmsg error " << cpp_strerror(errno) << dendl;
+      restore_sigpipe();
       return -1;
     }
     if (state == STATE_CLOSED) {
       ldout(msgr->cct,10) << "do_sendmsg oh look, state == CLOSED, giving up" << dendl;
       errno = EINTR;
+      restore_sigpipe();
       return -1; // close enough
     }
 
@@ -2159,6 +2240,7 @@ int Pipe::do_sendmsg(struct msghdr *msg, int len, bool more)
       }
     }
   }
+  restore_sigpipe();
   return 0;
 }
 
@@ -2522,8 +2604,15 @@ int Pipe::tcp_write(const char *buf, int len)
 
   //lgeneric_dout(cct, DBL) << "tcp_write writing " << len << dendl;
   assert(len > 0);
+  suppress_sigpipe();
+
   while (len > 0) {
-    int did = ::send( sd, buf, len, MSG_NOSIGNAL );
+    int did;
+#if defined(MSG_NOSIGNAL)
+    did = ::send( sd, buf, len, MSG_NOSIGNAL );
+#else
+    did = ::send( sd, buf, len, 0);
+#endif
     if (did < 0) {
       //lgeneric_dout(cct, 1) << "tcp_write error did = " << did << " " << cpp_strerror(errno) << dendl;
       //lgeneric_derr(cct, 1) << "tcp_write error did = " << did << " " << cpp_strerror(errno) << dendl;
@@ -2533,5 +2622,7 @@ int Pipe::tcp_write(const char *buf, int len)
     buf += did;
     //lgeneric_dout(cct, DBL) << "tcp_write did " << did << ", " << len << " left" << dendl;
   }
+  restore_sigpipe();
+
   return 0;
 }
diff --git a/src/msg/simple/Pipe.h b/src/msg/simple/Pipe.h
index 0c1671a..ce24a2a 100644
--- a/src/msg/simple/Pipe.h
+++ b/src/msg/simple/Pipe.h
@@ -179,6 +179,11 @@ class DispatchQueue;
   private:
     int sd;
     struct iovec msgvec[IOV_MAX];
+#if !defined(MSG_NOSIGNAL) && !defined(SO_NOSIGPIPE)
+    sigset_t sigpipe_mask;
+    bool sigpipe_pending;
+    bool sigpipe_unblock;
+#endif
 
   public:
     int port;
@@ -247,6 +252,10 @@ class DispatchQueue;
     int write_keepalive();
     int write_keepalive2(char tag, const utime_t &t);
 
+    void suppress_sigpipe();
+    void restore_sigpipe();
+
+
     void fault(bool reader=false);
 
     void was_session_reset();
diff --git a/src/ocf/Makefile.in b/src/ocf/Makefile.in
index 518330c..2c085f2 100644
--- a/src/ocf/Makefile.in
+++ b/src/ocf/Makefile.in
@@ -168,6 +168,7 @@ AUTOMAKE = @AUTOMAKE@
 AWK = @AWK@
 BOOST_PROGRAM_OPTIONS_LIBS = @BOOST_PROGRAM_OPTIONS_LIBS@
 BOOST_RANDOM_LIBS = @BOOST_RANDOM_LIBS@
+BOOST_REGEX_LIBS = @BOOST_REGEX_LIBS@
 BOOST_THREAD_LIBS = @BOOST_THREAD_LIBS@
 CC = @CC@
 CCAS = @CCAS@
diff --git a/src/os/DBObjectMap.cc b/src/os/DBObjectMap.cc
index 8ff7ef7..449a5de 100644
--- a/src/os/DBObjectMap.cc
+++ b/src/os/DBObjectMap.cc
@@ -11,7 +11,7 @@
 #include <vector>
 
 #include "ObjectMap.h"
-#include "KeyValueDB.h"
+#include "kv/KeyValueDB.h"
 #include "DBObjectMap.h"
 #include <errno.h>
 
@@ -344,7 +344,7 @@ bool DBObjectMap::DBObjectMapIteratorImpl::valid_parent()
   return false;
 }
 
-int DBObjectMap::DBObjectMapIteratorImpl::next()
+int DBObjectMap::DBObjectMapIteratorImpl::next(bool validate)
 {
   assert(cur_iter->valid());
   assert(valid());
@@ -1089,17 +1089,16 @@ DBObjectMap::Header DBObjectMap::_lookup_map_header(
     }
   }
 
-  map<string, bufferlist> out;
-  set<string> to_get;
-  to_get.insert(map_header_key(oid));
-  int r = db->get(HOBJECT_TO_SEQ, to_get, &out);
-  if (r < 0 || out.empty()) {
+  bufferlist out;
+  int r = db->get(HOBJECT_TO_SEQ, map_header_key(oid), &out);
+  if (r < 0 || out.length()==0) {
     delete header;
     return Header();
   }
 
   Header ret(header, RemoveOnDelete(this));
-  bufferlist::iterator iter = out.begin()->second.begin();
+  bufferlist::iterator iter = out.begin();
+
   ret->decode(iter);
   {
     Mutex::Locker l(cache_lock);
diff --git a/src/os/DBObjectMap.h b/src/os/DBObjectMap.h
index ee252c1..00ce46e 100644
--- a/src/os/DBObjectMap.h
+++ b/src/os/DBObjectMap.h
@@ -12,7 +12,7 @@
 #include <boost/scoped_ptr.hpp>
 
 #include "ObjectMap.h"
-#include "KeyValueDB.h"
+#include "kv/KeyValueDB.h"
 #include "osd/osd_types.h"
 #include "common/Mutex.h"
 #include "common/Cond.h"
@@ -347,7 +347,7 @@ private:
     int upper_bound(const string &after) { return 0; }
     int lower_bound(const string &to) { return 0; }
     bool valid() { return false; }
-    int next() { assert(0); return 0; }
+    int next(bool validate=true) { assert(0); return 0; }
     string key() { assert(0); return ""; }
     bufferlist value() { assert(0); return bufferlist(); }
     int status() { return 0; }
@@ -385,7 +385,7 @@ private:
     int upper_bound(const string &after);
     int lower_bound(const string &to);
     bool valid();
-    int next();
+    int next(bool validate=true);
     string key();
     bufferlist value();
     int status();
diff --git a/src/os/FileJournal.cc b/src/os/FileJournal.cc
index fb05152..86d269b 100644
--- a/src/os/FileJournal.cc
+++ b/src/os/FileJournal.cc
@@ -296,7 +296,7 @@ int FileJournal::_open_file(int64_t oldsize, blksize_t blksize,
     char *buf;
     ret = ::posix_memalign((void **)&buf, block_size, write_size);
     if (ret != 0) {
-      return ret;
+      return -ret;
     }
     memset(static_cast<void*>(buf), 0, write_size);
     uint64_t i = 0;
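
The sign flip on the posix_memalign() return is a genuine bug fix: unlike malloc(), posix_memalign() does not set errno, it returns the error number itself (0 on success), so propagating it unchanged would hand callers a positive value where Ceph expects a negative errno:

    char *buf = NULL;
    int ret = ::posix_memalign((void **)&buf, block_size, write_size);
    if (ret != 0)     // ret is a positive errno, e.g. ENOMEM or EINVAL
      return -ret;    // convert to the negative-errno convention
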
@@ -633,7 +633,8 @@ int FileJournal::_fdump(Formatter &f, bool simple)
 
     if (!pos) {
       dout(2) << "_dump -- not readable" << dendl;
-      return false;
+      err = -EINVAL;
+      break;
     }
     stringstream ss;
     read_entry_result result = do_read_entry(
@@ -647,7 +648,7 @@ int FileJournal::_fdump(Formatter &f, bool simple)
         dout(2) << "Unable to read past sequence " << seq
 	    << " but header indicates the journal has committed up through "
 	    << header.committed_up_to << ", journal is corrupt" << dendl;
-        err = EINVAL;
+        err = -EINVAL;
       }
       dout(25) << ss.str() << dendl;
       dout(25) << "No further valid entries found, journal is most likely valid"
@@ -665,12 +666,11 @@ int FileJournal::_fdump(Formatter &f, bool simple)
       bufferlist::iterator p = bl.begin();
       int trans_num = 0;
       while (!p.end()) {
-        ObjectStore::Transaction *t = new ObjectStore::Transaction(p);
+        ObjectStore::Transaction t(p);
         f.open_object_section("transaction");
         f.dump_unsigned("trans_num", trans_num);
-        t->dump(&f);
+        t.dump(&f);
         f.close_section();
-        delete t;
         trans_num++;
       }
       f.close_section();
@@ -894,7 +894,7 @@ int FileJournal::prepare_multi_write(bufferlist& bl, uint64_t& orig_ops, uint64_
 	// throw out what we have so far
 	full_state = FULL_FULL;
 	while (!writeq_empty()) {
-	  put_throttle(1, peek_write().bl.length());
+	  put_throttle(1, peek_write().orig_len);
 	  pop_write();
 	}  
 	print_header(header);
@@ -974,54 +974,39 @@ int FileJournal::prepare_single_write(bufferlist& bl, off64_t& queue_pos, uint64
   write_item &next_write = peek_write();
   uint64_t seq = next_write.seq;
   bufferlist &ebl = next_write.bl;
-  unsigned head_size = sizeof(entry_header_t);
-  off64_t base_size = 2*head_size + ebl.length();
-
-  int alignment = next_write.alignment; // we want to start ebl with this alignment
-  unsigned pre_pad = 0;
-  if (alignment >= 0)
-    pre_pad = ((unsigned int)alignment - (unsigned int)head_size) & ~CEPH_PAGE_MASK;
-  off64_t size = ROUND_UP_TO(base_size + pre_pad, header.alignment);
-  unsigned post_pad = size - base_size - pre_pad;
+  off64_t size = ebl.length();
 
   int r = check_for_full(seq, queue_pos, size);
   if (r < 0)
     return r;   // ENOSPC or EAGAIN
 
-  orig_bytes += ebl.length();
+  uint32_t orig_len = next_write.orig_len;
+  orig_bytes += orig_len;
   orig_ops++;
 
   // add to write buffer
   dout(15) << "prepare_single_write " << orig_ops << " will write " << queue_pos << " : seq " << seq
-	   << " len " << ebl.length() << " -> " << size
-	   << " (head " << head_size << " pre_pad " << pre_pad
-	   << " ebl " << ebl.length() << " post_pad " << post_pad << " tail " << head_size << ")"
-	   << " (ebl alignment " << alignment << ")"
-	   << dendl;
+	   << " len " << orig_len << " -> " << size << dendl;
     
-  // add it this entry
-  entry_header_t h;
-  memset(&h, 0, sizeof(h));
-  h.seq = seq;
-  h.pre_pad = pre_pad;
-  h.len = ebl.length();
-  h.post_pad = post_pad;
-  h.make_magic(queue_pos, header.get_fsid64());
-  h.crc32c = ebl.crc32c(0);
-
-  bl.append((const char*)&h, sizeof(h));
-  if (pre_pad) {
-    bufferptr bp = buffer::create_static(pre_pad, zero_buf);
-    bl.push_back(bp);
-  }
-  bl.claim_append(ebl, buffer::list::CLAIM_ALLOW_NONSHAREABLE); // potential zero-copy
-
-  if (h.post_pad) {
-    bufferptr bp = buffer::create_static(post_pad, zero_buf);
-    bl.push_back(bp);
-  }
-  bl.append((const char*)&h, sizeof(h));
-
+  unsigned seq_offset = offsetof(entry_header_t, seq);
+  unsigned magic1_offset = offsetof(entry_header_t, magic1);
+  unsigned magic2_offset = offsetof(entry_header_t, magic2);
+
+  bufferptr headerptr = ebl.buffers().front();
+  uint64_t _seq = seq;
+  uint64_t _queue_pos = queue_pos;
+  uint64_t magic2 = entry_header_t::make_magic(seq, orig_len, header.get_fsid64());
+  headerptr.copy_in(seq_offset, sizeof(uint64_t), (char *)&_seq);
+  headerptr.copy_in(magic1_offset, sizeof(uint64_t), (char *)&_queue_pos);
+  headerptr.copy_in(magic2_offset, sizeof(uint64_t), (char *)&magic2);
+
+  bufferptr footerptr = ebl.buffers().back();
+  unsigned post_offset  = footerptr.length() - sizeof(entry_header_t);
+  footerptr.copy_in(post_offset + seq_offset, sizeof(uint64_t), (char *)&_seq);
+  footerptr.copy_in(post_offset + magic1_offset, sizeof(uint64_t), (char *)&_queue_pos);
+  footerptr.copy_in(post_offset + magic2_offset, sizeof(uint64_t), (char *)&magic2);
+
+  bl.claim_append(ebl);
   if (next_write.tracked_op)
     next_write.tracked_op->mark_event("write_thread_in_journal_buffer");
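
prepare_single_write() no longer assembles the entry at all; prepare_entry() (added below) has already serialized header, padding, payload and footer into ebl. What remains is patching the fields that depend on where the entry lands in the journal: seq, magic1 (the queue position) and magic2. Because entry_header_t is packed, offsetof() plus bufferptr::copy_in() can overwrite single fields in place; a reduced sketch (hdr_t is illustrative, the real struct has more fields):

    struct hdr_t {
      uint64_t seq;
      uint64_t magic1;   // queue position, known only at write time
      uint64_t magic2;   // fsid ^ seq ^ len
    } __attribute__((__packed__));

    static void patch_u64(bufferptr &bp, unsigned base, unsigned off, uint64_t v)
    {
      bp.copy_in(base + off, sizeof(v), (char *)&v);  // overwrite in place
    }
    // header: patch_u64(headerptr, 0, offsetof(hdr_t, seq), seq);
    // footer: base = footerptr.length() - sizeof(entry_header_t)
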
 
@@ -1042,8 +1027,7 @@ void FileJournal::align_bl(off64_t pos, bufferlist& bl)
   // make sure list segments are page aligned
   if (directio && (!bl.is_aligned(block_size) ||
 		   !bl.is_n_align_sized(CEPH_MINIMUM_BLOCK_SIZE))) {
-    bl.rebuild_aligned(CEPH_MINIMUM_BLOCK_SIZE);
-    dout(10) << __func__ << " total memcopy: " << bl.get_memcopy_count() << dendl;
+    assert(0 == "bl should be align");
     if ((bl.length() & (CEPH_MINIMUM_BLOCK_SIZE - 1)) != 0 ||
 	(pos & (CEPH_MINIMUM_BLOCK_SIZE - 1)) != 0)
       dout(0) << "rebuild_page_aligned failed, " << bl << dendl;
@@ -1301,7 +1285,7 @@ void FileJournal::write_thread_entry()
       if (write_stop) {
 	dout(20) << "write_thread_entry full and stopping, throw out queue and finish up" << dendl;
 	while (!writeq_empty()) {
-	  put_throttle(1, peek_write().bl.length());
+	  put_throttle(1, peek_write().orig_len);
 	  pop_write();
 	}  
 	print_header(header);
@@ -1577,7 +1561,60 @@ void FileJournal::check_aio_completion()
 }
 #endif
 
-void FileJournal::submit_entry(uint64_t seq, bufferlist& e, int alignment,
+int FileJournal::prepare_entry(list<ObjectStore::Transaction*>& tls, bufferlist* tbl) {
+  dout(10) << "prepare_entry " << tls << dendl;
+  unsigned data_len = 0;
+  int data_align = -1; // -1 indicates that we don't care about the alignment
+  bufferlist bl;
+  for (list<ObjectStore::Transaction*>::iterator p = tls.begin();
+      p != tls.end(); ++p) {
+    ObjectStore::Transaction *t = *p;
+    if (t->get_data_length() > data_len &&
+     (int)t->get_data_length() >= g_conf->journal_align_min_size) {
+     data_len = t->get_data_length();
+     data_align = (t->get_data_alignment() - bl.length()) & ~CEPH_PAGE_MASK;
+    }
+    ::encode(*t, bl);
+  }
+  if (tbl->length()) {
+    bl.claim_append(*tbl);
+  }
+  // add this entry
+  entry_header_t h;
+  unsigned head_size = sizeof(entry_header_t);
+  off64_t base_size = 2*head_size + bl.length();
+  memset(&h, 0, sizeof(h));
+  if (data_align >= 0)
+    h.pre_pad = ((unsigned int)data_align - (unsigned int)head_size) & ~CEPH_PAGE_MASK;
+  off64_t size = ROUND_UP_TO(base_size + h.pre_pad, header.alignment);
+  unsigned post_pad = size - base_size - h.pre_pad;
+  h.len = bl.length();
+  h.post_pad = post_pad;
+  h.crc32c = bl.crc32c(0);
+  dout(10) << " len " << bl.length() << " -> " << size
+       << " (head " << head_size << " pre_pad " << h.pre_pad
+       << " bl " << bl.length() << " post_pad " << post_pad << " tail " << head_size << ")"
+       << " (bl alignment " << data_align << ")"
+       << dendl;
+  bufferlist ebl;
+  // header
+  ebl.append((const char*)&h, sizeof(h));
+  if (h.pre_pad) {
+    ebl.push_back(buffer::create_static(h.pre_pad, zero_buf));
+  }
+  // payload
+  ebl.claim_append(bl, buffer::list::CLAIM_ALLOW_NONSHAREABLE); // potential zero-copy
+  if (h.post_pad) {
+    ebl.push_back(buffer::create_static(h.post_pad, zero_buf));
+  }
+  // footer
+  ebl.append((const char*)&h, sizeof(h));
+  ebl.rebuild_aligned(CEPH_MINIMUM_BLOCK_SIZE);
+  tbl->claim(ebl);
+  return h.len;
+}
+
+void FileJournal::submit_entry(uint64_t seq, bufferlist& e, uint32_t orig_len,
 			       Context *oncommit, TrackedOpRef osd_op)
 {
   // dump on queue
@@ -1587,7 +1624,7 @@ void FileJournal::submit_entry(uint64_t seq, bufferlist& e, int alignment,
   assert(e.length() > 0);
 
   throttle_ops.take(1);
-  throttle_bytes.take(e.length());
+  throttle_bytes.take(orig_len);
   if (osd_op)
     osd_op->mark_event("commit_queued_for_journal_write");
   if (logger) {
@@ -1605,7 +1642,7 @@ void FileJournal::submit_entry(uint64_t seq, bufferlist& e, int alignment,
 	seq, oncommit, ceph_clock_now(g_ceph_context), osd_op));
     if (writeq.empty())
       writeq_cond.Signal();
-    writeq.push_back(write_item(seq, e, alignment, osd_op));
+    writeq.push_back(write_item(seq, e, orig_len, osd_op));
   }
 }
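
The entry laid out by prepare_entry() above is: header | pre_pad | encoded transactions | post_pad | header copy, rebuilt block-aligned at the end so the write path never has to realign it. A worked example of the padding arithmetic, assuming a 40-byte entry_header_t, 4 KiB pages (so ~CEPH_PAGE_MASK == 4095) and header.alignment == 4096, for a 5000-byte payload whose large data block wants to start page-aligned (data_align == 0):

    head_size = 40
    pre_pad   = (0 - 40) & 4095                = 4056   // payload starts at 40 + 4056 = 4096
    base_size = 2*40 + 5000                    = 5080
    size      = ROUND_UP_TO(5080 + 4056, 4096) = 12288
    post_pad  = 12288 - 5080 - 4056            = 3152

h.len records the unpadded payload length (5000 here), which is also what prepare_entry() returns and what the new orig_len throttling accounts against.
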
 
@@ -1738,7 +1775,7 @@ void FileJournal::committed_thru(uint64_t seq)
     dout(15) << " dropping committed but unwritten seq " << peek_write().seq 
 	     << " len " << peek_write().bl.length()
 	     << dendl;
-    put_throttle(1, peek_write().bl.length());
+    put_throttle(1, peek_write().orig_len);
     pop_write();
   }
   
diff --git a/src/os/FileJournal.h b/src/os/FileJournal.h
index fbe616d..50bc810 100644
--- a/src/os/FileJournal.h
+++ b/src/os/FileJournal.h
@@ -50,13 +50,13 @@ public:
   struct write_item {
     uint64_t seq;
     bufferlist bl;
-    int alignment;
+    uint32_t orig_len;
     TrackedOpRef tracked_op;
-    write_item(uint64_t s, bufferlist& b, int al, TrackedOpRef opref) :
-      seq(s), alignment(al), tracked_op(opref) {
+    write_item(uint64_t s, bufferlist& b, int ol, TrackedOpRef opref) :
+      seq(s), orig_len(ol), tracked_op(opref) {
       bl.claim(b, buffer::list::CLAIM_ALLOW_NONSHAREABLE); // potential zero-copy
     }
-    write_item() : seq(0), alignment(0) {}
+    write_item() : seq(0), orig_len(0) {}
   };
 
   Mutex finisher_lock;
@@ -88,7 +88,9 @@ public:
     completions.pop_front();
   }
 
-  void submit_entry(uint64_t seq, bufferlist& bl, int alignment,
+  int prepare_entry(list<ObjectStore::Transaction*>& tls, bufferlist* tbl);
+
+  void submit_entry(uint64_t seq, bufferlist& bl, uint32_t orig_len,
 		    Context *oncommit,
 		    TrackedOpRef osd_op = TrackedOpRef());
   /// End protected by finisher_lock
@@ -203,14 +205,13 @@ public:
     uint64_t magic1;
     uint64_t magic2;
     
-    void make_magic(off64_t pos, uint64_t fsid) {
-      magic1 = pos;
-      magic2 = fsid ^ seq ^ len;
+    static uint64_t make_magic(uint64_t seq, uint32_t len, uint64_t fsid) {
+      return (fsid ^ seq ^ len);
     }
     bool check_magic(off64_t pos, uint64_t fsid) {
       return
-	magic1 == (uint64_t)pos &&
-	magic2 == (fsid ^ seq ^ len);
+    magic1 == (uint64_t)pos &&
+    magic2 == (fsid ^ seq ^ len);
     }
   } __attribute__((__packed__, aligned(4)));
 
@@ -220,7 +221,6 @@ private:
   string fn;
 
   char *zero_buf;
-
   off64_t max_size;
   size_t block_size;
   bool directio, aio, force_aio;
diff --git a/src/os/FileStore.cc b/src/os/FileStore.cc
index 3e8bb29..3cfb13f 100644
--- a/src/os/FileStore.cc
+++ b/src/os/FileStore.cc
@@ -69,7 +69,7 @@
 #include "common/fd.h"
 #include "HashIndex.h"
 #include "DBObjectMap.h"
-#include "KeyValueDB.h"
+#include "kv/KeyValueDB.h"
 
 #include "common/ceph_crypto.h"
 using ceph::crypto::SHA1;
@@ -301,7 +301,7 @@ int FileStore::lfn_open(coll_t cid,
       goto fail;
     }
     r = chain_fsetxattr(fd, XATTR_SPILL_OUT_NAME,
-                        XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT));
+                        XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT), true);
     if (r < 0) {
       VOID_TEMP_FAILURE_RETRY(::close(fd));
       derr << "error setting spillout xattr for oid " << oid << " (" << (*path)->path()
@@ -522,7 +522,6 @@ FileStore::FileStore(const std::string &base, const std::string &jdev, osflagbit
   basedir_fd(-1), current_fd(-1),
   backend(NULL),
   index_manager(do_update),
-  ondisk_finisher(g_ceph_context),
   lock("FileStore::lock"),
   force_sync(false), 
   sync_entry_timeo_lock("sync_entry_timeo_lock"),
@@ -530,9 +529,11 @@ FileStore::FileStore(const std::string &base, const std::string &jdev, osflagbit
   stop(false), sync_thread(this),
   fdcache(g_ceph_context),
   wbthrottle(g_ceph_context),
+  next_osr_id(0),
   throttle_ops(g_ceph_context, "filestore_ops",g_conf->filestore_queue_max_ops),
   throttle_bytes(g_ceph_context, "filestore_bytes",g_conf->filestore_queue_max_bytes),
-  op_finisher(g_ceph_context),
+  m_ondisk_finisher_num(g_conf->filestore_ondisk_finisher_threads),
+  m_apply_finisher_num(g_conf->filestore_apply_finisher_threads),
   op_tp(g_ceph_context, "FileStore::op_tp", g_conf->filestore_op_threads, "filestore_op_threads"),
   op_wq(this, g_conf->filestore_op_thread_timeout,
 	g_conf->filestore_op_thread_suicide_timeout, &op_tp),
@@ -567,6 +568,18 @@ FileStore::FileStore(const std::string &base, const std::string &jdev, osflagbit
   m_filestore_max_inline_xattrs(0)
 {
   m_filestore_kill_at.set(g_conf->filestore_kill_at);
+  for (int i = 0; i < m_ondisk_finisher_num; ++i) {
+    ostringstream oss;
+    oss << "filestore-ondisk-" << i;
+    Finisher *f = new Finisher(g_ceph_context, oss.str());
+    ondisk_finishers.push_back(f);
+  }
+  for (int i = 0; i < m_apply_finisher_num; ++i) {
+    ostringstream oss;
+    oss << "filestore-apply-" << i;
+    Finisher *f = new Finisher(g_ceph_context, oss.str());
+    apply_finishers.push_back(f);
+  }
 
   ostringstream oss;
   oss << basedir << "/current";
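
The single op_finisher/ondisk_finisher pair becomes two pools, sized by the new filestore_ondisk_finisher_threads and filestore_apply_finisher_threads options. Routing is modular hashing on the sequencer id, so every completion for a given OpSequencer lands on the same thread (keeping its callbacks ordered) while unrelated sequencers spread across the pool; a condensed sketch of the pattern used throughout the later hunks:

    // Same sequencer -> same Finisher -> per-sequencer completion order kept.
    Finisher *pick(vector<Finisher*> &pool, int pool_size, int osr_id)
    {
      return pool[osr_id % pool_size];
    }
    // e.g. pick(ondisk_finishers, m_ondisk_finisher_num, osr->id)->queue(ondisk);
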
@@ -617,6 +630,14 @@ FileStore::FileStore(const std::string &base, const std::string &jdev, osflagbit
 
 FileStore::~FileStore()
 {
+  for (vector<Finisher*>::iterator it = ondisk_finishers.begin(); it != ondisk_finishers.end(); ++it) {
+    delete *it;
+    *it = NULL;
+  }
+  for (vector<Finisher*>::iterator it = apply_finishers.begin(); it != apply_finishers.end(); ++it) {
+    delete *it;
+    *it = NULL;
+  }
   g_ceph_context->_conf->remove_observer(this);
   g_ceph_context->get_perfcounters_collection()->remove(logger);
 
@@ -1629,8 +1650,12 @@ int FileStore::mount()
   journal_start();
 
   op_tp.start();
-  op_finisher.start();
-  ondisk_finisher.start();
+  for (vector<Finisher*>::iterator it = ondisk_finishers.begin(); it != ondisk_finishers.end(); ++it) {
+    (*it)->start();
+  }
+  for (vector<Finisher*>::iterator it = apply_finishers.begin(); it != apply_finishers.end(); ++it) {
+    (*it)->start();
+  }
 
   timer.init();
 
@@ -1720,8 +1745,12 @@ int FileStore::umount()
   if (!(generic_flags & SKIP_JOURNAL_REPLAY))
     journal_write_close();
 
-  op_finisher.stop();
-  ondisk_finisher.stop();
+  for (vector<Finisher*>::iterator it = ondisk_finishers.begin(); it != ondisk_finishers.end(); ++it) {
+    (*it)->stop();
+  }
+  for (vector<Finisher*>::iterator it = apply_finishers.begin(); it != apply_finishers.end(); ++it) {
+    (*it)->stop();
+  }
 
   if (fsid_fd >= 0) {
     VOID_TEMP_FAILURE_RETRY(::close(fsid_fd));
@@ -1892,10 +1921,10 @@ void FileStore::_finish_op(OpSequencer *osr)
     o->onreadable_sync->complete(0);
   }
   if (o->onreadable) {
-    op_finisher.queue(o->onreadable);
+    apply_finishers[osr->id % m_apply_finisher_num]->queue(o->onreadable);
   }
   if (!to_queue.empty()) {
-    op_finisher.queue(to_queue);
+    apply_finishers[osr->id % m_apply_finisher_num]->queue(to_queue);
   }
   delete o;
 }
@@ -1938,7 +1967,7 @@ int FileStore::queue_transactions(Sequencer *posr, list<Transaction*> &tls,
     osr = static_cast<OpSequencer *>(posr->p.get());
     dout(5) << "queue_transactions existing " << osr << " " << *osr << dendl;
   } else {
-    osr = new OpSequencer;
+    osr = new OpSequencer(next_osr_id.inc());
     osr->set_cct(g_ceph_context);
     osr->parent = posr;
     posr->p = osr;
@@ -1956,7 +1985,7 @@ int FileStore::queue_transactions(Sequencer *posr, list<Transaction*> &tls,
     journal->throttle();
     //prepare and encode transactions data out of lock
     bufferlist tbl;
-    int data_align = _op_journal_transactions_prepare(o->tls, tbl);
+    int orig_len = journal->prepare_entry(o->tls, &tbl);
     uint64_t op_num = submit_manager.op_submit_start();
     o->op = op_num;
 
@@ -1966,7 +1995,7 @@ int FileStore::queue_transactions(Sequencer *posr, list<Transaction*> &tls,
     if (m_filestore_journal_parallel) {
       dout(5) << "queue_transactions (parallel) " << o->op << " " << o->tls << dendl;
       
-      _op_journal_transactions(tbl, data_align, o->op, ondisk, osd_op);
+      _op_journal_transactions(tbl, orig_len, o->op, ondisk, osd_op);
       
       // queue inside submit_manager op submission lock
       queue_op(osr, o);
@@ -1975,7 +2004,7 @@ int FileStore::queue_transactions(Sequencer *posr, list<Transaction*> &tls,
       
       osr->queue_journal(o->op);
 
-      _op_journal_transactions(tbl, data_align, o->op,
+      _op_journal_transactions(tbl, orig_len, o->op,
 			       new C_JournaledAhead(this, osr, o, ondisk),
 			       osd_op);
     } else {
@@ -2005,10 +2034,13 @@ int FileStore::queue_transactions(Sequencer *posr, list<Transaction*> &tls,
     return 0;
   }
 
-
+  assert(journal);
   //prepare and encode transactions data out of lock
   bufferlist tbl;
-  int data_align = _op_journal_transactions_prepare(tls, tbl);
+  int orig_len = -1;
+  if (journal->is_writeable()) {
+    orig_len = journal->prepare_entry(tls, &tbl);
+  }
   uint64_t op = submit_manager.op_submit_start();
   dout(5) << "queue_transactions (trailing journal) " << op << " " << tls << dendl;
 
@@ -2019,7 +2051,7 @@ int FileStore::queue_transactions(Sequencer *posr, list<Transaction*> &tls,
   int r = do_transactions(tls, op);
     
   if (r >= 0) {
-    _op_journal_transactions(tbl, data_align, op, ondisk, osd_op);
+    _op_journal_transactions(tbl, orig_len, op, ondisk, osd_op);
   } else {
     delete ondisk;
   }
@@ -2029,7 +2061,7 @@ int FileStore::queue_transactions(Sequencer *posr, list<Transaction*> &tls,
   if (onreadable_sync) {
     onreadable_sync->complete(r);
   }
-  op_finisher.queue(onreadable, r);
+  apply_finishers[osr->id % m_apply_finisher_num]->queue(onreadable, r);
 
   submit_manager.op_submit_finish(op);
   apply_manager.op_apply_finish(op);
@@ -2051,10 +2083,10 @@ void FileStore::_journaled_ahead(OpSequencer *osr, Op *o, Context *ondisk)
   // getting blocked behind an ondisk completion.
   if (ondisk) {
     dout(10) << " queueing ondisk " << ondisk << dendl;
-    ondisk_finisher.queue(ondisk);
+    ondisk_finishers[osr->id % m_ondisk_finisher_num]->queue(ondisk);
   }
   if (!to_queue.empty()) {
-    ondisk_finisher.queue(to_queue);
+    ondisk_finishers[osr->id % m_ondisk_finisher_num]->queue(to_queue);
   }
 }
 
@@ -2111,7 +2143,7 @@ void FileStore::_set_global_replay_guard(coll_t cid,
   // then record that we did it
   bufferlist v;
   ::encode(spos, v);
-  int r = chain_fsetxattr(fd, GLOBAL_REPLAY_GUARD_XATTR, v.c_str(), v.length());
+  int r = chain_fsetxattr(fd, GLOBAL_REPLAY_GUARD_XATTR, v.c_str(), v.length(), true);
   if (r < 0) {
     derr << __func__ << ": fsetxattr " << GLOBAL_REPLAY_GUARD_XATTR
 	 << " got " << cpp_strerror(r) << dendl;
@@ -2130,9 +2162,6 @@ void FileStore::_set_global_replay_guard(coll_t cid,
 int FileStore::_check_global_replay_guard(coll_t cid,
 					  const SequencerPosition& spos)
 {
-  if (!replaying || backend->can_checkpoint())
-    return 1;
-
   char fn[PATH_MAX];
   get_cdir(cid, fn, sizeof(fn));
   int fd = ::open(fn, O_RDONLY);
@@ -2204,7 +2233,7 @@ void FileStore::_set_replay_guard(int fd,
   bufferlist v(40);
   ::encode(spos, v);
   ::encode(in_progress, v);
-  int r = chain_fsetxattr(fd, REPLAY_GUARD_XATTR, v.c_str(), v.length());
+  int r = chain_fsetxattr(fd, REPLAY_GUARD_XATTR, v.c_str(), v.length(), true);
   if (r < 0) {
     derr << "fsetxattr " << REPLAY_GUARD_XATTR << " got " << cpp_strerror(r) << dendl;
     assert(0 == "fsetxattr failed");
@@ -2247,7 +2276,7 @@ void FileStore::_close_replay_guard(int fd, const SequencerPosition& spos)
   ::encode(spos, v);
   bool in_progress = false;
   ::encode(in_progress, v);
-  int r = chain_fsetxattr(fd, REPLAY_GUARD_XATTR, v.c_str(), v.length());
+  int r = chain_fsetxattr(fd, REPLAY_GUARD_XATTR, v.c_str(), v.length(), true);
   if (r < 0) {
     derr << "fsetxattr " << REPLAY_GUARD_XATTR << " got " << cpp_strerror(r) << dendl;
     assert(0 == "fsetxattr failed");
@@ -3322,10 +3351,10 @@ int FileStore::_clone(coll_t cid, const ghobject_t& oldoid, const ghobject_t& ne
     r = chain_fgetxattr(**o, XATTR_SPILL_OUT_NAME, buf, sizeof(buf));
     if (r >= 0 && !strncmp(buf, XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT))) {
       r = chain_fsetxattr(**n, XATTR_SPILL_OUT_NAME, XATTR_NO_SPILL_OUT,
-                          sizeof(XATTR_NO_SPILL_OUT));
+                          sizeof(XATTR_NO_SPILL_OUT), true);
     } else {
       r = chain_fsetxattr(**n, XATTR_SPILL_OUT_NAME, XATTR_SPILL_OUT,
-                          sizeof(XATTR_SPILL_OUT));
+                          sizeof(XATTR_SPILL_OUT), true);
     }
     if (r < 0)
       goto out3;
@@ -3802,7 +3831,9 @@ void FileStore::_flush_op_queue()
   dout(10) << "_flush_op_queue draining op tp" << dendl;
   op_wq.drain();
   dout(10) << "_flush_op_queue waiting for apply finisher" << dendl;
-  op_finisher.wait_for_empty();
+  for (vector<Finisher*>::iterator it = apply_finishers.begin(); it != apply_finishers.end(); ++it) {
+    (*it)->wait_for_empty();
+  }
 }
 
 /*
@@ -3826,7 +3857,9 @@ void FileStore::flush()
     if (journal)
       journal->flush();
     dout(10) << "flush draining ondisk finisher" << dendl;
-    ondisk_finisher.wait_for_empty();
+    for (vector<Finisher*>::iterator it = ondisk_finishers.begin(); it != ondisk_finishers.end(); ++it) {
+      (*it)->wait_for_empty();
+    }
   }
 
   _flush_op_queue();
@@ -3922,6 +3955,7 @@ int FileStore::_fgetattrs(int fd, map<string,bufferptr>& aset)
     dout(10) << " -ERANGE, got " << len << dendl;
     if (len < 0) {
       assert(!m_filestore_fail_eio || len != -EIO);
+      delete[] names2;
       return len;
     }
     name = names2;
@@ -3940,8 +3974,10 @@ int FileStore::_fgetattrs(int fd, map<string,bufferptr>& aset)
       if (*name) {
         dout(20) << "fgetattrs " << fd << " getting '" << name << "'" << dendl;
         int r = _fgetattr(fd, attrname, aset[name]);
-        if (r < 0)
+        if (r < 0) {
+	  delete[] names2;
 	  return r;
+        }
       }
     }
     name += strlen(name) + 1;
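
The two delete[] additions close a leak: names2 is heap-allocated when the xattr name list overflows the stack buffer, and the early error returns previously dropped it. A scope-based alternative would make every exit path safe by construction (a sketch of the idiom, not what this code does):

    std::unique_ptr<char[]> names2(new char[len + 1]);
    char *name = names2.get();
    // ... parse the attribute names; every early `return r;` now frees
    // the allocation automatically when names2 goes out of scope.
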
@@ -4645,7 +4681,6 @@ bool FileStore::collection_empty(coll_t c)
   RWLock::RLocker l((index.index)->access_lock);
 
   vector<ghobject_t> ls;
-  collection_list_handle_t handle;
   r = index->collection_list_partial(ghobject_t(), ghobject_t::get_max(), true,
 				     1, &ls, NULL);
   if (r < 0) {
@@ -4975,7 +5010,7 @@ int FileStore::_destroy_collection(coll_t c)
   dout(15) << "_destroy_collection " << fn << dendl;
   {
     Index from;
-    int r = get_index(c, &from);
+    r = get_index(c, &from);
     if (r < 0)
       goto out;
     assert(NULL != from.index);
diff --git a/src/os/FileStore.h b/src/os/FileStore.h
index 6580dd4..c972ebe 100644
--- a/src/os/FileStore.h
+++ b/src/os/FileStore.h
@@ -152,8 +152,6 @@ private:
   // ObjectMap
   boost::scoped_ptr<ObjectMap> object_map;
   
-  Finisher ondisk_finisher;
-
   // helper fns
   int get_cdir(coll_t cid, char *s, int len);
   
@@ -201,6 +199,7 @@ private:
   public:
     Sequencer *parent;
     Mutex apply_lock;  // for apply mutual exclusion
+    int id;
     
     /// get_max_uncompleted
     bool _get_max_uncompleted(
@@ -315,10 +314,11 @@ private:
       }
     }
 
-    OpSequencer()
+    OpSequencer(int i)
       : qlock("FileStore::OpSequencer::qlock", false, false),
 	parent(0),
-	apply_lock("FileStore::OpSequencer::apply_lock", false, false) {}
+	apply_lock("FileStore::OpSequencer::apply_lock", false, false),
+        id(i) {}
     ~OpSequencer() {
       assert(q.empty());
     }
@@ -333,9 +333,13 @@ private:
   FDCache fdcache;
   WBThrottle wbthrottle;
 
+  atomic_t next_osr_id;
   deque<OpSequencer*> op_queue;
   Throttle throttle_ops, throttle_bytes;
-  Finisher op_finisher;
+  const int m_ondisk_finisher_num;
+  const int m_apply_finisher_num;
+  vector<Finisher*> ondisk_finishers;
+  vector<Finisher*> apply_finishers;
 
   ThreadPool op_tp;
   struct OpWQ : public ThreadPool::WorkQueue<OpSequencer> {
diff --git a/src/os/GenericObjectMap.cc b/src/os/GenericObjectMap.cc
index 62f052f..14f8cd4 100644
--- a/src/os/GenericObjectMap.cc
+++ b/src/os/GenericObjectMap.cc
@@ -415,7 +415,7 @@ bool GenericObjectMap::GenericObjectMapIteratorImpl::valid_parent()
   return false;
 }
 
-int GenericObjectMap::GenericObjectMapIteratorImpl::next()
+int GenericObjectMap::GenericObjectMapIteratorImpl::next(bool validate)
 {
   assert(cur_iter->valid());
   assert(valid());
diff --git a/src/os/GenericObjectMap.h b/src/os/GenericObjectMap.h
index ecf2822..62c376c 100644
--- a/src/os/GenericObjectMap.h
+++ b/src/os/GenericObjectMap.h
@@ -26,7 +26,7 @@
 
 #include "include/memory.h"
 #include "ObjectMap.h"
-#include "KeyValueDB.h"
+#include "kv/KeyValueDB.h"
 #include "osd/osd_types.h"
 #include "common/Mutex.h"
 #include "common/Cond.h"
@@ -298,7 +298,7 @@ private:
     int upper_bound(const string &after) { return 0; }
     int lower_bound(const string &to) { return 0; }
     bool valid() { return false; }
-    int next() { assert(0); return 0; }
+    int next(bool validate=true) { assert(0); return 0; }
     string key() { assert(0); return ""; }
     bufferlist value() { assert(0); return bufferlist(); }
     int status() { return 0; }
@@ -337,7 +337,7 @@ private:
     int upper_bound(const string &after);
     int lower_bound(const string &to);
     bool valid();
-    int next();
+    int next(bool validate=true);
     string key();
     bufferlist value();
     int status();
diff --git a/src/os/IndexManager.cc b/src/os/IndexManager.cc
index 6a9f040..1415939 100644
--- a/src/os/IndexManager.cc
+++ b/src/os/IndexManager.cc
@@ -37,7 +37,7 @@ static int set_version(const char *path, uint32_t version) {
   bufferlist bl;
   ::encode(version, bl);
   return chain_setxattr(path, "user.cephos.collection_version", bl.c_str(), 
-		     bl.length());
+		     bl.length(), true);
 }
 
 static int get_version(const char *path, uint32_t *version) {
diff --git a/src/os/Journal.h b/src/os/Journal.h
index 4f8658f..d5b9186 100644
--- a/src/os/Journal.h
+++ b/src/os/Journal.h
@@ -22,6 +22,7 @@
 #include "include/Context.h"
 #include "common/Finisher.h"
 #include "common/TrackedOp.h"
+#include "os/ObjectStore.h"
 
 class PerfCounters;
 
@@ -57,7 +58,7 @@ public:
   // writes
   virtual bool is_writeable() = 0;
   virtual int make_writeable() = 0;
-  virtual void submit_entry(uint64_t seq, bufferlist& e, int alignment,
+  virtual void submit_entry(uint64_t seq, bufferlist& e, uint32_t orig_len,
 			    Context *oncommit,
 			    TrackedOpRef osd_op = TrackedOpRef()) = 0;
   virtual void commit_start(uint64_t seq) = 0;
@@ -71,6 +72,8 @@ public:
 
   virtual bool should_commit_now() = 0;
 
+  virtual int prepare_entry(list<ObjectStore::Transaction*>& tls, bufferlist* tbl) = 0;
+
   // reads/recovery
   
 };
diff --git a/src/os/JournalingObjectStore.cc b/src/os/JournalingObjectStore.cc
index 35cf74a..599a1b5 100644
--- a/src/os/JournalingObjectStore.cc
+++ b/src/os/JournalingObjectStore.cc
@@ -251,7 +251,7 @@ void JournalingObjectStore::ApplyManager::commit_finish()
 }
 
 void JournalingObjectStore::_op_journal_transactions(
-  bufferlist& tbl, int data_align,  uint64_t op,
+  bufferlist& tbl, uint32_t orig_len, uint64_t op,
   Context *onjournal, TrackedOpRef osd_op)
 {
   if (osd_op.get())
@@ -261,27 +261,9 @@ void JournalingObjectStore::_op_journal_transactions(
     dout(10) << "op_journal_transactions " << op  << dendl;
 
   if (journal && journal->is_writeable()) {
-    journal->submit_entry(op, tbl, data_align, onjournal, osd_op);
+    journal->submit_entry(op, tbl, orig_len, onjournal, osd_op);
   } else if (onjournal) {
     apply_manager.add_waiter(op, onjournal);
   }
 }
 
-int JournalingObjectStore::_op_journal_transactions_prepare(
-  list<ObjectStore::Transaction*>& tls, bufferlist& tbl)
-{
-  dout(10) << "_op_journal_transactions_prepare " << tls << dendl;
-  unsigned data_len = 0;
-  int data_align = -1; // -1 indicates that we don't care about the alignment
-  for (list<ObjectStore::Transaction*>::iterator p = tls.begin();
-      p != tls.end(); ++p) {
-    ObjectStore::Transaction *t = *p;
-    if (t->get_data_length() > data_len &&
-     (int)t->get_data_length() >= g_conf->journal_align_min_size) {
-     data_len = t->get_data_length();
-     data_align = (t->get_data_alignment() - tbl.length()) & ~CEPH_PAGE_MASK;
-    }
-    ::encode(*t, tbl);
-  }
-  return data_align;
-}
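
With _op_journal_transactions_prepare() gone, transaction encoding (and the padding it implies) is owned by the journal itself through the new Journal::prepare_entry() virtual, so FileStore no longer needs to know the entry format. The call sites in FileStore.cc above reduce to:

    bufferlist tbl;
    int orig_len = journal->prepare_entry(o->tls, &tbl);  // encode + pad once
    _op_journal_transactions(tbl, orig_len, o->op, ondisk, osd_op);
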
diff --git a/src/os/JournalingObjectStore.h b/src/os/JournalingObjectStore.h
index fbfa20c..42d13f6 100644
--- a/src/os/JournalingObjectStore.h
+++ b/src/os/JournalingObjectStore.h
@@ -17,6 +17,7 @@
 
 #include "ObjectStore.h"
 #include "Journal.h"
+#include "FileJournal.h"
 #include "common/RWLock.h"
 
 class JournalingObjectStore : public ObjectStore {
@@ -114,9 +115,7 @@ protected:
   void journal_write_close();
   int journal_replay(uint64_t fs_op_seq);
 
-  int _op_journal_transactions_prepare(
-    list<ObjectStore::Transaction*>& tls, bufferlist& tbl);
-  void _op_journal_transactions(bufferlist& tls, int data_align, uint64_t op,
+  void _op_journal_transactions(bufferlist& tls, uint32_t orig_len, uint64_t op,
 				Context *onjournal, TrackedOpRef osd_op);
 
   virtual int do_transactions(list<ObjectStore::Transaction*>& tls, uint64_t op_seq) = 0;
@@ -136,7 +135,9 @@ public:
       finisher(g_ceph_context),
       apply_manager(journal, finisher),
       replaying(false) {}
-  
+
+  ~JournalingObjectStore() {
+  }
 };
 
 #endif
diff --git a/src/os/KeyValueDB.h b/src/os/KeyValueDB.h
deleted file mode 100644
index e82151d..0000000
--- a/src/os/KeyValueDB.h
+++ /dev/null
@@ -1,220 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-#ifndef KEY_VALUE_DB_H
-#define KEY_VALUE_DB_H
-
-#include "include/buffer.h"
-#include <set>
-#include <map>
-#include <string>
-#include "include/memory.h"
-#include <boost/scoped_ptr.hpp>
-#include "ObjectMap.h"
-
-using std::string;
-/**
- * Defines virtual interface to be implemented by key value store
- *
- * Kyoto Cabinet or LevelDB should implement this
- */
-class KeyValueDB {
-public:
-  class TransactionImpl {
-  public:
-    /// Set Keys
-    void set(
-      const string &prefix,                 ///< [in] Prefix for keys
-      const std::map<string, bufferlist> &to_set ///< [in] keys/values to set
-    ) {
-      std::map<string, bufferlist>::const_iterator it;
-      for (it = to_set.begin(); it != to_set.end(); ++it)
-	set(prefix, it->first, it->second);
-    }
-
-    /// Set Key
-    virtual void set(
-      const string &prefix,   ///< [in] Prefix for the key
-      const string &k,	      ///< [in] Key to set
-      const bufferlist &bl    ///< [in] Value to set
-      ) = 0;
-
-
-    /// Removes Keys
-    void rmkeys(
-      const string &prefix,   ///< [in] Prefix to search for
-      const std::set<string> &keys ///< [in] Keys to remove
-    ) {
-      std::set<string>::const_iterator it;
-      for (it = keys.begin(); it != keys.end(); ++it)
-	rmkey(prefix, *it);
-    }
-
-    /// Remove Key
-    virtual void rmkey(
-      const string &prefix,   ///< [in] Prefix to search for
-      const string &k	      ///< [in] Key to remove
-      ) = 0;
-
-    /// Removes keys beginning with prefix
-    virtual void rmkeys_by_prefix(
-      const string &prefix ///< [in] Prefix by which to remove keys
-      ) = 0;
-
-    virtual ~TransactionImpl() {}
-  };
-  typedef ceph::shared_ptr< TransactionImpl > Transaction;
-
-  /// create a new instance
-  static KeyValueDB *create(CephContext *cct, const string& type,
-			    const string& dir);
-
-  /// test whether we can successfully initialize; may have side effects (e.g., create)
-  static int test_init(const string& type, const string& dir);
-  virtual int init(string option_str="") = 0;
-  virtual int open(ostream &out) = 0;
-  virtual int create_and_open(ostream &out) = 0;
-
-  virtual Transaction get_transaction() = 0;
-  virtual int submit_transaction(Transaction) = 0;
-  virtual int submit_transaction_sync(Transaction t) {
-    return submit_transaction(t);
-  }
-
-  /// Retrieve Keys
-  virtual int get(
-    const string &prefix,        ///< [in] Prefix for key
-    const std::set<string> &key,      ///< [in] Key to retrieve
-    std::map<string, bufferlist> *out ///< [out] Key value retrieved
-    ) = 0;
-  virtual int get(const string &prefix, ///< [in] prefix
-		  const string &key,    ///< [in] key
-		  bufferlist *value) {  ///< [out] value
-    set<string> ks;
-    ks.insert(key);
-    map<string,bufferlist> om;
-    int r = get(prefix, ks, &om);
-    if (om.find(key) != om.end()) {
-      *value = om[key];
-    } else {
-      *value = bufferlist();
-      r = -ENOENT;
-    }
-    return r;
-  }
-
-  class WholeSpaceIteratorImpl {
-  public:
-    virtual int seek_to_first() = 0;
-    virtual int seek_to_first(const string &prefix) = 0;
-    virtual int seek_to_last() = 0;
-    virtual int seek_to_last(const string &prefix) = 0;
-    virtual int upper_bound(const string &prefix, const string &after) = 0;
-    virtual int lower_bound(const string &prefix, const string &to) = 0;
-    virtual bool valid() = 0;
-    virtual int next() = 0;
-    virtual int prev() = 0;
-    virtual string key() = 0;
-    virtual pair<string,string> raw_key() = 0;
-    virtual bufferlist value() = 0;
-    virtual int status() = 0;
-    virtual ~WholeSpaceIteratorImpl() { }
-  };
-  typedef ceph::shared_ptr< WholeSpaceIteratorImpl > WholeSpaceIterator;
-
-  class IteratorImpl : public ObjectMap::ObjectMapIteratorImpl {
-    const string prefix;
-    WholeSpaceIterator generic_iter;
-  public:
-    IteratorImpl(const string &prefix, WholeSpaceIterator iter) :
-      prefix(prefix), generic_iter(iter) { }
-    virtual ~IteratorImpl() { }
-
-    int seek_to_first() {
-      return generic_iter->seek_to_first(prefix);
-    }
-    int seek_to_last() {
-      return generic_iter->seek_to_last(prefix);
-    }
-    int upper_bound(const string &after) {
-      return generic_iter->upper_bound(prefix, after);
-    }
-    int lower_bound(const string &to) {
-      return generic_iter->lower_bound(prefix, to);
-    }
-    bool valid() {
-      if (!generic_iter->valid())
-	return false;
-      pair<string,string> raw_key = generic_iter->raw_key();
-      return (raw_key.first.compare(0, prefix.length(), prefix) == 0);
-    }
-    int next() {
-      if (valid())
-	return generic_iter->next();
-      return status();
-    }
-    int prev() {
-      if (valid())
-	return generic_iter->prev();
-      return status();
-    }
-    string key() {
-      return generic_iter->key();
-    }
-    pair<string, string> raw_key() {
-      return generic_iter->raw_key();
-    }
-    bufferlist value() {
-      return generic_iter->value();
-    }
-    int status() {
-      return generic_iter->status();
-    }
-  };
-
-  typedef ceph::shared_ptr< IteratorImpl > Iterator;
-
-  WholeSpaceIterator get_iterator() {
-    return _get_iterator();
-  }
-
-  Iterator get_iterator(const string &prefix) {
-    return ceph::shared_ptr<IteratorImpl>(
-      new IteratorImpl(prefix, get_iterator())
-    );
-  }
-
-  WholeSpaceIterator get_snapshot_iterator() {
-    return _get_snapshot_iterator();
-  }
-
-  Iterator get_snapshot_iterator(const string &prefix) {
-    return ceph::shared_ptr<IteratorImpl>(
-      new IteratorImpl(prefix, get_snapshot_iterator())
-    );
-  }
-
-  virtual uint64_t get_estimated_size(map<string,uint64_t> &extra) = 0;
-  virtual int get_statfs(struct statfs *buf) {
-    return -EOPNOTSUPP;
-  }
-
-  virtual ~KeyValueDB() {}
-
-  /// compact the underlying store
-  virtual void compact() {}
-
-  /// compact db for all keys with a given prefix
-  virtual void compact_prefix(const string& prefix) {}
-  /// compact db for all keys with a given prefix, async
-  virtual void compact_prefix_async(const string& prefix) {}
-  virtual void compact_range(const string& prefix,
-			     const string& start, const string& end) {}
-  virtual void compact_range_async(const string& prefix,
-				   const string& start, const string& end) {}
-
-protected:
-  virtual WholeSpaceIterator _get_iterator() = 0;
-  virtual WholeSpaceIterator _get_snapshot_iterator() = 0;
-};
-
-#endif
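
KeyValueDB.h is not dropped outright; the kv/KeyValueDB.h include fix-ups throughout this commit show it relocated to a new kv/ subdirectory, and the KineticStore and LevelDBStore sources removed below presumably follow it there. For callers only the include path changes; the factory interface from the old header still applies (the path variable here is illustrative):

    #include "kv/KeyValueDB.h"   // formerly "KeyValueDB.h"

    KeyValueDB *db = KeyValueDB::create(g_ceph_context, "leveldb", path);
    if (db && db->create_and_open(std::cerr) == 0) {
      // ready for get() / get_transaction() / submit_transaction()
    }
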
diff --git a/src/os/KeyValueStore.cc b/src/os/KeyValueStore.cc
index 1a633c6..71a97ad 100644
--- a/src/os/KeyValueStore.cc
+++ b/src/os/KeyValueStore.cc
@@ -2729,7 +2729,7 @@ int KeyValueStore::_omap_rmkeyrange(coll_t cid, const ghobject_t &hoid,
       return -ENOENT;
 
     for (iter->lower_bound(first); iter->valid() && iter->key() < last;
-         iter->next()) {
+         iter->next(false)) {
       keys.insert(iter->key());
     }
   }
@@ -2888,7 +2888,7 @@ const char** KeyValueStore::get_tracked_conf_keys() const
   static const char* KEYS[] = {
     "keyvaluestore_queue_max_ops",
     "keyvaluestore_queue_max_bytes",
-    "keyvaluestore_strip_size",
+    "keyvaluestore_default_strip_size",
     "keyvaluestore_dump_file",
     NULL
   };
diff --git a/src/os/KeyValueStore.h b/src/os/KeyValueStore.h
index 90e41ee..4307901 100644
--- a/src/os/KeyValueStore.h
+++ b/src/os/KeyValueStore.h
@@ -36,7 +36,7 @@ using namespace std;
 
 #include "common/Mutex.h"
 #include "GenericObjectMap.h"
-#include "KeyValueDB.h"
+#include "kv/KeyValueDB.h"
 #include "common/random_cache.hpp"
 
 #include "include/uuid.h"
diff --git a/src/os/KineticStore.cc b/src/os/KineticStore.cc
deleted file mode 100644
index fb6e2bf..0000000
--- a/src/os/KineticStore.cc
+++ /dev/null
@@ -1,329 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-#include "KineticStore.h"
-#include "common/ceph_crypto.h"
-
-#include <set>
-#include <map>
-#include <string>
-#include "include/memory.h"
-#include <errno.h>
-using std::string;
-#include "common/perf_counters.h"
-
-#define dout_subsys ceph_subsys_keyvaluestore
-
-int KineticStore::init()
-{
-  // init defaults.  caller can override these if they want
-  // prior to calling open.
-  host = cct->_conf->kinetic_host;
-  port = cct->_conf->kinetic_port;
-  user_id = cct->_conf->kinetic_user_id;
-  hmac_key = cct->_conf->kinetic_hmac_key;
-  use_ssl = cct->_conf->kinetic_use_ssl;
-  return 0;
-}
-
-int KineticStore::_test_init(CephContext *c)
-{
-  kinetic::KineticConnectionFactory conn_factory =
-    kinetic::NewKineticConnectionFactory();
-
-  kinetic::ConnectionOptions options;
-  options.host = cct->_conf->kinetic_host;
-  options.port = cct->_conf->kinetic_port;
-  options.user_id = cct->_conf->kinetic_user_id;
-  options.hmac_key = cct->_conf->kinetic_hmac_key;
-  options.use_ssl = cct->_conf->kinetic_use_ssl;
-
-  kinetic::Status status = conn_factory.NewThreadsafeBlockingConnection(options, kinetic_conn, 10);
-  kinetic_conn.reset();
-  if (!status.ok())
-    derr << __func__ << "Unable to connect to kinetic store " << options.host
-         << ":" << options.port << " : " << status.ToString() << dendl;
-  return status.ok() ? 0 : -EIO;
-}
-
-int KineticStore::do_open(ostream &out, bool create_if_missing)
-{
-  kinetic::KineticConnectionFactory conn_factory =
-    kinetic::NewKineticConnectionFactory();
-  kinetic::ConnectionOptions options;
-  options.host = host;
-  options.port = port;
-  options.user_id = user_id;
-  options.hmac_key = hmac_key;
-  options.use_ssl = use_ssl;
-  kinetic::Status status = conn_factory.NewThreadsafeBlockingConnection(options, kinetic_conn, 10);
-  if (!status.ok()) {
-    derr << "Unable to connect to kinetic store " << host << ":" << port
-	 << " : " << status.ToString() << dendl;
-    return -EINVAL;
-  }
-
-  PerfCountersBuilder plb(g_ceph_context, "kinetic", l_kinetic_first, l_kinetic_last);
-  plb.add_u64_counter(l_kinetic_gets, "kinetic_get", "Gets");
-  plb.add_u64_counter(l_kinetic_txns, "kinetic_transaction", "Transactions");
-  logger = plb.create_perf_counters();
-  cct->get_perfcounters_collection()->add(logger);
-  return 0;
-}
-
-KineticStore::KineticStore(CephContext *c) :
-  cct(c),
-  logger(NULL)
-{
-  host = c->_conf->kinetic_host;
-  port = c->_conf->kinetic_port;
-  user_id = c->_conf->kinetic_user_id;
-  hmac_key = c->_conf->kinetic_hmac_key;
-  use_ssl = c->_conf->kinetic_use_ssl;
-}
-
-KineticStore::~KineticStore()
-{
-  close();
-  delete logger;
-}
-
-void KineticStore::close()
-{
-  kinetic_conn.reset();
-  if (logger)
-    cct->get_perfcounters_collection()->remove(logger);
-}
-
-int KineticStore::submit_transaction(KeyValueDB::Transaction t)
-{
-  KineticTransactionImpl * _t =
-    static_cast<KineticTransactionImpl *>(t.get());
-
-  dout(20) << "kinetic submit_transaction" << dendl;
-
-  for (vector<KineticOp>::iterator it = _t->ops.begin();
-       it != _t->ops.end(); ++it) {
-    kinetic::KineticStatus status(kinetic::StatusCode::OK, "");
-    if (it->type == KINETIC_OP_WRITE) {
-      string data(it->data.c_str(), it->data.length());
-      kinetic::KineticRecord record(data, "", "",
-				    com::seagate::kinetic::client::proto::Message_Algorithm_SHA1);
-      dout(30) << "kinetic before put of " << it->key << " (" << data.length() << " bytes)" << dendl;
-      status = kinetic_conn->Put(it->key, "", kinetic::WriteMode::IGNORE_VERSION,
-				 record);
-      dout(30) << "kinetic after put of " << it->key << dendl;
-    } else {
-      assert(it->type == KINETIC_OP_DELETE);
-      dout(30) << "kinetic before delete" << dendl;
-      status = kinetic_conn->Delete(it->key, "",
-				    kinetic::WriteMode::IGNORE_VERSION);
-      dout(30) << "kinetic after delete" << dendl;
-    }
-    if (!status.ok()) {
-      derr << "kinetic error submitting transaction: "
-	   << status.message() << dendl;
-      return -1;
-    }
-  }
-
-  logger->inc(l_kinetic_txns);
-  return 0;
-}
-
-int KineticStore::submit_transaction_sync(KeyValueDB::Transaction t)
-{
-  return submit_transaction(t);
-}
-
-void KineticStore::KineticTransactionImpl::set(
-  const string &prefix,
-  const string &k,
-  const bufferlist &to_set_bl)
-{
-  string key = combine_strings(prefix, k);
-  dout(30) << "kinetic set key " << key << dendl;
-  ops.push_back(KineticOp(KINETIC_OP_WRITE, key, to_set_bl));
-}
-
-void KineticStore::KineticTransactionImpl::rmkey(const string &prefix,
-					         const string &k)
-{
-  string key = combine_strings(prefix, k);
-  dout(30) << "kinetic rm key " << key << dendl;
-  ops.push_back(KineticOp(KINETIC_OP_DELETE, key));
-}
-
-void KineticStore::KineticTransactionImpl::rmkeys_by_prefix(const string &prefix)
-{
-  dout(20) << "kinetic rmkeys_by_prefix " << prefix << dendl;
-  KeyValueDB::Iterator it = db->get_iterator(prefix);
-  for (it->seek_to_first();
-       it->valid();
-       it->next()) {
-    string key = combine_strings(prefix, it->key());
-    ops.push_back(KineticOp(KINETIC_OP_DELETE, key));
-    dout(30) << "kinetic rm key by prefix: " << key << dendl;
-  }
-}
-
-int KineticStore::get(
-    const string &prefix,
-    const std::set<string> &keys,
-    std::map<string, bufferlist> *out)
-{
-  dout(30) << "kinetic get prefix: " << prefix << " keys: " << keys << dendl;
-  for (std::set<string>::const_iterator i = keys.begin();
-       i != keys.end();
-       ++i) {
-    unique_ptr<kinetic::KineticRecord> record;
-    string key = combine_strings(prefix, *i);
-    dout(30) << "before get key " << key << dendl;
-    kinetic::KineticStatus status = kinetic_conn->Get(key, record);
-    if (!status.ok())
-      break;
-    dout(30) << "kinetic get got key: " << key << dendl;
-    out->insert(make_pair(key, to_bufferlist(*record.get())));
-  }
-  logger->inc(l_kinetic_gets);
-  return 0;
-}
-
-string KineticStore::combine_strings(const string &prefix, const string &value)
-{
-  string out = prefix;
-  out.push_back(1);
-  out.append(value);
-  return out;
-}
-
-bufferlist KineticStore::to_bufferlist(const kinetic::KineticRecord &record)
-{
-  bufferlist bl;
-  bl.append(*(record.value()));
-  return bl;
-}
-
-int KineticStore::split_key(string in_prefix, string *prefix, string *key)
-{
-  size_t prefix_len = in_prefix.find('\1');
-  if (prefix_len >= in_prefix.size())
-    return -EINVAL;
-
-  if (prefix)
-    *prefix = string(in_prefix, 0, prefix_len);
-  if (key)
-    *key= string(in_prefix, prefix_len + 1);
-  return 0;
-}
-
-KineticStore::KineticWholeSpaceIteratorImpl::KineticWholeSpaceIteratorImpl(kinetic::BlockingKineticConnection *conn) : kinetic_conn(conn),
-   kinetic_status(kinetic::StatusCode::OK, "")
-{
-  dout(30) << "kinetic iterator constructor()" << dendl;
-  const static string last_key = "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF";
-  kinetic::KeyRangeIterator it =
-    kinetic_conn->IterateKeyRange("", true, last_key, true, 1024);
-  while (it != kinetic::KeyRangeEnd()) {
-    try {
-      keys.insert(*it);
-      dout(30) << "kinetic iterator added " << *it << dendl;
-    } catch (std::runtime_error &e) {
-      kinetic_status = kinetic::KineticStatus(kinetic::StatusCode::CLIENT_INTERNAL_ERROR, e.what());
-      return;
-    }
-    ++it;
-  }
-  keys_iter = keys.begin();
-}
-
-int KineticStore::KineticWholeSpaceIteratorImpl::seek_to_first(const string &prefix)
-{
-  dout(30) << "kinetic iterator seek_to_first(prefix): " << prefix << dendl;
-  keys_iter = keys.lower_bound(prefix);
-  return 0;
-}
-
-int KineticStore::KineticWholeSpaceIteratorImpl::seek_to_last()
-{
-  dout(30) << "kinetic iterator seek_to_last()" << dendl;
-  keys_iter = keys.end();
-  if (keys.begin() != keys_iter)
-    --keys_iter;
-  return 0;
-}
-
-int KineticStore::KineticWholeSpaceIteratorImpl::seek_to_last(const string &prefix)
-{
-  dout(30) << "kinetic iterator seek_to_last(prefix): " << prefix << dendl;
-  keys_iter = keys.upper_bound(prefix + "\2");
-  if (keys.begin() == keys_iter) {
-    keys_iter = keys.end();
-  } else {
-    --keys_iter;
-  }
-  return 0;
-}
-
-int KineticStore::KineticWholeSpaceIteratorImpl::upper_bound(const string &prefix, const string &after) {
-  dout(30) << "kinetic iterator upper_bound()" << dendl;
-  string bound = combine_strings(prefix, after);
-  keys_iter = keys.upper_bound(bound);
-  return 0;
-}
-
-int KineticStore::KineticWholeSpaceIteratorImpl::lower_bound(const string &prefix, const string &to) {
-  dout(30) << "kinetic iterator lower_bound()" << dendl;
-  string bound = combine_strings(prefix, to);
-  keys_iter = keys.lower_bound(bound);
-  return 0;
-}
-
-bool KineticStore::KineticWholeSpaceIteratorImpl::valid() {
-  dout(30) << "kinetic iterator valid()" << dendl;
-  return keys_iter != keys.end();
-}
-
-int KineticStore::KineticWholeSpaceIteratorImpl::next() {
-  dout(30) << "kinetic iterator next()" << dendl;
-  if (keys_iter != keys.end()) {
-      ++keys_iter;
-      return 0;
-  }
-  return -1;
-}
-
-int KineticStore::KineticWholeSpaceIteratorImpl::prev() {
-  dout(30) << "kinetic iterator prev()" << dendl;
-  if (keys_iter != keys.begin()) {
-      --keys_iter;
-      return 0;
-  }
-  keys_iter = keys.end();
-  return -1;
-}
-
-string KineticStore::KineticWholeSpaceIteratorImpl::key() {
-  dout(30) << "kinetic iterator key()" << dendl;
-  string out_key;
-  split_key(*keys_iter, NULL, &out_key);
-  return out_key;
-}
-
-pair<string,string> KineticStore::KineticWholeSpaceIteratorImpl::raw_key() {
-  dout(30) << "kinetic iterator raw_key()" << dendl;
-  string prefix, key;
-  split_key(*keys_iter, &prefix, &key);
-  return make_pair(prefix, key);
-}
-
-bufferlist KineticStore::KineticWholeSpaceIteratorImpl::value() {
-  dout(30) << "kinetic iterator value()" << dendl;
-  unique_ptr<kinetic::KineticRecord> record;
-  kinetic_status = kinetic_conn->Get(*keys_iter, record);
-  return to_bufferlist(*record.get());
-}
-
-int KineticStore::KineticWholeSpaceIteratorImpl::status() {
-  dout(30) << "kinetic iterator status()" << dendl;
-  return kinetic_status.ok() ? 0 : -1;
-}
diff --git a/src/os/KineticStore.h b/src/os/KineticStore.h
deleted file mode 100644
index cbb7633..0000000
--- a/src/os/KineticStore.h
+++ /dev/null
@@ -1,160 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-#ifndef KINETIC_STORE_H
-#define KINETIC_STORE_H
-
-#include "include/types.h"
-#include "include/buffer.h"
-#include "KeyValueDB.h"
-#include <set>
-#include <map>
-#include <string>
-#include "include/memory.h"
-#include <kinetic/kinetic.h>
-
-#include <errno.h>
-#include "common/errno.h"
-#include "common/dout.h"
-#include "include/assert.h"
-#include "common/Formatter.h"
-
-#include "common/ceph_context.h"
-
-class PerfCounters;
-
-enum {
-  l_kinetic_first = 34400,
-  l_kinetic_gets,
-  l_kinetic_txns,
-  l_kinetic_last,
-};
-
-/**
- * Uses Kinetic to implement the KeyValueDB interface
- */
-class KineticStore : public KeyValueDB {
-  CephContext *cct;
-  PerfCounters *logger;
-  string host;
-  int port;
-  int user_id;
-  string hmac_key;
-  bool use_ssl;
-  std::unique_ptr<kinetic::BlockingKineticConnection> kinetic_conn;
-
-  int do_open(ostream &out, bool create_if_missing);
-
-public:
-  KineticStore(CephContext *c);
-  ~KineticStore();
-
-  static int _test_init(CephContext *c);
-  int init();
-
-  /// Opens underlying db
-  int open(ostream &out) {
-    return do_open(out, false);
-  }
-  /// Creates underlying db if missing and opens it
-  int create_and_open(ostream &out) {
-    return do_open(out, true);
-  }
-
-  void close();
-
-  enum KineticOpType {
-    KINETIC_OP_WRITE,
-    KINETIC_OP_DELETE,
-  };
-
-  struct KineticOp {
-    KineticOpType type;
-    std::string key;
-    bufferlist data;
-    KineticOp(KineticOpType type, const string &key) : type(type), key(key) {}
-    KineticOp(KineticOpType type, const string &key, const bufferlist &data)
-      : type(type), key(key), data(data) {}
-  };
-
-  class KineticTransactionImpl : public KeyValueDB::TransactionImpl {
-  public:
-    vector<KineticOp> ops;
-    KineticStore *db;
-
-    KineticTransactionImpl(KineticStore *db) : db(db) {}
-    void set(
-      const string &prefix,
-      const string &k,
-      const bufferlist &bl);
-    void rmkey(
-      const string &prefix,
-      const string &k);
-    void rmkeys_by_prefix(
-      const string &prefix
-      );
-  };
-
-  KeyValueDB::Transaction get_transaction() {
-    return ceph::shared_ptr< KineticTransactionImpl >(
-      new KineticTransactionImpl(this));
-  }
-
-  int submit_transaction(KeyValueDB::Transaction t);
-  int submit_transaction_sync(KeyValueDB::Transaction t);
-  int get(
-    const string &prefix,
-    const std::set<string> &key,
-    std::map<string, bufferlist> *out
-    );
-
-  class KineticWholeSpaceIteratorImpl :
-    public KeyValueDB::WholeSpaceIteratorImpl {
-    std::set<std::string> keys;
-    std::set<std::string>::iterator keys_iter;
-    kinetic::BlockingKineticConnection *kinetic_conn;
-    kinetic::KineticStatus kinetic_status;
-  public:
-    KineticWholeSpaceIteratorImpl(kinetic::BlockingKineticConnection *conn);
-    virtual ~KineticWholeSpaceIteratorImpl() { }
-
-    int seek_to_first() {
-      return seek_to_first("");
-    }
-    int seek_to_first(const string &prefix);
-    int seek_to_last();
-    int seek_to_last(const string &prefix);
-    int upper_bound(const string &prefix, const string &after);
-    int lower_bound(const string &prefix, const string &to);
-    bool valid();
-    int next();
-    int prev();
-    string key();
-    pair<string,string> raw_key();
-    bufferlist value();
-    int status();
-  };
-
-  /// Utility
-  static string combine_strings(const string &prefix, const string &value);
-  static int split_key(string in_prefix, string *prefix, string *key);
-  static bufferlist to_bufferlist(const kinetic::KineticRecord &record);
-  virtual uint64_t get_estimated_size(map<string,uint64_t> &extra) {
-    // not used by the osd
-    return 0;
-  }
-
-
-protected:
-  WholeSpaceIterator _get_iterator() {
-    return ceph::shared_ptr<KeyValueDB::WholeSpaceIteratorImpl>(
-								new KineticWholeSpaceIteratorImpl(kinetic_conn.get()));
-  }
-
-  // TODO: remove snapshots from interface
-  WholeSpaceIterator _get_snapshot_iterator() {
-    return _get_iterator();
-  }
-
-};
-
-#endif
diff --git a/src/os/LFNIndex.cc b/src/os/LFNIndex.cc
index 48d8db3..90837b6 100644
--- a/src/os/LFNIndex.cc
+++ b/src/os/LFNIndex.cc
@@ -467,7 +467,6 @@ int LFNIndex::list_subdirs(const vector<string> &to_list,
     }
     string short_name(de->d_name);
     string demangled_name;
-    ghobject_t obj;
     if (lfn_is_subdir(short_name, &demangled_name)) {
       out->push_back(demangled_name);
     }
diff --git a/src/os/LevelDBStore.cc b/src/os/LevelDBStore.cc
deleted file mode 100644
index 1aaa168..0000000
--- a/src/os/LevelDBStore.cc
+++ /dev/null
@@ -1,306 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-#include "LevelDBStore.h"
-
-#include <set>
-#include <map>
-#include <string>
-#include "include/memory.h"
-#include <errno.h>
-using std::string;
-#include "common/perf_counters.h"
-
-int LevelDBStore::init(string option_str)
-{
-  // init defaults.  caller can override these if they want
-  // prior to calling open.
-  options.write_buffer_size = g_conf->leveldb_write_buffer_size;
-  options.cache_size = g_conf->leveldb_cache_size;
-  options.block_size = g_conf->leveldb_block_size;
-  options.bloom_size = g_conf->leveldb_bloom_size;
-  options.compression_enabled = g_conf->leveldb_compression;
-  options.paranoid_checks = g_conf->leveldb_paranoid;
-  options.max_open_files = g_conf->leveldb_max_open_files;
-  options.log_file = g_conf->leveldb_log;
-  return 0;
-}
-
-int LevelDBStore::do_open(ostream &out, bool create_if_missing)
-{
-  leveldb::Options ldoptions;
-
-  if (options.write_buffer_size)
-    ldoptions.write_buffer_size = options.write_buffer_size;
-  if (options.max_open_files)
-    ldoptions.max_open_files = options.max_open_files;
-  if (options.cache_size) {
-    leveldb::Cache *_db_cache = leveldb::NewLRUCache(options.cache_size);
-    db_cache.reset(_db_cache);
-    ldoptions.block_cache = db_cache.get();
-  }
-  if (options.block_size)
-    ldoptions.block_size = options.block_size;
-  if (options.bloom_size) {
-#ifdef HAVE_LEVELDB_FILTER_POLICY
-    const leveldb::FilterPolicy *_filterpolicy =
-	leveldb::NewBloomFilterPolicy(options.bloom_size);
-    filterpolicy.reset(_filterpolicy);
-    ldoptions.filter_policy = filterpolicy.get();
-#else
-    assert(0 == "bloom size set but installed leveldb doesn't support bloom filters");
-#endif
-  }
-  if (options.compression_enabled)
-    ldoptions.compression = leveldb::kSnappyCompression;
-  else
-    ldoptions.compression = leveldb::kNoCompression;
-  if (options.block_restart_interval)
-    ldoptions.block_restart_interval = options.block_restart_interval;
-
-  ldoptions.error_if_exists = options.error_if_exists;
-  ldoptions.paranoid_checks = options.paranoid_checks;
-  ldoptions.create_if_missing = create_if_missing;
-
-  if (options.log_file.length()) {
-    leveldb::Env *env = leveldb::Env::Default();
-    env->NewLogger(options.log_file, &ldoptions.info_log);
-  }
-
-  leveldb::DB *_db;
-  leveldb::Status status = leveldb::DB::Open(ldoptions, path, &_db);
-  db.reset(_db);
-  if (!status.ok()) {
-    out << status.ToString() << std::endl;
-    return -EINVAL;
-  }
-
-  PerfCountersBuilder plb(g_ceph_context, "leveldb", l_leveldb_first, l_leveldb_last);
-  plb.add_u64_counter(l_leveldb_gets, "leveldb_get", "Gets");
-  plb.add_u64_counter(l_leveldb_txns, "leveldb_transaction", "Transactions");
-  plb.add_time_avg(l_leveldb_get_latency, "leveldb_get_latency", "Get Latency");
-  plb.add_time_avg(l_leveldb_submit_latency, "leveldb_submit_latency", "Submit Latency");
-  plb.add_time_avg(l_leveldb_submit_sync_latency, "leveldb_submit_sync_latency", "Submit Sync Latency");
-  plb.add_u64_counter(l_leveldb_compact, "leveldb_compact", "Compactions");
-  plb.add_u64_counter(l_leveldb_compact_range, "leveldb_compact_range", "Compactions by range");
-  plb.add_u64_counter(l_leveldb_compact_queue_merge, "leveldb_compact_queue_merge", "Merges of ranges in compaction queue");
-  plb.add_u64(l_leveldb_compact_queue_len, "leveldb_compact_queue_len", "Length of compaction queue");
-  logger = plb.create_perf_counters();
-  cct->get_perfcounters_collection()->add(logger);
-
-  if (g_conf->leveldb_compact_on_mount) {
-    derr << "Compacting leveldb store..." << dendl;
-    compact();
-    derr << "Finished compacting leveldb store" << dendl;
-  }
-  return 0;
-}
-
-int LevelDBStore::_test_init(const string& dir)
-{
-  leveldb::Options options;
-  options.create_if_missing = true;
-  leveldb::DB *db;
-  leveldb::Status status = leveldb::DB::Open(options, dir, &db);
-  delete db;
-  return status.ok() ? 0 : -EIO;
-}
-
-LevelDBStore::~LevelDBStore()
-{
-  close();
-  delete logger;
-
-  // Ensure db is destroyed before dependent db_cache and filterpolicy
-  db.reset();
-}
-
-void LevelDBStore::close()
-{
-  // stop compaction thread
-  compact_queue_lock.Lock();
-  if (compact_thread.is_started()) {
-    compact_queue_stop = true;
-    compact_queue_cond.Signal();
-    compact_queue_lock.Unlock();
-    compact_thread.join();
-  } else {
-    compact_queue_lock.Unlock();
-  }
-
-  if (logger)
-    cct->get_perfcounters_collection()->remove(logger);
-}
-
-int LevelDBStore::submit_transaction(KeyValueDB::Transaction t)
-{
-  utime_t start = ceph_clock_now(g_ceph_context);
-  LevelDBTransactionImpl * _t =
-    static_cast<LevelDBTransactionImpl *>(t.get());
-  leveldb::Status s = db->Write(leveldb::WriteOptions(), &(_t->bat));
-  utime_t lat = ceph_clock_now(g_ceph_context) - start;
-  logger->inc(l_leveldb_txns);
-  logger->tinc(l_leveldb_submit_latency, lat);
-  return s.ok() ? 0 : -1;
-}
-
-int LevelDBStore::submit_transaction_sync(KeyValueDB::Transaction t)
-{
-  utime_t start = ceph_clock_now(g_ceph_context);
-  LevelDBTransactionImpl * _t =
-    static_cast<LevelDBTransactionImpl *>(t.get());
-  leveldb::WriteOptions options;
-  options.sync = true;
-  leveldb::Status s = db->Write(options, &(_t->bat));
-  utime_t lat = ceph_clock_now(g_ceph_context) - start;
-  logger->inc(l_leveldb_txns);
-  logger->tinc(l_leveldb_submit_sync_latency, lat);
-  return s.ok() ? 0 : -1;
-}
-
-void LevelDBStore::LevelDBTransactionImpl::set(
-  const string &prefix,
-  const string &k,
-  const bufferlist &to_set_bl)
-{
-  string key = combine_strings(prefix, k);
-  //bufferlist::c_str() is non-constant, so we need to make a copy
-  bufferlist val = to_set_bl;
-  bat.Delete(leveldb::Slice(key));
-  bat.Put(leveldb::Slice(key),
-	  leveldb::Slice(val.c_str(), val.length()));
-}
-
-void LevelDBStore::LevelDBTransactionImpl::rmkey(const string &prefix,
-					         const string &k)
-{
-  string key = combine_strings(prefix, k);
-  bat.Delete(leveldb::Slice(key));
-}
-
-void LevelDBStore::LevelDBTransactionImpl::rmkeys_by_prefix(const string &prefix)
-{
-  KeyValueDB::Iterator it = db->get_iterator(prefix);
-  for (it->seek_to_first();
-       it->valid();
-       it->next()) {
-    string key = combine_strings(prefix, it->key());
-    bat.Delete(key);
-  }
-}
-
-int LevelDBStore::get(
-    const string &prefix,
-    const std::set<string> &keys,
-    std::map<string, bufferlist> *out)
-{
-  utime_t start = ceph_clock_now(g_ceph_context);
-  KeyValueDB::Iterator it = get_iterator(prefix);
-  for (std::set<string>::const_iterator i = keys.begin();
-       i != keys.end();
-       ++i) {
-    it->lower_bound(*i);
-    if (it->valid() && it->key() == *i) {
-      out->insert(make_pair(*i, it->value()));
-    } else if (!it->valid())
-      break;
-  }
-  utime_t lat = ceph_clock_now(g_ceph_context) - start;
-  logger->inc(l_leveldb_gets);
-  logger->tinc(l_leveldb_get_latency, lat);
-  return 0;
-}
-
-string LevelDBStore::combine_strings(const string &prefix, const string &value)
-{
-  string out = prefix;
-  out.push_back(0);
-  out.append(value);
-  return out;
-}
-
-bufferlist LevelDBStore::to_bufferlist(leveldb::Slice in)
-{
-  bufferlist bl;
-  bl.append(bufferptr(in.data(), in.size()));
-  return bl;
-}
-
-int LevelDBStore::split_key(leveldb::Slice in, string *prefix, string *key)
-{
-  string in_prefix = in.ToString();
-  size_t prefix_len = in_prefix.find('\0');
-  if (prefix_len >= in_prefix.size())
-    return -EINVAL;
-
-  if (prefix)
-    *prefix = string(in_prefix, 0, prefix_len);
-  if (key)
-    *key= string(in_prefix, prefix_len + 1);
-  return 0;
-}
-
-void LevelDBStore::compact()
-{
-  logger->inc(l_leveldb_compact);
-  db->CompactRange(NULL, NULL);
-}
-
-
-void LevelDBStore::compact_thread_entry()
-{
-  compact_queue_lock.Lock();
-  while (!compact_queue_stop) {
-    while (!compact_queue.empty()) {
-      pair<string,string> range = compact_queue.front();
-      compact_queue.pop_front();
-      logger->set(l_leveldb_compact_queue_len, compact_queue.size());
-      compact_queue_lock.Unlock();
-      logger->inc(l_leveldb_compact_range);
-      compact_range(range.first, range.second);
-      compact_queue_lock.Lock();
-      continue;
-    }
-    compact_queue_cond.Wait(compact_queue_lock);
-  }
-  compact_queue_lock.Unlock();
-}
-
-void LevelDBStore::compact_range_async(const string& start, const string& end)
-{
-  Mutex::Locker l(compact_queue_lock);
-
-  // try to merge adjacent ranges.  this is O(n), but the queue should
-  // be short.  note that we do not cover all overlap cases and merge
-  // opportunities here, but we capture the ones we currently need.
-  list< pair<string,string> >::iterator p = compact_queue.begin();
-  while (p != compact_queue.end()) {
-    if (p->first == start && p->second == end) {
-      // dup; no-op
-      return;
-    }
-    if (p->first <= end && p->first > start) {
-      // merge with existing range to the right
-      compact_queue.push_back(make_pair(start, p->second));
-      compact_queue.erase(p);
-      logger->inc(l_leveldb_compact_queue_merge);
-      break;
-    }
-    if (p->second >= start && p->second < end) {
-      // merge with existing range to the left
-      compact_queue.push_back(make_pair(p->first, end));
-      compact_queue.erase(p);
-      logger->inc(l_leveldb_compact_queue_merge);
-      break;
-    }
-    ++p;
-  }
-  if (p == compact_queue.end()) {
-    // no merge, new entry.
-    compact_queue.push_back(make_pair(start, end));
-    logger->set(l_leveldb_compact_queue_len, compact_queue.size());
-  }
-  compact_queue_cond.Signal();
-  if (!compact_thread.is_started()) {
-    compact_thread.create();
-  }
-}
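
The store above flattens every (prefix, key) pair into one LevelDB key joined by a NUL byte (combine_strings/split_key), and past_prefix() in the header deleted next derives the exclusive upper bound of a prefix's range by bumping the separator from 0x00 to 0x01. A self-contained sketch of the scheme, with std::string standing in for leveldb::Slice and simplified error codes:

    #include <cassert>
    #include <string>

    // NUL sorts before every other byte, so all keys under one prefix
    // form a contiguous range in the flat keyspace.
    std::string combine(const std::string &prefix, const std::string &key) {
      std::string out = prefix;
      out.push_back('\0');
      out.append(key);
      return out;
    }

    int split(const std::string &in, std::string *prefix, std::string *key) {
      size_t sep = in.find('\0');
      if (sep == std::string::npos)
        return -1;                          // no separator: malformed key
      if (prefix) *prefix = in.substr(0, sep);
      if (key)    *key = in.substr(sep + 1);
      return 0;
    }

    // Exclusive upper bound of a prefix's range: separator bumped to 0x01.
    std::string past_prefix(const std::string &prefix) {
      std::string limit = prefix;
      limit.push_back('\1');
      return limit;
    }

    int main() {
      std::string k = combine("omap", "foo");
      std::string p, rest;
      assert(split(k, &p, &rest) == 0 && p == "omap" && rest == "foo");
      assert(k < past_prefix("omap"));                    // inside the range
      assert(past_prefix("omap") < combine("omap2", "")); // next prefix out
    }
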
diff --git a/src/os/LevelDBStore.h b/src/os/LevelDBStore.h
deleted file mode 100644
index 06ea071..0000000
--- a/src/os/LevelDBStore.h
+++ /dev/null
@@ -1,402 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-#ifndef LEVEL_DB_STORE_H
-#define LEVEL_DB_STORE_H
-
-#include "include/types.h"
-#include "include/buffer.h"
-#include "KeyValueDB.h"
-#include <set>
-#include <map>
-#include <string>
-#include "include/memory.h"
-#include <boost/scoped_ptr.hpp>
-#include "leveldb/db.h"
-#include "leveldb/env.h"
-#include "leveldb/write_batch.h"
-#include "leveldb/slice.h"
-#include "leveldb/cache.h"
-#ifdef HAVE_LEVELDB_FILTER_POLICY
-#include "leveldb/filter_policy.h"
-#endif
-
-#include <errno.h>
-#include "common/errno.h"
-#include "common/dout.h"
-#include "include/assert.h"
-#include "common/Formatter.h"
-
-#include "common/ceph_context.h"
-
-class PerfCounters;
-
-enum {
-  l_leveldb_first = 34300,
-  l_leveldb_gets,
-  l_leveldb_txns,
-  l_leveldb_get_latency,
-  l_leveldb_submit_latency,
-  l_leveldb_submit_sync_latency,
-  l_leveldb_compact,
-  l_leveldb_compact_range,
-  l_leveldb_compact_queue_merge,
-  l_leveldb_compact_queue_len,
-  l_leveldb_last,
-};
-
-/**
- * Uses LevelDB to implement the KeyValueDB interface
- */
-class LevelDBStore : public KeyValueDB {
-  CephContext *cct;
-  PerfCounters *logger;
-  string path;
-  boost::scoped_ptr<leveldb::Cache> db_cache;
-#ifdef HAVE_LEVELDB_FILTER_POLICY
-  boost::scoped_ptr<const leveldb::FilterPolicy> filterpolicy;
-#endif
-  boost::scoped_ptr<leveldb::DB> db;
-
-  int do_open(ostream &out, bool create_if_missing);
-
-  // manage async compactions
-  Mutex compact_queue_lock;
-  Cond compact_queue_cond;
-  list< pair<string,string> > compact_queue;
-  bool compact_queue_stop;
-  class CompactThread : public Thread {
-    LevelDBStore *db;
-  public:
-    CompactThread(LevelDBStore *d) : db(d) {}
-    void *entry() {
-      db->compact_thread_entry();
-      return NULL;
-    }
-    friend class LevelDBStore;
-  } compact_thread;
-
-  void compact_thread_entry();
-
-  void compact_range(const string& start, const string& end) {
-    leveldb::Slice cstart(start);
-    leveldb::Slice cend(end);
-    db->CompactRange(&cstart, &cend);
-  }
-  void compact_range_async(const string& start, const string& end);
-
-public:
-  /// compact the underlying leveldb store
-  void compact();
-
-  /// compact db for all keys with a given prefix
-  void compact_prefix(const string& prefix) {
-    compact_range(prefix, past_prefix(prefix));
-  }
-  void compact_prefix_async(const string& prefix) {
-    compact_range_async(prefix, past_prefix(prefix));
-  }
-  void compact_range(const string& prefix,
-		     const string& start, const string& end) {
-    compact_range(combine_strings(prefix, start), combine_strings(prefix, end));
-  }
-  void compact_range_async(const string& prefix,
-			   const string& start, const string& end) {
-    compact_range_async(combine_strings(prefix, start),
-			combine_strings(prefix, end));
-  }
-
-
-  /**
-   * options_t: Holds options which are minimally interpreted
-   * on initialization and then passed through to LevelDB.
-   * We transform a couple of these into actual LevelDB
-   * structures, but the rest are simply passed through unchanged. See
-   * leveldb/options.h for more precise details on each.
-   *
-   * Set them after constructing the LevelDBStore, but before calling
-   * open() or create_and_open().
-   */
-  struct options_t {
-    uint64_t write_buffer_size; /// in-memory write buffer size
-    int max_open_files; /// maximum number of files LevelDB can open at once
-    uint64_t cache_size; /// size of extra decompressed cache to use
-    uint64_t block_size; /// user data per block
-    int bloom_size; /// number of bits per entry to put in a bloom filter
-    bool compression_enabled; /// whether to use libsnappy compression or not
-
-    // don't change these ones. No, seriously
-    int block_restart_interval;
-    bool error_if_exists;
-    bool paranoid_checks;
-
-    string log_file;
-
-    options_t() :
-      write_buffer_size(0), //< 0 means default
-      max_open_files(0), //< 0 means default
-      cache_size(0), //< 0 means no cache (default)
-      block_size(0), //< 0 means default
-      bloom_size(0), //< 0 means no bloom filter (default)
-      compression_enabled(true), //< set to false for no compression
-      block_restart_interval(0), //< 0 means default
-      error_if_exists(false), //< set to true if you want to check nonexistence
-      paranoid_checks(false) //< set to true if you want paranoid checks
-    {}
-  } options;
-
-  LevelDBStore(CephContext *c, const string &path) :
-    cct(c),
-    logger(NULL),
-    path(path),
-    db_cache(NULL),
-#ifdef HAVE_LEVELDB_FILTER_POLICY
-    filterpolicy(NULL),
-#endif
-    compact_queue_lock("LevelDBStore::compact_thread_lock"),
-    compact_queue_stop(false),
-    compact_thread(this),
-    options()
-  {}
-
-  ~LevelDBStore();
-
-  static int _test_init(const string& dir);
-  int init(string option_str="");
-
-  /// Opens underlying db
-  int open(ostream &out) {
-    return do_open(out, false);
-  }
-  /// Creates underlying db if missing and opens it
-  int create_and_open(ostream &out) {
-    return do_open(out, true);
-  }
-
-  void close();
-
-  class LevelDBTransactionImpl : public KeyValueDB::TransactionImpl {
-  public:
-    leveldb::WriteBatch bat;
-    LevelDBStore *db;
-    LevelDBTransactionImpl(LevelDBStore *db) : db(db) {}
-    void set(
-      const string &prefix,
-      const string &k,
-      const bufferlist &bl);
-    void rmkey(
-      const string &prefix,
-      const string &k);
-    void rmkeys_by_prefix(
-      const string &prefix
-      );
-  };
-
-  KeyValueDB::Transaction get_transaction() {
-    return ceph::shared_ptr< LevelDBTransactionImpl >(
-      new LevelDBTransactionImpl(this));
-  }
-
-  int submit_transaction(KeyValueDB::Transaction t);
-  int submit_transaction_sync(KeyValueDB::Transaction t);
-  int get(
-    const string &prefix,
-    const std::set<string> &key,
-    std::map<string, bufferlist> *out
-    );
-
-  class LevelDBWholeSpaceIteratorImpl :
-    public KeyValueDB::WholeSpaceIteratorImpl {
-  protected:
-    boost::scoped_ptr<leveldb::Iterator> dbiter;
-  public:
-    LevelDBWholeSpaceIteratorImpl(leveldb::Iterator *iter) :
-      dbiter(iter) { }
-    virtual ~LevelDBWholeSpaceIteratorImpl() { }
-
-    int seek_to_first() {
-      dbiter->SeekToFirst();
-      return dbiter->status().ok() ? 0 : -1;
-    }
-    int seek_to_first(const string &prefix) {
-      leveldb::Slice slice_prefix(prefix);
-      dbiter->Seek(slice_prefix);
-      return dbiter->status().ok() ? 0 : -1;
-    }
-    int seek_to_last() {
-      dbiter->SeekToLast();
-      return dbiter->status().ok() ? 0 : -1;
-    }
-    int seek_to_last(const string &prefix) {
-      string limit = past_prefix(prefix);
-      leveldb::Slice slice_limit(limit);
-      dbiter->Seek(slice_limit);
-
-      if (!dbiter->Valid()) {
-        dbiter->SeekToLast();
-      } else {
-        dbiter->Prev();
-      }
-      return dbiter->status().ok() ? 0 : -1;
-    }
-    int upper_bound(const string &prefix, const string &after) {
-      lower_bound(prefix, after);
-      if (valid()) {
-	pair<string,string> key = raw_key();
-	if (key.first == prefix && key.second == after)
-	  next();
-      }
-      return dbiter->status().ok() ? 0 : -1;
-    }
-    int lower_bound(const string &prefix, const string &to) {
-      string bound = combine_strings(prefix, to);
-      leveldb::Slice slice_bound(bound);
-      dbiter->Seek(slice_bound);
-      return dbiter->status().ok() ? 0 : -1;
-    }
-    bool valid() {
-      return dbiter->Valid();
-    }
-    int next() {
-      if (valid())
-	dbiter->Next();
-      return dbiter->status().ok() ? 0 : -1;
-    }
-    int prev() {
-      if (valid())
-	dbiter->Prev();
-      return dbiter->status().ok() ? 0 : -1;
-    }
-    string key() {
-      string out_key;
-      split_key(dbiter->key(), 0, &out_key);
-      return out_key;
-    }
-    pair<string,string> raw_key() {
-      string prefix, key;
-      split_key(dbiter->key(), &prefix, &key);
-      return make_pair(prefix, key);
-    }
-    bufferlist value() {
-      return to_bufferlist(dbiter->value());
-    }
-    int status() {
-      return dbiter->status().ok() ? 0 : -1;
-    }
-  };
-
-  class LevelDBSnapshotIteratorImpl : public LevelDBWholeSpaceIteratorImpl {
-    leveldb::DB *db;
-    const leveldb::Snapshot *snapshot;
-  public:
-    LevelDBSnapshotIteratorImpl(leveldb::DB *db, const leveldb::Snapshot *s,
-				leveldb::Iterator *iter) :
-      LevelDBWholeSpaceIteratorImpl(iter), db(db), snapshot(s) { }
-
-    ~LevelDBSnapshotIteratorImpl() {
-      assert(snapshot != NULL);
-      db->ReleaseSnapshot(snapshot);
-    }
-  };
-
-  /// Utility
-  static string combine_strings(const string &prefix, const string &value);
-  static int split_key(leveldb::Slice in, string *prefix, string *key);
-  static bufferlist to_bufferlist(leveldb::Slice in);
-  static string past_prefix(const string &prefix) {
-    string limit = prefix;
-    limit.push_back(1);
-    return limit;
-  }
-
-  virtual uint64_t get_estimated_size(map<string,uint64_t> &extra) {
-    DIR *store_dir = opendir(path.c_str());
-    if (!store_dir) {
-      lderr(cct) << __func__ << " something happened opening the store: "
-                 << cpp_strerror(errno) << dendl;
-      return 0;
-    }
-
-    uint64_t total_size = 0;
-    uint64_t sst_size = 0;
-    uint64_t log_size = 0;
-    uint64_t misc_size = 0;
-
-    struct dirent *entry = NULL;
-    while ((entry = readdir(store_dir)) != NULL) {
-      string n(entry->d_name);
-
-      if (n == "." || n == "..")
-        continue;
-
-      string fpath = path + '/' + n;
-      struct stat s;
-      int err = stat(fpath.c_str(), &s);
-      if (err < 0)
-	err = -errno;
-      // we may race against leveldb while reading files; this should only
-      // happen when those files are being updated, data is being shuffled
-      // and files get removed, in which case there's not much of a problem
-      // as we'll get to them next time around.
-      if (err == -ENOENT) {
-	continue;
-      }
-      if (err < 0) {
-        lderr(cct) << __func__ << " error obtaining stats for " << fpath
-                   << ": " << cpp_strerror(err) << dendl;
-        goto err;
-      }
-
-      size_t pos = n.find_last_of('.');
-      if (pos == string::npos) {
-        misc_size += s.st_size;
-        continue;
-      }
-
-      string ext = n.substr(pos+1);
-      if (ext == "sst") {
-        sst_size += s.st_size;
-      } else if (ext == "log") {
-        log_size += s.st_size;
-      } else {
-        misc_size += s.st_size;
-      }
-    }
-
-    total_size = sst_size + log_size + misc_size;
-
-    extra["sst"] = sst_size;
-    extra["log"] = log_size;
-    extra["misc"] = misc_size;
-    extra["total"] = total_size;
-
-err:
-    closedir(store_dir);
-    return total_size;
-  }
-
-
-protected:
-  WholeSpaceIterator _get_iterator() {
-    return ceph::shared_ptr<KeyValueDB::WholeSpaceIteratorImpl>(
-      new LevelDBWholeSpaceIteratorImpl(
-	db->NewIterator(leveldb::ReadOptions())
-      )
-    );
-  }
-
-  WholeSpaceIterator _get_snapshot_iterator() {
-    const leveldb::Snapshot *snapshot;
-    leveldb::ReadOptions options;
-
-    snapshot = db->GetSnapshot();
-    options.snapshot = snapshot;
-
-    return ceph::shared_ptr<KeyValueDB::WholeSpaceIteratorImpl>(
-      new LevelDBSnapshotIteratorImpl(db.get(), snapshot,
-	db->NewIterator(options))
-    );
-  }
-
-};
-
-#endif
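
Per the options_t comment in the header removed above, options are set between construction and open()/create_and_open(). A hedged usage sketch of that interface (not a standalone program: it needs the Ceph tree, g_ceph_context and the path are illustrative, and the class appears to live on under src/kv after this reorganization, per the kv/KeyValueDB.h include and libkv.a references later in the patch):

    #include <iostream>
    LevelDBStore store(g_ceph_context, "/var/lib/ceph/mon/store.db");
    store.init();                          // defaults from g_conf->leveldb_*
    store.options.cache_size = 128 << 20;  // then override selectively
    store.options.bloom_size = 10;         // bits/entry; needs filter support
    if (store.create_and_open(std::cerr) < 0) {
      // do_open() already streamed the leveldb status text to std::cerr
    }
    KeyValueDB::Transaction t = store.get_transaction();
    bufferlist bl;
    bl.append("value");
    t->set("prefix", "key", bl);           // staged in the WriteBatch
    store.submit_transaction_sync(t);      // Write() with sync=true
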
diff --git a/src/os/Makefile.am b/src/os/Makefile.am
index fdb6c99..3ea9300 100644
--- a/src/os/Makefile.am
+++ b/src/os/Makefile.am
@@ -1,11 +1,11 @@
-libos_types_la_SOURCES = \
+libos_types_a_SOURCES = \
 	os/Transaction.cc
-libos_types_la_CXXFLAGS = ${AM_CXXFLAGS}
-noinst_LTLIBRARIES += libos_types.la
+libos_types_a_CXXFLAGS = ${AM_CXXFLAGS}
+noinst_LIBRARIES += libos_types.a
 
 if ENABLE_SERVER
 
-libos_la_SOURCES = \
+libos_a_SOURCES = \
 	os/chain_xattr.cc \
 	os/fs/FS.cc \
 	os/DBObjectMap.cc \
@@ -16,38 +16,38 @@ libos_la_SOURCES = \
 	os/HashIndex.cc \
 	os/IndexManager.cc \
 	os/JournalingObjectStore.cc \
-	os/LevelDBStore.cc \
 	os/LFNIndex.cc \
 	os/MemStore.cc \
-	os/KeyValueDB.cc \
 	os/KeyValueStore.cc \
 	os/ObjectStore.cc \
-	os/WBThrottle.cc \
-	common/TrackedOp.cc
+	os/WBThrottle.cc
 
 if LINUX
-libos_la_SOURCES += os/BtrfsFileStoreBackend.cc
+libos_a_SOURCES += os/BtrfsFileStoreBackend.cc
 endif
 
 if WITH_LIBAIO
-libos_types_la_SOURCES += os/newstore/newstore_types.cc
-libos_la_SOURCES += os/newstore/NewStore.cc
+libos_types_a_SOURCES += os/newstore/newstore_types.cc
+libos_a_SOURCES += os/newstore/NewStore.cc
 endif
 
 if WITH_LIBXFS
-libos_la_SOURCES += \
+libos_a_SOURCES += \
     os/fs/XFS.cc \
     os/XfsFileStoreBackend.cc
 endif
 
 if WITH_LIBZFS
-libos_la_SOURCES += os/ZFSFileStoreBackend.cc
+libos_a_SOURCES += os/ZFSFileStoreBackend.cc
 endif
 
-libos_la_CXXFLAGS = ${AM_CXXFLAGS}
-libos_la_LIBADD = $(LIBOS_TYPES)
+libos_a_LIBADD = libos_types.a libkv.a
 
-noinst_LTLIBRARIES += libos.la
+if WITH_LTTNG
+libos_a_LIBADD += $(LIBOS_TP)
+endif
+
+noinst_LIBRARIES += libos.a
 
 noinst_HEADERS += \
 	os/btrfs_ioctl.h \
@@ -68,8 +68,6 @@ noinst_HEADERS += \
 	os/IndexManager.h \
 	os/Journal.h \
 	os/JournalingObjectStore.h \
-	os/KeyValueDB.h \
-	os/LevelDBStore.h \
 	os/LFNIndex.h \
 	os/MemStore.h \
 	os/KeyValueStore.h \
@@ -81,20 +79,6 @@ noinst_HEADERS += \
 	os/XfsFileStoreBackend.h \
 	os/ZFSFileStoreBackend.h
 
-if WITH_SLIBROCKSDB
-libos_rocksdb_la_SOURCES = os/RocksDBStore.cc
-libos_rocksdb_la_CXXFLAGS = ${AM_CXXFLAGS} ${LIBROCKSDB_CFLAGS} -std=gnu++11 -I rocksdb/include
-libos_rocksdb_la_LIBADD = rocksdb/librocksdb.la
-noinst_LTLIBRARIES += libos_rocksdb.la
-noinst_HEADERS += os/RocksDBStore.h
-endif
-if WITH_DLIBROCKSDB
-libos_rocksdb_la_SOURCES = os/RocksDBStore.cc
-libos_rocksdb_la_CXXFLAGS = ${AM_CXXFLAGS} ${LIBROCKSDB_CFLAGS} -std=gnu++11
-libos_rocksdb_la_LIBADD = -lrocksdb
-noinst_LTLIBRARIES += libos_rocksdb.la
-noinst_HEADERS += os/RocksDBStore.h
-endif
 if WITH_LIBZFS
 libos_zfs_a_SOURCES = os/ZFS.cc
 libos_zfs_a_CXXFLAGS = ${AM_CXXFLAGS} ${LIBZFS_CFLAGS}
@@ -102,11 +86,4 @@ noinst_LIBRARIES += libos_zfs.a
 noinst_HEADERS += os/ZFS.h
 endif
 
-if WITH_KINETIC
-libos_la_SOURCES += os/KineticStore.cc
-libos_la_CXXFLAGS += -std=gnu++11
-libos_la_LIBADD += -lkinetic_client -lprotobuf -lglog -lgflags libcrypto.a
-noinst_HEADERS += os/KineticStore.h
-endif
-
 endif # ENABLE_SERVER
diff --git a/src/os/MemStore.cc b/src/os/MemStore.cc
index b0aa206..080f731 100644
--- a/src/os/MemStore.cc
+++ b/src/os/MemStore.cc
@@ -851,18 +851,18 @@ void MemStore::_do_transaction(Transaction& t)
       {
         coll_t cid = i.get_cid(op->cid);
         ghobject_t oid = i.get_oid(op->oid);
-        map<string, bufferlist> aset;
-        i.decode_attrset(aset);
-	r = _omap_setkeys(cid, oid, aset);
+        bufferlist aset_bl;
+        i.decode_attrset_bl(&aset_bl);
+	r = _omap_setkeys(cid, oid, aset_bl);
       }
       break;
     case Transaction::OP_OMAP_RMKEYS:
       {
         coll_t cid = i.get_cid(op->cid);
         ghobject_t oid = i.get_oid(op->oid);
-        set<string> keys;
-        i.decode_keyset(keys);
-	r = _omap_rmkeys(cid, oid, keys);
+        bufferlist keys_bl;
+        i.decode_keyset_bl(&keys_bl);
+	r = _omap_rmkeys(cid, oid, keys_bl);
       }
       break;
     case Transaction::OP_OMAP_RMKEYRANGE:
@@ -1158,7 +1158,7 @@ int MemStore::_omap_clear(coll_t cid, const ghobject_t &oid)
 }
 
 int MemStore::_omap_setkeys(coll_t cid, const ghobject_t &oid,
-			    const map<string, bufferlist> &aset)
+			    bufferlist& aset_bl)
 {
   dout(10) << __func__ << " " << cid << " " << oid << dendl;
   CollectionRef c = get_collection(cid);
@@ -1169,13 +1169,19 @@ int MemStore::_omap_setkeys(coll_t cid, const ghobject_t &oid,
   if (!o)
     return -ENOENT;
   std::lock_guard<std::mutex> lock(o->omap_mutex);
-  for (map<string,bufferlist>::const_iterator p = aset.begin(); p != aset.end(); ++p)
-    o->omap[p->first] = p->second;
+  bufferlist::iterator p = aset_bl.begin();
+  __u32 num;
+  ::decode(num, p);
+  while (num--) {
+    string key;
+    ::decode(key, p);
+    ::decode(o->omap[key], p);
+  }
   return 0;
 }
 
 int MemStore::_omap_rmkeys(coll_t cid, const ghobject_t &oid,
-			   const set<string> &keys)
+			   bufferlist& keys_bl)
 {
   dout(10) << __func__ << " " << cid << " " << oid << dendl;
   CollectionRef c = get_collection(cid);
@@ -1186,8 +1192,14 @@ int MemStore::_omap_rmkeys(coll_t cid, const ghobject_t &oid,
   if (!o)
     return -ENOENT;
   std::lock_guard<std::mutex> lock(o->omap_mutex);
-  for (set<string>::const_iterator p = keys.begin(); p != keys.end(); ++p)
-    o->omap.erase(*p);
+  bufferlist::iterator p = keys_bl.begin();
+  __u32 num;
+  ::decode(num, p);
+  while (num--) {
+    string key;
+    ::decode(key, p);
+    o->omap.erase(key);
+  }
   return 0;
 }
 
diff --git a/src/os/MemStore.h b/src/os/MemStore.h
index 734195f..efaa2cf 100644
--- a/src/os/MemStore.h
+++ b/src/os/MemStore.h
@@ -271,7 +271,7 @@ private:
       std::lock_guard<std::mutex> lock(o->omap_mutex);
       return it != o->omap.end();
     }
-    int next() {
+    int next(bool validate=true) {
       std::lock_guard<std::mutex> lock(o->omap_mutex);
       ++it;
       return 0;
@@ -316,9 +316,8 @@ private:
 		   const ghobject_t& newoid,
 		   uint64_t srcoff, uint64_t len, uint64_t dstoff);
   int _omap_clear(coll_t cid, const ghobject_t &oid);
-  int _omap_setkeys(coll_t cid, const ghobject_t &oid,
-		    const map<string, bufferlist> &aset);
-  int _omap_rmkeys(coll_t cid, const ghobject_t &oid, const set<string> &keys);
+  int _omap_setkeys(coll_t cid, const ghobject_t &oid, bufferlist& aset_bl);
+  int _omap_rmkeys(coll_t cid, const ghobject_t &oid, bufferlist& keys_bl);
   int _omap_rmkeyrange(coll_t cid, const ghobject_t &oid,
 		       const string& first, const string& last);
   int _omap_setheader(coll_t cid, const ghobject_t &oid, const bufferlist &bl);
diff --git a/src/os/ObjectMap.h b/src/os/ObjectMap.h
index 86f9e3e..e7a64a4 100644
--- a/src/os/ObjectMap.h
+++ b/src/os/ObjectMap.h
@@ -20,6 +20,7 @@
 #include <string>
 #include <vector>
 #include "include/memory.h"
+#include "kv/KeyValueDB.h"
 
 /**
  * Encapsulates the FileStore key value store
@@ -137,18 +138,7 @@ public:
 
   virtual bool check(std::ostream &out) { return true; }
 
-  class ObjectMapIteratorImpl {
-  public:
-    virtual int seek_to_first() = 0;
-    virtual int upper_bound(const string &after) = 0;
-    virtual int lower_bound(const string &to) = 0;
-    virtual bool valid() = 0;
-    virtual int next() = 0;
-    virtual string key() = 0;
-    virtual bufferlist value() = 0;
-    virtual int status() = 0;
-    virtual ~ObjectMapIteratorImpl() {}
-  };
+  typedef KeyValueDB::GenericIteratorImpl ObjectMapIteratorImpl;
   typedef ceph::shared_ptr<ObjectMapIteratorImpl> ObjectMapIterator;
   virtual ObjectMapIterator get_iterator(const ghobject_t &oid) {
     return ObjectMapIterator();
diff --git a/src/os/ObjectStore.cc b/src/os/ObjectStore.cc
index 8e8886b..aa62fbd 100644
--- a/src/os/ObjectStore.cc
+++ b/src/os/ObjectStore.cc
@@ -25,6 +25,41 @@
 #include "newstore/NewStore.h"
 #endif
 
+void decode_str_str_map_to_bl(bufferlist::iterator& p,
+			      bufferlist *out)
+{
+  bufferlist::iterator start = p;
+  __u32 n;
+  ::decode(n, p);
+  unsigned len = 4;
+  while (n--) {
+    __u32 l;
+    ::decode(l, p);
+    p.advance(l);
+    len += 4 + l;
+    ::decode(l, p);
+    p.advance(l);
+    len += 4 + l;
+  }
+  start.copy(len, *out);
+}
+
+void decode_str_set_to_bl(bufferlist::iterator& p,
+			  bufferlist *out)
+{
+  bufferlist::iterator start = p;
+  __u32 n;
+  ::decode(n, p);
+  unsigned len = 4;
+  while (n--) {
+    __u32 l;
+    ::decode(l, p);
+    p.advance(l);
+    len += 4 + l;
+  }
+  start.copy(len, *out);
+}
+
 ObjectStore *ObjectStore::create(CephContext *cct,
 				 const string& type,
 				 const string& data,
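
decode_str_str_map_to_bl() and decode_str_set_to_bl() never materialize the container: they walk the encoding (a u32 element count, then u32-length-prefixed strings) only to learn its byte length, then start.copy(len, *out) lifts that raw span verbatim. A self-contained sketch of the same walk over a plain byte buffer (little-endian host assumed; error handling omitted):

    #include <cassert>
    #include <cstdint>
    #include <cstring>
    #include <vector>

    static uint32_t get_u32(const uint8_t *p) {
      uint32_t v;
      std::memcpy(&v, p, sizeof(v));
      return v;
    }

    // How many bytes the encoded map<string,string> occupies at data.
    size_t encoded_map_len(const uint8_t *data) {
      size_t pos = 0;
      uint32_t n = get_u32(data + pos); pos += 4;        // element count
      while (n--) {
        uint32_t l = get_u32(data + pos); pos += 4 + l;  // key
        l = get_u32(data + pos); pos += 4 + l;           // value
      }
      return pos;  // copy this many raw bytes, exactly as start.copy() does
    }

    int main() {
      // {"a": "xy"} encoded by hand: count=1, key "a", value "xy"
      std::vector<uint8_t> bl = {1,0,0,0, 1,0,0,0,'a', 2,0,0,0,'x','y'};
      assert(encoded_map_len(bl.data()) == bl.size());
    }
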
diff --git a/src/os/ObjectStore.h b/src/os/ObjectStore.h
index 65818ff..3b00ac7 100644
--- a/src/os/ObjectStore.h
+++ b/src/os/ObjectStore.h
@@ -27,7 +27,7 @@
 #include <vector>
 #include <map>
 
-#if defined(DARWIN) || defined(__FreeBSD__)
+#if defined(DARWIN) || defined(__FreeBSD__) || defined(__sun)
 #include <sys/statvfs.h>
 #else
 #include <sys/vfs.h>    /* or <sys/statfs.h> */
@@ -84,6 +84,10 @@ static inline void encode(const map<string,bufferptr> *attrset, bufferlist &bl)
   ::encode(*attrset, bl);
 }
 
+// this isn't the best place for these, but...
+void decode_str_str_map_to_bl(bufferlist::iterator& p, bufferlist *out);
+void decode_str_set_to_bl(bufferlist::iterator& p, bufferlist *out);
+
 // Flag bits
 typedef uint32_t osflagbits_t;
 const int SKIP_JOURNAL_REPLAY = 1 << 0;
@@ -836,9 +840,15 @@ public:
       void decode_attrset(map<string,bufferlist>& aset) {
         ::decode(aset, data_bl_p);
       }
+      void decode_attrset_bl(bufferlist *pbl) {
+	decode_str_str_map_to_bl(data_bl_p, pbl);
+      }
       void decode_keyset(set<string> &keys){
         ::decode(keys, data_bl_p);
       }
+      void decode_keyset_bl(bufferlist *pbl){
+        decode_str_set_to_bl(data_bl_p, pbl);
+      }
 
       const ghobject_t &get_oid(__le32 oid_id) {
         assert(oid_id < objects.size());
@@ -1405,6 +1415,29 @@ public:
       }
       data.ops++;
     }
+
+    /// Set keys on an oid omap (bufferlist variant).
+    void omap_setkeys(
+      coll_t cid,                           ///< [in] Collection containing oid
+      const ghobject_t &oid,                ///< [in] Object to update
+      const bufferlist &attrset_bl          ///< [in] Replacement keys and values
+      ) {
+      if (use_tbl) {
+        __u32 op = OP_OMAP_SETKEYS;
+        ::encode(op, tbl);
+        ::encode(cid, tbl);
+        ::encode(oid, tbl);
+        tbl.append(attrset_bl);
+      } else {
+        Op* _op = _get_next_op();
+        _op->op = OP_OMAP_SETKEYS;
+        _op->cid = _get_coll_id(cid);
+        _op->oid = _get_object_id(oid);
+        data_bl.append(attrset_bl);
+      }
+      data.ops++;
+    }
+
     /// Remove keys from oid omap
     void omap_rmkeys(
       coll_t cid,             ///< [in] Collection containing oid
@@ -1427,6 +1460,28 @@ public:
       data.ops++;
     }
 
+    /// Remove keys from oid omap
+    void omap_rmkeys(
+      coll_t cid,             ///< [in] Collection containing oid
+      const ghobject_t &oid,  ///< [in] Object from which to remove the omap
+      const bufferlist &keys_bl ///< [in] Keys to clear
+      ) {
+      if (use_tbl) {
+        __u32 op = OP_OMAP_RMKEYS;
+        ::encode(op, tbl);
+        ::encode(cid, tbl);
+        ::encode(oid, tbl);
+        tbl.append(keys_bl);
+      } else {
+        Op* _op = _get_next_op();
+        _op->op = OP_OMAP_RMKEYS;
+        _op->cid = _get_coll_id(cid);
+        _op->oid = _get_object_id(oid);
+        data_bl.append(keys_bl);
+      }
+      data.ops++;
+    }
+
     /// Remove key range from oid omap
     void omap_rmkeyrange(
       coll_t cid,             ///< [in] Collection containing oid
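
The bufferlist overloads added here let a caller that already holds the encoded key set append it verbatim rather than decode it into a container only to re-encode it; decode_attrset_bl()/decode_keyset_bl() above are the read-side mirror. A hedged sketch of the equivalence (cid and oid are placeholders; the container-taking overload is the pre-existing one):

    // Both transactions end up with identical bytes in data_bl; the
    // second skips the decode/encode round trip. Sketch only.
    map<string, bufferlist> aset;
    aset["mykey"].append("myvalue");

    ObjectStore::Transaction t1, t2;
    t1.omap_setkeys(cid, oid, aset);      // encodes the map internally

    bufferlist aset_bl;
    ::encode(aset, aset_bl);              // same encoding, done up front
    t2.omap_setkeys(cid, oid, aset_bl);   // appended as-is (this patch)
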
diff --git a/src/os/RocksDBStore.cc b/src/os/RocksDBStore.cc
deleted file mode 100644
index cb3ac91..0000000
--- a/src/os/RocksDBStore.cc
+++ /dev/null
@@ -1,518 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#include <set>
-#include <map>
-#include <string>
-#include <memory>
-#include <errno.h>
-
-#include "rocksdb/db.h"
-#include "rocksdb/table.h"
-#include "rocksdb/env.h"
-#include "rocksdb/write_batch.h"
-#include "rocksdb/slice.h"
-#include "rocksdb/cache.h"
-#include "rocksdb/filter_policy.h"
-#include "rocksdb/utilities/convenience.h"
-using std::string;
-#include "common/perf_counters.h"
-#include "include/str_map.h"
-#include "KeyValueDB.h"
-#include "RocksDBStore.h"
-
-int string2bool(string val, bool &b_val)
-{
-  if (strcasecmp(val.c_str(), "false") == 0) {
-    b_val = false;
-    return 0;
-  } else if (strcasecmp(val.c_str(), "true") == 0) {
-    b_val = true;
-    return 0;
-  } else {
-    std::string err;
-    int b = strict_strtol(val.c_str(), 10, &err);
-    if (!err.empty())
-      return -EINVAL;
-    b_val = !!b;
-    return 0;
-  }
-}
-  
-int RocksDBStore::tryInterpret(const string key, const string val, rocksdb::Options &opt)
-{
-  if (key == "compaction_threads") {
-    std::string err;
-    int f = strict_sistrtoll(val.c_str(), &err);
-    if (!err.empty())
-      return -EINVAL;
-    //Low priority threadpool is used for compaction
-    opt.env->SetBackgroundThreads(f, rocksdb::Env::Priority::LOW);
-  } else if (key == "flusher_threads") {
-    std::string err;
-    int f = strict_sistrtoll(val.c_str(), &err);
-    if (!err.empty())
-      return -EINVAL;
-    //High priority threadpool is used for flusher
-    opt.env->SetBackgroundThreads(f, rocksdb::Env::Priority::HIGH);
-  } else if (key == "compact_on_mount") {
-    int ret = string2bool(val, compact_on_mount);
-    if (ret != 0)
-      return ret;
-  } else if (key == "disableWAL") {
-    int ret = string2bool(val, disableWAL);
-    if (ret != 0)
-      return ret;
-  } else {
-    //unrecognized config option.
-    return -EINVAL;
-  }
-  return 0;
-}
-
-int RocksDBStore::ParseOptionsFromString(const string opt_str, rocksdb::Options &opt)
-{
-  map<string, string> str_map;
-  int r = get_str_map(opt_str, ",\n;", &str_map);
-  if (r < 0)
-    return r;
-  map<string, string>::iterator it;
-  for(it = str_map.begin(); it != str_map.end(); ++it) {
-    string this_opt = it->first + "=" + it->second;
-    rocksdb::Status status = rocksdb::GetOptionsFromString(opt, this_opt, &opt);
-    if (!status.ok()) {
-      //unrecognized by rocksdb, try to interpret by ourselves.
-      r = tryInterpret(it->first, it->second, opt);
-      if (r < 0) {
-	derr << status.ToString() << dendl;
-	return -EINVAL;
-      }
-    }
-    lgeneric_dout(cct, 0) << " set rocksdb option " << it->first
-			  << " = " << it->second << dendl;
-  }
-  return 0;
-}
-
-int RocksDBStore::init(string _options_str)
-{
-  options_str = _options_str;
-  rocksdb::Options opt;
-  //try parse options
-  int r = ParseOptionsFromString(options_str, opt); 
-  if (r != 0) {
-    return -EINVAL;
-  }
-  return 0;
-}
-
-int RocksDBStore::do_open(ostream &out, bool create_if_missing)
-{
-  rocksdb::Options opt;
-  rocksdb::Status status;
-
-  int r = ParseOptionsFromString(options_str, opt); 
-  if (r != 0) {
-    return -EINVAL;
-  }
-  opt.create_if_missing = create_if_missing;
-
-  status = rocksdb::DB::Open(opt, path, &db);
-  if (!status.ok()) {
-    derr << status.ToString() << dendl;
-    return -EINVAL;
-  }
-
-  PerfCountersBuilder plb(g_ceph_context, "rocksdb", l_rocksdb_first, l_rocksdb_last);
-  plb.add_u64_counter(l_rocksdb_gets, "rocksdb_get", "Gets");
-  plb.add_u64_counter(l_rocksdb_txns, "rocksdb_transaction", "Transactions");
-  plb.add_time_avg(l_rocksdb_get_latency, "rocksdb_get_latency", "Get latency");
-  plb.add_time_avg(l_rocksdb_submit_latency, "rocksdb_submit_latency", "Submit Latency");
-  plb.add_time_avg(l_rocksdb_submit_sync_latency, "rocksdb_submit_sync_latency", "Submit Sync Latency");
-  plb.add_u64_counter(l_rocksdb_compact, "rocksdb_compact", "Compactions");
-  plb.add_u64_counter(l_rocksdb_compact_range, "rocksdb_compact_range", "Compactions by range");
-  plb.add_u64_counter(l_rocksdb_compact_queue_merge, "rocksdb_compact_queue_merge", "Merges of ranges in compaction queue");
-  plb.add_u64(l_rocksdb_compact_queue_len, "rocksdb_compact_queue_len", "Length of compaction queue");
-  logger = plb.create_perf_counters();
-  cct->get_perfcounters_collection()->add(logger);
-
-  if (compact_on_mount) {
-    derr << "Compacting rocksdb store..." << dendl;
-    compact();
-    derr << "Finished compacting rocksdb store" << dendl;
-  }
-  return 0;
-}
-
-int RocksDBStore::_test_init(const string& dir)
-{
-  rocksdb::Options options;
-  options.create_if_missing = true;
-  rocksdb::DB *db;
-  rocksdb::Status status = rocksdb::DB::Open(options, dir, &db);
-  delete db;
-  return status.ok() ? 0 : -EIO;
-}
-
-RocksDBStore::~RocksDBStore()
-{
-  close();
-  delete logger;
-
-  // Ensure db is destroyed before dependent db_cache and filterpolicy
-  delete db;
-}
-
-void RocksDBStore::close()
-{
-  // stop compaction thread
-  compact_queue_lock.Lock();
-  if (compact_thread.is_started()) {
-    compact_queue_stop = true;
-    compact_queue_cond.Signal();
-    compact_queue_lock.Unlock();
-    compact_thread.join();
-  } else {
-    compact_queue_lock.Unlock();
-  }
-
-  if (logger)
-    cct->get_perfcounters_collection()->remove(logger);
-}
-
-int RocksDBStore::submit_transaction(KeyValueDB::Transaction t)
-{
-  utime_t start = ceph_clock_now(g_ceph_context);
-  RocksDBTransactionImpl * _t =
-    static_cast<RocksDBTransactionImpl *>(t.get());
-  rocksdb::WriteOptions woptions;
-  woptions.disableWAL = disableWAL;
-  rocksdb::Status s = db->Write(woptions, _t->bat);
-  utime_t lat = ceph_clock_now(g_ceph_context) - start;
-  logger->inc(l_rocksdb_txns);
-  logger->tinc(l_rocksdb_submit_latency, lat);
-  return s.ok() ? 0 : -1;
-}
-
-int RocksDBStore::submit_transaction_sync(KeyValueDB::Transaction t)
-{
-  utime_t start = ceph_clock_now(g_ceph_context);
-  RocksDBTransactionImpl * _t =
-    static_cast<RocksDBTransactionImpl *>(t.get());
-  rocksdb::WriteOptions woptions;
-  woptions.sync = true;
-  woptions.disableWAL = disableWAL;
-  rocksdb::Status s = db->Write(woptions, _t->bat);
-  utime_t lat = ceph_clock_now(g_ceph_context) - start;
-  logger->inc(l_rocksdb_txns);
-  logger->tinc(l_rocksdb_submit_sync_latency, lat);
-  return s.ok() ? 0 : -1;
-}
-int RocksDBStore::get_info_log_level(string info_log_level)
-{
-  if (info_log_level == "debug") {
-    return 0;
-  } else if (info_log_level == "info") {
-    return 1;
-  } else if (info_log_level == "warn") {
-    return 2;
-  } else if (info_log_level == "error") {
-    return 3;
-  } else if (info_log_level == "fatal") {
-    return 4;
-  } else {
-    return 1;
-  }
-}
-
-RocksDBStore::RocksDBTransactionImpl::RocksDBTransactionImpl(RocksDBStore *_db)
-{
-  db = _db;
-  bat = new rocksdb::WriteBatch();
-}
-RocksDBStore::RocksDBTransactionImpl::~RocksDBTransactionImpl()
-{
-  delete bat;
-}
-void RocksDBStore::RocksDBTransactionImpl::set(
-  const string &prefix,
-  const string &k,
-  const bufferlist &to_set_bl)
-{
-  string key = combine_strings(prefix, k);
-  //bufferlist::c_str() is non-constant, so we need to make a copy
-  bufferlist val = to_set_bl;
-  bat->Delete(rocksdb::Slice(key));
-  bat->Put(rocksdb::Slice(key),
-	  rocksdb::Slice(val.c_str(), val.length()));
-}
-
-void RocksDBStore::RocksDBTransactionImpl::rmkey(const string &prefix,
-					         const string &k)
-{
-  bat->Delete(combine_strings(prefix, k));
-}
-
-void RocksDBStore::RocksDBTransactionImpl::rmkeys_by_prefix(const string &prefix)
-{
-  KeyValueDB::Iterator it = db->get_iterator(prefix);
-  for (it->seek_to_first();
-       it->valid();
-       it->next()) {
-    bat->Delete(combine_strings(prefix, it->key()));
-  }
-}
-
-int RocksDBStore::get(
-    const string &prefix,
-    const std::set<string> &keys,
-    std::map<string, bufferlist> *out)
-{
-  utime_t start = ceph_clock_now(g_ceph_context);
-  KeyValueDB::Iterator it = get_iterator(prefix);
-  for (std::set<string>::const_iterator i = keys.begin();
-       i != keys.end();
-       ++i) {
-    it->lower_bound(*i);
-    if (it->valid() && it->key() == *i) {
-      out->insert(make_pair(*i, it->value()));
-    } else if (!it->valid())
-      break;
-  }
-  utime_t lat = ceph_clock_now(g_ceph_context) - start;
-  logger->inc(l_rocksdb_gets);
-  logger->tinc(l_rocksdb_get_latency, lat);
-  return 0;
-}
-
-string RocksDBStore::combine_strings(const string &prefix, const string &value)
-{
-  string out = prefix;
-  out.push_back(0);
-  out.append(value);
-  return out;
-}
-
-bufferlist RocksDBStore::to_bufferlist(rocksdb::Slice in)
-{
-  bufferlist bl;
-  bl.append(bufferptr(in.data(), in.size()));
-  return bl;
-}
-
-int RocksDBStore::split_key(rocksdb::Slice in, string *prefix, string *key)
-{
-  string in_prefix = in.ToString();
-  size_t prefix_len = in_prefix.find('\0');
-  if (prefix_len >= in_prefix.size())
-    return -EINVAL;
-
-  if (prefix)
-    *prefix = string(in_prefix, 0, prefix_len);
-  if (key)
-    *key= string(in_prefix, prefix_len + 1);
-  return 0;
-}
-
-void RocksDBStore::compact()
-{
-  logger->inc(l_rocksdb_compact);
-  db->CompactRange(NULL, NULL);
-}
-
-
-void RocksDBStore::compact_thread_entry()
-{
-  compact_queue_lock.Lock();
-  while (!compact_queue_stop) {
-    while (!compact_queue.empty()) {
-      pair<string,string> range = compact_queue.front();
-      compact_queue.pop_front();
-      logger->set(l_rocksdb_compact_queue_len, compact_queue.size());
-      compact_queue_lock.Unlock();
-      logger->inc(l_rocksdb_compact_range);
-      compact_range(range.first, range.second);
-      compact_queue_lock.Lock();
-      continue;
-    }
-    compact_queue_cond.Wait(compact_queue_lock);
-  }
-  compact_queue_lock.Unlock();
-}
-
-void RocksDBStore::compact_range_async(const string& start, const string& end)
-{
-  Mutex::Locker l(compact_queue_lock);
-
-  // try to merge adjacent ranges.  this is O(n), but the queue should
-  // be short.  note that we do not cover all overlap cases and merge
-  // opportunities here, but we capture the ones we currently need.
-  list< pair<string,string> >::iterator p = compact_queue.begin();
-  while (p != compact_queue.end()) {
-    if (p->first == start && p->second == end) {
-      // dup; no-op
-      return;
-    }
-    if (p->first <= end && p->first > start) {
-      // merge with existing range to the right
-      compact_queue.push_back(make_pair(start, p->second));
-      compact_queue.erase(p);
-      logger->inc(l_rocksdb_compact_queue_merge);
-      break;
-    }
-    if (p->second >= start && p->second < end) {
-      // merge with existing range to the left
-      compact_queue.push_back(make_pair(p->first, end));
-      compact_queue.erase(p);
-      logger->inc(l_rocksdb_compact_queue_merge);
-      break;
-    }
-    ++p;
-  }
-  if (p == compact_queue.end()) {
-    // no merge, new entry.
-    compact_queue.push_back(make_pair(start, end));
-    logger->set(l_rocksdb_compact_queue_len, compact_queue.size());
-  }
-  compact_queue_cond.Signal();
-  if (!compact_thread.is_started()) {
-    compact_thread.create();
-  }
-}
-bool RocksDBStore::check_omap_dir(string &omap_dir)
-{
-  rocksdb::Options options;
-  options.create_if_missing = true;
-  rocksdb::DB *db;
-  rocksdb::Status status = rocksdb::DB::Open(options, omap_dir, &db);
-  delete db;
-  return status.ok();
-}
-void RocksDBStore::compact_range(const string& start, const string& end)
-{
-    rocksdb::Slice cstart(start);
-    rocksdb::Slice cend(end);
-    db->CompactRange(&cstart, &cend);
-}
-RocksDBStore::RocksDBWholeSpaceIteratorImpl::~RocksDBWholeSpaceIteratorImpl()
-{
-  delete dbiter;
-}
-int RocksDBStore::RocksDBWholeSpaceIteratorImpl::seek_to_first()
-{
-  dbiter->SeekToFirst();
-  return dbiter->status().ok() ? 0 : -1;
-}
-int RocksDBStore::RocksDBWholeSpaceIteratorImpl::seek_to_first(const string &prefix)
-{
-  rocksdb::Slice slice_prefix(prefix);
-  dbiter->Seek(slice_prefix);
-  return dbiter->status().ok() ? 0 : -1;
-}
-int RocksDBStore::RocksDBWholeSpaceIteratorImpl::seek_to_last()
-{
-  dbiter->SeekToLast();
-  return dbiter->status().ok() ? 0 : -1;
-}
-int RocksDBStore::RocksDBWholeSpaceIteratorImpl::seek_to_last(const string &prefix)
-{
-  string limit = past_prefix(prefix);
-  rocksdb::Slice slice_limit(limit);
-  dbiter->Seek(slice_limit);
-
-  if (!dbiter->Valid()) {
-    dbiter->SeekToLast();
-  } else {
-    dbiter->Prev();
-  }
-  return dbiter->status().ok() ? 0 : -1;
-}
-int RocksDBStore::RocksDBWholeSpaceIteratorImpl::upper_bound(const string &prefix, const string &after)
-{
-  lower_bound(prefix, after);
-  if (valid()) {
-    pair<string,string> key = raw_key();
-    if (key.first == prefix && key.second == after)
-      next();
-  }
-  return dbiter->status().ok() ? 0 : -1;
-}
-int RocksDBStore::RocksDBWholeSpaceIteratorImpl::lower_bound(const string &prefix, const string &to)
-{
-  string bound = combine_strings(prefix, to);
-  rocksdb::Slice slice_bound(bound);
-  dbiter->Seek(slice_bound);
-  return dbiter->status().ok() ? 0 : -1;
-}
-bool RocksDBStore::RocksDBWholeSpaceIteratorImpl::valid()
-{
-  return dbiter->Valid();
-}
-int RocksDBStore::RocksDBWholeSpaceIteratorImpl::next()
-{
-  if (valid())
-    dbiter->Next();
-  return dbiter->status().ok() ? 0 : -1;
-}
-int RocksDBStore::RocksDBWholeSpaceIteratorImpl::prev()
-{
-  if (valid())
-    dbiter->Prev();
-  return dbiter->status().ok() ? 0 : -1;
-}
-string RocksDBStore::RocksDBWholeSpaceIteratorImpl::key()
-{
-  string out_key;
-  split_key(dbiter->key(), 0, &out_key);
-  return out_key;
-}
-pair<string,string> RocksDBStore::RocksDBWholeSpaceIteratorImpl::raw_key()
-{
-  string prefix, key;
-  split_key(dbiter->key(), &prefix, &key);
-  return make_pair(prefix, key);
-}
-bufferlist RocksDBStore::RocksDBWholeSpaceIteratorImpl::value()
-{
-  return to_bufferlist(dbiter->value());
-}
-int RocksDBStore::RocksDBWholeSpaceIteratorImpl::status()
-{
-  return dbiter->status().ok() ? 0 : -1;
-}
-
-string RocksDBStore::past_prefix(const string &prefix)
-{
-  string limit = prefix;
-  limit.push_back(1);
-  return limit;
-}
-
-
-RocksDBStore::WholeSpaceIterator RocksDBStore::_get_iterator()
-{
-  return std::shared_ptr<KeyValueDB::WholeSpaceIteratorImpl>(
-    new RocksDBWholeSpaceIteratorImpl(
-      db->NewIterator(rocksdb::ReadOptions())
-    )
-  );
-}
-
-RocksDBStore::WholeSpaceIterator RocksDBStore::_get_snapshot_iterator()
-{
-  const rocksdb::Snapshot *snapshot;
-  rocksdb::ReadOptions options;
-
-  snapshot = db->GetSnapshot();
-  options.snapshot = snapshot;
-
-  return std::shared_ptr<KeyValueDB::WholeSpaceIteratorImpl>(
-    new RocksDBSnapshotIteratorImpl(db, snapshot,
-      db->NewIterator(options))
-  );
-}
-
-RocksDBStore::RocksDBSnapshotIteratorImpl::~RocksDBSnapshotIteratorImpl()
-{
-  db->ReleaseSnapshot(snapshot);
-}
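
Option strings accepted by init() above are split on ',', '\n' and ';' by get_str_map(); each k=v token is first offered to rocksdb::GetOptionsFromString(), and only keys RocksDB rejects fall through to tryInterpret(). Native and custom keys therefore mix freely. A hedged sketch (g_ceph_context and the path are illustrative):

    RocksDBStore store(g_ceph_context, "/var/lib/ceph/kv");
    int r = store.init(
        "write_buffer_size=33554432;"   // consumed by RocksDB itself
        "compaction_threads=4;"         // custom: low-priority pool size
        "compact_on_mount=true");       // custom: parsed via string2bool()
    // r == -EINVAL if a token is rejected by both parsers
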
diff --git a/src/os/RocksDBStore.h b/src/os/RocksDBStore.h
deleted file mode 100644
index bf58f66..0000000
--- a/src/os/RocksDBStore.h
+++ /dev/null
@@ -1,280 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-#ifndef ROCKS_DB_STORE_H
-#define ROCKS_DB_STORE_H
-
-#include "include/types.h"
-#include "include/buffer.h"
-#include "KeyValueDB.h"
-#include <set>
-#include <map>
-#include <string>
-#include <memory>
-#include <boost/scoped_ptr.hpp>
-
-#include <errno.h>
-#include "common/errno.h"
-#include "common/dout.h"
-#include "include/assert.h"
-#include "common/Formatter.h"
-
-#include "common/ceph_context.h"
-class PerfCounters;
-
-enum {
-  l_rocksdb_first = 34300,
-  l_rocksdb_gets,
-  l_rocksdb_txns,
-  l_rocksdb_get_latency,
-  l_rocksdb_submit_latency,
-  l_rocksdb_submit_sync_latency,
-  l_rocksdb_compact,
-  l_rocksdb_compact_range,
-  l_rocksdb_compact_queue_merge,
-  l_rocksdb_compact_queue_len,
-  l_rocksdb_last,
-};
-
-namespace rocksdb{
-  class DB;
-  class Cache;
-  class FilterPolicy;
-  class Snapshot;
-  class Slice;
-  class WriteBatch;
-  class Iterator;
-  struct Options;
-}
-/**
- * Uses RocksDB to implement the KeyValueDB interface
- */
-class RocksDBStore : public KeyValueDB {
-  CephContext *cct;
-  PerfCounters *logger;
-  string path;
-  rocksdb::DB *db;
-  string options_str;
-  int do_open(ostream &out, bool create_if_missing);
-
-  // manage async compactions
-  Mutex compact_queue_lock;
-  Cond compact_queue_cond;
-  list< pair<string,string> > compact_queue;
-  bool compact_queue_stop;
-  class CompactThread : public Thread {
-    RocksDBStore *db;
-  public:
-    CompactThread(RocksDBStore *d) : db(d) {}
-    void *entry() {
-      db->compact_thread_entry();
-      return NULL;
-    }
-    friend class RocksDBStore;
-  } compact_thread;
-
-  void compact_thread_entry();
-
-  void compact_range(const string& start, const string& end);
-  void compact_range_async(const string& start, const string& end);
-
-public:
-  /// compact the underlying rocksdb store
-  bool compact_on_mount;
-  bool disableWAL;
-  void compact();
-
-  int tryInterpret(const string key, const string val, rocksdb::Options &opt);
-  int ParseOptionsFromString(const string opt_str, rocksdb::Options &opt);
-  static int _test_init(const string& dir);
-  int init(string options_str);
-  /// compact rocksdb for all keys with a given prefix
-  void compact_prefix(const string& prefix) {
-    compact_range(prefix, past_prefix(prefix));
-  }
-  void compact_prefix_async(const string& prefix) {
-    compact_range_async(prefix, past_prefix(prefix));
-  }
-
-  void compact_range(const string& prefix, const string& start, const string& end) {
-    compact_range(combine_strings(prefix, start), combine_strings(prefix, end));
-  }
-  void compact_range_async(const string& prefix, const string& start, const string& end) {
-    compact_range_async(combine_strings(prefix, start), combine_strings(prefix, end));
-  }
-  int get_info_log_level(string info_log_level);
-
-  RocksDBStore(CephContext *c, const string &path) :
-    cct(c),
-    logger(NULL),
-    path(path),
-    db(NULL),
-    compact_queue_lock("RocksDBStore::compact_thread_lock"),
-    compact_queue_stop(false),
-    compact_thread(this),
-    compact_on_mount(false),
-    disableWAL(false)
-  {}
-
-  ~RocksDBStore();
-
-  static bool check_omap_dir(string &omap_dir);
-  /// Opens underlying db
-  int open(ostream &out) {
-    return do_open(out, false);
-  }
-  /// Creates underlying db if missing and opens it
-  int create_and_open(ostream &out) {
-    return do_open(out, true);
-  }
-
-  void close();
-
-  class RocksDBTransactionImpl : public KeyValueDB::TransactionImpl {
-  public:
-    rocksdb::WriteBatch *bat;
-    RocksDBStore *db;
-
-    RocksDBTransactionImpl(RocksDBStore *_db);
-    ~RocksDBTransactionImpl();
-    void set(
-      const string &prefix,
-      const string &k,
-      const bufferlist &bl);
-    void rmkey(
-      const string &prefix,
-      const string &k);
-    void rmkeys_by_prefix(
-      const string &prefix
-      );
-  };
-
-  KeyValueDB::Transaction get_transaction() {
-    return std::shared_ptr< RocksDBTransactionImpl >(
-      new RocksDBTransactionImpl(this));
-  }
-
-  int submit_transaction(KeyValueDB::Transaction t);
-  int submit_transaction_sync(KeyValueDB::Transaction t);
-  int get(
-    const string &prefix,
-    const std::set<string> &key,
-    std::map<string, bufferlist> *out
-    );
-
-  class RocksDBWholeSpaceIteratorImpl :
-    public KeyValueDB::WholeSpaceIteratorImpl {
-  protected:
-    rocksdb::Iterator *dbiter;
-  public:
-    RocksDBWholeSpaceIteratorImpl(rocksdb::Iterator *iter) :
-      dbiter(iter) { }
-    //virtual ~RocksDBWholeSpaceIteratorImpl() { }
-    ~RocksDBWholeSpaceIteratorImpl();
-
-    int seek_to_first();
-    int seek_to_first(const string &prefix);
-    int seek_to_last();
-    int seek_to_last(const string &prefix);
-    int upper_bound(const string &prefix, const string &after);
-    int lower_bound(const string &prefix, const string &to);
-    bool valid();
-    int next();
-    int prev();
-    string key();
-    pair<string,string> raw_key();
-    bufferlist value();
-    int status();
-  };
-
-  class RocksDBSnapshotIteratorImpl : public RocksDBWholeSpaceIteratorImpl {
-    rocksdb::DB *db;
-    const rocksdb::Snapshot *snapshot;
-  public:
-    RocksDBSnapshotIteratorImpl(rocksdb::DB *db, const rocksdb::Snapshot *s,
-				rocksdb::Iterator *iter) :
-      RocksDBWholeSpaceIteratorImpl(iter), db(db), snapshot(s) { }
-
-    ~RocksDBSnapshotIteratorImpl();
-  };
-
-  /// Utility
-  static string combine_strings(const string &prefix, const string &value);
-  static int split_key(rocksdb::Slice in, string *prefix, string *key);
-  static bufferlist to_bufferlist(rocksdb::Slice in);
-  static string past_prefix(const string &prefix);
-
-  virtual uint64_t get_estimated_size(map<string,uint64_t> &extra) {
-    DIR *store_dir = opendir(path.c_str());
-    if (!store_dir) {
-      lderr(cct) << __func__ << " something happened opening the store: "
-                 << cpp_strerror(errno) << dendl;
-      return 0;
-    }
-
-    uint64_t total_size = 0;
-    uint64_t sst_size = 0;
-    uint64_t log_size = 0;
-    uint64_t misc_size = 0;
-
-    struct dirent *entry = NULL;
-    while ((entry = readdir(store_dir)) != NULL) {
-      string n(entry->d_name);
-
-      if (n == "." || n == "..")
-        continue;
-
-      string fpath = path + '/' + n;
-      struct stat s;
-      int err = stat(fpath.c_str(), &s);
-      if (err < 0)
-	err = -errno;
-      // we may race against rocksdb while reading files; this should only
-      // happen when those files are being updated, data is being shuffled
-      // and files get removed, in which case there's not much of a problem
-      // as we'll get to them next time around.
-      if (err == -ENOENT) {
-	continue;
-      }
-      if (err < 0) {
-        lderr(cct) << __func__ << " error obtaining stats for " << fpath
-                   << ": " << cpp_strerror(err) << dendl;
-        goto err;
-      }
-
-      size_t pos = n.find_last_of('.');
-      if (pos == string::npos) {
-        misc_size += s.st_size;
-        continue;
-      }
-
-      string ext = n.substr(pos+1);
-      if (ext == "sst") {
-        sst_size += s.st_size;
-      } else if (ext == "log") {
-        log_size += s.st_size;
-      } else {
-        misc_size += s.st_size;
-      }
-    }
-
-    total_size = sst_size + log_size + misc_size;
-
-    extra["sst"] = sst_size;
-    extra["log"] = log_size;
-    extra["misc"] = misc_size;
-    extra["total"] = total_size;
-
-err:
-    closedir(store_dir);
-    return total_size;
-  }
-
-
-protected:
-  WholeSpaceIterator _get_iterator();
-
-  WholeSpaceIterator _get_snapshot_iterator();
-
-};
-
-#endif
diff --git a/src/os/chain_xattr.cc b/src/os/chain_xattr.cc
index 1bb652a..5351abf 100644
--- a/src/os/chain_xattr.cc
+++ b/src/os/chain_xattr.cc
@@ -256,7 +256,7 @@ static int get_xattr_block_size(size_t size)
   return CHAIN_XATTR_MAX_BLOCK_LEN;
 }
 
-int chain_setxattr(const char *fn, const char *name, const void *val, size_t size)
+int chain_setxattr(const char *fn, const char *name, const void *val, size_t size, bool onechunk)
 {
   int i = 0, pos = 0;
   char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
@@ -278,7 +278,7 @@ int chain_setxattr(const char *fn, const char *name, const void *val, size_t siz
     i++;
   } while (size);
 
-  if (ret >= 0 ) {
+  if (ret >= 0 && !onechunk) {
     int r;
     do {
       get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
@@ -292,7 +292,7 @@ int chain_setxattr(const char *fn, const char *name, const void *val, size_t siz
   return ret;
 }
 
-int chain_fsetxattr(int fd, const char *name, const void *val, size_t size)
+int chain_fsetxattr(int fd, const char *name, const void *val, size_t size, bool onechunk)
 {
   int i = 0, pos = 0;
   char raw_name[CHAIN_XATTR_MAX_NAME_LEN * 2 + 16];
@@ -314,7 +314,7 @@ int chain_fsetxattr(int fd, const char *name, const void *val, size_t size)
     i++;
   } while (size);
 
-  if (ret >= 0) {
+  if (ret >= 0 && !onechunk) {
     int r;
     do {
       get_raw_xattr_name(name, i, raw_name, sizeof(raw_name));
diff --git a/src/os/chain_xattr.h b/src/os/chain_xattr.h
index b994d52..6ee8050 100644
--- a/src/os/chain_xattr.h
+++ b/src/os/chain_xattr.h
@@ -9,7 +9,7 @@
 #include <errno.h>
 
 #if defined(__linux__)
-#include <limits.h>
+#include <linux/limits.h>
 #define CHAIN_XATTR_MAX_NAME_LEN ((XATTR_NAME_MAX + 1) / 2)
 #elif defined(__APPLE__)
 #include <sys/xattr.h>
@@ -78,8 +78,8 @@ static inline int sys_fremovexattr(int fd, const char *name)
 
 int chain_getxattr(const char *fn, const char *name, void *val, size_t size);
 int chain_fgetxattr(int fd, const char *name, void *val, size_t size);
-int chain_setxattr(const char *fn, const char *name, const void *val, size_t size);
-int chain_fsetxattr(int fd, const char *name, const void *val, size_t size);
+int chain_setxattr(const char *fn, const char *name, const void *val, size_t size, bool onechunk=false);
+int chain_fsetxattr(int fd, const char *name, const void *val, size_t size, bool onechunk=false);
 int chain_listxattr(const char *fn, char *names, size_t len);
 int chain_flistxattr(int fd, char *names, size_t len);
 int chain_removexattr(const char *fn, const char *name);
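
chain_setxattr() stores a value as a chain of fixed-size xattr blocks, so a plain set must afterwards probe for and remove stale tail blocks left by a previously longer value. The new onechunk flag lets a caller who guarantees single-block values skip that cleanup pass. A hedged sketch (path and attribute name are illustrative):

    char fsid[37] = "00000000-0000-0000-0000-000000000000";
    int r = chain_setxattr("/var/lib/ceph/osd/ceph-0/current",
                           "user.cephos.fsid", fsid, sizeof(fsid),
                           true /* onechunk: skip stale-block cleanup */);
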
diff --git a/src/os/fs/FS.cc b/src/os/fs/FS.cc
index cb0bdd5..b7c7987 100644
--- a/src/os/fs/FS.cc
+++ b/src/os/fs/FS.cc
@@ -121,16 +121,49 @@ int FS::zero(int fd, uint64_t offset, uint64_t length)
 {
   int r;
 
-#ifdef CEPH_HAVE_FALLOCATE
-# if !defined(DARWIN) && !defined(__FreeBSD__)
+  /*
+
+    From the fallocate(2) man page:
+
+       Specifying the FALLOC_FL_PUNCH_HOLE flag (available since Linux 2.6.38)
+       in mode deallocates space (i.e., creates a  hole)  in  the  byte  range
+       starting  at offset and continuing for len bytes.  Within the specified
+       range, partial filesystem  blocks  are  zeroed,  and  whole  filesystem
+       blocks  are removed from the file.  After a successful call, subsequent
+       reads from this range will return zeroes.
+
+       The FALLOC_FL_PUNCH_HOLE flag must be ORed with FALLOC_FL_KEEP_SIZE  in
+       mode;  in  other words, even when punching off the end of the file, the
+       file size (as reported by stat(2)) does not change.
+
+       Not all  filesystems  support  FALLOC_FL_PUNCH_HOLE;  if  a  filesystem
+       doesn't  support the operation, an error is returned.  The operation is
+       supported on at least the following filesystems:
+
+       *  XFS (since Linux 2.6.38)
+
+       *  ext4 (since Linux 3.0)
+
+       *  Btrfs (since Linux 3.7)
+
+       *  tmpfs (since Linux 3.5)
+
+   So: we only do this if PUNCH_HOLE *and* KEEP_SIZE are defined.
+
+  */
+#if !defined(DARWIN) && !defined(__FreeBSD__)
+# ifdef CEPH_HAVE_FALLOCATE
+#  ifdef FALLOC_FL_KEEP_SIZE
   // first try fallocate
-  r = fallocate(fd, FALLOC_FL_PUNCH_HOLE, offset, length);
+  r = fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, length);
   if (r < 0) {
     r = -errno;
   }
   if (r != -EOPNOTSUPP) {
     goto out;  // a real error
   }
+  // if that failed (-EOPNOTSUPP), fall back to writing zeros.
+#  endif
 # endif
 #endif
 
@@ -140,7 +173,7 @@ int FS::zero(int fd, uint64_t offset, uint64_t length)
     bufferptr bp(length);
     bp.zero();
     bl.append(bp);
-    int r = ::lseek64(fd, offset, SEEK_SET);
+    r = ::lseek64(fd, offset, SEEK_SET);
     if (r < 0) {
       r = -errno;
       goto out;
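
The comment block above quotes fallocate(2); a standalone Linux sketch of the
same punch-hole-then-fallback logic (the function name is illustrative, and a
real caller would chunk the zero writes rather than allocate one buffer):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <unistd.h>
    #include <cerrno>
    #include <vector>

    static int zero_range(int fd, off_t offset, off_t length) {
    #if defined(FALLOC_FL_PUNCH_HOLE) && defined(FALLOC_FL_KEEP_SIZE)
      // PUNCH_HOLE must be OR'd with KEEP_SIZE: the file size reported
      // by stat(2) must not change, per the man page quoted above.
      if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                    offset, length) == 0)
        return 0;
      if (errno != EOPNOTSUPP)
        return -errno;  // a real error; don't fall back
    #endif
      // Filesystem can't punch holes: write explicit zeros instead.
      std::vector<char> zeros(length, 0);
      if (pwrite(fd, zeros.data(), length, offset) < 0)
        return -errno;
      return 0;
    }
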
diff --git a/src/os/newstore/NewStore.cc b/src/os/newstore/NewStore.cc
index bdc5d38..e1b4d73 100644
--- a/src/os/newstore/NewStore.cc
+++ b/src/os/newstore/NewStore.cc
@@ -564,7 +564,6 @@ NewStore::NewStore(CephContext *cct, const string& path)
     cct(cct),
     db(NULL),
     fs(NULL),
-    db_path(cct->_conf->newstore_db_path),
     path_fd(-1),
     fsid_fd(-1),
     frag_fd(-1),
@@ -803,7 +802,7 @@ bool NewStore::test_mount_in_use()
   return ret;
 }
 
-int NewStore::_open_db()
+int NewStore::_open_db(bool create)
 {
   assert(!db);
   char fn[PATH_MAX];
@@ -817,17 +816,24 @@ int NewStore::_open_db()
     db = NULL;
     return -EIO;
   }
-  db->init(g_conf->newstore_backend_options);
+  string options;
+  if (g_conf->newstore_backend == "rocksdb")
+    options = g_conf->newstore_rocksdb_options;
+  db->init(options);
   stringstream err;
-  if (db->create_and_open(err)) {
+  int r;
+  if (create)
+    r = db->create_and_open(err);
+  else
+    r = db->open(err);
+  if (r) {
     derr << __func__ << " erroring opening db: " << err.str() << dendl;
     delete db;
     db = NULL;
     return -EIO;
   }
   dout(1) << __func__ << " opened " << g_conf->newstore_backend
-	  << " path " << path
-	  << " options " << g_conf->newstore_backend_options << dendl;
+	  << " path " << path << " options " << options << dendl;
   return 0;
 }
 
@@ -927,12 +933,7 @@ int NewStore::mkfs()
   if (r < 0)
     goto out_close_fsid;
 
-  if (db_path != "") {
-    r = symlinkat(db_path.c_str(), path_fd, "db");
-    if (r < 0)
-      goto out_close_frag;
-  }
-  r = _open_db();
+  r = _open_db(true);
   if (r < 0)
     goto out_close_frag;
 
@@ -976,7 +977,7 @@ int NewStore::mount()
 
   // FIXME: superblock, features
 
-  r = _open_db();
+  r = _open_db(false);
   if (r < 0)
     goto out_frag;
 
@@ -1708,7 +1709,7 @@ bool NewStore::OmapIteratorImpl::valid()
   }
 }
 
-int NewStore::OmapIteratorImpl::next()
+int NewStore::OmapIteratorImpl::next(bool validate)
 {
   RWLock::RLocker l(c->lock);
   if (o->onode.omap_head) {
@@ -3050,17 +3051,17 @@ int NewStore::_txc_add_transaction(TransContext *txc, Transaction *t)
     case Transaction::OP_OMAP_SETKEYS:
       {
         ghobject_t oid = i.get_oid(op->oid);
-        map<string, bufferlist> aset;
-        i.decode_attrset(aset);
-	r = _omap_setkeys(txc, c, oid, aset);
+	bufferlist aset_bl;
+        i.decode_attrset_bl(&aset_bl);
+	r = _omap_setkeys(txc, c, oid, aset_bl);
       }
       break;
     case Transaction::OP_OMAP_RMKEYS:
       {
         ghobject_t oid = i.get_oid(op->oid);
-        set<string> keys;
-        i.decode_keyset(keys);
-	r = _omap_rmkeys(txc, c, oid, keys);
+	bufferlist keys_bl;
+        i.decode_keyset_bl(&keys_bl);
+	r = _omap_rmkeys(txc, c, oid, keys_bl);
       }
       break;
     case Transaction::OP_OMAP_RMKEYRANGE:
@@ -4031,10 +4032,12 @@ int NewStore::_omap_clear(TransContext *txc,
 int NewStore::_omap_setkeys(TransContext *txc,
 			    CollectionRef& c,
 			    const ghobject_t& oid,
-			    const map<string,bufferlist>& m)
+			    bufferlist &bl)
 {
   dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
   int r = 0;
+  bufferlist::iterator p = bl.begin();
+  __u32 num;
 
   RWLock::WLocker l(c->lock);
   OnodeRef o = c->get_onode(oid, false);
@@ -4046,11 +4049,16 @@ int NewStore::_omap_setkeys(TransContext *txc,
     o->onode.omap_head = o->onode.nid;
     txc->write_onode(o);
   }
-  for (map<string,bufferlist>::const_iterator p = m.begin(); p != m.end(); ++p) {
+  ::decode(num, p);
+  while (num--) {
     string key;
-    get_omap_key(o->onode.omap_head, p->first, &key);
-    dout(30) << __func__ << "  " << key << " <- " << p->first << dendl;
-    txc->t->set(PREFIX_OMAP, key, p->second);
+    bufferlist value;
+    ::decode(key, p);
+    ::decode(value, p);
+    string final_key;
+    get_omap_key(o->onode.omap_head, key, &final_key);
+    dout(30) << __func__ << "  " << final_key << " <- " << value << dendl;
+    txc->t->set(PREFIX_OMAP, final_key, value);
   }
   r = 0;
 
@@ -4090,10 +4098,12 @@ int NewStore::_omap_setheader(TransContext *txc,
 int NewStore::_omap_rmkeys(TransContext *txc,
 			   CollectionRef& c,
 			   const ghobject_t& oid,
-			   const set<string>& m)
+			   bufferlist& bl)
 {
   dout(15) << __func__ << " " << c->cid << " " << oid << dendl;
   int r = 0;
+  bufferlist::iterator p = bl.begin();
+  __u32 num;
 
   RWLock::WLocker l(c->lock);
   OnodeRef o = c->get_onode(oid, false);
@@ -4109,11 +4119,14 @@ int NewStore::_omap_rmkeys(TransContext *txc,
     o->onode.omap_head = o->onode.nid;
     txc->write_onode(o);
   }
-  for (set<string>::const_iterator p = m.begin(); p != m.end(); ++p) {
+  ::decode(num, p);
+  while (num--) {
     string key;
-    get_omap_key(o->onode.omap_head, *p, &key);
-    dout(30) << __func__ << "  rm " << key << " <- " << *p << dendl;
-    txc->t->rmkey(PREFIX_OMAP, key);
+    ::decode(key, p);
+    string final_key;
+    get_omap_key(o->onode.omap_head, key, &final_key);
+    dout(30) << __func__ << "  rm " << final_key << " <- " << key << dendl;
+    txc->t->rmkey(PREFIX_OMAP, final_key);
   }
   r = 0;
 
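
The two hunks above switch OP_OMAP_SETKEYS/OP_OMAP_RMKEYS to carrying the
encoded payload straight through instead of decoding a map up front. Assuming
the standard container encoding from include/encoding.h (a 32-bit count
followed by the encoded elements), a sketch of the producer side that the
decode loops above expect; the function name, keys and values are illustrative:

    #include "include/types.h"   // bufferlist, ::encode()
    #include <map>
    #include <string>

    bufferlist encode_attrset(const std::map<std::string, bufferlist>& m) {
      bufferlist bl;
      uint32_t num = m.size();
      ::encode(num, bl);          // count, consumed by ::decode(num, p)
      for (std::map<std::string, bufferlist>::const_iterator p = m.begin();
           p != m.end(); ++p) {
        ::encode(p->first, bl);   // key
        ::encode(p->second, bl);  // value bufferlist
      }
      return bl;
    }
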
diff --git a/src/os/newstore/NewStore.h b/src/os/newstore/NewStore.h
index 97c5d6a..c32e2a9 100644
--- a/src/os/newstore/NewStore.h
+++ b/src/os/newstore/NewStore.h
@@ -27,7 +27,7 @@
 #include "common/WorkQueue.h"
 #include "os/ObjectStore.h"
 #include "os/fs/FS.h"
-#include "os/KeyValueDB.h"
+#include "kv/KeyValueDB.h"
 
 #include "newstore_types.h"
 
@@ -124,7 +124,7 @@ public:
     int upper_bound(const string &after);
     int lower_bound(const string &to);
     bool valid();
-    int next();
+    int next(bool validate=true);
     string key();
     bufferlist value();
     int status() {
@@ -461,7 +461,6 @@ private:
   KeyValueDB *db;
   FS *fs;
   uuid_d fsid;
-  string db_path;
   int path_fd;  ///< open handle to $path
   int fsid_fd;  ///< open handle (locked) to $path/fsid
   int frag_fd;  ///< open handle to $path/fragments
@@ -525,7 +524,7 @@ private:
   int _open_frag();
   int _create_frag();
   void _close_frag();
-  int _open_db();
+  int _open_db(bool create);
   void _close_db();
   int _open_collections();
   void _close_collections();
@@ -788,7 +787,7 @@ private:
   int _omap_setkeys(TransContext *txc,
 		    CollectionRef& c,
 		    const ghobject_t& oid,
-		    const map<string,bufferlist>& m);
+		    bufferlist& bl);
   int _omap_setheader(TransContext *txc,
 		      CollectionRef& c,
 		      const ghobject_t& oid,
@@ -796,7 +795,7 @@ private:
   int _omap_rmkeys(TransContext *txc,
 		   CollectionRef& c,
 		   const ghobject_t& oid,
-		   const set<string>& m);
+		   bufferlist& bl);
   int _omap_rmkey_range(TransContext *txc,
 			CollectionRef& c,
 			const ghobject_t& oid,
diff --git a/src/osd/ClassHandler.h b/src/osd/ClassHandler.h
index e4bb999..a78c9e6 100644
--- a/src/osd/ClassHandler.h
+++ b/src/osd/ClassHandler.h
@@ -90,7 +90,7 @@ public:
     ClassFilter *get_filter(const std::string &filter_name)
     {
       Mutex::Locker l(handler->mutex);
-      std::map<std::string, ClassFilter>::iterator i = filters_map.find(name);
+      std::map<std::string, ClassFilter>::iterator i = filters_map.find(filter_name);
       if (i == filters_map.end()) {
         return NULL;
       } else {
diff --git a/src/osd/ECBackend.cc b/src/osd/ECBackend.cc
index d6b95a5..a79ba09 100644
--- a/src/osd/ECBackend.cc
+++ b/src/osd/ECBackend.cc
@@ -1121,11 +1121,21 @@ void ECBackend::handle_sub_read_reply(
 	  ++is_complete;
 	}
       } else {
-	if (!rop.complete[iter->first].errors.empty())
-	  dout(10) << __func__ << " Enough copies for " << iter->first << " (ignore errors)" << dendl;
-	++is_complete;
-	rop.complete[iter->first].errors.clear();
         assert(rop.complete[iter->first].r == 0);
+	if (!rop.complete[iter->first].errors.empty()) {
+	  if (cct->_conf->osd_read_ec_check_for_errors) {
+	    dout(10) << __func__ << ": Not ignoring errors, use one shard err=" << err << dendl;
+	    err = rop.complete[iter->first].errors.begin()->second;
+            rop.complete[iter->first].r = err;
+	  } else {
+	    get_parent()->clog_error() << __func__ << ": Error(s) ignored for "
+				       << iter->first << " enough copies available" << "\n";
+	    dout(10) << __func__ << " Error(s) ignored for " << iter->first
+		     << " enough copies available" << dendl;
+	    rop.complete[iter->first].errors.clear();
+	  }
+	}
+	++is_complete;
       }
     }
   }
@@ -2078,6 +2088,7 @@ void ECBackend::be_deep_scrub(
     dout(0) << "_scan_list  " << poid << " got "
 	    << r << " on read, read_error" << dendl;
     o.read_error = true;
+    return;
   }
 
   ECUtil::HashInfoRef hinfo = get_hash_info(poid, false);
@@ -2085,15 +2096,18 @@ void ECBackend::be_deep_scrub(
     dout(0) << "_scan_list  " << poid << " could not retrieve hash info" << dendl;
     o.read_error = true;
     o.digest_present = false;
+    return;
   } else {
     if (hinfo->get_chunk_hash(get_parent()->whoami_shard().shard) != h.digest()) {
       dout(0) << "_scan_list  " << poid << " got incorrect hash on read" << dendl;
       o.read_error = true;
+      return;
     }
 
     if (hinfo->get_total_chunk_size() != pos) {
       dout(0) << "_scan_list  " << poid << " got incorrect size on read" << dendl;
       o.read_error = true;
+      return;
     }
 
     /* We checked above that we match our own stored hash.  We cannot
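
The handle_sub_read_reply hunk above makes the ignore-shard-errors behaviour
conditional on osd_read_ec_check_for_errors (the flag name comes from the
diff itself). A minimal standalone model of the decision, with illustrative
types:

    #include <map>

    // Resolve a read that has enough good copies but whose per-shard
    // error map may be non-empty.
    int resolve_ec_read(bool check_for_errors, int ok_result,
                        const std::map<int, int>& shard_errors) {
      if (shard_errors.empty() || !check_for_errors)
        return ok_result;                  // errors (if any) are ignored
      return shard_errors.begin()->second; // surface one shard's error
    }
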
diff --git a/src/osd/ECBackend.h b/src/osd/ECBackend.h
index a039b70..efb284e 100644
--- a/src/osd/ECBackend.h
+++ b/src/osd/ECBackend.h
@@ -494,6 +494,7 @@ public:
     ObjectStore::Transaction *t);
 
   bool scrub_supported() { return true; }
+  bool auto_repair_supported() const { return true; }
 
   void be_deep_scrub(
     const hobject_t &obj,
diff --git a/src/osd/Makefile.am b/src/osd/Makefile.am
index da805a2..9fafed5 100644
--- a/src/osd/Makefile.am
+++ b/src/osd/Makefile.am
@@ -8,7 +8,7 @@ noinst_LTLIBRARIES += libosd_types.la
 if ENABLE_SERVER
 if WITH_OSD
 
-libosd_la_SOURCES = \
+libosd_a_SOURCES = \
 	osd/PG.cc \
 	osd/ReplicatedPG.cc \
 	osd/ReplicatedBackend.cc \
@@ -22,16 +22,12 @@ libosd_la_SOURCES = \
 	osd/Watch.cc \
 	osd/ClassHandler.cc \
 	osd/OpRequest.cc \
-	common/TrackedOp.cc \
 	osd/SnapMapper.cc \
 	objclass/class_api.cc
 
-libosd_la_CXXFLAGS = ${AM_CXXFLAGS}
-if WITH_KINETIC
-libosd_la_CXXFLAGS += -std=gnu++11
-endif
-libosd_la_LIBADD = $(LIBOSDC) $(LIBOS) $(LIBOSD_TYPES) $(LIBOS_TYPES)
-noinst_LTLIBRARIES += libosd.la
+libosd_a_CXXFLAGS = ${AM_CXXFLAGS}
+libosd_a_LIBADD =
+noinst_LIBRARIES += libosd.a
 
 noinst_HEADERS += \
 	osd/ClassHandler.h \
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index 413ad59..c2267ef 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -750,7 +750,40 @@ pair<ConnectionRef,ConnectionRef> OSDService::get_con_osd_hb(int peer, epoch_t f
 void OSDService::queue_want_pg_temp(pg_t pgid, vector<int>& want)
 {
   Mutex::Locker l(pg_temp_lock);
-  pg_temp_wanted[pgid] = want;
+  map<pg_t,vector<int> >::iterator p = pg_temp_pending.find(pgid);
+  if (p == pg_temp_pending.end() ||
+      p->second != want) {
+    pg_temp_wanted[pgid] = want;
+  }
+}
+
+void OSDService::remove_want_pg_temp(pg_t pgid)
+{
+  Mutex::Locker l(pg_temp_lock);
+  pg_temp_wanted.erase(pgid);
+  pg_temp_pending.erase(pgid);
+}
+
+void OSDService::_sent_pg_temp()
+{
+  for (map<pg_t,vector<int> >::iterator p = pg_temp_wanted.begin();
+       p != pg_temp_wanted.end();
+       ++p)
+    pg_temp_pending[p->first] = p->second;
+  pg_temp_wanted.clear();
+}
+
+void OSDService::requeue_pg_temp()
+{
+  Mutex::Locker l(pg_temp_lock);
+  // wanted overrides pending.  note that remove_want_pg_temp
+  // clears the item out of both.
+  unsigned old_wanted = pg_temp_wanted.size();
+  unsigned old_pending = pg_temp_pending.size();
+  _sent_pg_temp();
+  pg_temp_wanted.swap(pg_temp_pending);
+  dout(10) << __func__ << " " << old_wanted << " + " << old_pending << " -> "
+	   << pg_temp_wanted.size() << dendl;
 }
 
 void OSDService::send_pg_temp()
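
The pg_temp hunk above splits the bookkeeping into a wanted map (not yet
sent) and a pending map (sent but unacked), so that a monitor reconnect can
resend everything without queueing duplicates. A standalone model of the
lifecycle; pg ids and osd vectors are illustrative:

    #include <map>
    #include <vector>

    struct PgTempQueue {
      std::map<int, std::vector<int> > wanted, pending;

      void queue(int pgid, const std::vector<int>& want) {
        // skip if an identical request is already pending
        std::map<int, std::vector<int> >::iterator p = pending.find(pgid);
        if (p == pending.end() || p->second != want)
          wanted[pgid] = want;
      }
      void sent() {                 // after a send: wanted -> pending
        for (std::map<int, std::vector<int> >::iterator p = wanted.begin();
             p != wanted.end(); ++p)
          pending[p->first] = p->second;
        wanted.clear();
      }
      void requeue() {              // new mon session: resend everything
        sent();                     // merge first; wanted overrides pending
        wanted.swap(pending);
      }
    };
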
@@ -762,6 +795,7 @@ void OSDService::send_pg_temp()
   MOSDPGTemp *m = new MOSDPGTemp(osdmap->get_epoch());
   m->pg_temp = pg_temp_wanted;
   monc->send_mon_message(m);
+  _sent_pg_temp();
 }
 
 
@@ -1021,13 +1055,13 @@ void OSDService::set_epochs(const epoch_t *_boot_epoch, const epoch_t *_up_epoch
 bool OSDService::prepare_to_stop()
 {
   Mutex::Locker l(is_stopping_lock);
-  if (state != NOT_STOPPING)
+  if (get_state() != NOT_STOPPING)
     return false;
 
   OSDMapRef osdmap = get_osdmap();
   if (osdmap && osdmap->is_up(whoami)) {
     dout(0) << __func__ << " telling mon we are shutting down" << dendl;
-    state = PREPARING_TO_STOP;
+    set_state(PREPARING_TO_STOP);
     monc->send_mon_message(new MOSDMarkMeDown(monc->get_fsid(),
 					      osdmap->get_inst(whoami),
 					      osdmap->get_epoch(),
@@ -1037,28 +1071,27 @@ bool OSDService::prepare_to_stop()
     utime_t timeout;
     timeout.set_from_double(now + cct->_conf->osd_mon_shutdown_timeout);
     while ((ceph_clock_now(cct) < timeout) &&
-	   (state != STOPPING)) {
+       (get_state() != STOPPING)) {
       is_stopping_cond.WaitUntil(is_stopping_lock, timeout);
     }
   }
   dout(0) << __func__ << " starting shutdown" << dendl;
-  state = STOPPING;
+  set_state(STOPPING);
   return true;
 }
 
 void OSDService::got_stop_ack()
 {
   Mutex::Locker l(is_stopping_lock);
-  if (state == PREPARING_TO_STOP) {
+  if (get_state() == PREPARING_TO_STOP) {
     dout(0) << __func__ << " starting shutdown" << dendl;
-    state = STOPPING;
+    set_state(STOPPING);
     is_stopping_cond.Signal();
   } else {
     dout(10) << __func__ << " ignoring msg" << dendl;
   }
 }
 
-
 MOSDMap *OSDService::build_incremental_map_msg(epoch_t since, epoch_t to,
                                                OSDSuperblock& sblock)
 {
@@ -1355,7 +1388,11 @@ int OSD::mkfs(CephContext *cct, ObjectStore *store, const string &dev,
     bufferlist sbbl;
     ret = store->read(coll_t::meta(), OSD_SUPERBLOCK_POBJECT, 0, 0, sbbl);
     if (ret >= 0) {
+      /* if we already have a superblock, check its content */
       dout(0) << " have superblock" << dendl;
+      bufferlist::iterator p;
+      p = sbbl.begin();
+      ::decode(sb, p);
       if (whoami != sb.whoami) {
 	derr << "provided osd id " << whoami << " != superblock's " << sb.whoami << dendl;
 	ret = -EINVAL;
@@ -1559,11 +1596,9 @@ OSD::OSD(CephContext *cct_, ObjectStore *store_,
     &osd_tp),
   map_lock("OSD::map_lock"),
   pg_map_lock("OSD::pg_map_lock"),
-  debug_drop_pg_create_probability(cct->_conf->osd_debug_drop_pg_create_probability),
-  debug_drop_pg_create_duration(cct->_conf->osd_debug_drop_pg_create_duration),
-  debug_drop_pg_create_left(-1),
-  outstanding_pg_stats(false),
-  timeout_mon_on_pg_stats(true),
+  last_pg_create_epoch(0),
+  mon_report_lock("OSD::mon_report_lock"),
+  stats_ack_timeout(cct->_conf->osd_mon_ack_timeout),
   up_thru_wanted(0), up_thru_pending(0),
   requested_full_first(0),
   requested_full_last(0),
@@ -1669,12 +1704,14 @@ bool OSD::asok_command(string command, cmdmap_t& cmdmap, string format,
     store->flush_journal();
   } else if (command == "dump_ops_in_flight" ||
 	     command == "ops") {
+    RWLock::RLocker l(op_tracker.lock);
     if (!op_tracker.tracking_enabled) {
       ss << "op_tracker tracking is not enabled";
     } else {
       op_tracker.dump_ops_in_flight(f);
     }
   } else if (command == "dump_historic_ops") {
+    RWLock::RLocker l(op_tracker.lock);
     if (!op_tracker.tracking_enabled) {
       ss << "op_tracker tracking is not enabled";
     } else {
@@ -1803,6 +1840,15 @@ int OSD::init()
 
   dout(2) << "boot" << dendl;
 
+  // initialize the daily loadavg with current 15min loadavg
+  double loadavgs[3];
+  if (getloadavg(loadavgs, 3) == 3) {
+    daily_loadavg = loadavgs[2];
+  } else {
+    derr << "OSD::init() : couldn't read loadavgs\n" << dendl;
+    daily_loadavg = 1.0;
+  }
+
   // read superblock
   r = read_superblock();
   if (r < 0) {
@@ -1966,9 +2012,14 @@ int OSD::init()
   peering_wq.drain();
 
   dout(0) << "done with init, starting boot process" << dendl;
-  set_state(STATE_BOOTING);
+
+  // subscribe to any pg creations
+  monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0);
 
   // we don't need to ask for an osdmap here; objecter will
+  //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME);
+
+  monc->renew_subs();
 
   start_boot();
 
@@ -2725,7 +2776,6 @@ OSD::res_result OSD::_try_resurrect_pg(
 PG *OSD::_create_lock_pg(
   OSDMapRef createmap,
   spg_t pgid,
-  bool newly_created,
   bool hold_map_lock,
   bool backfill,
   int role,
@@ -2835,31 +2885,24 @@ void OSD::load_pgs()
     derr << "failed to list pgs: " << cpp_strerror(-r) << dendl;
   }
 
-  set<spg_t> pgs;
+  bool has_upgraded = false;
+
   for (vector<coll_t>::iterator it = ls.begin();
        it != ls.end();
        ++it) {
     spg_t pgid;
-    if (it->is_temp(&pgid) ||
-	it->is_removal(&pgid) ||
-	(it->is_pg(&pgid) && PG::_has_removal_flag(store, pgid))) {
+    if (it->is_temp(&pgid) || it->is_removal(&pgid) ||
+        (it->is_pg(&pgid) && PG::_has_removal_flag(store, pgid))) {
       dout(10) << "load_pgs " << *it << " clearing temp" << dendl;
       recursive_remove_collection(store, pgid, *it);
       continue;
     }
 
-    if (it->is_pg(&pgid)) {
-      pgs.insert(pgid);
+    if (!it->is_pg(&pgid)) {
+      dout(10) << "load_pgs ignoring unrecognized " << *it << dendl;
       continue;
     }
 
-    dout(10) << "load_pgs ignoring unrecognized " << *it << dendl;
-  }
-
-  bool has_upgraded = false;
-  for (set<spg_t>::iterator i = pgs.begin(); i != pgs.end(); ++i) {
-    spg_t pgid(*i);
-
     if (pgid.preferred() >= 0) {
       dout(10) << __func__ << ": skipping localized PG " << pgid << dendl;
       // FIXME: delete it too, eventually
@@ -2884,7 +2927,7 @@ void OSD::load_pgs()
 	  derr << __func__ << ": could not find map for epoch " << map_epoch
 	       << " on pg " << pgid << ", but the pool is not present in the "
 	       << "current map, so this is probably a result of bug 10617.  "
-	       << "Skipping the pg for now, you can use ceph_objectstore_tool "
+	       << "Skipping the pg for now, you can use ceph-objectstore-tool "
 	       << "to clean it up later." << dendl;
 	  continue;
 	} else {
@@ -3078,6 +3121,8 @@ void OSD::build_past_intervals_parallel()
 		 << " " << debug.str() << dendl;
 	p.old_up = up;
 	p.old_acting = acting;
+	p.primary = primary;
+	p.up_primary = up_primary;
 	p.same_interval_since = cur_epoch;
       }
     }
@@ -3161,7 +3206,8 @@ void OSD::handle_pg_peering_evt(
 
     if (!valid_history || epoch < history.same_interval_since) {
       dout(10) << "get_or_create_pg " << pgid << " acting changed in "
-	       << history.same_interval_since << " (msg from " << epoch << ")" << dendl;
+	       << history.same_interval_since << " (msg from " << epoch << ")"
+	       << dendl;
       return;
     }
 
@@ -3169,28 +3215,6 @@ void OSD::handle_pg_peering_evt(
       assert(0);
     }
 
-    bool create = false;
-    if (primary) {
-      // DNE on source?
-      if (info.dne()) {
-	// is there a creation pending on this pg?
-	if (creating_pgs.count(pgid)) {
-	  creating_pgs[pgid].prior.erase(from);
-	  if (!can_create_pg(pgid))
-	    return;
-	  history = creating_pgs[pgid].history;
-	  create = true;
-	} else {
-	  dout(10) << "get_or_create_pg " << pgid
-		   << " DNE on source, but creation probe, ignoring" << dendl;
-	  return;
-	}
-      }
-      creating_pgs.erase(pgid);
-    } else {
-      assert(!info.dne());  // pg exists if we are hearing about it
-    }
-
     // do we need to resurrect a deleting pg?
     spg_t resurrected;
     PGRef old_pg_state;
@@ -3209,7 +3233,7 @@ void OSD::handle_pg_peering_evt(
 
       PG *pg = _create_lock_pg(
 	get_map(epoch),
-	pgid, create, false, result == RES_SELF,
+	pgid, false, result == RES_SELF,
 	role,
 	up, up_primary,
 	acting, acting_primary,
@@ -3227,7 +3251,7 @@ void OSD::handle_pg_peering_evt(
       return;
     }
     case RES_SELF: {
-      old_pg_state->lock();
+        old_pg_state->lock();
       OSDMapRef old_osd_map = old_pg_state->get_osdmap();
       int old_role = old_pg_state->role;
       vector<int> old_up = old_pg_state->up;
@@ -3241,7 +3265,6 @@ void OSD::handle_pg_peering_evt(
 	old_osd_map,
 	resurrected,
 	false,
-	false,
 	true,
 	old_role,
 	old_up,
@@ -3278,7 +3301,6 @@ void OSD::handle_pg_peering_evt(
 	old_osd_map,
 	resurrected,
 	false,
-	false,
 	true,
 	old_role,
 	old_up,
@@ -3322,56 +3344,6 @@ void OSD::handle_pg_peering_evt(
 }
 
 
-/*
- * calculate prior pg members during an epoch interval [start,end)
- *  - from each epoch, include all osds up then AND now
- *  - if no osds from then are up now, include them all, even tho they're not reachable now
- */
-void OSD::calc_priors_during(
-  spg_t pgid, epoch_t start, epoch_t end, set<pg_shard_t>& pset)
-{
-  dout(15) << "calc_priors_during " << pgid << " [" << start
-	   << "," << end << ")" << dendl;
-  
-  for (epoch_t e = start; e < end; e++) {
-    OSDMapRef oldmap = get_map(e);
-    vector<int> acting;
-    oldmap->pg_to_acting_osds(pgid.pgid, acting);
-    dout(20) << "  " << pgid << " in epoch " << e << " was " << acting << dendl;
-    int up = 0;
-    int actual_osds = 0;
-    for (unsigned i=0; i<acting.size(); i++) {
-      if (acting[i] != CRUSH_ITEM_NONE) {
-	if (osdmap->is_up(acting[i])) {
-	  if (acting[i] != whoami) {
-	    pset.insert(
-	      pg_shard_t(
-		acting[i],
-		osdmap->pg_is_ec(pgid.pgid) ? shard_id_t(i) : shard_id_t::NO_SHARD));
-	  }
-	  up++;
-	}
-	actual_osds++;
-      }
-    }
-    if (!up && actual_osds) {
-      // sucky.  add down osds, even tho we can't reach them right now.
-      for (unsigned i=0; i<acting.size(); i++) {
-	if (acting[i] != whoami && acting[i] != CRUSH_ITEM_NONE) {
-	  pset.insert(
-	    pg_shard_t(
-	      acting[i],
-	      osdmap->pg_is_ec(pgid.pgid) ? shard_id_t(i) : shard_id_t::NO_SHARD));
-	}
-      }
-    }
-  }
-  dout(10) << "calc_priors_during " << pgid
-	   << " [" << start << "," << end 
-	   << ") = " << pset << dendl;
-}
-
-
 /**
  * Fill in the passed history so you know same_interval_since, same_up_since,
  * and same_primary_since.
@@ -3776,7 +3748,7 @@ void OSD::handle_osd_ping(MOSDPing *m)
 	}
 	if (failure_pending.count(from)) {
 	  dout(10) << "handle_osd_ping canceling in-flight failure report for osd." << from<< dendl;
-	  send_still_alive(curmap->get_epoch(), failure_pending[from]);
+	  send_still_alive(curmap->get_epoch(), failure_pending[from].second);
 	  failure_pending.erase(from);
 	}
       }
@@ -3861,8 +3833,12 @@ void OSD::heartbeat()
 
   // get CPU load avg
   double loadavgs[1];
-  if (getloadavg(loadavgs, 1) == 1)
+  int n_samples = 86400 / cct->_conf->osd_heartbeat_interval;
+  if (getloadavg(loadavgs, 1) == 1) {
     logger->set(l_osd_loadavg, 100 * loadavgs[0]);
+    daily_loadavg = (daily_loadavg * (n_samples - 1) + loadavgs[0]) / n_samples;
+    dout(30) << "heartbeat: daily_loadavg " << daily_loadavg << dendl;
+  }
 
   dout(30) << "heartbeat checking stats" << dendl;
 
@@ -3911,7 +3887,7 @@ void OSD::heartbeat()
     if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval && is_active()) {
       last_mon_heartbeat = now;
       dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl;
-      osdmap_subscribe(osdmap->get_epoch() + 1, true);
+      osdmap_subscribe(osdmap->get_epoch() + 1, false);
     }
   }
 
@@ -3984,31 +3960,12 @@ void OSD::tick()
     heartbeat_check();
     heartbeat_lock.Unlock();
 
-    // mon report?
-    utime_t now = ceph_clock_now(cct);
-    if (outstanding_pg_stats && timeout_mon_on_pg_stats &&
-	(now - cct->_conf->osd_mon_ack_timeout) > last_pg_stats_ack) {
-      dout(1) << "mon hasn't acked PGStats in " << now - last_pg_stats_ack
-	      << " seconds, reconnecting elsewhere" << dendl;
-      monc->reopen_session(new C_MonStatsAckTimer(this));
-      timeout_mon_on_pg_stats = false;
-      last_pg_stats_ack = ceph_clock_now(cct);  // reset clock
-      last_pg_stats_sent = utime_t();
-    }
-    if (now - last_pg_stats_sent > cct->_conf->osd_mon_report_interval_max) {
-      osd_stat_updated = true;
-      do_mon_report();
-    } else if (now - last_mon_report > cct->_conf->osd_mon_report_interval_min) {
-      do_mon_report();
-    }
-
     map_lock.put_read();
   }
 
   if (is_waiting_for_healthy()) {
     if (_is_healthy()) {
       dout(1) << "healthy again, booting" << dendl;
-      set_state(STATE_BOOTING);
       start_boot();
     }
   }
@@ -4040,6 +3997,65 @@ void OSD::tick_without_osd_lock()
   assert(tick_timer_lock.is_locked());
   dout(5) << "tick_without_osd_lock" << dendl;
 
+  // osd_lock is not being held, which means the OSD state
+  // might change when doing the monitor report
+  if (is_active() || is_waiting_for_healthy()) {
+    map_lock.get_read();
+    Mutex::Locker l(mon_report_lock);
+
+    // mon report?
+    bool reset = false;
+    bool report = false;
+    utime_t now = ceph_clock_now(cct);
+    pg_stat_queue_lock.Lock();
+    double backoff = stats_ack_timeout / g_conf->osd_mon_ack_timeout;
+    double adjusted_min = cct->_conf->osd_mon_report_interval_min * backoff;
+    // note: we shouldn't adjust max because it must remain < the
+    // mon's mon_osd_report_timeout (which defaults to 1.5x our
+    // value).
+    double max = cct->_conf->osd_mon_report_interval_max;
+    if (!outstanding_pg_stats.empty() &&
+	(now - stats_ack_timeout) > last_pg_stats_ack) {
+      dout(1) << __func__ << " mon hasn't acked PGStats in "
+	      << now - last_pg_stats_ack
+	      << " seconds, reconnecting elsewhere" << dendl;
+      reset = true;
+      last_pg_stats_ack = ceph_clock_now(cct);  // reset clock
+      last_pg_stats_sent = utime_t();
+      stats_ack_timeout =
+	MAX(g_conf->osd_mon_ack_timeout,
+	    stats_ack_timeout * g_conf->osd_stats_ack_timeout_factor);
+      outstanding_pg_stats.clear();
+    }
+    if (now - last_pg_stats_sent > max) {
+      osd_stat_updated = true;
+      report = true;
+    } else if ((int)outstanding_pg_stats.size() >=
+	       cct->_conf->osd_mon_report_max_in_flight) {
+      dout(20) << __func__ << " have max " << outstanding_pg_stats
+	       << " stats updates in flight" << dendl;
+    } else {
+      if (now - last_mon_report > adjusted_min) {
+	dout(20) << __func__ << " stats backoff " << backoff
+		 << " adjusted_min " << adjusted_min << " - sending report"
+		 << dendl;
+	report = true;
+      }
+    }
+    pg_stat_queue_lock.Unlock();
+
+    if (reset) {
+      monc->reopen_session();
+    } else if (report) {
+      last_mon_report = now;
+
+      // do any pending reports
+      send_failures();
+      send_pg_stats(now);
+    }
+    map_lock.put_read();
+  }
+
   if (!scrub_random_backoff()) {
     sched_scrub();
   }
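
In the tick_without_osd_lock hunk above, the effective minimum report
interval is stretched by the ratio of the current (backed-off)
stats_ack_timeout to the configured osd_mon_ack_timeout; each missed ack
multiplies the timeout by osd_stats_ack_timeout_factor, and each ack later
decays it again (see the handle_pg_stats_ack hunk further down). A sketch of
the interval computation, with the config names taken from the diff:

    double adjusted_min_report_interval(double stats_ack_timeout,
                                        double mon_ack_timeout,
                                        double report_interval_min) {
      // backoff == 1.0 while acks arrive on time; grows after timeouts
      double backoff = stats_ack_timeout / mon_ack_timeout;
      return report_interval_min * backoff;
    }
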
@@ -4231,7 +4247,7 @@ bool remove_dir(
 {
   vector<ghobject_t> olist;
   int64_t num = 0;
-  ObjectStore::Transaction *t = new ObjectStore::Transaction;
+  ObjectStore::Transaction t;
   ghobject_t next;
   handle.reset_tp_timeout();
   store->collection_list(
@@ -4247,38 +4263,36 @@ bool remove_dir(
        ++i, ++num) {
     if (i->is_pgmeta())
       continue;
-    OSDriver::OSTransaction _t(osdriver->get_transaction(t));
+    OSDriver::OSTransaction _t(osdriver->get_transaction(&t));
     int r = mapper->remove_oid(i->hobj, &_t);
     if (r != 0 && r != -ENOENT) {
       assert(0);
     }
-    t->remove(coll, *i);
+    t.remove(coll, *i);
     if (num >= cct->_conf->osd_target_transaction_size) {
       C_SaferCond waiter;
-      store->queue_transaction(osr, t, &waiter);
+      store->queue_transaction(osr, &t, &waiter);
       bool cont = dstate->pause_clearing();
       handle.suspend_tp_timeout();
       waiter.wait();
       handle.reset_tp_timeout();
       if (cont)
         cont = dstate->resume_clearing();
-      delete t;
       if (!cont)
 	return false;
-      t = new ObjectStore::Transaction;
+      t = ObjectStore::Transaction();
       num = 0;
     }
   }
 
   C_SaferCond waiter;
-  store->queue_transaction(osr, t, &waiter);
+  store->queue_transaction(osr, &t, &waiter);
   bool cont = dstate->pause_clearing();
   handle.suspend_tp_timeout();
   waiter.wait();
   handle.reset_tp_timeout();
   if (cont)
     cont = dstate->resume_clearing();
-  delete t;
   // whether there are more objects to remove in the collection
   *finished = next.is_max();
   return cont;
@@ -4336,20 +4350,6 @@ void OSD::RemoveWQ::_process(
 }
 // =========================================
 
-void OSD::do_mon_report()
-{
-  dout(7) << "do_mon_report" << dendl;
-
-  utime_t now(ceph_clock_now(cct));
-  last_mon_report = now;
-
-  // do any pending reports
-  send_alive();
-  service.send_pg_temp();
-  send_failures();
-  send_pg_stats(now);
-}
-
 void OSD::ms_handle_connect(Connection *con)
 {
   if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
@@ -4357,21 +4357,27 @@ void OSD::ms_handle_connect(Connection *con)
     if (is_stopping())
       return;
     dout(10) << "ms_handle_connect on mon" << dendl;
-    if (is_booting()) {
+
+    if (is_preboot()) {
       start_boot();
+    } else if (is_booting()) {
+      _send_boot();       // resend boot message
     } else {
+      map_lock.get_read();
+      Mutex::Locker l2(mon_report_lock);
+
       utime_t now = ceph_clock_now(NULL);
       last_mon_report = now;
 
       // resend everything, it's a new session
       send_alive();
+      service.requeue_pg_temp();
       service.send_pg_temp();
+      requeue_failures();
       send_failures();
-      send_pg_stats(ceph_clock_now(cct));
+      send_pg_stats(now);
 
-      monc->sub_want("osd_pg_creates", 0, CEPH_SUBSCRIBE_ONETIME);
-      monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME);
-      monc->renew_subs();
+      map_lock.put_read();
     }
 
     // full map requests may happen while active or pre-boot
@@ -4440,29 +4446,32 @@ struct C_OSD_GetVersion : public Context {
   C_OSD_GetVersion(OSD *o) : osd(o), oldest(0), newest(0) {}
   void finish(int r) {
     if (r >= 0)
-      osd->_maybe_boot(oldest, newest);
+      osd->_got_mon_epochs(oldest, newest);
   }
 };
 
 void OSD::start_boot()
 {
+  set_state(STATE_PREBOOT);
   dout(10) << "start_boot - have maps " << superblock.oldest_map
 	   << ".." << superblock.newest_map << dendl;
   C_OSD_GetVersion *c = new C_OSD_GetVersion(this);
   monc->get_version("osdmap", &c->newest, &c->oldest, c);
 }
 
-void OSD::_maybe_boot(epoch_t oldest, epoch_t newest)
+void OSD::_got_mon_epochs(epoch_t oldest, epoch_t newest)
 {
   Mutex::Locker l(osd_lock);
-  if (is_stopping())
-    return;
-  dout(10) << "_maybe_boot mon has osdmaps " << oldest << ".." << newest << dendl;
-
-  if (is_initializing()) {
-    dout(10) << "still initializing" << dendl;
-    return;
+  if (is_preboot()) {
+    _preboot(oldest, newest);
   }
+}
+
+void OSD::_preboot(epoch_t oldest, epoch_t newest)
+{
+  assert(is_preboot());
+  dout(10) << __func__ << " _preboot mon has osdmaps "
+	   << oldest << ".." << newest << dendl;
 
   // if our map within recent history, try to add ourselves to the osdmap.
   if (osdmap->test_flag(CEPH_OSDMAP_NOUP)) {
@@ -4489,7 +4498,7 @@ void OSD::_maybe_boot(epoch_t oldest, epoch_t newest)
   
   // get all the latest maps
   if (osdmap->get_epoch() + 1 >= oldest)
-    osdmap_subscribe(osdmap->get_epoch() + 1, true);
+    osdmap_subscribe(osdmap->get_epoch() + 1, false);
   else
     osdmap_subscribe(oldest - 1, true);
 }
@@ -4521,7 +4530,8 @@ bool OSD::_is_healthy()
       ++num;
     }
     if ((float)up < (float)num * cct->_conf->osd_heartbeat_min_healthy_ratio) {
-      dout(1) << "is_healthy false -- only " << up << "/" << num << " up peers (less than 1/3)" << dendl;
+      dout(1) << "is_healthy false -- only " << up << "/" << num << " up peers (less than "
+	      << int(cct->_conf->osd_heartbeat_min_healthy_ratio * 100.0) << "%)" << dendl;
       return false;
     }
   }
@@ -4590,6 +4600,7 @@ void OSD::_send_boot()
 	   << dendl;
   _collect_metadata(&mboot->metadata);
   monc->send_mon_message(mboot);
+  set_state(STATE_BOOTING);
 }
 
 void OSD::_collect_metadata(map<string,string> *pm)
@@ -4617,14 +4628,12 @@ void OSD::queue_want_up_thru(epoch_t want)
 {
   map_lock.get_read();
   epoch_t cur = osdmap->get_up_thru(whoami);
+  Mutex::Locker l(mon_report_lock);
   if (want > up_thru_wanted) {
     dout(10) << "queue_want_up_thru now " << want << " (was " << up_thru_wanted << ")" 
 	     << ", currently " << cur
 	     << dendl;
     up_thru_wanted = want;
-
-    // expedite, a bit.  WARNING this will somewhat delay other mon queries.
-    last_mon_report = ceph_clock_now(cct);
     send_alive();
   } else {
     dout(10) << "queue_want_up_thru want " << want << " <= queued " << up_thru_wanted 
@@ -4636,6 +4645,7 @@ void OSD::queue_want_up_thru(epoch_t want)
 
 void OSD::send_alive()
 {
+  assert(mon_report_lock.is_locked());
   if (!osdmap->exists(whoami))
     return;
   epoch_t up_thru = osdmap->get_up_thru(whoami);
@@ -4706,24 +4716,38 @@ void OSD::got_full_map(epoch_t e)
   }
 }
 
-void OSD::send_failures()
+void OSD::requeue_failures()
 {
-  assert(osd_lock.is_locked());
-  bool locked = false;
-  if (!failure_queue.empty()) {
-    heartbeat_lock.Lock();
-    locked = true;
+  Mutex::Locker l(heartbeat_lock);
+  unsigned old_queue = failure_queue.size();
+  unsigned old_pending = failure_pending.size();
+  for (map<int,pair<utime_t,entity_inst_t> >::iterator p =
+	 failure_pending.begin();
+       p != failure_pending.end();
+       ++p) {
+    failure_queue[p->first] = p->second.first;
   }
+  dout(10) << __func__ << " " << old_queue << " + " << old_pending << " -> "
+	   << failure_queue.size() << dendl;
+}
+
+void OSD::send_failures()
+{
+  assert(map_lock.is_locked());
+  assert(mon_report_lock.is_locked());
+  Mutex::Locker l(heartbeat_lock);
   utime_t now = ceph_clock_now(cct);
   while (!failure_queue.empty()) {
     int osd = failure_queue.begin()->first;
-    int failed_for = (int)(double)(now - failure_queue.begin()->second);
     entity_inst_t i = osdmap->get_inst(osd);
-    monc->send_mon_message(new MOSDFailure(monc->get_fsid(), i, failed_for, osdmap->get_epoch()));
-    failure_pending[osd] = i;
+    if (!failure_pending.count(osd)) {
+      int failed_for = (int)(double)(now - failure_queue.begin()->second);
+      monc->send_mon_message(new MOSDFailure(monc->get_fsid(), i, failed_for,
+					     osdmap->get_epoch()));
+      failure_pending[osd] = make_pair(failure_queue.begin()->second, i);
+    }
     failure_queue.erase(osd);
   }
-  if (locked) heartbeat_lock.Unlock();
 }
 
 void OSD::send_still_alive(epoch_t epoch, const entity_inst_t &i)
@@ -4735,8 +4759,7 @@ void OSD::send_still_alive(epoch_t epoch, const entity_inst_t &i)
 
 void OSD::send_pg_stats(const utime_t &now)
 {
-  assert(osd_lock.is_locked());
-
+  assert(map_lock.is_locked());
   dout(20) << "send_pg_stats" << dendl;
 
   osd_stat_t cur_stat = service.get_osd_stat();
@@ -4755,7 +4778,8 @@ void OSD::send_pg_stats(const utime_t &now)
     had_for -= had_map_since;
 
     MPGStats *m = new MPGStats(monc->get_fsid(), osdmap->get_epoch(), had_for);
-    m->set_tid(++pg_stat_tid);
+    uint64_t tid = ++pg_stat_tid;
+    m->set_tid(tid);
     m->osd_stat = cur_stat;
 
     xlist<PG*>::iterator p = pg_stat_queue.begin();
@@ -4779,10 +4803,12 @@ void OSD::send_pg_stats(const utime_t &now)
       pg->pg_stats_publish_lock.Unlock();
     }
 
-    if (!outstanding_pg_stats) {
-      outstanding_pg_stats = true;
+    if (!outstanding_pg_stats.empty()) {
       last_pg_stats_ack = ceph_clock_now(cct);
     }
+    outstanding_pg_stats.insert(tid);
+    dout(20) << __func__ << "  updates pending: " << outstanding_pg_stats << dendl;
+
     monc->send_mon_message(m);
   }
 
@@ -4798,10 +4824,20 @@ void OSD::handle_pg_stats_ack(MPGStatsAck *ack)
     return;
   }
 
-  last_pg_stats_ack = ceph_clock_now(cct);
+  // NOTE: we may get replies from a previous mon even while
+  // outstanding_pg_stats is empty if reconnecting races with replies
+  // in flight.
 
   pg_stat_queue_lock.Lock();
 
+  last_pg_stats_ack = ceph_clock_now(cct);
+
+  // decay timeout slowly (analogous to TCP)
+  stats_ack_timeout =
+    MAX(g_conf->osd_mon_ack_timeout,
+	stats_ack_timeout * g_conf->osd_stats_ack_timeout_decay);
+  dout(20) << __func__ << "  timeout now " << stats_ack_timeout << dendl;
+
   if (ack->get_tid() > pg_stat_tid_flushed) {
     pg_stat_tid_flushed = ack->get_tid();
     pg_stat_queue_cond.Signal();
@@ -4832,10 +4868,9 @@ void OSD::handle_pg_stats_ack(MPGStatsAck *ack)
 	       << ":" << pg->pg_stats_publish.reported_seq << dendl;
     }
   }
-  
-  if (!pg_stat_queue.size()) {
-    outstanding_pg_stats = false;
-  }
+
+  outstanding_pg_stats.erase(ack->get_tid());
+  dout(20) << __func__ << "  still pending: " << outstanding_pg_stats << dendl;
 
   pg_stat_queue_lock.Unlock();
 
@@ -4845,10 +4880,14 @@ void OSD::handle_pg_stats_ack(MPGStatsAck *ack)
 void OSD::flush_pg_stats()
 {
   dout(10) << "flush_pg_stats" << dendl;
+  osd_lock.Unlock();
   utime_t now = ceph_clock_now(cct);
+  map_lock.get_read();
+  mon_report_lock.Lock();
   send_pg_stats(now);
+  mon_report_lock.Unlock();
+  map_lock.put_read();
 
-  osd_lock.Unlock();
 
   pg_stat_queue_lock.Lock();
   uint64_t tid = pg_stat_tid;
@@ -5607,9 +5646,9 @@ void OSD::ms_fast_preprocess(Message *m)
       MOSDMap *mm = static_cast<MOSDMap*>(m);
       Session *s = static_cast<Session*>(m->get_connection()->get_priv());
       if (s) {
-	s->received_map_lock.Lock();
+	s->received_map_lock.lock();
 	s->received_map_epoch = mm->get_last();
-	s->received_map_lock.Unlock();
+	s->received_map_lock.unlock();
 	s->put();
       }
     }
@@ -5823,9 +5862,9 @@ bool OSD::dispatch_op_fast(OpRequestRef& op, OSDMapRef& osdmap)
     Session *s = static_cast<Session*>(op->get_req()->
 				       get_connection()->get_priv());
     if (s) {
-      s->received_map_lock.Lock();
+      s->received_map_lock.lock();
       epoch_t received_epoch = s->received_map_epoch;
-      s->received_map_lock.Unlock();
+      s->received_map_lock.unlock();
       if (received_epoch < msg_epoch) {
 	osdmap_subscribe(msg_epoch, false);
       }
@@ -6065,23 +6104,35 @@ bool OSD::scrub_time_permit(utime_t now)
 
 bool OSD::scrub_load_below_threshold()
 {
-  double loadavgs[1];
-  if (getloadavg(loadavgs, 1) != 1) {
+  double loadavgs[3];
+  if (getloadavg(loadavgs, 3) != 3) {
     dout(10) << __func__ << " couldn't read loadavgs\n" << dendl;
     return false;
   }
 
-  if (loadavgs[0] >= cct->_conf->osd_scrub_load_threshold) {
-    dout(20) << __func__ << " loadavg " << loadavgs[0]
-	     << " >= max " << cct->_conf->osd_scrub_load_threshold
-	     << " = no, load too high" << dendl;
-    return false;
-  } else {
+  // allow scrub if below configured threshold
+  if (loadavgs[0] < cct->_conf->osd_scrub_load_threshold) {
     dout(20) << __func__ << " loadavg " << loadavgs[0]
 	     << " < max " << cct->_conf->osd_scrub_load_threshold
 	     << " = yes" << dendl;
     return true;
   }
+
+  // allow scrub if below daily avg and currently decreasing
+  if (loadavgs[0] < daily_loadavg && loadavgs[0] < loadavgs[2]) {
+    dout(20) << __func__ << " loadavg " << loadavgs[0]
+	     << " < daily_loadavg " << daily_loadavg
+	     << " and < 15m avg " << loadavgs[2]
+	     << " = yes" << dendl;
+    return true;
+  }
+
+  dout(20) << __func__ << " loadavg " << loadavgs[0]
+	   << " >= max " << cct->_conf->osd_scrub_load_threshold
+	   << " and ( >= daily_loadavg " << daily_loadavg
+	   << " or >= 15m avg " << loadavgs[2]
+	   << ") = no" << dendl;
+  return false;
 }
 
 void OSD::sched_scrub()
@@ -6137,7 +6188,7 @@ void OSD::wait_for_new_map(OpRequestRef op)
 {
   // ask?
   if (waiting_for_osdmap.empty()) {
-    osdmap_subscribe(osdmap->get_epoch() + 1, true);
+    osdmap_subscribe(osdmap->get_epoch() + 1, false);
   }
   
   logger->inc(l_osd_waiting_for_map);
@@ -6256,7 +6307,7 @@ void OSD::handle_osd_map(MOSDMap *m)
     dout(10) << "handle_osd_map message skips epochs " << osdmap->get_epoch() + 1
 	     << ".." << (first-1) << dendl;
     if (m->oldest_map <= osdmap->get_epoch() + 1) {
-      osdmap_subscribe(osdmap->get_epoch()+1, true);
+      osdmap_subscribe(osdmap->get_epoch()+1, false);
       m->put();
       return;
     }
@@ -6542,18 +6593,22 @@ void OSD::handle_osd_map(MOSDMap *m)
 
   if (m->newest_map && m->newest_map > last) {
     dout(10) << " msg say newest map is " << m->newest_map << ", requesting more" << dendl;
-    osdmap_subscribe(osdmap->get_epoch()+1, true);
+    osdmap_subscribe(osdmap->get_epoch()+1, false);
   }
-  else if (is_booting()) {
-    start_boot();  // retry
+  else if (do_shutdown) {
+    osd_lock.Unlock();
+    shutdown();
+    osd_lock.Lock();
+  }
+  else if (is_preboot()) {
+    if (m->get_source().is_mon())
+      _preboot(m->oldest_map, m->newest_map);
+    else
+      start_boot();
   }
   else if (do_restart)
     start_boot();
 
-  osd_lock.Unlock();
-  if (do_shutdown)
-    shutdown();
-  osd_lock.Lock();
 
   m->put();
 }
@@ -6712,28 +6767,6 @@ void OSD::advance_map()
     }
     service.set_epochs(&boot_epoch, &up_epoch, NULL);
   }
-
-  // scan pg creations
-  ceph::unordered_map<spg_t, create_pg_info>::iterator n = creating_pgs.begin();
-  while (n != creating_pgs.end()) {
-    ceph::unordered_map<spg_t, create_pg_info>::iterator p = n++;
-    spg_t pgid = p->first;
-
-    // am i still primary?
-    vector<int> acting;
-    int primary;
-    osdmap->pg_to_acting_osds(pgid.pgid, &acting, &primary);
-    if (primary != whoami) {
-      dout(10) << " no longer primary for " << pgid << ", stopping creation" << dendl;
-      creating_pgs.erase(p);
-    } else {
-      /*
-       * adding new ppl to our pg has no effect, since we're still primary,
-       * and obviously haven't given the new nodes any data.
-       */
-      p->second.acting.swap(acting);  // keep the latest
-    }
-  }
 }
 
 void OSD::consume_map()
@@ -6840,7 +6873,7 @@ void OSD::activate_map()
 
   if (osdmap->test_flag(CEPH_OSDMAP_FULL)) {
     dout(10) << " osdmap flagged full, doing onetime osdmap subscribe" << dendl;
-    osdmap_subscribe(osdmap->get_epoch() + 1, true);
+    osdmap_subscribe(osdmap->get_epoch() + 1, false);
   }
 
   // norecover?
@@ -6974,22 +7007,6 @@ bool OSD::require_same_or_newer_map(OpRequestRef& op, epoch_t epoch,
 // ----------------------------------------
 // pg creation
 
-
-bool OSD::can_create_pg(spg_t pgid)
-{
-  assert(creating_pgs.count(pgid));
-
-  // priors empty?
-  if (!creating_pgs[pgid].prior.empty()) {
-    dout(10) << "can_create_pg " << pgid
-	     << " - waiting for priors " << creating_pgs[pgid].prior << dendl;
-    return false;
-  }
-
-  dout(10) << "can_create_pg " << pgid << " - can create now" << dendl;
-  return true;
-}
-
 void OSD::split_pgs(
   PG *parent,
   const set<spg_t> &childpgids, set<boost::intrusive_ptr<PG> > *out_pgs,
@@ -7052,21 +7069,6 @@ void OSD::handle_pg_create(OpRequestRef op)
 
   dout(10) << "handle_pg_create " << *m << dendl;
 
-  // drop the next N pg_creates in a row?
-  if (debug_drop_pg_create_left < 0 &&
-      cct->_conf->osd_debug_drop_pg_create_probability >
-      ((((double)(rand()%100))/100.0))) {
-    debug_drop_pg_create_left = debug_drop_pg_create_duration;
-  }
-  if (debug_drop_pg_create_left >= 0) {
-    --debug_drop_pg_create_left;
-    if (debug_drop_pg_create_left >= 0) {
-      dout(0) << "DEBUG dropping/ignoring pg_create, will drop the next "
-	      << debug_drop_pg_create_left << " too" << dendl;
-      return;
-    }
-  }
-
   /* we have to hack around require_mon_peer's interface limits, so
    * grab an extra reference before going in. If the peer isn't
    * a Monitor, the reference is put for us (and then cleared
@@ -7083,15 +7085,12 @@ void OSD::handle_pg_create(OpRequestRef op)
 
   op->mark_started();
 
-  int num_created = 0;
-
   map<pg_t,utime_t>::iterator ci = m->ctimes.begin();
   for (map<pg_t,pg_create_t>::iterator p = m->mkpg.begin();
        p != m->mkpg.end();
        ++p, ++ci) {
     assert(ci != m->ctimes.end() && ci->first == p->first);
     epoch_t created = p->second.created;
-    pg_t parent = p->second.parent;
     if (p->second.split_bits) // Skip split pgs
       continue;
     pg_t on = p->first;
@@ -7138,76 +7137,40 @@ void OSD::handle_pg_create(OpRequestRef op)
       continue;
     }
 
-    // figure history
     pg_history_t history;
     history.epoch_created = created;
-    history.last_epoch_clean = created;
-    // Newly created PGs don't need to scrub immediately, so mark them
-    // as scrubbed at creation time.
-    if (ci->second == utime_t()) {
-      // Older OSD doesn't send ctime, so just do what we did before
-      // The repair_test.py can fail in a mixed cluster
-      utime_t now = ceph_clock_now(NULL);
-      history.last_scrub_stamp = now;
-      history.last_deep_scrub_stamp = now;
-    } else {
-      history.last_scrub_stamp = ci->second;
-      history.last_deep_scrub_stamp = ci->second;
-    }
+    history.last_scrub_stamp = ci->second;
+    history.last_deep_scrub_stamp = ci->second;
     bool valid_history = project_pg_history(
       pgid, history, created, up, up_primary, acting, acting_primary);
     /* the pg creation message must have come from a mon and therefore
      * cannot be on the other side of a map gap
      */
     assert(valid_history);
-    
-    // register.
-    creating_pgs[pgid].history = history;
-    creating_pgs[pgid].parent = parent;
-    creating_pgs[pgid].acting.swap(acting);
-    calc_priors_during(pgid, created, history.same_interval_since, 
-		       creating_pgs[pgid].prior);
 
     PG::RecoveryCtx rctx = create_context();
-    // poll priors
-    set<pg_shard_t>& pset = creating_pgs[pgid].prior;
-    dout(10) << "mkpg " << pgid << " e" << created
-	     << " h " << history
-	     << " : querying priors " << pset << dendl;
-    for (set<pg_shard_t>::iterator p = pset.begin(); p != pset.end(); ++p)
-      if (osdmap->is_up(p->osd))
-	(*rctx.query_map)[p->osd][spg_t(pgid.pgid, p->shard)] =
-	  pg_query_t(
-	    pg_query_t::INFO,
-	    p->shard, pgid.shard,
-	    history,
-	    osdmap->get_epoch());
-
-    PG *pg = NULL;
-    if (can_create_pg(pgid)) {
-      const pg_pool_t* pp = osdmap->get_pg_pool(pgid.pool());
-      PG::_create(*rctx.transaction, pgid, pgid.get_split_bits(pp->get_pg_num()));
-      PG::_init(*rctx.transaction, pgid, pp);
-
-      pg_interval_map_t pi;
-      pg = _create_lock_pg(
-	osdmap, pgid, true, false, false,
-	0, creating_pgs[pgid].acting, whoami,
-	creating_pgs[pgid].acting, whoami,
-	history, pi,
-	*rctx.transaction);
-      pg->info.last_epoch_started = pg->info.history.last_epoch_started;
-      creating_pgs.erase(pgid);
-      pg->handle_create(&rctx);
-      pg->write_if_dirty(*rctx.transaction);
-      pg->publish_stats_to_osd();
-      pg->unlock();
-      num_created++;
-      wake_pg_waiters(pg, pgid);
-    }
+    const pg_pool_t* pp = osdmap->get_pg_pool(pgid.pool());
+    PG::_create(*rctx.transaction, pgid, pgid.get_split_bits(pp->get_pg_num()));
+    PG::_init(*rctx.transaction, pgid, pp);
+
+    pg_interval_map_t pi;
+    PG *pg = _create_lock_pg(
+      osdmap, pgid, false, false,
+      0, up, up_primary,
+      acting, acting_primary,
+      history, pi,
+      *rctx.transaction);
+    pg->info.last_epoch_started = created;
+    pg->handle_create(&rctx);
+    pg->write_if_dirty(*rctx.transaction);
+    pg->publish_stats_to_osd();
+    pg->unlock();
+    wake_pg_waiters(pg, pgid);
     dispatch_context(rctx, pg, osdmap);
   }
 
+  last_pg_create_epoch = m->epoch;
+
   maybe_update_heartbeat_peers();
 }
 
@@ -7940,7 +7903,7 @@ void OSD::do_recovery(PG *pg, ThreadPool::TPHandle &handle)
     dout(20) << "  active was " << recovery_oids[pg->info.pgid] << dendl;
 #endif
     
-    int started;
+    int started = 0;
     bool more = pg->start_recovery_ops(max, handle, &started);
     dout(10) << "do_recovery started " << started << "/" << max << " on " << *pg << dendl;
     // If no recovery op is started, don't bother to manipulate the RecoveryCtx
@@ -8049,17 +8012,24 @@ public:
   void finish(ThreadPool::TPHandle& tp) {
     OSD::Session *session = static_cast<OSD::Session *>(
         con->get_priv());
+    epoch_t last_sent_epoch;
     if (session) {
-      session->sent_epoch_lock.Lock();
+      session->sent_epoch_lock.lock();
+      last_sent_epoch = session->last_sent_epoch;
+      session->sent_epoch_lock.unlock();
     }
     osd->service.share_map(
 	name,
         con.get(),
         map_epoch,
         osdmap,
-        session ? &session->last_sent_epoch : NULL);
+        session ? &last_sent_epoch : NULL);
     if (session) {
-      session->sent_epoch_lock.Unlock();
+      session->sent_epoch_lock.lock();
+      if (session->last_sent_epoch < last_sent_epoch) {
+	session->last_sent_epoch = last_sent_epoch;	
+      }
+      session->sent_epoch_lock.unlock();
       session->put();
     }
   }
@@ -8094,49 +8064,23 @@ void OSD::handle_op(OpRequestRef& op, OSDMapRef& osdmap)
     return;
   }
 
-  // we don't need encoded payload anymore
-  m->clear_payload();
-
-  // object name too long?
-  unsigned max_name_len = MIN(g_conf->osd_max_object_name_len,
-			      store->get_max_object_name_length());
-  if (m->get_oid().name.size() > max_name_len) {
-    dout(4) << "handle_op '" << m->get_oid().name << "' is longer than "
-	    << max_name_len << " bytes" << dendl;
-    service.reply_op_error(op, -ENAMETOOLONG);
-    return;
-  }
-
-  // blacklisted?
-  if (osdmap->is_blacklisted(m->get_source_addr())) {
-    dout(4) << "handle_op " << m->get_source_addr() << " is blacklisted" << dendl;
-    service.reply_op_error(op, -EBLACKLISTED);
-    return;
-  }
-
   // set up a map send if the Op gets blocked for some reason
   send_map_on_destruct share_map(this, m, osdmap, m->get_map_epoch());
   Session *client_session =
       static_cast<Session*>(m->get_connection()->get_priv());
+  epoch_t last_sent_epoch;
   if (client_session) {
-    client_session->sent_epoch_lock.Lock();
+    client_session->sent_epoch_lock.lock();
+    last_sent_epoch = client_session->last_sent_epoch;
+    client_session->sent_epoch_lock.unlock();
   }
   share_map.should_send = service.should_share_map(
       m->get_source(), m->get_connection().get(), m->get_map_epoch(),
-      osdmap, &client_session->last_sent_epoch);
+      osdmap, client_session ? &last_sent_epoch : NULL);
   if (client_session) {
-    client_session->sent_epoch_lock.Unlock();
     client_session->put();
   }
 
-  if (op->rmw_flags == 0) {
-    int r = init_op_flags(op);
-    if (r) {
-      service.reply_op_error(op, r);
-      return;
-    }
-  }
-
   if (cct->_conf->osd_debug_drop_op_probability > 0 &&
       !m->get_source().is_mds()) {
     if ((double)rand() / (double)RAND_MAX < cct->_conf->osd_debug_drop_op_probability) {
@@ -8148,29 +8092,6 @@ void OSD::handle_op(OpRequestRef& op, OSDMapRef& osdmap)
   // calc actual pgid
   pg_t _pgid = m->get_pg();
   int64_t pool = _pgid.pool();
-  if (op->may_write()) {
-    const pg_pool_t *pi = osdmap->get_pg_pool(pool);
-    if (!pi) {
-      return;
-    }
-    
-    // invalid?
-    if (m->get_snapid() != CEPH_NOSNAP) {
-      service.reply_op_error(op, -EINVAL);
-      return;
-    }
-
-    // too big?
-    if (cct->_conf->osd_max_write_size &&
-	m->get_data_len() > ((int64_t)g_conf->osd_max_write_size) << 20) {
-      // journal can't hold commit!
-      derr << "handle_op msg data len " << m->get_data_len()
-	   << " > osd_max_write_size " << (cct->_conf->osd_max_write_size << 20)
-	   << " on " << *m << dendl;
-      service.reply_op_error(op, -OSD_WRITETOOBIG);
-      return;
-    }
-  }
 
   if ((m->get_flags() & CEPH_OSD_FLAG_PGOP) == 0 &&
       osdmap->have_pg_pool(pool))
@@ -8182,6 +8103,17 @@ void OSD::handle_op(OpRequestRef& op, OSDMapRef& osdmap)
     return;
   }
 
+  PG *pg = get_pg_or_queue_for_pg(pgid, op);
+  if (pg) {
+    op->send_map_update = share_map.should_send;
+    op->sent_epoch = m->get_map_epoch();
+    enqueue_op(pg, op);
+    share_map.should_send = false;
+    return;
+  }
+
+  // ok, we didn't have the PG.  let's see if it's our fault or the client's.
+
   OSDMapRef send_map = service.try_get_map(m->get_map_epoch());
   // check send epoch
   if (!send_map) {
@@ -8221,14 +8153,6 @@ void OSD::handle_op(OpRequestRef& op, OSDMapRef& osdmap)
 	    << dendl;
     return;
   }
-
-  PG *pg = get_pg_or_queue_for_pg(pgid, op);
-  if (pg) {
-    op->send_map_update = share_map.should_send;
-    op->sent_epoch = m->get_map_epoch();
-    enqueue_op(pg, op);
-    share_map.should_send = false;
-  }
 }
 
 template<typename T, int MSGTYPE>
@@ -8253,15 +8177,17 @@ void OSD::handle_replica_op(OpRequestRef& op, OSDMapRef& osdmap)
   bool should_share_map = false;
   Session *peer_session =
       static_cast<Session*>(m->get_connection()->get_priv());
+  epoch_t last_sent_epoch;
   if (peer_session) {
-    peer_session->sent_epoch_lock.Lock();
+    peer_session->sent_epoch_lock.lock();
+    last_sent_epoch = peer_session->last_sent_epoch;
+    peer_session->sent_epoch_lock.unlock();
   }
   should_share_map = service.should_share_map(
       m->get_source(), m->get_connection().get(), m->map_epoch,
       osdmap,
-      peer_session ? &peer_session->last_sent_epoch : NULL);
+      peer_session ? &last_sent_epoch : NULL);
   if (peer_session) {
-    peer_session->sent_epoch_lock.Unlock();
     peer_session->put();
   }
 
@@ -8455,17 +8381,24 @@ void OSD::dequeue_op(
   if (op->send_map_update) {
     Message *m = op->get_req();
     Session *session = static_cast<Session *>(m->get_connection()->get_priv());
+    epoch_t last_sent_epoch;
     if (session) {
-      session->sent_epoch_lock.Lock();
+      session->sent_epoch_lock.lock();
+      last_sent_epoch = session->last_sent_epoch;
+      session->sent_epoch_lock.unlock();
     }
     service.share_map(
         m->get_source(),
         m->get_connection().get(),
         op->sent_epoch,
         osdmap,
-        session ? &session->last_sent_epoch : NULL);
+        session ? &last_sent_epoch : NULL);
     if (session) {
-      session->sent_epoch_lock.Unlock();
+      session->sent_epoch_lock.lock();
+      if (session->last_sent_epoch < last_sent_epoch) {
+	session->last_sent_epoch = last_sent_epoch;
+      }
+      session->sent_epoch_lock.unlock();
       session->put();
     }
   }
@@ -8574,6 +8507,7 @@ const char** OSD::get_tracked_conf_keys() const
     "osd_min_recovery_priority",
     "osd_op_complaint_time", "osd_op_log_threshold",
     "osd_op_history_size", "osd_op_history_duration",
+    "osd_enable_op_tracker",
     "osd_map_cache_size",
     "osd_map_max_advance",
     "osd_pg_epoch_persisted_max_stale",
@@ -8610,6 +8544,9 @@ void OSD::handle_conf_change(const struct md_config_t *conf,
     op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
                                              cct->_conf->osd_op_history_duration);
   }
+  if (changed.count("osd_enable_op_tracker")) {
+      op_tracker.set_tracking(cct->_conf->osd_enable_op_tracker);
+  }
   if (changed.count("osd_disk_thread_ioprio_class") ||
       changed.count("osd_disk_thread_ioprio_priority")) {
     set_disk_tp_priority();
@@ -8841,6 +8778,12 @@ int OSD::init_op_flags(OpRequestRef& op)
         op->set_skip_promote();
       }
       break;
+
+    // force promotion when pinning an object in the cache tier
+    case CEPH_OSD_OP_CACHE_PIN:
+      op->set_promote();
+      break;
+
     default:
       break;
     }
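
Several hunks above replace hold-the-lock-across-the-call with a copy-out /
merge-back pattern around the new Session spinlocks: read last_sent_epoch
under the lock, run the slow share_map path unlocked, then write back with a
monotonic max so a racing sender cannot move the epoch backwards. A
standalone sketch (std::mutex stands in for ceph's Spinlock; the types and
callback are illustrative):

    #include <mutex>
    #include <algorithm>

    struct Session {
      std::mutex sent_epoch_lock;     // stands in for Spinlock
      unsigned last_sent_epoch = 0;
    };

    unsigned share_map_epoch(Session& s, unsigned (*do_share)(unsigned)) {
      unsigned e;
      {
        std::lock_guard<std::mutex> l(s.sent_epoch_lock);
        e = s.last_sent_epoch;        // copy out under the lock
      }
      e = do_share(e);                // slow path runs unlocked
      {
        std::lock_guard<std::mutex> l(s.sent_epoch_lock);
        // merge back; never move the epoch backwards
        s.last_sent_epoch = std::max(s.last_sent_epoch, e);
      }
      return e;
    }
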
diff --git a/src/osd/OSD.h b/src/osd/OSD.h
index 44a492c..8c0cd8e 100644
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -54,6 +54,7 @@ using namespace std;
 #include "common/sharedptr_registry.hpp"
 #include "common/PrioritizedQueue.h"
 #include "messages/MOSDOp.h"
+#include "include/Spinlock.h"
 
 #define CEPH_OSD_PROTOCOL    10 /* cluster internal */
 
@@ -801,13 +802,15 @@ public:
   AsyncReserver<spg_t> remote_reserver;
 
   // -- pg_temp --
+private:
   Mutex pg_temp_lock;
   map<pg_t, vector<int> > pg_temp_wanted;
+  map<pg_t, vector<int> > pg_temp_pending;
+  void _sent_pg_temp();
+public:
   void queue_want_pg_temp(pg_t pgid, vector<int>& want);
-  void remove_want_pg_temp(pg_t pgid) {
-    Mutex::Locker l(pg_temp_lock);
-    pg_temp_wanted.erase(pgid);
-  }
+  void remove_want_pg_temp(pg_t pgid);
+  void requeue_pg_temp();
   void send_pg_temp();
 
   void queue_for_peering(PG *pg);
@@ -969,14 +972,19 @@ public:
   enum {
     NOT_STOPPING,
     PREPARING_TO_STOP,
-    STOPPING } state;
+    STOPPING };
+  atomic_t state;
+  int get_state() {
+    return state.read();
+  }
+  void set_state(int s) {
+    state.set(s);
+  }
   bool is_stopping() {
-    Mutex::Locker l(is_stopping_lock);
-    return state == STOPPING;
+    return get_state() == STOPPING;
   }
   bool is_preparing_to_stop() {
-    Mutex::Locker l(is_stopping_lock);
-    return state == PREPARING_TO_STOP;
+    return get_state() == PREPARING_TO_STOP;
   }
   bool prepare_to_stop();
   void got_stop_ack();
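
The is_stopping()/is_preparing_to_stop() checks above move from a
mutex-guarded enum to an atomic integer, so hot paths can poll the state
without taking a lock. A minimal sketch, with std::atomic standing in for
Ceph's atomic_t:

    #include <atomic>

    class stopping_state_t {
    public:
      enum { NOT_STOPPING, PREPARING_TO_STOP, STOPPING };
      int get_state() const { return state.load(); }
      void set_state(int s) { state.store(s); }
      bool is_stopping() const { return get_state() == STOPPING; }
      bool is_preparing_to_stop() const {
        return get_state() == PREPARING_TO_STOP;
      }
    private:
      std::atomic<int> state{NOT_STOPPING};
    };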
@@ -1186,15 +1194,19 @@ private:
 
   // -- state --
 public:
-  static const int STATE_INITIALIZING = 1;
-  static const int STATE_BOOTING = 2;
-  static const int STATE_ACTIVE = 3;
-  static const int STATE_STOPPING = 4;
-  static const int STATE_WAITING_FOR_HEALTHY = 5;
+  typedef enum {
+    STATE_INITIALIZING = 1,
+    STATE_PREBOOT,
+    STATE_BOOTING,
+    STATE_ACTIVE,
+    STATE_STOPPING,
+    STATE_WAITING_FOR_HEALTHY
+  } osd_state_t;
 
   static const char *get_state_name(int s) {
     switch (s) {
     case STATE_INITIALIZING: return "initializing";
+    case STATE_PREBOOT: return "preboot";
     case STATE_BOOTING: return "booting";
     case STATE_ACTIVE: return "active";
     case STATE_STOPPING: return "stopping";
@@ -1216,6 +1228,9 @@ public:
   bool is_initializing() {
     return get_state() == STATE_INITIALIZING;
   }
+  bool is_preboot() {
+    return get_state() == STATE_PREBOOT;
+  }
   bool is_booting() {
     return get_state() == STATE_BOOTING;
   }
@@ -1286,17 +1301,16 @@ public:
     OSDMapRef osdmap;  /// Map as of which waiting_for_pg is current
     map<spg_t, list<OpRequestRef> > waiting_for_pg;
 
-    Mutex sent_epoch_lock;
+    Spinlock sent_epoch_lock;
     epoch_t last_sent_epoch;
-    Mutex received_map_lock;
+    Spinlock received_map_lock;
     epoch_t received_map_epoch; // largest epoch seen in MOSDMap from here
 
     Session(CephContext *cct) :
       RefCountedObject(cct),
       auid(-1), con(0),
-      session_dispatch_lock("Session::session_dispatch_lock"),
-      sent_epoch_lock("Session::sent_epoch_lock"), last_sent_epoch(0),
-      received_map_lock("Session::received_map_lock"), received_map_epoch(0)
+      session_dispatch_lock("Session::session_dispatch_lock"), 
+      last_sent_epoch(0), received_map_epoch(0)
     {}
 
 
@@ -1505,6 +1519,7 @@ private:
   Messenger *hb_front_server_messenger;
   Messenger *hb_back_server_messenger;
   utime_t last_heartbeat_resample;   ///< last time we chose random peers in waiting-for-healthy state
+  double daily_loadavg;
   
   void _add_heartbeat_peer(int p);
   void _remove_heartbeat_peer(int p);
@@ -1880,7 +1895,6 @@ protected:
   PG   *_create_lock_pg(
     OSDMapRef createmap,
     spg_t pgid,
-    bool newly_created,
     bool hold_map_lock,
     bool backfill,
     int role,
@@ -1907,9 +1921,6 @@ protected:
   void load_pgs();
   void build_past_intervals_parallel();
 
-  void calc_priors_during(
-    spg_t pgid, epoch_t start, epoch_t end, set<pg_shard_t>& pset);
-
  /// project pg history from 'from' to now
   bool project_pg_history(
     spg_t pgid, pg_history_t& h, epoch_t from,
@@ -1937,19 +1948,9 @@ protected:
     }
   }
 
-  // -- pg creation --
-  struct create_pg_info {
-    pg_history_t history;
-    vector<int> acting;
-    set<pg_shard_t> prior;
-    pg_t parent;
-  };
-  ceph::unordered_map<spg_t, create_pg_info> creating_pgs;
-  double debug_drop_pg_create_probability;
-  int debug_drop_pg_create_duration;
-  int debug_drop_pg_create_left;  // 0 if we just dropped the last one, -1 if we can drop more
 
-  bool can_create_pg(spg_t pgid);
+  epoch_t last_pg_create_epoch;
+
   void handle_pg_create(OpRequestRef op);
 
   void split_pgs(
@@ -1960,6 +1961,7 @@ protected:
     PG::RecoveryCtx *rctx);
 
   // == monitor interaction ==
+  Mutex mon_report_lock;
   utime_t last_mon_report;
   utime_t last_pg_stats_sent;
 
@@ -1970,29 +1972,13 @@ protected:
    *  elsewhere.
    */
   utime_t last_pg_stats_ack;
-  bool outstanding_pg_stats; // some stat updates haven't been acked yet
-  bool timeout_mon_on_pg_stats;
-  void restart_stats_timer() {
-    Mutex::Locker l(osd_lock);
-    last_pg_stats_ack = ceph_clock_now(cct);
-    timeout_mon_on_pg_stats = true;
-  }
-
-  class C_MonStatsAckTimer : public Context {
-    OSD *osd;
-  public:
-    C_MonStatsAckTimer(OSD *o) : osd(o) {}
-    void finish(int r) {
-      osd->restart_stats_timer();
-    }
-  };
-  friend class C_MonStatsAckTimer;
-
-  void do_mon_report();
+  float stats_ack_timeout;
+  set<uint64_t> outstanding_pg_stats; // tids of the stat updates that haven't been acked yet
 
   // -- boot --
   void start_boot();
-  void _maybe_boot(epoch_t oldest, epoch_t newest);
+  void _got_mon_epochs(epoch_t oldest, epoch_t newest);
+  void _preboot(epoch_t oldest, epoch_t newest);
   void _send_boot();
   void _collect_metadata(map<string,string> *pmeta);
   bool _lsb_release_set(char *buf, const char *str, map<string,string> *pm, const char *key);
@@ -2018,9 +2004,9 @@ protected:
 
   // -- failures --
   map<int,utime_t> failure_queue;
-  map<int,entity_inst_t> failure_pending;
-
+  map<int,pair<utime_t,entity_inst_t> > failure_pending;
 
+  void requeue_failures();
   void send_failures();
   void send_still_alive(epoch_t epoch, const entity_inst_t &i);
 
diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc
index d308186..ebe73d6 100644
--- a/src/osd/OSDMap.cc
+++ b/src/osd/OSDMap.cc
@@ -937,9 +937,19 @@ void OSDMap::set_max_osd(int m)
 int OSDMap::calc_num_osds()
 {
   num_osd = 0;
-  for (int i=0; i<max_osd; i++)
-    if (osd_state[i] & CEPH_OSD_EXISTS)
-      num_osd++;
+  num_up_osd = 0;
+  num_in_osd = 0;
+  for (int i=0; i<max_osd; i++) {
+    if (osd_state[i] & CEPH_OSD_EXISTS) {
+      ++num_osd;
+      if (osd_state[i] & CEPH_OSD_UP) {
+	++num_up_osd;
+      }
+      if (get_weight(i) != CEPH_OSD_OUT) {
+	++num_in_osd;
+      }
+    }
+  }
   return num_osd;
 }
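
calc_num_osds() now refreshes all three cached counters in one pass over
osd_state, which is what lets the per-call scans in get_num_up_osds() and
get_num_in_osds() be deleted below. A sketch of the shape, with simplified
stand-ins for the CEPH_OSD_* flags and the weight table:

    #include <vector>
    #include <cstdint>
    #include <cstddef>

    const uint8_t  EXISTS = 1, UP = 2;  // stand-ins for CEPH_OSD_EXISTS/UP
    const uint32_t OUT = 0;             // stand-in for CEPH_OSD_OUT

    struct osdmap_t {
      std::vector<uint8_t>  osd_state;   // one entry per OSD id
      std::vector<uint32_t> osd_weight;  // same length as osd_state
      int num_osd = 0, num_up_osd = 0, num_in_osd = 0;

      int calc_num_osds() {
        num_osd = num_up_osd = num_in_osd = 0;
        for (std::size_t i = 0; i < osd_state.size(); ++i) {
          if (osd_state[i] & EXISTS) {
            ++num_osd;
            if (osd_state[i] & UP)    ++num_up_osd;
            if (osd_weight[i] != OUT) ++num_in_osd;
          }
        }
        return num_osd;
      }
      // the getters become O(1) reads of the cached values
      unsigned get_num_up_osds() const { return num_up_osd; }
      unsigned get_num_in_osds() const { return num_in_osd; }
    };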
 
@@ -958,24 +968,6 @@ void OSDMap::get_up_osds(set<int32_t>& ls) const
   }
 }
 
-unsigned OSDMap::get_num_up_osds() const
-{
-  unsigned n = 0;
-  for (int i=0; i<max_osd; i++)
-    if ((osd_state[i] & CEPH_OSD_EXISTS) &&
-	(osd_state[i] & CEPH_OSD_UP)) n++;
-  return n;
-}
-
-unsigned OSDMap::get_num_in_osds() const
-{
-  unsigned n = 0;
-  for (int i=0; i<max_osd; i++)
-    if ((osd_state[i] & CEPH_OSD_EXISTS) &&
-	get_weight(i) != CEPH_OSD_OUT) n++;
-  return n;
-}
-
 void OSDMap::calc_state_set(int state, set<string>& st)
 {
   unsigned t = state;
diff --git a/src/osd/OSDMap.h b/src/osd/OSDMap.h
index 39e0ef0..e929b72 100644
--- a/src/osd/OSDMap.h
+++ b/src/osd/OSDMap.h
@@ -212,7 +212,10 @@ private:
 
   uint32_t flags;
 
-  int num_osd;         // not saved
+  int num_osd;         // not saved; see calc_num_osds
+  int num_up_osd;      // not saved; see calc_num_osds
+  int num_in_osd;      // not saved; see calc_num_osds
+
   int32_t max_osd;
   vector<uint8_t> osd_state;
 
@@ -265,7 +268,8 @@ private:
   OSDMap() : epoch(0), 
 	     pool_max(-1),
 	     flags(0),
-	     num_osd(0), max_osd(0),
+	     num_osd(0), num_up_osd(0), num_in_osd(0),
+	     max_osd(0),
 	     osd_addrs(new addrs_s),
 	     pg_temp(new map<pg_t,vector<int32_t> >),
 	     primary_temp(new map<pg_t,int32_t>),
@@ -329,12 +333,17 @@ public:
   unsigned get_num_osds() const {
     return num_osd;
   }
+  unsigned get_num_up_osds() const {
+    return num_up_osd;
+  }
+  unsigned get_num_in_osds() const {
+    return num_in_osd;
+  }
+  /// recalculate cached values for get_num{,_up,_in}_osds
   int calc_num_osds();
 
   void get_all_osds(set<int32_t>& ls) const;
   void get_up_osds(set<int32_t>& ls) const;
-  unsigned get_num_up_osds() const;
-  unsigned get_num_in_osds() const;
   unsigned get_num_pg_temp() const {
     return pg_temp->size();
   }
diff --git a/src/osd/OpRequest.cc b/src/osd/OpRequest.cc
index 65011a1..60fff4c 100644
--- a/src/osd/OpRequest.cc
+++ b/src/osd/OpRequest.cc
@@ -84,6 +84,7 @@ void OpRequest::_unregistered() {
 }
 
 bool OpRequest::check_rmw(int flag) {
+  assert(rmw_flags != 0);
   return rmw_flags & flag;
 }
 bool OpRequest::may_read() { return need_read_cap() || need_class_read_cap(); }
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index c13321c..24145f4 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -2039,7 +2039,7 @@ bool PG::queue_scrub()
     state_set(PG_STATE_DEEP_SCRUB);
     scrubber.must_deep_scrub = false;
   }
-  if (scrubber.must_repair) {
+  if (scrubber.must_repair || scrubber.auto_repair) {
     state_set(PG_STATE_REPAIR);
     scrubber.must_repair = false;
   }
@@ -2273,6 +2273,8 @@ void PG::split_into(pg_t child_pgid, PG *child, unsigned split_bits)
   split_ops(child, split_bits);
   _split_into(child_pgid, child, split_bits);
 
+  child->on_new_interval();
+
   child->dirty_info = true;
   child->dirty_big_info = true;
   dirty_info = true;
@@ -2525,7 +2527,6 @@ void PG::publish_stats_to_osd()
 
   utime_t now = ceph_clock_now(cct);
   if (info.stats.state != state) {
-    info.stats.state = state;
     info.stats.last_change = now;
     if ((state & PG_STATE_ACTIVE) &&
 	!(info.stats.state & PG_STATE_ACTIVE))
@@ -2533,6 +2534,7 @@ void PG::publish_stats_to_osd()
     if ((state & (PG_STATE_ACTIVE|PG_STATE_PEERED)) &&
 	!(info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED)))
       info.stats.last_became_peered = now;
+    info.stats.state = state;
   }
 
   _update_calc_stats();
@@ -3243,9 +3245,17 @@ bool PG::sched_scrub()
     return false;
   }
 
-  bool time_for_deep = (ceph_clock_now(cct) >
+  bool time_for_deep = (ceph_clock_now(cct) >=
     info.history.last_deep_scrub_stamp + cct->_conf->osd_deep_scrub_interval);
 
+  bool deep_coin_flip = false;
+  // Only add random deep scrubs when NOT user initiated scrub
+  if (!scrubber.must_scrub)
+      deep_coin_flip = (rand() % 100) < cct->_conf->osd_deep_scrub_randomize_ratio * 100;
+  dout(20) << __func__ << ": time_for_deep=" << time_for_deep << " deep_coin_flip=" << deep_coin_flip << dendl;
+
+  time_for_deep = (time_for_deep || deep_coin_flip);
+
   //NODEEP_SCRUB so ignore time initiated deep-scrub
   if (osd->osd->get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB) ||
       pool.info.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB))
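
The coin flip above upgrades an ordinary scrub to a deep scrub with
probability osd_deep_scrub_randomize_ratio, which spreads deep scrubs out in
time instead of clustering them when the deep interval expires;
user-initiated scrubs are never upgraded. Reduced to a standalone predicate:

    #include <cstdlib>

    // randomize_ratio stands in for osd_deep_scrub_randomize_ratio, a
    // fraction such as 0.15; must_scrub marks a user-initiated scrub
    bool upgrade_to_deep(bool time_for_deep, bool must_scrub,
                         double randomize_ratio) {
      bool deep_coin_flip = false;
      if (!must_scrub)
        deep_coin_flip = (std::rand() % 100) < randomize_ratio * 100;
      return time_for_deep || deep_coin_flip;
    }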
@@ -3260,6 +3270,21 @@ bool PG::sched_scrub()
       return false;
   }
 
+  if (cct->_conf->osd_scrub_auto_repair
+      && get_pgbackend()->auto_repair_supported()
+      && time_for_deep
+      // respect the user's command, and do not auto-repair
+      && !scrubber.must_repair
+      && !scrubber.must_scrub
+      && !scrubber.must_deep_scrub) {
+    dout(20) << __func__ << ": auto repair with deep scrubbing" << dendl;
+    scrubber.auto_repair = true;
+  } else {
+    // this happens when the user issues a scrub/repair command during
+    // the scheduling of the scrub/repair (e.g. while requesting reservations)
+    scrubber.auto_repair = false;
+  }
+
   bool ret = true;
   if (!scrubber.reserved) {
     assert(scrubber.reserved_peers.empty());
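
The auto-repair gate added above only arms when the option is on, the
backend supports it, this pass is already going to be a deep scrub, and no
explicit scrub or repair was requested, so admin commands always win over
the automatic path. As a standalone predicate with illustrative names:

    struct scrub_flags_t {
      bool must_scrub = false, must_deep_scrub = false, must_repair = false;
    };

    // conf_auto_repair stands in for osd_scrub_auto_repair; backend_ok for
    // PGBackend::auto_repair_supported()
    bool arm_auto_repair(bool conf_auto_repair, bool backend_ok,
                         bool time_for_deep, const scrub_flags_t &f) {
      return conf_auto_repair && backend_ok && time_for_deep &&
             !f.must_repair && !f.must_scrub && !f.must_deep_scrub;
    }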
@@ -3568,8 +3593,18 @@ void PG::_scan_snaps(ScrubMap &smap)
     if (hoid.snap < CEPH_MAXSNAP) {
       // fake nlinks for old primaries
       bufferlist bl;
+      if (o.attrs.find(OI_ATTR) == o.attrs.end()) {
+	o.nlinks = 0;
+	continue;
+      }
       bl.push_back(o.attrs[OI_ATTR]);
-      object_info_t oi(bl);
+      object_info_t oi;
+      try {
+	oi = bl;
+      } catch(...) {
+	o.nlinks = 0;
+	continue;
+      }
       if (oi.snaps.empty()) {
 	// Just head
 	o.nlinks = 1;
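
The hunk above makes _scan_snaps() tolerant of a missing or corrupt OI_ATTR:
previously the throwing object_info_t constructor would abort the scrub,
now either failure zeroes nlinks and skips the object. A sketch of the
defensive-decode shape, with a hypothetical decode_object_info() standing in
for the bufferlist decode:

    #include <map>
    #include <string>
    #include <stdexcept>

    struct object_info { int nlinks = 0; };

    // stand-in for the throwing object_info_t(bufferlist) constructor
    object_info decode_object_info(const std::string &raw) {
      if (raw.empty()) throw std::runtime_error("truncated attr");
      object_info oi;
      oi.nlinks = 1;  // pretend we decoded a head object
      return oi;
    }

    int nlinks_for(const std::map<std::string, std::string> &attrs) {
      auto it = attrs.find("oi");   // "oi" stands in for OI_ATTR
      if (it == attrs.end())
        return 0;                   // attr missing: skip this object
      try {
        return decode_object_info(it->second).nlinks;
      } catch (...) {
        return 0;                   // attr corrupt: skip this object
      }
    }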
@@ -4001,12 +4036,14 @@ void PG::chunky_scrub(ThreadPool::TPHandle &handle)
 
         // walk the log to find the latest update that affects our chunk
         scrubber.subset_last_update = pg_log.get_tail();
-        for (list<pg_log_entry_t>::const_iterator p = pg_log.get_log().log.begin();
-             p != pg_log.get_log().log.end();
+        for (list<pg_log_entry_t>::const_reverse_iterator p = pg_log.get_log().log.rbegin();
+             p != pg_log.get_log().log.rend();
              ++p) {
           if (cmp(p->soid, scrubber.start, get_sort_bitwise()) >= 0 &&
-	      cmp(p->soid, scrubber.end, get_sort_bitwise()) < 0)
+	      cmp(p->soid, scrubber.end, get_sort_bitwise()) < 0) {
             scrubber.subset_last_update = p->version;
+            break;
+          }
         }
 
         // ask replicas to wait until last_update_applied >= scrubber.subset_last_update and then scan
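
Walking the log newest-first and breaking on the first entry inside the
chunk yields the same subset_last_update as the old oldest-first walk, where
later matches kept overwriting earlier ones, but it stops as soon as the
answer is known. Schematically, with soid reduced to an int for brevity:

    #include <list>
    #include <cstdint>

    struct log_entry_t { uint64_t version; int soid; };  // simplified

    // newest update touching [start, end); tail is the fallback
    uint64_t subset_last_update(const std::list<log_entry_t> &log,
                                int start, int end, uint64_t tail) {
      for (auto p = log.rbegin(); p != log.rend(); ++p)
        if (p->soid >= start && p->soid < end)
          return p->version;   // first hit from the back is the newest
      return tail;
    }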
@@ -4231,7 +4268,7 @@ void PG::scrub_compare_maps()
   _scrub(authmap, missing_digest);
 }
 
-void PG::scrub_process_inconsistent()
+bool PG::scrub_process_inconsistent()
 {
   dout(10) << __func__ << ": checking authoritative" << dendl;
   bool repair = state_test(PG_STATE_REPAIR);
@@ -4278,19 +4315,27 @@ void PG::scrub_process_inconsistent()
       }
     }
   }
+  return (!scrubber.authoritative.empty() && repair);
 }
 
 // the part that actually finalizes a scrub
 void PG::scrub_finish() 
 {
   bool repair = state_test(PG_STATE_REPAIR);
+  // if the repair request comes from auto-repair and there is a large
+  // number of errors, we would like to cancel the auto-repair
+  if (repair && scrubber.auto_repair
+      && scrubber.authoritative.size() > cct->_conf->osd_scrub_auto_repair_num_errors) {
+    state_clear(PG_STATE_REPAIR);
+    repair = false;
+  }
   bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
   const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
 
   // type-specific finish (can tally more errors)
   _scrub_finish();
 
-  scrub_process_inconsistent();
+  bool has_error = scrub_process_inconsistent();
 
   {
     stringstream oss;
@@ -4358,7 +4403,7 @@ void PG::scrub_finish()
   }
 
 
-  if (repair) {
+  if (has_error) {
     queue_peering_event(
       CephPeeringEvtRef(
 	new CephPeeringEvt(
@@ -5019,6 +5064,8 @@ ostream& operator<<(ostream& out, const PG& pg)
 
   if (pg.scrubber.must_repair)
     out << " MUST_REPAIR";
+  if (pg.scrubber.auto_repair)
+    out << " AUTO_REPAIR";
   if (pg.scrubber.must_deep_scrub)
     out << " MUST_DEEP_SCRUB";
   if (pg.scrubber.must_scrub)
@@ -5048,6 +5095,7 @@ ostream& operator<<(ostream& out, const PG& pg)
 bool PG::can_discard_op(OpRequestRef& op)
 {
   MOSDOp *m = static_cast<MOSDOp*>(op->get_req());
+
   if (OSD::op_is_discardable(m)) {
     dout(20) << " discard " << *m << dendl;
     return true;
@@ -5066,22 +5114,6 @@ bool PG::can_discard_op(OpRequestRef& op)
     return true;
   }
 
-  if ((m->get_flags() & (CEPH_OSD_FLAG_BALANCE_READS |
-			 CEPH_OSD_FLAG_LOCALIZE_READS)) &&
-      op->may_read() &&
-      !(op->may_write() || op->may_cache())) {
-    // balanced reads; any replica will do
-    if (!(is_primary() || is_replica())) {
-      osd->handle_misdirected_op(this, op);
-      return true;
-    }
-  } else {
-    // normal case; must be primary
-    if (!is_primary()) {
-      osd->handle_misdirected_op(this, op);
-      return true;
-    }
-  }
   if (is_replay()) {
     if (m->get_version().version > 0) {
       dout(7) << " queueing replay at " << m->get_version()
diff --git a/src/osd/PG.h b/src/osd/PG.h
index 0ae3879..1c2c31c 100644
--- a/src/osd/PG.h
+++ b/src/osd/PG.h
@@ -1072,6 +1072,7 @@ public:
       active(false), queue_snap_trim(false),
       waiting_on(0), shallow_errors(0), deep_errors(0), fixed(0),
       must_scrub(false), must_deep_scrub(false), must_repair(false),
+      auto_repair(false),
       num_digest_updates_pending(0),
       state(INACTIVE),
       deep(false),
@@ -1100,6 +1101,9 @@ public:
     // flags to indicate explicitly requested scrubs (by admin)
     bool must_scrub, must_deep_scrub, must_repair;
 
+    // this flag indicates whether we should auto-repair the PG
+    bool auto_repair;
+
     // Maps from objects with errors to missing/inconsistent peers
     map<hobject_t, set<pg_shard_t>, hobject_t::BitwiseComparator> missing;
     map<hobject_t, set<pg_shard_t>, hobject_t::BitwiseComparator> inconsistent;
@@ -1188,6 +1192,7 @@ public:
       must_scrub = false;
       must_deep_scrub = false;
       must_repair = false;
+      auto_repair = false;
 
       state = PG::Scrubber::INACTIVE;
       start = hobject_t();
@@ -1218,7 +1223,10 @@ public:
   void scrub(epoch_t queued, ThreadPool::TPHandle &handle);
   void chunky_scrub(ThreadPool::TPHandle &handle);
   void scrub_compare_maps();
-  void scrub_process_inconsistent();
+  /**
+   * return true if any inconsistent or missing object was repaired, false otherwise
+   */
+  bool scrub_process_inconsistent();
   void scrub_finish();
   void scrub_clear_state();
   void _scan_snaps(ScrubMap &map);
diff --git a/src/osd/PGBackend.cc b/src/osd/PGBackend.cc
index f42e6be..7a7f6df 100644
--- a/src/osd/PGBackend.cc
+++ b/src/osd/PGBackend.cc
@@ -410,8 +410,11 @@ enum scrub_error_type PGBackend::be_compare_scrub_objects(
     if (error != CLEAN)
       errorstream << ", ";
     error = SHALLOW_ERROR;
+    bool known = auth.size == be_get_ondisk_size(auth_oi.size);
     errorstream << "size " << candidate.size
-		<< " != known size " << auth.size;
+		<< " != "
+                << (known ? "known" : "best guess")
+                << " size " << auth.size;
   }
   for (map<string,bufferptr>::const_iterator i = auth.attrs.begin();
        i != auth.attrs.end();
diff --git a/src/osd/PGBackend.h b/src/osd/PGBackend.h
index 5259994..1e410c7 100644
--- a/src/osd/PGBackend.h
+++ b/src/osd/PGBackend.h
@@ -406,10 +406,18 @@
        const hobject_t &hoid,         ///< [in] object to write
        map<string, bufferlist> &keys  ///< [in] omap keys, may be cleared
        ) { assert(0); }
+     virtual void omap_setkeys(
+       const hobject_t &hoid,         ///< [in] object to write
+       bufferlist &keys_bl  ///< [in] omap keys, may be cleared
+       ) { assert(0); }
      virtual void omap_rmkeys(
        const hobject_t &hoid,         ///< [in] object to write
        set<string> &keys              ///< [in] omap keys, may be cleared
        ) { assert(0); }
+     virtual void omap_rmkeys(
+       const hobject_t &hoid,         ///< [in] object to write
+       bufferlist &keys_bl            ///< [in] omap keys, may be cleared
+       ) { assert(0); }
      virtual void omap_clear(
        const hobject_t &hoid          ///< [in] object to clear omap
        ) { assert(0); }
@@ -548,6 +556,7 @@
      Context *on_complete, bool fast_read = false) = 0;
 
    virtual bool scrub_supported() { return false; }
+   virtual bool auto_repair_supported() const { return false; }
    void be_scan_list(
      ScrubMap &map, const vector<hobject_t> &ls, bool deep, uint32_t seed,
      ThreadPool::TPHandle &handle);
diff --git a/src/osd/PGLog.cc b/src/osd/PGLog.cc
index 86dbd3a..72520a7 100644
--- a/src/osd/PGLog.cc
+++ b/src/osd/PGLog.cc
@@ -887,7 +887,7 @@ void PGLog::read_log(ObjectStore *store, coll_t pg_coll,
   log.rollback_info_trimmed_to = eversion_t();
   ObjectMap::ObjectMapIterator p = store->get_omap_iterator(log_coll, log_oid);
   if (p) {
-    for (p->seek_to_first(); p->valid() ; p->next()) {
+    for (p->seek_to_first(); p->valid() ; p->next(false)) {
       // non-log pgmeta_oid keys are prefixed with _; skip those
       if (p->key()[0] == '_')
 	continue;
diff --git a/src/osd/PGLog.h b/src/osd/PGLog.h
index 744f318..fd82a70 100644
--- a/src/osd/PGLog.h
+++ b/src/osd/PGLog.h
@@ -299,20 +299,20 @@ struct PGLog {
 
 protected:
   //////////////////// data members ////////////////////
-  bool pg_log_debug;
 
   map<eversion_t, hobject_t> divergent_priors;
   pg_missing_t     missing;
   IndexedLog  log;
 
-  /// Log is clean on [dirty_to, dirty_from)
-  bool touched_log;
   eversion_t dirty_to;         ///< must clear/writeout all keys <= dirty_to
   eversion_t dirty_from;       ///< must clear/writeout all keys >= dirty_from
   eversion_t writeout_from;    ///< must writout keys >= writeout_from
   set<eversion_t> trimmed;     ///< must clear keys in trimmed
-  bool dirty_divergent_priors;
   CephContext *cct;
+  bool pg_log_debug;
+  /// Log is clean on [dirty_to, dirty_from)
+  bool touched_log;
+  bool dirty_divergent_priors;
 
   bool is_dirty() const {
     return !touched_log ||
@@ -375,10 +375,11 @@ protected:
   }
 public:
   PGLog(CephContext *cct = 0) :
+    dirty_from(eversion_t::max()),
+    writeout_from(eversion_t::max()), 
+    cct(cct), 
     pg_log_debug(!(cct && !(cct->_conf->osd_debug_pg_log_writeout))),
-    touched_log(false), dirty_from(eversion_t::max()),
-    writeout_from(eversion_t::max()),
-    dirty_divergent_priors(false), cct(cct) {}
+    touched_log(false), dirty_divergent_priors(false) {}
 
 
   void reset_backfill();
diff --git a/src/osd/ReplicatedBackend.cc b/src/osd/ReplicatedBackend.cc
index 824ce46..099fc64 100644
--- a/src/osd/ReplicatedBackend.cc
+++ b/src/osd/ReplicatedBackend.cc
@@ -398,12 +398,25 @@ public:
       written += p->first.length() + p->second.length();
     return t->omap_setkeys(get_coll(hoid), ghobject_t(hoid), keys);
   }
+  void omap_setkeys(
+    const hobject_t &hoid,
+    bufferlist &keys_bl
+    ) {
+    written += keys_bl.length();
+    return t->omap_setkeys(get_coll(hoid), ghobject_t(hoid), keys_bl);
+  }
   void omap_rmkeys(
     const hobject_t &hoid,
     set<string> &keys
     ) {
     t->omap_rmkeys(get_coll(hoid), ghobject_t(hoid), keys);
   }
+  void omap_rmkeys(
+    const hobject_t &hoid,
+    bufferlist &keys_bl
+    ) {
+    t->omap_rmkeys(get_coll(hoid), ghobject_t(hoid), keys_bl);
+  }
   void omap_clear(
     const hobject_t &hoid
     ) {
@@ -589,8 +602,6 @@ void ReplicatedBackend::submit_transaction(
     &op,
     op_t);
 
-  ObjectStore::Transaction *local_t = new ObjectStore::Transaction;
-  local_t->set_use_tbl(op_t->get_use_tbl());
   if (!(t->get_temp_added().empty())) {
     add_temp_objs(t->get_temp_added());
   }
@@ -602,7 +613,7 @@ void ReplicatedBackend::submit_transaction(
     trim_to,
     trim_rollback_to,
     true,
-    local_t);
+    op_t);
   
   op_t->register_on_applied_sync(on_local_applied_sync);
   op_t->register_on_applied(
@@ -610,14 +621,11 @@ void ReplicatedBackend::submit_transaction(
       new C_OSD_OnOpApplied(this, &op)));
   op_t->register_on_applied(
     new ObjectStore::C_DeleteTransaction(op_t));
-  op_t->register_on_applied(
-    new ObjectStore::C_DeleteTransaction(local_t));
   op_t->register_on_commit(
     parent->bless_context(
       new C_OSD_OnOpCommit(this, &op)));
 
   list<ObjectStore::Transaction*> tls;
-  tls.push_back(local_t);
   tls.push_back(op_t);
   parent->queue_transactions(tls, op.op);
   delete t;
@@ -765,6 +773,7 @@ void ReplicatedBackend::be_deep_scrub(
     dout(25) << __func__ << "  " << poid << " got "
 	     << r << " on read, read_error" << dendl;
     o.read_error = true;
+    return;
   }
   o.digest = h.digest();
   o.digest_present = true;
@@ -792,6 +801,7 @@ void ReplicatedBackend::be_deep_scrub(
     dout(25) << __func__ << "  " << poid << " got "
 	     << r << " on omap header read, read_error" << dendl;
     o.read_error = true;
+    return;
   }
 
   ObjectMap::ObjectMapIterator iter = store->get_omap_iterator(
@@ -800,7 +810,7 @@ void ReplicatedBackend::be_deep_scrub(
       poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
   assert(iter);
   uint64_t keys_scanned = 0;
-  for (iter->seek_to_first(); iter->valid() ; iter->next()) {
+  for (iter->seek_to_first(); iter->valid() ; iter->next(false)) {
     if (cct->_conf->osd_scan_list_ping_tp_interval &&
 	(keys_scanned % cct->_conf->osd_scan_list_ping_tp_interval == 0)) {
       handle.reset_tp_timeout();
@@ -819,8 +829,8 @@ void ReplicatedBackend::be_deep_scrub(
     dout(25) << __func__ << "  " << poid << " got "
 	     << r << " on omap scan, read_error" << dendl;
     o.read_error = true;
+    return;
   }
-
   //Store final calculated CRC32 of omap header & key/values
   o.omap_digest = oh.digest();
   o.omap_digest_present = true;
@@ -2021,7 +2031,7 @@ int ReplicatedBackend::build_push_op(const ObjectRecoveryInfo &recovery_info,
 			       ghobject_t(recovery_info.soid));
     for (iter->lower_bound(progress.omap_recovered_to);
 	 iter->valid();
-	 iter->next()) {
+	 iter->next(false)) {
       if (!out_op->omap_entries.empty() &&
 	  available <= (iter->key().size() + iter->value().length()))
 	break;
diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc
index 7d5946b..64c77a4 100644
--- a/src/osd/ReplicatedPG.cc
+++ b/src/osd/ReplicatedPG.cc
@@ -1351,10 +1351,6 @@ void ReplicatedPG::do_request(
   OpRequestRef& op,
   ThreadPool::TPHandle &handle)
 {
-  if (!op_has_sufficient_caps(op)) {
-    osd->reply_op_error(op, -EPERM);
-    return;
-  }
   assert(!op_must_wait_for_map(get_osdmap()->get_epoch(), op));
   if (can_discard_request(op)) {
     return;
@@ -1488,6 +1484,35 @@ void ReplicatedPG::do_op(OpRequestRef& op)
 {
   MOSDOp *m = static_cast<MOSDOp*>(op->get_req());
   assert(m->get_type() == CEPH_MSG_OSD_OP);
+
+  m->finish_decode();
+  m->clear_payload();
+
+  if (op->rmw_flags == 0) {
+    int r = osd->osd->init_op_flags(op);
+    if (r) {
+      osd->reply_op_error(op, r);
+      return;
+    }
+  }
+
+  if ((m->get_flags() & (CEPH_OSD_FLAG_BALANCE_READS |
+			 CEPH_OSD_FLAG_LOCALIZE_READS)) &&
+      op->may_read() &&
+      !(op->may_write() || op->may_cache())) {
+    // balanced reads; any replica will do
+    if (!(is_primary() || is_replica())) {
+      osd->handle_misdirected_op(this, op);
+      return;
+    }
+  } else {
+    // normal case; must be primary
+    if (!is_primary()) {
+      osd->handle_misdirected_op(this, op);
+      return;
+    }
+  }
+
   if (op->includes_pg_op()) {
     if (pg_op_must_wait(m)) {
       wait_for_all_missing(op);
@@ -1496,6 +1521,22 @@ void ReplicatedPG::do_op(OpRequestRef& op)
     return do_pg_op(op);
   }
 
+  if (!op_has_sufficient_caps(op)) {
+    osd->reply_op_error(op, -EPERM);
+    return;
+  }
+
+  // object name too long?
+  unsigned max_name_len = MIN(g_conf->osd_max_object_name_len,
+                              osd->osd->store->get_max_object_name_length());
+  if (m->get_oid().name.size() > max_name_len) {
+    dout(4) << "do_op '" << m->get_oid().name << "' is longer than "
+            << max_name_len << " bytes" << dendl;
+    osd->reply_op_error(op, -ENAMETOOLONG);
+    return;
+  }
+
+  // blacklisted?
   if (get_osdmap()->is_blacklisted(m->get_source_addr())) {
     dout(10) << "do_op " << m->get_source_addr() << " is blacklisted" << dendl;
     osd->reply_op_error(op, -EBLACKLISTED);
@@ -1520,6 +1561,31 @@ void ReplicatedPG::do_op(OpRequestRef& op)
 	     << dendl;
     return;
   }
+  int64_t poolid = get_pgid().pool();
+  if (op->may_write()) {
+
+    const pg_pool_t *pi = get_osdmap()->get_pg_pool(poolid);
+    if (!pi) {
+      return;
+    }
+
+    // invalid?
+    if (m->get_snapid() != CEPH_NOSNAP) {
+      osd->reply_op_error(op, -EINVAL);
+      return;
+    }
+
+    // too big?
+    if (cct->_conf->osd_max_write_size &&
+        m->get_data_len() > cct->_conf->osd_max_write_size << 20) {
+      // journal can't hold commit!
+      derr << "do_op msg data len " << m->get_data_len()
+           << " > osd_max_write_size " << (cct->_conf->osd_max_write_size << 20)
+           << " on " << *m << dendl;
+      osd->reply_op_error(op, -OSD_WRITETOOBIG);
+      return;
+    }
+  }
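
The new write guards reject over-long object names with -ENAMETOOLONG and
oversized writes with -OSD_WRITETOOBIG before anything reaches the journal.
osd_max_write_size is configured in megabytes, hence the << 20 (1 MB ==
1 << 20 bytes) when comparing it against the payload length:

    #include <cstdint>

    // max_write_size_mb stands in for osd_max_write_size (in MB)
    bool write_too_big(uint64_t data_len, uint64_t max_write_size_mb) {
      return max_write_size_mb && data_len > (max_write_size_mb << 20);
    }
    // write_too_big(95ull << 20, 90) == true: a 95 MB payload exceeds a
    // 90 MB limit and would be rejected before touching the journal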
 
   // order this op as a write?
   bool write_ordered =
@@ -1666,8 +1732,8 @@ void ReplicatedPG::do_op(OpRequestRef& op)
     // CEPH_OSD_FLAG_LOCALIZE_READS set, we just return -EAGAIN. Otherwise,
     // we have to wait for the object.
     if (is_primary() ||
-	(!(m->has_flag(CEPH_OSD_FLAG_BALANCE_READS) &&
-	 !(m->has_flag(CEPH_OSD_FLAG_LOCALIZE_READS))))) {
+	(!(m->has_flag(CEPH_OSD_FLAG_BALANCE_READS)) &&
+	 !(m->has_flag(CEPH_OSD_FLAG_LOCALIZE_READS)))) {
       // missing the specific snap we need; requeue and wait.
       assert(!op->may_write()); // only happens on a read/cache
       wait_for_unreadable_object(missing_oid, op);
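
The two-line change above fixes the parenthesization of the
balanced/localized-reads check: the intent is "wait here if we are primary,
or if neither flag is set", but the old grouping negated the AND of the two
flags instead. Reduced to plain booleans:

    bool wait_here(bool is_primary, bool balance, bool localize) {
      // old: is_primary || (!(balance && !localize))  -- negates the AND
      // new: is_primary || (!balance && !localize)    -- neither flag set
      return is_primary || (!balance && !localize);
    }
    // balance=false, localize=true, is_primary=false:
    //   old grouping -> true  (waits for the object)
    //   fixed form   -> false (falls through to the -EAGAIN path)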
@@ -3126,13 +3192,19 @@ ReplicatedPG::RepGather *ReplicatedPG::trim_object(const hobject_t &coid)
 
   object_info_t &coi = obc->obs.oi;
   set<snapid_t> old_snaps(coi.snaps.begin(), coi.snaps.end());
-  assert(old_snaps.size());
+  if (old_snaps.empty()) {
+    osd->clog->error() << __func__ << " No object info snaps for " << coid << "\n";
+    return NULL;
+  }
 
   SnapSet& snapset = obc->ssc->snapset;
 
   dout(10) << coid << " old_snaps " << old_snaps
 	   << " old snapset " << snapset << dendl;
-  assert(snapset.seq);
+  if (snapset.seq == 0) {
+    osd->clog->error() << __func__ << " No snapset.seq for " << coid << "\n";
+    return NULL;
+  }
 
   RepGather *repop = simple_repop_create(obc);
   OpContext *ctx = repop->ctx;
@@ -3161,7 +3233,11 @@ ReplicatedPG::RepGather *ReplicatedPG::trim_object(const hobject_t &coid)
     for (p = snapset.clones.begin(); p != snapset.clones.end(); ++p)
       if (*p == last)
 	break;
-    assert(p != snapset.clones.end());
+    if (p == snapset.clones.end()) {
+      osd->clog->error() << __func__ << " Snap " << coid.snap << " not in clones" << "\n";
+      return NULL;
+    }
+
     ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(last);
 
     if (p != snapset.clones.begin()) {
@@ -3190,6 +3266,8 @@ ReplicatedPG::RepGather *ReplicatedPG::trim_object(const hobject_t &coid)
       ctx->delta_stats.num_whiteouts--;
     }
     ctx->delta_stats.num_object_clones--;
+    if (coi.is_cache_pinned())
+      ctx->delta_stats.num_objects_pinned--;
     obc->obs.exists = false;
 
     snapset.clones.erase(p);
@@ -3346,7 +3424,6 @@ void ReplicatedPG::snap_trimmer(epoch_t queued)
   snap_trim_queued = false;
   dout(10) << "snap_trimmer entry" << dendl;
   if (is_primary()) {
-    entity_inst_t nobody;
     if (scrubber.active) {
       dout(10) << " scrubbing, will requeue snap_trimmer after" << dendl;
       scrubber.queue_snap_trim = true;
@@ -3848,6 +3925,8 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
     case CEPH_OSD_OP_CACHE_TRY_FLUSH:
     case CEPH_OSD_OP_UNDIRTY:
     case CEPH_OSD_OP_COPY_FROM:  // we handle user_version update explicitly
+    case CEPH_OSD_OP_CACHE_PIN:
+    case CEPH_OSD_OP_CACHE_UNPIN:
       break;
     default:
       if (op.op & CEPH_OSD_OP_MODE_WR)
@@ -4237,6 +4316,11 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
 	  result = 0;
 	  break;
 	}
+	if (oi.is_cache_pinned()) {
+	  dout(10) << "cache-try-flush on a pinned object, consider unpinning this object first" << dendl;
+	  result = -EPERM;
+	  break;
+	}
 	if (oi.is_dirty()) {
 	  result = start_flush(ctx->op, ctx->obc, false, NULL, NULL);
 	  if (result == -EINPROGRESS)
@@ -4264,6 +4348,11 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
 	  result = 0;
 	  break;
 	}
+	if (oi.is_cache_pinned()) {
+	  dout(10) << "cache-flush on a pinned object, consider unpinning this object first" << dendl;
+	  result = -EPERM;
+	  break;
+	}
 	hobject_t missing;
 	if (oi.is_dirty()) {
 	  result = start_flush(ctx->op, ctx->obc, true, &missing, NULL);
@@ -4295,6 +4384,11 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
 	  result = 0;
 	  break;
 	}
+	if (oi.is_cache_pinned()) {
+	  dout(10) << "cache-evict on a pinned object, consider unpinning this object first" << dendl;
+	  result = -EPERM;
+	  break;
+	}
 	if (oi.is_dirty()) {
 	  result = -EBUSY;
 	  break;
@@ -4846,9 +4940,13 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
 	    // category is no longer implemented.
 	  }
           if (result >= 0) {
+	    bool is_whiteout = obs.exists && oi.is_whiteout();
 	    if (maybe_create_new_object(ctx)) {
-              ctx->mod_desc.create();
+	      ctx->mod_desc.create();
 	      t->touch(soid);
+	    } else if (is_whiteout) {
+	      // changing a whiteout to a non-whiteout needs an op to update the xattr
+	      t->nop();
 	    }
           }
 	}
@@ -5020,6 +5118,56 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
       }
       break;
 
+    case CEPH_OSD_OP_CACHE_PIN:
+      tracepoint(osd, do_osd_op_pre_cache_pin, soid.oid.name.c_str(), soid.snap.val);
+      if ((!pool.info.is_tier() ||
+	  pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)) {
+        result = -EINVAL;
+        dout(10) << " pinning an object is only allowed on the cache tier " << dendl;
+        break;
+      }
+      ++ctx->num_write;
+      {
+	if (!obs.exists || oi.is_whiteout()) {
+	  result = -ENOENT;
+	  break;
+	}
+
+	if (!oi.is_cache_pinned()) {
+	  oi.set_flag(object_info_t::FLAG_CACHE_PIN);
+	  ctx->modify = true;
+	  ctx->delta_stats.num_objects_pinned++;
+	  ctx->delta_stats.num_wr++;
+	}
+	result = 0;
+      }
+      break;
+
+    case CEPH_OSD_OP_CACHE_UNPIN:
+      tracepoint(osd, do_osd_op_pre_cache_unpin, soid.oid.name.c_str(), soid.snap.val);
+      if ((!pool.info.is_tier() ||
+	  pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)) {
+        result = -EINVAL;
+        dout(10) << " unpinning an object is only allowed on the cache tier " << dendl;
+        break;
+      }
+      ++ctx->num_write;
+      {
+	if (!obs.exists || oi.is_whiteout()) {
+	  result = -ENOENT;
+	  break;
+	}
+
+	if (oi.is_cache_pinned()) {
+	  oi.clear_flag(object_info_t::FLAG_CACHE_PIN);
+	  ctx->modify = true;
+	  ctx->delta_stats.num_objects_pinned--;
+	  ctx->delta_stats.num_wr++;
+	}
+	result = 0;
+      }
+      break;
+
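
Both handlers above are idempotent flag flips on the object info:
re-pinning a pinned object (or unpinning an unpinned one) succeeds without
touching the stats, and num_objects_pinned only moves when the flag actually
changes. A condensed sketch with simplified stand-in types:

    struct object_info {
      enum { FLAG_CACHE_PIN = 1 };
      unsigned flags = 0;
      bool is_cache_pinned() const { return flags & FLAG_CACHE_PIN; }
    };

    struct delta_stats_t { long num_objects_pinned = 0, num_wr = 0; };

    int cache_pin(object_info &oi, delta_stats_t &st, bool pin) {
      if (pin && !oi.is_cache_pinned()) {
        oi.flags |= object_info::FLAG_CACHE_PIN;
        ++st.num_objects_pinned;
        ++st.num_wr;
      } else if (!pin && oi.is_cache_pinned()) {
        oi.flags &= ~object_info::FLAG_CACHE_PIN;
        --st.num_objects_pinned;
        ++st.num_wr;
      }
      return 0;  // a redundant pin/unpin is a no-op, not an error
    }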
 
       // -- object attrs --
       
@@ -5192,7 +5340,8 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
 	  ::encode(m, newbl);
 	  newop.indata = newbl;
 	}
-	do_osd_ops(ctx, nops);
+	result = do_osd_ops(ctx, nops);
+	assert(result == 0);
       }
       break;
 
@@ -5238,7 +5387,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
 	  iter->upper_bound(start_after);
 	  for (uint64_t i = 0;
 	       i < max_return && iter->valid();
-	       ++i, iter->next()) {
+	       ++i, iter->next(false)) {
 	    out_set.insert(iter->key());
 	  }
 	} // else return empty out_set
@@ -5280,7 +5429,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
 	  for (uint64_t i = 0;
 	       i < max_return && iter->valid() &&
 		 iter->key().substr(0, filter_prefix.size()) == filter_prefix;
-	       ++i, iter->next()) {
+	       ++i, iter->next(false)) {
 	    dout(20) << "Found key " << iter->key() << dendl;
 	    out_set.insert(make_pair(iter->key(), iter->value()));
 	  }
@@ -5406,7 +5555,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
     case CEPH_OSD_OP_OMAPSETVALS:
       if (!pool.info.supports_omap()) {
 	result = -EOPNOTSUPP;
-	tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val, "???");
+	tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
 	break;
       }
       ctx->mod_desc.mark_unrollbackable();
@@ -5415,23 +5564,28 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
 	if (maybe_create_new_object(ctx)) {
 	  t->touch(soid);
 	}
-	map<string, bufferlist> to_set;
+	bufferlist to_set_bl;
 	try {
-	  ::decode(to_set, bp);
+	  decode_str_str_map_to_bl(bp, &to_set_bl);
 	}
 	catch (buffer::error& e) {
 	  result = -EINVAL;
-	  tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val, "???");
+	  tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
 	  goto fail;
 	}
-	tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val, list_keys(to_set).c_str());
-	dout(20) << "setting vals: " << dendl;
-	for (map<string, bufferlist>::iterator i = to_set.begin();
-	     i != to_set.end();
-	     ++i) {
-	  dout(20) << "\t" << i->first << dendl;
+	tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
+	if (g_ceph_context->_conf->subsys.should_gather(dout_subsys, 20)) {
+	  dout(20) << "setting vals: " << dendl;
+	  map<string,bufferlist> to_set;
+	  bufferlist::iterator pt = to_set_bl.begin();
+	  ::decode(to_set, pt);
+	  for (map<string, bufferlist>::iterator i = to_set.begin();
+	       i != to_set.end();
+	       ++i) {
+	    dout(20) << "\t" << i->first << dendl;
+	  }
 	}
-	t->omap_setkeys(soid, to_set);
+	t->omap_setkeys(soid, to_set_bl);
 	ctx->delta_stats.num_wr++;
       }
       obs.oi.set_flag(object_info_t::FLAG_OMAP);
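
decode_str_str_map_to_bl() captures the still-encoded key/value bytes so
they can be handed to the objectstore transaction as-is; the full
map<string,bufferlist> decode now only happens when debug level 20 wants to
print the keys. A sketch of the pass-through idea, using a simplified
framing (u32 pair count, then length-prefixed strings) that merely stands in
for the real Ceph encoding:

    #include <cstring>
    #include <string>
    #include <cstdint>
    #include <stdexcept>

    static uint32_t get_u32(const std::string &in, std::size_t &off) {
      if (off + 4 > in.size()) throw std::runtime_error("short read");
      uint32_t v;
      std::memcpy(&v, in.data() + off, 4);
      off += 4;
      return v;
    }

    // validate one encoded map<string,string> and return it still encoded
    std::string str_str_map_to_bl(const std::string &in) {
      std::size_t off = 0;
      uint32_t n = get_u32(in, off);          // number of key/value pairs
      for (uint64_t i = 0; i < uint64_t(n) * 2; ++i) {  // n keys + n values
        uint32_t len = get_u32(in, off);
        if (off + len > in.size()) throw std::runtime_error("short read");
        off += len;                           // skip payload, never copy it
      }
      return in.substr(0, off);               // raw bytes go straight through
    }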
@@ -5481,7 +5635,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
     case CEPH_OSD_OP_OMAPRMKEYS:
       if (!pool.info.supports_omap()) {
 	result = -EOPNOTSUPP;
-	tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val, "???");
+	tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
 	break;
       }
       ctx->mod_desc.mark_unrollbackable();
@@ -5489,21 +5643,21 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
       {
 	if (!obs.exists || oi.is_whiteout()) {
 	  result = -ENOENT;
-	  tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val, "???");
+	  tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
 	  break;
 	}
 	t->touch(soid);
-	set<string> to_rm;
+	bufferlist to_rm_bl;
 	try {
-	  ::decode(to_rm, bp);
+	  decode_str_set_to_bl(bp, &to_rm_bl);
 	}
 	catch (buffer::error& e) {
 	  result = -EINVAL;
-	  tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val, "???");
+	  tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
 	  goto fail;
 	}
-	tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val, list_entries(to_rm).c_str());
-	t->omap_rmkeys(soid, to_rm);
+	tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
+	t->omap_rmkeys(soid, to_rm_bl);
 	ctx->delta_stats.num_wr++;
       }
       obs.oi.set_flag(object_info_t::FLAG_OMAP);
@@ -6002,6 +6156,8 @@ void ReplicatedPG::make_writeable(OpContext *ctx)
     }
     if (snap_oi->is_omap())
       ctx->delta_stats.num_objects_omap++;
+    if (snap_oi->is_cache_pinned())
+      ctx->delta_stats.num_objects_pinned++;
     ctx->delta_stats.num_object_clones++;
     ctx->new_snapset.clones.push_back(coid.snap);
     ctx->new_snapset.clone_size[coid.snap] = ctx->obs->oi.size;
@@ -6674,7 +6830,7 @@ int ReplicatedPG::fill_in_copy_get(
 	osd->store->get_omap_iterator(coll, ghobject_t(oi.soid));
       assert(iter);
       iter->upper_bound(cursor.omap_offset);
-      for (; iter->valid(); iter->next()) {
+      for (; iter->valid(); iter->next(false)) {
 	++omap_keys;
 	::encode(iter->key(), omap_data);
 	::encode(iter->value(), omap_data);
@@ -8340,7 +8496,6 @@ void ReplicatedPG::handle_watch_timeout(WatchRef watch)
     OpContext::watch_disconnect_t(watch->get_cookie(), watch->get_entity(), true));
 
 
-  entity_inst_t nobody;
   PGBackend::PGTransaction *t = ctx->op_t;
   ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::MODIFY, obc->obs.oi.soid,
 				    ctx->at_version,
@@ -8427,7 +8582,7 @@ ObjectContextRef ReplicatedPG::get_object_context(const hobject_t& soid,
 	// new object.
 	object_info_t oi(soid);
 	SnapSetContext *ssc = get_snapset_context(
-	  soid, true, 0);
+	  soid, true, 0, false);
 	obc = create_object_context(oi, ssc);
 	dout(10) << __func__ << ": " << obc << " " << soid
 		 << " " << obc->rwstate
@@ -8525,9 +8680,6 @@ int ReplicatedPG::find_object_context(const hobject_t& oid,
        << dendl;
     *pobc = obc;
 
-    if (can_create && !obc->ssc)
-      obc->ssc = get_snapset_context(oid, true);
-
     return 0;
   }
 
@@ -8745,6 +8897,8 @@ void ReplicatedPG::add_object_context_to_pg_stat(ObjectContextRef obc, pg_stat_t
     stat.num_whiteouts++;
   if (oi.is_omap())
     stat.num_objects_omap++;
+  if (oi.is_cache_pinned())
+    stat.num_objects_pinned++;
 
   if (oi.soid.snap && oi.soid.snap != CEPH_NOSNAP && oi.soid.snap != CEPH_SNAPDIR) {
     stat.num_object_clones++;
@@ -8807,7 +8961,8 @@ SnapSetContext *ReplicatedPG::create_snapset_context(const hobject_t& oid)
 SnapSetContext *ReplicatedPG::get_snapset_context(
   const hobject_t& oid,
   bool can_create,
-  map<string, bufferlist> *attrs)
+  map<string, bufferlist> *attrs,
+  bool oid_existed)
 {
   Mutex::Locker l(snapset_contexts_lock);
   SnapSetContext *ssc;
@@ -8822,9 +8977,12 @@ SnapSetContext *ReplicatedPG::get_snapset_context(
   } else {
     bufferlist bv;
     if (!attrs) {
-      int r = pgbackend->objects_get_attr(oid.get_head(), SS_ATTR, &bv);
+      int r = -ENOENT;
+      if (!(oid.is_head() && !oid_existed))
+	r = pgbackend->objects_get_attr(oid.get_head(), SS_ATTR, &bv);
       if (r < 0) {
 	// try _snapset
+      if (!(oid.is_snapdir() && !oid_existed))
 	r = pgbackend->objects_get_attr(oid.get_snapdir(), SS_ATTR, &bv);
 	if (r < 0 && !can_create)
 	  return NULL;
@@ -11368,7 +11526,6 @@ bool ReplicatedPG::agent_work(int start_max, int agent_flush_quota)
   if (++agent_state->hist_age > g_conf->osd_agent_hist_halflife) {
     dout(20) << __func__ << " resetting atime and temp histograms" << dendl;
     agent_state->hist_age = 0;
-    agent_state->atime_hist.decay();
     agent_state->temp_hist.decay();
   }
 
@@ -11485,6 +11642,11 @@ bool ReplicatedPG::agent_maybe_flush(ObjectContextRef& obc)
     osd->logger->inc(l_osd_agent_skip);
     return false;
   }
+  if (obc->obs.oi.is_cache_pinned()) {
+    dout(20) << __func__ << " skip (cache_pinned) " << obc->obs.oi << dendl;
+    osd->logger->inc(l_osd_agent_skip);
+    return false;
+  }
 
   utime_t now = ceph_clock_now(NULL);
   utime_t ob_local_mtime;
@@ -11555,6 +11717,10 @@ bool ReplicatedPG::agent_maybe_evict(ObjectContextRef& obc)
     dout(20) << __func__ << " skip (blocked) " << obc->obs.oi << dendl;
     return false;
   }
+  if (obc->obs.oi.is_cache_pinned()) {
+    dout(20) << __func__ << " skip (cache_pinned) " << obc->obs.oi << dendl;
+    return false;
+  }
 
   if (soid.snap == CEPH_NOSNAP) {
     int result = _verify_no_head_clones(soid, obc->ssc->snapset);
@@ -11566,42 +11732,15 @@ bool ReplicatedPG::agent_maybe_evict(ObjectContextRef& obc)
 
   if (agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL) {
     // is this object old and/or cold enough?
-    int atime = -1, temp = 0;
+    int temp = 0;
+    uint64_t temp_upper = 0, temp_lower = 0;
     if (hit_set)
-      agent_estimate_atime_temp(soid, &atime, NULL /*FIXME &temp*/);
-
-    uint64_t atime_upper = 0, atime_lower = 0;
-    if (atime < 0 && obc->obs.oi.mtime != utime_t()) {
-      if (obc->obs.oi.local_mtime != utime_t()) {
-        atime = ceph_clock_now(NULL).sec() - obc->obs.oi.local_mtime;
-      } else {
-        atime = ceph_clock_now(NULL).sec() - obc->obs.oi.mtime;
-      }
-    }
-    if (atime < 0) {
-      if (hit_set) {
-        atime = pool.info.hit_set_period * pool.info.hit_set_count; // "infinite"
-      } else {
-	atime_upper = 1000000;
-      }
-    }
-    if (atime >= 0) {
-      agent_state->atime_hist.add(atime);
-      agent_state->atime_hist.get_position_micro(atime, &atime_lower,
-						 &atime_upper);
-    }
-
-    unsigned temp_upper = 0, temp_lower = 0;
-    /*
-    // FIXME: bound atime based on creation time?
-    agent_state->temp_hist.add(atime);
+      agent_estimate_temp(soid, &temp);
+    agent_state->temp_hist.add(temp);
     agent_state->temp_hist.get_position_micro(temp, &temp_lower, &temp_upper);
-    */
 
     dout(20) << __func__
-	     << " atime " << atime
-	     << " pos " << atime_lower << "-" << atime_upper
-	     << ", temp " << temp
+	     << " temp " << temp
 	     << " pos " << temp_lower << "-" << temp_upper
 	     << ", evict_effort " << agent_state->evict_effort
 	     << dendl;
@@ -11614,9 +11753,7 @@ bool ReplicatedPG::agent_maybe_evict(ObjectContextRef& obc)
     delete f;
     *_dout << dendl;
 
-    // FIXME: ignore temperature for now.
-
-    if (1000000 - atime_upper >= agent_state->evict_effort)
+    if (1000000 - temp_upper >= agent_state->evict_effort)
       return false;
   }
 
@@ -11900,32 +12037,21 @@ bool ReplicatedPG::agent_choose_mode(bool restart, OpRequestRef op)
   return requeued;
 }
 
-void ReplicatedPG::agent_estimate_atime_temp(const hobject_t& oid,
-					     int *atime, int *temp)
+void ReplicatedPG::agent_estimate_temp(const hobject_t& oid, int *temp)
 {
   assert(hit_set);
-  *atime = -1;
-  if (temp)
-    *temp = 0;
-  if (hit_set->contains(oid)) {
-    *atime = 0;
-    if (temp)
-      ++(*temp);
-    else
-      return;
-  }
-  time_t now = ceph_clock_now(NULL).sec();
+  assert(temp);
+  *temp = 0;
+  if (hit_set->contains(oid))
+    *temp = 1000000;
+  unsigned i = 0;
+  int last_n = pool.info.hit_set_search_last_n;
   for (map<time_t,HitSetRef>::reverse_iterator p =
-	 agent_state->hit_set_map.rbegin();
-       p != agent_state->hit_set_map.rend();
-       ++p) {
+       agent_state->hit_set_map.rbegin(); last_n > 0 &&
+       p != agent_state->hit_set_map.rend(); ++p, ++i) {
     if (p->second->contains(oid)) {
-      if (*atime < 0)
-	*atime = now - p->first;
-      if (temp)
-	++(*temp);
-      else
-	return;
+      *temp += pool.info.get_grade(i);
+      --last_n;
     }
   }
 }
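
agent_estimate_temp() replaces the old atime estimate: a hit in the current
hit set scores the maximum, and hits in up to hit_set_search_last_n
historical sets add a per-age grade, so recently and repeatedly accessed
objects read as hotter. A sketch with a simple linear falloff standing in
for pg_pool_t::get_grade():

    #include <vector>

    // linear falloff stand-in for pg_pool_t::get_grade(); newest set is age 0
    static int get_grade(unsigned age, unsigned n) {
      return age < n ? int((n - age) * (1000000 / n)) : 0;
    }

    int estimate_temp(bool in_current_hit_set,
                      const std::vector<bool> &history,  // newest-first
                      int last_n) {            // hit_set_search_last_n
      int temp = in_current_hit_set ? 1000000 : 0;
      unsigned age = 0;
      for (auto hit = history.begin();
           last_n > 0 && hit != history.end(); ++hit, ++age) {
        if (*hit) {
          temp += get_grade(age, (unsigned)history.size());
          --last_n;   // only hits consume the search budget
        }
      }
      return temp;
    }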
@@ -11971,6 +12097,85 @@ void ReplicatedPG::_scrub_digest_updated()
   }
 }
 
+static bool doing_clones(const boost::optional<SnapSet> &snapset,
+			 const vector<snapid_t>::reverse_iterator &curclone) {
+    return snapset && curclone != snapset.get().clones.rend();
+}
+
+void ReplicatedPG::log_missing(unsigned missing,
+			const boost::optional<hobject_t> &head,
+			LogChannelRef clog,
+			const spg_t &pgid,
+			const char *func,
+			const char *mode,
+			bool allow_incomplete_clones)
+{
+  assert(head);
+  if (allow_incomplete_clones) {
+    dout(20) << func << " " << mode << " " << pgid << " " << head.get()
+               << " skipped " << missing << " clone(s) in cache tier" << dendl;
+  } else {
+    clog->info() << mode << " " << pgid << " " << head.get()
+		       << " " << missing << " missing clone(s)";
+  }
+}
+
+unsigned ReplicatedPG::process_clones_to(const boost::optional<hobject_t> &head,
+  const boost::optional<SnapSet> &snapset,
+  LogChannelRef clog,
+  const spg_t &pgid,
+  const char *mode,
+  bool allow_incomplete_clones,
+  boost::optional<snapid_t> target,
+  vector<snapid_t>::reverse_iterator *curclone)
+{
+  assert(head);
+  assert(snapset);
+  unsigned missing = 0;
+
+  // NOTE: clones are in descending order, hence the **curclone > target test here
+  hobject_t next_clone(head.get());
+  while(doing_clones(snapset, *curclone) && (!target || **curclone > *target)) {
+    ++missing;
+    // it is okay to be missing one or more clones in a cache tier.
+    // skip higher-numbered clones in the list.
+    if (!allow_incomplete_clones) {
+      next_clone.snap = **curclone;
+      clog->error() << mode << " " << pgid << " " << head.get()
+			 << " expected clone " << next_clone;
+      ++scrubber.shallow_errors;
+    }
+    // Clones are descending
+    ++(*curclone);
+  }
+  return missing;
+}
+
+/*
+ * Validate consistency of the object info and snap sets.
+ *
+ * We are effectively comparing two lists. The main loop is over
+ * objmap.objects, but each object is compared against multiple
+ * snapset.clones. There are multiple clone lists, and in between lists we
+ * expect head or snapdir.
+ *
+ * Example
+ *
+ * objects              expected
+ * =======              =======
+ * obj1 snap 1          head/snapdir, unexpected obj1 snap 1
+ * obj2 head            head/snapdir, head ok
+ *              [SnapSet clones 6 4 2 1]
+ * obj2 snap 7          obj2 snap 6, unexpected obj2 snap 7
+ * obj2 snap 6          obj2 snap 6, match
+ * obj2 snap 4          obj2 snap 4, match
+ * obj3 head            obj2 snap 2 (expected), obj2 snap 1 (expected), head ok
+ *              [Snapset clones 3 1]
+ * obj3 snap 3          obj3 snap 3 match
+ * obj3 snap 1          obj3 snap 1 match
+ * obj4 snapdir         head/snapdir, snapdir ok
+ *              [Snapset clones 4]
+ * EOL                  obj4 snap 4, (expected)
+ */
 void ReplicatedPG::_scrub(
   ScrubMap &scrubmap,
   const map<hobject_t, pair<uint32_t, uint32_t>, hobject_t::BitwiseComparator> &missing_digest)
@@ -11981,191 +12186,262 @@ void ReplicatedPG::_scrub(
   bool repair = state_test(PG_STATE_REPAIR);
   bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
   const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
+  boost::optional<snapid_t> all_clones;   // Unspecified snapid_t or boost::none
 
   // traverse in reverse order.
-  hobject_t head;
-  SnapSet snapset;
-  vector<snapid_t>::reverse_iterator curclone;
-  hobject_t next_clone;
+  boost::optional<hobject_t> head;
+  boost::optional<SnapSet> snapset; // If initialized, head (above) will be too
+  vector<snapid_t>::reverse_iterator curclone; // Defined only if snapset initialized
+  unsigned missing = 0;
 
   bufferlist last_data;
 
-  for (map<hobject_t,ScrubMap::object, hobject_t::BitwiseComparator>::reverse_iterator p = scrubmap.objects.rbegin();
-       p != scrubmap.objects.rend(); 
-       ++p) {
+  for (map<hobject_t,ScrubMap::object, hobject_t::BitwiseComparator>::reverse_iterator
+       p = scrubmap.objects.rbegin(); p != scrubmap.objects.rend(); ++p) {
     const hobject_t& soid = p->first;
     object_stat_sum_t stat;
-    if (soid.snap != CEPH_SNAPDIR)
+    boost::optional<object_info_t> oi;
+
+    if (!soid.is_snapdir())
       stat.num_objects++;
 
     if (soid.nspace == cct->_conf->osd_hit_set_namespace)
       stat.num_objects_hit_set_archive++;
 
-    // new snapset?
-    if (soid.snap == CEPH_SNAPDIR ||
-	soid.snap == CEPH_NOSNAP) {
-      if (p->second.attrs.count(SS_ATTR) == 0) {
-	osd->clog->error() << mode << " " << info.pgid << " " << soid
-			  << " no '" << SS_ATTR << "' attr";
-        ++scrubber.shallow_errors;
-	continue;
-      }
-      bufferlist bl;
-      bl.push_back(p->second.attrs[SS_ATTR]);
-      bufferlist::iterator blp = bl.begin();
-      ::decode(snapset, blp);
-
-      // did we finish the last oid?
-      if (head != hobject_t() &&
-	  !pool.info.allow_incomplete_clones()) {
-	osd->clog->error() << mode << " " << info.pgid << " " << head
-			  << " missing clones";
-        ++scrubber.shallow_errors;
-      }
-      
-      // what will be next?
-      if (snapset.clones.empty())
-	head = hobject_t();  // no clones.
-      else {
-	curclone = snapset.clones.rbegin();
-	head = p->first;
-	next_clone = hobject_t();
-	dout(20) << "  snapset " << snapset << dendl;
-      }
+    if (soid.is_snap()) {
+      // it's a clone
+      stat.num_object_clones++;
     }
 
     // basic checks.
     if (p->second.attrs.count(OI_ATTR) == 0) {
+      oi = boost::none;
       osd->clog->error() << mode << " " << info.pgid << " " << soid
 			<< " no '" << OI_ATTR << "' attr";
       ++scrubber.shallow_errors;
-      continue;
+    } else {
+      bufferlist bv;
+      bv.push_back(p->second.attrs[OI_ATTR]);
+      try {
+	oi = object_info_t(); // Initialize optional<> before decode into it
+	oi.get().decode(bv);
+      } catch (buffer::error& e) {
+	oi = boost::none;
+	osd->clog->error() << mode << " " << info.pgid << " " << soid
+		<< " can't decode '" << OI_ATTR << "' attr " << e.what();
+	++scrubber.shallow_errors;
+      }
     }
-    bufferlist bv;
-    bv.push_back(p->second.attrs[OI_ATTR]);
-    object_info_t oi(bv);
 
-    if (pgbackend->be_get_ondisk_size(oi.size) != p->second.size) {
-      osd->clog->error() << mode << " " << info.pgid << " " << soid
-			<< " on disk size (" << p->second.size
-			<< ") does not match object info size ("
-			<< oi.size << ") adjusted for ondisk to ("
-			<< pgbackend->be_get_ondisk_size(oi.size)
-			<< ")";
-      ++scrubber.shallow_errors;
-    }
+    if (oi) {
+      if (pgbackend->be_get_ondisk_size(oi->size) != p->second.size) {
+	osd->clog->error() << mode << " " << info.pgid << " " << soid
+			   << " on disk size (" << p->second.size
+			   << ") does not match object info size ("
+			   << oi->size << ") adjusted for ondisk to ("
+			   << pgbackend->be_get_ondisk_size(oi->size)
+			   << ")";
+	++scrubber.shallow_errors;
+      }
 
-    dout(20) << mode << "  " << soid << " " << oi << dendl;
+      dout(20) << mode << "  " << soid << " " << oi.get() << dendl;
 
-    if (soid.is_snap()) {
-      stat.num_bytes += snapset.get_clone_bytes(soid.snap);
-    } else {
-      stat.num_bytes += oi.size;
+      // A clone's num_bytes will be added later, once we have the snapset
+      if (!soid.is_snap()) {
+	stat.num_bytes += oi->size;
+      }
+      if (soid.nspace == cct->_conf->osd_hit_set_namespace)
+	stat.num_bytes_hit_set_archive += oi->size;
+
+      if (!soid.is_snapdir()) {
+	if (oi->is_dirty())
+	  ++stat.num_objects_dirty;
+	if (oi->is_whiteout())
+	  ++stat.num_whiteouts;
+	if (oi->is_omap())
+	  ++stat.num_objects_omap;
+	if (oi->is_cache_pinned())
+	  ++stat.num_objects_pinned;
+      }
     }
-    if (soid.nspace == cct->_conf->osd_hit_set_namespace)
-      stat.num_bytes_hit_set_archive += oi.size;
-
-    if (!soid.is_snapdir()) {
-      if (oi.is_dirty())
-	++stat.num_objects_dirty;
-      if (oi.is_whiteout())
-	++stat.num_whiteouts;
-      if (oi.is_omap())
-	++stat.num_objects_omap;
-    }
-
-    if (!next_clone.is_min() && next_clone != soid &&
-	pool.info.allow_incomplete_clones()) {
-      // it is okay to be missing one or more clones in a cache tier.
-      // skip higher-numbered clones in the list.
-      while (curclone != snapset.clones.rend() &&
-	     soid.snap < *curclone)
-	++curclone;
-      if (curclone != snapset.clones.rend() &&
-	  soid.snap == *curclone) {
-	dout(20) << __func__ << " skipped some clones in cache tier" << dendl;
-	next_clone.snap = *curclone;
-      }
-      if (curclone == snapset.clones.rend() ||
-	  soid.snap == CEPH_NOSNAP) {
-	dout(20) << __func__ << " skipped remaining clones in cache tier"
-		 << dendl;
-	next_clone = hobject_t();
-	head = hobject_t();
+
+    // Check for any problems while processing clones
+    if (doing_clones(snapset, curclone)) {
+      boost::optional<snapid_t> target;
+      // Expecting an object with snap for current head
+      if (soid.has_snapset() || soid.get_head() != head->get_head()) {
+
+	dout(10) << __func__ << " " << mode << " " << info.pgid << " new object "
+		 << soid << " while processing " << head.get() << dendl;
+
+        target = all_clones;
+      } else {
+        assert(soid.is_snap());
+        target = soid.snap;
       }
+
+      // Log any clones we were expecting to be there up to target
+      // This will set missing, but will be a no-op if soid.snap == **curclone.
+      missing += process_clones_to(head, snapset, osd->clog, info.pgid, mode,
+		        pool.info.allow_incomplete_clones(), target, &curclone);
     }
-    if (!next_clone.is_min() && next_clone != soid) {
+    bool expected;
+    // Check doing_clones() again in case we ran process_clones_to()
+    if (doing_clones(snapset, curclone)) {
+      // A head/snapdir would have processed all clones above
+      // or all greater than *curclone.
+      assert(soid.is_snap() && *curclone <= soid.snap);
+
+      // After processing above clone snap should match the expected curclone
+      expected = (*curclone == soid.snap);
+    } else {
+      // If we aren't doing clones any longer, then expecting head/snapdir
+      expected = soid.has_snapset();
+    }
+    if (!expected) {
+      // If we couldn't read the head's snapset, then just ignore clones and
+      // don't count as an error.
+      if (head && !snapset) {
+	osd->clog->info() << mode << " " << info.pgid << " " << soid
+			  << " clone ignored due to missing snapset";
+	continue;
+      }
       osd->clog->error() << mode << " " << info.pgid << " " << soid
-			<< " expected clone " << next_clone;
+			   << " is an unexpected clone";
       ++scrubber.shallow_errors;
+      continue;
     }
 
-    if (soid.snap == CEPH_NOSNAP || soid.snap == CEPH_SNAPDIR) {
-      if (soid.snap == CEPH_NOSNAP && !snapset.head_exists) {
-	osd->clog->error() << mode << " " << info.pgid << " " << soid
-			  << " snapset.head_exists=false, but head exists";
-        ++scrubber.shallow_errors;
+    // new snapset?
+    if (soid.has_snapset()) {
+
+      if (missing) {
+	log_missing(missing, head, osd->clog, info.pgid, __func__, mode,
+		    pool.info.allow_incomplete_clones());
       }
-      if (soid.snap == CEPH_SNAPDIR && snapset.head_exists) {
+
+      // Set this as a new head object
+      head = soid;
+      missing = 0;
+
+      dout(20) << __func__ << " " << mode << " new head " << head << dendl;
+
+      if (p->second.attrs.count(SS_ATTR) == 0) {
 	osd->clog->error() << mode << " " << info.pgid << " " << soid
-			  << " snapset.head_exists=true, but snapdir exists";
+			  << " no '" << SS_ATTR << "' attr";
         ++scrubber.shallow_errors;
-      }
-      if (curclone == snapset.clones.rend()) {
-	next_clone = hobject_t();
+	snapset = boost::none;
       } else {
-	next_clone = soid;
-	next_clone.snap = *curclone;
-      }
-    } else if (soid.snap) {
-      // it's a clone
-      stat.num_object_clones++;
-      
-      if (head == hobject_t()) {
-	osd->clog->error() << mode << " " << info.pgid << " " << soid
-			  << " found clone without head";
-	++scrubber.shallow_errors;
-	continue;
+	bufferlist bl;
+	bl.push_back(p->second.attrs[SS_ATTR]);
+	bufferlist::iterator blp = bl.begin();
+        try {
+	   snapset = SnapSet(); // Initialize optional<> before decoding into it
+	  ::decode(snapset.get(), blp);
+        } catch (buffer::error& e) {
+	  snapset = boost::none;
+          osd->clog->error() << mode << " " << info.pgid << " " << soid
+		<< " can't decode '" << SS_ATTR << "' attr " << e.what();
+	  ++scrubber.shallow_errors;
+        }
       }
 
-      if (soid.snap != *curclone) {
-	continue; // we warn above.  we could do better here...
+      if (snapset) {
+	// what will be next?
+	curclone = snapset->clones.rbegin();
+
+	if (!snapset->clones.empty()) {
+	  dout(20) << "  snapset " << snapset.get() << dendl;
+	  if (snapset->seq == 0) {
+	    osd->clog->error() << mode << " " << info.pgid << " " << soid
+			       << " snaps.seq not set";
+	    ++scrubber.shallow_errors;
+          }
+	}
+
+	if (soid.is_head() && !snapset->head_exists) {
+	  osd->clog->error() << mode << " " << info.pgid << " " << soid
+			  << " snapset.head_exists=false, but head exists";
+	  ++scrubber.shallow_errors;
+	}
+	if (soid.is_snapdir() && snapset->head_exists) {
+	  osd->clog->error() << mode << " " << info.pgid << " " << soid
+			  << " snapset.head_exists=true, but snapdir exists";
+	  ++scrubber.shallow_errors;
+	}
       }
+    } else {
+      assert(soid.is_snap());
+      assert(head);
+      assert(snapset);
+      assert(soid.snap == *curclone);
+
+      dout(20) << __func__ << " " << mode << " matched clone " << soid << dendl;
 
-      if (oi.size != snapset.clone_size[*curclone]) {
+      if (snapset->clone_size.count(soid.snap) == 0) {
 	osd->clog->error() << mode << " " << info.pgid << " " << soid
-			  << " size " << oi.size << " != clone_size "
-			  << snapset.clone_size[*curclone];
+			   << " is missing in clone_size";
 	++scrubber.shallow_errors;
-      }
+      } else {
+        if (oi && oi->size != snapset->clone_size[soid.snap]) {
+	  osd->clog->error() << mode << " " << info.pgid << " " << soid
+			     << " size " << oi->size << " != clone_size "
+			     << snapset->clone_size[*curclone];
+	  ++scrubber.shallow_errors;
+        }
 
-      // verify overlap?
-      // ...
+        if (snapset->clone_overlap.count(soid.snap) == 0) {
+	  osd->clog->error() << mode << " " << info.pgid << " " << soid
+			     << " is missing in clone_overlap";
+	  ++scrubber.shallow_errors;
+        } else {
+	  // This check is based on get_clone_bytes().  Its first two asserts
+	  // can't fire because we know we have a clone_size and
+	  // a clone_overlap.  Now we check that the interval_set won't
+	  // trip the last assert.
+	  uint64_t size = snapset->clone_size.find(soid.snap)->second;
+	  const interval_set<uint64_t> &overlap =
+	        snapset->clone_overlap.find(soid.snap)->second;
+	  bool bad_interval_set = false;
+	  for (interval_set<uint64_t>::const_iterator i = overlap.begin();
+	       i != overlap.end(); ++i) {
+	    if (size < i.get_len()) {
+	      bad_interval_set = true;
+	      break;
+	    }
+	    size -= i.get_len();
+	  }
 
-      // what's next?
-      if (curclone != snapset.clones.rend()) {
-	++curclone;
-      }
-      if (curclone == snapset.clones.rend()) {
-	head = hobject_t();
-	next_clone = hobject_t();
-      } else {
-	next_clone.snap = *curclone;
+	  if (bad_interval_set) {
+	    osd->clog->error() << mode << " " << info.pgid << " " << soid
+			       << " bad interval_set in clone_overlap";
+	    ++scrubber.shallow_errors;
+	  } else {
+            stat.num_bytes += snapset->get_clone_bytes(soid.snap);
+	  }
+        }
       }
 
-    } else {
-      // it's unversioned.
-      next_clone = hobject_t();
+      // what's next?
+      ++curclone;
     }
 
     scrub_cstat.add(stat);
   }
 
-  if (!next_clone.is_min() &&
-      !pool.info.allow_incomplete_clones()) {
-    osd->clog->error() << mode << " " << info.pgid
-		      << " expected clone " << next_clone;
-    ++scrubber.shallow_errors;
+  if (doing_clones(snapset, curclone)) {
+    dout(10) << __func__ << " " << mode << " " << info.pgid
+	     << " No more objects while processing " << head.get() << dendl;
+
+    missing += process_clones_to(head, snapset, osd->clog, info.pgid, mode,
+		      pool.info.allow_incomplete_clones(), all_clones, &curclone);
+
+  }
+  // There could be missing clones found by the test above, or even
+  // from before we dropped out of the loop for the last head.
+  if (missing) {
+    log_missing(missing, head, osd->clog, info.pgid, __func__,
+		mode, pool.info.allow_incomplete_clones());
   }
 
   for (map<hobject_t,pair<uint32_t,uint32_t>, hobject_t::BitwiseComparator>::const_iterator p =
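For orientation, the rewritten loop above replaces the old next_clone bookkeeping with a small state machine: an optional head plus a reverse iterator over SnapSet::clones, with process_clones_to() soaking up any expected clones that never appear. Below is a standalone, much-simplified sketch of that control flow, using plain std types and counters in place of hobject_t, SnapSet and clog, and one SnapSet shared by all heads for brevity; it is illustrative only, not the patch's code:

    #include <cstdio>
    #include <vector>

    // Each scanned object is either a head/snapdir or a clone with a snap id.
    struct Obj { bool head; unsigned snap; };

    int main() {
      // Scrub lists a head first, then its clones in descending snap order;
      // SnapSet::clones is ascending, hence the reverse iterator.
      std::vector<unsigned> clones;
      clones.push_back(4); clones.push_back(7);

      std::vector<Obj> scan;
      Obj h = {true, 0}, c7 = {false, 7}, c4 = {false, 4};
      scan.push_back(h); scan.push_back(c7); scan.push_back(c4);

      std::vector<unsigned>::reverse_iterator cur = clones.rend();
      bool have_head = false;
      unsigned missing = 0, errors = 0;

      for (size_t i = 0; i < scan.size(); ++i) {
        const Obj& o = scan[i];
        if (have_head && cur != clones.rend()) {       // doing_clones()
          if (o.head) {
            // new head while clones were still expected: all of them missing
            for (; cur != clones.rend(); ++cur) ++missing;
          } else {
            // expected clones absent between *cur and o.snap are missing
            for (; cur != clones.rend() && *cur > o.snap; ++cur) ++missing;
          }
        }
        bool expected = (have_head && cur != clones.rend())
                            ? (!o.head && *cur == o.snap)
                            : o.head;
        if (!expected) { ++errors; continue; }         // "unexpected clone"
        if (o.head) { have_head = true; cur = clones.rbegin(); }
        else ++cur;                                    // matched; expect next
      }
      for (; have_head && cur != clones.rend(); ++cur) ++missing;
      std::printf("errors=%u missing=%u\n", errors, missing);  // 0 0
      return 0;
    }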
@@ -12188,7 +12464,7 @@ void ReplicatedPG::_scrub(
     simple_repop_submit(repop);
     ++scrubber.num_digest_updates_pending;
   }
-  
+
   dout(10) << "_scrub (" << mode << ") finish" << dendl;
 }
 
@@ -12216,6 +12492,7 @@ void ReplicatedPG::_scrub_finish()
 	   << scrub_cstat.sum.num_object_clones << "/" << info.stats.stats.sum.num_object_clones << " clones, "
 	   << scrub_cstat.sum.num_objects_dirty << "/" << info.stats.stats.sum.num_objects_dirty << " dirty, "
 	   << scrub_cstat.sum.num_objects_omap << "/" << info.stats.stats.sum.num_objects_omap << " omap, "
+	   << scrub_cstat.sum.num_objects_pinned << "/" << info.stats.stats.sum.num_objects_pinned << " pinned, "
 	   << scrub_cstat.sum.num_objects_hit_set_archive << "/" << info.stats.stats.sum.num_objects_hit_set_archive << " hit_set_archive, "
 	   << scrub_cstat.sum.num_bytes << "/" << info.stats.stats.sum.num_bytes << " bytes, "
 	   << scrub_cstat.sum.num_bytes_hit_set_archive << "/" << info.stats.stats.sum.num_bytes_hit_set_archive << " hit_set_archive bytes."
@@ -12227,6 +12504,8 @@ void ReplicatedPG::_scrub_finish()
        !info.stats.dirty_stats_invalid) ||
       (scrub_cstat.sum.num_objects_omap != info.stats.stats.sum.num_objects_omap &&
        !info.stats.omap_stats_invalid) ||
+      (scrub_cstat.sum.num_objects_pinned != info.stats.stats.sum.num_objects_pinned &&
+       !info.stats.pin_stats_invalid) ||
       (scrub_cstat.sum.num_objects_hit_set_archive != info.stats.stats.sum.num_objects_hit_set_archive &&
        !info.stats.hitset_stats_invalid) ||
       (scrub_cstat.sum.num_bytes_hit_set_archive != info.stats.stats.sum.num_bytes_hit_set_archive &&
@@ -12239,6 +12518,7 @@ void ReplicatedPG::_scrub_finish()
 		      << scrub_cstat.sum.num_object_clones << "/" << info.stats.stats.sum.num_object_clones << " clones, "
 		      << scrub_cstat.sum.num_objects_dirty << "/" << info.stats.stats.sum.num_objects_dirty << " dirty, "
 		      << scrub_cstat.sum.num_objects_omap << "/" << info.stats.stats.sum.num_objects_omap << " omap, "
+		      << scrub_cstat.sum.num_objects_pinned << "/" << info.stats.stats.sum.num_objects_pinned << " pinned, "
 		      << scrub_cstat.sum.num_objects_hit_set_archive << "/" << info.stats.stats.sum.num_objects_hit_set_archive << " hit_set_archive, "
 		      << scrub_cstat.sum.num_whiteouts << "/" << info.stats.stats.sum.num_whiteouts << " whiteouts, "
 		      << scrub_cstat.sum.num_bytes << "/" << info.stats.stats.sum.num_bytes << " bytes, "
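The clone_overlap validation added above exists to keep get_clone_bytes() from asserting later: clone bytes are computed as clone_size minus the lengths recorded in clone_overlap, so each subtraction must stay non-negative. A standalone sketch of the same guard, approximating interval_set<uint64_t> with a std::map of offset to length (illustrative names, not the patch's code):

    #include <cstdint>
    #include <cstdio>
    #include <map>

    // Approximate Ceph's interval_set<uint64_t> as offset -> length.
    typedef std::map<uint64_t, uint64_t> Intervals;

    // Mirrors the scrub guard: get_clone_bytes() computes clone_size minus
    // the overlap lengths, so every subtraction must leave size non-negative.
    bool overlap_fits(uint64_t clone_size, const Intervals& overlap) {
      uint64_t size = clone_size;
      for (Intervals::const_iterator i = overlap.begin();
           i != overlap.end(); ++i) {
        if (size < i->second)
          return false;      // would have tripped get_clone_bytes()'s assert
        size -= i->second;
      }
      return true;
    }

    int main() {
      Intervals ok;  ok[0] = 4096;  ok[8192] = 4096;
      Intervals bad; bad[0] = 1 << 20;              // longer than the clone
      std::printf("%d %d\n", (int)overlap_fits(16384, ok),
                  (int)overlap_fits(4096, bad));    // prints: 1 0
      return 0;
    }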
diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h
index 04a6a45..e31c9fd 100644
--- a/src/osd/ReplicatedPG.h
+++ b/src/osd/ReplicatedPG.h
@@ -941,10 +941,8 @@ protected:
   /// estimate object atime and temperature
   ///
   /// @param oid [in] object name
-  /// @param atime [out] seconds since last access (lower bound)
-  /// @param temperature [out] relative temperature (# hitset bins we appear in)
-  void agent_estimate_atime_temp(const hobject_t& oid,
-				 int *atime, int *temperature);
+  /// @param temperature [out] relative temperature (considers both access time and frequency)
+  void agent_estimate_temp(const hobject_t& oid, int *temperature);
 
   /// stop the agent
   void agent_stop();
@@ -1039,7 +1037,8 @@ protected:
   SnapSetContext *get_snapset_context(
     const hobject_t& oid,
     bool can_create,
-    map<string, bufferlist> *attrs = 0
+    map<string, bufferlist> *attrs = 0,
+    bool oid_existed = true // indicates whether this oid existed in the backend
     );
   void register_snapset_context(SnapSetContext *ssc) {
     Mutex::Locker l(snapset_contexts_lock);
@@ -1492,6 +1491,22 @@ private:
   hobject_t generate_temp_object();  ///< generate a new temp object name
   /// generate a new temp object name (for recovery)
   hobject_t get_temp_recovery_object(eversion_t version, snapid_t snap);
+  void log_missing(unsigned missing,
+			const boost::optional<hobject_t> &head,
+			LogChannelRef clog,
+			const spg_t &pgid,
+			const char *func,
+			const char *mode,
+			bool allow_incomplete_clones);
+  unsigned process_clones_to(const boost::optional<hobject_t> &head,
+    const boost::optional<SnapSet> &snapset,
+    LogChannelRef clog,
+    const spg_t &pgid,
+    const char *mode,
+    bool allow_incomplete_clones,
+    boost::optional<snapid_t> target,
+    vector<snapid_t>::reverse_iterator *curclone);
+
 public:
   coll_t get_coll() {
     return coll;
diff --git a/src/osd/TierAgentState.h b/src/osd/TierAgentState.h
index 57f2c72..e1665e6 100644
--- a/src/osd/TierAgentState.h
+++ b/src/osd/TierAgentState.h
@@ -23,7 +23,6 @@ struct TierAgentState {
   bool delaying;
 
   /// histogram of ages we've encountered
-  pow2_hist_t atime_hist;
   pow2_hist_t temp_hist;
   int hist_age;
 
@@ -109,9 +108,6 @@ struct TierAgentState {
     f->dump_string("evict_mode", get_evict_mode_name());
     f->dump_unsigned("evict_effort", evict_effort);
     f->dump_stream("position") << position;
-    f->open_object_section("atime_hist");
-    atime_hist.dump(f);
-    f->close_section();
     f->open_object_section("temp_hist");
     temp_hist.dump(f);
     f->close_section();
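With atime_hist gone, the tier agent folds recency and frequency into a single temperature: each of the most recent hit sets that contains the object contributes its grade, and grades decay from newest to oldest per hit_set_grade_decay_rate. A standalone sketch of that accumulation (illustrative only; the real agent_estimate_temp() may weight things differently):

    #include <cstdio>
    #include <vector>

    // Grades as produced by pg_pool_t::calc_grade_table(): newest first.
    unsigned temperature(const std::vector<bool>& hit_in_set,  // newest..oldest
                         const std::vector<unsigned>& grade) {
      unsigned temp = 0;
      for (size_t i = 0; i < hit_in_set.size() && i < grade.size(); ++i)
        if (hit_in_set[i])
          temp += grade[i];       // recent hits weigh more than old ones
      return temp;
    }

    int main() {
      std::vector<unsigned> grade;  // decay_rate=50: 500000, 250000, 125000
      grade.push_back(500000); grade.push_back(250000); grade.push_back(125000);

      std::vector<bool> hot;   // appears in the two most recent hit sets
      hot.push_back(true); hot.push_back(true); hot.push_back(false);
      std::vector<bool> cold;  // appears only in the oldest
      cold.push_back(false); cold.push_back(false); cold.push_back(true);

      std::printf("hot=%u cold=%u\n", temperature(hot, grade),
                  temperature(cold, grade));  // hot=750000 cold=125000
      return 0;
    }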
diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc
index 10458ec..3cf41bd 100644
--- a/src/osd/osd_types.cc
+++ b/src/osd/osd_types.cc
@@ -889,7 +889,6 @@ void pool_snap_info_t::generate_test_instances(list<pool_snap_info_t*>& o)
   o.back()->name = "foo";
 }
 
-
 // -- pg_pool_t --
 
 void pg_pool_t::dump(Formatter *f) const
@@ -947,6 +946,12 @@ void pg_pool_t::dump(Formatter *f) const
   f->dump_bool("use_gmt_hitset", use_gmt_hitset);
   f->dump_unsigned("min_read_recency_for_promote", min_read_recency_for_promote);
   f->dump_unsigned("min_write_recency_for_promote", min_write_recency_for_promote);
+  f->dump_unsigned("hit_set_grade_decay_rate", hit_set_grade_decay_rate);
+  f->dump_unsigned("hit_set_search_last_n", hit_set_search_last_n);
+  f->open_array_section("grade_table");
+  for (unsigned i = 0; i < hit_set_count; ++i)
+    f->dump_unsigned("value", get_grade(i));
+  f->close_section();
   f->dump_unsigned("stripe_width", get_stripe_width());
   f->dump_unsigned("expected_num_objects", expected_num_objects);
   f->dump_bool("fast_read", fast_read);
@@ -1259,7 +1264,7 @@ void pg_pool_t::encode(bufferlist& bl, uint64_t features) const
     return;
   }
 
-  ENCODE_START(22, 5, bl);
+  ENCODE_START(23, 5, bl);
   ::encode(type, bl);
   ::encode(size, bl);
   ::encode(crush_ruleset, bl);
@@ -1305,12 +1310,14 @@ void pg_pool_t::encode(bufferlist& bl, uint64_t features) const
   ::encode(min_write_recency_for_promote, bl);
   ::encode(use_gmt_hitset, bl);
   ::encode(fast_read, bl);
+  ::encode(hit_set_grade_decay_rate, bl);
+  ::encode(hit_set_search_last_n, bl);
   ENCODE_FINISH(bl);
 }
 
 void pg_pool_t::decode(bufferlist::iterator& bl)
 {
-  DECODE_START_LEGACY_COMPAT_LEN(22, 5, 5, bl);
+  DECODE_START_LEGACY_COMPAT_LEN(23, 5, 5, bl);
   ::decode(type, bl);
   ::decode(size, bl);
   ::decode(crush_ruleset, bl);
@@ -1442,8 +1449,16 @@ void pg_pool_t::decode(bufferlist::iterator& bl)
   } else {
     fast_read = false;
   }
+  if (struct_v >= 23) {
+    ::decode(hit_set_grade_decay_rate, bl);
+    ::decode(hit_set_search_last_n, bl);
+  } else {
+    hit_set_grade_decay_rate = 0;
+    hit_set_search_last_n = 1;
+  }
   DECODE_FINISH(bl);
   calc_pg_masks();
+  calc_grade_table();
 }
 
 void pg_pool_t::generate_test_instances(list<pg_pool_t*>& o)
@@ -1489,6 +1504,9 @@ void pg_pool_t::generate_test_instances(list<pg_pool_t*>& o)
   a.hit_set_count = 8;
   a.min_read_recency_for_promote = 1;
   a.min_write_recency_for_promote = 1;
+  a.hit_set_grade_decay_rate = 50;
+  a.hit_set_search_last_n = 1;
+  a.calc_grade_table();
   a.set_stripe_width(12345);
   a.target_max_bytes = 1238132132;
   a.target_max_objects = 1232132;
@@ -1542,7 +1560,9 @@ ostream& operator<<(ostream& out, const pg_pool_t& p)
   if (p.hit_set_params.get_type() != HitSet::TYPE_NONE) {
     out << " hit_set " << p.hit_set_params
 	<< " " << p.hit_set_period << "s"
-	<< " x" << p.hit_set_count;
+	<< " x" << p.hit_set_count << " decay_rate "
+	<< p.hit_set_grade_decay_rate
+	<< " search_last_n " << p.hit_set_search_last_n;
   }
   if (p.min_read_recency_for_promote)
     out << " min_read_recency_for_promote " << p.min_read_recency_for_promote;
@@ -1593,11 +1613,12 @@ void object_stat_sum_t::dump(Formatter *f) const
   f->dump_int("num_flush_mode_low", num_flush_mode_low);
   f->dump_int("num_evict_mode_some", num_evict_mode_some);
   f->dump_int("num_evict_mode_full", num_evict_mode_full);
+  f->dump_int("num_objects_pinned", num_objects_pinned);
 }
 
 void object_stat_sum_t::encode(bufferlist& bl) const
 {
-  ENCODE_START(13, 3, bl);
+  ENCODE_START(14, 3, bl);
   ::encode(num_bytes, bl);
   ::encode(num_objects, bl);
   ::encode(num_object_clones, bl);
@@ -1630,12 +1651,13 @@ void object_stat_sum_t::encode(bufferlist& bl) const
   ::encode(num_flush_mode_low, bl);
   ::encode(num_evict_mode_some, bl);
   ::encode(num_evict_mode_full, bl);
+  ::encode(num_objects_pinned, bl);
   ENCODE_FINISH(bl);
 }
 
 void object_stat_sum_t::decode(bufferlist::iterator& bl)
 {
-  DECODE_START_LEGACY_COMPAT_LEN(13, 3, 3, bl);
+  DECODE_START_LEGACY_COMPAT_LEN(14, 3, 3, bl);
   ::decode(num_bytes, bl);
   if (struct_v < 3) {
     uint64_t num_kb;
@@ -1723,6 +1745,11 @@ void object_stat_sum_t::decode(bufferlist::iterator& bl)
     num_evict_mode_some = 0;
     num_evict_mode_full = 0;
   }
+  if (struct_v >= 14) {
+    ::decode(num_objects_pinned, bl);
+  } else {
+    num_objects_pinned = 0;
+  }
   DECODE_FINISH(bl);
 }
 
@@ -1759,6 +1786,7 @@ void object_stat_sum_t::generate_test_instances(list<object_stat_sum_t*>& o)
   a.num_flush_mode_low = 1;
   a.num_evict_mode_some = 1;
   a.num_evict_mode_full = 0;
+  a.num_objects_pinned = 20;
   o.push_back(new object_stat_sum_t(a));
 }
 
@@ -1796,6 +1824,7 @@ void object_stat_sum_t::add(const object_stat_sum_t& o)
   num_flush_mode_low += o.num_flush_mode_low;
   num_evict_mode_some += o.num_evict_mode_some;
   num_evict_mode_full += o.num_evict_mode_full;
+  num_objects_pinned += o.num_objects_pinned;
 }
 
 void object_stat_sum_t::sub(const object_stat_sum_t& o)
@@ -1832,6 +1861,7 @@ void object_stat_sum_t::sub(const object_stat_sum_t& o)
   num_flush_mode_low -= o.num_flush_mode_low;
   num_evict_mode_some -= o.num_evict_mode_some;
   num_evict_mode_full -= o.num_evict_mode_full;
+  num_objects_pinned -= o.num_objects_pinned;
 }
 
 bool operator==(const object_stat_sum_t& l, const object_stat_sum_t& r)
@@ -1868,7 +1898,8 @@ bool operator==(const object_stat_sum_t& l, const object_stat_sum_t& r)
     l.num_flush_mode_high == r.num_flush_mode_high &&
     l.num_flush_mode_low == r.num_flush_mode_low &&
     l.num_evict_mode_some == r.num_evict_mode_some &&
-    l.num_evict_mode_full == r.num_evict_mode_full;
+    l.num_evict_mode_full == r.num_evict_mode_full &&
+    l.num_objects_pinned == r.num_objects_pinned;
 }
 
 // -- object_stat_collection_t --
@@ -1995,7 +2026,7 @@ void pg_stat_t::dump_brief(Formatter *f) const
 
 void pg_stat_t::encode(bufferlist &bl) const
 {
-  ENCODE_START(21, 8, bl);
+  ENCODE_START(22, 8, bl);
   ::encode(version, bl);
   ::encode(reported_seq, bl);
   ::encode(reported_epoch, bl);
@@ -2035,12 +2066,13 @@ void pg_stat_t::encode(bufferlist &bl) const
   ::encode(hitset_bytes_stats_invalid, bl);
   ::encode(last_peered, bl);
   ::encode(last_became_peered, bl);
+  ::encode(pin_stats_invalid, bl);
   ENCODE_FINISH(bl);
 }
 
 void pg_stat_t::decode(bufferlist::iterator &bl)
 {
-  DECODE_START_LEGACY_COMPAT_LEN(20, 8, 8, bl);
+  DECODE_START_LEGACY_COMPAT_LEN(22, 8, 8, bl);
   ::decode(version, bl);
   ::decode(reported_seq, bl);
   ::decode(reported_epoch, bl);
@@ -2174,6 +2206,13 @@ void pg_stat_t::decode(bufferlist::iterator &bl)
     last_peered = last_active;
     last_became_peered = last_became_active;
   }
+  if (struct_v >= 22) {
+    ::decode(pin_stats_invalid, bl);
+  } else {
+    // if we are decoding an old encoding of this object, then the
+    // encoder may not have supported num_objects_pinned accounting.
+    pin_stats_invalid = true;
+  }
   DECODE_FINISH(bl);
 }
 
@@ -2264,7 +2303,8 @@ bool operator==(const pg_stat_t& l, const pg_stat_t& r)
     l.hitset_stats_invalid == r.hitset_stats_invalid &&
     l.hitset_bytes_stats_invalid == r.hitset_bytes_stats_invalid &&
     l.up_primary == r.up_primary &&
-    l.acting_primary == r.acting_primary;
+    l.acting_primary == r.acting_primary &&
+    l.pin_stats_invalid == r.pin_stats_invalid;
 }
 
 // -- pool_stat_t --
@@ -5133,6 +5173,8 @@ ostream& operator<<(ostream& out, const OSDOp& op)
     case CEPH_OSD_OP_CACHE_FLUSH:
     case CEPH_OSD_OP_CACHE_TRY_FLUSH:
     case CEPH_OSD_OP_CACHE_EVICT:
+    case CEPH_OSD_OP_CACHE_PIN:
+    case CEPH_OSD_OP_CACHE_UNPIN:
       break;
     case CEPH_OSD_OP_ASSERT_VER:
       out << " v" << op.op.assert_ver.ver;
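Every field added above follows the same wire-compatibility recipe: bump the version in ENCODE_START/DECODE_START, append the new field at the end of the encoding, and fall back to a default when decoding an older struct_v. A minimal standalone illustration of the pattern, using a toy buffer in place of Ceph's bufferlist and encode macros:

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Toy stand-in for Ceph's versioned encode/decode: a version word up
    // front, fields appended in order, defaults supplied for old encodings.
    struct Stats {
      uint32_t num_objects;
      uint32_t num_objects_pinned;   // new in "v2" of this toy struct

      void encode(std::vector<uint32_t>& bl) const {
        bl.push_back(2);             // struct_v, bumped when a field is added
        bl.push_back(num_objects);
        bl.push_back(num_objects_pinned);
      }
      void decode(const std::vector<uint32_t>& bl) {
        size_t p = 0;
        uint32_t struct_v = bl[p++];
        num_objects = bl[p++];
        if (struct_v >= 2)
          num_objects_pinned = bl[p++];
        else
          num_objects_pinned = 0;    // default for pre-v2 encoders
      }
    };

    int main() {
      std::vector<uint32_t> old_bl;  // what a v1 encoder would have written
      old_bl.push_back(1); old_bl.push_back(42);
      Stats s; s.decode(old_bl);
      std::printf("%u %u\n", s.num_objects, s.num_objects_pinned);  // 42 0
      return 0;
    }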
diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h
index d51c894..0f127f0 100644
--- a/src/osd/osd_types.h
+++ b/src/osd/osd_types.h
@@ -1104,6 +1104,9 @@ public:
     hit_set_params = HitSet::Params();
     hit_set_period = 0;
     hit_set_count = 0;
+    hit_set_grade_decay_rate = 0;
+    hit_set_search_last_n = 0;
+    grade_table.resize(0);
   }
 
   uint64_t target_max_bytes;   ///< tiering: target max pool size
@@ -1122,6 +1125,10 @@ public:
   bool use_gmt_hitset;	        ///< use gmt to name the hitset archive object
   uint32_t min_read_recency_for_promote;   ///< minimum number of HitSet to check before promote on read
   uint32_t min_write_recency_for_promote;  ///< minimum number of HitSet to check before promote on write
+  uint32_t hit_set_grade_decay_rate;   ///< the newest hit_set carries the highest weight in an
+                                       ///object's temperature count; each older hit_set's weight
+                                       ///decays by this percentage relative to the next newer one
+  uint32_t hit_set_search_last_n;   ///< accumulate at most N hit_sets for temperature
 
   uint32_t stripe_width;        ///< erasure coded stripe size in bytes
 
@@ -1129,6 +1136,24 @@ public:
                                  ///< user does not specify any expected value
   bool fast_read;            ///< whether turn on fast read on the pool or not
 
+private:
+  vector<uint32_t> grade_table;
+
+public:
+  uint32_t get_grade(unsigned i) const {
+    if (grade_table.size() <= i)
+      return 0;
+    return grade_table[i];
+  }
+  void calc_grade_table() {
+    unsigned v = 1000000;
+    grade_table.resize(hit_set_count);
+    for (unsigned i = 0; i < hit_set_count; i++) {
+      v = v * (1 - (hit_set_grade_decay_rate / 100.0));
+      grade_table[i] = v;
+    }
+  }
+
   pg_pool_t()
     : flags(0), type(0), size(0), min_size(0),
       crush_ruleset(0), object_hash(0),
@@ -1154,6 +1179,8 @@ public:
       use_gmt_hitset(true),
       min_read_recency_for_promote(0),
       min_write_recency_for_promote(0),
+      hit_set_grade_decay_rate(0),
+      hit_set_search_last_n(0),
       stripe_width(0),
       expected_num_objects(0),
       fast_read(false)
@@ -1358,6 +1385,7 @@ struct object_stat_sum_t {
   int32_t num_flush_mode_low;   // 1 when in low flush mode, otherwise 0
   int32_t num_evict_mode_some;  // 1 when in evict some mode, otherwise 0
   int32_t num_evict_mode_full;  // 1 when in evict full mode, otherwise 0
+  int64_t num_objects_pinned;
 
   object_stat_sum_t()
     : num_bytes(0),
@@ -1382,7 +1410,8 @@ struct object_stat_sum_t {
       num_evict_kb(0),
       num_promote(0),
       num_flush_mode_high(0), num_flush_mode_low(0),
-      num_evict_mode_some(0), num_evict_mode_full(0)
+      num_evict_mode_some(0), num_evict_mode_full(0),
+      num_objects_pinned(0)
   {}
 
   void floor(int64_t f) {
@@ -1419,6 +1448,7 @@ struct object_stat_sum_t {
     FLOOR(num_flush_mode_low);
     FLOOR(num_evict_mode_some);
     FLOOR(num_evict_mode_full);
+    FLOOR(num_objects_pinned);
 #undef FLOOR
   }
 
@@ -1463,6 +1493,7 @@ struct object_stat_sum_t {
     SPLIT(num_flush_mode_low);
     SPLIT(num_evict_mode_some);
     SPLIT(num_evict_mode_full);
+    SPLIT(num_objects_pinned);
 #undef SPLIT
   }
 
@@ -1596,6 +1627,7 @@ struct pg_stat_t {
   bool omap_stats_invalid;
   bool hitset_stats_invalid;
   bool hitset_bytes_stats_invalid;
+  bool pin_stats_invalid;
 
   /// up, acting primaries
   int32_t up_primary;
@@ -1614,6 +1646,7 @@ struct pg_stat_t {
       omap_stats_invalid(false),
       hitset_stats_invalid(false),
       hitset_bytes_stats_invalid(false),
+      pin_stats_invalid(false),
       up_primary(-1),
       acting_primary(-1)
   { }
@@ -3012,6 +3045,7 @@ struct object_info_t {
     FLAG_OMAP     = 1 << 3,  // has (or may have) some/any omap data
     FLAG_DATA_DIGEST = 1 << 4,  // has data crc
     FLAG_OMAP_DIGEST = 1 << 5,  // has omap crc
+    FLAG_CACHE_PIN = 1 << 6,    // pin the object in cache tier
     // ...
     FLAG_USES_TMAP = 1<<8,  // deprecated; no longer used.
   } flag_t;
@@ -3034,6 +3068,8 @@ struct object_info_t {
       s += "|data_digest";
     if (flags & FLAG_OMAP_DIGEST)
       s += "|omap_digest";
+    if (flags & FLAG_CACHE_PIN)
+      s += "|cache_pin";
     if (s.length())
       return s.substr(1);
     return s;
@@ -3084,6 +3120,9 @@ struct object_info_t {
   bool is_omap_digest() const {
     return test_flag(FLAG_OMAP_DIGEST);
   }
+  bool is_cache_pinned() const {
+    return test_flag(FLAG_CACHE_PIN);
+  }
 
   void set_data_digest(__u32 d) {
     set_flag(FLAG_DATA_DIGEST);
@@ -3131,6 +3170,10 @@ struct object_info_t {
   object_info_t(bufferlist& bl) {
     decode(bl);
   }
+  object_info_t operator=(bufferlist& bl) {
+    decode(bl);
+    return *this;
+  }
 };
 WRITE_CLASS_ENCODER(object_info_t)
 
diff --git a/src/osdc/ObjectCacher.cc b/src/osdc/ObjectCacher.cc
index d1f8020..060b732 100644
--- a/src/osdc/ObjectCacher.cc
+++ b/src/osdc/ObjectCacher.cc
@@ -40,6 +40,7 @@ ObjectCacher::BufferHead *ObjectCacher::Object::split(BufferHead *left, loff_t o
   right->last_read_tid = left->last_read_tid;
   right->set_state(left->get_state());
   right->snapc = left->snapc;
+  right->set_journal_tid(left->journal_tid);
 
   loff_t newleftlen = off - left->start();
   right->set_start(off);
@@ -88,8 +89,14 @@ void ObjectCacher::Object::merge_left(BufferHead *left, BufferHead *right)
   assert(oc->lock.is_locked());
   assert(left->end() == right->start());
   assert(left->get_state() == right->get_state());
+  assert(left->can_merge_journal(right));
 
   ldout(oc->cct, 10) << "merge_left " << *left << " + " << *right << dendl;
+  if (left->get_journal_tid() == 0) {
+    left->set_journal_tid(right->get_journal_tid());
+  }
+  right->set_journal_tid(0);
+
   oc->bh_remove(this, right);
   oc->bh_stat_sub(left);
   left->set_length(left->length() + right->length());
@@ -97,8 +104,8 @@ void ObjectCacher::Object::merge_left(BufferHead *left, BufferHead *right)
 
   // data
   left->bl.claim_append(right->bl);
-  
-  // version 
+
+  // version
   // note: this is sorta busted, but should only be used for dirty buffers
   left->last_write_tid =  MAX( left->last_write_tid, right->last_write_tid );
   left->last_write = MAX( left->last_write, right->last_write );
@@ -134,7 +141,8 @@ void ObjectCacher::Object::try_merge_bh(BufferHead *bh)
   if (p != data.begin()) {
     --p;
     if (p->second->end() == bh->start() &&
-	p->second->get_state() == bh->get_state()) {
+        p->second->get_state() == bh->get_state() &&
+        p->second->can_merge_journal(bh)) {
       merge_left(p->second, bh);
       bh = p->second;
     } else {
@@ -146,7 +154,8 @@ void ObjectCacher::Object::try_merge_bh(BufferHead *bh)
   ++p;
   if (p != data.end() &&
       p->second->start() == bh->end() &&
-      p->second->get_state() == bh->get_state())
+      p->second->get_state() == bh->get_state() &&
+      p->second->can_merge_journal(bh))
     merge_left(bh, p->second);
 }
 
@@ -363,6 +372,7 @@ ObjectCacher::BufferHead *ObjectCacher::Object::map_write(OSDWrite *wr)
           oc->bh_add(this, final);
           ldout(oc->cct, 10) << "map_write adding trailing bh " << *final << dendl;
         } else {
+          replace_journal_tid(final, wr->journal_tid);
 	  oc->bh_stat_sub(final);
           final->set_length(final->length() + max);
 	  oc->bh_stat_add(final);
@@ -371,14 +381,14 @@ ObjectCacher::BufferHead *ObjectCacher::Object::map_write(OSDWrite *wr)
         cur += max;
         continue;
       }
-      
+
       ldout(oc->cct, 10) << "cur is " << cur << ", p is " << *p->second << dendl;
       //oc->verify_stats();
 
       if (p->first <= cur) {
         BufferHead *bh = p->second;
         ldout(oc->cct, 10) << "map_write bh " << *bh << " intersected" << dendl;
-        
+
         if (p->first < cur) {
           assert(final == 0);
           if (cur + max >= bh->end()) {
@@ -406,24 +416,26 @@ ObjectCacher::BufferHead *ObjectCacher::Object::map_write(OSDWrite *wr)
 	    oc->mark_dirty(final);
 	    --p;  // move iterator back to final
 	    assert(p->second == final);
+            replace_journal_tid(bh, 0);
             merge_left(final, bh);
 	  } else {
             final = bh;
 	  }
         }
-        
+
         // keep going.
         loff_t lenfromcur = final->end() - cur;
         cur += lenfromcur;
         left -= lenfromcur;
         ++p;
-        continue; 
+        continue;
       } else {
         // gap!
         loff_t next = p->first;
         loff_t glen = MIN(next - cur, max);
         ldout(oc->cct, 10) << "map_write gap " << cur << "~" << glen << dendl;
         if (final) {
+          replace_journal_tid(final, wr->journal_tid);
 	  oc->bh_stat_sub(final);
           final->set_length(final->length() + glen);
 	  oc->bh_stat_add(final);
@@ -433,21 +445,34 @@ ObjectCacher::BufferHead *ObjectCacher::Object::map_write(OSDWrite *wr)
           final->set_length( glen );
           oc->bh_add(this, final);
         }
-        
+
         cur += glen;
         left -= glen;
         continue;    // more?
       }
     }
   }
-  
-  // set versoin
+
+  // set version
   assert(final);
+  replace_journal_tid(final, wr->journal_tid);
   ldout(oc->cct, 10) << "map_write final is " << *final << dendl;
 
   return final;
 }
 
+void ObjectCacher::Object::replace_journal_tid(BufferHead *bh, ceph_tid_t tid) {
+  ceph_tid_t bh_tid = bh->get_journal_tid();
+
+  assert(tid == 0 || bh_tid <= tid);
+  if (bh_tid != 0 && bh_tid != tid) {
+    // inform journal that it should not expect a writeback from this extent
+    oc->writeback_handler.overwrite_extent(get_oid(), bh->start(), bh->length(),
+                                           bh_tid);
+  }
+  bh->set_journal_tid(tid);
+}
+
 void ObjectCacher::Object::truncate(loff_t s)
 {
   assert(oc->lock.is_locked());
@@ -467,6 +492,7 @@ void ObjectCacher::Object::truncate(loff_t s)
     // remove bh entirely
     assert(bh->start() >= s);
     assert(bh->waitfor_read.empty());
+    replace_journal_tid(bh, 0);
     oc->bh_remove(this, bh);
     delete bh;
   }
@@ -507,6 +533,7 @@ void ObjectCacher::Object::discard(loff_t off, loff_t len)
     ++p;
     ldout(oc->cct, 10) << "discard " << *this << " bh " << *bh << dendl;
     assert(bh->waitfor_read.empty());
+    replace_journal_tid(bh, 0);
     oc->bh_remove(this, bh);
     delete bh;
   }
@@ -843,10 +870,11 @@ void ObjectCacher::bh_write(BufferHead *bh)
                                               bh->ob->get_soid(), bh->start(), bh->length());
   // go
   ceph_tid_t tid = writeback_handler.write(bh->ob->get_oid(), bh->ob->get_oloc(),
-				      bh->start(), bh->length(),
-				      bh->snapc, bh->bl, bh->last_write,
-				      bh->ob->truncate_size, bh->ob->truncate_seq,
-				      oncommit);
+                                           bh->start(), bh->length(),
+                                           bh->snapc, bh->bl, bh->last_write,
+                                           bh->ob->truncate_size,
+                                           bh->ob->truncate_seq,
+                                           bh->journal_tid, oncommit);
   ldout(cct, 20) << " tid " << tid << " on " << bh->ob->get_oid() << dendl;
 
   // set bh last_write_tid
@@ -920,6 +948,7 @@ void ObjectCacher::bh_write_commit(int64_t poolid, sobject_t oid, loff_t start,
       if (r >= 0) {
 	// ok!  mark bh clean and error-free
 	mark_clean(bh);
+        bh->set_journal_tid(0);
 	if (bh->get_nocache())
 	  bh_lru_rest.lru_bottouch(bh);
 	hit.push_back(bh);
@@ -1967,7 +1996,7 @@ void ObjectCacher::clear_nonexistence(ObjectSet *oset)
 /**
  * discard object extents from an ObjectSet by removing the objects in exls from the in-memory oset.
  */
-void ObjectCacher::discard_set(ObjectSet *oset, vector<ObjectExtent>& exls)
+void ObjectCacher::discard_set(ObjectSet *oset, const vector<ObjectExtent>& exls)
 {
   assert(lock.is_locked());
   if (oset->objects.empty()) {
@@ -1979,11 +2008,11 @@ void ObjectCacher::discard_set(ObjectSet *oset, vector<ObjectExtent>& exls)
 
   bool were_dirty = oset->dirty_or_tx > 0;
 
-  for (vector<ObjectExtent>::iterator p = exls.begin();
+  for (vector<ObjectExtent>::const_iterator p = exls.begin();
        p != exls.end();
        ++p) {
     ldout(cct, 10) << "discard_set " << oset << " ex " << *p << dendl;
-    ObjectExtent &ex = *p;
+    const ObjectExtent &ex = *p;
     sobject_t soid(ex.oid, CEPH_NOSNAP);
     if (objects[oset->poolid].count(soid) == 0)
       continue;
@@ -2192,6 +2221,7 @@ void ObjectCacher::bh_add(Object *ob, BufferHead *bh)
 void ObjectCacher::bh_remove(Object *ob, BufferHead *bh)
 {
   assert(lock.is_locked());
+  assert(bh->get_journal_tid() == 0);
   ldout(cct, 30) << "bh_remove " << *ob << " " << *bh << dendl;
   ob->remove_bh(bh);
   if (bh->is_dirty()) {
diff --git a/src/osdc/ObjectCacher.h b/src/osdc/ObjectCacher.h
index eeed83a..87fe351 100644
--- a/src/osdc/ObjectCacher.h
+++ b/src/osdc/ObjectCacher.h
@@ -71,13 +71,16 @@ class ObjectCacher {
     bufferlist bl;
     utime_t mtime;
     int fadvise_flags;
-    OSDWrite(const SnapContext& sc, const bufferlist& b, utime_t mt, int f)
-      : snapc(sc), bl(b), mtime(mt), fadvise_flags(f) {}
+    ceph_tid_t journal_tid;
+    OSDWrite(const SnapContext& sc, const bufferlist& b, utime_t mt, int f,
+             ceph_tid_t _journal_tid)
+      : snapc(sc), bl(b), mtime(mt), fadvise_flags(f),
+        journal_tid(_journal_tid) {}
   };
 
   OSDWrite *prepare_write(const SnapContext& sc, const bufferlist &b,
-			  utime_t mt, int f) { 
-    return new OSDWrite(sc, b, mt, f); 
+			  utime_t mt, int f, ceph_tid_t journal_tid) {
+    return new OSDWrite(sc, b, mt, f, journal_tid);
   }
 
 
@@ -111,6 +114,7 @@ class ObjectCacher {
     ceph_tid_t last_read_tid;   // tid of last read op (if any)
     utime_t last_write;
     SnapContext snapc;
+    ceph_tid_t journal_tid;
     int error; // holds return value for failed reads
     
     map< loff_t, list<Context*> > waitfor_read;
@@ -124,6 +128,7 @@ class ObjectCacher {
       ob(o),
       last_write_tid(0),
       last_read_tid(0),
+      journal_tid(0),
       error(0) {
       ex.start = ex.length = 0;
     }
@@ -143,7 +148,15 @@ class ObjectCacher {
       state = s;
     }
     int get_state() const { return state; }
-    
+
+    inline ceph_tid_t get_journal_tid() const {
+      return journal_tid;
+    }
+    inline void set_journal_tid(ceph_tid_t _journal_tid) {
+      journal_tid = _journal_tid;
+    }
+
     bool is_missing() { return state == STATE_MISSING; }
     bool is_dirty() { return state == STATE_DIRTY; }
     bool is_clean() { return state == STATE_CLEAN; }
@@ -178,6 +191,11 @@ class ObjectCacher {
     bool get_nocache() {
       return nocache;
     }
+
+    inline bool can_merge_journal(BufferHead *bh) const {
+      return (get_journal_tid() == 0 || bh->get_journal_tid() == 0 ||
+              get_journal_tid() == bh->get_journal_tid());
+    }
   };
 
   // ******* Object *********
@@ -308,7 +326,8 @@ class ObjectCacher {
                  map<loff_t, BufferHead*>& rx,
 		 map<loff_t, BufferHead*>& errors);
     BufferHead *map_write(OSDWrite *wr);
-    
+
+    void replace_journal_tid(BufferHead *bh, ceph_tid_t tid);
     void truncate(loff_t s);
     void discard(loff_t off, loff_t len);
 
@@ -635,7 +654,7 @@ public:
   loff_t release_set(ObjectSet *oset);  // returns # of bytes not released (ie non-clean)
   uint64_t release_all();
 
-  void discard_set(ObjectSet *oset, vector<ObjectExtent>& ex);
+  void discard_set(ObjectSet *oset, const vector<ObjectExtent>& ex);
 
   /**
    * Retry any in-flight reads that get -ENOENT instead of marking
@@ -687,7 +706,7 @@ public:
   int file_write(ObjectSet *oset, ceph_file_layout *layout, const SnapContext& snapc,
                  loff_t offset, uint64_t len, 
                  bufferlist& bl, utime_t mtime, int flags) {
-    OSDWrite *wr = prepare_write(snapc, bl, mtime, flags);
+    OSDWrite *wr = prepare_write(snapc, bl, mtime, flags, 0);
     Striper::file_to_extents(cct, oset->ino, layout, offset, len, oset->truncate_size, wr->extents);
     return writex(wr, oset, NULL);
   }
@@ -708,6 +727,9 @@ inline ostream& operator<<(ostream& out, ObjectCacher::BufferHead &bh)
       << " " << bh.ob
       << " (" << bh.bl.length() << ")"
       << " v " << bh.last_write_tid;
+  if (bh.get_journal_tid() != 0) {
+    out << " j " << bh.get_journal_tid();
+  }
   if (bh.is_tx()) out << " tx";
   if (bh.is_rx()) out << " rx";
   if (bh.is_dirty()) out << " dirty";
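The common thread through the ObjectCacher changes above: a dirty BufferHead now remembers the journal tid of the write that produced it, buffers merge only when their tids agree (or one side has none), and every path that invalidates a buffered extent goes through replace_journal_tid() so the journal can stop expecting that writeback. A standalone sketch of the merge rule and the tid handoff in merge_left() (simplified names, not the patch's code):

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    typedef uint64_t tid_t;

    struct Buf { tid_t journal_tid; };

    // Mirrors BufferHead::can_merge_journal(): merging is safe when both
    // buffers belong to the same journal entry, or one of them to none.
    bool can_merge_journal(const Buf& a, const Buf& b) {
      return a.journal_tid == 0 || b.journal_tid == 0 ||
             a.journal_tid == b.journal_tid;
    }

    // Mirrors merge_left()'s tid handoff: the surviving buffer inherits the
    // tid if it had none, and the absorbed buffer is cleared before removal
    // (bh_remove() now asserts the tid is 0).
    void merge_left(Buf& left, Buf& right) {
      assert(can_merge_journal(left, right));
      if (left.journal_tid == 0)
        left.journal_tid = right.journal_tid;
      right.journal_tid = 0;
    }

    int main() {
      Buf a = {0}, b = {7}, c = {9};
      std::printf("%d %d\n", (int)can_merge_journal(a, b),
                  (int)can_merge_journal(b, c));   // prints: 1 0
      merge_left(a, b);
      std::printf("left tid=%llu\n", (unsigned long long)a.journal_tid);  // 7
      return 0;
    }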
diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc
index 60efe3a..ac06048 100644
--- a/src/osdc/Objecter.cc
+++ b/src/osdc/Objecter.cc
@@ -50,7 +50,6 @@
 #include "include/str_list.h"
 #include "common/errno.h"
 
-
 #define dout_subsys ceph_subsys_objecter
 #undef dout_prefix
 #define dout_prefix *_dout << messenger->get_myname() << ".objecter "
@@ -163,17 +162,21 @@ void Objecter::handle_conf_change(const struct md_config_t *conf,
 				  const std::set <std::string> &changed)
 {
   if (changed.count("crush_location")) {
-    crush_location.clear();
-    vector<string> lvec;
-    get_str_vec(cct->_conf->crush_location, ";, \t", lvec);
-    int r = CrushWrapper::parse_loc_multimap(lvec, &crush_location);
-    if (r < 0) {
-      lderr(cct) << "warning: crush_location '" << cct->_conf->crush_location
-		 << "' does not parse" << dendl;
-    }
+    update_crush_location();
   }
 }
 
+void Objecter::update_crush_location()
+{
+  crush_location.clear();
+  vector<string> lvec;
+  get_str_vec(cct->_conf->crush_location, ";, \t", lvec);
+  int r = CrushWrapper::parse_loc_multimap(lvec, &crush_location);
+  if (r < 0) {
+    lderr(cct) << "warning: crush_location '" << cct->_conf->crush_location
+               << "' does not parse" << dendl;
+  }
+}
 
 // messages ------------------------------
 
@@ -288,6 +291,7 @@ void Objecter::init()
   timer.init();
   timer_lock.Unlock();
 
+  update_crush_location();
   cct->_conf->add_observer(this);
 
   initialized.set(1);
@@ -817,10 +821,13 @@ void Objecter::handle_watch_notify(MWatchNotify *m)
 	info->notify_id != m->notify_id) {
       ldout(cct, 10) << __func__ << " reply notify " << m->notify_id
 		     << " != " << info->notify_id << ", ignoring" << dendl;
-    } else {
-      assert(info->on_notify_finish);
+    } else if (info->on_notify_finish) {
       info->notify_result_bl->claim(m->get_data());
       info->on_notify_finish->complete(m->return_code);
+
+      // if we race with reconnect we might get a second notify; only
+      // notify the caller once!
+      info->on_notify_finish = NULL;
     }
   } else {
     finisher->queue(new C_DoWatchNotify(this, info, m));
@@ -1312,6 +1319,8 @@ int Objecter::pool_snap_list(int64_t poolid, vector<uint64_t> *snaps)
   RWLock::RLocker rl(rwlock);
 
   const pg_pool_t *pi = osdmap->get_pg_pool(poolid);
+  if (!pi)
+    return -ENOENT;
   for (map<snapid_t,pool_snap_info_t>::const_iterator p = pi->snaps.begin();
        p != pi->snaps.end();
        ++p) {
@@ -2662,10 +2671,7 @@ int Objecter::_calc_target(op_target_t *t, epoch_t *last_force_resend,  bool any
 int Objecter::_map_session(op_target_t *target, OSDSession **s,
 			   RWLock::Context& lc)
 {
-  int r = _calc_target(target);
-  if (r < 0) {
-    return r;
-  }
+  _calc_target(target);
   return _get_session(target->osd, s, lc);
 }
 
@@ -3142,10 +3148,10 @@ void Objecter::handle_osd_op_reply(MOSDOpReply *m)
     // set rval before running handlers so that handlers
     // can change it if e.g. decoding fails
     if (*pr)
-      **pr = p->rval;
+      **pr = ceph_to_host_errno(p->rval);
     if (*ph) {
       ldout(cct, 10) << " op " << i << " handler " << *ph << dendl;
-      (*ph)->complete(p->rval);
+      (*ph)->complete(ceph_to_host_errno(p->rval));
       *ph = NULL;
     }
   }
@@ -3764,12 +3770,24 @@ void Objecter::handle_pool_op_reply(MPoolOpReply *m)
     if (osdmap->get_epoch() < m->epoch) {
       rwlock.unlock();
       rwlock.get_write();
+      // Recheck op existence, since we let go of the rwlock
+      // (for promotion) above.
+      iter = pool_ops.find(tid);
+      if (iter == pool_ops.end())
+        goto done; // op is gone.
       if (osdmap->get_epoch() < m->epoch) {
         ldout(cct, 20) << "waiting for client to reach epoch " << m->epoch << " before calling back" << dendl;
         _wait_for_new_map(op->onfinish, m->epoch, m->replyCode);
+      } else {
+	// The map epoch changed, probably because an MOSDMap message
+	// sneaked in. Run the caller-specified callback now, or else
+	// we lose it forever.
+	assert(op->onfinish);
+	op->onfinish->complete(m->replyCode);
       }
     }
     else {
+      assert(op->onfinish);
       op->onfinish->complete(m->replyCode);
     }
     op->onfinish = NULL;
@@ -3784,6 +3802,8 @@ void Objecter::handle_pool_op_reply(MPoolOpReply *m)
   } else {
     ldout(cct, 10) << "unknown request " << tid << dendl;
   }
+
+done:
   rwlock.unlock();
 
   ldout(cct, 10) << "done" << dendl;
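The handle_pool_op_reply() hunk above closes a lock-promotion race: rwlock is released and re-acquired for writing, so another thread may have completed and erased the op in the gap, and the map epoch may have caught up, in which case the callback must run now rather than be dropped. A standalone sketch of the recheck-after-relock pattern, with a plain mutex standing in for the rwlock:

    #include <cstdio>
    #include <map>
    #include <mutex>

    std::mutex lock;                    // stand-in for Objecter's rwlock
    std::map<uint64_t, int> pool_ops;   // tid -> op state

    void handle_reply(uint64_t tid) {
      std::unique_lock<std::mutex> l(lock);
      if (pool_ops.find(tid) == pool_ops.end())
        return;
      l.unlock();                       // analogous to read->write promotion:
      // ... another thread may run here and erase the op ...
      l.lock();
      // recheck op existence since we let go of the lock above
      std::map<uint64_t, int>::iterator it = pool_ops.find(tid);
      if (it == pool_ops.end())
        return;                         // op is gone; nothing left to do
      std::printf("completing op %llu\n", (unsigned long long)tid);
      pool_ops.erase(it);
    }

    int main() {
      pool_ops[1] = 0;
      handle_reply(1);
      handle_reply(1);                  // second call finds nothing: safe
      return 0;
    }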
diff --git a/src/osdc/Objecter.h b/src/osdc/Objecter.h
index 379c0ae..ac09e70 100644
--- a/src/osdc/Objecter.h
+++ b/src/osdc/Objecter.h
@@ -904,10 +904,14 @@ struct ObjectOperation {
     osd_op.op.watch.op = op;
   }
 
-  void notify(uint64_t cookie, bufferlist& inbl) {
+  void notify(uint64_t cookie, uint32_t prot_ver, uint32_t timeout,
+              bufferlist &bl, bufferlist *inbl) {
     OSDOp& osd_op = add_op(CEPH_OSD_OP_NOTIFY);
     osd_op.op.notify.cookie = cookie;
-    osd_op.indata.append(inbl);
+    ::encode(prot_ver, *inbl);
+    ::encode(timeout, *inbl);
+    ::encode(bl, *inbl);
+    osd_op.indata.append(*inbl);
   }
 
   void notify_ack(uint64_t notify_id, uint64_t cookie,
@@ -1052,6 +1056,17 @@ struct ObjectOperation {
       out_rval[i] = &sops[i].rval;
     }
   }
+
+  /**
+   * Pin/unpin an object in cache tier
+   */
+  void cache_pin() {
+    add_op(CEPH_OSD_OP_CACHE_PIN);
+  }
+
+  void cache_unpin() {
+    add_op(CEPH_OSD_OP_CACHE_UNPIN);
+  }
 };
 
 
@@ -1112,6 +1127,7 @@ private:
 
   void schedule_tick();
   void tick();
+  void update_crush_location();
 
   class RequestStateHook : public AdminSocketHook {
     Objecter *m_objecter;
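The cache_pin()/cache_unpin() helpers above append a bare op code with no payload, so they compose with other ops in the same compound operation. A hedged sketch of a caller follows; ObjectOperation and bufferlist come from the surrounding headers, write_full() is assumed to behave as elsewhere in this header, the submission call is omitted since its exact signature varies by release, and the OSD presumably honors CACHE_PIN only on a cache-tier pool:

    // Hedged sketch: compose a pin with a follow-up write in one compound op.
    void pin_and_write(ObjectOperation& op) {
      op.cache_pin();              // appends CEPH_OSD_OP_CACHE_PIN, no payload
      bufferlist bl;
      bl.append("pinned payload");
      op.write_full(bl);           // later ops act on the now-pinned object
    }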
diff --git a/src/osdc/WritebackHandler.h b/src/osdc/WritebackHandler.h
index cbcf20d..0e51cb0 100644
--- a/src/osdc/WritebackHandler.h
+++ b/src/osdc/WritebackHandler.h
@@ -32,7 +32,10 @@ class WritebackHandler {
 			   uint64_t off, uint64_t len, const SnapContext& snapc,
 			   const bufferlist &bl, utime_t mtime,
 			   uint64_t trunc_size, __u32 trunc_seq,
-			   Context *oncommit) = 0;
+                           ceph_tid_t journal_tid, Context *oncommit) = 0;
+
+  virtual void overwrite_extent(const object_t& oid, uint64_t off, uint64_t len,
+                                ceph_tid_t journal_tid) {}
 
   virtual void get_client_lock() {}
   virtual void put_client_lock() {}
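overwrite_extent() above deliberately gets an empty default body: ordinary writeback handlers track no journal commitments, so only a journaling handler needs to react when a buffered extent is overwritten before it was flushed. A hedged, non-runnable sketch of such an override; JournalingWritebackHandler, MyJournal and extent_overwritten() are hypothetical names, and WritebackHandler's pure-virtual members are omitted:

    // Hedged sketch, not from this patch: a journaling writeback handler
    // overrides overwrite_extent() so the journal stops expecting a flush
    // of the overwritten extent.
    class JournalingWritebackHandler : public WritebackHandler {
    public:
      virtual void overwrite_extent(const object_t& oid, uint64_t off,
                                    uint64_t len, ceph_tid_t journal_tid) {
        // tell the (hypothetical) journal that the write recorded under
        // journal_tid no longer needs to reach the backing store
        journal->extent_overwritten(oid, off, len, journal_tid);
      }
      // read()/write() and the other pure-virtual members would also need
      // implementations here; omitted for brevity.
    private:
      MyJournal* journal;  // hypothetical journal interface
    };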
diff --git a/src/pybind/rados.py b/src/pybind/rados.py
index 804a169..47a03a1 100644
--- a/src/pybind/rados.py
+++ b/src/pybind/rados.py
@@ -11,7 +11,9 @@ import ctypes
 import errno
 import threading
 import time
+import sys
 
+from collections import Iterator
 from datetime import datetime
 from functools import wraps
 from itertools import chain
@@ -27,6 +29,16 @@ LIBRADOS_OP_FLAG_FADVISE_DONTNEED = 0x20
 LIBRADOS_OP_FLAG_FADVISE_NOCACHE = 0x40
 
 
+# Are we running Python 2.x
+_python2 = sys.hexversion < 0x03000000
+
+
+if _python2:
+    str_type = basestring
+else:
+    str_type = str
+
+
 class Error(Exception):
     """ `Error` class, derived from `Exception` """
     pass
@@ -261,6 +273,42 @@ def requires(*types):
     return wrapper
 
 
+def cstr(val, encoding="utf-8"):
+    """
+    Create a C-style string from a Python string
+
+    :param str val: Python string
+    :param encoding: Encoding to use
+    :rtype: c_char_p
+    """
+    if val is None:
+        return c_char_p(None)
+
+    if _python2 and isinstance(val, str):
+        # Don't encode str on Python 2, as it's already an 8-bit string
+        return c_char_p(val)
+    else:
+        return c_char_p(val.encode(encoding))
+
+
+def decode_cstr(addr, size=-1, encoding="utf-8"):
+    """
+    Decode a C-style string into a Python string.
+
+    Return None if the C string is a NULL pointer.
+
+    :param c_char_p addr: C-style string
+    :param int size: String size (assumed NUL-terminated if size is -1)
+    :param encoding: Encoding to use
+    :rtype: str or None
+    """
+    if not addr:
+        # NULL pointer
+        return None
+
+    return ctypes.string_at(addr, size).decode(encoding)
+
+
 class Rados(object):
     """librados python wrapper"""
     def require_state(self, *args):
@@ -274,8 +322,8 @@ class Rados(object):
         raise RadosStateError("You cannot perform that operation on a \
 Rados object in state %s." % self.state)
 
-    @requires(('rados_id', opt(str)), ('name', opt(str)), ('clustername', opt(str)),
-              ('conffile', opt(str)))
+    @requires(('rados_id', opt(str_type)), ('name', opt(str_type)), ('clustername', opt(str_type)),
+              ('conffile', opt(str_type)))
     def __init__(self, rados_id=None, name=None, clustername=None,
                  conf_defaults=None, conffile=None, conf=None, flags=0):
         library_path = find_library('rados')
@@ -297,15 +345,15 @@ Rados object in state %s." % self.state)
         if clustername is None:
             clustername = 'ceph'
         ret = run_in_thread(self.librados.rados_create2,
-                            (byref(self.cluster), c_char_p(clustername),
-                             c_char_p(name), c_uint64(flags)))
+                            (byref(self.cluster), cstr(clustername),
+                            cstr(name), c_uint64(flags)))
 
         if ret != 0:
             raise Error("rados_initialize failed with error code: %d" % ret)
         self.state = "configuring"
         # order is important: conf_defaults, then conffile, then conf
         if conf_defaults:
-            for key, value in conf_defaults.iteritems():
+            for key, value in conf_defaults.items():
                 self.conf_set(key, value)
         if conffile is not None:
             # read the default conf file when '' is given
@@ -313,7 +361,7 @@ Rados object in state %s." % self.state)
                 conffile = None
             self.conf_read_file(conffile)
         if conf:
-            for key, value in conf.iteritems():
+            for key, value in conf.items():
                 self.conf_set(key, value)
 
     def shutdown(self):
@@ -348,7 +396,7 @@ Rados object in state %s." % self.state)
         return Version(major.value, minor.value, extra.value)
 
 
-    @requires(('path', opt(str)))
+    @requires(('path', opt(str_type)))
     def conf_read_file(self, path=None):
         """
         Configure the cluster handle using a Ceph config file.
@@ -358,7 +406,7 @@ Rados object in state %s." % self.state)
         """
         self.require_state("configuring", "connected")
         ret = run_in_thread(self.librados.rados_conf_read_file,
-                            (self.cluster, c_char_p(path)))
+                            (self.cluster, cstr(path)))
         if (ret != 0):
             raise make_ex(ret, "error calling conf_read_file")
 
@@ -372,7 +420,7 @@ Rados object in state %s." % self.state)
             return
         # create instances of arrays of c_char_p's, both len(args) long
         # cretargs will always be a subset of cargs (perhaps identical)
-        cargs = (c_char_p * len(args))(*args)
+        cargs = (c_char_p * len(args))(*map(cstr, args))
         cretargs = (c_char_p * len(args))()
         ret = run_in_thread(self.librados.rados_conf_parse_argv_remainder,
                             (self.cluster, len(args), cargs, cretargs))
@@ -381,8 +429,7 @@ Rados object in state %s." % self.state)
 
         # cretargs was allocated with fixed length; collapse return
         # list to eliminate any missing args
-
-        retargs = [a for a in cretargs if a is not None]
+        retargs = [a.decode('utf-8') for a in cretargs if a is not None]
         self.parsed_args = args
         return retargs
 
@@ -395,11 +442,11 @@ Rados object in state %s." % self.state)
         if not var:
             return
         ret = run_in_thread(self.librados.rados_conf_parse_env,
-                            (self.cluster, c_char_p(var)))
+                            (self.cluster, cstr(var)))
         if (ret != 0):
             raise make_ex(ret, "error calling conf_parse_env")
 
-    @requires(('option', str))
+    @requires(('option', str_type))
     def conf_get(self, option):
         """
         Get the value of a configuration option
@@ -415,10 +462,10 @@ Rados object in state %s." % self.state)
         while True:
             ret_buf = create_string_buffer(length)
             ret = run_in_thread(self.librados.rados_conf_get,
-                                (self.cluster, c_char_p(option), ret_buf,
-                                 c_size_t(length)))
+                                (self.cluster, cstr(option), ret_buf,
+                                c_size_t(length)))
             if (ret == 0):
-                return ret_buf.value
+                return decode_cstr(ret_buf)
             elif (ret == -errno.ENAMETOOLONG):
                 length = length * 2
             elif (ret == -errno.ENOENT):
@@ -426,7 +473,7 @@ Rados object in state %s." % self.state)
             else:
                 raise make_ex(ret, "error calling conf_get")
 
-    @requires(('option', str), ('val', str))
+    @requires(('option', str_type), ('val', str_type))
     def conf_set(self, option, val):
         """
         Set the value of a configuration option
@@ -440,7 +487,7 @@ Rados object in state %s." % self.state)
         """
         self.require_state("configuring", "connected")
         ret = run_in_thread(self.librados.rados_conf_set,
-                            (self.cluster, c_char_p(option), c_char_p(val)))
+                            (self.cluster, cstr(option), cstr(val)))
         if (ret != 0):
             raise make_ex(ret, "error calling conf_set")
 
@@ -463,7 +510,7 @@ Rados object in state %s." % self.state)
         outstrlen = c_long()
 
         ret = run_in_thread(self.librados.rados_ping_monitor,
-                            (self.cluster, c_char_p(mon_id),
+                            (self.cluster, cstr(mon_id),
                              outstrp, byref(outstrlen)))
 
         my_outstr = outstrp.contents[:(outstrlen.value)]
@@ -472,7 +519,7 @@ Rados object in state %s." % self.state)
 
         if ret != 0:
             raise make_ex(ret, "error calling ping_monitor")
-        return my_outstr
+        return decode_cstr(my_outstr)
 
     def connect(self, timeout=0):
         """
@@ -515,7 +562,7 @@ Rados object in state %s." % self.state)
                 'kb_avail': stats.kb_avail,
                 'num_objects': stats.num_objects}
 
-    @requires(('pool_name', str))
+    @requires(('pool_name', str_type))
     def pool_exists(self, pool_name):
         """
         Checks if a given pool exists.
@@ -528,7 +575,7 @@ Rados object in state %s." % self.state)
         """
         self.require_state("connected")
         ret = run_in_thread(self.librados.rados_pool_lookup,
-                            (self.cluster, c_char_p(pool_name)))
+                            (self.cluster, cstr(pool_name)))
         if (ret >= 0):
             return True
         elif (ret == -errno.ENOENT):
@@ -536,7 +583,7 @@ Rados object in state %s." % self.state)
         else:
             raise make_ex(ret, "error looking up pool '%s'" % pool_name)
 
-    @requires(('pool_name', str))
+    @requires(('pool_name', str_type))
     def pool_lookup(self, pool_name):
         """
         Returns a pool's ID based on its name.
@@ -549,7 +596,7 @@ Rados object in state %s." % self.state)
         """
         self.require_state("connected")
         ret = run_in_thread(self.librados.rados_pool_lookup,
-                            (self.cluster, c_char_p(pool_name)))
+                            (self.cluster, cstr(pool_name)))
         if (ret >= 0):
             return int(ret)
         elif (ret == -errno.ENOENT):
@@ -581,10 +628,10 @@ Rados object in state %s." % self.state)
             elif ret < 0:
                 raise make_ex(ret, "error reverse looking up pool '%s'" % pool_id)
             else:
-                return c_name.value
+                return decode_cstr(c_name.value)
                 break
 
-    @requires(('pool_name', str), ('auid', opt(int)), ('crush_rule', opt(int)))
+    @requires(('pool_name', str_type), ('auid', opt(int)), ('crush_rule', opt(int)))
     def create_pool(self, pool_name, auid=None, crush_rule=None):
         """
         Create a pool:
@@ -606,20 +653,20 @@ Rados object in state %s." % self.state)
         if auid is None:
             if crush_rule is None:
                 ret = run_in_thread(self.librados.rados_pool_create,
-                                    (self.cluster, c_char_p(pool_name)))
+                                    (self.cluster, cstr(pool_name)))
             else:
                 ret = run_in_thread(self.librados.
                                     rados_pool_create_with_crush_rule,
-                                    (self.cluster, c_char_p(pool_name),
+                                    (self.cluster, cstr(pool_name),
                                      c_ubyte(crush_rule)))
 
         elif crush_rule is None:
             ret = run_in_thread(self.librados.rados_pool_create_with_auid,
-                                (self.cluster, c_char_p(pool_name),
+                                (self.cluster, cstr(pool_name),
                                  c_uint64(auid)))
         else:
             ret = run_in_thread(self.librados.rados_pool_create_with_all,
-                                (self.cluster, c_char_p(pool_name),
+                                (self.cluster, cstr(pool_name),
                                  c_uint64(auid), c_ubyte(crush_rule)))
         if ret < 0:
             raise make_ex(ret, "error creating pool '%s'" % pool_name)
@@ -639,7 +686,7 @@ Rados object in state %s." % self.state)
             raise make_ex(ret, "get_pool_base_tier(%d)" % pool_id)
         return base_tier.value
 
-    @requires(('pool_name', str))
+    @requires(('pool_name', str_type))
     def delete_pool(self, pool_name):
         """
         Delete a pool and all data inside it.
@@ -654,7 +701,7 @@ Rados object in state %s." % self.state)
         """
         self.require_state("connected")
         ret = run_in_thread(self.librados.rados_pool_delete,
-                            (self.cluster, c_char_p(pool_name)))
+                            (self.cluster, cstr(pool_name)))
         if ret < 0:
             raise make_ex(ret, "error deleting pool '%s'" % pool_name)
 
@@ -674,7 +721,8 @@ Rados object in state %s." % self.state)
                 size = c_size_t(ret)
             else:
                 break
-        return filter(lambda name: name != '', c_names.raw.split('\0'))
+
+        return [decode_cstr(name) for name in c_names.raw.split(b'\0') if len(name) > 0]
 
     def get_fsid(self):
         """
@@ -692,7 +740,7 @@ Rados object in state %s." % self.state)
             raise make_ex(ret, "error getting cluster fsid")
         return fsid.value
 
-    @requires(('ioctx_name', str))
+    @requires(('ioctx_name', str_type))
     def open_ioctx(self, ioctx_name):
         """
         Create an io context
@@ -709,7 +757,7 @@ Rados object in state %s." % self.state)
         self.require_state("connected")
         ioctx = c_void_p()
         ret = run_in_thread(self.librados.rados_ioctx_create,
-                            (self.cluster, c_char_p(ioctx_name), byref(ioctx)))
+                            (self.cluster, cstr(ioctx_name), byref(ioctx)))
         if ret < 0:
             raise make_ex(ret, "error opening pool '%s'" % ioctx_name)
         return Ioctx(ioctx_name, self.librados, ioctx)
@@ -724,11 +772,11 @@ Rados object in state %s." % self.state)
         outbuflen = c_long()
         outsp = pointer(pointer(c_char()))
         outslen = c_long()
-        cmdarr = (c_char_p * len(cmd))(*cmd)
+        cmdarr = (c_char_p * len(cmd))(*map(cstr, cmd))
 
         if target:
             ret = run_in_thread(self.librados.rados_mon_command_target,
-                                (self.cluster, c_char_p(target), cmdarr,
+                                (self.cluster, cstr(target), cmdarr,
                                  len(cmd), c_char_p(inbuf), len(inbuf),
                                  outbufp, byref(outbuflen), outsp,
                                  byref(outslen)), timeout)
@@ -741,7 +789,7 @@ Rados object in state %s." % self.state)
 
         # copy returned memory (ctypes makes a copy, not a reference)
         my_outbuf = outbufp.contents[:(outbuflen.value)]
-        my_outs = outsp.contents[:(outslen.value)]
+        my_outs = decode_cstr(outsp.contents, outslen.value)
 
         # free callee's allocations
         if outbuflen.value:
@@ -761,7 +809,7 @@ Rados object in state %s." % self.state)
         outbuflen = c_long()
         outsp = pointer(pointer(c_char()))
         outslen = c_long()
-        cmdarr = (c_char_p * len(cmd))(*cmd)
+        cmdarr = (c_char_p * len(cmd))(*map(cstr, cmd))
         ret = run_in_thread(self.librados.rados_osd_command,
                             (self.cluster, osdid, cmdarr, len(cmd),
                              c_char_p(inbuf), len(inbuf),
@@ -770,7 +818,7 @@ Rados object in state %s." % self.state)
 
         # copy returned memory (ctypes makes a copy, not a reference)
         my_outbuf = outbufp.contents[:(outbuflen.value)]
-        my_outs = outsp.contents[:(outslen.value)]
+        my_outs = decode_cstr(outsp.contents, outslen.value)
 
         # free callee's allocations
         if outbuflen.value:
@@ -790,16 +838,16 @@ Rados object in state %s." % self.state)
         outbuflen = c_long()
         outsp = pointer(pointer(c_char()))
         outslen = c_long()
-        cmdarr = (c_char_p * len(cmd))(*cmd)
+        cmdarr = (c_char_p * len(cmd))(*map(cstr, cmd))
         ret = run_in_thread(self.librados.rados_pg_command,
-                            (self.cluster, c_char_p(pgid), cmdarr, len(cmd),
+                            (self.cluster, cstr(pgid), cmdarr, len(cmd),
                              c_char_p(inbuf), len(inbuf),
                              outbufp, byref(outbuflen), outsp, byref(outslen)),
                             timeout)
 
         # copy returned memory (ctypes makes a copy, not a reference)
         my_outbuf = outbufp.contents[:(outbuflen.value)]
-        my_outs = outsp.contents[:(outslen.value)]
+        my_outs = decode_cstr(outsp.contents, outslen.value)
 
         # free callee's allocations
         if outbuflen.value:
@@ -826,13 +874,13 @@ Rados object in state %s." % self.state)
         """
         self.require_state("connected")
         ret = run_in_thread(self.librados.rados_blacklist_add,
-                            (self.cluster, c_char_p(client_address),
+                            (self.cluster, cstr(client_address),
                              c_uint32(expire_seconds)))
         if ret < 0:
             raise make_ex(ret, "error blacklisting client '%s'" % client_address)
 
 
-class OmapIterator(object):
+class OmapIterator(Iterator):
     """Omap iterator"""
     def __init__(self, ioctx, ctx):
         self.ioctx = ioctx
@@ -842,6 +890,9 @@ class OmapIterator(object):
         return self
 
     def next(self):
+        return self.__next__()
+
+    def __next__(self):
         """
         Get the next key-value pair in the object
         :returns: next rados.OmapItem
@@ -855,7 +906,7 @@ class OmapIterator(object):
             raise make_ex(ret, "error iterating over the omap")
         if key_.value is None:
             raise StopIteration()
-        key = ctypes.string_at(key_)
+        key = decode_cstr(key_)
         val = None
         if val_.value is not None:
             val = ctypes.string_at(val_, len_)
@@ -865,7 +916,7 @@ class OmapIterator(object):
         run_in_thread(self.ioctx.librados.rados_omap_get_end, (self.ctx,))
 
 
-class ObjectIterator(object):
+class ObjectIterator(Iterator):
     """rados.Ioctx Object iterator"""
     def __init__(self, ioctx):
         self.ioctx = ioctx
@@ -880,26 +931,33 @@ class ObjectIterator(object):
         return self
 
     def next(self):
+        return self.__next__()
+
+    def __next__(self):
         """
         Get the next object name and locator in the pool
 
         :raises: StopIteration
         :returns: next rados.Ioctx Object
         """
-        key = c_char_p()
-        locator = c_char_p()
-        nspace = c_char_p()
+        key_ = c_char_p()
+        locator_ = c_char_p()
+        nspace_ = c_char_p()
         ret = run_in_thread(self.ioctx.librados.rados_nobjects_list_next,
-                            (self.ctx, byref(key), byref(locator), byref(nspace)))
+                            (self.ctx, byref(key_), byref(locator_), byref(nspace_)))
         if ret < 0:
             raise StopIteration()
-        return Object(self.ioctx, key.value, locator.value, nspace.value)
+
+        key = decode_cstr(key_)
+        locator = decode_cstr(locator_)
+        nspace = decode_cstr(nspace_)
+        return Object(self.ioctx, key, locator, nspace)
 
     def __del__(self):
         run_in_thread(self.ioctx.librados.rados_nobjects_list_close, (self.ctx,))
 
 
-class XattrIterator(object):
+class XattrIterator(Iterator):
     """Extended attribute iterator"""
     def __init__(self, ioctx, it, oid):
         self.ioctx = ioctx
@@ -910,6 +968,9 @@ class XattrIterator(object):
         return self
 
     def next(self):
+        return self.__next__()
+
+    def __next__(self):
         """
         Get the next xattr on the object
 
@@ -926,7 +987,7 @@ class XattrIterator(object):
 in '%s'" % self.oid)
         if name_.value is None:
             raise StopIteration()
-        name = ctypes.string_at(name_)
+        name = decode_cstr(name_)
         val = ctypes.string_at(val_, len_)
         return (name, val)
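
The next()/__next__() pairing introduced above is the standard Python 2/3
bridge: Python 3's iterator protocol calls __next__(), Python 2's calls
next(), so next() simply delegates. The same pattern in a self-contained toy:

    class CountDown(object):
        """Counts down to zero; iterates on both Python 2 and 3."""
        def __init__(self, start):
            self.cur = start

        def __iter__(self):
            return self

        def next(self):              # Python 2 entry point
            return self.__next__()

        def __next__(self):          # Python 3 entry point
            if self.cur <= 0:
                raise StopIteration()
            self.cur -= 1
            return self.cur

    assert list(CountDown(3)) == [2, 1, 0]
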
 
@@ -934,7 +995,7 @@ in '%s'" % self.oid)
         run_in_thread(self.ioctx.librados.rados_getxattrs_end, (self.it,))
 
 
-class SnapIterator(object):
+class SnapIterator(Iterator):
     """Snapshot iterator"""
     def __init__(self, ioctx):
         self.ioctx = ioctx
@@ -958,6 +1019,9 @@ ioctx '%s'" % self.ioctx.name)
         return self
 
     def next(self):
+        return self.__next__()
+
+    def __next__(self):
         """
         Get the next Snapshot
 
@@ -979,7 +1043,7 @@ ioctx '%s'" % self.ioctx.name)
             elif (ret != -errno.ERANGE):
                 raise make_ex(ret, "rados_snap_get_name error")
             name_len = name_len * 2
-        snap = Snap(self.ioctx, name.value, snap_id)
+        snap = Snap(self.ioctx, decode_cstr(name), snap_id)
         self.cur_snap = self.cur_snap + 1
         return snap
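
The loop above shows the usual recipe for C calls that return -ERANGE when
the caller's buffer is too small: double the buffer and retry. The same
recipe as a hypothetical standalone helper (get_name stands in for a call
like rados_snap_get_name):

    import errno
    from ctypes import create_string_buffer

    def read_growing(get_name, initial=512, maximum=1 << 20):
        # Hypothetical helper: retry with a doubled buffer on -ERANGE.
        size = initial
        while size <= maximum:
            buf = create_string_buffer(size)
            ret = get_name(buf, size)
            if ret >= 0:
                return buf.value
            if ret != -errno.ERANGE:
                raise OSError(-ret, 'read failed')
            size *= 2
        raise OSError(errno.ERANGE, 'value longer than %d bytes' % maximum)
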
 
@@ -1244,7 +1308,7 @@ class Ioctx(object):
         """
         completion = self.__get_completion(oncomplete, onsafe)
         ret = run_in_thread(self.librados.rados_aio_write,
-                            (self.io, c_char_p(object_name),
+                            (self.io, cstr(object_name),
                              completion.rados_comp, c_char_p(to_write),
                              c_size_t(len(to_write)), c_uint64(offset)))
         if ret < 0:
@@ -1276,7 +1340,7 @@ class Ioctx(object):
         """
         completion = self.__get_completion(oncomplete, onsafe)
         ret = run_in_thread(self.librados.rados_aio_write_full,
-                            (self.io, c_char_p(object_name),
+                            (self.io, cstr(object_name),
                              completion.rados_comp, c_char_p(to_write),
                              c_size_t(len(to_write))))
         if ret < 0:
@@ -1307,7 +1371,7 @@ class Ioctx(object):
         """
         completion = self.__get_completion(oncomplete, onsafe)
         ret = run_in_thread(self.librados.rados_aio_append,
-                            (self.io, c_char_p(object_name),
+                            (self.io, cstr(object_name),
                              completion.rados_comp, c_char_p(to_append),
                              c_size_t(len(to_append))))
         if ret < 0:
@@ -1354,7 +1418,7 @@ class Ioctx(object):
 
         completion = self.__get_completion(oncomplete_, None)
         ret = run_in_thread(self.librados.rados_aio_read,
-                            (self.io, c_char_p(object_name),
+                            (self.io, cstr(object_name),
                              completion.rados_comp, buf, c_size_t(length),
                              c_uint64(offset)))
         if ret < 0:
@@ -1379,7 +1443,7 @@ class Ioctx(object):
         """
         completion = self.__get_completion(oncomplete, onsafe)
         ret = run_in_thread(self.librados.rados_aio_remove,
-                            (self.io, c_char_p(object_name),
+                            (self.io, cstr(object_name),
                              completion.rados_comp))
         if ret < 0:
             raise make_ex(ret, "error removing %s" % object_name)
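
The aio_* wrappers hand their oncomplete/onsafe callbacks to librados, which
fires them from its own callback thread. A common way to turn that back into
a blocking call, assuming an open Ioctx as elsewhere in this file, is to
signal an Event from the callback:

    import threading

    def write_and_wait(ioctx, name, data):
        # Block until the asynchronous write has been acknowledged.
        done = threading.Event()

        def on_complete(completion):
            done.set()

        ioctx.aio_write(name, data, oncomplete=on_complete)
        done.wait()
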
@@ -1410,7 +1474,7 @@ class Ioctx(object):
             raise make_ex(ret, "error changing auid of '%s' to %d"
                           % (self.name, auid))
 
-    @requires(('loc_key', str))
+    @requires(('loc_key', str_type))
     def set_locator_key(self, loc_key):
         """
         Set the key for mapping objects to pgs within an io context.
@@ -1428,7 +1492,7 @@ class Ioctx(object):
         """
         self.require_ioctx_open()
         run_in_thread(self.librados.rados_ioctx_locator_set_key,
-                      (self.io, c_char_p(loc_key)))
+                      (self.io, cstr(loc_key)))
         self.locator_key = loc_key
 
     def get_locator_key(self):
@@ -1440,7 +1504,7 @@ class Ioctx(object):
         return self.locator_key
 
 
-    @requires(('nspace', str))
+    @requires(('nspace', str_type))
     def set_namespace(self, nspace):
         """
         Set the namespace for objects within an io context.
@@ -1459,7 +1523,7 @@ class Ioctx(object):
         if nspace is None:
             nspace = ""
         run_in_thread(self.librados.rados_ioctx_set_namespace,
-                      (self.io, c_char_p(nspace)))
+                      (self.io, cstr(nspace)))
         self.nspace = nspace
 
     def get_namespace(self):
@@ -1485,7 +1549,7 @@ class Ioctx(object):
             self.state = "closed"
 
 
-    @requires(('key', str), ('data', str))
+    @requires(('key', str_type), ('data', bytes))
     def write(self, key, data, offset=0):
         """
         Write data to an object synchronously
@@ -1493,7 +1557,7 @@ class Ioctx(object):
         :param key: name of the object
         :type key: str
         :param data: data to write
-        :type data: str
+        :type data: bytes
         :param offset: byte offset in the object to begin writing at
         :type offset: int
 
@@ -1504,7 +1568,7 @@ class Ioctx(object):
         self.require_ioctx_open()
         length = len(data)
         ret = run_in_thread(self.librados.rados_write,
-                            (self.io, c_char_p(key), c_char_p(data),
+                            (self.io, cstr(key), c_char_p(data),
                              c_size_t(length), c_uint64(offset)))
         if ret == 0:
             return ret
@@ -1515,7 +1579,7 @@ class Ioctx(object):
             raise LogicError("Ioctx.write(%s): rados_write \
 returned %d, but should return zero on success." % (self.name, ret))
 
-    @requires(('key', str), ('data', str))
+    @requires(('key', str_type), ('data', bytes))
     def write_full(self, key, data):
         """
         Write an entire object synchronously.
@@ -1526,7 +1590,7 @@ returned %d, but should return zero on success." % (self.name, ret))
         :param key: name of the object
         :type key: str
         :param data: data to write
-        :type data: str
+        :type data: bytes
 
         :raises: :class:`TypeError`
         :raises: :class:`Error`
@@ -1535,7 +1599,7 @@ returned %d, but should return zero on success." % (self.name, ret))
         self.require_ioctx_open()
         length = len(data)
         ret = run_in_thread(self.librados.rados_write_full,
-                            (self.io, c_char_p(key), c_char_p(data),
+                            (self.io, cstr(key), c_char_p(data),
                              c_size_t(length)))
         if ret == 0:
             return ret
@@ -1546,7 +1610,7 @@ returned %d, but should return zero on success." % (self.name, ret))
             raise LogicError("Ioctx.write_full(%s): rados_write_full \
 returned %d, but should return zero on success." % (self.name, ret))
 
-    @requires(('key', str), ('data', str))
+    @requires(('key', str_type), ('data', bytes))
     def append(self, key, data):
         """
         Append data to an object synchronously
@@ -1554,7 +1618,7 @@ returned %d, but should return zero on success." % (self.name, ret))
         :param key: name of the object
         :type key: str
         :param data: data to write
-        :type data: str
+        :type data: bytes
 
         :raises: :class:`TypeError`
         :raises: :class:`LogicError`
@@ -1563,7 +1627,7 @@ returned %d, but should return zero on success." % (self.name, ret))
         self.require_ioctx_open()
         length = len(data)
         ret = run_in_thread(self.librados.rados_append,
-                            (self.io, c_char_p(key), c_char_p(data),
+                            (self.io, cstr(key), c_char_p(data),
                              c_size_t(length)))
         if ret == 0:
             return ret
@@ -1574,7 +1638,7 @@ returned %d, but should return zero on success." % (self.name, ret))
             raise LogicError("Ioctx.append(%s): rados_append \
 returned %d, but should return zero on success." % (self.name, ret))
 
-    @requires(('key', str))
+    @requires(('key', str_type))
     def read(self, key, length=8192, offset=0):
         """
         Read data from an object synchronously
@@ -1593,7 +1657,7 @@ returned %d, but should return zero on success." % (self.name, ret))
         self.require_ioctx_open()
         ret_buf = create_string_buffer(length)
         ret = run_in_thread(self.librados.rados_read,
-                            (self.io, c_char_p(key), ret_buf, c_size_t(length),
+                            (self.io, cstr(key), ret_buf, c_size_t(length),
                              c_uint64(offset)))
         if ret < 0:
             raise make_ex(ret, "Ioctx.read(%s): failed to read %s" % (self.name, key))
@@ -1649,7 +1713,7 @@ returned %d, but should return zero on success." % (self.name, ret))
                 "num_wr": stats.num_wr,
                 "num_wr_kb": stats.num_wr_kb}
 
-    @requires(('key', str))
+    @requires(('key', str_type))
     def remove_object(self, key):
         """
         Delete an object
@@ -1665,12 +1729,12 @@ returned %d, but should return zero on success." % (self.name, ret))
         """
         self.require_ioctx_open()
         ret = run_in_thread(self.librados.rados_remove,
-                            (self.io, c_char_p(key)))
+                            (self.io, cstr(key)))
         if ret < 0:
             raise make_ex(ret, "Failed to remove '%s'" % key)
         return True
 
-    @requires(('key', str))
+    @requires(('key', str_type))
     def trunc(self, key, size):
         """
         Resize an object
@@ -1690,12 +1754,12 @@ returned %d, but should return zero on success." % (self.name, ret))
 
         self.require_ioctx_open()
         ret = run_in_thread(self.librados.rados_trunc,
-                            (self.io, c_char_p(key), c_uint64(size)))
+                            (self.io, cstr(key), c_uint64(size)))
         if ret < 0:
             raise make_ex(ret, "Ioctx.trunc(%s): failed to truncate %s" % (self.name, key))
         return ret
 
-    @requires(('key', str))
+    @requires(('key', str_type))
     def stat(self, key):
         """
         Get object stats (size/mtime)
@@ -1712,13 +1776,13 @@ returned %d, but should return zero on success." % (self.name, ret))
         pmtime = c_uint64()
 
         ret = run_in_thread(self.librados.rados_stat,
-                            (self.io, c_char_p(key), pointer(psize),
+                            (self.io, cstr(key), pointer(psize),
                              pointer(pmtime)))
         if ret < 0:
             raise make_ex(ret, "Failed to stat %r" % key)
         return psize.value, time.localtime(pmtime.value)
 
-    @requires(('key', str), ('xattr_name', str))
+    @requires(('key', str_type), ('xattr_name', str_type))
     def get_xattr(self, key, xattr_name):
         """
         Get the value of an extended attribute on an object.
@@ -1737,7 +1801,7 @@ returned %d, but should return zero on success." % (self.name, ret))
         while ret_length < 4096 * 1024 * 1024:
             ret_buf = create_string_buffer(ret_length)
             ret = run_in_thread(self.librados.rados_getxattr,
-                                (self.io, c_char_p(key), c_char_p(xattr_name),
+                                (self.io, cstr(key), cstr(xattr_name),
                                  ret_buf, c_size_t(ret_length)))
             if (ret == -errno.ERANGE):
                 ret_length *= 2
@@ -1747,13 +1811,13 @@ returned %d, but should return zero on success." % (self.name, ret))
                 break
         return ctypes.string_at(ret_buf, ret)
 
-    @requires(('oid', str))
+    @requires(('oid', str_type))
     def get_xattrs(self, oid):
         """
         Start iterating over xattrs on an object.
 
         :param oid: the name of the object to get xattrs from
-        :type key: str
+        :type oid: str
 
         :raises: :class:`TypeError`
         :raises: :class:`Error`
@@ -1762,12 +1826,12 @@ returned %d, but should return zero on success." % (self.name, ret))
         self.require_ioctx_open()
         it = c_void_p(0)
         ret = run_in_thread(self.librados.rados_getxattrs,
-                            (self.io, oid, byref(it)))
+                            (self.io, cstr(oid), byref(it)))
         if ret != 0:
             raise make_ex(ret, "Failed to get rados xattrs for object %r" % oid)
         return XattrIterator(self, it, oid)
 
-    @requires(('key', str), ('xattr_name', str), ('xattr_value', str))
+    @requires(('key', str_type), ('xattr_name', str_type), ('xattr_value', bytes))
     def set_xattr(self, key, xattr_name, xattr_value):
         """
         Set an extended attribute on an object.
@@ -1777,7 +1841,7 @@ returned %d, but should return zero on success." % (self.name, ret))
         :param xattr_name: which extended attribute to set
         :type xattr_name: str
         :param xattr_value: the value of the extended attribute
-        :type xattr_value: str
+        :type xattr_value: bytes
 
         :raises: :class:`TypeError`
         :raises: :class:`Error`
@@ -1785,13 +1849,13 @@ returned %d, but should return zero on success." % (self.name, ret))
         """
         self.require_ioctx_open()
         ret = run_in_thread(self.librados.rados_setxattr,
-                            (self.io, c_char_p(key), c_char_p(xattr_name),
+                            (self.io, cstr(key), cstr(xattr_name),
                              c_char_p(xattr_value), c_size_t(len(xattr_value))))
         if ret < 0:
             raise make_ex(ret, "Failed to set xattr %r" % xattr_name)
         return True
 
-    @requires(('key', str), ('xattr_name', str))
+    @requires(('key', str_type), ('xattr_name', str_type))
     def rm_xattr(self, key, xattr_name):
         """
         Removes an extended attribute from an object.
@@ -1807,7 +1871,7 @@ returned %d, but should return zero on success." % (self.name, ret))
         """
         self.require_ioctx_open()
         ret = run_in_thread(self.librados.rados_rmxattr,
-                            (self.io, c_char_p(key), c_char_p(xattr_name)))
+                            (self.io, cstr(key), cstr(xattr_name)))
         if ret < 0:
             raise make_ex(ret, "Failed to delete key %r xattr %r" %
                           (key, xattr_name))
@@ -1831,7 +1895,7 @@ returned %d, but should return zero on success." % (self.name, ret))
         self.require_ioctx_open()
         return SnapIterator(self)
 
-    @requires(('snap_name', str))
+    @requires(('snap_name', str_type))
     def create_snap(self, snap_name):
         """
         Create a pool-wide snapshot
@@ -1844,11 +1908,11 @@ returned %d, but should return zero on success." % (self.name, ret))
         """
         self.require_ioctx_open()
         ret = run_in_thread(self.librados.rados_ioctx_snap_create,
-                            (self.io, c_char_p(snap_name)))
+                            (self.io, cstr(snap_name)))
         if (ret != 0):
             raise make_ex(ret, "Failed to create snap %s" % snap_name)
 
-    @requires(('snap_name', str))
+    @requires(('snap_name', str_type))
     def remove_snap(self, snap_name):
         """
         Removes a pool-wide snapshot
@@ -1861,11 +1925,11 @@ returned %d, but should return zero on success." % (self.name, ret))
         """
         self.require_ioctx_open()
         ret = run_in_thread(self.librados.rados_ioctx_snap_remove,
-                            (self.io, c_char_p(snap_name)))
+                            (self.io, cstr(snap_name)))
         if (ret != 0):
             raise make_ex(ret, "Failed to remove snap %s" % snap_name)
 
-    @requires(('snap_name', str))
+    @requires(('snap_name', str_type))
     def lookup_snap(self, snap_name):
         """
         Get the id of a pool snapshot
@@ -1880,7 +1944,7 @@ returned %d, but should return zero on success." % (self.name, ret))
         self.require_ioctx_open()
         snap_id = c_uint64()
         ret = run_in_thread(self.librados.rados_ioctx_snap_lookup,
-                            (self.io, c_char_p(snap_name), byref(snap_id)))
+                            (self.io, cstr(snap_name), byref(snap_id)))
         if (ret != 0):
             raise make_ex(ret, "Failed to lookup snap %s" % snap_name)
         return Snap(self, snap_name, snap_id)
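
The @requires decorator seen on these methods checks argument types up front,
which is what makes the str -> str_type substitutions take effect on both
interpreters. Its definition is outside this excerpt; a simplified sketch of
the mechanism, ignoring the opt() wrapper for optional arguments:

    from functools import wraps

    def requires(*specs):
        # specs are (argument_name, expected_type) pairs; this sketch
        # assumes it decorates bound methods (self is the first arg).
        def decorate(fn):
            names = fn.__code__.co_varnames[1:fn.__code__.co_argcount]

            @wraps(fn)
            def checked(self, *args, **kwargs):
                bound = dict(zip(names, args))
                bound.update(kwargs)
                for name, typ in specs:
                    if name in bound and not isinstance(bound[name], typ):
                        raise TypeError('%s must be a %s'
                                        % (name, typ.__name__))
                return fn(self, *args, **kwargs)
            return checked
        return decorate
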
@@ -1945,7 +2009,7 @@ returned %d, but should return zero on success." % (self.name, ret))
         key_num = len(keys)
         key_array_type = c_char_p*key_num
         key_array = key_array_type()
-        key_array[:] = keys
+        key_array[:] = [cstr(key) for key in keys]
 
         value_array_type = c_char_p*key_num
         value_array = value_array_type()
@@ -1960,7 +2024,7 @@ returned %d, but should return zero on success." % (self.name, ret))
                       (c_void_p(write_op), byref(key_array), byref(value_array),
                        byref(lens_array), c_int(key_num),))
 
-    @requires(('write_op', int), ('oid', str), ('mtime', opt(int)), ('flags', opt(int)))
+    @requires(('write_op', int), ('oid', str_type), ('mtime', opt(int)), ('flags', opt(int)))
     def operate_write_op(self, write_op, oid, mtime=0, flags=0):
         """
         Execute the real write operation
@@ -1974,10 +2038,10 @@ returned %d, but should return zero on success." % (self.name, ret))
         :type flags: int
         """
         run_in_thread(self.librados.rados_write_op_operate,
-                      (c_void_p(write_op), self.io, c_char_p(oid),
+                      (c_void_p(write_op), self.io, cstr(oid),
                        c_long(mtime), c_int(flags),))
 
-    @requires(('read_op', int), ('oid', str), ('flag', opt(int)))
+    @requires(('read_op', int), ('oid', str_type), ('flag', opt(int)))
     def operate_read_op(self, read_op, oid, flag=0):
         """
         Execute the real read operation
@@ -1989,9 +2053,9 @@ returned %d, but should return zero on success." % (self.name, ret))
         :type flag: int
         """
         run_in_thread(self.librados.rados_read_op_operate,
-                      (c_void_p(read_op), self.io, c_char_p(oid), c_int(flag),))
+                      (c_void_p(read_op), self.io, cstr(oid), c_int(flag),))
 
-    @requires(('read_op', int), ('start_after', str), ('filter_prefix', str), ('max_return', int))
+    @requires(('read_op', int), ('start_after', str_type), ('filter_prefix', str_type), ('max_return', int))
     def get_omap_vals(self, read_op, start_after, filter_prefix, max_return):
         """
         Get the omap values
@@ -2008,12 +2072,12 @@ returned %d, but should return zero on success." % (self.name, ret))
         prval = c_int()
         iter_addr = c_void_p()
         run_in_thread(self.librados.rados_read_op_omap_get_vals,
-                      (c_void_p(read_op), c_char_p(start_after),
-                       c_char_p(filter_prefix), c_int(max_return),
+                      (c_void_p(read_op), cstr(start_after),
+                       cstr(filter_prefix), c_int(max_return),
                        byref(iter_addr), pointer(prval)))
         return OmapIterator(self, iter_addr), prval.value
 
-    @requires(('read_op', int), ('start_after', str), ('max_return', int))
+    @requires(('read_op', int), ('start_after', str_type), ('max_return', int))
     def get_omap_keys(self, read_op, start_after, max_return):
         """
         Get the omap keys
@@ -2028,7 +2092,7 @@ returned %d, but should return zero on success." % (self.name, ret))
         prval = c_int()
         iter_addr = c_void_p()
         run_in_thread(self.librados.rados_read_op_omap_get_keys,
-                      (c_void_p(read_op), c_char_p(start_after),
+                      (c_void_p(read_op), cstr(start_after),
                        c_int(max_return), byref(iter_addr), pointer(prval)))
         return OmapIterator(self, iter_addr), prval.value
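
Passing a list of keys down to librados means building a ctypes array of char
pointers, and on Python 3 a c_char_p slot only accepts bytes; that is why the
array is now filled through cstr(). Standalone illustration:

    from ctypes import c_char_p

    keys = ['first', 'second', 'third']
    key_array = (c_char_p * len(keys))()
    # Encode before assignment: c_char_p slots reject text on Python 3.
    key_array[:] = [k.encode('utf-8') for k in keys]
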
 
@@ -2047,7 +2111,7 @@ returned %d, but should return zero on success." % (self.name, ret))
         key_num = len(keys)
         key_array_type = c_char_p*key_num
         key_array = key_array_type()
-        key_array[:] = keys
+        key_array[:] = [cstr(key) for key in keys]
         run_in_thread(self.librados.rados_read_op_omap_get_vals_by_keys,
                       (c_void_p(read_op), byref(key_array), c_int(key_num),
                        byref(iter_addr), pointer(prval)))
@@ -2065,7 +2129,7 @@ returned %d, but should return zero on success." % (self.name, ret))
         key_num = len(keys)
         key_array_type = c_char_p*key_num
         key_array = key_array_type()
-        key_array[:] = keys
+        key_array[:] = [cstr(key) for key in keys]
         run_in_thread(self.librados.rados_write_op_omap_rm_keys,
                       (c_void_p(write_op), byref(key_array), c_int(key_num)))
 
@@ -2079,7 +2143,7 @@ returned %d, but should return zero on success." % (self.name, ret))
         run_in_thread(self.librados.rados_write_op_omap_clear,
                       (c_void_p(write_op),))
 
-    @requires(('key', str), ('name', str), ('cookie', str), ('desc', str),
+    @requires(('key', str_type), ('name', str_type), ('cookie', str_type), ('desc', str_type),
               ('duration', opt(int)), ('flags', int))
     def lock_exclusive(self, key, name, cookie, desc="", duration=None, flags=0):
 
@@ -2105,15 +2169,15 @@ returned %d, but should return zero on success." % (self.name, ret))
         self.require_ioctx_open()
 
         ret = run_in_thread(self.librados.rados_lock_exclusive,
-                            (self.io, c_char_p(key), c_char_p(name), c_char_p(cookie),
-                             c_char_p(desc),
+                            (self.io, cstr(key), cstr(name), cstr(cookie),
+                             cstr(desc),
                              byref(timeval(duration, 0)) if duration is not None else None,
                              c_uint8(flags)))
         if ret < 0:
             raise make_ex(ret, "Ioctx.rados_lock_exclusive(%s): failed to set lock %s on %s" % (self.name, name, key))
 
-    @requires(('key', str), ('name', str), ('cookie', str), ('tag', str),
-              ('desc', str), ('duration', opt(int)), ('flags', int))
+    @requires(('key', str_type), ('name', str_type), ('cookie', str_type), ('tag', str_type),
+              ('desc', str_type), ('duration', opt(int)), ('flags', int))
     def lock_shared(self, key, name, cookie, tag, desc="", duration=None, flags=0):
 
         """
@@ -2140,14 +2204,14 @@ returned %d, but should return zero on success." % (self.name, ret))
         self.require_ioctx_open()
 
         ret = run_in_thread(self.librados.rados_lock_shared,
-                            (self.io, c_char_p(key), c_char_p(name), c_char_p(cookie),
-                             c_char_p(tag), c_char_p(desc),
+                            (self.io, cstr(key), cstr(name), cstr(cookie),
+                             cstr(tag), cstr(desc),
                              byref(timeval(duration, 0)) if duration is not None else None,
                              c_uint8(flags)))
         if ret < 0:
             raise make_ex(ret, "Ioctx.rados_lock_shared(%s): failed to set lock %s on %s" % (self.name, name, key))
 
-    @requires(('key', str), ('name', str), ('cookie', str))
+    @requires(('key', str_type), ('name', str_type), ('cookie', str_type))
     def unlock(self, key, name, cookie):
 
         """
@@ -2166,7 +2230,7 @@ returned %d, but should return zero on success." % (self.name, ret))
         self.require_ioctx_open()
 
         ret = run_in_thread(self.librados.rados_unlock,
-                            (self.io, c_char_p(key), c_char_p(name), c_char_p(cookie)))
+                            (self.io, cstr(key), cstr(name), cstr(cookie)))
         if ret < 0:
             raise make_ex(ret, "Ioctx.rados_unlock(%s): failed to release lock %s on %s" % (self.name, name, key))
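
Taken together, the rados.py changes let the binding run unmodified on
Python 3: names (pools, objects, keys) are text on both interpreters, while
payloads are bytes. A minimal usage sketch, assuming a reachable cluster and
an existing pool named 'data' (placeholder names):

    import rados

    cluster = rados.Rados(conffile='/etc/ceph/ceph.conf')
    cluster.connect()
    try:
        ioctx = cluster.open_ioctx('data')      # pool name: text
        try:
            ioctx.write_full('greeting', b'hello world')  # payload: bytes
            print(ioctx.read('greeting'))       # -> b'hello world'
        finally:
            ioctx.close()
    finally:
        cluster.shutdown()
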
 
diff --git a/src/pybind/rbd.py b/src/pybind/rbd.py
index b570a00..8f910de 100644
--- a/src/pybind/rbd.py
+++ b/src/pybind/rbd.py
@@ -9,18 +9,18 @@ Error codes from librbd are turned into exceptions that subclass
 (the base class of all rbd exceptions), :class:`PermissionError`
 and :class:`IOError`, in addition to those documented for the
 method.
-
-A number of methods have string arguments, which must not be unicode
-to interact correctly with librbd. If unicode is passed to these
-methods, a :class:`TypeError` will be raised.
 """
 # Copyright 2011 Josh Durgin
+from collections import Iterable
 from ctypes import CDLL, c_char, c_char_p, c_size_t, c_void_p, c_int, \
     create_string_buffer, byref, Structure, c_uint64, c_int64, c_uint8, \
     CFUNCTYPE
 from ctypes.util import find_library
 import ctypes
 import errno
+import sys
+
+from rados import cstr, decode_cstr
 
 ANONYMOUS_AUID = 0xffffffffffffffff
 ADMIN_AUID = 0
@@ -31,13 +31,15 @@ RBD_FEATURE_EXCLUSIVE_LOCK = 4
 RBD_FEATURE_OBJECT_MAP = 8
 RBD_FEATURE_FAST_DIFF = 16
 RBD_FEATURE_DEEP_FLATTEN = 32
+RBD_FEATURE_JOURNALING = 64
 
 RBD_FEATURES_ALL = (RBD_FEATURE_LAYERING       |
                     RBD_FEATURE_STRIPINGV2     |
                     RBD_FEATURE_EXCLUSIVE_LOCK |
                     RBD_FEATURE_OBJECT_MAP     |
                     RBD_FEATURE_FAST_DIFF      |
-                    RBD_FEATURE_DEEP_FLATTEN)
+                    RBD_FEATURE_DEEP_FLATTEN   |
+                    RBD_FEATURE_JOURNALING)
 
 # features that make an image inaccessible for read or write by
 # clients that don't understand them
@@ -50,21 +52,40 @@ RBD_FEATURES_RW_INCOMPATIBLE = (RBD_FEATURES_INCOMPATIBLE  |
                                 RBD_FEATURE_EXCLUSIVE_LOCK |
                                 RBD_FEATURE_OBJECT_MAP     |
                                 RBD_FEATURE_FAST_DIFF      |
-                                RBD_FEATURE_DEEP_FLATTEN)
+                                RBD_FEATURE_DEEP_FLATTEN   |
+                                RBD_FEATURE_JOURNALING)
 
 # features that may be dynamically enabled or disabled
 RBD_FEATURES_MUTABLE = (RBD_FEATURE_EXCLUSIVE_LOCK |
                         RBD_FEATURE_OBJECT_MAP     |
-                        RBD_FEATURE_FAST_DIFF)
+                        RBD_FEATURE_FAST_DIFF      |
+                        RBD_FEATURE_JOURNALING)
 
 # features that only work when used with a single client
 # using the image for writes
 RBD_FEATURES_SINGLE_CLIENT = (RBD_FEATURE_EXCLUSIVE_LOCK |
                               RBD_FEATURE_OBJECT_MAP     |
-                              RBD_FEATURE_FAST_DIFF)
+                              RBD_FEATURE_FAST_DIFF      |
+                              RBD_FEATURE_JOURNALING)
 
 RBD_FLAG_OBJECT_MAP_INVALID = 1
 
+RBD_IMAGE_OPTION_FORMAT = 0
+RBD_IMAGE_OPTION_FEATURES = 1
+RBD_IMAGE_OPTION_ORDER = 2
+RBD_IMAGE_OPTION_STRIPE_UNIT = 3
+RBD_IMAGE_OPTION_STRIPE_COUNT = 4
+
+
+# Are we running Python 2.x
+_python2 = sys.hexversion < 0x03000000
+
+
+if _python2:
+    str_type = basestring
+else:
+    str_type = str
+
 
 class Error(Exception):
     pass
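
RBD_FEATURE_JOURNALING joins the existing feature bits, and the
RBD_FEATURES_* aggregates above are plain bitmasks, so composing and testing
feature sets is ordinary bitwise arithmetic (constant values as defined in
this module):

    RBD_FEATURE_LAYERING = 1
    RBD_FEATURE_OBJECT_MAP = 8
    RBD_FEATURE_JOURNALING = 64

    features = RBD_FEATURE_LAYERING | RBD_FEATURE_JOURNALING
    assert features & RBD_FEATURE_JOURNALING        # journaling requested
    assert not (features & RBD_FEATURE_OBJECT_MAP)  # object map not set
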
@@ -244,32 +265,58 @@ class RBD(object):
         """
         if order is None:
             order = 0
-        if not isinstance(name, str):
+        if not isinstance(name, str_type):
             raise TypeError('name must be a string')
         if old_format:
             if features != 0 or stripe_unit != 0 or stripe_count != 0:
                 raise InvalidArgument('format 1 images do not support feature'
                                       ' masks or non-default striping')
-            ret = self.librbd.rbd_create(ioctx.io, c_char_p(name),
+            ret = self.librbd.rbd_create(ioctx.io, cstr(name),
                                          c_uint64(size),
                                          byref(c_int(order)))
         else:
             if not hasattr(self.librbd, 'rbd_create2'):
                 raise FunctionNotSupported('installed version of librbd does'
                                            ' not support format 2 images')
+            has_create4 = hasattr(self.librbd, 'rbd_create4')
             has_create3 = hasattr(self.librbd, 'rbd_create3')
             if (stripe_unit != 0 or stripe_count != 0) and not has_create3:
                 raise FunctionNotSupported('installed version of librbd does'
                                            ' not support stripe unit or count')
-            if has_create3:
-                ret = self.librbd.rbd_create3(ioctx.io, c_char_p(name),
+            if has_create4:
+                format = 1 if old_format else 2
+                opts = c_void_p()
+                self.librbd.rbd_image_options_create(byref(opts))
+                self.librbd.rbd_image_options_set_uint64(opts,
+                                                         RBD_IMAGE_OPTION_FORMAT,
+                                                         c_uint64(format))
+                self.librbd.rbd_image_options_set_uint64(opts,
+                                                         RBD_IMAGE_OPTION_FEATURES,
+                                                         c_uint64(features))
+                self.librbd.rbd_image_options_set_uint64(opts,
+                                                         RBD_IMAGE_OPTION_ORDER,
+                                                         c_uint64(order))
+                self.librbd.rbd_image_options_set_uint64(opts,
+                                                         RBD_IMAGE_OPTION_STRIPE_UNIT,
+                                                         c_uint64(stripe_unit))
+                self.librbd.rbd_image_options_set_uint64(opts,
+                                                         RBD_IMAGE_OPTION_STRIPE_COUNT,
+                                                         c_uint64(stripe_count))
+                ret = self.librbd.rbd_create4(ioctx.io, cstr(name),
+                                              c_uint64(size), opts)
+                self.librbd.rbd_image_options_get_uint64(opts,
+                                                         RBD_IMAGE_OPTION_ORDER,
+                                                         byref(c_uint64(order)))
+                self.librbd.rbd_image_options_destroy(opts)
+            elif has_create3:
+                ret = self.librbd.rbd_create3(ioctx.io, cstr(name),
                                               c_uint64(size),
                                               c_uint64(features),
                                               byref(c_int(order)),
                                               c_uint64(stripe_unit),
                                               c_uint64(stripe_count))
             else:
-                ret = self.librbd.rbd_create2(ioctx.io, c_char_p(name),
+                ret = self.librbd.rbd_create2(ioctx.io, cstr(name),
                                               c_uint64(size),
                                               c_uint64(features),
                                               byref(c_int(order)))
@@ -277,7 +324,7 @@ class RBD(object):
             raise make_ex(ret, 'error creating image')
 
     def clone(self, p_ioctx, p_name, p_snapname, c_ioctx, c_name,
-              features=0, order=None):
+              features=0, order=None, stripe_unit=0, stripe_count=0):
         """
         Clone a parent rbd snapshot into a COW sparse child.
 
@@ -295,6 +342,10 @@ class RBD(object):
         :type features: int
         :param order: the image is split into (2**order) byte objects
         :type order: int
+        :param stripe_unit: stripe unit in bytes (default 0 for object size)
+        :type stripe_unit: int
+        :param stripe_count: objects to stripe over before looping
+        :type stripe_count: int
         :raises: :class:`TypeError`
         :raises: :class:`InvalidArgument`
         :raises: :class:`ImageExists`
@@ -303,16 +354,44 @@ class RBD(object):
         """
         if order is None:
             order = 0
-        if not isinstance(p_snapname, str) or not isinstance(p_name, str):
+        if not isinstance(p_snapname, str_type) or not isinstance(p_name, str_type):
             raise TypeError('parent name and snapname must be strings')
-        if not isinstance(c_name, str):
+        if not isinstance(c_name, str_type):
             raise TypeError('child name must be a string')
 
-        ret = self.librbd.rbd_clone(p_ioctx.io, c_char_p(p_name),
-                                    c_char_p(p_snapname),
-                                    c_ioctx.io, c_char_p(c_name),
-                                    c_uint64(features),
-                                    byref(c_int(order)))
+        has_clone3 = hasattr(self.librbd, 'rbd_clone3')
+        if (stripe_unit != 0 or stripe_count != 0) and not has_clone3:
+            raise FunctionNotSupported('installed version of librbd does'
+                                       ' not support stripe unit or count')
+        if has_clone3:
+            opts = c_void_p()
+            self.librbd.rbd_image_options_create(byref(opts))
+            self.librbd.rbd_image_options_set_uint64(opts,
+                                                     RBD_IMAGE_OPTION_FEATURES,
+                                                     c_uint64(features))
+            self.librbd.rbd_image_options_set_uint64(opts,
+                                                     RBD_IMAGE_OPTION_ORDER,
+                                                     c_uint64(order))
+            self.librbd.rbd_image_options_set_uint64(opts,
+                                                     RBD_IMAGE_OPTION_STRIPE_UNIT,
+                                                     c_uint64(stripe_unit))
+            self.librbd.rbd_image_options_set_uint64(opts,
+                                                     RBD_IMAGE_OPTION_STRIPE_COUNT,
+                                                     c_uint64(stripe_count))
+            ret = self.librbd.rbd_clone3(p_ioctx.io, cstr(p_name),
+                                         cstr(p_snapname),
+                                         c_ioctx.io, cstr(c_name),
+                                         opts)
+            self.librbd.rbd_image_options_get_uint64(opts,
+                                                     RBD_IMAGE_OPTION_ORDER,
+                                                     byref(c_uint64(order)))
+            self.librbd.rbd_image_options_destroy(opts)
+        else:
+            ret = self.librbd.rbd_clone(p_ioctx.io, cstr(p_name),
+                                        cstr(p_snapname),
+                                        c_ioctx.io, cstr(c_name),
+                                        c_uint64(features),
+                                        byref(c_int(order)))
         if ret < 0:
             raise make_ex(ret, 'error creating clone')
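
rbd_clone3() here, like rbd_create4() above and rbd_copy3() below, replaces a
growing list of positional arguments with an opaque options handle: create
it, set RBD_IMAGE_OPTION_* values, make the call, destroy it. A hypothetical
context-manager wrapper makes that lifecycle explicit:

    from contextlib import contextmanager
    from ctypes import byref, c_uint64, c_void_p

    @contextmanager
    def image_options(librbd, opts):
        # Hypothetical helper; opts maps RBD_IMAGE_OPTION_* codes to
        # integer values.
        handle = c_void_p()
        librbd.rbd_image_options_create(byref(handle))
        try:
            for code, value in opts.items():
                librbd.rbd_image_options_set_uint64(handle, code,
                                                    c_uint64(value))
            yield handle
        finally:
            librbd.rbd_image_options_destroy(handle)

With such a wrapper a clone becomes "with image_options(librbd, {...}) as
opts: rbd_clone3(..., opts)", and the handle is destroyed even if the call
raises.
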
 
@@ -332,7 +411,8 @@ class RBD(object):
                 break
             elif ret != -errno.ERANGE:
                 raise make_ex(ret, 'error listing images')
-        return filter(lambda name: name != '', c_names.raw.split('\0'))
+
+        return [decode_cstr(name) for name in c_names.raw.split(b'\0') if len(name) > 0]
 
     def remove(self, ioctx, name):
         """
@@ -351,9 +431,9 @@ class RBD(object):
         :raises: :class:`ImageNotFound`, :class:`ImageBusy`,
                  :class:`ImageHasSnapshots`
         """
-        if not isinstance(name, str):
+        if not isinstance(name, str_type):
             raise TypeError('name must be a string')
-        ret = self.librbd.rbd_remove(ioctx.io, c_char_p(name))
+        ret = self.librbd.rbd_remove(ioctx.io, cstr(name))
         if ret != 0:
             raise make_ex(ret, 'error removing image')
 
@@ -369,9 +449,9 @@ class RBD(object):
         :type dest: str
         :raises: :class:`ImageNotFound`, :class:`ImageExists`
         """
-        if not isinstance(src, str) or not isinstance(dest, str):
+        if not isinstance(src, str_type) or not isinstance(dest, str_type):
             raise TypeError('src and dest must be strings')
-        ret = self.librbd.rbd_rename(ioctx.io, c_char_p(src), c_char_p(dest))
+        ret = self.librbd.rbd_rename(ioctx.io, cstr(src), cstr(dest))
         if ret != 0:
             raise make_ex(ret, 'error renaming image')
 
@@ -412,20 +492,20 @@ class Image(object):
         self.librbd = load_librbd()
         self.image = c_void_p()
         self.name = name
-        if not isinstance(name, str):
+        if not isinstance(name, str_type):
             raise TypeError('name must be a string')
-        if snapshot is not None and not isinstance(snapshot, str):
+        if snapshot is not None and not isinstance(snapshot, str_type):
             raise TypeError('snapshot must be a string or None')
         if read_only:
             if not hasattr(self.librbd, 'rbd_open_read_only'):
                 raise FunctionNotSupported('installed version of librbd does '
                                            'not support open in read-only mode')
-            ret = self.librbd.rbd_open_read_only(ioctx.io, c_char_p(name),
+            ret = self.librbd.rbd_open_read_only(ioctx.io, cstr(name),
                                                  byref(self.image),
-                                                 c_char_p(snapshot))
+                                                 cstr(snapshot))
         else:
-            ret = self.librbd.rbd_open(ioctx.io, c_char_p(name),
-                                       byref(self.image), c_char_p(snapshot))
+            ret = self.librbd.rbd_open(ioctx.io, cstr(name),
+                                       byref(self.image), cstr(snapshot))
         if ret != 0:
             raise make_ex(ret, 'error opening image %s at snapshot %s' % (name, snapshot))
         self.closed = False
@@ -506,7 +586,7 @@ class Image(object):
             'obj_size'          : info.obj_size,
             'num_objs'          : info.num_objs,
             'order'             : info.order,
-            'block_name_prefix' : info.block_name_prefix,
+            'block_name_prefix' : decode_cstr(info.block_name_prefix),
             'parent_pool'       : info.parent_pool,
             'parent_name'       : info.parent_name
             }
@@ -536,7 +616,9 @@ class Image(object):
 
         if ret != 0:
             raise make_ex(ret, 'error getting parent info for image %s' % (self.name,))
-        return (pool.value, name.value, snapname.value)
+        return (decode_cstr(pool.value),
+                decode_cstr(name.value),
+                decode_cstr(snapname.value))
 
     def old_format(self):
         """
@@ -632,7 +714,8 @@ class Image(object):
             raise make_ex(ret, 'error getting lock status for image %s' % (self.name))
         return owner.value == 1
 
-    def copy(self, dest_ioctx, dest_name):
+    def copy(self, dest_ioctx, dest_name, features=0, order=None, stripe_unit=0,
+             stripe_count=0):
         """
         Copy the image to another location.
 
@@ -640,11 +723,51 @@ class Image(object):
         :type dest_ioctx: :class:`rados.Ioctx`
         :param dest_name: the name of the copy
         :type dest_name: str
+        :param features: bitmask of features to enable; if set, must include layering
+        :type features: int
+        :param order: the image is split into (2**order) byte objects
+        :type order: int
+        :param stripe_unit: stripe unit in bytes (default 0 for object size)
+        :type stripe_unit: int
+        :param stripe_count: objects to stripe over before looping
+        :type stripe_count: int
+        :raises: :class:`TypeError`
+        :raises: :class:`InvalidArgument`
         :raises: :class:`ImageExists`
+        :raises: :class:`FunctionNotSupported`
+        :raises: :class:`ArgumentOutOfRange`
         """
-        if not isinstance(dest_name, str):
+        if order is None:
+            order = 0
+        if not isinstance(dest_name, str_type):
             raise TypeError('dest_name must be a string')
-        ret = self.librbd.rbd_copy(self.image, dest_ioctx.io, c_char_p(dest_name))
+        has_copy3 = hasattr(self.librbd, 'rbd_copy3')
+        if (stripe_unit != 0 or stripe_count != 0) and not has_copy3:
+            raise FunctionNotSupported('installed version of librbd does'
+                                       ' not support stripe unit or count')
+        if has_copy3:
+            opts = c_void_p()
+            self.librbd.rbd_image_options_create(byref(opts))
+            self.librbd.rbd_image_options_set_uint64(opts,
+                                                     RBD_IMAGE_OPTION_FEATURES,
+                                                     c_uint64(features))
+            self.librbd.rbd_image_options_set_uint64(opts,
+                                                     RBD_IMAGE_OPTION_ORDER,
+                                                     c_uint64(order))
+            self.librbd.rbd_image_options_set_uint64(opts,
+                                                     RBD_IMAGE_OPTION_STRIPE_UNIT,
+                                                     c_uint64(stripe_unit))
+            self.librbd.rbd_image_options_set_uint64(opts,
+                                                     RBD_IMAGE_OPTION_STRIPE_COUNT,
+                                                     c_uint64(stripe_count))
+            ret = self.librbd.rbd_copy3(self.image, dest_ioctx.io,
+                                        cstr(dest_name), opts)
+            self.librbd.rbd_image_options_get_uint64(opts,
+                                                     RBD_IMAGE_OPTION_ORDER,
+                                                     byref(c_uint64(order)))
+            self.librbd.rbd_image_options_destroy(opts)
+        else:
+            ret = self.librbd.rbd_copy(self.image, dest_ioctx.io, cstr(dest_name))
         if ret < 0:
             raise make_ex(ret, 'error copying image %s to %s' % (self.name, dest_name))
 
@@ -664,12 +787,30 @@ class Image(object):
         :type name: str
         :raises: :class:`ImageExists`
         """
-        if not isinstance(name, str):
+        if not isinstance(name, str_type):
             raise TypeError('name must be a string')
-        ret = self.librbd.rbd_snap_create(self.image, c_char_p(name))
+        ret = self.librbd.rbd_snap_create(self.image, cstr(name))
         if ret != 0:
             raise make_ex(ret, 'error creating snapshot %s from %s' % (name, self.name))
 
+    def rename_snap(self, srcname, dstname):
+        """
+        Rename a snapshot of the image.
+
+        :param srcname: the src name of the snapshot
+        :type srcname: str
+        :param dstname: the dst name of the snapshot
+        :type dstname: str
+        :raises: :class:`ImageExists`
+        """
+        if not isinstance(srcname, str_type):
+            raise TypeError('src name must be a string')
+        if not isinstance(dstname, str_type):
+            raise TypeError('dst name must be a string')
+        ret = self.librbd.rbd_snap_rename(self.image, cstr(srcname), cstr(dstname))
+        if ret != 0:
+            raise make_ex(ret, 'error renaming snapshot of %s from %s to %s' % (self.name, srcname, dstname))
+
     def remove_snap(self, name):
         """
         Delete a snapshot of the image.
@@ -678,9 +819,9 @@ class Image(object):
         :type name: str
         :raises: :class:`IOError`, :class:`ImageBusy`
         """
-        if not isinstance(name, str):
+        if not isinstance(name, str_type):
             raise TypeError('name must be a string')
-        ret = self.librbd.rbd_snap_remove(self.image, c_char_p(name))
+        ret = self.librbd.rbd_snap_remove(self.image, cstr(name))
         if ret != 0:
             raise make_ex(ret, 'error removing snapshot %s from %s' % (name, self.name))
 
@@ -694,9 +835,9 @@ class Image(object):
         :type name: str
         :raises: :class:`IOError`
         """
-        if not isinstance(name, str):
+        if not isinstance(name, str_type):
             raise TypeError('name must be a string')
-        ret = self.librbd.rbd_snap_rollback(self.image, c_char_p(name))
+        ret = self.librbd.rbd_snap_rollback(self.image, cstr(name))
         if ret != 0:
             raise make_ex(ret, 'error rolling back image %s to snapshot %s' % (self.name, name))
 
@@ -709,9 +850,9 @@ class Image(object):
         :type name: str
         :raises: :class:`IOError`, :class:`ImageNotFound`
         """
-        if not isinstance(name, str):
+        if not isinstance(name, str_type):
             raise TypeError('name must be a string')
-        ret = self.librbd.rbd_snap_protect(self.image, c_char_p(name))
+        ret = self.librbd.rbd_snap_protect(self.image, cstr(name))
         if ret != 0:
             raise make_ex(ret, 'error protecting snapshot %s@%s' % (self.name, name))
 
@@ -724,9 +865,9 @@ class Image(object):
         :type name: str
         :raises: :class:`IOError`, :class:`ImageNotFound`
         """
-        if not isinstance(name, str):
+        if not isinstance(name, str_type):
             raise TypeError('name must be a string')
-        ret = self.librbd.rbd_snap_unprotect(self.image, c_char_p(name))
+        ret = self.librbd.rbd_snap_unprotect(self.image, cstr(name))
         if ret != 0:
             raise make_ex(ret, 'error unprotecting snapshot %s@%s' % (self.name, name))
 
@@ -739,10 +880,10 @@ class Image(object):
         :returns: bool - whether the snapshot is protected
         :raises: :class:`IOError`, :class:`ImageNotFound`
         """
-        if not isinstance(name, str):
+        if not isinstance(name, str_type):
             raise TypeError('name must be a string')
         is_protected = c_int()
-        ret = self.librbd.rbd_snap_is_protected(self.image, c_char_p(name),
+        ret = self.librbd.rbd_snap_is_protected(self.image, cstr(name),
                                                 byref(is_protected))
         if ret != 0:
             raise make_ex(ret, 'error checking if snapshot %s@%s is protected' % (self.name, name))
@@ -757,9 +898,9 @@ class Image(object):
         :param name: the snapshot to read from, or None to unset the snapshot
         :type name: str or None
         """
-        if name is not None and not isinstance(name, str):
+        if name is not None and not isinstance(name, str_type):
             raise TypeError('name must be a string')
-        ret = self.librbd.rbd_snap_set(self.image, c_char_p(name))
+        ret = self.librbd.rbd_snap_set(self.image, cstr(name))
         if ret != 0:
             raise make_ex(ret, 'error setting image %s to snapshot %s' % (self.name, name))
 
@@ -831,14 +972,14 @@ class Image(object):
         :raises: :class:`InvalidArgument`, :class:`IOError`,
                  :class:`ImageNotFound`
         """
-        if from_snapshot is not None and not isinstance(from_snapshot, str):
+        if from_snapshot is not None and not isinstance(from_snapshot, str_type):
             raise TypeError('from_snapshot must be a string or None')
 
         RBD_DIFF_CB = CFUNCTYPE(c_int, c_uint64, c_size_t, c_int, c_void_p)
         cb_holder = DiffIterateCB(iterate_cb)
         cb = RBD_DIFF_CB(cb_holder.callback)
         ret = self.librbd.rbd_diff_iterate2(self.image,
-                                            c_char_p(from_snapshot),
+                                            cstr(from_snapshot),
                                             c_uint64(offset),
                                             c_uint64(length),
                                             c_uint8(include_parent),
@@ -855,7 +996,7 @@ class Image(object):
         part of the write would fall outside the image.
 
         :param data: the data to be written
-        :type data: str
+        :type data: bytes
         :param offset: where to start writing data
         :type offset: int
         :param fadvise_flags: fadvise flags for this write
@@ -864,8 +1005,8 @@ class Image(object):
         :raises: :class:`IncompleteWriteError`, :class:`LogicError`,
                  :class:`InvalidArgument`, :class:`IOError`
         """
-        if not isinstance(data, str):
-            raise TypeError('data must be a string')
+        if not isinstance(data, bytes):
+            raise TypeError('data must be a byte string')
         length = len(data)
 
         if fadvise_flags == 0:
@@ -965,9 +1106,9 @@ written." % (self.name, ret, length))
                 raise make_ex(ret, 'error listing children')
         if ret == 0:
             return []
-        pools = c_pools.raw[:pools_size.value - 1].split('\0')
-        images = c_images.raw[:images_size.value - 1].split('\0')
-        return zip(pools, images)
+        pools = map(decode_cstr, c_pools.raw[:pools_size.value - 1].split(b'\0'))
+        images = map(decode_cstr, c_images.raw[:images_size.value - 1].split(b'\0'))
+        return list(zip(pools, images))
 
     def list_lockers(self):
         """
@@ -1010,13 +1151,13 @@ written." % (self.name, ret, length))
                 raise make_ex(ret, 'error listing lockers')
         if ret == 0:
             return []
-        clients = c_clients.raw[:clients_size.value - 1].split('\0')
-        cookies = c_cookies.raw[:cookies_size.value - 1].split('\0')
-        addrs = c_addrs.raw[:addrs_size.value - 1].split('\0')
+        clients = [client.decode("utf-8") for client in c_clients.raw[:clients_size.value - 1].split(b'\0')]
+        cookies = [cookie.decode("utf-8") for cookie in c_cookies.raw[:cookies_size.value - 1].split(b'\0')]
+        addrs = [addr.decode("utf-8") for addr in c_addrs.raw[:addrs_size.value - 1].split(b'\0')]
         return {
-            'tag'       : c_tag.value,
+            'tag'       : decode_cstr(c_tag),
             'exclusive' : exclusive.value == 1,
-            'lockers'   : zip(clients, cookies, addrs),
+            'lockers'   : list(zip(clients, cookies, addrs)),
             }
 
     def lock_exclusive(self, cookie):
@@ -1026,9 +1167,9 @@ written." % (self.name, ret, length))
         :raises: :class:`ImageBusy` if a different client or cookie locked it
                  :class:`ImageExists` if the same client and cookie locked it
         """
-        if not isinstance(cookie, str):
+        if not isinstance(cookie, str_type):
             raise TypeError('cookie must be a string')
-        ret = self.librbd.rbd_lock_exclusive(self.image, c_char_p(cookie))
+        ret = self.librbd.rbd_lock_exclusive(self.image, cstr(cookie))
         if ret < 0:
             raise make_ex(ret, 'error acquiring exclusive lock on image')
 
@@ -1040,12 +1181,12 @@ written." % (self.name, ret, length))
         :raises: :class:`ImageBusy` if a different client or cookie locked it
                  :class:`ImageExists` if the same client and cookie locked it
         """
-        if not isinstance(cookie, str):
+        if not isinstance(cookie, str_type):
             raise TypeError('cookie must be a string')
-        if not isinstance(tag, str):
+        if not isinstance(tag, str_type):
             raise TypeError('tag must be a string')
-        ret = self.librbd.rbd_lock_shared(self.image, c_char_p(cookie),
-                                          c_char_p(tag))
+        ret = self.librbd.rbd_lock_shared(self.image, cstr(cookie),
+                                          cstr(tag))
         if ret < 0:
             raise make_ex(ret, 'error acquiring shared lock on image')
 
@@ -1053,9 +1194,9 @@ written." % (self.name, ret, length))
         """
         Release a lock on the image that was locked by this rados client.
         """
-        if not isinstance(cookie, str):
+        if not isinstance(cookie, str_type):
             raise TypeError('cookie must be a string')
-        ret = self.librbd.rbd_unlock(self.image, c_char_p(cookie))
+        ret = self.librbd.rbd_unlock(self.image, cstr(cookie))
         if ret < 0:
             raise make_ex(ret, 'error unlocking image')
 
@@ -1063,12 +1204,12 @@ written." % (self.name, ret, length))
         """
         Release a lock held by another rados client.
         """
-        if not isinstance(client, str):
+        if not isinstance(client, str_type):
             raise TypeError('client must be a string')
-        if not isinstance(cookie, str):
+        if not isinstance(cookie, str_type):
             raise TypeError('cookie must be a string')
-        ret = self.librbd.rbd_break_lock(self.image, c_char_p(client),
-                                         c_char_p(cookie))
+        ret = self.librbd.rbd_break_lock(self.image, cstr(client),
+                                         cstr(cookie))
         if ret < 0:
             raise make_ex(ret, 'error unlocking image')
 
@@ -1082,7 +1223,7 @@ class DiffIterateCB(object):
         return 0
 
 
-class SnapIterator(object):
+class SnapIterator(Iterable):
     """
     Iterator over snapshot info for an image.
 
@@ -1110,11 +1251,11 @@ class SnapIterator(object):
                 raise make_ex(ret, 'error listing snapshots for image %s' % (image.name,))
 
     def __iter__(self):
-        for i in xrange(self.num_snaps):
+        for i in range(self.num_snaps):
             yield {
                 'id'   : self.snaps[i].id,
                 'size' : self.snaps[i].size,
-                'name' : self.snaps[i].name,
+                'name' : decode_cstr(self.snaps[i].name),
                 }
 
     def __del__(self):
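
With SnapIterator now decoding names, iterating snapshots from Python looks
roughly like the following (a sketch; 'img1' is a placeholder image name):

    import rados, rbd

    cluster = rados.Rados(conffile='/etc/ceph/ceph.conf')
    cluster.connect()
    ioctx = cluster.open_ioctx('rbd')
    image = rbd.Image(ioctx, 'img1')
    try:
        for snap in image.list_snaps():
            # each entry is a dict with 'id', 'size' and a str 'name'
            print(snap['id'], snap['name'], snap['size'])
    finally:
        image.close()
        ioctx.close()
        cluster.shutdown()
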
diff --git a/src/rbd.cc b/src/rbd.cc
deleted file mode 100755
index ff9cf40..0000000
--- a/src/rbd.cc
+++ /dev/null
@@ -1,4115 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2012 Sage Weil <sage at newdream.net> and others
- *
- * LGPL2.  See file COPYING.
- *
- */
-#include "include/int_types.h"
-
-#include "mon/MonClient.h"
-#include "common/config.h"
-
-#include "common/errno.h"
-#include "common/ceph_argparse.h"
-#include "common/strtol.h"
-#include "global/global_init.h"
-#include "common/safe_io.h"
-#include "include/krbd.h"
-#include "include/stringify.h"
-#include "include/rados/librados.hpp"
-#include "include/rbd/librbd.hpp"
-#include "include/byteorder.h"
-
-#include "include/intarith.h"
-
-#include "include/compat.h"
-#include "common/blkdev.h"
-
-#include <boost/accumulators/accumulators.hpp>
-#include <boost/accumulators/statistics/stats.hpp>
-#include <boost/accumulators/statistics/rolling_sum.hpp>
-#include <boost/assign/list_of.hpp>
-#include <boost/bind.hpp>
-#include <boost/scope_exit.hpp>
-#include <boost/scoped_ptr.hpp>
-#include <errno.h>
-#include <fcntl.h>
-#include <iostream>
-#include <memory>
-#include <sstream>
-#include <stdlib.h>
-#include <sys/types.h>
-#include <time.h>
-#include "include/memory.h"
-#include <sys/ioctl.h>
-
-#include "include/rbd_types.h"
-#include "common/TextTable.h"
-#include "include/util.h"
-
-#include "common/Formatter.h"
-#include "common/Throttle.h"
-
-#if defined(__linux__)
-#include <linux/fs.h>
-#endif
-
-#if defined(__FreeBSD__)
-#include <sys/param.h>
-#endif
-
-#define MAX_SECRET_LEN 1000
-#define MAX_POOL_NAME_SIZE 128
-
-#define RBD_DIFF_BANNER "rbd diff v1\n"
-
-static string dir_oid = RBD_DIRECTORY;
-static string dir_info_oid = RBD_INFO;
-
-bool progress = true;
-bool resize_allow_shrink = false;
-
-map<string, string> map_options; // -o / --options map
-
-#define dout_subsys ceph_subsys_rbd
-
-namespace {
-
-void aio_context_callback(librbd::completion_t completion, void *arg)
-{
-  librbd::RBD::AioCompletion *aio_completion =
-    reinterpret_cast<librbd::RBD::AioCompletion*>(completion);
-  Context *context = reinterpret_cast<Context *>(arg);
-  context->complete(aio_completion->get_return_value());
-  aio_completion->release();
-}
-
-} // anonymous namespace
-
-static std::map<uint64_t, std::string> feature_mapping =
-  boost::assign::map_list_of(
-    RBD_FEATURE_LAYERING, "layering")(
-    RBD_FEATURE_STRIPINGV2, "striping")(
-    RBD_FEATURE_EXCLUSIVE_LOCK, "exclusive-lock")(
-    RBD_FEATURE_OBJECT_MAP, "object-map")(
-    RBD_FEATURE_FAST_DIFF, "fast-diff")(
-    RBD_FEATURE_DEEP_FLATTEN, "deep-flatten");
-
-void usage()
-{
-  cout <<
-"usage: rbd [-n <auth user>] [OPTIONS] <cmd> ...\n"
-"where 'pool' is a rados pool name (default is 'rbd') and 'cmd' is one of:\n"
-"  (ls | list) [-l | --long ] [pool-name]      list rbd images\n"
-"                                              (-l includes snapshots/clones)\n"
-"  (du | disk-usage) [<image-spec> | <snap-spec>]\n"
-"                                              show disk usage stats for pool,\n"
-"                                              image or snapshot\n"
-"  info <image-spec> | <snap-spec>             show information about image size,\n"
-"                                              striping, etc.\n"
-"  create [--order <bits>] [--image-features <features>] [--image-shared]\n"
-"         --size <M/G/T> <image-spec>          create an empty image\n"
-"  clone [--order <bits>] [--image-features <features>] [--image-shared]\n"
-"         <parent-snap-spec> <child-image-spec>\n"
-"                                              clone a snapshot into a COW\n"
-"                                              child image\n"
-"  children <snap-spec>                        display children of snapshot\n"
-"  flatten <image-spec>                        fill clone with parent data\n"
-"                                              (make it independent)\n"
-"  resize --size <M/G/T> <image-spec>          resize (expand or contract) image\n"
-"  rm <image-spec>                             delete an image\n"
-"  export (<image-spec> | <snap-spec>) [<path>]\n"
-"                                              export image to file\n"
-"                                              \"-\" for stdout\n"
-"  import [--image-features <features>] [--image-shared]\n"
-"         <path> [<image-spec>]                import image from file\n"
-"                                              \"-\" for stdin\n"
-"                                              \"rbd/$(basename <path>)\" is\n"
-"                                              assumed for <image-spec> if\n"
-"                                              omitted\n"
-"  diff [--from-snap <snap-name>] [--whole-object]\n"
-"         <image-spec> | <snap-spec>           print extents that differ since\n"
-"                                              a previous snap, or image creation\n"
-"  export-diff [--from-snap <snap-name>] [--whole-object]\n"
-"         (<image-spec> | <snap-spec>) <path>  export an incremental diff to\n"
-"                                              path, or \"-\" for stdout\n"
-"  merge-diff <diff1> <diff2> <path>           merge <diff1> and <diff2> into\n"
-"                                              <path>, <diff1> could be \"-\"\n"
-"                                              for stdin, and <path> could be \"-\"\n"
-"                                              for stdout\n"
-"  import-diff <path> <image-spec>             import an incremental diff from\n"
-"                                              path or \"-\" for stdin\n"
-"  (cp | copy) (<src-image-spec> | <src-snap-spec>) <dest-image-spec>\n"
-"                                              copy src image to dest\n"
-"  (mv | rename) <src-image-spec> <dest-image-spec>\n"
-"                                              rename src image to dest\n"
-"  image-meta list <image-spec>                image metadata list keys with values\n"
-"  image-meta get <image-spec> <key>           image metadata get the value associated with the key\n"
-"  image-meta set <image-spec> <key> <value>   image metadata set key with value\n"
-"  image-meta remove <image-spec> <key>        image metadata remove the key and value associated\n"
-"  object-map rebuild <image-spec> | <snap-spec>\n"
-"                                              rebuild an invalid object map\n"
-"  snap ls <image-spec>                        dump list of image snapshots\n"
-"  snap create <snap-spec>                     create a snapshot\n"
-"  snap rollback <snap-spec>                   rollback image to snapshot\n"
-"  snap rm <snap-spec>                         deletes a snapshot\n"
-"  snap purge <image-spec>                     deletes all snapshots\n"
-"  snap protect <snap-spec>                    prevent a snapshot from being deleted\n"
-"  snap unprotect <snap-spec>                  allow a snapshot to be deleted\n"
-"  watch <image-spec>                          watch events on image\n"
-"  status <image-spec>                         show the status of this image\n"
-"  map <image-spec> | <snap-spec>              map image to a block device\n"
-"                                              using the kernel\n"
-"  unmap <image-spec> | <snap-spec> | <device> unmap a rbd device that was\n"
-"                                              mapped by the kernel\n"
-"  showmapped                                  show the rbd images mapped\n"
-"                                              by the kernel\n"
-"  feature disable <image-spec> <feature>      disable the specified image feature\n"
-"  feature enable <image-spec> <feature>       enable the specified image feature\n"
-"  lock list <image-spec>                      show locks held on an image\n"
-"  lock add <image-spec> <id> [--shared <tag>] take a lock called id on an image\n"
-"  lock remove <image-spec> <id> <locker>      release a lock on an image\n"
-"  bench-write <image-spec>                    simple write benchmark\n"
-"               --io-size <size in B/K/M/G/T>    write size\n"
-"               --io-threads <num>               ios in flight\n"
-"               --io-total <size in B/K/M/G/T>   total size to write\n"
-"               --io-pattern <seq|rand>          write pattern\n"
-"\n"
-"<image-spec> is [<pool-name>]/<image-name>,\n"
-"<snap-spec> is [<pool-name>]/<image-name>@<snap-name>,\n"
-"or you may specify individual pieces of names with -p/--pool <pool-name>,\n"
-"--image <image-name> and/or --snap <snap-name>.\n"
-"\n"
-"Other input options:\n"
-"  -p, --pool <pool-name>             source pool name\n"
-"  --dest-pool <pool-name>            destination pool name\n"
-"  --image <image-name>               image name\n"
-"  --dest <image-name>                destination image name\n"
-"  --snap <snap-name>                 snapshot name\n"
-"  --path <path-name>                 path name for import/export\n"
-"  -s, --size <size in M/G/T>         size of image for create and resize\n"
-"  --order <bits>                     the object size in bits; object size will be\n"
-"                                     (1 << order) bytes. Default is 22 (4 MB).\n"
-"  --image-format <format-number>     format to use when creating an image\n"
-"                                     format 1 is the original format\n"
-"                                     format 2 supports cloning (default)\n"
-"  --image-feature <feature>          optional format 2 feature to enable.\n"
-"                                     use multiple times to enable multiple features\n"
-"  --image-shared                     image will be used concurrently (disables\n"
-"                                     RBD exclusive lock and dependent features)\n"
-"  --stripe-unit <size in B/K/M>      size of a block of data\n"
-"  --stripe-count <num>               number of consecutive objects in a stripe\n"
-"  --id <username>                    rados user (without 'client.'prefix) to\n"
-"                                     authenticate as\n"
-"  --keyfile <path>                   file containing secret key for use with cephx\n"
-"  --keyring <path>                   file containing keyring for use with cephx\n"
-"  --shared <tag>                     take a shared (rather than exclusive) lock\n"
-"  --format <output-format>           output format (default: plain, json, xml)\n"
-"  --pretty-format                    make json or xml output more readable\n"
-"  --no-progress                      do not show progress for long-running commands\n"
-"  -o, --options <map-options>        options to use when mapping an image\n"
-"  --read-only                        set device readonly when mapping image\n"
-"  --allow-shrink                     allow shrinking of an image when resizing\n"
-"\n"
-"Supported image features:\n"
-"  ";
-
-for (std::map<uint64_t, std::string>::const_iterator it = feature_mapping.begin();
-     it != feature_mapping.end(); ++it) {
-  if (it != feature_mapping.begin()) {
-    cout << ", ";
-  }
-  cout << it->second;
-  if ((it->first & RBD_FEATURES_MUTABLE) != 0) {
-    cout << " (*)";
-  }
-  if ((it->first & g_conf->rbd_default_features) != 0) {
-    cout << " (+)";
-  }
-}
-cout << "\n\n"
-     << "  (*) supports enabling/disabling on existing images\n"
-     << "  (+) enabled by default for new images if features are not specified\n";
-}
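
The --image-format and --image-feature options map onto the librbd create
calls; from the Python binding the equivalent is roughly the following hedged
sketch (feature constants beyond layering vary by release, and the names are
placeholders):

    import rados, rbd

    cluster = rados.Rados(conffile='/etc/ceph/ceph.conf')
    cluster.connect()
    ioctx = cluster.open_ioctx('rbd')
    # format 2 image, 4 GiB, default 4 MiB objects (order 22), layering on
    rbd.RBD().create(ioctx, 'img1', 4 << 30, order=22,
                     old_format=False, features=rbd.RBD_FEATURE_LAYERING)
    ioctx.close()
    cluster.shutdown()
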
-
-static void format_bitmask(Formatter *f, const std::string &name,
-                           const std::map<uint64_t, std::string>& mapping,
-                           uint64_t bitmask)
-{
-  int count = 0;
-  std::string group_name(name + "s");
-  if (f == NULL) {
-    cout << "\t" << group_name << ": ";
-  } else {
-    f->open_array_section(group_name.c_str());
-  }
-  for (std::map<uint64_t, std::string>::const_iterator it = mapping.begin();
-       it != mapping.end(); ++it) {
-    if ((it->first & bitmask) == 0) {
-      continue;
-    }
-
-    if (f == NULL) {
-      if (count++ > 0) {
-        cout << ", ";
-      }
-      cout << it->second;
-    } else {
-      f->dump_string(name.c_str(), it->second);
-    }
-  }
-  if (f == NULL) {
-    cout << std::endl;
-  } else {
-    f->close_section();
-  }
-}
-
-static void format_features(Formatter *f, uint64_t features)
-{
-  format_bitmask(f, "feature", feature_mapping, features);
-}
-
-static void format_flags(Formatter *f, uint64_t flags)
-{
-  std::map<uint64_t, std::string> mapping = boost::assign::map_list_of(
-    RBD_FLAG_OBJECT_MAP_INVALID, "object map invalid")(
-    RBD_FLAG_FAST_DIFF_INVALID, "fast diff invalid");
-  format_bitmask(f, "flag", mapping, flags);
-}
-
-static bool decode_feature(const char* feature_name, uint64_t *feature) {
-  for (std::map<uint64_t, std::string>::const_iterator it = feature_mapping.begin();
-       it != feature_mapping.end(); ++it) {
-    if (strcmp(feature_name, it->second.c_str()) == 0) {
-      *feature = it->first;
-      return true;
-    }
-  }
-  return false;
-}
-
-struct MyProgressContext : public librbd::ProgressContext {
-  const char *operation;
-  int last_pc;
-
-  MyProgressContext(const char *o) : operation(o), last_pc(0) {
-  }
-
-  int update_progress(uint64_t offset, uint64_t total) {
-    if (progress) {
-      int pc = total ? (offset * 100ull / total) : 0;
-      if (pc != last_pc) {
-	cerr << "\r" << operation << ": "
-	  //	   << offset << " / " << total << " "
-	     << pc << "% complete...";
-	cerr.flush();
-	last_pc = pc;
-      }
-    }
-    return 0;
-  }
-  void finish() {
-    if (progress) {
-      cerr << "\r" << operation << ": 100% complete...done." << std::endl;
-    }
-  }
-  void fail() {
-    if (progress) {
-      cerr << "\r" << operation << ": " << last_pc << "% complete...failed."
-	   << std::endl;
-    }
-  }
-};
-
-static int get_outfmt(const char *output_format,
-		      bool pretty,
-		      boost::scoped_ptr<Formatter> *f)
-{
-  if (!strcmp(output_format, "json")) {
-    f->reset(new JSONFormatter(pretty));
-  } else if (!strcmp(output_format, "xml")) {
-    f->reset(new XMLFormatter(pretty));
-  } else if (strcmp(output_format, "plain")) {
-    cerr << "rbd: unknown format '" << output_format << "'" << std::endl;
-    return -EINVAL;
-  }
-
-  return 0;
-}
-
-static int do_list(librbd::RBD &rbd, librados::IoCtx& io_ctx, bool lflag,
-		   Formatter *f)
-{
-  std::vector<string> names;
-  int r = rbd.list(io_ctx, names);
-  if (r == -ENOENT)
-    r = 0;
-  if (r < 0)
-    return r;
-
-  if (!lflag) {
-    if (f)
-      f->open_array_section("images");
-    for (std::vector<string>::const_iterator i = names.begin();
-       i != names.end(); ++i) {
-       if (f)
-	 f->dump_string("name", *i);
-       else
-	 cout << *i << std::endl;
-    }
-    if (f) {
-      f->close_section();
-      f->flush(cout);
-    }
-    return 0;
-  }
-
-  TextTable tbl;
-
-  if (f) {
-    f->open_array_section("images");
-  } else {
-    tbl.define_column("NAME", TextTable::LEFT, TextTable::LEFT);
-    tbl.define_column("SIZE", TextTable::RIGHT, TextTable::RIGHT);
-    tbl.define_column("PARENT", TextTable::LEFT, TextTable::LEFT);
-    tbl.define_column("FMT", TextTable::RIGHT, TextTable::RIGHT);
-    tbl.define_column("PROT", TextTable::LEFT, TextTable::LEFT);
-    tbl.define_column("LOCK", TextTable::LEFT, TextTable::LEFT);
-  }
-
-  string pool, image, snap, parent;
-
-  for (std::vector<string>::const_iterator i = names.begin();
-       i != names.end(); ++i) {
-    librbd::image_info_t info;
-    librbd::Image im;
-
-    r = rbd.open_read_only(io_ctx, im, i->c_str(), NULL);
-    // image might disappear between rbd.list() and rbd.open(); ignore
-    // that, warn about other possible errors (EPERM, say, for opening
-    // an old-format image, because you need execute permission for the
-    // class method)
-    if (r < 0) {
-      if (r != -ENOENT) {
-	cerr << "rbd: error opening " << *i << ": " << cpp_strerror(r)
-	     << std::endl;
-      }
-      // in any event, continue to next image
-      continue;
-    }
-
-    // clear any parent info left over from a previous trip through the loop
-    parent.clear();
-    r = im.parent_info(&pool, &image, &snap);
-    if (r < 0 && r != -ENOENT)
-      return r;
-
-    bool has_parent = false;
-    if (r != -ENOENT) {
-      parent = pool + "/" + image + "@" + snap;
-      has_parent = true;
-    }
-
-    if (im.stat(info, sizeof(info)) < 0)
-      return -EINVAL;
-
-    uint8_t old_format;
-    im.old_format(&old_format);
-
-    list<librbd::locker_t> lockers;
-    bool exclusive;
-    r = im.list_lockers(&lockers, &exclusive, NULL);
-    if (r < 0)
-      return r;
-    string lockstr;
-    if (!lockers.empty()) {
-      lockstr = (exclusive) ? "excl" : "shr";
-    }
-
-    if (f) {
-      f->open_object_section("image");
-      f->dump_string("image", *i);
-      f->dump_unsigned("size", info.size);
-      if (has_parent) {
-	f->open_object_section("parent");
-	f->dump_string("pool", pool);
-	f->dump_string("image", image);
-	f->dump_string("snapshot", snap);
-	f->close_section();
-      }
-      f->dump_int("format", old_format ? 1 : 2);
-      if (!lockers.empty())
-	f->dump_string("lock_type", exclusive ? "exclusive" : "shared");
-      f->close_section();
-    } else {
-      tbl << *i
-	  << stringify(si_t(info.size))
-	  << parent
-	  << ((old_format) ? '1' : '2')
-	  << ""				// protect doesn't apply to images
-	  << lockstr
-	  << TextTable::endrow;
-    }
-
-    vector<librbd::snap_info_t> snaplist;
-    if (im.snap_list(snaplist) >= 0 && !snaplist.empty()) {
-      for (std::vector<librbd::snap_info_t>::iterator s = snaplist.begin();
-	   s != snaplist.end(); ++s) {
-	bool is_protected;
-	bool has_parent = false;
-	parent.clear();
-	im.snap_set(s->name.c_str());
-	r = im.snap_is_protected(s->name.c_str(), &is_protected);
-	if (r < 0)
-	  return r;
-	if (im.parent_info(&pool, &image, &snap) >= 0) {
-	  parent = pool + "/" + image + "@" + snap;
-	  has_parent = true;
-	}
-	if (f) {
-	  f->open_object_section("snapshot");
-	  f->dump_string("image", *i);
-	  f->dump_string("snapshot", s->name);
-	  f->dump_unsigned("size", s->size);
-	  if (has_parent) {
-	    f->open_object_section("parent");
-	    f->dump_string("pool", pool);
-	    f->dump_string("image", image);
-	    f->dump_string("snapshot", snap);
-	    f->close_section();
-	  }
-	  f->dump_int("format", old_format ? 1 : 2);
-	  f->dump_string("protected", is_protected ? "true" : "false");
-	  f->close_section();
-	} else {
-	  tbl << *i + "@" + s->name
-	      << stringify(si_t(s->size))
-	      << parent
-	      << ((old_format) ? '1' : '2')
-	      << (is_protected ? "yes" : "")
-	      << "" 			// locks don't apply to snaps
-	      << TextTable::endrow;
-	}
-      }
-    }
-  }
-  if (f) {
-    f->close_section();
-    f->flush(cout);
-  } else if (!names.empty()) {
-    cout << tbl;
-  }
-
-  return 0;
-}
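
The same listing is available programmatically; a minimal sketch using the
Python binding (the pool name 'rbd' is an assumption):

    import rados, rbd

    cluster = rados.Rados(conffile='/etc/ceph/ceph.conf')
    cluster.connect()
    ioctx = cluster.open_ioctx('rbd')
    for name in rbd.RBD().list(ioctx):   # returns [] for an empty pool
        print(name)
    ioctx.close()
    cluster.shutdown()
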
-
-static int do_create(librbd::RBD &rbd, librados::IoCtx& io_ctx,
-		     const char *imgname, uint64_t size, int *order,
-		     int format, uint64_t features,
-		     uint64_t stripe_unit, uint64_t stripe_count)
-{
-  int r;
-
-  if (format == 1) {
-    // weird striping not allowed with format 1!
-    if ((stripe_unit || stripe_count) &&
-	(stripe_unit != (1ull << *order) && stripe_count != 1)) {
-      cerr << "non-default striping not allowed with format 1; use --image-format 2"
-	   << std::endl;
-      return -EINVAL;
-    }
-    r = rbd.create(io_ctx, imgname, size, order);
-  } else {
-    r = rbd.create3(io_ctx, imgname, size, features, order,
-		    stripe_unit, stripe_count);
-  }
-  if (r < 0)
-    return r;
-  return 0;
-}
-
-static int do_clone(librbd::RBD &rbd, librados::IoCtx &p_ioctx,
-		    const char *p_name, const char *p_snapname,
-		    librados::IoCtx &c_ioctx, const char *c_name,
-		    uint64_t features, int *c_order,
-                    uint64_t stripe_unit, uint64_t stripe_count)
-{
-  if ((features & RBD_FEATURE_LAYERING) != RBD_FEATURE_LAYERING) {
-    return -EINVAL;
-  }
-
-  return rbd.clone2(p_ioctx, p_name, p_snapname, c_ioctx, c_name, features,
-		    c_order, stripe_unit, stripe_count);
-}
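
do_clone() enforces that layering is enabled before calling clone2(); the
Python equivalent is roughly the sketch below (parent and child names are
placeholders, and the parent snapshot must already be protected):

    import rados, rbd

    cluster = rados.Rados(conffile='/etc/ceph/ceph.conf')
    cluster.connect()
    ioctx = cluster.open_ioctx('rbd')
    rbd.RBD().clone(ioctx, 'parent', 'snap1', ioctx, 'child',
                    features=rbd.RBD_FEATURE_LAYERING)
    ioctx.close()
    cluster.shutdown()
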
-
-static int do_flatten(librbd::Image& image)
-{
-  MyProgressContext pc("Image flatten");
-  int r = image.flatten_with_progress(pc);
-  if (r < 0) {
-    pc.fail();
-    return r;
-  }
-  pc.finish();
-  return 0;
-}
-
-static int do_rename(librbd::RBD &rbd, librados::IoCtx& io_ctx,
-		     const char *imgname, const char *destname)
-{
-  int r = rbd.rename(io_ctx, imgname, destname);
-  if (r < 0)
-    return r;
-  return 0;
-}
-
-static int do_show_info(const char *imgname, librbd::Image& image,
-			const char *snapname, Formatter *f)
-{
-  librbd::image_info_t info;
-  string parent_pool, parent_name, parent_snapname;
-  uint8_t old_format;
-  uint64_t overlap, features, flags;
-  bool snap_protected = false;
-  int r;
-
-  r = image.stat(info, sizeof(info));
-  if (r < 0)
-    return r;
-
-  r = image.old_format(&old_format);
-  if (r < 0)
-    return r;
-
-  r = image.overlap(&overlap);
-  if (r < 0)
-    return r;
-
-  r = image.features(&features);
-  if (r < 0)
-    return r;
-
-  r = image.get_flags(&flags);
-  if (r < 0) {
-    return r;
-  }
-
-  if (snapname) {
-    r = image.snap_is_protected(snapname, &snap_protected);
-    if (r < 0)
-      return r;
-  }
-
-  char prefix[RBD_MAX_BLOCK_NAME_SIZE + 1];
-  strncpy(prefix, info.block_name_prefix, RBD_MAX_BLOCK_NAME_SIZE);
-  prefix[RBD_MAX_BLOCK_NAME_SIZE] = '\0';
-
-  if (f) {
-    f->open_object_section("image");
-    f->dump_string("name", imgname);
-    f->dump_unsigned("size", info.size);
-    f->dump_unsigned("objects", info.num_objs);
-    f->dump_int("order", info.order);
-    f->dump_unsigned("object_size", info.obj_size);
-    f->dump_string("block_name_prefix", prefix);
-    f->dump_int("format", (old_format ? 1 : 2));
-  } else {
-    cout << "rbd image '" << imgname << "':\n"
-	 << "\tsize " << prettybyte_t(info.size) << " in "
-	 << info.num_objs << " objects"
-	 << std::endl
-	 << "\torder " << info.order
-	 << " (" << prettybyte_t(info.obj_size) << " objects)"
-	 << std::endl
-	 << "\tblock_name_prefix: " << prefix
-	 << std::endl
-	 << "\tformat: " << (old_format ? "1" : "2")
-	 << std::endl;
-  }
-
-  if (!old_format) {
-    format_features(f, features);
-    format_flags(f, flags);
-  }
-
-  // snapshot info, if present
-  if (snapname) {
-    if (f) {
-      f->dump_string("protected", snap_protected ? "true" : "false");
-    } else {
-      cout << "\tprotected: " << (snap_protected ? "True" : "False")
-	   << std::endl;
-    }
-  }
-
-  // parent info, if present
-  if ((image.parent_info(&parent_pool, &parent_name, &parent_snapname) == 0) &&
-      parent_name.length() > 0) {
-    if (f) {
-      f->open_object_section("parent");
-      f->dump_string("pool", parent_pool);
-      f->dump_string("image", parent_name);
-      f->dump_string("snapshot", parent_snapname);
-      f->dump_unsigned("overlap", overlap);
-      f->close_section();
-    } else {
-      cout << "\tparent: " << parent_pool << "/" << parent_name
-	   << "@" << parent_snapname << std::endl;
-      cout << "\toverlap: " << prettybyte_t(overlap) << std::endl;
-    }
-  }
-
-  // striping info, if feature is set
-  if (features & RBD_FEATURE_STRIPINGV2) {
-    if (f) {
-      f->dump_unsigned("stripe_unit", image.get_stripe_unit());
-      f->dump_unsigned("stripe_count", image.get_stripe_count());
-    } else {
-      cout << "\tstripe unit: " << prettybyte_t(image.get_stripe_unit())
-	   << std::endl
-	   << "\tstripe count: " << image.get_stripe_count() << std::endl;
-    }
-  }
-
-  if (f) {
-    f->close_section();
-    f->flush(cout);
-  }
-
-  return 0;
-}
-
-static int do_delete(librbd::RBD &rbd, librados::IoCtx& io_ctx,
-		     const char *imgname)
-{
-  MyProgressContext pc("Removing image");
-  int r = rbd.remove_with_progress(io_ctx, imgname, pc);
-  if (r < 0) {
-    pc.fail();
-    return r;
-  }
-  pc.finish();
-  return 0;
-}
-
-static int do_resize(librbd::Image& image, uint64_t size)
-{
-  MyProgressContext pc("Resizing image");
-  int r = image.resize_with_progress(size, pc);
-  if (r < 0) {
-    pc.fail();
-    return r;
-  }
-  pc.finish();
-  return 0;
-}
-
-static int do_list_snaps(librbd::Image& image, Formatter *f)
-{
-  std::vector<librbd::snap_info_t> snaps;
-  TextTable t;
-  int r;
-
-  r = image.snap_list(snaps);
-  if (r < 0)
-    return r;
-
-  if (f) {
-    f->open_array_section("snapshots");
-  } else {
-    t.define_column("SNAPID", TextTable::RIGHT, TextTable::RIGHT);
-    t.define_column("NAME", TextTable::LEFT, TextTable::LEFT);
-    t.define_column("SIZE", TextTable::RIGHT, TextTable::RIGHT);
-  }
-
-  for (std::vector<librbd::snap_info_t>::iterator s = snaps.begin();
-       s != snaps.end(); ++s) {
-    if (f) {
-      f->open_object_section("snapshot");
-      f->dump_unsigned("id", s->id);
-      f->dump_string("name", s->name);
-      f->dump_unsigned("size", s->size);
-      f->close_section();
-    } else {
-      t << s->id << s->name << stringify(prettybyte_t(s->size))
-	<< TextTable::endrow;
-    }
-  }
-
-  if (f) {
-    f->close_section();
-    f->flush(cout);
-  } else if (snaps.size()) {
-    cout << t;
-  }
-
-  return 0;
-}
-
-static int do_add_snap(librbd::Image& image, const char *snapname)
-{
-  int r = image.snap_create(snapname);
-  if (r < 0)
-    return r;
-
-  return 0;
-}
-
-static int do_remove_snap(librbd::Image& image, const char *snapname)
-{
-  int r = image.snap_remove(snapname);
-  if (r < 0)
-    return r;
-
-  return 0;
-}
-
-static int do_rollback_snap(librbd::Image& image, const char *snapname)
-{
-  MyProgressContext pc("Rolling back to snapshot");
-  int r = image.snap_rollback_with_progress(snapname, pc);
-  if (r < 0) {
-    pc.fail();
-    return r;
-  }
-  pc.finish();
-  return 0;
-}
-
-static int do_purge_snaps(librbd::Image& image)
-{
-  MyProgressContext pc("Removing all snapshots");
-  std::vector<librbd::snap_info_t> snaps;
-  bool is_protected = false;
-  int r = image.snap_list(snaps);
-  if (r < 0) {
-    pc.fail();
-    return r;
-  } else if (0 == snaps.size()) {
-    return 0;
-  } else {
-    for (size_t i = 0; i < snaps.size(); ++i) {
-      r = image.snap_is_protected(snaps[i].name.c_str(), &is_protected);
-      if (r < 0) {
-        pc.fail();
-        return r;
-      } else if (is_protected) {
-        pc.fail();
-        cerr << "\r" << "rbd: snapshot '" << snaps[i].name.c_str()
-             << "' is protected from removal." << std::endl;
-        return -EBUSY;
-      }
-    }
-    for (size_t i = 0; i < snaps.size(); ++i) {
-      r = image.snap_remove(snaps[i].name.c_str());
-      if (r < 0) {
-        pc.fail();
-        return r;
-      }
-      pc.update_progress(i + 1, snaps.size());
-    }
-
-    pc.finish();
-    return 0;
-  }
-}
-
-static int do_protect_snap(librbd::Image& image, const char *snapname)
-{
-  int r = image.snap_protect(snapname);
-  if (r < 0)
-    return r;
-
-  return 0;
-}
-
-static int do_unprotect_snap(librbd::Image& image, const char *snapname)
-{
-  int r = image.snap_unprotect(snapname);
-  if (r < 0)
-    return r;
-
-  return 0;
-}
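
Taken together these helpers implement the snapshot lifecycle; the Python
binding exposes the same operations (a sketch with placeholder names):

    import rados, rbd

    cluster = rados.Rados(conffile='/etc/ceph/ceph.conf')
    cluster.connect()
    ioctx = cluster.open_ioctx('rbd')
    image = rbd.Image(ioctx, 'img1')
    try:
        image.create_snap('s1')          # snap create
        image.protect_snap('s1')         # snap protect (required for cloning)
        image.unprotect_snap('s1')       # snap unprotect
        image.rollback_to_snap('s1')     # snap rollback
        image.remove_snap('s1')          # snap rm
    finally:
        image.close()
        ioctx.close()
        cluster.shutdown()
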
-
-static int do_list_children(librbd::Image &image, Formatter *f)
-{
-  set<pair<string, string> > children;
-  int r;
-
-  r = image.list_children(&children);
-  if (r < 0)
-    return r;
-
-  if (f)
-    f->open_array_section("children");
-
-  for (set<pair<string, string> >::const_iterator child_it = children.begin();
-       child_it != children.end(); child_it++) {
-    if (f) {
-      f->open_object_section("child");
-      f->dump_string("pool", child_it->first);
-      f->dump_string("image", child_it->second);
-      f->close_section();
-    } else {
-      cout << child_it->first << "/" << child_it->second << std::endl;
-    }
-  }
-
-  if (f) {
-    f->close_section();
-    f->flush(cout);
-  }
-
-  return 0;
-}
-
-static int do_lock_list(librbd::Image& image, Formatter *f)
-{
-  list<librbd::locker_t> lockers;
-  bool exclusive;
-  string tag;
-  TextTable tbl;
-  int r;
-
-  r = image.list_lockers(&lockers, &exclusive, &tag);
-  if (r < 0)
-    return r;
-
-  if (f) {
-    f->open_object_section("locks");
-  } else {
-    tbl.define_column("Locker", TextTable::LEFT, TextTable::LEFT);
-    tbl.define_column("ID", TextTable::LEFT, TextTable::LEFT);
-    tbl.define_column("Address", TextTable::LEFT, TextTable::LEFT);
-  }
-
-  if (lockers.size()) {
-    bool one = (lockers.size() == 1);
-
-    if (!f) {
-      cout << "There " << (one ? "is " : "are ") << lockers.size()
-	   << (exclusive ? " exclusive" : " shared")
-	   << " lock" << (one ? "" : "s") << " on this image.\n";
-      if (!exclusive)
-	cout << "Lock tag: " << tag << "\n";
-    }
-
-    for (list<librbd::locker_t>::const_iterator it = lockers.begin();
-	 it != lockers.end(); ++it) {
-      if (f) {
-	f->open_object_section(it->cookie.c_str());
-	f->dump_string("locker", it->client);
-	f->dump_string("address", it->address);
-	f->close_section();
-      } else {
-	tbl << it->client << it->cookie << it->address << TextTable::endrow;
-      }
-    }
-    if (!f)
-      cout << tbl;
-  }
-
-  if (f) {
-    f->close_section();
-    f->flush(cout);
-  }
-  return 0;
-}
-
-static int do_lock_add(librbd::Image& image, const char *cookie,
-		       const char *tag)
-{
-  if (tag)
-    return image.lock_shared(cookie, tag);
-  else
-    return image.lock_exclusive(cookie);
-}
-
-static int do_lock_remove(librbd::Image& image, const char *client,
-			  const char *cookie)
-{
-  return image.break_lock(client, cookie);
-}
-
-static void rbd_bencher_completion(void *c, void *pc);
-
-struct rbd_bencher;
-
-struct rbd_bencher {
-  librbd::Image *image;
-  Mutex lock;
-  Cond cond;
-  int in_flight;
-
-  rbd_bencher(librbd::Image *i)
-    : image(i),
-      lock("rbd_bencher::lock"),
-      in_flight(0)
-  { }
-
-  bool start_write(int max, uint64_t off, uint64_t len, bufferlist& bl,
-		   int op_flags)
-  {
-    {
-      Mutex::Locker l(lock);
-      if (in_flight >= max)
-	return false;
-      in_flight++;
-    }
-    librbd::RBD::AioCompletion *c =
-      new librbd::RBD::AioCompletion((void *)this, rbd_bencher_completion);
-    image->aio_write2(off, len, bl, c, op_flags);
-    //cout << "start " << c << " at " << off << "~" << len << std::endl;
-    return true;
-  }
-
-  void wait_for(int max) {
-    Mutex::Locker l(lock);
-    while (in_flight > max) {
-      utime_t dur;
-      dur.set_from_double(.2);
-      cond.WaitInterval(g_ceph_context, lock, dur);
-    }
-  }
-
-};
-
-void rbd_bencher_completion(void *vc, void *pc)
-{
-  librbd::RBD::AioCompletion *c = (librbd::RBD::AioCompletion *)vc;
-  rbd_bencher *b = static_cast<rbd_bencher *>(pc);
-  //cout << "complete " << c << std::endl;
-  int ret = c->get_return_value();
-  if (ret != 0) {
-    cout << "write error: " << cpp_strerror(ret) << std::endl;
-    assert(0 == ret);
-  }
-  b->lock.Lock();
-  b->in_flight--;
-  b->cond.Signal();
-  b->lock.Unlock();
-  c->release();
-}
-
-static int do_bench_write(librbd::Image& image, uint64_t io_size,
-			  uint64_t io_threads, uint64_t io_bytes,
-			  string pattern)
-{
-  rbd_bencher b(&image);
-
-  cout << "bench-write "
-       << " io_size " << io_size
-       << " io_threads " << io_threads
-       << " bytes " << io_bytes
-       << " pattern " << pattern
-       << std::endl;
-
-  if (pattern != "rand" && pattern != "seq")
-    return -EINVAL;
-
-  srand(time(NULL) % (unsigned long) -1);
-
-  bufferptr bp(io_size);
-  memset(bp.c_str(), rand() & 0xff, io_size);
-  bufferlist bl;
-  bl.push_back(bp);
-
-  utime_t start = ceph_clock_now(NULL);
-  utime_t last;
-  unsigned ios = 0;
-
-  uint64_t size = 0;
-  image.size(&size);
-
-  vector<uint64_t> thread_offset;
-  uint64_t i;
-  uint64_t start_pos;
-
-  // randomize each thread's starting offset, used by sequential writes
-  for (i = 0; i < io_threads; i++) {
-    start_pos = (rand() % (size / io_size)) * io_size;
-    thread_offset.push_back(start_pos);
-  }
-
-  const int WINDOW_SIZE = 5;
-  typedef boost::accumulators::accumulator_set<
-    double, boost::accumulators::stats<
-      boost::accumulators::tag::rolling_sum> > RollingSum;
-
-  RollingSum time_acc(
-    boost::accumulators::tag::rolling_window::window_size = WINDOW_SIZE);
-  RollingSum ios_acc(
-    boost::accumulators::tag::rolling_window::window_size = WINDOW_SIZE);
-  RollingSum off_acc(
-    boost::accumulators::tag::rolling_window::window_size = WINDOW_SIZE);
-  uint64_t cur_ios = 0;
-  uint64_t cur_off = 0;
-
-  int op_flags;
-  if  (pattern == "rand") {
-    op_flags = LIBRADOS_OP_FLAG_FADVISE_RANDOM;
-  } else {
-    op_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL;
-  }
-
-  printf("  SEC       OPS   OPS/SEC   BYTES/SEC\n");
-  uint64_t off;
-  for (off = 0; off < io_bytes; ) {
-    b.wait_for(io_threads - 1);
-    i = 0;
-    while (i < io_threads && off < io_bytes) {
-      if (pattern == "rand") {
-        thread_offset[i] = (rand() % (size / io_size)) * io_size;
-      } else {
-        thread_offset[i] += io_size;
-        if (thread_offset[i] + io_size > size)
-          thread_offset[i] = 0;
-      }
-
-      if (!b.start_write(io_threads, thread_offset[i], io_size, bl, op_flags))
-	break;
-
-      ++i;
-      ++ios;
-      off += io_size;
-
-      ++cur_ios;
-      cur_off += io_size;
-    }
-
-    utime_t now = ceph_clock_now(NULL);
-    utime_t elapsed = now - start;
-    if (last.is_zero()) {
-      last = elapsed;
-    } else if (elapsed.sec() != last.sec()) {
-      time_acc(elapsed - last);
-      ios_acc(static_cast<double>(cur_ios));
-      off_acc(static_cast<double>(cur_off));
-      cur_ios = 0;
-      cur_off = 0;
-
-      double time_sum = boost::accumulators::rolling_sum(time_acc);
-      printf("%5d  %8d  %8.2lf  %8.2lf\n",
-             (int)elapsed,
-             (int)(ios - io_threads),
-             boost::accumulators::rolling_sum(ios_acc) / time_sum,
-             boost::accumulators::rolling_sum(off_acc) / time_sum);
-      last = elapsed;
-    }
-  }
-  b.wait_for(0);
-  int r = image.flush();
-  if (r < 0) {
-    cerr << "Error flushing data at the end: " << cpp_strerror(r) << std::endl;
-  }
-
-  utime_t now = ceph_clock_now(NULL);
-  double elapsed = now - start;
-
-  printf("elapsed: %5d  ops: %8d  ops/sec: %8.2lf  bytes/sec: %8.2lf\n",
-	 (int)elapsed, ios, (double)ios / elapsed, (double)off / elapsed);
-
-  return 0;
-}
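
do_bench_write() keeps io_threads requests in flight via AioCompletion; a far
simpler synchronous sketch of the same measurement from Python follows (the
image name, I/O size, and count are illustrative only):

    import time
    import rados, rbd

    cluster = rados.Rados(conffile='/etc/ceph/ceph.conf')
    cluster.connect()
    ioctx = cluster.open_ioctx('rbd')
    image = rbd.Image(ioctx, 'img1')
    try:
        data = b'\xff' * 4096            # io_size = 4 KiB
        start = time.time()
        for i in range(1024):            # io_total = 4 MiB, sequential
            image.write(data, i * len(data))
        image.flush()
        elapsed = time.time() - start
        print('%.2f ops/sec' % (1024 / elapsed))
    finally:
        image.close()
        ioctx.close()
        cluster.shutdown()
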
-
-class C_Export : public Context
-{
-public:
-  C_Export(SimpleThrottle &simple_throttle, librbd::Image &image,
-                   uint64_t offset, uint64_t length, int fd)
-    : m_aio_completion(
-        new librbd::RBD::AioCompletion(this, &aio_context_callback)),
-      m_throttle(simple_throttle), m_image(image), m_offset(offset),
-      m_length(length), m_fd(fd)
-  {
-  }
-
-  void send()
-  {
-    m_throttle.start_op();
-
-    int op_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL |
-		   LIBRADOS_OP_FLAG_FADVISE_NOCACHE;
-    int r = m_image.aio_read2(m_offset, m_length, m_bufferlist,
-                              m_aio_completion, op_flags);
-    if (r < 0) {
-      cerr << "rbd: error requesting read from source image" << std::endl;
-      m_aio_completion->release();
-      m_throttle.end_op(r);
-    }
-  }
-
-  virtual void finish(int r)
-  {
-    BOOST_SCOPE_EXIT((&m_throttle) (&r))
-    {
-      m_throttle.end_op(r);
-    } BOOST_SCOPE_EXIT_END
-
-    if (r < 0) {
-      cerr << "rbd: error reading from source image at offset "
-           << m_offset << ": " << cpp_strerror(r) << std::endl;
-      return;
-    }
-
-    assert(m_bufferlist.length() == static_cast<size_t>(r));
-    if (m_fd != STDOUT_FILENO) {
-      if (m_bufferlist.is_zero()) {
-        return;
-      }
-
-      uint64_t chkret = lseek64(m_fd, m_offset, SEEK_SET);
-      if (chkret != m_offset) {
-        cerr << "rbd: error seeking destination image to offset "
-             << m_offset << std::endl;
-        r = -errno;
-        return;
-      }
-    }
-
-    r = m_bufferlist.write_fd(m_fd);
-    if (r < 0) {
-      cerr << "rbd: error writing to destination image at offset "
-           << m_offset << std::endl;
-    }
-  }
-
-private:
-  librbd::RBD::AioCompletion *m_aio_completion;
-  SimpleThrottle &m_throttle;
-  librbd::Image &m_image;
-  bufferlist m_bufferlist;
-  uint64_t m_offset;
-  uint64_t m_length;
-  int m_fd;
-};
-
-static int do_export(librbd::Image& image, const char *path)
-{
-  librbd::image_info_t info;
-  int64_t r = image.stat(info, sizeof(info));
-  if (r < 0)
-    return r;
-
-  int fd;
-  int max_concurrent_ops;
-  bool to_stdout = (strcmp(path, "-") == 0);
-  if (to_stdout) {
-    fd = STDOUT_FILENO;
-    max_concurrent_ops = 1;
-  } else {
-    max_concurrent_ops = max(g_conf->rbd_concurrent_management_ops, 1);
-    fd = open(path, O_WRONLY | O_CREAT | O_EXCL, 0644);
-    if (fd < 0) {
-      return -errno;
-    }
-    posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL);
-  }
-
-  MyProgressContext pc("Exporting image");
-
-  SimpleThrottle throttle(max_concurrent_ops, false);
-  uint64_t period = image.get_stripe_count() * (1ull << info.order);
-  for (uint64_t offset = 0; offset < info.size; offset += period) {
-    if (throttle.pending_error()) {
-      break;
-    }
-
-    uint64_t length = min(period, info.size - offset);
-    C_Export *ctx = new C_Export(throttle, image, offset, length, fd);
-    ctx->send();
-
-    pc.update_progress(offset, info.size);
-  }
-
-  r = throttle.wait_for_ret();
-  if (!to_stdout) {
-    if (r >= 0) {
-      r = ftruncate(fd, info.size);
-    }
-    close(fd);
-  }
-
-  if (r < 0) {
-    pc.fail();
-  } else {
-    pc.finish();
-  }
-  return r;
-}
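
do_export() preserves sparseness by skipping all-zero buffers; a simplified
synchronous sketch of the same idea in Python (chunk size, image name, and
output path are illustrative):

    import rados, rbd

    cluster = rados.Rados(conffile='/etc/ceph/ceph.conf')
    cluster.connect()
    ioctx = cluster.open_ioctx('rbd')
    image = rbd.Image(ioctx, 'img1', read_only=True)
    try:
        size = image.size()
        chunk = 4 << 20                          # 4 MiB reads
        with open('/tmp/img1.raw', 'wb') as out:
            for off in range(0, size, chunk):
                data = image.read(off, min(chunk, size - off))
                if data.count(b'\0') != len(data):   # skip zero runs
                    out.seek(off)
                    out.write(data)
            out.truncate(size)                   # keep the file sparse
    finally:
        image.close()
        ioctx.close()
        cluster.shutdown()
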
-
-struct ExportDiffContext {
-  librbd::Image *image;
-  int fd;
-  uint64_t totalsize;
-  MyProgressContext pc;
-  OrderedThrottle throttle;
-
-  ExportDiffContext(librbd::Image *i, int f, uint64_t t, int max_ops) :
-    image(i), fd(f), totalsize(t), pc("Exporting image"),
-    throttle(max_ops, true) {
-  }
-};
-
-class C_ExportDiff : public Context {
-public:
-  C_ExportDiff(ExportDiffContext *edc, uint64_t offset, uint64_t length,
-               bool exists)
-    : m_export_diff_context(edc), m_offset(offset), m_length(length),
-      m_exists(exists) {
-  }
-
-  int send() {
-    if (m_export_diff_context->throttle.pending_error()) {
-      return m_export_diff_context->throttle.wait_for_ret();
-    }
-
-    C_OrderedThrottle *ctx = m_export_diff_context->throttle.start_op(this);
-    if (m_exists) {
-      librbd::RBD::AioCompletion *aio_completion =
-        new librbd::RBD::AioCompletion(ctx, &aio_context_callback);
-
-      int op_flags = LIBRADOS_OP_FLAG_FADVISE_NOCACHE;
-      int r = m_export_diff_context->image->aio_read2(
-        m_offset, m_length, m_read_data, aio_completion, op_flags);
-      if (r < 0) {
-        aio_completion->release();
-        ctx->complete(r);
-      }
-    } else {
-      ctx->complete(0);
-    }
-    return 0;
-  }
-
-  static int export_diff_cb(uint64_t offset, size_t length, int exists,
-                            void *arg) {
-    ExportDiffContext *edc = reinterpret_cast<ExportDiffContext *>(arg);
-
-    C_ExportDiff *context = new C_ExportDiff(edc, offset, length, exists);
-    return context->send();
-  }
-
-protected:
-  virtual void finish(int r) {
-    if (r >= 0) {
-      if (m_exists) {
-        m_exists = !m_read_data.is_zero();
-      }
-      r = write_extent(m_export_diff_context, m_offset, m_length, m_exists);
-      if (r == 0 && m_exists) {
-        r = m_read_data.write_fd(m_export_diff_context->fd);
-      }
-    }
-    m_export_diff_context->throttle.end_op(r);
-  }
-
-private:
-  ExportDiffContext *m_export_diff_context;
-  uint64_t m_offset;
-  uint64_t m_length;
-  bool m_exists;
-  bufferlist m_read_data;
-
-  static int write_extent(ExportDiffContext *edc, uint64_t offset,
-                          uint64_t length, bool exists) {
-    // extent
-    bufferlist bl;
-    __u8 tag = exists ? 'w' : 'z';
-    ::encode(tag, bl);
-    ::encode(offset, bl);
-    ::encode(length, bl);
-    int r = bl.write_fd(edc->fd);
-
-    edc->pc.update_progress(offset, edc->totalsize);
-    return r;
-  }
-};
-
-static int do_export_diff(librbd::Image& image, const char *fromsnapname,
-			  const char *endsnapname, bool whole_object,
-			  const char *path)
-{
-  int r;
-  librbd::image_info_t info;
-  int fd;
-
-  r = image.stat(info, sizeof(info));
-  if (r < 0)
-    return r;
-
-  if (strcmp(path, "-") == 0)
-    fd = 1;
-  else
-    fd = open(path, O_WRONLY | O_CREAT | O_EXCL, 0644);
-  if (fd < 0)
-    return -errno;
-
-  BOOST_SCOPE_EXIT((&r) (&fd) (&path)) {
-    close(fd);
-    if (r < 0 && fd != 1) {
-      remove(path);
-    }
-  } BOOST_SCOPE_EXIT_END
-
-  {
-    // header
-    bufferlist bl;
-    bl.append(RBD_DIFF_BANNER, strlen(RBD_DIFF_BANNER));
-
-    __u8 tag;
-    if (fromsnapname) {
-      tag = 'f';
-      ::encode(tag, bl);
-      string from(fromsnapname);
-      ::encode(from, bl);
-    }
-
-    if (endsnapname) {
-      tag = 't';
-      ::encode(tag, bl);
-      string to(endsnapname);
-      ::encode(to, bl);
-    }
-
-    tag = 's';
-    ::encode(tag, bl);
-    uint64_t endsize = info.size;
-    ::encode(endsize, bl);
-
-    r = bl.write_fd(fd);
-    if (r < 0) {
-      return r;
-    }
-  }
-
-  ExportDiffContext edc(&image, fd, info.size,
-                        g_conf->rbd_concurrent_management_ops);
-  r = image.diff_iterate2(fromsnapname, 0, info.size, true, whole_object,
-                          &C_ExportDiff::export_diff_cb, (void *)&edc);
-  if (r < 0) {
-    goto out;
-  }
-
-  r = edc.throttle.wait_for_ret();
-  if (r < 0) {
-    goto out;
-  }
-
-  {
-    __u8 tag = 'e';
-    bufferlist bl;
-    ::encode(tag, bl);
-    r = bl.write_fd(fd);
-  }
-
- out:
-  if (r < 0)
-    edc.pc.fail();
-  else
-    edc.pc.finish();
-  return r;
-}
-
-struct output_method {
-  output_method() : f(NULL), t(NULL), empty(true) {}
-  Formatter *f;
-  TextTable *t;
-  bool empty;
-};
-
-static int diff_cb(uint64_t ofs, size_t len, int exists, void *arg)
-{
-  output_method *om = static_cast<output_method *>(arg);
-  om->empty = false;
-  if (om->f) {
-    om->f->open_object_section("extent");
-    om->f->dump_unsigned("offset", ofs);
-    om->f->dump_unsigned("length", len);
-    om->f->dump_string("exists", exists ? "true" : "false");
-    om->f->close_section();
-  } else {
-    assert(om->t);
-    *(om->t) << ofs << len << (exists ? "data" : "zero") << TextTable::endrow;
-  }
-  return 0;
-}
-
-static int do_diff(librbd::Image& image, const char *fromsnapname,
-                   bool whole_object, Formatter *f)
-{
-  int r;
-  librbd::image_info_t info;
-
-  r = image.stat(info, sizeof(info));
-  if (r < 0)
-    return r;
-
-  output_method om;
-  if (f) {
-    om.f = f;
-    f->open_array_section("extents");
-  } else {
-    om.t = new TextTable();
-    om.t->define_column("Offset", TextTable::LEFT, TextTable::LEFT);
-    om.t->define_column("Length", TextTable::LEFT, TextTable::LEFT);
-    om.t->define_column("Type", TextTable::LEFT, TextTable::LEFT);
-  }
-
-  r = image.diff_iterate2(fromsnapname, 0, info.size, true, whole_object,
-                          diff_cb, &om);
-  if (f) {
-    f->close_section();
-    f->flush(cout);
-  } else {
-    if (!om.empty)
-      cout << *om.t;
-    delete om.t;
-  }
-  return r;
-}
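
The same extent walk is exposed to Python as Image.diff_iterate(); a minimal
sketch using the four-argument form of this binding (names are placeholders):

    import rados, rbd

    def show_extent(offset, length, exists):
        print(offset, length, 'data' if exists else 'zero')

    cluster = rados.Rados(conffile='/etc/ceph/ceph.conf')
    cluster.connect()
    ioctx = cluster.open_ioctx('rbd')
    image = rbd.Image(ioctx, 'img1')
    try:
        # from_snapshot=None diffs against image creation
        image.diff_iterate(0, image.size(), None, show_extent)
    finally:
        image.close()
        ioctx.close()
        cluster.shutdown()
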
-
-static const char *imgname_from_path(const char *path)
-{
-  const char *imgname;
-
-  imgname = strrchr(path, '/');
-  if (imgname)
-    imgname++;
-  else
-    imgname = path;
-
-  return imgname;
-}
-
-static void update_snap_name(char *imgname, char **snap)
-{
-  char *s;
-
-  s = strrchr(imgname, '@');
-  if (!s)
-    return;
-
-  *s = '\0';
-
-  if (!snap)
-    return;
-
-  s++;
-  if (*s)
-    *snap = s;
-}
-
-static void set_pool_image_name(const char *orig_img, char **new_pool, 
-				char **new_img, char **snap)
-{
-  const char *sep;
-
-  if (!orig_img)
-    return;
-
-  sep = strchr(orig_img, '/');
-  if (!sep) {
-    *new_img = strdup(orig_img);
-    goto done_img;
-  }
-
-  *new_pool =  strdup(orig_img);
-  sep = strchr(*new_pool, '/');
-  assert (sep);
-
-  *(char *)sep = '\0';
-  *new_img = strdup(sep + 1);
-
-done_img:
-  update_snap_name(*new_img, snap);
-}
-
-class C_Import : public Context
-{
-public:
-  C_Import(SimpleThrottle &simple_throttle, librbd::Image &image,
-           bufferlist &bl, uint64_t offset)
-    : m_throttle(simple_throttle), m_image(image),
-      m_aio_completion(
-        new librbd::RBD::AioCompletion(this, &aio_context_callback)),
-      m_bufferlist(bl), m_offset(offset)
-  {
-  }
-
-  void send()
-  {
-    m_throttle.start_op();
-
-    int op_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL |
-		   LIBRADOS_OP_FLAG_FADVISE_NOCACHE;
-    int r = m_image.aio_write2(m_offset, m_bufferlist.length(), m_bufferlist,
-			       m_aio_completion, op_flags);
-    if (r < 0) {
-      cerr << "rbd: error requesting write to destination image" << std::endl;
-      m_aio_completion->release();
-      m_throttle.end_op(r);
-    }
-  }
-
-  virtual void finish(int r)
-  {
-    if (r < 0) {
-      cerr << "rbd: error writing to destination image at offset "
-           << m_offset << ": " << cpp_strerror(r) << std::endl;
-    }
-    m_throttle.end_op(r);
-  }
-
-private:
-  SimpleThrottle &m_throttle;
-  librbd::Image &m_image;
-  librbd::RBD::AioCompletion *m_aio_completion;
-  bufferlist m_bufferlist;
-  uint64_t m_offset;
-};
-
-static int do_import(librbd::RBD &rbd, librados::IoCtx& io_ctx,
-		     const char *imgname, int *order, const char *path,
-		     int format, uint64_t features, uint64_t size,
-                     uint64_t stripe_unit, uint64_t stripe_count)
-{
-  int fd, r;
-  struct stat stat_buf;
-  MyProgressContext pc("Importing image");
-
-  assert(imgname);
-
-  // default order as usual
-  if (*order == 0)
-    *order = 22;
-
-  // try to fill whole imgblklen blocks for sparsification
-  uint64_t image_pos = 0;
-  size_t imgblklen = 1 << *order;
-  char *p = new char[imgblklen];
-  size_t reqlen = imgblklen;	// amount requested from read
-  ssize_t readlen;		// amount received from one read
-  size_t blklen = 0;		// amount accumulated from reads to fill blk
-  librbd::Image image;
-
-  boost::scoped_ptr<SimpleThrottle> throttle;
-  bool from_stdin = !strcmp(path, "-");
-  if (from_stdin) {
-    throttle.reset(new SimpleThrottle(1, false));
-    fd = 0;
-    size = 1ULL << *order;
-  } else {
-    throttle.reset(new SimpleThrottle(
-      max(g_conf->rbd_concurrent_management_ops, 1), false));
-    if ((fd = open(path, O_RDONLY)) < 0) {
-      r = -errno;
-      cerr << "rbd: error opening " << path << std::endl;
-      goto done2;
-    }
-
-    if ((fstat(fd, &stat_buf)) < 0) {
-      r = -errno;
-      cerr << "rbd: stat error " << path << std::endl;
-      goto done;
-    }
-    if (S_ISDIR(stat_buf.st_mode)) {
-      r = -EISDIR;
-      cerr << "rbd: cannot import a directory" << std::endl;
-      goto done;
-    }
-    if (stat_buf.st_size)
-      size = (uint64_t)stat_buf.st_size;
-
-    if (!size) {
-      int64_t bdev_size = 0;
-      r = get_block_device_size(fd, &bdev_size);
-      if (r < 0) {
-	cerr << "rbd: unable to get size of file/block device" << std::endl;
-	goto done;
-      }
-      assert(bdev_size >= 0);
-      size = (uint64_t) bdev_size;
-    }
-
-    posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL);
-  }
-  r = do_create(rbd, io_ctx, imgname, size, order, format, features,
-                stripe_unit, stripe_count);
-  if (r < 0) {
-    cerr << "rbd: image creation failed" << std::endl;
-    goto done;
-  }
-  r = rbd.open(io_ctx, image, imgname);
-  if (r < 0) {
-    cerr << "rbd: failed to open image" << std::endl;
-    goto done;
-  }
-
-  // loop body handles 0 return, as we may have a block to flush
-  while ((readlen = ::read(fd, p + blklen, reqlen)) >= 0) {
-    if (throttle->pending_error()) {
-      break;
-    }
-
-    blklen += readlen;
-    // if read was short, try again to fill the block before writing
-    if (readlen && ((size_t)readlen < reqlen)) {
-      reqlen -= readlen;
-      continue;
-    }
-    if (!from_stdin)
-      pc.update_progress(image_pos, size);
-
-    bufferlist bl(blklen);
-    bl.append(p, blklen);
-    // resize output image by binary expansion as we go for stdin
-    if (from_stdin && (image_pos + (size_t)blklen) > size) {
-      size *= 2;
-      r = image.resize(size);
-      if (r < 0) {
-	cerr << "rbd: can't resize image during import" << std::endl;
-	goto done;
-      }
-    }
-
-    // write as much as we got; perhaps less than imgblklen
-    // but skip writing zeros to create sparse images
-    if (!bl.is_zero()) {
-      C_Import *ctx = new C_Import(*throttle, image, bl, image_pos);
-      ctx->send();
-    }
-
-    // done with whole block, whether written or not
-    image_pos += blklen;
-    // if read had returned 0, we're at EOF and should quit
-    if (readlen == 0)
-      break;
-    blklen = 0;
-    reqlen = imgblklen;
-  }
-  r = throttle->wait_for_ret();
-  if (r < 0) {
-    goto done;
-  }
-
-  if (from_stdin) {
-    r = image.resize(image_pos);
-    if (r < 0) {
-      cerr << "rbd: final image resize failed" << std::endl;
-      goto done;
-    }
-  }
-
-  r = image.close();
-
- done:
-  if (!from_stdin) {
-    if (r < 0)
-      pc.fail();
-    else
-      pc.finish();
-    close(fd);
-  }
- done2:
-  delete[] p;
-  return r;
-}
-
-static int read_string(int fd, unsigned max, string *out)
-{
-  char buf[4];
-
-  int r = safe_read_exact(fd, buf, 4);
-  if (r < 0)
-    return r;
-
-  bufferlist bl;
-  bl.append(buf, 4);
-  bufferlist::iterator p = bl.begin();
-  uint32_t len;
-  ::decode(len, p);
-  if (len > max)
-    return -EINVAL;
-
-  char sbuf[len];
-  r = safe_read_exact(fd, sbuf, len);
-  if (r < 0)
-    return r;
-  out->assign(sbuf, len);
-  return len;
-}
-
-static int do_import_diff(librbd::Image &image, const char *path)
-{
-  int fd, r;
-  struct stat stat_buf;
-  MyProgressContext pc("Importing image diff");
-  uint64_t size = 0;
-  uint64_t off = 0;
-  string from, to;
-
-  bool from_stdin = !strcmp(path, "-");
-  if (from_stdin) {
-    fd = 0;
-  } else {
-    fd = open(path, O_RDONLY);
-    if (fd < 0) {
-      r = -errno;
-      cerr << "rbd: error opening " << path << std::endl;
-      return r;
-    }
-    r = ::fstat(fd, &stat_buf);
-    if (r < 0)
-      goto done;
-    size = (uint64_t)stat_buf.st_size;
-  }
-
-  char buf[strlen(RBD_DIFF_BANNER) + 1];
-  r = safe_read_exact(fd, buf, strlen(RBD_DIFF_BANNER));
-  if (r < 0)
-    goto done;
-  buf[strlen(RBD_DIFF_BANNER)] = '\0';
-  if (strcmp(buf, RBD_DIFF_BANNER)) {
-    cerr << "invalid banner '" << buf << "', expected '" << RBD_DIFF_BANNER << "'" << std::endl;
-    r = -EINVAL;
-    goto done;
-  }
-
-  while (true) {
-    __u8 tag;
-    r = safe_read_exact(fd, &tag, 1);
-    if (r < 0) {
-      goto done;
-    }
-
-    if (tag == 'e') {
-      dout(2) << " end diff" << dendl;
-      break;
-    } else if (tag == 'f') {
-      r = read_string(fd, 4096, &from);   // 4k limit to make sure we don't get a garbage string
-      if (r < 0)
-	goto done;
-      dout(2) << " from snap " << from << dendl;
-
-      if (!image.snap_exists(from.c_str())) {
-	cerr << "start snapshot '" << from << "' does not exist in the image, aborting" << std::endl;
-	r = -EINVAL;
-	goto done;
-      }
-    }
-    else if (tag == 't') {
-      r = read_string(fd, 4096, &to);   // 4k limit to make sure we don't get a garbage string
-      if (r < 0)
-	goto done;
-      dout(2) << "   to snap " << to << dendl;
-
-      // verify this snap isn't already present
-      if (image.snap_exists(to.c_str())) {
-	cerr << "end snapshot '" << to << "' already exists, aborting" << std::endl;
-	r = -EEXIST;
-	goto done;
-      }
-    } else if (tag == 's') {
-      uint64_t end_size;
-      char buf[8];
-      r = safe_read_exact(fd, buf, 8);
-      if (r < 0)
-	goto done;
-      bufferlist bl;
-      bl.append(buf, 8);
-      bufferlist::iterator p = bl.begin();
-      ::decode(end_size, p);
-      uint64_t cur_size;
-      image.size(&cur_size);
-      if (cur_size != end_size) {
-	dout(2) << "resize " << cur_size << " -> " << end_size << dendl;
-	image.resize(end_size);
-      } else {
-	dout(2) << "size " << end_size << " (no change)" << dendl;
-      }
-      if (from_stdin)
-	size = end_size;
-    } else if (tag == 'w' || tag == 'z') {
-      uint64_t len;
-      char buf[16];
-      r = safe_read_exact(fd, buf, 16);
-      if (r < 0)
-	goto done;
-      bufferlist bl;
-      bl.append(buf, 16);
-      bufferlist::iterator p = bl.begin();
-      ::decode(off, p);
-      ::decode(len, p);
-
-      if (tag == 'w') {
-	bufferptr bp = buffer::create(len);
-	r = safe_read_exact(fd, bp.c_str(), len);
-	if (r < 0)
-	  goto done;
-	bufferlist data;
-	data.append(bp);
-	dout(2) << " write " << off << "~" << len << dendl;
-	image.write2(off, len, data, LIBRADOS_OP_FLAG_FADVISE_NOCACHE);
-      } else {
-	dout(2) << " zero " << off << "~" << len << dendl;
-	image.discard(off, len);
-      }
-    } else {
-      cerr << "unrecognized tag byte " << (int)tag << " in stream; aborting" << std::endl;
-      r = -EINVAL;
-      goto done;
-    }
-    if (!from_stdin) {
-      // progress through input
-      uint64_t off = lseek64(fd, 0, SEEK_CUR);
-      pc.update_progress(off, size);
-    } else if (size) {
-      // progress through image offsets.  this may jitter if blocks
-      // aren't in order, but it is better than nothing.
-      pc.update_progress(off, size);
-    }
-  }
-
-  // take final snap
-  if (to.length()) {
-    dout(2) << " create end snap " << to << dendl;
-    r = image.snap_create(to.c_str());
-  }
-
- done:
-  if (r < 0)
-    pc.fail();
-  else
-    pc.finish();
-  if (!from_stdin)
-    close(fd);
-  return r;
-}
-
-static int parse_diff_header(int fd, __u8 *tag, string *from, string *to, uint64_t *size)
-{
-  int r;
-
-  {//header
-    char buf[strlen(RBD_DIFF_BANNER) + 1];
-    r = safe_read_exact(fd, buf, strlen(RBD_DIFF_BANNER));
-    if (r < 0)
-      return r;
-
-    buf[strlen(RBD_DIFF_BANNER)] = '\0';
-    if (strcmp(buf, RBD_DIFF_BANNER)) {
-      cerr << "invalid banner '" << buf << "', expected '" << RBD_DIFF_BANNER << "'" << std::endl;
-      return -EINVAL;
-    }
-  }
-
-  while (true) {
-    r = safe_read_exact(fd, tag, 1);
-    if (r < 0)
-      return r;
-
-    if (*tag == 'f') {
-      r = read_string(fd, 4096, from);   // 4k limit to make sure we don't get a garbage string
-      if (r < 0)
-        return r;
-      dout(2) << " from snap " << *from << dendl;
-    } else if (*tag == 't') {
-      r = read_string(fd, 4096, to);   // 4k limit to make sure we don't get a garbage string
-      if (r < 0)
-        return r;
-      dout(2) << " to snap " << *to << dendl;
-    } else if (*tag == 's') {
-      char buf[8];
-      r = safe_read_exact(fd, buf, 8);
-      if (r < 0)
-        return r;
-
-      bufferlist bl;
-      bl.append(buf, 8);
-      bufferlist::iterator p = bl.begin();
-      ::decode(*size, p);
-    } else {
-      break;
-    }
-  }
-
-  return 0;
-}
-
-static int parse_diff_body(int fd, __u8 *tag, uint64_t *offset, uint64_t *length)
-{
-  int r;
-
-  if (!(*tag)) {
-    r = safe_read_exact(fd, tag, 1);
-    if (r < 0)
-      return r;
-  }
-
-  if (*tag == 'e') {
-    *offset = 0;
-    *length = 0;
-    return 0;
-  }
-
-  if (*tag != 'w' && *tag != 'z')
-    return -ENOTSUP;
-
-  char buf[16];
-  r = safe_read_exact(fd, buf, 16);
-  if (r < 0)
-    return r;
-
-  bufferlist bl;
-  bl.append(buf, 16);
-  bufferlist::iterator p = bl.begin();
-  ::decode(*offset, p);
-  ::decode(*length, p);
-
-  if (!(*length))
-    return -ENOTSUP;
-
-  return 0;
-}
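
As the parsers above show, the rbd diff v1 stream is a banner followed by
tagged records: 'f'/'t' carry a length-prefixed snapshot name, 's' an 8-byte
size, 'w'/'z' an 8-byte offset and length (with 'w' followed by the data),
and 'e' ends the stream; the integers are little-endian. A standalone Python
sketch of this framing (illustrative only, not a replacement for the C++
parser):

    import struct

    def read_diff_records(f):
        assert f.read(12) == b'rbd diff v1\n'      # banner
        while True:
            tag = f.read(1)
            if tag in (b'f', b't'):                # from/to snapshot name
                (n,) = struct.unpack('<I', f.read(4))
                yield tag, f.read(n)
            elif tag == b's':                      # image size
                yield tag, struct.unpack('<Q', f.read(8))[0]
            elif tag in (b'w', b'z'):              # data / zero extent
                off, length = struct.unpack('<QQ', f.read(16))
                data = f.read(length) if tag == b'w' else None
                yield tag, (off, length, data)
            elif tag == b'e':                      # end of diff
                return
            else:
                raise ValueError('unknown tag %r' % tag)
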
-
-/*
- * fd: the diff file to read from
- * pd: the diff file to be written into
- */
-static int accept_diff_body(int fd, int pd, __u8 tag, uint64_t offset, uint64_t length)
-{
-  if (tag == 'e')
-    return 0;
-
-  bufferlist bl;
-  ::encode(tag, bl);
-  ::encode(offset, bl);
-  ::encode(length, bl);
-  int r;
-  r = bl.write_fd(pd);
-  if (r < 0)
-    return r;
-
-  if (tag == 'w') {
-    bufferptr bp = buffer::create(length);
-    r = safe_read_exact(fd, bp.c_str(), length);
-    if (r < 0)
-      return r;
-    bufferlist data;
-    data.append(bp);
-    r = data.write_fd(pd);
-    if (r < 0)
-      return r;
-  }
-
-  return 0;
-}
-
-/*
- * Merge two diff files into a single one.
- * Note: it does not attempt the merge if either of the source diff
- * files is stripped, since that would complicate the process and is
- * rarely needed.
- */
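-/*
- * Worked example of the interval logic in the loop below (hypothetical
- * offsets): the first diff carries a record covering [0,8), the second
- * a record covering [4,12).
- *   1. [0,4) exists only in the first diff, so that slice is copied to
- *      the output via accept_diff_body().
- *   2. [4,8) overlaps; the first diff's bytes there are skipped, since
- *      the second (newer) diff supersedes them.
- *   3. The second diff's record [4,12) is then written out in full.
- */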
-static int do_merge_diff(const char *first, const char *second, const char *path)
-{
-  MyProgressContext pc("Merging image diff");
-  int fd = -1, sd = -1, pd = -1, r;
-
-  string f_from, f_to;
-  string s_from, s_to;
-  uint64_t f_size, s_size, pc_size;
-
-  __u8 f_tag = 0, s_tag = 0;
-  uint64_t f_off = 0, f_len = 0;
-  uint64_t s_off = 0, s_len = 0;
-  bool f_end = false, s_end = false;
-
-  bool first_stdin = !strcmp(first, "-");
-  if (first_stdin) {
-    fd = 0;
-  } else {
-    fd = open(first, O_RDONLY);
-    if (fd < 0) {
-      r = -errno;
-      cerr << "rbd: error opening " << first << std::endl;
-      goto done;
-    }
-  }
-
-  sd = open(second, O_RDONLY);
-  if (sd < 0) {
-    r = -errno;
-    cerr << "rbd: error opening " << second << std::endl;
-    goto done;
-  }
-
-  if (strcmp(path, "-") == 0) {
-    pd = 1;
-  } else {
-    pd = open(path, O_WRONLY | O_CREAT | O_EXCL, 0644);
-    if (pd < 0) {
-      r = -errno;
-      cerr << "rbd: error create " << path << std::endl;
-      goto done;
-    }
-  }
-
-  // We only handle streams of the form
-  //   'banner, [f-tag], [t-tag], s-tag, [w/z-tag]*, e-tag',
-  // and the (offset, length) pairs in the w/z records must be in
-  // ascending order.
-
-  r = parse_diff_header(fd, &f_tag, &f_from, &f_to, &f_size);
-  if (r < 0) {
-    cerr << "rbd: failed to parse first diff header" << std::endl;
-    goto done;
-  }
-
-  r = parse_diff_header(sd, &s_tag, &s_from, &s_to, &s_size);
-  if (r < 0) {
-    cerr << "rbd: failed to parse second diff header" << std::endl;
-    goto done;
-  }
-
-  if (f_to != s_from) {
-    r = -EINVAL;
-    cerr << "The first TO snapshot must be equal with the second FROM snapshot, aborting" << std::endl;
-    goto done;
-  }
-
-  {
-    // header
-    bufferlist bl;
-    bl.append(RBD_DIFF_BANNER, strlen(RBD_DIFF_BANNER));
-
-    __u8 tag;
-    if (f_from.size()) {
-      tag = 'f';
-      ::encode(tag, bl);
-      ::encode(f_from, bl);
-    }
-
-    if (s_to.size()) {
-      tag = 't';
-      ::encode(tag, bl);
-      ::encode(s_to, bl);
-    }
-
-    tag = 's';
-    ::encode(tag, bl);
-    ::encode(s_size, bl);
-
-    r = bl.write_fd(pd);
-    if (r < 0) {
-      cerr << "rbd: failed to write merged diff header" << std::endl;
-      goto done;
-    }
-  }
-
-  if (f_size > s_size)
-    pc_size = f_size << 1;
-  else
-    pc_size = s_size << 1;
-
-  // data block
-  while (!f_end || !s_end) {
-    // progress through input
-    pc.update_progress(f_off + s_off, pc_size);
-
-    if (!f_end && !f_len) {
-      uint64_t last_off = f_off;
-
-      r = parse_diff_body(fd, &f_tag, &f_off, &f_len);
-      dout(2) << "first diff data chunk: tag=" << f_tag << ", "
-              << "off=" << f_off << ", "
-              << "len=" << f_len << dendl;
-      if (r < 0) {
-        cerr << "rbd: failed to read first diff data chunk header" << std::endl;
-        goto done;
-      }
-
-      if (f_tag == 'e') {
-        f_end = true;
-        f_tag = 'z';
-        f_off = f_size;
-        if (f_size < s_size)
-          f_len = s_size - f_size;
-        else
-          f_len = 0;
-      }
-
-      if (last_off > f_off) {
-        r = -ENOTSUP;
-        cerr << "rbd: out-of-order offset from first diff ("
-             << last_off << " > " << f_off << ")" << std::endl;
-        goto done;
-      }
-    }
-
-    if (!s_end && !s_len) {
-      uint64_t last_off = s_off;
-
-      r = parse_diff_body(sd, &s_tag, &s_off, &s_len);
-      dout(2) << "second diff data chunk: tag=" << f_tag << ", "
-              << "off=" << f_off << ", "
-              << "len=" << f_len << dendl;
-      if (r < 0) {
-        cerr << "rbd: failed to read second diff data chunk header"
-             << std::endl;
-        goto done;
-      }
-
-      if (s_tag == 'e') {
-        s_end = true;
-        s_off = s_size;
-        if (s_size < f_size)
-          s_len = f_size - s_size;
-        else
-          s_len = 0;
-      }
-
-      if (last_off > s_off) {
-        r = -ENOTSUP;
-        cerr << "rbd: out-of-order offset from second diff ("
-             << last_off << " > " << s_off << ")" << std::endl;
-        goto done;
-      }
-    }
-
-    if (f_off < s_off && f_len) {
-      uint64_t delta = s_off - f_off;
-      if (delta > f_len)
-        delta = f_len;
-      r = accept_diff_body(fd, pd, f_tag, f_off, delta);
-      f_off += delta;
-      f_len -= delta;
-
-      if (!f_len) {
-        f_tag = 0;
-        continue;
-      }
-    }
-    assert(f_off >= s_off);
-
-    if (f_off < s_off + s_len && f_len) {
-      uint64_t delta = s_off + s_len - f_off;
-      if (delta > f_len)
-        delta = f_len;
-      if (f_tag == 'w') {
-        if (first_stdin) {
-          bufferptr bp = buffer::create(delta);
-          r = safe_read_exact(fd, bp.c_str(), delta);
-        } else {
-          r = lseek(fd, delta, SEEK_CUR);
-        }
-        if (r < 0) {
-          cerr << "rbd: failed to skip first diff data" << std::endl;
-          goto done;
-        }
-      }
-      f_off += delta;
-      f_len -= delta;
-
-      if (!f_len) {
-        f_tag = 0;
-        continue;
-      }
-    }
-    assert(f_off >= s_off + s_len);
-
-    if (s_len) {
-      r = accept_diff_body(sd, pd, s_tag, s_off, s_len);
-      s_off += s_len;
-      s_len = 0;
-      s_tag = 0;
-    } else
-      assert(f_end && s_end);
-    continue;
-  }
-
-  { // tail
-    __u8 tag = 'e';
-    bufferlist bl;
-    ::encode(tag, bl);
-    r = bl.write_fd(pd);
-  }
-
-done:
-  if (pd > 2)
-    close(pd);
-  if (sd > 2)
-    close(sd);
-  if (fd > 2)
-    close(fd);
-
-  if (r < 0) {
-    pc.fail();
-    if (pd > 2)
-      unlink(path);
-  } else
-    pc.finish();
-
-  return r;
-}
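-/*
- * Usage sketch (hypothetical file names): per the fd handling above,
- * '-' means stdin for the first diff and stdout for the output path:
- *
- *   rbd merge-diff first.diff second.diff merged.diff
- */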
-
-static int do_metadata_list(librbd::Image& image, Formatter *f)
-{
-  map<string, bufferlist> pairs;
-  int r;
-  TextTable tbl;
-
-  r = image.metadata_list("", 0, &pairs);
-  if (r < 0) {
-    cerr << "failed to list metadata of image : " << cpp_strerror(r) << std::endl;
-    return r;
-  }
-
-  if (f) {
-    f->open_object_section("metadatas");
-  } else {
-    tbl.define_column("Key", TextTable::LEFT, TextTable::LEFT);
-    tbl.define_column("Value", TextTable::LEFT, TextTable::LEFT);
-  }
-
-  if (!pairs.empty()) {
-    bool one = (pairs.size() == 1);
-
-    if (!f) {
-      cout << "There " << (one ? "is " : "are ") << pairs.size()
-           << " metadata" << (one ? "" : "s") << " on this image.\n";
-    }
-
-    for (map<string, bufferlist>::iterator it = pairs.begin();
-         it != pairs.end(); ++it) {
-      string val(it->second.c_str(), it->second.length());
-      if (f) {
-        f->dump_string(it->first.c_str(), val.c_str());
-      } else {
-        tbl << it->first << val.c_str() << TextTable::endrow;
-      }
-    }
-    if (!f)
-      cout << tbl;
-  }
-
-  if (f) {
-    f->close_section();
-    f->flush(cout);
-  }
-  return 0;
-}
-
-static int do_metadata_set(librbd::Image& image, const char *key,
-                          const char *value)
-{
-  int r = image.metadata_set(key, value);
-  if (r < 0) {
-    cerr << "failed to set metadata " << key << " of image : " << cpp_strerror(r) << std::endl;
-  }
-  return r;
-}
-
-static int do_metadata_remove(librbd::Image& image, const char *key)
-{
-  int r = image.metadata_remove(key);
-  if (r < 0) {
-    cerr << "failed to remove metadata " << key << " of image : " << cpp_strerror(r) << std::endl;
-  }
-  return r;
-}
-
-static int do_metadata_get(librbd::Image& image, const char *key)
-{
-  string s;
-  int r = image.metadata_get(key, &s);
-  if (r < 0) {
-    cerr << "failed to get metadata " << key << " of image : " << cpp_strerror(r) << std::endl;
-    return r;
-  }
-  cout << s << std::endl;
-  return r;
-}
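-/*
- * The four helpers above back the 'image-meta' subcommands (see
- * COMMAND_TYPE_METADATA below); a usage sketch with hypothetical
- * image and key names:
- *
- *   rbd image-meta set rbd/img somekey somevalue
- *   rbd image-meta get rbd/img somekey
- *   rbd image-meta list rbd/img
- *   rbd image-meta remove rbd/img somekey
- */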
-
-static int do_copy(librbd::Image &src, librados::IoCtx& dest_pp,
-		   const char *destname)
-{
-  MyProgressContext pc("Image copy");
-  int r = src.copy_with_progress(dest_pp, destname, pc);
-  if (r < 0){
-    pc.fail();
-    return r;
-  }
-  pc.finish();
-  return 0;
-}
-
-class RbdWatchCtx : public librados::WatchCtx2 {
-public:
-  RbdWatchCtx(librados::IoCtx& io_ctx, const char *image_name,
-	      std::string header_oid)
-    : m_io_ctx(io_ctx), m_image_name(image_name), m_header_oid(header_oid)
-  {
-  }
-
-  virtual ~RbdWatchCtx() {}
-
-  virtual void handle_notify(uint64_t notify_id,
-                             uint64_t cookie,
-                             uint64_t notifier_id,
-                             bufferlist& bl) {
-    cout << m_image_name << " received notification: notify_id=" << notify_id
-	 << ", cookie=" << cookie << ", notifier_id=" << notifier_id
-	 << ", bl.length=" << bl.length() << std::endl;
-    bufferlist reply;
-    m_io_ctx.notify_ack(m_header_oid, notify_id, cookie, reply);
-  }
-  
-  virtual void handle_error(uint64_t cookie, int err) {
-    cerr << m_image_name << " received error: cookie=" << cookie << ", err="
-	 << cpp_strerror(err) << std::endl;
-  }
-private:
-  librados::IoCtx m_io_ctx;
-  const char *m_image_name;
-  string m_header_oid;
-};
-
-static int do_watch(librados::IoCtx& pp, librbd::Image &image,
-		    const char *imgname)
-{
-  uint8_t old_format;
-  int r = image.old_format(&old_format);
-  if (r < 0) {
-    cerr << "failed to query format" << std::endl;
-    return r;
-  }
-
-  string header_oid;
-  if (old_format != 0) {
-    header_oid = string(imgname) + RBD_SUFFIX;
-  } else {
-    librbd::image_info_t info;
-    r = image.stat(info, sizeof(info));
-    if (r < 0) {
-      cerr << "failed to stat image" << std::endl;
-      return r;
-    }
-
-    char prefix[RBD_MAX_BLOCK_NAME_SIZE + 1];
-    strncpy(prefix, info.block_name_prefix, RBD_MAX_BLOCK_NAME_SIZE);
-    prefix[RBD_MAX_BLOCK_NAME_SIZE] = '\0';
-
-    string image_id(prefix + strlen(RBD_DATA_PREFIX));
-    header_oid = RBD_HEADER_PREFIX + image_id;
-  }
-
-  uint64_t cookie;
-  RbdWatchCtx ctx(pp, imgname, header_oid);
-  r = pp.watch2(header_oid, &cookie, &ctx);
-  if (r < 0) {
-    cerr << "rbd: watch failed" << std::endl;
-    return r;
-  }
-
-  cout << "press enter to exit..." << std::endl;
-  getchar();
-
-  r = pp.unwatch2(cookie);
-  if (r < 0) {
-    cerr << "rbd: unwatch failed" << std::endl;
-    return r;
-  }
-  return 0;
-}
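-/*
- * Header object naming used above (and again in do_show_status below),
- * e.g. for an image named "img", assuming the usual values of
- * RBD_SUFFIX, RBD_HEADER_PREFIX and RBD_DATA_PREFIX:
- *   old (v1) format: "img.rbd"
- *   new (v2) format: "rbd_header.<id>", where <id> is recovered from
- *   the block name prefix "rbd_data.<id>"
- */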
-
-static int do_show_status(librados::IoCtx &io_ctx, librbd::Image &image,
-                          const char *imgname, Formatter *f)
-{
-  librbd::image_info_t info;
-  uint8_t old_format;
-  int r;
-  string header_oid;
-  std::list<obj_watch_t> watchers;
-
-  r = image.old_format(&old_format);
-  if (r < 0)
-    return r;
-
-  if (old_format) {
-    header_oid = imgname;
-    header_oid += RBD_SUFFIX;
-  } else {
-    r = image.stat(info, sizeof(info));
-    if (r < 0)
-      return r;
-
-    char prefix[RBD_MAX_BLOCK_NAME_SIZE + 1];
-    strncpy(prefix, info.block_name_prefix, RBD_MAX_BLOCK_NAME_SIZE);
-    prefix[RBD_MAX_BLOCK_NAME_SIZE] = '\0';
-
-    header_oid = RBD_HEADER_PREFIX;
-    header_oid.append(prefix + strlen(RBD_DATA_PREFIX));
-  }
-
-  r = io_ctx.list_watchers(header_oid, &watchers);
-  if (r < 0)
-    return r;
-
-  if (f)
-    f->open_object_section("status");
-
-  if (f) {
-    f->open_object_section("watchers");
-    for (std::list<obj_watch_t>::iterator i = watchers.begin(); i != watchers.end(); ++i) {
-      f->open_object_section("watcher");
-      f->dump_string("address", i->addr);
-      f->dump_unsigned("client", i->watcher_id);
-      f->dump_unsigned("cookie", i->cookie);
-      f->close_section();
-    }
-    f->close_section();
-  } else {
-    if (watchers.size()) {
-      cout << "Watchers:" << std::endl;
-      for (std::list<obj_watch_t>::iterator i = watchers.begin(); i != watchers.end(); ++i) {
-        cout << "\twatcher=" << i->addr << " client." << i->watcher_id << " cookie=" << i->cookie << std::endl;
-      }
-    } else {
-      cout << "Watchers: none" << std::endl;
-    }
-  }
-
-  if (f) {
-    f->close_section();
-    f->flush(cout);
-  }
-
-  return 0;
-}
-
-static int do_object_map_rebuild(librbd::Image &image)
-{
-  MyProgressContext pc("Object Map Rebuild");
-  int r = image.rebuild_object_map(pc);
-  if (r < 0) {
-    pc.fail();
-    return r;
-  }
-  pc.finish();
-  return 0;
-}
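-// Usage sketch (hypothetical names): `rbd object-map rebuild rbd/img`
-// or `rbd object-map rebuild rbd/img@snap`; see OPT_OBJECT_MAP_REBUILD
-// below for the argument handling.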
-
-static int do_kernel_map(const char *poolname, const char *imgname,
-			 const char *snapname)
-{
-  struct krbd_ctx *krbd;
-  ostringstream oss;
-  char *devnode;
-  int r;
-
-  r = krbd_create_from_context(g_ceph_context, &krbd);
-  if (r < 0)
-    return r;
-
-  for (map<string, string>::iterator it = map_options.begin();
-       it != map_options.end(); ) {
-    // for compatibility with < 3.7 kernels, assume that rw is on by
-    // default and omit it even if it was specified by the user
-    // (see ceph.git commit fb0f1986449b)
-    if (it->first == "rw" && it->second == "rw") {
-      map_options.erase(it++);
-    } else {
-      if (it != map_options.begin())
-        oss << ",";
-      oss << it->second;
-      ++it;
-    }
-  }
-
-  r = krbd_map(krbd, poolname, imgname, snapname, oss.str().c_str(), &devnode);
-  if (r < 0)
-    goto out;
-
-  cout << devnode << std::endl;
-
-  free(devnode);
-out:
-  krbd_destroy(krbd);
-  return r;
-}
-
-static int do_kernel_showmapped(Formatter *f)
-{
-  struct krbd_ctx *krbd;
-  int r;
-
-  r = krbd_create_from_context(g_ceph_context, &krbd);
-  if (r < 0)
-    return r;
-
-  r = krbd_showmapped(krbd, f);
-
-  krbd_destroy(krbd);
-  return r;
-}
-
-static int do_kernel_unmap(const char *dev, const char *poolname,
-                           const char *imgname, const char *snapname)
-{
-  struct krbd_ctx *krbd;
-  int r;
-
-  r = krbd_create_from_context(g_ceph_context, &krbd);
-  if (r < 0)
-    return r;
-
-  if (dev)
-    r = krbd_unmap(krbd, dev);
-  else
-    r = krbd_unmap_by_spec(krbd, poolname, imgname, snapname);
-
-  krbd_destroy(krbd);
-  return r;
-}
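-/*
- * Usage sketch for the three krbd helpers above (hypothetical names;
- * the option string comes from parse_map_options() below):
- *
- *   rbd map rbd/img -o queue_depth=128,ro    # prints e.g. /dev/rbd0
- *   rbd showmapped
- *   rbd unmap /dev/rbd0                      # or: rbd unmap rbd/img
- */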
-
-static string map_option_uuid_cb(const char *value_char)
-{
-  uuid_d u;
-  if (!u.parse(value_char))
-    return "";
-
-  return stringify(u);
-}
-
-static string map_option_ip_cb(const char *value_char)
-{
-  entity_addr_t a;
-  const char *endptr;
-  if (!a.parse(value_char, &endptr) ||
-      endptr != value_char + strlen(value_char)) {
-    return "";
-  }
-
-  return stringify(a.addr);
-}
-
-static string map_option_int_cb(const char *value_char)
-{
-  string err;
-  int d = strict_strtol(value_char, 10, &err);
-  if (!err.empty() || d < 0)
-    return "";
-
-  return stringify(d);
-}
-
-static void put_map_option(const string key, string val)
-{
-  map_options[key] = val;
-}
-
-static int put_map_option_value(const string opt, const char *value_char,
-                                string (*parse_cb)(const char *))
-{
-  if (!value_char || *value_char == '\0') {
-    cerr << "rbd: " << opt << " option requires a value" << std::endl;
-    return 1;
-  }
-
-  string value = parse_cb(value_char);
-  if (value.empty()) {
-    cerr << "rbd: invalid " << opt << " value '" << value_char << "'"
-         << std::endl;
-    return 1;
-  }
-
-  put_map_option(opt, opt + "=" + value);
-  return 0;
-}
-
-static int parse_map_options(char *options)
-{
-  for (char *this_char = strtok(options, ", ");
-       this_char != NULL;
-       this_char = strtok(NULL, ",")) {
-    char *value_char;
-
-    if ((value_char = strchr(this_char, '=')) != NULL)
-      *value_char++ = '\0';
-
-    if (!strcmp(this_char, "fsid")) {
-      if (put_map_option_value("fsid", value_char, map_option_uuid_cb))
-        return 1;
-    } else if (!strcmp(this_char, "ip")) {
-      if (put_map_option_value("ip", value_char, map_option_ip_cb))
-        return 1;
-    } else if (!strcmp(this_char, "share") || !strcmp(this_char, "noshare")) {
-      put_map_option("share", this_char);
-    } else if (!strcmp(this_char, "crc") || !strcmp(this_char, "nocrc")) {
-      put_map_option("crc", this_char);
-    } else if (!strcmp(this_char, "cephx_require_signatures") ||
-               !strcmp(this_char, "nocephx_require_signatures")) {
-      put_map_option("cephx_require_signatures", this_char);
-    } else if (!strcmp(this_char, "tcp_nodelay") ||
-               !strcmp(this_char, "notcp_nodelay")) {
-      put_map_option("tcp_nodelay", this_char);
-    } else if (!strcmp(this_char, "mount_timeout")) {
-      if (put_map_option_value("mount_timeout", value_char, map_option_int_cb))
-        return 1;
-    } else if (!strcmp(this_char, "osdkeepalive")) {
-      if (put_map_option_value("osdkeepalive", value_char, map_option_int_cb))
-        return 1;
-    } else if (!strcmp(this_char, "osd_idle_ttl")) {
-      if (put_map_option_value("osd_idle_ttl", value_char, map_option_int_cb))
-        return 1;
-    } else if (!strcmp(this_char, "rw") || !strcmp(this_char, "ro")) {
-      put_map_option("rw", this_char);
-    } else if (!strcmp(this_char, "queue_depth")) {
-      if (put_map_option_value("queue_depth", value_char, map_option_int_cb))
-        return 1;
-    } else {
-      cerr << "rbd: unknown map option '" << this_char << "'" << std::endl;
-      return 1;
-    }
-  }
-
-  return 0;
-}
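-/*
- * Example: parse_map_options("ip=1.2.3.4,noshare,queue_depth=128")
- * leaves map_options as { "ip" -> "ip=1.2.3.4", "share" -> "noshare",
- * "queue_depth" -> "queue_depth=128" }, which do_kernel_map() above
- * joins with commas (omitting a plain "rw" for old-kernel compat).
- */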
-
-static int disk_usage_callback(uint64_t offset, size_t len, int exists,
-                               void *arg) {
-  uint64_t *used_size = reinterpret_cast<uint64_t *>(arg);
-  if (exists) {
-    (*used_size) += len;
-  }
-  return 0;
-}
-
-static int compute_image_disk_usage(const std::string& name,
-                                    const std::string& snap_name,
-                                    const std::string& from_snap_name,
-                                    librbd::Image &image, uint64_t size,
-                                    TextTable& tbl, Formatter *f,
-                                    uint64_t *used_size) {
-  const char* from = NULL;
-  if (!from_snap_name.empty()) {
-    from = from_snap_name.c_str();
-  }
-
-  uint64_t flags;
-  int r = image.get_flags(&flags);
-  if (r < 0) {
-    cerr << "rbd: failed to retrieve image flags: " << cpp_strerror(r)
-         << std::endl;
-    return r;
-  }
-  if ((flags & RBD_FLAG_FAST_DIFF_INVALID) != 0) {
-    cerr << "warning: fast-diff map is invalid for " << name
-         << (snap_name.empty() ? "" : "@" + snap_name) << ". "
-         << "operation may be slow." << std::endl;
-  }
-
-  *used_size = 0;
-  r = image.diff_iterate2(from, 0, size, false, true,
-                          &disk_usage_callback, used_size);
-  if (r < 0) {
-    cerr << "rbd: failed to iterate diffs: " << cpp_strerror(r) << std::endl;
-    return r;
-  }
-
-  if (f) {
-    f->open_object_section("image");
-    f->dump_string("name", name);
-    if (!snap_name.empty()) {
-      f->dump_string("snapshot", snap_name);
-    }
-    f->dump_unsigned("provisioned_size", size);
-    f->dump_unsigned("used_size" , *used_size);
-    f->close_section();
-  } else {
-    std::string full_name = name;
-    if (!snap_name.empty()) {
-      full_name += "@" + snap_name;
-    }
-    tbl << full_name
-        << stringify(si_t(size))
-        << stringify(si_t(*used_size))
-        << TextTable::endrow;
-  }
-  return 0;
-}
-
-static int do_disk_usage(librbd::RBD &rbd, librados::IoCtx &io_ctx,
-                        const char *imgname, const char *snapname,
-                        Formatter *f) {
-  std::vector<string> names;
-  int r = rbd.list(io_ctx, names);
-  if (r == -ENOENT) {
-    r = 0;
-  } else if (r < 0) {
-    return r;
-  }
-
-  TextTable tbl;
-  if (f) {
-    f->open_object_section("stats");
-    f->open_array_section("images");
-  } else {
-    tbl.define_column("NAME", TextTable::LEFT, TextTable::LEFT);
-    tbl.define_column("PROVISIONED", TextTable::RIGHT, TextTable::RIGHT);
-    tbl.define_column("USED", TextTable::RIGHT, TextTable::RIGHT);
-  }
-
-  uint64_t used_size = 0;
-  uint64_t total_prov = 0;
-  uint64_t total_used = 0;
-  std::sort(names.begin(), names.end());
-  for (std::vector<string>::const_iterator name = names.begin();
-       name != names.end(); ++name) {
-    if (imgname != NULL && *name != imgname) {
-      continue;
-    }
-
-    librbd::Image image;
-    r = rbd.open_read_only(io_ctx, image, name->c_str(), NULL);
-    if (r < 0) {
-      if (r != -ENOENT) {
-        cerr << "rbd: error opening " << *name << ": " << cpp_strerror(r)
-             << std::endl;
-      }
-      continue;
-    }
-
-    uint64_t features;
-    r = image.features(&features);
-    if (r < 0) {
-      cerr << "rbd: failed to retrieve image features: " << cpp_strerror(r)
-           << std::endl;
-      return r;
-    }
-    if ((features & RBD_FEATURE_FAST_DIFF) == 0) {
-      cerr << "warning: fast-diff map is not enabled for " << *name << ". "
-           << "operation may be slow." << std::endl;
-    }
-
-    librbd::image_info_t info;
-    if (image.stat(info, sizeof(info)) < 0) {
-      return -EINVAL;
-    }
-
-    std::vector<librbd::snap_info_t> snap_list;
-    r = image.snap_list(snap_list);
-    if (r < 0) {
-      cerr << "rbd: error opening " << *name << " snapshots: "
-           << cpp_strerror(r) << std::endl;
-      continue;
-    }
-
-    std::string last_snap_name;
-    std::sort(snap_list.begin(), snap_list.end(),
-              boost::bind(&librbd::snap_info_t::id, _1) <
-                boost::bind(&librbd::snap_info_t::id, _2));
-    for (std::vector<librbd::snap_info_t>::const_iterator snap =
-         snap_list.begin(); snap != snap_list.end(); ++snap) {
-      librbd::Image snap_image;
-      r = rbd.open_read_only(io_ctx, snap_image, name->c_str(),
-                             snap->name.c_str());
-      if (r < 0) {
-        cerr << "rbd: error opening snapshot " << *name << "@"
-             << snap->name << ": " << cpp_strerror(r) << std::endl;
-        return r;
-      }
-
-      if (imgname == NULL || (snapname != NULL && snap->name == snapname)) {
-        r = compute_image_disk_usage(*name, snap->name, last_snap_name,
-                                     snap_image, snap->size, tbl, f,
-                                     &used_size);
-        if (r < 0) {
-          return r;
-        }
-
-        if (snapname != NULL) {
-          total_prov += snap->size;
-        }
-        total_used += used_size;
-      }
-      last_snap_name = snap->name;
-    }
-
-    if (snapname == NULL) {
-      r = compute_image_disk_usage(*name, "", last_snap_name, image, info.size,
-                                   tbl, f, &used_size);
-      if (r < 0) {
-        return r;
-      }
-      total_prov += info.size;
-      total_used += used_size;
-    }
-  }
-
-  if (f) {
-    f->close_section();
-    if (imgname == NULL) {
-      f->dump_unsigned("total_provisioned_size", total_prov);
-      f->dump_unsigned("total_used_size", total_used);
-    }
-    f->close_section();
-    f->flush(cout);
-  } else {
-    if (imgname == NULL) {
-      tbl << "<TOTAL>"
-          << stringify(si_t(total_prov))
-          << stringify(si_t(total_used))
-          << TextTable::endrow;
-    }
-    cout << tbl;
-  }
-
-  return 0;
-}
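-/*
- * Plain-format output sketch for do_disk_usage() (hypothetical names
- * and sizes; exact units depend on si_t formatting):
- *
- *   NAME       PROVISIONED   USED
- *   img@snap1        1024M   512M
- *   img              1024M   128M
- *   <TOTAL>          1024M   640M
- */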
-
-enum CommandType {
-  COMMAND_TYPE_NONE,
-  COMMAND_TYPE_SNAP,
-  COMMAND_TYPE_LOCK,
-  COMMAND_TYPE_METADATA,
-  COMMAND_TYPE_FEATURE,
-  COMMAND_TYPE_OBJECT_MAP
-};
-
-enum {
-  OPT_NO_CMD = 0,
-  OPT_LIST,
-  OPT_INFO,
-  OPT_CREATE,
-  OPT_CLONE,
-  OPT_FLATTEN,
-  OPT_CHILDREN,
-  OPT_RESIZE,
-  OPT_RM,
-  OPT_EXPORT,
-  OPT_EXPORT_DIFF,
-  OPT_DIFF,
-  OPT_IMPORT,
-  OPT_IMPORT_DIFF,
-  OPT_COPY,
-  OPT_RENAME,
-  OPT_SNAP_CREATE,
-  OPT_SNAP_ROLLBACK,
-  OPT_SNAP_REMOVE,
-  OPT_SNAP_LIST,
-  OPT_SNAP_PURGE,
-  OPT_SNAP_PROTECT,
-  OPT_SNAP_UNPROTECT,
-  OPT_WATCH,
-  OPT_STATUS,
-  OPT_MAP,
-  OPT_UNMAP,
-  OPT_SHOWMAPPED,
-  OPT_FEATURE_DISABLE,
-  OPT_FEATURE_ENABLE,
-  OPT_LOCK_LIST,
-  OPT_LOCK_ADD,
-  OPT_LOCK_REMOVE,
-  OPT_BENCH_WRITE,
-  OPT_MERGE_DIFF,
-  OPT_METADATA_LIST,
-  OPT_METADATA_SET,
-  OPT_METADATA_GET,
-  OPT_METADATA_REMOVE,
-  OPT_OBJECT_MAP_REBUILD,
-  OPT_DISK_USAGE
-};
-
-static int get_cmd(const char *cmd, CommandType command_type)
-{
-  switch (command_type)
-  {
-  case COMMAND_TYPE_NONE:
-    if (strcmp(cmd, "ls") == 0 ||
-        strcmp(cmd, "list") == 0)
-      return OPT_LIST;
-    if (strcmp(cmd, "du") == 0 ||
-        strcmp(cmd, "disk-usage") == 0)
-      return OPT_DISK_USAGE;
-    if (strcmp(cmd, "info") == 0)
-      return OPT_INFO;
-    if (strcmp(cmd, "create") == 0)
-      return OPT_CREATE;
-    if (strcmp(cmd, "clone") == 0)
-      return OPT_CLONE;
-    if (strcmp(cmd, "flatten") == 0)
-      return OPT_FLATTEN;
-    if (strcmp(cmd, "children") == 0)
-      return OPT_CHILDREN;
-    if (strcmp(cmd, "resize") == 0)
-      return OPT_RESIZE;
-    if (strcmp(cmd, "rm") == 0)
-      return OPT_RM;
-    if (strcmp(cmd, "export") == 0)
-      return OPT_EXPORT;
-    if (strcmp(cmd, "export-diff") == 0)
-      return OPT_EXPORT_DIFF;
-    if (strcmp(cmd, "merge-diff") == 0)
-      return OPT_MERGE_DIFF;
-    if (strcmp(cmd, "diff") == 0)
-      return OPT_DIFF;
-    if (strcmp(cmd, "import") == 0)
-      return OPT_IMPORT;
-    if (strcmp(cmd, "import-diff") == 0)
-      return OPT_IMPORT_DIFF;
-    if (strcmp(cmd, "copy") == 0 ||
-        strcmp(cmd, "cp") == 0)
-      return OPT_COPY;
-    if (strcmp(cmd, "rename") == 0 ||
-        strcmp(cmd, "mv") == 0)
-      return OPT_RENAME;
-    if (strcmp(cmd, "watch") == 0)
-      return OPT_WATCH;
-    if (strcmp(cmd, "status") == 0)
-      return OPT_STATUS;
-    if (strcmp(cmd, "map") == 0)
-      return OPT_MAP;
-    if (strcmp(cmd, "showmapped") == 0)
-      return OPT_SHOWMAPPED;
-    if (strcmp(cmd, "unmap") == 0)
-      return OPT_UNMAP;
-    if (strcmp(cmd, "bench-write") == 0)
-      return OPT_BENCH_WRITE;
-    break;
-  case COMMAND_TYPE_SNAP:
-    if (strcmp(cmd, "create") == 0 ||
-        strcmp(cmd, "add") == 0)
-      return OPT_SNAP_CREATE;
-    if (strcmp(cmd, "rollback") == 0 ||
-        strcmp(cmd, "revert") == 0)
-      return OPT_SNAP_ROLLBACK;
-    if (strcmp(cmd, "remove") == 0 ||
-        strcmp(cmd, "rm") == 0)
-      return OPT_SNAP_REMOVE;
-    if (strcmp(cmd, "ls") == 0 ||
-        strcmp(cmd, "list") == 0)
-      return OPT_SNAP_LIST;
-    if (strcmp(cmd, "purge") == 0)
-      return OPT_SNAP_PURGE;
-    if (strcmp(cmd, "protect") == 0)
-      return OPT_SNAP_PROTECT;
-    if (strcmp(cmd, "unprotect") == 0)
-      return OPT_SNAP_UNPROTECT;
-    break;
-  case COMMAND_TYPE_METADATA:
-    if (strcmp(cmd, "list") == 0)
-      return OPT_METADATA_LIST;
-    if (strcmp(cmd, "set") == 0)
-      return OPT_METADATA_SET;
-    if (strcmp(cmd, "get") == 0)
-      return OPT_METADATA_GET;
-    if (strcmp(cmd, "remove") == 0)
-      return OPT_METADATA_REMOVE;
-    break;
-  case COMMAND_TYPE_LOCK:
-    if (strcmp(cmd, "ls") == 0 ||
-        strcmp(cmd, "list") == 0)
-      return OPT_LOCK_LIST;
-    if (strcmp(cmd, "add") == 0)
-      return OPT_LOCK_ADD;
-    if (strcmp(cmd, "remove") == 0 ||
-	strcmp(cmd, "rm") == 0)
-      return OPT_LOCK_REMOVE;
-    break;
-  case COMMAND_TYPE_FEATURE:
-    if (strcmp(cmd, "disable") == 0) {
-      return OPT_FEATURE_DISABLE;
-    } else if (strcmp(cmd, "enable") == 0) {
-      return OPT_FEATURE_ENABLE;
-    }
-    break;
-  case COMMAND_TYPE_OBJECT_MAP:
-    if (strcmp(cmd, "rebuild") == 0)
-      return OPT_OBJECT_MAP_REBUILD;
-    break;
-  }
-
-  return OPT_NO_CMD;
-}
-
-/*
- * Called 1-N times depending on how many args the command needs.  If
- * the positional varN is already set, set the next one; this handles
- * both --args above and unadorned args below.  Calling with all args
- * filled is an error.
- */
-static bool set_conf_param(const char *param, const char **var1,
-			   const char **var2, const char **var3)
-{
-  if (!*var1)
-    *var1 = param;
-  else if (var2 && !*var2)
-    *var2 = param;
-  else if (var3 && !*var3)
-    *var3 = param;
-  else
-    return false;
-  return true;
-}
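-/*
- * Example: for OPT_METADATA_SET the three slots are (&imgname, &key,
- * &value), so `rbd image-meta set img k v` fills them left to right;
- * a fourth positional argument makes set_conf_param() return false,
- * which the caller reports as an extraneous parameter.
- */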
-
-bool size_set;
-
-int main(int argc, const char **argv)
-{
-  librados::Rados rados;
-  librbd::RBD rbd;
-  librados::IoCtx io_ctx, dest_io_ctx;
-  librbd::Image image;
-
-  vector<const char*> args;
-
-  argv_to_vec(argc, argv, args);
-  env_to_vec(args);
-
-  int opt_cmd = OPT_NO_CMD;
-  global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, 0);
-
-  const char *poolname = NULL;
-  uint64_t size = 0;  // in bytes
-  int order = 0;
-  bool format_specified = false,
-    output_format_specified = false;
-  int format = 2;
-
-  uint64_t features = 0;
-  bool shared = false;
-
-  const char *imgname = NULL, *snapname = NULL, *destname = NULL,
-    *dest_poolname = NULL, *dest_snapname = NULL, *path = NULL,
-    *devpath = NULL, *lock_cookie = NULL, *lock_client = NULL,
-    *lock_tag = NULL, *output_format = "plain",
-    *fromsnapname = NULL,
-    *first_diff = NULL, *second_diff = NULL, *key = NULL, *value = NULL;
-  char *cli_map_options = NULL;
-  std::vector<const char*> feature_names;
-  bool lflag = false;
-  int pretty_format = 0;
-  long long stripe_unit = 0, stripe_count = 0;
-  long long bench_io_size = 4096, bench_io_threads = 16, bench_bytes = 1 << 30;
-  string bench_pattern = "seq";
-  bool diff_whole_object = false;
-  bool input_feature = false;
-
-  std::string val, parse_err;
-  std::ostringstream err;
-  uint64_t sizell = 0;
-  std::vector<const char*>::iterator i;
-  for (i = args.begin(); i != args.end(); ) {
-    if (ceph_argparse_double_dash(args, i)) {
-      break;
-    } else if (ceph_argparse_witharg(args, i, &val, "--secret", (char*)NULL)) {
-      int r = g_conf->set_val("keyfile", val.c_str());
-      assert(r == 0);
-    } else if (ceph_argparse_flag(args, i, "-h", "--help", (char*)NULL)) {
-      usage();
-      return 0;
-    } else if (ceph_argparse_flag(args, i, "--new-format", (char*)NULL)) {
-      cerr << "rbd: --new-format is deprecated" << std::endl;
-      format = 2;
-      format_specified = true;
-    } else if (ceph_argparse_witharg(args, i, &val, "--image-format",
-				     (char*)NULL)) {
-      format = strict_strtol(val.c_str(), 10, &parse_err);
-      if (!parse_err.empty()) {
-	cerr << "rbd: error parsing --image-format: " << parse_err << std::endl;
-	return EXIT_FAILURE;
-      }
-      format_specified = true;
-      if (0 != g_conf->set_val("rbd_default_format", val.c_str())) {
-        cerr << "rbd: image format must be 1 or 2" << std::endl;
-        return EXIT_FAILURE;
-      }
-    } else if (ceph_argparse_witharg(args, i, &val, "-p", "--pool", (char*)NULL)) {
-      poolname = strdup(val.c_str());
-    } else if (ceph_argparse_witharg(args, i, &val, "--dest-pool", (char*)NULL)) {
-      dest_poolname = strdup(val.c_str());
-    } else if (ceph_argparse_witharg(args, i, &val, "--snap", (char*)NULL)) {
-      snapname = strdup(val.c_str());
-    } else if (ceph_argparse_witharg(args, i, &val, "--from-snap", (char*)NULL)) {
-      fromsnapname = strdup(val.c_str());
-    } else if (ceph_argparse_witharg(args, i, &val, "-i", "--image", (char*)NULL)) {
-      imgname = strdup(val.c_str());
-    } else if (ceph_argparse_witharg(args, i, &val, err, "-s", "--size", (char*)NULL)) {
-      if (!err.str().empty()) {
-        cerr << "rbd: " << err.str() << std::endl;
-        return EXIT_FAILURE;
-      }
-      const char *sizeval = val.c_str();
-      size = strict_sistrtoll(sizeval, &parse_err);
-      if (!parse_err.empty()) {
-        cerr << "rbd: error parsing --size " << parse_err << std::endl;
-        return EXIT_FAILURE;
-      }
-      //NOTE: the three lines below can be removed once all applications
-      //using this CLI pass an explicit B/K/M/G/T/P/E suffix with the size value
-      sizell = atoll(sizeval);
-      if (size == sizell)
-        size = size << 20;   // no suffix given: default is MB, convert to bytes
-      size_set = true;
-    } else if (ceph_argparse_flag(args, i, "-l", "--long", (char*)NULL)) {
-      lflag = true;
-    } else if (ceph_argparse_witharg(args, i, &val, err, "--stripe-unit", (char*)NULL)) {
-      if (!err.str().empty()) {
-        cerr << "rbd: " << err.str() << std::endl;
-        return EXIT_FAILURE;
-      }
-      const char *stripeval = val.c_str();
-      stripe_unit = strict_sistrtoll(stripeval, &parse_err);
-      if (!parse_err.empty()) {
-        cerr << "rbd: error parsing --stripe-unit " << parse_err << std::endl;
-        return EXIT_FAILURE;
-      }
-    } else if (ceph_argparse_witharg(args, i, &stripe_count, err, "--stripe-count", (char*)NULL)) {
-    } else if (ceph_argparse_witharg(args, i, &order, err, "--order", (char*)NULL)) {
-      if (!err.str().empty()) {
-	cerr << "rbd: " << err.str() << std::endl;
-	return EXIT_FAILURE;
-      }
-    } else if (ceph_argparse_witharg(args, i, &val, err, "--io-size", (char*)NULL)) {
-      if (!err.str().empty()) {
-	cerr << "rbd: " << err.str() << std::endl;
-	return EXIT_FAILURE;
-      }
-      const char *iosval = val.c_str();
-      bench_io_size = strict_sistrtoll(iosval, &parse_err);
-      if (!parse_err.empty()) {
-        cerr << "rbd: error parsing --io-size " << parse_err << std::endl;
-        return EXIT_FAILURE;
-      }
-      if (bench_io_size == 0) {
-	cerr << "rbd: io-size must be > 0" << std::endl;
-	return EXIT_FAILURE;
-      }
-    } else if (ceph_argparse_witharg(args, i, &bench_io_threads, err, "--io-threads", (char*)NULL)) {
-    } else if (ceph_argparse_witharg(args, i, &val, err, "--io-total", (char*)NULL)) {
-      if (!err.str().empty()) {
-       cerr << "rbd: " << err.str() << std::endl;
-       return EXIT_FAILURE;
-      }
-      const char *iotval = val.c_str();
-      bench_bytes = strict_sistrtoll(iotval, &parse_err);
-      if (!parse_err.empty()) {
-        cerr << "rbd: error parsing --io-total " << parse_err << std::endl;
-        return EXIT_FAILURE;
-      }
-    } else if (ceph_argparse_witharg(args, i, &bench_pattern, "--io-pattern", (char*)NULL)) {
-    } else if (ceph_argparse_witharg(args, i, &val, "--path", (char*)NULL)) {
-      path = strdup(val.c_str());
-    } else if (ceph_argparse_witharg(args, i, &val, "--dest", (char*)NULL)) {
-      destname = strdup(val.c_str());
-    } else if (ceph_argparse_witharg(args, i, &val, "--parent", (char *)NULL)) {
-      imgname = strdup(val.c_str());
-    } else if (ceph_argparse_witharg(args, i, &val, "--shared", (char *)NULL)) {
-      lock_tag = strdup(val.c_str());
-    } else if (ceph_argparse_flag(args, i, "--no-settle", (char *)NULL)) {
-      cerr << "rbd: --no-settle is deprecated" << std::endl;
-    } else if (ceph_argparse_witharg(args, i, &val, "-o", "--options", (char*)NULL)) {
-      cli_map_options = strdup(val.c_str());
-    } else if (ceph_argparse_flag(args, i, "--read-only", (char *)NULL)) {
-      // --read-only is equivalent to -o ro
-      put_map_option("rw", "ro");
-    } else if (ceph_argparse_flag(args, i, "--no-progress", (char *)NULL)) {
-      progress = false;
-    } else if (ceph_argparse_flag(args, i , "--allow-shrink", (char *)NULL)) {
-      resize_allow_shrink = true;
-    } else if (ceph_argparse_witharg(args, i, &val, "--image-feature", (char *)NULL)) {
-      uint64_t feature;
-      input_feature = true;
-      if (!decode_feature(val.c_str(), &feature)) {
-        cerr << "rbd: invalid image feature: " << val << std::endl;
-        return EXIT_FAILURE;
-      }
-      features |= feature;
-    } else if (ceph_argparse_witharg(args, i, &val, "--image-features", (char *)NULL)) {
-      cerr << "rbd: using --image-features for specifying the rbd image format is"
-	   << " deprecated, use --image-feature instead" << std::endl;
-      features = strict_strtol(val.c_str(), 10, &parse_err);
-      input_feature = true;
-      if (!parse_err.empty()) {
-	cerr << "rbd: error parsing --image-features: " << parse_err
-             << std::endl;
-	return EXIT_FAILURE;
-      }
-    } else if (ceph_argparse_flag(args, i, "--image-shared", (char *)NULL)) {
-      shared = true;
-    } else if (ceph_argparse_witharg(args, i, &val, "--format", (char *) NULL)) {
-      long long ret = strict_strtoll(val.c_str(), 10, &parse_err);
-      if (parse_err.empty()) {
-	g_conf->set_val_or_die("rbd_default_format", val.c_str());
-	format = ret;
-	format_specified = true;
-	cerr << "rbd: using --format for specifying the rbd image format is"
-	     << " deprecated, use --image-format instead"
-	     << std::endl;
-      } else {
-	output_format = strdup(val.c_str());
-	output_format_specified = true;
-      }
-    } else if (ceph_argparse_flag(args, i, "--whole-object", (char *)NULL)) {
-      diff_whole_object = true;
-    } else if (ceph_argparse_binary_flag(args, i, &pretty_format, NULL, "--pretty-format", (char*)NULL)) {
-    } else {
-      ++i;
-    }
-  }
-
-  if (features != 0 && !format_specified) {
-    format = 2;
-    format_specified = true;
-  } else if (features == 0) {
-    features = g_conf->rbd_default_features;
-  }
-  if (shared) {
-    features &= ~(RBD_FEATURE_EXCLUSIVE_LOCK | RBD_FEATURE_OBJECT_MAP);
-  }
-  if (((features & RBD_FEATURE_EXCLUSIVE_LOCK) == 0) &&
-      ((features & RBD_FEATURE_OBJECT_MAP) != 0)) {
-    cerr << "rbd: exclusive lock image feature must be enabled to use "
-         << "the object map" << std::endl;
-    return EXIT_FAILURE;
-  }
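-  // e.g. `rbd create --size 1024 --image-feature exclusive-lock \
-  //   --image-feature object-map rbd/img` satisfies this check
-  // (feature-name spellings are assumed; see decode_feature()).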
-
-  common_init_finish(g_ceph_context);
-
-  std::map<std::string, CommandType> command_map = boost::assign::map_list_of
-    ("snap", COMMAND_TYPE_SNAP)
-    ("lock", COMMAND_TYPE_LOCK)
-    ("image-meta", COMMAND_TYPE_METADATA)
-    ("feature", COMMAND_TYPE_FEATURE)
-    ("object-map", COMMAND_TYPE_OBJECT_MAP);
-
-  i = args.begin();
-  if (i == args.end()) {
-    cerr << "rbd: you must specify a command." << std::endl;
-    return EXIT_FAILURE;
-  } else if (command_map.count(*i) > 0) {
-    std::string command(*i);
-    i = args.erase(i);
-    if (i == args.end()) {
-      cerr << "rbd: which " << command << " command do you want?" << std::endl;
-      return EXIT_FAILURE;
-    }
-    opt_cmd = get_cmd(*i, command_map[command]);
-  } else {
-    opt_cmd = get_cmd(*i, COMMAND_TYPE_NONE);
-  }
-  if (opt_cmd == OPT_NO_CMD) {
-    cerr << "rbd: error parsing command '" << *i << "'; -h or --help for usage"
-         << std::endl;
-    return EXIT_FAILURE;
-  }
-
-  // loop across all remaining arguments; by command, accumulate any
-  // that are still missing into the appropriate variables, one at a
-  // time (i.e. SET_CONF_PARAM will be called N times for N remaining
-  // arguments).
-
-#define SET_CONF_PARAM(v, p1, p2, p3) \
-if (!set_conf_param(v, p1, p2, p3)) { \
-  cerr << "rbd: extraneous parameter " << v << std::endl; \
-  return EXIT_FAILURE; \
-}
-
-  for (i = args.erase(i); i != args.end(); ++i) {
-    const char *v = *i;
-    switch (opt_cmd) {
-      case OPT_LIST:
-	SET_CONF_PARAM(v, &poolname, NULL, NULL);
-	break;
-      case OPT_INFO:
-      case OPT_CREATE:
-      case OPT_FLATTEN:
-      case OPT_RESIZE:
-      case OPT_RM:
-      case OPT_SNAP_CREATE:
-      case OPT_SNAP_ROLLBACK:
-      case OPT_SNAP_REMOVE:
-      case OPT_SNAP_LIST:
-      case OPT_SNAP_PURGE:
-      case OPT_SNAP_PROTECT:
-      case OPT_SNAP_UNPROTECT:
-      case OPT_WATCH:
-      case OPT_STATUS:
-      case OPT_MAP:
-      case OPT_UNMAP:
-      case OPT_BENCH_WRITE:
-      case OPT_LOCK_LIST:
-      case OPT_METADATA_LIST:
-      case OPT_DIFF:
-      case OPT_OBJECT_MAP_REBUILD:
-      case OPT_DISK_USAGE:
-	SET_CONF_PARAM(v, &imgname, NULL, NULL);
-	break;
-      case OPT_EXPORT:
-      case OPT_EXPORT_DIFF:
-	SET_CONF_PARAM(v, &imgname, &path, NULL);
-	break;
-      case OPT_MERGE_DIFF:
-        SET_CONF_PARAM(v, &first_diff, &second_diff, &path);
-        break;
-      case OPT_IMPORT:
-      case OPT_IMPORT_DIFF:
-	SET_CONF_PARAM(v, &path, &imgname, NULL);
-	break;
-      case OPT_COPY:
-      case OPT_RENAME:
-      case OPT_CLONE:
-	SET_CONF_PARAM(v, &imgname, &destname, NULL);
-	break;
-      case OPT_SHOWMAPPED:
-	cerr << "rbd: showmapped takes no parameters" << std::endl;
-	return EXIT_FAILURE;
-      case OPT_CHILDREN:
-	SET_CONF_PARAM(v, &imgname, NULL, NULL);
-	break;
-      case OPT_LOCK_ADD:
-	SET_CONF_PARAM(v, &imgname, &lock_cookie, NULL);
-	break;
-      case OPT_LOCK_REMOVE:
-	SET_CONF_PARAM(v, &imgname, &lock_cookie, &lock_client);
-	break;
-      case OPT_METADATA_SET:
-	SET_CONF_PARAM(v, &imgname, &key, &value);
-	break;
-      case OPT_METADATA_GET:
-      case OPT_METADATA_REMOVE:
-	SET_CONF_PARAM(v, &imgname, &key, NULL);
-	break;
-      case OPT_FEATURE_DISABLE:
-      case OPT_FEATURE_ENABLE:
-        if (imgname == NULL) {
-          imgname = v;
-        } else {
-          feature_names.push_back(v);
-        }
-        break;
-      default:
-	assert(0);
-	break;
-    }
-  }
-
-  g_conf->set_val_or_die("rbd_cache_writethrough_until_flush", "false");
-
-  /* get defaults from rbd_default_* options to keep behavior consistent with
-     manual short-form options */
-  if (!format_specified)
-    format = g_conf->rbd_default_format;
-  if (!order)
-    order = g_conf->rbd_default_order;
-  if (!stripe_unit)
-    stripe_unit = g_conf->rbd_default_stripe_unit;
-  if (!stripe_count)
-    stripe_count = g_conf->rbd_default_stripe_count;
-
-  if (format_specified && opt_cmd != OPT_IMPORT && opt_cmd != OPT_CREATE) {
-    cerr << "rbd: image format can only be set when "
-	 << "creating or importing an image" << std::endl;
-    return EXIT_FAILURE;
-  }
-
-  if (opt_cmd != OPT_LOCK_ADD && lock_tag) {
-    cerr << "rbd: only the lock add command uses the --shared option"
-	 << std::endl;
-    return EXIT_FAILURE;
-  }
-
-  if (pretty_format && !strcmp(output_format, "plain")) {
-    cerr << "rbd: --pretty-format only works when --format is json or xml"
-	 << std::endl;
-    return EXIT_FAILURE;
-  }
-
-  boost::scoped_ptr<Formatter> formatter;
-  if (output_format_specified && opt_cmd != OPT_SHOWMAPPED &&
-      opt_cmd != OPT_INFO && opt_cmd != OPT_LIST &&
-      opt_cmd != OPT_SNAP_LIST && opt_cmd != OPT_LOCK_LIST &&
-      opt_cmd != OPT_CHILDREN && opt_cmd != OPT_DIFF &&
-      opt_cmd != OPT_METADATA_LIST && opt_cmd != OPT_STATUS &&
-      opt_cmd != OPT_DISK_USAGE) {
-    cerr << "rbd: command doesn't use output formatting"
-	 << std::endl;
-    return EXIT_FAILURE;
-  } else if (get_outfmt(output_format, pretty_format, &formatter) < 0) {
-    return EXIT_FAILURE;
-  }
-
-  if (format_specified) {
-    if (format < 1 || format > 2) {
-      cerr << "rbd: image format must be 1 or 2" << std::endl;
-      return EXIT_FAILURE;
-    }
-  }
-
-  if ((opt_cmd == OPT_IMPORT || opt_cmd == OPT_IMPORT_DIFF) && !path) {
-    cerr << "rbd: path was not specified" << std::endl;
-    return EXIT_FAILURE;
-  }
-
-  if (opt_cmd == OPT_IMPORT && !destname) {
-    destname = imgname;
-    if (!destname)
-      destname = imgname_from_path(path);
-    imgname = NULL;
-  }
-
-  if (opt_cmd != OPT_LIST &&
-      opt_cmd != OPT_IMPORT &&
-      opt_cmd != OPT_UNMAP && /* needs imgname but handled below */
-      opt_cmd != OPT_SHOWMAPPED &&
-      opt_cmd != OPT_MERGE_DIFF &&
-      opt_cmd != OPT_DISK_USAGE && !imgname) {
-    cerr << "rbd: image name was not specified" << std::endl;
-    return EXIT_FAILURE;
-  }
-
-  if (opt_cmd == OPT_MAP) {
-    char *default_map_options = strdup(g_conf->rbd_default_map_options.c_str());
-
-    // parse default options first so they can be overwritten by cli options
-    if (parse_map_options(default_map_options)) {
-      cerr << "rbd: couldn't parse default map options" << std::endl;
-      return EXIT_FAILURE;
-    }
-    if (cli_map_options && parse_map_options(cli_map_options)) {
-      cerr << "rbd: couldn't parse map options" << std::endl;
-      return EXIT_FAILURE;
-    }
-  }
-  if (opt_cmd == OPT_UNMAP) {
-    if (!imgname) {
-      cerr << "rbd: unmap requires either image name or device path" << std::endl;
-      return EXIT_FAILURE;
-    }
-
-    if (strncmp(imgname, "/dev/", 5) == 0) {
-      devpath = imgname;
-      imgname = NULL;
-    }
-  }
-
-  // do this unconditionally so we can parse pool/image@snapshot into
-  // the relevant parts
-  set_pool_image_name(imgname, (char **)&poolname,
-		      (char **)&imgname, (char **)&snapname);
-  if (snapname && opt_cmd != OPT_SNAP_CREATE && opt_cmd != OPT_SNAP_ROLLBACK &&
-      opt_cmd != OPT_SNAP_REMOVE && opt_cmd != OPT_INFO &&
-      opt_cmd != OPT_EXPORT && opt_cmd != OPT_EXPORT_DIFF &&
-      opt_cmd != OPT_DIFF && opt_cmd != OPT_COPY &&
-      opt_cmd != OPT_MAP && opt_cmd != OPT_UNMAP && opt_cmd != OPT_CLONE &&
-      opt_cmd != OPT_SNAP_PROTECT && opt_cmd != OPT_SNAP_UNPROTECT &&
-      opt_cmd != OPT_CHILDREN && opt_cmd != OPT_OBJECT_MAP_REBUILD &&
-      opt_cmd != OPT_DISK_USAGE) {
-    cerr << "rbd: snapname specified for a command that doesn't use it"
-	 << std::endl;
-    return EXIT_FAILURE;
-  }
-  if ((opt_cmd == OPT_SNAP_CREATE || opt_cmd == OPT_SNAP_ROLLBACK ||
-       opt_cmd == OPT_SNAP_REMOVE || opt_cmd == OPT_CLONE ||
-       opt_cmd == OPT_SNAP_PROTECT || opt_cmd == OPT_SNAP_UNPROTECT ||
-       opt_cmd == OPT_CHILDREN) && !snapname) {
-    cerr << "rbd: snap name was not specified" << std::endl;
-    return EXIT_FAILURE;
-  }
-
-  set_pool_image_name(destname, (char **)&dest_poolname,
-		      (char **)&destname, (char **)&dest_snapname);
-  if (dest_snapname) {
-    // no command uses dest_snapname
-    cerr << "rbd: destination snapname specified for a command that doesn't use it"
-         << std::endl;
-    return EXIT_FAILURE;
-  }
-
-  if (opt_cmd == OPT_IMPORT) {
-    if (poolname && dest_poolname) {
-      cerr << "rbd: source and destination pool both specified" << std::endl;
-      return EXIT_FAILURE;
-    }
-    if (imgname && destname) {
-      cerr << "rbd: source and destination image both specified" << std::endl;
-      return EXIT_FAILURE;
-    }
-    if (poolname)
-      dest_poolname = poolname;
-  }
-
-  if (!poolname)
-    poolname = "rbd";
-
-  if (!dest_poolname)
-    dest_poolname = "rbd";
-
-  if (opt_cmd == OPT_MERGE_DIFF) {
-    if (!first_diff) {
-      cerr << "rbd: first diff was not specified" << std::endl;
-      return EXIT_FAILURE;
-    }
-    if (!second_diff) {
-      cerr << "rbd: second diff was not specified" << std::endl;
-      return EXIT_FAILURE;
-    }
-  }
-  if ((opt_cmd == OPT_EXPORT || opt_cmd == OPT_EXPORT_DIFF ||
-      opt_cmd == OPT_MERGE_DIFF) && !path) {
-    if (opt_cmd == OPT_EXPORT) {
-      path = imgname;
-    } else {
-      cerr << "rbd: path was not specified" << std::endl;
-      return EXIT_FAILURE;
-    }
-  }
-
-  if ((opt_cmd == OPT_COPY || opt_cmd == OPT_CLONE || opt_cmd == OPT_RENAME) &&
-      ((!destname) || (destname[0] == '\0')) ) {
-    cerr << "rbd: destination image name was not specified" << std::endl;
-    return EXIT_FAILURE;
-  }
-
-  if ((opt_cmd == OPT_CLONE) && size) {
-    cerr << "rbd: clone must begin at size of parent" << std::endl;
-    return EXIT_FAILURE;
-  }
-
-  if ((opt_cmd == OPT_RENAME) && (strcmp(poolname, dest_poolname) != 0)) {
-    cerr << "rbd: mv/rename across pools not supported" << std::endl;
-    cerr << "source pool: " << poolname << " dest pool: " << dest_poolname
-      << std::endl;
-    return EXIT_FAILURE;
-  }
-
-  if (opt_cmd == OPT_LOCK_ADD || opt_cmd == OPT_LOCK_REMOVE) {
-    if (!lock_cookie) {
-      cerr << "rbd: lock id was not specified" << std::endl;
-      return EXIT_FAILURE;
-    }
-    if (opt_cmd == OPT_LOCK_REMOVE && !lock_client) {
-      cerr << "rbd: locker was not specified" << std::endl;
-      return EXIT_FAILURE;
-    }
-  }
-
-  if (opt_cmd == OPT_FEATURE_DISABLE || opt_cmd == OPT_FEATURE_ENABLE) {
-    if (feature_names.empty()) {
-      cerr << "rbd: at least one feature name must be specified" << std::endl;
-      return EXIT_FAILURE;
-    }
-
-    features = 0;
-    for (size_t i = 0; i < feature_names.size(); ++i) {
-      uint64_t feature;
-      if (!decode_feature(feature_names[i], &feature)) {
-        cerr << "rbd: invalid feature name specified: " << feature_names[i]
-             << std::endl;
-        return EXIT_FAILURE;
-      }
-      features |= feature;
-    }
-  }
-
-  if (opt_cmd == OPT_METADATA_GET || opt_cmd == OPT_METADATA_REMOVE ||
-      opt_cmd == OPT_METADATA_SET) {
-    if (!key) {
-      cerr << "rbd: metadata key was not specified" << std::endl;
-      return EXIT_FAILURE;
-    }
-    if (opt_cmd == OPT_METADATA_SET && !value) {
-      cerr << "rbd: metadata value was not specified" << std::endl;
-      return EXIT_FAILURE;
-    }
-  }
-
-  bool talk_to_cluster = (opt_cmd != OPT_MAP &&
-			  opt_cmd != OPT_UNMAP &&
-			  opt_cmd != OPT_SHOWMAPPED &&
-                          opt_cmd != OPT_MERGE_DIFF);
-  if (talk_to_cluster && rados.init_with_context(g_ceph_context) < 0) {
-    cerr << "rbd: couldn't initialize rados!" << std::endl;
-    return EXIT_FAILURE;
-  }
-
-  if (talk_to_cluster && rados.connect() < 0) {
-    cerr << "rbd: couldn't connect to the cluster!" << std::endl;
-    return EXIT_FAILURE;
-  }
-
-  int r;
-  if (talk_to_cluster && opt_cmd != OPT_IMPORT) {
-    r = rados.ioctx_create(poolname, io_ctx);
-    if (r < 0) {
-      cerr << "rbd: error opening pool " << poolname << ": "
-	   << cpp_strerror(-r) << std::endl;
-      return -r;
-    }
-  }
-
-  if (imgname && talk_to_cluster &&
-      (opt_cmd == OPT_RESIZE || opt_cmd == OPT_SNAP_CREATE ||
-       opt_cmd == OPT_SNAP_ROLLBACK || opt_cmd == OPT_SNAP_REMOVE ||
-       opt_cmd == OPT_SNAP_PURGE || opt_cmd == OPT_SNAP_PROTECT ||
-       opt_cmd == OPT_SNAP_UNPROTECT || opt_cmd == OPT_WATCH ||
-       opt_cmd == OPT_FLATTEN || opt_cmd == OPT_LOCK_ADD ||
-       opt_cmd == OPT_LOCK_REMOVE || opt_cmd == OPT_BENCH_WRITE ||
-       opt_cmd == OPT_INFO || opt_cmd == OPT_SNAP_LIST ||
-       opt_cmd == OPT_IMPORT_DIFF ||
-       opt_cmd == OPT_EXPORT || opt_cmd == OPT_EXPORT_DIFF || opt_cmd == OPT_COPY ||
-       opt_cmd == OPT_DIFF || opt_cmd == OPT_STATUS ||
-       opt_cmd == OPT_CHILDREN || opt_cmd == OPT_LOCK_LIST ||
-       opt_cmd == OPT_METADATA_SET || opt_cmd == OPT_METADATA_LIST ||
-       opt_cmd == OPT_METADATA_REMOVE || opt_cmd == OPT_METADATA_GET ||
-       opt_cmd == OPT_FEATURE_DISABLE || opt_cmd == OPT_FEATURE_ENABLE ||
-       opt_cmd == OPT_OBJECT_MAP_REBUILD || opt_cmd == OPT_DISK_USAGE)) {
-
-    if (opt_cmd == OPT_INFO || opt_cmd == OPT_SNAP_LIST ||
-        opt_cmd == OPT_EXPORT || opt_cmd == OPT_EXPORT_DIFF || opt_cmd == OPT_COPY ||
-	opt_cmd == OPT_CHILDREN || opt_cmd == OPT_LOCK_LIST ||
-        opt_cmd == OPT_METADATA_LIST || opt_cmd == OPT_STATUS ||
-        opt_cmd == OPT_WATCH || opt_cmd == OPT_DISK_USAGE) {
-      r = rbd.open_read_only(io_ctx, image, imgname, NULL);
-    } else {
-      r = rbd.open(io_ctx, image, imgname);
-    }
-    if (r < 0) {
-      cerr << "rbd: error opening image " << imgname << ": "
-	   << cpp_strerror(-r) << std::endl;
-      return -r;
-    }
-  }
-
-  if (snapname && talk_to_cluster &&
-      (opt_cmd == OPT_INFO ||
-       opt_cmd == OPT_EXPORT ||
-       opt_cmd == OPT_EXPORT_DIFF ||
-       opt_cmd == OPT_DIFF ||
-       opt_cmd == OPT_COPY ||
-       opt_cmd == OPT_CHILDREN ||
-       opt_cmd == OPT_OBJECT_MAP_REBUILD ||
-       opt_cmd == OPT_DISK_USAGE)) {
-    r = image.snap_set(snapname);
-    if (r < 0) {
-      cerr << "rbd: error setting snapshot context: " << cpp_strerror(-r)
-	   << std::endl;
-      return -r;
-    }
-  }
-
-  if (opt_cmd == OPT_COPY || opt_cmd == OPT_IMPORT || opt_cmd == OPT_CLONE) {
-    r = rados.ioctx_create(dest_poolname, dest_io_ctx);
-    if (r < 0) {
-      cerr << "rbd: error opening pool " << dest_poolname << ": "
-	   << cpp_strerror(-r) << std::endl;
-      return -r;
-    }
-  }
-
-  if (opt_cmd == OPT_CREATE || opt_cmd == OPT_RESIZE) {
-    if (!size_set) {
-      cerr << "rbd: must specify --size <M/G/T>" << std::endl;
-      return EINVAL;
-    }
-  }
-
-  if (opt_cmd == OPT_CREATE || opt_cmd == OPT_CLONE || opt_cmd == OPT_IMPORT) {
-    if ((stripe_unit && !stripe_count) || (!stripe_unit && stripe_count)) {
-      cerr << "must specify both (or neither) of stripe-unit and stripe-count"
-	   << std::endl;
-      usage();
-      return EINVAL;
-    }
-
-    if ((stripe_unit || stripe_count) &&
-	(stripe_unit != (1ll << order) && stripe_count != 1)) {
-      features |= RBD_FEATURE_STRIPINGV2;
-    } else {
-      features &= ~RBD_FEATURE_STRIPINGV2;
-    }
-  }
-
-  switch (opt_cmd) {
-  case OPT_LIST:
-    r = do_list(rbd, io_ctx, lflag, formatter.get());
-    if (r < 0) {
-      cerr << "rbd: list: " << cpp_strerror(-r) << std::endl;
-      return -r;
-    }
-    break;
-
-  case OPT_CREATE:
-    if (input_feature && (format == 1)) {
-      cerr << "rbd: image features are not allowed with format 1; use --image-format 2" << std::endl;
-      return EINVAL;
-    }
-    r = do_create(rbd, io_ctx, imgname, size, &order, format, features,
-		  stripe_unit, stripe_count);
-    if (r < 0) {
-      cerr << "rbd: create error: " << cpp_strerror(-r) << std::endl;
-      return -r;
-    }
-    break;
-
-  case OPT_CLONE:
-    r = do_clone(rbd, io_ctx, imgname, snapname, dest_io_ctx, destname,
-		 features, &order, stripe_unit, stripe_count);
-    if (r < 0) {
-      cerr << "rbd: clone error: " << cpp_strerror(-r) << std::endl;
-      return -r;
-    }
-    break;
-
-  case OPT_FLATTEN:
-    r = do_flatten(image);
-    if (r < 0) {
-      cerr << "rbd: flatten error: " << cpp_strerror(-r) << std::endl;
-      return -r;
-    }
-    break;
-
-  case OPT_RENAME:
-    r = do_rename(rbd, io_ctx, imgname, destname);
-    if (r < 0) {
-      cerr << "rbd: rename error: " << cpp_strerror(-r) << std::endl;
-      return -r;
-    }
-    break;
-
-  case OPT_INFO:
-    r = do_show_info(imgname, image, snapname, formatter.get());
-    if (r < 0) {
-      cerr << "rbd: info: " << cpp_strerror(-r) << std::endl;
-      return -r;
-    }
-    break;
-
-  case OPT_RM:
-    r = do_delete(rbd, io_ctx, imgname);
-    if (r < 0) {
-      if (r == -ENOTEMPTY) {
-	cerr << "rbd: image has snapshots - these must be deleted"
-	     << " with 'rbd snap purge' before the image can be removed."
-	     << std::endl;
-      } else if (r == -EBUSY) {
-	cerr << "rbd: error: image still has watchers"
-	     << std::endl
-	     << "This means the image is still open or the client using "
-	     << "it crashed. Try again after closing/unmapping it or "
-	     << "waiting 30s for the crashed client to timeout."
-	     << std::endl;
-      } else {
-	cerr << "rbd: delete error: " << cpp_strerror(-r) << std::endl;
-      }
-      return -r;
-    }
-    break;
-
-  case OPT_RESIZE:
-    librbd::image_info_t info;
-    r = image.stat(info, sizeof(info));
-    if (r < 0) {
-      cerr << "rbd: resize error: " << cpp_strerror(-r) << std::endl;
-      return -r;
-    }
-
-    if (info.size > size && !resize_allow_shrink) {
-      cerr << "rbd: shrinking an image is only allowed with the --allow-shrink flag" << std::endl;
-      return EINVAL;
-    }
-
-    r = do_resize(image, size);
-    if (r < 0) {
-      cerr << "rbd: resize error: " << cpp_strerror(-r) << std::endl;
-      return -r;
-    }
-    break;
-
-  case OPT_SNAP_LIST:
-    r = do_list_snaps(image, formatter.get());
-    if (r < 0) {
-      cerr << "rbd: failed to list snapshots: " << cpp_strerror(-r)
-	   << std::endl;
-      return -r;
-    }
-    break;
-
-  case OPT_SNAP_CREATE:
-    r = do_add_snap(image, snapname);
-    if (r < 0) {
-      cerr << "rbd: failed to create snapshot: " << cpp_strerror(-r)
-	   << std::endl;
-      return -r;
-    }
-    break;
-
-  case OPT_SNAP_ROLLBACK:
-    r = do_rollback_snap(image, snapname);
-    if (r < 0) {
-      cerr << "rbd: rollback failed: " << cpp_strerror(-r) << std::endl;
-      return -r;
-    }
-    break;
-
-  case OPT_SNAP_REMOVE:
-    r = do_remove_snap(image, snapname);
-    if (r < 0) {
-      if (r == -EBUSY) {
-        cerr << "rbd: snapshot '" << snapname << "' is protected from removal."
-             << std::endl;
-      } else {
-        cerr << "rbd: failed to remove snapshot: " << cpp_strerror(-r)
-             << std::endl;
-      }
-      return -r;
-    }
-    break;
-
-  case OPT_SNAP_PURGE:
-    r = do_purge_snaps(image);
-    if (r < 0) {
-      if (r != -EBUSY) {
-        cerr << "rbd: removing snaps failed: " << cpp_strerror(-r) << std::endl;
-      }
-      return -r;
-    }
-    break;
-
-  case OPT_SNAP_PROTECT:
-    r = do_protect_snap(image, snapname);
-    if (r < 0) {
-      cerr << "rbd: protecting snap failed: " << cpp_strerror(-r) << std::endl;
-      return -r;
-    }
-    break;
-
-  case OPT_SNAP_UNPROTECT:
-    r = do_unprotect_snap(image, snapname);
-    if (r < 0) {
-      cerr << "rbd: unprotecting snap failed: " << cpp_strerror(-r)
-	   << std::endl;
-      return -r;
-    }
-    break;
-
-  case OPT_CHILDREN:
-    r = do_list_children(image, formatter.get());
-    if (r < 0) {
-      cerr << "rbd: listing children failed: " << cpp_strerror(-r) << std::endl;
-      return -r;
-    }
-    break;
-
-  case OPT_EXPORT:
-    r = do_export(image, path);
-    if (r < 0) {
-      cerr << "rbd: export error: " << cpp_strerror(-r) << std::endl;
-      return -r;
-    }
-    break;
-
-  case OPT_DIFF:
-    r = do_diff(image, fromsnapname, diff_whole_object, formatter.get());
-    if (r < 0) {
-      cerr << "rbd: diff error: " << cpp_strerror(-r) << std::endl;
-      return -r;
-    }
-    break;
-
-  case OPT_EXPORT_DIFF:
-    r = do_export_diff(image, fromsnapname, snapname, diff_whole_object, path);
-    if (r < 0) {
-      cerr << "rbd: export-diff error: " << cpp_strerror(-r) << std::endl;
-      return -r;
-    }
-    break;
-
-  case OPT_MERGE_DIFF:
-    r = do_merge_diff(first_diff, second_diff, path);
-    if (r < 0) {
-      cerr << "rbd: merge-diff error" << std::endl;
-      return -r;
-    }
-    break;
-
-  case OPT_IMPORT:
-    r = do_import(rbd, dest_io_ctx, destname, &order, path,
-		  format, features, size, stripe_unit, stripe_count);
-    if (r < 0) {
-      cerr << "rbd: import failed: " << cpp_strerror(-r) << std::endl;
-      return -r;
-    }
-    break;
-
-  case OPT_IMPORT_DIFF:
-    r = do_import_diff(image, path);
-    if (r < 0) {
-      cerr << "rbd: import-diff failed: " << cpp_strerror(-r) << std::endl;
-      return -r;
-    }
-    break;
-
-  case OPT_COPY:
-    r = do_copy(image, dest_io_ctx, destname);
-    if (r < 0) {
-      cerr << "rbd: copy failed: " << cpp_strerror(-r) << std::endl;
-      return -r;
-    }
-    break;
-
-  case OPT_WATCH:
-    r = do_watch(io_ctx, image, imgname);
-    if (r < 0) {
-      cerr << "rbd: watch failed: " << cpp_strerror(-r) << std::endl;
-      return -r;
-    }
-    break;
-
-  case OPT_STATUS:
-    r = do_show_status(io_ctx, image, imgname, formatter.get());
-    if (r < 0) {
-      cerr << "rbd: show status failed: " << cpp_strerror(-r) << std::endl;
-      return -r;
-    }
-    break;
-
-  case OPT_MAP:
-    r = do_kernel_map(poolname, imgname, snapname);
-    if (r < 0) {
-      cerr << "rbd: map failed: " << cpp_strerror(-r) << std::endl;
-      return -r;
-    }
-    break;
-
-  case OPT_UNMAP:
-    r = do_kernel_unmap(devpath, poolname, imgname, snapname);
-    if (r < 0) {
-      cerr << "rbd: unmap failed: " << cpp_strerror(-r) << std::endl;
-      return -r;
-    }
-    break;
-
-  case OPT_SHOWMAPPED:
-    r = do_kernel_showmapped(formatter.get());
-    if (r < 0) {
-      cerr << "rbd: showmapped failed: " << cpp_strerror(-r) << std::endl;
-      return -r;
-    }
-    break;
-
-  case OPT_LOCK_LIST:
-    r = do_lock_list(image, formatter.get());
-    if (r < 0) {
-      cerr << "rbd: listing locks failed: " << cpp_strerror(r) << std::endl;
-      return -r;
-    }
-    break;
-
-  case OPT_LOCK_ADD:
-    r = do_lock_add(image, lock_cookie, lock_tag);
-    if (r < 0) {
-      if (r == -EBUSY || r == -EEXIST) {
-	if (lock_tag) {
-	  cerr << "rbd: lock is alrady held by someone else"
-	       << " with a different tag" << std::endl;
-	} else {
-	  cerr << "rbd: lock is already held by someone else" << std::endl;
-	}
-      } else {
-	cerr << "rbd: taking lock failed: " << cpp_strerror(r) << std::endl;
-      }
-      return -r;
-    }
-    break;
-
-  case OPT_LOCK_REMOVE:
-    r = do_lock_remove(image, lock_client, lock_cookie);
-    if (r < 0) {
-      cerr << "rbd: releasing lock failed: " << cpp_strerror(r) << std::endl;
-      return -r;
-    }
-    break;
-
-  case OPT_BENCH_WRITE:
-    r = do_bench_write(image, bench_io_size, bench_io_threads, bench_bytes, bench_pattern);
-    if (r < 0) {
-      cerr << "bench-write failed: " << cpp_strerror(-r) << std::endl;
-      return -r;
-    }
-    break;
-
-  case OPT_METADATA_LIST:
-    r = do_metadata_list(image, formatter.get());
-    if (r < 0) {
-      cerr << "rbd: listing metadata failed: " << cpp_strerror(r) << std::endl;
-      return -r;
-    }
-    break;
-
-  case OPT_METADATA_SET:
-    r = do_metadata_set(image, key, value);
-    if (r < 0) {
-      cerr << "rbd: setting metadata failed: " << cpp_strerror(r) << std::endl;
-      return -r;
-    }
-    break;
-
-  case OPT_METADATA_REMOVE:
-    r = do_metadata_remove(image, key);
-    if (r < 0) {
-      cerr << "rbd: removing metadata failed: " << cpp_strerror(r) << std::endl;
-      return -r;
-    }
-    break;
-
-  case OPT_METADATA_GET:
-    r = do_metadata_get(image, key);
-    if (r < 0) {
-      cerr << "rbd: getting metadata failed: " << cpp_strerror(r) << std::endl;
-      return -r;
-    }
-    break;
-
-  case OPT_FEATURE_DISABLE:
-  case OPT_FEATURE_ENABLE:
-    r = image.update_features(features, opt_cmd == OPT_FEATURE_ENABLE);
-    if (r < 0) {
-      cerr << "rbd: failed to update image features: " << cpp_strerror(r)
-           << std::endl;
-      return -r;
-    }
-    break;
-
-  case OPT_OBJECT_MAP_REBUILD:
-    r = do_object_map_rebuild(image);
-    if (r < 0) {
-      cerr << "rbd: rebuilding object map failed: " << cpp_strerror(r)
-           << std::endl;
-      return -r;
-    }
-    break;
-
-  case OPT_DISK_USAGE:
-    r = do_disk_usage(rbd, io_ctx, imgname, snapname, formatter.get());
-    if (r < 0) {
-      cerr << "du failed: " << cpp_strerror(-r) << std::endl;
-      return -r;
-    }
-    break;
-  }
-
-  r = image.close();
-  if (r < 0) {
-    cerr << "rbd: error while closing image: " << cpp_strerror(-r) << std::endl;
-    return -r;
-  }
-  return 0;
-}
diff --git a/src/rbdmap b/src/rbdmap
old mode 100644
new mode 100755
index b0784cf..09145b2
--- a/src/rbdmap
+++ b/src/rbdmap
@@ -1,2 +1,115 @@
-# RbdDevice		Parameters
-#poolname/imagename	id=client,keyring=/etc/ceph/ceph.client.keyring
+#!/bin/sh
+
+do_map() {
+	if [ ! -f "$RBDMAPFILE" ]; then
+		logger -p "daemon.warning" -t init-rbdmap "No $RBDMAPFILE found."
+		exit 0
+	fi
+
+	# Read $RBDMAPFILE and map any entries that are not mapped yet
+	RET=0
+	while read DEV PARAMS; do
+		case "$DEV" in
+		  ""|\#*)
+			continue
+			;;
+		  */*)
+			;;
+		  *)
+			DEV=rbd/$DEV
+			;;
+		esac
+		logger -p "daemon.debug" -t init-rbdmap "Mapping '${DEV}'"
+		newrbd=""
+		MAP_RV=""
+		OIFS=$IFS
+		IFS=','
+		CMDPARAMS=""
+		for PARAM in $PARAMS; do
+			CMDPARAMS="$CMDPARAMS --$(echo $PARAM | tr '=' ' ')"
+		done
+		IFS=$OIFS
+		if [ -b /dev/rbd/$DEV ]; then
+			MAP_RV="$(readlink -f /dev/rbd/$DEV)"
+		else
+			MAP_RV="$(rbd map $DEV $CMDPARAMS 2>&1)"
+			if [ $? -eq 0 ]; then
+			    newrbd="yes"
+			else
+			    RET=$((${RET}+$?))
+			    logger -p "daemon.warning" -t init-rbdmap "Failed to map '${DEV}"
+			    continue
+			fi
+		fi
+		logger -p "daemon.debug" -t init-rbdmap "Mapped '${DEV}' to '${MAP_RV}'"
+
+		if [ "$newrbd" ]; then
+			## Mount new rbd
+			MNT_RV=""
+			mount --fake /dev/rbd/$DEV >>/dev/null 2>&1 \
+			&& MNT_RV=$(mount -vn /dev/rbd/$DEV 2>&1)
+			[ -n "${MNT_RV}" ] && logger -p "daemon.debug" -t init-rbdmap "Mounted '${MAP_RV}' to '${MNT_RV}'"
+
+			## post-mapping
+			if [ -x "/etc/ceph/rbd.d/${DEV}" ]; then
+			    logger -p "daemon.debug" -t init-rbdmap "Running post-map hook '/etc/ceph/rbd.d/${DEV}'"
+			    /etc/ceph/rbd.d/${DEV} map "/dev/rbd/${DEV}"
+			fi
+		fi
+	done < $RBDMAPFILE
+	exit ${RET}
+
+}
+
+do_unmap() {
+	RET=0
+	## Unmount and unmap all rbd devices
+	if ls /dev/rbd[0-9]* >/dev/null 2>&1; then
+		for DEV in /dev/rbd[0-9]*; do
+			## pre-unmapping
+			for L in $(find /dev/rbd -type l); do
+			    LL="${L##/dev/rbd/}"
+			    if [ "$(readlink -f $L)" = "${DEV}" ] \
+			    && [ -x "/etc/ceph/rbd.d/${LL}" ]; then
+			        logger -p "daemon.debug" -t init-rbdmap "Running pre-unmap hook for '${DEV}': '/etc/ceph/rbd.d/${LL}'"
+			        /etc/ceph/rbd.d/${LL} unmap "$L"
+			        break
+			    fi
+			done
+
+			logger -p "daemon.debug" -t init-rbdmap "Unmapping '${DEV}'"
+			MNT=$(findmnt --mtab --source ${DEV} --noheadings | awk '{print $1}')
+			if [ -n "${MNT}" ]; then
+			    logger -p "daemon.debug" -t init-rbdmap "Unmounting '${MNT}'"
+			    umount "${MNT}" >>/dev/null 2>&1
+			fi
+			if mountpoint -q "${MNT}"; then
+			    ## Un-mounting failed.
+			    logger -p "daemon.warning" -t init-rbdmap "Failed to unmount '${MNT}'"
+			    RET=$((${RET}+1))
+			    continue
+			fi
+			## Un-mapping.
+			rbd unmap $DEV >>/dev/null 2>&1
+			RV=$?
+			if [ $RV -ne 0 ]; then
+			    logger -p "daemon.warning" -t init-rbdmap "Failed to unmap '${DEV}'"
+			    RET=$((${RET}+${RV}))
+			    continue
+			fi
+			logger -p "daemon.debug" -t init-rbdmap "Unmapped '${DEV}'"
+		done
+	fi
+	exit ${RET}
+}
+
+case "$1" in
+  map)
+	do_map
+	;;
+
+  unmap)
+	do_unmap
+	;;
+  *)
+	echo "Usage: rbdmap map | unmap"
+esac
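
For reference, an entry in $RBDMAPFILE (conventionally /etc/ceph/rbdmap; the
pool, image and credential names below are hypothetical) keeps the format of
the old two-line header above, and do_map() rewrites each comma-separated
key=value pair into a long option:

    # RbdDevice		Parameters
    rbd/myimage		id=admin,keyring=/etc/ceph/ceph.client.admin.keyring

    # which do_map() expands into roughly:
    #   rbd map rbd/myimage --id admin --keyring /etc/ceph/ceph.client.admin.keyring
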
diff --git a/src/rgw/rgw_acl_s3.cc b/src/rgw/rgw_acl_s3.cc
index f0ed081..490bf98 100644
--- a/src/rgw/rgw_acl_s3.cc
+++ b/src/rgw/rgw_acl_s3.cc
@@ -568,7 +568,7 @@ bool RGWAccessControlPolicy_S3::compare_group_name(string& id, ACLGroupTypeEnum
 {
   switch (group) {
   case ACL_GROUP_ALL_USERS:
-    return (id.compare(rgw_uri_all_users) == 0);
+    return (id.compare(RGW_USER_ANON_ID) == 0);
   case ACL_GROUP_AUTHENTICATED_USERS:
     return (id.compare(rgw_uri_auth_users) == 0);
   default:
diff --git a/src/rgw/rgw_admin.cc b/src/rgw/rgw_admin.cc
index af82ecb..ca439d0 100644
--- a/src/rgw/rgw_admin.cc
+++ b/src/rgw/rgw_admin.cc
@@ -16,6 +16,7 @@ using namespace std;
 #include "common/ceph_argparse.h"
 #include "common/Formatter.h"
 #include "common/errno.h"
+#include "common/safe_io.h"
 
 #include "global/global_init.h"
 
@@ -100,7 +101,8 @@ void _usage()
   cout << "  metadata rm                remove metadata info\n";
   cout << "  metadata list              list metadata info\n";
   cout << "  mdlog list                 list metadata log\n";
-  cout << "  mdlog trim                 trim metadata log\n";
+  cout << "  mdlog trim                 trim metadata log (use start-date, end-date or\n";
+  cout << "                             start-marker, end-marker)\n";
   cout << "  bilog list                 list bucket index log\n";
   cout << "  bilog trim                 trim bucket index log (use start-marker, end-marker)\n";
   cout << "  datalog list               list data log\n";
@@ -118,7 +120,8 @@ void _usage()
   cout << "   --subuser=<name>          subuser name\n";
   cout << "   --access-key=<key>        S3 access key\n";
   cout << "   --email=<email>\n";
-  cout << "   --secret=<key>            specify secret key\n";
+  cout << "   --secret/--secret-key=<key>\n";
+  cout << "                             specify secret key\n";
   cout << "   --gen-access-key          generate random access key (for S3)\n";
   cout << "   --gen-secret              generate random secret key\n";
   cout << "   --key-type=<type>         key type, options are: swift, s3\n";
@@ -666,7 +669,7 @@ static int read_input(const string& infile, bufferlist& bl)
   do {
     char buf[READ_CHUNK];
 
-    r = read(fd, buf, READ_CHUNK);
+    r = safe_read(fd, buf, READ_CHUNK);
     if (r < 0) {
       err = -errno;
       cerr << "error while reading input" << std::endl;
@@ -1180,7 +1183,7 @@ int main(int argc, char **argv)
       access_key = val;
     } else if (ceph_argparse_witharg(args, i, &val, "--subuser", (char*)NULL)) {
       subuser = val;
-    } else if (ceph_argparse_witharg(args, i, &val, "--secret", (char*)NULL)) {
+    } else if (ceph_argparse_witharg(args, i, &val, "--secret", "--secret-key", (char*)NULL)) {
       secret_key = val;
     } else if (ceph_argparse_witharg(args, i, &val, "-e", "--email", (char*)NULL)) {
       user_email = val;
@@ -1238,9 +1241,17 @@ int main(int argc, char **argv)
     } else if (ceph_argparse_witharg(args, i, &val, "--min-rewrite-stripe-size", (char*)NULL)) {
       min_rewrite_stripe_size = (uint64_t)atoll(val.c_str());
     } else if (ceph_argparse_witharg(args, i, &val, "--max-buckets", (char*)NULL)) {
-      max_buckets = atoi(val.c_str());
+      max_buckets = (int)strict_strtol(val.c_str(), 10, &err);
+      if (!err.empty()) {
+        cerr << "ERROR: failed to parse max buckets: " << err << std::endl;
+        return EINVAL;
+      }
     } else if (ceph_argparse_witharg(args, i, &val, "--max-entries", (char*)NULL)) {
-      max_entries = atoi(val.c_str());
+      max_entries = (int)strict_strtol(val.c_str(), 10, &err);
+      if (!err.empty()) {
+        cerr << "ERROR: failed to parse max entries: " << err << std::endl;
+        return EINVAL;
+      }
     } else if (ceph_argparse_witharg(args, i, &val, "--max-size", (char*)NULL)) {
       max_size = (int64_t)strict_strtoll(val.c_str(), 10, &err);
       if (!err.empty()) {
@@ -1264,13 +1275,29 @@ int main(int argc, char **argv)
     } else if (ceph_argparse_witharg(args, i, &val, "--end-date", "--end-time", (char*)NULL)) {
       end_date = val;
     } else if (ceph_argparse_witharg(args, i, &val, "--num-shards", (char*)NULL)) {
-      num_shards = atoi(val.c_str());
+      num_shards = (int)strict_strtol(val.c_str(), 10, &err);
+      if (!err.empty()) {
+        cerr << "ERROR: failed to parse num shards: " << err << std::endl;
+        return EINVAL;
+      }
     } else if (ceph_argparse_witharg(args, i, &val, "--max-concurrent-ios", (char*)NULL)) {
-      max_concurrent_ios = atoi(val.c_str());
+      max_concurrent_ios = (int)strict_strtol(val.c_str(), 10, &err);
+      if (!err.empty()) {
+        cerr << "ERROR: failed to parse max concurrent ios: " << err << std::endl;
+        return EINVAL;
+      }
     } else if (ceph_argparse_witharg(args, i, &val, "--orphan-stale-secs", (char*)NULL)) {
-      orphan_stale_secs = (uint64_t)atoi(val.c_str());
+      orphan_stale_secs = (uint64_t)strict_strtoll(val.c_str(), 10, &err);
+      if (!err.empty()) {
+        cerr << "ERROR: failed to parse orphan stale secs: " << err << std::endl;
+        return EINVAL;
+      }
     } else if (ceph_argparse_witharg(args, i, &val, "--shard-id", (char*)NULL)) {
-      shard_id = atoi(val.c_str());
+      shard_id = (int)strict_strtol(val.c_str(), 10, &err);
+      if (!err.empty()) {
+        cerr << "ERROR: failed to parse shard id: " << err << std::endl;
+        return EINVAL;
+      }
       specified_shard_id = true;
     } else if (ceph_argparse_witharg(args, i, &val, "--daemon-id", (char*)NULL)) {
       daemon_id = val;
@@ -2747,16 +2774,15 @@ next:
     do {
       list<string> keys;
       ret = store->meta_mgr->list_keys_next(handle, max, keys, &truncated);
-      if (ret < 0) {
+      if (ret < 0 && ret != -ENOENT) {
         cerr << "ERROR: lists_keys_next(): " << cpp_strerror(-ret) << std::endl;
         return -ret;
+      } else if (ret != -ENOENT) {
+        for (list<string>::iterator iter = keys.begin(); iter != keys.end(); ++iter) {
+          formatter->dump_string("key", *iter);
+        }
+        formatter->flush(cout);
       }
-
-      for (list<string>::iterator iter = keys.begin(); iter != keys.end(); ++iter) {
-	formatter->dump_string("key", *iter);
-      }
-      formatter->flush(cout);
-
     } while (truncated);
 
     formatter->close_section();
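
The atoi-to-strict_strtol conversions above share one contract (assuming the
strict_strtol/strict_strtoll helpers from Ceph's common/strtol.h): err is
empty on success and carries a message on failure, so malformed values are
rejected instead of silently truncated. A minimal sketch:

    std::string err;
    int shard_id = (int)strict_strtol("42x", 10, &err);
    if (!err.empty()) {
      // atoi("42x") would quietly yield 42; strict_strtol flags the
      // trailing garbage via err so the caller can return EINVAL.
    }
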
diff --git a/src/rgw/rgw_auth_s3.cc b/src/rgw/rgw_auth_s3.cc
index b9151d2..b372630 100644
--- a/src/rgw/rgw_auth_s3.cc
+++ b/src/rgw/rgw_auth_s3.cc
@@ -127,6 +127,9 @@ void rgw_create_s3_canonical_header(const char *method, const char *content_md5,
 
 int rgw_get_s3_header_digest(const string& auth_hdr, const string& key, string& dest)
 {
+  if (key.empty())
+    return -EINVAL;
+
   char hmac_sha1[CEPH_CRYPTO_HMACSHA1_DIGESTSIZE];
   calc_hmac_sha1(key.c_str(), key.size(), auth_hdr.c_str(), auth_hdr.size(), hmac_sha1);
 
diff --git a/src/rgw/rgw_bucket.cc b/src/rgw/rgw_bucket.cc
index cfa5e20..b857a5f 100644
--- a/src/rgw/rgw_bucket.cc
+++ b/src/rgw/rgw_bucket.cc
@@ -715,16 +715,19 @@ int RGWBucket::check_bad_index_multipart(RGWBucketAdminOpState& op_state,
       string oid = key.name;
 
       int pos = oid.find_last_of('.');
-      if (pos < 0)
-        continue;
-
-      string name = oid.substr(0, pos);
-      string suffix = oid.substr(pos + 1);
-
-      if (suffix.compare("meta") == 0) {
-        meta_objs[name] = true;
+      if (pos < 0) {
+        /* obj has no suffix */
+        all_objs[key] = oid;
       } else {
-        all_objs[key] = name;
+        /* obj has suffix */
+        string name = oid.substr(0, pos);
+        string suffix = oid.substr(pos + 1);
+
+        if (suffix.compare("meta") == 0) {
+          meta_objs[name] = true;
+        } else {
+          all_objs[key] = name;
+        }
       }
     }
 
diff --git a/src/rgw/rgw_civetweb.cc b/src/rgw/rgw_civetweb.cc
index 81e504c..5c075f9 100644
--- a/src/rgw/rgw_civetweb.cc
+++ b/src/rgw/rgw_civetweb.cc
@@ -127,6 +127,13 @@ void RGWMongoose::init_env(CephContext *cct)
   char port_buf[16];
   snprintf(port_buf, sizeof(port_buf), "%d", port);
   env.set("SERVER_PORT", port_buf);
+
+  if (info->is_ssl) {
+    if (port == 0) {
+      strcpy(port_buf,"443");
+    }
+    env.set("SERVER_PORT_SECURE", port_buf);
+  }
 }
 
 int RGWMongoose::send_status(const char *status, const char *status_name)
diff --git a/src/rgw/rgw_common.cc b/src/rgw/rgw_common.cc
index d1e2971..ae045f4 100644
--- a/src/rgw/rgw_common.cc
+++ b/src/rgw/rgw_common.cc
@@ -613,6 +613,7 @@ int RGWHTTPArgs::parse()
           (name.compare("versionId") == 0) ||
           (name.compare("versions") == 0) ||
           (name.compare("versioning") == 0) ||
+          (name.compare("requestPayment") == 0) ||
           (name.compare("torrent") == 0)) {
         sub_resources[name] = val;
       } else if (name[0] == 'r') { // root of all evil
@@ -703,6 +704,30 @@ void RGWHTTPArgs::get_bool(const char *name, bool *val, bool def_val)
   }
 }
 
+bool verify_requester_payer_permission(struct req_state *s)
+{
+  if (!s->bucket_info.requester_pays)
+    return true;
+
+  if (s->bucket_info.owner == s->user.user_id)
+    return true;
+
+  const char *request_payer = s->info.env->get("HTTP_X_AMZ_REQUEST_PAYER");
+  if (!request_payer) {
+    bool exists;
+    request_payer = s->info.args.get("x-amz-request-payer", &exists).c_str();
+    if (!exists) {
+      return false;
+    }
+  }
+
+  if (strcasecmp(request_payer, "requester") == 0) {
+    return true;
+  }
+
+  return false;
+}
+
 bool verify_bucket_permission(struct req_state *s, int perm)
 {
   if (!s->bucket_acl)
@@ -711,6 +736,9 @@ bool verify_bucket_permission(struct req_state *s, int perm)
   if ((perm & (int)s->perm_mask) != perm)
     return false;
 
+  if (!verify_requester_payer_permission(s))
+    return false;
+
   return s->bucket_acl->verify_permission(s->user.user_id, perm, perm);
 }
 
@@ -721,6 +749,9 @@ static inline bool check_deferred_bucket_acl(struct req_state *s, uint8_t deferr
 
 bool verify_object_permission(struct req_state *s, RGWAccessControlPolicy *bucket_acl, RGWAccessControlPolicy *object_acl, int perm)
 {
+  if (!verify_requester_payer_permission(s))
+    return false;
+
   if (check_deferred_bucket_acl(s, RGW_DEFER_TO_BUCKET_ACLS_RECURSE, perm) ||
       check_deferred_bucket_acl(s, RGW_DEFER_TO_BUCKET_ACLS_FULL_CONTROL, RGW_PERM_FULL_CONTROL)) {
     return true;
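
A hedged illustration of verify_requester_payer_permission(): a non-owner
request against a requester-pays bucket is admitted only when it acknowledges
the charge via the header (or the x-amz-request-payer query string) checked
above; bucket and host names here are hypothetical:

    GET /mybucket/myobject HTTP/1.1
    Host: rgw.example.com
    x-amz-request-payer: requester

Per the end_header() change in rgw_rest.cc further below, the matching
response then carries "x-amz-request-charged: requester".
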
diff --git a/src/rgw/rgw_common.h b/src/rgw/rgw_common.h
index 8426057..77bf2e8 100644
--- a/src/rgw/rgw_common.h
+++ b/src/rgw/rgw_common.h
@@ -793,8 +793,10 @@ struct RGWBucketInfo
   // Represents the shard number for blind bucket.
   const static uint32_t NUM_SHARDS_BLIND_BUCKET;
 
+  bool requester_pays;
+
   void encode(bufferlist& bl) const {
-     ENCODE_START(11, 4, bl);
+     ENCODE_START(12, 4, bl);
      ::encode(bucket, bl);
      ::encode(owner, bl);
      ::encode(flags, bl);
@@ -806,6 +808,7 @@ struct RGWBucketInfo
      ::encode(quota, bl);
      ::encode(num_shards, bl);
      ::encode(bucket_index_shard_hash_type, bl);
+     ::encode(requester_pays, bl);
      ENCODE_FINISH(bl);
   }
   void decode(bufferlist::iterator& bl) {
@@ -832,6 +835,8 @@ struct RGWBucketInfo
        ::decode(num_shards, bl);
      if (struct_v >= 11)
        ::decode(bucket_index_shard_hash_type, bl);
+     if (struct_v >= 12)
+       ::decode(requester_pays, bl);
      DECODE_FINISH(bl);
   }
   void dump(Formatter *f) const;
@@ -843,7 +848,7 @@ struct RGWBucketInfo
   int versioning_status() { return flags & (BUCKET_VERSIONED | BUCKET_VERSIONS_SUSPENDED); }
   bool versioning_enabled() { return versioning_status() == BUCKET_VERSIONED; }
 
-  RGWBucketInfo() : flags(0), creation_time(0), has_instance_obj(false), num_shards(0), bucket_index_shard_hash_type(MOD) {}
+  RGWBucketInfo() : flags(0), creation_time(0), has_instance_obj(false), num_shards(0), bucket_index_shard_hash_type(MOD), requester_pays(false) {}
 };
 WRITE_CLASS_ENCODER(RGWBucketInfo)
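
The new field follows the usual versioned-encoding idiom: ENCODE_START bumps
the struct version from 11 to 12 while compat stays at 4, and decode reads the
field only when struct_v >= 12, so blobs written by older daemons (which
simply lack the trailing field) still decode. A reduced sketch of the pattern:

    void encode(bufferlist& bl) const {
      ENCODE_START(12, 4, bl);        // version 12, compatible back to 4
      // ... existing fields, in their original order ...
      ::encode(requester_pays, bl);   // new fields are always appended last
      ENCODE_FINISH(bl);
    }
    void decode(bufferlist::iterator& bl) {
      // ... DECODE_START form as in the full struct above ...
      if (struct_v >= 12)
        ::decode(requester_pays, bl); // absent in pre-12 encodings
      DECODE_FINISH(bl);
    }
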
 
diff --git a/src/rgw/rgw_http_client.cc b/src/rgw/rgw_http_client.cc
index 3a382c0..4fedff9 100644
--- a/src/rgw/rgw_http_client.cc
+++ b/src/rgw/rgw_http_client.cc
@@ -108,7 +108,7 @@ int RGWHTTPClient::process(const char *method, const char *url)
   }
   CURLcode status = curl_easy_perform(curl_handle);
   if (status) {
-    dout(0) << "curl_easy_performed returned error: " << error_buf << dendl;
+    dout(0) << "curl_easy_perform returned error: " << error_buf << dendl;
     ret = -EINVAL;
   }
   curl_easy_cleanup(curl_handle);
diff --git a/src/rgw/rgw_json_enc.cc b/src/rgw/rgw_json_enc.cc
index 061aba9..75b0bed 100644
--- a/src/rgw/rgw_json_enc.cc
+++ b/src/rgw/rgw_json_enc.cc
@@ -549,6 +549,7 @@ void RGWBucketInfo::dump(Formatter *f) const
   encode_json("quota", quota, f);
   encode_json("num_shards", num_shards, f);
   encode_json("bi_shard_hash_type", (uint32_t)bucket_index_shard_hash_type, f);
+  encode_json("requester_pays", requester_pays, f);
 }
 
 void RGWBucketInfo::decode_json(JSONObj *obj) {
@@ -564,6 +565,7 @@ void RGWBucketInfo::decode_json(JSONObj *obj) {
   uint32_t hash_type;
   JSONDecoder::decode_json("bi_shard_hash_type", hash_type, obj);
   bucket_index_shard_hash_type = (uint8_t)hash_type;
+  JSONDecoder::decode_json("requester_pays", requester_pays, obj);
 }
 
 void RGWObjEnt::dump(Formatter *f) const
diff --git a/src/rgw/rgw_main.cc b/src/rgw/rgw_main.cc
index c6ba1c2..fca3ede 100644
--- a/src/rgw/rgw_main.cc
+++ b/src/rgw/rgw_main.cc
@@ -32,6 +32,7 @@
 #include "common/Throttle.h"
 #include "common/QueueRing.h"
 #include "common/safe_io.h"
+#include "include/compat.h"
 #include "include/str_list.h"
 #include "rgw_common.h"
 #include "rgw_rados.h"
@@ -1045,7 +1046,7 @@ int main(int argc, const char **argv)
   check_curl();
 
   if (g_conf->daemonize) {
-    global_init_daemonize(g_ceph_context, 0);
+    global_init_daemonize(g_ceph_context);
   }
   Mutex mutex("main");
   SafeTimer init_timer(g_ceph_context, mutex);
diff --git a/src/rgw/rgw_metadata.cc b/src/rgw/rgw_metadata.cc
index ece9ebf..cf4a0eb 100644
--- a/src/rgw/rgw_metadata.cc
+++ b/src/rgw/rgw_metadata.cc
@@ -275,10 +275,10 @@ void RGWMetadataManager::parse_metadata_key(const string& metadata_key, string&
   int pos = metadata_key.find(':');
   if (pos < 0) {
     type = metadata_key;
+  } else {
+    type = metadata_key.substr(0, pos);
+    entry = metadata_key.substr(pos + 1);
   }
-
-  type = metadata_key.substr(0, pos);
-  entry = metadata_key.substr(pos + 1);
 }
 
 int RGWMetadataManager::find_handler(const string& metadata_key, RGWMetadataHandler **handler, string& entry)
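
The rewritten parse_metadata_key() no longer clobbers type and entry when the
separator is missing; illustrative results (the entry value is hypothetical):

    // parse_metadata_key("user:foo", type, entry) -> type "user", entry "foo"
    // parse_metadata_key("user",     type, entry) -> type "user", entry untouched
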
diff --git a/src/rgw/rgw_object_expirer.cc b/src/rgw/rgw_object_expirer.cc
index 63f4e96..dcbfbc1 100644
--- a/src/rgw/rgw_object_expirer.cc
+++ b/src/rgw/rgw_object_expirer.cc
@@ -73,7 +73,7 @@ int main(const int argc, const char **argv)
   }
 
   if (g_conf->daemonize) {
-    global_init_daemonize(g_ceph_context, 0);
+    global_init_daemonize(g_ceph_context);
   }
 
   common_init_finish(g_ceph_context);
diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc
index fb4b6bb..993659a 100644
--- a/src/rgw/rgw_op.cc
+++ b/src/rgw/rgw_op.cc
@@ -17,6 +17,7 @@
 #include "rgw_rest.h"
 #include "rgw_acl.h"
 #include "rgw_acl_s3.h"
+#include "rgw_acl_swift.h"
 #include "rgw_user.h"
 #include "rgw_bucket.h"
 #include "rgw_log.h"
@@ -356,7 +357,13 @@ static int rgw_build_policies(RGWRados *store, struct req_state *s, bool only_bu
     }
   }
 
-  s->bucket_acl = new RGWAccessControlPolicy(s->cct);
+  if(s->dialect.compare("s3") == 0) {
+    s->bucket_acl = new RGWAccessControlPolicy_S3(s->cct);
+  } else if(s->dialect.compare("swift")  == 0) {
+    s->bucket_acl = new RGWAccessControlPolicy_SWIFT(s->cct);
+  } else {
+    s->bucket_acl = new RGWAccessControlPolicy(s->cct);
+  }
 
   if (s->copy_source) { /* check if copy source is within the current domain */
     const char *src = s->copy_source;
@@ -984,7 +991,7 @@ void RGWGetObj::execute()
     goto done_err;
 
   attr_iter = attrs.find(RGW_ATTR_USER_MANIFEST);
-  if (attr_iter != attrs.end()) {
+  if (attr_iter != attrs.end() && !skip_manifest) {
     ret = handle_user_manifest(attr_iter->second.c_str());
     if (ret < 0) {
       ldout(s->cct, 0) << "ERROR: failed to handle user manifest ret=" << ret << dendl;
@@ -1017,8 +1024,11 @@ void RGWGetObj::execute()
     goto done_err;
   }
 
-done_err:
   send_response_data(bl, 0, 0);
+  return;
+
+done_err:
+  send_response_data_error();
 }
 
 int RGWGetObj::init_common()
@@ -1138,7 +1148,7 @@ void RGWStatAccount::execute()
   do {
     RGWUserBuckets buckets;
 
-    ret = rgw_read_user_buckets(store, s->user.user_id, buckets, marker, max_buckets, true);
+    ret = rgw_read_user_buckets(store, s->user.user_id, buckets, marker, max_buckets, false);
     if (ret < 0) {
       /* hmm.. something wrong here.. the user was authenticated, so it
          should exist */
@@ -2428,7 +2438,16 @@ void RGWDeleteObj::execute()
 {
   ret = -EINVAL;
   rgw_obj obj(s->bucket, s->object);
+  map<string, bufferlist> orig_attrs;
+
   if (!s->object.empty()) {
+    if (need_object_expiration()) {
+      /* check if obj exists, read orig attrs */
+      ret = get_obj_attrs(store, s, obj, orig_attrs);
+      if (ret < 0) {
+        return;
+      }
+    }
     RGWObjectCtx *obj_ctx = static_cast<RGWObjectCtx *>(s->obj_ctx);
 
     obj_ctx->set_atomic(obj);
@@ -2449,6 +2468,13 @@ void RGWDeleteObj::execute()
       delete_marker = del_op.result.delete_marker;
       version_id = del_op.result.version_id;
     }
+
+    /* Check whether the object has expired. Swift API documentation
+     * stands that we should return 404 Not Found in such case. */
+    if (need_object_expiration() && object_is_expired(orig_attrs)) {
+      ret = -ENOENT;
+      return;
+    }
   }
 }
 
@@ -2796,8 +2822,17 @@ void RGWPutACLs::execute()
   new_policy.encode(bl);
   obj = rgw_obj(s->bucket, s->object);
   map<string, bufferlist> attrs;
-  attrs[RGW_ATTR_ACL] = bl;
+
   store->set_atomic(s->obj_ctx, obj);
+
+  if (!s->object.empty()) {
+    ret = get_obj_attrs(store, s, obj, attrs);
+    if (ret < 0)
+      return;
+  }
+
+  attrs[RGW_ATTR_ACL] = bl;
+
   if (!s->object.empty()) {
     ret = store->set_attrs(s->obj_ctx, obj, attrs, NULL, ptracker);
   } else {
@@ -2968,6 +3003,49 @@ void RGWOptionsCORS::execute()
   return;
 }
 
+int RGWGetRequestPayment::verify_permission()
+{
+  return 0;
+}
+
+void RGWGetRequestPayment::pre_exec()
+{
+  rgw_bucket_object_pre_exec(s);
+}
+
+void RGWGetRequestPayment::execute()
+{
+  requester_pays = s->bucket_info.requester_pays;
+}
+
+int RGWSetRequestPayment::verify_permission()
+{
+  if (s->user.user_id.compare(s->bucket_owner.get_id()) != 0)
+    return -EACCES;
+
+  return 0;
+}
+
+void RGWSetRequestPayment::pre_exec()
+{
+  rgw_bucket_object_pre_exec(s);
+}
+
+void RGWSetRequestPayment::execute()
+{
+  ret = get_params();
+
+  if (ret < 0)
+    return;
+
+  s->bucket_info.requester_pays = requester_pays;
+  ret = store->put_bucket_instance_info(s->bucket_info, false, 0, &s->bucket_attrs);
+  if (ret < 0) {
+    ldout(s->cct, 0) << "NOTICE: put_bucket_info on bucket=" << s->bucket.name << " returned err=" << ret << dendl;
+    return;
+  }
+}
+
 int RGWInitMultipart::verify_permission()
 {
   if (!verify_bucket_permission(s, RGW_PERM_WRITE))
diff --git a/src/rgw/rgw_op.h b/src/rgw/rgw_op.h
index 7a196a3..ee6cc2a 100644
--- a/src/rgw/rgw_op.h
+++ b/src/rgw/rgw_op.h
@@ -56,6 +56,8 @@ enum RGWOpType {
   RGW_OP_PUT_CORS,
   RGW_OP_DELETE_CORS,
   RGW_OP_OPTIONS_CORS,
+  RGW_OP_GET_REQUEST_PAYMENT,
+  RGW_OP_SET_REQUEST_PAYMENT,
   RGW_OP_INIT_MULTIPART,
   RGW_OP_COMPLETE_MULTIPART,
   RGW_OP_ABORT_MULTIPART,
@@ -135,6 +137,7 @@ protected:
   bool get_data;
   bool partial_content;
   bool range_parsed;
+  bool skip_manifest;
   rgw_obj obj;
   utime_t gc_invalidate_time;
 
@@ -158,6 +161,7 @@ public:
     get_data = false;
     partial_content = false;
     range_parsed = false;
+    skip_manifest = false;
     ret = 0;
  }
 
@@ -175,6 +179,7 @@ public:
   int get_data_cb(bufferlist& bl, off_t ofs, off_t len);
 
   virtual int get_params() = 0;
+  virtual int send_response_data_error() = 0;
   virtual int send_response_data(bufferlist& bl, off_t ofs, off_t len) = 0;
 
   virtual const string name() { return "get_obj"; }
@@ -623,6 +628,7 @@ public:
   virtual const string name() { return "delete_obj"; }
   virtual RGWOpType get_type() { return RGW_OP_DELETE_OBJ; }
   virtual uint32_t op_mask() { return RGW_OP_TYPE_DELETE; }
+  virtual bool need_object_expiration() { return false; }
 };
 
 class RGWCopyObj : public RGWOp {
@@ -829,6 +835,42 @@ public:
   virtual uint32_t op_mask() { return RGW_OP_TYPE_READ; }
 };
 
+class RGWGetRequestPayment : public RGWOp {
+protected:
+  bool requester_pays;
+
+public:
+  RGWGetRequestPayment() : requester_pays(false) {}
+
+  int verify_permission();
+  void pre_exec();
+  void execute();
+
+  virtual void send_response() = 0;
+  virtual const string name() { return "get_request_payment"; }
+  virtual RGWOpType get_type() { return RGW_OP_GET_REQUEST_PAYMENT; }
+  virtual uint32_t op_mask() { return RGW_OP_TYPE_READ; }
+};
+
+class RGWSetRequestPayment : public RGWOp {
+protected:
+  bool requester_pays;
+  int ret;
+public:
+  RGWSetRequestPayment() : requester_pays(false), ret(0) {}
+
+  int verify_permission();
+  void pre_exec();
+  void execute();
+
+  virtual int get_params() { return 0; }
+
+  virtual void send_response() = 0;
+  virtual const string name() { return "set_request_payment"; }
+  virtual RGWOpType get_type() { return RGW_OP_SET_REQUEST_PAYMENT; }
+  virtual uint32_t op_mask() { return RGW_OP_TYPE_WRITE; }
+};
+
 class RGWInitMultipart : public RGWOp {
 protected:
   int ret;
diff --git a/src/rgw/rgw_quota.cc b/src/rgw/rgw_quota.cc
index 85536bf..d29cdd2 100644
--- a/src/rgw/rgw_quota.cc
+++ b/src/rgw/rgw_quota.cc
@@ -664,9 +664,14 @@ class RGWQuotaHandlerImpl : public RGWQuotaHandler {
   RGWRados *store;
   RGWBucketStatsCache bucket_stats_cache;
   RGWUserStatsCache user_stats_cache;
+  RGWQuotaInfo def_bucket_quota;
+  RGWQuotaInfo def_user_quota;
 
   int check_quota(const char *entity, RGWQuotaInfo& quota, RGWStorageStats& stats,
                   uint64_t num_objs, uint64_t size_kb) {
+    if (!quota.enabled)
+      return 0;
+
     ldout(store->ctx(), 20) << entity << " quota: max_objects=" << quota.max_objects
                             << " max_size_kb=" << quota.max_size_kb << dendl;
 
@@ -687,12 +692,29 @@ class RGWQuotaHandlerImpl : public RGWQuotaHandler {
     return 0;
   }
 public:
-  RGWQuotaHandlerImpl(RGWRados *_store, bool quota_threads) : store(_store), bucket_stats_cache(_store), user_stats_cache(_store, quota_threads) {}
+  RGWQuotaHandlerImpl(RGWRados *_store, bool quota_threads) : store(_store), bucket_stats_cache(_store), user_stats_cache(_store, quota_threads) {
+    if (store->ctx()->_conf->rgw_bucket_default_quota_max_objects >= 0) {
+      def_bucket_quota.max_objects = store->ctx()->_conf->rgw_bucket_default_quota_max_objects;
+      def_bucket_quota.enabled = true;
+    }
+    if (store->ctx()->_conf->rgw_bucket_default_quota_max_size >= 0) {
+      def_bucket_quota.max_size_kb = store->ctx()->_conf->rgw_bucket_default_quota_max_size;
+      def_bucket_quota.enabled = true;
+    }
+    if (store->ctx()->_conf->rgw_user_default_quota_max_objects >= 0) {
+      def_user_quota.max_objects = store->ctx()->_conf->rgw_user_default_quota_max_objects;
+      def_user_quota.enabled = true;
+    }
+    if (store->ctx()->_conf->rgw_user_default_quota_max_size >= 0) {
+      def_user_quota.max_size_kb = store->ctx()->_conf->rgw_user_default_quota_max_size;
+      def_user_quota.enabled = true;
+    }
+  }
   virtual int check_quota(const string& user, rgw_bucket& bucket,
                           RGWQuotaInfo& user_quota, RGWQuotaInfo& bucket_quota,
 			  uint64_t num_objs, uint64_t size) {
 
-    if (!bucket_quota.enabled && !user_quota.enabled)
+    if (!bucket_quota.enabled && !user_quota.enabled && !def_bucket_quota.enabled && !def_user_quota.enabled)
       return 0;
 
     uint64_t size_kb = rgw_rounded_objsize_kb(size);
@@ -715,16 +737,28 @@ public:
         return ret;
     }
 
-    if (user_quota.enabled) {
+    if (def_bucket_quota.enabled) {
+      ret = check_quota("def_bucket", def_bucket_quota, bucket_stats, num_objs, size_kb);
+      if (ret < 0)
+        return ret;
+    }
+
+    if (user_quota.enabled || def_user_quota.enabled) {
       RGWStorageStats user_stats;
 
       ret = user_stats_cache.get_stats(user, bucket, user_stats, user_quota);
       if (ret < 0)
         return ret;
 
-      ret = check_quota("user", user_quota, user_stats, num_objs, size_kb);
-      if (ret < 0)
-        return ret;
+      if (user_quota.enabled) {
+	ret = check_quota("user", user_quota, user_stats, num_objs, size_kb);
+	if (ret < 0)
+	  return ret;
+      } else if (def_user_quota.enabled) {
+        ret = check_quota("def_user", def_user_quota, user_stats, num_objs, size_kb);
+        if (ret < 0)
+          return ret;
+      }
     }
 
     return 0;
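
The new default-quota knobs only engage when set to a non-negative value, so
the shipped defaults leave behaviour unchanged. A hypothetical ceph.conf
fragment (the size options feed max_size_kb, so they are in KB):

    # hypothetical section and values; option names mirror the _conf fields above
    [client.radosgw.gateway]
    rgw bucket default quota max objects = 1000000
    rgw bucket default quota max size = 10485760
    # a negative value (the default) keeps that quota disabled
    rgw user default quota max objects = -1
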
diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc
index 45fde7a..6e93bee 100644
--- a/src/rgw/rgw_rados.cc
+++ b/src/rgw/rgw_rados.cc
@@ -1305,6 +1305,7 @@ class RGWWatcher : public librados::WatchCtx2 {
         watcher->reinit();
       }
   };
+  shared_ptr<C_ReinitWatch> reinit_watch;
 public:
   RGWWatcher(RGWRados *r, int i, const string& o) : rados(r), index(i), oid(o), watch_handle(0) {}
   void handle_notify(uint64_t notify_id,
@@ -1325,7 +1326,8 @@ public:
     lderr(rados->ctx()) << "RGWWatcher::handle_error cookie " << cookie
 			<< " err " << cpp_strerror(err) << dendl;
     rados->remove_watcher(index);
-    rados->schedule_context(new C_ReinitWatch(this));
+    reinit_watch.reset(new C_ReinitWatch(this));
+    rados->schedule_context(reinit_watch.get());
   }
 
   void reinit() {
@@ -1401,7 +1403,27 @@ int RGWRados::get_required_alignment(rgw_bucket& bucket, uint64_t *alignment)
     return r;
   }
 
-  *alignment = ioctx.pool_required_alignment();
+  bool requires;
+  r = ioctx.pool_requires_alignment2(&requires);
+  if (r < 0) {
+    ldout(cct, 0) << "ERROR: ioctx.pool_requires_alignment2() returned " 
+      << r << dendl;
+    return r;
+  }
+
+  if (!requires) {
+    *alignment = 0;
+    return 0;
+  }
+
+  uint64_t align;
+  r = ioctx.pool_required_alignment2(&align);
+  if (r < 0) {
+    ldout(cct, 0) << "ERROR: ioctx.pool_required_alignment2() returned " 
+      << r << dendl;
+    return r;
+  }
+  *alignment = align;
   return 0;
 }
 
@@ -2831,6 +2853,7 @@ int RGWRados::create_bucket(RGWUserInfo& owner, rgw_bucket& bucket,
     info.placement_rule = selected_placement_rule;
     info.num_shards = bucket_index_max_shards;
     info.bucket_index_shard_hash_type = RGWBucketInfo::MOD;
+    info.requester_pays = false;
     if (!creation_time)
       time(&info.creation_time);
     else
@@ -7901,7 +7924,7 @@ int RGWRados::list_raw_objects(rgw_bucket& pool, const string& prefix_filter,
   if (!ctx.initialized) {
     int r = pool_iterate_begin(pool, ctx.iter_ctx);
     if (r < 0) {
-      lderr(cct) << "failed to list objects pool_iterate_begin() returned r=" << r << dendl;
+      ldout(cct, 10) << "failed to list objects pool_iterate_begin() returned r=" << r << dendl;
       return r;
     }
     ctx.initialized = true;
@@ -7910,7 +7933,7 @@ int RGWRados::list_raw_objects(rgw_bucket& pool, const string& prefix_filter,
   vector<RGWObjEnt> objs;
   int r = pool_iterate(ctx.iter_ctx, max, objs, is_truncated, &filter);
   if (r < 0) {
-    lderr(cct) << "failed to list objects pool_iterate returned r=" << r << dendl;
+    ldout(cct, 10) << "failed to list objects pool_iterate returned r=" << r << dendl;
     return r;
   }
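
A condensed sketch of the two-step librados query the code above now performs
(the *2 variants return errors rather than asserting internally):

    bool requires = false;
    uint64_t align = 0;
    int r = ioctx.pool_requires_alignment2(&requires);
    if (r == 0 && requires)
      r = ioctx.pool_required_alignment2(&align);
    // align stays 0 for pools that need no alignment (e.g. replicated pools)
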
 
diff --git a/src/rgw/rgw_rest.cc b/src/rgw/rgw_rest.cc
index e2b1568..a73f12e 100644
--- a/src/rgw/rgw_rest.cc
+++ b/src/rgw/rgw_rest.cc
@@ -32,15 +32,13 @@ struct rgw_http_attr {
 /*
  * mapping between rgw object attrs and output http fields
  */
-static struct rgw_http_attr rgw_to_http_attr_list[] = {
-  { RGW_ATTR_CONTENT_TYPE, "Content-Type"},
-  { RGW_ATTR_CONTENT_LANG, "Content-Language"},
-  { RGW_ATTR_EXPIRES, "Expires"},
-  { RGW_ATTR_CACHE_CONTROL, "Cache-Control"},
-  { RGW_ATTR_CONTENT_DISP, "Content-Disposition"},
-  { RGW_ATTR_CONTENT_ENC, "Content-Encoding"},
-  { RGW_ATTR_USER_MANIFEST, "X-Object-Manifest"},
-  { NULL, NULL},
+static const struct rgw_http_attr base_rgw_to_http_attrs[] = {
+  { RGW_ATTR_CONTENT_LANG,      "Content-Language" },
+  { RGW_ATTR_EXPIRES,           "Expires" },
+  { RGW_ATTR_CACHE_CONTROL,     "Cache-Control" },
+  { RGW_ATTR_CONTENT_DISP,      "Content-Disposition" },
+  { RGW_ATTR_CONTENT_ENC,       "Content-Encoding" },
+  { RGW_ATTR_USER_MANIFEST,     "X-Object-Manifest" },
 };
 
 
@@ -52,14 +50,13 @@ struct generic_attr {
 /*
  * mapping between http env fields and rgw object attrs
  */
-struct generic_attr generic_attrs[] = {
-  { "CONTENT_TYPE", RGW_ATTR_CONTENT_TYPE },
-  { "HTTP_CONTENT_LANGUAGE", RGW_ATTR_CONTENT_LANG },
-  { "HTTP_EXPIRES", RGW_ATTR_EXPIRES },
-  { "HTTP_CACHE_CONTROL", RGW_ATTR_CACHE_CONTROL },
+static const struct generic_attr generic_attrs[] = {
+  { "CONTENT_TYPE",             RGW_ATTR_CONTENT_TYPE },
+  { "HTTP_CONTENT_LANGUAGE",    RGW_ATTR_CONTENT_LANG },
+  { "HTTP_EXPIRES",             RGW_ATTR_EXPIRES },
+  { "HTTP_CACHE_CONTROL",       RGW_ATTR_CACHE_CONTROL },
   { "HTTP_CONTENT_DISPOSITION", RGW_ATTR_CONTENT_DISP },
-  { "HTTP_CONTENT_ENCODING", RGW_ATTR_CONTENT_ENC },
-  { NULL, NULL },
+  { "HTTP_CONTENT_ENCODING",    RGW_ATTR_CONTENT_ENC },
 };
 
 map<string, string> rgw_to_http_attrs;
@@ -147,14 +144,16 @@ string camelcase_dash_http_attr(const string& orig)
   for (size_t i = 0; i < orig.size(); ++i, ++s) {
     switch (*s) {
       case '_':
+      case '-':
         buf[i] = '-';
         last_sep = true;
         break;
       default:
-        if (last_sep)
+        if (last_sep) {
           buf[i] = toupper(*s);
-        else
+        } else {
           buf[i] = tolower(*s);
+        }
         last_sep = false;
     }
   }
@@ -166,12 +165,12 @@ static set<string> hostnames_set;
 
 void rgw_rest_init(CephContext *cct, RGWRegion& region)
 {
-  for (struct rgw_http_attr *attr = rgw_to_http_attr_list; attr->rgw_attr; attr++) {
-    rgw_to_http_attrs[attr->rgw_attr] = attr->http_attr;
+  for (const auto& rgw2http : base_rgw_to_http_attrs)  {
+    rgw_to_http_attrs[rgw2http.rgw_attr] = rgw2http.http_attr;
   }
 
-  for (struct generic_attr *gen_attr = generic_attrs; gen_attr->http_header; gen_attr++) {
-    generic_attrs_map[gen_attr->http_header] = gen_attr->rgw_attr;
+  for (const auto& http2rgw : generic_attrs) {
+    generic_attrs_map[http2rgw.http_header] = http2rgw.rgw_attr;
   }
 
   list<string> extended_http_attrs;
@@ -520,6 +519,12 @@ void end_header(struct req_state *s, RGWOp *op, const char *content_type, const
 
   dump_trans_id(s);
 
+  if ((!s->err.is_err()) &&
+      (s->bucket_info.owner != s->user.user_id) &&
+      (s->bucket_info.requester_pays)) {
+    s->cio->print("x-amz-request-charged: requester\r\n");
+  }
+
   if (op) {
     dump_access_control(s, op);
   }
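
camelcase_dash_http_attr() now treats '-' like '_' as a word separator, which
matters for extended attribute names fed through rgw_rest_init(). Illustrative
mappings (assuming the first character is treated as word-initial, as the
existing output shows):

    // camelcase_dash_http_attr("content_type") == "Content-Type"
    // camelcase_dash_http_attr("x-foo_bar")    == "X-Foo-Bar"  (new: '-' separates too)
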
diff --git a/src/rgw/rgw_rest_s3.cc b/src/rgw/rgw_rest_s3.cc
index 487f1b1..5831869 100644
--- a/src/rgw/rgw_rest_s3.cc
+++ b/src/rgw/rgw_rest_s3.cc
@@ -72,6 +72,12 @@ static struct response_attr_param resp_attr_params[] = {
   {NULL, NULL},
 };
 
+int RGWGetObj_ObjStore_S3::send_response_data_error()
+{
+  bufferlist bl;
+  return send_response_data(bl, 0, 0);
+}
+
 int RGWGetObj_ObjStore_S3::send_response_data(bufferlist& bl, off_t bl_ofs, off_t bl_len)
 {
   const char *content_type = NULL;
@@ -138,22 +144,22 @@ int RGWGetObj_ObjStore_S3::send_response_data(bufferlist& bl, off_t bl_ofs, off_
 
     for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
       const char *name = iter->first.c_str();
+
       map<string, string>::iterator aiter = rgw_to_http_attrs.find(name);
       if (aiter != rgw_to_http_attrs.end()) {
-	if (response_attrs.count(aiter->second) > 0) // was already overridden by a response param
-	  continue;
-
-	if (aiter->first.compare(RGW_ATTR_CONTENT_TYPE) == 0) { // special handling for content_type
-	  if (!content_type)
-	    content_type = iter->second.c_str();
-	  continue;
+        if (response_attrs.count(aiter->second) == 0) {
+          /* Was not already overridden by a response param. */
+          response_attrs[aiter->second] = iter->second.c_str();
         }
-	response_attrs[aiter->second] = iter->second.c_str();
-      } else {
-        if (strncmp(name, RGW_ATTR_META_PREFIX, sizeof(RGW_ATTR_META_PREFIX)-1) == 0) {
-          name += sizeof(RGW_ATTR_PREFIX) - 1;
-          s->cio->print("%s: %s\r\n", name, iter->second.c_str());
+      } else if (iter->first.compare(RGW_ATTR_CONTENT_TYPE) == 0) {
+        /* Special handling for content_type. */
+        if (!content_type) {
+          content_type = iter->second.c_str();
         }
+      } else if (strncmp(name, RGW_ATTR_META_PREFIX, sizeof(RGW_ATTR_META_PREFIX)-1) == 0) {
+        /* User custom metadata. */
+        name += sizeof(RGW_ATTR_PREFIX) - 1;
+        s->cio->print("%s: %s\r\n", name, iter->second.c_str());
       }
     }
   }
@@ -1700,6 +1706,93 @@ void RGWOptionsCORS_ObjStore_S3::send_response()
   end_header(s, NULL);
 }
 
+void RGWGetRequestPayment_ObjStore_S3::send_response()
+{
+  dump_errno(s);
+  end_header(s, this, "application/xml");
+  dump_start(s);
+
+  s->formatter->open_object_section_in_ns("RequestPaymentConfiguration",
+					  "http://s3.amazonaws.com/doc/2006-03-01/");
+  const char *payer = requester_pays ? "Requester" : "BucketOwner";
+  s->formatter->dump_string("Payer", payer);
+  s->formatter->close_section();
+  rgw_flush_formatter_and_reset(s, s->formatter);
+}
+
+class RGWSetRequestPaymentParser : public RGWXMLParser
+{
+  XMLObj *alloc_obj(const char *el) {
+    return new XMLObj;
+  }
+
+public:
+  RGWSetRequestPaymentParser() {}
+  ~RGWSetRequestPaymentParser() {}
+
+  int get_request_payment_payer(bool *requester_pays) {
+    XMLObj *config = find_first("RequestPaymentConfiguration");
+    if (!config)
+      return -EINVAL;
+
+    *requester_pays = false;
+
+    XMLObj *field = config->find_first("Payer");
+    if (!field)
+      return 0;
+
+    string& s = field->get_data();
+
+    if (stringcasecmp(s, "Requester") == 0) {
+      *requester_pays = true;
+    } else if (stringcasecmp(s, "BucketOwner") != 0) {
+      return -EINVAL;
+    }
+
+    return 0;
+  }
+};
+
+int RGWSetRequestPayment_ObjStore_S3::get_params()
+{
+#define GET_REQUEST_PAYMENT_BUF_MAX (128 * 1024)
+  char *data;
+  int len = 0;
+  int r = rgw_rest_read_all_input(s, &data, &len, GET_REQUEST_PAYMENT_BUF_MAX);
+  if (r < 0) {
+    return r;
+  }
+
+  RGWSetRequestPaymentParser parser;
+
+  if (!parser.init()) {
+    ldout(s->cct, 0) << "ERROR: failed to initialize parser" << dendl;
+    r = -EIO;
+    goto done;
+  }
+
+  if (!parser.parse(data, len, 1)) {
+    ldout(s->cct, 10) << "failed to parse data: " << data << dendl;
+    r = -EINVAL;
+    goto done;
+  }
+
+  r = parser.get_request_payment_payer(&requester_pays);
+
+done:
+  free(data);
+
+  return r;
+}
+
+void RGWSetRequestPayment_ObjStore_S3::send_response()
+{
+  if (ret)
+    set_req_state_err(s, ret);
+  dump_errno(s);
+  end_header(s);
+}
+
 int RGWInitMultipart_ObjStore_S3::get_params()
 {
   RGWAccessControlPolicy_S3 s3policy(s->cct);
@@ -1976,6 +2069,8 @@ RGWOp *RGWHandler_ObjStore_Bucket_S3::op_get()
     return new RGWGetACLs_ObjStore_S3;
   } else if (is_cors_op()) {
     return new RGWGetCORS_ObjStore_S3;
+  } else if (is_request_payment_op()) {
+    return new RGWGetRequestPayment_ObjStore_S3;
   } else if (s->info.args.exists("uploads")) {
     return new RGWListBucketMultiparts_ObjStore_S3;
   }
@@ -2002,7 +2097,9 @@ RGWOp *RGWHandler_ObjStore_Bucket_S3::op_put()
     return new RGWPutACLs_ObjStore_S3;
   } else if (is_cors_op()) {
     return new RGWPutCORS_ObjStore_S3;
-  } 
+  } else if (is_request_payment_op()) {
+    return new RGWSetRequestPayment_ObjStore_S3;
+  }
   return new RGWCreateBucket_ObjStore_S3;
 }
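
The requestPayment subresource wired up above speaks the standard S3 payload;
a hedged example of enabling requester-pays on a bucket (bucket name is
hypothetical):

    PUT /mybucket?requestPayment HTTP/1.1

    <RequestPaymentConfiguration xmlns="http://s3.amazonaws.com/doc/2006-03-01/">
      <Payer>Requester</Payer>
    </RequestPaymentConfiguration>

RGWSetRequestPaymentParser maps the Payer values Requester/BucketOwner onto
requester_pays true/false and rejects anything else with -EINVAL.
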
 
diff --git a/src/rgw/rgw_rest_s3.h b/src/rgw/rgw_rest_s3.h
index 5db03da..1c2d529 100644
--- a/src/rgw/rgw_rest_s3.h
+++ b/src/rgw/rgw_rest_s3.h
@@ -21,6 +21,7 @@ public:
   RGWGetObj_ObjStore_S3() {}
   ~RGWGetObj_ObjStore_S3() {}
 
+  int send_response_data_error();
   int send_response_data(bufferlist& bl, off_t ofs, off_t len);
 };
 
@@ -234,6 +235,23 @@ public:
   void send_response();
 };
 
+class RGWGetRequestPayment_ObjStore_S3 : public RGWGetRequestPayment {
+public:
+  RGWGetRequestPayment_ObjStore_S3() {}
+  ~RGWGetRequestPayment_ObjStore_S3() {}
+
+  void send_response();
+};
+
+class RGWSetRequestPayment_ObjStore_S3 : public RGWSetRequestPayment {
+public:
+  RGWSetRequestPayment_ObjStore_S3() {}
+  ~RGWSetRequestPayment_ObjStore_S3() {}
+
+  int get_params();
+  void send_response();
+};
+
 class RGWInitMultipart_ObjStore_S3 : public RGWInitMultipart_ObjStore {
 public:
   RGWInitMultipart_ObjStore_S3() {}
@@ -400,6 +418,9 @@ protected:
   bool is_obj_update_op() {
     return is_acl_op() || is_cors_op();
   }
+  bool is_request_payment_op() {
+    return s->info.args.exists("requestPayment");
+  }
   RGWOp *get_obj_op(bool get_data);
 
   RGWOp *op_get();
diff --git a/src/rgw/rgw_rest_swift.cc b/src/rgw/rgw_rest_swift.cc
index cfb447a..35aa146 100644
--- a/src/rgw/rgw_rest_swift.cc
+++ b/src/rgw/rgw_rest_swift.cc
@@ -84,15 +84,17 @@ static void dump_account_metadata(struct req_state * const s,
     }
   }
 
-  /* Dump user-defined metadata items */
+  /* Dump user-defined metadata items and generic attrs. */
   const size_t PREFIX_LEN = sizeof(RGW_ATTR_META_PREFIX) - 1;
   map<string, bufferlist>::iterator iter;
-  for (iter = attrs.lower_bound(RGW_ATTR_META_PREFIX); iter != attrs.end(); ++iter) {
+  for (iter = attrs.lower_bound(RGW_ATTR_PREFIX); iter != attrs.end(); ++iter) {
     const char *name = iter->first.c_str();
-    if (strncmp(name, RGW_ATTR_META_PREFIX, PREFIX_LEN) == 0) {
+    map<string, string>::const_iterator geniter = rgw_to_http_attrs.find(name);
+
+    if (geniter != rgw_to_http_attrs.end()) {
+      s->cio->print("%s: %s\r\n", geniter->second.c_str(), iter->second.c_str());
+    } else if (strncmp(name, RGW_ATTR_META_PREFIX, PREFIX_LEN) == 0) {
       s->cio->print("X-Account-Meta-%s: %s\r\n", name + PREFIX_LEN, iter->second.c_str());
-    } else {
-      break;
     }
   }
 }
@@ -338,15 +340,20 @@ static void dump_container_metadata(struct req_state *s, RGWBucketEnt& bucket)
     if (!s->bucket_info.placement_rule.empty()) {
       s->cio->print("X-Storage-Policy: %s\r\n", s->bucket_info.placement_rule.c_str());
     }
-    // Dump user-defined metadata items
+
+    /* Dump user-defined metadata items and generic attrs. */
     const size_t PREFIX_LEN = sizeof(RGW_ATTR_META_PREFIX) - 1;
     map<string, bufferlist>::iterator iter;
-    for (iter = s->bucket_attrs.lower_bound(RGW_ATTR_META_PREFIX); iter != s->bucket_attrs.end(); ++iter) {
+    for (iter = s->bucket_attrs.lower_bound(RGW_ATTR_PREFIX);
+         iter != s->bucket_attrs.end();
+         ++iter) {
       const char *name = iter->first.c_str();
-      if (strncmp(name, RGW_ATTR_META_PREFIX, PREFIX_LEN) == 0) {
+      map<string, string>::const_iterator geniter = rgw_to_http_attrs.find(name);
+
+      if (geniter != rgw_to_http_attrs.end()) {
+        s->cio->print("%s: %s\r\n", geniter->second.c_str(), iter->second.c_str());
+      } else if (strncmp(name, RGW_ATTR_META_PREFIX, PREFIX_LEN) == 0) {
         s->cio->print("X-Container-Meta-%s: %s\r\n", name + PREFIX_LEN, iter->second.c_str());
-      } else {
-        break;
       }
     }
   }
@@ -721,9 +728,7 @@ static void dump_object_metadata(struct req_state * const s,
     const char *name = iter->first.c_str();
     map<string, string>::const_iterator aiter = rgw_to_http_attrs.find(name);
 
-    if (aiter != rgw_to_http_attrs.end() &&
-        aiter->first.compare(RGW_ATTR_CONTENT_TYPE) != 0) {
-      /* Filter out Content-Type. It must be treated separately. */
+    if (aiter != rgw_to_http_attrs.end()) {
       response_attrs[aiter->second] = iter->second.c_str();
     } else if (strncmp(name, RGW_ATTR_META_PREFIX, sizeof(RGW_ATTR_META_PREFIX)-1) == 0) {
       name += sizeof(RGW_ATTR_META_PREFIX) - 1;
@@ -841,6 +846,20 @@ void RGWCopyObj_ObjStore_SWIFT::send_response()
   }
 }
 
+int RGWGetObj_ObjStore_SWIFT::get_params()
+{
+  const string& mm = s->info.args.get("multipart-manifest");
+  skip_manifest = (mm.compare("get") == 0);
+
+  return RGWGetObj_ObjStore::get_params();
+}
+
+int RGWGetObj_ObjStore_SWIFT::send_response_data_error()
+{
+  bufferlist bl;
+  return send_response_data(bl, 0, 0);
+}
+
 int RGWGetObj_ObjStore_SWIFT::send_response_data(bufferlist& bl, off_t bl_ofs, off_t bl_len)
 {
   string content_type;
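
With the new get_params(), a Swift GET can fetch a manifest object itself
rather than the concatenated segments, mirroring Swift's multipart-manifest
API (container and object names are hypothetical):

    GET /swift/v1/mycontainer/myobject?multipart-manifest=get

Setting skip_manifest makes RGWGetObj::execute() bypass the
RGW_ATTR_USER_MANIFEST handling shown in rgw_op.cc above.
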
diff --git a/src/rgw/rgw_rest_swift.h b/src/rgw/rgw_rest_swift.h
index 55b41bb..66d8c81 100644
--- a/src/rgw/rgw_rest_swift.h
+++ b/src/rgw/rgw_rest_swift.h
@@ -13,6 +13,8 @@ public:
   RGWGetObj_ObjStore_SWIFT() {}
   ~RGWGetObj_ObjStore_SWIFT() {}
 
+  int get_params();
+  int send_response_data_error();
   int send_response_data(bufferlist& bl, off_t ofs, off_t len);
   bool need_object_expiration() { return true; }
 };
@@ -123,6 +125,7 @@ public:
   RGWDeleteObj_ObjStore_SWIFT() {}
   ~RGWDeleteObj_ObjStore_SWIFT() {}
 
+  bool need_object_expiration() { return true; }
   void send_response();
 };
 
diff --git a/src/rgw/rgw_tools.cc b/src/rgw/rgw_tools.cc
index 6f198c9..0dfc3e6 100644
--- a/src/rgw/rgw_tools.cc
+++ b/src/rgw/rgw_tools.cc
@@ -4,6 +4,7 @@
 #include <errno.h>
 
 #include "common/errno.h"
+#include "common/safe_io.h"
 
 #include "include/types.h"
 
@@ -141,9 +142,9 @@ static int ext_mime_map_init(CephContext *cct, const char *ext_map)
     goto done;
   }
 
-  ret = read(fd, buf, st.st_size + 1);
+  ret = safe_read(fd, buf, st.st_size + 1);
   if (ret != st.st_size) {
-    // huh? file size has changed, what are the odds?
+    // huh? file size has changed?
     ldout(cct, 0) << "ext_mime_map_init(): raced! will retry.." << dendl;
     free(buf);
     close(fd);
diff --git a/src/rgw/rgw_user.cc b/src/rgw/rgw_user.cc
index 8d691e8..9a115a2 100644
--- a/src/rgw/rgw_user.cc
+++ b/src/rgw/rgw_user.cc
@@ -1665,18 +1665,23 @@ int RGWUser::init(RGWUserAdminOpState& op_state)
     }
   }
 
-  if (!uid.empty() && (uid.compare(RGW_USER_ANON_ID) != 0))
+  if (!uid.empty() && (uid.compare(RGW_USER_ANON_ID) != 0)) {
     found = (rgw_get_user_info_by_uid(store, uid, user_info, &op_state.objv) >= 0);
-
-  if (!user_email.empty() && !found)
+    op_state.found_by_uid = found;
+  }
+  if (!user_email.empty() && !found) {
     found = (rgw_get_user_info_by_email(store, user_email, user_info, &op_state.objv) >= 0);
-
-  if (!swift_user.empty() && !found)
+    op_state.found_by_email = found;
+  }
+  if (!swift_user.empty() && !found) {
     found = (rgw_get_user_info_by_swift(store, swift_user, user_info, &op_state.objv) >= 0);
-
-  if (!access_key.empty() && !found)
+    op_state.found_by_key = found;
+  }
+  if (!access_key.empty() && !found) {
     found = (rgw_get_user_info_by_access_key(store, access_key, user_info, &op_state.objv) >= 0);
-
+    op_state.found_by_key = found;
+  }
+
   op_state.set_existing_user(found);
   if (found) {
     op_state.set_user_info(user_info);
@@ -1807,8 +1812,13 @@ int RGWUser::execute_add(RGWUserAdminOpState& op_state, std::string *err_msg)
       return execute_modify(op_state, err_msg);
     }
 
-    set_err_msg(err_msg, "user: " + op_state.user_id + " exists");
-
+    if (op_state.found_by_email) {
+      set_err_msg(err_msg, "email: " + user_email + " exists");
+    } else if (op_state.found_by_key) {
+      set_err_msg(err_msg, "duplicate key provided");
+    } else {
+      set_err_msg(err_msg, "user: " + op_state.user_id + " exists");
+    }
     return -EEXIST;
   }
 
@@ -1824,12 +1834,7 @@ int RGWUser::execute_add(RGWUserAdminOpState& op_state, std::string *err_msg)
     return -EINVAL;
   }
 
-  // fail if the user email is a duplicate
-  if (op_state.has_existing_email()) {
-    set_err_msg(err_msg, "duplicate email provided");
-    return -EEXIST;
-  }
-
+
   // set the user info
   user_id = uid;
   user_info.user_id = user_id;
diff --git a/src/rgw/rgw_user.h b/src/rgw/rgw_user.h
index 0f26cff..06890fc 100644
--- a/src/rgw/rgw_user.h
+++ b/src/rgw/rgw_user.h
@@ -203,7 +203,10 @@ struct RGWUserAdminOpState {
   bool system_specified;
   bool key_op;
   bool temp_url_key_specified;
-
+  bool found_by_uid;
+  bool found_by_email;
+  bool found_by_key;
+
   // req parameters
   bool populated;
   bool initialized;
@@ -479,6 +482,9 @@ struct RGWUserAdminOpState {
     bucket_quota_specified = false;
     temp_url_key_specified = false;
     user_quota_specified = false;
+    found_by_uid = false;
+    found_by_email = false;
+    found_by_key = false;
   }
 };
 
diff --git a/src/rocksdb/.arcconfig b/src/rocksdb/.arcconfig
new file mode 100644
index 0000000..f06f314
--- /dev/null
+++ b/src/rocksdb/.arcconfig
@@ -0,0 +1,17 @@
+{
+  "project_id" : "rocksdb",
+  "conduit_uri" : "https://reviews.facebook.net/",
+  "copyright_holder" : "Facebook",
+  "load" : [
+    "arcanist_util"
+  ],
+  "lint.engine" : "FacebookFbcodeLintEngine",
+  "lint.engine.single.linter" : "FbcodeCppLinter",
+  "unit.engine" : "FacebookFbcodeUnitTestEngine",
+  "arcanist_configuration" : "FacebookArcanistConfiguration",
+  "base" : "git:HEAD^, hg:.^",
+  "git.default-relative-commit" : "HEAD^",
+  "git:arc.feature.start.default" : "origin/master",
+  "arc.feature.start.default" : "master",
+  "history.immutable" : false
+}
diff --git a/src/rocksdb/.clang-format b/src/rocksdb/.clang-format
new file mode 100644
index 0000000..7c27981
--- /dev/null
+++ b/src/rocksdb/.clang-format
@@ -0,0 +1,5 @@
+# Complete list of style options can be found at: 
+# http://clang.llvm.org/docs/ClangFormatStyleOptions.html
+---
+BasedOnStyle: Google
+...
diff --git a/src/rocksdb/.gitignore b/src/rocksdb/.gitignore
index c537990..6a92b5d 100644
--- a/src/rocksdb/.gitignore
+++ b/src/rocksdb/.gitignore
@@ -21,8 +21,13 @@ make_config.mk
 *.o-*
 *.swp
 *~
-*.lo
-*~
+*.vcxproj
+*.vcxproj.filters
+*.sln
+*.cmake
+CMakeCache.txt
+CMakeFiles/
+build/
 
 ldb
 manifest_dump
@@ -33,8 +38,10 @@ coverage/COVERAGE_REPORT
 .gdbhistory
 package/
 .phutil_module_cache
-unity
+unity.a
 tags
+rocksdb_dump
+rocksdb_undump
 
 java/out
 java/target
@@ -45,6 +52,8 @@ java/include/org_rocksdb_*.h
 .idea/
 *.iml
 
+rocksdb.cc
+rocksdb.h
 unity.cc
 java/crossbuild/.vagrant
 .vagrant/
@@ -52,29 +61,5 @@ java/**.asc
 java/javadoc
 
 scan_build_report/
-.dirstamp
-.deps/
-.libs/
-Makefile.in
-aclocal.m4
-ar-lib
-autom4te.cache/
-config.guess
-config.h
-config.h.in
-config.log
-config.status
-config.sub
-configure
-depcomp
-install-sh
-librocksdb.la
-libtool
-ltmain.sh
-missing
-stamp-h1
-Makefile
 t
 LOG
-
-/m4/
diff --git a/src/rocksdb/.travis.yml b/src/rocksdb/.travis.yml
new file mode 100644
index 0000000..804554c
--- /dev/null
+++ b/src/rocksdb/.travis.yml
@@ -0,0 +1,43 @@
+sudo: false
+language: cpp
+
+matrix:
+  include:
+    - os: linux
+      compiler: clang
+      env: COMPILER=clang++-3.6
+      addons:
+         apt:
+            sources: ['ubuntu-toolchain-r-test', 'llvm-toolchain-precise-3.6']
+            packages: ['clang-3.6', 'clang-format-3.6', 'zlib1g-dev', 'libbz2-dev', 'libsnappy-dev', 'curl']
+    - os: osx
+      compiler: clang
+
+install:
+  # Build gflags
+  # TODO(noetzli): Remove when gflags available through Travis
+  - pushd /tmp/ && curl -L https://github.com/gflags/gflags/archive/v2.1.2.tar.gz -o gflags.tar.gz && tar xfz gflags.tar.gz && cd gflags-2.1.2 && cmake . && make && popd
+  # Download clang-format-diff.py to check source code formatting
+  - pushd /tmp/ && curl -L http://llvm.org/svn/llvm-project/cfe/trunk/tools/clang-format/clang-format-diff.py -o clang-format-diff.py && chmod +x clang-format-diff.py && popd
+
+before_script:
+  # Add gflags to include/library paths
+  # TODO(noetzli): Remove when gflags available through Travis
+  - export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/tmp/gflags-2.1.2/lib"
+  - export LIBRARY_PATH="$LIBRARY_PATH:/tmp/gflags-2.1.2/lib"
+  - export CPLUS_INCLUDE_PATH="$CPLUS_INCLUDE_PATH:/tmp/gflags-2.1.2/include"
+  - if [ -n "${COMPILER}" ]; then CXX=${COMPILER}; fi
+  - if [[ "${TRAVIS_OS_NAME}" == 'osx' ]]; then brew install gflags snappy; fi
+  - ulimit -n 2000 || true
+
+# Lousy hack to disable use and testing of fallocate, which doesn't behave quite
+# as EnvPosixTest::AllocateTest expects within the Travis OpenVZ environment.
+script:
+  - if [[ "${TRAVIS_OS_NAME}" == 'linux' ]]; then OPT=-DTRAVIS CLANG_FORMAT_DIFF=/tmp/clang-format-diff.py make format || true; fi
+  - OPT=-DTRAVIS V=1 make -j4 check && OPT=-DTRAVIS V=1 make clean jclean rocksdbjava jtest
+
+notifications:
+    email:
+      - leveldb at fb.com
+    webhooks:
+      - https://buildtimetrend.herokuapp.com/travis
diff --git a/src/rocksdb/CMakeLists.txt b/src/rocksdb/CMakeLists.txt
new file mode 100644
index 0000000..446d481
--- /dev/null
+++ b/src/rocksdb/CMakeLists.txt
@@ -0,0 +1,386 @@
+# This CMake build is for Windows 64-bit only.
+#
+# Prerequisites:
+#     You must have Visual Studio 2013 Update 4 installed. Start the Developer Command Prompt window that is a part of the Visual Studio installation.
+#     Run the build commands from within the Developer Command Prompt window to have paths to the compiler and runtime libraries set.
+#     You must have git.exe in your %PATH% environment variable.
+#
+# Building RocksDB for Windows is as easy as 1-2-3-4-5:
+#
+# 1. Update paths to third-party libraries in the thirdparty.inc file
+# 2. Create a new directory for build artifacts
+#        mkdir build
+#        cd build
+# 3. Run cmake to generate project files for Windows, add more options to enable required third-party libraries.
+#    See thirdparty.inc for more information.
+#        sample command: cmake -G "Visual Studio 12 Win64" -DGFLAGS=1 -DSNAPPY=1 -DJEMALLOC=1 ..
+# 4. Then build the project in debug mode (you may want to add the /m:<N> flag to run msbuild with <N> parallel threads)
+#        msbuild ALL_BUILD.vcxproj
+# 5. And in release mode (/m[:<N>] is also supported)
+#        msbuild ALL_BUILD.vcxproj /p:Configuration=Release
+#
+
+cmake_minimum_required(VERSION 2.6)
+project(rocksdb)
+
+include(${CMAKE_CURRENT_SOURCE_DIR}/thirdparty.inc)
+
+execute_process(COMMAND $ENV{COMSPEC} " /C date /T" OUTPUT_VARIABLE DATE)
+execute_process(COMMAND $ENV{COMSPEC} " /C time /T" OUTPUT_VARIABLE TIME)
+string(REGEX REPLACE "(..)/(..)/..(..).*" "\\1/\\2/\\3" DATE ${DATE})
+string(REGEX REPLACE "(..):(.....).*" " \\1:\\2" TIME ${TIME})
+string(CONCAT GIT_DATE_TIME ${DATE} ${TIME})
+
+execute_process(COMMAND $ENV{COMSPEC} " /C git rev-parse HEAD 2>nil" OUTPUT_VARIABLE GIT_SHA)
+string(REGEX REPLACE "[^0-9a-f]+" "" GIT_SHA ${GIT_SHA})
+
+set(BUILD_VERSION_CC ${CMAKE_CURRENT_SOURCE_DIR}/util/build_version.cc)
+
+add_custom_command(OUTPUT ${BUILD_VERSION_CC}
+    COMMAND echo "#include \"build_version.h\"" > ${BUILD_VERSION_CC}
+    COMMAND echo "const char* rocksdb_build_git_sha = \"rocksdb_build_git_sha:${GIT_SHA}\";" >> ${BUILD_VERSION_CC}
+    COMMAND echo "const char* rocksdb_build_git_datetime = \"rocksdb_build_git_datetime:${GIT_DATE_TIME}\";" >> ${BUILD_VERSION_CC}
+    COMMAND echo const char* rocksdb_build_compile_date = __DATE__\; >> ${BUILD_VERSION_CC}
+)
+
+add_custom_target(GenerateBuildVersion DEPENDS ${BUILD_VERSION_CC})
+
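
The custom command above stitches util/build_version.cc together with echo, so the generated file comes out as the following (the sha and date values are substituted at build time; build_version.h declares these symbols):

    #include "build_version.h"
    const char* rocksdb_build_git_sha = "rocksdb_build_git_sha:<GIT_SHA>";
    const char* rocksdb_build_git_datetime = "rocksdb_build_git_datetime:<GIT_DATE_TIME>";
    const char* rocksdb_build_compile_date = __DATE__;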
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zi /nologo  /EHsc /GS /Gd /GR /GF /fp:precise /Zc:wchar_t /Zc:forScope /errorReport:queue")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /FC /d2Zi+ /W3 /WX /wd4018 /wd4100 /wd4101 /wd4127 /wd4189 /wd4200 /wd4244 /wd4267 /wd4296 /wd4305 /wd4307 /wd4309 /wd4512 /wd4701 /wd4702 /wd4800 /wd4804 /wd4996")
+
+# Used for CI builds and tests so we can run faster
+set(OPTIMIZE_DEBUG_DEFAULT 0)        # Debug build is unoptimized by default; use -DOPTDBG=1 to optimize
+
+if(DEFINED OPTDBG)
+   set(OPTIMIZE_DEBUG ${OPTDBG})
+else()
+   set(OPTIMIZE_DEBUG ${OPTIMIZE_DEBUG_DEFAULT})
+endif()
+
+if((${OPTIMIZE_DEBUG} EQUAL 1))
+   message(STATUS "Debug optimization is enabled")
+   set(CMAKE_CXX_FLAGS_DEBUG "/Oxt /MDd")
+else()
+   set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /Od /RTC1 /Gm /MDd")
+endif()
+
+set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Oxt /Zp8 /Gm- /Gy /MD")
+
+set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} /DEBUG")
+set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /DEBUG")
+
+add_definitions(-DWIN32 -DOS_WIN -D_MBCS -DWIN64)
+
+include_directories(${PROJECT_SOURCE_DIR})
+include_directories(${PROJECT_SOURCE_DIR}/include)
+include_directories(${PROJECT_SOURCE_DIR}/third-party/gtest-1.7.0/fused-src)
+
+set(ROCKSDB_LIBS rocksdblib${ARTIFACT_SUFFIX})
+set(THIRDPARTY_LIBS ${THIRDPARTY_LIBS} gtest)
+set(SYSTEM_LIBS Shlwapi.lib Rpcrt4.lib)
+
+set(LIBS ${ROCKSDB_LIBS} ${THIRDPARTY_LIBS} ${SYSTEM_LIBS})
+
+add_subdirectory(third-party/gtest-1.7.0/fused-src/gtest)
+
+set(SOURCES
+        db/builder.cc
+        db/c.cc
+        db/column_family.cc
+        db/compacted_db_impl.cc
+        db/compaction.cc
+        db/compaction_iterator.cc
+        db/compaction_job.cc
+        db/compaction_picker.cc
+        db/convenience.cc
+        db/dbformat.cc
+        db/db_filesnapshot.cc
+        db/db_impl.cc
+        db/db_impl_debug.cc
+        db/db_impl_experimental.cc
+        db/db_impl_readonly.cc
+        db/db_iter.cc
+        db/event_helpers.cc
+        db/experimental.cc
+        db/filename.cc
+        db/file_indexer.cc
+        db/flush_job.cc
+        db/flush_scheduler.cc
+        db/forward_iterator.cc
+        db/internal_stats.cc
+        db/log_reader.cc
+        db/log_writer.cc
+        db/managed_iterator.cc
+        db/memtable.cc
+        db/memtable_allocator.cc
+        db/memtable_list.cc
+        db/merge_helper.cc
+        db/merge_operator.cc
+        db/repair.cc
+        db/slice.cc
+        db/snapshot_impl.cc
+        db/table_cache.cc
+        db/table_properties_collector.cc
+        db/transaction_log_impl.cc
+        db/version_builder.cc
+        db/version_edit.cc
+        db/version_set.cc
+        db/wal_manager.cc
+        db/write_batch.cc
+        db/write_batch_base.cc
+        db/write_controller.cc
+        db/write_thread.cc
+        port/stack_trace.cc
+        port/win/env_win.cc
+        port/win/port_win.cc
+        port/win/win_logger.cc
+        table/adaptive_table_factory.cc
+        table/block.cc
+        table/block_based_filter_block.cc
+        table/block_based_table_builder.cc
+        table/block_based_table_factory.cc
+        table/block_based_table_reader.cc
+        table/block_builder.cc
+        table/block_hash_index.cc
+        table/block_prefix_index.cc
+        table/bloom_block.cc
+        table/cuckoo_table_builder.cc
+        table/cuckoo_table_factory.cc
+        table/cuckoo_table_reader.cc
+        table/flush_block_policy.cc
+        table/format.cc
+        table/full_filter_block.cc
+        table/get_context.cc
+        table/iterator.cc
+        table/merger.cc
+        table/sst_file_writer.cc
+        table/meta_blocks.cc
+        table/mock_table.cc
+        table/plain_table_builder.cc
+        table/plain_table_factory.cc
+        table/plain_table_index.cc
+        table/plain_table_key_coding.cc
+        table/plain_table_reader.cc
+        table/table_properties.cc
+        table/two_level_iterator.cc
+        tools/dump/db_dump_tool.cc
+        util/arena.cc
+        util/auto_roll_logger.cc
+        util/bloom.cc
+        util/build_version.cc
+        util/cache.cc
+        util/coding.cc
+        util/compaction_job_stats_impl.cc
+        util/comparator.cc
+        util/crc32c.cc
+        util/db_info_dumper.cc
+        util/delete_scheduler_impl.cc
+        util/db_test_util.cc
+        util/dynamic_bloom.cc
+        util/env.cc
+        util/env_hdfs.cc
+        util/event_logger.cc
+        util/file_util.cc
+        util/file_reader_writer.cc
+        util/filter_policy.cc
+        util/hash.cc
+        util/hash_cuckoo_rep.cc
+        util/hash_linklist_rep.cc
+        util/hash_skiplist_rep.cc
+        util/histogram.cc
+        util/instrumented_mutex.cc
+        util/iostats_context.cc
+        util/ldb_cmd.cc
+        util/ldb_tool.cc
+        util/logging.cc
+        util/log_buffer.cc
+        util/memenv.cc
+        util/mock_env.cc
+        util/murmurhash.cc
+        util/mutable_cf_options.cc
+        util/options.cc
+        util/options_builder.cc
+        util/options_helper.cc
+        util/options_parser.cc
+        util/perf_context.cc
+        util/perf_level.cc
+        util/rate_limiter.cc
+        util/skiplistrep.cc
+        util/slice.cc
+        util/sst_dump_tool.cc
+        util/statistics.cc
+        util/status.cc
+        util/status_message.cc
+        util/string_util.cc
+        util/sync_point.cc
+        util/testharness.cc
+        util/testutil.cc
+        util/thread_local.cc
+        util/thread_status_impl.cc
+        util/thread_status_updater.cc
+        util/thread_status_updater_debug.cc
+        util/thread_status_util.cc
+        util/thread_status_util_debug.cc
+        util/vectorrep.cc
+        util/xfunc.cc
+        util/xxhash.cc
+        utilities/backupable/backupable_db.cc
+        utilities/checkpoint/checkpoint.cc
+        utilities/document/document_db.cc
+        utilities/document/json_document.cc
+        utilities/document/json_document_builder.cc
+        utilities/flashcache/flashcache.cc
+        utilities/geodb/geodb_impl.cc
+        utilities/leveldb_options/leveldb_options.cc
+        utilities/merge_operators/string_append/stringappend.cc
+        utilities/merge_operators/string_append/stringappend2.cc
+        utilities/merge_operators/put.cc
+        utilities/merge_operators/uint64add.cc
+        utilities/redis/redis_lists.cc
+        utilities/spatialdb/spatial_db.cc
+        utilities/table_properties_collectors/compact_on_deletion_collector.cc
+        utilities/transactions/optimistic_transaction_impl.cc
+        utilities/transactions/optimistic_transaction_db_impl.cc
+        utilities/transactions/transaction_base.cc
+        utilities/transactions/transaction_impl.cc
+        utilities/transactions/transaction_db_impl.cc
+        utilities/transactions/transaction_db_mutex_impl.cc
+        utilities/transactions/transaction_lock_mgr.cc
+        utilities/transactions/transaction_util.cc
+        utilities/ttl/db_ttl_impl.cc
+        utilities/write_batch_with_index/write_batch_with_index.cc
+        utilities/write_batch_with_index/write_batch_with_index_internal.cc
+)
+
+add_library(rocksdblib${ARTIFACT_SUFFIX} ${SOURCES})
+set_target_properties(rocksdblib${ARTIFACT_SUFFIX} PROPERTIES COMPILE_FLAGS "/Fd${CMAKE_CFG_INTDIR}/rocksdblib${ARTIFACT_SUFFIX}.pdb")
+add_dependencies(rocksdblib${ARTIFACT_SUFFIX} GenerateBuildVersion)
+
+add_library(rocksdb${ARTIFACT_SUFFIX} SHARED ${SOURCES})
+set_target_properties(rocksdb${ARTIFACT_SUFFIX} PROPERTIES COMPILE_FLAGS "-DROCKSDB_DLL -DROCKSDB_LIBRARY_EXPORTS /Fd${CMAKE_CFG_INTDIR}/rocksdb${ARTIFACT_SUFFIX}.pdb")
+add_dependencies(rocksdb${ARTIFACT_SUFFIX} GenerateBuildVersion)
+target_link_libraries(rocksdb${ARTIFACT_SUFFIX} ${LIBS})
+
+set(APPS
+        db/db_bench.cc
+        db/memtablerep_bench.cc
+        table/table_reader_bench.cc
+        tools/db_stress.cc
+        tools/db_repl_stress.cc
+        tools/sst_dump.cc
+        tools/dump/rocksdb_dump.cc
+        tools/dump/rocksdb_undump.cc
+        util/cache_bench.cc
+)
+
+set(C_TESTS db/c_test.c)
+
+set(TESTS
+        db/column_family_test.cc
+        db/compact_files_test.cc
+        db/compaction_job_test.cc
+        db/compaction_job_stats_test.cc
+        db/compaction_picker_test.cc
+        db/comparator_db_test.cc
+        db/corruption_test.cc
+        db/cuckoo_table_db_test.cc
+        db/db_iter_test.cc
+        db/db_test.cc
+        db/db_compaction_filter_test.cc
+        db/db_compaction_test.cc
+        db/db_dynamic_level_test.cc
+        db/db_inplace_update_test.cc
+        db/db_log_iter_test.cc
+        db/db_universal_compaction_test.cc
+        db/db_wal_test.cc
+        db/db_tailing_iter_test.cc
+        db/dbformat_test.cc
+        db/deletefile_test.cc
+        db/fault_injection_test.cc
+        db/file_indexer_test.cc
+        db/filename_test.cc
+        db/flush_job_test.cc
+        db/listener_test.cc
+        db/log_test.cc
+        db/memtable_list_test.cc
+        db/merge_test.cc
+        db/merge_helper_test.cc
+        db/perf_context_test.cc
+        db/plain_table_db_test.cc
+        db/prefix_test.cc
+        db/skiplist_test.cc
+        db/table_properties_collector_test.cc
+        db/version_builder_test.cc
+        db/version_edit_test.cc
+        db/version_set_test.cc
+        db/wal_manager_test.cc
+        db/write_batch_test.cc
+        db/write_callback_test.cc
+        db/write_controller_test.cc
+        table/block_based_filter_block_test.cc
+        table/block_hash_index_test.cc
+        table/block_test.cc
+        table/cuckoo_table_builder_test.cc
+        table/cuckoo_table_reader_test.cc
+        table/full_filter_block_test.cc
+        table/merger_test.cc
+        table/table_test.cc
+        tools/db_sanity_test.cc
+        tools/reduce_levels_test.cc
+        util/arena_test.cc
+        util/autovector_test.cc
+        util/auto_roll_logger_test.cc
+        util/bloom_test.cc
+        util/cache_test.cc
+        util/coding_test.cc
+        util/crc32c_test.cc
+        util/dynamic_bloom_test.cc
+        util/env_test.cc
+        util/event_logger_test.cc
+        util/filelock_test.cc
+        util/file_reader_writer_test.cc
+        util/heap_test.cc
+        util/histogram_test.cc
+        util/ldb_cmd_test.cc
+        util/manual_compaction_test.cc
+        util/memenv_test.cc
+        util/mock_env_test.cc
+        util/options_test.cc
+        util/rate_limiter_test.cc
+        util/slice_transform_test.cc
+        util/sst_dump_test.cc
+        util/thread_list_test.cc
+        util/thread_local_test.cc
+        utilities/backupable/backupable_db_test.cc
+        utilities/checkpoint/checkpoint_test.cc
+        utilities/document/document_db_test.cc
+        utilities/document/json_document_test.cc
+        utilities/geodb/geodb_test.cc
+        utilities/merge_operators/string_append/stringappend_test.cc
+        utilities/redis/redis_lists_test.cc
+        utilities/spatialdb/spatial_db_test.cc
+        utilities/table_properties_collectors/compact_on_deletion_collector_test.cc
+        utilities/transactions/optimistic_transaction_test.cc
+        utilities/transactions/transaction_test.cc
+        utilities/ttl/ttl_test.cc
+        utilities/write_batch_with_index/write_batch_with_index_test.cc
+)
+
+set(EXES ${APPS} ${TESTS})
+
+foreach(sourcefile ${EXES})
+    string(REPLACE ".cc" "" exename ${sourcefile})
+    string(REGEX REPLACE "^((.+)/)+" "" exename ${exename})
+    add_executable(${exename}${ARTIFACT_SUFFIX} ${sourcefile})
+    target_link_libraries(${exename}${ARTIFACT_SUFFIX} ${LIBS})
+endforeach(sourcefile ${EXES})
+
+# C executables must link to a shared object
+set(C_EXES ${C_TESTS})
+
+foreach(sourcefile ${C_EXES})
+    string(REPLACE ".c" "" exename ${sourcefile})
+    string(REGEX REPLACE "^((.+)/)+" "" exename ${exename})
+    add_executable(${exename}${ARTIFACT_SUFFIX} ${sourcefile})
+    target_link_libraries(${exename}${ARTIFACT_SUFFIX} rocksdb${ARTIFACT_SUFFIX})
+endforeach(sourcefile ${C_TESTS})
diff --git a/src/rocksdb/DUMP_FORMAT.md b/src/rocksdb/DUMP_FORMAT.md
new file mode 100644
index 0000000..009daba
--- /dev/null
+++ b/src/rocksdb/DUMP_FORMAT.md
@@ -0,0 +1,16 @@
+## RocksDB dump format
+
+The version 1 RocksDB dump format is fairly simple:
+
+1) The dump starts with the 8-byte magic identifier "ROCKDUMP".
+
+2) The magic is followed by an 8-byte big-endian version, which is 0x00000001.
+
+3) Next come arbitrarily sized chunks of bytes, each preceded by a 4-byte little-endian number indicating how large the chunk is.
+
+4) The first chunk is special: it is a JSON string recording some facts about the creation of this dump.  It contains the following keys:
+* database-path: The path of the database this dump was created from.
+* hostname: The hostname of the machine where the dump was created.
+* creation-time: Unix seconds since the epoch when this dump was created.
+
+5) Following the info chunk, the remaining chunks are paired into key/value pairs.
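
As a concrete illustration of the format just described, a header check might look like the following C++ sketch (minimal error handling; not the actual rocksdb_dump/rocksdb_undump code):

    #include <cstdint>
    #include <cstring>
    #include <istream>

    // Sketch: validate the version-1 dump header described above.
    bool read_dump_header(std::istream& in) {
      char magic[8];
      in.read(magic, 8);                          // 8-byte magic identifier
      if (!in || std::memcmp(magic, "ROCKDUMP", 8) != 0) return false;
      unsigned char v[8];
      in.read(reinterpret_cast<char*>(v), 8);     // 8-byte big-endian version
      if (!in) return false;
      uint64_t version = 0;
      for (int i = 0; i < 8; ++i) version = (version << 8) | v[i];
      return version == 1;                        // only version 1 is defined
    }

Each subsequent chunk would then be read by first pulling its 4-byte little-endian length and then that many bytes.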
diff --git a/src/rocksdb/HISTORY.md b/src/rocksdb/HISTORY.md
index 2a52367..7b64daf 100644
--- a/src/rocksdb/HISTORY.md
+++ b/src/rocksdb/HISTORY.md
@@ -1,22 +1,80 @@
 # Rocksdb Change Log
 
-## 3.11.2 (6/11/2015)
-
-### Fixes
-* Adjust the way we compensate for tombstones when chosing compactions. Previous heuristics led to pathological behavior in some cases.
-* Don't let two L0->L1 compactions run in parallel (only affected through experimental feature SuggestCompactRange)
-
-## 3.11.1 (6/1/2015)
+## 4.1.0 (10/8/2015)
+### New Features
+* Added single delete operation as a more efficient way to delete keys that have not been overwritten.
+* Added experimental AddFile() to the DB interface that allows users to add files created by SstFileWriter into an empty database, see include/rocksdb/sst_file_writer.h and DB::AddFile() for more info.
+* Added support for opening SST files with the .ldb suffix, which enables opening LevelDB databases.
+* CompactionFilter now supports filtering of merge operands and merge results.
+
+### Public API Changes
+* Added SingleDelete() to the DB interface.
+* Added AddFile() to DB interface.
+* Added SstFileWriter class.
+* CompactionFilter has a new method FilterMergeOperand() that RocksDB applies to every merge operand during compaction to decide whether to filter the operand.
+* We removed CompactionFilterV2 interfaces from include/rocksdb/compaction_filter.h. The functionality was deprecated already in version 3.13.
+
+## 4.0.0 (9/9/2015)
+### New Features
+* Added support for transactions.  See include/rocksdb/utilities/transaction.h for more info.
+* DB::GetProperty() now accepts "rocksdb.aggregated-table-properties" and "rocksdb.aggregated-table-properties-at-levelN", in which case it returns aggregated table properties of the target column family, or the aggregated table properties of the specified level N if the "at-level" version is used.
+* Add compression option kZSTDNotFinalCompression for people to experiment with ZSTD, although its format is not finalized.
+* We removed the need for LATEST_BACKUP file in BackupEngine. We still keep writing it when we create new backups (because of backward compatibility), but we don't read it anymore.
+
+### Public API Changes
+* Removed class Env::RandomRWFile and Env::NewRandomRWFile().
+* Renamed DBOptions.num_subcompactions to DBOptions.max_subcompactions to make the name better match the actual functionality of the option.
+* Added an Equal() method to the Comparator interface that can optionally be overridden in cases where equality comparisons can be done more efficiently than three-way comparisons.
+* The previous 'experimental' OptimisticTransaction class has been replaced by the Transaction class.
+
+## 3.13.0 (8/6/2015)
+### New Features
+* RollbackToSavePoint() in WriteBatch/WriteBatchWithIndex
+* Add NewCompactOnDeletionCollectorFactory() in utilities/table_properties_collectors, which allows rocksdb to mark an SST file as needing compaction when it observes at least D deletion entries in any N consecutive entries in that SST file.  Note that this feature depends on an experimental NeedCompact() API --- the result of this API will not persist after DB restart.
+* Add DBOptions::delete_scheduler. Use NewDeleteScheduler() in include/rocksdb/delete_scheduler.h to create a DeleteScheduler that can be shared among multiple RocksDB instances to control the file deletion rate of SST files that exist in the first db_path.
+
+### Public API Changes
+* Deprecated WriteOptions::timeout_hint_us. We no longer support write timeout. If you really need this option, talk to us and we might consider returning it.
+* Deprecated purge_redundant_kvs_while_flush option.
+* Removed BackupEngine::NewBackupEngine() and NewReadOnlyBackupEngine() that were deprecated in RocksDB 3.8. Please use BackupEngine::Open() instead.
+* Deprecated Compaction Filter V2. We are not aware of any existing use-cases. If you use this filter, your compile will break with RocksDB 3.13. Please let us know if you use it and we'll put it back in RocksDB 3.14.
+* Env::FileExists now returns a Status instead of a boolean
+* Add statistics::getHistogramString() to print detailed distribution of a histogram metric.
+* Add DBOptions::skip_stats_update_on_db_open.  When it is on, DB::Open() will run faster as it skips the random reads required for loading necessary stats from SST files to optimize compaction.
+
+## 3.12.0 (7/2/2015)
+### New Features
+* Added experimental support for optimistic transactions.  See include/rocksdb/utilities/optimistic_transaction.h for more info.
+* Added a new way to report QPS from db_bench (check out --report_file and --report_interval_seconds)
+* Added a cache for individual rows. See DBOptions::row_cache for more info.
+* Several new features on EventListener (see include/rocksdb/listener.h):
+ - OnCompactionCompleted() now returns per-compaction job statistics, defined in include/rocksdb/compaction_job_stats.h.
+ - Added OnTableFileCreated() and OnTableFileDeleted().
+* Set compaction_options_universal.enable_trivial_move to true to allow trivial moves while performing universal compaction. A trivial move will happen only when all the input files are non-overlapping.
 
-### Changes
-* Just a single change to fix the Java linking (github issue #606)
+### Public API Changes
+* EventListener::OnFlushCompleted() now passes FlushJobInfo instead of a list of parameters.
+* DB::GetDbIdentity() is now a const function.  If this function is overridden in your application, be sure to also make GetDbIdentity() const to avoid compile error.
+* Move listeners from ColumnFamilyOptions to DBOptions.
+* Add max_write_buffer_number_to_maintain option
+* DB::CompactRange()'s parameter reduce_level is changed to change_level, to allow users to move levels to lower levels if allowed. It can be used to migrate a DB from options.level_compaction_dynamic_level_bytes=false to options.level_compaction_dynamic_level_bytes=true.
+* Change default value for options.compaction_filter_factory and options.compaction_filter_factory_v2 to nullptr instead of DefaultCompactionFilterFactory and DefaultCompactionFilterFactoryV2.
+* If CancelAllBackgroundWork is called without doing a flush after doing loads with WAL disabled, the changes which haven't been flushed before the call to CancelAllBackgroundWork will be lost.
+* WBWIIterator::Entry() now returns WriteEntry instead of `const WriteEntry&`
+* options.hard_rate_limit is deprecated.
+* When options.soft_rate_limit or options.level0_slowdown_writes_trigger is triggered, the way to slow down writes is changed: the write rate to the DB is limited to options.delayed_write_rate.
+* DB::GetApproximateSizes() adds a parameter to allow the estimation to include data in the mem table, with the default being not to include it. It is now only supported in the skip list mem table.
+* DB::CompactRange() now accepts CompactRangeOptions instead of multiple parameters. CompactRangeOptions is defined in include/rocksdb/options.h.
+* CompactRange() will now skip bottommost level compaction for level based compaction if there is no compaction filter; bottommost_level_compaction is introduced in CompactRangeOptions to control when it's possible to skip bottommost level compaction. This means that if you want the compaction to produce a single file you need to set bottommost_level_compaction to BottommostLevelCompaction::kForce.
+* Add Cache.GetPinnedUsage() to get the size of memory occupied by entries that are in use by the system.
+* DB::Open() will fail if the compression specified in Options is not linked with the binary. If you see this failure, recompile RocksDB with the compression libraries present on your system. Also, previously our default compression was snappy. This behavior is now changed: the default compression is snappy only if it's available on the system. If it isn't, we change the default to kNoCompression.
+* We changed how we account for memory used in block cache. Previously, we only counted the sum of block sizes currently present in block cache. Now, we count the actual memory usage of the blocks. For example, a block of size 4.5KB will use 8KB memory with jemalloc. This might decrease your memory usage and possibly decrease performance. Increase block cache size if you see this happening after an upgrade.
+* Add BackupEngineImpl.options_.max_background_operations to specify the maximum number of operations that may be performed in parallel. Add support for parallelized backup and restore.
+* Add DB::SyncWAL() that does a WAL sync without blocking writers.
 
 ## 3.11.0 (5/19/2015)
-
 ### New Features
 * Added a new API Cache::SetCapacity(size_t capacity) to dynamically change the maximum configured capacity of the cache. If the new capacity is less than the existing cache usage, the implementation will try to lower the usage by evicting the necessary number of elements following a strict LRU policy.
-
-### New Features
 * Added an experimental API for handling flashcache devices (blacklists background threads from caching their reads) -- NewFlashcacheAwareEnv
 * If universal compaction is used and options.num_levels > 1, compact files are tried to be stored in none-L0 with smaller files based on options.target_file_size_base. The limitation of DB size when using universal compaction is greatly mitigated by using more levels. You can set num_levels = 1 to make universal compaction behave as before. If you set num_levels > 1 and want to roll back to a previous version, you need to compact all files to a big file in level 0 (by setting target_fil [...]
 * More information about rocksdb background threads are available in Env::GetThreadList(), including the number of bytes read / written by a compaction job, mem-table size and current number of bytes written by a flush job and many more.  Check include/rocksdb/thread_status.h for more detail.
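
For reference, the SingleDelete() entry in the 4.1.0 section above amounts to the following usage (a minimal sketch; SingleDelete() is only valid for keys that were written once and never overwritten or merged):

    #include "rocksdb/db.h"

    int main() {
      rocksdb::DB* db;
      rocksdb::Options options;
      options.create_if_missing = true;
      rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/rocksdb_sd_demo", &db);
      if (!s.ok()) return 1;
      s = db->Put(rocksdb::WriteOptions(), "key1", "value1");
      // Cheaper than Delete(): the tombstone can be dropped as soon as a
      // compaction sees the matching Put, instead of surviving down to the
      // bottommost level.
      s = db->SingleDelete(rocksdb::WriteOptions(), "key1");
      delete db;
      return s.ok() ? 0 : 1;
    }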
diff --git a/src/rocksdb/INSTALL.md b/src/rocksdb/INSTALL.md
index 330f8bc..50b27c8 100644
--- a/src/rocksdb/INSTALL.md
+++ b/src/rocksdb/INSTALL.md
@@ -28,7 +28,7 @@ your make commands, like this: `PORTABLE=1 make static_lib`
       data compression.
 
 * All our tools depend on:
-  - [gflags](https://code.google.com/p/gflags/) - a library that handles
+  - [gflags](https://gflags.github.io/gflags/) - a library that handles
       command line flags processing. You can compile rocksdb library even
       if you don't have gflags installed.
 
@@ -75,8 +75,11 @@ your make commands, like this: `PORTABLE=1 make static_lib`
         * Update XCode:  run `xcode-select --install` (or install it from XCode App's settting).
         * Install via [homebrew](http://brew.sh/).
             * If you're first time developer in MacOS, you still need to run: `xcode-select --install` in your command line.
-            * run `brew tap homebrew/dupes; brew install gcc47 --use-llvm` to install gcc 4.7 (or higher).
+            * run `brew tap homebrew/versions; brew install gcc47 --use-llvm` to install gcc 4.7 (or higher).
     * run `brew install rocksdb`
 
 * **iOS**:
   * Run: `TARGET_OS=IOS make static_lib`. When building the project which uses rocksdb iOS library, make sure to define two important pre-processing macros: `ROCKSDB_LITE` and `IOS_CROSS_COMPILE`.
+
+* **Windows**:
+  * Read and follow the instructions in CMakeLists.txt
diff --git a/src/rocksdb/Makefile b/src/rocksdb/Makefile
new file mode 100644
index 0000000..009f467
--- /dev/null
+++ b/src/rocksdb/Makefile
@@ -0,0 +1,1165 @@
+# Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+# Inherit some settings from environment variables, if available
+
+#-----------------------------------------------
+
+CLEAN_FILES = # deliberately empty, so we can append below.
+CFLAGS += ${EXTRA_CFLAGS}
+CXXFLAGS += ${EXTRA_CXXFLAGS}
+LDFLAGS += $(EXTRA_LDFLAGS)
+MACHINE ?= $(shell uname -m)
+ARFLAGS = rs
+
+# Transform parallel LOG output into something more readable.
+perl_command = perl -n \
+  -e '@a=split("\t",$$_,-1); $$t=$$a[8]; $$t =~ s,^\./,,;'		\
+  -e '$$t =~ s, >.*,,; chomp $$t;'					\
+  -e '$$t =~ /.*--gtest_filter=(.*?\.[\w\/]+)/ and $$t=$$1;'		\
+  -e 'printf "%7.3f %s %s\n", $$a[3], $$a[6] == 0 ? "PASS" : "FAIL", $$t'
+quoted_perl_command = $(subst ','\'',$(perl_command))
+
+# DEBUG_LEVEL can have three values:
+# * DEBUG_LEVEL=2; this is the ultimate debug mode. It will compile rocksdb
+# without any optimizations. To compile with level 2, issue `make dbg`
+# * DEBUG_LEVEL=1; debug level 1 enables all assertions and debug code, but
+# compiles rocksdb with -O2 optimizations. This is the default debug level.
+# `make all` or `make <binary_target>` compile RocksDB with debug level 1.
+# We use this debug level when developing RocksDB.
+# * DEBUG_LEVEL=0; this is the debug level we use for release. If you're
+# running rocksdb in production you most definitely want to compile RocksDB
+# with debug level 0. To compile with level 0, run `make shared_lib`,
+# `make install-shared`, `make static_lib`, `make install-static` or
+# `make install`
+DEBUG_LEVEL=1
+
+ifeq ($(MAKECMDGOALS),dbg)
+	DEBUG_LEVEL=2
+endif
+
+ifeq ($(MAKECMDGOALS),shared_lib)
+	DEBUG_LEVEL=0
+endif
+
+ifeq ($(MAKECMDGOALS),install-shared)
+	DEBUG_LEVEL=0
+endif
+
+ifeq ($(MAKECMDGOALS),static_lib)
+	DEBUG_LEVEL=0
+endif
+
+ifeq ($(MAKECMDGOALS),install-static)
+	DEBUG_LEVEL=0
+endif
+
+ifeq ($(MAKECMDGOALS),install)
+	DEBUG_LEVEL=0
+endif
+
+ifeq ($(MAKECMDGOALS),rocksdbjavastatic)
+	DEBUG_LEVEL=0
+endif
+
+# compile with -O2 if debug level is not 2
+ifneq ($(DEBUG_LEVEL), 2)
+OPT += -O2 -fno-omit-frame-pointer
+ifneq ($(MACHINE),ppc64) # ppc64 doesn't support -momit-leaf-frame-pointer
+OPT += -momit-leaf-frame-pointer
+endif
+endif
+
+# if we're compiling for release, compile without debug code (-DNDEBUG) and
+# don't treat warnings as errors
+ifeq ($(DEBUG_LEVEL),0)
+OPT += -DNDEBUG
+DISABLE_WARNING_AS_ERROR=1
+endif
+
+#-----------------------------------------------
+include src.mk
+
+AM_DEFAULT_VERBOSITY = 0
+
+AM_V_GEN = $(am__v_GEN_$(V))
+am__v_GEN_ = $(am__v_GEN_$(AM_DEFAULT_VERBOSITY))
+am__v_GEN_0 = @echo "  GEN     " $@;
+am__v_GEN_1 =
+AM_V_at = $(am__v_at_$(V))
+am__v_at_ = $(am__v_at_$(AM_DEFAULT_VERBOSITY))
+am__v_at_0 = @
+am__v_at_1 =
+
+AM_V_CC = $(am__v_CC_$(V))
+am__v_CC_ = $(am__v_CC_$(AM_DEFAULT_VERBOSITY))
+am__v_CC_0 = @echo "  CC      " $@;
+am__v_CC_1 =
+CCLD = $(CC)
+LINK = $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@
+AM_V_CCLD = $(am__v_CCLD_$(V))
+am__v_CCLD_ = $(am__v_CCLD_$(AM_DEFAULT_VERBOSITY))
+am__v_CCLD_0 = @echo "  CCLD    " $@;
+am__v_CCLD_1 =
+AM_V_AR = $(am__v_AR_$(V))
+am__v_AR_ = $(am__v_AR_$(AM_DEFAULT_VERBOSITY))
+am__v_AR_0 = @echo "  AR      " $@;
+am__v_AR_1 =
+
+AM_LINK = $(AM_V_CCLD)$(CXX) $^ $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
+# detect what platform we're building on
+dummy := $(shell (export ROCKSDB_ROOT="$(CURDIR)"; "$(CURDIR)/build_tools/build_detect_platform" "$(CURDIR)/make_config.mk"))
+# this file is generated by the previous line to set build flags and sources
+include make_config.mk
+CLEAN_FILES += make_config.mk
+
+ifneq ($(PLATFORM), IOS)
+CFLAGS += -g
+CXXFLAGS += -g
+else
+# no debug info for IOS; debug info would make our library big
+OPT += -DNDEBUG
+endif
+
+ifneq ($(filter -DROCKSDB_LITE,$(OPT)),)
+	# found
+	CFLAGS += -fno-exceptions
+	CXXFLAGS += -fno-exceptions
+endif
+
+# ASAN doesn't work well with jemalloc. If we're compiling with ASAN, we should use regular malloc.
+ifdef COMPILE_WITH_ASAN
+	DISABLE_JEMALLOC=1
+	EXEC_LDFLAGS += -fsanitize=address
+	PLATFORM_CCFLAGS += -fsanitize=address
+	PLATFORM_CXXFLAGS += -fsanitize=address
+endif
+
+# TSAN doesn't work well with jemalloc. If we're compiling with TSAN, we should use regular malloc.
+ifdef COMPILE_WITH_TSAN
+	DISABLE_JEMALLOC=1
+	EXEC_LDFLAGS += -fsanitize=thread -pie
+	PLATFORM_CCFLAGS += -fsanitize=thread -fPIC -DROCKSDB_TSAN_RUN
+	PLATFORM_CXXFLAGS += -fsanitize=thread -fPIC -DROCKSDB_TSAN_RUN
+        # Turn off -pg when enabling TSAN testing, because that induces
+        # a link failure.  TODO: find the root cause
+	pg =
+else
+	pg = -pg
+endif
+
+ifndef DISABLE_JEMALLOC
+	EXEC_LDFLAGS := $(JEMALLOC_LIB) $(EXEC_LDFLAGS)
+	PLATFORM_CXXFLAGS += $(JEMALLOC_INCLUDE)
+	PLATFORM_CCFLAGS += $(JEMALLOC_INCLUDE)
+endif
+
+export GTEST_THROW_ON_FAILURE=1 GTEST_HAS_EXCEPTIONS=1
+GTEST_DIR = ./third-party/gtest-1.7.0/fused-src
+PLATFORM_CCFLAGS += -isystem $(GTEST_DIR)
+PLATFORM_CXXFLAGS += -isystem $(GTEST_DIR)
+
+# This (the first rule) must depend on "all".
+default: all
+
+WARNING_FLAGS = -W -Wextra -Wall -Wsign-compare -Wshadow \
+  -Wno-unused-parameter
+
+ifndef DISABLE_WARNING_AS_ERROR
+	WARNING_FLAGS += -Werror
+endif
+
+CFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CCFLAGS) $(OPT)
+CXXFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT) -Woverloaded-virtual -Wnon-virtual-dtor -Wno-missing-field-initializers
+
+LDFLAGS += $(PLATFORM_LDFLAGS)
+
+date := $(shell date +%F)
+ifdef FORCE_GIT_SHA
+	git_sha := $(FORCE_GIT_SHA)
+else
+	git_sha := $(shell git rev-parse HEAD 2>/dev/null)
+endif
+gen_build_version =							\
+  printf '%s\n'								\
+    '\#include "build_version.h"'					\
+    'const char* rocksdb_build_git_sha =				\
+      "rocksdb_build_git_sha:$(git_sha)";'			\
+    'const char* rocksdb_build_git_date =				\
+      "rocksdb_build_git_date:$(date)";'				\
+    'const char* rocksdb_build_compile_date = __DATE__;'
+
+# Record the version of the source that we are compiling.
+# We keep a record of the git revision in this file.  It is then built
+# as a regular source file as part of the compilation process.
+# One can run "strings executable_filename | grep _build_" to find
+# the version of the source that we used to build the executable file.
+CLEAN_FILES += util/build_version.cc
+FORCE:
+util/build_version.cc: FORCE
+	$(AM_V_GEN)rm -f $@-t
+	$(AM_V_at)$(gen_build_version) > $@-t
+	$(AM_V_at)if test -f $@; then					\
+	  cmp -s $@-t $@ && rm -f $@-t || mv -f $@-t $@;		\
+	else mv -f $@-t $@; fi
+
+LIBOBJECTS = $(LIB_SOURCES:.cc=.o)
+LIBOBJECTS += $(TOOL_SOURCES:.cc=.o)
+MOCKOBJECTS = $(MOCK_SOURCES:.cc=.o)
+
+GTEST = $(GTEST_DIR)/gtest/gtest-all.o
+TESTUTIL = ./util/testutil.o
+TESTHARNESS = ./util/testharness.o $(TESTUTIL) $(MOCKOBJECTS) $(GTEST)
+VALGRIND_ERROR = 2
+VALGRIND_VER := $(join $(VALGRIND_VER),valgrind)
+
+VALGRIND_OPTS = --error-exitcode=$(VALGRIND_ERROR) --leak-check=full
+
+TESTS = \
+	db_test \
+	db_iter_test \
+	db_log_iter_test \
+	db_compaction_filter_test \
+	db_compaction_test \
+	db_dynamic_level_test \
+	db_inplace_update_test \
+	db_tailing_iter_test \
+	db_universal_compaction_test \
+	db_wal_test \
+	block_hash_index_test \
+	autovector_test \
+	column_family_test \
+	table_properties_collector_test \
+	arena_test \
+	auto_roll_logger_test \
+	block_test \
+	bloom_test \
+	dynamic_bloom_test \
+	c_test \
+	cache_test \
+	checkpoint_test \
+	coding_test \
+	corruption_test \
+	crc32c_test \
+	slice_transform_test \
+	dbformat_test \
+	env_test \
+	fault_injection_test \
+	filelock_test \
+	filename_test \
+	file_reader_writer_test \
+	block_based_filter_block_test \
+	full_filter_block_test \
+	histogram_test \
+	log_test \
+	manual_compaction_test \
+	memenv_test \
+	mock_env_test \
+	memtable_list_test \
+	merge_helper_test \
+	merge_test \
+	merger_test \
+	redis_test \
+	reduce_levels_test \
+	plain_table_db_test \
+	comparator_db_test \
+	prefix_test \
+	skiplist_test \
+	stringappend_test \
+	ttl_test \
+	backupable_db_test \
+	document_db_test \
+	json_document_test \
+	spatial_db_test \
+	version_edit_test \
+	version_set_test \
+	compaction_picker_test \
+	version_builder_test \
+	file_indexer_test \
+	write_batch_test \
+	write_batch_with_index_test \
+	write_controller_test\
+	deletefile_test \
+	table_test \
+	thread_local_test \
+	geodb_test \
+	rate_limiter_test \
+	delete_scheduler_test \
+	options_test \
+	event_logger_test \
+	cuckoo_table_builder_test \
+	cuckoo_table_reader_test \
+	cuckoo_table_db_test \
+	flush_job_test \
+	wal_manager_test \
+	listener_test \
+	compaction_iterator_test \
+	compaction_job_test \
+	thread_list_test \
+	sst_dump_test \
+	compact_files_test \
+	perf_context_test \
+	optimistic_transaction_test \
+	write_callback_test \
+	heap_test \
+	compact_on_deletion_collector_test \
+	compaction_job_stats_test \
+	transaction_test \
+	ldb_cmd_test
+
+SUBSET :=  $(shell echo $(TESTS) |sed s/^.*$(ROCKSDBTESTS_START)/$(ROCKSDBTESTS_START)/)
+
+TOOLS = \
+	sst_dump \
+	db_sanity_test \
+	db_stress \
+	ldb \
+	db_repl_stress \
+	rocksdb_dump \
+	rocksdb_undump
+
+BENCHMARKS = db_bench table_reader_bench cache_bench memtablerep_bench
+
+# The library name is configurable since we are maintaining libraries of both
+# debug/release mode.
+ifeq ($(LIBNAME),)
+        LIBNAME=librocksdb
+endif
+LIBRARY = ${LIBNAME}.a
+
+ROCKSDB_MAJOR = $(shell egrep "ROCKSDB_MAJOR.[0-9]" include/rocksdb/version.h | cut -d ' ' -f 3)
+ROCKSDB_MINOR = $(shell egrep "ROCKSDB_MINOR.[0-9]" include/rocksdb/version.h | cut -d ' ' -f 3)
+ROCKSDB_PATCH = $(shell egrep "ROCKSDB_PATCH.[0-9]" include/rocksdb/version.h | cut -d ' ' -f 3)
+
+default: all
+
+#-----------------------------------------------
+# Create platform independent shared libraries.
+#-----------------------------------------------
+ifneq ($(PLATFORM_SHARED_EXT),)
+
+ifneq ($(PLATFORM_SHARED_VERSIONED),true)
+SHARED1 = ${LIBNAME}.$(PLATFORM_SHARED_EXT)
+SHARED2 = $(SHARED1)
+SHARED3 = $(SHARED1)
+SHARED4 = $(SHARED1)
+SHARED = $(SHARED1)
+else
+SHARED_MAJOR = $(ROCKSDB_MAJOR)
+SHARED_MINOR = $(ROCKSDB_MINOR)
+SHARED_PATCH = $(ROCKSDB_PATCH)
+SHARED1 = ${LIBNAME}.$(PLATFORM_SHARED_EXT)
+ifeq ($(PLATFORM), OS_MACOSX)
+SHARED_OSX = $(LIBNAME).$(SHARED_MAJOR)
+SHARED2 = $(SHARED_OSX).$(PLATFORM_SHARED_EXT)
+SHARED3 = $(SHARED_OSX).$(SHARED_MINOR).$(PLATFORM_SHARED_EXT)
+SHARED4 = $(SHARED_OSX).$(SHARED_MINOR).$(SHARED_PATCH).$(PLATFORM_SHARED_EXT)
+else
+SHARED2 = $(SHARED1).$(SHARED_MAJOR)
+SHARED3 = $(SHARED1).$(SHARED_MAJOR).$(SHARED_MINOR)
+SHARED4 = $(SHARED1).$(SHARED_MAJOR).$(SHARED_MINOR).$(SHARED_PATCH)
+endif
+SHARED = $(SHARED1) $(SHARED2) $(SHARED3) $(SHARED4)
+$(SHARED1): $(SHARED4)
+	ln -fs $(SHARED4) $(SHARED1)
+$(SHARED2): $(SHARED4)
+	ln -fs $(SHARED4) $(SHARED2)
+$(SHARED3): $(SHARED4)
+	ln -fs $(SHARED4) $(SHARED3)
+endif
+
+$(SHARED4):
+	$(CXX) $(PLATFORM_SHARED_LDFLAGS)$(SHARED3) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $(LIB_SOURCES) \
+		$(LDFLAGS) -o $@
+
+endif  # PLATFORM_SHARED_EXT
+
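
For example, with RocksDB 4.1.0 on Linux the versioned branch above produces one real library and three symlinks (an illustration of the SHARED1..SHARED4 chain; on OS X the version numbers go before the extension instead):

    librocksdb.so       -> librocksdb.so.4.1.0    # SHARED1
    librocksdb.so.4     -> librocksdb.so.4.1.0    # SHARED2
    librocksdb.so.4.1   -> librocksdb.so.4.1.0    # SHARED3
    librocksdb.so.4.1.0                           # SHARED4, the real file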
+.PHONY: blackbox_crash_test check clean coverage crash_test ldb_tests package \
+	release tags valgrind_check whitebox_crash_test format static_lib shared_lib all \
+	dbg rocksdbjavastatic rocksdbjava install install-static install-shared uninstall \
+	analyze tools
+
+
+all: $(LIBRARY) $(BENCHMARKS) tools $(TESTS)
+
+static_lib: $(LIBRARY)
+
+shared_lib: $(SHARED)
+
+tools: $(TOOLS)
+
+dbg: $(LIBRARY) $(BENCHMARKS) tools $(TESTS)
+
+# creates static library and programs
+release:
+	$(MAKE) clean
+	OPT="-DNDEBUG -O2" $(MAKE) static_lib tools db_bench
+
+coverage:
+	$(MAKE) clean
+	COVERAGEFLAGS="-fprofile-arcs -ftest-coverage" LDFLAGS+="-lgcov" $(MAKE) J=1 all check
+	cd coverage && ./coverage_test.sh
+        # Delete intermediate files
+	find . -type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \;
+
+# Extract the names of db_test's tests by running it with --gtest_list_tests.
+# This filter removes the "#"-introduced comments, and expands to
+# fully-qualified names by changing input like this:
+#
+#   DBTest.
+#     Empty
+#     WriteEmptyBatch
+#   MultiThreaded/MultiThreadedDBTest.
+#     MultiThreaded/0  # GetParam() = 0
+#     MultiThreaded/1  # GetParam() = 1
+#
+# into this:
+#
+#   DBTest.Empty
+#   DBTest.WriteEmptyBatch
+#   MultiThreaded/MultiThreadedDBTest.MultiThreaded/0
+#   MultiThreaded/MultiThreadedDBTest.MultiThreaded/1
+#
+test_names = \
+  ./db_test --gtest_list_tests						\
+    | perl -n								\
+      -e 's/ *\#.*//;'							\
+      -e '/^(\s*)(\S+)/; !$$1 and do {$$p=$$2; break};'			\
+      -e 'print qq! $$p$$2!'
+
+ifeq ($(MAKECMDGOALS),check)
+# Use /dev/shm if it has the sticky bit set (otherwise, /tmp),
+# and create a randomly-named rocksdb.XXXX directory therein.
+# We'll use that directory in the "make check" rules.
+ifeq ($(TMPD),)
+TMPD := $(shell f=/dev/shm; test -k $$f || f=/tmp;			\
+  perl -le 'use File::Temp "tempdir";'					\
+    -e 'print tempdir("'$$f'/rocksdb.XXXX", CLEANUP => 0)')
+endif
+endif
+
+ifneq ($(T),)
+
+# Run all tests in parallel, accumulating per-test logs in t/log-*.
+
+# t_sanitized is each $(T) with "-" in place of each "/".
+t_sanitized = $(subst /,-,$(T))
+
+# t_run is each sanitized name with a leading "t/".
+t_run = $(patsubst %,t/%,$(t_sanitized))
+
+# Each t_run file is a tiny generated Bourne shell script
+# that invokes one of db_test's sub-tests. Why use a file
+# for this?  Because that makes the invocation of parallel
+# below simpler, which in turn makes the parsing of parallel's
+# LOG simpler (the latter is for live monitoring as parallel
+# tests run).
+filter = --gtest_filter=$(subst -,/,$(@F))
+$(t_run): Makefile db_test
+	$(AM_V_GEN)mkdir -p t
+	$(AM_V_at)rm -f $@ $@-t
+	$(AM_V_at)printf '%s\n'						\
+	    '#!/bin/sh'							\
+	    'd=$(TMPD)/$(@F)'						\
+	    'mkdir -p $$d'						\
+	    'TEST_TMPDIR=$$d ./db_test $(filter)'			\
+	  > $@-t
+	$(AM_V_at)chmod a=rx $@-t
+	$(AM_V_at)mv $@-t $@
+
+# Reorder input lines (which are one per test) so that the
+# longest-running tests appear first in the output.
+# Do this by prefixing each selected name with its duration,
+# sorting the resulting names, and removing the leading numbers.
+# FIXME: the "100" we prepend is a fake time, for now.
+# FIXME: squirrel away timings from each run and use them
+# (when present) on subsequent runs to order these tests.
+#
+# Without this reordering, these two tests would happen to start only
+# after almost all other tests had completed, thus adding 100 seconds
+# to the duration of parallel "make check".  That's the difference
+# between 4 minutes (old) and 2m20s (new).
+#
+# 152.120 PASS t/DBTest.FileCreationRandomFailure
+# 107.816 PASS t/DBTest.EncodeDecompressedBlockSizeTest
+#
+slow_test_regexp = \
+  ^t/DBTest\.(?:FileCreationRandomFailure|EncodeDecompressedBlockSizeTest)$$
+prioritize_long_running_tests =						\
+  perl -pe 's,($(slow_test_regexp)),100 $$1,'				\
+    | sort -k1,1gr							\
+    | sed 's/^[.0-9]* //'
+
+# "make check" uses the J variable below to control parallelism:
+# Run with "make J=1 check" to disable parallelism in "make check".
+# Run with "make J=200% check" to run two parallel jobs per core.
+# The default is to run one job per core (J=100%).
+# See "man parallel" for its "-j ..." option.
+J = 100%
+
+# Use this regexp to select the subset of tests whose names match.
+tests-regexp = .
+
+.PHONY: check_0
+check_0: $(t_run)
+	$(AM_V_GEN)export TEST_TMPDIR=$(TMPD);				\
+	printf '%s\n' ''						\
+	  'To monitor subtest <duration,pass/fail,name>,'		\
+	  '  run "make watch-log" in a separate window' '';		\
+	test -t 1 && eta=--eta || eta=;					\
+	{								\
+	  printf './%s\n' $(filter-out db_test, $(TESTS));		\
+	  printf '%s\n' $(t_run);					\
+	}								\
+	  | $(prioritize_long_running_tests)				\
+	  | grep -E '$(tests-regexp)'					\
+	  | parallel -j$(J) --joblog=LOG $$eta --gnu '{} >& t/log-{/}'
+endif
+
+CLEAN_FILES += t LOG $(TMPD)
+
+# When running parallel "make check", you can monitor its progress
+# from another window.
+# Run "make watch-log" to show the duration,PASS/FAIL,name of parallel
+# tests as they are being run.  We sort them so that longer-running ones
+# appear at the top of the list and any failing tests remain at the top
+# regardless of their duration. As with any use of "watch", hit ^C to
+# interrupt.
+watch-log:
+	watch --interval=0 'sort -k7,7nr -k4,4gr LOG|$(quoted_perl_command)'
+
+# If J != 1 and GNU parallel is installed, run the tests in parallel,
+# via the check_0 rule above.  Otherwise, run them sequentially.
+check: all
+	$(AM_V_GEN)if test "$(J)" != 1                                  \
+	    && (parallel --gnu --help 2>/dev/null) |                    \
+	        grep -q 'GNU Parallel';                                 \
+	then                                                            \
+	    t=$$($(test_names));                                        \
+	    $(MAKE) T="$$t" TMPD=$(TMPD) check_0;                       \
+	else                                                            \
+	    for t in $(TESTS); do                                       \
+	      echo "===== Running $$t"; ./$$t || exit 1; done;          \
+	fi
+	rm -rf $(TMPD)
+	python tools/ldb_test.py
+	sh tools/rocksdb_dump_test.sh
+
+check_some: $(SUBSET) ldb_tests
+	for t in $(SUBSET); do echo "===== Running $$t"; ./$$t || exit 1; done
+
+.PHONY: ldb_tests
+ldb_tests: ldb
+	python tools/ldb_test.py
+
+crash_test: whitebox_crash_test blackbox_crash_test
+
+blackbox_crash_test: db_stress
+	python -u tools/db_crashtest.py -s
+	python -u tools/db_crashtest.py
+
+whitebox_crash_test: db_stress
+	python -u tools/db_crashtest2.py -s
+	python -u tools/db_crashtest2.py
+
+asan_check:
+	$(MAKE) clean
+	COMPILE_WITH_ASAN=1 $(MAKE) check -j32
+	$(MAKE) clean
+
+asan_crash_test:
+	$(MAKE) clean
+	COMPILE_WITH_ASAN=1 $(MAKE) crash_test
+	$(MAKE) clean
+
+valgrind_check: $(TESTS)
+	for t in $(filter-out skiplist_test,$(TESTS)); do \
+		$(VALGRIND_VER) $(VALGRIND_OPTS) ./$$t; \
+		ret_code=$$?; \
+		if [ $$ret_code -ne 0 ]; then \
+			exit $$ret_code; \
+		fi; \
+	done
+
+analyze: clean
+	$(CLANG_SCAN_BUILD) --use-analyzer=$(CLANG_ANALYZER) \
+		--use-c++=$(CXX) --use-cc=$(CC) --status-bugs \
+		-o $(CURDIR)/scan_build_report \
+		$(MAKE) dbg
+
+CLEAN_FILES += unity.cc
+unity.cc: Makefile
+	rm -f $@ $@-t
+	for source_file in $(LIB_SOURCES); do \
+		echo "#include \"$$source_file\"" >> $@-t; \
+	done
+	chmod a=r $@-t
+	mv $@-t $@
+
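
Since the unity.cc rule above emits one #include per library source, the generated file begins roughly like this (paths as in $(LIB_SOURCES); illustrative only):

    // unity.cc (generated): the whole library as a single translation unit.
    #include "db/builder.cc"
    #include "db/c.cc"
    #include "db/column_family.cc"
    // ... one #include per remaining $(LIB_SOURCES) entry ...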
+unity.a: unity.o
+	$(AM_V_AR)rm -f $@
+	$(AM_V_at)$(AR) $(ARFLAGS) $@ unity.o
+
+# try compiling db_test with unity
+unity_test: db/db_test.o util/db_test_util.o $(TESTHARNESS) unity.a
+	$(AM_LINK)
+	./unity_test
+
+rocksdb.h rocksdb.cc: build_tools/amalgamate.py Makefile $(LIB_SOURCES) unity.cc
+	build_tools/amalgamate.py -I. -i./include unity.cc -x include/rocksdb/c.h -H rocksdb.h -o rocksdb.cc
+
+clean:
+	rm -f $(BENCHMARKS) $(TOOLS) $(TESTS) $(LIBRARY) $(SHARED)
+	rm -rf $(CLEAN_FILES) ios-x86 ios-arm scan_build_report
+	find . -name "*.[oda]" -exec rm -f {} \;
+	find . -type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \;
+	rm -rf bzip2* snappy* zlib* lz4*
+
+tags:
+	ctags * -R
+	cscope -b `find . -name '*.cc'` `find . -name '*.h'`
+
+format:
+	build_tools/format-diff.sh
+
+package:
+	bash build_tools/make_package.sh $(SHARED_MAJOR).$(SHARED_MINOR)
+
+# ---------------------------------------------------------------------------
+# 	Unit tests and tools
+# ---------------------------------------------------------------------------
+$(LIBRARY): $(LIBOBJECTS)
+	$(AM_V_AR)rm -f $@
+	$(AM_V_at)$(AR) $(ARFLAGS) $@ $(LIBOBJECTS)
+
+db_bench: db/db_bench.o $(LIBOBJECTS) $(TESTUTIL)
+	$(AM_LINK)
+
+cache_bench: util/cache_bench.o $(LIBOBJECTS) $(TESTUTIL)
+	$(AM_LINK)
+
+memtablerep_bench: db/memtablerep_bench.o $(LIBOBJECTS) $(TESTUTIL)
+	$(AM_LINK)
+
+block_hash_index_test: table/block_hash_index_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+db_stress: tools/db_stress.o $(LIBOBJECTS) $(TESTUTIL)
+	$(AM_LINK)
+
+db_sanity_test: tools/db_sanity_test.o $(LIBOBJECTS) $(TESTUTIL)
+	$(AM_LINK)
+
+db_repl_stress: tools/db_repl_stress.o $(LIBOBJECTS) $(TESTUTIL)
+	$(AM_LINK)
+
+arena_test: util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+autovector_test: util/autovector_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+column_family_test: db/column_family_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+table_properties_collector_test: db/table_properties_collector_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+bloom_test: util/bloom_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+dynamic_bloom_test: util/dynamic_bloom_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+c_test: db/c_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+cache_test: util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+coding_test: util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+stringappend_test: utilities/merge_operators/string_append/stringappend_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+redis_test: utilities/redis/redis_lists_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+histogram_test: util/histogram_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+thread_local_test: util/thread_local_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+corruption_test: db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+crc32c_test: util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+slice_transform_test: util/slice_transform_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+db_test: db/db_test.o util/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+db_log_iter_test: db/db_log_iter_test.o util/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+db_compaction_filter_test: db/db_compaction_filter_test.o util/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+db_compaction_test: db/db_compaction_test.o util/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+db_dynamic_level_test: db/db_dynamic_level_test.o util/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+db_inplace_update_test: db/db_inplace_update_test.o util/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+db_tailing_iter_test: db/db_tailing_iter_test.o util/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+db_iter_test: db/db_iter_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+db_universal_compaction_test: db/db_universal_compaction_test.o util/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+db_wal_test: db/db_wal_test.o util/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+log_write_bench: util/log_write_bench.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK) $(pg)
+
+plain_table_db_test: db/plain_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+comparator_db_test: db/comparator_db_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+table_reader_bench: table/table_reader_bench.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK) $(pg)
+
+perf_context_test: db/perf_context_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_V_CCLD)$(CXX) $^ $(EXEC_LDFLAGS) -o $@ $(LDFLAGS)
+
+prefix_test: db/prefix_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_V_CCLD)$(CXX) $^ $(EXEC_LDFLAGS) -o $@ $(LDFLAGS)
+
+backupable_db_test: utilities/backupable/backupable_db_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+checkpoint_test: utilities/checkpoint/checkpoint_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+document_db_test: utilities/document/document_db_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+json_document_test: utilities/document/json_document_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+spatial_db_test: utilities/spatialdb/spatial_db_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+ttl_test: utilities/ttl/ttl_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+write_batch_with_index_test: utilities/write_batch_with_index/write_batch_with_index_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+flush_job_test: db/flush_job_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+compaction_iterator_test: db/compaction_iterator_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+compaction_job_test: db/compaction_job_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+compaction_job_stats_test: db/compaction_job_stats_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+compact_on_deletion_collector_test: utilities/table_properties_collectors/compact_on_deletion_collector_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+wal_manager_test: db/wal_manager_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+dbformat_test: db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+env_test: util/env_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+fault_injection_test: db/fault_injection_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+rate_limiter_test: util/rate_limiter_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+delete_scheduler_test: util/delete_scheduler_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+filename_test: db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+file_reader_writer_test: util/file_reader_writer_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+block_based_filter_block_test: table/block_based_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+full_filter_block_test: table/full_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+log_test: db/log_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+table_test: table/table_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+block_test: table/block_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+skiplist_test: db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+version_edit_test: db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+version_set_test: db/version_set_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+compaction_picker_test: db/compaction_picker_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+version_builder_test: db/version_builder_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+file_indexer_test: db/file_indexer_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+reduce_levels_test: tools/reduce_levels_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+write_batch_test: db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+write_controller_test: db/write_controller_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+merge_helper_test: db/merge_helper_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+merge_test: db/merge_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+merger_test: table/merger_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+deletefile_test: db/deletefile_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+geodb_test: utilities/geodb/geodb_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+rocksdb_dump: tools/dump/rocksdb_dump.o $(LIBOBJECTS)
+	$(AM_LINK)
+
+rocksdb_undump: tools/dump/rocksdb_undump.o $(LIBOBJECTS)
+	$(AM_LINK)
+
+cuckoo_table_builder_test: table/cuckoo_table_builder_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+cuckoo_table_reader_test: table/cuckoo_table_reader_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+cuckoo_table_db_test: db/cuckoo_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+listener_test: db/listener_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+thread_list_test: util/thread_list_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+compact_files_test: db/compact_files_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+options_test: util/options_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+event_logger_test: util/event_logger_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+sst_dump_test: util/sst_dump_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+memenv_test : util/memenv_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+optimistic_transaction_test: utilities/transactions/optimistic_transaction_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+mock_env_test : util/mock_env_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+manual_compaction_test: util/manual_compaction_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+filelock_test: util/filelock_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+auto_roll_logger_test: util/auto_roll_logger_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+memtable_list_test: db/memtable_list_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+write_callback_test: db/write_callback_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+heap_test: util/heap_test.o $(GTEST)
+	$(AM_LINK)
+
+transaction_test: utilities/transactions/transaction_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+sst_dump: tools/sst_dump.o $(LIBOBJECTS)
+	$(AM_LINK)
+
+ldb_cmd_test: util/ldb_cmd_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(AM_LINK)
+
+ldb: tools/ldb.o $(LIBOBJECTS)
+	$(AM_LINK)
+
+#-------------------------------------------------
+# make install related stuff
+INSTALL_PATH ?= /usr/local
+
+uninstall:
+	rm -rf $(INSTALL_PATH)/include/rocksdb \
+	  $(INSTALL_PATH)/lib/$(LIBRARY) \
+	  $(INSTALL_PATH)/lib/$(SHARED4) \
+	  $(INSTALL_PATH)/lib/$(SHARED3) \
+	  $(INSTALL_PATH)/lib/$(SHARED2) \
+	  $(INSTALL_PATH)/lib/$(SHARED1)
+
+install-headers:
+	install -d $(INSTALL_PATH)/lib
+	for header_dir in `find "include/rocksdb" -type d`; do \
+		install -d $(INSTALL_PATH)/$$header_dir; \
+	done
+	for header in `find "include/rocksdb" -type f -name '*.h'`; do \
+		install -C -m 644 $$header $(INSTALL_PATH)/$$header; \
+	done
+
+install-static: install-headers $(LIBRARY)
+	install -C -m 755 $(LIBRARY) $(INSTALL_PATH)/lib
+
+install-shared: install-headers $(SHARED4)
+	install -C -m 755 $(SHARED4) $(INSTALL_PATH)/lib && \
+		ln -fs $(SHARED4) $(INSTALL_PATH)/lib/$(SHARED3) && \
+		ln -fs $(SHARED4) $(INSTALL_PATH)/lib/$(SHARED2) && \
+		ln -fs $(SHARED4) $(INSTALL_PATH)/lib/$(SHARED1)
+
+# install static by default + install shared if it exists
+install: install-static
+	[ -e $(SHARED4) ] && $(MAKE) install-shared || :
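+
+# Example usage (illustrative prefix): install under /opt/rocksdb instead of
+# /usr/local, and remove it again later:
+#   make install INSTALL_PATH=/opt/rocksdb
+#   make uninstall INSTALL_PATH=/opt/rocksdb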
+
+#-------------------------------------------------
+
+
+# ---------------------------------------------------------------------------
+# Jni stuff
+# ---------------------------------------------------------------------------
+
+JAVA_INCLUDE = -I$(JAVA_HOME)/include/ -I$(JAVA_HOME)/include/linux
+ARCH := $(shell getconf LONG_BIT)
+ROCKSDBJNILIB = librocksdbjni-linux$(ARCH).so
+ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux$(ARCH).jar
+ROCKSDB_JAR_ALL = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH).jar
+ROCKSDB_JAVADOCS_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-javadoc.jar
+ROCKSDB_SOURCES_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-sources.jar
+
+ifeq ($(PLATFORM), OS_MACOSX)
+ROCKSDBJNILIB = librocksdbjni-osx.jnilib
+ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-osx.jar
+ifneq ("$(wildcard $(JAVA_HOME)/include/darwin)","")
+	JAVA_INCLUDE = -I$(JAVA_HOME)/include -I $(JAVA_HOME)/include/darwin
+else
+	JAVA_INCLUDE = -I/System/Library/Frameworks/JavaVM.framework/Headers/
+endif
+endif
+
+libz.a:
+	-rm -rf zlib-1.2.8
+	curl -O http://zlib.net/zlib-1.2.8.tar.gz
+	tar xvzf zlib-1.2.8.tar.gz
+	cd zlib-1.2.8 && CFLAGS='-fPIC' ./configure --static && make
+	cp zlib-1.2.8/libz.a .
+
+libbz2.a:
+	-rm -rf bzip2-1.0.6
+	curl -O  http://www.bzip.org/1.0.6/bzip2-1.0.6.tar.gz
+	tar xvzf bzip2-1.0.6.tar.gz
+	cd bzip2-1.0.6 && make CFLAGS='-fPIC -O2 -g -D_FILE_OFFSET_BITS=64'
+	cp bzip2-1.0.6/libbz2.a .
+
+libsnappy.a:
+	-rm -rf snappy-1.1.1
+	curl -O https://snappy.googlecode.com/files/snappy-1.1.1.tar.gz
+	tar xvzf snappy-1.1.1.tar.gz
+	cd snappy-1.1.1 && ./configure --with-pic --enable-static
+	cd snappy-1.1.1 && make
+	cp snappy-1.1.1/.libs/libsnappy.a .
+
+liblz4.a:
+	   -rm -rf lz4-r127
+	   curl -O https://codeload.github.com/Cyan4973/lz4/tar.gz/r127
+	   mv r127 lz4-r127.tar.gz
+	   tar xvzf lz4-r127.tar.gz
+	   cd lz4-r127/lib && make CFLAGS='-fPIC' all
+	   cp lz4-r127/lib/liblz4.a .
+
+# A version of each $(LIBOBJECTS) compiled with -fPIC
+java_libobjects = $(patsubst %,jl/%,$(LIBOBJECTS))
+CLEAN_FILES += jl
+
+$(java_libobjects): jl/%.o: %.cc
+	$(AM_V_CC)mkdir -p $(@D) && $(CXX) $(CXXFLAGS) -fPIC -c $< -o $@ $(COVERAGEFLAGS)
+
+rocksdbjavastatic: $(java_libobjects) libz.a libbz2.a libsnappy.a liblz4.a
+	cd java;$(MAKE) javalib;
+	rm -f ./java/target/$(ROCKSDBJNILIB)
+	$(CXX) $(CXXFLAGS) -I./java/. $(JAVA_INCLUDE) -shared -fPIC \
+	  -o ./java/target/$(ROCKSDBJNILIB) $(JNI_NATIVE_SOURCES) \
+	  $(java_libobjects) $(COVERAGEFLAGS) \
+	  libz.a libbz2.a libsnappy.a liblz4.a $(LDFLAGS)
+	cd java/target;strip -S -x $(ROCKSDBJNILIB)
+	cd java;jar -cf target/$(ROCKSDB_JAR) HISTORY*.md
+	cd java/target;jar -uf $(ROCKSDB_JAR) $(ROCKSDBJNILIB)
+	cd java/target/classes;jar -uf ../$(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class
+	cd java/target/apidocs;jar -cf ../$(ROCKSDB_JAVADOCS_JAR) *
+	cd java/src/main/java;jar -cf ../../../target/$(ROCKSDB_SOURCES_JAR) org
+
+rocksdbjavastaticrelease: rocksdbjavastatic
+	cd java/crossbuild && vagrant destroy -f && vagrant up linux32 && vagrant halt linux32 && vagrant up linux64 && vagrant halt linux64
+	cd java;jar -cf target/$(ROCKSDB_JAR_ALL) HISTORY*.md
+	cd java;jar -uf target/$(ROCKSDB_JAR_ALL) librocksdbjni-*.so librocksdbjni-*.jnilib
+	cd java/target/classes;jar -uf ../$(ROCKSDB_JAR_ALL) org/rocksdb/*.class org/rocksdb/util/*.class
+
+rocksdbjavastaticpublish: rocksdbjavastaticrelease
+	mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-javadoc.jar -Dclassifier=javadoc
+	mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-sources.jar -Dclassifier=sources
+	mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux64.jar -Dclassifier=linux64
+	mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux32.jar -Dclassifier=linux32
+	mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-osx.jar -Dclassifier=osx
+	mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH).jar
+
+rocksdbjava: $(java_libobjects)
+	$(AM_V_GEN)cd java;$(MAKE) javalib;
+	$(AM_V_at)rm -f ./java/target/$(ROCKSDBJNILIB)
+	$(AM_V_at)$(CXX) $(CXXFLAGS) -I./java/. $(JAVA_INCLUDE) -shared -fPIC -o ./java/target/$(ROCKSDBJNILIB) $(JNI_NATIVE_SOURCES) $(java_libobjects) $(JAVA_LDFLAGS) $(COVERAGEFLAGS)
+	$(AM_V_at)cd java;jar -cf target/$(ROCKSDB_JAR) HISTORY*.md
+	$(AM_V_at)cd java/target;jar -uf $(ROCKSDB_JAR) $(ROCKSDBJNILIB)
+	$(AM_V_at)cd java/target/classes;jar -uf ../$(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class
+
+jclean:
+	cd java;$(MAKE) clean;
+
+jtest: rocksdbjava
+	cd java;$(MAKE) sample;$(MAKE) test;
+
+jdb_bench:
+	cd java;$(MAKE) db_bench;
+
+commit-prereq:
+	$(MAKE) clean && $(MAKE) all check;
+	$(MAKE) clean && $(MAKE) jclean && $(MAKE) rocksdbjava;
+	$(MAKE) clean && USE_CLANG=1 $(MAKE) all;
+	$(MAKE) clean && OPT=-DROCKSDB_LITE $(MAKE) static_lib;
+
+xfunc:
+	for xftest in $(XFUNC_TESTS); do \
+		echo "===== Running xftest $$xftest"; \
+		make check ROCKSDB_XFUNC_TEST="$$xftest" tests-regexp="DBTest" ;\
+	done
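+
+# Example usage (the test name here is hypothetical):
+#   make xfunc XFUNC_TESTS="managed_new"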
+
+
+# ---------------------------------------------------------------------------
+#  	Platform-specific compilation
+# ---------------------------------------------------------------------------
+
+ifeq ($(PLATFORM), IOS)
+# For iOS, create universal object files to be used on both the simulator and
+# a device.
+PLATFORMSROOT=/Applications/Xcode.app/Contents/Developer/Platforms
+SIMULATORROOT=$(PLATFORMSROOT)/iPhoneSimulator.platform/Developer
+DEVICEROOT=$(PLATFORMSROOT)/iPhoneOS.platform/Developer
+IOSVERSION=$(shell defaults read $(PLATFORMSROOT)/iPhoneOS.platform/version CFBundleShortVersionString)
+
+.cc.o:
+	mkdir -p ios-x86/$(dir $@)
+	$(CXX) $(CXXFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -arch x86_64 -c $< -o ios-x86/$@
+	mkdir -p ios-arm/$(dir $@)
+	xcrun -sdk iphoneos $(CXX) $(CXXFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -arch armv7s -arch arm64 -c $< -o ios-arm/$@
+	lipo ios-x86/$@ ios-arm/$@ -create -output $@
+
+.c.o:
+	mkdir -p ios-x86/$(dir $@)
+	$(CC) $(CFLAGS) -isysroot $(SIMULATORROOT)/SDKs/iPhoneSimulator$(IOSVERSION).sdk -arch i686 -arch x86_64 -c $< -o ios-x86/$@
+	mkdir -p ios-arm/$(dir $@)
+	xcrun -sdk iphoneos $(CC) $(CFLAGS) -isysroot $(DEVICEROOT)/SDKs/iPhoneOS$(IOSVERSION).sdk -arch armv6 -arch armv7 -arch armv7s -arch arm64 -c $< -o ios-arm/$@
+	lipo ios-x86/$@ ios-arm/$@ -create -output $@
+
+else
+.cc.o:
+	$(AM_V_CC)$(CXX) $(CXXFLAGS) -c $< -o $@ $(COVERAGEFLAGS)
+
+.c.o:
+	$(AM_V_CC)$(CC) $(CFLAGS) -c $< -o $@
+endif
+
+# ---------------------------------------------------------------------------
+#  	Source files dependencies detection
+# ---------------------------------------------------------------------------
+
+all_sources = $(LIB_SOURCES) $(TEST_BENCH_SOURCES) $(MOCK_SOURCES)
+DEPFILES = $(all_sources:.cc=.d)
+
+# Add proper dependency support so changing a .h file forces a .cc file to
+# rebuild.
+
+# A .d file lists a .cc file's dependencies on .h files. We generate such
+# dependencies with g++'s -MM option, whose output is a make dependency rule.
+$(DEPFILES): %.d: %.cc
+	@$(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) \
+	  -MM -MT'$@' -MT'$(<:.cc=.o)' "$<" -o '$@'
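+# For illustration (hypothetical output), db/builder.d would then contain a
+# make rule along the lines of:
+#   db/builder.d db/builder.o: db/builder.cc db/builder.h table/table_builder.h ...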
+
+depend: $(DEPFILES)
+
+# If the make goal is one of "clean", "format", "jclean", "jtest", "package"
+# or "analyze", we shouldn't try to import the *.d files.
+# TODO(kailiu) Unfamiliarity with Make's conditionals led to this ugly but
+# working solution.
+ifneq ($(MAKECMDGOALS),clean)
+ifneq ($(MAKECMDGOALS),format)
+ifneq ($(MAKECMDGOALS),jclean)
+ifneq ($(MAKECMDGOALS),jtest)
+ifneq ($(MAKECMDGOALS),package)
+ifneq ($(MAKECMDGOALS),analyze)
+-include $(DEPFILES)
+endif
+endif
+endif
+endif
+endif
+endif
diff --git a/src/rocksdb/Makefile.am b/src/rocksdb/Makefile.am
deleted file mode 100644
index 55c41ae..0000000
--- a/src/rocksdb/Makefile.am
+++ /dev/null
@@ -1,383 +0,0 @@
-AUTOMAKE_OPTIONS = subdir-objects
-OPT ?= -DNDEBUG
-
-ACLOCAL_AMFLAGS = -I m4
-WARNING_FLAGS = -Wall -Werror
-ROCKSDB_CFLAGS = -g -O2 -fPIC
-ROCKSDB_CXXFLAGS = -std=c++11 -g -O2 -fPIC
-GFLAG = gflags
-noinst_LTLIBRARIES = librocksdb.la
-
-librocksdb_la_CFLAGS = $(WARNING_FLAGS) $(ROCKSDB_CFLAGS) -I$(srcdir) -I$(srcdir)/include -DOS_LINUX -DROCKSDB_PLATFORM_POSIX -fno-builtin-memcmp $(OPT) -DHAVE_JEMALLOC -Woverloaded-virtual
-librocksdb_la_CXXFLAGS = $(WARNING_FLAGS) $(ROCKSDB_CXXFLAGS) -I$(srcdir) -I$(srcdir)/include -I$(srcdir)/third-party/gtest-1.7.0/fused-src -DOS_LINUX -DROCKSDB_PLATFORM_POSIX -fno-builtin-memcmp $(OPT) -DHAVE_JEMALLOC -Woverloaded-virtual -DROCKSDB_ATOMIC_PRESENT -DROCKSDB_FALLOCATE_PRESENT
-librocksdb_la_LDFLAGS = -shared -Wl,-soname -Wl
-if WITH_ATOMIC
-   librocksdb_la_CFLAGS += -DROCKSDB_ATOMIC_PRESENT
-   librocksdb_la_CXXFLAGS += -DROCKSDB_ATOMIC_PRESENT
-endif
-
-if WITH_FALLOCATE
-   librocksdb_la_CFLAGS += -DROCKSDB_FALLOCATE_PRESENT
-   librocksdb_la_CXXFLAGS += -DROCKSDB_FALLOCATE_PRESENT
-endif
-#
-librocksdb_la_LDFLAGS += -lpthread
-if WITH_TCMALLOC
-   librocksdb_la_LDFLAGS += -ltcmalloc
-if WITH_RT
-   librocksdb_la_LDFLAGS += -lrt
-endif
-endif
-if WITH_SNAPPY
-   librocksdb_la_CFLAGS += -DSNAPPY
-   librocksdb_la_CXXFLAGS += -DSNAPPY
-   librocksdb_la_LDFLAGS += -lsnappy
-endif
-if WITH_Z
-   librocksdb_la_CFLAGS += -DZLIB
-   librocksdb_la_CXXFLAGS += -DZLIB
-   librocksdb_la_LDFLAGS += -lz
-endif
-if WITH_BZ2
-   librocksdb_la_CFLAGS += -DBZIP2
-   librocksdb_la_CXXFLAGS += -DBZIP2
-   librocksdb_la_LDFLAGS += -lbz2
-endif
-if WITH_GOOGLE_FLAGS
-   GFLAGS = google
-   librocksdb_la_CFLAGS += -DGFLAGS
-   librocksdb_la_CXXFLAGS += -DGFLAGS
-   librocksdb_la_LDFLAGS += -lgflags
-endif
-if WITH_GFLAGS_FLAGS
-   GFLAGS = gflags
-   librocksdb_la_CFLAGS += -DGFLAGS
-   librocksdb_la_CXXFLAGS += -DGFLAGS
-   librocksdb_la_LDFLAGS += -lgflags
-endif
-
-# Record the version of the source that we are compiling.
-# We keep a record of the git revision in this file.  It is then built
-# as a regular source file as part of the compilation process.
-# One can run "strings executable_filename | grep _build_" to find
-# the version of the source that we used to build the executable file.
-date := $(shell date +%F)
-git_sha := $(shell git describe HEAD 2>/dev/null)
-gen_build_version =                                                     \
-  printf '%s\n'                                                         \
-    '\#include "build_version.h"'                                       \
-    'const char* rocksdb_build_git_sha =                                \
-      "rocksdb_build_git_sha:$(git_sha)";'                              \
-    'const char* rocksdb_build_git_date =                               \
-      "rocksdb_build_git_date:$(date)";'                                \
-    'const char* rocksdb_build_compile_date = __DATE__;'
-$(shell $(gen_build_version) > util/build_version.cc)
-
-SOURCE_FILES = db/builder.cc \
-        db/c.cc \
-        db/column_family.cc \
-        db/compaction.cc \
-        db/compaction_job.cc \
-        db/compaction_picker.cc \
-        db/db_filesnapshot.cc \
-        db/dbformat.cc \
-        db/db_impl.cc \
-        db/db_impl_debug.cc \
-        db/db_impl_readonly.cc \
-        db/db_iter.cc \
-	db/event_logger_helpers.cc \
-        db/file_indexer.cc \
-        db/filename.cc \
-        db/flush_job.cc \
-        db/flush_scheduler.cc \
-        db/forward_iterator.cc \
-        db/internal_stats.cc \
-        db/log_reader.cc \
-        db/log_writer.cc \
-        db/managed_iterator.cc \
-        db/memtable_allocator.cc \
-        db/memtable.cc \
-        db/memtable_list.cc \
-        db/merge_helper.cc \
-        db/merge_operator.cc \
-        db/repair.cc \
-        db/slice.cc \
-        db/table_cache.cc \
-        db/table_properties_collector.cc \
-        db/transaction_log_impl.cc \
-        db/version_builder.cc \
-        db/version_edit.cc \
-        db/version_set.cc \
-        db/wal_manager.cc \
-        db/write_batch.cc \
-        db/write_batch_base.cc \
-        db/write_controller.cc \
-        db/write_thread.cc \
-        port/stack_trace.cc \
-        port/port_posix.cc \
-        table/adaptive_table_factory.cc \
-        table/block_based_filter_block.cc \
-        table/block_based_table_builder.cc \
-        table/block_based_table_factory.cc \
-        table/block_based_table_reader.cc \
-        table/block_builder.cc \
-        table/block.cc \
-        table/block_hash_index.cc \
-        table/block_prefix_index.cc \
-        table/bloom_block.cc \
-        table/cuckoo_table_builder.cc \
-        table/cuckoo_table_factory.cc \
-        table/cuckoo_table_reader.cc \
-        table/flush_block_policy.cc \
-        table/format.cc \
-        table/full_filter_block.cc \
-        table/get_context.cc \
-        table/iterator.cc \
-        table/merger.cc \
-        table/meta_blocks.cc \
-        table/plain_table_builder.cc \
-        table/plain_table_factory.cc \
-        table/plain_table_index.cc \
-        table/plain_table_key_coding.cc \
-        table/plain_table_reader.cc \
-        table/table_properties.cc \
-        table/two_level_iterator.cc \
-        util/arena.cc \
-        util/auto_roll_logger.cc \
-        util/bloom.cc \
-        util/build_version.cc \
-        util/cache.cc \
-        util/coding.cc \
-        util/comparator.cc \
-        util/crc32c.cc \
-        util/db_info_dumper.cc \
-        util/dynamic_bloom.cc \
-        util/env.cc \
-        util/env_hdfs.cc \
-        util/env_posix.cc \
-        util/file_util.cc \
-        util/filter_policy.cc \
-        util/hash.cc \
-        util/hash_cuckoo_rep.cc \
-        util/hash_linklist_rep.cc \
-        util/hash_skiplist_rep.cc \
-        util/histogram.cc \
-        util/instrumented_mutex.cc \
-        util/iostats_context.cc \
-        utilities/backupable/backupable_db.cc \
-        utilities/convenience/convenience.cc \
-        utilities/checkpoint/checkpoint.cc \
-        utilities/compacted_db/compacted_db_impl.cc \
-        utilities/document/document_db.cc \
-        utilities/document/json_document_builder.cc \
-        utilities/document/json_document.cc \
-        utilities/geodb/geodb_impl.cc \
-        utilities/leveldb_options/leveldb_options.cc \
-        utilities/merge_operators/put.cc \
-        utilities/merge_operators/string_append/stringappend2.cc \
-        utilities/merge_operators/string_append/stringappend.cc \
-        utilities/merge_operators/uint64add.cc \
-        utilities/redis/redis_lists.cc \
-        utilities/spatialdb/spatial_db.cc \
-        utilities/ttl/db_ttl_impl.cc \
-        utilities/write_batch_with_index/write_batch_with_index.cc \
-        util/event_logger.cc \
-        util/ldb_cmd.cc \
-        util/ldb_tool.cc \
-        util/log_buffer.cc \
-        util/logging.cc \
-        util/memenv.cc \
-        util/murmurhash.cc \
-        util/mutable_cf_options.cc \
-        util/options_builder.cc \
-        util/options.cc \
-        util/options_helper.cc \
-        util/perf_context.cc \
-        util/rate_limiter.cc \
-        util/skiplistrep.cc \
-        util/slice.cc \
-        util/sst_dump_tool.cc \
-        util/statistics.cc \
-        util/status.cc \
-        util/string_util.cc \
-        util/sync_point.cc \
-        util/thread_local.cc \
-        util/thread_status_impl.cc \
-        util/thread_status_updater.cc \
-        util/thread_status_updater_debug.cc \
-        util/thread_status_util.cc \
-        util/thread_status_util_debug.cc \
-        util/vectorrep.cc \
-        util/xfunc.cc \
-        util/xxhash.cc
-
-SOURCE_H = util/allocator.h \
-        util/arena.h \
-        util/auto_roll_logger.h \
-        util/autovector.h \
-        util/build_version.h \
-        util/coding.h \
-        util/compression.h \
-        util/crc32c.h \
-        util/db_info_dumper.h \
-        util/dynamic_bloom.h \
-        util/event_logger.h \
-        util/file_util.h \
-        util/hash.h \
-        util/hash_cuckoo_rep.h \
-        util/hash_linklist_rep.h \
-        util/hash_skiplist_rep.h \
-        util/histogram.h \
-        util/instrumented_mutex.h \
-        util/iostats_context_imp.h \
-        util/ldb_cmd.h \
-        util/ldb_cmd_execute_result.h \
-        util/log_buffer.h \
-        util/logging.h \
-        util/mock_env.h \
-        util/murmurhash.h \
-        util/mutable_cf_options.h \
-        util/mutexlock.h \
-        util/options_helper.h \
-        util/perf_context_imp.h \
-        util/posix_logger.h \
-        util/random.h \
-        util/rate_limiter.h \
-        util/scoped_arena_iterator.h \
-        util/sst_dump_tool_imp.h \
-        util/statistics.h \
-        util/stl_wrappers.h \
-        util/stop_watch.h \
-        util/string_util.h \
-        util/sync_point.h \
-        util/testharness.h \
-        util/testutil.h \
-        util/thread_local.h \
-        util/thread_operation.h \
-        util/thread_status_updater.h \
-        util/thread_status_util.h \
-        util/xfunc.h \
-        util/xxhash.h \
-        db/builder.h \
-        db/column_family.h \
-        db/compaction.h \
-        db/compaction_job.h \
-        db/compaction_picker.h \
-        db/db_impl.h \
-        db/db_impl_readonly.h \
-        db/db_iter.h \
-        db/dbformat.h \
-        db/file_indexer.h \
-        db/filename.h \
-        db/flush_job.h \
-        db/flush_scheduler.h \
-        db/forward_iterator.h \
-        db/internal_stats.h \
-        db/job_context.h \
-        db/log_format.h \
-        db/log_reader.h \
-        db/log_writer.h \
-        db/managed_iterator.h \
-        db/memtable.h \
-        db/memtable_allocator.h \
-        db/memtable_list.h \
-        db/merge_context.h \
-        db/merge_helper.h \
-        db/skiplist.h \
-        db/snapshot.h \
-        db/table_cache.h \
-        db/table_properties_collector.h \
-        db/transaction_log_impl.h \
-        db/version_builder.h \
-        db/version_edit.h \
-        db/version_set.h \
-        db/wal_manager.h \
-        db/write_batch_internal.h \
-        db/write_controller.h \
-        db/write_thread.h \
-        db/writebuffer.h \
-        table/adaptive_table_factory.h \
-        table/block.h \
-        table/block_based_filter_block.h \
-        table/block_based_table_builder.h \
-        table/block_based_table_factory.h \
-        table/block_based_table_reader.h \
-        table/block_builder.h \
-        table/block_hash_index.h \
-        table/block_prefix_index.h \
-        table/bloom_block.h \
-        table/cuckoo_table_builder.h \
-        table/cuckoo_table_factory.h \
-        table/cuckoo_table_reader.h \
-        table/filter_block.h \
-        table/format.h \
-        table/full_filter_block.h \
-        table/get_context.h \
-        table/iter_heap.h \
-        table/iterator_wrapper.h \
-        table/merger.h \
-        table/meta_blocks.h \
-        table/mock_table.h \
-        table/plain_table_builder.h \
-        table/plain_table_factory.h \
-        table/plain_table_index.h \
-        table/plain_table_key_coding.h \
-        table/plain_table_reader.h \
-        table/table_builder.h \
-        table/table_properties_internal.h \
-        table/table_reader.h \
-        table/two_level_iterator.h \
-	include/utilities/geo_db.h \
-	include/utilities/stackable_db.h \
-	include/utilities/db_ttl.h \
-	include/utilities/utility_db.h \
-	include/utilities/backupable_db.h \
-        include/rocksdb/c.h \
-        include/rocksdb/cache.h \
-        include/rocksdb/compaction_filter.h \
-        include/rocksdb/comparator.h \
-        include/rocksdb/db.h \
-        include/rocksdb/env.h \
-        include/rocksdb/filter_policy.h \
-        include/rocksdb/flush_block_policy.h \
-        include/rocksdb/immutable_options.h \
-        include/rocksdb/iostats_context.h \
-        include/rocksdb/iterator.h \
-        include/rocksdb/ldb_tool.h \
-        include/rocksdb/listener.h \
-        include/rocksdb/memtablerep.h \
-        include/rocksdb/merge_operator.h \
-        include/rocksdb/metadata.h \
-        include/rocksdb/options.h \
-        include/rocksdb/perf_context.h \
-        include/rocksdb/rate_limiter.h \
-        include/rocksdb/slice.h \
-        include/rocksdb/slice_transform.h \
-        include/rocksdb/sst_dump_tool.h \
-        include/rocksdb/statistics.h \
-        include/rocksdb/status.h \
-        include/rocksdb/table.h \
-        include/rocksdb/table_properties.h \
-        include/rocksdb/thread_status.h \
-        include/rocksdb/transaction_log.h \
-        include/rocksdb/types.h \
-        include/rocksdb/universal_compaction.h \
-        include/rocksdb/version.h \
-        include/rocksdb/write_batch.h \
-        include/rocksdb/write_batch_base.h \
-        include/rocksdb/utilities/backupable_db.h \
-        include/rocksdb/utilities/checkpoint.h \
-        include/rocksdb/utilities/convenience.h \
-        include/rocksdb/utilities/db_ttl.h \
-        include/rocksdb/utilities/document_db.h \
-        include/rocksdb/utilities/geo_db.h \
-        include/rocksdb/utilities/json_document.h \
-        include/rocksdb/utilities/leveldb_options.h \
-        include/rocksdb/utilities/spatial_db.h \
-        include/rocksdb/utilities/stackable_db.h \
-        include/rocksdb/utilities/utility_db.h \
-        include/rocksdb/utilities/write_batch_with_index.h
-
-noinst_HEADERS = $(SOURCE_H)
-
-librocksdb_la_SOURCES = $(SOURCE_FILES)
diff --git a/src/rocksdb/ROCKSDB_LITE.md b/src/rocksdb/ROCKSDB_LITE.md
index e7e3752..41cfbec 100644
--- a/src/rocksdb/ROCKSDB_LITE.md
+++ b/src/rocksdb/ROCKSDB_LITE.md
@@ -8,6 +8,7 @@ Some examples of the features disabled by ROCKSDB_LITE:
 * No support for replication (which we provide in the form of TransactionalIterator)
 * No advanced monitoring tools
 * No special-purpose memtables that are highly optimized for specific use cases
+* No Transactions
 
 When adding a new big feature to RocksDB, please add ROCKSDB_LITE compile guard if:
 * Nobody from mobile really needs your feature,
diff --git a/src/rocksdb/USERS.md b/src/rocksdb/USERS.md
index 394aa30..386a23d 100644
--- a/src/rocksdb/USERS.md
+++ b/src/rocksdb/USERS.md
@@ -15,7 +15,7 @@ Two different use cases at Linkedin are using RocksDB as a storage engine:
 Learn more about those use cases in a Tech Talk by Ankit Gupta and Naveen Somasundaram: http://www.youtube.com/watch?v=plqVp_OnSzg
 
 ## Yahoo
-Yahoo is using RocksDB as a storage engine for their biggest distributed data store Sherpa.
+Yahoo is using RocksDB as a storage engine for their biggest distributed data store Sherpa. Learn more about it here: http://yahooeng.tumblr.com/post/120730204806/sherpa-scales-new-heights
 
 ## CockroachDB
 CockroachDB is an open-source geo-replicated transactional database (still in development). They are using RocksDB as their storage engine. Check out their github: https://github.com/cockroachdb/cockroach
@@ -34,3 +34,6 @@ Tango is using RocksDB as a graph storage to store all users' connection data an
 ## Turn
 Turn is using RocksDB as a storage layer for their key/value store, serving at peak 2.4MM QPS out of different datacenters.
 Check out our RocksDB Protobuf merge operator at: https://github.com/vladb38/rocksdb_protobuf
+
+## Santander UK/Cloudera Professional Services
+Check out their blog post: http://blog.cloudera.com/blog/2015/08/inside-santanders-near-real-time-data-ingest-architecture/
diff --git a/src/rocksdb/Vagrantfile b/src/rocksdb/Vagrantfile
new file mode 100644
index 0000000..c517182
--- /dev/null
+++ b/src/rocksdb/Vagrantfile
@@ -0,0 +1,33 @@
+Vagrant.configure("2") do |config|
+
+  config.vm.provider "virtualbox" do |v|
+    v.memory = 4096
+    v.cpus = 2
+  end
+
+  config.vm.define "ubuntu14" do |box|
+    box.vm.box = "ubuntu/trusty64"
+  end
+
+  config.vm.define "centos65" do |box|
+    box.vm.box = "chef/centos-6.5"
+  end
+
+  config.vm.define "FreeBSD10" do |box|
+    box.vm.guest = :freebsd
+    box.vm.box = "robin/freebsd-10"
+    # FreeBSD does not support 'mount_virtualbox_shared_folder', use NFS
+    box.vm.synced_folder ".", "/vagrant", :nfs => true, id: "vagrant-root"
+    box.vm.network "private_network", ip: "10.0.1.10"
+
+    # build everything after creating the VM; pass --no-provision to skip
+    box.vm.provision "shell", inline: <<-SCRIPT
+      pkg install -y gmake clang35
+      export CXX=/usr/local/bin/clang++35
+      cd /vagrant
+      gmake clean
+      gmake all OPT=-g
+    SCRIPT
+  end
+
+end
diff --git a/src/rocksdb/WINDOWS_PORT.md b/src/rocksdb/WINDOWS_PORT.md
new file mode 100644
index 0000000..a0fe1fe
--- /dev/null
+++ b/src/rocksdb/WINDOWS_PORT.md
@@ -0,0 +1,228 @@
+# Microsoft Contribution Notes
+
+## Contributors
+* Alexander Zinoviev https://github.com/zinoale
+* Dmitri Smirnov https://github.com/yuslepukhin
+* Praveen Rao  https://github.com/PraveenSinghRao
+* Sherlock Huang  https://github.com/SherlockNoMad
+
+## Introduction
+RocksDB is a well-proven open source persistent key-value store, optimized for fast storage. It scales with the number of CPUs and storage IOPS to support IO-bound, in-memory and write-once workloads and, most importantly, it is flexible enough to allow for innovation.
+
+As the Microsoft Bing team, we have been continuously pushing hard to improve the scalability and efficiency of our platform and, ultimately, Bing end-user satisfaction. We would like to explore the opportunity to embrace open source, RocksDB in this case, to use, enhance and customize it for our needs, and also to contribute back to the RocksDB community. Herein, we are pleased to offer this RocksDB port for the Windows platform.
+
+These notes describe some decisions and changes we had to make with regards to porting RocksDB on Windows. We hope this will help both reviewers and users of the Windows port.
+We are open for comments and improvements.
+
+## OS specifics
+All of the porting, testing and benchmarking was done on Windows Server 2012 R2 Datacenter 64-bit, but to the best of our knowledge no API we used during porting is unsupported on other Windows releases after Vista.
+
+## Porting goals
+We strive to achieve the following goals:
+* make use of the existing porting interface of RocksDB
+* make minimal modifications within platform-independent code
+* make all unit tests pass in both debug and release builds
+  * Note: the recent introduction of SyncPoint seems to disable running db_test in Release.
+* make performance on par with published benchmarks, accounting for HW differences
+* keep the port code in line with the master branch, with no forking
+
+## Build system
+We have chosen CMake as a widely accepted build system to build the Windows port. It is very fast and convenient. 
+
+At the same time it generates Visual Studio projects that are usable both from the command line and from the IDE.
+
+The top-level CMakeLists.txt file contains a description of all targets and build rules. It also provides brief instructions on how to build the software for Windows. One more build-related file is thirdparty.inc, which also resides at the top level. This file must be edited to point to the actual location of the third-party libraries.
+We think that it would be beneficial to merge the existing make-based build system and the new cmake-based build system into a single one to use on all platforms.
+
+All building and testing was done for 64-bit. We have not conducted any testing for 32-bit and early reports indicate that it will not run on 32-bit.
+
+## C++ and STL notes
+We had to make some minimal changes within the portable files that either account for OS differences or for the shortcomings of C++11 support in the current version of the MS compiler. Most or all of them are expected to be fixed in the upcoming compiler releases.
+
+We plan to use this port for our business purposes here at Bing, which provided the business justification for it. This also means that, at present, we are not free to choose the compiler version at will.
+
+* Certain headers that are not present and not necessary on Windows were simply wrapped in `#ifndef OS_WIN` in a few places (`unistd.h`)
+* All posix-specific headers were replaced with port/port.h, which worked well
+* Replaced `dirent.h` with `port/dirent.h` (very few places), implementing the relevant interfaces within the `rocksdb::port` namespace
+* Replaced `sys/time.h` with `port/sys_time.h` (a few places) and implemented equivalents within `rocksdb::port`
+* The `printf` `%z` specification is not supported on Windows. To imitate the existing standard we came up with a string macro `ROCKSDB_PRIszt` which expands to `zu` on posix systems and to `Iu` on Windows (see the sketch after this list).
+* In-class member initialization was moved into the constructors in some cases
+* `constexpr` is not supported. We had to replace `std::numeric_limits<>::max/min()` with the equivalent C macros for constants. Sometimes we had to make class members `static const` and place a definition within a .cc file.
+* `constexpr` for functions was replaced with a template specialization (1 place)
+* Union members that have non-trivial constructors were replaced with `char[]` in one place, along with bug fixes (spatial experimental feature)
+* Zero-sized arrays are deemed a non-standard extension, so we converted them to size-1 arrays, which should work well for the purposes of these classes.
+* `std::chrono` lacks nanoseconds support (fixed in the upcoming release of the STL) and we had to use `QueryPerformanceCounter()` within env_win.cc
+* Function-local statics initialization is still not thread-safe. We used `std::call_once` to mitigate this within WinEnv.
+
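+A minimal sketch of the idea (illustrative only; the real macro lives in RocksDB's port headers and may differ in detail):
+
+```cpp
+#include <cstdio>
+#include <cstddef>
+
+#ifdef OS_WIN
+#define ROCKSDB_PRIszt "Iu"   // MSVC: "%Iu" prints a size_t
+#else
+#define ROCKSDB_PRIszt "zu"   // POSIX: "%zu" prints a size_t
+#endif
+
+int main() {
+  size_t n = 42;
+  std::printf("n = %" ROCKSDB_PRIszt "\n", n);
+  return 0;
+}
+```
+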
+## Windows Environments notes
+We endeavored to make it functionally on par with posix_env. This means we replicated the functionality of the thread pool and other things as precisely as possible, including:
+* Replicate posix logic using `std::thread` primitives.
+* Implement all posix_env disk access functionality.
+* Set `use_os_buffer=false` to disable OS disk buffering for WinWritableFile and WinRandomAccessFile.
+* Replace `pread/pwrite` with `WriteFile/ReadFile` with an `OVERLAPPED` structure.
+* Use `SetFileInformationByHandle` to compensate for the absence of `fallocate`.
+
+### In detail
+Even though Windows provides its own efficient thread-pool implementation, we chose to replicate the posix logic using `std::thread` primitives. This allows anyone to quickly detect any changes within the posix source code and replicate them within the Windows env. This has proven to work very well. At the same time, anyone who wishes to replace the built-in thread pool can do so using RocksDB stackable environments.
+
+For disk access we implemented all of the functionality present within the posix_env, which includes memory-mapped files, random access, rate-limiter support, etc.
+The `use_os_buffer` flag on Posix platforms currently denotes disabling the read-ahead log via the `fadvise` mechanism. Windows does not have an `fadvise` system call. What is more, it implements its disk cache in a way that differs greatly from Linux. It's not an uncommon practice on Windows to perform un-buffered disk access to gain control of the memory consumption. We think that in our use case this may also be a good configuration option at the expense of disk throughput. To compensate one may inc [...]
+
+We have replaced `pread/pwrite` with `WriteFile/ReadFile` with an `OVERLAPPED` structure, so we can atomically seek to the position of the disk operation while still performing the operation synchronously. Thus we are able to emulate the functionality of `pread/pwrite` reasonably well. The only difference is that the file pointer is not returned to its original position, but that hardly matters given the random nature of access.
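+
+A hedged sketch of that emulation (illustrative names; the actual WinRandomAccessFile code differs in details such as error handling):
+
+```cpp
+#include <windows.h>
+#include <cstdint>
+
+// Read `count` bytes at `offset` without disturbing other users of the handle.
+static bool PReadEmulated(HANDLE file, void* buf, DWORD count,
+                          uint64_t offset, DWORD* bytes_read) {
+  OVERLAPPED ov = {};
+  ov.Offset = static_cast<DWORD>(offset & 0xFFFFFFFFu);
+  ov.OffsetHigh = static_cast<DWORD>(offset >> 32);
+  // On a handle opened for synchronous I/O, passing OVERLAPPED makes
+  // ReadFile seek-and-read atomically, but the call still blocks.
+  return ReadFile(file, buf, count, bytes_read, &ov) != FALSE;
+}
+```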
+
+We used `SetFileInformationByHandle` both to truncate files after writing a full final page to disk and to pre-allocate disk space for faster I/O, thus compensating for the absence of `fallocate`, although some differences remain. For example, the pre-allocated space is not filled with zeros as it is on Linux; on a positive note, however, the end-of-file position is also not modified after pre-allocation.
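+
+A hedged sketch of the pre-allocation half (illustrative; truncation uses `FileEndOfFileInfo` in a similar call):
+
+```cpp
+#include <windows.h>
+#include <cstdint>
+
+// Reserve `bytes` of disk space for `file`, roughly like fallocate().
+static bool PreallocateEmulated(HANDLE file, int64_t bytes) {
+  FILE_ALLOCATION_INFO info = {};
+  info.AllocationSize.QuadPart = bytes;
+  return SetFileInformationByHandle(file, FileAllocationInfo,
+                                    &info, sizeof(info)) != FALSE;
+}
+```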
+
+RocksDB renames, copies and deletes files at will, even though they may be opened with another handle at the same time. We had to relax the sharing restrictions and allow nearly all the concurrent-access permissions possible.
+
+## Thread-Local Storage
+Thread-local storage plays a significant role for RocksDB performance. Rather than creating a separate implementation, we chose to create inline wrappers that forward `pthread_specific` calls to the Windows `Tls` interfaces within the `rocksdb::port` namespace. This leaves the existing meat of the logic intact and unchanged, and just as maintainable.
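+
+A minimal sketch of such a forwarding wrapper (illustrative names; the real wrappers also arrange destructor cleanup, covered below):
+
+```cpp
+#include <windows.h>
+
+typedef DWORD pthread_key_t;
+
+inline int pthread_key_create(pthread_key_t* key, void (*)(void*)) {
+  // Destructor registration is handled separately (see the cleanup note below).
+  *key = TlsAlloc();
+  return (*key == TLS_OUT_OF_INDEXES) ? -1 : 0;
+}
+
+inline void* pthread_getspecific(pthread_key_t key) {
+  return TlsGetValue(key);
+}
+
+inline int pthread_setspecific(pthread_key_t key, const void* value) {
+  return TlsSetValue(key, const_cast<void*>(value)) ? 0 : -1;
+}
+```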
+
+To mitigate the lack of thread-local storage cleanup on thread exit, we added a limited amount of Windows-specific code within the same thread_local.cc file that injects a cleanup callback into a `"__tls"` structure within the `".CRT$XLB"` data segment. This approach guarantees that the callback is invoked regardless of whether RocksDB is used within an executable, a standalone DLL or within another DLL, as sketched below.
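+
+A hedged sketch of the registration mechanism (the real thread_local.cc version does more):
+
+```cpp
+#include <windows.h>
+
+static void NTAPI WinOnThreadExit(PVOID /*module*/, DWORD reason, PVOID /*reserved*/) {
+  if (reason == DLL_THREAD_DETACH) {
+    // Run the registered per-thread cleanup callbacks here.
+  }
+}
+
+// Place a pointer to the callback into the ".CRT$XLB" segment so the loader
+// invokes it on thread attach/detach, for EXEs and DLLs alike.
+// (A /INCLUDE linker pragma is also needed so the symbol is not stripped.)
+#pragma const_seg(".CRT$XLB")
+extern const PIMAGE_TLS_CALLBACK p_thread_callback = WinOnThreadExit;
+#pragma const_seg()
+```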
+
+## Jemalloc usage
+
+When RocksDB is used with Jemalloc, the latter needs to be initialized before any of the C++ globals or statics. To accomplish that we injected an initialization routine into `".CRT$XCT"` that is automatically invoked by the runtime before initializing static objects; je-uninit is queued via `atexit()`.
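+
+A hedged sketch of such an injected initializer (illustrative; only the section name is taken from the text above):
+
+```cpp
+typedef void (__cdecl *_PVFV)(void);
+
+static void __cdecl InitJemallocFirst(void) {
+  // Initialize jemalloc before any other static initializer runs;
+  // queue the matching uninit with atexit() here as well.
+}
+
+// Pointers placed in ".CRT$XCT" run during the CRT's C++ initialization
+// phase, before user static objects in ".CRT$XCU" are constructed.
+#pragma section(".CRT$XCT", read)
+__declspec(allocate(".CRT$XCT")) static _PVFV p_jemalloc_init = InitJemallocFirst;
+```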
+
+The jemalloc redirecting `new/delete` global operators are used by the linker provided certain conditions are met. See the Build system section of these notes.
+
+## Stack Trace and Unhandled Exception Handler
+
+We decided not to implement these two features because the hosting program, as a rule, already provides them.
+We experienced no inconvenience debugging issues in the debugger or analyzing process dumps when needed, and thus we did not see this as a priority.
+
+## Performance results
+### Setup
+All of the benchmarks are run on the same set of machines. Here are the details of the test setup:
+* 2 Intel(R) Xeon(R) E5 2450 0 @ 2.10 GHz (total 16 cores)
+* 2 XK0480GDQPH SSD Device, total 894GB free disk
+* Machine has 128 GB of RAM
+* Operating System: Windows Server 2012 R2 Datacenter
+* 100 Million keys; each key is of size 10 bytes, each value is of size 800 bytes
+* total database size is ~76GB
+* The performance result is based on RocksDB 3.11.
+* The parameters used, unless specified, were exactly the same as published in the GitHub Wiki page. 
+
+### RocksDB on flash storage
+
+#### Test 1. Bulk Load of keys in Random Order
+
+Version 3.11 
+
+* Total Run Time: 17.6 min
+* Fillrandom: 5.480 micros/op 182465 ops/sec;  142.0 MB/s
+* Compact: 486056544.000 micros/op 0 ops/sec
+
+Version 3.10 
+
+* Total Run Time: 16.2 min 
+* Fillrandom: 5.018 micros/op 199269 ops/sec;  155.1 MB/s 
+* Compact: 441313173.000 micros/op 0 ops/sec; 
+
+
+#### Test 2. Bulk Load of keys in Sequential Order
+
+Version 3.11 
+
+* Fillseq: 4.944 micros/op 202k ops/sec;  157.4 MB/s
+
+Version 3.10
+
+* Fillseq: 4.105 micros/op 243.6k ops/sec;  189.6 MB/s 
+
+
+#### Test 3. Random Write
+
+Version 3.11 
+
+* Unbuffered I/O enabled
+* Overwrite: 52.661 micros/op 18.9k ops/sec;   14.8 MB/s
+
+Version 3.10
+
+* Unbuffered I/O enabled 
+* Overwrite: 52.661 micros/op 18.9k ops/sec; 
+
+
+#### Test 4. Random Read
+
+Version 3.11 
+
+* Unbuffered I/O enabled
+* Readrandom: 15.716 micros/op 63.6k ops/sec; 49.5 MB/s 
+
+Version 3.10
+
+* Unbuffered I/O enabled 
+* Readrandom: 15.548 micros/op 64.3k ops/sec; 
+
+
+#### Test 5. Multi-threaded read and single-threaded write
+
+Version 3.11
+
+* Unbuffered I/O enabled
+* Readwhilewriting: 25.128 micros/op 39.7k ops/sec; 
+
+Version 3.10
+
+* Unbuffered I/O enabled 
+* Readwhilewriting: 24.854 micros/op 40.2k ops/sec; 
+
+
+### RocksDB In Memory 
+
+#### Test 1. Point Lookup
+
+Version 3.11
+
+*80K writes/sec*
+* Write Rate Achieved: 40.5k write/sec;
+* Readwhilewriting: 0.314 micros/op 3187455 ops/sec;  364.8 MB/s (715454999 of 715454999 found)
+
+Version 3.10
+
+* Write Rate Achieved:  50.6k write/sec 
+* Readwhilewriting: 0.316 micros/op 3162028 ops/sec; (719576999 of 719576999 found) 
+
+
+*10K writes/sec*
+
+Version 3.11
+
+* Write Rate Achieved: 5.8k/s write/sec
+* Readwhilewriting: 0.246 micros/op 4062669 ops/sec;  464.9 MB/s (915481999 of 915481999 found)
+
+Version 3.10
+
+* Write Rate Achieved: 5.8k/s write/sec 
+* Readwhilewriting: 0.244 micros/op 4106253 ops/sec; (927986999 of 927986999 found) 
+
+
+#### Test 2. Prefix Range Query
+
+Version 3.11
+
+*80K writes/sec*
+* Write Rate Achieved:  46.3k/s write/sec
+* Readwhilewriting: 0.362 micros/op 2765052 ops/sec;  316.4 MB/s (611549999 of 611549999 found)
+
+Version 3.10
+
+* Write Rate Achieved: 45.8k/s write/sec 
+* Readwhilewriting: 0.317 micros/op 3154941 ops/sec; (708158999 of 708158999 found) 
+
+Version 3.11
+
+*10K writes/sec*
+* Write Rate Achieved: 5.78k write/sec
+* Readwhilewriting: 0.269 micros/op 3716692 ops/sec;  425.3 MB/s (837401999 of 837401999 found)
+
+Version 3.10
+
+* Write Rate Achieved: 5.7k write/sec 
+* Readwhilewriting: 0.261 micros/op 3830152 ops/sec; (863482999 of 863482999 found) 
+
+
+We think there is still significant room to improve performance, and this will be an ongoing effort for us.
+
diff --git a/src/rocksdb/appveyor.yml b/src/rocksdb/appveyor.yml
new file mode 100644
index 0000000..e13e2d2
--- /dev/null
+++ b/src/rocksdb/appveyor.yml
@@ -0,0 +1,11 @@
+version: 1.0.{build}
+before_build:
+- md %APPVEYOR_BUILD_FOLDER%\build
+- cd %APPVEYOR_BUILD_FOLDER%\build
+- cmake -G "Visual Studio 12 Win64" ..
+- cd ..
+build:
+  project: build\ALL_BUILD.vcxproj
+  parallel: true
+  verbosity: minimal
+test: off
diff --git a/src/rocksdb/appveyordailytests.yml b/src/rocksdb/appveyordailytests.yml
new file mode 100644
index 0000000..a8b4af6
--- /dev/null
+++ b/src/rocksdb/appveyordailytests.yml
@@ -0,0 +1,22 @@
+version: 1.0.{build}
+before_build:
+- md %APPVEYOR_BUILD_FOLDER%\build
+- cd %APPVEYOR_BUILD_FOLDER%\build
+- cmake -G "Visual Studio 12 Win64" -DOPTDBG=1 ..
+- cd ..
+build:
+  project: build\ALL_BUILD.vcxproj
+  parallel: true
+  verbosity: minimal
+test:
+test_script:
+- ps: build_tools\run_ci_db_test.ps1
+notifications:
+  - provider: Email
+    to:
+      - svmtrocksdb at microsoft.com
+    subject: "Build {{status}}"
+    message: "{{message}}, {{commitId}}, ..."
+    on_build_success: false
+    on_build_failure: true
+    on_build_status_changed: true
diff --git a/src/rocksdb/arcanist_util/__phutil_library_init__.php b/src/rocksdb/arcanist_util/__phutil_library_init__.php
new file mode 100644
index 0000000..bc732ca
--- /dev/null
+++ b/src/rocksdb/arcanist_util/__phutil_library_init__.php
@@ -0,0 +1,3 @@
+<?php
+
+phutil_register_library('arcanist_util', __FILE__);
diff --git a/src/rocksdb/arcanist_util/__phutil_library_map__.php b/src/rocksdb/arcanist_util/__phutil_library_map__.php
new file mode 100644
index 0000000..274ad16
--- /dev/null
+++ b/src/rocksdb/arcanist_util/__phutil_library_map__.php
@@ -0,0 +1,38 @@
+<?php
+
+/**
+ * This file is automatically generated. Use 'arc liberate' to rebuild it.
+ * @generated
+ * @phutil-library-version 2
+ */
+
+phutil_register_library_map(array(
+  '__library_version__' => 2,
+  'class' =>
+  array(
+    'ArcanistCpplintLinter' => 'cpp_linter/ArcanistCpplintLinter.php',
+    'BaseDirectoryScopedFormatLinter' => 'cpp_linter/BaseDirectoryScopedFormatLinter.php',
+    'FacebookArcanistConfiguration' => 'config/FacebookArcanistConfiguration.php',
+    'FacebookFbcodeLintEngine' => 'lint_engine/FacebookFbcodeLintEngine.php',
+    'FacebookFbcodeUnitTestEngine' => 'unit_engine/FacebookFbcodeUnitTestEngine.php',
+    'FacebookHowtoevenLintEngine' => 'lint_engine/FacebookHowtoevenLintEngine.php',
+    'FacebookHowtoevenLinter' => 'cpp_linter/FacebookHowtoevenLinter.php',
+    'FbcodeClangFormatLinter' => 'cpp_linter/FbcodeClangFormatLinter.php',
+    'FbcodeCppLinter' => 'cpp_linter/FbcodeCppLinter.php',
+  ),
+  'function' =>
+  array(
+  ),
+  'xmap' =>
+  array(
+    'ArcanistCpplintLinter' => 'ArcanistLinter',
+    'BaseDirectoryScopedFormatLinter' => 'ArcanistLinter',
+    'FacebookArcanistConfiguration' => 'ArcanistConfiguration',
+    'FacebookFbcodeLintEngine' => 'ArcanistLintEngine',
+    'FacebookFbcodeUnitTestEngine' => 'ArcanistBaseUnitTestEngine',
+    'FacebookHowtoevenLintEngine' => 'ArcanistLintEngine',
+    'FacebookHowtoevenLinter' => 'ArcanistLinter',
+    'FbcodeClangFormatLinter' => 'BaseDirectoryScopedFormatLinter',
+    'FbcodeCppLinter' => 'ArcanistLinter',
+  ),
+));
diff --git a/src/rocksdb/arcanist_util/config/FacebookArcanistConfiguration.php b/src/rocksdb/arcanist_util/config/FacebookArcanistConfiguration.php
new file mode 100644
index 0000000..c345490
--- /dev/null
+++ b/src/rocksdb/arcanist_util/config/FacebookArcanistConfiguration.php
@@ -0,0 +1,35 @@
+<?php
+// Copyright 2004-present Facebook. All Rights Reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+class FacebookArcanistConfiguration extends ArcanistConfiguration {
+
+  public function didRunWorkflow($command,
+                                 ArcanistBaseWorkflow $workflow,
+                                 $error_code) {
+    if ($command == 'diff' && !$workflow->isRawDiffSource()) {
+      $this->maybePushToJenkins($workflow);
+    }
+  }
+
+  //////////////////////////////////////////////////////////////////////
+  /* Send off builds to jenkins */
+  function maybePushToJenkins($workflow) {
+    $diffID = $workflow->getDiffID();
+    if ($diffID === null) {
+      return;
+    }
+
+    $results = $workflow->getTestResults();
+    if (!$results) {
+      return;
+    }
+
+    $url = "https://ci-builds.fb.com/view/rocksdb/job/rocksdb_diff_check/"
+               ."buildWithParameters?token=AUTH&DIFF_ID=$diffID";
+    system("curl --noproxy '*' \"$url\" > /dev/null 2>&1");
+  }
+
+}
diff --git a/src/rocksdb/arcanist_util/cpp_linter/ArcanistCpplintLinter.php b/src/rocksdb/arcanist_util/cpp_linter/ArcanistCpplintLinter.php
new file mode 100644
index 0000000..b9c4137
--- /dev/null
+++ b/src/rocksdb/arcanist_util/cpp_linter/ArcanistCpplintLinter.php
@@ -0,0 +1,88 @@
+<?php
+
+/**
+ * Uses Google's cpplint.py to check code. The RocksDB team forked this file from
+ * phabricator's /src/lint/linter/ArcanistCpplintLinter.php, and customized it
+ * for its own use.
+ *
+ * You can get it here:
+ * http://google-styleguide.googlecode.com/svn/trunk/cpplint/cpplint.py
+ * @group linter
+ */
+final class ArcanistCpplintLinter extends ArcanistLinter {
+
+  public function willLintPaths(array $paths) {
+    return;
+  }
+
+  public function getLinterName() {
+    return 'cpplint.py';
+  }
+
+  public function getLintPath() {
+    $bin = 'cpplint.py';
+    // Search under current dir
+    list($err) = exec_manual('which %s/%s', $this->linterDir(), $bin);
+    if (!$err) {
+      return $this->linterDir().'/'.$bin;
+    }
+
+    // Look for globally installed cpplint.py
+    list($err) = exec_manual('which %s', $bin);
+    if ($err) {
+      throw new ArcanistUsageException(
+        "cpplint.py does not appear to be installed on this system. Install ".
+        "it (e.g., with 'wget \"http://google-styleguide.googlecode.com/".
+        "svn/trunk/cpplint/cpplint.py\"') ".
+        "in your .arcconfig to point to the directory where it resides. ".
+        "Also don't forget to chmod a+x cpplint.py!");
+    }
+
+    return $bin;
+  }
+
+  public function lintPath($path) {
+    $bin = $this->getLintPath();
+    $path = $this->rocksdbDir().'/'.$path;
+
+    $f = new ExecFuture("%C $path", $bin);
+
+    list($err, $stdout, $stderr) = $f->resolve();
+
+    if ($err === 2) {
+      throw new Exception("cpplint failed to run correctly:\n".$stderr);
+    }
+
+    $lines = explode("\n", $stderr);
+    $messages = array();
+    foreach ($lines as $line) {
+      $line = trim($line);
+      $matches = null;
+      $regex = '/^[^:]+:(\d+):\s*(.*)\s*\[(.*)\] \[(\d+)\]$/';
+      if (!preg_match($regex, $line, $matches)) {
+        continue;
+      }
+      foreach ($matches as $key => $match) {
+        $matches[$key] = trim($match);
+      }
+      $message = new ArcanistLintMessage();
+      $message->setPath($path);
+      $message->setLine($matches[1]);
+      $message->setCode($matches[3]);
+      $message->setName($matches[3]);
+      $message->setDescription($matches[2]);
+      $message->setSeverity(ArcanistLintSeverity::SEVERITY_WARNING);
+      $this->addLintMessage($message);
+    }
+  }
+
+  // The path of this linter
+  private function linterDir() {
+    return dirname(__FILE__);
+  }
+
+  // TODO(kaili) a quick and dirty way to figure out rocksdb's root dir.
+  private function rocksdbDir() {
+    return $this->linterDir()."/../..";
+  }
+}
diff --git a/src/rocksdb/arcanist_util/cpp_linter/BaseDirectoryScopedFormatLinter.php b/src/rocksdb/arcanist_util/cpp_linter/BaseDirectoryScopedFormatLinter.php
new file mode 100644
index 0000000..79966e7
--- /dev/null
+++ b/src/rocksdb/arcanist_util/cpp_linter/BaseDirectoryScopedFormatLinter.php
@@ -0,0 +1,74 @@
+<?php
+// Copyright 2004-present Facebook. All Rights Reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+abstract class BaseDirectoryScopedFormatLinter extends ArcanistLinter {
+
+  const LINT_FORMATTING = 1;
+
+  private $changedLines = array();
+  private $rawLintOutput = array();
+
+  abstract protected function getPathsToLint();
+
+  protected function shouldLintPath($path) {
+    foreach ($this->getPathsToLint() as $p) {
+      // check if $path starts with $p
+      if (strncmp($path, $p, strlen($p)) === 0) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  // API to tell this linter which lines were changed
+  final public function setPathChangedLines($path, $changed) {
+    $this->changedLines[$path] = $changed;
+  }
+
+  final public function willLintPaths(array $paths) {
+    $futures = array();
+    foreach ($paths as $path) {
+      if (!$this->shouldLintPath($path)) {
+        continue;
+      }
+
+      $changed = $this->changedLines[$path];
+      if (!isset($changed)) {
+        // do not run linter if there are no changes
+        continue;
+      }
+
+      $futures[$path] = $this->getFormatFuture($path, $changed);
+    }
+
+    foreach (Futures($futures)->limit(8) as $p => $f) {
+      $this->rawLintOutput[$p] = $f->resolvex();
+    }
+  }
+
+  abstract protected function getFormatFuture($path, array $changed);
+  abstract protected function getLintMessage($diff);
+
+  final public function lintPath($path) {
+    if (!isset($this->rawLintOutput[$path])) {
+      return;
+    }
+
+    list($new_content) = $this->rawLintOutput[$path];
+    $old_content = $this->getData($path);
+
+    if ($new_content != $old_content) {
+      $diff = ArcanistDiffUtils::renderDifferences($old_content, $new_content);
+      $this->raiseLintAtOffset(
+        0,
+        self::LINT_FORMATTING,
+        $this->getLintMessage($diff),
+        $old_content,
+        $new_content);
+    }
+  }
+
+}
diff --git a/src/rocksdb/arcanist_util/cpp_linter/FacebookHowtoevenLinter.php b/src/rocksdb/arcanist_util/cpp_linter/FacebookHowtoevenLinter.php
new file mode 100644
index 0000000..6edb114
--- /dev/null
+++ b/src/rocksdb/arcanist_util/cpp_linter/FacebookHowtoevenLinter.php
@@ -0,0 +1,223 @@
+<?php
+// Copyright 2015-present Facebook. All Rights Reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+final class FacebookHowtoevenLinter extends ArcanistLinter {
+
+  const VERSION = 'fd9192f324c36d28136d14380f0b552a1385b59b';
+
+  private $parsedTargets = array();
+
+  public function getLinterName() {
+    return 'Howtoeven';
+  }
+
+  protected function getSeverity($code) {
+    $severities = array(
+      ArcanistLintSeverity::SEVERITY_DISABLED,
+      ArcanistLintSeverity::SEVERITY_ADVICE,
+      ArcanistLintSeverity::SEVERITY_WARNING,
+      ArcanistLintSeverity::SEVERITY_ERROR,
+    );
+    return idx($severities, $code, ArcanistLintSeverity::SEVERITY_WARNING);
+  }
+
+  public function willLintPaths(array $paths) {
+    // Cleanup previous runs.
+    $this->localExecx("rm -rf _build/_lint");
+
+    // Build compilation database.
+    $lintable_paths = $this->getLintablePaths($paths);
+    $interesting_paths = $this->getInterestingPaths($lintable_paths);
+
+    if (!$lintable_paths) {
+      return;
+    }
+
+    // Run lint.
+    try {
+      $this->localExecx(
+        "%C %C -p _build/dev/ %Ls",
+        $this->getBinaryPath(),
+        $this->getFilteredIssues(),
+        $lintable_paths);
+    } catch (CommandException $exception) {
+      PhutilConsole::getConsole()->writeErr($exception->getMessage());
+    }
+
+    // Load results.
+    $result = id(
+      new SQLite3(
+        $this->getProjectRoot().'/_build/_lint/lint.db',
+        SQLITE3_OPEN_READONLY))
+      ->query("SELECT * FROM raised_issues");
+
+    while ($issue = $result->fetchArray(SQLITE3_ASSOC)) {
+      // Skip issues not part of the linted file.
+      if (in_array($issue['file'], $interesting_paths)) {
+        $this->addLintMessage(id(new ArcanistLintMessage())
+          ->setPath($issue['file'])
+          ->setLine($issue['line'])
+          ->setChar($issue['column'])
+          ->setCode('Howtoeven')
+          ->setSeverity($this->getSeverity($issue['severity']))
+          ->setName('Hte-'.$issue['name'])
+          ->setDescription(
+            sprintf(
+              "%s\n\n%s",
+              ($issue['message']) ? $issue['message'] : $issue['description'],
+              $issue['explanation']))
+          ->setOriginalText(idx($issue, 'original', ''))
+          ->setReplacementText(idx($issue, 'replacement', '')));
+      }
+    }
+  }
+
+  public function lintPath($path) {
+  }
+
+  /**
+   * Get the paths that we know how to lint.
+   *
+   * The strategy is to first look whether there's an existing compilation
+   * database and use that if it's exhaustive. We generate our own only if
+   * necessary.
+   */
+  private function getLintablePaths($paths) {
+    // Replace headers with existing sources.
+    for ($i = 0; $i < count($paths); $i++) {
+      if (preg_match("/\.h$/", $paths[$i])) {
+        $header = preg_replace("/\.h$/", ".cpp", $paths[$i]);
+        if (file_exists($header)) {
+          $paths[$i] = $header;
+        }
+      }
+    }
+
+    // Check if database exists and is exhaustive.
+    $available_paths = $this->getAvailablePaths();
+    $lintable_paths = array_intersect($paths, $available_paths);
+    if ($paths === $lintable_paths) {
+      return $lintable_paths;
+    }
+
+    // Generate our own database.
+    $targets = $this->getTargetsFor($paths);
+    if (!$targets) {
+      PhutilConsole::getConsole()->writeErr(
+        "No build targets found for %s\n",
+        implode(', ', $paths));
+      return array();
+    }
+
+    $this->localExecx("./tools/build/bin/fbconfig.par -r %Ls", $targets);
+    $this->localExecx("./tools/build/bin/fbmake.par gen_cdb");
+
+    $available_paths = $this->getAvailablePaths();
+    $lintable_paths = array_intersect($paths, $available_paths);
+    if ($paths != $lintable_paths) {
+      PhutilConsole::getConsole()->writeErr(
+        "Can't lint %s\n",
+        implode(', ', array_diff($paths, $available_paths)));
+    }
+
+    // Return what we know how to lint.
+    return $lintable_paths;
+  }
+
+  /**
+   * Get the available paths in the current compilation database.
+   */
+  private function getAvailablePaths() {
+    $database_path = $this->getProjectRoot()
+      .'/_build/dev/compile_commands.json';
+    if (!file_exists($database_path)) {
+      return array();
+    }
+
+    $entries = json_decode(file_get_contents($database_path), true);
+    $paths = array();
+    foreach ($entries as $entry) {
+      $paths[] = $entry['file'];
+    }
+    return $paths;
+  }
+
+  /**
+   * Search for the targets directories for the given files.
+   */
+  private static function getTargetsFor($paths) {
+    $targets = array();
+    foreach ($paths as $path) {
+      while (($path = dirname($path)) !== '.') {
+        if (in_array('TARGETS', scandir($path))) {
+          $contents = file_get_contents($path.'/TARGETS');
+          if (strpos($contents, 'cpp_binary') !== false) {
+            $targets[] = $path;
+            break;
+          }
+        }
+      }
+    }
+    return array_unique($targets);
+  }
+
+  /**
+   * The paths that we actually want to report on.
+   */
+  private function getInterestingPaths($paths) {
+    $headers = array();
+    foreach ($paths as $path) {
+      $headers[] = preg_replace("/\.cpp$/", ".h", $path);
+    }
+    return array_merge($paths, $headers);
+  }
+
+  /**
+   * The path where the binary is located. Will return the current dewey binary
+   * unless the `HOWTOEVEN_BUILD` environment variable is set.
+   */
+  private function getBinaryPath() {
+    $path = sprintf(
+      "/mnt/dewey/fbcode/.commits/%s/builds/howtoeven/client",
+      self::VERSION);
+
+    $build = getenv('HOWTOEVEN_BUILD');
+    if ($build) {
+      $path = sprintf(
+        "./_build/%s/tools/howtoeven/client",
+        $build);
+      if (!file_exists($path)) {
+        PhutilConsole::getConsole()->writeErr(">> %s does not exist\n", $path);
+        exit(1);
+      }
+    }
+
+    return $path;
+  }
+
+  /**
+   * Execute the command in the root directory.
+   */
+  private function localExecx($command /* , ... */) {
+    $arguments = func_get_args();
+    return newv('ExecFuture', $arguments)
+      ->setCWD($this->getProjectRoot())
+      ->resolvex();
+  }
+
+  /**
+   * The root of the project.
+   */
+  private function getProjectRoot() {
+    return $this->getEngine()->getWorkingCopy()->getProjectRoot();
+  }
+
+  private function getFilteredIssues() {
+    $issues = getenv('HOWTOEVEN_ISSUES');
+    return ($issues) ? csprintf('-issues %s', $issues) : '';
+  }
+
+}
diff --git a/src/rocksdb/arcanist_util/cpp_linter/FbcodeClangFormatLinter.php b/src/rocksdb/arcanist_util/cpp_linter/FbcodeClangFormatLinter.php
new file mode 100644
index 0000000..a94a0be
--- /dev/null
+++ b/src/rocksdb/arcanist_util/cpp_linter/FbcodeClangFormatLinter.php
@@ -0,0 +1,58 @@
+<?php
+// Copyright 2004-present Facebook. All Rights Reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+final class FbcodeClangFormatLinter extends BaseDirectoryScopedFormatLinter {
+
+  const LINT_FORMATTING = 1;
+  const CLANG_FORMAT_BINARY = '/mnt/vol/engshare/admin/scripts/clang-format';
+
+  protected function getPathsToLint() {
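+    // Assumption: an empty path prefix scopes this format linter to every
+    // path in the project.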
+    return array('');
+  }
+
+  public function getLinterName() {
+    return 'CLANG_FORMAT';
+  }
+
+  public function getLintSeverityMap() {
+    return array(
+      self::LINT_FORMATTING => ArcanistLintSeverity::SEVERITY_ADVICE,
+    );
+  }
+
+  public function getLintNameMap() {
+    return array(
+      self::LINT_FORMATTING => pht('Changes are not clang-formatted'),
+    );
+  }
+
+  protected function getFormatFuture($path, array $changed) {
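+    // Restrict clang-format to the changed lines only; e.g. a change on
+    // line 12 contributes " --lines=12:12".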
+    $args = "";
+    foreach ($changed as $key => $value) {
+      $args .= " --lines=$key:$key";
+    }
+
+    $binary = self::CLANG_FORMAT_BINARY;
+    if (!file_exists($binary)) {
+      // trust the $PATH
+      $binary = "clang-format";
+    }
+
+    return new ExecFuture(
+      "%s %s $args",
+      $binary,
+      $this->getEngine()->getFilePathOnDisk($path));
+  }
+
+  protected function getLintMessage($diff) {
+    $link_to_clang_format =
+      "[[ http://fburl.com/clang-format | clang-format ]]";
+    return <<<LINT_MSG
+Changes in this file were not formatted using $link_to_clang_format.
+Please run build_tools/format-diff.sh or `make format`
+LINT_MSG;
+  }
+}
diff --git a/src/rocksdb/arcanist_util/cpp_linter/FbcodeCppLinter.php b/src/rocksdb/arcanist_util/cpp_linter/FbcodeCppLinter.php
new file mode 100644
index 0000000..66eefa0
--- /dev/null
+++ b/src/rocksdb/arcanist_util/cpp_linter/FbcodeCppLinter.php
@@ -0,0 +1,123 @@
+<?php
+// Copyright 2004-present Facebook.  All rights reserved.
+
+class FbcodeCppLinter extends ArcanistLinter {
+  const FLINT      = "/home/engshare/tools/flint";
+  const LINT_ERROR   = 1;
+  const LINT_WARNING = 2;
+  const LINT_ADVICE  = 3;
+  const C_FLAG = "--c_mode=true";
+
+  private $rawLintOutput = array();
+
+  public function willLintPaths(array $paths) {
+    if (!file_exists(self::FLINT)) {
+      return;
+    }
+    $futures = array();
+    foreach ($paths as $p) {
+      $lpath = $this->getEngine()->getFilePathOnDisk($p);
+      $lpath_file = file($lpath);
+      if (preg_match('/\.(c)$/', $lpath) ||
+          preg_match('/-\*-.*Mode: C[; ].*-\*-/', $lpath_file[0]) ||
+          preg_match('/vim(:.*)*:\s*(set\s+)?filetype=c\s*:/', $lpath_file[0])
+          ) {
+        $futures[$p] = new ExecFuture("%s %s %s 2>&1",
+                           self::FLINT, self::C_FLAG,
+                           $this->getEngine()->getFilePathOnDisk($p));
+      } else {
+        $futures[$p] = new ExecFuture("%s %s 2>&1",
+          self::FLINT, $this->getEngine()->getFilePathOnDisk($p));
+      }
+    }
+
+    foreach (Futures($futures)->limit(8) as $p => $f) {
+      $this->rawLintOutput[$p] = $f->resolvex();
+    }
+
+    return;
+  }
+
+  public function getLinterName() {
+    return "FBCPP";
+  }
+
+  public function lintPath($path) {
+    $this->runCppLint($path);
+  }
+
+  private function runCppLint($path) {
+    $msgs = $this->getCppLintOutput($path);
+    foreach ($msgs as $m) {
+      $this->raiseLintAtLine($m['line'], 0, $m['severity'], $m['msg']);
+    }
+  }
+
+  private function adviseOnEachPattern(
+    $path,
+    $regex,
+    $message,
+    $lint_type = self::LINT_ADVICE,
+    $match_idx = 0) {
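+      // Raise one lint message of $lint_type at the offset of every match
+      // of $regex in the file's contents.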
+      $file_data = $this->getData($path);
+      $matches = array();
+      if (!preg_match_all($regex, $file_data, $matches, PREG_OFFSET_CAPTURE)) {
+        return;
+      }
+
+      foreach ($matches[$match_idx] as $match) {
+        list($match_str, $offset) = $match;
+        $this->raiseLintAtOffset($offset, $lint_type, $message, $match_str);
+      }
+  }
+
+  public function getLintSeverityMap() {
+    return array(
+      self::LINT_WARNING => ArcanistLintSeverity::SEVERITY_WARNING,
+      self::LINT_ADVICE  => ArcanistLintSeverity::SEVERITY_ADVICE,
+      self::LINT_ERROR   => ArcanistLintSeverity::SEVERITY_ERROR
+    );
+  }
+
+  public function getLintNameMap() {
+    return array(
+      self::LINT_ADVICE   => "CppLint Advice",
+      self::LINT_WARNING  => "CppLint Warning",
+      self::LINT_ERROR    => "CppLint Error"
+    );
+  }
+
+  private function getCppLintOutput($path) {
+    // If flint was unavailable, willLintPaths() produced no output for $path.
+    if (!isset($this->rawLintOutput[$path])) {
+      return array();
+    }
+    list($output) = $this->rawLintOutput[$path];
+
+    $msgs = array();
+    $current = null;
+    $matches = array();
+    foreach (explode("\n", $output) as $line) {
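+      // Assumed flint output format: "path/file.cpp:123: <message>", with
+      // wrapped continuation lines folded into the current message below;
+      // severity is inferred from "Warning"/"Advice" keywords in the text.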
+      if (preg_match('/.*?:(\d+):(.*)/', $line, $matches)) {
+        if ($current) {
+          $msgs[] = $current;
+        }
+        $line = $matches[1];
+        $text = $matches[2];
+        if (preg_match('/.*Warning.*/', $text)) {
+          $sev = self::LINT_WARNING;
+        } else if (preg_match('/.*Advice.*/', $text)) {
+          $sev = self::LINT_ADVICE;
+        } else {
+          $sev = self::LINT_ERROR;
+        }
+        $current = array('line'     => $line,
+                         'msg'      => $text,
+                         'severity' => $sev);
+      } else if ($current) {
+        $current['msg'] .= ' ' . $line;
+      }
+    }
+    if ($current) {
+      $msgs[] = $current;
+    }
+
+    return $msgs;
+  }
+}
diff --git a/src/rocksdb/arcanist_util/cpp_linter/cpplint.py b/src/rocksdb/arcanist_util/cpp_linter/cpplint.py
new file mode 100755
index 0000000..d620194
--- /dev/null
+++ b/src/rocksdb/arcanist_util/cpp_linter/cpplint.py
@@ -0,0 +1,4767 @@
+#!/usr/bin/python
+# Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree. An additional grant
+# of patent rights can be found in the PATENTS file in the same directory.
+# Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file. See the AUTHORS file for names of contributors.
+#
+# Copyright (c) 2009 Google Inc. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#    * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#    * Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following disclaimer
+# in the documentation and/or other materials provided with the
+# distribution.
+#    * Neither the name of Google Inc. nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+"""Does google-lint on c++ files.
+
+The goal of this script is to identify places in the code that *may*
+be in non-compliance with google style.  It does not attempt to fix
+up these problems -- the point is to educate.  It also does not
+attempt to find all problems, or to ensure that everything it does
+find is legitimately a problem.
+
+In particular, we can get very confused by /* and // inside strings!
+We do a small hack, which is to ignore //'s with "'s after them on the
+same line, but it is far from perfect (in either direction).
+"""
+
+import codecs
+import copy
+import getopt
+import math  # for log
+import os
+import re
+import sre_compile
+import string
+import sys
+import unicodedata
+
+
+_USAGE = """
+Syntax: cpplint.py [--verbose=#] [--output=vs7] [--filter=-x,+y,...]
+                   [--counting=total|toplevel|detailed] [--root=subdir]
+                   [--linelength=digits]
+        <file> [file] ...
+
+  The style guidelines this tries to follow are those in
+    http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml
+
+  Every problem is given a confidence score from 1-5, with 5 meaning we are
+  certain of the problem, and 1 meaning it could be a legitimate construct.
+  This will miss some errors, and is not a substitute for a code review.
+
+  To suppress false-positive errors of a certain category, add a
+  'NOLINT(category)' comment to the line.  NOLINT or NOLINT(*)
+  suppresses errors of all categories on that line.
+
+  The files passed in will be linted; at least one file must be provided.
+  Default linted extensions are .cc, .cpp, .cu, .cuh and .h.  Change the
+  extensions with the --extensions flag.
+
+  Flags:
+
+    output=vs7
+      By default, the output is formatted to ease emacs parsing.  Visual Studio
+      compatible output (vs7) may also be used.  Other formats are unsupported.
+
+    verbose=#
+      Specify a number 0-5 to restrict errors to certain verbosity levels.
+
+    filter=-x,+y,...
+      Specify a comma-separated list of category-filters to apply: only
+      error messages whose category names pass the filters will be printed.
+      (Category names are printed with the message and look like
+      "[whitespace/indent]".)  Filters are evaluated left to right.
+      "-FOO" and "FOO" means "do not print categories that start with FOO".
+      "+FOO" means "do print categories that start with FOO".
+
+      Examples: --filter=-whitespace,+whitespace/braces
+                --filter=whitespace,runtime/printf,+runtime/printf_format
+                --filter=-,+build/include_what_you_use
+
+      To see a list of all the categories used in cpplint, pass no arg:
+         --filter=
+
+    counting=total|toplevel|detailed
+      The total number of errors found is always printed. If
+      'toplevel' is provided, then the count of errors in each of
+      the top-level categories like 'build' and 'whitespace' will
+      also be printed. If 'detailed' is provided, then a count
+      is provided for each category like 'build/class'.
+
+    root=subdir
+      The root directory used for deriving the header guard CPP variable.
+      By default, the header guard CPP variable is calculated as the relative
+      path to the directory that contains .git, .hg, or .svn.  When this flag
+      is specified, the relative path is calculated from the specified
+      directory. If the specified directory does not exist, this flag is
+      ignored.
+
+      Examples:
+        Assuming that src/.git exists, the header guard CPP variables for
+        src/chrome/browser/ui/browser.h are:
+
+        No flag => CHROME_BROWSER_UI_BROWSER_H_
+        --root=chrome => BROWSER_UI_BROWSER_H_
+        --root=chrome/browser => UI_BROWSER_H_
+
+    linelength=digits
+      This is the allowed line length for the project. The default value is
+      80 characters.
+
+      Examples:
+        --linelength=120
+
+    extensions=extension,extension,...
+      The allowed file extensions that cpplint will check
+
+      Examples:
+        --extensions=hpp,cpp
+"""
+
+# We categorize each error message we print.  Here are the categories.
+# We want an explicit list so we can list them all in cpplint --filter=.
+# If you add a new error message with a new category, add it to the list
+# here!  cpplint_unittest.py should tell you if you forget to do this.
+_ERROR_CATEGORIES = [
+  'build/class',
+  'build/deprecated',
+  'build/endif_comment',
+  'build/explicit_make_pair',
+  'build/forward_decl',
+  'build/header_guard',
+  'build/include',
+  'build/include_alpha',
+  'build/include_order',
+  'build/include_what_you_use',
+  'build/namespaces',
+  'build/printf_format',
+  'build/storage_class',
+  'legal/copyright',
+  'readability/alt_tokens',
+  'readability/braces',
+  'readability/casting',
+  'readability/check',
+  'readability/constructors',
+  'readability/fn_size',
+  'readability/function',
+  'readability/multiline_comment',
+  'readability/multiline_string',
+  'readability/namespace',
+  'readability/nolint',
+  'readability/nul',
+  'readability/streams',
+  'readability/todo',
+  'readability/utf8',
+  'runtime/arrays',
+  'runtime/casting',
+  'runtime/explicit',
+  'runtime/int',
+  'runtime/init',
+  'runtime/invalid_increment',
+  'runtime/member_string_references',
+  'runtime/memset',
+  'runtime/operator',
+  'runtime/printf',
+  'runtime/printf_format',
+  'runtime/references',
+  'runtime/string',
+  'runtime/threadsafe_fn',
+  'runtime/vlog',
+  'whitespace/blank_line',
+  'whitespace/braces',
+  'whitespace/comma',
+  'whitespace/comments',
+  'whitespace/empty_conditional_body',
+  'whitespace/empty_loop_body',
+  'whitespace/end_of_line',
+  'whitespace/ending_newline',
+  'whitespace/forcolon',
+  'whitespace/indent',
+  'whitespace/line_length',
+  'whitespace/newline',
+  'whitespace/operators',
+  'whitespace/parens',
+  'whitespace/semicolon',
+  'whitespace/tab',
+  'whitespace/todo'
+  ]
+
+# The default state of the category filter. This is overridden by the --filter=
+# flag. By default all errors are on, so only add here categories that should be
+# off by default (i.e., categories that must be enabled by the --filter= flags).
+# All entries here should start with a '-' or '+', as in the --filter= flag.
+_DEFAULT_FILTERS = []
+
+# We used to check for high-bit characters, but after much discussion we
+# decided those were OK, as long as they were in UTF-8 and didn't represent
+# hard-coded international strings, which belong in a separate i18n file.
+
+
+# C++ headers
+_CPP_HEADERS = frozenset([
+    # Legacy
+    'algobase.h',
+    'algo.h',
+    'alloc.h',
+    'builtinbuf.h',
+    'bvector.h',
+    'complex.h',
+    'defalloc.h',
+    'deque.h',
+    'editbuf.h',
+    'fstream.h',
+    'function.h',
+    'hash_map',
+    'hash_map.h',
+    'hash_set',
+    'hash_set.h',
+    'hashtable.h',
+    'heap.h',
+    'indstream.h',
+    'iomanip.h',
+    'iostream.h',
+    'istream.h',
+    'iterator.h',
+    'list.h',
+    'map.h',
+    'multimap.h',
+    'multiset.h',
+    'ostream.h',
+    'pair.h',
+    'parsestream.h',
+    'pfstream.h',
+    'procbuf.h',
+    'pthread_alloc',
+    'pthread_alloc.h',
+    'rope',
+    'rope.h',
+    'ropeimpl.h',
+    'set.h',
+    'slist',
+    'slist.h',
+    'stack.h',
+    'stdiostream.h',
+    'stl_alloc.h',
+    'stl_relops.h',
+    'streambuf.h',
+    'stream.h',
+    'strfile.h',
+    'strstream.h',
+    'tempbuf.h',
+    'tree.h',
+    'type_traits.h',
+    'vector.h',
+    # 17.6.1.2 C++ library headers
+    'algorithm',
+    'array',
+    'atomic',
+    'bitset',
+    'chrono',
+    'codecvt',
+    'complex',
+    'condition_variable',
+    'deque',
+    'exception',
+    'forward_list',
+    'fstream',
+    'functional',
+    'future',
+    'initializer_list',
+    'iomanip',
+    'ios',
+    'iosfwd',
+    'iostream',
+    'istream',
+    'iterator',
+    'limits',
+    'list',
+    'locale',
+    'map',
+    'memory',
+    'mutex',
+    'new',
+    'numeric',
+    'ostream',
+    'queue',
+    'random',
+    'ratio',
+    'regex',
+    'set',
+    'sstream',
+    'stack',
+    'stdexcept',
+    'streambuf',
+    'string',
+    'strstream',
+    'system_error',
+    'thread',
+    'tuple',
+    'typeindex',
+    'typeinfo',
+    'type_traits',
+    'unordered_map',
+    'unordered_set',
+    'utility',
+    'valarray',
+    'vector',
+    # 17.6.1.2 C++ headers for C library facilities
+    'cassert',
+    'ccomplex',
+    'cctype',
+    'cerrno',
+    'cfenv',
+    'cfloat',
+    'cinttypes',
+    'ciso646',
+    'climits',
+    'clocale',
+    'cmath',
+    'csetjmp',
+    'csignal',
+    'cstdalign',
+    'cstdarg',
+    'cstdbool',
+    'cstddef',
+    'cstdint',
+    'cstdio',
+    'cstdlib',
+    'cstring',
+    'ctgmath',
+    'ctime',
+    'cuchar',
+    'cwchar',
+    'cwctype',
+    ])
+
+# Assertion macros.  These are defined in base/logging.h and
+# testing/base/gunit.h.  Note that the _M versions need to come first
+# for substring matching to work.
+_CHECK_MACROS = [
+    'DCHECK', 'CHECK',
+    'EXPECT_TRUE_M', 'EXPECT_TRUE',
+    'ASSERT_TRUE_M', 'ASSERT_TRUE',
+    'EXPECT_FALSE_M', 'EXPECT_FALSE',
+    'ASSERT_FALSE_M', 'ASSERT_FALSE',
+    ]
+
+# Replacement macros for CHECK/DCHECK/EXPECT_TRUE/EXPECT_FALSE
+_CHECK_REPLACEMENT = dict([(m, {}) for m in _CHECK_MACROS])
+
+for op, replacement in [('==', 'EQ'), ('!=', 'NE'),
+                        ('>=', 'GE'), ('>', 'GT'),
+                        ('<=', 'LE'), ('<', 'LT')]:
+  _CHECK_REPLACEMENT['DCHECK'][op] = 'DCHECK_%s' % replacement
+  _CHECK_REPLACEMENT['CHECK'][op] = 'CHECK_%s' % replacement
+  _CHECK_REPLACEMENT['EXPECT_TRUE'][op] = 'EXPECT_%s' % replacement
+  _CHECK_REPLACEMENT['ASSERT_TRUE'][op] = 'ASSERT_%s' % replacement
+  _CHECK_REPLACEMENT['EXPECT_TRUE_M'][op] = 'EXPECT_%s_M' % replacement
+  _CHECK_REPLACEMENT['ASSERT_TRUE_M'][op] = 'ASSERT_%s_M' % replacement
+
+for op, inv_replacement in [('==', 'NE'), ('!=', 'EQ'),
+                            ('>=', 'LT'), ('>', 'LE'),
+                            ('<=', 'GT'), ('<', 'GE')]:
+  _CHECK_REPLACEMENT['EXPECT_FALSE'][op] = 'EXPECT_%s' % inv_replacement
+  _CHECK_REPLACEMENT['ASSERT_FALSE'][op] = 'ASSERT_%s' % inv_replacement
+  _CHECK_REPLACEMENT['EXPECT_FALSE_M'][op] = 'EXPECT_%s_M' % inv_replacement
+  _CHECK_REPLACEMENT['ASSERT_FALSE_M'][op] = 'ASSERT_%s_M' % inv_replacement
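+
+# e.g. _CHECK_REPLACEMENT['CHECK']['=='] is 'CHECK_EQ', so a CHECK(a == b)
+# can be reported with CHECK_EQ(a, b) as the suggested replacement.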
+
+# Alternative tokens and their replacements.  For full list, see section 2.5
+# Alternative tokens [lex.digraph] in the C++ standard.
+#
+# Digraphs (such as '%:') are not included here since it's a mess to
+# match those on a word boundary.
+_ALT_TOKEN_REPLACEMENT = {
+    'and': '&&',
+    'bitor': '|',
+    'or': '||',
+    'xor': '^',
+    'compl': '~',
+    'bitand': '&',
+    'and_eq': '&=',
+    'or_eq': '|=',
+    'xor_eq': '^=',
+    'not': '!',
+    'not_eq': '!='
+    }
+
+# Compile regular expression that matches all the above keywords.  The "[ =()]"
+# bit is meant to avoid matching these keywords outside of boolean expressions.
+#
+# False positives include C-style multi-line comments and multi-line strings
+# but those have always been troublesome for cpplint.
+_ALT_TOKEN_REPLACEMENT_PATTERN = re.compile(
+    r'[ =()](' + ('|'.join(_ALT_TOKEN_REPLACEMENT.keys())) + r')(?=[ (]|$)')
+
+
+# These constants define types of headers for use with
+# _IncludeState.CheckNextIncludeOrder().
+_C_SYS_HEADER = 1
+_CPP_SYS_HEADER = 2
+_LIKELY_MY_HEADER = 3
+_POSSIBLE_MY_HEADER = 4
+_OTHER_HEADER = 5
+
+# These constants define the current inline assembly state
+_NO_ASM = 0       # Outside of inline assembly block
+_INSIDE_ASM = 1   # Inside inline assembly block
+_END_ASM = 2      # Last line of inline assembly block
+_BLOCK_ASM = 3    # The whole block is an inline assembly block
+
+# Match start of assembly blocks
+_MATCH_ASM = re.compile(r'^\s*(?:asm|_asm|__asm|__asm__)'
+                        r'(?:\s+(volatile|__volatile__))?'
+                        r'\s*[{(]')
+
+
+_regexp_compile_cache = {}
+
+# Finds occurrences of NOLINT or NOLINT(...).
+_RE_SUPPRESSION = re.compile(r'\bNOLINT\b(\([^)]*\))?')
+
+# {str, set(int)}: a map from error categories to sets of linenumbers
+# on which those errors are expected and should be suppressed.
+_error_suppressions = {}
+
+# The root directory used for deriving the header guard CPP variable.
+# This is set by --root flag.
+_root = None
+
+# The allowed line length of files.
+# This is set by --linelength flag.
+_line_length = 80
+
+# The allowed extensions for file names
+# This is set by --extensions flag.
+_valid_extensions = set(['cc', 'h', 'cpp', 'cu', 'cuh'])
+
+def ParseNolintSuppressions(filename, raw_line, linenum, error):
+  """Updates the global list of error-suppressions.
+
+  Parses any NOLINT comments on the current line, updating the global
+  error_suppressions store.  Reports an error if the NOLINT comment
+  was malformed.
+
+  Args:
+    filename: str, the name of the input file.
+    raw_line: str, the line of input text, with comments.
+    linenum: int, the number of the current line.
+    error: function, an error handler.
+  """
+  # FIXME(adonovan): "NOLINT(" is misparsed as NOLINT(*).
+  matched = _RE_SUPPRESSION.search(raw_line)
+  if matched:
+    category = matched.group(1)
+    if category in (None, '(*)'):  # => "suppress all"
+      _error_suppressions.setdefault(None, set()).add(linenum)
+    else:
+      if category.startswith('(') and category.endswith(')'):
+        category = category[1:-1]
+        if category in _ERROR_CATEGORIES:
+          _error_suppressions.setdefault(category, set()).add(linenum)
+        else:
+          error(filename, linenum, 'readability/nolint', 5,
+                'Unknown NOLINT error category: %s' % category)
+
+
+def ResetNolintSuppressions():
+  "Resets the set of NOLINT suppressions to empty."
+  _error_suppressions.clear()
+
+
+def IsErrorSuppressedByNolint(category, linenum):
+  """Returns true if the specified error category is suppressed on this line.
+
+  Consults the global error_suppressions map populated by
+  ParseNolintSuppressions/ResetNolintSuppressions.
+
+  Args:
+    category: str, the category of the error.
+    linenum: int, the current line number.
+  Returns:
+    bool, True iff the error should be suppressed due to a NOLINT comment.
+  """
+  return (linenum in _error_suppressions.get(category, set()) or
+          linenum in _error_suppressions.get(None, set()))
+
+def Match(pattern, s):
+  """Matches the string with the pattern, caching the compiled regexp."""
+  # The regexp compilation caching is inlined in both Match and Search for
+  # performance reasons; factoring it out into a separate function turns out
+  # to be noticeably expensive.
+  if pattern not in _regexp_compile_cache:
+    _regexp_compile_cache[pattern] = sre_compile.compile(pattern)
+  return _regexp_compile_cache[pattern].match(s)
+
+
+def ReplaceAll(pattern, rep, s):
+  """Replaces instances of pattern in a string with a replacement.
+
+  The compiled regex is kept in a cache shared by Match and Search.
+
+  Args:
+    pattern: regex pattern
+    rep: replacement text
+    s: search string
+
+  Returns:
+    string with replacements made (or original string if no replacements)
+  """
+  if pattern not in _regexp_compile_cache:
+    _regexp_compile_cache[pattern] = sre_compile.compile(pattern)
+  return _regexp_compile_cache[pattern].sub(rep, s)
+
+
+def Search(pattern, s):
+  """Searches the string for the pattern, caching the compiled regexp."""
+  if pattern not in _regexp_compile_cache:
+    _regexp_compile_cache[pattern] = sre_compile.compile(pattern)
+  return _regexp_compile_cache[pattern].search(s)
+
+
+class _IncludeState(dict):
+  """Tracks line numbers for includes, and the order in which includes appear.
+
+  As a dict, an _IncludeState object serves as a mapping between include
+  filename and line number on which that file was included.
+
+  Call CheckNextIncludeOrder() once for each header in the file, passing
+  in the type constants defined above. Calls in an illegal order will
+  raise an _IncludeError with an appropriate error message.
+
+  """
+  # self._section will move monotonically through this set. If it ever
+  # needs to move backwards, CheckNextIncludeOrder will raise an error.
+  _INITIAL_SECTION = 0
+  _MY_H_SECTION = 1
+  _C_SECTION = 2
+  _CPP_SECTION = 3
+  _OTHER_H_SECTION = 4
+
+  _TYPE_NAMES = {
+      _C_SYS_HEADER: 'C system header',
+      _CPP_SYS_HEADER: 'C++ system header',
+      _LIKELY_MY_HEADER: 'header this file implements',
+      _POSSIBLE_MY_HEADER: 'header this file may implement',
+      _OTHER_HEADER: 'other header',
+      }
+  _SECTION_NAMES = {
+      _INITIAL_SECTION: "... nothing. (This can't be an error.)",
+      _MY_H_SECTION: 'a header this file implements',
+      _C_SECTION: 'C system header',
+      _CPP_SECTION: 'C++ system header',
+      _OTHER_H_SECTION: 'other header',
+      }
+
+  def __init__(self):
+    dict.__init__(self)
+    self.ResetSection()
+
+  def ResetSection(self):
+    # The name of the current section.
+    self._section = self._INITIAL_SECTION
+    # The path of last found header.
+    self._last_header = ''
+
+  def SetLastHeader(self, header_path):
+    self._last_header = header_path
+
+  def CanonicalizeAlphabeticalOrder(self, header_path):
+    """Returns a path canonicalized for alphabetical comparison.
+
+    - replaces "-" with "_" so they both cmp the same.
+    - removes '-inl' since we don't require them to be after the main header.
+    - lowercase everything, just in case.
+
+    Args:
+      header_path: Path to be canonicalized.
+
+    Returns:
+      Canonicalized path.
+    """
+    return header_path.replace('-inl.h', '.h').replace('-', '_').lower()
+
+  def IsInAlphabeticalOrder(self, clean_lines, linenum, header_path):
+    """Check if a header is in alphabetical order with the previous header.
+
+    Args:
+      clean_lines: A CleansedLines instance containing the file.
+      linenum: The number of the line to check.
+      header_path: Canonicalized header to be checked.
+
+    Returns:
+      Returns true if the header is in alphabetical order.
+    """
+    # If previous section is different from current section, _last_header will
+    # be reset to empty string, so it's always less than current header.
+    #
+    # If previous line was a blank line, assume that the headers are
+    # intentionally sorted the way they are.
+    if (self._last_header > header_path and
+        not Match(r'^\s*$', clean_lines.elided[linenum - 1])):
+      return False
+    return True
+
+  def CheckNextIncludeOrder(self, header_type):
+    """Returns a non-empty error message if the next header is out of order.
+
+    This function also updates the internal state to be ready to check
+    the next include.
+
+    Args:
+      header_type: One of the _XXX_HEADER constants defined above.
+
+    Returns:
+      The empty string if the header is in the right order, or an
+      error message describing what's wrong.
+
+    """
+    error_message = ('Found %s after %s' %
+                     (self._TYPE_NAMES[header_type],
+                      self._SECTION_NAMES[self._section]))
+
+    last_section = self._section
+
+    if header_type == _C_SYS_HEADER:
+      if self._section <= self._C_SECTION:
+        self._section = self._C_SECTION
+      else:
+        self._last_header = ''
+        return error_message
+    elif header_type == _CPP_SYS_HEADER:
+      if self._section <= self._CPP_SECTION:
+        self._section = self._CPP_SECTION
+      else:
+        self._last_header = ''
+        return error_message
+    elif header_type == _LIKELY_MY_HEADER:
+      if self._section <= self._MY_H_SECTION:
+        self._section = self._MY_H_SECTION
+      else:
+        self._section = self._OTHER_H_SECTION
+    elif header_type == _POSSIBLE_MY_HEADER:
+      if self._section <= self._MY_H_SECTION:
+        self._section = self._MY_H_SECTION
+      else:
+        # This will always be the fallback because we're not sure
+        # enough that the header is associated with this file.
+        self._section = self._OTHER_H_SECTION
+    else:
+      assert header_type == _OTHER_HEADER
+      self._section = self._OTHER_H_SECTION
+
+    if last_section != self._section:
+      self._last_header = ''
+
+    return ''
+
+
+class _CppLintState(object):
+  """Maintains module-wide state.."""
+
+  def __init__(self):
+    self.verbose_level = 1  # global setting.
+    self.error_count = 0    # global count of reported errors
+    # filters to apply when emitting error messages
+    self.filters = _DEFAULT_FILTERS[:]
+    self.counting = 'total'  # In what way are we counting errors?
+    self.errors_by_category = {}  # string to int dict storing error counts
+
+    # output format:
+    # "emacs" - format that emacs can parse (default)
+    # "vs7" - format that Microsoft Visual Studio 7 can parse
+    self.output_format = 'emacs'
+
+  def SetOutputFormat(self, output_format):
+    """Sets the output format for errors."""
+    self.output_format = output_format
+
+  def SetVerboseLevel(self, level):
+    """Sets the module's verbosity, and returns the previous setting."""
+    last_verbose_level = self.verbose_level
+    self.verbose_level = level
+    return last_verbose_level
+
+  def SetCountingStyle(self, counting_style):
+    """Sets the module's counting options."""
+    self.counting = counting_style
+
+  def SetFilters(self, filters):
+    """Sets the error-message filters.
+
+    These filters are applied when deciding whether to emit a given
+    error message.
+
+    Args:
+      filters: A string of comma-separated filters (eg "+whitespace/indent").
+               Each filter should start with + or -; else we die.
+
+    Raises:
+      ValueError: The comma-separated filters did not all start with '+' or '-'.
+                  E.g. "-,+whitespace,-whitespace/indent,whitespace/badfilter"
+    """
+    # Default filters always have less priority than the flag ones.
+    self.filters = _DEFAULT_FILTERS[:]
+    for filt in filters.split(','):
+      clean_filt = filt.strip()
+      if clean_filt:
+        self.filters.append(clean_filt)
+    for filt in self.filters:
+      if not (filt.startswith('+') or filt.startswith('-')):
+        raise ValueError('Every filter in --filters must start with + or -'
+                         ' (%s does not)' % filt)
+
+  def ResetErrorCounts(self):
+    """Sets the module's error statistic back to zero."""
+    self.error_count = 0
+    self.errors_by_category = {}
+
+  def IncrementErrorCount(self, category):
+    """Bumps the module's error statistic."""
+    self.error_count += 1
+    if self.counting in ('toplevel', 'detailed'):
+      if self.counting != 'detailed':
+        category = category.split('/')[0]
+      if category not in self.errors_by_category:
+        self.errors_by_category[category] = 0
+      self.errors_by_category[category] += 1
+
+  def PrintErrorCounts(self):
+    """Print a summary of errors by category, and the total."""
+    for category, count in self.errors_by_category.iteritems():
+      sys.stderr.write('Category \'%s\' errors found: %d\n' %
+                       (category, count))
+    sys.stderr.write('Total errors found: %d\n' % self.error_count)
+
+_cpplint_state = _CppLintState()
+
+
+def _OutputFormat():
+  """Gets the module's output format."""
+  return _cpplint_state.output_format
+
+
+def _SetOutputFormat(output_format):
+  """Sets the module's output format."""
+  _cpplint_state.SetOutputFormat(output_format)
+
+
+def _VerboseLevel():
+  """Returns the module's verbosity setting."""
+  return _cpplint_state.verbose_level
+
+
+def _SetVerboseLevel(level):
+  """Sets the module's verbosity, and returns the previous setting."""
+  return _cpplint_state.SetVerboseLevel(level)
+
+
+def _SetCountingStyle(level):
+  """Sets the module's counting options."""
+  _cpplint_state.SetCountingStyle(level)
+
+
+def _Filters():
+  """Returns the module's list of output filters, as a list."""
+  return _cpplint_state.filters
+
+
+def _SetFilters(filters):
+  """Sets the module's error-message filters.
+
+  These filters are applied when deciding whether to emit a given
+  error message.
+
+  Args:
+    filters: A string of comma-separated filters (eg "whitespace/indent").
+             Each filter should start with + or -; else we die.
+  """
+  _cpplint_state.SetFilters(filters)
+
+
+class _FunctionState(object):
+  """Tracks current function name and the number of lines in its body."""
+
+  _NORMAL_TRIGGER = 250  # for --v=0, 500 for --v=1, etc.
+  _TEST_TRIGGER = 400    # about 60% more than _NORMAL_TRIGGER.
+
+  def __init__(self):
+    self.in_a_function = False
+    self.lines_in_function = 0
+    self.current_function = ''
+
+  def Begin(self, function_name):
+    """Start analyzing function body.
+
+    Args:
+      function_name: The name of the function being tracked.
+    """
+    self.in_a_function = True
+    self.lines_in_function = 0
+    self.current_function = function_name
+
+  def Count(self):
+    """Count line in current function body."""
+    if self.in_a_function:
+      self.lines_in_function += 1
+
+  def Check(self, error, filename, linenum):
+    """Report if too many lines in function body.
+
+    Args:
+      error: The function to call with any errors found.
+      filename: The name of the current file.
+      linenum: The number of the line to check.
+    """
+    if Match(r'T(EST|est)', self.current_function):
+      base_trigger = self._TEST_TRIGGER
+    else:
+      base_trigger = self._NORMAL_TRIGGER
+    trigger = base_trigger * 2**_VerboseLevel()
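+    # e.g. at --v=0 a normal function is flagged past 250 lines, at --v=1
+    # past 500 lines, and so on.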
+
+    if self.lines_in_function > trigger:
+      error_level = int(math.log(self.lines_in_function / base_trigger, 2))
+      # base => 0, 2*base => 1, 4*base => 2, ... (each doubling of
+      # lines_in_function past base_trigger raises the level by one)
+      if error_level > 5:
+        error_level = 5
+      error(filename, linenum, 'readability/fn_size', error_level,
+            'Small and focused functions are preferred:'
+            ' %s has %d non-comment lines'
+            ' (error triggered by exceeding %d lines).'  % (
+                self.current_function, self.lines_in_function, trigger))
+
+  def End(self):
+    """Stop analyzing function body."""
+    self.in_a_function = False
+
+
+class _IncludeError(Exception):
+  """Indicates a problem with the include order in a file."""
+  pass
+
+
+class FileInfo:
+  """Provides utility functions for filenames.
+
+  FileInfo provides easy access to the components of a file's path
+  relative to the project root.
+  """
+
+  def __init__(self, filename):
+    self._filename = filename
+
+  def FullName(self):
+    """Make Windows paths like Unix."""
+    return os.path.abspath(self._filename).replace('\\', '/')
+
+  def RepositoryName(self):
+    """FullName after removing the local path to the repository.
+
+    If we have a real absolute path name here we can try to do something smart:
+    detecting the root of the checkout and truncating /path/to/checkout from
+    the name so that we get header guards that don't include things like
+    "C:\Documents and Settings\..." or "/home/username/..." in them and thus
+    people on different computers who have checked the source out to different
+    locations won't see bogus errors.
+    """
+    fullname = self.FullName()
+
+    if os.path.exists(fullname):
+      project_dir = os.path.dirname(fullname)
+
+      if os.path.exists(os.path.join(project_dir, ".svn")):
+        # If there's a .svn file in the current directory, we recursively look
+        # up the directory tree for the top of the SVN checkout
+        root_dir = project_dir
+        one_up_dir = os.path.dirname(root_dir)
+        while os.path.exists(os.path.join(one_up_dir, ".svn")):
+          root_dir = os.path.dirname(root_dir)
+          one_up_dir = os.path.dirname(one_up_dir)
+
+        prefix = os.path.commonprefix([root_dir, project_dir])
+        return fullname[len(prefix) + 1:]
+
+      # Not SVN <= 1.6? Try to find a git, hg, or svn top level directory by
+      # searching up from the current path.
+      root_dir = os.path.dirname(fullname)
+      while (root_dir != os.path.dirname(root_dir) and
+             not os.path.exists(os.path.join(root_dir, ".git")) and
+             not os.path.exists(os.path.join(root_dir, ".hg")) and
+             not os.path.exists(os.path.join(root_dir, ".svn"))):
+        root_dir = os.path.dirname(root_dir)
+
+      if (os.path.exists(os.path.join(root_dir, ".git")) or
+          os.path.exists(os.path.join(root_dir, ".hg")) or
+          os.path.exists(os.path.join(root_dir, ".svn"))):
+        prefix = os.path.commonprefix([root_dir, project_dir])
+        return fullname[len(prefix) + 1:]
+
+    # Don't know what to do; header guard warnings may be wrong...
+    return fullname
+
+  def Split(self):
+    """Splits the file into the directory, basename, and extension.
+
+    For 'chrome/browser/browser.cc', Split() would
+    return ('chrome/browser', 'browser', '.cc')
+
+    Returns:
+      A tuple of (directory, basename, extension).
+    """
+
+    googlename = self.RepositoryName()
+    project, rest = os.path.split(googlename)
+    return (project,) + os.path.splitext(rest)
+
+  def BaseName(self):
+    """File base name - text after the final slash, before the final period."""
+    return self.Split()[1]
+
+  def Extension(self):
+    """File extension - text following the final period."""
+    return self.Split()[2]
+
+  def NoExtension(self):
+    """File has no source file extension."""
+    return '/'.join(self.Split()[0:2])
+
+  def IsSource(self):
+    """File has a source file extension."""
+    return self.Extension()[1:] in ('c', 'cc', 'cpp', 'cxx')
+
+
+def _ShouldPrintError(category, confidence, linenum):
+  """If confidence >= verbose, category passes filter and is not suppressed."""
+
+  # There are three ways we might decide not to print an error message:
+  # a "NOLINT(category)" comment appears in the source,
+  # the verbosity level isn't high enough, or the filters filter it out.
+  if IsErrorSuppressedByNolint(category, linenum):
+    return False
+  if confidence < _cpplint_state.verbose_level:
+    return False
+
+  is_filtered = False
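+  # Filters apply left to right; e.g. with ['-whitespace', '+whitespace/braces']
+  # a 'whitespace/braces' error is first filtered out, then re-enabled.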
+  for one_filter in _Filters():
+    if one_filter.startswith('-'):
+      if category.startswith(one_filter[1:]):
+        is_filtered = True
+    elif one_filter.startswith('+'):
+      if category.startswith(one_filter[1:]):
+        is_filtered = False
+    else:
+      assert False  # should have been checked for in SetFilter.
+  if is_filtered:
+    return False
+
+  return True
+
+
+def Error(filename, linenum, category, confidence, message):
+  """Logs the fact we've found a lint error.
+
+  We log where the error was found, and also our confidence in the error,
+  that is, how certain we are this is a legitimate style regression, and
+  not a misidentification or a use that's sometimes justified.
+
+  False positives can be suppressed by the use of
+  "cpplint(category)"  comments on the offending line.  These are
+  parsed into _error_suppressions.
+
+  Args:
+    filename: The name of the file containing the error.
+    linenum: The number of the line containing the error.
+    category: A string used to describe the "category" this bug
+      falls under: "whitespace", say, or "runtime".  Categories
+      may have a hierarchy separated by slashes: "whitespace/indent".
+    confidence: A number from 1-5 representing a confidence score for
+      the error, with 5 meaning that we are certain of the problem,
+      and 1 meaning that it could be a legitimate construct.
+    message: The error message.
+  """
+  if _ShouldPrintError(category, confidence, linenum):
+    _cpplint_state.IncrementErrorCount(category)
+    if _cpplint_state.output_format == 'vs7':
+      sys.stderr.write('%s(%s):  %s  [%s] [%d]\n' % (
+          filename, linenum, message, category, confidence))
+    elif _cpplint_state.output_format == 'eclipse':
+      sys.stderr.write('%s:%s: warning: %s  [%s] [%d]\n' % (
+          filename, linenum, message, category, confidence))
+    else:
+      sys.stderr.write('%s:%s:  %s  [%s] [%d]\n' % (
+          filename, linenum, message, category, confidence))
+
+
+# Matches standard C++ escape sequences per 2.13.2.3 of the C++ standard.
+_RE_PATTERN_CLEANSE_LINE_ESCAPES = re.compile(
+    r'\\([abfnrtv?"\\\']|\d+|x[0-9a-fA-F]+)')
+# Matches strings.  Escape codes should already be removed by ESCAPES.
+_RE_PATTERN_CLEANSE_LINE_DOUBLE_QUOTES = re.compile(r'"[^"]*"')
+# Matches characters.  Escape codes should already be removed by ESCAPES.
+_RE_PATTERN_CLEANSE_LINE_SINGLE_QUOTES = re.compile(r"'.'")
+# Matches multi-line C++ comments.
+# This RE is a little bit more complicated than one might expect, because we
+# have to take care of space removal so we can handle comments inside
+# statements better.
+# The current rule is: We only clear spaces from both sides when we're at the
+# end of the line. Otherwise, we try to remove spaces from the right side,
+# if this doesn't work we try on left side but only if there's a non-character
+# on the right.
+_RE_PATTERN_CLEANSE_LINE_C_COMMENTS = re.compile(
+    r"""(\s*/\*.*\*/\s*$|
+            /\*.*\*/\s+|
+         \s+/\*.*\*/(?=\W)|
+            /\*.*\*/)""", re.VERBOSE)
+
+
+def IsCppString(line):
+  """Does line terminate so, that the next symbol is in string constant.
+
+  This function does not consider single-line nor multi-line comments.
+
+  Args:
+    line: is a partial line of code starting from the 0..n.
+
+  Returns:
+    True, if next character appended to 'line' is inside a
+    string constant.
+  """
+
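+  # An odd number of unescaped double quotes means the line ends inside a
+  # string constant, e.g. 'printf("hello' does.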
+  line = line.replace(r'\\', 'XX')  # after this, \\" does not match to \"
+  return ((line.count('"') - line.count(r'\"') - line.count("'\"'")) & 1) == 1
+
+
+def CleanseRawStrings(raw_lines):
+  """Removes C++11 raw strings from lines.
+
+    Before:
+      static const char kData[] = R"(
+          multi-line string
+          )";
+
+    After:
+      static const char kData[] = ""
+          (replaced by blank line)
+          "";
+
+  Args:
+    raw_lines: list of raw lines.
+
+  Returns:
+    list of lines with C++11 raw strings replaced by empty strings.
+  """
+
+  delimiter = None
+  lines_without_raw_strings = []
+  for line in raw_lines:
+    if delimiter:
+      # Inside a raw string, look for the end
+      end = line.find(delimiter)
+      if end >= 0:
+        # Found the end of the string, match leading space for this
+        # line and resume copying the original lines, and also insert
+        # a "" on the last line.
+        leading_space = Match(r'^(\s*)\S', line)
+        line = leading_space.group(1) + '""' + line[end + len(delimiter):]
+        delimiter = None
+      else:
+        # Haven't found the end yet, append a blank line.
+        line = ''
+
+    else:
+      # Look for beginning of a raw string.
+      # See 2.14.15 [lex.string] for syntax.
+      matched = Match(r'^(.*)\b(?:R|u8R|uR|UR|LR)"([^\s\\()]*)\((.*)$', line)
+      if matched:
+        delimiter = ')' + matched.group(2) + '"'
+
+        end = matched.group(3).find(delimiter)
+        if end >= 0:
+          # Raw string ended on same line
+          line = (matched.group(1) + '""' +
+                  matched.group(3)[end + len(delimiter):])
+          delimiter = None
+        else:
+          # Start of a multi-line raw string
+          line = matched.group(1) + '""'
+
+    lines_without_raw_strings.append(line)
+
+  # TODO(unknown): if delimiter is not None here, we might want to
+  # emit a warning for unterminated string.
+  return lines_without_raw_strings
+
+
+def FindNextMultiLineCommentStart(lines, lineix):
+  """Find the beginning marker for a multiline comment."""
+  while lineix < len(lines):
+    if lines[lineix].strip().startswith('/*'):
+      # Only return this marker if the comment goes beyond this line
+      if lines[lineix].strip().find('*/', 2) < 0:
+        return lineix
+    lineix += 1
+  return len(lines)
+
+
+def FindNextMultiLineCommentEnd(lines, lineix):
+  """We are inside a comment, find the end marker."""
+  while lineix < len(lines):
+    if lines[lineix].strip().endswith('*/'):
+      return lineix
+    lineix += 1
+  return len(lines)
+
+
+def RemoveMultiLineCommentsFromRange(lines, begin, end):
+  """Clears a range of lines for multi-line comments."""
+  # Having // dummy comments makes the lines non-empty, so we will not get
+  # unnecessary blank line warnings later in the code.
+  for i in range(begin, end):
+    lines[i] = '// dummy'
+
+
+def RemoveMultiLineComments(filename, lines, error):
+  """Removes multiline (c-style) comments from lines."""
+  lineix = 0
+  while lineix < len(lines):
+    lineix_begin = FindNextMultiLineCommentStart(lines, lineix)
+    if lineix_begin >= len(lines):
+      return
+    lineix_end = FindNextMultiLineCommentEnd(lines, lineix_begin)
+    if lineix_end >= len(lines):
+      error(filename, lineix_begin + 1, 'readability/multiline_comment', 5,
+            'Could not find end of multi-line comment')
+      return
+    RemoveMultiLineCommentsFromRange(lines, lineix_begin, lineix_end + 1)
+    lineix = lineix_end + 1
+
+
+def CleanseComments(line):
+  """Removes //-comments and single-line C-style /* */ comments.
+
+  Args:
+    line: A line of C++ source.
+
+  Returns:
+    The line with single-line comments removed.
+  """
+  commentpos = line.find('//')
+  if commentpos != -1 and not IsCppString(line[:commentpos]):
+    line = line[:commentpos].rstrip()
+  # get rid of /* ... */
+  return _RE_PATTERN_CLEANSE_LINE_C_COMMENTS.sub('', line)
+
+
+class CleansedLines(object):
+  """Holds 3 copies of all lines with different preprocessing applied to them.
+
+  1) elided member contains lines without strings and comments,
+  2) lines member contains lines without comments, and
+  3) raw_lines member contains all the lines without processing.
+  All these three members are of <type 'list'>, and of the same length.
+  """
+
+  def __init__(self, lines):
+    self.elided = []
+    self.lines = []
+    self.raw_lines = lines
+    self.num_lines = len(lines)
+    self.lines_without_raw_strings = CleanseRawStrings(lines)
+    for linenum in range(len(self.lines_without_raw_strings)):
+      self.lines.append(CleanseComments(
+          self.lines_without_raw_strings[linenum]))
+      elided = self._CollapseStrings(self.lines_without_raw_strings[linenum])
+      self.elided.append(CleanseComments(elided))
+
+  def NumLines(self):
+    """Returns the number of lines represented."""
+    return self.num_lines
+
+  @staticmethod
+  def _CollapseStrings(elided):
+    """Collapses strings and chars on a line to simple "" or '' blocks.
+
+    We nix strings first so we're not fooled by text like '"http://"'
+
+    Args:
+      elided: The line being processed.
+
+    Returns:
+      The line with collapsed strings.
+    """
+    if not _RE_PATTERN_INCLUDE.match(elided):
+      # Remove escaped characters first to make quote/single quote collapsing
+      # basic.  Things that look like escaped characters shouldn't occur
+      # outside of strings and chars.
+      elided = _RE_PATTERN_CLEANSE_LINE_ESCAPES.sub('', elided)
+      elided = _RE_PATTERN_CLEANSE_LINE_SINGLE_QUOTES.sub("''", elided)
+      elided = _RE_PATTERN_CLEANSE_LINE_DOUBLE_QUOTES.sub('""', elided)
+    return elided
+
+
+def FindEndOfExpressionInLine(line, startpos, depth, startchar, endchar):
+  """Find the position just after the matching endchar.
+
+  Args:
+    line: a CleansedLines line.
+    startpos: start searching at this position.
+    depth: nesting level at startpos.
+    startchar: expression opening character.
+    endchar: expression closing character.
+
+  Returns:
+    On finding matching endchar: (index just after matching endchar, 0)
+    Otherwise: (-1, new depth at end of this line)
+  """
+  for i in xrange(startpos, len(line)):
+    if line[i] == startchar:
+      depth += 1
+    elif line[i] == endchar:
+      depth -= 1
+      if depth == 0:
+        return (i + 1, 0)
+  return (-1, depth)
+
+
+def CloseExpression(clean_lines, linenum, pos):
+  """If input points to ( or { or [ or <, finds the position that closes it.
+
+  If lines[linenum][pos] points to a '(' or '{' or '[' or '<', finds the
+  linenum/pos that correspond to the closing of the expression.
+
+  Args:
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    pos: A position on the line.
+
+  Returns:
+    A tuple (line, linenum, pos) pointer *past* the closing brace, or
+    (line, len(lines), -1) if we never find a close.  Note we ignore
+    strings and comments when matching; and the line we return is the
+    'cleansed' line at linenum.
+  """
+
+  line = clean_lines.elided[linenum]
+  startchar = line[pos]
+  if startchar not in '({[<':
+    return (line, clean_lines.NumLines(), -1)
+  if startchar == '(': endchar = ')'
+  if startchar == '[': endchar = ']'
+  if startchar == '{': endchar = '}'
+  if startchar == '<': endchar = '>'
+
+  # Check first line
+  (end_pos, num_open) = FindEndOfExpressionInLine(
+      line, pos, 0, startchar, endchar)
+  if end_pos > -1:
+    return (line, linenum, end_pos)
+
+  # Continue scanning forward
+  while linenum < clean_lines.NumLines() - 1:
+    linenum += 1
+    line = clean_lines.elided[linenum]
+    (end_pos, num_open) = FindEndOfExpressionInLine(
+        line, 0, num_open, startchar, endchar)
+    if end_pos > -1:
+      return (line, linenum, end_pos)
+
+  # Did not find endchar before end of file, give up
+  return (line, clean_lines.NumLines(), -1)
+
+
+def FindStartOfExpressionInLine(line, endpos, depth, startchar, endchar):
+  """Find position at the matching startchar.
+
+  This is almost the reverse of FindEndOfExpressionInLine, but note
+  that the input position and returned position differs by 1.
+
+  Args:
+    line: a CleansedLines line.
+    endpos: start searching at this position.
+    depth: nesting level at endpos.
+    startchar: expression opening character.
+    endchar: expression closing character.
+
+  Returns:
+    On finding matching startchar: (index at matching startchar, 0)
+    Otherwise: (-1, new depth at beginning of this line)
+  """
+  for i in xrange(endpos, -1, -1):
+    if line[i] == endchar:
+      depth += 1
+    elif line[i] == startchar:
+      depth -= 1
+      if depth == 0:
+        return (i, 0)
+  return (-1, depth)
+
+
+def ReverseCloseExpression(clean_lines, linenum, pos):
+  """If input points to ) or } or ] or >, finds the position that opens it.
+
+  If lines[linenum][pos] points to a ')' or '}' or ']' or '>', finds the
+  linenum/pos that correspond to the opening of the expression.
+
+  Args:
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    pos: A position on the line.
+
+  Returns:
+    A tuple (line, linenum, pos) pointer *at* the opening brace, or
+    (line, 0, -1) if we never find the matching opening brace.  Note
+    we ignore strings and comments when matching; and the line we
+    return is the 'cleansed' line at linenum.
+  """
+  line = clean_lines.elided[linenum]
+  endchar = line[pos]
+  if endchar not in ')}]>':
+    return (line, 0, -1)
+  if endchar == ')': startchar = '('
+  if endchar == ']': startchar = '['
+  if endchar == '}': startchar = '{'
+  if endchar == '>': startchar = '<'
+
+  # Check last line
+  (start_pos, num_open) = FindStartOfExpressionInLine(
+      line, pos, 0, startchar, endchar)
+  if start_pos > -1:
+    return (line, linenum, start_pos)
+
+  # Continue scanning backward
+  while linenum > 0:
+    linenum -= 1
+    line = clean_lines.elided[linenum]
+    (start_pos, num_open) = FindStartOfExpressionInLine(
+        line, len(line) - 1, num_open, startchar, endchar)
+    if start_pos > -1:
+      return (line, linenum, start_pos)
+
+  # Did not find startchar before beginning of file, give up
+  return (line, 0, -1)
+
+
+def CheckForCopyright(filename, lines, error):
+  """Logs an error if no Copyright message appears at the top of the file."""
+
+  # We'll say it should occur by line 10. Don't forget there's a
+  # dummy line at the front.
+  for line in xrange(1, min(len(lines), 11)):
+    if re.search(r'Copyright', lines[line], re.I): break
+  else:                       # means no copyright line was found
+    error(filename, 0, 'legal/copyright', 5,
+          'No copyright message found.  '
+          'You should have a line: "Copyright [year] <Copyright Owner>"')
+
+
+def GetHeaderGuardCPPVariable(filename):
+  """Returns the CPP variable that should be used as a header guard.
+
+  Args:
+    filename: The name of a C++ header file.
+
+  Returns:
+    The CPP variable that should be used as a header guard in the
+    named file.
+
+  """
+
+  # Restores the original filename in case cpplint is invoked from Emacs's
+  # flymake.
+  filename = re.sub(r'_flymake\.h$', '.h', filename)
+  filename = re.sub(r'/\.flymake/([^/]*)$', r'/\1', filename)
+
+  fileinfo = FileInfo(filename)
+  file_path_from_root = fileinfo.RepositoryName()
+  if _root:
+    file_path_from_root = re.sub('^' + _root + os.sep, '', file_path_from_root)
+  return re.sub(r'[-./\s]', '_', file_path_from_root).upper() + '_'
+
+
+def CheckForHeaderGuard(filename, lines, error):
+  """Checks that the file contains a header guard.
+
+  Logs an error if no #ifndef header guard is present, and checks
+  that the guard is derived from the file's full pathname.
+
+  Args:
+    filename: The name of the C++ header file.
+    lines: An array of strings, each representing a line of the file.
+    error: The function to call with any errors found.
+  """
+
+  cppvar = GetHeaderGuardCPPVariable(filename)
+
+  ifndef = None
+  ifndef_linenum = 0
+  define = None
+  endif = None
+  endif_linenum = 0
+  for linenum, line in enumerate(lines):
+    # Already been well guarded, no need for further checking.
+    if line.strip() == "#pragma once":
+        return
+    linesplit = line.split()
+    if len(linesplit) >= 2:
+      # find the first occurrence of #ifndef and #define, save arg
+      if not ifndef and linesplit[0] == '#ifndef':
+        # set ifndef to the header guard presented on the #ifndef line.
+        ifndef = linesplit[1]
+        ifndef_linenum = linenum
+      if not define and linesplit[0] == '#define':
+        define = linesplit[1]
+    # find the last occurrence of #endif, save entire line
+    if line.startswith('#endif'):
+      endif = line
+      endif_linenum = linenum
+
+  if not ifndef:
+    error(filename, 0, 'build/header_guard', 5,
+          'No #ifndef header guard found, suggested CPP variable is: %s' %
+          cppvar)
+    return
+
+  if not define:
+    error(filename, 0, 'build/header_guard', 5,
+          'No #define header guard found, suggested CPP variable is: %s' %
+          cppvar)
+    return
+
+  # The guard should be PATH_FILE_H_, but we also allow PATH_FILE_H__
+  # for backward compatibility.
+  if ifndef != cppvar:
+    error_level = 0
+    if ifndef != cppvar + '_':
+      error_level = 5
+
+    ParseNolintSuppressions(filename, lines[ifndef_linenum], ifndef_linenum,
+                            error)
+    error(filename, ifndef_linenum, 'build/header_guard', error_level,
+          '#ifndef header guard has wrong style, please use: %s' % cppvar)
+
+  if define != ifndef:
+    error(filename, 0, 'build/header_guard', 5,
+          '#ifndef and #define don\'t match, suggested CPP variable is: %s' %
+          cppvar)
+    return
+
+  if endif != ('#endif  // %s' % cppvar):
+    error_level = 0
+    if endif != ('#endif  // %s' % (cppvar + '_')):
+      error_level = 5
+
+    ParseNolintSuppressions(filename, lines[endif_linenum], endif_linenum,
+                            error)
+    error(filename, endif_linenum, 'build/header_guard', error_level,
+          '#endif line should be "#endif  // %s"' % cppvar)
+
+
+def CheckForBadCharacters(filename, lines, error):
+  """Logs an error for each line containing bad characters.
+
+  Two kinds of bad characters:
+
+  1. Unicode replacement characters: These indicate that either the file
+  contained invalid UTF-8 (likely) or Unicode replacement characters (which
+  it shouldn't).  Note that it's possible for this to throw off line
+  numbering if the invalid UTF-8 occurred adjacent to a newline.
+
+  2. NUL bytes.  These are problematic for some tools.
+
+  Args:
+    filename: The name of the current file.
+    lines: An array of strings, each representing a line of the file.
+    error: The function to call with any errors found.
+  """
+  for linenum, line in enumerate(lines):
+    if u'\ufffd' in line:
+      error(filename, linenum, 'readability/utf8', 5,
+            'Line contains invalid UTF-8 (or Unicode replacement character).')
+    if '\0' in line:
+      error(filename, linenum, 'readability/nul', 5, 'Line contains NUL byte.')
+
+
+def CheckForNewlineAtEOF(filename, lines, error):
+  """Logs an error if there is no newline char at the end of the file.
+
+  Args:
+    filename: The name of the current file.
+    lines: An array of strings, each representing a line of the file.
+    error: The function to call with any errors found.
+  """
+
+  # The array lines() was created by adding two newlines to the
+  # original file (go figure), then splitting on \n.
+  # To verify that the file ends in \n, we just have to make sure the
+  # last-but-two element of lines() exists and is empty.
+  if len(lines) < 3 or lines[-2]:
+    error(filename, len(lines) - 2, 'whitespace/ending_newline', 5,
+          'Could not find a newline character at the end of the file.')
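+
+# Note: with the construction described above, lines[-2] holds whatever
+# follows the file's last '\n', so it is empty exactly when the file
+# ends with a newline; a truthy lines[-2] means the final line is
+# unterminated.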
+
+
+def CheckForMultilineCommentsAndStrings(filename, clean_lines, linenum, error):
+  """Logs an error if we see /* ... */ or "..." that extend past one line.
+
+  /* ... */ comments are legit inside macros, for one line.
+  Otherwise, we prefer // comments, so it's ok to warn about the
+  other.  Likewise, it's ok for strings to extend across multiple
+  lines, as long as a line continuation character (backslash)
+  terminates each line. Although not currently prohibited by the C++
+  style guide, it's ugly and unnecessary. We don't do well with either
+  in this lint program, so we warn about both.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  line = clean_lines.elided[linenum]
+
+  # Remove all \\ (escaped backslashes) from the line. They are OK, and the
+  # second (escaped) slash may trigger later \" detection erroneously.
+  line = line.replace('\\\\', '')
+
+  if line.count('/*') > line.count('*/'):
+    error(filename, linenum, 'readability/multiline_comment', 5,
+          'Complex multi-line /*...*/-style comment found. '
+          'Lint may give bogus warnings.  '
+          'Consider replacing these with //-style comments, '
+          'with #if 0...#endif, '
+          'or with more clearly structured multi-line comments.')
+
+  if (line.count('"') - line.count('\\"')) % 2:
+    error(filename, linenum, 'readability/multiline_string', 5,
+          'Multi-line string ("...") found.  This lint script doesn\'t '
+          'do well with such strings, and may give bogus warnings.  '
+          'Use C++11 raw strings or concatenation instead.')
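+
+# Illustration (hypothetical lines): "foo(); /* starts here" has more
+# '/*' than '*/' tokens and draws the multi-line comment warning, while
+# a line left with an odd number of unescaped '"' characters (after
+# dropping escaped backslashes) draws the multi-line string warning.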
+
+
+threading_list = (
+    ('asctime(', 'asctime_r('),
+    ('ctime(', 'ctime_r('),
+    ('getgrgid(', 'getgrgid_r('),
+    ('getgrnam(', 'getgrnam_r('),
+    ('getlogin(', 'getlogin_r('),
+    ('getpwnam(', 'getpwnam_r('),
+    ('getpwuid(', 'getpwuid_r('),
+    ('gmtime(', 'gmtime_r('),
+    ('localtime(', 'localtime_r('),
+    ('rand(', 'rand_r('),
+    ('strtok(', 'strtok_r('),
+    ('ttyname(', 'ttyname_r('),
+    )
+
+
+def CheckPosixThreading(filename, clean_lines, linenum, error):
+  """Checks for calls to thread-unsafe functions.
+
+  Much code was originally written without consideration for
+  multi-threading.  Engineers also tend to rely on old habits, having
+  learned POSIX before its threading extensions were added.  These
+  tests guide engineers toward thread-safe functions (when using
+  POSIX directly).
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  line = clean_lines.elided[linenum]
+  for single_thread_function, multithread_safe_function in threading_list:
+    ix = line.find(single_thread_function)
+    # Comparisons made explicit for clarity -- pylint: disable=g-explicit-bool-comparison
+    if ix >= 0 and (ix == 0 or (not line[ix - 1].isalnum() and
+                                line[ix - 1] not in ('_', '.', '>'))):
+      error(filename, linenum, 'runtime/threadsafe_fn', 2,
+            'Consider using ' + multithread_safe_function +
+            '...) instead of ' + single_thread_function +
+            '...) for improved thread safety.')
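+
+# For example (hypothetical call sites): "t = localtime(&now);" is
+# flagged with a suggestion to use localtime_r(...), while
+# "my_localtime(&now)" and "obj.localtime(&now)" are not, because the
+# character preceding the match ('_' and '.') fails the boundary test.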
+
+
+def CheckVlogArguments(filename, clean_lines, linenum, error):
+  """Checks that VLOG() is only used for defining a logging level.
+
+  For example, VLOG(2) is correct. VLOG(INFO), VLOG(WARNING), VLOG(ERROR), and
+  VLOG(FATAL) are not.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  line = clean_lines.elided[linenum]
+  if Search(r'\bVLOG\((INFO|ERROR|WARNING|DFATAL|FATAL)\)', line):
+    error(filename, linenum, 'runtime/vlog', 5,
+          'VLOG() should be used with numeric verbosity level.  '
+          'Use LOG() if you want symbolic severity levels.')
+
+
+# Matches invalid increment: *count++, which moves pointer instead of
+# incrementing a value.
+_RE_PATTERN_INVALID_INCREMENT = re.compile(
+    r'^\s*\*\w+(\+\+|--);')
+
+
+def CheckInvalidIncrement(filename, clean_lines, linenum, error):
+  """Checks for invalid increment *count++.
+
+  For example, the following function:
+  void increment_counter(int* count) {
+    *count++;
+  }
+  is invalid, because it effectively does count++, moving the pointer,
+  and should be replaced with ++*count, (*count)++ or *count += 1.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  line = clean_lines.elided[linenum]
+  if _RE_PATTERN_INVALID_INCREMENT.match(line):
+    error(filename, linenum, 'runtime/invalid_increment', 5,
+          'Changing pointer instead of value (or unused value of operator*).')
+
+
+class _BlockInfo(object):
+  """Stores information about a generic block of code."""
+
+  def __init__(self, seen_open_brace):
+    self.seen_open_brace = seen_open_brace
+    self.open_parentheses = 0
+    self.inline_asm = _NO_ASM
+
+  def CheckBegin(self, filename, clean_lines, linenum, error):
+    """Run checks that applies to text up to the opening brace.
+
+    This is mostly for checking the text after the class identifier
+    and the "{", usually where the base class is specified.  For other
+    blocks, there isn't much to check, so we always pass.
+
+    Args:
+      filename: The name of the current file.
+      clean_lines: A CleansedLines instance containing the file.
+      linenum: The number of the line to check.
+      error: The function to call with any errors found.
+    """
+    pass
+
+  def CheckEnd(self, filename, clean_lines, linenum, error):
+    """Run checks that applies to text after the closing brace.
+
+    This is mostly used for checking end of namespace comments.
+
+    Args:
+      filename: The name of the current file.
+      clean_lines: A CleansedLines instance containing the file.
+      linenum: The number of the line to check.
+      error: The function to call with any errors found.
+    """
+    pass
+
+
+class _ClassInfo(_BlockInfo):
+  """Stores information about a class."""
+
+  def __init__(self, name, class_or_struct, clean_lines, linenum):
+    _BlockInfo.__init__(self, False)
+    self.name = name
+    self.starting_linenum = linenum
+    self.is_derived = False
+    if class_or_struct == 'struct':
+      self.access = 'public'
+      self.is_struct = True
+    else:
+      self.access = 'private'
+      self.is_struct = False
+
+    # Remember initial indentation level for this class.  Using raw_lines here
+    # instead of elided to account for leading comments.
+    initial_indent = Match(r'^( *)\S', clean_lines.raw_lines[linenum])
+    if initial_indent:
+      self.class_indent = len(initial_indent.group(1))
+    else:
+      self.class_indent = 0
+
+    # Try to find the end of the class.  This will be confused by things like:
+    #   class A {
+    #   } *x = { ...
+    #
+    # But it's still good enough for CheckSectionSpacing.
+    self.last_line = 0
+    depth = 0
+    for i in range(linenum, clean_lines.NumLines()):
+      line = clean_lines.elided[i]
+      depth += line.count('{') - line.count('}')
+      if not depth:
+        self.last_line = i
+        break
+
+  def CheckBegin(self, filename, clean_lines, linenum, error):
+    # Look for a bare ':'
+    if Search('(^|[^:]):($|[^:])', clean_lines.elided[linenum]):
+      self.is_derived = True
+
+  def CheckEnd(self, filename, clean_lines, linenum, error):
+    # Check that closing brace is aligned with beginning of the class.
+    # Only do this if the closing brace is indented by only whitespaces.
+    # This means we will not check single-line class definitions.
+    indent = Match(r'^( *)\}', clean_lines.elided[linenum])
+    if indent and len(indent.group(1)) != self.class_indent:
+      if self.is_struct:
+        parent = 'struct ' + self.name
+      else:
+        parent = 'class ' + self.name
+      error(filename, linenum, 'whitespace/indent', 3,
+            'Closing brace should be aligned with beginning of %s' % parent)
+
+
+class _NamespaceInfo(_BlockInfo):
+  """Stores information about a namespace."""
+
+  def __init__(self, name, linenum):
+    _BlockInfo.__init__(self, False)
+    self.name = name or ''
+    self.starting_linenum = linenum
+
+  def CheckEnd(self, filename, clean_lines, linenum, error):
+    """Check end of namespace comments."""
+    line = clean_lines.raw_lines[linenum]
+
+    # Check how many lines are enclosed in this namespace.  Don't issue
+    # warning for missing namespace comments if there aren't enough
+    # lines.  However, do apply checks if there is already an end of
+    # namespace comment and it's incorrect.
+    #
+    # TODO(unknown): We always want to check end of namespace comments
+    # if a namespace is large, but sometimes we also want to apply the
+    # check if a short namespace contained nontrivial things (something
+    # other than forward declarations).  There is currently no logic on
+    # deciding what these nontrivial things are, so this check is
+    # triggered by namespace size only, which works most of the time.
+    if (linenum - self.starting_linenum < 10
+        and not Match(r'};*\s*(//|/\*).*\bnamespace\b', line)):
+      return
+
+    # Look for matching comment at end of namespace.
+    #
+    # Note that we accept C style "/* */" comments for terminating
+    # namespaces, so that code that terminates namespaces inside
+    # preprocessor macros can be cpplint clean.
+    #
+    # We also accept stuff like "// end of namespace <name>." with the
+    # period at the end.
+    #
+    # Besides these, we don't accept anything else, otherwise we might
+    # get false negatives when an existing comment is a substring of the
+    # expected namespace.
+    if self.name:
+      # Named namespace
+      if not Match((r'};*\s*(//|/\*).*\bnamespace\s+' + re.escape(self.name) +
+                    r'[\*/\.\\\s]*$'),
+                   line):
+        error(filename, linenum, 'readability/namespace', 5,
+              'Namespace should be terminated with "// namespace %s"' %
+              self.name)
+    else:
+      # Anonymous namespace
+      if not Match(r'};*\s*(//|/\*).*\bnamespace[\*/\.\\\s]*$', line):
+        error(filename, linenum, 'readability/namespace', 5,
+              'Namespace should be terminated with "// namespace"')
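+
+# Hypothetical end-of-namespace comments accepted for
+# "namespace mylib { ... }":
+#   }  // namespace mylib
+#   }  /* namespace mylib */
+#   }  // end of namespace mylib.
+# Anything else draws the readability/namespace warning once the
+# namespace spans 10 or more lines (or whenever an end-of-namespace
+# comment is present but wrong).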
+
+
+class _PreprocessorInfo(object):
+  """Stores checkpoints of nesting stacks when #if/#else is seen."""
+
+  def __init__(self, stack_before_if):
+    # The entire nesting stack before #if
+    self.stack_before_if = stack_before_if
+
+    # The entire nesting stack up to #else
+    self.stack_before_else = []
+
+    # Whether we have already seen #else or #elif
+    self.seen_else = False
+
+
+class _NestingState(object):
+  """Holds states related to parsing braces."""
+
+  def __init__(self):
+    # Stack for tracking all braces.  An object is pushed whenever we
+    # see a "{", and popped when we see a "}".  Only 3 types of
+    # objects are possible:
+    # - _ClassInfo: a class or struct.
+    # - _NamespaceInfo: a namespace.
+    # - _BlockInfo: some other type of block.
+    self.stack = []
+
+    # Stack of _PreprocessorInfo objects.
+    self.pp_stack = []
+
+  def SeenOpenBrace(self):
+    """Check if we have seen the opening brace for the innermost block.
+
+    Returns:
+      True if we have seen the opening brace, False if the innermost
+      block is still expecting an opening brace.
+    """
+    return (not self.stack) or self.stack[-1].seen_open_brace
+
+  def InNamespaceBody(self):
+    """Check if we are currently one level inside a namespace body.
+
+    Returns:
+      True if top of the stack is a namespace block, False otherwise.
+    """
+    return self.stack and isinstance(self.stack[-1], _NamespaceInfo)
+
+  def UpdatePreprocessor(self, line):
+    """Update preprocessor stack.
+
+    We need to handle preprocessors due to classes like this:
+      #ifdef SWIG
+      struct ResultDetailsPageElementExtensionPoint {
+      #else
+      struct ResultDetailsPageElementExtensionPoint : public Extension {
+      #endif
+
+    We make the following assumptions (good enough for most files):
+    - Preprocessor condition evaluates to true from #if up to first
+      #else/#elif/#endif.
+
+    - Preprocessor condition evaluates to false from #else/#elif up
+      to #endif.  We still perform lint checks on these lines, but
+      these do not affect nesting stack.
+
+    Args:
+      line: current line to check.
+    """
+    if Match(r'^\s*#\s*(if|ifdef|ifndef)\b', line):
+      # Beginning of #if block, save the nesting stack here.  The saved
+      # stack will allow us to restore the parsing state in the #else case.
+      self.pp_stack.append(_PreprocessorInfo(copy.deepcopy(self.stack)))
+    elif Match(r'^\s*#\s*(else|elif)\b', line):
+      # Beginning of #else block
+      if self.pp_stack:
+        if not self.pp_stack[-1].seen_else:
+          # This is the first #else or #elif block.  Remember the
+          # whole nesting stack up to this point.  This is what we
+          # keep after the #endif.
+          self.pp_stack[-1].seen_else = True
+          self.pp_stack[-1].stack_before_else = copy.deepcopy(self.stack)
+
+        # Restore the stack to how it was before the #if
+        self.stack = copy.deepcopy(self.pp_stack[-1].stack_before_if)
+      else:
+        # TODO(unknown): unexpected #else, issue warning?
+        pass
+    elif Match(r'^\s*#\s*endif\b', line):
+      # End of #if or #else blocks.
+      if self.pp_stack:
+        # If we saw an #else, we will need to restore the nesting
+        # stack to its former state before the #else, otherwise we
+        # will just continue from where we left off.
+        if self.pp_stack[-1].seen_else:
+          # Here we can just use a shallow copy since we are the last
+          # reference to it.
+          self.stack = self.pp_stack[-1].stack_before_else
+        # Drop the corresponding #if
+        self.pp_stack.pop()
+      else:
+        # TODO(unknown): unexpected #endif, issue warning?
+        pass
+
+  def Update(self, filename, clean_lines, linenum, error):
+    """Update nesting state with current line.
+
+    Args:
+      filename: The name of the current file.
+      clean_lines: A CleansedLines instance containing the file.
+      linenum: The number of the line to check.
+      error: The function to call with any errors found.
+    """
+    line = clean_lines.elided[linenum]
+
+    # Update pp_stack first
+    self.UpdatePreprocessor(line)
+
+    # Count parentheses.  This is to avoid adding struct arguments to
+    # the nesting stack.
+    if self.stack:
+      inner_block = self.stack[-1]
+      depth_change = line.count('(') - line.count(')')
+      inner_block.open_parentheses += depth_change
+
+      # Also check if we are starting or ending an inline assembly block.
+      if inner_block.inline_asm in (_NO_ASM, _END_ASM):
+        if (depth_change != 0 and
+            inner_block.open_parentheses == 1 and
+            _MATCH_ASM.match(line)):
+          # Enter assembly block
+          inner_block.inline_asm = _INSIDE_ASM
+        else:
+          # Not entering assembly block.  If previous line was _END_ASM,
+          # we will now shift to _NO_ASM state.
+          inner_block.inline_asm = _NO_ASM
+      elif (inner_block.inline_asm == _INSIDE_ASM and
+            inner_block.open_parentheses == 0):
+        # Exit assembly block
+        inner_block.inline_asm = _END_ASM
+
+    # Consume namespace declaration at the beginning of the line.  Do
+    # this in a loop so that we catch same line declarations like this:
+    #   namespace proto2 { namespace bridge { class MessageSet; } }
+    while True:
+      # Match start of namespace.  The "\b\s*" below catches namespace
+      # declarations even if they aren't followed by whitespace, so
+      # that we don't confuse our namespace checker.  The missing
+      # spaces will be flagged by CheckSpacing.
+      namespace_decl_match = Match(r'^\s*namespace\b\s*([:\w]+)?(.*)$', line)
+      if not namespace_decl_match:
+        break
+
+      new_namespace = _NamespaceInfo(namespace_decl_match.group(1), linenum)
+      self.stack.append(new_namespace)
+
+      line = namespace_decl_match.group(2)
+      if line.find('{') != -1:
+        new_namespace.seen_open_brace = True
+        line = line[line.find('{') + 1:]
+
+    # Look for a class declaration in whatever is left of the line
+    # after parsing namespaces.  The regexp accounts for decorated classes
+    # such as in:
+    #   class LOCKABLE API Object {
+    #   };
+    #
+    # Templates with class arguments may confuse the parser, for example:
+    #   template <class T
+    #             class Comparator = less<T>,
+    #             class Vector = vector<T> >
+    #   class HeapQueue {
+    #
+    # Because this parser has no nesting state about templates, by the
+    # time it saw "class Comparator", it may think that it's a new class.
+    # Nested templates have a similar problem:
+    #   template <
+    #       typename ExportedType,
+    #       typename TupleType,
+    #       template <typename, typename> class ImplTemplate>
+    #
+    # To avoid these cases, we ignore classes that are followed by '=' or '>'
+    class_decl_match = Match(
+        r'\s*(template\s*<[\w\s<>,:]*>\s*)?'
+        r'(class|struct)\s+([A-Z_]+\s+)*(\w+(?:::\w+)*)'
+        r'(([^=>]|<[^<>]*>|<[^<>]*<[^<>]*>\s*>)*)$', line)
+    if (class_decl_match and
+        (not self.stack or self.stack[-1].open_parentheses == 0)):
+      self.stack.append(_ClassInfo(
+          class_decl_match.group(4), class_decl_match.group(2),
+          clean_lines, linenum))
+      line = class_decl_match.group(5)
+
+    # If we have not yet seen the opening brace for the innermost block,
+    # run checks here.
+    if not self.SeenOpenBrace():
+      self.stack[-1].CheckBegin(filename, clean_lines, linenum, error)
+
+    # Update access control if we are inside a class/struct
+    if self.stack and isinstance(self.stack[-1], _ClassInfo):
+      classinfo = self.stack[-1]
+      access_match = Match(
+          r'^(.*)\b(public|private|protected|signals)(\s+(?:slots\s*)?)?'
+          r':(?:[^:]|$)',
+          line)
+      if access_match:
+        classinfo.access = access_match.group(2)
+
+        # Check that access keywords are indented +1 space.  Skip this
+        # check if the keywords are not preceded by whitespaces.
+        indent = access_match.group(1)
+        if (len(indent) != classinfo.class_indent + 1 and
+            Match(r'^\s*$', indent)):
+          if classinfo.is_struct:
+            parent = 'struct ' + classinfo.name
+          else:
+            parent = 'class ' + classinfo.name
+          slots = ''
+          if access_match.group(3):
+            slots = access_match.group(3)
+          error(filename, linenum, 'whitespace/indent', 3,
+                '%s%s: should be indented +1 space inside %s' % (
+                    access_match.group(2), slots, parent))
+
+    # Consume braces or semicolons from what's left of the line
+    while True:
+      # Match first brace, semicolon, or closed parenthesis.
+      matched = Match(r'^[^{;)}]*([{;)}])(.*)$', line)
+      if not matched:
+        break
+
+      token = matched.group(1)
+      if token == '{':
+        # If namespace or class hasn't seen an opening brace yet, mark
+        # namespace/class head as complete.  Push a new block onto the
+        # stack otherwise.
+        if not self.SeenOpenBrace():
+          self.stack[-1].seen_open_brace = True
+        else:
+          self.stack.append(_BlockInfo(True))
+          if _MATCH_ASM.match(line):
+            self.stack[-1].inline_asm = _BLOCK_ASM
+      elif token == ';' or token == ')':
+        # If we haven't seen an opening brace yet, but we already saw
+        # a semicolon, this is probably a forward declaration.  Pop
+        # the stack for these.
+        #
+        # Similarly, if we haven't seen an opening brace yet, but we
+        # already saw a closing parenthesis, then these are probably
+        # function arguments with extra "class" or "struct" keywords.
+        # Also pop the stack for these.
+        if not self.SeenOpenBrace():
+          self.stack.pop()
+      else:  # token == '}'
+        # Perform end of block checks and pop the stack.
+        if self.stack:
+          self.stack[-1].CheckEnd(filename, clean_lines, linenum, error)
+          self.stack.pop()
+      line = matched.group(2)
+
+  def InnermostClass(self):
+    """Get class info on the top of the stack.
+
+    Returns:
+      A _ClassInfo object if we are inside a class, or None otherwise.
+    """
+    for i in range(len(self.stack), 0, -1):
+      classinfo = self.stack[i - 1]
+      if isinstance(classinfo, _ClassInfo):
+        return classinfo
+    return None
+
+  def CheckCompletedBlocks(self, filename, error):
+    """Checks that all classes and namespaces have been completely parsed.
+
+    Call this when all lines in a file have been processed.
+    Args:
+      filename: The name of the current file.
+      error: The function to call with any errors found.
+    """
+    # Note: This test can result in false positives if #ifdef constructs
+    # get in the way of brace matching. See the testBuildClass test in
+    # cpplint_unittest.py for an example of this.
+    for obj in self.stack:
+      if isinstance(obj, _ClassInfo):
+        error(filename, obj.starting_linenum, 'build/class', 5,
+              'Failed to find complete declaration of class %s' %
+              obj.name)
+      elif isinstance(obj, _NamespaceInfo):
+        error(filename, obj.starting_linenum, 'build/namespaces', 5,
+              'Failed to find complete declaration of namespace %s' %
+              obj.name)
+
+
+def CheckForNonStandardConstructs(filename, clean_lines, linenum,
+                                  nesting_state, error):
+  r"""Logs an error if we see certain non-ANSI constructs ignored by gcc-2.
+
+  Complain about several constructs which gcc-2 accepts, but which are
+  not standard C++.  Warning about these in lint is one way to ease the
+  transition to new compilers.
+  - put storage class first (e.g. "static const" instead of "const static").
+  - "%lld" instead of %qd" in printf-type functions.
+  - "%1$d" is non-standard in printf-type functions.
+  - "\%" is an undefined character escape sequence.
+  - text after #endif is not allowed.
+  - invalid inner-style forward declaration.
+  - >? and <? operators, and their >?= and <?= cousins.
+
+  Additionally, check for constructor/destructor style violations and reference
+  members, as it is very convenient to do so while checking for
+  gcc-2 compliance.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    nesting_state: A _NestingState instance which maintains information about
+                   the current stack of nested blocks being parsed.
+    error: A callable to which errors are reported, which takes 4 arguments:
+           filename, line number, error level, and message
+  """
+
+  # Remove comments from the line, but leave in strings for now.
+  line = clean_lines.lines[linenum]
+
+  if Search(r'printf\s*\(.*".*%[-+ ]?\d*q', line):
+    error(filename, linenum, 'runtime/printf_format', 3,
+          '%q in format strings is deprecated.  Use %ll instead.')
+
+  if Search(r'printf\s*\(.*".*%\d+\$', line):
+    error(filename, linenum, 'runtime/printf_format', 2,
+          '%N$ formats are unconventional.  Try rewriting to avoid them.')
+
+  # Remove escaped backslashes before looking for undefined escapes.
+  line = line.replace('\\\\', '')
+
+  if Search(r'("|\').*\\(%|\[|\(|{)', line):
+    error(filename, linenum, 'build/printf_format', 3,
+          '%, [, (, and { are undefined character escapes.  Unescape them.')
+
+  # For the rest, work with both comments and strings removed.
+  line = clean_lines.elided[linenum]
+
+  if Search(r'\b(const|volatile|void|char|short|int|long'
+            r'|float|double|signed|unsigned'
+            r'|schar|u?int8|u?int16|u?int32|u?int64)'
+            r'\s+(register|static|extern|typedef)\b',
+            line):
+    error(filename, linenum, 'build/storage_class', 5,
+          'Storage class (static, extern, typedef, etc) should be first.')
+
+  if Match(r'\s*#\s*endif\s*[^/\s]+', line):
+    error(filename, linenum, 'build/endif_comment', 5,
+          'Uncommented text after #endif is non-standard.  Use a comment.')
+
+  if Match(r'\s*class\s+(\w+\s*::\s*)+\w+\s*;', line):
+    error(filename, linenum, 'build/forward_decl', 5,
+          'Inner-style forward declarations are invalid.  Remove this line.')
+
+  if Search(r'(\w+|[+-]?\d+(\.\d*)?)\s*(<|>)\?=?\s*(\w+|[+-]?\d+)(\.\d*)?',
+            line):
+    error(filename, linenum, 'build/deprecated', 3,
+          '>? and <? (max and min) operators are non-standard and deprecated.')
+
+  if Search(r'^\s*const\s*string\s*&\s*\w+\s*;', line):
+    # TODO(unknown): Could it be expanded safely to arbitrary references,
+    # without triggering too many false positives? The first
+    # attempt triggered 5 warnings for mostly benign code in the regtest, hence
+    # the restriction.
+    # Here's the original regexp, for the reference:
+    # type_name = r'\w+((\s*::\s*\w+)|(\s*<\s*\w+?\s*>))?'
+    # r'\s*const\s*' + type_name + '\s*&\s*\w+\s*;'
+    error(filename, linenum, 'runtime/member_string_references', 2,
+          'const string& members are dangerous. It is much better to use '
+          'alternatives, such as pointers or simple constants.')
+
+  # Everything else in this function operates on class declarations.
+  # Return early if the top of the nesting stack is not a class, or if
+  # the class head is not completed yet.
+  classinfo = nesting_state.InnermostClass()
+  if not classinfo or not classinfo.seen_open_brace:
+    return
+
+  # The class may have been declared with namespace or classname qualifiers.
+  # The constructor and destructor will not have those qualifiers.
+  base_classname = classinfo.name.split('::')[-1]
+
+  # Look for single-argument constructors that aren't marked explicit.
+  # Technically a valid construct, but against style.
+  args = Match(r'\s+(?:inline\s+)?%s\s*\(([^,()]+)\)'
+               % re.escape(base_classname),
+               line)
+  if (args and
+      args.group(1) != 'void' and
+      not Match(r'(const\s+)?%s(\s+const)?\s*(?:<\w+>\s*)?&'
+                % re.escape(base_classname), args.group(1).strip())):
+    error(filename, linenum, 'runtime/explicit', 5,
+          'Single-argument constructors should be marked explicit.')
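+
+# Hypothetical lines the checks above would flag: "const static int x;"
+# (storage class not first), "class Foo::Bar;" (inner-style forward
+# declaration), and, inside "class Foo", a member declaration
+# "Foo(int x);" (single-argument constructor not marked explicit).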
+
+
+def CheckSpacingForFunctionCall(filename, line, linenum, error):
+  """Checks for the correctness of various spacing around function calls.
+
+  Args:
+    filename: The name of the current file.
+    line: The text of the line to check.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+
+  # Since function calls often occur inside if/for/while/switch
+  # expressions - which have their own, more liberal conventions - we
+  # first see if we should be looking inside such an expression for a
+  # function call, to which we can apply more strict standards.
+  fncall = line    # if there's no control flow construct, look at whole line
+  for pattern in (r'\bif\s*\((.*)\)\s*{',
+                  r'\bfor\s*\((.*)\)\s*{',
+                  r'\bwhile\s*\((.*)\)\s*[{;]',
+                  r'\bswitch\s*\((.*)\)\s*{'):
+    match = Search(pattern, line)
+    if match:
+      fncall = match.group(1)    # look inside the parens for function calls
+      break
+
+  # Except in if/for/while/switch, there should never be space
+  # immediately inside parens (e.g. "f( 3, 4 )").  We make an exception
+  # for nested parens ( (a+b) + c ).  Likewise, there should never be
+  # a space before a ( when it's a function argument.  I assume it's a
+  # function argument when the char before the whitespace is legal in
+  # a function name (alnum + _) and we're not starting a macro. Also ignore
+  # pointers and references to arrays and functions because they're too tricky:
+  # we use a very simple way to recognize these:
+  # " (something)(maybe-something)" or
+  # " (something)(maybe-something," or
+  # " (something)[something]"
+  # Note that we assume the contents of [] to be short enough that
+  # they'll never need to wrap.
+  if (  # Ignore control structures.
+      not Search(r'\b(if|for|while|switch|return|new|delete|catch|sizeof)\b',
+                 fncall) and
+      # Ignore pointers/references to functions.
+      not Search(r' \([^)]+\)\([^)]*(\)|,$)', fncall) and
+      # Ignore pointers/references to arrays.
+      not Search(r' \([^)]+\)\[[^\]]+\]', fncall)):
+    if Search(r'\w\s*\(\s(?!\s*\\$)', fncall):      # a ( used for a fn call
+      error(filename, linenum, 'whitespace/parens', 4,
+            'Extra space after ( in function call')
+    elif Search(r'\(\s+(?!(\s*\\)|\()', fncall):
+      error(filename, linenum, 'whitespace/parens', 2,
+            'Extra space after (')
+    if (Search(r'\w\s+\(', fncall) and
+        not Search(r'#\s*define|typedef', fncall) and
+        not Search(r'\w\s+\((\w+::)*\*\w+\)\(', fncall)):
+      error(filename, linenum, 'whitespace/parens', 4,
+            'Extra space before ( in function call')
+    # If the ) is followed only by a newline or a { + newline, assume it's
+    # part of a control statement (if/while/etc), and don't complain
+    if Search(r'[^)]\s+\)\s*[^{\s]', fncall):
+      # If the closing parenthesis is preceded by only whitespaces,
+      # try to give a more descriptive error message.
+      if Search(r'^\s+\)', fncall):
+        error(filename, linenum, 'whitespace/parens', 2,
+              'Closing ) should be moved to the previous line')
+      else:
+        error(filename, linenum, 'whitespace/parens', 2,
+              'Extra space before )')
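+
+# Hypothetical calls the checks above would flag:
+#   Foo( bar);   -> Extra space after ( in function call
+#   Foo (bar);   -> Extra space before ( in function call
+#   Foo(bar );   -> Extra space before )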
+
+
+def IsBlankLine(line):
+  """Returns true if the given line is blank.
+
+  We consider a line to be blank if the line is empty or consists of
+  only white spaces.
+
+  Args:
+    line: A line of a string.
+
+  Returns:
+    True, if the given line is blank.
+  """
+  return not line or line.isspace()
+
+
+def CheckForFunctionLengths(filename, clean_lines, linenum,
+                            function_state, error):
+  """Reports for long function bodies.
+
+  For an overview why this is done, see:
+  http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Write_Short_Functions
+
+  Uses a simplistic algorithm assuming other style guidelines
+  (especially spacing) are followed.
+  Only checks unindented functions, so class members are unchecked.
+  Trivial bodies are unchecked, so constructors with huge initializer lists
+  may be missed.
+  Blank/comment lines are not counted so as to avoid encouraging the removal
+  of vertical space and comments just to get through a lint check.
+  NOLINT *on the last line of a function* disables this check.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    function_state: Current function name and lines in body so far.
+    error: The function to call with any errors found.
+  """
+  lines = clean_lines.lines
+  line = lines[linenum]
+  raw = clean_lines.raw_lines
+  raw_line = raw[linenum]
+  joined_line = ''
+
+  starting_func = False
+  regexp = r'(\w(\w|::|\*|\&|\s)*)\('  # decls * & space::name( ...
+  match_result = Match(regexp, line)
+  if match_result:
+    # If the name is all caps and underscores, figure it's a macro and
+    # ignore it, unless it's TEST or TEST_F.
+    function_name = match_result.group(1).split()[-1]
+    if function_name == 'TEST' or function_name == 'TEST_F' or (
+        not Match(r'[A-Z_]+$', function_name)):
+      starting_func = True
+
+  if starting_func:
+    body_found = False
+    for start_linenum in xrange(linenum, clean_lines.NumLines()):
+      start_line = lines[start_linenum]
+      joined_line += ' ' + start_line.lstrip()
+      if Search(r'(;|})', start_line):  # Declarations and trivial functions
+        body_found = True
+        break                              # ... ignore
+      elif Search(r'{', start_line):
+        body_found = True
+        function = Search(r'((\w|:)*)\(', line).group(1)
+        if Match(r'TEST', function):    # Handle TEST... macros
+          parameter_regexp = Search(r'(\(.*\))', joined_line)
+          if parameter_regexp:             # Ignore bad syntax
+            function += parameter_regexp.group(1)
+        else:
+          function += '()'
+        function_state.Begin(function)
+        break
+    if not body_found:
+      # No body for the function (or evidence of a non-function) was found.
+      error(filename, linenum, 'readability/fn_size', 5,
+            'Lint failed to find start of function body.')
+  elif Match(r'^\}\s*$', line):  # function end
+    function_state.Check(error, filename, linenum)
+    function_state.End()
+  elif not Match(r'^\s*$', line):
+    function_state.Count()  # Count non-blank/non-comment lines.
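+
+# Flow of the check above, informally: a line matching a function
+# declaration starts a forward scan; a ';' or '}' before any '{' means a
+# declaration or trivial body (ignored), a '{' starts line counting via
+# function_state.Begin(), and a later line consisting of just '}' ends
+# the function and triggers the length check.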
+
+
+_RE_PATTERN_TODO = re.compile(r'^//(\s*)TODO(\(.+?\))?:?(\s|$)?')
+
+
+def CheckComment(comment, filename, linenum, error):
+  """Checks for common mistakes in TODO comments.
+
+  Args:
+    comment: The text of the comment from the line in question.
+    filename: The name of the current file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  match = _RE_PATTERN_TODO.match(comment)
+  if match:
+    # One whitespace is correct; zero whitespace is handled elsewhere.
+    leading_whitespace = match.group(1)
+    if len(leading_whitespace) > 1:
+      error(filename, linenum, 'whitespace/todo', 2,
+            'Too many spaces before TODO')
+
+    username = match.group(2)
+    if not username:
+      error(filename, linenum, 'readability/todo', 2,
+            'Missing username in TODO; it should look like '
+            '"// TODO(my_username): Stuff."')
+
+    middle_whitespace = match.group(3)
+    # Comparisons made explicit for correctness -- pylint: disable=g-explicit-bool-comparison
+    if middle_whitespace != ' ' and middle_whitespace != '':
+      error(filename, linenum, 'whitespace/todo', 2,
+            'TODO(my_username) should be followed by a space')
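+
+# Hypothetical TODO comments and the warnings they draw:
+#   //  TODO(user): fix   -> Too many spaces before TODO
+#   // TODO: fix          -> Missing username in TODO
+#   // TODO(user):fix     -> TODO(my_username) should be followed by a space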
+
+
+def CheckAccess(filename, clean_lines, linenum, nesting_state, error):
+  """Checks for improper use of DISALLOW* macros.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    nesting_state: A _NestingState instance which maintains information about
+                   the current stack of nested blocks being parsed.
+    error: The function to call with any errors found.
+  """
+  line = clean_lines.elided[linenum]  # get rid of comments and strings
+
+  matched = Match((r'\s*(DISALLOW_COPY_AND_ASSIGN|'
+                   r'DISALLOW_EVIL_CONSTRUCTORS|'
+                   r'DISALLOW_IMPLICIT_CONSTRUCTORS)'), line)
+  if not matched:
+    return
+  if nesting_state.stack and isinstance(nesting_state.stack[-1], _ClassInfo):
+    if nesting_state.stack[-1].access != 'private':
+      error(filename, linenum, 'readability/constructors', 3,
+            '%s must be in the private: section' % matched.group(1))
+
+  else:
+    # Found DISALLOW* macro outside a class declaration, or perhaps it
+    # was used inside a function when it should have been part of the
+    # class declaration.  We could issue a warning here, but it
+    # probably resulted in a compiler error already.
+    pass
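+
+# e.g. (hypothetical): a DISALLOW_COPY_AND_ASSIGN(Foo) line appearing in
+# the public: section of class Foo draws "DISALLOW_COPY_AND_ASSIGN must
+# be in the private: section".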
+
+
+def FindNextMatchingAngleBracket(clean_lines, linenum, init_suffix):
+  """Find the corresponding > to close a template.
+
+  Args:
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: Current line number.
+    init_suffix: Remainder of the current line after the initial <.
+
+  Returns:
+    True if a matching bracket exists.
+  """
+  line = init_suffix
+  nesting_stack = ['<']
+  while True:
+    # Find the next operator that can tell us whether < is used as an
+    # opening bracket or as a less-than operator.  We only want to
+    # warn on the latter case.
+    #
+    # We could also check all other operators and terminate the search
+    # early, e.g. if we got something like this "a<b+c", the "<" is
+    # most likely a less-than operator, but then we will get false
+    # positives for default arguments and other template expressions.
+    match = Search(r'^[^<>(),;\[\]]*([<>(),;\[\]])(.*)$', line)
+    if match:
+      # Found an operator, update nesting stack
+      operator = match.group(1)
+      line = match.group(2)
+
+      if nesting_stack[-1] == '<':
+        # Expecting closing angle bracket
+        if operator in ('<', '(', '['):
+          nesting_stack.append(operator)
+        elif operator == '>':
+          nesting_stack.pop()
+          if not nesting_stack:
+            # Found matching angle bracket
+            return True
+        elif operator == ',':
+          # Got a comma after a bracket, this is most likely a template
+          # argument.  We have not seen a closing angle bracket yet, but
+          # it's probably a few lines later if we look for it, so just
+          # return early here.
+          return True
+        else:
+          # Got some other operator.
+          return False
+
+      else:
+        # Expecting closing parenthesis or closing bracket
+        if operator in ('<', '(', '['):
+          nesting_stack.append(operator)
+        elif operator in (')', ']'):
+          # We don't bother checking for matching () or [].  If we got
+          # something like (] or [), it would have been a syntax error.
+          nesting_stack.pop()
+
+    else:
+      # Scan the next line
+      linenum += 1
+      if linenum >= len(clean_lines.elided):
+        break
+      line = clean_lines.elided[linenum]
+
+  # Exhausted all remaining lines and still no matching angle bracket.
+  # Most likely the input was incomplete, otherwise we should have
+  # seen a semicolon and returned early.
+  return True
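+
+# E.g. (hypothetical lines): given "vector<int> v;", scanning the text
+# after '<' reaches the matching '>' and returns True, so the '<' is
+# treated as a template bracket; given "a<b;", the scan hits ';' first
+# and returns False, so CheckSpacing reports missing spaces around '<'.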
+
+
+def FindPreviousMatchingAngleBracket(clean_lines, linenum, init_prefix):
+  """Find the corresponding < that started a template.
+
+  Args:
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: Current line number.
+    init_prefix: Part of the current line before the initial >.
+
+  Returns:
+    True if a matching bracket exists.
+  """
+  line = init_prefix
+  nesting_stack = ['>']
+  while True:
+    # Find the previous operator
+    match = Search(r'^(.*)([<>(),;\[\]])[^<>(),;\[\]]*$', line)
+    if match:
+      # Found an operator, update nesting stack
+      operator = match.group(2)
+      line = match.group(1)
+
+      if nesting_stack[-1] == '>':
+        # Expecting opening angle bracket
+        if operator in ('>', ')', ']'):
+          nesting_stack.append(operator)
+        elif operator == '<':
+          nesting_stack.pop()
+          if not nesting_stack:
+            # Found matching angle bracket
+            return True
+        elif operator == ',':
+          # Got a comma before a bracket, this is most likely a
+          # template argument.  The opening angle bracket is probably
+          # there if we look for it, so just return early here.
+          return True
+        else:
+          # Got some other operator.
+          return False
+
+      else:
+        # Expecting opening parenthesis or opening bracket
+        if operator in ('>', ')', ']'):
+          nesting_stack.append(operator)
+        elif operator in ('(', '['):
+          nesting_stack.pop()
+
+    else:
+      # Scan the previous line
+      linenum -= 1
+      if linenum < 0:
+        break
+      line = clean_lines.elided[linenum]
+
+  # Exhausted all earlier lines and still no matching angle bracket.
+  return False
+
+
+def CheckSpacing(filename, clean_lines, linenum, nesting_state, error):
+  """Checks for the correctness of various spacing issues in the code.
+
+  Things we check for: spaces around operators, spaces after
+  if/for/while/switch, no spaces around parens in function calls, two
+  spaces between code and comment, don't start a block with a blank
+  line, don't end a function with a blank line, don't add a blank line
+  after public/protected/private, don't have too many blank lines in a row.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    nesting_state: A _NestingState instance which maintains information about
+                   the current stack of nested blocks being parsed.
+    error: The function to call with any errors found.
+  """
+
+  # Don't use "elided" lines here, otherwise we can't check commented lines.
+  # Don't want to use "raw" either, because we don't want to check inside C++11
+  # raw strings.
+  raw = clean_lines.lines_without_raw_strings
+  line = raw[linenum]
+
+  # Before nixing comments, check if the line is blank for no good
+  # reason.  This includes the first line after a block is opened, and
+  # blank lines at the end of a function (i.e., right before a line like '}').
+  #
+  # Skip all the blank line checks if we are immediately inside a
+  # namespace body.  In other words, don't issue blank line warnings
+  # for this block:
+  #   namespace {
+  #
+  #   }
+  #
+  # A warning about missing end of namespace comments will be issued instead.
+  if IsBlankLine(line) and not nesting_state.InNamespaceBody():
+    elided = clean_lines.elided
+    prev_line = elided[linenum - 1]
+    prevbrace = prev_line.rfind('{')
+    # TODO(unknown): Don't complain if line before blank line, and line after,
+    #                both start with alnums and are indented the same amount.
+    #                This ignores whitespace at the start of a namespace block
+    #                because those are not usually indented.
+    if prevbrace != -1 and prev_line[prevbrace:].find('}') == -1:
+      # OK, we have a blank line at the start of a code block.  Before we
+      # complain, we check if it is an exception to the rule: The previous
+      # non-empty line has the parameters of a function header that are indented
+      # 4 spaces (because they did not fit in an 80 column line when placed on
+      # the same line as the function name).  We also check for the case where
+      # the previous line is indented 6 spaces, which may happen when the
+      # initializers of a constructor do not fit into an 80 column line.
+      exception = False
+      if Match(r' {6}\w', prev_line):  # Initializer list?
+        # We are looking for the opening column of initializer list, which
+        # should be indented 4 spaces to cause 6 space indentation afterwards.
+        search_position = linenum-2
+        while (search_position >= 0
+               and Match(r' {6}\w', elided[search_position])):
+          search_position -= 1
+        exception = (search_position >= 0
+                     and elided[search_position][:5] == '    :')
+      else:
+        # Search for the function arguments or an initializer list.  We use a
+        # simple heuristic here: If the line is indented 4 spaces and we have a
+        # closing paren, without the opening paren, followed by an opening brace
+        # or colon (for initializer lists), we assume that it is the last line of
+        # a function header.  If we have a colon indented 4 spaces, it is an
+        # initializer list.
+        exception = (Match(r' {4}\w[^\(]*\)\s*(const\s*)?(\{\s*$|:)',
+                           prev_line)
+                     or Match(r' {4}:', prev_line))
+
+      if not exception:
+        error(filename, linenum, 'whitespace/blank_line', 2,
+              'Redundant blank line at the start of a code block '
+              'should be deleted.')
+    # Ignore blank lines at the end of a block in a long if-else
+    # chain, like this:
+    #   if (condition1) {
+    #     // Something followed by a blank line
+    #
+    #   } else if (condition2) {
+    #     // Something else
+    #   }
+    if linenum + 1 < clean_lines.NumLines():
+      next_line = raw[linenum + 1]
+      if (next_line
+          and Match(r'\s*}', next_line)
+          and next_line.find('} else ') == -1):
+        error(filename, linenum, 'whitespace/blank_line', 3,
+              'Redundant blank line at the end of a code block '
+              'should be deleted.')
+
+    matched = Match(r'\s*(public|protected|private):', prev_line)
+    if matched:
+      error(filename, linenum, 'whitespace/blank_line', 3,
+            'Do not leave a blank line after "%s:"' % matched.group(1))
+
+  # Next, we complain if there's a comment too near the text
+  commentpos = line.find('//')
+  if commentpos != -1:
+    # Check if the // may be in quotes.  If so, ignore it
+    # Comparisons made explicit for clarity -- pylint: disable=g-explicit-bool-comparison
+    if (line.count('"', 0, commentpos) -
+        line.count('\\"', 0, commentpos)) % 2 == 0:   # not in quotes
+      # Allow one space for new scopes, two spaces otherwise:
+      if (not Match(r'^\s*{ //', line) and
+          ((commentpos >= 1 and
+            line[commentpos-1] not in string.whitespace) or
+           (commentpos >= 2 and
+            line[commentpos-2] not in string.whitespace))):
+        error(filename, linenum, 'whitespace/comments', 2,
+              'At least two spaces is best between code and comments')
+      # There should always be a space between the // and the comment
+      commentend = commentpos + 2
+      if commentend < len(line) and not line[commentend] == ' ':
+        # but some lines are exceptions -- e.g. if they're big
+        # comment delimiters like:
+        # //----------------------------------------------------------
+        # or are an empty C++ style Doxygen comment, like:
+        # ///
+        # or C++ style Doxygen comments placed after the variable:
+        # ///<  Header comment
+        # //!<  Header comment
+        # or they begin with multiple slashes followed by a space:
+        # //////// Header comment
+        match = (Search(r'[=/-]{4,}\s*$', line[commentend:]) or
+                 Search(r'^/$', line[commentend:]) or
+                 Search(r'^!< ', line[commentend:]) or
+                 Search(r'^/< ', line[commentend:]) or
+                 Search(r'^/+ ', line[commentend:]))
+        if not match:
+          error(filename, linenum, 'whitespace/comments', 4,
+                'Should have a space between // and comment')
+      CheckComment(line[commentpos:], filename, linenum, error)
+
+  line = clean_lines.elided[linenum]  # get rid of comments and strings
+
+  # Don't try to do spacing checks for operator methods
+  line = re.sub(r'operator(==|!=|<|<<|<=|>=|>>|>)\(', r'operator\(', line)
+
+  # We allow no-spaces around = within an if: "if ( (a=Foo()) == 0 )".
+  # Otherwise not.  Note we only check for non-spaces on *both* sides;
+  # sometimes people put non-spaces on one side when aligning ='s among
+  # many lines (not that this is behavior that I approve of...)
+  if Search(r'[\w.]=[\w.]', line) and not Search(r'\b(if|while) ', line):
+    error(filename, linenum, 'whitespace/operators', 4,
+          'Missing spaces around =')
+
+  # It's ok not to have spaces around binary operators like + - * /, but if
+  # there's too little whitespace, we get concerned.  It's hard to tell,
+  # though, so we punt on this one for now.  TODO.
+
+  # You should always have whitespace around binary operators.
+  #
+  # Check <= and >= first to avoid false positives with < and >, then
+  # check non-include lines for spacing around < and >.
+  match = Search(r'[^<>=!\s](==|!=|<=|>=)[^<>=!\s]', line)
+  if match:
+    error(filename, linenum, 'whitespace/operators', 3,
+          'Missing spaces around %s' % match.group(1))
+  # We allow no-spaces around << when used like this: 10<<20, but
+  # not otherwise (particularly, not when used as streams)
+  # Also ignore using ns::operator<<;
+  match = Search(r'(operator|\S)(?:L|UL|ULL|l|ul|ull)?<<(\S)', line)
+  if (match and
+      not (match.group(1).isdigit() and match.group(2).isdigit()) and
+      not (match.group(1) == 'operator' and match.group(2) == ';')):
+    error(filename, linenum, 'whitespace/operators', 3,
+          'Missing spaces around <<')
+  elif not Match(r'#.*include', line):
+    # Avoid false positives on ->
+    reduced_line = line.replace('->', '')
+
+    # Look for < that is not surrounded by spaces.  This is only
+    # triggered if both sides are missing spaces, even though
+    # technically we should flag if at least one side is missing a
+    # space.  This is done to avoid some false positives with shifts.
+    match = Search(r'[^\s<]<([^\s=<].*)', reduced_line)
+    if (match and
+        not FindNextMatchingAngleBracket(clean_lines, linenum, match.group(1))):
+      error(filename, linenum, 'whitespace/operators', 3,
+            'Missing spaces around <')
+
+    # Look for > that is not surrounded by spaces.  Similar to the
+    # above, we only trigger if both sides are missing spaces to avoid
+    # false positives with shifts.
+    match = Search(r'^(.*[^\s>])>[^\s=>]', reduced_line)
+    if (match and
+        not FindPreviousMatchingAngleBracket(clean_lines, linenum,
+                                             match.group(1))):
+      error(filename, linenum, 'whitespace/operators', 3,
+            'Missing spaces around >')
+
+  # We allow no-spaces around >> for almost anything.  This is because
+  # C++11 allows ">>" to close nested templates, which accounts for
+  # most cases when ">>" is not followed by a space.
+  #
+  # We still warn on ">>" followed by alpha character, because that is
+  # likely due to ">>" being used for right shifts, e.g.:
+  #   value >> alpha
+  #
+  # When ">>" is used to close templates, the alphanumeric letter that
+  # follows would be part of an identifier, and there should still be
+  # a space separating the template type and the identifier.
+  #   type<type<type>> alpha
+  match = Search(r'>>[a-zA-Z_]', line)
+  if match:
+    error(filename, linenum, 'whitespace/operators', 3,
+          'Missing spaces around >>')
+
+  # There shouldn't be space around unary operators
+  match = Search(r'(!\s|~\s|[\s]--[\s;]|[\s]\+\+[\s;])', line)
+  if match:
+    error(filename, linenum, 'whitespace/operators', 4,
+          'Extra space for operator %s' % match.group(1))
+
+  # A pet peeve of mine: no spaces after an if, while, switch, or for
+  match = Search(r' (if\(|for\(|while\(|switch\()', line)
+  if match:
+    error(filename, linenum, 'whitespace/parens', 5,
+          'Missing space before ( in %s' % match.group(1))
+
+  # For if/for/while/switch, the left and right parens should be
+  # consistent about how many spaces are inside the parens, and
+  # there should either be zero or one spaces inside the parens.
+  # We don't want: "if ( foo)" or "if ( foo   )".
+  # Exception: "for ( ; foo; bar)" and "for (foo; bar; )" are allowed.
+  match = Search(r'\b(if|for|while|switch)\s*'
+                 r'\(([ ]*)(.).*[^ ]+([ ]*)\)\s*{\s*$',
+                 line)
+  if match:
+    if len(match.group(2)) != len(match.group(4)):
+      if not (match.group(3) == ';' and
+              len(match.group(2)) == 1 + len(match.group(4)) or
+              not match.group(2) and Search(r'\bfor\s*\(.*; \)', line)):
+        error(filename, linenum, 'whitespace/parens', 5,
+              'Mismatching spaces inside () in %s' % match.group(1))
+    if len(match.group(2)) not in [0, 1]:
+      error(filename, linenum, 'whitespace/parens', 5,
+            'Should have zero or one spaces inside ( and ) in %s' %
+            match.group(1))
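+
+  # e.g. (hypothetical): "if ( foo) {" draws "Mismatching spaces inside
+  # () in if", while "if (  foo  ) {" draws "Should have zero or one
+  # spaces inside ( and ) in if".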
+
+  # You should always have a space after a comma (either as fn arg or operator)
+  #
+  # This does not apply when the non-space character following the
+  # comma is another comma, since the only time when that happens is
+  # for empty macro arguments.
+  #
+  # We run this check in two passes: first pass on elided lines to
+  # verify that lines contain missing whitespaces, second pass on raw
+  # lines to confirm that those missing whitespaces are not due to
+  # elided comments.
+  if Search(r',[^,\s]', line) and Search(r',[^,\s]', raw[linenum]):
+    error(filename, linenum, 'whitespace/comma', 3,
+          'Missing space after ,')
+
+  # You should always have a space after a semicolon
+  # except for few corner cases
+  # TODO(unknown): clarify if 'if (1) { return 1;}' requires one more
+  # space after ;
+  if Search(r';[^\s};\\)/]', line):
+    error(filename, linenum, 'whitespace/semicolon', 3,
+          'Missing space after ;')
+
+  # Next we will look for issues with function calls.
+  CheckSpacingForFunctionCall(filename, line, linenum, error)
+
+  # Except after an opening paren, or after another opening brace (in case of
+  # an initializer list, for instance), you should have spaces before your
+  # braces. And since you should never have braces at the beginning of a line,
+  # this is an easy test.
+  match = Match(r'^(.*[^ ({]){', line)
+  if match:
+    # Try a bit harder to check for brace initialization.  This
+    # happens in one of the following forms:
+    #   Constructor() : initializer_list_{} { ... }
+    #   Constructor{}.MemberFunction()
+    #   Type variable{};
+    #   FunctionCall(type{}, ...);
+    #   LastArgument(..., type{});
+    #   LOG(INFO) << type{} << " ...";
+    #   map_of_type[{...}] = ...;
+    #
+    # We check for the character following the closing brace, and
+    # silence the warning if it's one of those listed above, i.e.
+    # "{.;,)<]".
+    #
+    # To account for nested initializer list, we allow any number of
+    # closing braces up to "{;,)<".  We can't simply silence the
+    # warning on first sight of closing brace, because that would
+    # cause false negatives for things that are not initializer lists.
+    #   Silence this:         But not this:
+    #     Outer{                if (...) {
+    #       Inner{...}            if (...){  // Missing space before {
+    #     };                    }
+    #
+    # There is a false negative with this approach if people inserted
+    # spurious semicolons, e.g. "if (cond){};", but we will catch the
+    # spurious semicolon with a separate check.
+    (endline, endlinenum, endpos) = CloseExpression(
+        clean_lines, linenum, len(match.group(1)))
+    trailing_text = ''
+    if endpos > -1:
+      trailing_text = endline[endpos:]
+    for offset in xrange(endlinenum + 1,
+                         min(endlinenum + 3, clean_lines.NumLines() - 1)):
+      trailing_text += clean_lines.elided[offset]
+    if not Match(r'^[\s}]*[{.;,)<\]]', trailing_text):
+      error(filename, linenum, 'whitespace/braces', 5,
+            'Missing space before {')
+
+  # Make sure '} else {' has spaces.
+  if Search(r'}else', line):
+    error(filename, linenum, 'whitespace/braces', 5,
+          'Missing space before else')
+
+  # You shouldn't have spaces before your brackets, except maybe after
+  # 'delete []' or 'new char * []'.
+  if Search(r'\w\s+\[', line) and not Search(r'delete\s+\[', line):
+    error(filename, linenum, 'whitespace/braces', 5,
+          'Extra space before [')
+
+  # You shouldn't have a space before a semicolon at the end of the line.
+  # There's a special case for "for" since the style guide allows space before
+  # the semicolon there.
+  if Search(r':\s*;\s*$', line):
+    error(filename, linenum, 'whitespace/semicolon', 5,
+          'Semicolon defining empty statement. Use {} instead.')
+  elif Search(r'^\s*;\s*$', line):
+    error(filename, linenum, 'whitespace/semicolon', 5,
+          'Line contains only semicolon. If this should be an empty statement, '
+          'use {} instead.')
+  elif (Search(r'\s+;\s*$', line) and
+        not Search(r'\bfor\b', line)):
+    error(filename, linenum, 'whitespace/semicolon', 5,
+          'Extra space before last semicolon. If this should be an empty '
+          'statement, use {} instead.')
+
+  # In range-based for, we wanted spaces before and after the colon, but
+  # not around "::" tokens that might appear.
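+  # Illustrative (assumed): 'for (auto x: v)' and 'for (auto x :v)' are
+  # flagged; 'for (auto x : v)' passes, and '::' in qualified names is
+  # ignored.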
+  if (Search(r'for *\(.*[^:]:[^: ]', line) or
+      Search(r'for *\(.*[^: ]:[^:]', line)):
+    error(filename, linenum, 'whitespace/forcolon', 2,
+          'Missing space around colon in range-based for loop')
+
+
+def CheckSectionSpacing(filename, clean_lines, class_info, linenum, error):
+  """Checks for additional blank line issues related to sections.
+
+  Currently the only thing checked here is blank line before protected/private.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    class_info: A _ClassInfo object.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  # Skip checks if the class is small, where small means 25 lines or less.
+  # 25 lines seems like a good cutoff since that's the usual height of
+  # terminals, and any class that can't fit in one screen can't really
+  # be considered "small".
+  #
+  # Also skip checks if we are on the first line.  This accounts for
+  # classes that look like
+  #   class Foo { public: ... };
+  #
+  # If we didn't find the end of the class, last_line would be zero,
+  # and the check will be skipped by the first condition.
+  if (class_info.last_line - class_info.starting_linenum <= 24 or
+      linenum <= class_info.starting_linenum):
+    return
+
+  matched = Match(r'\s*(public|protected|private):', clean_lines.lines[linenum])
+  if matched:
+    # Issue warning if the line before public/protected/private was
+    # not a blank line, but don't do this if the previous line contains
+    # "class" or "struct".  This can happen two ways:
+    #  - We are at the beginning of the class.
+    #  - We are forward-declaring an inner class that is semantically
+    #    private, but needed to be public for implementation reasons.
+    # Also ignores cases where the previous line ends with a backslash as can be
+    # common when defining classes in C macros.
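+    # Illustrative (assumed), for a class long enough to be checked:
+    #   void Method();
+    #  private:          // flagged: no blank line above
+    #   int member_;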
+    prev_line = clean_lines.lines[linenum - 1]
+    if (not IsBlankLine(prev_line) and
+        not Search(r'\b(class|struct)\b', prev_line) and
+        not Search(r'\\$', prev_line)):
+      # Try a bit harder to find the beginning of the class.  This is to
+      # account for multi-line base-specifier lists, e.g.:
+      #   class Derived
+      #       : public Base {
+      end_class_head = class_info.starting_linenum
+      for i in range(class_info.starting_linenum, linenum):
+        if Search(r'\{\s*$', clean_lines.lines[i]):
+          end_class_head = i
+          break
+      if end_class_head < linenum - 1:
+        error(filename, linenum, 'whitespace/blank_line', 3,
+              '"%s:" should be preceded by a blank line' % matched.group(1))
+
+
+def GetPreviousNonBlankLine(clean_lines, linenum):
+  """Return the most recent non-blank line and its line number.
+
+  Args:
+    clean_lines: A CleansedLines instance containing the file contents.
+    linenum: The number of the line to check.
+
+  Returns:
+    A tuple with two elements.  The first element is the contents of the last
+    non-blank line before the current line, or the empty string if this is the
+    first non-blank line.  The second is the line number of that line, or -1
+    if this is the first non-blank line.
+  """
+
+  prevlinenum = linenum - 1
+  while prevlinenum >= 0:
+    prevline = clean_lines.elided[prevlinenum]
+    if not IsBlankLine(prevline):     # if not a blank line...
+      return (prevline, prevlinenum)
+    prevlinenum -= 1
+  return ('', -1)
+
+
+def CheckBraces(filename, clean_lines, linenum, error):
+  """Looks for misplaced braces (e.g. at the end of line).
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+
+  line = clean_lines.elided[linenum]        # get rid of comments and strings
+
+  if Match(r'\s*{\s*$', line):
+    # We allow an open brace to start a line in the case where someone is using
+    # braces in a block to explicitly create a new scope, which is commonly used
+    # to control the lifetime of stack-allocated variables.  Braces are also
+    # used for brace initializers inside function calls.  We don't detect this
+    # perfectly: we just don't complain if the last non-whitespace character on
+    # the previous non-blank line is ',', ';', ':', '(', '{', or '}', or if the
+    # previous line starts a preprocessor block.
+    prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0]
+    if (not Search(r'[,;:}{(]\s*$', prevline) and
+        not Match(r'\s*#', prevline)):
+      error(filename, linenum, 'whitespace/braces', 4,
+            '{ should almost always be at the end of the previous line')
+
+  # An else clause should be on the same line as the preceding closing brace.
+  if Match(r'\s*else\s*', line):
+    prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0]
+    if Match(r'\s*}\s*$', prevline):
+      error(filename, linenum, 'whitespace/newline', 4,
+            'An else should appear on the same line as the preceding }')
+
+  # If braces come on one side of an else, they should be on both.
+  # However, we have to worry about "else if" that spans multiple lines!
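+  # Illustrative (assumed): '} else {' passes; 'else {' without the
+  # preceding '}' on the same line is flagged, as is '} else' with no '{',
+  # unless it is an 'else if' whose condition spans multiple lines.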
+  if Search(r'}\s*else[^{]*$', line) or Match(r'[^}]*else\s*{', line):
+    if Search(r'}\s*else if([^{]*)$', line):       # could be multi-line if
+      # find the ( after the if
+      pos = line.find('else if')
+      pos = line.find('(', pos)
+      if pos > 0:
+        (endline, _, endpos) = CloseExpression(clean_lines, linenum, pos)
+        if endline[endpos:].find('{') == -1:    # must be brace after if
+          error(filename, linenum, 'readability/braces', 5,
+                'If an else has a brace on one side, it should have it on both')
+    else:            # common case: else not followed by a multi-line if
+      error(filename, linenum, 'readability/braces', 5,
+            'If an else has a brace on one side, it should have it on both')
+
+  # Likewise, the body of an else should never be on the same line as the
+  # else keyword ("else if" is exempt below).
+  if Search(r'\belse [^\s{]', line) and not Search(r'\belse if\b', line):
+    error(filename, linenum, 'whitespace/newline', 4,
+          'Else clause should never be on same line as else (use 2 lines)')
+
+  # In the same way, a do/while should never be on one line
+  if Match(r'\s*do [^\s{]', line):
+    error(filename, linenum, 'whitespace/newline', 4,
+          'do/while clauses should not be on a single line')
+
+  # Block bodies should not be followed by a semicolon.  Due to C++11
+  # brace initialization, there are more places where semicolons are
+  # required than not, so we use a whitelist approach to check these
+  # rather than a blacklist.  These are the places where "};" should
+  # be replaced by just "}":
+  # 1. Some flavor of block following closing parenthesis:
+  #    for (;;) {};
+  #    while (...) {};
+  #    switch (...) {};
+  #    Function(...) {};
+  #    if (...) {};
+  #    if (...) else if (...) {};
+  #
+  # 2. else block:
+  #    if (...) else {};
+  #
+  # 3. const member function:
+  #    Function(...) const {};
+  #
+  # 4. Block following some statement:
+  #    x = 42;
+  #    {};
+  #
+  # 5. Block at the beginning of a function:
+  #    Function(...) {
+  #      {};
+  #    }
+  #
+  #    Note that naively checking for the preceding "{" will also match
+  #    braces inside multi-dimensional arrays, but this is fine since
+  #    that expression will not contain semicolons.
+  #
+  # 6. Block following another block:
+  #    while (true) {}
+  #    {};
+  #
+  # 7. End of namespaces:
+  #    namespace {};
+  #
+  #    These semicolons seem far more common than other kinds of
+  #    redundant semicolons, possibly due to people converting classes
+  #    to namespaces.  For now we do not warn for this case.
+  #
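+  # Illustrative (assumed): 'for (;;) {};' and 'if (x) {};' end up flagged
+  # by the logic below, while 'int a[] = {1, 2};' and 'namespace {};' do
+  # not.
+  #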
+  # Try matching case 1 first.
+  match = Match(r'^(.*\)\s*)\{', line)
+  if match:
+    # Matched closing parenthesis (case 1).  Check the token before the
+    # matching opening parenthesis, and don't warn if it looks like a
+    # macro.  This avoids these false positives:
+    #  - macro that defines a base class
+    #  - multi-line macro that defines a base class
+    #  - macro that defines the whole class-head
+    #
+    # But we still issue warnings for macros that we know are safe to
+    # warn, specifically:
+    #  - TEST, TEST_F, TEST_P, MATCHER, MATCHER_P
+    #  - TYPED_TEST
+    #  - INTERFACE_DEF
+    #  - EXCLUSIVE_LOCKS_REQUIRED, SHARED_LOCKS_REQUIRED, LOCKS_EXCLUDED:
+    #
+    # We implement a whitelist of safe macros instead of a blacklist of
+    # unsafe macros, even though the latter appears less frequently in
+    # google code and would have been easier to implement.  This is because
+    # the downside for getting the whitelist wrong means some extra
+    # semicolons, while the downside for getting the blacklist wrong
+    # would result in compile errors.
+    #
+    # In addition to macros, we also don't want to warn on compound
+    # literals.
+    closing_brace_pos = match.group(1).rfind(')')
+    opening_parenthesis = ReverseCloseExpression(
+        clean_lines, linenum, closing_brace_pos)
+    if opening_parenthesis[2] > -1:
+      line_prefix = opening_parenthesis[0][0:opening_parenthesis[2]]
+      macro = Search(r'\b([A-Z_]+)\s*$', line_prefix)
+      if ((macro and
+           macro.group(1) not in (
+               'TEST', 'TEST_F', 'MATCHER', 'MATCHER_P', 'TYPED_TEST',
+               'EXCLUSIVE_LOCKS_REQUIRED', 'SHARED_LOCKS_REQUIRED',
+               'LOCKS_EXCLUDED', 'INTERFACE_DEF')) or
+          Search(r'\s+=\s*$', line_prefix)):
+        match = None
+    # Whitelist lambda function definitions, which also require a ";" after
+    # the closing brace.
+    if match and Match(r'^.*\[.*\]\s*(.*\)\s*)\{', line):
+      match = None
+
+  else:
+    # Try matching cases 2-3.
+    match = Match(r'^(.*(?:else|\)\s*const)\s*)\{', line)
+    if not match:
+      # Try matching cases 4-6.  These are always matched on separate lines.
+      #
+      # Note that we can't simply concatenate the previous line to the
+      # current line and do a single match, otherwise we may output
+      # duplicate warnings for the blank line case:
+      #   if (cond) {
+      #     // blank line
+      #   }
+      prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0]
+      if prevline and Search(r'[;{}]\s*$', prevline):
+        match = Match(r'^(\s*)\{', line)
+
+  # Check matching closing brace
+  if match:
+    (endline, endlinenum, endpos) = CloseExpression(
+        clean_lines, linenum, len(match.group(1)))
+    if endpos > -1 and Match(r'^\s*;', endline[endpos:]):
+      # Current {} pair is eligible for semicolon check, and we have found
+      # the redundant semicolon, output warning here.
+      #
+      # Note: because we are scanning forward for opening braces, and
+      # outputting warnings for the matching closing brace, if there are
+      # nested blocks with trailing semicolons, we will get the error
+      # messages in reversed order.
+      error(filename, endlinenum, 'readability/braces', 4,
+            "You don't need a ; after a }")
+
+
+def CheckEmptyBlockBody(filename, clean_lines, linenum, error):
+  """Look for empty loop/conditional body with only a single semicolon.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+
+  # Search for loop keywords at the beginning of the line.  Because only
+  # whitespaces are allowed before the keywords, this will also ignore most
+  # do-while-loops, since those lines should start with a closing brace.
+  #
+  # We also check "if" blocks here, since an empty conditional block
+  # is likely an error.
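+  # Illustrative (assumed): 'while (KeepGoing());' and 'if (done);' are
+  # flagged below; 'while (KeepGoing()) {}' is not.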
+  line = clean_lines.elided[linenum]
+  matched = Match(r'\s*(for|while|if)\s*\(', line)
+  if matched:
+    # Find the end of the conditional expression
+    (end_line, end_linenum, end_pos) = CloseExpression(
+        clean_lines, linenum, line.find('('))
+
+    # Output warning if what follows the condition expression is a semicolon.
+    # No warning for all other cases, including whitespace or newline, since we
+    # have a separate check for semicolons preceded by whitespace.
+    if end_pos >= 0 and Match(r';', end_line[end_pos:]):
+      if matched.group(1) == 'if':
+        error(filename, end_linenum, 'whitespace/empty_conditional_body', 5,
+              'Empty conditional bodies should use {}')
+      else:
+        error(filename, end_linenum, 'whitespace/empty_loop_body', 5,
+              'Empty loop bodies should use {} or continue')
+
+
+def CheckCheck(filename, clean_lines, linenum, error):
+  """Checks the use of CHECK and EXPECT macros.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+
+  # Decide the set of replacement macros that should be suggested
+  lines = clean_lines.elided
+  check_macro = None
+  start_pos = -1
+  for macro in _CHECK_MACROS:
+    i = lines[linenum].find(macro)
+    if i >= 0:
+      check_macro = macro
+
+      # Find opening parenthesis.  Do a regular expression match here
+      # to make sure that we are matching the expected CHECK macro, as
+      # opposed to some other macro that happens to contain the CHECK
+      # substring.
+      matched = Match(r'^(.*\b' + check_macro + r'\s*)\(', lines[linenum])
+      if not matched:
+        continue
+      start_pos = len(matched.group(1))
+      break
+  if not check_macro or start_pos < 0:
+    # Don't waste time here if line doesn't contain 'CHECK' or 'EXPECT'
+    return
+
+  # Find end of the boolean expression by matching parentheses
+  (last_line, end_line, end_pos) = CloseExpression(
+      clean_lines, linenum, start_pos)
+  if end_pos < 0:
+    return
+  if linenum == end_line:
+    expression = lines[linenum][start_pos + 1:end_pos - 1]
+  else:
+    expression = lines[linenum][start_pos + 1:]
+    for i in xrange(linenum + 1, end_line):
+      expression += lines[i]
+    expression += last_line[0:end_pos - 1]
+
+  # Parse expression so that we can take parentheses into account.
+  # This avoids false positives for inputs like "CHECK((a < 4) == b)",
+  # which is not replaceable by CHECK_LE.
+  lhs = ''
+  rhs = ''
+  operator = None
+  while expression:
+    matched = Match(r'^\s*(<<|<<=|>>|>>=|->\*|->|&&|\|\||'
+                    r'==|!=|>=|>|<=|<|\()(.*)$', expression)
+    if matched:
+      token = matched.group(1)
+      if token == '(':
+        # Parenthesized operand
+        expression = matched.group(2)
+        (end, _) = FindEndOfExpressionInLine(expression, 0, 1, '(', ')')
+        if end < 0:
+          return  # Unmatched parenthesis
+        lhs += '(' + expression[0:end]
+        expression = expression[end:]
+      elif token in ('&&', '||'):
+        # Logical and/or operators.  This means the expression
+        # contains more than one term, for example:
+        #   CHECK(42 < a && a < b);
+        #
+        # These are not replaceable with CHECK_LE, so bail out early.
+        return
+      elif token in ('<<', '<<=', '>>', '>>=', '->*', '->'):
+        # Non-relational operator
+        lhs += token
+        expression = matched.group(2)
+      else:
+        # Relational operator
+        operator = token
+        rhs = matched.group(2)
+        break
+    else:
+      # Unparenthesized operand.  Instead of appending to lhs one character
+      # at a time, we do another regular expression match to consume several
+      # characters at once if possible.  Trivial benchmark shows that this
+      # is more efficient when the operands are longer than a single
+      # character, which is generally the case.
+      matched = Match(r'^([^-=!<>()&|]+)(.*)$', expression)
+      if not matched:
+        matched = Match(r'^(\s*\S)(.*)$', expression)
+        if not matched:
+          break
+      lhs += matched.group(1)
+      expression = matched.group(2)
+
+  # Only apply checks if we got all parts of the boolean expression
+  if not (lhs and operator and rhs):
+    return
+
+  # Check that rhs does not contain logical operators.  We already know
+  # that lhs is fine since the loop above parses out && and ||.
+  if rhs.find('&&') > -1 or rhs.find('||') > -1:
+    return
+
+  # At least one of the operands must be a constant literal.  This is
+  # to avoid suggesting replacements for unprintable things like
+  # CHECK(variable != iterator)
+  #
+  # The following pattern matches decimal, hex integers, strings, and
+  # characters (in that order).
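+  # Illustrative (assumed): in 'CHECK(x == 42)' the rhs '42' is a constant,
+  # so a 'CHECK_EQ(x, 42)'-style suggestion is emitted below; 'CHECK(a == b)'
+  # has no constant operand and is left alone.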
+  lhs = lhs.strip()
+  rhs = rhs.strip()
+  match_constant = r'^([-+]?(\d+|0[xX][0-9a-fA-F]+)[lLuU]{0,3}|".*"|\'.*\')$'
+  if Match(match_constant, lhs) or Match(match_constant, rhs):
+    # Note: since we know both lhs and rhs, we can provide a more
+    # descriptive error message like:
+    #   Consider using CHECK_EQ(x, 42) instead of CHECK(x == 42)
+    # Instead of:
+    #   Consider using CHECK_EQ instead of CHECK(a == b)
+    #
+    # We are still keeping the less descriptive message because if lhs
+    # or rhs gets long, the error message might become unreadable.
+    error(filename, linenum, 'readability/check', 2,
+          'Consider using %s instead of %s(a %s b)' % (
+              _CHECK_REPLACEMENT[check_macro][operator],
+              check_macro, operator))
+
+
+def CheckAltTokens(filename, clean_lines, linenum, error):
+  """Check alternative keywords being used in boolean expressions.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  line = clean_lines.elided[linenum]
+
+  # Avoid preprocessor lines
+  if Match(r'^\s*#', line):
+    return
+
+  # Last ditch effort to avoid multi-line comments.  This will not help
+  # if the comment started before the current line or ended after the
+  # current line, but it catches most of the false positives.  At least,
+  # it provides a way to work around this warning for people who use
+  # multi-line comments in preprocessor macros.
+  #
+  # TODO(unknown): remove this once cpplint has better support for
+  # multi-line comments.
+  if line.find('/*') >= 0 or line.find('*/') >= 0:
+    return
+
+  for match in _ALT_TOKEN_REPLACEMENT_PATTERN.finditer(line):
+    error(filename, linenum, 'readability/alt_tokens', 2,
+          'Use operator %s instead of %s' % (
+              _ALT_TOKEN_REPLACEMENT[match.group(1)], match.group(1)))
+
+
+def GetLineWidth(line):
+  """Determines the width of the line in column positions.
+
+  Args:
+    line: A string, which may be a Unicode string.
+
+  Returns:
+    The width of the line in column positions, accounting for Unicode
+    combining characters and wide characters.
+  """
+  if isinstance(line, unicode):
+    width = 0
+    for uc in unicodedata.normalize('NFC', line):
+      if unicodedata.east_asian_width(uc) in ('W', 'F'):
+        width += 2
+      elif not unicodedata.combining(uc):
+        width += 1
+    return width
+  else:
+    return len(line)
+
+
+def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state,
+               error):
+  """Checks rules from the 'C++ style rules' section of cppguide.html.
+
+  Most of these rules are hard to test (naming, comment style), but we
+  do what we can.  In particular we check for 2-space indents, line lengths,
+  tab usage, spaces inside code, etc.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    file_extension: The extension (without the dot) of the filename.
+    nesting_state: A _NestingState instance which maintains information about
+                   the current stack of nested blocks being parsed.
+    error: The function to call with any errors found.
+  """
+
+  # Don't use "elided" lines here, otherwise we can't check commented lines.
+  # Don't want to use "raw" either, because we don't want to check inside C++11
+  # raw strings.
+  raw_lines = clean_lines.lines_without_raw_strings
+  line = raw_lines[linenum]
+
+  if line.find('\t') != -1:
+    error(filename, linenum, 'whitespace/tab', 1,
+          'Tab found; better to use spaces')
+
+  # An indent of one or three spaces at the beginning of the line is weird;
+  # it's hard to reconcile that with 2-space indents.
+  # NOTE: here are the conditions Rob Pike used for his tests.  Mine aren't
+  # as sophisticated, but it may be worth becoming so:  RLENGTH==initial_spaces
+  # if(RLENGTH > 20) complain = 0;
+  # if(match($0, " +(error|private|public|protected):")) complain = 0;
+  # if(match(prev, "&& *$")) complain = 0;
+  # if(match(prev, "\\|\\| *$")) complain = 0;
+  # if(match(prev, "[\",=><] *$")) complain = 0;
+  # if(match($0, " <<")) complain = 0;
+  # if(match(prev, " +for \\(")) complain = 0;
+  # if(prevodd && match(prevprev, " +for \\(")) complain = 0;
+  initial_spaces = 0
+  cleansed_line = clean_lines.elided[linenum]
+  while initial_spaces < len(line) and line[initial_spaces] == ' ':
+    initial_spaces += 1
+  if line and line[-1].isspace():
+    error(filename, linenum, 'whitespace/end_of_line', 4,
+          'Line ends in whitespace.  Consider deleting these extra spaces.')
+  # There are certain situations where we allow one space, notably for
+  # section labels.
+  elif ((initial_spaces == 1 or initial_spaces == 3) and
+        not Match(r'\s*\w+\s*:\s*$', cleansed_line)):
+    error(filename, linenum, 'whitespace/indent', 3,
+          'Weird number of spaces at line-start.  '
+          'Are you using a 2-space indent?')
+
+  # Check if the line is a header guard.
+  is_header_guard = False
+  if file_extension == 'h':
+    cppvar = GetHeaderGuardCPPVariable(filename)
+    if (line.startswith('#ifndef %s' % cppvar) or
+        line.startswith('#define %s' % cppvar) or
+        line.startswith('#endif  // %s' % cppvar)):
+      is_header_guard = True
+  # #include lines and header guards can be long, since there's no clean way to
+  # split them.
+  #
+  # URLs can be long too.  It's possible to split these, but it makes them
+  # harder to cut&paste.
+  #
+  # The "$Id:...$" comment may also get very long without it being the
+  # developer's fault.
+  if (not line.startswith('#include') and not is_header_guard and
+      not Match(r'^\s*//.*http(s?)://\S*$', line) and
+      not Match(r'^// \$Id:.*#[0-9]+ \$$', line)):
+    line_width = GetLineWidth(line)
+    extended_length = int(_line_length * 1.25)
+    if line_width > extended_length:
+      error(filename, linenum, 'whitespace/line_length', 4,
+            'Lines should very rarely be longer than %i characters' %
+            extended_length)
+    elif line_width > _line_length:
+      error(filename, linenum, 'whitespace/line_length', 2,
+            'Lines should be <= %i characters long' % _line_length)
+
+  if (cleansed_line.count(';') > 1 and
+      # for loops are allowed two ;'s (and may run over two lines).
+      cleansed_line.find('for') == -1 and
+      (GetPreviousNonBlankLine(clean_lines, linenum)[0].find('for') == -1 or
+       GetPreviousNonBlankLine(clean_lines, linenum)[0].find(';') != -1) and
+      # It's ok to have many commands in a switch case that fits in 1 line
+      not ((cleansed_line.find('case ') != -1 or
+            cleansed_line.find('default:') != -1) and
+           cleansed_line.find('break;') != -1)):
+    error(filename, linenum, 'whitespace/newline', 0,
+          'More than one command on the same line')
+
+  # Some more style checks
+  CheckBraces(filename, clean_lines, linenum, error)
+  CheckEmptyBlockBody(filename, clean_lines, linenum, error)
+  CheckAccess(filename, clean_lines, linenum, nesting_state, error)
+  CheckSpacing(filename, clean_lines, linenum, nesting_state, error)
+  CheckCheck(filename, clean_lines, linenum, error)
+  CheckAltTokens(filename, clean_lines, linenum, error)
+  classinfo = nesting_state.InnermostClass()
+  if classinfo:
+    CheckSectionSpacing(filename, clean_lines, classinfo, linenum, error)
+
+
+_RE_PATTERN_INCLUDE_NEW_STYLE = re.compile(r'#include +"[^/]+\.h"')
+_RE_PATTERN_INCLUDE = re.compile(r'^\s*#\s*include\s*([<"])([^>"]*)[>"].*$')
+# Matches the first component of a filename delimited by -s and _s. That is:
+#  _RE_FIRST_COMPONENT.match('foo').group(0) == 'foo'
+#  _RE_FIRST_COMPONENT.match('foo.cc').group(0) == 'foo'
+#  _RE_FIRST_COMPONENT.match('foo-bar_baz.cc').group(0) == 'foo'
+#  _RE_FIRST_COMPONENT.match('foo_bar-baz.cc').group(0) == 'foo'
+_RE_FIRST_COMPONENT = re.compile(r'^[^-_.]+')
+
+
+def _DropCommonSuffixes(filename):
+  """Drops common suffixes like _test.cc or -inl.h from filename.
+
+  For example:
+    >>> _DropCommonSuffixes('foo/foo-inl.h')
+    'foo/foo'
+    >>> _DropCommonSuffixes('foo/bar/foo.cc')
+    'foo/bar/foo'
+    >>> _DropCommonSuffixes('foo/foo_internal.h')
+    'foo/foo'
+    >>> _DropCommonSuffixes('foo/foo_unusualinternal.h')
+    'foo/foo_unusualinternal'
+
+  Args:
+    filename: The input filename.
+
+  Returns:
+    The filename with the common suffix removed.
+  """
+  for suffix in ('test.cc', 'regtest.cc', 'unittest.cc',
+                 'inl.h', 'impl.h', 'internal.h'):
+    if (filename.endswith(suffix) and len(filename) > len(suffix) and
+        filename[-len(suffix) - 1] in ('-', '_')):
+      return filename[:-len(suffix) - 1]
+  return os.path.splitext(filename)[0]
+
+
+def _IsTestFilename(filename):
+  """Determines if the given filename has a suffix that identifies it as a test.
+
+  Args:
+    filename: The input filename.
+
+  Returns:
+    True if 'filename' looks like a test, False otherwise.
+  """
+  if (filename.endswith('_test.cc') or
+      filename.endswith('_unittest.cc') or
+      filename.endswith('_regtest.cc')):
+    return True
+  else:
+    return False
+
+
+def _ClassifyInclude(fileinfo, include, is_system):
+  """Figures out what kind of header 'include' is.
+
+  Args:
+    fileinfo: The current file cpplint is running over. A FileInfo instance.
+    include: The path to a #included file.
+    is_system: True if the #include used <> rather than "".
+
+  Returns:
+    One of the _XXX_HEADER constants.
+
+  For example:
+    >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'stdio.h', True)
+    _C_SYS_HEADER
+    >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'string', True)
+    _CPP_SYS_HEADER
+    >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'foo/foo.h', False)
+    _LIKELY_MY_HEADER
+    >>> _ClassifyInclude(FileInfo('foo/foo_unknown_extension.cc'),
+    ...                  'bar/foo_other_ext.h', False)
+    _POSSIBLE_MY_HEADER
+    >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'foo/bar.h', False)
+    _OTHER_HEADER
+  """
+  # This is a list of all standard c++ header files, except
+  # those already checked for above.
+  is_cpp_h = include in _CPP_HEADERS
+
+  if is_system:
+    if is_cpp_h:
+      return _CPP_SYS_HEADER
+    else:
+      return _C_SYS_HEADER
+
+  # If the target file and the include we're checking share a
+  # basename when we drop common extensions, and the include
+  # lives in the same directory, then it's likely to be owned by the
+  # target file.
+  target_dir, target_base = (
+      os.path.split(_DropCommonSuffixes(fileinfo.RepositoryName())))
+  include_dir, include_base = os.path.split(_DropCommonSuffixes(include))
+  if target_base == include_base and (
+      include_dir == target_dir or
+      include_dir == os.path.normpath(target_dir + '/../public')):
+    return _LIKELY_MY_HEADER
+
+  # If the target and include share some initial basename
+  # component, it's possible the target is implementing the
+  # include, so it's allowed to be first, but we'll never
+  # complain if it's not there.
+  target_first_component = _RE_FIRST_COMPONENT.match(target_base)
+  include_first_component = _RE_FIRST_COMPONENT.match(include_base)
+  if (target_first_component and include_first_component and
+      target_first_component.group(0) ==
+      include_first_component.group(0)):
+    return _POSSIBLE_MY_HEADER
+
+  return _OTHER_HEADER
+
+
+def CheckIncludeLine(filename, clean_lines, linenum, include_state, error):
+  """Check rules that are applicable to #include lines.
+
+  Strings on #include lines are NOT removed from the elided lines, to make
+  certain tasks easier. However, to prevent false positives, checks
+  applicable to #include lines in CheckLanguage must be put here.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    include_state: An _IncludeState instance in which the headers are inserted.
+    error: The function to call with any errors found.
+  """
+  fileinfo = FileInfo(filename)
+
+  line = clean_lines.lines[linenum]
+
+  # "include" should use the new style "foo/bar.h" instead of just "bar.h"
+  if _RE_PATTERN_INCLUDE_NEW_STYLE.search(line):
+    error(filename, linenum, 'build/include', 4,
+          'Include the directory when naming .h files')
+
+  # We shouldn't include a file more than once.  Actually, there are a
+  # handful of instances where doing so is okay, but in general it's
+  # not.
+  match = _RE_PATTERN_INCLUDE.search(line)
+  if match:
+    include = match.group(2)
+    is_system = (match.group(1) == '<')
+    if include in include_state:
+      error(filename, linenum, 'build/include', 4,
+            '"%s" already included at %s:%s' %
+            (include, filename, include_state[include]))
+    else:
+      include_state[include] = linenum
+
+      # We want to ensure that headers appear in the right order:
+      # 1) for foo.cc, foo.h  (preferred location)
+      # 2) c system files
+      # 3) cpp system files
+      # 4) for foo.cc, foo.h  (deprecated location)
+      # 5) other google headers
+      #
+      # We classify each include statement as one of those 5 types
+      # using a number of techniques. The include_state object keeps
+      # track of the highest type seen, and complains if we see a
+      # lower type after that.
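+      # Illustrative (assumed) ordering for foo.cc:
+      #   #include "foo/foo.h"
+      #   #include <sys/types.h>
+      #   #include <vector>
+      #   #include "base/util.h"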
+      error_message = include_state.CheckNextIncludeOrder(
+          _ClassifyInclude(fileinfo, include, is_system))
+      if error_message:
+        error(filename, linenum, 'build/include_order', 4,
+              '%s. Should be: %s.h, c system, c++ system, other.' %
+              (error_message, fileinfo.BaseName()))
+      canonical_include = include_state.CanonicalizeAlphabeticalOrder(include)
+      if not include_state.IsInAlphabeticalOrder(
+          clean_lines, linenum, canonical_include):
+        error(filename, linenum, 'build/include_alpha', 4,
+              'Include "%s" not in alphabetical order' % include)
+      include_state.SetLastHeader(canonical_include)
+
+  # Look for any of the stream classes that are part of standard C++.
+  match = _RE_PATTERN_INCLUDE.match(line)
+  if match:
+    include = match.group(2)
+    if Match(r'(f|ind|io|i|o|parse|pf|stdio|str)?stream$', include):
+      # Many unit tests use cout, so we exempt them.
+      if not _IsTestFilename(filename):
+        error(filename, linenum, 'readability/streams', 3,
+              'Streams are highly discouraged.')
+
+
+def _GetTextInside(text, start_pattern):
+  r"""Retrieves all the text between matching open and close parentheses.
+
+  Given a string of lines and a regular expression string, retrieve all the
+  text following the expression and between opening punctuation symbols like
+  (, [, or {, and the matching close-punctuation symbol. This properly handles
+  nested occurrences of the punctuation, so for text like
+    printf(a(), b(c()));
+  a call to _GetTextInside(text, r'printf\(') will return 'a(), b(c())'.
+  start_pattern must match a string that ends with an opening punctuation
+  symbol.
+
+  Args:
+    text: The text to extract from.  Its comments and strings must be
+          elided.  It may be a single line or span multiple lines.
+    start_pattern: The regexp string indicating where to start extracting
+                   the text.
+  Returns:
+    The extracted text.
+    None if either the opening string or ending punctuation could not be found.
+  """
+  # TODO(sugawarayu): Audit cpplint.py to see what places could be profitably
+  # rewritten to use _GetTextInside (and currently use inferior regexp
+  # matching).
+
+  # Map opening punctuation symbols to the matching closing symbols.
+  matching_punctuation = {'(': ')', '{': '}', '[': ']'}
+  closing_punctuation = set(matching_punctuation.itervalues())
+
+  # Find the position to start extracting text.
+  match = re.search(start_pattern, text, re.M)
+  if not match:  # start_pattern not found in text.
+    return None
+  start_position = match.end(0)
+
+  assert start_position > 0, (
+      'start_pattern must end with an opening punctuation.')
+  assert text[start_position - 1] in matching_punctuation, (
+      'start_pattern must end with an opening punctuation.')
+  # Stack of closing punctuations we expect to have in text after position.
+  punctuation_stack = [matching_punctuation[text[start_position - 1]]]
+  position = start_position
+  while punctuation_stack and position < len(text):
+    if text[position] == punctuation_stack[-1]:
+      punctuation_stack.pop()
+    elif text[position] in closing_punctuation:
+      # A closing punctuation without matching opening punctuations.
+      return None
+    elif text[position] in matching_punctuation:
+      punctuation_stack.append(matching_punctuation[text[position]])
+    position += 1
+  if punctuation_stack:
+    # Opening punctuations left without matching close-punctuations.
+    return None
+  # All punctuation matched; return the enclosed text.
+  return text[start_position:position - 1]
+
+
+# Patterns for matching call-by-reference parameters.
+#
+# Supports nested templates up to 2 levels deep using this messy pattern:
+#   < (?: < (?: < [^<>]*
+#               >
+#           |   [^<>] )*
+#         >
+#     |   [^<>] )*
+#   >
+_RE_PATTERN_IDENT = r'[_a-zA-Z]\w*'  # =~ [[:alpha:]][[:alnum:]]*
+_RE_PATTERN_TYPE = (
+    r'(?:const\s+)?(?:typename\s+|class\s+|struct\s+|union\s+|enum\s+)?'
+    r'(?:\w|'
+    r'\s*<(?:<(?:<[^<>]*>|[^<>])*>|[^<>])*>|'
+    r'::)+')
+# A call-by-reference parameter ends with '& identifier'.
+_RE_PATTERN_REF_PARAM = re.compile(
+    r'(' + _RE_PATTERN_TYPE + r'(?:\s*(?:\bconst\b|[*]))*\s*'
+    r'&\s*' + _RE_PATTERN_IDENT + r')\s*(?:=[^,()]+)?[,)]')
+# A call-by-const-reference parameter either ends with 'const& identifier'
+# or looks like 'const type& identifier' when 'type' is atomic.
+_RE_PATTERN_CONST_REF_PARAM = (
+    r'(?:.*\s*\bconst\s*&\s*' + _RE_PATTERN_IDENT +
+    r'|const\s+' + _RE_PATTERN_TYPE + r'\s*&\s*' + _RE_PATTERN_IDENT + r')')
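+# Illustrative (assumed): in 'void F(const string &a, string &b);' these
+# patterns classify 'a' as a const reference and 'b' as a non-const
+# reference; the latter is what CheckForNonConstReference reports.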
+
+
+def CheckLanguage(filename, clean_lines, linenum, file_extension,
+                  include_state, nesting_state, error):
+  """Checks rules from the 'C++ language rules' section of cppguide.html.
+
+  Some of these rules are hard to test (function overloading, using
+  uint32 inappropriately), but we do the best we can.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    file_extension: The extension (without the dot) of the filename.
+    include_state: An _IncludeState instance in which the headers are inserted.
+    nesting_state: A _NestingState instance which maintains information about
+                   the current stack of nested blocks being parsed.
+    error: The function to call with any errors found.
+  """
+  # If the line is empty or consists entirely of a comment, no need to
+  # check it.
+  line = clean_lines.elided[linenum]
+  if not line:
+    return
+
+  match = _RE_PATTERN_INCLUDE.search(line)
+  if match:
+    CheckIncludeLine(filename, clean_lines, linenum, include_state, error)
+    return
+
+  # Reset include state across preprocessor directives.  This is meant
+  # to silence warnings for conditional includes.
+  if Match(r'^\s*#\s*(?:ifdef|elif|else|endif)\b', line):
+    include_state.ResetSection()
+
+  # Make Windows paths like Unix.
+  fullname = os.path.abspath(filename).replace('\\', '/')
+
+  # TODO(unknown): figure out if they're using default arguments in fn proto.
+
+  # Check to see if they're using a conversion function cast.
+  # I just try to capture the most common basic types, though there are more.
+  # Parameterless conversion functions, such as bool(), are allowed as they are
+  # probably a member operator declaration or default constructor.
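+  # Illustrative (assumed): 'int(x)' and 'double(total)' are flagged as
+  # deprecated casts below; parameterless 'bool()' does not match the
+  # pattern.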
+  match = Search(
+      r'(\bnew\s+)?\b'  # Grab 'new' operator, if it's there
+      r'(int|float|double|bool|char|int32|uint32|int64|uint64)'
+      r'(\([^)].*)', line)
+  if match:
+    matched_new = match.group(1)
+    matched_type = match.group(2)
+    matched_funcptr = match.group(3)
+
+    # gMock methods are defined using some variant of MOCK_METHODx(name, type)
+    # where type may be float(), int(string), etc.  Without context they are
+    # virtually indistinguishable from int(x) casts. Likewise, gMock's
+    # MockCallback takes a template parameter of the form return_type(arg_type),
+    # which looks much like the cast we're trying to detect.
+    #
+    # std::function<> wrapper has a similar problem.
+    #
+    # Return types for function pointers also look like casts if they
+    # don't have an extra space.
+    if (matched_new is None and  # If new operator, then this isn't a cast
+        not (Match(r'^\s*MOCK_(CONST_)?METHOD\d+(_T)?\(', line) or
+             Search(r'\bMockCallback<.*>', line) or
+             Search(r'\bstd::function<.*>', line)) and
+        not (matched_funcptr and
+             Match(r'\((?:[^() ]+::\s*\*\s*)?[^() ]+\)\s*\(',
+                   matched_funcptr))):
+      # Try a bit harder to catch gmock lines: the only place where
+      # something looks like an old-style cast is where we declare the
+      # return type of the mocked method, and the only time when we
+      # are missing context is if MOCK_METHOD was split across
+      # multiple lines.  The missing MOCK_METHOD is usually one or two
+      # lines back, so scan back one or two lines.
+      #
+      # It's not possible for gmock macros to appear in the first 2
+      # lines, since the class head + section name takes up 2 lines.
+      if (linenum < 2 or
+          not (Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\((?:\S+,)?\s*$',
+                     clean_lines.elided[linenum - 1]) or
+               Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\(\s*$',
+                     clean_lines.elided[linenum - 2]))):
+        error(filename, linenum, 'readability/casting', 4,
+              'Using deprecated casting style.  '
+              'Use static_cast<%s>(...) instead' %
+              matched_type)
+
+  CheckCStyleCast(filename, linenum, line, clean_lines.raw_lines[linenum],
+                  'static_cast',
+                  r'\((int|float|double|bool|char|u?int(16|32|64))\)', error)
+
+  # This doesn't catch all cases. Consider (const char * const)"hello".
+  #
+  # (char *) "foo" should always be a const_cast (reinterpret_cast won't
+  # compile).
+  if CheckCStyleCast(filename, linenum, line, clean_lines.raw_lines[linenum],
+                     'const_cast', r'\((char\s?\*+\s?)\)\s*"', error):
+    pass
+  else:
+    # Check pointer casts for other than string constants
+    CheckCStyleCast(filename, linenum, line, clean_lines.raw_lines[linenum],
+                    'reinterpret_cast', r'\((\w+\s?\*+\s?)\)', error)
+
+  # In addition, we look for people taking the address of a cast.  This
+  # is dangerous -- casts can assign to temporaries, so the pointer doesn't
+  # point where you think.
+  match = Search(
+      r'(?:&\(([^)]+)\)[\w(])|'
+      r'(?:&(static|dynamic|down|reinterpret)_cast\b)', line)
+  if match and match.group(1) != '*':
+    error(filename, linenum, 'runtime/casting', 4,
+          ('Are you taking an address of a cast?  '
+           'This is dangerous: could be a temp var.  '
+           'Take the address before doing the cast, rather than after'))
+
+  # Create an extended_line, which is the concatenation of the current and
+  # next lines, for more effective checking of code that may span more than one
+  # line.
+  if linenum + 1 < clean_lines.NumLines():
+    extended_line = line + clean_lines.elided[linenum + 1]
+  else:
+    extended_line = line
+
+  # Check for people declaring static/global STL strings at the top level.
+  # This is dangerous because the C++ language does not guarantee that
+  # globals with constructors are initialized before the first access.
+  match = Match(
+      r'((?:|static +)(?:|const +))string +([a-zA-Z0-9_:]+)\b(.*)',
+      line)
+  # Make sure it's not a function.
+  # Function template specialization looks like: "string foo<Type>(...".
+  # Class template definitions look like: "string Foo<Type>::Method(...".
+  #
+  # Also ignore things that look like operators.  These are matched separately
+  # because operator names cross non-word boundaries.  If we change the pattern
+  # above, we would decrease the accuracy of matching identifiers.
+  if (match and
+      not Search(r'\boperator\W', line) and
+      not Match(r'\s*(<.*>)?(::[a-zA-Z0-9_]+)?\s*\(([^"]|$)', match.group(3))):
+    error(filename, linenum, 'runtime/string', 4,
+          'For a static/global string constant, use a C style string instead: '
+          '"%schar %s[]".' %
+          (match.group(1), match.group(2)))
+
+  if Search(r'\b([A-Za-z0-9_]*_)\(\1\)', line):
+    error(filename, linenum, 'runtime/init', 4,
+          'You seem to be initializing a member variable with itself.')
+
+  if file_extension == 'h':
+    # TODO(unknown): check that 1-arg constructors are explicit.
+    #                How to tell it's a constructor?
+    #                (handled in CheckForNonStandardConstructs for now)
+    # TODO(unknown): check that classes have DISALLOW_EVIL_CONSTRUCTORS
+    #                (level 1 error)
+    pass
+
+  # Check if people are using the verboten C basic types.  The only exception
+  # we regularly allow is "unsigned short port" for port.
+  if Search(r'\bshort port\b', line):
+    if not Search(r'\bunsigned short port\b', line):
+      error(filename, linenum, 'runtime/int', 4,
+            'Use "unsigned short" for ports, not "short"')
+  else:
+    match = Search(r'\b(short|long(?! +double)|long long)\b', line)
+    if match:
+      error(filename, linenum, 'runtime/int', 4,
+            'Use int16/int64/etc, rather than the C type %s' % match.group(1))
+
+  # When snprintf is used, the second argument shouldn't be a literal.
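+  # Illustrative (assumed): 'snprintf(buf, 10, "%d", x)' is flagged, while
+  # 'snprintf(buf, 0, ...)' (used to compute a size) and
+  # 'snprintf(buf, sizeof(buf), ...)' are not.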
+  match = Search(r'snprintf\s*\(([^,]*),\s*([0-9]*)\s*,', line)
+  if match and match.group(2) != '0':
+    # If 2nd arg is zero, snprintf is used to calculate size.
+    error(filename, linenum, 'runtime/printf', 3,
+          'If you can, use sizeof(%s) instead of %s as the 2nd arg '
+          'to snprintf.' % (match.group(1), match.group(2)))
+
+  # Check if some verboten C functions are being used.
+  if Search(r'\bsprintf\b', line):
+    error(filename, linenum, 'runtime/printf', 5,
+          'Never use sprintf.  Use snprintf instead.')
+  match = Search(r'\b(strcpy|strcat)\b', line)
+  if match:
+    error(filename, linenum, 'runtime/printf', 4,
+          'Almost always, snprintf is better than %s' % match.group(1))
+
+  # Check if some verboten operator overloading is going on
+  # TODO(unknown): catch out-of-line unary operator&:
+  #   class X {};
+  #   int operator&(const X& x) { return 42; }  // unary operator&
+  # The trick is it's hard to tell apart from binary operator&:
+  #   class Y { int operator&(const Y& x) { return 23; } }; // binary operator&
+  if Search(r'\boperator\s*&\s*\(\s*\)', line):
+    error(filename, linenum, 'runtime/operator', 4,
+          'Unary operator& is dangerous.  Do not use it.')
+
+  # Check for suspicious usage of "if" like
+  # } if (a == b) {
+  if Search(r'\}\s*if\s*\(', line):
+    error(filename, linenum, 'readability/braces', 4,
+          'Did you mean "else if"? If not, start a new line for "if".')
+
+  # Check for potential format string bugs like printf(foo).
+  # We constrain the pattern not to pick things like DocidForPrintf(foo).
+  # Not perfect but it can catch printf(foo.c_str()) and printf(foo->c_str())
+  # TODO(sugawarayu): Catch the following case. Need to change the calling
+  # convention of the whole function to process multiple line to handle it.
+  #   printf(
+  #       boy_this_is_a_really_long_variable_that_cannot_fit_on_the_prev_line);
+  printf_args = _GetTextInside(line, r'(?i)\b(string)?printf\s*\(')
+  if printf_args:
+    match = Match(r'([\w.\->()]+)$', printf_args)
+    if match and match.group(1) != '__VA_ARGS__':
+      function_name = re.search(r'\b((?:string)?printf)\s*\(',
+                                line, re.I).group(1)
+      error(filename, linenum, 'runtime/printf', 4,
+            'Potential format string bug. Do %s("%%s", %s) instead.'
+            % (function_name, match.group(1)))
+
+  # Check for potential memset bugs like memset(buf, sizeof(buf), 0).
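+  # Illustrative (assumed): the suggested rewrite is 'memset(buf, 0,
+  # sizeof(buf))'; 'memset(buf, 0, len)' is not matched, since the third
+  # argument must be a literal 0.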
+  match = Search(r'memset\s*\(([^,]*),\s*([^,]*),\s*0\s*\)', line)
+  if match and not Match(r"^''|-?[0-9]+|0x[0-9A-Fa-f]$", match.group(2)):
+    error(filename, linenum, 'runtime/memset', 4,
+          'Did you mean "memset(%s, 0, %s)"?'
+          % (match.group(1), match.group(2)))
+
+  if Search(r'\busing namespace\b', line):
+    error(filename, linenum, 'build/namespaces', 5,
+          'Do not use namespace using-directives.  '
+          'Use using-declarations instead.')
+
+  # Detect variable-length arrays.
+  match = Match(r'\s*(.+::)?(\w+) [a-z]\w*\[(.+)];', line)
+  if (match and match.group(2) != 'return' and match.group(2) != 'delete' and
+      match.group(3).find(']') == -1):
+    # Split the size using space and arithmetic operators as delimiters.
+    # If any of the resulting tokens are not compile time constants then
+    # report the error.
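+    # Illustrative (assumed): in 'int buf[count * 2];' the token 'count' is
+    # not a recognized compile-time constant, so the line is flagged, while
+    # 'int buf[kMaxCount];' and sizeof-based sizes pass.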
+    tokens = re.split(r'\s|\+|\-|\*|\/|<<|>>', match.group(3))
+    is_const = True
+    skip_next = False
+    for tok in tokens:
+      if skip_next:
+        skip_next = False
+        continue
+
+      if Search(r'sizeof\(.+\)', tok): continue
+      if Search(r'arraysize\(\w+\)', tok): continue
+
+      tok = tok.lstrip('(')
+      tok = tok.rstrip(')')
+      if not tok: continue
+      if Match(r'\d+', tok): continue
+      if Match(r'0[xX][0-9a-fA-F]+', tok): continue
+      if Match(r'k[A-Z0-9]\w*', tok): continue
+      if Match(r'(.+::)?k[A-Z0-9]\w*', tok): continue
+      if Match(r'(.+::)?[A-Z][A-Z0-9_]*', tok): continue
+      # A catch all for tricky sizeof cases, including 'sizeof expression',
+      # 'sizeof(*type)', 'sizeof(const type)', 'sizeof(struct StructName)'
+      # requires skipping the next token because we split on ' ' and '*'.
+      if tok.startswith('sizeof'):
+        skip_next = True
+        continue
+      is_const = False
+      break
+    if not is_const:
+      error(filename, linenum, 'runtime/arrays', 1,
+            'Do not use variable-length arrays.  Use an appropriately named '
+            "('k' followed by CamelCase) compile-time constant for the size.")
+
+  # If DISALLOW_EVIL_CONSTRUCTORS, DISALLOW_COPY_AND_ASSIGN, or
+  # DISALLOW_IMPLICIT_CONSTRUCTORS is present, then it should be the last thing
+  # in the class declaration.
+  match = Match(
+      (r'\s*'
+       r'(DISALLOW_(EVIL_CONSTRUCTORS|COPY_AND_ASSIGN|IMPLICIT_CONSTRUCTORS))'
+       r'\(.*\);$'),
+      line)
+  if match and linenum + 1 < clean_lines.NumLines():
+    next_line = clean_lines.elided[linenum + 1]
+    # We allow some, but not all, declarations of variables to be present
+    # in the statement that defines the class.  The [\w\*,\s]* fragment of
+    # the regular expression below allows users to declare instances of
+    # the class or pointers to instances, but not less common types such
+    # as function pointers or arrays.  It's a tradeoff between allowing
+    # reasonable code and avoiding trying to parse more C++ using regexps.
+    if not Search(r'^\s*}[\w\*,\s]*;', next_line):
+      error(filename, linenum, 'readability/constructors', 3,
+            match.group(1) + ' should be the last thing in the class')
+
+  # Check for use of unnamed namespaces in header files.  Registration
+  # macros are typically OK, so we allow use of "namespace {" on lines
+  # that end with backslashes.
+  if (file_extension == 'h'
+      and Search(r'\bnamespace\s*{', line)
+      and line[-1] != '\\'):
+    error(filename, linenum, 'build/namespaces', 4,
+          'Do not use unnamed namespaces in header files.  See '
+          'http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Namespaces'
+          ' for more information.')
+
+
+def CheckForNonConstReference(filename, clean_lines, linenum,
+                              nesting_state, error):
+  """Check for non-const references.
+
+  Separate from CheckLanguage since it scans backwards from current
+  line, instead of scanning forward.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    nesting_state: A _NestingState instance which maintains information about
+                   the current stack of nested blocks being parsed.
+    error: The function to call with any errors found.
+  """
+  # Do nothing if there is no '&' on current line.
+  line = clean_lines.elided[linenum]
+  if '&' not in line:
+    return
+
+  # Long type names may be broken across multiple lines, usually in one
+  # of these forms:
+  #   LongType
+  #       ::LongTypeContinued &identifier
+  #   LongType::
+  #       LongTypeContinued &identifier
+  #   LongType<
+  #       ...>::LongTypeContinued &identifier
+  #
+  # If we detected a type split across two lines, join the previous
+  # line to current line so that we can match const references
+  # accordingly.
+  #
+  # Note that this only scans back one line, since scanning back
+  # arbitrary number of lines would be expensive.  If you have a type
+  # that spans more than 2 lines, please use a typedef.
+  if linenum > 1:
+    previous = None
+    if Match(r'\s*::(?:[\w<>]|::)+\s*&\s*\S', line):
+      # previous_line\n + ::current_line
+      previous = Search(r'\b((?:const\s*)?(?:[\w<>]|::)+[\w<>])\s*$',
+                        clean_lines.elided[linenum - 1])
+    elif Match(r'\s*[a-zA-Z_]([\w<>]|::)+\s*&\s*\S', line):
+      # previous_line::\n + current_line
+      previous = Search(r'\b((?:const\s*)?(?:[\w<>]|::)+::)\s*$',
+                        clean_lines.elided[linenum - 1])
+    if previous:
+      line = previous.group(1) + line.lstrip()
+    else:
+      # Check for templated parameter that is split across multiple lines
+      endpos = line.rfind('>')
+      if endpos > -1:
+        (_, startline, startpos) = ReverseCloseExpression(
+            clean_lines, linenum, endpos)
+        if startpos > -1 and startline < linenum:
+          # Found the matching < on an earlier line, collect all
+          # pieces up to current line.
+          line = ''
+          for i in xrange(startline, linenum + 1):
+            line += clean_lines.elided[i].strip()
+
+  # Check for non-const references in function parameters.  A single '&' may
+  # be found in the following places:
+  #   inside expression: binary & for bitwise AND
+  #   inside expression: unary & for taking the address of something
+  #   inside declarators: reference parameter
+  # We will exclude the first two cases by checking that we are not inside a
+  # function body, including one that was just introduced by a trailing '{'.
+  # TODO(unknown): Doesn't account for preprocessor directives.
+  # TODO(unknown): Doesn't account for 'catch(Exception& e)' [rare].
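+  # Illustrative (assumed): 'void Update(Config& config);' at class scope is
+  # flagged; 'void Update(const Config& config);' and the parameters of
+  # whitelisted functions like 'swap(T& a, T& b)' are not.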
+  check_params = False
+  if not nesting_state.stack:
+    check_params = True  # top level
+  elif (isinstance(nesting_state.stack[-1], _ClassInfo) or
+        isinstance(nesting_state.stack[-1], _NamespaceInfo)):
+    check_params = True  # within class or namespace
+  elif Match(r'.*{\s*$', line):
+    if (len(nesting_state.stack) == 1 or
+        isinstance(nesting_state.stack[-2], _ClassInfo) or
+        isinstance(nesting_state.stack[-2], _NamespaceInfo)):
+      check_params = True  # just opened global/class/namespace block
+  # We allow non-const references in a few standard places, like functions
+  # called "swap()" or iostream operators like "<<" or ">>".  Do not check
+  # those function parameters.
+  #
+  # We also accept & in static_assert, which looks like a function but
+  # it's actually a declaration expression.
+  whitelisted_functions = (r'(?:[sS]wap(?:<[\w:]+>)?|'
+                           r'operator\s*[<>][<>]|'
+                           r'static_assert|COMPILE_ASSERT'
+                           r')\s*\(')
+  if Search(whitelisted_functions, line):
+    check_params = False
+  elif not Search(r'\S+\([^)]*$', line):
+    # Don't see a whitelisted function on this line.  Actually we
+    # didn't see any function name on this line, so this is likely a
+    # multi-line parameter list.  Try a bit harder to catch this case.
+    for i in xrange(2):
+      if (linenum > i and
+          Search(whitelisted_functions, clean_lines.elided[linenum - i - 1])):
+        check_params = False
+        break
+
+  if check_params:
+    decls = ReplaceAll(r'{[^}]*}', ' ', line)  # exclude function body
+    for parameter in re.findall(_RE_PATTERN_REF_PARAM, decls):
+      if not Match(_RE_PATTERN_CONST_REF_PARAM, parameter):
+        error(filename, linenum, 'runtime/references', 2,
+              'Is this a non-const reference? '
+              'If so, make const or use a pointer: ' +
+              ReplaceAll(' *<', '<', parameter))
+
+
+def CheckCStyleCast(filename, linenum, line, raw_line, cast_type, pattern,
+                    error):
+  """Checks for a C-style cast by looking for the pattern.
+
+  Args:
+    filename: The name of the current file.
+    linenum: The number of the line to check.
+    line: The line of code to check.
+    raw_line: The raw line of code to check, with comments.
+    cast_type: The string for the C++ cast to recommend.  This is either
+      reinterpret_cast, static_cast, or const_cast, depending.
+    pattern: The regular expression used to find C-style casts.
+    error: The function to call with any errors found.
+
+  Returns:
+    True if an error was emitted.
+    False otherwise.
+  """
+  match = Search(pattern, line)
+  if not match:
+    return False
+
+  # Exclude lines with sizeof, since sizeof looks like a cast.
+  sizeof_match = Match(r'.*sizeof\s*$', line[0:match.start(1) - 1])
+  if sizeof_match:
+    return False
+
+  # operator++(int) and operator--(int)
+  if (line[0:match.start(1) - 1].endswith(' operator++') or
+      line[0:match.start(1) - 1].endswith(' operator--')):
+    return False
+
+  # A single unnamed argument for a function tends to look like an
+  # old-style cast.  If we see one, don't issue warnings for deprecated
+  # casts; instead issue warnings for unnamed arguments where
+  # appropriate.
+  #
+  # These are things that we want warnings for, since the style guide
+  # explicitly requires all parameters to be named:
+  #   Function(int);
+  #   Function(int) {
+  #   ConstMember(int) const;
+  #   ConstMember(int) const {
+  #   ExceptionMember(int) throw (...);
+  #   ExceptionMember(int) throw (...) {
+  #   PureVirtual(int) = 0;
+  #
+  # These are functions of some sort, where the compiler would be fine
+  # if they had named parameters, but people often omit those
+  # identifiers to reduce clutter:
+  #   (FunctionPointer)(int);
+  #   (FunctionPointer)(int) = value;
+  #   Function((function_pointer_arg)(int))
+  #   <TemplateArgument(int)>;
+  #   <(FunctionPointerTemplateArgument)(int)>;
+  remainder = line[match.end(0):]
+  if Match(r'^\s*(?:;|const\b|throw\b|=|>|\{|\))', remainder):
+    # Looks like an unnamed parameter.
+
+    # Don't warn on any kind of template arguments.
+    if Match(r'^\s*>', remainder):
+      return False
+
+    # Don't warn on assignments to function pointers, but keep warnings for
+    # unnamed parameters to pure virtual functions.  Note that this pattern
+    # will also pass on assignments of "0" to function pointers, but the
+    # preferred values for those would be "nullptr" or "NULL".
+    matched_zero = Match(r'^\s=\s*(\S+)\s*;', remainder)
+    if matched_zero and matched_zero.group(1) != '0':
+      return False
+
+    # Don't warn on function pointer declarations.  For this we need
+    # to check what came before the "(type)" string.
+    if Match(r'.*\)\s*$', line[0:match.start(0)]):
+      return False
+
+    # Don't warn if the parameter is named with block comments, e.g.:
+    #  Function(int /*unused_param*/);
+    if '/*' in raw_line:
+      return False
+
+    # Passed all filters, issue warning here.
+    error(filename, linenum, 'readability/function', 3,
+          'All parameters should be named in a function')
+    return True
+
+  # At this point, all that should be left is actual casts.
+  error(filename, linenum, 'readability/casting', 4,
+        'Using C-style cast.  Use %s<%s>(...) instead' %
+        (cast_type, match.group(1)))
+
+  return True
+
+
+_HEADERS_CONTAINING_TEMPLATES = (
+    ('<deque>', ('deque',)),
+    ('<functional>', ('unary_function', 'binary_function',
+                      'plus', 'minus', 'multiplies', 'divides', 'modulus',
+                      'negate',
+                      'equal_to', 'not_equal_to', 'greater', 'less',
+                      'greater_equal', 'less_equal',
+                      'logical_and', 'logical_or', 'logical_not',
+                      'unary_negate', 'not1', 'binary_negate', 'not2',
+                      'bind1st', 'bind2nd',
+                      'pointer_to_unary_function',
+                      'pointer_to_binary_function',
+                      'ptr_fun',
+                      'mem_fun_t', 'mem_fun', 'mem_fun1_t', 'mem_fun1_ref_t',
+                      'mem_fun_ref_t',
+                      'const_mem_fun_t', 'const_mem_fun1_t',
+                      'const_mem_fun_ref_t', 'const_mem_fun1_ref_t',
+                      'mem_fun_ref',
+                     )),
+    ('<limits>', ('numeric_limits',)),
+    ('<list>', ('list',)),
+    ('<map>', ('map', 'multimap',)),
+    ('<memory>', ('allocator',)),
+    ('<queue>', ('queue', 'priority_queue',)),
+    ('<set>', ('set', 'multiset',)),
+    ('<stack>', ('stack',)),
+    ('<string>', ('char_traits', 'basic_string',)),
+    ('<utility>', ('pair',)),
+    ('<vector>', ('vector',)),
+
+    # gcc extensions.
+    # Note: std::hash is their hash, ::hash is our hash
+    ('<hash_map>', ('hash_map', 'hash_multimap',)),
+    ('<hash_set>', ('hash_set', 'hash_multiset',)),
+    ('<slist>', ('slist',)),
+    )
+
+_RE_PATTERN_STRING = re.compile(r'\bstring\b')
+
+_re_pattern_algorithm_header = []
+for _template in ('copy', 'max', 'min', 'min_element', 'sort', 'swap',
+                  'transform'):
+  # Match max<type>(..., ...), max(..., ...), but not foo->max, foo.max or
+  # type::max().
+  _re_pattern_algorithm_header.append(
+      (re.compile(r'[^>.]\b' + _template + r'(<.*?>)?\([^\)]'),
+       _template,
+       '<algorithm>'))
+
+_re_pattern_templates = []
+for _header, _templates in _HEADERS_CONTAINING_TEMPLATES:
+  for _template in _templates:
+    _re_pattern_templates.append(
+        (re.compile(r'(\<|\b)' + _template + r'\s*\<'),
+         _template + '<>',
+         _header))
+
+
+def FilesBelongToSameModule(filename_cc, filename_h):
+  """Check if these two filenames belong to the same module.
+
+  The concept of a 'module' here is as follows:
+  foo.h, foo-inl.h, foo.cc, foo_test.cc and foo_unittest.cc belong to the
+  same 'module' if they are in the same directory.
+  some/path/public/xyzzy and some/path/internal/xyzzy are also considered
+  to belong to the same module here.
+
+  If the filename_cc contains a longer path than the filename_h, for example,
+  '/absolute/path/to/base/sysinfo.cc', and this file would include
+  'base/sysinfo.h', this function also produces the prefix needed to open the
+  header. This is used by the caller of this function to more robustly open the
+  header file. We don't have access to the real include paths in this context,
+  so we need this guesswork here.
+
+  Known bugs: tools/base/bar.cc and base/bar.h belong to the same module
+  according to this implementation. Because of this, this function gives
+  some false positives. This should be sufficiently rare in practice.
+
+  Args:
+    filename_cc: is the path for the .cc file
+    filename_h: is the path for the header file
+
+  Returns:
+    Tuple with a bool and a string:
+    bool: True if filename_cc and filename_h belong to the same module.
+    string: the additional prefix needed to open the header file.
+  """
+
+  if not filename_cc.endswith('.cc'):
+    return (False, '')
+  filename_cc = filename_cc[:-len('.cc')]
+  if filename_cc.endswith('_unittest'):
+    filename_cc = filename_cc[:-len('_unittest')]
+  elif filename_cc.endswith('_test'):
+    filename_cc = filename_cc[:-len('_test')]
+  filename_cc = filename_cc.replace('/public/', '/')
+  filename_cc = filename_cc.replace('/internal/', '/')
+
+  if not filename_h.endswith('.h'):
+    return (False, '')
+  filename_h = filename_h[:-len('.h')]
+  if filename_h.endswith('-inl'):
+    filename_h = filename_h[:-len('-inl')]
+  filename_h = filename_h.replace('/public/', '/')
+  filename_h = filename_h.replace('/internal/', '/')
+
+  files_belong_to_same_module = filename_cc.endswith(filename_h)
+  common_path = ''
+  if files_belong_to_same_module:
+    common_path = filename_cc[:-len(filename_h)]
+  return files_belong_to_same_module, common_path
+
+
+def UpdateIncludeState(filename, include_state, io=codecs):
+  """Fill up the include_state with new includes found from the file.
+
+  Args:
+    filename: the name of the header to read.
+    include_state: an _IncludeState instance in which the headers are inserted.
+    io: The io factory to use to read the file. Provided for testability.
+
+  Returns:
+    True if a header was successfully added. False otherwise.
+  """
+  headerfile = None
+  try:
+    headerfile = io.open(filename, 'r', 'utf8', 'replace')
+  except IOError:
+    return False
+  linenum = 0
+  for line in headerfile:
+    linenum += 1
+    clean_line = CleanseComments(line)
+    match = _RE_PATTERN_INCLUDE.search(clean_line)
+    if match:
+      include = match.group(2)
+      # The value formatting is cute, but not really used right now.
+      # What matters here is that the key is in include_state.
+      include_state.setdefault(include, '%s:%d' % (filename, linenum))
+  return True
+
+
+def CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error,
+                              io=codecs):
+  """Reports for missing stl includes.
+
+  This function will output warnings to make sure you are including the headers
+  necessary for the STL containers and functions that you use. We only give
+  one reason to include a header. For example, if you use both equal_to<> and
+  less<> in a .h file, only one of these (the one appearing later in the
+  file) will be reported as a reason to include <functional>.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    include_state: An _IncludeState instance.
+    error: The function to call with any errors found.
+    io: The IO factory to use to read the header file. Provided for unittest
+        injection.
+  """
+  required = {}  # A map of header name to linenumber and the template entity.
+                 # Example of required: { '<functional>': (1219, 'less<>') }
+
+  for linenum in xrange(clean_lines.NumLines()):
+    line = clean_lines.elided[linenum]
+    if not line or line[0] == '#':
+      continue
+
+    # String is special -- it is a non-templatized type in STL.
+    matched = _RE_PATTERN_STRING.search(line)
+    if matched:
+      # Don't warn about strings in non-STL namespaces:
+      # (We check only the first match per line; good enough.)
+      prefix = line[:matched.start()]
+      if prefix.endswith('std::') or not prefix.endswith('::'):
+        required['<string>'] = (linenum, 'string')
+
+    for pattern, template, header in _re_pattern_algorithm_header:
+      if pattern.search(line):
+        required[header] = (linenum, template)
+
+    # The following function is just a speed up, no semantics are changed.
+    if '<' not in line:  # Reduces the cpu time usage by skipping lines.
+      continue
+
+    for pattern, template, header in _re_pattern_templates:
+      if pattern.search(line):
+        required[header] = (linenum, template)
+
+  # The policy is that if you #include something in foo.h you don't need to
+  # include it again in foo.cc. Here, we will look at possible includes.
+  # Let's copy the include_state so it is only messed up within this function.
+  include_state = include_state.copy()
+
+  # Did we find the header for this file (if any) and successfully load it?
+  header_found = False
+
+  # Use the absolute path so that matching works properly.
+  abs_filename = FileInfo(filename).FullName()
+
+  # For Emacs's flymake.
+  # If cpplint is invoked from Emacs's flymake, a temporary file is generated
+  # by flymake and that file name might end with '_flymake.cc'. In that case,
+  # restore original file name here so that the corresponding header file can be
+  # found.
+  # e.g. If the file name is 'foo_flymake.cc', we should search for 'foo.h'
+  # instead of 'foo_flymake.h'
+  abs_filename = re.sub(r'_flymake\.cc$', '.cc', abs_filename)
+
+  # include_state is modified during iteration, so we iterate over a copy of
+  # the keys.
+  header_keys = include_state.keys()
+  for header in header_keys:
+    (same_module, common_path) = FilesBelongToSameModule(abs_filename, header)
+    fullpath = common_path + header
+    if same_module and UpdateIncludeState(fullpath, include_state, io):
+      header_found = True
+
+  # If we can't find the header file for a .cc, assume it's because we don't
+  # know where to look. In that case we'll give up as we're not sure they
+  # didn't include it in the .h file.
+  # TODO(unknown): Do a better job of finding .h files so we are confident that
+  # not having the .h file means there isn't one.
+  if filename.endswith('.cc') and not header_found:
+    return
+
+  # All the lines have been processed, report the errors found.
+  for required_header_unstripped in required:
+    template = required[required_header_unstripped][1]
+    if required_header_unstripped.strip('<>"') not in include_state:
+      error(filename, required[required_header_unstripped][0],
+            'build/include_what_you_use', 4,
+            'Add #include ' + required_header_unstripped + ' for ' + template)
+
+
+_RE_PATTERN_EXPLICIT_MAKEPAIR = re.compile(r'\bmake_pair\s*<')
+
+
+def CheckMakePairUsesDeduction(filename, clean_lines, linenum, error):
+  """Check that make_pair's template arguments are deduced.
+
+  G++ 4.6 in C++0x mode fails badly if make_pair's template arguments are
+  specified explicitly, and such use isn't intended in any case.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  line = clean_lines.elided[linenum]
+  match = _RE_PATTERN_EXPLICIT_MAKEPAIR.search(line)
+  if match:
+    error(filename, linenum, 'build/explicit_make_pair',
+          4,  # 4 = high confidence
+          'For C++11-compatibility, omit template arguments from make_pair'
+          ' OR use pair directly OR if appropriate, construct a pair directly')
+
+
+def ProcessLine(filename, file_extension, clean_lines, line,
+                include_state, function_state, nesting_state, error,
+                extra_check_functions=[]):
+  """Processes a single line in the file.
+
+  Args:
+    filename: Filename of the file that is being processed.
+    file_extension: The extension (dot not included) of the file.
+    clean_lines: An array of strings, each representing a line of the file,
+                 with comments stripped.
+    line: Number of line being processed.
+    include_state: An _IncludeState instance in which the headers are inserted.
+    function_state: A _FunctionState instance which counts function lines, etc.
+    nesting_state: A _NestingState instance which maintains information about
+                   the current stack of nested blocks being parsed.
+    error: A callable to which errors are reported, which takes 4 arguments:
+           filename, line number, error level, and message
+    extra_check_functions: An array of additional check functions that will be
+                           run on each source line. Each function takes 4
+                           arguments: filename, clean_lines, line, error
+  """
+  raw_lines = clean_lines.raw_lines
+  ParseNolintSuppressions(filename, raw_lines[line], line, error)
+  nesting_state.Update(filename, clean_lines, line, error)
+  if nesting_state.stack and nesting_state.stack[-1].inline_asm != _NO_ASM:
+    return
+  CheckForFunctionLengths(filename, clean_lines, line, function_state, error)
+  CheckForMultilineCommentsAndStrings(filename, clean_lines, line, error)
+  CheckStyle(filename, clean_lines, line, file_extension, nesting_state, error)
+  CheckLanguage(filename, clean_lines, line, file_extension, include_state,
+                nesting_state, error)
+  CheckForNonConstReference(filename, clean_lines, line, nesting_state, error)
+  CheckForNonStandardConstructs(filename, clean_lines, line,
+                                nesting_state, error)
+  CheckVlogArguments(filename, clean_lines, line, error)
+  CheckPosixThreading(filename, clean_lines, line, error)
+  CheckInvalidIncrement(filename, clean_lines, line, error)
+  CheckMakePairUsesDeduction(filename, clean_lines, line, error)
+  for check_fn in extra_check_functions:
+    check_fn(filename, clean_lines, line, error)
+
+def ProcessFileData(filename, file_extension, lines, error,
+                    extra_check_functions=[]):
+  """Performs lint checks and reports any errors to the given error function.
+
+  Args:
+    filename: Filename of the file that is being processed.
+    file_extension: The extension (dot not included) of the file.
+    lines: An array of strings, each representing a line of the file, with the
+           last element being empty if the file is terminated with a newline.
+    error: A callable to which errors are reported, which takes 4 arguments:
+           filename, line number, error level, and message
+    extra_check_functions: An array of additional check functions that will be
+                           run on each source line. Each function takes 4
+                           arguments: filename, clean_lines, line, error
+  """
+  lines = (['// marker so line numbers and indices both start at 1'] + lines +
+           ['// marker so line numbers end in a known way'])
+
+  include_state = _IncludeState()
+  function_state = _FunctionState()
+  nesting_state = _NestingState()
+
+  ResetNolintSuppressions()
+
+  CheckForCopyright(filename, lines, error)
+
+  if file_extension == 'h':
+    CheckForHeaderGuard(filename, lines, error)
+
+  RemoveMultiLineComments(filename, lines, error)
+  clean_lines = CleansedLines(lines)
+  for line in xrange(clean_lines.NumLines()):
+    ProcessLine(filename, file_extension, clean_lines, line,
+                include_state, function_state, nesting_state, error,
+                extra_check_functions)
+  nesting_state.CheckCompletedBlocks(filename, error)
+
+  CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error)
+
+  # We check here rather than inside ProcessLine so that we see raw
+  # lines rather than "cleaned" lines.
+  CheckForBadCharacters(filename, lines, error)
+
+  CheckForNewlineAtEOF(filename, lines, error)
+
+def ProcessFile(filename, vlevel, extra_check_functions=[]):
+  """Does google-lint on a single file.
+
+  Args:
+    filename: The name of the file to parse.
+
+    vlevel: The level of errors to report.  Every error of confidence
+    >= verbose_level will be reported.  0 is a good default.
+
+    extra_check_functions: An array of additional check functions that will be
+                           run on each source line. Each function takes 4
+                           arguments: filename, clean_lines, line, error
+  """
+
+  _SetVerboseLevel(vlevel)
+
+  try:
+    # Support the UNIX convention of using "-" for stdin.  Note that
+    # we are not opening the file with universal newline support
+    # (which codecs doesn't support anyway), so the resulting lines do
+    # contain trailing '\r' characters if we are reading a file that
+    # has CRLF endings.
+    # If a trailing '\r' is present after the split, it is removed
+    # below.  If it is not expected to be present (i.e. os.linesep is
+    # not '\r\n' as it is on Windows), a warning is issued below when
+    # this file is processed.
+
+    if filename == '-':
+      lines = codecs.StreamReaderWriter(sys.stdin,
+                                        codecs.getreader('utf8'),
+                                        codecs.getwriter('utf8'),
+                                        'replace').read().split('\n')
+    else:
+      lines = codecs.open(filename, 'r', 'utf8', 'replace').read().split('\n')
+
+    carriage_return_found = False
+    # Remove trailing '\r'.
+    for linenum in range(len(lines)):
+      if lines[linenum].endswith('\r'):
+        lines[linenum] = lines[linenum].rstrip('\r')
+        carriage_return_found = True
+
+  except IOError:
+    sys.stderr.write(
+        "Skipping input '%s': Can't open for reading\n" % filename)
+    return
+
+  # Note, if no dot is found, this will give the entire filename as the ext.
+  file_extension = filename[filename.rfind('.') + 1:]
+
+  # When reading from stdin, the extension is unknown, so no cpplint tests
+  # should rely on the extension.
+  if filename != '-' and file_extension not in _valid_extensions:
+    sys.stderr.write('Ignoring %s; not a valid file name '
+                     '(%s)\n' % (filename, ', '.join(_valid_extensions)))
+  else:
+    ProcessFileData(filename, file_extension, lines, Error,
+                    extra_check_functions)
+    if carriage_return_found and os.linesep != '\r\n':
+      # Use 0 for linenum since we output only one error for potentially
+      # several lines.
+      Error(filename, 0, 'whitespace/newline', 1,
+            'One or more unexpected \\r (^M) found; '
+            'better to use only a \\n')
+
+  sys.stderr.write('Done processing %s\n' % filename)
+
+
+def PrintUsage(message):
+  """Prints a brief usage string and exits, optionally with an error message.
+
+  Args:
+    message: The optional error message.
+  """
+  sys.stderr.write(_USAGE)
+  if message:
+    sys.exit('\nFATAL ERROR: ' + message)
+  else:
+    sys.exit(1)
+
+
+def PrintCategories():
+  """Prints a list of all the error-categories used by error messages.
+
+  These are the categories used to filter messages via --filter.
+  """
+  sys.stderr.write(''.join('  %s\n' % cat for cat in _ERROR_CATEGORIES))
+  sys.exit(0)
+
+
+def ParseArguments(args):
+  """Parses the command line arguments.
+
+  This may set the output format and verbosity level as side-effects.
+
+  Args:
+    args: The command line arguments.
+
+  Returns:
+    The list of filenames to lint.
+  """
+  try:
+    (opts, filenames) = getopt.getopt(args, '', ['help', 'output=', 'verbose=',
+                                                 'counting=',
+                                                 'filter=',
+                                                 'root=',
+                                                 'linelength=',
+                                                 'extensions='])
+  except getopt.GetoptError:
+    PrintUsage('Invalid arguments.')
+
+  verbosity = _VerboseLevel()
+  output_format = _OutputFormat()
+  filters = ''
+  counting_style = ''
+
+  for (opt, val) in opts:
+    if opt == '--help':
+      PrintUsage(None)
+    elif opt == '--output':
+      if val not in ('emacs', 'vs7', 'eclipse'):
+        PrintUsage('The only allowed output formats are emacs, vs7 and eclipse.')
+      output_format = val
+    elif opt == '--verbose':
+      verbosity = int(val)
+    elif opt == '--filter':
+      filters = val
+      if not filters:
+        PrintCategories()
+    elif opt == '--counting':
+      if val not in ('total', 'toplevel', 'detailed'):
+        PrintUsage('Valid counting options are total, toplevel, and detailed')
+      counting_style = val
+    elif opt == '--root':
+      global _root
+      _root = val
+    elif opt == '--linelength':
+      global _line_length
+      try:
+        _line_length = int(val)
+      except ValueError:
+        PrintUsage('Line length must be numeric.')
+    elif opt == '--extensions':
+      global _valid_extensions
+      try:
+        _valid_extensions = set(val.split(','))
+      except ValueError:
+        PrintUsage('Extensions must be a comma-separated list.')
+
+  if not filenames:
+    PrintUsage('No files were specified.')
+
+  _SetOutputFormat(output_format)
+  _SetVerboseLevel(verbosity)
+  _SetFilters(filters)
+  _SetCountingStyle(counting_style)
+
+  return filenames
+
+
+def main():
+  filenames = ParseArguments(sys.argv[1:])
+
+  # Change stderr to write with replacement characters so we don't die
+  # if we try to print something containing non-ASCII characters.
+  sys.stderr = codecs.StreamReaderWriter(sys.stderr,
+                                         codecs.getreader('utf8'),
+                                         codecs.getwriter('utf8'),
+                                         'replace')
+
+  _cpplint_state.ResetErrorCounts()
+  for filename in filenames:
+    ProcessFile(filename, _cpplint_state.verbose_level)
+  _cpplint_state.PrintErrorCounts()
+
+  sys.exit(_cpplint_state.error_count > 0)
+
+
+if __name__ == '__main__':
+  main()
diff --git a/src/rocksdb/arcanist_util/lint_engine/FacebookFbcodeLintEngine.php b/src/rocksdb/arcanist_util/lint_engine/FacebookFbcodeLintEngine.php
new file mode 100644
index 0000000..7b12ccc
--- /dev/null
+++ b/src/rocksdb/arcanist_util/lint_engine/FacebookFbcodeLintEngine.php
@@ -0,0 +1,140 @@
+<?php
+// Copyright 2004-present Facebook.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+class FacebookFbcodeLintEngine extends ArcanistLintEngine {
+
+  public function buildLinters() {
+    $linters = array();
+    $paths = $this->getPaths();
+
+    // Remove all deleted files, which are not checked by the
+    // following linters.
+    foreach ($paths as $key => $path) {
+      if (!Filesystem::pathExists($this->getFilePathOnDisk($path))) {
+        unset($paths[$key]);
+      }
+    }
+
+    $generated_linter = new ArcanistGeneratedLinter();
+    $linters[] = $generated_linter;
+
+    $nolint_linter = new ArcanistNoLintLinter();
+    $linters[] = $nolint_linter;
+
+    $text_linter = new ArcanistTextLinter();
+    $text_linter->setCustomSeverityMap(array(
+      ArcanistTextLinter::LINT_LINE_WRAP
+        => ArcanistLintSeverity::SEVERITY_ADVICE,
+    ));
+    $linters[] = $text_linter;
+
+    $java_text_linter = new ArcanistTextLinter();
+    $java_text_linter->setMaxLineLength(100);
+    $java_text_linter->setCustomSeverityMap(array(
+      ArcanistTextLinter::LINT_LINE_WRAP
+        => ArcanistLintSeverity::SEVERITY_ADVICE,
+    ));
+    $linters[] = $java_text_linter;
+
+    $python_linter = new ArcanistPEP8Linter();
+    $linters[] = $python_linter;
+
+    if (!$this->getCommitHookMode()) {
+      $cpp_linters = array();
+      $cpp_linters[] = $linters[] = new ArcanistCpplintLinter();
+      $cpp_linters[] = $linters[] = new FbcodeCppLinter();
+
+      $clang_format_linter = new FbcodeClangFormatLinter();
+      $linters[] = $clang_format_linter;
+    }
+
+    $spelling_linter = new ArcanistSpellingLinter();
+    $linters[] = $spelling_linter;
+
+    foreach ($paths as $path) {
+      $is_text = false;
+
+      $text_extensions = (
+        '/\.('.
+        'cpp|cxx|c|cc|h|hpp|hxx|tcc|'.
+        'py|rb|hs|pl|pm|tw|'.
+        'php|phpt|css|js|'.
+        'java|'.
+        'thrift|'.
+        'lua|'.
+        'siv|'.
+        'txt'.
+        ')$/'
+      );
+      if (preg_match($text_extensions, $path)) {
+        $is_text = true;
+      }
+      if ($is_text) {
+        $nolint_linter->addPath($path);
+
+        $generated_linter->addPath($path);
+        $generated_linter->addData($path, $this->loadData($path));
+
+        if (preg_match('/\.java$/', $path)) {
+          $java_text_linter->addPath($path);
+          $java_text_linter->addData($path, $this->loadData($path));
+        } else {
+          $text_linter->addPath($path);
+          $text_linter->addData($path, $this->loadData($path));
+        }
+
+        $spelling_linter->addPath($path);
+        $spelling_linter->addData($path, $this->loadData($path));
+      }
+      if (preg_match('/\.(cpp|c|cc|cxx|h|hh|hpp|hxx|tcc)$/', $path)
+          && !preg_match('/third-party/', $path)) {
+        foreach ($cpp_linters as &$linter) {
+          $linter->addPath($path);
+          $linter->addData($path, $this->loadData($path));
+        }
+
+        $clang_format_linter->addPath($path);
+        $clang_format_linter->addData($path, $this->loadData($path));
+        $clang_format_linter->setPathChangedLines(
+          $path, $this->getPathChangedLines($path));
+      }
+
+      // Match *.py and contbuild config files
+      if (preg_match('/(\.(py|tw|smcprops)|^contbuild\/configs\/[^\/]*)$/',
+                    $path)) {
+        $space_count = 4;
+        $real_path = $this->getFilePathOnDisk($path);
+        $dir = dirname($real_path);
+        do {
+          if (file_exists($dir.'/.python2space')) {
+            $space_count = 2;
+            break;
+          }
+          $dir = dirname($dir);
+        } while ($dir != '/' && $dir != '.');
+
+        $cur_path_linter = $python_linter;
+        $cur_path_linter->addPath($path);
+        $cur_path_linter->addData($path, $this->loadData($path));
+
+        if (preg_match('/\.tw$/', $path)) {
+          $cur_path_linter->setCustomSeverityMap(array(
+            'E251' => ArcanistLintSeverity::SEVERITY_DISABLED,
+          ));
+        }
+      }
+    }
+
+    $name_linter = new ArcanistFilenameLinter();
+    $linters[] = $name_linter;
+    foreach ($paths as $path) {
+      $name_linter->addPath($path);
+    }
+
+    return $linters;
+  }
+
+}
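+
+// To enable this engine (an assumption: a standard Arcanist setup), the
+// repository's .arcconfig would reference it via the 'lint.engine' key,
+// e.g. { "lint.engine": "FacebookFbcodeLintEngine" }.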
diff --git a/src/rocksdb/arcanist_util/lint_engine/FacebookHowtoevenLintEngine.php b/src/rocksdb/arcanist_util/lint_engine/FacebookHowtoevenLintEngine.php
new file mode 100644
index 0000000..2e01481
--- /dev/null
+++ b/src/rocksdb/arcanist_util/lint_engine/FacebookHowtoevenLintEngine.php
@@ -0,0 +1,27 @@
+<?php
+// Copyright 2015-present Facebook. All Rights Reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+final class FacebookHowtoevenLintEngine extends ArcanistLintEngine {
+
+  public function buildLinters() {
+    $paths = array();
+
+    foreach ($this->getPaths() as $path) {
+      // Don't try to lint deleted files or changed directories.
+      if (!Filesystem::pathExists($path) || is_dir($path)) {
+        continue;
+      }
+
+      if (preg_match('/\.(cpp|c|cc|cxx|h|hh|hpp|hxx|tcc)$/', $path)) {
+        $paths[] = $path;
+      }
+    }
+
+    $howtoeven = new FacebookHowtoevenLinter();
+    $howtoeven->setPaths($paths);
+    return array($howtoeven);
+  }
+}
diff --git a/src/rocksdb/arcanist_util/unit_engine/FacebookFbcodeUnitTestEngine.php b/src/rocksdb/arcanist_util/unit_engine/FacebookFbcodeUnitTestEngine.php
new file mode 100644
index 0000000..f9a9e70
--- /dev/null
+++ b/src/rocksdb/arcanist_util/unit_engine/FacebookFbcodeUnitTestEngine.php
@@ -0,0 +1,21 @@
+<?php
+// Copyright 2004-present Facebook. All Rights Reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+class FacebookFbcodeUnitTestEngine extends ArcanistBaseUnitTestEngine {
+
+  public function run() {
+    // Here we create a new unit test "jenkins_async_test" and promise we'll
+    // update the results later.
+    // Jenkins updates the results using an `arc call-conduit
+    // differential.updateunitresults` call. If you change the name here,
+    // also make sure to change the name in the Jenkins script that updates
+    // the test result -- they have to be the same.
+    $result = new ArcanistUnitTestResult();
+    $result->setName("jenkins_async_test");
+    $result->setResult(ArcanistUnitTestResult::RESULT_POSTPONED);
+    return array($result);
+  }
+}
diff --git a/src/rocksdb/build_tools/amalgamate.py b/src/rocksdb/build_tools/amalgamate.py
new file mode 100755
index 0000000..548b1e8
--- /dev/null
+++ b/src/rocksdb/build_tools/amalgamate.py
@@ -0,0 +1,110 @@
+#!/usr/bin/python
+
+# amalgamate.py creates an amalgamation from a unity build.
+# It can be run with either Python 2 or 3.
+# An amalgamation consists of a header that includes the contents of all public
+# headers and a source file that includes the contents of all source files and
+# private headers.
+#
+# This script works by starting with the unity build file and recursively expanding
+# #include directives. If the #include is found in a public include directory,
+# that header is expanded into the amalgamation header.
+#
+# A particular header is only expanded once, so this script will
+# break if there are multiple inclusions of the same header that are expected to
+# expand differently. Similarly, this type of code causes issues:
+#
+# #ifdef FOO
+#   #include "bar.h"
+#   // code here
+# #else
+#   #include "bar.h"            // oops, doesn't get expanded
+#   // different code here
+# #endif
+#
+# The solution is to move the include out of the #ifdef.
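+#
+# Example invocation (hypothetical paths; flags as defined in main() below):
+#   amalgamate.py unity.cc -I . -i include -o amalgamation.cc -H amalgamation.h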
+
+from __future__ import print_function
+
+import argparse
+from os import path
+import re
+import sys
+
+include_re = re.compile('^[ \t]*#include[ \t]+"(.*)"[ \t]*$')
+included = set()
+excluded = set()
+
+def find_header(name, abs_path, include_paths):
+    samedir = path.join(path.dirname(abs_path), name)
+    if path.exists(samedir):
+        return samedir
+    for include_path in include_paths:
+        include_path = path.join(include_path, name)
+        if path.exists(include_path):
+            return include_path
+    return None
+
+def expand_include(include_path, f, abs_path, source_out, header_out, include_paths, public_include_paths):
+    if include_path in included:
+        return False
+
+    included.add(include_path)
+    with open(include_path) as f:
+        print('#line 1 "{}"'.format(include_path), file=source_out)
+        process_file(f, include_path, source_out, header_out, include_paths, public_include_paths)
+    return True
+
+def process_file(f, abs_path, source_out, header_out, include_paths, public_include_paths):
+    for (line, text) in enumerate(f):
+        m = include_re.match(text)
+        if m:
+            filename = m.groups()[0]
+            # first check private headers
+            include_path = find_header(filename, abs_path, include_paths)
+            if include_path:
+                if include_path in excluded:
+                    source_out.write(text)
+                    expanded = False
+                else:
+                    expanded = expand_include(include_path, f, abs_path, source_out, header_out, include_paths, public_include_paths)
+            else:
+                # now try public headers
+                include_path = find_header(filename, abs_path, public_include_paths)
+                if include_path:
+                    # found public header
+                    expanded = False
+                    if include_path in excluded:
+                        source_out.write(text)
+                    else:
+                        expand_include(include_path, f, abs_path, header_out, None, public_include_paths, [])
+                else:
+                    sys.exit("unable to find {}, included in {} on line {}".format(filename, abs_path, line + 1))
+
+            if expanded:
+                print('#line {} "{}"'.format(line+1, abs_path), file=source_out)
+        elif text != "#pragma once\n":
+            source_out.write(text)
+
+def main():
+    parser = argparse.ArgumentParser(description="Transform a unity build into an amalgamation")
+    parser.add_argument("source", help="source file")
+    parser.add_argument("-I", action="append", dest="include_paths", help="include paths for private headers")
+    parser.add_argument("-i", action="append", dest="public_include_paths", help="include paths for public headers")
+    parser.add_argument("-x", action="append", dest="excluded", help="excluded header files")
+    parser.add_argument("-o", dest="source_out", help="output C++ file", required=True)
+    parser.add_argument("-H", dest="header_out", help="output C++ header file", required=True)
+    args = parser.parse_args()
+
+    include_paths = list(map(path.abspath, args.include_paths or []))
+    public_include_paths = list(map(path.abspath, args.public_include_paths or []))
+    excluded.update(map(path.abspath, args.excluded or []))
+    filename = args.source
+    abs_path = path.abspath(filename)
+    with open(filename) as f, open(args.source_out, 'w') as source_out, open(args.header_out, 'w') as header_out:
+        print('#line 1 "{}"'.format(filename), file=source_out)
+        print('#include "{}"'.format(header_out.name), file=source_out)
+        process_file(f, abs_path, source_out, header_out, include_paths, public_include_paths)
+
+if __name__ == "__main__":
+    main()
diff --git a/src/rocksdb/build_tools/build_detect_platform b/src/rocksdb/build_tools/build_detect_platform
new file mode 100755
index 0000000..0e40ac5
--- /dev/null
+++ b/src/rocksdb/build_tools/build_detect_platform
@@ -0,0 +1,391 @@
+#!/bin/sh
+#
+# Detects OS we're compiling on and outputs a file specified by the first
+# argument, which in turn gets read while processing Makefile.
+#
+# The output will set the following variables:
+#   CC                          C Compiler path
+#   CXX                         C++ Compiler path
+#   PLATFORM_LDFLAGS            Linker flags
+#   JAVA_LDFLAGS                Linker flags for RocksDBJava
+#   PLATFORM_SHARED_EXT         Extension for shared libraries
+#   PLATFORM_SHARED_LDFLAGS     Flags for building shared library
+#   PLATFORM_SHARED_CFLAGS      Flags for compiling objects for shared library
+#   PLATFORM_CCFLAGS            C compiler flags
+#   PLATFORM_CXXFLAGS           C++ compiler flags
+#   PLATFORM_SHARED_VERSIONED   Set to 'true' if platform supports versioned
+#                               shared libraries, empty otherwise.
+#
+# The PLATFORM_CCFLAGS and PLATFORM_CXXFLAGS might include the following:
+#
+#       -DLEVELDB_PLATFORM_POSIX if cstdatomic is present
+#       -DLEVELDB_PLATFORM_NOATOMIC if it is not
+#       -DSNAPPY                    if the Snappy library is present
+#       -DLZ4                       if the LZ4 library is present
+#       -DZSTD                      if the ZSTD library is present
+#       -DNUMA                      if the NUMA library is present
+#
+# Using gflags in rocksdb:
+# Our project depends on gflags, which requires users to take some extra steps
+# before they can compile the whole repository:
+#   1. Install gflags. You may download it from here:
+#      https://code.google.com/p/gflags/
+#   2. Once installed, add the include path and lib path for gflags to CPATH
+#      and LIBRARY_PATH respectively. If installed in the default mode, the
+#      lib and include paths will be /usr/local/lib and /usr/local/include
+# Mac users can do this by installing Homebrew and running brew install gflags
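+#
+# Example (illustrative): ./build_tools/build_detect_platform build_config.mk
+# writes the make variables into build_config.mk for the Makefile to read.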
+
+OUTPUT=$1
+if test -z "$OUTPUT"; then
+  echo "usage: $0 <output-filename>" >&2
+  exit 1
+fi
+
+# we depend on C++11
+PLATFORM_CXXFLAGS="-std=c++11"
+# we currently depend on POSIX platform
+COMMON_FLAGS="-DROCKSDB_PLATFORM_POSIX"
+
+# Default to fbcode gcc on internal fb machines
+if [ -z "$ROCKSDB_NO_FBCODE" -a -d /mnt/gvfs/third-party ]; then
+    FBCODE_BUILD="true"
+    # If we're compiling with TSAN we need pic build
+    PIC_BUILD=$COMPILE_WITH_TSAN
+    if [ -z "$ROCKSDB_FBCODE_BUILD_WITH_481" ]; then
+      source "$PWD/build_tools/fbcode_config.sh"
+    else
+      # we need this to build with MySQL. Don't use for other purposes.
+      source "$PWD/build_tools/fbcode_config4.8.1.sh"
+    fi
+fi
+
+# Delete existing output, if it exists
+rm -f "$OUTPUT"
+touch "$OUTPUT"
+
+if test -z "$CC"; then
+   CC=cc
+fi
+
+if test -z "$CXX"; then
+    CXX=g++
+fi
+
+# Detect OS
+if test -z "$TARGET_OS"; then
+    TARGET_OS=`uname -s`
+fi
+
+if test -z "$TARGET_ARCHITECTURE"; then
+    TARGET_ARCHITECTURE=`uname -m`
+fi
+
+if test -z "$CLANG_SCAN_BUILD"; then
+    CLANG_SCAN_BUILD=scan-build
+fi
+
+if test -z "$CLANG_ANALYZER"; then
+    CLANG_ANALYZER=$(which clang++ 2> /dev/null)
+fi
+
+COMMON_FLAGS="$COMMON_FLAGS ${CFLAGS}"
+CROSS_COMPILE=
+PLATFORM_CCFLAGS=
+PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS"
+PLATFORM_SHARED_EXT="so"
+PLATFORM_SHARED_LDFLAGS="-Wl,--no-as-needed -shared -Wl,-soname -Wl,"
+PLATFORM_SHARED_CFLAGS="-fPIC"
+PLATFORM_SHARED_VERSIONED=true
+
+# generic port files (working on all platforms via #ifdef) go directly in /port
+GENERIC_PORT_FILES=`cd "$ROCKSDB_ROOT"; find port -name '*.cc' | tr "\n" " "`
+
+# On GCC, we pick libc's memcmp over GCC's memcmp via -fno-builtin-memcmp
+case "$TARGET_OS" in
+    Darwin)
+        PLATFORM=OS_MACOSX
+        COMMON_FLAGS="$COMMON_FLAGS -DOS_MACOSX"
+        PLATFORM_SHARED_EXT=dylib
+        PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name "
+        # PORT_FILES=port/darwin/darwin_specific.cc
+        ;;
+    IOS)
+        PLATFORM=IOS
+        COMMON_FLAGS="$COMMON_FLAGS -DOS_MACOSX -DIOS_CROSS_COMPILE -DROCKSDB_LITE"
+        PLATFORM_SHARED_EXT=dylib
+        PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name "
+        CROSS_COMPILE=true
+        PLATFORM_SHARED_VERSIONED=
+        ;;
+    Linux)
+        PLATFORM=OS_LINUX
+        COMMON_FLAGS="$COMMON_FLAGS -DOS_LINUX"
+        if [ -z "$USE_CLANG" ]; then
+            COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp"
+        fi
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lrt"
+        # PORT_FILES=port/linux/linux_specific.cc
+        ;;
+    SunOS)
+        PLATFORM=OS_SOLARIS
+        COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_SOLARIS"
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lrt"
+        # PORT_FILES=port/sunos/sunos_specific.cc
+        ;;
+    FreeBSD)
+        PLATFORM=OS_FREEBSD
+        COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_FREEBSD"
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread"
+        # PORT_FILES=port/freebsd/freebsd_specific.cc
+        ;;
+    NetBSD)
+        PLATFORM=OS_NETBSD
+        COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_NETBSD"
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lgcc_s"
+        # PORT_FILES=port/netbsd/netbsd_specific.cc
+        ;;
+    OpenBSD)
+        PLATFORM=OS_OPENBSD
+        COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_OPENBSD"
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -pthread"
+        # PORT_FILES=port/openbsd/openbsd_specific.cc
+        ;;
+    DragonFly)
+        PLATFORM=OS_DRAGONFLYBSD
+        COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_DRAGONFLYBSD"
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread"
+        # PORT_FILES=port/dragonfly/dragonfly_specific.cc
+        ;;
+    Cygwin)
+        PLATFORM=CYGWIN
+        PLATFORM_SHARED_CFLAGS=""
+        PLATFORM_CXXFLAGS="-std=gnu++11"
+        COMMON_FLAGS="$COMMON_FLAGS -DCYGWIN"
+        if [ -z "$USE_CLANG" ]; then
+            COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp"
+        fi
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lrt"
+        # PORT_FILES=port/linux/linux_specific.cc
+        ;;
+    OS_ANDROID_CROSSCOMPILE)
+        PLATFORM=OS_ANDROID
+        COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_ANDROID -DLEVELDB_PLATFORM_POSIX"
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS "  # All pthread features are in the Android C library
+        # PORT_FILES=port/android/android.cc
+        CROSS_COMPILE=true
+        ;;
+    *)
+        echo "Unknown platform!" >&2
+        exit 1
+esac
+
+PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS ${CXXFLAGS}"
+JAVA_LDFLAGS="$PLATFORM_LDFLAGS"
+
+if [ "$CROSS_COMPILE" = "true" -o "$FBCODE_BUILD" = "true" ]; then
+    # Cross-compiling; do not try any compilation tests.
+    # Also don't need any compilation tests if compiling on fbcode
+    true
+else
+    # Test whether fallocate is available
+    $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null  <<EOF
+      #include <fcntl.h>
+      #include <linux/falloc.h>
+      int main() {
+        int fd = open("/dev/null", 0);
+        fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, 0, 1024);
+      }
+EOF
+    if [ "$?" = 0 ]; then
+        COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_FALLOCATE_PRESENT"
+    fi
+
+    # Test whether Snappy library is installed
+    # http://code.google.com/p/snappy/
+    $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null  <<EOF
+      #include <snappy.h>
+      int main() {}
+EOF
+    if [ "$?" = 0 ]; then
+        COMMON_FLAGS="$COMMON_FLAGS -DSNAPPY"
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lsnappy"
+        JAVA_LDFLAGS="$JAVA_LDFLAGS -lsnappy"
+    fi
+
+    # Test whether gflags library is installed
+    # http://gflags.github.io/gflags/
+    # check if the namespace is gflags
+    $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null << EOF
+      #include <gflags/gflags.h>
+      using namespace gflags;
+      int main() {}
+EOF
+    if [ "$?" = 0 ]; then
+        COMMON_FLAGS="$COMMON_FLAGS -DGFLAGS=gflags"
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lgflags"
+    else
+      # check if namespace is google
+      $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null << EOF
+        #include <gflags/gflags.h>
+        using namespace google;
+        int main() {}
+EOF
+      if [ "$?" = 0 ]; then
+          COMMON_FLAGS="$COMMON_FLAGS -DGFLAGS=google"
+          PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lgflags"
+      fi
+    fi
+
+    # Test whether zlib library is installed
+    $CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null  <<EOF
+      #include <zlib.h>
+      int main() {}
+EOF
+    if [ "$?" = 0 ]; then
+        COMMON_FLAGS="$COMMON_FLAGS -DZLIB"
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lz"
+        JAVA_LDFLAGS="$JAVA_LDFLAGS -lz"
+    fi
+
+    # Test whether bzip library is installed
+    $CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null  <<EOF
+      #include <bzlib.h>
+      int main() {}
+EOF
+    if [ "$?" = 0 ]; then
+        COMMON_FLAGS="$COMMON_FLAGS -DBZIP2"
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lbz2"
+        JAVA_LDFLAGS="$JAVA_LDFLAGS -lbz2"
+    fi
+
+    # Test whether lz4 library is installed
+    $CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null  <<EOF
+      #include <lz4.h>
+      #include <lz4hc.h>
+      int main() {}
+EOF
+    if [ "$?" = 0 ]; then
+        COMMON_FLAGS="$COMMON_FLAGS -DLZ4"
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -llz4"
+        JAVA_LDFLAGS="$JAVA_LDFLAGS -llz4"
+    fi
+
+    # Test whether zstd library is installed
+    $CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null  <<EOF
+      #include <zstd.h>
+      int main() {}
+EOF
+    if [ "$?" = 0 ]; then
+        COMMON_FLAGS="$COMMON_FLAGS -DZSTD"
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lzstd"
+        JAVA_LDFLAGS="$JAVA_LDFLAGS -lzstd"
+    fi
+
+    # Test whether numa is available
+    $CXX $CFLAGS -x c++ - -o /dev/null -lnuma 2>/dev/null  <<EOF
+      #include <numa.h>
+      #include <numaif.h>
+      int main() {}
+EOF
+    if [ "$?" = 0 ]; then
+        COMMON_FLAGS="$COMMON_FLAGS -DNUMA"
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lnuma"
+        JAVA_LDFLAGS="$JAVA_LDFLAGS -lnuma"
+    fi
+
+    # Test whether jemalloc is available
+    if echo 'int main() {}' | $CXX $CFLAGS -x c++ - -o /dev/null -ljemalloc \
+      2>/dev/null; then 
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -ljemalloc"
+        JAVA_LDFLAGS="$JAVA_LDFLAGS -ljemalloc"
+    else
+        # jemalloc is not available. Let's try tcmalloc
+        if echo 'int main() {}' | $CXX $CFLAGS -x c++ - -o /dev/null \
+          -ltcmalloc 2>/dev/null; then 
+            PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -ltcmalloc"
+            JAVA_LDFLAGS="$JAVA_LDFLAGS -ltcmalloc"
+        fi
+    fi
+
+    # Test whether malloc_usable_size is available
+    $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null  <<EOF
+      #include <malloc.h>
+      int main() {
+        size_t res = malloc_usable_size(0);
+        return 0;
+      }
+EOF
+    if [ "$?" = 0 ]; then
+        COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_MALLOC_USABLE_SIZE"
+    fi
+fi
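+
+# Each probe above follows the same pattern: compile a tiny heredoc program
+# and, on success ($? = 0), typically append the matching -D define and -l
+# library.  A hypothetical new probe for a library <foo.h>/libfoo would be:
+#   $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
+#     #include <foo.h>
+#     int main() {}
+#   EOF
+#   if [ "$?" = 0 ]; then COMMON_FLAGS="$COMMON_FLAGS -DFOO"; fi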
+
+# TODO(tec): Fix -Wshorten-64-to-32 errors on FreeBSD and enable the warning.
+# -Wshorten-64-to-32 breaks compilation on FreeBSD i386
+if ! [ "$TARGET_OS" = FreeBSD -a "$TARGET_ARCHITECTURE" = i386 ]; then
+  # Test whether -Wshorten-64-to-32 is available
+  $CXX $CFLAGS -x c++ - -o /dev/null -Wshorten-64-to-32 2>/dev/null  <<EOF
+    int main() {}
+EOF
+  if [ "$?" = 0 ]; then
+    COMMON_FLAGS="$COMMON_FLAGS -Wshorten-64-to-32"
+  fi
+fi
+
+# shall we use HDFS?
+
+if test "$USE_HDFS"; then
+  if test -z "$JAVA_HOME"; then
+    echo "JAVA_HOME has to be set for HDFS usage."
+    exit 1
+  fi
+  HDFS_CCFLAGS="$HDFS_CCFLAGS -I$JAVA_HOME/include -I$JAVA_HOME/include/linux -DUSE_HDFS"
+  HDFS_LDFLAGS="$HDFS_LDFLAGS -lhdfs -L$JAVA_HOME/jre/lib/amd64"
+  HDFS_LDFLAGS="$HDFS_LDFLAGS -L$JAVA_HOME/jre/lib/amd64/server -L$GLIBC_RUNTIME_PATH/lib"
+  HDFS_LDFLAGS="$HDFS_LDFLAGS -ldl -lverify -ljava -ljvm"
+  COMMON_FLAGS="$COMMON_FLAGS $HDFS_CCFLAGS"
+  PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS $HDFS_LDFLAGS"
+  JAVA_LDFLAGS="$JAVA_LDFLAGS $HDFS_LDFLAGS"
+fi
+
+if [ "$TARGET_OS" = FreeBSD -a "$TARGET_ARCHITECTURE" = i386 ]; then
+  # Intel SSE instructions breaks compilation on FreeBSD i386
+  unset USE_SSE
+fi
+
+if test "$USE_SSE"; then
+  # if Intel SSE instruction set is supported, set USE_SSE=1
+  COMMON_FLAGS="$COMMON_FLAGS -msse -msse4.2 "
+elif test -z "$PORTABLE"; then
+  COMMON_FLAGS="$COMMON_FLAGS -march=native "
+fi
+
+PLATFORM_CCFLAGS="$PLATFORM_CCFLAGS $COMMON_FLAGS"
+PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS $COMMON_FLAGS"
+
+VALGRIND_VER="$VALGRIND_VER"
+
+ROCKSDB_MAJOR=`build_tools/version.sh major`
+ROCKSDB_MINOR=`build_tools/version.sh minor`
+ROCKSDB_PATCH=`build_tools/version.sh patch`
+
+echo "CC=$CC" >> "$OUTPUT"
+echo "CXX=$CXX" >> "$OUTPUT"
+echo "PLATFORM=$PLATFORM" >> "$OUTPUT"
+echo "PLATFORM_LDFLAGS=$PLATFORM_LDFLAGS" >> "$OUTPUT"
+echo "JAVA_LDFLAGS=$JAVA_LDFLAGS" >> "$OUTPUT"
+echo "VALGRIND_VER=$VALGRIND_VER" >> "$OUTPUT"
+echo "PLATFORM_CCFLAGS=$PLATFORM_CCFLAGS" >> "$OUTPUT"
+echo "PLATFORM_CXXFLAGS=$PLATFORM_CXXFLAGS" >> "$OUTPUT"
+echo "PLATFORM_SHARED_CFLAGS=$PLATFORM_SHARED_CFLAGS" >> "$OUTPUT"
+echo "PLATFORM_SHARED_EXT=$PLATFORM_SHARED_EXT" >> "$OUTPUT"
+echo "PLATFORM_SHARED_LDFLAGS=$PLATFORM_SHARED_LDFLAGS" >> "$OUTPUT"
+echo "PLATFORM_SHARED_VERSIONED=$PLATFORM_SHARED_VERSIONED" >> "$OUTPUT"
+echo "EXEC_LDFLAGS=$EXEC_LDFLAGS" >> "$OUTPUT"
+echo "JEMALLOC_INCLUDE=$JEMALLOC_INCLUDE" >> "$OUTPUT"
+echo "JEMALLOC_LIB=$JEMALLOC_LIB" >> "$OUTPUT"
+echo "ROCKSDB_MAJOR=$ROCKSDB_MAJOR" >> "$OUTPUT"
+echo "ROCKSDB_MINOR=$ROCKSDB_MINOR" >> "$OUTPUT"
+echo "ROCKSDB_PATCH=$ROCKSDB_PATCH" >> "$OUTPUT"
+echo "CLANG_SCAN_BUILD=$CLANG_SCAN_BUILD" >> "$OUTPUT"
+echo "CLANG_ANALYZER=$CLANG_ANALYZER" >> "$OUTPUT"
diff --git a/src/rocksdb/build_tools/dockerbuild.sh b/src/rocksdb/build_tools/dockerbuild.sh
new file mode 100755
index 0000000..2685380
--- /dev/null
+++ b/src/rocksdb/build_tools/dockerbuild.sh
@@ -0,0 +1,2 @@
+#!/bin/bash
+docker run -v "$PWD":/rocks -w /rocks buildpack-deps make
diff --git a/src/rocksdb/build_tools/fb_compile_mongo.sh b/src/rocksdb/build_tools/fb_compile_mongo.sh
new file mode 100755
index 0000000..c087f81
--- /dev/null
+++ b/src/rocksdb/build_tools/fb_compile_mongo.sh
@@ -0,0 +1,55 @@
+#!/bin/sh
+
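+# Example (hypothetical): run from the root of a MongoDB checkout, with
+# ROCKSDB_PATH pointing at a rocksdb tree:
+#   ROCKSDB_PATH=~/rocksdb ALLOC=jemalloc ./fb_compile_mongo.sh
+# Extra arguments are passed through to scons via $* at the end.
+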
+# fail early
+set -e
+
+if test -z $ROCKSDB_PATH; then
+  ROCKSDB_PATH=~/rocksdb
+fi
+source $ROCKSDB_PATH/build_tools/fbcode_config4.8.1.sh
+
+EXTRA_LDFLAGS=""
+
+if test -z $ALLOC; then
+  # default
+  ALLOC=tcmalloc
+elif [[ $ALLOC == "jemalloc" ]]; then
+  ALLOC=system
+  EXTRA_LDFLAGS+=" -Wl,--whole-archive $JEMALLOC_LIB -Wl,--no-whole-archive"
+fi
+
+# we need to force mongo to use static library, not shared
+STATIC_LIB_DEP_DIR='build/static_library_dependencies'
+test -d $STATIC_LIB_DEP_DIR || mkdir $STATIC_LIB_DEP_DIR
+test -h $STATIC_LIB_DEP_DIR/`basename $SNAPPY_LIBS` || ln -s $SNAPPY_LIBS $STATIC_LIB_DEP_DIR
+test -h $STATIC_LIB_DEP_DIR/`basename $LZ4_LIBS` || ln -s $LZ4_LIBS $STATIC_LIB_DEP_DIR
+
+EXTRA_LDFLAGS+=" -L $STATIC_LIB_DEP_DIR"
+
+set -x
+
+EXTRA_CMD=""
+if ! test -e version.json; then
+  # this is Mongo 3.0
+  EXTRA_CMD="--rocksdb \
+    --variant-dir=linux2/norm
+    --cxx=${CXX} \
+    --cc=${CC} \
+    --use-system-zlib"  # add this line back to normal code path
+                        # when https://jira.mongodb.org/browse/SERVER-19123 is resolved
+fi
+
+scons \
+  LINKFLAGS="$EXTRA_LDFLAGS $EXEC_LDFLAGS $PLATFORM_LDFLAGS" \
+  CCFLAGS="$CXXFLAGS -L $STATIC_LIB_DEP_DIR" \
+  LIBS="lz4 gcc stdc++" \
+  LIBPATH="$ROCKSDB_PATH" \
+  CPPPATH="$ROCKSDB_PATH/include" \
+  -j32 \
+  --allocator=$ALLOC \
+  --nostrip \
+  --opt=on \
+  --disable-minimum-compiler-version-enforcement \
+  --use-system-snappy \
+  --disable-warnings-as-errors \
+  $EXTRA_CMD $*
diff --git a/src/rocksdb/build_tools/fbcode_config.sh b/src/rocksdb/build_tools/fbcode_config.sh
new file mode 100644
index 0000000..572c0fe
--- /dev/null
+++ b/src/rocksdb/build_tools/fbcode_config.sh
@@ -0,0 +1,133 @@
+#!/bin/sh
+#
+# Set environment variables so that we can compile rocksdb using
+# fbcode settings.  It uses the latest g++ and clang compilers and also
+# uses jemalloc.
+# Environment variables that change the behavior of this script:
+# PIC_BUILD -- if true, only take the PIC versions of libraries from fbcode;
+#              libraries that don't have a PIC variant will not be included.
+
+CFLAGS=""
+
+# location of libgcc
+LIBGCC_BASE="/mnt/gvfs/third-party2/libgcc/0473c80518a10d6efcbe24c5eeca3fb4ec9b519c/4.9.x/gcc-4.9-glibc-2.20/e1a7e4e"
+LIBGCC_INCLUDE="$LIBGCC_BASE/include"
+LIBGCC_LIBS=" -L $LIBGCC_BASE/libs"
+
+# location of glibc
+GLIBC_REV=7397bed99280af5d9543439cdb7d018af7542720
+GLIBC_INCLUDE="/mnt/gvfs/third-party2/glibc/$GLIBC_REV/2.20/gcc-4.9-glibc-2.20/99df8fc/include"
+GLIBC_LIBS=" -L /mnt/gvfs/third-party2/glibc/$GLIBC_REV/2.20/gcc-4.9-glibc-2.20/99df8fc/lib"
+
+SNAPPY_INCLUDE=" -I /mnt/gvfs/third-party2/snappy/b0f269b3ca47770121aa159b99e1d8d2ab260e1f/1.0.3/gcc-4.9-glibc-2.20/c32916f/include/"
+
+if test -z $PIC_BUILD; then
+  SNAPPY_LIBS=" /mnt/gvfs/third-party2/snappy/b0f269b3ca47770121aa159b99e1d8d2ab260e1f/1.0.3/gcc-4.9-glibc-2.20/c32916f/lib/libsnappy.a"
+else
+  SNAPPY_LIBS=" /mnt/gvfs/third-party2/snappy/b0f269b3ca47770121aa159b99e1d8d2ab260e1f/1.0.3/gcc-4.9-glibc-2.20/c32916f/lib/libsnappy_pic.a"
+fi
+
+CFLAGS+=" -DSNAPPY"
+
+if test -z $PIC_BUILD; then
+  # location of zlib headers and libraries
+  ZLIB_INCLUDE=" -I /mnt/gvfs/third-party2/zlib/feb983d9667f4cf5e9da07ce75abc824764b67a1/1.2.8/gcc-4.9-glibc-2.20/4230243/include/"
+  ZLIB_LIBS=" /mnt/gvfs/third-party2/zlib/feb983d9667f4cf5e9da07ce75abc824764b67a1/1.2.8/gcc-4.9-glibc-2.20/4230243/lib/libz.a"
+  CFLAGS+=" -DZLIB"
+
+  # location of bzip headers and libraries
+  BZIP_INCLUDE=" -I /mnt/gvfs/third-party2/bzip2/af004cceebb2dfd173ca29933ea5915e727aad2f/1.0.6/gcc-4.9-glibc-2.20/4230243/include/"
+  BZIP_LIBS=" /mnt/gvfs/third-party2/bzip2/af004cceebb2dfd173ca29933ea5915e727aad2f/1.0.6/gcc-4.9-glibc-2.20/4230243/lib/libbz2.a"
+  CFLAGS+=" -DBZIP2"
+
+  LZ4_INCLUDE=" -I /mnt/gvfs/third-party2/lz4/79d2943e2dd7208a3e0b06cf95e9f85f05fe9e1b/r124/gcc-4.9-glibc-2.20/4230243/include/"
+  LZ4_LIBS=" /mnt/gvfs/third-party2/lz4/79d2943e2dd7208a3e0b06cf95e9f85f05fe9e1b/r124/gcc-4.9-glibc-2.20/4230243/lib/liblz4.a"
+  CFLAGS+=" -DLZ4"
+
+  ZSTD_REV=8df2d01673ae6afcc8c8d16fec862b2d67ecc1e9
+  ZSTD_INCLUDE=" -I /mnt/gvfs/third-party2/zstd/$ZSTD_REV/0.1.1/gcc-4.8.1-glibc-2.17/c3f970a/include"
+  ZSTD_LIBS=" /mnt/gvfs/third-party2/zstd/$ZSTD_REV/0.1.1/gcc-4.8.1-glibc-2.17/c3f970a/lib/libzstd.a"
+  CFLAGS+=" -DZSTD"
+fi
+
+# location of gflags headers and libraries
+GFLAGS_INCLUDE=" -I /mnt/gvfs/third-party2/gflags/0fa60e2b88de3e469db6c482d6e6dac72f5d65f9/1.6/gcc-4.9-glibc-2.20/4230243/include/"
+if test -z $PIC_BUILD; then
+  GFLAGS_LIBS=" /mnt/gvfs/third-party2/gflags/0fa60e2b88de3e469db6c482d6e6dac72f5d65f9/1.6/gcc-4.9-glibc-2.20/4230243/lib/libgflags.a"
+else
+  GFLAGS_LIBS=" /mnt/gvfs/third-party2/gflags/0fa60e2b88de3e469db6c482d6e6dac72f5d65f9/1.6/gcc-4.9-glibc-2.20/4230243/lib/libgflags_pic.a"
+fi
+CFLAGS+=" -DGFLAGS=google"
+
+# location of jemalloc
+JEMALLOC_INCLUDE=" -I /mnt/gvfs/third-party2/jemalloc/bcd68e5e419efa4e61b9486d6854564d6d75a0b5/3.6.0/gcc-4.9-glibc-2.20/2aafc78/include/"
+JEMALLOC_LIB=" /mnt/gvfs/third-party2/jemalloc/bcd68e5e419efa4e61b9486d6854564d6d75a0b5/3.6.0/gcc-4.9-glibc-2.20/2aafc78/lib/libjemalloc.a"
+
+if test -z $PIC_BUILD; then
+  # location of numa
+  NUMA_INCLUDE=" -I /mnt/gvfs/third-party2/numa/bbefc39ecbf31d0ca184168eb613ef8d397790ee/2.0.8/gcc-4.9-glibc-2.20/4230243/include/"
+  NUMA_LIB=" /mnt/gvfs/third-party2/numa/bbefc39ecbf31d0ca184168eb613ef8d397790ee/2.0.8/gcc-4.9-glibc-2.20/4230243/lib/libnuma.a"
+  CFLAGS+=" -DNUMA"
+
+  # location of libunwind
+  LIBUNWIND="/mnt/gvfs/third-party2/libunwind/1de3b75e0afedfe5585b231bbb340ec7a1542335/1.1/gcc-4.9-glibc-2.20/34235e8/lib/libunwind.a"
+fi
+
+# use Intel SSE support for checksum calculations
+export USE_SSE=1
+
+BINUTILS="/mnt/gvfs/third-party2/binutils/0b6ad0c88ddd903333a48ae8bff134efac468e4a/2.25/centos6-native/da39a3e/bin"
+AR="$BINUTILS/ar"
+
+DEPS_INCLUDE="$SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $LZ4_INCLUDE $ZSTD_INCLUDE $GFLAGS_INCLUDE $NUMA_INCLUDE"
+
+GCC_BASE="/mnt/gvfs/third-party2/gcc/1c67a0b88f64d4d9ced0382d141c76aaa7d62fba/4.9.x/centos6-native/1317bc4"
+STDLIBS="-L $GCC_BASE/lib64"
+
+CLANG_BASE="/mnt/gvfs/third-party2/clang/d81444dd214df3d2466734de45bb264a0486acc3/dev"
+CLANG_BIN="$CLANG_BASE/centos6-native/af4b1a0/bin"
+CLANG_ANALYZER="$CLANG_BIN/clang++"
+CLANG_SCAN_BUILD="$CLANG_BASE/src/clang/tools/scan-build/scan-build"
+
+if [ -z "$USE_CLANG" ]; then
+  # gcc
+  CC="$GCC_BASE/bin/gcc"
+  CXX="$GCC_BASE/bin/g++"
+  
+  CFLAGS+=" -B$BINUTILS/gold"
+  CFLAGS+=" -isystem $GLIBC_INCLUDE"
+  CFLAGS+=" -isystem $LIBGCC_INCLUDE"
+else
+  # clang 
+  CLANG_INCLUDE="$CLANG_BASE/gcc-4.9-glibc-2.20/74c386f/lib/clang/dev/include/"
+  CC="$CLANG_BIN/clang"
+  CXX="$CLANG_BIN/clang++"
+
+  KERNEL_HEADERS_INCLUDE="/mnt/gvfs/third-party2/kernel-headers/ffd14f660a43c4b92717986b1bba66722ef089d0/3.2.18_70_fbk11_00129_gc8882d0/gcc-4.9-glibc-2.20/da39a3e/include"
+
+  CFLAGS+=" -B$BINUTILS/gold -nostdinc -nostdlib"
+  CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/4.9.x "
+  CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/4.9.x/x86_64-facebook-linux "
+  CFLAGS+=" -isystem $GLIBC_INCLUDE"
+  CFLAGS+=" -isystem $LIBGCC_INCLUDE"
+  CFLAGS+=" -isystem $CLANG_INCLUDE"
+  CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE/linux "
+  CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE "
+  CXXFLAGS="-nostdinc++"
+fi
+
+CFLAGS+=" $DEPS_INCLUDE"
+CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE"
+CXXFLAGS+=" $CFLAGS"
+
+EXEC_LDFLAGS=" $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $ZSTD_LIBS $GFLAGS_LIBS $NUMA_LIB"
+EXEC_LDFLAGS+=" -Wl,--dynamic-linker,/usr/local/fbcode/gcc-4.9-glibc-2.20/lib/ld.so"
+EXEC_LDFLAGS+=" $LIBUNWIND"
+EXEC_LDFLAGS+=" -Wl,-rpath=/usr/local/fbcode/gcc-4.9-glibc-2.20/lib"
+
+PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS $STDLIBS -lgcc -lstdc++"
+
+EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $ZSTD_LIBS $GFLAGS_LIBS"
+
+VALGRIND_VER="/mnt/gvfs/third-party2/valgrind/6c45ef049cbf11c2df593addb712cd891049e737/3.10.0/gcc-4.9-glibc-2.20/4230243/bin/"
+
+export CC CXX AR CFLAGS CXXFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE CLANG_ANALYZER CLANG_SCAN_BUILD
diff --git a/src/rocksdb/build_tools/fbcode_config4.8.1.sh b/src/rocksdb/build_tools/fbcode_config4.8.1.sh
new file mode 100644
index 0000000..524a5ed
--- /dev/null
+++ b/src/rocksdb/build_tools/fbcode_config4.8.1.sh
@@ -0,0 +1,110 @@
+#!/bin/sh
+#
+# Set environment variables so that we can compile rocksdb using
+# fbcode settings.  This variant uses the gcc/g++ 4.8.1 toolchain and
+# also uses jemalloc.
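+#
+# A typical invocation (a sketch; the exact make target is up to the caller)
+# is to source this file before building, e.g.:
+#
+#   source build_tools/fbcode_config4.8.1.sh && make static_lib
+#
+# In CI this variant is selected by setting ROCKSDB_FBCODE_BUILD_WITH_481=1
+# (see rocksdb-lego-determinator below).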
+
+# location of libgcc
+LIBGCC_BASE="/mnt/gvfs/third-party2/libgcc/7712e757d7355cb51292454ee0b7b46a467fdfed/4.8.1/gcc-4.8.1-glibc-2.17/8aac7fc"
+LIBGCC_INCLUDE="$LIBGCC_BASE/include"
+LIBGCC_LIBS=" -L $LIBGCC_BASE/libs"
+
+# location of glibc
+GLIBC_REV=6e40560b4e0b6d690fd1cf8c7a43ad7452b04cfa
+GLIBC_INCLUDE="/mnt/gvfs/third-party2/glibc/$GLIBC_REV/2.17/gcc-4.8.1-glibc-2.17/99df8fc/include"
+GLIBC_LIBS=" -L /mnt/gvfs/third-party2/glibc/$GLIBC_REV/2.17/gcc-4.8.1-glibc-2.17/99df8fc/lib"
+
+# location of snappy headers and libraries
+SNAPPY_INCLUDE=" -I /mnt/gvfs/third-party2/snappy/aef17f6c0b44b4fe408bd06f67c93701ab0a6ceb/1.0.3/gcc-4.8.1-glibc-2.17/43d84e2/include"
+SNAPPY_LIBS=" /mnt/gvfs/third-party2/snappy/aef17f6c0b44b4fe408bd06f67c93701ab0a6ceb/1.0.3/gcc-4.8.1-glibc-2.17/43d84e2/lib/libsnappy.a"
+
+# location of zlib headers and libraries
+ZLIB_INCLUDE=" -I /mnt/gvfs/third-party2/zlib/25c6216928b4d77b59ddeca0990ff6fe9ac16b81/1.2.5/gcc-4.8.1-glibc-2.17/c3f970a/include"
+ZLIB_LIBS=" /mnt/gvfs/third-party2/zlib/25c6216928b4d77b59ddeca0990ff6fe9ac16b81/1.2.5/gcc-4.8.1-glibc-2.17/c3f970a/lib/libz.a"
+
+# location of bzip headers and libraries
+BZIP_INCLUDE=" -I /mnt/gvfs/third-party2/bzip2/c9ef7629c2aa0024f7a416e87602f06eb88f5eac/1.0.6/gcc-4.8.1-glibc-2.17/c3f970a/include/"
+BZIP_LIBS=" /mnt/gvfs/third-party2/bzip2/c9ef7629c2aa0024f7a416e87602f06eb88f5eac/1.0.6/gcc-4.8.1-glibc-2.17/c3f970a/lib/libbz2.a"
+
+LZ4_REV=065ec7e38fe83329031f6668c43bef83eff5808b
+LZ4_INCLUDE=" -I /mnt/gvfs/third-party2/lz4/$LZ4_REV/r108/gcc-4.8.1-glibc-2.17/c3f970a/include"
+LZ4_LIBS=" /mnt/gvfs/third-party2/lz4/$LZ4_REV/r108/gcc-4.8.1-glibc-2.17/c3f970a/lib/liblz4.a"
+
+ZSTD_REV=8df2d01673ae6afcc8c8d16fec862b2d67ecc1e9
+ZSTD_INCLUDE=" -I /mnt/gvfs/third-party2/zstd/$ZSTD_REV/0.1.1/gcc-4.8.1-glibc-2.17/c3f970a/include"
+ZSTD_LIBS=" /mnt/gvfs/third-party2/zstd/$ZSTD_REV/0.1.1/gcc-4.8.1-glibc-2.17/c3f970a/lib/libzstd.a"
+
+# location of gflags headers and libraries
+GFLAGS_INCLUDE=" -I /mnt/gvfs/third-party2/gflags/1ad047a6e6f6673991918ecadc670868205a243a/1.6/gcc-4.8.1-glibc-2.17/c3f970a/include/"
+GFLAGS_LIBS=" /mnt/gvfs/third-party2/gflags/1ad047a6e6f6673991918ecadc670868205a243a/1.6/gcc-4.8.1-glibc-2.17/c3f970a/lib/libgflags.a"
+
+# location of jemalloc
+JEMALLOC_INCLUDE=" -I /mnt/gvfs/third-party2/jemalloc/3691c776ac26dd8781e84f8888b6a0fbdbc0a9ed/dev/gcc-4.8.1-glibc-2.17/4d53c6f/include"
+JEMALLOC_LIB="/mnt/gvfs/third-party2/jemalloc/3691c776ac26dd8781e84f8888b6a0fbdbc0a9ed/dev/gcc-4.8.1-glibc-2.17/4d53c6f/lib/libjemalloc.a"
+
+# location of numa
+NUMA_REV=829d10dac0230f99cd7e1778869d2adf3da24b65
+NUMA_INCLUDE=" -I /mnt/gvfs/third-party2/numa/$NUMA_REV/2.0.8/gcc-4.8.1-glibc-2.17/c3f970a/include/"
+NUMA_LIB=" /mnt/gvfs/third-party2/numa/$NUMA_REV/2.0.8/gcc-4.8.1-glibc-2.17/c3f970a/lib/libnuma.a"
+
+# location of libunwind
+LIBUNWIND_REV=2c060e64064559905d46fd194000d61592087bdc
+LIBUNWIND="/mnt/gvfs/third-party2/libunwind/$LIBUNWIND_REV/1.1/gcc-4.8.1-glibc-2.17/675d945/lib/libunwind.a"
+
+# use Intel SSE support for checksum calculations
+export USE_SSE=1
+
+BINUTILS="/mnt/gvfs/third-party2/binutils/2aff2e7b474cd3e6ab23495ad1224b7d214b9f8e/2.21.1/centos6-native/da39a3e/bin"
+AR="$BINUTILS/ar"
+
+DEPS_INCLUDE="$SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $LZ4_INCLUDE $ZSTD_INCLUDE $GFLAGS_INCLUDE $NUMA_INCLUDE"
+
+GCC_BASE="/mnt/gvfs/third-party2/gcc/1ec615e23800f0815d474478ba476a0adc3fe788/4.8.1/centos6-native/cc6c9dc"
+STDLIBS="-L $GCC_BASE/lib64"
+
+if [ -z "$USE_CLANG" ]; then
+  # gcc
+  CC="$GCC_BASE/bin/gcc"
+  CXX="$GCC_BASE/bin/g++"
+  
+  CFLAGS="-B$BINUTILS/gold -m64 -mtune=generic"
+  CFLAGS+=" -isystem $GLIBC_INCLUDE"
+  CFLAGS+=" -isystem $LIBGCC_INCLUDE"
+else
+  # clang 
+  CLANG_BASE="/mnt/gvfs/third-party2/clang/9ab68376f938992c4eb5946ca68f90c3185cffc8/3.4"
+  CLANG_INCLUDE="$CLANG_BASE/gcc-4.8.1-glibc-2.17/fb0f730/lib/clang/3.4/include"
+  CC="$CLANG_BASE/centos6-native/9cefd8a/bin/clang"
+  CXX="$CLANG_BASE/centos6-native/9cefd8a/bin/clang++"
+
+  KERNEL_HEADERS_INCLUDE="/mnt/gvfs/third-party2/kernel-headers/a683ed7135276731065a9d76d3016c9731f4e2f9/3.2.18_70_fbk11_00129_gc8882d0/gcc-4.8.1-glibc-2.17/da39a3e/include/"
+
+  CFLAGS="-B$BINUTILS/gold -nostdinc -nostdlib"
+  CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/4.8.1 "
+  CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/4.8.1/x86_64-facebook-linux "
+  CFLAGS+=" -isystem $GLIBC_INCLUDE"
+  CFLAGS+=" -isystem $LIBGCC_INCLUDE"
+  CFLAGS+=" -isystem $CLANG_INCLUDE"
+  CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE/linux "
+  CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE "
+  CXXFLAGS="-nostdinc++"
+fi
+
+CFLAGS+=" $DEPS_INCLUDE"
+CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE"
+CFLAGS+=" -DSNAPPY -DGFLAGS=google -DZLIB -DBZIP2 -DLZ4 -DZSTD -DNUMA"
+CXXFLAGS+=" $CFLAGS"
+
+EXEC_LDFLAGS=" $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $ZSTD_LIBS $GFLAGS_LIBS $NUMA_LIB"
+EXEC_LDFLAGS+=" -Wl,--dynamic-linker,/usr/local/fbcode/gcc-4.8.1-glibc-2.17/lib/ld.so"
+EXEC_LDFLAGS+=" $LIBUNWIND"
+EXEC_LDFLAGS+=" -Wl,-rpath=/usr/local/fbcode/gcc-4.8.1-glibc-2.17/lib"
+
+PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS $STDLIBS -lgcc -lstdc++"
+
+EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $ZSTD_LIBS $GFLAGS_LIBS"
+
+VALGRIND_REV=b2a9f85e4b70cd03abc85a7f3027fbc4cef35bd0
+VALGRIND_VER="/mnt/gvfs/third-party2/valgrind/$VALGRIND_REV/3.8.1/gcc-4.8.1-glibc-2.17/c3f970a/bin/"
+
+export CC CXX AR CFLAGS CXXFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE
diff --git a/src/rocksdb/build_tools/format-diff.sh b/src/rocksdb/build_tools/format-diff.sh
new file mode 100755
index 0000000..5b2efdd
--- /dev/null
+++ b/src/rocksdb/build_tools/format-diff.sh
@@ -0,0 +1,113 @@
+#!/bin/bash
+# If the clang-format-diff.py command is not specified, we assume it is
+# available on the PATH.
+if [ -z "$CLANG_FORMAT_DIFF" ]
+then
+  CLANG_FORMAT_DIFF="clang-format-diff.py"
+fi
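+
+# For example, to point at a local copy (path is illustrative):
+#
+#   CLANG_FORMAT_DIFF=~/bin/clang-format-diff.py ./build_tools/format-diff.sh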
+
+# Check clang-format-diff.py
+if ! which $CLANG_FORMAT_DIFF &> /dev/null
+then
+  echo "You didn't have clang-format-diff.py available in your computer!"
+  echo "You can download it by running: "
+  echo "    curl http://goo.gl/iUW1u2"
+  exit 128
+fi
+
+# Check argparse, a library that clang-format-diff.py requires.
+python 2>/dev/null << EOF
+import argparse
+EOF
+
+if [ "$?" != 0 ]
+then
+  echo "To run clang-format-diff.py, we'll need the library "argparse" to be"
+  echo "installed. You can try either of the follow ways to install it:"
+  echo "  1. Manually download argparse: https://pypi.python.org/pypi/argparse"
+  echo "  2. easy_install argparse (if you have easy_install)"
+  echo "  3. pip install argparse (if you have pip)"
+  exit 129
+fi
+
+# TODO(kailiu) the following work is not complete, since we still need to
+# figure out how to add the files modified by the pre-commit hook to git's
+# commit index.
+#
+# Check whether this script has already been added as a pre-commit hook.
+# If the user's pre-commit hook is empty, suggest adding this script to it.
+# PRE_COMMIT_SCRIPT_PATH="`git rev-parse --show-toplevel`/.git/hooks/pre-commit"
+# if ! ls $PRE_COMMIT_SCRIPT_PATH &> /dev/null
+# then
+#   echo "Would you like to add this script to pre-commit hook, which will do "
+#   echo -n "the format check for all the affected lines before you check in (y/n):"
+#   read add_to_hook
+#   if [ "$add_to_hook" == "y" ]
+#   then
+#     ln -s `git rev-parse --show-toplevel`/build_tools/format-diff.sh $PRE_COMMIT_SCRIPT_PATH
+#   fi
+# fi
+set -e
+
+uncommitted_code=`git diff HEAD`
+
+# If there are no uncommitted changes, we assume the user is doing a
+# post-commit format check, in which case we check the modified lines from the
+# latest commit. Otherwise, we check the format of the uncommitted code only.
+if [ -z "$uncommitted_code" ]
+then
+  # Check the format of last commit
+  diffs=$(git diff -U0 HEAD^ | $CLANG_FORMAT_DIFF -p 1)
+else
+  # Check the format of the uncommitted lines.
+  diffs=$(git diff -U0 HEAD | $CLANG_FORMAT_DIFF -p 1)
+fi
+
+if [ -z "$diffs" ]
+then
+  echo "Nothing needs to be reformatted!"
+  exit 0
+fi
+
+# Highlight the insertion/deletion from the clang-format-diff.py's output
+COLOR_END="\033[0m"
+COLOR_RED="\033[0;31m" 
+COLOR_GREEN="\033[0;32m" 
+
+echo -e "Detect lines that doesn't follow the format rules:\r"
+# Add the color to the diff. lines added will be green; lines removed will be red.
+echo "$diffs" | 
+  sed -e "s/\(^-.*$\)/`echo -e \"$COLOR_RED\1$COLOR_END\"`/" |
+  sed -e "s/\(^+.*$\)/`echo -e \"$COLOR_GREEN\1$COLOR_END\"`/"
+
+if [[ "$OPT" == *"-DTRAVIS"* ]]
+then
+  exit 1
+fi
+
+echo -e "Would you like to fix the format automatically (y/n): \c"
+
+# Make sure we can read user input, no matter how the script is invoked.
+exec < /dev/tty
+read to_fix
+
+if [ "$to_fix" != "y" ]
+then
+  exit 1
+fi
+
+# Do in-place format adjustment.
+git diff -U0 HEAD^ | $CLANG_FORMAT_DIFF -i -p 1
+echo "Files reformatted!"
+
+# Amend the last commit if the user is doing a post-commit format check
+if [ -z "$uncommitted_code" ]; then
+  echo -e "Would you like to amend the changes to last commit (`git log HEAD --oneline | head -1`)? (y/n): \c"
+  read to_amend
+
+  if [ "$to_amend" == "y" ]
+  then
+    git commit -a --amend --reuse-message HEAD
+    echo "Amended to last commit"
+  fi
+fi
diff --git a/src/rocksdb/build_tools/make_new_version.sh b/src/rocksdb/build_tools/make_new_version.sh
new file mode 100755
index 0000000..409944f
--- /dev/null
+++ b/src/rocksdb/build_tools/make_new_version.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+#  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+#  This source code is licensed under the BSD-style license found in the
+#  LICENSE file in the root directory of this source tree. An additional grant
+#  of patent rights can be found in the PATENTS file in the same directory.
+
+set -e
+if [ -z "$GIT" ]
+then
+  GIT="git"
+fi
+
+# Print out colored progress info so that users can
+# distinguish it at a glance.
+function title() {
+  echo -e "\033[1;32m$*\033[0m"
+}
+
+usage="Create new RocksDB version and prepare it for the release process\n"
+usage+="USAGE: ./make_new_version.sh <version>"
+
+# -- Pre-check
+if [[ $# -lt 1 ]]; then
+  echo -e "$usage"
+  exit 1
+fi
+
+ROCKSDB_VERSION=$1
+
+GIT_BRANCH=`git rev-parse --abbrev-ref HEAD`
+echo $GIT_BRANCH
+
+if [ "$GIT_BRANCH" != "master" ]; then
+  echo "Error: current branch is '$GIT_BRANCH'; please switch to the master branch."
+  exit 1
+fi
+
+title "Adding new tag for this release ..."
+BRANCH="$ROCKSDB_VERSION.fb"
+$GIT checkout -b $BRANCH
+
+title "Pushing new branch to remote repo ..."
+git push origin --set-upstream $BRANCH
+
+title "Branch $BRANCH is pushed to github;"
diff --git a/src/rocksdb/build_tools/make_package.sh b/src/rocksdb/build_tools/make_package.sh
new file mode 100755
index 0000000..2ca2802
--- /dev/null
+++ b/src/rocksdb/build_tools/make_package.sh
@@ -0,0 +1,116 @@
+#!/usr/bin/env bash
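+#
+# Builds librocksdb and wraps it into a .deb (Ubuntu) or .rpm (CentOS) via
+# fpm. Example invocation (version number illustrative):
+#
+#   ./build_tools/make_package.sh 3.2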
+
+set -e
+
+function log() {
+  echo "[+] $1"
+}
+
+function fatal() {
+  echo "[!] $1"
+  exit 1
+}
+
+function platform() {
+  local  __resultvar=$1
+  if [[ -f "/etc/yum.conf" ]]; then
+    eval $__resultvar="centos"
+  elif [[ -f "/etc/dpkg/dpkg.cfg" ]]; then
+    eval $__resultvar="ubuntu"
+  else
+    fatal "Unknwon operating system"
+  fi
+}
+platform OS
+
+function package() {
+  if [[ $OS = "ubuntu" ]]; then
+    if dpkg --get-selections | grep --quiet $1; then
+      log "$1 is already installed. skipping."
+    else
+      apt-get install $@ -y
+    fi
+  elif [[ $OS = "centos" ]]; then
+    if rpm -qa | grep --quiet $1; then
+      log "$1 is already installed. skipping."
+    else
+      yum install $@ -y
+    fi
+  fi
+}
+
+function detect_fpm_output() {
+  if [[ $OS = "ubuntu" ]]; then
+    export FPM_OUTPUT=deb
+  elif [[ $OS = "centos" ]]; then
+    export FPM_OUTPUT=rpm
+  fi
+}
+detect_fpm_output
+
+function gem_install() {
+  if gem list | grep --quiet $1; then
+    log "$1 is already installed. skipping."
+  else
+    gem install $@
+  fi
+}
+
+function main() {
+  if [[ $# -ne 1 ]]; then
+    fatal "Usage: $0 <rocksdb_version>"
+  else
+    log "using rocksdb version: $1"
+  fi
+
+  if [[ -d /vagrant ]]; then
+    if [[ $OS = "ubuntu" ]]; then
+      package g++-4.7
+      export CXX=g++-4.7
+
+      # the deb would depend on libgflags2, but the static lib is the only thing
+      # installed by make install
+      package libgflags-dev
+
+      package ruby-all-dev
+    elif [[ $OS = "centos" ]]; then
+      pushd /etc/yum.repos.d
+      if [[ ! -f /etc/yum.repos.d/devtools-1.1.repo ]]; then
+        wget http://people.centos.org/tru/devtools-1.1/devtools-1.1.repo
+      fi
+      package devtoolset-1.1-gcc --enablerepo=testing-1.1-devtools-6
+      package devtoolset-1.1-gcc-c++ --enablerepo=testing-1.1-devtools-6
+      export CC=/opt/centos/devtoolset-1.1/root/usr/bin/gcc
+      export CPP=/opt/centos/devtoolset-1.1/root/usr/bin/cpp
+      export CXX=/opt/centos/devtoolset-1.1/root/usr/bin/c++
+      export PATH=$PATH:/opt/centos/devtoolset-1.1/root/usr/bin
+      popd
+      if ! rpm -qa | grep --quiet gflags; then
+        rpm -i https://github.com/schuhschuh/gflags/releases/download/v2.1.0/gflags-devel-2.1.0-1.amd64.rpm
+      fi
+
+      package ruby
+      package ruby-devel
+      package rubygems
+      package rpm-build
+    fi
+  fi
+  gem_install fpm
+
+  make static_lib
+  make install INSTALL_PATH=package
+  fpm \
+    -s dir \
+    -t $FPM_OUTPUT \
+    -n rocksdb \
+    -v $1 \
+    --prefix /usr \
+    --url http://rocksdb.org/ \
+    -m rocksdb at fb.com \
+    --license BSD \
+    --vendor Facebook \
+    --description "RocksDB is an embeddable persistent key-value store for fast storage." \
+    package
+}
+
+main "$@"
diff --git a/src/rocksdb/build_tools/regression_build_test.sh b/src/rocksdb/build_tools/regression_build_test.sh
new file mode 100755
index 0000000..ee2d334
--- /dev/null
+++ b/src/rocksdb/build_tools/regression_build_test.sh
@@ -0,0 +1,428 @@
+#!/bin/bash
+
+set -e
+
+NUM=10000000
+
+if [ $# -eq 1 ];then
+  DATA_DIR=$1
+elif [ $# -eq 2 ];then
+  DATA_DIR=$1
+  STAT_FILE=$2
+fi
+
+# On the production build servers, set data and stat
+# files/directories not in /tmp or else the tempdir cleaning
+# scripts will make you very unhappy.
+DATA_DIR=${DATA_DIR:-$(mktemp -t -d rocksdb_XXXX)}
+STAT_FILE=${STAT_FILE:-$(mktemp -t -u rocksdb_test_stats_XXXX)}
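+
+# Example invocation (paths are illustrative):
+#   ./build_tools/regression_build_test.sh /data/rocksdb_bench /data/rocksdb_stats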
+
+function cleanup {
+  rm -rf $DATA_DIR
+  rm -f $STAT_FILE.fillseq
+  rm -f $STAT_FILE.readrandom
+  rm -f $STAT_FILE.overwrite
+  rm -f $STAT_FILE.memtablefillreadrandom
+}
+
+trap cleanup EXIT
+
+if [ -z "$GIT_BRANCH" ]; then
+  git_br=`git rev-parse --abbrev-ref HEAD`
+else
+  git_br=$(basename $GIT_BRANCH)
+fi
+
+if [ "$git_br" == "master" ]; then
+  git_br=""
+else
+  git_br="."$git_br
+fi
+
+make release
+
+# measure fillseq + fill up the DB for overwrite benchmark
+./db_bench \
+    --benchmarks=fillseq \
+    --db=$DATA_DIR \
+    --use_existing_db=0 \
+    --bloom_bits=10 \
+    --num=$NUM \
+    --writes=$NUM \
+    --cache_size=6442450944 \
+    --cache_numshardbits=6 \
+    --table_cache_numshardbits=4 \
+    --open_files=55000 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_data_sync=1 \
+    --disable_wal=1 \
+    --sync=0  > ${STAT_FILE}.fillseq
+
+# measure overwrite performance
+./db_bench \
+    --benchmarks=overwrite \
+    --db=$DATA_DIR \
+    --use_existing_db=1 \
+    --bloom_bits=10 \
+    --num=$NUM \
+    --writes=$((NUM / 10)) \
+    --cache_size=6442450944 \
+    --cache_numshardbits=6  \
+    --table_cache_numshardbits=4 \
+    --open_files=55000 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_data_sync=1 \
+    --disable_wal=1 \
+    --sync=0 \
+    --threads=8 > ${STAT_FILE}.overwrite
+
+# fill up the db for readrandom benchmark (1GB total size)
+./db_bench \
+    --benchmarks=fillseq \
+    --db=$DATA_DIR \
+    --use_existing_db=0 \
+    --bloom_bits=10 \
+    --num=$NUM \
+    --writes=$NUM \
+    --cache_size=6442450944 \
+    --cache_numshardbits=6 \
+    --table_cache_numshardbits=4 \
+    --open_files=55000 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_data_sync=1 \
+    --disable_wal=1 \
+    --sync=0 \
+    --threads=1 > /dev/null
+
+# measure readrandom with 6GB block cache
+./db_bench \
+    --benchmarks=readrandom \
+    --db=$DATA_DIR \
+    --use_existing_db=1 \
+    --bloom_bits=10 \
+    --num=$NUM \
+    --reads=$((NUM / 5)) \
+    --cache_size=6442450944 \
+    --cache_numshardbits=6 \
+    --table_cache_numshardbits=4 \
+    --open_files=55000 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_data_sync=1 \
+    --disable_wal=1 \
+    --sync=0 \
+    --threads=16 > ${STAT_FILE}.readrandom
+
+# measure readrandom with 6GB block cache and tailing iterator
+./db_bench \
+    --benchmarks=readrandom \
+    --db=$DATA_DIR \
+    --use_existing_db=1 \
+    --bloom_bits=10 \
+    --num=$NUM \
+    --reads=$((NUM / 5)) \
+    --cache_size=6442450944 \
+    --cache_numshardbits=6 \
+    --table_cache_numshardbits=4 \
+    --open_files=55000 \
+    --use_tailing_iterator=1 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_data_sync=1 \
+    --disable_wal=1 \
+    --sync=0 \
+    --threads=16 > ${STAT_FILE}.readrandomtailing
+
+# measure readrandom with 100MB block cache
+./db_bench \
+    --benchmarks=readrandom \
+    --db=$DATA_DIR \
+    --use_existing_db=1 \
+    --bloom_bits=10 \
+    --num=$NUM \
+    --reads=$((NUM / 5)) \
+    --cache_size=104857600 \
+    --cache_numshardbits=6 \
+    --table_cache_numshardbits=4 \
+    --open_files=55000 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_data_sync=1 \
+    --disable_wal=1 \
+    --sync=0 \
+    --threads=16 > ${STAT_FILE}.readrandomsmallblockcache
+
+# measure readrandom with 8k data in memtable
+./db_bench \
+    --benchmarks=overwrite,readrandom \
+    --db=$DATA_DIR \
+    --use_existing_db=1 \
+    --bloom_bits=10 \
+    --num=$NUM \
+    --reads=$((NUM / 5)) \
+    --writes=512 \
+    --cache_size=6442450944 \
+    --cache_numshardbits=6 \
+    --table_cache_numshardbits=4 \
+    --write_buffer_size=1000000000 \
+    --open_files=55000 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_data_sync=1 \
+    --disable_wal=1 \
+    --sync=0 \
+    --threads=16 > ${STAT_FILE}.readrandom_mem_sst
+
+
+# fill up the db for readrandom benchmark with filluniquerandom (1GB total size)
+./db_bench \
+    --benchmarks=filluniquerandom \
+    --db=$DATA_DIR \
+    --use_existing_db=0 \
+    --bloom_bits=10 \
+    --num=$((NUM / 4)) \
+    --writes=$((NUM / 4)) \
+    --cache_size=6442450944 \
+    --cache_numshardbits=6 \
+    --table_cache_numshardbits=4 \
+    --open_files=55000 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_data_sync=1 \
+    --disable_wal=1 \
+    --sync=0 \
+    --threads=1 > /dev/null
+
+# dummy test just to compact the data
+./db_bench \
+    --benchmarks=readrandom \
+    --db=$DATA_DIR \
+    --use_existing_db=1 \
+    --bloom_bits=10 \
+    --num=$((NUM / 1000)) \
+    --reads=$((NUM / 1000)) \
+    --cache_size=6442450944 \
+    --cache_numshardbits=6 \
+    --table_cache_numshardbits=4 \
+    --open_files=55000 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_data_sync=1 \
+    --disable_wal=1 \
+    --sync=0 \
+    --threads=16 > /dev/null
+
+# measure readrandom after load with filluniquerandom with 6GB block cache
+./db_bench \
+    --benchmarks=readrandom \
+    --db=$DATA_DIR \
+    --use_existing_db=1 \
+    --bloom_bits=10 \
+    --num=$((NUM / 4)) \
+    --reads=$((NUM / 4)) \
+    --cache_size=6442450944 \
+    --cache_numshardbits=6 \
+    --table_cache_numshardbits=4 \
+    --open_files=55000 \
+    --disable_auto_compactions=1 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_data_sync=1 \
+    --disable_wal=1 \
+    --sync=0 \
+    --threads=16 > ${STAT_FILE}.readrandom_filluniquerandom
+
+# measure readwhilewriting after load with filluniquerandom with 6GB block cache
+./db_bench \
+    --benchmarks=readwhilewriting \
+    --db=$DATA_DIR \
+    --use_existing_db=1 \
+    --bloom_bits=10 \
+    --num=$((NUM / 4)) \
+    --reads=$((NUM / 4)) \
+    --writes_per_second=1000 \
+    --write_buffer_size=100000000 \
+    --cache_size=6442450944 \
+    --cache_numshardbits=6 \
+    --table_cache_numshardbits=4 \
+    --open_files=55000 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_data_sync=1 \
+    --disable_wal=1 \
+    --sync=0 \
+    --threads=16 > ${STAT_FILE}.readwhilewriting
+
+# measure memtable performance -- none of the data gets flushed to disk
+./db_bench \
+    --benchmarks=fillrandom,readrandom \
+    --db=$DATA_DIR \
+    --use_existing_db=0 \
+    --num=$((NUM / 10)) \
+    --reads=$NUM \
+    --cache_size=6442450944 \
+    --cache_numshardbits=6 \
+    --table_cache_numshardbits=4 \
+    --write_buffer_size=1000000000 \
+    --open_files=55000 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_data_sync=1 \
+    --disable_wal=1 \
+    --sync=0 \
+    --value_size=10 \
+    --threads=16 > ${STAT_FILE}.memtablefillreadrandom
+
+common_in_mem_args="--db=/dev/shm/rocksdb \
+    --num_levels=6 \
+    --key_size=20 \
+    --prefix_size=12 \
+    --keys_per_prefix=10 \
+    --value_size=100 \
+    --compression_type=none \
+    --compression_ratio=1 \
+    --hard_rate_limit=2 \
+    --write_buffer_size=134217728 \
+    --max_write_buffer_number=4 \
+    --level0_file_num_compaction_trigger=8 \
+    --level0_slowdown_writes_trigger=16 \
+    --level0_stop_writes_trigger=24 \
+    --target_file_size_base=134217728 \
+    --max_bytes_for_level_base=1073741824 \
+    --disable_wal=0 \
+    --wal_dir=/dev/shm/rocksdb \
+    --sync=0 \
+    --disable_data_sync=1 \
+    --verify_checksum=1 \
+    --delete_obsolete_files_period_micros=314572800 \
+    --max_grandparent_overlap_factor=10 \
+    --use_plain_table=1 \
+    --open_files=-1 \
+    --mmap_read=1 \
+    --mmap_write=0 \
+    --memtablerep=prefix_hash \
+    --bloom_bits=10 \
+    --bloom_locality=1 \
+    --perf_level=0"
+
+# prepare an in-memory DB with 50M keys; total DB size is ~6GB
+./db_bench \
+    $common_in_mem_args \
+    --statistics=0 \
+    --max_background_compactions=16 \
+    --max_background_flushes=16 \
+    --benchmarks=filluniquerandom \
+    --use_existing_db=0 \
+    --num=52428800 \
+    --threads=1 > /dev/null
+
+# Readwhilewriting
+./db_bench \
+    $common_in_mem_args \
+    --statistics=1 \
+    --max_background_compactions=4 \
+    --max_background_flushes=0 \
+    --benchmarks=readwhilewriting\
+    --use_existing_db=1 \
+    --duration=600 \
+    --threads=32 \
+    --writes_per_second=81920 > ${STAT_FILE}.readwhilewriting_in_ram
+
+# Seekrandomwhilewriting
+./db_bench \
+    $common_in_mem_args \
+    --statistics=1 \
+    --max_background_compactions=4 \
+    --max_background_flushes=0 \
+    --benchmarks=seekrandomwhilewriting \
+    --use_existing_db=1 \
+    --use_tailing_iterator=1 \
+    --duration=600 \
+    --threads=32 \
+    --writes_per_second=81920 > ${STAT_FILE}.seekwhilewriting_in_ram
+
+# measure fillseq with a large number of column families
+./db_bench \
+    --benchmarks=fillseq \
+    --num_column_families=500 \
+    --write_buffer_size=1048576 \
+    --db=$DATA_DIR \
+    --use_existing_db=0 \
+    --num=$NUM \
+    --writes=$NUM \
+    --open_files=55000 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_data_sync=1 \
+    --disable_wal=1 \
+    --sync=0  > ${STAT_FILE}.fillseq_lots_column_families
+
+# measure overwrite performance with a large number of column families
+./db_bench \
+    --benchmarks=overwrite \
+    --num_column_families=500 \
+    --write_buffer_size=1048576 \
+    --db=$DATA_DIR \
+    --use_existing_db=1 \
+    --num=$NUM \
+    --writes=$((NUM / 10)) \
+    --open_files=55000 \
+    --statistics=1 \
+    --histogram=1 \
+    --disable_data_sync=1 \
+    --disable_wal=1 \
+    --sync=0 \
+    --threads=8 > ${STAT_FILE}.overwrite_lots_column_families
+
+# send data to ods
+function send_to_ods {
+  key="$1"
+  value="$2"
+
+  if [ -z "$JENKINS_HOME" ]; then
+    # running on devbox, just print out the values
+    echo "$key" "$value"
+    return
+  fi
+
+  if [ -z "$value" ];then
+    echo >&2 "ERROR: Key $key doesn't have a value."
+    return
+  fi
+  curl -s "https://www.intern.facebook.com/intern/agent/ods_set.php?entity=rocksdb_build$git_br&key=$key&value=$value" \
+    --connect-timeout 60
+}
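+
+# e.g. "send_to_ods rocksdb.build.overwrite.qps 12345" (value illustrative)
+# posts a single data point to the ODS endpoint above.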
+
+function send_benchmark_to_ods {
+  bench="$1"
+  bench_key="$2"
+  file="$3"
+
+  QPS=$(grep $bench $file | awk '{print $5}')
+  P50_MICROS=$(grep $bench $file -A 6 | grep "Percentiles" | awk '{print $3}' )
+  P75_MICROS=$(grep $bench $file -A 6 | grep "Percentiles" | awk '{print $5}' )
+  P99_MICROS=$(grep $bench $file -A 6 | grep "Percentiles" | awk '{print $7}' )
+
+  send_to_ods rocksdb.build.$bench_key.qps $QPS
+  send_to_ods rocksdb.build.$bench_key.p50_micros $P50_MICROS
+  send_to_ods rocksdb.build.$bench_key.p75_micros $P75_MICROS
+  send_to_ods rocksdb.build.$bench_key.p99_micros $P99_MICROS
+}
+
+send_benchmark_to_ods overwrite overwrite $STAT_FILE.overwrite
+send_benchmark_to_ods fillseq fillseq $STAT_FILE.fillseq
+send_benchmark_to_ods readrandom readrandom $STAT_FILE.readrandom
+send_benchmark_to_ods readrandom readrandom_tailing $STAT_FILE.readrandomtailing
+send_benchmark_to_ods readrandom readrandom_smallblockcache $STAT_FILE.readrandomsmallblockcache
+send_benchmark_to_ods readrandom readrandom_memtable_sst $STAT_FILE.readrandom_mem_sst
+send_benchmark_to_ods readrandom readrandom_fillunique_random $STAT_FILE.readrandom_filluniquerandom
+send_benchmark_to_ods fillrandom memtablefillrandom $STAT_FILE.memtablefillreadrandom
+send_benchmark_to_ods readrandom memtablereadrandom $STAT_FILE.memtablefillreadrandom
+send_benchmark_to_ods readwhilewriting readwhilewriting $STAT_FILE.readwhilewriting
+send_benchmark_to_ods readwhilewriting readwhilewriting_in_ram ${STAT_FILE}.readwhilewriting_in_ram
+send_benchmark_to_ods seekrandomwhilewriting seekwhilewriting_in_ram ${STAT_FILE}.seekwhilewriting_in_ram
+send_benchmark_to_ods fillseq fillseq_lots_column_families ${STAT_FILE}.fillseq_lots_column_families
+send_benchmark_to_ods overwrite overwrite_lots_column_families ${STAT_FILE}.overwrite_lots_column_families
diff --git a/src/rocksdb/build_tools/rocksdb-lego-determinator b/src/rocksdb/build_tools/rocksdb-lego-determinator
new file mode 100755
index 0000000..392231c
--- /dev/null
+++ b/src/rocksdb/build_tools/rocksdb-lego-determinator
@@ -0,0 +1,587 @@
+#!/bin/bash
+# This script is executed by Sandcastle
+# to determine next steps to run
+
+# Usage:
+# EMAIL=<email> ONCALL=<email> TRIGGER=<trigger> SUBSCRIBER=<email> rocks_ci.py <test-name>
+#
+# Input         Value
+# -------------------------------------------------------------------------
+# EMAIL         Email address to report on trigger conditions
+# ONCALL        Email address to raise a task on failure
+# TRIGGER       Trigger conditions for email. Valid values are fail, warn, all
+# SUBSCRIBER    Email address to add as a subscriber for the task
+#
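+# Example (addresses are placeholders):
+#
+#   EMAIL=dev@example.com ONCALL=oncall@example.com TRIGGER=fail \
+#     build_tools/rocksdb-lego-determinator unit
+#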
+
+#
+# Report configuration
+#
+REPORT_EMAIL=
+if [ ! -z "$EMAIL" ]; then
+  if [ -z "$TRIGGER" ]; then
+    TRIGGER="fail"
+  fi
+
+  REPORT_EMAIL="
+  {
+      'type':'email',
+      'triggers': [ '$TRIGGER' ],
+      'emails':['$EMAIL']
+  },"
+fi
+
+CREATE_TASK=
+if [ ! -z "$ONCALL" ]; then
+  CREATE_TASK="
+  {
+      'type':'task',
+      'triggers':[ 'fail' ],
+      'priority':0,
+      'subscribers':[ '$SUBSCRIBER' ],
+      'tags':[ 'rocksdb', 'ci' ],
+  },"
+fi
+
+REPORT=
+if [[ ! -z $REPORT_EMAIL || ! -z $CREATE_TASK ]]; then
+  REPORT="'report': [
+    $REPORT_EMAIL
+    $CREATE_TASK
+  ]"
+fi
+
+#
+# Helper variables
+#
+CLEANUP_ENV="
+{
+    'name':'Cleanup environment',
+    'shell':'rm -rf /dev/shm/rocksdb && mkdir /dev/shm/rocksdb && make clean',
+    'user':'root'
+}"
+
+DEBUG="OPT=-g"
+SHM="TEST_TMPDIR=/dev/shm/rocksdb"
+GCC_481="ROCKSDB_FBCODE_BUILD_WITH_481=1"
+ASAN="COMPILE_WITH_ASAN=1"
+CLANG="USE_CLANG=1"
+LITE="OPT=-DROCKSDB_LITE"
+TSAN="COMPILE_WITH_TSAN=1"
+DISABLE_JEMALLOC="DISABLE_JEMALLOC=1"
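+
+# When spliced into a step's 'shell' command these variables expand in
+# place; e.g. '$SHM $DEBUG make J=1 check' runs as
+# 'TEST_TMPDIR=/dev/shm/rocksdb OPT=-g make J=1 check'.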
+
+#
+# A mechanism to disable tests temporarily
+#
+DISABLE_COMMANDS="[
+    {
+        'name':'Disable test',
+        'oncall':'$ONCALL',
+        'steps': [
+            {
+              'name':'Job disabled. Please contact test owner',
+              'shell':'exit 1',
+              'user':'root'
+            },
+        ],
+    }
+]"
+
+#
+# RocksDB unit test in parallel
+# Currently we always have noise in our parallel runs. This job helps
+# manage that noise
+#
+PARALLEL_UNIT_TEST_COMMANDS="[
+    {
+        'name':'Rocksdb Parallel Unit Test',
+        'oncall':'$ONCALL',
+        'steps': [
+            $CLEANUP_ENV,
+            {
+                'name':'Build and test RocksDB debug version',
+                'shell':'$DEBUG make -j$(nproc) all && $SHM make check > /dev/null 2>&1 || cat t/log-*',
+                'user':'root'
+            },
+            $CLEANUP_ENV,
+            {
+                'name':'Build and test RocksDB debug version under gcc-4.8.1',
+                'shell':'$GCC_481 $DEBUG make -j$(nproc) all && $SHM make check > /dev/null 2>&1 || cat t/log-*',
+                'user':'root'
+            },
+        ],
+        $REPORT
+    }
+]"
+
+#
+# RocksDB unit test
+#
+UNIT_TEST_COMMANDS="[
+    {
+        'name':'Rocksdb Unit Test',
+        'oncall':'$ONCALL',
+        'steps': [
+            $CLEANUP_ENV,
+            {
+                'name':'Build and test RocksDB debug version',
+                'shell':'$SHM $DEBUG make J=1 check',
+                'user':'root'
+            },
+        ],
+        $REPORT
+    }
+]"
+
+#
+# RocksDB unit test on gcc-4.8.1
+#
+UNIT_TEST_COMMANDS_481="[
+    {
+        'name':'Rocksdb Unit Test on GCC 4.8.1',
+        'oncall':'$ONCALL',
+        'steps': [
+            $CLEANUP_ENV,
+            {
+                'name':'Build and test RocksDB debug version',
+                'shell':'$SHM $GCC_481 $DEBUG make J=1 check',
+                'user':'root'
+            },
+        ],
+        $REPORT
+    }
+]"
+
+#
+# RocksDB unit test with CLANG
+#
+CLANG_UNIT_TEST_COMMANDS="[
+    {
+        'name':'Rocksdb Unit Test',
+        'oncall':'$ONCALL',
+        'steps': [
+            $CLEANUP_ENV,
+            {
+                'name':'Build and test RocksDB debug',
+                'shell':'$CLANG $SHM $DEBUG make J=1 check',
+                'user':'root'
+            },
+        ],
+        $REPORT
+    }
+]"
+
+#
+# RocksDB analyze
+#
+CLANG_ANALYZE_COMMANDS="[
+    {
+        'name':'Rocksdb analyze',
+        'oncall':'$ONCALL',
+        'steps': [
+            $CLEANUP_ENV,
+            {
+                'name':'RocksDB build and analyze',
+                'shell':'$CLANG $SHM $DEBUG make J=1 analyze',
+                'user':'root'
+            },
+        ],
+        $REPORT
+    }
+]"
+
+#
+# RocksDB code coverage
+#
+CODE_COV_COMMANDS="[
+    {
+        'name':'Rocksdb Unit Test Code Coverage',
+        'oncall':'$ONCALL',
+        'steps': [
+            $CLEANUP_ENV,
+            {
+                'name':'Build, test and collect code coverage info',
+                'shell':'$SHM $DEBUG make J=1 coverage',
+                'user':'root'
+            },
+        ],
+        $REPORT
+    }
+]"
+
+#
+# RocksDB unity
+#
+UNITY_COMMANDS="[
+    {
+        'name':'Rocksdb Unity',
+        'oncall':'$ONCALL',
+        'steps': [
+            $CLEANUP_ENV,
+            {
+                'name':'Build, test unity test',
+                'shell':'$SHM $DEBUG V=1 make J=1 unity_test',
+                'user':'root'
+            },
+        ],
+        $REPORT
+    }
+]"
+
+#
+# Build RocksDB lite
+#
+LITE_BUILD_COMMANDS="[
+    {
+        'name':'Rocksdb Lite build',
+        'oncall':'$ONCALL',
+        'steps': [
+            $CLEANUP_ENV,
+            {
+                'name':'Build RocksDB debug version',
+                'shell':'$LITE $DEBUG make J=1 static_lib',
+                'user':'root'
+            },
+        ],
+        $REPORT
+    }
+]"
+
+#
+# RocksDB stress/crash test
+#
+STRESS_CRASH_TEST_COMMANDS="[
+    {
+        'name':'Rocksdb Stress/Crash Test',
+        'oncall':'$ONCALL',
+        'timeout': 86400,
+        'steps': [
+            $CLEANUP_ENV,
+            {
+                'name':'Build and run RocksDB debug stress tests',
+                'shell':'$SHM $DEBUG make J=1 db_stress',
+                'user':'root'
+            },
+            {
+                'name':'Build and run RocksDB debug crash tests',
+                'timeout': 86400,
+                'shell':'$SHM $DEBUG make J=1 crash_test',
+                'user':'root'
+            }
+        ],
+        $REPORT
+    }
+]"
+
+#
+# RocksDB test under address sanitizer
+#
+ASAN_TEST_COMMANDS="[
+    {
+        'name':'Rocksdb Unit Test under ASAN',
+        'oncall':'$ONCALL',
+        'steps': [
+            $CLEANUP_ENV,
+            {
+                'name':'Test RocksDB debug under ASAN',
+                'shell':'set -o pipefail && $SHM $ASAN $DEBUG make J=1 asan_check |& /usr/facebook/ops/scripts/asan_symbolize.py -d',
+                'user':'root'
+            }
+        ],
+        $REPORT
+    }
+]"
+
+#
+# RocksDB crash testing under address sanitizer
+#
+ASAN_CRASH_TEST_COMMANDS="[
+    {
+        'name':'Rocksdb crash test under ASAN',
+        'oncall':'$ONCALL',
+        'timeout': 86400,
+        'steps': [
+            $CLEANUP_ENV,
+            {
+                'name':'Build and run RocksDB debug asan_crash_test',
+                'timeout': 86400,
+                'shell':'$SHM $DEBUG make J=1 asan_crash_test',
+                'user':'root'
+            },
+        ],
+        $REPORT
+    }
+]"
+
+#
+# RocksDB unit test under valgrind
+#
+VALGRIND_TEST_COMMANDS="[
+    {
+        'name':'Rocksdb Unit Test under valgrind',
+        'oncall':'$ONCALL',
+        'steps': [
+            $CLEANUP_ENV,
+            {
+                'name':'Run RocksDB debug unit tests',
+                'shell':'$DISABLE_JEMALLOC $SHM $DEBUG make valgrind_check',
+                'user':'root'
+            },
+        ],
+        $REPORT
+    }
+]"
+
+#
+# RocksDB test under TSAN
+#
+TSAN_UNIT_TEST_COMMANDS="[
+    {
+        'name':'Rocksdb Unit Test under TSAN',
+        'oncall':'$ONCALL',
+        'steps': [
+            $CLEANUP_ENV,
+            {
+                'name':'Run RocksDB debug unit test',
+                'shell':'set -o pipefail && $SHM $DEBUG $TSAN make J=1 check',
+                'user':'root'
+            },
+        ],
+        $REPORT
+    }
+]"
+
+#
+# RocksDB crash test under TSAN
+#
+TSAN_CRASH_TEST_COMMANDS="[
+    {
+        'name':'Rocksdb Crash Test under TSAN',
+        'oncall':'$ONCALL',
+        'timeout': 86400,
+        'steps': [
+            $CLEANUP_ENV,
+            {
+                'name':'Compile and run',
+                'timeout': 86400,
+                'shell':'set -o pipefail && $SHM $DEBUG $TSAN make J=1 crash_test',
+                'user':'root'
+            },
+        ],
+        $REPORT
+    }
+]"
+
+#
+# RocksDB format compatible
+#
+
+run_format_compatible()
+{
+  export TEST_TMPDIR=/dev/shm/rocksdb
+  rm -rf /dev/shm/rocksdb
+  mkdir /dev/shm/rocksdb
+
+  echo '
+  if [ -e "build_tools/build_detect_platform" ]
+  then
+    sed "s/tcmalloc/nothingnothingnothing/g" build_tools/build_detect_platform > $TEST_TMPDIR/temp_build_file
+    rm -rf build_tools/build_detect_platform
+    cp $TEST_TMPDIR/temp_build_file build_tools/build_detect_platform
+    chmod +x build_tools/build_detect_platform
+  fi
+
+  if [ -e "build_detect_platform" ]
+  then
+    sed "s/tcmalloc/nothingnothingnothing/g" build_detect_platform > $TEST_TMPDIR/temp_build_file
+    rm -rf build_detect_platform 
+    cp $TEST_TMPDIR/temp_build_file build_detect_platform
+    chmod +x build_detect_platform 
+  fi
+
+  make ldb -j32
+
+  if [ -e "build_detect_platform" ]
+  then
+    git checkout -- build_detect_platform
+  fi
+
+  if [ -e "build_tools/build_detect_platform" ]
+  then
+    git checkout -- build_tools/build_detect_platform
+  fi
+  ' > temp_build_ldb.sh
+
+  sed "s/make ldb -j32/source temp_build_ldb.sh/g" tools/check_format_compatible.sh > tools/temp_check_format_compatible.sh
+  chmod +x tools/temp_check_format_compatible.sh
+  tools/temp_check_format_compatible.sh
+}
+
+FORMAT_COMPATIBLE_COMMANDS="[
+    {
+        'name':'Rocksdb Format Compatible tests',
+        'oncall':'$ONCALL',
+        'steps': [
+            $CLEANUP_ENV,
+            {
+                'name':'Run RocksDB debug unit test',
+                'shell':'build_tools/rocksdb-lego-determinator run_format_compatible',
+                'user':'root'
+            },
+        ],
+        $REPORT
+    }
+]"
+
+#
+# RocksDB no compression
+#
+run_no_compression()
+{
+  export TEST_TMPDIR=/dev/shm/rocksdb
+  rm -rf /dev/shm/rocksdb
+  mkdir /dev/shm/rocksdb
+  make clean
+  grep -iv dzlib build_tools/fbcode_config.sh | grep -iv dlz4 | grep -iv dsnappy | grep -iv dbzip2 > .tmp.fbcode_config.sh
+  mv .tmp.fbcode_config.sh build_tools/fbcode_config.sh
+  grep -v tools/ldb_test.py Makefile > .tmp.Makefile
+  mv .tmp.Makefile Makefile
+  make $DEBUG J=1 check
+}
+
+NO_COMPRESSION_COMMANDS="[
+    {
+        'name':'Rocksdb No Compression tests',
+        'oncall':'$ONCALL',
+        'steps': [
+            $CLEANUP_ENV,
+            {
+                'name':'Run RocksDB debug unit test',
+                'shell':'build_tools/rocksdb-lego-determinator run_no_compression',
+                'user':'root'
+            },
+        ],
+        $REPORT
+    }
+]"
+
+#
+# RocksDB regression
+#
+run_regression()
+{
+  time -v bash -vx ./build_tools/regression_build_test.sh $(mktemp -d  $WORKSPACE/leveldb.XXXX) $(mktemp leveldb_test_stats.XXXX)
+
+  # ======= report size to ODS ========
+
+  # parameters: $1 -- key, $2 -- value
+  function send_size_to_ods {
+    curl -s "https://www.intern.facebook.com/intern/agent/ods_set.php?entity=rocksdb_build&key=rocksdb.build_size.$1&value=$2" \
+      --connect-timeout 60
+  }
+
+  # === normal build ===
+  make clean
+  make -j$(nproc) static_lib
+  send_size_to_ods static_lib $(stat --printf="%s" librocksdb.a)
+  strip librocksdb.a
+  send_size_to_ods static_lib_stripped $(stat --printf="%s" librocksdb.a)
+
+  make -j$(nproc) shared_lib
+  send_size_to_ods shared_lib $(stat --printf="%s" `readlink -f librocksdb.so`)
+  strip `readlink -f librocksdb.so`
+  send_size_to_ods shared_lib_stripped $(stat --printf="%s" `readlink -f librocksdb.so`)
+
+  # === lite build ===
+  make clean
+  OPT=-DROCKSDB_LITE make -j$(nproc) static_lib
+  send_size_to_ods static_lib_lite $(stat --printf="%s" librocksdb.a)
+  strip librocksdb.a
+  send_size_to_ods static_lib_lite_stripped $(stat --printf="%s" librocksdb.a)
+
+  OPT=-DROCKSDB_LITE make -j$(nproc) shared_lib
+  send_size_to_ods shared_lib_lite $(stat --printf="%s" `readlink -f librocksdb.so`)
+  strip `readlink -f librocksdb.so`
+  send_size_to_ods shared_lib_lite_stripped $(stat --printf="%s" `readlink -f librocksdb.so`)
+}
+
+REGRESSION_COMMANDS="[
+    {
+        'name':'Rocksdb regression commands',
+        'oncall':'$ONCALL',
+        'steps': [
+            $CLEANUP_ENV,
+            {
+                'name':'Make and run script',
+                'shell':'build_tools/rocksdb-lego-determinator run_regression',
+                'user':'root'
+            },
+        ],
+        $REPORT
+    }
+]"
+
+case $1 in
+  punit)
+    echo $PARALLEL_UNIT_TEST_COMMANDS
+    ;;
+  unit)
+    echo $UNIT_TEST_COMMANDS
+    ;;
+  unit_481)
+    echo $UNIT_TEST_COMMANDS_481
+    ;;
+  clang_unit)
+    echo $CLANG_UNIT_TEST_COMMANDS
+    ;;
+  clang_analyze)
+    echo $CLANG_ANALYZE_COMMANDS
+    ;;
+  code_cov)
+    echo $CODE_COV_COMMANDS
+    ;;
+  unity)
+    echo $UNITY_COMMANDS
+    ;;
+  lite)
+    echo $LITE_BUILD_COMMANDS
+    ;;
+  stress_crash)
+    echo $STRESS_CRASH_TEST_COMMANDS
+    ;;
+  asan)
+    echo $ASAN_TEST_COMMANDS
+    ;;
+  asan_crash)
+    echo $ASAN_CRASH_TEST_COMMANDS
+    ;;
+  valgrind)
+    echo $VALGRIND_TEST_COMMANDS
+    ;;
+  tsan)
+    echo $TSAN_UNIT_TEST_COMMANDS
+    ;;
+  tsan_crash)
+    echo $TSAN_CRASH_TEST_COMMANDS
+    ;;
+  format_compatible)
+    echo $FORMAT_COMPATIBLE_COMMANDS
+    ;;
+  run_format_compatible)
+    run_format_compatible
+    ;;
+  no_compression)
+    echo $NO_COMPRESSION_COMMANDS
+    ;;
+  run_no_compression)
+    run_no_compression
+    ;;
+  regression)
+    echo $REGRESSION_COMMANDS
+    ;;
+  run_regression)
+    run_regression
+    ;;
+  *)
+    echo "Invalid determinator command"
+    ;;
+esac
diff --git a/src/rocksdb/build_tools/run_ci_db_test.ps1 b/src/rocksdb/build_tools/run_ci_db_test.ps1
new file mode 100644
index 0000000..5f47f3d
--- /dev/null
+++ b/src/rocksdb/build_tools/run_ci_db_test.ps1
@@ -0,0 +1,252 @@
+# This script enables you to run RocksDB tests by running
+# all the tests in parallel, utilizing all the cores.
+# For db_test the script first lists and parses the tests
+# and then fires them up in parallel using async PS Job functionality.
+# Run the script from the enlistment.
+Param(
+  [switch]$EnableJE = $false,  # Use je executable
+  [string]$WorkFolder = "",  # Direct tests to use that folder
+  [int]$Limit = -1, # -1 means run all otherwise limit for testing purposes
+  [string]$Exclude = "", # Expect a comma separated list, no spaces
+  [string]$Run = "db_test"  # Run db_test|tests
+)
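+
+# Example invocations (parameter values are illustrative):
+#   .\build_tools\run_ci_db_test.ps1                        # all db_test cases
+#   .\build_tools\run_ci_db_test.ps1 -Exclude DBTest.Empty  # skip a named case
+#   .\build_tools\run_ci_db_test.ps1 -Run tests             # other *_test binaries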
+
+# Folders and commands must be fullpath to run assuming
+# the current folder is at the root of the git enlistment
+Get-Date
+
+# If running under Appveyor assume that root
+[string]$Appveyor = $Env:APPVEYOR_BUILD_FOLDER
+if($Appveyor -ne "") {
+    $RootFolder = $Appveyor
+} else {
+    $RootFolder = $PSScriptRoot -replace '\\build_tools', ''
+}
+
+$LogFolder = -Join($RootFolder, "\db_logs\")
+$BinariesFolder = -Join($RootFolder, "\build\Debug\")
+
+if($WorkFolder -eq "") {
+
+    # If TEST_TMPDIR is set, use it
+    [string]$var = $Env:TEST_TMPDIR
+    if($var -eq "") {
+        $WorkFolder = -Join($RootFolder, "\db_tests\")
+        $Env:TEST_TMPDIR = $WorkFolder
+    } else {
+        $WorkFolder = $var
+    }
+} else {
+# Override from a command line
+  $Env:TEST_TMPDIR = $WorkFolder
+}
+
+# Use JEMALLOC executables
+if($EnableJE) {
+    $db_test = -Join ($BinariesFolder, "db_test_je.exe")
+} else {
+    $db_test = -Join ($BinariesFolder, "db_test.exe")
+}
+
+Write-Output "Root: $RootFolder, WorkFolder: $WorkFolder"
+Write-Output "Binaries: $BinariesFolder exe: $db_test"
+
+# Tests that we want to exclude from the run
+$ExcludeTests = New-Object System.Collections.Generic.HashSet[string]
+
+
+if($Exclude -ne "") {
+    Write-Host "Exclude: $Exclude"
+    $l = $Exclude -split ','
+    ForEach($t in $l) { $ExcludeTests.Add($t) | Out-Null }
+}
+
+# Create test directories in the current folder
+md -Path $WorkFolder -ErrorAction Ignore | Out-Null
+md -Path $LogFolder -ErrorAction Ignore | Out-Null
+
+# Extract the names of its tests by running db_test with --gtest_list_tests.
+# This filter removes the "#"-introduced comments, and expands to
+# fully-qualified names by changing input like this:
+#
+#   DBTest.
+#     Empty
+#     WriteEmptyBatch
+#   MultiThreaded/MultiThreadedDBTest.
+#     MultiThreaded/0  # GetParam() = 0
+#     MultiThreaded/1  # GetParam() = 1
+#
+# into this:
+#
+#   DBTest.Empty
+#   DBTest.WriteEmptyBatch
+#   MultiThreaded/MultiThreadedDBTest.MultiThreaded/0
+#   MultiThreaded/MultiThreadedDBTest.MultiThreaded/1
+# The result is stored into the $HashTable parameter as TestName -> Log File Name
+function Normalize-DbTests($HashTable) {
+
+    $Tests = @()
+# Run db_test to get a list of tests and store it into $a array
+    &$db_test --gtest_list_tests | tee -Variable Tests | Out-Null
+
+    # Current group
+    $Group=""
+
+    ForEach( $l in $Tests) {
+      # Trailing dot is a test group
+      if( $l -match "\.$") {
+        $Group = $l
+      }  else {
+        # Otherwise it is a test name, remove leading space
+        $test = $l -replace '^\s+',''
+        # remove trailing comment if any and create a log name
+        $test = $test -replace '\s+\#.*',''
+        $test = "$Group$test"
+
+        if($ExcludeTests.Contains($test)) {
+            continue
+        }
+
+        $test_log = $test -replace '[\./]','_'
+        $test_log += ".log"
+
+        # Add to a hashtable
+        $HashTable.Add($test, $test_log);
+      }
+    }
+}
+
+# The function scans the build\Debug folder to discover
+# test executables. It then populates a table mapping
+# test executable name -> log file.
+function Discover-TestBinaries($HashTable) {
+
+    $Exclusions = @("db_test*", "db_sanity_test*")
+    $p = -join ($BinariesFolder, "*_test*.exe")
+
+    dir -Path $p -Exclude $Exclusions | ForEach-Object {
+       $t = ($_.Name) -replace '.exe$', ''
+       $test_log = -join ($t, ".log")
+       $HashTable.Add($t, $test_log)
+    }
+}
+
+$TestToLog = [ordered]@{}
+
+if($Run -ceq "db_test") {
+    Normalize-DbTests -HashTable $TestToLog
+} elseif($Run -ceq "tests") {
+    Discover-TestBinaries -HashTable $TestToLog
+}
+
+
+Write-Host "Attempting to start: " ($TestToLog.Count) " tests"
+
+# Invoke a test with a filter and redirect all output
+$InvokeTestCase = {
+    param($exe, $test, $log);
+    &$exe --gtest_filter=$test > $log 2>&1
+}
+
+# Invoke all tests and redirect output
+$InvokeTestAsync = {
+    param($exe, $log)
+    &$exe > $log 2>&1
+}
+
+$jobs = @()
+$JobToLog = @{}
+# Test limiting factor here
+$count = 0
+
+ForEach($k in $TestToLog.keys) {
+
+    Write-Host "Starting $k"
+    $log_path = -join ($LogFolder, ($TestToLog.$k))
+
+    if($Run -ceq "db_test") {
+        $job = Start-Job -Name $k -ScriptBlock $InvokeTestCase -ArgumentList @($db_test,$k,$log_path)
+    } else {
+        [string]$Exe =  -Join ($BinariesFolder, $k)
+        $job = Start-Job -Name $k -ScriptBlock $InvokeTestAsync -ArgumentList @($exe,$log_path)
+    }
+
+    $JobToLog.Add($job, $log_path)
+
+    # Limiting trial runs
+    if(($Limit -gt 0) -and (++$count -ge $Limit)) {
+         break
+    }
+}
+
+[bool]$success = $true;
+
+# Wait for all to finish and get the results
+while($JobToLog.Count -gt 0) {
+
+    $jobs = @()
+    foreach($k in $JobToLog.Keys) { $jobs += $k }
+
+<#
+    if(!$success) {
+        break
+    }
+#>
+
+    $completed = Wait-Job -Job $jobs -Any
+    $log = $JobToLog[$completed]
+    $JobToLog.Remove($completed)
+
+    $message = -join @($completed.Name, " State: ", ($completed.State))
+
+    $log_content = @(Get-Content $log)
+
+    if($completed.State -ne "Completed") {
+        $success = $false
+        Write-Warning $message
+        $log_content | Write-Warning
+    } else {
+        # Scan the log. If we find PASSED and no occurrence of FAILED
+        # then it is a success.
+        [bool]$pass_found = $false
+        ForEach($l in $log_content) {
+
+            if(($l -match "^\[\s+FAILED") -or
+               ($l -match "Assertion failed:")) {
+                $pass_found = $false
+                break
+            }
+
+            if(($l -match "^\[\s+PASSED") -or
+               ($l -match " : PASSED$") -or
+                ($l -match "^PASSED") -or
+                ($l -match "Passed all tests!") ) {
+                $pass_found = $true
+            }
+        }
+
+        if(!$pass_found) {
+            $success = $false;
+            Write-Warning $message
+            $log_content | Write-Warning
+        } else {
+            Write-Host $message
+        }
+    }
+
+    # Remove cached job info from the system
+    # Should be no output
+    Receive-Job -Job $completed | Out-Null
+}
+
+Get-Date
+
+if(!$success) {
+# This does not succeed in killing off jobs quickly,
+# so we simply exit
+#    Remove-Job -Job $jobs -Force
+# indicate failure using this exit code
+    exit 12345
+ }
+
+ 
\ No newline at end of file
diff --git a/src/rocksdb/build_tools/version.sh b/src/rocksdb/build_tools/version.sh
new file mode 100755
index 0000000..c5a8595
--- /dev/null
+++ b/src/rocksdb/build_tools/version.sh
@@ -0,0 +1,14 @@
+#!/bin/sh
+if [ "$#" = "0" ]; then
+  echo "Usage: $0 major|minor|patch"
+  exit 1
+fi
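+# e.g. "./build_tools/version.sh major" prints the third field of the first
+# MAJOR line in include/rocksdb/version.h.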
+if [ "$1" = "major" ]; then
+  grep MAJOR include/rocksdb/version.h | head -n1 | awk '{print $3}'
+fi
+if [ "$1" = "minor" ]; then
+  grep MINOR include/rocksdb/version.h | head -n1 | awk '{print $3}'
+fi
+if [ "$1" = "patch" ]; then
+  grep PATCH include/rocksdb/version.h | head -n1 | awk '{print $3}'
+fi
diff --git a/src/rocksdb/configure.ac b/src/rocksdb/configure.ac
deleted file mode 100644
index b312784..0000000
--- a/src/rocksdb/configure.ac
+++ /dev/null
@@ -1,87 +0,0 @@
-#                                               -*- Autoconf -*-
-# Process this file with autoconf to produce a configure script.
-
-AC_PREREQ([2.59])
-AC_INIT([rockdb], [3.0])
-AC_CONFIG_MACRO_DIR([m4])
-AM_INIT_AUTOMAKE([-Wall -Werror foreign -Wno-portability])
-m4_ifdef([AM_SILENT_RULES],[AM_SILENT_RULES([yes])])
-m4_ifdef([AM_PROG_AR], [AM_PROG_AR])
-LT_INIT
-AC_PROG_CC
-AC_PROG_CXX
-AC_PROG_LIBTOOL
-AC_LANG_CPLUSPLUS
-AC_CONFIG_HEADERS([config.h])
-AC_CONFIG_FILES([Makefile])
-
-AC_CHECK_LIB([snappy], [snappy_compress], [HAVE_LIBSNAPPY=yes], [AC_MSG_FAILURE([libsnappy not found])])
-AC_CHECK_LIB([z], [gzread], [HAVE_LIBZ=yes], [AC_MSG_FAILURE([libz not found])])
-AC_CHECK_LIB([bz2], [BZ2_bzCompressInit], [HAVE_LIBBZ2=yes], [AC_MSG_FAILURE([libbz2 not found])])
-AC_CHECK_LIB([rt], [clock_gettime], [HAVE_LIBRT=yes], [AC_MSG_FAILURE([librt not found])])
-
-AC_ARG_WITH([tcmalloc],
-	    [AS_HELP_STRING([--without-tcmalloc], [disable tcmalloc for memory allocations])],
-	    [],
-	    [with_tcmalloc=no])
-AS_IF([test "x$with_tcmalloc" != xno],
-	    [AC_CHECK_LIB([tcmalloc], [malloc],  [HAVE_LIBTCMALLOC=yes],[AC_MSG_FAILURE([no tcmalloc found ])])]
-	    [])
-
-OLD_CXXFLAGS="$CXXFLAGS"
-CXXFLAGS="$CXXFLAGS -std=c++11"
-
-AC_COMPILE_IFELSE([AC_LANG_SOURCE([
-#include <atomic>
-int main() {}
-])], [
-     HAVE_ATOMIC=yes
-   ],
-   [
-     HAVE_ATOMIC=no
-   ])
-AC_COMPILE_IFELSE([AC_LANG_SOURCE([
-#include <gflags/gflags.h>
-using namespace gflags;
-int main() {}
-])], [
-     HAVE_GFLAGS_GFLAGS=yes
-   ],
-   [
-     HAVE_GFLAGS_GFLAGS=no
-   ])
-AC_COMPILE_IFELSE([AC_LANG_SOURCE([
-#include <gflags/gflags.h>
-using namespace google;
-int main() {}
-])], [
-     HAVE_GOOGLE_GFLAGS=yes
-   ],
-   [
-     HAVE_GOOGLE_GFLAGS=no
-   ])
-
-AC_COMPILE_IFELSE([AC_LANG_SOURCE([
-#include <fcntl.h>
-      int main() {
-        int fd = open("/dev/null", 0);
-        fallocate(fd, 0, 0, 1024);
-      }
-])], [
-     HAVE_FALLOCATE=yes
-   ],
-   [
-     HAVE_FALLOCATE=no
-   ])
-
-AM_CONDITIONAL(WITH_TCMALLOC, [test x"$HAVE_LIBTCMALLOC" = "xyes"])
-AM_CONDITIONAL(WITH_ATOMIC, [test x"$HAVE_ATOMIC" = "xyes"])
-AM_CONDITIONAL(WITH_SNAPPY, [test x"$HAVE_LIBSNAPPY" = "xyes"])
-AM_CONDITIONAL(WITH_Z, [test x"$HAVE_LIBZ" = "xyes" ])
-AM_CONDITIONAL(WITH_BZ2, [test x"$HAVE_LIBBZ2" = "xyes"])
-AM_CONDITIONAL(WITH_RT, [test x"$HAVE_LIBRT" = "xyes"])
-AM_CONDITIONAL(WITH_GOOGLE_FLAGS, [test x"$HAVE_GFLAGS_GFLAGS" = "xyes"])
-AM_CONDITIONAL(WITH_GFLAGS_FLAGS, [test x"$HAVE_GOOGLE_GFLAGS" = "xyes"])
-AM_CONDITIONAL(WITH_FALLOCATE, [test x"$HAVE_FALLOCATE" = "xyes"])
-
-AC_OUTPUT
diff --git a/src/rocksdb/coverage/coverage_test.sh b/src/rocksdb/coverage/coverage_test.sh
new file mode 100755
index 0000000..4d8052c
--- /dev/null
+++ b/src/rocksdb/coverage/coverage_test.sh
@@ -0,0 +1,78 @@
+#!/bin/bash
+
+# Exit on error.
+set -e
+
+if [ -n "$USE_CLANG" ]; then
+  echo "Error: Coverage test is supported only for gcc."
+  exit 1
+fi
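+
+# Run this from the coverage/ directory of a tree that has produced .gcno
+# files, e.g.:
+#   ./coverage_test.sh          # text report only
+#   HTML=1 ./coverage_test.sh   # also generate the lcov/genhtml report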
+
+ROOT=".."
+# Fetch right version of gcov
+if [ -d /mnt/gvfs/third-party -a -z "$CXX" ]; then
+  source $ROOT/build_tools/fbcode_config.sh
+  GCOV=$GCC_BASE/bin/gcov
+else
+  GCOV=$(which gcov)
+fi
+
+COVERAGE_DIR="$PWD/COVERAGE_REPORT"
+mkdir -p $COVERAGE_DIR
+
+# Find all gcno files to generate the coverage report
+
+GCNO_FILES=`find $ROOT -name "*.gcno"`
+$GCOV --preserve-paths --relative-only --no-output $GCNO_FILES 2>/dev/null |
+  # Parse the raw gcov report to more human readable form.
+  python $ROOT/coverage/parse_gcov_output.py |
+  # Write the output to both stdout and report file.
+  tee $COVERAGE_DIR/coverage_report_all.txt &&
+echo -e "Generated coverage report for all files: $COVERAGE_DIR/coverage_report_all.txt\n"
+
+# TODO: we also need to get the files of the latest commits.
+# Get the most recently committed files.
+LATEST_FILES=`
+  git show --pretty="format:" --name-only HEAD |
+  grep -v "^$" |
+  paste -s -d,`
+RECENT_REPORT=$COVERAGE_DIR/coverage_report_recent.txt
+
+echo -e "Recently updated files: $LATEST_FILES\n" > $RECENT_REPORT
+$GCOV --preserve-paths --relative-only --no-output $GCNO_FILES 2>/dev/null |
+  python $ROOT/coverage/parse_gcov_output.py --interested-files $LATEST_FILES |
+  tee -a $RECENT_REPORT &&
+echo -e "Generated coverage report for recently updated files: $RECENT_REPORT\n"
+
+# Unless otherwise specified, we'll not generate html report by default
+if [ -z "$HTML" ]; then
+  exit 0
+fi
+
+# Generate the html report. If we cannot find lcov on this machine, we simply
+# skip this step.
+echo "Generating the html coverage report..."
+
+LCOV=$(which lcov 2>/dev/null || true)
+if [ -z "$LCOV" ]
+then
+  echo "Skip: Cannot find lcov to generate the html report."
+  exit 0
+fi
+
+LCOV_VERSION=$(lcov -v | grep 1.1 || true)
+if [ -n "$LCOV_VERSION" ]
+then
+  echo "Unsupported lcov version detected (1.1.x); skipping the html report."
+  exit 0
+fi
+
+(cd $ROOT; lcov --no-external \
+     --capture  \
+     --directory $PWD \
+     --gcov-tool $GCOV \
+     --output-file $COVERAGE_DIR/coverage.info)
+
+genhtml $COVERAGE_DIR/coverage.info -o $COVERAGE_DIR
+
+echo "HTML Coverage report is generated in $COVERAGE_DIR"
diff --git a/src/rocksdb/coverage/parse_gcov_output.py b/src/rocksdb/coverage/parse_gcov_output.py
new file mode 100644
index 0000000..72e8b07
--- /dev/null
+++ b/src/rocksdb/coverage/parse_gcov_output.py
@@ -0,0 +1,118 @@
+import optparse
+import re
+import sys
+
+from optparse import OptionParser
+
+# The gcov report follows a fixed pattern: each file produces two lines,
+# from which we can extract the file name, the total number of lines and the
+# coverage percentage.
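+# For example, gcov emits pairs of lines such as:
+#   File 'db/builder.cc'
+#   Lines executed:83.33% of 120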
+def parse_gcov_report(gcov_input):
+    per_file_coverage = {}
+    total_coverage = None
+    current_file = None
+
+    # Iterate over the stream we were given (the caller passes sys.stdin).
+    for line in gcov_input:
+        line = line.strip()
+
+        # -- First line of the coverage report (with the file name in it)?
+        match_obj = re.match("^File '(.*)'$", line)
+        if match_obj:
+            # fetch the file name from the first line of the report.
+            current_file = match_obj.group(1)
+            continue
+
+        # -- Second line of the file report (with coverage percentage)
+        match_obj = re.match("^Lines executed:(.*)% of (.*)", line)
+
+        if match_obj:
+            coverage = float(match_obj.group(1))
+            lines = int(match_obj.group(2))
+
+            if current_file is not None:
+                per_file_coverage[current_file] = (coverage, lines)
+                current_file = None
+            else:
+                # If current_file is not set, we've reached the last line of the
+                # report, which contains the overall coverage percentage.
+                total_coverage = (coverage, lines)
+            continue
+
+        # If the line matches neither pattern above, we can simply ignore it:
+        # it's either an empty line or a note that no executable lines were
+        # found for the given file.
+        current_file = None
+
+    return per_file_coverage, total_coverage
+
+def get_option_parser():
+    usage = "Parse the gcov output and generate more human-readable code " +\
+            "coverage report."
+    parser = OptionParser(usage)
+
+    parser.add_option(
+        "--interested-files", "-i",
+        dest="filenames",
+        help="Comma separated files names. if specified, we will display " +
+             "the coverage report only for interested source files. " +
+             "Otherwise we will display the coverage report for all " +
+             "source files."
+    )
+    return parser
+
+def display_file_coverage(per_file_coverage, total_coverage):
+    # To print auto-adjusted columns, we need to know the length of the
+    # longest file name.
+    max_file_name_length = max(
+        len(fname) for fname in per_file_coverage.keys()
+    )
+
+    # -- Print header
+    # The separator width is determined by the three column widths:
+    # file name, coverage percentage and line count.
+    header_template = \
+        "%" + str(max_file_name_length) + "s\t%s\t%s"
+    separator = "-" * (max_file_name_length + 10 + 20)
+    print header_template % ("Filename", "Coverage", "Lines")
+    print separator
+
+    # -- Print body
+    # template for printing coverage report for each file.
+    record_template = "%" + str(max_file_name_length) + "s\t%5.2f%%\t%10d"
+
+    for fname, coverage_info in per_file_coverage.items():
+        coverage, lines = coverage_info
+        print record_template % (fname, coverage, lines)
+
+    # -- Print footer
+    if total_coverage:
+        print separator
+        print record_template % ("Total", total_coverage[0], total_coverage[1])
+
+def report_coverage():
+    parser = get_option_parser()
+    (options, args) = parser.parse_args()
+
+    interested_files = set()
+    if options.filenames is not None:
+        interested_files = set(f.strip() for f in options.filenames.split(','))
+
+    # To keep things simple, for now we only read the gcov report from stdin.
+    per_file_coverage, total_coverage = parse_gcov_report(sys.stdin)
+
+    # Check if we need to display coverage info for interested files.
+    if len(interested_files):
+        per_file_coverage = dict(
+            (fname, per_file_coverage[fname]) for fname in interested_files
+            if fname in per_file_coverage
+        )
+        # If we're only interested in a few files, it makes no sense to report
+        # the total_coverage.
+        total_coverage = None
+
+    if not len(per_file_coverage):
+        print >> sys.stderr, "Cannot find coverage info for the given files."
+        return
+    display_file_coverage(per_file_coverage, total_coverage)
+
+if __name__ == "__main__":
+    report_coverage()
diff --git a/src/rocksdb/db/builder.cc b/src/rocksdb/db/builder.cc
index 2a33bb0..3d07a0f 100644
--- a/src/rocksdb/db/builder.cc
+++ b/src/rocksdb/db/builder.cc
@@ -9,9 +9,14 @@
 
 #include "db/builder.h"
 
+#include <algorithm>
+#include <deque>
 #include <vector>
+
+#include "db/compaction_iterator.h"
 #include "db/dbformat.h"
 #include "db/filename.h"
+#include "db/internal_stats.h"
 #include "db/merge_helper.h"
 #include "db/table_cache.h"
 #include "db/version_edit.h"
@@ -21,9 +26,10 @@
 #include "rocksdb/options.h"
 #include "rocksdb/table.h"
 #include "table/block_based_table_builder.h"
+#include "util/file_reader_writer.h"
 #include "util/iostats_context_imp.h"
-#include "util/thread_status_util.h"
 #include "util/stop_watch.h"
+#include "util/thread_status_util.h"
 
 namespace rocksdb {
 
@@ -34,7 +40,7 @@ TableBuilder* NewTableBuilder(
     const InternalKeyComparator& internal_comparator,
     const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
         int_tbl_prop_collector_factories,
-    WritableFile* file, const CompressionType compression_type,
+    WritableFileWriter* file, const CompressionType compression_type,
     const CompressionOptions& compression_opts, const bool skip_filters) {
   return ioptions.table_factory->NewTableBuilder(
       TableBuilderOptions(ioptions, internal_comparator,
@@ -49,179 +55,72 @@ Status BuildTable(
     FileMetaData* meta, const InternalKeyComparator& internal_comparator,
     const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
         int_tbl_prop_collector_factories,
-    const SequenceNumber newest_snapshot,
-    const SequenceNumber earliest_seqno_in_memtable,
-    const CompressionType compression,
+    std::vector<SequenceNumber> snapshots, const CompressionType compression,
     const CompressionOptions& compression_opts, bool paranoid_file_checks,
-    const Env::IOPriority io_priority, TableProperties* table_properties) {
+    InternalStats* internal_stats, const Env::IOPriority io_priority,
+    TableProperties* table_properties) {
   // Reports the IOStats for flush for every following bytes.
   const size_t kReportFlushIOStatsEvery = 1048576;
   Status s;
   meta->fd.file_size = 0;
-  meta->smallest_seqno = meta->largest_seqno = 0;
   iter->SeekToFirst();
 
-  // If the sequence number of the smallest entry in the memtable is
-  // smaller than the most recent snapshot, then we do not trigger
-  // removal of duplicate/deleted keys as part of this builder.
-  bool purge = ioptions.purge_redundant_kvs_while_flush;
-  if (earliest_seqno_in_memtable <= newest_snapshot) {
-    purge = false;
-  }
-
   std::string fname = TableFileName(ioptions.db_paths, meta->fd.GetNumber(),
                                     meta->fd.GetPathId());
   if (iter->Valid()) {
-    unique_ptr<WritableFile> file;
-    s = env->NewWritableFile(fname, &file, env_options);
-    if (!s.ok()) {
-      return s;
-    }
-    file->SetIOPriority(io_priority);
+    TableBuilder* builder;
+    unique_ptr<WritableFileWriter> file_writer;
+    {
+      unique_ptr<WritableFile> file;
+      s = env->NewWritableFile(fname, &file, env_options);
+      if (!s.ok()) {
+        return s;
+      }
+      file->SetIOPriority(io_priority);
 
-    TableBuilder* builder = NewTableBuilder(
-        ioptions, internal_comparator, int_tbl_prop_collector_factories,
-        file.get(), compression, compression_opts);
+      file_writer.reset(new WritableFileWriter(std::move(file), env_options));
 
-    {
-      // the first key is the smallest key
-      Slice key = iter->key();
-      meta->smallest.DecodeFrom(key);
-      meta->smallest_seqno = GetInternalKeySeqno(key);
-      meta->largest_seqno = meta->smallest_seqno;
+      builder = NewTableBuilder(
+          ioptions, internal_comparator, int_tbl_prop_collector_factories,
+          file_writer.get(), compression, compression_opts);
     }
 
-    MergeHelper merge(internal_comparator.user_comparator(),
-                      ioptions.merge_operator, ioptions.info_log,
+    MergeHelper merge(env, internal_comparator.user_comparator(),
+                      ioptions.merge_operator, nullptr, ioptions.info_log,
                       ioptions.min_partial_merge_operands,
-                      true /* internal key corruption is not ok */);
-
-    if (purge) {
-      // Ugly walkaround to avoid compiler error for release build
-      bool ok __attribute__((unused)) = true;
-
-      // Will write to builder if current key != prev key
-      ParsedInternalKey prev_ikey;
-      std::string prev_key;
-      bool is_first_key = true;    // Also write if this is the very first key
-
-      while (iter->Valid()) {
-        bool iterator_at_next = false;
-
-        // Get current key
-        ParsedInternalKey this_ikey;
-        Slice key = iter->key();
-        Slice value = iter->value();
-
-        // In-memory key corruption is not ok;
-        // TODO: find a clean way to treat in memory key corruption
-        ok = ParseInternalKey(key, &this_ikey);
-        assert(ok);
-        assert(this_ikey.sequence >= earliest_seqno_in_memtable);
-
-        // If the key is the same as the previous key (and it is not the
-        // first key), then we skip it, since it is an older version.
-        // Otherwise we output the key and mark it as the "new" previous key.
-        if (!is_first_key && !internal_comparator.user_comparator()->Compare(
-                                  prev_ikey.user_key, this_ikey.user_key)) {
-          // seqno within the same key are in decreasing order
-          assert(this_ikey.sequence < prev_ikey.sequence);
-        } else {
-          is_first_key = false;
-
-          if (this_ikey.type == kTypeMerge) {
-            // TODO(tbd): Add a check here to prevent RocksDB from crash when
-            // reopening a DB w/o properly specifying the merge operator.  But
-            // currently we observed a memory leak on failing in RocksDB
-            // recovery, so we decide to let it crash instead of causing
-            // memory leak for now before we have identified the real cause
-            // of the memory leak.
-
-            // Handle merge-type keys using the MergeHelper
-            // TODO: pass statistics to MergeUntil
-            merge.MergeUntil(iter, 0 /* don't worry about snapshot */);
-            iterator_at_next = true;
-            if (merge.IsSuccess()) {
-              // Merge completed correctly.
-              // Add the resulting merge key/value and continue to next
-              builder->Add(merge.key(), merge.value());
-              prev_key.assign(merge.key().data(), merge.key().size());
-              ok = ParseInternalKey(Slice(prev_key), &prev_ikey);
-              assert(ok);
-            } else {
-              // Merge did not find a Put/Delete.
-              // Can not compact these merges into a kValueType.
-              // Write them out one-by-one. (Proceed back() to front())
-              const std::deque<std::string>& keys = merge.keys();
-              const std::deque<std::string>& values = merge.values();
-              assert(keys.size() == values.size() && keys.size() >= 1);
-              std::deque<std::string>::const_reverse_iterator key_iter;
-              std::deque<std::string>::const_reverse_iterator value_iter;
-              for (key_iter=keys.rbegin(), value_iter = values.rbegin();
-                   key_iter != keys.rend() && value_iter != values.rend();
-                   ++key_iter, ++value_iter) {
-
-                builder->Add(Slice(*key_iter), Slice(*value_iter));
-              }
-
-              // Sanity check. Both iterators should end at the same time
-              assert(key_iter == keys.rend() && value_iter == values.rend());
-
-              prev_key.assign(keys.front());
-              ok = ParseInternalKey(Slice(prev_key), &prev_ikey);
-              assert(ok);
-            }
-          } else {
-            // Handle Put/Delete-type keys by simply writing them
-            builder->Add(key, value);
-            prev_key.assign(key.data(), key.size());
-            ok = ParseInternalKey(Slice(prev_key), &prev_ikey);
-            assert(ok);
-          }
-        }
-
-        if (io_priority == Env::IO_HIGH &&
-            IOSTATS(bytes_written) >= kReportFlushIOStatsEvery) {
-          ThreadStatusUtil::IncreaseThreadOperationProperty(
-              ThreadStatus::FLUSH_BYTES_WRITTEN,
-              IOSTATS(bytes_written));
-          IOSTATS_RESET(bytes_written);
-        }
-        if (!iterator_at_next) iter->Next();
-      }
-
-      // The last key is the largest key
-      meta->largest.DecodeFrom(Slice(prev_key));
-      SequenceNumber seqno = GetInternalKeySeqno(Slice(prev_key));
-      meta->smallest_seqno = std::min(meta->smallest_seqno, seqno);
-      meta->largest_seqno = std::max(meta->largest_seqno, seqno);
-
-    } else {
-      for (; iter->Valid(); iter->Next()) {
-        Slice key = iter->key();
-        meta->largest.DecodeFrom(key);
-        builder->Add(key, iter->value());
-        SequenceNumber seqno = GetInternalKeySeqno(key);
-        meta->smallest_seqno = std::min(meta->smallest_seqno, seqno);
-        meta->largest_seqno = std::max(meta->largest_seqno, seqno);
-        if (io_priority == Env::IO_HIGH &&
-            IOSTATS(bytes_written) >= kReportFlushIOStatsEvery) {
-          ThreadStatusUtil::IncreaseThreadOperationProperty(
-              ThreadStatus::FLUSH_BYTES_WRITTEN,
-              IOSTATS(bytes_written));
-          IOSTATS_RESET(bytes_written);
-        }
+                      true /* internal key corruption is not ok */,
+                      snapshots.empty() ? 0 : snapshots.back());
+
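+    // CompactionIterator now performs the snapshot-aware key deduplication and
+    // merge handling that the removed purge loop above did inline.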
+    CompactionIterator c_iter(iter, internal_comparator.user_comparator(),
+                              &merge, kMaxSequenceNumber, &snapshots, env,
+                              true /* internal key corruption is not ok */);
+    c_iter.SeekToFirst();
+    for (; c_iter.Valid(); c_iter.Next()) {
+      const Slice& key = c_iter.key();
+      const Slice& value = c_iter.value();
+      builder->Add(key, value);
+      meta->UpdateBoundaries(key, c_iter.ikey().sequence);
+
+      // TODO(noetzli): Update stats after flush, too.
+      if (io_priority == Env::IO_HIGH &&
+          IOSTATS(bytes_written) >= kReportFlushIOStatsEvery) {
+        ThreadStatusUtil::SetThreadOperationProperty(
+            ThreadStatus::FLUSH_BYTES_WRITTEN, IOSTATS(bytes_written));
       }
     }
 
     // Finish and check for builder errors
-    if (s.ok()) {
-      s = builder->Finish();
-    } else {
+    bool empty = builder->NumEntries() == 0;
+    s = c_iter.status();
+    if (!s.ok() || empty) {
       builder->Abandon();
+    } else {
+      s = builder->Finish();
     }
-    if (s.ok()) {
+
+    if (s.ok() && !empty) {
       meta->fd.file_size = builder->FileSize();
+      meta->marked_for_compaction = builder->NeedCompact();
       assert(meta->fd.GetFileSize() > 0);
       if (table_properties) {
         *table_properties = builder->GetTableProperties();
@@ -230,30 +129,27 @@ Status BuildTable(
     delete builder;
 
     // Finish and check for file errors
-    if (s.ok() && !ioptions.disable_data_sync) {
-      if (ioptions.use_fsync) {
-        StopWatch sw(env, ioptions.statistics, TABLE_SYNC_MICROS);
-        s = file->Fsync();
-      } else {
-        StopWatch sw(env, ioptions.statistics, TABLE_SYNC_MICROS);
-        s = file->Sync();
-      }
+    if (s.ok() && !empty && !ioptions.disable_data_sync) {
+      StopWatch sw(env, ioptions.statistics, TABLE_SYNC_MICROS);
+      s = file_writer->Sync(ioptions.use_fsync);
     }
-    if (s.ok()) {
-      s = file->Close();
+    if (s.ok() && !empty) {
+      s = file_writer->Close();
     }
 
-    if (s.ok()) {
+    if (s.ok() && !empty) {
       // Verify that the table is usable
-      Iterator* it = table_cache->NewIterator(ReadOptions(), env_options,
-                                              internal_comparator, meta->fd);
+      std::unique_ptr<Iterator> it(table_cache->NewIterator(
+          ReadOptions(), env_options, internal_comparator, meta->fd, nullptr,
+          (internal_stats == nullptr) ? nullptr
+                                      : internal_stats->GetFileReadHist(0),
+          false));
       s = it->status();
       if (s.ok() && paranoid_file_checks) {
-        for (it->SeekToFirst(); it->Valid(); it->Next()) {}
+        for (it->SeekToFirst(); it->Valid(); it->Next()) {
+        }
         s = it->status();
       }
-
-      delete it;
     }
   }
 
@@ -262,9 +158,7 @@ Status BuildTable(
     s = iter->status();
   }
 
-  if (s.ok() && meta->fd.GetFileSize() > 0) {
-    // Keep it
-  } else {
+  if (!s.ok() || meta->fd.GetFileSize() == 0) {
     env->DeleteFile(fname);
   }
   return s;
diff --git a/src/rocksdb/db/builder.h b/src/rocksdb/db/builder.h
index 9d2888d..09d81bf 100644
--- a/src/rocksdb/db/builder.h
+++ b/src/rocksdb/db/builder.h
@@ -29,14 +29,15 @@ class Iterator;
 class TableCache;
 class VersionEdit;
 class TableBuilder;
-class WritableFile;
+class WritableFileWriter;
+class InternalStats;
 
 TableBuilder* NewTableBuilder(
     const ImmutableCFOptions& options,
     const InternalKeyComparator& internal_comparator,
     const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
         int_tbl_prop_collector_factories,
-    WritableFile* file, const CompressionType compression_type,
+    WritableFileWriter* file, const CompressionType compression_type,
     const CompressionOptions& compression_opts,
     const bool skip_filters = false);
 
@@ -51,10 +52,9 @@ extern Status BuildTable(
     FileMetaData* meta, const InternalKeyComparator& internal_comparator,
     const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
         int_tbl_prop_collector_factories,
-    const SequenceNumber newest_snapshot,
-    const SequenceNumber earliest_seqno_in_memtable,
-    const CompressionType compression,
+    std::vector<SequenceNumber> snapshots, const CompressionType compression,
     const CompressionOptions& compression_opts, bool paranoid_file_checks,
+    InternalStats* internal_stats,
     const Env::IOPriority io_priority = Env::IO_HIGH,
     TableProperties* table_properties = nullptr);
 
diff --git a/src/rocksdb/db/c.cc b/src/rocksdb/db/c.cc
index 985c9fb..8cd0826 100644
--- a/src/rocksdb/db/c.cc
+++ b/src/rocksdb/db/c.cc
@@ -12,10 +12,11 @@
 #include "rocksdb/c.h"
 
 #include <stdlib.h>
-#include <unistd.h>
+#include "port/port.h"
 #include "rocksdb/cache.h"
 #include "rocksdb/compaction_filter.h"
 #include "rocksdb/comparator.h"
+#include "rocksdb/convenience.h"
 #include "rocksdb/db.h"
 #include "rocksdb/env.h"
 #include "rocksdb/filter_policy.h"
@@ -30,6 +31,7 @@
 #include "rocksdb/slice_transform.h"
 #include "rocksdb/table.h"
 #include "rocksdb/utilities/backupable_db.h"
+#include "utilities/merge_operators.h"
 
 using rocksdb::Cache;
 using rocksdb::ColumnFamilyDescriptor;
@@ -37,8 +39,6 @@ using rocksdb::ColumnFamilyHandle;
 using rocksdb::ColumnFamilyOptions;
 using rocksdb::CompactionFilter;
 using rocksdb::CompactionFilterFactory;
-using rocksdb::CompactionFilterV2;
-using rocksdb::CompactionFilterFactoryV2;
 using rocksdb::CompactionFilterContext;
 using rocksdb::CompactionOptionsFIFO;
 using rocksdb::Comparator;
@@ -53,6 +53,7 @@ using rocksdb::FlushOptions;
 using rocksdb::Iterator;
 using rocksdb::Logger;
 using rocksdb::MergeOperator;
+using rocksdb::MergeOperators;
 using rocksdb::NewBloomFilterPolicy;
 using rocksdb::NewLRUCache;
 using rocksdb::Options;
@@ -63,6 +64,7 @@ using rocksdb::Range;
 using rocksdb::ReadOptions;
 using rocksdb::SequentialFile;
 using rocksdb::Slice;
+using rocksdb::SliceParts;
 using rocksdb::SliceTransform;
 using rocksdb::Snapshot;
 using rocksdb::Status;
@@ -74,6 +76,7 @@ using rocksdb::BackupEngine;
 using rocksdb::BackupableDBOptions;
 using rocksdb::BackupInfo;
 using rocksdb::RestoreOptions;
+using rocksdb::CompactRangeOptions;
 
 using std::shared_ptr;
 
@@ -167,99 +170,6 @@ struct rocksdb_compactionfilterfactory_t : public CompactionFilterFactory {
   virtual const char* Name() const override { return (*name_)(state_); }
 };
 
-struct rocksdb_compactionfilterv2_t : public CompactionFilterV2 {
-  void* state_;
-  void (*destructor_)(void*);
-  const char* (*name_)(void*);
-  void (*filter_)(void*, int level, size_t num_keys,
-                  const char* const* keys_list, const size_t* keys_list_sizes,
-                  const char* const* existing_values_list, const size_t* existing_values_list_sizes,
-                  char** new_values_list, size_t* new_values_list_sizes,
-                  unsigned char* to_delete_list);
-
-  virtual ~rocksdb_compactionfilterv2_t() {
-    (*destructor_)(state_);
-  }
-
-  virtual const char* Name() const override { return (*name_)(state_); }
-
-  virtual std::vector<bool> Filter(
-      int level, const SliceVector& keys, const SliceVector& existing_values,
-      std::vector<std::string>* new_values,
-      std::vector<bool>* values_changed) const override {
-    // Make a vector pointing to the underlying key data.
-    size_t num_keys = keys.size();
-    std::vector<const char*> keys_list(num_keys);
-    std::vector<size_t> keys_list_sizes(num_keys);
-    for (size_t i = 0; i < num_keys; ++i) {
-      keys_list[i] = keys[i].data();
-      keys_list_sizes[i] = keys[i].size();
-    }
-    // Make a vector pointing to the underlying value data.
-    std::vector<const char*> existing_values_list(num_keys);
-    std::vector<size_t> existing_values_list_sizes(num_keys);
-    for (size_t i = 0; i < num_keys; ++i) {
-      existing_values_list[i] = existing_values[i].data();
-      existing_values_list_sizes[i] = existing_values[i].size();
-    }
-    // Make a vector which will accept newly-allocated char* arrays
-    // which we will take ownership of and assign to strings in new_values.
-    new_values->clear();
-    std::vector<char*> new_values_list(num_keys);
-    std::vector<size_t> new_values_list_sizes(num_keys);
-    // Resize values_changed to hold all keys.
-    values_changed->resize(num_keys);
-    // Make a vector for bools indicating a value should be deleted
-    // on compaction (true) or maintained (false).
-    std::vector<unsigned char> to_delete_list(num_keys);
-
-    (*filter_)(
-        state_, level, num_keys, &keys_list[0], &keys_list_sizes[0],
-        &existing_values_list[0], &existing_values_list_sizes[0],
-        &new_values_list[0], &new_values_list_sizes[0], &to_delete_list[0]);
-
-    // Now, we transfer any changed values, setting values_changed and
-    // initializing new_values in the event a value changed.
-    std::vector<bool> to_delete(num_keys);
-    for (size_t i = 0; i < num_keys; ++i) {
-      to_delete[i] = to_delete_list[i];
-      (*values_changed)[i] = new_values_list[i] != nullptr;
-      if ((*values_changed)[i]) {
-        new_values->push_back(std::string(new_values_list[i], new_values_list_sizes[i]));
-        free(new_values_list[i]);
-      }
-    }
-    return to_delete;
-  }
-};
-
-struct rocksdb_compactionfilterfactoryv2_t : public CompactionFilterFactoryV2 {
-  void* state_;
-  void (*destructor_)(void*);
-  const char* (*name_)(void*);
-  rocksdb_compactionfilterv2_t* (*create_compaction_filter_v2_)(
-      void* state, const rocksdb_compactionfiltercontext_t* context);
-
-  rocksdb_compactionfilterfactoryv2_t(const SliceTransform* prefix_extractor)
-      : CompactionFilterFactoryV2(prefix_extractor) {
-  }
-
-  virtual ~rocksdb_compactionfilterfactoryv2_t() {
-    (*destructor_)(state_);
-  }
-
-  virtual const char* Name() const override { return (*name_)(state_); }
-
-  virtual std::unique_ptr<CompactionFilterV2> CreateCompactionFilterV2(
-      const CompactionFilterContext& context) override {
-    struct rocksdb_compactionfiltercontext_t c_context;
-    c_context.rep.is_full_compaction = context.is_full_compaction;
-    c_context.rep.is_manual_compaction = context.is_manual_compaction;
-    return std::unique_ptr<CompactionFilterV2>(
-        (*create_compaction_filter_v2_)(state_, &c_context));
-  }
-};
-
 struct rocksdb_comparator_t : public Comparator {
   void* state_;
   void (*destructor_)(void*);
@@ -479,6 +389,7 @@ static bool SaveError(char** errptr, const Status& s) {
     *errptr = strdup(s.ToString().c_str());
   } else {
     // TODO(sanjay): Merge with existing error?
+    // This is a bug if *errptr is not created by malloc()
     free(*errptr);
     *errptr = strdup(s.ToString().c_str());
   }
@@ -602,6 +513,10 @@ void rocksdb_close(rocksdb_t* db) {
   delete db;
 }
 
+void rocksdb_options_set_uint64add_merge_operator(rocksdb_options_t* opt) {
+  opt->rep.merge_operator = rocksdb::MergeOperators::CreateUInt64AddOperator();
+}
+
 rocksdb_t* rocksdb_open_column_families(
     const rocksdb_options_t* db_options,
     const char* name,
@@ -830,6 +745,69 @@ char* rocksdb_get_cf(
   return result;
 }
 
+void rocksdb_multi_get(
+    rocksdb_t* db,
+    const rocksdb_readoptions_t* options,
+    size_t num_keys, const char* const* keys_list,
+    const size_t* keys_list_sizes,
+    char** values_list, size_t* values_list_sizes,
+    char** errs) {
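+  // Per-key results: a not-found key yields values_list[i] == NULL with
+  // errs[i] == NULL; any other failure stores a malloc'd message in errs[i].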
+  std::vector<Slice> keys(num_keys);
+  for (size_t i = 0; i < num_keys; i++) {
+    keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+  std::vector<std::string> values(num_keys);
+  std::vector<Status> statuses = db->rep->MultiGet(options->rep, keys, &values);
+  for (size_t i = 0; i < num_keys; i++) {
+    if (statuses[i].ok()) {
+      values_list[i] = CopyString(values[i]);
+      values_list_sizes[i] = values[i].size();
+      errs[i] = nullptr;
+    } else {
+      values_list[i] = nullptr;
+      values_list_sizes[i] = 0;
+      if (!statuses[i].IsNotFound()) {
+        errs[i] = strdup(statuses[i].ToString().c_str());
+      } else {
+        errs[i] = nullptr;
+      }
+    }
+  }
+}
+
+void rocksdb_multi_get_cf(
+    rocksdb_t* db,
+    const rocksdb_readoptions_t* options,
+    const rocksdb_column_family_handle_t* const* column_families,
+    size_t num_keys, const char* const* keys_list,
+    const size_t* keys_list_sizes,
+    char** values_list, size_t* values_list_sizes,
+    char** errs) {
+  std::vector<Slice> keys(num_keys);
+  std::vector<ColumnFamilyHandle*> cfs(num_keys);
+  for (size_t i = 0; i < num_keys; i++) {
+    keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
+    cfs[i] = column_families[i]->rep;
+  }
+  std::vector<std::string> values(num_keys);
+  std::vector<Status> statuses = db->rep->MultiGet(options->rep, cfs, keys, &values);
+  for (size_t i = 0; i < num_keys; i++) {
+    if (statuses[i].ok()) {
+      values_list[i] = CopyString(values[i]);
+      values_list_sizes[i] = values[i].size();
+      errs[i] = nullptr;
+    } else {
+      values_list[i] = nullptr;
+      values_list_sizes[i] = 0;
+      if (!statuses[i].IsNotFound()) {
+        errs[i] = strdup(statuses[i].ToString().c_str());
+      } else {
+        errs[i] = nullptr;
+      }
+    }
+  }
+}
+
 rocksdb_iterator_t* rocksdb_create_iterator(
     rocksdb_t* db,
     const rocksdb_readoptions_t* options) {
@@ -936,6 +914,7 @@ void rocksdb_compact_range(
     const char* limit_key, size_t limit_key_len) {
   Slice a, b;
   db->rep->CompactRange(
+      CompactRangeOptions(),
       // Pass nullptr Slice if corresponding "const char*" is nullptr
       (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr),
       (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr));
@@ -948,7 +927,7 @@ void rocksdb_compact_range_cf(
     const char* limit_key, size_t limit_key_len) {
   Slice a, b;
   db->rep->CompactRange(
-      column_family->rep,
+      CompactRangeOptions(), column_family->rep,
       // Pass nullptr Slice if corresponding "const char*" is nullptr
       (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr),
       (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr));
@@ -1071,6 +1050,43 @@ void rocksdb_writebatch_put_cf(
   b->rep.Put(column_family->rep, Slice(key, klen), Slice(val, vlen));
 }
 
+void rocksdb_writebatch_putv(
+    rocksdb_writebatch_t* b,
+    int num_keys, const char* const* keys_list,
+    const size_t* keys_list_sizes,
+    int num_values, const char* const* values_list,
+    const size_t* values_list_sizes) {
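+  // SliceParts concatenates the pieces, so key parts {"z", "ap"} form the single
+  // key "zap" (exercised by the writebatch_vectors phase in c_test.c below).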
+  std::vector<Slice> key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+  std::vector<Slice> value_slices(num_values);
+  for (int i = 0; i < num_values; i++) {
+    value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+  }
+  b->rep.Put(SliceParts(key_slices.data(), num_keys),
+             SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_putv_cf(
+    rocksdb_writebatch_t* b,
+    rocksdb_column_family_handle_t* column_family,
+    int num_keys, const char* const* keys_list,
+    const size_t* keys_list_sizes,
+    int num_values, const char* const* values_list,
+    const size_t* values_list_sizes) {
+  std::vector<Slice> key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+  std::vector<Slice> value_slices(num_values);
+  for (int i = 0; i < num_values; i++) {
+    value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+  }
+  b->rep.Put(column_family->rep, SliceParts(key_slices.data(), num_keys),
+             SliceParts(value_slices.data(), num_values));
+}
+
 void rocksdb_writebatch_merge(
     rocksdb_writebatch_t* b,
     const char* key, size_t klen,
@@ -1086,6 +1102,43 @@ void rocksdb_writebatch_merge_cf(
   b->rep.Merge(column_family->rep, Slice(key, klen), Slice(val, vlen));
 }
 
+void rocksdb_writebatch_mergev(
+    rocksdb_writebatch_t* b,
+    int num_keys, const char* const* keys_list,
+    const size_t* keys_list_sizes,
+    int num_values, const char* const* values_list,
+    const size_t* values_list_sizes) {
+  std::vector<Slice> key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+  std::vector<Slice> value_slices(num_values);
+  for (int i = 0; i < num_values; i++) {
+    value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+  }
+  b->rep.Merge(SliceParts(key_slices.data(), num_keys),
+               SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_mergev_cf(
+    rocksdb_writebatch_t* b,
+    rocksdb_column_family_handle_t* column_family,
+    int num_keys, const char* const* keys_list,
+    const size_t* keys_list_sizes,
+    int num_values, const char* const* values_list,
+    const size_t* values_list_sizes) {
+  std::vector<Slice> key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+  std::vector<Slice> value_slices(num_values);
+  for (int i = 0; i < num_values; i++) {
+    value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+  }
+  b->rep.Merge(column_family->rep, SliceParts(key_slices.data(), num_keys),
+               SliceParts(value_slices.data(), num_values));
+}
+
 void rocksdb_writebatch_delete(
     rocksdb_writebatch_t* b,
     const char* key, size_t klen) {
@@ -1099,6 +1152,35 @@ void rocksdb_writebatch_delete_cf(
   b->rep.Delete(column_family->rep, Slice(key, klen));
 }
 
+void rocksdb_writebatch_deletev(
+    rocksdb_writebatch_t* b,
+    int num_keys, const char* const* keys_list,
+    const size_t* keys_list_sizes) {
+  std::vector<Slice> key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+  b->rep.Delete(SliceParts(key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_deletev_cf(
+    rocksdb_writebatch_t* b,
+    rocksdb_column_family_handle_t* column_family,
+    int num_keys, const char* const* keys_list,
+    const size_t* keys_list_sizes) {
+  std::vector<Slice> key_slices(num_keys);
+  for (int i = 0; i < num_keys; i++) {
+    key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+  }
+  b->rep.Delete(column_family->rep, SliceParts(key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_put_log_data(
+    rocksdb_writebatch_t* b,
+    const char* blob, size_t len) {
+  b->rep.PutLogData(Slice(blob, len));
+}
+
 void rocksdb_writebatch_iterate(
     rocksdb_writebatch_t* b,
     void* state,
@@ -1186,6 +1268,26 @@ void rocksdb_block_based_options_set_whole_key_filtering(
   options->rep.whole_key_filtering = v;
 }
 
+void rocksdb_block_based_options_set_format_version(
+    rocksdb_block_based_table_options_t* options, int v) {
+  options->rep.format_version = v;
+}
+
+void rocksdb_block_based_options_set_index_type(
+    rocksdb_block_based_table_options_t* options, int v) {
+  options->rep.index_type = static_cast<BlockBasedTableOptions::IndexType>(v);
+}
+
+void rocksdb_block_based_options_set_hash_index_allow_collision(
+    rocksdb_block_based_table_options_t* options, unsigned char v) {
+  options->rep.hash_index_allow_collision = v;
+}
+
+void rocksdb_block_based_options_set_cache_index_and_filter_blocks(
+    rocksdb_block_based_table_options_t* options, unsigned char v) {
+  options->rep.cache_index_and_filter_blocks = v;
+}
+
 void rocksdb_options_set_block_based_table_factory(
     rocksdb_options_t *opt,
     rocksdb_block_based_table_options_t* table_options) {
@@ -1293,11 +1395,6 @@ void rocksdb_options_set_merge_operator(
   opt->rep.merge_operator = std::shared_ptr<MergeOperator>(merge_operator);
 }
 
-void rocksdb_options_set_compaction_filter_factory_v2(
-    rocksdb_options_t* opt,
-    rocksdb_compactionfilterfactoryv2_t* compaction_filter_factory_v2) {
-  opt->rep.compaction_filter_factory_v2 = std::shared_ptr<CompactionFilterFactoryV2>(compaction_filter_factory_v2);
-}
 
 void rocksdb_options_set_create_if_missing(
     rocksdb_options_t* opt, unsigned char v) {
@@ -1412,10 +1509,8 @@ void rocksdb_options_set_level0_stop_writes_trigger(
   opt->rep.level0_stop_writes_trigger = n;
 }
 
-void rocksdb_options_set_max_mem_compaction_level(
-    rocksdb_options_t* opt, int n) {
-  opt->rep.max_mem_compaction_level = n;
-}
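+// noop: the underlying option was removed; the setter is kept for API
+// compatibility.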
+void rocksdb_options_set_max_mem_compaction_level(rocksdb_options_t* opt,
+                                                  int n) {}
 
 void rocksdb_options_set_compression(rocksdb_options_t* opt, int t) {
   opt->rep.compression = static_cast<CompressionType>(t);
@@ -1477,10 +1572,9 @@ void rocksdb_options_set_manifest_preallocation_size(
   opt->rep.manifest_preallocation_size = v;
 }
 
-void rocksdb_options_set_purge_redundant_kvs_while_flush(
-    rocksdb_options_t* opt, unsigned char v) {
-  opt->rep.purge_redundant_kvs_while_flush = v;
-}
+// noop
+void rocksdb_options_set_purge_redundant_kvs_while_flush(rocksdb_options_t* opt,
+                                                         unsigned char v) {}
 
 void rocksdb_options_set_allow_os_buffer(rocksdb_options_t* opt,
                                          unsigned char v) {
@@ -1568,6 +1662,11 @@ void rocksdb_options_set_min_write_buffer_number_to_merge(rocksdb_options_t* opt
   opt->rep.min_write_buffer_number_to_merge = n;
 }
 
+void rocksdb_options_set_max_write_buffer_number_to_maintain(
+    rocksdb_options_t* opt, int n) {
+  opt->rep.max_write_buffer_number_to_maintain = n;
+}
+
 void rocksdb_options_set_max_background_compactions(rocksdb_options_t* opt, int n) {
   opt->rep.max_background_compactions = n;
 }
@@ -1732,10 +1831,17 @@ void rocksdb_options_set_fifo_compaction_options(
   opt->rep.compaction_options_fifo = fifo->rep;
 }
 
+char *rocksdb_options_statistics_get_string(rocksdb_options_t *opt) {
+  rocksdb::Statistics *statistics = opt->rep.statistics.get();
+  if (statistics) {
+    return strdup(statistics->ToString().c_str());
+  }
+  return nullptr;
+}
+
 /*
 TODO:
 DB::OpenForReadOnly
-DB::MultiGet
 DB::KeyMayExist
 DB::GetOptions
 DB::GetSortedWalFiles
@@ -1799,46 +1905,6 @@ void rocksdb_compactionfilterfactory_destroy(
   delete factory;
 }
 
-rocksdb_compactionfilterv2_t* rocksdb_compactionfilterv2_create(
-    void* state,
-    void (*destructor)(void*),
-    void (*filter)(void*, int level, size_t num_keys,
-                   const char* const* keys_list, const size_t* keys_list_sizes,
-                   const char* const* existing_values_list, const size_t* existing_values_list_sizes,
-                   char** new_values_list, size_t* new_values_list_sizes,
-                   unsigned char* to_delete_list),
-    const char* (*name)(void*)) {
-  rocksdb_compactionfilterv2_t* result = new rocksdb_compactionfilterv2_t;
-  result->state_ = state;
-  result->destructor_ = destructor;
-  result->filter_ = filter;
-  result->name_ = name;
-  return result;
-}
-
-void rocksdb_compactionfilterv2_destroy(rocksdb_compactionfilterv2_t* filter) {
-  delete filter;
-}
-
-rocksdb_compactionfilterfactoryv2_t* rocksdb_compactionfilterfactoryv2_create(
-    void* state,
-    rocksdb_slicetransform_t* prefix_extractor,
-    void (*destructor)(void*),
-    rocksdb_compactionfilterv2_t* (*create_compaction_filter_v2)(
-        void* state, const rocksdb_compactionfiltercontext_t* context),
-    const char* (*name)(void*)) {
-  rocksdb_compactionfilterfactoryv2_t* result = new rocksdb_compactionfilterfactoryv2_t(prefix_extractor);
-  result->state_ = state;
-  result->destructor_ = destructor;
-  result->create_compaction_filter_v2_ = create_compaction_filter_v2;
-  result->name_ = name;
-  return result;
-}
-
-void rocksdb_compactionfilterfactoryv2_destroy(rocksdb_compactionfilterfactoryv2_t* factory) {
-  delete factory;
-}
-
 rocksdb_comparator_t* rocksdb_comparator_create(
     void* state,
     void (*destructor)(void*),
@@ -2046,6 +2112,10 @@ void rocksdb_env_set_high_priority_background_threads(rocksdb_env_t* env, int n)
   env->rep->SetBackgroundThreads(n, Env::HIGH);
 }
 
+void rocksdb_env_join_all_threads(rocksdb_env_t* env) {
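+  // Waits for all threads started via Env::StartThread to terminate.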
+  env->rep->WaitForJoin();
+}
+
 void rocksdb_env_destroy(rocksdb_env_t* env) {
   if (!env->is_default) delete env->rep;
   delete env;
@@ -2100,6 +2170,27 @@ rocksdb_slicetransform_t* rocksdb_slicetransform_create_fixed_prefix(size_t pref
   return wrapper;
 }
 
+rocksdb_slicetransform_t* rocksdb_slicetransform_create_noop() {
+  struct Wrapper : public rocksdb_slicetransform_t {
+    const SliceTransform* rep_;
+    ~Wrapper() { delete rep_; }
+    const char* Name() const override { return rep_->Name(); }
+    Slice Transform(const Slice& src) const override {
+      return rep_->Transform(src);
+    }
+    bool InDomain(const Slice& src) const override {
+      return rep_->InDomain(src);
+    }
+    bool InRange(const Slice& src) const override { return rep_->InRange(src); }
+    static void DoNothing(void*) { }
+  };
+  Wrapper* wrapper = new Wrapper;
+  wrapper->rep_ = rocksdb::NewNoopTransform();
+  wrapper->state_ = nullptr;
+  wrapper->destructor_ = &Wrapper::DoNothing;
+  return wrapper;
+}
+
 rocksdb_universal_compaction_options_t* rocksdb_universal_compaction_options_create() {
   rocksdb_universal_compaction_options_t* result = new rocksdb_universal_compaction_options_t;
   result->rep = new rocksdb::CompactionOptionsUniversal;
@@ -2215,6 +2306,17 @@ extern void rocksdb_livefiles_destroy(
   delete lf;
 }
 
+void rocksdb_get_options_from_string(const rocksdb_options_t* base_options,
+                                     const char* opts_str,
+                                     rocksdb_options_t* new_options,
+                                     char** errptr) {
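+  // opts_str is a semicolon-separated list of name=value pairs, e.g.
+  // "write_buffer_size=1048576;max_write_buffer_number=4".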
+  SaveError(errptr,
+            GetOptionsFromString(base_options->rep, std::string(opts_str),
+                                 &new_options->rep));
+}
+
+void rocksdb_free(void* ptr) { free(ptr); }
+
 }  // end extern "C"
 
-#endif  // ROCKSDB_LITE
+#endif  // !ROCKSDB_LITE
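
As a quick orientation for the new C API surface above, the following is a
minimal sketch (not part of the commit) that exercises
rocksdb_writebatch_put_log_data, rocksdb_options_statistics_get_string and
rocksdb_free together. rocksdb_options_enable_statistics is assumed from the
pre-existing C API, and opening/writing the database is elided:

    #include <stdio.h>
    #include "rocksdb/c.h"

    int main(void) {
      rocksdb_options_t* options = rocksdb_options_create();
      rocksdb_options_enable_statistics(options);  /* assumed pre-existing C API */

      /* PutLogData blobs are written to the WAL only, never to the database. */
      rocksdb_writebatch_t* wb = rocksdb_writebatch_create();
      rocksdb_writebatch_put_log_data(wb, "audit-blob", 10);
      /* ... rocksdb_open() and rocksdb_write() elided ... */
      rocksdb_writebatch_destroy(wb);

      /* The string is strdup()'d by the C layer (NULL when no statistics are
         attached); release it with the new rocksdb_free(). */
      char* stats = rocksdb_options_statistics_get_string(options);
      if (stats != NULL) {
        fprintf(stderr, "%s\n", stats);
        rocksdb_free(stats);
      }
      rocksdb_options_destroy(options);
      return 0;
    }
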
diff --git a/src/rocksdb/db/c_test.c b/src/rocksdb/db/c_test.c
index 2a9dc20..5543627 100644
--- a/src/rocksdb/db/c_test.c
+++ b/src/rocksdb/db/c_test.c
@@ -2,6 +2,8 @@
    Use of this source code is governed by a BSD-style license that can be
    found in the LICENSE file. See the AUTHORS file for names of contributors. */
 
+#ifndef ROCKSDB_LITE  // Lite does not support C API
+
 #include "rocksdb/c.h"
 
 #include <stddef.h>
@@ -9,9 +11,31 @@
 #include <stdlib.h>
 #include <string.h>
 #include <sys/types.h>
-#include <unistd.h>
+#ifndef OS_WIN
+#  include <unistd.h>
+#endif
 #include <inttypes.h>
 
+// Cannot use port/port.h macros since this is a C file
+#ifdef OS_WIN
+
+#include <Windows.h>
+
+# define snprintf _snprintf
+
+// Not a real euid: combining the process and thread ids is unique enough here.
+int geteuid() {
+
+  int result = 0;
+
+  result = ((int)GetCurrentProcessId() << 16);
+  result |= (int)GetCurrentThreadId();
+
+  return result;
+}
+
+#endif
+
 const char* phase = "";
 static char dbname[200];
 static char dbbackupname[200];
@@ -227,79 +251,6 @@ static rocksdb_t* CheckCompaction(rocksdb_t* db, rocksdb_options_t* options,
   return db;
 }
 
-// Custom compaction filter V2.
-static void CompactionFilterV2Destroy(void* arg) { }
-static const char* CompactionFilterV2Name(void* arg) {
-  return "TestCompactionFilterV2";
-}
-static void CompactionFilterV2Filter(
-    void* arg, int level, size_t num_keys,
-    const char* const* keys_list, const size_t* keys_list_sizes,
-    const char* const* existing_values_list, const size_t* existing_values_list_sizes,
-    char** new_values_list, size_t* new_values_list_sizes,
-    unsigned char* to_delete_list) {
-  size_t i;
-  for (i = 0; i < num_keys; i++) {
-    // If any value is "gc", it's removed.
-    if (existing_values_list_sizes[i] == 2 && memcmp(existing_values_list[i], "gc", 2) == 0) {
-      to_delete_list[i] = 1;
-    } else if (existing_values_list_sizes[i] == 6 && memcmp(existing_values_list[i], "gc all", 6) == 0) {
-      // If any value is "gc all", all keys are removed.
-      size_t j;
-      for (j = 0; j < num_keys; j++) {
-        to_delete_list[j] = 1;
-      }
-      return;
-    } else if (existing_values_list_sizes[i] == 6 && memcmp(existing_values_list[i], "change", 6) == 0) {
-      // If value is "change", set changed value to "changed".
-      size_t len;
-      len = strlen("changed");
-      new_values_list[i] = malloc(len);
-      memcpy(new_values_list[i], "changed", len);
-      new_values_list_sizes[i] = len;
-    } else {
-      // Otherwise, no keys are removed.
-    }
-  }
-}
-
-// Custom prefix extractor for compaction filter V2 which extracts first 3 characters.
-static void CFV2PrefixExtractorDestroy(void* arg) { }
-static char* CFV2PrefixExtractorTransform(void* arg, const char* key, size_t length, size_t* dst_length) {
-  // Verify keys are maximum length 4; this verifies fix for a
-  // prior bug which was passing the RocksDB-encoded key with
-  // logical timestamp suffix instead of parsed user key.
-  if (length > 4) {
-    fprintf(stderr, "%s:%d: %s: key %s is not user key\n", __FILE__, __LINE__, phase, key);
-    abort();
-  }
-  *dst_length = length < 3 ? length : 3;
-  return (char*)key;
-}
-static unsigned char CFV2PrefixExtractorInDomain(void* state, const char* key, size_t length) {
-  return 1;
-}
-static unsigned char CFV2PrefixExtractorInRange(void* state, const char* key, size_t length) {
-  return 1;
-}
-static const char* CFV2PrefixExtractorName(void* state) {
-  return "TestCFV2PrefixExtractor";
-}
-
-// Custom compaction filter factory V2.
-static void CompactionFilterFactoryV2Destroy(void* arg) {
-  rocksdb_slicetransform_destroy((rocksdb_slicetransform_t*)arg);
-}
-static const char* CompactionFilterFactoryV2Name(void* arg) {
-  return "TestCompactionFilterFactoryV2";
-}
-static rocksdb_compactionfilterv2_t* CompactionFilterFactoryV2Create(
-    void* state, const rocksdb_compactionfiltercontext_t* context) {
-  return rocksdb_compactionfilterv2_create(state, CompactionFilterV2Destroy,
-                                           CompactionFilterV2Filter,
-                                           CompactionFilterV2Name);
-}
-
 // Custom merge operator
 static void MergeOperatorDestroy(void* arg) { }
 static const char* MergeOperatorName(void* arg) {
@@ -465,6 +416,24 @@ int main(int argc, char** argv) {
     rocksdb_writebatch_destroy(wb);
   }
 
+  StartPhase("writebatch_vectors");
+  {
+    rocksdb_writebatch_t* wb = rocksdb_writebatch_create();
+    const char* k_list[2] = { "z", "ap" };
+    const size_t k_sizes[2] = { 1, 2 };
+    const char* v_list[3] = { "x", "y", "z" };
+    const size_t v_sizes[3] = { 1, 1, 1 };
+    rocksdb_writebatch_putv(wb, 2, k_list, k_sizes, 3, v_list, v_sizes);
+    rocksdb_write(db, woptions, wb, &err);
+    CheckNoError(err);
+    CheckGet(db, roptions, "zap", "xyz");
+    rocksdb_writebatch_delete(wb, "zap", 3);
+    rocksdb_write(db, woptions, wb, &err);
+    CheckNoError(err);
+    CheckGet(db, roptions, "zap", NULL);
+    rocksdb_writebatch_destroy(wb);
+  }
+
   StartPhase("writebatch_rep");
   {
     rocksdb_writebatch_t* wb1 = rocksdb_writebatch_create();
@@ -505,6 +474,33 @@ int main(int argc, char** argv) {
     rocksdb_iter_destroy(iter);
   }
 
+  StartPhase("multiget");
+  {
+    const char* keys[3] = { "box", "foo", "notfound" };
+    const size_t keys_sizes[3] = { 3, 3, 8 };
+    char* vals[3];
+    size_t vals_sizes[3];
+    char* errs[3];
+    rocksdb_multi_get(db, roptions, 3, keys, keys_sizes, vals, vals_sizes, errs);
+
+    int i;
+    for (i = 0; i < 3; i++) {
+      CheckEqual(NULL, errs[i], 0);
+      switch (i) {
+      case 0:
+        CheckEqual("c", vals[i], vals_sizes[i]);
+        break;
+      case 1:
+        CheckEqual("hello", vals[i], vals_sizes[i]);
+        break;
+      case 2:
+        CheckEqual(NULL, vals[i], vals_sizes[i]);
+        break;
+      }
+      Free(&vals[i]);
+    }
+  }
+
   StartPhase("approximate_sizes");
   {
     int i;
@@ -653,50 +649,6 @@ int main(int argc, char** argv) {
     rocksdb_options_destroy(options_with_filter_factory);
   }
 
-  StartPhase("compaction_filter_v2");
-  {
-    rocksdb_compactionfilterfactoryv2_t* factory;
-    rocksdb_slicetransform_t* prefix_extractor;
-    prefix_extractor = rocksdb_slicetransform_create(
-        NULL, CFV2PrefixExtractorDestroy, CFV2PrefixExtractorTransform,
-        CFV2PrefixExtractorInDomain, CFV2PrefixExtractorInRange,
-        CFV2PrefixExtractorName);
-    factory = rocksdb_compactionfilterfactoryv2_create(
-        prefix_extractor, prefix_extractor, CompactionFilterFactoryV2Destroy,
-        CompactionFilterFactoryV2Create, CompactionFilterFactoryV2Name);
-    // Create new database
-    rocksdb_close(db);
-    rocksdb_destroy_db(options, dbname, &err);
-    rocksdb_options_set_compaction_filter_factory_v2(options, factory);
-    db = rocksdb_open(options, dbname, &err);
-    CheckNoError(err);
-    // Only foo2 is GC'd, foo3 is changed.
-    rocksdb_put(db, woptions, "foo1", 4, "no gc", 5, &err);
-    CheckNoError(err);
-    rocksdb_put(db, woptions, "foo2", 4, "gc", 2, &err);
-    CheckNoError(err);
-    rocksdb_put(db, woptions, "foo3", 4, "change", 6, &err);
-    CheckNoError(err);
-    // All bars are GC'd.
-    rocksdb_put(db, woptions, "bar1", 4, "no gc", 5, &err);
-    CheckNoError(err);
-    rocksdb_put(db, woptions, "bar2", 4, "gc all", 6, &err);
-    CheckNoError(err);
-    rocksdb_put(db, woptions, "bar3", 4, "no gc", 5, &err);
-    CheckNoError(err);
-    // Compact the DB to garbage collect.
-    rocksdb_compact_range(db, NULL, 0, NULL, 0);
-
-    // Verify foo entries.
-    CheckGet(db, roptions, "foo1", "no gc");
-    CheckGet(db, roptions, "foo2", NULL);
-    CheckGet(db, roptions, "foo3", "changed");
-    // Verify bar entries were all deleted.
-    CheckGet(db, roptions, "bar1", NULL);
-    CheckGet(db, roptions, "bar2", NULL);
-    CheckGet(db, roptions, "bar3", NULL);
-  }
-
   StartPhase("merge_operator");
   {
     rocksdb_mergeoperator_t* merge_operator;
@@ -778,12 +730,36 @@ int main(int argc, char** argv) {
     CheckGetCF(db, roptions, handles[1], "box", "c");
     rocksdb_writebatch_destroy(wb);
 
+    const char* keys[3] = { "box", "box", "barfooxx" };
+    const rocksdb_column_family_handle_t* get_handles[3] = { handles[0], handles[1], handles[1] };
+    const size_t keys_sizes[3] = { 3, 3, 8 };
+    char* vals[3];
+    size_t vals_sizes[3];
+    char* errs[3];
+    rocksdb_multi_get_cf(db, roptions, get_handles, 3, keys, keys_sizes, vals, vals_sizes, errs);
+
+    int i;
+    for (i = 0; i < 3; i++) {
+      CheckEqual(NULL, errs[i], 0);
+      switch (i) {
+      case 0:
+        CheckEqual(NULL, vals[i], vals_sizes[i]); // wrong cf
+        break;
+      case 1:
+        CheckEqual("c", vals[i], vals_sizes[i]); // bingo
+        break;
+      case 2:
+        CheckEqual(NULL, vals[i], vals_sizes[i]); // normal not found
+        break;
+      }
+      Free(&vals[i]);
+    }
+
     rocksdb_iterator_t* iter = rocksdb_create_iterator_cf(db, roptions, handles[1]);
     CheckCondition(!rocksdb_iter_valid(iter));
     rocksdb_iter_seek_to_first(iter);
     CheckCondition(rocksdb_iter_valid(iter));
 
-    int i;
     for (i = 0; rocksdb_iter_valid(iter) != 0; rocksdb_iter_next(iter)) {
       i++;
     }
@@ -938,3 +914,13 @@ int main(int argc, char** argv) {
   fprintf(stderr, "PASS\n");
   return 0;
 }
+
+#else
+#include <stdio.h>
+
+int main() {
+  fprintf(stderr, "SKIPPED\n");
+  return 0;
+}
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/column_family.cc b/src/rocksdb/db/column_family.cc
index 7df5c97..88bf033 100644
--- a/src/rocksdb/db/column_family.cc
+++ b/src/rocksdb/db/column_family.cc
@@ -21,50 +21,21 @@
 
 #include "db/compaction_picker.h"
 #include "db/db_impl.h"
-#include "db/job_context.h"
-#include "db/version_set.h"
-#include "db/writebuffer.h"
 #include "db/internal_stats.h"
 #include "db/job_context.h"
 #include "db/table_properties_collector.h"
 #include "db/version_set.h"
 #include "db/write_controller.h"
+#include "db/writebuffer.h"
 #include "util/autovector.h"
+#include "util/compression.h"
 #include "util/hash_skiplist_rep.h"
 #include "util/options_helper.h"
+#include "util/thread_status_util.h"
+#include "util/xfunc.h"
 
 namespace rocksdb {
 
-namespace {
-// This function computes the amount of time in microseconds by which a write
-// should be delayed based on the number of level-0 files according to the
-// following formula:
-// if n < bottom, return 0;
-// if n >= top, return 1000;
-// otherwise, let r = (n - bottom) /
-//                    (top - bottom)
-//  and return r^2 * 1000.
-// The goal of this formula is to gradually increase the rate at which writes
-// are slowed. We also tried linear delay (r * 1000), but it seemed to do
-// slightly worse. There is no other particular reason for choosing quadratic.
-uint64_t SlowdownAmount(int n, double bottom, double top) {
-  uint64_t delay;
-  if (n >= top) {
-    delay = 1000;
-  } else if (n < bottom) {
-    delay = 0;
-  } else {
-    // If we are here, we know that:
-    //   level0_start_slowdown <= n < level0_slowdown
-    // since the previous two conditions are false.
-    double how_much = static_cast<double>(n - bottom) / (top - bottom);
-    delay = std::max(how_much * how_much * 1000, 100.0);
-  }
-  assert(delay <= 1000);
-  return delay;
-}
-}  // namespace
-
 ColumnFamilyHandleImpl::ColumnFamilyHandleImpl(
     ColumnFamilyData* column_family_data, DBImpl* db, InstrumentedMutex* mutex)
     : cfd_(column_family_data), db_(db), mutex_(mutex) {
@@ -87,6 +58,7 @@ ColumnFamilyHandleImpl::~ColumnFamilyHandleImpl() {
     if (job_context.HaveSomethingToDelete()) {
       db_->PurgeObsoleteFiles(job_context);
     }
+    job_context.Clean();
   }
 }
 
@@ -116,6 +88,28 @@ void GetIntTblPropCollectorFactory(
       new InternalKeyPropertiesCollectorFactory);
 }
 
+Status CheckCompressionSupported(const ColumnFamilyOptions& cf_options) {
+  if (!cf_options.compression_per_level.empty()) {
+    for (size_t level = 0; level < cf_options.compression_per_level.size();
+         ++level) {
+      if (!CompressionTypeSupported(cf_options.compression_per_level[level])) {
+        return Status::InvalidArgument(
+            "Compression type " +
+            CompressionTypeToString(cf_options.compression_per_level[level]) +
+            " is not linked with the binary.");
+      }
+    }
+  } else {
+    if (!CompressionTypeSupported(cf_options.compression)) {
+      return Status::InvalidArgument(
+          "Compression type " +
+          CompressionTypeToString(cf_options.compression) +
+          " is not linked with the binary.");
+    }
+  }
+  return Status::OK();
+}
+
 ColumnFamilyOptions SanitizeOptions(const DBOptions& db_options,
                                     const InternalKeyComparator* icmp,
                                     const ColumnFamilyOptions& src) {
@@ -131,20 +125,38 @@ ColumnFamilyOptions SanitizeOptions(const DBOptions& db_options,
   // if user sets arena_block_size, we trust user to use this value. Otherwise,
   // calculate a proper value from writer_buffer_size;
   if (result.arena_block_size <= 0) {
-    result.arena_block_size = result.write_buffer_size / 10;
+    result.arena_block_size = result.write_buffer_size / 8;
+
+    // Align up to 4k
+    const size_t align = 4 * 1024;
+    result.arena_block_size =
+        ((result.arena_block_size + align - 1) / align) * align;
   }
   result.min_write_buffer_number_to_merge =
       std::min(result.min_write_buffer_number_to_merge,
                result.max_write_buffer_number - 1);
-  if (result.max_mem_compaction_level >= result.num_levels) {
-    result.max_mem_compaction_level = result.num_levels - 1;
+  if (result.num_levels < 1) {
+    result.num_levels = 1;
   }
-  if (result.soft_rate_limit > result.hard_rate_limit) {
-    result.soft_rate_limit = result.hard_rate_limit;
+  if (result.compaction_style == kCompactionStyleLevel &&
+      result.num_levels < 2) {
+    result.num_levels = 2;
   }
   if (result.max_write_buffer_number < 2) {
     result.max_write_buffer_number = 2;
   }
+  if (result.max_write_buffer_number_to_maintain < 0) {
+    result.max_write_buffer_number_to_maintain = result.max_write_buffer_number;
+  }
+  XFUNC_TEST("memtablelist_history", "transaction_xftest_SanitizeOptions",
+             xf_transaction_set_memtable_history1,
+             xf_transaction_set_memtable_history,
+             &result.max_write_buffer_number_to_maintain);
+  XFUNC_TEST("memtablelist_history_clear", "transaction_xftest_SanitizeOptions",
+             xf_transaction_clear_memtable_history1,
+             xf_transaction_clear_memtable_history,
+             &result.max_write_buffer_number_to_maintain);
+
   if (!result.prefix_extractor) {
     assert(result.memtable_factory);
     Slice name = result.memtable_factory->Name();
@@ -154,24 +166,6 @@ ColumnFamilyOptions SanitizeOptions(const DBOptions& db_options,
     }
   }
 
-  if (!src.compression_per_level.empty()) {
-    for (size_t level = 0; level < src.compression_per_level.size(); ++level) {
-      if (!CompressionTypeSupported(src.compression_per_level[level])) {
-        Log(InfoLogLevel::WARN_LEVEL, db_options.info_log,
-            "Compression type chosen for level %zu is not supported: %s. "
-            "RocksDB "
-            "will not compress data on level %zu.",
-            level, CompressionTypeToString(src.compression_per_level[level]),
-            level);
-      }
-    }
-  } else if (!CompressionTypeSupported(src.compression)) {
-    Log(InfoLogLevel::WARN_LEVEL, db_options.info_log,
-        "Compression type chosen is not supported: %s. RocksDB will not "
-        "compress data.",
-        CompressionTypeToString(src.compression));
-  }
-
   if (result.compaction_style == kCompactionStyleFIFO) {
     result.num_levels = 1;
     // since we delete level0 files in FIFO compaction when there are too many
@@ -252,6 +246,9 @@ void SuperVersion::Cleanup() {
   imm->Unref(&to_delete);
   MemTable* m = mem->Unref();
   if (m != nullptr) {
+    auto* memory_usage = current->cfd()->imm()->current_memory_usage();
+    assert(*memory_usage >= m->ApproximateMemoryUsage());
+    *memory_usage -= m->ApproximateMemoryUsage();
     to_delete.push_back(m);
   }
   current->Unref();
@@ -302,7 +299,8 @@ ColumnFamilyData::ColumnFamilyData(
       mutable_cf_options_(options_, ioptions_),
       write_buffer_(write_buffer),
       mem_(nullptr),
-      imm_(options_.min_write_buffer_number_to_merge),
+      imm_(options_.min_write_buffer_number_to_merge,
+           options_.max_write_buffer_number_to_maintain),
       super_version_(nullptr),
       super_version_number_(0),
       local_sv_(new ThreadLocalPtr(&SuperVersionUnrefHandle)),
@@ -352,7 +350,7 @@ ColumnFamilyData::ColumnFamilyData(
     if (column_family_set_->NumberOfColumnFamilies() < 10) {
       Log(InfoLogLevel::INFO_LEVEL, ioptions_.info_log,
           "--------------- Options for column family [%s]:\n", name.c_str());
-      options_.Dump(ioptions_.info_log);
+      options_.DumpCFOptions(ioptions_.info_log);
     } else {
       Log(InfoLogLevel::INFO_LEVEL, ioptions_.info_log,
           "\t(skipping printing options)\n");
@@ -435,58 +433,56 @@ void ColumnFamilyData::RecalculateWriteStallConditions(
     auto* vstorage = current_->storage_info();
     const double score = vstorage->max_compaction_score();
     const int max_level = vstorage->max_compaction_score_level();
-
     auto write_controller = column_family_set_->write_controller_;
 
-    if (imm()->size() >= mutable_cf_options.max_write_buffer_number) {
+    if (imm()->NumNotFlushed() >= mutable_cf_options.max_write_buffer_number) {
       write_controller_token_ = write_controller->GetStopToken();
       internal_stats_->AddCFStats(InternalStats::MEMTABLE_COMPACTION, 1);
       Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log,
           "[%s] Stopping writes because we have %d immutable memtables "
           "(waiting for flush), max_write_buffer_number is set to %d",
-          name_.c_str(), imm()->size(),
+          name_.c_str(), imm()->NumNotFlushed(),
           mutable_cf_options.max_write_buffer_number);
     } else if (vstorage->l0_delay_trigger_count() >=
                mutable_cf_options.level0_stop_writes_trigger) {
       write_controller_token_ = write_controller->GetStopToken();
-      internal_stats_->AddCFStats(InternalStats::LEVEL0_NUM_FILES, 1);
+      internal_stats_->AddCFStats(InternalStats::LEVEL0_NUM_FILES_TOTAL, 1);
+      if (compaction_picker_->IsLevel0CompactionInProgress()) {
+        internal_stats_->AddCFStats(
+            InternalStats::LEVEL0_NUM_FILES_WITH_COMPACTION, 1);
+      }
       Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log,
           "[%s] Stopping writes because we have %d level-0 files",
           name_.c_str(), vstorage->l0_delay_trigger_count());
+    } else if (mutable_cf_options.hard_pending_compaction_bytes_limit > 0 &&
+               vstorage->estimated_compaction_needed_bytes() >=
+                   mutable_cf_options.hard_pending_compaction_bytes_limit) {
+      write_controller_token_ = write_controller->GetStopToken();
+      internal_stats_->AddCFStats(
+          InternalStats::HARD_PENDING_COMPACTION_BYTES_LIMIT, 1);
+      Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log,
+          "[%s] Stopping writes because estimated pending compaction "
+          "bytes exceed %" PRIu64,
+          name_.c_str(), vstorage->estimated_compaction_needed_bytes());
     } else if (mutable_cf_options.level0_slowdown_writes_trigger >= 0 &&
                vstorage->l0_delay_trigger_count() >=
                    mutable_cf_options.level0_slowdown_writes_trigger) {
-      uint64_t slowdown =
-          SlowdownAmount(vstorage->l0_delay_trigger_count(),
-                         mutable_cf_options.level0_slowdown_writes_trigger,
-                         mutable_cf_options.level0_stop_writes_trigger);
-      write_controller_token_ = write_controller->GetDelayToken(slowdown);
-      internal_stats_->AddCFStats(InternalStats::LEVEL0_SLOWDOWN, slowdown);
-      Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log,
-          "[%s] Stalling writes because we have %d level-0 files (%" PRIu64
-          "us)",
-          name_.c_str(), vstorage->l0_delay_trigger_count(), slowdown);
-    } else if (mutable_cf_options.hard_rate_limit > 1.0 &&
-               score > mutable_cf_options.hard_rate_limit) {
-      uint64_t kHardLimitSlowdown = 1000;
-      write_controller_token_ =
-          write_controller->GetDelayToken(kHardLimitSlowdown);
-      internal_stats_->RecordLevelNSlowdown(max_level, false);
+      write_controller_token_ = write_controller->GetDelayToken();
+      internal_stats_->AddCFStats(InternalStats::LEVEL0_SLOWDOWN_TOTAL, 1);
+      if (compaction_picker_->IsLevel0CompactionInProgress()) {
+        internal_stats_->AddCFStats(
+            InternalStats::LEVEL0_SLOWDOWN_WITH_COMPACTION, 1);
+      }
       Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log,
-          "[%s] Stalling writes because we hit hard limit on level %d. "
-          "(%" PRIu64 "us)",
-          name_.c_str(), max_level, kHardLimitSlowdown);
+          "[%s] Stalling writes because we have %d level-0 files",
+          name_.c_str(), vstorage->l0_delay_trigger_count());
     } else if (mutable_cf_options.soft_rate_limit > 0.0 &&
                score > mutable_cf_options.soft_rate_limit) {
-      uint64_t slowdown = SlowdownAmount(score,
-          mutable_cf_options.soft_rate_limit,
-          mutable_cf_options.hard_rate_limit);
-      write_controller_token_ = write_controller->GetDelayToken(slowdown);
+      write_controller_token_ = write_controller->GetDelayToken();
       internal_stats_->RecordLevelNSlowdown(max_level, true);
       Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log,
-          "[%s] Stalling writes because we hit soft limit on level %d (%" PRIu64
-          "us)",
-          name_.c_str(), max_level, slowdown);
+          "[%s] Stalling writes because we hit soft limit on level %d",
+          name_.c_str(), max_level);
     } else {
       write_controller_token_.reset();
     }
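
The rewritten stall logic above evaluates conditions in a fixed priority order: full stops first (too many unflushed immutable memtables, too many level-0 files, estimated pending compaction bytes over the hard limit), then delays (level-0 slowdown trigger, soft rate limit on the compaction score). A simplified, self-contained sketch of that decision ladder; the function and parameter names are illustrative, not RocksDB API:

    #include <cstdint>

    enum class WriteStall { kStop, kDelay, kNone };

    WriteStall ClassifyStall(int imm_not_flushed, int max_write_buffer_number,
                             int l0_files, int l0_slowdown_trigger,
                             int l0_stop_trigger, uint64_t pending_bytes,
                             uint64_t hard_pending_limit, double score,
                             double soft_rate_limit) {
      if (imm_not_flushed >= max_write_buffer_number) return WriteStall::kStop;
      if (l0_files >= l0_stop_trigger) return WriteStall::kStop;
      if (hard_pending_limit > 0 && pending_bytes >= hard_pending_limit)
        return WriteStall::kStop;
      if (l0_slowdown_trigger >= 0 && l0_files >= l0_slowdown_trigger)
        return WriteStall::kDelay;
      if (soft_rate_limit > 0.0 && score > soft_rate_limit)
        return WriteStall::kDelay;
      return WriteStall::kNone;
    }
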
@@ -505,19 +501,23 @@ uint64_t ColumnFamilyData::GetNumLiveVersions() const {
   return VersionSet::GetNumLiveVersions(dummy_versions_);
 }
 
+uint64_t ColumnFamilyData::GetTotalSstFilesSize() const {
+  return VersionSet::GetTotalSstFilesSize(dummy_versions_);
+}
+
 MemTable* ColumnFamilyData::ConstructNewMemtable(
-    const MutableCFOptions& mutable_cf_options) {
+    const MutableCFOptions& mutable_cf_options, SequenceNumber earliest_seq) {
   assert(current_ != nullptr);
-  return new MemTable(internal_comparator_, ioptions_,
-                      mutable_cf_options, write_buffer_);
+  return new MemTable(internal_comparator_, ioptions_, mutable_cf_options,
+                      write_buffer_, earliest_seq);
 }
 
 void ColumnFamilyData::CreateNewMemtable(
-    const MutableCFOptions& mutable_cf_options) {
+    const MutableCFOptions& mutable_cf_options, SequenceNumber earliest_seq) {
   if (mem_ != nullptr) {
     delete mem_->Unref();
   }
-  SetMemtable(ConstructNewMemtable(mutable_cf_options));
+  SetMemtable(ConstructNewMemtable(mutable_cf_options, earliest_seq));
   mem_->Ref();
 }
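
Both memtable factory methods now take the earliest sequence number the new memtable could contain (see the constructor comment referenced in column_family.h below). A hypothetical caller-side sketch; GetLatestMutableCFOptions() and LastSequence() are assumptions about the surrounding DB code, which this diff does not show:

    // Hypothetical: seed the fresh memtable with the DB's last sequence number.
    SequenceNumber earliest_seq = versions_->LastSequence();
    cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(), earliest_seq);
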
 
@@ -627,51 +627,6 @@ bool ColumnFamilyData::ReturnThreadLocalSuperVersion(SuperVersion* sv) {
   return false;
 }
 
-void ColumnFamilyData::NotifyOnCompactionCompleted(
-    DB* db, Compaction* c, const Status& status) {
-#ifndef ROCKSDB_LITE
-  auto listeners = ioptions()->listeners;
-  assert(listeners.size() > 0U);
-  CompactionJobInfo info;
-  info.cf_name = c->column_family_data()->GetName();
-  info.status = status;
-  info.output_level = c->output_level();
-  for (size_t i = 0; i < c->num_input_levels(); ++i) {
-    for (const auto fmd : *c->inputs(i)) {
-      info.input_files.push_back(
-          TableFileName(options_.db_paths,
-                        fmd->fd.GetNumber(),
-                        fmd->fd.GetPathId()));
-    }
-  }
-  for (const auto newf : c->edit()->GetNewFiles()) {
-    info.output_files.push_back(
-        TableFileName(options_.db_paths,
-                      newf.second.fd.GetNumber(),
-                      newf.second.fd.GetPathId()));
-  }
-  for (auto listener : listeners) {
-    listener->OnCompactionCompleted(db, info);
-  }
-#endif  // ROCKSDB_LITE
-}
-
-void ColumnFamilyData::NotifyOnFlushCompleted(
-    DB* db, const std::string& file_path,
-    bool triggered_flush_slowdown,
-    bool triggered_flush_stop) {
-
-#ifndef ROCKSDB_LITE
-  auto listeners = ioptions()->listeners;
-  for (auto listener : listeners) {
-    listener->OnFlushCompleted(
-        db, GetName(), file_path,
-        // Use path 0 as fulled memtables are first flushed into path 0.
-        triggered_flush_slowdown, triggered_flush_stop);
-  }
-#endif  // ROCKSDB_LITE
-}
-
 SuperVersion* ColumnFamilyData::InstallSuperVersion(
     SuperVersion* new_superversion, InstrumentedMutex* db_mutex) {
   db_mutex->AssertHeld();
diff --git a/src/rocksdb/db/column_family.h b/src/rocksdb/db/column_family.h
index 77af5c7..e44873c 100644
--- a/src/rocksdb/db/column_family.h
+++ b/src/rocksdb/db/column_family.h
@@ -14,15 +14,16 @@
 #include <vector>
 #include <atomic>
 
-#include "rocksdb/options.h"
-#include "rocksdb/db.h"
-#include "rocksdb/env.h"
 #include "db/memtable_list.h"
 #include "db/write_batch_internal.h"
 #include "db/write_controller.h"
 #include "db/table_cache.h"
 #include "db/table_properties_collector.h"
 #include "db/flush_scheduler.h"
+#include "rocksdb/compaction_job_stats.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
 #include "util/instrumented_mutex.h"
 #include "util/mutable_cf_options.h"
 #include "util/thread_local.h"
@@ -129,6 +130,8 @@ struct SuperVersion {
   autovector<MemTable*> to_delete;
 };
 
+extern Status CheckCompressionSupported(const ColumnFamilyOptions& cf_options);
+
 extern ColumnFamilyOptions SanitizeOptions(const DBOptions& db_options,
                                            const InternalKeyComparator* icmp,
                                            const ColumnFamilyOptions& src);
@@ -223,10 +226,14 @@ class ColumnFamilyData {
   Version* dummy_versions() { return dummy_versions_; }
   void SetCurrent(Version* current);
   uint64_t GetNumLiveVersions() const;  // REQUIRE: DB mutex held
-
-  MemTable* ConstructNewMemtable(const MutableCFOptions& mutable_cf_options);
+  uint64_t GetTotalSstFilesSize() const;  // REQUIRE: DB mutex held
   void SetMemtable(MemTable* new_mem) { mem_ = new_mem; }
-  void CreateNewMemtable(const MutableCFOptions& mutable_cf_options);
+
+  // See Memtable constructor for explanation of earliest_seq param.
+  MemTable* ConstructNewMemtable(const MutableCFOptions& mutable_cf_options,
+                                 SequenceNumber earliest_seq);
+  void CreateNewMemtable(const MutableCFOptions& mutable_cf_options,
+                         SequenceNumber earliest_seq);
 
   TableCache* table_cache() const { return table_cache_.get(); }
 
@@ -292,13 +299,6 @@ class ColumnFamilyData {
 
   void ResetThreadLocalSuperVersions();
 
-  void NotifyOnCompactionCompleted(DB* db, Compaction* c, const Status& status);
-
-  void NotifyOnFlushCompleted(
-      DB* db, const std::string& file_path,
-      bool triggered_flush_slowdown,
-      bool triggered_flush_stop);
-
   // Protected by DB mutex
   void set_pending_flush(bool value) { pending_flush_ = value; }
   void set_pending_compaction(bool value) { pending_compaction_ = value; }
diff --git a/src/rocksdb/db/column_family_test.cc b/src/rocksdb/db/column_family_test.cc
index 8be8cd2..a258b83 100644
--- a/src/rocksdb/db/column_family_test.cc
+++ b/src/rocksdb/db/column_family_test.cc
@@ -10,6 +10,7 @@
 #include <algorithm>
 #include <vector>
 #include <string>
+#include <thread>
 
 #include "db/db_impl.h"
 #include "rocksdb/db.h"
@@ -19,8 +20,11 @@
 #include "util/testharness.h"
 #include "util/testutil.h"
 #include "util/coding.h"
+#include "util/sync_point.h"
 #include "utilities/merge_operators.h"
 
+#if !(defined NDEBUG) || !defined(OS_WIN)
+
 namespace rocksdb {
 
 namespace {
@@ -98,10 +102,12 @@ class ColumnFamilyTest : public testing::Test {
                                &db_);
   }
 
+#ifndef ROCKSDB_LITE  // ReadOnlyDB is not supported
   void AssertOpenReadOnly(std::vector<std::string> cf,
                     std::vector<ColumnFamilyOptions> options = {}) {
     ASSERT_OK(OpenReadOnly(cf, options));
   }
+#endif  // !ROCKSDB_LITE
 
 
   void Open(std::vector<std::string> cf,
@@ -121,7 +127,7 @@ class ColumnFamilyTest : public testing::Test {
 #ifndef CYGWIN
     return std::stoi(value);
 #else
-    return std::strtol(value.c_str(), 0);
+    return std::strtol(value.c_str(), 0 /* endptr */, 10 /* base */);
 #endif
   }
 
@@ -186,10 +192,28 @@ class ColumnFamilyTest : public testing::Test {
   }
 
   void WaitForFlush(int cf) {
+#ifndef ROCKSDB_LITE  // TEST functions are not supported in lite
     ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf]));
+#endif  // !ROCKSDB_LITE
+  }
+
+  void WaitForCompaction() {
+#ifndef ROCKSDB_LITE  // TEST functions are not supported in lite
+    ASSERT_OK(dbfull()->TEST_WaitForCompact());
+#endif  // !ROCKSDB_LITE
+  }
+
+  uint64_t MaxTotalInMemoryState() {
+#ifndef ROCKSDB_LITE
+    return dbfull()->TEST_MaxTotalInMemoryState();
+#else
+    return 0;
+#endif  // !ROCKSDB_LITE
   }
 
-  void WaitForCompaction() { ASSERT_OK(dbfull()->TEST_WaitForCompact()); }
+  void AssertMaxTotalInMemoryState(uint64_t value) {
+    ASSERT_EQ(value, MaxTotalInMemoryState());
+  }
 
   Status Put(int cf, const std::string& key, const std::string& value) {
     return db_->Put(WriteOptions(), handles_[cf], Slice(key), Slice(value));
@@ -215,11 +239,13 @@ class ColumnFamilyTest : public testing::Test {
   }
 
   void CompactAll(int cf) {
-    ASSERT_OK(db_->CompactRange(handles_[cf], nullptr, nullptr));
+    ASSERT_OK(db_->CompactRange(CompactRangeOptions(), handles_[cf], nullptr,
+                                nullptr));
   }
 
   void Compact(int cf, const Slice& start, const Slice& limit) {
-    ASSERT_OK(db_->CompactRange(handles_[cf], &start, &limit));
+    ASSERT_OK(
+        db_->CompactRange(CompactRangeOptions(), handles_[cf], &start, &limit));
   }
 
   int NumTableFilesAtLevel(int level, int cf) {
@@ -227,6 +253,7 @@ class ColumnFamilyTest : public testing::Test {
                        "rocksdb.num-files-at-level" + ToString(level));
   }
 
+#ifndef ROCKSDB_LITE
   // Return spread of files per level
   std::string FilesPerLevel(int cf) {
     std::string result;
@@ -243,12 +270,27 @@ class ColumnFamilyTest : public testing::Test {
     result.resize(last_non_zero_offset);
     return result;
   }
+#endif
 
+  void AssertFilesPerLevel(const std::string& value, int cf) {
+#ifndef ROCKSDB_LITE
+    ASSERT_EQ(value, FilesPerLevel(cf));
+#endif
+  }
+
+#ifndef ROCKSDB_LITE  // GetLiveFilesMetaData is not supported
   int CountLiveFiles() {
     std::vector<LiveFileMetaData> metadata;
     db_->GetLiveFilesMetaData(&metadata);
     return static_cast<int>(metadata.size());
   }
+#endif  // !ROCKSDB_LITE
+
+  void AssertCountLiveFiles(int expected_value) {
+#ifndef ROCKSDB_LITE
+    ASSERT_EQ(expected_value, CountLiveFiles());
+#endif
+  }
 
   // Do n memtable flushes, each of which produces an sstable
   // covering the range [small,large].
@@ -261,6 +303,7 @@ class ColumnFamilyTest : public testing::Test {
     }
   }
 
+#ifndef ROCKSDB_LITE  // GetSortedWalFiles is not supported
   int CountLiveLogFiles() {
     int micros_wait_for_log_deletion = 20000;
     env_->SleepForMicroseconds(micros_wait_for_log_deletion);
@@ -287,15 +330,25 @@ class ColumnFamilyTest : public testing::Test {
       }
     }
     return ret;
+    return 0;
+  }
+#endif  // !ROCKSDB_LITE
+
+  void AssertCountLiveLogFiles(int value) {
+#ifndef ROCKSDB_LITE  // GetSortedWalFiles is not supported
+    ASSERT_EQ(value, CountLiveLogFiles());
+#endif  // !ROCKSDB_LITE
   }
 
   void AssertNumberOfImmutableMemtables(std::vector<int> num_per_cf) {
     assert(num_per_cf.size() == handles_.size());
 
+#ifndef ROCKSDB_LITE  // GetProperty is not supported in lite
     for (size_t i = 0; i < num_per_cf.size(); ++i) {
       ASSERT_EQ(num_per_cf[i], GetProperty(static_cast<int>(i),
                                            "rocksdb.num-immutable-mem-table"));
     }
+#endif  // !ROCKSDB_LITE
   }
 
   void CopyFile(const std::string& source, const std::string& destination,
@@ -332,13 +385,6 @@ class ColumnFamilyTest : public testing::Test {
   Random rnd_;
 };
 
-class DumbLogger : public Logger {
- public:
-  using Logger::Logv;
-  virtual void Logv(const char* format, va_list ap) override {}
-  virtual size_t GetLogFileSize() const override { return 0; }
-};
-
 TEST_F(ColumnFamilyTest, DontReuseColumnFamilyID) {
   for (int iter = 0; iter < 3; ++iter) {
     Open();
@@ -408,10 +454,10 @@ TEST_F(ColumnFamilyTest, DropTest) {
     }
     ASSERT_EQ("bar1", Get(1, "1"));
 
-    ASSERT_EQ(CountLiveFiles(), 1);
+    AssertCountLiveFiles(1);
     DropColumnFamilies({1});
     // make sure that all files are deleted when we drop the column family
-    ASSERT_EQ(CountLiveFiles(), 0);
+    AssertCountLiveFiles(0);
     Destroy();
   }
 }
@@ -552,10 +598,9 @@ TEST_F(ColumnFamilyTest, FlushTest) {
 
     for (int i = 0; i < 3; ++i) {
       uint64_t max_total_in_memory_state =
-          dbfull()->TEST_MaxTotalInMemoryState();
+          MaxTotalInMemoryState();
       Flush(i);
-      ASSERT_EQ(dbfull()->TEST_MaxTotalInMemoryState(),
-                max_total_in_memory_state);
+      AssertMaxTotalInMemoryState(max_total_in_memory_state);
     }
     ASSERT_OK(Put(1, "foofoo", "bar"));
     ASSERT_OK(Put(0, "foofoo", "bar"));
@@ -584,13 +629,14 @@ TEST_F(ColumnFamilyTest, FlushTest) {
 // Makes sure that obsolete log files get deleted
 TEST_F(ColumnFamilyTest, LogDeletionTest) {
   db_options_.max_total_wal_size = std::numeric_limits<uint64_t>::max();
+  column_family_options_.arena_block_size = 4 * 1024;
   column_family_options_.write_buffer_size = 100000;  // 100KB
   Open();
   CreateColumnFamilies({"one", "two", "three", "four"});
  // Each bracket is one log file. If a number is in (), it means
  // we don't need it anymore (it's been flushed).
   // []
-  ASSERT_EQ(CountLiveLogFiles(), 0);
+  AssertCountLiveLogFiles(0);
   PutRandomData(0, 1, 100);
   // [0]
   PutRandomData(1, 1, 100);
@@ -598,53 +644,53 @@ TEST_F(ColumnFamilyTest, LogDeletionTest) {
   PutRandomData(1, 1000, 100);
   WaitForFlush(1);
   // [0, (1)] [1]
-  ASSERT_EQ(CountLiveLogFiles(), 2);
+  AssertCountLiveLogFiles(2);
   PutRandomData(0, 1, 100);
   // [0, (1)] [0, 1]
-  ASSERT_EQ(CountLiveLogFiles(), 2);
+  AssertCountLiveLogFiles(2);
   PutRandomData(2, 1, 100);
   // [0, (1)] [0, 1, 2]
   PutRandomData(2, 1000, 100);
   WaitForFlush(2);
   // [0, (1)] [0, 1, (2)] [2]
-  ASSERT_EQ(CountLiveLogFiles(), 3);
+  AssertCountLiveLogFiles(3);
   PutRandomData(2, 1000, 100);
   WaitForFlush(2);
   // [0, (1)] [0, 1, (2)] [(2)] [2]
-  ASSERT_EQ(CountLiveLogFiles(), 4);
+  AssertCountLiveLogFiles(4);
   PutRandomData(3, 1, 100);
   // [0, (1)] [0, 1, (2)] [(2)] [2, 3]
   PutRandomData(1, 1, 100);
   // [0, (1)] [0, 1, (2)] [(2)] [1, 2, 3]
-  ASSERT_EQ(CountLiveLogFiles(), 4);
+  AssertCountLiveLogFiles(4);
   PutRandomData(1, 1000, 100);
   WaitForFlush(1);
   // [0, (1)] [0, (1), (2)] [(2)] [(1), 2, 3] [1]
-  ASSERT_EQ(CountLiveLogFiles(), 5);
+  AssertCountLiveLogFiles(5);
   PutRandomData(0, 1000, 100);
   WaitForFlush(0);
   // [(0), (1)] [(0), (1), (2)] [(2)] [(1), 2, 3] [1, (0)] [0]
   // delete obsolete logs -->
   // [(1), 2, 3] [1, (0)] [0]
-  ASSERT_EQ(CountLiveLogFiles(), 3);
+  AssertCountLiveLogFiles(3);
   PutRandomData(0, 1000, 100);
   WaitForFlush(0);
   // [(1), 2, 3] [1, (0)], [(0)] [0]
-  ASSERT_EQ(CountLiveLogFiles(), 4);
+  AssertCountLiveLogFiles(4);
   PutRandomData(1, 1000, 100);
   WaitForFlush(1);
   // [(1), 2, 3] [(1), (0)] [(0)] [0, (1)] [1]
-  ASSERT_EQ(CountLiveLogFiles(), 5);
+  AssertCountLiveLogFiles(5);
   PutRandomData(2, 1000, 100);
   WaitForFlush(2);
   // [(1), (2), 3] [(1), (0)] [(0)] [0, (1)] [1, (2)], [2]
-  ASSERT_EQ(CountLiveLogFiles(), 6);
+  AssertCountLiveLogFiles(6);
   PutRandomData(3, 1000, 100);
   WaitForFlush(3);
   // [(1), (2), (3)] [(1), (0)] [(0)] [0, (1)] [1, (2)], [2, (3)] [3]
   // delete obsolete logs -->
   // [0, (1)] [1, (2)], [2, (3)] [3]
-  ASSERT_EQ(CountLiveLogFiles(), 4);
+  AssertCountLiveLogFiles(4);
   Close();
 }
 
@@ -661,17 +707,25 @@ TEST_F(ColumnFamilyTest, DifferentWriteBufferSizes) {
   // "two" -> 1MB memtable, start flushing with three immutable memtables
   // "three" -> 90KB memtable, start flushing with four immutable memtables
   default_cf.write_buffer_size = 100000;
+  default_cf.arena_block_size = 4 * 4096;
   default_cf.max_write_buffer_number = 10;
   default_cf.min_write_buffer_number_to_merge = 1;
+  default_cf.max_write_buffer_number_to_maintain = 0;
   one.write_buffer_size = 200000;
+  one.arena_block_size = 4 * 4096;
   one.max_write_buffer_number = 10;
   one.min_write_buffer_number_to_merge = 2;
+  one.max_write_buffer_number_to_maintain = 1;
   two.write_buffer_size = 1000000;
+  two.arena_block_size = 4 * 4096;
   two.max_write_buffer_number = 10;
   two.min_write_buffer_number_to_merge = 3;
-  three.write_buffer_size = 90000;
+  two.max_write_buffer_number_to_maintain = 2;
+  three.write_buffer_size = 4096 * 22 + 2048;
+  three.arena_block_size = 4096;
   three.max_write_buffer_number = 10;
   three.min_write_buffer_number_to_merge = 4;
+  three.max_write_buffer_number_to_maintain = -1;
 
   Reopen({default_cf, one, two, three});
 
@@ -679,72 +733,73 @@ TEST_F(ColumnFamilyTest, DifferentWriteBufferSizes) {
   PutRandomData(0, 100, 1000);
   WaitForFlush(0);
   AssertNumberOfImmutableMemtables({0, 0, 0, 0});
-  ASSERT_EQ(CountLiveLogFiles(), 1);
+  AssertCountLiveLogFiles(1);
   PutRandomData(1, 200, 1000);
   env_->SleepForMicroseconds(micros_wait_for_flush);
   AssertNumberOfImmutableMemtables({0, 1, 0, 0});
-  ASSERT_EQ(CountLiveLogFiles(), 2);
+  AssertCountLiveLogFiles(2);
   PutRandomData(2, 1000, 1000);
   env_->SleepForMicroseconds(micros_wait_for_flush);
   AssertNumberOfImmutableMemtables({0, 1, 1, 0});
-  ASSERT_EQ(CountLiveLogFiles(), 3);
+  AssertCountLiveLogFiles(3);
   PutRandomData(2, 1000, 1000);
   env_->SleepForMicroseconds(micros_wait_for_flush);
   AssertNumberOfImmutableMemtables({0, 1, 2, 0});
-  ASSERT_EQ(CountLiveLogFiles(), 4);
-  PutRandomData(3, 90, 1000);
+  AssertCountLiveLogFiles(4);
+  PutRandomData(3, 91, 990);
   env_->SleepForMicroseconds(micros_wait_for_flush);
   AssertNumberOfImmutableMemtables({0, 1, 2, 1});
-  ASSERT_EQ(CountLiveLogFiles(), 5);
-  PutRandomData(3, 90, 1000);
+  AssertCountLiveLogFiles(5);
+  PutRandomData(3, 90, 990);
   env_->SleepForMicroseconds(micros_wait_for_flush);
   AssertNumberOfImmutableMemtables({0, 1, 2, 2});
-  ASSERT_EQ(CountLiveLogFiles(), 6);
-  PutRandomData(3, 90, 1000);
+  AssertCountLiveLogFiles(6);
+  PutRandomData(3, 90, 990);
   env_->SleepForMicroseconds(micros_wait_for_flush);
   AssertNumberOfImmutableMemtables({0, 1, 2, 3});
-  ASSERT_EQ(CountLiveLogFiles(), 7);
+  AssertCountLiveLogFiles(7);
   PutRandomData(0, 100, 1000);
   WaitForFlush(0);
   AssertNumberOfImmutableMemtables({0, 1, 2, 3});
-  ASSERT_EQ(CountLiveLogFiles(), 8);
+  AssertCountLiveLogFiles(8);
   PutRandomData(2, 100, 10000);
   WaitForFlush(2);
   AssertNumberOfImmutableMemtables({0, 1, 0, 3});
-  ASSERT_EQ(CountLiveLogFiles(), 9);
-  PutRandomData(3, 90, 1000);
+  AssertCountLiveLogFiles(9);
+  PutRandomData(3, 90, 990);
   WaitForFlush(3);
   AssertNumberOfImmutableMemtables({0, 1, 0, 0});
-  ASSERT_EQ(CountLiveLogFiles(), 10);
-  PutRandomData(3, 90, 1000);
+  AssertCountLiveLogFiles(10);
+  PutRandomData(3, 90, 990);
   env_->SleepForMicroseconds(micros_wait_for_flush);
   AssertNumberOfImmutableMemtables({0, 1, 0, 1});
-  ASSERT_EQ(CountLiveLogFiles(), 11);
+  AssertCountLiveLogFiles(11);
   PutRandomData(1, 200, 1000);
   WaitForFlush(1);
   AssertNumberOfImmutableMemtables({0, 0, 0, 1});
-  ASSERT_EQ(CountLiveLogFiles(), 5);
-  PutRandomData(3, 240, 1000);
+  AssertCountLiveLogFiles(5);
+  PutRandomData(3, 90 * 3, 990);
   WaitForFlush(3);
-  PutRandomData(3, 300, 1000);
+  PutRandomData(3, 90 * 4, 990);
   WaitForFlush(3);
   AssertNumberOfImmutableMemtables({0, 0, 0, 0});
-  ASSERT_EQ(CountLiveLogFiles(), 12);
+  AssertCountLiveLogFiles(12);
   PutRandomData(0, 100, 1000);
   WaitForFlush(0);
   AssertNumberOfImmutableMemtables({0, 0, 0, 0});
-  ASSERT_EQ(CountLiveLogFiles(), 12);
-  PutRandomData(2, 3*100, 10000);
+  AssertCountLiveLogFiles(12);
+  PutRandomData(2, 3 * 1000, 1000);
   WaitForFlush(2);
   AssertNumberOfImmutableMemtables({0, 0, 0, 0});
-  ASSERT_EQ(CountLiveLogFiles(), 12);
+  AssertCountLiveLogFiles(12);
   PutRandomData(1, 2*200, 1000);
   WaitForFlush(1);
   AssertNumberOfImmutableMemtables({0, 0, 0, 0});
-  ASSERT_EQ(CountLiveLogFiles(), 7);
+  AssertCountLiveLogFiles(7);
   Close();
 }
 
+#ifndef ROCKSDB_LITE  // Cuckoo is not supported in lite
 TEST_F(ColumnFamilyTest, MemtableNotSupportSnapshot) {
   Open();
   auto* s1 = dbfull()->GetSnapshot();
@@ -765,6 +820,7 @@ TEST_F(ColumnFamilyTest, MemtableNotSupportSnapshot) {
   ASSERT_TRUE(s3 == nullptr);
   Close();
 }
+#endif  // !ROCKSDB_LITE
 
 TEST_F(ColumnFamilyTest, DifferentMergeOperators) {
   Open();
@@ -813,14 +869,14 @@ TEST_F(ColumnFamilyTest, DifferentCompactionStyles) {
   default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options));
 
   one.compaction_style = kCompactionStyleUniversal;
+
   one.num_levels = 1;
   // trigger compaction if there are >= 4 files
   one.level0_file_num_compaction_trigger = 4;
-  one.write_buffer_size = 100000;
+  one.write_buffer_size = 120000;
 
   two.compaction_style = kCompactionStyleLevel;
   two.num_levels = 4;
-  two.max_mem_compaction_level = 0;
   two.level0_file_num_compaction_trigger = 3;
   two.write_buffer_size = 100000;
 
@@ -828,38 +884,43 @@ TEST_F(ColumnFamilyTest, DifferentCompactionStyles) {
 
   // SETUP column family "one" -- universal style
   for (int i = 0; i < one.level0_file_num_compaction_trigger - 1; ++i) {
-    PutRandomData(1, 11, 10000);
+    PutRandomData(1, 10, 12000);
+    PutRandomData(1, 1, 10);
     WaitForFlush(1);
-    ASSERT_EQ(ToString(i + 1), FilesPerLevel(1));
+    AssertFilesPerLevel(ToString(i + 1), 1);
   }
 
   // SETUP column family "two" -- level style with 4 levels
   for (int i = 0; i < two.level0_file_num_compaction_trigger - 1; ++i) {
-    PutRandomData(2, 15, 10000);
+    PutRandomData(2, 10, 12000);
+    PutRandomData(2, 1, 10);
     WaitForFlush(2);
-    ASSERT_EQ(ToString(i + 1), FilesPerLevel(2));
+    AssertFilesPerLevel(ToString(i + 1), 2);
   }
 
   // TRIGGER compaction "one"
-  PutRandomData(1, 12, 10000);
+  PutRandomData(1, 10, 12000);
+  PutRandomData(1, 1, 10);
 
   // TRIGGER compaction "two"
-  PutRandomData(2, 10, 10000);
+  PutRandomData(2, 10, 12000);
+  PutRandomData(2, 1, 10);
 
   // WAIT for compactions
   WaitForCompaction();
 
   // VERIFY compaction "one"
-  ASSERT_EQ("1", FilesPerLevel(1));
+  AssertFilesPerLevel("1", 1);
 
   // VERIFY compaction "two"
-  ASSERT_EQ("0,1", FilesPerLevel(2));
+  AssertFilesPerLevel("0,1", 2);
   CompactAll(2);
-  ASSERT_EQ("0,1", FilesPerLevel(2));
+  AssertFilesPerLevel("0,1", 2);
 
   Close();
 }
 
+#ifndef ROCKSDB_LITE  // Tailing iterator not supported
 namespace {
 std::string IterStatus(Iterator* iter) {
   std::string result;
@@ -916,7 +977,9 @@ TEST_F(ColumnFamilyTest, NewIteratorsTest) {
     Destroy();
   }
 }
+#endif  // !ROCKSDB_LITE
 
+#ifndef ROCKSDB_LITE  // ReadOnlyDB is not supported
 TEST_F(ColumnFamilyTest, ReadOnlyDBTest) {
   Open();
   CreateColumnFamiliesAndReopen({"one", "two", "three", "four"});
@@ -966,6 +1029,7 @@ TEST_F(ColumnFamilyTest, ReadOnlyDBTest) {
   s = OpenReadOnly({"one", "four"});
   ASSERT_TRUE(!s.ok());
 }
+#endif  // !ROCKSDB_LITE
 
 TEST_F(ColumnFamilyTest, DontRollEmptyLogs) {
   Open();
@@ -981,7 +1045,7 @@ TEST_F(ColumnFamilyTest, DontRollEmptyLogs) {
   }
 
   for (int i = 0; i < 4; ++i) {
-    dbfull()->TEST_WaitForFlushMemTable(handles_[i]);
+    WaitForFlush(i);
   }
   int total_new_writable_files =
       env_->GetNumberOfNewWritableFileCalls() - num_writable_file_start;
@@ -994,6 +1058,7 @@ TEST_F(ColumnFamilyTest, FlushStaleColumnFamilies) {
   CreateColumnFamilies({"one", "two"});
   ColumnFamilyOptions default_cf, one, two;
   default_cf.write_buffer_size = 100000;  // small write buffer size
+  default_cf.arena_block_size = 4096;
   default_cf.disable_auto_compactions = true;
   one.disable_auto_compactions = true;
   two.disable_auto_compactions = true;
@@ -1005,7 +1070,8 @@ TEST_F(ColumnFamilyTest, FlushStaleColumnFamilies) {
   for (int i = 0; i < 2; ++i) {
     PutRandomData(0, 100, 1000);  // flush
     WaitForFlush(0);
-    ASSERT_EQ(i + 1, CountLiveFiles());
+
+    AssertCountLiveFiles(i + 1);
   }
   // third flush. now, CF [two] should be detected as stale and flushed
   // column family 1 should not be flushed since it's empty
@@ -1014,7 +1080,7 @@ TEST_F(ColumnFamilyTest, FlushStaleColumnFamilies) {
   WaitForFlush(2);
   // 3 files for default column families, 1 file for column family [two], zero
   // files for column family [one], because it's empty
-  ASSERT_EQ(4, CountLiveFiles());
+  AssertCountLiveFiles(4);
   Close();
 }
 
@@ -1029,21 +1095,48 @@ TEST_F(ColumnFamilyTest, CreateMissingColumnFamilies) {
 
 TEST_F(ColumnFamilyTest, SanitizeOptions) {
   DBOptions db_options;
-  for (int i = 1; i <= 3; i++) {
-    for (int j = 1; j <= 3; j++) {
-      for (int k = 1; k <= 3; k++) {
-        ColumnFamilyOptions original;
-        original.level0_stop_writes_trigger = i;
-        original.level0_slowdown_writes_trigger = j;
-        original.level0_file_num_compaction_trigger = k;
-        ColumnFamilyOptions result =
-            SanitizeOptions(db_options, nullptr, original);
-        ASSERT_TRUE(result.level0_stop_writes_trigger >=
-                    result.level0_slowdown_writes_trigger);
-        ASSERT_TRUE(result.level0_slowdown_writes_trigger >=
-                    result.level0_file_num_compaction_trigger);
-        ASSERT_TRUE(result.level0_file_num_compaction_trigger ==
-                    original.level0_file_num_compaction_trigger);
+  for (int s = kCompactionStyleLevel; s <= kCompactionStyleUniversal; ++s) {
+    for (int l = 0; l <= 2; l++) {
+      for (int i = 1; i <= 3; i++) {
+        for (int j = 1; j <= 3; j++) {
+          for (int k = 1; k <= 3; k++) {
+            ColumnFamilyOptions original;
+            original.compaction_style = static_cast<CompactionStyle>(s);
+            original.num_levels = l;
+            original.level0_stop_writes_trigger = i;
+            original.level0_slowdown_writes_trigger = j;
+            original.level0_file_num_compaction_trigger = k;
+            original.write_buffer_size =
+                l * 4 * 1024 * 1024 + i * 1024 * 1024 + j * 1024 + k;
+
+            ColumnFamilyOptions result =
+                SanitizeOptions(db_options, nullptr, original);
+            ASSERT_TRUE(result.level0_stop_writes_trigger >=
+                        result.level0_slowdown_writes_trigger);
+            ASSERT_TRUE(result.level0_slowdown_writes_trigger >=
+                        result.level0_file_num_compaction_trigger);
+            ASSERT_TRUE(result.level0_file_num_compaction_trigger ==
+                        original.level0_file_num_compaction_trigger);
+            if (s == kCompactionStyleLevel) {
+              ASSERT_GE(result.num_levels, 2);
+            } else {
+              ASSERT_GE(result.num_levels, 1);
+              if (original.num_levels >= 1) {
+                ASSERT_EQ(result.num_levels, original.num_levels);
+              }
+            }
+
+            // Make sure SanitizeOptions sets arena_block_size to 1/8 of
+            // the write_buffer_size, rounded up to a multiple of 4k.
+            size_t expected_arena_block_size =
+                l * 4 * 1024 * 1024 / 8 + i * 1024 * 1024 / 8;
+            if (j + k != 0) {
+              // not a multiple of 4k, round up to the next 4k
+              expected_arena_block_size += 4 * 1024;
+            }
+            ASSERT_EQ(expected_arena_block_size, result.arena_block_size);
+          }
+        }
       }
     }
   }
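
As a worked instance of the expectation above: with l = 1, i = 1, j = 2, k = 3 the write buffer is 4 MiB + 1 MiB + 2048 + 3 = 5244931 bytes; one eighth of that is 655616, which is not a multiple of 4096, so it rounds up to 659456 = 4 MiB / 8 + 1 MiB / 8 + 4096. A standalone restatement:

    #include <cassert>
    #include <cstddef>

    int main() {
      // l = 1, i = 1, j = 2, k = 3 from the loops above.
      size_t wbs = 1 * 4 * 1024 * 1024 + 1 * 1024 * 1024 + 2 * 1024 + 3;
      size_t block = ((wbs / 8 + 4095) / 4096) * 4096;  // round up to 4 KiB
      assert(block == 659456);  // == 4 MiB / 8 + 1 MiB / 8 + 4096
      return 0;
    }
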
@@ -1098,6 +1191,7 @@ TEST_F(ColumnFamilyTest, ReadDroppedColumnFamily) {
         ASSERT_OK(iterator->status());
         ++count;
       }
+      ASSERT_OK(iterator->status());
       ASSERT_EQ(count, kKeysNum * ((i == 2) ? 1 : 2));
     }
 
@@ -1106,9 +1200,75 @@ TEST_F(ColumnFamilyTest, ReadDroppedColumnFamily) {
   }
 }
 
+TEST_F(ColumnFamilyTest, FlushAndDropRaceCondition) {
+  db_options_.create_missing_column_families = true;
+  Open({"default", "one"});
+  ColumnFamilyOptions options;
+  options.level0_file_num_compaction_trigger = 100;
+  options.level0_slowdown_writes_trigger = 200;
+  options.level0_stop_writes_trigger = 200;
+  options.max_write_buffer_number = 20;
+  options.write_buffer_size = 100000;  // small write buffer size
+  Reopen({options, options});
+
+  rocksdb::SyncPoint::GetInstance()->LoadDependency(
+      {{"VersionSet::LogAndApply::ColumnFamilyDrop:1",
+        "FlushJob::InstallResults"},
+       {"FlushJob::InstallResults",
+        "VersionSet::LogAndApply::ColumnFamilyDrop:2", }});
+
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+  test::SleepingBackgroundTask sleeping_task;
+
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task,
+                 Env::Priority::HIGH);
+
+  // 1MB should create ~10 files for each CF
+  int kKeysNum = 10000;
+  PutRandomData(1, kKeysNum, 100);
+
+  std::vector<std::thread> threads;
+  threads.emplace_back([&] { ASSERT_OK(db_->DropColumnFamily(handles_[1])); });
+
+  sleeping_task.WakeUp();
+  sleeping_task.WaitUntilDone();
+  sleeping_task.Reset();
+  // Now we sleep again. This is just so we're certain that the flush job finished.
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task,
+                 Env::Priority::HIGH);
+  sleeping_task.WakeUp();
+  sleeping_task.WaitUntilDone();
+
+  {
+    // Since we didn't delete the CF handle, RocksDB's contract guarantees
+    // that we're still able to read the dropped CF.
+    std::unique_ptr<Iterator> iterator(
+        db_->NewIterator(ReadOptions(), handles_[1]));
+    int count = 0;
+    for (iterator->SeekToFirst(); iterator->Valid(); iterator->Next()) {
+      ASSERT_OK(iterator->status());
+      ++count;
+    }
+    ASSERT_OK(iterator->status());
+    ASSERT_EQ(count, kKeysNum);
+  }
+  for (auto& t : threads) {
+    t.join();
+  }
+
+  Close();
+  Destroy();
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
+
 }  // namespace rocksdb
+#endif
 
 int main(int argc, char** argv) {
+#if !(defined NDEBUG) || !defined(OS_WIN)
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
+#else
+  return 0;
+#endif
 }
diff --git a/src/rocksdb/db/compact_files_test.cc b/src/rocksdb/db/compact_files_test.cc
index b7255c2..cbd9d7a 100644
--- a/src/rocksdb/db/compact_files_test.cc
+++ b/src/rocksdb/db/compact_files_test.cc
@@ -3,12 +3,15 @@
 //  LICENSE file in the root directory of this source tree. An additional grant
 //  of patent rights can be found in the PATENTS file in the same directory.
 
+#ifndef ROCKSDB_LITE
+
 #include <mutex>
 #include <string>
 #include <vector>
 
 #include "rocksdb/db.h"
 #include "rocksdb/env.h"
+#include "util/string_util.h"
 #include "util/testharness.h"
 
 namespace rocksdb {
@@ -31,12 +34,9 @@ class FlushedFileCollector : public EventListener {
   ~FlushedFileCollector() {}
 
   virtual void OnFlushCompleted(
-      DB* db, const std::string& column_family_name,
-      const std::string& file_path,
-      bool triggered_writes_slowdown,
-      bool triggered_writes_stop) {
+      DB* db, const FlushJobInfo& info) override {
     std::lock_guard<std::mutex> lock(mutex_);
-    flushed_files_.push_back(file_path);
+    flushed_files_.push_back(info.file_path);
   }
 
   std::vector<std::string> GetFlushedFiles() {
@@ -79,9 +79,8 @@ TEST_F(CompactFilesTest, ObsoleteFiles) {
 
   // create couple files
   for (int i = 1000; i < 2000; ++i) {
-    db->Put(WriteOptions(),
-        std::to_string(i),
-        std::string(kWriteBufferSize / 10, 'a' + (i % 26)));
+    db->Put(WriteOptions(), ToString(i),
+            std::string(kWriteBufferSize / 10, 'a' + (i % 26)));
   }
 
   auto l0_files = collector->GetFlushedFiles();
@@ -92,7 +91,7 @@ TEST_F(CompactFilesTest, ObsoleteFiles) {
 
   // verify all compaction input files are deleted
   for (auto fname : l0_files) {
-    ASSERT_TRUE(!env_->FileExists(fname));
+    ASSERT_EQ(Status::NotFound(), env_->FileExists(fname));
   }
   delete db;
 }
@@ -103,3 +102,14 @@ int main(int argc, char** argv) {
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
+
+#else
+#include <stdio.h>
+
+int main(int argc, char** argv) {
+  fprintf(stderr,
+          "SKIPPED as DBImpl::CompactFiles is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // !ROCKSDB_LITE
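
For reference, the listener API change this test adapts to: OnFlushCompleted now receives a single FlushJobInfo argument in place of the old positional parameters. A minimal sketch of a listener written against the new signature; only the file_path field is demonstrated by this diff, and the rocksdb/listener.h include is an assumption about where the declarations live:

    #include <cstdio>
    #include "rocksdb/listener.h"

    class FileLoggingListener : public rocksdb::EventListener {
     public:
      void OnFlushCompleted(rocksdb::DB* /*db*/,
                            const rocksdb::FlushJobInfo& info) override {
        std::fprintf(stderr, "flushed: %s\n", info.file_path.c_str());
      }
    };
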
diff --git a/src/rocksdb/db/compacted_db_impl.cc b/src/rocksdb/db/compacted_db_impl.cc
new file mode 100644
index 0000000..980b34e
--- /dev/null
+++ b/src/rocksdb/db/compacted_db_impl.cc
@@ -0,0 +1,163 @@
+//  Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef ROCKSDB_LITE
+#include "db/compacted_db_impl.h"
+#include "db/db_impl.h"
+#include "db/version_set.h"
+#include "table/get_context.h"
+
+namespace rocksdb {
+
+extern void MarkKeyMayExist(void* arg);
+extern bool SaveValue(void* arg, const ParsedInternalKey& parsed_key,
+                      const Slice& v, bool hit_and_return);
+
+CompactedDBImpl::CompactedDBImpl(
+  const DBOptions& options, const std::string& dbname)
+  : DBImpl(options, dbname) {
+}
+
+CompactedDBImpl::~CompactedDBImpl() {
+}
+
+size_t CompactedDBImpl::FindFile(const Slice& key) {
+  size_t left = 0;
+  size_t right = files_.num_files - 1;
+  while (left < right) {
+    size_t mid = (left + right) >> 1;
+    const FdWithKeyRange& f = files_.files[mid];
+    if (user_comparator_->Compare(ExtractUserKey(f.largest_key), key) < 0) {
+      // Key at "mid.largest" is < "target".  Therefore all
+      // files at or before "mid" are uninteresting.
+      left = mid + 1;
+    } else {
+      // Key at "mid.largest" is >= "target".  Therefore all files
+      // after "mid" are uninteresting.
+      right = mid;
+    }
+  }
+  return right;
+}
+
+Status CompactedDBImpl::Get(const ReadOptions& options,
+     ColumnFamilyHandle*, const Slice& key, std::string* value) {
+  GetContext get_context(user_comparator_, nullptr, nullptr, nullptr,
+                         GetContext::kNotFound, key, value, nullptr, nullptr,
+                         nullptr);
+  LookupKey lkey(key, kMaxSequenceNumber);
+  files_.files[FindFile(key)].fd.table_reader->Get(
+      options, lkey.internal_key(), &get_context);
+  if (get_context.State() == GetContext::kFound) {
+    return Status::OK();
+  }
+  return Status::NotFound();
+}
+
+std::vector<Status> CompactedDBImpl::MultiGet(const ReadOptions& options,
+    const std::vector<ColumnFamilyHandle*>&,
+    const std::vector<Slice>& keys, std::vector<std::string>* values) {
+  autovector<TableReader*, 16> reader_list;
+  for (const auto& key : keys) {
+    const FdWithKeyRange& f = files_.files[FindFile(key)];
+    if (user_comparator_->Compare(key, ExtractUserKey(f.smallest_key)) < 0) {
+      reader_list.push_back(nullptr);
+    } else {
+      LookupKey lkey(key, kMaxSequenceNumber);
+      f.fd.table_reader->Prepare(lkey.internal_key());
+      reader_list.push_back(f.fd.table_reader);
+    }
+  }
+  std::vector<Status> statuses(keys.size(), Status::NotFound());
+  values->resize(keys.size());
+  int idx = 0;
+  for (auto* r : reader_list) {
+    if (r != nullptr) {
+      GetContext get_context(user_comparator_, nullptr, nullptr, nullptr,
+                             GetContext::kNotFound, keys[idx], &(*values)[idx],
+                             nullptr, nullptr, nullptr);
+      LookupKey lkey(keys[idx], kMaxSequenceNumber);
+      r->Get(options, lkey.internal_key(), &get_context);
+      if (get_context.State() == GetContext::kFound) {
+        statuses[idx] = Status::OK();
+      }
+    }
+    ++idx;
+  }
+  return statuses;
+}
+
+Status CompactedDBImpl::Init(const Options& options) {
+  mutex_.Lock();
+  ColumnFamilyDescriptor cf(kDefaultColumnFamilyName,
+                            ColumnFamilyOptions(options));
+  Status s = Recover({ cf }, true /* read only */, false);
+  if (s.ok()) {
+    cfd_ = reinterpret_cast<ColumnFamilyHandleImpl*>(
+              DefaultColumnFamily())->cfd();
+    delete cfd_->InstallSuperVersion(new SuperVersion(), &mutex_);
+  }
+  mutex_.Unlock();
+  if (!s.ok()) {
+    return s;
+  }
+  NewThreadStatusCfInfo(cfd_);
+  version_ = cfd_->GetSuperVersion()->current;
+  user_comparator_ = cfd_->user_comparator();
+  auto* vstorage = version_->storage_info();
+  if (vstorage->num_non_empty_levels() == 0) {
+    return Status::NotSupported("no file exists");
+  }
+  const LevelFilesBrief& l0 = vstorage->LevelFilesBrief(0);
+  // L0 should not have files
+  if (l0.num_files > 1) {
+    return Status::NotSupported("L0 contain more than 1 file");
+  }
+  if (l0.num_files == 1) {
+    if (vstorage->num_non_empty_levels() > 1) {
+      return Status::NotSupported("Both L0 and other level contain files");
+    }
+    files_ = l0;
+    return Status::OK();
+  }
+
+  for (int i = 1; i < vstorage->num_non_empty_levels() - 1; ++i) {
+    if (vstorage->LevelFilesBrief(i).num_files > 0) {
+      return Status::NotSupported("Other levels also contain files");
+    }
+  }
+
+  int level = vstorage->num_non_empty_levels() - 1;
+  if (vstorage->LevelFilesBrief(level).num_files > 0) {
+    files_ = vstorage->LevelFilesBrief(level);
+    return Status::OK();
+  }
+  return Status::NotSupported("no file exists");
+}
+
+Status CompactedDBImpl::Open(const Options& options,
+                             const std::string& dbname, DB** dbptr) {
+  *dbptr = nullptr;
+
+  if (options.max_open_files != -1) {
+    return Status::InvalidArgument("require max_open_files = -1");
+  }
+  if (options.merge_operator.get() != nullptr) {
+    return Status::InvalidArgument("merge operator is not supported");
+  }
+  DBOptions db_options(options);
+  std::unique_ptr<CompactedDBImpl> db(new CompactedDBImpl(db_options, dbname));
+  Status s = db->Init(options);
+  if (s.ok()) {
+    Log(INFO_LEVEL, db->db_options_.info_log,
+        "Opened the db as fully compacted mode");
+    LogFlush(db->db_options_.info_log);
+    *dbptr = db.release();
+  }
+  return s;
+}
+
+}   // namespace rocksdb
+#endif  // ROCKSDB_LITE
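
FindFile() above is a lower-bound binary search over files sorted by largest key: it returns the index of the first file whose largest user key is >= the lookup key. The same invariant in a self-contained form, with plain strings standing in for user keys (illustrative only):

    #include <cstddef>
    #include <string>
    #include <vector>

    // First index whose largest key is >= key. Assumes the keys are sorted
    // ascending and key <= largest_keys.back(), as in a fully compacted DB.
    size_t LowerBoundByLargestKey(const std::vector<std::string>& largest_keys,
                                  const std::string& key) {
      size_t left = 0, right = largest_keys.size() - 1;
      while (left < right) {
        size_t mid = (left + right) >> 1;
        if (largest_keys[mid] < key) {
          left = mid + 1;  // files at or before mid end before key
        } else {
          right = mid;     // file mid could still contain key
        }
      }
      return right;
    }
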
diff --git a/src/rocksdb/db/compacted_db_impl.h b/src/rocksdb/db/compacted_db_impl.h
new file mode 100644
index 0000000..ec2d537
--- /dev/null
+++ b/src/rocksdb/db/compacted_db_impl.h
@@ -0,0 +1,95 @@
+//  Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+#ifndef ROCKSDB_LITE
+#include "db/db_impl.h"
+#include <vector>
+#include <string>
+
+namespace rocksdb {
+
+class CompactedDBImpl : public DBImpl {
+ public:
+  CompactedDBImpl(const DBOptions& options, const std::string& dbname);
+  virtual ~CompactedDBImpl();
+
+  static Status Open(const Options& options, const std::string& dbname,
+                     DB** dbptr);
+
+  // Implementations of the DB interface
+  using DB::Get;
+  virtual Status Get(const ReadOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     std::string* value) override;
+  using DB::MultiGet;
+  virtual std::vector<Status> MultiGet(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>&,
+      const std::vector<Slice>& keys, std::vector<std::string>* values)
+    override;
+
+  using DBImpl::Put;
+  virtual Status Put(const WriteOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     const Slice& value) override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+  using DBImpl::Merge;
+  virtual Status Merge(const WriteOptions& options,
+                       ColumnFamilyHandle* column_family, const Slice& key,
+                       const Slice& value) override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+  using DBImpl::Delete;
+  virtual Status Delete(const WriteOptions& options,
+                        ColumnFamilyHandle* column_family,
+                        const Slice& key) override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+  virtual Status Write(const WriteOptions& options,
+                       WriteBatch* updates) override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+  using DBImpl::CompactRange;
+  virtual Status CompactRange(const CompactRangeOptions& options,
+                              ColumnFamilyHandle* column_family,
+                              const Slice* begin, const Slice* end) override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+
+  virtual Status DisableFileDeletions() override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+  virtual Status EnableFileDeletions(bool force) override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+  virtual Status GetLiveFiles(std::vector<std::string>&,
+                              uint64_t* manifest_file_size,
+                              bool flush_memtable = true) override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+  using DBImpl::Flush;
+  virtual Status Flush(const FlushOptions& options,
+                       ColumnFamilyHandle* column_family) override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+
+ private:
+  friend class DB;
+  inline size_t FindFile(const Slice& key);
+  Status Init(const Options& options);
+
+  ColumnFamilyData* cfd_;
+  Version* version_;
+  const Comparator* user_comparator_;
+  LevelFilesBrief files_;
+
+  // No copying allowed
+  CompactedDBImpl(const CompactedDBImpl&);
+  void operator=(const CompactedDBImpl&);
+};
+}
+#endif  // ROCKSDB_LITE
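
A hypothetical usage sketch for the read-only compacted mode declared above. Per CompactedDBImpl::Open in the .cc file, max_open_files must be -1 and no merge operator may be set; only Get and MultiGet are serviced, and every write-side override returns NotSupported. Whether callers reach this class directly (it lives under db/, not the public headers) or through a read-only open path is not shown in this diff:

    #include <string>
    #include "db/compacted_db_impl.h"

    rocksdb::Options options;
    options.max_open_files = -1;  // required by CompactedDBImpl::Open
    rocksdb::DB* db = nullptr;
    rocksdb::Status s =
        rocksdb::CompactedDBImpl::Open(options, "/tmp/fully_compacted_db", &db);
    if (s.ok()) {
      std::string value;
      s = db->Get(rocksdb::ReadOptions(), "some_key", &value);
      delete db;
    }
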
diff --git a/src/rocksdb/db/compaction.cc b/src/rocksdb/db/compaction.cc
index 7ece0c4..bb80665 100644
--- a/src/rocksdb/db/compaction.cc
+++ b/src/rocksdb/db/compaction.cc
@@ -16,6 +16,7 @@
 #include <inttypes.h>
 #include <vector>
 
+#include "rocksdb/compaction_filter.h"
 #include "db/column_family.h"
 #include "util/logging.h"
 #include "util/sync_point.h"
@@ -39,6 +40,47 @@ void Compaction::SetInputVersion(Version* _input_version) {
   edit_.SetColumnFamily(cfd_->GetID());
 }
 
+void Compaction::GetBoundaryKeys(
+    VersionStorageInfo* vstorage,
+    const std::vector<CompactionInputFiles>& inputs, Slice* smallest_user_key,
+    Slice* largest_user_key) {
+  bool initialized = false;
+  const Comparator* ucmp = vstorage->InternalComparator()->user_comparator();
+  for (uint32_t i = 0; i < inputs.size(); ++i) {
+    if (inputs[i].files.empty()) {
+      continue;
+    }
+    if (inputs[i].level == 0) {
+      // we need to consider all files on level 0
+      for (const auto* f : inputs[i].files) {
+        const Slice& start_user_key = f->smallest.user_key();
+        if (!initialized ||
+            ucmp->Compare(start_user_key, *smallest_user_key) < 0) {
+          *smallest_user_key = start_user_key;
+        }
+        const Slice& end_user_key = f->largest.user_key();
+        if (!initialized ||
+            ucmp->Compare(end_user_key, *largest_user_key) > 0) {
+          *largest_user_key = end_user_key;
+        }
+        initialized = true;
+      }
+    } else {
+      // we only need to consider the first and last file
+      const Slice& start_user_key = inputs[i].files[0]->smallest.user_key();
+      if (!initialized ||
+          ucmp->Compare(start_user_key, *smallest_user_key) < 0) {
+        *smallest_user_key = start_user_key;
+      }
+      const Slice& end_user_key = inputs[i].files.back()->largest.user_key();
+      if (!initialized || ucmp->Compare(end_user_key, *largest_user_key) > 0) {
+        *largest_user_key = end_user_key;
+      }
+      initialized = true;
+    }
+  }
+}
+
 // helper function to determine if compaction is creating files at the
 // bottommost level
 bool Compaction::IsBottommostLevel(
@@ -49,15 +91,40 @@ bool Compaction::IsBottommostLevel(
     return false;
   }
 
-  // checks whether there are files living beyond the output_level.
+  Slice smallest_key, largest_key;
+  GetBoundaryKeys(vstorage, inputs, &smallest_key, &largest_key);
+
+  // Checks whether there are files living beyond the output_level.
+  // If lower levels have files, it checks for overlap between the files
+  // of the compaction process and those files.
+  // Bottommost-level optimizations can be made if there are no files in
+  // lower levels or if there is no overlap with the files in
+  // the lower levels.
   for (int i = output_level + 1; i < vstorage->num_levels(); i++) {
-    if (vstorage->NumLevelFiles(i) > 0) {
+    // It is not the bottommost level if there are files in higher
+    // levels when the output level is 0 or if there are files in
+    // higher levels which overlap with files to be compacted.
+    // output_level == 0 means that we want it to be considered
+    // as the bottommost level only if the last file on the level
+    // is a part of the files to be compacted - this is verified by
+    // the first if condition in this function
+    if (vstorage->NumLevelFiles(i) > 0 &&
+        (output_level == 0 ||
+         vstorage->OverlapInLevel(i, &smallest_key, &largest_key))) {
       return false;
     }
   }
   return true;
 }
 
+// Test function to validate the functionality of IsBottommostLevel() --
+// determines if a compaction with the given inputs and storage is bottommost
+bool Compaction::TEST_IsBottommostLevel(
+    int output_level, VersionStorageInfo* vstorage,
+    const std::vector<CompactionInputFiles>& inputs) {
+  return IsBottommostLevel(output_level, vstorage, inputs);
+}
+
 bool Compaction::IsFullCompaction(
     VersionStorageInfo* vstorage,
     const std::vector<CompactionInputFiles>& inputs) {
@@ -100,8 +167,7 @@ Compaction::Compaction(VersionStorageInfo* vstorage,
       score_(_score),
       bottommost_level_(IsBottommostLevel(output_level_, vstorage, inputs_)),
       is_full_compaction_(IsFullCompaction(vstorage, inputs_)),
-      is_manual_compaction_(_manual_compaction),
-      level_ptrs_(std::vector<size_t>(number_levels_, 0)) {
+      is_manual_compaction_(_manual_compaction) {
   MarkFilesBeingCompacted(true);
 
 #ifndef NDEBUG
@@ -148,10 +214,32 @@ bool Compaction::IsTrivialMove() const {
   // Otherwise, the move could create a parent file that will require
   // a very expensive merge later on.
   // If start_level_== output_level_, the purpose is to force compaction
-  // filter to be applied to that level, and thus cannot be a trivia move.
+  // filter to be applied to that level, and thus cannot be a trivial move.
+
+  // Check if the start level has files with overlapping ranges
+  if (start_level_ == 0 &&
+      input_version_->storage_info()->level0_non_overlapping() == false) {
+    // We cannot move files from L0 to L1 if the files are overlapping
+    return false;
+  }
+
+  if (is_manual_compaction_ &&
+      (cfd_->ioptions()->compaction_filter != nullptr ||
+       cfd_->ioptions()->compaction_filter_factory != nullptr)) {
+    // This is a manual compaction and we have a compaction filter that should
+    // be executed, we cannot do a trivial move
+    return false;
+  }
+
+  // Used in universal compaction, where trivial move can be done if the
+  // input files are non-overlapping.
+  if ((cfd_->ioptions()->compaction_options_universal.allow_trivial_move) &&
+      (output_level_ != 0)) {
+    return is_trivial_move_;
+  }
+
   return (start_level_ != output_level_ && num_input_levels() == 1 &&
-          num_input_files(0) == 1 &&
-          input(0, 0)->fd.GetPathId() == GetOutputPathId() &&
+          input(0, 0)->fd.GetPathId() == output_path_id() &&
           InputCompressionMatchesOutput() &&
           TotalFileSize(grandparents_) <= max_grandparent_overlap_bytes_);
 }
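
The new universal-compaction branch above is gated by compaction_options_universal.allow_trivial_move together with the is_trivial_move_ flag that the compaction picker sets (see the setter added to compaction.h below). Enabling it is a one-line configuration change, sketched here:

    #include "rocksdb/options.h"

    rocksdb::ColumnFamilyOptions cf_opts;
    cf_opts.compaction_style = rocksdb::kCompactionStyleUniversal;
    // When the picked input files don't overlap, the output can be moved
    // rather than rewritten (only for output levels other than 0, per
    // IsTrivialMove() above).
    cf_opts.compaction_options_universal.allow_trivial_move = true;
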
@@ -164,8 +252,11 @@ void Compaction::AddInputDeletions(VersionEdit* out_edit) {
   }
 }
 
-bool Compaction::KeyNotExistsBeyondOutputLevel(const Slice& user_key) {
+bool Compaction::KeyNotExistsBeyondOutputLevel(
+    const Slice& user_key, std::vector<size_t>* level_ptrs) const {
   assert(input_version_ != nullptr);
+  assert(level_ptrs != nullptr);
+  assert(level_ptrs->size() == static_cast<size_t>(number_levels_));
   assert(cfd_->ioptions()->compaction_style != kCompactionStyleFIFO);
   if (cfd_->ioptions()->compaction_style == kCompactionStyleUniversal) {
     return bottommost_level_;
@@ -175,8 +266,8 @@ bool Compaction::KeyNotExistsBeyondOutputLevel(const Slice& user_key) {
   for (int lvl = output_level_ + 1; lvl < number_levels_; lvl++) {
     const std::vector<FileMetaData*>& files =
         input_version_->storage_info()->LevelFiles(lvl);
-    for (; level_ptrs_[lvl] < files.size(); ) {
-      FileMetaData* f = files[level_ptrs_[lvl]];
+    for (; level_ptrs->at(lvl) < files.size(); level_ptrs->at(lvl)++) {
+      auto* f = files[level_ptrs->at(lvl)];
       if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) {
         // We've advanced far enough
         if (user_cmp->Compare(user_key, f->smallest.user_key()) >= 0) {
@@ -186,7 +277,6 @@ bool Compaction::KeyNotExistsBeyondOutputLevel(const Slice& user_key) {
         }
         break;
       }
-      level_ptrs_[lvl]++;
     }
   }
   return true;
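
KeyNotExistsBeyondOutputLevel() previously advanced a member vector (level_ptrs_, dropped from the constructor earlier in this diff); it now takes the per-level cursors from the caller, so each compaction pass, and in particular each subcompaction, owns its own scan state. A hypothetical caller sketch, assuming a compaction and an ascending stream of user keys in scope:

    // One forward-only cursor per level, owned by the (sub)compaction.
    std::vector<size_t> level_ptrs(compaction->number_levels(), 0);
    if (compaction->KeyNotExistsBeyondOutputLevel(user_key, &level_ptrs)) {
      // Safe to drop this deletion marker: the key cannot exist beyond the
      // output level.
    }
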
@@ -247,7 +337,8 @@ const char* Compaction::InputLevelSummary(
       is_first = false;
     }
     len += snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len,
-                    "%zu@%d", input_level.size(), input_level.level);
+                    "%" ROCKSDB_PRIszt "@%d", input_level.size(),
+                    input_level.level);
   }
   snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len,
            " files to L%d", output_level());
@@ -340,4 +431,33 @@ uint64_t Compaction::OutputFilePreallocationSize() {
   return preallocation_size * 1.1;
 }
 
+std::unique_ptr<CompactionFilter> Compaction::CreateCompactionFilter() const {
+  if (!cfd_->ioptions()->compaction_filter_factory) {
+    return nullptr;
+  }
+
+  CompactionFilter::Context context;
+  context.is_full_compaction = is_full_compaction_;
+  context.is_manual_compaction = is_manual_compaction_;
+  return cfd_->ioptions()->compaction_filter_factory->CreateCompactionFilter(
+      context);
+}
+
+bool Compaction::IsOutputLevelEmpty() const {
+  return inputs_.back().level != output_level_ || inputs_.back().empty();
+}
+
+bool Compaction::ShouldFormSubcompactions() const {
+  if (mutable_cf_options_.max_subcompactions <= 1 || cfd_ == nullptr) {
+    return false;
+  }
+  if (cfd_->ioptions()->compaction_style == kCompactionStyleLevel) {
+    return start_level_ == 0 && !IsOutputLevelEmpty();
+  } else if (cfd_->ioptions()->compaction_style == kCompactionStyleUniversal) {
+    return number_levels_ > 1 && output_level_ > 0;
+  } else {
+    return false;
+  }
+}
+
 }  // namespace rocksdb
diff --git a/src/rocksdb/db/compaction.h b/src/rocksdb/db/compaction.h
index 3bb87c2..36c62ff 100644
--- a/src/rocksdb/db/compaction.h
+++ b/src/rocksdb/db/compaction.h
@@ -29,6 +29,7 @@ struct CompactionInputFiles {
 class Version;
 class ColumnFamilyData;
 class VersionStorageInfo;
+class CompactionFilter;
 
 // A Compaction encapsulates information about a compaction.
 class Compaction {
@@ -107,29 +108,28 @@ class Compaction {
   }
 
   // Maximum size of files to build during this compaction.
-  uint64_t MaxOutputFileSize() const { return max_output_file_size_; }
+  uint64_t max_output_file_size() const { return max_output_file_size_; }
 
   // What compression for output
-  CompressionType OutputCompressionType() const { return output_compression_; }
+  CompressionType output_compression() const { return output_compression_; }
 
   // Whether need to write output file to second DB path.
-  uint32_t GetOutputPathId() const { return output_path_id_; }
+  uint32_t output_path_id() const { return output_path_id_; }
 
   // Is this a trivial compaction that can be implemented by just
   // moving a single input file to the next level (no merging or splitting)
   bool IsTrivialMove() const;
 
   // If true, then the compaction can be done by simply deleting input files.
-  bool IsDeletionCompaction() const {
-    return deletion_compaction_;
-  }
+  bool deletion_compaction() const { return deletion_compaction_; }
 
   // Add all inputs to this compaction as delete operations to *edit.
   void AddInputDeletions(VersionEdit* edit);
 
   // Returns true if the available information we have guarantees that
   // the input "user_key" does not exist in any level beyond "output_level()".
-  bool KeyNotExistsBeyondOutputLevel(const Slice& user_key);
+  bool KeyNotExistsBeyondOutputLevel(const Slice& user_key,
+                                     std::vector<size_t>* level_ptrs) const;
 
   // Returns true iff we should stop building the current output
   // before processing "internal_key".
@@ -148,13 +148,29 @@ class Compaction {
   double score() const { return score_; }
 
   // Is this compaction creating a file in the bottom most level?
-  bool BottomMostLevel() { return bottommost_level_; }
+  bool bottommost_level() { return bottommost_level_; }
 
   // Does this compaction include all sst files?
-  bool IsFullCompaction() { return is_full_compaction_; }
+  bool is_full_compaction() { return is_full_compaction_; }
 
   // Was this compaction triggered manually by the client?
-  bool IsManualCompaction() { return is_manual_compaction_; }
+  bool is_manual_compaction() { return is_manual_compaction_; }
+
+  // Used when the allow_trivial_move option is set in
+  // Universal compaction. If all the input files are
+  // non-overlapping, then is_trivial_move_ is set to
+  // true; otherwise it is false.
+  void set_is_trivial_move(bool trivial_move) {
+    is_trivial_move_ = trivial_move;
+  }
+
+  // Used when the allow_trivial_move option is set in
+  // Universal compaction. Returns true if the input files
+  // are non-overlapping and can be trivially moved.
+  bool is_trivial_move() { return is_trivial_move_; }
+
+  // How many total levels are there?
+  int number_levels() const { return number_levels_; }
 
   // Return the MutableCFOptions that should be used throughout the compaction
   // procedure
@@ -179,15 +195,36 @@ class Compaction {
   // to pick up the next file to be compacted from files_by_size_
   void ResetNextCompactionIndex();
 
+  // Create a CompactionFilter from compaction_filter_factory
+  std::unique_ptr<CompactionFilter> CreateCompactionFilter() const;
+
+  // Is the input level corresponding to output_level_ empty?
+  bool IsOutputLevelEmpty() const;
+
+  // Should this compaction be broken up into smaller ones run in parallel?
+  bool ShouldFormSubcompactions() const;
+
+  // Test hook to validate IsBottommostLevel(): determines whether a
+  // compaction with the given inputs and storage is bottommost
+  static bool TEST_IsBottommostLevel(
+      int output_level, VersionStorageInfo* vstorage,
+      const std::vector<CompactionInputFiles>& inputs);
+
  private:
   // mark (or clear) all files that are being compacted
   void MarkFilesBeingCompacted(bool mark_as_compacted);
 
+  // get the smallest and largest key present in files to be compacted
+  static void GetBoundaryKeys(VersionStorageInfo* vstorage,
+                              const std::vector<CompactionInputFiles>& inputs,
+                              Slice* smallest_key, Slice* largest_key);
+
   // helper function to determine if compaction with inputs and storage is
   // bottommost
   static bool IsBottommostLevel(
       int output_level, VersionStorageInfo* vstorage,
       const std::vector<CompactionInputFiles>& inputs);
+
   static bool IsFullCompaction(VersionStorageInfo* vstorage,
                                const std::vector<CompactionInputFiles>& inputs);
 
@@ -230,12 +267,9 @@ class Compaction {
   // Is this compaction requested by the client?
   const bool is_manual_compaction_;
 
-  // "level_ptrs_" holds indices into "input_version_->levels_", where each
-  // index remembers which file of an associated level we are currently used
-  // to check KeyNotExistsBeyondOutputLevel() for deletion operation.
-  // As it is for checking KeyNotExistsBeyondOutputLevel(), it only
-  // records indices for all levels beyond "output_level_".
-  std::vector<size_t> level_ptrs_;
+  // True if we can do trivial move in Universal multi level
+  // compaction
+  bool is_trivial_move_;
 
   // Does input compression match the output compression?
   bool InputCompressionMatchesOutput() const;
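
Each subcompaction now creates its own filter through the CreateCompactionFilter() declared above, consulting the user-supplied factory once per subcompaction. A hedged sketch of such a factory against the public rocksdb/compaction_filter.h interface; DropAllFilter and its factory are illustrative examples, not part of this import:

    #include <memory>
    #include <string>

    #include "rocksdb/compaction_filter.h"

    // Illustrative filter: asks the compaction to drop every value it sees.
    class DropAllFilter : public rocksdb::CompactionFilter {
     public:
      bool Filter(int /*level*/, const rocksdb::Slice& /*key*/,
                  const rocksdb::Slice& /*existing_value*/,
                  std::string* /*new_value*/,
                  bool* /*value_changed*/) const override {
        return true;  // true means "delete this entry"
      }
      const char* Name() const override { return "DropAllFilter"; }
    };

    class DropAllFilterFactory : public rocksdb::CompactionFilterFactory {
     public:
      std::unique_ptr<rocksdb::CompactionFilter> CreateCompactionFilter(
          const rocksdb::CompactionFilter::Context& context) override {
        // context.is_full_compaction / is_manual_compaction are filled in by
        // Compaction::CreateCompactionFilter() in the .cc hunk above.
        return std::unique_ptr<rocksdb::CompactionFilter>(new DropAllFilter());
      }
      const char* Name() const override { return "DropAllFilterFactory"; }
    };
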
diff --git a/src/rocksdb/db/compaction_iterator.cc b/src/rocksdb/db/compaction_iterator.cc
new file mode 100644
index 0000000..d242291
--- /dev/null
+++ b/src/rocksdb/db/compaction_iterator.cc
@@ -0,0 +1,338 @@
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include "db/compaction_iterator.h"
+
+namespace rocksdb {
+
+CompactionIterator::CompactionIterator(
+    Iterator* input, const Comparator* cmp, MergeHelper* merge_helper,
+    SequenceNumber last_sequence, std::vector<SequenceNumber>* snapshots,
+    Env* env, bool expect_valid_internal_key, Compaction* compaction,
+    const CompactionFilter* compaction_filter, LogBuffer* log_buffer)
+    : input_(input),
+      cmp_(cmp),
+      merge_helper_(merge_helper),
+      snapshots_(snapshots),
+      env_(env),
+      expect_valid_internal_key_(expect_valid_internal_key),
+      compaction_(compaction),
+      compaction_filter_(compaction_filter),
+      log_buffer_(log_buffer),
+      merge_out_iter_(merge_helper_) {
+  assert(compaction_filter_ == nullptr || compaction_ != nullptr);
+  bottommost_level_ =
+      compaction_ == nullptr ? false : compaction_->bottommost_level();
+  if (compaction_ != nullptr) {
+    level_ptrs_ = std::vector<size_t>(compaction_->number_levels(), 0);
+  }
+
+  if (snapshots_->size() == 0) {
+    // optimize for fast path if there are no snapshots
+    visible_at_tip_ = last_sequence;
+    earliest_snapshot_ = visible_at_tip_;
+    latest_snapshot_ = 0;
+  } else {
+    visible_at_tip_ = 0;
+    earliest_snapshot_ = snapshots_->at(0);
+    latest_snapshot_ = snapshots_->back();
+  }
+}
+
+void CompactionIterator::ResetRecordCounts() {
+  iter_stats_.num_record_drop_user = 0;
+  iter_stats_.num_record_drop_hidden = 0;
+  iter_stats_.num_record_drop_obsolete = 0;
+}
+
+void CompactionIterator::SeekToFirst() {
+  NextFromInput();
+  PrepareOutput();
+}
+
+void CompactionIterator::Next() {
+  // If there is a merge output, return it before continuing to process the
+  // input.
+  if (merge_out_iter_.Valid()) {
+    merge_out_iter_.Next();
+
+    // Check if we returned all records of the merge output.
+    if (merge_out_iter_.Valid()) {
+      key_ = merge_out_iter_.key();
+      value_ = merge_out_iter_.value();
+      bool valid_key __attribute__((__unused__)) =
+          ParseInternalKey(key_, &ikey_);
+      // MergeUntil stops when it encounters a corrupt key and does not
+      // include it in the result, so we expect the keys here to be valid.
+      assert(valid_key);
+      // Keep current_key_ in sync.
+      current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
+      key_ = current_key_.GetKey();
+      ikey_.user_key = current_key_.GetUserKey();
+      valid_ = true;
+    } else {
+      // MergeHelper moves the iterator to the first record after the merged
+      // records, so even though we reached the end of the merge output, we do
+      // not want to advance the iterator.
+      NextFromInput();
+    }
+  } else {
+    // Only advance the input iterator if there is no merge output and the
+    // iterator is not already at the next record.
+    if (!at_next_) {
+      input_->Next();
+    }
+    NextFromInput();
+  }
+
+  PrepareOutput();
+}
+
+void CompactionIterator::NextFromInput() {
+  at_next_ = false;
+  valid_ = false;
+
+  while (!valid_ && input_->Valid()) {
+    key_ = input_->key();
+    value_ = input_->value();
+    iter_stats_.num_input_records++;
+
+    if (!ParseInternalKey(key_, &ikey_)) {
+      // If `expect_valid_internal_key_` is false, return the corrupted key
+      // and let the caller decide what to do with it.
+      // TODO(noetzli): We should have a more elegant solution for this.
+      if (expect_valid_internal_key_) {
+        assert(!"Corrupted internal key not expected.");
+        status_ = Status::Corruption("Corrupted internal key not expected.");
+        break;
+      }
+      key_ = current_key_.SetKey(key_);
+      has_current_user_key_ = false;
+      current_user_key_sequence_ = kMaxSequenceNumber;
+      current_user_key_snapshot_ = 0;
+      iter_stats_.num_input_corrupt_records++;
+      valid_ = true;
+      break;
+    }
+
+    // Update input statistics
+    if (ikey_.type == kTypeDeletion || ikey_.type == kTypeSingleDeletion) {
+      iter_stats_.num_input_deletion_records++;
+    }
+    iter_stats_.total_input_raw_key_bytes += key_.size();
+    iter_stats_.total_input_raw_value_bytes += value_.size();
+
+    // Check whether the user key changed. After this if statement current_key_
+    // is a copy of the current input key (maybe converted to a delete by the
+    // compaction filter). ikey_.user_key is pointing to the copy.
+    if (!has_current_user_key_ ||
+        !cmp_->Equal(ikey_.user_key, current_user_key_)) {
+      // First occurrence of this user key
+      key_ = current_key_.SetKey(key_, &ikey_);
+      current_user_key_ = ikey_.user_key;
+      has_current_user_key_ = true;
+      current_user_key_sequence_ = kMaxSequenceNumber;
+      current_user_key_snapshot_ = 0;
+      // apply the compaction filter to the first occurrence of the user key
+      if (compaction_filter_ != nullptr && ikey_.type == kTypeValue &&
+          (visible_at_tip_ || ikey_.sequence > latest_snapshot_)) {
+        // If the user has specified a compaction filter and the sequence
+        // number is greater than any external snapshot, then invoke the
+        // filter. If the return value of the compaction filter is true,
+        // replace the entry with a deletion marker.
+        bool value_changed = false;
+        bool to_delete = false;
+        compaction_filter_value_.clear();
+        {
+          StopWatchNano timer(env_, true);
+          to_delete = compaction_filter_->Filter(
+              compaction_->level(), ikey_.user_key, value_,
+              &compaction_filter_value_, &value_changed);
+          iter_stats_.total_filter_time +=
+              env_ != nullptr ? timer.ElapsedNanos() : 0;
+        }
+        if (to_delete) {
+          // convert the current key to a delete
+          ikey_.type = kTypeDeletion;
+          current_key_.UpdateInternalKey(ikey_.sequence, kTypeDeletion);
+          // no value associated with delete
+          value_.clear();
+          iter_stats_.num_record_drop_user++;
+        } else if (value_changed) {
+          value_ = compaction_filter_value_;
+        }
+      }
+    } else {
+      // Update the current key to reflect the new sequence number/type without
+      // copying the user key.
+      current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
+      key_ = current_key_.GetKey();
+      ikey_.user_key = current_key_.GetUserKey();
+    }
+
+    // If there are no snapshots, then this kv affects visibility at the tip.
+    // Otherwise, search through all existing snapshots to find the earliest
+    // snapshot that is affected by this kv.
+    SequenceNumber last_sequence __attribute__((__unused__)) =
+        current_user_key_sequence_;
+    current_user_key_sequence_ = ikey_.sequence;
+    SequenceNumber last_snapshot = current_user_key_snapshot_;
+    SequenceNumber prev_snapshot = 0;  // 0 means no previous snapshot
+    current_user_key_snapshot_ =
+        visible_at_tip_ ? visible_at_tip_ : findEarliestVisibleSnapshot(
+                                                ikey_.sequence, &prev_snapshot);
+
+    if (ikey_.type == kTypeSingleDeletion) {
+      ParsedInternalKey next_ikey;
+      input_->Next();
+
+      // Check whether the current key is valid, not corrupt and the same
+      // as the single delete.
+      if (input_->Valid() && ParseInternalKey(input_->key(), &next_ikey) &&
+          cmp_->Equal(ikey_.user_key, next_ikey.user_key)) {
+        // Mixing single deletes and merges is not supported. Consecutive
+        // single deletes are not valid.
+        if (next_ikey.type != kTypeValue) {
+          assert(false);
+          status_ =
+              Status::InvalidArgument("Put expected after single delete.");
+          break;
+        }
+
+        // Check whether the current key belongs to the same snapshot as the
+        // single delete.
+        if (prev_snapshot == 0 || next_ikey.sequence > prev_snapshot) {
+          // Found the matching value, we can drop the single delete and the
+          // value.
+          ++iter_stats_.num_record_drop_hidden;
+          ++iter_stats_.num_record_drop_obsolete;
+          input_->Next();
+        } else {
+          // We hit the next snapshot without hitting a put, so the iterator
+          // returns the single delete.
+          valid_ = true;
+        }
+      } else {
+        // We are at the end of the input, could not parse the next key, or hit
+        // the next key. The iterator returns the single delete if the key
+        // possibly exists beyond the current output level. We set
+        // has_current_user_key_ to false so that, if the iterator is at the
+        // next key, we do not compare it again against the previous key at
+        // the next iteration. If the next key is corrupt, we return before
+        // the comparison, so the value of has_current_user_key_ does not
+        // matter.
+        has_current_user_key_ = false;
+        if (compaction_ != nullptr &&
+            compaction_->KeyNotExistsBeyondOutputLevel(ikey_.user_key,
+                                                       &level_ptrs_)) {
+          ++iter_stats_.num_record_drop_obsolete;
+        } else {
+          valid_ = true;
+        }
+      }
+
+      if (valid_) {
+        at_next_ = true;
+      }
+    } else if (last_snapshot == current_user_key_snapshot_) {
+      // If the earliest snapshot in which this key is visible is the same
+      // as the earliest snapshot in which a previous instance of the same
+      // key was visible, then this kv is not visible in any snapshot:
+      // it is hidden by a newer entry for the same user key.
+      // TODO: why not > ?
+      assert(last_sequence >= current_user_key_sequence_);
+      ++iter_stats_.num_record_drop_hidden;  // (A)
+      input_->Next();
+    } else if (compaction_ != nullptr && ikey_.type == kTypeDeletion &&
+               ikey_.sequence <= earliest_snapshot_ &&
+               compaction_->KeyNotExistsBeyondOutputLevel(ikey_.user_key,
+                                                          &level_ptrs_)) {
+      // TODO(noetzli): This is the only place where we use compaction_
+      // (besides the constructor). We should probably get rid of this
+      // dependency and find a way to do similar filtering during flushes.
+      //
+      // For this user key:
+      // (1) there is no data in higher levels
+      // (2) data in lower levels will have larger sequence numbers
+      // (3) data in layers that are being compacted here and have
+      //     smaller sequence numbers will be dropped in the next
+      //     few iterations of this loop (by rule (A) above).
+      // Therefore this deletion marker is obsolete and can be dropped.
+      ++iter_stats_.num_record_drop_obsolete;
+      input_->Next();
+    } else if (ikey_.type == kTypeMerge) {
+      if (!merge_helper_->HasOperator()) {
+        LogToBuffer(log_buffer_, "Options::merge_operator is null.");
+        status_ = Status::InvalidArgument(
+            "merge_operator is not properly initialized.");
+        return;
+      }
+
+      // We know the merge-type entry is not hidden, otherwise we would
+      // have hit (A) above.
+      // We encapsulate the merge-related state machine in a separate
+      // object to minimize changes to the existing flow.
+      merge_helper_->MergeUntil(input_, prev_snapshot, bottommost_level_);
+      merge_out_iter_.SeekToFirst();
+
+      if (merge_out_iter_.Valid()) {
+        // NOTE: key, value, and ikey_ refer to old entries.
+        //       These will be correctly set below.
+        key_ = merge_out_iter_.key();
+        value_ = merge_out_iter_.value();
+        bool valid_key __attribute__((__unused__)) =
+            ParseInternalKey(key_, &ikey_);
+        // MergeUntil stops when it encounters a corrupt key and does not
+        // include it in the result, so we expect the keys here to be valid.
+        assert(valid_key);
+        // Keep current_key_ in sync.
+        current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
+        key_ = current_key_.GetKey();
+        ikey_.user_key = current_key_.GetUserKey();
+        valid_ = true;
+      } else {
+        // All merge operands were filtered out. Reset the user key, since the
+        // batch consumed by the merge operator should not shadow any keys
+        // coming after the merges.
+        has_current_user_key_ = false;
+      }
+    } else {
+      valid_ = true;
+    }
+  }
+}
+
+void CompactionIterator::PrepareOutput() {
+  // Zeroing out the sequence number leads to better compression.
+  // If this is the bottommost level (no files in lower levels)
+  // and the earliest snapshot is larger than this seqno
+  // then we can squash the seqno to zero.
+  if (bottommost_level_ && valid_ && ikey_.sequence < earliest_snapshot_ &&
+      ikey_.type != kTypeMerge) {
+    assert(ikey_.type != kTypeDeletion && ikey_.type != kTypeSingleDeletion);
+    ikey_.sequence = 0;
+    current_key_.UpdateInternalKey(0, ikey_.type);
+  }
+}
+
+inline SequenceNumber CompactionIterator::findEarliestVisibleSnapshot(
+    SequenceNumber in, SequenceNumber* prev_snapshot) {
+  assert(snapshots_->size());
+  SequenceNumber prev __attribute__((unused)) = 0;
+  for (const auto cur : *snapshots_) {
+    assert(prev <= cur);
+    if (cur >= in) {
+      *prev_snapshot = prev;
+      return cur;
+    }
+    prev = cur;
+    assert(prev);
+  }
+  *prev_snapshot = prev;
+  return kMaxSequenceNumber;
+}
+
+}  // namespace rocksdb
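
The linear scan in findEarliestVisibleSnapshot() above relies only on the snapshot list being sorted ascending. A distilled, self-contained sketch of the same contract (the names here are illustrative, not from the patch):

    #include <cstdint>
    #include <limits>
    #include <vector>

    using SequenceNumber = uint64_t;

    // Returns the smallest snapshot >= seq (the earliest snapshot that can see
    // an entry written at seq) and reports the largest snapshot < seq via
    // *prev. Returns max() when the entry is newer than every snapshot, i.e.
    // visible only at the tip.
    SequenceNumber EarliestVisibleSnapshot(
        const std::vector<SequenceNumber>& snapshots,  // sorted ascending
        SequenceNumber seq, SequenceNumber* prev) {
      *prev = 0;
      for (SequenceNumber cur : snapshots) {
        if (cur >= seq) {
          return cur;
        }
        *prev = cur;
      }
      return std::numeric_limits<SequenceNumber>::max();
    }

    // Example: snapshots {10, 20, 30}, seq 15 -> returns 20 with *prev == 10,
    // so an entry written at seq 15 is first visible to snapshot 20.
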
diff --git a/src/rocksdb/db/compaction_iterator.h b/src/rocksdb/db/compaction_iterator.h
new file mode 100644
index 0000000..da242f6
--- /dev/null
+++ b/src/rocksdb/db/compaction_iterator.h
@@ -0,0 +1,138 @@
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+#pragma once
+
+#include <algorithm>
+#include <deque>
+#include <string>
+#include <vector>
+
+#include "db/compaction.h"
+#include "db/merge_helper.h"
+#include "rocksdb/compaction_filter.h"
+#include "util/log_buffer.h"
+
+namespace rocksdb {
+
+struct CompactionIteratorStats {
+  // Compaction statistics
+  int64_t num_record_drop_user = 0;
+  int64_t num_record_drop_hidden = 0;
+  int64_t num_record_drop_obsolete = 0;
+  uint64_t total_filter_time = 0;
+
+  // Input statistics
+  // TODO(noetzli): The stats are incomplete. They are lacking everything
+  // consumed by MergeHelper.
+  uint64_t num_input_records = 0;
+  uint64_t num_input_deletion_records = 0;
+  uint64_t num_input_corrupt_records = 0;
+  uint64_t total_input_raw_key_bytes = 0;
+  uint64_t total_input_raw_value_bytes = 0;
+};
+
+class CompactionIterator {
+ public:
+  CompactionIterator(Iterator* input, const Comparator* cmp,
+                     MergeHelper* merge_helper, SequenceNumber last_sequence,
+                     std::vector<SequenceNumber>* snapshots, Env* env,
+                     bool expect_valid_internal_key,
+                     Compaction* compaction = nullptr,
+                     const CompactionFilter* compaction_filter = nullptr,
+                     LogBuffer* log_buffer = nullptr);
+
+  void ResetRecordCounts();
+
+  // Seek to the beginning of the compaction iterator output.
+  //
+  // REQUIRED: Call only once.
+  void SeekToFirst();
+
+  // Produces the next record in the compaction.
+  //
+  // REQUIRED: SeekToFirst() has been called.
+  void Next();
+
+  // Getters
+  const Slice& key() const { return key_; }
+  const Slice& value() const { return value_; }
+  const Status& status() const { return status_; }
+  const ParsedInternalKey& ikey() const { return ikey_; }
+  bool Valid() const { return valid_; }
+  const Slice& user_key() const { return current_user_key_; }
+  const CompactionIteratorStats& iter_stats() const { return iter_stats_; }
+
+ private:
+  // Processes the input stream to find the next output
+  void NextFromInput();
+
+  // Do last preparations before presenting the output to the callee. At this
+  // point this only zeroes out the sequence number if possible for better
+  // compression.
+  void PrepareOutput();
+
+  // Given a sequence number, return the sequence number of the
+  // earliest snapshot that this sequence number is visible in.
+  // The snapshots themselves are arranged in ascending order of
+  // sequence numbers.
+  // Employ a sequential search because the total number of
+  // snapshots is typically small.
+  inline SequenceNumber findEarliestVisibleSnapshot(
+      SequenceNumber in, SequenceNumber* prev_snapshot);
+
+  Iterator* input_;
+  const Comparator* cmp_;
+  MergeHelper* merge_helper_;
+  const std::vector<SequenceNumber>* snapshots_;
+  Env* env_;
+  bool expect_valid_internal_key_;
+  Compaction* compaction_;
+  const CompactionFilter* compaction_filter_;
+  LogBuffer* log_buffer_;
+  bool bottommost_level_;
+  bool valid_ = false;
+  SequenceNumber visible_at_tip_;
+  SequenceNumber earliest_snapshot_;
+  SequenceNumber latest_snapshot_;
+
+  // State
+  //
+  // Points to a copy of the current compaction iterator output (current_key_)
+  // if valid_.
+  Slice key_;
+  // Points to the value in the underlying iterator that corresponds to the
+  // current output.
+  Slice value_;
+  // The status is OK unless the compaction iterator encounters a merge
+  // operand while no merge operator is defined.
+  Status status_;
+  // Stores the user key, sequence number and type of the current compaction
+  // iterator output (or current key in the underlying iterator during
+  // NextFromInput()).
+  ParsedInternalKey ikey_;
+  // Stores whether ikey_.user_key is valid. If set to false, the user key is
+  // not compared against the current key in the underlying iterator.
+  bool has_current_user_key_ = false;
+  // If true, the underlying input iterator is already positioned at the next
+  // record, so Next() must not advance it again.
+  bool at_next_ = false;
+  // Holds a copy of the current compaction iterator output (or current key in
+  // the underlying iterator during NextFromInput()).
+  IterKey current_key_;
+  Slice current_user_key_;
+  SequenceNumber current_user_key_sequence_;
+  SequenceNumber current_user_key_snapshot_;
+  MergeOutputIterator merge_out_iter_;
+  std::string compaction_filter_value_;
+  // level_ptrs_ holds, for each level, the index of the file that was being
+  // checked during the last call to compaction_->
+  // KeyNotExistsBeyondOutputLevel(). This allows a future call to pick up
+  // where the last one left off: the keys within a subcompaction are
+  // increasing, so a later call must be looking for a key that is in or
+  // beyond the last file checked during the previous call.
+  std::vector<size_t> level_ptrs_;
+  CompactionIteratorStats iter_stats_;
+};
+}  // namespace rocksdb
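
The header above implies the usual seek/valid/next driving shape. A minimal consumption loop, assuming a fully constructed iterator as in the test below (the table-builder hand-off is elided):

    #include <cassert>

    #include "db/compaction_iterator.h"

    // Sketch only: drains a CompactionIterator that was built elsewhere.
    void DrainCompactionOutput(rocksdb::CompactionIterator* c_iter) {
      c_iter->SeekToFirst();  // must be called exactly once
      while (c_iter->Valid()) {
        const rocksdb::Slice& k = c_iter->key();    // full internal key
        const rocksdb::Slice& v = c_iter->value();  // corresponding value
        // ... hand (k, v) to the output table builder here ...
        (void)k;
        (void)v;
        c_iter->Next();
      }
      // A non-OK status means e.g. a merge operand without a merge operator.
      assert(c_iter->status().ok());
    }
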
diff --git a/src/rocksdb/db/compaction_iterator_test.cc b/src/rocksdb/db/compaction_iterator_test.cc
new file mode 100644
index 0000000..1148c2a
--- /dev/null
+++ b/src/rocksdb/db/compaction_iterator_test.cc
@@ -0,0 +1,71 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include "db/compaction_iterator.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace rocksdb {
+
+class CompactionIteratorTest : public testing::Test {
+ public:
+  CompactionIteratorTest() : cmp_(BytewiseComparator()), snapshots_({}) {}
+
+  void InitIterator(const std::vector<std::string>& ks,
+                    const std::vector<std::string>& vs,
+                    SequenceNumber last_sequence) {
+    merge_helper_.reset(new MergeHelper(Env::Default(), cmp_, nullptr, nullptr,
+                                        nullptr, 0U, false, 0));
+    iter_.reset(new test::VectorIterator(ks, vs));
+    iter_->SeekToFirst();
+    c_iter_.reset(new CompactionIterator(iter_.get(), cmp_, merge_helper_.get(),
+                                         last_sequence, &snapshots_,
+                                         Env::Default(), false));
+  }
+
+  const Comparator* cmp_;
+  std::vector<SequenceNumber> snapshots_;
+  std::unique_ptr<MergeHelper> merge_helper_;
+  std::unique_ptr<test::VectorIterator> iter_;
+  std::unique_ptr<CompactionIterator> c_iter_;
+};
+
+// It is possible that the output of the compaction iterator is empty even if
+// the input is not.
+TEST_F(CompactionIteratorTest, EmptyResult) {
+  InitIterator({test::KeyStr("a", 5, kTypeSingleDeletion),
+                test::KeyStr("a", 3, kTypeValue)},
+               {"", "val"}, 5);
+  c_iter_->SeekToFirst();
+  ASSERT_FALSE(c_iter_->Valid());
+}
+
+// If there is a corruption after a single deletion, the corrupted key should
+// be preserved.
+TEST_F(CompactionIteratorTest, CorruptionAfterSingleDeletion) {
+  InitIterator({test::KeyStr("a", 5, kTypeSingleDeletion),
+                test::KeyStr("a", 3, kTypeValue, true),
+                test::KeyStr("b", 10, kTypeValue)},
+               {"", "val", "val2"}, 10);
+  c_iter_->SeekToFirst();
+  ASSERT_TRUE(c_iter_->Valid());
+  ASSERT_EQ(test::KeyStr("a", 5, kTypeSingleDeletion),
+            c_iter_->key().ToString());
+  c_iter_->Next();
+  ASSERT_TRUE(c_iter_->Valid());
+  ASSERT_EQ(test::KeyStr("a", 3, kTypeValue, true), c_iter_->key().ToString());
+  c_iter_->Next();
+  ASSERT_TRUE(c_iter_->Valid());
+  ASSERT_EQ(test::KeyStr("b", 10, kTypeValue), c_iter_->key().ToString());
+  c_iter_->Next();
+  ASSERT_FALSE(c_iter_->Valid());
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
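
The fixture makes further cases cheap to write. As a purely hypothetical example (not part of this import), a check that with no snapshots an older version of a key is hidden by a newer one, per rule (A) in compaction_iterator.cc above, could look like:

    TEST_F(CompactionIteratorTest, NewerVersionHidesOlder) {
      InitIterator({test::KeyStr("a", 7, kTypeValue),
                    test::KeyStr("a", 3, kTypeValue)},
                   {"new", "old"}, 7);
      c_iter_->SeekToFirst();
      ASSERT_TRUE(c_iter_->Valid());
      ASSERT_EQ("new", c_iter_->value().ToString());
      c_iter_->Next();
      // a@3 is dropped as num_record_drop_hidden: with no snapshots both
      // versions map to the same visibility, so only the newest survives.
      ASSERT_FALSE(c_iter_->Valid());
    }
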
diff --git a/src/rocksdb/db/compaction_job.cc b/src/rocksdb/db/compaction_job.cc
index 12f35cb..4d6656d 100644
--- a/src/rocksdb/db/compaction_job.cc
+++ b/src/rocksdb/db/compaction_job.cc
@@ -15,24 +15,28 @@
 
 #include <inttypes.h>
 #include <algorithm>
+#include <functional>
 #include <vector>
 #include <memory>
 #include <list>
+#include <set>
+#include <thread>
+#include <utility>
 
 #include "db/builder.h"
 #include "db/db_iter.h"
 #include "db/dbformat.h"
-#include "db/event_logger_helpers.h"
+#include "db/event_helpers.h"
 #include "db/filename.h"
 #include "db/log_reader.h"
 #include "db/log_writer.h"
 #include "db/memtable.h"
-#include "db/merge_helper.h"
 #include "db/memtable_list.h"
 #include "db/merge_context.h"
+#include "db/merge_helper.h"
 #include "db/version_set.h"
-#include "port/port.h"
 #include "port/likely.h"
+#include "port/port.h"
 #include "rocksdb/db.h"
 #include "rocksdb/env.h"
 #include "rocksdb/statistics.h"
@@ -42,13 +46,13 @@
 #include "table/block_based_table_factory.h"
 #include "table/merger.h"
 #include "table/table_builder.h"
-#include "table/two_level_iterator.h"
 #include "util/coding.h"
-#include "util/logging.h"
+#include "util/file_reader_writer.h"
+#include "util/iostats_context_imp.h"
 #include "util/log_buffer.h"
+#include "util/logging.h"
 #include "util/mutexlock.h"
 #include "util/perf_context_imp.h"
-#include "util/iostats_context_imp.h"
 #include "util/stop_watch.h"
 #include "util/string_util.h"
 #include "util/sync_point.h"
@@ -56,146 +60,150 @@
 
 namespace rocksdb {
 
-struct CompactionJob::CompactionState {
-  Compaction* const compaction;
+// Maintains state for each subcompaction
+struct CompactionJob::SubcompactionState {
+  Compaction* compaction;
+  std::unique_ptr<CompactionIterator> c_iter;
+
+  // The boundaries of the key range this subcompaction is interested in. No
+  // two subcompactions may have overlapping key ranges.
+  // 'start' is inclusive, 'end' is exclusive, and nullptr means unbounded.
+  Slice *start, *end;
 
-  // Files produced by compaction
+  // The return status of this subcompaction
+  Status status;
+
+  // Files produced by this subcompaction
   struct Output {
-    uint64_t number;
-    uint32_t path_id;
-    uint64_t file_size;
-    InternalKey smallest, largest;
-    SequenceNumber smallest_seqno, largest_seqno;
+    FileMetaData meta;
+    bool finished;
   };
-  std::vector<Output> outputs;
 
   // State kept for output being generated
-  std::unique_ptr<WritableFile> outfile;
+  std::vector<Output> outputs;
+  std::unique_ptr<WritableFileWriter> outfile;
   std::unique_ptr<TableBuilder> builder;
+  Output* current_output() {
+    if (outputs.empty()) {
+      // This subcompaction's output could be empty if the compaction was
+      // aborted before this subcompaction had a chance to generate any output
+      // files. When subcompactions are executed sequentially this is more
+      // likely, particularly for the later subcompactions. Once they are run
+      // in parallel, however, it should be much rarer.
+      return nullptr;
+    } else {
+      return &outputs.back();
+    }
+  }
 
+  // State during the subcompaction
   uint64_t total_bytes;
+  uint64_t num_input_records;
+  uint64_t num_output_records;
+  CompactionJobStats compaction_job_stats;
+  uint64_t approx_size;
 
-  Output* current_output() { return &outputs[outputs.size() - 1]; }
-
-  explicit CompactionState(Compaction* c)
+  SubcompactionState(Compaction* c, Slice* _start, Slice* _end,
+                     uint64_t size = 0)
       : compaction(c),
+        start(_start),
+        end(_end),
+        outfile(nullptr),
+        builder(nullptr),
         total_bytes(0),
         num_input_records(0),
-        num_output_records(0) {}
-
-  // Create a client visible context of this compaction
-  CompactionFilter::Context GetFilterContextV1() {
-    CompactionFilter::Context context;
-    context.is_full_compaction = compaction->IsFullCompaction();
-    context.is_manual_compaction = compaction->IsManualCompaction();
-    return context;
+        num_output_records(0),
+        approx_size(size) {
+    assert(compaction != nullptr);
   }
 
-  // Create a client visible context of this compaction
-  CompactionFilterContext GetFilterContext() {
-    CompactionFilterContext context;
-    context.is_full_compaction = compaction->IsFullCompaction();
-    context.is_manual_compaction = compaction->IsManualCompaction();
-    return context;
+  SubcompactionState(SubcompactionState&& o) { *this = std::move(o); }
+
+  SubcompactionState& operator=(SubcompactionState&& o) {
+    compaction = std::move(o.compaction);
+    start = std::move(o.start);
+    end = std::move(o.end);
+    status = std::move(o.status);
+    outputs = std::move(o.outputs);
+    outfile = std::move(o.outfile);
+    builder = std::move(o.builder);
+    total_bytes = std::move(o.total_bytes);
+    num_input_records = std::move(o.num_input_records);
+    num_output_records = std::move(o.num_output_records);
+    compaction_job_stats = std::move(o.compaction_job_stats);
+    approx_size = std::move(o.approx_size);
+    return *this;
   }
 
-  std::vector<std::string> key_str_buf_;
-  std::vector<std::string> existing_value_str_buf_;
-  // new_value_buf_ will only be appended if a value changes
-  std::vector<std::string> new_value_buf_;
-  // if values_changed_buf_[i] is true
-  // new_value_buf_ will add a new entry with the changed value
-  std::vector<bool> value_changed_buf_;
-  // to_delete_buf_[i] is true iff key_buf_[i] is deleted
-  std::vector<bool> to_delete_buf_;
+  // Because member unique_ptrs do not have these.
+  SubcompactionState(const SubcompactionState&) = delete;
 
-  std::vector<std::string> other_key_str_buf_;
-  std::vector<std::string> other_value_str_buf_;
+  SubcompactionState& operator=(const SubcompactionState&) = delete;
+};
 
-  std::vector<Slice> combined_key_buf_;
-  std::vector<Slice> combined_value_buf_;
+// Maintains state for the entire compaction
+struct CompactionJob::CompactionState {
+  Compaction* const compaction;
 
-  std::string cur_prefix_;
+  // REQUIRED: subcompaction states are stored in order of increasing
+  // key-range
+  std::vector<CompactionJob::SubcompactionState> sub_compact_states;
+  Status status;
 
+  uint64_t total_bytes;
   uint64_t num_input_records;
   uint64_t num_output_records;
 
-  // Buffers the kv-pair that will be run through compaction filter V2
-  // in the future.
-  void BufferKeyValueSlices(const Slice& key, const Slice& value) {
-    key_str_buf_.emplace_back(key.ToString());
-    existing_value_str_buf_.emplace_back(value.ToString());
-  }
+  explicit CompactionState(Compaction* c)
+      : compaction(c),
+        total_bytes(0),
+        num_input_records(0),
+        num_output_records(0) {}
 
-  // Buffers the kv-pair that will not be run through compaction filter V2
-  // in the future.
-  void BufferOtherKeyValueSlices(const Slice& key, const Slice& value) {
-    other_key_str_buf_.emplace_back(key.ToString());
-    other_value_str_buf_.emplace_back(value.ToString());
+  size_t NumOutputFiles() {
+    size_t total = 0;
+    for (auto& s : sub_compact_states) {
+      total += s.outputs.size();
+    }
+    return total;
   }
 
-  // Add a kv-pair to the combined buffer
-  void AddToCombinedKeyValueSlices(const Slice& key, const Slice& value) {
-    // The real strings are stored in the batch buffers
-    combined_key_buf_.emplace_back(key);
-    combined_value_buf_.emplace_back(value);
+  Slice SmallestUserKey() {
+    for (const auto& sub_compact_state : sub_compact_states) {
+      if (!sub_compact_state.outputs.empty() &&
+          sub_compact_state.outputs[0].finished) {
+        return sub_compact_state.outputs[0].meta.smallest.user_key();
+      }
+    }
+    // If there is no finished output, return an empty slice.
+    return Slice(nullptr, 0);
   }
 
-  // Merging the two buffers
-  void MergeKeyValueSliceBuffer(const InternalKeyComparator* comparator) {
-    size_t i = 0;
-    size_t j = 0;
-    size_t total_size = key_str_buf_.size() + other_key_str_buf_.size();
-    combined_key_buf_.reserve(total_size);
-    combined_value_buf_.reserve(total_size);
-
-    while (i + j < total_size) {
-      int comp_res = 0;
-      if (i < key_str_buf_.size() && j < other_key_str_buf_.size()) {
-        comp_res = comparator->Compare(key_str_buf_[i], other_key_str_buf_[j]);
-      } else if (i >= key_str_buf_.size() && j < other_key_str_buf_.size()) {
-        comp_res = 1;
-      } else if (j >= other_key_str_buf_.size() && i < key_str_buf_.size()) {
-        comp_res = -1;
-      }
-      if (comp_res > 0) {
-        AddToCombinedKeyValueSlices(other_key_str_buf_[j],
-                                    other_value_str_buf_[j]);
-        j++;
-      } else if (comp_res < 0) {
-        AddToCombinedKeyValueSlices(key_str_buf_[i],
-                                    existing_value_str_buf_[i]);
-        i++;
+  Slice LargestUserKey() {
+    for (auto it = sub_compact_states.rbegin(); it < sub_compact_states.rend();
+         ++it) {
+      if (!it->outputs.empty() && it->current_output()->finished) {
+        assert(it->current_output() != nullptr);
+        return it->current_output()->meta.largest.user_key();
       }
     }
+    // If there is no finished output, return an empty slice.
+    return Slice(nullptr, 0);
   }
+};
 
-  void CleanupBatchBuffer() {
-    to_delete_buf_.clear();
-    key_str_buf_.clear();
-    existing_value_str_buf_.clear();
-    new_value_buf_.clear();
-    value_changed_buf_.clear();
-
-    to_delete_buf_.shrink_to_fit();
-    key_str_buf_.shrink_to_fit();
-    existing_value_str_buf_.shrink_to_fit();
-    new_value_buf_.shrink_to_fit();
-    value_changed_buf_.shrink_to_fit();
-
-    other_key_str_buf_.clear();
-    other_value_str_buf_.clear();
-    other_key_str_buf_.shrink_to_fit();
-    other_value_str_buf_.shrink_to_fit();
+void CompactionJob::AggregateStatistics() {
+  for (SubcompactionState& sc : compact_->sub_compact_states) {
+    compact_->total_bytes += sc.total_bytes;
+    compact_->num_input_records += sc.num_input_records;
+    compact_->num_output_records += sc.num_output_records;
   }
-
-  void CleanupMergedBuffer() {
-    combined_key_buf_.clear();
-    combined_value_buf_.clear();
-    combined_key_buf_.shrink_to_fit();
-    combined_value_buf_.shrink_to_fit();
+  if (compaction_job_stats_) {
+    for (SubcompactionState& sc : compact_->sub_compact_states) {
+      compaction_job_stats_->Add(sc.compaction_job_stats);
+    }
   }
-};
+}
 
 CompactionJob::CompactionJob(
     int job_id, Compaction* compaction, const DBOptions& db_options,
@@ -203,12 +211,14 @@ CompactionJob::CompactionJob(
     std::atomic<bool>* shutting_down, LogBuffer* log_buffer,
     Directory* db_directory, Directory* output_directory, Statistics* stats,
     std::vector<SequenceNumber> existing_snapshots,
-    std::shared_ptr<Cache> table_cache,
-    std::function<uint64_t()> yield_callback, EventLogger* event_logger,
-    bool paranoid_file_checks)
+    std::shared_ptr<Cache> table_cache, EventLogger* event_logger,
+    bool paranoid_file_checks, bool measure_io_stats, const std::string& dbname,
+    CompactionJobStats* compaction_job_stats)
     : job_id_(job_id),
       compact_(new CompactionState(compaction)),
+      compaction_job_stats_(compaction_job_stats),
       compaction_stats_(1),
+      dbname_(dbname),
       db_options_(db_options),
       env_options_(env_options),
       env_(db_options.env),
@@ -220,9 +230,10 @@ CompactionJob::CompactionJob(
       stats_(stats),
       existing_snapshots_(std::move(existing_snapshots)),
       table_cache_(std::move(table_cache)),
-      yield_callback_(std::move(yield_callback)),
       event_logger_(event_logger),
-      paranoid_file_checks_(paranoid_file_checks) {
+      paranoid_file_checks_(paranoid_file_checks),
+      measure_io_stats_(measure_io_stats) {
+  assert(log_buffer_ != nullptr);
   ThreadStatusUtil::SetColumnFamily(compact_->compaction->column_family_data());
   ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION);
   ReportStartedCompaction(compaction);
@@ -247,11 +258,15 @@ void CompactionJob::ReportStartedCompaction(
       (static_cast<uint64_t>(compact_->compaction->start_level()) << 32) +
           compact_->compaction->output_level());
 
+  // In the current design, a CompactionJob is always created
+  // for non-trivial compaction.
+  assert(compaction->IsTrivialMove() == false ||
+         compaction->is_manual_compaction() == true);
+
   ThreadStatusUtil::SetThreadOperationProperty(
       ThreadStatus::COMPACTION_PROP_FLAGS,
-      compaction->IsManualCompaction() +
-          (compaction->IsDeletionCompaction() << 1) +
-          (compaction->IsTrivialMove() << 2));
+      compaction->is_manual_compaction() +
+          (compaction->deletion_compaction() << 1));
 
   ThreadStatusUtil::SetThreadOperationProperty(
       ThreadStatus::COMPACTION_TOTAL_INPUT_BYTES,
@@ -268,320 +283,284 @@ void CompactionJob::ReportStartedCompaction(
   // to ensure GetThreadList() can always show them all together.
   ThreadStatusUtil::SetThreadOperation(
       ThreadStatus::OP_COMPACTION);
+
+  if (compaction_job_stats_) {
+    compaction_job_stats_->is_manual_compaction =
+        compaction->is_manual_compaction();
+  }
 }
 
 void CompactionJob::Prepare() {
   AutoThreadOperationStageUpdater stage_updater(
       ThreadStatus::STAGE_COMPACTION_PREPARE);
-  compact_->CleanupBatchBuffer();
-  compact_->CleanupMergedBuffer();
 
   // Generate file_levels_ for the compaction before making the Iterator
-  ColumnFamilyData* cfd __attribute__((unused)) =
-      compact_->compaction->column_family_data();
-  assert(cfd != nullptr);
-
-  assert(cfd->current()->storage_info()->NumLevelFiles(
-             compact_->compaction->level()) > 0);
-  assert(compact_->builder == nullptr);
-  assert(!compact_->outfile);
-
-  visible_at_tip_ = 0;
-  latest_snapshot_ = 0;
-  if (existing_snapshots_.size() == 0) {
-    // optimize for fast path if there are no snapshots
-    visible_at_tip_ = versions_->LastSequence();
-    earliest_snapshot_ = visible_at_tip_;
-  } else {
-    latest_snapshot_ = existing_snapshots_.back();
-    // Add the current seqno as the 'latest' virtual
-    // snapshot to the end of this list.
-    existing_snapshots_.push_back(versions_->LastSequence());
-    earliest_snapshot_ = existing_snapshots_[0];
-  }
+  auto* c = compact_->compaction;
+  assert(c->column_family_data() != nullptr);
+  assert(c->column_family_data()->current()->storage_info()
+      ->NumLevelFiles(compact_->compaction->level()) > 0);
 
   // Is this compaction producing files at the bottommost level?
-  bottommost_level_ = compact_->compaction->BottomMostLevel();
-}
+  bottommost_level_ = c->bottommost_level();
 
-Status CompactionJob::Run() {
-  AutoThreadOperationStageUpdater stage_updater(
-      ThreadStatus::STAGE_COMPACTION_RUN);
-  TEST_SYNC_POINT("CompactionJob::Run():Start");
-  log_buffer_->FlushBufferToLog();
-  ColumnFamilyData* cfd = compact_->compaction->column_family_data();
+  if (c->ShouldFormSubcompactions()) {
+    const uint64_t start_micros = env_->NowMicros();
+    GenSubcompactionBoundaries();
+    MeasureTime(stats_, SUBCOMPACTION_SETUP_TIME,
+                env_->NowMicros() - start_micros);
 
-  auto* compaction = compact_->compaction;
-  // Let's check if anything will get logged. Don't prepare all the info if
-  // we're not logging
-  if (db_options_.info_log_level <= InfoLogLevel::INFO_LEVEL) {
-    Compaction::InputLevelSummaryBuffer inputs_summary;
-    Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
-        "[%s] [JOB %d] Compacting %s, score %.2f", cfd->GetName().c_str(),
-        job_id_, compaction->InputLevelSummary(&inputs_summary),
-        compaction->score());
-    char scratch[2345];
-    compact_->compaction->Summary(scratch, sizeof(scratch));
-    Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
-        "[%s] Compaction start summary: %s\n", cfd->GetName().c_str(), scratch);
-    // build event logger report
-    auto stream = event_logger_->Log();
-    stream << "job" << job_id_ << "event"
-           << "compaction_started";
-    for (size_t i = 0; i < compaction->num_input_levels(); ++i) {
-      stream << ("files_L" + ToString(compaction->level(i)));
-      stream.StartArray();
-      for (auto f : *compaction->inputs(i)) {
-        stream << f->fd.GetNumber();
-      }
-      stream.EndArray();
+    assert(sizes_.size() == boundaries_.size() + 1);
+
+    for (size_t i = 0; i <= boundaries_.size(); i++) {
+      Slice* start = i == 0 ? nullptr : &boundaries_[i - 1];
+      Slice* end = i == boundaries_.size() ? nullptr : &boundaries_[i];
+      compact_->sub_compact_states.emplace_back(c, start, end, sizes_[i]);
     }
-    stream << "score" << compaction->score() << "input_data_size"
-           << compaction->CalculateTotalInputSize();
+    MeasureTime(stats_, NUM_SUBCOMPACTIONS_SCHEDULED,
+                compact_->sub_compact_states.size());
+  } else {
+    compact_->sub_compact_states.emplace_back(c, nullptr, nullptr);
   }
+}
 
-  const uint64_t start_micros = env_->NowMicros();
-  std::unique_ptr<Iterator> input(
-      versions_->MakeInputIterator(compact_->compaction));
-  input->SeekToFirst();
+struct RangeWithSize {
+  Range range;
+  uint64_t size;
 
-  Status status;
-  ParsedInternalKey ikey;
-  std::unique_ptr<CompactionFilterV2> compaction_filter_from_factory_v2 =
-      nullptr;
-  auto context = compact_->GetFilterContext();
-  compaction_filter_from_factory_v2 =
-      cfd->ioptions()->compaction_filter_factory_v2->CreateCompactionFilterV2(
-          context);
-  auto compaction_filter_v2 = compaction_filter_from_factory_v2.get();
-
-  int64_t imm_micros = 0;  // Micros spent doing imm_ compactions
-  if (!compaction_filter_v2) {
-    status = ProcessKeyValueCompaction(&imm_micros, input.get(), false);
-  } else {
-    // temp_backup_input always point to the start of the current buffer
-    // temp_backup_input = backup_input;
-    // iterate through input,
-    // 1) buffer ineligible keys and value keys into 2 separate buffers;
-    // 2) send value_buffer to compaction filter and alternate the values;
-    // 3) merge value_buffer with ineligible_value_buffer;
-    // 4) run the modified "compaction" using the old for loop.
-    bool prefix_initialized = false;
-    shared_ptr<Iterator> backup_input(
-        versions_->MakeInputIterator(compact_->compaction));
-    backup_input->SeekToFirst();
-    uint64_t total_filter_time = 0;
-    while (backup_input->Valid() &&
-           !shutting_down_->load(std::memory_order_acquire) &&
-           !cfd->IsDropped()) {
-      // FLUSH preempts compaction
-      // TODO(icanadi) this currently only checks if flush is necessary on
-      // compacting column family. we should also check if flush is necessary on
-      // other column families, too
-
-      imm_micros += yield_callback_();
-
-      Slice key = backup_input->key();
-      Slice value = backup_input->value();
-
-      if (!ParseInternalKey(key, &ikey)) {
-        // log error
-        Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log,
-            "[%s] [JOB %d] Failed to parse key: %s", cfd->GetName().c_str(),
-            job_id_, key.ToString().c_str());
-        continue;
-      } else {
-        const SliceTransform* transformer =
-            cfd->ioptions()->compaction_filter_factory_v2->GetPrefixExtractor();
-        const auto key_prefix = transformer->Transform(ikey.user_key);
-        if (!prefix_initialized) {
-          compact_->cur_prefix_ = key_prefix.ToString();
-          prefix_initialized = true;
-        }
-        // If the prefix remains the same, keep buffering
-        if (key_prefix.compare(Slice(compact_->cur_prefix_)) == 0) {
-          // Apply the compaction filter V2 to all the kv pairs sharing
-          // the same prefix
-          if (ikey.type == kTypeValue &&
-              (visible_at_tip_ || ikey.sequence > latest_snapshot_)) {
-            // Buffer all keys sharing the same prefix for CompactionFilterV2
-            // Iterate through keys to check prefix
-            compact_->BufferKeyValueSlices(key, value);
-          } else {
-            // buffer ineligible keys
-            compact_->BufferOtherKeyValueSlices(key, value);
-          }
-          backup_input->Next();
-          continue;
-          // finish changing values for eligible keys
-        } else {
-          // Now prefix changes, this batch is done.
-          // Call compaction filter on the buffered values to change the value
-          if (compact_->key_str_buf_.size() > 0) {
-            uint64_t time = 0;
-            CallCompactionFilterV2(compaction_filter_v2, &time);
-            total_filter_time += time;
-          }
-          compact_->cur_prefix_ = key_prefix.ToString();
-        }
-      }
-
-      // Merge this batch of data (values + ineligible keys)
-      compact_->MergeKeyValueSliceBuffer(&cfd->internal_comparator());
+  RangeWithSize(const Slice& a, const Slice& b, uint64_t s = 0)
+      : range(a, b), size(s) {}
+};
 
-      // Done buffering for the current prefix. Spit it out to disk
-      // Now just iterate through all the kv-pairs
-      status = ProcessKeyValueCompaction(&imm_micros, input.get(), true);
+bool SliceCompare(const Comparator* cmp, const Slice& a, const Slice& b) {
+  // Returns true if a < b
+  return cmp->Compare(ExtractUserKey(a), ExtractUserKey(b)) < 0;
+}
 
-      if (!status.ok()) {
+// Generates a histogram representing potential divisions of key ranges from
+// the input. It adds the starting and/or ending keys of certain input files
+// to the working set and then finds the approximate size of data in between
+// each consecutive pair of slices. Then it divides these ranges into
+// consecutive groups such that each group has a similar size.
+void CompactionJob::GenSubcompactionBoundaries() {
+  auto* c = compact_->compaction;
+  auto* cfd = c->column_family_data();
+  std::set<Slice, std::function<bool(const Slice& a, const Slice& b)> > bounds(
+      std::bind(&SliceCompare, cfd->user_comparator(), std::placeholders::_1,
+                std::placeholders::_2));
+  int start_lvl = c->start_level();
+  int out_lvl = c->output_level();
+
+  // Add the starting and/or ending key of certain input files as a potential
+  // boundary (because we're inserting into a set, it avoids duplicates)
+  for (size_t lvl_idx = 0; lvl_idx < c->num_input_levels(); lvl_idx++) {
+    int lvl = c->level(lvl_idx);
+    if (lvl >= start_lvl && lvl <= out_lvl) {
+      const LevelFilesBrief* flevel = c->input_levels(lvl_idx);
+      size_t num_files = flevel->num_files;
+
+      if (num_files == 0) {
         break;
       }
 
-      // After writing the kv-pairs, we can safely remove the reference
-      // to the string buffer and clean them up
-      compact_->CleanupBatchBuffer();
-      compact_->CleanupMergedBuffer();
-      // Buffer the key that triggers the mismatch in prefix
-      if (ikey.type == kTypeValue &&
-          (visible_at_tip_ || ikey.sequence > latest_snapshot_)) {
-        compact_->BufferKeyValueSlices(key, value);
+      if (lvl == 0) {
+        // For level 0 add the starting and ending key of each file since the
+        // files may have greatly differing key ranges (not range-partitioned)
+        for (size_t i = 0; i < num_files; i++) {
+          bounds.emplace(flevel->files[i].smallest_key);
+          bounds.emplace(flevel->files[i].largest_key);
+        }
       } else {
-        compact_->BufferOtherKeyValueSlices(key, value);
-      }
-      backup_input->Next();
-      if (!backup_input->Valid()) {
-        // If this is the single last value, we need to merge it.
-        if (compact_->key_str_buf_.size() > 0) {
-          uint64_t time = 0;
-          CallCompactionFilterV2(compaction_filter_v2, &time);
-          total_filter_time += time;
+        // For all other levels add the smallest/largest key in the level to
+        // encompass the range covered by that level
+        bounds.emplace(flevel->files[0].smallest_key);
+        bounds.emplace(flevel->files[num_files - 1].largest_key);
+        if (lvl == out_lvl) {
+          // For the last level include the starting keys of all files since
+          // the last level is the largest and probably has the widest key
+          // range. Since it's range partitioned, the ending key of one file
+          // and the starting key of the next are very close (or identical).
+          for (size_t i = 1; i < num_files; i++) {
+            bounds.emplace(flevel->files[i].smallest_key);
+          }
         }
-        compact_->MergeKeyValueSliceBuffer(&cfd->internal_comparator());
+      }
+    }
+  }
 
-        status = ProcessKeyValueCompaction(&imm_micros, input.get(), true);
-        if (!status.ok()) {
-          break;
-        }
+  // Combine consecutive pairs of boundaries into ranges with an approximate
+  // size of data covered by keys in that range
+  uint64_t sum = 0;
+  std::vector<RangeWithSize> ranges;
+  auto* v = cfd->current();
+  for (auto it = bounds.begin();;) {
+    const Slice a = *it;
+    it++;
+
+    if (it == bounds.end()) {
+      break;
+    }
 
-        compact_->CleanupBatchBuffer();
-        compact_->CleanupMergedBuffer();
+    const Slice b = *it;
+    uint64_t size = versions_->ApproximateSize(v, a, b, start_lvl, out_lvl + 1);
+    ranges.emplace_back(a, b, size);
+    sum += size;
+  }
+
+  // Group the ranges into subcompactions
+  const double min_file_fill_percent = 4.0 / 5;
+  uint64_t max_output_files = std::ceil(
+      sum / min_file_fill_percent /
+      cfd->GetCurrentMutableCFOptions()->MaxFileSizeForLevel(out_lvl));
+  uint64_t subcompactions =
+      std::min({static_cast<uint64_t>(ranges.size()),
+                static_cast<uint64_t>(db_options_.max_subcompactions),
+                max_output_files});
+
+  double mean = sum * 1.0 / subcompactions;
+
+  if (subcompactions > 1) {
+    // Greedily add ranges to the subcompaction until the sum of the ranges'
+    // sizes becomes >= the expected mean size of a subcompaction
+    sum = 0;
+    for (size_t i = 0; i < ranges.size() - 1; i++) {
+      if (subcompactions == 1) {
+        // If there's only one left to schedule then it goes to the end so no
+        // need to put an end boundary
+        break;
       }
-    }  // done processing all prefix batches
-    // finish the last batch
-    if (status.ok()) {
-      if (compact_->key_str_buf_.size() > 0) {
-        uint64_t time = 0;
-        CallCompactionFilterV2(compaction_filter_v2, &time);
-        total_filter_time += time;
+      sum += ranges[i].size;
+      if (sum >= mean) {
+        boundaries_.emplace_back(ExtractUserKey(ranges[i].range.limit));
+        sizes_.emplace_back(sum);
+        subcompactions--;
+        sum = 0;
       }
-      compact_->MergeKeyValueSliceBuffer(&cfd->internal_comparator());
-      status = ProcessKeyValueCompaction(&imm_micros, input.get(), true);
     }
-    RecordTick(stats_, FILTER_OPERATION_TOTAL_TIME, total_filter_time);
-  }  // checking for compaction filter v2
-
-  if (status.ok() &&
-      (shutting_down_->load(std::memory_order_acquire) || cfd->IsDropped())) {
-    status = Status::ShutdownInProgress(
-        "Database shutdown or Column family drop during compaction");
-  }
-  if (status.ok() && compact_->builder != nullptr) {
-    status = FinishCompactionOutputFile(input.get());
-  }
-  if (status.ok()) {
-    status = input->status();
+    sizes_.emplace_back(sum + ranges.back().size);
+  } else {
+    // Only one range so its size is the total sum of sizes computed above
+    sizes_.emplace_back(sum);
   }
-  input.reset();
+}
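
The grouping step at the end of GenSubcompactionBoundaries() above is a greedy one-pass cut: accumulate consecutive range sizes until a group reaches the mean share, then emit a boundary. A distilled sketch (RangeSize and GroupRanges are illustrative names, not from the patch):

    #include <cstdint>
    #include <string>
    #include <vector>

    struct RangeSize {
      std::string limit;  // user key ending this range
      uint64_t size;      // approximate bytes covered by this range
    };

    // Returns the user keys at which to split the work into subcompactions.
    std::vector<std::string> GroupRanges(const std::vector<RangeSize>& ranges,
                                         uint64_t subcompactions) {
      uint64_t total = 0;
      for (const auto& r : ranges) total += r.size;
      const double mean = total * 1.0 / subcompactions;

      std::vector<std::string> boundaries;
      uint64_t sum = 0;
      // The last subcompaction always runs to the end of the key space, so
      // stop cutting once only one is left to schedule.
      for (size_t i = 0; i + 1 < ranges.size() && subcompactions > 1; i++) {
        sum += ranges[i].size;
        if (sum >= mean) {
          boundaries.push_back(ranges[i].limit);
          subcompactions--;
          sum = 0;
        }
      }
      return boundaries;
    }

    // Example: sizes {5, 5, 10} with 2 subcompactions (mean 10) cuts once,
    // after the second range.
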
 
-  if (output_directory_ && !db_options_.disableDataSync) {
-    output_directory_->Fsync();
-  }
+Status CompactionJob::Run() {
+  AutoThreadOperationStageUpdater stage_updater(
+      ThreadStatus::STAGE_COMPACTION_RUN);
+  TEST_SYNC_POINT("CompactionJob::Run():Start");
+  log_buffer_->FlushBufferToLog();
+  LogCompaction();
 
-  compaction_stats_.micros = env_->NowMicros() - start_micros - imm_micros;
-  compaction_stats_.files_in_leveln =
-      static_cast<int>(compact_->compaction->num_input_files(0));
-  compaction_stats_.files_in_levelnp1 =
-      static_cast<int>(compact_->compaction->num_input_files(1));
-  MeasureTime(stats_, COMPACTION_TIME, compaction_stats_.micros);
+  const size_t num_threads = compact_->sub_compact_states.size();
+  assert(num_threads > 0);
+  const uint64_t start_micros = env_->NowMicros();
 
-  size_t num_output_files = compact_->outputs.size();
-  if (compact_->builder != nullptr) {
-    // An error occurred so ignore the last output.
-    assert(num_output_files > 0);
-    --num_output_files;
+  // Launch a thread for each of subcompactions 1...num_threads-1
+  std::vector<std::thread> thread_pool;
+  thread_pool.reserve(num_threads - 1);
+  for (size_t i = 1; i < compact_->sub_compact_states.size(); i++) {
+    thread_pool.emplace_back(&CompactionJob::ProcessKeyValueCompaction, this,
+                             &compact_->sub_compact_states[i]);
   }
-  compaction_stats_.files_out_levelnp1 = static_cast<int>(num_output_files);
 
-  for (size_t i = 0; i < compact_->compaction->num_input_files(0); i++) {
-    compaction_stats_.bytes_readn +=
-        compact_->compaction->input(0, i)->fd.GetFileSize();
-    compaction_stats_.num_input_records +=
-        static_cast<uint64_t>(compact_->compaction->input(0, i)->num_entries);
-  }
+  // Always schedule the first subcompaction (whether or not there are also
+  // others) in the current thread to be efficient with resources
+  ProcessKeyValueCompaction(&compact_->sub_compact_states[0]);
 
-  for (size_t i = 0; i < compact_->compaction->num_input_files(1); i++) {
-    compaction_stats_.bytes_readnp1 +=
-        compact_->compaction->input(1, i)->fd.GetFileSize();
+  // Wait for all other threads (if there are any) to finish execution
+  for (auto& thread : thread_pool) {
+    thread.join();
   }
 
-  for (size_t i = 0; i < num_output_files; i++) {
-    compaction_stats_.bytes_written += compact_->outputs[i].file_size;
+  if (output_directory_ && !db_options_.disableDataSync) {
+    output_directory_->Fsync();
   }
-  if (compact_->num_input_records > compact_->num_output_records) {
-    compaction_stats_.num_dropped_records +=
-        compact_->num_input_records - compact_->num_output_records;
+
+  compaction_stats_.micros = env_->NowMicros() - start_micros;
+  MeasureTime(stats_, COMPACTION_TIME, compaction_stats_.micros);
+
+  // Check if any thread encountered an error during execution
+  Status status;
+  for (const auto& state : compact_->sub_compact_states) {
+    if (!state.status.ok()) {
+      status = state.status;
+      break;
+    }
   }
 
+  // Finish up all book-keeping to unify the subcompaction results
+  AggregateStatistics();
+  UpdateCompactionStats();
   RecordCompactionIOStats();
-
   LogFlush(db_options_.info_log);
   TEST_SYNC_POINT("CompactionJob::Run():End");
+
+  compact_->status = status;
   return status;
 }
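
Run() above follows a plain fan-out/join pattern: one std::thread per subcompaction 1..n-1, subcompaction 0 on the calling thread, then a join and a scan for the first non-OK status. The same shape in isolation (Work() and the Status struct are illustrative placeholders, not the RocksDB types; assumes n >= 1):

    #include <thread>
    #include <vector>

    struct Status {             // illustrative placeholder
      bool ok = true;
    };

    void Work(Status* out) {    // stands in for ProcessKeyValueCompaction()
      out->ok = true;           // ... do one subcompaction's share ...
    }

    Status RunAll(size_t n) {
      std::vector<Status> results(n);
      std::vector<std::thread> pool;
      pool.reserve(n - 1);
      for (size_t i = 1; i < n; i++) {
        pool.emplace_back(Work, &results[i]);  // shares 1..n-1 on new threads
      }
      Work(&results[0]);                       // share 0 on the calling thread
      for (auto& t : pool) t.join();           // wait for everyone
      for (const auto& s : results) {
        if (!s.ok) return s;                   // first failing share wins
      }
      return Status{};
    }
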
 
-void CompactionJob::Install(Status* status,
-                            const MutableCFOptions& mutable_cf_options,
-                            InstrumentedMutex* db_mutex) {
+Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options,
+                              InstrumentedMutex* db_mutex) {
   AutoThreadOperationStageUpdater stage_updater(
       ThreadStatus::STAGE_COMPACTION_INSTALL);
   db_mutex->AssertHeld();
+  Status status = compact_->status;
   ColumnFamilyData* cfd = compact_->compaction->column_family_data();
   cfd->internal_stats()->AddCompactionStats(
       compact_->compaction->output_level(), compaction_stats_);
 
-  if (status->ok()) {
-    *status = InstallCompactionResults(db_mutex, mutable_cf_options);
+  if (status.ok()) {
+    status = InstallCompactionResults(mutable_cf_options, db_mutex);
   }
   VersionStorageInfo::LevelSummaryStorage tmp;
   auto vstorage = cfd->current()->storage_info();
   const auto& stats = compaction_stats_;
-  LogToBuffer(log_buffer_,
-              "[%s] compacted to: %s, MB/sec: %.1f rd, %.1f wr, level %d, "
-              "files in(%d, %d) out(%d) "
-              "MB in(%.1f, %.1f) out(%.1f), read-write-amplify(%.1f) "
-              "write-amplify(%.1f) %s, records in: %d, records dropped: %d\n",
-              cfd->GetName().c_str(), vstorage->LevelSummary(&tmp),
-              (stats.bytes_readn + stats.bytes_readnp1) /
-                  static_cast<double>(stats.micros),
-              stats.bytes_written / static_cast<double>(stats.micros),
-              compact_->compaction->output_level(), stats.files_in_leveln,
-              stats.files_in_levelnp1, stats.files_out_levelnp1,
-              stats.bytes_readn / 1048576.0, stats.bytes_readnp1 / 1048576.0,
-              stats.bytes_written / 1048576.0,
-              (stats.bytes_written + stats.bytes_readnp1 + stats.bytes_readn) /
-                  static_cast<double>(stats.bytes_readn),
-              stats.bytes_written / static_cast<double>(stats.bytes_readn),
-              status->ToString().c_str(), stats.num_input_records,
-              stats.num_dropped_records);
+  LogToBuffer(
+      log_buffer_,
+      "[%s] compacted to: %s, MB/sec: %.1f rd, %.1f wr, level %d, "
+      "files in(%d, %d) out(%d) "
+      "MB in(%.1f, %.1f) out(%.1f), read-write-amplify(%.1f) "
+      "write-amplify(%.1f) %s, records in: %d, records dropped: %d\n",
+      cfd->GetName().c_str(), vstorage->LevelSummary(&tmp),
+      (stats.bytes_read_non_output_levels + stats.bytes_read_output_level) /
+          static_cast<double>(stats.micros),
+      stats.bytes_written / static_cast<double>(stats.micros),
+      compact_->compaction->output_level(),
+      stats.num_input_files_in_non_output_levels,
+      stats.num_input_files_in_output_level,
+      stats.num_output_files,
+      stats.bytes_read_non_output_levels / 1048576.0,
+      stats.bytes_read_output_level / 1048576.0,
+      stats.bytes_written / 1048576.0,
+      (stats.bytes_written + stats.bytes_read_output_level +
+       stats.bytes_read_non_output_levels) /
+          static_cast<double>(stats.bytes_read_non_output_levels),
+      stats.bytes_written /
+          static_cast<double>(stats.bytes_read_non_output_levels),
+      status.ToString().c_str(), stats.num_input_records,
+      stats.num_dropped_records);
+
+  UpdateCompactionJobStats(stats);
 
   auto stream = event_logger_->LogToBuffer(log_buffer_);
-  stream << "job" << job_id_ << "event"
-         << "compaction_finished"
+  stream << "job" << job_id_
+         << "event" << "compaction_finished"
+         << "compaction_time_micros" << compaction_stats_.micros
          << "output_level" << compact_->compaction->output_level()
-         << "num_output_files" << compact_->outputs.size()
-         << "total_output_size" << compact_->total_bytes << "num_input_records"
-         << compact_->num_input_records << "num_output_records"
-         << compact_->num_output_records;
+         << "num_output_files" << compact_->NumOutputFiles()
+         << "total_output_size" << compact_->total_bytes
+         << "num_input_records" << compact_->num_input_records
+         << "num_output_records" << compact_->num_output_records
+         << "num_subcompactions" << compact_->sub_compact_states.size();
+
+  if (measure_io_stats_ && compaction_job_stats_ != nullptr) {
+    stream << "file_write_nanos" << compaction_job_stats_->file_write_nanos;
+    stream << "file_range_sync_nanos"
+           << compaction_job_stats_->file_range_sync_nanos;
+    stream << "file_fsync_nanos" << compaction_job_stats_->file_fsync_nanos;
+    stream << "file_prepare_write_nanos"
+           << compaction_job_stats_->file_prepare_write_nanos;
+  }
+
   stream << "lsm_state";
   stream.StartArray();
   for (int level = 0; level < vstorage->num_levels(); ++level) {
@@ -589,466 +568,243 @@ void CompactionJob::Install(Status* status,
   }
   stream.EndArray();
 
-  CleanupCompaction(*status);
+  CleanupCompaction();
+  return status;
 }
 
-Status CompactionJob::ProcessKeyValueCompaction(int64_t* imm_micros,
-                                                Iterator* input,
-                                                bool is_compaction_v2) {
+void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
+  assert(sub_compact != nullptr);
+  std::unique_ptr<Iterator> input(
+      versions_->MakeInputIterator(sub_compact->compaction));
+
   AutoThreadOperationStageUpdater stage_updater(
       ThreadStatus::STAGE_COMPACTION_PROCESS_KV);
-  size_t combined_idx = 0;
-  Status status;
-  std::string compaction_filter_value;
-  ParsedInternalKey ikey;
-  IterKey current_user_key;
-  bool has_current_user_key = false;
-  IterKey delete_key;
-  SequenceNumber last_sequence_for_key __attribute__((unused)) =
-      kMaxSequenceNumber;
-  SequenceNumber visible_in_snapshot = kMaxSequenceNumber;
-  ColumnFamilyData* cfd = compact_->compaction->column_family_data();
-  MergeHelper merge(cfd->user_comparator(), cfd->ioptions()->merge_operator,
-                    db_options_.info_log.get(),
-                    cfd->ioptions()->min_partial_merge_operands,
-                    false /* internal key corruption is expected */);
+
+  // I/O measurement variables
+  PerfLevel prev_perf_level = PerfLevel::kEnableTime;
+  const uint64_t kRecordStatsEvery = 1000;
+  uint64_t prev_write_nanos = 0;
+  uint64_t prev_fsync_nanos = 0;
+  uint64_t prev_range_sync_nanos = 0;
+  uint64_t prev_prepare_write_nanos = 0;
+  if (measure_io_stats_) {
+    prev_perf_level = GetPerfLevel();
+    SetPerfLevel(PerfLevel::kEnableTime);
+    prev_write_nanos = iostats_context.write_nanos;
+    prev_fsync_nanos = iostats_context.fsync_nanos;
+    prev_range_sync_nanos = iostats_context.range_sync_nanos;
+    prev_prepare_write_nanos = iostats_context.prepare_write_nanos;
+  }
+
+  ColumnFamilyData* cfd = sub_compact->compaction->column_family_data();
   auto compaction_filter = cfd->ioptions()->compaction_filter;
   std::unique_ptr<CompactionFilter> compaction_filter_from_factory = nullptr;
-  if (!compaction_filter) {
-    auto context = compact_->GetFilterContextV1();
+  if (compaction_filter == nullptr) {
     compaction_filter_from_factory =
-        cfd->ioptions()->compaction_filter_factory->CreateCompactionFilter(
-            context);
+        sub_compact->compaction->CreateCompactionFilter();
     compaction_filter = compaction_filter_from_factory.get();
   }
+  MergeHelper merge(
+      env_, cfd->user_comparator(), cfd->ioptions()->merge_operator,
+      compaction_filter, db_options_.info_log.get(),
+      cfd->ioptions()->min_partial_merge_operands,
+      false /* internal key corruption is expected */,
+      existing_snapshots_.empty() ? 0 : existing_snapshots_.back(),
+      compact_->compaction->level(), db_options_.statistics.get());
 
   TEST_SYNC_POINT("CompactionJob::Run():Inprogress");
 
-  int64_t key_drop_user = 0;
-  int64_t key_drop_newer_entry = 0;
-  int64_t key_drop_obsolete = 0;
-  int64_t loop_cnt = 0;
-
-  StopWatchNano timer(env_, stats_ != nullptr);
-  uint64_t total_filter_time = 0;
-  while (input->Valid() && !shutting_down_->load(std::memory_order_acquire) &&
-         !cfd->IsDropped() && status.ok()) {
-    compact_->num_input_records++;
-    if (++loop_cnt > 1000) {
-      if (key_drop_user > 0) {
-        RecordTick(stats_, COMPACTION_KEY_DROP_USER, key_drop_user);
-        key_drop_user = 0;
-      }
-      if (key_drop_newer_entry > 0) {
-        RecordTick(stats_, COMPACTION_KEY_DROP_NEWER_ENTRY,
-                   key_drop_newer_entry);
-        key_drop_newer_entry = 0;
-      }
-      if (key_drop_obsolete > 0) {
-        RecordTick(stats_, COMPACTION_KEY_DROP_OBSOLETE, key_drop_obsolete);
-        key_drop_obsolete = 0;
-      }
-      RecordCompactionIOStats();
-      loop_cnt = 0;
-    }
-    // FLUSH preempts compaction
-    // TODO(icanadi) this currently only checks if flush is necessary on
-    // compacting column family. we should also check if flush is necessary on
-    // other column families, too
-    (*imm_micros) += yield_callback_();
-
-    Slice key;
-    Slice value;
-    // If is_compaction_v2 is on, kv-pairs are reset to the prefix batch.
-    // This prefix batch should contain results after calling
-    // compaction_filter_v2.
-    //
-    // If is_compaction_v2 is off, this function will go through all the
-    // kv-pairs in input.
-    if (!is_compaction_v2) {
-      key = input->key();
-      value = input->value();
-    } else {
-      if (combined_idx >= compact_->combined_key_buf_.size()) {
+  Slice* start = sub_compact->start;
+  Slice* end = sub_compact->end;
+  if (start != nullptr) {
+    IterKey start_iter;
+    start_iter.SetInternalKey(*start, kMaxSequenceNumber, kValueTypeForSeek);
+    input->Seek(start_iter.GetKey());
+  } else {
+    input->SeekToFirst();
+  }
+
+  Status status;
+  sub_compact->c_iter.reset(new CompactionIterator(
+      input.get(), cfd->user_comparator(), &merge, versions_->LastSequence(),
+      &existing_snapshots_, env_, false, sub_compact->compaction,
+      compaction_filter));
+  auto c_iter = sub_compact->c_iter.get();
+  c_iter->SeekToFirst();
+  const auto& c_iter_stats = c_iter->iter_stats();
+  // TODO(noetzli): check whether we could check !shutting_down_->... only
+  // occasionally (see diff D42687)
+  while (status.ok() && !shutting_down_->load(std::memory_order_acquire) &&
+         !cfd->IsDropped() && c_iter->Valid()) {
+    // Invariant: c_iter.status() is guaranteed to be OK if c_iter->Valid()
+    // returns true.
+    const Slice& key = c_iter->key();
+    const Slice& value = c_iter->value();
+
+    // If an end key (exclusive) is specified, exit once the current key is
+    // >= it, since the iterator has moved past the subcompaction's range
+    if (end != nullptr &&
+        cfd->user_comparator()->Compare(c_iter->user_key(), *end) >= 0) {
+      break;
+    } else if (sub_compact->compaction->ShouldStopBefore(key) &&
+               sub_compact->builder != nullptr) {
+      status = FinishCompactionOutputFile(input->status(), sub_compact);
+      if (!status.ok()) {
         break;
       }
-      assert(combined_idx < compact_->combined_key_buf_.size());
-      key = compact_->combined_key_buf_[combined_idx];
-      value = compact_->combined_value_buf_[combined_idx];
+    }
 
-      ++combined_idx;
+    if (c_iter_stats.num_input_records % kRecordStatsEvery ==
+        kRecordStatsEvery - 1) {
+      RecordDroppedKeys(c_iter_stats, &sub_compact->compaction_job_stats);
+      c_iter->ResetRecordCounts();
+      RecordCompactionIOStats();
     }
 
-    if (compact_->compaction->ShouldStopBefore(key) &&
-        compact_->builder != nullptr) {
-      status = FinishCompactionOutputFile(input);
+    // Open output file if necessary
+    if (sub_compact->builder == nullptr) {
+      status = OpenCompactionOutputFile(sub_compact);
       if (!status.ok()) {
         break;
       }
     }
-
-    // Handle key/value, add to state, etc.
-    bool drop = false;
-    bool current_entry_is_merging = false;
-    if (!ParseInternalKey(key, &ikey)) {
-      // Do not hide error keys
-      // TODO: error key stays in db forever? Figure out the intention/rationale
-      // v10 error v8 : we cannot hide v8 even though it's pretty obvious.
-      current_user_key.Clear();
-      has_current_user_key = false;
-      last_sequence_for_key = kMaxSequenceNumber;
-      visible_in_snapshot = kMaxSequenceNumber;
-    } else {
-      if (!has_current_user_key ||
-          cfd->user_comparator()->Compare(ikey.user_key,
-                                          current_user_key.GetKey()) != 0) {
-        // First occurrence of this user key
-        current_user_key.SetKey(ikey.user_key);
-        has_current_user_key = true;
-        last_sequence_for_key = kMaxSequenceNumber;
-        visible_in_snapshot = kMaxSequenceNumber;
-        // apply the compaction filter to the first occurrence of the user key
-        if (compaction_filter && !is_compaction_v2 && ikey.type == kTypeValue &&
-            (visible_at_tip_ || ikey.sequence > latest_snapshot_)) {
-          // If the user has specified a compaction filter and the sequence
-          // number is greater than any external snapshot, then invoke the
-          // filter.
-          // If the return value of the compaction filter is true, replace
-          // the entry with a delete marker.
-          bool value_changed = false;
-          compaction_filter_value.clear();
-          if (stats_ != nullptr) {
-            timer.Start();
-          }
-          bool to_delete = compaction_filter->Filter(
-              compact_->compaction->level(), ikey.user_key, value,
-              &compaction_filter_value, &value_changed);
-          total_filter_time += timer.ElapsedNanos();
-          if (to_delete) {
-            // make a copy of the original key and convert it to a delete
-            delete_key.SetInternalKey(ExtractUserKey(key), ikey.sequence,
-                                      kTypeDeletion);
-            // anchor the key again
-            key = delete_key.GetKey();
-            // needed because ikey is backed by key
-            ParseInternalKey(key, &ikey);
-            // no value associated with delete
-            value.clear();
-            ++key_drop_user;
-          } else if (value_changed) {
-            value = compaction_filter_value;
-          }
-        }
-      }
-
-      // If there are no snapshots, then this kv affects visibility at tip.
-      // Otherwise, search through all existing snapshots to find
-      // the earliest snapshot that is affected by this kv.
-      SequenceNumber prev_snapshot = 0;  // 0 means no previous snapshot
-      SequenceNumber visible =
-          visible_at_tip_
-              ? visible_at_tip_
-              : findEarliestVisibleSnapshot(ikey.sequence, existing_snapshots_,
-                                            &prev_snapshot);
-
-      if (visible_in_snapshot == visible) {
-        // If the earliest snapshot in which this key is visible
-        // is the same as the visibility of a previous instance of the
-        // same key, then this kv is not visible in any snapshot:
-        // it is hidden by a newer entry for the same user key.
-        // TODO: why not > ?
-        assert(last_sequence_for_key >= ikey.sequence);
-        drop = true;  // (A)
-        ++key_drop_newer_entry;
-      } else if (ikey.type == kTypeDeletion &&
-                 ikey.sequence <= earliest_snapshot_ &&
-                 compact_->compaction->KeyNotExistsBeyondOutputLevel(
-                     ikey.user_key)) {
-        // For this user key:
-        // (1) there is no data in higher levels
-        // (2) data in lower levels will have larger sequence numbers
-        // (3) data in layers that are being compacted here and have
-        //     smaller sequence numbers will be dropped in the next
-        //     few iterations of this loop (by rule (A) above).
-        // Therefore this deletion marker is obsolete and can be dropped.
-        drop = true;
-        ++key_drop_obsolete;
-      } else if (ikey.type == kTypeMerge) {
-        if (!merge.HasOperator()) {
-          LogToBuffer(log_buffer_, "Options::merge_operator is null.");
-          status = Status::InvalidArgument(
-              "merge_operator is not properly initialized.");
-          break;
-        }
-        // We know the merge type entry is not hidden, otherwise we would
-        // have hit (A)
-        // We encapsulate the merge related state machine in a different
-        // object to minimize change to the existing flow. It turns out this
-        // logic can also be nicely re-used for memtable flush purge
-        // optimization in BuildTable.
-        int steps = 0;
-        merge.MergeUntil(input, prev_snapshot, bottommost_level_,
-                         db_options_.statistics.get(), &steps, env_);
-        // Skip the Merge ops
-        combined_idx = combined_idx - 1 + steps;
-
-        current_entry_is_merging = true;
-        if (merge.IsSuccess()) {
-          // Successfully found Put/Delete/(end-of-key-range) while merging
-          // Get the merge result
-          key = merge.key();
-          ParseInternalKey(key, &ikey);
-          value = merge.value();
-        } else {
-          // Did not find a Put/Delete/(end-of-key-range) while merging
-          // We now have some stack of merge operands to write out.
-          // NOTE: key,value, and ikey are now referring to old entries.
-          //       These will be correctly set below.
-          assert(!merge.keys().empty());
-          assert(merge.keys().size() == merge.values().size());
-
-          // Hack to make sure last_sequence_for_key is correct
-          ParseInternalKey(merge.keys().front(), &ikey);
-        }
-      }
-
-      last_sequence_for_key = ikey.sequence;
-      visible_in_snapshot = visible;
+    assert(sub_compact->builder != nullptr);
+    assert(sub_compact->current_output() != nullptr);
+    sub_compact->builder->Add(key, value);
+    sub_compact->current_output()->meta.UpdateBoundaries(
+        key, c_iter->ikey().sequence);
+    sub_compact->num_output_records++;
+
+    // Close output file if it is big enough
+    // TODO(aekmekji): determine if file should be closed earlier than this
+    // during subcompactions (i.e. if output size, estimated by input size, is
+    // going to be 1.2MB and max_output_file_size = 1MB, prefer to have 0.6MB
+    // and 0.6MB instead of 1MB and 0.2MB)
+    if (sub_compact->builder->FileSize() >=
+        sub_compact->compaction->max_output_file_size()) {
+      status = FinishCompactionOutputFile(input->status(), sub_compact);
     }
 
-    if (!drop) {
-      // We may write a single key (e.g.: for Put/Delete or successful merge).
-      // Or we may instead have to write a sequence/list of keys.
-      // We have to write a sequence iff we have an unsuccessful merge
-      bool has_merge_list = current_entry_is_merging && !merge.IsSuccess();
-      const std::deque<std::string>* keys = nullptr;
-      const std::deque<std::string>* values = nullptr;
-      std::deque<std::string>::const_reverse_iterator key_iter;
-      std::deque<std::string>::const_reverse_iterator value_iter;
-      if (has_merge_list) {
-        keys = &merge.keys();
-        values = &merge.values();
-        key_iter = keys->rbegin();  // The back (*rbegin()) is the first key
-        value_iter = values->rbegin();
-
-        key = Slice(*key_iter);
-        value = Slice(*value_iter);
-      }
-
-      // If we have a list of keys to write, traverse the list.
-      // If we have a single key to write, simply write that key.
-      while (true) {
-        // Invariant: key,value,ikey will always be the next entry to write
-        char* kptr = (char*)key.data();
-        std::string kstr;
-
-        // Zeroing out the sequence number leads to better compression.
-        // If this is the bottommost level (no files in lower levels)
-        // and the earliest snapshot is larger than this seqno
-        // then we can squash the seqno to zero.
-        if (bottommost_level_ && ikey.sequence < earliest_snapshot_ &&
-            ikey.type != kTypeMerge) {
-          assert(ikey.type != kTypeDeletion);
-          // make a copy because updating in place would cause problems
-          // with the priority queue that is managing the input key iterator
-          kstr.assign(key.data(), key.size());
-          kptr = (char*)kstr.c_str();
-          UpdateInternalKey(kptr, key.size(), (uint64_t)0, ikey.type);
-        }
-
-        Slice newkey(kptr, key.size());
-        assert((key.clear(), 1));  // we do not need 'key' anymore
-
-        // Open output file if necessary
-        if (compact_->builder == nullptr) {
-          status = OpenCompactionOutputFile();
-          if (!status.ok()) {
-            break;
-          }
-        }
-
-        SequenceNumber seqno = GetInternalKeySeqno(newkey);
-        if (compact_->builder->NumEntries() == 0) {
-          compact_->current_output()->smallest.DecodeFrom(newkey);
-          compact_->current_output()->smallest_seqno = seqno;
-        } else {
-          compact_->current_output()->smallest_seqno =
-              std::min(compact_->current_output()->smallest_seqno, seqno);
-        }
-        compact_->current_output()->largest.DecodeFrom(newkey);
-        compact_->builder->Add(newkey, value);
-        compact_->num_output_records++,
-            compact_->current_output()->largest_seqno =
-                std::max(compact_->current_output()->largest_seqno, seqno);
-
-        // Close output file if it is big enough
-        if (compact_->builder->FileSize() >=
-            compact_->compaction->MaxOutputFileSize()) {
-          status = FinishCompactionOutputFile(input);
-          if (!status.ok()) {
-            break;
-          }
-        }
-
-        // If we have a list of entries, move to next element
-        // If we only had one entry, then break the loop.
-        if (has_merge_list) {
-          ++key_iter;
-          ++value_iter;
-
-          // If at end of list
-          if (key_iter == keys->rend() || value_iter == values->rend()) {
-            // Sanity Check: if one ends, then both end
-            assert(key_iter == keys->rend() && value_iter == values->rend());
-            break;
-          }
-
-          // Otherwise not at end of list. Update key, value, and ikey.
-          key = Slice(*key_iter);
-          value = Slice(*value_iter);
-          ParseInternalKey(key, &ikey);
+    c_iter->Next();
+  }
 
-        } else {
-          // Only had one item to begin with (Put/Delete)
-          break;
-        }
-      }  // while (true)
-    }    // if (!drop)
+  sub_compact->num_input_records = c_iter_stats.num_input_records;
+  sub_compact->compaction_job_stats.num_input_deletion_records =
+      c_iter_stats.num_input_deletion_records;
+  sub_compact->compaction_job_stats.num_corrupt_keys =
+      c_iter_stats.num_input_corrupt_records;
+  sub_compact->compaction_job_stats.total_input_raw_key_bytes +=
+      c_iter_stats.total_input_raw_key_bytes;
+  sub_compact->compaction_job_stats.total_input_raw_value_bytes +=
+      c_iter_stats.total_input_raw_value_bytes;
+
+  RecordTick(stats_, FILTER_OPERATION_TOTAL_TIME,
+             c_iter_stats.total_filter_time);
+  RecordDroppedKeys(c_iter_stats, &sub_compact->compaction_job_stats);
+  RecordCompactionIOStats();
 
-    // MergeUntil has moved input to the next entry
-    if (!current_entry_is_merging) {
-      input->Next();
-    }
+  if (status.ok() &&
+      (shutting_down_->load(std::memory_order_acquire) || cfd->IsDropped())) {
+    status = Status::ShutdownInProgress(
+        "Database shutdown or Column family drop during compaction");
   }
-  RecordTick(stats_, FILTER_OPERATION_TOTAL_TIME, total_filter_time);
-  if (key_drop_user > 0) {
-    RecordTick(stats_, COMPACTION_KEY_DROP_USER, key_drop_user);
+  if (status.ok() && sub_compact->builder != nullptr) {
+    status = FinishCompactionOutputFile(input->status(), sub_compact);
   }
-  if (key_drop_newer_entry > 0) {
-    RecordTick(stats_, COMPACTION_KEY_DROP_NEWER_ENTRY, key_drop_newer_entry);
+  if (status.ok()) {
+    status = input->status();
   }
-  if (key_drop_obsolete > 0) {
-    RecordTick(stats_, COMPACTION_KEY_DROP_OBSOLETE, key_drop_obsolete);
+
+  if (measure_io_stats_) {
+    sub_compact->compaction_job_stats.file_write_nanos +=
+        iostats_context.write_nanos - prev_write_nanos;
+    sub_compact->compaction_job_stats.file_fsync_nanos +=
+        iostats_context.fsync_nanos - prev_fsync_nanos;
+    sub_compact->compaction_job_stats.file_range_sync_nanos +=
+        iostats_context.range_sync_nanos - prev_range_sync_nanos;
+    sub_compact->compaction_job_stats.file_prepare_write_nanos +=
+        iostats_context.prepare_write_nanos - prev_prepare_write_nanos;
+    if (prev_perf_level != PerfLevel::kEnableTime) {
+      SetPerfLevel(prev_perf_level);
+    }
   }
-  RecordCompactionIOStats();
 
-  return status;
+  sub_compact->c_iter.reset();
+  input.reset();
+  sub_compact->status = status;
 }
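
Stripped of the stats and error plumbing, the per-subcompaction loop above reduces to: seek to the share's start key, stop once the user key reaches the exclusive end boundary, and rotate to a new output file whenever the current one crosses the size limit. A self-contained sketch of that control flow over an ordinary sorted map, with byte counts standing in for the table builder's file size (a trailing empty output is possible here; the real code only opens files on demand):

    #include <cstdint>
    #include <map>
    #include <string>
    #include <vector>

    using File = std::vector<std::string>;  // one output "file": just its keys

    std::vector<File> CompactRange(const std::map<std::string, std::string>& in,
                                   const std::string& start,
                                   const std::string& end,  // exclusive
                                   uint64_t max_file_size) {
      std::vector<File> outputs(1);
      uint64_t cur_size = 0;
      for (auto it = in.lower_bound(start);
           it != in.end() && it->first < end; ++it) {
        outputs.back().push_back(it->first);
        cur_size += it->first.size() + it->second.size();
        if (cur_size >= max_file_size) {  // output is big enough: rotate
          outputs.emplace_back();
          cur_size = 0;
        }
      }
      return outputs;
    }
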
 
-void CompactionJob::CallCompactionFilterV2(
-    CompactionFilterV2* compaction_filter_v2, uint64_t* time) {
-  if (compact_ == nullptr || compaction_filter_v2 == nullptr) {
-    return;
-  }
-  AutoThreadOperationStageUpdater stage_updater(
-      ThreadStatus::STAGE_COMPACTION_FILTER_V2);
-
-  // Assemble slice vectors for user keys and existing values.
-  // We also keep track of our parsed internal key structs because
-  // we may need to access the sequence number in the event that
-  // keys are garbage collected during the filter process.
-  std::vector<ParsedInternalKey> ikey_buf;
-  std::vector<Slice> user_key_buf;
-  std::vector<Slice> existing_value_buf;
-
-  for (const auto& key : compact_->key_str_buf_) {
-    ParsedInternalKey ikey;
-    ParseInternalKey(Slice(key), &ikey);
-    ikey_buf.emplace_back(ikey);
-    user_key_buf.emplace_back(ikey.user_key);
+void CompactionJob::RecordDroppedKeys(
+    const CompactionIteratorStats& c_iter_stats,
+    CompactionJobStats* compaction_job_stats) {
+  if (c_iter_stats.num_record_drop_user > 0) {
+    RecordTick(stats_, COMPACTION_KEY_DROP_USER,
+               c_iter_stats.num_record_drop_user);
   }
-  for (const auto& value : compact_->existing_value_str_buf_) {
-    existing_value_buf.emplace_back(Slice(value));
+  if (c_iter_stats.num_record_drop_hidden > 0) {
+    RecordTick(stats_, COMPACTION_KEY_DROP_NEWER_ENTRY,
+               c_iter_stats.num_record_drop_hidden);
+    if (compaction_job_stats) {
+      compaction_job_stats->num_records_replaced +=
+          c_iter_stats.num_record_drop_hidden;
+    }
   }
-
-  // If the user has specified a compaction filter and the sequence
-  // number is greater than any external snapshot, then invoke the
-  // filter.
-  // If the return value of the compaction filter is true, replace
-  // the entry with a delete marker.
-  StopWatchNano timer(env_, stats_ != nullptr);
-  compact_->to_delete_buf_ = compaction_filter_v2->Filter(
-      compact_->compaction->level(), user_key_buf, existing_value_buf,
-      &compact_->new_value_buf_, &compact_->value_changed_buf_);
-  *time = timer.ElapsedNanos();
-  // new_value_buf_.size() <= to_delete_buf_.size(). "=" iff all
-  // kv-pairs in this compaction run need to be deleted.
-  assert(compact_->to_delete_buf_.size() == compact_->key_str_buf_.size());
-  assert(compact_->to_delete_buf_.size() ==
-         compact_->existing_value_str_buf_.size());
-  assert(compact_->value_changed_buf_.empty() ||
-         compact_->to_delete_buf_.size() ==
-         compact_->value_changed_buf_.size());
-
-  int new_value_idx = 0;
-  for (unsigned int i = 0; i < compact_->to_delete_buf_.size(); ++i) {
-    if (compact_->to_delete_buf_[i]) {
-      // update the string buffer directly
-      // the Slice buffer points to the updated buffer
-      UpdateInternalKey(&compact_->key_str_buf_[i][0],
-                        compact_->key_str_buf_[i].size(), ikey_buf[i].sequence,
-                        kTypeDeletion);
-
-      // no value associated with delete
-      compact_->existing_value_str_buf_[i].clear();
-      RecordTick(stats_, COMPACTION_KEY_DROP_USER);
-    } else if (!compact_->value_changed_buf_.empty() &&
-        compact_->value_changed_buf_[i]) {
-      compact_->existing_value_str_buf_[i] =
-          compact_->new_value_buf_[new_value_idx++];
+  if (c_iter_stats.num_record_drop_obsolete > 0) {
+    RecordTick(stats_, COMPACTION_KEY_DROP_OBSOLETE,
+               c_iter_stats.num_record_drop_obsolete);
+    if (compaction_job_stats) {
+      compaction_job_stats->num_expired_deletion_records +=
+          c_iter_stats.num_record_drop_obsolete;
     }
-  }  // for
+  }
 }
 
-Status CompactionJob::FinishCompactionOutputFile(Iterator* input) {
+Status CompactionJob::FinishCompactionOutputFile(
+    const Status& input_status, SubcompactionState* sub_compact) {
   AutoThreadOperationStageUpdater stage_updater(
       ThreadStatus::STAGE_COMPACTION_SYNC_FILE);
-  assert(compact_ != nullptr);
-  assert(compact_->outfile);
-  assert(compact_->builder != nullptr);
+  assert(sub_compact != nullptr);
+  assert(sub_compact->outfile);
+  assert(sub_compact->builder != nullptr);
+  assert(sub_compact->current_output() != nullptr);
 
-  const uint64_t output_number = compact_->current_output()->number;
-  const uint32_t output_path_id = compact_->current_output()->path_id;
+  uint64_t output_number = sub_compact->current_output()->meta.fd.GetNumber();
   assert(output_number != 0);
 
   TableProperties table_properties;
   // Check for iterator errors
-  Status s = input->status();
-  const uint64_t current_entries = compact_->builder->NumEntries();
+  Status s = input_status;
+  auto meta = &sub_compact->current_output()->meta;
+  const uint64_t current_entries = sub_compact->builder->NumEntries();
+  meta->marked_for_compaction = sub_compact->builder->NeedCompact();
   if (s.ok()) {
-    s = compact_->builder->Finish();
+    s = sub_compact->builder->Finish();
   } else {
-    compact_->builder->Abandon();
+    sub_compact->builder->Abandon();
   }
-  if (s.ok()) {
-    table_properties = compact_->builder->GetTableProperties();
-  }
-  const uint64_t current_bytes = compact_->builder->FileSize();
-  compact_->current_output()->file_size = current_bytes;
-  compact_->total_bytes += current_bytes;
-  compact_->builder.reset();
+  const uint64_t current_bytes = sub_compact->builder->FileSize();
+  meta->fd.file_size = current_bytes;
+  sub_compact->current_output()->finished = true;
+  sub_compact->total_bytes += current_bytes;
 
   // Finish and check for file errors
   if (s.ok() && !db_options_.disableDataSync) {
-    if (db_options_.use_fsync) {
-      StopWatch sw(env_, stats_, COMPACTION_OUTFILE_SYNC_MICROS);
-      s = compact_->outfile->Fsync();
-    } else {
-      StopWatch sw(env_, stats_, COMPACTION_OUTFILE_SYNC_MICROS);
-      s = compact_->outfile->Sync();
-    }
+    StopWatch sw(env_, stats_, COMPACTION_OUTFILE_SYNC_MICROS);
+    s = sub_compact->outfile->Sync(db_options_.use_fsync);
   }
   if (s.ok()) {
-    s = compact_->outfile->Close();
+    s = sub_compact->outfile->Close();
   }
-  compact_->outfile.reset();
+  sub_compact->outfile.reset();
 
   if (s.ok() && current_entries > 0) {
     // Verify that the table is usable
-    ColumnFamilyData* cfd = compact_->compaction->column_family_data();
-    FileDescriptor fd(output_number, output_path_id, current_bytes);
+    ColumnFamilyData* cfd = sub_compact->compaction->column_family_data();
     Iterator* iter = cfd->table_cache()->NewIterator(
-        ReadOptions(), env_options_, cfd->internal_comparator(), fd);
+        ReadOptions(), env_options_, cfd->internal_comparator(), meta->fd,
+        nullptr, cfd->internal_stats()->GetFileReadHist(
+                     compact_->compaction->output_level()),
+        false);
     s = iter->status();
 
     if (s.ok() && paranoid_file_checks_) {
@@ -1058,21 +814,30 @@ Status CompactionJob::FinishCompactionOutputFile(Iterator* input) {
 
     delete iter;
     if (s.ok()) {
+      TableFileCreationInfo info(sub_compact->builder->GetTableProperties());
+      info.db_name = dbname_;
+      info.cf_name = cfd->GetName();
+      info.file_path =
+          TableFileName(cfd->ioptions()->db_paths, meta->fd.GetNumber(),
+                        meta->fd.GetPathId());
+      info.file_size = meta->fd.GetFileSize();
+      info.job_id = job_id_;
       Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
           "[%s] [JOB %d] Generated table #%" PRIu64 ": %" PRIu64
-          " keys, %" PRIu64 " bytes",
+          " keys, %" PRIu64 " bytes%s",
           cfd->GetName().c_str(), job_id_, output_number, current_entries,
-          current_bytes);
-      EventLoggerHelpers::LogTableFileCreation(event_logger_, job_id_,
-                                               output_number, current_bytes,
-                                               table_properties);
+          current_bytes,
+          meta->marked_for_compaction ? " (need compaction)" : "");
+      EventHelpers::LogAndNotifyTableFileCreation(
+          event_logger_, cfd->ioptions()->listeners, meta->fd, info);
     }
   }
+  sub_compact->builder.reset();
   return s;
 }
 
 Status CompactionJob::InstallCompactionResults(
-    InstrumentedMutex* db_mutex, const MutableCFOptions& mutable_cf_options) {
+    const MutableCFOptions& mutable_cf_options, InstrumentedMutex* db_mutex) {
   db_mutex->AssertHeld();
 
   auto* compaction = compact_->compaction;
@@ -1100,46 +865,17 @@ Status CompactionJob::InstallCompactionResults(
 
   // Add compaction outputs
   compaction->AddInputDeletions(compact_->compaction->edit());
-  for (size_t i = 0; i < compact_->outputs.size(); i++) {
-    const CompactionState::Output& out = compact_->outputs[i];
-    compaction->edit()->AddFile(
-        compaction->output_level(), out.number, out.path_id, out.file_size,
-        out.smallest, out.largest, out.smallest_seqno, out.largest_seqno);
+
+  for (const auto& sub_compact : compact_->sub_compact_states) {
+    for (const auto& out : sub_compact.outputs) {
+      compaction->edit()->AddFile(compaction->output_level(), out.meta);
+    }
   }
   return versions_->LogAndApply(compaction->column_family_data(),
                                 mutable_cf_options, compaction->edit(),
                                 db_mutex, db_directory_);
 }
 
-// Given a sequence number, return the sequence number of the
-// earliest snapshot that this sequence number is visible in.
-// The snapshots themselves are arranged in ascending order of
-// sequence numbers.
-// Employ a sequential search because the total number of
-// snapshots is typically small.
-inline SequenceNumber CompactionJob::findEarliestVisibleSnapshot(
-    SequenceNumber in, const std::vector<SequenceNumber>& snapshots,
-    SequenceNumber* prev_snapshot) {
-  assert(snapshots.size());
-  SequenceNumber prev __attribute__((unused)) = 0;
-  for (const auto cur : snapshots) {
-    assert(prev <= cur);
-    if (cur >= in) {
-      *prev_snapshot = prev;
-      return cur;
-    }
-    prev = cur;  // assignment
-    assert(prev);
-  }
-  Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log,
-      "CompactionJob is not able to find snapshot"
-      " with SeqId later than %" PRIu64
-      ": current MaxSeqId is %" PRIu64 "",
-      in, snapshots[snapshots.size() - 1]);
-  assert(0);
-  return 0;
-}
-
 void CompactionJob::RecordCompactionIOStats() {
   RecordTick(stats_, COMPACT_READ_BYTES, IOSTATS(bytes_read));
   ThreadStatusUtil::IncreaseThreadOperationProperty(
@@ -1151,76 +887,211 @@ void CompactionJob::RecordCompactionIOStats() {
   IOSTATS_RESET(bytes_written);
 }
 
-Status CompactionJob::OpenCompactionOutputFile() {
-  assert(compact_ != nullptr);
-  assert(compact_->builder == nullptr);
+Status CompactionJob::OpenCompactionOutputFile(
+    SubcompactionState* sub_compact) {
+  assert(sub_compact != nullptr);
+  assert(sub_compact->builder == nullptr);
   // no need to lock because VersionSet::next_file_number_ is atomic
   uint64_t file_number = versions_->NewFileNumber();
   // Make the output file
+  unique_ptr<WritableFile> writable_file;
   std::string fname = TableFileName(db_options_.db_paths, file_number,
-                                    compact_->compaction->GetOutputPathId());
-  Status s = env_->NewWritableFile(fname, &compact_->outfile, env_options_);
-
+                                    sub_compact->compaction->output_path_id());
+  Status s = env_->NewWritableFile(fname, &writable_file, env_options_);
   if (!s.ok()) {
     Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log,
         "[%s] [JOB %d] OpenCompactionOutputFiles for table #%" PRIu64
         " fails at NewWritableFile with status %s",
-        compact_->compaction->column_family_data()->GetName().c_str(), job_id_,
-        file_number, s.ToString().c_str());
+        sub_compact->compaction->column_family_data()->GetName().c_str(),
+        job_id_, file_number, s.ToString().c_str());
     LogFlush(db_options_.info_log);
     return s;
   }
-  CompactionState::Output out;
-  out.number = file_number;
-  out.path_id = compact_->compaction->GetOutputPathId();
-  out.smallest.Clear();
-  out.largest.Clear();
-  out.smallest_seqno = out.largest_seqno = 0;
-
-  compact_->outputs.push_back(out);
-  compact_->outfile->SetIOPriority(Env::IO_LOW);
-  compact_->outfile->SetPreallocationBlockSize(
-      static_cast<size_t>(compact_->compaction->OutputFilePreallocationSize()));
-
-  ColumnFamilyData* cfd = compact_->compaction->column_family_data();
-  bool skip_filters = false;
-
+  SubcompactionState::Output out;
+  out.meta.fd =
+      FileDescriptor(file_number, sub_compact->compaction->output_path_id(), 0);
+  out.finished = false;
+
+  sub_compact->outputs.push_back(out);
+  writable_file->SetIOPriority(Env::IO_LOW);
+  writable_file->SetPreallocationBlockSize(static_cast<size_t>(
+      sub_compact->compaction->OutputFilePreallocationSize()));
+  sub_compact->outfile.reset(
+      new WritableFileWriter(std::move(writable_file), env_options_));
+
+  ColumnFamilyData* cfd = sub_compact->compaction->column_family_data();
   // If the column family's flag is to optimize filters only for hits,
   // we can skip creating filters when this is the bottommost level, where
   // the data is going to be found
-  //
-  if (cfd->ioptions()->optimize_filters_for_hits && bottommost_level_) {
-    skip_filters = true;
-  }
-
-  compact_->builder.reset(NewTableBuilder(
+  bool skip_filters =
+      cfd->ioptions()->optimize_filters_for_hits && bottommost_level_;
+  sub_compact->builder.reset(NewTableBuilder(
       *cfd->ioptions(), cfd->internal_comparator(),
-      cfd->int_tbl_prop_collector_factories(), compact_->outfile.get(),
-      compact_->compaction->OutputCompressionType(),
+      cfd->int_tbl_prop_collector_factories(), sub_compact->outfile.get(),
+      sub_compact->compaction->output_compression(),
       cfd->ioptions()->compression_opts, skip_filters));
   LogFlush(db_options_.info_log);
   return s;
 }
 
-void CompactionJob::CleanupCompaction(const Status& status) {
-  if (compact_->builder != nullptr) {
-    // May happen if we get a shutdown call in the middle of compaction
-    compact_->builder->Abandon();
-    compact_->builder.reset();
-  } else {
-    assert(!status.ok() || compact_->outfile == nullptr);
-  }
-  for (size_t i = 0; i < compact_->outputs.size(); i++) {
-    const CompactionState::Output& out = compact_->outputs[i];
+void CompactionJob::CleanupCompaction() {
+  for (SubcompactionState& sub_compact : compact_->sub_compact_states) {
+    const auto& sub_status = sub_compact.status;
 
-    // If this file was inserted into the table cache then remove
-    // it here because this compaction was not committed.
-    if (!status.ok()) {
-      TableCache::Evict(table_cache_.get(), out.number);
+    if (sub_compact.builder != nullptr) {
+      // May happen if we get a shutdown call in the middle of compaction
+      sub_compact.builder->Abandon();
+      sub_compact.builder.reset();
+    } else {
+      assert(!sub_status.ok() || sub_compact.outfile == nullptr);
+    }
+    for (const auto& out : sub_compact.outputs) {
+      // If this file was inserted into the table cache then remove
+      // it here because this compaction was not committed.
+      if (!sub_status.ok()) {
+        TableCache::Evict(table_cache_.get(), out.meta.fd.GetNumber());
+      }
     }
   }
   delete compact_;
   compact_ = nullptr;
 }
 
+#ifndef ROCKSDB_LITE
+namespace {
+void CopyPrefix(
+    const Slice& src, size_t prefix_length, std::string* dst) {
+  assert(prefix_length > 0);
+  size_t length = src.size() > prefix_length ? prefix_length : src.size();
+  dst->assign(src.data(), length);
+}
+}  // namespace
+
+#endif  // !ROCKSDB_LITE
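
CopyPrefix() above simply truncates a key to at most prefix_length bytes; UpdateCompactionJobStats() uses it below to record bounded-length smallest/largest output key prefixes in the job stats. For example (key values illustrative):

    std::string dst;
    CopyPrefix(Slice("user0042xyz"), 8, &dst);  // dst == "user0042"
    CopyPrefix(Slice("abc"), 8, &dst);          // dst == "abc" (input shorter
                                                // than the prefix length)
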
+
+void CompactionJob::UpdateCompactionStats() {
+  Compaction* compaction = compact_->compaction;
+  compaction_stats_.num_input_files_in_non_output_levels = 0;
+  compaction_stats_.num_input_files_in_output_level = 0;
+  for (int input_level = 0;
+       input_level < static_cast<int>(compaction->num_input_levels());
+       ++input_level) {
+    if (compaction->start_level() + input_level
+        != compaction->output_level()) {
+      UpdateCompactionInputStatsHelper(
+          &compaction_stats_.num_input_files_in_non_output_levels,
+          &compaction_stats_.bytes_read_non_output_levels,
+          input_level);
+    } else {
+      UpdateCompactionInputStatsHelper(
+          &compaction_stats_.num_input_files_in_output_level,
+          &compaction_stats_.bytes_read_output_level,
+          input_level);
+    }
+  }
+
+  for (const auto& sub_compact : compact_->sub_compact_states) {
+    size_t num_output_files = sub_compact.outputs.size();
+    if (sub_compact.builder != nullptr) {
+      // An error occurred so ignore the last output.
+      assert(num_output_files > 0);
+      --num_output_files;
+    }
+    compaction_stats_.num_output_files += static_cast<int>(num_output_files);
+
+    for (const auto& out : sub_compact.outputs) {
+      compaction_stats_.bytes_written += out.meta.fd.file_size;
+    }
+    if (sub_compact.num_input_records > sub_compact.num_output_records) {
+      compaction_stats_.num_dropped_records +=
+          sub_compact.num_input_records - sub_compact.num_output_records;
+    }
+  }
+}
+
+void CompactionJob::UpdateCompactionInputStatsHelper(
+    int* num_files, uint64_t* bytes_read, int input_level) {
+  const Compaction* compaction = compact_->compaction;
+  auto num_input_files = compaction->num_input_files(input_level);
+  *num_files += static_cast<int>(num_input_files);
+
+  for (size_t i = 0; i < num_input_files; ++i) {
+    const auto* file_meta = compaction->input(input_level, i);
+    *bytes_read += file_meta->fd.GetFileSize();
+    compaction_stats_.num_input_records +=
+        static_cast<uint64_t>(file_meta->num_entries);
+  }
+}
+
+void CompactionJob::UpdateCompactionJobStats(
+    const InternalStats::CompactionStats& stats) const {
+#ifndef ROCKSDB_LITE
+  if (compaction_job_stats_) {
+    compaction_job_stats_->elapsed_micros = stats.micros;
+
+    // input information
+    compaction_job_stats_->total_input_bytes =
+        stats.bytes_read_non_output_levels +
+        stats.bytes_read_output_level;
+    compaction_job_stats_->num_input_records =
+        compact_->num_input_records;
+    compaction_job_stats_->num_input_files =
+        stats.num_input_files_in_non_output_levels +
+        stats.num_input_files_in_output_level;
+    compaction_job_stats_->num_input_files_at_output_level =
+        stats.num_input_files_in_output_level;
+
+    // output information
+    compaction_job_stats_->total_output_bytes = stats.bytes_written;
+    compaction_job_stats_->num_output_records =
+        compact_->num_output_records;
+    compaction_job_stats_->num_output_files = stats.num_output_files;
+
+    if (compact_->NumOutputFiles() > 0U) {
+      CopyPrefix(
+          compact_->SmallestUserKey(),
+          CompactionJobStats::kMaxPrefixLength,
+          &compaction_job_stats_->smallest_output_key_prefix);
+      CopyPrefix(
+          compact_->LargestUserKey(),
+          CompactionJobStats::kMaxPrefixLength,
+          &compaction_job_stats_->largest_output_key_prefix);
+    }
+  }
+#endif  // !ROCKSDB_LITE
+}
+
+void CompactionJob::LogCompaction() {
+  Compaction* compaction = compact_->compaction;
+  ColumnFamilyData* cfd = compaction->column_family_data();
+
+  // Let's check if anything will get logged. Don't prepare all the info if
+  // we're not logging
+  if (db_options_.info_log_level <= InfoLogLevel::INFO_LEVEL) {
+    Compaction::InputLevelSummaryBuffer inputs_summary;
+    Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
+        "[%s] [JOB %d] Compacting %s, score %.2f", cfd->GetName().c_str(),
+        job_id_, compaction->InputLevelSummary(&inputs_summary),
+        compaction->score());
+    char scratch[2345];
+    compaction->Summary(scratch, sizeof(scratch));
+    Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
+        "[%s] Compaction start summary: %s\n", cfd->GetName().c_str(), scratch);
+    // build event logger report
+    auto stream = event_logger_->Log();
+    stream << "job" << job_id_ << "event"
+           << "compaction_started";
+    for (size_t i = 0; i < compaction->num_input_levels(); ++i) {
+      stream << ("files_L" + ToString(compaction->level(i)));
+      stream.StartArray();
+      for (auto f : *compaction->inputs(i)) {
+        stream << f->fd.GetNumber();
+      }
+      stream.EndArray();
+    }
+    stream << "score" << compaction->score() << "input_data_size"
+           << compaction->CalculateTotalInputSize();
+  }
+}
+
 }  // namespace rocksdb
diff --git a/src/rocksdb/db/compaction_job.h b/src/rocksdb/db/compaction_job.h
index d34e4bd..1054fec 100644
--- a/src/rocksdb/db/compaction_job.h
+++ b/src/rocksdb/db/compaction_job.h
@@ -10,34 +10,36 @@
 
 #include <atomic>
 #include <deque>
+#include <functional>
 #include <limits>
 #include <set>
+#include <string>
 #include <utility>
 #include <vector>
-#include <string>
-#include <functional>
 
+#include "db/column_family.h"
+#include "db/compaction_iterator.h"
 #include "db/dbformat.h"
+#include "db/flush_scheduler.h"
+#include "db/internal_stats.h"
+#include "db/job_context.h"
 #include "db/log_writer.h"
-#include "db/column_family.h"
-#include "db/version_edit.h"
 #include "db/memtable_list.h"
+#include "db/version_edit.h"
+#include "db/write_controller.h"
+#include "db/write_thread.h"
 #include "port/port.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/compaction_job_stats.h"
 #include "rocksdb/db.h"
 #include "rocksdb/env.h"
 #include "rocksdb/memtablerep.h"
-#include "rocksdb/compaction_filter.h"
 #include "rocksdb/transaction_log.h"
 #include "util/autovector.h"
 #include "util/event_logger.h"
+#include "util/scoped_arena_iterator.h"
 #include "util/stop_watch.h"
 #include "util/thread_local.h"
-#include "util/scoped_arena_iterator.h"
-#include "db/internal_stats.h"
-#include "db/write_controller.h"
-#include "db/flush_scheduler.h"
-#include "db/write_thread.h"
-#include "db/job_context.h"
 
 namespace rocksdb {
 
@@ -56,9 +58,10 @@ class CompactionJob {
                 Directory* db_directory, Directory* output_directory,
                 Statistics* stats,
                 std::vector<SequenceNumber> existing_snapshots,
-                std::shared_ptr<Cache> table_cache,
-                std::function<uint64_t()> yield_callback,
-                EventLogger* event_logger, bool paranoid_file_checks);
+                std::shared_ptr<Cache> table_cache, EventLogger* event_logger,
+                bool paranoid_file_checks, bool measure_io_stats,
+                const std::string& dbname,
+                CompactionJobStats* compaction_job_stats);
 
   ~CompactionJob();
 
@@ -71,46 +74,52 @@ class CompactionJob {
   void Prepare();
   // REQUIRED mutex not held
   Status Run();
+
   // REQUIRED: mutex held
-  // status is the return of Run()
-  void Install(Status* status, const MutableCFOptions& mutable_cf_options,
-               InstrumentedMutex* db_mutex);
+  Status Install(const MutableCFOptions& mutable_cf_options,
+                 InstrumentedMutex* db_mutex);
 
  private:
+  struct SubcompactionState;
+
+  void AggregateStatistics();
+  void GenSubcompactionBoundaries();
+
   // update the thread status for starting a compaction.
   void ReportStartedCompaction(Compaction* compaction);
   void AllocateCompactionOutputFileNumbers();
-  // Call compaction filter if is_compaction_v2 is not true. Then iterate
-  // through input and compact the kv-pairs
-  Status ProcessKeyValueCompaction(int64_t* imm_micros, Iterator* input,
-                                   bool is_compaction_v2);
-  // Call compaction_filter_v2->Filter() on kv-pairs in compact
-  void CallCompactionFilterV2(CompactionFilterV2* compaction_filter_v2,
-                              uint64_t* time);
-  Status FinishCompactionOutputFile(Iterator* input);
-  Status InstallCompactionResults(InstrumentedMutex* db_mutex,
-                                  const MutableCFOptions& mutable_cf_options);
-  SequenceNumber findEarliestVisibleSnapshot(
-      SequenceNumber in, const std::vector<SequenceNumber>& snapshots,
-      SequenceNumber* prev_snapshot);
+  // Call compaction filter. Then iterate through input and compact the
+  // kv-pairs
+  void ProcessKeyValueCompaction(SubcompactionState* sub_compact);
+
+  Status FinishCompactionOutputFile(const Status& input_status,
+                                    SubcompactionState* sub_compact);
+  Status InstallCompactionResults(const MutableCFOptions& mutable_cf_options,
+                                  InstrumentedMutex* db_mutex);
   void RecordCompactionIOStats();
-  Status OpenCompactionOutputFile();
-  void CleanupCompaction(const Status& status);
+  Status OpenCompactionOutputFile(SubcompactionState* sub_compact);
+  void CleanupCompaction();
+  void UpdateCompactionJobStats(
+    const InternalStats::CompactionStats& stats) const;
+  void RecordDroppedKeys(const CompactionIteratorStats& c_iter_stats,
+                         CompactionJobStats* compaction_job_stats = nullptr);
+
+  void UpdateCompactionStats();
+  void UpdateCompactionInputStatsHelper(
+      int* num_files, uint64_t* bytes_read, int input_level);
+
+  void LogCompaction();
 
   int job_id_;
 
   // CompactionJob state
   struct CompactionState;
   CompactionState* compact_;
-
-  bool bottommost_level_;
-  SequenceNumber earliest_snapshot_;
-  SequenceNumber visible_at_tip_;
-  SequenceNumber latest_snapshot_;
-
+  CompactionJobStats* compaction_job_stats_;
   InternalStats::CompactionStats compaction_stats_;
 
   // DBImpl state
+  const std::string& dbname_;
   const DBOptions& db_options_;
   const EnvOptions& env_options_;
   Env* env_;
@@ -127,12 +136,15 @@ class CompactionJob {
   std::vector<SequenceNumber> existing_snapshots_;
   std::shared_ptr<Cache> table_cache_;
 
-  // yield callback
-  std::function<uint64_t()> yield_callback_;
-
   EventLogger* event_logger_;
 
+  bool bottommost_level_;
   bool paranoid_file_checks_;
+  bool measure_io_stats_;
+  // Stores the Slices that designate the boundaries for each subcompaction
+  std::vector<Slice> boundaries_;
+  // Stores the approx size of keys covered in the range of each subcompaction
+  std::vector<uint64_t> sizes_;
 };
 
 }  // namespace rocksdb
diff --git a/src/rocksdb/db/compaction_job_stats_test.cc b/src/rocksdb/db/compaction_job_stats_test.cc
new file mode 100644
index 0000000..8641c8a
--- /dev/null
+++ b/src/rocksdb/db/compaction_job_stats_test.cc
@@ -0,0 +1,1045 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include <inttypes.h>
+#include <algorithm>
+#include <iostream>
+#include <mutex>
+#include <queue>
+#include <set>
+#include <thread>
+#include <unordered_set>
+#include <utility>
+
+#include "db/db_impl.h"
+#include "db/dbformat.h"
+#include "db/filename.h"
+#include "db/job_context.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "port/stack_trace.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/experimental.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/options.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+#include "rocksdb/thread_status.h"
+#include "rocksdb/utilities/checkpoint.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "table/block_based_table_factory.h"
+#include "table/mock_table.h"
+#include "table/plain_table_factory.h"
+#include "util/compression.h"
+#include "util/hash.h"
+#include "util/hash_linklist_rep.h"
+#include "util/logging.h"
+#include "util/mock_env.h"
+#include "util/mutexlock.h"
+#include "util/rate_limiter.h"
+#include "util/scoped_arena_iterator.h"
+#include "util/statistics.h"
+#include "util/string_util.h"
+#include "util/sync_point.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+#include "util/thread_status_util.h"
+#include "util/xfunc.h"
+#include "utilities/merge_operators.h"
+
+#if !defined(IOS_CROSS_COMPILE) && (!defined(NDEBUG) || !defined(OS_WIN))
+#ifndef ROCKSDB_LITE
+namespace rocksdb {
+
+static std::string RandomString(Random* rnd, int len, double ratio) {
+  std::string r;
+  test::CompressibleString(rnd, ratio, len, &r);
+  return r;
+}
+
+std::string Key(uint64_t key, int length) {
+  const int kBufSize = 1000;
+  char buf[kBufSize];
+  if (length > kBufSize) {
+    length = kBufSize;
+  }
+  snprintf(buf, kBufSize, "%0*" PRIu64, length, key);
+  return std::string(buf);
+}
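
Key() formats a numeric key as a zero-padded decimal string: the "%0*" PRIu64 format takes the field width as an argument, so equal-width keys sort lexicographically in numeric order. For instance:

    Key(42, 10);     // "0000000042"
    Key(42, 4);      // "0042"
    Key(123456, 4);  // "123456" (the width is a minimum, not a maximum)
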
+
+class CompactionJobStatsTest : public testing::Test,
+                               public testing::WithParamInterface<bool> {
+ public:
+  std::string dbname_;
+  std::string alternative_wal_dir_;
+  Env* env_;
+  DB* db_;
+  std::vector<ColumnFamilyHandle*> handles_;
+  uint32_t max_subcompactions_;
+
+  Options last_options_;
+
+  CompactionJobStatsTest() : env_(Env::Default()) {
+    env_->SetBackgroundThreads(1, Env::LOW);
+    env_->SetBackgroundThreads(1, Env::HIGH);
+    dbname_ = test::TmpDir(env_) + "/compaction_job_stats_test";
+    alternative_wal_dir_ = dbname_ + "/wal";
+    Options options;
+    options.create_if_missing = true;
+    max_subcompactions_ = GetParam();
+    options.max_subcompactions = max_subcompactions_;
+    auto delete_options = options;
+    delete_options.wal_dir = alternative_wal_dir_;
+    EXPECT_OK(DestroyDB(dbname_, delete_options));
+    // Destroy it again in case the alternative WAL dir was not used.
+    EXPECT_OK(DestroyDB(dbname_, options));
+    db_ = nullptr;
+    Reopen(options);
+  }
+
+  ~CompactionJobStatsTest() {
+    rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+    rocksdb::SyncPoint::GetInstance()->LoadDependency({});
+    rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks();
+    Close();
+    Options options;
+    options.db_paths.emplace_back(dbname_, 0);
+    options.db_paths.emplace_back(dbname_ + "_2", 0);
+    options.db_paths.emplace_back(dbname_ + "_3", 0);
+    options.db_paths.emplace_back(dbname_ + "_4", 0);
+    EXPECT_OK(DestroyDB(dbname_, options));
+  }
+
+  // Required if inheriting from testing::WithParamInterface<>
+  static void SetUpTestCase() {}
+  static void TearDownTestCase() {}
+
+  DBImpl* dbfull() {
+    return reinterpret_cast<DBImpl*>(db_);
+  }
+
+  void CreateColumnFamilies(const std::vector<std::string>& cfs,
+                            const Options& options) {
+    ColumnFamilyOptions cf_opts(options);
+    size_t cfi = handles_.size();
+    handles_.resize(cfi + cfs.size());
+    for (auto cf : cfs) {
+      ASSERT_OK(db_->CreateColumnFamily(cf_opts, cf, &handles_[cfi++]));
+    }
+  }
+
+  void CreateAndReopenWithCF(const std::vector<std::string>& cfs,
+                             const Options& options) {
+    CreateColumnFamilies(cfs, options);
+    std::vector<std::string> cfs_plus_default = cfs;
+    cfs_plus_default.insert(cfs_plus_default.begin(), kDefaultColumnFamilyName);
+    ReopenWithColumnFamilies(cfs_plus_default, options);
+  }
+
+  void ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+                                const std::vector<Options>& options) {
+    ASSERT_OK(TryReopenWithColumnFamilies(cfs, options));
+  }
+
+  void ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+                                const Options& options) {
+    ASSERT_OK(TryReopenWithColumnFamilies(cfs, options));
+  }
+
+  Status TryReopenWithColumnFamilies(
+      const std::vector<std::string>& cfs,
+      const std::vector<Options>& options) {
+    Close();
+    EXPECT_EQ(cfs.size(), options.size());
+    std::vector<ColumnFamilyDescriptor> column_families;
+    for (size_t i = 0; i < cfs.size(); ++i) {
+      column_families.push_back(ColumnFamilyDescriptor(cfs[i], options[i]));
+    }
+    DBOptions db_opts = DBOptions(options[0]);
+    return DB::Open(db_opts, dbname_, column_families, &handles_, &db_);
+  }
+
+  Status TryReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+                                     const Options& options) {
+    Close();
+    std::vector<Options> v_opts(cfs.size(), options);
+    return TryReopenWithColumnFamilies(cfs, v_opts);
+  }
+
+  void Reopen(const Options& options) {
+    ASSERT_OK(TryReopen(options));
+  }
+
+  void Close() {
+    for (auto h : handles_) {
+      delete h;
+    }
+    handles_.clear();
+    delete db_;
+    db_ = nullptr;
+  }
+
+  void DestroyAndReopen(const Options& options) {
+    // Destroy using last options
+    Destroy(last_options_);
+    ASSERT_OK(TryReopen(options));
+  }
+
+  void Destroy(const Options& options) {
+    Close();
+    ASSERT_OK(DestroyDB(dbname_, options));
+  }
+
+  Status ReadOnlyReopen(const Options& options) {
+    return DB::OpenForReadOnly(options, dbname_, &db_);
+  }
+
+  Status TryReopen(const Options& options) {
+    Close();
+    last_options_ = options;
+    return DB::Open(options, dbname_, &db_);
+  }
+
+  Status Flush(int cf = 0) {
+    if (cf == 0) {
+      return db_->Flush(FlushOptions());
+    } else {
+      return db_->Flush(FlushOptions(), handles_[cf]);
+    }
+  }
+
+  Status Put(const Slice& k, const Slice& v, WriteOptions wo = WriteOptions()) {
+    return db_->Put(wo, k, v);
+  }
+
+  Status Put(int cf, const Slice& k, const Slice& v,
+             WriteOptions wo = WriteOptions()) {
+    return db_->Put(wo, handles_[cf], k, v);
+  }
+
+  Status Delete(const std::string& k) {
+    return db_->Delete(WriteOptions(), k);
+  }
+
+  Status Delete(int cf, const std::string& k) {
+    return db_->Delete(WriteOptions(), handles_[cf], k);
+  }
+
+  std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) {
+    ReadOptions options;
+    options.verify_checksums = true;
+    options.snapshot = snapshot;
+    std::string result;
+    Status s = db_->Get(options, k, &result);
+    if (s.IsNotFound()) {
+      result = "NOT_FOUND";
+    } else if (!s.ok()) {
+      result = s.ToString();
+    }
+    return result;
+  }
+
+  std::string Get(int cf, const std::string& k,
+                  const Snapshot* snapshot = nullptr) {
+    ReadOptions options;
+    options.verify_checksums = true;
+    options.snapshot = snapshot;
+    std::string result;
+    Status s = db_->Get(options, handles_[cf], k, &result);
+    if (s.IsNotFound()) {
+      result = "NOT_FOUND";
+    } else if (!s.ok()) {
+      result = s.ToString();
+    }
+    return result;
+  }
+
+  int NumTableFilesAtLevel(int level, int cf = 0) {
+    std::string property;
+    if (cf == 0) {
+      // default cfd
+      EXPECT_TRUE(db_->GetProperty(
+          "rocksdb.num-files-at-level" + NumberToString(level), &property));
+    } else {
+      EXPECT_TRUE(db_->GetProperty(
+          handles_[cf], "rocksdb.num-files-at-level" + NumberToString(level),
+          &property));
+    }
+    return atoi(property.c_str());
+  }
+
+  // Return spread of files per level
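+  // (e.g. "3,1" means three files at L0 and one at L1; trailing levels
+  // with zero files are trimmed from the result)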
+  std::string FilesPerLevel(int cf = 0) {
+    int num_levels =
+        (cf == 0) ? db_->NumberLevels() : db_->NumberLevels(handles_[1]);
+    std::string result;
+    size_t last_non_zero_offset = 0;
+    for (int level = 0; level < num_levels; level++) {
+      int f = NumTableFilesAtLevel(level, cf);
+      char buf[100];
+      snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
+      result += buf;
+      if (f > 0) {
+        last_non_zero_offset = result.size();
+      }
+    }
+    result.resize(last_non_zero_offset);
+    return result;
+  }
+
+  uint64_t Size(const Slice& start, const Slice& limit, int cf = 0) {
+    Range r(start, limit);
+    uint64_t size;
+    if (cf == 0) {
+      db_->GetApproximateSizes(&r, 1, &size);
+    } else {
+      db_->GetApproximateSizes(handles_[1], &r, 1, &size);
+    }
+    return size;
+  }
+
+  void Compact(int cf, const Slice& start, const Slice& limit,
+               uint32_t target_path_id) {
+    CompactRangeOptions compact_options;
+    compact_options.target_path_id = target_path_id;
+    ASSERT_OK(db_->CompactRange(compact_options, handles_[cf], &start, &limit));
+  }
+
+  void Compact(int cf, const Slice& start, const Slice& limit) {
+    ASSERT_OK(
+        db_->CompactRange(CompactRangeOptions(), handles_[cf], &start, &limit));
+  }
+
+  void Compact(const Slice& start, const Slice& limit) {
+    ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, &limit));
+  }
+
+  void TEST_Compact(int level, int cf, const Slice& start, const Slice& limit) {
+    ASSERT_OK(dbfull()->TEST_CompactRange(level, &start, &limit, handles_[cf],
+                                          true /* disallow trivial move */));
+  }
+
+  // Do n memtable compactions, each of which produces an sstable
+  // covering the range [small,large].
+  void MakeTables(int n, const std::string& small, const std::string& large,
+                  int cf = 0) {
+    for (int i = 0; i < n; i++) {
+      ASSERT_OK(Put(cf, small, "begin"));
+      ASSERT_OK(Put(cf, large, "end"));
+      ASSERT_OK(Flush(cf));
+    }
+  }
+
+  static void SetDeletionCompactionStats(
+      CompactionJobStats *stats, uint64_t input_deletions,
+      uint64_t expired_deletions, uint64_t records_replaced) {
+    stats->num_input_deletion_records = input_deletions;
+    stats->num_expired_deletion_records = expired_deletions;
+    stats->num_records_replaced = records_replaced;
+  }
+
+  void MakeTableWithKeyValues(
+    Random* rnd, uint64_t smallest, uint64_t largest,
+    int key_size, int value_size, uint64_t interval,
+    double ratio, int cf = 0) {
+    for (auto key = smallest; key < largest; key += interval) {
+      ASSERT_OK(Put(cf, Slice(Key(key, key_size)),
+                        Slice(RandomString(rnd, value_size, ratio))));
+    }
+    ASSERT_OK(Flush(cf));
+  }
+
+  // This function assumes that two rounds of keys have been inserted
+  // into the database, matching the behavior of the DeletionStatsTest.
+  void SelectivelyDeleteKeys(uint64_t smallest, uint64_t largest,
+    uint64_t interval, int deletion_interval, int key_size,
+    uint64_t cutoff_key_num, CompactionJobStats* stats, int cf = 0) {
+
+    // interval needs to be >= 2 so that deletion entries can be inserted
+    // that do not delete an existing key, by placing them at an offset of
+    // 1 from another existing key.
+    ASSERT_GE(interval, 2);
+
+    uint64_t ctr = 1;
+    uint32_t deletions_made = 0;
+    uint32_t num_deleted = 0;
+    uint32_t num_expired = 0;
+    for (auto key = smallest; key <= largest; key += interval, ctr++) {
+      if (ctr % deletion_interval == 0) {
+        ASSERT_OK(Delete(cf, Key(key, key_size)));
+        deletions_made++;
+        num_deleted++;
+
+        if (key > cutoff_key_num) {
+          num_expired++;
+        }
+      }
+    }
+
+    // Insert some deletions for keys that don't exist, both inside and
+    // outside the key range.
+    ASSERT_OK(Delete(cf, Key(smallest+1, key_size)));
+    deletions_made++;
+
+    ASSERT_OK(Delete(cf, Key(smallest-1, key_size)));
+    deletions_made++;
+    num_expired++;
+
+    ASSERT_OK(Delete(cf, Key(smallest-9, key_size)));
+    deletions_made++;
+    num_expired++;
+
+    ASSERT_OK(Flush(cf));
+    SetDeletionCompactionStats(stats, deletions_made, num_expired,
+      num_deleted);
+  }
+};
+
+// An EventListener which helps verify the compaction results in
+// the test CompactionJobStatsTest.
+class CompactionJobStatsChecker : public EventListener {
+ public:
+  CompactionJobStatsChecker()
+      : compression_enabled_(false), verify_next_comp_io_stats_(false) {}
+
+  size_t NumberOfUnverifiedStats() { return expected_stats_.size(); }
+
+  void set_verify_next_comp_io_stats(bool v) { verify_next_comp_io_stats_ = v; }
+
+  // Once a compaction has completed, this function verifies the stats of
+  // the returned CompactionJobInfo against the oldest expected stats added
+  // earlier to "expected_stats_" that have not yet been used for verification.
+  virtual void OnCompactionCompleted(DB *db, const CompactionJobInfo& ci) {
+    if (verify_next_comp_io_stats_) {
+      ASSERT_GT(ci.stats.file_write_nanos, 0);
+      ASSERT_GT(ci.stats.file_range_sync_nanos, 0);
+      ASSERT_GT(ci.stats.file_fsync_nanos, 0);
+      ASSERT_GT(ci.stats.file_prepare_write_nanos, 0);
+      verify_next_comp_io_stats_ = false;
+    }
+
+    std::lock_guard<std::mutex> lock(mutex_);
+    if (expected_stats_.size()) {
+      Verify(ci.stats, expected_stats_.front());
+      expected_stats_.pop();
+    }
+  }
+
+  // A helper function which verifies whether two CompactionJobStats
+  // match.  All compaction stats are verified with ASSERT_EQ except
+  // for the total input / output bytes, which are checked with
+  // ASSERT_GE and ASSERT_LE against a reasonable bias: 10% in the
+  // uncompressed case and 20% when compression is used.  With a 10%
+  // bias, for instance, the observed total must lie within
+  // [expected / 1.10, expected * 1.10].
+  virtual void Verify(const CompactionJobStats& current_stats,
+              const CompactionJobStats& stats) {
+    // time
+    ASSERT_GT(current_stats.elapsed_micros, 0U);
+
+    ASSERT_EQ(current_stats.num_input_records,
+        stats.num_input_records);
+    ASSERT_EQ(current_stats.num_input_files,
+        stats.num_input_files);
+    ASSERT_EQ(current_stats.num_input_files_at_output_level,
+        stats.num_input_files_at_output_level);
+
+    ASSERT_EQ(current_stats.num_output_records,
+        stats.num_output_records);
+    ASSERT_EQ(current_stats.num_output_files,
+        stats.num_output_files);
+
+    ASSERT_EQ(current_stats.is_manual_compaction,
+        stats.is_manual_compaction);
+
+    // file size
+    double kFileSizeBias = compression_enabled_ ? 0.20 : 0.10;
+    ASSERT_GE(current_stats.total_input_bytes * (1.00 + kFileSizeBias),
+              stats.total_input_bytes);
+    ASSERT_LE(current_stats.total_input_bytes,
+              stats.total_input_bytes * (1.00 + kFileSizeBias));
+    ASSERT_GE(current_stats.total_output_bytes * (1.00 + kFileSizeBias),
+              stats.total_output_bytes);
+    ASSERT_LE(current_stats.total_output_bytes,
+              stats.total_output_bytes * (1.00 + kFileSizeBias));
+    ASSERT_EQ(current_stats.total_input_raw_key_bytes,
+              stats.total_input_raw_key_bytes);
+    ASSERT_EQ(current_stats.total_input_raw_value_bytes,
+              stats.total_input_raw_value_bytes);
+
+    ASSERT_EQ(current_stats.num_records_replaced,
+        stats.num_records_replaced);
+
+    ASSERT_EQ(current_stats.num_corrupt_keys,
+        stats.num_corrupt_keys);
+
+    ASSERT_EQ(
+        std::string(current_stats.smallest_output_key_prefix),
+        std::string(stats.smallest_output_key_prefix));
+    ASSERT_EQ(
+        std::string(current_stats.largest_output_key_prefix),
+        std::string(stats.largest_output_key_prefix));
+  }
+
+  // Add an expected compaction stats entry, which will be used to
+  // verify the CompactionJobStats returned by the OnCompactionCompleted()
+  // callback.
+  void AddExpectedStats(const CompactionJobStats& stats) {
+    std::lock_guard<std::mutex> lock(mutex_);
+    expected_stats_.push(stats);
+  }
+
+  void EnableCompression(bool flag) {
+    compression_enabled_ = flag;
+  }
+
+  bool verify_next_comp_io_stats() const { return verify_next_comp_io_stats_; }
+
+ private:
+  std::mutex mutex_;
+  std::queue<CompactionJobStats> expected_stats_;
+  bool compression_enabled_;
+  bool verify_next_comp_io_stats_;
+};
+
+// An EventListener which helps verify the compaction statistics in
+// the test DeletionStatsTest.
+class CompactionJobDeletionStatsChecker : public CompactionJobStatsChecker {
+ public:
+  // Verifies whether two CompactionJobStats match.
+  void Verify(const CompactionJobStats& current_stats,
+              const CompactionJobStats& stats) {
+    ASSERT_EQ(
+      current_stats.num_input_deletion_records,
+      stats.num_input_deletion_records);
+    ASSERT_EQ(
+        current_stats.num_expired_deletion_records,
+        stats.num_expired_deletion_records);
+    ASSERT_EQ(
+        current_stats.num_records_replaced,
+        stats.num_records_replaced);
+
+    ASSERT_EQ(current_stats.num_corrupt_keys,
+        stats.num_corrupt_keys);
+  }
+};
+
+namespace {
+
+uint64_t EstimatedFileSize(
+    uint64_t num_records, size_t key_size, size_t value_size,
+    double compression_ratio = 1.0,
+    size_t block_size = 4096,
+    int bloom_bits_per_key = 10) {
+  const size_t kPerKeyOverhead = 8;
+  const size_t kFooterSize = 512;
+
+  uint64_t data_size =
+      num_records * (key_size + value_size * compression_ratio +
+                     kPerKeyOverhead);
+
+  return data_size + kFooterSize
+         + num_records * bloom_bits_per_key / 8      // filter block
+         + data_size * (key_size + 8) / block_size;  // index block
+}
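+// A rough worked example of the estimate above: with 100 records, key_size
+// 10, value_size 1000 and the default parameters, data_size is
+// 100 * (10 + 1000 + 8) = 101800 bytes; the filter block adds
+// 100 * 10 / 8 = 125 bytes and the index block 101800 * 18 / 4096 = 447
+// bytes, for a total of about 102884 bytes including the 512-byte footer.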
+
+namespace {
+
+void CopyPrefix(
+    const Slice& src, size_t prefix_length, std::string* dst) {
+  assert(prefix_length > 0);
+  size_t length = src.size() > prefix_length ? prefix_length : src.size();
+  dst->assign(src.data(), length);
+}
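+// For example, CopyPrefix("abcdef", 4, &dst) leaves dst == "abcd", while a
+// source shorter than prefix_length is copied in full.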
+
+}  // namespace
+
+CompactionJobStats NewManualCompactionJobStats(
+    const std::string& smallest_key, const std::string& largest_key,
+    size_t num_input_files, size_t num_input_files_at_output_level,
+    uint64_t num_input_records, size_t key_size, size_t value_size,
+    size_t num_output_files, uint64_t num_output_records,
+    double compression_ratio, uint64_t num_records_replaced,
+    bool is_manual = true) {
+  CompactionJobStats stats;
+  stats.Reset();
+
+  stats.num_input_records = num_input_records;
+  stats.num_input_files = num_input_files;
+  stats.num_input_files_at_output_level = num_input_files_at_output_level;
+
+  stats.num_output_records = num_output_records;
+  stats.num_output_files = num_output_files;
+
+  stats.total_input_bytes =
+      EstimatedFileSize(
+          num_input_records / num_input_files,
+          key_size, value_size, compression_ratio) * num_input_files;
+  stats.total_output_bytes =
+      EstimatedFileSize(
+          num_output_records / num_output_files,
+          key_size, value_size, compression_ratio) * num_output_files;
+  stats.total_input_raw_key_bytes =
+      num_input_records * (key_size + 8);
+  stats.total_input_raw_value_bytes =
+      num_input_records * value_size;
+
+  stats.is_manual_compaction = is_manual;
+
+  stats.num_records_replaced = num_records_replaced;
+
+  CopyPrefix(smallest_key,
+             CompactionJobStats::kMaxPrefixLength,
+             &stats.smallest_output_key_prefix);
+  CopyPrefix(largest_key,
+             CompactionJobStats::kMaxPrefixLength,
+             &stats.largest_output_key_prefix);
+
+  return stats;
+}
+
+CompressionType GetAnyCompression() {
+  if (Snappy_Supported()) {
+    return kSnappyCompression;
+  } else if (Zlib_Supported()) {
+    return kZlibCompression;
+  } else if (BZip2_Supported()) {
+    return kBZip2Compression;
+  } else if (LZ4_Supported()) {
+    return kLZ4Compression;
+  }
+  return kNoCompression;
+}
+
+}  // namespace
+
+TEST_P(CompactionJobStatsTest, CompactionJobStatsTest) {
+  Random rnd(301);
+  const int kBufSize = 100;
+  char buf[kBufSize];
+  uint64_t key_base = 100000000l;
+  // Note: key_base must be a multiple of num_keys_per_L0_file
+  int num_keys_per_L0_file = 100;
+  const int kTestScale = 8;
+  const int kKeySize = 10;
+  const int kValueSize = 1000;
+  const double kCompressionRatio = 0.5;
+  double compression_ratio = 1.0;
+  uint64_t key_interval = key_base / num_keys_per_L0_file;
+
+  // Whenever a compaction completes, this listener will try to
+  // verify whether the returned CompactionJobStats matches
+  // what we expect.  The expected CompactionJobStats is added
+  // via AddExpectedStats().
+  auto* stats_checker = new CompactionJobStatsChecker();
+  Options options;
+  options.listeners.emplace_back(stats_checker);
+  options.create_if_missing = true;
+  options.max_background_flushes = 0;
+  // just enough settings to hold off auto-compaction.
+  options.level0_file_num_compaction_trigger = kTestScale + 1;
+  options.num_levels = 3;
+  options.compression = kNoCompression;
+  options.max_subcompactions = max_subcompactions_;
+  options.bytes_per_sync = 512 * 1024;
+
+  options.compaction_measure_io_stats = true;
+  for (int test = 0; test < 2; ++test) {
+    DestroyAndReopen(options);
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    // 1st Phase: generate "num_L0_files" L0 files.
+    int num_L0_files = 0;
+    for (uint64_t start_key = key_base;
+                  start_key <= key_base * kTestScale;
+                  start_key += key_base) {
+      MakeTableWithKeyValues(
+          &rnd, start_key, start_key + key_base - 1,
+          kKeySize, kValueSize, key_interval,
+          compression_ratio, 1);
+      snprintf(buf, kBufSize, "%d", ++num_L0_files);
+      ASSERT_EQ(std::string(buf), FilesPerLevel(1));
+    }
+    ASSERT_EQ(ToString(num_L0_files), FilesPerLevel(1));
+
+    // 2nd Phase: perform L0 -> L1 compaction.
+    int L0_compaction_count = 6;
+    int count = 1;
+    std::string smallest_key;
+    std::string largest_key;
+    for (uint64_t start_key = key_base;
+         start_key <= key_base * L0_compaction_count;
+         start_key += key_base, count++) {
+      smallest_key = Key(start_key, 10);
+      largest_key = Key(start_key + key_base - key_interval, 10);
+      stats_checker->AddExpectedStats(
+          NewManualCompactionJobStats(
+              smallest_key, largest_key,
+              1, 0, num_keys_per_L0_file,
+              kKeySize, kValueSize,
+              1, num_keys_per_L0_file,
+              compression_ratio, 0));
+      ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 1U);
+      TEST_Compact(0, 1, smallest_key, largest_key);
+      snprintf(buf, kBufSize, "%d,%d", num_L0_files - count, count);
+      ASSERT_EQ(std::string(buf), FilesPerLevel(1));
+    }
+
+    // compact two files into one in the last L0 -> L1 compaction
+    int num_remaining_L0 = num_L0_files - L0_compaction_count;
+    smallest_key = Key(key_base * (L0_compaction_count + 1), 10);
+    largest_key = Key(key_base * (kTestScale + 1) - key_interval, 10);
+    stats_checker->AddExpectedStats(
+        NewManualCompactionJobStats(
+            smallest_key, largest_key,
+            num_remaining_L0,
+            0, num_keys_per_L0_file * num_remaining_L0,
+            kKeySize, kValueSize,
+            1, num_keys_per_L0_file * num_remaining_L0,
+            compression_ratio, 0));
+    ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 1U);
+    TEST_Compact(0, 1, smallest_key, largest_key);
+
+    int num_L1_files = num_L0_files - num_remaining_L0 + 1;
+    num_L0_files = 0;
+    snprintf(buf, kBufSize, "%d,%d", num_L0_files, num_L1_files);
+    ASSERT_EQ(std::string(buf), FilesPerLevel(1));
+
+    // 3rd Phase: generate sparse L0 files (wider key range, same number of keys)
+    int sparseness = 2;
+    for (uint64_t start_key = key_base;
+                  start_key <= key_base * kTestScale;
+                  start_key += key_base * sparseness) {
+      MakeTableWithKeyValues(
+          &rnd, start_key, start_key + key_base * sparseness - 1,
+          kKeySize, kValueSize,
+          key_base * sparseness / num_keys_per_L0_file,
+          compression_ratio, 1);
+      snprintf(buf, kBufSize, "%d,%d", ++num_L0_files, num_L1_files);
+      ASSERT_EQ(std::string(buf), FilesPerLevel(1));
+    }
+
+    // 4th Phase: perform L0 -> L1 compaction again, expecting higher write
+    // amplification. When subcompactions are enabled, the number of output
+    // files increases by 1 because multiple threads consume the input and
+    // generate output files without coordinating to see whether the output
+    // could fit into fewer files, as it does when the compaction runs
+    // sequentially.
+    int num_output_files = options.max_subcompactions > 1 ? 2 : 1;
+    for (uint64_t start_key = key_base;
+         num_L0_files > 1;
+         start_key += key_base * sparseness) {
+      smallest_key = Key(start_key, 10);
+      largest_key =
+          Key(start_key + key_base * sparseness - key_interval, 10);
+      stats_checker->AddExpectedStats(
+          NewManualCompactionJobStats(
+              smallest_key, largest_key,
+              3, 2, num_keys_per_L0_file * 3,
+              kKeySize, kValueSize,
+              num_output_files,
+              num_keys_per_L0_file * 2,  // 1/3 of the data will be updated.
+              compression_ratio,
+              num_keys_per_L0_file));
+      ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 1U);
+      Compact(1, smallest_key, largest_key);
+      if (options.max_subcompactions == 1) {
+        --num_L1_files;
+      }
+      snprintf(buf, kBufSize, "%d,%d", --num_L0_files, num_L1_files);
+      ASSERT_EQ(std::string(buf), FilesPerLevel(1));
+    }
+
+    // 5th Phase: Do a full compaction, which involves two sub-compactions.
+    // Here we expect to have 1 L0 file and 4 L1 files.
+    // In the first sub-compaction, we expect an L0 compaction.
+    smallest_key = Key(key_base, 10);
+    largest_key = Key(key_base * (kTestScale + 1) - key_interval, 10);
+    stats_checker->AddExpectedStats(
+        NewManualCompactionJobStats(
+            Key(key_base * (kTestScale + 1 - sparseness), 10), largest_key,
+            2, 1, num_keys_per_L0_file * 3,
+            kKeySize, kValueSize,
+            1, num_keys_per_L0_file * 2,
+            compression_ratio,
+            num_keys_per_L0_file));
+    ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 1U);
+    Compact(1, smallest_key, largest_key);
+
+    num_L1_files = options.max_subcompactions > 1 ? 7 : 4;
+    char L1_buf[4];
+    snprintf(L1_buf, sizeof(L1_buf), "0,%d", num_L1_files);
+    std::string L1_files(L1_buf);
+    ASSERT_EQ(L1_files, FilesPerLevel(1));
+    options.compression = GetAnyCompression();
+    if (options.compression == kNoCompression) {
+      break;
+    }
+    stats_checker->EnableCompression(true);
+    compression_ratio = kCompressionRatio;
+
+    for (int i = 0; i < 5; i++) {
+      ASSERT_OK(Put(1, Slice(Key(key_base + i, 10)),
+                    Slice(RandomString(&rnd, 512 * 1024, 1))));
+    }
+
+    ASSERT_OK(Flush(1));
+    reinterpret_cast<DBImpl*>(db_)->TEST_WaitForCompact();
+
+    stats_checker->set_verify_next_comp_io_stats(true);
+    std::atomic<bool> first_prepare_write(true);
+    rocksdb::SyncPoint::GetInstance()->SetCallBack(
+        "WritableFileWriter::Append:BeforePrepareWrite", [&](void* arg) {
+          if (first_prepare_write.load()) {
+            options.env->SleepForMicroseconds(3);
+            first_prepare_write.store(false);
+          }
+        });
+
+    std::atomic<bool> first_flush(true);
+    rocksdb::SyncPoint::GetInstance()->SetCallBack(
+        "WritableFileWriter::Flush:BeforeAppend", [&](void* arg) {
+          if (first_flush.load()) {
+            options.env->SleepForMicroseconds(3);
+            first_flush.store(false);
+          }
+        });
+
+    std::atomic<bool> first_sync(true);
+    rocksdb::SyncPoint::GetInstance()->SetCallBack(
+        "WritableFileWriter::SyncInternal:0", [&](void* arg) {
+          if (first_sync.load()) {
+            options.env->SleepForMicroseconds(3);
+            first_sync.store(false);
+          }
+        });
+
+    std::atomic<bool> first_range_sync(true);
+    rocksdb::SyncPoint::GetInstance()->SetCallBack(
+        "WritableFileWriter::RangeSync:0", [&](void* arg) {
+          if (first_range_sync.load()) {
+            options.env->SleepForMicroseconds(3);
+            first_range_sync.store(false);
+          }
+        });
+    rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+    Compact(1, smallest_key, largest_key);
+
+    ASSERT_TRUE(!stats_checker->verify_next_comp_io_stats());
+    ASSERT_TRUE(!first_prepare_write.load());
+    ASSERT_TRUE(!first_flush.load());
+    ASSERT_TRUE(!first_sync.load());
+    ASSERT_TRUE(!first_range_sync.load());
+    rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+  }
+  ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 0U);
+}
+
+TEST_P(CompactionJobStatsTest, DeletionStatsTest) {
+  Random rnd(301);
+  uint64_t key_base = 100000l;
+  // Note: key_base must be a multiple of num_keys_per_L0_file
+  int num_keys_per_L0_file = 20;
+  const int kTestScale = 8;  // make sure this is even
+  const int kKeySize = 10;
+  const int kValueSize = 100;
+  double compression_ratio = 1.0;
+  uint64_t key_interval = key_base / num_keys_per_L0_file;
+  uint64_t largest_key_num = key_base * (kTestScale + 1) - key_interval;
+  uint64_t cutoff_key_num = key_base * (kTestScale / 2 + 1) - key_interval;
+  const std::string smallest_key = Key(key_base - 10, kKeySize);
+  const std::string largest_key = Key(largest_key_num + 10, kKeySize);
+
+  // Whenever a compaction completes, this listener will try to
+  // verify whether the returned CompactionJobStats matches
+  // what we expect.
+  auto* stats_checker = new CompactionJobDeletionStatsChecker();
+  Options options;
+  options.listeners.emplace_back(stats_checker);
+  options.create_if_missing = true;
+  options.max_background_flushes = 0;
+  options.level0_file_num_compaction_trigger = kTestScale+1;
+  options.num_levels = 3;
+  options.compression = kNoCompression;
+  options.max_bytes_for_level_multiplier = 2;
+  options.max_subcompactions = max_subcompactions_;
+
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  // Stage 1: Generate several L0 files and then send them to L2 by
+  // using CompactRangeOptions and CompactRange(). These files will
+  // contain a strict subset of the keys from the full key range.
+  for (uint64_t start_key = key_base;
+                start_key <= key_base * kTestScale / 2;
+                start_key += key_base) {
+    MakeTableWithKeyValues(
+        &rnd, start_key, start_key + key_base - 1,
+        kKeySize, kValueSize, key_interval,
+        compression_ratio, 1);
+  }
+
+  CompactRangeOptions cr_options;
+  cr_options.change_level = true;
+  cr_options.target_level = 2;
+  db_->CompactRange(cr_options, handles_[1], nullptr, nullptr);
+  ASSERT_GT(NumTableFilesAtLevel(2, 1), 0);
+
+  // Stage 2: Generate files including keys from the entire key range
+  for (uint64_t start_key = key_base;
+                start_key <= key_base * kTestScale;
+                start_key += key_base) {
+    MakeTableWithKeyValues(
+        &rnd, start_key, start_key + key_base - 1,
+        kKeySize, kValueSize, key_interval,
+        compression_ratio, 1);
+  }
+
+  // Send these L0 files to L1
+  TEST_Compact(0, 1, smallest_key, largest_key);
+  ASSERT_GT(NumTableFilesAtLevel(1, 1), 0);
+
+  // Add a new record and flush so that there is now an L0 file
+  // with a value too (not just the deletions from the next step).
+  ASSERT_OK(Put(1, Key(key_base-6, kKeySize), "test"));
+  ASSERT_OK(Flush(1));
+
+  // Stage 3: Generate L0 files with some deletions so that there are now
+  // files with the same key range in L0, L1, and L2.
+  int deletion_interval = 3;
+  CompactionJobStats first_compaction_stats;
+  SelectivelyDeleteKeys(key_base, largest_key_num,
+      key_interval, deletion_interval, kKeySize, cutoff_key_num,
+      &first_compaction_stats, 1);
+
+  stats_checker->AddExpectedStats(first_compaction_stats);
+
+  // Stage 4: Trigger compaction and verify the stats
+  TEST_Compact(0, 1, smallest_key, largest_key);
+}
+
+namespace {
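+// Returns the lowest set bit of num_flushes as the expected number of
+// compaction input units, or 0 when that bit is 1 (e.g. 6 flushes, 0b110,
+// yield 2 input units; 8 flushes yield 8; 5 flushes, being odd, yield 0).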
+int GetUniversalCompactionInputUnits(uint32_t num_flushes) {
+  uint32_t compaction_input_units;
+  for (compaction_input_units = 1;
+       num_flushes >= compaction_input_units;
+       compaction_input_units *= 2) {
+    if ((num_flushes & compaction_input_units) != 0) {
+      return compaction_input_units > 1 ? compaction_input_units : 0;
+    }
+  }
+  return 0;
+}
+}  // namespace
+
+TEST_P(CompactionJobStatsTest, UniversalCompactionTest) {
+  Random rnd(301);
+  uint64_t key_base = 100000000l;
+  // Note: key_base must be a multiple of num_keys_per_table
+  int num_keys_per_table = 100;
+  const uint32_t kTestScale = 8;
+  const int kKeySize = 10;
+  const int kValueSize = 900;
+  double compression_ratio = 1.0;
+  uint64_t key_interval = key_base / num_keys_per_table;
+
+  auto* stats_checker = new CompactionJobStatsChecker();
+  Options options;
+  options.listeners.emplace_back(stats_checker);
+  options.create_if_missing = true;
+  options.num_levels = 3;
+  options.compression = kNoCompression;
+  options.level0_file_num_compaction_trigger = 2;
+  options.target_file_size_base = num_keys_per_table * 1000;
+  options.compaction_style = kCompactionStyleUniversal;
+  options.compaction_options_universal.size_ratio = 1;
+  options.compaction_options_universal.max_size_amplification_percent = 1000;
+  options.max_subcompactions = max_subcompactions_;
+
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  // Generates the expected CompactionJobStats for each compaction
+  for (uint32_t num_flushes = 2; num_flushes <= kTestScale; num_flushes++) {
+    // Here we treat one newly flushed file as a unit.
+    //
+    // For example, if a newly flushed file is 100k, and a compaction has
+    // 4 input units, then this compaction inputs 400k.
+    uint32_t num_input_units = GetUniversalCompactionInputUnits(num_flushes);
+    if (num_input_units == 0) {
+      continue;
+    }
+    // The following statement determines the expected smallest key
+    // based on whether it is a full compaction.  A full compaction only
+    // happens when the number of flushes equals the number of compaction
+    // input runs.
+    uint64_t smallest_key =
+        (num_flushes == num_input_units) ?
+            key_base : key_base * (num_flushes - 1);
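+    // For example, after the 4th flush all four runs are compacted and the
+    // smallest key is key_base, while after the 6th flush only two runs are
+    // compacted and the smallest key is key_base * 5.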
+
+    stats_checker->AddExpectedStats(
+        NewManualCompactionJobStats(
+            Key(smallest_key, 10),
+            Key(smallest_key + key_base * num_input_units - key_interval, 10),
+            num_input_units,
+            num_input_units > 2 ? num_input_units / 2 : 0,
+            num_keys_per_table * num_input_units,
+            kKeySize, kValueSize,
+            num_input_units,
+            num_keys_per_table * num_input_units,
+            1.0, 0, false));
+  }
+  ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 4U);
+
+  for (uint64_t start_key = key_base;
+                start_key <= key_base * kTestScale;
+                start_key += key_base) {
+    MakeTableWithKeyValues(
+        &rnd, start_key, start_key + key_base - 1,
+        kKeySize, kValueSize, key_interval,
+        compression_ratio, 1);
+    reinterpret_cast<DBImpl*>(db_)->TEST_WaitForCompact();
+  }
+  ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 0U);
+}
+
+INSTANTIATE_TEST_CASE_P(CompactionJobStatsTest, CompactionJobStatsTest,
+                        ::testing::Values(1, 4));
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  rocksdb::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int argc, char** argv) {
+  fprintf(stderr, "SKIPPED, not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // !ROCKSDB_LITE
+
+#else
+
+int main(int argc, char** argv) { return 0; }
+#endif  // !defined(IOS_CROSS_COMPILE)
diff --git a/src/rocksdb/db/compaction_job_test.cc b/src/rocksdb/db/compaction_job_test.cc
index e4c407a..b1a8909 100644
--- a/src/rocksdb/db/compaction_job_test.cc
+++ b/src/rocksdb/db/compaction_job_test.cc
@@ -3,23 +3,63 @@
 //  LICENSE file in the root directory of this source tree. An additional grant
 //  of patent rights can be found in the PATENTS file in the same directory.
 
+#include <algorithm>
 #include <map>
 #include <string>
+#include <tuple>
 
 #include "db/compaction_job.h"
 #include "db/column_family.h"
 #include "db/version_set.h"
 #include "db/writebuffer.h"
 #include "rocksdb/cache.h"
-#include "rocksdb/options.h"
 #include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "table/mock_table.h"
+#include "util/file_reader_writer.h"
 #include "util/string_util.h"
 #include "util/testharness.h"
 #include "util/testutil.h"
-#include "table/mock_table.h"
+#include "utilities/merge_operators.h"
 
 namespace rocksdb {
 
+namespace {
+
+void VerifyInitializationOfCompactionJobStats(
+      const CompactionJobStats& compaction_job_stats) {
+#if !defined(IOS_CROSS_COMPILE)
+  ASSERT_EQ(compaction_job_stats.elapsed_micros, 0U);
+
+  ASSERT_EQ(compaction_job_stats.num_input_records, 0U);
+  ASSERT_EQ(compaction_job_stats.num_input_files, 0U);
+  ASSERT_EQ(compaction_job_stats.num_input_files_at_output_level, 0U);
+
+  ASSERT_EQ(compaction_job_stats.num_output_records, 0U);
+  ASSERT_EQ(compaction_job_stats.num_output_files, 0U);
+
+  ASSERT_EQ(compaction_job_stats.is_manual_compaction, true);
+
+  ASSERT_EQ(compaction_job_stats.total_input_bytes, 0U);
+  ASSERT_EQ(compaction_job_stats.total_output_bytes, 0U);
+
+  ASSERT_EQ(compaction_job_stats.total_input_raw_key_bytes, 0U);
+  ASSERT_EQ(compaction_job_stats.total_input_raw_value_bytes, 0U);
+
+  ASSERT_EQ(compaction_job_stats.smallest_output_key_prefix[0], 0);
+  ASSERT_EQ(compaction_job_stats.largest_output_key_prefix[0], 0);
+
+  ASSERT_EQ(compaction_job_stats.num_records_replaced, 0U);
+
+  ASSERT_EQ(compaction_job_stats.num_input_deletion_records, 0U);
+  ASSERT_EQ(compaction_job_stats.num_expired_deletion_records, 0U);
+
+  ASSERT_EQ(compaction_job_stats.num_corrupt_keys, 0U);
+#endif  // !defined(IOS_CROSS_COMPILE)
+}
+
+}  // namespace
+
 // TODO(icanadi) Make it simpler once we mock out VersionSet
 class CompactionJobTest : public testing::Test {
  public:
@@ -37,12 +77,6 @@ class CompactionJobTest : public testing::Test {
     EXPECT_OK(env_->CreateDirIfMissing(dbname_));
     db_options_.db_paths.emplace_back(dbname_,
                                       std::numeric_limits<uint64_t>::max());
-    NewDB();
-    std::vector<ColumnFamilyDescriptor> column_families;
-    cf_options_.table_factory = mock_table_factory_;
-    column_families.emplace_back(kDefaultColumnFamilyName, cf_options_);
-
-    EXPECT_OK(versions_->Recover(column_families, false));
   }
 
   std::string GenerateFileName(uint64_t file_number) {
@@ -53,51 +87,98 @@ class CompactionJobTest : public testing::Test {
     return TableFileName(db_paths, meta.fd.GetNumber(), meta.fd.GetPathId());
   }
 
+  std::string KeyStr(const std::string& user_key, const SequenceNumber seq_num,
+      const ValueType t) {
+    return InternalKey(user_key, seq_num, t).Encode().ToString();
+  }
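+  // (KeyStr encodes the user key followed by an 8-byte trailer packing
+  // (seq_num << 8) | type, so entries sort by user key and then by
+  // descending sequence number.)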
+
+  void AddMockFile(const stl_wrappers::KVMap& contents, int level = 0) {
+    assert(contents.size() > 0);
+
+    bool first_key = true;
+    std::string smallest, largest;
+    InternalKey smallest_key, largest_key;
+    SequenceNumber smallest_seqno = kMaxSequenceNumber;
+    SequenceNumber largest_seqno = 0;
+    for (auto kv : contents) {
+      ParsedInternalKey key;
+      std::string skey;
+      std::string value;
+      std::tie(skey, value) = kv;
+      ParseInternalKey(skey, &key);
+
+      smallest_seqno = std::min(smallest_seqno, key.sequence);
+      largest_seqno = std::max(largest_seqno, key.sequence);
+
+      if (first_key ||
+          cfd_->user_comparator()->Compare(key.user_key, smallest) < 0) {
+        smallest.assign(key.user_key.data(), key.user_key.size());
+        smallest_key.DecodeFrom(skey);
+      }
+      if (first_key ||
+          cfd_->user_comparator()->Compare(key.user_key, largest) > 0) {
+        largest.assign(key.user_key.data(), key.user_key.size());
+        largest_key.DecodeFrom(skey);
+      }
+
+      first_key = false;
+    }
+
+    uint64_t file_number = versions_->NewFileNumber();
+    EXPECT_OK(mock_table_factory_->CreateMockTable(
+        env_, GenerateFileName(file_number), std::move(contents)));
+
+    VersionEdit edit;
+    edit.AddFile(level, file_number, 0, 10, smallest_key, largest_key,
+        smallest_seqno, largest_seqno, false);
+
+    mutex_.Lock();
+    versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(),
+                           mutable_cf_options_, &edit, &mutex_);
+    mutex_.Unlock();
+  }
+
+  void SetLastSequence(const SequenceNumber sequence_number) {
+    versions_->SetLastSequence(sequence_number + 1);
+  }
+
   // returns expected result after compaction
-  mock::MockFileContents CreateTwoFiles() {
-    mock::MockFileContents expected_results;
+  stl_wrappers::KVMap CreateTwoFiles(bool gen_corrupted_keys) {
+    auto expected_results = mock::MakeMockFile();
     const int kKeysPerFile = 10000;
+    const int kCorruptKeysPerFile = 200;
+    const int kMatchingKeys = kKeysPerFile / 2;
     SequenceNumber sequence_number = 0;
+
+    auto corrupt_id = [&](int id) {
+      return gen_corrupted_keys && id > 0 && id <= kCorruptKeysPerFile;
+    };
+
     for (int i = 0; i < 2; ++i) {
-      mock::MockFileContents contents;
-      SequenceNumber smallest_seqno = 0, largest_seqno = 0;
-      InternalKey smallest, largest;
+      auto contents = mock::MakeMockFile();
       for (int k = 0; k < kKeysPerFile; ++k) {
-        auto key = ToString(i * (kKeysPerFile / 2) + k);
+        auto key = ToString(i * kMatchingKeys + k);
         auto value = ToString(i * kKeysPerFile + k);
         InternalKey internal_key(key, ++sequence_number, kTypeValue);
         // This is how the key will look once it's written in the
         // bottommost file
         InternalKey bottommost_internal_key(key, 0, kTypeValue);
-        if (k == 0) {
-          smallest = internal_key;
-          smallest_seqno = sequence_number;
-        } else if (k == kKeysPerFile - 1) {
-          largest = internal_key;
-          largest_seqno = sequence_number;
+        if (corrupt_id(k)) {
+          test::CorruptKeyType(&internal_key);
+          test::CorruptKeyType(&bottommost_internal_key);
         }
-        std::pair<std::string, std::string> key_value(
-            {bottommost_internal_key.Encode().ToString(), value});
-        contents.insert(key_value);
-        if (i == 1 || k < kKeysPerFile / 2) {
-          expected_results.insert(key_value);
+        contents.insert({ internal_key.Encode().ToString(), value });
+        if (i == 1 || k < kMatchingKeys || corrupt_id(k - kMatchingKeys)) {
+          expected_results.insert(
+              { bottommost_internal_key.Encode().ToString(), value });
         }
       }
 
-      uint64_t file_number = versions_->NewFileNumber();
-      EXPECT_OK(mock_table_factory_->CreateMockTable(
-          env_, GenerateFileName(file_number), std::move(contents)));
+      AddMockFile(contents);
+    }
 
-      VersionEdit edit;
-      edit.AddFile(0, file_number, 0, 10, smallest, largest, smallest_seqno,
-                   largest_seqno);
+    SetLastSequence(sequence_number);
 
-      mutex_.Lock();
-      versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(),
-                             mutable_cf_options_, &edit, &mutex_);
-      mutex_.Unlock();
-    }
-    versions_->SetLastSequence(sequence_number);
     return expected_results;
   }
 
@@ -112,8 +193,10 @@ class CompactionJobTest : public testing::Test {
     Status s = env_->NewWritableFile(
         manifest, &file, env_->OptimizeForManifestWrite(env_options_));
     ASSERT_OK(s);
+    unique_ptr<WritableFileWriter> file_writer(
+        new WritableFileWriter(std::move(file), env_options_));
     {
-      log::Writer log(std::move(file));
+      log::Writer log(std::move(file_writer));
       std::string record;
       new_db.EncodeTo(&record);
       s = log.AddRecord(record);
@@ -121,6 +204,71 @@ class CompactionJobTest : public testing::Test {
     ASSERT_OK(s);
     // Make "CURRENT" file that points to the new manifest file.
     s = SetCurrentFile(env_, dbname_, 1, nullptr);
+
+    std::vector<ColumnFamilyDescriptor> column_families;
+    cf_options_.table_factory = mock_table_factory_;
+    cf_options_.merge_operator = merge_op_;
+    cf_options_.compaction_filter = compaction_filter_.get();
+    column_families.emplace_back(kDefaultColumnFamilyName, cf_options_);
+
+    EXPECT_OK(versions_->Recover(column_families, false));
+    cfd_ = versions_->GetColumnFamilySet()->GetDefault();
+  }
+
+  void RunCompaction(const std::vector<std::vector<FileMetaData*>>& input_files,
+                     const stl_wrappers::KVMap& expected_results,
+                     const std::vector<SequenceNumber>& snapshots = {}) {
+    auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+
+    size_t num_input_files = 0;
+    std::vector<CompactionInputFiles> compaction_input_files;
+    for (size_t level = 0; level < input_files.size(); level++) {
+      auto level_files = input_files[level];
+      CompactionInputFiles compaction_level;
+      compaction_level.level = static_cast<int>(level);
+      compaction_level.files.insert(compaction_level.files.end(),
+          level_files.begin(), level_files.end());
+      compaction_input_files.push_back(compaction_level);
+      num_input_files += level_files.size();
+    }
+
+    Compaction compaction(cfd->current()->storage_info(),
+                          *cfd->GetLatestMutableCFOptions(),
+                          compaction_input_files, 1, 1024 * 1024, 10, 0,
+                          kNoCompression, {}, true);
+    compaction.SetInputVersion(cfd->current());
+
+    LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, db_options_.info_log.get());
+    mutex_.Lock();
+    EventLogger event_logger(db_options_.info_log.get());
+    CompactionJob compaction_job(0, &compaction, db_options_, env_options_,
+                                 versions_.get(), &shutting_down_, &log_buffer,
+                                 nullptr, nullptr, nullptr, snapshots,
+                                 table_cache_, &event_logger, false, false,
+                                 dbname_, &compaction_job_stats_);
+
+    VerifyInitializationOfCompactionJobStats(compaction_job_stats_);
+
+    compaction_job.Prepare();
+    mutex_.Unlock();
+    Status s;
+    s = compaction_job.Run();
+    ASSERT_OK(s);
+    mutex_.Lock();
+    ASSERT_OK(compaction_job.Install(*cfd->GetLatestMutableCFOptions(),
+                                     &mutex_));
+    mutex_.Unlock();
+
+    if (expected_results.size() == 0) {
+      ASSERT_GE(compaction_job_stats_.elapsed_micros, 0U);
+      ASSERT_EQ(compaction_job_stats_.num_input_files, num_input_files);
+      ASSERT_EQ(compaction_job_stats_.num_output_files, 0U);
+    } else {
+      ASSERT_GE(compaction_job_stats_.elapsed_micros, 0U);
+      ASSERT_EQ(compaction_job_stats_.num_input_files, num_input_files);
+      ASSERT_EQ(compaction_job_stats_.num_output_files, 1U);
+      mock_table_factory_->AssertLatestFile(expected_results);
+    }
   }
 
   Env* env_;
@@ -136,49 +284,409 @@ class CompactionJobTest : public testing::Test {
   InstrumentedMutex mutex_;
   std::atomic<bool> shutting_down_;
   std::shared_ptr<mock::MockTableFactory> mock_table_factory_;
+  CompactionJobStats compaction_job_stats_;
+  ColumnFamilyData* cfd_;
+  std::unique_ptr<CompactionFilter> compaction_filter_;
+  std::shared_ptr<MergeOperator> merge_op_;
 };
 
 TEST_F(CompactionJobTest, Simple) {
+  NewDB();
+
+  auto expected_results = CreateTwoFiles(false);
   auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+  auto files = cfd->current()->storage_info()->LevelFiles(0);
+  ASSERT_EQ(2U, files.size());
+  RunCompaction({ files }, expected_results);
+}
 
-  auto expected_results = CreateTwoFiles();
+TEST_F(CompactionJobTest, SimpleCorrupted) {
+  NewDB();
 
+  auto expected_results = CreateTwoFiles(true);
+  auto cfd = versions_->GetColumnFamilySet()->GetDefault();
   auto files = cfd->current()->storage_info()->LevelFiles(0);
-  ASSERT_EQ(2U, files.size());
+  RunCompaction({files}, expected_results);
+  ASSERT_EQ(compaction_job_stats_.num_corrupt_keys, 400U);
+}
+
+TEST_F(CompactionJobTest, SimpleDeletion) {
+  NewDB();
+
+  auto file1 = mock::MakeMockFile({{KeyStr("c", 4U, kTypeDeletion), ""},
+                                   {KeyStr("c", 3U, kTypeValue), "val"}});
+  AddMockFile(file1);
+
+  auto file2 = mock::MakeMockFile({{KeyStr("b", 2U, kTypeValue), "val"},
+                                   {KeyStr("b", 1U, kTypeValue), "val"}});
+  AddMockFile(file2);
+
+  auto expected_results =
+      mock::MakeMockFile({{KeyStr("b", 0U, kTypeValue), "val"}});
+
+  SetLastSequence(4U);
+  auto files = cfd_->current()->storage_info()->LevelFiles(0);
+  RunCompaction({files}, expected_results);
+}
+
+TEST_F(CompactionJobTest, SimpleOverwrite) {
+  NewDB();
+
+  auto file1 = mock::MakeMockFile({
+      {KeyStr("a", 3U, kTypeValue), "val2"},
+      {KeyStr("b", 4U, kTypeValue), "val3"},
+  });
+  AddMockFile(file1);
+
+  auto file2 = mock::MakeMockFile({{KeyStr("a", 1U, kTypeValue), "val"},
+                                   {KeyStr("b", 2U, kTypeValue), "val"}});
+  AddMockFile(file2);
+
+  auto expected_results =
+      mock::MakeMockFile({{KeyStr("a", 0U, kTypeValue), "val2"},
+                          {KeyStr("b", 0U, kTypeValue), "val3"}});
+
+  SetLastSequence(4U);
+  auto files = cfd_->current()->storage_info()->LevelFiles(0);
+  RunCompaction({files}, expected_results);
+}
+
+TEST_F(CompactionJobTest, SimpleNonLastLevel) {
+  NewDB();
+
+  auto file1 = mock::MakeMockFile({
+      {KeyStr("a", 5U, kTypeValue), "val2"},
+      {KeyStr("b", 6U, kTypeValue), "val3"},
+  });
+  AddMockFile(file1);
+
+  auto file2 = mock::MakeMockFile({{KeyStr("a", 3U, kTypeValue), "val"},
+                                   {KeyStr("b", 4U, kTypeValue), "val"}});
+  AddMockFile(file2, 1);
+
+  auto file3 = mock::MakeMockFile({{KeyStr("a", 1U, kTypeValue), "val"},
+                                   {KeyStr("b", 2U, kTypeValue), "val"}});
+  AddMockFile(file3, 2);
+
+  // Because level 1 is not the last level, the sequence numbers of a and b
+  // cannot be set to 0
+  auto expected_results =
+      mock::MakeMockFile({{KeyStr("a", 5U, kTypeValue), "val2"},
+                          {KeyStr("b", 6U, kTypeValue), "val3"}});
+
+  SetLastSequence(6U);
+  auto lvl0_files = cfd_->current()->storage_info()->LevelFiles(0);
+  auto lvl1_files = cfd_->current()->storage_info()->LevelFiles(1);
+  RunCompaction({lvl0_files, lvl1_files}, expected_results);
+}
+
+TEST_F(CompactionJobTest, SimpleMerge) {
+  merge_op_ = MergeOperators::CreateStringAppendOperator();
+  NewDB();
+
+  auto file1 = mock::MakeMockFile({
+      {KeyStr("a", 5U, kTypeMerge), "5"},
+      {KeyStr("a", 4U, kTypeMerge), "4"},
+      {KeyStr("a", 3U, kTypeValue), "3"},
+  });
+  AddMockFile(file1);
+
+  auto file2 = mock::MakeMockFile(
+      {{KeyStr("b", 2U, kTypeMerge), "2"}, {KeyStr("b", 1U, kTypeValue), "1"}});
+  AddMockFile(file2);
+
+  auto expected_results =
+      mock::MakeMockFile({{KeyStr("a", 0U, kTypeValue), "3,4,5"},
+                          {KeyStr("b", 0U, kTypeValue), "1,2"}});
+
+  SetLastSequence(5U);
+  auto files = cfd_->current()->storage_info()->LevelFiles(0);
+  RunCompaction({files}, expected_results);
+}
+
+TEST_F(CompactionJobTest, NonAssocMerge) {
+  merge_op_ = MergeOperators::CreateStringAppendTESTOperator();
+  NewDB();
+
+  auto file1 = mock::MakeMockFile({
+      {KeyStr("a", 5U, kTypeMerge), "5"},
+      {KeyStr("a", 4U, kTypeMerge), "4"},
+      {KeyStr("a", 3U, kTypeMerge), "3"},
+  });
+  AddMockFile(file1);
+
+  auto file2 = mock::MakeMockFile(
+      {{KeyStr("b", 2U, kTypeMerge), "2"}, {KeyStr("b", 1U, kTypeMerge), "1"}});
+  AddMockFile(file2);
+
+  auto expected_results =
+      mock::MakeMockFile({{KeyStr("a", 0U, kTypeValue), "3,4,5"},
+                          {KeyStr("b", 2U, kTypeMerge), "2"},
+                          {KeyStr("b", 1U, kTypeMerge), "1"}});
+
+  SetLastSequence(5U);
+  auto files = cfd_->current()->storage_info()->LevelFiles(0);
+  RunCompaction({files}, expected_results);
+}
+
+// Filters merge operands with value 10.
+TEST_F(CompactionJobTest, MergeOperandFilter) {
+  merge_op_ = MergeOperators::CreateUInt64AddOperator();
+  compaction_filter_.reset(new test::FilterNumber(10U));
+  NewDB();
+
+  auto file1 = mock::MakeMockFile(
+      {{KeyStr("a", 5U, kTypeMerge), test::EncodeInt(5U)},
+       {KeyStr("a", 4U, kTypeMerge), test::EncodeInt(10U)},  // Filtered
+       {KeyStr("a", 3U, kTypeMerge), test::EncodeInt(3U)}});
+  AddMockFile(file1);
+
+  auto file2 = mock::MakeMockFile({
+      {KeyStr("b", 2U, kTypeMerge), test::EncodeInt(2U)},
+      {KeyStr("b", 1U, kTypeMerge), test::EncodeInt(10U)}  // Filtered
+  });
+  AddMockFile(file2);
+
+  auto expected_results =
+      mock::MakeMockFile({{KeyStr("a", 0U, kTypeValue), test::EncodeInt(8U)},
+                          {KeyStr("b", 2U, kTypeMerge), test::EncodeInt(2U)}});
+
+  SetLastSequence(5U);
+  auto files = cfd_->current()->storage_info()->LevelFiles(0);
+  RunCompaction({files}, expected_results);
+}
+
+TEST_F(CompactionJobTest, FilterSomeMergeOperands) {
+  merge_op_ = MergeOperators::CreateUInt64AddOperator();
+  compaction_filter_.reset(new test::FilterNumber(10U));
+  NewDB();
+
+  auto file1 = mock::MakeMockFile(
+      {{KeyStr("a", 5U, kTypeMerge), test::EncodeInt(5U)},
+       {KeyStr("a", 4U, kTypeMerge), test::EncodeInt(10U)},  // Filtered
+       {KeyStr("a", 3U, kTypeValue), test::EncodeInt(5U)},
+       {KeyStr("d", 8U, kTypeMerge), test::EncodeInt(10U)}});
+  AddMockFile(file1);
+
+  auto file2 =
+      mock::MakeMockFile({{KeyStr("b", 2U, kTypeMerge), test::EncodeInt(10U)},
+                          {KeyStr("b", 1U, kTypeMerge), test::EncodeInt(10U)},
+                          {KeyStr("c", 2U, kTypeMerge), test::EncodeInt(3U)},
+                          {KeyStr("c", 1U, kTypeValue), test::EncodeInt(7U)},
+                          {KeyStr("d", 1U, kTypeValue), test::EncodeInt(6U)}});
+  AddMockFile(file2);
+
+  auto file3 =
+      mock::MakeMockFile({{KeyStr("a", 1U, kTypeMerge), test::EncodeInt(3U)}});
+  AddMockFile(file3, 2);
+
+  auto expected_results = mock::MakeMockFile({
+      {KeyStr("a", 5U, kTypeValue), test::EncodeInt(10U)},
+      {KeyStr("c", 2U, kTypeValue), test::EncodeInt(10U)},
+      {KeyStr("d", 1U, kTypeValue), test::EncodeInt(6U)}
+      // b does not appear because the operands are filtered
+  });
+
+  SetLastSequence(5U);
+  auto files = cfd_->current()->storage_info()->LevelFiles(0);
+  RunCompaction({files}, expected_results);
+}
+
+// Test where all operands/merge results are filtered out.
+TEST_F(CompactionJobTest, FilterAllMergeOperands) {
+  merge_op_ = MergeOperators::CreateUInt64AddOperator();
+  compaction_filter_.reset(new test::FilterNumber(10U));
+  NewDB();
+
+  auto file1 =
+      mock::MakeMockFile({{KeyStr("a", 11U, kTypeMerge), test::EncodeInt(10U)},
+                          {KeyStr("a", 10U, kTypeMerge), test::EncodeInt(10U)},
+                          {KeyStr("a", 9U, kTypeMerge), test::EncodeInt(10U)}});
+  AddMockFile(file1);
+
+  auto file2 =
+      mock::MakeMockFile({{KeyStr("b", 8U, kTypeMerge), test::EncodeInt(10U)},
+                          {KeyStr("b", 7U, kTypeMerge), test::EncodeInt(10U)},
+                          {KeyStr("b", 6U, kTypeMerge), test::EncodeInt(10U)},
+                          {KeyStr("b", 5U, kTypeMerge), test::EncodeInt(10U)},
+                          {KeyStr("b", 4U, kTypeMerge), test::EncodeInt(10U)},
+                          {KeyStr("b", 3U, kTypeMerge), test::EncodeInt(10U)},
+                          {KeyStr("b", 2U, kTypeMerge), test::EncodeInt(10U)},
+                          {KeyStr("c", 2U, kTypeMerge), test::EncodeInt(10U)},
+                          {KeyStr("c", 1U, kTypeMerge), test::EncodeInt(10U)}});
+  AddMockFile(file2);
+
+  auto file3 =
+      mock::MakeMockFile({{KeyStr("a", 2U, kTypeMerge), test::EncodeInt(10U)},
+                          {KeyStr("b", 1U, kTypeMerge), test::EncodeInt(10U)}});
+  AddMockFile(file3, 2);
+
+  SetLastSequence(11U);
+  auto files = cfd_->current()->storage_info()->LevelFiles(0);
+
+  stl_wrappers::KVMap empty_map;
+  RunCompaction({files}, empty_map);
+}
+
+TEST_F(CompactionJobTest, SimpleSingleDelete) {
+  NewDB();
+
+  auto file1 = mock::MakeMockFile({
+      {KeyStr("a", 5U, kTypeDeletion), ""},
+      {KeyStr("b", 6U, kTypeSingleDeletion), ""},
+  });
+  AddMockFile(file1);
+
+  auto file2 = mock::MakeMockFile({{KeyStr("a", 3U, kTypeValue), "val"},
+                                   {KeyStr("b", 4U, kTypeValue), "val"}});
+  AddMockFile(file2);
+
+  auto file3 = mock::MakeMockFile({
+      {KeyStr("a", 1U, kTypeValue), "val"},
+  });
+  AddMockFile(file3, 2);
+
+  auto expected_results =
+      mock::MakeMockFile({{KeyStr("a", 5U, kTypeDeletion), ""}});
+
+  SetLastSequence(6U);
+  auto files = cfd_->current()->storage_info()->LevelFiles(0);
+  RunCompaction({files}, expected_results);
+}
+
+TEST_F(CompactionJobTest, SingleDeleteSnapshots) {
+  NewDB();
+
+  auto file1 = mock::MakeMockFile({{KeyStr("A", 12U, kTypeSingleDeletion), ""},
+                                   {KeyStr("a", 12U, kTypeSingleDeletion), ""},
+                                   {KeyStr("b", 21U, kTypeSingleDeletion), ""},
+                                   {KeyStr("c", 22U, kTypeSingleDeletion), ""},
+                                   {KeyStr("d", 9U, kTypeSingleDeletion), ""}});
+  AddMockFile(file1);
+
+  auto file2 = mock::MakeMockFile({{KeyStr("0", 2U, kTypeSingleDeletion), ""},
+                                   {KeyStr("a", 11U, kTypeValue), "val1"},
+                                   {KeyStr("b", 11U, kTypeValue), "val2"},
+                                   {KeyStr("c", 21U, kTypeValue), "val3"},
+                                   {KeyStr("d", 8U, kTypeValue), "val4"},
+                                   {KeyStr("e", 2U, kTypeSingleDeletion), ""}});
+  AddMockFile(file2);
+
+  auto file3 = mock::MakeMockFile({{KeyStr("A", 1U, kTypeValue), "val"},
+                                   {KeyStr("e", 1U, kTypeValue), "val"}});
+  AddMockFile(file3, 2);
+
+  auto expected_results =
+      mock::MakeMockFile({{KeyStr("A", 12U, kTypeSingleDeletion), ""},
+                          {KeyStr("b", 21U, kTypeSingleDeletion), ""},
+                          {KeyStr("b", 11U, kTypeValue), "val2"},
+                          {KeyStr("e", 2U, kTypeSingleDeletion), ""}});
+
+  SetLastSequence(22U);
+  auto files = cfd_->current()->storage_info()->LevelFiles(0);
+  RunCompaction({files}, expected_results, {10U, 20U});
+}
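The third argument to RunCompaction() above ({10U, 20U}) is the list of live
snapshot sequence numbers; versions visible to a snapshot must survive the
compaction, which is why "b" keeps both its single deletion and its value.
A hedged client-side sketch (assumes an already opened db handle):

    #include <string>
    #include "rocksdb/db.h"

    void SnapshotSketch(rocksdb::DB* db) {
      // A snapshot pins a sequence number; compaction must preserve the
      // newest version of each key at or below that sequence number.
      const rocksdb::Snapshot* snap = db->GetSnapshot();
      rocksdb::ReadOptions read_opts;
      read_opts.snapshot = snap;  // reads as of the pinned sequence number
      std::string value;
      rocksdb::Status s = db->Get(read_opts, "b", &value);
      db->ReleaseSnapshot(snap);
    }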
+
+TEST_F(CompactionJobTest, SingleDeleteZeroSeq) {
+  NewDB();
+
+  auto file1 = mock::MakeMockFile({
+      {KeyStr("A", 10U, kTypeSingleDeletion), ""},
+      {KeyStr("dummy", 5U, kTypeValue), "val2"},
+  });
+  AddMockFile(file1);
+
+  auto file2 = mock::MakeMockFile({
+      {KeyStr("A", 0U, kTypeValue), "val"},
+  });
+  AddMockFile(file2);
+
+  auto expected_results = mock::MakeMockFile({
+      {KeyStr("dummy", 0U, kTypeValue), "val2"},
+  });
+
+  SetLastSequence(22U);
+  auto files = cfd_->current()->storage_info()->LevelFiles(0);
+  RunCompaction({files}, expected_results, {});
+}
+
+TEST_F(CompactionJobTest, MultiSingleDelete) {
+  // Tests four scenarios involving multiple single delete/put pairs:
+  //
+  // A: Put Snapshot SDel Put SDel -> Put Snapshot SDel
+  // B: Put SDel Put SDel -> (Removed)
+  // C: SDel Put SDel Snapshot Put -> Snapshot Put
+  // D: (Put) SDel Snapshot Put SDel -> (Put) SDel Snapshot
+  NewDB();
+
+  auto file1 = mock::MakeMockFile({
+      {KeyStr("A", 14U, kTypeSingleDeletion), ""},
+      {KeyStr("A", 13U, kTypeValue), "val5"},
+      {KeyStr("A", 12U, kTypeSingleDeletion), ""},
+      {KeyStr("B", 14U, kTypeSingleDeletion), ""},
+      {KeyStr("B", 13U, kTypeValue), "val2"},
+      {KeyStr("C", 14U, kTypeValue), "val3"},
+      {KeyStr("D", 12U, kTypeSingleDeletion), ""},
+      {KeyStr("D", 11U, kTypeValue), "val4"},
+  });
+  AddMockFile(file1);
+
+  auto file2 = mock::MakeMockFile({
+      {KeyStr("A", 10U, kTypeValue), "val"},
+      {KeyStr("B", 12U, kTypeSingleDeletion), ""},
+      {KeyStr("B", 11U, kTypeValue), "val2"},
+      {KeyStr("C", 10U, kTypeSingleDeletion), ""},
+      {KeyStr("C", 9U, kTypeValue), "val6"},
+      {KeyStr("C", 8U, kTypeSingleDeletion), ""},
+      {KeyStr("D", 10U, kTypeSingleDeletion), ""},
+  });
+  AddMockFile(file2);
+
+  auto file3 = mock::MakeMockFile({
+      {KeyStr("D", 11U, kTypeValue), "val"},
+  });
+  AddMockFile(file3, 2);
+
+  auto expected_results = mock::MakeMockFile({
+      {KeyStr("A", 12U, kTypeSingleDeletion), ""},
+      {KeyStr("A", 10U, kTypeValue), "val"},
+      {KeyStr("C", 14U, kTypeValue), "val3"},
+      {KeyStr("D", 10U, kTypeSingleDeletion), ""},
+  });
+
+  SetLastSequence(22U);
+  auto files = cfd_->current()->storage_info()->LevelFiles(0);
+  RunCompaction({files}, expected_results, {10U});
+}
+
+// This test documents the behavior where a corrupt key follows a deletion or a
+// single deletion and the (single) deletion gets removed while the corrupt key
+// gets written out. TODO(noetzli): We probably want a better way to treat
+// corrupt keys.
+TEST_F(CompactionJobTest, CorruptionAfterDeletion) {
+  NewDB();
+
+  auto file1 =
+      mock::MakeMockFile({{test::KeyStr("A", 6U, kTypeValue), "val3"},
+                          {test::KeyStr("a", 5U, kTypeDeletion), ""},
+                          {test::KeyStr("a", 4U, kTypeValue, true), "val"}});
+  AddMockFile(file1);
+
+  auto file2 =
+      mock::MakeMockFile({{test::KeyStr("b", 3U, kTypeSingleDeletion), ""},
+                          {test::KeyStr("b", 2U, kTypeValue, true), "val"},
+                          {test::KeyStr("c", 1U, kTypeValue), "val2"}});
+  AddMockFile(file2);
+
+  auto expected_results =
+      mock::MakeMockFile({{test::KeyStr("A", 0U, kTypeValue), "val3"},
+                          {test::KeyStr("a", 0U, kTypeValue, true), "val"},
+                          {test::KeyStr("b", 0U, kTypeValue, true), "val"},
+                          {test::KeyStr("c", 0U, kTypeValue), "val2"}});
 
-  CompactionInputFiles compaction_input_files;
-  compaction_input_files.level = 0;
-  compaction_input_files.files.push_back(files[0]);
-  compaction_input_files.files.push_back(files[1]);
-  std::unique_ptr<Compaction> compaction(new Compaction(
-      cfd->current()->storage_info(), *cfd->GetLatestMutableCFOptions(),
-      {compaction_input_files}, 1, 1024 * 1024, 10, 0, kNoCompression, {}));
-  compaction->SetInputVersion(cfd->current());
-
-  int yield_callback_called = 0;
-  std::function<uint64_t()> yield_callback = [&]() {
-    yield_callback_called++;
-    return 0;
-  };
-  LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, db_options_.info_log.get());
-  mutex_.Lock();
-  EventLogger event_logger(db_options_.info_log.get());
-  CompactionJob compaction_job(0, compaction.get(), db_options_, env_options_,
-                               versions_.get(), &shutting_down_, &log_buffer,
-                               nullptr, nullptr, nullptr, {}, table_cache_,
-                               std::move(yield_callback), &event_logger, false);
-
-  compaction_job.Prepare();
-  mutex_.Unlock();
-  ASSERT_OK(compaction_job.Run());
-  mutex_.Lock();
-  Status s;
-  compaction_job.Install(&s, *cfd->GetLatestMutableCFOptions(), &mutex_);
-  ASSERT_OK(s);
-  mutex_.Unlock();
-
-  mock_table_factory_->AssertLatestFile(expected_results);
-  ASSERT_EQ(yield_callback_called, 20000);
+  SetLastSequence(6U);
+  auto files = cfd_->current()->storage_info()->LevelFiles(0);
+  RunCompaction({files}, expected_results);
 }
 
 }  // namespace rocksdb
diff --git a/src/rocksdb/db/compaction_picker.cc b/src/rocksdb/db/compaction_picker.cc
index 70e4814..2793508 100644
--- a/src/rocksdb/db/compaction_picker.cc
+++ b/src/rocksdb/db/compaction_picker.cc
@@ -15,6 +15,7 @@
 
 #include <inttypes.h>
 #include <limits>
+#include <queue>
 #include <string>
 #include <utility>
 
@@ -37,6 +38,69 @@ uint64_t TotalCompensatedFileSize(const std::vector<FileMetaData*>& files) {
   return sum;
 }
 
+// Universal compaction is not supported in ROCKSDB_LITE
+#ifndef ROCKSDB_LITE
+
+// Used in universal compaction when trivial move is enabled.
+// This structure is used for the construction of a min-heap
+// that contains the file metadata, the level of the file,
+// and the index of the file within that level.
+
+struct InputFileInfo {
+  InputFileInfo() : f(nullptr) {}
+
+  FileMetaData* f;
+  size_t level;
+  size_t index;
+};
+
+// Used in universal compaction when trivial move is enabled.
+// This comparator is used for the construction of a min-heap
+// ordered by the smallest key of each file.
+struct UserKeyComparator {
+  explicit UserKeyComparator(const Comparator* ucmp) { ucmp_ = ucmp; }
+
+  bool operator()(InputFileInfo i1, InputFileInfo i2) const {
+    return (ucmp_->Compare(i1.f->smallest.user_key(),
+                           i2.f->smallest.user_key()) > 0);
+  }
+
+ private:
+  const Comparator* ucmp_;
+};
+
+typedef std::priority_queue<InputFileInfo, std::vector<InputFileInfo>,
+                            UserKeyComparator> SmallestKeyHeap;
+
+// This function creates the heap that is used to determine whether the
+// input files overlap during universal compaction when allow_trivial_move
+// is set.
+SmallestKeyHeap create_level_heap(Compaction* c, const Comparator* ucmp) {
+  SmallestKeyHeap smallest_key_priority_q =
+      SmallestKeyHeap(UserKeyComparator(ucmp));
+
+  InputFileInfo input_file;
+
+  for (size_t l = 0; l < c->num_input_levels(); l++) {
+    if (c->num_input_files(l) != 0) {
+      if (l == 0 && c->start_level() == 0) {
+        for (size_t i = 0; i < c->num_input_files(0); i++) {
+          input_file.f = c->input(0, i);
+          input_file.level = 0;
+          input_file.index = i;
+          smallest_key_priority_q.push(std::move(input_file));
+        }
+      } else {
+        input_file.f = c->input(l, 0);
+        input_file.level = l;
+        input_file.index = 0;
+        smallest_key_priority_q.push(std::move(input_file));
+      }
+    }
+  }
+  return smallest_key_priority_q;
+}
+#endif  // !ROCKSDB_LITE
 }  // anonymous namespace
 
 // Determine compression type, based on user options, level of the output
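To make the heap direction concrete: std::priority_queue is a max-heap by
default, so returning Compare(...) > 0 from the comparator above inverts it
into a min-heap on the smallest user key. A standalone sketch of the same
inversion with plain strings (illustrative, not part of the patch):

    #include <iostream>
    #include <queue>
    #include <string>
    #include <vector>

    int main() {
      // std::greater<> inverts the default max-heap into a min-heap,
      // mirroring the "Compare(...) > 0" comparator used above.
      std::priority_queue<std::string, std::vector<std::string>,
                          std::greater<std::string>> min_heap;
      for (const auto& k : {"m", "a", "z", "c"}) min_heap.push(k);
      while (!min_heap.empty()) {
        std::cout << min_heap.top() << "\n";  // prints a, c, m, z
        min_heap.pop();
      }
      return 0;
    }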
@@ -197,10 +261,10 @@ Compaction* CompactionPicker::FormCompaction(
           mutable_cf_options.MaxGrandParentOverlapBytes(output_level + 1) :
           std::numeric_limits<uint64_t>::max();
   assert(input_files.size());
-  return new Compaction(vstorage, mutable_cf_options, input_files, output_level,
-                        compact_options.output_file_size_limit,
-                        max_grandparent_overlap_bytes, output_path_id,
-                        compact_options.compression, /* grandparents */ {});
+  return new Compaction(
+      vstorage, mutable_cf_options, input_files, output_level,
+      compact_options.output_file_size_limit, max_grandparent_overlap_bytes,
+      output_path_id, compact_options.compression, /* grandparents */ {}, true);
 }
 
 Status CompactionPicker::GetCompactionInputsFromFileNumbers(
@@ -342,8 +406,9 @@ bool CompactionPicker::SetupOtherInputs(
       if (expanded1.size() == output_level_inputs->size() &&
           !FilesInCompaction(expanded1)) {
         Log(InfoLogLevel::INFO_LEVEL, ioptions_.info_log,
-            "[%s] Expanding@%d %zu+%zu (%" PRIu64 "+%" PRIu64
-            " bytes) to %zu+%zu (%" PRIu64 "+%" PRIu64 "bytes)\n",
+            "[%s] Expanding@%d %" ROCKSDB_PRIszt "+%" ROCKSDB_PRIszt "(%" PRIu64
+            "+%" PRIu64 " bytes) to %" ROCKSDB_PRIszt "+%" ROCKSDB_PRIszt
+            " (%" PRIu64 "+%" PRIu64 "bytes)\n",
             cf_name.c_str(), input_level, inputs->size(),
             output_level_inputs->size(), inputs0_size, inputs1_size,
             expanded0.size(), expanded1.size(), expanded0_size, inputs1_size);
@@ -645,7 +710,12 @@ Status CompactionPicker::SanitizeCompactionInputFilesForAllLevels(
     aggregated_file_meta.largestkey = largestkey;
 
     // For all lower levels, include all overlapping files.
-    for (int m = l + 1; m <= output_level; ++m) {
+    // We need to add overlapping files from the current level too, because
+    // even if there are no input_files in level l, we would still need to add
+    // files which overlap with the range containing the input_files in levels
+    // 0 to l. Level 0 doesn't need to be handled this way because its files
+    // are sorted by time and not by key.
+    for (int m = std::max(l, 1); m <= output_level; ++m) {
       for (auto& next_lv_file : levels[m].files) {
         if (HaveOverlappingKeyRanges(
             comparator, aggregated_file_meta, next_lv_file)) {
@@ -958,7 +1028,7 @@ bool LevelCompactionPicker::PickCompactionBySize(VersionStorageInfo* vstorage,
 
   // Pick the largest file in this level that is not already
   // being compacted
-  const std::vector<int>& file_size = vstorage->FilesBySize(level);
+  const std::vector<int>& file_size = vstorage->FilesByCompactionPri(level);
   const std::vector<FileMetaData*>& level_files = vstorage->LevelFiles(level);
 
   // record the first file that is not yet compacted
@@ -969,11 +1039,6 @@ bool LevelCompactionPicker::PickCompactionBySize(VersionStorageInfo* vstorage,
     int index = file_size[i];
     auto* f = level_files[index];
 
-    assert((i == file_size.size() - 1) ||
-           (i >= VersionStorageInfo::kNumberFilesToSort - 1) ||
-           (f->compensated_file_size >=
-            level_files[file_size[i + 1]]->compensated_file_size));
-
     // do not pick a file to compact if it is being compacted
     // from n-1 level.
     if (f->being_compacted) {
@@ -1049,7 +1114,7 @@ void UniversalCompactionPicker::SortedRun::DumpSizeInfo(
 
 std::vector<UniversalCompactionPicker::SortedRun>
 UniversalCompactionPicker::CalculateSortedRuns(
-    const VersionStorageInfo& vstorage) {
+    const VersionStorageInfo& vstorage, const ImmutableCFOptions& ioptions) {
   std::vector<UniversalCompactionPicker::SortedRun> ret;
   for (FileMetaData* f : vstorage.LevelFiles(0)) {
     ret.emplace_back(0, f, f->fd.GetFileSize(), f->compensated_file_size,
@@ -1063,10 +1128,18 @@ UniversalCompactionPicker::CalculateSortedRuns(
     for (FileMetaData* f : vstorage.LevelFiles(level)) {
       total_compensated_size += f->compensated_file_size;
       total_size += f->fd.GetFileSize();
-      // Compaction always includes all files for a non-zero level, so for a
-      // non-zero level, all the files should share the same being_compacted
-      // value.
-      assert(is_first || f->being_compacted == being_compacted);
+      if (ioptions.compaction_options_universal.allow_trivial_move == true) {
+        if (f->being_compacted) {
+          being_compacted = f->being_compacted;
+        }
+      } else {
+        // Compaction always includes all files for a non-zero level, so for a
+        // non-zero level, all the files should share the same being_compacted
+        // value.
+        // This assumption is only valid when
+        // ioptions.compaction_options_universal.allow_trivial_move is false
+        assert(is_first || f->being_compacted == being_compacted);
+      }
       if (is_first) {
         being_compacted = f->being_compacted;
         is_first = false;
@@ -1106,6 +1179,50 @@ void GetSmallestLargestSeqno(const std::vector<FileMetaData*>& files,
 }  // namespace
 #endif
 
+// Checks whether any of the input files overlap.
+// Returns true if the input files are non-overlapping.
+bool CompactionPicker::IsInputNonOverlapping(Compaction* c) {
+  auto comparator = icmp_->user_comparator();
+  int first_iter = 1;
+
+  InputFileInfo prev, curr, next;
+
+  SmallestKeyHeap smallest_key_priority_q =
+      create_level_heap(c, icmp_->user_comparator());
+
+  while (!smallest_key_priority_q.empty()) {
+    curr = smallest_key_priority_q.top();
+    smallest_key_priority_q.pop();
+
+    if (first_iter) {
+      prev = curr;
+      first_iter = 0;
+    } else {
+      if (comparator->Compare(prev.f->largest.user_key(),
+                              curr.f->smallest.user_key()) >= 0) {
+        // found overlapping files, return false
+        return false;
+      }
+      assert(comparator->Compare(curr.f->largest.user_key(),
+                                 prev.f->largest.user_key()) > 0);
+      prev = curr;
+    }
+
+    next.f = nullptr;
+
+    if (curr.level != 0 && curr.index < c->num_input_files(curr.level) - 1) {
+      next.f = c->input(curr.level, curr.index + 1);
+      next.level = curr.level;
+      next.index = curr.index + 1;
+    }
+
+    if (next.f) {
+      smallest_key_priority_q.push(std::move(next));
+    }
+  }
+  return true;
+}
+
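The loop above is a classic sweep over files ordered by smallest key: if the
previous file's largest key reaches the current file's smallest key, the
inputs overlap. A reduced sketch over integer intervals (hypothetical helper,
not part of the patch):

    #include <algorithm>
    #include <utility>
    #include <vector>

    // Returns true if no two [first, second] intervals overlap, using the
    // same prev.largest vs curr.smallest sweep as IsInputNonOverlapping().
    bool NonOverlapping(std::vector<std::pair<int, int>> files) {
      std::sort(files.begin(), files.end());  // order by smallest key
      for (size_t i = 1; i < files.size(); ++i) {
        if (files[i - 1].second >= files[i].first) return false;
      }
      return true;
    }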
 // Universal style of compaction. Pick files that are contiguous in
 // time-range to compact.
 //
@@ -1114,7 +1231,8 @@ Compaction* UniversalCompactionPicker::PickCompaction(
     VersionStorageInfo* vstorage, LogBuffer* log_buffer) {
   const int kLevel0 = 0;
   double score = vstorage->CompactionScore(kLevel0);
-  std::vector<SortedRun> sorted_runs = CalculateSortedRuns(*vstorage);
+  std::vector<SortedRun> sorted_runs =
+      CalculateSortedRuns(*vstorage, ioptions_);
 
   if (sorted_runs.size() <
       (unsigned int)mutable_cf_options.level0_file_num_compaction_trigger) {
@@ -1122,7 +1240,8 @@ Compaction* UniversalCompactionPicker::PickCompaction(
     return nullptr;
   }
   VersionStorageInfo::LevelSummaryStorage tmp;
-  LogToBuffer(log_buffer, 3072, "[%s] Universal: sorted runs files(%zu): %s\n",
+  LogToBuffer(log_buffer, 3072,
+              "[%s] Universal: sorted runs files(%" ROCKSDB_PRIszt "): %s\n",
               cf_name.c_str(), sorted_runs.size(),
               vstorage->LevelSummary(&tmp));
 
@@ -1168,6 +1287,10 @@ Compaction* UniversalCompactionPicker::PickCompaction(
     return nullptr;
   }
 
+  if (ioptions_.compaction_options_universal.allow_trivial_move == true) {
+    c->set_is_trivial_move(IsInputNonOverlapping(c));
+  }
+
 // validate that all the chosen files of L0 are non overlapping in time
 #ifndef NDEBUG
   SequenceNumber prev_smallest_seqno = 0U;
@@ -1194,7 +1317,13 @@ Compaction* UniversalCompactionPicker::PickCompaction(
                               &largest_seqno);
       if (is_first) {
         is_first = false;
-      } else {
+      } else if (prev_smallest_seqno > 0) {
+        // A level is considered as the bottommost level if there are
+        // no files in higher levels or if files in higher levels do
+        // not overlap with the files being compacted. Sequence numbers
+        // of files in bottommost level can be set to 0 to help
+        // compression. As a result, the following assert may not hold
+        // if the prev_smallest_seqno is 0.
         assert(prev_smallest_seqno > largest_seqno);
       }
       prev_smallest_seqno = smallest_seqno;
diff --git a/src/rocksdb/db/compaction_picker.h b/src/rocksdb/db/compaction_picker.h
index 4034101..e7d8bf6 100644
--- a/src/rocksdb/db/compaction_picker.h
+++ b/src/rocksdb/db/compaction_picker.h
@@ -8,22 +8,20 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
 #pragma once
-#include <vector>
+
 #include <memory>
 #include <set>
+#include <string>
 #include <unordered_set>
+#include <vector>
 
-#include "db/version_set.h"
 #include "db/compaction.h"
-#include "rocksdb/status.h"
-#include "rocksdb/options.h"
+#include "db/version_set.h"
 #include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
 #include "util/mutable_cf_options.h"
 
-#include <vector>
-#include <memory>
-#include <set>
-#include <string>
 
 namespace rocksdb {
 
@@ -90,7 +88,8 @@ class CompactionPicker {
   // Returns true if any one of the specified files are being compacted
   bool FilesInCompaction(const std::vector<FileMetaData*>& files);
 
-  // Takes a list of CompactionInputFiles and returns a Compaction object.
+  // Takes a list of CompactionInputFiles and returns a (manual) Compaction
+  // object.
   Compaction* FormCompaction(
       const CompactionOptions& compact_options,
       const std::vector<CompactionInputFiles>& input_files, int output_level,
@@ -105,6 +104,17 @@ class CompactionPicker {
       const VersionStorageInfo* vstorage,
       const CompactionOptions& compact_options) const;
 
+  // Used in universal compaction when the allow_trivial_move
+  // option is set. Checks whether there are any overlapping files
+  // in the input. Returns true if the input files are
+  // non-overlapping.
+  bool IsInputNonOverlapping(Compaction* c);
+
+  // Returns true if a compaction involving level 0 is currently in progress.
+  bool IsLevel0CompactionInProgress() const {
+    return !level0_compactions_in_progress_.empty();
+  }
+
  protected:
   int NumberLevels() const { return ioptions_.num_levels; }
 
@@ -269,7 +279,7 @@ class UniversalCompactionPicker : public CompactionPicker {
       const std::vector<SortedRun>& sorted_runs, LogBuffer* log_buffer);
 
   static std::vector<SortedRun> CalculateSortedRuns(
-      const VersionStorageInfo& vstorage);
+      const VersionStorageInfo& vstorage, const ImmutableCFOptions& ioptions);
 
   // Pick a path ID to place a newly generated file, with its estimated file
   // size.
diff --git a/src/rocksdb/db/compaction_picker_test.cc b/src/rocksdb/db/compaction_picker_test.cc
index 9efd951..ef86058 100644
--- a/src/rocksdb/db/compaction_picker_test.cc
+++ b/src/rocksdb/db/compaction_picker_test.cc
@@ -3,9 +3,12 @@
 //  LICENSE file in the root directory of this source tree. An additional grant
 //  of patent rights can be found in the PATENTS file in the same directory.
 
+#include "db/compaction.h"
 #include "db/compaction_picker.h"
 #include <limits>
 #include <string>
+#include <utility>
+
 #include "util/logging.h"
 #include "util/string_util.h"
 #include "util/testharness.h"
@@ -35,6 +38,11 @@ class CompactionPickerTest : public testing::Test {
   CompactionOptionsFIFO fifo_options_;
   std::unique_ptr<VersionStorageInfo> vstorage_;
   std::vector<std::unique_ptr<FileMetaData>> files_;
+  // does not own FileMetaData
+  std::unordered_map<uint32_t, std::pair<FileMetaData*, int>> file_map_;
+  // Input files to the compaction process.
+  std::vector<CompactionInputFiles> input_files_;
+  int compaction_level_start_;
 
   CompactionPickerTest()
       : ucmp_(BytewiseComparator()),
@@ -66,6 +74,8 @@ class CompactionPickerTest : public testing::Test {
   void DeleteVersionStorage() {
     vstorage_.reset();
     files_.clear();
+    file_map_.clear();
+    input_files_.clear();
   }
 
   void Add(int level, uint32_t file_number, const char* smallest,
@@ -77,19 +87,40 @@ class CompactionPickerTest : public testing::Test {
     f->fd = FileDescriptor(file_number, path_id, file_size);
     f->smallest = InternalKey(smallest, smallest_seq, kTypeValue);
     f->largest = InternalKey(largest, largest_seq, kTypeValue);
+    f->smallest_seqno = smallest_seq;
+    f->largest_seqno = largest_seq;
     f->compensated_file_size = file_size;
     f->refs = 0;
     vstorage_->AddFile(level, f);
     files_.emplace_back(f);
+    file_map_.insert({file_number, {f, level}});
+  }
+
+  void SetCompactionInputFilesLevels(int level_count, int start_level) {
+    input_files_.resize(level_count);
+    for (int i = 0; i < level_count; ++i) {
+      input_files_[i].level = start_level + i;
+    }
+    compaction_level_start_ = start_level;
+  }
+
+  void AddToCompactionFiles(uint32_t file_number) {
+    auto iter = file_map_.find(file_number);
+    assert(iter != file_map_.end());
+    int level = iter->second.second;
+    assert(level < vstorage_->num_levels());
+    input_files_[level - compaction_level_start_].files.emplace_back(
+        iter->second.first);
   }
 
   void UpdateVersionStorageInfo() {
     vstorage_->CalculateBaseBytes(ioptions_, mutable_cf_options_);
-    vstorage_->UpdateFilesBySize();
+    vstorage_->UpdateFilesByCompactionPri(mutable_cf_options_);
     vstorage_->UpdateNumNonEmptyLevels();
     vstorage_->GenerateFileIndexer();
     vstorage_->GenerateLevelFilesBrief();
     vstorage_->ComputeCompactionScore(mutable_cf_options_, fifo_options_);
+    vstorage_->GenerateLevel0NonOverlapping();
     vstorage_->SetFinalized();
   }
 };
@@ -343,6 +374,8 @@ TEST_F(CompactionPickerTest, LevelTriggerDynamic4) {
   ASSERT_EQ(num_levels - 1, compaction->output_level());
 }
 
+// Universal and FIFO Compactions are not supported in ROCKSDB_LITE
+#ifndef ROCKSDB_LITE
 TEST_F(CompactionPickerTest, NeedsCompactionUniversal) {
   NewVersionStorage(1, kCompactionStyleUniversal);
   UniversalCompactionPicker universal_compaction_picker(
@@ -364,6 +397,64 @@ TEST_F(CompactionPickerTest, NeedsCompactionUniversal) {
               vstorage_->CompactionScore(0) >= 1);
   }
 }
+// Tests whether the files can be trivially moved in multi-level
+// universal compaction when the allow_trivial_move option is set.
+// In this test the input files overlap, so they cannot
+// be trivially moved.
+
+TEST_F(CompactionPickerTest, CannotTrivialMoveUniversal) {
+  const uint64_t kFileSize = 100000;
+
+  ioptions_.compaction_options_universal.allow_trivial_move = true;
+  NewVersionStorage(1, kCompactionStyleUniversal);
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+  // must return false when there are no files.
+  ASSERT_EQ(universal_compaction_picker.NeedsCompaction(vstorage_.get()),
+            false);
+
+  NewVersionStorage(3, kCompactionStyleUniversal);
+
+  Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+  Add(0, 2U, "201", "250", kFileSize, 0, 401, 450);
+  Add(0, 4U, "260", "300", kFileSize, 0, 260, 300);
+  Add(1, 5U, "100", "151", kFileSize, 0, 200, 251);
+  Add(1, 3U, "301", "350", kFileSize, 0, 101, 150);
+  Add(2, 6U, "120", "200", kFileSize, 0, 20, 100);
+
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(
+      universal_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+
+  ASSERT_TRUE(!compaction->is_trivial_move());
+}
+// Tests whether the files can be trivially moved in multi-level
+// universal compaction when the allow_trivial_move option is set.
+// In this test the input files don't overlap, so they should
+// be trivially moved.
+TEST_F(CompactionPickerTest, AllowsTrivialMoveUniversal) {
+  const uint64_t kFileSize = 100000;
+
+  ioptions_.compaction_options_universal.allow_trivial_move = true;
+  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+  NewVersionStorage(3, kCompactionStyleUniversal);
+
+  Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+  Add(0, 2U, "201", "250", kFileSize, 0, 401, 450);
+  Add(0, 4U, "260", "300", kFileSize, 0, 260, 300);
+  Add(1, 5U, "010", "080", kFileSize, 0, 200, 251);
+  Add(2, 3U, "301", "350", kFileSize, 0, 101, 150);
+
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(
+      universal_compaction_picker.PickCompaction(
+          cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+
+  ASSERT_TRUE(compaction->is_trivial_move());
+}
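Both tests drive the new behavior through ioptions_; in application code the
same knob is set on the universal compaction options, roughly as sketched
below (the DB path is illustrative):

    #include "rocksdb/db.h"
    #include "rocksdb/options.h"

    rocksdb::Status OpenUniversalDb(rocksdb::DB** db) {
      rocksdb::Options options;
      options.create_if_missing = true;
      options.compaction_style = rocksdb::kCompactionStyleUniversal;
      // Let non-overlapping input files be moved to the output level
      // instead of being rewritten.
      options.compaction_options_universal.allow_trivial_move = true;
      return rocksdb::DB::Open(options, "/tmp/universal_demo", db);
    }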
 
 TEST_F(CompactionPickerTest, NeedsCompactionFIFO) {
   NewVersionStorage(1, kCompactionStyleFIFO);
@@ -394,6 +485,7 @@ TEST_F(CompactionPickerTest, NeedsCompactionFIFO) {
               vstorage_->CompactionScore(0) >= 1);
   }
 }
+#endif  // ROCKSDB_LITE
 
 // This test exhibits the bug where we don't properly reset parent_index in
 // PickCompaction()
@@ -418,6 +510,319 @@ TEST_F(CompactionPickerTest, ParentIndexResetBug) {
       cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
 }
 
+// This test checks ExpandWhileOverlapping() by having overlapping user key
+// ranges (with different sequence numbers) in the input files.
+TEST_F(CompactionPickerTest, OverlappingUserKeys) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  Add(1, 1U, "100", "150", 1U);
+  // Overlapping user keys
+  Add(1, 2U, "200", "400", 1U);
+  Add(1, 3U, "400", "500", 1000000000U, 0, 0);
+  Add(2, 4U, "600", "700", 1U);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+              cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(1U, compaction->num_input_levels());
+  ASSERT_EQ(2U, compaction->num_input_files(0));
+  ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(3U, compaction->input(0, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys2) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  // Overlapping user keys on same level and output level
+  Add(1, 1U, "200", "400", 1000000000U);
+  Add(1, 2U, "400", "500", 1U, 0, 0);
+  Add(2, 3U, "400", "600", 1U);
+  // The following file is not in the compaction despite overlapping user keys
+  Add(2, 4U, "600", "700", 1U, 0, 0);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+              cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(2U, compaction->num_input_levels());
+  ASSERT_EQ(2U, compaction->num_input_files(0));
+  ASSERT_EQ(1U, compaction->num_input_files(1));
+  ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+  ASSERT_EQ(3U, compaction->input(1, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys3) {
+  NewVersionStorage(6, kCompactionStyleLevel);
+  // Chain of overlapping user key ranges (forces ExpandWhileOverlapping() to
+  // expand multiple times)
+  Add(1, 1U, "100", "150", 1U);
+  Add(1, 2U, "150", "200", 1U, 0, 0);
+  Add(1, 3U, "200", "250", 1000000000U, 0, 0);
+  Add(1, 4U, "250", "300", 1U, 0, 0);
+  Add(1, 5U, "300", "350", 1U, 0, 0);
+  // Output level overlaps with the beginning and the end of the chain
+  Add(2, 6U, "050", "100", 1U);
+  Add(2, 7U, "350", "400", 1U);
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+              cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(2U, compaction->num_input_levels());
+  ASSERT_EQ(5U, compaction->num_input_files(0));
+  ASSERT_EQ(2U, compaction->num_input_files(1));
+  ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+  ASSERT_EQ(3U, compaction->input(0, 2)->fd.GetNumber());
+  ASSERT_EQ(4U, compaction->input(0, 3)->fd.GetNumber());
+  ASSERT_EQ(5U, compaction->input(0, 4)->fd.GetNumber());
+  ASSERT_EQ(6U, compaction->input(1, 0)->fd.GetNumber());
+  ASSERT_EQ(7U, compaction->input(1, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, EstimateCompactionBytesNeeded1) {
+  int num_levels = ioptions_.num_levels;
+  ioptions_.level_compaction_dynamic_level_bytes = false;
+  mutable_cf_options_.level0_file_num_compaction_trigger = 3;
+  mutable_cf_options_.max_bytes_for_level_base = 1000;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+  NewVersionStorage(num_levels, kCompactionStyleLevel);
+  Add(0, 1U, "150", "200", 200);
+  Add(0, 2U, "150", "200", 200);
+  Add(0, 3U, "150", "200", 200);
+  // Level 1 is over target by 200
+  Add(1, 4U, "400", "500", 600);
+  Add(1, 5U, "600", "700", 600);
+  // Level 2 is less than its target of 10000 even after adding the size of level 1
+  Add(2, 6U, "150", "200", 2500);
+  Add(2, 7U, "201", "210", 2000);
+  Add(2, 8U, "300", "310", 2500);
+  Add(2, 9U, "400", "500", 2500);
+  // Level 3 exceeds its target of 100,000 by 1,000
+  Add(3, 10U, "400", "500", 101000);
+  // Level 4 exceeds its target of 1,000,000 by 500 after adding the size from level 3
+  Add(4, 11U, "400", "500", 999500);
+  Add(5, 11U, "400", "500", 8000000);
+
+  UpdateVersionStorageInfo();
+
+  ASSERT_EQ(2200u + 11000u + 5500u,
+            vstorage_->estimated_compaction_needed_bytes());
+}
+
+TEST_F(CompactionPickerTest, EstimateCompactionBytesNeeded2) {
+  int num_levels = ioptions_.num_levels;
+  ioptions_.level_compaction_dynamic_level_bytes = false;
+  mutable_cf_options_.level0_file_num_compaction_trigger = 3;
+  mutable_cf_options_.max_bytes_for_level_base = 1000;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+  NewVersionStorage(num_levels, kCompactionStyleLevel);
+  Add(0, 1U, "150", "200", 200);
+  Add(0, 2U, "150", "200", 200);
+  Add(0, 4U, "150", "200", 200);
+  Add(0, 5U, "150", "200", 200);
+  Add(0, 6U, "150", "200", 200);
+  // Level 1 will be over target once the level-0 files are compacted into it
+  Add(1, 7U, "400", "500", 200);
+  Add(1, 8U, "600", "700", 200);
+  // Level 2 is less than its target of 10000 even after adding the size of level 1
+  Add(2, 9U, "150", "200", 9500);
+  Add(3, 10U, "400", "500", 101000);
+
+  UpdateVersionStorageInfo();
+
+  ASSERT_EQ(1400u + 4400u + 11000u,
+            vstorage_->estimated_compaction_needed_bytes());
+}
+
+TEST_F(CompactionPickerTest, EstimateCompactionBytesNeededDynamicLevel) {
+  int num_levels = ioptions_.num_levels;
+  ioptions_.level_compaction_dynamic_level_bytes = true;
+  mutable_cf_options_.level0_file_num_compaction_trigger = 3;
+  mutable_cf_options_.max_bytes_for_level_base = 1000;
+  mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+  NewVersionStorage(num_levels, kCompactionStyleLevel);
+
+  // Set last level size to 50000;
+  // num_levels - 1 has target 5000;
+  // num_levels - 2 is the base level with target 500
+  Add(num_levels - 1, 10U, "400", "500", 50000);
+
+  Add(0, 1U, "150", "200", 200);
+  Add(0, 2U, "150", "200", 200);
+  Add(0, 4U, "150", "200", 200);
+  Add(0, 5U, "150", "200", 200);
+  Add(0, 6U, "150", "200", 200);
+  // num_levels - 3 is over target by 100 + 1000
+  Add(num_levels - 3, 7U, "400", "500", 300);
+  Add(num_levels - 3, 8U, "600", "700", 300);
+  // num_levels - 2 is over target by 1100 + 100
+  Add(num_levels - 2, 9U, "150", "200", 5100);
+
+  UpdateVersionStorageInfo();
+
+  ASSERT_EQ(1600u + 12100u + 13200u,
+            vstorage_->estimated_compaction_needed_bytes());
+}
+
+TEST_F(CompactionPickerTest, IsBottommostLevelTest) {
+  // case 1: Higher levels are empty
+  NewVersionStorage(6, kCompactionStyleLevel);
+  Add(0, 1U, "a", "m");
+  Add(0, 2U, "c", "z");
+  Add(1, 3U, "d", "e");
+  Add(1, 4U, "l", "p");
+  Add(2, 5U, "g", "i");
+  Add(2, 6U, "x", "z");
+  UpdateVersionStorageInfo();
+  SetCompactionInputFilesLevels(2, 1);
+  AddToCompactionFiles(3U);
+  AddToCompactionFiles(5U);
+  bool result =
+      Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+  ASSERT_TRUE(result);
+
+  // case 2: Higher levels have no overlap
+  NewVersionStorage(6, kCompactionStyleLevel);
+  Add(0, 1U, "a", "m");
+  Add(0, 2U, "c", "z");
+  Add(1, 3U, "d", "e");
+  Add(1, 4U, "l", "p");
+  Add(2, 5U, "g", "i");
+  Add(2, 6U, "x", "z");
+  Add(3, 7U, "k", "p");
+  Add(3, 8U, "t", "w");
+  Add(4, 9U, "a", "b");
+  Add(5, 10U, "c", "cc");
+  UpdateVersionStorageInfo();
+  SetCompactionInputFilesLevels(2, 1);
+  AddToCompactionFiles(3U);
+  AddToCompactionFiles(5U);
+  result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+  ASSERT_TRUE(result);
+
+  // case 3.1: Higher levels (level 3) have overlap
+  NewVersionStorage(6, kCompactionStyleLevel);
+  Add(0, 1U, "a", "m");
+  Add(0, 2U, "c", "z");
+  Add(1, 3U, "d", "e");
+  Add(1, 4U, "l", "p");
+  Add(2, 5U, "g", "i");
+  Add(2, 6U, "x", "z");
+  Add(3, 7U, "e", "g");
+  Add(3, 8U, "h", "k");
+  Add(4, 9U, "a", "b");
+  Add(5, 10U, "c", "cc");
+  UpdateVersionStorageInfo();
+  SetCompactionInputFilesLevels(2, 1);
+  AddToCompactionFiles(3U);
+  AddToCompactionFiles(5U);
+  result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+  ASSERT_FALSE(result);
+
+  // case 3.2: Higher levels (level 5) have overlap
+  DeleteVersionStorage();
+  NewVersionStorage(6, kCompactionStyleLevel);
+  Add(0, 1U, "a", "m");
+  Add(0, 2U, "c", "z");
+  Add(1, 3U, "d", "e");
+  Add(1, 4U, "l", "p");
+  Add(2, 5U, "g", "i");
+  Add(2, 6U, "x", "z");
+  Add(3, 7U, "j", "k");
+  Add(3, 8U, "l", "m");
+  Add(4, 9U, "a", "b");
+  Add(5, 10U, "c", "cc");
+  Add(5, 11U, "h", "k");
+  Add(5, 12U, "y", "yy");
+  Add(5, 13U, "z", "zz");
+  UpdateVersionStorageInfo();
+  SetCompactionInputFilesLevels(2, 1);
+  AddToCompactionFiles(3U);
+  AddToCompactionFiles(5U);
+  result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+  ASSERT_FALSE(result);
+
+  // case 3.3: Higher levels (level 5) have overlap, but it's only overlapping
+  // one key ("d")
+  NewVersionStorage(6, kCompactionStyleLevel);
+  Add(0, 1U, "a", "m");
+  Add(0, 2U, "c", "z");
+  Add(1, 3U, "d", "e");
+  Add(1, 4U, "l", "p");
+  Add(2, 5U, "g", "i");
+  Add(2, 6U, "x", "z");
+  Add(3, 7U, "j", "k");
+  Add(3, 8U, "l", "m");
+  Add(4, 9U, "a", "b");
+  Add(5, 10U, "c", "cc");
+  Add(5, 11U, "ccc", "d");
+  Add(5, 12U, "y", "yy");
+  Add(5, 13U, "z", "zz");
+  UpdateVersionStorageInfo();
+  SetCompactionInputFilesLevels(2, 1);
+  AddToCompactionFiles(3U);
+  AddToCompactionFiles(5U);
+  result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+  ASSERT_FALSE(result);
+
+  // Level 0 files overlap
+  NewVersionStorage(6, kCompactionStyleLevel);
+  Add(0, 1U, "s", "t");
+  Add(0, 2U, "a", "m");
+  Add(0, 3U, "b", "z");
+  Add(0, 4U, "e", "f");
+  Add(5, 10U, "y", "z");
+  UpdateVersionStorageInfo();
+  SetCompactionInputFilesLevels(1, 0);
+  AddToCompactionFiles(1U);
+  AddToCompactionFiles(2U);
+  AddToCompactionFiles(3U);
+  AddToCompactionFiles(4U);
+  result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+  ASSERT_FALSE(result);
+
+  // Level 0 files don't overlap
+  NewVersionStorage(6, kCompactionStyleLevel);
+  Add(0, 1U, "s", "t");
+  Add(0, 2U, "a", "m");
+  Add(0, 3U, "b", "k");
+  Add(0, 4U, "e", "f");
+  Add(5, 10U, "y", "z");
+  UpdateVersionStorageInfo();
+  SetCompactionInputFilesLevels(1, 0);
+  AddToCompactionFiles(1U);
+  AddToCompactionFiles(2U);
+  AddToCompactionFiles(3U);
+  AddToCompactionFiles(4U);
+  result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+  ASSERT_TRUE(result);
+
+  // Level 1 files overlap
+  NewVersionStorage(6, kCompactionStyleLevel);
+  Add(0, 1U, "s", "t");
+  Add(0, 2U, "a", "m");
+  Add(0, 3U, "b", "k");
+  Add(0, 4U, "e", "f");
+  Add(1, 5U, "a", "m");
+  Add(1, 6U, "n", "o");
+  Add(1, 7U, "w", "y");
+  Add(5, 10U, "y", "z");
+  UpdateVersionStorageInfo();
+  SetCompactionInputFilesLevels(2, 0);
+  AddToCompactionFiles(1U);
+  AddToCompactionFiles(2U);
+  AddToCompactionFiles(3U);
+  AddToCompactionFiles(4U);
+  AddToCompactionFiles(5U);
+  AddToCompactionFiles(6U);
+  AddToCompactionFiles(7U);
+  result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+  ASSERT_FALSE(result);
+
+  DeleteVersionStorage();
+}
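All of the cases above reduce to one rule: an output level is bottommost only
if no higher-numbered level holds a file overlapping the compaction's key
range. A simplified sketch of that rule, assuming key ranges are plain string
pairs (a hypothetical helper, not the real Compaction::TEST_IsBottommostLevel):

    #include <string>
    #include <vector>

    struct Range { std::string smallest, largest; };

    bool Overlap(const Range& a, const Range& b) {
      return a.smallest <= b.largest && b.smallest <= a.largest;
    }

    // levels[l] holds the file ranges of level l.
    bool IsBottommost(size_t output_level, const Range& compacted,
                      const std::vector<std::vector<Range>>& levels) {
      for (size_t l = output_level + 1; l < levels.size(); ++l) {
        for (const auto& r : levels[l]) {
          if (Overlap(r, compacted)) return false;
        }
      }
      return true;
    }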
+
 }  // namespace rocksdb
 
 int main(int argc, char** argv) {
diff --git a/src/rocksdb/db/comparator_db_test.cc b/src/rocksdb/db/comparator_db_test.cc
index 6013f75..cb944a7 100644
--- a/src/rocksdb/db/comparator_db_test.cc
+++ b/src/rocksdb/db/comparator_db_test.cc
@@ -10,6 +10,7 @@
 #include "rocksdb/db.h"
 #include "rocksdb/env.h"
 #include "util/hash.h"
+#include "util/stl_wrappers.h"
 #include "util/string_util.h"
 #include "util/testharness.h"
 #include "util/testutil.h"
@@ -22,18 +23,10 @@ namespace {
 
 static const Comparator* comparator;
 
-// A comparator for std::map, using comparator
-struct MapComparator {
-  bool operator()(const std::string& a, const std::string& b) const {
-    return comparator->Compare(a, b) < 0;
-  }
-};
-
-typedef std::map<std::string, std::string, MapComparator> KVMap;
-
 class KVIter : public Iterator {
  public:
-  explicit KVIter(const KVMap* map) : map_(map), iter_(map_->end()) {}
+  explicit KVIter(const stl_wrappers::KVMap* map)
+      : map_(map), iter_(map_->end()) {}
   virtual bool Valid() const override { return iter_ != map_->end(); }
   virtual void SeekToFirst() override { iter_ = map_->begin(); }
   virtual void SeekToLast() override {
@@ -60,8 +53,8 @@ class KVIter : public Iterator {
   virtual Status status() const override { return Status::OK(); }
 
  private:
-  const KVMap* const map_;
-  KVMap::const_iterator iter_;
+  const stl_wrappers::KVMap* const map_;
+  stl_wrappers::KVMap::const_iterator iter_;
 };
 
 void AssertItersEqual(Iterator* iter1, Iterator* iter2) {
@@ -77,7 +70,7 @@ void AssertItersEqual(Iterator* iter1, Iterator* iter2) {
 void DoRandomIteraratorTest(DB* db, std::vector<std::string> source_strings,
                             Random* rnd, int num_writes, int num_iter_ops,
                             int num_trigger_flush) {
-  KVMap map;
+  stl_wrappers::KVMap map((stl_wrappers::LessOfComparator(comparator)));
 
   for (int i = 0; i < num_writes; i++) {
     if (num_trigger_flush > 0 && i != 0 && i % num_trigger_flush == 0) {
@@ -177,8 +170,13 @@ class DoubleComparator : public Comparator {
   virtual const char* Name() const override { return "DoubleComparator"; }
 
   virtual int Compare(const Slice& a, const Slice& b) const override {
+#ifndef CYGWIN
     double da = std::stod(a.ToString());
     double db = std::stod(b.ToString());
+#else
+    double da = std::strtod(a.ToString().c_str(), 0 /* endptr */);
+    double db = std::strtod(b.ToString().c_str(), 0 /* endptr */);
+#endif
     if (da == db) {
       return a.compare(b);
     } else if (da > db) {
diff --git a/src/rocksdb/db/convenience.cc b/src/rocksdb/db/convenience.cc
new file mode 100644
index 0000000..17f7812
--- /dev/null
+++ b/src/rocksdb/db/convenience.cc
@@ -0,0 +1,23 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2012 Facebook.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/convenience.h"
+
+#include "db/db_impl.h"
+
+namespace rocksdb {
+
+void CancelAllBackgroundWork(DB* db, bool wait) {
+  (dynamic_cast<DBImpl*>(db))->CancelAllBackgroundWork(wait);
+}
+}  // namespace rocksdb
+
+#endif  // ROCKSDB_LITE
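CancelAllBackgroundWork() is typically called right before tearing a DB down
so that in-flight flushes and compactions do not race with destruction; a
hedged usage sketch:

    #include "rocksdb/convenience.h"
    #include "rocksdb/db.h"

    void ShutDown(rocksdb::DB* db) {
      // wait == true blocks until background flushes and compactions have
      // actually stopped, so deleting the handle afterwards is safe.
      rocksdb::CancelAllBackgroundWork(db, /* wait */ true);
      delete db;
    }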
diff --git a/src/rocksdb/db/corruption_test.cc b/src/rocksdb/db/corruption_test.cc
index b9a2461..81cff97 100644
--- a/src/rocksdb/db/corruption_test.cc
+++ b/src/rocksdb/db/corruption_test.cc
@@ -7,6 +7,8 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
+#ifndef ROCKSDB_LITE
+
 #include "rocksdb/db.h"
 
 #include <errno.h>
@@ -57,6 +59,11 @@ class CorruptionTest : public testing::Test {
      DestroyDB(dbname_, Options());
   }
 
+  void CloseDb() {
+    delete db_;
+    db_ = nullptr;
+  }
+
   Status TryReopen(Options* options = nullptr) {
     delete db_;
     db_ = nullptr;
@@ -80,10 +87,14 @@ class CorruptionTest : public testing::Test {
     ASSERT_OK(::rocksdb::RepairDB(dbname_, options_));
   }
 
-  void Build(int n) {
+  void Build(int n, int flush_every = 0) {
     std::string key_space, value_space;
     WriteBatch batch;
     for (int i = 0; i < n; i++) {
+      if (flush_every != 0 && i != 0 && i % flush_every == 0) {
+        DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+        dbi->TEST_FlushMemTable();
+      }
       //if ((i % 100) == 0) fprintf(stderr, "@ %d of %d\n", i, n);
       Slice key = Key(i, &key_space);
       batch.Clear();
@@ -229,6 +240,16 @@ class CorruptionTest : public testing::Test {
 TEST_F(CorruptionTest, Recovery) {
   Build(100);
   Check(100, 100);
+#ifdef OS_WIN
+  // On Windows the OS disk cache does not behave properly.
+  // We do not call FlushBuffers on every Flush. If we do not close
+  // the log file prior to the corruption, we end up with the first
+  // block left intact and only the second corrupted. However, under the
+  // debugger things work just fine but never pass when running normally.
+  // For that reason people may want to run with unbuffered I/O. That option
+  // is not available for the WAL though.
+  CloseDb();
+#endif
   Corrupt(kLogFile, 19, 1);      // WriteBatch tag for first record
   Corrupt(kLogFile, log::kBlockSize + 1000, 1);  // Somewhere in second block
   ASSERT_TRUE(!TryReopen().ok());
@@ -280,13 +301,21 @@ TEST_F(CorruptionTest, TableFile) {
 }
 
 TEST_F(CorruptionTest, TableFileIndexData) {
-  Build(10000);  // Enough to build multiple Tables
+  Options options;
+  // very big, we'll trigger flushes manually
+  options.write_buffer_size = 100 * 1024 * 1024;
+  Reopen(&options);
+  // build 2 tables, flush at 5000
+  Build(10000, 5000);
   DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
   dbi->TEST_FlushMemTable();
 
+  // corrupt the index block of one table file
   Corrupt(kTableFile, -2000, 500);
   Reopen();
-  Check(5000, 9999);
+  // One full file should be readable, since only one was corrupted;
+  // the other file should be fully unreadable, since its index was corrupted.
+  Check(5000, 5000);
 }
 
 TEST_F(CorruptionTest, MissingDescriptor) {
@@ -336,13 +365,13 @@ TEST_F(CorruptionTest, CorruptedDescriptor) {
 
 TEST_F(CorruptionTest, CompactionInputError) {
   Options options;
-  options.max_background_flushes = 0;
   Reopen(&options);
   Build(10);
   DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
   dbi->TEST_FlushMemTable();
-  const int last = dbi->MaxMemCompactionLevel();
-  ASSERT_EQ(1, Property("rocksdb.num-files-at-level" + NumberToString(last)));
+  dbi->TEST_CompactRange(0, nullptr, nullptr);
+  dbi->TEST_CompactRange(1, nullptr, nullptr);
+  ASSERT_EQ(1, Property("rocksdb.num-files-at-level2"));
 
   Corrupt(kTableFile, 100, 1);
   Check(9, 9);
@@ -357,18 +386,20 @@ TEST_F(CorruptionTest, CompactionInputErrorParanoid) {
   options.paranoid_checks = true;
   options.write_buffer_size = 131072;
   options.max_write_buffer_number = 2;
-  options.max_background_flushes = 0;
   Reopen(&options);
   DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
 
-  // Fill levels >= 1 so memtable flush outputs to level 0
+  // Fill levels >= 1
   for (int level = 1; level < dbi->NumberLevels(); level++) {
     dbi->Put(WriteOptions(), "", "begin");
     dbi->Put(WriteOptions(), "~", "end");
     dbi->TEST_FlushMemTable();
+    for (int comp_level = 0; comp_level < dbi->NumberLevels() - level;
+         ++comp_level) {
+      dbi->TEST_CompactRange(comp_level, nullptr, nullptr);
+    }
   }
 
-  options.max_mem_compaction_level = 0;
   Reopen(&options);
 
   dbi = reinterpret_cast<DBImpl*>(db_);
@@ -450,3 +481,13 @@ int main(int argc, char** argv) {
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
+
+#else
+#include <stdio.h>
+
+int main(int argc, char** argv) {
+  fprintf(stderr, "SKIPPED as RepairDB() is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/cuckoo_table_db_test.cc b/src/rocksdb/db/cuckoo_table_db_test.cc
index 8c2113b..09a68de 100644
--- a/src/rocksdb/db/cuckoo_table_db_test.cc
+++ b/src/rocksdb/db/cuckoo_table_db_test.cc
@@ -3,6 +3,8 @@
 //  LICENSE file in the root directory of this source tree. An additional grant
 //  of patent rights can be found in the PATENTS file in the same directory.
 
+#ifndef ROCKSDB_LITE
+
 #include "db/db_impl.h"
 #include "rocksdb/db.h"
 #include "rocksdb/env.h"
@@ -39,7 +41,6 @@ class CuckooTableDBTest : public testing::Test {
     options.memtable_factory.reset(NewHashLinkListRepFactory(4, 0, 3, true));
     options.allow_mmap_reads = true;
     options.create_if_missing = true;
-    options.max_mem_compaction_level = 0;
     return options;
   }
 
@@ -243,7 +244,8 @@ TEST_F(CuckooTableDBTest, CompactionIntoMultipleFiles) {
   dbfull()->TEST_WaitForFlushMemTable();
   ASSERT_EQ("1", FilesPerLevel());
 
-  dbfull()->TEST_CompactRange(0, nullptr, nullptr);
+  dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
+                              true /* disallow trivial move */);
   ASSERT_EQ("0,2", FilesPerLevel());
   for (int idx = 0; idx < 28; ++idx) {
     ASSERT_EQ(std::string(10000, 'a' + idx), Get(Key(idx)));
@@ -319,3 +321,13 @@ int main(int argc, char** argv) {
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
+
+#else
+#include <stdio.h>
+
+int main(int argc, char** argv) {
+  fprintf(stderr, "SKIPPED as Cuckoo table is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/db/db_bench.cc b/src/rocksdb/db/db_bench.cc
index e4fc1c4..1092297 100644
--- a/src/rocksdb/db/db_bench.cc
+++ b/src/rocksdb/db/db_bench.cc
@@ -24,7 +24,9 @@ int main() {
 #include <numaif.h>
 #endif
 
+#ifndef OS_WIN
 #include <unistd.h>
+#endif
 #include <fcntl.h>
 #include <inttypes.h>
 #include <cstddef>
@@ -32,6 +34,12 @@ int main() {
 #include <stdio.h>
 #include <stdlib.h>
 #include <gflags/gflags.h>
+
+#include <atomic>
+#include <condition_variable>
+#include <mutex>
+#include <thread>
+
 #include "db/db_impl.h"
 #include "db/version_set.h"
 #include "rocksdb/options.h"
@@ -46,6 +54,9 @@ int main() {
 #include "rocksdb/slice_transform.h"
 #include "rocksdb/perf_context.h"
 #include "rocksdb/utilities/flashcache.h"
+#include "rocksdb/utilities/transaction.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "rocksdb/utilities/optimistic_transaction_db.h"
 #include "port/port.h"
 #include "port/stack_trace.h"
 #include "util/crc32c.h"
@@ -60,6 +71,10 @@ int main() {
 #include "hdfs/env_hdfs.h"
 #include "utilities/merge_operators.h"
 
+#ifdef OS_WIN
+#include <io.h>  // open/close
+#endif
+
 using GFLAGS::ParseCommandLineFlags;
 using GFLAGS::RegisterFlagValidator;
 using GFLAGS::SetUsageMessage;
@@ -94,10 +109,12 @@ DEFINE_string(benchmarks,
               "compress,"
               "uncompress,"
               "acquireload,"
-              "fillseekseq,",
+              "fillseekseq,"
+              "randomtransaction,"
+              "randomreplacekeys",
 
-              "Comma-separated list of operations to run in the specified order"
-              "Actual benchmarks:\n"
+              "Comma-separated list of operations to run in the specified"
+              " order. Available benchmarks:\n"
               "\tfillseq       -- write N values in sequential key"
               " order in async mode\n"
               "\tfillrandom    -- write N values in random key order in async"
@@ -115,8 +132,6 @@ DEFINE_string(benchmarks,
               "\treadreverse   -- read N times in reverse order\n"
               "\treadrandom    -- read N times in random order\n"
               "\treadmissing   -- read N missing keys in random order\n"
-              "\treadhot       -- read N times in random order from 1% section "
-              "of DB\n"
               "\treadwhilewriting      -- 1 writer, N threads doing random "
               "reads\n"
               "\treadwhilemerging      -- 1 merger, N threads doing random "
@@ -145,6 +160,10 @@ DEFINE_string(benchmarks,
               "\tacquireload   -- load N*1000 times\n"
               "\tfillseekseq   -- write N values in sequential key, then read "
               "them by seeking to each key\n"
+              "\trandomtransaction     -- execute N random transactions and "
+              "verify correctness\n"
+              "\trandomreplacekeys     -- randomly replaces N keys by deleting "
+              "the old version and putting the new version\n\n"
               "Meta operations:\n"
               "\tcompact     -- Compact the entire DB\n"
               "\tstats       -- Print DB stats\n"
@@ -205,6 +224,15 @@ static bool ValidateKeySize(const char* flagname, int32_t value) {
   return true;
 }
 
+static bool ValidateUint32Range(const char* flagname, uint64_t value) {
+  if (value > std::numeric_limits<uint32_t>::max()) {
+    fprintf(stderr, "Invalid value for --%s: %lu, overflow\n", flagname,
+            (unsigned long)value);
+    return false;
+  }
+  return true;
+}
+
 DEFINE_int32(key_size, 16, "size of each key");
 
 DEFINE_int32(num_multi_db, 0,
@@ -215,7 +243,7 @@ DEFINE_double(compression_ratio, 0.5, "Arrange to generate values that shrink"
 
 DEFINE_double(read_random_exp_range, 0.0,
               "Read random's key will be generated using distribution of "
-              "num * exp(r) where r is uniform number from 0 to this value. "
+              "num * exp(-r) where r is uniform number from 0 to this value. "
               "The larger the number is, the more skewed the reads are. "
               "Only used in readrandom and multireadrandom benchmarks.");
 
@@ -251,11 +279,32 @@ DEFINE_int32(min_write_buffer_number_to_merge,
              " writing less data to storage if there are duplicate records "
              " in each of these individual write buffers.");
 
+DEFINE_int32(max_write_buffer_number_to_maintain,
+             rocksdb::Options().max_write_buffer_number_to_maintain,
+             "The total maximum number of write buffers to maintain in memory "
+             "including copies of buffers that have already been flushed. "
+             "Unlike max_write_buffer_number, this parameter does not affect "
+             "flushing. This controls the minimum amount of write history "
+             "that will be available in memory for conflict checking when "
+             "Transactions are used. If this value is too low, some "
+             "transactions may fail at commit time due to not being able to "
+             "determine whether there were any write conflicts. Setting this "
+             "value to 0 will cause write buffers to be freed immediately "
+             "after they are flushed.  If this value is set to -1, "
+             "'max_write_buffer_number' will be used.");
+
 DEFINE_int32(max_background_compactions,
              rocksdb::Options().max_background_compactions,
              "The maximum number of concurrent background compactions"
              " that can occur in parallel.");
 
+DEFINE_uint64(subcompactions, 1,
+              "Maximum number of subcompactions to divide L0-L1 compactions "
+              "into.");
+static const bool FLAGS_subcompactions_dummy
+    __attribute__((unused)) = RegisterFlagValidator(&FLAGS_subcompactions,
+                                                    &ValidateUint32Range);
+
 DEFINE_int32(max_background_flushes,
              rocksdb::Options().max_background_flushes,
              "The maximum number of concurrent background flushes"
@@ -265,6 +314,10 @@ static rocksdb::CompactionStyle FLAGS_compaction_style_e;
 DEFINE_int32(compaction_style, (int32_t) rocksdb::Options().compaction_style,
              "style of compaction: level-based vs universal");
 
+static rocksdb::CompactionPri FLAGS_compaction_pri_e;
+DEFINE_int32(compaction_pri, (int32_t)rocksdb::Options().compaction_pri,
+             "priority of files to compact: by size or by data age");
+
 DEFINE_int32(universal_size_ratio, 0,
              "Percentage flexibility while comparing file size"
              " (for universal compaction only).");
@@ -282,9 +335,15 @@ DEFINE_int32(universal_compression_size_percent, -1,
              "The percentage of the database to compress for universal "
              "compaction. -1 means compress everything.");
 
+DEFINE_bool(universal_allow_trivial_move, false,
+            "Allow trivial move in universal compaction.");
+
 DEFINE_int64(cache_size, -1, "Number of bytes to use as a cache of uncompressed"
              "data. Negative means use default settings.");
 
+DEFINE_bool(cache_index_and_filter_blocks, false,
+            "Cache index/filter blocks in block cache.");
+
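This flag maps onto BlockBasedTableOptions; a sketch of the wiring (assumes
the block-based table factory, which db_bench uses by default):

    #include "rocksdb/options.h"
    #include "rocksdb/table.h"

    rocksdb::Options BuildOptions(bool cache_index_and_filter) {
      rocksdb::BlockBasedTableOptions table_options;
      // Charge index and filter blocks to the block cache instead of
      // keeping them pinned on the heap.
      table_options.cache_index_and_filter_blocks = cache_index_and_filter;
      rocksdb::Options options;
      options.table_factory.reset(
          rocksdb::NewBlockBasedTableFactory(table_options));
      return options;
    }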
 DEFINE_int32(block_size,
              static_cast<int32_t>(rocksdb::BlockBasedTableOptions().block_size),
              "Number of bytes in a block.");
@@ -297,10 +356,23 @@ DEFINE_int32(block_restart_interval,
 DEFINE_int64(compressed_cache_size, -1,
              "Number of bytes to use as a cache of compressed data.");
 
+DEFINE_int64(row_cache_size, 0,
+             "Number of bytes to use as a cache of individual rows"
+             " (0 = disabled).");
+
 DEFINE_int32(open_files, rocksdb::Options().max_open_files,
              "Maximum number of files to keep open at the same time"
              " (use default if == 0)");
 
+DEFINE_int32(file_opening_threads, rocksdb::Options().max_file_opening_threads,
+             "If open_files is set to -1, this option set the number of "
+             "threads that will be used to open files during DB::Open()");
+
+DEFINE_int32(new_table_reader_for_compaction_inputs, true,
+             "If true, uses a separate file handle for compaction inputs");
+
+DEFINE_int32(compaction_readahead_size, 0, "Compaction readahead size");
+
 DEFINE_int32(bloom_bits, -1, "Bloom filter bits per key. Negative means"
              " use default settings.");
 DEFINE_int32(memtable_bloom_bits, 0, "Bloom filter bits per key for memtable. "
@@ -310,6 +382,11 @@ DEFINE_bool(use_existing_db, false, "If true, do not destroy the existing"
             " database.  If you set this flag and also specify a benchmark that"
             " wants a fresh database, that benchmark will fail.");
 
+DEFINE_bool(show_table_properties, false,
+            "If true, then per-level table"
+            " properties will be printed on every stats-interval when"
+            " stats_interval is set and stats_per_interval is on.");
+
 DEFINE_string(db, "", "Use the db with the following name.");
 
 static bool ValidateCacheNumshardbits(const char* flagname, int32_t value) {
@@ -367,13 +444,19 @@ static std::vector<int> FLAGS_max_bytes_for_level_multiplier_additional_v;
 DEFINE_string(max_bytes_for_level_multiplier_additional, "",
               "A vector that specifies additional fanout per level");
 
-DEFINE_int32(level0_stop_writes_trigger, 12, "Number of files in level-0"
+DEFINE_int32(level0_stop_writes_trigger,
+             rocksdb::Options().level0_stop_writes_trigger,
+             "Number of files in level-0"
              " that will trigger put stop.");
 
-DEFINE_int32(level0_slowdown_writes_trigger, 8, "Number of files in level-0"
+DEFINE_int32(level0_slowdown_writes_trigger,
+             rocksdb::Options().level0_slowdown_writes_trigger,
+             "Number of files in level-0"
              " that will slow down writes.");
 
-DEFINE_int32(level0_file_num_compaction_trigger, 4, "Number of files in level-0"
+DEFINE_int32(level0_file_num_compaction_trigger,
+             rocksdb::Options().level0_file_num_compaction_trigger,
+             "Number of files in level-0"
              " when compactions start");
 
 static bool ValidateInt32Percent(const char* flagname, int32_t value) {
@@ -403,6 +486,33 @@ DEFINE_int32(deletepercent, 2, "Percentage of deletes out of reads/writes/"
 DEFINE_uint64(delete_obsolete_files_period_micros, 0,
               "Ignored. Left here for backward compatibility");
 
+DEFINE_bool(optimistic_transaction_db, false,
+            "Open a OptimisticTransactionDB instance. "
+            "Required for randomtransaction benchmark.");
+
+DEFINE_bool(transaction_db, false,
+            "Open a TransactionDB instance. "
+            "Required for randomtransaction benchmark.");
+
+DEFINE_uint64(transaction_sets, 2,
+              "Number of keys each transaction will "
+              "modify (use in RandomTransaction only).  Max: 9999");
+
+DEFINE_bool(transaction_set_snapshot, false,
+            "Setting to true will have each transaction call SetSnapshot()"
+            " upon creation.");
+
+DEFINE_int32(transaction_sleep, 0,
+             "Max microseconds to sleep in between "
+             "reading and writing a value (used in RandomTransaction only). ");
+
+DEFINE_uint64(transaction_lock_timeout, 100,
+              "If using a transaction_db, specifies the lock wait timeout in"
+              " milliseconds before failing a transaction waiting on a lock");
+
+DEFINE_bool(compaction_measure_io_stats, false,
+            "Measure times spents on I/Os while in compactions. ");
+
 namespace {
 enum rocksdb::CompressionType StringToCompressionType(const char* ctype) {
   assert(ctype);
@@ -419,6 +529,8 @@ enum rocksdb::CompressionType StringToCompressionType(const char* ctype) {
     return rocksdb::kLZ4Compression;
   else if (!strcasecmp(ctype, "lz4hc"))
     return rocksdb::kLZ4HCCompression;
+  else if (!strcasecmp(ctype, "zstd"))
+    return rocksdb::kZSTDNotFinalCompression;
 
   fprintf(stdout, "Cannot parse compression type '%s'\n", ctype);
   return rocksdb::kSnappyCompression; //default value
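
A minimal illustration of the extended parser above (hypothetical driver code,
assuming <cassert>; not part of the patch):

    assert(StringToCompressionType("zstd") == rocksdb::kZSTDNotFinalCompression);
    // Unknown names fall back to snappy after a message on stdout:
    assert(StringToCompressionType("frobnicate") == rocksdb::kSnappyCompression);
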
@@ -485,6 +597,14 @@ DEFINE_int64(stats_interval_seconds, 0, "Report stats every N seconds. This "
 DEFINE_int32(stats_per_interval, 0, "Reports additional stats per interval when"
              " this is greater than 0.");
 
+DEFINE_int64(report_interval_seconds, 0,
+             "If greater than zero, it will write simple stats in CVS format "
+             "to --report_file every N seconds");
+
+DEFINE_string(report_file, "report.csv",
+              "Filename where some simple stats are reported to (if "
+              "--report_interval_seconds is bigger than 0)");
+
 DEFINE_int32(thread_status_per_interval, 0,
              "Takes and report a snapshot of the current status of each thread"
              " when this is greater than 0.");
@@ -492,7 +612,7 @@ DEFINE_int32(thread_status_per_interval, 0,
 DEFINE_int32(perf_level, 0, "Level of perf collection");
 
 static bool ValidateRateLimit(const char* flagname, double value) {
-  static constexpr double EPSILON = 1e-10;
+  const double EPSILON = 1e-10;
   if ( value < -EPSILON ) {
     fprintf(stderr, "Invalid value for --%s: %12.6f, must be >= 0.0\n",
             flagname, value);
@@ -502,9 +622,14 @@ static bool ValidateRateLimit(const char* flagname, double value) {
 }
 DEFINE_double(soft_rate_limit, 0.0, "");
 
-DEFINE_double(hard_rate_limit, 0.0, "When not equal to 0 this make threads "
-              "sleep at each stats reporting interval until the compaction"
-              " score for all levels is less than or equal to this value.");
+DEFINE_double(hard_rate_limit, 0.0, "DEPRECATED");
+
+DEFINE_uint64(hard_pending_compaction_bytes_limit, 128u * 1024 * 1024 * 1024,
+              "Stop writes if pending compaction bytes exceed this number");
+
+DEFINE_uint64(delayed_write_rate, 2097152u,
+              "Limited bytes allowed to DB when soft_rate_limit or "
+              "level0_slowdown_writes_trigger triggers");
 
 DEFINE_int32(rate_limit_delay_max_milliseconds, 1000,
              "When hard_rate_limit is set then this is the max time a put will"
@@ -512,6 +637,10 @@ DEFINE_int32(rate_limit_delay_max_milliseconds, 1000,
 
 DEFINE_uint64(rate_limiter_bytes_per_sec, 0, "Set options.rate_limiter value.");
 
+DEFINE_uint64(
+    benchmark_write_rate_limit, 0,
+    "If non-zero, db_bench will rate-limit the writes going into RocksDB");
+
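
For context, a minimal sketch of the rate-limiter API this flag feeds; the
factory and Request() call are the same public rocksdb ones used further down
in this patch, and the 1 MB/s figure is an arbitrary placeholder:

    #include "rocksdb/rate_limiter.h"

    std::shared_ptr<rocksdb::RateLimiter> limiter(
        rocksdb::NewGenericRateLimiter(1 << 20));   // budget: 1 MB/s
    limiter->Request(4096, rocksdb::Env::IO_HIGH);  // blocks until bytes granted
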
 DEFINE_int32(max_grandparent_overlap_factor, 10, "Control maximum bytes of "
              "overlaps in grandparent (i.e., level+2) before we stop building a"
              " single file in a level->level+1 compaction.");
@@ -570,6 +699,13 @@ DEFINE_uint64(wal_bytes_per_sync,  rocksdb::Options().wal_bytes_per_sync,
 DEFINE_bool(filter_deletes, false, "If true, a delete first checks the bloom"
             " filter and is dropped if the key is not present");
 
+DEFINE_bool(use_single_deletes, true,
+            "Use single deletes (used in RandomReplaceKeys only).");
+
+DEFINE_double(stddev, 2000.0,
+              "Standard deviation of normal distribution used for picking keys"
+              " (used in RandomReplaceKeys only).");
+
 DEFINE_int32(max_successive_merges, 0, "Maximum number of successive merge"
              " operations on a key in the memtable");
 
@@ -781,6 +917,7 @@ class ReportFileOpEnv : public EnvWrapper {
         return rv;
       }
 
+      Status Truncate(uint64_t size) override { return target_->Truncate(size); }
       Status Close() override { return target_->Close(); }
       Status Flush() override { return target_->Flush(); }
       Status Sync() override { return target_->Sync(); }
@@ -846,6 +983,7 @@ static void AppendWithSpace(std::string* str, Slice msg) {
 struct DBWithColumnFamilies {
   std::vector<ColumnFamilyHandle*> cfh;
   DB* db;
+  OptimisticTransactionDB* opt_txn_db;
   std::atomic<size_t> num_created;  // Need to be updated after all the
                                     // new entries in cfh are set.
   size_t num_hot;  // Number of column families to be queried at each moment.
@@ -853,7 +991,7 @@ struct DBWithColumnFamilies {
                    // Column families will be created and used to be queried.
   port::Mutex create_cf_mutex;  // Only one thread can execute CreateNewCf()
 
-  DBWithColumnFamilies() : db(nullptr) {
+  DBWithColumnFamilies() : db(nullptr), opt_txn_db(nullptr) {
     cfh.clear();
     num_created = 0;
     num_hot = 0;
@@ -862,9 +1000,23 @@ struct DBWithColumnFamilies {
   DBWithColumnFamilies(const DBWithColumnFamilies& other)
       : cfh(other.cfh),
         db(other.db),
+        opt_txn_db(other.opt_txn_db),
         num_created(other.num_created.load()),
         num_hot(other.num_hot) {}
 
+  void DeleteDBs() {
+    std::for_each(cfh.begin(), cfh.end(),
+                  [](ColumnFamilyHandle* cfhi) { delete cfhi; });
+    cfh.clear();
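+    // Note: when opt_txn_db is set, db points at the transaction DB's base
+    // DB (see OpenDb() below), which the wrapper owns; deleting the wrapper
+    // releases both, so deleting db here as well would be a double free.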
+    if (opt_txn_db) {
+      delete opt_txn_db;
+      opt_txn_db = nullptr;
+    } else {
+      delete db;
+    }
+    db = nullptr;
+  }
+
   ColumnFamilyHandle* GetCfh(int64_t rand_num) {
     assert(num_hot > 0);
     return cfh[num_created.load(std::memory_order_acquire) - num_hot +
@@ -894,6 +1046,96 @@ struct DBWithColumnFamilies {
   }
 };
 
+// A class that reports stats to a CSV file.
+class ReporterAgent {
+ public:
+  ReporterAgent(Env* env, const std::string& fname,
+                uint64_t report_interval_secs)
+      : env_(env),
+        total_ops_done_(0),
+        last_report_(0),
+        report_interval_secs_(report_interval_secs),
+        stop_(false) {
+    auto s = env_->NewWritableFile(fname, &report_file_, EnvOptions());
+    if (s.ok()) {
+      s = report_file_->Append(Header() + "\n");
+    }
+    if (s.ok()) {
+      s = report_file_->Flush();
+    }
+    if (!s.ok()) {
+      fprintf(stderr, "Can't open %s: %s\n", fname.c_str(),
+              s.ToString().c_str());
+      abort();
+    }
+
+    reporting_thread_ = std::thread([&]() { SleepAndReport(); });
+  }
+
+  ~ReporterAgent() {
+    {
+      std::unique_lock<std::mutex> lk(mutex_);
+      stop_ = true;
+      stop_cv_.notify_all();
+    }
+    reporting_thread_.join();
+  }
+
+  // thread safe
+  void ReportFinishedOps(int64_t num_ops) {
+    total_ops_done_.fetch_add(num_ops);
+  }
+
+ private:
+  std::string Header() const { return "secs_elapsed,interval_qps"; }
+  void SleepAndReport() {
+    uint64_t kMicrosInSecond = 1000 * 1000;
+    auto time_started = env_->NowMicros();
+    while (true) {
+      {
+        std::unique_lock<std::mutex> lk(mutex_);
+        if (stop_ ||
+            stop_cv_.wait_for(lk, std::chrono::seconds(report_interval_secs_),
+                              [&]() { return stop_; })) {
+          // stopping
+          break;
+        }
+        // else -> timeout, which means time for a report!
+      }
+      auto total_ops_done_snapshot = total_ops_done_.load();
+      // round the seconds elapsed
+      auto secs_elapsed =
+          (env_->NowMicros() - time_started + kMicrosInSecond / 2) /
+          kMicrosInSecond;
+      std::string report = ToString(secs_elapsed) + "," +
+                           ToString(total_ops_done_snapshot - last_report_) +
+                           "\n";
+      auto s = report_file_->Append(report);
+      if (s.ok()) {
+        s = report_file_->Flush();
+      }
+      if (!s.ok()) {
+        fprintf(stderr,
+                "Can't write to report file (%s), stopping the reporting\n",
+                s.ToString().c_str());
+        break;
+      }
+      last_report_ = total_ops_done_snapshot;
+    }
+  }
+
+  Env* env_;
+  std::unique_ptr<WritableFile> report_file_;
+  std::atomic<int64_t> total_ops_done_;
+  int64_t last_report_;
+  const uint64_t report_interval_secs_;
+  std::thread reporting_thread_;
+  std::mutex mutex_;
+  // will notify on stop
+  std::condition_variable stop_cv_;
+  bool stop_;
+};
+
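
A minimal sketch of how the ReporterAgent above is driven; this mirrors the
wiring added to RunBenchmark() later in this patch, with placeholder literals:

    rocksdb::Env* env = rocksdb::Env::Default();
    ReporterAgent agent(env, "report.csv", 1);  // CSV header: secs_elapsed,interval_qps
    agent.ReportFinishedOps(1);  // thread safe; call once per completed operation
    // Destroying the agent signals and joins its background reporting thread.
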
 class Stats {
  private:
   int id_;
@@ -909,10 +1151,15 @@ class Stats {
   HistogramImpl hist_;
   std::string message_;
   bool exclude_from_merge_;
+  ReporterAgent* reporter_agent_;  // does not own
 
  public:
   Stats() { Start(-1); }
 
+  void SetReporterAgent(ReporterAgent* reporter_agent) {
+    reporter_agent_ = reporter_agent;
+  }
+
   void Start(int id) {
     id_ = id;
     next_report_ = FLAGS_stats_interval ? FLAGS_stats_interval : 100;
@@ -988,6 +1235,9 @@ class Stats {
   }
 
   void FinishedOps(DBWithColumnFamilies* db_with_cfh, DB* db, int64_t num_ops) {
+    if (reporter_agent_) {
+      reporter_agent_->ReportFinishedOps(num_ops);
+    }
     if (FLAGS_histogram) {
       double now = FLAGS_env->NowMicros();
       double micros = now - last_op_finish_;
@@ -1044,10 +1294,37 @@ class Stats {
                 if (db->GetProperty(db_with_cfh->cfh[i], "rocksdb.cfstats",
                                     &stats))
                   fprintf(stderr, "%s\n", stats.c_str());
+                if (FLAGS_show_table_properties) {
+                  for (int level = 0; level < FLAGS_num_levels; ++level) {
+                    if (db->GetProperty(
+                            db_with_cfh->cfh[i],
+                            "rocksdb.aggregated-table-properties-at-level" +
+                                ToString(level),
+                            &stats)) {
+                      if (stats.find("# entries=0") == std::string::npos) {
+                        fprintf(stderr, "Level[%d]: %s\n", level,
+                                stats.c_str());
+                      }
+                    }
+                  }
+                }
+              }
+            } else if (db) {
+              if (db->GetProperty("rocksdb.stats", &stats)) {
+                fprintf(stderr, "%s\n", stats.c_str());
+              }
+              if (FLAGS_show_table_properties) {
+                for (int level = 0; level < FLAGS_num_levels; ++level) {
+                  if (db->GetProperty(
+                          "rocksdb.aggregated-table-properties-at-level" +
+                              ToString(level),
+                          &stats)) {
+                    if (stats.find("# entries=0") == std::string::npos) {
+                      fprintf(stderr, "Level[%d]: %s\n", level, stats.c_str());
+                    }
+                  }
+                }
               }
-
-            } else if (db && db->GetProperty("rocksdb.stats", &stats)) {
-              fprintf(stderr, "%s\n", stats.c_str());
             }
           }
 
@@ -1120,6 +1397,7 @@ struct SharedState {
   port::CondVar cv;
   int total;
   int perf_level;
+  std::shared_ptr<RateLimiter> write_rate_limiter;
 
   // Each thread goes through the following states:
   //    (1) initializing
@@ -1216,6 +1494,39 @@ class Benchmark {
     return true;
   }
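+  // Note: CompressSlice() consolidates the per-codec switch that was
+  // previously duplicated in PrintWarnings(), Compress() and Uncompress()
+  // (see the removed copies later in this patch) and adds the zstd branch.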
 
+  inline bool CompressSlice(const Slice& input, std::string* compressed) {
+    bool ok = true;
+    switch (FLAGS_compression_type_e) {
+      case rocksdb::kSnappyCompression:
+        ok = Snappy_Compress(Options().compression_opts, input.data(),
+                             input.size(), compressed);
+        break;
+      case rocksdb::kZlibCompression:
+        ok = Zlib_Compress(Options().compression_opts, 2, input.data(),
+                           input.size(), compressed);
+        break;
+      case rocksdb::kBZip2Compression:
+        ok = BZip2_Compress(Options().compression_opts, 2, input.data(),
+                            input.size(), compressed);
+        break;
+      case rocksdb::kLZ4Compression:
+        ok = LZ4_Compress(Options().compression_opts, 2, input.data(),
+                          input.size(), compressed);
+        break;
+      case rocksdb::kLZ4HCCompression:
+        ok = LZ4HC_Compress(Options().compression_opts, 2, input.data(),
+                            input.size(), compressed);
+        break;
+      case rocksdb::kZSTDNotFinalCompression:
+        ok = ZSTD_Compress(Options().compression_opts, input.data(),
+                           input.size(), compressed);
+        break;
+      default:
+        ok = false;
+    }
+    return ok;
+  }
+
   void PrintHeader() {
     PrintEnvironment();
     fprintf(stdout, "Keys:       %d bytes each\n", FLAGS_key_size);
@@ -1232,7 +1543,7 @@ class Benchmark {
             (((FLAGS_key_size + FLAGS_value_size * FLAGS_compression_ratio)
               * num_)
              / 1048576.0));
-    fprintf(stdout, "Write rate limit: %d\n", FLAGS_writes_per_second);
+    fprintf(stdout, "Writes per second: %d\n", FLAGS_writes_per_second);
     if (FLAGS_enable_numa) {
       fprintf(stderr, "Running in NUMA enabled mode.\n");
 #ifndef NUMA
@@ -1245,26 +1556,10 @@ class Benchmark {
       }
 #endif
     }
-    switch (FLAGS_compression_type_e) {
-      case rocksdb::kNoCompression:
-        fprintf(stdout, "Compression: none\n");
-        break;
-      case rocksdb::kSnappyCompression:
-        fprintf(stdout, "Compression: snappy\n");
-        break;
-      case rocksdb::kZlibCompression:
-        fprintf(stdout, "Compression: zlib\n");
-        break;
-      case rocksdb::kBZip2Compression:
-        fprintf(stdout, "Compression: bzip2\n");
-        break;
-      case rocksdb::kLZ4Compression:
-        fprintf(stdout, "Compression: lz4\n");
-        break;
-      case rocksdb::kLZ4HCCompression:
-        fprintf(stdout, "Compression: lz4hc\n");
-        break;
-    }
+
+    const char* compression =
+        CompressionTypeToString(FLAGS_compression_type_e).c_str();
+    fprintf(stdout, "Compression: %s\n", compression);
 
     switch (FLAGS_rep_factory) {
       case kPrefixHash:
@@ -1285,11 +1580,11 @@ class Benchmark {
     }
     fprintf(stdout, "Perf Level: %d\n", FLAGS_perf_level);
 
-    PrintWarnings();
+    PrintWarnings(compression);
     fprintf(stdout, "------------------------------------------------\n");
   }
 
-  void PrintWarnings() {
+  void PrintWarnings(const char* compression) {
 #if defined(__GNUC__) && !defined(__OPTIMIZE__)
     fprintf(stdout,
             "WARNING: Optimization is disabled: benchmarks unnecessarily slow\n"
@@ -1302,51 +1597,17 @@ class Benchmark {
     if (FLAGS_compression_type_e != rocksdb::kNoCompression) {
       // The test string should not be too small.
       const int len = FLAGS_block_size;
-      char* text = (char*) malloc(len+1);
-      bool result = true;
-      const char* name = nullptr;
+      std::string input_str(len, 'y');
       std::string compressed;
-
-      memset(text, (int) 'y', len);
-      text[len] = '\0';
-      switch (FLAGS_compression_type_e) {
-        case kSnappyCompression:
-          result = Snappy_Compress(Options().compression_opts, text,
-                                   strlen(text), &compressed);
-          name = "Snappy";
-          break;
-        case kZlibCompression:
-          result = Zlib_Compress(Options().compression_opts, 2, text,
-                                 strlen(text), &compressed);
-          name = "Zlib";
-          break;
-        case kBZip2Compression:
-          result = BZip2_Compress(Options().compression_opts, 2, text,
-                                  strlen(text), &compressed);
-          name = "BZip2";
-          break;
-        case kLZ4Compression:
-          result = LZ4_Compress(Options().compression_opts, 2, text,
-                                strlen(text), &compressed);
-          name = "LZ4";
-          break;
-        case kLZ4HCCompression:
-          result = LZ4HC_Compress(Options().compression_opts, 2, text,
-                                  strlen(text), &compressed);
-          name = "LZ4HC";
-          break;
-        case kNoCompression:
-          assert(false); // cannot happen
-          break;
-      }
+      bool result = CompressSlice(Slice(input_str), &compressed);
 
       if (!result) {
-        fprintf(stdout, "WARNING: %s compression is not enabled\n", name);
-      } else if (name && compressed.size() >= strlen(text)) {
-        fprintf(stdout, "WARNING: %s compression is not effective\n", name);
+        fprintf(stdout, "WARNING: %s compression is not enabled\n",
+                compression);
+      } else if (compressed.size() >= input_str.size()) {
+        fprintf(stdout, "WARNING: %s compression is not effective\n",
+                compression);
       }
-
-      free(text);
     }
   }
 
@@ -1467,9 +1728,7 @@ class Benchmark {
   }
 
   ~Benchmark() {
-    std::for_each(db_.cfh.begin(), db_.cfh.end(),
-                  [](ColumnFamilyHandle* cfh) { delete cfh; });
-    delete db_.db;
+    db_.DeleteDBs();
     delete prefix_extractor_;
     if (cache_.get() != nullptr) {
       // this will leak, but we're shutting down so nobody cares
@@ -1546,18 +1805,9 @@ class Benchmark {
     }
     PrintHeader();
     Open(&open_options_);
-    const char* benchmarks = FLAGS_benchmarks.c_str();
-    while (benchmarks != nullptr) {
-      const char* sep = strchr(benchmarks, ',');
-      Slice name;
-      if (sep == nullptr) {
-        name = benchmarks;
-        benchmarks = nullptr;
-      } else {
-        name = Slice(benchmarks, sep - benchmarks);
-        benchmarks = sep + 1;
-      }
-
+    std::stringstream benchmark_stream(FLAGS_benchmarks);
+    std::string name;
+    while (std::getline(benchmark_stream, name, ',')) {
       // Sanitize parameters
       num_ = FLAGS_num;
       reads_ = (FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads);
@@ -1573,146 +1823,148 @@ class Benchmark {
       write_options_.disableWAL = FLAGS_disable_wal;
 
       void (Benchmark::*method)(ThreadState*) = nullptr;
+      void (Benchmark::*post_process_method)() = nullptr;
+
       bool fresh_db = false;
       int num_threads = FLAGS_threads;
 
-      if (name == Slice("fillseq")) {
+      if (name == "fillseq") {
         fresh_db = true;
         method = &Benchmark::WriteSeq;
-      } else if (name == Slice("fillbatch")) {
+      } else if (name == "fillbatch") {
         fresh_db = true;
         entries_per_batch_ = 1000;
         method = &Benchmark::WriteSeq;
-      } else if (name == Slice("fillrandom")) {
+      } else if (name == "fillrandom") {
         fresh_db = true;
         method = &Benchmark::WriteRandom;
-      } else if (name == Slice("filluniquerandom")) {
+      } else if (name == "filluniquerandom") {
         fresh_db = true;
         if (num_threads > 1) {
-          fprintf(stderr, "filluniquerandom multithreaded not supported"
-                           ", use 1 thread");
+          fprintf(stderr,
+                  "filluniquerandom multithreaded not supported"
+                  ", use 1 thread");
           num_threads = 1;
         }
         method = &Benchmark::WriteUniqueRandom;
-      } else if (name == Slice("overwrite")) {
-        fresh_db = false;
+      } else if (name == "overwrite") {
         method = &Benchmark::WriteRandom;
-      } else if (name == Slice("fillsync")) {
+      } else if (name == "fillsync") {
         fresh_db = true;
         num_ /= 1000;
         write_options_.sync = true;
         method = &Benchmark::WriteRandom;
-      } else if (name == Slice("fill100K")) {
+      } else if (name == "fill100K") {
         fresh_db = true;
         num_ /= 1000;
         value_size_ = 100 * 1000;
         method = &Benchmark::WriteRandom;
-      } else if (name == Slice("readseq")) {
+      } else if (name == "readseq") {
         method = &Benchmark::ReadSequential;
-      } else if (name == Slice("readtocache")) {
+      } else if (name == "readtocache") {
         method = &Benchmark::ReadSequential;
         num_threads = 1;
         reads_ = num_;
-      } else if (name == Slice("readreverse")) {
+      } else if (name == "readreverse") {
         method = &Benchmark::ReadReverse;
-      } else if (name == Slice("readrandom")) {
+      } else if (name == "readrandom") {
         method = &Benchmark::ReadRandom;
-      } else if (name == Slice("readrandomfast")) {
+      } else if (name == "readrandomfast") {
         method = &Benchmark::ReadRandomFast;
-      } else if (name == Slice("multireadrandom")) {
+      } else if (name == "multireadrandom") {
         fprintf(stderr, "entries_per_batch = %" PRIi64 "\n",
                 entries_per_batch_);
         method = &Benchmark::MultiReadRandom;
-      } else if (name == Slice("readmissing")) {
+      } else if (name == "readmissing") {
         ++key_size_;
         method = &Benchmark::ReadRandom;
-      } else if (name == Slice("newiterator")) {
+      } else if (name == "newiterator") {
         method = &Benchmark::IteratorCreation;
-      } else if (name == Slice("newiteratorwhilewriting")) {
+      } else if (name == "newiteratorwhilewriting") {
         num_threads++;  // Add extra thread for writing
         method = &Benchmark::IteratorCreationWhileWriting;
-      } else if (name == Slice("seekrandom")) {
+      } else if (name == "seekrandom") {
         method = &Benchmark::SeekRandom;
-      } else if (name == Slice("seekrandomwhilewriting")) {
+      } else if (name == "seekrandomwhilewriting") {
         num_threads++;  // Add extra thread for writing
         method = &Benchmark::SeekRandomWhileWriting;
-      } else if (name == Slice("seekrandomwhilemerging")) {
+      } else if (name == "seekrandomwhilemerging") {
         num_threads++;  // Add extra thread for merging
         method = &Benchmark::SeekRandomWhileMerging;
-      } else if (name == Slice("readrandomsmall")) {
+      } else if (name == "readrandomsmall") {
         reads_ /= 1000;
         method = &Benchmark::ReadRandom;
-      } else if (name == Slice("deleteseq")) {
+      } else if (name == "deleteseq") {
         method = &Benchmark::DeleteSeq;
-      } else if (name == Slice("deleterandom")) {
+      } else if (name == "deleterandom") {
         method = &Benchmark::DeleteRandom;
-      } else if (name == Slice("readwhilewriting")) {
+      } else if (name == "readwhilewriting") {
         num_threads++;  // Add extra thread for writing
         method = &Benchmark::ReadWhileWriting;
-      } else if (name == Slice("readwhilemerging")) {
+      } else if (name == "readwhilemerging") {
         num_threads++;  // Add extra thread for writing
         method = &Benchmark::ReadWhileMerging;
-      } else if (name == Slice("readrandomwriterandom")) {
+      } else if (name == "readrandomwriterandom") {
         method = &Benchmark::ReadRandomWriteRandom;
-      } else if (name == Slice("readrandommergerandom")) {
+      } else if (name == "readrandommergerandom") {
         if (FLAGS_merge_operator.empty()) {
           fprintf(stdout, "%-12s : skipped (--merge_operator is unknown)\n",
-                  name.ToString().c_str());
+                  name.c_str());
           exit(1);
         }
         method = &Benchmark::ReadRandomMergeRandom;
-      } else if (name == Slice("updaterandom")) {
+      } else if (name == "updaterandom") {
         method = &Benchmark::UpdateRandom;
-      } else if (name == Slice("appendrandom")) {
+      } else if (name == "appendrandom") {
         method = &Benchmark::AppendRandom;
-      } else if (name == Slice("mergerandom")) {
+      } else if (name == "mergerandom") {
         if (FLAGS_merge_operator.empty()) {
           fprintf(stdout, "%-12s : skipped (--merge_operator is unknown)\n",
-                  name.ToString().c_str());
+                  name.c_str());
           exit(1);
         }
         method = &Benchmark::MergeRandom;
-      } else if (name == Slice("randomwithverify")) {
+      } else if (name == "randomwithverify") {
         method = &Benchmark::RandomWithVerify;
-      } else if (name == Slice("fillseekseq")) {
+      } else if (name == "fillseekseq") {
         method = &Benchmark::WriteSeqSeekSeq;
-      } else if (name == Slice("compact")) {
+      } else if (name == "compact") {
         method = &Benchmark::Compact;
-      } else if (name == Slice("crc32c")) {
+      } else if (name == "crc32c") {
         method = &Benchmark::Crc32c;
-      } else if (name == Slice("xxhash")) {
+      } else if (name == "xxhash") {
         method = &Benchmark::xxHash;
-      } else if (name == Slice("acquireload")) {
+      } else if (name == "acquireload") {
         method = &Benchmark::AcquireLoad;
-      } else if (name == Slice("compress")) {
+      } else if (name == "compress") {
         method = &Benchmark::Compress;
-      } else if (name == Slice("uncompress")) {
+      } else if (name == "uncompress") {
         method = &Benchmark::Uncompress;
-      } else if (name == Slice("stats")) {
+      } else if (name == "randomtransaction") {
+        method = &Benchmark::RandomTransaction;
+        post_process_method = &Benchmark::RandomTransactionVerify;
+      } else if (name == "randomreplacekeys") {
+        fresh_db = true;
+        method = &Benchmark::RandomReplaceKeys;
+      } else if (name == "stats") {
         PrintStats("rocksdb.stats");
-      } else if (name == Slice("levelstats")) {
+      } else if (name == "levelstats") {
         PrintStats("rocksdb.levelstats");
-      } else if (name == Slice("sstables")) {
+      } else if (name == "sstables") {
         PrintStats("rocksdb.sstables");
-      } else {
-        if (name != Slice()) {  // No error message for empty name
-          fprintf(stderr, "unknown benchmark '%s'\n", name.ToString().c_str());
-          exit(1);
-        }
+      } else if (!name.empty()) {  // No error message for empty name
+        fprintf(stderr, "unknown benchmark '%s'\n", name.c_str());
+        exit(1);
       }
 
       if (fresh_db) {
         if (FLAGS_use_existing_db) {
           fprintf(stdout, "%-12s : skipped (--use_existing_db is true)\n",
-                  name.ToString().c_str());
+                  name.c_str());
           method = nullptr;
         } else {
           if (db_.db != nullptr) {
-            std::for_each(db_.cfh.begin(), db_.cfh.end(),
-                          [](ColumnFamilyHandle* cfh) { delete cfh; });
-            delete db_.db;
-            db_.db = nullptr;
-            db_.cfh.clear();
+            db_.DeleteDBs();
             DestroyDB(FLAGS_db, open_options_);
           }
           for (size_t i = 0; i < multi_dbs_.size(); i++) {
@@ -1728,6 +1980,9 @@ class Benchmark {
         fprintf(stdout, "DB path: [%s]\n", FLAGS_db.c_str());
         RunBenchmark(num_threads, name, method);
       }
+      if (post_process_method != nullptr) {
+        (this->*post_process_method)();
+      }
     }
     if (FLAGS_statistics) {
      fprintf(stdout, "STATISTICS:\n%s\n", dbstats->ToString().c_str());
@@ -1780,6 +2035,16 @@ class Benchmark {
     shared.num_initialized = 0;
     shared.num_done = 0;
     shared.start = false;
+    if (FLAGS_benchmark_write_rate_limit > 0) {
+      shared.write_rate_limiter.reset(
+          NewGenericRateLimiter(FLAGS_benchmark_write_rate_limit));
+    }
+
+    std::unique_ptr<ReporterAgent> reporter_agent;
+    if (FLAGS_report_interval_seconds > 0) {
+      reporter_agent.reset(new ReporterAgent(FLAGS_env, FLAGS_report_file,
+                                             FLAGS_report_interval_seconds));
+    }
 
     ThreadArg* arg = new ThreadArg[n];
 
@@ -1805,6 +2070,7 @@ class Benchmark {
       arg[i].method = method;
       arg[i].shared = &shared;
       arg[i].thread = new ThreadState(i);
+      arg[i].thread->stats.SetReporterAgent(reporter_agent.get());
       arg[i].thread->shared = &shared;
       FLAGS_env->StartThread(ThreadBody, &arg[i]);
     }
@@ -1898,30 +2164,7 @@ class Benchmark {
 
     // Compress 1G
     while (ok && bytes < int64_t(1) << 30) {
-      switch (FLAGS_compression_type_e) {
-      case rocksdb::kSnappyCompression:
-        ok = Snappy_Compress(Options().compression_opts, input.data(),
-                             input.size(), &compressed);
-        break;
-      case rocksdb::kZlibCompression:
-        ok = Zlib_Compress(Options().compression_opts, 2, input.data(),
-                           input.size(), &compressed);
-        break;
-      case rocksdb::kBZip2Compression:
-        ok = BZip2_Compress(Options().compression_opts, 2, input.data(),
-                            input.size(), &compressed);
-        break;
-      case rocksdb::kLZ4Compression:
-        ok = LZ4_Compress(Options().compression_opts, 2, input.data(),
-                          input.size(), &compressed);
-        break;
-      case rocksdb::kLZ4HCCompression:
-        ok = LZ4HC_Compress(Options().compression_opts, 2, input.data(),
-                            input.size(), &compressed);
-        break;
-      default:
-        ok = false;
-      }
+      ok = CompressSlice(input, &compressed);
       produced += compressed.size();
       bytes += input.size();
       thread->stats.FinishedOps(nullptr, nullptr, 1);
@@ -1943,32 +2186,7 @@ class Benchmark {
     Slice input = gen.Generate(FLAGS_block_size);
     std::string compressed;
 
-    bool ok;
-    switch (FLAGS_compression_type_e) {
-    case rocksdb::kSnappyCompression:
-      ok = Snappy_Compress(Options().compression_opts, input.data(),
-                           input.size(), &compressed);
-      break;
-    case rocksdb::kZlibCompression:
-      ok = Zlib_Compress(Options().compression_opts, 2, input.data(),
-                         input.size(), &compressed);
-      break;
-    case rocksdb::kBZip2Compression:
-      ok = BZip2_Compress(Options().compression_opts, 2, input.data(),
-                          input.size(), &compressed);
-      break;
-    case rocksdb::kLZ4Compression:
-      ok = LZ4_Compress(Options().compression_opts, 2, input.data(),
-                        input.size(), &compressed);
-      break;
-    case rocksdb::kLZ4HCCompression:
-      ok = LZ4HC_Compress(Options().compression_opts, 2, input.data(),
-                          input.size(), &compressed);
-      break;
-    default:
-      ok = false;
-    }
-
+    bool ok = CompressSlice(input, &compressed);
     int64_t bytes = 0;
     int decompress_size;
     while (ok && bytes < 1024 * 1048576) {
@@ -2000,6 +2218,11 @@ class Benchmark {
                                       &decompress_size, 2);
         ok = uncompressed != nullptr;
         break;
+      case rocksdb::kZSTDNotFinalCompression:
+        uncompressed = ZSTD_Uncompress(compressed.data(), compressed.size(),
+                                       &decompress_size);
+        ok = uncompressed != nullptr;
+        break;
       default:
         ok = false;
       }
@@ -2027,9 +2250,13 @@ class Benchmark {
     options.max_write_buffer_number = FLAGS_max_write_buffer_number;
     options.min_write_buffer_number_to_merge =
       FLAGS_min_write_buffer_number_to_merge;
+    options.max_write_buffer_number_to_maintain =
+        FLAGS_max_write_buffer_number_to_maintain;
     options.max_background_compactions = FLAGS_max_background_compactions;
+    options.max_subcompactions = static_cast<uint32_t>(FLAGS_subcompactions);
     options.max_background_flushes = FLAGS_max_background_flushes;
     options.compaction_style = FLAGS_compaction_style_e;
+    options.compaction_pri = FLAGS_compaction_pri_e;
     if (FLAGS_prefix_size != 0) {
       options.prefix_extractor.reset(
           NewFixedPrefixTransform(FLAGS_prefix_size));
@@ -2044,6 +2271,10 @@ class Benchmark {
     options.memtable_prefix_bloom_bits = FLAGS_memtable_bloom_bits;
     options.bloom_locality = FLAGS_bloom_locality;
     options.max_open_files = FLAGS_open_files;
+    options.max_file_opening_threads = FLAGS_file_opening_threads;
+    options.new_table_reader_for_compaction_inputs =
+        FLAGS_new_table_reader_for_compaction_inputs;
+    options.compaction_readahead_size = FLAGS_compaction_readahead_size;
     options.statistics = dbstats;
     if (FLAGS_enable_io_prio) {
       FLAGS_env->LowerThreadPoolIOPriority(Env::LOW);
@@ -2060,7 +2291,7 @@ class Benchmark {
       flashcache_aware_env_ =
           std::move(NewFlashcacheAwareEnv(FLAGS_env, cachedev_fd_));
       if (flashcache_aware_env_.get() == nullptr) {
-        fprintf(stderr, "Failed to open flashcahce device at %s\n",
+        fprintf(stderr, "Failed to open flashcache device at %s\n",
                 FLAGS_flashcache_dev.c_str());
         std::abort();
       }
@@ -2080,6 +2311,14 @@ class Benchmark {
     options.max_bytes_for_level_multiplier =
         FLAGS_max_bytes_for_level_multiplier;
     options.filter_deletes = FLAGS_filter_deletes;
+    if (FLAGS_row_cache_size) {
+      if (FLAGS_cache_numshardbits >= 1) {
+        options.row_cache =
+            NewLRUCache(FLAGS_row_cache_size, FLAGS_cache_numshardbits);
+      } else {
+        options.row_cache = NewLRUCache(FLAGS_row_cache_size);
+      }
+    }
     if ((FLAGS_prefix_size == 0) && (FLAGS_rep_factory == kPrefixHash ||
                                      FLAGS_rep_factory == kHashLinkedList)) {
       fprintf(stderr, "prefix_size should be non-zero if PrefixHash or "
@@ -2171,6 +2410,8 @@ class Benchmark {
       if (cache_ == nullptr) {
         block_based_options.no_block_cache = true;
       }
+      block_based_options.cache_index_and_filter_blocks =
+          FLAGS_cache_index_and_filter_blocks;
       block_based_options.block_cache = cache_;
       block_based_options.block_cache_compressed = compressed_cache_;
       block_based_options.block_size = FLAGS_block_size;
@@ -2214,6 +2455,9 @@ class Benchmark {
     }
     options.soft_rate_limit = FLAGS_soft_rate_limit;
     options.hard_rate_limit = FLAGS_hard_rate_limit;
+    options.hard_pending_compaction_bytes_limit =
+        FLAGS_hard_pending_compaction_bytes_limit;
+    options.delayed_write_rate = FLAGS_delayed_write_rate;
     options.rate_limit_delay_max_milliseconds =
       FLAGS_rate_limit_delay_max_milliseconds;
     options.table_cache_numshardbits = FLAGS_table_cache_numshardbits;
@@ -2241,6 +2485,7 @@ class Benchmark {
       exit(1);
     }
     options.max_successive_merges = FLAGS_max_successive_merges;
+    options.compaction_measure_io_stats = FLAGS_compaction_measure_io_stats;
 
     // set universal style compaction configurations, if applicable
     if (FLAGS_universal_size_ratio != 0) {
@@ -2263,6 +2508,8 @@ class Benchmark {
       options.compaction_options_universal.compression_size_percent =
         FLAGS_universal_compression_size_percent;
     }
+    options.compaction_options_universal.allow_trivial_move =
+        FLAGS_universal_allow_trivial_move;
     if (FLAGS_thread_status_per_interval > 0) {
       options.enable_thread_tracking = true;
     }
@@ -2271,6 +2518,11 @@ class Benchmark {
           NewGenericRateLimiter(FLAGS_rate_limiter_bytes_per_sec));
     }
 
+    if (FLAGS_readonly && FLAGS_transaction_db) {
+      fprintf(stderr, "Cannot use readonly flag with transaction_db\n");
+      exit(1);
+    }
+
     if (FLAGS_num_multi_db <= 1) {
       OpenDb(options, FLAGS_db, &db_);
     } else {
@@ -2305,15 +2557,41 @@ class Benchmark {
       if (FLAGS_readonly) {
         s = DB::OpenForReadOnly(options, db_name, column_families,
             &db->cfh, &db->db);
+      } else if (FLAGS_optimistic_transaction_db) {
+        s = OptimisticTransactionDB::Open(options, db_name, column_families,
+                                          &db->cfh, &db->opt_txn_db);
+        if (s.ok()) {
+          db->db = db->opt_txn_db->GetBaseDB();
+        }
+      } else if (FLAGS_transaction_db) {
+        TransactionDB* ptr;
+        TransactionDBOptions txn_db_options;
+        s = TransactionDB::Open(options, txn_db_options, db_name,
+                                column_families, &db->cfh, &ptr);
+        if (s.ok()) {
+          db->db = ptr;
+        }
       } else {
         s = DB::Open(options, db_name, column_families, &db->cfh, &db->db);
       }
       db->cfh.resize(FLAGS_num_column_families);
       db->num_created = num_hot;
       db->num_hot = num_hot;
-
     } else if (FLAGS_readonly) {
       s = DB::OpenForReadOnly(options, db_name, &db->db);
+    } else if (FLAGS_optimistic_transaction_db) {
+      s = OptimisticTransactionDB::Open(options, db_name, &db->opt_txn_db);
+      if (s.ok()) {
+        db->db = db->opt_txn_db->GetBaseDB();
+      }
+    } else if (FLAGS_transaction_db) {
+      TransactionDB* ptr;
+      TransactionDBOptions txn_db_options;
+      s = TransactionDB::Open(options, txn_db_options, db_name, &ptr);
+      if (s.ok()) {
+        db->db = ptr;
+      }
+
     } else {
       s = DB::Open(options, db_name, &db->db);
     }
@@ -2451,6 +2729,10 @@ class Benchmark {
       DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(id);
       batch.Clear();
       for (int64_t j = 0; j < entries_per_batch_; j++) {
+        if (thread->shared->write_rate_limiter.get() != nullptr) {
+          thread->shared->write_rate_limiter->Request(value_size_ + key_size_,
+                                                      Env::IO_HIGH);
+        }
         int64_t rand_num = key_gens[id]->Next();
         GenerateKeyFromInt(rand_num, FLAGS_num, &key);
         if (FLAGS_num_column_families <= 1) {
@@ -3196,7 +3478,7 @@ class Benchmark {
       // Update the value (by appending data)
       Slice operand = gen.Generate(value_size_);
       if (value.size() > 0) {
-        // Use a delimeter to match the semantics for StringAppendOperator
+        // Use a delimiter to match the semantics for StringAppendOperator
         value.append(1,',');
       }
       value.append(operand.data(), operand.size());
@@ -3313,8 +3595,8 @@ class Benchmark {
 
     char msg[100];
     snprintf(msg, sizeof(msg),
-             "(reads:%" PRIu64 " merges:%" PRIu64 " total:%" PRIu64 " hits:%" \
-             PRIu64 " maxlength:%zu)",
+             "(reads:%" PRIu64 " merges:%" PRIu64 " total:%" PRIu64
+             " hits:%" PRIu64 " maxlength:%" ROCKSDB_PRIszt ")",
              num_gets, num_merges, readwrites_, num_hits, max_length);
     thread->stats.AddMessage(msg);
   }
@@ -3354,9 +3636,302 @@ class Benchmark {
     }
   }
 
+  // This benchmark stress-tests Transactions.  For a given --duration (or
+  // total number of --writes), a Transaction will perform a read-modify-write
+  // to increment the value of a key in each of N (--transaction_sets) sets of
+  // keys (where each set has --num keys).  If --threads is set, this will be
+  // done in parallel.
+  //
+  // To test transactions, use --transaction_db=true.  Not setting this
+  // parameter will run the same benchmark without transactions.
+  //
+  // RandomTransactionVerify() will then validate the correctness of the results
+  // by checking if the sum of all keys in each set is the same.
+  void RandomTransaction(ThreadState* thread) {
+    ReadOptions options(FLAGS_verify_checksum, true);
+    Duration duration(FLAGS_duration, readwrites_);
+    ReadOptions read_options(FLAGS_verify_checksum, true);
+    std::string value;
+    DB* db = db_.db;
+    uint64_t transactions_done = 0;
+    uint64_t transactions_aborted = 0;
+    Status s;
+    uint64_t num_prefix_ranges = FLAGS_transaction_sets;
+
+    if (num_prefix_ranges == 0 || num_prefix_ranges > 9999) {
+      fprintf(stderr, "invalid value for transaction_sets\n");
+      abort();
+    }
+
+    if (FLAGS_num_multi_db > 1) {
+      fprintf(stderr,
+              "Cannot run RandomTransaction benchmark with "
+              "FLAGS_multi_db > 1.");
+      abort();
+    }
+
+    while (!duration.Done(1)) {
+      Transaction* txn = nullptr;
+      WriteBatch* batch = nullptr;
+
+      if (FLAGS_optimistic_transaction_db) {
+        txn = db_.opt_txn_db->BeginTransaction(write_options_);
+        assert(txn);
+      } else if (FLAGS_transaction_db) {
+        TransactionDB* txn_db = reinterpret_cast<TransactionDB*>(db_.db);
+
+        TransactionOptions txn_options;
+        txn_options.lock_timeout = FLAGS_transaction_lock_timeout;
+
+        txn = txn_db->BeginTransaction(write_options_, txn_options);
+        assert(txn);
+      } else {
+        batch = new WriteBatch();
+      }
+
+      if (txn && FLAGS_transaction_set_snapshot) {
+        txn->SetSnapshot();
+      }
+
+      // pick a random number to use to increment a key in each set
+      uint64_t incr = (thread->rand.Next() % 100) + 1;
+
+      bool failed = false;
+      // For each set, pick a key at random and increment it
+      for (uint8_t i = 0; i < num_prefix_ranges; i++) {
+        uint64_t int_value;
+        char prefix_buf[5];
+
+        // key format:  [SET#][random#]
+        std::string rand_key = ToString(thread->rand.Next() % FLAGS_num);
+        Slice base_key(rand_key);
+
+        // Pad prefix appropriately so we can iterate over each set
+        snprintf(prefix_buf, sizeof(prefix_buf), "%04d", i + 1);
+        std::string full_key = std::string(prefix_buf) + base_key.ToString();
+        Slice key(full_key);
+
+        if (txn) {
+          s = txn->GetForUpdate(read_options, key, &value);
+        } else {
+          s = db->Get(read_options, key, &value);
+        }
+
+        if (s.ok()) {
+          int_value = std::stoull(value);
+
+          if (int_value == 0 || int_value == ULONG_MAX) {
+            fprintf(stderr, "Get returned unexpected value: %s\n",
+                    value.c_str());
+            abort();
+          }
+        } else if (s.IsNotFound()) {
+          int_value = 0;
+        } else if (!(s.IsBusy() || s.IsTimedOut() || s.IsTryAgain())) {
+          fprintf(stderr, "Get returned an unexpected error: %s\n",
+                  s.ToString().c_str());
+          abort();
+        } else {
+          failed = true;
+          break;
+        }
+
+        if (FLAGS_transaction_sleep > 0) {
+          FLAGS_env->SleepForMicroseconds(thread->rand.Next() %
+                                          FLAGS_transaction_sleep);
+        }
+
+        std::string sum = ToString(int_value + incr);
+        if (txn) {
+          s = txn->Put(key, sum);
+          if (!s.ok()) {
+            // Since we did a GetForUpdate, Put should not fail.
+            fprintf(stderr, "Put returned an unexpected error: %s\n",
+                    s.ToString().c_str());
+            abort();
+          }
+        } else {
+          batch->Put(key, sum);
+        }
+      }
+
+      if (txn) {
+        if (failed) {
+          transactions_aborted++;
+          txn->Rollback();
+          s = Status::OK();
+        } else {
+          s = txn->Commit();
+        }
+      } else {
+        s = db->Write(write_options_, batch);
+      }
+
+      if (!s.ok()) {
+        failed = true;
+
+        // Ideally, we'd want to run this stress test with enough concurrency
+        // on a small enough set of keys that we get some failed transactions
+        // due to conflicts.
+        if (FLAGS_optimistic_transaction_db &&
+            (s.IsBusy() || s.IsTimedOut() || s.IsTryAgain())) {
+          transactions_aborted++;
+        } else if (FLAGS_transaction_db && s.IsExpired()) {
+          transactions_aborted++;
+        } else {
+          fprintf(stderr, "Unexpected write error: %s\n", s.ToString().c_str());
+          abort();
+        }
+      }
+
+      if (txn) {
+        delete txn;
+      }
+      if (batch) {
+        delete batch;
+      }
+
+      if (!failed) {
+        thread->stats.FinishedOps(nullptr, db, 1);
+      }
+
+      transactions_done++;
+    }
+
+    char msg[100];
+    if (FLAGS_optimistic_transaction_db || FLAGS_transaction_db) {
+      snprintf(msg, sizeof(msg),
+               "( transactions:%" PRIu64 " aborts:%" PRIu64 ")",
+               transactions_done, transactions_aborted);
+    } else {
+      snprintf(msg, sizeof(msg), "( batches:%" PRIu64 " )", transactions_done);
+    }
+    thread->stats.AddMessage(msg);
+
+    if (FLAGS_perf_level > 0) {
+      thread->stats.AddMessage(perf_context.ToString());
+    }
+  }
+
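
For reference, a plausible invocation exercising this benchmark (flag names
are the ones defined earlier in this file; the db_bench binary name is
assumed):

    ./db_bench --benchmarks=randomtransaction --transaction_db=true \
               --threads=4 --num=1000 --transaction_sets=4
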
+  // Verifies consistency of data after RandomTransaction() has been run.
+  // Since each iteration of RandomTransaction() incremented a key in each set
+  // by the same value, the sum of the keys in each set should be the same.
+  void RandomTransactionVerify() {
+    if (!FLAGS_transaction_db && !FLAGS_optimistic_transaction_db) {
+      // transactions not used, nothing to verify.
+      return;
+    }
+
+    uint64_t prev_total = 0;
+
+    // For each set of keys with the same prefix, sum all the values
+    for (uint32_t i = 0; i < FLAGS_transaction_sets; i++) {
+      char prefix_buf[5];
+      snprintf(prefix_buf, sizeof(prefix_buf), "%04u", i + 1);
+      uint64_t total = 0;
+
+      Iterator* iter = db_.db->NewIterator(ReadOptions());
+
+      for (iter->Seek(Slice(prefix_buf, 4)); iter->Valid(); iter->Next()) {
+        Slice key = iter->key();
+
+        // stop when we reach a different prefix
+        if (key.ToString().compare(0, 4, prefix_buf) != 0) {
+          break;
+        }
+
+        Slice value = iter->value();
+        uint64_t int_value = std::stoull(value.ToString());
+        if (int_value == 0 || int_value == ULONG_MAX) {
+          fprintf(stderr, "Iter returned unexpected value: %s\n",
+                  value.ToString().c_str());
+          abort();
+        }
+
+        total += int_value;
+      }
+      delete iter;
+
+      if (i > 0) {
+        if (total != prev_total) {
+          fprintf(stderr,
+                  "RandomTransactionVerify found inconsistent totals. "
+                  "Set[%" PRIu32 "]: %" PRIu64 ", Set[%" PRIu32 "]: %" PRIu64
+                  " \n",
+                  i - 1, prev_total, i, total);
+          abort();
+        }
+      }
+      prev_total = total;
+    }
+
+    fprintf(stdout, "RandomTransactionVerify Success!\n");
+  }
+
+  // Writes and deletes random keys without overwriting keys.
+  //
+  // This benchmark is intended to partially replicate the behavior of MyRocks
+  // secondary indices: All data is stored in keys and updates happen by
+  // deleting the old version of the key and inserting the new version.
+  void RandomReplaceKeys(ThreadState* thread) {
+    std::unique_ptr<const char[]> key_guard;
+    Slice key = AllocateKey(&key_guard);
+    std::vector<uint32_t> counters(FLAGS_numdistinct, 0);
+    size_t max_counter = 50;
+    RandomGenerator gen;
+
+    Status s;
+    DB* db = SelectDB(thread);
+    for (int64_t i = 0; i < FLAGS_numdistinct; i++) {
+      GenerateKeyFromInt(i * max_counter, FLAGS_num, &key);
+      s = db->Put(write_options_, key, gen.Generate(value_size_));
+      if (!s.ok()) {
+        fprintf(stderr, "Operation failed: %s\n", s.ToString().c_str());
+        exit(1);
+      }
+    }
+
+    db->GetSnapshot();
+
+    std::default_random_engine generator;
+    std::normal_distribution<double> distribution(FLAGS_numdistinct / 2.0,
+                                                  FLAGS_stddev);
+    Duration duration(FLAGS_duration, FLAGS_num);
+    while (!duration.Done(1)) {
+      int64_t rnd_id = static_cast<int64_t>(distribution(generator));
+      int64_t key_id = std::max(std::min(FLAGS_numdistinct - 1, rnd_id),
+                                static_cast<int64_t>(0));
+      GenerateKeyFromInt(key_id * max_counter + counters[key_id], FLAGS_num,
+                         &key);
+      s = FLAGS_use_single_deletes ? db->SingleDelete(write_options_, key)
+                                   : db->Delete(write_options_, key);
+      if (s.ok()) {
+        counters[key_id] = (counters[key_id] + 1) % max_counter;
+        GenerateKeyFromInt(key_id * max_counter + counters[key_id], FLAGS_num,
+                           &key);
+        s = db->Put(write_options_, key, Slice());
+      }
+
+      if (!s.ok()) {
+        fprintf(stderr, "Operation failed: %s\n", s.ToString().c_str());
+        exit(1);
+      }
+
+      thread->stats.FinishedOps(nullptr, db, 1);
+    }
+
+    char msg[200];
+    snprintf(msg, sizeof(msg),
+             "use single deletes: %d, "
+             "standard deviation: %lf\n",
+             FLAGS_use_single_deletes, FLAGS_stddev);
+    thread->stats.AddMessage(msg);
+  }
+
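
The key layout assumed by RandomReplaceKeys() above, restated from the
arithmetic in the code:

    // Each distinct id owns max_counter (= 50) consecutive key slots:
    //   id 0 -> slots [0, 49], id 1 -> slots [50, 99], ...
    // One iteration deletes slot id*50 + c, then inserts slot
    // id*50 + (c+1) % 50, so at most one key per id is live at a time:
    // keys are replaced, never overwritten.
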
   void Compact(ThreadState* thread) {
     DB* db = SelectDB(thread);
-    db->CompactRange(nullptr, nullptr);
+    db->CompactRange(CompactRangeOptions(), nullptr, nullptr);
   }
 
   void PrintStats(const char* key) {
@@ -3392,12 +3967,17 @@ int main(int argc, char** argv) {
   if (FLAGS_statistics) {
     dbstats = rocksdb::CreateDBStatistics();
   }
+  FLAGS_compaction_pri_e = (rocksdb::CompactionPri)FLAGS_compaction_pri;
 
   std::vector<std::string> fanout = rocksdb::StringSplit(
       FLAGS_max_bytes_for_level_multiplier_additional, ',');
   for (unsigned int j= 0; j < fanout.size(); j++) {
     FLAGS_max_bytes_for_level_multiplier_additional_v.push_back(
-      std::stoi(fanout[j]));
+#ifndef CYGWIN
+        std::stoi(fanout[j]));
+#else
+        stoi(fanout[j]));
+#endif
   }
 
   FLAGS_compression_type_e =
diff --git a/src/rocksdb/db/db_compaction_filter_test.cc b/src/rocksdb/db/db_compaction_filter_test.cc
new file mode 100644
index 0000000..a1587f2
--- /dev/null
+++ b/src/rocksdb/db/db_compaction_filter_test.cc
@@ -0,0 +1,586 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "port/stack_trace.h"
+#include "util/db_test_util.h"
+
+namespace rocksdb {
+
+static int cfilter_count = 0;
+
+// The replacement value that ChangeFilter installs when it rewrites
+// key/value pairs during compaction.
+static std::string NEW_VALUE = "NewValue";
+
+class DBTestCompactionFilter : public DBTestBase {
+ public:
+  DBTestCompactionFilter() : DBTestBase("/db_compaction_filter_test") {}
+};
+
+class KeepFilter : public CompactionFilter {
+ public:
+  virtual bool Filter(int level, const Slice& key, const Slice& value,
+                      std::string* new_value, bool* value_changed) const
+      override {
+    cfilter_count++;
+    return false;
+  }
+
+  virtual const char* Name() const override { return "KeepFilter"; }
+};
+
+class DeleteFilter : public CompactionFilter {
+ public:
+  virtual bool Filter(int level, const Slice& key, const Slice& value,
+                      std::string* new_value, bool* value_changed) const
+      override {
+    cfilter_count++;
+    return true;
+  }
+
+  virtual const char* Name() const override { return "DeleteFilter"; }
+};
+
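+// Note: CompactionFilter::Filter() returning false keeps the key/value pair
+// and returning true drops it, so KeepFilter above keeps everything while
+// DeleteFilter drops everything; both bump cfilter_count per invocation.
+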
+class DelayFilter : public CompactionFilter {
+ public:
+  explicit DelayFilter(DBTestBase* d) : db_test(d) {}
+  virtual bool Filter(int level, const Slice& key, const Slice& value,
+                      std::string* new_value,
+                      bool* value_changed) const override {
+    db_test->env_->addon_time_.fetch_add(1000);
+    return true;
+  }
+
+  virtual const char* Name() const override { return "DelayFilter"; }
+
+ private:
+  DBTestBase* db_test;
+};
+
+class ConditionalFilter : public CompactionFilter {
+ public:
+  explicit ConditionalFilter(const std::string* filtered_value)
+      : filtered_value_(filtered_value) {}
+  virtual bool Filter(int level, const Slice& key, const Slice& value,
+                      std::string* new_value,
+                      bool* value_changed) const override {
+    return value.ToString() == *filtered_value_;
+  }
+
+  virtual const char* Name() const override { return "ConditionalFilter"; }
+
+ private:
+  const std::string* filtered_value_;
+};
+
+class ChangeFilter : public CompactionFilter {
+ public:
+  explicit ChangeFilter() {}
+
+  virtual bool Filter(int level, const Slice& key, const Slice& value,
+                      std::string* new_value, bool* value_changed) const
+      override {
+    assert(new_value != nullptr);
+    *new_value = NEW_VALUE;
+    *value_changed = true;
+    return false;
+  }
+
+  virtual const char* Name() const override { return "ChangeFilter"; }
+};
+
+class KeepFilterFactory : public CompactionFilterFactory {
+ public:
+  explicit KeepFilterFactory(bool check_context = false)
+      : check_context_(check_context) {}
+
+  virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& context) override {
+    if (check_context_) {
+      EXPECT_EQ(expect_full_compaction_.load(), context.is_full_compaction);
+      EXPECT_EQ(expect_manual_compaction_.load(), context.is_manual_compaction);
+    }
+    return std::unique_ptr<CompactionFilter>(new KeepFilter());
+  }
+
+  virtual const char* Name() const override { return "KeepFilterFactory"; }
+  bool check_context_;
+  std::atomic_bool expect_full_compaction_;
+  std::atomic_bool expect_manual_compaction_;
+};
+
+class DeleteFilterFactory : public CompactionFilterFactory {
+ public:
+  virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& context) override {
+    if (context.is_manual_compaction) {
+      return std::unique_ptr<CompactionFilter>(new DeleteFilter());
+    } else {
+      return std::unique_ptr<CompactionFilter>(nullptr);
+    }
+  }
+
+  virtual const char* Name() const override { return "DeleteFilterFactory"; }
+};
+
+class DelayFilterFactory : public CompactionFilterFactory {
+ public:
+  explicit DelayFilterFactory(DBTestBase* d) : db_test(d) {}
+  virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& context) override {
+    return std::unique_ptr<CompactionFilter>(new DelayFilter(db_test));
+  }
+
+  virtual const char* Name() const override { return "DelayFilterFactory"; }
+
+ private:
+  DBTestBase* db_test;
+};
+
+class ConditionalFilterFactory : public CompactionFilterFactory {
+ public:
+  explicit ConditionalFilterFactory(const Slice& filtered_value)
+      : filtered_value_(filtered_value.ToString()) {}
+
+  virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& context) override {
+    return std::unique_ptr<CompactionFilter>(
+        new ConditionalFilter(&filtered_value_));
+  }
+
+  virtual const char* Name() const override {
+    return "ConditionalFilterFactory";
+  }
+
+ private:
+  std::string filtered_value_;
+};
+
+class ChangeFilterFactory : public CompactionFilterFactory {
+ public:
+  explicit ChangeFilterFactory() {}
+
+  virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& context) override {
+    return std::unique_ptr<CompactionFilter>(new ChangeFilter());
+  }
+
+  virtual const char* Name() const override { return "ChangeFilterFactory"; }
+};
+
+TEST_F(DBTestCompactionFilter, CompactionFilter) {
+  Options options = CurrentOptions();
+  options.max_open_files = -1;
+  options.num_levels = 3;
+  options.compaction_filter_factory = std::make_shared<KeepFilterFactory>();
+  options = CurrentOptions(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  // Write 100K keys, these are written to a few files in L0.
+  const std::string value(10, 'x');
+  for (int i = 0; i < 100000; i++) {
+    char key[100];
+    snprintf(key, sizeof(key), "B%010d", i);
+    Put(1, key, value);
+  }
+  ASSERT_OK(Flush(1));
+
+  // Push all files to the highest level L2. Verify that
+  // the compaction at each level invokes the filter for
+  // all the keys in that level.
+  cfilter_count = 0;
+  dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
+  ASSERT_EQ(cfilter_count, 100000);
+  cfilter_count = 0;
+  dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]);
+  ASSERT_EQ(cfilter_count, 100000);
+
+  ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0);
+  ASSERT_NE(NumTableFilesAtLevel(2, 1), 0);
+  cfilter_count = 0;
+
+  // All the files are in the lowest level.
+  // Verify that all but the newest record have
+  // sequence number zero. The newest record is
+  // at the tip of the sequence history and
+  // cannot be zeroed out.
+  int count = 0;
+  int total = 0;
+  Arena arena;
+  {
+    ScopedArenaIterator iter(
+        dbfull()->TEST_NewInternalIterator(&arena, handles_[1]));
+    iter->SeekToFirst();
+    ASSERT_OK(iter->status());
+    while (iter->Valid()) {
+      ParsedInternalKey ikey(Slice(), 0, kTypeValue);
+      ikey.sequence = -1;
+      ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true);
+      total++;
+      if (ikey.sequence != 0) {
+        count++;
+      }
+      iter->Next();
+    }
+  }
+  ASSERT_EQ(total, 100000);
+  ASSERT_EQ(count, 1);
+
+  // overwrite all the 100K keys once again.
+  for (int i = 0; i < 100000; i++) {
+    char key[100];
+    snprintf(key, sizeof(key), "B%010d", i);
+    ASSERT_OK(Put(1, key, value));
+  }
+  ASSERT_OK(Flush(1));
+
+  // push all files to the highest level L2. This
+  // means that all keys should pass at least once
+  // through the compaction filter
+  cfilter_count = 0;
+  dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
+  ASSERT_EQ(cfilter_count, 100000);
+  cfilter_count = 0;
+  dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]);
+  ASSERT_EQ(cfilter_count, 100000);
+  ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0);
+  ASSERT_NE(NumTableFilesAtLevel(2, 1), 0);
+
+  // create a new database with the compaction
+  // filter in such a way that it deletes all keys
+  options.compaction_filter_factory = std::make_shared<DeleteFilterFactory>();
+  options.create_if_missing = true;
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  // write all the keys once again.
+  for (int i = 0; i < 100000; i++) {
+    char key[100];
+    snprintf(key, sizeof(key), "B%010d", i);
+    ASSERT_OK(Put(1, key, value));
+  }
+  ASSERT_OK(Flush(1));
+  ASSERT_NE(NumTableFilesAtLevel(0, 1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(2, 1), 0);
+
+  // Push all files to the highest level L2. This
+  // triggers the compaction filter to delete all keys;
+  // verify that at the end of the compaction process,
+  // nothing is left.
+  cfilter_count = 0;
+  dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
+  ASSERT_EQ(cfilter_count, 100000);
+  cfilter_count = 0;
+  dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]);
+  ASSERT_EQ(cfilter_count, 0);
+  ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0);
+
+  {
+    // Scan the entire database to ensure that nothing is left
+    std::unique_ptr<Iterator> iter(
+        db_->NewIterator(ReadOptions(), handles_[1]));
+    iter->SeekToFirst();
+    count = 0;
+    while (iter->Valid()) {
+      count++;
+      iter->Next();
+    }
+    ASSERT_EQ(count, 0);
+  }
+
+  // Any remaining record would keep its sequence number
+  // even at level Lmax, because it would be at the tip;
+  // verify that in fact nothing remains.
+  count = 0;
+  {
+    ScopedArenaIterator iter(
+        dbfull()->TEST_NewInternalIterator(&arena, handles_[1]));
+    iter->SeekToFirst();
+    ASSERT_OK(iter->status());
+    while (iter->Valid()) {
+      ParsedInternalKey ikey(Slice(), 0, kTypeValue);
+      ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true);
+      ASSERT_NE(ikey.sequence, (unsigned)0);
+      count++;
+      iter->Next();
+    }
+    ASSERT_EQ(count, 0);
+  }
+}
+
+// Tests the edge case where compaction does not produce any output -- all
+// entries are deleted. The compaction should create a bunch of 'DeleteFile'
+// entries in VersionEdit, but no 'AddFile' entries.
+TEST_F(DBTestCompactionFilter, CompactionFilterDeletesAll) {
+  Options options;
+  options.compaction_filter_factory = std::make_shared<DeleteFilterFactory>();
+  options.disable_auto_compactions = true;
+  options.create_if_missing = true;
+  options = CurrentOptions(options);
+  DestroyAndReopen(options);
+
+  // put some data
+  for (int table = 0; table < 4; ++table) {
+    for (int i = 0; i < 10 + table; ++i) {
+      Put(ToString(table * 100 + i), "val");
+    }
+    Flush();
+  }
+
+  // this will produce an empty file (delete compaction filter)
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ(0U, CountLiveFiles());
+
+  Reopen(options);
+
+  Iterator* itr = db_->NewIterator(ReadOptions());
+  itr->SeekToFirst();
+  // empty db
+  ASSERT_TRUE(!itr->Valid());
+
+  delete itr;
+}
+
+TEST_F(DBTestCompactionFilter, CompactionFilterWithValueChange) {
+  do {
+    Options options;
+    options.num_levels = 3;
+    options.compaction_filter_factory =
+      std::make_shared<ChangeFilterFactory>();
+    options = CurrentOptions(options);
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    // Write 100K+1 keys; these are written to a few files
+    // in L0. We do this so that the current snapshot points
+    // to the 100001st key. The compaction filter is not invoked
+    // on keys that are visible via a snapshot because we
+    // cannot delete them anyway.
+    const std::string value(10, 'x');
+    for (int i = 0; i < 100001; i++) {
+      char key[100];
+      snprintf(key, sizeof(key), "B%010d", i);
+      Put(1, key, value);
+    }
+
+    // push all files to lower levels
+    ASSERT_OK(Flush(1));
+    if (option_config_ != kUniversalCompactionMultiLevel &&
+        option_config_ != kUniversalSubcompactions) {
+      dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
+      dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]);
+    } else {
+      dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr,
+                             nullptr);
+    }
+
+    // re-write all data again
+    for (int i = 0; i < 100001; i++) {
+      char key[100];
+      snprintf(key, sizeof(key), "B%010d", i);
+      Put(1, key, value);
+    }
+
+    // push all files to lower levels. This should
+    // invoke the compaction filter for all 100K+1 keys.
+    ASSERT_OK(Flush(1));
+    if (option_config_ != kUniversalCompactionMultiLevel &&
+        option_config_ != kUniversalSubcompactions) {
+      dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
+      dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]);
+    } else {
+      dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr,
+                             nullptr);
+    }
+
+    // verify that all keys now have the new value that
+    // was set by the compaction process.
+    for (int i = 0; i < 100001; i++) {
+      char key[100];
+      snprintf(key, sizeof(key), "B%010d", i);
+      std::string newvalue = Get(1, key);
+      ASSERT_EQ(newvalue.compare(NEW_VALUE), 0);
+    }
+  } while (ChangeCompactOptions());
+}
+
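+// Background for the next test (a summary, not new behavior): PutFixed64()
+// appends a fixed-width little-endian 64-bit integer to a string, and
+// MergeOperators::CreateUInt64AddOperator() returns a merge operator that
+// sums such encoded operands, so Put(k, 2) followed by Merge(k, 1) reads
+// back as 3.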
+TEST_F(DBTestCompactionFilter, CompactionFilterWithMergeOperator) {
+  std::string one, two, three, four;
+  PutFixed64(&one, 1);
+  PutFixed64(&two, 2);
+  PutFixed64(&three, 3);
+  PutFixed64(&four, 4);
+
+  Options options;
+  options = CurrentOptions(options);
+  options.create_if_missing = true;
+  options.merge_operator = MergeOperators::CreateUInt64AddOperator();
+  options.num_levels = 3;
+  // Filter out keys whose value is 2.
+  options.compaction_filter_factory =
+      std::make_shared<ConditionalFilterFactory>(two);
+  DestroyAndReopen(options);
+
+  // In the same compaction, a value-type entry needs to be deleted based on
+  // the compaction filter, but a merge-type entry also exists for the key,
+  // so the compaction filter result is ignored.
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", two));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->Merge(WriteOptions(), "foo", one));
+  ASSERT_OK(Flush());
+  std::string newvalue = Get("foo");
+  ASSERT_EQ(newvalue, three);
+  dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+  newvalue = Get("foo");
+  ASSERT_EQ(newvalue, three);
+
+  // A value-type entry can be deleted based on the compaction filter,
+  // leaving only merge-type entries.
+  ASSERT_OK(db_->Put(WriteOptions(), "bar", two));
+  ASSERT_OK(Flush());
+  dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+  newvalue = Get("bar");
+  ASSERT_EQ("NOT_FOUND", newvalue);
+  ASSERT_OK(db_->Merge(WriteOptions(), "bar", two));
+  ASSERT_OK(Flush());
+  dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+  newvalue = Get("bar");
+  ASSERT_EQ(two, newvalue);
+
+  // Compaction filter never applies to merge keys.
+  ASSERT_OK(db_->Put(WriteOptions(), "foobar", one));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->Merge(WriteOptions(), "foobar", two));
+  ASSERT_OK(Flush());
+  newvalue = Get("foobar");
+  ASSERT_EQ(newvalue, three);
+  dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+  newvalue = Get("foobar");
+  ASSERT_EQ(newvalue, three);
+
+  // In the same compaction, both of value type and merge type keys need to be
+  // deleted based on compaction filter, and there is a merge type for the key.
+  // For both keys, compaction filter results are ignored.
+  ASSERT_OK(db_->Put(WriteOptions(), "barfoo", two));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->Merge(WriteOptions(), "barfoo", two));
+  ASSERT_OK(Flush());
+  newvalue = Get("barfoo");
+  ASSERT_EQ(newvalue, four);
+  dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+  newvalue = Get("barfoo");
+  ASSERT_EQ(newvalue, four);
+}
+
+TEST_F(DBTestCompactionFilter, CompactionFilterContextManual) {
+  KeepFilterFactory* filter = new KeepFilterFactory();
+
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.compaction_filter_factory.reset(filter);
+  options.compression = kNoCompression;
+  options.level0_file_num_compaction_trigger = 8;
+  Reopen(options);
+  int num_keys_per_file = 400;
+  for (int j = 0; j < 3; j++) {
+    // Write several keys.
+    const std::string value(10, 'x');
+    for (int i = 0; i < num_keys_per_file; i++) {
+      char key[100];
+      snprintf(key, sizeof(key), "B%08d%02d", i, j);
+      Put(key, value);
+    }
+    dbfull()->TEST_FlushMemTable();
+    // Make sure next file is much smaller so automatic compaction will not
+    // be triggered.
+    num_keys_per_file /= 2;
+  }
+
+  // Force a manual compaction; the three files written above hold
+  // 400 + 200 + 100 = 700 keys in total.
+  cfilter_count = 0;
+  filter->expect_manual_compaction_.store(true);
+  filter->expect_full_compaction_.store(false);  // Manual compaction always
+                                                 // sets this flag.
+  dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+  ASSERT_EQ(cfilter_count, 700);
+  ASSERT_EQ(NumSortedRuns(0), 1);
+
+  // Verify total number of keys is correct after manual compaction.
+  {
+    int count = 0;
+    int total = 0;
+    Arena arena;
+    ScopedArenaIterator iter(dbfull()->TEST_NewInternalIterator(&arena));
+    iter->SeekToFirst();
+    ASSERT_OK(iter->status());
+    while (iter->Valid()) {
+      ParsedInternalKey ikey(Slice(), 0, kTypeValue);
+      ikey.sequence = -1;
+      ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true);
+      total++;
+      if (ikey.sequence != 0) {
+        count++;
+      }
+      iter->Next();
+    }
+    ASSERT_EQ(total, 700);
+    ASSERT_EQ(count, 1);
+  }
+}
+
+// Compaction filters should only be applied to records that are newer than the
+// latest snapshot. This test inserts records and applies a delete filter.
+TEST_F(DBTestCompactionFilter, CompactionFilterSnapshot) {
+  Options options;
+  options.compaction_filter_factory = std::make_shared<DeleteFilterFactory>();
+  options.disable_auto_compactions = true;
+  options.create_if_missing = true;
+  options = CurrentOptions(options);
+  DestroyAndReopen(options);
+
+  // Put some data.
+  const Snapshot* snapshot = nullptr;
+  for (int table = 0; table < 4; ++table) {
+    for (int i = 0; i < 10; ++i) {
+      Put(ToString(table * 100 + i), "val");
+    }
+    Flush();
+
+    if (table == 0) {
+      snapshot = db_->GetSnapshot();
+    }
+  }
+  assert(snapshot != nullptr);
+
+  cfilter_count = 0;
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  // The filter should see (and delete) only the 30 records that are
+  // newer than the snapshot.
+  ASSERT_EQ(30U, cfilter_count);
+
+  // Release the snapshot and compact again -> now all records should be
+  // removed.
+  db_->ReleaseSnapshot(snapshot);
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ(0U, CountLiveFiles());
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+#if !defined(NDEBUG) || !defined(OS_WIN)
+  rocksdb::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+#else
+  return 0;
+#endif
+}
diff --git a/src/rocksdb/db/db_compaction_test.cc b/src/rocksdb/db/db_compaction_test.cc
new file mode 100644
index 0000000..e052fc7
--- /dev/null
+++ b/src/rocksdb/db/db_compaction_test.cc
@@ -0,0 +1,1858 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "port/stack_trace.h"
+#include "rocksdb/experimental.h"
+#include "util/db_test_util.h"
+#include "util/sync_point.h"
+namespace rocksdb {
+
+// SYNC_POINT is not supported in release builds on Windows.
+#if !defined(NDEBUG) || !defined(OS_WIN)
+
+
+class DBCompactionTest : public DBTestBase {
+ public:
+  DBCompactionTest() : DBTestBase("/db_compaction_test") {}
+};
+
+class DBCompactionTestWithParam : public DBTestBase,
+                                  public testing::WithParamInterface<uint32_t> {
+ public:
+  DBCompactionTestWithParam() : DBTestBase("/db_compaction_test") {
+    max_subcompactions_ = GetParam();
+  }
+
+  // Required if inheriting from testing::WithParamInterface<>
+  static void SetUpTestCase() {}
+  static void TearDownTestCase() {}
+
+  uint32_t max_subcompactions_;
+};
+
+namespace {
+class OnFileDeletionListener : public EventListener {
+ public:
+  OnFileDeletionListener() :
+      matched_count_(0),
+      expected_file_name_("") {}
+
+  void SetExpectedFileName(const std::string& file_name) {
+    expected_file_name_ = file_name;
+  }
+
+  void VerifyMatchedCount(size_t expected_value) {
+    ASSERT_EQ(matched_count_, expected_value);
+  }
+
+  void OnTableFileDeleted(
+      const TableFileDeletionInfo& info) override {
+    if (expected_file_name_ != "") {
+      ASSERT_EQ(expected_file_name_, info.file_path);
+      expected_file_name_ = "";
+      matched_count_++;
+    }
+  }
+
+ private:
+  size_t matched_count_;
+  std::string expected_file_name_;
+};
+
+static const int kCDTValueSize = 1000;
+static const int kCDTKeysPerBuffer = 4;
+static const int kCDTNumLevels = 8;
+Options DeletionTriggerOptions() {
+  Options options;
+  options.compression = kNoCompression;
+  options.write_buffer_size = kCDTKeysPerBuffer * (kCDTValueSize + 24);
+  options.min_write_buffer_number_to_merge = 1;
+  options.max_write_buffer_number_to_maintain = 0;
+  options.num_levels = kCDTNumLevels;
+  options.level0_file_num_compaction_trigger = 1;
+  options.target_file_size_base = options.write_buffer_size * 2;
+  options.target_file_size_multiplier = 2;
+  options.max_bytes_for_level_base =
+      options.target_file_size_base * options.target_file_size_multiplier;
+  options.max_bytes_for_level_multiplier = 2;
+  options.disable_auto_compactions = false;
+  return options;
+}
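+
+// For reference, the numbers above work out to small, deterministic sizes:
+// write_buffer_size = 4 * (1000 + 24) = 4096 bytes, so a memtable holds
+// roughly four ~1KB values; target_file_size_base = 4096 * 2 = 8192 bytes;
+// and max_bytes_for_level_base = 8192 * 2 = 16384 bytes.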
+
+bool HaveOverlappingKeyRanges(
+    const Comparator* c,
+    const SstFileMetaData& a, const SstFileMetaData& b) {
+  if (c->Compare(a.smallestkey, b.smallestkey) >= 0) {
+    if (c->Compare(a.smallestkey, b.largestkey) <= 0) {
+      // b.smallestkey <= a.smallestkey <= b.largestkey
+      return true;
+    }
+  } else if (c->Compare(a.largestkey, b.smallestkey) >= 0) {
+    // a.smallestkey < b.smallestkey <= a.largestkey
+    return true;
+  }
+  if (c->Compare(a.largestkey, b.largestkey) <= 0) {
+    if (c->Compare(a.largestkey, b.smallestkey) >= 0) {
+      // b.smallestkey <= a.largestkey <= b.largestkey
+      return true;
+    }
+  } else if (c->Compare(a.smallestkey, b.largestkey) <= 0) {
+    // a.smallestkey <= b.largestkey < a.largestkey
+    return true;
+  }
+  return false;
+}
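+
+// A worked example of the overlap check (illustrative key values): with a
+// bytewise comparator, files [b, f] and [d, k] overlap (d <= f), files
+// [b, f] and [f, k] overlap at the shared endpoint, while [b, e] and
+// [f, k] do not overlap.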
+
+// Identifies all files between level "min_level" and "max_level"
+// which have an overlapping key range with "input_file_meta".
+void GetOverlappingFileNumbersForLevelCompaction(
+    const ColumnFamilyMetaData& cf_meta,
+    const Comparator* comparator,
+    int min_level, int max_level,
+    const SstFileMetaData* input_file_meta,
+    std::set<std::string>* overlapping_file_names) {
+  std::set<const SstFileMetaData*> overlapping_files;
+  overlapping_files.insert(input_file_meta);
+  for (int m = min_level; m <= max_level; ++m) {
+    for (auto& file : cf_meta.levels[m].files) {
+      for (auto* included_file : overlapping_files) {
+        if (HaveOverlappingKeyRanges(
+                comparator, *included_file, file)) {
+          overlapping_files.insert(&file);
+          overlapping_file_names->insert(file.name);
+          break;
+        }
+      }
+    }
+  }
+}
+
+void VerifyCompactionResult(
+    const ColumnFamilyMetaData& cf_meta,
+    const std::set<std::string>& overlapping_file_numbers) {
+#ifndef NDEBUG
+  for (auto& level : cf_meta.levels) {
+    for (auto& file : level.files) {
+      assert(overlapping_file_numbers.find(file.name) ==
+             overlapping_file_numbers.end());
+    }
+  }
+#endif
+}
+
+const SstFileMetaData* PickFileRandomly(
+    const ColumnFamilyMetaData& cf_meta,
+    Random* rand,
+    int* level = nullptr) {
+  auto file_id = rand->Uniform(static_cast<int>(
+      cf_meta.file_count)) + 1;
+  for (auto& level_meta : cf_meta.levels) {
+    if (file_id <= level_meta.files.size()) {
+      if (level != nullptr) {
+        *level = level_meta.level;
+      }
+      auto result = rand->Uniform(file_id);
+      return &(level_meta.files[result]);
+    }
+    file_id -= level_meta.files.size();
+  }
+  assert(false);
+  return nullptr;
+}
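+
+// How the pick above works (a summary): file_id is drawn uniformly from
+// [1, file_count] and walked down the per-level file lists; the first level
+// whose file count covers the remaining file_id supplies the file, so levels
+// with more files are proportionally more likely to be chosen, and a file
+// among that level's first file_id entries is then picked at random.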
+}  // anonymous namespace
+
+// All the TEST_P tests run once with subcompactions disabled (i.e.
+// options.max_subcompactions = 1) and once with them enabled.
+TEST_P(DBCompactionTestWithParam, CompactionDeletionTrigger) {
+  for (int tid = 0; tid < 3; ++tid) {
+    uint64_t db_size[2];
+    Options options = CurrentOptions(DeletionTriggerOptions());
+    options.max_subcompactions = max_subcompactions_;
+
+    if (tid == 1) {
+      // the following only disables stats updates in DB::Open()
+      // and should not affect the result of this test.
+      options.skip_stats_update_on_db_open = true;
+    } else if (tid == 2) {
+      // third pass with universal compaction
+      options.compaction_style = kCompactionStyleUniversal;
+      options.num_levels = 1;
+    }
+
+    DestroyAndReopen(options);
+    Random rnd(301);
+
+    const int kTestSize = kCDTKeysPerBuffer * 1024;
+    std::vector<std::string> values;
+    for (int k = 0; k < kTestSize; ++k) {
+      values.push_back(RandomString(&rnd, kCDTValueSize));
+      ASSERT_OK(Put(Key(k), values[k]));
+    }
+    dbfull()->TEST_WaitForFlushMemTable();
+    dbfull()->TEST_WaitForCompact();
+    db_size[0] = Size(Key(0), Key(kTestSize - 1));
+
+    for (int k = 0; k < kTestSize; ++k) {
+      ASSERT_OK(Delete(Key(k)));
+    }
+    dbfull()->TEST_WaitForFlushMemTable();
+    dbfull()->TEST_WaitForCompact();
+    db_size[1] = Size(Key(0), Key(kTestSize - 1));
+
+    // must have much smaller db size.
+    ASSERT_GT(db_size[0] / 3, db_size[1]);
+  }
+}
+
+TEST_F(DBCompactionTest, SkipStatsUpdateTest) {
+  // This test verifies that UpdateAccumulatedStats is not run, by observing
+  // the compaction behavior when there are many deletion entries.
+  // The test will need to be updated if the internal behavior changes.
+
+  Options options = DeletionTriggerOptions();
+  options = CurrentOptions(options);
+  options.env = env_;
+  DestroyAndReopen(options);
+  Random rnd(301);
+
+  const int kTestSize = kCDTKeysPerBuffer * 512;
+  std::vector<std::string> values;
+  for (int k = 0; k < kTestSize; ++k) {
+    values.push_back(RandomString(&rnd, kCDTValueSize));
+    ASSERT_OK(Put(Key(k), values[k]));
+  }
+  dbfull()->TEST_WaitForFlushMemTable();
+  dbfull()->TEST_WaitForCompact();
+
+  for (int k = 0; k < kTestSize; ++k) {
+    ASSERT_OK(Delete(Key(k)));
+  }
+
+  // Reopen the DB with stats-update disabled
+  options.skip_stats_update_on_db_open = true;
+  env_->random_file_open_counter_.store(0);
+  Reopen(options);
+
+  // As stats-update is disabled, we expect a very low
+  // number of random file opens.
+  ASSERT_LT(env_->random_file_open_counter_.load(), 5);
+
+  // Repeat the reopen process, but this time we enable
+  // stats-update.
+  options.skip_stats_update_on_db_open = false;
+  env_->random_file_open_counter_.store(0);
+  Reopen(options);
+
+  // Since we do a normal stats update on db-open, there
+  // will be more random file opens.
+}
+
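+// Context for the next test: with new_table_reader_for_compaction_inputs
+// set, compaction inputs get their own table readers instead of going
+// through the shared table cache, which is why the assertions below count
+// new table readers but no extra table cache lookups for compaction inputs.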
+TEST_F(DBCompactionTest, TestTableReaderForCompaction) {
+  Options options;
+  options = CurrentOptions(options);
+  options.env = env_;
+  options.new_table_reader_for_compaction_inputs = true;
+  options.max_open_files = 100;
+  options.level0_file_num_compaction_trigger = 3;
+  DestroyAndReopen(options);
+  Random rnd(301);
+
+  int num_table_cache_lookup = 0;
+  int num_new_table_reader = 0;
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "TableCache::FindTable:0", [&](void* arg) {
+        assert(arg != nullptr);
+        bool no_io = *(reinterpret_cast<bool*>(arg));
+        if (!no_io) {
+          // filter out table cache lookups issued for table-properties queries.
+          num_table_cache_lookup++;
+        }
+      });
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "TableCache::GetTableReader:0",
+      [&](void* arg) { num_new_table_reader++; });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  for (int k = 0; k < options.level0_file_num_compaction_trigger; ++k) {
+    ASSERT_OK(Put(Key(k), Key(k)));
+    ASSERT_OK(Put(Key(10 - k), "bar"));
+    if (k < options.level0_file_num_compaction_trigger - 1) {
+      num_table_cache_lookup = 0;
+      Flush();
+      dbfull()->TEST_WaitForCompact();
+      // preloading the iterator issues one table cache lookup and creates
+      // a new table reader.
+      ASSERT_EQ(num_table_cache_lookup, 1);
+      ASSERT_EQ(num_new_table_reader, 1);
+
+      num_table_cache_lookup = 0;
+      num_new_table_reader = 0;
+      ASSERT_EQ(Key(k), Get(Key(k)));
+      // The iterator is looked up from the table cache; no new
+      // table reader needs to be created.
+      ASSERT_EQ(num_table_cache_lookup, 1);
+      ASSERT_EQ(num_new_table_reader, 0);
+    }
+  }
+
+  num_table_cache_lookup = 0;
+  num_new_table_reader = 0;
+  Flush();
+  dbfull()->TEST_WaitForCompact();
+  // Preloading the iterator issues one table cache lookup and creates
+  // a new table reader. One file is created for flush and one for compaction.
+  // Compaction inputs make no table cache lookups.
+  ASSERT_EQ(num_table_cache_lookup, 2);
+  // New table readers are created for:
+  // (1) 1 for verifying flush results
+  // (2) 3 for the compaction input files
+  // (3) 1 for verifying compaction results.
+  ASSERT_EQ(num_new_table_reader, 5);
+
+  num_table_cache_lookup = 0;
+  num_new_table_reader = 0;
+  ASSERT_EQ(Key(1), Get(Key(1)));
+  ASSERT_EQ(num_table_cache_lookup, 1);
+  ASSERT_EQ(num_new_table_reader, 0);
+
+  num_table_cache_lookup = 0;
+  num_new_table_reader = 0;
+  CompactRangeOptions cro;
+  cro.change_level = true;
+  cro.target_level = 2;
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+  db_->CompactRange(cro, nullptr, nullptr);
+  // Only the verification of compaction outputs issues a table cache lookup.
+  ASSERT_EQ(num_table_cache_lookup, 1);
+  // One for compaction input, one for verifying compaction results.
+  ASSERT_EQ(num_new_table_reader, 2);
+
+  num_table_cache_lookup = 0;
+  num_new_table_reader = 0;
+  ASSERT_EQ(Key(1), Get(Key(1)));
+  ASSERT_EQ(num_table_cache_lookup, 1);
+  ASSERT_EQ(num_new_table_reader, 0);
+
+  rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(DBCompactionTestWithParam, CompactionDeletionTriggerReopen) {
+  for (int tid = 0; tid < 2; ++tid) {
+    uint64_t db_size[3];
+    Options options = CurrentOptions(DeletionTriggerOptions());
+    options.max_subcompactions = max_subcompactions_;
+
+    if (tid == 1) {
+      // second pass with universal compaction
+      options.compaction_style = kCompactionStyleUniversal;
+      options.num_levels = 1;
+    }
+
+    DestroyAndReopen(options);
+    Random rnd(301);
+
+    // round 1 --- insert key/value pairs.
+    const int kTestSize = kCDTKeysPerBuffer * 512;
+    std::vector<std::string> values;
+    for (int k = 0; k < kTestSize; ++k) {
+      values.push_back(RandomString(&rnd, kCDTValueSize));
+      ASSERT_OK(Put(Key(k), values[k]));
+    }
+    dbfull()->TEST_WaitForFlushMemTable();
+    dbfull()->TEST_WaitForCompact();
+    db_size[0] = Size(Key(0), Key(kTestSize - 1));
+    Close();
+
+    // round 2 --- disable auto-compactions and issue deletions.
+    options.create_if_missing = false;
+    options.disable_auto_compactions = true;
+    Reopen(options);
+
+    for (int k = 0; k < kTestSize; ++k) {
+      ASSERT_OK(Delete(Key(k)));
+    }
+    db_size[1] = Size(Key(0), Key(kTestSize - 1));
+    Close();
+    // as auto_compaction is off, we shouldn't see much reduction
+    // in db size.
+    ASSERT_LT(db_size[0] / 3, db_size[1]);
+
+    // round 3 --- reopen db with auto_compaction on and see if
+    // deletion compensation still works.
+    options.disable_auto_compactions = false;
+    Reopen(options);
+    // insert relatively small amount of data to trigger auto compaction.
+    for (int k = 0; k < kTestSize / 10; ++k) {
+      ASSERT_OK(Put(Key(k), values[k]));
+    }
+    dbfull()->TEST_WaitForFlushMemTable();
+    dbfull()->TEST_WaitForCompact();
+    db_size[2] = Size(Key(0), Key(kTestSize - 1));
+    // this time we're expecting significant drop in size.
+    ASSERT_GT(db_size[0] / 3, db_size[2]);
+  }
+}
+
+TEST_F(DBCompactionTest, DisableStatsUpdateReopen) {
+  uint64_t db_size[3];
+  for (int test = 0; test < 2; ++test) {
+    Options options = CurrentOptions(DeletionTriggerOptions());
+    options.skip_stats_update_on_db_open = (test == 0);
+
+    env_->random_read_counter_.Reset();
+    DestroyAndReopen(options);
+    Random rnd(301);
+
+    // round 1 --- insert key/value pairs.
+    const int kTestSize = kCDTKeysPerBuffer * 512;
+    std::vector<std::string> values;
+    for (int k = 0; k < kTestSize; ++k) {
+      values.push_back(RandomString(&rnd, kCDTValueSize));
+      ASSERT_OK(Put(Key(k), values[k]));
+    }
+    dbfull()->TEST_WaitForFlushMemTable();
+    dbfull()->TEST_WaitForCompact();
+    db_size[0] = Size(Key(0), Key(kTestSize - 1));
+    Close();
+
+    // round 2 --- disable auto-compactions and issue deletions.
+    options.create_if_missing = false;
+    options.disable_auto_compactions = true;
+
+    env_->random_read_counter_.Reset();
+    Reopen(options);
+
+    for (int k = 0; k < kTestSize; ++k) {
+      ASSERT_OK(Delete(Key(k)));
+    }
+    db_size[1] = Size(Key(0), Key(kTestSize - 1));
+    Close();
+    // as auto_compaction is off, we shouldn't see much reduction
+    // in db size.
+    ASSERT_LT(db_size[0] / 3, db_size[1]);
+
+    // round 3 --- reopen db with auto_compaction on and see if
+    // deletion compensation still works.
+    options.disable_auto_compactions = false;
+    Reopen(options);
+    dbfull()->TEST_WaitForFlushMemTable();
+    dbfull()->TEST_WaitForCompact();
+    db_size[2] = Size(Key(0), Key(kTestSize - 1));
+
+    if (options.skip_stats_update_on_db_open) {
+      // If the stats update on DB::Open is disabled, we don't expect
+      // the deletion entries to take effect.
+      ASSERT_LT(db_size[0] / 3, db_size[2]);
+    } else {
+      // Otherwise, we should see a significant drop in db size.
+      ASSERT_GT(db_size[0] / 3, db_size[2]);
+    }
+  }
+}
+
+
+TEST_P(DBCompactionTestWithParam, CompactionTrigger) {
+  Options options;
+  options.write_buffer_size = 110 << 10;  // 110KB
+  options.arena_block_size = 4 << 10;
+  options.num_levels = 3;
+  options.level0_file_num_compaction_trigger = 3;
+  options.max_subcompactions = max_subcompactions_;
+  options = CurrentOptions(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  Random rnd(301);
+
+  for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
+       num++) {
+    std::vector<std::string> values;
+    // Write 100KB (100 values, each 1K)
+    for (int i = 0; i < 100; i++) {
+      values.push_back(RandomString(&rnd, 990));
+      ASSERT_OK(Put(1, Key(i), values[i]));
+    }
+    dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+    ASSERT_EQ(NumTableFilesAtLevel(0, 1), num + 1);
+  }
+
+  // generate one more file in level-0, which should trigger a level-0 compaction
+  std::vector<std::string> values;
+  for (int i = 0; i < 100; i++) {
+    values.push_back(RandomString(&rnd, 990));
+    ASSERT_OK(Put(1, Key(i), values[i]));
+  }
+  dbfull()->TEST_WaitForCompact();
+
+  ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(1, 1), 1);
+}
+
+TEST_P(DBCompactionTestWithParam, CompactionsGenerateMultipleFiles) {
+  Options options;
+  options.write_buffer_size = 100000000;        // Large write buffer
+  options.max_subcompactions = max_subcompactions_;
+  options = CurrentOptions(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  Random rnd(301);
+
+  // Write 8MB (80 values, each 100K)
+  ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+  std::vector<std::string> values;
+  for (int i = 0; i < 80; i++) {
+    values.push_back(RandomString(&rnd, 100000));
+    ASSERT_OK(Put(1, Key(i), values[i]));
+  }
+
+  // Reopening moves updates to level-0
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+  dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1],
+                              true /* disallow trivial move */);
+
+  ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+  ASSERT_GT(NumTableFilesAtLevel(1, 1), 1);
+  for (int i = 0; i < 80; i++) {
+    ASSERT_EQ(Get(1, Key(i)), values[i]);
+  }
+}
+
+TEST_F(DBCompactionTest, MinorCompactionsHappen) {
+  do {
+    Options options;
+    options.write_buffer_size = 10000;
+    options = CurrentOptions(options);
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    const int N = 500;
+
+    int starting_num_tables = TotalTableFiles(1);
+    for (int i = 0; i < N; i++) {
+      ASSERT_OK(Put(1, Key(i), Key(i) + std::string(1000, 'v')));
+    }
+    int ending_num_tables = TotalTableFiles(1);
+    ASSERT_GT(ending_num_tables, starting_num_tables);
+
+    for (int i = 0; i < N; i++) {
+      ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(1, Key(i)));
+    }
+
+    ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+    for (int i = 0; i < N; i++) {
+      ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(1, Key(i)));
+    }
+  } while (ChangeCompactOptions());
+}
+
+// Check that writes done during a memtable compaction are recovered
+// if the database is shut down during the memtable compaction.
+TEST_F(DBCompactionTest, RecoverDuringMemtableCompaction) {
+  do {
+    Options options;
+    options.env = env_;
+    options.write_buffer_size = 1000000;
+    options = CurrentOptions(options);
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    // Trigger a long memtable compaction and reopen the database during it
+    ASSERT_OK(Put(1, "foo", "v1"));  // Goes to 1st log file
+    ASSERT_OK(Put(1, "big1", std::string(10000000, 'x')));  // Fills memtable
+    ASSERT_OK(Put(1, "big2", std::string(1000, 'y')));  // Triggers compaction
+    ASSERT_OK(Put(1, "bar", "v2"));                     // Goes to new log file
+
+    ReopenWithColumnFamilies({"default", "pikachu"}, options);
+    ASSERT_EQ("v1", Get(1, "foo"));
+    ASSERT_EQ("v2", Get(1, "bar"));
+    ASSERT_EQ(std::string(10000000, 'x'), Get(1, "big1"));
+    ASSERT_EQ(std::string(1000, 'y'), Get(1, "big2"));
+  } while (ChangeOptions());
+}
+
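+// Background for the trivial-move tests that follow: when a compaction's
+// input files do not overlap anything in the output level, RocksDB can
+// "trivially move" them by re-assigning the files to the lower level in
+// the manifest instead of rewriting them; the sync points hooked below
+// count how often each path is taken.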
+TEST_P(DBCompactionTestWithParam, TrivialMoveOneFile) {
+  int32_t trivial_move = 0;
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:TrivialMove",
+      [&](void* arg) { trivial_move++; });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  Options options;
+  options.write_buffer_size = 100000000;
+  options.max_subcompactions = max_subcompactions_;
+  options = CurrentOptions(options);
+  DestroyAndReopen(options);
+
+  int32_t num_keys = 80;
+  int32_t value_size = 100 * 1024;  // 100 KB
+
+  Random rnd(301);
+  std::vector<std::string> values;
+  for (int i = 0; i < num_keys; i++) {
+    values.push_back(RandomString(&rnd, value_size));
+    ASSERT_OK(Put(Key(i), values[i]));
+  }
+
+  // Reopening moves updates to L0
+  Reopen(options);
+  ASSERT_EQ(NumTableFilesAtLevel(0, 0), 1);  // 1 file in L0
+  ASSERT_EQ(NumTableFilesAtLevel(1, 0), 0);  // 0 files in L1
+
+  std::vector<LiveFileMetaData> metadata;
+  db_->GetLiveFilesMetaData(&metadata);
+  ASSERT_EQ(metadata.size(), 1U);
+  LiveFileMetaData level0_file = metadata[0];  // L0 file meta
+
+  // Compaction will initiate a trivial move from L0 to L1
+  dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+
+  // File moved From L0 to L1
+  ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0);  // 0 files in L0
+  ASSERT_EQ(NumTableFilesAtLevel(1, 0), 1);  // 1 file in L1
+
+  metadata.clear();
+  db_->GetLiveFilesMetaData(&metadata);
+  ASSERT_EQ(metadata.size(), 1U);
+  ASSERT_EQ(metadata[0].name /* level1_file.name */, level0_file.name);
+  ASSERT_EQ(metadata[0].size /* level1_file.size */, level0_file.size);
+
+  for (int i = 0; i < num_keys; i++) {
+    ASSERT_EQ(Get(Key(i)), values[i]);
+  }
+
+  ASSERT_EQ(trivial_move, 1);
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBCompactionTestWithParam, TrivialMoveNonOverlappingFiles) {
+  int32_t trivial_move = 0;
+  int32_t non_trivial_move = 0;
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:TrivialMove",
+      [&](void* arg) { trivial_move++; });
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:NonTrivial",
+      [&](void* arg) { non_trivial_move++; });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.write_buffer_size = 10 * 1024 * 1024;
+  options.max_subcompactions = max_subcompactions_;
+
+  DestroyAndReopen(options);
+  // non-overlapping ranges
+  std::vector<std::pair<int32_t, int32_t>> ranges = {
+    {100, 199},
+    {300, 399},
+    {0, 99},
+    {200, 299},
+    {600, 699},
+    {400, 499},
+    {500, 550},
+    {551, 599},
+  };
+  int32_t value_size = 10 * 1024;  // 10 KB
+
+  Random rnd(301);
+  std::map<int32_t, std::string> values;
+  for (uint32_t i = 0; i < ranges.size(); i++) {
+    for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) {
+      values[j] = RandomString(&rnd, value_size);
+      ASSERT_OK(Put(Key(j), values[j]));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  int32_t level0_files = NumTableFilesAtLevel(0, 0);
+  ASSERT_EQ(level0_files, ranges.size());    // Multiple files in L0
+  ASSERT_EQ(NumTableFilesAtLevel(1, 0), 0);  // No files in L1
+
+  // Since the data is non-overlapping, we expect compaction to initiate
+  // a trivial move
+  db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+  // We expect that all the files were trivially moved from L0 to L1
+  ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(1, 0) /* level1_files */, level0_files);
+
+  for (uint32_t i = 0; i < ranges.size(); i++) {
+    for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) {
+      ASSERT_EQ(Get(Key(j)), values[j]);
+    }
+  }
+
+  ASSERT_EQ(trivial_move, 1);
+  ASSERT_EQ(non_trivial_move, 0);
+
+  trivial_move = 0;
+  non_trivial_move = 0;
+  values.clear();
+  DestroyAndReopen(options);
+  // Same ranges as above but overlapping
+  ranges = {
+    {100, 199},
+    {300, 399},
+    {0, 99},
+    {200, 299},
+    {600, 699},
+    {400, 499},
+    {500, 560},  // this range overlaps with the next one
+    {551, 599},
+  };
+  for (uint32_t i = 0; i < ranges.size(); i++) {
+    for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) {
+      values[j] = RandomString(&rnd, value_size);
+      ASSERT_OK(Put(Key(j), values[j]));
+    }
+    ASSERT_OK(Flush());
+  }
+
+  db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+
+  for (uint32_t i = 0; i < ranges.size(); i++) {
+    for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) {
+      ASSERT_EQ(Get(Key(j)), values[j]);
+    }
+  }
+  ASSERT_EQ(trivial_move, 0);
+  ASSERT_EQ(non_trivial_move, 1);
+
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBCompactionTestWithParam, TrivialMoveTargetLevel) {
+  int32_t trivial_move = 0;
+  int32_t non_trivial_move = 0;
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:TrivialMove",
+      [&](void* arg) { trivial_move++; });
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:NonTrivial",
+      [&](void* arg) { non_trivial_move++; });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.write_buffer_size = 10 * 1024 * 1024;
+  options.num_levels = 7;
+  options.max_subcompactions = max_subcompactions_;
+
+  DestroyAndReopen(options);
+  int32_t value_size = 10 * 1024;  // 10 KB
+
+  // Add 2 non-overlapping files
+  Random rnd(301);
+  std::map<int32_t, std::string> values;
+
+  // file 1 [0 => 300]
+  for (int32_t i = 0; i <= 300; i++) {
+    values[i] = RandomString(&rnd, value_size);
+    ASSERT_OK(Put(Key(i), values[i]));
+  }
+  ASSERT_OK(Flush());
+
+  // file 2 [600 => 700]
+  for (int32_t i = 600; i <= 700; i++) {
+    values[i] = RandomString(&rnd, value_size);
+    ASSERT_OK(Put(Key(i), values[i]));
+  }
+  ASSERT_OK(Flush());
+
+  // 2 files in L0
+  ASSERT_EQ("2", FilesPerLevel(0));
+  CompactRangeOptions compact_options;
+  compact_options.change_level = true;
+  compact_options.target_level = 6;
+  ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+  // 2 files in L6
+  ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel(0));
+
+  ASSERT_EQ(trivial_move, 1);
+  ASSERT_EQ(non_trivial_move, 0);
+
+  for (int32_t i = 0; i <= 300; i++) {
+    ASSERT_EQ(Get(Key(i)), values[i]);
+  }
+  for (int32_t i = 600; i <= 700; i++) {
+    ASSERT_EQ(Get(Key(i)), values[i]);
+  }
+}
+
+TEST_P(DBCompactionTestWithParam, TrivialMoveToLastLevelWithFiles) {
+  int32_t trivial_move = 0;
+  int32_t non_trivial_move = 0;
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:TrivialMove",
+      [&](void* arg) { trivial_move++; });
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:NonTrivial",
+      [&](void* arg) { non_trivial_move++; });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  Options options;
+  options.write_buffer_size = 100000000;
+  options.max_subcompactions = max_subcompactions_;
+  options = CurrentOptions(options);
+  DestroyAndReopen(options);
+
+  int32_t value_size = 10 * 1024;  // 10 KB
+
+  Random rnd(301);
+  std::vector<std::string> values;
+  // File with keys [ 0 => 99 ]
+  for (int i = 0; i < 100; i++) {
+    values.push_back(RandomString(&rnd, value_size));
+    ASSERT_OK(Put(Key(i), values[i]));
+  }
+  ASSERT_OK(Flush());
+
+  ASSERT_EQ("1", FilesPerLevel(0));
+  // Compaction will do L0=>L1 (trivial move) then move L1 files to L3
+  CompactRangeOptions compact_options;
+  compact_options.change_level = true;
+  compact_options.target_level = 3;
+  ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+  ASSERT_EQ("0,0,0,1", FilesPerLevel(0));
+  ASSERT_EQ(trivial_move, 1);
+  ASSERT_EQ(non_trivial_move, 0);
+
+  // File with keys [ 100 => 199 ]
+  for (int i = 100; i < 200; i++) {
+    values.push_back(RandomString(&rnd, value_size));
+    ASSERT_OK(Put(Key(i), values[i]));
+  }
+  ASSERT_OK(Flush());
+
+  ASSERT_EQ("1,0,0,1", FilesPerLevel(0));
+  // Compaction will do L0=>L1 L1=>L2 L2=>L3 (3 trivial moves)
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("0,0,0,2", FilesPerLevel(0));
+  ASSERT_EQ(trivial_move, 4);
+  ASSERT_EQ(non_trivial_move, 0);
+
+  for (int i = 0; i < 200; i++) {
+    ASSERT_EQ(Get(Key(i)), values[i]);
+  }
+
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
+
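+// Context for the multi-path tests below: each db_paths entry carries a
+// target size that acts as a soft capacity budget; roughly speaking, once
+// the data expected to land in a path would exceed its budget, newer SST
+// files spill to the next listed path (exact placement is an
+// implementation detail).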
+TEST_P(DBCompactionTestWithParam, LevelCompactionThirdPath) {
+  Options options = CurrentOptions();
+  options.db_paths.emplace_back(dbname_, 500 * 1024);
+  options.db_paths.emplace_back(dbname_ + "_2", 4 * 1024 * 1024);
+  options.db_paths.emplace_back(dbname_ + "_3", 1024 * 1024 * 1024);
+  options.compaction_style = kCompactionStyleLevel;
+  options.write_buffer_size = 110 << 10;  // 110KB
+  options.arena_block_size = 4 << 10;
+  options.level0_file_num_compaction_trigger = 2;
+  options.num_levels = 4;
+  options.max_bytes_for_level_base = 400 * 1024;
+  options.max_subcompactions = max_subcompactions_;
+  //  options = CurrentOptions(options);
+
+  std::vector<std::string> filenames;
+  env_->GetChildren(options.db_paths[1].path, &filenames);
+  // Delete archival files.
+  for (size_t i = 0; i < filenames.size(); ++i) {
+    env_->DeleteFile(options.db_paths[1].path + "/" + filenames[i]);
+  }
+  env_->DeleteDir(options.db_paths[1].path);
+  Reopen(options);
+
+  Random rnd(301);
+  int key_idx = 0;
+
+  // First three 110KB files are not going to second path.
+  // After that, (100K, 200K)
+  for (int num = 0; num < 3; num++) {
+    GenerateNewFile(&rnd, &key_idx);
+  }
+
+  // Another 110KB file triggers a compaction to a 400K file, filling up the first path
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(3, GetSstFileCount(options.db_paths[1].path));
+
+  // (1, 4)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4", FilesPerLevel(0));
+  ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  // (1, 4, 1)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,1", FilesPerLevel(0));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  // (1, 4, 2)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,2", FilesPerLevel(0));
+  ASSERT_EQ(2, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  // (1, 4, 3)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,3", FilesPerLevel(0));
+  ASSERT_EQ(3, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  // (1, 4, 4)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,4", FilesPerLevel(0));
+  ASSERT_EQ(4, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  // (1, 4, 5)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,5", FilesPerLevel(0));
+  ASSERT_EQ(5, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  // (1, 4, 6)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,6", FilesPerLevel(0));
+  ASSERT_EQ(6, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  // (1, 4, 7)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,7", FilesPerLevel(0));
+  ASSERT_EQ(7, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  // (1, 4, 8)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,8", FilesPerLevel(0));
+  ASSERT_EQ(8, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  for (int i = 0; i < key_idx; i++) {
+    auto v = Get(Key(i));
+    ASSERT_NE(v, "NOT_FOUND");
+    ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+  }
+
+  Reopen(options);
+
+  for (int i = 0; i < key_idx; i++) {
+    auto v = Get(Key(i));
+    ASSERT_NE(v, "NOT_FOUND");
+    ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+  }
+
+  Destroy(options);
+}
+
+TEST_P(DBCompactionTestWithParam, LevelCompactionPathUse) {
+  Options options = CurrentOptions();
+  options.db_paths.emplace_back(dbname_, 500 * 1024);
+  options.db_paths.emplace_back(dbname_ + "_2", 4 * 1024 * 1024);
+  options.db_paths.emplace_back(dbname_ + "_3", 1024 * 1024 * 1024);
+  options.compaction_style = kCompactionStyleLevel;
+  options.write_buffer_size = 110 << 10;  // 110KB
+  options.arena_block_size = 4 << 10;
+  options.level0_file_num_compaction_trigger = 2;
+  options.num_levels = 4;
+  options.max_bytes_for_level_base = 400 * 1024;
+  options.max_subcompactions = max_subcompactions_;
+  //  options = CurrentOptions(options);
+
+  std::vector<std::string> filenames;
+  env_->GetChildren(options.db_paths[1].path, &filenames);
+  // Delete archival files.
+  for (size_t i = 0; i < filenames.size(); ++i) {
+    env_->DeleteFile(options.db_paths[1].path + "/" + filenames[i]);
+  }
+  env_->DeleteDir(options.db_paths[1].path);
+  Reopen(options);
+
+  Random rnd(301);
+  int key_idx = 0;
+
+  // Always gets compacted into one level-1 file,
+  // plus zero or one level-0 files
+  for (int num = 0; num < 3; num++) {
+    key_idx = 0;
+    GenerateNewFile(&rnd, &key_idx);
+  }
+
+  key_idx = 0;
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+
+  key_idx = 0;
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,1", FilesPerLevel(0));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  key_idx = 0;
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("0,1", FilesPerLevel(0));
+  ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+  key_idx = 0;
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,1", FilesPerLevel(0));
+  ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  key_idx = 0;
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("0,1", FilesPerLevel(0));
+  ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+  key_idx = 0;
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,1", FilesPerLevel(0));
+  ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  key_idx = 0;
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("0,1", FilesPerLevel(0));
+  ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+  key_idx = 0;
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,1", FilesPerLevel(0));
+  ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  key_idx = 0;
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("0,1", FilesPerLevel(0));
+  ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+  key_idx = 0;
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,1", FilesPerLevel(0));
+  ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  for (int i = 0; i < key_idx; i++) {
+    auto v = Get(Key(i));
+    ASSERT_NE(v, "NOT_FOUND");
+    ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+  }
+
+  Reopen(options);
+
+  for (int i = 0; i < key_idx; i++) {
+    auto v = Get(Key(i));
+    ASSERT_NE(v, "NOT_FOUND");
+    ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+  }
+
+  Destroy(options);
+}
+
+TEST_P(DBCompactionTestWithParam, ConvertCompactionStyle) {
+  Random rnd(301);
+  int max_key_level_insert = 200;
+  int max_key_universal_insert = 600;
+
+  // Stage 1: generate a db with level compaction
+  Options options;
+  options.write_buffer_size = 110 << 10;  // 110KB
+  options.arena_block_size = 4 << 10;
+  options.num_levels = 4;
+  options.level0_file_num_compaction_trigger = 3;
+  options.max_bytes_for_level_base = 500 << 10;  // 500KB
+  options.max_bytes_for_level_multiplier = 1;
+  options.target_file_size_base = 200 << 10;  // 200KB
+  options.target_file_size_multiplier = 1;
+  options.max_subcompactions = max_subcompactions_;
+  options = CurrentOptions(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  for (int i = 0; i <= max_key_level_insert; i++) {
+    // each value is 10K
+    ASSERT_OK(Put(1, Key(i), RandomString(&rnd, 10000)));
+  }
+  ASSERT_OK(Flush(1));
+  dbfull()->TEST_WaitForCompact();
+
+  ASSERT_GT(TotalTableFiles(1, 4), 1);
+  int non_level0_num_files = 0;
+  for (int i = 1; i < options.num_levels; i++) {
+    non_level0_num_files += NumTableFilesAtLevel(i, 1);
+  }
+  ASSERT_GT(non_level0_num_files, 0);
+
+  // Stage 2: reopen with universal compaction - should fail
+  options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = 1;
+  options = CurrentOptions(options);
+  Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, options);
+  ASSERT_TRUE(s.IsInvalidArgument());
+
+  // Stage 3: compact into a single file and move the file to level 0
+  options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.target_file_size_base = INT_MAX;
+  options.target_file_size_multiplier = 1;
+  options.max_bytes_for_level_base = INT_MAX;
+  options.max_bytes_for_level_multiplier = 1;
+  options.num_levels = 4;
+  options = CurrentOptions(options);
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+  CompactRangeOptions compact_options;
+  compact_options.change_level = true;
+  compact_options.target_level = 0;
+  compact_options.bottommost_level_compaction =
+      BottommostLevelCompaction::kForce;
+  dbfull()->CompactRange(compact_options, handles_[1], nullptr, nullptr);
+
+  // Only 1 file in L0
+  ASSERT_EQ("1", FilesPerLevel(1));
+
+  // Stage 4: re-open in universal compaction style and do some db operations
+  options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = 4;
+  options.write_buffer_size = 110 << 10;  // 110KB
+  options.arena_block_size = 4 << 10;
+  options.level0_file_num_compaction_trigger = 3;
+  options = CurrentOptions(options);
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+  options.num_levels = 1;
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+  for (int i = max_key_level_insert / 2; i <= max_key_universal_insert; i++) {
+    ASSERT_OK(Put(1, Key(i), RandomString(&rnd, 10000)));
+  }
+  dbfull()->Flush(FlushOptions());
+  ASSERT_OK(Flush(1));
+  dbfull()->TEST_WaitForCompact();
+
+  for (int i = 1; i < options.num_levels; i++) {
+    ASSERT_EQ(NumTableFilesAtLevel(i, 1), 0);
+  }
+
+  // verify keys inserted in both level compaction style and universal
+  // compaction style
+  std::string keys_in_db;
+  Iterator* iter = dbfull()->NewIterator(ReadOptions(), handles_[1]);
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    keys_in_db.append(iter->key().ToString());
+    keys_in_db.push_back(',');
+  }
+  delete iter;
+
+  std::string expected_keys;
+  for (int i = 0; i <= max_key_universal_insert; i++) {
+    expected_keys.append(Key(i));
+    expected_keys.push_back(',');
+  }
+
+  ASSERT_EQ(keys_in_db, expected_keys);
+}
+
+TEST_F(DBCompactionTest, L0_CompactionBug_Issue44_a) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    ASSERT_OK(Put(1, "b", "v"));
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    ASSERT_OK(Delete(1, "b"));
+    ASSERT_OK(Delete(1, "a"));
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    ASSERT_OK(Delete(1, "a"));
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    ASSERT_OK(Put(1, "a", "v"));
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    ASSERT_EQ("(a->v)", Contents(1));
+    env_->SleepForMicroseconds(1000000);  // Wait for compaction to finish
+    ASSERT_EQ("(a->v)", Contents(1));
+  } while (ChangeCompactOptions());
+}
+
+TEST_F(DBCompactionTest, L0_CompactionBug_Issue44_b) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    Put(1, "", "");
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    Delete(1, "e");
+    Put(1, "", "");
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    Put(1, "c", "cv");
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    Put(1, "", "");
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    Put(1, "", "");
+    env_->SleepForMicroseconds(1000000);  // Wait for compaction to finish
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    Put(1, "d", "dv");
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    Put(1, "", "");
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    Delete(1, "d");
+    Delete(1, "b");
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    ASSERT_EQ("(->)(c->cv)", Contents(1));
+    env_->SleepForMicroseconds(1000000);  // Wait for compaction to finish
+    ASSERT_EQ("(->)(c->cv)", Contents(1));
+  } while (ChangeCompactOptions());
+}
+
+TEST_P(DBCompactionTestWithParam, ManualCompaction) {
+  Options options = CurrentOptions();
+  options.max_subcompactions = max_subcompactions_;
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  // iter - 0 with 7 levels
+  // iter - 1 with 3 levels
+  for (int iter = 0; iter < 2; ++iter) {
+    MakeTables(3, "p", "q", 1);
+    ASSERT_EQ("1,1,1", FilesPerLevel(1));
+
+    // Compaction range falls before files
+    Compact(1, "", "c");
+    ASSERT_EQ("1,1,1", FilesPerLevel(1));
+
+    // Compaction range falls after files
+    Compact(1, "r", "z");
+    ASSERT_EQ("1,1,1", FilesPerLevel(1));
+
+    // Compaction range overlaps files
+    Compact(1, "p1", "p9");
+    ASSERT_EQ("0,0,1", FilesPerLevel(1));
+
+    // Populate a different range
+    MakeTables(3, "c", "e", 1);
+    ASSERT_EQ("1,1,2", FilesPerLevel(1));
+
+    // Compact just the new range
+    Compact(1, "b", "f");
+    ASSERT_EQ("0,0,2", FilesPerLevel(1));
+
+    // Compact all
+    MakeTables(1, "a", "z", 1);
+    ASSERT_EQ("1,0,2", FilesPerLevel(1));
+    db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr);
+    ASSERT_EQ("0,0,1", FilesPerLevel(1));
+
+    if (iter == 0) {
+      options = CurrentOptions();
+      options.max_background_flushes = 0;
+      options.num_levels = 3;
+      options.create_if_missing = true;
+      DestroyAndReopen(options);
+      CreateAndReopenWithCF({"pikachu"}, options);
+    }
+  }
+}
+
+
+TEST_P(DBCompactionTestWithParam, ManualLevelCompactionOutputPathId) {
+  Options options = CurrentOptions();
+  options.db_paths.emplace_back(dbname_ + "_2", 2 * 10485760);
+  options.db_paths.emplace_back(dbname_ + "_3", 100 * 10485760);
+  options.db_paths.emplace_back(dbname_ + "_4", 120 * 10485760);
+  options.max_subcompactions = max_subcompactions_;
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  // iter - 0 with 7 levels
+  // iter - 1 with 3 levels
+  for (int iter = 0; iter < 2; ++iter) {
+    for (int i = 0; i < 3; ++i) {
+      ASSERT_OK(Put(1, "p", "begin"));
+      ASSERT_OK(Put(1, "q", "end"));
+      ASSERT_OK(Flush(1));
+    }
+    ASSERT_EQ("3", FilesPerLevel(1));
+    ASSERT_EQ(3, GetSstFileCount(options.db_paths[0].path));
+    ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+    // Compaction range falls before files
+    Compact(1, "", "c");
+    ASSERT_EQ("3", FilesPerLevel(1));
+
+    // Compaction range falls after files
+    Compact(1, "r", "z");
+    ASSERT_EQ("3", FilesPerLevel(1));
+
+    // Compaction range overlaps files
+    Compact(1, "p1", "p9", 1);
+    ASSERT_EQ("0,1", FilesPerLevel(1));
+    ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+    ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path));
+    ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+    // Populate a different range
+    for (int i = 0; i < 3; ++i) {
+      ASSERT_OK(Put(1, "c", "begin"));
+      ASSERT_OK(Put(1, "e", "end"));
+      ASSERT_OK(Flush(1));
+    }
+    ASSERT_EQ("3,1", FilesPerLevel(1));
+
+    // Compact just the new range
+    Compact(1, "b", "f", 1);
+    ASSERT_EQ("0,2", FilesPerLevel(1));
+    ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path));
+    ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path));
+    ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+    // Compact all
+    ASSERT_OK(Put(1, "a", "begin"));
+    ASSERT_OK(Put(1, "z", "end"));
+    ASSERT_OK(Flush(1));
+    ASSERT_EQ("1,2", FilesPerLevel(1));
+    ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path));
+    ASSERT_EQ(1, GetSstFileCount(options.db_paths[0].path));
+    CompactRangeOptions compact_options;
+    compact_options.target_path_id = 1;
+    db_->CompactRange(compact_options, handles_[1], nullptr, nullptr);
+
+    ASSERT_EQ("0,1", FilesPerLevel(1));
+    ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+    ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path));
+    ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+    if (iter == 0) {
+      DestroyAndReopen(options);
+      options = CurrentOptions();
+      options.db_paths.emplace_back(dbname_ + "_2", 2 * 10485760);
+      options.db_paths.emplace_back(dbname_ + "_3", 100 * 10485760);
+      options.db_paths.emplace_back(dbname_ + "_4", 120 * 10485760);
+      options.max_background_flushes = 1;
+      options.num_levels = 3;
+      options.create_if_missing = true;
+      CreateAndReopenWithCF({"pikachu"}, options);
+    }
+  }
+}
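+
+// Editor's sketch (illustrative, unused): directing compaction output to a
+// secondary path, as verified above. It assumes the DB was opened with at
+// least two entries in options.db_paths.
+static void ExampleCompactToSecondPath(DB* db) {
+  CompactRangeOptions cro;
+  cro.target_path_id = 1;  // write output SSTs under options.db_paths[1]
+  Status s = db->CompactRange(cro, nullptr, nullptr);
+  (void)s;
+}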
+
+TEST_F(DBCompactionTest, FilesDeletedAfterCompaction) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    ASSERT_OK(Put(1, "foo", "v2"));
+    Compact(1, "a", "z");
+    const size_t num_files = CountLiveFiles();
+    for (int i = 0; i < 10; i++) {
+      ASSERT_OK(Put(1, "foo", "v2"));
+      Compact(1, "a", "z");
+    }
+    ASSERT_EQ(CountLiveFiles(), num_files);
+  } while (ChangeCompactOptions());
+}
+
+// Check level compaction with CompactFiles()
+TEST_P(DBCompactionTestWithParam, DISABLED_CompactFilesOnLevelCompaction) {
+  const int kTestKeySize = 16;
+  const int kTestValueSize = 984;
+  const int kEntrySize = kTestKeySize + kTestValueSize;
+  const int kEntriesPerBuffer = 100;
+  Options options;
+  options.create_if_missing = true;
+  options.write_buffer_size = kEntrySize * kEntriesPerBuffer;
+  options.compaction_style = kCompactionStyleLevel;
+  options.target_file_size_base = options.write_buffer_size;
+  options.max_bytes_for_level_base = options.target_file_size_base * 2;
+  options.level0_stop_writes_trigger = 2;
+  options.max_bytes_for_level_multiplier = 2;
+  options.compression = kNoCompression;
+  options.max_subcompactions = max_subcompactions_;
+  options = CurrentOptions(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  Random rnd(301);
+  for (int key = 64 * kEntriesPerBuffer; key >= 0; --key) {
+    ASSERT_OK(Put(1, ToString(key), RandomString(&rnd, kTestValueSize)));
+  }
+  dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+  dbfull()->TEST_WaitForCompact();
+
+  ColumnFamilyMetaData cf_meta;
+  dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+  int output_level = static_cast<int>(cf_meta.levels.size()) - 1;
+  for (int file_picked = 5; file_picked > 0; --file_picked) {
+    std::set<std::string> overlapping_file_names;
+    std::vector<std::string> compaction_input_file_names;
+    for (int f = 0; f < file_picked; ++f) {
+      int level;
+      auto file_meta = PickFileRandomly(cf_meta, &rnd, &level);
+      compaction_input_file_names.push_back(file_meta->name);
+      GetOverlappingFileNumbersForLevelCompaction(
+          cf_meta, options.comparator, level, output_level,
+          file_meta, &overlapping_file_names);
+    }
+
+    ASSERT_OK(dbfull()->CompactFiles(
+        CompactionOptions(), handles_[1],
+        compaction_input_file_names,
+        output_level));
+
+    // Make sure all overlapping files do not exist after compaction
+    dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+    VerifyCompactionResult(cf_meta, overlapping_file_names);
+  }
+
+  // make sure all key-values are still there.
+  for (int key = 64 * kEntriesPerBuffer; key >= 0; --key) {
+    ASSERT_NE(Get(1, ToString(key)), "NOT_FOUND");
+  }
+}
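+
+// Editor's sketch (illustrative, unused): the CompactFiles() pattern the
+// disabled test above drives. Real input names would come from
+// GetColumnFamilyMetaData(), as above; "000012.sst" is a placeholder.
+static void ExampleCompactFiles(DB* db) {
+  std::vector<std::string> inputs = {"000012.sst"};  // hypothetical SST name
+  Status s = db->CompactFiles(CompactionOptions(), db->DefaultColumnFamily(),
+                              inputs, 1 /* output_level */);
+  (void)s;
+}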
+
+TEST_P(DBCompactionTestWithParam, PartialCompactionFailure) {
+  Options options;
+  const int kKeySize = 16;
+  const int kKvSize = 1000;
+  const int kKeysPerBuffer = 100;
+  const int kNumL1Files = 5;
+  options.create_if_missing = true;
+  options.write_buffer_size = kKeysPerBuffer * kKvSize;
+  options.max_write_buffer_number = 2;
+  options.target_file_size_base =
+      options.write_buffer_size *
+      (options.max_write_buffer_number - 1);
+  options.level0_file_num_compaction_trigger = kNumL1Files;
+  options.max_bytes_for_level_base =
+      options.level0_file_num_compaction_trigger *
+      options.target_file_size_base;
+  options.max_bytes_for_level_multiplier = 2;
+  options.compression = kNoCompression;
+  options.max_subcompactions = max_subcompactions_;
+
+  env_->SetBackgroundThreads(1, Env::HIGH);
+  env_->SetBackgroundThreads(1, Env::LOW);
+  // stop the compaction thread until we simulate the file creation failure.
+  test::SleepingBackgroundTask sleeping_task_low;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+
+  options.env = env_;
+
+  DestroyAndReopen(options);
+
+  const int kNumInsertedKeys =
+      options.level0_file_num_compaction_trigger *
+      (options.max_write_buffer_number - 1) *
+      kKeysPerBuffer;
+
+  Random rnd(301);
+  std::vector<std::string> keys;
+  std::vector<std::string> values;
+  for (int k = 0; k < kNumInsertedKeys; ++k) {
+    keys.emplace_back(RandomString(&rnd, kKeySize));
+    values.emplace_back(RandomString(&rnd, kKvSize - kKeySize));
+    ASSERT_OK(Put(Slice(keys[k]), Slice(values[k])));
+    dbfull()->TEST_WaitForFlushMemTable();
+  }
+
+  dbfull()->TEST_FlushMemTable(true);
+  // Make sure the number of L0 files can trigger compaction.
+  ASSERT_GE(NumTableFilesAtLevel(0),
+            options.level0_file_num_compaction_trigger);
+
+  auto previous_num_level0_files = NumTableFilesAtLevel(0);
+
+  // Fail the first file creation.
+  env_->non_writable_count_ = 1;
+  sleeping_task_low.WakeUp();
+  sleeping_task_low.WaitUntilDone();
+
+  // Expect compaction to fail here as one file will fail its
+  // creation.
+  ASSERT_TRUE(!dbfull()->TEST_WaitForCompact().ok());
+
+  // Verify L0 -> L1 compaction does fail.
+  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+
+  // Verify all L0 files are still there.
+  ASSERT_EQ(NumTableFilesAtLevel(0), previous_num_level0_files);
+
+  // All key-values must exist after compaction fails.
+  for (int k = 0; k < kNumInsertedKeys; ++k) {
+    ASSERT_EQ(values[k], Get(keys[k]));
+  }
+
+  env_->non_writable_count_ = 0;
+
+  // Make sure RocksDB will not get into corrupted state.
+  Reopen(options);
+
+  // Verify again after reopen.
+  for (int k = 0; k < kNumInsertedKeys; ++k) {
+    ASSERT_EQ(values[k], Get(keys[k]));
+  }
+}
+
+TEST_P(DBCompactionTestWithParam, DeleteMovedFileAfterCompaction) {
+  // iter 1 -- delete_obsolete_files_period_micros == 0
+  for (int iter = 0; iter < 2; ++iter) {
+    // This test triggers move compaction and verifies that the file is not
+    // deleted when it's part of move compaction
+    Options options = CurrentOptions();
+    options.env = env_;
+    if (iter == 1) {
+      options.delete_obsolete_files_period_micros = 0;
+    }
+    options.create_if_missing = true;
+    options.level0_file_num_compaction_trigger =
+        2;  // trigger compaction when we have 2 files
+    OnFileDeletionListener* listener = new OnFileDeletionListener();
+    options.listeners.emplace_back(listener);
+    options.max_subcompactions = max_subcompactions_;
+    DestroyAndReopen(options);
+
+    Random rnd(301);
+    // Create two 1MB sst files
+    for (int i = 0; i < 2; ++i) {
+      // Create 1MB sst file
+      for (int j = 0; j < 100; ++j) {
+        ASSERT_OK(Put(Key(i * 50 + j), RandomString(&rnd, 10 * 1024)));
+      }
+      ASSERT_OK(Flush());
+    }
+    // this should execute L0->L1
+    dbfull()->TEST_WaitForCompact();
+    ASSERT_EQ("0,1", FilesPerLevel(0));
+
+    // block compactions
+    test::SleepingBackgroundTask sleeping_task;
+    env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task,
+                   Env::Priority::LOW);
+
+    options.max_bytes_for_level_base = 1024 * 1024;  // 1 MB
+    Reopen(options);
+    std::unique_ptr<Iterator> iterator(db_->NewIterator(ReadOptions()));
+    ASSERT_EQ("0,1", FilesPerLevel(0));
+    // let compactions go
+    sleeping_task.WakeUp();
+    sleeping_task.WaitUntilDone();
+
+    // this should execute L1->L2 (move)
+    dbfull()->TEST_WaitForCompact();
+
+    ASSERT_EQ("0,0,1", FilesPerLevel(0));
+
+    std::vector<LiveFileMetaData> metadata;
+    db_->GetLiveFilesMetaData(&metadata);
+    ASSERT_EQ(metadata.size(), 1U);
+    auto moved_file_name = metadata[0].name;
+
+    // Create two more 1MB sst files
+    for (int i = 0; i < 2; ++i) {
+      // Create 1MB sst file
+      for (int j = 0; j < 100; ++j) {
+        ASSERT_OK(Put(Key(i * 50 + j + 100), RandomString(&rnd, 10 * 1024)));
+      }
+      ASSERT_OK(Flush());
+    }
+    // this should execute both L0->L1 and L1->L2 (merge with previous file)
+    dbfull()->TEST_WaitForCompact();
+
+    ASSERT_EQ("0,0,2", FilesPerLevel(0));
+
+    // iterator is holding the file
+    ASSERT_OK(env_->FileExists(dbname_ + moved_file_name));
+
+    listener->SetExpectedFileName(dbname_ + moved_file_name);
+    iterator.reset();
+
+    // this file should have been compacted away
+    ASSERT_NOK(env_->FileExists(dbname_ + moved_file_name));
+    listener->VerifyMatchedCount(1);
+  }
+}
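+
+// Editor's sketch (illustrative): the EventListener hook that the test's
+// OnFileDeletionListener builds on. A listener registered through
+// options.listeners is invoked once per deleted table file.
+class ExampleDeletionCounter : public EventListener {
+ public:
+  void OnTableFileDeleted(const TableFileDeletionInfo& info) override {
+    if (info.status.ok()) {
+      ++deleted_;  // info.file_path names the SST that was removed
+    }
+  }
+  int deleted_ = 0;
+};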
+
+TEST_P(DBCompactionTestWithParam, CompressLevelCompaction) {
+  if (!Zlib_Supported()) {
+    return;
+  }
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleLevel;
+  options.write_buffer_size = 110 << 10;  // 110KB
+  options.arena_block_size = 4 << 10;
+  options.level0_file_num_compaction_trigger = 2;
+  options.num_levels = 4;
+  options.max_bytes_for_level_base = 400 * 1024;
+  options.max_subcompactions = max_subcompactions_;
+  // First two levels have no compression, so that a trivial move between
+  // them will be allowed. Level 2 has Zlib compression so that a trivial
+  // move to level 3 will not be allowed
+  options.compression_per_level = {kNoCompression, kNoCompression,
+                                   kZlibCompression};
+  int matches = 0, didnt_match = 0, trivial_move = 0, non_trivial = 0;
+
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "Compaction::InputCompressionMatchesOutput:Matches",
+      [&](void* arg) { matches++; });
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "Compaction::InputCompressionMatchesOutput:DidntMatch",
+      [&](void* arg) { didnt_match++; });
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:NonTrivial",
+      [&](void* arg) { non_trivial++; });
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:TrivialMove",
+      [&](void* arg) { trivial_move++; });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  Reopen(options);
+
+  Random rnd(301);
+  int key_idx = 0;
+
+  // First three 110KB files are going to level 0
+  // After that, (100K, 200K)
+  for (int num = 0; num < 3; num++) {
+    GenerateNewFile(&rnd, &key_idx);
+  }
+
+  // Another 110KB triggers a compaction to a 400K file to fill up level 0
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(4, GetSstFileCount(dbname_));
+
+  // (1, 4)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4", FilesPerLevel(0));
+
+  // (1, 4, 1)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,1", FilesPerLevel(0));
+
+  // (1, 4, 2)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,2", FilesPerLevel(0));
+
+  // (1, 4, 3)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,3", FilesPerLevel(0));
+
+  // (1, 4, 4)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,4", FilesPerLevel(0));
+
+  // (1, 4, 5)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,5", FilesPerLevel(0));
+
+  // (1, 4, 6)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,6", FilesPerLevel(0));
+
+  // (1, 4, 7)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,7", FilesPerLevel(0));
+
+  // (1, 4, 8)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ("1,4,8", FilesPerLevel(0));
+
+  ASSERT_EQ(matches, 12);
+  // Currently, the test relies on the number of calls to
+  // InputCompressionMatchesOutput() per compaction.
+  const int kCallsToInputCompressionMatch = 2;
+  ASSERT_EQ(didnt_match, 8 * kCallsToInputCompressionMatch);
+  ASSERT_EQ(trivial_move, 12);
+  ASSERT_EQ(non_trivial, 8);
+
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+
+  for (int i = 0; i < key_idx; i++) {
+    auto v = Get(Key(i));
+    ASSERT_NE(v, "NOT_FOUND");
+    ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+  }
+
+  Reopen(options);
+
+  for (int i = 0; i < key_idx; i++) {
+    auto v = Get(Key(i));
+    ASSERT_NE(v, "NOT_FOUND");
+    ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+  }
+
+  Destroy(options);
+}
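+
+// Editor's sketch (illustrative, unused): the per-level compression setup
+// used above. The exact behaviour for levels beyond the vector's size is
+// version-dependent, so this simply mirrors the test's three-entry vector.
+static Options ExampleCompressionPerLevel() {
+  Options options;
+  options.num_levels = 4;
+  options.compression_per_level = {kNoCompression, kNoCompression,
+                                   kZlibCompression};
+  return options;
+}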
+
+// This tests for a bug that could cause two level-0 compactions to run
+// concurrently.
+// TODO(aekmekji): Make sure that the reason this fails when run with
+// max_subcompactions > 1 is not a correctness issue but just inherent to
+// running parallel L0-L1 compactions
+TEST_F(DBCompactionTest, SuggestCompactRangeNoTwoLevel0Compactions) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleLevel;
+  options.write_buffer_size = 110 << 10;
+  options.arena_block_size = 4 << 10;
+  options.level0_file_num_compaction_trigger = 4;
+  options.num_levels = 4;
+  options.compression = kNoCompression;
+  options.max_bytes_for_level_base = 450 << 10;
+  options.target_file_size_base = 98 << 10;
+  options.max_write_buffer_number = 2;
+  options.max_background_compactions = 2;
+
+  DestroyAndReopen(options);
+
+  // fill up the DB
+  Random rnd(301);
+  for (int num = 0; num < 10; num++) {
+    GenerateNewRandomFile(&rnd);
+  }
+  db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+
+  rocksdb::SyncPoint::GetInstance()->LoadDependency(
+      {{"CompactionJob::Run():Start",
+        "DBCompactionTest::SuggestCompactRangeNoTwoLevel0Compactions:1"},
+       {"DBCompactionTest::SuggestCompactRangeNoTwoLevel0Compactions:2",
+        "CompactionJob::Run():End"}});
+
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  // trigger L0 compaction
+  for (int num = 0; num < options.level0_file_num_compaction_trigger + 1;
+       num++) {
+    GenerateNewRandomFile(&rnd, /* nowait */ true);
+    ASSERT_OK(Flush());
+  }
+
+  TEST_SYNC_POINT(
+      "DBCompactionTest::SuggestCompactRangeNoTwoLevel0Compactions:1");
+
+  GenerateNewRandomFile(&rnd, /* nowait */ true);
+  dbfull()->TEST_WaitForFlushMemTable();
+  ASSERT_OK(experimental::SuggestCompactRange(db_, nullptr, nullptr));
+  for (int num = 0; num < options.level0_file_num_compaction_trigger + 1;
+       num++) {
+    GenerateNewRandomFile(&rnd, /* nowait */ true);
+    ASSERT_OK(Flush());
+  }
+
+  TEST_SYNC_POINT(
+      "DBCompactionTest::SuggestCompactRangeNoTwoLevel0Compactions:2");
+  dbfull()->TEST_WaitForCompact();
+}
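+
+// Editor's sketch (illustrative, unused): the experimental hint API used
+// above. Unlike CompactRange(), it only marks the range as worth compacting
+// and lets the background scheduler pick it up.
+static void ExampleSuggestCompaction(DB* db) {
+  Status s = experimental::SuggestCompactRange(db, nullptr, nullptr);
+  (void)s;
+}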
+
+
+TEST_P(DBCompactionTestWithParam, ForceBottommostLevelCompaction) {
+  int32_t trivial_move = 0;
+  int32_t non_trivial_move = 0;
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:TrivialMove",
+      [&](void* arg) { trivial_move++; });
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:NonTrivial",
+      [&](void* arg) { non_trivial_move++; });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  Options options;
+  options.write_buffer_size = 100000000;
+  options.max_subcompactions = max_subcompactions_;
+  options = CurrentOptions(options);
+  DestroyAndReopen(options);
+
+  int32_t value_size = 10 * 1024;  // 10 KB
+
+  Random rnd(301);
+  std::vector<std::string> values;
+  // File with keys [ 0 => 99 ]
+  for (int i = 0; i < 100; i++) {
+    values.push_back(RandomString(&rnd, value_size));
+    ASSERT_OK(Put(Key(i), values[i]));
+  }
+  ASSERT_OK(Flush());
+
+  ASSERT_EQ("1", FilesPerLevel(0));
+  // Compaction will do L0=>L1 (trivial move) then move L1 files to L3
+  CompactRangeOptions compact_options;
+  compact_options.change_level = true;
+  compact_options.target_level = 3;
+  ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+  ASSERT_EQ("0,0,0,1", FilesPerLevel(0));
+  ASSERT_EQ(trivial_move, 1);
+  ASSERT_EQ(non_trivial_move, 0);
+
+  // File with keys [ 100 => 199 ]
+  for (int i = 100; i < 200; i++) {
+    values.push_back(RandomString(&rnd, value_size));
+    ASSERT_OK(Put(Key(i), values[i]));
+  }
+  ASSERT_OK(Flush());
+
+  ASSERT_EQ("1,0,0,1", FilesPerLevel(0));
+  // Compaction will do L0=>L1 L1=>L2 L2=>L3 (3 trivial moves)
+  // then compact the bottommost level L3=>L3 (non-trivial move)
+  compact_options = CompactRangeOptions();
+  compact_options.bottommost_level_compaction =
+      BottommostLevelCompaction::kForce;
+  ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+  ASSERT_EQ("0,0,0,1", FilesPerLevel(0));
+  ASSERT_EQ(trivial_move, 4);
+  ASSERT_EQ(non_trivial_move, 1);
+
+  // File with keys [ 200 => 299 ]
+  for (int i = 200; i < 300; i++) {
+    values.push_back(RandomString(&rnd, value_size));
+    ASSERT_OK(Put(Key(i), values[i]));
+  }
+  ASSERT_OK(Flush());
+
+  ASSERT_EQ("1,0,0,1", FilesPerLevel(0));
+  trivial_move = 0;
+  non_trivial_move = 0;
+  compact_options = CompactRangeOptions();
+  compact_options.bottommost_level_compaction =
+      BottommostLevelCompaction::kSkip;
+  // Compaction will do L0=>L1 L1=>L2 L2=>L3 (3 trivial moves)
+  // and will skip bottommost level compaction
+  ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+  ASSERT_EQ("0,0,0,2", FilesPerLevel(0));
+  ASSERT_EQ(trivial_move, 3);
+  ASSERT_EQ(non_trivial_move, 0);
+
+  for (int i = 0; i < 300; i++) {
+    ASSERT_EQ(Get(Key(i)), values[i]);
+  }
+
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
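+
+// Editor's sketch (illustrative, unused): the two CompactRangeOptions knobs
+// the test above drives -- change_level/target_level, and
+// bottommost_level_compaction (kForce rewrites the bottom level, kSkip
+// leaves it alone).
+static void ExampleBottommostControl(DB* db) {
+  CompactRangeOptions cro;
+  cro.change_level = true;  // move the output down to target_level
+  cro.target_level = 3;
+  (void)db->CompactRange(cro, nullptr, nullptr);
+
+  cro = CompactRangeOptions();
+  cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+  (void)db->CompactRange(cro, nullptr, nullptr);
+}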
+
+INSTANTIATE_TEST_CASE_P(DBCompactionTestWithParam, DBCompactionTestWithParam,
+                        ::testing::Values(1, 4));
+#endif  // !(defined NDEBUG) || !defined(OS_WIN)
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+#if !(defined NDEBUG) || !defined(OS_WIN)
+  rocksdb::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+#else
+  return 0;
+#endif
+}
diff --git a/src/rocksdb/db/db_dynamic_level_test.cc b/src/rocksdb/db/db_dynamic_level_test.cc
new file mode 100644
index 0000000..f4d2b81
--- /dev/null
+++ b/src/rocksdb/db/db_dynamic_level_test.cc
@@ -0,0 +1,497 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// Introduction of SyncPoint effectively disabled building and running this
+// test in Release build, which is a pity, as it is a good test.
+#if !(defined NDEBUG) || !defined(OS_WIN)
+
+#include "port/stack_trace.h"
+#include "util/db_test_util.h"
+
+namespace rocksdb {
+class DBTestDynamicLevel : public DBTestBase {
+ public:
+  DBTestDynamicLevel() : DBTestBase("/db_dynamic_level_test") {}
+};
+
+TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesBase) {
+  if (!Snappy_Supported() || !LZ4_Supported()) {
+    return;
+  }
+  // Use an in-memory env (MockEnv), or the test would be too slow.
+  unique_ptr<Env> env(new MockEnv(env_));
+
+  const int kNKeys = 1000;
+  int keys[kNKeys];
+
+  auto verify_func = [&]() {
+    for (int i = 0; i < kNKeys; i++) {
+      ASSERT_NE("NOT_FOUND", Get(Key(i)));
+      ASSERT_NE("NOT_FOUND", Get(Key(kNKeys * 2 + i)));
+      if (i < kNKeys / 10) {
+        ASSERT_EQ("NOT_FOUND", Get(Key(kNKeys + keys[i])));
+      } else {
+        ASSERT_NE("NOT_FOUND", Get(Key(kNKeys + keys[i])));
+      }
+    }
+  };
+
+  Random rnd(301);
+  for (int ordered_insert = 0; ordered_insert <= 1; ordered_insert++) {
+    for (int i = 0; i < kNKeys; i++) {
+      keys[i] = i;
+    }
+    if (ordered_insert == 0) {
+      std::random_shuffle(std::begin(keys), std::end(keys));
+    }
+    for (int max_background_compactions = 1; max_background_compactions < 4;
+         max_background_compactions += 2) {
+      Options options;
+      options.env = env.get();
+      options.create_if_missing = true;
+      options.db_write_buffer_size = 2048;
+      options.write_buffer_size = 2048;
+      options.max_write_buffer_number = 2;
+      options.level0_file_num_compaction_trigger = 2;
+      options.level0_slowdown_writes_trigger = 2;
+      options.level0_stop_writes_trigger = 2;
+      options.target_file_size_base = 2048;
+      options.level_compaction_dynamic_level_bytes = true;
+      options.max_bytes_for_level_base = 10240;
+      options.max_bytes_for_level_multiplier = 4;
+      options.soft_rate_limit = 1.1;
+      options.max_background_compactions = max_background_compactions;
+      options.num_levels = 5;
+
+      options.compression_per_level.resize(3);
+      options.compression_per_level[0] = kNoCompression;
+      options.compression_per_level[1] = kLZ4Compression;
+      options.compression_per_level[2] = kSnappyCompression;
+
+      DestroyAndReopen(options);
+
+      for (int i = 0; i < kNKeys; i++) {
+        int key = keys[i];
+        ASSERT_OK(Put(Key(kNKeys + key), RandomString(&rnd, 102)));
+        ASSERT_OK(Put(Key(key), RandomString(&rnd, 102)));
+        ASSERT_OK(Put(Key(kNKeys * 2 + key), RandomString(&rnd, 102)));
+        ASSERT_OK(Delete(Key(kNKeys + keys[i / 10])));
+        env_->SleepForMicroseconds(5000);
+      }
+
+      uint64_t int_prop;
+      ASSERT_TRUE(db_->GetIntProperty("rocksdb.background-errors", &int_prop));
+      ASSERT_EQ(0U, int_prop);
+
+      // Verify DB
+      for (int j = 0; j < 2; j++) {
+        verify_func();
+        if (j == 0) {
+          Reopen(options);
+        }
+      }
+
+      // Test compact range works
+      dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+      // All data should be in the last level.
+      ColumnFamilyMetaData cf_meta;
+      db_->GetColumnFamilyMetaData(&cf_meta);
+      ASSERT_EQ(5U, cf_meta.levels.size());
+      for (int i = 0; i < 4; i++) {
+        ASSERT_EQ(0U, cf_meta.levels[i].files.size());
+      }
+      ASSERT_GT(cf_meta.levels[4U].files.size(), 0U);
+      verify_func();
+
+      Close();
+    }
+  }
+
+  env_->SetBackgroundThreads(1, Env::LOW);
+  env_->SetBackgroundThreads(1, Env::HIGH);
+}
+
+// Test specific cases in dynamic max bytes
+TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesBase2) {
+  Random rnd(301);
+  int kMaxKey = 1000000;
+
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.db_write_buffer_size = 2048;
+  options.write_buffer_size = 2048;
+  options.max_write_buffer_number = 2;
+  options.level0_file_num_compaction_trigger = 2;
+  options.level0_slowdown_writes_trigger = 9999;
+  options.level0_stop_writes_trigger = 9999;
+  options.target_file_size_base = 2048;
+  options.level_compaction_dynamic_level_bytes = true;
+  options.max_bytes_for_level_base = 10240;
+  options.max_bytes_for_level_multiplier = 4;
+  options.max_background_compactions = 2;
+  options.num_levels = 5;
+  options.expanded_compaction_factor = 0;  // Force not expanding in compactions
+  BlockBasedTableOptions table_options;
+  table_options.block_size = 1024;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  DestroyAndReopen(options);
+  ASSERT_OK(dbfull()->SetOptions({
+      {"disable_auto_compactions", "true"},
+  }));
+
+  uint64_t int_prop;
+  std::string str_prop;
+
+  // Initial base level is the last level
+  ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+  ASSERT_EQ(4U, int_prop);
+
+  // Put about 7K to L0
+  for (int i = 0; i < 70; i++) {
+    ASSERT_OK(Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))),
+                  RandomString(&rnd, 80)));
+  }
+  ASSERT_OK(dbfull()->SetOptions({
+      {"disable_auto_compactions", "false"},
+  }));
+  Flush();
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+  ASSERT_EQ(4U, int_prop);
+
+  // Insert about 3.5K more to L0. After it is compacted to L4, the base
+  // level should change to L3.
+  ASSERT_OK(dbfull()->SetOptions({
+      {"disable_auto_compactions", "true"},
+  }));
+  for (int i = 0; i < 70; i++) {
+    ASSERT_OK(Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))),
+                  RandomString(&rnd, 80)));
+  }
+
+  ASSERT_OK(dbfull()->SetOptions({
+      {"disable_auto_compactions", "false"},
+  }));
+  Flush();
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+  ASSERT_EQ(3U, int_prop);
+  ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level1", &str_prop));
+  ASSERT_EQ("0", str_prop);
+  ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level2", &str_prop));
+  ASSERT_EQ("0", str_prop);
+
+  // Trigger parallel compactions; the first one will change the base level.
+  // Hold compaction jobs briefly so that two of them run in parallel.
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::Run():Start",
+      [&](void* arg) { env_->SleepForMicroseconds(100000); });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_OK(dbfull()->SetOptions({
+      {"disable_auto_compactions", "true"},
+  }));
+  // Write about 10K more
+  for (int i = 0; i < 100; i++) {
+    ASSERT_OK(Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))),
+                  RandomString(&rnd, 80)));
+  }
+  ASSERT_OK(dbfull()->SetOptions({
+      {"disable_auto_compactions", "false"},
+  }));
+  Flush();
+  // Wait 200 milliseconds before letting compactions proceed, to make sure
+  // two parallel ones are executed.
+  env_->SleepForMicroseconds(200000);
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+  ASSERT_EQ(3U, int_prop);
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+
+  // Trigger a condition in which a compaction changes the base level while
+  // an L0->Lbase compaction happens at the same time.
+  // We try to make the last levels' targets 10K, 40K and 160K, which
+  // triggers another compaction from 40K->160K.
+  ASSERT_OK(dbfull()->SetOptions({
+      {"disable_auto_compactions", "true"},
+  }));
+  // Write about 150K more
+  for (int i = 0; i < 1350; i++) {
+    ASSERT_OK(Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))),
+                  RandomString(&rnd, 80)));
+  }
+  ASSERT_OK(dbfull()->SetOptions({
+      {"disable_auto_compactions", "false"},
+  }));
+  Flush();
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+  ASSERT_EQ(2U, int_prop);
+
+  // Keep writing data until the base level changes from 2 to 1. There will
+  // be an L0->L2 compaction going on at the same time.
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+  for (int attempt = 0; attempt <= 20; attempt++) {
+    // Write about 5K more data with two flushes. It should be flushed to
+    // level 2, but by the time it is applied, the base level is already 1.
+    for (int i = 0; i < 50; i++) {
+      ASSERT_OK(Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))),
+                    RandomString(&rnd, 80)));
+    }
+    Flush();
+
+    ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+    if (int_prop == 2U) {
+      env_->SleepForMicroseconds(50000);
+    } else {
+      break;
+    }
+  }
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+  rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  env_->SleepForMicroseconds(200000);
+
+  ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+  ASSERT_EQ(1U, int_prop);
+}
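+
+// Editor's sketch (illustrative, unused): reading back the base level, the
+// property these assertions poll. With dynamic level bytes a fresh DB
+// reports num_levels - 1 and the value migrates upward as levels fill.
+static uint64_t ExampleQueryBaseLevel(DB* db) {
+  uint64_t base_level = 0;
+  db->GetIntProperty("rocksdb.base-level", &base_level);
+  return base_level;
+}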
+
+// Test specific cases in dynamic max bytes
+TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesCompactRange) {
+  Random rnd(301);
+  int kMaxKey = 1000000;
+
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.db_write_buffer_size = 2048;
+  options.write_buffer_size = 2048;
+  options.max_write_buffer_number = 2;
+  options.level0_file_num_compaction_trigger = 2;
+  options.level0_slowdown_writes_trigger = 9999;
+  options.level0_stop_writes_trigger = 9999;
+  options.target_file_size_base = 2;
+  options.level_compaction_dynamic_level_bytes = true;
+  options.max_bytes_for_level_base = 10240;
+  options.max_bytes_for_level_multiplier = 4;
+  options.max_background_compactions = 1;
+  const int kNumLevels = 5;
+  options.num_levels = kNumLevels;
+  options.expanded_compaction_factor = 0;  // Force not expanding in compactions
+  BlockBasedTableOptions table_options;
+  table_options.block_size = 1024;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  DestroyAndReopen(options);
+
+  // Compact against empty DB
+  dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+
+  uint64_t int_prop;
+  std::string str_prop;
+
+  // Initial base level is the last level
+  ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+  ASSERT_EQ(4U, int_prop);
+
+  // Put about 7K to L0
+  for (int i = 0; i < 140; i++) {
+    ASSERT_OK(Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))),
+                  RandomString(&rnd, 80)));
+  }
+  Flush();
+  dbfull()->TEST_WaitForCompact();
+  if (NumTableFilesAtLevel(0) == 0) {
+    // Make sure level 0 is not empty
+    ASSERT_OK(Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))),
+                  RandomString(&rnd, 80)));
+    Flush();
+  }
+
+  ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+  ASSERT_EQ(3U, int_prop);
+  ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level1", &str_prop));
+  ASSERT_EQ("0", str_prop);
+  ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level2", &str_prop));
+  ASSERT_EQ("0", str_prop);
+
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+  rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+  std::set<int> output_levels;
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionPicker::CompactRange:Return", [&](void* arg) {
+        Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+        output_levels.insert(compaction->output_level());
+      });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+  ASSERT_EQ(output_levels.size(), 2);
+  ASSERT_TRUE(output_levels.find(3) != output_levels.end());
+  ASSERT_TRUE(output_levels.find(4) != output_levels.end());
+  ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level0", &str_prop));
+  ASSERT_EQ("0", str_prop);
+  ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level3", &str_prop));
+  ASSERT_EQ("0", str_prop);
+  // Base level is still level 3.
+  ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+  ASSERT_EQ(3U, int_prop);
+}
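+
+// Editor's sketch (illustrative, unused): the SyncPoint capture pattern used
+// just above. SyncPoint is a test-only facility that is compiled out of
+// release builds, hence the #if guard around this whole file.
+static void ExampleCaptureOutputLevels(std::set<int>* levels) {
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionPicker::CompactRange:Return", [levels](void* arg) {
+        levels->insert(reinterpret_cast<Compaction*>(arg)->output_level());
+      });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+}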
+
+TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesBaseInc) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.db_write_buffer_size = 2048;
+  options.write_buffer_size = 2048;
+  options.max_write_buffer_number = 2;
+  options.level0_file_num_compaction_trigger = 2;
+  options.level0_slowdown_writes_trigger = 2;
+  options.level0_stop_writes_trigger = 2;
+  options.target_file_size_base = 2048;
+  options.level_compaction_dynamic_level_bytes = true;
+  options.max_bytes_for_level_base = 10240;
+  options.max_bytes_for_level_multiplier = 4;
+  options.soft_rate_limit = 1.1;
+  options.max_background_compactions = 2;
+  options.num_levels = 5;
+
+  DestroyAndReopen(options);
+
+  int non_trivial = 0;
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:NonTrivial",
+      [&](void* arg) { non_trivial++; });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  Random rnd(301);
+  const int total_keys = 3000;
+  const int random_part_size = 100;
+  for (int i = 0; i < total_keys; i++) {
+    std::string value = RandomString(&rnd, random_part_size);
+    PutFixed32(&value, static_cast<uint32_t>(i));
+    ASSERT_OK(Put(Key(i), value));
+  }
+  Flush();
+  dbfull()->TEST_WaitForCompact();
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+
+  ASSERT_EQ(non_trivial, 0);
+
+  for (int i = 0; i < total_keys; i++) {
+    std::string value = Get(Key(i));
+    ASSERT_EQ(DecodeFixed32(value.c_str() + random_part_size),
+              static_cast<uint32_t>(i));
+  }
+
+  env_->SetBackgroundThreads(1, Env::LOW);
+  env_->SetBackgroundThreads(1, Env::HIGH);
+}
+
+TEST_F(DBTestDynamicLevel, MigrateToDynamicLevelMaxBytesBase) {
+  Random rnd(301);
+  const int kMaxKey = 2000;
+
+  Options options;
+  options.create_if_missing = true;
+  options.db_write_buffer_size = 2048;
+  options.write_buffer_size = 2048;
+  options.max_write_buffer_number = 8;
+  options.level0_file_num_compaction_trigger = 4;
+  options.level0_slowdown_writes_trigger = 4;
+  options.level0_stop_writes_trigger = 8;
+  options.target_file_size_base = 2048;
+  options.level_compaction_dynamic_level_bytes = false;
+  options.max_bytes_for_level_base = 10240;
+  options.max_bytes_for_level_multiplier = 4;
+  options.soft_rate_limit = 1.1;
+  options.num_levels = 8;
+
+  DestroyAndReopen(options);
+
+  auto verify_func = [&](int num_keys, bool if_sleep) {
+    for (int i = 0; i < num_keys; i++) {
+      ASSERT_NE("NOT_FOUND", Get(Key(kMaxKey + i)));
+      if (i < num_keys / 10) {
+        ASSERT_EQ("NOT_FOUND", Get(Key(i)));
+      } else {
+        ASSERT_NE("NOT_FOUND", Get(Key(i)));
+      }
+      if (if_sleep && i % 1000 == 0) {
+        // Without it, valgrind may choose not to give another
+        // thread a chance to run before finishing the function,
+        // causing the test to be extremely slow.
+        env_->SleepForMicroseconds(1);
+      }
+    }
+  };
+
+  int total_keys = 1000;
+  for (int i = 0; i < total_keys; i++) {
+    ASSERT_OK(Put(Key(i), RandomString(&rnd, 102)));
+    ASSERT_OK(Put(Key(kMaxKey + i), RandomString(&rnd, 102)));
+    ASSERT_OK(Delete(Key(i / 10)));
+  }
+  verify_func(total_keys, false);
+  dbfull()->TEST_WaitForCompact();
+
+  options.level_compaction_dynamic_level_bytes = true;
+  options.disable_auto_compactions = true;
+  Reopen(options);
+  verify_func(total_keys, false);
+
+  std::atomic_bool compaction_finished;
+  compaction_finished = false;
+  // Issue manual compaction in one thread and still verify DB state
+  // in main thread.
+  std::thread t([&]() {
+    CompactRangeOptions compact_options;
+    compact_options.change_level = true;
+    compact_options.target_level = options.num_levels - 1;
+    dbfull()->CompactRange(compact_options, nullptr, nullptr);
+    compaction_finished.store(true);
+  });
+  do {
+    verify_func(total_keys, true);
+  } while (!compaction_finished.load());
+  t.join();
+
+  ASSERT_OK(dbfull()->SetOptions({
+      {"disable_auto_compactions", "false"},
+  }));
+
+  int total_keys2 = 2000;
+  for (int i = total_keys; i < total_keys2; i++) {
+    ASSERT_OK(Put(Key(i), RandomString(&rnd, 102)));
+    ASSERT_OK(Put(Key(kMaxKey + i), RandomString(&rnd, 102)));
+    ASSERT_OK(Delete(Key(i / 10)));
+  }
+
+  verify_func(total_keys2, false);
+  dbfull()->TEST_WaitForCompact();
+  verify_func(total_keys2, false);
+
+  // Base level is not level 1
+  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(2), 0);
+}
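+
+// Editor's sketch (illustrative, unused): the migration recipe the test above
+// validates -- reopen with level_compaction_dynamic_level_bytes = true and
+// auto compactions disabled, push everything to the bottom level with a
+// manual compaction, then re-enable auto compactions.
+static void ExampleFinishMigration(DB* db, int num_levels) {
+  CompactRangeOptions cro;
+  cro.change_level = true;
+  cro.target_level = num_levels - 1;
+  (void)db->CompactRange(cro, nullptr, nullptr);
+  (void)db->SetOptions({{"disable_auto_compactions", "false"}});
+}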
+}  // namespace rocksdb
+
+#endif  // !(defined NDEBUG) || !defined(OS_WIN)
+
+int main(int argc, char** argv) {
+#if !(defined NDEBUG) || !defined(OS_WIN)
+  rocksdb::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+#else
+  return 0;
+#endif
+}
diff --git a/src/rocksdb/db/db_filesnapshot.cc b/src/rocksdb/db/db_filesnapshot.cc
index c724303..e39ccf4 100644
--- a/src/rocksdb/db/db_filesnapshot.cc
+++ b/src/rocksdb/db/db_filesnapshot.cc
@@ -98,6 +98,8 @@ Status DBImpl::GetLiveFiles(std::vector<std::string>& ret,
       cfd->Ref();
       mutex_.Unlock();
       status = FlushMemTable(cfd, FlushOptions());
+      TEST_SYNC_POINT("DBImpl::GetLiveFiles:1");
+      TEST_SYNC_POINT("DBImpl::GetLiveFiles:2");
       mutex_.Lock();
       cfd->Unref();
       if (!status.ok()) {
diff --git a/src/rocksdb/db/db_impl.cc b/src/rocksdb/db/db_impl.cc
index 757571d..cf4fa74 100644
--- a/src/rocksdb/db/db_impl.cc
+++ b/src/rocksdb/db/db_impl.cc
@@ -14,25 +14,27 @@
 #endif
 
 #include <inttypes.h>
+#include <stdint.h>
+
 #include <algorithm>
 #include <climits>
 #include <cstdio>
 #include <set>
 #include <stdexcept>
-#include <stdint.h>
 #include <string>
-#include <unordered_set>
 #include <unordered_map>
+#include <unordered_set>
 #include <utility>
 #include <vector>
 
 #include "db/builder.h"
-#include "db/flush_job.h"
 #include "db/compaction_job.h"
 #include "db/db_iter.h"
 #include "db/dbformat.h"
-#include "db/event_logger_helpers.h"
+#include "db/event_helpers.h"
 #include "db/filename.h"
+#include "db/flush_job.h"
+#include "db/forward_iterator.h"
 #include "db/job_context.h"
 #include "db/log_reader.h"
 #include "db/log_writer.h"
@@ -43,22 +45,24 @@
 #include "db/merge_helper.h"
 #include "db/table_cache.h"
 #include "db/table_properties_collector.h"
-#include "db/forward_iterator.h"
 #include "db/transaction_log_impl.h"
 #include "db/version_set.h"
-#include "db/writebuffer.h"
 #include "db/write_batch_internal.h"
+#include "db/write_callback.h"
+#include "db/writebuffer.h"
+#include "port/likely.h"
 #include "port/port.h"
 #include "rocksdb/cache.h"
-#include "port/likely.h"
 #include "rocksdb/compaction_filter.h"
 #include "rocksdb/db.h"
+#include "rocksdb/delete_scheduler.h"
 #include "rocksdb/env.h"
 #include "rocksdb/merge_operator.h"
-#include "rocksdb/version.h"
+#include "rocksdb/sst_file_writer.h"
 #include "rocksdb/statistics.h"
 #include "rocksdb/status.h"
 #include "rocksdb/table.h"
+#include "rocksdb/version.h"
 #include "table/block.h"
 #include "table/block_based_table_factory.h"
 #include "table/merger.h"
@@ -69,18 +73,20 @@
 #include "util/build_version.h"
 #include "util/coding.h"
 #include "util/compression.h"
+#include "util/crc32c.h"
 #include "util/db_info_dumper.h"
+#include "util/file_reader_writer.h"
 #include "util/file_util.h"
-#include "util/hash_skiplist_rep.h"
 #include "util/hash_linklist_rep.h"
-#include "util/logging.h"
+#include "util/hash_skiplist_rep.h"
+#include "util/iostats_context_imp.h"
 #include "util/log_buffer.h"
+#include "util/logging.h"
 #include "util/mutexlock.h"
 #include "util/perf_context_imp.h"
-#include "util/iostats_context_imp.h"
 #include "util/stop_watch.h"
-#include "util/sync_point.h"
 #include "util/string_util.h"
+#include "util/sync_point.h"
 #include "util/thread_status_updater.h"
 #include "util/thread_status_util.h"
 #include "util/xfunc.h"
@@ -93,12 +99,15 @@ void DumpRocksDBBuildVersion(Logger * log);
 
 struct DBImpl::WriteContext {
   autovector<SuperVersion*> superversions_to_free_;
-  bool schedule_bg_work_ = false;
+  autovector<MemTable*> memtables_to_free_;
 
   ~WriteContext() {
     for (auto& sv : superversions_to_free_) {
       delete sv;
     }
+    for (auto& m : memtables_to_free_) {
+      delete m;
+    }
   }
 };
 
@@ -115,7 +124,11 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) {
 
   // result.max_open_files means an "infinite" open files.
   if (result.max_open_files != -1) {
-    ClipToRange(&result.max_open_files, 20, 1000000);
+    int max_max_open_files = port::GetMaxOpenFiles();
+    if (max_max_open_files == -1) {
+      max_max_open_files = 1000000;
+    }
+    ClipToRange(&result.max_open_files, 20, max_max_open_files);
   }
 
   if (result.info_log == nullptr) {
@@ -149,6 +162,10 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) {
     result.db_paths.emplace_back(dbname, std::numeric_limits<uint64_t>::max());
   }
 
+  if (result.compaction_readahead_size > 0) {
+    result.new_table_reader_for_compaction_inputs = true;
+  }
+
   return result;
 }
 
@@ -190,7 +207,7 @@ CompressionType GetCompressionFlush(const ImmutableCFOptions& ioptions) {
   }
 }
 
-void DumpCompressionInfo(Logger* logger) {
+void DumpSupportInfo(Logger* logger) {
   Log(InfoLogLevel::INFO_LEVEL, logger, "Compression algorithms supported:");
   Log(InfoLogLevel::INFO_LEVEL, logger, "\tSnappy supported: %d",
       Snappy_Supported());
@@ -199,6 +216,8 @@ void DumpCompressionInfo(Logger* logger) {
   Log(InfoLogLevel::INFO_LEVEL, logger, "\tBzip supported: %d",
       BZip2_Supported());
   Log(InfoLogLevel::INFO_LEVEL, logger, "\tLZ4 supported: %d", LZ4_Supported());
+  Log(InfoLogLevel::INFO_LEVEL, logger, "Fast CRC32 supported: %d",
+      crc32c::IsFastCrc32Supported());
 }
 
 }  // namespace
@@ -216,10 +235,13 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname)
       log_dir_synced_(false),
       log_empty_(true),
       default_cf_handle_(nullptr),
+      log_sync_cv_(&mutex_),
       total_log_size_(0),
       max_total_in_memory_state_(0),
       is_snapshot_supported_(true),
       write_buffer_(options.db_write_buffer_size),
+      write_controller_(options.delayed_write_rate),
+      last_batch_group_size_(0),
       unscheduled_flushes_(0),
       unscheduled_compactions_(0),
       bg_compaction_scheduled_(0),
@@ -238,17 +260,15 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname)
       wal_manager_(db_options_, env_options_),
 #endif  // ROCKSDB_LITE
       event_logger_(db_options_.info_log.get()),
-      bg_work_gate_closed_(false),
+      bg_work_paused_(0),
       refitting_level_(false),
-      opened_successfully_(false),
-      notifying_events_(0) {
+      opened_successfully_(false) {
   env_->GetAbsolutePath(dbname, &db_absolute_path_);
 
   // Reserve ten files or so for other uses and give the rest to TableCache.
   // Give a large number for setting of "infinite" open files.
   const int table_cache_size = (db_options_.max_open_files == -1) ?
         4194304 : db_options_.max_open_files - 10;
-  // Reserve ten files or so for other uses and give the rest to TableCache.
   table_cache_ =
       NewLRUCache(table_cache_size, db_options_.table_cache_numshardbits);
 
@@ -261,30 +281,27 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname)
   DumpRocksDBBuildVersion(db_options_.info_log.get());
   DumpDBFileSummary(db_options_, dbname_);
   db_options_.Dump(db_options_.info_log.get());
-  DumpCompressionInfo(db_options_.info_log.get());
-
-  LogFlush(db_options_.info_log);
+  DumpSupportInfo(db_options_.info_log.get());
 }
 
-// Will only lock the mutex_ and wait for completion if wait is true
+// Will lock the mutex_ and will wait for completion if wait is true
 void DBImpl::CancelAllBackgroundWork(bool wait) {
+  InstrumentedMutexLock l(&mutex_);
   shutting_down_.store(true, std::memory_order_release);
+  bg_cv_.SignalAll();
   if (!wait) {
     return;
   }
   // Wait for background work to finish
-  mutex_.Lock();
-  while (bg_compaction_scheduled_ || bg_flush_scheduled_ || notifying_events_) {
+  while (bg_compaction_scheduled_ || bg_flush_scheduled_) {
     bg_cv_.Wait();
   }
-  mutex_.Unlock();
 }
 
 DBImpl::~DBImpl() {
-  EraseThreadStatusDbInfo();
   mutex_.Lock();
 
-  if (flush_on_destroy_) {
+  if (!shutting_down_.load(std::memory_order_acquire) && flush_on_destroy_) {
     for (auto cfd : *versions_->GetColumnFamilySet()) {
       if (!cfd->IsDropped() && !cfd->mem()->IsEmpty()) {
         cfd->Ref();
@@ -296,12 +313,11 @@ DBImpl::~DBImpl() {
     }
     versions_->GetColumnFamilySet()->FreeDeadColumnFamilies();
   }
-  // CancelAllBackgroundWork called with false means we just set the
-  // shutdown marker, while holding the mutex_ here. After which we
-  // do a variant of the waiting after we release the lock and unschedule work
+  mutex_.Unlock();
+  // CancelAllBackgroundWork called with false means we just set the shutdown
+  // marker. After this we do a variant of the waiting and unschedule work
   // (to consider: moving all the waiting into CancelAllBackgroundWork(true))
   CancelAllBackgroundWork(false);
-  mutex_.Unlock();
   int compactions_unscheduled = env_->UnSchedule(this, Env::Priority::LOW);
   int flushes_unscheduled = env_->UnSchedule(this, Env::Priority::HIGH);
   mutex_.Lock();
@@ -309,10 +325,10 @@ DBImpl::~DBImpl() {
   bg_flush_scheduled_ -= flushes_unscheduled;
 
   // Wait for background work to finish
-  while (bg_compaction_scheduled_ || bg_flush_scheduled_ || notifying_events_) {
+  while (bg_compaction_scheduled_ || bg_flush_scheduled_) {
     bg_cv_.Wait();
   }
-  listeners_.clear();
+  EraseThreadStatusDbInfo();
   flush_scheduler_.Clear();
 
   while (!flush_queue_.empty()) {
@@ -347,17 +363,24 @@ DBImpl::~DBImpl() {
   if (opened_successfully_) {
     JobContext job_context(next_job_id_.fetch_add(1));
     FindObsoleteFiles(&job_context, true);
+
+    mutex_.Unlock();
     // manifest number starting from 2
     job_context.manifest_file_number = 1;
     if (job_context.HaveSomethingToDelete()) {
       PurgeObsoleteFiles(job_context);
     }
     job_context.Clean();
+    mutex_.Lock();
   }
 
   for (auto l : logs_to_free_) {
     delete l;
   }
+  for (auto& log : logs_) {
+    log.ClearWriter();
+  }
+  logs_.clear();
 
   // versions need to be destroyed before table_cache since it can hold
   // references to table_cache.
@@ -376,18 +399,22 @@ Status DBImpl::NewDB() {
   new_db.SetNextFile(2);
   new_db.SetLastSequence(0);
 
+  Status s;
+
   Log(InfoLogLevel::INFO_LEVEL,
       db_options_.info_log, "Creating manifest 1 \n");
   const std::string manifest = DescriptorFileName(dbname_, 1);
-  unique_ptr<WritableFile> file;
-  Status s = env_->NewWritableFile(
-      manifest, &file, env_->OptimizeForManifestWrite(env_options_));
-  if (!s.ok()) {
-    return s;
-  }
-  file->SetPreallocationBlockSize(db_options_.manifest_preallocation_size);
   {
-    log::Writer log(std::move(file));
+    unique_ptr<WritableFile> file;
+    EnvOptions env_options = env_->OptimizeForManifestWrite(env_options_);
+    s = env_->NewWritableFile(manifest, &file, env_options);
+    if (!s.ok()) {
+      return s;
+    }
+    file->SetPreallocationBlockSize(db_options_.manifest_preallocation_size);
+    unique_ptr<WritableFileWriter> file_writer(
+        new WritableFileWriter(std::move(file), env_options));
+    log::Writer log(std::move(file_writer));
     std::string record;
     new_db.EncodeTo(&record);
     s = log.AddRecord(record);
@@ -510,6 +537,9 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
 
   // don't delete files that might be currently written to from compaction
   // threads
+  // Since job_context->min_pending_output is set here, mutex_ cannot be
+  // released until the file scan finishes. Otherwise, we might see no
+  // min_pending_output here but later find newly generated unfinalized
+  // files while scanning.
   if (!pending_outputs_.empty()) {
     job_context->min_pending_output = *pending_outputs_.begin();
   } else {
@@ -517,7 +547,8 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
     job_context->min_pending_output = std::numeric_limits<uint64_t>::max();
   }
 
-  // get obsolete files
+  // Get obsolete files.  This function will also update the list of
+  // pending files in VersionSet().
   versions_->GetObsoleteFiles(&job_context->sst_delete_files,
                               job_context->min_pending_output);
 
@@ -562,6 +593,37 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
       }
     }
   }
+
+  if (!alive_log_files_.empty()) {
+    uint64_t min_log_number = versions_->MinLogNumber();
+    // find newly obsoleted log files
+    while (alive_log_files_.begin()->number < min_log_number) {
+      auto& earliest = *alive_log_files_.begin();
+      job_context->log_delete_files.push_back(earliest.number);
+      total_log_size_ -= earliest.size;
+      alive_log_files_.pop_front();
+      // Current log should always stay alive since it can't have
+      // number < MinLogNumber().
+      assert(alive_log_files_.size());
+    }
+    while (!logs_.empty() && logs_.front().number < min_log_number) {
+      auto& log = logs_.front();
+      if (log.getting_synced) {
+        log_sync_cv_.Wait();
+        // logs_ could have changed while we were waiting.
+        continue;
+      }
+      logs_to_free_.push_back(log.ReleaseWriter());
+      logs_.pop_front();
+    }
+    // Current log cannot be obsolete.
+    assert(!logs_.empty());
+  }
+
+  // We're just cleaning up for DB::Write().
+  assert(job_context->logs_to_free.empty());
+  job_context->logs_to_free = logs_to_free_;
+  logs_to_free_.clear();
 }
 
 namespace {
@@ -686,30 +748,47 @@ void DBImpl::PurgeObsoleteFiles(const JobContext& state) {
       // evict from cache
       TableCache::Evict(table_cache_.get(), number);
       fname = TableFileName(db_options_.db_paths, number, path_id);
-      event_logger_.Log() << "job" << state.job_id << "event"
-                          << "table_file_deletion"
-                          << "file_number" << number;
     } else {
       fname = ((type == kLogFile) ?
           db_options_.wal_dir : dbname_) + "/" + to_delete;
     }
 
-#ifdef ROCKSDB_LITE
-    Status s = env_->DeleteFile(fname);
-    Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log,
-        "[JOB %d] Delete %s type=%d #%" PRIu64 " -- %s\n", state.job_id,
-        fname.c_str(), type, number, s.ToString().c_str());
-#else   // not ROCKSDB_LITE
+#ifndef ROCKSDB_LITE
     if (type == kLogFile && (db_options_.WAL_ttl_seconds > 0 ||
-                             db_options_.WAL_size_limit_MB > 0)) {
+                              db_options_.WAL_size_limit_MB > 0)) {
       wal_manager_.ArchiveWALFile(fname, number);
+      continue;
+    }
+#endif  // !ROCKSDB_LITE
+    Status file_deletion_status;
+    if (type == kTableFile && path_id == 0) {
+      file_deletion_status = DeleteOrMoveToTrash(&db_options_, fname);
     } else {
-      Status s = env_->DeleteFile(fname);
+      file_deletion_status = env_->DeleteFile(fname);
+    }
+    if (file_deletion_status.ok()) {
       Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log,
           "[JOB %d] Delete %s type=%d #%" PRIu64 " -- %s\n", state.job_id,
-          fname.c_str(), type, number, s.ToString().c_str());
+          fname.c_str(), type, number,
+          file_deletion_status.ToString().c_str());
+    } else if (env_->FileExists(fname).IsNotFound()) {
+      Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
+          "[JOB %d] Tried to delete a non-existing file %s type=%d #%" PRIu64
+          " -- %s\n",
+          state.job_id, fname.c_str(), type, number,
+          file_deletion_status.ToString().c_str());
+    } else {
+      Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log,
+          "[JOB %d] Failed to delete %s type=%d #%" PRIu64 " -- %s\n",
+          state.job_id, fname.c_str(), type, number,
+          file_deletion_status.ToString().c_str());
+    }
+    if (type == kTableFile) {
+      EventHelpers::LogAndNotifyTableFileDeletion(
+          &event_logger_, state.job_id, number, fname,
+          file_deletion_status, GetName(),
+          db_options_.listeners);
     }
-#endif  // ROCKSDB_LITE
   }
 
   // Delete old info log files.
@@ -726,9 +805,16 @@ void DBImpl::PurgeObsoleteFiles(const JobContext& state) {
           full_path_to_delete.c_str());
       Status s = env_->DeleteFile(full_path_to_delete);
       if (!s.ok()) {
-        Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log,
-            "[JOB %d] Delete info log file %s FAILED -- %s\n", state.job_id,
-            to_delete.c_str(), s.ToString().c_str());
+        if (env_->FileExists(full_path_to_delete).IsNotFound()) {
+          Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
+              "[JOB %d] Tried to delete non-existing info log file %s FAILED "
+              "-- %s\n",
+              state.job_id, to_delete.c_str(), s.ToString().c_str());
+        } else {
+          Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log,
+              "[JOB %d] Delete info log file %s FAILED -- %s\n", state.job_id,
+              to_delete.c_str(), s.ToString().c_str());
+        }
       }
     }
   }
@@ -742,10 +828,13 @@ void DBImpl::DeleteObsoleteFiles() {
   mutex_.AssertHeld();
   JobContext job_context(next_job_id_.fetch_add(1));
   FindObsoleteFiles(&job_context, true);
+
+  mutex_.Unlock();
   if (job_context.HaveSomethingToDelete()) {
     PurgeObsoleteFiles(job_context);
   }
   job_context.Clean();
+  mutex_.Lock();
 }
 
 Status DBImpl::Directories::CreateAndNewDirectory(
@@ -826,7 +915,8 @@ Status DBImpl::Recover(
       return s;
     }
 
-    if (!env_->FileExists(CurrentFileName(dbname_))) {
+    s = env_->FileExists(CurrentFileName(dbname_));
+    if (s.IsNotFound()) {
       if (db_options_.create_if_missing) {
         s = NewDB();
         is_new_db = true;
@@ -837,18 +927,26 @@ Status DBImpl::Recover(
         return Status::InvalidArgument(
             dbname_, "does not exist (create_if_missing is false)");
       }
-    } else {
+    } else if (s.ok()) {
       if (db_options_.error_if_exists) {
         return Status::InvalidArgument(
             dbname_, "exists (error_if_exists is true)");
       }
+    } else {
+      // Unexpected error reading file
+      assert(s.IsIOError());
+      return s;
     }
     // Check for the IDENTITY file and create it if not there
-    if (!env_->FileExists(IdentityFileName(dbname_))) {
+    s = env_->FileExists(IdentityFileName(dbname_));
+    if (s.IsNotFound()) {
       s = SetIdentityFile(env_, dbname_);
       if (!s.ok()) {
         return s;
       }
+    } else if (!s.ok()) {
+      assert(s.IsIOError());
+      return s;
     }
   }
 
@@ -857,7 +955,7 @@ Status DBImpl::Recover(
     s = CheckConsistency();
   }
   if (s.ok()) {
-    SequenceNumber max_sequence(0);
+    SequenceNumber max_sequence(kMaxSequenceNumber);
     default_cf_handle_ = new ColumnFamilyHandleImpl(
         versions_->GetColumnFamilySet()->GetDefault(), this, &mutex_);
     default_cf_internal_stats_ = default_cf_handle_->cfd()->internal_stats();
@@ -908,7 +1006,8 @@ Status DBImpl::Recover(
       if (!s.ok()) {
         // Clear memtables if recovery failed
         for (auto cfd : *versions_->GetColumnFamilySet()) {
-          cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions());
+          cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(),
+                                 kMaxSequenceNumber);
         }
       }
     }
@@ -939,7 +1038,9 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
           info_log, "%s%s: dropping %d bytes; %s",
           (this->status == nullptr ? "(ignoring error) " : ""),
           fname, static_cast<int>(bytes), s.ToString().c_str());
-      if (this->status != nullptr && this->status->ok()) *this->status = s;
+      if (this->status != nullptr && this->status->ok()) {
+        *this->status = s;
+      }
     }
   };
 
@@ -965,6 +1066,7 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
     stream.EndArray();
   }
 
+  bool continue_replay_log = true;
   for (auto log_number : log_numbers) {
     // The previous incarnation may not have written any MANIFEST
     // records after allocating this log number.  So we manually
@@ -972,17 +1074,21 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
     versions_->MarkFileNumberUsedDuringRecovery(log_number);
     // Open the log file
     std::string fname = LogFileName(db_options_.wal_dir, log_number);
-    unique_ptr<SequentialFile> file;
-    status = env_->NewSequentialFile(fname, &file, env_options_);
-    if (!status.ok()) {
-      MaybeIgnoreError(&status);
+    unique_ptr<SequentialFileReader> file_reader;
+    {
+      unique_ptr<SequentialFile> file;
+      status = env_->NewSequentialFile(fname, &file, env_options_);
       if (!status.ok()) {
-        return status;
-      } else {
-        // Fail with one log file, but that's ok.
-        // Try next one.
-        continue;
+        MaybeIgnoreError(&status);
+        if (!status.ok()) {
+          return status;
+        } else {
+          // Fail with one log file, but that's ok.
+          // Try next one.
+          continue;
+        }
       }
+      file_reader.reset(new SequentialFileReader(std::move(file)));
     }
 
     // Create the log reader.
@@ -990,21 +1096,56 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
     reporter.env = env_;
     reporter.info_log = db_options_.info_log.get();
     reporter.fname = fname.c_str();
-    reporter.status = (db_options_.paranoid_checks) ? &status : nullptr;
+    if (!db_options_.paranoid_checks ||
+        db_options_.wal_recovery_mode ==
+            WALRecoveryMode::kSkipAnyCorruptedRecords) {
+      reporter.status = nullptr;
+    } else {
+      reporter.status = &status;
+    }
     // We intentionally make log::Reader do checksumming even if
     // paranoid_checks==false so that corruptions cause entire commits
     // to be skipped instead of propagating bad information (like overly
     // large sequence numbers).
-    log::Reader reader(std::move(file), &reporter, true /*checksum*/,
+    log::Reader reader(std::move(file_reader), &reporter, true /*checksum*/,
                        0 /*initial_offset*/);
-    Log(InfoLogLevel::INFO_LEVEL,
-        db_options_.info_log, "Recovering log #%" PRIu64 "", log_number);
+    Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
+        "Recovering log #%" PRIu64 " mode %d skip-recovery %d", log_number,
+        db_options_.wal_recovery_mode, !continue_replay_log);
+
+    // Determine if we should tolerate incomplete records at the tail end of the
+    // log
+    bool report_eof_inconsistency;
+    if (db_options_.wal_recovery_mode ==
+        WALRecoveryMode::kAbsoluteConsistency) {
+      // in clean shutdown we don't expect any error in the log files
+      report_eof_inconsistency = true;
+    } else {
+      // for other modes ignore only incomplete records in the last log file
+      // which is presumably due to a write in progress during restart
+      report_eof_inconsistency = false;
+
+      // TODO krad: Evaluate if we need to move to a more strict mode where we
+      // restrict the inconsistency to only the last log
+    }
 
     // Read all the records and add to a memtable
     std::string scratch;
     Slice record;
     WriteBatch batch;
-    while (reader.ReadRecord(&record, &scratch) && status.ok()) {
+
+    if (!continue_replay_log) {
+      uint64_t bytes;
+      if (env_->GetFileSize(fname, &bytes).ok()) {
+        auto info_log = db_options_.info_log.get();
+        Log(InfoLogLevel::WARN_LEVEL, info_log, "%s: dropping %d bytes",
+            fname.c_str(), static_cast<int>(bytes));
+      }
+    }
+
+    while (continue_replay_log &&
+           reader.ReadRecord(&record, &scratch, report_eof_inconsistency) &&
+           status.ok()) {
       if (record.size() < 12) {
         reporter.Corruption(record.size(),
                             Status::Corruption("log record too small"));
@@ -1022,11 +1163,15 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
 
       MaybeIgnoreError(&status);
       if (!status.ok()) {
-        return status;
+        // We are treating this as a failure while reading since we read valid
+        // blocks that do not form coherent data
+        reporter.Corruption(record.size(), status);
+        continue;
       }
+
       const SequenceNumber last_seq = WriteBatchInternal::Sequence(&batch) +
                                       WriteBatchInternal::Count(&batch) - 1;
-      if (last_seq > *max_sequence) {
+      if ((*max_sequence == kMaxSequenceNumber) || (last_seq > *max_sequence)) {
         *max_sequence = last_seq;
       }
 
@@ -1049,17 +1194,39 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
             // file-systems cause the DB::Open() to fail.
             return status;
           }
-          cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions());
+
+          cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(),
+                                 *max_sequence);
         }
       }
     }
 
     if (!status.ok()) {
-      return status;
+      if (db_options_.wal_recovery_mode ==
+             WALRecoveryMode::kSkipAnyCorruptedRecords) {
+        // We should ignore all errors unconditionally
+        status = Status::OK();
+      } else if (db_options_.wal_recovery_mode ==
+                 WALRecoveryMode::kPointInTimeRecovery) {
+        // We should ignore the error but not continue replaying
+        status = Status::OK();
+        continue_replay_log = false;
+
+        Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
+            "Point in time recovered to log #%" PRIu64 " seq #%" PRIu64,
+            log_number, *max_sequence);
+      } else {
+        assert(db_options_.wal_recovery_mode ==
+                  WALRecoveryMode::kTolerateCorruptedTailRecords
+               || db_options_.wal_recovery_mode ==
+                  WALRecoveryMode::kAbsoluteConsistency);
+        return status;
+      }
     }
 
     flush_scheduler_.Clear();
-    if (versions_->LastSequence() < *max_sequence) {
+    if ((*max_sequence != kMaxSequenceNumber) &&
+        (versions_->LastSequence() < *max_sequence)) {
       versions_->SetLastSequence(*max_sequence);
     }
   }
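[Editor's note] The mode dispatch above is driven by the new DBOptions::wal_recovery_mode setting (kTolerateCorruptedTailRecords, kAbsoluteConsistency, kPointInTimeRecovery, kSkipAnyCorruptedRecords). A hedged sketch of selecting point-in-time recovery at open time; the database path is a placeholder:

    rocksdb::Options options;
    options.create_if_missing = true;
    // Replay the WAL up to the first corruption and stop there, instead of
    // failing the open (kAbsoluteConsistency) or skipping past bad records
    // (kSkipAnyCorruptedRecords).
    options.wal_recovery_mode = rocksdb::WALRecoveryMode::kPointInTimeRecovery;
    rocksdb::DB* db = nullptr;
    rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/testdb", &db);
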
@@ -1090,7 +1257,9 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
           // Recovery failed
           break;
         }
-        cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions());
+
+        cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(),
+                               *max_sequence);
       }
 
       // write MANIFEST with update
@@ -1135,9 +1304,6 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
   TableProperties table_properties;
   {
     ScopedArenaIterator iter(mem->NewIterator(ro, &arena));
-    const SequenceNumber newest_snapshot = snapshots_.GetNewest();
-    const SequenceNumber earliest_seqno_in_memtable =
-        mem->GetFirstSequenceNumber();
     Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log,
         "[%s] [WriteLevel0TableForRecovery]"
         " Level-0 table #%" PRIu64 ": started",
@@ -1147,30 +1313,36 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
         cfd->GetLatestMutableCFOptions()->paranoid_file_checks;
     {
       mutex_.Unlock();
+      TableFileCreationInfo info;
       s = BuildTable(
           dbname_, env_, *cfd->ioptions(), env_options_, cfd->table_cache(),
           iter.get(), &meta, cfd->internal_comparator(),
-          cfd->int_tbl_prop_collector_factories(), newest_snapshot,
-          earliest_seqno_in_memtable, GetCompressionFlush(*cfd->ioptions()),
-          cfd->ioptions()->compression_opts, paranoid_file_checks, Env::IO_HIGH,
-          &table_properties);
+          cfd->int_tbl_prop_collector_factories(), snapshots_.GetAll(),
+          GetCompressionFlush(*cfd->ioptions()),
+          cfd->ioptions()->compression_opts, paranoid_file_checks,
+          cfd->internal_stats(), Env::IO_HIGH, &info.table_properties);
       LogFlush(db_options_.info_log);
+      Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log,
+          "[%s] [WriteLevel0TableForRecovery]"
+          " Level-0 table #%" PRIu64 ": %" PRIu64 " bytes %s",
+          cfd->GetName().c_str(), meta.fd.GetNumber(), meta.fd.GetFileSize(),
+          s.ToString().c_str());
+
+      // output to event logger
+      if (s.ok()) {
+        info.db_name = dbname_;
+        info.cf_name = cfd->GetName();
+        info.file_path = TableFileName(db_options_.db_paths,
+                                       meta.fd.GetNumber(),
+                                       meta.fd.GetPathId());
+        info.file_size = meta.fd.GetFileSize();
+        info.job_id = job_id;
+        EventHelpers::LogAndNotifyTableFileCreation(
+            &event_logger_, db_options_.listeners, meta.fd, info);
+      }
       mutex_.Lock();
     }
   }
-  Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log,
-      "[%s] [WriteLevel0TableForRecovery]"
-      " Level-0 table #%" PRIu64 ": %" PRIu64 " bytes %s",
-      cfd->GetName().c_str(), meta.fd.GetNumber(), meta.fd.GetFileSize(),
-      s.ToString().c_str());
-
-  // output to event logger
-  if (s.ok()) {
-    EventLoggerHelpers::LogTableFileCreation(
-        &event_logger_, job_id, meta.fd.GetNumber(), meta.fd.GetFileSize(),
-        table_properties);
-  }
-
   ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem);
 
   // Note that if file_size is zero, the file has been deleted and
@@ -1179,13 +1351,14 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
   if (s.ok() && meta.fd.GetFileSize() > 0) {
     edit->AddFile(level, meta.fd.GetNumber(), meta.fd.GetPathId(),
                   meta.fd.GetFileSize(), meta.smallest, meta.largest,
-                  meta.smallest_seqno, meta.largest_seqno);
+                  meta.smallest_seqno, meta.largest_seqno,
+                  meta.marked_for_compaction);
   }
 
   InternalStats::CompactionStats stats(1);
   stats.micros = env_->NowMicros() - start_micros;
   stats.bytes_written = meta.fd.GetFileSize();
-  stats.files_out_levelnp1 = 1;
+  stats.num_output_files = 1;
   cfd->internal_stats()->AddCompactionStats(level, stats);
   cfd->internal_stats()->AddCFStats(
       InternalStats::BYTES_FLUSHED, meta.fd.GetFileSize());
@@ -1195,40 +1368,37 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
 
 Status DBImpl::FlushMemTableToOutputFile(
     ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
-    bool* madeProgress, JobContext* job_context, LogBuffer* log_buffer) {
+    bool* made_progress, JobContext* job_context, LogBuffer* log_buffer) {
   mutex_.AssertHeld();
-  assert(cfd->imm()->size() != 0);
+  assert(cfd->imm()->NumNotFlushed() != 0);
   assert(cfd->imm()->IsFlushPending());
 
   FlushJob flush_job(dbname_, cfd, db_options_, mutable_cf_options,
                      env_options_, versions_.get(), &mutex_, &shutting_down_,
-                     snapshots_.GetNewest(), job_context, log_buffer,
+                     snapshots_.GetAll(), job_context, log_buffer,
                      directories_.GetDbDir(), directories_.GetDataDir(0U),
                      GetCompressionFlush(*cfd->ioptions()), stats_,
                      &event_logger_);
 
-  uint64_t file_number;
-  Status s = flush_job.Run(&file_number);
+  FileMetaData file_meta;
+
+  // Within flush_job.Run, rocksdb may call the event listeners to notify
+  // them of file creation and deletion.
+  //
+  // Note that flush_job.Run will unlock and lock the db_mutex, and the
+  // EventListener callbacks will be called while the db_mutex is
+  // unlocked by the current thread.
+  Status s = flush_job.Run(&file_meta);
 
   if (s.ok()) {
-    InstallSuperVersionBackground(cfd, job_context, mutable_cf_options);
-    if (madeProgress) {
-      *madeProgress = 1;
+    InstallSuperVersionAndScheduleWorkWrapper(cfd, job_context,
+                                              mutable_cf_options);
+    if (made_progress) {
+      *made_progress = 1;
     }
     VersionStorageInfo::LevelSummaryStorage tmp;
     LogToBuffer(log_buffer, "[%s] Level summary: %s\n", cfd->GetName().c_str(),
                 cfd->current()->storage_info()->LevelSummary(&tmp));
-
-    if (disable_delete_obsolete_files_ == 0) {
-      // add to deletion state
-      while (alive_log_files_.size() &&
-             alive_log_files_.begin()->number < versions_->MinLogNumber()) {
-        const auto& earliest = *alive_log_files_.begin();
-        job_context->log_delete_files.push_back(earliest.number);
-        total_log_size_ -= earliest.size;
-        alive_log_files_.pop_front();
-      }
-    }
   }
 
   if (!s.ok() && !s.IsShutdownInProgress() && db_options_.paranoid_checks &&
@@ -1241,50 +1411,59 @@ Status DBImpl::FlushMemTableToOutputFile(
 #ifndef ROCKSDB_LITE
   if (s.ok()) {
     // may temporarily unlock and lock the mutex.
-    NotifyOnFlushCompleted(cfd, file_number, mutable_cf_options);
+    NotifyOnFlushCompleted(cfd, &file_meta, mutable_cf_options,
+                           job_context->job_id);
   }
 #endif  // ROCKSDB_LITE
   return s;
 }
 
 void DBImpl::NotifyOnFlushCompleted(
-    ColumnFamilyData* cfd, uint64_t file_number,
-    const MutableCFOptions& mutable_cf_options) {
+    ColumnFamilyData* cfd, FileMetaData* file_meta,
+    const MutableCFOptions& mutable_cf_options, int job_id) {
 #ifndef ROCKSDB_LITE
-  if (cfd->ioptions()->listeners.size() == 0U) {
+  if (db_options_.listeners.size() == 0U) {
     return;
   }
   mutex_.AssertHeld();
   if (shutting_down_.load(std::memory_order_acquire)) {
     return;
   }
-  bool triggered_flush_slowdown =
+  bool triggered_writes_slowdown =
       (cfd->current()->storage_info()->NumLevelFiles(0) >=
        mutable_cf_options.level0_slowdown_writes_trigger);
-  bool triggered_flush_stop =
+  bool triggered_writes_stop =
       (cfd->current()->storage_info()->NumLevelFiles(0) >=
        mutable_cf_options.level0_stop_writes_trigger);
-  notifying_events_++;
   // release lock while notifying events
   mutex_.Unlock();
-  // TODO(yhchiang): make db_paths dynamic.
-  cfd->NotifyOnFlushCompleted(
-        this, MakeTableFileName(db_options_.db_paths[0].path, file_number),
-        triggered_flush_slowdown,
-        triggered_flush_stop);
+  {
+    FlushJobInfo info;
+    info.cf_name = cfd->GetName();
+    // TODO(yhchiang): make db_paths dynamic in case flush does not
+    //                 go to L0 in the future.
+    info.file_path = MakeTableFileName(db_options_.db_paths[0].path,
+                                       file_meta->fd.GetNumber());
+    info.thread_id = env_->GetThreadID();
+    info.job_id = job_id;
+    info.triggered_writes_slowdown = triggered_writes_slowdown;
+    info.triggered_writes_stop = triggered_writes_stop;
+    info.smallest_seqno = file_meta->smallest_seqno;
+    info.largest_seqno = file_meta->largest_seqno;
+    for (auto listener : db_options_.listeners) {
+      listener->OnFlushCompleted(this, info);
+    }
+  }
   mutex_.Lock();
-  notifying_events_--;
-  assert(notifying_events_ >= 0);
   // no need to signal bg_cv_ as it will be signaled at the end of the
   // flush process.
 #endif  // ROCKSDB_LITE
 }
 
-Status DBImpl::CompactRange(ColumnFamilyHandle* column_family,
-                            const Slice* begin, const Slice* end,
-                            bool reduce_level, int target_level,
-                            uint32_t target_path_id) {
-  if (target_path_id >= db_options_.db_paths.size()) {
+Status DBImpl::CompactRange(const CompactRangeOptions& options,
+                            ColumnFamilyHandle* column_family,
+                            const Slice* begin, const Slice* end) {
+  if (options.target_path_id >= db_options_.db_paths.size()) {
     return Status::InvalidArgument("Invalid target path ID");
   }
 
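[Editor's note] The rewritten notification path builds a FlushJobInfo and fans it out to the listeners registered in DBOptions::listeners (they previously lived in the per-column-family options). A sketch of a listener that would receive this callback; the logging body is illustrative only:

    class FlushLogger : public rocksdb::EventListener {
     public:
      void OnFlushCompleted(rocksdb::DB* db,
                            const rocksdb::FlushJobInfo& info) override {
        // Invoked while the db mutex is released, per the code above.
        fprintf(stderr, "flush done: cf=%s file=%s job=%d\n",
                info.cf_name.c_str(), info.file_path.c_str(), info.job_id);
      }
    };
    // Registration (hypothetical setup code):
    //   options.listeners.push_back(std::make_shared<FlushLogger>());
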
@@ -1309,35 +1488,56 @@ Status DBImpl::CompactRange(ColumnFamilyHandle* column_family,
     }
   }
 
+  int final_output_level = 0;
   if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal &&
       cfd->NumberLevels() > 1) {
     // Always compact all files together.
     s = RunManualCompaction(cfd, ColumnFamilyData::kCompactAllLevels,
-                            cfd->NumberLevels() - 1, target_path_id, begin,
-                            end);
+                            cfd->NumberLevels() - 1, options.target_path_id,
+                            begin, end);
+    final_output_level = cfd->NumberLevels() - 1;
   } else {
     for (int level = 0; level <= max_level_with_files; level++) {
-      // in case the compaction is unversal or if we're compacting the
+      int output_level;
+      // in case the compaction is universal or if we're compacting the
       // bottom-most level, the output level will be the same as input one.
       // level 0 can never be the bottommost level (i.e. if all files are in
       // level 0, we will compact to level 1)
       if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal ||
-          cfd->ioptions()->compaction_style == kCompactionStyleFIFO ||
-          (level == max_level_with_files && level > 0)) {
-        s = RunManualCompaction(cfd, level, level, target_path_id, begin, end);
+          cfd->ioptions()->compaction_style == kCompactionStyleFIFO) {
+        output_level = level;
+      } else if (level == max_level_with_files && level > 0) {
+        if (options.bottommost_level_compaction ==
+            BottommostLevelCompaction::kSkip) {
+          // Skip bottommost level compaction
+          continue;
+        } else if (options.bottommost_level_compaction ==
+                       BottommostLevelCompaction::kIfHaveCompactionFilter &&
+                   cfd->ioptions()->compaction_filter == nullptr &&
+                   cfd->ioptions()->compaction_filter_factory == nullptr) {
+          // Skip bottommost level compaction since we don't have a compaction
+          // filter
+          continue;
+        }
+        output_level = level;
       } else {
-        int output_level = level + 1;
+        output_level = level + 1;
         if (cfd->ioptions()->compaction_style == kCompactionStyleLevel &&
             cfd->ioptions()->level_compaction_dynamic_level_bytes &&
             level == 0) {
           output_level = ColumnFamilyData::kCompactToBaseLevel;
         }
-        s = RunManualCompaction(cfd, level, output_level, target_path_id, begin,
-                                end);
       }
+      s = RunManualCompaction(cfd, level, output_level, options.target_path_id,
+                              begin, end);
       if (!s.ok()) {
         break;
       }
+      if (output_level == ColumnFamilyData::kCompactToBaseLevel) {
+        final_output_level = cfd->NumberLevels() - 1;
+      } else if (output_level > final_output_level) {
+        final_output_level = output_level;
+      }
       TEST_SYNC_POINT("DBImpl::RunManualCompaction()::1");
       TEST_SYNC_POINT("DBImpl::RunManualCompaction()::2");
     }
@@ -1347,8 +1547,14 @@ Status DBImpl::CompactRange(ColumnFamilyHandle* column_family,
     return s;
   }
 
-  if (reduce_level) {
-    s = ReFitLevel(cfd, max_level_with_files, target_level);
+  if (options.change_level) {
+    Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
+        "[RefitLevel] waiting for background threads to stop");
+    s = PauseBackgroundWork();
+    if (s.ok()) {
+      s = ReFitLevel(cfd, final_output_level, options.target_level);
+    }
+    ContinueBackgroundWork();
   }
   LogFlush(db_options_.info_log);
 
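[Editor's note] CompactRange() now takes a CompactRangeOptions struct in place of the old (reduce_level, target_level, target_path_id) arguments, and change_level pauses background work around the refit. A sketch of the equivalent call under the new API; level 1 is an arbitrary example target:

    rocksdb::CompactRangeOptions cro;
    cro.change_level = true;   // replaces the old reduce_level flag
    cro.target_level = 1;
    cro.bottommost_level_compaction =
        rocksdb::BottommostLevelCompaction::kForce;
    // nullptr begin/end compacts the whole key range.
    rocksdb::Status s = db->CompactRange(cro, nullptr, nullptr);
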
@@ -1402,7 +1608,7 @@ Status DBImpl::CompactFiles(
     // FindObsoleteFiles(). This is because job_context does not
     // catch all created files if compaction failed.
     FindObsoleteFiles(&job_context, !s.ok());
-  }
+  }  // release the mutex
 
   // delete unnecessary files if any, this is done outside the mutex
   if (job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) {
@@ -1413,6 +1619,7 @@ Status DBImpl::CompactFiles(
     // It also applies to access other states that DB owns.
     log_buffer.FlushBufferToLog();
     if (job_context.HaveSomethingToDelete()) {
+      // no mutex is locked here.  No need to Unlock() and Lock() here.
       PurgeObsoleteFiles(job_context);
     }
     job_context.Clean();
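[Editor's note] With the mutex already released at this point, PurgeObsoleteFiles() can run directly, as the comment added above notes. For context, a hedged sketch of the CompactFiles() entry point whose cleanup this hunk adjusts; the input file names are hypothetical and would normally come from GetColumnFamilyMetaData():

    rocksdb::CompactionOptions copt;
    std::vector<std::string> inputs = {"000012.sst", "000017.sst"};  // hypothetical
    rocksdb::Status s = db->CompactFiles(copt, inputs, /*output_level=*/1);
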
@@ -1486,29 +1693,39 @@ Status DBImpl::CompactFilesImpl(
   assert(c);
   c->SetInputVersion(version);
   // deletion compaction currently not allowed in CompactFiles.
-  assert(!c->IsDeletionCompaction());
+  assert(!c->deletion_compaction());
 
-  auto yield_callback = [&]() {
-    return CallFlushDuringCompaction(
-        c->column_family_data(), *c->mutable_cf_options(),
-        job_context, log_buffer);
-  };
   assert(is_snapshot_supported_ || snapshots_.empty());
   CompactionJob compaction_job(
       job_context->job_id, c.get(), db_options_, env_options_, versions_.get(),
       &shutting_down_, log_buffer, directories_.GetDbDir(),
-      directories_.GetDataDir(c->GetOutputPathId()), stats_,
-      snapshots_.GetAll(), table_cache_, std::move(yield_callback),
-      &event_logger_, c->mutable_cf_options()->paranoid_file_checks);
+      directories_.GetDataDir(c->output_path_id()), stats_, snapshots_.GetAll(),
+      table_cache_, &event_logger_,
+      c->mutable_cf_options()->paranoid_file_checks,
+      c->mutable_cf_options()->compaction_measure_io_stats, dbname_,
+      nullptr);  // Here we pass a nullptr for CompactionJobStats because
+                 // CompactFiles does not trigger OnCompactionCompleted(),
+                 // which is the only place where CompactionJobStats is
+                 // returned.  The idea of not triggering OnCompactionCompleted()
+                 // is that CompactFiles runs in the caller thread, so the user
+                 // should always know when it completes.  As a result, it makes
+                 // less sense to notify the users something they should already
+                 // know.
+                 //
+                 // In the future, if we would like to add CompactionJobStats
+                 // support for CompactFiles, we should have CompactFiles API
+                 // pass a pointer of CompactionJobStats as the out-value
+                 // instead of using EventListener.
   compaction_job.Prepare();
 
   mutex_.Unlock();
-  Status status = compaction_job.Run();
+  compaction_job.Run();
   mutex_.Lock();
-  compaction_job.Install(&status, *c->mutable_cf_options(), &mutex_);
+
+  Status status = compaction_job.Install(*c->mutable_cf_options(), &mutex_);
   if (status.ok()) {
-    InstallSuperVersionBackground(c->column_family_data(), job_context,
-                                  *c->mutable_cf_options());
+    InstallSuperVersionAndScheduleWorkWrapper(
+        c->column_family_data(), job_context, *c->mutable_cf_options());
   }
   c->ReleaseCompactionFiles(s);
   c.reset();
@@ -1528,28 +1745,75 @@ Status DBImpl::CompactFilesImpl(
   }
 
   bg_compaction_scheduled_--;
+  if (bg_compaction_scheduled_ == 0) {
+    bg_cv_.SignalAll();
+  }
 
   return status;
 }
 #endif  // ROCKSDB_LITE
 
+Status DBImpl::PauseBackgroundWork() {
+  InstrumentedMutexLock guard_lock(&mutex_);
+  bg_work_paused_++;
+  while (bg_compaction_scheduled_ > 0 || bg_flush_scheduled_ > 0) {
+    bg_cv_.Wait();
+  }
+  return Status::OK();
+}
+
+Status DBImpl::ContinueBackgroundWork() {
+  InstrumentedMutexLock guard_lock(&mutex_);
+  assert(bg_work_paused_ > 0);
+  bg_work_paused_--;
+  if (bg_work_paused_ == 0) {
+    MaybeScheduleFlushOrCompaction();
+  }
+  return Status::OK();
+}
+
 void DBImpl::NotifyOnCompactionCompleted(
-    ColumnFamilyData* cfd, Compaction *c, const Status &st) {
+    ColumnFamilyData* cfd, Compaction *c, const Status &st,
+    const CompactionJobStats& compaction_job_stats,
+    const int job_id) {
 #ifndef ROCKSDB_LITE
-  if (cfd->ioptions()->listeners.size() == 0U) {
+  if (db_options_.listeners.size() == 0U) {
     return;
   }
   mutex_.AssertHeld();
   if (shutting_down_.load(std::memory_order_acquire)) {
     return;
   }
-  notifying_events_++;
   // release lock while notifying events
   mutex_.Unlock();
-  cfd->NotifyOnCompactionCompleted(this, c, st);
+  {
+    CompactionJobInfo info;
+    info.cf_name = cfd->GetName();
+    info.status = st;
+    info.thread_id = env_->GetThreadID();
+    info.job_id = job_id;
+    info.base_input_level = c->start_level();
+    info.output_level = c->output_level();
+    info.stats = compaction_job_stats;
+    for (size_t i = 0; i < c->num_input_levels(); ++i) {
+      for (const auto fmd : *c->inputs(i)) {
+        info.input_files.push_back(
+            TableFileName(db_options_.db_paths,
+                          fmd->fd.GetNumber(),
+                          fmd->fd.GetPathId()));
+      }
+    }
+    for (const auto newf : c->edit()->GetNewFiles()) {
+      info.output_files.push_back(
+          TableFileName(db_options_.db_paths,
+                        newf.second.fd.GetNumber(),
+                        newf.second.fd.GetPathId()));
+    }
+    for (auto listener : db_options_.listeners) {
+      listener->OnCompactionCompleted(this, info);
+    }
+  }
   mutex_.Lock();
-  notifying_events_--;
-  assert(notifying_events_ >= 0);
   // no need to signal bg_cv_ as it will be signaled at the end of the
   // flush process.
 #endif  // ROCKSDB_LITE
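[Editor's note] PauseBackgroundWork()/ContinueBackgroundWork() are reference counted: each pause bumps bg_work_paused_ and waits for scheduled jobs to drain, and MaybeScheduleFlushOrCompaction() runs again only once the count drops back to zero. A sketch of the intended pairing, assuming the pair is reachable from the DB handle the same way CompactRange() uses it above:

    rocksdb::Status s = db->PauseBackgroundWork();
    if (s.ok()) {
      // ... work that must not race with flushes or compactions ...
      db->ContinueBackgroundWork();
    }
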
@@ -1594,6 +1858,7 @@ Status DBImpl::SetOptions(ColumnFamilyHandle* column_family,
     Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log,
         "[%s] SetOptions failed", cfd->GetName().c_str());
   }
+  LogFlush(db_options_.info_log);
   return s;
 #endif  // ROCKSDB_LITE
 }
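[Editor's note] The added LogFlush() makes the outcome of a dynamic option change visible in the info log immediately. For reference, a sketch of the SetOptions() call this path serves; the option name and value are just an illustration:

    rocksdb::Status s = db->SetOptions(
        db->DefaultColumnFamily(),
        {{"disable_auto_compactions", "true"}});
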
@@ -1617,66 +1882,72 @@ int DBImpl::FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd,
   return minimum_level;
 }
 
+// REQUIREMENT: block all background work by calling PauseBackgroundWork()
+// before calling this function
 Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) {
   assert(level < cfd->NumberLevels());
+  if (target_level >= cfd->NumberLevels()) {
+    return Status::InvalidArgument("Target level exceeds number of levels");
+  }
 
-  SuperVersion* superversion_to_free = nullptr;
-  SuperVersion* new_superversion = new SuperVersion();
+  std::unique_ptr<SuperVersion> superversion_to_free;
+  std::unique_ptr<SuperVersion> new_superversion(new SuperVersion());
 
-  mutex_.Lock();
+  Status status;
+
+  InstrumentedMutexLock guard_lock(&mutex_);
 
   // only allow one thread refitting
   if (refitting_level_) {
-    mutex_.Unlock();
     Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
         "[ReFitLevel] another thread is refitting");
-    delete new_superversion;
     return Status::NotSupported("another thread is refitting");
   }
   refitting_level_ = true;
 
-  // wait for all background threads to stop
-  bg_work_gate_closed_ = true;
-  while (bg_compaction_scheduled_ > 0 || bg_flush_scheduled_) {
-    Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
-        "[RefitLevel] waiting for background threads to stop: %d %d",
-        bg_compaction_scheduled_, bg_flush_scheduled_);
-    bg_cv_.Wait();
-  }
-
-  const MutableCFOptions mutable_cf_options =
-    *cfd->GetLatestMutableCFOptions();
+  const MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions();
   // move to a smaller level
   int to_level = target_level;
   if (target_level < 0) {
     to_level = FindMinimumEmptyLevelFitting(cfd, mutable_cf_options, level);
   }
 
-  assert(to_level <= level);
-
-  Status status;
-  if (to_level < level) {
+  auto* vstorage = cfd->current()->storage_info();
+  if (to_level > level) {
+    if (level == 0) {
+      return Status::NotSupported(
+          "Cannot change from level 0 to other levels.");
+    }
+    // Check levels are empty for a trivial move
+    for (int l = level + 1; l <= to_level; l++) {
+      if (vstorage->NumLevelFiles(l) > 0) {
+        return Status::NotSupported(
+            "Levels between source and target are not empty for a move.");
+      }
+    }
+  }
+  if (to_level != level) {
     Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log,
-        "[%s] Before refitting:\n%s",
-        cfd->GetName().c_str(), cfd->current()->DebugString().data());
+        "[%s] Before refitting:\n%s", cfd->GetName().c_str(),
+        cfd->current()->DebugString().data());
 
     VersionEdit edit;
     edit.SetColumnFamily(cfd->GetID());
-    for (const auto& f : cfd->current()->storage_info()->LevelFiles(level)) {
+    for (const auto& f : vstorage->LevelFiles(level)) {
       edit.DeleteFile(level, f->fd.GetNumber());
       edit.AddFile(to_level, f->fd.GetNumber(), f->fd.GetPathId(),
                    f->fd.GetFileSize(), f->smallest, f->largest,
-                   f->smallest_seqno, f->largest_seqno);
+                   f->smallest_seqno, f->largest_seqno,
+                   f->marked_for_compaction);
     }
     Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log,
-        "[%s] Apply version edit:\n%s",
-        cfd->GetName().c_str(), edit.DebugString().data());
+        "[%s] Apply version edit:\n%s", cfd->GetName().c_str(),
+        edit.DebugString().data());
 
     status = versions_->LogAndApply(cfd, mutable_cf_options, &edit, &mutex_,
                                     directories_.GetDbDir());
-    superversion_to_free = InstallSuperVersion(
-        cfd, new_superversion, mutable_cf_options);
-    new_superversion = nullptr;
+    superversion_to_free.reset(InstallSuperVersionAndScheduleWork(
+        cfd, new_superversion.release(), mutable_cf_options));
 
     Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log,
         "[%s] LogAndApply: %s\n", cfd->GetName().c_str(),
@@ -1684,17 +1955,13 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) {
 
     if (status.ok()) {
       Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log,
-          "[%s] After refitting:\n%s",
-          cfd->GetName().c_str(), cfd->current()->DebugString().data());
+          "[%s] After refitting:\n%s", cfd->GetName().c_str(),
+          cfd->current()->DebugString().data());
     }
   }
 
   refitting_level_ = false;
-  bg_work_gate_closed_ = false;
 
-  mutex_.Unlock();
-  delete superversion_to_free;
-  delete new_superversion;
   return status;
 }
 
@@ -1704,10 +1971,7 @@ int DBImpl::NumberLevels(ColumnFamilyHandle* column_family) {
 }
 
 int DBImpl::MaxMemCompactionLevel(ColumnFamilyHandle* column_family) {
-  auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
-  InstrumentedMutexLock l(&mutex_);
-  return cfh->cfd()->GetSuperVersion()->
-      mutable_cf_options.max_mem_compaction_level;
+  return 0;
 }
 
 int DBImpl::Level0StopWriteTrigger(ColumnFamilyHandle* column_family) {
@@ -1723,13 +1987,95 @@ Status DBImpl::Flush(const FlushOptions& flush_options,
   return FlushMemTable(cfh->cfd(), flush_options);
 }
 
+Status DBImpl::SyncWAL() {
+  autovector<log::Writer*, 1> logs_to_sync;
+  bool need_log_dir_sync;
+  uint64_t current_log_number;
+
+  {
+    InstrumentedMutexLock l(&mutex_);
+    assert(!logs_.empty());
+
+    // This SyncWAL() call only cares about logs up to this number.
+    current_log_number = logfile_number_;
+
+    while (logs_.front().number <= current_log_number &&
+           logs_.front().getting_synced) {
+      log_sync_cv_.Wait();
+    }
+    // First check that logs are safe to sync in background.
+    for (auto it = logs_.begin();
+         it != logs_.end() && it->number <= current_log_number; ++it) {
+      if (!it->writer->file()->writable_file()->IsSyncThreadSafe()) {
+        return Status::NotSupported(
+          "SyncWAL() is not supported for this implementation of WAL file",
+          db_options_.allow_mmap_writes
+            ? "try setting Options::allow_mmap_writes to false"
+            : Slice());
+      }
+    }
+    for (auto it = logs_.begin();
+         it != logs_.end() && it->number <= current_log_number; ++it) {
+      auto& log = *it;
+      assert(!log.getting_synced);
+      log.getting_synced = true;
+      logs_to_sync.push_back(log.writer);
+    }
+
+    need_log_dir_sync = !log_dir_synced_;
+  }
+
+  RecordTick(stats_, WAL_FILE_SYNCED);
+  Status status;
+  for (log::Writer* log : logs_to_sync) {
+    status = log->file()->SyncWithoutFlush(db_options_.use_fsync);
+    if (!status.ok()) {
+      break;
+    }
+  }
+  if (status.ok() && need_log_dir_sync) {
+    status = directories_.GetWalDir()->Fsync();
+  }
+
+  {
+    InstrumentedMutexLock l(&mutex_);
+    MarkLogsSynced(current_log_number, need_log_dir_sync, status);
+  }
+
+  return status;
+}
+
+void DBImpl::MarkLogsSynced(
+    uint64_t up_to, bool synced_dir, const Status& status) {
+  mutex_.AssertHeld();
+  if (synced_dir &&
+      logfile_number_ == up_to &&
+      status.ok()) {
+    log_dir_synced_ = true;
+  }
+  for (auto it = logs_.begin(); it != logs_.end() && it->number <= up_to;) {
+    auto& log = *it;
+    assert(log.getting_synced);
+    if (status.ok() && logs_.size() > 1) {
+      logs_to_free_.push_back(log.ReleaseWriter());
+      it = logs_.erase(it);
+    } else {
+      log.getting_synced = false;
+      ++it;
+    }
+  }
+  assert(logs_.empty() || (logs_.size() == 1 && !logs_[0].getting_synced));
+  log_sync_cv_.SignalAll();
+}
+
 SequenceNumber DBImpl::GetLatestSequenceNumber() const {
   return versions_->LastSequence();
 }
 
 Status DBImpl::RunManualCompaction(ColumnFamilyData* cfd, int input_level,
                                    int output_level, uint32_t output_path_id,
-                                   const Slice* begin, const Slice* end) {
+                                   const Slice* begin, const Slice* end,
+                                   bool disallow_trivial_move) {
   assert(input_level == ColumnFamilyData::kCompactAllLevels ||
          input_level >= 0);
 
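[Editor's note] SyncWAL() captures logfile_number_ under the mutex, syncs the captured writers outside it with SyncWithoutFlush(), and finally calls MarkLogsSynced(), which may retire all but the newest log. A sketch of combining unsynced writes with an explicit WAL sync; keys and values are placeholders:

    rocksdb::WriteOptions wo;
    wo.sync = false;  // append to the WAL without fsync-ing each write
    db->Put(wo, "key1", "value1");
    db->Put(wo, "key2", "value2");
    // Durably persist everything written to the WAL so far.
    rocksdb::Status s = db->SyncWAL();
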
@@ -1742,6 +2088,7 @@ Status DBImpl::RunManualCompaction(ColumnFamilyData* cfd, int input_level,
   manual.output_path_id = output_path_id;
   manual.done = false;
   manual.in_progress = false;
+  manual.disallow_trivial_move = disallow_trivial_move;
   // For universal compaction, we enforce every manual compaction to compact
   // all files.
   if (begin == nullptr ||
@@ -1818,19 +2165,18 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd,
     WriteContext context;
     InstrumentedMutexLock guard_lock(&mutex_);
 
-    if (cfd->imm()->size() == 0 && cfd->mem()->IsEmpty()) {
+    if (cfd->imm()->NumNotFlushed() == 0 && cfd->mem()->IsEmpty()) {
       // Nothing to flush
       return Status::OK();
     }
 
-    WriteThread::Writer w(&mutex_);
-    s = write_thread_.EnterWriteThread(&w, 0);
-    assert(s.ok() && !w.done);  // No timeout and nobody should do our job
+    WriteThread::Writer w;
+    write_thread_.EnterUnbatched(&w, &mutex_);
 
-    // SetNewMemtableAndNewLogFile() will release and reacquire mutex
+    // SwitchMemtable() will release and reacquire mutex
     // during execution
-    s = SetNewMemtableAndNewLogFile(cfd, &context);
-    write_thread_.ExitWriteThread(&w, &w, s);
+    s = SwitchMemtable(cfd, &context);
+    write_thread_.ExitUnbatched(&w);
 
     cfd->imm()->FlushRequested();
 
@@ -1850,7 +2196,10 @@ Status DBImpl::WaitForFlushMemTable(ColumnFamilyData* cfd) {
   Status s;
   // Wait until the compaction completes
   InstrumentedMutexLock l(&mutex_);
-  while (cfd->imm()->size() > 0 && bg_error_.ok()) {
+  while (cfd->imm()->NumNotFlushed() > 0 && bg_error_.ok()) {
+    if (shutting_down_.load(std::memory_order_acquire)) {
+      return Status::ShutdownInProgress();
+    }
     bg_cv_.Wait();
   }
   if (!bg_error_.ok()) {
@@ -1861,8 +2210,12 @@ Status DBImpl::WaitForFlushMemTable(ColumnFamilyData* cfd) {
 
 void DBImpl::MaybeScheduleFlushOrCompaction() {
   mutex_.AssertHeld();
-  if (bg_work_gate_closed_) {
-    // gate closed for background work
+  if (!opened_successfully_) {
+    // Compaction may introduce a data race with DB open
+    return;
+  }
+  if (bg_work_paused_ > 0) {
+    // we paused the background work
     return;
   } else if (shutting_down_.load(std::memory_order_acquire)) {
     // DB is being deleted; no more background compactions
@@ -1876,27 +2229,24 @@ void DBImpl::MaybeScheduleFlushOrCompaction() {
     env_->Schedule(&DBImpl::BGWorkFlush, this, Env::Priority::HIGH, this);
   }
 
+  // special case -- if max_background_flushes == 0, then schedule flush on a
+  // compaction thread
+  if (db_options_.max_background_flushes == 0) {
+    while (unscheduled_flushes_ > 0 &&
+           bg_flush_scheduled_ + bg_compaction_scheduled_ <
+               db_options_.max_background_compactions) {
+      unscheduled_flushes_--;
+      bg_flush_scheduled_++;
+      env_->Schedule(&DBImpl::BGWorkFlush, this, Env::Priority::LOW, this);
+    }
+  }
+
   if (bg_manual_only_) {
     // only manual compactions are allowed to run. don't schedule automatic
     // compactions
     return;
   }
 
-  if (db_options_.max_background_flushes == 0 &&
-      bg_compaction_scheduled_ < db_options_.max_background_compactions &&
-      unscheduled_flushes_ > 0) {
-    // special case where flush is executed by compaction thread
-    // (if max_background_flushes == 0).
-    // Compaction thread will execute all the flushes
-    unscheduled_flushes_ = 0;
-    if (unscheduled_compactions_ > 0) {
-      // bg compaction will execute one compaction
-      unscheduled_compactions_--;
-    }
-    bg_compaction_scheduled_++;
-    env_->Schedule(&DBImpl::BGWorkCompaction, this, Env::Priority::LOW, this);
-  }
-
   while (bg_compaction_scheduled_ < db_options_.max_background_compactions &&
          unscheduled_compactions_ > 0) {
     bg_compaction_scheduled_++;
@@ -1958,7 +2308,9 @@ void DBImpl::RecordFlushIOStats() {
 
 void DBImpl::BGWorkFlush(void* db) {
   IOSTATS_SET_THREAD_POOL_ID(Env::Priority::HIGH);
+  TEST_SYNC_POINT("DBImpl::BGWorkFlush");
   reinterpret_cast<DBImpl*>(db)->BackgroundCallFlush();
+  TEST_SYNC_POINT("DBImpl::BGWorkFlush:done");
 }
 
 void DBImpl::BGWorkCompaction(void* db) {
@@ -1967,7 +2319,7 @@ void DBImpl::BGWorkCompaction(void* db) {
   reinterpret_cast<DBImpl*>(db)->BackgroundCallCompaction();
 }
 
-Status DBImpl::BackgroundFlush(bool* madeProgress, JobContext* job_context,
+Status DBImpl::BackgroundFlush(bool* made_progress, JobContext* job_context,
                                LogBuffer* log_buffer) {
   mutex_.AssertHeld();
 
@@ -2008,7 +2360,7 @@ Status DBImpl::BackgroundFlush(bool* madeProgress, JobContext* job_context,
         cfd->GetName().c_str(),
         db_options_.max_background_flushes - bg_flush_scheduled_,
         db_options_.max_background_compactions - bg_compaction_scheduled_);
-    status = FlushMemTableToOutputFile(cfd, mutable_cf_options, madeProgress,
+    status = FlushMemTableToOutputFile(cfd, mutable_cf_options, made_progress,
                                        job_context, log_buffer);
     if (cfd->Unref()) {
       delete cfd;
@@ -2018,7 +2370,7 @@ Status DBImpl::BackgroundFlush(bool* madeProgress, JobContext* job_context,
 }
 
 void DBImpl::BackgroundCallFlush() {
-  bool madeProgress = false;
+  bool made_progress = false;
   JobContext job_context(next_job_id_.fetch_add(1), true);
   assert(bg_flush_scheduled_);
 
@@ -2029,7 +2381,7 @@ void DBImpl::BackgroundCallFlush() {
     auto pending_outputs_inserted_elem =
         CaptureCurrentFileNumberInPendingOutputs();
 
-    Status s = BackgroundFlush(&madeProgress, &job_context, &log_buffer);
+    Status s = BackgroundFlush(&made_progress, &job_context, &log_buffer);
     if (!s.ok() && !s.IsShutdownInProgress()) {
       // Wait a little bit before retrying background flush in
       // case this is an environmental problem and we do not want to
@@ -2051,10 +2403,6 @@ void DBImpl::BackgroundCallFlush() {
 
     ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem);
 
-    // We're just cleaning up for DB::Write()
-    job_context.logs_to_free = logs_to_free_;
-    logs_to_free_.clear();
-
     // If flush failed, we want to delete all temporary files that we might have
     // created. Thus, we force full scan in FindObsoleteFiles()
     FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress());
@@ -2087,7 +2435,7 @@ void DBImpl::BackgroundCallFlush() {
 }
 
 void DBImpl::BackgroundCallCompaction() {
-  bool madeProgress = false;
+  bool made_progress = false;
   JobContext job_context(next_job_id_.fetch_add(1), true);
 
   MaybeDumpStats();
@@ -2099,7 +2447,7 @@ void DBImpl::BackgroundCallCompaction() {
         CaptureCurrentFileNumberInPendingOutputs();
 
     assert(bg_compaction_scheduled_);
-    Status s = BackgroundCompaction(&madeProgress, &job_context, &log_buffer);
+    Status s = BackgroundCompaction(&made_progress, &job_context, &log_buffer);
     if (!s.ok() && !s.IsShutdownInProgress()) {
       // Wait a little bit before retrying background compaction in
       // case this is an environmental problem and we do not want to
@@ -2121,10 +2469,6 @@ void DBImpl::BackgroundCallCompaction() {
 
     ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem);
 
-    // We're just cleaning up for DB::Write()
-    job_context.logs_to_free = logs_to_free_;
-    logs_to_free_.clear();
-
     // If compaction failed, we want to delete all temporary files that we might
     // have created (they might not be all recorded in job_context in case of a
     // failure). Thus, we force full scan in FindObsoleteFiles()
@@ -2152,9 +2496,9 @@ void DBImpl::BackgroundCallCompaction() {
 
     // See if there's more work to be done
     MaybeScheduleFlushOrCompaction();
-    if (madeProgress || bg_compaction_scheduled_ == 0 || bg_manual_only_ > 0) {
+    if (made_progress || bg_compaction_scheduled_ == 0 || bg_manual_only_ > 0) {
       // signal if
-      // * madeProgress -- need to wakeup DelayWrite
+      // * made_progress -- need to wakeup DelayWrite
       // * bg_compaction_scheduled_ == 0 -- need to wakeup ~DBImpl
       // * bg_manual_only_ > 0 -- need to wakeup RunManualCompaction
       // If none of this is true, there is no need to signal since nobody is
@@ -2168,14 +2512,18 @@ void DBImpl::BackgroundCallCompaction() {
   }
 }
 
-Status DBImpl::BackgroundCompaction(bool* madeProgress, JobContext* job_context,
+Status DBImpl::BackgroundCompaction(bool* made_progress,
+                                    JobContext* job_context,
                                     LogBuffer* log_buffer) {
-  *madeProgress = false;
+  *made_progress = false;
   mutex_.AssertHeld();
 
   bool is_manual = (manual_compaction_ != nullptr) &&
                    (manual_compaction_->in_progress == false);
+  bool trivial_move_disallowed = is_manual &&
+                                 manual_compaction_->disallow_trivial_move;
 
+  CompactionJobStats compaction_job_stats;
   Status status = bg_error_;
   if (status.ok() && shutting_down_.load(std::memory_order_acquire)) {
     status = Status::ShutdownInProgress();
@@ -2200,35 +2548,6 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, JobContext* job_context,
     return Status::OK();
   }
 
-  // If there are no flush threads, then compaction thread needs to execute the
-  // flushes
-  if (db_options_.max_background_flushes == 0) {
-    // BackgroundFlush() will only execute a single flush. We keep calling it as
-    // long as there's more flushes to be done
-    while (!flush_queue_.empty()) {
-      LogToBuffer(
-          log_buffer,
-          "BackgroundCompaction calling BackgroundFlush. flush slots available "
-          "%d, compaction slots available %d",
-          db_options_.max_background_flushes - bg_flush_scheduled_,
-          db_options_.max_background_compactions - bg_compaction_scheduled_);
-      auto flush_status =
-          BackgroundFlush(madeProgress, job_context, log_buffer);
-      // the second condition will be false when a column family is dropped. we
-      // don't want to fail compaction because of that (because it might be a
-      // different column family)
-      if (!flush_status.ok() && !flush_status.IsShutdownInProgress()) {
-        if (is_manual) {
-          manual_compaction_->status = flush_status;
-          manual_compaction_->done = true;
-          manual_compaction_->in_progress = false;
-          manual_compaction_ = nullptr;
-        }
-        return flush_status;
-      }
-    }
-  }
-
   unique_ptr<Compaction> c;
   InternalKey manual_end_storage;
   InternalKey* manual_end = &manual_end_storage;
@@ -2289,10 +2608,10 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, JobContext* job_context,
                     c->inputs(0)->size());
         // There are three things that can change compaction score:
         // 1) When flush or compaction finish. This case is covered by
-        // InstallSuperVersion()
+        // InstallSuperVersionAndScheduleWork
         // 2) When MutableCFOptions changes. This case is also covered by
-        // InstallSuperVersion(), because this is when the new options take
-        // effect.
+        // InstallSuperVersionAndScheduleWork, because this is when the new
+        // options take effect.
         // 3) When we Pick a new compaction, we "remove" those files being
         // compacted from the calculation, which then influences compaction
         // score. Here we check if we need the new compaction even without the
@@ -2312,96 +2631,121 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, JobContext* job_context,
   if (!c) {
     // Nothing to do
     LogToBuffer(log_buffer, "Compaction nothing to do");
-  } else if (c->IsDeletionCompaction()) {
+  } else if (c->deletion_compaction()) {
     // TODO(icanadi) Do we want to honor snapshots here? i.e. not delete old
     // file if there is alive snapshot pointing to it
     assert(c->num_input_files(1) == 0);
     assert(c->level() == 0);
     assert(c->column_family_data()->ioptions()->compaction_style ==
            kCompactionStyleFIFO);
+
+    compaction_job_stats.num_input_files = c->num_input_files(0);
+
     for (const auto& f : *c->inputs(0)) {
       c->edit()->DeleteFile(c->level(), f->fd.GetNumber());
     }
     status = versions_->LogAndApply(c->column_family_data(),
                                     *c->mutable_cf_options(), c->edit(),
                                     &mutex_, directories_.GetDbDir());
-    InstallSuperVersionBackground(c->column_family_data(), job_context,
-                                  *c->mutable_cf_options());
+    InstallSuperVersionAndScheduleWorkWrapper(
+        c->column_family_data(), job_context, *c->mutable_cf_options());
     LogToBuffer(log_buffer, "[%s] Deleted %d files\n",
                 c->column_family_data()->GetName().c_str(),
                 c->num_input_files(0));
-    *madeProgress = true;
-  } else if (!is_manual && c->IsTrivialMove()) {
+    *made_progress = true;
+  } else if (!trivial_move_disallowed && c->IsTrivialMove()) {
     TEST_SYNC_POINT("DBImpl::BackgroundCompaction:TrivialMove");
     // Instrument for event update
     // TODO(yhchiang): add op details for showing trivial-move.
     ThreadStatusUtil::SetColumnFamily(c->column_family_data());
     ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION);
 
-    // Move file to next level
-    assert(c->num_input_files(0) == 1);
-    FileMetaData* f = c->input(0, 0);
-    c->edit()->DeleteFile(c->level(), f->fd.GetNumber());
-    c->edit()->AddFile(c->output_level(), f->fd.GetNumber(), f->fd.GetPathId(),
-                       f->fd.GetFileSize(), f->smallest, f->largest,
-                       f->smallest_seqno, f->largest_seqno);
+    compaction_job_stats.num_input_files = c->num_input_files(0);
+
+    // Move files to next level
+    int32_t moved_files = 0;
+    int64_t moved_bytes = 0;
+    for (unsigned int l = 0; l < c->num_input_levels(); l++) {
+      if (c->level(l) == c->output_level()) {
+        continue;
+      }
+      for (size_t i = 0; i < c->num_input_files(l); i++) {
+        FileMetaData* f = c->input(l, i);
+        c->edit()->DeleteFile(c->level(l), f->fd.GetNumber());
+        c->edit()->AddFile(c->output_level(), f->fd.GetNumber(),
+                           f->fd.GetPathId(), f->fd.GetFileSize(), f->smallest,
+                           f->largest, f->smallest_seqno, f->largest_seqno,
+                           f->marked_for_compaction);
+
+        LogToBuffer(log_buffer,
+                    "[%s] Moving #%" PRIu64 " to level-%d %" PRIu64 " bytes\n",
+                    c->column_family_data()->GetName().c_str(),
+                    f->fd.GetNumber(), c->output_level(), f->fd.GetFileSize());
+        ++moved_files;
+        moved_bytes += f->fd.GetFileSize();
+      }
+    }
+
     status = versions_->LogAndApply(c->column_family_data(),
                                     *c->mutable_cf_options(), c->edit(),
                                     &mutex_, directories_.GetDbDir());
     // Use latest MutableCFOptions
-    InstallSuperVersionBackground(c->column_family_data(), job_context,
-                                  *c->mutable_cf_options());
+    InstallSuperVersionAndScheduleWorkWrapper(
+        c->column_family_data(), job_context, *c->mutable_cf_options());
 
     VersionStorageInfo::LevelSummaryStorage tmp;
-    c->column_family_data()->internal_stats()->IncBytesMoved(
-        c->level() + 1, f->fd.GetFileSize());
+    c->column_family_data()->internal_stats()->IncBytesMoved(c->output_level(),
+                                                             moved_bytes);
     {
       event_logger_.LogToBuffer(log_buffer)
           << "job" << job_context->job_id << "event"
           << "trivial_move"
-          << "destination_level" << c->level() + 1 << "file_number"
-          << f->fd.GetNumber() << "file_size" << f->fd.GetFileSize();
+          << "destination_level" << c->output_level() << "files" << moved_files
+          << "total_files_size" << moved_bytes;
     }
     LogToBuffer(
         log_buffer,
-        "[%s] Moved #%" PRIu64 " to level-%d %" PRIu64 " bytes %s: %s\n",
-        c->column_family_data()->GetName().c_str(), f->fd.GetNumber(),
-        c->level() + 1, f->fd.GetFileSize(), status.ToString().c_str(),
+        "[%s] Moved #%d files to level-%d %" PRIu64 " bytes %s: %s\n",
+        c->column_family_data()->GetName().c_str(), moved_files,
+        c->output_level(), moved_bytes, status.ToString().c_str(),
         c->column_family_data()->current()->storage_info()->LevelSummary(&tmp));
-    *madeProgress = true;
+    *made_progress = true;
 
     // Clear Instrument
     ThreadStatusUtil::ResetThreadStatus();
   } else {
-    TEST_SYNC_POINT("DBImpl::BackgroundCompaction:NonTrivial");
-    auto yield_callback = [&]() {
-      return CallFlushDuringCompaction(c->column_family_data(),
-                                       *c->mutable_cf_options(), job_context,
-                                       log_buffer);
-    };
+    int output_level  __attribute__((unused)) = c->output_level();
+    TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:NonTrivial",
+                             &output_level);
     assert(is_snapshot_supported_ || snapshots_.empty());
     CompactionJob compaction_job(
         job_context->job_id, c.get(), db_options_, env_options_,
         versions_.get(), &shutting_down_, log_buffer, directories_.GetDbDir(),
-        directories_.GetDataDir(c->GetOutputPathId()), stats_,
-        snapshots_.GetAll(), table_cache_, std::move(yield_callback),
-        &event_logger_, c->mutable_cf_options()->paranoid_file_checks);
+        directories_.GetDataDir(c->output_path_id()), stats_,
+        snapshots_.GetAll(), table_cache_, &event_logger_,
+        c->mutable_cf_options()->paranoid_file_checks,
+        c->mutable_cf_options()->compaction_measure_io_stats, dbname_,
+        &compaction_job_stats);
     compaction_job.Prepare();
+
     mutex_.Unlock();
-    status = compaction_job.Run();
+    compaction_job.Run();
+    TEST_SYNC_POINT("DBImpl::BackgroundCompaction:NonTrivial:AfterRun");
     mutex_.Lock();
-    compaction_job.Install(&status, *c->mutable_cf_options(), &mutex_);
+
+    status = compaction_job.Install(*c->mutable_cf_options(), &mutex_);
     if (status.ok()) {
-      InstallSuperVersionBackground(c->column_family_data(), job_context,
-                                    *c->mutable_cf_options());
+      InstallSuperVersionAndScheduleWorkWrapper(
+          c->column_family_data(), job_context, *c->mutable_cf_options());
     }
-    *madeProgress = true;
+    *made_progress = true;
   }
-  // FIXME(orib): should I check if column family data is null?
   if (c != nullptr) {
-    NotifyOnCompactionCompleted(c->column_family_data(), c.get(), status);
+    NotifyOnCompactionCompleted(
+        c->column_family_data(), c.get(), status,
+        compaction_job_stats, job_context->job_id);
     c->ReleaseCompactionFiles(status);
-    *madeProgress = true;
+    *made_progress = true;
   }
   // this will unref its input_version and column_family_data
   c.reset();
@@ -2457,30 +2801,6 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, JobContext* job_context,
   return status;
 }
 
-uint64_t DBImpl::CallFlushDuringCompaction(
-    ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
-    JobContext* job_context, LogBuffer* log_buffer) {
-  if (db_options_.max_background_flushes > 0) {
-    // flush thread will take care of this
-    return 0;
-  }
-  if (cfd->imm()->imm_flush_needed.load(std::memory_order_relaxed)) {
-    const uint64_t imm_start = env_->NowMicros();
-    mutex_.Lock();
-    if (cfd->imm()->IsFlushPending()) {
-      cfd->Ref();
-      FlushMemTableToOutputFile(cfd, mutable_cf_options, nullptr, job_context,
-                                log_buffer);
-      cfd->Unref();
-      bg_cv_.SignalAll();  // Wakeup DelayWrite() if necessary
-    }
-    mutex_.Unlock();
-    log_buffer->FlushBufferToLog();
-    return env_->NowMicros() - imm_start;
-  }
-  return 0;
-}
-
 namespace {
 struct IterState {
   IterState(DBImpl* _db, InstrumentedMutex* _mu, SuperVersion* _super_version)
@@ -2554,26 +2874,25 @@ Status DBImpl::Get(const ReadOptions& read_options,
 // * malloc one SuperVersion() outside of the lock -- new_superversion
 // * delete SuperVersion()s outside of the lock -- superversions_to_free
 //
-// However, if InstallSuperVersion() gets called twice with the same
-// job_context, we can't reuse the SuperVersion() that got
-// malloced
-// because
+// However, if InstallSuperVersionAndScheduleWork() gets called twice with the
+// same job_context, we can't reuse the SuperVersion() that got
+// malloced because
 // first call already used it. In that rare case, we take a hit and create a
 // new SuperVersion() inside of the mutex. We do similar thing
 // for superversion_to_free
-void DBImpl::InstallSuperVersionBackground(
+void DBImpl::InstallSuperVersionAndScheduleWorkWrapper(
     ColumnFamilyData* cfd, JobContext* job_context,
     const MutableCFOptions& mutable_cf_options) {
   mutex_.AssertHeld();
-  SuperVersion* old_superversion = InstallSuperVersion(
+  SuperVersion* old_superversion = InstallSuperVersionAndScheduleWork(
       cfd, job_context->new_superversion, mutable_cf_options);
   job_context->new_superversion = nullptr;
   job_context->superversions_to_free.push_back(old_superversion);
 }
 
-SuperVersion* DBImpl::InstallSuperVersion(
+SuperVersion* DBImpl::InstallSuperVersionAndScheduleWork(
     ColumnFamilyData* cfd, SuperVersion* new_sv,
-    const MutableCFOptions& mutable_cf_options, bool dont_schedule_bg_work) {
+    const MutableCFOptions& mutable_cf_options) {
   mutex_.AssertHeld();
 
   // Update max_total_in_memory_state_
@@ -2588,14 +2907,10 @@ SuperVersion* DBImpl::InstallSuperVersion(
       new_sv ? new_sv : new SuperVersion(), &mutex_, mutable_cf_options);
 
   // Whenever we install new SuperVersion, we might need to issue new flushes or
-  // compactions. dont_schedule_bg_work is true when scheduling from write
-  // thread and we don't want to add additional overhead. Callers promise to
-  // call SchedulePendingFlush() and MaybeScheduleFlushOrCompaction() eventually
-  if (!dont_schedule_bg_work) {
-    SchedulePendingFlush(cfd);
-    SchedulePendingCompaction(cfd);
-    MaybeScheduleFlushOrCompaction();
-  }
+  // compactions.
+  SchedulePendingFlush(cfd);
+  SchedulePendingCompaction(cfd);
+  MaybeScheduleFlushOrCompaction();
 
   // Update max_total_in_memory_state_
   max_total_in_memory_state_ =
@@ -2621,10 +2936,8 @@ Status DBImpl::GetImpl(const ReadOptions& read_options,
   } else {
     snapshot = versions_->LastSequence();
   }
-
   // Acquire SuperVersion
   SuperVersion* sv = GetAndRefSuperVersion(cfd);
-
   // Prepare to store a list of merge operations if merge occurs.
   MergeContext merge_context;
 
@@ -2770,11 +3083,232 @@ std::vector<Status> DBImpl::MultiGet(
   return stat_list;
 }
 
-Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& cf_options,
-                                  const std::string& column_family_name,
+#ifndef ROCKSDB_LITE
+Status DBImpl::AddFile(ColumnFamilyHandle* column_family,
+                       const std::string& file_path, bool move_file) {
+  Status status;
+  auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+  ColumnFamilyData* cfd = cfh->cfd();
+
+  ExternalSstFileInfo file_info;
+  file_info.file_path = file_path;
+  status = env_->GetFileSize(file_path, &file_info.file_size);
+  if (!status.ok()) {
+    return status;
+  }
+
+  // Access the file using TableReader to extract
+  // version, number of entries, smallest user key, largest user key
+  std::unique_ptr<RandomAccessFile> sst_file;
+  status = env_->NewRandomAccessFile(file_path, &sst_file, env_options_);
+  if (!status.ok()) {
+    return status;
+  }
+  std::unique_ptr<RandomAccessFileReader> sst_file_reader;
+  sst_file_reader.reset(new RandomAccessFileReader(std::move(sst_file)));
+
+  std::unique_ptr<TableReader> table_reader;
+  status = cfd->ioptions()->table_factory->NewTableReader(
+      TableReaderOptions(*cfd->ioptions(), env_options_,
+                         cfd->internal_comparator()),
+      std::move(sst_file_reader), file_info.file_size, &table_reader);
+  if (!status.ok()) {
+    return status;
+  }
+
+  // Get the external sst file version from table properties
+  const UserCollectedProperties& user_collected_properties =
+      table_reader->GetTableProperties()->user_collected_properties;
+  UserCollectedProperties::const_iterator external_sst_file_version_iter =
+      user_collected_properties.find(ExternalSstFilePropertyNames::kVersion);
+  if (external_sst_file_version_iter == user_collected_properties.end()) {
+    return Status::InvalidArgument("Generated table version not found");
+  }
+
+  file_info.version =
+      DecodeFixed32(external_sst_file_version_iter->second.c_str());
+  if (file_info.version == 1) {
+    // version 1 implies that all sequence numbers in the table equal 0
+    file_info.sequence_number = 0;
+  } else {
+    return Status::InvalidArgument("Generated table version is not supported");
+  }
+
+  // Get number of entries in table
+  file_info.num_entries = table_reader->GetTableProperties()->num_entries;
+
+  ParsedInternalKey key;
+  std::unique_ptr<Iterator> iter(table_reader->NewIterator(ReadOptions()));
+
+  // Get first (smallest) key from file
+  iter->SeekToFirst();
+  if (!ParseInternalKey(iter->key(), &key)) {
+    return Status::Corruption("Generated table have corrupted keys");
+  }
+  if (key.sequence != 0) {
+    return Status::Corruption("Generated table have non zero sequence number");
+  }
+  file_info.smallest_key = key.user_key.ToString();
+
+  // Get last (largest) key from file
+  iter->SeekToLast();
+  if (!ParseInternalKey(iter->key(), &key)) {
+    return Status::Corruption("Generated table have corrupted keys");
+  }
+  if (key.sequence != 0) {
+    return Status::Corruption("Generated table have non zero sequence number");
+  }
+  file_info.largest_key = key.user_key.ToString();
+
+  return AddFile(column_family, &file_info, move_file);
+}
+
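+// Illustrative usage sketch (not part of this change; the SstFileWriter
+// constructor signature varies across RocksDB releases, so treat the exact
+// calls below as an assumption). Keys must be added in ascending order, and
+// the checks in the overload below additionally require an otherwise empty,
+// freshly created DB:
+//
+//   rocksdb::SstFileWriter writer(rocksdb::EnvOptions(), options,
+//                                 options.comparator);
+//   rocksdb::Status s = writer.Open("/tmp/example.sst");
+//   if (s.ok()) s = writer.Add("key1", "value1");
+//   if (s.ok()) s = writer.Add("key2", "value2");
+//   rocksdb::ExternalSstFileInfo info;
+//   if (s.ok()) s = writer.Finish(&info);
+//   if (s.ok()) s = db->AddFile(db->DefaultColumnFamily(), &info,
+//                               false /* move_file */);
+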
+Status DBImpl::AddFile(ColumnFamilyHandle* column_family,
+                       const ExternalSstFileInfo* file_info, bool move_file) {
+  Status status;
+  auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+  ColumnFamilyData* cfd = cfh->cfd();
+
+  if (cfd->NumberLevels() <= 1) {
+    return Status::NotSupported(
+        "AddFile requires a database with at least 2 levels");
+  }
+  if (file_info->version != 1) {
+    return Status::InvalidArgument("Generated table version is not supported");
+  }
+  // version 1 implies that the file contains only Put operations with
+  // sequence number = 0
+
+  FileMetaData meta;
+  meta.smallest =
+      InternalKey(file_info->smallest_key, file_info->sequence_number,
+                  ValueType::kTypeValue);
+  meta.largest = InternalKey(file_info->largest_key, file_info->sequence_number,
+                             ValueType::kTypeValue);
+  if (!meta.smallest.Valid() || !meta.largest.Valid()) {
+    return Status::Corruption("Generated table have corrupted keys");
+  }
+  meta.smallest_seqno = file_info->sequence_number;
+  meta.largest_seqno = file_info->sequence_number;
+  if (meta.smallest_seqno != 0 || meta.largest_seqno != 0) {
+    return Status::InvalidArgument(
+        "Non zero sequence numbers are not supported");
+  }
+  // Generate a location for the new table
+  meta.fd = FileDescriptor(versions_->NewFileNumber(), 0, file_info->file_size);
+  std::string db_fname = TableFileName(
+      db_options_.db_paths, meta.fd.GetNumber(), meta.fd.GetPathId());
+
+  if (move_file) {
+    status = env_->LinkFile(file_info->file_path, db_fname);
+    if (status.IsNotSupported()) {
+      // Original file is on a different FS, use copy instead of hard linking
+      status = CopyFile(env_, file_info->file_path, db_fname, 0);
+    }
+  } else {
+    status = CopyFile(env_, file_info->file_path, db_fname, 0);
+  }
+  if (!status.ok()) {
+    return status;
+  }
+
+  {
+    InstrumentedMutexLock l(&mutex_);
+    const MutableCFOptions mutable_cf_options =
+        *cfd->GetLatestMutableCFOptions();
+
+    WriteThread::Writer w;
+    write_thread_.EnterUnbatched(&w, &mutex_);
+
+    // Make sure memtables are empty
+    if (!cfd->mem()->IsEmpty() || cfd->imm()->NumNotFlushed() > 0) {
+      // Cannot add the file since the keys in the memtable
+      // would hide the keys in the file
+      status = Status::NotSupported("Memtable is not empty");
+    }
+
+    // Make sure the last sequence number is 0; if there are existing files,
+    // they should all have sequence number = 0
+    if (status.ok() && versions_->LastSequence() > 0) {
+      status = Status::NotSupported("Last Sequence number is not zero");
+    }
+
+    auto* vstorage = cfd->current()->storage_info();
+    if (status.ok()) {
+      // Make sure that the key range in the file we will add does not overlap
+      // with previously added files
+      Slice smallest_user_key = meta.smallest.user_key();
+      Slice largest_user_key = meta.largest.user_key();
+      for (int level = 0; level < vstorage->num_non_empty_levels(); level++) {
+        if (vstorage->OverlapInLevel(level, &smallest_user_key,
+                                     &largest_user_key)) {
+          status = Status::NotSupported("Cannot add overlapping files");
+          break;
+        }
+      }
+    }
+
+    if (status.ok()) {
+      // We add the file to the last level
+      int target_level = cfd->NumberLevels() - 1;
+      if (cfd->ioptions()->level_compaction_dynamic_level_bytes == false) {
+        // If we are not using dynamic level compaction, add the file to
+        // the last level that already contains files
+        target_level = vstorage->num_non_empty_levels() - 1;
+        if (target_level <= 0) {
+          target_level = 1;
+        }
+      }
+      VersionEdit edit;
+      edit.SetColumnFamily(cfd->GetID());
+      edit.AddFile(target_level, meta.fd.GetNumber(), meta.fd.GetPathId(),
+                   meta.fd.GetFileSize(), meta.smallest, meta.largest,
+                   meta.smallest_seqno, meta.largest_seqno,
+                   meta.marked_for_compaction);
+
+      status = versions_->LogAndApply(cfd, mutable_cf_options, &edit, &mutex_,
+                                      directories_.GetDbDir());
+    }
+    write_thread_.ExitUnbatched(&w);
+
+    if (status.ok()) {
+      delete InstallSuperVersionAndScheduleWork(cfd, nullptr,
+                                                mutable_cf_options);
+    }
+  }
+
+  if (!status.ok()) {
+    // We failed to add the file to the database
+    Status s = env_->DeleteFile(db_fname);
+    if (!s.ok()) {
+      Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log,
+          "AddFile() clean up for file %s failed : %s", db_fname.c_str(),
+          s.ToString().c_str());
+    }
+  } else if (status.ok() && move_file) {
+    // The file was moved and added successfully, remove original file link
+    Status s = env_->DeleteFile(file_info->file_path);
+    if (!s.ok()) {
+      Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log,
+          "%s was added to DB successfully but failed to remove original file "
+          "link : %s",
+          file_info->file_path.c_str(), s.ToString().c_str());
+    }
+  }
+  return status;
+}
+#endif  // ROCKSDB_LITE
+
+Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& cf_options,
+                                  const std::string& column_family_name,
                                   ColumnFamilyHandle** handle) {
   Status s;
   *handle = nullptr;
+
+  s = CheckCompressionSupported(cf_options);
+  if (!s.ok()) {
+    return s;
+  }
+
   {
     InstrumentedMutexLock l(&mutex_);
 
@@ -2793,22 +3327,21 @@ Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& cf_options,
     // ColumnFamilyData object
     Options opt(db_options_, cf_options);
     {  // write thread
-      WriteThread::Writer w(&mutex_);
-      s = write_thread_.EnterWriteThread(&w, 0);
-      assert(s.ok() && !w.done);  // No timeout and nobody should do our job
+      WriteThread::Writer w;
+      write_thread_.EnterUnbatched(&w, &mutex_);
       // LogAndApply will both write the creation in MANIFEST and create
       // ColumnFamilyData object
       s = versions_->LogAndApply(
           nullptr, MutableCFOptions(opt, ImmutableCFOptions(opt)), &edit,
           &mutex_, directories_.GetDbDir(), false, &cf_options);
-      write_thread_.ExitWriteThread(&w, &w, s);
+      write_thread_.ExitUnbatched(&w);
     }
     if (s.ok()) {
       single_column_family_mode_ = false;
       auto* cfd =
           versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name);
       assert(cfd != nullptr);
-      delete InstallSuperVersion(
+      delete InstallSuperVersionAndScheduleWork(
           cfd, nullptr, *cfd->GetLatestMutableCFOptions());
 
       if (!cfd->mem()->IsSnapshotSupported()) {
@@ -2855,12 +3388,11 @@ Status DBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) {
     }
     if (s.ok()) {
       // we drop column family from a single write thread
-      WriteThread::Writer w(&mutex_);
-      s = write_thread_.EnterWriteThread(&w, 0);
-      assert(s.ok() && !w.done);  // No timeout and nobody should do our job
+      WriteThread::Writer w;
+      write_thread_.EnterUnbatched(&w, &mutex_);
       s = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
                                  &edit, &mutex_);
-      write_thread_.ExitWriteThread(&w, &w, s);
+      write_thread_.ExitUnbatched(&w);
     }
 
     if (!cf_support_snapshot) {
@@ -2995,8 +3527,8 @@ Iterator* DBImpl::NewIterator(const ReadOptions& read_options,
     // |       |                       |
     // +-------+-----------------------+
     //
-    // ArenaWrappedDBIter inlines an arena area where all the iterartor in the
-    // the iterator tree is allocated in the order of being accessed when
+    // ArenaWrappedDBIter inlines an arena area where all the iterators in
+    // the iterator tree are allocated in the order of being accessed when
     // querying.
     // Laying out the iterators in the order of being accessed makes it more
     // likely that any iterator pointer is close to the iterator it points to so
@@ -3086,16 +3618,24 @@ Status DBImpl::NewIterators(
 const Snapshot* DBImpl::GetSnapshot() {
   int64_t unix_time = 0;
   env_->GetCurrentTime(&unix_time);  // Ignore error
+  SnapshotImpl* s = new SnapshotImpl;
 
   InstrumentedMutexLock l(&mutex_);
   // returns null if the underlying memtable does not support snapshot.
-  if (!is_snapshot_supported_) return nullptr;
-  return snapshots_.New(versions_->LastSequence(), unix_time);
+  if (!is_snapshot_supported_) {
+    delete s;
+    return nullptr;
+  }
+  return snapshots_.New(s, versions_->LastSequence(), unix_time);
 }
 
 void DBImpl::ReleaseSnapshot(const Snapshot* s) {
-  InstrumentedMutexLock l(&mutex_);
-  snapshots_.Delete(reinterpret_cast<const SnapshotImpl*>(s));
+  const SnapshotImpl* casted_s = reinterpret_cast<const SnapshotImpl*>(s);
+  {
+    InstrumentedMutexLock l(&mutex_);
+    snapshots_.Delete(casted_s);
+  }
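+  // The snapshot is unlinked from the list under the mutex above but freed
+  // outside it, keeping the critical section short.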
+  delete casted_s;
 }
 
 // Convenience methods
@@ -3119,32 +3659,69 @@ Status DBImpl::Delete(const WriteOptions& write_options,
   return DB::Delete(write_options, column_family, key);
 }
 
+Status DBImpl::SingleDelete(const WriteOptions& write_options,
+                            ColumnFamilyHandle* column_family,
+                            const Slice& key) {
+  return DB::SingleDelete(write_options, column_family, key);
+}
+
 Status DBImpl::Write(const WriteOptions& write_options, WriteBatch* my_batch) {
+  return WriteImpl(write_options, my_batch, nullptr);
+}
+
+#ifndef ROCKSDB_LITE
+Status DBImpl::WriteWithCallback(const WriteOptions& write_options,
+                                 WriteBatch* my_batch,
+                                 WriteCallback* callback) {
+  return WriteImpl(write_options, my_batch, callback);
+}
+#endif  // ROCKSDB_LITE
+
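+// Shared implementation behind Write() and WriteWithCallback(). When callback
+// is non-null it runs on the write thread once this writer has become the
+// batch group leader, and a non-OK callback result aborts the write before
+// anything reaches the WAL or the memtables.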
+Status DBImpl::WriteImpl(const WriteOptions& write_options,
+                         WriteBatch* my_batch, WriteCallback* callback) {
   if (my_batch == nullptr) {
     return Status::Corruption("Batch is nullptr!");
   }
+  if (write_options.timeout_hint_us != 0) {
+    return Status::InvalidArgument("timeout_hint_us is deprecated");
+  }
+
+  Status status;
+  bool callback_failed = false;
+
+  bool xfunc_attempted_write = false;
+  XFUNC_TEST("transaction", "transaction_xftest_write_impl",
+             xf_transaction_write1, xf_transaction_write, write_options,
+             db_options_, my_batch, callback, this, &status,
+             &xfunc_attempted_write);
+  if (xfunc_attempted_write) {
+    // Test already did the write
+    return status;
+  }
+
   PERF_TIMER_GUARD(write_pre_and_post_process_time);
-  WriteThread::Writer w(&mutex_);
+  WriteThread::Writer w;
   w.batch = my_batch;
   w.sync = write_options.sync;
   w.disableWAL = write_options.disableWAL;
   w.in_batch_group = false;
   w.done = false;
-  w.timeout_hint_us = write_options.timeout_hint_us;
-
-  uint64_t expiration_time = 0;
-  bool has_timeout = false;
-  if (w.timeout_hint_us == 0) {
-    w.timeout_hint_us = WriteThread::kNoTimeOut;
-  } else {
-    expiration_time = env_->NowMicros() + w.timeout_hint_us;
-    has_timeout = true;
-  }
+  w.has_callback = (callback != nullptr);
 
   if (!write_options.disableWAL) {
     RecordTick(stats_, WRITE_WITH_WAL);
   }
 
+  StopWatch write_sw(env_, db_options_.statistics.get(), DB_WRITE);
+
+  write_thread_.JoinBatchGroup(&w);
+  if (w.done) {
+    // write was done by someone else, no need to grab mutex
+    RecordTick(stats_, WRITE_DONE_BY_OTHER);
+    return w.status;
+  }
+  // else we are the leader of the write batch group
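+  //
+  // JoinBatchGroup() parked this writer in the queue: either a concurrent
+  // leader committed our batch for us (the w.done case above), or we woke up
+  // as the leader responsible for building and writing the whole group.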
+
   WriteContext context;
   mutex_.Lock();
 
@@ -3152,21 +3729,6 @@ Status DBImpl::Write(const WriteOptions& write_options, WriteBatch* my_batch) {
     default_cf_internal_stats_->AddDBStats(InternalStats::WRITE_WITH_WAL, 1);
   }
 
-  Status status = write_thread_.EnterWriteThread(&w, expiration_time);
-  assert(status.ok() || status.IsTimedOut());
-  if (status.IsTimedOut()) {
-    mutex_.Unlock();
-    RecordTick(stats_, WRITE_TIMEDOUT);
-    return Status::TimedOut();
-  }
-  if (w.done) {  // write was done by someone else
-    default_cf_internal_stats_->AddDBStats(InternalStats::WRITE_DONE_BY_OTHER,
-                                           1);
-    mutex_.Unlock();
-    RecordTick(stats_, WRITE_DONE_BY_OTHER);
-    return w.status;
-  }
-
   RecordTick(stats_, WRITE_DONE_BY_SELF);
   default_cf_internal_stats_->AddDBStats(InternalStats::WRITE_DONE_BY_SELF, 1);
 
@@ -3196,15 +3758,15 @@ Status DBImpl::Write(const WriteOptions& write_options, WriteBatch* my_batch) {
         continue;
       }
       if (cfd->GetLogNumber() <= flush_column_family_if_log_file) {
-        status = SetNewMemtableAndNewLogFile(cfd, &context);
+        status = SwitchMemtable(cfd, &context);
         if (!status.ok()) {
           break;
         }
         cfd->imm()->FlushRequested();
         SchedulePendingFlush(cfd);
-        context.schedule_bg_work_ = true;
       }
     }
+    MaybeScheduleFlushOrCompaction();
   } else if (UNLIKELY(write_buffer_.ShouldFlush())) {
     Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
         "Flushing all column families. Write buffer is using %" PRIu64
@@ -3217,13 +3779,12 @@ Status DBImpl::Write(const WriteOptions& write_options, WriteBatch* my_batch) {
         continue;
       }
       if (!cfd->mem()->IsEmpty()) {
-        status = SetNewMemtableAndNewLogFile(cfd, &context);
+        status = SwitchMemtable(cfd, &context);
         if (!status.ok()) {
           break;
         }
         cfd->imm()->FlushRequested();
         SchedulePendingFlush(cfd);
-        context.schedule_bg_work_ = true;
       }
     }
     MaybeScheduleFlushOrCompaction();
@@ -3237,33 +3798,58 @@ Status DBImpl::Write(const WriteOptions& write_options, WriteBatch* my_batch) {
     status = ScheduleFlushes(&context);
   }
 
-  if (UNLIKELY(status.ok() && (write_controller_.IsStopped() ||
-                               write_controller_.GetDelay() > 0))) {
-    // If writer is stopped, we need to get it going,
-    // so schedule flushes/compactions
-    if (context.schedule_bg_work_) {
-      MaybeScheduleFlushOrCompaction();
-    }
-    status = DelayWrite(expiration_time);
-  }
-
-  if (UNLIKELY(status.ok() && has_timeout &&
-               env_->NowMicros() > expiration_time)) {
-    status = Status::TimedOut();
+  if (UNLIKELY(status.ok()) &&
+      (write_controller_.IsStopped() || write_controller_.NeedsDelay())) {
+    PERF_TIMER_STOP(write_pre_and_post_process_time);
+    PERF_TIMER_GUARD(write_delay_time);
+    // We don't know the size of the current batch, so we always use the size
+    // of the previous one. This might create a fairness issue: a small write
+    // can be delayed as if it were as large as the previous batch while
+    // larger writes go through. This can be optimized if it becomes an issue.
+    status = DelayWrite(last_batch_group_size_);
+    PERF_TIMER_START(write_pre_and_post_process_time);
   }
 
   uint64_t last_sequence = versions_->LastSequence();
   WriteThread::Writer* last_writer = &w;
+  autovector<WriteBatch*> write_batch_group;
+  bool need_log_sync = !write_options.disableWAL && write_options.sync;
+  bool need_log_dir_sync = need_log_sync && !log_dir_synced_;
+
   if (status.ok()) {
-    autovector<WriteBatch*> write_batch_group;
-    write_thread_.BuildBatchGroup(&last_writer, &write_batch_group);
+    last_batch_group_size_ = write_thread_.EnterAsBatchGroupLeader(
+        &w, &last_writer, &write_batch_group);
+
+    if (need_log_sync) {
+      while (logs_.front().getting_synced) {
+        log_sync_cv_.Wait();
+      }
+      for (auto& log : logs_) {
+        assert(!log.getting_synced);
+        log.getting_synced = true;
+      }
+    }
 
     // Add to log and apply to memtable.  We can release the lock
     // during this phase since &w is currently responsible for logging
     // and protects against concurrent loggers and concurrent writes
     // into memtables
-    {
-      mutex_.Unlock();
+
+    mutex_.Unlock();
+
+    if (callback != nullptr) {
+      // If this write has a validation callback, check to see if this write
+      // is able to be written.  Must be called on the write thread.
+      status = callback->Callback(this);
+      callback_failed = true;
+    }
+  } else {
+    mutex_.Unlock();
+  }
+
+  // At this point the mutex is unlocked
+
+  if (status.ok()) {
       WriteBatch* updates = nullptr;
       if (write_batch_group.size() == 1) {
         updates = write_batch_group[0];
@@ -3291,27 +3877,34 @@ Status DBImpl::Write(const WriteOptions& write_options, WriteBatch* my_batch) {
       if (!write_options.disableWAL) {
         PERF_TIMER_GUARD(write_wal_time);
         Slice log_entry = WriteBatchInternal::Contents(updates);
-        status = log_->AddRecord(log_entry);
+        status = logs_.back().writer->AddRecord(log_entry);
         total_log_size_ += log_entry.size();
         alive_log_files_.back().AddSize(log_entry.size());
         log_empty_ = false;
         log_size = log_entry.size();
         RecordTick(stats_, WAL_FILE_BYTES, log_size);
-        if (status.ok() && write_options.sync) {
+        if (status.ok() && need_log_sync) {
           RecordTick(stats_, WAL_FILE_SYNCED);
           StopWatch sw(env_, stats_, WAL_FILE_SYNC_MICROS);
-          if (db_options_.use_fsync) {
-            status = log_->file()->Fsync();
-          } else {
-            status = log_->file()->Sync();
+          // It's safe to access logs_ with unlocked mutex_ here because:
+          //  - we've set getting_synced=true for all logs,
+          //    so other threads won't pop from logs_ while we're here,
+          //  - only writer thread can push to logs_, and we're in
+          //    writer thread, so no one will push to logs_,
+          //  - as long as other threads don't modify it, it's safe to read
+          //    from std::deque from multiple threads concurrently.
+          for (auto& log : logs_) {
+            status = log.writer->file()->Sync(db_options_.use_fsync);
+            if (!status.ok()) {
+              break;
+            }
           }
-          if (status.ok() && !log_dir_synced_) {
+          if (status.ok() && need_log_dir_sync) {
             // We only sync WAL directory the first time WAL syncing is
             // requested, so that in case users never turn on WAL sync,
             // we can avoid the disk I/O in the write code path.
             status = directories_.GetWalDir()->Fsync();
           }
-          log_dir_synced_ = true;
         }
       }
       if (status.ok()) {
@@ -3335,72 +3928,77 @@ Status DBImpl::Write(const WriteOptions& write_options, WriteBatch* my_batch) {
         tmp_batch_.Clear();
       }
       mutex_.Lock();
+
       // internal stats
       default_cf_internal_stats_->AddDBStats(
           InternalStats::BYTES_WRITTEN, batch_size);
       default_cf_internal_stats_->AddDBStats(InternalStats::NUMBER_KEYS_WRITTEN,
                                              my_batch_count);
       if (!write_options.disableWAL) {
-        default_cf_internal_stats_->AddDBStats(
-            InternalStats::WAL_FILE_SYNCED, 1);
+        if (write_options.sync) {
+          default_cf_internal_stats_->AddDBStats(InternalStats::WAL_FILE_SYNCED,
+                                                 1);
+        }
         default_cf_internal_stats_->AddDBStats(
             InternalStats::WAL_FILE_BYTES, log_size);
       }
       if (status.ok()) {
         versions_->SetLastSequence(last_sequence);
       }
-    }
+  } else {
+    // Operation failed.  Make sure the mutex is held for the cleanup code below.
+    mutex_.Lock();
   }
-  if (db_options_.paranoid_checks && !status.ok() &&
-      !status.IsTimedOut() && bg_error_.ok()) {
+
+  if (db_options_.paranoid_checks && !status.ok() && !callback_failed &&
+      !status.IsBusy() && bg_error_.ok()) {
     bg_error_ = status; // stop compaction & fail any further writes
   }
 
-  write_thread_.ExitWriteThread(&w, last_writer, status);
+  mutex_.AssertHeld();
 
-  if (context.schedule_bg_work_) {
-    MaybeScheduleFlushOrCompaction();
+  if (need_log_sync) {
+    MarkLogsSynced(logfile_number_, need_log_dir_sync, status);
   }
-  mutex_.Unlock();
 
-  if (status.IsTimedOut()) {
-    RecordTick(stats_, WRITE_TIMEDOUT);
+  uint64_t writes_for_other = write_batch_group.size() - 1;
+  if (writes_for_other > 0) {
+    default_cf_internal_stats_->AddDBStats(InternalStats::WRITE_DONE_BY_OTHER,
+                                           writes_for_other);
+    if (!write_options.disableWAL) {
+      default_cf_internal_stats_->AddDBStats(InternalStats::WRITE_WITH_WAL,
+                                             writes_for_other);
+    }
   }
 
+  mutex_.Unlock();
+
+  write_thread_.ExitAsBatchGroupLeader(&w, last_writer, status);
+
   return status;
 }
 
 // REQUIRES: mutex_ is held
 // REQUIRES: this thread is currently at the front of the writer queue
-Status DBImpl::DelayWrite(uint64_t expiration_time) {
+Status DBImpl::DelayWrite(uint64_t num_bytes) {
   uint64_t time_delayed = 0;
   bool delayed = false;
-  bool timed_out = false;
   {
     StopWatch sw(env_, stats_, WRITE_STALL, &time_delayed);
-    bool has_timeout = (expiration_time > 0);
-    auto delay = write_controller_.GetDelay();
-    if (write_controller_.IsStopped() == false && delay > 0) {
+    auto delay = write_controller_.GetDelay(env_, num_bytes);
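+    // GetDelay() turns num_bytes and the controller's configured delayed
+    // write rate into a sleep duration (an assumption about WriteController's
+    // rate-based policy; see db/write_controller.cc for the exact formula).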
+    if (delay > 0) {
       mutex_.Unlock();
       delayed = true;
-      // hopefully we don't have to sleep more than 2 billion microseconds
       TEST_SYNC_POINT("DBImpl::DelayWrite:Sleep");
+      // hopefully we don't have to sleep more than 2 billion microseconds
       env_->SleepForMicroseconds(static_cast<int>(delay));
       mutex_.Lock();
     }
 
     while (bg_error_.ok() && write_controller_.IsStopped()) {
       delayed = true;
-      if (has_timeout) {
-        TEST_SYNC_POINT("DBImpl::DelayWrite:TimedWait");
-        bg_cv_.TimedWait(expiration_time);
-        if (env_->NowMicros() > expiration_time) {
-          timed_out = true;
-          break;
-        }
-      } else {
-        bg_cv_.Wait();
-      }
+      TEST_SYNC_POINT("DBImpl::DelayWrite:Wait");
+      bg_cv_.Wait();
     }
   }
   if (delayed) {
@@ -3409,19 +4007,13 @@ Status DBImpl::DelayWrite(uint64_t expiration_time) {
     RecordTick(stats_, STALL_MICROS, time_delayed);
   }
 
-  if (timed_out) {
-    return Status::TimedOut();
-  }
-
   return bg_error_;
 }
 
 Status DBImpl::ScheduleFlushes(WriteContext* context) {
   ColumnFamilyData* cfd;
   while ((cfd = flush_scheduler_.GetNextColumnFamily()) != nullptr) {
-    auto status = SetNewMemtableAndNewLogFile(cfd, context);
-    SchedulePendingFlush(cfd);
-    context->schedule_bg_work_ = true;
+    auto status = SwitchMemtable(cfd, context);
     if (cfd->Unref()) {
       delete cfd;
     }
@@ -3434,8 +4026,7 @@ Status DBImpl::ScheduleFlushes(WriteContext* context) {
 
 // REQUIRES: mutex_ is held
 // REQUIRES: this thread is currently at the front of the writer queue
-Status DBImpl::SetNewMemtableAndNewLogFile(ColumnFamilyData* cfd,
-                                           WriteContext* context) {
+Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
   mutex_.AssertHeld();
   unique_ptr<WritableFile> lfile;
   log::Writer* new_log = nullptr;
@@ -3453,21 +4044,25 @@ Status DBImpl::SetNewMemtableAndNewLogFile(ColumnFamilyData* cfd,
   Status s;
   {
     if (creating_new_log) {
+      EnvOptions opt_env_opt =
+          env_->OptimizeForLogWrite(env_options_, db_options_);
       s = env_->NewWritableFile(
           LogFileName(db_options_.wal_dir, new_log_number), &lfile,
-          env_->OptimizeForLogWrite(env_options_, db_options_));
+          opt_env_opt);
       if (s.ok()) {
         // Our final size should be less than write_buffer_size
         // (compression, etc) but err on the side of caution.
         lfile->SetPreallocationBlockSize(
             1.1 * mutable_cf_options.write_buffer_size);
-        new_log = new log::Writer(std::move(lfile));
-        log_dir_synced_ = false;
+        unique_ptr<WritableFileWriter> file_writer(
+            new WritableFileWriter(std::move(lfile), opt_env_opt));
+        new_log = new log::Writer(std::move(file_writer));
       }
     }
 
     if (s.ok()) {
-      new_mem = cfd->ConstructNewMemtable(mutable_cf_options);
+      SequenceNumber seq = versions_->LastSequence();
+      new_mem = cfd->ConstructNewMemtable(mutable_cf_options, seq);
       new_superversion = new SuperVersion();
     }
   }
@@ -3485,9 +4080,9 @@ Status DBImpl::SetNewMemtableAndNewLogFile(ColumnFamilyData* cfd,
   if (creating_new_log) {
     logfile_number_ = new_log_number;
     assert(new_log != nullptr);
-    logs_to_free_.push_back(log_.release());
-    log_.reset(new_log);
     log_empty_ = true;
+    log_dir_synced_ = false;
+    logs_.emplace_back(logfile_number_, new_log);
     alive_log_files_.push_back(LogFileNumberSize(logfile_number_));
     for (auto loop_cfd : *versions_->GetColumnFamilySet()) {
       // all this is just optimization to delete logs that
@@ -3495,17 +4090,17 @@ Status DBImpl::SetNewMemtableAndNewLogFile(ColumnFamilyData* cfd,
       // doesn't need that particular log to stay alive, so we just
       // advance the log number. no need to persist this in the manifest
       if (loop_cfd->mem()->GetFirstSequenceNumber() == 0 &&
-          loop_cfd->imm()->size() == 0) {
+          loop_cfd->imm()->NumNotFlushed() == 0) {
         loop_cfd->SetLogNumber(logfile_number_);
       }
     }
   }
   cfd->mem()->SetNextLogNumber(logfile_number_);
-  cfd->imm()->Add(cfd->mem());
+  cfd->imm()->Add(cfd->mem(), &context->memtables_to_free_);
   new_mem->Ref();
   cfd->SetMemtable(new_mem);
-  context->superversions_to_free_.push_back(
-      InstallSuperVersion(cfd, new_superversion, mutable_cf_options, true));
+  context->superversions_to_free_.push_back(InstallSuperVersionAndScheduleWork(
+      cfd, new_superversion, mutable_cf_options));
   return s;
 }
 
@@ -3611,6 +4206,34 @@ SuperVersion* DBImpl::GetAndRefSuperVersion(ColumnFamilyData* cfd) {
   return cfd->GetThreadLocalSuperVersion(&mutex_);
 }
 
+// REQUIRED: this function should only be called on the write thread or if the
+// mutex is held.
+SuperVersion* DBImpl::GetAndRefSuperVersion(uint32_t column_family_id) {
+  auto column_family_set = versions_->GetColumnFamilySet();
+  auto cfd = column_family_set->GetColumnFamily(column_family_id);
+  if (!cfd) {
+    return nullptr;
+  }
+
+  return GetAndRefSuperVersion(cfd);
+}
+
+// REQUIRED:  mutex is NOT held
+SuperVersion* DBImpl::GetAndRefSuperVersionUnlocked(uint32_t column_family_id) {
+  ColumnFamilyData* cfd;
+  {
+    InstrumentedMutexLock l(&mutex_);
+    auto column_family_set = versions_->GetColumnFamilySet();
+    cfd = column_family_set->GetColumnFamily(column_family_id);
+  }
+
+  if (!cfd) {
+    return nullptr;
+  }
+
+  return GetAndRefSuperVersion(cfd);
+}
+
 void DBImpl::ReturnAndCleanupSuperVersion(ColumnFamilyData* cfd,
                                           SuperVersion* sv) {
   bool unref_sv = !cfd->ReturnThreadLocalSuperVersion(sv);
@@ -3629,28 +4252,81 @@ void DBImpl::ReturnAndCleanupSuperVersion(ColumnFamilyData* cfd,
   }
 }
 
+// REQUIRED: this function should only be called on the write thread.
+void DBImpl::ReturnAndCleanupSuperVersion(uint32_t column_family_id,
+                                          SuperVersion* sv) {
+  auto column_family_set = versions_->GetColumnFamilySet();
+  auto cfd = column_family_set->GetColumnFamily(column_family_id);
+
+  // If SuperVersion is held, and we successfully fetched a cfd using
+  // GetAndRefSuperVersion(), it must still exist.
+  assert(cfd != nullptr);
+  ReturnAndCleanupSuperVersion(cfd, sv);
+}
+
+// REQUIRED: Mutex should NOT be held.
+void DBImpl::ReturnAndCleanupSuperVersionUnlocked(uint32_t column_family_id,
+                                                  SuperVersion* sv) {
+  ColumnFamilyData* cfd;
+  {
+    InstrumentedMutexLock l(&mutex_);
+    auto column_family_set = versions_->GetColumnFamilySet();
+    cfd = column_family_set->GetColumnFamily(column_family_id);
+  }
+
+  // If SuperVersion is held, and we successfully fetched a cfd using
+  // GetAndRefSuperVersion(), it must still exist.
+  assert(cfd != nullptr);
+  ReturnAndCleanupSuperVersion(cfd, sv);
+}
+
+// REQUIRED: this function should only be called on the write thread or if the
+// mutex is held.
+ColumnFamilyHandle* DBImpl::GetColumnFamilyHandle(uint32_t column_family_id) {
+  ColumnFamilyMemTables* cf_memtables = column_family_memtables_.get();
+
+  if (!cf_memtables->Seek(column_family_id)) {
+    return nullptr;
+  }
+
+  return cf_memtables->GetColumnFamilyHandle();
+}
+
+// REQUIRED: mutex is NOT held.
+ColumnFamilyHandle* DBImpl::GetColumnFamilyHandleUnlocked(
+    uint32_t column_family_id) {
+  ColumnFamilyMemTables* cf_memtables = column_family_memtables_.get();
+
+  InstrumentedMutexLock l(&mutex_);
+
+  if (!cf_memtables->Seek(column_family_id)) {
+    return nullptr;
+  }
+
+  return cf_memtables->GetColumnFamilyHandle();
+}
+
 void DBImpl::GetApproximateSizes(ColumnFamilyHandle* column_family,
-                                 const Range* range, int n, uint64_t* sizes) {
+                                 const Range* range, int n, uint64_t* sizes,
+                                 bool include_memtable) {
   Version* v;
   auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
   auto cfd = cfh->cfd();
-  {
-    InstrumentedMutexLock l(&mutex_);
-    v = cfd->current();
-    v->Ref();
-  }
+  SuperVersion* sv = GetAndRefSuperVersion(cfd);
+  v = sv->current;
 
   for (int i = 0; i < n; i++) {
     // Convert user_key into a corresponding internal key.
     InternalKey k1(range[i].start, kMaxSequenceNumber, kValueTypeForSeek);
     InternalKey k2(range[i].limit, kMaxSequenceNumber, kValueTypeForSeek);
     sizes[i] = versions_->ApproximateSize(v, k1.Encode(), k2.Encode());
+    if (include_memtable) {
+      sizes[i] += sv->mem->ApproximateSize(k1.Encode(), k2.Encode());
+      sizes[i] += sv->imm->ApproximateSize(k1.Encode(), k2.Encode());
+    }
   }
 
-  {
-    InstrumentedMutexLock l(&mutex_);
-    v->Unref();
-  }
+  ReturnAndCleanupSuperVersion(cfd, sv);
 }
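+
+// Illustrative call (not part of this change), showing the new
+// include_memtable flag; `db` is assumed to be an open rocksdb::DB*:
+//
+//   rocksdb::Range r("a", "z");
+//   uint64_t size = 0;
+//   db->GetApproximateSizes(db->DefaultColumnFamily(), &r, 1, &size,
+//                           true /* include_memtable */);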
 
 std::list<uint64_t>::iterator
@@ -3760,14 +4436,16 @@ Status DBImpl::DeleteFile(std::string name) {
     status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
                                     &edit, &mutex_, directories_.GetDbDir());
     if (status.ok()) {
-      InstallSuperVersionBackground(cfd, &job_context,
-                                    *cfd->GetLatestMutableCFOptions());
+      InstallSuperVersionAndScheduleWorkWrapper(
+          cfd, &job_context, *cfd->GetLatestMutableCFOptions());
     }
     FindObsoleteFiles(&job_context, false);
   }  // lock released here
+
   LogFlush(db_options_.info_log);
   // remove files outside the db-lock
   if (job_context.HaveSomethingToDelete()) {
+    // Call PurgeObsoleteFiles() without holding mutex.
     PurgeObsoleteFiles(job_context);
   }
   job_context.Clean();
@@ -3803,6 +4481,10 @@ Status DBImpl::CheckConsistency() {
 
     uint64_t fsize = 0;
     Status s = env_->GetFileSize(file_path, &fsize);
+    if (!s.ok() &&
+        env_->GetFileSize(Rocks2LevelTableFileName(file_path), &fsize).ok()) {
+      s = Status::OK();
+    }
     if (!s.ok()) {
       corruption_messages +=
           "Can't access " + md.name + ": " + s.ToString() + "\n";
@@ -3820,22 +4502,28 @@ Status DBImpl::CheckConsistency() {
   }
 }
 
-Status DBImpl::GetDbIdentity(std::string& identity) {
+Status DBImpl::GetDbIdentity(std::string& identity) const {
   std::string idfilename = IdentityFileName(dbname_);
-  unique_ptr<SequentialFile> idfile;
   const EnvOptions soptions;
-  Status s = env_->NewSequentialFile(idfilename, &idfile, soptions);
-  if (!s.ok()) {
-    return s;
+  unique_ptr<SequentialFileReader> id_file_reader;
+  Status s;
+  {
+    unique_ptr<SequentialFile> idfile;
+    s = env_->NewSequentialFile(idfilename, &idfile, soptions);
+    if (!s.ok()) {
+      return s;
+    }
+    id_file_reader.reset(new SequentialFileReader(std::move(idfile)));
   }
+
   uint64_t file_size;
   s = env_->GetFileSize(idfilename, &file_size);
   if (!s.ok()) {
     return s;
   }
-  char buffer[file_size];
+  char* buffer = reinterpret_cast<char*>(alloca(file_size));
   Slice id;
-  s = idfile->Read(static_cast<size_t>(file_size), &id, buffer);
+  s = id_file_reader->Read(static_cast<size_t>(file_size), &id, buffer);
   if (!s.ok()) {
     return s;
   }
@@ -3866,6 +4554,13 @@ Status DB::Delete(const WriteOptions& opt, ColumnFamilyHandle* column_family,
   return Write(opt, &batch);
 }
 
+Status DB::SingleDelete(const WriteOptions& opt,
+                        ColumnFamilyHandle* column_family, const Slice& key) {
+  WriteBatch batch;
+  batch.SingleDelete(column_family, key);
+  return Write(opt, &batch);
+}
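+
+// Per the upstream contract (hedged summary): SingleDelete is intended only
+// for keys that were Put() exactly once and not overwritten or merged since;
+// using it on other keys has undefined results. Delete() above remains the
+// general-purpose removal.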
+
 Status DB::Merge(const WriteOptions& opt, ColumnFamilyHandle* column_family,
                  const Slice& key, const Slice& value) {
   WriteBatch batch;
@@ -3910,8 +4605,12 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname,
     return s;
   }
 
-  if (db_options.db_paths.size() > 1) {
-    for (auto& cfd : column_families) {
+  for (auto& cfd : column_families) {
+    s = CheckCompressionSupported(cfd.options);
+    if (!s.ok()) {
+      return s;
+    }
+    if (db_options.db_paths.size() > 1) {
       if ((cfd.options.compaction_style != kCompactionStyleUniversal) &&
           (cfd.options.compaction_style != kCompactionStyleLevel)) {
         return Status::NotSupported(
@@ -3919,11 +4618,11 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname,
             "universal and level compaction styles. ");
       }
     }
+  }
 
-    if (db_options.db_paths.size() > 4) {
-      return Status::NotSupported(
-          "More than four DB paths are not supported yet. ");
-    }
+  if (db_options.db_paths.size() > 4) {
+    return Status::NotSupported(
+        "More than four DB paths are not supported yet. ");
   }
 
   *dbptr = nullptr;
@@ -3963,14 +4662,18 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname,
     uint64_t new_log_number = impl->versions_->NewFileNumber();
     unique_ptr<WritableFile> lfile;
     EnvOptions soptions(db_options);
+    EnvOptions opt_env_options =
+        impl->db_options_.env->OptimizeForLogWrite(soptions, impl->db_options_);
     s = impl->db_options_.env->NewWritableFile(
         LogFileName(impl->db_options_.wal_dir, new_log_number), &lfile,
-        impl->db_options_.env->OptimizeForLogWrite(soptions,
-                                                   impl->db_options_));
+        opt_env_options);
     if (s.ok()) {
       lfile->SetPreallocationBlockSize(1.1 * max_write_buffer_size);
       impl->logfile_number_ = new_log_number;
-      impl->log_.reset(new log::Writer(std::move(lfile)));
+      unique_ptr<WritableFileWriter> file_writer(
+          new WritableFileWriter(std::move(lfile), opt_env_options));
+      impl->logs_.emplace_back(new_log_number,
+                               new log::Writer(std::move(file_writer)));
 
       // set column family handles
       for (auto cf : column_families) {
@@ -4001,7 +4704,7 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname,
     }
     if (s.ok()) {
       for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
-        delete impl->InstallSuperVersion(
+        delete impl->InstallSuperVersionAndScheduleWork(
             cfd, nullptr, *cfd->GetLatestMutableCFOptions());
       }
       impl->alive_log_files_.push_back(
@@ -4039,13 +4742,18 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname,
       }
     }
   }
-
+  TEST_SYNC_POINT("DBImpl::Open:Opened");
+  if (s.ok()) {
+    impl->opened_successfully_ = true;
+    impl->MaybeScheduleFlushOrCompaction();
+  }
   impl->mutex_.Unlock();
 
   if (s.ok()) {
-    impl->opened_successfully_ = true;
     Log(InfoLogLevel::INFO_LEVEL, impl->db_options_.info_log, "DB pointer %p",
         impl);
+    LogFlush(impl->db_options_.info_log);
+
     *dbptr = impl;
   } else {
     for (auto* h : *handles) {
@@ -4086,10 +4794,13 @@ Status DestroyDB(const std::string& dbname, const Options& options) {
       if (ParseFileName(filenames[i], &number, info_log_prefix.prefix, &type) &&
           type != kDBLockFile) {  // Lock file will be deleted at end
         Status del;
+        std::string path_to_delete = dbname + "/" + filenames[i];
         if (type == kMetaDatabase) {
-          del = DestroyDB(dbname + "/" + filenames[i], options);
+          del = DestroyDB(path_to_delete, options);
+        } else if (type == kTableFile) {
+          del = DeleteOrMoveToTrash(&options, path_to_delete);
         } else {
-          del = env->DeleteFile(dbname + "/" + filenames[i]);
+          del = env->DeleteFile(path_to_delete);
         }
         if (result.ok() && !del.ok()) {
           result = del;
@@ -4097,12 +4808,19 @@ Status DestroyDB(const std::string& dbname, const Options& options) {
       }
     }
 
-    for (auto& db_path : options.db_paths) {
+    for (size_t path_id = 0; path_id < options.db_paths.size(); path_id++) {
+      const auto& db_path = options.db_paths[path_id];
       env->GetChildren(db_path.path, &filenames);
       for (size_t i = 0; i < filenames.size(); i++) {
         if (ParseFileName(filenames[i], &number, &type) &&
             type == kTableFile) {  // only table files live in db_paths
-          Status del = env->DeleteFile(db_path.path + "/" + filenames[i]);
+          Status del;
+          std::string table_path = db_path.path + "/" + filenames[i];
+          if (path_id == 0) {
+            del = DeleteOrMoveToTrash(&options, table_path);
+          } else {
+            del = env->DeleteFile(table_path);
+          }
           if (result.ok() && !del.ok()) {
             result = del;
           }
@@ -4191,13 +4909,90 @@ void DumpRocksDBBuildVersion(Logger * log) {
 #if !defined(IOS_CROSS_COMPILE)
   // if we compile with Xcode, we don't run build_detect_version, so we don't
   // generate util/build_version.cc
-  Log(InfoLogLevel::INFO_LEVEL, log,
-      "RocksDB version: %d.%d.%d\n", ROCKSDB_MAJOR, ROCKSDB_MINOR,
-      ROCKSDB_PATCH);
-  Log(InfoLogLevel::INFO_LEVEL, log, "Git sha %s", rocksdb_build_git_sha);
-  Log(InfoLogLevel::INFO_LEVEL, log, "Compile date %s",
-      rocksdb_build_compile_date);
+  Header(log, "RocksDB version: %d.%d.%d\n", ROCKSDB_MAJOR, ROCKSDB_MINOR,
+         ROCKSDB_PATCH);
+  Header(log, "Git sha %s", rocksdb_build_git_sha);
+  Header(log, "Compile date %s", rocksdb_build_compile_date);
 #endif
 }
 
+#ifndef ROCKSDB_LITE
+SequenceNumber DBImpl::GetEarliestMemTableSequenceNumber(SuperVersion* sv,
+                                                         bool include_history) {
+  // Find the earliest sequence number that we know we can rely on reading
+  // from the memtable without needing to check sst files.
+  SequenceNumber earliest_seq =
+      sv->imm->GetEarliestSequenceNumber(include_history);
+  if (earliest_seq == kMaxSequenceNumber) {
+    earliest_seq = sv->mem->GetEarliestSequenceNumber();
+  }
+  assert(sv->mem->GetEarliestSequenceNumber() >= earliest_seq);
+
+  return earliest_seq;
+}
+#endif  // ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE
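+// Probes the active memtable, then the immutable memtables, then the
+// memtable history, stopping at the first tier that yields a sequence number
+// for the key; *seq remains kMaxSequenceNumber if no record is found.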
+Status DBImpl::GetLatestSequenceForKeyFromMemtable(SuperVersion* sv,
+                                                   const Slice& key,
+                                                   SequenceNumber* seq) {
+  Status s;
+  std::string value;
+  MergeContext merge_context;
+
+  SequenceNumber current_seq = versions_->LastSequence();
+  LookupKey lkey(key, current_seq);
+
+  *seq = kMaxSequenceNumber;
+
+  // Check if there is a record for this key in the latest memtable
+  sv->mem->Get(lkey, &value, &s, &merge_context, seq);
+
+  if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
+    // unexpected error reading memtable.
+    Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log,
+        "Unexpected status returned from MemTable::Get: %s\n",
+        s.ToString().c_str());
+
+    return s;
+  }
+
+  if (*seq != kMaxSequenceNumber) {
+    // Found a sequence number, no need to check immutable memtables
+    return Status::OK();
+  }
+
+  // Check if there is a record for this key in the immutable memtables
+  sv->imm->Get(lkey, &value, &s, &merge_context, seq);
+
+  if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
+    // unexpected error reading memtable.
+    Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log,
+        "Unexpected status returned from MemTableList::Get: %s\n",
+        s.ToString().c_str());
+
+    return s;
+  }
+
+  if (*seq != kMaxSequenceNumber) {
+    // Found a sequence number, no need to check memtable history
+    return Status::OK();
+  }
+
+  // Check if there is a record for this key in the memtable history
+  sv->imm->GetFromHistory(lkey, &value, &s, &merge_context, seq);
+
+  if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
+    // unexpected error reading memtable.
+    Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log,
+        "Unexpected status returned from MemTableList::GetFromHistory: %s\n",
+        s.ToString().c_str());
+
+    return s;
+  }
+
+  return Status::OK();
+}
+#endif  // ROCKSDB_LITE
+
 }  // namespace rocksdb
diff --git a/src/rocksdb/db/db_impl.h b/src/rocksdb/db/db_impl.h
index 91a5963..d7cc9db 100644
--- a/src/rocksdb/db/db_impl.h
+++ b/src/rocksdb/db/db_impl.h
@@ -11,19 +11,24 @@
 #include <atomic>
 #include <deque>
 #include <limits>
-#include <set>
 #include <list>
+#include <set>
+#include <string>
 #include <utility>
-#include <list>
 #include <vector>
-#include <string>
 
+#include "db/column_family.h"
+#include "db/compaction_job.h"
 #include "db/dbformat.h"
+#include "db/flush_job.h"
+#include "db/flush_scheduler.h"
+#include "db/internal_stats.h"
 #include "db/log_writer.h"
-#include "db/snapshot.h"
-#include "db/column_family.h"
+#include "db/snapshot_impl.h"
 #include "db/version_edit.h"
 #include "db/wal_manager.h"
+#include "db/write_controller.h"
+#include "db/write_thread.h"
 #include "db/writebuffer.h"
 #include "memtable_list.h"
 #include "port/port.h"
@@ -34,15 +39,10 @@
 #include "util/autovector.h"
 #include "util/event_logger.h"
 #include "util/hash.h"
+#include "util/instrumented_mutex.h"
+#include "util/scoped_arena_iterator.h"
 #include "util/stop_watch.h"
 #include "util/thread_local.h"
-#include "util/scoped_arena_iterator.h"
-#include "util/hash.h"
-#include "util/instrumented_mutex.h"
-#include "db/internal_stats.h"
-#include "db/write_controller.h"
-#include "db/flush_scheduler.h"
-#include "db/write_thread.h"
 
 namespace rocksdb {
 
@@ -51,9 +51,10 @@ class TableCache;
 class Version;
 class VersionEdit;
 class VersionSet;
-class CompactionFilterV2;
 class Arena;
+class WriteCallback;
 struct JobContext;
+struct ExternalSstFileInfo;
 
 class DBImpl : public DB {
  public:
@@ -73,9 +74,14 @@ class DBImpl : public DB {
   virtual Status Delete(const WriteOptions& options,
                         ColumnFamilyHandle* column_family,
                         const Slice& key) override;
+  using DB::SingleDelete;
+  virtual Status SingleDelete(const WriteOptions& options,
+                              ColumnFamilyHandle* column_family,
+                              const Slice& key) override;
   using DB::Write;
   virtual Status Write(const WriteOptions& options,
                        WriteBatch* updates) override;
+
   using DB::Get;
   virtual Status Get(const ReadOptions& options,
                      ColumnFamilyHandle* column_family, const Slice& key,
@@ -118,13 +124,12 @@ class DBImpl : public DB {
                               const Slice& property, uint64_t* value) override;
   using DB::GetApproximateSizes;
   virtual void GetApproximateSizes(ColumnFamilyHandle* column_family,
-                                   const Range* range, int n,
-                                   uint64_t* sizes) override;
+                                   const Range* range, int n, uint64_t* sizes,
+                                   bool include_memtable = false) override;
   using DB::CompactRange;
-  virtual Status CompactRange(ColumnFamilyHandle* column_family,
-                              const Slice* begin, const Slice* end,
-                              bool reduce_level = false, int target_level = -1,
-                              uint32_t target_path_id = 0) override;
+  virtual Status CompactRange(const CompactRangeOptions& options,
+                              ColumnFamilyHandle* column_family,
+                              const Slice* begin, const Slice* end) override;
 
   using DB::CompactFiles;
   virtual Status CompactFiles(const CompactionOptions& compact_options,
@@ -133,6 +138,9 @@ class DBImpl : public DB {
                               const int output_level,
                               const int output_path_id = -1) override;
 
+  virtual Status PauseBackgroundWork() override;
+  virtual Status ContinueBackgroundWork() override;
+
   using DB::SetOptions;
   Status SetOptions(
       ColumnFamilyHandle* column_family,
@@ -155,6 +163,7 @@ class DBImpl : public DB {
   using DB::Flush;
   virtual Status Flush(const FlushOptions& options,
                        ColumnFamilyHandle* column_family) override;
+  virtual Status SyncWAL() override;
 
   virtual SequenceNumber GetLatestSequenceNumber() const override;
 
@@ -191,17 +200,53 @@ class DBImpl : public DB {
 
   Status PromoteL0(ColumnFamilyHandle* column_family, int target_level);
 
+  // Similar to Write() but will call the callback once on the single write
+  // thread to determine whether it is safe to perform the write.
+  virtual Status WriteWithCallback(const WriteOptions& write_options,
+                                   WriteBatch* my_batch,
+                                   WriteCallback* callback);
+
+  // Returns the sequence number that is guaranteed to be smaller than or equal
+  // to the sequence number of any key that could be inserted into the current
+  // memtables. It can then be assumed that any write with a larger (or equal)
+  // sequence number will be present in this memtable or a later memtable.
+  //
+  // If the earliest sequence number could not be determined,
+  // kMaxSequenceNumber will be returned.
+  //
+  // If include_history=true, will also search Memtables in MemTableList
+  // History.
+  SequenceNumber GetEarliestMemTableSequenceNumber(SuperVersion* sv,
+                                                   bool include_history);
+
+  // For a given key, check to see if there are any records for this key
+  // in the memtables, including memtable history.
+  //
+  // On success, *seq will contain the sequence number for the
+  // latest such change or kMaxSequenceNumber if no records were present.
+  // Returns OK on success, other status on error reading memtables.
+  Status GetLatestSequenceForKeyFromMemtable(SuperVersion* sv, const Slice& key,
+                                             SequenceNumber* seq);
+
+  using DB::AddFile;
+  virtual Status AddFile(ColumnFamilyHandle* column_family,
+                         const ExternalSstFileInfo* file_info,
+                         bool move_file) override;
+  virtual Status AddFile(ColumnFamilyHandle* column_family,
+                         const std::string& file_path, bool move_file) override;
+
 #endif  // ROCKSDB_LITE
 
   // checks if all live files exist on file system and that their file sizes
   // match our in-memory records
   virtual Status CheckConsistency();
 
-  virtual Status GetDbIdentity(std::string& identity) override;
+  virtual Status GetDbIdentity(std::string& identity) const override;
 
   Status RunManualCompaction(ColumnFamilyData* cfd, int input_level,
                              int output_level, uint32_t output_path_id,
-                             const Slice* begin, const Slice* end);
+                             const Slice* begin, const Slice* end,
+                             bool disallow_trivial_move = false);
 
 #ifndef ROCKSDB_LITE
   // Extra methods (for testing) that are not in the public DB interface
@@ -209,7 +254,8 @@ class DBImpl : public DB {
 
   // Compact any files in the named level that overlap [*begin, *end]
   Status TEST_CompactRange(int level, const Slice* begin, const Slice* end,
-                           ColumnFamilyHandle* column_family = nullptr);
+                           ColumnFamilyHandle* column_family = nullptr,
+                           bool disallow_trivial_move = false);
 
   // Force current memtable contents to be flushed.
   Status TEST_FlushMemTable(bool wait = true);
@@ -257,6 +303,8 @@ class DBImpl : public DB {
 
   size_t TEST_LogsToFreeSize();
 
+  uint64_t TEST_LogfileNumber();
+
 #endif  // ROCKSDB_LITE
 
   // Returns the list of live files in 'live' and the list
@@ -279,6 +327,42 @@ class DBImpl : public DB {
 
   void CancelAllBackgroundWork(bool wait);
 
+  // Find the SuperVersion and reference it. Based on options, it might return
+  // the thread local cached one.
+  // Call ReturnAndCleanupSuperVersion() when it is no longer needed.
+  SuperVersion* GetAndRefSuperVersion(ColumnFamilyData* cfd);
+
+  // Similar to the previous function but looks up based on a column family id.
+  // nullptr will be returned if this column family no longer exists.
+  // REQUIRED: this function should only be called on the write thread or if the
+  // mutex is held.
+  SuperVersion* GetAndRefSuperVersion(uint32_t column_family_id);
+
+  // Same as above; should be called without mutex held and not on write thread.
+  SuperVersion* GetAndRefSuperVersionUnlocked(uint32_t column_family_id);
+
+  // Un-reference the super version and return it to the thread local cache if
+  // needed. If it is the last reference of the super version, clean it up
+  // after un-referencing it.
+  void ReturnAndCleanupSuperVersion(ColumnFamilyData* cfd, SuperVersion* sv);
+
+  // Similar to the previous function but looks up based on a column family id.
+  // nullptr will be returned if this column family no longer exists.
+  // REQUIRED: this function should only be called on the write thread.
+  void ReturnAndCleanupSuperVersion(uint32_t column_family_id,
+                                    SuperVersion* sv);
+
+  // Same as above; should be called without mutex held and not on write thread.
+  void ReturnAndCleanupSuperVersionUnlocked(uint32_t column_family_id,
+                                            SuperVersion* sv);
+
+  // REQUIRED: this function should only be called on the write thread or if the
+  // mutex is held.  Return value only valid until next call to this function or
+  // mutex is released.
+  ColumnFamilyHandle* GetColumnFamilyHandle(uint32_t column_family_id);
+
+  // Same as above; should be called without mutex held and not on write thread.
+  ColumnFamilyHandle* GetColumnFamilyHandleUnlocked(uint32_t column_family_id);
+
  protected:
   Env* const env_;
   const std::string dbname_;
@@ -289,11 +373,14 @@ class DBImpl : public DB {
   Iterator* NewInternalIterator(const ReadOptions&, ColumnFamilyData* cfd,
                                 SuperVersion* super_version, Arena* arena);
 
-  void NotifyOnFlushCompleted(ColumnFamilyData* cfd, uint64_t file_number,
-                              const MutableCFOptions& mutable_cf_options);
+  void NotifyOnFlushCompleted(ColumnFamilyData* cfd, FileMetaData* file_meta,
+                              const MutableCFOptions& mutable_cf_options,
+                              int job_id);
 
   void NotifyOnCompactionCompleted(ColumnFamilyData* cfd,
-                                   Compaction *c, const Status &st);
+                                   Compaction *c, const Status &st,
+                                   const CompactionJobStats& job_stats,
+                                   int job_id);
 
   void NewThreadStatusCfInfo(ColumnFamilyData* cfd) const;
 
@@ -301,6 +388,9 @@ class DBImpl : public DB {
 
   void EraseThreadStatusDbInfo() const;
 
+  Status WriteImpl(const WriteOptions& options, WriteBatch* updates,
+                   WriteCallback* callback);
+
  private:
   friend class DB;
   friend class InternalStats;
@@ -309,6 +399,9 @@ class DBImpl : public DB {
 #endif
   friend struct SuperVersion;
   friend class CompactedDBImpl;
+#ifndef NDEBUG
+  friend class XFTransactionWriteHandler;
+#endif
   struct CompactionState;
 
   struct WriteContext;
@@ -364,12 +457,14 @@ class DBImpl : public DB {
   // concurrent flush memtables to storage.
   Status WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
                                      MemTable* mem, VersionEdit* edit);
-  Status DelayWrite(uint64_t expiration_time);
+
+  // num_bytes: in the slowdown case, the delay time is calculated based on
+  //            how many bytes (`num_bytes`) are going through.
+  Status DelayWrite(uint64_t num_bytes);
 
   Status ScheduleFlushes(WriteContext* context);
 
-  Status SetNewMemtableAndNewLogFile(ColumnFamilyData* cfd,
-                                     WriteContext* context);
+  Status SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context);
 
   // Force current memtable contents to be flushed.
   Status FlushMemTable(ColumnFamilyData* cfd, const FlushOptions& options);
@@ -402,13 +497,6 @@ class DBImpl : public DB {
   Status BackgroundFlush(bool* madeProgress, JobContext* job_context,
                          LogBuffer* log_buffer);
 
-  // This function is called as part of compaction. It enables Flush process to
-  // preempt compaction, since it's higher priority
-  uint64_t CallFlushDuringCompaction(ColumnFamilyData* cfd,
-                                     const MutableCFOptions& mutable_cf_options,
-                                     JobContext* job_context,
-                                     LogBuffer* log_buffer);
-
   void PrintStatistics();
 
   // dump rocksdb.stats to LOG
@@ -430,6 +518,9 @@ class DBImpl : public DB {
   void AddToFlushQueue(ColumnFamilyData* cfd);
   ColumnFamilyData* PopFirstFromFlushQueue();
 
+  // helper function to call after some of the logs_ have been synced
+  void MarkLogsSynced(uint64_t up_to, bool synced_dir, const Status& status);
+
   // table_cache_ provides its own synchronization
   std::shared_ptr<Cache> table_cache_;
 
@@ -449,7 +540,6 @@ class DBImpl : public DB {
   // * whenever there is an error in background flush or compaction
   InstrumentedCondVar bg_cv_;
   uint64_t logfile_number_;
-  unique_ptr<log::Writer> log_;
   bool log_dir_synced_;
   bool log_empty_;
   ColumnFamilyHandleImpl* default_cf_handle_;
@@ -457,13 +547,45 @@ class DBImpl : public DB {
   unique_ptr<ColumnFamilyMemTablesImpl> column_family_memtables_;
   struct LogFileNumberSize {
     explicit LogFileNumberSize(uint64_t _number)
-        : number(_number), size(0), getting_flushed(false) {}
+        : number(_number) {}
     void AddSize(uint64_t new_size) { size += new_size; }
     uint64_t number;
-    uint64_t size;
-    bool getting_flushed;
+    uint64_t size = 0;
+    bool getting_flushed = false;
+  };
+  struct LogWriterNumber {
+    // pass ownership of _writer
+    LogWriterNumber(uint64_t _number, log::Writer* _writer)
+        : number(_number), writer(_writer) {}
+
+    log::Writer* ReleaseWriter() {
+      auto* w = writer;
+      writer = nullptr;
+      return w;
+    }
+    void ClearWriter() {
+      delete writer;
+      writer = nullptr;
+    }
+
+    uint64_t number;
+    // Visual Studio can't handle a deque whose element type is noncopyable,
+    // which a unique_ptr member would make this struct; hence a raw pointer.
+    log::Writer* writer;  // own
+    // true for some prefix of logs_
+    bool getting_synced = false;
   };
   std::deque<LogFileNumberSize> alive_log_files_;
+  // Log files that aren't fully synced, and the current log file.
+  // Synchronization:
+  //  - push_back() is done from write thread with locked mutex_,
+  //  - pop_front() is done from any thread with locked mutex_,
+  //  - back() and items with getting_synced=true are not popped,
+  //  - it follows that write thread with unlocked mutex_ can safely access
+  //    back() and items with getting_synced=true.
+  std::deque<LogWriterNumber> logs_;
+  // Signaled when getting_synced becomes false for some of the logs_.
+  InstrumentedCondVar log_sync_cv_;
   uint64_t total_log_size_;
   // only used for dynamically adjusting max_total_wal_size. it is a sum of
   // [write_buffer_size * max_write_buffer_number] over all column families
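
Per the synchronization comment above, only the write thread may push_back()
onto logs_, and back() plus any getting_synced entries are never popped. A
simplified sketch (not the actual MarkLogsSynced() body) of draining the
synced prefix with mutex_ held, where up_to is the highest synced log number:

    while (logs_.size() > 1 && logs_.front().number <= up_to) {
      logs_.front().getting_synced = false;  // sync finished for this log
      logs_.front().ClearWriter();           // delete the owned log::Writer
      logs_.pop_front();                     // back() is never popped
    }
    log_sync_cv_.SignalAll();  // getting_synced became false for some logs_
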
@@ -513,6 +635,11 @@ class DBImpl : public DB {
   WriteBatch tmp_batch_;
 
   WriteController write_controller_;
+
+  // Size of the last batch group. In slowdown mode, next write needs to
+  // sleep if it uses up the quota.
+  uint64_t last_batch_group_size_;
+
   FlushScheduler flush_scheduler_;
 
   SnapshotList snapshots_;
@@ -574,10 +701,11 @@ class DBImpl : public DB {
     uint32_t output_path_id;
     bool done;
     Status status;
-    bool in_progress;           // compaction request being processed?
-    const InternalKey* begin;   // nullptr means beginning of key range
-    const InternalKey* end;     // nullptr means end of key range
-    InternalKey tmp_storage;    // Used to keep track of compaction progress
+    bool in_progress;             // compaction request being processed?
+    const InternalKey* begin;     // nullptr means beginning of key range
+    const InternalKey* end;       // nullptr means end of key range
+    InternalKey tmp_storage;      // Used to keep track of compaction progress
+    bool disallow_trivial_move;   // Force actual compaction to run
   };
   ManualCompaction* manual_compaction_;
 
@@ -605,6 +733,9 @@ class DBImpl : public DB {
   bool flush_on_destroy_; // Used when disableWAL is true.
 
   static const int KEEP_LOG_FILE_NUM = 1000;
+  // MSVC version 1800 still does not have constexpr for ::max()
+  static const uint64_t kNoTimeOut = port::kMaxUint64;
+
   std::string db_absolute_path_;
 
   // The options to access storage files
@@ -617,8 +748,8 @@ class DBImpl : public DB {
   // Unified interface for logging events
   EventLogger event_logger_;
 
-  // A value of true temporarily disables scheduling of background work
-  bool bg_work_gate_closed_;
+  // A value of >0 temporarily disables scheduling of background work
+  int bg_work_paused_;
 
   // Guard against multiple concurrent refitting
   bool refitting_level_;
@@ -626,12 +757,6 @@ class DBImpl : public DB {
   // Indicate DB was opened successfully
   bool opened_successfully_;
 
-  // The list of registered event listeners.
-  std::list<EventListener*> listeners_;
-
-  // count how many events are currently being notified.
-  int notifying_events_;
-
   // No copying allowed
   DBImpl(const DBImpl&);
   void operator=(const DBImpl&);
@@ -647,31 +772,16 @@ class DBImpl : public DB {
   // the InstallSuperVersion() function. Background threads carry
   // job_context which can have new_superversion already
   // allocated.
-  void InstallSuperVersionBackground(
+  void InstallSuperVersionAndScheduleWorkWrapper(
       ColumnFamilyData* cfd, JobContext* job_context,
       const MutableCFOptions& mutable_cf_options);
 
   // All ColumnFamily state changes go through this function. Here we analyze
   // the new state and we schedule background work if we detect that the new
   // state needs flush or compaction.
-  // If dont_schedule_bg_work == true, then caller asks us to not schedule flush
-  // or compaction here, but it also promises to schedule needed background
-  // work. We use this to schedule background compactions when we are in the
-  // write thread, which is very performance critical. Caller schedules
-  // background work as soon as it exits the write thread
-  SuperVersion* InstallSuperVersion(ColumnFamilyData* cfd, SuperVersion* new_sv,
-                                    const MutableCFOptions& mutable_cf_options,
-                                    bool dont_schedule_bg_work = false);
-
-  // Find Super version and reference it. Based on options, it might return
-  // the thread local cached one.
-  inline SuperVersion* GetAndRefSuperVersion(ColumnFamilyData* cfd);
-
-  // Un-reference the super version and return it to thread local cache if
-  // needed. If it is the last reference of the super version. Clean it up
-  // after un-referencing it.
-  inline void ReturnAndCleanupSuperVersion(ColumnFamilyData* cfd,
-                                           SuperVersion* sv);
+  SuperVersion* InstallSuperVersionAndScheduleWork(
+      ColumnFamilyData* cfd, SuperVersion* new_sv,
+      const MutableCFOptions& mutable_cf_options);
 
 #ifndef ROCKSDB_LITE
   using DB::GetPropertiesOfAllTables;
diff --git a/src/rocksdb/db/db_impl_debug.cc b/src/rocksdb/db/db_impl_debug.cc
index 5c7a353..dc40fef 100644
--- a/src/rocksdb/db/db_impl_debug.cc
+++ b/src/rocksdb/db/db_impl_debug.cc
@@ -73,7 +73,8 @@ uint64_t DBImpl::TEST_Current_Manifest_FileNo() {
 
 Status DBImpl::TEST_CompactRange(int level, const Slice* begin,
                                  const Slice* end,
-                                 ColumnFamilyHandle* column_family) {
+                                 ColumnFamilyHandle* column_family,
+                                 bool disallow_trivial_move) {
   ColumnFamilyData* cfd;
   if (column_family == nullptr) {
     cfd = default_cf_handle_->cfd();
@@ -86,7 +87,8 @@ Status DBImpl::TEST_CompactRange(int level, const Slice* begin,
        cfd->ioptions()->compaction_style == kCompactionStyleFIFO)
           ? level
           : level + 1;
-  return RunManualCompaction(cfd, level, output_level, 0, begin, end);
+  return RunManualCompaction(cfd, level, output_level, 0, begin, end,
+                             disallow_trivial_move);
 }
 
 Status DBImpl::TEST_FlushMemTable(bool wait) {
@@ -129,15 +131,14 @@ void DBImpl::TEST_UnlockMutex() {
 }
 
 void* DBImpl::TEST_BeginWrite() {
-  auto w = new WriteThread::Writer(&mutex_);
-  Status s = write_thread_.EnterWriteThread(w, 0);
-  assert(s.ok() && !w->done);  // No timeout and nobody should do our job
+  auto w = new WriteThread::Writer();
+  write_thread_.EnterUnbatched(w, &mutex_);
   return reinterpret_cast<void*>(w);
 }
 
 void DBImpl::TEST_EndWrite(void* w) {
   auto writer = reinterpret_cast<WriteThread::Writer*>(w);
-  write_thread_.ExitWriteThread(writer, writer, Status::OK());
+  write_thread_.ExitUnbatched(writer);
   delete writer;
 }
 
@@ -146,5 +147,10 @@ size_t DBImpl::TEST_LogsToFreeSize() {
   return logs_to_free_.size();
 }
 
+uint64_t DBImpl::TEST_LogfileNumber() {
+  InstrumentedMutexLock l(&mutex_);
+  return logfile_number_;
+}
+
 }  // namespace rocksdb
 #endif  // ROCKSDB_LITE
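
The debug hooks above moved from the old EnterWriteThread()/ExitWriteThread()
pair (with its timeout handling) to EnterUnbatched()/ExitUnbatched(): the
caller now simply blocks until it exclusively owns the write thread. A
hypothetical RAII wrapper (not part of this patch) would make the pairing
harder to get wrong in tests:

    // Hypothetical helper around the TEST_ hooks shown above.
    class ScopedTestWrite {
     public:
      explicit ScopedTestWrite(rocksdb::DBImpl* db)
          : db_(db), handle_(db->TEST_BeginWrite()) {}
      ~ScopedTestWrite() { db_->TEST_EndWrite(handle_); }
     private:
      rocksdb::DBImpl* db_;
      void* handle_;  // opaque WriteThread::Writer*, released in EndWrite
    };
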
diff --git a/src/rocksdb/db/db_impl_experimental.cc b/src/rocksdb/db/db_impl_experimental.cc
index d6c3dfc..6bf0ba6 100644
--- a/src/rocksdb/db/db_impl_experimental.cc
+++ b/src/rocksdb/db/db_impl_experimental.cc
@@ -130,14 +130,15 @@ Status DBImpl::PromoteL0(ColumnFamilyHandle* column_family, int target_level) {
       edit.DeleteFile(0, f->fd.GetNumber());
       edit.AddFile(target_level, f->fd.GetNumber(), f->fd.GetPathId(),
                    f->fd.GetFileSize(), f->smallest, f->largest,
-                   f->smallest_seqno, f->largest_seqno);
+                   f->smallest_seqno, f->largest_seqno,
+                   f->marked_for_compaction);
     }
 
     status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
                                     &edit, &mutex_, directories_.GetDbDir());
     if (status.ok()) {
-      InstallSuperVersionBackground(cfd, &job_context,
-                                    *cfd->GetLatestMutableCFOptions());
+      InstallSuperVersionAndScheduleWorkWrapper(
+          cfd, &job_context, *cfd->GetLatestMutableCFOptions());
     }
   }  // lock released here
   LogFlush(db_options_.info_log);
diff --git a/src/rocksdb/db/db_impl_readonly.cc b/src/rocksdb/db/db_impl_readonly.cc
index c1d61e3..618ade8 100644
--- a/src/rocksdb/db/db_impl_readonly.cc
+++ b/src/rocksdb/db/db_impl_readonly.cc
@@ -5,7 +5,8 @@
 
 
 #include "db/db_impl_readonly.h"
-#include "utilities/compacted_db/compacted_db_impl.h"
+
+#include "db/compacted_db_impl.h"
 #include "db/db_impl.h"
 #include "db/merge_context.h"
 #include "db/db_iter.h"
diff --git a/src/rocksdb/db/db_impl_readonly.h b/src/rocksdb/db/db_impl_readonly.h
index 25fcb43..8f3103a 100644
--- a/src/rocksdb/db/db_impl_readonly.h
+++ b/src/rocksdb/db/db_impl_readonly.h
@@ -53,15 +53,20 @@ class DBImplReadOnly : public DBImpl {
                         const Slice& key) override {
     return Status::NotSupported("Not supported operation in read only mode.");
   }
+  using DBImpl::SingleDelete;
+  virtual Status SingleDelete(const WriteOptions& options,
+                              ColumnFamilyHandle* column_family,
+                              const Slice& key) override {
+    return Status::NotSupported("Not supported operation in read only mode.");
+  }
   virtual Status Write(const WriteOptions& options,
                        WriteBatch* updates) override {
     return Status::NotSupported("Not supported operation in read only mode.");
   }
   using DBImpl::CompactRange;
-  virtual Status CompactRange(ColumnFamilyHandle* column_family,
-                              const Slice* begin, const Slice* end,
-                              bool reduce_level = false, int target_level = -1,
-                              uint32_t target_path_id = 0) override {
+  virtual Status CompactRange(const CompactRangeOptions& options,
+                              ColumnFamilyHandle* column_family,
+                              const Slice* begin, const Slice* end) override {
     return Status::NotSupported("Not supported operation in read only mode.");
   }
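
The override above tracks the new public CompactRange() signature, which
folds the old reduce_level/target_level/target_path_id arguments into a
CompactRangeOptions struct. A hedged sketch of a caller against the new API
(field names as of this RocksDB version; treat them as assumptions):

    rocksdb::CompactRangeOptions cro;
    cro.change_level = true;   // replaces the old reduce_level flag
    cro.target_level = 1;
    // nullptr/nullptr compacts the whole key range of the column family.
    rocksdb::Status s = db->CompactRange(cro, nullptr, nullptr);
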
 
@@ -95,6 +100,11 @@ class DBImplReadOnly : public DBImpl {
     return Status::NotSupported("Not supported operation in read only mode.");
   }
 
+  using DBImpl::SyncWAL;
+  virtual Status SyncWAL() override {
+    return Status::NotSupported("Not supported operation in read only mode.");
+  }
+
  private:
   friend class DB;
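
Every mutating entry point in DBImplReadOnly, including the newly added
SingleDelete() and SyncWAL() overrides, uniformly returns
Status::NotSupported(). A minimal sketch of what a caller observes, assuming
an existing database at `path`:

    #include <cassert>
    #include <string>
    #include "rocksdb/db.h"

    void ReadOnlyDemo(const std::string& path) {
      rocksdb::DB* db = nullptr;
      rocksdb::Options options;
      rocksdb::Status s = rocksdb::DB::OpenForReadOnly(options, path, &db);
      assert(s.ok());
      // Reads work as usual; any write is rejected.
      s = db->Put(rocksdb::WriteOptions(), "key", "value");
      assert(s.IsNotSupported());
      delete db;
    }
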
 
diff --git a/src/rocksdb/db/db_inplace_update_test.cc b/src/rocksdb/db/db_inplace_update_test.cc
new file mode 100644
index 0000000..a04c2f5
--- /dev/null
+++ b/src/rocksdb/db/db_inplace_update_test.cc
@@ -0,0 +1,171 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "port/stack_trace.h"
+#include "util/db_test_util.h"
+
+namespace rocksdb {
+
+class DBTestInPlaceUpdate : public DBTestBase {
+ public:
+  DBTestInPlaceUpdate() : DBTestBase("/db_inplace_update_test") {}
+};
+
+TEST_F(DBTestInPlaceUpdate, InPlaceUpdate) {
+  do {
+    Options options;
+    options.create_if_missing = true;
+    options.inplace_update_support = true;
+    options.env = env_;
+    options.write_buffer_size = 100000;
+    options = CurrentOptions(options);
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    // Update key with values of smaller size
+    int numValues = 10;
+    for (int i = numValues; i > 0; i--) {
+      std::string value = DummyString(i, 'a');
+      ASSERT_OK(Put(1, "key", value));
+      ASSERT_EQ(value, Get(1, "key"));
+    }
+
+    // Only 1 instance for that key.
+    validateNumberOfEntries(1, 1);
+  } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestInPlaceUpdate, InPlaceUpdateLargeNewValue) {
+  do {
+    Options options;
+    options.create_if_missing = true;
+    options.inplace_update_support = true;
+    options.env = env_;
+    options.write_buffer_size = 100000;
+    options = CurrentOptions(options);
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    // Update key with values of larger size
+    int numValues = 10;
+    for (int i = 0; i < numValues; i++) {
+      std::string value = DummyString(i, 'a');
+      ASSERT_OK(Put(1, "key", value));
+      ASSERT_EQ(value, Get(1, "key"));
+    }
+
+    // All 10 updates exist in the internal iterator
+    validateNumberOfEntries(numValues, 1);
+  } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestInPlaceUpdate, InPlaceUpdateCallbackSmallerSize) {
+  do {
+    Options options;
+    options.create_if_missing = true;
+    options.inplace_update_support = true;
+
+    options.env = env_;
+    options.write_buffer_size = 100000;
+    options.inplace_callback =
+      rocksdb::DBTestInPlaceUpdate::updateInPlaceSmallerSize;
+    options = CurrentOptions(options);
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    // Update key with values of smaller size
+    int numValues = 10;
+    ASSERT_OK(Put(1, "key", DummyString(numValues, 'a')));
+    ASSERT_EQ(DummyString(numValues, 'c'), Get(1, "key"));
+
+    for (int i = numValues; i > 0; i--) {
+      ASSERT_OK(Put(1, "key", DummyString(i, 'a')));
+      ASSERT_EQ(DummyString(i - 1, 'b'), Get(1, "key"));
+    }
+
+    // Only 1 instance for that key.
+    validateNumberOfEntries(1, 1);
+  } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestInPlaceUpdate, InPlaceUpdateCallbackSmallerVarintSize) {
+  do {
+    Options options;
+    options.create_if_missing = true;
+    options.inplace_update_support = true;
+
+    options.env = env_;
+    options.write_buffer_size = 100000;
+    options.inplace_callback =
+      rocksdb::DBTestInPlaceUpdate::updateInPlaceSmallerVarintSize;
+    options = CurrentOptions(options);
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    // Update key with values of smaller varint size
+    int numValues = 265;
+    ASSERT_OK(Put(1, "key", DummyString(numValues, 'a')));
+    ASSERT_EQ(DummyString(numValues, 'c'), Get(1, "key"));
+
+    for (int i = numValues; i > 0; i--) {
+      ASSERT_OK(Put(1, "key", DummyString(i, 'a')));
+      ASSERT_EQ(DummyString(1, 'b'), Get(1, "key"));
+    }
+
+    // Only 1 instance for that key.
+    validateNumberOfEntries(1, 1);
+  } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestInPlaceUpdate, InPlaceUpdateCallbackLargeNewValue) {
+  do {
+    Options options;
+    options.create_if_missing = true;
+    options.inplace_update_support = true;
+
+    options.env = env_;
+    options.write_buffer_size = 100000;
+    options.inplace_callback =
+      rocksdb::DBTestInPlaceUpdate::updateInPlaceLargerSize;
+    options = CurrentOptions(options);
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    // Update key with values of larger size
+    int numValues = 10;
+    for (int i = 0; i < numValues; i++) {
+      ASSERT_OK(Put(1, "key", DummyString(i, 'a')));
+      ASSERT_EQ(DummyString(i, 'c'), Get(1, "key"));
+    }
+
+    // No inplace updates. All updates are puts with new seq number
+    // All 10 updates exist in the internal iterator
+    validateNumberOfEntries(numValues, 1);
+  } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestInPlaceUpdate, InPlaceUpdateCallbackNoAction) {
+  do {
+    Options options;
+    options.create_if_missing = true;
+    options.inplace_update_support = true;
+
+    options.env = env_;
+    options.write_buffer_size = 100000;
+    options.inplace_callback =
+      rocksdb::DBTestInPlaceUpdate::updateInPlaceNoAction;
+    options = CurrentOptions(options);
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    // Callback function requests no actions from db
+    ASSERT_OK(Put(1, "key", DummyString(1, 'a')));
+    ASSERT_EQ(Get(1, "key"), "NOT_FOUND");
+  } while (ChangeCompactOptions());
+}
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  rocksdb::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
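
The new test file above pins down the observable contract of
inplace_update_support: an overwrite whose value fits in the existing slot is
rewritten in place in the memtable (one internal entry per key), while a
larger value falls back to a normal append. A minimal standalone sketch of
the option, independent of the test harness helpers:

    #include <cassert>
    #include <string>
    #include "rocksdb/db.h"

    void InplaceDemo(const std::string& path) {
      rocksdb::Options options;
      options.create_if_missing = true;
      options.inplace_update_support = true;
      rocksdb::DB* db = nullptr;
      assert(rocksdb::DB::Open(options, path, &db).ok());
      assert(db->Put(rocksdb::WriteOptions(), "key", "aaaaaaaa").ok());
      // Smaller replacement value: updated in place, no new memtable entry.
      assert(db->Put(rocksdb::WriteOptions(), "key", "aaaa").ok());
      std::string value;
      assert(db->Get(rocksdb::ReadOptions(), "key", &value).ok());
      assert(value == "aaaa");
      delete db;
    }
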
diff --git a/src/rocksdb/db/db_iter.cc b/src/rocksdb/db/db_iter.cc
index ce75f43..065b8e4 100644
--- a/src/rocksdb/db/db_iter.cc
+++ b/src/rocksdb/db/db_iter.cc
@@ -115,6 +115,7 @@ class DBIter: public Iterator {
   virtual void SeekToLast() override;
 
  private:
+  void ReverseToBackward();
   void PrevInternal();
   void FindParseableKey(ParsedInternalKey* ikey, Direction direction);
   bool FindValueForCurrentKey();
@@ -188,6 +189,13 @@ void DBIter::Next() {
     return;
   }
   FindNextUserEntry(true /* skipping the current user key */);
+  if (statistics_ != nullptr) {
+    RecordTick(statistics_, NUMBER_DB_NEXT);
+    if (valid_) {
+      RecordTick(statistics_, NUMBER_DB_NEXT_FOUND);
+      RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size());
+    }
+  }
 }
 
 // PRE: saved_key_ has the current user key if skipping
@@ -227,6 +235,7 @@ void DBIter::FindNextUserEntryInternal(bool skipping) {
         } else {
           switch (ikey.type) {
             case kTypeDeletion:
+            case kTypeSingleDeletion:
               // Arrange to skip all upcoming entries for this key since
               // they are hidden by this deletion.
               saved_key_.SetKey(ikey.user_key);
@@ -255,12 +264,13 @@ void DBIter::FindNextUserEntryInternal(bool skipping) {
     // If we have sequentially iterated via numerous keys and still not
     // found the next user-key, then it is better to seek so that we can
     // avoid too many key comparisons. We seek to the last occurrence of
-    // our current key by looking for sequence number 0.
+    // our current key by looking for sequence number 0 and type deletion
+    // (the smallest type).
     if (skipping && num_skipped > max_skip_) {
       num_skipped = 0;
       std::string last_key;
       AppendInternalKey(&last_key, ParsedInternalKey(saved_key_.GetKey(), 0,
-                                                     kValueTypeForSeek));
+                                                     kTypeDeletion));
       iter_->Seek(last_key);
       RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
     } else {
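
The reseek above leans on the internal key ordering: entries sort by
(user_key ascending, sequence descending, type descending), so sequence 0
combined with kTypeDeletion (the smallest type) is the last possible slot for
a given user key. A worked example for user key "k":

    ("k", seq=7, kTypeValue) < ("k", seq=3, kTypeMerge)
        < ("k", seq=0, kTypeDeletion) < ("l", seq=9, kTypeValue)

Seeking to ("k", 0, kTypeDeletion) therefore lands at or past every remaining
version of "k" (in the common case, on the first entry of the next user key),
which is exactly where the skipping loop wants to resume.
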
@@ -296,19 +306,15 @@ void DBIter::MergeValuesNewToOld() {
       continue;
     }
 
-    if (user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) != 0) {
+    if (!user_comparator_->Equal(ikey.user_key, saved_key_.GetKey())) {
       // hit the next user key, stop right here
       break;
-    }
-
-    if (kTypeDeletion == ikey.type) {
+    } else if (kTypeDeletion == ikey.type || kTypeSingleDeletion == ikey.type) {
       // hit a delete with the same user key, stop right here
       // iter_ is positioned after delete
       iter_->Next();
       break;
-    }
-
-    if (kTypeValue == ikey.type) {
+    } else if (kTypeValue == ikey.type) {
       // hit a put, merge the put value with operands and store the
       // final result in saved_value_. We are done!
       // ignore corruption if there is any.
@@ -324,13 +330,13 @@ void DBIter::MergeValuesNewToOld() {
       // iter_ is positioned after put
       iter_->Next();
       return;
-    }
-
-    if (kTypeMerge == ikey.type) {
+    } else if (kTypeMerge == ikey.type) {
       // hit a merge, add the value as an operand and run associative merge.
       // when complete, add result to operands and continue.
       const Slice& val = iter_->value();
       operands.push_front(val.ToString());
+    } else {
+      assert(false);
     }
   }
 
@@ -350,10 +356,43 @@ void DBIter::MergeValuesNewToOld() {
 void DBIter::Prev() {
   assert(valid_);
   if (direction_ == kForward) {
-    FindPrevUserKey();
-    direction_ = kReverse;
+    ReverseToBackward();
   }
   PrevInternal();
+  if (statistics_ != nullptr) {
+    RecordTick(statistics_, NUMBER_DB_PREV);
+    if (valid_) {
+      RecordTick(statistics_, NUMBER_DB_PREV_FOUND);
+      RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size());
+    }
+  }
+}
+
+void DBIter::ReverseToBackward() {
+  if (current_entry_is_merged_) {
+    // The iterator is no longer positioned within the same user key. Call
+    // Prev() until we find the previous user key.
+    if (!iter_->Valid()) {
+      iter_->SeekToLast();
+    }
+    ParsedInternalKey ikey;
+    FindParseableKey(&ikey, kReverse);
+    while (iter_->Valid() &&
+           user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) > 0) {
+      iter_->Prev();
+      FindParseableKey(&ikey, kReverse);
+    }
+  }
+#ifndef NDEBUG
+  if (iter_->Valid()) {
+    ParsedInternalKey ikey;
+    assert(ParseKey(&ikey));
+    assert(user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) <= 0);
+  }
+#endif
+
+  FindPrevUserKey();
+  direction_ = kReverse;
 }
 
 void DBIter::PrevInternal() {
@@ -372,7 +411,7 @@ void DBIter::PrevInternal() {
         return;
       }
       FindParseableKey(&ikey, kReverse);
-      if (user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) == 0) {
+      if (user_comparator_->Equal(ikey.user_key, saved_key_.GetKey())) {
         FindPrevUserKey();
       }
       return;
@@ -381,8 +420,7 @@ void DBIter::PrevInternal() {
       break;
     }
     FindParseableKey(&ikey, kReverse);
-    if (user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) == 0) {
-
+    if (user_comparator_->Equal(ikey.user_key, saved_key_.GetKey())) {
       FindPrevUserKey();
     }
   }
@@ -392,12 +430,14 @@ void DBIter::PrevInternal() {
 }
 
 // This function checks if the entry with the biggest sequence_number <= sequence_
-// is non kTypeDeletion. If it's not, we save value in saved_value_
+// is neither kTypeDeletion nor kTypeSingleDeletion. If it is not a deletion,
+// we save its value in saved_value_
 bool DBIter::FindValueForCurrentKey() {
   assert(iter_->Valid());
   // Contains operands for merge operator.
   std::deque<std::string> operands;
-  // last entry before merge (could be kTypeDeletion or kTypeValue)
+  // last entry before merge (could be kTypeDeletion, kTypeSingleDeletion or
+  // kTypeValue)
   ValueType last_not_merge_type = kTypeDeletion;
   ValueType last_key_entry_type = kTypeDeletion;
 
@@ -406,7 +446,7 @@ bool DBIter::FindValueForCurrentKey() {
 
   size_t num_skipped = 0;
   while (iter_->Valid() && ikey.sequence <= sequence_ &&
-         (user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) == 0)) {
+         user_comparator_->Equal(ikey.user_key, saved_key_.GetKey())) {
     // We iterate too much: let's use Seek() to avoid too many key comparisons
     if (num_skipped >= max_skip_) {
       return FindValueForCurrentKeyUsingSeek();
@@ -420,8 +460,9 @@ bool DBIter::FindValueForCurrentKey() {
         last_not_merge_type = kTypeValue;
         break;
       case kTypeDeletion:
+      case kTypeSingleDeletion:
         operands.clear();
-        last_not_merge_type = kTypeDeletion;
+        last_not_merge_type = last_key_entry_type;
         PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
         break;
       case kTypeMerge:
@@ -433,7 +474,7 @@ bool DBIter::FindValueForCurrentKey() {
     }
 
     PERF_COUNTER_ADD(internal_key_skipped_count, 1);
-    assert(user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) == 0);
+    assert(user_comparator_->Equal(ikey.user_key, saved_key_.GetKey()));
     iter_->Prev();
     ++num_skipped;
     FindParseableKey(&ikey, kReverse);
@@ -441,6 +482,7 @@ bool DBIter::FindValueForCurrentKey() {
 
   switch (last_key_entry_type) {
     case kTypeDeletion:
+    case kTypeSingleDeletion:
       valid_ = false;
       return false;
     case kTypeMerge:
@@ -489,7 +531,8 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() {
   ParsedInternalKey ikey;
   FindParseableKey(&ikey, kForward);
 
-  if (ikey.type == kTypeValue || ikey.type == kTypeDeletion) {
+  if (ikey.type == kTypeValue || ikey.type == kTypeDeletion ||
+      ikey.type == kTypeSingleDeletion) {
     if (ikey.type == kTypeValue) {
       saved_value_ = iter_->value().ToString();
       valid_ = true;
@@ -503,7 +546,7 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() {
   // in operands
   std::deque<std::string> operands;
   while (iter_->Valid() &&
-         (user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) == 0) &&
+         user_comparator_->Equal(ikey.user_key, saved_key_.GetKey()) &&
          ikey.type == kTypeMerge) {
     operands.push_front(iter_->value().ToString());
     iter_->Next();
@@ -511,8 +554,8 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() {
   }
 
   if (!iter_->Valid() ||
-      (user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) != 0) ||
-      ikey.type == kTypeDeletion) {
+      !user_comparator_->Equal(ikey.user_key, saved_key_.GetKey()) ||
+      ikey.type == kTypeDeletion || ikey.type == kTypeSingleDeletion) {
     {
       StopWatchNano timer(env_, statistics_ != nullptr);
       PERF_TIMER_GUARD(merge_operator_time_nanos);
@@ -522,7 +565,7 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() {
     }
     // Make iter_ valid and point to saved_key_
     if (!iter_->Valid() ||
-        (user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) != 0)) {
+        !user_comparator_->Equal(ikey.user_key, saved_key_.GetKey())) {
       iter_->Seek(last_key);
       RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
     }
@@ -553,7 +596,7 @@ void DBIter::FindNextUserKey() {
   ParsedInternalKey ikey;
   FindParseableKey(&ikey, kForward);
   while (iter_->Valid() &&
-         user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) != 0) {
+         !user_comparator_->Equal(ikey.user_key, saved_key_.GetKey())) {
     iter_->Next();
     FindParseableKey(&ikey, kForward);
   }
@@ -567,19 +610,23 @@ void DBIter::FindPrevUserKey() {
   size_t num_skipped = 0;
   ParsedInternalKey ikey;
   FindParseableKey(&ikey, kReverse);
-  while (iter_->Valid() &&
-         user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) == 0) {
-    if (num_skipped >= max_skip_) {
-      num_skipped = 0;
-      IterKey last_key;
-      last_key.SetInternalKey(ParsedInternalKey(
-          saved_key_.GetKey(), kMaxSequenceNumber, kValueTypeForSeek));
-      iter_->Seek(last_key.GetKey());
-      RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
+  int cmp;
+  while (iter_->Valid() && ((cmp = user_comparator_->Compare(
+                                 ikey.user_key, saved_key_.GetKey())) == 0 ||
+                            (cmp > 0 && ikey.sequence > sequence_))) {
+    if (cmp == 0) {
+      if (num_skipped >= max_skip_) {
+        num_skipped = 0;
+        IterKey last_key;
+        last_key.SetInternalKey(ParsedInternalKey(
+            saved_key_.GetKey(), kMaxSequenceNumber, kValueTypeForSeek));
+        iter_->Seek(last_key.GetKey());
+        RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
+      } else {
+        ++num_skipped;
+      }
     }
-
     iter_->Prev();
-    ++num_skipped;
     FindParseableKey(&ikey, kReverse);
   }
 }
@@ -597,21 +644,6 @@ void DBIter::FindParseableKey(ParsedInternalKey* ikey, Direction direction) {
 
 void DBIter::Seek(const Slice& target) {
   StopWatch sw(env_, statistics_, DB_SEEK);
-
-  // total ordering is not guaranteed if prefix_extractor is set
-  // hence prefix based seeks will not give correct results
-  if (iterate_upper_bound_ != nullptr && prefix_extractor_ != nullptr) {
-    if (!prefix_extractor_->InDomain(*iterate_upper_bound_) ||
-        !prefix_extractor_->InDomain(target) ||
-        prefix_extractor_->Transform(*iterate_upper_bound_).compare(
-          prefix_extractor_->Transform(target)) != 0) {
-      status_ = Status::InvalidArgument("read_options.iterate_*_bound "
-                  " and seek target need to have the same prefix.");
-      valid_ = false;
-      return;
-    }
-  }
-
   saved_key_.Clear();
   // now saved_key_ is used to store the internal key.
   saved_key_.SetInternalKey(target, sequence_);
@@ -621,10 +653,17 @@ void DBIter::Seek(const Slice& target) {
     iter_->Seek(saved_key_.GetKey());
   }
 
+  RecordTick(statistics_, NUMBER_DB_SEEK);
   if (iter_->Valid()) {
     direction_ = kForward;
     ClearSavedValue();
-    FindNextUserEntry(false /*not skipping */);
+    FindNextUserEntry(false /* not skipping */);
+    if (statistics_ != nullptr) {
+      if (valid_) {
+        RecordTick(statistics_, NUMBER_DB_SEEK_FOUND);
+        RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size());
+      }
+    }
   } else {
     valid_ = false;
   }
@@ -632,7 +671,7 @@ void DBIter::Seek(const Slice& target) {
 
 void DBIter::SeekToFirst() {
   // Don't use iter_::Seek() if we set a prefix extractor
-  // because prefix seek wiil be used.
+  // because prefix seek will be used.
   if (prefix_extractor_ != nullptr) {
     max_skip_ = std::numeric_limits<uint64_t>::max();
   }
@@ -644,8 +683,15 @@ void DBIter::SeekToFirst() {
     iter_->SeekToFirst();
   }
 
+  RecordTick(statistics_, NUMBER_DB_SEEK);
   if (iter_->Valid()) {
     FindNextUserEntry(false /* not skipping */);
+    if (statistics_ != nullptr) {
+      if (valid_) {
+        RecordTick(statistics_, NUMBER_DB_SEEK_FOUND);
+        RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size());
+      }
+    }
   } else {
     valid_ = false;
   }
@@ -653,7 +699,7 @@ void DBIter::SeekToFirst() {
 
 void DBIter::SeekToLast() {
   // Don't use iter_::Seek() if we set a prefix extractor
-  // because prefix seek wiil be used.
+  // because prefix seek will be used.
   if (prefix_extractor_ != nullptr) {
     max_skip_ = std::numeric_limits<uint64_t>::max();
   }
@@ -664,8 +710,36 @@ void DBIter::SeekToLast() {
     PERF_TIMER_GUARD(seek_internal_seek_time);
     iter_->SeekToLast();
   }
+  // When iterate_upper_bound is set, position the iterator at the last key
+  // smaller than ReadOptions.iterate_upper_bound.
+  if (iter_->Valid() && iterate_upper_bound_ != nullptr) {
+    saved_key_.SetKey(*iterate_upper_bound_);
+    std::string last_key;
+    AppendInternalKey(&last_key,
+                      ParsedInternalKey(saved_key_.GetKey(), kMaxSequenceNumber,
+                                        kValueTypeForSeek));
+
+    iter_->Seek(last_key);
 
+    if (!iter_->Valid()) {
+      iter_->SeekToLast();
+    } else {
+      iter_->Prev();
+      if (!iter_->Valid()) {
+        valid_ = false;
+        return;
+      }
+    }
+  }
   PrevInternal();
+  if (statistics_ != nullptr) {
+    RecordTick(statistics_, NUMBER_DB_SEEK);
+    if (valid_) {
+      RecordTick(statistics_, NUMBER_DB_SEEK_FOUND);
+      RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size());
+    }
+  }
 }
 
 Iterator* NewDBIterator(Env* env, const ImmutableCFOptions& ioptions,
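
The Seek()/Next()/Prev()/SeekTo*() paths above now tick NUMBER_DB_SEEK,
NUMBER_DB_NEXT and NUMBER_DB_PREV, their *_FOUND variants, and
ITER_BYTES_READ. A short sketch of surfacing them through the standard
statistics API (ticker names as introduced by this patch):

    #include "rocksdb/statistics.h"

    // Assumes `options` had statistics enabled before the DB was opened,
    //   options.statistics = rocksdb::CreateDBStatistics();
    // and that some iteration has since run.
    uint64_t seeks =
        options.statistics->getTickerCount(rocksdb::NUMBER_DB_SEEK);
    uint64_t iter_bytes =
        options.statistics->getTickerCount(rocksdb::ITER_BYTES_READ);
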
diff --git a/src/rocksdb/db/db_iter_test.cc b/src/rocksdb/db/db_iter_test.cc
index 18b38ac..68c5b15 100644
--- a/src/rocksdb/db/db_iter_test.cc
+++ b/src/rocksdb/db/db_iter_test.cc
@@ -8,13 +8,17 @@
 #include <algorithm>
 #include <utility>
 
+#include "db/db_iter.h"
 #include "db/dbformat.h"
 #include "rocksdb/comparator.h"
 #include "rocksdb/options.h"
+#include "rocksdb/perf_context.h"
 #include "rocksdb/slice.h"
 #include "rocksdb/statistics.h"
-#include "db/db_iter.h"
+#include "table/iterator_wrapper.h"
+#include "table/merger.h"
 #include "util/string_util.h"
+#include "util/sync_point.h"
 #include "util/testharness.h"
 #include "utilities/merge_operators.h"
 
@@ -34,24 +38,40 @@ class TestIterator : public Iterator {
         iter_(0),
         cmp(comparator) {}
 
-  void AddMerge(std::string argkey, std::string argvalue) {
-    Add(argkey, kTypeMerge, argvalue);
+  void AddPut(std::string argkey, std::string argvalue) {
+    Add(argkey, kTypeValue, argvalue);
   }
 
   void AddDeletion(std::string argkey) {
     Add(argkey, kTypeDeletion, std::string());
   }
 
-  void AddPut(std::string argkey, std::string argvalue) {
-    Add(argkey, kTypeValue, argvalue);
+  void AddSingleDeletion(std::string argkey) {
+    Add(argkey, kTypeSingleDeletion, std::string());
+  }
+
+  void AddMerge(std::string argkey, std::string argvalue) {
+    Add(argkey, kTypeMerge, argvalue);
   }
 
   void Add(std::string argkey, ValueType type, std::string argvalue) {
+    Add(argkey, type, argvalue, sequence_number_++);
+  }
+
+  void Add(std::string argkey, ValueType type, std::string argvalue,
+           size_t seq_num, bool update_iter = false) {
     valid_ = true;
-    ParsedInternalKey internal_key(argkey, sequence_number_++, type);
+    ParsedInternalKey internal_key(argkey, seq_num, type);
     data_.push_back(
         std::pair<std::string, std::string>(std::string(), argvalue));
     AppendInternalKey(&data_.back().first, internal_key);
+    if (update_iter && valid_ && cmp.Compare(data_.back().first, key()) < 0) {
+      // insert a key smaller than current key
+      Finish();
+      // data_[iter_] is no longer the element the iterator is positioned on.
+      // Increment iter_ to restore the correct position.
+      iter_++;
+    }
   }
 
   // should be called before operations with iterator
@@ -184,6 +204,272 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) {
     db_iter->Next();
     ASSERT_TRUE(!db_iter->Valid());
   }
+  // Test to check the SeekToLast() with iterate_upper_bound not set
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("b", "val_b");
+    internal_iter->AddPut("b", "val_b");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter,
+        10, options.max_sequential_skip_in_iterations));
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+  }
+
+  // Test to check the SeekToLast() with iterate_upper_bound set
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("b", "val_b");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->AddPut("d", "val_d");
+    internal_iter->AddPut("e", "val_e");
+    internal_iter->AddPut("f", "val_f");
+    internal_iter->Finish();
+
+    Slice prefix("d");
+
+    ReadOptions ro;
+    ro.iterate_upper_bound = &prefix;
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter,
+        10, options.max_sequential_skip_in_iterations, ro.iterate_upper_bound));
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+
+    db_iter->Next();
+    ASSERT_TRUE(!db_iter->Valid());
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+  }
+  // Test to check SeekToLast() with iterate_upper_bound set to a key that
+  // has not been Put yet
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("b", "val_b");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->AddPut("d", "val_d");
+    internal_iter->Finish();
+
+    Slice prefix("z");
+
+    ReadOptions ro;
+    ro.iterate_upper_bound = &prefix;
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter,
+        10, options.max_sequential_skip_in_iterations, ro.iterate_upper_bound));
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "d");
+
+    db_iter->Next();
+    ASSERT_TRUE(!db_iter->Valid());
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "d");
+
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "c");
+  }
+  // Test to check the SeekToLast() with iterate_upper_bound set to the
+  // first key
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("b", "val_b");
+    internal_iter->AddPut("b", "val_b");
+    internal_iter->Finish();
+
+    Slice prefix("a");
+
+    ReadOptions ro;
+    ro.iterate_upper_bound = &prefix;
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter,
+        10, options.max_sequential_skip_in_iterations, ro.iterate_upper_bound));
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+  // Test case to check SeekToLast with iterate_upper_bound set
+  // (same key put many times - SeekToLast should start with the
+  // maximum sequence id of the upper bound)
+
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("b", "val_b");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->Finish();
+
+    Slice prefix("c");
+
+    ReadOptions ro;
+    ro.iterate_upper_bound = &prefix;
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter,
+        7, options.max_sequential_skip_in_iterations, ro.iterate_upper_bound));
+
+    SetPerfLevel(kEnableCount);
+    ASSERT_TRUE(GetPerfLevel() == kEnableCount);
+
+    perf_context.Reset();
+    db_iter->SeekToLast();
+
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(static_cast<int>(perf_context.internal_key_skipped_count), 1);
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+
+    SetPerfLevel(kDisable);
+  }
+  // Test to check the SeekToLast() with the iterate_upper_bound set
+  // (Checking the value of the key which has sequence ids greater than
+  // and less than the iterator's sequence id)
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+
+    internal_iter->AddPut("a", "val_a1");
+    internal_iter->AddPut("a", "val_a2");
+    internal_iter->AddPut("b", "val_b1");
+    internal_iter->AddPut("c", "val_c1");
+    internal_iter->AddPut("c", "val_c2");
+    internal_iter->AddPut("c", "val_c3");
+    internal_iter->AddPut("b", "val_b2");
+    internal_iter->AddPut("d", "val_d1");
+    internal_iter->Finish();
+
+    Slice prefix("c");
+
+    ReadOptions ro;
+    ro.iterate_upper_bound = &prefix;
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter,
+        4, options.max_sequential_skip_in_iterations, ro.iterate_upper_bound));
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(), "val_b1");
+  }
+
+  // Test to check the SeekToLast() with the iterate_upper_bound set to the
+  // key that is deleted
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddDeletion("a");
+    internal_iter->AddPut("b", "val_b");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->Finish();
+
+    Slice prefix("a");
+
+    ReadOptions ro;
+    ro.iterate_upper_bound = &prefix;
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter,
+        10, options.max_sequential_skip_in_iterations, ro.iterate_upper_bound));
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(!db_iter->Valid());
+  }
+  // Test to check the SeekToLast() with the iterate_upper_bound set
+  // (Deletion cases)
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("b", "val_b");
+    internal_iter->AddDeletion("b");
+    internal_iter->AddPut("c", "val_c");
+    internal_iter->Finish();
+
+    Slice prefix("c");
+
+    ReadOptions ro;
+    ro.iterate_upper_bound = &prefix;
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter,
+        10, options.max_sequential_skip_in_iterations, ro.iterate_upper_bound));
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+
+    db_iter->Next();
+    ASSERT_TRUE(!db_iter->Valid());
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+  }
+  // Test to check the SeekToLast() with iterate_upper_bound set
+  // (Deletion cases - a lot of internal keys after the upper_bound
+  // are deleted)
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddPut("a", "val_a");
+    internal_iter->AddPut("b", "val_b");
+    internal_iter->AddDeletion("c");
+    internal_iter->AddDeletion("d");
+    internal_iter->AddDeletion("e");
+    internal_iter->AddDeletion("f");
+    internal_iter->AddDeletion("g");
+    internal_iter->AddDeletion("h");
+    internal_iter->Finish();
+
+    Slice prefix("c");
+
+    ReadOptions ro;
+    ro.iterate_upper_bound = &prefix;
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter,
+        7, options.max_sequential_skip_in_iterations, ro.iterate_upper_bound));
+
+    SetPerfLevel(kEnableCount);
+    ASSERT_TRUE(GetPerfLevel() == kEnableCount);
+
+    perf_context.Reset();
+    db_iter->SeekToLast();
+
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(static_cast<int>(perf_context.internal_delete_skipped_count), 0);
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+
+    SetPerfLevel(kDisable);
+  }
 
   {
     TestIterator* internal_iter = new TestIterator(BytewiseComparator());
@@ -609,6 +895,8 @@ TEST_F(DBIteratorTest, DBIterator1) {
   db_iter->Next();
   ASSERT_TRUE(db_iter->Valid());
   ASSERT_EQ(db_iter->key().ToString(), "b");
+  db_iter->Next();
+  ASSERT_FALSE(db_iter->Valid());
 }
 
 TEST_F(DBIteratorTest, DBIterator2) {
@@ -1377,6 +1665,7 @@ TEST_F(DBIteratorTest, DBIterator7) {
     ASSERT_TRUE(!db_iter->Valid());
   }
 }
+
 TEST_F(DBIteratorTest, DBIterator8) {
   Options options;
   options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
@@ -1401,6 +1690,611 @@ TEST_F(DBIteratorTest, DBIterator8) {
   ASSERT_EQ(db_iter->value().ToString(), "0");
 }
 
+// TODO(3.13): fix the issue of Seek() then Prev() which might not necessarily
+//             return the biggest element smaller than the seek key.
+TEST_F(DBIteratorTest, DBIterator9) {
+  Options options;
+  options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+  {
+    TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+    internal_iter->AddMerge("a", "merge_1");
+    internal_iter->AddMerge("a", "merge_2");
+    internal_iter->AddMerge("b", "merge_3");
+    internal_iter->AddMerge("b", "merge_4");
+    internal_iter->AddMerge("d", "merge_5");
+    internal_iter->AddMerge("d", "merge_6");
+    internal_iter->Finish();
+
+    std::unique_ptr<Iterator> db_iter(NewDBIterator(
+        env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter,
+        10, options.max_sequential_skip_in_iterations));
+
+    db_iter->SeekToLast();
+    ASSERT_TRUE(db_iter->Valid());
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_3,merge_4");
+    db_iter->Next();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "d");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_5,merge_6");
+
+    db_iter->Seek("b");
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_3,merge_4");
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "a");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_1,merge_2");
+
+    db_iter->Seek("c");
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "d");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_5,merge_6");
+    db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+    ASSERT_EQ(db_iter->key().ToString(), "b");
+    ASSERT_EQ(db_iter->value().ToString(), "merge_3,merge_4");
+  }
+}
+
+// TODO(3.13): fix the issue of Seek() then Prev() which might not necessarily
+//             return the biggest element smaller than the seek key.
+TEST_F(DBIteratorTest, DBIterator10) {
+  Options options;
+
+  TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+  internal_iter->AddPut("a", "1");
+  internal_iter->AddPut("b", "2");
+  internal_iter->AddPut("c", "3");
+  internal_iter->AddPut("d", "4");
+  internal_iter->Finish();
+
+  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter,
+      10, options.max_sequential_skip_in_iterations));
+
+  db_iter->Seek("c");
+  ASSERT_TRUE(db_iter->Valid());
+  db_iter->Prev();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "b");
+  ASSERT_EQ(db_iter->value().ToString(), "2");
+
+  db_iter->Next();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "c");
+  ASSERT_EQ(db_iter->value().ToString(), "3");
+}
+
+TEST_F(DBIteratorTest, SeekToLastOccurrenceSeq0) {
+  Options options;
+  options.merge_operator = nullptr;
+
+  TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+  internal_iter->AddPut("a", "1");
+  internal_iter->AddPut("b", "2");
+  internal_iter->Finish();
+
+  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter,
+      10, 0 /* force seek */));
+  db_iter->SeekToFirst();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "a");
+  ASSERT_EQ(db_iter->value().ToString(), "1");
+  db_iter->Next();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "b");
+  ASSERT_EQ(db_iter->value().ToString(), "2");
+  db_iter->Next();
+  ASSERT_FALSE(db_iter->Valid());
+}
+
+TEST_F(DBIteratorTest, DBIterator11) {
+  Options options;
+  options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+
+  TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+  internal_iter->AddPut("a", "0");
+  internal_iter->AddPut("b", "0");
+  internal_iter->AddSingleDeletion("b");
+  internal_iter->AddMerge("a", "1");
+  internal_iter->AddMerge("b", "2");
+  internal_iter->Finish();
+
+  std::unique_ptr<Iterator> db_iter(NewDBIterator(
+      env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, 1,
+      options.max_sequential_skip_in_iterations));
+  db_iter->SeekToFirst();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "a");
+  ASSERT_EQ(db_iter->value().ToString(), "0");
+  db_iter->Next();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "b");
+  db_iter->Next();
+  ASSERT_FALSE(db_iter->Valid());
+}
+
+TEST_F(DBIteratorTest, DBIterator12) {
+  Options options;
+  options.merge_operator = nullptr;
+
+  TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+  internal_iter->AddPut("a", "1");
+  internal_iter->AddPut("b", "2");
+  internal_iter->AddPut("c", "3");
+  internal_iter->AddSingleDeletion("b");
+  internal_iter->Finish();
+
+  std::unique_ptr<Iterator> db_iter(
+      NewDBIterator(env_, ImmutableCFOptions(options), BytewiseComparator(),
+                    internal_iter, 10, 0));
+  db_iter->SeekToLast();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "c");
+  ASSERT_EQ(db_iter->value().ToString(), "3");
+  db_iter->Prev();
+  ASSERT_TRUE(db_iter->Valid());
+  ASSERT_EQ(db_iter->key().ToString(), "a");
+  ASSERT_EQ(db_iter->value().ToString(), "1");
+  db_iter->Prev();
+  ASSERT_FALSE(db_iter->Valid());
+}
+
+class DBIterWithMergeIterTest : public testing::Test {
+ public:
+  DBIterWithMergeIterTest()
+      : env_(Env::Default()), icomp_(BytewiseComparator()) {
+    options_.merge_operator = nullptr;
+
+    internal_iter1_ = new TestIterator(BytewiseComparator());
+    internal_iter1_->Add("a", kTypeValue, "1", 3u);
+    internal_iter1_->Add("f", kTypeValue, "2", 5u);
+    internal_iter1_->Add("g", kTypeValue, "3", 7u);
+    internal_iter1_->Finish();
+
+    internal_iter2_ = new TestIterator(BytewiseComparator());
+    internal_iter2_->Add("a", kTypeValue, "4", 6u);
+    internal_iter2_->Add("b", kTypeValue, "5", 1u);
+    internal_iter2_->Add("c", kTypeValue, "6", 2u);
+    internal_iter2_->Add("d", kTypeValue, "7", 3u);
+    internal_iter2_->Finish();
+
+    std::vector<Iterator*> child_iters;
+    child_iters.push_back(internal_iter1_);
+    child_iters.push_back(internal_iter2_);
+    InternalKeyComparator icomp(BytewiseComparator());
+    Iterator* merge_iter = NewMergingIterator(&icomp_, &child_iters[0], 2u);
+
+    db_iter_.reset(NewDBIterator(env_, ImmutableCFOptions(options_),
+                                 BytewiseComparator(), merge_iter,
+                                 8 /* read data earlier than seqId 8 */,
+                                 3 /* max iterators before reseek */));
+  }
+
+  Env* env_;
+  Options options_;
+  TestIterator* internal_iter1_;
+  TestIterator* internal_iter2_;
+  InternalKeyComparator icomp_;
+  Iterator* merge_iter_;
+  std::unique_ptr<Iterator> db_iter_;
+};
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIterator1) {
+  db_iter_->SeekToFirst();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "a");
+  ASSERT_EQ(db_iter_->value().ToString(), "4");
+  db_iter_->Next();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "b");
+  ASSERT_EQ(db_iter_->value().ToString(), "5");
+  db_iter_->Next();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "c");
+  ASSERT_EQ(db_iter_->value().ToString(), "6");
+  db_iter_->Next();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "d");
+  ASSERT_EQ(db_iter_->value().ToString(), "7");
+  db_iter_->Next();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "f");
+  ASSERT_EQ(db_iter_->value().ToString(), "2");
+  db_iter_->Next();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "g");
+  ASSERT_EQ(db_iter_->value().ToString(), "3");
+  db_iter_->Next();
+  ASSERT_FALSE(db_iter_->Valid());
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIterator2) {
+  // Test Prev() when one child iterator is at its end.
+  db_iter_->Seek("g");
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "g");
+  ASSERT_EQ(db_iter_->value().ToString(), "3");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "f");
+  ASSERT_EQ(db_iter_->value().ToString(), "2");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "d");
+  ASSERT_EQ(db_iter_->value().ToString(), "7");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "c");
+  ASSERT_EQ(db_iter_->value().ToString(), "6");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "b");
+  ASSERT_EQ(db_iter_->value().ToString(), "5");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "a");
+  ASSERT_EQ(db_iter_->value().ToString(), "4");
+}
+
+#if !(defined NDEBUG) || !defined(OS_WIN)
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace1) {
+  // Test Prev() when one child iterator is at its end but more rows
+  // are added.
+  db_iter_->Seek("f");
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "f");
+  ASSERT_EQ(db_iter_->value().ToString(), "2");
+
+  // The test callback inserts a key at the end of the mem table after
+  // MergeIterator::Prev() realized the mem table iterator is at its end
+  // and before a SeekToLast() is called.
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "MergeIterator::Prev:BeforeSeekToLast",
+      [&](void* arg) { internal_iter2_->Add("z", kTypeValue, "7", 12u); });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "d");
+  ASSERT_EQ(db_iter_->value().ToString(), "7");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "c");
+  ASSERT_EQ(db_iter_->value().ToString(), "6");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "b");
+  ASSERT_EQ(db_iter_->value().ToString(), "5");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "a");
+  ASSERT_EQ(db_iter_->value().ToString(), "4");
+
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace2) {
+  // Test Prev() when one child iterator is at its end but more rows
+  // are added.
+  db_iter_->Seek("f");
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "f");
+  ASSERT_EQ(db_iter_->value().ToString(), "2");
+
+  // The test callback inserts entries to update a key at the end of the
+  // mem table after MergeIterator::Prev() has realized the mem table
+  // iterator is at its end and before SeekToLast() is called.
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "MergeIterator::Prev:BeforeSeekToLast", [&](void* arg) {
+        internal_iter2_->Add("z", kTypeValue, "7", 12u);
+        internal_iter2_->Add("z", kTypeValue, "7", 11u);
+      });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "d");
+  ASSERT_EQ(db_iter_->value().ToString(), "7");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "c");
+  ASSERT_EQ(db_iter_->value().ToString(), "6");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "b");
+  ASSERT_EQ(db_iter_->value().ToString(), "5");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "a");
+  ASSERT_EQ(db_iter_->value().ToString(), "4");
+
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace3) {
+  // Test Prev() when one child iterator is at its end but more rows
+  // are added and max_skipped is triggered.
+  db_iter_->Seek("f");
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "f");
+  ASSERT_EQ(db_iter_->value().ToString(), "2");
+
+  // The test callback inserts entries to update a key at the end of the
+  // mem table after MergeIterator::Prev() has realized the mem table
+  // iterator is at its end and before SeekToLast() is called.
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "MergeIterator::Prev:BeforeSeekToLast", [&](void* arg) {
+        internal_iter2_->Add("z", kTypeValue, "7", 16u, true);
+        internal_iter2_->Add("z", kTypeValue, "7", 15u, true);
+        internal_iter2_->Add("z", kTypeValue, "7", 14u, true);
+        internal_iter2_->Add("z", kTypeValue, "7", 13u, true);
+        internal_iter2_->Add("z", kTypeValue, "7", 12u, true);
+        internal_iter2_->Add("z", kTypeValue, "7", 11u, true);
+      });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "d");
+  ASSERT_EQ(db_iter_->value().ToString(), "7");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "c");
+  ASSERT_EQ(db_iter_->value().ToString(), "6");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "b");
+  ASSERT_EQ(db_iter_->value().ToString(), "5");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "a");
+  ASSERT_EQ(db_iter_->value().ToString(), "4");
+
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace4) {
+  // Test Prev() when one child iterator has more rows inserted
+  // between Seek() and Prev() when changing directions.
+  internal_iter2_->Add("z", kTypeValue, "9", 4u);
+
+  db_iter_->Seek("g");
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "g");
+  ASSERT_EQ(db_iter_->value().ToString(), "3");
+
+  // The test callback inserts entries to update a key before "z" in the
+  // mem table after MergeIterator::Prev() calls the mem table iterator's
+  // Seek() and before it calls Prev().
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "MergeIterator::Prev:BeforePrev", [&](void* arg) {
+        IteratorWrapper* it = reinterpret_cast<IteratorWrapper*>(arg);
+        if (it->key().starts_with("z")) {
+          internal_iter2_->Add("x", kTypeValue, "7", 16u, true);
+          internal_iter2_->Add("x", kTypeValue, "7", 15u, true);
+          internal_iter2_->Add("x", kTypeValue, "7", 14u, true);
+          internal_iter2_->Add("x", kTypeValue, "7", 13u, true);
+          internal_iter2_->Add("x", kTypeValue, "7", 12u, true);
+          internal_iter2_->Add("x", kTypeValue, "7", 11u, true);
+        }
+      });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "f");
+  ASSERT_EQ(db_iter_->value().ToString(), "2");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "d");
+  ASSERT_EQ(db_iter_->value().ToString(), "7");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "c");
+  ASSERT_EQ(db_iter_->value().ToString(), "6");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "b");
+  ASSERT_EQ(db_iter_->value().ToString(), "5");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "a");
+  ASSERT_EQ(db_iter_->value().ToString(), "4");
+
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace5) {
+  internal_iter2_->Add("z", kTypeValue, "9", 4u);
+
+  // Test Prev() when one child iterator has more rows inserted
+  // between Seek() and Prev() when changing directions.
+  db_iter_->Seek("g");
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "g");
+  ASSERT_EQ(db_iter_->value().ToString(), "3");
+
+  // The test callback inserts entries to update a key before "z" in the
+  // mem table after MergeIterator::Prev() calls the mem table iterator's
+  // Seek() and before it calls Prev().
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "MergeIterator::Prev:BeforePrev", [&](void* arg) {
+        IteratorWrapper* it = reinterpret_cast<IteratorWrapper*>(arg);
+        if (it->key().starts_with("z")) {
+          internal_iter2_->Add("x", kTypeValue, "7", 16u, true);
+          internal_iter2_->Add("x", kTypeValue, "7", 15u, true);
+        }
+      });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "f");
+  ASSERT_EQ(db_iter_->value().ToString(), "2");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "d");
+  ASSERT_EQ(db_iter_->value().ToString(), "7");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "c");
+  ASSERT_EQ(db_iter_->value().ToString(), "6");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "b");
+  ASSERT_EQ(db_iter_->value().ToString(), "5");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "a");
+  ASSERT_EQ(db_iter_->value().ToString(), "4");
+
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace6) {
+  internal_iter2_->Add("z", kTypeValue, "9", 4u);
+
+  // Test Prev() when one child iterator has more rows inserted
+  // between Seek() and Prev() when changing directions.
+  db_iter_->Seek("g");
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "g");
+  ASSERT_EQ(db_iter_->value().ToString(), "3");
+
+  // The test callback inserts an entry to update a key before "z" in the
+  // mem table after MergeIterator::Prev() calls the mem table iterator's
+  // Seek() and before it calls Prev().
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "MergeIterator::Prev:BeforePrev", [&](void* arg) {
+        IteratorWrapper* it = reinterpret_cast<IteratorWrapper*>(arg);
+        if (it->key().starts_with("z")) {
+          internal_iter2_->Add("x", kTypeValue, "7", 16u, true);
+        }
+      });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "f");
+  ASSERT_EQ(db_iter_->value().ToString(), "2");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "d");
+  ASSERT_EQ(db_iter_->value().ToString(), "7");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "c");
+  ASSERT_EQ(db_iter_->value().ToString(), "6");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "b");
+  ASSERT_EQ(db_iter_->value().ToString(), "5");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "a");
+  ASSERT_EQ(db_iter_->value().ToString(), "4");
+
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace7) {
+  internal_iter1_->Add("u", kTypeValue, "10", 4u);
+  internal_iter1_->Add("v", kTypeValue, "11", 4u);
+  internal_iter1_->Add("w", kTypeValue, "12", 4u);
+  internal_iter2_->Add("z", kTypeValue, "9", 4u);
+
+  // Test Prev() when one child iterator has more rows inserted
+  // between Seek() and Prev() when changing directions.
+  db_iter_->Seek("g");
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "g");
+  ASSERT_EQ(db_iter_->value().ToString(), "3");
+
+  // The test callback inserts entries to update a key before "z" in the
+  // mem table after MergeIterator::Prev() calls the mem table iterator's
+  // Seek() and before it calls Prev().
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "MergeIterator::Prev:BeforePrev", [&](void* arg) {
+        IteratorWrapper* it = reinterpret_cast<IteratorWrapper*>(arg);
+        if (it->key().starts_with("z")) {
+          internal_iter2_->Add("x", kTypeValue, "7", 16u, true);
+          internal_iter2_->Add("x", kTypeValue, "7", 15u, true);
+          internal_iter2_->Add("x", kTypeValue, "7", 14u, true);
+          internal_iter2_->Add("x", kTypeValue, "7", 13u, true);
+          internal_iter2_->Add("x", kTypeValue, "7", 12u, true);
+          internal_iter2_->Add("x", kTypeValue, "7", 11u, true);
+        }
+      });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "f");
+  ASSERT_EQ(db_iter_->value().ToString(), "2");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "d");
+  ASSERT_EQ(db_iter_->value().ToString(), "7");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "c");
+  ASSERT_EQ(db_iter_->value().ToString(), "6");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "b");
+  ASSERT_EQ(db_iter_->value().ToString(), "5");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "a");
+  ASSERT_EQ(db_iter_->value().ToString(), "4");
+
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace8) {
+  // internal_iter1_: a, f, g
+  // internal_iter2_: a, b, c, d, adding (z)
+  internal_iter2_->Add("z", kTypeValue, "9", 4u);
+
+  // Test Prev() when one child iterator has more rows inserted
+  // between Seek() and Prev() when changing directions.
+  db_iter_->Seek("g");
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "g");
+  ASSERT_EQ(db_iter_->value().ToString(), "3");
+
+  // The test callback inserts two keys before "z" in the mem table after
+  // MergeIterator::Prev() calls the mem table iterator's Seek() and before
+  // it calls Prev().
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "MergeIterator::Prev:BeforePrev", [&](void* arg) {
+        IteratorWrapper* it = reinterpret_cast<IteratorWrapper*>(arg);
+        if (it->key().starts_with("z")) {
+          internal_iter2_->Add("x", kTypeValue, "7", 16u, true);
+          internal_iter2_->Add("y", kTypeValue, "7", 17u, true);
+        }
+      });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "f");
+  ASSERT_EQ(db_iter_->value().ToString(), "2");
+  db_iter_->Prev();
+  ASSERT_TRUE(db_iter_->Valid());
+  ASSERT_EQ(db_iter_->key().ToString(), "d");
+  ASSERT_EQ(db_iter_->value().ToString(), "7");
+
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
+#endif  // !(defined NDEBUG) || !defined(OS_WIN)
 }  // namespace rocksdb
 
 int main(int argc, char** argv) {
diff --git a/src/rocksdb/db/db_log_iter_test.cc b/src/rocksdb/db/db_log_iter_test.cc
new file mode 100644
index 0000000..a1e8d20
--- /dev/null
+++ b/src/rocksdb/db/db_log_iter_test.cc
@@ -0,0 +1,290 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// The introduction of SyncPoint effectively disabled building and running
+// this test in Release builds, which is a pity because it is a good test.
+#if !(defined NDEBUG) || !defined(OS_WIN)
+
+#include "port/stack_trace.h"
+#include "util/db_test_util.h"
+
+namespace rocksdb {
+
+class DBTestXactLogIterator : public DBTestBase {
+ public:
+  DBTestXactLogIterator() : DBTestBase("/db_log_iter_test") {}
+
+  std::unique_ptr<TransactionLogIterator> OpenTransactionLogIter(
+      const SequenceNumber seq) {
+    unique_ptr<TransactionLogIterator> iter;
+    Status status = dbfull()->GetUpdatesSince(seq, &iter);
+    EXPECT_OK(status);
+    EXPECT_TRUE(iter->Valid());
+    return std::move(iter);
+  }
+};
+
+namespace {
+SequenceNumber ReadRecords(
+    std::unique_ptr<TransactionLogIterator>& iter,
+    int& count) {
+  count = 0;
+  SequenceNumber lastSequence = 0;
+  BatchResult res;
+  while (iter->Valid()) {
+    res = iter->GetBatch();
+    EXPECT_TRUE(res.sequence > lastSequence);
+    ++count;
+    lastSequence = res.sequence;
+    EXPECT_OK(iter->status());
+    iter->Next();
+  }
+  return res.sequence;
+}
+
+void ExpectRecords(
+    const int expected_no_records,
+    std::unique_ptr<TransactionLogIterator>& iter) {
+  int num_records;
+  ReadRecords(iter, num_records);
+  ASSERT_EQ(num_records, expected_no_records);
+}
+}  // namespace
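+
+// Editor's note: a minimal sketch, mirroring ReadRecords() above, of how a
+// client consumes GetUpdatesSince() (`db` and `seq` are hypothetical):
+//
+//   std::unique_ptr<TransactionLogIterator> it;
+//   Status s = db->GetUpdatesSince(seq, &it);
+//   while (s.ok() && it->Valid()) {
+//     BatchResult res = it->GetBatch();  // res.sequence, res.writeBatchPtr
+//     it->Next();
+//   }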
+
+TEST_F(DBTestXactLogIterator, TransactionLogIterator) {
+  do {
+    Options options = OptionsForLogIterTest();
+    DestroyAndReopen(options);
+    CreateAndReopenWithCF({"pikachu"}, options);
+    Put(0, "key1", DummyString(1024));
+    Put(1, "key2", DummyString(1024));
+    Put(1, "key2", DummyString(1024));
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 3U);
+    {
+      auto iter = OpenTransactionLogIter(0);
+      ExpectRecords(3, iter);
+    }
+    ReopenWithColumnFamilies({"default", "pikachu"}, options);
+    env_->SleepForMicroseconds(2 * 1000 * 1000);
+    {
+      Put(0, "key4", DummyString(1024));
+      Put(1, "key5", DummyString(1024));
+      Put(0, "key6", DummyString(1024));
+    }
+    {
+      auto iter = OpenTransactionLogIter(0);
+      ExpectRecords(6, iter);
+    }
+  } while (ChangeCompactOptions());
+}
+
+#ifndef NDEBUG  // SyncPoint is not included in -DNDEBUG builds
+TEST_F(DBTestXactLogIterator, TransactionLogIteratorRace) {
+  static const int LOG_ITERATOR_RACE_TEST_COUNT = 2;
+  static const char* sync_points[LOG_ITERATOR_RACE_TEST_COUNT][4] = {
+      {"WalManager::GetSortedWalFiles:1",  "WalManager::PurgeObsoleteFiles:1",
+       "WalManager::PurgeObsoleteFiles:2", "WalManager::GetSortedWalFiles:2"},
+      {"WalManager::GetSortedWalsOfType:1",
+       "WalManager::PurgeObsoleteFiles:1",
+       "WalManager::PurgeObsoleteFiles:2",
+       "WalManager::GetSortedWalsOfType:2"}};
+  for (int test = 0; test < LOG_ITERATOR_RACE_TEST_COUNT; ++test) {
+    // Set up a sync point dependency to reproduce the race condition of a
+    // log file being moved to the archive dir in the middle of
+    // GetSortedWalFiles.
+    rocksdb::SyncPoint::GetInstance()->LoadDependency(
+      { { sync_points[test][0], sync_points[test][1] },
+        { sync_points[test][2], sync_points[test][3] },
+      });
+
+    do {
+      rocksdb::SyncPoint::GetInstance()->ClearTrace();
+      rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+      Options options = OptionsForLogIterTest();
+      DestroyAndReopen(options);
+      Put("key1", DummyString(1024));
+      dbfull()->Flush(FlushOptions());
+      Put("key2", DummyString(1024));
+      dbfull()->Flush(FlushOptions());
+      Put("key3", DummyString(1024));
+      dbfull()->Flush(FlushOptions());
+      Put("key4", DummyString(1024));
+      ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 4U);
+
+      {
+        auto iter = OpenTransactionLogIter(0);
+        ExpectRecords(4, iter);
+      }
+
+      rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+      // Trigger an async flush and a log move. The log move will wait for
+      // GetSortedWalFiles:1 to be reached, reproducing the race condition.
+      FlushOptions flush_options;
+      flush_options.wait = false;
+      dbfull()->Flush(flush_options);
+
+      // "key5" would be written in a new memtable and log
+      Put("key5", DummyString(1024));
+      {
+        // this iterator would miss "key4" without the fix
+        auto iter = OpenTransactionLogIter(0);
+        ExpectRecords(5, iter);
+      }
+    } while (ChangeCompactOptions());
+  }
+}
+#endif
+
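+// Editor's note: LoadDependency() takes {predecessor, successor} marker
+// pairs; a thread reaching the successor marker blocks until the
+// predecessor has executed, which is how the race test above parks the log
+// move inside GetSortedWalFiles(). A minimal sketch with hypothetical
+// markers:
+//
+//   rocksdb::SyncPoint::GetInstance()->LoadDependency(
+//       {{"ThreadA::Step", "ThreadB::Step"}});  // ThreadB::Step waits
+//   rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+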
+TEST_F(DBTestXactLogIterator, TransactionLogIteratorStallAtLastRecord) {
+  do {
+    Options options = OptionsForLogIterTest();
+    DestroyAndReopen(options);
+    Put("key1", DummyString(1024));
+    auto iter = OpenTransactionLogIter(0);
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    iter->Next();
+    ASSERT_TRUE(!iter->Valid());
+    ASSERT_OK(iter->status());
+    Put("key2", DummyString(1024));
+    iter->Next();
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+  } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestXactLogIterator, TransactionLogIteratorCheckAfterRestart) {
+  do {
+    Options options = OptionsForLogIterTest();
+    DestroyAndReopen(options);
+    Put("key1", DummyString(1024));
+    Put("key2", DummyString(1023));
+    dbfull()->Flush(FlushOptions());
+    Reopen(options);
+    auto iter = OpenTransactionLogIter(0);
+    ExpectRecords(2, iter);
+  } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestXactLogIterator, TransactionLogIteratorCorruptedLog) {
+  do {
+    Options options = OptionsForLogIterTest();
+    DestroyAndReopen(options);
+    for (int i = 0; i < 1024; i++) {
+      Put("key"+ToString(i), DummyString(10));
+    }
+    dbfull()->Flush(FlushOptions());
+    // Corrupt this log to create a gap
+    rocksdb::VectorLogPtr wal_files;
+    ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files));
+    const auto logfile_path = dbname_ + "/" + wal_files.front()->PathName();
+    if (mem_env_) {
+      mem_env_->Truncate(logfile_path, wal_files.front()->SizeFileBytes() / 2);
+    } else {
+      ASSERT_EQ(0, truncate(logfile_path.c_str(),
+                   wal_files.front()->SizeFileBytes() / 2));
+    }
+
+    // Insert a new entry to a new log file
+    Put("key1025", DummyString(10));
+    // Try to read from the beginning. Should stop before the gap and read less
+    // than 1025 entries
+    auto iter = OpenTransactionLogIter(0);
+    int count;
+    SequenceNumber last_sequence_read = ReadRecords(iter, count);
+    ASSERT_LT(last_sequence_read, 1025U);
+    // Try to read past the gap, should be able to seek to key1025
+    auto iter2 = OpenTransactionLogIter(last_sequence_read + 1);
+    ExpectRecords(1, iter2);
+  } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestXactLogIterator, TransactionLogIteratorBatchOperations) {
+  do {
+    Options options = OptionsForLogIterTest();
+    DestroyAndReopen(options);
+    CreateAndReopenWithCF({"pikachu"}, options);
+    WriteBatch batch;
+    batch.Put(handles_[1], "key1", DummyString(1024));
+    batch.Put(handles_[0], "key2", DummyString(1024));
+    batch.Put(handles_[1], "key3", DummyString(1024));
+    batch.Delete(handles_[0], "key2");
+    dbfull()->Write(WriteOptions(), &batch);
+    Flush(1);
+    Flush(0);
+    ReopenWithColumnFamilies({"default", "pikachu"}, options);
+    Put(1, "key4", DummyString(1024));
+    auto iter = OpenTransactionLogIter(3);
+    ExpectRecords(2, iter);
+  } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestXactLogIterator, TransactionLogIteratorBlobs) {
+  Options options = OptionsForLogIterTest();
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+  {
+    WriteBatch batch;
+    batch.Put(handles_[1], "key1", DummyString(1024));
+    batch.Put(handles_[0], "key2", DummyString(1024));
+    batch.PutLogData(Slice("blob1"));
+    batch.Put(handles_[1], "key3", DummyString(1024));
+    batch.PutLogData(Slice("blob2"));
+    batch.Delete(handles_[0], "key2");
+    dbfull()->Write(WriteOptions(), &batch);
+    ReopenWithColumnFamilies({"default", "pikachu"}, options);
+  }
+
+  auto res = OpenTransactionLogIter(0)->GetBatch();
+  struct Handler : public WriteBatch::Handler {
+    std::string seen;
+    virtual Status PutCF(uint32_t cf, const Slice& key,
+                         const Slice& value) override {
+      seen += "Put(" + ToString(cf) + ", " + key.ToString() + ", " +
+              ToString(value.size()) + ")";
+      return Status::OK();
+    }
+    virtual Status MergeCF(uint32_t cf, const Slice& key,
+                           const Slice& value) override {
+      seen += "Merge(" + ToString(cf) + ", " + key.ToString() + ", " +
+              ToString(value.size()) + ")";
+      return Status::OK();
+    }
+    virtual void LogData(const Slice& blob) override {
+      seen += "LogData(" + blob.ToString() + ")";
+    }
+    virtual Status DeleteCF(uint32_t cf, const Slice& key) override {
+      seen += "Delete(" + ToString(cf) + ", " + key.ToString() + ")";
+      return Status::OK();
+    }
+  } handler;
+  res.writeBatchPtr->Iterate(&handler);
+  ASSERT_EQ(
+      "Put(1, key1, 1024)"
+      "Put(0, key2, 1024)"
+      "LogData(blob1)"
+      "Put(1, key3, 1024)"
+      "LogData(blob2)"
+      "Delete(0, key2)",
+      handler.seen);
+}
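+
+// Editor's note: PutLogData() entries live only in the WAL; they are never
+// applied to the database and surface solely through the handler's
+// LogData() hook, as the test above verifies. A handler that only wants
+// blobs could, as a sketch, override just that hook:
+//
+//   struct BlobOnlyHandler : public WriteBatch::Handler {
+//     void LogData(const Slice& blob) override { /* consume blob */ }
+//   };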
+}  // namespace rocksdb
+
+#endif  // !(defined NDEBUG) || !defined(OS_WIN)
+
+int main(int argc, char** argv) {
+#if !(defined NDEBUG) || !defined(OS_WIN)
+  rocksdb::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+#else
+  return 0;
+#endif
+}
diff --git a/src/rocksdb/db/db_tailing_iter_test.cc b/src/rocksdb/db/db_tailing_iter_test.cc
new file mode 100644
index 0000000..4ca5e90
--- /dev/null
+++ b/src/rocksdb/db/db_tailing_iter_test.cc
@@ -0,0 +1,659 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// The introduction of SyncPoint effectively disabled building and running
+// this test in Release builds, which is a pity because it is a good test.
+#if !(defined NDEBUG) || !defined(OS_WIN)
+
+#include "db/forward_iterator.h"
+#include "port/stack_trace.h"
+#include "util/db_test_util.h"
+
+namespace rocksdb {
+
+class DBTestTailingIterator : public DBTestBase {
+ public:
+  DBTestTailingIterator() : DBTestBase("/db_tailing_iterator_test") {}
+};
+
+TEST_F(DBTestTailingIterator, TailingIteratorSingle) {
+  ReadOptions read_options;
+  read_options.tailing = true;
+
+  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+  iter->SeekToFirst();
+  ASSERT_TRUE(!iter->Valid());
+
+  // add a record and check that iter can see it
+  ASSERT_OK(db_->Put(WriteOptions(), "mirko", "fodor"));
+  iter->SeekToFirst();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().ToString(), "mirko");
+
+  iter->Next();
+  ASSERT_TRUE(!iter->Valid());
+}
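+
+// Editor's note: unlike an ordinary iterator, a tailing iterator is not
+// pinned to a snapshot, so re-seeking the same iterator observes writes
+// made after it was created. That pattern, which the tests in this file
+// exercise, reduces to (`db` hypothetical):
+//
+//   ReadOptions ro;
+//   ro.tailing = true;
+//   std::unique_ptr<Iterator> it(db->NewIterator(ro));
+//   it->SeekToFirst();   // may be !Valid() on an empty DB
+//   // ... writes happen ...
+//   it->SeekToFirst();   // now sees the new records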
+
+TEST_F(DBTestTailingIterator, TailingIteratorKeepAdding) {
+  CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+  ReadOptions read_options;
+  read_options.tailing = true;
+
+  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options, handles_[1]));
+  std::string value(1024, 'a');
+
+  const int num_records = 10000;
+  for (int i = 0; i < num_records; ++i) {
+    char buf[32];
+    snprintf(buf, sizeof(buf), "%016d", i);
+
+    Slice key(buf, 16);
+    ASSERT_OK(Put(1, key, value));
+
+    iter->Seek(key);
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(key), 0);
+  }
+}
+
+TEST_F(DBTestTailingIterator, TailingIteratorSeekToNext) {
+  CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+  ReadOptions read_options;
+  read_options.tailing = true;
+
+  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options, handles_[1]));
+  std::unique_ptr<Iterator> itern(db_->NewIterator(read_options, handles_[1]));
+  std::string value(1024, 'a');
+
+  const int num_records = 1000;
+  for (int i = 1; i < num_records; ++i) {
+    char buf1[32];
+    char buf2[32];
+    snprintf(buf1, sizeof(buf1), "00a0%016d", i * 5);
+
+    Slice key(buf1, 20);
+    ASSERT_OK(Put(1, key, value));
+
+    if (i % 100 == 99) {
+      ASSERT_OK(Flush(1));
+    }
+
+    snprintf(buf2, sizeof(buf2), "00a0%016d", i * 5 - 2);
+    Slice target(buf2, 20);
+    iter->Seek(target);
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(key), 0);
+    if (i == 1) {
+      itern->SeekToFirst();
+    } else {
+      itern->Next();
+    }
+    ASSERT_TRUE(itern->Valid());
+    ASSERT_EQ(itern->key().compare(key), 0);
+  }
+  rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks();
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+  for (int i = 2 * num_records; i > 0; --i) {
+    char buf1[32];
+    char buf2[32];
+    snprintf(buf1, sizeof(buf1), "00a0%016d", i * 5);
+
+    Slice key(buf1, 20);
+    ASSERT_OK(Put(1, key, value));
+
+    if (i % 100 == 99) {
+      ASSERT_OK(Flush(1));
+    }
+
+    snprintf(buf2, sizeof(buf2), "00a0%016d", i * 5 - 2);
+    Slice target(buf2, 20);
+    iter->Seek(target);
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(key), 0);
+  }
+}
+
+TEST_F(DBTestTailingIterator, TailingIteratorTrimSeekToNext) {
+  const uint64_t k150KB = 150 * 1024;
+  Options options;
+  options.write_buffer_size = k150KB;
+  options.max_write_buffer_number = 3;
+  options.min_write_buffer_number_to_merge = 2;
+  CreateAndReopenWithCF({"pikachu"}, options);
+  ReadOptions read_options;
+  read_options.tailing = true;
+  int num_iters, deleted_iters;
+
+  char bufe[32];
+  snprintf(bufe, sizeof(bufe), "00b0%016d", 0);
+  Slice keyu(bufe, 20);
+  read_options.iterate_upper_bound = &keyu;
+  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options, handles_[1]));
+  std::unique_ptr<Iterator> itern(db_->NewIterator(read_options, handles_[1]));
+  std::unique_ptr<Iterator> iterh(db_->NewIterator(read_options, handles_[1]));
+  std::string value(1024, 'a');
+  bool file_iters_deleted = false;
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "ForwardIterator::SeekInternal:Return", [&](void* arg) {
+        ForwardIterator* fiter = reinterpret_cast<ForwardIterator*>(arg);
+        ASSERT_TRUE(!file_iters_deleted ||
+                    fiter->TEST_CheckDeletedIters(&deleted_iters, &num_iters));
+      });
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "ForwardIterator::Next:Return", [&](void* arg) {
+        ForwardIterator* fiter = reinterpret_cast<ForwardIterator*>(arg);
+        ASSERT_TRUE(!file_iters_deleted ||
+                    fiter->TEST_CheckDeletedIters(&deleted_iters, &num_iters));
+      });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+  const int num_records = 1000;
+  for (int i = 1; i < num_records; ++i) {
+    char buf1[32];
+    char buf2[32];
+    char buf3[32];
+    char buf4[32];
+    snprintf(buf1, sizeof(buf1), "00a0%016d", i * 5);
+    snprintf(buf3, sizeof(buf3), "00b0%016d", i * 5);
+
+    Slice key(buf1, 20);
+    ASSERT_OK(Put(1, key, value));
+    Slice keyn(buf3, 20);
+    ASSERT_OK(Put(1, keyn, value));
+
+    if (i % 100 == 99) {
+      ASSERT_OK(Flush(1));
+      dbfull()->TEST_WaitForCompact();
+      if (i == 299) {
+        file_iters_deleted = true;
+      }
+      snprintf(buf4, sizeof(buf4), "00a0%016d", i * 5 / 2);
+      Slice target(buf4, 20);
+      iterh->Seek(target);
+      ASSERT_TRUE(iterh->Valid());
+      for (int j = (i + 1) * 5 / 2; j < i * 5; j += 5) {
+        iterh->Next();
+        ASSERT_TRUE(iterh->Valid());
+      }
+      if (i == 299) {
+        file_iters_deleted = false;
+      }
+    }
+
+    file_iters_deleted = true;
+    snprintf(buf2, sizeof(buf2), "00a0%016d", i * 5 - 2);
+    Slice target(buf2, 20);
+    iter->Seek(target);
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(key), 0);
+    ASSERT_LE(num_iters, 1);
+    if (i == 1) {
+      itern->SeekToFirst();
+    } else {
+      itern->Next();
+    }
+    ASSERT_TRUE(itern->Valid());
+    ASSERT_EQ(itern->key().compare(key), 0);
+    ASSERT_LE(num_iters, 1);
+    file_iters_deleted = false;
+  }
+  iter.reset();
+  itern.reset();
+  iterh.reset();
+  BlockBasedTableOptions table_options;
+  table_options.no_block_cache = true;
+  table_options.block_cache_compressed = nullptr;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+  read_options.read_tier = kBlockCacheTier;
+  std::unique_ptr<Iterator> iteri(db_->NewIterator(read_options, handles_[1]));
+  char buf5[32];
+  snprintf(buf5, sizeof(buf5), "00a0%016d", (num_records / 2) * 5 - 2);
+  Slice target1(buf5, 20);
+  iteri->Seek(target1);
+  ASSERT_TRUE(iteri->status().IsIncomplete());
+  iteri.reset();
+
+  read_options.read_tier = kReadAllTier;
+  options.table_factory.reset(NewBlockBasedTableFactory());
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+  iter.reset(db_->NewIterator(read_options, handles_[1]));
+  for (int i = 2 * num_records; i > 0; --i) {
+    char buf1[32];
+    char buf2[32];
+    snprintf(buf1, sizeof(buf1), "00a0%016d", i * 5);
+
+    Slice key(buf1, 20);
+    ASSERT_OK(Put(1, key, value));
+
+    if (i % 100 == 99) {
+      ASSERT_OK(Flush(1));
+    }
+
+    snprintf(buf2, sizeof(buf2), "00a0%016d", i * 5 - 2);
+    Slice target(buf2, 20);
+    iter->Seek(target);
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(key), 0);
+  }
+}
+
+TEST_F(DBTestTailingIterator, TailingIteratorDeletes) {
+  CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+  ReadOptions read_options;
+  read_options.tailing = true;
+
+  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options, handles_[1]));
+
+  // write a single record, read it using the iterator, then delete it
+  ASSERT_OK(Put(1, "0test", "test"));
+  iter->SeekToFirst();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().ToString(), "0test");
+  ASSERT_OK(Delete(1, "0test"));
+
+  // write many more records
+  const int num_records = 10000;
+  std::string value(1024, 'A');
+
+  for (int i = 0; i < num_records; ++i) {
+    char buf[32];
+    snprintf(buf, sizeof(buf), "1%015d", i);
+
+    Slice key(buf, 16);
+    ASSERT_OK(Put(1, key, value));
+  }
+
+  // force a flush to make sure that no records are read from memtable
+  ASSERT_OK(Flush(1));
+
+  // skip "0test"
+  iter->Next();
+
+  // make sure we can read all new records using the existing iterator
+  int count = 0;
+  for (; iter->Valid(); iter->Next(), ++count) {
+  }
+
+  ASSERT_EQ(count, num_records);
+}
+
+TEST_F(DBTestTailingIterator, TailingIteratorPrefixSeek) {
+  XFUNC_TEST("", "dbtest_prefix", prefix_skip1, XFuncPoint::SetSkip,
+             kSkipNoPrefix);
+  ReadOptions read_options;
+  read_options.tailing = true;
+
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  options.prefix_extractor.reset(NewFixedPrefixTransform(2));
+  options.memtable_factory.reset(NewHashSkipListRepFactory(16));
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options, handles_[1]));
+  ASSERT_OK(Put(1, "0101", "test"));
+
+  ASSERT_OK(Flush(1));
+
+  ASSERT_OK(Put(1, "0202", "test"));
+
+  // Seek(0102) shouldn't find any records since 0202 has a different prefix
+  iter->Seek("0102");
+  ASSERT_TRUE(!iter->Valid());
+
+  iter->Seek("0202");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().ToString(), "0202");
+
+  iter->Next();
+  ASSERT_TRUE(!iter->Valid());
+  XFUNC_TEST("", "dbtest_prefix", prefix_skip1, XFuncPoint::SetSkip, 0);
+}
+
+TEST_F(DBTestTailingIterator, TailingIteratorIncomplete) {
+  CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+  ReadOptions read_options;
+  read_options.tailing = true;
+  read_options.read_tier = kBlockCacheTier;
+
+  std::string key("key");
+  std::string value("value");
+
+  ASSERT_OK(db_->Put(WriteOptions(), key, value));
+
+  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+  iter->SeekToFirst();
+  // we either see the entry or it's not in cache
+  ASSERT_TRUE(iter->Valid() || iter->status().IsIncomplete());
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  iter->SeekToFirst();
+  // should still be true after compaction
+  ASSERT_TRUE(iter->Valid() || iter->status().IsIncomplete());
+}
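+
+// Editor's note: read_tier = kBlockCacheTier restricts reads to data already
+// resident in memory (memtable or block cache); an access that would need
+// file I/O makes the iterator report Status::Incomplete() instead of
+// blocking, which is why the assertions above accept either outcome.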
+
+TEST_F(DBTestTailingIterator, TailingIteratorSeekToSame) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.write_buffer_size = 1000;
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  ReadOptions read_options;
+  read_options.tailing = true;
+
+  const int NROWS = 10000;
+  // Write rows with keys 00000, 00002, 00004 etc.
+  for (int i = 0; i < NROWS; ++i) {
+    char buf[100];
+    snprintf(buf, sizeof(buf), "%05d", 2*i);
+    std::string key(buf);
+    std::string value("value");
+    ASSERT_OK(db_->Put(WriteOptions(), key, value));
+  }
+
+  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+  // Seek to 00001.  We expect to find 00002.
+  std::string start_key = "00001";
+  iter->Seek(start_key);
+  ASSERT_TRUE(iter->Valid());
+
+  std::string found = iter->key().ToString();
+  ASSERT_EQ("00002", found);
+
+  // Now seek to the same key.  The iterator should remain in the same
+  // position.
+  iter->Seek(found);
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(found, iter->key().ToString());
+}
+
+// Sets iterate_upper_bound and verifies that ForwardIterator doesn't call
+// Seek() on immutable iterators when target key is >= prev_key and all
+// iterators, including the memtable iterator, are over the upper bound.
+TEST_F(DBTestTailingIterator, TailingIteratorUpperBound) {
+  CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+
+  const Slice upper_bound("20", 3);
+  ReadOptions read_options;
+  read_options.tailing = true;
+  read_options.iterate_upper_bound = &upper_bound;
+
+  ASSERT_OK(Put(1, "11", "11"));
+  ASSERT_OK(Put(1, "12", "12"));
+  ASSERT_OK(Put(1, "22", "22"));
+  ASSERT_OK(Flush(1));  // flush all those keys to an immutable SST file
+
+  // Add another key to the memtable.
+  ASSERT_OK(Put(1, "21", "21"));
+
+  std::unique_ptr<Iterator> it(db_->NewIterator(read_options, handles_[1]));
+  it->Seek("12");
+  ASSERT_TRUE(it->Valid());
+  ASSERT_EQ("12", it->key().ToString());
+
+  it->Next();
+  // Not valid since "21" is over the upper bound.
+  ASSERT_FALSE(it->Valid());
+
+  // This keeps track of the number of times NeedToSeekImmutable() was true.
+  int immutable_seeks = 0;
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "ForwardIterator::SeekInternal:Immutable",
+      [&](void* arg) { ++immutable_seeks; });
+
+  // Seek to 13. This should not require any immutable seeks.
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+  it->Seek("13");
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+
+  ASSERT_FALSE(it->Valid());
+  ASSERT_EQ(0, immutable_seeks);
+}
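+
+// Editor's note: ReadOptions::iterate_upper_bound holds a pointer to a
+// caller-owned Slice, so the bound must stay alive for as long as any
+// iterator created with those options is used; the tests here keep it in a
+// local whose scope spans the iterator's use.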
+
+TEST_F(DBTestTailingIterator, ManagedTailingIteratorSingle) {
+  ReadOptions read_options;
+  read_options.tailing = true;
+  read_options.managed = true;
+
+  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+  iter->SeekToFirst();
+  ASSERT_TRUE(!iter->Valid());
+
+  // add a record and check that iter can see it
+  ASSERT_OK(db_->Put(WriteOptions(), "mirko", "fodor"));
+  iter->SeekToFirst();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().ToString(), "mirko");
+
+  iter->Next();
+  ASSERT_TRUE(!iter->Valid());
+}
+
+TEST_F(DBTestTailingIterator, ManagedTailingIteratorKeepAdding) {
+  CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+  ReadOptions read_options;
+  read_options.tailing = true;
+  read_options.managed = true;
+
+  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options, handles_[1]));
+  std::string value(1024, 'a');
+
+  const int num_records = 10000;
+  for (int i = 0; i < num_records; ++i) {
+    char buf[32];
+    snprintf(buf, sizeof(buf), "%016d", i);
+
+    Slice key(buf, 16);
+    ASSERT_OK(Put(1, key, value));
+
+    iter->Seek(key);
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(key), 0);
+  }
+}
+
+TEST_F(DBTestTailingIterator, ManagedTailingIteratorSeekToNext) {
+  CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+  ReadOptions read_options;
+  read_options.tailing = true;
+  read_options.managed = true;
+
+  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options, handles_[1]));
+  std::string value(1024, 'a');
+
+  const int num_records = 1000;
+  for (int i = 1; i < num_records; ++i) {
+    char buf1[32];
+    char buf2[32];
+    snprintf(buf1, sizeof(buf1), "00a0%016d", i * 5);
+
+    Slice key(buf1, 20);
+    ASSERT_OK(Put(1, key, value));
+
+    if (i % 100 == 99) {
+      ASSERT_OK(Flush(1));
+    }
+
+    snprintf(buf2, sizeof(buf2), "00a0%016d", i * 5 - 2);
+    Slice target(buf2, 20);
+    iter->Seek(target);
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(key), 0);
+  }
+  for (int i = 2 * num_records; i > 0; --i) {
+    char buf1[32];
+    char buf2[32];
+    snprintf(buf1, sizeof(buf1), "00a0%016d", i * 5);
+
+    Slice key(buf1, 20);
+    ASSERT_OK(Put(1, key, value));
+
+    if (i % 100 == 99) {
+      ASSERT_OK(Flush(1));
+    }
+
+    snprintf(buf2, sizeof(buf2), "00a0%016d", i * 5 - 2);
+    Slice target(buf2, 20);
+    iter->Seek(target);
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(key), 0);
+  }
+}
+
+TEST_F(DBTestTailingIterator, ManagedTailingIteratorDeletes) {
+  CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+  ReadOptions read_options;
+  read_options.tailing = true;
+  read_options.managed = true;
+
+  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options, handles_[1]));
+
+  // write a single record, read it using the iterator, then delete it
+  ASSERT_OK(Put(1, "0test", "test"));
+  iter->SeekToFirst();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().ToString(), "0test");
+  ASSERT_OK(Delete(1, "0test"));
+
+  // write many more records
+  const int num_records = 10000;
+  std::string value(1024, 'A');
+
+  for (int i = 0; i < num_records; ++i) {
+    char buf[32];
+    snprintf(buf, sizeof(buf), "1%015d", i);
+
+    Slice key(buf, 16);
+    ASSERT_OK(Put(1, key, value));
+  }
+
+  // force a flush to make sure that no records are read from memtable
+  ASSERT_OK(Flush(1));
+
+  // skip "0test"
+  iter->Next();
+
+  // make sure we can read all new records using the existing iterator
+  int count = 0;
+  for (; iter->Valid(); iter->Next(), ++count) {
+  }
+
+  ASSERT_EQ(count, num_records);
+}
+
+TEST_F(DBTestTailingIterator, ManagedTailingIteratorPrefixSeek) {
+  XFUNC_TEST("", "dbtest_prefix", prefix_skip1, XFuncPoint::SetSkip,
+             kSkipNoPrefix);
+  ReadOptions read_options;
+  read_options.tailing = true;
+  read_options.managed = true;
+
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+  options.disable_auto_compactions = true;
+  options.prefix_extractor.reset(NewFixedPrefixTransform(2));
+  options.memtable_factory.reset(NewHashSkipListRepFactory(16));
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options, handles_[1]));
+  ASSERT_OK(Put(1, "0101", "test"));
+
+  ASSERT_OK(Flush(1));
+
+  ASSERT_OK(Put(1, "0202", "test"));
+
+  // Seek(0102) shouldn't find any records since 0202 has a different prefix
+  iter->Seek("0102");
+  ASSERT_TRUE(!iter->Valid());
+
+  iter->Seek("0202");
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(iter->key().ToString(), "0202");
+
+  iter->Next();
+  ASSERT_TRUE(!iter->Valid());
+  XFUNC_TEST("", "dbtest_prefix", prefix_skip1, XFuncPoint::SetSkip, 0);
+}
+
+TEST_F(DBTestTailingIterator, ManagedTailingIteratorIncomplete) {
+  CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+  ReadOptions read_options;
+  read_options.tailing = true;
+  read_options.managed = true;
+  read_options.read_tier = kBlockCacheTier;
+
+  std::string key = "key";
+  std::string value = "value";
+
+  ASSERT_OK(db_->Put(WriteOptions(), key, value));
+
+  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+  iter->SeekToFirst();
+  // we either see the entry or it's not in cache
+  ASSERT_TRUE(iter->Valid() || iter->status().IsIncomplete());
+
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  iter->SeekToFirst();
+  // should still be true after compaction
+  ASSERT_TRUE(iter->Valid() || iter->status().IsIncomplete());
+}
+
+TEST_F(DBTestTailingIterator, ManagedTailingIteratorSeekToSame) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.write_buffer_size = 1000;
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  ReadOptions read_options;
+  read_options.tailing = true;
+  read_options.managed = true;
+
+  const int NROWS = 10000;
+  // Write rows with keys 00000, 00002, 00004 etc.
+  for (int i = 0; i < NROWS; ++i) {
+    char buf[100];
+    snprintf(buf, sizeof(buf), "%05d", 2 * i);
+    std::string key(buf);
+    std::string value("value");
+    ASSERT_OK(db_->Put(WriteOptions(), key, value));
+  }
+
+  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+  // Seek to 00001.  We expect to find 00002.
+  std::string start_key = "00001";
+  iter->Seek(start_key);
+  ASSERT_TRUE(iter->Valid());
+
+  std::string found = iter->key().ToString();
+  ASSERT_EQ("00002", found);
+
+  // Now seek to the same key.  The iterator should remain in the same
+  // position.
+  iter->Seek(found);
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(found, iter->key().ToString());
+}
+
+}  // namespace rocksdb
+
+#endif  // !(defined NDEBUG) || !defined(OS_WIN)
+
+int main(int argc, char** argv) {
+#if !(defined NDEBUG) || !defined(OS_WIN)
+  rocksdb::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+#else
+  return 0;
+#endif
+}
diff --git a/src/rocksdb/db/db_test.cc b/src/rocksdb/db/db_test.cc
index eaef2a6..4bfe4db 100644
--- a/src/rocksdb/db/db_test.cc
+++ b/src/rocksdb/db/db_test.cc
@@ -7,41 +7,54 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
+// The introduction of SyncPoint effectively disabled building and running
+// this test in Release builds, which is a pity because it is a good test.
+#if !(defined NDEBUG) || !defined(OS_WIN)
+
 #include <algorithm>
 #include <iostream>
 #include <set>
-#include <unistd.h>
 #include <thread>
 #include <unordered_set>
 #include <utility>
+#include <fcntl.h>
+#ifndef OS_WIN
+#include <unistd.h>
+#endif
 
 #include "db/filename.h"
 #include "db/dbformat.h"
 #include "db/db_impl.h"
-#include "db/filename.h"
 #include "db/job_context.h"
 #include "db/version_set.h"
 #include "db/write_batch_internal.h"
 #include "port/stack_trace.h"
 #include "rocksdb/cache.h"
 #include "rocksdb/compaction_filter.h"
+#include "rocksdb/convenience.h"
 #include "rocksdb/db.h"
+#include "rocksdb/delete_scheduler.h"
 #include "rocksdb/env.h"
 #include "rocksdb/experimental.h"
 #include "rocksdb/filter_policy.h"
+#include "rocksdb/options.h"
 #include "rocksdb/perf_context.h"
 #include "rocksdb/slice.h"
 #include "rocksdb/slice_transform.h"
+#include "rocksdb/snapshot.h"
+#include "rocksdb/sst_file_writer.h"
 #include "rocksdb/table.h"
-#include "rocksdb/options.h"
 #include "rocksdb/table_properties.h"
 #include "rocksdb/thread_status.h"
 #include "rocksdb/utilities/write_batch_with_index.h"
 #include "rocksdb/utilities/checkpoint.h"
-#include "rocksdb/utilities/convenience.h"
+#include "rocksdb/utilities/optimistic_transaction_db.h"
 #include "table/block_based_table_factory.h"
 #include "table/mock_table.h"
 #include "table/plain_table_factory.h"
+#include "util/db_test_util.h"
+#include "util/file_reader_writer.h"
 #include "util/hash.h"
 #include "util/hash_linklist_rep.h"
 #include "utilities/merge_operators.h"
@@ -61,1292 +74,929 @@
 
 namespace rocksdb {
 
-static std::string RandomString(Random* rnd, int len) {
-  std::string r;
-  test::RandomString(rnd, len, &r);
-  return r;
+static long TestGetTickerCount(const Options& options, Tickers ticker_type) {
+  return options.statistics->getTickerCount(ticker_type);
 }
 
-namespace anon {
-class AtomicCounter {
- private:
-  port::Mutex mu_;
-  int count_;
- public:
-  AtomicCounter() : count_(0) { }
-  void Increment() {
-    MutexLock l(&mu_);
-    count_++;
-  }
-  int Read() {
-    MutexLock l(&mu_);
-    return count_;
-  }
-  void Reset() {
-    MutexLock l(&mu_);
-    count_ = 0;
-  }
-};
+// A helper function that ensures the table properties returned in
+// `GetPropertiesOfAllTablesTest` are correct.
+// This test assumes the number of entries differs for each of the tables.
+namespace {
 
-struct OptionsOverride {
-  std::shared_ptr<const FilterPolicy> filter_policy = nullptr;
+void VerifyTableProperties(DB* db, uint64_t expected_entries_size) {
+  TablePropertiesCollection props;
+  ASSERT_OK(db->GetPropertiesOfAllTables(&props));
 
-  // Used as a bit mask of individual enums in which to skip an XF test point
-  int skip_policy = 0;
-};
+  ASSERT_EQ(4U, props.size());
+  std::unordered_set<uint64_t> unique_entries;
 
-}  // namespace anon
+  // Indirect test
+  uint64_t sum = 0;
+  for (const auto& item : props) {
+    unique_entries.insert(item.second->num_entries);
+    sum += item.second->num_entries;
+  }
 
-static std::string Key(int i) {
-  char buf[100];
-  snprintf(buf, sizeof(buf), "key%06d", i);
-  return std::string(buf);
+  ASSERT_EQ(props.size(), unique_entries.size());
+  ASSERT_EQ(expected_entries_size, sum);
 }
 
-// Special Env used to delay background operations
-class SpecialEnv : public EnvWrapper {
- public:
-  Random rnd_;
-  port::Mutex rnd_mutex_;  // Lock to pretect rnd_
-
-  // sstable Sync() calls are blocked while this pointer is non-nullptr.
-  std::atomic<bool> delay_sstable_sync_;
+uint64_t GetNumberOfSstFilesForColumnFamily(DB* db,
+                                            std::string column_family_name) {
+  std::vector<LiveFileMetaData> metadata;
+  db->GetLiveFilesMetaData(&metadata);
+  uint64_t result = 0;
+  for (auto& fileMetadata : metadata) {
+    result += (fileMetadata.column_family_name == column_family_name);
+  }
+  return result;
+}
 
-  // Drop writes on the floor while this pointer is non-nullptr.
-  std::atomic<bool> drop_writes_;
+}  // namespace
 
-  // Simulate no-space errors while this pointer is non-nullptr.
-  std::atomic<bool> no_space_;
+class DBTest : public DBTestBase {
+ public:
+  DBTest() : DBTestBase("/db_test") {}
+};
 
-  // Simulate non-writable file system while this pointer is non-nullptr
-  std::atomic<bool> non_writable_;
+class DBTestWithParam : public DBTest,
+                        public testing::WithParamInterface<uint32_t> {
+ public:
+  DBTestWithParam() { max_subcompactions_ = GetParam(); }
 
-  // Force sync of manifest files to fail while this pointer is non-nullptr
-  std::atomic<bool> manifest_sync_error_;
+  // Required if inheriting from testing::WithParamInterface<>
+  static void SetUpTestCase() {}
+  static void TearDownTestCase() {}
 
-  // Force write to manifest files to fail while this pointer is non-nullptr
-  std::atomic<bool> manifest_write_error_;
+  uint32_t max_subcompactions_;
+};
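+
+// Editor's note: fixtures deriving from testing::WithParamInterface<> are
+// instantiated elsewhere with gtest's INSTANTIATE_TEST_CASE_P; a sketch
+// (instance name and values hypothetical):
+//
+//   INSTANTIATE_TEST_CASE_P(DBTestWithParamInst, DBTestWithParam,
+//                           ::testing::Values(1, 4));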
 
-  // Force write to log files to fail while this pointer is non-nullptr
-  std::atomic<bool> log_write_error_;
+class BloomStatsTestWithParam
+    : public DBTest,
+      public testing::WithParamInterface<std::tuple<bool, bool>> {
+ public:
+  BloomStatsTestWithParam() {
+    use_block_table_ = std::get<0>(GetParam());
+    use_block_based_builder_ = std::get<1>(GetParam());
+
+    options_.create_if_missing = true;
+    options_.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(4));
+    options_.memtable_prefix_bloom_bits = 8 * 1024;
+    if (use_block_table_) {
+      BlockBasedTableOptions table_options;
+      table_options.hash_index_allow_collision = false;
+      table_options.filter_policy.reset(
+          NewBloomFilterPolicy(10, use_block_based_builder_));
+      options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
+    } else {
+      PlainTableOptions table_options;
+      options_.table_factory.reset(NewPlainTableFactory(table_options));
+    }
 
-  // Slow down every log write, in micro-seconds.
-  std::atomic<int> log_write_slowdown_;
+    perf_context.Reset();
+    DestroyAndReopen(options_);
+  }
 
-  bool count_random_reads_;
-  anon::AtomicCounter random_read_counter_;
+  ~BloomStatsTestWithParam() {
+    perf_context.Reset();
+    Destroy(options_);
+  }
 
-  bool count_sequential_reads_;
-  anon::AtomicCounter sequential_read_counter_;
+  // Required if inheriting from testing::WithParamInterface<>
+  static void SetUpTestCase() {}
+  static void TearDownTestCase() {}
 
-  anon::AtomicCounter sleep_counter_;
+  bool use_block_table_;
+  bool use_block_based_builder_;
+  Options options_;
+};
 
-  std::atomic<int64_t> bytes_written_;
+TEST_F(DBTest, Empty) {
+  do {
+    Options options;
+    options.env = env_;
+    options.write_buffer_size = 100000;  // Small write buffer
+    options = CurrentOptions(options);
+    CreateAndReopenWithCF({"pikachu"}, options);
 
-  std::atomic<int> sync_counter_;
+    std::string num;
+    ASSERT_TRUE(dbfull()->GetProperty(
+        handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+    ASSERT_EQ("0", num);
 
-  std::atomic<uint32_t> non_writeable_rate_;
+    ASSERT_OK(Put(1, "foo", "v1"));
+    ASSERT_EQ("v1", Get(1, "foo"));
+    ASSERT_TRUE(dbfull()->GetProperty(
+        handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+    ASSERT_EQ("1", num);
 
-  std::atomic<uint32_t> new_writable_count_;
+    // Block sync calls
+    env_->delay_sstable_sync_.store(true, std::memory_order_release);
+    Put(1, "k1", std::string(100000, 'x'));         // Fill memtable
+    ASSERT_TRUE(dbfull()->GetProperty(
+        handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+    ASSERT_EQ("2", num);
 
-  std::atomic<uint32_t> non_writable_count_;
+    Put(1, "k2", std::string(100000, 'y'));         // Trigger compaction
+    ASSERT_TRUE(dbfull()->GetProperty(
+        handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+    ASSERT_EQ("1", num);
 
-  std::function<void()>* table_write_callback_;
+    ASSERT_EQ("v1", Get(1, "foo"));
+    // Release sync calls
+    env_->delay_sstable_sync_.store(false, std::memory_order_release);
 
-  int64_t addon_time_;
+    ASSERT_OK(db_->DisableFileDeletions());
+    ASSERT_TRUE(
+        dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num));
+    ASSERT_EQ("1", num);
 
-  explicit SpecialEnv(Env* base) : EnvWrapper(base), rnd_(301), addon_time_(0) {
-    delay_sstable_sync_.store(false, std::memory_order_release);
-    drop_writes_.store(false, std::memory_order_release);
-    no_space_.store(false, std::memory_order_release);
-    non_writable_.store(false, std::memory_order_release);
-    count_random_reads_ = false;
-    count_sequential_reads_ = false;
-    manifest_sync_error_.store(false, std::memory_order_release);
-    manifest_write_error_.store(false, std::memory_order_release);
-    log_write_error_.store(false, std::memory_order_release);
-    log_write_slowdown_ = 0;
-    bytes_written_ = 0;
-    sync_counter_ = 0;
-    non_writeable_rate_ = 0;
-    new_writable_count_ = 0;
-    non_writable_count_ = 0;
-    table_write_callback_ = nullptr;
-  }
+    ASSERT_OK(db_->DisableFileDeletions());
+    ASSERT_TRUE(
+        dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num));
+    ASSERT_EQ("2", num);
 
-  Status NewWritableFile(const std::string& f, unique_ptr<WritableFile>* r,
-                         const EnvOptions& soptions) override {
-    class SSTableFile : public WritableFile {
-     private:
-      SpecialEnv* env_;
-      unique_ptr<WritableFile> base_;
+    ASSERT_OK(db_->DisableFileDeletions());
+    ASSERT_TRUE(
+        dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num));
+    ASSERT_EQ("3", num);
 
-     public:
-      SSTableFile(SpecialEnv* env, unique_ptr<WritableFile>&& base)
-          : env_(env),
-            base_(std::move(base)) {
-      }
-      Status Append(const Slice& data) override {
-        if (env_->table_write_callback_) {
-          (*env_->table_write_callback_)();
-        }
-        if (env_->drop_writes_.load(std::memory_order_acquire)) {
-          // Drop writes on the floor
-          return Status::OK();
-        } else if (env_->no_space_.load(std::memory_order_acquire)) {
-          return Status::IOError("No space left on device");
-        } else {
-          env_->bytes_written_ += data.size();
-          return base_->Append(data);
-        }
-      }
-      Status Close() override {
-        // Check preallocation size
-        // preallocation size is never passed to base file.
-        size_t preallocation_size = preallocation_block_size();
-        TEST_SYNC_POINT_CALLBACK("DBTestWritableFile.GetPreallocationStatus",
-                                 &preallocation_size);
-        return base_->Close();
-      }
-      Status Flush() override { return base_->Flush(); }
-      Status Sync() override {
-        ++env_->sync_counter_;
-        while (env_->delay_sstable_sync_.load(std::memory_order_acquire)) {
-          env_->SleepForMicroseconds(100000);
-        }
-        return base_->Sync();
-      }
-      void SetIOPriority(Env::IOPriority pri) override {
-        base_->SetIOPriority(pri);
-      }
-    };
-    class ManifestFile : public WritableFile {
-     private:
-      SpecialEnv* env_;
-      unique_ptr<WritableFile> base_;
-     public:
-      ManifestFile(SpecialEnv* env, unique_ptr<WritableFile>&& b)
-          : env_(env), base_(std::move(b)) { }
-      Status Append(const Slice& data) override {
-        if (env_->manifest_write_error_.load(std::memory_order_acquire)) {
-          return Status::IOError("simulated writer error");
-        } else {
-          return base_->Append(data);
-        }
-      }
-      Status Close() override { return base_->Close(); }
-      Status Flush() override { return base_->Flush(); }
-      Status Sync() override {
-        ++env_->sync_counter_;
-        if (env_->manifest_sync_error_.load(std::memory_order_acquire)) {
-          return Status::IOError("simulated sync error");
-        } else {
-          return base_->Sync();
-        }
-      }
-      uint64_t GetFileSize() override { return base_->GetFileSize(); }
-    };
-    class WalFile : public WritableFile {
-     private:
-      SpecialEnv* env_;
-      unique_ptr<WritableFile> base_;
-     public:
-      WalFile(SpecialEnv* env, unique_ptr<WritableFile>&& b)
-          : env_(env), base_(std::move(b)) {}
-      Status Append(const Slice& data) override {
-        if (env_->log_write_error_.load(std::memory_order_acquire)) {
-          return Status::IOError("simulated writer error");
-        } else {
-          int slowdown =
-              env_->log_write_slowdown_.load(std::memory_order_acquire);
-          if (slowdown > 0) {
-            env_->SleepForMicroseconds(slowdown);
-          }
-          return base_->Append(data);
-        }
-      }
-      Status Close() override { return base_->Close(); }
-      Status Flush() override { return base_->Flush(); }
-      Status Sync() override {
-        ++env_->sync_counter_;
-        return base_->Sync();
-      }
-    };
+    ASSERT_OK(db_->EnableFileDeletions(false));
+    ASSERT_TRUE(
+        dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num));
+    ASSERT_EQ("2", num);
 
-    if (non_writeable_rate_.load(std::memory_order_acquire) > 0) {
-      uint32_t random_number;
-      {
-        MutexLock l(&rnd_mutex_);
-        random_number = rnd_.Uniform(100);
-      }
-      if (random_number < non_writeable_rate_.load()) {
-        return Status::IOError("simulated random write error");
-      }
-    }
+    ASSERT_OK(db_->EnableFileDeletions());
+    ASSERT_TRUE(
+        dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num));
+    ASSERT_EQ("0", num);
+  } while (ChangeOptions());
+}
 
-    new_writable_count_++;
+TEST_F(DBTest, WriteEmptyBatch) {
+  Options options;
+  options.env = env_;
+  options.write_buffer_size = 100000;
+  options = CurrentOptions(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
 
-    if (non_writable_count_.load() > 0) {
-      non_writable_count_--;
-      return Status::IOError("simulated write error");
-    }
+  ASSERT_OK(Put(1, "foo", "bar"));
+  env_->sync_counter_.store(0);
+  WriteOptions wo;
+  wo.sync = true;
+  wo.disableWAL = false;
+  WriteBatch empty_batch;
+  ASSERT_OK(dbfull()->Write(wo, &empty_batch));
+  ASSERT_GE(env_->sync_counter_.load(), 1);
 
-    Status s = target()->NewWritableFile(f, r, soptions);
-    if (s.ok()) {
-      if (strstr(f.c_str(), ".sst") != nullptr) {
-        r->reset(new SSTableFile(this, std::move(*r)));
-      } else if (strstr(f.c_str(), "MANIFEST") != nullptr) {
-        r->reset(new ManifestFile(this, std::move(*r)));
-      } else if (strstr(f.c_str(), "log") != nullptr) {
-        r->reset(new WalFile(this, std::move(*r)));
-      }
-    }
-    return s;
-  }
+  // make sure we can re-open it.
+  ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options));
+  ASSERT_EQ("bar", Get(1, "foo"));
+}
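+
+// A sketch of the idiom this test relies on: a synchronous write of an
+// empty WriteBatch still goes through the write path, so it syncs the WAL
+// even though no key changes. Here `db` stands for any open rocksdb::DB*
+// (a placeholder, not part of the test fixture):
+//
+//   rocksdb::WriteOptions wo;
+//   wo.sync = true;
+//   wo.disableWAL = false;
+//   rocksdb::WriteBatch empty_batch;
+//   rocksdb::Status s = db->Write(wo, &empty_batch);  // forces a log sync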
 
-  Status NewRandomAccessFile(const std::string& f,
-                             unique_ptr<RandomAccessFile>* r,
-                             const EnvOptions& soptions) override {
-    class CountingFile : public RandomAccessFile {
-     private:
-      unique_ptr<RandomAccessFile> target_;
-      anon::AtomicCounter* counter_;
-     public:
-      CountingFile(unique_ptr<RandomAccessFile>&& target,
-                   anon::AtomicCounter* counter)
-          : target_(std::move(target)), counter_(counter) {
-      }
-      virtual Status Read(uint64_t offset, size_t n, Slice* result,
-                          char* scratch) const override {
-        counter_->Increment();
-        return target_->Read(offset, n, result, scratch);
-      }
-    };
+TEST_F(DBTest, ReadOnlyDB) {
+  ASSERT_OK(Put("foo", "v1"));
+  ASSERT_OK(Put("bar", "v2"));
+  ASSERT_OK(Put("foo", "v3"));
+  Close();
 
-    Status s = target()->NewRandomAccessFile(f, r, soptions);
-    if (s.ok() && count_random_reads_) {
-      r->reset(new CountingFile(std::move(*r), &random_read_counter_));
-    }
-    return s;
+  auto options = CurrentOptions();
+  options.env = env_;
+  ASSERT_OK(ReadOnlyReopen(options));
+  ASSERT_EQ("v3", Get("foo"));
+  ASSERT_EQ("v2", Get("bar"));
+  Iterator* iter = db_->NewIterator(ReadOptions());
+  int count = 0;
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    ASSERT_OK(iter->status());
+    ++count;
   }
+  ASSERT_EQ(count, 2);
+  delete iter;
+  Close();
 
-  Status NewSequentialFile(const std::string& f, unique_ptr<SequentialFile>* r,
-                           const EnvOptions& soptions) override {
-    class CountingFile : public SequentialFile {
-     private:
-      unique_ptr<SequentialFile> target_;
-      anon::AtomicCounter* counter_;
-
-     public:
-      CountingFile(unique_ptr<SequentialFile>&& target,
-                   anon::AtomicCounter* counter)
-          : target_(std::move(target)), counter_(counter) {}
-      virtual Status Read(size_t n, Slice* result, char* scratch) override {
-        counter_->Increment();
-        return target_->Read(n, result, scratch);
-      }
-      virtual Status Skip(uint64_t n) override { return target_->Skip(n); }
-    };
-
-    Status s = target()->NewSequentialFile(f, r, soptions);
-    if (s.ok() && count_sequential_reads_) {
-      r->reset(new CountingFile(std::move(*r), &sequential_read_counter_));
-    }
-    return s;
-  }
+  // Reopen and flush memtable.
+  Reopen(options);
+  Flush();
+  Close();
+  // Now check keys in read only mode.
+  ASSERT_OK(ReadOnlyReopen(options));
+  ASSERT_EQ("v3", Get("foo"));
+  ASSERT_EQ("v2", Get("bar"));
+  ASSERT_TRUE(db_->SyncWAL().IsNotSupported());
+}
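+
+// A minimal sketch of the read-only open path exercised above, using only
+// the public API this file already includes; the helper name and path
+// argument are hypothetical.
+static Status ExampleOpenForReadOnly(const std::string& path,
+                                     std::string* value) {
+  DB* db = nullptr;
+  Options opts;
+  Status s = DB::OpenForReadOnly(opts, path, &db);
+  if (!s.ok()) {
+    return s;
+  }
+  s = db->Get(ReadOptions(), "foo", value);  // reads work normally
+  // Any mutation (Put/Delete/Write) would return Status::NotSupported().
+  delete db;
+  return s;
+}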
 
-  virtual void SleepForMicroseconds(int micros) override {
-    sleep_counter_.Increment();
-    target()->SleepForMicroseconds(micros);
-  }
+TEST_F(DBTest, CompactedDB) {
+  const uint64_t kFileSize = 1 << 20;
+  Options options;
+  options.disable_auto_compactions = true;
+  options.write_buffer_size = kFileSize;
+  options.target_file_size_base = kFileSize;
+  options.max_bytes_for_level_base = 1 << 30;
+  options.compression = kNoCompression;
+  options = CurrentOptions(options);
+  Reopen(options);
+  // 1 L0 file; CompactedDB is used only when max_open_files == -1
+  ASSERT_OK(Put("aaa", DummyString(kFileSize / 2, '1')));
+  Flush();
+  Close();
+  ASSERT_OK(ReadOnlyReopen(options));
+  Status s = Put("new", "value");
+  ASSERT_EQ(s.ToString(),
+            "Not implemented: Not supported operation in read only mode.");
+  ASSERT_EQ(DummyString(kFileSize / 2, '1'), Get("aaa"));
+  Close();
+  options.max_open_files = -1;
+  ASSERT_OK(ReadOnlyReopen(options));
+  s = Put("new", "value");
+  ASSERT_EQ(s.ToString(),
+            "Not implemented: Not supported in compacted db mode.");
+  ASSERT_EQ(DummyString(kFileSize / 2, '1'), Get("aaa"));
+  Close();
+  Reopen(options);
+  // Add more L0 files
+  ASSERT_OK(Put("bbb", DummyString(kFileSize / 2, '2')));
+  Flush();
+  ASSERT_OK(Put("aaa", DummyString(kFileSize / 2, 'a')));
+  Flush();
+  ASSERT_OK(Put("bbb", DummyString(kFileSize / 2, 'b')));
+  ASSERT_OK(Put("eee", DummyString(kFileSize / 2, 'e')));
+  Flush();
+  Close();
 
-  virtual Status GetCurrentTime(int64_t* unix_time) override {
-    Status s = target()->GetCurrentTime(unix_time);
-    if (s.ok()) {
-      *unix_time += addon_time_;
-    }
-    return s;
-  }
+  ASSERT_OK(ReadOnlyReopen(options));
+  // Fallback to read-only DB
+  s = Put("new", "value");
+  ASSERT_EQ(s.ToString(),
+            "Not implemented: Not supported operation in read only mode.");
+  Close();
 
-  virtual uint64_t NowNanos() override {
-    return target()->NowNanos() + addon_time_ * 1000;
-  }
-};
+  // Full compaction
+  Reopen(options);
+  // Add more keys
+  ASSERT_OK(Put("fff", DummyString(kFileSize / 2, 'f')));
+  ASSERT_OK(Put("hhh", DummyString(kFileSize / 2, 'h')));
+  ASSERT_OK(Put("iii", DummyString(kFileSize / 2, 'i')));
+  ASSERT_OK(Put("jjj", DummyString(kFileSize / 2, 'j')));
+  db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+  ASSERT_EQ(3, NumTableFilesAtLevel(1));
+  Close();
 
-class DBTest : public testing::Test {
- protected:
-  // Sequence of option configurations to try
-  enum OptionConfig {
-    kDefault = 0,
-    kBlockBasedTableWithPrefixHashIndex = 1,
-    kBlockBasedTableWithWholeKeyHashIndex = 2,
-    kPlainTableFirstBytePrefix = 3,
-    kPlainTableCappedPrefix = 4,
-    kPlainTableAllBytesPrefix = 5,
-    kVectorRep = 6,
-    kHashLinkList = 7,
-    kHashCuckoo = 8,
-    kMergePut = 9,
-    kFilter = 10,
-    kFullFilter = 11,
-    kUncompressed = 12,
-    kNumLevel_3 = 13,
-    kDBLogDir = 14,
-    kWalDirAndMmapReads = 15,
-    kManifestFileSize = 16,
-    kCompactOnFlush = 17,
-    kPerfOptions = 18,
-    kDeletesFilterFirst = 19,
-    kHashSkipList = 20,
-    kUniversalCompaction = 21,
-    kUniversalCompactionMultiLevel = 22,
-    kCompressedBlockCache = 23,
-    kInfiniteMaxOpenFiles = 24,
-    kxxHashChecksum = 25,
-    kFIFOCompaction = 26,
-    kOptimizeFiltersForHits = 27,
-    kEnd = 28
-  };
-  int option_config_;
+  // CompactedDB
+  ASSERT_OK(ReadOnlyReopen(options));
+  s = Put("new", "value");
+  ASSERT_EQ(s.ToString(),
+            "Not implemented: Not supported in compacted db mode.");
+  ASSERT_EQ("NOT_FOUND", Get("abc"));
+  ASSERT_EQ(DummyString(kFileSize / 2, 'a'), Get("aaa"));
+  ASSERT_EQ(DummyString(kFileSize / 2, 'b'), Get("bbb"));
+  ASSERT_EQ("NOT_FOUND", Get("ccc"));
+  ASSERT_EQ(DummyString(kFileSize / 2, 'e'), Get("eee"));
+  ASSERT_EQ(DummyString(kFileSize / 2, 'f'), Get("fff"));
+  ASSERT_EQ("NOT_FOUND", Get("ggg"));
+  ASSERT_EQ(DummyString(kFileSize / 2, 'h'), Get("hhh"));
+  ASSERT_EQ(DummyString(kFileSize / 2, 'i'), Get("iii"));
+  ASSERT_EQ(DummyString(kFileSize / 2, 'j'), Get("jjj"));
+  ASSERT_EQ("NOT_FOUND", Get("kkk"));
 
- public:
-  std::string dbname_;
-  std::string alternative_wal_dir_;
-  MockEnv* mem_env_;
-  SpecialEnv* env_;
-  DB* db_;
-  std::vector<ColumnFamilyHandle*> handles_;
-
-  Options last_options_;
-
-  // Skip some options, as they may not be applicable to a specific test.
-  // To add more skip constants, use values 4, 8, 16, etc.
-  enum OptionSkip {
-    kNoSkip = 0,
-    kSkipDeletesFilterFirst = 1,
-    kSkipUniversalCompaction = 2,
-    kSkipMergePut = 4,
-    kSkipPlainTable = 8,
-    kSkipHashIndex = 16,
-    kSkipNoSeekToLast = 32,
-    kSkipHashCuckoo = 64,
-    kSkipFIFOCompaction = 128,
-    kSkipMmapReads = 256,
-  };
+  // MultiGet
+  std::vector<std::string> values;
+  std::vector<Status> status_list = dbfull()->MultiGet(ReadOptions(),
+      std::vector<Slice>({Slice("aaa"), Slice("ccc"), Slice("eee"),
+                          Slice("ggg"), Slice("iii"), Slice("kkk")}),
+      &values);
+  ASSERT_EQ(status_list.size(), static_cast<uint64_t>(6));
+  ASSERT_EQ(values.size(), static_cast<uint64_t>(6));
+  ASSERT_OK(status_list[0]);
+  ASSERT_EQ(DummyString(kFileSize / 2, 'a'), values[0]);
+  ASSERT_TRUE(status_list[1].IsNotFound());
+  ASSERT_OK(status_list[2]);
+  ASSERT_EQ(DummyString(kFileSize / 2, 'e'), values[2]);
+  ASSERT_TRUE(status_list[3].IsNotFound());
+  ASSERT_OK(status_list[4]);
+  ASSERT_EQ(DummyString(kFileSize / 2, 'i'), values[4]);
+  ASSERT_TRUE(status_list[5].IsNotFound());
+}
 
+// Make sure that when options.block_cache is set, after a new table is
+// created its index/filter blocks are added to block cache.
+TEST_F(DBTest, IndexAndFilterBlocksOfNewTableAddedToCache) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.statistics = rocksdb::CreateDBStatistics();
+  BlockBasedTableOptions table_options;
+  table_options.cache_index_and_filter_blocks = true;
+  table_options.filter_policy.reset(NewBloomFilterPolicy(20));
+  options.table_factory.reset(new BlockBasedTableFactory(table_options));
+  CreateAndReopenWithCF({"pikachu"}, options);
 
-  DBTest() : option_config_(kDefault),
-             mem_env_(!getenv("MEM_ENV") ? nullptr :
-                                           new MockEnv(Env::Default())),
-             env_(new SpecialEnv(mem_env_ ? mem_env_ : Env::Default())) {
-    env_->SetBackgroundThreads(1, Env::LOW);
-    env_->SetBackgroundThreads(1, Env::HIGH);
-    dbname_ = test::TmpDir(env_) + "/db_test";
-    alternative_wal_dir_ = dbname_ + "/wal";
-    auto options = CurrentOptions();
-    auto delete_options = options;
-    delete_options.wal_dir = alternative_wal_dir_;
-    EXPECT_OK(DestroyDB(dbname_, delete_options));
-    // Destroy again in case the alternative WAL dir was not used.
-    EXPECT_OK(DestroyDB(dbname_, options));
-    db_ = nullptr;
-    Reopen(options);
-  }
+  ASSERT_OK(Put(1, "key", "val"));
+  // Create a new table.
+  ASSERT_OK(Flush(1));
 
-  ~DBTest() {
-    rocksdb::SyncPoint::GetInstance()->DisableProcessing();
-    rocksdb::SyncPoint::GetInstance()->LoadDependency({});
-    rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks();
-    Close();
-    Options options;
-    options.db_paths.emplace_back(dbname_, 0);
-    options.db_paths.emplace_back(dbname_ + "_2", 0);
-    options.db_paths.emplace_back(dbname_ + "_3", 0);
-    options.db_paths.emplace_back(dbname_ + "_4", 0);
-    EXPECT_OK(DestroyDB(dbname_, options));
-    delete env_;
-  }
-
-  // Switch to a fresh database with the next option configuration to
-  // test.  Return false if there are no more configurations to test.
-  bool ChangeOptions(int skip_mask = kNoSkip) {
-    for(option_config_++; option_config_ < kEnd; option_config_++) {
-      if ((skip_mask & kSkipDeletesFilterFirst) &&
-          option_config_ == kDeletesFilterFirst) {
-        continue;
-      }
-      if ((skip_mask & kSkipUniversalCompaction) &&
-          (option_config_ == kUniversalCompaction ||
-           option_config_ == kUniversalCompactionMultiLevel)) {
-        continue;
-      }
-      if ((skip_mask & kSkipMergePut) && option_config_ == kMergePut) {
-        continue;
-      }
-      if ((skip_mask & kSkipNoSeekToLast) &&
-          (option_config_ == kHashLinkList ||
-           option_config_ == kHashSkipList)) {
-        continue;
-      }
-      if ((skip_mask & kSkipPlainTable) &&
-          (option_config_ == kPlainTableAllBytesPrefix ||
-           option_config_ == kPlainTableFirstBytePrefix ||
-           option_config_ == kPlainTableCappedPrefix)) {
-        continue;
-      }
-      if ((skip_mask & kSkipHashIndex) &&
-          (option_config_ == kBlockBasedTableWithPrefixHashIndex ||
-           option_config_ == kBlockBasedTableWithWholeKeyHashIndex)) {
-        continue;
-      }
-      if ((skip_mask & kSkipHashCuckoo) && (option_config_ == kHashCuckoo)) {
-        continue;
-      }
-      if ((skip_mask & kSkipFIFOCompaction) &&
-          option_config_ == kFIFOCompaction) {
-        continue;
-      }
-      if ((skip_mask & kSkipMmapReads) &&
-          option_config_ == kWalDirAndMmapReads) {
-        continue;
-      }
-      break;
-    }
+  // index/filter blocks added to block cache right after table creation.
+  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+  ASSERT_EQ(2, /* only index/filter were added */
+            TestGetTickerCount(options, BLOCK_CACHE_ADD));
+  ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_DATA_MISS));
+  uint64_t int_num;
+  ASSERT_TRUE(
+      dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num));
+  ASSERT_EQ(int_num, 0U);
 
-    if (option_config_ >= kEnd) {
-      Destroy(last_options_);
-      return false;
-    } else {
-      auto options = CurrentOptions();
-      options.create_if_missing = true;
-      DestroyAndReopen(options);
-      return true;
-    }
-  }
+  // Make sure filter block is in cache.
+  std::string value;
+  db_->KeyMayExist(ReadOptions(), handles_[1], "key", &value);
 
-  // Switch between different compaction styles (we have only 2 now).
-  bool ChangeCompactOptions() {
-    if (option_config_ == kDefault) {
-      option_config_ = kUniversalCompaction;
-      Destroy(last_options_);
-      auto options = CurrentOptions();
-      options.create_if_missing = true;
-      TryReopen(options);
-      return true;
-    } else if (option_config_ == kUniversalCompaction) {
-      option_config_ = kUniversalCompactionMultiLevel;
-      Destroy(last_options_);
-      auto options = CurrentOptions();
-      options.create_if_missing = true;
-      TryReopen(options);
-      return true;
-    } else {
-      return false;
-    }
-  }
+  // Miss count should remain the same.
+  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
 
-  // Switch between different filter policy
-  // Jump from kDefault to kFilter to kFullFilter
-  bool ChangeFilterOptions() {
-    if (option_config_ == kDefault) {
-      option_config_ = kFilter;
-    } else if (option_config_ == kFilter) {
-      option_config_ = kFullFilter;
-    } else {
-      return false;
-    }
-    Destroy(last_options_);
+  db_->KeyMayExist(ReadOptions(), handles_[1], "key", &value);
+  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+  ASSERT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
 
-    auto options = CurrentOptions();
-    options.create_if_missing = true;
-    TryReopen(options);
-    return true;
-  }
+  // Make sure index block is in cache.
+  auto index_block_hit = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT);
+  value = Get(1, "key");
+  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+  ASSERT_EQ(index_block_hit + 1,
+            TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
 
-  // Return the current option configuration.
-  Options CurrentOptions(
-      const anon::OptionsOverride& options_override = anon::OptionsOverride()) {
-    Options options;
-    return CurrentOptions(options, options_override);
-  }
+  value = Get(1, "key");
+  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+  ASSERT_EQ(index_block_hit + 2,
+            TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+}
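+
+// A configuration sketch for the behavior verified above: with
+// cache_index_and_filter_blocks set, index and filter blocks are charged to
+// the same block cache as data blocks. The helper name and the 64 MB
+// capacity are illustrative values, not taken from the test.
+static Options ExampleCacheIndexAndFilterOptions() {
+  Options opts;
+  BlockBasedTableOptions t;
+  t.cache_index_and_filter_blocks = true;
+  t.block_cache = NewLRUCache(64 * 1024 * 1024);  // shared LRU block cache
+  t.filter_policy.reset(NewBloomFilterPolicy(10));
+  opts.table_factory.reset(NewBlockBasedTableFactory(t));
+  return opts;
+}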
 
-  Options CurrentOptions(
-      const Options& defaultOptions,
-      const anon::OptionsOverride& options_override = anon::OptionsOverride()) {
-    // this redundant copy is to minimize code change w/o causing a lint error.
-    Options options = defaultOptions;
-    XFUNC_TEST("", "dbtest_options", inplace_options1, GetXFTestOptions,
-               reinterpret_cast<Options*>(&options),
-               options_override.skip_policy);
-    BlockBasedTableOptions table_options;
-    bool set_block_based_table_factory = true;
-    switch (option_config_) {
-      case kHashSkipList:
-        options.prefix_extractor.reset(NewFixedPrefixTransform(1));
-        options.memtable_factory.reset(
-            NewHashSkipListRepFactory(16));
-        break;
-      case kPlainTableFirstBytePrefix:
-        options.table_factory.reset(new PlainTableFactory());
-        options.prefix_extractor.reset(NewFixedPrefixTransform(1));
-        options.allow_mmap_reads = true;
-        options.max_sequential_skip_in_iterations = 999999;
-        set_block_based_table_factory = false;
-        break;
-      case kPlainTableCappedPrefix:
-        options.table_factory.reset(new PlainTableFactory());
-        options.prefix_extractor.reset(NewCappedPrefixTransform(8));
-        options.allow_mmap_reads = true;
-        options.max_sequential_skip_in_iterations = 999999;
-        set_block_based_table_factory = false;
-        break;
-      case kPlainTableAllBytesPrefix:
-        options.table_factory.reset(new PlainTableFactory());
-        options.prefix_extractor.reset(NewNoopTransform());
-        options.allow_mmap_reads = true;
-        options.max_sequential_skip_in_iterations = 999999;
-        set_block_based_table_factory = false;
-        break;
-      case kMergePut:
-        options.merge_operator = MergeOperators::CreatePutOperator();
-        break;
-      case kFilter:
-        table_options.filter_policy.reset(NewBloomFilterPolicy(10, true));
-        break;
-      case kFullFilter:
-        table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
-        break;
-      case kUncompressed:
-        options.compression = kNoCompression;
-        break;
-      case kNumLevel_3:
-        options.num_levels = 3;
-        break;
-      case kDBLogDir:
-        options.db_log_dir = test::TmpDir(env_);
-        break;
-      case kWalDirAndMmapReads:
-        options.wal_dir = alternative_wal_dir_;
-        // mmap reads should be orthogonal to WalDir setting, so we piggyback to
-        // this option config to test mmap reads as well
-        options.allow_mmap_reads = true;
-        break;
-      case kManifestFileSize:
-        options.max_manifest_file_size = 50; // 50 bytes
-      case kCompactOnFlush:
-        options.purge_redundant_kvs_while_flush =
-          !options.purge_redundant_kvs_while_flush;
-        break;
-      case kPerfOptions:
-        options.hard_rate_limit = 2.0;
-        options.rate_limit_delay_max_milliseconds = 2;
-        // TODO -- test more options
-        break;
-      case kDeletesFilterFirst:
-        options.filter_deletes = true;
-        break;
-      case kVectorRep:
-        options.memtable_factory.reset(new VectorRepFactory(100));
-        break;
-      case kHashLinkList:
-        options.prefix_extractor.reset(NewFixedPrefixTransform(1));
-        options.memtable_factory.reset(
-            NewHashLinkListRepFactory(4, 0, 3, true, 4));
-        break;
-      case kHashCuckoo:
-        options.memtable_factory.reset(
-            NewHashCuckooRepFactory(options.write_buffer_size));
-        break;
-      case kUniversalCompaction:
-        options.compaction_style = kCompactionStyleUniversal;
-        options.num_levels = 1;
-        break;
-      case kUniversalCompactionMultiLevel:
-        options.compaction_style = kCompactionStyleUniversal;
-        options.num_levels = 8;
-        break;
-      case kCompressedBlockCache:
-        options.allow_mmap_writes = true;
-        table_options.block_cache_compressed = NewLRUCache(8*1024*1024);
-        break;
-      case kInfiniteMaxOpenFiles:
-        options.max_open_files = -1;
-        break;
-      case kxxHashChecksum: {
-        table_options.checksum = kxxHash;
-        break;
-      }
-      case kFIFOCompaction: {
-        options.compaction_style = kCompactionStyleFIFO;
-        break;
-      }
-      case kBlockBasedTableWithPrefixHashIndex: {
-        table_options.index_type = BlockBasedTableOptions::kHashSearch;
-        options.prefix_extractor.reset(NewFixedPrefixTransform(1));
-        break;
-      }
-      case kBlockBasedTableWithWholeKeyHashIndex: {
-        table_options.index_type = BlockBasedTableOptions::kHashSearch;
-        options.prefix_extractor.reset(NewNoopTransform());
-        break;
-      }
-      case kOptimizeFiltersForHits: {
-        options.optimize_filters_for_hits = true;
-        set_block_based_table_factory = true;
-        break;
-      }
+TEST_F(DBTest, ParanoidFileChecks) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.statistics = rocksdb::CreateDBStatistics();
+  options.level0_file_num_compaction_trigger = 2;
+  options.paranoid_file_checks = true;
+  BlockBasedTableOptions table_options;
+  table_options.cache_index_and_filter_blocks = false;
+  table_options.filter_policy.reset(NewBloomFilterPolicy(20));
+  options.table_factory.reset(new BlockBasedTableFactory(table_options));
+  CreateAndReopenWithCF({"pikachu"}, options);
 
-      default:
-        break;
-    }
+  ASSERT_OK(Put(1, "1_key", "val"));
+  ASSERT_OK(Put(1, "9_key", "val"));
+  // Create a new table.
+  ASSERT_OK(Flush(1));
+  ASSERT_EQ(1, /* read and cache data block */
+            TestGetTickerCount(options, BLOCK_CACHE_ADD));
 
-    if (options_override.filter_policy) {
-      table_options.filter_policy = options_override.filter_policy;
-    }
-    if (set_block_based_table_factory) {
-      options.table_factory.reset(NewBlockBasedTableFactory(table_options));
-    }
-    options.env = env_;
-    options.create_if_missing = true;
-    return options;
-  }
+  ASSERT_OK(Put(1, "1_key2", "val2"));
+  ASSERT_OK(Put(1, "9_key2", "val2"));
+  // Create a new SST file. This will further trigger a compaction
+  // and generate another file.
+  ASSERT_OK(Flush(1));
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(3, /* 3 files created in total so far */
+            TestGetTickerCount(options, BLOCK_CACHE_ADD));
 
-  DBImpl* dbfull() {
-    return reinterpret_cast<DBImpl*>(db_);
-  }
+  // After disabling options.paranoid_file_checks, no further block
+  // is added to the cache when a new file is generated.
+  ASSERT_OK(
+      dbfull()->SetOptions(handles_[1], {{"paranoid_file_checks", "false"}}));
+
+  ASSERT_OK(Put(1, "1_key3", "val3"));
+  ASSERT_OK(Put(1, "9_key3", "val3"));
+  ASSERT_OK(Flush(1));
+  ASSERT_OK(Put(1, "1_key4", "val4"));
+  ASSERT_OK(Put(1, "9_key4", "val4"));
+  ASSERT_OK(Flush(1));
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(3, /* still 3 files created in total */
+            TestGetTickerCount(options, BLOCK_CACHE_ADD));
+}
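+
+// Dynamic option change, as used above, sketched: SetOptions() takes
+// string-typed name/value pairs and applies them without reopening the DB.
+// `db` and `cf` stand for an open rocksdb::DB* and a ColumnFamilyHandle*:
+//
+//   Status s = db->SetOptions(cf, {{"paranoid_file_checks", "false"}});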
 
-  void CreateColumnFamilies(const std::vector<std::string>& cfs,
-                            const Options& options) {
-    ColumnFamilyOptions cf_opts(options);
-    size_t cfi = handles_.size();
-    handles_.resize(cfi + cfs.size());
-    for (auto cf : cfs) {
-      ASSERT_OK(db_->CreateColumnFamily(cf_opts, cf, &handles_[cfi++]));
+TEST_F(DBTest, GetPropertiesOfAllTablesTest) {
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 8;
+  Reopen(options);
+  // Create 4 tables
+  for (int table = 0; table < 4; ++table) {
+    for (int i = 0; i < 10 + table; ++i) {
+      db_->Put(WriteOptions(), ToString(table * 100 + i), "val");
     }
+    db_->Flush(FlushOptions());
   }
 
-  void CreateAndReopenWithCF(const std::vector<std::string>& cfs,
-                             const Options& options) {
-    CreateColumnFamilies(cfs, options);
-    std::vector<std::string> cfs_plus_default = cfs;
-    cfs_plus_default.insert(cfs_plus_default.begin(), kDefaultColumnFamilyName);
-    ReopenWithColumnFamilies(cfs_plus_default, options);
-  }
+  // 1. Read table properties directly from file
+  Reopen(options);
+  VerifyTableProperties(db_, 10 + 11 + 12 + 13);
 
-  void ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
-                                const std::vector<Options>& options) {
-    ASSERT_OK(TryReopenWithColumnFamilies(cfs, options));
+  // 2. Put two tables into the table cache:
+  Reopen(options);
+  // fetch a key from the 1st and 2nd tables, which internally loads those
+  // tables into the table cache.
+  for (int i = 0; i < 2; ++i) {
+    Get(ToString(i * 100 + 0));
   }
 
-  void ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
-                                const Options& options) {
-    ASSERT_OK(TryReopenWithColumnFamilies(cfs, options));
-  }
+  VerifyTableProperties(db_, 10 + 11 + 12 + 13);
 
-  Status TryReopenWithColumnFamilies(
-      const std::vector<std::string>& cfs,
-      const std::vector<Options>& options) {
-    Close();
-    EXPECT_EQ(cfs.size(), options.size());
-    std::vector<ColumnFamilyDescriptor> column_families;
-    for (size_t i = 0; i < cfs.size(); ++i) {
-      column_families.push_back(ColumnFamilyDescriptor(cfs[i], options[i]));
-    }
-    DBOptions db_opts = DBOptions(options[0]);
-    return DB::Open(db_opts, dbname_, column_families, &handles_, &db_);
+  // 3. Put all tables into the table cache:
+  Reopen(options);
+  // fetch a key from every table, which internally loads all four tables
+  // into the table cache.
+  for (int i = 0; i < 4; ++i) {
+    Get(ToString(i * 100 + 0));
   }
+  VerifyTableProperties(db_, 10 + 11 + 12 + 13);
+}
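+
+// Direct use of the API exercised above, sketched; `db` is any open
+// rocksdb::DB*, and the collection maps each live SST file name to its
+// TableProperties:
+//
+//   TablePropertiesCollection props;
+//   Status s = db->GetPropertiesOfAllTables(&props);
+//   for (const auto& item : props) {
+//     uint64_t entries = item.second->num_entries;  // per-file entry count
+//   }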
 
-  Status TryReopenWithColumnFamilies(const std::vector<std::string>& cfs,
-                                     const Options& options) {
-    Close();
-    std::vector<Options> v_opts(cfs.size(), options);
-    return TryReopenWithColumnFamilies(cfs, v_opts);
+namespace {
+void ResetTableProperties(TableProperties* tp) {
+  tp->data_size = 0;
+  tp->index_size = 0;
+  tp->filter_size = 0;
+  tp->raw_key_size = 0;
+  tp->raw_value_size = 0;
+  tp->num_data_blocks = 0;
+  tp->num_entries = 0;
+}
+
+void ParseTablePropertiesString(std::string tp_string, TableProperties* tp) {
+  double dummy_double;
+  std::replace(tp_string.begin(), tp_string.end(), ';', ' ');
+  std::replace(tp_string.begin(), tp_string.end(), '=', ' ');
+  ResetTableProperties(tp);
+
+  sscanf(tp_string.c_str(), "# data blocks %" SCNu64
+                            " # entries %" SCNu64
+                            " raw key size %" SCNu64
+                            " raw average key size %lf "
+                            " raw value size %" SCNu64
+                            " raw average value size %lf "
+                            " data block size %" SCNu64
+                            " index block size %" SCNu64
+                            " filter block size %" SCNu64,
+         &tp->num_data_blocks, &tp->num_entries, &tp->raw_key_size,
+         &dummy_double, &tp->raw_value_size, &dummy_double, &tp->data_size,
+         &tp->index_size, &tp->filter_size);
+}
+
+void VerifySimilar(uint64_t a, uint64_t b, double bias) {
+  ASSERT_EQ(a == 0U, b == 0U);
+  if (a == 0) {
+    return;
   }
+  double dbl_a = static_cast<double>(a);
+  double dbl_b = static_cast<double>(b);
+  if (dbl_a > dbl_b) {
+    ASSERT_LT(static_cast<double>(dbl_a - dbl_b) / (dbl_a + dbl_b), bias);
+  } else {
+    ASSERT_LT(static_cast<double>(dbl_b - dbl_a) / (dbl_a + dbl_b), bias);
+  }
+}
+
+void VerifyTableProperties(const TableProperties& base_tp,
+                           const TableProperties& new_tp,
+                           double filter_size_bias = 0.1,
+                           double index_size_bias = 0.1,
+                           double data_size_bias = 0.1,
+                           double num_data_blocks_bias = 0.05) {
+  VerifySimilar(base_tp.data_size, new_tp.data_size, data_size_bias);
+  VerifySimilar(base_tp.index_size, new_tp.index_size, index_size_bias);
+  VerifySimilar(base_tp.filter_size, new_tp.filter_size, filter_size_bias);
+  VerifySimilar(base_tp.num_data_blocks, new_tp.num_data_blocks,
+                num_data_blocks_bias);
+  ASSERT_EQ(base_tp.raw_key_size, new_tp.raw_key_size);
+  ASSERT_EQ(base_tp.raw_value_size, new_tp.raw_value_size);
+  ASSERT_EQ(base_tp.num_entries, new_tp.num_entries);
+}
+
+void GetExpectedTableProperties(TableProperties* expected_tp,
+                                const int kKeySize, const int kValueSize,
+                                const int kKeysPerTable, const int kTableCount,
+                                const int kBloomBitsPerKey,
+                                const size_t kBlockSize) {
+  const int kKeyCount = kTableCount * kKeysPerTable;
+  const int kAvgSuccessorSize = kKeySize / 2;
+  const int kEncodingSavePerKey = kKeySize / 4;
+  expected_tp->raw_key_size = kKeyCount * (kKeySize + 8);
+  expected_tp->raw_value_size = kKeyCount * kValueSize;
+  expected_tp->num_entries = kKeyCount;
+  expected_tp->num_data_blocks =
+      kTableCount *
+      (kKeysPerTable * (kKeySize - kEncodingSavePerKey + kValueSize)) /
+      kBlockSize;
+  expected_tp->data_size =
+      kTableCount * (kKeysPerTable * (kKeySize + 8 + kValueSize));
+  expected_tp->index_size =
+      expected_tp->num_data_blocks * (kAvgSuccessorSize + 12);
+  expected_tp->filter_size =
+      kTableCount * (kKeysPerTable * kBloomBitsPerKey / 8);
+}
+}  // namespace
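+
+// A worked instance of the estimate above, using the first configuration
+// of the AggregatedTableProperties test that follows (kTableCount = 40,
+// kKeysPerTable = 100, kKeySize = 80, kValueSize = 200,
+// kBloomBitsPerKey = 20, block_size = 1024); divisions are integer:
+//
+//   kKeyCount       = 40 * 100                            = 4000
+//   raw_key_size    = 4000 * (80 + 8)                     = 352000
+//   raw_value_size  = 4000 * 200                          = 800000
+//   num_data_blocks = 40 * (100 * (80 - 20 + 200)) / 1024 = 1015
+//   data_size       = 40 * (100 * (80 + 8 + 200))         = 1152000
+//   index_size      = 1015 * (40 + 12)                    = 52780
+//   filter_size     = 40 * (100 * 20 / 8)                 = 10000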
 
-  void Reopen(const Options& options) {
-    ASSERT_OK(TryReopen(options));
-  }
-
-  void Close() {
-    for (auto h : handles_) {
-      delete h;
-    }
-    handles_.clear();
-    delete db_;
-    db_ = nullptr;
-  }
-
-  void DestroyAndReopen(const Options& options) {
-    //Destroy using last options
-    Destroy(last_options_);
-    ASSERT_OK(TryReopen(options));
-  }
-
-  void Destroy(const Options& options) {
-    Close();
-    ASSERT_OK(DestroyDB(dbname_, options));
-  }
+TEST_F(DBTest, AggregatedTableProperties) {
+  for (int kTableCount = 40; kTableCount <= 100; kTableCount += 30) {
+    const int kKeysPerTable = 100;
+    const int kKeySize = 80;
+    const int kValueSize = 200;
+    const int kBloomBitsPerKey = 20;
 
-  Status ReadOnlyReopen(const Options& options) {
-    return DB::OpenForReadOnly(options, dbname_, &db_);
-  }
+    Options options = CurrentOptions();
+    options.level0_file_num_compaction_trigger = 8;
+    options.compression = kNoCompression;
+    options.create_if_missing = true;
 
-  Status TryReopen(const Options& options) {
-    Close();
-    last_options_ = options;
-    return DB::Open(options, dbname_, &db_);
-  }
+    BlockBasedTableOptions table_options;
+    table_options.filter_policy.reset(
+        NewBloomFilterPolicy(kBloomBitsPerKey, false));
+    table_options.block_size = 1024;
+    options.table_factory.reset(new BlockBasedTableFactory(table_options));
 
-  Status Flush(int cf = 0) {
-    if (cf == 0) {
-      return db_->Flush(FlushOptions());
-    } else {
-      return db_->Flush(FlushOptions(), handles_[cf]);
-    }
-  }
+    DestroyAndReopen(options);
 
-  Status Put(const Slice& k, const Slice& v, WriteOptions wo = WriteOptions()) {
-    if (kMergePut == option_config_ ) {
-      return db_->Merge(wo, k, v);
-    } else {
-      return db_->Put(wo, k, v);
+    Random rnd(5632);
+    for (int table = 1; table <= kTableCount; ++table) {
+      for (int i = 0; i < kKeysPerTable; ++i) {
+        db_->Put(WriteOptions(), RandomString(&rnd, kKeySize),
+                 RandomString(&rnd, kValueSize));
+      }
+      db_->Flush(FlushOptions());
     }
-  }
+    std::string property;
+    db_->GetProperty(DB::Properties::kAggregatedTableProperties, &property);
 
-  Status Put(int cf, const Slice& k, const Slice& v,
-             WriteOptions wo = WriteOptions()) {
-    if (kMergePut == option_config_) {
-      return db_->Merge(wo, handles_[cf], k, v);
-    } else {
-      return db_->Put(wo, handles_[cf], k, v);
-    }
-  }
+    TableProperties expected_tp;
+    GetExpectedTableProperties(&expected_tp, kKeySize, kValueSize,
+                               kKeysPerTable, kTableCount, kBloomBitsPerKey,
+                               table_options.block_size);
 
-  Status Delete(const std::string& k) {
-    return db_->Delete(WriteOptions(), k);
-  }
+    TableProperties output_tp;
+    ParseTablePropertiesString(property, &output_tp);
 
-  Status Delete(int cf, const std::string& k) {
-    return db_->Delete(WriteOptions(), handles_[cf], k);
+    VerifyTableProperties(expected_tp, output_tp);
   }
+}
 
-  std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) {
-    ReadOptions options;
-    options.verify_checksums = true;
-    options.snapshot = snapshot;
-    std::string result;
-    Status s = db_->Get(options, k, &result);
-    if (s.IsNotFound()) {
-      result = "NOT_FOUND";
-    } else if (!s.ok()) {
-      result = s.ToString();
-    }
-    return result;
-  }
+TEST_F(DBTest, ReadLatencyHistogramByLevel) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 110 << 10;
+  options.level0_file_num_compaction_trigger = 3;
+  options.num_levels = 4;
+  options.compression = kNoCompression;
+  options.max_bytes_for_level_base = 450 << 10;
+  options.target_file_size_base = 98 << 10;
+  options.max_write_buffer_number = 2;
+  options.statistics = rocksdb::CreateDBStatistics();
+  options.max_open_files = 100;
 
-  std::string Get(int cf, const std::string& k,
-                  const Snapshot* snapshot = nullptr) {
-    ReadOptions options;
-    options.verify_checksums = true;
-    options.snapshot = snapshot;
-    std::string result;
-    Status s = db_->Get(options, handles_[cf], k, &result);
-    if (s.IsNotFound()) {
-      result = "NOT_FOUND";
-    } else if (!s.ok()) {
-      result = s.ToString();
-    }
-    return result;
-  }
+  BlockBasedTableOptions table_options;
+  table_options.no_block_cache = true;
 
-  uint64_t GetNumSnapshots() {
-    uint64_t int_num;
-    EXPECT_TRUE(dbfull()->GetIntProperty("rocksdb.num-snapshots", &int_num));
-    return int_num;
+  DestroyAndReopen(options);
+  int key_index = 0;
+  Random rnd(301);
+  for (int num = 0; num < 5; num++) {
+    Put("foo", "bar");
+    GenerateNewFile(&rnd, &key_index);
   }
 
-  uint64_t GetTimeOldestSnapshots() {
-    uint64_t int_num;
-    EXPECT_TRUE(
-        dbfull()->GetIntProperty("rocksdb.oldest-snapshot-time", &int_num));
-    return int_num;
-  }
-
-  // Return a string that contains all key,value pairs in order,
-  // formatted like "(k1->v1)(k2->v2)".
-  std::string Contents(int cf = 0) {
-    std::vector<std::string> forward;
-    std::string result;
-    Iterator* iter = (cf == 0) ? db_->NewIterator(ReadOptions())
-                               : db_->NewIterator(ReadOptions(), handles_[cf]);
-    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
-      std::string s = IterStatus(iter);
-      result.push_back('(');
-      result.append(s);
-      result.push_back(')');
-      forward.push_back(s);
-    }
-
-    // Check reverse iteration results are the reverse of forward results
-    unsigned int matched = 0;
-    for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
-      EXPECT_LT(matched, forward.size());
-      EXPECT_EQ(IterStatus(iter), forward[forward.size() - matched - 1]);
-      matched++;
-    }
-    EXPECT_EQ(matched, forward.size());
-
-    delete iter;
-    return result;
-  }
+  std::string prop;
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.dbstats", &prop));
 
-  std::string AllEntriesFor(const Slice& user_key, int cf = 0) {
-    Arena arena;
-    ScopedArenaIterator iter;
-    if (cf == 0) {
-      iter.set(dbfull()->TEST_NewInternalIterator(&arena));
-    } else {
-      iter.set(dbfull()->TEST_NewInternalIterator(&arena, handles_[cf]));
-    }
-    InternalKey target(user_key, kMaxSequenceNumber, kTypeValue);
-    iter->Seek(target.Encode());
-    std::string result;
-    if (!iter->status().ok()) {
-      result = iter->status().ToString();
-    } else {
-      result = "[ ";
-      bool first = true;
-      while (iter->Valid()) {
-        ParsedInternalKey ikey(Slice(), 0, kTypeValue);
-        if (!ParseInternalKey(iter->key(), &ikey)) {
-          result += "CORRUPTED";
-        } else {
-          if (last_options_.comparator->Compare(ikey.user_key, user_key) != 0) {
-            break;
-          }
-          if (!first) {
-            result += ", ";
-          }
-          first = false;
-          switch (ikey.type) {
-            case kTypeValue:
-              result += iter->value().ToString();
-              break;
-            case kTypeMerge:
-              // keep it the same as kTypeValue for testing kMergePut
-              result += iter->value().ToString();
-              break;
-            case kTypeDeletion:
-              result += "DEL";
-              break;
-            default:
-              assert(false);
-              break;
-          }
-        }
-        iter->Next();
-      }
-      if (!first) {
-        result += " ";
-      }
-      result += "]";
-    }
-    return result;
+  // Get() after flushes; check that the read latency histogram is tracked.
+  for (int key = 0; key < 500; key++) {
+    Get(Key(key));
   }
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.dbstats", &prop));
+  ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram"));
+  ASSERT_NE(std::string::npos, prop.find("** Level 1 read latency histogram"));
+  ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
 
-  int NumSortedRuns(int cf = 0) {
-    ColumnFamilyMetaData cf_meta;
-    if (cf == 0) {
-      db_->GetColumnFamilyMetaData(&cf_meta);
-    } else {
-      db_->GetColumnFamilyMetaData(handles_[cf], &cf_meta);
-    }
-    int num_sr = static_cast<int>(cf_meta.levels[0].files.size());
-    for (size_t i = 1U; i < cf_meta.levels.size(); i++) {
-      if (cf_meta.levels[i].files.size() > 0) {
-        num_sr++;
-      }
-    }
-    return num_sr;
+  // Reopen and issue Get(); the latency should still be tracked.
+  Reopen(options);
+  for (int key = 0; key < 500; key++) {
+    Get(Key(key));
   }
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.dbstats", &prop));
+  ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram"));
+  ASSERT_NE(std::string::npos, prop.find("** Level 1 read latency histogram"));
+  ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
 
-  uint64_t TotalSize(int cf = 0) {
-    ColumnFamilyMetaData cf_meta;
-    if (cf == 0) {
-      db_->GetColumnFamilyMetaData(&cf_meta);
-    } else {
-      db_->GetColumnFamilyMetaData(handles_[cf], &cf_meta);
+  // Reopen and iterate; the latency should still be tracked.
+  Reopen(options);
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.dbstats", &prop));
+  ASSERT_EQ(std::string::npos, prop.find("** Level 0 read latency histogram"));
+  ASSERT_EQ(std::string::npos, prop.find("** Level 1 read latency histogram"));
+  ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+  {
+    unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+    for (iter->Seek(Key(0)); iter->Valid(); iter->Next()) {
     }
-    return cf_meta.size;
   }
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.dbstats", &prop));
+  ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram"));
+  ASSERT_NE(std::string::npos, prop.find("** Level 1 read latency histogram"));
+  ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
 
-  int NumTableFilesAtLevel(int level, int cf = 0) {
-    std::string property;
-    if (cf == 0) {
-      // default cfd
-      EXPECT_TRUE(db_->GetProperty(
-          "rocksdb.num-files-at-level" + NumberToString(level), &property));
-    } else {
-      EXPECT_TRUE(db_->GetProperty(
-          handles_[cf], "rocksdb.num-files-at-level" + NumberToString(level),
-          &property));
-    }
-    return atoi(property.c_str());
-  }
+  // options.max_open_files = -1 preloads all table readers.
+  options.max_open_files = -1;
+  Reopen(options);
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.dbstats", &prop));
+  ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram"));
+  ASSERT_NE(std::string::npos, prop.find("** Level 1 read latency histogram"));
+  ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+  for (int key = 0; key < 500; key++) {
+    Get(Key(key));
+  }
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.dbstats", &prop));
+  ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram"));
+  ASSERT_NE(std::string::npos, prop.find("** Level 1 read latency histogram"));
+  ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+}
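+
+// The property polling used above, sketched; GetProperty() fills a
+// human-readable stats dump that includes one "Level N read latency
+// histogram" section per level that has served reads. `db` is any open
+// rocksdb::DB*:
+//
+//   std::string stats;
+//   if (db->GetProperty("rocksdb.dbstats", &stats)) {
+//     fprintf(stderr, "%s\n", stats.c_str());
+//   }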
+
+TEST_F(DBTest, AggregatedTablePropertiesAtLevel) {
+  const int kTableCount = 100;
+  const int kKeysPerTable = 10;
+  const int kKeySize = 50;
+  const int kValueSize = 400;
+  const int kMaxLevel = 7;
+  const int kBloomBitsPerKey = 20;
+  Random rnd(301);
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  options.create_if_missing = true;
+  options.level0_file_num_compaction_trigger = 2;
+  options.target_file_size_base = 8192;
+  options.max_bytes_for_level_base = 10000;
+  options.max_bytes_for_level_multiplier = 2;
+  // This ensures no compaction is happening when we call GetProperty().
+  options.disable_auto_compactions = true;
 
-  uint64_t SizeAtLevel(int level) {
-    std::vector<LiveFileMetaData> metadata;
-    db_->GetLiveFilesMetaData(&metadata);
-    uint64_t sum = 0;
-    for (const auto& m : metadata) {
-      if (m.level == level) {
-        sum += m.size;
-      }
-    }
-    return sum;
-  }
+  BlockBasedTableOptions table_options;
+  table_options.filter_policy.reset(
+      NewBloomFilterPolicy(kBloomBitsPerKey, false));
+  table_options.block_size = 1024;
+  options.table_factory.reset(new BlockBasedTableFactory(table_options));
 
-  int TotalLiveFiles(int cf = 0) {
-    ColumnFamilyMetaData cf_meta;
-    if (cf == 0) {
-      db_->GetColumnFamilyMetaData(&cf_meta);
-    } else {
-      db_->GetColumnFamilyMetaData(handles_[cf], &cf_meta);
-    }
-    int num_files = 0;
-    for (auto& level : cf_meta.levels) {
-      num_files += level.files.size();
-    }
-    return num_files;
-  }
+  DestroyAndReopen(options);
 
-  int TotalTableFiles(int cf = 0, int levels = -1) {
-    if (levels == -1) {
-      levels = CurrentOptions().num_levels;
+  std::string level_tp_strings[kMaxLevel];
+  std::string tp_string;
+  TableProperties level_tps[kMaxLevel];
+  TableProperties tp, sum_tp, expected_tp;
+  for (int table = 1; table <= kTableCount; ++table) {
+    for (int i = 0; i < kKeysPerTable; ++i) {
+      db_->Put(WriteOptions(), RandomString(&rnd, kKeySize),
+               RandomString(&rnd, kValueSize));
     }
-    int result = 0;
-    for (int level = 0; level < levels; level++) {
-      result += NumTableFilesAtLevel(level, cf);
-    }
-    return result;
-  }
-
-  // Return spread of files per level
-  std::string FilesPerLevel(int cf = 0) {
-    int num_levels =
-        (cf == 0) ? db_->NumberLevels() : db_->NumberLevels(handles_[1]);
-    std::string result;
-    size_t last_non_zero_offset = 0;
-    for (int level = 0; level < num_levels; level++) {
-      int f = NumTableFilesAtLevel(level, cf);
-      char buf[100];
-      snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
-      result += buf;
-      if (f > 0) {
-        last_non_zero_offset = result.size();
-      }
+    db_->Flush(FlushOptions());
+    db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+    ResetTableProperties(&sum_tp);
+    for (int level = 0; level < kMaxLevel; ++level) {
+      db_->GetProperty(
+          DB::Properties::kAggregatedTablePropertiesAtLevel + ToString(level),
+          &level_tp_strings[level]);
+      ParseTablePropertiesString(level_tp_strings[level], &level_tps[level]);
+      sum_tp.data_size += level_tps[level].data_size;
+      sum_tp.index_size += level_tps[level].index_size;
+      sum_tp.filter_size += level_tps[level].filter_size;
+      sum_tp.raw_key_size += level_tps[level].raw_key_size;
+      sum_tp.raw_value_size += level_tps[level].raw_value_size;
+      sum_tp.num_data_blocks += level_tps[level].num_data_blocks;
+      sum_tp.num_entries += level_tps[level].num_entries;
+    }
+    db_->GetProperty(DB::Properties::kAggregatedTableProperties, &tp_string);
+    ParseTablePropertiesString(tp_string, &tp);
+    ASSERT_EQ(sum_tp.data_size, tp.data_size);
+    ASSERT_EQ(sum_tp.index_size, tp.index_size);
+    ASSERT_EQ(sum_tp.filter_size, tp.filter_size);
+    ASSERT_EQ(sum_tp.raw_key_size, tp.raw_key_size);
+    ASSERT_EQ(sum_tp.raw_value_size, tp.raw_value_size);
+    ASSERT_EQ(sum_tp.num_data_blocks, tp.num_data_blocks);
+    ASSERT_EQ(sum_tp.num_entries, tp.num_entries);
+    if (table > 3) {
+      GetExpectedTableProperties(&expected_tp, kKeySize, kValueSize,
+                                 kKeysPerTable, table, kBloomBitsPerKey,
+                                 table_options.block_size);
+      // Allow a larger bias here, as index block size, filter block size,
+      // and data block size become much harder to estimate in this test.
+      VerifyTableProperties(tp, expected_tp, 0.5, 0.4, 0.4, 0.25);
     }
-    result.resize(last_non_zero_offset);
-    return result;
   }
+}
 
-  size_t CountFiles() {
-    std::vector<std::string> files;
-    env_->GetChildren(dbname_, &files);
-
-    std::vector<std::string> logfiles;
-    if (dbname_ != last_options_.wal_dir) {
-      env_->GetChildren(last_options_.wal_dir, &logfiles);
-    }
+class CountingUserTblPropCollector : public TablePropertiesCollector {
+ public:
+  const char* Name() const override { return "CountingUserTblPropCollector"; }
 
-    return files.size() + logfiles.size();
+  Status Finish(UserCollectedProperties* properties) override {
+    std::string encoded;
+    PutVarint32(&encoded, count_);
+    *properties = UserCollectedProperties{
+        {"CoutingUserTblPropCollector", message_}, {"Count", encoded},
+    };
+    return Status::OK();
   }
 
-  size_t CountLiveFiles() {
-    std::vector<LiveFileMetaData> metadata;
-    db_->GetLiveFilesMetaData(&metadata);
-    return metadata.size();
+  Status AddUserKey(const Slice& user_key, const Slice& value, EntryType type,
+                    SequenceNumber seq, uint64_t file_size) override {
+    ++count_;
+    return Status::OK();
   }
 
-  uint64_t Size(const Slice& start, const Slice& limit, int cf = 0) {
-    Range r(start, limit);
-    uint64_t size;
-    if (cf == 0) {
-      db_->GetApproximateSizes(&r, 1, &size);
-    } else {
-      db_->GetApproximateSizes(handles_[1], &r, 1, &size);
-    }
-    return size;
+  virtual UserCollectedProperties GetReadableProperties() const override {
+    return UserCollectedProperties{};
   }
 
-  void Compact(int cf, const Slice& start, const Slice& limit,
-               uint32_t target_path_id) {
-    ASSERT_OK(db_->CompactRange(handles_[cf], &start, &limit, false, -1,
-                                target_path_id));
-  }
+ private:
+  std::string message_ = "Rocksdb";
+  uint32_t count_ = 0;
+};
 
-  void Compact(int cf, const Slice& start, const Slice& limit) {
-    ASSERT_OK(db_->CompactRange(handles_[cf], &start, &limit));
+class CountingUserTblPropCollectorFactory
+    : public TablePropertiesCollectorFactory {
+ public:
+  virtual TablePropertiesCollector* CreateTablePropertiesCollector() override {
+    return new CountingUserTblPropCollector();
   }
-
-  void Compact(const Slice& start, const Slice& limit) {
-    ASSERT_OK(db_->CompactRange(&start, &limit));
+  const char* Name() const override {
+    return "CoutingUserTblPropCollectorFactory";
   }
+};
 
-  // Do n memtable compactions, each of which produces an sstable
-  // covering the range [small,large].
-  void MakeTables(int n, const std::string& small, const std::string& large,
-                  int cf = 0) {
-    for (int i = 0; i < n; i++) {
-      ASSERT_OK(Put(cf, small, "begin"));
-      ASSERT_OK(Put(cf, large, "end"));
-      ASSERT_OK(Flush(cf));
+TEST_F(DBTest, GetUserDefinedTableProperties) {
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = (1<<30);
+  options.max_background_flushes = 0;
+  options.table_properties_collector_factories.resize(1);
+  options.table_properties_collector_factories[0] =
+      std::make_shared<CountingUserTblPropCollectorFactory>();
+  Reopen(options);
+  // Create 4 tables
+  for (int table = 0; table < 4; ++table) {
+    for (int i = 0; i < 10 + table; ++i) {
+      db_->Put(WriteOptions(), ToString(table * 100 + i), "val");
     }
+    db_->Flush(FlushOptions());
   }
 
-  // Prevent pushing of new sstables into deeper levels by adding
-  // tables that cover a specified range to all levels.
-  void FillLevels(const std::string& smallest, const std::string& largest,
-                  int cf) {
-    MakeTables(db_->NumberLevels(handles_[cf]), smallest, largest, cf);
+  TablePropertiesCollection props;
+  ASSERT_OK(db_->GetPropertiesOfAllTables(&props));
+  ASSERT_EQ(4U, props.size());
+  uint32_t sum = 0;
+  for (const auto& item : props) {
+    auto& user_collected = item.second->user_collected_properties;
+    ASSERT_TRUE(user_collected.find("CountingUserTblPropCollector") !=
+                user_collected.end());
+    ASSERT_EQ(user_collected.at("CountingUserTblPropCollector"), "Rocksdb");
+    ASSERT_TRUE(user_collected.find("Count") != user_collected.end());
+    Slice key(user_collected.at("Count"));
+    uint32_t count;
+    ASSERT_TRUE(GetVarint32(&key, &count));
+    sum += count;
   }
+  ASSERT_EQ(10u + 11u + 12u + 13u, sum);
+}
 
-  void DumpFileCounts(const char* label) {
-    fprintf(stderr, "---\n%s:\n", label);
-    fprintf(stderr, "maxoverlap: %lld\n",
-            static_cast<long long>(
-                dbfull()->TEST_MaxNextLevelOverlappingBytes()));
-    for (int level = 0; level < db_->NumberLevels(); level++) {
-      int num = NumTableFilesAtLevel(level);
-      if (num > 0) {
-        fprintf(stderr, "  level %3d : %d files\n", level, num);
-      }
-    }
-  }
+TEST_F(DBTest, LevelLimitReopen) {
+  Options options = CurrentOptions();
+  CreateAndReopenWithCF({"pikachu"}, options);
 
-  std::string DumpSSTableList() {
-    std::string property;
-    db_->GetProperty("rocksdb.sstables", &property);
-    return property;
+  const std::string value(1024 * 1024, ' ');
+  int i = 0;
+  while (NumTableFilesAtLevel(2, 1) == 0) {
+    ASSERT_OK(Put(1, Key(i++), value));
   }
 
-  int GetSstFileCount(std::string path) {
-    std::vector<std::string> files;
-    env_->GetChildren(path, &files);
+  options.num_levels = 1;
+  options.max_bytes_for_level_multiplier_additional.resize(1, 1);
+  Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, options);
+  ASSERT_EQ(s.IsInvalidArgument(), true);
+  ASSERT_EQ(s.ToString(),
+            "Invalid argument: db has more levels than options.num_levels");
 
-    int sst_count = 0;
-    uint64_t number;
-    FileType type;
-    for (size_t i = 0; i < files.size(); i++) {
-      if (ParseFileName(files[i], &number, &type) && type == kTableFile) {
-        sst_count++;
-      }
-    }
-    return sst_count;
-  }
+  options.num_levels = 10;
+  options.max_bytes_for_level_multiplier_additional.resize(10, 1);
+  ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options));
+}
 
-  // this will generate non-overlapping files since it keeps increasing key_idx
-  void GenerateNewFile(Random* rnd, int* key_idx, bool nowait = false) {
-    for (int i = 0; i < 11; i++) {
-      ASSERT_OK(Put(Key(*key_idx), RandomString(rnd, (i == 10) ? 1 : 10000)));
-      (*key_idx)++;
-    }
-    if (!nowait) {
-      dbfull()->TEST_WaitForFlushMemTable();
-      dbfull()->TEST_WaitForCompact();
-    }
-  }
+TEST_F(DBTest, PutDeleteGet) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    ASSERT_OK(Put(1, "foo", "v1"));
+    ASSERT_EQ("v1", Get(1, "foo"));
+    ASSERT_OK(Put(1, "foo", "v2"));
+    ASSERT_EQ("v2", Get(1, "foo"));
+    ASSERT_OK(Delete(1, "foo"));
+    ASSERT_EQ("NOT_FOUND", Get(1, "foo"));
+  } while (ChangeOptions());
+}
 
-  void GenerateNewRandomFile(Random* rnd, bool nowait = false) {
-    for (int i = 0; i < 100; i++) {
-      ASSERT_OK(Put("key" + RandomString(rnd, 7), RandomString(rnd, 1000)));
-    }
-    ASSERT_OK(Put("key" + RandomString(rnd, 7), RandomString(rnd, 1)));
-    if (!nowait) {
-      dbfull()->TEST_WaitForFlushMemTable();
-      dbfull()->TEST_WaitForCompact();
-    }
-  }
+TEST_F(DBTest, PutSingleDeleteGet) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    ASSERT_OK(Put(1, "foo", "v1"));
+    ASSERT_EQ("v1", Get(1, "foo"));
+    ASSERT_OK(Put(1, "foo2", "v2"));
+    ASSERT_EQ("v2", Get(1, "foo2"));
+    ASSERT_OK(SingleDelete(1, "foo"));
+    ASSERT_EQ("NOT_FOUND", Get(1, "foo"));
+    // Skip HashCuckooRep as it does not support single delete. FIFO and
+    // universal compaction do not apply to the test case. Skip MergePut
+    // because single delete does not get removed when it encounters a merge.
+  } while (ChangeOptions(kSkipHashCuckoo | kSkipFIFOCompaction |
+                         kSkipUniversalCompaction | kSkipMergePut));
+}
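+
+// SingleDelete, sketched: it is only guaranteed to remove a key that was
+// written at most once since the last deletion (a single Put, no Merge),
+// which is why the MergePut configuration is skipped above. `db` and `wo`
+// stand for an open rocksdb::DB* and a WriteOptions:
+//
+//   ASSERT_OK(db->Put(wo, "k", "v"));      // exactly one Put for "k"
+//   ASSERT_OK(db->SingleDelete(wo, "k"));  // the tombstone can be dropped
+//                                          // as soon as it meets that Put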
 
-  std::string IterStatus(Iterator* iter) {
-    std::string result;
-    if (iter->Valid()) {
-      result = iter->key().ToString() + "->" + iter->value().ToString();
-    } else {
-      result = "(invalid)";
-    }
-    return result;
-  }
+TEST_F(DBTest, SingleDeleteFlush) {
+  // Test to check whether flushing preserves a single delete hidden
+  // behind a put.
+  do {
+    Random rnd(301);
 
-  Options OptionsForLogIterTest() {
     Options options = CurrentOptions();
-    options.create_if_missing = true;
-    options.WAL_ttl_seconds = 1000;
-    return options;
-  }
+    options.disable_auto_compactions = true;
+    CreateAndReopenWithCF({"pikachu"}, options);
 
-  std::unique_ptr<TransactionLogIterator> OpenTransactionLogIter(
-      const SequenceNumber seq) {
-    unique_ptr<TransactionLogIterator> iter;
-    Status status = dbfull()->GetUpdatesSince(seq, &iter);
-    EXPECT_OK(status);
-    EXPECT_TRUE(iter->Valid());
-    return std::move(iter);
-  }
+    // Put values on the second level (so that they will not be in the same
+    // compaction as the other operations).
+    Put(1, "foo", "first");
+    Put(1, "bar", "one");
+    ASSERT_OK(Flush(1));
+    MoveFilesToLevel(2, 1);
 
-  std::string DummyString(size_t len, char c = 'a') {
-    return std::string(len, c);
-  }
+    // (Single) delete hidden by a put
+    SingleDelete(1, "foo");
+    Put(1, "foo", "second");
+    Delete(1, "bar");
+    Put(1, "bar", "two");
+    ASSERT_OK(Flush(1));
 
-  void VerifyIterLast(std::string expected_key, int cf = 0) {
-    Iterator* iter;
-    ReadOptions ro;
-    if (cf == 0) {
-      iter = db_->NewIterator(ro);
-    } else {
-      iter = db_->NewIterator(ro, handles_[cf]);
-    }
-    iter->SeekToLast();
-    ASSERT_EQ(IterStatus(iter), expected_key);
-    delete iter;
-  }
+    SingleDelete(1, "foo");
+    Delete(1, "bar");
+    ASSERT_OK(Flush(1));
 
-  // Used to test InplaceUpdate
+    dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr,
+                           nullptr);
 
-  // If previous value is nullptr or delta is > than previous value,
-  //   sets newValue with delta
-  // If previous value is not empty,
-  //   updates previous value with 'b' string of previous value size - 1.
-  static UpdateStatus
-      updateInPlaceSmallerSize(char* prevValue, uint32_t* prevSize,
-                               Slice delta, std::string* newValue) {
-    if (prevValue == nullptr) {
-      *newValue = std::string(delta.size(), 'c');
-      return UpdateStatus::UPDATED;
-    } else {
-      *prevSize = *prevSize - 1;
-      std::string str_b = std::string(*prevSize, 'b');
-      memcpy(prevValue, str_b.c_str(), str_b.size());
-      return UpdateStatus::UPDATED_INPLACE;
-    }
-  }
-
-  static UpdateStatus
-      updateInPlaceSmallerVarintSize(char* prevValue, uint32_t* prevSize,
-                                     Slice delta, std::string* newValue) {
-    if (prevValue == nullptr) {
-      *newValue = std::string(delta.size(), 'c');
-      return UpdateStatus::UPDATED;
-    } else {
-      *prevSize = 1;
-      std::string str_b = std::string(*prevSize, 'b');
-      memcpy(prevValue, str_b.c_str(), str_b.size());
-      return UpdateStatus::UPDATED_INPLACE;
-    }
-  }
-
-  static UpdateStatus
-      updateInPlaceLargerSize(char* prevValue, uint32_t* prevSize,
-                              Slice delta, std::string* newValue) {
-    *newValue = std::string(delta.size(), 'c');
-    return UpdateStatus::UPDATED;
-  }
-
-  static UpdateStatus
-      updateInPlaceNoAction(char* prevValue, uint32_t* prevSize,
-                            Slice delta, std::string* newValue) {
-    return UpdateStatus::UPDATE_FAILED;
-  }
-
-  // Utility method to test InplaceUpdate
-  void validateNumberOfEntries(int numValues, int cf = 0) {
-    ScopedArenaIterator iter;
-    Arena arena;
-    if (cf != 0) {
-      iter.set(dbfull()->TEST_NewInternalIterator(&arena, handles_[cf]));
-    } else {
-      iter.set(dbfull()->TEST_NewInternalIterator(&arena));
-    }
-    iter->SeekToFirst();
-    ASSERT_EQ(iter->status().ok(), true);
-    int seq = numValues;
-    while (iter->Valid()) {
-      ParsedInternalKey ikey;
-      ikey.sequence = -1;
-      ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true);
-
-      // checks sequence number for updates
-      ASSERT_EQ(ikey.sequence, (unsigned)seq--);
-      iter->Next();
-    }
-    ASSERT_EQ(0, seq);
-  }
-
-  void CopyFile(const std::string& source, const std::string& destination,
-                uint64_t size = 0) {
-    const EnvOptions soptions;
-    unique_ptr<SequentialFile> srcfile;
-    ASSERT_OK(env_->NewSequentialFile(source, &srcfile, soptions));
-    unique_ptr<WritableFile> destfile;
-    ASSERT_OK(env_->NewWritableFile(destination, &destfile, soptions));
+    ASSERT_EQ("NOT_FOUND", Get(1, "bar"));
+    ASSERT_EQ("NOT_FOUND", Get(1, "foo"));
+    // Skip HashCuckooRep as it does not support single delete. FIFO and
+    // universal compaction do not apply to the test case. Skip MergePut
+    // because merges cannot be combined with single deletions.
+  } while (ChangeOptions(kSkipHashCuckoo | kSkipFIFOCompaction |
+                         kSkipUniversalCompaction | kSkipMergePut));
+}
 
-    if (size == 0) {
-      // default argument means copy everything
-      ASSERT_OK(env_->GetFileSize(source, &size));
-    }
+TEST_F(DBTest, SingleDeletePutFlush) {
+  // Single deletes that encounter the matching put in a flush should get
+  // removed.
+  do {
+    Random rnd(301);
 
-    char buffer[4096];
-    Slice slice;
-    while (size > 0) {
-      uint64_t one = std::min(uint64_t(sizeof(buffer)), size);
-      ASSERT_OK(srcfile->Read(one, &slice, buffer));
-      ASSERT_OK(destfile->Append(slice));
-      size -= slice.size();
-    }
-    ASSERT_OK(destfile->Close());
-  }
+    Options options = CurrentOptions();
+    options.disable_auto_compactions = true;
+    CreateAndReopenWithCF({"pikachu"}, options);
 
-};
+    Put(1, "foo", Slice());
+    Put(1, "a", Slice());
+    SingleDelete(1, "a");
+    ASSERT_OK(Flush(1));
 
-static long TestGetTickerCount(const Options& options, Tickers ticker_type) {
-  return options.statistics->getTickerCount(ticker_type);
+    ASSERT_EQ("[ ]", AllEntriesFor("a", 1));
+    // Skip HashCuckooRep as it does not support single delete. FIFO and
+    // universal compaction do not apply to the test case. Skip MergePut
+    // because merges cannot be combined with single deletions.
+  } while (ChangeOptions(kSkipHashCuckoo | kSkipFIFOCompaction |
+                         kSkipUniversalCompaction | kSkipMergePut));
 }
 
-// A helper function that ensures the table properties returned in
-// `GetPropertiesOfAllTablesTest` is correct.
-// This test assumes entries size is different for each of the tables.
-namespace {
-void VerifyTableProperties(DB* db, uint64_t expected_entries_size) {
-  TablePropertiesCollection props;
-  ASSERT_OK(db->GetPropertiesOfAllTables(&props));
-
-  ASSERT_EQ(4U, props.size());
-  std::unordered_set<uint64_t> unique_entries;
+TEST_F(DBTest, EmptyFlush) {
+  // It is possible to produce empty flushes when using single deletes. Tests
+  // whether empty flushes cause issues.
+  do {
+    Random rnd(301);
 
-  // Indirect test
-  uint64_t sum = 0;
-  for (const auto& item : props) {
-    unique_entries.insert(item.second->num_entries);
-    sum += item.second->num_entries;
-  }
+    Options options = CurrentOptions();
+    options.disable_auto_compactions = true;
+    CreateAndReopenWithCF({"pikachu"}, options);
 
-  ASSERT_EQ(props.size(), unique_entries.size());
-  ASSERT_EQ(expected_entries_size, sum);
-}
+    Put(1, "a", Slice());
+    SingleDelete(1, "a");
+    ASSERT_OK(Flush(1));
 
-uint64_t GetNumberOfSstFilesForColumnFamily(DB* db,
-                                            std::string column_family_name) {
-  std::vector<LiveFileMetaData> metadata;
-  db->GetLiveFilesMetaData(&metadata);
-  uint64_t result = 0;
-  for (auto& fileMetadata : metadata) {
-    result += (fileMetadata.column_family_name == column_family_name);
-  }
-  return result;
+    ASSERT_EQ("[ ]", AllEntriesFor("a", 1));
+    // Skip HashCuckooRep as it does not support single delete. FIFO and
+    // universal compaction do not apply to the test case. Skip MergePut
+    // because merges cannot be combined with single deletions.
+  } while (ChangeOptions(kSkipHashCuckoo | kSkipFIFOCompaction |
+                         kSkipUniversalCompaction | kSkipMergePut));
 }
-}  // namespace
 
-TEST_F(DBTest, Empty) {
+TEST_F(DBTest, GetFromImmutableLayer) {
   do {
     Options options;
     options.env = env_;
@@ -1354,8270 +1004,5149 @@ TEST_F(DBTest, Empty) {
     options = CurrentOptions(options);
     CreateAndReopenWithCF({"pikachu"}, options);
 
-    std::string num;
-    ASSERT_TRUE(dbfull()->GetProperty(
-        handles_[1], "rocksdb.num-entries-active-mem-table", &num));
-    ASSERT_EQ("0", num);
-
     ASSERT_OK(Put(1, "foo", "v1"));
     ASSERT_EQ("v1", Get(1, "foo"));
-    ASSERT_TRUE(dbfull()->GetProperty(
-        handles_[1], "rocksdb.num-entries-active-mem-table", &num));
-    ASSERT_EQ("1", num);
 
     // Block sync calls
     env_->delay_sstable_sync_.store(true, std::memory_order_release);
-    Put(1, "k1", std::string(100000, 'x'));         // Fill memtable
-    ASSERT_TRUE(dbfull()->GetProperty(
-        handles_[1], "rocksdb.num-entries-active-mem-table", &num));
-    ASSERT_EQ("2", num);
-
-    Put(1, "k2", std::string(100000, 'y'));         // Trigger compaction
-    ASSERT_TRUE(dbfull()->GetProperty(
-        handles_[1], "rocksdb.num-entries-active-mem-table", &num));
-    ASSERT_EQ("1", num);
-
+    Put(1, "k1", std::string(100000, 'x'));          // Fill memtable
+    Put(1, "k2", std::string(100000, 'y'));          // Trigger flush
     ASSERT_EQ("v1", Get(1, "foo"));
+    ASSERT_EQ("NOT_FOUND", Get(0, "foo"));
     // Release sync calls
     env_->delay_sstable_sync_.store(false, std::memory_order_release);
-
-    ASSERT_OK(db_->DisableFileDeletions());
-    ASSERT_TRUE(
-        dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num));
-    ASSERT_EQ("1", num);
-
-    ASSERT_OK(db_->DisableFileDeletions());
-    ASSERT_TRUE(
-        dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num));
-    ASSERT_EQ("2", num);
-
-    ASSERT_OK(db_->DisableFileDeletions());
-    ASSERT_TRUE(
-        dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num));
-    ASSERT_EQ("3", num);
-
-    ASSERT_OK(db_->EnableFileDeletions(false));
-    ASSERT_TRUE(
-        dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num));
-    ASSERT_EQ("2", num);
-
-    ASSERT_OK(db_->EnableFileDeletions());
-    ASSERT_TRUE(
-        dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num));
-    ASSERT_EQ("0", num);
   } while (ChangeOptions());
 }
 
-TEST_F(DBTest, WriteEmptyBatch) {
-  Options options;
-  options.env = env_;
-  options.write_buffer_size = 100000;
-  options = CurrentOptions(options);
-  CreateAndReopenWithCF({"pikachu"}, options);
+TEST_F(DBTest, GetFromVersions) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    ASSERT_OK(Put(1, "foo", "v1"));
+    ASSERT_OK(Flush(1));
+    ASSERT_EQ("v1", Get(1, "foo"));
+    ASSERT_EQ("NOT_FOUND", Get(0, "foo"));
+  } while (ChangeOptions());
+}
 
-  ASSERT_OK(Put(1, "foo", "bar"));
-  env_->sync_counter_.store(0);
-  WriteOptions wo;
-  wo.sync = true;
-  wo.disableWAL = false;
-  WriteBatch empty_batch;
-  ASSERT_OK(dbfull()->Write(wo, &empty_batch));
-  ASSERT_GE(env_->sync_counter_.load(), 1);
+TEST_F(DBTest, GetSnapshot) {
+  anon::OptionsOverride options_override;
+  options_override.skip_policy = kSkipNoSnapshot;
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions(options_override));
+    // Try with both a short key and a long key
+    for (int i = 0; i < 2; i++) {
+      std::string key = (i == 0) ? std::string("foo") : std::string(200, 'x');
+      ASSERT_OK(Put(1, key, "v1"));
+      const Snapshot* s1 = db_->GetSnapshot();
+      if (option_config_ == kHashCuckoo) {
+        // Unsupported case.
+        ASSERT_TRUE(s1 == nullptr);
+        break;
+      }
+      ASSERT_OK(Put(1, key, "v2"));
+      ASSERT_EQ("v2", Get(1, key));
+      ASSERT_EQ("v1", Get(1, key, s1));
+      ASSERT_OK(Flush(1));
+      ASSERT_EQ("v2", Get(1, key));
+      ASSERT_EQ("v1", Get(1, key, s1));
+      db_->ReleaseSnapshot(s1);
+    }
+  } while (ChangeOptions());
+}
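+
+// A minimal sketch of the snapshot read pattern exercised above (public
+// rocksdb API; the db pointer stands in for an open database). A snapshot
+// pins the sequence number at acquisition time, so reads through it keep
+// returning the old value until the snapshot is released.
+static std::string ReadAtSnapshot(DB* db, const Slice& key,
+                                  const Snapshot* snap) {
+  ReadOptions ro;
+  ro.snapshot = snap;  // nullptr means "read the latest state"
+  std::string value;
+  Status s = db->Get(ro, key, &value);
+  return s.ok() ? value : "NOT_FOUND";
+}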
 
-  // make sure we can re-open it.
-  ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options));
-  ASSERT_EQ("bar", Get(1, "foo"));
+TEST_F(DBTest, GetLevel0Ordering) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    // Check that we process level-0 files in correct order.  The code
+    // below generates two level-0 files where the earlier one comes
+    // before the later one in the level-0 file list since the earlier
+    // one has a smaller "smallest" key.
+    ASSERT_OK(Put(1, "bar", "b"));
+    ASSERT_OK(Put(1, "foo", "v1"));
+    ASSERT_OK(Flush(1));
+    ASSERT_OK(Put(1, "foo", "v2"));
+    ASSERT_OK(Flush(1));
+    ASSERT_EQ("v2", Get(1, "foo"));
+  } while (ChangeOptions());
 }
 
-TEST_F(DBTest, ReadOnlyDB) {
-  ASSERT_OK(Put("foo", "v1"));
-  ASSERT_OK(Put("bar", "v2"));
-  ASSERT_OK(Put("foo", "v3"));
+TEST_F(DBTest, WrongLevel0Config) {
+  Options options = CurrentOptions();
   Close();
+  ASSERT_OK(DestroyDB(dbname_, options));
+  options.level0_stop_writes_trigger = 1;
+  options.level0_slowdown_writes_trigger = 2;
+  options.level0_file_num_compaction_trigger = 3;
+  ASSERT_OK(DB::Open(options, dbname_, &db_));
+}
 
-  auto options = CurrentOptions();
-  assert(options.env = env_);
-  ASSERT_OK(ReadOnlyReopen(options));
-  ASSERT_EQ("v3", Get("foo"));
-  ASSERT_EQ("v2", Get("bar"));
-  Iterator* iter = db_->NewIterator(ReadOptions());
-  int count = 0;
-  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
-    ASSERT_OK(iter->status());
-    ++count;
-  }
-  ASSERT_EQ(count, 2);
-  delete iter;
-  Close();
+TEST_F(DBTest, GetOrderedByLevels) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    ASSERT_OK(Put(1, "foo", "v1"));
+    Compact(1, "a", "z");
+    ASSERT_EQ("v1", Get(1, "foo"));
+    ASSERT_OK(Put(1, "foo", "v2"));
+    ASSERT_EQ("v2", Get(1, "foo"));
+    ASSERT_OK(Flush(1));
+    ASSERT_EQ("v2", Get(1, "foo"));
+  } while (ChangeOptions());
+}
 
-  // Reopen and flush memtable.
-  Reopen(options);
-  Flush();
-  Close();
-  // Now check keys in read only mode.
-  ASSERT_OK(ReadOnlyReopen(options));
-  ASSERT_EQ("v3", Get("foo"));
-  ASSERT_EQ("v2", Get("bar"));
+TEST_F(DBTest, GetPicksCorrectFile) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    // Arrange to have multiple files in a non-level-0 level.
+    ASSERT_OK(Put(1, "a", "va"));
+    Compact(1, "a", "b");
+    ASSERT_OK(Put(1, "x", "vx"));
+    Compact(1, "x", "y");
+    ASSERT_OK(Put(1, "f", "vf"));
+    Compact(1, "f", "g");
+    ASSERT_EQ("va", Get(1, "a"));
+    ASSERT_EQ("vf", Get(1, "f"));
+    ASSERT_EQ("vx", Get(1, "x"));
+  } while (ChangeOptions());
 }
 
-TEST_F(DBTest, CompactedDB) {
-  const uint64_t kFileSize = 1 << 20;
-  Options options;
-  options.disable_auto_compactions = true;
-  options.max_mem_compaction_level = 0;
-  options.write_buffer_size = kFileSize;
-  options.target_file_size_base = kFileSize;
-  options.max_bytes_for_level_base = 1 << 30;
-  options.compression = kNoCompression;
-  options = CurrentOptions(options);
-  Reopen(options);
-  // 1 L0 file, use CompactedDB if max_open_files = -1
-  ASSERT_OK(Put("aaa", DummyString(kFileSize / 2, '1')));
-  Flush();
-  Close();
-  ASSERT_OK(ReadOnlyReopen(options));
-  Status s = Put("new", "value");
-  ASSERT_EQ(s.ToString(),
-            "Not implemented: Not supported operation in read only mode.");
-  ASSERT_EQ(DummyString(kFileSize / 2, '1'), Get("aaa"));
-  Close();
-  options.max_open_files = -1;
-  ASSERT_OK(ReadOnlyReopen(options));
-  s = Put("new", "value");
-  ASSERT_EQ(s.ToString(),
-            "Not implemented: Not supported in compacted db mode.");
-  ASSERT_EQ(DummyString(kFileSize / 2, '1'), Get("aaa"));
-  Close();
-  Reopen(options);
-  // Add more L0 files
-  ASSERT_OK(Put("bbb", DummyString(kFileSize / 2, '2')));
-  Flush();
-  ASSERT_OK(Put("aaa", DummyString(kFileSize / 2, 'a')));
-  Flush();
-  ASSERT_OK(Put("bbb", DummyString(kFileSize / 2, 'b')));
-  Flush();
-  Close();
+TEST_F(DBTest, GetEncountersEmptyLevel) {
+  do {
+    Options options = CurrentOptions();
+    options.disableDataSync = true;
+    CreateAndReopenWithCF({"pikachu"}, options);
+    // Arrange for the following to happen:
+    //   * sstable A in level 0
+    //   * nothing in level 1
+    //   * sstable B in level 2
+    // Then do enough Get() calls to arrange for an automatic compaction
+    // of sstable A.  A bug would cause the compaction to be marked as
+    // occurring at level 1 (instead of the correct level 0).
 
-  ASSERT_OK(ReadOnlyReopen(options));
-  // Fallback to read-only DB
-  s = Put("new", "value");
-  ASSERT_EQ(s.ToString(),
-            "Not implemented: Not supported operation in read only mode.");
-  Close();
+    // Step 1: First place sstables in levels 0 and 2
+    Put(1, "a", "begin");
+    Put(1, "z", "end");
+    ASSERT_OK(Flush(1));
+    dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
+    dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]);
+    Put(1, "a", "begin");
+    Put(1, "z", "end");
+    ASSERT_OK(Flush(1));
+    ASSERT_GT(NumTableFilesAtLevel(0, 1), 0);
+    ASSERT_GT(NumTableFilesAtLevel(2, 1), 0);
 
-  // Full compaction
-  Reopen(options);
-  // Add more keys
-  ASSERT_OK(Put("eee", DummyString(kFileSize / 2, 'e')));
-  ASSERT_OK(Put("fff", DummyString(kFileSize / 2, 'f')));
-  ASSERT_OK(Put("hhh", DummyString(kFileSize / 2, 'h')));
-  ASSERT_OK(Put("iii", DummyString(kFileSize / 2, 'i')));
-  ASSERT_OK(Put("jjj", DummyString(kFileSize / 2, 'j')));
-  db_->CompactRange(nullptr, nullptr);
-  ASSERT_EQ(3, NumTableFilesAtLevel(1));
-  Close();
+    // Step 2: clear level 1 if necessary.
+    dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]);
+    ASSERT_EQ(NumTableFilesAtLevel(0, 1), 1);
+    ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0);
+    ASSERT_EQ(NumTableFilesAtLevel(2, 1), 1);
 
-  // CompactedDB
-  ASSERT_OK(ReadOnlyReopen(options));
-  s = Put("new", "value");
-  ASSERT_EQ(s.ToString(),
-            "Not implemented: Not supported in compacted db mode.");
-  ASSERT_EQ("NOT_FOUND", Get("abc"));
-  ASSERT_EQ(DummyString(kFileSize / 2, 'a'), Get("aaa"));
-  ASSERT_EQ(DummyString(kFileSize / 2, 'b'), Get("bbb"));
-  ASSERT_EQ("NOT_FOUND", Get("ccc"));
-  ASSERT_EQ(DummyString(kFileSize / 2, 'e'), Get("eee"));
-  ASSERT_EQ(DummyString(kFileSize / 2, 'f'), Get("fff"));
-  ASSERT_EQ("NOT_FOUND", Get("ggg"));
-  ASSERT_EQ(DummyString(kFileSize / 2, 'h'), Get("hhh"));
-  ASSERT_EQ(DummyString(kFileSize / 2, 'i'), Get("iii"));
-  ASSERT_EQ(DummyString(kFileSize / 2, 'j'), Get("jjj"));
-  ASSERT_EQ("NOT_FOUND", Get("kkk"));
+    // Step 3: read a bunch of times
+    for (int i = 0; i < 1000; i++) {
+      ASSERT_EQ("NOT_FOUND", Get(1, "missing"));
+    }
 
-  // MultiGet
-  std::vector<std::string> values;
-  std::vector<Status> status_list = dbfull()->MultiGet(ReadOptions(),
-      std::vector<Slice>({Slice("aaa"), Slice("ccc"), Slice("eee"),
-                          Slice("ggg"), Slice("iii"), Slice("kkk")}),
-      &values);
-  ASSERT_EQ(status_list.size(), static_cast<uint64_t>(6));
-  ASSERT_EQ(values.size(), static_cast<uint64_t>(6));
-  ASSERT_OK(status_list[0]);
-  ASSERT_EQ(DummyString(kFileSize / 2, 'a'), values[0]);
-  ASSERT_TRUE(status_list[1].IsNotFound());
-  ASSERT_OK(status_list[2]);
-  ASSERT_EQ(DummyString(kFileSize / 2, 'e'), values[2]);
-  ASSERT_TRUE(status_list[3].IsNotFound());
-  ASSERT_OK(status_list[4]);
-  ASSERT_EQ(DummyString(kFileSize / 2, 'i'), values[4]);
-  ASSERT_TRUE(status_list[5].IsNotFound());
-}
+    // Step 4: Wait for compaction to finish
+    dbfull()->TEST_WaitForCompact();
 
-// Make sure that when options.block_cache is set, after a new table is
-// created its index/filter blocks are added to block cache.
-TEST_F(DBTest, IndexAndFilterBlocksOfNewTableAddedToCache) {
-  Options options = CurrentOptions();
-  options.create_if_missing = true;
-  options.statistics = rocksdb::CreateDBStatistics();
-  BlockBasedTableOptions table_options;
-  table_options.cache_index_and_filter_blocks = true;
-  table_options.filter_policy.reset(NewBloomFilterPolicy(20));
-  options.table_factory.reset(new BlockBasedTableFactory(table_options));
-  CreateAndReopenWithCF({"pikachu"}, options);
+    ASSERT_EQ(NumTableFilesAtLevel(0, 1), 1);  // XXX
+  } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction));
+}
 
-  ASSERT_OK(Put(1, "key", "val"));
-  // Create a new table.
-  ASSERT_OK(Flush(1));
+// KeyMayExist can lead to a few false positives, but not false negatives.
+// To make the test deterministic, use a much larger number of bits per key
+// (20) than bits in the key, so that false positives are eliminated.
+TEST_F(DBTest, KeyMayExist) {
+  do {
+    ReadOptions ropts;
+    std::string value;
+    anon::OptionsOverride options_override;
+    options_override.filter_policy.reset(NewBloomFilterPolicy(20));
+    Options options = CurrentOptions(options_override);
+    options.statistics = rocksdb::CreateDBStatistics();
+    CreateAndReopenWithCF({"pikachu"}, options);
 
-  // index/filter blocks added to block cache right after table creation.
-  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
-  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
-  ASSERT_EQ(2, /* only index/filter were added */
-            TestGetTickerCount(options, BLOCK_CACHE_ADD));
-  ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_DATA_MISS));
-  uint64_t int_num;
-  ASSERT_TRUE(
-      dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num));
-  ASSERT_EQ(int_num, 0U);
+    ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value));
 
-  // Make sure filter block is in cache.
-  std::string value;
-  ReadOptions ropt;
-  db_->KeyMayExist(ReadOptions(), handles_[1], "key", &value);
+    ASSERT_OK(Put(1, "a", "b"));
+    bool value_found = false;
+    ASSERT_TRUE(
+        db_->KeyMayExist(ropts, handles_[1], "a", &value, &value_found));
+    ASSERT_TRUE(value_found);
+    ASSERT_EQ("b", value);
 
-  // Miss count should remain the same.
-  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
-  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+    ASSERT_OK(Flush(1));
+    value.clear();
 
-  db_->KeyMayExist(ReadOptions(), handles_[1], "key", &value);
-  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
-  ASSERT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+    long numopen = TestGetTickerCount(options, NO_FILE_OPENS);
+    long cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+    ASSERT_TRUE(
+        db_->KeyMayExist(ropts, handles_[1], "a", &value, &value_found));
+    ASSERT_TRUE(!value_found);
+    // assert that no new files were opened and no new blocks were
+    // read into block cache.
+    ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+    ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
 
-  // Make sure index block is in cache.
-  auto index_block_hit = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT);
-  value = Get(1, "key");
-  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
-  ASSERT_EQ(index_block_hit + 1,
-            TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+    ASSERT_OK(Delete(1, "a"));
 
-  value = Get(1, "key");
-  ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
-  ASSERT_EQ(index_block_hit + 2,
-            TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
-}
+    numopen = TestGetTickerCount(options, NO_FILE_OPENS);
+    cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+    ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value));
+    ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+    ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
 
-TEST_F(DBTest, ParanoidFileChecks) {
-  Options options = CurrentOptions();
-  options.create_if_missing = true;
-  options.statistics = rocksdb::CreateDBStatistics();
-  options.level0_file_num_compaction_trigger = 2;
-  options.paranoid_file_checks = true;
-  BlockBasedTableOptions table_options;
-  table_options.cache_index_and_filter_blocks = false;
-  table_options.filter_policy.reset(NewBloomFilterPolicy(20));
-  options.table_factory.reset(new BlockBasedTableFactory(table_options));
-  CreateAndReopenWithCF({"pikachu"}, options);
+    ASSERT_OK(Flush(1));
+    dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1],
+                                true /* disallow trivial move */);
 
-  ASSERT_OK(Put(1, "1_key", "val"));
-  ASSERT_OK(Put(1, "9_key", "val"));
-  // Create a new table.
-  ASSERT_OK(Flush(1));
-  ASSERT_EQ(1, /* read and cache data block */
-            TestGetTickerCount(options, BLOCK_CACHE_ADD));
+    numopen = TestGetTickerCount(options, NO_FILE_OPENS);
+    cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+    ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value));
+    ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+    ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
 
-  ASSERT_OK(Put(1, "1_key2", "val2"));
-  ASSERT_OK(Put(1, "9_key2", "val2"));
-  // Create a new SST file. This will further trigger a compaction
-  // and generate another file.
-  ASSERT_OK(Flush(1));
-  dbfull()->TEST_WaitForCompact();
-  ASSERT_EQ(3, /* Totally 3 files created up to now */
-            TestGetTickerCount(options, BLOCK_CACHE_ADD));
+    ASSERT_OK(Delete(1, "c"));
 
-  // After disabling options.paranoid_file_checks. NO further block
-  // is added after generating a new file.
-  ASSERT_OK(
-      dbfull()->SetOptions(handles_[1], {{"paranoid_file_checks", "false"}}));
+    numopen = TestGetTickerCount(options, NO_FILE_OPENS);
+    cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+    ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "c", &value));
+    ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+    ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
 
-  ASSERT_OK(Put(1, "1_key3", "val3"));
-  ASSERT_OK(Put(1, "9_key3", "val3"));
-  ASSERT_OK(Flush(1));
-  ASSERT_OK(Put(1, "1_key4", "val4"));
-  ASSERT_OK(Put(1, "9_key4", "val4"));
-  ASSERT_OK(Flush(1));
-  dbfull()->TEST_WaitForCompact();
-  ASSERT_EQ(3, /* Totally 3 files created up to now */
-            TestGetTickerCount(options, BLOCK_CACHE_ADD));
+    // The KeyMayExist function only checks data in the block cache, which is
+    // not used by the plain table format.
+  } while (
+      ChangeOptions(kSkipPlainTable | kSkipHashIndex | kSkipFIFOCompaction));
 }
 
-TEST_F(DBTest, GetPropertiesOfAllTablesTest) {
-  Options options = CurrentOptions();
-  options.max_background_flushes = 0;
-  Reopen(options);
-  // Create 4 tables
-  for (int table = 0; table < 4; ++table) {
-    for (int i = 0; i < 10 + table; ++i) {
-      db_->Put(WriteOptions(), ToString(table * 100 + i), "val");
-    }
-    db_->Flush(FlushOptions());
-  }
-
-  // 1. Read table properties directly from file
-  Reopen(options);
-  VerifyTableProperties(db_, 10 + 11 + 12 + 13);
-
-  // 2. Put two tables to table cache and
-  Reopen(options);
-  // fetch key from 1st and 2nd table, which will internally place that table to
-  // the table cache.
-  for (int i = 0; i < 2; ++i) {
-    Get(ToString(i * 100 + 0));
-  }
-
-  VerifyTableProperties(db_, 10 + 11 + 12 + 13);
+TEST_F(DBTest, NonBlockingIteration) {
+  do {
+    ReadOptions non_blocking_opts, regular_opts;
+    Options options = CurrentOptions();
+    options.statistics = rocksdb::CreateDBStatistics();
+    non_blocking_opts.read_tier = kBlockCacheTier;
+    CreateAndReopenWithCF({"pikachu"}, options);
+    // write one kv to the database.
+    ASSERT_OK(Put(1, "a", "b"));
 
-  // 3. Put all tables to table cache
-  Reopen(options);
-  // fetch key from 1st and 2nd table, which will internally place that table to
-  // the table cache.
-  for (int i = 0; i < 4; ++i) {
-    Get(ToString(i * 100 + 0));
-  }
-  VerifyTableProperties(db_, 10 + 11 + 12 + 13);
-}
+    // scan using non-blocking iterator. We should find it because
+    // it is in memtable.
+    Iterator* iter = db_->NewIterator(non_blocking_opts, handles_[1]);
+    int count = 0;
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      ASSERT_OK(iter->status());
+      count++;
+    }
+    ASSERT_EQ(count, 1);
+    delete iter;
 
-class CoutingUserTblPropCollector : public TablePropertiesCollector {
- public:
-  const char* Name() const override { return "CoutingUserTblPropCollector"; }
+    // Flush the memtable to storage. Now the key should be neither in the
+    // memtable nor in the block cache.
+    ASSERT_OK(Flush(1));
 
-  Status Finish(UserCollectedProperties* properties) override {
-    std::string encoded;
-    PutVarint32(&encoded, count_);
-    *properties = UserCollectedProperties{
-        {"CoutingUserTblPropCollector", message_}, {"Count", encoded},
-    };
-    return Status::OK();
-  }
+    // verify that a non-blocking iterator does not find any
+    // kvs. Neither does it do any IOs to storage.
+    long numopen = TestGetTickerCount(options, NO_FILE_OPENS);
+    long cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+    iter = db_->NewIterator(non_blocking_opts, handles_[1]);
+    count = 0;
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      count++;
+    }
+    ASSERT_EQ(count, 0);
+    ASSERT_TRUE(iter->status().IsIncomplete());
+    ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+    ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+    delete iter;
 
-  Status AddUserKey(const Slice& user_key, const Slice& value, EntryType type,
-                    SequenceNumber seq, uint64_t file_size) override {
-    ++count_;
-    return Status::OK();
-  }
+    // read in the specified block via a regular get
+    ASSERT_EQ(Get(1, "a"), "b");
 
-  virtual UserCollectedProperties GetReadableProperties() const override {
-    return UserCollectedProperties{};
-  }
+    // verify that we can find it via a non-blocking scan
+    numopen = TestGetTickerCount(options, NO_FILE_OPENS);
+    cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+    iter = db_->NewIterator(non_blocking_opts, handles_[1]);
+    count = 0;
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      ASSERT_OK(iter->status());
+      count++;
+    }
+    ASSERT_EQ(count, 1);
+    ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+    ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+    delete iter;
 
- private:
-  std::string message_ = "Rocksdb";
-  uint32_t count_ = 0;
-};
+    // This test verifies block cache behaviors, which is not used by plain
+    // table format.
+    // Exclude kHashCuckoo as it does not support iteration currently
+  } while (ChangeOptions(kSkipPlainTable | kSkipNoSeekToLast | kSkipHashCuckoo |
+                         kSkipMmapReads));
+}
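+
+// A minimal sketch of the read tier used above (public rocksdb API; the db
+// pointer stands in for an open database): with kBlockCacheTier a scan only
+// consults the memtable and the block cache, and reports Incomplete rather
+// than touching disk.
+static size_t CountEntriesWithoutIO(DB* db, bool* incomplete) {
+  ReadOptions ro;
+  ro.read_tier = kBlockCacheTier;  // never issue storage IO
+  Iterator* it = db->NewIterator(ro);
+  size_t count = 0;
+  for (it->SeekToFirst(); it->Valid(); it->Next()) {
+    ++count;
+  }
+  // status() becomes Incomplete when an entry would have required disk IO.
+  *incomplete = it->status().IsIncomplete();
+  delete it;
+  return count;
+}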
 
-class CoutingUserTblPropCollectorFactory
-    : public TablePropertiesCollectorFactory {
- public:
-  virtual TablePropertiesCollector* CreateTablePropertiesCollector() override {
-    return new CoutingUserTblPropCollector();
-  }
-  const char* Name() const override {
-    return "CoutingUserTblPropCollectorFactory";
-  }
-};
+TEST_F(DBTest, ManagedNonBlockingIteration) {
+  do {
+    ReadOptions non_blocking_opts, regular_opts;
+    Options options = CurrentOptions();
+    options.statistics = rocksdb::CreateDBStatistics();
+    non_blocking_opts.read_tier = kBlockCacheTier;
+    non_blocking_opts.managed = true;
+    CreateAndReopenWithCF({"pikachu"}, options);
+    // write one kv to the database.
+    ASSERT_OK(Put(1, "a", "b"));
 
-TEST_F(DBTest, GetUserDefinedTablaProperties) {
-  Options options = CurrentOptions();
-  options.max_background_flushes = 0;
-  options.table_properties_collector_factories.resize(1);
-  options.table_properties_collector_factories[0] =
-      std::make_shared<CoutingUserTblPropCollectorFactory>();
-  Reopen(options);
-  // Create 4 tables
-  for (int table = 0; table < 4; ++table) {
-    for (int i = 0; i < 10 + table; ++i) {
-      db_->Put(WriteOptions(), ToString(table * 100 + i), "val");
+    // scan using non-blocking iterator. We should find it because
+    // it is in memtable.
+    Iterator* iter = db_->NewIterator(non_blocking_opts, handles_[1]);
+    int count = 0;
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      ASSERT_OK(iter->status());
+      count++;
     }
-    db_->Flush(FlushOptions());
-  }
-
-  TablePropertiesCollection props;
-  ASSERT_OK(db_->GetPropertiesOfAllTables(&props));
-  ASSERT_EQ(4U, props.size());
-  uint32_t sum = 0;
-  for (const auto& item : props) {
-    auto& user_collected = item.second->user_collected_properties;
-    ASSERT_TRUE(user_collected.find("CoutingUserTblPropCollector") !=
-                user_collected.end());
-    ASSERT_EQ(user_collected.at("CoutingUserTblPropCollector"), "Rocksdb");
-    ASSERT_TRUE(user_collected.find("Count") != user_collected.end());
-    Slice key(user_collected.at("Count"));
-    uint32_t count;
-    ASSERT_TRUE(GetVarint32(&key, &count));
-    sum += count;
-  }
-  ASSERT_EQ(10u + 11u + 12u + 13u, sum);
-}
+    ASSERT_EQ(count, 1);
+    delete iter;
 
-TEST_F(DBTest, LevelLimitReopen) {
-  Options options = CurrentOptions();
-  CreateAndReopenWithCF({"pikachu"}, options);
+    // Flush the memtable to storage. Now the key should be neither in the
+    // memtable nor in the block cache.
+    ASSERT_OK(Flush(1));
 
-  const std::string value(1024 * 1024, ' ');
-  int i = 0;
-  while (NumTableFilesAtLevel(2, 1) == 0) {
-    ASSERT_OK(Put(1, Key(i++), value));
-  }
+    // verify that a non-blocking iterator does not find any
+    // kvs. Neither does it do any IOs to storage.
+    int64_t numopen = TestGetTickerCount(options, NO_FILE_OPENS);
+    int64_t cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+    iter = db_->NewIterator(non_blocking_opts, handles_[1]);
+    count = 0;
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      count++;
+    }
+    ASSERT_EQ(count, 0);
+    ASSERT_TRUE(iter->status().IsIncomplete());
+    ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+    ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+    delete iter;
 
-  options.num_levels = 1;
-  options.max_bytes_for_level_multiplier_additional.resize(1, 1);
-  Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, options);
-  ASSERT_EQ(s.IsInvalidArgument(), true);
-  ASSERT_EQ(s.ToString(),
-            "Invalid argument: db has more levels than options.num_levels");
+    // read in the specified block via a regular get
+    ASSERT_EQ(Get(1, "a"), "b");
 
-  options.num_levels = 10;
-  options.max_bytes_for_level_multiplier_additional.resize(10, 1);
-  ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options));
-}
+    // verify that we can find it via a non-blocking scan
+    numopen = TestGetTickerCount(options, NO_FILE_OPENS);
+    cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+    iter = db_->NewIterator(non_blocking_opts, handles_[1]);
+    count = 0;
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      ASSERT_OK(iter->status());
+      count++;
+    }
+    ASSERT_EQ(count, 1);
+    ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+    ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+    delete iter;
 
-TEST_F(DBTest, PutDeleteGet) {
-  do {
-    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
-    ASSERT_OK(Put(1, "foo", "v1"));
-    ASSERT_EQ("v1", Get(1, "foo"));
-    ASSERT_OK(Put(1, "foo", "v2"));
-    ASSERT_EQ("v2", Get(1, "foo"));
-    ASSERT_OK(Delete(1, "foo"));
-    ASSERT_EQ("NOT_FOUND", Get(1, "foo"));
-  } while (ChangeOptions());
+    // This test verifies block cache behaviors, which is not used by plain
+    // table format.
+    // Exclude kHashCuckoo as it does not support iteration currently
+  } while (ChangeOptions(kSkipPlainTable | kSkipNoSeekToLast | kSkipHashCuckoo |
+                         kSkipMmapReads));
 }
 
-TEST_F(DBTest, GetFromImmutableLayer) {
+// A delete is skipped for a key if KeyMayExist(key) returns false.
+// Tests WriteBatch consistency and proper delete behaviour.
+TEST_F(DBTest, FilterDeletes) {
   do {
-    Options options;
-    options.env = env_;
-    options.write_buffer_size = 100000;  // Small write buffer
-    options = CurrentOptions(options);
+    anon::OptionsOverride options_override;
+    options_override.filter_policy.reset(NewBloomFilterPolicy(20));
+    Options options = CurrentOptions(options_override);
+    options.filter_deletes = true;
     CreateAndReopenWithCF({"pikachu"}, options);
+    WriteBatch batch;
 
-    ASSERT_OK(Put(1, "foo", "v1"));
-    ASSERT_EQ("v1", Get(1, "foo"));
-
-    // Block sync calls
-    env_->delay_sstable_sync_.store(true, std::memory_order_release);
-    Put(1, "k1", std::string(100000, 'x'));          // Fill memtable
-    Put(1, "k2", std::string(100000, 'y'));          // Trigger flush
-    ASSERT_EQ("v1", Get(1, "foo"));
-    ASSERT_EQ("NOT_FOUND", Get(0, "foo"));
-    // Release sync calls
-    env_->delay_sstable_sync_.store(false, std::memory_order_release);
-  } while (ChangeOptions());
-}
+    batch.Delete(handles_[1], "a");
+    dbfull()->Write(WriteOptions(), &batch);
+    ASSERT_EQ(AllEntriesFor("a", 1), "[ ]");  // Delete skipped
+    batch.Clear();
 
-TEST_F(DBTest, GetFromVersions) {
-  do {
-    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
-    ASSERT_OK(Put(1, "foo", "v1"));
-    ASSERT_OK(Flush(1));
-    ASSERT_EQ("v1", Get(1, "foo"));
-    ASSERT_EQ("NOT_FOUND", Get(0, "foo"));
-  } while (ChangeOptions());
-}
-
-TEST_F(DBTest, GetSnapshot) {
-  anon::OptionsOverride options_override;
-  options_override.skip_policy = kSkipNoSnapshot;
-  do {
-    CreateAndReopenWithCF({"pikachu"}, CurrentOptions(options_override));
-    // Try with both a short key and a long key
-    for (int i = 0; i < 2; i++) {
-      std::string key = (i == 0) ? std::string("foo") : std::string(200, 'x');
-      ASSERT_OK(Put(1, key, "v1"));
-      const Snapshot* s1 = db_->GetSnapshot();
-      if (option_config_ == kHashCuckoo) {
-        // NOt supported case.
-        ASSERT_TRUE(s1 == nullptr);
-        break;
-      }
-      ASSERT_OK(Put(1, key, "v2"));
-      ASSERT_EQ("v2", Get(1, key));
-      ASSERT_EQ("v1", Get(1, key, s1));
-      ASSERT_OK(Flush(1));
-      ASSERT_EQ("v2", Get(1, key));
-      ASSERT_EQ("v1", Get(1, key, s1));
-      db_->ReleaseSnapshot(s1);
-    }
-  } while (ChangeOptions());
-}
-
-TEST_F(DBTest, GetSnapshotLink) {
-  do {
-    Options options;
-    const std::string snapshot_name = test::TmpDir(env_) + "/snapshot";
-    DB* snapshotDB;
-    ReadOptions roptions;
-    std::string result;
-    Checkpoint* checkpoint;
-
-    options = CurrentOptions(options);
-    delete db_;
-    db_ = nullptr;
-    ASSERT_OK(DestroyDB(dbname_, options));
-    ASSERT_OK(DestroyDB(snapshot_name, options));
-    env_->DeleteDir(snapshot_name);
-
-    // Create a database
-    Status s;
-    options.create_if_missing = true;
-    ASSERT_OK(DB::Open(options, dbname_, &db_));
-    std::string key = std::string("foo");
-    ASSERT_OK(Put(key, "v1"));
-    // Take a snapshot
-    ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
-    ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name));
-    ASSERT_OK(Put(key, "v2"));
-    ASSERT_EQ("v2", Get(key));
-    ASSERT_OK(Flush());
-    ASSERT_EQ("v2", Get(key));
-    // Open snapshot and verify contents while DB is running
-    options.create_if_missing = false;
-    ASSERT_OK(DB::Open(options, snapshot_name, &snapshotDB));
-    ASSERT_OK(snapshotDB->Get(roptions, key, &result));
-    ASSERT_EQ("v1", result);
-    delete snapshotDB;
-    snapshotDB = nullptr;
-    delete db_;
-    db_ = nullptr;
-
-    // Destroy original DB
-    ASSERT_OK(DestroyDB(dbname_, options));
-
-    // Open snapshot and verify contents
-    options.create_if_missing = false;
-    dbname_ = snapshot_name;
-    ASSERT_OK(DB::Open(options, dbname_, &db_));
-    ASSERT_EQ("v1", Get(key));
-    delete db_;
-    db_ = nullptr;
-    ASSERT_OK(DestroyDB(dbname_, options));
-    delete checkpoint;
-
-    // Restore DB name
-    dbname_ = test::TmpDir(env_) + "/db_test";
-  } while (ChangeOptions());
-}
+    batch.Put(handles_[1], "a", "b");
+    batch.Delete(handles_[1], "a");
+    dbfull()->Write(WriteOptions(), &batch);
+    ASSERT_EQ(Get(1, "a"), "NOT_FOUND");
+    ASSERT_EQ(AllEntriesFor("a", 1), "[ DEL, b ]");  // Delete issued
+    batch.Clear();
 
-TEST_F(DBTest, GetLevel0Ordering) {
-  do {
-    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
-    // Check that we process level-0 files in correct order.  The code
-    // below generates two level-0 files where the earlier one comes
-    // before the later one in the level-0 file list since the earlier
-    // one has a smaller "smallest" key.
-    ASSERT_OK(Put(1, "bar", "b"));
-    ASSERT_OK(Put(1, "foo", "v1"));
-    ASSERT_OK(Flush(1));
-    ASSERT_OK(Put(1, "foo", "v2"));
-    ASSERT_OK(Flush(1));
-    ASSERT_EQ("v2", Get(1, "foo"));
-  } while (ChangeOptions());
-}
+    batch.Delete(handles_[1], "c");
+    batch.Put(handles_[1], "c", "d");
+    dbfull()->Write(WriteOptions(), &batch);
+    ASSERT_EQ(Get(1, "c"), "d");
+    ASSERT_EQ(AllEntriesFor("c", 1), "[ d ]");  // Delete skipped
+    batch.Clear();
 
-TEST_F(DBTest, WrongLevel0Config) {
-  Options options = CurrentOptions();
-  Close();
-  ASSERT_OK(DestroyDB(dbname_, options));
-  options.level0_stop_writes_trigger = 1;
-  options.level0_slowdown_writes_trigger = 2;
-  options.level0_file_num_compaction_trigger = 3;
-  ASSERT_OK(DB::Open(options, dbname_, &db_));
-}
+    ASSERT_OK(Flush(1));  // A stray Flush
 
-TEST_F(DBTest, GetOrderedByLevels) {
-  do {
-    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
-    ASSERT_OK(Put(1, "foo", "v1"));
-    Compact(1, "a", "z");
-    ASSERT_EQ("v1", Get(1, "foo"));
-    ASSERT_OK(Put(1, "foo", "v2"));
-    ASSERT_EQ("v2", Get(1, "foo"));
-    ASSERT_OK(Flush(1));
-    ASSERT_EQ("v2", Get(1, "foo"));
-  } while (ChangeOptions());
+    batch.Delete(handles_[1], "c");
+    dbfull()->Write(WriteOptions(), &batch);
+    ASSERT_EQ(AllEntriesFor("c", 1), "[ DEL, d ]");  // Delete issued
+    batch.Clear();
+  } while (ChangeCompactOptions());
 }
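+
+// A hand-rolled sketch of what filter_deletes effectively does inside the
+// write path (not the actual implementation; db and cf stand in for an open
+// database and column family): a tombstone is only written when the key may
+// exist, since deleting a definitely-absent key is a no-op.
+static Status DeleteIfMayExist(DB* db, ColumnFamilyHandle* cf,
+                               const Slice& key) {
+  std::string value;
+  if (!db->KeyMayExist(ReadOptions(), cf, key, &value)) {
+    return Status::OK();  // skip the tombstone, as the test expects
+  }
+  return db->Delete(WriteOptions(), cf, key);
+}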
 
-TEST_F(DBTest, GetPicksCorrectFile) {
-  do {
-    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
-    // Arrange to have multiple files in a non-level-0 level.
-    ASSERT_OK(Put(1, "a", "va"));
-    Compact(1, "a", "b");
-    ASSERT_OK(Put(1, "x", "vx"));
-    Compact(1, "x", "y");
-    ASSERT_OK(Put(1, "f", "vf"));
-    Compact(1, "f", "g");
-    ASSERT_EQ("va", Get(1, "a"));
-    ASSERT_EQ("vf", Get(1, "f"));
-    ASSERT_EQ("vx", Get(1, "x"));
-  } while (ChangeOptions());
-}
+TEST_F(DBTest, GetFilterByPrefixBloom) {
+  Options options = last_options_;
+  options.prefix_extractor.reset(NewFixedPrefixTransform(8));
+  options.statistics = rocksdb::CreateDBStatistics();
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+  bbto.whole_key_filtering = false;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  DestroyAndReopen(options);
 
-TEST_F(DBTest, GetEncountersEmptyLevel) {
-  do {
-    Options options = CurrentOptions();
-    options.max_background_flushes = 0;
-    options.disableDataSync = true;
-    CreateAndReopenWithCF({"pikachu"}, options);
-    // Arrange for the following to happen:
-    //   * sstable A in level 0
-    //   * nothing in level 1
-    //   * sstable B in level 2
-    // Then do enough Get() calls to arrange for an automatic compaction
-    // of sstable A.  A bug would cause the compaction to be marked as
-    // occurring at level 1 (instead of the correct level 0).
+  WriteOptions wo;
+  ReadOptions ro;
+  FlushOptions fo;
+  fo.wait = true;
+  std::string value;
 
-    // Step 1: First place sstables in levels 0 and 2
-    int compaction_count = 0;
-    while (NumTableFilesAtLevel(0, 1) == 0 || NumTableFilesAtLevel(2, 1) == 0) {
-      ASSERT_LE(compaction_count, 100) << "could not fill levels 0 and 2";
-      compaction_count++;
-      Put(1, "a", "begin");
-      Put(1, "z", "end");
-      ASSERT_OK(Flush(1));
-    }
+  ASSERT_OK(dbfull()->Put(wo, "barbarbar", "foo"));
+  ASSERT_OK(dbfull()->Put(wo, "barbarbar2", "foo2"));
+  ASSERT_OK(dbfull()->Put(wo, "foofoofoo", "bar"));
 
-    // Step 2: clear level 1 if necessary.
-    dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]);
-    ASSERT_EQ(NumTableFilesAtLevel(0, 1), 1);
-    ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0);
-    ASSERT_EQ(NumTableFilesAtLevel(2, 1), 1);
+  dbfull()->Flush(fo);
 
-    // Step 3: read a bunch of times
-    for (int i = 0; i < 1000; i++) {
-      ASSERT_EQ("NOT_FOUND", Get(1, "missing"));
-    }
+  ASSERT_EQ("foo", Get("barbarbar"));
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+  ASSERT_EQ("foo2", Get("barbarbar2"));
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+  ASSERT_EQ("NOT_FOUND", Get("barbarbar3"));
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
 
-    // Step 4: Wait for compaction to finish
-    env_->SleepForMicroseconds(1000000);
+  ASSERT_EQ("NOT_FOUND", Get("barfoofoo"));
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
 
-    ASSERT_EQ(NumTableFilesAtLevel(0, 1), 1);  // XXX
-  } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction));
+  ASSERT_EQ("NOT_FOUND", Get("foobarbar"));
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 2);
 }
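+
+// A minimal sketch of the prefix-bloom configuration exercised above (same
+// names as used in this test; the 8-byte prefix length is illustrative):
+// only the fixed-size key prefix feeds the filter, so a negative Get() is
+// answered from the filter whenever the prefix is absent, while whole-key
+// filtering stays off.
+static Options PrefixBloomOptions() {
+  Options options;
+  options.prefix_extractor.reset(NewFixedPrefixTransform(8));
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+  bbto.whole_key_filtering = false;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  return options;
+}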
 
-// KeyMayExist can lead to a few false positives, but not false negatives.
-// To make test deterministic, use a much larger number of bits per key-20 than
-// bits in the key, so that false positives are eliminated
-TEST_F(DBTest, KeyMayExist) {
-  do {
-    ReadOptions ropts;
-    std::string value;
-    anon::OptionsOverride options_override;
-    options_override.filter_policy.reset(NewBloomFilterPolicy(20));
-    Options options = CurrentOptions(options_override);
-    options.statistics = rocksdb::CreateDBStatistics();
-    CreateAndReopenWithCF({"pikachu"}, options);
+TEST_F(DBTest, WholeKeyFilterProp) {
+  Options options = last_options_;
+  options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+  options.statistics = rocksdb::CreateDBStatistics();
 
-    ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value));
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+  bbto.whole_key_filtering = false;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  DestroyAndReopen(options);
 
-    ASSERT_OK(Put(1, "a", "b"));
-    bool value_found = false;
-    ASSERT_TRUE(
-        db_->KeyMayExist(ropts, handles_[1], "a", &value, &value_found));
-    ASSERT_TRUE(value_found);
-    ASSERT_EQ("b", value);
+  WriteOptions wo;
+  ReadOptions ro;
+  FlushOptions fo;
+  fo.wait = true;
+  std::string value;
 
-    ASSERT_OK(Flush(1));
-    value.clear();
+  ASSERT_OK(dbfull()->Put(wo, "foobar", "foo"));
+  // Need to insert some keys to make sure files are not filtered out by key
+  // ranges.
+  ASSERT_OK(dbfull()->Put(wo, "aaa", ""));
+  ASSERT_OK(dbfull()->Put(wo, "zzz", ""));
+  dbfull()->Flush(fo);
 
-    long numopen = TestGetTickerCount(options, NO_FILE_OPENS);
-    long cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
-    ASSERT_TRUE(
-        db_->KeyMayExist(ropts, handles_[1], "a", &value, &value_found));
-    ASSERT_TRUE(!value_found);
-    // assert that no new files were opened and no new blocks were
-    // read into block cache.
-    ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
-    ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+  Reopen(options);
+  ASSERT_EQ("NOT_FOUND", Get("foo"));
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+  ASSERT_EQ("NOT_FOUND", Get("bar"));
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+  ASSERT_EQ("foo", Get("foobar"));
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
 
-    ASSERT_OK(Delete(1, "a"));
+  // Reopen with whole key filtering enabled and the prefix extractor set to
+  // NULL. The bloom filter should be off for both whole-key and prefix
+  // bloom.
+  bbto.whole_key_filtering = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  options.prefix_extractor.reset();
+  Reopen(options);
 
-    numopen = TestGetTickerCount(options, NO_FILE_OPENS);
-    cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
-    ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value));
-    ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
-    ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+  ASSERT_EQ("NOT_FOUND", Get("foo"));
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+  ASSERT_EQ("NOT_FOUND", Get("bar"));
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+  ASSERT_EQ("foo", Get("foobar"));
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+  // Write DB with only full key filtering.
+  ASSERT_OK(dbfull()->Put(wo, "foobar", "foo"));
+  // Need to insert some keys to make sure files are not filtered out by key
+  // ranges.
+  ASSERT_OK(dbfull()->Put(wo, "aaa", ""));
+  ASSERT_OK(dbfull()->Put(wo, "zzz", ""));
+  db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
 
-    ASSERT_OK(Flush(1));
-    db_->CompactRange(handles_[1], nullptr, nullptr);
+  // Reopen with whole key filtering off and the prefix extractor enabled.
+  // Still, no bloom filter should be used.
+  options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+  bbto.whole_key_filtering = false;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  Reopen(options);
 
-    numopen = TestGetTickerCount(options, NO_FILE_OPENS);
-    cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
-    ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value));
-    ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
-    ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
-
-    ASSERT_OK(Delete(1, "c"));
-
-    numopen = TestGetTickerCount(options, NO_FILE_OPENS);
-    cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
-    ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "c", &value));
-    ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
-    ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
-
-    // KeyMayExist function only checks data in block caches, which is not used
-    // by plain table format.
-  } while (
-      ChangeOptions(kSkipPlainTable | kSkipHashIndex | kSkipFIFOCompaction));
-}
-
-TEST_F(DBTest, NonBlockingIteration) {
-  do {
-    ReadOptions non_blocking_opts, regular_opts;
-    Options options = CurrentOptions();
-    options.statistics = rocksdb::CreateDBStatistics();
-    non_blocking_opts.read_tier = kBlockCacheTier;
-    CreateAndReopenWithCF({"pikachu"}, options);
-    // write one kv to the database.
-    ASSERT_OK(Put(1, "a", "b"));
-
-    // scan using non-blocking iterator. We should find it because
-    // it is in memtable.
-    Iterator* iter = db_->NewIterator(non_blocking_opts, handles_[1]);
-    int count = 0;
-    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
-      ASSERT_OK(iter->status());
-      count++;
-    }
-    ASSERT_EQ(count, 1);
-    delete iter;
-
-    // flush memtable to storage. Now, the key should not be in the
-    // memtable neither in the block cache.
-    ASSERT_OK(Flush(1));
-
-    // verify that a non-blocking iterator does not find any
-    // kvs. Neither does it do any IOs to storage.
-    long numopen = TestGetTickerCount(options, NO_FILE_OPENS);
-    long cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
-    iter = db_->NewIterator(non_blocking_opts, handles_[1]);
-    count = 0;
-    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
-      count++;
-    }
-    ASSERT_EQ(count, 0);
-    ASSERT_TRUE(iter->status().IsIncomplete());
-    ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
-    ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
-    delete iter;
-
-    // read in the specified block via a regular get
-    ASSERT_EQ(Get(1, "a"), "b");
-
-    // verify that we can find it via a non-blocking scan
-    numopen = TestGetTickerCount(options, NO_FILE_OPENS);
-    cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
-    iter = db_->NewIterator(non_blocking_opts, handles_[1]);
-    count = 0;
-    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
-      ASSERT_OK(iter->status());
-      count++;
-    }
-    ASSERT_EQ(count, 1);
-    ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
-    ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
-    delete iter;
-
-    // This test verifies block cache behaviors, which is not used by plain
-    // table format.
-    // Exclude kHashCuckoo as it does not support iteration currently
-  } while (ChangeOptions(kSkipPlainTable | kSkipNoSeekToLast | kSkipHashCuckoo |
-                         kSkipMmapReads));
-}
-
-TEST_F(DBTest, ManagedNonBlockingIteration) {
-  do {
-    ReadOptions non_blocking_opts, regular_opts;
-    Options options = CurrentOptions();
-    options.statistics = rocksdb::CreateDBStatistics();
-    non_blocking_opts.read_tier = kBlockCacheTier;
-    non_blocking_opts.managed = true;
-    CreateAndReopenWithCF({"pikachu"}, options);
-    // write one kv to the database.
-    ASSERT_OK(Put(1, "a", "b"));
-
-    // scan using non-blocking iterator. We should find it because
-    // it is in memtable.
-    Iterator* iter = db_->NewIterator(non_blocking_opts, handles_[1]);
-    int count = 0;
-    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
-      ASSERT_OK(iter->status());
-      count++;
-    }
-    ASSERT_EQ(count, 1);
-    delete iter;
-
-    // flush memtable to storage. Now, the key should not be in the
-    // memtable neither in the block cache.
-    ASSERT_OK(Flush(1));
-
-    // verify that a non-blocking iterator does not find any
-    // kvs. Neither does it do any IOs to storage.
-    int64_t numopen = TestGetTickerCount(options, NO_FILE_OPENS);
-    int64_t cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
-    iter = db_->NewIterator(non_blocking_opts, handles_[1]);
-    count = 0;
-    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
-      count++;
-    }
-    ASSERT_EQ(count, 0);
-    ASSERT_TRUE(iter->status().IsIncomplete());
-    ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
-    ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
-    delete iter;
-
-    // read in the specified block via a regular get
-    ASSERT_EQ(Get(1, "a"), "b");
-
-    // verify that we can find it via a non-blocking scan
-    numopen = TestGetTickerCount(options, NO_FILE_OPENS);
-    cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
-    iter = db_->NewIterator(non_blocking_opts, handles_[1]);
-    count = 0;
-    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
-      ASSERT_OK(iter->status());
-      count++;
-    }
-    ASSERT_EQ(count, 1);
-    ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
-    ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
-    delete iter;
-
-    // This test verifies block cache behaviors, which is not used by plain
-    // table format.
-    // Exclude kHashCuckoo as it does not support iteration currently
-  } while (ChangeOptions(kSkipPlainTable | kSkipNoSeekToLast | kSkipHashCuckoo |
-                         kSkipMmapReads));
-}
-
-// A delete is skipped for key if KeyMayExist(key) returns False
-// Tests Writebatch consistency and proper delete behaviour
-TEST_F(DBTest, FilterDeletes) {
-  do {
-    anon::OptionsOverride options_override;
-    options_override.filter_policy.reset(NewBloomFilterPolicy(20));
-    Options options = CurrentOptions(options_override);
-    options.filter_deletes = true;
-    CreateAndReopenWithCF({"pikachu"}, options);
-    WriteBatch batch;
-
-    batch.Delete(handles_[1], "a");
-    dbfull()->Write(WriteOptions(), &batch);
-    ASSERT_EQ(AllEntriesFor("a", 1), "[ ]");  // Delete skipped
-    batch.Clear();
-
-    batch.Put(handles_[1], "a", "b");
-    batch.Delete(handles_[1], "a");
-    dbfull()->Write(WriteOptions(), &batch);
-    ASSERT_EQ(Get(1, "a"), "NOT_FOUND");
-    ASSERT_EQ(AllEntriesFor("a", 1), "[ DEL, b ]");  // Delete issued
-    batch.Clear();
-
-    batch.Delete(handles_[1], "c");
-    batch.Put(handles_[1], "c", "d");
-    dbfull()->Write(WriteOptions(), &batch);
-    ASSERT_EQ(Get(1, "c"), "d");
-    ASSERT_EQ(AllEntriesFor("c", 1), "[ d ]");  // Delete skipped
-    batch.Clear();
-
-    ASSERT_OK(Flush(1));  // A stray Flush
-
-    batch.Delete(handles_[1], "c");
-    dbfull()->Write(WriteOptions(), &batch);
-    ASSERT_EQ(AllEntriesFor("c", 1), "[ DEL, d ]");  // Delete issued
-    batch.Clear();
-  } while (ChangeCompactOptions());
-}
-
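// Illustrative sketch of the filter_deletes option exercised above: when
// enabled, a Delete whose key cannot exist (per KeyMayExist) is dropped
// instead of writing a tombstone. The test above also installs a bloom
// filter so KeyMayExist is cheap; the DB path here is hypothetical.
Options opts;
opts.create_if_missing = true;
opts.filter_deletes = true;
DB* db = nullptr;
Status s = DB::Open(opts, "/tmp/filter_deletes_demo", &db);
if (s.ok()) {
  db->Delete(WriteOptions(), "never-written");  // skipped: key cannot exist
  db->Put(WriteOptions(), "k", "v");
  db->Delete(WriteOptions(), "k");              // issued: key may exist
  delete db;
}
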
-TEST_F(DBTest, GetFilterByPrefixBloom) {
-  Options options = last_options_;
-  options.prefix_extractor.reset(NewFixedPrefixTransform(8));
-  options.statistics = rocksdb::CreateDBStatistics();
-  BlockBasedTableOptions bbto;
-  bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
-  bbto.whole_key_filtering = false;
-  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
-  DestroyAndReopen(options);
-
-  WriteOptions wo;
-  ReadOptions ro;
-  FlushOptions fo;
-  fo.wait = true;
-  std::string value;
-
-  ASSERT_OK(dbfull()->Put(wo, "barbarbar", "foo"));
-  ASSERT_OK(dbfull()->Put(wo, "barbarbar2", "foo2"));
-  ASSERT_OK(dbfull()->Put(wo, "foofoofoo", "bar"));
-
-  dbfull()->Flush(fo);
-
-  ASSERT_EQ("foo", Get("barbarbar"));
-  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
-  ASSERT_EQ("foo2", Get("barbarbar2"));
-  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
-  ASSERT_EQ("NOT_FOUND", Get("barbarbar3"));
-  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
-
-  ASSERT_EQ("NOT_FOUND", Get("barfoofoo"));
   ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
-
-  ASSERT_EQ("NOT_FOUND", Get("foobarbar"));
-  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 2);
-}
-
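// Illustrative sketch of why BLOOM_FILTER_USEFUL only advances for
// "barfoofoo" and "foobarbar" in the test above: with
// NewFixedPrefixTransform(8) and whole_key_filtering off, only the first 8
// bytes of each key are hashed into the filter, so a lookup whose prefix
// matches a stored key cannot be rejected.
const SliceTransform* tx = NewFixedPrefixTransform(8);
Slice stored = tx->Transform("barbarbar3");  // "barbarba": in the filter
Slice absent = tx->Transform("barfoofoo");   // "barfoofo": filter is useful
delete tx;
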
-TEST_F(DBTest, WholeKeyFilterProp) {
-  Options options = last_options_;
-  options.prefix_extractor.reset(NewFixedPrefixTransform(3));
-  options.statistics = rocksdb::CreateDBStatistics();
-
-  BlockBasedTableOptions bbto;
-  bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
-  bbto.whole_key_filtering = false;
-  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
-  DestroyAndReopen(options);
-
-  WriteOptions wo;
-  ReadOptions ro;
-  FlushOptions fo;
-  fo.wait = true;
-  std::string value;
-
-  ASSERT_OK(dbfull()->Put(wo, "foobar", "foo"));
-  // Need to insert some keys to make sure files are not filtered out by key
-  // ranges.
-  ASSERT_OK(dbfull()->Put(wo, "aaa", ""));
-  ASSERT_OK(dbfull()->Put(wo, "zzz", ""));
-  dbfull()->Flush(fo);
-
-  Reopen(options);
   ASSERT_EQ("NOT_FOUND", Get("foo"));
-  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
-  ASSERT_EQ("NOT_FOUND", Get("bar"));
-  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
-  ASSERT_EQ("foo", Get("foobar"));
-  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
-
-  // Reopen with whole key filtering enabled and the prefix extractor
-  // NULL. The bloom filter should be off for both whole key and
-  // prefix bloom.
-  bbto.whole_key_filtering = true;
-  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
-  options.prefix_extractor.reset();
-  Reopen(options);
-
-  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
-  ASSERT_EQ("NOT_FOUND", Get("foo"));
-  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
-  ASSERT_EQ("NOT_FOUND", Get("bar"));
-  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
-  ASSERT_EQ("foo", Get("foobar"));
-  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
-  // Write DB with only full key filtering.
-  ASSERT_OK(dbfull()->Put(wo, "foobar", "foo"));
-  // Need to insert some keys to make sure files are not filtered out by key
-  // ranges.
-  ASSERT_OK(dbfull()->Put(wo, "aaa", ""));
-  ASSERT_OK(dbfull()->Put(wo, "zzz", ""));
-  db_->CompactRange(nullptr, nullptr);
-
-  // Reopen with whole key filtering off and the prefix extractor enabled.
-  // Still, no bloom filter should be used.
-  options.prefix_extractor.reset(NewFixedPrefixTransform(3));
-  bbto.whole_key_filtering = false;
-  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
-  Reopen(options);
-
-  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
-  ASSERT_EQ("NOT_FOUND", Get("foo"));
-  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
-  ASSERT_EQ("NOT_FOUND", Get("bar"));
-  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
-  ASSERT_EQ("foo", Get("foobar"));
-  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
-
-  // Try to create a DB with mixed files:
-  ASSERT_OK(dbfull()->Put(wo, "foobar", "foo"));
-  // Need to insert some keys to make sure files are not filtered out by key
-  // ranges.
-  ASSERT_OK(dbfull()->Put(wo, "aaa", ""));
-  ASSERT_OK(dbfull()->Put(wo, "zzz", ""));
-  db_->CompactRange(nullptr, nullptr);
-
-  options.prefix_extractor.reset();
-  bbto.whole_key_filtering = true;
-  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
-  Reopen(options);
-
-  // Try to create a DB with mixed files.
-  ASSERT_OK(dbfull()->Put(wo, "barfoo", "bar"));
-  // In this case we need to insert some keys to make sure files are
-  // not filtered out by key ranges.
-  ASSERT_OK(dbfull()->Put(wo, "aaa", ""));
-  ASSERT_OK(dbfull()->Put(wo, "zzz", ""));
-  Flush();
-
-  // Now we have two files:
-  // File 1: An older file with prefix bloom.
-  // File 2: A newer file with whole bloom filter.
-  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
-  ASSERT_EQ("NOT_FOUND", Get("foo"));
-  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 2);
-  ASSERT_EQ("NOT_FOUND", Get("bar"));
-  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 3);
-  ASSERT_EQ("foo", Get("foobar"));
-  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 4);
-  ASSERT_EQ("bar", Get("barfoo"));
-  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 4);
-
-  // Reopen with the same settings: only whole key filtering is used.
-  Reopen(options);
-  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 4);
-  ASSERT_EQ("NOT_FOUND", Get("foo"));
-  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 5);
-  ASSERT_EQ("NOT_FOUND", Get("bar"));
-  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 6);
-  ASSERT_EQ("foo", Get("foobar"));
-  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 7);
-  ASSERT_EQ("bar", Get("barfoo"));
-  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 7);
-
-  // Restart with both filters allowed.
-  options.prefix_extractor.reset(NewFixedPrefixTransform(3));
-  bbto.whole_key_filtering = true;
-  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
-  Reopen(options);
-  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 7);
-  // File 1 will have it filtered out.
-  // File 2 will not, as the prefix `foo` exists in the file.
-  ASSERT_EQ("NOT_FOUND", Get("foo"));
-  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 8);
-  ASSERT_EQ("NOT_FOUND", Get("bar"));
-  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 10);
-  ASSERT_EQ("foo", Get("foobar"));
-  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11);
-  ASSERT_EQ("bar", Get("barfoo"));
-  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11);
-
-  // Restart with only prefix bloom allowed.
-  options.prefix_extractor.reset(NewFixedPrefixTransform(3));
-  bbto.whole_key_filtering = false;
-  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
-  Reopen(options);
-  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11);
-  ASSERT_EQ("NOT_FOUND", Get("foo"));
-  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11);
-  ASSERT_EQ("NOT_FOUND", Get("bar"));
-  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 12);
-  ASSERT_EQ("foo", Get("foobar"));
-  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 12);
-  ASSERT_EQ("bar", Get("barfoo"));
-  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 12);
-}
-
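// The two independent switches the test above toggles, gathered in one place
// (illustrative): whole_key_filtering puts full user keys into the bloom
// filter, while prefix_extractor adds key prefixes. Each SST file keeps the
// combination it was written with, which is why the mixed-file phase sees
// different BLOOM_FILTER_USEFUL deltas per file.
BlockBasedTableOptions bbto;
bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
bbto.whole_key_filtering = true;                          // full keys
Options opts = last_options_;
opts.prefix_extractor.reset(NewFixedPrefixTransform(3));  // prefixes too
opts.table_factory.reset(NewBlockBasedTableFactory(bbto));
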
-TEST_F(DBTest, IterSeekBeforePrev) {
-  ASSERT_OK(Put("a", "b"));
-  ASSERT_OK(Put("c", "d"));
-  dbfull()->Flush(FlushOptions());
-  ASSERT_OK(Put("0", "f"));
-  ASSERT_OK(Put("1", "h"));
-  dbfull()->Flush(FlushOptions());
-  ASSERT_OK(Put("2", "j"));
-  auto iter = db_->NewIterator(ReadOptions());
-  iter->Seek(Slice("c"));
-  iter->Prev();
-  iter->Seek(Slice("a"));
-  iter->Prev();
-  delete iter;
-}
-
-namespace {
-std::string MakeLongKey(size_t length, char c) {
-  return std::string(length, c);
-}
-}  // namespace
-
-TEST_F(DBTest, IterLongKeys) {
-  ASSERT_OK(Put(MakeLongKey(20, 0), "0"));
-  ASSERT_OK(Put(MakeLongKey(32, 2), "2"));
-  ASSERT_OK(Put("a", "b"));
-  dbfull()->Flush(FlushOptions());
-  ASSERT_OK(Put(MakeLongKey(50, 1), "1"));
-  ASSERT_OK(Put(MakeLongKey(127, 3), "3"));
-  ASSERT_OK(Put(MakeLongKey(64, 4), "4"));
-  auto iter = db_->NewIterator(ReadOptions());
-
-  // Seek to the first key and scan forward across keys of varying lengths.
-  iter->Seek(MakeLongKey(20, 0));
-  ASSERT_EQ(IterStatus(iter), MakeLongKey(20, 0) + "->0");
-  iter->Next();
-  ASSERT_EQ(IterStatus(iter), MakeLongKey(50, 1) + "->1");
-  iter->Next();
-  ASSERT_EQ(IterStatus(iter), MakeLongKey(32, 2) + "->2");
-  iter->Next();
-  ASSERT_EQ(IterStatus(iter), MakeLongKey(127, 3) + "->3");
-  iter->Next();
-  ASSERT_EQ(IterStatus(iter), MakeLongKey(64, 4) + "->4");
-  delete iter;
-
-  iter = db_->NewIterator(ReadOptions());
-  iter->Seek(MakeLongKey(50, 1));
-  ASSERT_EQ(IterStatus(iter), MakeLongKey(50, 1) + "->1");
-  iter->Next();
-  ASSERT_EQ(IterStatus(iter), MakeLongKey(32, 2) + "->2");
-  iter->Next();
-  ASSERT_EQ(IterStatus(iter), MakeLongKey(127, 3) + "->3");
-  delete iter;
-}
-
-TEST_F(DBTest, IterNextWithNewerSeq) {
-  ASSERT_OK(Put("0", "0"));
-  dbfull()->Flush(FlushOptions());
-  ASSERT_OK(Put("a", "b"));
-  ASSERT_OK(Put("c", "d"));
-  ASSERT_OK(Put("d", "e"));
-  auto iter = db_->NewIterator(ReadOptions());
-
-  // Insert versions of "b" with sequence numbers too new to be visible,
-  // forcing the iterator to skip them.
-  for (uint64_t i = 0; i < last_options_.max_sequential_skip_in_iterations + 1;
-       i++) {
-    ASSERT_OK(Put("b", "f"));
-  }
-
-  iter->Seek(Slice("a"));
-  ASSERT_EQ(IterStatus(iter), "a->b");
-  iter->Next();
-  ASSERT_EQ(IterStatus(iter), "c->d");
-  delete iter;
-}
-
-TEST_F(DBTest, IterPrevWithNewerSeq) {
-  ASSERT_OK(Put("0", "0"));
-  dbfull()->Flush(FlushOptions());
-  ASSERT_OK(Put("a", "b"));
-  ASSERT_OK(Put("c", "d"));
-  ASSERT_OK(Put("d", "e"));
-  auto iter = db_->NewIterator(ReadOptions());
-
-  // Insert versions of "b" with sequence numbers too new to be visible,
-  // forcing the iterator to skip them.
-  for (uint64_t i = 0; i < last_options_.max_sequential_skip_in_iterations + 1;
-       i++) {
-    ASSERT_OK(Put("b", "f"));
-  }
-
-  iter->Seek(Slice("d"));
-  ASSERT_EQ(IterStatus(iter), "d->e");
-  iter->Prev();
-  ASSERT_EQ(IterStatus(iter), "c->d");
-  iter->Prev();
-  ASSERT_EQ(IterStatus(iter), "a->b");
-
-  iter->Prev();
-  delete iter;
-}
-
-TEST_F(DBTest, IterPrevWithNewerSeq2) {
-  ASSERT_OK(Put("0", "0"));
-  dbfull()->Flush(FlushOptions());
-  ASSERT_OK(Put("a", "b"));
-  ASSERT_OK(Put("c", "d"));
-  ASSERT_OK(Put("d", "e"));
-  auto iter = db_->NewIterator(ReadOptions());
-  iter->Seek(Slice("c"));
-  ASSERT_EQ(IterStatus(iter), "c->d");
-
-  // Insert versions of "b" with sequence numbers too new to be visible,
-  // forcing the iterator to skip them.
-  for (uint64_t i = 0; i < last_options_.max_sequential_skip_in_iterations + 1;
-      i++) {
-    ASSERT_OK(Put("b", "f"));
-  }
-
-  iter->Prev();
-  ASSERT_EQ(IterStatus(iter), "a->b");
-
-  iter->Prev();
-  delete iter;
-}
-
-TEST_F(DBTest, IterEmpty) {
-  do {
-    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
-    Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]);
-
-    iter->SeekToFirst();
-    ASSERT_EQ(IterStatus(iter), "(invalid)");
-
-    iter->SeekToLast();
-    ASSERT_EQ(IterStatus(iter), "(invalid)");
-
-    iter->Seek("foo");
-    ASSERT_EQ(IterStatus(iter), "(invalid)");
-
-    delete iter;
-  } while (ChangeCompactOptions());
-}
-
-TEST_F(DBTest, IterSingle) {
-  do {
-    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
-    ASSERT_OK(Put(1, "a", "va"));
-    Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]);
-
-    iter->SeekToFirst();
-    ASSERT_EQ(IterStatus(iter), "a->va");
-    iter->Next();
-    ASSERT_EQ(IterStatus(iter), "(invalid)");
-    iter->SeekToFirst();
-    ASSERT_EQ(IterStatus(iter), "a->va");
-    iter->Prev();
-    ASSERT_EQ(IterStatus(iter), "(invalid)");
-
-    iter->SeekToLast();
-    ASSERT_EQ(IterStatus(iter), "a->va");
-    iter->Next();
-    ASSERT_EQ(IterStatus(iter), "(invalid)");
-    iter->SeekToLast();
-    ASSERT_EQ(IterStatus(iter), "a->va");
-    iter->Prev();
-    ASSERT_EQ(IterStatus(iter), "(invalid)");
-
-    iter->Seek("");
-    ASSERT_EQ(IterStatus(iter), "a->va");
-    iter->Next();
-    ASSERT_EQ(IterStatus(iter), "(invalid)");
-
-    iter->Seek("a");
-    ASSERT_EQ(IterStatus(iter), "a->va");
-    iter->Next();
-    ASSERT_EQ(IterStatus(iter), "(invalid)");
-
-    iter->Seek("b");
-    ASSERT_EQ(IterStatus(iter), "(invalid)");
-
-    delete iter;
-  } while (ChangeCompactOptions());
-}
-
-TEST_F(DBTest, IterMulti) {
-  do {
-    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
-    ASSERT_OK(Put(1, "a", "va"));
-    ASSERT_OK(Put(1, "b", "vb"));
-    ASSERT_OK(Put(1, "c", "vc"));
-    Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]);
-
-    iter->SeekToFirst();
-    ASSERT_EQ(IterStatus(iter), "a->va");
-    iter->Next();
-    ASSERT_EQ(IterStatus(iter), "b->vb");
-    iter->Next();
-    ASSERT_EQ(IterStatus(iter), "c->vc");
-    iter->Next();
-    ASSERT_EQ(IterStatus(iter), "(invalid)");
-    iter->SeekToFirst();
-    ASSERT_EQ(IterStatus(iter), "a->va");
-    iter->Prev();
-    ASSERT_EQ(IterStatus(iter), "(invalid)");
-
-    iter->SeekToLast();
-    ASSERT_EQ(IterStatus(iter), "c->vc");
-    iter->Prev();
-    ASSERT_EQ(IterStatus(iter), "b->vb");
-    iter->Prev();
-    ASSERT_EQ(IterStatus(iter), "a->va");
-    iter->Prev();
-    ASSERT_EQ(IterStatus(iter), "(invalid)");
-    iter->SeekToLast();
-    ASSERT_EQ(IterStatus(iter), "c->vc");
-    iter->Next();
-    ASSERT_EQ(IterStatus(iter), "(invalid)");
-
-    iter->Seek("");
-    ASSERT_EQ(IterStatus(iter), "a->va");
-    iter->Seek("a");
-    ASSERT_EQ(IterStatus(iter), "a->va");
-    iter->Seek("ax");
-    ASSERT_EQ(IterStatus(iter), "b->vb");
-
-    iter->Seek("b");
-    ASSERT_EQ(IterStatus(iter), "b->vb");
-    iter->Seek("z");
-    ASSERT_EQ(IterStatus(iter), "(invalid)");
-
-    // Switch from reverse to forward
-    iter->SeekToLast();
-    iter->Prev();
-    iter->Prev();
-    iter->Next();
-    ASSERT_EQ(IterStatus(iter), "b->vb");
-
-    // Switch from forward to reverse
-    iter->SeekToFirst();
-    iter->Next();
-    iter->Next();
-    iter->Prev();
-    ASSERT_EQ(IterStatus(iter), "b->vb");
-
-    // Make sure iter stays at snapshot
-    ASSERT_OK(Put(1, "a", "va2"));
-    ASSERT_OK(Put(1, "a2", "va3"));
-    ASSERT_OK(Put(1, "b", "vb2"));
-    ASSERT_OK(Put(1, "c", "vc2"));
-    ASSERT_OK(Delete(1, "b"));
-    iter->SeekToFirst();
-    ASSERT_EQ(IterStatus(iter), "a->va");
-    iter->Next();
-    ASSERT_EQ(IterStatus(iter), "b->vb");
-    iter->Next();
-    ASSERT_EQ(IterStatus(iter), "c->vc");
-    iter->Next();
-    ASSERT_EQ(IterStatus(iter), "(invalid)");
-    iter->SeekToLast();
-    ASSERT_EQ(IterStatus(iter), "c->vc");
-    iter->Prev();
-    ASSERT_EQ(IterStatus(iter), "b->vb");
-    iter->Prev();
-    ASSERT_EQ(IterStatus(iter), "a->va");
-    iter->Prev();
-    ASSERT_EQ(IterStatus(iter), "(invalid)");
-
-    delete iter;
-  } while (ChangeCompactOptions());
-}
-
-// Check that we can skip over a run of user keys
-// by using reseek rather than sequential scan
-TEST_F(DBTest, IterReseek) {
-  anon::OptionsOverride options_override;
-  options_override.skip_policy = kSkipNoSnapshot;
-  Options options = CurrentOptions(options_override);
-  options.max_sequential_skip_in_iterations = 3;
-  options.create_if_missing = true;
-  options.statistics = rocksdb::CreateDBStatistics();
-  DestroyAndReopen(options);
-  CreateAndReopenWithCF({"pikachu"}, options);
-
-  // Insert two entries with the same user key and verify that
-  // reseek is not invoked. For each of these test cases,
-  // verify that we can find the next key "b".
-  ASSERT_OK(Put(1, "a", "one"));
-  ASSERT_OK(Put(1, "a", "two"));
-  ASSERT_OK(Put(1, "b", "bone"));
-  Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]);
-  iter->SeekToFirst();
-  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0);
-  ASSERT_EQ(IterStatus(iter), "a->two");
-  iter->Next();
-  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0);
-  ASSERT_EQ(IterStatus(iter), "b->bone");
-  delete iter;
-
-  // Insert a total of three entries with the same user key and verify
-  // that reseek is still not invoked.
-  ASSERT_OK(Put(1, "a", "three"));
-  iter = db_->NewIterator(ReadOptions(), handles_[1]);
-  iter->SeekToFirst();
-  ASSERT_EQ(IterStatus(iter), "a->three");
-  iter->Next();
-  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0);
-  ASSERT_EQ(IterStatus(iter), "b->bone");
-  delete iter;
-
-  // Insert a total of four entries with the same user key and verify
-  // that reseek is invoked.
-  ASSERT_OK(Put(1, "a", "four"));
-  iter = db_->NewIterator(ReadOptions(), handles_[1]);
-  iter->SeekToFirst();
-  ASSERT_EQ(IterStatus(iter), "a->four");
-  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0);
-  iter->Next();
-  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1);
-  ASSERT_EQ(IterStatus(iter), "b->bone");
-  delete iter;
-
-  // Testing the reverse iterator.
-  // At this point, we have four versions of "a" and one version of "b".
-  // The reseek statistic is already at 1.
-  int num_reseeks =
-      (int)TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION);
-
-  // Insert another version of b and assert that reseek is not invoked
-  ASSERT_OK(Put(1, "b", "btwo"));
-  iter = db_->NewIterator(ReadOptions(), handles_[1]);
-  iter->SeekToLast();
-  ASSERT_EQ(IterStatus(iter), "b->btwo");
-  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION),
-            num_reseeks);
-  iter->Prev();
-  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION),
-            num_reseeks + 1);
-  ASSERT_EQ(IterStatus(iter), "a->four");
-  delete iter;
-
-  // insert two more versions of b. This makes a total of 4 versions
-  // of b and 4 versions of a.
-  ASSERT_OK(Put(1, "b", "bthree"));
-  ASSERT_OK(Put(1, "b", "bfour"));
-  iter = db_->NewIterator(ReadOptions(), handles_[1]);
-  iter->SeekToLast();
-  ASSERT_EQ(IterStatus(iter), "b->bfour");
-  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION),
-            num_reseeks + 2);
-  iter->Prev();
-
-  // the previous Prev call should have invoked reseek
-  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION),
-            num_reseeks + 3);
-  ASSERT_EQ(IterStatus(iter), "a->four");
-  delete iter;
-}
-
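// The knob behind the reseek behavior verified above (illustrative): once
// Next() or Prev() has to step over more than this many internal entries
// that are invisible (older versions of one user key, or sequence numbers
// newer than the read point), the iterator issues a targeted Seek instead
// of scanning linearly, and NUMBER_OF_RESEEKS_IN_ITERATION is incremented.
Options opts = CurrentOptions();
opts.max_sequential_skip_in_iterations = 3;  // the test's value; default is 8
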
-TEST_F(DBTest, IterSmallAndLargeMix) {
-  do {
-    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
-    ASSERT_OK(Put(1, "a", "va"));
-    ASSERT_OK(Put(1, "b", std::string(100000, 'b')));
-    ASSERT_OK(Put(1, "c", "vc"));
-    ASSERT_OK(Put(1, "d", std::string(100000, 'd')));
-    ASSERT_OK(Put(1, "e", std::string(100000, 'e')));
-
-    Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]);
-
-    iter->SeekToFirst();
-    ASSERT_EQ(IterStatus(iter), "a->va");
-    iter->Next();
-    ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b'));
-    iter->Next();
-    ASSERT_EQ(IterStatus(iter), "c->vc");
-    iter->Next();
-    ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd'));
-    iter->Next();
-    ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e'));
-    iter->Next();
-    ASSERT_EQ(IterStatus(iter), "(invalid)");
-
-    iter->SeekToLast();
-    ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e'));
-    iter->Prev();
-    ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd'));
-    iter->Prev();
-    ASSERT_EQ(IterStatus(iter), "c->vc");
-    iter->Prev();
-    ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b'));
-    iter->Prev();
-    ASSERT_EQ(IterStatus(iter), "a->va");
-    iter->Prev();
-    ASSERT_EQ(IterStatus(iter), "(invalid)");
-
-    delete iter;
-  } while (ChangeCompactOptions());
-}
-
-TEST_F(DBTest, IterMultiWithDelete) {
-  do {
-    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
-    ASSERT_OK(Put(1, "ka", "va"));
-    ASSERT_OK(Put(1, "kb", "vb"));
-    ASSERT_OK(Put(1, "kc", "vc"));
-    ASSERT_OK(Delete(1, "kb"));
-    ASSERT_EQ("NOT_FOUND", Get(1, "kb"));
-
-    Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]);
-    iter->Seek("kc");
-    ASSERT_EQ(IterStatus(iter), "kc->vc");
-    if (!CurrentOptions().merge_operator) {
-      // TODO: merge operator does not support backward iteration yet
-      if (kPlainTableAllBytesPrefix != option_config_ &&
-          kBlockBasedTableWithWholeKeyHashIndex != option_config_ &&
-          kHashLinkList != option_config_) {
-        iter->Prev();
-        ASSERT_EQ(IterStatus(iter), "ka->va");
-      }
-    }
-    delete iter;
-  } while (ChangeOptions());
-}
-
-TEST_F(DBTest, IterPrevMaxSkip) {
-  do {
-    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
-    for (int i = 0; i < 2; i++) {
-      ASSERT_OK(Put(1, "key1", "v1"));
-      ASSERT_OK(Put(1, "key2", "v2"));
-      ASSERT_OK(Put(1, "key3", "v3"));
-      ASSERT_OK(Put(1, "key4", "v4"));
-      ASSERT_OK(Put(1, "key5", "v5"));
-    }
-
-    VerifyIterLast("key5->v5", 1);
-
-    ASSERT_OK(Delete(1, "key5"));
-    VerifyIterLast("key4->v4", 1);
-
-    ASSERT_OK(Delete(1, "key4"));
-    VerifyIterLast("key3->v3", 1);
-
-    ASSERT_OK(Delete(1, "key3"));
-    VerifyIterLast("key2->v2", 1);
-
-    ASSERT_OK(Delete(1, "key2"));
-    VerifyIterLast("key1->v1", 1);
-
-    ASSERT_OK(Delete(1, "key1"));
-    VerifyIterLast("(invalid)", 1);
-  } while (ChangeOptions(kSkipMergePut | kSkipNoSeekToLast));
-}
-
-TEST_F(DBTest, IterWithSnapshot) {
-  anon::OptionsOverride options_override;
-  options_override.skip_policy = kSkipNoSnapshot;
-  do {
-    CreateAndReopenWithCF({"pikachu"}, CurrentOptions(options_override));
-    ASSERT_OK(Put(1, "key1", "val1"));
-    ASSERT_OK(Put(1, "key2", "val2"));
-    ASSERT_OK(Put(1, "key3", "val3"));
-    ASSERT_OK(Put(1, "key4", "val4"));
-    ASSERT_OK(Put(1, "key5", "val5"));
-
-    const Snapshot *snapshot = db_->GetSnapshot();
-    ReadOptions options;
-    options.snapshot = snapshot;
-    Iterator* iter = db_->NewIterator(options, handles_[1]);
-
-    // Put more values after the snapshot
-    ASSERT_OK(Put(1, "key100", "val100"));
-    ASSERT_OK(Put(1, "key101", "val101"));
-
-    iter->Seek("key5");
-    ASSERT_EQ(IterStatus(iter), "key5->val5");
-    if (!CurrentOptions().merge_operator) {
-      // TODO: merge operator does not support backward iteration yet
-      if (kPlainTableAllBytesPrefix != option_config_ &&
-        kBlockBasedTableWithWholeKeyHashIndex != option_config_ &&
-        kHashLinkList != option_config_) {
-        iter->Prev();
-        ASSERT_EQ(IterStatus(iter), "key4->val4");
-        iter->Prev();
-        ASSERT_EQ(IterStatus(iter), "key3->val3");
-
-        iter->Next();
-        ASSERT_EQ(IterStatus(iter), "key4->val4");
-        iter->Next();
-        ASSERT_EQ(IterStatus(iter), "key5->val5");
-      }
-      iter->Next();
-      ASSERT_TRUE(!iter->Valid());
-    }
-    db_->ReleaseSnapshot(snapshot);
-    delete iter;
-    // skip as HashCuckooRep does not support snapshot
-  } while (ChangeOptions(kSkipHashCuckoo));
-}
-
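// Minimal pattern for the snapshot-pinned iteration tested above
// (illustrative): reads issued through options.snapshot see only writes
// that existed when GetSnapshot() was called.
const Snapshot* snap = db_->GetSnapshot();
ReadOptions ro;
ro.snapshot = snap;
Iterator* it = db_->NewIterator(ro);
// ... writes made after GetSnapshot() are invisible to 'it' ...
delete it;
db_->ReleaseSnapshot(snap);
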
-TEST_F(DBTest, Recover) {
-  do {
-    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
-    ASSERT_OK(Put(1, "foo", "v1"));
-    ASSERT_OK(Put(1, "baz", "v5"));
-
-    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
-    ASSERT_EQ("v1", Get(1, "foo"));
-
-    ASSERT_EQ("v1", Get(1, "foo"));
-    ASSERT_EQ("v5", Get(1, "baz"));
-    ASSERT_OK(Put(1, "bar", "v2"));
-    ASSERT_OK(Put(1, "foo", "v3"));
-
-    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
-    ASSERT_EQ("v3", Get(1, "foo"));
-    ASSERT_OK(Put(1, "foo", "v4"));
-    ASSERT_EQ("v4", Get(1, "foo"));
-    ASSERT_EQ("v2", Get(1, "bar"));
-    ASSERT_EQ("v5", Get(1, "baz"));
-  } while (ChangeOptions());
-}
-
-TEST_F(DBTest, RecoverWithTableHandle) {
-  do {
-    Options options;
-    options.create_if_missing = true;
-    options.write_buffer_size = 100;
-    options.disable_auto_compactions = true;
-    options = CurrentOptions(options);
-    DestroyAndReopen(options);
-    CreateAndReopenWithCF({"pikachu"}, options);
-
-    ASSERT_OK(Put(1, "foo", "v1"));
-    ASSERT_OK(Put(1, "bar", "v2"));
-    ASSERT_OK(Flush(1));
-    ASSERT_OK(Put(1, "foo", "v3"));
-    ASSERT_OK(Put(1, "bar", "v4"));
-    ASSERT_OK(Flush(1));
-    ASSERT_OK(Put(1, "big", std::string(100, 'a')));
-    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
-
-    std::vector<std::vector<FileMetaData>> files;
-    dbfull()->TEST_GetFilesMetaData(handles_[1], &files);
-    int total_files = 0;
-    for (const auto& level : files) {
-      total_files += level.size();
-    }
-    ASSERT_EQ(total_files, 3);
-    for (const auto& level : files) {
-      for (const auto& file : level) {
-        if (kInfiniteMaxOpenFiles == option_config_) {
-          ASSERT_TRUE(file.table_reader_handle != nullptr);
-        } else {
-          ASSERT_TRUE(file.table_reader_handle == nullptr);
-        }
-      }
-    }
-  } while (ChangeOptions());
-}
-
-TEST_F(DBTest, IgnoreRecoveredLog) {
-  std::string backup_logs = dbname_ + "/backup_logs";
-
-  // delete old files in backup_logs directory
-  env_->CreateDirIfMissing(backup_logs);
-  std::vector<std::string> old_files;
-  env_->GetChildren(backup_logs, &old_files);
-  for (auto& file : old_files) {
-    if (file != "." && file != "..") {
-      env_->DeleteFile(backup_logs + "/" + file);
-    }
-  }
-
-  do {
-    Options options = CurrentOptions();
-    options.create_if_missing = true;
-    options.merge_operator = MergeOperators::CreateUInt64AddOperator();
-    options.wal_dir = dbname_ + "/logs";
-    DestroyAndReopen(options);
-
-    // fill up the DB
-    std::string one, two;
-    PutFixed64(&one, 1);
-    PutFixed64(&two, 2);
-    ASSERT_OK(db_->Merge(WriteOptions(), Slice("foo"), Slice(one)));
-    ASSERT_OK(db_->Merge(WriteOptions(), Slice("foo"), Slice(one)));
-    ASSERT_OK(db_->Merge(WriteOptions(), Slice("bar"), Slice(one)));
-
-    // copy the logs to backup
-    std::vector<std::string> logs;
-    env_->GetChildren(options.wal_dir, &logs);
-    for (auto& log : logs) {
-      if (log != ".." && log != ".") {
-        CopyFile(options.wal_dir + "/" + log, backup_logs + "/" + log);
-      }
-    }
-
-    // recover the DB
-    Reopen(options);
-    ASSERT_EQ(two, Get("foo"));
-    ASSERT_EQ(one, Get("bar"));
-    Close();
-
-    // copy the logs from backup back to wal dir
-    for (auto& log : logs) {
-      if (log != ".." && log != ".") {
-        CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log);
-      }
-    }
-    // This should ignore the log files; recovery should not happen again.
-    // If recovery did happen, the same merge operator would be called twice,
-    // leading to incorrect results.
-    Reopen(options);
-    ASSERT_EQ(two, Get("foo"));
-    ASSERT_EQ(one, Get("bar"));
-    Close();
-    Destroy(options);
-    Reopen(options);
-    Close();
-
-    // copy the logs from backup back to wal dir
-    env_->CreateDirIfMissing(options.wal_dir);
-    for (auto& log : logs) {
-      if (log != ".." && log != ".") {
-        CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log);
-      }
-    }
-    // assert that we successfully recovered only from logs, even though we
-    // destroyed the DB
-    Reopen(options);
-    ASSERT_EQ(two, Get("foo"));
-    ASSERT_EQ(one, Get("bar"));
-
-    // Recovery will fail if DB directory doesn't exist.
-    Destroy(options);
-    // copy the logs from backup back to wal dir
-    env_->CreateDirIfMissing(options.wal_dir);
-    for (auto& log : logs) {
-      if (log != ".." && log != ".") {
-        CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log);
-        // we won't be needing this file any more
-        env_->DeleteFile(backup_logs + "/" + log);
-      }
-    }
-    Status s = TryReopen(options);
-    ASSERT_TRUE(!s.ok());
-  } while (ChangeOptions(kSkipHashCuckoo));
-}
-
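// The layout assumption the test above relies on (illustrative): wal_dir
// places the write-ahead logs outside the DB directory, so they can be
// copied away and replayed independently of the SST and MANIFEST files.
// The path shown is hypothetical.
Options opts = CurrentOptions();
opts.wal_dir = "/mnt/fast-disk/db-logs";
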
-TEST_F(DBTest, RollLog) {
-  do {
-    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
-    ASSERT_OK(Put(1, "foo", "v1"));
-    ASSERT_OK(Put(1, "baz", "v5"));
-
-    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
-    for (int i = 0; i < 10; i++) {
-      ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
-    }
-    ASSERT_OK(Put(1, "foo", "v4"));
-    for (int i = 0; i < 10; i++) {
-      ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
-    }
-  } while (ChangeOptions());
-}
-
-TEST_F(DBTest, WAL) {
-  do {
-    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
-    WriteOptions writeOpt = WriteOptions();
-    writeOpt.disableWAL = true;
-    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1"));
-    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1"));
-
-    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
-    ASSERT_EQ("v1", Get(1, "foo"));
-    ASSERT_EQ("v1", Get(1, "bar"));
-
-    writeOpt.disableWAL = false;
-    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v2"));
-    writeOpt.disableWAL = true;
-    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v2"));
-
-    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
-    // Both values should be present.
-    ASSERT_EQ("v2", Get(1, "bar"));
-    ASSERT_EQ("v2", Get(1, "foo"));
-
-    writeOpt.disableWAL = true;
-    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v3"));
-    writeOpt.disableWAL = false;
-    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v3"));
-
-    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
-    // Again, both values should be present.
-    ASSERT_EQ("v3", Get(1, "foo"));
-    ASSERT_EQ("v3", Get(1, "bar"));
-  } while (ChangeCompactOptions());
-}
-
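// The per-write switch the WAL test above flips (illustrative): disableWAL
// skips the log for that write only. Such a write survives the clean reopen
// performed by the test, but would be lost on a crash unless a later flush
// persisted it.
WriteOptions wo;
wo.disableWAL = true;   // fast, not crash-durable by itself
db_->Put(wo, "k1", "v1");
wo.disableWAL = false;  // logged before the write is acknowledged
db_->Put(wo, "k2", "v2");
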
-TEST_F(DBTest, CheckLock) {
-  do {
-    DB* localdb;
-    Options options = CurrentOptions();
-    ASSERT_OK(TryReopen(options));
-
-    // second open should fail
-    ASSERT_TRUE(!(DB::Open(options, dbname_, &localdb)).ok());
-  } while (ChangeCompactOptions());
-}
-
-TEST_F(DBTest, FlushMultipleMemtable) {
-  do {
-    Options options = CurrentOptions();
-    WriteOptions writeOpt = WriteOptions();
-    writeOpt.disableWAL = true;
-    options.max_write_buffer_number = 4;
-    options.min_write_buffer_number_to_merge = 3;
-    CreateAndReopenWithCF({"pikachu"}, options);
-    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1"));
-    ASSERT_OK(Flush(1));
-    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1"));
-
-    ASSERT_EQ("v1", Get(1, "foo"));
-    ASSERT_EQ("v1", Get(1, "bar"));
-    ASSERT_OK(Flush(1));
-  } while (ChangeCompactOptions());
-}
-
-TEST_F(DBTest, NumImmutableMemTable) {
-  do {
-    Options options = CurrentOptions();
-    WriteOptions writeOpt = WriteOptions();
-    writeOpt.disableWAL = true;
-    options.max_write_buffer_number = 4;
-    options.min_write_buffer_number_to_merge = 3;
-    options.write_buffer_size = 1000000;
-    CreateAndReopenWithCF({"pikachu"}, options);
-
-    std::string big_value(1000000 * 2, 'x');
-    std::string num;
-    SetPerfLevel(kEnableTime);
-    ASSERT_TRUE(GetPerfLevel() == kEnableTime);
-
-    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k1", big_value));
-    ASSERT_TRUE(dbfull()->GetProperty(handles_[1],
-                                      "rocksdb.num-immutable-mem-table", &num));
-    ASSERT_EQ(num, "0");
-    ASSERT_TRUE(dbfull()->GetProperty(
-        handles_[1], "rocksdb.num-entries-active-mem-table", &num));
-    ASSERT_EQ(num, "1");
-    perf_context.Reset();
-    Get(1, "k1");
-    ASSERT_EQ(1, (int) perf_context.get_from_memtable_count);
-
-    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k2", big_value));
-    ASSERT_TRUE(dbfull()->GetProperty(handles_[1],
-                                      "rocksdb.num-immutable-mem-table", &num));
-    ASSERT_EQ(num, "1");
-    ASSERT_TRUE(dbfull()->GetProperty(
-        handles_[1], "rocksdb.num-entries-active-mem-table", &num));
-    ASSERT_EQ(num, "1");
-    ASSERT_TRUE(dbfull()->GetProperty(
-        handles_[1], "rocksdb.num-entries-imm-mem-tables", &num));
-    ASSERT_EQ(num, "1");
-
-    perf_context.Reset();
-    Get(1, "k1");
-    ASSERT_EQ(2, (int) perf_context.get_from_memtable_count);
-    perf_context.Reset();
-    Get(1, "k2");
-    ASSERT_EQ(1, (int) perf_context.get_from_memtable_count);
-
-    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k3", big_value));
-    ASSERT_TRUE(dbfull()->GetProperty(
-        handles_[1], "rocksdb.cur-size-active-mem-table", &num));
-    ASSERT_TRUE(dbfull()->GetProperty(handles_[1],
-                                      "rocksdb.num-immutable-mem-table", &num));
-    ASSERT_EQ(num, "2");
-    ASSERT_TRUE(dbfull()->GetProperty(
-        handles_[1], "rocksdb.num-entries-active-mem-table", &num));
-    ASSERT_EQ(num, "1");
-    ASSERT_TRUE(dbfull()->GetProperty(
-        handles_[1], "rocksdb.num-entries-imm-mem-tables", &num));
-    ASSERT_EQ(num, "2");
-    perf_context.Reset();
-    Get(1, "k2");
-    ASSERT_EQ(2, (int) perf_context.get_from_memtable_count);
-    perf_context.Reset();
-    Get(1, "k3");
-    ASSERT_EQ(1, (int) perf_context.get_from_memtable_count);
-    perf_context.Reset();
-    Get(1, "k1");
-    ASSERT_EQ(3, (int) perf_context.get_from_memtable_count);
-
-    ASSERT_OK(Flush(1));
-    ASSERT_TRUE(dbfull()->GetProperty(handles_[1],
-                                      "rocksdb.num-immutable-mem-table", &num));
-    ASSERT_EQ(num, "0");
-    ASSERT_TRUE(dbfull()->GetProperty(
-        handles_[1], "rocksdb.cur-size-active-mem-table", &num));
-    // "200" is the size of the metadata of an empty skiplist, this would
-    // break if we change the default skiplist implementation
-    ASSERT_EQ(num, "200");
-
-    uint64_t int_num;
-    uint64_t base_total_size;
-    ASSERT_TRUE(dbfull()->GetIntProperty(
-        handles_[1], "rocksdb.estimate-num-keys", &base_total_size));
-
-    ASSERT_OK(dbfull()->Delete(writeOpt, handles_[1], "k2"));
-    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k3", ""));
-    ASSERT_OK(dbfull()->Delete(writeOpt, handles_[1], "k3"));
-    ASSERT_TRUE(dbfull()->GetIntProperty(
-        handles_[1], "rocksdb.num-deletes-active-mem-table", &int_num));
-    ASSERT_EQ(int_num, 2U);
-    ASSERT_TRUE(dbfull()->GetIntProperty(
-        handles_[1], "rocksdb.num-entries-active-mem-table", &int_num));
-    ASSERT_EQ(int_num, 3U);
-
-    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k2", big_value));
-    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k2", big_value));
-    ASSERT_TRUE(dbfull()->GetIntProperty(
-        handles_[1], "rocksdb.num-entries-imm-mem-tables", &int_num));
-    ASSERT_EQ(int_num, 4U);
-    ASSERT_TRUE(dbfull()->GetIntProperty(
-        handles_[1], "rocksdb.num-deletes-imm-mem-tables", &int_num));
-    ASSERT_EQ(int_num, 2U);
-
-    ASSERT_TRUE(dbfull()->GetIntProperty(
-        handles_[1], "rocksdb.estimate-num-keys", &int_num));
-    ASSERT_EQ(int_num, base_total_size + 1);
-
-    SetPerfLevel(kDisable);
-    ASSERT_TRUE(GetPerfLevel() == kDisable);
-  } while (ChangeCompactOptions());
-}
-
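// The introspection calls the test above polls (illustrative): GetProperty
// fills a string, while GetIntProperty parses integer-valued properties
// directly into a uint64_t.
std::string str_val;
uint64_t int_val = 0;
db_->GetProperty("rocksdb.num-immutable-mem-table", &str_val);
db_->GetIntProperty("rocksdb.estimate-num-keys", &int_val);
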
-class SleepingBackgroundTask {
- public:
-  SleepingBackgroundTask()
-      : bg_cv_(&mutex_), should_sleep_(true), done_with_sleep_(false) {}
-  void DoSleep() {
-    MutexLock l(&mutex_);
-    while (should_sleep_) {
-      bg_cv_.Wait();
-    }
-    done_with_sleep_ = true;
-    bg_cv_.SignalAll();
-  }
-  void WakeUp() {
-    MutexLock l(&mutex_);
-    should_sleep_ = false;
-    bg_cv_.SignalAll();
-  }
-  void WaitUntilDone() {
-    MutexLock l(&mutex_);
-    while (!done_with_sleep_) {
-      bg_cv_.Wait();
-    }
-  }
-
-  static void DoSleepTask(void* arg) {
-    reinterpret_cast<SleepingBackgroundTask*>(arg)->DoSleep();
-  }
-
- private:
-  port::Mutex mutex_;
-  port::CondVar bg_cv_;  // Signalled when background work finishes
-  bool should_sleep_;
-  bool done_with_sleep_;
-};
-
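// How the tests below use the helper above (illustrative): with a pool size
// of one, parking a single sleeping task blocks all flushes (HIGH pool) or
// all compactions (LOW pool) until the test wakes it up.
SleepingBackgroundTask task;
env_->SetBackgroundThreads(1, Env::HIGH);
env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &task,
               Env::Priority::HIGH);
// ... assert intermediate state while flushes are blocked ...
task.WakeUp();
task.WaitUntilDone();
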
-TEST_F(DBTest, FlushEmptyColumnFamily) {
-  // Block flush thread and disable compaction thread
-  env_->SetBackgroundThreads(1, Env::HIGH);
-  env_->SetBackgroundThreads(1, Env::LOW);
-  SleepingBackgroundTask sleeping_task_low;
-  env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
-                 Env::Priority::LOW);
-  SleepingBackgroundTask sleeping_task_high;
-  env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_high,
-                 Env::Priority::HIGH);
-
-  Options options = CurrentOptions();
-  // disable compaction
-  options.disable_auto_compactions = true;
-  WriteOptions writeOpt = WriteOptions();
-  writeOpt.disableWAL = true;
-  options.max_write_buffer_number = 2;
-  options.min_write_buffer_number_to_merge = 1;
-  CreateAndReopenWithCF({"pikachu"}, options);
-
-  // Flush can still go through even if no thread is available to flush the
-  // memtable, because flushing an empty column family is a no-op.
-  ASSERT_OK(Flush(0));
-  ASSERT_OK(Flush(1));
-
-  // Insert can go through
-  ASSERT_OK(dbfull()->Put(writeOpt, handles_[0], "foo", "v1"));
-  ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1"));
-
-  ASSERT_EQ("v1", Get(0, "foo"));
-  ASSERT_EQ("v1", Get(1, "bar"));
-
-  sleeping_task_high.WakeUp();
-  sleeping_task_high.WaitUntilDone();
-
-  // Flush can still go through.
-  ASSERT_OK(Flush(0));
-  ASSERT_OK(Flush(1));
-
-  sleeping_task_low.WakeUp();
-  sleeping_task_low.WaitUntilDone();
-}
-
-TEST_F(DBTest, GetProperty) {
-  // Set the size of both background thread pools to 1 and block them.
-  env_->SetBackgroundThreads(1, Env::HIGH);
-  env_->SetBackgroundThreads(1, Env::LOW);
-  SleepingBackgroundTask sleeping_task_low;
-  env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
-                 Env::Priority::LOW);
-  SleepingBackgroundTask sleeping_task_high;
-  env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_high,
-                 Env::Priority::HIGH);
-
-  Options options = CurrentOptions();
-  WriteOptions writeOpt = WriteOptions();
-  writeOpt.disableWAL = true;
-  options.compaction_style = kCompactionStyleUniversal;
-  options.level0_file_num_compaction_trigger = 1;
-  options.compaction_options_universal.size_ratio = 50;
-  options.max_background_compactions = 1;
-  options.max_background_flushes = 1;
-  options.max_write_buffer_number = 10;
-  options.min_write_buffer_number_to_merge = 1;
-  options.write_buffer_size = 1000000;
-  Reopen(options);
-
-  std::string big_value(1000000 * 2, 'x');
-  std::string num;
-  uint64_t int_num;
-  SetPerfLevel(kEnableTime);
-
-  ASSERT_TRUE(
-      dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num));
-  ASSERT_EQ(int_num, 0U);
-
-  ASSERT_OK(dbfull()->Put(writeOpt, "k1", big_value));
-  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num));
-  ASSERT_EQ(num, "0");
-  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.mem-table-flush-pending", &num));
-  ASSERT_EQ(num, "0");
-  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.compaction-pending", &num));
-  ASSERT_EQ(num, "0");
-  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.estimate-num-keys", &num));
-  ASSERT_EQ(num, "1");
-  perf_context.Reset();
-
-  ASSERT_OK(dbfull()->Put(writeOpt, "k2", big_value));
-  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num));
-  ASSERT_EQ(num, "1");
-  ASSERT_OK(dbfull()->Delete(writeOpt, "k-non-existing"));
-  ASSERT_OK(dbfull()->Put(writeOpt, "k3", big_value));
-  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num));
-  ASSERT_EQ(num, "2");
-  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.mem-table-flush-pending", &num));
-  ASSERT_EQ(num, "1");
-  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.compaction-pending", &num));
-  ASSERT_EQ(num, "0");
-  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.estimate-num-keys", &num));
-  ASSERT_EQ(num, "2");
-  // Verify the same set of properties through GetIntProperty
-  ASSERT_TRUE(
-      dbfull()->GetIntProperty("rocksdb.num-immutable-mem-table", &int_num));
-  ASSERT_EQ(int_num, 2U);
-  ASSERT_TRUE(
-      dbfull()->GetIntProperty("rocksdb.mem-table-flush-pending", &int_num));
-  ASSERT_EQ(int_num, 1U);
-  ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.compaction-pending", &int_num));
-  ASSERT_EQ(int_num, 0U);
-  ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.estimate-num-keys", &int_num));
-  ASSERT_EQ(int_num, 2U);
-
-  ASSERT_TRUE(
-      dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num));
-  ASSERT_EQ(int_num, 0U);
-
-  sleeping_task_high.WakeUp();
-  sleeping_task_high.WaitUntilDone();
-  dbfull()->TEST_WaitForFlushMemTable();
-
-  ASSERT_OK(dbfull()->Put(writeOpt, "k4", big_value));
-  ASSERT_OK(dbfull()->Put(writeOpt, "k5", big_value));
-  dbfull()->TEST_WaitForFlushMemTable();
-  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.mem-table-flush-pending", &num));
-  ASSERT_EQ(num, "0");
-  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.compaction-pending", &num));
-  ASSERT_EQ(num, "1");
-  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.estimate-num-keys", &num));
-  ASSERT_EQ(num, "4");
-
-  ASSERT_TRUE(
-      dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num));
-  ASSERT_GT(int_num, 0U);
-
-  sleeping_task_low.WakeUp();
-  sleeping_task_low.WaitUntilDone();
-
-  dbfull()->TEST_WaitForFlushMemTable();
-  options.max_open_files = 10;
-  Reopen(options);
-  // After reopening, no table reader is loaded, so no memory is used for
-  // table readers.
-  ASSERT_TRUE(
-      dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num));
-  ASSERT_EQ(int_num, 0U);
-  ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.estimate-num-keys", &int_num));
-  ASSERT_GT(int_num, 0U);
-
-  // After reading a key, at least one table reader is loaded.
-  Get("k5");
-  ASSERT_TRUE(
-      dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num));
-  ASSERT_GT(int_num, 0U);
-
-  // Test rocksdb.num-live-versions
-  {
-    options.level0_file_num_compaction_trigger = 20;
-    Reopen(options);
-    ASSERT_TRUE(
-        dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num));
-    ASSERT_EQ(int_num, 1U);
-
-    // Use an iterator to hold current version
-    std::unique_ptr<Iterator> iter1(dbfull()->NewIterator(ReadOptions()));
-
-    ASSERT_OK(dbfull()->Put(writeOpt, "k6", big_value));
-    Flush();
-    ASSERT_TRUE(
-        dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num));
-    ASSERT_EQ(int_num, 2U);
-
-    // Use an iterator to hold current version
-    std::unique_ptr<Iterator> iter2(dbfull()->NewIterator(ReadOptions()));
-
-    ASSERT_OK(dbfull()->Put(writeOpt, "k7", big_value));
-    Flush();
-    ASSERT_TRUE(
-        dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num));
-    ASSERT_EQ(int_num, 3U);
-
-    iter2.reset();
-    ASSERT_TRUE(
-        dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num));
-    ASSERT_EQ(int_num, 2U);
-
-    iter1.reset();
-    ASSERT_TRUE(
-        dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num));
-    ASSERT_EQ(int_num, 1U);
-  }
-}
-
-TEST_F(DBTest, FLUSH) {
-  do {
-    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
-    WriteOptions writeOpt = WriteOptions();
-    writeOpt.disableWAL = true;
-    SetPerfLevel(kEnableTime);
-    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1"));
-    // this will now also flush the last 2 writes
-    ASSERT_OK(Flush(1));
-    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1"));
-
-    perf_context.Reset();
-    Get(1, "foo");
-    ASSERT_TRUE((int) perf_context.get_from_output_files_time > 0);
-
-    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
-    ASSERT_EQ("v1", Get(1, "foo"));
-    ASSERT_EQ("v1", Get(1, "bar"));
-
-    writeOpt.disableWAL = true;
-    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v2"));
-    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v2"));
-    ASSERT_OK(Flush(1));
-
-    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
-    ASSERT_EQ("v2", Get(1, "bar"));
-    perf_context.Reset();
-    ASSERT_EQ("v2", Get(1, "foo"));
-    ASSERT_TRUE((int) perf_context.get_from_output_files_time > 0);
-
-    writeOpt.disableWAL = false;
-    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v3"));
-    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v3"));
-    ASSERT_OK(Flush(1));
-
-    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
-    // 'foo' should be there because its put
-    // has WAL enabled.
-    ASSERT_EQ("v3", Get(1, "foo"));
-    ASSERT_EQ("v3", Get(1, "bar"));
-
-    SetPerfLevel(kDisable);
-  } while (ChangeCompactOptions());
-}
-
-TEST_F(DBTest, RecoveryWithEmptyLog) {
-  do {
-    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
-    ASSERT_OK(Put(1, "foo", "v1"));
-    ASSERT_OK(Put(1, "foo", "v2"));
-    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
-    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
-    ASSERT_OK(Put(1, "foo", "v3"));
-    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
-    ASSERT_EQ("v3", Get(1, "foo"));
-  } while (ChangeOptions());
-}
-
-// Check that writes done during a memtable compaction are recovered
-// if the database is shutdown during the memtable compaction.
-TEST_F(DBTest, RecoverDuringMemtableCompaction) {
-  do {
-    Options options;
-    options.env = env_;
-    options.write_buffer_size = 1000000;
-    options = CurrentOptions(options);
-    CreateAndReopenWithCF({"pikachu"}, options);
-
-    // Trigger a long memtable compaction and reopen the database during it
-    ASSERT_OK(Put(1, "foo", "v1"));  // Goes to 1st log file
-    ASSERT_OK(Put(1, "big1", std::string(10000000, 'x')));  // Fills memtable
-    ASSERT_OK(Put(1, "big2", std::string(1000, 'y')));  // Triggers compaction
-    ASSERT_OK(Put(1, "bar", "v2"));                     // Goes to new log file
-
-    ReopenWithColumnFamilies({"default", "pikachu"}, options);
-    ASSERT_EQ("v1", Get(1, "foo"));
-    ASSERT_EQ("v2", Get(1, "bar"));
-    ASSERT_EQ(std::string(10000000, 'x'), Get(1, "big1"));
-    ASSERT_EQ(std::string(1000, 'y'), Get(1, "big2"));
-  } while (ChangeOptions());
-}
-
-// false positive TSAN report on shared_ptr --
-// https://groups.google.com/forum/#!topic/thread-sanitizer/vz_s-t226Vg
-#ifndef ROCKSDB_TSAN_RUN
-TEST_F(DBTest, FlushSchedule) {
-  Options options = CurrentOptions();
-  options.disable_auto_compactions = true;
-  options.level0_stop_writes_trigger = 1 << 10;
-  options.level0_slowdown_writes_trigger = 1 << 10;
-  options.min_write_buffer_number_to_merge = 1;
-  options.max_write_buffer_number = 2;
-  options.write_buffer_size = 100 * 1000;
-  CreateAndReopenWithCF({"pikachu"}, options);
-  std::vector<std::thread> threads;
-
-  std::atomic<int> thread_num(0);
-  // Each column family will have 5 threads, each thread generating 2
-  // memtables. Each column family should end up with 10 table files.
-  for (int i = 0; i < 10; ++i) {
-    threads.emplace_back([&]() {
-      int a = thread_num.fetch_add(1);
-      Random rnd(a);
-      WriteOptions wo;
-      // this should fill up 2 memtables
-      for (int k = 0; k < 5000; ++k) {
-        ASSERT_OK(db_->Put(wo, handles_[a & 1], RandomString(&rnd, 13), ""));
-      }
-    });
-  }
-
-  for (auto& t : threads) {
-    t.join();
-  }
-
-  auto default_tables = GetNumberOfSstFilesForColumnFamily(db_, "default");
-  auto pikachu_tables = GetNumberOfSstFilesForColumnFamily(db_, "pikachu");
-  ASSERT_LE(default_tables, static_cast<uint64_t>(10));
-  ASSERT_GT(default_tables, static_cast<uint64_t>(0));
-  ASSERT_LE(pikachu_tables, static_cast<uint64_t>(10));
-  ASSERT_GT(pikachu_tables, static_cast<uint64_t>(0));
-}
-#endif  // !defined(ROCKSDB_TSAN_RUN)
-
-TEST_F(DBTest, MinorCompactionsHappen) {
-  do {
-    Options options;
-    options.write_buffer_size = 10000;
-    options = CurrentOptions(options);
-    CreateAndReopenWithCF({"pikachu"}, options);
-
-    const int N = 500;
-
-    int starting_num_tables = TotalTableFiles(1);
-    for (int i = 0; i < N; i++) {
-      ASSERT_OK(Put(1, Key(i), Key(i) + std::string(1000, 'v')));
-    }
-    int ending_num_tables = TotalTableFiles(1);
-    ASSERT_GT(ending_num_tables, starting_num_tables);
-
-    for (int i = 0; i < N; i++) {
-      ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(1, Key(i)));
-    }
-
-    ReopenWithColumnFamilies({"default", "pikachu"}, options);
-
-    for (int i = 0; i < N; i++) {
-      ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(1, Key(i)));
-    }
-  } while (ChangeCompactOptions());
-}
-
-TEST_F(DBTest, ManifestRollOver) {
-  do {
-    Options options;
-    options.max_manifest_file_size = 10;  // 10 bytes
-    options = CurrentOptions(options);
-    CreateAndReopenWithCF({"pikachu"}, options);
-    {
-      ASSERT_OK(Put(1, "manifest_key1", std::string(1000, '1')));
-      ASSERT_OK(Put(1, "manifest_key2", std::string(1000, '2')));
-      ASSERT_OK(Put(1, "manifest_key3", std::string(1000, '3')));
-      uint64_t manifest_before_flush = dbfull()->TEST_Current_Manifest_FileNo();
-      ASSERT_OK(Flush(1));  // This should trigger LogAndApply.
-      uint64_t manifest_after_flush = dbfull()->TEST_Current_Manifest_FileNo();
-      ASSERT_GT(manifest_after_flush, manifest_before_flush);
-      ReopenWithColumnFamilies({"default", "pikachu"}, options);
-      ASSERT_GT(dbfull()->TEST_Current_Manifest_FileNo(), manifest_after_flush);
-      // Verify that the data survived the manifest rollover.
-      ASSERT_EQ(std::string(1000, '1'), Get(1, "manifest_key1"));
-      ASSERT_EQ(std::string(1000, '2'), Get(1, "manifest_key2"));
-      ASSERT_EQ(std::string(1000, '3'), Get(1, "manifest_key3"));
-    }
-  } while (ChangeCompactOptions());
-}
-
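// The rollover trigger used above (illustrative): once the MANIFEST grows
// past max_manifest_file_size bytes, the next LogAndApply switches to a
// fresh manifest file, which the test observes via the manifest file number.
Options opts = CurrentOptions();
opts.max_manifest_file_size = 10;  // absurdly small, to force frequent rolls
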
-TEST_F(DBTest, IdentityAcrossRestarts) {
-  do {
-    std::string id1;
-    ASSERT_OK(db_->GetDbIdentity(id1));
-
-    Options options = CurrentOptions();
-    Reopen(options);
-    std::string id2;
-    ASSERT_OK(db_->GetDbIdentity(id2));
-    // id1 should match id2 because identity was not regenerated
-    ASSERT_EQ(id1.compare(id2), 0);
-
-    std::string idfilename = IdentityFileName(dbname_);
-    ASSERT_OK(env_->DeleteFile(idfilename));
-    Reopen(options);
-    std::string id3;
-    ASSERT_OK(db_->GetDbIdentity(id3));
-    // id1 should NOT match id3 because identity was regenerated
-    ASSERT_NE(id1.compare(id3), 0);
-  } while (ChangeCompactOptions());
-}
-
-TEST_F(DBTest, RecoverWithLargeLog) {
-  do {
-    {
-      Options options = CurrentOptions();
-      CreateAndReopenWithCF({"pikachu"}, options);
-      ASSERT_OK(Put(1, "big1", std::string(200000, '1')));
-      ASSERT_OK(Put(1, "big2", std::string(200000, '2')));
-      ASSERT_OK(Put(1, "small3", std::string(10, '3')));
-      ASSERT_OK(Put(1, "small4", std::string(10, '4')));
-      ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
-    }
-
-    // Make sure that if we re-open with a small write buffer size,
-    // we flush table files in the middle of a large log file.
-    Options options;
-    options.write_buffer_size = 100000;
-    options = CurrentOptions(options);
-    ReopenWithColumnFamilies({"default", "pikachu"}, options);
-    ASSERT_EQ(NumTableFilesAtLevel(0, 1), 3);
-    ASSERT_EQ(std::string(200000, '1'), Get(1, "big1"));
-    ASSERT_EQ(std::string(200000, '2'), Get(1, "big2"));
-    ASSERT_EQ(std::string(10, '3'), Get(1, "small3"));
-    ASSERT_EQ(std::string(10, '4'), Get(1, "small4"));
-    ASSERT_GT(NumTableFilesAtLevel(0, 1), 1);
-  } while (ChangeCompactOptions());
-}
-
-TEST_F(DBTest, CompactionsGenerateMultipleFiles) {
-  Options options;
-  options.write_buffer_size = 100000000;        // Large write buffer
-  options = CurrentOptions(options);
-  CreateAndReopenWithCF({"pikachu"}, options);
-
-  Random rnd(301);
-
-  // Write 8MB (80 values, each 100K)
-  ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
-  std::vector<std::string> values;
-  for (int i = 0; i < 80; i++) {
-    values.push_back(RandomString(&rnd, 100000));
-    ASSERT_OK(Put(1, Key(i), values[i]));
-  }
-
-  // Reopening moves updates to level-0
-  ReopenWithColumnFamilies({"default", "pikachu"}, options);
-  dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
-
-  ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
-  ASSERT_GT(NumTableFilesAtLevel(1, 1), 1);
-  for (int i = 0; i < 80; i++) {
-    ASSERT_EQ(Get(1, Key(i)), values[i]);
-  }
-}
-
-TEST_F(DBTest, CompactionTrigger) {
-  Options options;
-  options.write_buffer_size = 100 << 10;  // 100KB
-  options.num_levels = 3;
-  options.max_mem_compaction_level = 0;
-  options.level0_file_num_compaction_trigger = 3;
-  options = CurrentOptions(options);
-  CreateAndReopenWithCF({"pikachu"}, options);
-
-  Random rnd(301);
-
-  for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
-       num++) {
-    std::vector<std::string> values;
-    // Write 120KB (12 values, each 10K)
-    for (int i = 0; i < 12; i++) {
-      values.push_back(RandomString(&rnd, 10000));
-      ASSERT_OK(Put(1, Key(i), values[i]));
-    }
-    dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
-    ASSERT_EQ(NumTableFilesAtLevel(0, 1), num + 1);
-  }
-
-  // Generate one more file in level-0; this should trigger level-0 compaction.
-  std::vector<std::string> values;
-  for (int i = 0; i < 12; i++) {
-    values.push_back(RandomString(&rnd, 10000));
-    ASSERT_OK(Put(1, Key(i), values[i]));
-  }
-  dbfull()->TEST_WaitForCompact();
-
-  ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
-  ASSERT_EQ(NumTableFilesAtLevel(1, 1), 1);
-}
-
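// The arithmetic behind CompactionTrigger above (illustrative): small
// memtables produce one level-0 file per flush, and once level-0 holds
// level0_file_num_compaction_trigger files, an L0->L1 compaction runs and
// empties level 0.
Options opts = CurrentOptions();
opts.write_buffer_size = 100 << 10;           // ~100KB memtables
opts.level0_file_num_compaction_trigger = 3;  // compact at 3 L0 files
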
-namespace {
-static const int kCDTValueSize = 1000;
-static const int kCDTKeysPerBuffer = 4;
-static const int kCDTNumLevels = 8;
-Options DeletionTriggerOptions() {
-  Options options;
-  options.compression = kNoCompression;
-  options.write_buffer_size = kCDTKeysPerBuffer * (kCDTValueSize + 24);
-  options.min_write_buffer_number_to_merge = 1;
-  options.num_levels = kCDTNumLevels;
-  options.max_mem_compaction_level = 0;
-  options.level0_file_num_compaction_trigger = 1;
-  options.target_file_size_base = options.write_buffer_size * 2;
-  options.target_file_size_multiplier = 2;
-  options.max_bytes_for_level_base =
-      options.target_file_size_base * options.target_file_size_multiplier;
-  options.max_bytes_for_level_multiplier = 2;
-  options.disable_auto_compactions = false;
-  return options;
-}
-}  // anonymous namespace
-
-TEST_F(DBTest, CompactionDeletionTrigger) {
-  for (int tid = 0; tid < 2; ++tid) {
-    uint64_t db_size[2];
-    Options options = CurrentOptions(DeletionTriggerOptions());
-
-    if (tid == 1) {
-      // second pass with universal compaction
-      options.compaction_style = kCompactionStyleUniversal;
-      options.num_levels = 1;
-    }
-
-    DestroyAndReopen(options);
-    Random rnd(301);
-
-    const int kTestSize = kCDTKeysPerBuffer * 512;
-    std::vector<std::string> values;
-    for (int k = 0; k < kTestSize; ++k) {
-      values.push_back(RandomString(&rnd, kCDTValueSize));
-      ASSERT_OK(Put(Key(k), values[k]));
-    }
-    dbfull()->TEST_WaitForFlushMemTable();
-    dbfull()->TEST_WaitForCompact();
-    db_size[0] = Size(Key(0), Key(kTestSize - 1));
-
-    for (int k = 0; k < kTestSize; ++k) {
-      ASSERT_OK(Delete(Key(k)));
-    }
-    dbfull()->TEST_WaitForFlushMemTable();
-    dbfull()->TEST_WaitForCompact();
-    db_size[1] = Size(Key(0), Key(kTestSize - 1));
-
-    // The db size must now be much smaller.
-    ASSERT_GT(db_size[0] / 3, db_size[1]);
-  }
-}
-
-TEST_F(DBTest, CompactionDeletionTriggerReopen) {
-  for (int tid = 0; tid < 2; ++tid) {
-    uint64_t db_size[3];
-    Options options = CurrentOptions(DeletionTriggerOptions());
-
-    if (tid == 1) {
-      // second pass with universal compaction
-      options.compaction_style = kCompactionStyleUniversal;
-      options.num_levels = 1;
-    }
-
-    DestroyAndReopen(options);
-    Random rnd(301);
-
-    // round 1 --- insert key/value pairs.
-    const int kTestSize = kCDTKeysPerBuffer * 512;
-    std::vector<std::string> values;
-    for (int k = 0; k < kTestSize; ++k) {
-      values.push_back(RandomString(&rnd, kCDTValueSize));
-      ASSERT_OK(Put(Key(k), values[k]));
-    }
-    dbfull()->TEST_WaitForFlushMemTable();
-    dbfull()->TEST_WaitForCompact();
-    db_size[0] = Size(Key(0), Key(kTestSize - 1));
-    Close();
-
-    // round 2 --- disable auto-compactions and issue deletions.
-    options.create_if_missing = false;
-    options.disable_auto_compactions = true;
-    Reopen(options);
-
-    for (int k = 0; k < kTestSize; ++k) {
-      ASSERT_OK(Delete(Key(k)));
-    }
-    db_size[1] = Size(Key(0), Key(kTestSize - 1));
-    Close();
-    // As auto-compaction is off, we shouldn't see much reduction
-    // in db size.
-    ASSERT_LT(db_size[0] / 3, db_size[1]);
-
-    // round 3 --- reopen db with auto_compaction on and see if
-    // deletion compensation still works.
-    options.disable_auto_compactions = false;
-    Reopen(options);
-    // Insert a relatively small amount of data to trigger auto compaction.
-    for (int k = 0; k < kTestSize / 10; ++k) {
-      ASSERT_OK(Put(Key(k), values[k]));
-    }
-    dbfull()->TEST_WaitForFlushMemTable();
-    dbfull()->TEST_WaitForCompact();
-    db_size[2] = Size(Key(0), Key(kTestSize - 1));
-    // This time we expect a significant drop in size.
-    ASSERT_GT(db_size[0] / 3, db_size[2]);
-  }
-}
-
-// Static state and filters used for filtering
-// key/value pairs during the compaction process.
-static int cfilter_count;
-static std::string NEW_VALUE = "NewValue";
-
-class KeepFilter : public CompactionFilter {
- public:
-  virtual bool Filter(int level, const Slice& key, const Slice& value,
-                      std::string* new_value, bool* value_changed) const
-      override {
-    cfilter_count++;
-    return false;
-  }
-
-  virtual const char* Name() const override { return "KeepFilter"; }
-};
-
-class DeleteFilter : public CompactionFilter {
- public:
-  virtual bool Filter(int level, const Slice& key, const Slice& value,
-                      std::string* new_value, bool* value_changed) const
-      override {
-    cfilter_count++;
-    return true;
-  }
-
-  virtual const char* Name() const override { return "DeleteFilter"; }
-};
-
-class DelayFilter : public CompactionFilter {
- public:
-  explicit DelayFilter(DBTest* d) : db_test(d) {}
-  virtual bool Filter(int level, const Slice& key, const Slice& value,
-                      std::string* new_value,
-                      bool* value_changed) const override {
-    db_test->env_->addon_time_ += 1000;
-    return true;
-  }
-
-  virtual const char* Name() const override { return "DelayFilter"; }
-
- private:
-  DBTest* db_test;
-};
-
-class ConditionalFilter : public CompactionFilter {
- public:
-  explicit ConditionalFilter(const std::string* filtered_value)
-      : filtered_value_(filtered_value) {}
-  virtual bool Filter(int level, const Slice& key, const Slice& value,
-                      std::string* new_value,
-                      bool* value_changed) const override {
-    return value.ToString() == *filtered_value_;
-  }
-
-  virtual const char* Name() const override { return "ConditionalFilter"; }
-
- private:
-  const std::string* filtered_value_;
-};
-
-class ChangeFilter : public CompactionFilter {
- public:
-  explicit ChangeFilter() {}
-
-  virtual bool Filter(int level, const Slice& key, const Slice& value,
-                      std::string* new_value, bool* value_changed) const
-      override {
-    assert(new_value != nullptr);
-    *new_value = NEW_VALUE;
-    *value_changed = true;
-    return false;
-  }
-
-  virtual const char* Name() const override { return "ChangeFilter"; }
-};
-
-class KeepFilterFactory : public CompactionFilterFactory {
- public:
-  explicit KeepFilterFactory(bool check_context = false)
-      : check_context_(check_context) {}
-
-  virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
-      const CompactionFilter::Context& context) override {
-    if (check_context_) {
-      EXPECT_EQ(expect_full_compaction_.load(), context.is_full_compaction);
-      EXPECT_EQ(expect_manual_compaction_.load(), context.is_manual_compaction);
-    }
-    return std::unique_ptr<CompactionFilter>(new KeepFilter());
-  }
-
-  virtual const char* Name() const override { return "KeepFilterFactory"; }
-  bool check_context_;
-  std::atomic_bool expect_full_compaction_;
-  std::atomic_bool expect_manual_compaction_;
-};
-
-class DeleteFilterFactory : public CompactionFilterFactory {
- public:
-  virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
-      const CompactionFilter::Context& context) override {
-    if (context.is_manual_compaction) {
-      return std::unique_ptr<CompactionFilter>(new DeleteFilter());
-    } else {
-      return std::unique_ptr<CompactionFilter>(nullptr);
-    }
-  }
-
-  virtual const char* Name() const override { return "DeleteFilterFactory"; }
-};
-
-class DelayFilterFactory : public CompactionFilterFactory {
- public:
-  explicit DelayFilterFactory(DBTest* d) : db_test(d) {}
-  virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
-      const CompactionFilter::Context& context) override {
-    return std::unique_ptr<CompactionFilter>(new DelayFilter(db_test));
-  }
-
-  virtual const char* Name() const override { return "DelayFilterFactory"; }
-
- private:
-  DBTest* db_test;
-};
-
-class ConditionalFilterFactory : public CompactionFilterFactory {
- public:
-  explicit ConditionalFilterFactory(const Slice& filtered_value)
-      : filtered_value_(filtered_value.ToString()) {}
-
-  virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
-      const CompactionFilter::Context& context) override {
-    return std::unique_ptr<CompactionFilter>(
-        new ConditionalFilter(&filtered_value_));
-  }
-
-  virtual const char* Name() const override {
-    return "ConditionalFilterFactory";
-  }
-
- private:
-  std::string filtered_value_;
-};
-
-class ChangeFilterFactory : public CompactionFilterFactory {
- public:
-  explicit ChangeFilterFactory() {}
-
-  virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
-      const CompactionFilter::Context& context) override {
-    return std::unique_ptr<CompactionFilter>(new ChangeFilter());
-  }
-
-  virtual const char* Name() const override { return "ChangeFilterFactory"; }
-};
-
-class DBTestUniversalCompactionBase
-    : public DBTest,
-      public ::testing::WithParamInterface<int> {
- public:
-  virtual void SetUp() override { num_levels_ = GetParam(); }
-  int num_levels_;
-};
-
-class DBTestUniversalCompaction : public DBTestUniversalCompactionBase {};
-
-// TODO(kailiu) The tests on UniversalCompaction have some issues:
-//  1. A lot of magic numbers ("11" or "12").
-//  2. They assume specific memtable flush conditions, which may change
-//     from time to time.
-TEST_P(DBTestUniversalCompaction, UniversalCompactionTrigger) {
-  Options options;
-  options.compaction_style = kCompactionStyleUniversal;
-  options.num_levels = num_levels_;
-  options.write_buffer_size = 100 << 10;     // 100KB
-  options.target_file_size_base = 32 << 10;  // 32KB
-  // trigger compaction if there are >= 4 files
-  options.level0_file_num_compaction_trigger = 4;
-  KeepFilterFactory* filter = new KeepFilterFactory(true);
-  filter->expect_manual_compaction_.store(false);
-  options.compaction_filter_factory.reset(filter);
-
-  options = CurrentOptions(options);
-  DestroyAndReopen(options);
-  CreateAndReopenWithCF({"pikachu"}, options);
-
-  rocksdb::SyncPoint::GetInstance()->SetCallBack(
-      "DBTestWritableFile.GetPreallocationStatus", [&](void* arg) {
-        ASSERT_TRUE(arg != nullptr);
-        size_t preallocation_size = *(static_cast<size_t*>(arg));
-        if (num_levels_ > 3) {
-          ASSERT_LE(preallocation_size, options.target_file_size_base * 1.1);
-        }
-      });
-  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
-
-  Random rnd(301);
-  int key_idx = 0;
-
-  filter->expect_full_compaction_.store(true);
-  // Stage 1:
-  //   Generate a set of files at level 0, but don't trigger level-0
-  //   compaction.
-  for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
-       num++) {
-    // Write 120KB (12 values, each 10K)
-    for (int i = 0; i < 12; i++) {
-      ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 10000)));
-      key_idx++;
-    }
-    dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
-    ASSERT_EQ(NumSortedRuns(1), num + 1);
-  }
-
-  // Generate one more file at level-0, which should trigger level-0
-  // compaction.
-  for (int i = 0; i < 11; i++) {
-    ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 10000)));
-    key_idx++;
-  }
-  dbfull()->TEST_WaitForCompact();
-  // Suppose each file flushed from the memtable has size 1. Now we compact
-  // level0_file_num_compaction_trigger=4 files and should get one big
-  // file of size 4.
-  ASSERT_EQ(NumSortedRuns(1), 1);
-
-  // Stage 2:
-  //   Now we have one file at level 0, with size 4. We also have some data in
-  //   mem table. Let's continue generating new files at level 0, but don't
-  //   trigger level-0 compaction.
-  //   First, clean up the memtable before inserting new data. This will
-  //   generate a level-0 file of size around 0.4 (based on the amount of
-  //   data previously written).
-  filter->expect_full_compaction_.store(false);
-  ASSERT_OK(Flush(1));
-  for (int num = 0; num < options.level0_file_num_compaction_trigger - 3;
-       num++) {
-    // Write 110KB (11 values, each 10K)
-    for (int i = 0; i < 11; i++) {
-      ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 10000)));
-      key_idx++;
-    }
-    dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
-    ASSERT_EQ(NumSortedRuns(1), num + 3);
-  }
-
-  // Generate one more file at level-0, which should trigger level-0
-  // compaction.
-  for (int i = 0; i < 11; i++) {
-    ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 10000)));
-    key_idx++;
-  }
-  dbfull()->TEST_WaitForCompact();
-  // Before compaction, we have 4 files at level 0, with size 4, 0.4, 1, 1.
-  // After compaction, we should have 2 files, with size 4, 2.4.
-  ASSERT_EQ(NumSortedRuns(1), 2);
-
-  // Stage 3:
-  //   Now we have 2 files at level 0, with size 4 and 2.4. Continue
-  //   generating new files at level 0.
-  for (int num = 0; num < options.level0_file_num_compaction_trigger - 3;
-       num++) {
-    // Write 110KB (11 values, each 10K)
-    for (int i = 0; i < 11; i++) {
-      ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 10000)));
-      key_idx++;
-    }
-    dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
-    ASSERT_EQ(NumSortedRuns(1), num + 3);
-  }
-
-  // Generate one more file at level-0, which should trigger level-0
-  // compaction.
-  for (int i = 0; i < 12; i++) {
-    ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 10000)));
-    key_idx++;
-  }
-  dbfull()->TEST_WaitForCompact();
-  // Before compaction, we have 4 files at level 0, with size 4, 2.4, 1, 1.
-  // After compaction, we should have 3 files, with size 4, 2.4, 2.
-  ASSERT_EQ(NumSortedRuns(1), 3);
-
-  // Stage 4:
-  //   Now we have 3 files at level 0, with size 4, 2.4, 2. Let's generate a
-  //   new file of size 1.
-  for (int i = 0; i < 11; i++) {
-    ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 10000)));
-    key_idx++;
-  }
-  dbfull()->TEST_WaitForCompact();
-  // Level-0 compaction is triggered, but no file will be picked up.
-  ASSERT_EQ(NumSortedRuns(1), 4);
-
-  // Stage 5:
-  //   Now we have 4 files at level 0, with size 4, 2.4, 2, 1. Let's generate
-  //   a new file of size 1.
-  filter->expect_full_compaction_.store(true);
-  for (int i = 0; i < 11; i++) {
-    ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 10000)));
-    key_idx++;
-  }
-  dbfull()->TEST_WaitForCompact();
-  // All files at level 0 will be compacted into a single one.
-  ASSERT_EQ(NumSortedRuns(1), 1);
-
-  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
-}
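-
-// Simplified sketch of the size-ratio rule the stages above rely on (an
-// assumption about the picker, not the actual implementation): starting
-// from the newest sorted run, keep absorbing the next run while the
-// accumulated candidate is at least as large as it, within size_ratio%.
-static size_t PickRunsBySizeRatio(const std::vector<uint64_t>& runs,
-                                  unsigned size_ratio) {
-  if (runs.empty()) return 0;
-  uint64_t candidate_size = runs[0];  // runs[0] is the newest sorted run
-  size_t picked = 1;
-  while (picked < runs.size() &&
-         candidate_size * (100 + size_ratio) / 100 >= runs[picked]) {
-    candidate_size += runs[picked];
-    ++picked;
-  }
-  return picked;
-}
-// E.g. stage 4 above, newest first and scaled by 10: {10, 20, 24, 40}
-// picks only one run, so no compaction happens; {10, 10, 10, 10} picks
-// all four, matching stage 1.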
-
-TEST_P(DBTestUniversalCompaction, UniversalCompactionSizeAmplification) {
-  Options options;
-  options.compaction_style = kCompactionStyleUniversal;
-  options.num_levels = num_levels_;
-  options.write_buffer_size = 100 << 10;     // 100KB
-  options.target_file_size_base = 32 << 10;  // 32KB
-  options.level0_file_num_compaction_trigger = 3;
-  options = CurrentOptions(options);
-  DestroyAndReopen(options);
-  CreateAndReopenWithCF({"pikachu"}, options);
-
-  // Trigger compaction if size amplification exceeds 110%
-  options.compaction_options_universal.max_size_amplification_percent = 110;
-  options = CurrentOptions(options);
-  ReopenWithColumnFamilies({"default", "pikachu"}, options);
-
-  Random rnd(301);
-  int key_idx = 0;
-
-  //   Generate two files in Level 0. Both files are approx the same size.
-  for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
-       num++) {
-    // Write 110KB (11 values, each 10K)
-    for (int i = 0; i < 11; i++) {
-      ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 10000)));
-      key_idx++;
-    }
-    dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
-    ASSERT_EQ(NumSortedRuns(1), num + 1);
-  }
-  ASSERT_EQ(NumSortedRuns(1), 2);
-
-  // Flush whatever remains in the memtable. This is typically small and
-  // should not trigger size-ratio-based compaction, but will instead
-  // trigger size-amplification-based compaction.
-  ASSERT_OK(Flush(1));
-
-  dbfull()->TEST_WaitForCompact();
-
-  // Verify that size amplification did occur
-  ASSERT_EQ(NumSortedRuns(1), 1);
-}
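-
-// The trigger above can be checked by hand with the (assumed, simplified)
-// size-amplification estimate:
-//   amp% ~= 100 * (total size of runs newer than the oldest) / (oldest run)
-// Two full runs of ~1 plus a small flushed run s give roughly
-// 100 * (1 + s) / 1, which crosses the 110% threshold once the leftover
-// flush is about a tenth of a run, so everything collapses into one run.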
-
-class DBTestUniversalCompactionMultiLevels
-    : public DBTestUniversalCompactionBase {};
-
-TEST_P(DBTestUniversalCompactionMultiLevels, UniversalCompactionMultiLevels) {
-  Options options;
-  options.compaction_style = kCompactionStyleUniversal;
-  options.num_levels = num_levels_;
-  options.write_buffer_size = 100 << 10;  // 100KB
-  options.level0_file_num_compaction_trigger = 8;
-  options.max_background_compactions = 3;
-  options.target_file_size_base = 32 * 1024;
-  options = CurrentOptions(options);
-  CreateAndReopenWithCF({"pikachu"}, options);
-
-  // Trigger compaction if size amplification exceeds 110%
-  options.compaction_options_universal.max_size_amplification_percent = 110;
-  options = CurrentOptions(options);
-  ReopenWithColumnFamilies({"default", "pikachu"}, options);
-
-  Random rnd(301);
-  int num_keys = 100000;
-  for (int i = 0; i < num_keys * 2; i++) {
-    ASSERT_OK(Put(1, Key(i % num_keys), Key(i)));
-  }
-
-  dbfull()->TEST_WaitForCompact();
-
-  for (int i = num_keys; i < num_keys * 2; i++) {
-    ASSERT_EQ(Get(1, Key(i % num_keys)), Key(i));
-  }
-}
-
-INSTANTIATE_TEST_CASE_P(DBTestUniversalCompactionMultiLevels,
-                        DBTestUniversalCompactionMultiLevels,
-                        ::testing::Values(3, 20));
-
-class DBTestUniversalCompactionParallel : public DBTestUniversalCompactionBase {
-};
-
-TEST_P(DBTestUniversalCompactionParallel, UniversalCompactionParallel) {
-  Options options;
-  options.compaction_style = kCompactionStyleUniversal;
-  options.num_levels = num_levels_;
-  options.write_buffer_size = 1 << 10;  // 1KB
-  options.level0_file_num_compaction_trigger = 3;
-  options.max_background_compactions = 3;
-  options.max_background_flushes = 3;
-  options.target_file_size_base = 1 * 1024;
-  options.compaction_options_universal.max_size_amplification_percent = 110;
-  options = CurrentOptions(options);
-  DestroyAndReopen(options);
-  CreateAndReopenWithCF({"pikachu"}, options);
-
-  // Delay every compaction so multiple compactions will happen.
-  std::atomic<int> num_compactions_running(0);
-  std::atomic<bool> has_parallel(false);
-  rocksdb::SyncPoint::GetInstance()->SetCallBack("CompactionJob::Run():Start",
-                                                 [&](void* arg) {
-    if (num_compactions_running.fetch_add(1) > 0) {
-      has_parallel.store(true);
-      return;
-    }
-    for (int nwait = 0; nwait < 20000; nwait++) {
-      if (has_parallel.load() || num_compactions_running.load() > 1) {
-        has_parallel.store(true);
-        break;
-      }
-      env_->SleepForMicroseconds(1000);
-    }
-  });
-  rocksdb::SyncPoint::GetInstance()->SetCallBack(
-      "CompactionJob::Run():End",
-      [&](void* arg) { num_compactions_running.fetch_add(-1); });
-  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
-
-  options = CurrentOptions(options);
-  ReopenWithColumnFamilies({"default", "pikachu"}, options);
-
-  Random rnd(301);
-  int num_keys = 30000;
-  for (int i = 0; i < num_keys * 2; i++) {
-    ASSERT_OK(Put(1, Key(i % num_keys), Key(i)));
-  }
-  dbfull()->TEST_WaitForCompact();
-
-  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
-  ASSERT_EQ(num_compactions_running.load(), 0);
-  ASSERT_TRUE(has_parallel.load());
-
-  for (int i = num_keys; i < num_keys * 2; i++) {
-    ASSERT_EQ(Get(1, Key(i % num_keys)), Key(i));
-  }
-
-  // Reopen and check.
-  ReopenWithColumnFamilies({"default", "pikachu"}, options);
-  for (int i = num_keys; i < num_keys * 2; i++) {
-    ASSERT_EQ(Get(1, Key(i % num_keys)), Key(i));
-  }
-}
-
-INSTANTIATE_TEST_CASE_P(DBTestUniversalCompactionParallel,
-                        DBTestUniversalCompactionParallel,
-                        ::testing::Values(1, 10));
-
-TEST_P(DBTestUniversalCompaction, UniversalCompactionOptions) {
-  Options options;
-  options.compaction_style = kCompactionStyleUniversal;
-  options.write_buffer_size = 100 << 10;     // 100KB
-  options.target_file_size_base = 32 << 10;  // 32KB
-  options.level0_file_num_compaction_trigger = 4;
-  options.num_levels = num_levels_;
-  options.compaction_options_universal.compression_size_percent = -1;
-  options = CurrentOptions(options);
-  DestroyAndReopen(options);
-  CreateAndReopenWithCF({"pikachu"}, options);
-
-  Random rnd(301);
-  int key_idx = 0;
-
-  for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) {
-    // Write 110KB (11 values, each 10K)
-    for (int i = 0; i < 11; i++) {
-      ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 10000)));
-      key_idx++;
-    }
-    dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
-
-    if (num < options.level0_file_num_compaction_trigger - 1) {
-      ASSERT_EQ(NumSortedRuns(1), num + 1);
-    }
-  }
-
-  dbfull()->TEST_WaitForCompact();
-  ASSERT_EQ(NumSortedRuns(1), 1);
-}
-
-TEST_P(DBTestUniversalCompaction, UniversalCompactionStopStyleSimilarSize) {
-  Options options = CurrentOptions();
-  options.compaction_style = kCompactionStyleUniversal;
-  options.write_buffer_size = 100 << 10;     // 100KB
-  options.target_file_size_base = 32 << 10;  // 32KB
-  // trigger compaction if there are >= 4 files
-  options.level0_file_num_compaction_trigger = 4;
-  options.compaction_options_universal.size_ratio = 10;
-  options.compaction_options_universal.stop_style =
-      kCompactionStopStyleSimilarSize;
-  options.num_levels = num_levels_;
-  DestroyAndReopen(options);
-
-  Random rnd(301);
-  int key_idx = 0;
-
-  // Stage 1:
-  //   Generate a set of files at level 0, but don't trigger level-0
-  //   compaction.
-  for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
-       num++) {
-    // Write 110KB (11 values, each 10K)
-    for (int i = 0; i < 11; i++) {
-      ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 10000)));
-      key_idx++;
-    }
-    dbfull()->TEST_WaitForFlushMemTable();
-    ASSERT_EQ(NumSortedRuns(), num + 1);
-  }
-
-  // Generate one more file at level-0, which should trigger level-0
-  // compaction.
-  for (int i = 0; i < 11; i++) {
-    ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 10000)));
-    key_idx++;
-  }
-  dbfull()->TEST_WaitForCompact();
-  // Suppose each file flushed from the memtable has size 1. Now we compact
-  // level0_file_num_compaction_trigger=4 files and should get one big
-  // file of size 4.
-  ASSERT_EQ(NumSortedRuns(), 1);
-
-  // Stage 2:
-  //   Now we have one file at level 0, with size 4. We also have some data in
-  //   mem table. Let's continue generating new files at level 0, but don't
-  //   trigger level-0 compaction.
-  //   First, clean up the memtable before inserting new data. This will
-  //   generate a level-0 file of size around 0.4 (based on the amount of
-  //   data previously written).
-  dbfull()->Flush(FlushOptions());
-  for (int num = 0; num < options.level0_file_num_compaction_trigger - 3;
-       num++) {
-    // Write 110KB (11 values, each 10K)
-    for (int i = 0; i < 11; i++) {
-      ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 10000)));
-      key_idx++;
-    }
-    dbfull()->TEST_WaitForFlushMemTable();
-    ASSERT_EQ(NumSortedRuns(), num + 3);
-  }
-
-  // Generate one more file at level-0, which should trigger level-0
-  // compaction.
-  for (int i = 0; i < 11; i++) {
-    ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 10000)));
-    key_idx++;
-  }
-  dbfull()->TEST_WaitForCompact();
-  // Before compaction, we have 4 files at level 0, with size 4, 0.4, 1, 1.
-  // After compaction, we should have 3 files, with size 4, 0.4, 2.
-  ASSERT_EQ(NumSortedRuns(), 3);
-  // Stage 3:
-  //   Now we have 3 files at level 0, with size 4, 0.4, 2. Generate one
-  //   more file at level-0, which should trigger level-0 compaction.
-  for (int i = 0; i < 11; i++) {
-    ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 10000)));
-    key_idx++;
-  }
-  dbfull()->TEST_WaitForCompact();
-  // Level-0 compaction is triggered, but no file will be picked up.
-  ASSERT_EQ(NumSortedRuns(), 4);
-}
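-
-// Sketch of the kCompactionStopStyleSimilarSize rule assumed by the stages
-// above (simplified, not the actual picker): instead of comparing each next
-// run against the accumulated total, compare it against the run just added,
-// and stop as soon as the two are no longer within size_ratio% of each
-// other. That is why stage 2 here leaves three files (4, 0.4, 2) where the
-// default stop style left two.
-static bool SimilarSize(uint64_t prev_run, uint64_t next_run,
-                        unsigned size_ratio) {
-  return next_run >= prev_run * (100 - size_ratio) / 100 &&
-         next_run <= prev_run * (100 + size_ratio) / 100;
-}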
-
-TEST_F(DBTest, CompressedCache) {
-  if (!Snappy_Supported()) {
-    return;
-  }
-  int num_iter = 80;
-
-  // Run this test four iterations.
-  // Iteration 1: only an uncompressed block cache
-  // Iteration 2: only a compressed block cache
-  // Iteration 3: both block cache and compressed cache
-  // Iteration 4: both block cache and compressed cache, but DB is not
-  // compressed
-  for (int iter = 0; iter < 4; iter++) {
-    Options options;
-    options.write_buffer_size = 64*1024;        // small write buffer
-    options.statistics = rocksdb::CreateDBStatistics();
-    options = CurrentOptions(options);
-
-    BlockBasedTableOptions table_options;
-    switch (iter) {
-      case 0:
-        // only uncompressed block cache
-        table_options.block_cache = NewLRUCache(8*1024);
-        table_options.block_cache_compressed = nullptr;
-        options.table_factory.reset(NewBlockBasedTableFactory(table_options));
-        break;
-      case 1:
-        // no block cache, only compressed cache
-        table_options.no_block_cache = true;
-        table_options.block_cache = nullptr;
-        table_options.block_cache_compressed = NewLRUCache(8*1024);
-        options.table_factory.reset(NewBlockBasedTableFactory(table_options));
-        break;
-      case 2:
-        // both compressed and uncompressed block cache
-        table_options.block_cache = NewLRUCache(1024);
-        table_options.block_cache_compressed = NewLRUCache(8*1024);
-        options.table_factory.reset(NewBlockBasedTableFactory(table_options));
-        break;
-      case 3:
-        // both block cache and compressed cache, but DB is not compressed
-        // also, make block cache sizes bigger, to trigger block cache hits
-        table_options.block_cache = NewLRUCache(1024 * 1024);
-        table_options.block_cache_compressed = NewLRUCache(8 * 1024 * 1024);
-        options.table_factory.reset(NewBlockBasedTableFactory(table_options));
-        options.compression = kNoCompression;
-        break;
-      default:
-        ASSERT_TRUE(false);
-    }
-    CreateAndReopenWithCF({"pikachu"}, options);
-    // the default column family doesn't have a block cache
-    Options no_block_cache_opts;
-    no_block_cache_opts.statistics = options.statistics;
-    no_block_cache_opts = CurrentOptions(no_block_cache_opts);
-    BlockBasedTableOptions table_options_no_bc;
-    table_options_no_bc.no_block_cache = true;
-    no_block_cache_opts.table_factory.reset(
-        NewBlockBasedTableFactory(table_options_no_bc));
-    ReopenWithColumnFamilies({"default", "pikachu"},
-        std::vector<Options>({no_block_cache_opts, options}));
-
-    Random rnd(301);
-
-    // Write ~80KB (80 values, each ~1K)
-    ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
-    std::vector<std::string> values;
-    std::string str;
-    for (int i = 0; i < num_iter; i++) {
-      if (i % 4 == 0) {        // high compression ratio
-        str = RandomString(&rnd, 1000);
-      }
-      values.push_back(str);
-      ASSERT_OK(Put(1, Key(i), values[i]));
-    }
-
-    // flush all data from memtable so that reads are from block cache
-    ASSERT_OK(Flush(1));
-
-    for (int i = 0; i < num_iter; i++) {
-      ASSERT_EQ(Get(1, Key(i)), values[i]);
-    }
-
-    // check that we triggered the appropriate code paths in the cache
-    switch (iter) {
-      case 0:
-        // only uncompressed block cache
-        ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0);
-        ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0);
-        break;
-      case 1:
-        // no block cache, only compressed cache
-        ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0);
-        ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0);
-        break;
-      case 2:
-        // both compressed and uncompressed block cache
-        ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0);
-        ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0);
-        break;
-      case 3:
-        // both block cache and compressed cache, but DB is not compressed
-        ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0);
-        ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_HIT), 0);
-        ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0);
-        // the compressed cache doesn't have any hits since blocks are not
-        // compressed on storage
-        ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_HIT), 0);
-        break;
-      default:
-        ASSERT_TRUE(false);
-    }
-
-    options.create_if_missing = true;
-    DestroyAndReopen(options);
-  }
-}
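-
-// The ticker expectations above follow from the two-tier lookup order
-// assumed for block-based tables (simplified):
-//   1. probe the uncompressed block cache; a miss counts BLOCK_CACHE_MISS
-//   2. on a miss, probe the compressed cache (BLOCK_CACHE_COMPRESSED_HIT /
-//      _MISS), decompressing and promoting the block into 1. on a hit
-//   3. only then read the block from storage
-// The last iteration keeps data uncompressed on disk, so the compressed
-// cache can only ever miss, which is exactly what the final case asserts.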
-
-static std::string CompressibleString(Random* rnd, int len) {
-  std::string r;
-  test::CompressibleString(rnd, 0.8, len, &r);
-  return r;
-}
-
-TEST_P(DBTestUniversalCompaction, UniversalCompactionCompressRatio1) {
-  if (!Snappy_Supported()) {
-    return;
-  }
-
-  Options options;
-  options.compaction_style = kCompactionStyleUniversal;
-  options.write_buffer_size = 100 << 10;     // 100KB
-  options.target_file_size_base = 32 << 10;  // 32KB
-  options.level0_file_num_compaction_trigger = 2;
-  options.num_levels = num_levels_;
-  options.compaction_options_universal.compression_size_percent = 70;
-  options = CurrentOptions(options);
-  DestroyAndReopen(options);
-
-  Random rnd(301);
-  int key_idx = 0;
-
-  // The first compaction (2) is compressed.
-  for (int num = 0; num < 2; num++) {
-    // Write 110KB (11 values, each 10K)
-    for (int i = 0; i < 11; i++) {
-      ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000)));
-      key_idx++;
-    }
-    dbfull()->TEST_WaitForFlushMemTable();
-    dbfull()->TEST_WaitForCompact();
-  }
-  ASSERT_LT(TotalSize(), 110000U * 2 * 0.9);
-
-  // The second compaction (4) is compressed
-  for (int num = 0; num < 2; num++) {
-    // Write 110KB (11 values, each 10K)
-    for (int i = 0; i < 11; i++) {
-      ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000)));
-      key_idx++;
-    }
-    dbfull()->TEST_WaitForFlushMemTable();
-    dbfull()->TEST_WaitForCompact();
-  }
-  ASSERT_LT(TotalSize(), 110000 * 4 * 0.9);
-
-  // The third compaction (2 4) is compressed since this time it is
-  // (1 1 3.2) and the ratio 3.2/5.2 doesn't reach the threshold.
-  for (int num = 0; num < 2; num++) {
-    // Write 110KB (11 values, each 10K)
-    for (int i = 0; i < 11; i++) {
-      ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000)));
-      key_idx++;
-    }
-    dbfull()->TEST_WaitForFlushMemTable();
-    dbfull()->TEST_WaitForCompact();
-  }
-  ASSERT_LT(TotalSize(), 110000 * 6 * 0.9);
-
-  // When the compactions reach (2 4 8), the newest output is no
-  // longer compressed.
-  for (int num = 0; num < 8; num++) {
-    // Write 110KB (11 values, each 10K)
-    for (int i = 0; i < 11; i++) {
-      ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000)));
-      key_idx++;
-    }
-    dbfull()->TEST_WaitForFlushMemTable();
-    dbfull()->TEST_WaitForCompact();
-  }
-  ASSERT_GT(TotalSize(), 110000 * 11 * 0.8 + 110000 * 2);
-}
-
-TEST_P(DBTestUniversalCompaction, UniversalCompactionCompressRatio2) {
-  if (!Snappy_Supported()) {
-    return;
-  }
-  Options options;
-  options.compaction_style = kCompactionStyleUniversal;
-  options.write_buffer_size = 100 << 10;     // 100KB
-  options.target_file_size_base = 32 << 10;  // 32KB
-  options.level0_file_num_compaction_trigger = 2;
-  options.num_levels = num_levels_;
-  options.compaction_options_universal.compression_size_percent = 95;
-  options = CurrentOptions(options);
-  DestroyAndReopen(options);
-
-  Random rnd(301);
-  int key_idx = 0;
-
-  // When the compactions reach (2 4 8), the newest output is still
-  // compressed, given the higher compression_size_percent.
-  for (int num = 0; num < 14; num++) {
-    // Write 120KB (12 values, each 10K)
-    for (int i = 0; i < 12; i++) {
-      ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000)));
-      key_idx++;
-    }
-    dbfull()->TEST_WaitForFlushMemTable();
-    dbfull()->TEST_WaitForCompact();
-  }
-  ASSERT_LT(TotalSize(), 120000U * 12 * 0.8 + 120000 * 2);
-}
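-
-// Sketch of the compression_size_percent rule the two ratio tests exercise
-// (assumed behavior, simplified): compress a compaction output only if it
-// falls within the oldest N% of the accumulated data, leaving the newest
-// (100 - N)% uncompressed. With N = 70 the newer files stay uncompressed,
-// hence the looser bound above; with N = 95 nearly everything is
-// compressed, hence the tighter one.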
-
-INSTANTIATE_TEST_CASE_P(UniversalCompactionNumLevels, DBTestUniversalCompaction,
-                        ::testing::Values(1, 3, 5));
-
-TEST_F(DBTest, FailMoreDbPaths) {
-  Options options = CurrentOptions();
-  options.db_paths.emplace_back(dbname_, 10000000);
-  options.db_paths.emplace_back(dbname_ + "_2", 1000000);
-  options.db_paths.emplace_back(dbname_ + "_3", 1000000);
-  options.db_paths.emplace_back(dbname_ + "_4", 1000000);
-  options.db_paths.emplace_back(dbname_ + "_5", 1000000);
-  ASSERT_TRUE(TryReopen(options).IsNotSupported());
-}
-
-TEST_F(DBTest, UniversalCompactionSecondPathRatio) {
-  if (!Snappy_Supported()) {
-    return;
-  }
-  Options options;
-  options.db_paths.emplace_back(dbname_, 500 * 1024);
-  options.db_paths.emplace_back(dbname_ + "_2", 1024 * 1024 * 1024);
-  options.compaction_style = kCompactionStyleUniversal;
-  options.write_buffer_size = 100 << 10;  // 100KB
-  options.level0_file_num_compaction_trigger = 2;
-  options.num_levels = 1;
-  options = CurrentOptions(options);
-
-  std::vector<std::string> filenames;
-  env_->GetChildren(options.db_paths[1].path, &filenames);
-  // Delete archival files.
-  for (size_t i = 0; i < filenames.size(); ++i) {
-    env_->DeleteFile(options.db_paths[1].path + "/" + filenames[i]);
-  }
-  env_->DeleteDir(options.db_paths[1].path);
-  Reopen(options);
-
-  Random rnd(301);
-  int key_idx = 0;
-
-  // The first three 110KB files do not go to the second path.
-  // After that, sizes are (100K, 200K).
-  for (int num = 0; num < 3; num++) {
-    GenerateNewFile(&rnd, &key_idx);
-  }
-
-  // Another 110KB triggers a compaction to a 400K file in the second path.
-  GenerateNewFile(&rnd, &key_idx);
-  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
-
-  // (1, 4)
-  GenerateNewFile(&rnd, &key_idx);
-  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
-  ASSERT_EQ(1, GetSstFileCount(dbname_));
-
-  // (1,1,4) -> (2, 4)
-  GenerateNewFile(&rnd, &key_idx);
-  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
-  ASSERT_EQ(1, GetSstFileCount(dbname_));
-
-  // (1, 2, 4)
-  GenerateNewFile(&rnd, &key_idx);
-  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
-  ASSERT_EQ(2, GetSstFileCount(dbname_));
-
-  // (1, 1, 2, 4) -> (8)
-  GenerateNewFile(&rnd, &key_idx);
-  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
-  ASSERT_EQ(0, GetSstFileCount(dbname_));
-
-  // (1, 8)
-  GenerateNewFile(&rnd, &key_idx);
-  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
-  ASSERT_EQ(1, GetSstFileCount(dbname_));
-
-  // (1, 1, 8) -> (2, 8)
-  GenerateNewFile(&rnd, &key_idx);
-  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
-  ASSERT_EQ(1, GetSstFileCount(dbname_));
-
-  // (1, 2, 8)
-  GenerateNewFile(&rnd, &key_idx);
-  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
-  ASSERT_EQ(2, GetSstFileCount(dbname_));
-
-  // (1, 1, 2, 8) -> (4, 8)
-  GenerateNewFile(&rnd, &key_idx);
-  ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path));
-  ASSERT_EQ(0, GetSstFileCount(dbname_));
-
-  // (1, 4, 8)
-  GenerateNewFile(&rnd, &key_idx);
-  ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path));
-  ASSERT_EQ(1, GetSstFileCount(dbname_));
-
-  for (int i = 0; i < key_idx; i++) {
-    auto v = Get(Key(i));
-    ASSERT_NE(v, "NOT_FOUND");
-    ASSERT_TRUE(v.size() == 1 || v.size() == 10000);
-  }
-
-  Reopen(options);
-
-  for (int i = 0; i < key_idx; i++) {
-    auto v = Get(Key(i));
-    ASSERT_NE(v, "NOT_FOUND");
-    ASSERT_TRUE(v.size() == 1 || v.size() == 10000);
-  }
-
-  Destroy(options);
-}
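-
-// Sketch of the db_paths placement rule the multi-path tests assume
-// (simplified; the real logic estimates the output size before the
-// compaction runs): place an output file in the first path whose
-// target_size can still hold it, falling back to the last path.
-static size_t PickDbPath(const std::vector<DbPath>& paths,
-                         uint64_t file_size) {
-  for (size_t i = 0; i + 1 < paths.size(); ++i) {
-    if (file_size <= paths[i].target_size) {
-      return i;  // smaller outputs land in the earlier, smaller paths
-    }
-  }
-  return paths.size() - 1;  // the last path takes whatever is left
-}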
-
-TEST_F(DBTest, LevelCompactionThirdPath) {
-  Options options = CurrentOptions();
-  options.db_paths.emplace_back(dbname_, 500 * 1024);
-  options.db_paths.emplace_back(dbname_ + "_2", 4 * 1024 * 1024);
-  options.db_paths.emplace_back(dbname_ + "_3", 1024 * 1024 * 1024);
-  options.compaction_style = kCompactionStyleLevel;
-  options.write_buffer_size = 100 << 10;  // 100KB
-  options.level0_file_num_compaction_trigger = 2;
-  options.num_levels = 4;
-  options.max_bytes_for_level_base = 400 * 1024;
-  //  options = CurrentOptions(options);
-
-  std::vector<std::string> filenames;
-  env_->GetChildren(options.db_paths[1].path, &filenames);
-  // Delete archival files.
-  for (size_t i = 0; i < filenames.size(); ++i) {
-    env_->DeleteFile(options.db_paths[1].path + "/" + filenames[i]);
-  }
-  env_->DeleteDir(options.db_paths[1].path);
-  Reopen(options);
-
-  Random rnd(301);
-  int key_idx = 0;
-
-  // The first three 110KB files do not go to the second path.
-  // After that, sizes are (100K, 200K).
-  for (int num = 0; num < 3; num++) {
-    GenerateNewFile(&rnd, &key_idx);
-  }
-
-  // Another 110KB triggers a compaction to a 400K file, filling up the first path.
-  GenerateNewFile(&rnd, &key_idx);
-  ASSERT_EQ(3, GetSstFileCount(options.db_paths[1].path));
-
-  // (1, 4)
-  GenerateNewFile(&rnd, &key_idx);
-  ASSERT_EQ("1,4", FilesPerLevel(0));
-  ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
-  ASSERT_EQ(1, GetSstFileCount(dbname_));
-
-  // (1, 4, 1)
-  GenerateNewFile(&rnd, &key_idx);
-  ASSERT_EQ("1,4,1", FilesPerLevel(0));
-  ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path));
-  ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
-  ASSERT_EQ(1, GetSstFileCount(dbname_));
-
-  // (1, 4, 2)
-  GenerateNewFile(&rnd, &key_idx);
-  ASSERT_EQ("1,4,2", FilesPerLevel(0));
-  ASSERT_EQ(2, GetSstFileCount(options.db_paths[2].path));
-  ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
-  ASSERT_EQ(1, GetSstFileCount(dbname_));
-
-  // (1, 4, 3)
-  GenerateNewFile(&rnd, &key_idx);
-  ASSERT_EQ("1,4,3", FilesPerLevel(0));
-  ASSERT_EQ(3, GetSstFileCount(options.db_paths[2].path));
-  ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
-  ASSERT_EQ(1, GetSstFileCount(dbname_));
-
-  // (1, 4, 4)
-  GenerateNewFile(&rnd, &key_idx);
-  ASSERT_EQ("1,4,4", FilesPerLevel(0));
-  ASSERT_EQ(4, GetSstFileCount(options.db_paths[2].path));
-  ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
-  ASSERT_EQ(1, GetSstFileCount(dbname_));
-
-  // (1, 4, 5)
-  GenerateNewFile(&rnd, &key_idx);
-  ASSERT_EQ("1,4,5", FilesPerLevel(0));
-  ASSERT_EQ(5, GetSstFileCount(options.db_paths[2].path));
-  ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
-  ASSERT_EQ(1, GetSstFileCount(dbname_));
-
-  // (1, 4, 6)
-  GenerateNewFile(&rnd, &key_idx);
-  ASSERT_EQ("1,4,6", FilesPerLevel(0));
-  ASSERT_EQ(6, GetSstFileCount(options.db_paths[2].path));
-  ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
-  ASSERT_EQ(1, GetSstFileCount(dbname_));
-
-  // (1, 4, 7)
-  GenerateNewFile(&rnd, &key_idx);
-  ASSERT_EQ("1,4,7", FilesPerLevel(0));
-  ASSERT_EQ(7, GetSstFileCount(options.db_paths[2].path));
-  ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
-  ASSERT_EQ(1, GetSstFileCount(dbname_));
-
-  // (1, 4, 8)
-  GenerateNewFile(&rnd, &key_idx);
-  ASSERT_EQ("1,4,8", FilesPerLevel(0));
-  ASSERT_EQ(8, GetSstFileCount(options.db_paths[2].path));
-  ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
-  ASSERT_EQ(1, GetSstFileCount(dbname_));
-
-  for (int i = 0; i < key_idx; i++) {
-    auto v = Get(Key(i));
-    ASSERT_NE(v, "NOT_FOUND");
-    ASSERT_TRUE(v.size() == 1 || v.size() == 10000);
-  }
-
-  Reopen(options);
-
-  for (int i = 0; i < key_idx; i++) {
-    auto v = Get(Key(i));
-    ASSERT_NE(v, "NOT_FOUND");
-    ASSERT_TRUE(v.size() == 1 || v.size() == 10000);
-  }
-
-  Destroy(options);
-}
-
-TEST_F(DBTest, LevelCompactionPathUse) {
-  Options options = CurrentOptions();
-  options.db_paths.emplace_back(dbname_, 500 * 1024);
-  options.db_paths.emplace_back(dbname_ + "_2", 4 * 1024 * 1024);
-  options.db_paths.emplace_back(dbname_ + "_3", 1024 * 1024 * 1024);
-  options.compaction_style = kCompactionStyleLevel;
-  options.write_buffer_size = 100 << 10;  // 100KB
-  options.level0_file_num_compaction_trigger = 2;
-  options.num_levels = 4;
-  options.max_bytes_for_level_base = 400 * 1024;
-  //  options = CurrentOptions(options);
-
-  std::vector<std::string> filenames;
-  env_->GetChildren(options.db_paths[1].path, &filenames);
-  // Delete archival files.
-  for (size_t i = 0; i < filenames.size(); ++i) {
-    env_->DeleteFile(options.db_paths[1].path + "/" + filenames[i]);
-  }
-  env_->DeleteDir(options.db_paths[1].path);
-  Reopen(options);
-
-  Random rnd(301);
-  int key_idx = 0;
-
-  // Always gets compacted into one level-1 file and zero or one
-  // level-0 files.
-  for (int num = 0; num < 3; num++) {
-    key_idx = 0;
-    GenerateNewFile(&rnd, &key_idx);
-  }
-
-  key_idx = 0;
-  GenerateNewFile(&rnd, &key_idx);
-  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
-
-  key_idx = 0;
-  GenerateNewFile(&rnd, &key_idx);
-  ASSERT_EQ("1,1", FilesPerLevel(0));
-  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
-  ASSERT_EQ(1, GetSstFileCount(dbname_));
-
-  key_idx = 0;
-  GenerateNewFile(&rnd, &key_idx);
-  ASSERT_EQ("0,1", FilesPerLevel(0));
-  ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
-  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
-  ASSERT_EQ(0, GetSstFileCount(dbname_));
-
-  key_idx = 0;
-  GenerateNewFile(&rnd, &key_idx);
-  ASSERT_EQ("1,1", FilesPerLevel(0));
-  ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
-  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
-  ASSERT_EQ(1, GetSstFileCount(dbname_));
-
-  key_idx = 0;
-  GenerateNewFile(&rnd, &key_idx);
-  ASSERT_EQ("0,1", FilesPerLevel(0));
-  ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
-  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
-  ASSERT_EQ(0, GetSstFileCount(dbname_));
-
-  key_idx = 0;
-  GenerateNewFile(&rnd, &key_idx);
-  ASSERT_EQ("1,1", FilesPerLevel(0));
-  ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
-  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
-  ASSERT_EQ(1, GetSstFileCount(dbname_));
-
-  key_idx = 0;
-  GenerateNewFile(&rnd, &key_idx);
-  ASSERT_EQ("0,1", FilesPerLevel(0));
-  ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
-  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
-  ASSERT_EQ(0, GetSstFileCount(dbname_));
-
-  key_idx = 0;
-  GenerateNewFile(&rnd, &key_idx);
-  ASSERT_EQ("1,1", FilesPerLevel(0));
-  ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
-  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
-  ASSERT_EQ(1, GetSstFileCount(dbname_));
-
-  key_idx = 0;
-  GenerateNewFile(&rnd, &key_idx);
-  ASSERT_EQ("0,1", FilesPerLevel(0));
-  ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
-  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
-  ASSERT_EQ(0, GetSstFileCount(dbname_));
-
-  key_idx = 0;
-  GenerateNewFile(&rnd, &key_idx);
-  ASSERT_EQ("1,1", FilesPerLevel(0));
-  ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
-  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
-  ASSERT_EQ(1, GetSstFileCount(dbname_));
-
-  for (int i = 0; i < key_idx; i++) {
-    auto v = Get(Key(i));
-    ASSERT_NE(v, "NOT_FOUND");
-    ASSERT_TRUE(v.size() == 1 || v.size() == 10000);
-  }
-
-  Reopen(options);
-
-  for (int i = 0; i < key_idx; i++) {
-    auto v = Get(Key(i));
-    ASSERT_NE(v, "NOT_FOUND");
-    ASSERT_TRUE(v.size() == 1 || v.size() == 10000);
-  }
-
-  Destroy(options);
-}
-
-TEST_F(DBTest, UniversalCompactionFourPaths) {
-  Options options;
-  options.db_paths.emplace_back(dbname_, 300 * 1024);
-  options.db_paths.emplace_back(dbname_ + "_2", 300 * 1024);
-  options.db_paths.emplace_back(dbname_ + "_3", 500 * 1024);
-  options.db_paths.emplace_back(dbname_ + "_4", 1024 * 1024 * 1024);
-  options.compaction_style = kCompactionStyleUniversal;
-  options.write_buffer_size = 100 << 10;  // 100KB
-  options.level0_file_num_compaction_trigger = 2;
-  options.num_levels = 1;
-  options = CurrentOptions(options);
-
-  std::vector<std::string> filenames;
-  env_->GetChildren(options.db_paths[1].path, &filenames);
-  // Delete archival files.
-  for (size_t i = 0; i < filenames.size(); ++i) {
-    env_->DeleteFile(options.db_paths[1].path + "/" + filenames[i]);
-  }
-  env_->DeleteDir(options.db_paths[1].path);
-  Reopen(options);
-
-  Random rnd(301);
-  int key_idx = 0;
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+  ASSERT_EQ("NOT_FOUND", Get("bar"));
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+  ASSERT_EQ("foo", Get("foobar"));
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
 
-  // The first three 110KB files do not go to the second path.
-  // After that, sizes are (100K, 200K).
-  for (int num = 0; num < 3; num++) {
-    GenerateNewFile(&rnd, &key_idx);
-  }
+  // Try to create a DB with mixed files:
+  ASSERT_OK(dbfull()->Put(wo, "foobar", "foo"));
+  // We need to insert some keys to make sure files are not filtered out
+  // by key ranges.
+  ASSERT_OK(dbfull()->Put(wo, "aaa", ""));
+  ASSERT_OK(dbfull()->Put(wo, "zzz", ""));
+  db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
 
-  // Another 110KB triggers a compaction to a 400K file in the second path.
-  GenerateNewFile(&rnd, &key_idx);
-  ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path));
+  options.prefix_extractor.reset();
+  bbto.whole_key_filtering = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  Reopen(options);
 
-  // (1, 4)
-  GenerateNewFile(&rnd, &key_idx);
-  ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path));
-  ASSERT_EQ(1, GetSstFileCount(dbname_));
+  // Try to create a DB with mixed files.
+  ASSERT_OK(dbfull()->Put(wo, "barfoo", "bar"));
+  // In this case we need to insert some keys to make sure files are
+  // not filtered out by key ranges.
+  ASSERT_OK(dbfull()->Put(wo, "aaa", ""));
+  ASSERT_OK(dbfull()->Put(wo, "zzz", ""));
+  Flush();
 
-  // (1,1,4) -> (2, 4)
-  GenerateNewFile(&rnd, &key_idx);
-  ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path));
-  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
-  ASSERT_EQ(0, GetSstFileCount(dbname_));
+  // Now we have two files:
+  // File 1: An older file with prefix bloom.
+  // File 2: A newer file with whole bloom filter.
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+  ASSERT_EQ("NOT_FOUND", Get("foo"));
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 2);
+  ASSERT_EQ("NOT_FOUND", Get("bar"));
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 3);
+  ASSERT_EQ("foo", Get("foobar"));
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 4);
+  ASSERT_EQ("bar", Get("barfoo"));
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 4);
 
-  // (1, 2, 4)
-  GenerateNewFile(&rnd, &key_idx);
-  ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path));
-  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
-  ASSERT_EQ(1, GetSstFileCount(dbname_));
-
-  // (1, 1, 2, 4) -> (8)
-  GenerateNewFile(&rnd, &key_idx);
-  ASSERT_EQ(1, GetSstFileCount(options.db_paths[3].path));
-
-  // (1, 8)
-  GenerateNewFile(&rnd, &key_idx);
-  ASSERT_EQ(1, GetSstFileCount(options.db_paths[3].path));
-  ASSERT_EQ(1, GetSstFileCount(dbname_));
-
-  // (1, 1, 8) -> (2, 8)
-  GenerateNewFile(&rnd, &key_idx);
-  ASSERT_EQ(1, GetSstFileCount(options.db_paths[3].path));
-  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
-
-  // (1, 2, 8)
-  GenerateNewFile(&rnd, &key_idx);
-  ASSERT_EQ(1, GetSstFileCount(options.db_paths[3].path));
-  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
-  ASSERT_EQ(1, GetSstFileCount(dbname_));
-
-  // (1, 1, 2, 8) -> (4, 8)
-  GenerateNewFile(&rnd, &key_idx);
-  ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path));
-  ASSERT_EQ(1, GetSstFileCount(options.db_paths[3].path));
-
-  // (1, 4, 8)
-  GenerateNewFile(&rnd, &key_idx);
-  ASSERT_EQ(1, GetSstFileCount(options.db_paths[3].path));
-  ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path));
-  ASSERT_EQ(1, GetSstFileCount(dbname_));
-
-  for (int i = 0; i < key_idx; i++) {
-    auto v = Get(Key(i));
-    ASSERT_NE(v, "NOT_FOUND");
-    ASSERT_TRUE(v.size() == 1 || v.size() == 10000);
-  }
+  // Reopen with the same settings: only the whole-key filter is used.
+  Reopen(options);
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 4);
+  ASSERT_EQ("NOT_FOUND", Get("foo"));
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 5);
+  ASSERT_EQ("NOT_FOUND", Get("bar"));
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 6);
+  ASSERT_EQ("foo", Get("foobar"));
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 7);
+  ASSERT_EQ("bar", Get("barfoo"));
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 7);
 
+  // Restart with both filters allowed.
+  options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+  bbto.whole_key_filtering = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
   Reopen(options);
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 7);
+  // File 1 will have it filtered out.
+  // File 2 will not, as the prefix `foo` exists in the file.
+  ASSERT_EQ("NOT_FOUND", Get("foo"));
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 8);
+  ASSERT_EQ("NOT_FOUND", Get("bar"));
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 10);
+  ASSERT_EQ("foo", Get("foobar"));
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11);
+  ASSERT_EQ("bar", Get("barfoo"));
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11);
 
-  for (int i = 0; i < key_idx; i++) {
-    auto v = Get(Key(i));
-    ASSERT_NE(v, "NOT_FOUND");
-    ASSERT_TRUE(v.size() == 1 || v.size() == 10000);
-  }
+  // Restart with only the prefix bloom filter allowed.
+  options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+  bbto.whole_key_filtering = false;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  Reopen(options);
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11);
+  ASSERT_EQ("NOT_FOUND", Get("foo"));
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11);
+  ASSERT_EQ("NOT_FOUND", Get("bar"));
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 12);
+  ASSERT_EQ("foo", Get("foobar"));
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 12);
+  ASSERT_EQ("bar", Get("barfoo"));
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 12);
+}
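+
+// The ticker arithmetic above follows one assumed rule: on a Get, every SST
+// file whose bloom filter (whole-key or prefix, whichever that file was
+// built with) rules the key out without a data-block read adds exactly one
+// to BLOOM_FILTER_USEFUL; a found key, or a file lacking a usable filter
+// for the query, adds nothing. E.g. Get("bar") with both filter kinds
+// enabled bumps the counter by two, once per filtering file.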
 
-  Destroy(options);
+TEST_F(DBTest, IterSeekBeforePrev) {
+  ASSERT_OK(Put("a", "b"));
+  ASSERT_OK(Put("c", "d"));
+  dbfull()->Flush(FlushOptions());
+  ASSERT_OK(Put("0", "f"));
+  ASSERT_OK(Put("1", "h"));
+  dbfull()->Flush(FlushOptions());
+  ASSERT_OK(Put("2", "j"));
+  auto iter = db_->NewIterator(ReadOptions());
+  iter->Seek(Slice("c"));
+  iter->Prev();
+  iter->Seek(Slice("a"));
+  iter->Prev();
+  delete iter;
 }
 
-void CheckColumnFamilyMeta(const ColumnFamilyMetaData& cf_meta) {
-  uint64_t cf_size = 0;
-  uint64_t cf_csize = 0;
-  size_t file_count = 0;
-  for (auto level_meta : cf_meta.levels) {
-    uint64_t level_size = 0;
-    uint64_t level_csize = 0;
-    file_count += level_meta.files.size();
-    for (auto file_meta : level_meta.files) {
-      level_size += file_meta.size;
-    }
-    ASSERT_EQ(level_meta.size, level_size);
-    cf_size += level_size;
-    cf_csize += level_csize;
-  }
-  ASSERT_EQ(cf_meta.file_count, file_count);
-  ASSERT_EQ(cf_meta.size, cf_size);
+namespace {
+std::string MakeLongKey(size_t length, char c) {
+  return std::string(length, c);
 }
+}  // namespace
 
-TEST_F(DBTest, ColumnFamilyMetaDataTest) {
-  Options options = CurrentOptions();
-  options.create_if_missing = true;
-  DestroyAndReopen(options);
+TEST_F(DBTest, IterLongKeys) {
+  ASSERT_OK(Put(MakeLongKey(20, 0), "0"));
+  ASSERT_OK(Put(MakeLongKey(32, 2), "2"));
+  ASSERT_OK(Put("a", "b"));
+  dbfull()->Flush(FlushOptions());
+  ASSERT_OK(Put(MakeLongKey(50, 1), "1"));
+  ASSERT_OK(Put(MakeLongKey(127, 3), "3"));
+  ASSERT_OK(Put(MakeLongKey(64, 4), "4"));
+  auto iter = db_->NewIterator(ReadOptions());
 
-  Random rnd(301);
-  int key_index = 0;
-  ColumnFamilyMetaData cf_meta;
-  for (int i = 0; i < 100; ++i) {
-    GenerateNewFile(&rnd, &key_index);
-    db_->GetColumnFamilyMetaData(&cf_meta);
-    CheckColumnFamilyMeta(cf_meta);
-  }
-}
+  // Seek and scan across keys of varying lengths.
+  iter->Seek(MakeLongKey(20, 0));
+  ASSERT_EQ(IterStatus(iter), MakeLongKey(20, 0) + "->0");
+  iter->Next();
+  ASSERT_EQ(IterStatus(iter), MakeLongKey(50, 1) + "->1");
+  iter->Next();
+  ASSERT_EQ(IterStatus(iter), MakeLongKey(32, 2) + "->2");
+  iter->Next();
+  ASSERT_EQ(IterStatus(iter), MakeLongKey(127, 3) + "->3");
+  iter->Next();
+  ASSERT_EQ(IterStatus(iter), MakeLongKey(64, 4) + "->4");
+  delete iter;
 
-TEST_F(DBTest, ConvertCompactionStyle) {
-  Random rnd(301);
-  int max_key_level_insert = 200;
-  int max_key_universal_insert = 600;
+  iter = db_->NewIterator(ReadOptions());
+  iter->Seek(MakeLongKey(50, 1));
+  ASSERT_EQ(IterStatus(iter), MakeLongKey(50, 1) + "->1");
+  iter->Next();
+  ASSERT_EQ(IterStatus(iter), MakeLongKey(32, 2) + "->2");
+  iter->Next();
+  ASSERT_EQ(IterStatus(iter), MakeLongKey(127, 3) + "->3");
+  delete iter;
+}
 
-  // Stage 1: generate a db with level compaction
-  Options options;
-  options.write_buffer_size = 100<<10; //100KB
-  options.num_levels = 4;
-  options.level0_file_num_compaction_trigger = 3;
-  options.max_bytes_for_level_base = 500<<10; // 500KB
-  options.max_bytes_for_level_multiplier = 1;
-  options.target_file_size_base = 200<<10; // 200KB
-  options.target_file_size_multiplier = 1;
-  options = CurrentOptions(options);
-  CreateAndReopenWithCF({"pikachu"}, options);
+TEST_F(DBTest, IterNextWithNewerSeq) {
+  ASSERT_OK(Put("0", "0"));
+  dbfull()->Flush(FlushOptions());
+  ASSERT_OK(Put("a", "b"));
+  ASSERT_OK(Put("c", "d"));
+  ASSERT_OK(Put("d", "e"));
+  auto iter = db_->NewIterator(ReadOptions());
 
-  for (int i = 0; i <= max_key_level_insert; i++) {
-    // each value is 10K
-    ASSERT_OK(Put(1, Key(i), RandomString(&rnd, 10000)));
+  // Write enough newer versions of "b" that the iterator must skip them.
+  for (uint64_t i = 0; i < last_options_.max_sequential_skip_in_iterations + 1;
+       i++) {
+    ASSERT_OK(Put("b", "f"));
   }
-  ASSERT_OK(Flush(1));
-  dbfull()->TEST_WaitForCompact();
 
-  ASSERT_GT(TotalTableFiles(1, 4), 1);
-  int non_level0_num_files = 0;
-  for (int i = 1; i < options.num_levels; i++) {
-    non_level0_num_files += NumTableFilesAtLevel(i, 1);
+  iter->Seek(Slice("a"));
+  ASSERT_EQ(IterStatus(iter), "a->b");
+  iter->Next();
+  ASSERT_EQ(IterStatus(iter), "c->d");
+  delete iter;
+}
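+
+// Sketch of the skip/reseek heuristic the Seq tests exercise (assumed
+// behavior, simplified): while advancing, the iterator linearly skips
+// entries whose sequence numbers are newer than its snapshot; after
+// max_sequential_skip_in_iterations consecutive skips on one user key it
+// falls back to a reseek. Writing trigger + 1 versions of "b" above forces
+// that reseek path, which Next() and Prev() must still handle correctly.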
+
+TEST_F(DBTest, IterPrevWithNewerSeq) {
+  ASSERT_OK(Put("0", "0"));
+  dbfull()->Flush(FlushOptions());
+  ASSERT_OK(Put("a", "b"));
+  ASSERT_OK(Put("c", "d"));
+  ASSERT_OK(Put("d", "e"));
+  auto iter = db_->NewIterator(ReadOptions());
+
+  // Write enough newer versions of "b" that the iterator must skip them.
+  for (uint64_t i = 0; i < last_options_.max_sequential_skip_in_iterations + 1;
+       i++) {
+    ASSERT_OK(Put("b", "f"));
   }
-  ASSERT_GT(non_level0_num_files, 0);
 
-  // Stage 2: reopen with universal compaction - should fail
-  options = CurrentOptions();
-  options.compaction_style = kCompactionStyleUniversal;
-  options.num_levels = 1;
-  options = CurrentOptions(options);
-  Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, options);
-  ASSERT_TRUE(s.IsInvalidArgument());
+  iter->Seek(Slice("d"));
+  ASSERT_EQ(IterStatus(iter), "d->e");
+  iter->Prev();
+  ASSERT_EQ(IterStatus(iter), "c->d");
+  iter->Prev();
+  ASSERT_EQ(IterStatus(iter), "a->b");
 
-  // Stage 3: compact into a single file and move the file to level 0
-  options = CurrentOptions();
-  options.disable_auto_compactions = true;
-  options.target_file_size_base = INT_MAX;
-  options.target_file_size_multiplier = 1;
-  options.max_bytes_for_level_base = INT_MAX;
-  options.max_bytes_for_level_multiplier = 1;
-  options.num_levels = 4;
-  options = CurrentOptions(options);
-  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+  iter->Prev();
+  delete iter;
+}
 
-  dbfull()->CompactRange(handles_[1], nullptr, nullptr, true /* reduce level */,
-                         0 /* reduce to level 0 */);
+TEST_F(DBTest, IterPrevWithNewerSeq2) {
+  ASSERT_OK(Put("0", "0"));
+  dbfull()->Flush(FlushOptions());
+  ASSERT_OK(Put("a", "b"));
+  ASSERT_OK(Put("c", "d"));
+  ASSERT_OK(Put("d", "e"));
+  auto iter = db_->NewIterator(ReadOptions());
+  iter->Seek(Slice("c"));
+  ASSERT_EQ(IterStatus(iter), "c->d");
 
-  for (int i = 0; i < options.num_levels; i++) {
-    int num = NumTableFilesAtLevel(i, 1);
-    if (i == 0) {
-      ASSERT_EQ(num, 1);
-    } else {
-      ASSERT_EQ(num, 0);
-    }
+  // Write enough newer versions of "b" that the iterator must skip them.
+  for (uint64_t i = 0; i < last_options_.max_sequential_skip_in_iterations + 1;
+      i++) {
+    ASSERT_OK(Put("b", "f"));
   }
 
-  // Stage 4: re-open in universal compaction style and do some db operations
-  options = CurrentOptions();
-  options.compaction_style = kCompactionStyleUniversal;
-  options.num_levels = 4;
-  options.write_buffer_size = 100<<10; //100KB
-  options.level0_file_num_compaction_trigger = 3;
-  options = CurrentOptions(options);
-  ReopenWithColumnFamilies({"default", "pikachu"}, options);
-
-  options.num_levels = 1;
-  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+  iter->Prev();
+  ASSERT_EQ(IterStatus(iter), "a->b");
 
-  for (int i = max_key_level_insert / 2; i <= max_key_universal_insert; i++) {
-    ASSERT_OK(Put(1, Key(i), RandomString(&rnd, 10000)));
-  }
-  dbfull()->Flush(FlushOptions());
-  ASSERT_OK(Flush(1));
-  dbfull()->TEST_WaitForCompact();
+  iter->Prev();
+  delete iter;
+}
 
-  for (int i = 1; i < options.num_levels; i++) {
-    ASSERT_EQ(NumTableFilesAtLevel(i, 1), 0);
-  }
+TEST_F(DBTest, IterEmpty) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]);
 
-  // Verify the keys inserted under both level-style and universal-style
-  // compaction.
-  std::string keys_in_db;
-  Iterator* iter = dbfull()->NewIterator(ReadOptions(), handles_[1]);
-  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
-    keys_in_db.append(iter->key().ToString());
-    keys_in_db.push_back(',');
-  }
-  delete iter;
+    iter->SeekToFirst();
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
 
-  std::string expected_keys;
-  for (int i = 0; i <= max_key_universal_insert; i++) {
-    expected_keys.append(Key(i));
-    expected_keys.push_back(',');
-  }
+    iter->SeekToLast();
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
 
-  ASSERT_EQ(keys_in_db, expected_keys);
-}
+    iter->Seek("foo");
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
 
-TEST_F(DBTest, IncreaseUniversalCompactionNumLevels) {
-  std::function<void(int)> verify_func = [&](int num_keys_in_db) {
-    std::string keys_in_db;
-    Iterator* iter = dbfull()->NewIterator(ReadOptions(), handles_[1]);
-    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
-      keys_in_db.append(iter->key().ToString());
-      keys_in_db.push_back(',');
-    }
     delete iter;
+  } while (ChangeCompactOptions());
+}
 
-    std::string expected_keys;
-    for (int i = 0; i <= num_keys_in_db; i++) {
-      expected_keys.append(Key(i));
-      expected_keys.push_back(',');
-    }
+TEST_F(DBTest, IterSingle) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    ASSERT_OK(Put(1, "a", "va"));
+    Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]);
 
-    ASSERT_EQ(keys_in_db, expected_keys);
-  };
+    iter->SeekToFirst();
+    ASSERT_EQ(IterStatus(iter), "a->va");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
+    iter->SeekToFirst();
+    ASSERT_EQ(IterStatus(iter), "a->va");
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
 
-  Random rnd(301);
-  int max_key1 = 200;
-  int max_key2 = 600;
-  int max_key3 = 800;
+    iter->SeekToLast();
+    ASSERT_EQ(IterStatus(iter), "a->va");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
+    iter->SeekToLast();
+    ASSERT_EQ(IterStatus(iter), "a->va");
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
 
-  // Stage 1: open a DB with universal compaction, num_levels=1
-  Options options = CurrentOptions();
-  options.compaction_style = kCompactionStyleUniversal;
-  options.num_levels = 1;
-  options.write_buffer_size = 100 << 10;  // 100KB
-  options.level0_file_num_compaction_trigger = 3;
-  options = CurrentOptions(options);
-  CreateAndReopenWithCF({"pikachu"}, options);
+    iter->Seek("");
+    ASSERT_EQ(IterStatus(iter), "a->va");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
 
-  for (int i = 0; i <= max_key1; i++) {
-    // each value is 10K
-    ASSERT_OK(Put(1, Key(i), RandomString(&rnd, 10000)));
-  }
-  ASSERT_OK(Flush(1));
-  dbfull()->TEST_WaitForCompact();
+    iter->Seek("a");
+    ASSERT_EQ(IterStatus(iter), "a->va");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
 
-  int non_level0_num_files = 0;
-  for (int i = 1; i < options.num_levels; i++) {
-    non_level0_num_files += NumTableFilesAtLevel(i, 1);
-  }
-  ASSERT_EQ(non_level0_num_files, 0);
+    iter->Seek("b");
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
 
-  // Stage 2: reopen with universal compaction, num_levels=4
-  options.compaction_style = kCompactionStyleUniversal;
-  options.num_levels = 4;
-  options = CurrentOptions(options);
-  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+    delete iter;
+  } while (ChangeCompactOptions());
+}
 
-  verify_func(max_key1);
+TEST_F(DBTest, IterMulti) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    ASSERT_OK(Put(1, "a", "va"));
+    ASSERT_OK(Put(1, "b", "vb"));
+    ASSERT_OK(Put(1, "c", "vc"));
+    Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]);
 
-  // Insert more keys
-  for (int i = max_key1 + 1; i <= max_key2; i++) {
-    // each value is 10K
-    ASSERT_OK(Put(1, Key(i), RandomString(&rnd, 10000)));
-  }
-  ASSERT_OK(Flush(1));
-  dbfull()->TEST_WaitForCompact();
+    iter->SeekToFirst();
+    ASSERT_EQ(IterStatus(iter), "a->va");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "b->vb");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "c->vc");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
+    iter->SeekToFirst();
+    ASSERT_EQ(IterStatus(iter), "a->va");
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
 
-  verify_func(max_key2);
-  // Compaction to non-L0 has happened.
-  ASSERT_GT(NumTableFilesAtLevel(options.num_levels - 1, 1), 0);
+    iter->SeekToLast();
+    ASSERT_EQ(IterStatus(iter), "c->vc");
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "b->vb");
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "a->va");
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
+    iter->SeekToLast();
+    ASSERT_EQ(IterStatus(iter), "c->vc");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
 
-  // Stage 3: Revert it back to one level and revert to num_levels=1.
-  options.num_levels = 4;
-  options.target_file_size_base = INT_MAX;
-  ReopenWithColumnFamilies({"default", "pikachu"}, options);
-  // Compact all to level 0
-  dbfull()->CompactRange(handles_[1], nullptr, nullptr, true /* reduce level */,
-                         0 /* reduce to level 0 */);
-  // Need to restart it once to remove higher level records in manifest.
-  ReopenWithColumnFamilies({"default", "pikachu"}, options);
-  // Final reopen
-  options.compaction_style = kCompactionStyleUniversal;
-  options.num_levels = 1;
-  options = CurrentOptions(options);
-  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+    iter->Seek("");
+    ASSERT_EQ(IterStatus(iter), "a->va");
+    iter->Seek("a");
+    ASSERT_EQ(IterStatus(iter), "a->va");
+    iter->Seek("ax");
+    ASSERT_EQ(IterStatus(iter), "b->vb");
 
-  // Insert more keys
-  for (int i = max_key2 + 1; i <= max_key3; i++) {
-    // each value is 10K
-    ASSERT_OK(Put(1, Key(i), RandomString(&rnd, 10000)));
-  }
-  ASSERT_OK(Flush(1));
-  dbfull()->TEST_WaitForCompact();
-  verify_func(max_key3);
-}
+    iter->Seek("b");
+    ASSERT_EQ(IterStatus(iter), "b->vb");
+    iter->Seek("z");
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
 
-namespace {
-void MinLevelHelper(DBTest* self, Options& options) {
-  Random rnd(301);
+    // Switch from reverse to forward
+    iter->SeekToLast();
+    iter->Prev();
+    iter->Prev();
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "b->vb");
 
-  for (int num = 0;
-    num < options.level0_file_num_compaction_trigger - 1;
-    num++)
-  {
-    std::vector<std::string> values;
-    // Write 120KB (12 values, each 10K)
-    for (int i = 0; i < 12; i++) {
-      values.push_back(RandomString(&rnd, 10000));
-      ASSERT_OK(self->Put(Key(i), values[i]));
-    }
-    self->dbfull()->TEST_WaitForFlushMemTable();
-    ASSERT_EQ(self->NumTableFilesAtLevel(0), num + 1);
-  }
+    // Switch from forward to reverse
+    iter->SeekToFirst();
+    iter->Next();
+    iter->Next();
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "b->vb");
 
-  //generate one more file in level-0, and should trigger level-0 compaction
-  std::vector<std::string> values;
-  for (int i = 0; i < 12; i++) {
-    values.push_back(RandomString(&rnd, 10000));
-    ASSERT_OK(self->Put(Key(i), values[i]));
-  }
-  self->dbfull()->TEST_WaitForCompact();
+    // Make sure iter stays at snapshot
+    ASSERT_OK(Put(1, "a", "va2"));
+    ASSERT_OK(Put(1, "a2", "va3"));
+    ASSERT_OK(Put(1, "b", "vb2"));
+    ASSERT_OK(Put(1, "c", "vc2"));
+    ASSERT_OK(Delete(1, "b"));
+    iter->SeekToFirst();
+    ASSERT_EQ(IterStatus(iter), "a->va");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "b->vb");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "c->vc");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
+    iter->SeekToLast();
+    ASSERT_EQ(IterStatus(iter), "c->vc");
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "b->vb");
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "a->va");
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
 
-  ASSERT_EQ(self->NumTableFilesAtLevel(0), 0);
-  ASSERT_EQ(self->NumTableFilesAtLevel(1), 1);
+    delete iter;
+  } while (ChangeCompactOptions());
 }
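
The "stays at snapshot" assertions above rely on a RocksDB iterator pinning an
implicit snapshot at creation time, so later writes are invisible to it. A
minimal standalone sketch of that behavior using only the public API (the
database path is an arbitrary placeholder):

    #include <cassert>
    #include <string>
    #include "rocksdb/db.h"

    int main() {
      rocksdb::Options options;
      options.create_if_missing = true;
      rocksdb::DB* db;
      rocksdb::Status s =
          rocksdb::DB::Open(options, "/tmp/iter_snapshot_demo", &db);
      assert(s.ok());

      db->Put(rocksdb::WriteOptions(), "a", "va");
      rocksdb::Iterator* iter = db->NewIterator(rocksdb::ReadOptions());

      // Writes made after the iterator was created are invisible to it.
      db->Put(rocksdb::WriteOptions(), "a", "va2");
      iter->SeekToFirst();
      assert(iter->Valid() && iter->value().ToString() == "va");

      delete iter;  // iterators must be released before the DB
      delete db;
      return 0;
    }
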
 
-// returns false if the calling-Test should be skipped
-bool MinLevelToCompress(CompressionType& type, Options& options, int wbits,
-                        int lev, int strategy) {
-  fprintf(stderr, "Test with compression options : window_bits = %d, level =  %d, strategy = %d}\n", wbits, lev, strategy);
-  options.write_buffer_size = 100<<10; //100KB
-  options.num_levels = 3;
-  options.max_mem_compaction_level = 0;
-  options.level0_file_num_compaction_trigger = 3;
+// Check that we can skip over a run of user keys
+// by using reseek rather than a sequential scan.
+TEST_F(DBTest, IterReseek) {
+  anon::OptionsOverride options_override;
+  options_override.skip_policy = kSkipNoSnapshot;
+  Options options = CurrentOptions(options_override);
+  options.max_sequential_skip_in_iterations = 3;
   options.create_if_missing = true;
+  options.statistics = rocksdb::CreateDBStatistics();
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  // Insert two entries with the same user key and verify that
+  // reseek is not invoked. For each of these test cases,
+  // verify that we can find the next key "b".
+  ASSERT_OK(Put(1, "a", "one"));
+  ASSERT_OK(Put(1, "a", "two"));
+  ASSERT_OK(Put(1, "b", "bone"));
+  Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]);
+  iter->SeekToFirst();
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0);
+  ASSERT_EQ(IterStatus(iter), "a->two");
+  iter->Next();
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0);
+  ASSERT_EQ(IterStatus(iter), "b->bone");
+  delete iter;
+
+  // Insert a third version of the same user key and verify
+  // that reseek is still not invoked.
+  ASSERT_OK(Put(1, "a", "three"));
+  iter = db_->NewIterator(ReadOptions(), handles_[1]);
+  iter->SeekToFirst();
+  ASSERT_EQ(IterStatus(iter), "a->three");
+  iter->Next();
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0);
+  ASSERT_EQ(IterStatus(iter), "b->bone");
+  delete iter;
+
+  // Insert a fourth version of the same user key and verify
+  // that reseek is now invoked.
+  ASSERT_OK(Put(1, "a", "four"));
+  iter = db_->NewIterator(ReadOptions(), handles_[1]);
+  iter->SeekToFirst();
+  ASSERT_EQ(IterStatus(iter), "a->four");
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0);
+  iter->Next();
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1);
+  ASSERT_EQ(IterStatus(iter), "b->bone");
+  delete iter;
+
+  // Testing the reverse iterator.
+  // At this point, we have four versions of "a" and one version of "b".
+  // The reseek statistic is already at 1.
+  int num_reseeks =
+      (int)TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION);
+
+  // Insert another version of b and assert that reseek is not invoked
+  ASSERT_OK(Put(1, "b", "btwo"));
+  iter = db_->NewIterator(ReadOptions(), handles_[1]);
+  iter->SeekToLast();
+  ASSERT_EQ(IterStatus(iter), "b->btwo");
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION),
+            num_reseeks);
+  iter->Prev();
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION),
+            num_reseeks + 1);
+  ASSERT_EQ(IterStatus(iter), "a->four");
+  delete iter;
 
-  if (Snappy_Supported()) {
-    type = kSnappyCompression;
-    fprintf(stderr, "using snappy\n");
-  } else if (Zlib_Supported()) {
-    type = kZlibCompression;
-    fprintf(stderr, "using zlib\n");
-  } else if (BZip2_Supported()) {
-    type = kBZip2Compression;
-    fprintf(stderr, "using bzip2\n");
-  } else if (LZ4_Supported()) {
-    type = kLZ4Compression;
-    fprintf(stderr, "using lz4\n");
-  } else {
-    fprintf(stderr, "skipping test, compression disabled\n");
-    return false;
-  }
-  options.compression_per_level.resize(options.num_levels);
+  // Insert two more versions of b. This makes a total of 4 versions
+  // of b and 4 versions of a.
+  ASSERT_OK(Put(1, "b", "bthree"));
+  ASSERT_OK(Put(1, "b", "bfour"));
+  iter = db_->NewIterator(ReadOptions(), handles_[1]);
+  iter->SeekToLast();
+  ASSERT_EQ(IterStatus(iter), "b->bfour");
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION),
+            num_reseeks + 2);
+  iter->Prev();
 
-  // do not compress L0
-  for (int i = 0; i < 1; i++) {
-    options.compression_per_level[i] = kNoCompression;
-  }
-  for (int i = 1; i < options.num_levels; i++) {
-    options.compression_per_level[i] = type;
-  }
-  return true;
+  // the previous Prev call should have invoked reseek
+  ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION),
+            num_reseeks + 3);
+  ASSERT_EQ(IterStatus(iter), "a->four");
+  delete iter;
 }
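
The skip-versus-reseek tradeoff this test measures is controlled by
max_sequential_skip_in_iterations: once an iterator must step over more than
that many internal entries for the same user key, it issues a fresh seek
instead, which is counted by the NUMBER_OF_RESEEKS_IN_ITERATION ticker. A
sketch of observing the same counter outside the test harness (the path and
version counts are illustrative):

    #include <cassert>
    #include <string>
    #include "rocksdb/db.h"
    #include "rocksdb/statistics.h"

    int main() {
      rocksdb::Options options;
      options.create_if_missing = true;
      options.max_sequential_skip_in_iterations = 3;  // low threshold, as in the test
      options.statistics = rocksdb::CreateDBStatistics();
      rocksdb::DB* db;
      assert(rocksdb::DB::Open(options, "/tmp/reseek_demo", &db).ok());

      // Pile up many stale versions of "a" in the memtable.
      for (int i = 0; i < 10; i++) {
        db->Put(rocksdb::WriteOptions(), "a", std::to_string(i));
      }
      db->Put(rocksdb::WriteOptions(), "b", "vb");

      rocksdb::Iterator* iter = db->NewIterator(rocksdb::ReadOptions());
      iter->SeekToFirst();
      iter->Next();  // stepping from "a" to "b" must pass the stale versions
      uint64_t reseeks = options.statistics->getTickerCount(
          rocksdb::NUMBER_OF_RESEEKS_IN_ITERATION);
      assert(reseeks >= 1);  // the skip threshold was exceeded

      delete iter;
      delete db;
      return 0;
    }
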
-}  // namespace
 
-TEST_F(DBTest, MinLevelToCompress1) {
-  Options options = CurrentOptions();
-  CompressionType type = kSnappyCompression;
-  if (!MinLevelToCompress(type, options, -14, -1, 0)) {
-    return;
-  }
-  Reopen(options);
-  MinLevelHelper(this, options);
+TEST_F(DBTest, IterSmallAndLargeMix) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    ASSERT_OK(Put(1, "a", "va"));
+    ASSERT_OK(Put(1, "b", std::string(100000, 'b')));
+    ASSERT_OK(Put(1, "c", "vc"));
+    ASSERT_OK(Put(1, "d", std::string(100000, 'd')));
+    ASSERT_OK(Put(1, "e", std::string(100000, 'e')));
 
-  // do not compress L0 and L1
-  for (int i = 0; i < 2; i++) {
-    options.compression_per_level[i] = kNoCompression;
-  }
-  for (int i = 2; i < options.num_levels; i++) {
-    options.compression_per_level[i] = type;
-  }
-  DestroyAndReopen(options);
-  MinLevelHelper(this, options);
-}
+    Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]);
 
-TEST_F(DBTest, MinLevelToCompress2) {
-  Options options = CurrentOptions();
-  CompressionType type = kSnappyCompression;
-  if (!MinLevelToCompress(type, options, 15, -1, 0)) {
-    return;
-  }
-  Reopen(options);
-  MinLevelHelper(this, options);
+    iter->SeekToFirst();
+    ASSERT_EQ(IterStatus(iter), "a->va");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b'));
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "c->vc");
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd'));
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e'));
+    iter->Next();
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
 
-  // do not compress L0 and L1
-  for (int i = 0; i < 2; i++) {
-    options.compression_per_level[i] = kNoCompression;
-  }
-  for (int i = 2; i < options.num_levels; i++) {
-    options.compression_per_level[i] = type;
-  }
-  DestroyAndReopen(options);
-  MinLevelHelper(this, options);
+    iter->SeekToLast();
+    ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e'));
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd'));
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "c->vc");
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b'));
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "a->va");
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+    delete iter;
+  } while (ChangeCompactOptions());
 }
 
-TEST_F(DBTest, RepeatedWritesToSameKey) {
+TEST_F(DBTest, IterMultiWithDelete) {
   do {
-    Options options;
-    options.env = env_;
-    options.write_buffer_size = 100000;  // Small write buffer
-    options = CurrentOptions(options);
-    CreateAndReopenWithCF({"pikachu"}, options);
-
-    // We must have at most one file per level except for level-0,
-    // which may have up to kL0_StopWritesTrigger files.
-    const int kMaxFiles =
-        options.num_levels + options.level0_stop_writes_trigger;
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    ASSERT_OK(Put(1, "ka", "va"));
+    ASSERT_OK(Put(1, "kb", "vb"));
+    ASSERT_OK(Put(1, "kc", "vc"));
+    ASSERT_OK(Delete(1, "kb"));
+    ASSERT_EQ("NOT_FOUND", Get(1, "kb"));
 
-    Random rnd(301);
-    std::string value =
-        RandomString(&rnd, static_cast<int>(2 * options.write_buffer_size));
-    for (int i = 0; i < 5 * kMaxFiles; i++) {
-      ASSERT_OK(Put(1, "key", value));
-      ASSERT_LE(TotalTableFiles(1), kMaxFiles);
+    Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]);
+    iter->Seek("kc");
+    ASSERT_EQ(IterStatus(iter), "kc->vc");
+    if (!CurrentOptions().merge_operator) {
+      // TODO: merge operator does not support backward iteration yet
+      if (kPlainTableAllBytesPrefix != option_config_ &&
+          kBlockBasedTableWithWholeKeyHashIndex != option_config_ &&
+          kHashLinkList != option_config_) {
+        iter->Prev();
+        ASSERT_EQ(IterStatus(iter), "ka->va");
+      }
     }
-  } while (ChangeCompactOptions());
+    delete iter;
+  } while (ChangeOptions());
 }
 
-TEST_F(DBTest, InPlaceUpdate) {
+TEST_F(DBTest, IterPrevMaxSkip) {
   do {
-    Options options;
-    options.create_if_missing = true;
-    options.inplace_update_support = true;
-    options.env = env_;
-    options.write_buffer_size = 100000;
-    options = CurrentOptions(options);
-    CreateAndReopenWithCF({"pikachu"}, options);
-
-    // Update key with values of smaller size
-    int numValues = 10;
-    for (int i = numValues; i > 0; i--) {
-      std::string value = DummyString(i, 'a');
-      ASSERT_OK(Put(1, "key", value));
-      ASSERT_EQ(value, Get(1, "key"));
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    for (int i = 0; i < 2; i++) {
+      ASSERT_OK(Put(1, "key1", "v1"));
+      ASSERT_OK(Put(1, "key2", "v2"));
+      ASSERT_OK(Put(1, "key3", "v3"));
+      ASSERT_OK(Put(1, "key4", "v4"));
+      ASSERT_OK(Put(1, "key5", "v5"));
     }
 
-    // Only 1 instance for that key.
-    validateNumberOfEntries(1, 1);
+    VerifyIterLast("key5->v5", 1);
 
-  } while (ChangeCompactOptions());
-}
+    ASSERT_OK(Delete(1, "key5"));
+    VerifyIterLast("key4->v4", 1);
 
-TEST_F(DBTest, InPlaceUpdateLargeNewValue) {
-  do {
-    Options options;
-    options.create_if_missing = true;
-    options.inplace_update_support = true;
-    options.env = env_;
-    options.write_buffer_size = 100000;
-    options = CurrentOptions(options);
-    CreateAndReopenWithCF({"pikachu"}, options);
+    ASSERT_OK(Delete(1, "key4"));
+    VerifyIterLast("key3->v3", 1);
 
-    // Update key with values of larger size
-    int numValues = 10;
-    for (int i = 0; i < numValues; i++) {
-      std::string value = DummyString(i, 'a');
-      ASSERT_OK(Put(1, "key", value));
-      ASSERT_EQ(value, Get(1, "key"));
-    }
+    ASSERT_OK(Delete(1, "key3"));
+    VerifyIterLast("key2->v2", 1);
 
-    // All 10 updates exist in the internal iterator
-    validateNumberOfEntries(numValues, 1);
+    ASSERT_OK(Delete(1, "key2"));
+    VerifyIterLast("key1->v1", 1);
 
-  } while (ChangeCompactOptions());
+    ASSERT_OK(Delete(1, "key1"));
+    VerifyIterLast("(invalid)", 1);
+  } while (ChangeOptions(kSkipMergePut | kSkipNoSeekToLast));
 }
 
-TEST_F(DBTest, InPlaceUpdateCallbackSmallerSize) {
+TEST_F(DBTest, IterWithSnapshot) {
+  anon::OptionsOverride options_override;
+  options_override.skip_policy = kSkipNoSnapshot;
   do {
-    Options options;
-    options.create_if_missing = true;
-    options.inplace_update_support = true;
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions(options_override));
+    ASSERT_OK(Put(1, "key1", "val1"));
+    ASSERT_OK(Put(1, "key2", "val2"));
+    ASSERT_OK(Put(1, "key3", "val3"));
+    ASSERT_OK(Put(1, "key4", "val4"));
+    ASSERT_OK(Put(1, "key5", "val5"));
 
-    options.env = env_;
-    options.write_buffer_size = 100000;
-    options.inplace_callback =
-      rocksdb::DBTest::updateInPlaceSmallerSize;
-    options = CurrentOptions(options);
-    CreateAndReopenWithCF({"pikachu"}, options);
+    const Snapshot *snapshot = db_->GetSnapshot();
+    ReadOptions options;
+    options.snapshot = snapshot;
+    Iterator* iter = db_->NewIterator(options, handles_[1]);
+
+    // Put more values after the snapshot
+    ASSERT_OK(Put(1, "key100", "val100"));
+    ASSERT_OK(Put(1, "key101", "val101"));
 
-    // Update key with values of smaller size
-    int numValues = 10;
-    ASSERT_OK(Put(1, "key", DummyString(numValues, 'a')));
-    ASSERT_EQ(DummyString(numValues, 'c'), Get(1, "key"));
+    iter->Seek("key5");
+    ASSERT_EQ(IterStatus(iter), "key5->val5");
+    if (!CurrentOptions().merge_operator) {
+      // TODO: merge operator does not support backward iteration yet
+      if (kPlainTableAllBytesPrefix != option_config_ &&
+          kBlockBasedTableWithWholeKeyHashIndex != option_config_ &&
+          kHashLinkList != option_config_) {
+        iter->Prev();
+        ASSERT_EQ(IterStatus(iter), "key4->val4");
+        iter->Prev();
+        ASSERT_EQ(IterStatus(iter), "key3->val3");
 
-    for (int i = numValues; i > 0; i--) {
-      ASSERT_OK(Put(1, "key", DummyString(i, 'a')));
-      ASSERT_EQ(DummyString(i - 1, 'b'), Get(1, "key"));
+        iter->Next();
+        ASSERT_EQ(IterStatus(iter), "key4->val4");
+        iter->Next();
+        ASSERT_EQ(IterStatus(iter), "key5->val5");
+      }
+      iter->Next();
+      ASSERT_TRUE(!iter->Valid());
     }
+    db_->ReleaseSnapshot(snapshot);
+    delete iter;
+    // skip as HashCuckooRep does not support snapshot
+  } while (ChangeOptions(kSkipHashCuckoo));
+}
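
Explicit snapshots work the same way for point reads: GetSnapshot() fixes a
sequence number, ReadOptions::snapshot makes reads use it, and
ReleaseSnapshot() must be called before the DB is closed. A small sketch
(scratch path assumed):

    #include <cassert>
    #include <string>
    #include "rocksdb/db.h"

    int main() {
      rocksdb::Options options;
      options.create_if_missing = true;
      rocksdb::DB* db;
      assert(rocksdb::DB::Open(options, "/tmp/snapshot_demo", &db).ok());

      db->Put(rocksdb::WriteOptions(), "key1", "val1");
      const rocksdb::Snapshot* snapshot = db->GetSnapshot();
      db->Put(rocksdb::WriteOptions(), "key1", "val1-new");

      rocksdb::ReadOptions snap_opts;
      snap_opts.snapshot = snapshot;
      std::string value;
      db->Get(snap_opts, "key1", &value);
      assert(value == "val1");      // the snapshot view
      db->Get(rocksdb::ReadOptions(), "key1", &value);
      assert(value == "val1-new");  // the live view

      db->ReleaseSnapshot(snapshot);
      delete db;
      return 0;
    }
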
+
+TEST_F(DBTest, Recover) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    ASSERT_OK(Put(1, "foo", "v1"));
+    ASSERT_OK(Put(1, "baz", "v5"));
 
-    // Only 1 instance for that key.
-    validateNumberOfEntries(1, 1);
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    ASSERT_EQ("v1", Get(1, "foo"));
 
-  } while (ChangeCompactOptions());
+    ASSERT_EQ("v1", Get(1, "foo"));
+    ASSERT_EQ("v5", Get(1, "baz"));
+    ASSERT_OK(Put(1, "bar", "v2"));
+    ASSERT_OK(Put(1, "foo", "v3"));
+
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    ASSERT_EQ("v3", Get(1, "foo"));
+    ASSERT_OK(Put(1, "foo", "v4"));
+    ASSERT_EQ("v4", Get(1, "foo"));
+    ASSERT_EQ("v2", Get(1, "bar"));
+    ASSERT_EQ("v5", Get(1, "baz"));
+  } while (ChangeOptions());
 }
 
-TEST_F(DBTest, InPlaceUpdateCallbackSmallerVarintSize) {
+TEST_F(DBTest, RecoverWithTableHandle) {
   do {
     Options options;
     options.create_if_missing = true;
-    options.inplace_update_support = true;
-
-    options.env = env_;
-    options.write_buffer_size = 100000;
-    options.inplace_callback =
-      rocksdb::DBTest::updateInPlaceSmallerVarintSize;
+    options.write_buffer_size = 100;
+    options.disable_auto_compactions = true;
     options = CurrentOptions(options);
+    DestroyAndReopen(options);
     CreateAndReopenWithCF({"pikachu"}, options);
 
-    // Update key with values of smaller varint size
-    int numValues = 265;
-    ASSERT_OK(Put(1, "key", DummyString(numValues, 'a')));
-    ASSERT_EQ(DummyString(numValues, 'c'), Get(1, "key"));
+    ASSERT_OK(Put(1, "foo", "v1"));
+    ASSERT_OK(Put(1, "bar", "v2"));
+    ASSERT_OK(Flush(1));
+    ASSERT_OK(Put(1, "foo", "v3"));
+    ASSERT_OK(Put(1, "bar", "v4"));
+    ASSERT_OK(Flush(1));
+    ASSERT_OK(Put(1, "big", std::string(100, 'a')));
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
 
-    for (int i = numValues; i > 0; i--) {
-      ASSERT_OK(Put(1, "key", DummyString(i, 'a')));
-      ASSERT_EQ(DummyString(1, 'b'), Get(1, "key"));
+    std::vector<std::vector<FileMetaData>> files;
+    dbfull()->TEST_GetFilesMetaData(handles_[1], &files);
+    int total_files = 0;
+    for (const auto& level : files) {
+      total_files += level.size();
     }
-
-    // Only 1 instance for that key.
-    validateNumberOfEntries(1, 1);
-
-  } while (ChangeCompactOptions());
+    ASSERT_EQ(total_files, 3);
+    for (const auto& level : files) {
+      for (const auto& file : level) {
+        if (kInfiniteMaxOpenFiles == option_config_) {
+          ASSERT_TRUE(file.table_reader_handle != nullptr);
+        } else {
+          ASSERT_TRUE(file.table_reader_handle == nullptr);
+        }
+      }
+    }
+  } while (ChangeOptions());
 }
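
The kInfiniteMaxOpenFiles configuration this test distinguishes corresponds
to the public max_open_files knob: with -1, every table file's reader stays
cached after recovery; with a finite cap, readers are opened lazily on first
access. An illustrative setting (the values are examples only):

    rocksdb::Options options;
    options.max_open_files = -1;    // pin all table readers; more memory, fewer file opens
    // options.max_open_files = 5000;  // finite cap: readers loaded on demand
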
 
-TEST_F(DBTest, InPlaceUpdateCallbackLargeNewValue) {
-  do {
-    Options options;
-    options.create_if_missing = true;
-    options.inplace_update_support = true;
-
-    options.env = env_;
-    options.write_buffer_size = 100000;
-    options.inplace_callback =
-      rocksdb::DBTest::updateInPlaceLargerSize;
-    options = CurrentOptions(options);
-    CreateAndReopenWithCF({"pikachu"}, options);
+TEST_F(DBTest, IgnoreRecoveredLog) {
+  std::string backup_logs = dbname_ + "/backup_logs";
 
-    // Update key with values of larger size
-    int numValues = 10;
-    for (int i = 0; i < numValues; i++) {
-      ASSERT_OK(Put(1, "key", DummyString(i, 'a')));
-      ASSERT_EQ(DummyString(i, 'c'), Get(1, "key"));
+  // delete old files in backup_logs directory
+  env_->CreateDirIfMissing(backup_logs);
+  std::vector<std::string> old_files;
+  env_->GetChildren(backup_logs, &old_files);
+  for (auto& file : old_files) {
+    if (file != "." && file != "..") {
+      env_->DeleteFile(backup_logs + "/" + file);
     }
+  }
 
-    // No inplace updates. All updates are puts with new seq number
-    // All 10 updates exist in the internal iterator
-    validateNumberOfEntries(numValues, 1);
-
-  } while (ChangeCompactOptions());
-}
-
-TEST_F(DBTest, InPlaceUpdateCallbackNoAction) {
   do {
-    Options options;
+    Options options = CurrentOptions();
     options.create_if_missing = true;
-    options.inplace_update_support = true;
-
-    options.env = env_;
-    options.write_buffer_size = 100000;
-    options.inplace_callback =
-      rocksdb::DBTest::updateInPlaceNoAction;
-    options = CurrentOptions(options);
-    CreateAndReopenWithCF({"pikachu"}, options);
-
-    // Callback function requests no actions from db
-    ASSERT_OK(Put(1, "key", DummyString(1, 'a')));
-    ASSERT_EQ(Get(1, "key"), "NOT_FOUND");
-
-  } while (ChangeCompactOptions());
-}
-
-TEST_F(DBTest, CompactionFilter) {
-  Options options = CurrentOptions();
-  options.max_open_files = -1;
-  options.num_levels = 3;
-  options.max_mem_compaction_level = 0;
-  options.compaction_filter_factory = std::make_shared<KeepFilterFactory>();
-  options = CurrentOptions(options);
-  CreateAndReopenWithCF({"pikachu"}, options);
+    options.merge_operator = MergeOperators::CreateUInt64AddOperator();
+    options.wal_dir = dbname_ + "/logs";
+    DestroyAndReopen(options);
 
-  // Write 100K keys, these are written to a few files in L0.
-  const std::string value(10, 'x');
-  for (int i = 0; i < 100000; i++) {
-    char key[100];
-    snprintf(key, sizeof(key), "B%010d", i);
-    Put(1, key, value);
-  }
-  ASSERT_OK(Flush(1));
+    // fill up the DB
+    std::string one, two;
+    PutFixed64(&one, 1);
+    PutFixed64(&two, 2);
+    ASSERT_OK(db_->Merge(WriteOptions(), Slice("foo"), Slice(one)));
+    ASSERT_OK(db_->Merge(WriteOptions(), Slice("foo"), Slice(one)));
+    ASSERT_OK(db_->Merge(WriteOptions(), Slice("bar"), Slice(one)));
 
-  // Push all files to the highest level L2. Verify that
-  // the compaction is each level invokes the filter for
-  // all the keys in that level.
-  cfilter_count = 0;
-  dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
-  ASSERT_EQ(cfilter_count, 100000);
-  cfilter_count = 0;
-  dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]);
-  ASSERT_EQ(cfilter_count, 100000);
-
-  ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
-  ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0);
-  ASSERT_NE(NumTableFilesAtLevel(2, 1), 0);
-  cfilter_count = 0;
-
-  // All the files are in the lowest level.
-  // Verify that all but the 100001st record
-  // has sequence number zero. The 100001st record
-  // is at the tip of this snapshot and cannot
-  // be zeroed out.
-  // TODO: figure out sequence number squashtoo
-  int count = 0;
-  int total = 0;
-  Arena arena;
-  {
-    ScopedArenaIterator iter(
-        dbfull()->TEST_NewInternalIterator(&arena, handles_[1]));
-    iter->SeekToFirst();
-    ASSERT_OK(iter->status());
-    while (iter->Valid()) {
-      ParsedInternalKey ikey(Slice(), 0, kTypeValue);
-      ikey.sequence = -1;
-      ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true);
-      total++;
-      if (ikey.sequence != 0) {
-        count++;
+    // copy the logs to backup
+    std::vector<std::string> logs;
+    env_->GetChildren(options.wal_dir, &logs);
+    for (auto& log : logs) {
+      if (log != ".." && log != ".") {
+        CopyFile(options.wal_dir + "/" + log, backup_logs + "/" + log);
       }
-      iter->Next();
     }
-  }
-  ASSERT_EQ(total, 100000);
-  ASSERT_EQ(count, 1);
-
-  // overwrite all the 100K keys once again.
-  for (int i = 0; i < 100000; i++) {
-    char key[100];
-    snprintf(key, sizeof(key), "B%010d", i);
-    ASSERT_OK(Put(1, key, value));
-  }
-  ASSERT_OK(Flush(1));
-
-  // push all files to the highest level L2. This
-  // means that all keys should pass at least once
-  // via the compaction filter
-  cfilter_count = 0;
-  dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
-  ASSERT_EQ(cfilter_count, 100000);
-  cfilter_count = 0;
-  dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]);
-  ASSERT_EQ(cfilter_count, 100000);
-  ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
-  ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0);
-  ASSERT_NE(NumTableFilesAtLevel(2, 1), 0);
-
-  // create a new database with the compaction
-  // filter in such a way that it deletes all keys
-  options.compaction_filter_factory = std::make_shared<DeleteFilterFactory>();
-  options.create_if_missing = true;
-  DestroyAndReopen(options);
-  CreateAndReopenWithCF({"pikachu"}, options);
 
-  // write all the keys once again.
-  for (int i = 0; i < 100000; i++) {
-    char key[100];
-    snprintf(key, sizeof(key), "B%010d", i);
-    ASSERT_OK(Put(1, key, value));
-  }
-  ASSERT_OK(Flush(1));
-  ASSERT_NE(NumTableFilesAtLevel(0, 1), 0);
-  ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0);
-  ASSERT_EQ(NumTableFilesAtLevel(2, 1), 0);
-
-  // Push all files to the highest level L2. This
-  // triggers the compaction filter to delete all keys,
-  // verify that at the end of the compaction process,
-  // nothing is left.
-  cfilter_count = 0;
-  dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
-  ASSERT_EQ(cfilter_count, 100000);
-  cfilter_count = 0;
-  dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]);
-  ASSERT_EQ(cfilter_count, 0);
-  ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
-  ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0);
+    // recover the DB
+    Reopen(options);
+    ASSERT_EQ(two, Get("foo"));
+    ASSERT_EQ(one, Get("bar"));
+    Close();
 
-  {
-    // Scan the entire database to ensure that nothing is left
-    std::unique_ptr<Iterator> iter(
-        db_->NewIterator(ReadOptions(), handles_[1]));
-    iter->SeekToFirst();
-    count = 0;
-    while (iter->Valid()) {
-      count++;
-      iter->Next();
+    // copy the logs from backup back to wal dir
+    for (auto& log : logs) {
+      if (log != ".." && log != ".") {
+        CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log);
+      }
     }
-    ASSERT_EQ(count, 0);
-  }
+    // This should ignore the log files; recovery should not happen again.
+    // If recovery did happen, the same merge operator would be called twice,
+    // leading to incorrect results.
+    Reopen(options);
+    ASSERT_EQ(two, Get("foo"));
+    ASSERT_EQ(one, Get("bar"));
+    Close();
+    Destroy(options);
+    Reopen(options);
+    Close();
 
-  // The sequence number of the remaining record
-  // is not zeroed out even though it is at the
-  // level Lmax because this record is at the tip
-  // TODO: remove the following or design a different
-  // test
-  count = 0;
-  {
-    ScopedArenaIterator iter(
-        dbfull()->TEST_NewInternalIterator(&arena, handles_[1]));
-    iter->SeekToFirst();
-    ASSERT_OK(iter->status());
-    while (iter->Valid()) {
-      ParsedInternalKey ikey(Slice(), 0, kTypeValue);
-      ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true);
-      ASSERT_NE(ikey.sequence, (unsigned)0);
-      count++;
-      iter->Next();
+    // copy the logs from backup back to wal dir
+    env_->CreateDirIfMissing(options.wal_dir);
+    for (auto& log : logs) {
+      if (log != ".." && log != ".") {
+        CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log);
+      }
     }
-    ASSERT_EQ(count, 0);
-  }
-}
-
-// Tests the edge case where compaction does not produce any output -- all
-// entries are deleted. The compaction should create bunch of 'DeleteFile'
-// entries in VersionEdit, but none of the 'AddFile's.
-TEST_F(DBTest, CompactionFilterDeletesAll) {
-  Options options;
-  options.compaction_filter_factory = std::make_shared<DeleteFilterFactory>();
-  options.disable_auto_compactions = true;
-  options.create_if_missing = true;
-  options = CurrentOptions(options);
-  DestroyAndReopen(options);
+    // assert that we successfully recovered only from logs, even though we
+    // destroyed the DB
+    Reopen(options);
+    ASSERT_EQ(two, Get("foo"));
+    ASSERT_EQ(one, Get("bar"));
 
-  // put some data
-  for (int table = 0; table < 4; ++table) {
-    for (int i = 0; i < 10 + table; ++i) {
-      Put(ToString(table * 100 + i), "val");
+    // Recovery will fail if DB directory doesn't exist.
+    Destroy(options);
+    // copy the logs from backup back to wal dir
+    env_->CreateDirIfMissing(options.wal_dir);
+    for (auto& log : logs) {
+      if (log != ".." && log != ".") {
+        CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log);
+        // we won't be needing this file anymore
+        env_->DeleteFile(backup_logs + "/" + log);
+      }
     }
-    Flush();
-  }
+    Status s = TryReopen(options);
+    ASSERT_TRUE(!s.ok());
+  } while (ChangeOptions(kSkipHashCuckoo));
+}
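
The log shuffling above works because wal_dir decouples the write-ahead logs
from the data directory, and recovery replays whatever live logs it finds
there. A minimal sketch of the same persistence path (placeholder paths):

    #include <cassert>
    #include <string>
    #include "rocksdb/db.h"

    int main() {
      rocksdb::Options options;
      options.create_if_missing = true;
      options.wal_dir = "/tmp/waldir_demo/logs";  // WALs live outside the data dir
      rocksdb::DB* db;
      assert(rocksdb::DB::Open(options, "/tmp/waldir_demo/db", &db).ok());
      db->Put(rocksdb::WriteOptions(), "foo", "bar");
      delete db;  // close without an explicit flush

      // Reopening replays the logs found under wal_dir.
      assert(rocksdb::DB::Open(options, "/tmp/waldir_demo/db", &db).ok());
      std::string value;
      assert(db->Get(rocksdb::ReadOptions(), "foo", &value).ok());
      assert(value == "bar");
      delete db;
      return 0;
    }
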
 
-  // this will produce empty file (delete compaction filter)
-  ASSERT_OK(db_->CompactRange(nullptr, nullptr));
-  ASSERT_EQ(0U, CountLiveFiles());
+TEST_F(DBTest, CheckLock) {
+  do {
+    DB* localdb;
+    Options options = CurrentOptions();
+    ASSERT_OK(TryReopen(options));
 
-  Reopen(options);
+    // second open should fail
+    ASSERT_TRUE(!(DB::Open(options, dbname_, &localdb)).ok());
+  } while (ChangeCompactOptions());
+}
 
-  Iterator* itr = db_->NewIterator(ReadOptions());
-  itr->SeekToFirst();
-  // empty db
-  ASSERT_TRUE(!itr->Valid());
+TEST_F(DBTest, FlushMultipleMemtable) {
+  do {
+    Options options = CurrentOptions();
+    WriteOptions writeOpt = WriteOptions();
+    writeOpt.disableWAL = true;
+    options.max_write_buffer_number = 4;
+    options.min_write_buffer_number_to_merge = 3;
+    options.max_write_buffer_number_to_maintain = -1;
+    CreateAndReopenWithCF({"pikachu"}, options);
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1"));
+    ASSERT_OK(Flush(1));
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1"));
 
-  delete itr;
+    ASSERT_EQ("v1", Get(1, "foo"));
+    ASSERT_EQ("v1", Get(1, "bar"));
+    ASSERT_OK(Flush(1));
+  } while (ChangeCompactOptions());
 }
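
The options exercised here form the memtable pipeline: write_buffer_size is
the size of a single memtable, max_write_buffer_number bounds how many
memtables (active plus immutable) may exist at once, and
min_write_buffer_number_to_merge is how many immutable memtables get merged
into one flush. Illustrative values only:

    rocksdb::Options options;
    options.write_buffer_size = 64 << 20;          // 64MB per memtable
    options.max_write_buffer_number = 4;           // 1 active + up to 3 immutable
    options.min_write_buffer_number_to_merge = 3;  // flush 3 immutables together
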
 
-TEST_F(DBTest, CompactionFilterWithValueChange) {
+TEST_F(DBTest, NumImmutableMemTable) {
   do {
-    Options options;
-    options.num_levels = 3;
-    options.max_mem_compaction_level = 0;
-    options.compaction_filter_factory =
-      std::make_shared<ChangeFilterFactory>();
-    options = CurrentOptions(options);
+    Options options = CurrentOptions();
+    WriteOptions writeOpt = WriteOptions();
+    writeOpt.disableWAL = true;
+    options.max_write_buffer_number = 4;
+    options.min_write_buffer_number_to_merge = 3;
+    options.max_write_buffer_number_to_maintain = 0;
+    options.write_buffer_size = 1000000;
     CreateAndReopenWithCF({"pikachu"}, options);
 
-    // Write 100K+1 keys, these are written to a few files
-    // in L0. We do this so that the current snapshot points
-    // to the 100001 key.The compaction filter is  not invoked
-    // on keys that are visible via a snapshot because we
-    // anyways cannot delete it.
-    const std::string value(10, 'x');
-    for (int i = 0; i < 100001; i++) {
-      char key[100];
-      snprintf(key, sizeof(key), "B%010d", i);
-      Put(1, key, value);
-    }
+    std::string big_value(1000000 * 2, 'x');
+    std::string num;
+    SetPerfLevel(kEnableTime);
+    ASSERT_TRUE(GetPerfLevel() == kEnableTime);
 
-    // push all files to  lower levels
-    ASSERT_OK(Flush(1));
-    if (option_config_ != kUniversalCompactionMultiLevel) {
-      dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
-      dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]);
-    } else {
-      dbfull()->CompactRange(handles_[1], nullptr, nullptr);
-    }
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k1", big_value));
+    ASSERT_TRUE(dbfull()->GetProperty(handles_[1],
+                                      "rocksdb.num-immutable-mem-table", &num));
+    ASSERT_EQ(num, "0");
+    ASSERT_TRUE(dbfull()->GetProperty(
+        handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+    ASSERT_EQ(num, "1");
+    perf_context.Reset();
+    Get(1, "k1");
+    ASSERT_EQ(1, (int) perf_context.get_from_memtable_count);
+
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k2", big_value));
+    ASSERT_TRUE(dbfull()->GetProperty(handles_[1],
+                                      "rocksdb.num-immutable-mem-table", &num));
+    ASSERT_EQ(num, "1");
+    ASSERT_TRUE(dbfull()->GetProperty(
+        handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+    ASSERT_EQ(num, "1");
+    ASSERT_TRUE(dbfull()->GetProperty(
+        handles_[1], "rocksdb.num-entries-imm-mem-tables", &num));
+    ASSERT_EQ(num, "1");
+
+    perf_context.Reset();
+    Get(1, "k1");
+    ASSERT_EQ(2, (int) perf_context.get_from_memtable_count);
+    perf_context.Reset();
+    Get(1, "k2");
+    ASSERT_EQ(1, (int) perf_context.get_from_memtable_count);
 
-    // re-write all data again
-    for (int i = 0; i < 100001; i++) {
-      char key[100];
-      snprintf(key, sizeof(key), "B%010d", i);
-      Put(1, key, value);
-    }
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k3", big_value));
+    ASSERT_TRUE(dbfull()->GetProperty(
+        handles_[1], "rocksdb.cur-size-active-mem-table", &num));
+    ASSERT_TRUE(dbfull()->GetProperty(handles_[1],
+                                      "rocksdb.num-immutable-mem-table", &num));
+    ASSERT_EQ(num, "2");
+    ASSERT_TRUE(dbfull()->GetProperty(
+        handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+    ASSERT_EQ(num, "1");
+    ASSERT_TRUE(dbfull()->GetProperty(
+        handles_[1], "rocksdb.num-entries-imm-mem-tables", &num));
+    ASSERT_EQ(num, "2");
+    perf_context.Reset();
+    Get(1, "k2");
+    ASSERT_EQ(2, (int) perf_context.get_from_memtable_count);
+    perf_context.Reset();
+    Get(1, "k3");
+    ASSERT_EQ(1, (int) perf_context.get_from_memtable_count);
+    perf_context.Reset();
+    Get(1, "k1");
+    ASSERT_EQ(3, (int) perf_context.get_from_memtable_count);
 
-    // push all files to  lower levels. This should
-    // invoke the compaction filter for all 100000 keys.
     ASSERT_OK(Flush(1));
-    if (option_config_ != kUniversalCompactionMultiLevel) {
-      dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
-      dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]);
-    } else {
-      dbfull()->CompactRange(handles_[1], nullptr, nullptr);
-    }
+    ASSERT_TRUE(dbfull()->GetProperty(handles_[1],
+                                      "rocksdb.num-immutable-mem-table", &num));
+    ASSERT_EQ(num, "0");
+    ASSERT_TRUE(dbfull()->GetProperty(
+        handles_[1], "rocksdb.cur-size-active-mem-table", &num));
+    // "200" is the size of the metadata of an empty skiplist, this would
+    // break if we change the default skiplist implementation
+    ASSERT_EQ(num, "200");
 
-    // verify that all keys now have the new value that
-    // was set by the compaction process.
-    for (int i = 0; i < 100001; i++) {
-      char key[100];
-      snprintf(key, sizeof(key), "B%010d", i);
-      std::string newvalue = Get(1, key);
-      ASSERT_EQ(newvalue.compare(NEW_VALUE), 0);
-    }
-  } while (ChangeCompactOptions());
-}
+    uint64_t int_num;
+    uint64_t base_total_size;
+    ASSERT_TRUE(dbfull()->GetIntProperty(
+        handles_[1], "rocksdb.estimate-num-keys", &base_total_size));
 
-TEST_F(DBTest, CompactionFilterWithMergeOperator) {
-  std::string one, two, three, four;
-  PutFixed64(&one, 1);
-  PutFixed64(&two, 2);
-  PutFixed64(&three, 3);
-  PutFixed64(&four, 4);
+    ASSERT_OK(dbfull()->Delete(writeOpt, handles_[1], "k2"));
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k3", ""));
+    ASSERT_OK(dbfull()->Delete(writeOpt, handles_[1], "k3"));
+    ASSERT_TRUE(dbfull()->GetIntProperty(
+        handles_[1], "rocksdb.num-deletes-active-mem-table", &int_num));
+    ASSERT_EQ(int_num, 2U);
+    ASSERT_TRUE(dbfull()->GetIntProperty(
+        handles_[1], "rocksdb.num-entries-active-mem-table", &int_num));
+    ASSERT_EQ(int_num, 3U);
 
-  Options options;
-  options = CurrentOptions(options);
-  options.create_if_missing = true;
-  options.merge_operator = MergeOperators::CreateUInt64AddOperator();
-  options.num_levels = 3;
-  options.max_mem_compaction_level = 0;
-  // Filter out keys with value is 2.
-  options.compaction_filter_factory =
-      std::make_shared<ConditionalFilterFactory>(two);
-  DestroyAndReopen(options);
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k2", big_value));
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k2", big_value));
+    ASSERT_TRUE(dbfull()->GetIntProperty(
+        handles_[1], "rocksdb.num-entries-imm-mem-tables", &int_num));
+    ASSERT_EQ(int_num, 4U);
+    ASSERT_TRUE(dbfull()->GetIntProperty(
+        handles_[1], "rocksdb.num-deletes-imm-mem-tables", &int_num));
+    ASSERT_EQ(int_num, 2U);
 
-  // In the same compaction, a value type needs to be deleted based on
-  // compaction filter, and there is a merge type for the key. compaction
-  // filter result is ignored.
-  ASSERT_OK(db_->Put(WriteOptions(), "foo", two));
-  ASSERT_OK(Flush());
-  ASSERT_OK(db_->Merge(WriteOptions(), "foo", one));
-  ASSERT_OK(Flush());
-  std::string newvalue = Get("foo");
-  ASSERT_EQ(newvalue, three);
-  dbfull()->CompactRange(nullptr, nullptr);
-  newvalue = Get("foo");
-  ASSERT_EQ(newvalue, three);
-
-  // value key can be deleted based on compaction filter, leaving only
-  // merge keys.
-  ASSERT_OK(db_->Put(WriteOptions(), "bar", two));
-  ASSERT_OK(Flush());
-  dbfull()->CompactRange(nullptr, nullptr);
-  newvalue = Get("bar");
-  ASSERT_EQ("NOT_FOUND", newvalue);
-  ASSERT_OK(db_->Merge(WriteOptions(), "bar", two));
-  ASSERT_OK(Flush());
-  dbfull()->CompactRange(nullptr, nullptr);
-  newvalue = Get("bar");
-  ASSERT_EQ(two, two);
+    ASSERT_TRUE(dbfull()->GetIntProperty(
+        handles_[1], "rocksdb.estimate-num-keys", &int_num));
+    ASSERT_EQ(int_num, base_total_size + 1);
 
-  // Compaction filter never applies to merge keys.
-  ASSERT_OK(db_->Put(WriteOptions(), "foobar", one));
-  ASSERT_OK(Flush());
-  ASSERT_OK(db_->Merge(WriteOptions(), "foobar", two));
-  ASSERT_OK(Flush());
-  newvalue = Get("foobar");
-  ASSERT_EQ(newvalue, three);
-  dbfull()->CompactRange(nullptr, nullptr);
-  newvalue = Get("foobar");
-  ASSERT_EQ(newvalue, three);
-
-  // In the same compaction, both of value type and merge type keys need to be
-  // deleted based on compaction filter, and there is a merge type for the key.
-  // For both keys, compaction filter results are ignored.
-  ASSERT_OK(db_->Put(WriteOptions(), "barfoo", two));
-  ASSERT_OK(Flush());
-  ASSERT_OK(db_->Merge(WriteOptions(), "barfoo", two));
-  ASSERT_OK(Flush());
-  newvalue = Get("barfoo");
-  ASSERT_EQ(newvalue, four);
-  dbfull()->CompactRange(nullptr, nullptr);
-  newvalue = Get("barfoo");
-  ASSERT_EQ(newvalue, four);
+    SetPerfLevel(kDisable);
+    ASSERT_TRUE(GetPerfLevel() == kDisable);
+  } while (ChangeCompactOptions());
 }
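
All of the properties polled above are available on any open handle through
GetProperty (string form) and GetIntProperty (integer form). A short usage
sketch, assuming db is an open rocksdb::DB*:

    std::string num;
    uint64_t int_num = 0;
    if (db->GetProperty("rocksdb.num-immutable-mem-table", &num)) {
      // num holds the counter as a decimal string
    }
    if (db->GetIntProperty("rocksdb.estimate-num-keys", &int_num)) {
      // int_num holds the estimate directly as an integer
    }
    // Both return false if the property name is unknown.
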
 
-TEST_F(DBTest, CompactionFilterContextManual) {
-  KeepFilterFactory* filter = new KeepFilterFactory();
+TEST_F(DBTest, FlushEmptyColumnFamily) {
+  // Block flush thread and disable compaction thread
+  env_->SetBackgroundThreads(1, Env::HIGH);
+  env_->SetBackgroundThreads(1, Env::LOW);
+  test::SleepingBackgroundTask sleeping_task_low;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+  test::SleepingBackgroundTask sleeping_task_high;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+                 &sleeping_task_high, Env::Priority::HIGH);
 
   Options options = CurrentOptions();
-  options.compaction_style = kCompactionStyleUniversal;
-  options.compaction_filter_factory.reset(filter);
-  options.compression = kNoCompression;
-  options.level0_file_num_compaction_trigger = 8;
-  Reopen(options);
-  int num_keys_per_file = 400;
-  for (int j = 0; j < 3; j++) {
-    // Write several keys.
-    const std::string value(10, 'x');
-    for (int i = 0; i < num_keys_per_file; i++) {
-      char key[100];
-      snprintf(key, sizeof(key), "B%08d%02d", i, j);
-      Put(key, value);
-    }
-    dbfull()->TEST_FlushMemTable();
-    // Make sure next file is much smaller so automatic compaction will not
-    // be triggered.
-    num_keys_per_file /= 2;
-  }
-
-  // Force a manual compaction
-  cfilter_count = 0;
-  filter->expect_manual_compaction_.store(true);
-  filter->expect_full_compaction_.store(false);  // Manual compaction always
-                                                 // set this flag.
-  dbfull()->CompactRange(nullptr, nullptr);
-  ASSERT_EQ(cfilter_count, 700);
-  ASSERT_EQ(NumSortedRuns(0), 1);
-
-  // Verify total number of keys is correct after manual compaction.
-  {
-    int count = 0;
-    int total = 0;
-    Arena arena;
-    ScopedArenaIterator iter(dbfull()->TEST_NewInternalIterator(&arena));
-    iter->SeekToFirst();
-    ASSERT_OK(iter->status());
-    while (iter->Valid()) {
-      ParsedInternalKey ikey(Slice(), 0, kTypeValue);
-      ikey.sequence = -1;
-      ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true);
-      total++;
-      if (ikey.sequence != 0) {
-        count++;
-      }
-      iter->Next();
-    }
-    ASSERT_EQ(total, 700);
-    ASSERT_EQ(count, 1);
-  }
-}
-
-class KeepFilterV2 : public CompactionFilterV2 {
- public:
-  virtual std::vector<bool> Filter(int level,
-                                   const SliceVector& keys,
-                                   const SliceVector& existing_values,
-                                   std::vector<std::string>* new_values,
-                                   std::vector<bool>* values_changed)
-    const override {
-    cfilter_count++;
-    std::vector<bool> ret;
-    new_values->clear();
-    values_changed->clear();
-    for (unsigned int i = 0; i < keys.size(); ++i) {
-      values_changed->push_back(false);
-      ret.push_back(false);
-    }
-    return ret;
-  }
+  // disable compaction
+  options.disable_auto_compactions = true;
+  WriteOptions writeOpt = WriteOptions();
+  writeOpt.disableWAL = true;
+  options.max_write_buffer_number = 2;
+  options.min_write_buffer_number_to_merge = 1;
+  options.max_write_buffer_number_to_maintain = 1;
+  CreateAndReopenWithCF({"pikachu"}, options);
 
-  virtual const char* Name() const override {
-    return "KeepFilterV2";
-  }
-};
+  // Flushing an empty memtable can still go through even if no background
+  // thread is available to run the flush.
+  ASSERT_OK(Flush(0));
+  ASSERT_OK(Flush(1));
 
-class DeleteFilterV2 : public CompactionFilterV2 {
- public:
-  virtual std::vector<bool> Filter(int level,
-                                   const SliceVector& keys,
-                                   const SliceVector& existing_values,
-                                   std::vector<std::string>* new_values,
-                                   std::vector<bool>* values_changed)
-    const override {
-    cfilter_count++;
-    new_values->clear();
-    values_changed->clear();
-    std::vector<bool> ret;
-    for (unsigned int i = 0; i < keys.size(); ++i) {
-      values_changed->push_back(false);
-      ret.push_back(true);
-    }
-    return ret;
-  }
+  // Insert can go through
+  ASSERT_OK(dbfull()->Put(writeOpt, handles_[0], "foo", "v1"));
+  ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1"));
 
-  virtual const char* Name() const override {
-    return "DeleteFilterV2";
-  }
-};
+  ASSERT_EQ("v1", Get(0, "foo"));
+  ASSERT_EQ("v1", Get(1, "bar"));
 
-class ChangeFilterV2 : public CompactionFilterV2 {
- public:
-  virtual std::vector<bool> Filter(int level,
-                                   const SliceVector& keys,
-                                   const SliceVector& existing_values,
-                                   std::vector<std::string>* new_values,
-                                   std::vector<bool>* values_changed)
-    const override {
-    std::vector<bool> ret;
-    new_values->clear();
-    values_changed->clear();
-    for (unsigned int i = 0; i < keys.size(); ++i) {
-      values_changed->push_back(true);
-      new_values->push_back(NEW_VALUE);
-      ret.push_back(false);
-    }
-    return ret;
-  }
+  sleeping_task_high.WakeUp();
+  sleeping_task_high.WaitUntilDone();
 
-  virtual const char* Name() const override {
-    return "ChangeFilterV2";
-  }
-};
+  // Flush can still go through.
+  ASSERT_OK(Flush(0));
+  ASSERT_OK(Flush(1));
 
-class KeepFilterFactoryV2 : public CompactionFilterFactoryV2 {
- public:
-  explicit KeepFilterFactoryV2(const SliceTransform* prefix_extractor)
-    : CompactionFilterFactoryV2(prefix_extractor) { }
+  sleeping_task_low.WakeUp();
+  sleeping_task_low.WaitUntilDone();
+}
 
-  virtual std::unique_ptr<CompactionFilterV2>
-  CreateCompactionFilterV2(
-      const CompactionFilterContext& context) override {
-    return std::unique_ptr<CompactionFilterV2>(new KeepFilterV2());
-  }
+TEST_F(DBTest, GetProperty) {
+  // Set sizes to both background thread pool to be 1 and block them.
+  env_->SetBackgroundThreads(1, Env::HIGH);
+  env_->SetBackgroundThreads(1, Env::LOW);
+  test::SleepingBackgroundTask sleeping_task_low;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+  test::SleepingBackgroundTask sleeping_task_high;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+                 &sleeping_task_high, Env::Priority::HIGH);
 
-  virtual const char* Name() const override {
-    return "KeepFilterFactoryV2";
-  }
-};
+  Options options = CurrentOptions();
+  WriteOptions writeOpt = WriteOptions();
+  writeOpt.disableWAL = true;
+  options.compaction_style = kCompactionStyleUniversal;
+  options.level0_file_num_compaction_trigger = 1;
+  options.compaction_options_universal.size_ratio = 50;
+  options.max_background_compactions = 1;
+  options.max_background_flushes = 1;
+  options.max_write_buffer_number = 10;
+  options.min_write_buffer_number_to_merge = 1;
+  options.max_write_buffer_number_to_maintain = 0;
+  options.write_buffer_size = 1000000;
+  Reopen(options);
 
-class DeleteFilterFactoryV2 : public CompactionFilterFactoryV2 {
- public:
-  explicit DeleteFilterFactoryV2(const SliceTransform* prefix_extractor)
-    : CompactionFilterFactoryV2(prefix_extractor) { }
+  std::string big_value(1000000 * 2, 'x');
+  std::string num;
+  uint64_t int_num;
+  SetPerfLevel(kEnableTime);
 
-  virtual std::unique_ptr<CompactionFilterV2>
-  CreateCompactionFilterV2(
-      const CompactionFilterContext& context) override {
-    return std::unique_ptr<CompactionFilterV2>(new DeleteFilterV2());
-  }
+  ASSERT_TRUE(
+      dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num));
+  ASSERT_EQ(int_num, 0U);
+  ASSERT_TRUE(
+      dbfull()->GetIntProperty("rocksdb.estimate-live-data-size", &int_num));
+  ASSERT_EQ(int_num, 0U);
 
-  virtual const char* Name() const override {
-    return "DeleteFilterFactoryV2";
-  }
-};
+  ASSERT_OK(dbfull()->Put(writeOpt, "k1", big_value));
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num));
+  ASSERT_EQ(num, "0");
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.mem-table-flush-pending", &num));
+  ASSERT_EQ(num, "0");
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.compaction-pending", &num));
+  ASSERT_EQ(num, "0");
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.estimate-num-keys", &num));
+  ASSERT_EQ(num, "1");
+  perf_context.Reset();
 
-class ChangeFilterFactoryV2 : public CompactionFilterFactoryV2 {
- public:
-  explicit ChangeFilterFactoryV2(const SliceTransform* prefix_extractor)
-    : CompactionFilterFactoryV2(prefix_extractor) { }
+  ASSERT_OK(dbfull()->Put(writeOpt, "k2", big_value));
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num));
+  ASSERT_EQ(num, "1");
+  ASSERT_OK(dbfull()->Delete(writeOpt, "k-non-existing"));
+  ASSERT_OK(dbfull()->Put(writeOpt, "k3", big_value));
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num));
+  ASSERT_EQ(num, "2");
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.mem-table-flush-pending", &num));
+  ASSERT_EQ(num, "1");
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.compaction-pending", &num));
+  ASSERT_EQ(num, "0");
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.estimate-num-keys", &num));
+  ASSERT_EQ(num, "2");
+  // Verify the same set of properties through GetIntProperty
+  ASSERT_TRUE(
+      dbfull()->GetIntProperty("rocksdb.num-immutable-mem-table", &int_num));
+  ASSERT_EQ(int_num, 2U);
+  ASSERT_TRUE(
+      dbfull()->GetIntProperty("rocksdb.mem-table-flush-pending", &int_num));
+  ASSERT_EQ(int_num, 1U);
+  ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.compaction-pending", &int_num));
+  ASSERT_EQ(int_num, 0U);
+  ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.estimate-num-keys", &int_num));
+  ASSERT_EQ(int_num, 2U);
 
-  virtual std::unique_ptr<CompactionFilterV2>
-  CreateCompactionFilterV2(
-      const CompactionFilterContext& context) override {
-    return std::unique_ptr<CompactionFilterV2>(new ChangeFilterV2());
-  }
+  ASSERT_TRUE(
+      dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num));
+  ASSERT_EQ(int_num, 0U);
 
-  virtual const char* Name() const override {
-    return "ChangeFilterFactoryV2";
-  }
-};
+  sleeping_task_high.WakeUp();
+  sleeping_task_high.WaitUntilDone();
+  dbfull()->TEST_WaitForFlushMemTable();
 
-TEST_F(DBTest, CompactionFilterV2) {
-  Options options = CurrentOptions();
-  options.num_levels = 3;
-  options.max_mem_compaction_level = 0;
-  // extract prefix
-  std::unique_ptr<const SliceTransform> prefix_extractor;
-  prefix_extractor.reset(NewFixedPrefixTransform(8));
-
-  options.compaction_filter_factory_v2
-    = std::make_shared<KeepFilterFactoryV2>(prefix_extractor.get());
-  // In a testing environment, we can only flush the application
-  // compaction filter buffer using universal compaction
-  option_config_ = kUniversalCompaction;
-  options.compaction_style = (rocksdb::CompactionStyle)1;
-  Reopen(options);
+  ASSERT_OK(dbfull()->Put(writeOpt, "k4", big_value));
+  ASSERT_OK(dbfull()->Put(writeOpt, "k5", big_value));
+  dbfull()->TEST_WaitForFlushMemTable();
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.mem-table-flush-pending", &num));
+  ASSERT_EQ(num, "0");
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.compaction-pending", &num));
+  ASSERT_EQ(num, "1");
+  ASSERT_TRUE(dbfull()->GetProperty("rocksdb.estimate-num-keys", &num));
+  ASSERT_EQ(num, "4");
 
-  // Write 100K keys, these are written to a few files in L0.
-  const std::string value(10, 'x');
-  for (int i = 0; i < 100000; i++) {
-    char key[100];
-    snprintf(key, sizeof(key), "B%08d%010d", i , i);
-    Put(key, value);
-  }
+  ASSERT_TRUE(
+      dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num));
+  ASSERT_GT(int_num, 0U);
 
-  dbfull()->TEST_FlushMemTable();
+  sleeping_task_low.WakeUp();
+  sleeping_task_low.WaitUntilDone();
 
-  dbfull()->TEST_CompactRange(0, nullptr, nullptr);
-  dbfull()->TEST_CompactRange(1, nullptr, nullptr);
+  // Wait for compaction to be done. This is important because otherwise RocksDB
+  // might schedule a compaction when reopening the database, failing assertion
+  // (A) as a result.
+  dbfull()->TEST_WaitForCompact();
+  options.max_open_files = 10;
+  Reopen(options);
+  // After reopening, no table reader is loaded, so no memory is used for table readers.
+  ASSERT_TRUE(
+      dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num));
+  ASSERT_EQ(int_num, 0U);  // (A)
+  ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.estimate-num-keys", &int_num));
+  ASSERT_GT(int_num, 0U);
 
-  ASSERT_EQ(NumSortedRuns(0), 1);
+  // After reading a key, at least one table reader is loaded.
+  Get("k5");
+  ASSERT_TRUE(
+      dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num));
+  ASSERT_GT(int_num, 0U);
 
-  // All the files are in the lowest level.
-  int count = 0;
-  int total = 0;
+  // Test rocksdb.num-live-versions
   {
-    Arena arena;
-    ScopedArenaIterator iter(dbfull()->TEST_NewInternalIterator(&arena));
-    iter->SeekToFirst();
-    ASSERT_OK(iter->status());
-    while (iter->Valid()) {
-      ParsedInternalKey ikey(Slice(), 0, kTypeValue);
-      ikey.sequence = -1;
-      ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true);
-      total++;
-      if (ikey.sequence != 0) {
-        count++;
-      }
-      iter->Next();
-    }
-  }
+    options.level0_file_num_compaction_trigger = 20;
+    Reopen(options);
+    ASSERT_TRUE(
+        dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num));
+    ASSERT_EQ(int_num, 1U);
 
-  ASSERT_EQ(total, 100000);
-  // 1 snapshot only. Since we are using universal compacton,
-  // the sequence no is cleared for better compression
-  ASSERT_EQ(count, 1);
+    // Use an iterator to hold current version
+    std::unique_ptr<Iterator> iter1(dbfull()->NewIterator(ReadOptions()));
 
-  // create a new database with the compaction
-  // filter in such a way that it deletes all keys
-  options.compaction_filter_factory_v2 =
-    std::make_shared<DeleteFilterFactoryV2>(prefix_extractor.get());
-  options.create_if_missing = true;
-  DestroyAndReopen(options);
+    ASSERT_OK(dbfull()->Put(writeOpt, "k6", big_value));
+    Flush();
+    ASSERT_TRUE(
+        dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num));
+    ASSERT_EQ(int_num, 2U);
 
-  // write all the keys once again.
-  for (int i = 0; i < 100000; i++) {
-    char key[100];
-    snprintf(key, sizeof(key), "B%08d%010d", i, i);
-    Put(key, value);
-  }
+    // Use an iterator to hold the current version
+    std::unique_ptr<Iterator> iter2(dbfull()->NewIterator(ReadOptions()));
 
-  dbfull()->TEST_FlushMemTable();
-  ASSERT_NE(NumTableFilesAtLevel(0), 0);
+    ASSERT_OK(dbfull()->Put(writeOpt, "k7", big_value));
+    Flush();
+    ASSERT_TRUE(
+        dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num));
+    ASSERT_EQ(int_num, 3U);
 
-  dbfull()->TEST_CompactRange(0, nullptr, nullptr);
-  dbfull()->TEST_CompactRange(1, nullptr, nullptr);
-  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+    iter2.reset();
+    ASSERT_TRUE(
+        dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num));
+    ASSERT_EQ(int_num, 2U);
 
-  // Scan the entire database to ensure that nothing is left
-  Iterator* iter = db_->NewIterator(ReadOptions());
-  iter->SeekToFirst();
-  count = 0;
-  while (iter->Valid()) {
-    count++;
-    iter->Next();
+    iter1.reset();
+    ASSERT_TRUE(
+        dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num));
+    ASSERT_EQ(int_num, 1U);
   }
-
-  ASSERT_EQ(count, 0);
-  delete iter;
 }
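
// An illustrative sketch (not upstream code) of the property-polling pattern
// the test above exercises; it assumes an already-open rocksdb::DB* named db.
// An open iterator pins the current Version, so a flush that installs a new
// Version raises "rocksdb.num-live-versions" until the iterator is released.
#include <cstdio>
#include <memory>
#include "rocksdb/db.h"

void ShowLiveVersions(rocksdb::DB* db) {
  std::unique_ptr<rocksdb::Iterator> it(
      db->NewIterator(rocksdb::ReadOptions()));  // pins the current Version
  uint64_t n = 0;
  if (db->GetIntProperty("rocksdb.num-live-versions", &n)) {
    fprintf(stderr, "live versions: %llu\n", (unsigned long long)n);
  }
}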
 
-TEST_F(DBTest, CompactionFilterV2WithValueChange) {
-  Options options = CurrentOptions();
-  options.num_levels = 3;
-  options.max_mem_compaction_level = 0;
-  std::unique_ptr<const SliceTransform> prefix_extractor;
-  prefix_extractor.reset(NewFixedPrefixTransform(8));
-  options.compaction_filter_factory_v2 =
-    std::make_shared<ChangeFilterFactoryV2>(prefix_extractor.get());
-  // In a testing environment, we can only flush the application
-  // compaction filter buffer using universal compaction
-  option_config_ = kUniversalCompaction;
-  options.compaction_style = (rocksdb::CompactionStyle)1;
+TEST_F(DBTest, ApproximateMemoryUsage) {
+  const int kNumRounds = 10;
+  // TODO(noetzli) kFlushesPerRound does not really correlate with how many
+  // flushes happen.
+  const int kFlushesPerRound = 10;
+  const int kWritesPerFlush = 10;
+  const int kKeySize = 100;
+  const int kValueSize = 1000;
+  Options options;
+  options.write_buffer_size = 1000;  // small write buffer
+  options.min_write_buffer_number_to_merge = 4;
+  options.compression = kNoCompression;
+  options.create_if_missing = true;
   options = CurrentOptions(options);
-  Reopen(options);
+  DestroyAndReopen(options);
 
-  // Write 100K+1 keys, these are written to a few files
-  // in L0. We do this so that the current snapshot points
-  // to the 100001 key.The compaction filter is  not invoked
-  // on keys that are visible via a snapshot because we
-  // anyways cannot delete it.
-  const std::string value(10, 'x');
-  for (int i = 0; i < 100001; i++) {
-    char key[100];
-    snprintf(key, sizeof(key), "B%08d%010d", i, i);
-    Put(key, value);
-  }
+  Random rnd(301);
 
-  // push all files to lower levels
-  dbfull()->TEST_FlushMemTable();
-  dbfull()->TEST_CompactRange(0, nullptr, nullptr);
-  dbfull()->TEST_CompactRange(1, nullptr, nullptr);
+  std::vector<Iterator*> iters;
+
+  uint64_t active_mem;
+  uint64_t unflushed_mem;
+  uint64_t all_mem;
+  uint64_t prev_all_mem;
+
+  // Phase 0. Verify that the initial values of all these properties are the
+  // same, as we have no mem-tables.
+  dbfull()->GetIntProperty("rocksdb.cur-size-active-mem-table", &active_mem);
+  dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem);
+  dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem);
+  ASSERT_EQ(all_mem, active_mem);
+  ASSERT_EQ(all_mem, unflushed_mem);
+
+  // Phase 1. Simply issue Put() and expect "cur-size-all-mem-tables" to equal
+  // "size-all-mem-tables".
+  for (int r = 0; r < kNumRounds; ++r) {
+    for (int f = 0; f < kFlushesPerRound; ++f) {
+      for (int w = 0; w < kWritesPerFlush; ++w) {
+        Put(RandomString(&rnd, kKeySize), RandomString(&rnd, kValueSize));
+      }
+    }
+    // Make sure that there is no flush between getting the two properties.
+    dbfull()->TEST_WaitForFlushMemTable();
+    dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem);
+    dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem);
+    // With no iterators outstanding, these two numbers should be the same.
+    ASSERT_EQ(unflushed_mem, all_mem);
+  }
+  prev_all_mem = all_mem;
+
+  // Phase 2. Keep issuing Put() but also create new iterators. This time we
+  // expect "size-all-mem-tables" > "cur-size-all-mem-tables".
+  for (int r = 0; r < kNumRounds; ++r) {
+    iters.push_back(db_->NewIterator(ReadOptions()));
+    for (int f = 0; f < kFlushesPerRound; ++f) {
+      for (int w = 0; w < kWritesPerFlush; ++w) {
+        Put(RandomString(&rnd, kKeySize), RandomString(&rnd, kValueSize));
+      }
+    }
+    // Force a flush to prevent one from happening between reading the two
+    // properties, or after reading them and before the next round.
+    Flush();
 
-  // verify that all keys now have the new value that
-  // was set by the compaction process.
-  for (int i = 0; i < 100001; i++) {
-    char key[100];
-    snprintf(key, sizeof(key), "B%08d%010d", i, i);
-    std::string newvalue = Get(key);
-    ASSERT_EQ(newvalue.compare(NEW_VALUE), 0);
+    // Iterators created above pin flushed memtables, so the totals diverge.
+    dbfull()->GetIntProperty("rocksdb.cur-size-active-mem-table", &active_mem);
+    dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem);
+    dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem);
+    ASSERT_GT(all_mem, active_mem);
+    ASSERT_GT(all_mem, unflushed_mem);
+    ASSERT_GT(all_mem, prev_all_mem);
+    prev_all_mem = all_mem;
   }
-}
-
-TEST_F(DBTest, CompactionFilterV2NULLPrefix) {
-  Options options = CurrentOptions();
-  options.num_levels = 3;
-  options.max_mem_compaction_level = 0;
-  std::unique_ptr<const SliceTransform> prefix_extractor;
-  prefix_extractor.reset(NewFixedPrefixTransform(8));
-  options.compaction_filter_factory_v2 =
-    std::make_shared<ChangeFilterFactoryV2>(prefix_extractor.get());
-  // In a testing environment, we can only flush the application
-  // compaction filter buffer using universal compaction
-  option_config_ = kUniversalCompaction;
-  options.compaction_style = (rocksdb::CompactionStyle)1;
-  Reopen(options);
 
-  // Write 100K+1 keys, these are written to a few files
-  // in L0. We do this so that the current snapshot points
-  // to the 100001 key.The compaction filter is  not invoked
-  // on keys that are visible via a snapshot because we
-  // anyways cannot delete it.
-  const std::string value(10, 'x');
-  char first_key[100];
-  snprintf(first_key, sizeof(first_key), "%s0000%010d", "NULL", 1);
-  Put(first_key, value);
-  for (int i = 1; i < 100000; i++) {
-    char key[100];
-    snprintf(key, sizeof(key), "%08d%010d", i, i);
-    Put(key, value);
-  }
-
-  char last_key[100];
-  snprintf(last_key, sizeof(last_key), "%s0000%010d", "NULL", 2);
-  Put(last_key, value);
-
-  // push all files to lower levels
-  dbfull()->TEST_FlushMemTable();
-  dbfull()->TEST_CompactRange(0, nullptr, nullptr);
-
-  // verify that all keys now have the new value that
-  // was set by the compaction process.
-  std::string newvalue = Get(first_key);
-  ASSERT_EQ(newvalue.compare(NEW_VALUE), 0);
-  newvalue = Get(last_key);
-  ASSERT_EQ(newvalue.compare(NEW_VALUE), 0);
-  for (int i = 1; i < 100000; i++) {
-    char key[100];
-    snprintf(key, sizeof(key), "%08d%010d", i, i);
-    newvalue = Get(key);
-    ASSERT_EQ(newvalue.compare(NEW_VALUE), 0);
+  // Phase 3. Delete the iterators and expect "size-all-mem-tables" to shrink
+  // whenever we release an iterator.
+  for (auto* iter : iters) {
+    delete iter;
+    dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem);
+    // Expect the size to shrink.
+    ASSERT_LT(all_mem, prev_all_mem);
+    prev_all_mem = all_mem;
   }
-}
-
-TEST_F(DBTest, SparseMerge) {
-  do {
-    Options options = CurrentOptions();
-    options.compression = kNoCompression;
-    CreateAndReopenWithCF({"pikachu"}, options);
-
-    FillLevels("A", "Z", 1);
-
-    // Suppose there is:
-    //    small amount of data with prefix A
-    //    large amount of data with prefix B
-    //    small amount of data with prefix C
-    // and that recent updates have made small changes to all three prefixes.
-    // Check that we do not do a compaction that merges all of B in one shot.
-    const std::string value(1000, 'x');
-    Put(1, "A", "va");
-    // Write approximately 100MB of "B" values
-    for (int i = 0; i < 100000; i++) {
-      char key[100];
-      snprintf(key, sizeof(key), "B%010d", i);
-      Put(1, key, value);
-    }
-    Put(1, "C", "vc");
-    ASSERT_OK(Flush(1));
-    dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
-
-    // Make sparse update
-    Put(1, "A", "va2");
-    Put(1, "B100", "bvalue2");
-    Put(1, "C", "vc2");
-    ASSERT_OK(Flush(1));
 
-    // Compactions should not cause us to create a situation where
-    // a file overlaps too much data at the next level.
-    ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(handles_[1]),
-              20 * 1048576);
-    dbfull()->TEST_CompactRange(0, nullptr, nullptr);
-    ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(handles_[1]),
-              20 * 1048576);
-    dbfull()->TEST_CompactRange(1, nullptr, nullptr);
-    ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(handles_[1]),
-              20 * 1048576);
-  } while (ChangeCompactOptions());
-}
+  // Phase 4. With all iterators released, expect all three counters to be
+  // the same.
+  dbfull()->GetIntProperty("rocksdb.cur-size-active-mem-table", &active_mem);
+  dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem);
+  dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem);
+  ASSERT_EQ(active_mem, unflushed_mem);
+  ASSERT_EQ(unflushed_mem, all_mem);
 
-static bool Between(uint64_t val, uint64_t low, uint64_t high) {
-  bool result = (val >= low) && (val <= high);
-  if (!result) {
-    fprintf(stderr, "Value %llu is not in range [%llu, %llu]\n",
-            (unsigned long long)(val),
-            (unsigned long long)(low),
-            (unsigned long long)(high));
-  }
-  return result;
+  // Phase 5. Reopen, and expect all three counters to be the same again.
+  Reopen(options);
+  dbfull()->GetIntProperty("rocksdb.cur-size-active-mem-table", &active_mem);
+  dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem);
+  dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem);
+  ASSERT_EQ(active_mem, unflushed_mem);
+  ASSERT_EQ(unflushed_mem, all_mem);
 }
 
-TEST_F(DBTest, ApproximateSizes) {
-  do {
-    Options options;
-    options.write_buffer_size = 100000000;        // Large write buffer
-    options.compression = kNoCompression;
-    options.create_if_missing = true;
-    options = CurrentOptions(options);
-    DestroyAndReopen(options);
-    CreateAndReopenWithCF({"pikachu"}, options);
+TEST_F(DBTest, EstimatePendingCompBytes) {
+  // Set the size of both background thread pools to 1, then block the
+  // low-priority pool.
+  env_->SetBackgroundThreads(1, Env::HIGH);
+  env_->SetBackgroundThreads(1, Env::LOW);
+  test::SleepingBackgroundTask sleeping_task_low;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
 
-    ASSERT_TRUE(Between(Size("", "xyz", 1), 0, 0));
-    ReopenWithColumnFamilies({"default", "pikachu"}, options);
-    ASSERT_TRUE(Between(Size("", "xyz", 1), 0, 0));
+  Options options = CurrentOptions();
+  WriteOptions writeOpt = WriteOptions();
+  writeOpt.disableWAL = true;
+  options.compaction_style = kCompactionStyleLevel;
+  options.level0_file_num_compaction_trigger = 2;
+  options.max_background_compactions = 1;
+  options.max_background_flushes = 1;
+  options.max_write_buffer_number = 10;
+  options.min_write_buffer_number_to_merge = 1;
+  options.max_write_buffer_number_to_maintain = 0;
+  options.write_buffer_size = 1000000;
+  Reopen(options);
 
-    // Write 8MB (80 values, each 100K)
-    ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
-    const int N = 80;
-    static const int S1 = 100000;
-    static const int S2 = 105000;  // Allow some expansion from metadata
-    Random rnd(301);
-    for (int i = 0; i < N; i++) {
-      ASSERT_OK(Put(1, Key(i), RandomString(&rnd, S1)));
-    }
+  std::string big_value(1000000 * 2, 'x');
+  std::string num;
+  uint64_t int_num;
 
-    // 0 because GetApproximateSizes() does not account for memtable space
-    ASSERT_TRUE(Between(Size("", Key(50), 1), 0, 0));
+  ASSERT_OK(dbfull()->Put(writeOpt, "k1", big_value));
+  Flush();
+  ASSERT_TRUE(dbfull()->GetIntProperty(
+      "rocksdb.estimate-pending-compaction-bytes", &int_num));
+  ASSERT_EQ(int_num, 0U);
 
-    // Check sizes across recovery by reopening a few times
-    for (int run = 0; run < 3; run++) {
-      ReopenWithColumnFamilies({"default", "pikachu"}, options);
+  ASSERT_OK(dbfull()->Put(writeOpt, "k2", big_value));
+  Flush();
+  ASSERT_TRUE(dbfull()->GetIntProperty(
+      "rocksdb.estimate-pending-compaction-bytes", &int_num));
+  ASSERT_EQ(int_num, 0U);
 
-      for (int compact_start = 0; compact_start < N; compact_start += 10) {
-        for (int i = 0; i < N; i += 10) {
-          ASSERT_TRUE(Between(Size("", Key(i), 1), S1 * i, S2 * i));
-          ASSERT_TRUE(Between(Size("", Key(i) + ".suffix", 1), S1 * (i + 1),
-                              S2 * (i + 1)));
-          ASSERT_TRUE(Between(Size(Key(i), Key(i + 10), 1), S1 * 10, S2 * 10));
-        }
-        ASSERT_TRUE(Between(Size("", Key(50), 1), S1 * 50, S2 * 50));
-        ASSERT_TRUE(
-            Between(Size("", Key(50) + ".suffix", 1), S1 * 50, S2 * 50));
+  ASSERT_OK(dbfull()->Put(writeOpt, "k3", big_value));
+  Flush();
+  ASSERT_TRUE(dbfull()->GetIntProperty(
+      "rocksdb.estimate-pending-compaction-bytes", &int_num));
+  ASSERT_GT(int_num, 0U);
 
-        std::string cstart_str = Key(compact_start);
-        std::string cend_str = Key(compact_start + 9);
-        Slice cstart = cstart_str;
-        Slice cend = cend_str;
-        dbfull()->TEST_CompactRange(0, &cstart, &cend, handles_[1]);
-      }
+  sleeping_task_low.WakeUp();
+  sleeping_task_low.WaitUntilDone();
 
-      ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
-      ASSERT_GT(NumTableFilesAtLevel(1, 1), 0);
-    }
-    // ApproximateOffsetOf() is not yet implemented in plain table format.
-  } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction |
-                         kSkipPlainTable | kSkipHashIndex));
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_TRUE(dbfull()->GetIntProperty(
+      "rocksdb.estimate-pending-compaction-bytes", &int_num));
+  ASSERT_EQ(int_num, 0U);
 }
 
-TEST_F(DBTest, ApproximateSizes_MixOfSmallAndLarge) {
+TEST_F(DBTest, FLUSH) {
   do {
-    Options options = CurrentOptions();
-    options.compression = kNoCompression;
-    CreateAndReopenWithCF({"pikachu"}, options);
-
-    Random rnd(301);
-    std::string big1 = RandomString(&rnd, 100000);
-    ASSERT_OK(Put(1, Key(0), RandomString(&rnd, 10000)));
-    ASSERT_OK(Put(1, Key(1), RandomString(&rnd, 10000)));
-    ASSERT_OK(Put(1, Key(2), big1));
-    ASSERT_OK(Put(1, Key(3), RandomString(&rnd, 10000)));
-    ASSERT_OK(Put(1, Key(4), big1));
-    ASSERT_OK(Put(1, Key(5), RandomString(&rnd, 10000)));
-    ASSERT_OK(Put(1, Key(6), RandomString(&rnd, 300000)));
-    ASSERT_OK(Put(1, Key(7), RandomString(&rnd, 10000)));
-
-    // Check sizes across recovery by reopening a few times
-    for (int run = 0; run < 3; run++) {
-      ReopenWithColumnFamilies({"default", "pikachu"}, options);
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    WriteOptions writeOpt = WriteOptions();
+    writeOpt.disableWAL = true;
+    SetPerfLevel(kEnableTime);
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1"));
+    // this will now also flush the last 2 writes
+    ASSERT_OK(Flush(1));
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1"));
 
-      ASSERT_TRUE(Between(Size("", Key(0), 1), 0, 0));
-      ASSERT_TRUE(Between(Size("", Key(1), 1), 10000, 11000));
-      ASSERT_TRUE(Between(Size("", Key(2), 1), 20000, 21000));
-      ASSERT_TRUE(Between(Size("", Key(3), 1), 120000, 121000));
-      ASSERT_TRUE(Between(Size("", Key(4), 1), 130000, 131000));
-      ASSERT_TRUE(Between(Size("", Key(5), 1), 230000, 231000));
-      ASSERT_TRUE(Between(Size("", Key(6), 1), 240000, 241000));
-      ASSERT_TRUE(Between(Size("", Key(7), 1), 540000, 541000));
-      ASSERT_TRUE(Between(Size("", Key(8), 1), 550000, 560000));
+    perf_context.Reset();
+    Get(1, "foo");
+    ASSERT_TRUE((int) perf_context.get_from_output_files_time > 0);
 
-      ASSERT_TRUE(Between(Size(Key(3), Key(5), 1), 110000, 111000));
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    ASSERT_EQ("v1", Get(1, "foo"));
+    ASSERT_EQ("v1", Get(1, "bar"));
 
-      dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
-    }
-    // ApproximateOffsetOf() is not yet implemented in plain table format.
-  } while (ChangeOptions(kSkipPlainTable));
-}
+    writeOpt.disableWAL = true;
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v2"));
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v2"));
+    ASSERT_OK(Flush(1));
 
-TEST_F(DBTest, IteratorPinsRef) {
-  do {
-    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
-    Put(1, "foo", "hello");
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    ASSERT_EQ("v2", Get(1, "bar"));
+    perf_context.Reset();
+    ASSERT_EQ("v2", Get(1, "foo"));
+    ASSERT_TRUE((int) perf_context.get_from_output_files_time > 0);
 
-    // Get iterator that will yield the current contents of the DB.
-    Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]);
+    writeOpt.disableWAL = false;
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v3"));
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v3"));
+    ASSERT_OK(Flush(1));
 
-    // Write to force compactions
-    Put(1, "foo", "newvalue1");
-    for (int i = 0; i < 100; i++) {
-      // 100K values
-      ASSERT_OK(Put(1, Key(i), Key(i) + std::string(100000, 'v')));
-    }
-    Put(1, "foo", "newvalue2");
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    // 'foo' should be there because its put
+    // has WAL enabled.
+    ASSERT_EQ("v3", Get(1, "foo"));
+    ASSERT_EQ("v3", Get(1, "bar"));
 
-    iter->SeekToFirst();
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ("foo", iter->key().ToString());
-    ASSERT_EQ("hello", iter->value().ToString());
-    iter->Next();
-    ASSERT_TRUE(!iter->Valid());
-    delete iter;
+    SetPerfLevel(kDisable);
   } while (ChangeCompactOptions());
 }
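
// A minimal sketch of the pattern the FLUSH test exercises: writes issued
// with disableWAL survive a restart only if they were flushed to an SST
// first. Names here are illustrative; assumes an open rocksdb::DB* db.
#include "rocksdb/db.h"

rocksdb::Status PutWithoutWal(rocksdb::DB* db) {
  rocksdb::WriteOptions wo;
  wo.disableWAL = true;  // skip the write-ahead log for this Put
  rocksdb::Status s = db->Put(wo, "key", "value");
  if (!s.ok()) return s;
  // Without a WAL entry, only an explicit flush makes the write durable.
  return db->Flush(rocksdb::FlushOptions());
}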
 
-TEST_F(DBTest, Snapshot) {
-  anon::OptionsOverride options_override;
-  options_override.skip_policy = kSkipNoSnapshot;
+TEST_F(DBTest, RecoveryWithEmptyLog) {
   do {
-    CreateAndReopenWithCF({"pikachu"}, CurrentOptions(options_override));
-    Put(0, "foo", "0v1");
-    Put(1, "foo", "1v1");
-
-    const Snapshot* s1 = db_->GetSnapshot();
-    ASSERT_EQ(1U, GetNumSnapshots());
-    uint64_t time_snap1 = GetTimeOldestSnapshots();
-    ASSERT_GT(time_snap1, 0U);
-    Put(0, "foo", "0v2");
-    Put(1, "foo", "1v2");
-
-    env_->addon_time_++;
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    ASSERT_OK(Put(1, "foo", "v1"));
+    ASSERT_OK(Put(1, "foo", "v2"));
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    ASSERT_OK(Put(1, "foo", "v3"));
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    ASSERT_EQ("v3", Get(1, "foo"));
+  } while (ChangeOptions());
+}
 
-    const Snapshot* s2 = db_->GetSnapshot();
-    ASSERT_EQ(2U, GetNumSnapshots());
-    ASSERT_EQ(time_snap1, GetTimeOldestSnapshots());
-    Put(0, "foo", "0v3");
-    Put(1, "foo", "1v3");
 
-    const Snapshot* s3 = db_->GetSnapshot();
-    ASSERT_EQ(3U, GetNumSnapshots());
-    ASSERT_EQ(time_snap1, GetTimeOldestSnapshots());
+TEST_F(DBTest, FlushSchedule) {
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.level0_stop_writes_trigger = 1 << 10;
+  options.level0_slowdown_writes_trigger = 1 << 10;
+  options.min_write_buffer_number_to_merge = 1;
+  options.max_write_buffer_number_to_maintain = 1;
+  options.max_write_buffer_number = 2;
+  options.write_buffer_size = 120 * 1024;
+  CreateAndReopenWithCF({"pikachu"}, options);
+  std::vector<std::thread> threads;
 
-    Put(0, "foo", "0v4");
-    Put(1, "foo", "1v4");
-    ASSERT_EQ("0v1", Get(0, "foo", s1));
-    ASSERT_EQ("1v1", Get(1, "foo", s1));
-    ASSERT_EQ("0v2", Get(0, "foo", s2));
-    ASSERT_EQ("1v2", Get(1, "foo", s2));
-    ASSERT_EQ("0v3", Get(0, "foo", s3));
-    ASSERT_EQ("1v3", Get(1, "foo", s3));
-    ASSERT_EQ("0v4", Get(0, "foo"));
-    ASSERT_EQ("1v4", Get(1, "foo"));
+  std::atomic<int> thread_num(0);
+  // Each column family will have 5 threads, each thread generating 2
+  // memtables, so each column family should end up with up to 10 table files.
+  std::function<void()> fill_memtable_func = [&]() {
+    int a = thread_num.fetch_add(1);
+    Random rnd(a);
+    WriteOptions wo;
+    // this should fill up 2 memtables
+    for (int k = 0; k < 5000; ++k) {
+      ASSERT_OK(db_->Put(wo, handles_[a & 1], RandomString(&rnd, 13), ""));
+    }
+  };
 
-    db_->ReleaseSnapshot(s3);
-    ASSERT_EQ(2U, GetNumSnapshots());
-    ASSERT_EQ(time_snap1, GetTimeOldestSnapshots());
-    ASSERT_EQ("0v1", Get(0, "foo", s1));
-    ASSERT_EQ("1v1", Get(1, "foo", s1));
-    ASSERT_EQ("0v2", Get(0, "foo", s2));
-    ASSERT_EQ("1v2", Get(1, "foo", s2));
-    ASSERT_EQ("0v4", Get(0, "foo"));
-    ASSERT_EQ("1v4", Get(1, "foo"));
+  for (int i = 0; i < 10; ++i) {
+    threads.emplace_back(fill_memtable_func);
+  }
 
-    db_->ReleaseSnapshot(s1);
-    ASSERT_EQ("0v2", Get(0, "foo", s2));
-    ASSERT_EQ("1v2", Get(1, "foo", s2));
-    ASSERT_EQ("0v4", Get(0, "foo"));
-    ASSERT_EQ("1v4", Get(1, "foo"));
-    ASSERT_EQ(1U, GetNumSnapshots());
-    ASSERT_LT(time_snap1, GetTimeOldestSnapshots());
+  for (auto& t : threads) {
+    t.join();
+  }
 
-    db_->ReleaseSnapshot(s2);
-    ASSERT_EQ(0U, GetNumSnapshots());
-    ASSERT_EQ("0v4", Get(0, "foo"));
-    ASSERT_EQ("1v4", Get(1, "foo"));
-  } while (ChangeOptions(kSkipHashCuckoo));
+  auto default_tables = GetNumberOfSstFilesForColumnFamily(db_, "default");
+  auto pikachu_tables = GetNumberOfSstFilesForColumnFamily(db_, "pikachu");
+  ASSERT_LE(default_tables, static_cast<uint64_t>(10));
+  ASSERT_GT(default_tables, static_cast<uint64_t>(0));
+  ASSERT_LE(pikachu_tables, static_cast<uint64_t>(10));
+  ASSERT_GT(pikachu_tables, static_cast<uint64_t>(0));
 }
 
-TEST_F(DBTest, HiddenValuesAreRemoved) {
-  anon::OptionsOverride options_override;
-  options_override.skip_policy = kSkipNoSnapshot;
+
+TEST_F(DBTest, ManifestRollOver) {
   do {
-    Options options = CurrentOptions(options_override);
-    options.max_background_flushes = 0;
+    Options options;
+    options.max_manifest_file_size = 10;  // 10 bytes
+    options = CurrentOptions(options);
     CreateAndReopenWithCF({"pikachu"}, options);
-    Random rnd(301);
-    FillLevels("a", "z", 1);
-
-    std::string big = RandomString(&rnd, 50000);
-    Put(1, "foo", big);
-    Put(1, "pastfoo", "v");
-    const Snapshot* snapshot = db_->GetSnapshot();
-    Put(1, "foo", "tiny");
-    Put(1, "pastfoo2", "v2");  // Advance sequence number one more
+    {
+      ASSERT_OK(Put(1, "manifest_key1", std::string(1000, '1')));
+      ASSERT_OK(Put(1, "manifest_key2", std::string(1000, '2')));
+      ASSERT_OK(Put(1, "manifest_key3", std::string(1000, '3')));
+      uint64_t manifest_before_flush = dbfull()->TEST_Current_Manifest_FileNo();
+      ASSERT_OK(Flush(1));  // This should trigger LogAndApply.
+      uint64_t manifest_after_flush = dbfull()->TEST_Current_Manifest_FileNo();
+      ASSERT_GT(manifest_after_flush, manifest_before_flush);
+      ReopenWithColumnFamilies({"default", "pikachu"}, options);
+      ASSERT_GT(dbfull()->TEST_Current_Manifest_FileNo(), manifest_after_flush);
+      // Check that the data is still readable after the manifest roll-over.
+      ASSERT_EQ(std::string(1000, '1'), Get(1, "manifest_key1"));
+      ASSERT_EQ(std::string(1000, '2'), Get(1, "manifest_key2"));
+      ASSERT_EQ(std::string(1000, '3'), Get(1, "manifest_key3"));
+    }
+  } while (ChangeCompactOptions());
+}
 
-    ASSERT_OK(Flush(1));
-    ASSERT_GT(NumTableFilesAtLevel(0, 1), 0);
+TEST_F(DBTest, IdentityAcrossRestarts) {
+  do {
+    std::string id1;
+    ASSERT_OK(db_->GetDbIdentity(id1));
 
-    ASSERT_EQ(big, Get(1, "foo", snapshot));
-    ASSERT_TRUE(Between(Size("", "pastfoo", 1), 50000, 60000));
-    db_->ReleaseSnapshot(snapshot);
-    ASSERT_EQ(AllEntriesFor("foo", 1), "[ tiny, " + big + " ]");
-    Slice x("x");
-    dbfull()->TEST_CompactRange(0, nullptr, &x, handles_[1]);
-    ASSERT_EQ(AllEntriesFor("foo", 1), "[ tiny ]");
-    ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
-    ASSERT_GE(NumTableFilesAtLevel(1, 1), 1);
-    dbfull()->TEST_CompactRange(1, nullptr, &x, handles_[1]);
-    ASSERT_EQ(AllEntriesFor("foo", 1), "[ tiny ]");
+    Options options = CurrentOptions();
+    Reopen(options);
+    std::string id2;
+    ASSERT_OK(db_->GetDbIdentity(id2));
+    // id1 should match id2 because identity was not regenerated
+    ASSERT_EQ(id1.compare(id2), 0);
 
-    ASSERT_TRUE(Between(Size("", "pastfoo", 1), 0, 1000));
-    // ApproximateOffsetOf() is not yet implemented in plain table format,
-    // which is used by Size().
-    // skip HashCuckooRep as it does not support snapshot
-  } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction |
-                         kSkipPlainTable | kSkipHashCuckoo));
+    std::string idfilename = IdentityFileName(dbname_);
+    ASSERT_OK(env_->DeleteFile(idfilename));
+    Reopen(options);
+    std::string id3;
+    ASSERT_OK(db_->GetDbIdentity(id3));
+    // id1 should NOT match id3 because identity was regenerated
+    ASSERT_NE(id1.compare(id3), 0);
+  } while (ChangeCompactOptions());
 }
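
// A small sketch of the GetDbIdentity() call the test above relies on: the
// identity is persisted in the IDENTITY file at the DB root, so it is stable
// across reopens but regenerated if that file is deleted. Assumes an open
// rocksdb::DB* db; the helper name is illustrative.
#include <string>
#include "rocksdb/db.h"

std::string DbIdentityOrEmpty(rocksdb::DB* db) {
  std::string id;
  rocksdb::Status s = db->GetDbIdentity(id);
  return s.ok() ? id : std::string();
}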
 
-TEST_F(DBTest, CompactBetweenSnapshots) {
-  anon::OptionsOverride options_override;
-  options_override.skip_policy = kSkipNoSnapshot;
+TEST_F(DBTest, RecoverWithLargeLog) {
   do {
-    Options options = CurrentOptions(options_override);
-    options.disable_auto_compactions = true;
-    CreateAndReopenWithCF({"pikachu"}, options);
-    Random rnd(301);
-    FillLevels("a", "z", 1);
+    {
+      Options options = CurrentOptions();
+      CreateAndReopenWithCF({"pikachu"}, options);
+      ASSERT_OK(Put(1, "big1", std::string(200000, '1')));
+      ASSERT_OK(Put(1, "big2", std::string(200000, '2')));
+      ASSERT_OK(Put(1, "small3", std::string(10, '3')));
+      ASSERT_OK(Put(1, "small4", std::string(10, '4')));
+      ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+    }
 
-    Put(1, "foo", "first");
-    const Snapshot* snapshot1 = db_->GetSnapshot();
-    Put(1, "foo", "second");
-    Put(1, "foo", "third");
-    Put(1, "foo", "fourth");
-    const Snapshot* snapshot2 = db_->GetSnapshot();
-    Put(1, "foo", "fifth");
-    Put(1, "foo", "sixth");
+    // Make sure that if we re-open with a small write buffer size, we flush
+    // table files in the middle of a large log file.
+    Options options;
+    options.write_buffer_size = 100000;
+    options = CurrentOptions(options);
+    ReopenWithColumnFamilies({"default", "pikachu"}, options);
+    ASSERT_EQ(NumTableFilesAtLevel(0, 1), 3);
+    ASSERT_EQ(std::string(200000, '1'), Get(1, "big1"));
+    ASSERT_EQ(std::string(200000, '2'), Get(1, "big2"));
+    ASSERT_EQ(std::string(10, '3'), Get(1, "small3"));
+    ASSERT_EQ(std::string(10, '4'), Get(1, "small4"));
+    ASSERT_GT(NumTableFilesAtLevel(0, 1), 1);
+  } while (ChangeCompactOptions());
+}
 
-    // All entries (including duplicates) exist
-    // before any compaction is triggered.
-    ASSERT_OK(Flush(1));
-    ASSERT_EQ("sixth", Get(1, "foo"));
-    ASSERT_EQ("fourth", Get(1, "foo", snapshot2));
-    ASSERT_EQ("first", Get(1, "foo", snapshot1));
-    ASSERT_EQ(AllEntriesFor("foo", 1),
-              "[ sixth, fifth, fourth, third, second, first ]");
+namespace {
+class KeepFilter : public CompactionFilter {
+ public:
+  virtual bool Filter(int level, const Slice& key, const Slice& value,
+                      std::string* new_value, bool* value_changed) const
+      override {
+    return false;
+  }
+
+  virtual const char* Name() const override { return "KeepFilter"; }
+};
 
-    // After a compaction, "second", "third" and "fifth" should
-    // be removed
-    FillLevels("a", "z", 1);
-    dbfull()->CompactRange(handles_[1], nullptr, nullptr);
-    ASSERT_EQ("sixth", Get(1, "foo"));
-    ASSERT_EQ("fourth", Get(1, "foo", snapshot2));
-    ASSERT_EQ("first", Get(1, "foo", snapshot1));
-    ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth, fourth, first ]");
+class KeepFilterFactory : public CompactionFilterFactory {
+ public:
+  explicit KeepFilterFactory(bool check_context = false)
+      : check_context_(check_context) {}
 
-    // after we release the snapshot1, only two values left
-    db_->ReleaseSnapshot(snapshot1);
-    FillLevels("a", "z", 1);
-    dbfull()->CompactRange(handles_[1], nullptr, nullptr);
+  virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& context) override {
+    if (check_context_) {
+      EXPECT_EQ(expect_full_compaction_.load(), context.is_full_compaction);
+      EXPECT_EQ(expect_manual_compaction_.load(), context.is_manual_compaction);
+    }
+    return std::unique_ptr<CompactionFilter>(new KeepFilter());
+  }
 
-    // We have only one valid snapshot snapshot2. Since snapshot1 is
-    // not valid anymore, "first" should be removed by a compaction.
-    ASSERT_EQ("sixth", Get(1, "foo"));
-    ASSERT_EQ("fourth", Get(1, "foo", snapshot2));
-    ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth, fourth ]");
+  virtual const char* Name() const override { return "KeepFilterFactory"; }
+  bool check_context_;
+  std::atomic_bool expect_full_compaction_;
+  std::atomic_bool expect_manual_compaction_;
+};
 
-    // after we release the snapshot2, only one value should be left
-    db_->ReleaseSnapshot(snapshot2);
-    FillLevels("a", "z", 1);
-    dbfull()->CompactRange(handles_[1], nullptr, nullptr);
-    ASSERT_EQ("sixth", Get(1, "foo"));
-    ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth ]");
-    // skip HashCuckooRep as it does not support snapshot
-  } while (ChangeOptions(kSkipHashCuckoo | kSkipFIFOCompaction));
-}
+class DelayFilter : public CompactionFilter {
+ public:
+  explicit DelayFilter(DBTestBase* d) : db_test(d) {}
+  virtual bool Filter(int level, const Slice& key, const Slice& value,
+                      std::string* new_value,
+                      bool* value_changed) const override {
+    db_test->env_->addon_time_.fetch_add(1000);
+    return true;
+  }
 
-TEST_F(DBTest, DeletionMarkers1) {
-  Options options = CurrentOptions();
-  options.max_background_flushes = 0;
-  CreateAndReopenWithCF({"pikachu"}, options);
-  Put(1, "foo", "v1");
-  ASSERT_OK(Flush(1));
-  const int last = CurrentOptions().max_mem_compaction_level;
-  // foo => v1 is now in last level
-  ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1);
+  virtual const char* Name() const override { return "DelayFilter"; }
 
-  // Place a table at level last-1 to prevent merging with preceding mutation
-  Put(1, "a", "begin");
-  Put(1, "z", "end");
-  Flush(1);
-  ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1);
-  ASSERT_EQ(NumTableFilesAtLevel(last - 1, 1), 1);
+ private:
+  DBTestBase* db_test;
+};
 
-  Delete(1, "foo");
-  Put(1, "foo", "v2");
-  ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, DEL, v1 ]");
-  ASSERT_OK(Flush(1));  // Moves to level last-2
-  if (CurrentOptions().purge_redundant_kvs_while_flush) {
-    ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, v1 ]");
-  } else {
-    ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, DEL, v1 ]");
+class DelayFilterFactory : public CompactionFilterFactory {
+ public:
+  explicit DelayFilterFactory(DBTestBase* d) : db_test(d) {}
+  virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& context) override {
+    return std::unique_ptr<CompactionFilter>(new DelayFilter(db_test));
   }
-  Slice z("z");
-  dbfull()->TEST_CompactRange(last - 2, nullptr, &z, handles_[1]);
-  // DEL eliminated, but v1 remains because we aren't compacting that level
-  // (DEL can be eliminated because v2 hides v1).
-  ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, v1 ]");
-  dbfull()->TEST_CompactRange(last - 1, nullptr, nullptr, handles_[1]);
-  // Merging last-1 w/ last, so we are the base level for "foo", so
-  // DEL is removed.  (as is v1).
-  ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2 ]");
-}
 
-TEST_F(DBTest, DeletionMarkers2) {
-  Options options = CurrentOptions();
-  options.max_background_flushes = 0;
-  CreateAndReopenWithCF({"pikachu"}, options);
-  Put(1, "foo", "v1");
-  ASSERT_OK(Flush(1));
-  const int last = CurrentOptions().max_mem_compaction_level;
-  // foo => v1 is now in last level
-  ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1);
+  virtual const char* Name() const override { return "DelayFilterFactory"; }
 
-  // Place a table at level last-1 to prevent merging with preceding mutation
-  Put(1, "a", "begin");
-  Put(1, "z", "end");
-  Flush(1);
-  ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1);
-  ASSERT_EQ(NumTableFilesAtLevel(last - 1, 1), 1);
+ private:
+  DBTestBase* db_test;
+};
+}  // namespace
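
// The filters above illustrate the CompactionFilter contract: Filter()
// returns false to keep an entry and true to drop it. A hedged example (not
// part of this patch) of a filter that drops keys carrying an assumed "tmp/"
// prefix during compaction:
#include <string>
#include "rocksdb/compaction_filter.h"
#include "rocksdb/slice.h"

class DropTmpFilter : public rocksdb::CompactionFilter {
 public:
  virtual bool Filter(int level, const rocksdb::Slice& key,
                      const rocksdb::Slice& value, std::string* new_value,
                      bool* value_changed) const override {
    // Returning true removes the entry from the compaction output.
    return key.starts_with("tmp/");
  }
  virtual const char* Name() const override { return "DropTmpFilter"; }
};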
 
-  Delete(1, "foo");
-  ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v1 ]");
-  ASSERT_OK(Flush(1));  // Moves to level last-2
-  ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v1 ]");
-  dbfull()->TEST_CompactRange(last - 2, nullptr, nullptr, handles_[1]);
-  // DEL kept: "last" file overlaps
-  ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v1 ]");
-  dbfull()->TEST_CompactRange(last - 1, nullptr, nullptr, handles_[1]);
-  // Merging last-1 w/ last, so we are the base level for "foo", so
-  // DEL is removed.  (as is v1).
-  ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]");
-}
+TEST_F(DBTest, CompressedCache) {
+  if (!Snappy_Supported()) {
+    return;
+  }
+  int num_iter = 80;
 
-TEST_F(DBTest, OverlapInLevel0) {
-  do {
-    Options options = CurrentOptions();
-    options.max_background_flushes = 0;
+  // Run this test four iterations.
+  // Iteration 1: only an uncompressed block cache
+  // Iteration 2: only a compressed block cache
+  // Iteration 3: both block cache and compressed cache
+  // Iteration 4: both block cache and compressed cache, but DB is not
+  // compressed
+  for (int iter = 0; iter < 4; iter++) {
+    Options options;
+    options.write_buffer_size = 64*1024;        // small write buffer
+    options.statistics = rocksdb::CreateDBStatistics();
+    options = CurrentOptions(options);
+
+    BlockBasedTableOptions table_options;
+    switch (iter) {
+      case 0:
+        // only uncompressed block cache
+        table_options.block_cache = NewLRUCache(8*1024);
+        table_options.block_cache_compressed = nullptr;
+        options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+        break;
+      case 1:
+        // no block cache, only compressed cache
+        table_options.no_block_cache = true;
+        table_options.block_cache = nullptr;
+        table_options.block_cache_compressed = NewLRUCache(8*1024);
+        options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+        break;
+      case 2:
+        // both compressed and uncompressed block cache
+        table_options.block_cache = NewLRUCache(1024);
+        table_options.block_cache_compressed = NewLRUCache(8*1024);
+        options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+        break;
+      case 3:
+        // both block cache and compressed cache, but DB is not compressed
+        // also, make block cache sizes bigger, to trigger block cache hits
+        table_options.block_cache = NewLRUCache(1024 * 1024);
+        table_options.block_cache_compressed = NewLRUCache(8 * 1024 * 1024);
+        options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+        options.compression = kNoCompression;
+        break;
+      default:
+        ASSERT_TRUE(false);
+    }
     CreateAndReopenWithCF({"pikachu"}, options);
-    int tmp = CurrentOptions().max_mem_compaction_level;
-    ASSERT_EQ(tmp, 2) << "Fix test to match config";
+    // The default column family doesn't have a block cache.
+    Options no_block_cache_opts;
+    no_block_cache_opts.statistics = options.statistics;
+    no_block_cache_opts = CurrentOptions(no_block_cache_opts);
+    BlockBasedTableOptions table_options_no_bc;
+    table_options_no_bc.no_block_cache = true;
+    no_block_cache_opts.table_factory.reset(
+        NewBlockBasedTableFactory(table_options_no_bc));
+    ReopenWithColumnFamilies({"default", "pikachu"},
+        std::vector<Options>({no_block_cache_opts, options}));
 
-    //Fill levels 1 and 2 to disable the pushing of new memtables to levels > 0.
-    ASSERT_OK(Put(1, "100", "v100"));
-    ASSERT_OK(Put(1, "999", "v999"));
-    Flush(1);
-    ASSERT_OK(Delete(1, "100"));
-    ASSERT_OK(Delete(1, "999"));
-    Flush(1);
-    ASSERT_EQ("0,1,1", FilesPerLevel(1));
+    Random rnd(301);
 
-    // Make files spanning the following ranges in level-0:
-    //  files[0]  200 .. 900
-    //  files[1]  300 .. 500
-    // Note that files are sorted by smallest key.
-    ASSERT_OK(Put(1, "300", "v300"));
-    ASSERT_OK(Put(1, "500", "v500"));
-    Flush(1);
-    ASSERT_OK(Put(1, "200", "v200"));
-    ASSERT_OK(Put(1, "600", "v600"));
-    ASSERT_OK(Put(1, "900", "v900"));
-    Flush(1);
-    ASSERT_EQ("2,1,1", FilesPerLevel(1));
+    // Write 80 values of ~1KB each (a fresh random string every 4th value)
+    ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+    std::vector<std::string> values;
+    std::string str;
+    for (int i = 0; i < num_iter; i++) {
+      if (i % 4 == 0) {        // high compression ratio
+        str = RandomString(&rnd, 1000);
+      }
+      values.push_back(str);
+      ASSERT_OK(Put(1, Key(i), values[i]));
+    }
 
-    // Compact away the placeholder files we created initially
-    dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]);
-    dbfull()->TEST_CompactRange(2, nullptr, nullptr, handles_[1]);
-    ASSERT_EQ("2", FilesPerLevel(1));
+    // Flush all data from the memtable so that reads come from the block cache
+    ASSERT_OK(Flush(1));
 
-    // Do a memtable compaction.  Before bug-fix, the compaction would
-    // not detect the overlap with level-0 files and would incorrectly place
-    // the deletion in a deeper level.
-    ASSERT_OK(Delete(1, "600"));
-    Flush(1);
-    ASSERT_EQ("3", FilesPerLevel(1));
-    ASSERT_EQ("NOT_FOUND", Get(1, "600"));
-  } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction));
+    for (int i = 0; i < num_iter; i++) {
+      ASSERT_EQ(Get(1, Key(i)), values[i]);
+    }
+
+    // check that we triggered the appropriate code paths in the cache
+    switch (iter) {
+      case 0:
+        // only uncompressed block cache
+        ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0);
+        ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0);
+        break;
+      case 1:
+        // no block cache, only compressed cache
+        ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0);
+        ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0);
+        break;
+      case 2:
+        // both compressed and uncompressed block cache
+        ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0);
+        ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0);
+        break;
+      case 3:
+        // both block cache and compressed cache, but DB is not compressed
+        ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0);
+        ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_HIT), 0);
+        ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0);
+        // the compressed cache gets no hits since blocks are not compressed
+        // on storage
+        ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_HIT), 0);
+        break;
+      default:
+        ASSERT_TRUE(false);
+    }
+
+    options.create_if_missing = true;
+    DestroyAndReopen(options);
+  }
 }
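
// A configuration sketch of running both caches side by side, as in
// iterations 3 and 4 above: an uncompressed block cache plus a compressed
// block cache on the same table factory. The sizes are illustrative, not a
// recommendation.
#include "rocksdb/cache.h"
#include "rocksdb/options.h"
#include "rocksdb/table.h"

rocksdb::Options WithBothCaches(rocksdb::Options options) {
  rocksdb::BlockBasedTableOptions table_options;
  table_options.block_cache = rocksdb::NewLRUCache(1024 * 1024);  // raw blocks
  table_options.block_cache_compressed =
      rocksdb::NewLRUCache(8 * 1024 * 1024);  // compressed blocks
  options.table_factory.reset(
      rocksdb::NewBlockBasedTableFactory(table_options));
  return options;
}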
 
-TEST_F(DBTest, L0_CompactionBug_Issue44_a) {
-  do {
-    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
-    ASSERT_OK(Put(1, "b", "v"));
-    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
-    ASSERT_OK(Delete(1, "b"));
-    ASSERT_OK(Delete(1, "a"));
-    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
-    ASSERT_OK(Delete(1, "a"));
-    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
-    ASSERT_OK(Put(1, "a", "v"));
-    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
-    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
-    ASSERT_EQ("(a->v)", Contents(1));
-    env_->SleepForMicroseconds(1000000);  // Wait for compaction to finish
-    ASSERT_EQ("(a->v)", Contents(1));
-  } while (ChangeCompactOptions());
+static std::string CompressibleString(Random* rnd, int len) {
+  std::string r;
+  test::CompressibleString(rnd, 0.8, len, &r);
+  return r;
 }
 
-TEST_F(DBTest, L0_CompactionBug_Issue44_b) {
-  do {
-    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
-    Put(1, "", "");
-    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
-    Delete(1, "e");
-    Put(1, "", "");
-    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
-    Put(1, "c", "cv");
-    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
-    Put(1, "", "");
-    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
-    Put(1, "", "");
-    env_->SleepForMicroseconds(1000000);  // Wait for compaction to finish
-    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
-    Put(1, "d", "dv");
-    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
-    Put(1, "", "");
-    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
-    Delete(1, "d");
-    Delete(1, "b");
-    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
-    ASSERT_EQ("(->)(c->cv)", Contents(1));
-    env_->SleepForMicroseconds(1000000);  // Wait for compaction to finish
-    ASSERT_EQ("(->)(c->cv)", Contents(1));
-  } while (ChangeCompactOptions());
+TEST_F(DBTest, FailMoreDbPaths) {
+  Options options = CurrentOptions();
+  options.db_paths.emplace_back(dbname_, 10000000);
+  options.db_paths.emplace_back(dbname_ + "_2", 1000000);
+  options.db_paths.emplace_back(dbname_ + "_3", 1000000);
+  options.db_paths.emplace_back(dbname_ + "_4", 1000000);
+  options.db_paths.emplace_back(dbname_ + "_5", 1000000);
+  ASSERT_TRUE(TryReopen(options).IsNotSupported());
 }
 
-TEST_F(DBTest, ComparatorCheck) {
-  class NewComparator : public Comparator {
-   public:
-    virtual const char* Name() const override {
-      return "rocksdb.NewComparator";
-    }
-    virtual int Compare(const Slice& a, const Slice& b) const override {
-      return BytewiseComparator()->Compare(a, b);
-    }
-    virtual void FindShortestSeparator(std::string* s,
-                                       const Slice& l) const override {
-      BytewiseComparator()->FindShortestSeparator(s, l);
-    }
-    virtual void FindShortSuccessor(std::string* key) const override {
-      BytewiseComparator()->FindShortSuccessor(key);
+void CheckColumnFamilyMeta(const ColumnFamilyMetaData& cf_meta) {
+  uint64_t cf_size = 0;
+  uint64_t cf_csize = 0;
+  size_t file_count = 0;
+  for (auto level_meta : cf_meta.levels) {
+    uint64_t level_size = 0;
+    uint64_t level_csize = 0;
+    file_count += level_meta.files.size();
+    for (auto file_meta : level_meta.files) {
+      level_size += file_meta.size;
     }
-  };
-  Options new_options, options;
-  NewComparator cmp;
-  do {
-    options = CurrentOptions();
-    CreateAndReopenWithCF({"pikachu"}, options);
-    new_options = CurrentOptions();
-    new_options.comparator = &cmp;
-    // only the non-default column family has non-matching comparator
-    Status s = TryReopenWithColumnFamilies({"default", "pikachu"},
-        std::vector<Options>({options, new_options}));
-    ASSERT_TRUE(!s.ok());
-    ASSERT_TRUE(s.ToString().find("comparator") != std::string::npos)
-        << s.ToString();
-  } while (ChangeCompactOptions());
+    ASSERT_EQ(level_meta.size, level_size);
+    cf_size += level_size;
+    cf_csize += level_csize;
+  }
+  ASSERT_EQ(cf_meta.file_count, file_count);
+  ASSERT_EQ(cf_meta.size, cf_size);
 }
 
-TEST_F(DBTest, CustomComparator) {
-  class NumberComparator : public Comparator {
-   public:
-    virtual const char* Name() const override {
-      return "test.NumberComparator";
-    }
-    virtual int Compare(const Slice& a, const Slice& b) const override {
-      return ToNumber(a) - ToNumber(b);
-    }
-    virtual void FindShortestSeparator(std::string* s,
-                                       const Slice& l) const override {
-      ToNumber(*s);     // Check format
-      ToNumber(l);      // Check format
-    }
-    virtual void FindShortSuccessor(std::string* key) const override {
-      ToNumber(*key);   // Check format
-    }
-   private:
-    static int ToNumber(const Slice& x) {
-      // Check that there are no extra characters.
-      EXPECT_TRUE(x.size() >= 2 && x[0] == '[' && x[x.size() - 1] == ']')
-          << EscapeString(x);
-      int val;
-      char ignored;
-      EXPECT_TRUE(sscanf(x.ToString().c_str(), "[%i]%c", &val, &ignored) == 1)
-          << EscapeString(x);
-      return val;
-    }
-  };
-  Options new_options;
-  NumberComparator cmp;
-  do {
-    new_options = CurrentOptions();
-    new_options.create_if_missing = true;
-    new_options.comparator = &cmp;
-    new_options.write_buffer_size = 1000;  // Compact more often
-    new_options = CurrentOptions(new_options);
-    DestroyAndReopen(new_options);
-    CreateAndReopenWithCF({"pikachu"}, new_options);
-    ASSERT_OK(Put(1, "[10]", "ten"));
-    ASSERT_OK(Put(1, "[0x14]", "twenty"));
-    for (int i = 0; i < 2; i++) {
-      ASSERT_EQ("ten", Get(1, "[10]"));
-      ASSERT_EQ("ten", Get(1, "[0xa]"));
-      ASSERT_EQ("twenty", Get(1, "[20]"));
-      ASSERT_EQ("twenty", Get(1, "[0x14]"));
-      ASSERT_EQ("NOT_FOUND", Get(1, "[15]"));
-      ASSERT_EQ("NOT_FOUND", Get(1, "[0xf]"));
-      Compact(1, "[0]", "[9999]");
-    }
+TEST_F(DBTest, ColumnFamilyMetaDataTest) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  DestroyAndReopen(options);
 
-    for (int run = 0; run < 2; run++) {
-      for (int i = 0; i < 1000; i++) {
-        char buf[100];
-        snprintf(buf, sizeof(buf), "[%d]", i*10);
-        ASSERT_OK(Put(1, buf, buf));
-      }
-      Compact(1, "[0]", "[1000000]");
-    }
-  } while (ChangeCompactOptions());
+  Random rnd(301);
+  int key_index = 0;
+  ColumnFamilyMetaData cf_meta;
+  for (int i = 0; i < 100; ++i) {
+    GenerateNewFile(&rnd, &key_index);
+    db_->GetColumnFamilyMetaData(&cf_meta);
+    CheckColumnFamilyMeta(cf_meta);
+  }
 }
 
-TEST_F(DBTest, ManualCompaction) {
-  Options options = CurrentOptions();
-  options.max_background_flushes = 0;
-  CreateAndReopenWithCF({"pikachu"}, options);
-  ASSERT_EQ(dbfull()->MaxMemCompactionLevel(), 2)
-      << "Need to update this test to match kMaxMemCompactLevel";
+namespace {
+void MinLevelHelper(DBTest* self, Options& options) {
+  Random rnd(301);
 
-  // iter - 0 with 7 levels
-  // iter - 1 with 3 levels
-  for (int iter = 0; iter < 2; ++iter) {
-    MakeTables(3, "p", "q", 1);
-    ASSERT_EQ("1,1,1", FilesPerLevel(1));
+  for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
+       num++) {
+    std::vector<std::string> values;
+    // Write 120KB (12 values, each 10K)
+    for (int i = 0; i < 12; i++) {
+      values.push_back(DBTestBase::RandomString(&rnd, 10000));
+      ASSERT_OK(self->Put(DBTestBase::Key(i), values[i]));
+    }
+    self->dbfull()->TEST_WaitForFlushMemTable();
+    ASSERT_EQ(self->NumTableFilesAtLevel(0), num + 1);
+  }
 
-    // Compaction range falls before files
-    Compact(1, "", "c");
-    ASSERT_EQ("1,1,1", FilesPerLevel(1));
+  // Generate one more file in level-0, which should trigger a level-0
+  // compaction.
+  std::vector<std::string> values;
+  for (int i = 0; i < 12; i++) {
+    values.push_back(DBTestBase::RandomString(&rnd, 10000));
+    ASSERT_OK(self->Put(DBTestBase::Key(i), values[i]));
+  }
+  self->dbfull()->TEST_WaitForCompact();
 
-    // Compaction range falls after files
-    Compact(1, "r", "z");
-    ASSERT_EQ("1,1,1", FilesPerLevel(1));
+  ASSERT_EQ(self->NumTableFilesAtLevel(0), 0);
+  ASSERT_EQ(self->NumTableFilesAtLevel(1), 1);
+}
 
-    // Compaction range overlaps files
-    Compact(1, "p1", "p9");
-    ASSERT_EQ("0,0,1", FilesPerLevel(1));
+// Returns false if the calling test should be skipped.
+bool MinLevelToCompress(CompressionType& type, Options& options, int wbits,
+                        int lev, int strategy) {
+  fprintf(stderr,
+          "Test with compression options: window_bits = %d, level = %d, "
+          "strategy = %d\n",
+          wbits, lev, strategy);
+  options.write_buffer_size = 100 << 10;  // 100KB
+  options.arena_block_size = 4096;
+  options.num_levels = 3;
+  options.level0_file_num_compaction_trigger = 3;
+  options.create_if_missing = true;
 
-    // Populate a different range
-    MakeTables(3, "c", "e", 1);
-    ASSERT_EQ("1,1,2", FilesPerLevel(1));
+  if (Snappy_Supported()) {
+    type = kSnappyCompression;
+    fprintf(stderr, "using snappy\n");
+  } else if (Zlib_Supported()) {
+    type = kZlibCompression;
+    fprintf(stderr, "using zlib\n");
+  } else if (BZip2_Supported()) {
+    type = kBZip2Compression;
+    fprintf(stderr, "using bzip2\n");
+  } else if (LZ4_Supported()) {
+    type = kLZ4Compression;
+    fprintf(stderr, "using lz4\n");
+  } else {
+    fprintf(stderr, "skipping test, compression disabled\n");
+    return false;
+  }
+  options.compression_per_level.resize(options.num_levels);
 
-    // Compact just the new range
-    Compact(1, "b", "f");
-    ASSERT_EQ("0,0,2", FilesPerLevel(1));
+  // do not compress L0
+  options.compression_per_level[0] = kNoCompression;
+  for (int i = 1; i < options.num_levels; i++) {
+    options.compression_per_level[i] = type;
+  }
+  return true;
+}
+}  // namespace
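
// A condensed sketch of the per-level compression setup built by
// MinLevelToCompress(): leave the first `min_level_to_compress` levels
// uncompressed and compress the rest. The helper and parameter names are
// illustrative assumptions, not upstream API.
#include "rocksdb/options.h"

void SetCompressionFloor(rocksdb::Options* options, int min_level_to_compress,
                         rocksdb::CompressionType type) {
  options->compression_per_level.resize(options->num_levels);
  for (int i = 0; i < options->num_levels; ++i) {
    options->compression_per_level[i] =
        (i < min_level_to_compress) ? rocksdb::kNoCompression : type;
  }
}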
 
-    // Compact all
-    MakeTables(1, "a", "z", 1);
-    ASSERT_EQ("0,1,2", FilesPerLevel(1));
-    db_->CompactRange(handles_[1], nullptr, nullptr);
-    ASSERT_EQ("0,0,1", FilesPerLevel(1));
+TEST_F(DBTest, MinLevelToCompress1) {
+  Options options = CurrentOptions();
+  CompressionType type = kSnappyCompression;
+  if (!MinLevelToCompress(type, options, -14, -1, 0)) {
+    return;
+  }
+  Reopen(options);
+  MinLevelHelper(this, options);
 
-    if (iter == 0) {
-      options = CurrentOptions();
-      options.max_background_flushes = 0;
-      options.num_levels = 3;
-      options.create_if_missing = true;
-      DestroyAndReopen(options);
-      CreateAndReopenWithCF({"pikachu"}, options);
-    }
+  // do not compress L0 and L1
+  for (int i = 0; i < 2; i++) {
+    options.compression_per_level[i] = kNoCompression;
   }
-
+  for (int i = 2; i < options.num_levels; i++) {
+    options.compression_per_level[i] = type;
+  }
+  DestroyAndReopen(options);
+  MinLevelHelper(this, options);
 }
 
-class DBTestUniversalManualCompactionOutputPathId
-    : public DBTestUniversalCompactionBase {};
-
-TEST_P(DBTestUniversalManualCompactionOutputPathId,
-       ManualCompactionOutputPathId) {
-  Options options = CurrentOptions();
-  options.create_if_missing = true;
-  options.db_paths.emplace_back(dbname_, 1000000000);
-  options.db_paths.emplace_back(dbname_ + "_2", 1000000000);
-  options.compaction_style = kCompactionStyleUniversal;
-  options.num_levels = num_levels_;
-  options.target_file_size_base = 1 << 30;  // Big size
-  options.level0_file_num_compaction_trigger = 10;
-  Destroy(options);
-  DestroyAndReopen(options);
-  CreateAndReopenWithCF({"pikachu"}, options);
-  MakeTables(3, "p", "q", 1);
-  dbfull()->TEST_WaitForCompact();
-  ASSERT_EQ(3, TotalLiveFiles(1));
-  ASSERT_EQ(3, GetSstFileCount(options.db_paths[0].path));
-  ASSERT_EQ(0, GetSstFileCount(options.db_paths[1].path));
-
-  // Full compaction to DB path 0
-  db_->CompactRange(handles_[1], nullptr, nullptr, false, -1, 1);
-  ASSERT_EQ(1, TotalLiveFiles(1));
-  ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path));
-  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
-
-  ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options);
-  ASSERT_EQ(1, TotalLiveFiles(1));
-  ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path));
-  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
-
-  MakeTables(1, "p", "q", 1);
-  ASSERT_EQ(2, TotalLiveFiles(1));
-  ASSERT_EQ(1, GetSstFileCount(options.db_paths[0].path));
-  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
-
-  ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options);
-  ASSERT_EQ(2, TotalLiveFiles(1));
-  ASSERT_EQ(1, GetSstFileCount(options.db_paths[0].path));
-  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
-
-  // Full compaction to DB path 0
-  db_->CompactRange(handles_[1], nullptr, nullptr, false, -1, 0);
-  ASSERT_EQ(1, TotalLiveFiles(1));
-  ASSERT_EQ(1, GetSstFileCount(options.db_paths[0].path));
-  ASSERT_EQ(0, GetSstFileCount(options.db_paths[1].path));
-
-  // Fail when compacting to an invalid path ID
-  ASSERT_TRUE(db_->CompactRange(handles_[1], nullptr, nullptr, false, -1, 2)
-                  .IsInvalidArgument());
-}
-
-INSTANTIATE_TEST_CASE_P(DBTestUniversalManualCompactionOutputPathId,
-                        DBTestUniversalManualCompactionOutputPathId,
-                        ::testing::Values(1, 8));
-
-TEST_F(DBTest, ManualLevelCompactionOutputPathId) {
+TEST_F(DBTest, MinLevelToCompress2) {
   Options options = CurrentOptions();
-  options.db_paths.emplace_back(dbname_ + "_2", 2 * 10485760);
-  options.db_paths.emplace_back(dbname_ + "_3", 100 * 10485760);
-  options.db_paths.emplace_back(dbname_ + "_4", 120 * 10485760);
-  options.max_background_flushes = 1;
-  CreateAndReopenWithCF({"pikachu"}, options);
-  ASSERT_EQ(dbfull()->MaxMemCompactionLevel(), 2)
-      << "Need to update this test to match kMaxMemCompactLevel";
-
-  // iter - 0 with 7 levels
-  // iter - 1 with 3 levels
-  for (int iter = 0; iter < 2; ++iter) {
-    MakeTables(3, "p", "q", 1);
-    ASSERT_EQ("3", FilesPerLevel(1));
-    ASSERT_EQ(3, GetSstFileCount(options.db_paths[0].path));
-    ASSERT_EQ(0, GetSstFileCount(dbname_));
-
-    // Compaction range falls before files
-    Compact(1, "", "c");
-    ASSERT_EQ("3", FilesPerLevel(1));
-
-    // Compaction range falls after files
-    Compact(1, "r", "z");
-    ASSERT_EQ("3", FilesPerLevel(1));
-
-    // Compaction range overlaps files
-    Compact(1, "p1", "p9", 1);
-    ASSERT_EQ("0,1", FilesPerLevel(1));
-    ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
-    ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path));
-    ASSERT_EQ(0, GetSstFileCount(dbname_));
+  CompressionType type = kSnappyCompression;
+  if (!MinLevelToCompress(type, options, 15, -1, 0)) {
+    return;
+  }
+  Reopen(options);
+  MinLevelHelper(this, options);
 
-    // Populate a different range
-    MakeTables(3, "c", "e", 1);
-    ASSERT_EQ("3,1", FilesPerLevel(1));
+  // do not compress L0 and L1
+  for (int i = 0; i < 2; i++) {
+    options.compression_per_level[i] = kNoCompression;
+  }
+  for (int i = 2; i < options.num_levels; i++) {
+    options.compression_per_level[i] = type;
+  }
+  DestroyAndReopen(options);
+  MinLevelHelper(this, options);
+}
 
-    // Compact just the new range
-    Compact(1, "b", "f", 1);
-    ASSERT_EQ("0,2", FilesPerLevel(1));
-    ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path));
-    ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path));
-    ASSERT_EQ(0, GetSstFileCount(dbname_));
+TEST_F(DBTest, RepeatedWritesToSameKey) {
+  do {
+    Options options;
+    options.env = env_;
+    options.write_buffer_size = 100000;  // Small write buffer
+    options = CurrentOptions(options);
+    CreateAndReopenWithCF({"pikachu"}, options);
 
-    // Compact all
-    MakeTables(1, "a", "z", 1);
-    ASSERT_EQ("1,2", FilesPerLevel(1));
-    ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path));
-    ASSERT_EQ(1, GetSstFileCount(options.db_paths[0].path));
-    db_->CompactRange(handles_[1], nullptr, nullptr, false, 1, 1);
-    ASSERT_EQ("0,1", FilesPerLevel(1));
-    ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
-    ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path));
-    ASSERT_EQ(0, GetSstFileCount(dbname_));
+    // We must have at most one file per level except for level-0,
+    // which may have up to kL0_StopWritesTrigger files.
+    const int kMaxFiles =
+        options.num_levels + options.level0_stop_writes_trigger;
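+    // For example, with num_levels = 7 and level0_stop_writes_trigger = 24,
+    // kMaxFiles would be 31; the exact bound depends on the option
+    // configuration being exercised.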
 
-    if (iter == 0) {
-      DestroyAndReopen(options);
-      options = CurrentOptions();
-      options.db_paths.emplace_back(dbname_ + "_2", 2 * 10485760);
-      options.db_paths.emplace_back(dbname_ + "_3", 100 * 10485760);
-      options.db_paths.emplace_back(dbname_ + "_4", 120 * 10485760);
-      options.max_background_flushes = 1;
-      options.num_levels = 3;
-      options.create_if_missing = true;
-      CreateAndReopenWithCF({"pikachu"}, options);
+    Random rnd(301);
+    std::string value =
+        RandomString(&rnd, static_cast<int>(2 * options.write_buffer_size));
+    for (int i = 0; i < 5 * kMaxFiles; i++) {
+      ASSERT_OK(Put(1, "key", value));
+      ASSERT_LE(TotalTableFiles(1), kMaxFiles);
     }
-  }
+  } while (ChangeCompactOptions());
 }
 
-TEST_F(DBTest, DBOpen_Options) {
-  Options options = CurrentOptions();
-  std::string dbname = test::TmpDir(env_) + "/db_options_test";
-  ASSERT_OK(DestroyDB(dbname, options));
-
-  // Does not exist, and create_if_missing == false: error
-  DB* db = nullptr;
-  options.create_if_missing = false;
-  Status s = DB::Open(options, dbname, &db);
-  ASSERT_TRUE(strstr(s.ToString().c_str(), "does not exist") != nullptr);
-  ASSERT_TRUE(db == nullptr);
+TEST_F(DBTest, SparseMerge) {
+  do {
+    Options options = CurrentOptions();
+    options.compression = kNoCompression;
+    CreateAndReopenWithCF({"pikachu"}, options);
 
-  // Does not exist, and create_if_missing == true: OK
-  options.create_if_missing = true;
-  s = DB::Open(options, dbname, &db);
-  ASSERT_OK(s);
-  ASSERT_TRUE(db != nullptr);
+    FillLevels("A", "Z", 1);
 
-  delete db;
-  db = nullptr;
+    // Suppose there is:
+    //    small amount of data with prefix A
+    //    large amount of data with prefix B
+    //    small amount of data with prefix C
+    // and that recent updates have made small changes to all three prefixes.
+    // Check that we do not do a compaction that merges all of B in one shot.
+    const std::string value(1000, 'x');
+    Put(1, "A", "va");
+    // Write approximately 100MB of "B" values
+    for (int i = 0; i < 100000; i++) {
+      char key[100];
+      snprintf(key, sizeof(key), "B%010d", i);
+      Put(1, key, value);
+    }
+    Put(1, "C", "vc");
+    ASSERT_OK(Flush(1));
+    dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
 
-  // Does exist, and error_if_exists == true: error
-  options.create_if_missing = false;
-  options.error_if_exists = true;
-  s = DB::Open(options, dbname, &db);
-  ASSERT_TRUE(strstr(s.ToString().c_str(), "exists") != nullptr);
-  ASSERT_TRUE(db == nullptr);
+    // Make sparse update
+    Put(1, "A", "va2");
+    Put(1, "B100", "bvalue2");
+    Put(1, "C", "vc2");
+    ASSERT_OK(Flush(1));
 
-  // Does exist, and error_if_exists == false: OK
-  options.create_if_missing = true;
-  options.error_if_exists = false;
-  s = DB::Open(options, dbname, &db);
-  ASSERT_OK(s);
-  ASSERT_TRUE(db != nullptr);
+    // Compactions should not cause us to create a situation where
+    // a file overlaps too much data at the next level.
+    ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(handles_[1]),
+              20 * 1048576);
+    dbfull()->TEST_CompactRange(0, nullptr, nullptr);
+    ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(handles_[1]),
+              20 * 1048576);
+    dbfull()->TEST_CompactRange(1, nullptr, nullptr);
+    ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(handles_[1]),
+              20 * 1048576);
+  } while (ChangeCompactOptions());
+}
 
-  delete db;
-  db = nullptr;
+static bool Between(uint64_t val, uint64_t low, uint64_t high) {
+  bool result = (val >= low) && (val <= high);
+  if (!result) {
+    fprintf(stderr, "Value %llu is not in range [%llu, %llu]\n",
+            (unsigned long long)(val),
+            (unsigned long long)(low),
+            (unsigned long long)(high));
+  }
+  return result;
 }
 
-TEST_F(DBTest, DBOpen_Change_NumLevels) {
-  Options options = CurrentOptions();
+TEST_F(DBTest, ApproximateSizesMemTable) {
+  Options options;
+  options.write_buffer_size = 100000000;  // Large write buffer
+  options.compression = kNoCompression;
   options.create_if_missing = true;
-  options.max_background_flushes = 0;
+  options = CurrentOptions(options);
   DestroyAndReopen(options);
-  ASSERT_TRUE(db_ != nullptr);
-  CreateAndReopenWithCF({"pikachu"}, options);
 
-  ASSERT_OK(Put(1, "a", "123"));
-  ASSERT_OK(Put(1, "b", "234"));
-  db_->CompactRange(handles_[1], nullptr, nullptr);
-  Close();
+  const int N = 128;
+  Random rnd(301);
+  for (int i = 0; i < N; i++) {
+    ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024)));
+  }
 
-  options.create_if_missing = false;
-  options.num_levels = 2;
-  Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, options);
-  ASSERT_TRUE(strstr(s.ToString().c_str(), "Invalid argument") != nullptr);
-  ASSERT_TRUE(db_ == nullptr);
-}
+  uint64_t size;
+  std::string start = Key(50);
+  std::string end = Key(60);
+  Range r(start, end);
+  db_->GetApproximateSizes(&r, 1, &size, true);
+  ASSERT_GT(size, 6000);
+  ASSERT_LT(size, 204800);
+  // Zero if not including mem table
+  db_->GetApproximateSizes(&r, 1, &size, false);
+  ASSERT_EQ(size, 0);
+
+  start = Key(500);
+  end = Key(600);
+  r = Range(start, end);
+  db_->GetApproximateSizes(&r, 1, &size, true);
+  ASSERT_EQ(size, 0);
+
+  for (int i = 0; i < N; i++) {
+    ASSERT_OK(Put(Key(1000 + i), RandomString(&rnd, 1024)));
+  }
+
+  start = Key(500);
+  end = Key(600);
+  r = Range(start, end);
+  db_->GetApproximateSizes(&r, 1, &size, true);
+  ASSERT_EQ(size, 0);
+
+  start = Key(100);
+  end = Key(1020);
+  r = Range(start, end);
+  db_->GetApproximateSizes(&r, 1, &size, true);
+  ASSERT_GT(size, 6000);
+
+  options.max_write_buffer_number = 8;
+  options.min_write_buffer_number_to_merge = 5;
+  options.write_buffer_size = 1024 * N;  // Not very large
+  DestroyAndReopen(options);
 
-TEST_F(DBTest, DestroyDBMetaDatabase) {
-  std::string dbname = test::TmpDir(env_) + "/db_meta";
-  ASSERT_OK(env_->CreateDirIfMissing(dbname));
-  std::string metadbname = MetaDatabaseName(dbname, 0);
-  ASSERT_OK(env_->CreateDirIfMissing(metadbname));
-  std::string metametadbname = MetaDatabaseName(metadbname, 0);
-  ASSERT_OK(env_->CreateDirIfMissing(metametadbname));
+  int keys[N * 3];
+  for (int i = 0; i < N; i++) {
+    keys[i * 3] = i * 5;
+    keys[i * 3 + 1] = i * 5 + 1;
+    keys[i * 3 + 2] = i * 5 + 2;
+  }
+  std::random_shuffle(std::begin(keys), std::end(keys));
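+  // Note: std::random_shuffle was deprecated in C++14 and removed in C++17;
+  // std::shuffle with an explicit random engine is the modern replacement.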
 
-  // Destroy previous versions if they exist. Using the long way.
-  Options options = CurrentOptions();
-  ASSERT_OK(DestroyDB(metametadbname, options));
-  ASSERT_OK(DestroyDB(metadbname, options));
-  ASSERT_OK(DestroyDB(dbname, options));
+  for (int i = 0; i < N * 3; i++) {
+    ASSERT_OK(Put(Key(keys[i] + 1000), RandomString(&rnd, 1024)));
+  }
+
+  start = Key(100);
+  end = Key(300);
+  r = Range(start, end);
+  db_->GetApproximateSizes(&r, 1, &size, true);
+  ASSERT_EQ(size, 0);
+
+  start = Key(1050);
+  end = Key(1080);
+  r = Range(start, end);
+  db_->GetApproximateSizes(&r, 1, &size, true);
+  ASSERT_GT(size, 6000);
+
+  start = Key(2100);
+  end = Key(2300);
+  r = Range(start, end);
+  db_->GetApproximateSizes(&r, 1, &size, true);
+  ASSERT_EQ(size, 0);
+
+  start = Key(1050);
+  end = Key(1080);
+  r = Range(start, end);
+  uint64_t size_with_mt, size_without_mt;
+  db_->GetApproximateSizes(&r, 1, &size_with_mt, true);
+  ASSERT_GT(size_with_mt, 6000);
+  db_->GetApproximateSizes(&r, 1, &size_without_mt, false);
+  ASSERT_EQ(size_without_mt, 0);
 
-  // Setup databases
-  DB* db = nullptr;
-  ASSERT_OK(DB::Open(options, dbname, &db));
-  delete db;
-  db = nullptr;
-  ASSERT_OK(DB::Open(options, metadbname, &db));
-  delete db;
-  db = nullptr;
-  ASSERT_OK(DB::Open(options, metametadbname, &db));
-  delete db;
-  db = nullptr;
+  Flush();
 
-  // Delete databases
-  ASSERT_OK(DestroyDB(dbname, options));
+  for (int i = 0; i < N; i++) {
+    ASSERT_OK(Put(Key(i + 1000), RandomString(&rnd, 1024)));
+  }
 
-  // Check if deletion worked.
-  options.create_if_missing = false;
-  ASSERT_TRUE(!(DB::Open(options, dbname, &db)).ok());
-  ASSERT_TRUE(!(DB::Open(options, metadbname, &db)).ok());
-  ASSERT_TRUE(!(DB::Open(options, metametadbname, &db)).ok());
+  start = Key(1050);
+  end = Key(1080);
+  r = Range(start, end);
+  db_->GetApproximateSizes(&r, 1, &size_with_mt, true);
+  db_->GetApproximateSizes(&r, 1, &size_without_mt, false);
+  ASSERT_GT(size_with_mt, size_without_mt);
+  ASSERT_GT(size_without_mt, 6000);
 }
 
-// Check that number of files does not grow when writes are dropped
-TEST_F(DBTest, DropWrites) {
+TEST_F(DBTest, ApproximateSizes) {
   do {
-    Options options = CurrentOptions();
-    options.env = env_;
-    options.paranoid_checks = false;
-    Reopen(options);
+    Options options;
+    options.write_buffer_size = 100000000;        // Large write buffer
+    options.compression = kNoCompression;
+    options.create_if_missing = true;
+    options = CurrentOptions(options);
+    DestroyAndReopen(options);
+    CreateAndReopenWithCF({"pikachu"}, options);
 
-    ASSERT_OK(Put("foo", "v1"));
-    ASSERT_EQ("v1", Get("foo"));
-    Compact("a", "z");
-    const size_t num_files = CountFiles();
-    // Force out-of-space errors
-    env_->drop_writes_.store(true, std::memory_order_release);
-    env_->sleep_counter_.Reset();
-    for (int i = 0; i < 5; i++) {
-      if (option_config_ != kUniversalCompactionMultiLevel) {
-        for (int level = 0; level < dbfull()->NumberLevels(); level++) {
-          if (level > 0 && level == dbfull()->NumberLevels() - 1) {
-            break;
-          }
-          dbfull()->TEST_CompactRange(level, nullptr, nullptr);
-        }
-      } else {
-        dbfull()->CompactRange(nullptr, nullptr);
-      }
+    ASSERT_TRUE(Between(Size("", "xyz", 1), 0, 0));
+    ReopenWithColumnFamilies({"default", "pikachu"}, options);
+    ASSERT_TRUE(Between(Size("", "xyz", 1), 0, 0));
+
+    // Write 8MB (80 values, each 100K)
+    ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+    const int N = 80;
+    static const int S1 = 100000;
+    static const int S2 = 105000;  // Allow some expansion from metadata
+    Random rnd(301);
+    for (int i = 0; i < N; i++) {
+      ASSERT_OK(Put(1, Key(i), RandomString(&rnd, S1)));
     }
 
-    std::string property_value;
-    ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value));
-    ASSERT_EQ("5", property_value);
+    // 0 because GetApproximateSizes() does not account for memtable space
+    ASSERT_TRUE(Between(Size("", Key(50), 1), 0, 0));
 
-    env_->drop_writes_.store(false, std::memory_order_release);
-    ASSERT_LT(CountFiles(), num_files + 3);
+    // Check sizes across recovery by reopening a few times
+    for (int run = 0; run < 3; run++) {
+      ReopenWithColumnFamilies({"default", "pikachu"}, options);
 
-    // Check that compaction attempts slept after errors
-    ASSERT_GE(env_->sleep_counter_.Read(), 5);
-  } while (ChangeCompactOptions());
+      for (int compact_start = 0; compact_start < N; compact_start += 10) {
+        for (int i = 0; i < N; i += 10) {
+          ASSERT_TRUE(Between(Size("", Key(i), 1), S1 * i, S2 * i));
+          ASSERT_TRUE(Between(Size("", Key(i) + ".suffix", 1), S1 * (i + 1),
+                              S2 * (i + 1)));
+          ASSERT_TRUE(Between(Size(Key(i), Key(i + 10), 1), S1 * 10, S2 * 10));
+        }
+        ASSERT_TRUE(Between(Size("", Key(50), 1), S1 * 50, S2 * 50));
+        ASSERT_TRUE(
+            Between(Size("", Key(50) + ".suffix", 1), S1 * 50, S2 * 50));
+
+        std::string cstart_str = Key(compact_start);
+        std::string cend_str = Key(compact_start + 9);
+        Slice cstart = cstart_str;
+        Slice cend = cend_str;
+        dbfull()->TEST_CompactRange(0, &cstart, &cend, handles_[1]);
+      }
+
+      ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+      ASSERT_GT(NumTableFilesAtLevel(1, 1), 0);
+    }
+    // ApproximateOffsetOf() is not yet implemented in plain table format.
+  } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction |
+                         kSkipPlainTable | kSkipHashIndex));
 }
 
-// Check background error counter bumped on flush failures.
-TEST_F(DBTest, DropWritesFlush) {
+TEST_F(DBTest, ApproximateSizes_MixOfSmallAndLarge) {
   do {
     Options options = CurrentOptions();
-    options.env = env_;
-    options.max_background_flushes = 1;
-    Reopen(options);
+    options.compression = kNoCompression;
+    CreateAndReopenWithCF({"pikachu"}, options);
 
-    ASSERT_OK(Put("foo", "v1"));
-    // Force out-of-space errors
-    env_->drop_writes_.store(true, std::memory_order_release);
+    Random rnd(301);
+    std::string big1 = RandomString(&rnd, 100000);
+    ASSERT_OK(Put(1, Key(0), RandomString(&rnd, 10000)));
+    ASSERT_OK(Put(1, Key(1), RandomString(&rnd, 10000)));
+    ASSERT_OK(Put(1, Key(2), big1));
+    ASSERT_OK(Put(1, Key(3), RandomString(&rnd, 10000)));
+    ASSERT_OK(Put(1, Key(4), big1));
+    ASSERT_OK(Put(1, Key(5), RandomString(&rnd, 10000)));
+    ASSERT_OK(Put(1, Key(6), RandomString(&rnd, 300000)));
+    ASSERT_OK(Put(1, Key(7), RandomString(&rnd, 10000)));
 
-    std::string property_value;
-    // Background error count is 0 now.
-    ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value));
-    ASSERT_EQ("0", property_value);
+    // Check sizes across recovery by reopening a few times
+    for (int run = 0; run < 3; run++) {
+      ReopenWithColumnFamilies({"default", "pikachu"}, options);
 
-    dbfull()->TEST_FlushMemTable(true);
+      ASSERT_TRUE(Between(Size("", Key(0), 1), 0, 0));
+      ASSERT_TRUE(Between(Size("", Key(1), 1), 10000, 11000));
+      ASSERT_TRUE(Between(Size("", Key(2), 1), 20000, 21000));
+      ASSERT_TRUE(Between(Size("", Key(3), 1), 120000, 121000));
+      ASSERT_TRUE(Between(Size("", Key(4), 1), 130000, 131000));
+      ASSERT_TRUE(Between(Size("", Key(5), 1), 230000, 231000));
+      ASSERT_TRUE(Between(Size("", Key(6), 1), 240000, 241000));
+      ASSERT_TRUE(Between(Size("", Key(7), 1), 540000, 541000));
+      ASSERT_TRUE(Between(Size("", Key(8), 1), 550000, 560000));
 
-    ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value));
-    ASSERT_EQ("1", property_value);
+      ASSERT_TRUE(Between(Size(Key(3), Key(5), 1), 110000, 111000));
 
-    env_->drop_writes_.store(false, std::memory_order_release);
-  } while (ChangeCompactOptions());
+      dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
+    }
+    // ApproximateOffsetOf() is not yet implemented in plain table format.
+  } while (ChangeOptions(kSkipPlainTable));
 }
 
-// Check that CompactRange() returns failure if there is not enough space left
-// on device
-TEST_F(DBTest, NoSpaceCompactRange) {
+TEST_F(DBTest, IteratorPinsRef) {
   do {
-    Options options = CurrentOptions();
-    options.env = env_;
-    options.disable_auto_compactions = true;
-    Reopen(options);
-
-    // generate 5 tables
-    for (int i = 0; i < 5; ++i) {
-      ASSERT_OK(Put(Key(i), Key(i) + "v"));
-      ASSERT_OK(Flush());
-    }
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    Put(1, "foo", "hello");
 
-    // Force out-of-space errors
-    env_->no_space_.store(true, std::memory_order_release);
+    // Get an iterator that will yield the current contents of the DB.
+    Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]);
 
-    Status s = db_->CompactRange(nullptr, nullptr);
-    ASSERT_TRUE(s.IsIOError());
+    // Write to force compactions
+    Put(1, "foo", "newvalue1");
+    for (int i = 0; i < 100; i++) {
+      // 100K values
+      ASSERT_OK(Put(1, Key(i), Key(i) + std::string(100000, 'v')));
+    }
+    Put(1, "foo", "newvalue2");
 
-    env_->no_space_.store(false, std::memory_order_release);
+    iter->SeekToFirst();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("foo", iter->key().ToString());
+    ASSERT_EQ("hello", iter->value().ToString());
+    iter->Next();
+    ASSERT_TRUE(!iter->Valid());
+    delete iter;
   } while (ChangeCompactOptions());
 }
 
-TEST_F(DBTest, NonWritableFileSystem) {
+TEST_F(DBTest, Snapshot) {
+  anon::OptionsOverride options_override;
+  options_override.skip_policy = kSkipNoSnapshot;
   do {
-    Options options = CurrentOptions();
-    options.write_buffer_size = 1000;
-    options.env = env_;
-    Reopen(options);
-    ASSERT_OK(Put("foo", "v1"));
-    env_->non_writeable_rate_.store(100);
-    std::string big(100000, 'x');
-    int errors = 0;
-    for (int i = 0; i < 20; i++) {
-      if (!Put("foo", big).ok()) {
-        errors++;
-        env_->SleepForMicroseconds(100000);
-      }
-    }
-    ASSERT_GT(errors, 0);
-    env_->non_writeable_rate_.store(0);
-  } while (ChangeCompactOptions());
-}
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions(options_override));
+    Put(0, "foo", "0v1");
+    Put(1, "foo", "1v1");
 
-TEST_F(DBTest, ManifestWriteError) {
-  // Test for the following problem:
-  // (a) Compaction produces file F
-  // (b) Log record containing F is written to MANIFEST file, but Sync() fails
-  // (c) GC deletes F
-  // (d) After reopening DB, reads fail since deleted F is named in log record
+    const Snapshot* s1 = db_->GetSnapshot();
+    ASSERT_EQ(1U, GetNumSnapshots());
+    uint64_t time_snap1 = GetTimeOldestSnapshots();
+    ASSERT_GT(time_snap1, 0U);
+    Put(0, "foo", "0v2");
+    Put(1, "foo", "1v2");
 
-  // We iterate twice.  In the second iteration, everything is the
-  // same except the log record never makes it to the MANIFEST file.
-  for (int iter = 0; iter < 2; iter++) {
-    std::atomic<bool>* error_type = (iter == 0)
-        ? &env_->manifest_sync_error_
-        : &env_->manifest_write_error_;
+    env_->addon_time_.fetch_add(1);
 
-    // Insert foo=>bar mapping
-    Options options = CurrentOptions();
-    options.env = env_;
-    options.create_if_missing = true;
-    options.error_if_exists = false;
-    options.max_background_flushes = 0;
-    DestroyAndReopen(options);
-    ASSERT_OK(Put("foo", "bar"));
-    ASSERT_EQ("bar", Get("foo"));
+    const Snapshot* s2 = db_->GetSnapshot();
+    ASSERT_EQ(2U, GetNumSnapshots());
+    ASSERT_EQ(time_snap1, GetTimeOldestSnapshots());
+    Put(0, "foo", "0v3");
+    Put(1, "foo", "1v3");
 
-    // Memtable compaction (will succeed)
-    Flush();
-    ASSERT_EQ("bar", Get("foo"));
-    const int last = dbfull()->MaxMemCompactionLevel();
-    ASSERT_EQ(NumTableFilesAtLevel(last), 1);   // foo=>bar is now in last level
+    {
+      ManagedSnapshot s3(db_);
+      ASSERT_EQ(3U, GetNumSnapshots());
+      ASSERT_EQ(time_snap1, GetTimeOldestSnapshots());
+
+      Put(0, "foo", "0v4");
+      Put(1, "foo", "1v4");
+      ASSERT_EQ("0v1", Get(0, "foo", s1));
+      ASSERT_EQ("1v1", Get(1, "foo", s1));
+      ASSERT_EQ("0v2", Get(0, "foo", s2));
+      ASSERT_EQ("1v2", Get(1, "foo", s2));
+      ASSERT_EQ("0v3", Get(0, "foo", s3.snapshot()));
+      ASSERT_EQ("1v3", Get(1, "foo", s3.snapshot()));
+      ASSERT_EQ("0v4", Get(0, "foo"));
+      ASSERT_EQ("1v4", Get(1, "foo"));
+    }
 
-    // Merging compaction (will fail)
-    error_type->store(true, std::memory_order_release);
-    dbfull()->TEST_CompactRange(last, nullptr, nullptr);  // Should fail
-    ASSERT_EQ("bar", Get("foo"));
+    ASSERT_EQ(2U, GetNumSnapshots());
+    ASSERT_EQ(time_snap1, GetTimeOldestSnapshots());
+    ASSERT_EQ("0v1", Get(0, "foo", s1));
+    ASSERT_EQ("1v1", Get(1, "foo", s1));
+    ASSERT_EQ("0v2", Get(0, "foo", s2));
+    ASSERT_EQ("1v2", Get(1, "foo", s2));
+    ASSERT_EQ("0v4", Get(0, "foo"));
+    ASSERT_EQ("1v4", Get(1, "foo"));
 
-    // Recovery: should not lose data
-    error_type->store(false, std::memory_order_release);
-    Reopen(options);
-    ASSERT_EQ("bar", Get("foo"));
-  }
-}
+    db_->ReleaseSnapshot(s1);
+    ASSERT_EQ("0v2", Get(0, "foo", s2));
+    ASSERT_EQ("1v2", Get(1, "foo", s2));
+    ASSERT_EQ("0v4", Get(0, "foo"));
+    ASSERT_EQ("1v4", Get(1, "foo"));
+    ASSERT_EQ(1U, GetNumSnapshots());
+    ASSERT_LT(time_snap1, GetTimeOldestSnapshots());
 
-TEST_F(DBTest, PutFailsParanoid) {
-  // Test the following:
-  // (a) A random put fails in paranoid mode (simulate by sync fail)
-  // (b) All other puts have to fail, even if writes would succeed
-  // (c) All of that should happen ONLY if paranoid_checks = true
+    db_->ReleaseSnapshot(s2);
+    ASSERT_EQ(0U, GetNumSnapshots());
+    ASSERT_EQ("0v4", Get(0, "foo"));
+    ASSERT_EQ("1v4", Get(1, "foo"));
+  } while (ChangeOptions(kSkipHashCuckoo));
+}
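+
+// A minimal sketch (editorial, not part of the test) of the RAII snapshot
+// pattern exercised above, assuming an open DB* named db:
+//   {
+//     ManagedSnapshot snap(db);    // GetSnapshot() in the constructor
+//     ReadOptions ro;
+//     ro.snapshot = snap.snapshot();
+//     // reads through ro now see the snapshotted state
+//   }                              // ReleaseSnapshot() in the destructor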
 
-  Options options = CurrentOptions();
-  options.env = env_;
-  options.create_if_missing = true;
-  options.error_if_exists = false;
-  options.paranoid_checks = true;
-  DestroyAndReopen(options);
-  CreateAndReopenWithCF({"pikachu"}, options);
-  Status s;
+TEST_F(DBTest, HiddenValuesAreRemoved) {
+  anon::OptionsOverride options_override;
+  options_override.skip_policy = kSkipNoSnapshot;
+  do {
+    Options options = CurrentOptions(options_override);
+    CreateAndReopenWithCF({"pikachu"}, options);
+    Random rnd(301);
+    FillLevels("a", "z", 1);
 
-  ASSERT_OK(Put(1, "foo", "bar"));
-  ASSERT_OK(Put(1, "foo1", "bar1"));
-  // simulate error
-  env_->log_write_error_.store(true, std::memory_order_release);
-  s = Put(1, "foo2", "bar2");
-  ASSERT_TRUE(!s.ok());
-  env_->log_write_error_.store(false, std::memory_order_release);
-  s = Put(1, "foo3", "bar3");
-  // the next put should fail, too
-  ASSERT_TRUE(!s.ok());
-  // but we're still able to read
-  ASSERT_EQ("bar", Get(1, "foo"));
+    std::string big = RandomString(&rnd, 50000);
+    Put(1, "foo", big);
+    Put(1, "pastfoo", "v");
+    const Snapshot* snapshot = db_->GetSnapshot();
+    Put(1, "foo", "tiny");
+    Put(1, "pastfoo2", "v2");  // Advance sequence number one more
 
-  // do the same thing with paranoid checks off
-  options.paranoid_checks = false;
-  DestroyAndReopen(options);
-  CreateAndReopenWithCF({"pikachu"}, options);
+    ASSERT_OK(Flush(1));
+    ASSERT_GT(NumTableFilesAtLevel(0, 1), 0);
 
-  ASSERT_OK(Put(1, "foo", "bar"));
-  ASSERT_OK(Put(1, "foo1", "bar1"));
-  // simulate error
-  env_->log_write_error_.store(true, std::memory_order_release);
-  s = Put(1, "foo2", "bar2");
-  ASSERT_TRUE(!s.ok());
-  env_->log_write_error_.store(false, std::memory_order_release);
-  s = Put(1, "foo3", "bar3");
-  // the next put should NOT fail
-  ASSERT_TRUE(s.ok());
-}
+    ASSERT_EQ(big, Get(1, "foo", snapshot));
+    ASSERT_TRUE(Between(Size("", "pastfoo", 1), 50000, 60000));
+    db_->ReleaseSnapshot(snapshot);
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ tiny, " + big + " ]");
+    Slice x("x");
+    dbfull()->TEST_CompactRange(0, nullptr, &x, handles_[1]);
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ tiny ]");
+    ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+    ASSERT_GE(NumTableFilesAtLevel(1, 1), 1);
+    dbfull()->TEST_CompactRange(1, nullptr, &x, handles_[1]);
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ tiny ]");
 
-TEST_F(DBTest, FilesDeletedAfterCompaction) {
-  do {
-    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
-    ASSERT_OK(Put(1, "foo", "v2"));
-    Compact(1, "a", "z");
-    const size_t num_files = CountLiveFiles();
-    for (int i = 0; i < 10; i++) {
-      ASSERT_OK(Put(1, "foo", "v2"));
-      Compact(1, "a", "z");
-    }
-    ASSERT_EQ(CountLiveFiles(), num_files);
-  } while (ChangeCompactOptions());
+    ASSERT_TRUE(Between(Size("", "pastfoo", 1), 0, 1000));
+    // ApproximateOffsetOf() is not yet implemented in plain table format,
+    // which is used by Size().
+    // skip HashCuckooRep as it does not support snapshot
+  } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction |
+                         kSkipPlainTable | kSkipHashCuckoo));
 }
 
-TEST_F(DBTest, BloomFilter) {
+TEST_F(DBTest, CompactBetweenSnapshots) {
+  anon::OptionsOverride options_override;
+  options_override.skip_policy = kSkipNoSnapshot;
   do {
-    Options options = CurrentOptions();
-    env_->count_random_reads_ = true;
-    options.env = env_;
-    // ChangeCompactOptions() only changes compaction style, which does not
-    // trigger reset of table_factory
-    BlockBasedTableOptions table_options;
-    table_options.no_block_cache = true;
-    table_options.filter_policy.reset(NewBloomFilterPolicy(10));
-    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
-
+    Options options = CurrentOptions(options_override);
+    options.disable_auto_compactions = true;
     CreateAndReopenWithCF({"pikachu"}, options);
+    Random rnd(301);
+    FillLevels("a", "z", 1);
 
-    // Populate multiple layers
-    const int N = 10000;
-    for (int i = 0; i < N; i++) {
-      ASSERT_OK(Put(1, Key(i), Key(i)));
-    }
-    Compact(1, "a", "z");
-    for (int i = 0; i < N; i += 100) {
-      ASSERT_OK(Put(1, Key(i), Key(i)));
-    }
-    Flush(1);
+    Put(1, "foo", "first");
+    const Snapshot* snapshot1 = db_->GetSnapshot();
+    Put(1, "foo", "second");
+    Put(1, "foo", "third");
+    Put(1, "foo", "fourth");
+    const Snapshot* snapshot2 = db_->GetSnapshot();
+    Put(1, "foo", "fifth");
+    Put(1, "foo", "sixth");
 
-    // Prevent auto compactions triggered by seeks
-    env_->delay_sstable_sync_.store(true, std::memory_order_release);
+    // All entries (including duplicates) exist
+    // before any compaction or flush is triggered.
+    ASSERT_EQ(AllEntriesFor("foo", 1),
+              "[ sixth, fifth, fourth, third, second, first ]");
+    ASSERT_EQ("sixth", Get(1, "foo"));
+    ASSERT_EQ("fourth", Get(1, "foo", snapshot2));
+    ASSERT_EQ("first", Get(1, "foo", snapshot1));
 
-    // Lookup present keys.  Should rarely read from small sstable.
-    env_->random_read_counter_.Reset();
-    for (int i = 0; i < N; i++) {
-      ASSERT_EQ(Key(i), Get(1, Key(i)));
-    }
-    int reads = env_->random_read_counter_.Read();
-    fprintf(stderr, "%d present => %d reads\n", N, reads);
-    ASSERT_GE(reads, N);
-    ASSERT_LE(reads, N + 2*N/100);
+    // After a flush, "second", "third" and "fifth" should
+    // be removed
+    ASSERT_OK(Flush(1));
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth, fourth, first ]");
 
-    // Lookup missing keys.  Should rarely read from either sstable.
-    env_->random_read_counter_.Reset();
-    for (int i = 0; i < N; i++) {
-      ASSERT_EQ("NOT_FOUND", Get(1, Key(i) + ".missing"));
-    }
-    reads = env_->random_read_counter_.Read();
-    fprintf(stderr, "%d missing => %d reads\n", N, reads);
-    ASSERT_LE(reads, 3*N/100);
+    // After we release snapshot1, only two values are left.
+    db_->ReleaseSnapshot(snapshot1);
+    FillLevels("a", "z", 1);
+    dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr,
+                           nullptr);
 
-    env_->delay_sstable_sync_.store(false, std::memory_order_release);
-    Close();
-  } while (ChangeCompactOptions());
+    // We have only one valid snapshot snapshot2. Since snapshot1 is
+    // not valid anymore, "first" should be removed by a compaction.
+    ASSERT_EQ("sixth", Get(1, "foo"));
+    ASSERT_EQ("fourth", Get(1, "foo", snapshot2));
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth, fourth ]");
+
+    // After we release snapshot2, only one value should be left.
+    db_->ReleaseSnapshot(snapshot2);
+    FillLevels("a", "z", 1);
+    dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr,
+                           nullptr);
+    ASSERT_EQ("sixth", Get(1, "foo"));
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth ]");
+    // skip HashCuckooRep as it does not support snapshot
+  } while (ChangeOptions(kSkipHashCuckoo | kSkipFIFOCompaction));
 }
 
-TEST_F(DBTest, BloomFilterRate) {
-  while (ChangeFilterOptions()) {
-    Options options = CurrentOptions();
-    options.statistics = rocksdb::CreateDBStatistics();
+TEST_F(DBTest, UnremovableSingleDelete) {
+  // If we compact:
+  //
+  // Put(A, v1) Snapshot SingleDelete(A) Put(A, v2)
+  //
+  // We do not want to end up with:
+  //
+  // Put(A, v1) Snapshot Put(A, v2)
+  //
+  // Because a subsequent SingleDelete(A) would delete the Put(A, v2)
+  // but not Put(A, v1), so Get(A) would return v1.
+  anon::OptionsOverride options_override;
+  options_override.skip_policy = kSkipNoSnapshot;
+  do {
+    Options options = CurrentOptions(options_override);
+    options.disable_auto_compactions = true;
     CreateAndReopenWithCF({"pikachu"}, options);
 
-    const int maxKey = 10000;
-    for (int i = 0; i < maxKey; i++) {
-      ASSERT_OK(Put(1, Key(i), Key(i)));
-    }
-    // Add a large key to make the file contain a wide range
-    ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555)));
-    Flush(1);
-
-    // Check if they can be found
-    for (int i = 0; i < maxKey; i++) {
-      ASSERT_EQ(Key(i), Get(1, Key(i)));
-    }
-    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+    Put(1, "foo", "first");
+    const Snapshot* snapshot = db_->GetSnapshot();
+    SingleDelete(1, "foo");
+    Put(1, "foo", "second");
+    ASSERT_OK(Flush(1));
 
-    // Check if filter is useful
-    for (int i = 0; i < maxKey; i++) {
-      ASSERT_EQ("NOT_FOUND", Get(1, Key(i+33333)));
-    }
-    ASSERT_GE(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), maxKey*0.98);
-  }
-}
+    ASSERT_EQ("first", Get(1, "foo", snapshot));
+    ASSERT_EQ("second", Get(1, "foo"));
 
-TEST_F(DBTest, BloomFilterCompatibility) {
-  Options options = CurrentOptions();
-  options.statistics = rocksdb::CreateDBStatistics();
-  BlockBasedTableOptions table_options;
-  table_options.filter_policy.reset(NewBloomFilterPolicy(10, true));
-  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+    dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr,
+                           nullptr);
+    ASSERT_EQ("[ second, SDEL, first ]", AllEntriesFor("foo", 1));
 
-  // Create with block based filter
-  CreateAndReopenWithCF({"pikachu"}, options);
+    SingleDelete(1, "foo");
 
-  const int maxKey = 10000;
-  for (int i = 0; i < maxKey; i++) {
-    ASSERT_OK(Put(1, Key(i), Key(i)));
-  }
-  ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555)));
-  Flush(1);
+    ASSERT_EQ("first", Get(1, "foo", snapshot));
+    ASSERT_EQ("NOT_FOUND", Get(1, "foo"));
 
-  // Check db with full filter
-  table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
-  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
-  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+    dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr,
+                           nullptr);
 
-  // Check if they can be found
-  for (int i = 0; i < maxKey; i++) {
-    ASSERT_EQ(Key(i), Get(1, Key(i)));
-  }
-  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+    ASSERT_EQ("first", Get(1, "foo", snapshot));
+    ASSERT_EQ("NOT_FOUND", Get(1, "foo"));
+    db_->ReleaseSnapshot(snapshot);
+    // Skip HashCuckooRep as it does not support single delete.  FIFO and
+    // universal compaction do not apply to the test case.  Skip MergePut
+    // because single delete does not get removed when it encounters a merge.
+  } while (ChangeOptions(kSkipHashCuckoo | kSkipFIFOCompaction |
+                         kSkipUniversalCompaction | kSkipMergePut));
 }
 
-TEST_F(DBTest, BloomFilterReverseCompatibility) {
+TEST_F(DBTest, DeletionMarkers1) {
   Options options = CurrentOptions();
-  options.statistics = rocksdb::CreateDBStatistics();
-  BlockBasedTableOptions table_options;
-  table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
-  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
-
-  // Create with full filter
+  options.max_background_flushes = 0;
   CreateAndReopenWithCF({"pikachu"}, options);
+  Put(1, "foo", "v1");
+  ASSERT_OK(Flush(1));
+  const int last = 2;
+  MoveFilesToLevel(last, 1);
+  // foo => v1 is now in last level
+  ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1);
 
-  const int maxKey = 10000;
-  for (int i = 0; i < maxKey; i++) {
-    ASSERT_OK(Put(1, Key(i), Key(i)));
-  }
-  ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555)));
+  // Place a table at level last-1 to prevent merging with preceding mutation
+  Put(1, "a", "begin");
+  Put(1, "z", "end");
   Flush(1);
+  MoveFilesToLevel(last - 1, 1);
+  ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1);
+  ASSERT_EQ(NumTableFilesAtLevel(last - 1, 1), 1);
 
-  // Check db with block_based filter
-  table_options.filter_policy.reset(NewBloomFilterPolicy(10, true));
-  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
-  ReopenWithColumnFamilies({"default", "pikachu"}, options);
-
-  // Check if they can be found
-  for (int i = 0; i < maxKey; i++) {
-    ASSERT_EQ(Key(i), Get(1, Key(i)));
-  }
-  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+  Delete(1, "foo");
+  Put(1, "foo", "v2");
+  ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, DEL, v1 ]");
+  ASSERT_OK(Flush(1));  // Moves to level last-2
+  ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, v1 ]");
+  Slice z("z");
+  dbfull()->TEST_CompactRange(last - 2, nullptr, &z, handles_[1]);
+  // DEL eliminated, but v1 remains because we aren't compacting that level
+  // (DEL can be eliminated because v2 hides v1).
+  ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, v1 ]");
+  dbfull()->TEST_CompactRange(last - 1, nullptr, nullptr, handles_[1]);
+  // Merging last-1 with last makes this the base level for "foo", so the
+  // DEL marker is removed (as is v1).
+  ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2 ]");
 }
 
-namespace {
-// A wrapped bloom over default FilterPolicy
-class WrappedBloom : public FilterPolicy {
- public:
-  explicit WrappedBloom(int bits_per_key) :
-        filter_(NewBloomFilterPolicy(bits_per_key)),
-        counter_(0) {}
-
-  ~WrappedBloom() { delete filter_; }
-
-  const char* Name() const override { return "WrappedRocksDbFilterPolicy"; }
-
-  void CreateFilter(const rocksdb::Slice* keys, int n, std::string* dst)
-      const override {
-    std::unique_ptr<rocksdb::Slice[]> user_keys(new rocksdb::Slice[n]);
-    for (int i = 0; i < n; ++i) {
-      user_keys[i] = convertKey(keys[i]);
-    }
-    return filter_->CreateFilter(user_keys.get(), n, dst);
-  }
-
-  bool KeyMayMatch(const rocksdb::Slice& key, const rocksdb::Slice& filter)
-      const override {
-    counter_++;
-    return filter_->KeyMayMatch(convertKey(key), filter);
-  }
-
-  uint32_t GetCounter() { return counter_; }
-
- private:
-  const FilterPolicy* filter_;
-  mutable uint32_t counter_;
+TEST_F(DBTest, DeletionMarkers2) {
+  Options options = CurrentOptions();
+  CreateAndReopenWithCF({"pikachu"}, options);
+  Put(1, "foo", "v1");
+  ASSERT_OK(Flush(1));
+  const int last = 2;
+  MoveFilesToLevel(last, 1);
+  // foo => v1 is now in last level
+  ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1);
 
-  rocksdb::Slice convertKey(const rocksdb::Slice& key) const {
-    return key;
-  }
-};
-}  // namespace
+  // Place a table at level last-1 to prevent merging with preceding mutation
+  Put(1, "a", "begin");
+  Put(1, "z", "end");
+  Flush(1);
+  MoveFilesToLevel(last - 1, 1);
+  ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1);
+  ASSERT_EQ(NumTableFilesAtLevel(last - 1, 1), 1);
 
-TEST_F(DBTest, BloomFilterWrapper) {
-  Options options = CurrentOptions();
-  options.statistics = rocksdb::CreateDBStatistics();
+  Delete(1, "foo");
+  ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v1 ]");
+  ASSERT_OK(Flush(1));  // Moves to level last-2
+  ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v1 ]");
+  dbfull()->TEST_CompactRange(last - 2, nullptr, nullptr, handles_[1]);
+  // DEL kept: "last" file overlaps
+  ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v1 ]");
+  dbfull()->TEST_CompactRange(last - 1, nullptr, nullptr, handles_[1]);
+  // Merging last-1 with last makes this the base level for "foo", so the
+  // DEL marker is removed (as is v1).
+  ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]");
+}
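+
+// Note (editorial): both DeletionMarkers tests illustrate the general rule
+// that a DEL marker may only be dropped when it is shadowed by a newer value
+// or when the compaction output is at the bottommost level containing the
+// key; otherwise an older version could resurface from a deeper level.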
 
-  BlockBasedTableOptions table_options;
-  WrappedBloom* policy = new WrappedBloom(10);
-  table_options.filter_policy.reset(policy);
-  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+TEST_F(DBTest, OverlapInLevel0) {
+  do {
+    Options options = CurrentOptions();
+    CreateAndReopenWithCF({"pikachu"}, options);
 
-  CreateAndReopenWithCF({"pikachu"}, options);
+    // Fill levels 1 and 2 to disable the pushing of new memtables to
+    // levels > 0.
+    ASSERT_OK(Put(1, "100", "v100"));
+    ASSERT_OK(Put(1, "999", "v999"));
+    Flush(1);
+    MoveFilesToLevel(2, 1);
+    ASSERT_OK(Delete(1, "100"));
+    ASSERT_OK(Delete(1, "999"));
+    Flush(1);
+    MoveFilesToLevel(1, 1);
+    ASSERT_EQ("0,1,1", FilesPerLevel(1));
 
-  const int maxKey = 10000;
-  for (int i = 0; i < maxKey; i++) {
-    ASSERT_OK(Put(1, Key(i), Key(i)));
-  }
-  // Add a large key to make the file contain a wide range
-  ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555)));
-  ASSERT_EQ(0U, policy->GetCounter());
-  Flush(1);
+    // Make files spanning the following ranges in level-0:
+    //  files[0]  200 .. 900
+    //  files[1]  300 .. 500
+    // Note that files are sorted by smallest key.
+    ASSERT_OK(Put(1, "300", "v300"));
+    ASSERT_OK(Put(1, "500", "v500"));
+    Flush(1);
+    ASSERT_OK(Put(1, "200", "v200"));
+    ASSERT_OK(Put(1, "600", "v600"));
+    ASSERT_OK(Put(1, "900", "v900"));
+    Flush(1);
+    ASSERT_EQ("2,1,1", FilesPerLevel(1));
 
-  // Check if they can be found
-  for (int i = 0; i < maxKey; i++) {
-    ASSERT_EQ(Key(i), Get(1, Key(i)));
-  }
-  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
-  ASSERT_EQ(1U * maxKey, policy->GetCounter());
+    // Compact away the placeholder files we created initially
+    dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]);
+    dbfull()->TEST_CompactRange(2, nullptr, nullptr, handles_[1]);
+    ASSERT_EQ("2", FilesPerLevel(1));
 
-  // Check if filter is useful
-  for (int i = 0; i < maxKey; i++) {
-    ASSERT_EQ("NOT_FOUND", Get(1, Key(i+33333)));
-  }
-  ASSERT_GE(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), maxKey*0.98);
-  ASSERT_EQ(2U * maxKey, policy->GetCounter());
+    // Do a memtable compaction.  Before the bug fix, the compaction would
+    // not detect the overlap with level-0 files and would incorrectly place
+    // the deletion in a deeper level.
+    ASSERT_OK(Delete(1, "600"));
+    Flush(1);
+    ASSERT_EQ("3", FilesPerLevel(1));
+    ASSERT_EQ("NOT_FOUND", Get(1, "600"));
+  } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction));
 }
 
-TEST_F(DBTest, SnapshotFiles) {
+TEST_F(DBTest, ComparatorCheck) {
+  class NewComparator : public Comparator {
+   public:
+    virtual const char* Name() const override {
+      return "rocksdb.NewComparator";
+    }
+    virtual int Compare(const Slice& a, const Slice& b) const override {
+      return BytewiseComparator()->Compare(a, b);
+    }
+    virtual void FindShortestSeparator(std::string* s,
+                                       const Slice& l) const override {
+      BytewiseComparator()->FindShortestSeparator(s, l);
+    }
+    virtual void FindShortSuccessor(std::string* key) const override {
+      BytewiseComparator()->FindShortSuccessor(key);
+    }
+  };
+  Options new_options, options;
+  NewComparator cmp;
   do {
-    Options options = CurrentOptions();
-    options.write_buffer_size = 100000000;        // Large write buffer
+    options = CurrentOptions();
     CreateAndReopenWithCF({"pikachu"}, options);
+    new_options = CurrentOptions();
+    new_options.comparator = &cmp;
+    // only the non-default column family has a non-matching comparator
+    Status s = TryReopenWithColumnFamilies({"default", "pikachu"},
+        std::vector<Options>({options, new_options}));
+    ASSERT_TRUE(!s.ok());
+    ASSERT_TRUE(s.ToString().find("comparator") != std::string::npos)
+        << s.ToString();
+  } while (ChangeCompactOptions());
+}
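+
+// Note (editorial): RocksDB records the comparator's Name() with the
+// database and refuses to reopen it with a comparator whose name differs,
+// which is exactly the failure this test asserts on.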
 
-    Random rnd(301);
+TEST_F(DBTest, CustomComparator) {
+  class NumberComparator : public Comparator {
+   public:
+    virtual const char* Name() const override {
+      return "test.NumberComparator";
+    }
+    virtual int Compare(const Slice& a, const Slice& b) const override {
+      return ToNumber(a) - ToNumber(b);
+    }
+    virtual void FindShortestSeparator(std::string* s,
+                                       const Slice& l) const override {
+      ToNumber(*s);     // Check format
+      ToNumber(l);      // Check format
+    }
+    virtual void FindShortSuccessor(std::string* key) const override {
+      ToNumber(*key);   // Check format
+    }
+   private:
+    static int ToNumber(const Slice& x) {
+      // Check that there are no extra characters.
+      EXPECT_TRUE(x.size() >= 2 && x[0] == '[' && x[x.size() - 1] == ']')
+          << EscapeString(x);
+      int val;
+      char ignored;
+      EXPECT_TRUE(sscanf(x.ToString().c_str(), "[%i]%c", &val, &ignored) == 1)
+          << EscapeString(x);
+      return val;
+    }
+  };
+  Options new_options;
+  NumberComparator cmp;
+  do {
+    new_options = CurrentOptions();
+    new_options.create_if_missing = true;
+    new_options.comparator = &cmp;
+    new_options.write_buffer_size = 4096;  // Compact more often
+    new_options.arena_block_size = 4096;
+    new_options = CurrentOptions(new_options);
+    DestroyAndReopen(new_options);
+    CreateAndReopenWithCF({"pikachu"}, new_options);
+    ASSERT_OK(Put(1, "[10]", "ten"));
+    ASSERT_OK(Put(1, "[0x14]", "twenty"));
+    for (int i = 0; i < 2; i++) {
+      ASSERT_EQ("ten", Get(1, "[10]"));
+      ASSERT_EQ("ten", Get(1, "[0xa]"));
+      ASSERT_EQ("twenty", Get(1, "[20]"));
+      ASSERT_EQ("twenty", Get(1, "[0x14]"));
+      ASSERT_EQ("NOT_FOUND", Get(1, "[15]"));
+      ASSERT_EQ("NOT_FOUND", Get(1, "[0xf]"));
+      Compact(1, "[0]", "[9999]");
+    }
 
-    // Write 8MB (80 values, each 100K)
-    ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
-    std::vector<std::string> values;
-    for (int i = 0; i < 80; i++) {
-      values.push_back(RandomString(&rnd, 100000));
-      ASSERT_OK(Put((i < 40), Key(i), values[i]));
+    for (int run = 0; run < 2; run++) {
+      for (int i = 0; i < 1000; i++) {
+        char buf[100];
+        snprintf(buf, sizeof(buf), "[%d]", i*10);
+        ASSERT_OK(Put(1, buf, buf));
+      }
+      Compact(1, "[0]", "[1000000]");
     }
+  } while (ChangeCompactOptions());
+}
 
-    // assert that nothing makes it to disk yet.
-    ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+TEST_F(DBTest, DBOpen_Options) {
+  Options options = CurrentOptions();
+  std::string dbname = test::TmpDir(env_) + "/db_options_test";
+  ASSERT_OK(DestroyDB(dbname, options));
 
-    // get a file snapshot
-    uint64_t manifest_number = 0;
-    uint64_t manifest_size = 0;
-    std::vector<std::string> files;
-    dbfull()->DisableFileDeletions();
-    dbfull()->GetLiveFiles(files, &manifest_size);
+  // Does not exist, and create_if_missing == false: error
+  DB* db = nullptr;
+  options.create_if_missing = false;
+  Status s = DB::Open(options, dbname, &db);
+  ASSERT_TRUE(strstr(s.ToString().c_str(), "does not exist") != nullptr);
+  ASSERT_TRUE(db == nullptr);
 
-    // CURRENT, MANIFEST, *.sst files (one for each CF)
-    ASSERT_EQ(files.size(), 4U);
+  // Does not exist, and create_if_missing == true: OK
+  options.create_if_missing = true;
+  s = DB::Open(options, dbname, &db);
+  ASSERT_OK(s);
+  ASSERT_TRUE(db != nullptr);
 
-    uint64_t number = 0;
-    FileType type;
+  delete db;
+  db = nullptr;
 
-    // copy these files to a new snapshot directory
-    std::string snapdir = dbname_ + ".snapdir/";
-    ASSERT_OK(env_->CreateDirIfMissing(snapdir));
+  // Does exist, and error_if_exists == true: error
+  options.create_if_missing = false;
+  options.error_if_exists = true;
+  s = DB::Open(options, dbname, &db);
+  ASSERT_TRUE(strstr(s.ToString().c_str(), "exists") != nullptr);
+  ASSERT_TRUE(db == nullptr);
+
+  // Does exist, and error_if_exists == false: OK
+  options.create_if_missing = true;
+  options.error_if_exists = false;
+  s = DB::Open(options, dbname, &db);
+  ASSERT_OK(s);
+  ASSERT_TRUE(db != nullptr);
+
+  delete db;
+  db = nullptr;
+}
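+
+// The usual open idiom exercised above, as a sketch (error handling elided):
+//   DB* db = nullptr;
+//   Options opts;
+//   opts.create_if_missing = true;
+//   Status s = DB::Open(opts, "/path/to/db", &db);
+//   if (s.ok()) { /* use db */ delete db; }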
+
+TEST_F(DBTest, DBOpen_Change_NumLevels) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  DestroyAndReopen(options);
+  ASSERT_TRUE(db_ != nullptr);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  ASSERT_OK(Put(1, "a", "123"));
+  ASSERT_OK(Put(1, "b", "234"));
+  Flush(1);
+  MoveFilesToLevel(3, 1);
+  Close();
 
-    for (unsigned int i = 0; i < files.size(); i++) {
-      // our clients require that GetLiveFiles returns
-      // files with "/" as first character!
-      ASSERT_EQ(files[i][0], '/');
-      std::string src = dbname_ + files[i];
-      std::string dest = snapdir + files[i];
+  options.create_if_missing = false;
+  options.num_levels = 2;
+  Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, options);
+  ASSERT_TRUE(strstr(s.ToString().c_str(), "Invalid argument") != nullptr);
+  ASSERT_TRUE(db_ == nullptr);
+}
 
-      uint64_t size;
-      ASSERT_OK(env_->GetFileSize(src, &size));
+TEST_F(DBTest, DestroyDBMetaDatabase) {
+  std::string dbname = test::TmpDir(env_) + "/db_meta";
+  ASSERT_OK(env_->CreateDirIfMissing(dbname));
+  std::string metadbname = MetaDatabaseName(dbname, 0);
+  ASSERT_OK(env_->CreateDirIfMissing(metadbname));
+  std::string metametadbname = MetaDatabaseName(metadbname, 0);
+  ASSERT_OK(env_->CreateDirIfMissing(metametadbname));
 
-      // record the number and the size of the
-      // latest manifest file
-      if (ParseFileName(files[i].substr(1), &number, &type)) {
-        if (type == kDescriptorFile) {
-          if (number > manifest_number) {
-            manifest_number = number;
-            ASSERT_GE(size, manifest_size);
-            size = manifest_size; // copy only valid MANIFEST data
-          }
-        }
-      }
-      CopyFile(src, dest, size);
-    }
+  // Destroy previous versions if they exist. Using the long way.
+  Options options = CurrentOptions();
+  ASSERT_OK(DestroyDB(metametadbname, options));
+  ASSERT_OK(DestroyDB(metadbname, options));
+  ASSERT_OK(DestroyDB(dbname, options));
 
-    // release file snapshot
-    dbfull()->DisableFileDeletions();
-    // overwrite one key, this key should not appear in the snapshot
-    std::vector<std::string> extras;
-    for (unsigned int i = 0; i < 1; i++) {
-      extras.push_back(RandomString(&rnd, 100000));
-      ASSERT_OK(Put(0, Key(i), extras[i]));
-    }
+  // Setup databases
+  DB* db = nullptr;
+  ASSERT_OK(DB::Open(options, dbname, &db));
+  delete db;
+  db = nullptr;
+  ASSERT_OK(DB::Open(options, metadbname, &db));
+  delete db;
+  db = nullptr;
+  ASSERT_OK(DB::Open(options, metametadbname, &db));
+  delete db;
+  db = nullptr;
 
-    // verify that data in the snapshot are correct
-    std::vector<ColumnFamilyDescriptor> column_families;
-    column_families.emplace_back("default", ColumnFamilyOptions());
-    column_families.emplace_back("pikachu", ColumnFamilyOptions());
-    std::vector<ColumnFamilyHandle*> cf_handles;
-    DB* snapdb;
-    DBOptions opts;
-    opts.env = env_;
-    opts.create_if_missing = false;
-    Status stat =
-        DB::Open(opts, snapdir, column_families, &cf_handles, &snapdb);
-    ASSERT_OK(stat);
+  // Delete databases
+  ASSERT_OK(DestroyDB(dbname, options));
 
-    ReadOptions roptions;
-    std::string val;
-    for (unsigned int i = 0; i < 80; i++) {
-      stat = snapdb->Get(roptions, cf_handles[i < 40], Key(i), &val);
-      ASSERT_EQ(values[i].compare(val), 0);
-    }
-    for (auto cfh : cf_handles) {
-      delete cfh;
-    }
-    delete snapdb;
+  // Check if deletion worked.
+  options.create_if_missing = false;
+  ASSERT_TRUE(!(DB::Open(options, dbname, &db)).ok());
+  ASSERT_TRUE(!(DB::Open(options, metadbname, &db)).ok());
+  ASSERT_TRUE(!(DB::Open(options, metametadbname, &db)).ok());
+}
 
-    // look at the new live files after we added an 'extra' key
-    // and after we took the first snapshot.
-    uint64_t new_manifest_number = 0;
-    uint64_t new_manifest_size = 0;
-    std::vector<std::string> newfiles;
-    dbfull()->DisableFileDeletions();
-    dbfull()->GetLiveFiles(newfiles, &new_manifest_size);
+// Check that the number of files does not grow when writes are dropped
+TEST_F(DBTest, DropWrites) {
+  do {
+    Options options = CurrentOptions();
+    options.env = env_;
+    options.paranoid_checks = false;
+    Reopen(options);
 
-    // Find the new manifest file and assert that it is the same one as in
-    // the previous snapshot, but larger, because we added an extra key
-    // after taking the previous snapshot.
-    for (unsigned int i = 0; i < newfiles.size(); i++) {
-      std::string src = dbname_ + "/" + newfiles[i];
-      // record the lognumber and the size of the
-      // latest manifest file
-      if (ParseFileName(newfiles[i].substr(1), &number, &type)) {
-        if (type == kDescriptorFile) {
-          if (number > new_manifest_number) {
-            uint64_t size;
-            new_manifest_number = number;
-            ASSERT_OK(env_->GetFileSize(src, &size));
-            ASSERT_GE(size, new_manifest_size);
+    ASSERT_OK(Put("foo", "v1"));
+    ASSERT_EQ("v1", Get("foo"));
+    Compact("a", "z");
+    const size_t num_files = CountFiles();
+    // Force out-of-space errors
+    env_->drop_writes_.store(true, std::memory_order_release);
+    env_->sleep_counter_.Reset();
+    env_->no_sleep_ = true;
+    for (int i = 0; i < 5; i++) {
+      if (option_config_ != kUniversalCompactionMultiLevel &&
+          option_config_ != kUniversalSubcompactions) {
+        for (int level = 0; level < dbfull()->NumberLevels(); level++) {
+          if (level > 0 && level == dbfull()->NumberLevels() - 1) {
+            break;
           }
+          dbfull()->TEST_CompactRange(level, nullptr, nullptr, nullptr,
+                                      true /* disallow trivial move */);
         }
+      } else {
+        dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
       }
     }
-    ASSERT_EQ(manifest_number, new_manifest_number);
-    ASSERT_GT(new_manifest_size, manifest_size);
 
-    // release file snapshot
-    dbfull()->DisableFileDeletions();
+    std::string property_value;
+    ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value));
+    ASSERT_EQ("5", property_value);
+
+    env_->drop_writes_.store(false, std::memory_order_release);
+    ASSERT_LT(CountFiles(), num_files + 3);
+
+    // Check that compaction attempts slept after errors
+    // TODO @krad: Figure out why ASSERT_EQ 5 keeps failing in certain compiler
+    // versions
+    ASSERT_GE(env_->sleep_counter_.Read(), 4);
   } while (ChangeCompactOptions());
 }
 
-TEST_F(DBTest, CompactOnFlush) {
-  anon::OptionsOverride options_override;
-  options_override.skip_policy = kSkipNoSnapshot;
+// Check that the background error counter is bumped on flush failures.
+TEST_F(DBTest, DropWritesFlush) {
   do {
-    Options options = CurrentOptions(options_override);
-    options.purge_redundant_kvs_while_flush = true;
-    options.disable_auto_compactions = true;
-    CreateAndReopenWithCF({"pikachu"}, options);
-
-    Put(1, "foo", "v1");
-    ASSERT_OK(Flush(1));
-    ASSERT_EQ(AllEntriesFor("foo", 1), "[ v1 ]");
-
-    // Write two new keys
-    Put(1, "a", "begin");
-    Put(1, "z", "end");
-    Flush(1);
+    Options options = CurrentOptions();
+    options.env = env_;
+    options.max_background_flushes = 1;
+    Reopen(options);
 
-    // Case 1: Delete followed by a put
-    Delete(1, "foo");
-    Put(1, "foo", "v2");
-    ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, DEL, v1 ]");
+    ASSERT_OK(Put("foo", "v1"));
+    // Force out-of-space errors
+    env_->drop_writes_.store(true, std::memory_order_release);
 
-    // After the current memtable is flushed, the DEL should
-    // have been removed
-    ASSERT_OK(Flush(1));
-    ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, v1 ]");
+    std::string property_value;
+    // Background error count is 0 now.
+    ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value));
+    ASSERT_EQ("0", property_value);
 
-    dbfull()->CompactRange(handles_[1], nullptr, nullptr);
-    ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2 ]");
+    dbfull()->TEST_FlushMemTable(true);
 
-    // Case 2: Delete followed by another delete
-    Delete(1, "foo");
-    Delete(1, "foo");
-    ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, DEL, v2 ]");
-    ASSERT_OK(Flush(1));
-    ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v2 ]");
-    dbfull()->CompactRange(handles_[1], nullptr, nullptr);
-    ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]");
+    ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value));
+    ASSERT_EQ("1", property_value);
 
-    // Case 3: Put followed by a delete
-    Put(1, "foo", "v3");
-    Delete(1, "foo");
-    ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v3 ]");
-    ASSERT_OK(Flush(1));
-    ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL ]");
-    dbfull()->CompactRange(handles_[1], nullptr, nullptr);
-    ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]");
+    env_->drop_writes_.store(false, std::memory_order_release);
+  } while (ChangeCompactOptions());
+}
 
-    // Case 4: Put followed by another Put
-    Put(1, "foo", "v4");
-    Put(1, "foo", "v5");
-    ASSERT_EQ(AllEntriesFor("foo", 1), "[ v5, v4 ]");
-    ASSERT_OK(Flush(1));
-    ASSERT_EQ(AllEntriesFor("foo", 1), "[ v5 ]");
-    dbfull()->CompactRange(handles_[1], nullptr, nullptr);
-    ASSERT_EQ(AllEntriesFor("foo", 1), "[ v5 ]");
+// Check that CompactRange() returns failure if there is not enough space left
+// on the device
+TEST_F(DBTest, NoSpaceCompactRange) {
+  do {
+    Options options = CurrentOptions();
+    options.env = env_;
+    options.disable_auto_compactions = true;
+    Reopen(options);
 
-    // clear database
-    Delete(1, "foo");
-    dbfull()->CompactRange(handles_[1], nullptr, nullptr);
-    ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]");
+    // generate 5 tables
+    for (int i = 0; i < 5; ++i) {
+      ASSERT_OK(Put(Key(i), Key(i) + "v"));
+      ASSERT_OK(Flush());
+    }
 
-    // Case 5: Put followed by snapshot followed by another Put
-    // Both puts should remain.
-    Put(1, "foo", "v6");
-    const Snapshot* snapshot = db_->GetSnapshot();
-    Put(1, "foo", "v7");
-    ASSERT_OK(Flush(1));
-    ASSERT_EQ(AllEntriesFor("foo", 1), "[ v7, v6 ]");
-    db_->ReleaseSnapshot(snapshot);
+    // Force out-of-space errors
+    env_->no_space_.store(true, std::memory_order_release);
 
-    // clear database
-    Delete(1, "foo");
-    dbfull()->CompactRange(handles_[1], nullptr, nullptr);
-    ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]");
+    Status s = dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
+                                           true /* disallow trivial move */);
+    ASSERT_TRUE(s.IsIOError());
 
-    // Case 6: snapshot followed by a Put followed by another Put
-    // Only the last put should remain.
-    const Snapshot* snapshot1 = db_->GetSnapshot();
-    Put(1, "foo", "v8");
-    Put(1, "foo", "v9");
-    ASSERT_OK(Flush(1));
-    ASSERT_EQ(AllEntriesFor("foo", 1), "[ v9 ]");
-    db_->ReleaseSnapshot(snapshot1);
+    env_->no_space_.store(false, std::memory_order_release);
   } while (ChangeCompactOptions());
 }
 
-namespace {
-std::vector<std::uint64_t> ListSpecificFiles(
-    Env* env, const std::string& path, const FileType expected_file_type) {
-  std::vector<std::string> files;
-  std::vector<uint64_t> file_numbers;
-  env->GetChildren(path, &files);
-  uint64_t number;
-  FileType type;
-  for (size_t i = 0; i < files.size(); ++i) {
-    if (ParseFileName(files[i], &number, &type)) {
-      if (type == expected_file_type) {
-        file_numbers.push_back(number);
+TEST_F(DBTest, NonWritableFileSystem) {
+  do {
+    Options options = CurrentOptions();
+    options.write_buffer_size = 4096;
+    options.arena_block_size = 4096;
+    options.env = env_;
+    Reopen(options);
+    ASSERT_OK(Put("foo", "v1"));
+    env_->non_writeable_rate_.store(100);
+    std::string big(100000, 'x');
+    int errors = 0;
+    for (int i = 0; i < 20; i++) {
+      if (!Put("foo", big).ok()) {
+        errors++;
+        env_->SleepForMicroseconds(100000);
       }
     }
-  }
-  return std::move(file_numbers);
+    ASSERT_GT(errors, 0);
+    env_->non_writeable_rate_.store(0);
+  } while (ChangeCompactOptions());
 }
 
-std::vector<std::uint64_t> ListTableFiles(Env* env, const std::string& path) {
-  return ListSpecificFiles(env, path, kTableFile);
-}
-}  // namespace
+TEST_F(DBTest, ManifestWriteError) {
+  // Test for the following problem:
+  // (a) Compaction produces file F
+  // (b) Log record containing F is written to MANIFEST file, but Sync() fails
+  // (c) GC deletes F
+  // (d) After reopening DB, reads fail since deleted F is named in log record
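+  //
+  // (The MANIFEST is RocksDB's log of version edits: every flush and
+  // compaction appends a record naming the files it added or removed, and
+  // recovery replays those records, so a record that outlives its file
+  // leaves reads pointing at a missing table.)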
 
-TEST_F(DBTest, FlushOneColumnFamily) {
-  Options options = CurrentOptions();
-  CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia", "nikitich",
-                         "alyosha", "popovich"},
-                        options);
+  // We iterate twice.  In the second iteration, everything is the
+  // same except the log record never makes it to the MANIFEST file.
+  for (int iter = 0; iter < 2; iter++) {
+    std::atomic<bool>* error_type = (iter == 0)
+        ? &env_->manifest_sync_error_
+        : &env_->manifest_write_error_;
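+    // iter 0 fails the MANIFEST Sync() (step (b) above); iter 1 fails the
+    // MANIFEST write itself, so the record never reaches the file at all.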
 
-  ASSERT_OK(Put(0, "Default", "Default"));
-  ASSERT_OK(Put(1, "pikachu", "pikachu"));
-  ASSERT_OK(Put(2, "ilya", "ilya"));
-  ASSERT_OK(Put(3, "muromec", "muromec"));
-  ASSERT_OK(Put(4, "dobrynia", "dobrynia"));
-  ASSERT_OK(Put(5, "nikitich", "nikitich"));
-  ASSERT_OK(Put(6, "alyosha", "alyosha"));
-  ASSERT_OK(Put(7, "popovich", "popovich"));
+    // Insert foo=>bar mapping
+    Options options = CurrentOptions();
+    options.env = env_;
+    options.create_if_missing = true;
+    options.error_if_exists = false;
+    DestroyAndReopen(options);
+    ASSERT_OK(Put("foo", "bar"));
+    ASSERT_EQ("bar", Get("foo"));
 
-  for (int i = 0; i < 8; ++i) {
-    Flush(i);
-    auto tables = ListTableFiles(env_, dbname_);
-    ASSERT_EQ(tables.size(), i + 1U);
+    // Memtable compaction (will succeed)
+    Flush();
+    ASSERT_EQ("bar", Get("foo"));
+    const int last = 2;
+    MoveFilesToLevel(2);
+    ASSERT_EQ(NumTableFilesAtLevel(last), 1);   // foo=>bar is now in last level
+
+    // Merging compaction (will fail)
+    error_type->store(true, std::memory_order_release);
+    dbfull()->TEST_CompactRange(last, nullptr, nullptr);  // Should fail
+    ASSERT_EQ("bar", Get("foo"));
+
+    // Recovery: should not lose data
+    error_type->store(false, std::memory_order_release);
+    Reopen(options);
+    ASSERT_EQ("bar", Get("foo"));
   }
 }
 
-// In https://reviews.facebook.net/D20661 we change
-// recovery behavior: previously for each log file each column family
-// memtable was flushed, even it was empty. Now it's changed:
-// we try to create the smallest number of table files by merging
-// updates from multiple logs
-TEST_F(DBTest, RecoverCheckFileAmountWithSmallWriteBuffer) {
+TEST_F(DBTest, PutFailsParanoid) {
+  // Test the following:
+  // (a) A random put fails in paranoid mode (simulated by a sync failure)
+  // (b) All other puts have to fail, even if their writes would succeed
+  // (c) All of that should happen ONLY if paranoid_checks = true
+
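+  // With paranoid_checks, the first failed WAL write is latched as the DB's
+  // background error; subsequent writes return that same error until the DB
+  // is reopened, while reads keep working, as verified below.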
   Options options = CurrentOptions();
-  options.write_buffer_size = 5000000;
-  CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, options);
+  options.env = env_;
+  options.create_if_missing = true;
+  options.error_if_exists = false;
+  options.paranoid_checks = true;
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+  Status s;
 
-  // Since we will reopen DB with smaller write_buffer_size,
-  // each key will go to new SST file
-  ASSERT_OK(Put(1, Key(10), DummyString(1000000)));
-  ASSERT_OK(Put(1, Key(10), DummyString(1000000)));
-  ASSERT_OK(Put(1, Key(10), DummyString(1000000)));
-  ASSERT_OK(Put(1, Key(10), DummyString(1000000)));
+  ASSERT_OK(Put(1, "foo", "bar"));
+  ASSERT_OK(Put(1, "foo1", "bar1"));
+  // simulate error
+  env_->log_write_error_.store(true, std::memory_order_release);
+  s = Put(1, "foo2", "bar2");
+  ASSERT_TRUE(!s.ok());
+  env_->log_write_error_.store(false, std::memory_order_release);
+  s = Put(1, "foo3", "bar3");
+  // the next put should fail, too
+  ASSERT_TRUE(!s.ok());
+  // but we're still able to read
+  ASSERT_EQ("bar", Get(1, "foo"));
 
-  ASSERT_OK(Put(3, Key(10), DummyString(1)));
-  // Make 'dobrynia' to be flushed and new WAL file to be created
-  ASSERT_OK(Put(2, Key(10), DummyString(7500000)));
-  ASSERT_OK(Put(2, Key(1), DummyString(1)));
-  dbfull()->TEST_WaitForFlushMemTable(handles_[2]);
-  {
-    auto tables = ListTableFiles(env_, dbname_);
-    ASSERT_EQ(tables.size(), static_cast<size_t>(1));
-    // Make sure 'dobrynia' was flushed: check sst files amount
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
-              static_cast<uint64_t>(1));
-  }
-  // New WAL file
-  ASSERT_OK(Put(1, Key(1), DummyString(1)));
-  ASSERT_OK(Put(1, Key(1), DummyString(1)));
-  ASSERT_OK(Put(3, Key(10), DummyString(1)));
-  ASSERT_OK(Put(3, Key(10), DummyString(1)));
-  ASSERT_OK(Put(3, Key(10), DummyString(1)));
+  // do the same thing with paranoid checks off
+  options.paranoid_checks = false;
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
 
-  options.write_buffer_size = 10;
-  ReopenWithColumnFamilies({"default", "pikachu", "dobrynia", "nikitich"},
-                           options);
-  {
-    // No inserts => default is empty
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
-              static_cast<uint64_t>(0));
-    // First 4 keys goes to separate SSTs + 1 more SST for 2 smaller keys
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
-              static_cast<uint64_t>(5));
-    // 1 SST for big key + 1 SST for small one
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
-              static_cast<uint64_t>(2));
-    // 1 SST for all keys
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
-              static_cast<uint64_t>(1));
-  }
+  ASSERT_OK(Put(1, "foo", "bar"));
+  ASSERT_OK(Put(1, "foo1", "bar1"));
+  // simulate error
+  env_->log_write_error_.store(true, std::memory_order_release);
+  s = Put(1, "foo2", "bar2");
+  ASSERT_TRUE(!s.ok());
+  env_->log_write_error_.store(false, std::memory_order_release);
+  s = Put(1, "foo3", "bar3");
+  // the next put should NOT fail
+  ASSERT_TRUE(s.ok());
 }
 
-// In https://reviews.facebook.net/D20661 we change
-// recovery behavior: previously for each log file each column family
-// memtable was flushed, even it wasn't empty. Now it's changed:
-// we try to create the smallest number of table files by merging
-// updates from multiple logs
-TEST_F(DBTest, RecoverCheckFileAmount) {
-  Options options = CurrentOptions();
-  options.write_buffer_size = 100000;
-  CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, options);
+TEST_F(DBTest, BloomFilter) {
+  do {
+    Options options = CurrentOptions();
+    env_->count_random_reads_ = true;
+    options.env = env_;
+    // ChangeCompactOptions() only changes compaction style, which does not
+    // trigger reset of table_factory
+    BlockBasedTableOptions table_options;
+    table_options.no_block_cache = true;
+    table_options.filter_policy.reset(NewBloomFilterPolicy(10));
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
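+    // 10 bits per key gives the bloom filter a false-positive rate of
+    // roughly 1%, which is the headroom behind the read-count bounds below.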
 
-  ASSERT_OK(Put(0, Key(1), DummyString(1)));
-  ASSERT_OK(Put(1, Key(1), DummyString(1)));
-  ASSERT_OK(Put(2, Key(1), DummyString(1)));
+    CreateAndReopenWithCF({"pikachu"}, options);
 
-  // Make 'nikitich' memtable to be flushed
-  ASSERT_OK(Put(3, Key(10), DummyString(1002400)));
-  ASSERT_OK(Put(3, Key(1), DummyString(1)));
-  dbfull()->TEST_WaitForFlushMemTable(handles_[3]);
-  // 4 memtable are not flushed, 1 sst file
-  {
-    auto tables = ListTableFiles(env_, dbname_);
-    ASSERT_EQ(tables.size(), static_cast<size_t>(1));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
-              static_cast<uint64_t>(1));
-  }
-  // Memtable for 'nikitich' has flushed, new WAL file has opened
-  // 4 memtable still not flushed
+    // Populate multiple layers
+    const int N = 10000;
+    for (int i = 0; i < N; i++) {
+      ASSERT_OK(Put(1, Key(i), Key(i)));
+    }
+    Compact(1, "a", "z");
+    for (int i = 0; i < N; i += 100) {
+      ASSERT_OK(Put(1, Key(i), Key(i)));
+    }
+    Flush(1);
 
-  // Write to new WAL file
-  ASSERT_OK(Put(0, Key(1), DummyString(1)));
-  ASSERT_OK(Put(1, Key(1), DummyString(1)));
-  ASSERT_OK(Put(2, Key(1), DummyString(1)));
+    // Prevent auto compactions triggered by seeks
+    env_->delay_sstable_sync_.store(true, std::memory_order_release);
 
-  // Fill up 'nikitich' one more time
-  ASSERT_OK(Put(3, Key(10), DummyString(1002400)));
-  // make it flush
-  ASSERT_OK(Put(3, Key(1), DummyString(1)));
-  dbfull()->TEST_WaitForFlushMemTable(handles_[3]);
-  // There are still 4 memtable not flushed, and 2 sst tables
-  ASSERT_OK(Put(0, Key(1), DummyString(1)));
-  ASSERT_OK(Put(1, Key(1), DummyString(1)));
-  ASSERT_OK(Put(2, Key(1), DummyString(1)));
+    // Lookup present keys.  Should rarely read from small sstable.
+    env_->random_read_counter_.Reset();
+    for (int i = 0; i < N; i++) {
+      ASSERT_EQ(Key(i), Get(1, Key(i)));
+    }
+    int reads = env_->random_read_counter_.Read();
+    fprintf(stderr, "%d present => %d reads\n", N, reads);
+    ASSERT_GE(reads, N);
+    ASSERT_LE(reads, N + 2*N/100);
+
+    // Lookup missing keys.  Should rarely read from either sstable.
+    env_->random_read_counter_.Reset();
+    for (int i = 0; i < N; i++) {
+      ASSERT_EQ("NOT_FOUND", Get(1, Key(i) + ".missing"));
+    }
+    reads = env_->random_read_counter_.Read();
+    fprintf(stderr, "%d missing => %d reads\n", N, reads);
+    ASSERT_LE(reads, 3*N/100);
+
+    env_->delay_sstable_sync_.store(false, std::memory_order_release);
+    Close();
+  } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTest, BloomFilterRate) {
+  while (ChangeFilterOptions()) {
+    Options options = CurrentOptions();
+    options.statistics = rocksdb::CreateDBStatistics();
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    const int maxKey = 10000;
+    for (int i = 0; i < maxKey; i++) {
+      ASSERT_OK(Put(1, Key(i), Key(i)));
+    }
+    // Add a large key to widen the file's key range, so the missing keys
+    // probed below fall inside it and must be rejected by the bloom filter
+    // rather than by the key-range check
+    ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555)));
+    Flush(1);
 
-  {
-    auto tables = ListTableFiles(env_, dbname_);
-    ASSERT_EQ(tables.size(), static_cast<size_t>(2));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
-              static_cast<uint64_t>(2));
-  }
+    // Check if they can be found
+    for (int i = 0; i < maxKey; i++) {
+      ASSERT_EQ(Key(i), Get(1, Key(i)));
+    }
+    ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
 
-  ReopenWithColumnFamilies({"default", "pikachu", "dobrynia", "nikitich"},
-                           options);
-  {
-    std::vector<uint64_t> table_files = ListTableFiles(env_, dbname_);
-    // Check, that records for 'default', 'dobrynia' and 'pikachu' from
-    // first, second and third WALs  went to the same SST.
-    // So, there is 6 SSTs: three  for 'nikitich', one for 'default', one for
-    // 'dobrynia', one for 'pikachu'
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
-              static_cast<uint64_t>(1));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
-              static_cast<uint64_t>(3));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
-              static_cast<uint64_t>(1));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
-              static_cast<uint64_t>(1));
+    // Check if filter is useful
+    for (int i = 0; i < maxKey; i++) {
+      ASSERT_EQ("NOT_FOUND", Get(1, Key(i+33333)));
+    }
+    ASSERT_GE(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), maxKey*0.98);
   }
 }
 
-TEST_F(DBTest, SharedWriteBuffer) {
+TEST_F(DBTest, BloomFilterCompatibility) {
   Options options = CurrentOptions();
-  options.db_write_buffer_size = 100000;  // this is the real limit
-  options.write_buffer_size    = 500000;  // this is never hit
-  CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, options);
+  options.statistics = rocksdb::CreateDBStatistics();
+  BlockBasedTableOptions table_options;
+  table_options.filter_policy.reset(NewBloomFilterPolicy(10, true));
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
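+  // The second argument of NewBloomFilterPolicy selects the filter format:
+  // true builds the legacy per-block filters, false builds one full filter
+  // per SST file. This test writes with one format and reads with the other.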
 
-  // Trigger a flush on every CF
-  ASSERT_OK(Put(0, Key(1), DummyString(1)));
-  ASSERT_OK(Put(1, Key(1), DummyString(1)));
-  ASSERT_OK(Put(3, Key(1), DummyString(90000)));
-  ASSERT_OK(Put(2, Key(2), DummyString(20000)));
-  ASSERT_OK(Put(2, Key(1), DummyString(1)));
-  dbfull()->TEST_WaitForFlushMemTable(handles_[0]);
-  dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
-  dbfull()->TEST_WaitForFlushMemTable(handles_[2]);
-  dbfull()->TEST_WaitForFlushMemTable(handles_[3]);
-  {
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
-              static_cast<uint64_t>(1));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
-              static_cast<uint64_t>(1));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
-              static_cast<uint64_t>(1));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
-              static_cast<uint64_t>(1));
-  }
+  // Create with block-based filter
+  CreateAndReopenWithCF({"pikachu"}, options);
 
-  // Flush 'dobrynia' and 'nikitich'
-  ASSERT_OK(Put(2, Key(2), DummyString(50000)));
-  ASSERT_OK(Put(3, Key(2), DummyString(40000)));
-  ASSERT_OK(Put(2, Key(3), DummyString(20000)));
-  ASSERT_OK(Put(3, Key(2), DummyString(40000)));
-  dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
-  dbfull()->TEST_WaitForFlushMemTable(handles_[2]);
-  dbfull()->TEST_WaitForFlushMemTable(handles_[3]);
-  {
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
-              static_cast<uint64_t>(1));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
-              static_cast<uint64_t>(1));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
-              static_cast<uint64_t>(2));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
-              static_cast<uint64_t>(2));
+  const int maxKey = 10000;
+  for (int i = 0; i < maxKey; i++) {
+    ASSERT_OK(Put(1, Key(i), Key(i)));
   }
+  ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555)));
+  Flush(1);
 
-  // Make 'dobrynia' and 'nikitich' both take up 40% of space
-  // When 'pikachu' puts us over 100%, all 3 flush.
-  ASSERT_OK(Put(2, Key(2), DummyString(40000)));
-  ASSERT_OK(Put(1, Key(2), DummyString(20000)));
-  ASSERT_OK(Put(0, Key(1), DummyString(1)));
-  dbfull()->TEST_WaitForFlushMemTable(handles_[2]);
-  dbfull()->TEST_WaitForFlushMemTable(handles_[3]);
-  {
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
-              static_cast<uint64_t>(1));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
-              static_cast<uint64_t>(2));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
-              static_cast<uint64_t>(3));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
-              static_cast<uint64_t>(3));
-  }
+  // Check db with full filter
+  table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
 
-  // Some remaining writes so 'default' and 'nikitich' flush on closure.
-  ASSERT_OK(Put(3, Key(1), DummyString(1)));
-  ReopenWithColumnFamilies({"default", "pikachu", "dobrynia", "nikitich"},
-                           options);
-  {
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
-              static_cast<uint64_t>(2));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
-              static_cast<uint64_t>(2));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
-              static_cast<uint64_t>(3));
-    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
-              static_cast<uint64_t>(4));
+  // Check if they can be found
+  for (int i = 0; i < maxKey; i++) {
+    ASSERT_EQ(Key(i), Get(1, Key(i)));
   }
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
 }
 
-TEST_F(DBTest, PurgeInfoLogs) {
+TEST_F(DBTest, BloomFilterReverseCompatibility) {
   Options options = CurrentOptions();
-  options.keep_log_file_num = 5;
-  options.create_if_missing = true;
-  for (int mode = 0; mode <= 1; mode++) {
-    if (mode == 1) {
-      options.db_log_dir = dbname_ + "_logs";
-      env_->CreateDirIfMissing(options.db_log_dir);
-    } else {
-      options.db_log_dir = "";
-    }
-    for (int i = 0; i < 8; i++) {
-      Reopen(options);
-    }
+  options.statistics = rocksdb::CreateDBStatistics();
+  BlockBasedTableOptions table_options;
+  table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
 
-    std::vector<std::string> files;
-    env_->GetChildren(options.db_log_dir.empty() ? dbname_ : options.db_log_dir,
-                      &files);
-    int info_log_count = 0;
-    for (std::string file : files) {
-      if (file.find("LOG") != std::string::npos) {
-        info_log_count++;
-      }
-    }
-    ASSERT_EQ(5, info_log_count);
+  // Create with full filter
+  CreateAndReopenWithCF({"pikachu"}, options);
 
-    Destroy(options);
-    // For mode (1), test DestroyDB() to delete all the logs under DB dir.
-    // For mode (2), no info log file should have been put under DB dir.
-    std::vector<std::string> db_files;
-    env_->GetChildren(dbname_, &db_files);
-    for (std::string file : db_files) {
-      ASSERT_TRUE(file.find("LOG") == std::string::npos);
-    }
+  const int maxKey = 10000;
+  for (int i = 0; i < maxKey; i++) {
+    ASSERT_OK(Put(1, Key(i), Key(i)));
+  }
+  ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555)));
+  Flush(1);
 
-    if (mode == 1) {
-      // Cleaning up
-      env_->GetChildren(options.db_log_dir, &files);
-      for (std::string file : files) {
-        env_->DeleteFile(options.db_log_dir + "/" + file);
-      }
-      env_->DeleteDir(options.db_log_dir);
-    }
+  // Check db with block-based filter
+  table_options.filter_policy.reset(NewBloomFilterPolicy(10, true));
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+  // Check if they can be found
+  for (int i = 0; i < maxKey; i++) {
+    ASSERT_EQ(Key(i), Get(1, Key(i)));
   }
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
 }
 
 namespace {
-SequenceNumber ReadRecords(
-    std::unique_ptr<TransactionLogIterator>& iter,
-    int& count) {
-  count = 0;
-  SequenceNumber lastSequence = 0;
-  BatchResult res;
-  while (iter->Valid()) {
-    res = iter->GetBatch();
-    EXPECT_TRUE(res.sequence > lastSequence);
-    ++count;
-    lastSequence = res.sequence;
-    EXPECT_OK(iter->status());
-    iter->Next();
-  }
-  return res.sequence;
-}
+// A bloom filter policy wrapped around the default FilterPolicy
+class WrappedBloom : public FilterPolicy {
+ public:
+  explicit WrappedBloom(int bits_per_key) :
+        filter_(NewBloomFilterPolicy(bits_per_key)),
+        counter_(0) {}
 
-void ExpectRecords(
-    const int expected_no_records,
-    std::unique_ptr<TransactionLogIterator>& iter) {
-  int num_records;
-  ReadRecords(iter, num_records);
-  ASSERT_EQ(num_records, expected_no_records);
-}
-}  // namespace
+  ~WrappedBloom() { delete filter_; }
 
-TEST_F(DBTest, TransactionLogIterator) {
-  do {
-    Options options = OptionsForLogIterTest();
-    DestroyAndReopen(options);
-    CreateAndReopenWithCF({"pikachu"}, options);
-    Put(0, "key1", DummyString(1024));
-    Put(1, "key2", DummyString(1024));
-    Put(1, "key2", DummyString(1024));
-    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 3U);
-    {
-      auto iter = OpenTransactionLogIter(0);
-      ExpectRecords(3, iter);
-    }
-    ReopenWithColumnFamilies({"default", "pikachu"}, options);
-    env_->SleepForMicroseconds(2 * 1000 * 1000);
-    {
-      Put(0, "key4", DummyString(1024));
-      Put(1, "key5", DummyString(1024));
-      Put(0, "key6", DummyString(1024));
-    }
-    {
-      auto iter = OpenTransactionLogIter(0);
-      ExpectRecords(6, iter);
+  const char* Name() const override { return "WrappedRocksDbFilterPolicy"; }
+
+  void CreateFilter(const rocksdb::Slice* keys, int n, std::string* dst)
+      const override {
+    std::unique_ptr<rocksdb::Slice[]> user_keys(new rocksdb::Slice[n]);
+    for (int i = 0; i < n; ++i) {
+      user_keys[i] = convertKey(keys[i]);
     }
-  } while (ChangeCompactOptions());
-}
+    return filter_->CreateFilter(user_keys.get(), n, dst);
+  }
 
-#ifndef NDEBUG // sync point is not included with DNDEBUG build
-TEST_F(DBTest, TransactionLogIteratorRace) {
-  static const int LOG_ITERATOR_RACE_TEST_COUNT = 2;
-  static const char* sync_points[LOG_ITERATOR_RACE_TEST_COUNT][4] = {
-      {"WalManager::GetSortedWalFiles:1",  "WalManager::PurgeObsoleteFiles:1",
-       "WalManager::PurgeObsoleteFiles:2", "WalManager::GetSortedWalFiles:2"},
-      {"WalManager::GetSortedWalsOfType:1",
-       "WalManager::PurgeObsoleteFiles:1",
-       "WalManager::PurgeObsoleteFiles:2",
-       "WalManager::GetSortedWalsOfType:2"}};
-  for (int test = 0; test < LOG_ITERATOR_RACE_TEST_COUNT; ++test) {
-    // Setup sync point dependency to reproduce the race condition of
-    // a log file moved to archived dir, in the middle of GetSortedWalFiles
-    rocksdb::SyncPoint::GetInstance()->LoadDependency(
-      { { sync_points[test][0], sync_points[test][1] },
-        { sync_points[test][2], sync_points[test][3] },
-      });
+  bool KeyMayMatch(const rocksdb::Slice& key, const rocksdb::Slice& filter)
+      const override {
+    counter_++;
+    return filter_->KeyMayMatch(convertKey(key), filter);
+  }
 
-    do {
-      rocksdb::SyncPoint::GetInstance()->ClearTrace();
-      rocksdb::SyncPoint::GetInstance()->DisableProcessing();
-      Options options = OptionsForLogIterTest();
-      DestroyAndReopen(options);
-      Put("key1", DummyString(1024));
-      dbfull()->Flush(FlushOptions());
-      Put("key2", DummyString(1024));
-      dbfull()->Flush(FlushOptions());
-      Put("key3", DummyString(1024));
-      dbfull()->Flush(FlushOptions());
-      Put("key4", DummyString(1024));
-      ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 4U);
-
-      {
-        auto iter = OpenTransactionLogIter(0);
-        ExpectRecords(4, iter);
-      }
+  uint32_t GetCounter() { return counter_; }
 
-      rocksdb::SyncPoint::GetInstance()->EnableProcessing();
-      // trigger async flush, and log move. Well, log move will
-      // wait until the GetSortedWalFiles:1 to reproduce the race
-      // condition
-      FlushOptions flush_options;
-      flush_options.wait = false;
-      dbfull()->Flush(flush_options);
-
-      // "key5" would be written in a new memtable and log
-      Put("key5", DummyString(1024));
-      {
-        // this iter would miss "key4" if not fixed
-        auto iter = OpenTransactionLogIter(0);
-        ExpectRecords(5, iter);
-      }
-    } while (ChangeCompactOptions());
+ private:
+  const FilterPolicy* filter_;
+  mutable uint32_t counter_;
+
+  rocksdb::Slice convertKey(const rocksdb::Slice& key) const {
+    return key;
   }
-}
-#endif
+};
+}  // namespace
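+
+// WrappedBloom delegates CreateFilter()/KeyMayMatch() to the built-in bloom
+// policy while counting KeyMayMatch() probes; the counter lets the test below
+// check that the custom policy is consulted exactly once per Get().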
 
-TEST_F(DBTest, TransactionLogIteratorStallAtLastRecord) {
-  do {
-    Options options = OptionsForLogIterTest();
-    DestroyAndReopen(options);
-    Put("key1", DummyString(1024));
-    auto iter = OpenTransactionLogIter(0);
-    ASSERT_OK(iter->status());
-    ASSERT_TRUE(iter->Valid());
-    iter->Next();
-    ASSERT_TRUE(!iter->Valid());
-    ASSERT_OK(iter->status());
-    Put("key2", DummyString(1024));
-    iter->Next();
-    ASSERT_OK(iter->status());
-    ASSERT_TRUE(iter->Valid());
-  } while (ChangeCompactOptions());
-}
+TEST_F(DBTest, BloomFilterWrapper) {
+  Options options = CurrentOptions();
+  options.statistics = rocksdb::CreateDBStatistics();
 
-TEST_F(DBTest, TransactionLogIteratorCheckAfterRestart) {
-  do {
-    Options options = OptionsForLogIterTest();
-    DestroyAndReopen(options);
-    Put("key1", DummyString(1024));
-    Put("key2", DummyString(1023));
-    dbfull()->Flush(FlushOptions());
-    Reopen(options);
-    auto iter = OpenTransactionLogIter(0);
-    ExpectRecords(2, iter);
-  } while (ChangeCompactOptions());
-}
+  BlockBasedTableOptions table_options;
+  WrappedBloom* policy = new WrappedBloom(10);
+  table_options.filter_policy.reset(policy);
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
 
-TEST_F(DBTest, TransactionLogIteratorCorruptedLog) {
-  do {
-    Options options = OptionsForLogIterTest();
-    DestroyAndReopen(options);
-    for (int i = 0; i < 1024; i++) {
-      Put("key"+ToString(i), DummyString(10));
-    }
-    dbfull()->Flush(FlushOptions());
-    // Corrupt this log to create a gap
-    rocksdb::VectorLogPtr wal_files;
-    ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files));
-    const auto logfile_path = dbname_ + "/" + wal_files.front()->PathName();
-    if (mem_env_) {
-      mem_env_->Truncate(logfile_path, wal_files.front()->SizeFileBytes() / 2);
-    } else {
-      ASSERT_EQ(0, truncate(logfile_path.c_str(),
-                   wal_files.front()->SizeFileBytes() / 2));
-    }
+  CreateAndReopenWithCF({"pikachu"}, options);
 
-    // Insert a new entry to a new log file
-    Put("key1025", DummyString(10));
-    // Try to read from the beginning. Should stop before the gap and read less
-    // than 1025 entries
-    auto iter = OpenTransactionLogIter(0);
-    int count;
-    SequenceNumber last_sequence_read = ReadRecords(iter, count);
-    ASSERT_LT(last_sequence_read, 1025U);
-    // Try to read past the gap, should be able to seek to key1025
-    auto iter2 = OpenTransactionLogIter(last_sequence_read + 1);
-    ExpectRecords(1, iter2);
-  } while (ChangeCompactOptions());
+  const int maxKey = 10000;
+  for (int i = 0; i < maxKey; i++) {
+    ASSERT_OK(Put(1, Key(i), Key(i)));
+  }
+  // Add a large key to widen the file's key range, so the missing keys
+  // probed below fall inside it and must be rejected by the bloom filter
+  // rather than by the key-range check
+  ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555)));
+  ASSERT_EQ(0U, policy->GetCounter());
+  Flush(1);
+
+  // Check if they can be found
+  for (int i = 0; i < maxKey; i++) {
+    ASSERT_EQ(Key(i), Get(1, Key(i)));
+  }
+  ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+  ASSERT_EQ(1U * maxKey, policy->GetCounter());
+
+  // Check if filter is useful
+  for (int i = 0; i < maxKey; i++) {
+    ASSERT_EQ("NOT_FOUND", Get(1, Key(i+33333)));
+  }
+  ASSERT_GE(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), maxKey*0.98);
+  ASSERT_EQ(2U * maxKey, policy->GetCounter());
 }
 
-TEST_F(DBTest, TransactionLogIteratorBatchOperations) {
+TEST_F(DBTest, SnapshotFiles) {
   do {
-    Options options = OptionsForLogIterTest();
-    DestroyAndReopen(options);
+    Options options = CurrentOptions();
+    options.write_buffer_size = 100000000;        // Large write buffer
     CreateAndReopenWithCF({"pikachu"}, options);
-    WriteBatch batch;
-    batch.Put(handles_[1], "key1", DummyString(1024));
-    batch.Put(handles_[0], "key2", DummyString(1024));
-    batch.Put(handles_[1], "key3", DummyString(1024));
-    batch.Delete(handles_[0], "key2");
-    dbfull()->Write(WriteOptions(), &batch);
-    Flush(1);
-    Flush(0);
-    ReopenWithColumnFamilies({"default", "pikachu"}, options);
-    Put(1, "key4", DummyString(1024));
-    auto iter = OpenTransactionLogIter(3);
-    ExpectRecords(2, iter);
-  } while (ChangeCompactOptions());
-}
 
-TEST_F(DBTest, TransactionLogIteratorBlobs) {
-  Options options = OptionsForLogIterTest();
-  DestroyAndReopen(options);
-  CreateAndReopenWithCF({"pikachu"}, options);
-  {
-    WriteBatch batch;
-    batch.Put(handles_[1], "key1", DummyString(1024));
-    batch.Put(handles_[0], "key2", DummyString(1024));
-    batch.PutLogData(Slice("blob1"));
-    batch.Put(handles_[1], "key3", DummyString(1024));
-    batch.PutLogData(Slice("blob2"));
-    batch.Delete(handles_[0], "key2");
-    dbfull()->Write(WriteOptions(), &batch);
-    ReopenWithColumnFamilies({"default", "pikachu"}, options);
-  }
+    Random rnd(301);
 
-  auto res = OpenTransactionLogIter(0)->GetBatch();
-  struct Handler : public WriteBatch::Handler {
-    std::string seen;
-    virtual Status PutCF(uint32_t cf, const Slice& key,
-                         const Slice& value) override {
-      seen += "Put(" + ToString(cf) + ", " + key.ToString() + ", " +
-              ToString(value.size()) + ")";
-      return Status::OK();
-    }
-    virtual Status MergeCF(uint32_t cf, const Slice& key,
-                           const Slice& value) override {
-      seen += "Merge(" + ToString(cf) + ", " + key.ToString() + ", " +
-              ToString(value.size()) + ")";
-      return Status::OK();
-    }
-    virtual void LogData(const Slice& blob) override {
-      seen += "LogData(" + blob.ToString() + ")";
-    }
-    virtual Status DeleteCF(uint32_t cf, const Slice& key) override {
-      seen += "Delete(" + ToString(cf) + ", " + key.ToString() + ")";
-      return Status::OK();
-    }
-  } handler;
-  res.writeBatchPtr->Iterate(&handler);
-  ASSERT_EQ(
-      "Put(1, key1, 1024)"
-      "Put(0, key2, 1024)"
-      "LogData(blob1)"
-      "Put(1, key3, 1024)"
-      "LogData(blob2)"
-      "Delete(0, key2)",
-      handler.seen);
-}
+    // Write 8MB (80 values, each 100K)
+    ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+    std::vector<std::string> values;
+    for (int i = 0; i < 80; i++) {
+      values.push_back(RandomString(&rnd, 100000));
+      ASSERT_OK(Put((i < 40), Key(i), values[i]));
+    }
 
-// Multi-threaded test:
-namespace {
+    // assert that nothing makes it to disk yet.
+    ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
 
-static const int kColumnFamilies = 10;
-static const int kNumThreads = 10;
-static const int kTestSeconds = 10;
-static const int kNumKeys = 1000;
+    // get a file snapshot
+    uint64_t manifest_number = 0;
+    uint64_t manifest_size = 0;
+    std::vector<std::string> files;
+    dbfull()->DisableFileDeletions();
+    dbfull()->GetLiveFiles(files, &manifest_size);
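+    // This is the live-backup protocol: DisableFileDeletions() pins the
+    // current files, GetLiveFiles() lists them and reports how many bytes of
+    // the MANIFEST are valid, the files are copied out, and
+    // EnableFileDeletions() drops the pin again.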
 
-struct MTState {
-  DBTest* test;
-  std::atomic<bool> stop;
-  std::atomic<int> counter[kNumThreads];
-  std::atomic<bool> thread_done[kNumThreads];
-};
+    // CURRENT, MANIFEST, *.sst files (one for each CF)
+    ASSERT_EQ(files.size(), 4U);
 
-struct MTThread {
-  MTState* state;
-  int id;
-};
+    uint64_t number = 0;
+    FileType type;
 
-static void MTThreadBody(void* arg) {
-  MTThread* t = reinterpret_cast<MTThread*>(arg);
-  int id = t->id;
-  DB* db = t->state->test->db_;
-  int counter = 0;
-  fprintf(stderr, "... starting thread %d\n", id);
-  Random rnd(1000 + id);
-  char valbuf[1500];
-  while (t->state->stop.load(std::memory_order_acquire) == false) {
-    t->state->counter[id].store(counter, std::memory_order_release);
+    // copy these files to a new snapshot directory
+    std::string snapdir = dbname_ + ".snapdir/";
+    ASSERT_OK(env_->CreateDirIfMissing(snapdir));
 
-    int key = rnd.Uniform(kNumKeys);
-    char keybuf[20];
-    snprintf(keybuf, sizeof(keybuf), "%016d", key);
+    for (unsigned int i = 0; i < files.size(); i++) {
+      // our clients require that GetLiveFiles returns
+      // files with "/" as first character!
+      ASSERT_EQ(files[i][0], '/');
+      std::string src = dbname_ + files[i];
+      std::string dest = snapdir + files[i];
 
-    if (rnd.OneIn(2)) {
-      // Write values of the form <key, my id, counter, cf, unique_id>.
-      // into each of the CFs
-      // We add some padding for force compactions.
-      int unique_id = rnd.Uniform(1000000);
+      uint64_t size;
+      ASSERT_OK(env_->GetFileSize(src, &size));
 
-      // Half of the time directly use WriteBatch. Half of the time use
-      // WriteBatchWithIndex.
-      if (rnd.OneIn(2)) {
-        WriteBatch batch;
-        for (int cf = 0; cf < kColumnFamilies; ++cf) {
-          snprintf(valbuf, sizeof(valbuf), "%d.%d.%d.%d.%-1000d", key, id,
-                   static_cast<int>(counter), cf, unique_id);
-          batch.Put(t->state->test->handles_[cf], Slice(keybuf), Slice(valbuf));
-        }
-        ASSERT_OK(db->Write(WriteOptions(), &batch));
-      } else {
-        WriteBatchWithIndex batch(db->GetOptions().comparator);
-        for (int cf = 0; cf < kColumnFamilies; ++cf) {
-          snprintf(valbuf, sizeof(valbuf), "%d.%d.%d.%d.%-1000d", key, id,
-                   static_cast<int>(counter), cf, unique_id);
-          batch.Put(t->state->test->handles_[cf], Slice(keybuf), Slice(valbuf));
-        }
-        ASSERT_OK(db->Write(WriteOptions(), batch.GetWriteBatch()));
-      }
-    } else {
-      // Read a value and verify that it matches the pattern written above
-      // and that writes to all column families were atomic (unique_id is the
-      // same)
-      std::vector<Slice> keys(kColumnFamilies, Slice(keybuf));
-      std::vector<std::string> values;
-      std::vector<Status> statuses =
-          db->MultiGet(ReadOptions(), t->state->test->handles_, keys, &values);
-      Status s = statuses[0];
-      // all statuses have to be the same
-      for (size_t i = 1; i < statuses.size(); ++i) {
-        // they are either both ok or both not-found
-        ASSERT_TRUE((s.ok() && statuses[i].ok()) ||
-                    (s.IsNotFound() && statuses[i].IsNotFound()));
-      }
-      if (s.IsNotFound()) {
-        // Key has not yet been written
-      } else {
-        // Check that the writer thread counter is >= the counter in the value
-        ASSERT_OK(s);
-        int unique_id = -1;
-        for (int i = 0; i < kColumnFamilies; ++i) {
-          int k, w, c, cf, u;
-          ASSERT_EQ(5, sscanf(values[i].c_str(), "%d.%d.%d.%d.%d", &k, &w,
-                              &c, &cf, &u))
-              << values[i];
-          ASSERT_EQ(k, key);
-          ASSERT_GE(w, 0);
-          ASSERT_LT(w, kNumThreads);
-          ASSERT_LE(c, t->state->counter[w].load(std::memory_order_acquire));
-          ASSERT_EQ(cf, i);
-          if (i == 0) {
-            unique_id = u;
-          } else {
-            // this checks that updates across column families happened
-            // atomically -- all unique ids are the same
-            ASSERT_EQ(u, unique_id);
+      // record the number and the size of the
+      // latest manifest file
+      if (ParseFileName(files[i].substr(1), &number, &type)) {
+        if (type == kDescriptorFile) {
+          if (number > manifest_number) {
+            manifest_number = number;
+            ASSERT_GE(size, manifest_size);
+            size = manifest_size; // copy only valid MANIFEST data
           }
         }
       }
+      CopyFile(src, dest, size);
     }
-    counter++;
-  }
-  t->state->thread_done[id].store(true, std::memory_order_release);
-  fprintf(stderr, "... stopping thread %d after %d ops\n", id, int(counter));
-}
 
-}  // namespace
+    // release file snapshot
+    dbfull()->EnableFileDeletions();
+    // overwrite one key; this key should not appear in the snapshot
+    std::vector<std::string> extras;
+    for (unsigned int i = 0; i < 1; i++) {
+      extras.push_back(RandomString(&rnd, 100000));
+      ASSERT_OK(Put(0, Key(i), extras[i]));
+    }
 
-class MultiThreadedDBTest : public DBTest,
-                            public ::testing::WithParamInterface<int> {
- public:
-  virtual void SetUp() override { option_config_ = GetParam(); }
+    // verify that data in the snapshot are correct
+    std::vector<ColumnFamilyDescriptor> column_families;
+    column_families.emplace_back("default", ColumnFamilyOptions());
+    column_families.emplace_back("pikachu", ColumnFamilyOptions());
+    std::vector<ColumnFamilyHandle*> cf_handles;
+    DB* snapdb;
+    DBOptions opts;
+    opts.env = env_;
+    opts.create_if_missing = false;
+    Status stat =
+        DB::Open(opts, snapdir, column_families, &cf_handles, &snapdb);
+    ASSERT_OK(stat);
 
-  static std::vector<int> GenerateOptionConfigs() {
-    std::vector<int> optionConfigs;
-    for (int optionConfig = kDefault; optionConfig < kEnd; ++optionConfig) {
-      // skip as HashCuckooRep does not support snapshot
-      if (optionConfig != kHashCuckoo) {
-        optionConfigs.push_back(optionConfig);
+    ReadOptions roptions;
+    std::string val;
+    for (unsigned int i = 0; i < 80; i++) {
+      stat = snapdb->Get(roptions, cf_handles[i < 40], Key(i), &val);
+      ASSERT_EQ(values[i].compare(val), 0);
+    }
+    for (auto cfh : cf_handles) {
+      delete cfh;
+    }
+    delete snapdb;
+
+    // look at the new live files after we added an 'extra' key
+    // and after we took the first snapshot.
+    uint64_t new_manifest_number = 0;
+    uint64_t new_manifest_size = 0;
+    std::vector<std::string> newfiles;
+    dbfull()->DisableFileDeletions();
+    dbfull()->GetLiveFiles(newfiles, &new_manifest_size);
+
+    // Find the new manifest file. Assert that this manifest file is
+    // the same one as in the previous snapshot, but its size should be
+    // larger because we added an extra key after taking the
+    // previous snapshot.
+    for (unsigned int i = 0; i < newfiles.size(); i++) {
+      std::string src = dbname_ + "/" + newfiles[i];
+      // record the number and the size of the
+      // latest manifest file
+      if (ParseFileName(newfiles[i].substr(1), &number, &type)) {
+        if (type == kDescriptorFile) {
+          if (number > new_manifest_number) {
+            uint64_t size;
+            new_manifest_number = number;
+            ASSERT_OK(env_->GetFileSize(src, &size));
+            ASSERT_GE(size, new_manifest_size);
+          }
+        }
       }
     }
-    return optionConfigs;
-  }
-};
+    ASSERT_EQ(manifest_number, new_manifest_number);
+    ASSERT_GT(new_manifest_size, manifest_size);
 
-TEST_P(MultiThreadedDBTest, MultiThreaded) {
+    // release file snapshot
+    dbfull()->EnableFileDeletions();
+  } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTest, CompactOnFlush) {
   anon::OptionsOverride options_override;
   options_override.skip_policy = kSkipNoSnapshot;
-  std::vector<std::string> cfs;
-  for (int i = 1; i < kColumnFamilies; ++i) {
-    cfs.push_back(ToString(i));
-  }
-  CreateAndReopenWithCF(cfs, CurrentOptions(options_override));
-  // Initialize state
-  MTState mt;
-  mt.test = this;
-  mt.stop.store(false, std::memory_order_release);
-  for (int id = 0; id < kNumThreads; id++) {
-    mt.counter[id].store(0, std::memory_order_release);
-    mt.thread_done[id].store(false, std::memory_order_release);
-  }
+  do {
+    Options options = CurrentOptions(options_override);
+    options.disable_auto_compactions = true;
+    CreateAndReopenWithCF({"pikachu"}, options);
 
-  // Start threads
-  MTThread thread[kNumThreads];
-  for (int id = 0; id < kNumThreads; id++) {
-    thread[id].state = &mt;
-    thread[id].id = id;
-    env_->StartThread(MTThreadBody, &thread[id]);
-  }
+    Put(1, "foo", "v1");
+    ASSERT_OK(Flush(1));
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ v1 ]");
 
-  // Let them run for a while
-  env_->SleepForMicroseconds(kTestSeconds * 1000000);
+    // Write two new keys
+    Put(1, "a", "begin");
+    Put(1, "z", "end");
+    Flush(1);
 
-  // Stop the threads and wait for them to finish
-  mt.stop.store(true, std::memory_order_release);
-  for (int id = 0; id < kNumThreads; id++) {
-    while (mt.thread_done[id].load(std::memory_order_acquire) == false) {
-      env_->SleepForMicroseconds(100000);
-    }
-  }
-}
+    // Case 1: Delete followed by a Put
+    Delete(1, "foo");
+    Put(1, "foo", "v2");
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, DEL, v1 ]");
 
-INSTANTIATE_TEST_CASE_P(
-    MultiThreaded, MultiThreadedDBTest,
-    ::testing::ValuesIn(MultiThreadedDBTest::GenerateOptionConfigs()));
+    // After the current memtable is flushed, the DEL should
+    // have been removed
+    ASSERT_OK(Flush(1));
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, v1 ]");
 
-// Group commit test:
-namespace {
+    dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr,
+                           nullptr);
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2 ]");
 
-static const int kGCNumThreads = 4;
-static const int kGCNumKeys = 1000;
+    // Case 2: Delete followed by another delete
+    Delete(1, "foo");
+    Delete(1, "foo");
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, DEL, v2 ]");
+    ASSERT_OK(Flush(1));
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v2 ]");
+    dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr,
+                           nullptr);
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]");
 
-struct GCThread {
-  DB* db;
-  int id;
-  std::atomic<bool> done;
-};
+    // Case 3: Put followed by a delete
+    Put(1, "foo", "v3");
+    Delete(1, "foo");
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v3 ]");
+    ASSERT_OK(Flush(1));
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL ]");
+    dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr,
+                           nullptr);
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]");
 
-static void GCThreadBody(void* arg) {
-  GCThread* t = reinterpret_cast<GCThread*>(arg);
-  int id = t->id;
-  DB* db = t->db;
-  WriteOptions wo;
+    // Case 4: Put followed by another Put
+    Put(1, "foo", "v4");
+    Put(1, "foo", "v5");
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ v5, v4 ]");
+    ASSERT_OK(Flush(1));
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ v5 ]");
+    dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr,
+                           nullptr);
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ v5 ]");
 
-  for (int i = 0; i < kGCNumKeys; ++i) {
-    std::string kv(ToString(i + id * kGCNumKeys));
-    ASSERT_OK(db->Put(wo, kv, kv));
-  }
-  t->done = true;
-}
+    // clear database
+    Delete(1, "foo");
+    dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr,
+                           nullptr);
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]");
 
-}  // namespace
+    // Case 5: Put followed by snapshot followed by another Put
+    // Both puts should remain.
+    Put(1, "foo", "v6");
+    const Snapshot* snapshot = db_->GetSnapshot();
+    Put(1, "foo", "v7");
+    ASSERT_OK(Flush(1));
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ v7, v6 ]");
+    db_->ReleaseSnapshot(snapshot);
 
-TEST_F(DBTest, GroupCommitTest) {
-  do {
-    Options options = CurrentOptions();
-    options.env = env_;
-    env_->log_write_slowdown_.store(100);
-    options.statistics = rocksdb::CreateDBStatistics();
-    Reopen(options);
+    // clear database
+    Delete(1, "foo");
+    dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr,
+                           nullptr);
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]");
 
-    // Start threads
-    GCThread thread[kGCNumThreads];
-    for (int id = 0; id < kGCNumThreads; id++) {
-      thread[id].id = id;
-      thread[id].db = db_;
-      thread[id].done = false;
-      env_->StartThread(GCThreadBody, &thread[id]);
-    }
+    // Case 6: Snapshot followed by a Put followed by another Put
+    // Only the last put should remain.
+    const Snapshot* snapshot1 = db_->GetSnapshot();
+    Put(1, "foo", "v8");
+    Put(1, "foo", "v9");
+    ASSERT_OK(Flush(1));
+    ASSERT_EQ(AllEntriesFor("foo", 1), "[ v9 ]");
+    db_->ReleaseSnapshot(snapshot1);
+  } while (ChangeCompactOptions());
+}
 
-    for (int id = 0; id < kGCNumThreads; id++) {
-      while (thread[id].done == false) {
-        env_->SleepForMicroseconds(100000);
+namespace {
+std::vector<std::uint64_t> ListSpecificFiles(
+    Env* env, const std::string& path, const FileType expected_file_type) {
+  std::vector<std::string> files;
+  std::vector<uint64_t> file_numbers;
+  env->GetChildren(path, &files);
+  uint64_t number;
+  FileType type;
+  for (size_t i = 0; i < files.size(); ++i) {
+    if (ParseFileName(files[i], &number, &type)) {
+      if (type == expected_file_type) {
+        file_numbers.push_back(number);
       }
     }
-    env_->log_write_slowdown_.store(0);
+  }
+  return file_numbers;
+}
 
-    ASSERT_GT(TestGetTickerCount(options, WRITE_DONE_BY_OTHER), 0);
+std::vector<std::uint64_t> ListTableFiles(Env* env, const std::string& path) {
+  return ListSpecificFiles(env, path, kTableFile);
+}
+}  // namespace
 
-    std::vector<std::string> expected_db;
-    for (int i = 0; i < kGCNumThreads * kGCNumKeys; ++i) {
-      expected_db.push_back(ToString(i));
-    }
-    sort(expected_db.begin(), expected_db.end());
+TEST_F(DBTest, FlushOneColumnFamily) {
+  Options options = CurrentOptions();
+  CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia", "nikitich",
+                         "alyosha", "popovich"},
+                        options);
 
-    Iterator* itr = db_->NewIterator(ReadOptions());
-    itr->SeekToFirst();
-    for (auto x : expected_db) {
-      ASSERT_TRUE(itr->Valid());
-      ASSERT_EQ(itr->key().ToString(), x);
-      ASSERT_EQ(itr->value().ToString(), x);
-      itr->Next();
-    }
-    ASSERT_TRUE(!itr->Valid());
-    delete itr;
+  ASSERT_OK(Put(0, "Default", "Default"));
+  ASSERT_OK(Put(1, "pikachu", "pikachu"));
+  ASSERT_OK(Put(2, "ilya", "ilya"));
+  ASSERT_OK(Put(3, "muromec", "muromec"));
+  ASSERT_OK(Put(4, "dobrynia", "dobrynia"));
+  ASSERT_OK(Put(5, "nikitich", "nikitich"));
+  ASSERT_OK(Put(6, "alyosha", "alyosha"));
+  ASSERT_OK(Put(7, "popovich", "popovich"));
 
-  } while (ChangeOptions(kSkipNoSeekToLast));
+  for (int i = 0; i < 8; ++i) {
+    Flush(i);
+    auto tables = ListTableFiles(env_, dbname_);
+    ASSERT_EQ(tables.size(), i + 1U);
+  }
 }
 
-namespace {
-typedef std::map<std::string, std::string> KVMap;
+// In https://reviews.facebook.net/D20661 we changed the recovery behavior:
+// previously, for each log file, each column family's memtable was flushed
+// even if it was empty. Now we try to create the smallest number of table
+// files by merging updates from multiple logs
+TEST_F(DBTest, RecoverCheckFileAmountWithSmallWriteBuffer) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 5000000;
+  CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, options);
+
+  // Since we will reopen the DB with a smaller write_buffer_size,
+  // each key will go to a new SST file
+  ASSERT_OK(Put(1, Key(10), DummyString(1000000)));
+  ASSERT_OK(Put(1, Key(10), DummyString(1000000)));
+  ASSERT_OK(Put(1, Key(10), DummyString(1000000)));
+  ASSERT_OK(Put(1, Key(10), DummyString(1000000)));
+
+  ASSERT_OK(Put(3, Key(10), DummyString(1)));
+  // Make 'dobrynia' flush and a new WAL file be created
+  ASSERT_OK(Put(2, Key(10), DummyString(7500000)));
+  ASSERT_OK(Put(2, Key(1), DummyString(1)));
+  dbfull()->TEST_WaitForFlushMemTable(handles_[2]);
+  {
+    auto tables = ListTableFiles(env_, dbname_);
+    ASSERT_EQ(tables.size(), static_cast<size_t>(1));
+    // Make sure 'dobrynia' was flushed: check the SST file count
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+              static_cast<uint64_t>(1));
+  }
+  // New WAL file
+  ASSERT_OK(Put(1, Key(1), DummyString(1)));
+  ASSERT_OK(Put(1, Key(1), DummyString(1)));
+  ASSERT_OK(Put(3, Key(10), DummyString(1)));
+  ASSERT_OK(Put(3, Key(10), DummyString(1)));
+  ASSERT_OK(Put(3, Key(10), DummyString(1)));
+
+  options.write_buffer_size = 4096;
+  options.arena_block_size = 4096;
+  ReopenWithColumnFamilies({"default", "pikachu", "dobrynia", "nikitich"},
+                           options);
+  {
+    // No inserts => default is empty
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+              static_cast<uint64_t>(0));
+    // The first 4 keys go to separate SSTs + 1 more SST for the 2 smaller keys
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+              static_cast<uint64_t>(5));
+    // 1 SST for big key + 1 SST for small one
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+              static_cast<uint64_t>(2));
+    // 1 SST for all keys
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+              static_cast<uint64_t>(1));
+  }
 }
 
-class ModelDB: public DB {
- public:
-  class ModelSnapshot : public Snapshot {
-   public:
-    KVMap map_;
+// In https://reviews.facebook.net/D20661 we changed the recovery behavior:
+// previously, for each log file, each column family's memtable was flushed
+// even if it was empty. Now we try to create the smallest number of table
+// files by merging updates from multiple logs
+TEST_F(DBTest, RecoverCheckFileAmount) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 100000;
+  options.arena_block_size = 4 * 1024;
+  CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, options);
 
-    virtual SequenceNumber GetSequenceNumber() const override {
-      // no need to call this
-      assert(false);
-      return 0;
-    }
-  };
+  ASSERT_OK(Put(0, Key(1), DummyString(1)));
+  ASSERT_OK(Put(1, Key(1), DummyString(1)));
+  ASSERT_OK(Put(2, Key(1), DummyString(1)));
 
-  explicit ModelDB(const Options& options) : options_(options) {}
-  using DB::Put;
-  virtual Status Put(const WriteOptions& o, ColumnFamilyHandle* cf,
-                     const Slice& k, const Slice& v) override {
-    WriteBatch batch;
-    batch.Put(cf, k, v);
-    return Write(o, &batch);
+  // Make the 'nikitich' memtable flush
+  ASSERT_OK(Put(3, Key(10), DummyString(1002400)));
+  ASSERT_OK(Put(3, Key(1), DummyString(1)));
+  dbfull()->TEST_WaitForFlushMemTable(handles_[3]);
+  // 4 memtables are not flushed; 1 SST file exists
+  {
+    auto tables = ListTableFiles(env_, dbname_);
+    ASSERT_EQ(tables.size(), static_cast<size_t>(1));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+              static_cast<uint64_t>(1));
   }
-  using DB::Merge;
-  virtual Status Merge(const WriteOptions& o, ColumnFamilyHandle* cf,
-                       const Slice& k, const Slice& v) override {
-    WriteBatch batch;
-    batch.Merge(cf, k, v);
-    return Write(o, &batch);
+  // The memtable for 'nikitich' has been flushed and a new WAL file has
+  // been opened; 4 memtables are still not flushed
+
+  // Write to new WAL file
+  ASSERT_OK(Put(0, Key(1), DummyString(1)));
+  ASSERT_OK(Put(1, Key(1), DummyString(1)));
+  ASSERT_OK(Put(2, Key(1), DummyString(1)));
+
+  // Fill up 'nikitich' one more time
+  ASSERT_OK(Put(3, Key(10), DummyString(1002400)));
+  // make it flush
+  ASSERT_OK(Put(3, Key(1), DummyString(1)));
+  dbfull()->TEST_WaitForFlushMemTable(handles_[3]);
+  // There are still 4 memtables not flushed, and 2 SST files
+  ASSERT_OK(Put(0, Key(1), DummyString(1)));
+  ASSERT_OK(Put(1, Key(1), DummyString(1)));
+  ASSERT_OK(Put(2, Key(1), DummyString(1)));
+
+  {
+    auto tables = ListTableFiles(env_, dbname_);
+    ASSERT_EQ(tables.size(), static_cast<size_t>(2));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+              static_cast<uint64_t>(2));
   }
-  using DB::Delete;
-  virtual Status Delete(const WriteOptions& o, ColumnFamilyHandle* cf,
-                        const Slice& key) override {
-    WriteBatch batch;
-    batch.Delete(cf, key);
-    return Write(o, &batch);
+
+  ReopenWithColumnFamilies({"default", "pikachu", "dobrynia", "nikitich"},
+                           options);
+  {
+    std::vector<uint64_t> table_files = ListTableFiles(env_, dbname_);
+    // Check that the records for 'default', 'dobrynia' and 'pikachu' from
+    // the first, second and third WALs went into a single SST per family.
+    // So there are 6 SSTs: three for 'nikitich', one for 'default', one for
+    // 'dobrynia', and one for 'pikachu'
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+              static_cast<uint64_t>(1));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+              static_cast<uint64_t>(3));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+              static_cast<uint64_t>(1));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+              static_cast<uint64_t>(1));
   }
-  using DB::Get;
-  virtual Status Get(const ReadOptions& options, ColumnFamilyHandle* cf,
-                     const Slice& key, std::string* value) override {
-    return Status::NotSupported(key);
+}
+
+TEST_F(DBTest, SharedWriteBuffer) {
+  Options options = CurrentOptions();
+  options.db_write_buffer_size = 100000;  // this is the real limit
+  options.write_buffer_size    = 500000;  // this is never hit
+  CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, options);
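+  // db_write_buffer_size caps the combined memory of all column families'
+  // memtables; once the total crosses it, memtables are flushed until usage
+  // drops back under the cap, which is what the per-CF SST counts below
+  // track.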
+
+  // Trigger a flush on every CF
+  ASSERT_OK(Put(0, Key(1), DummyString(1)));
+  ASSERT_OK(Put(1, Key(1), DummyString(1)));
+  ASSERT_OK(Put(3, Key(1), DummyString(90000)));
+  ASSERT_OK(Put(2, Key(2), DummyString(20000)));
+  ASSERT_OK(Put(2, Key(1), DummyString(1)));
+  dbfull()->TEST_WaitForFlushMemTable(handles_[0]);
+  dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+  dbfull()->TEST_WaitForFlushMemTable(handles_[2]);
+  dbfull()->TEST_WaitForFlushMemTable(handles_[3]);
+  {
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+              static_cast<uint64_t>(1));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+              static_cast<uint64_t>(1));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+              static_cast<uint64_t>(1));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+              static_cast<uint64_t>(1));
   }
 
-  using DB::MultiGet;
-  virtual std::vector<Status> MultiGet(
-      const ReadOptions& options,
-      const std::vector<ColumnFamilyHandle*>& column_family,
-      const std::vector<Slice>& keys,
-      std::vector<std::string>* values) override {
-    std::vector<Status> s(keys.size(),
-                          Status::NotSupported("Not implemented."));
-    return s;
+  // Flush 'dobrynia' and 'nikitich'
+  ASSERT_OK(Put(2, Key(2), DummyString(50000)));
+  ASSERT_OK(Put(3, Key(2), DummyString(40000)));
+  ASSERT_OK(Put(2, Key(3), DummyString(20000)));
+  ASSERT_OK(Put(3, Key(2), DummyString(40000)));
+  dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+  dbfull()->TEST_WaitForFlushMemTable(handles_[2]);
+  dbfull()->TEST_WaitForFlushMemTable(handles_[3]);
+  {
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+              static_cast<uint64_t>(1));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+              static_cast<uint64_t>(1));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+              static_cast<uint64_t>(2));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+              static_cast<uint64_t>(2));
   }
 
-  using DB::GetPropertiesOfAllTables;
-  virtual Status GetPropertiesOfAllTables(
-      ColumnFamilyHandle* column_family,
-      TablePropertiesCollection* props) override {
-    return Status();
+  // Make 'dobrynia' and 'nikitich' both take up 40% of space
+  // When 'pikachu' puts us over 100%, all 3 flush.
+  ASSERT_OK(Put(2, Key(2), DummyString(40000)));
+  ASSERT_OK(Put(1, Key(2), DummyString(20000)));
+  ASSERT_OK(Put(0, Key(1), DummyString(1)));
+  dbfull()->TEST_WaitForFlushMemTable(handles_[2]);
+  dbfull()->TEST_WaitForFlushMemTable(handles_[3]);
+  {
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+              static_cast<uint64_t>(1));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+              static_cast<uint64_t>(2));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+              static_cast<uint64_t>(3));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+              static_cast<uint64_t>(3));
   }
 
-  using DB::KeyMayExist;
-  virtual bool KeyMayExist(const ReadOptions& options,
-                           ColumnFamilyHandle* column_family, const Slice& key,
-                           std::string* value,
-                           bool* value_found = nullptr) override {
-    if (value_found != nullptr) {
-      *value_found = false;
-    }
-    return true; // Not Supported directly
+  // A few remaining writes so that 'default' and 'nikitich' flush on close.
+  ASSERT_OK(Put(3, Key(1), DummyString(1)));
+  ReopenWithColumnFamilies({"default", "pikachu", "dobrynia", "nikitich"},
+                           options);
+  {
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+              static_cast<uint64_t>(2));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+              static_cast<uint64_t>(2));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+              static_cast<uint64_t>(3));
+    ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+              static_cast<uint64_t>(4));
   }
-  using DB::NewIterator;
-  virtual Iterator* NewIterator(const ReadOptions& options,
-                                ColumnFamilyHandle* column_family) override {
-    if (options.snapshot == nullptr) {
-      KVMap* saved = new KVMap;
-      *saved = map_;
-      return new ModelIter(saved, true);
+}
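+
+// A minimal sketch (not exercised by the test above) of how an application
+// would rely on db_write_buffer_size to cap total memtable memory across all
+// column families; the path and sizes here are illustrative assumptions.
+static void ExampleSharedWriteBufferConfig() {
+  Options options;
+  options.create_if_missing = true;
+  options.db_write_buffer_size = 64 << 20;  // shared cap across all CFs
+  options.write_buffer_size = 256 << 20;    // per-CF limit, rarely reached
+  DB* db = nullptr;
+  Status s = DB::Open(options, "/tmp/shared_write_buffer_example", &db);
+  if (s.ok()) {
+    delete db;
+  }
+}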
+
+TEST_F(DBTest, PurgeInfoLogs) {
+  Options options = CurrentOptions();
+  options.keep_log_file_num = 5;
+  options.create_if_missing = true;
+  for (int mode = 0; mode <= 1; mode++) {
+    if (mode == 1) {
+      options.db_log_dir = dbname_ + "_logs";
+      env_->CreateDirIfMissing(options.db_log_dir);
     } else {
-      const KVMap* snapshot_state =
-          &(reinterpret_cast<const ModelSnapshot*>(options.snapshot)->map_);
-      return new ModelIter(snapshot_state, false);
+      options.db_log_dir = "";
+    }
+    for (int i = 0; i < 8; i++) {
+      Reopen(options);
     }
-  }
-  virtual Status NewIterators(
-      const ReadOptions& options,
-      const std::vector<ColumnFamilyHandle*>& column_family,
-      std::vector<Iterator*>* iterators) override {
-    return Status::NotSupported("Not supported yet");
-  }
-  virtual const Snapshot* GetSnapshot() override {
-    ModelSnapshot* snapshot = new ModelSnapshot;
-    snapshot->map_ = map_;
-    return snapshot;
-  }
-
-  virtual void ReleaseSnapshot(const Snapshot* snapshot) override {
-    delete reinterpret_cast<const ModelSnapshot*>(snapshot);
-  }
 
-  virtual Status Write(const WriteOptions& options,
-                       WriteBatch* batch) override {
-    class Handler : public WriteBatch::Handler {
-     public:
-      KVMap* map_;
-      virtual void Put(const Slice& key, const Slice& value) override {
-        (*map_)[key.ToString()] = value.ToString();
-      }
-      virtual void Merge(const Slice& key, const Slice& value) override {
-        // ignore merge for now
-        //(*map_)[key.ToString()] = value.ToString();
-      }
-      virtual void Delete(const Slice& key) override {
-        map_->erase(key.ToString());
+    std::vector<std::string> files;
+    env_->GetChildren(options.db_log_dir.empty() ? dbname_ : options.db_log_dir,
+                      &files);
+    int info_log_count = 0;
+    for (std::string file : files) {
+      if (file.find("LOG") != std::string::npos) {
+        info_log_count++;
       }
-    };
-    Handler handler;
-    handler.map_ = &map_;
-    return batch->Iterate(&handler);
-  }
+    }
+    ASSERT_EQ(5, info_log_count);
 
-  using DB::GetProperty;
-  virtual bool GetProperty(ColumnFamilyHandle* column_family,
-                           const Slice& property, std::string* value) override {
-    return false;
-  }
-  using DB::GetIntProperty;
-  virtual bool GetIntProperty(ColumnFamilyHandle* column_family,
-                              const Slice& property, uint64_t* value) override {
-    return false;
-  }
-  using DB::GetApproximateSizes;
-  virtual void GetApproximateSizes(ColumnFamilyHandle* column_family,
-                                   const Range* range, int n,
-                                   uint64_t* sizes) override {
-    for (int i = 0; i < n; i++) {
-      sizes[i] = 0;
+    Destroy(options);
+    // For mode 0, DestroyDB() should have deleted all the info logs under
+    // the DB dir. For mode 1, no info log file should have been put under
+    // the DB dir in the first place.
+    std::vector<std::string> db_files;
+    env_->GetChildren(dbname_, &db_files);
+    for (std::string file : db_files) {
+      ASSERT_TRUE(file.find("LOG") == std::string::npos);
     }
-  }
-  using DB::CompactRange;
-  virtual Status CompactRange(ColumnFamilyHandle* column_family,
-                              const Slice* start, const Slice* end,
-                              bool reduce_level, int target_level,
-                              uint32_t output_path_id) override {
-    return Status::NotSupported("Not supported operation.");
-  }
 
-  using DB::CompactFiles;
-  virtual Status CompactFiles(
-      const CompactionOptions& compact_options,
-      ColumnFamilyHandle* column_family,
-      const std::vector<std::string>& input_file_names,
-      const int output_level, const int output_path_id = -1) override {
-    return Status::NotSupported("Not supported operation.");
+    if (mode == 1) {
+      // Cleaning up
+      env_->GetChildren(options.db_log_dir, &files);
+      for (std::string file : files) {
+        env_->DeleteFile(options.db_log_dir + "/" + file);
+      }
+      env_->DeleteDir(options.db_log_dir);
+    }
   }
+}
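+
+// A minimal sketch of the two knobs the test above exercises:
+// keep_log_file_num bounds how many rotated info logs survive, and db_log_dir
+// moves them out of the DB directory. The paths are illustrative assumptions.
+static void ExampleInfoLogConfig() {
+  Options options;
+  options.create_if_missing = true;
+  options.keep_log_file_num = 5;                     // retain at most 5 LOG files
+  options.db_log_dir = "/tmp/rocksdb_example_logs";  // keep LOGs out of the DB dir
+  options.env->CreateDirIfMissing(options.db_log_dir);
+  DB* db = nullptr;
+  if (DB::Open(options, "/tmp/info_log_example", &db).ok()) {
+    delete db;
+  }
+}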
 
-  using DB::NumberLevels;
-  virtual int NumberLevels(ColumnFamilyHandle* column_family) override {
-    return 1;
-  }
+TEST_F(DBTest, SyncMultipleLogs) {
+  const uint64_t kNumBatches = 2;
+  const int kBatchSize = 1000;
 
-  using DB::MaxMemCompactionLevel;
-  virtual int MaxMemCompactionLevel(
-      ColumnFamilyHandle* column_family) override {
-    return 1;
-  }
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.write_buffer_size = 4096;
+  Reopen(options);
 
-  using DB::Level0StopWriteTrigger;
-  virtual int Level0StopWriteTrigger(
-      ColumnFamilyHandle* column_family) override {
-    return -1;
+  WriteBatch batch;
+  WriteOptions wo;
+  wo.sync = true;
+
+  for (uint64_t b = 0; b < kNumBatches; b++) {
+    batch.Clear();
+    for (int i = 0; i < kBatchSize; i++) {
+      batch.Put(Key(i), DummyString(128));
+    }
+
+    dbfull()->Write(wo, &batch);
   }
 
-  virtual const std::string& GetName() const override { return name_; }
+  ASSERT_OK(dbfull()->SyncWAL());
+}
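+
+// A minimal sketch of the two durability paths the test above combines:
+// syncing on every write via WriteOptions::sync, versus issuing unsynced
+// writes and making them durable with a single SyncWAL() call. The keys and
+// values are illustrative.
+static void ExampleWalSyncPaths(DB* db) {
+  WriteOptions synced;
+  synced.sync = true;           // fsync the WAL before Put() returns
+  db->Put(synced, "k1", "v1");
+
+  WriteOptions unsynced;        // default: no fsync per write
+  db->Put(unsynced, "k2", "v2");
+  db->SyncWAL();                // one sync covers all earlier unsynced writes
+}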
 
-  virtual Env* GetEnv() const override { return nullptr; }
+//
+// Test WAL recovery for the various modes available
+//
+class RecoveryTestHelper {
+ public:
+  // Number of WAL files to generate
+  static const int kWALFilesCount = 10;
+  // Starting number for the WAL file name like 00010.log
+  static const int kWALFileOffset = 10;
+  // Keys to be written per WAL file
+  static const int kKeysPerWALFile = 1024;
+  // Size of the value
+  static const int kValueSize = 10;
+
+  // Create WAL files with values filled in
+  static void FillData(DBTest* test, Options& options, const size_t wal_count,
+                       size_t& count) {
+    DBOptions& db_options = options;
 
-  using DB::GetOptions;
-  virtual const Options& GetOptions(
-      ColumnFamilyHandle* column_family) const override {
-    return options_;
-  }
+    count = 0;
 
-  using DB::GetDBOptions;
-  virtual const DBOptions& GetDBOptions() const override { return options_; }
+    shared_ptr<Cache> table_cache = NewLRUCache(50000, 16);
+    EnvOptions env_options;
+    WriteBuffer write_buffer(db_options.db_write_buffer_size);
 
-  using DB::Flush;
-  virtual Status Flush(const rocksdb::FlushOptions& options,
-                       ColumnFamilyHandle* column_family) override {
-    Status ret;
-    return ret;
-  }
+    unique_ptr<VersionSet> versions;
+    unique_ptr<WalManager> wal_manager;
+    WriteController write_controller;
 
-  virtual Status DisableFileDeletions() override { return Status::OK(); }
-  virtual Status EnableFileDeletions(bool force) override {
-    return Status::OK();
-  }
-  virtual Status GetLiveFiles(std::vector<std::string>&, uint64_t* size,
-                              bool flush_memtable = true) override {
-    return Status::OK();
-  }
+    versions.reset(new VersionSet(test->dbname_, &db_options, env_options,
+                                  table_cache.get(), &write_buffer,
+                                  &write_controller));
 
-  virtual Status GetSortedWalFiles(VectorLogPtr& files) override {
-    return Status::OK();
-  }
+    wal_manager.reset(new WalManager(db_options, env_options));
 
-  virtual Status DeleteFile(std::string name) override { return Status::OK(); }
+    std::unique_ptr<log::Writer> current_log_writer;
 
-  virtual Status GetDbIdentity(std::string& identity) override {
-    return Status::OK();
-  }
+    for (size_t j = kWALFileOffset; j < wal_count + kWALFileOffset; j++) {
+      uint64_t current_log_number = j;
+      std::string fname = LogFileName(test->dbname_, current_log_number);
+      unique_ptr<WritableFile> file;
+      ASSERT_OK(db_options.env->NewWritableFile(fname, &file, env_options));
+      unique_ptr<WritableFileWriter> file_writer(
+          new WritableFileWriter(std::move(file), env_options));
+      current_log_writer.reset(new log::Writer(std::move(file_writer)));
 
-  virtual SequenceNumber GetLatestSequenceNumber() const override { return 0; }
-  virtual Status GetUpdatesSince(
-      rocksdb::SequenceNumber, unique_ptr<rocksdb::TransactionLogIterator>*,
-      const TransactionLogIterator::ReadOptions&
-          read_options = TransactionLogIterator::ReadOptions()) override {
-    return Status::NotSupported("Not supported in Model DB");
+      for (int i = 0; i < kKeysPerWALFile; i++) {
+        std::string key = "key" + ToString(count++);
+        std::string value = test->DummyString(kValueSize);
+        assert(current_log_writer.get() != nullptr);
+        uint64_t seq = versions->LastSequence() + 1;
+        WriteBatch batch;
+        batch.Put(key, value);
+        WriteBatchInternal::SetSequence(&batch, seq);
+        current_log_writer->AddRecord(WriteBatchInternal::Contents(&batch));
+        versions->SetLastSequence(seq);
+      }
+    }
   }
 
-  virtual ColumnFamilyHandle* DefaultColumnFamily() const override {
-    return nullptr;
-  }
+  // Recreate and fill the store with some data
+  static size_t FillData(DBTest* test, Options& options) {
+    options.create_if_missing = true;
+    test->DestroyAndReopen(options);
+    test->Close();
 
-  virtual void GetColumnFamilyMetaData(
-      ColumnFamilyHandle* column_family,
-      ColumnFamilyMetaData* metadata) override {}
+    size_t count = 0;
+    FillData(test, options, kWALFilesCount, count);
+    return count;
+  }
 
- private:
-  class ModelIter: public Iterator {
-   public:
-    ModelIter(const KVMap* map, bool owned)
-        : map_(map), owned_(owned), iter_(map_->end()) {
-    }
-    ~ModelIter() {
-      if (owned_) delete map_;
-    }
-    virtual bool Valid() const override { return iter_ != map_->end(); }
-    virtual void SeekToFirst() override { iter_ = map_->begin(); }
-    virtual void SeekToLast() override {
-      if (map_->empty()) {
-        iter_ = map_->end();
-      } else {
-        iter_ = map_->find(map_->rbegin()->first);
+  // Read back all the keys we wrote and return the number of keys found
+  static size_t GetData(DBTest* test) {
+    size_t count = 0;
+    for (size_t i = 0; i < kWALFilesCount * kKeysPerWALFile; i++) {
+      if (test->Get("key" + ToString(i)) != "NOT_FOUND") {
+        ++count;
       }
     }
-    virtual void Seek(const Slice& k) override {
-      iter_ = map_->lower_bound(k.ToString());
-    }
-    virtual void Next() override { ++iter_; }
-    virtual void Prev() override {
-      if (iter_ == map_->begin()) {
-        iter_ = map_->end();
-        return;
-      }
-      --iter_;
+    return count;
+  }
+
+  // Manually corrupt the specified WAL
+  static void CorruptWAL(DBTest* test, Options& options, const double off,
+                         const double len, const int wal_file_id,
+                         const bool trunc = false) {
+    Env* env = options.env;
+    std::string fname = LogFileName(test->dbname_, wal_file_id);
+    uint64_t size;
+    ASSERT_OK(env->GetFileSize(fname, &size));
+    ASSERT_GT(size, 0);
+#ifdef OS_WIN
+    // The Windows disk cache behaves differently: after a truncate, the
+    // original content is still in the cache because the original handle is
+    // still open. Windows generally prohibits shared access to files; the WAL
+    // does not need it, but we allow it so tests can induce corruption.
+    test->Close();
+#endif
+    if (trunc) {
+      ASSERT_EQ(0, truncate(fname.c_str(), size * off));
+    } else {
+      InduceCorruption(fname, size * off, size * len);
     }
+  }
 
-    virtual Slice key() const override { return iter_->first; }
-    virtual Slice value() const override { return iter_->second; }
-    virtual Status status() const override { return Status::OK(); }
+  // Overwrite the file contents with 'a', starting at offset, for len bytes
+  static void InduceCorruption(const std::string& filename, uint32_t offset,
+                               uint32_t len) {
+    ASSERT_GT(len, 0);
 
-   private:
-    const KVMap* const map_;
-    const bool owned_;  // Do we own map_
-    KVMap::const_iterator iter_;
-  };
-  const Options options_;
-  KVMap map_;
-  std::string name_ = "";
-};
+    int fd = open(filename.c_str(), O_RDWR);
 
-static std::string RandomKey(Random* rnd, int minimum = 0) {
-  int len;
-  do {
-    len = (rnd->OneIn(3)
-           ? 1                // Short sometimes to encourage collisions
-           : (rnd->OneIn(100) ? rnd->Skewed(10) : rnd->Uniform(10)));
-  } while (len < minimum);
-  return test::RandomKey(rnd, len);
-}
+    ASSERT_GT(fd, 0);
+    ASSERT_EQ(offset, lseek(fd, offset, SEEK_SET));
 
-static bool CompareIterators(int step,
-                             DB* model,
-                             DB* db,
-                             const Snapshot* model_snap,
-                             const Snapshot* db_snap) {
-  ReadOptions options;
-  options.snapshot = model_snap;
-  Iterator* miter = model->NewIterator(options);
-  options.snapshot = db_snap;
-  Iterator* dbiter = db->NewIterator(options);
-  bool ok = true;
-  int count = 0;
-  for (miter->SeekToFirst(), dbiter->SeekToFirst();
-       ok && miter->Valid() && dbiter->Valid();
-       miter->Next(), dbiter->Next()) {
-    count++;
-    if (miter->key().compare(dbiter->key()) != 0) {
-      fprintf(stderr, "step %d: Key mismatch: '%s' vs. '%s'\n",
-              step,
-              EscapeString(miter->key()).c_str(),
-              EscapeString(dbiter->key()).c_str());
-      ok = false;
-      break;
-    }
+    void* buf = alloca(len);
+    memset(buf, 'a', len);
+    ASSERT_EQ(len, write(fd, buf, len));
 
-    if (miter->value().compare(dbiter->value()) != 0) {
-      fprintf(stderr, "step %d: Value mismatch for key '%s': '%s' vs. '%s'\n",
-              step,
-              EscapeString(miter->key()).c_str(),
-              EscapeString(miter->value()).c_str(),
-              EscapeString(miter->value()).c_str());
-      ok = false;
-    }
+    close(fd);
   }
+};
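+
+// The four tests below each pick one value of WALRecoveryMode. A minimal
+// sketch of selecting a mode at open time (the path is an illustrative
+// assumption); the enum values match the test names that follow.
+static Status ExampleOpenWithRecoveryMode(WALRecoveryMode mode, DB** db) {
+  Options options;
+  options.create_if_missing = false;
+  options.wal_recovery_mode = mode;  // e.g. WALRecoveryMode::kPointInTimeRecovery
+  return DB::Open(options, "/tmp/recovery_mode_example", db);
+}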
 
-  if (ok) {
-    if (miter->Valid() != dbiter->Valid()) {
-      fprintf(stderr, "step %d: Mismatch at end of iterators: %d vs. %d\n",
-              step, miter->Valid(), dbiter->Valid());
-      ok = false;
+// Test scope:
+// - We expect to open the data store when there are incomplete trailing
+//   writes at the end of any of the logs
+// - We do not expect to open the data store in the presence of any other
+//   corruption
+TEST_F(DBTest, kTolerateCorruptedTailRecords) {
+  const int jstart = RecoveryTestHelper::kWALFileOffset;
+  const int jend = jstart + RecoveryTestHelper::kWALFilesCount;
+
+  for (auto trunc : {true, false}) {        /* Corruption style */
+    for (int i = 0; i < 4; i++) {           /* Corruption offset position */
+      for (int j = jstart; j < jend; j++) { /* WAL file */
+        // Fill data for testing
+        Options options = CurrentOptions();
+        const size_t row_count = RecoveryTestHelper::FillData(this, options);
+        // Corrupt the WAL to exercise checksum or record-parsing failures
+        RecoveryTestHelper::CorruptWAL(this, options, /*off=*/i * .3,
+                                       /*len%=*/.1, /*wal=*/j, trunc);
+
+        if (trunc) {
+          options.wal_recovery_mode =
+              WALRecoveryMode::kTolerateCorruptedTailRecords;
+          options.create_if_missing = false;
+          ASSERT_OK(TryReopen(options));
+          const size_t recovered_row_count = RecoveryTestHelper::GetData(this);
+          ASSERT_TRUE(i == 0 || recovered_row_count > 0);
+          ASSERT_LT(recovered_row_count, row_count);
+        } else {
+          options.wal_recovery_mode =
+              WALRecoveryMode::kTolerateCorruptedTailRecords;
+          ASSERT_NOK(TryReopen(options));
+        }
+      }
     }
   }
-  delete miter;
-  delete dbiter;
-  return ok;
 }
 
-TEST_F(DBTest, Randomized) {
-  anon::OptionsOverride options_override;
-  options_override.skip_policy = kSkipNoSnapshot;
-  Random rnd(test::RandomSeed());
-  do {
-    ModelDB model(CurrentOptions(options_override));
-    const int N = 10000;
-    const Snapshot* model_snap = nullptr;
-    const Snapshot* db_snap = nullptr;
-    std::string k, v;
-    for (int step = 0; step < N; step++) {
-      // TODO(sanjay): Test Get() works
-      int p = rnd.Uniform(100);
-      int minimum = 0;
-      if (option_config_ == kHashSkipList ||
-          option_config_ == kHashLinkList ||
-          option_config_ == kHashCuckoo ||
-          option_config_ == kPlainTableFirstBytePrefix ||
-          option_config_ == kBlockBasedTableWithWholeKeyHashIndex ||
-          option_config_ == kBlockBasedTableWithPrefixHashIndex) {
-        minimum = 1;
+// Test scope:
+// We don't expect the data store to be opened if there is any corruption
+// (leading, middle or trailing -- whether incomplete writes or corrupted data)
+TEST_F(DBTest, kAbsoluteConsistency) {
+  const int jstart = RecoveryTestHelper::kWALFileOffset;
+  const int jend = jstart + RecoveryTestHelper::kWALFilesCount;
+
+  // Verify clean slate behavior
+  Options options = CurrentOptions();
+  const size_t row_count = RecoveryTestHelper::FillData(this, options);
+  options.wal_recovery_mode = WALRecoveryMode::kAbsoluteConsistency;
+  options.create_if_missing = false;
+  ASSERT_OK(TryReopen(options));
+  ASSERT_EQ(RecoveryTestHelper::GetData(this), row_count);
+
+  for (auto trunc : {true, false}) { /* Corruption style */
+    for (int i = 0; i < 4; i++) {    /* Corruption offset position */
+      if (trunc && i == 0) {
+        continue;
       }
-      if (p < 45) {                               // Put
-        k = RandomKey(&rnd, minimum);
-        v = RandomString(&rnd,
-                         rnd.OneIn(20)
-                         ? 100 + rnd.Uniform(100)
-                         : rnd.Uniform(8));
-        ASSERT_OK(model.Put(WriteOptions(), k, v));
-        ASSERT_OK(db_->Put(WriteOptions(), k, v));
 
-      } else if (p < 90) {                        // Delete
-        k = RandomKey(&rnd, minimum);
-        ASSERT_OK(model.Delete(WriteOptions(), k));
-        ASSERT_OK(db_->Delete(WriteOptions(), k));
+      for (int j = jstart; j < jend; j++) { /* wal files */
+        // Fill with fresh data
+        RecoveryTestHelper::FillData(this, options);
+        // corrupt the wal
+        RecoveryTestHelper::CorruptWAL(this, options, /*off=*/i * .3,
+                                       /*len%=*/.1, j, trunc);
+        // verify
+        options.wal_recovery_mode = WALRecoveryMode::kAbsoluteConsistency;
+        options.create_if_missing = false;
+        ASSERT_NOK(TryReopen(options));
+      }
+    }
+  }
+}
+
+// Test scope:
+// - We expect to open data store under all circumstances
+// - We expect to recover only the data written up to the point where the
+//   first error was encountered
+TEST_F(DBTest, kPointInTimeRecovery) {
+  const int jstart = RecoveryTestHelper::kWALFileOffset;
+  const int jend = jstart + RecoveryTestHelper::kWALFilesCount;
+  const int maxkeys =
+      RecoveryTestHelper::kWALFilesCount * RecoveryTestHelper::kKeysPerWALFile;
 
+  for (auto trunc : {true, false}) {        /* Corruption style */
+    for (int i = 0; i < 4; i++) {           /* Offset of corruption */
+      for (int j = jstart; j < jend; j++) { /* WAL file */
+        // Fill data for testing
+        Options options = CurrentOptions();
+        const size_t row_count = RecoveryTestHelper::FillData(this, options);
 
-      } else {                                    // Multi-element batch
-        WriteBatch b;
-        const int num = rnd.Uniform(8);
-        for (int i = 0; i < num; i++) {
-          if (i == 0 || !rnd.OneIn(10)) {
-            k = RandomKey(&rnd, minimum);
-          } else {
-            // Periodically re-use the same key from the previous iter, so
-            // we have multiple entries in the write batch for the same key
-          }
-          if (rnd.OneIn(2)) {
-            v = RandomString(&rnd, rnd.Uniform(10));
-            b.Put(k, v);
-          } else {
-            b.Delete(k);
+        // Corrupt the wal
+        RecoveryTestHelper::CorruptWAL(this, options, /*off=*/i * .3,
+                                       /*len%=*/.1, j, trunc);
+
+        // Verify
+        options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+        options.create_if_missing = false;
+        ASSERT_OK(TryReopen(options));
+
+        // Probe data for invariants
+        size_t recovered_row_count = RecoveryTestHelper::GetData(this);
+        ASSERT_LT(recovered_row_count, row_count);
+
+        bool expect_data = true;
+        for (size_t k = 0; k < static_cast<size_t>(maxkeys); ++k) {
+          // Probe key k (not i): recovered keys must form a contiguous prefix.
+          bool found = Get("key" + ToString(k)) != "NOT_FOUND";
+          if (expect_data && !found) {
+            expect_data = false;
           }
+          ASSERT_EQ(found, expect_data);
         }
-        ASSERT_OK(model.Write(WriteOptions(), &b));
-        ASSERT_OK(db_->Write(WriteOptions(), &b));
-      }
 
-      if ((step % 100) == 0) {
-        // For DB instances that use the hash index + block-based table, the
-        // iterator will be invalid right when seeking a non-existent key, right
-        // than return a key that is close to it.
-        if (option_config_ != kBlockBasedTableWithWholeKeyHashIndex &&
-            option_config_ != kBlockBasedTableWithPrefixHashIndex) {
-          ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr));
-          ASSERT_TRUE(CompareIterators(step, &model, db_, model_snap, db_snap));
+        const size_t min = RecoveryTestHelper::kKeysPerWALFile *
+                           (j - RecoveryTestHelper::kWALFileOffset);
+        ASSERT_GE(recovered_row_count, min);
+        if (!trunc && i != 0) {
+          const size_t max = RecoveryTestHelper::kKeysPerWALFile *
+                             (j - RecoveryTestHelper::kWALFileOffset + 1);
+          ASSERT_LE(recovered_row_count, max);
         }
+      }
+    }
+  }
+}
 
-        // Save a snapshot from each DB this time that we'll use next
-        // time we compare things, to make sure the current state is
-        // preserved with the snapshot
-        if (model_snap != nullptr) model.ReleaseSnapshot(model_snap);
-        if (db_snap != nullptr) db_->ReleaseSnapshot(db_snap);
+// Test scope:
+// - We expect to open the data store under all scenarios
+// - We expect to have recovered records past the corruption zone
+TEST_F(DBTest, kSkipAnyCorruptedRecords) {
+  const int jstart = RecoveryTestHelper::kWALFileOffset;
+  const int jend = jstart + RecoveryTestHelper::kWALFilesCount;
 
+  for (auto trunc : {true, false}) {        /* Corruption style */
+    for (int i = 0; i < 4; i++) {           /* Corruption offset */
+      for (int j = jstart; j < jend; j++) { /* wal files */
+        // Fill data for testing
+        Options options = CurrentOptions();
+        const size_t row_count = RecoveryTestHelper::FillData(this, options);
 
-        auto options = CurrentOptions(options_override);
-        Reopen(options);
-        ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr));
+        // Corrupt the WAL
+        RecoveryTestHelper::CorruptWAL(this, options, /*off=*/i * .3,
+                                       /*len%=*/.1, j, trunc);
 
-        model_snap = model.GetSnapshot();
-        db_snap = db_->GetSnapshot();
-      }
+        // Verify behavior
+        options.wal_recovery_mode = WALRecoveryMode::kSkipAnyCorruptedRecords;
+        options.create_if_missing = false;
+        ASSERT_OK(TryReopen(options));
 
-      if ((step % 2000) == 0) {
-        fprintf(stderr,
-                "DBTest.Randomized, option ID: %d, step: %d out of %d\n",
-                option_config_, step, N);
+        // Probe data for invariants
+        size_t recovered_row_count = RecoveryTestHelper::GetData(this);
+        ASSERT_LT(recovered_row_count, row_count);
+
+        if (!trunc) {
+          ASSERT_TRUE(i != 0 || recovered_row_count > 0);
+        }
       }
     }
-    if (model_snap != nullptr) model.ReleaseSnapshot(model_snap);
-    if (db_snap != nullptr) db_->ReleaseSnapshot(db_snap);
-    // skip cuckoo hash as it does not support snapshot.
-  } while (ChangeOptions(kSkipDeletesFilterFirst | kSkipNoSeekToLast |
-                         kSkipHashCuckoo));
+  }
 }
 
-TEST_F(DBTest, MultiGetSimple) {
-  do {
-    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
-    ASSERT_OK(Put(1, "k1", "v1"));
-    ASSERT_OK(Put(1, "k2", "v2"));
-    ASSERT_OK(Put(1, "k3", "v3"));
-    ASSERT_OK(Put(1, "k4", "v4"));
-    ASSERT_OK(Delete(1, "k4"));
-    ASSERT_OK(Put(1, "k5", "v5"));
-    ASSERT_OK(Delete(1, "no_key"));
 
-    std::vector<Slice> keys({"k1", "k2", "k3", "k4", "k5", "no_key"});
+// Multi-threaded test:
+namespace {
 
-    std::vector<std::string> values(20, "Temporary data to be overwritten");
-    std::vector<ColumnFamilyHandle*> cfs(keys.size(), handles_[1]);
+static const int kColumnFamilies = 10;
+static const int kNumThreads = 10;
+static const int kTestSeconds = 10;
+static const int kNumKeys = 1000;
 
-    std::vector<Status> s = db_->MultiGet(ReadOptions(), cfs, keys, &values);
-    ASSERT_EQ(values.size(), keys.size());
-    ASSERT_EQ(values[0], "v1");
-    ASSERT_EQ(values[1], "v2");
-    ASSERT_EQ(values[2], "v3");
-    ASSERT_EQ(values[4], "v5");
+struct MTState {
+  DBTest* test;
+  std::atomic<bool> stop;
+  std::atomic<int> counter[kNumThreads];
+  std::atomic<bool> thread_done[kNumThreads];
+};
 
-    ASSERT_OK(s[0]);
-    ASSERT_OK(s[1]);
-    ASSERT_OK(s[2]);
-    ASSERT_TRUE(s[3].IsNotFound());
-    ASSERT_OK(s[4]);
-    ASSERT_TRUE(s[5].IsNotFound());
-  } while (ChangeCompactOptions());
+struct MTThread {
+  MTState* state;
+  int id;
+};
+
+static void MTThreadBody(void* arg) {
+  MTThread* t = reinterpret_cast<MTThread*>(arg);
+  int id = t->id;
+  DB* db = t->state->test->db_;
+  int counter = 0;
+  fprintf(stderr, "... starting thread %d\n", id);
+  Random rnd(1000 + id);
+  char valbuf[1500];
+  while (t->state->stop.load(std::memory_order_acquire) == false) {
+    t->state->counter[id].store(counter, std::memory_order_release);
+
+    int key = rnd.Uniform(kNumKeys);
+    char keybuf[20];
+    snprintf(keybuf, sizeof(keybuf), "%016d", key);
+
+    if (rnd.OneIn(2)) {
+      // Write values of the form <key, my id, counter, cf, unique_id>
+      // into each of the CFs. We add some padding to force compactions.
+      int unique_id = rnd.Uniform(1000000);
+
+      // Half of the time directly use WriteBatch. Half of the time use
+      // WriteBatchWithIndex.
+      if (rnd.OneIn(2)) {
+        WriteBatch batch;
+        for (int cf = 0; cf < kColumnFamilies; ++cf) {
+          snprintf(valbuf, sizeof(valbuf), "%d.%d.%d.%d.%-1000d", key, id,
+                   static_cast<int>(counter), cf, unique_id);
+          batch.Put(t->state->test->handles_[cf], Slice(keybuf), Slice(valbuf));
+        }
+        ASSERT_OK(db->Write(WriteOptions(), &batch));
+      } else {
+        WriteBatchWithIndex batch(db->GetOptions().comparator);
+        for (int cf = 0; cf < kColumnFamilies; ++cf) {
+          snprintf(valbuf, sizeof(valbuf), "%d.%d.%d.%d.%-1000d", key, id,
+                   static_cast<int>(counter), cf, unique_id);
+          batch.Put(t->state->test->handles_[cf], Slice(keybuf), Slice(valbuf));
+        }
+        ASSERT_OK(db->Write(WriteOptions(), batch.GetWriteBatch()));
+      }
+    } else {
+      // Read a value and verify that it matches the pattern written above
+      // and that writes to all column families were atomic (unique_id is the
+      // same)
+      std::vector<Slice> keys(kColumnFamilies, Slice(keybuf));
+      std::vector<std::string> values;
+      std::vector<Status> statuses =
+          db->MultiGet(ReadOptions(), t->state->test->handles_, keys, &values);
+      Status s = statuses[0];
+      // all statuses have to be the same
+      for (size_t i = 1; i < statuses.size(); ++i) {
+        // they are either both ok or both not-found
+        ASSERT_TRUE((s.ok() && statuses[i].ok()) ||
+                    (s.IsNotFound() && statuses[i].IsNotFound()));
+      }
+      if (s.IsNotFound()) {
+        // Key has not yet been written
+      } else {
+        // Check that the writer thread counter is >= the counter in the value
+        ASSERT_OK(s);
+        int unique_id = -1;
+        for (int i = 0; i < kColumnFamilies; ++i) {
+          int k, w, c, cf, u;
+          ASSERT_EQ(5, sscanf(values[i].c_str(), "%d.%d.%d.%d.%d", &k, &w,
+                              &c, &cf, &u))
+              << values[i];
+          ASSERT_EQ(k, key);
+          ASSERT_GE(w, 0);
+          ASSERT_LT(w, kNumThreads);
+          ASSERT_LE(c, t->state->counter[w].load(std::memory_order_acquire));
+          ASSERT_EQ(cf, i);
+          if (i == 0) {
+            unique_id = u;
+          } else {
+            // this checks that updates across column families happened
+            // atomically -- all unique ids are the same
+            ASSERT_EQ(u, unique_id);
+          }
+        }
+      }
+    }
+    counter++;
+  }
+  t->state->thread_done[id].store(true, std::memory_order_release);
+  fprintf(stderr, "... stopping thread %d after %d ops\n", id, int(counter));
 }
 
-TEST_F(DBTest, MultiGetEmpty) {
-  do {
-    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
-    // Empty Key Set
-    std::vector<Slice> keys;
-    std::vector<std::string> values;
-    std::vector<ColumnFamilyHandle*> cfs;
-    std::vector<Status> s = db_->MultiGet(ReadOptions(), cfs, keys, &values);
-    ASSERT_EQ(s.size(), 0U);
+}  // namespace
 
-    // Empty Database, Empty Key Set
-    Options options = CurrentOptions();
-    options.create_if_missing = true;
-    DestroyAndReopen(options);
-    CreateAndReopenWithCF({"pikachu"}, options);
-    s = db_->MultiGet(ReadOptions(), cfs, keys, &values);
-    ASSERT_EQ(s.size(), 0U);
+class MultiThreadedDBTest : public DBTest,
+                            public ::testing::WithParamInterface<int> {
+ public:
+  virtual void SetUp() override { option_config_ = GetParam(); }
 
-    // Empty Database, Search for Keys
-    keys.resize(2);
-    keys[0] = "a";
-    keys[1] = "b";
-    cfs.push_back(handles_[0]);
-    cfs.push_back(handles_[1]);
-    s = db_->MultiGet(ReadOptions(), cfs, keys, &values);
-    ASSERT_EQ((int)s.size(), 2);
-    ASSERT_TRUE(s[0].IsNotFound() && s[1].IsNotFound());
-  } while (ChangeCompactOptions());
+  static std::vector<int> GenerateOptionConfigs() {
+    std::vector<int> optionConfigs;
+    for (int optionConfig = kDefault; optionConfig < kEnd; ++optionConfig) {
+      // skip as HashCuckooRep does not support snapshot
+      if (optionConfig != kHashCuckoo) {
+        optionConfigs.push_back(optionConfig);
+      }
+    }
+    return optionConfigs;
+  }
+};
+
+TEST_P(MultiThreadedDBTest, MultiThreaded) {
+  anon::OptionsOverride options_override;
+  options_override.skip_policy = kSkipNoSnapshot;
+  std::vector<std::string> cfs;
+  for (int i = 1; i < kColumnFamilies; ++i) {
+    cfs.push_back(ToString(i));
+  }
+  CreateAndReopenWithCF(cfs, CurrentOptions(options_override));
+  // Initialize state
+  MTState mt;
+  mt.test = this;
+  mt.stop.store(false, std::memory_order_release);
+  for (int id = 0; id < kNumThreads; id++) {
+    mt.counter[id].store(0, std::memory_order_release);
+    mt.thread_done[id].store(false, std::memory_order_release);
+  }
+
+  // Start threads
+  MTThread thread[kNumThreads];
+  for (int id = 0; id < kNumThreads; id++) {
+    thread[id].state = &mt;
+    thread[id].id = id;
+    env_->StartThread(MTThreadBody, &thread[id]);
+  }
+
+  // Let them run for a while
+  env_->SleepForMicroseconds(kTestSeconds * 1000000);
+
+  // Stop the threads and wait for them to finish
+  mt.stop.store(true, std::memory_order_release);
+  for (int id = 0; id < kNumThreads; id++) {
+    while (mt.thread_done[id].load(std::memory_order_acquire) == false) {
+      env_->SleepForMicroseconds(100000);
+    }
+  }
 }
 
+INSTANTIATE_TEST_CASE_P(
+    MultiThreaded, MultiThreadedDBTest,
+    ::testing::ValuesIn(MultiThreadedDBTest::GenerateOptionConfigs()));
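+
+// A minimal sketch of the atomicity property the reader threads above verify:
+// a single WriteBatch applied across several column families commits
+// all-or-nothing. The key and values are illustrative.
+static Status ExampleAtomicMultiCFWrite(
+    DB* db, const std::vector<ColumnFamilyHandle*>& handles) {
+  WriteBatch batch;
+  for (size_t cf = 0; cf < handles.size(); ++cf) {
+    batch.Put(handles[cf], "key", "value-" + ToString(cf));
+  }
+  return db->Write(WriteOptions(), &batch);  // atomic across all CFs
+}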
+
+// Group commit test:
 namespace {
-void PrefixScanInit(DBTest *dbtest) {
-  char buf[100];
-  std::string keystr;
-  const int small_range_sstfiles = 5;
-  const int big_range_sstfiles = 5;
 
-  // Generate 11 sst files with the following prefix ranges.
-  // GROUP 0: [0,10]                              (level 1)
-  // GROUP 1: [1,2], [2,3], [3,4], [4,5], [5, 6]  (level 0)
-  // GROUP 2: [0,6], [0,7], [0,8], [0,9], [0,10]  (level 0)
-  //
-  // A seek with the previous API would do 11 random I/Os (to all the
-  // files).  With the new API and a prefix filter enabled, we should
-  // only do 2 random I/O, to the 2 files containing the key.
+static const int kGCNumThreads = 4;
+static const int kGCNumKeys = 1000;
 
-  // GROUP 0
-  snprintf(buf, sizeof(buf), "%02d______:start", 0);
-  keystr = std::string(buf);
-  ASSERT_OK(dbtest->Put(keystr, keystr));
-  snprintf(buf, sizeof(buf), "%02d______:end", 10);
-  keystr = std::string(buf);
-  ASSERT_OK(dbtest->Put(keystr, keystr));
-  dbtest->Flush();
-  dbtest->dbfull()->CompactRange(nullptr, nullptr); // move to level 1
+struct GCThread {
+  DB* db;
+  int id;
+  std::atomic<bool> done;
+};
 
-  // GROUP 1
-  for (int i = 1; i <= small_range_sstfiles; i++) {
-    snprintf(buf, sizeof(buf), "%02d______:start", i);
-    keystr = std::string(buf);
-    ASSERT_OK(dbtest->Put(keystr, keystr));
-    snprintf(buf, sizeof(buf), "%02d______:end", i+1);
-    keystr = std::string(buf);
-    ASSERT_OK(dbtest->Put(keystr, keystr));
-    dbtest->Flush();
-  }
+static void GCThreadBody(void* arg) {
+  GCThread* t = reinterpret_cast<GCThread*>(arg);
+  int id = t->id;
+  DB* db = t->db;
+  WriteOptions wo;
 
-  // GROUP 2
-  for (int i = 1; i <= big_range_sstfiles; i++) {
-    snprintf(buf, sizeof(buf), "%02d______:start", 0);
-    keystr = std::string(buf);
-    ASSERT_OK(dbtest->Put(keystr, keystr));
-    snprintf(buf, sizeof(buf), "%02d______:end",
-             small_range_sstfiles+i+1);
-    keystr = std::string(buf);
-    ASSERT_OK(dbtest->Put(keystr, keystr));
-    dbtest->Flush();
+  for (int i = 0; i < kGCNumKeys; ++i) {
+    std::string kv(ToString(i + id * kGCNumKeys));
+    ASSERT_OK(db->Put(wo, kv, kv));
   }
+  t->done = true;
 }
+
 }  // namespace
 
-TEST_F(DBTest, PrefixScan) {
-  XFUNC_TEST("", "dbtest_prefix", prefix_skip1, XFuncPoint::SetSkip,
-             kSkipNoPrefix);
-  while (ChangeFilterOptions()) {
-    int count;
-    Slice prefix;
-    Slice key;
-    char buf[100];
-    Iterator* iter;
-    snprintf(buf, sizeof(buf), "03______:");
-    prefix = Slice(buf, 8);
-    key = Slice(buf, 9);
-    // db configs
-    env_->count_random_reads_ = true;
+TEST_F(DBTest, GroupCommitTest) {
+  do {
     Options options = CurrentOptions();
     options.env = env_;
-    options.prefix_extractor.reset(NewFixedPrefixTransform(8));
-    options.disable_auto_compactions = true;
-    options.max_background_compactions = 2;
-    options.create_if_missing = true;
-    options.memtable_factory.reset(NewHashSkipListRepFactory(16));
+    env_->log_write_slowdown_.store(100);
+    options.statistics = rocksdb::CreateDBStatistics();
+    Reopen(options);
 
-    BlockBasedTableOptions table_options;
-    table_options.no_block_cache = true;
-    table_options.filter_policy.reset(NewBloomFilterPolicy(10));
-    table_options.whole_key_filtering = false;
-    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+    // Start threads
+    GCThread thread[kGCNumThreads];
+    for (int id = 0; id < kGCNumThreads; id++) {
+      thread[id].id = id;
+      thread[id].db = db_;
+      thread[id].done = false;
+      env_->StartThread(GCThreadBody, &thread[id]);
+    }
 
-    // 11 RAND I/Os
-    DestroyAndReopen(options);
-    PrefixScanInit(this);
-    count = 0;
-    env_->random_read_counter_.Reset();
-    iter = db_->NewIterator(ReadOptions());
-    for (iter->Seek(prefix); iter->Valid(); iter->Next()) {
-      if (! iter->key().starts_with(prefix)) {
-        break;
+    for (int id = 0; id < kGCNumThreads; id++) {
+      while (thread[id].done == false) {
+        env_->SleepForMicroseconds(100000);
       }
-      count++;
     }
-    ASSERT_OK(iter->status());
-    delete iter;
-    ASSERT_EQ(count, 2);
-    ASSERT_EQ(env_->random_read_counter_.Read(), 2);
-    Close();
-  }  // end of while
-  XFUNC_TEST("", "dbtest_prefix", prefix_skip1, XFuncPoint::SetSkip, 0);
-}
+    env_->log_write_slowdown_.store(0);
 
-TEST_F(DBTest, TailingIteratorSingle) {
-  ReadOptions read_options;
-  read_options.tailing = true;
+    ASSERT_GT(TestGetTickerCount(options, WRITE_DONE_BY_OTHER), 0);
 
-  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
-  iter->SeekToFirst();
-  ASSERT_TRUE(!iter->Valid());
+    std::vector<std::string> expected_db;
+    for (int i = 0; i < kGCNumThreads * kGCNumKeys; ++i) {
+      expected_db.push_back(ToString(i));
+    }
+    sort(expected_db.begin(), expected_db.end());
 
-  // add a record and check that iter can see it
-  ASSERT_OK(db_->Put(WriteOptions(), "mirko", "fodor"));
-  iter->SeekToFirst();
-  ASSERT_TRUE(iter->Valid());
-  ASSERT_EQ(iter->key().ToString(), "mirko");
+    Iterator* itr = db_->NewIterator(ReadOptions());
+    itr->SeekToFirst();
+    for (auto x : expected_db) {
+      ASSERT_TRUE(itr->Valid());
+      ASSERT_EQ(itr->key().ToString(), x);
+      ASSERT_EQ(itr->value().ToString(), x);
+      itr->Next();
+    }
+    ASSERT_TRUE(!itr->Valid());
+    delete itr;
 
-  iter->Next();
-  ASSERT_TRUE(!iter->Valid());
+    HistogramData hist_data = {0};
+    options.statistics->histogramData(DB_WRITE, &hist_data);
+    ASSERT_GT(hist_data.average, 0.0);
+  } while (ChangeOptions(kSkipNoSeekToLast));
 }
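+
+// The ticker asserted above distinguishes the leader that performed the
+// grouped WAL append (WRITE_DONE_BY_SELF) from writers whose batches were
+// committed on their behalf (WRITE_DONE_BY_OTHER). A minimal sketch of
+// reading both counters, assuming options.statistics was set:
+static void ExampleGroupCommitCounters(const Options& options) {
+  uint64_t by_self = options.statistics->getTickerCount(WRITE_DONE_BY_SELF);
+  uint64_t by_other = options.statistics->getTickerCount(WRITE_DONE_BY_OTHER);
+  fprintf(stderr, "writes led: %llu, writes grouped: %llu\n",
+          static_cast<unsigned long long>(by_self),
+          static_cast<unsigned long long>(by_other));
+}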
 
-TEST_F(DBTest, TailingIteratorKeepAdding) {
-  CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
-  ReadOptions read_options;
-  read_options.tailing = true;
-
-  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options, handles_[1]));
-  std::string value(1024, 'a');
+namespace {
+typedef std::map<std::string, std::string> KVMap;
+}
 
-  const int num_records = 10000;
-  for (int i = 0; i < num_records; ++i) {
-    char buf[32];
-    snprintf(buf, sizeof(buf), "%016d", i);
+class ModelDB: public DB {
+ public:
+  class ModelSnapshot : public Snapshot {
+   public:
+    KVMap map_;
 
-    Slice key(buf, 16);
-    ASSERT_OK(Put(1, key, value));
+    virtual SequenceNumber GetSequenceNumber() const override {
+      // no need to call this
+      assert(false);
+      return 0;
+    }
+  };
 
-    iter->Seek(key);
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ(iter->key().compare(key), 0);
+  explicit ModelDB(const Options& options) : options_(options) {}
+  using DB::Put;
+  virtual Status Put(const WriteOptions& o, ColumnFamilyHandle* cf,
+                     const Slice& k, const Slice& v) override {
+    WriteBatch batch;
+    batch.Put(cf, k, v);
+    return Write(o, &batch);
+  }
+  using DB::Delete;
+  virtual Status Delete(const WriteOptions& o, ColumnFamilyHandle* cf,
+                        const Slice& key) override {
+    WriteBatch batch;
+    batch.Delete(cf, key);
+    return Write(o, &batch);
+  }
+  using DB::SingleDelete;
+  virtual Status SingleDelete(const WriteOptions& o, ColumnFamilyHandle* cf,
+                              const Slice& key) override {
+    WriteBatch batch;
+    batch.SingleDelete(cf, key);
+    return Write(o, &batch);
+  }
+  using DB::Merge;
+  virtual Status Merge(const WriteOptions& o, ColumnFamilyHandle* cf,
+                       const Slice& k, const Slice& v) override {
+    WriteBatch batch;
+    batch.Merge(cf, k, v);
+    return Write(o, &batch);
+  }
+  using DB::Get;
+  virtual Status Get(const ReadOptions& options, ColumnFamilyHandle* cf,
+                     const Slice& key, std::string* value) override {
+    return Status::NotSupported(key);
   }
-}
-
-TEST_F(DBTest, TailingIteratorSeekToNext) {
-  CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
-  ReadOptions read_options;
-  read_options.tailing = true;
 
-  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options, handles_[1]));
-  std::string value(1024, 'a');
+  using DB::MultiGet;
+  virtual std::vector<Status> MultiGet(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_family,
+      const std::vector<Slice>& keys,
+      std::vector<std::string>* values) override {
+    std::vector<Status> s(keys.size(),
+                          Status::NotSupported("Not implemented."));
+    return s;
+  }
 
-  const int num_records = 1000;
-  for (int i = 1; i < num_records; ++i) {
-    char buf1[32];
-    char buf2[32];
-    snprintf(buf1, sizeof(buf1), "00a0%016d", i * 5);
+  using DB::AddFile;
+  virtual Status AddFile(ColumnFamilyHandle* column_family,
+                         const ExternalSstFileInfo* file_path,
+                         bool move_file) override {
+    return Status::NotSupported("Not implemented.");
+  }
+  virtual Status AddFile(ColumnFamilyHandle* column_family,
+                         const std::string& file_path,
+                         bool move_file) override {
+    return Status::NotSupported("Not implemented.");
+  }
 
-    Slice key(buf1, 20);
-    ASSERT_OK(Put(1, key, value));
+  using DB::GetPropertiesOfAllTables;
+  virtual Status GetPropertiesOfAllTables(
+      ColumnFamilyHandle* column_family,
+      TablePropertiesCollection* props) override {
+    return Status();
+  }
 
-    if (i % 100 == 99) {
-      ASSERT_OK(Flush(1));
+  using DB::KeyMayExist;
+  virtual bool KeyMayExist(const ReadOptions& options,
+                           ColumnFamilyHandle* column_family, const Slice& key,
+                           std::string* value,
+                           bool* value_found = nullptr) override {
+    if (value_found != nullptr) {
+      *value_found = false;
+    }
+    return true; // Not Supported directly
+  }
+  using DB::NewIterator;
+  virtual Iterator* NewIterator(const ReadOptions& options,
+                                ColumnFamilyHandle* column_family) override {
+    if (options.snapshot == nullptr) {
+      KVMap* saved = new KVMap;
+      *saved = map_;
+      return new ModelIter(saved, true);
+    } else {
+      const KVMap* snapshot_state =
+          &(reinterpret_cast<const ModelSnapshot*>(options.snapshot)->map_);
+      return new ModelIter(snapshot_state, false);
     }
+  }
+  virtual Status NewIterators(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_family,
+      std::vector<Iterator*>* iterators) override {
+    return Status::NotSupported("Not supported yet");
+  }
+  virtual const Snapshot* GetSnapshot() override {
+    ModelSnapshot* snapshot = new ModelSnapshot;
+    snapshot->map_ = map_;
+    return snapshot;
+  }
 
-    snprintf(buf2, sizeof(buf2), "00a0%016d", i * 5 - 2);
-    Slice target(buf2, 20);
-    iter->Seek(target);
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ(iter->key().compare(key), 0);
+  virtual void ReleaseSnapshot(const Snapshot* snapshot) override {
+    delete reinterpret_cast<const ModelSnapshot*>(snapshot);
   }
-  for (int i = 2 * num_records; i > 0; --i) {
-    char buf1[32];
-    char buf2[32];
-    snprintf(buf1, sizeof(buf1), "00a0%016d", i * 5);
 
-    Slice key(buf1, 20);
-    ASSERT_OK(Put(1, key, value));
+  virtual Status Write(const WriteOptions& options,
+                       WriteBatch* batch) override {
+    class Handler : public WriteBatch::Handler {
+     public:
+      KVMap* map_;
+      virtual void Put(const Slice& key, const Slice& value) override {
+        (*map_)[key.ToString()] = value.ToString();
+      }
+      virtual void Merge(const Slice& key, const Slice& value) override {
+        // ignore merge for now
+        //(*map_)[key.ToString()] = value.ToString();
+      }
+      virtual void Delete(const Slice& key) override {
+        map_->erase(key.ToString());
+      }
+    };
+    Handler handler;
+    handler.map_ = &map_;
+    return batch->Iterate(&handler);
+  }
 
-    if (i % 100 == 99) {
-      ASSERT_OK(Flush(1));
+  using DB::GetProperty;
+  virtual bool GetProperty(ColumnFamilyHandle* column_family,
+                           const Slice& property, std::string* value) override {
+    return false;
+  }
+  using DB::GetIntProperty;
+  virtual bool GetIntProperty(ColumnFamilyHandle* column_family,
+                              const Slice& property, uint64_t* value) override {
+    return false;
+  }
+  using DB::GetApproximateSizes;
+  virtual void GetApproximateSizes(ColumnFamilyHandle* column_family,
+                                   const Range* range, int n, uint64_t* sizes,
+                                   bool include_memtable) override {
+    for (int i = 0; i < n; i++) {
+      sizes[i] = 0;
     }
-
-    snprintf(buf2, sizeof(buf2), "00a0%016d", i * 5 - 2);
-    Slice target(buf2, 20);
-    iter->Seek(target);
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ(iter->key().compare(key), 0);
   }
-}
-
-TEST_F(DBTest, TailingIteratorDeletes) {
-  CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
-  ReadOptions read_options;
-  read_options.tailing = true;
-
-  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options, handles_[1]));
-
-  // write a single record, read it using the iterator, then delete it
-  ASSERT_OK(Put(1, "0test", "test"));
-  iter->SeekToFirst();
-  ASSERT_TRUE(iter->Valid());
-  ASSERT_EQ(iter->key().ToString(), "0test");
-  ASSERT_OK(Delete(1, "0test"));
-
-  // write many more records
-  const int num_records = 10000;
-  std::string value(1024, 'A');
-
-  for (int i = 0; i < num_records; ++i) {
-    char buf[32];
-    snprintf(buf, sizeof(buf), "1%015d", i);
-
-    Slice key(buf, 16);
-    ASSERT_OK(Put(1, key, value));
+  using DB::CompactRange;
+  virtual Status CompactRange(const CompactRangeOptions& options,
+                              ColumnFamilyHandle* column_family,
+                              const Slice* start, const Slice* end) override {
+    return Status::NotSupported("Not supported operation.");
   }
 
-  // force a flush to make sure that no records are read from memtable
-  ASSERT_OK(Flush(1));
-
-  // skip "0test"
-  iter->Next();
-
-  // make sure we can read all new records using the existing iterator
-  int count = 0;
-  for (; iter->Valid(); iter->Next(), ++count) ;
-
-  ASSERT_EQ(count, num_records);
-}
-
-TEST_F(DBTest, TailingIteratorPrefixSeek) {
-  XFUNC_TEST("", "dbtest_prefix", prefix_skip1, XFuncPoint::SetSkip,
-             kSkipNoPrefix);
-  ReadOptions read_options;
-  read_options.tailing = true;
-
-  Options options = CurrentOptions();
-  options.env = env_;
-  options.create_if_missing = true;
-  options.disable_auto_compactions = true;
-  options.prefix_extractor.reset(NewFixedPrefixTransform(2));
-  options.memtable_factory.reset(NewHashSkipListRepFactory(16));
-  DestroyAndReopen(options);
-  CreateAndReopenWithCF({"pikachu"}, options);
-
-  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options, handles_[1]));
-  ASSERT_OK(Put(1, "0101", "test"));
-
-  ASSERT_OK(Flush(1));
-
-  ASSERT_OK(Put(1, "0202", "test"));
-
-  // Seek(0102) shouldn't find any records since 0202 has a different prefix
-  iter->Seek("0102");
-  ASSERT_TRUE(!iter->Valid());
+  using DB::CompactFiles;
+  virtual Status CompactFiles(
+      const CompactionOptions& compact_options,
+      ColumnFamilyHandle* column_family,
+      const std::vector<std::string>& input_file_names,
+      const int output_level, const int output_path_id = -1) override {
+    return Status::NotSupported("Not supported operation.");
+  }
 
-  iter->Seek("0202");
-  ASSERT_TRUE(iter->Valid());
-  ASSERT_EQ(iter->key().ToString(), "0202");
+  Status PauseBackgroundWork() override {
+    return Status::NotSupported("Not supported operation.");
+  }
 
-  iter->Next();
-  ASSERT_TRUE(!iter->Valid());
-  XFUNC_TEST("", "dbtest_prefix", prefix_skip1, XFuncPoint::SetSkip, 0);
-}
+  Status ContinueBackgroundWork() override {
+    return Status::NotSupported("Not supported operation.");
+  }
 
-TEST_F(DBTest, TailingIteratorIncomplete) {
-  CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
-  ReadOptions read_options;
-  read_options.tailing = true;
-  read_options.read_tier = kBlockCacheTier;
+  using DB::NumberLevels;
+  virtual int NumberLevels(ColumnFamilyHandle* column_family) override {
+    return 1;
+  }
 
-  std::string key("key");
-  std::string value("value");
+  using DB::MaxMemCompactionLevel;
+  virtual int MaxMemCompactionLevel(
+      ColumnFamilyHandle* column_family) override {
+    return 1;
+  }
 
-  ASSERT_OK(db_->Put(WriteOptions(), key, value));
+  using DB::Level0StopWriteTrigger;
+  virtual int Level0StopWriteTrigger(
+      ColumnFamilyHandle* column_family) override {
+    return -1;
+  }
 
-  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
-  iter->SeekToFirst();
-  // we either see the entry or it's not in cache
-  ASSERT_TRUE(iter->Valid() || iter->status().IsIncomplete());
+  virtual const std::string& GetName() const override { return name_; }
 
-  ASSERT_OK(db_->CompactRange(nullptr, nullptr));
-  iter->SeekToFirst();
-  // should still be true after compaction
-  ASSERT_TRUE(iter->Valid() || iter->status().IsIncomplete());
-}
+  virtual Env* GetEnv() const override { return nullptr; }
 
-TEST_F(DBTest, TailingIteratorSeekToSame) {
-  Options options = CurrentOptions();
-  options.compaction_style = kCompactionStyleUniversal;
-  options.write_buffer_size = 1000;
-  CreateAndReopenWithCF({"pikachu"}, options);
+  using DB::GetOptions;
+  virtual const Options& GetOptions(
+      ColumnFamilyHandle* column_family) const override {
+    return options_;
+  }
 
-  ReadOptions read_options;
-  read_options.tailing = true;
+  using DB::GetDBOptions;
+  virtual const DBOptions& GetDBOptions() const override { return options_; }
 
-  const int NROWS = 10000;
-  // Write rows with keys 00000, 00002, 00004 etc.
-  for (int i = 0; i < NROWS; ++i) {
-    char buf[100];
-    snprintf(buf, sizeof(buf), "%05d", 2*i);
-    std::string key(buf);
-    std::string value("value");
-    ASSERT_OK(db_->Put(WriteOptions(), key, value));
+  using DB::Flush;
+  virtual Status Flush(const rocksdb::FlushOptions& options,
+                       ColumnFamilyHandle* column_family) override {
+    Status ret;
+    return ret;
   }
 
-  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
-  // Seek to 00001.  We expect to find 00002.
-  std::string start_key = "00001";
-  iter->Seek(start_key);
-  ASSERT_TRUE(iter->Valid());
+  virtual Status SyncWAL() override {
+    return Status::OK();
+  }
 
-  std::string found = iter->key().ToString();
-  ASSERT_EQ("00002", found);
+  virtual Status DisableFileDeletions() override { return Status::OK(); }
+  virtual Status EnableFileDeletions(bool force) override {
+    return Status::OK();
+  }
+  virtual Status GetLiveFiles(std::vector<std::string>&, uint64_t* size,
+                              bool flush_memtable = true) override {
+    return Status::OK();
+  }
 
-  // Now seek to the same key.  The iterator should remain in the same
-  // position.
-  iter->Seek(found);
-  ASSERT_TRUE(iter->Valid());
-  ASSERT_EQ(found, iter->key().ToString());
-}
+  virtual Status GetSortedWalFiles(VectorLogPtr& files) override {
+    return Status::OK();
+  }
 
-TEST_F(DBTest, ManagedTailingIteratorSingle) {
-  ReadOptions read_options;
-  read_options.tailing = true;
-  read_options.managed = true;
+  virtual Status DeleteFile(std::string name) override { return Status::OK(); }
 
-  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
-  iter->SeekToFirst();
-  ASSERT_TRUE(!iter->Valid());
+  virtual Status GetDbIdentity(std::string& identity) const override {
+    return Status::OK();
+  }
 
-  // add a record and check that iter can see it
-  ASSERT_OK(db_->Put(WriteOptions(), "mirko", "fodor"));
-  iter->SeekToFirst();
-  ASSERT_TRUE(iter->Valid());
-  ASSERT_EQ(iter->key().ToString(), "mirko");
+  virtual SequenceNumber GetLatestSequenceNumber() const override { return 0; }
+  virtual Status GetUpdatesSince(
+      rocksdb::SequenceNumber, unique_ptr<rocksdb::TransactionLogIterator>*,
+      const TransactionLogIterator::ReadOptions&
+          read_options = TransactionLogIterator::ReadOptions()) override {
+    return Status::NotSupported("Not supported in Model DB");
+  }
 
-  iter->Next();
-  ASSERT_TRUE(!iter->Valid());
-}
+  virtual ColumnFamilyHandle* DefaultColumnFamily() const override {
+    return nullptr;
+  }
 
-TEST_F(DBTest, ManagedTailingIteratorKeepAdding) {
-  CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
-  ReadOptions read_options;
-  read_options.tailing = true;
-  read_options.managed = true;
+  virtual void GetColumnFamilyMetaData(
+      ColumnFamilyHandle* column_family,
+      ColumnFamilyMetaData* metadata) override {}
 
-  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options, handles_[1]));
-  std::string value(1024, 'a');
+ private:
+  class ModelIter: public Iterator {
+   public:
+    ModelIter(const KVMap* map, bool owned)
+        : map_(map), owned_(owned), iter_(map_->end()) {
+    }
+    ~ModelIter() {
+      if (owned_) delete map_;
+    }
+    virtual bool Valid() const override { return iter_ != map_->end(); }
+    virtual void SeekToFirst() override { iter_ = map_->begin(); }
+    virtual void SeekToLast() override {
+      if (map_->empty()) {
+        iter_ = map_->end();
+      } else {
+        iter_ = map_->find(map_->rbegin()->first);
+      }
+    }
+    virtual void Seek(const Slice& k) override {
+      iter_ = map_->lower_bound(k.ToString());
+    }
+    virtual void Next() override { ++iter_; }
+    virtual void Prev() override {
+      if (iter_ == map_->begin()) {
+        iter_ = map_->end();
+        return;
+      }
+      --iter_;
+    }
 
-  const int num_records = 10000;
-  for (int i = 0; i < num_records; ++i) {
-    char buf[32];
-    snprintf(buf, sizeof(buf), "%016d", i);
+    virtual Slice key() const override { return iter_->first; }
+    virtual Slice value() const override { return iter_->second; }
+    virtual Status status() const override { return Status::OK(); }
 
-    Slice key(buf, 16);
-    ASSERT_OK(Put(1, key, value));
+   private:
+    const KVMap* const map_;
+    const bool owned_;  // Do we own map_
+    KVMap::const_iterator iter_;
+  };
+  const Options options_;
+  KVMap map_;
+  std::string name_ = "";
+};
 
-    iter->Seek(key);
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ(iter->key().compare(key), 0);
-  }
+static std::string RandomKey(Random* rnd, int minimum = 0) {
+  int len;
+  do {
+    len = (rnd->OneIn(3)
+           ? 1                // Short sometimes to encourage collisions
+           : (rnd->OneIn(100) ? rnd->Skewed(10) : rnd->Uniform(10)));
+  } while (len < minimum);
+  return test::RandomKey(rnd, len);
 }
 
-TEST_F(DBTest, ManagedTailingIteratorSeekToNext) {
-  CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
-  ReadOptions read_options;
-  read_options.tailing = true;
-  read_options.managed = true;
-
-  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options, handles_[1]));
-  std::string value(1024, 'a');
-
-  const int num_records = 1000;
-  for (int i = 1; i < num_records; ++i) {
-    char buf1[32];
-    char buf2[32];
-    snprintf(buf1, sizeof(buf1), "00a0%016d", i * 5);
-
-    Slice key(buf1, 20);
-    ASSERT_OK(Put(1, key, value));
-
-    if (i % 100 == 99) {
-      ASSERT_OK(Flush(1));
+static bool CompareIterators(int step,
+                             DB* model,
+                             DB* db,
+                             const Snapshot* model_snap,
+                             const Snapshot* db_snap) {
+  ReadOptions options;
+  options.snapshot = model_snap;
+  Iterator* miter = model->NewIterator(options);
+  options.snapshot = db_snap;
+  Iterator* dbiter = db->NewIterator(options);
+  bool ok = true;
+  int count = 0;
+  for (miter->SeekToFirst(), dbiter->SeekToFirst();
+       ok && miter->Valid() && dbiter->Valid();
+       miter->Next(), dbiter->Next()) {
+    count++;
+    if (miter->key().compare(dbiter->key()) != 0) {
+      fprintf(stderr, "step %d: Key mismatch: '%s' vs. '%s'\n",
+              step,
+              EscapeString(miter->key()).c_str(),
+              EscapeString(dbiter->key()).c_str());
+      ok = false;
+      break;
     }
 
-    snprintf(buf2, sizeof(buf2), "00a0%016d", i * 5 - 2);
-    Slice target(buf2, 20);
-    iter->Seek(target);
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ(iter->key().compare(key), 0);
+    if (miter->value().compare(dbiter->value()) != 0) {
+      fprintf(stderr, "step %d: Value mismatch for key '%s': '%s' vs. '%s'\n",
+              step,
+              EscapeString(miter->key()).c_str(),
+              EscapeString(miter->value()).c_str(),
+              EscapeString(dbiter->value()).c_str());
+      ok = false;
+    }
   }
-  for (int i = 2 * num_records; i > 0; --i) {
-    char buf1[32];
-    char buf2[32];
-    snprintf(buf1, sizeof(buf1), "00a0%016d", i * 5);
 
-    Slice key(buf1, 20);
-    ASSERT_OK(Put(1, key, value));
-
-    if (i % 100 == 99) {
-      ASSERT_OK(Flush(1));
+  if (ok) {
+    if (miter->Valid() != dbiter->Valid()) {
+      fprintf(stderr, "step %d: Mismatch at end of iterators: %d vs. %d\n",
+              step, miter->Valid(), dbiter->Valid());
+      ok = false;
     }
-
-    snprintf(buf2, sizeof(buf2), "00a0%016d", i * 5 - 2);
-    Slice target(buf2, 20);
-    iter->Seek(target);
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ(iter->key().compare(key), 0);
   }
+  delete miter;
+  delete dbiter;
+  return ok;
 }
 
-TEST_F(DBTest, ManagedTailingIteratorDeletes) {
-  CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
-  ReadOptions read_options;
-  read_options.tailing = true;
-  read_options.managed = true;
+TEST_F(DBTest, Randomized) {
+  anon::OptionsOverride options_override;
+  options_override.skip_policy = kSkipNoSnapshot;
+  Random rnd(test::RandomSeed());
+  do {
+    ModelDB model(CurrentOptions(options_override));
+    const int N = 10000;
+    const Snapshot* model_snap = nullptr;
+    const Snapshot* db_snap = nullptr;
+    std::string k, v;
+    for (int step = 0; step < N; step++) {
+      // TODO(sanjay): Test Get() works
+      int p = rnd.Uniform(100);
+      int minimum = 0;
+      if (option_config_ == kHashSkipList ||
+          option_config_ == kHashLinkList ||
+          option_config_ == kHashCuckoo ||
+          option_config_ == kPlainTableFirstBytePrefix ||
+          option_config_ == kBlockBasedTableWithWholeKeyHashIndex ||
+          option_config_ == kBlockBasedTableWithPrefixHashIndex) {
+        minimum = 1;
+      }
+      if (p < 45) {                               // Put
+        k = RandomKey(&rnd, minimum);
+        v = RandomString(&rnd,
+                         rnd.OneIn(20)
+                         ? 100 + rnd.Uniform(100)
+                         : rnd.Uniform(8));
+        ASSERT_OK(model.Put(WriteOptions(), k, v));
+        ASSERT_OK(db_->Put(WriteOptions(), k, v));
 
-  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options, handles_[1]));
+      } else if (p < 90) {                        // Delete
+        k = RandomKey(&rnd, minimum);
+        ASSERT_OK(model.Delete(WriteOptions(), k));
+        ASSERT_OK(db_->Delete(WriteOptions(), k));
 
-  // write a single record, read it using the iterator, then delete it
-  ASSERT_OK(Put(1, "0test", "test"));
-  iter->SeekToFirst();
-  ASSERT_TRUE(iter->Valid());
-  ASSERT_EQ(iter->key().ToString(), "0test");
-  ASSERT_OK(Delete(1, "0test"));
 
-  // write many more records
-  const int num_records = 10000;
-  std::string value(1024, 'A');
+      } else {                                    // Multi-element batch
+        WriteBatch b;
+        const int num = rnd.Uniform(8);
+        for (int i = 0; i < num; i++) {
+          if (i == 0 || !rnd.OneIn(10)) {
+            k = RandomKey(&rnd, minimum);
+          } else {
+            // Periodically re-use the same key from the previous iteration,
+            // so we have multiple entries in the write batch for the same key
+          }
+          if (rnd.OneIn(2)) {
+            v = RandomString(&rnd, rnd.Uniform(10));
+            b.Put(k, v);
+          } else {
+            b.Delete(k);
+          }
+        }
+        ASSERT_OK(model.Write(WriteOptions(), &b));
+        ASSERT_OK(db_->Write(WriteOptions(), &b));
+      }
 
-  for (int i = 0; i < num_records; ++i) {
-    char buf[32];
-    snprintf(buf, sizeof(buf), "1%015d", i);
+      if ((step % 100) == 0) {
+        // For DB instances that use the hash index + block-based table, the
+        // iterator becomes invalid right away when seeking a non-existent
+        // key, rather than returning a key that is close to it.
+        if (option_config_ != kBlockBasedTableWithWholeKeyHashIndex &&
+            option_config_ != kBlockBasedTableWithPrefixHashIndex) {
+          ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr));
+          ASSERT_TRUE(CompareIterators(step, &model, db_, model_snap, db_snap));
+        }
 
-    Slice key(buf, 16);
-    ASSERT_OK(Put(1, key, value));
-  }
+        // Save a snapshot from each DB that we'll use the next time we
+        // compare things, to make sure the snapshot preserves the
+        // current state
+        if (model_snap != nullptr) model.ReleaseSnapshot(model_snap);
+        if (db_snap != nullptr) db_->ReleaseSnapshot(db_snap);
 
-  // force a flush to make sure that no records are read from memtable
-  ASSERT_OK(Flush(1));
 
-  // skip "0test"
-  iter->Next();
+        auto options = CurrentOptions(options_override);
+        Reopen(options);
+        ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr));
 
-  // make sure we can read all new records using the existing iterator
-  int count = 0;
-  for (; iter->Valid(); iter->Next(), ++count) {
-  }
+        model_snap = model.GetSnapshot();
+        db_snap = db_->GetSnapshot();
+      }
 
-  ASSERT_EQ(count, num_records);
+      if ((step % 2000) == 0) {
+        fprintf(stderr,
+                "DBTest.Randomized, option ID: %d, step: %d out of %d\n",
+                option_config_, step, N);
+      }
+    }
+    if (model_snap != nullptr) model.ReleaseSnapshot(model_snap);
+    if (db_snap != nullptr) db_->ReleaseSnapshot(db_snap);
+    // skip cuckoo hash as it does not support snapshots.
+  } while (ChangeOptions(kSkipDeletesFilterFirst | kSkipNoSeekToLast |
+                         kSkipHashCuckoo));
 }
 
-TEST_F(DBTest, ManagedTailingIteratorPrefixSeek) {
-  XFUNC_TEST("", "dbtest_prefix", prefix_skip1, XFuncPoint::SetSkip,
-             kSkipNoPrefix);
-  ReadOptions read_options;
-  read_options.tailing = true;
-  read_options.managed = true;
-
-  Options options = CurrentOptions();
-  options.env = env_;
-  options.create_if_missing = true;
-  options.disable_auto_compactions = true;
-  options.prefix_extractor.reset(NewFixedPrefixTransform(2));
-  options.memtable_factory.reset(NewHashSkipListRepFactory(16));
-  DestroyAndReopen(options);
-  CreateAndReopenWithCF({"pikachu"}, options);
-
-  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options, handles_[1]));
-  ASSERT_OK(Put(1, "0101", "test"));
-
-  ASSERT_OK(Flush(1));
+TEST_F(DBTest, MultiGetSimple) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    ASSERT_OK(Put(1, "k1", "v1"));
+    ASSERT_OK(Put(1, "k2", "v2"));
+    ASSERT_OK(Put(1, "k3", "v3"));
+    ASSERT_OK(Put(1, "k4", "v4"));
+    ASSERT_OK(Delete(1, "k4"));
+    ASSERT_OK(Put(1, "k5", "v5"));
+    ASSERT_OK(Delete(1, "no_key"));
 
-  ASSERT_OK(Put(1, "0202", "test"));
+    std::vector<Slice> keys({"k1", "k2", "k3", "k4", "k5", "no_key"});
 
-  // Seek(0102) shouldn't find any records since 0202 has a different prefix
-  iter->Seek("0102");
-  ASSERT_TRUE(!iter->Valid());
+    std::vector<std::string> values(20, "Temporary data to be overwritten");
+    std::vector<ColumnFamilyHandle*> cfs(keys.size(), handles_[1]);
 
-  iter->Seek("0202");
-  ASSERT_TRUE(iter->Valid());
-  ASSERT_EQ(iter->key().ToString(), "0202");
+    std::vector<Status> s = db_->MultiGet(ReadOptions(), cfs, keys, &values);
+    ASSERT_EQ(values.size(), keys.size());
+    ASSERT_EQ(values[0], "v1");
+    ASSERT_EQ(values[1], "v2");
+    ASSERT_EQ(values[2], "v3");
+    ASSERT_EQ(values[4], "v5");
 
-  iter->Next();
-  ASSERT_TRUE(!iter->Valid());
-  XFUNC_TEST("", "dbtest_prefix", prefix_skip1, XFuncPoint::SetSkip, 0);
+    ASSERT_OK(s[0]);
+    ASSERT_OK(s[1]);
+    ASSERT_OK(s[2]);
+    ASSERT_TRUE(s[3].IsNotFound());
+    ASSERT_OK(s[4]);
+    ASSERT_TRUE(s[5].IsNotFound());
+  } while (ChangeCompactOptions());
 }
 
-TEST_F(DBTest, ManagedTailingIteratorIncomplete) {
-  CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
-  ReadOptions read_options;
-  read_options.tailing = true;
-  read_options.managed = true;
-  read_options.read_tier = kBlockCacheTier;
+TEST_F(DBTest, MultiGetEmpty) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    // Empty Key Set
+    std::vector<Slice> keys;
+    std::vector<std::string> values;
+    std::vector<ColumnFamilyHandle*> cfs;
+    std::vector<Status> s = db_->MultiGet(ReadOptions(), cfs, keys, &values);
+    ASSERT_EQ(s.size(), 0U);
 
-  std::string key = "key";
-  std::string value = "value";
+    // Empty Database, Empty Key Set
+    Options options = CurrentOptions();
+    options.create_if_missing = true;
+    DestroyAndReopen(options);
+    CreateAndReopenWithCF({"pikachu"}, options);
+    s = db_->MultiGet(ReadOptions(), cfs, keys, &values);
+    ASSERT_EQ(s.size(), 0U);
 
-  ASSERT_OK(db_->Put(WriteOptions(), key, value));
+    // Empty Database, Search for Keys
+    keys.resize(2);
+    keys[0] = "a";
+    keys[1] = "b";
+    cfs.push_back(handles_[0]);
+    cfs.push_back(handles_[1]);
+    s = db_->MultiGet(ReadOptions(), cfs, keys, &values);
+    ASSERT_EQ((int)s.size(), 2);
+    ASSERT_TRUE(s[0].IsNotFound() && s[1].IsNotFound());
+  } while (ChangeCompactOptions());
+}
 
-  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
-  iter->SeekToFirst();
-  // we either see the entry or it's not in cache
-  ASSERT_TRUE(iter->Valid() || iter->status().IsIncomplete());
+namespace {
+void PrefixScanInit(DBTest *dbtest) {
+  char buf[100];
+  std::string keystr;
+  const int small_range_sstfiles = 5;
+  const int big_range_sstfiles = 5;
 
-  ASSERT_OK(db_->CompactRange(nullptr, nullptr));
-  iter->SeekToFirst();
-  // should still be true after compaction
-  ASSERT_TRUE(iter->Valid() || iter->status().IsIncomplete());
-}
+  // Generate 11 sst files with the following prefix ranges.
+  // GROUP 0: [0,10]                              (level 1)
+  // GROUP 1: [1,2], [2,3], [3,4], [4,5], [5, 6]  (level 0)
+  // GROUP 2: [0,6], [0,7], [0,8], [0,9], [0,10]  (level 0)
+  //
+  // A seek with the previous API would do 11 random I/Os (to all the
+  // files).  With the new API and a prefix filter enabled, we should
+  // only do 2 random I/Os, to the 2 files containing the key.
 
-TEST_F(DBTest, ManagedTailingIteratorSeekToSame) {
-  Options options = CurrentOptions();
-  options.compaction_style = kCompactionStyleUniversal;
-  options.write_buffer_size = 1000;
-  CreateAndReopenWithCF({"pikachu"}, options);
+  // GROUP 0
+  snprintf(buf, sizeof(buf), "%02d______:start", 0);
+  keystr = std::string(buf);
+  ASSERT_OK(dbtest->Put(keystr, keystr));
+  snprintf(buf, sizeof(buf), "%02d______:end", 10);
+  keystr = std::string(buf);
+  ASSERT_OK(dbtest->Put(keystr, keystr));
+  dbtest->Flush();
+  dbtest->dbfull()->CompactRange(CompactRangeOptions(), nullptr,
+                                 nullptr);  // move to level 1
 
-  ReadOptions read_options;
-  read_options.tailing = true;
-  read_options.managed = true;
+  // GROUP 1
+  for (int i = 1; i <= small_range_sstfiles; i++) {
+    snprintf(buf, sizeof(buf), "%02d______:start", i);
+    keystr = std::string(buf);
+    ASSERT_OK(dbtest->Put(keystr, keystr));
+    snprintf(buf, sizeof(buf), "%02d______:end", i+1);
+    keystr = std::string(buf);
+    ASSERT_OK(dbtest->Put(keystr, keystr));
+    dbtest->Flush();
+  }
 
-  const int NROWS = 10000;
-  // Write rows with keys 00000, 00002, 00004 etc.
-  for (int i = 0; i < NROWS; ++i) {
-    char buf[100];
-    snprintf(buf, sizeof(buf), "%05d", 2 * i);
-    std::string key(buf);
-    std::string value("value");
-    ASSERT_OK(db_->Put(WriteOptions(), key, value));
+  // GROUP 2
+  for (int i = 1; i <= big_range_sstfiles; i++) {
+    snprintf(buf, sizeof(buf), "%02d______:start", 0);
+    keystr = std::string(buf);
+    ASSERT_OK(dbtest->Put(keystr, keystr));
+    snprintf(buf, sizeof(buf), "%02d______:end",
+             small_range_sstfiles+i+1);
+    keystr = std::string(buf);
+    ASSERT_OK(dbtest->Put(keystr, keystr));
+    dbtest->Flush();
   }
+}
+}  // namespace
 
-  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
-  // Seek to 00001.  We expect to find 00002.
-  std::string start_key = "00001";
-  iter->Seek(start_key);
-  ASSERT_TRUE(iter->Valid());
+TEST_F(DBTest, PrefixScan) {
+  XFUNC_TEST("", "dbtest_prefix", prefix_skip1, XFuncPoint::SetSkip,
+             kSkipNoPrefix);
+  while (ChangeFilterOptions()) {
+    int count;
+    Slice prefix;
+    Slice key;
+    char buf[100];
+    Iterator* iter;
+    snprintf(buf, sizeof(buf), "03______:");
+    prefix = Slice(buf, 8);
+    key = Slice(buf, 9);
+    ASSERT_EQ(key.difference_offset(prefix), 8);
+    ASSERT_EQ(prefix.difference_offset(key), 8);
+    // db configs
+    env_->count_random_reads_ = true;
+    Options options = CurrentOptions();
+    options.env = env_;
+    options.prefix_extractor.reset(NewFixedPrefixTransform(8));
+    options.disable_auto_compactions = true;
+    options.max_background_compactions = 2;
+    options.create_if_missing = true;
+    options.memtable_factory.reset(NewHashSkipListRepFactory(16));
 
-  std::string found = iter->key().ToString();
-  ASSERT_EQ("00002", found);
+    BlockBasedTableOptions table_options;
+    table_options.no_block_cache = true;
+    table_options.filter_policy.reset(NewBloomFilterPolicy(10));
+    table_options.whole_key_filtering = false;
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
 
-  // Now seek to the same key.  The iterator should remain in the same
-  // position.
-  iter->Seek(found);
-  ASSERT_TRUE(iter->Valid());
-  ASSERT_EQ(found, iter->key().ToString());
+    // 11 RAND I/Os
+    DestroyAndReopen(options);
+    PrefixScanInit(this);
+    count = 0;
+    env_->random_read_counter_.Reset();
+    iter = db_->NewIterator(ReadOptions());
+    for (iter->Seek(prefix); iter->Valid(); iter->Next()) {
+      if (!iter->key().starts_with(prefix)) {
+        break;
+      }
+      count++;
+    }
+    ASSERT_OK(iter->status());
+    delete iter;
+    ASSERT_EQ(count, 2);
+    ASSERT_EQ(env_->random_read_counter_.Read(), 2);
+    Close();
+  }  // end of while
+  XFUNC_TEST("", "dbtest_prefix", prefix_skip1, XFuncPoint::SetSkip, 0);
 }
 
 TEST_F(DBTest, BlockBasedTablePrefixIndexTest) {
@@ -9680,16 +6209,18 @@ TEST_F(DBTest, ChecksumTest) {
   ASSERT_EQ("h", Get("g"));
 }
 
-TEST_F(DBTest, FIFOCompactionTest) {
+TEST_P(DBTestWithParam, FIFOCompactionTest) {
   for (int iter = 0; iter < 2; ++iter) {
     // first iteration -- auto compaction
     // second iteration -- manual compaction
     Options options;
     options.compaction_style = kCompactionStyleFIFO;
     options.write_buffer_size = 100 << 10;                             // 100KB
+    options.arena_block_size = 4096;
     options.compaction_options_fifo.max_table_files_size = 500 << 10;  // 500KB
     options.compression = kNoCompression;
     options.create_if_missing = true;
+    options.max_subcompactions = max_subcompactions_;
     if (iter == 1) {
       options.disable_auto_compactions = true;
     }
@@ -9698,8 +6229,8 @@ TEST_F(DBTest, FIFOCompactionTest) {
 
     Random rnd(301);
     for (int i = 0; i < 6; ++i) {
-      for (int j = 0; j < 100; ++j) {
-        ASSERT_OK(Put(ToString(i * 100 + j), RandomString(&rnd, 1024)));
+      for (int j = 0; j < 110; ++j) {
+        ASSERT_OK(Put(ToString(i * 100 + j), RandomString(&rnd, 980)));
       }
       // flush should happen here
       ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
@@ -9707,7 +6238,7 @@ TEST_F(DBTest, FIFOCompactionTest) {
     if (iter == 0) {
       ASSERT_OK(dbfull()->TEST_WaitForCompact());
     } else {
-      ASSERT_OK(db_->CompactRange(nullptr, nullptr));
+      ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
     }
     // only 5 files should survive
     ASSERT_EQ(NumTableFilesAtLevel(0), 5);
@@ -9718,436 +6249,83 @@ TEST_F(DBTest, FIFOCompactionTest) {
   }
 }
 
+// verify that we correctly deprecated timeout_hint_us
 TEST_F(DBTest, SimpleWriteTimeoutTest) {
-  // Block compaction thread, which will also block the flushes because
-  // max_background_flushes == 0, so flushes are getting executed by the
-  // compaction thread
-  env_->SetBackgroundThreads(1, Env::LOW);
-  SleepingBackgroundTask sleeping_task_low;
-  env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
-                 Env::Priority::LOW);
-
-  Options options;
-  options.env = env_;
-  options.create_if_missing = true;
-  options.write_buffer_size = 100000;
-  options.max_background_flushes = 0;
-  options.max_write_buffer_number = 2;
-  options.max_total_wal_size = std::numeric_limits<uint64_t>::max();
   WriteOptions write_opt;
   write_opt.timeout_hint_us = 0;
-  DestroyAndReopen(options);
-  // fill the two write buffers
-  ASSERT_OK(Put(Key(1), Key(1) + std::string(100000, 'v'), write_opt));
-  ASSERT_OK(Put(Key(2), Key(2) + std::string(100000, 'v'), write_opt));
-  // As the only two write buffers are full in this moment, the third
-  // Put is expected to be timed-out.
-  write_opt.timeout_hint_us = 50;
-  ASSERT_TRUE(
-      Put(Key(3), Key(3) + std::string(100000, 'v'), write_opt).IsTimedOut());
-
-  sleeping_task_low.WakeUp();
-  sleeping_task_low.WaitUntilDone();
-}
-
-// Multi-threaded Timeout Test
-namespace {
-
-static const int kValueSize = 1000;
-static const int kWriteBufferSize = 100000;
-
-struct TimeoutWriterState {
-  int id;
-  DB* db;
-  std::atomic<bool> done;
-  std::map<int, std::string> success_kvs;
-};
-
-static void RandomTimeoutWriter(void* arg) {
-  TimeoutWriterState* state = reinterpret_cast<TimeoutWriterState*>(arg);
-  static const uint64_t kTimerBias = 50;
-  int thread_id = state->id;
-  DB* db = state->db;
-
-  Random rnd(1000 + thread_id);
-  WriteOptions write_opt;
-  write_opt.timeout_hint_us = 500;
-  int timeout_count = 0;
-  int num_keys = kNumKeys * 5;
-
-  for (int k = 0; k < num_keys; ++k) {
-    int key = k + thread_id * num_keys;
-    std::string value = RandomString(&rnd, kValueSize);
-    // only the second-half is randomized
-    if (k > num_keys / 2) {
-      switch (rnd.Next() % 5) {
-        case 0:
-          write_opt.timeout_hint_us = 500 * thread_id;
-          break;
-        case 1:
-          write_opt.timeout_hint_us = num_keys - k;
-          break;
-        case 2:
-          write_opt.timeout_hint_us = 1;
-          break;
-        default:
-          write_opt.timeout_hint_us = 0;
-          state->success_kvs.insert({key, value});
-      }
-    }
-
-    uint64_t time_before_put = db->GetEnv()->NowMicros();
-    Status s = db->Put(write_opt, Key(key), value);
-    uint64_t put_duration = db->GetEnv()->NowMicros() - time_before_put;
-    if (write_opt.timeout_hint_us == 0 ||
-        put_duration + kTimerBias < write_opt.timeout_hint_us) {
-      ASSERT_OK(s);
-    }
-    if (s.IsTimedOut()) {
-      timeout_count++;
-      ASSERT_GT(put_duration + kTimerBias, write_opt.timeout_hint_us);
-    }
-  }
-
-  state->done = true;
-}
-
-TEST_F(DBTest, MTRandomTimeoutTest) {
-  Options options;
-  options.env = env_;
-  options.create_if_missing = true;
-  options.max_write_buffer_number = 2;
-  options.compression = kNoCompression;
-  options.level0_slowdown_writes_trigger = 10;
-  options.level0_stop_writes_trigger = 20;
-  options.write_buffer_size = kWriteBufferSize;
-  DestroyAndReopen(options);
-
-  TimeoutWriterState thread_states[kNumThreads];
-  for (int tid = 0; tid < kNumThreads; ++tid) {
-    thread_states[tid].id = tid;
-    thread_states[tid].db = db_;
-    thread_states[tid].done = false;
-    env_->StartThread(RandomTimeoutWriter, &thread_states[tid]);
-  }
-
-  for (int tid = 0; tid < kNumThreads; ++tid) {
-    while (thread_states[tid].done == false) {
-      env_->SleepForMicroseconds(100000);
-    }
-  }
-
-  Flush();
-
-  for (int tid = 0; tid < kNumThreads; ++tid) {
-    auto& success_kvs = thread_states[tid].success_kvs;
-    for (auto it = success_kvs.begin(); it != success_kvs.end(); ++it) {
-      ASSERT_EQ(Get(Key(it->first)), it->second);
-    }
-  }
-}
-
-TEST_F(DBTest, Level0StopWritesTest) {
-  Options options = CurrentOptions();
-  options.level0_slowdown_writes_trigger = 2;
-  options.level0_stop_writes_trigger = 4;
-  options.disable_auto_compactions = true;
-  options.max_mem_compaction_level = 0;
-  Reopen(options);
-
-  // create 4 level0 tables
-  for (int i = 0; i < 4; ++i) {
-    Put("a", "b");
-    Flush();
-  }
-
-  WriteOptions woptions;
-  woptions.timeout_hint_us = 30 * 1000;  // 30 ms
-  Status s = Put("a", "b", woptions);
-  ASSERT_TRUE(s.IsTimedOut());
+  ASSERT_OK(Put(Key(1), Key(1) + std::string(100, 'v'), write_opt));
+  write_opt.timeout_hint_us = 10;
+  ASSERT_NOK(Put(Key(1), Key(1) + std::string(100, 'v'), write_opt));
 }
 
-}  // anonymous namespace
-
 /*
- * This test is not reliable enough as it heavily depends on disk behavior.
- */
-TEST_F(DBTest, RateLimitingTest) {
-  Options options = CurrentOptions();
-  options.write_buffer_size = 1 << 20;         // 1MB
-  options.level0_file_num_compaction_trigger = 2;
-  options.target_file_size_base = 1 << 20;     // 1MB
-  options.max_bytes_for_level_base = 4 << 20;  // 4MB
-  options.max_bytes_for_level_multiplier = 4;
-  options.compression = kNoCompression;
-  options.create_if_missing = true;
-  options.env = env_;
-  options.IncreaseParallelism(4);
-  DestroyAndReopen(options);
-
-  WriteOptions wo;
-  wo.disableWAL = true;
-
-  // # no rate limiting
-  Random rnd(301);
-  uint64_t start = env_->NowMicros();
-  // Write ~96M data
-  for (int64_t i = 0; i < (96 << 10); ++i) {
-    ASSERT_OK(Put(RandomString(&rnd, 32),
-                  RandomString(&rnd, (1 << 10) + 1), wo));
-  }
-  uint64_t elapsed = env_->NowMicros() - start;
-  double raw_rate = env_->bytes_written_ * 1000000 / elapsed;
-  Close();
-
-  // # rate limiting with 0.7 x threshold
-  options.rate_limiter.reset(
-    NewGenericRateLimiter(static_cast<int64_t>(0.7 * raw_rate)));
-  env_->bytes_written_ = 0;
-  DestroyAndReopen(options);
-
-  start = env_->NowMicros();
-  // Write ~96M data
-  for (int64_t i = 0; i < (96 << 10); ++i) {
-    ASSERT_OK(Put(RandomString(&rnd, 32),
-                  RandomString(&rnd, (1 << 10) + 1), wo));
-  }
-  elapsed = env_->NowMicros() - start;
-  Close();
-  ASSERT_TRUE(options.rate_limiter->GetTotalBytesThrough() ==
-              env_->bytes_written_);
-  double ratio = env_->bytes_written_ * 1000000 / elapsed / raw_rate;
-  fprintf(stderr, "write rate ratio = %.2lf, expected 0.7\n", ratio);
-  ASSERT_TRUE(ratio < 0.8);
-
-  // # rate limiting with half of the raw_rate
-  options.rate_limiter.reset(
-    NewGenericRateLimiter(static_cast<int64_t>(raw_rate / 2)));
-  env_->bytes_written_ = 0;
-  DestroyAndReopen(options);
-
-  start = env_->NowMicros();
-  // Write ~96M data
-  for (int64_t i = 0; i < (96 << 10); ++i) {
-    ASSERT_OK(Put(RandomString(&rnd, 32),
-                  RandomString(&rnd, (1 << 10) + 1), wo));
-  }
-  elapsed = env_->NowMicros() - start;
-  Close();
-  ASSERT_TRUE(options.rate_limiter->GetTotalBytesThrough() ==
-              env_->bytes_written_);
-  ratio = env_->bytes_written_ * 1000000 / elapsed / raw_rate;
-  fprintf(stderr, "write rate ratio = %.2lf, expected 0.5\n", ratio);
-  ASSERT_TRUE(ratio < 0.6);
-}
-
-namespace {
-  bool HaveOverlappingKeyRanges(
-      const Comparator* c,
-      const SstFileMetaData& a, const SstFileMetaData& b) {
-    if (c->Compare(a.smallestkey, b.smallestkey) >= 0) {
-      if (c->Compare(a.smallestkey, b.largestkey) <= 0) {
-        // b.smallestkey <= a.smallestkey <= b.largestkey
-        return true;
-      }
-    } else if (c->Compare(a.largestkey, b.smallestkey) >= 0) {
-      // a.smallestkey < b.smallestkey <= a.largestkey
-      return true;
-    }
-    if (c->Compare(a.largestkey, b.largestkey) <= 0) {
-      if (c->Compare(a.largestkey, b.smallestkey) >= 0) {
-        // b.smallestkey <= a.largestkey <= b.largestkey
-        return true;
-      }
-    } else if (c->Compare(a.smallestkey, b.largestkey) <= 0) {
-      // a.smallestkey <= b.largestkey < a.largestkey
-      return true;
-    }
-    return false;
-  }
-
-  // Identifies all files between level "min_level" and "max_level"
-  // which has overlapping key range with "input_file_meta".
-  void GetOverlappingFileNumbersForLevelCompaction(
-      const ColumnFamilyMetaData& cf_meta,
-      const Comparator* comparator,
-      int min_level, int max_level,
-      const SstFileMetaData* input_file_meta,
-      std::set<std::string>* overlapping_file_names) {
-    std::set<const SstFileMetaData*> overlapping_files;
-    overlapping_files.insert(input_file_meta);
-    for (int m = min_level; m <= max_level; ++m) {
-      for (auto& file : cf_meta.levels[m].files) {
-        for (auto* included_file : overlapping_files) {
-          if (HaveOverlappingKeyRanges(
-                  comparator, *included_file, file)) {
-            overlapping_files.insert(&file);
-            overlapping_file_names->insert(file.name);
-            break;
-          }
-        }
-      }
-    }
-  }
-
-  void VerifyCompactionResult(
-      const ColumnFamilyMetaData& cf_meta,
-      const std::set<std::string>& overlapping_file_numbers) {
-#ifndef NDEBUG
-    for (auto& level : cf_meta.levels) {
-      for (auto& file : level.files) {
-        assert(overlapping_file_numbers.find(file.name) ==
-               overlapping_file_numbers.end());
-      }
-    }
-#endif
-  }
-
-  const SstFileMetaData* PickFileRandomly(
-      const ColumnFamilyMetaData& cf_meta,
-      Random* rand,
-      int* level = nullptr) {
-    auto file_id = rand->Uniform(static_cast<int>(
-        cf_meta.file_count)) + 1;
-    for (auto& level_meta : cf_meta.levels) {
-      if (file_id <= level_meta.files.size()) {
-        if (level != nullptr) {
-          *level = level_meta.level;
-        }
-        auto result = rand->Uniform(file_id);
-        return &(level_meta.files[result]);
-      }
-      file_id -= level_meta.files.size();
-    }
-    assert(false);
-    return nullptr;
-  }
-}  // namespace
-
-// TODO t6534343 -- Don't run two level 0 CompactFiles concurrently
-TEST_F(DBTest, DISABLED_CompactFilesOnLevelCompaction) {
-  const int kTestKeySize = 16;
-  const int kTestValueSize = 984;
-  const int kEntrySize = kTestKeySize + kTestValueSize;
-  const int kEntriesPerBuffer = 100;
-  Options options;
-  options.create_if_missing = true;
-  options.write_buffer_size = kEntrySize * kEntriesPerBuffer;
-  options.compaction_style = kCompactionStyleLevel;
-  options.target_file_size_base = options.write_buffer_size;
-  options.max_bytes_for_level_base = options.target_file_size_base * 2;
-  options.level0_stop_writes_trigger = 2;
-  options.max_bytes_for_level_multiplier = 2;
-  options.compression = kNoCompression;
-  options = CurrentOptions(options);
-  CreateAndReopenWithCF({"pikachu"}, options);
-
-  Random rnd(301);
-  for (int key = 64 * kEntriesPerBuffer; key >= 0; --key) {
-    ASSERT_OK(Put(1, ToString(key), RandomString(&rnd, kTestValueSize)));
-  }
-  dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
-  dbfull()->TEST_WaitForCompact();
-
-  ColumnFamilyMetaData cf_meta;
-  dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta);
-  int output_level = static_cast<int>(cf_meta.levels.size()) - 1;
-  for (int file_picked = 5; file_picked > 0; --file_picked) {
-    std::set<std::string> overlapping_file_names;
-    std::vector<std::string> compaction_input_file_names;
-    for (int f = 0; f < file_picked; ++f) {
-      int level;
-      auto file_meta = PickFileRandomly(cf_meta, &rnd, &level);
-      compaction_input_file_names.push_back(file_meta->name);
-      GetOverlappingFileNumbersForLevelCompaction(
-          cf_meta, options.comparator, level, output_level,
-          file_meta, &overlapping_file_names);
-    }
-
-    ASSERT_OK(dbfull()->CompactFiles(
-        CompactionOptions(), handles_[1],
-        compaction_input_file_names,
-        output_level));
-
-    // Make sure all overlapping files do not exist after compaction
-    dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta);
-    VerifyCompactionResult(cf_meta, overlapping_file_names);
-  }
-
-  // make sure all key-values are still there.
-  for (int key = 64 * kEntriesPerBuffer; key >= 0; --key) {
-    ASSERT_NE(Get(1, ToString(key)), "NOT_FOUND");
-  }
-}
-
-TEST_F(DBTest, CompactFilesOnUniversalCompaction) {
-  const int kTestKeySize = 16;
-  const int kTestValueSize = 984;
-  const int kEntrySize = kTestKeySize + kTestValueSize;
-  const int kEntriesPerBuffer = 10;
-
-  ChangeCompactOptions();
-  Options options;
-  options.create_if_missing = true;
-  options.write_buffer_size = kEntrySize * kEntriesPerBuffer;
-  options.compaction_style = kCompactionStyleLevel;
-  options.num_levels = 1;
-  options.target_file_size_base = options.write_buffer_size;
-  options.compression = kNoCompression;
-  options = CurrentOptions(options);
-  CreateAndReopenWithCF({"pikachu"}, options);
-  ASSERT_EQ(options.compaction_style, kCompactionStyleUniversal);
-  Random rnd(301);
-  for (int key = 1024 * kEntriesPerBuffer; key >= 0; --key) {
-    ASSERT_OK(Put(1, ToString(key), RandomString(&rnd, kTestValueSize)));
-  }
-  dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
-  dbfull()->TEST_WaitForCompact();
-  ColumnFamilyMetaData cf_meta;
-  dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta);
-  std::vector<std::string> compaction_input_file_names;
-  for (auto file : cf_meta.levels[0].files) {
-    if (rnd.OneIn(2)) {
-      compaction_input_file_names.push_back(file.name);
-    }
-  }
-
-  if (compaction_input_file_names.size() == 0) {
-    compaction_input_file_names.push_back(
-        cf_meta.levels[0].files[0].name);
-  }
+ * This test is not reliable enough as it heavily depends on disk behavior.
+ */
+TEST_F(DBTest, RateLimitingTest) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 1 << 20;         // 1MB
+  options.level0_file_num_compaction_trigger = 2;
+  options.target_file_size_base = 1 << 20;     // 1MB
+  options.max_bytes_for_level_base = 4 << 20;  // 4MB
+  options.max_bytes_for_level_multiplier = 4;
+  options.compression = kNoCompression;
+  options.create_if_missing = true;
+  options.env = env_;
+  options.IncreaseParallelism(4);
+  DestroyAndReopen(options);
 
-  // expect fail since universal compaction only allow L0 output
-  ASSERT_TRUE(!dbfull()->CompactFiles(
-      CompactionOptions(), handles_[1],
-      compaction_input_file_names, 1).ok());
+  WriteOptions wo;
+  wo.disableWAL = true;
 
-  // expect ok and verify the compacted files no longer exist.
-  ASSERT_OK(dbfull()->CompactFiles(
-      CompactionOptions(), handles_[1],
-      compaction_input_file_names, 0));
+  // # no rate limiting
+  Random rnd(301);
+  uint64_t start = env_->NowMicros();
+  // Write ~96MB of data
+  for (int64_t i = 0; i < (96 << 10); ++i) {
+    ASSERT_OK(Put(RandomString(&rnd, 32),
+                  RandomString(&rnd, (1 << 10) + 1), wo));
+  }
+  uint64_t elapsed = env_->NowMicros() - start;
+  double raw_rate = env_->bytes_written_ * 1000000 / elapsed;
+  Close();
 
-  dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta);
-  VerifyCompactionResult(
-      cf_meta,
-      std::set<std::string>(compaction_input_file_names.begin(),
-          compaction_input_file_names.end()));
+  // # rate limiting with 0.7 x threshold
+  options.rate_limiter.reset(
+    NewGenericRateLimiter(static_cast<int64_t>(0.7 * raw_rate)));
+  env_->bytes_written_ = 0;
+  DestroyAndReopen(options);
 
-  compaction_input_file_names.clear();
+  start = env_->NowMicros();
+  // Write ~96MB of data
+  for (int64_t i = 0; i < (96 << 10); ++i) {
+    ASSERT_OK(Put(RandomString(&rnd, 32),
+                  RandomString(&rnd, (1 << 10) + 1), wo));
+  }
+  elapsed = env_->NowMicros() - start;
+  Close();
+  ASSERT_EQ(options.rate_limiter->GetTotalBytesThrough(), env_->bytes_written_);
+  double ratio = env_->bytes_written_ * 1000000 / elapsed / raw_rate;
+  fprintf(stderr, "write rate ratio = %.2lf, expected 0.7\n", ratio);
+  ASSERT_TRUE(ratio < 0.8);
 
-  // Pick the first and the last file, expect everything is
-  // compacted into one single file.
-  compaction_input_file_names.push_back(
-      cf_meta.levels[0].files[0].name);
-  compaction_input_file_names.push_back(
-      cf_meta.levels[0].files[
-          cf_meta.levels[0].files.size() - 1].name);
-  ASSERT_OK(dbfull()->CompactFiles(
-      CompactionOptions(), handles_[1],
-      compaction_input_file_names, 0));
+  // # rate limiting with half of the raw_rate
+  options.rate_limiter.reset(
+    NewGenericRateLimiter(static_cast<int64_t>(raw_rate / 2)));
+  env_->bytes_written_ = 0;
+  DestroyAndReopen(options);
 
-  dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta);
-  ASSERT_EQ(cf_meta.levels[0].files.size(), 1U);
+  start = env_->NowMicros();
+  // Write ~96MB of data
+  for (int64_t i = 0; i < (96 << 10); ++i) {
+    ASSERT_OK(Put(RandomString(&rnd, 32),
+                  RandomString(&rnd, (1 << 10) + 1), wo));
+  }
+  elapsed = env_->NowMicros() - start;
+  Close();
+  ASSERT_EQ(options.rate_limiter->GetTotalBytesThrough(), env_->bytes_written_);
+  ratio = env_->bytes_written_ * 1000000 / elapsed / raw_rate;
+  fprintf(stderr, "write rate ratio = %.2lf, expected 0.5\n", ratio);
+  ASSERT_LT(ratio, 0.6);
 }
 
 TEST_F(DBTest, TableOptionsSanitizeTest) {
@@ -10159,7 +6337,7 @@ TEST_F(DBTest, TableOptionsSanitizeTest) {
   options.table_factory.reset(new PlainTableFactory());
   options.prefix_extractor.reset(NewNoopTransform());
   Destroy(options);
-  ASSERT_TRUE(TryReopen(options).IsNotSupported());
+  ASSERT_TRUE(!TryReopen(options).IsNotSupported());
 
   // Test for check of prefix_extractor when hash index is used for
   // block-based table
@@ -10176,7 +6354,7 @@ TEST_F(DBTest, TableOptionsSanitizeTest) {
 TEST_F(DBTest, SanitizeNumThreads) {
   for (int attempt = 0; attempt < 2; attempt++) {
     const size_t kTotalTasks = 8;
-    SleepingBackgroundTask sleeping_tasks[kTotalTasks];
+    test::SleepingBackgroundTask sleeping_tasks[kTotalTasks];
 
     Options options = CurrentOptions();
     if (attempt == 0) {
@@ -10188,2816 +6366,3512 @@ TEST_F(DBTest, SanitizeNumThreads) {
 
     for (size_t i = 0; i < kTotalTasks; i++) {
      // Insert 4 tasks into the low priority queue and 4 into the high priority queue
-      env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_tasks[i],
+      env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+                     &sleeping_tasks[i],
                      (i < 4) ? Env::Priority::LOW : Env::Priority::HIGH);
     }
 
    // Wait 100 milliseconds for them to be scheduled.
     env_->SleepForMicroseconds(100000);
 
-    // pool size 3, total task 4. Queue size should be 1.
-    ASSERT_EQ(1U, options.env->GetThreadPoolQueueLen(Env::Priority::LOW));
-    // pool size 2, total task 4. Queue size should be 2.
-    ASSERT_EQ(2U, options.env->GetThreadPoolQueueLen(Env::Priority::HIGH));
+    // pool size 3, total task 4. Queue size should be 1.
+    ASSERT_EQ(1U, options.env->GetThreadPoolQueueLen(Env::Priority::LOW));
+    // pool size 2, total task 4. Queue size should be 2.
+    ASSERT_EQ(2U, options.env->GetThreadPoolQueueLen(Env::Priority::HIGH));
+
+    for (size_t i = 0; i < kTotalTasks; i++) {
+      sleeping_tasks[i].WakeUp();
+      sleeping_tasks[i].WaitUntilDone();
+    }
+
+    ASSERT_OK(Put("abc", "def"));
+    ASSERT_EQ("def", Get("abc"));
+    Flush();
+    ASSERT_EQ("def", Get("abc"));
+  }
+}
+
+TEST_F(DBTest, DBIteratorBoundTest) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.create_if_missing = true;
+
+  options.prefix_extractor = nullptr;
+  DestroyAndReopen(options);
+  ASSERT_OK(Put("a", "0"));
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Put("foo1", "bar1"));
+  ASSERT_OK(Put("g1", "0"));
+
+  // testing basic case with no iterate_upper_bound and no prefix_extractor
+  {
+    ReadOptions ro;
+    ro.iterate_upper_bound = nullptr;
+
+    std::unique_ptr<Iterator> iter(db_->NewIterator(ro));
+
+    iter->Seek("foo");
+
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Slice("foo")), 0);
+
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Slice("foo1")), 0);
+
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Slice("g1")), 0);
+  }
+
+  // testing iterate_upper_bound and forward iterator
+  // to make sure it stops at bound
+  {
+    ReadOptions ro;
+    // iterate_upper_bound points beyond the last expected entry
+    Slice prefix("foo2");
+    ro.iterate_upper_bound = &prefix;
+
+    std::unique_ptr<Iterator> iter(db_->NewIterator(ro));
+
+    iter->Seek("foo");
+
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Slice("foo")), 0);
+
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(("foo1")), 0);
+
+    iter->Next();
+    // should stop here...
+    ASSERT_TRUE(!iter->Valid());
+  }
+  // Testing SeekToLast with iterate_upper_bound set
+  {
+    ReadOptions ro;
+
+    Slice prefix("foo");
+    ro.iterate_upper_bound = &prefix;
+
+    std::unique_ptr<Iterator> iter(db_->NewIterator(ro));
+
+    iter->SeekToLast();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Slice("a")), 0);
+  }
+
+  // prefix is the first letter of the key
+  options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+
+  DestroyAndReopen(options);
+  ASSERT_OK(Put("a", "0"));
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Put("foo1", "bar1"));
+  ASSERT_OK(Put("g1", "0"));
+
+  // testing with iterate_upper_bound and prefix_extractor
+  // Seek target and iterate_upper_bound are not in the same prefix;
+  // iteration should still work and stop once the bound is reached
+  {
+    ReadOptions ro;
+    Slice upper_bound("g");
+    ro.iterate_upper_bound = &upper_bound;
+
+    std::unique_ptr<Iterator> iter(db_->NewIterator(ro));
+
+    iter->Seek("foo");
+
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("foo", iter->key().ToString());
+
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ("foo1", iter->key().ToString());
+
+    iter->Next();
+    ASSERT_TRUE(!iter->Valid());
+  }
+
+  // testing that iterate_upper_bound prevents iterating over deleted items
+  // if the bound has already been reached
+  {
+    options.prefix_extractor = nullptr;
+    DestroyAndReopen(options);
+    ASSERT_OK(Put("a", "0"));
+    ASSERT_OK(Put("b", "0"));
+    ASSERT_OK(Put("b1", "0"));
+    ASSERT_OK(Put("c", "0"));
+    ASSERT_OK(Put("d", "0"));
+    ASSERT_OK(Put("e", "0"));
+    ASSERT_OK(Delete("c"));
+    ASSERT_OK(Delete("d"));
+
+    // base case with no bound
+    ReadOptions ro;
+    ro.iterate_upper_bound = nullptr;
+
+    std::unique_ptr<Iterator> iter(db_->NewIterator(ro));
+
+    iter->Seek("b");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Slice("b")), 0);
+
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(("b1")), 0);
+
+    perf_context.Reset();
+    iter->Next();
+
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(static_cast<int>(perf_context.internal_delete_skipped_count), 2);
+
+    // now testing with iterate_bound
+    Slice prefix("c");
+    ro.iterate_upper_bound = &prefix;
+
+    iter.reset(db_->NewIterator(ro));
+
+    perf_context.Reset();
+
+    iter->Seek("b");
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Slice("b")), 0);
+
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(("b1")), 0);
+
+    iter->Next();
+    // the iteration should stop as soon as the bound key is reached
+    // even though the key is deleted
+    // hence internal_delete_skipped_count should be 0
+    ASSERT_TRUE(!iter->Valid());
+    ASSERT_EQ(static_cast<int>(perf_context.internal_delete_skipped_count), 0);
+  }
+}
+
+TEST_F(DBTest, WriteSingleThreadEntry) {
+  std::vector<std::thread> threads;
+  dbfull()->TEST_LockMutex();
+  auto w = dbfull()->TEST_BeginWrite();
+  threads.emplace_back([&] { Put("a", "b"); });
+  env_->SleepForMicroseconds(10000);
+  threads.emplace_back([&] { Flush(); });
+  env_->SleepForMicroseconds(10000);
+  dbfull()->TEST_UnlockMutex();
+  dbfull()->TEST_LockMutex();
+  dbfull()->TEST_EndWrite(w);
+  dbfull()->TEST_UnlockMutex();
+
+  for (auto& t : threads) {
+    t.join();
+  }
+}
+
+TEST_F(DBTest, DisableDataSyncTest) {
+  env_->sync_counter_.store(0);
+  // iter 0 -- no sync
+  // iter 1 -- sync
+  for (int iter = 0; iter < 2; ++iter) {
+    Options options = CurrentOptions();
+    options.disableDataSync = iter == 0;
+    options.create_if_missing = true;
+    options.num_levels = 10;
+    options.env = env_;
+    Reopen(options);
+    CreateAndReopenWithCF({"pikachu"}, options);
+
+    MakeTables(10, "a", "z");
+    Compact("a", "z");
 
-    for (size_t i = 0; i < kTotalTasks; i++) {
-      sleeping_tasks[i].WakeUp();
-      sleeping_tasks[i].WaitUntilDone();
+    if (iter == 0) {
+      ASSERT_EQ(env_->sync_counter_.load(), 0);
+    } else {
+      ASSERT_GT(env_->sync_counter_.load(), 0);
     }
-
-    ASSERT_OK(Put("abc", "def"));
-    ASSERT_EQ("def", Get("abc"));
-    Flush();
-    ASSERT_EQ("def", Get("abc"));
+    Destroy(options);
   }
 }
 
-TEST_F(DBTest, DBIteratorBoundTest) {
-  Options options = CurrentOptions();
+TEST_F(DBTest, DynamicMemtableOptions) {
+  const uint64_t k64KB = 1 << 16;
+  const uint64_t k128KB = 1 << 17;
+  const uint64_t k5KB = 5 * 1024;
+  const int kNumPutsBeforeWaitForFlush = 64;
+  Options options;
   options.env = env_;
   options.create_if_missing = true;
-
-  options.prefix_extractor = nullptr;
+  options.compression = kNoCompression;
+  options.max_background_compactions = 1;
+  options.write_buffer_size = k64KB;
+  options.arena_block_size = 16 * 1024;
+  options.max_write_buffer_number = 2;
+  // Don't trigger compact/slowdown/stop
+  options.level0_file_num_compaction_trigger = 1024;
+  options.level0_slowdown_writes_trigger = 1024;
+  options.level0_stop_writes_trigger = 1024;
   DestroyAndReopen(options);
-  ASSERT_OK(Put("a", "0"));
-  ASSERT_OK(Put("foo", "bar"));
-  ASSERT_OK(Put("foo1", "bar1"));
-  ASSERT_OK(Put("g1", "0"));
 
-  // testing basic case with no iterate_upper_bound and no prefix_extractor
-  {
-    ReadOptions ro;
-    ro.iterate_upper_bound = nullptr;
+  auto gen_l0_kb = [this, kNumPutsBeforeWaitForFlush](int size) {
+    Random rnd(301);
+    for (int i = 0; i < size; i++) {
+      ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024)));
 
-    std::unique_ptr<Iterator> iter(db_->NewIterator(ro));
+      // The following condition prevents a race condition between flush jobs
+      // acquiring work and this thread filling up multiple memtables. Without
+      // this, the flush might produce less files than expected because
+      // multiple memtables are flushed into a single L0 file. This race
+      // condition affects assertion (A).
+      if (i % kNumPutsBeforeWaitForFlush == kNumPutsBeforeWaitForFlush - 1) {
+        dbfull()->TEST_WaitForFlushMemTable();
+      }
+    }
+    dbfull()->TEST_WaitForFlushMemTable();
+  };
 
-    iter->Seek("foo");
+  // Test write_buffer_size
+  gen_l0_kb(64);
+  ASSERT_EQ(NumTableFilesAtLevel(0), 1);
+  ASSERT_LT(SizeAtLevel(0), k64KB + k5KB);
+  ASSERT_GT(SizeAtLevel(0), k64KB - k5KB * 2);
 
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ(iter->key().compare(Slice("foo")), 0);
+  // Clean up L0
+  dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
 
-    iter->Next();
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ(iter->key().compare(Slice("foo1")), 0);
+  // Increase buffer size
+  ASSERT_OK(dbfull()->SetOptions({
+    {"write_buffer_size", "131072"},
+  }));
 
-    iter->Next();
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ(iter->key().compare(Slice("g1")), 0);
+  // The existing memtable is still 64KB in size; after it becomes immutable,
+  // the next memtable will be 128KB. Writing 256KB in total, we should end
+  // up with a 64KB L0 file, a 128KB L0 file, and a memtable with 64KB data
+  gen_l0_kb(256);
+  ASSERT_EQ(NumTableFilesAtLevel(0), 2);  // (A)
+  ASSERT_LT(SizeAtLevel(0), k128KB + k64KB + 2 * k5KB);
+  ASSERT_GT(SizeAtLevel(0), k128KB + k64KB - 4 * k5KB);
+
+  // Test max_write_buffer_number
+  // Block compaction thread, which will also block the flushes because
+  // max_background_flushes == 0, so flushes are getting executed by the
+  // compaction thread
+  env_->SetBackgroundThreads(1, Env::LOW);
+  test::SleepingBackgroundTask sleeping_task_low;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+  // Start from scratch and disable compaction/flush. Flush can only happen
+  // during compaction, but the trigger is set too high for that to fire
+  options.max_background_flushes = 0;
+  options.disable_auto_compactions = true;
+  DestroyAndReopen(options);
+
+  // Put until writes are stopped, bounded by 256 puts. We should see stop at
+  // ~128KB
+  int count = 0;
+  Random rnd(301);
+
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::DelayWrite:Wait",
+      [&](void* arg) { sleeping_task_low.WakeUp(); });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  while (!sleeping_task_low.WokenUp() && count < 256) {
+    ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), WriteOptions()));
+    count++;
   }
+  ASSERT_GT(static_cast<double>(count), 128 * 0.8);
+  ASSERT_LT(static_cast<double>(count), 128 * 1.2);
 
-  // testing iterate_upper_bound and forward iterator
-  // to make sure it stops at bound
-  {
-    ReadOptions ro;
-    // iterate_upper_bound points beyond the last expected entry
-    Slice prefix("foo2");
-    ro.iterate_upper_bound = &prefix;
+  sleeping_task_low.WaitUntilDone();
 
-    std::unique_ptr<Iterator> iter(db_->NewIterator(ro));
+  // Increase
+  ASSERT_OK(dbfull()->SetOptions({
+    {"max_write_buffer_number", "8"},
+  }));
+  // Clean up memtable and L0
+  dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
 
-    iter->Seek("foo");
+  sleeping_task_low.Reset();
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+  count = 0;
+  while (!sleeping_task_low.WokenUp() && count < 1024) {
+    ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), WriteOptions()));
+    count++;
+  }
+  // Windows fails this test. Will tune in the future and figure out an
+  // appropriate number
+#ifndef OS_WIN
+  ASSERT_GT(static_cast<double>(count), 512 * 0.8);
+  ASSERT_LT(static_cast<double>(count), 512 * 1.2);
+#endif
+  sleeping_task_low.WaitUntilDone();
 
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ(iter->key().compare(Slice("foo")), 0);
+  // Decrease
+  ASSERT_OK(dbfull()->SetOptions({
+    {"max_write_buffer_number", "4"},
+  }));
+  // Clean up memtable and L0
+  dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
 
-    iter->Next();
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ(iter->key().compare(("foo1")), 0);
+  sleeping_task_low.Reset();
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
 
-    iter->Next();
-    // should stop here...
-    ASSERT_TRUE(!iter->Valid());
+  count = 0;
+  while (!sleeping_task_low.WokenUp() && count < 1024) {
+    ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), WriteOptions()));
+    count++;
   }
+  // Windows fails this test. Will tune in the future and figure out an
+  // appropriate number
+#ifndef OS_WIN
+  ASSERT_GT(static_cast<double>(count), 256 * 0.8);
+  ASSERT_LT(static_cast<double>(count), 256 * 1.2);
+#endif
+  sleeping_task_low.WaitUntilDone();
 
-  // prefix is the first letter of the key
-  options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
 
-  DestroyAndReopen(options);
-  ASSERT_OK(Put("a", "0"));
-  ASSERT_OK(Put("foo", "bar"));
-  ASSERT_OK(Put("foo1", "bar1"));
-  ASSERT_OK(Put("g1", "0"));
+#if ROCKSDB_USING_THREAD_STATUS
+namespace {
+void VerifyOperationCount(Env* env, ThreadStatus::OperationType op_type,
+                          int expected_count) {
+  int op_count = 0;
+  std::vector<ThreadStatus> thread_list;
+  ASSERT_OK(env->GetThreadList(&thread_list));
+  for (auto thread : thread_list) {
+    if (thread.operation_type == op_type) {
+      op_count++;
+    }
+  }
+  ASSERT_EQ(op_count, expected_count);
+}
+}  // namespace
 
-  // testing with iterate_upper_bound and prefix_extractor
-  // Seek target and iterate_upper_bound are not is same prefix
-  // This should be an error
-  {
-    ReadOptions ro;
-    Slice prefix("g1");
-    ro.iterate_upper_bound = &prefix;
+TEST_F(DBTest, GetThreadStatus) {
+  Options options;
+  options.env = env_;
+  options.enable_thread_tracking = true;
+  TryReopen(options);
 
-    std::unique_ptr<Iterator> iter(db_->NewIterator(ro));
+  std::vector<ThreadStatus> thread_list;
+  Status s = env_->GetThreadList(&thread_list);
+
+  for (int i = 0; i < 2; ++i) {
+    // repeat the test with different numbers of high / low priority threads
+    const int kTestCount = 3;
+    const unsigned int kHighPriCounts[kTestCount] = {3, 2, 5};
+    const unsigned int kLowPriCounts[kTestCount] = {10, 15, 3};
+    for (int test = 0; test < kTestCount; ++test) {
+      // Change the number of threads in high / low priority pool.
+      env_->SetBackgroundThreads(kHighPriCounts[test], Env::HIGH);
+      env_->SetBackgroundThreads(kLowPriCounts[test], Env::LOW);
+      // Wait to ensure all threads have been registered
+      env_->SleepForMicroseconds(100000);
+      s = env_->GetThreadList(&thread_list);
+      ASSERT_OK(s);
+      unsigned int thread_type_counts[ThreadStatus::NUM_THREAD_TYPES];
+      memset(thread_type_counts, 0, sizeof(thread_type_counts));
+      for (auto thread : thread_list) {
+        ASSERT_LT(thread.thread_type, ThreadStatus::NUM_THREAD_TYPES);
+        thread_type_counts[thread.thread_type]++;
+      }
+      // Verify the total number of threads
+      ASSERT_EQ(
+          thread_type_counts[ThreadStatus::HIGH_PRIORITY] +
+              thread_type_counts[ThreadStatus::LOW_PRIORITY],
+          kHighPriCounts[test] + kLowPriCounts[test]);
+      // Verify the number of high-priority threads
+      ASSERT_EQ(
+          thread_type_counts[ThreadStatus::HIGH_PRIORITY],
+          kHighPriCounts[test]);
+      // Verify the number of low-priority threads
+      ASSERT_EQ(
+          thread_type_counts[ThreadStatus::LOW_PRIORITY],
+          kLowPriCounts[test]);
+    }
+    if (i == 0) {
+      // repeat the test with multiple column families
+      CreateAndReopenWithCF({"pikachu", "about-to-remove"}, options);
+      env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap(
+          handles_, true);
+    }
+  }
+  db_->DropColumnFamily(handles_[2]);
+  delete handles_[2];
+  handles_.erase(handles_.begin() + 2);
+  env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap(
+      handles_, true);
+  Close();
+  env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap(
+      handles_, true);
+}
+
+TEST_F(DBTest, DisableThreadStatus) {
+  Options options;
+  options.env = env_;
+  options.enable_thread_tracking = false;
+  TryReopen(options);
+  CreateAndReopenWithCF({"pikachu", "about-to-remove"}, options);
+  // Verify that none of the column family info exists
+  env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap(
+      handles_, false);
+}
+
+TEST_F(DBTest, ThreadStatusFlush) {
+  Options options;
+  options.env = env_;
+  options.write_buffer_size = 100000;  // Small write buffer
+  options.enable_thread_tracking = true;
+  options = CurrentOptions(options);
+
+  rocksdb::SyncPoint::GetInstance()->LoadDependency({
+      {"FlushJob::FlushJob()", "DBTest::ThreadStatusFlush:1"},
+      {"DBTest::ThreadStatusFlush:2",
+       "FlushJob::LogAndNotifyTableFileCreation()"},
+  });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
 
-    iter->Seek("foo");
+  CreateAndReopenWithCF({"pikachu"}, options);
+  VerifyOperationCount(env_, ThreadStatus::OP_FLUSH, 0);
 
-    ASSERT_TRUE(!iter->Valid());
-    ASSERT_TRUE(iter->status().IsInvalidArgument());
-  }
+  ASSERT_OK(Put(1, "foo", "v1"));
+  ASSERT_EQ("v1", Get(1, "foo"));
+  VerifyOperationCount(env_, ThreadStatus::OP_FLUSH, 0);
 
-  // testing that iterate_upper_bound prevents iterating over deleted items
-  // if the bound has already reached
-  {
-    options.prefix_extractor = nullptr;
-    DestroyAndReopen(options);
-    ASSERT_OK(Put("a", "0"));
-    ASSERT_OK(Put("b", "0"));
-    ASSERT_OK(Put("b1", "0"));
-    ASSERT_OK(Put("c", "0"));
-    ASSERT_OK(Put("d", "0"));
-    ASSERT_OK(Put("e", "0"));
-    ASSERT_OK(Delete("c"));
-    ASSERT_OK(Delete("d"));
+  Put(1, "k1", std::string(100000, 'x'));  // Fill memtable
+  Put(1, "k2", std::string(100000, 'y'));  // Trigger flush
 
-    // base case with no bound
-    ReadOptions ro;
-    ro.iterate_upper_bound = nullptr;
+  // The first sync point is to make sure there's one flush job
+  // running when we perform VerifyOperationCount().
+  TEST_SYNC_POINT("DBTest::ThreadStatusFlush:1");
+  VerifyOperationCount(env_, ThreadStatus::OP_FLUSH, 1);
+  // This second sync point is to ensure the flush job will not
+  // be completed until we have performed VerifyOperationCount().
+  TEST_SYNC_POINT("DBTest::ThreadStatusFlush:2");
 
-    std::unique_ptr<Iterator> iter(db_->NewIterator(ro));
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
 
-    iter->Seek("b");
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ(iter->key().compare(Slice("b")), 0);
+TEST_P(DBTestWithParam, ThreadStatusSingleCompaction) {
+  const int kTestKeySize = 16;
+  const int kTestValueSize = 984;
+  const int kEntrySize = kTestKeySize + kTestValueSize;
+  const int kEntriesPerBuffer = 100;
+  Options options;
+  options.create_if_missing = true;
+  options.write_buffer_size = kEntrySize * kEntriesPerBuffer;
+  options.compaction_style = kCompactionStyleLevel;
+  options.target_file_size_base = options.write_buffer_size;
+  options.max_bytes_for_level_base = options.target_file_size_base * 2;
+  options.max_bytes_for_level_multiplier = 2;
+  options.compression = kNoCompression;
+  options = CurrentOptions(options);
+  options.env = env_;
+  options.enable_thread_tracking = true;
+  const int kNumL0Files = 4;
+  options.level0_file_num_compaction_trigger = kNumL0Files;
+  options.max_subcompactions = max_subcompactions_;
 
-    iter->Next();
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ(iter->key().compare(("b1")), 0);
+  rocksdb::SyncPoint::GetInstance()->LoadDependency({
+      {"DBTest::ThreadStatusSingleCompaction:0", "DBImpl::BGWorkCompaction"},
+      {"CompactionJob::Run():Start", "DBTest::ThreadStatusSingleCompaction:1"},
+      {"DBTest::ThreadStatusSingleCompaction:2", "CompactionJob::Run():End"},
+  });
+  for (int tests = 0; tests < 2; ++tests) {
+    DestroyAndReopen(options);
+    rocksdb::SyncPoint::GetInstance()->ClearTrace();
+    rocksdb::SyncPoint::GetInstance()->EnableProcessing();
 
-    perf_context.Reset();
-    iter->Next();
+    Random rnd(301);
+    // The Put Phase.
+    for (int file = 0; file < kNumL0Files; ++file) {
+      for (int key = 0; key < kEntriesPerBuffer; ++key) {
+        ASSERT_OK(Put(ToString(key + file * kEntriesPerBuffer),
+                      RandomString(&rnd, kTestValueSize)));
+      }
+      Flush();
+    }
+    // This makes sure a compaction won't be scheduled until
+    // we are done with the above Put Phase.
+    TEST_SYNC_POINT("DBTest::ThreadStatusSingleCompaction:0");
+    ASSERT_GE(NumTableFilesAtLevel(0),
+              options.level0_file_num_compaction_trigger);
 
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ(static_cast<int>(perf_context.internal_delete_skipped_count), 2);
+    // This makes sure at least one compaction is running.
+    TEST_SYNC_POINT("DBTest::ThreadStatusSingleCompaction:1");
 
-    // now testing with iterate_bound
-    Slice prefix("c");
-    ro.iterate_upper_bound = &prefix;
+    if (options.enable_thread_tracking) {
+      // expecting a single L0 to L1 compaction
+      VerifyOperationCount(env_, ThreadStatus::OP_COMPACTION, 1);
+    } else {
+      // If thread tracking is not enabled, compaction count should be 0.
+      VerifyOperationCount(env_, ThreadStatus::OP_COMPACTION, 0);
+    }
+    // TODO(yhchiang): add asserts to verify each compaction stage.
+    TEST_SYNC_POINT("DBTest::ThreadStatusSingleCompaction:2");
 
-    iter.reset(db_->NewIterator(ro));
+    // Repeat the test with thread tracking disabled.
+    options.enable_thread_tracking = false;
+    rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+  }
+}
 
-    perf_context.Reset();
+TEST_P(DBTestWithParam, PreShutdownManualCompaction) {
+  Options options = CurrentOptions();
+  options.max_background_flushes = 0;
+  options.max_subcompactions = max_subcompactions_;
+  CreateAndReopenWithCF({"pikachu"}, options);
 
-    iter->Seek("b");
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ(iter->key().compare(Slice("b")), 0);
+  // iter - 0 with 7 levels
+  // iter - 1 with 3 levels
+  for (int iter = 0; iter < 2; ++iter) {
+    MakeTables(3, "p", "q", 1);
+    ASSERT_EQ("1,1,1", FilesPerLevel(1));
 
-    iter->Next();
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ(iter->key().compare(("b1")), 0);
+    // Compaction range falls before files
+    Compact(1, "", "c");
+    ASSERT_EQ("1,1,1", FilesPerLevel(1));
 
-    iter->Next();
-    // the iteration should stop as soon as the bound key is reached
-    // even though the key is deleted
-    // hence internal_delete_skipped_count should be 0
-    ASSERT_TRUE(!iter->Valid());
-    ASSERT_EQ(static_cast<int>(perf_context.internal_delete_skipped_count), 0);
-  }
-}
+    // Compaction range falls after files
+    Compact(1, "r", "z");
+    ASSERT_EQ("1,1,1", FilesPerLevel(1));
 
-TEST_F(DBTest, WriteSingleThreadEntry) {
-  std::vector<std::thread> threads;
-  dbfull()->TEST_LockMutex();
-  auto w = dbfull()->TEST_BeginWrite();
-  threads.emplace_back([&] { Put("a", "b"); });
-  env_->SleepForMicroseconds(10000);
-  threads.emplace_back([&] { Flush(); });
-  env_->SleepForMicroseconds(10000);
-  dbfull()->TEST_UnlockMutex();
-  dbfull()->TEST_LockMutex();
-  dbfull()->TEST_EndWrite(w);
-  dbfull()->TEST_UnlockMutex();
+    // Compaction range overlaps files
+    Compact(1, "p1", "p9");
+    ASSERT_EQ("0,0,1", FilesPerLevel(1));
 
-  for (auto& t : threads) {
-    t.join();
-  }
-}
+    // Populate a different range
+    MakeTables(3, "c", "e", 1);
+    ASSERT_EQ("1,1,2", FilesPerLevel(1));
 
-TEST_F(DBTest, DisableDataSyncTest) {
-  env_->sync_counter_.store(0);
-  // iter 0 -- no sync
-  // iter 1 -- sync
-  for (int iter = 0; iter < 2; ++iter) {
-    Options options = CurrentOptions();
-    options.disableDataSync = iter == 0;
-    options.create_if_missing = true;
-    options.env = env_;
-    Reopen(options);
-    CreateAndReopenWithCF({"pikachu"}, options);
+    // Compact just the new range
+    Compact(1, "b", "f");
+    ASSERT_EQ("0,0,2", FilesPerLevel(1));
 
-    MakeTables(10, "a", "z");
-    Compact("a", "z");
+    // Compact all
+    MakeTables(1, "a", "z", 1);
+    ASSERT_EQ("1,0,2", FilesPerLevel(1));
+    CancelAllBackgroundWork(db_);
+    db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr);
+    ASSERT_EQ("1,0,2", FilesPerLevel(1));
 
     if (iter == 0) {
-      ASSERT_EQ(env_->sync_counter_.load(), 0);
-    } else {
-      ASSERT_GT(env_->sync_counter_.load(), 0);
+      options = CurrentOptions();
+      options.max_background_flushes = 0;
+      options.num_levels = 3;
+      options.create_if_missing = true;
+      DestroyAndReopen(options);
+      CreateAndReopenWithCF({"pikachu"}, options);
     }
-    Destroy(options);
   }
 }
 
-TEST_F(DBTest, DynamicMemtableOptions) {
-  const uint64_t k64KB = 1 << 16;
-  const uint64_t k128KB = 1 << 17;
-  const uint64_t k5KB = 5 * 1024;
-  Options options;
-  options.env = env_;
-  options.create_if_missing = true;
-  options.compression = kNoCompression;
-  options.max_background_compactions = 1;
-  options.max_mem_compaction_level = 0;
-  options.write_buffer_size = k64KB;
-  options.max_write_buffer_number = 2;
-  // Don't trigger compact/slowdown/stop
-  options.level0_file_num_compaction_trigger = 1024;
-  options.level0_slowdown_writes_trigger = 1024;
-  options.level0_stop_writes_trigger = 1024;
-  DestroyAndReopen(options);
-
-  auto gen_l0_kb = [this](int size) {
-    Random rnd(301);
-    for (int i = 0; i < size; i++) {
-      ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024)));
-    }
-    dbfull()->TEST_WaitForFlushMemTable();
-  };
-
-  // Test write_buffer_size
-  gen_l0_kb(64);
-  ASSERT_EQ(NumTableFilesAtLevel(0), 1);
-  ASSERT_LT(SizeAtLevel(0), k64KB + k5KB);
-  ASSERT_GT(SizeAtLevel(0), k64KB - k5KB);
-
-  // Clean up L0
-  dbfull()->CompactRange(nullptr, nullptr);
-  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+TEST_F(DBTest, PreShutdownFlush) {
+  Options options = CurrentOptions();
+  options.max_background_flushes = 0;
+  CreateAndReopenWithCF({"pikachu"}, options);
+  ASSERT_OK(Put(1, "key", "value"));
+  CancelAllBackgroundWork(db_);
+  Status s =
+      db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr);
+  ASSERT_TRUE(s.IsShutdownInProgress());
+}
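+// As the test above relies on, CancelAllBackgroundWork(db) stops background
+// flushes and compactions, after which a manual CompactRange() is expected
+// to fail with Status::ShutdownInProgress(). A minimal usage sketch:
+//
+//   CancelAllBackgroundWork(db);
+//   Status s = db->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+//   assert(s.IsShutdownInProgress());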
 
-  // Increase buffer size
-  ASSERT_OK(dbfull()->SetOptions({
-    {"write_buffer_size", "131072"},
-  }));
+TEST_P(DBTestWithParam, PreShutdownMultipleCompaction) {
+  const int kTestKeySize = 16;
+  const int kTestValueSize = 984;
+  const int kEntrySize = kTestKeySize + kTestValueSize;
+  const int kEntriesPerBuffer = 40;
+  const int kNumL0Files = 4;
 
-  // The existing memtable is still 64KB in size, after it becomes immutable,
-  // the next memtable will be 128KB in size. Write 256KB total, we should
-  // have a 64KB L0 file, a 128KB L0 file, and a memtable with 64KB data
-  gen_l0_kb(256);
-  ASSERT_EQ(NumTableFilesAtLevel(0), 2);
-  ASSERT_LT(SizeAtLevel(0), k128KB + k64KB + 2 * k5KB);
-  ASSERT_GT(SizeAtLevel(0), k128KB + k64KB - 2 * k5KB);
+  const int kHighPriCount = 3;
+  const int kLowPriCount = 5;
+  env_->SetBackgroundThreads(kHighPriCount, Env::HIGH);
+  env_->SetBackgroundThreads(kLowPriCount, Env::LOW);
 
-  // Test max_write_buffer_number
-  // Block compaction thread, which will also block the flushes because
-  // max_background_flushes == 0, so flushes are getting executed by the
-  // compaction thread
-  env_->SetBackgroundThreads(1, Env::LOW);
-  SleepingBackgroundTask sleeping_task_low1;
-  env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_low1,
-                 Env::Priority::LOW);
-  // Start from scratch and disable compaction/flush. Flush can only happen
-  // during compaction but trigger is pretty high
-  options.max_background_flushes = 0;
-  options.disable_auto_compactions = true;
-  DestroyAndReopen(options);
+  Options options;
+  options.create_if_missing = true;
+  options.write_buffer_size = kEntrySize * kEntriesPerBuffer;
+  options.compaction_style = kCompactionStyleLevel;
+  options.target_file_size_base = options.write_buffer_size;
+  options.max_bytes_for_level_base =
+      options.target_file_size_base * kNumL0Files;
+  options.compression = kNoCompression;
+  options = CurrentOptions(options);
+  options.env = env_;
+  options.enable_thread_tracking = true;
+  options.level0_file_num_compaction_trigger = kNumL0Files;
+  options.max_bytes_for_level_multiplier = 2;
+  options.max_background_compactions = kLowPriCount;
+  options.level0_stop_writes_trigger = 1 << 10;
+  options.level0_slowdown_writes_trigger = 1 << 10;
+  options.max_subcompactions = max_subcompactions_;
 
-  // Put until timeout, bounded by 256 puts. We should see timeout at ~128KB
-  int count = 0;
+  TryReopen(options);
   Random rnd(301);
-  WriteOptions wo;
-  wo.timeout_hint_us = 100000;  // Reasonably long timeout to make sure sleep
-                                // triggers but not forever.
 
-  std::atomic<int> sleep_count(0);
-  rocksdb::SyncPoint::GetInstance()->SetCallBack(
-      "DBImpl::DelayWrite:TimedWait",
-      [&](void* arg) { sleep_count.fetch_add(1); });
-  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+  std::vector<ThreadStatus> thread_list;
+  // Delay both flush and compaction
+  rocksdb::SyncPoint::GetInstance()->LoadDependency(
+      {{"FlushJob::FlushJob()", "CompactionJob::Run():Start"},
+       {"CompactionJob::Run():Start",
+        "DBTest::PreShutdownMultipleCompaction:Preshutdown"},
+        {"CompactionJob::Run():Start",
+        "DBTest::PreShutdownMultipleCompaction:VerifyCompaction"},
+       {"DBTest::PreShutdownMultipleCompaction:Preshutdown",
+        "CompactionJob::Run():End"},
+       {"CompactionJob::Run():End",
+        "DBTest::PreShutdownMultipleCompaction:VerifyPreshutdown"}});
 
-  while (Put(Key(count), RandomString(&rnd, 1024), wo).ok() && count < 256) {
-    count++;
-  }
-  ASSERT_GT(sleep_count.load(), 0);
-  ASSERT_GT(static_cast<double>(count), 128 * 0.8);
-  ASSERT_LT(static_cast<double>(count), 128 * 1.2);
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
 
-  sleeping_task_low1.WakeUp();
-  sleeping_task_low1.WaitUntilDone();
+  // Make rocksdb busy
+  int key = 0;
+  // check how many threads are doing compaction using GetThreadList
+  int operation_count[ThreadStatus::NUM_OP_TYPES] = {0};
+  for (int file = 0; file < 16 * kNumL0Files; ++file) {
+    for (int k = 0; k < kEntriesPerBuffer; ++k) {
+      ASSERT_OK(Put(ToString(key++), RandomString(&rnd, kTestValueSize)));
+    }
 
-  // Increase
-  ASSERT_OK(dbfull()->SetOptions({
-    {"max_write_buffer_number", "8"},
-  }));
-  // Clean up memtable and L0
-  dbfull()->CompactRange(nullptr, nullptr);
+    Status s = env_->GetThreadList(&thread_list);
+    for (auto thread : thread_list) {
+      operation_count[thread.operation_type]++;
+    }
 
-  SleepingBackgroundTask sleeping_task_low2;
-  env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_low2,
-                 Env::Priority::LOW);
-  count = 0;
-  sleep_count.store(0);
-  while (Put(Key(count), RandomString(&rnd, 1024), wo).ok() && count < 1024) {
-    count++;
+    // Speed up the test
+    if (operation_count[ThreadStatus::OP_FLUSH] > 1 &&
+        operation_count[ThreadStatus::OP_COMPACTION] >
+            0.6 * options.max_background_compactions) {
+      break;
+    }
+    if (file == 15 * kNumL0Files) {
+      TEST_SYNC_POINT("DBTest::PreShutdownMultipleCompaction:Preshutdown");
+    }
   }
-  ASSERT_GT(sleep_count.load(), 0);
-  ASSERT_GT(static_cast<double>(count), 512 * 0.8);
-  ASSERT_LT(static_cast<double>(count), 512 * 1.2);
-  sleeping_task_low2.WakeUp();
-  sleeping_task_low2.WaitUntilDone();
-
-  // Decrease
-  ASSERT_OK(dbfull()->SetOptions({
-    {"max_write_buffer_number", "4"},
-  }));
-  // Clean up memtable and L0
-  dbfull()->CompactRange(nullptr, nullptr);
-
-  SleepingBackgroundTask sleeping_task_low3;
-  env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_low3,
-                 Env::Priority::LOW);
 
-  count = 0;
-  sleep_count.store(0);
-  while (Put(Key(count), RandomString(&rnd, 1024), wo).ok() && count < 1024) {
-    count++;
+  TEST_SYNC_POINT("DBTest::PreShutdownMultipleCompaction:Preshutdown");
+  ASSERT_GE(operation_count[ThreadStatus::OP_COMPACTION], 1);
+  CancelAllBackgroundWork(db_);
+  TEST_SYNC_POINT("DBTest::PreShutdownMultipleCompaction:VerifyPreshutdown");
+  dbfull()->TEST_WaitForCompact();
+  // Reset the counters and record the operation counts after shutdown.
+  for (int i = 0; i < ThreadStatus::NUM_OP_TYPES; ++i) {
+    operation_count[i] = 0;
   }
-  ASSERT_GT(sleep_count.load(), 0);
-  ASSERT_GT(static_cast<double>(count), 256 * 0.8);
-  ASSERT_LT(static_cast<double>(count), 266 * 1.2);
-  sleeping_task_low3.WakeUp();
-  sleeping_task_low3.WaitUntilDone();
-
-  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
-}
-
-#if ROCKSDB_USING_THREAD_STATUS
-namespace {
-void VerifyOperationCount(Env* env, ThreadStatus::OperationType op_type,
-                          int expected_count) {
-  int op_count = 0;
-  std::vector<ThreadStatus> thread_list;
-  ASSERT_OK(env->GetThreadList(&thread_list));
+  Status s = env_->GetThreadList(&thread_list);
   for (auto thread : thread_list) {
-    if (thread.operation_type == op_type) {
-      op_count++;
-    }
+    operation_count[thread.operation_type]++;
   }
-  ASSERT_EQ(op_count, expected_count);
+  ASSERT_EQ(operation_count[ThreadStatus::OP_COMPACTION], 0);
 }
-}  // namespace
 
-TEST_F(DBTest, GetThreadStatus) {
+TEST_P(DBTestWithParam, PreShutdownCompactionMiddle) {
+  const int kTestKeySize = 16;
+  const int kTestValueSize = 984;
+  const int kEntrySize = kTestKeySize + kTestValueSize;
+  const int kEntriesPerBuffer = 40;
+  const int kNumL0Files = 4;
+
+  const int kHighPriCount = 3;
+  const int kLowPriCount = 5;
+  env_->SetBackgroundThreads(kHighPriCount, Env::HIGH);
+  env_->SetBackgroundThreads(kLowPriCount, Env::LOW);
+
   Options options;
+  options.create_if_missing = true;
+  options.write_buffer_size = kEntrySize * kEntriesPerBuffer;
+  options.compaction_style = kCompactionStyleLevel;
+  options.target_file_size_base = options.write_buffer_size;
+  options.max_bytes_for_level_base =
+      options.target_file_size_base * kNumL0Files;
+  options.compression = kNoCompression;
+  options = CurrentOptions(options);
   options.env = env_;
   options.enable_thread_tracking = true;
+  options.level0_file_num_compaction_trigger = kNumL0Files;
+  options.max_bytes_for_level_multiplier = 2;
+  options.max_background_compactions = kLowPriCount;
+  options.level0_stop_writes_trigger = 1 << 10;
+  options.level0_slowdown_writes_trigger = 1 << 10;
+  options.max_subcompactions = max_subcompactions_;
+
   TryReopen(options);
+  Random rnd(301);
 
   std::vector<ThreadStatus> thread_list;
-  Status s = env_->GetThreadList(&thread_list);
+  // Delay both flush and compaction
+  rocksdb::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBTest::PreShutdownCompactionMiddle:Preshutdown",
+        "CompactionJob::Run():Inprogress"},
+        {"CompactionJob::Run():Start",
+        "DBTest::PreShutdownCompactionMiddle:VerifyCompaction"},
+       {"CompactionJob::Run():Inprogress", "CompactionJob::Run():End"},
+       {"CompactionJob::Run():End",
+        "DBTest::PreShutdownCompactionMiddle:VerifyPreshutdown"}});
 
-  for (int i = 0; i < 2; ++i) {
-    // repeat the test with different numbers of high / low priority threads
-    const int kTestCount = 3;
-    const unsigned int kHighPriCounts[kTestCount] = {3, 2, 5};
-    const unsigned int kLowPriCounts[kTestCount] = {10, 15, 3};
-    for (int test = 0; test < kTestCount; ++test) {
-      // Change the number of threads in high / low priority pool.
-      env_->SetBackgroundThreads(kHighPriCounts[test], Env::HIGH);
-      env_->SetBackgroundThreads(kLowPriCounts[test], Env::LOW);
-      // Wait to ensure all threads have been registered
-      env_->SleepForMicroseconds(100000);
-      s = env_->GetThreadList(&thread_list);
-      ASSERT_OK(s);
-      unsigned int thread_type_counts[ThreadStatus::NUM_THREAD_TYPES];
-      memset(thread_type_counts, 0, sizeof(thread_type_counts));
-      for (auto thread : thread_list) {
-        ASSERT_LT(thread.thread_type, ThreadStatus::NUM_THREAD_TYPES);
-        thread_type_counts[thread.thread_type]++;
-      }
-      // Verify the total number of threads
-      ASSERT_EQ(
-          thread_type_counts[ThreadStatus::HIGH_PRIORITY] +
-              thread_type_counts[ThreadStatus::LOW_PRIORITY],
-          kHighPriCounts[test] + kLowPriCounts[test]);
-      // Verify the number of high-priority threads
-      ASSERT_EQ(
-          thread_type_counts[ThreadStatus::HIGH_PRIORITY],
-          kHighPriCounts[test]);
-      // Verify the number of low-priority threads
-      ASSERT_EQ(
-          thread_type_counts[ThreadStatus::LOW_PRIORITY],
-          kLowPriCounts[test]);
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Make rocksdb busy
+  int key = 0;
+  // check how many threads are doing compaction using GetThreadList
+  int operation_count[ThreadStatus::NUM_OP_TYPES] = {0};
+  for (int file = 0; file < 16 * kNumL0Files; ++file) {
+    for (int k = 0; k < kEntriesPerBuffer; ++k) {
+      ASSERT_OK(Put(ToString(key++), RandomString(&rnd, kTestValueSize)));
     }
-    if (i == 0) {
-      // repeat the test with multiple column families
-      CreateAndReopenWithCF({"pikachu", "about-to-remove"}, options);
-      env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap(
-          handles_, true);
+
+    Status s = env_->GetThreadList(&thread_list);
+    for (auto thread : thread_list) {
+      operation_count[thread.operation_type]++;
+    }
+
+    // Speed up the test
+    if (operation_count[ThreadStatus::OP_FLUSH] > 1 &&
+        operation_count[ThreadStatus::OP_COMPACTION] >
+            0.6 * options.max_background_compactions) {
+      break;
+    }
+    if (file == 15 * kNumL0Files) {
+      TEST_SYNC_POINT("DBTest::PreShutdownCompactionMiddle:VerifyCompaction");
     }
   }
-  db_->DropColumnFamily(handles_[2]);
-  delete handles_[2];
-  handles_.erase(handles_.begin() + 2);
-  env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap(
-      handles_, true);
-  Close();
-  env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap(
-      handles_, true);
+
+  ASSERT_GE(operation_count[ThreadStatus::OP_COMPACTION], 1);
+  CancelAllBackgroundWork(db_);
+  TEST_SYNC_POINT("DBTest::PreShutdownCompactionMiddle:Preshutdown");
+  TEST_SYNC_POINT("DBTest::PreShutdownCompactionMiddle:VerifyPreshutdown");
+  dbfull()->TEST_WaitForCompact();
+  // Reset the counters and record the operation counts after shutdown.
+  for (int i = 0; i < ThreadStatus::NUM_OP_TYPES; ++i) {
+    operation_count[i] = 0;
+  }
+  Status s = env_->GetThreadList(&thread_list);
+  for (auto thread : thread_list) {
+    operation_count[thread.operation_type]++;
+  }
+  ASSERT_EQ(operation_count[ThreadStatus::OP_COMPACTION], 0);
 }
 
-TEST_F(DBTest, DisableThreadStatus) {
-  Options options;
-  options.env = env_;
-  options.enable_thread_tracking = false;
-  TryReopen(options);
-  CreateAndReopenWithCF({"pikachu", "about-to-remove"}, options);
-  // Verify none of the column family info exists
-  env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap(
-      handles_, false);
+#endif  // ROCKSDB_USING_THREAD_STATUS
+
+TEST_F(DBTest, FlushOnDestroy) {
+  WriteOptions wo;
+  wo.disableWAL = true;
+  ASSERT_OK(Put("foo", "v1", wo));
+  CancelAllBackgroundWork(db_);
 }
 
-TEST_F(DBTest, ThreadStatusFlush) {
-  Options options;
-  options.env = env_;
-  options.write_buffer_size = 100000;  // Small write buffer
-  options.enable_thread_tracking = true;
-  options = CurrentOptions(options);
+namespace {
+class OnFileDeletionListener : public EventListener {
+ public:
+  OnFileDeletionListener() :
+      matched_count_(0),
+      expected_file_name_("") {}
 
-  rocksdb::SyncPoint::GetInstance()->LoadDependency({
-      {"FlushJob::FlushJob()", "DBTest::ThreadStatusFlush:1"},
-      {"DBTest::ThreadStatusFlush:2", "FlushJob::~FlushJob()"},
-  });
-  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+  void SetExpectedFileName(
+      const std::string& file_name) {
+    expected_file_name_ = file_name;
+  }
 
-  CreateAndReopenWithCF({"pikachu"}, options);
-  VerifyOperationCount(env_, ThreadStatus::OP_FLUSH, 0);
+  void VerifyMatchedCount(size_t expected_value) {
+    ASSERT_EQ(matched_count_, expected_value);
+  }
 
-  ASSERT_OK(Put(1, "foo", "v1"));
-  ASSERT_EQ("v1", Get(1, "foo"));
-  VerifyOperationCount(env_, ThreadStatus::OP_FLUSH, 0);
+  void OnTableFileDeleted(
+      const TableFileDeletionInfo& info) override {
+    if (expected_file_name_ != "") {
+      ASSERT_EQ(expected_file_name_, info.file_path);
+      expected_file_name_ = "";
+      matched_count_++;
+    }
+  }
 
-  Put(1, "k1", std::string(100000, 'x'));  // Fill memtable
-  VerifyOperationCount(env_, ThreadStatus::OP_FLUSH, 0);
-  Put(1, "k2", std::string(100000, 'y'));  // Trigger flush
-  // wait for flush to be scheduled
-  env_->SleepForMicroseconds(250000);
-  TEST_SYNC_POINT("DBTest::ThreadStatusFlush:1");
-  VerifyOperationCount(env_, ThreadStatus::OP_FLUSH, 1);
-  TEST_SYNC_POINT("DBTest::ThreadStatusFlush:2");
+ private:
+  size_t matched_count_;
+  std::string expected_file_name_;
+};
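+// A listener like the one above is wired in through Options::listeners;
+// RocksDB then invokes OnTableFileDeleted() for every table file it deletes.
+// A minimal wiring sketch, mirroring the test below:
+//
+//   Options options;
+//   auto* listener = new OnFileDeletionListener();
+//   options.listeners.emplace_back(listener);
+//   listener->SetExpectedFileName(dbname_ + file.name);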
 
-  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
-}
+}  // namespace
 
-TEST_F(DBTest, ThreadStatusSingleCompaction) {
-  const int kTestKeySize = 16;
-  const int kTestValueSize = 984;
-  const int kEntrySize = kTestKeySize + kTestValueSize;
-  const int kEntriesPerBuffer = 100;
+TEST_F(DBTest, DynamicLevelCompressionPerLevel) {
+  if (!Snappy_Supported()) {
+    return;
+  }
+  const int kNKeys = 120;
+  int keys[kNKeys];
+  for (int i = 0; i < kNKeys; i++) {
+    keys[i] = i;
+  }
+  std::random_shuffle(std::begin(keys), std::end(keys));
+
+  Random rnd(301);
   Options options;
   options.create_if_missing = true;
-  options.write_buffer_size = kEntrySize * kEntriesPerBuffer;
-  options.compaction_style = kCompactionStyleLevel;
-  options.target_file_size_base = options.write_buffer_size;
-  options.max_bytes_for_level_base = options.target_file_size_base * 2;
-  options.max_bytes_for_level_multiplier = 2;
-  options.compression = kNoCompression;
-  options = CurrentOptions(options);
-  options.env = env_;
-  options.enable_thread_tracking = true;
-  const int kNumL0Files = 4;
-  options.level0_file_num_compaction_trigger = kNumL0Files;
+  options.db_write_buffer_size = 20480;
+  options.write_buffer_size = 20480;
+  options.max_write_buffer_number = 2;
+  options.level0_file_num_compaction_trigger = 2;
+  options.level0_slowdown_writes_trigger = 2;
+  options.level0_stop_writes_trigger = 2;
+  options.target_file_size_base = 2048;
+  options.level_compaction_dynamic_level_bytes = true;
+  options.max_bytes_for_level_base = 102400;
+  options.max_bytes_for_level_multiplier = 4;
+  options.max_background_compactions = 1;
+  options.num_levels = 5;
 
-  rocksdb::SyncPoint::GetInstance()->LoadDependency({
-      {"DBTest::ThreadStatusSingleCompaction:0", "DBImpl::BGWorkCompaction"},
-      {"CompactionJob::Run():Start", "DBTest::ThreadStatusSingleCompaction:1"},
-      {"DBTest::ThreadStatusSingleCompaction:2", "CompactionJob::Run():End"},
-  });
-  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+  options.compression_per_level.resize(3);
+  options.compression_per_level[0] = kNoCompression;
+  options.compression_per_level[1] = kNoCompression;
+  options.compression_per_level[2] = kSnappyCompression;
 
-  for (int tests = 0; tests < 2; ++tests) {
-    DestroyAndReopen(options);
+  OnFileDeletionListener* listener = new OnFileDeletionListener();
+  options.listeners.emplace_back(listener);
 
-    Random rnd(301);
-    // The Put Phase.
-    for (int file = 0; file < kNumL0Files; ++file) {
-      for (int key = 0; key < kEntriesPerBuffer; ++key) {
-        ASSERT_OK(Put(ToString(key + file * kEntriesPerBuffer),
-                      RandomString(&rnd, kTestValueSize)));
-      }
-      Flush();
-    }
-    // This makes sure a compaction won't be scheduled until
-    // we have done with the above Put Phase.
-    TEST_SYNC_POINT("DBTest::ThreadStatusSingleCompaction:0");
-    ASSERT_GE(NumTableFilesAtLevel(0),
-              options.level0_file_num_compaction_trigger);
+  DestroyAndReopen(options);
 
-    // This makes sure at least one compaction is running.
-    TEST_SYNC_POINT("DBTest::ThreadStatusSingleCompaction:1");
+  // Insert more than 80K. L4 should be base level. Neither L0 nor L4 should
+  // be compressed, so total data size should be more than 80K.
+  for (int i = 0; i < 20; i++) {
+    ASSERT_OK(Put(Key(keys[i]), CompressibleString(&rnd, 4000)));
+  }
+  Flush();
+  dbfull()->TEST_WaitForCompact();
 
-    if (options.enable_thread_tracking) {
-      // expecting one single L0 to L1 compaction
-      VerifyOperationCount(env_, ThreadStatus::OP_COMPACTION, 1);
-    } else {
-      // If thread tracking is not enabled, compaction count should be 0.
-      VerifyOperationCount(env_, ThreadStatus::OP_COMPACTION, 0);
-    }
-    // TODO(yhchiang): adding assert to verify each compaction stage.
-    TEST_SYNC_POINT("DBTest::ThreadStatusSingleCompaction:2");
+  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(2), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(3), 0);
+  ASSERT_GT(SizeAtLevel(0) + SizeAtLevel(4), 20U * 4000U);
 
-    // repeat the test with disabling thread tracking.
-    options.enable_thread_tracking = false;
+  // Insert 400KB. Some data will be compressed
+  for (int i = 21; i < 120; i++) {
+    ASSERT_OK(Put(Key(keys[i]), CompressibleString(&rnd, 4000)));
   }
-  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+  Flush();
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(2), 0);
+  ASSERT_LT(SizeAtLevel(0) + SizeAtLevel(3) + SizeAtLevel(4), 120U * 4000U);
+  // Make sure data in the L3 files is not compressed: remove all files
+  // in L4, then count the number of remaining rows.
+  ASSERT_OK(dbfull()->SetOptions({
+      {"disable_auto_compactions", "true"},
+  }));
+  ColumnFamilyMetaData cf_meta;
+  db_->GetColumnFamilyMetaData(&cf_meta);
+  for (auto file : cf_meta.levels[4].files) {
+    listener->SetExpectedFileName(dbname_ + file.name);
+    ASSERT_OK(dbfull()->DeleteFile(file.name));
+  }
+  listener->VerifyMatchedCount(cf_meta.levels[4].files.size());
+
+  int num_keys = 0;
+  std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    num_keys++;
+  }
+  ASSERT_OK(iter->status());
+  ASSERT_GT(SizeAtLevel(0) + SizeAtLevel(3), num_keys * 4000U);
 }
 
-TEST_F(DBTest, PreShutdownManualCompaction) {
-  Options options = CurrentOptions();
-  options.max_background_flushes = 0;
-  CreateAndReopenWithCF({"pikachu"}, options);
-  ASSERT_EQ(dbfull()->MaxMemCompactionLevel(), 2)
-      << "Need to update this test to match kMaxMemCompactLevel";
+TEST_F(DBTest, DynamicLevelCompressionPerLevel2) {
+  if (!Snappy_Supported() || !LZ4_Supported() || !Zlib_Supported()) {
+    return;
+  }
+  const int kNKeys = 500;
+  int keys[kNKeys];
+  for (int i = 0; i < kNKeys; i++) {
+    keys[i] = i;
+  }
+  std::random_shuffle(std::begin(keys), std::end(keys));
 
-  // iter - 0 with 7 levels
-  // iter - 1 with 3 levels
-  for (int iter = 0; iter < 2; ++iter) {
-    MakeTables(3, "p", "q", 1);
-    ASSERT_EQ("1,1,1", FilesPerLevel(1));
+  Random rnd(301);
+  Options options;
+  options.create_if_missing = true;
+  options.db_write_buffer_size = 6000;
+  options.write_buffer_size = 6000;
+  options.max_write_buffer_number = 2;
+  options.level0_file_num_compaction_trigger = 2;
+  options.level0_slowdown_writes_trigger = 2;
+  options.level0_stop_writes_trigger = 2;
+  options.soft_rate_limit = 1.1;
 
-    // Compaction range falls before files
-    Compact(1, "", "c");
-    ASSERT_EQ("1,1,1", FilesPerLevel(1));
+  // Use file size to distinguish levels
+  // L1: 10, L2: 20, L3: 40, L4: 80
+  // L0 is less than 30
+  options.target_file_size_base = 10;
+  options.target_file_size_multiplier = 2;
 
-    // Compaction range falls after files
-    Compact(1, "r", "z");
-    ASSERT_EQ("1,1,1", FilesPerLevel(1));
+  options.level_compaction_dynamic_level_bytes = true;
+  options.max_bytes_for_level_base = 200;
+  options.max_bytes_for_level_multiplier = 8;
+  options.max_background_compactions = 1;
+  options.num_levels = 5;
+  std::shared_ptr<mock::MockTableFactory> mtf(new mock::MockTableFactory);
+  options.table_factory = mtf;
 
-    // Compaction range overlaps files
-    Compact(1, "p1", "p9");
-    ASSERT_EQ("0,0,1", FilesPerLevel(1));
+  options.compression_per_level.resize(3);
+  options.compression_per_level[0] = kNoCompression;
+  options.compression_per_level[1] = kLZ4Compression;
+  options.compression_per_level[2] = kZlibCompression;
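+  // With level_compaction_dynamic_level_bytes, the entries above apply
+  // relative to the current base level rather than to fixed level numbers:
+  // [0] covers L0, [1] the base level, [2] the level after it. This is what
+  // the assertions below rely on when the base level moves from L4 to L3.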
 
-    // Populate a different range
-    MakeTables(3, "c", "e", 1);
-    ASSERT_EQ("1,1,2", FilesPerLevel(1));
+  DestroyAndReopen(options);
+  // When base level is L4, L4 is LZ4.
+  std::atomic<int> num_zlib(0);
+  std::atomic<int> num_lz4(0);
+  std::atomic<int> num_no(0);
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+        Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+        if (compaction->output_level() == 4) {
+          ASSERT_TRUE(compaction->output_compression() == kLZ4Compression);
+          num_lz4.fetch_add(1);
+        }
+      });
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "FlushJob::WriteLevel0Table:output_compression", [&](void* arg) {
+        auto* compression = reinterpret_cast<CompressionType*>(arg);
+        ASSERT_TRUE(*compression == kNoCompression);
+        num_no.fetch_add(1);
+      });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
 
-    // Compact just the new range
-    Compact(1, "b", "f");
-    ASSERT_EQ("0,0,2", FilesPerLevel(1));
+  for (int i = 0; i < 100; i++) {
+    ASSERT_OK(Put(Key(keys[i]), RandomString(&rnd, 200)));
+  }
+  Flush();
+  dbfull()->TEST_WaitForCompact();
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+  rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks();
 
-    // Compact all
-    MakeTables(1, "a", "z", 1);
-    ASSERT_EQ("0,1,2", FilesPerLevel(1));
-    CancelAllBackgroundWork(db_);
-    db_->CompactRange(handles_[1], nullptr, nullptr);
-    ASSERT_EQ("0,1,2", FilesPerLevel(1));
+  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(2), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(3), 0);
+  ASSERT_GT(NumTableFilesAtLevel(4), 0);
+  ASSERT_GT(num_no.load(), 2);
+  ASSERT_GT(num_lz4.load(), 0);
+  int prev_num_files_l4 = NumTableFilesAtLevel(4);
 
-    if (iter == 0) {
-      options = CurrentOptions();
-      options.max_background_flushes = 0;
-      options.num_levels = 3;
-      options.create_if_missing = true;
-      DestroyAndReopen(options);
-      CreateAndReopenWithCF({"pikachu"}, options);
+  // After the base level turns from L4 to L3, L3 becomes LZ4 and L4 Zlib.
+  num_lz4.store(0);
+  num_no.store(0);
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+        Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+        if (compaction->output_level() == 4 && compaction->start_level() == 3) {
+          ASSERT_TRUE(compaction->output_compression() == kZlibCompression);
+          num_zlib.fetch_add(1);
+        } else {
+          ASSERT_TRUE(compaction->output_compression() == kLZ4Compression);
+          num_lz4.fetch_add(1);
+        }
+      });
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "FlushJob::WriteLevel0Table:output_compression", [&](void* arg) {
+        auto* compression = reinterpret_cast<CompressionType*>(arg);
+        ASSERT_TRUE(*compression == kNoCompression);
+        num_no.fetch_add(1);
+      });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  for (int i = 101; i < 500; i++) {
+    ASSERT_OK(Put(Key(keys[i]), RandomString(&rnd, 200)));
+    if (i % 100 == 99) {
+      Flush();
+      dbfull()->TEST_WaitForCompact();
     }
   }
-}
-
-TEST_F(DBTest, PreShutdownMultipleCompaction) {
-  const int kTestKeySize = 16;
-  const int kTestValueSize = 984;
-  const int kEntrySize = kTestKeySize + kTestValueSize;
-  const int kEntriesPerBuffer = 40;
-  const int kNumL0Files = 4;
 
-  const int kHighPriCount = 3;
-  const int kLowPriCount = 5;
-  env_->SetBackgroundThreads(kHighPriCount, Env::HIGH);
-  env_->SetBackgroundThreads(kLowPriCount, Env::LOW);
+  rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks();
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(2), 0);
+  ASSERT_GT(NumTableFilesAtLevel(3), 0);
+  ASSERT_GT(NumTableFilesAtLevel(4), prev_num_files_l4);
+  ASSERT_GT(num_no.load(), 2);
+  ASSERT_GT(num_lz4.load(), 0);
+  ASSERT_GT(num_zlib.load(), 0);
+}
 
+TEST_F(DBTest, DynamicCompactionOptions) {
+  // minimum write buffer size is enforced at 64KB
+  const uint64_t k32KB = 1 << 15;
+  const uint64_t k64KB = 1 << 16;
+  const uint64_t k128KB = 1 << 17;
+  const uint64_t k1MB = 1 << 20;
+  const uint64_t k4KB = 1 << 12;
   Options options;
+  options.env = env_;
   options.create_if_missing = true;
-  options.write_buffer_size = kEntrySize * kEntriesPerBuffer;
-  options.compaction_style = kCompactionStyleLevel;
-  options.target_file_size_base = options.write_buffer_size;
-  options.max_bytes_for_level_base =
-      options.target_file_size_base * kNumL0Files;
   options.compression = kNoCompression;
-  options = CurrentOptions(options);
-  options.env = env_;
-  options.enable_thread_tracking = true;
-  options.level0_file_num_compaction_trigger = kNumL0Files;
-  options.max_bytes_for_level_multiplier = 2;
-  options.max_background_compactions = kLowPriCount;
-  options.level0_stop_writes_trigger = 1 << 10;
-  options.level0_slowdown_writes_trigger = 1 << 10;
-
-  TryReopen(options);
-  Random rnd(301);
-
-  std::vector<ThreadStatus> thread_list;
-  // Delay both flush and compaction
-  rocksdb::SyncPoint::GetInstance()->LoadDependency(
-      {{"FlushJob::FlushJob()", "CompactionJob::Run():Start"},
-       {"CompactionJob::Run():Start",
-        "DBTest::PreShutdownMultipleCompaction:Preshutdown"},
-        {"CompactionJob::Run():Start",
-        "DBTest::PreShutdownMultipleCompaction:VerifyCompaction"},
-       {"DBTest::PreShutdownMultipleCompaction:Preshutdown",
-        "CompactionJob::Run():End"},
-       {"CompactionJob::Run():End",
-        "DBTest::PreShutdownMultipleCompaction:VerifyPreshutdown"}});
+  options.soft_rate_limit = 1.1;
+  options.write_buffer_size = k64KB;
+  options.arena_block_size = 4 * k4KB;
+  options.max_write_buffer_number = 2;
+  // Compaction related options
+  options.level0_file_num_compaction_trigger = 3;
+  options.level0_slowdown_writes_trigger = 4;
+  options.level0_stop_writes_trigger = 8;
+  options.max_grandparent_overlap_factor = 10;
+  options.expanded_compaction_factor = 25;
+  options.source_compaction_factor = 1;
+  options.target_file_size_base = k64KB;
+  options.target_file_size_multiplier = 1;
+  options.max_bytes_for_level_base = k128KB;
+  options.max_bytes_for_level_multiplier = 4;
 
-  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+  // Block flush thread and disable compaction thread
+  env_->SetBackgroundThreads(1, Env::LOW);
+  env_->SetBackgroundThreads(1, Env::HIGH);
+  DestroyAndReopen(options);
 
-  // Make rocksdb busy
-  int key = 0;
-  // check how many threads are doing compaction using GetThreadList
-  int operation_count[ThreadStatus::NUM_OP_TYPES] = {0};
-  for (int file = 0; file < 16 * kNumL0Files; ++file) {
-    for (int k = 0; k < kEntriesPerBuffer; ++k) {
-      ASSERT_OK(Put(ToString(key++), RandomString(&rnd, kTestValueSize)));
+  auto gen_l0_kb = [this](int start, int size, int stride) {
+    Random rnd(301);
+    for (int i = 0; i < size; i++) {
+      ASSERT_OK(Put(Key(start + stride * i), RandomString(&rnd, 1024)));
     }
+    dbfull()->TEST_WaitForFlushMemTable();
+  };
 
-    Status s = env_->GetThreadList(&thread_list);
-    for (auto thread : thread_list) {
-      operation_count[thread.operation_type]++;
-    }
+  // Write 3 files that have the same key range.
+  // Since level0_file_num_compaction_trigger is 3, compaction should be
+  // triggered. The compaction should result in one L1 file
+  gen_l0_kb(0, 64, 1);
+  ASSERT_EQ(NumTableFilesAtLevel(0), 1);
+  gen_l0_kb(0, 64, 1);
+  ASSERT_EQ(NumTableFilesAtLevel(0), 2);
+  gen_l0_kb(0, 64, 1);
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ("0,1", FilesPerLevel());
+  std::vector<LiveFileMetaData> metadata;
+  db_->GetLiveFilesMetaData(&metadata);
+  ASSERT_EQ(1U, metadata.size());
+  ASSERT_LE(metadata[0].size, k64KB + k4KB);
+  ASSERT_GE(metadata[0].size, k64KB - k4KB);
 
-    // Speed up the test
-    if (operation_count[ThreadStatus::OP_FLUSH] > 1 &&
-        operation_count[ThreadStatus::OP_COMPACTION] >
-            0.6 * options.max_background_compactions) {
-      break;
-    }
-    if (file == 15 * kNumL0Files) {
-      TEST_SYNC_POINT("DBTest::PreShutdownMultipleCompaction:Preshutdown");
-    }
-  }
+  // Test compaction trigger and target_file_size_base
+  // Reduce compaction trigger to 2, and reduce L1 file size to 32KB.
+  // Writing two 64KB L0 files should trigger a compaction. Since these
+  // 2 L0 files have the same key range, compaction merges them and should
+  // result in 2 32KB L1 files.
+  ASSERT_OK(dbfull()->SetOptions({
+    {"level0_file_num_compaction_trigger", "2"},
+    {"target_file_size_base", ToString(k32KB) }
+  }));
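+  // SetOptions() applies mutable options at runtime from a string map; the
+  // new target_file_size_base only affects files written afterwards, which
+  // is why fresh L0 files are generated below before re-checking file sizes.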
 
-  TEST_SYNC_POINT("DBTest::PreShutdownMultipleCompaction:Preshutdown");
-  ASSERT_GE(operation_count[ThreadStatus::OP_COMPACTION], 1);
-  CancelAllBackgroundWork(db_);
-  TEST_SYNC_POINT("DBTest::PreShutdownMultipleCompaction:VerifyPreshutdown");
+  gen_l0_kb(0, 64, 1);
+  ASSERT_EQ("1,1", FilesPerLevel());
+  gen_l0_kb(0, 64, 1);
   dbfull()->TEST_WaitForCompact();
-  // Record the number of compactions at a time.
-  for (int i = 0; i < ThreadStatus::NUM_OP_TYPES; ++i) {
-    operation_count[i] = 0;
-  }
-  Status s = env_->GetThreadList(&thread_list);
-  for (auto thread : thread_list) {
-    operation_count[thread.operation_type]++;
-  }
-  ASSERT_EQ(operation_count[ThreadStatus::OP_COMPACTION], 0);
-}
+  ASSERT_EQ("0,2", FilesPerLevel());
+  metadata.clear();
+  db_->GetLiveFilesMetaData(&metadata);
+  ASSERT_EQ(2U, metadata.size());
+  ASSERT_LE(metadata[0].size, k32KB + k4KB);
+  ASSERT_GE(metadata[0].size, k32KB - k4KB);
+  ASSERT_LE(metadata[1].size, k32KB + k4KB);
+  ASSERT_GE(metadata[1].size, k32KB - k4KB);
 
-TEST_F(DBTest, PreShutdownCompactionMiddle) {
-  const int kTestKeySize = 16;
-  const int kTestValueSize = 984;
-  const int kEntrySize = kTestKeySize + kTestValueSize;
-  const int kEntriesPerBuffer = 40;
-  const int kNumL0Files = 4;
+  // Test max_bytes_for_level_base
+  // Increase level base size to 1MB and write enough data that will
+  // fill L1 and L2. L1 size should be around 1MB while L2 size should be
+  // around 1MB x 4.
+  ASSERT_OK(dbfull()->SetOptions({
+    {"max_bytes_for_level_base", ToString(k1MB) }
+  }));
 
-  const int kHighPriCount = 3;
-  const int kLowPriCount = 5;
-  env_->SetBackgroundThreads(kHighPriCount, Env::HIGH);
-  env_->SetBackgroundThreads(kLowPriCount, Env::LOW);
+  // writing 96 x 64KB => 6 * 1024KB
+  // (L1 + L2) = (1 + 4) * 1024KB
+  for (int i = 0; i < 96; ++i) {
+    gen_l0_kb(i, 64, 96);
+  }
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_GT(SizeAtLevel(1), k1MB / 2);
+  ASSERT_LT(SizeAtLevel(1), k1MB + k1MB / 2);
 
-  Options options;
-  options.create_if_missing = true;
-  options.write_buffer_size = kEntrySize * kEntriesPerBuffer;
-  options.compaction_style = kCompactionStyleLevel;
-  options.target_file_size_base = options.write_buffer_size;
-  options.max_bytes_for_level_base =
-      options.target_file_size_base * kNumL0Files;
-  options.compression = kNoCompression;
-  options = CurrentOptions(options);
-  options.env = env_;
-  options.enable_thread_tracking = true;
-  options.level0_file_num_compaction_trigger = kNumL0Files;
-  options.max_bytes_for_level_multiplier = 2;
-  options.max_background_compactions = kLowPriCount;
-  options.level0_stop_writes_trigger = 1 << 10;
-  options.level0_slowdown_writes_trigger = 1 << 10;
+  // L2 size should be within (0.5x, 1.5x) of 4MB.
+  ASSERT_GT(SizeAtLevel(2), 2 * k1MB);
+  ASSERT_LT(SizeAtLevel(2), 6 * k1MB);
 
-  TryReopen(options);
-  Random rnd(301);
+  // Test max_bytes_for_level_multiplier and
+  // max_bytes_for_level_base. Now reduce both the multiplier and the level
+  // base. After filling enough data to fit in L1 - L3, we should see the L1
+  // size reduce to 128KB from the 1MB asserted previously. Same for L2.
+  ASSERT_OK(dbfull()->SetOptions({
+    {"max_bytes_for_level_multiplier", "2"},
+    {"max_bytes_for_level_base", ToString(k128KB) }
+  }));
 
-  std::vector<ThreadStatus> thread_list;
-  // Delay both flush and compaction
-  rocksdb::SyncPoint::GetInstance()->LoadDependency(
-      {{"DBTest::PreShutdownCompactionMiddle:Preshutdown",
-        "CompactionJob::Run():Inprogress"},
-        {"CompactionJob::Run():Start",
-        "DBTest::PreShutdownCompactionMiddle:VerifyCompaction"},
-       {"CompactionJob::Run():Inprogress", "CompactionJob::Run():End"},
-       {"CompactionJob::Run():End",
-        "DBTest::PreShutdownCompactionMiddle:VerifyPreshutdown"}});
+  // writing 20 x 64KB = 10 x 128KB
+  // (L1 + L2 + L3) = (1 + 2 + 4) * 128KB
+  for (int i = 0; i < 20; ++i) {
+    gen_l0_kb(i, 64, 32);
+  }
+  dbfull()->TEST_WaitForCompact();
+  uint64_t total_size =
+    SizeAtLevel(1) + SizeAtLevel(2) + SizeAtLevel(3);
+  ASSERT_TRUE(total_size < k128KB * 7 * 1.5);
 
-  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+  // Test level0_stop_writes_trigger.
+  // Clean up memtable and L0. Block compaction threads. If we continue to
+  // write and flush memtables, we should see puts stop after 8 memtable
+  // flushes since level0_stop_writes_trigger = 8.
+  dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+  // Block compaction
+  test::SleepingBackgroundTask sleeping_task_low;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
 
-  // Make rocksdb busy
-  int key = 0;
-  // check how many threads are doing compaction using GetThreadList
-  int operation_count[ThreadStatus::NUM_OP_TYPES] = {0};
-  for (int file = 0; file < 16 * kNumL0Files; ++file) {
-    for (int k = 0; k < kEntriesPerBuffer; ++k) {
-      ASSERT_OK(Put(ToString(key++), RandomString(&rnd, kTestValueSize)));
-    }
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::DelayWrite:Wait",
+      [&](void* arg) { sleeping_task_low.WakeUp(); });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
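+  // SetCallBack() attaches the lambda to the named sync point: whenever a
+  // thread passes "DBImpl::DelayWrite:Wait" (i.e. a write has stalled), the
+  // callback wakes the sleeping low-priority task, letting the loop below
+  // detect exactly when the write stop kicked in.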
 
-    Status s = env_->GetThreadList(&thread_list);
-    for (auto thread : thread_list) {
-      operation_count[thread.operation_type]++;
+  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+  int count = 0;
+  Random rnd(301);
+  WriteOptions wo;
+  while (count < 64) {
+    ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), wo));
+    if (sleeping_task_low.WokenUp()) {
+      break;
     }
+    dbfull()->TEST_FlushMemTable(true);
+    count++;
+  }
+  // Stop trigger = 8
+  ASSERT_EQ(count, 8);
+  // Unblock
+  sleeping_task_low.WaitUntilDone();
 
-    // Speed up the test
-    if (operation_count[ThreadStatus::OP_FLUSH] > 1 &&
-        operation_count[ThreadStatus::OP_COMPACTION] >
-            0.6 * options.max_background_compactions) {
+  // Now reduce level0_stop_writes_trigger to 6. Clean up memtables and L0.
+  // Block the compaction thread again. Perform puts and memtable flushes
+  // until we see the write stop after 6 memtable flushes.
+  ASSERT_OK(dbfull()->SetOptions({
+    {"level0_stop_writes_trigger", "6"}
+  }));
+  dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+
+  // Block compaction again
+  sleeping_task_low.Reset();
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+  count = 0;
+  while (count < 64) {
+    ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), wo));
+    if (sleeping_task_low.WokenUp()) {
       break;
     }
-    if (file == 15 * kNumL0Files) {
-      TEST_SYNC_POINT("DBTest::PreShutdownCompactionMiddle:VerifyCompaction");
-    }
+    dbfull()->TEST_FlushMemTable(true);
+    count++;
   }
+  ASSERT_EQ(count, 6);
+  // Unblock
+  sleeping_task_low.WaitUntilDone();
 
-  ASSERT_GE(operation_count[ThreadStatus::OP_COMPACTION], 1);
-  CancelAllBackgroundWork(db_);
-  TEST_SYNC_POINT("DBTest::PreShutdownCompactionMiddle:Preshutdown");
-  TEST_SYNC_POINT("DBTest::PreShutdownCompactionMiddle:VerifyPreshutdown");
-  dbfull()->TEST_WaitForCompact();
-  // Record the number of compactions at a time.
-  for (int i = 0; i < ThreadStatus::NUM_OP_TYPES; ++i) {
-    operation_count[i] = 0;
-  }
-  Status s = env_->GetThreadList(&thread_list);
-  for (auto thread : thread_list) {
-    operation_count[thread.operation_type]++;
+  // Test disable_auto_compactions
+  // The compaction thread is unblocked but auto compaction is disabled. Write
+  // 4 L0 files; that would normally trigger a compaction, but with auto
+  // compaction disabled TEST_WaitForCompact waits for nothing and the
+  // number of L0 files does not change after the call.
+  ASSERT_OK(dbfull()->SetOptions({
+    {"disable_auto_compactions", "true"}
+  }));
+  dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+
+  for (int i = 0; i < 4; ++i) {
+    ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024)));
+    // Wait for compaction so that put won't stop
+    dbfull()->TEST_FlushMemTable(true);
   }
-  ASSERT_EQ(operation_count[ThreadStatus::OP_COMPACTION], 0);
-}
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(NumTableFilesAtLevel(0), 4);
 
-#endif  // ROCKSDB_USING_THREAD_STATUS
+  // Enable auto compaction and perform the same test; the # of L0 files
+  // should be reduced after compaction.
+  ASSERT_OK(dbfull()->SetOptions({
+    {"disable_auto_compactions", "false"}
+  }));
+  dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
 
-TEST_F(DBTest, DynamicLevelMaxBytesBase) {
-  // Use InMemoryEnv, or it would be too slow.
-  unique_ptr<Env> env(new MockEnv(env_));
+  for (int i = 0; i < 4; ++i) {
+    ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024)));
+    // Wait for compaction so that put won't stop
+    dbfull()->TEST_FlushMemTable(true);
+  }
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_LT(NumTableFilesAtLevel(0), 4);
 
-  const int kNKeys = 1000;
-  int keys[kNKeys];
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
 
-  auto verify_func = [&]() {
-    for (int i = 0; i < kNKeys; i++) {
-      ASSERT_NE("NOT_FOUND", Get(Key(i)));
-      ASSERT_NE("NOT_FOUND", Get(Key(kNKeys * 2 + i)));
-      if (i < kNKeys / 10) {
-        ASSERT_EQ("NOT_FOUND", Get(Key(kNKeys + keys[i])));
-      } else {
-        ASSERT_NE("NOT_FOUND", Get(Key(kNKeys + keys[i])));
-      }
-    }
-  };
+TEST_F(DBTest, FileCreationRandomFailure) {
+  Options options;
+  options.env = env_;
+  options.create_if_missing = true;
+  options.write_buffer_size = 100000;  // Small write buffer
+  options.target_file_size_base = 200000;
+  options.max_bytes_for_level_base = 1000000;
+  options.max_bytes_for_level_multiplier = 2;
 
+  DestroyAndReopen(options);
   Random rnd(301);
-  for (int ordered_insert = 0; ordered_insert <= 1; ordered_insert++) {
-    for (int i = 0; i < kNKeys; i++) {
-      keys[i] = i;
-    }
-    if (ordered_insert == 0) {
-      std::random_shuffle(std::begin(keys), std::end(keys));
-    }
-    for (int max_background_compactions = 1; max_background_compactions < 4;
-         max_background_compactions += 2) {
-      Options options;
-      options.env = env.get();
-      options.create_if_missing = true;
-      options.db_write_buffer_size = 2048;
-      options.write_buffer_size = 2048;
-      options.max_write_buffer_number = 2;
-      options.level0_file_num_compaction_trigger = 2;
-      options.level0_slowdown_writes_trigger = 2;
-      options.level0_stop_writes_trigger = 2;
-      options.target_file_size_base = 2048;
-      options.level_compaction_dynamic_level_bytes = true;
-      options.max_bytes_for_level_base = 10240;
-      options.max_bytes_for_level_multiplier = 4;
-      options.hard_rate_limit = 1.1;
-      options.max_background_compactions = max_background_compactions;
-      options.num_levels = 5;
-
-      options.compression_per_level.resize(3);
-      options.compression_per_level[0] = kNoCompression;
-      options.compression_per_level[1] = kLZ4Compression;
-      options.compression_per_level[2] = kSnappyCompression;
-
-      DestroyAndReopen(options);
 
-      for (int i = 0; i < kNKeys; i++) {
-        int key = keys[i];
-        ASSERT_OK(Put(Key(kNKeys + key), RandomString(&rnd, 102)));
-        ASSERT_OK(Put(Key(key), RandomString(&rnd, 102)));
-        ASSERT_OK(Put(Key(kNKeys * 2 + key), RandomString(&rnd, 102)));
-        ASSERT_OK(Delete(Key(kNKeys + keys[i / 10])));
-        env_->SleepForMicroseconds(5000);
+  const int kCDTKeysPerBuffer = 4;
+  const int kTestSize = kCDTKeysPerBuffer * 4096;
+  const int kTotalIteration = 100;
+  // The second half of the test involves random failures
+  // of file creation.
+  const int kRandomFailureTest = kTotalIteration / 2;
+  std::vector<std::string> values;
+  for (int i = 0; i < kTestSize; ++i) {
+    values.push_back("NOT_FOUND");
+  }
+  for (int j = 0; j < kTotalIteration; ++j) {
+    if (j == kRandomFailureTest) {
+      env_->non_writeable_rate_.store(90);
+    }
+    for (int k = 0; k < kTestSize; ++k) {
+      // Here we expect some of the Puts to fail.
+      std::string value = RandomString(&rnd, 100);
+      Status s = Put(Key(k), Slice(value));
+      if (s.ok()) {
+        // update the latest successful put
+        values[k] = value;
       }
-
-      uint64_t int_prop;
-      ASSERT_TRUE(db_->GetIntProperty("rocksdb.background-errors", &int_prop));
-      ASSERT_EQ(0U, int_prop);
-
-      // Verify DB
-      for (int j = 0; j < 2; j++) {
-        verify_func();
-        if (j == 0) {
-          Reopen(options);
-        }
+      // But everything before the failure simulation begins should succeed.
+      if (j < kRandomFailureTest) {
+        ASSERT_OK(s);
       }
+    }
+  }
 
-      // Test compact range works
-      dbfull()->CompactRange(nullptr, nullptr);
-      // All data should be in the last level.
-      ColumnFamilyMetaData cf_meta;
-      db_->GetColumnFamilyMetaData(&cf_meta);
-      ASSERT_EQ(5U, cf_meta.levels.size());
-      for (int i = 0; i < 4; i++) {
-        ASSERT_EQ(0U, cf_meta.levels[i].files.size());
-      }
-      ASSERT_GT(cf_meta.levels[4U].files.size(), 0U);
-      verify_func();
+  // If rocksdb does not do the job correctly, an internal assert will fail here.
+  dbfull()->TEST_WaitForFlushMemTable();
+  dbfull()->TEST_WaitForCompact();
 
-      Close();
-    }
+  // verify we have the latest successful update
+  for (int k = 0; k < kTestSize; ++k) {
+    auto v = Get(Key(k));
+    ASSERT_EQ(v, values[k]);
   }
 
-  env_->SetBackgroundThreads(1, Env::LOW);
-  env_->SetBackgroundThreads(1, Env::HIGH);
+  // reopen and reverify we have the latest successful update
+  env_->non_writeable_rate_.store(0);
+  Reopen(options);
+  for (int k = 0; k < kTestSize; ++k) {
+    auto v = Get(Key(k));
+    ASSERT_EQ(v, values[k]);
+  }
 }
 
-// Test specific cases in dynamic max bytes
-TEST_F(DBTest, DynamicLevelMaxBytesBase2) {
-  Random rnd(301);
-  int kMaxKey = 1000000;
-
-  Options options = CurrentOptions();
+TEST_F(DBTest, DynamicMiscOptions) {
+  // Test max_sequential_skip_in_iterations
+  Options options;
+  options.env = env_;
   options.create_if_missing = true;
-  options.db_write_buffer_size = 2048;
-  options.write_buffer_size = 2048;
-  options.max_write_buffer_number = 2;
-  options.level0_file_num_compaction_trigger = 2;
-  options.level0_slowdown_writes_trigger = 9999;
-  options.level0_stop_writes_trigger = 9999;
-  options.target_file_size_base = 2048;
-  options.level_compaction_dynamic_level_bytes = true;
-  options.max_bytes_for_level_base = 10240;
-  options.max_bytes_for_level_multiplier = 4;
-  options.max_background_compactions = 2;
-  options.num_levels = 5;
-  options.expanded_compaction_factor = 0;  // Force not expanding in compactions
-  BlockBasedTableOptions table_options;
-  table_options.block_size = 1024;
-  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
-
+  options.max_sequential_skip_in_iterations = 16;
+  options.compression = kNoCompression;
+  options.statistics = rocksdb::CreateDBStatistics();
   DestroyAndReopen(options);
-  ASSERT_OK(dbfull()->SetOptions({
-      {"disable_auto_compactions", "true"},
-  }));
 
-  uint64_t int_prop;
-  std::string str_prop;
-
-  // Initial base level is the last level
-  ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
-  ASSERT_EQ(4U, int_prop);
+  auto assert_reseek_count = [this, &options](int key_start, int num_reseek) {
+    int key0 = key_start;
+    int key1 = key_start + 1;
+    int key2 = key_start + 2;
+    Random rnd(301);
+    ASSERT_OK(Put(Key(key0), RandomString(&rnd, 8)));
+    for (int i = 0; i < 10; ++i) {
+      ASSERT_OK(Put(Key(key1), RandomString(&rnd, 8)));
+    }
+    ASSERT_OK(Put(Key(key2), RandomString(&rnd, 8)));
+    std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+    iter->Seek(Key(key1));
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Key(key1)), 0);
+    iter->Next();
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key().compare(Key(key2)), 0);
+    ASSERT_EQ(num_reseek,
+              TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION));
+  };
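+  // max_sequential_skip_in_iterations bounds how many hidden entries (here,
+  // the 10 overwrites of key1) Next() will step over linearly before falling
+  // back to a Seek; NUMBER_OF_RESEEKS_IN_ITERATION counts those fallbacks.
+  // With the initial value of 16 the 10 skips stay linear; after lowering
+  // it to 4, one reseek fires.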
+  // No reseek
+  assert_reseek_count(100, 0);
 
-  // Put about 7K to L0
-  for (int i = 0; i < 70; i++) {
-    ASSERT_OK(Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))),
-                  RandomString(&rnd, 80)));
-  }
   ASSERT_OK(dbfull()->SetOptions({
-      {"disable_auto_compactions", "false"},
+    {"max_sequential_skip_in_iterations", "4"}
   }));
-  Flush();
-  dbfull()->TEST_WaitForCompact();
-  ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
-  ASSERT_EQ(4U, int_prop);
+  // Clear memtable and make new option effective
+  dbfull()->TEST_FlushMemTable(true);
+  // Trigger reseek
+  assert_reseek_count(200, 1);
 
-  // Insert extra about 3.5K to L0. After they are compacted to L4, base level
-  // should be changed to L3.
   ASSERT_OK(dbfull()->SetOptions({
-      {"disable_auto_compactions", "true"},
+    {"max_sequential_skip_in_iterations", "16"}
   }));
-  for (int i = 0; i < 70; i++) {
-    ASSERT_OK(Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))),
-                  RandomString(&rnd, 80)));
-  }
+  // Clear memtable and make new option effective
+  dbfull()->TEST_FlushMemTable(true);
+  // No new reseek; the cumulative ticker stays at 1 from the previous run
+  assert_reseek_count(300, 1);
+}
 
-  ASSERT_OK(dbfull()->SetOptions({
-      {"disable_auto_compactions", "false"},
-  }));
-  Flush();
-  dbfull()->TEST_WaitForCompact();
-  ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
-  ASSERT_EQ(3U, int_prop);
-  ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level1", &str_prop));
-  ASSERT_EQ("0", str_prop);
-  ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level2", &str_prop));
-  ASSERT_EQ("0", str_prop);
-
-  // Trigger parallel compaction, and the first one would change the base
-  // level.
-  // Hold compaction jobs to make sure
-  rocksdb::SyncPoint::GetInstance()->SetCallBack(
-      "CompactionJob::Run():Start",
-      [&](void* arg) { env_->SleepForMicroseconds(100000); });
-  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
-  ASSERT_OK(dbfull()->SetOptions({
-      {"disable_auto_compactions", "true"},
-  }));
-  // Write about 10K more
-  for (int i = 0; i < 100; i++) {
-    ASSERT_OK(Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))),
-                  RandomString(&rnd, 80)));
-  }
-  ASSERT_OK(dbfull()->SetOptions({
-      {"disable_auto_compactions", "false"},
-  }));
-  Flush();
-  // Wait for 200 milliseconds before proceeding compactions to make sure two
-  // parallel ones are executed.
-  env_->SleepForMicroseconds(200000);
-  dbfull()->TEST_WaitForCompact();
-  ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
-  ASSERT_EQ(3U, int_prop);
-  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+TEST_F(DBTest, DontDeletePendingOutputs) {
+  Options options;
+  options.env = env_;
+  options.create_if_missing = true;
+  DestroyAndReopen(options);
 
-  // Trigger a condition that the compaction changes base level and L0->Lbase
-  // happens at the same time.
-  // We try to make the last levels' targets 10K, 40K, 160K, and trigger
-  // another compaction from 40K->160K.
-  ASSERT_OK(dbfull()->SetOptions({
-      {"disable_auto_compactions", "true"},
-  }));
-  // Write about 150K more
-  for (int i = 0; i < 1350; i++) {
-    ASSERT_OK(Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))),
-                  RandomString(&rnd, 80)));
-  }
-  ASSERT_OK(dbfull()->SetOptions({
-      {"disable_auto_compactions", "false"},
-  }));
-  Flush();
-  dbfull()->TEST_WaitForCompact();
-  ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
-  ASSERT_EQ(2U, int_prop);
+  // Every time we write to a table file, call FindObsoleteFiles()/
+  // PurgeObsoleteFiles() (FOF/POF) with a full DB scan. This will make sure
+  // our pending_outputs_ protection works correctly.
+  std::function<void()> purge_obsolete_files_function = [&]() {
+    JobContext job_context(0);
+    dbfull()->TEST_LockMutex();
+    dbfull()->FindObsoleteFiles(&job_context, true /*force*/);
+    dbfull()->TEST_UnlockMutex();
+    dbfull()->PurgeObsoleteFiles(job_context);
+    job_context.Clean();
+  };
 
-  // Keep Writing data until base level changed 2->1. There will be L0->L2
-  // compaction going on at the same time.
-  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
-  for (int attempt = 0; attempt <= 20; attempt++) {
-    // Write about 5K more data with two flushes. It should be flush to level 2
-    // but when it is applied, base level is already 1.
-    for (int i = 0; i < 50; i++) {
-      ASSERT_OK(Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))),
-                    RandomString(&rnd, 80)));
-    }
-    Flush();
+  env_->table_write_callback_ = &purge_obsolete_files_function;
 
-    ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
-    if (int_prop == 2U) {
-      env_->SleepForMicroseconds(50000);
-    } else {
-      break;
-    }
+  for (int i = 0; i < 2; ++i) {
+    ASSERT_OK(Put("a", "begin"));
+    ASSERT_OK(Put("z", "end"));
+    ASSERT_OK(Flush());
   }
-  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
-  rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks();
-
-  env_->SleepForMicroseconds(200000);
 
-  ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
-  ASSERT_EQ(1U, int_prop);
+  // If the pending output guard does not work correctly, PurgeObsoleteFiles()
+  // will delete the file that Compaction is trying to create, causing this
+  // error:
+  // db/db_test.cc:975: IO error:
+  // /tmp/rocksdbtest-1552237650/db_test/000009.sst: No such file or directory
+  Compact("a", "b");
 }
 
-// Test specific cases in dynamic max bytes
-TEST_F(DBTest, DynamicLevelMaxBytesCompactRange) {
-  Random rnd(301);
-  int kMaxKey = 1000000;
-
+TEST_F(DBTest, DontDeleteMovedFile) {
+  // This test triggers move compaction and verifies that the file is not
+  // deleted when it's part of move compaction
   Options options = CurrentOptions();
+  options.env = env_;
   options.create_if_missing = true;
-  options.db_write_buffer_size = 2048;
-  options.write_buffer_size = 2048;
-  options.max_write_buffer_number = 2;
-  options.level0_file_num_compaction_trigger = 2;
-  options.level0_slowdown_writes_trigger = 9999;
-  options.level0_stop_writes_trigger = 9999;
-  options.target_file_size_base = 2;
-  options.level_compaction_dynamic_level_bytes = true;
-  options.max_bytes_for_level_base = 10240;
-  options.max_bytes_for_level_multiplier = 4;
-  options.max_background_compactions = 1;
-  const int kNumLevels = 5;
-  options.num_levels = kNumLevels;
-  options.expanded_compaction_factor = 0;  // Force not expanding in compactions
-  BlockBasedTableOptions table_options;
-  table_options.block_size = 1024;
-  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
-
+  options.max_bytes_for_level_base = 1024 * 1024;  // 1 MB
+  options.level0_file_num_compaction_trigger =
+      2;  // trigger compaction when we have 2 files
   DestroyAndReopen(options);
 
-  // Compact against empty DB
-  dbfull()->CompactRange(nullptr, nullptr);
-
-  uint64_t int_prop;
-  std::string str_prop;
-
-  // Initial base level is the last level
-  ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
-  ASSERT_EQ(4U, int_prop);
-
-  // Put about 7K to L0
-  for (int i = 0; i < 140; i++) {
-    ASSERT_OK(Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))),
-                  RandomString(&rnd, 80)));
+  Random rnd(301);
+  // Create two 1MB sst files
+  for (int i = 0; i < 2; ++i) {
+    // Create 1MB sst file
+    for (int j = 0; j < 100; ++j) {
+      ASSERT_OK(Put(Key(i * 50 + j), RandomString(&rnd, 10 * 1024)));
+    }
+    ASSERT_OK(Flush());
   }
-  Flush();
+  // this should execute both L0->L1 and L1->(move)->L2 compactions
   dbfull()->TEST_WaitForCompact();
-  if (NumTableFilesAtLevel(0) == 0) {
-    // Make sure level 0 is not empty
-    ASSERT_OK(Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))),
-                  RandomString(&rnd, 80)));
-    Flush();
-  }
-
-  ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
-  ASSERT_EQ(3U, int_prop);
-  ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level1", &str_prop));
-  ASSERT_EQ("0", str_prop);
-  ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level2", &str_prop));
-  ASSERT_EQ("0", str_prop);
-
-  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
-  rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks();
-
-  std::set<int> output_levels;
-  rocksdb::SyncPoint::GetInstance()->SetCallBack(
-      "CompactionPicker::CompactRange:Return", [&](void* arg) {
-        Compaction* compaction = reinterpret_cast<Compaction*>(arg);
-        output_levels.insert(compaction->output_level());
-      });
-  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_EQ("0,0,1", FilesPerLevel(0));
 
-  dbfull()->CompactRange(nullptr, nullptr);
-  ASSERT_EQ(output_levels.size(), 2);
-  ASSERT_TRUE(output_levels.find(3) != output_levels.end());
-  ASSERT_TRUE(output_levels.find(4) != output_levels.end());
-  ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level0", &str_prop));
-  ASSERT_EQ("0", str_prop);
-  ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level3", &str_prop));
-  ASSERT_EQ("0", str_prop);
-  // Base level is still level 3.
-  ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
-  ASSERT_EQ(3U, int_prop);
+  // If the moved file is actually deleted (the move-safeguard in
+  // ~Version::Version() is not there), we get this failure:
+  // Corruption: Can't access /000009.sst
+  Reopen(options);
 }
 
-TEST_F(DBTest, DynamicLevelMaxBytesBaseInc) {
+TEST_F(DBTest, OptimizeFiltersForHits) {
   Options options = CurrentOptions();
-  options.create_if_missing = true;
-  options.db_write_buffer_size = 2048;
-  options.write_buffer_size = 2048;
-  options.max_write_buffer_number = 2;
+  options.write_buffer_size = 64 * 1024;
+  options.arena_block_size = 4 * 1024;
+  options.target_file_size_base = 64 * 1024;
   options.level0_file_num_compaction_trigger = 2;
   options.level0_slowdown_writes_trigger = 2;
-  options.level0_stop_writes_trigger = 2;
-  options.target_file_size_base = 2048;
+  options.level0_stop_writes_trigger = 4;
+  options.max_bytes_for_level_base = 256 * 1024;
+  options.max_write_buffer_number = 2;
+  options.max_background_compactions = 8;
+  options.max_background_flushes = 8;
+  options.compression = kNoCompression;
+  options.compaction_style = kCompactionStyleLevel;
   options.level_compaction_dynamic_level_bytes = true;
-  options.max_bytes_for_level_base = 10240;
-  options.max_bytes_for_level_multiplier = 4;
-  options.hard_rate_limit = 1.1;
-  options.max_background_compactions = 2;
-  options.num_levels = 5;
+  BlockBasedTableOptions bbto;
+  bbto.filter_policy.reset(NewBloomFilterPolicy(10, true));
+  bbto.whole_key_filtering = true;
+  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+  options.optimize_filters_for_hits = true;
+  options.statistics = rocksdb::CreateDBStatistics();
+  CreateAndReopenWithCF({"mypikachu"}, options);
 
-  DestroyAndReopen(options);
+  int numkeys = 200000;
 
-  int non_trivial = 0;
-  rocksdb::SyncPoint::GetInstance()->SetCallBack(
-      "DBImpl::BackgroundCompaction:NonTrivial",
-      [&](void* arg) { non_trivial++; });
-  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+  // Generate randomly shuffled keys, so the updates are almost
+  // random.
+  std::vector<int> keys;
+  keys.reserve(numkeys);
+  for (int i = 0; i < numkeys; i += 2) {
+    keys.push_back(i);
+  }
+  std::random_shuffle(std::begin(keys), std::end(keys));
 
-  Random rnd(301);
-  const int total_keys = 3000;
-  const int random_part_size = 100;
-  for (int i = 0; i < total_keys; i++) {
-    std::string value = RandomString(&rnd, random_part_size);
-    PutFixed32(&value, static_cast<uint32_t>(i));
-    ASSERT_OK(Put(Key(i), value));
+  int num_inserted = 0;
+  for (int key : keys) {
+    ASSERT_OK(Put(1, Key(key), "val"));
+    if (++num_inserted % 1000 == 0) {
+      dbfull()->TEST_WaitForFlushMemTable();
+      dbfull()->TEST_WaitForCompact();
+    }
   }
-  Flush();
+  ASSERT_OK(Put(1, Key(0), "val"));
+  ASSERT_OK(Put(1, Key(numkeys), "val"));
+  ASSERT_OK(Flush(1));
   dbfull()->TEST_WaitForCompact();
-  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
 
-  ASSERT_EQ(non_trivial, 0);
+  if (NumTableFilesAtLevel(0, 1) == 0) {
+    // No Level 0 file. Create one.
+    ASSERT_OK(Put(1, Key(0), "val"));
+    ASSERT_OK(Put(1, Key(numkeys), "val"));
+    ASSERT_OK(Flush(1));
+    dbfull()->TEST_WaitForCompact();
+  }
 
-  for (int i = 0; i < total_keys; i++) {
-    std::string value = Get(Key(i));
-    ASSERT_EQ(DecodeFixed32(value.c_str() + random_part_size),
-              static_cast<uint32_t>(i));
+  for (int i = 1; i < numkeys; i += 2) {
+    ASSERT_EQ(Get(1, Key(i)), "NOT_FOUND");
   }
 
-  env_->SetBackgroundThreads(1, Env::LOW);
-  env_->SetBackgroundThreads(1, Env::HIGH);
-}
+  ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L0));
+  ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L1));
+  ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L2_AND_UP));
 
+  // Now we have three sorted runs, L0, L5 and L6, with most files in L6
+  // having no bloom filter. Most keys will be checked against bloom filters
+  // twice.
+  ASSERT_GT(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 65000 * 2);
+  ASSERT_LT(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 120000 * 2);
 
-TEST_F(DBTest, DynamicLevelCompressionPerLevel) {
-  if (!Snappy_Supported()) {
-    return;
-  }
-  const int kNKeys = 120;
-  int keys[kNKeys];
-  for (int i = 0; i < kNKeys; i++) {
-    keys[i] = i;
+  for (int i = 0; i < numkeys; i += 2) {
+    ASSERT_EQ(Get(1, Key(i)), "val");
   }
-  std::random_shuffle(std::begin(keys), std::end(keys));
+}
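
OptimizeFiltersForHits exercises an option that skips building bloom filters
for files on the bottommost level, trading cheaper filter memory for extra
reads on misses there. A configuration sketch using only the calls that appear
in the test, assuming this tree's block-based table headers:

    #include "rocksdb/filter_policy.h"
    #include "rocksdb/options.h"
    #include "rocksdb/table.h"

    rocksdb::Options MakeHitOptimizedOptions() {
      rocksdb::Options options;
      rocksdb::BlockBasedTableOptions bbto;
      // 10 bits per key, block-based filter builder, as the test installs
      bbto.filter_policy.reset(rocksdb::NewBloomFilterPolicy(10, true));
      options.table_factory.reset(rocksdb::NewBlockBasedTableFactory(bbto));
      // Skip filters for files on the last level, where most lookups hit
      options.optimize_filters_for_hits = true;
      return options;
    }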
 
-  Random rnd(301);
-  Options options;
-  options.create_if_missing = true;
-  options.db_write_buffer_size = 20480;
-  options.write_buffer_size = 20480;
-  options.max_write_buffer_number = 2;
+TEST_F(DBTest, L0L1L2AndUpHitCounter) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 32 * 1024;
+  options.target_file_size_base = 32 * 1024;
   options.level0_file_num_compaction_trigger = 2;
   options.level0_slowdown_writes_trigger = 2;
-  options.level0_stop_writes_trigger = 2;
-  options.target_file_size_base = 2048;
-  options.level_compaction_dynamic_level_bytes = true;
-  options.max_bytes_for_level_base = 102400;
-  options.max_bytes_for_level_multiplier = 4;
-  options.max_background_compactions = 1;
-  options.num_levels = 5;
-
-  options.compression_per_level.resize(3);
-  options.compression_per_level[0] = kNoCompression;
-  options.compression_per_level[1] = kNoCompression;
-  options.compression_per_level[2] = kSnappyCompression;
-
-  DestroyAndReopen(options);
+  options.level0_stop_writes_trigger = 4;
+  options.max_bytes_for_level_base = 64 * 1024;
+  options.max_write_buffer_number = 2;
+  options.max_background_compactions = 8;
+  options.max_background_flushes = 8;
+  options.statistics = rocksdb::CreateDBStatistics();
+  CreateAndReopenWithCF({"mypikachu"}, options);
 
-  // Insert more than 80K. L4 should be base level. Neither L0 nor L4 should
-  // be compressed, so total data size should be more than 80K.
-  for (int i = 0; i < 20; i++) {
-    ASSERT_OK(Put(Key(keys[i]), CompressibleString(&rnd, 4000)));
+  int numkeys = 20000;
+  for (int i = 0; i < numkeys; i++) {
+    ASSERT_OK(Put(1, Key(i), "val"));
   }
-  Flush();
-  dbfull()->TEST_WaitForCompact();
-
-  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
-  ASSERT_EQ(NumTableFilesAtLevel(2), 0);
-  ASSERT_EQ(NumTableFilesAtLevel(3), 0);
-  ASSERT_GT(SizeAtLevel(0) + SizeAtLevel(4), 20U * 4000U);
+  ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L0));
+  ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L1));
+  ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L2_AND_UP));
 
-  // Insert 400KB. Some data will be compressed
-  for (int i = 21; i < 120; i++) {
-    ASSERT_OK(Put(Key(keys[i]), CompressibleString(&rnd, 4000)));
-  }
-  Flush();
+  ASSERT_OK(Flush(1));
   dbfull()->TEST_WaitForCompact();
-  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
-  ASSERT_EQ(NumTableFilesAtLevel(2), 0);
-  ASSERT_LT(SizeAtLevel(0) + SizeAtLevel(3) + SizeAtLevel(4), 120U * 4000U);
-  // Make sure data in files in L3 is not compacted by removing all files
-  // in L4 and calculate number of rows
-  ASSERT_OK(dbfull()->SetOptions({
-      {"disable_auto_compactions", "true"},
-  }));
-  ColumnFamilyMetaData cf_meta;
-  db_->GetColumnFamilyMetaData(&cf_meta);
-  for (auto file : cf_meta.levels[4].files) {
-    ASSERT_OK(dbfull()->DeleteFile(file.name));
-  }
-  int num_keys = 0;
-  std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
-  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
-    num_keys++;
-  }
-  ASSERT_OK(iter->status());
-  ASSERT_GT(SizeAtLevel(0) + SizeAtLevel(3), num_keys * 4000U);
-}
 
-TEST_F(DBTest, DynamicLevelCompressionPerLevel2) {
-  const int kNKeys = 500;
-  int keys[kNKeys];
-  for (int i = 0; i < kNKeys; i++) {
-    keys[i] = i;
+  for (int i = 0; i < numkeys; i++) {
+    ASSERT_EQ(Get(1, Key(i)), "val");
   }
-  std::random_shuffle(std::begin(keys), std::end(keys));
-
-  Random rnd(301);
-  Options options;
-  options.create_if_missing = true;
-  options.db_write_buffer_size = 6000;
-  options.write_buffer_size = 6000;
-  options.max_write_buffer_number = 2;
-  options.level0_file_num_compaction_trigger = 2;
-  options.level0_slowdown_writes_trigger = 2;
-  options.level0_stop_writes_trigger = 2;
-  options.hard_rate_limit = 1.1;
-
-  // Use file size to distinguish levels
-  // L1: 10, L2: 20, L3 40, L4 80
-  // L0 is less than 30
-  options.target_file_size_base = 10;
-  options.target_file_size_multiplier = 2;
-
-  options.level_compaction_dynamic_level_bytes = true;
-  options.max_bytes_for_level_base = 200;
-  options.max_bytes_for_level_multiplier = 8;
-  options.max_background_compactions = 1;
-  options.num_levels = 5;
-  std::shared_ptr<mock::MockTableFactory> mtf(new mock::MockTableFactory);
-  options.table_factory = mtf;
 
-  options.compression_per_level.resize(3);
-  options.compression_per_level[0] = kNoCompression;
-  options.compression_per_level[1] = kLZ4Compression;
-  options.compression_per_level[2] = kZlibCompression;
+  ASSERT_GT(TestGetTickerCount(options, GET_HIT_L0), 100);
+  ASSERT_GT(TestGetTickerCount(options, GET_HIT_L1), 100);
+  ASSERT_GT(TestGetTickerCount(options, GET_HIT_L2_AND_UP), 100);
 
-  DestroyAndReopen(options);
-  // When base level is L4, L4 is LZ4.
-  std::atomic<int> num_zlib(0);
-  std::atomic<int> num_lz4(0);
-  std::atomic<int> num_no(0);
-  rocksdb::SyncPoint::GetInstance()->SetCallBack(
-      "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
-        Compaction* compaction = reinterpret_cast<Compaction*>(arg);
-        if (compaction->output_level() == 4) {
-          ASSERT_TRUE(compaction->OutputCompressionType() == kLZ4Compression);
-          num_lz4.fetch_add(1);
-        }
-      });
-  rocksdb::SyncPoint::GetInstance()->SetCallBack(
-      "FlushJob::WriteLevel0Table:output_compression", [&](void* arg) {
-        auto* compression = reinterpret_cast<CompressionType*>(arg);
-        ASSERT_TRUE(*compression == kNoCompression);
-        num_no.fetch_add(1);
-      });
-  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_EQ(numkeys, TestGetTickerCount(options, GET_HIT_L0) +
+                         TestGetTickerCount(options, GET_HIT_L1) +
+                         TestGetTickerCount(options, GET_HIT_L2_AND_UP));
+}
 
-  for (int i = 0; i < 100; i++) {
-    ASSERT_OK(Put(Key(keys[i]), RandomString(&rnd, 200)));
-  }
-  Flush();
-  dbfull()->TEST_WaitForCompact();
-  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
-  rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks();
+TEST_F(DBTest, EncodeDecompressedBlockSizeTest) {
+  // iter 0 -- zlib
+  // iter 1 -- bzip2
+  // iter 2 -- lz4
+  // iter 3 -- lz4HC
+  CompressionType compressions[] = {kZlibCompression, kBZip2Compression,
+                                    kLZ4Compression,  kLZ4HCCompression};
+  for (int iter = 0; iter < 4; ++iter) {
+    if (!CompressionTypeSupported(compressions[iter])) {
+      continue;
+    }
+    // first_table_version 1 -- generate with table_version == 1, read with
+    // table_version == 2
+    // first_table_version 2 -- generate with table_version == 2, read with
+    // table_version == 1
+    for (int first_table_version = 1; first_table_version <= 2;
+         ++first_table_version) {
+      BlockBasedTableOptions table_options;
+      table_options.format_version = first_table_version;
+      table_options.filter_policy.reset(NewBloomFilterPolicy(10));
+      Options options = CurrentOptions();
+      options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+      options.create_if_missing = true;
+      options.compression = compressions[iter];
+      DestroyAndReopen(options);
 
-  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
-  ASSERT_EQ(NumTableFilesAtLevel(2), 0);
-  ASSERT_EQ(NumTableFilesAtLevel(3), 0);
-  ASSERT_GT(NumTableFilesAtLevel(4), 0);
-  ASSERT_GT(num_no.load(), 2);
-  ASSERT_GT(num_lz4.load(), 0);
-  int prev_num_files_l4 = NumTableFilesAtLevel(4);
+      int kNumKeysWritten = 100000;
 
-  // After base level turn L4->L3, L3 becomes LZ4 and L4 becomes Zlib
-  num_lz4.store(0);
-  num_no.store(0);
-  rocksdb::SyncPoint::GetInstance()->SetCallBack(
-      "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
-        Compaction* compaction = reinterpret_cast<Compaction*>(arg);
-        if (compaction->output_level() == 4 && compaction->start_level() == 3) {
-          ASSERT_TRUE(compaction->OutputCompressionType() == kZlibCompression);
-          num_zlib.fetch_add(1);
-        } else {
-          ASSERT_TRUE(compaction->OutputCompressionType() == kLZ4Compression);
-          num_lz4.fetch_add(1);
-        }
-      });
-  rocksdb::SyncPoint::GetInstance()->SetCallBack(
-      "FlushJob::WriteLevel0Table:output_compression", [&](void* arg) {
-        auto* compression = reinterpret_cast<CompressionType*>(arg);
-        ASSERT_TRUE(*compression == kNoCompression);
-        num_no.fetch_add(1);
-      });
-  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+      Random rnd(301);
+      for (int i = 0; i < kNumKeysWritten; ++i) {
+        // compressible string
+        ASSERT_OK(Put(Key(i), RandomString(&rnd, 128) + std::string(128, 'a')));
+      }
 
-  for (int i = 101; i < 500; i++) {
-    ASSERT_OK(Put(Key(keys[i]), RandomString(&rnd, 200)));
-    if (i % 100 == 99) {
-      Flush();
-      dbfull()->TEST_WaitForCompact();
+      table_options.format_version = first_table_version == 1 ? 2 : 1;
+      options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+      Reopen(options);
+      for (int i = 0; i < kNumKeysWritten; ++i) {
+        auto r = Get(Key(i));
+        ASSERT_EQ(r.substr(128), std::string(128, 'a'));
+      }
     }
   }
+}
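
The cross-version loop works because BlockBasedTableOptions::format_version
selects the on-disk block encoding (per the test name, version 2 apparently
changed how compressed zlib/bzip2/lz4 blocks record the decompressed size) and
readers honor the version stored in each file rather than the configured one.
A sketch of pinning the format, assuming the same table options API:

    #include <cstdint>
    #include "rocksdb/options.h"
    #include "rocksdb/table.h"

    rocksdb::Options PinFormatVersion(uint32_t version) {
      rocksdb::BlockBasedTableOptions table_options;
      table_options.format_version = version;  // 1 or 2, as in the test
      rocksdb::Options options;
      options.table_factory.reset(
          rocksdb::NewBlockBasedTableFactory(table_options));
      return options;
    }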
 
-  rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks();
-  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
-  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
-  ASSERT_EQ(NumTableFilesAtLevel(2), 0);
-  ASSERT_GT(NumTableFilesAtLevel(3), 0);
-  ASSERT_GT(NumTableFilesAtLevel(4), prev_num_files_l4);
-  ASSERT_GT(num_no.load(), 2);
-  ASSERT_GT(num_lz4.load(), 0);
-  ASSERT_GT(num_zlib.load(), 0);
+TEST_F(DBTest, MutexWaitStats) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.statistics = rocksdb::CreateDBStatistics();
+  CreateAndReopenWithCF({"pikachu"}, options);
+  const int64_t kMutexWaitDelay = 100;
+  ThreadStatusUtil::TEST_SetStateDelay(
+      ThreadStatus::STATE_MUTEX_WAIT, kMutexWaitDelay);
+  ASSERT_OK(Put("hello", "rocksdb"));
+  ASSERT_GE(TestGetTickerCount(
+            options, DB_MUTEX_WAIT_MICROS), kMutexWaitDelay);
+  ThreadStatusUtil::TEST_SetStateDelay(
+      ThreadStatus::STATE_MUTEX_WAIT, 0);
 }
 
-TEST_F(DBTest, DynamicCompactionOptions) {
-  // minimum write buffer size is enforced at 64KB
-  const uint64_t k32KB = 1 << 15;
-  const uint64_t k64KB = 1 << 16;
-  const uint64_t k128KB = 1 << 17;
-  const uint64_t k1MB = 1 << 20;
-  const uint64_t k4KB = 1 << 12;
-  Options options;
+// This reproduces a bug where we don't delete a file because when it was
+// supposed to be deleted, it was blocked by pending_outputs
+// Consider:
+// 1. current file_number is 13
+// 2. compaction (1) starts, blocks deletion of all files starting with 13
+// (pending outputs)
+// 3. file 13 is created by compaction (2)
+// 4. file 13 is consumed by compaction (3) and file 15 is created. Since file
+// 13 has no references, it is put into VersionSet::obsolete_files_
+// 5. FindObsoleteFiles() gets file 13 from VersionSet::obsolete_files_. File 13
+// is removed from the obsolete_files_ set.
+// 6. PurgeObsoleteFiles() tries to delete file 13, but this file is blocked by
+// pending outputs since compaction (1) is still running. It is not deleted and
+// it is not present in obsolete_files_ anymore. Therefore, we never delete it.
+TEST_F(DBTest, DeleteObsoleteFilesPendingOutputs) {
+  Options options = CurrentOptions();
   options.env = env_;
-  options.create_if_missing = true;
-  options.compression = kNoCompression;
-  options.hard_rate_limit = 1.1;
-  options.write_buffer_size = k64KB;
-  options.max_write_buffer_number = 2;
-  // Compaction related options
-  options.level0_file_num_compaction_trigger = 3;
-  options.level0_slowdown_writes_trigger = 4;
-  options.level0_stop_writes_trigger = 8;
-  options.max_grandparent_overlap_factor = 10;
-  options.expanded_compaction_factor = 25;
-  options.source_compaction_factor = 1;
-  options.target_file_size_base = k64KB;
-  options.target_file_size_multiplier = 1;
-  options.max_bytes_for_level_base = k128KB;
-  options.max_bytes_for_level_multiplier = 4;
+  options.write_buffer_size = 2 * 1024 * 1024;     // 2 MB
+  options.max_bytes_for_level_base = 1024 * 1024;  // 1 MB
+  options.level0_file_num_compaction_trigger =
+      2;  // trigger compaction when we have 2 files
+  options.max_background_flushes = 2;
+  options.max_background_compactions = 2;
 
-  // Block flush thread and disable compaction thread
-  env_->SetBackgroundThreads(1, Env::LOW);
-  env_->SetBackgroundThreads(1, Env::HIGH);
-  DestroyAndReopen(options);
+  OnFileDeletionListener* listener = new OnFileDeletionListener();
+  options.listeners.emplace_back(listener);
 
-  auto gen_l0_kb = [this](int start, int size, int stride) {
-    Random rnd(301);
-    for (int i = 0; i < size; i++) {
-      ASSERT_OK(Put(Key(start + stride * i), RandomString(&rnd, 1024)));
+  Reopen(options);
+
+  Random rnd(301);
+  // Create two 1MB sst files
+  for (int i = 0; i < 2; ++i) {
+    // Create 1MB sst file
+    for (int j = 0; j < 100; ++j) {
+      ASSERT_OK(Put(Key(i * 50 + j), RandomString(&rnd, 10 * 1024)));
+    }
+    ASSERT_OK(Flush());
+  }
+  // this should execute both L0->L1 and L1->(move)->L2 compactions
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ("0,0,1", FilesPerLevel(0));
+
+  test::SleepingBackgroundTask blocking_thread;
+  port::Mutex mutex_;
+  bool already_blocked(false);
+
+  // block the flush
+  std::function<void()> block_first_time = [&]() {
+    bool blocking = false;
+    {
+      MutexLock l(&mutex_);
+      if (!already_blocked) {
+        blocking = true;
+        already_blocked = true;
+      }
+    }
+    if (blocking) {
+      blocking_thread.DoSleep();
     }
-    dbfull()->TEST_WaitForFlushMemTable();
   };
+  env_->table_write_callback_ = &block_first_time;
+  // Create 1MB sst file
+  for (int j = 0; j < 256; ++j) {
+    ASSERT_OK(Put(Key(j), RandomString(&rnd, 10 * 1024)));
+  }
+  // this should trigger a flush, which is blocked with block_first_time;
+  // the pending-outputs guard is protecting all the files created after it
 
-  // Write 3 files that have the same key range.
-  // Since level0_file_num_compaction_trigger is 3, compaction should be
-  // triggered. The compaction should result in one L1 file
-  gen_l0_kb(0, 64, 1);
-  ASSERT_EQ(NumTableFilesAtLevel(0), 1);
-  gen_l0_kb(0, 64, 1);
-  ASSERT_EQ(NumTableFilesAtLevel(0), 2);
-  gen_l0_kb(0, 64, 1);
-  dbfull()->TEST_WaitForCompact();
-  ASSERT_EQ("0,1", FilesPerLevel());
+  ASSERT_OK(dbfull()->TEST_CompactRange(2, nullptr, nullptr));
+
+  ASSERT_EQ("0,0,0,1", FilesPerLevel(0));
   std::vector<LiveFileMetaData> metadata;
   db_->GetLiveFilesMetaData(&metadata);
-  ASSERT_EQ(1U, metadata.size());
-  ASSERT_LE(metadata[0].size, k64KB + k4KB);
-  ASSERT_GE(metadata[0].size, k64KB - k4KB);
+  ASSERT_EQ(metadata.size(), 1U);
+  auto file_on_L2 = metadata[0].name;
+  listener->SetExpectedFileName(dbname_ + file_on_L2);
 
-  // Test compaction trigger and target_file_size_base
-  // Reduce compaction trigger to 2, and reduce L1 file size to 32KB.
-  // Writing to 64KB L0 files should trigger a compaction. Since these
-  // 2 L0 files have the same key range, compaction merge them and should
-  // result in 2 32KB L1 files.
-  ASSERT_OK(dbfull()->SetOptions({
-    {"level0_file_num_compaction_trigger", "2"},
-    {"target_file_size_base", ToString(k32KB) }
-  }));
+  ASSERT_OK(dbfull()->TEST_CompactRange(3, nullptr, nullptr, nullptr,
+                                        true /* disallow trivial move */));
+  ASSERT_EQ("0,0,0,0,1", FilesPerLevel(0));
+
+  // finish the flush!
+  blocking_thread.WakeUp();
+  blocking_thread.WaitUntilDone();
+  dbfull()->TEST_WaitForFlushMemTable();
+  ASSERT_EQ("1,0,0,0,1", FilesPerLevel(0));
 
-  gen_l0_kb(0, 64, 1);
-  ASSERT_EQ("1,1", FilesPerLevel());
-  gen_l0_kb(0, 64, 1);
-  dbfull()->TEST_WaitForCompact();
-  ASSERT_EQ("0,2", FilesPerLevel());
   metadata.clear();
   db_->GetLiveFilesMetaData(&metadata);
-  ASSERT_EQ(2U, metadata.size());
-  ASSERT_LE(metadata[0].size, k32KB + k4KB);
-  ASSERT_GE(metadata[0].size, k32KB - k4KB);
-  ASSERT_LE(metadata[1].size, k32KB + k4KB);
-  ASSERT_GE(metadata[1].size, k32KB - k4KB);
-
-  // Test max_bytes_for_level_base
-  // Increase level base size to 256KB and write enough data that will
-  // fill L1 and L2. L1 size should be around 256KB while L2 size should be
-  // around 256KB x 4.
-  ASSERT_OK(dbfull()->SetOptions({
-    {"max_bytes_for_level_base", ToString(k1MB) }
-  }));
+  ASSERT_EQ(metadata.size(), 2U);
 
-  // writing 96 x 64KB => 6 * 1024KB
-  // (L1 + L2) = (1 + 4) * 1024KB
-  for (int i = 0; i < 96; ++i) {
-    gen_l0_kb(i, 64, 96);
-  }
-  dbfull()->TEST_WaitForCompact();
-  ASSERT_GT(SizeAtLevel(1), k1MB / 2);
-  ASSERT_LT(SizeAtLevel(1), k1MB + k1MB / 2);
+  // This file should have been deleted during last compaction
+  ASSERT_EQ(Status::NotFound(), env_->FileExists(dbname_ + file_on_L2));
+  listener->VerifyMatchedCount(1);
+}
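
OnFileDeletionListener above is a db_test helper; the public mechanism is the
listener hook in Options::listeners, presumably built on
EventListener::OnTableFileDeleted. A minimal sketch, assuming the
rocksdb/listener.h API in this tree:

    #include <iostream>
    #include "rocksdb/listener.h"

    class LogDeletionListener : public rocksdb::EventListener {
     public:
      // Invoked after an obsolete SST file has been removed from disk
      void OnTableFileDeleted(
          const rocksdb::TableFileDeletionInfo& info) override {
        std::cout << "deleted " << info.file_path << " status "
                  << info.status.ToString() << std::endl;
      }
    };

    // Registration mirrors the test:
    //   options.listeners.emplace_back(new LogDeletionListener());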
 
-  // Within (0.5, 1.5) of 4MB.
-  ASSERT_GT(SizeAtLevel(2), 2 * k1MB);
-  ASSERT_LT(SizeAtLevel(2), 6 * k1MB);
+TEST_F(DBTest, CloseSpeedup) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleLevel;
+  options.write_buffer_size = 110 << 10;  // 110KB
+  options.arena_block_size = 4 << 10;
+  options.level0_file_num_compaction_trigger = 2;
+  options.num_levels = 4;
+  options.max_bytes_for_level_base = 400 * 1024;
+  options.max_write_buffer_number = 16;
 
-  // Test max_bytes_for_level_multiplier and
-  // max_bytes_for_level_base. Now, reduce both multiplier and level base.
-  // After filling enough data that fits in L1 - L3, we should see L1 size
-  // reduce to 128KB from the 256KB asserted previously. Same for L2.
-  ASSERT_OK(dbfull()->SetOptions({
-    {"max_bytes_for_level_multiplier", "2"},
-    {"max_bytes_for_level_base", ToString(k128KB) }
-  }));
+  // Block background threads
+  env_->SetBackgroundThreads(1, Env::LOW);
+  env_->SetBackgroundThreads(1, Env::HIGH);
+  test::SleepingBackgroundTask sleeping_task_low;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::LOW);
+  test::SleepingBackgroundTask sleeping_task_high;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+                 &sleeping_task_high, Env::Priority::HIGH);
 
-  // writing 20 x 64KB = 10 x 128KB
-  // (L1 + L2 + L3) = (1 + 2 + 4) * 128KB
-  for (int i = 0; i < 20; ++i) {
-    gen_l0_kb(i, 64, 32);
+  std::vector<std::string> filenames;
+  env_->GetChildren(dbname_, &filenames);
+  // Delete archival files.
+  for (size_t i = 0; i < filenames.size(); ++i) {
+    env_->DeleteFile(dbname_ + "/" + filenames[i]);
   }
-  dbfull()->TEST_WaitForCompact();
-  uint64_t total_size =
-    SizeAtLevel(1) + SizeAtLevel(2) + SizeAtLevel(3);
-  ASSERT_TRUE(total_size < k128KB * 7 * 1.5);
+  env_->DeleteDir(dbname_);
+  DestroyAndReopen(options);
 
-  // Test level0_stop_writes_trigger.
-  // Clean up memtable and L0. Block compaction threads. If continue to write
-  // and flush memtables. We should see put timeout after 8 memtable flushes
-  // since level0_stop_writes_trigger = 8
-  dbfull()->CompactRange(nullptr, nullptr);
-  // Block compaction
-  SleepingBackgroundTask sleeping_task_low1;
-  env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_low1,
-                 Env::Priority::LOW);
-  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
-  int count = 0;
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+  env_->SetBackgroundThreads(1, Env::LOW);
+  env_->SetBackgroundThreads(1, Env::HIGH);
   Random rnd(301);
-  WriteOptions wo;
-  wo.timeout_hint_us = 10000;
-  while (Put(Key(count), RandomString(&rnd, 1024), wo).ok() && count < 64) {
-    dbfull()->TEST_FlushMemTable(true);
-    count++;
+  int key_idx = 0;
+
+  // First three 110KB files are not going to level 2
+  // After that, (100K, 200K)
+  for (int num = 0; num < 5; num++) {
+    GenerateNewFile(&rnd, &key_idx, true);
   }
-  // Stop trigger = 8
-  ASSERT_EQ(count, 8);
-  // Unblock
-  sleeping_task_low1.WakeUp();
-  sleeping_task_low1.WaitUntilDone();
 
-  // Now reduce level0_stop_writes_trigger to 6. Clear up memtables and L0.
-  // Block compaction thread again. Perform the put and memtable flushes
-  // until we see timeout after 6 memtable flushes.
-  ASSERT_OK(dbfull()->SetOptions({
-    {"level0_stop_writes_trigger", "6"}
-  }));
-  dbfull()->CompactRange(nullptr, nullptr);
-  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+  ASSERT_EQ(0, GetSstFileCount(dbname_));
 
-  // Block compaction
-  SleepingBackgroundTask sleeping_task_low2;
-  env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_low2,
-                 Env::Priority::LOW);
-  count = 0;
-  while (Put(Key(count), RandomString(&rnd, 1024), wo).ok() && count < 64) {
-    dbfull()->TEST_FlushMemTable(true);
-    count++;
-  }
-  ASSERT_EQ(count, 6);
-  // Unblock
-  sleeping_task_low2.WakeUp();
-  sleeping_task_low2.WaitUntilDone();
+  Close();
+  ASSERT_EQ(0, GetSstFileCount(dbname_));
 
-  // Test disable_auto_compactions
-  // Compaction thread is unblocked but auto compaction is disabled. Write
-  // 4 L0 files and compaction should be triggered. If auto compaction is
-  // disabled, then TEST_WaitForCompact will be waiting for nothing. Number of
-  // L0 files do not change after the call.
-  ASSERT_OK(dbfull()->SetOptions({
-    {"disable_auto_compactions", "true"}
-  }));
-  dbfull()->CompactRange(nullptr, nullptr);
-  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+  // Unblock background threads
+  sleeping_task_high.WakeUp();
+  sleeping_task_high.WaitUntilDone();
+  sleeping_task_low.WakeUp();
+  sleeping_task_low.WaitUntilDone();
 
-  for (int i = 0; i < 4; ++i) {
-    ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024)));
-    // Wait for compaction so that put won't timeout
-    dbfull()->TEST_FlushMemTable(true);
-  }
-  dbfull()->TEST_WaitForCompact();
-  ASSERT_EQ(NumTableFilesAtLevel(0), 4);
+  Destroy(options);
+}
 
-  // Enable auto compaction and perform the same test, # of L0 files should be
-  // reduced after compaction.
-  ASSERT_OK(dbfull()->SetOptions({
-    {"disable_auto_compactions", "false"}
-  }));
-  dbfull()->CompactRange(nullptr, nullptr);
-  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+class DelayedMergeOperator : public AssociativeMergeOperator {
+ private:
+  DBTest* db_test_;
 
-  for (int i = 0; i < 4; ++i) {
-    ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024)));
-    // Wait for compaction so that put won't timeout
-    dbfull()->TEST_FlushMemTable(true);
+ public:
+  explicit DelayedMergeOperator(DBTest* d) : db_test_(d) {}
+  virtual bool Merge(const Slice& key, const Slice* existing_value,
+                     const Slice& value, std::string* new_value,
+                     Logger* logger) const override {
+    db_test_->env_->addon_time_.fetch_add(1000);
+    return true;
   }
-  dbfull()->TEST_WaitForCompact();
-  ASSERT_LT(NumTableFilesAtLevel(0), 4);
 
-  // Test for hard_rate_limit.
-  // First change max_bytes_for_level_base to a big value and populate
-  // L1 - L3. Then shrink max_bytes_for_level_base and disable auto compaction
-  // at the same time; we should see some level with a score greater than 2.
-  ASSERT_OK(dbfull()->SetOptions({
-    {"max_bytes_for_level_base", ToString(k1MB) }
-  }));
-  // writing 40 x 64KB = 10 x 256KB
-  // (L1 + L2 + L3) = (1 + 2 + 4) * 256KB
-  for (int i = 0; i < 40; ++i) {
-    gen_l0_kb(i, 64, 32);
-  }
-  dbfull()->TEST_WaitForCompact();
-  ASSERT_TRUE((SizeAtLevel(1) > k1MB * 0.8 &&
-               SizeAtLevel(1) < k1MB * 1.2) ||
-              (SizeAtLevel(2) > 2 * k1MB * 0.8 &&
-               SizeAtLevel(2) < 2 * k1MB * 1.2) ||
-              (SizeAtLevel(3) > 4 * k1MB * 0.8 &&
-               SizeAtLevel(3) < 4 * k1MB * 1.2));
-  // Reduce max_bytes_for_level_base and disable compaction at the same time
-  // This should cause score to increase
-  ASSERT_OK(dbfull()->SetOptions({
-    {"disable_auto_compactions", "true"},
-    {"max_bytes_for_level_base", "65536"},
-  }));
-  ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024)));
-  dbfull()->TEST_FlushMemTable(true);
+  virtual const char* Name() const override { return "DelayedMergeOperator"; }
+};
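
DelayedMergeOperator only advances the mock clock and never fills *new_value,
which suffices for a timing test but not for real use. For contrast, a minimal
associative operator with the same Merge signature that actually produces a
merged value (a sketch, not part of this change):

    #include <string>
    #include "rocksdb/merge_operator.h"
    #include "rocksdb/slice.h"

    class StringAppendOp : public rocksdb::AssociativeMergeOperator {
     public:
      virtual bool Merge(const rocksdb::Slice& key,
                         const rocksdb::Slice* existing_value,
                         const rocksdb::Slice& value, std::string* new_value,
                         rocksdb::Logger* logger) const override {
        new_value->clear();
        if (existing_value != nullptr) {
          // Keep the previous value and separate entries with a comma
          new_value->assign(existing_value->data(), existing_value->size());
          new_value->append(",");
        }
        new_value->append(value.data(), value.size());
        return true;  // returning false would signal a merge failure
      }

      virtual const char* Name() const override { return "StringAppendOp"; }
    };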
 
-  // Check score is above 2
-  ASSERT_TRUE(SizeAtLevel(1) / k64KB > 2 ||
-              SizeAtLevel(2) / k64KB > 4 ||
-              SizeAtLevel(3) / k64KB > 8);
+TEST_F(DBTest, MergeTestTime) {
+  std::string one, two, three;
+  PutFixed64(&one, 1);
+  PutFixed64(&two, 2);
+  PutFixed64(&three, 3);
 
-  // Enforce hard rate limit. Now set hard_rate_limit to 2,
-  // we should start to see put delay (1000 us) and timeout as a result
-  // (L0 score is not regulated by this limit).
-  ASSERT_OK(dbfull()->SetOptions({
-    {"hard_rate_limit", "2"},
-    {"level0_slowdown_writes_trigger", "18"},
-    {"level0_stop_writes_trigger", "20"}
-  }));
-  ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024)));
-  dbfull()->TEST_FlushMemTable(true);
+  // Enable time profiling
+  SetPerfLevel(kEnableTime);
+  this->env_->addon_time_.store(0);
+  Options options;
+  options = CurrentOptions(options);
+  options.statistics = rocksdb::CreateDBStatistics();
+  options.merge_operator.reset(new DelayedMergeOperator(this));
+  DestroyAndReopen(options);
 
-  std::atomic<int> sleep_count(0);
-  rocksdb::SyncPoint::GetInstance()->SetCallBack(
-      "DBImpl::DelayWrite:Sleep", [&](void* arg) { sleep_count.fetch_add(1); });
-  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_EQ(TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME), 0);
+  db_->Put(WriteOptions(), "foo", one);
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->Merge(WriteOptions(), "foo", two));
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->Merge(WriteOptions(), "foo", three));
+  ASSERT_OK(Flush());
 
-  // Hard rate limit slow down for 1000 us, so default 10ms should be ok
-  ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), wo));
-  sleep_count.store(0);
-  ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), wo));
-  ASSERT_GT(sleep_count.load(), 0);
+  ReadOptions opt;
+  opt.verify_checksums = true;
+  opt.snapshot = nullptr;
+  std::string result;
+  db_->Get(opt, "foo", &result);
 
-  // Lift the limit and no timeout
-  ASSERT_OK(dbfull()->SetOptions({
-    {"hard_rate_limit", "200"},
-  }));
-  dbfull()->TEST_FlushMemTable(true);
-  sleep_count.store(0);
-  ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), wo));
-  // Technically, time out is still possible for timing issue.
-  ASSERT_EQ(sleep_count.load(), 0);
-  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+  ASSERT_LT(TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME), 2800000);
+  ASSERT_GT(TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME), 1200000);
 
+  ReadOptions read_options;
+  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+  int count = 0;
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    ASSERT_OK(iter->status());
+    ++count;
+  }
 
-  // Test max_mem_compaction_level.
-  // Destroy DB and start from scratch
-  options.max_background_compactions = 1;
-  options.max_background_flushes = 0;
-  options.max_mem_compaction_level = 2;
-  DestroyAndReopen(options);
-  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
-  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
-  ASSERT_EQ(NumTableFilesAtLevel(2), 0);
+  ASSERT_EQ(1, count);
 
-  ASSERT_OK(Put("max_mem_compaction_level_key", RandomString(&rnd, 8)));
-  dbfull()->TEST_FlushMemTable(true);
-  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
-  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
-  ASSERT_EQ(NumTableFilesAtLevel(2), 1);
+  ASSERT_LT(TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME), 6000000);
+  ASSERT_GT(TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME), 3200000);
+#if ROCKSDB_USING_THREAD_STATUS
+  ASSERT_GT(TestGetTickerCount(options, FLUSH_WRITE_BYTES), 0);
+#endif  // ROCKSDB_USING_THREAD_STATUS
+}
 
-  ASSERT_TRUE(Put("max_mem_compaction_level_key",
-              RandomString(&rnd, 8)).ok());
-  // Set new value and it becomes effective in this flush
-  ASSERT_OK(dbfull()->SetOptions({
-    {"max_mem_compaction_level", "1"}
-  }));
-  dbfull()->TEST_FlushMemTable(true);
-  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
-  ASSERT_EQ(NumTableFilesAtLevel(1), 1);
-  ASSERT_EQ(NumTableFilesAtLevel(2), 1);
+TEST_P(DBTestWithParam, MergeCompactionTimeTest) {
+  SetPerfLevel(kEnableTime);
+  Options options;
+  options = CurrentOptions(options);
+  options.compaction_filter_factory = std::make_shared<KeepFilterFactory>();
+  options.statistics = rocksdb::CreateDBStatistics();
+  options.merge_operator.reset(new DelayedMergeOperator(this));
+  options.compaction_style = kCompactionStyleUniversal;
+  options.max_subcompactions = max_subcompactions_;
+  DestroyAndReopen(options);
 
-  ASSERT_TRUE(Put("max_mem_compaction_level_key",
-              RandomString(&rnd, 8)).ok());
-  // Set new value and it becomes effective in this flush
-  ASSERT_OK(dbfull()->SetOptions({
-    {"max_mem_compaction_level", "0"}
-  }));
-  dbfull()->TEST_FlushMemTable(true);
-  ASSERT_EQ(NumTableFilesAtLevel(0), 1);
-  ASSERT_EQ(NumTableFilesAtLevel(1), 1);
-  ASSERT_EQ(NumTableFilesAtLevel(2), 1);
+  for (int i = 0; i < 1000; i++) {
+    ASSERT_OK(db_->Merge(WriteOptions(), "foo", "TEST"));
+    ASSERT_OK(Flush());
+  }
+  dbfull()->TEST_WaitForFlushMemTable();
+  dbfull()->TEST_WaitForCompact();
+
+  ASSERT_NE(TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME), 0);
 }
 
-TEST_F(DBTest, FileCreationRandomFailure) {
+TEST_P(DBTestWithParam, FilterCompactionTimeTest) {
   Options options;
-  options.env = env_;
+  options.compaction_filter_factory =
+      std::make_shared<DelayFilterFactory>(this);
+  options.disable_auto_compactions = true;
   options.create_if_missing = true;
-  options.write_buffer_size = 100000;  // Small write buffer
-  options.target_file_size_base = 200000;
-  options.max_bytes_for_level_base = 1000000;
-  options.max_bytes_for_level_multiplier = 2;
-
+  options.statistics = rocksdb::CreateDBStatistics();
+  options.max_subcompactions = max_subcompactions_;
+  options = CurrentOptions(options);
   DestroyAndReopen(options);
-  Random rnd(301);
 
-  const int kTestSize = kCDTKeysPerBuffer * 4096;
-  const int kTotalIteration = 100;
-  // the second half of the test involves random failure
-  // of file creation.
-  const int kRandomFailureTest = kTotalIteration / 2;
-  std::vector<std::string> values;
-  for (int i = 0; i < kTestSize; ++i) {
-    values.push_back("NOT_FOUND");
-  }
-  for (int j = 0; j < kTotalIteration; ++j) {
-    if (j == kRandomFailureTest) {
-      env_->non_writeable_rate_.store(90);
-    }
-    for (int k = 0; k < kTestSize; ++k) {
-      // here we expect some of the Put fails.
-      std::string value = RandomString(&rnd, 100);
-      Status s = Put(Key(k), Slice(value));
-      if (s.ok()) {
-        // update the latest successful put
-        values[k] = value;
-      }
-      // But everything before we simulate the failure-test should succeed.
-      if (j < kRandomFailureTest) {
-        ASSERT_OK(s);
-      }
+  // put some data
+  for (int table = 0; table < 4; ++table) {
+    for (int i = 0; i < 10 + table; ++i) {
+      Put(ToString(table * 100 + i), "val");
     }
+    Flush();
   }
 
-  // If rocksdb does not do the correct job, internal assert will fail here.
-  dbfull()->TEST_WaitForFlushMemTable();
-  dbfull()->TEST_WaitForCompact();
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ(0U, CountLiveFiles());
 
-  // verify we have the latest successful update
-  for (int k = 0; k < kTestSize; ++k) {
-    auto v = Get(Key(k));
-    ASSERT_EQ(v, values[k]);
-  }
+  Reopen(options);
+
+  Iterator* itr = db_->NewIterator(ReadOptions());
+  itr->SeekToFirst();
+  ASSERT_NE(TestGetTickerCount(options, FILTER_OPERATION_TOTAL_TIME), 0);
+  delete itr;
+}
 
-  // reopen and reverify we have the latest successful update
-  env_->non_writeable_rate_.store(0);
+TEST_F(DBTest, TestLogCleanup) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 64 * 1024;  // very small
+  // only two memtables allowed ==> only two log files
+  options.max_write_buffer_number = 2;
   Reopen(options);
-  for (int k = 0; k < kTestSize; ++k) {
-    auto v = Get(Key(k));
-    ASSERT_EQ(v, values[k]);
+
+  for (int i = 0; i < 100000; ++i) {
+    Put(Key(i), "val");
+    // only 2 memtables will be alive, so the logs_to_free size should never
+    // exceed 2
+    ASSERT_LT(dbfull()->TEST_LogsToFreeSize(), static_cast<size_t>(3));
   }
 }
 
-TEST_F(DBTest, PartialCompactionFailure) {
+TEST_F(DBTest, EmptyCompactedDB) {
   Options options;
-  const int kKeySize = 16;
-  const int kKvSize = 1000;
-  const int kKeysPerBuffer = 100;
-  const int kNumL1Files = 5;
-  options.create_if_missing = true;
-  options.write_buffer_size = kKeysPerBuffer * kKvSize;
-  options.max_write_buffer_number = 2;
-  options.target_file_size_base =
-      options.write_buffer_size *
-      (options.max_write_buffer_number - 1);
-  options.level0_file_num_compaction_trigger = kNumL1Files;
-  options.max_bytes_for_level_base =
-      options.level0_file_num_compaction_trigger *
-      options.target_file_size_base;
-  options.max_bytes_for_level_multiplier = 2;
-  options.compression = kNoCompression;
+  options.max_open_files = -1;
+  options = CurrentOptions(options);
+  Close();
+  ASSERT_OK(ReadOnlyReopen(options));
+  Status s = Put("new", "value");
+  ASSERT_TRUE(s.IsNotSupported());
+  Close();
+}
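
EmptyCompactedDB goes through the read-only open path (ReadOnlyReopen is a
db_test helper), where any write returns Status::NotSupported. A sketch of the
public entry point, assuming DB::OpenForReadOnly from rocksdb/db.h:

    #include <string>
    #include "rocksdb/db.h"

    rocksdb::Status OpenReadOnly(const std::string& path, rocksdb::DB** db) {
      rocksdb::Options options;
      options.max_open_files = -1;  // as in the test: keep all files open
      return rocksdb::DB::OpenForReadOnly(options, path, db);
    }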
 
-  env_->SetBackgroundThreads(1, Env::HIGH);
-  env_->SetBackgroundThreads(1, Env::LOW);
-  // stop the compaction thread until we simulate the file creation failure.
-  SleepingBackgroundTask sleeping_task_low;
-  env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
-                 Env::Priority::LOW);
+class CountingDeleteTabPropCollector : public TablePropertiesCollector {
+ public:
+  const char* Name() const override { return "CountingDeleteTabPropCollector"; }
 
-  options.env = env_;
+  Status AddUserKey(const Slice& user_key, const Slice& value, EntryType type,
+                    SequenceNumber seq, uint64_t file_size) override {
+    if (type == kEntryDelete) {
+      num_deletes_++;
+    }
+    return Status::OK();
+  }
 
-  DestroyAndReopen(options);
+  bool NeedCompact() const override { return num_deletes_ > 10; }
 
-  const int kNumInsertedKeys =
-      options.level0_file_num_compaction_trigger *
-      (options.max_write_buffer_number - 1) *
-      kKeysPerBuffer;
+  UserCollectedProperties GetReadableProperties() const override {
+    return UserCollectedProperties{};
+  }
 
-  Random rnd(301);
-  std::vector<std::string> keys;
-  std::vector<std::string> values;
-  for (int k = 0; k < kNumInsertedKeys; ++k) {
-    keys.emplace_back(RandomString(&rnd, kKeySize));
-    values.emplace_back(RandomString(&rnd, kKvSize - kKeySize));
-    ASSERT_OK(Put(Slice(keys[k]), Slice(values[k])));
+  Status Finish(UserCollectedProperties* properties) override {
+    *properties =
+        UserCollectedProperties{{"num_delete", ToString(num_deletes_)}};
+    return Status::OK();
   }
 
-  dbfull()->TEST_FlushMemTable(true);
-  // Make sure the number of L0 files can trigger compaction.
-  ASSERT_GE(NumTableFilesAtLevel(0),
-            options.level0_file_num_compaction_trigger);
+ private:
+  uint32_t num_deletes_ = 0;
+};
+
+class CountingDeleteTabPropCollectorFactory
+    : public TablePropertiesCollectorFactory {
+ public:
+  virtual TablePropertiesCollector* CreateTablePropertiesCollector() override {
+    return new CountingDeleteTabPropCollector();
+  }
+  const char* Name() const override {
+    return "CountingDeleteTabPropCollectorFactory";
+  }
+};
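
The factory is handed to the DB through
Options::table_properties_collector_factories, exactly as the test below does;
NeedCompact() returning true marks an output file as a compaction candidate.
A registration sketch, assuming the two classes above:

    #include <memory>
    #include "rocksdb/options.h"

    void RegisterCollector(rocksdb::Options* options) {
      options->table_properties_collector_factories.emplace_back(
          std::make_shared<CountingDeleteTabPropCollectorFactory>());
    }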
 
-  auto previous_num_level0_files = NumTableFilesAtLevel(0);
+TEST_F(DBTest, TablePropertiesNeedCompactTest) {
+  Random rnd(301);
 
-  // Fail the first file creation.
-  env_->non_writable_count_ = 1;
-  sleeping_task_low.WakeUp();
-  sleeping_task_low.WaitUntilDone();
+  Options options;
+  options.create_if_missing = true;
+  options.write_buffer_size = 4096;
+  options.max_write_buffer_number = 8;
+  options.level0_file_num_compaction_trigger = 2;
+  options.level0_slowdown_writes_trigger = 2;
+  options.level0_stop_writes_trigger = 4;
+  options.target_file_size_base = 2048;
+  options.max_bytes_for_level_base = 10240;
+  options.max_bytes_for_level_multiplier = 4;
+  options.soft_rate_limit = 1.1;
+  options.num_levels = 8;
 
-  // Expect compaction to fail here as one file will fail its
-  // creation.
-  ASSERT_TRUE(!dbfull()->TEST_WaitForCompact().ok());
+  std::shared_ptr<TablePropertiesCollectorFactory> collector_factory(
+      new CountingDeleteTabPropCollectorFactory);
+  options.table_properties_collector_factories.resize(1);
+  options.table_properties_collector_factories[0] = collector_factory;
 
-  // Verify L0 -> L1 compaction does fail.
-  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+  DestroyAndReopen(options);
 
-  // Verify all L0 files are still there.
-  ASSERT_EQ(NumTableFilesAtLevel(0), previous_num_level0_files);
+  const int kMaxKey = 1000;
+  for (int i = 0; i < kMaxKey; i++) {
+    ASSERT_OK(Put(Key(i), RandomString(&rnd, 102)));
+    ASSERT_OK(Put(Key(kMaxKey + i), RandomString(&rnd, 102)));
+  }
+  Flush();
+  dbfull()->TEST_WaitForCompact();
+  if (NumTableFilesAtLevel(0) == 1) {
+    // Clear Level 0 so that when we later flush a file with deletions,
+    // we don't trigger an organic compaction.
+    ASSERT_OK(Put(Key(0), ""));
+    ASSERT_OK(Put(Key(kMaxKey * 2), ""));
+    Flush();
+    dbfull()->TEST_WaitForCompact();
+  }
+  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
 
-  // All key-values must exist after compaction fails.
-  for (int k = 0; k < kNumInsertedKeys; ++k) {
-    ASSERT_EQ(values[k], Get(keys[k]));
+  {
+    int c = 0;
+    std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+    iter->Seek(Key(kMaxKey - 100));
+    while (iter->Valid() && iter->key().compare(Key(kMaxKey + 100)) < 0) {
+      iter->Next();
+      ++c;
+    }
+    ASSERT_EQ(c, 200);
   }
 
-  env_->non_writable_count_ = 0;
+  Delete(Key(0));
+  for (int i = kMaxKey - 100; i < kMaxKey + 100; i++) {
+    Delete(Key(i));
+  }
+  Delete(Key(kMaxKey * 2));
 
-  // Make sure RocksDB will not get into corrupted state.
-  Reopen(options);
+  Flush();
+  dbfull()->TEST_WaitForCompact();
 
-  // Verify again after reopen.
-  for (int k = 0; k < kNumInsertedKeys; ++k) {
-    ASSERT_EQ(values[k], Get(keys[k]));
+  {
+    SetPerfLevel(kEnableCount);
+    perf_context.Reset();
+    int c = 0;
+    std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+    iter->Seek(Key(kMaxKey - 100));
+    while (iter->Valid() && iter->key().compare(Key(kMaxKey + 100)) < 0) {
+      iter->Next();
+      ++c;  // count keys found in the deleted range; there should be none
+    }
+    ASSERT_EQ(c, 0);
+    ASSERT_LT(perf_context.internal_delete_skipped_count, 30u);
+    ASSERT_LT(perf_context.internal_key_skipped_count, 30u);
+    SetPerfLevel(kDisable);
   }
 }
 
-TEST_F(DBTest, DynamicMiscOptions) {
-  // Test max_sequential_skip_in_iterations
-  Options options;
-  options.env = env_;
-  options.create_if_missing = true;
-  options.max_sequential_skip_in_iterations = 16;
-  options.compression = kNoCompression;
-  options.statistics = rocksdb::CreateDBStatistics();
-  DestroyAndReopen(options);
-
-  auto assert_reseek_count = [this, &options](int key_start, int num_reseek) {
-    int key0 = key_start;
-    int key1 = key_start + 1;
-    int key2 = key_start + 2;
-    Random rnd(301);
-    ASSERT_OK(Put(Key(key0), RandomString(&rnd, 8)));
-    for (int i = 0; i < 10; ++i) {
-      ASSERT_OK(Put(Key(key1), RandomString(&rnd, 8)));
+TEST_F(DBTest, SuggestCompactRangeTest) {
+  class CompactionFilterFactoryGetContext : public CompactionFilterFactory {
+   public:
+    virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+        const CompactionFilter::Context& context) override {
+      saved_context = context;
+      std::unique_ptr<CompactionFilter> empty_filter;
+      return empty_filter;
     }
-    ASSERT_OK(Put(Key(key2), RandomString(&rnd, 8)));
-    std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
-    iter->Seek(Key(key1));
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ(iter->key().compare(Key(key1)), 0);
-    iter->Next();
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ(iter->key().compare(Key(key2)), 0);
-    ASSERT_EQ(num_reseek,
-              TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION));
+    const char* Name() const override {
+      return "CompactionFilterFactoryGetContext";
+    }
+    static bool IsManual(CompactionFilterFactory* compaction_filter_factory) {
+      return reinterpret_cast<CompactionFilterFactoryGetContext*>(
+                 compaction_filter_factory)->saved_context.is_manual_compaction;
+    }
+    CompactionFilter::Context saved_context;
   };
-  // No reseek
-  assert_reseek_count(100, 0);
 
-  ASSERT_OK(dbfull()->SetOptions({
-    {"max_sequential_skip_in_iterations", "4"}
-  }));
-  // Clear memtable and make new option effective
-  dbfull()->TEST_FlushMemTable(true);
-  // Trigger reseek
-  assert_reseek_count(200, 1);
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleLevel;
+  options.compaction_filter_factory.reset(
+      new CompactionFilterFactoryGetContext());
+  options.write_buffer_size = 100 << 10;
+  options.arena_block_size = 4 << 10;
+  options.level0_file_num_compaction_trigger = 4;
+  options.num_levels = 4;
+  options.compression = kNoCompression;
+  options.max_bytes_for_level_base = 450 << 10;
+  options.target_file_size_base = 98 << 10;
+  options.max_grandparent_overlap_factor = 1 << 20;  // inf
 
-  ASSERT_OK(dbfull()->SetOptions({
-    {"max_sequential_skip_in_iterations", "16"}
-  }));
-  // Clear memtable and make new option effective
-  dbfull()->TEST_FlushMemTable(true);
-  // No reseek
-  assert_reseek_count(300, 1);
-}
+  Reopen(options);
 
-TEST_F(DBTest, DontDeletePendingOutputs) {
-  Options options;
-  options.env = env_;
-  options.create_if_missing = true;
-  DestroyAndReopen(options);
+  Random rnd(301);
 
-  // Every time we write to a table file, call FOF/POF with full DB scan. This
-  // will make sure our pending_outputs_ protection work correctly
-  std::function<void()> purge_obsolete_files_function = [&]() {
-    JobContext job_context(0);
-    dbfull()->TEST_LockMutex();
-    dbfull()->FindObsoleteFiles(&job_context, true /*force*/);
-    dbfull()->TEST_UnlockMutex();
-    dbfull()->PurgeObsoleteFiles(job_context);
-  };
+  for (int num = 0; num < 3; num++) {
+    GenerateNewRandomFile(&rnd);
+  }
 
-  env_->table_write_callback_ = &purge_obsolete_files_function;
+  GenerateNewRandomFile(&rnd);
+  ASSERT_EQ("0,4", FilesPerLevel(0));
+  ASSERT_TRUE(!CompactionFilterFactoryGetContext::IsManual(
+                   options.compaction_filter_factory.get()));
 
-  for (int i = 0; i < 2; ++i) {
-    ASSERT_OK(Put("a", "begin"));
-    ASSERT_OK(Put("z", "end"));
-    ASSERT_OK(Flush());
-  }
+  GenerateNewRandomFile(&rnd);
+  ASSERT_EQ("1,4", FilesPerLevel(0));
 
-  // If pending output guard does not work correctly, PurgeObsoleteFiles() will
-  // delete the file that Compaction is trying to create, causing this: error
-  // db/db_test.cc:975: IO error:
-  // /tmp/rocksdbtest-1552237650/db_test/000009.sst: No such file or directory
-  Compact("a", "b");
-}
+  GenerateNewRandomFile(&rnd);
+  ASSERT_EQ("2,4", FilesPerLevel(0));
 
-TEST_F(DBTest, DontDeleteMovedFile) {
-  // This test triggers move compaction and verifies that the file is not
-  // deleted when it's part of move compaction
-  Options options = CurrentOptions();
-  options.env = env_;
-  options.create_if_missing = true;
-  options.max_bytes_for_level_base = 1024 * 1024;  // 1 MB
-  options.level0_file_num_compaction_trigger =
-      2;  // trigger compaction when we have 2 files
-  DestroyAndReopen(options);
+  GenerateNewRandomFile(&rnd);
+  ASSERT_EQ("3,4", FilesPerLevel(0));
 
-  Random rnd(301);
-  // Create two 1MB sst files
-  for (int i = 0; i < 2; ++i) {
-    // Create 1MB sst file
-    for (int j = 0; j < 100; ++j) {
-      ASSERT_OK(Put(Key(i * 50 + j), RandomString(&rnd, 10 * 1024)));
-    }
-    ASSERT_OK(Flush());
-  }
-  // this should execute both L0->L1 and L1->(move)->L2 compactions
-  dbfull()->TEST_WaitForCompact();
-  ASSERT_EQ("0,0,1", FilesPerLevel(0));
+  GenerateNewRandomFile(&rnd);
+  ASSERT_EQ("0,4,4", FilesPerLevel(0));
 
-  // If the moved file is actually deleted (the move-safeguard in
-  // ~Version::Version() is not there), we get this failure:
-  // Corruption: Can't access /000009.sst
-  Reopen(options);
-}
+  GenerateNewRandomFile(&rnd);
+  ASSERT_EQ("1,4,4", FilesPerLevel(0));
 
-TEST_F(DBTest, DeleteMovedFileAfterCompaction) {
-  // iter 1 -- delete_obsolete_files_period_micros == 0
-  for (int iter = 0; iter < 2; ++iter) {
-    // This test triggers move compaction and verifies that the file is not
-    // deleted when it's part of move compaction
-    Options options = CurrentOptions();
-    options.env = env_;
-    if (iter == 1) {
-      options.delete_obsolete_files_period_micros = 0;
-    }
-    options.create_if_missing = true;
-    options.level0_file_num_compaction_trigger =
-        2;  // trigger compaction when we have 2 files
-    DestroyAndReopen(options);
+  GenerateNewRandomFile(&rnd);
+  ASSERT_EQ("2,4,4", FilesPerLevel(0));
 
-    Random rnd(301);
-    // Create two 1MB sst files
-    for (int i = 0; i < 2; ++i) {
-      // Create 1MB sst file
-      for (int j = 0; j < 100; ++j) {
-        ASSERT_OK(Put(Key(i * 50 + j), RandomString(&rnd, 10 * 1024)));
-      }
-      ASSERT_OK(Flush());
-    }
-    // this should execute L0->L1
-    dbfull()->TEST_WaitForCompact();
-    ASSERT_EQ("0,1", FilesPerLevel(0));
+  GenerateNewRandomFile(&rnd);
+  ASSERT_EQ("3,4,4", FilesPerLevel(0));
 
-    // block compactions
-    SleepingBackgroundTask sleeping_task;
-    env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task,
-                   Env::Priority::LOW);
+  GenerateNewRandomFile(&rnd);
+  ASSERT_EQ("0,4,8", FilesPerLevel(0));
 
-    options.max_bytes_for_level_base = 1024 * 1024;  // 1 MB
-    Reopen(options);
-    std::unique_ptr<Iterator> iterator(db_->NewIterator(ReadOptions()));
-    ASSERT_EQ("0,1", FilesPerLevel(0));
-    // let compactions go
-    sleeping_task.WakeUp();
-    sleeping_task.WaitUntilDone();
+  GenerateNewRandomFile(&rnd);
+  ASSERT_EQ("1,4,8", FilesPerLevel(0));
 
-    // this should execute L1->L2 (move)
+  // compact it three times
+  for (int i = 0; i < 3; ++i) {
+    ASSERT_OK(experimental::SuggestCompactRange(db_, nullptr, nullptr));
     dbfull()->TEST_WaitForCompact();
+  }
 
-    ASSERT_EQ("0,0,1", FilesPerLevel(0));
-
-    std::vector<LiveFileMetaData> metadata;
-    db_->GetLiveFilesMetaData(&metadata);
-    ASSERT_EQ(metadata.size(), 1U);
-    auto moved_file_name = metadata[0].name;
+  ASSERT_EQ("0,0,13", FilesPerLevel(0));
 
-    // Create two more 1MB sst files
-    for (int i = 0; i < 2; ++i) {
-      // Create 1MB sst file
-      for (int j = 0; j < 100; ++j) {
-        ASSERT_OK(Put(Key(i * 50 + j + 100), RandomString(&rnd, 10 * 1024)));
-      }
-      ASSERT_OK(Flush());
-    }
-    // this should execute both L0->L1 and L1->L2 (merge with previous file)
-    dbfull()->TEST_WaitForCompact();
+  GenerateNewRandomFile(&rnd);
+  ASSERT_EQ("1,0,13", FilesPerLevel(0));
 
-    ASSERT_EQ("0,0,2", FilesPerLevel(0));
+  // non-overlapping with the file on level 0
+  Slice start("a"), end("b");
+  ASSERT_OK(experimental::SuggestCompactRange(db_, &start, &end));
+  dbfull()->TEST_WaitForCompact();
 
-    // iterator is holding the file
-    ASSERT_TRUE(env_->FileExists(dbname_ + "/" + moved_file_name));
+  // should not compact the level 0 file
+  ASSERT_EQ("1,0,13", FilesPerLevel(0));
 
-    iterator.reset();
+  start = Slice("j");
+  end = Slice("m");
+  ASSERT_OK(experimental::SuggestCompactRange(db_, &start, &end));
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_TRUE(CompactionFilterFactoryGetContext::IsManual(
+      options.compaction_filter_factory.get()));
 
-    // this file should have been compacted away
-    ASSERT_TRUE(!env_->FileExists(dbname_ + "/" + moved_file_name));
-  }
+  // now it should compact the level 0 file
+  ASSERT_EQ("0,1,13", FilesPerLevel(0));
 }
 
-TEST_F(DBTest, OptimizeFiltersForHits) {
+TEST_F(DBTest, PromoteL0) {
   Options options = CurrentOptions();
-  options.write_buffer_size = 256 * 1024;
-  options.target_file_size_base = 256 * 1024;
-  options.level0_file_num_compaction_trigger = 2;
-  options.level0_slowdown_writes_trigger = 2;
-  options.level0_stop_writes_trigger = 4;
-  options.max_bytes_for_level_base = 256 * 1024;
-  options.max_write_buffer_number = 2;
-  options.max_background_compactions = 8;
-  options.max_background_flushes = 8;
-  options.compaction_style = kCompactionStyleLevel;
-  BlockBasedTableOptions bbto;
-  bbto.filter_policy.reset(NewBloomFilterPolicy(10, true));
-  bbto.whole_key_filtering = true;
-  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
-  options.optimize_filters_for_hits = true;
-  options.statistics = rocksdb::CreateDBStatistics();
-  CreateAndReopenWithCF({"mypikachu"}, options);
-
-  int numkeys = 200000;
-  for (int i = 0; i < 20; i += 2) {
-    for (int j = i; j < numkeys; j += 20) {
-      ASSERT_OK(Put(1, Key(j), "val"));
-    }
-  }
+  options.disable_auto_compactions = true;
+  options.write_buffer_size = 10 * 1024 * 1024;
+  DestroyAndReopen(options);
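+  // experimental::PromoteL0() trivially moves every L0 file to the target
+  // level; as PromoteL0Failure below shows, it requires non-overlapping L0
+  // files and empty intermediate levels.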
 
+  // non-overlapping ranges
+  std::vector<std::pair<int32_t, int32_t>> ranges = {
+      {81, 160}, {0, 80}, {161, 240}, {241, 320}};
 
-  ASSERT_OK(Flush(1));
-  dbfull()->TEST_WaitForCompact();
+  int32_t value_size = 10 * 1024;  // 10 KB
 
-  for (int i = 1; i < numkeys; i += 2) {
-    ASSERT_EQ(Get(1, Key(i)), "NOT_FOUND");
+  Random rnd(301);
+  std::map<int32_t, std::string> values;
+  for (const auto& range : ranges) {
+    for (int32_t j = range.first; j < range.second; j++) {
+      values[j] = RandomString(&rnd, value_size);
+      ASSERT_OK(Put(Key(j), values[j]));
+    }
+    ASSERT_OK(Flush());
   }
 
-  ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L0));
-  ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L1));
-  ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L2_AND_UP));
+  int32_t level0_files = NumTableFilesAtLevel(0, 0);
+  ASSERT_EQ(level0_files, ranges.size());
+  ASSERT_EQ(NumTableFilesAtLevel(1, 0), 0);  // No files in L1
 
-  // When the skip_filters_on_last_level is ON, the last level which has
-  // most of the keys does not use bloom filters. We end up using
-  // bloom filters in a very small number of cases. Without the flag.
-  // this number would be close to 150000 (all the key at the last level) +
-  // some use in the upper levels
-  //
-  ASSERT_GT(90000, TestGetTickerCount(options, BLOOM_FILTER_USEFUL));
+  // Promote L0 level to L2.
+  ASSERT_OK(experimental::PromoteL0(db_, db_->DefaultColumnFamily(), 2));
+  // We expect that all the files were trivially moved from L0 to L2
+  ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(2, 0), level0_files);
 
-  for (int i = 0; i < numkeys; i += 2) {
-    ASSERT_EQ(Get(1, Key(i)), "val");
+  for (const auto& kv : values) {
+    ASSERT_EQ(Get(Key(kv.first)), kv.second);
   }
 }
 
-TEST_F(DBTest, L0L1L2AndUpHitCounter) {
+TEST_F(DBTest, PromoteL0Failure) {
   Options options = CurrentOptions();
-  options.write_buffer_size = 32 * 1024;
-  options.target_file_size_base = 32 * 1024;
-  options.level0_file_num_compaction_trigger = 2;
-  options.level0_slowdown_writes_trigger = 2;
-  options.level0_stop_writes_trigger = 4;
-  options.max_bytes_for_level_base = 64 * 1024;
-  options.max_write_buffer_number = 2;
-  options.max_background_compactions = 8;
-  options.max_background_flushes = 8;
-  options.statistics = rocksdb::CreateDBStatistics();
-  CreateAndReopenWithCF({"mypikachu"}, options);
-
-  int numkeys = 20000;
-  for (int i = 0; i < numkeys; i++) {
-    ASSERT_OK(Put(1, Key(i), "val"));
-  }
-  ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L0));
-  ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L1));
-  ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L2_AND_UP));
+  options.disable_auto_compactions = true;
+  options.write_buffer_size = 10 * 1024 * 1024;
+  DestroyAndReopen(options);
 
-  ASSERT_OK(Flush(1));
-  dbfull()->TEST_WaitForCompact();
+  // Produce two L0 files with overlapping ranges.
+  ASSERT_OK(Put(Key(0), ""));
+  ASSERT_OK(Put(Key(3), ""));
+  ASSERT_OK(Flush());
+  ASSERT_OK(Put(Key(1), ""));
+  ASSERT_OK(Flush());
 
-  for (int i = 0; i < numkeys; i++) {
-    ASSERT_EQ(Get(1, Key(i)), "val");
-  }
+  Status status;
+  // Fails because L0 has overlapping files.
+  status = experimental::PromoteL0(db_, db_->DefaultColumnFamily());
+  ASSERT_TRUE(status.IsInvalidArgument());
 
-  ASSERT_GT(TestGetTickerCount(options, GET_HIT_L0), 100);
-  ASSERT_GT(TestGetTickerCount(options, GET_HIT_L1), 100);
-  ASSERT_GT(TestGetTickerCount(options, GET_HIT_L2_AND_UP), 100);
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  // Now there is a file in L1.
+  ASSERT_GE(NumTableFilesAtLevel(1, 0), 1);
 
-  ASSERT_EQ(numkeys, TestGetTickerCount(options, GET_HIT_L0) +
-                         TestGetTickerCount(options, GET_HIT_L1) +
-                         TestGetTickerCount(options, GET_HIT_L2_AND_UP));
+  ASSERT_OK(Put(Key(5), ""));
+  ASSERT_OK(Flush());
+  // Fails because L1 is non-empty.
+  status = experimental::PromoteL0(db_, db_->DefaultColumnFamily());
+  ASSERT_TRUE(status.IsInvalidArgument());
 }
 
-TEST_F(DBTest, EncodeDecompressedBlockSizeTest) {
-  // iter 0 -- zlib
-  // iter 1 -- bzip2
-  // iter 2 -- lz4
-  // iter 3 -- lz4HC
-  CompressionType compressions[] = {kZlibCompression, kBZip2Compression,
-                                    kLZ4Compression,  kLZ4HCCompression};
-  for (int iter = 0; iter < 4; ++iter) {
-    // first_table_version 1 -- generate with table_version == 1, read with
-    // table_version == 2
-    // first_table_version 2 -- generate with table_version == 2, read with
-    // table_version == 1
-    for (int first_table_version = 1; first_table_version <= 2;
-         ++first_table_version) {
-      BlockBasedTableOptions table_options;
-      table_options.format_version = first_table_version;
-      table_options.filter_policy.reset(NewBloomFilterPolicy(10));
-      Options options = CurrentOptions();
-      options.table_factory.reset(NewBlockBasedTableFactory(table_options));
-      options.create_if_missing = true;
-      options.compression = compressions[iter];
-      DestroyAndReopen(options);
-
-      int kNumKeysWritten = 100000;
-
-      Random rnd(301);
-      for (int i = 0; i < kNumKeysWritten; ++i) {
-        // compressible string
-        ASSERT_OK(Put(Key(i), RandomString(&rnd, 128) + std::string(128, 'a')));
-      }
+// GitHub issue #596
+TEST_F(DBTest, HugeNumberOfLevels) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 2 * 1024 * 1024;         // 2MB
+  options.max_bytes_for_level_base = 2 * 1024 * 1024;  // 2MB
+  options.num_levels = 12;
+  options.max_background_compactions = 10;
+  options.max_bytes_for_level_multiplier = 2;
+  options.level_compaction_dynamic_level_bytes = true;
+  DestroyAndReopen(options);
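+  // Exercise a deep level structure (12 levels, multiplier 2, dynamic level
+  // bytes) with a full-range compaction; see GitHub issue #596 for the
+  // original report.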
 
-      table_options.format_version = first_table_version == 1 ? 2 : 1;
-      options.table_factory.reset(NewBlockBasedTableFactory(table_options));
-      Reopen(options);
-      for (int i = 0; i < kNumKeysWritten; ++i) {
-        auto r = Get(Key(i));
-        ASSERT_EQ(r.substr(128), std::string(128, 'a'));
-      }
-    }
+  Random rnd(301);
+  for (int i = 0; i < 300000; ++i) {
+    ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024)));
   }
-}
 
-TEST_F(DBTest, MutexWaitStats) {
-  Options options = CurrentOptions();
-  options.create_if_missing = true;
-  options.statistics = rocksdb::CreateDBStatistics();
-  CreateAndReopenWithCF({"pikachu"}, options);
-  const int64_t kMutexWaitDelay = 100;
-  ThreadStatusUtil::TEST_SetStateDelay(
-      ThreadStatus::STATE_MUTEX_WAIT, kMutexWaitDelay);
-  ASSERT_OK(Put("hello", "rocksdb"));
-  ASSERT_GE(TestGetTickerCount(
-            options, DB_MUTEX_WAIT_MICROS), kMutexWaitDelay);
-  ThreadStatusUtil::TEST_SetStateDelay(
-      ThreadStatus::STATE_MUTEX_WAIT, 0);
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
 }
 
-// This reproduces a bug where we don't delete a file because when it was
-// supposed to be deleted, it was blocked by pending_outputs
-// Consider:
-// 1. current file_number is 13
-// 2. compaction (1) starts, blocks deletion of all files starting with 13
-// (pending outputs)
-// 3. file 13 is created by compaction (2)
-// 4. file 13 is consumed by compaction (3) and file 15 was created. Since file
-// 13 has no references, it is put into VersionSet::obsolete_files_
-// 5. FindObsoleteFiles() gets file 13 from VersionSet::obsolete_files_. File 13
-// is deleted from obsolete_files_ set.
-// 6. PurgeObsoleteFiles() tries to delete file 13, but this file is blocked by
-// pending outputs since compaction (1) is still running. It is not deleted and
-// it is not present in obsolete_files_ anymore. Therefore, we never delete it.
-TEST_F(DBTest, DeleteObsoleteFilesPendingOutputs) {
-  Options options = CurrentOptions();
+// GitHub issue #595
+// Large write batch with column families
+TEST_F(DBTest, LargeBatchWithColumnFamilies) {
+  Options options;
   options.env = env_;
-  options.write_buffer_size = 2 * 1024 * 1024;     // 2 MB
-  options.max_bytes_for_level_base = 1024 * 1024;  // 1 MB
-  options.level0_file_num_compaction_trigger =
-      2;  // trigger compaction when we have 2 files
-  options.max_background_flushes = 2;
-  options.max_background_compactions = 2;
-  Reopen(options);
-
-  Random rnd(301);
-  // Create two 1MB sst files
-  for (int i = 0; i < 2; ++i) {
-    // Create 1MB sst file
-    for (int j = 0; j < 100; ++j) {
-      ASSERT_OK(Put(Key(i * 50 + j), RandomString(&rnd, 10 * 1024)));
+  options = CurrentOptions(options);
+  options.write_buffer_size = 100000;  // Small write buffer
+  CreateAndReopenWithCF({"pikachu"}, options);
+  int64_t j = 0;
+  for (int i = 0; i < 5; i++) {
+    for (int pass = 1; pass <= 3; pass++) {
+      WriteBatch batch;
+      size_t write_size = 1024 * 1024 * (5 + i);
+      fprintf(stderr, "prepare: %ld MB, pass:%d\n", (write_size / 1024 / 1024),
+              pass);
+      for (;;) {
+        std::string data(3000, j++ % 127 + 20);
+        data += ToString(j);
+        batch.Put(handles_[0], Slice(data), Slice(data));
+        if (batch.GetDataSize() > write_size) {
+          break;
+        }
+      }
+      fprintf(stderr, "write: %ld MB\n", (batch.GetDataSize() / 1024 / 1024));
+      ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
+      fprintf(stderr, "done\n");
     }
-    ASSERT_OK(Flush());
   }
-  // this should execute both L0->L1 and L1->(move)->L2 compactions
-  dbfull()->TEST_WaitForCompact();
-  ASSERT_EQ("0,0,1", FilesPerLevel(0));
+  // make sure we can re-open it.
+  ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options));
+}
 
-  SleepingBackgroundTask blocking_thread;
-  port::Mutex mutex_;
-  bool already_blocked(false);
+// Make sure that Flushes can proceed in parallel with CompactRange()
+TEST_F(DBTest, FlushesInParallelWithCompactRange) {
+  // iter == 0 -- leveled
+  // iter == 1 -- leveled, but throw in a flush between two levels compacting
+  // iter == 2 -- universal
+  for (int iter = 0; iter < 3; ++iter) {
+    Options options = CurrentOptions();
+    if (iter < 2) {
+      options.compaction_style = kCompactionStyleLevel;
+    } else {
+      options.compaction_style = kCompactionStyleUniversal;
+    }
+    options.write_buffer_size = 110 << 10;
+    options.level0_file_num_compaction_trigger = 4;
+    options.num_levels = 4;
+    options.compression = kNoCompression;
+    options.max_bytes_for_level_base = 450 << 10;
+    options.target_file_size_base = 98 << 10;
+    options.max_write_buffer_number = 2;
 
-  // block the flush
-  std::function<void()> block_first_time = [&]() {
-    bool blocking = false;
-    {
-      MutexLock l(&mutex_);
-      if (!already_blocked) {
-        blocking = true;
-        already_blocked = true;
-      }
+    DestroyAndReopen(options);
+
+    Random rnd(301);
+    for (int num = 0; num < 14; num++) {
+      GenerateNewRandomFile(&rnd);
     }
-    if (blocking) {
-      blocking_thread.DoSleep();
+
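+    // Each {A, B} pair passed to LoadDependency() makes a thread reaching
+    // sync point B wait until some thread has passed sync point A.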
+    if (iter == 1) {
+      rocksdb::SyncPoint::GetInstance()->LoadDependency(
+          {{"DBImpl::RunManualCompaction()::1",
+            "DBTest::FlushesInParallelWithCompactRange:1"},
+           {"DBTest::FlushesInParallelWithCompactRange:2",
+            "DBImpl::RunManualCompaction()::2"}});
+    } else {
+      rocksdb::SyncPoint::GetInstance()->LoadDependency(
+          {{"CompactionJob::Run():Start",
+            "DBTest::FlushesInParallelWithCompactRange:1"},
+           {"DBTest::FlushesInParallelWithCompactRange:2",
+            "CompactionJob::Run():End"}});
     }
-  };
-  env_->table_write_callback_ = &block_first_time;
-  // Create 1MB sst file
-  for (int j = 0; j < 256; ++j) {
-    ASSERT_OK(Put(Key(j), RandomString(&rnd, 10 * 1024)));
-  }
-  // this should trigger a flush, which is blocked with block_first_time
-  // pending_file is protecting all the files created after
+    rocksdb::SyncPoint::GetInstance()->EnableProcessing();
 
-  ASSERT_OK(dbfull()->TEST_CompactRange(2, nullptr, nullptr));
+    std::vector<std::thread> threads;
+    threads.emplace_back([&]() { Compact("a", "z"); });
 
-  ASSERT_EQ("0,0,0,1", FilesPerLevel(0));
-  std::vector<LiveFileMetaData> metadata;
-  db_->GetLiveFilesMetaData(&metadata);
-  ASSERT_EQ(metadata.size(), 1U);
-  auto file_on_L2 = metadata[0].name;
+    TEST_SYNC_POINT("DBTest::FlushesInParallelWithCompactRange:1");
 
-  ASSERT_OK(dbfull()->TEST_CompactRange(3, nullptr, nullptr));
-  ASSERT_EQ("0,0,0,0,1", FilesPerLevel(0));
+    // this has to start a flush. if flushes are blocked, this will try to
+    // create 3 memtables, and that will fail because max_write_buffer_number
+    // is 2
+    for (int num = 0; num < 3; num++) {
+      GenerateNewRandomFile(&rnd, /* nowait */ true);
+    }
 
-  // finish the flush!
-  blocking_thread.WakeUp();
-  blocking_thread.WaitUntilDone();
-  dbfull()->TEST_WaitForFlushMemTable();
-  ASSERT_EQ("1,0,0,0,1", FilesPerLevel(0));
+    TEST_SYNC_POINT("DBTest::FlushesInParallelWithCompactRange:2");
 
-  metadata.clear();
-  db_->GetLiveFilesMetaData(&metadata);
-  ASSERT_EQ(metadata.size(), 2U);
+    for (auto& t : threads) {
+      t.join();
+    }
+    rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+  }
+}
+
+TEST_F(DBTest, DelayedWriteRate) {
+  Options options;
+  options.env = env_;
+  env_->no_sleep_ = true;
+  options = CurrentOptions(options);
+  options.write_buffer_size = 100000;  // Small write buffer
+  options.max_write_buffer_number = 256;
+  options.disable_auto_compactions = true;
+  options.level0_file_num_compaction_trigger = 3;
+  options.level0_slowdown_writes_trigger = 3;
+  options.level0_stop_writes_trigger = 999999;
+  options.delayed_write_rate = 200000;  // About 200KB/s limited rate
 
-  // This file should have been deleted
-  ASSERT_TRUE(!env_->FileExists(dbname_ + "/" + file_on_L2));
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  for (int i = 0; i < 3; i++) {
+    Put(Key(i), std::string(10000, 'x'));
+    Flush();
+  }
+
+  // These writes will be slowed down by the delayed write rate (~200KB/s)
+  size_t estimated_total_size = 0;
+  Random rnd(301);
+  for (int i = 0; i < 3000; i++) {
+    auto rand_num = rnd.Uniform(20);
+    // Spread the entry sizes over a wider range.
+    size_t entry_size = rand_num * rand_num * rand_num;
+    WriteOptions wo;
+    Put(Key(i), std::string(entry_size, 'x'), wo);
+    estimated_total_size += entry_size + 20;
+    // Occasionally sleep a while
+    if (rnd.Uniform(20) == 6) {
+      env_->SleepForMicroseconds(2666);
+    }
+  }
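+  // With no_sleep_ set, the test env here accumulates requested sleep time
+  // in addon_time_ instead of actually sleeping, so the total write stall
+  // can be checked against estimated_total_size / delayed_write_rate seconds.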
+  uint64_t estimated_sleep_time =
+      estimated_total_size / options.delayed_write_rate * 1000000U;
+  ASSERT_GT(env_->addon_time_.load(), estimated_sleep_time * 0.8);
+  ASSERT_LT(env_->addon_time_.load(), estimated_sleep_time * 1.1);
+
+  env_->no_sleep_ = false;
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
 }
 
-TEST_F(DBTest, CloseSpeedup) {
-  Options options = CurrentOptions();
-  options.compaction_style = kCompactionStyleLevel;
-  options.write_buffer_size = 100 << 10;  // 100KB
-  options.level0_file_num_compaction_trigger = 2;
-  options.num_levels = 4;
-  options.max_bytes_for_level_base = 400 * 1024;
-  options.max_write_buffer_number = 16;
+TEST_F(DBTest, HardLimit) {
+  Options options;
+  options.env = env_;
+  env_->SetBackgroundThreads(1, Env::LOW);
+  options = CurrentOptions(options);
+  options.max_write_buffer_number = 256;
+  options.write_buffer_size = 110 << 10;  // 110KB
+  options.arena_block_size = 4 * 1024;
+  options.level0_file_num_compaction_trigger = 4;
+  options.level0_slowdown_writes_trigger = 999999;
+  options.level0_stop_writes_trigger = 999999;
+  options.hard_pending_compaction_bytes_limit = 800 << 10;
+  options.max_bytes_for_level_base = 10000000000u;
+  options.max_background_compactions = 1;
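+  // With the single compaction thread blocked below, estimated pending
+  // compaction bytes grow past the 800KB hard limit, which stalls writes
+  // at the DBImpl::DelayWrite:Wait sync point.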
 
-  // Block background threads
   env_->SetBackgroundThreads(1, Env::LOW);
-  env_->SetBackgroundThreads(1, Env::HIGH);
-  SleepingBackgroundTask sleeping_task_low;
-  env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+  test::SleepingBackgroundTask sleeping_task_low;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
                  Env::Priority::LOW);
-  SleepingBackgroundTask sleeping_task_high;
-  env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_high,
-                 Env::Priority::HIGH);
 
-  std::vector<std::string> filenames;
-  env_->GetChildren(dbname_, &filenames);
-  // Delete archival files.
-  for (size_t i = 0; i < filenames.size(); ++i) {
-    env_->DeleteFile(dbname_ + "/" + filenames[i]);
-  }
-  env_->DeleteDir(dbname_);
-  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
 
+  std::atomic<int> callback_count(0);
+  rocksdb::SyncPoint::GetInstance()->SetCallBack("DBImpl::DelayWrite:Wait",
+                                                 [&](void* arg) {
+                                                   callback_count.fetch_add(1);
+                                                   sleeping_task_low.WakeUp();
+                                                 });
   rocksdb::SyncPoint::GetInstance()->EnableProcessing();
-  env_->SetBackgroundThreads(1, Env::LOW);
-  env_->SetBackgroundThreads(1, Env::HIGH);
+
   Random rnd(301);
   int key_idx = 0;
-
-  // First three 110KB files are not going to level 2
-  // After that, (100K, 200K)
   for (int num = 0; num < 5; num++) {
     GenerateNewFile(&rnd, &key_idx, true);
   }
 
-  ASSERT_EQ(0, GetSstFileCount(dbname_));
-
-  Close();
-  ASSERT_EQ(0, GetSstFileCount(dbname_));
-
-  // Unblock background threads
-  sleeping_task_high.WakeUp();
-  sleeping_task_high.WaitUntilDone();
-  sleeping_task_low.WakeUp();
-  sleeping_task_low.WaitUntilDone();
+  ASSERT_EQ(0, callback_count.load());
 
-  Destroy(options);
-}
-
-class DelayedMergeOperator : public AssociativeMergeOperator {
- private:
-  DBTest* db_test_;
-
- public:
-  explicit DelayedMergeOperator(DBTest* d) : db_test_(d) {}
-  virtual bool Merge(const Slice& key, const Slice* existing_value,
-                     const Slice& value, std::string* new_value,
-                     Logger* logger) const override {
-    db_test_->env_->addon_time_ += 1000;
-    return true;
+  for (int num = 0; num < 5; num++) {
+    GenerateNewFile(&rnd, &key_idx, true);
+    dbfull()->TEST_WaitForFlushMemTable();
   }
+  ASSERT_GE(callback_count.load(), 1);
 
-  virtual const char* Name() const override { return "DelayedMergeOperator"; }
-};
-
-TEST_F(DBTest, MergeTestTime) {
-  std::string one, two, three;
-  PutFixed64(&one, 1);
-  PutFixed64(&two, 2);
-  PutFixed64(&three, 3);
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
 
-  // Enable time profiling
-  SetPerfLevel(kEnableTime);
-  this->env_->addon_time_ = 0;
+TEST_F(DBTest, SoftLimit) {
   Options options;
+  options.env = env_;
   options = CurrentOptions(options);
-  options.statistics = rocksdb::CreateDBStatistics();
-  options.merge_operator.reset(new DelayedMergeOperator(this));
-  DestroyAndReopen(options);
+  options.write_buffer_size = 100000;  // Small write buffer
+  options.max_write_buffer_number = 256;
+  options.level0_file_num_compaction_trigger = 3;
+  options.level0_slowdown_writes_trigger = 3;
+  options.level0_stop_writes_trigger = 999999;
+  options.delayed_write_rate = 200000;  // About 200KB/s limited rate
+  options.soft_rate_limit = 1.1;
+  options.target_file_size_base = 99999999;  // All into one file
+  options.max_bytes_for_level_base = 50000;
+  options.compression = kNoCompression;
 
-  ASSERT_EQ(TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME), 0);
-  db_->Put(WriteOptions(), "foo", one);
-  ASSERT_OK(Flush());
-  ASSERT_OK(db_->Merge(WriteOptions(), "foo", two));
-  ASSERT_OK(Flush());
-  ASSERT_OK(db_->Merge(WriteOptions(), "foo", three));
-  ASSERT_OK(Flush());
+  Reopen(options);
+  Put(Key(0), "");
 
-  ReadOptions opt;
-  opt.verify_checksums = true;
-  opt.snapshot = nullptr;
-  std::string result;
-  db_->Get(opt, "foo", &result);
+  // Only allow two compactions
+  port::Mutex mut;
+  port::CondVar cv(&mut);
+  std::atomic<int> compaction_cnt(0);
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "VersionSet::LogAndApply:WriteManifest", [&](void* arg) {
+        // Three flushes and the first compaction,
+        // three flushes and the second compaction go through.
+        MutexLock l(&mut);
+        while (compaction_cnt.load() >= 8) {
+          cv.Wait();
+        }
+        compaction_cnt.fetch_add(1);
+      });
 
-  ASSERT_LT(TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME), 2800000);
-  ASSERT_GT(TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME), 1200000);
+  std::atomic<int> sleep_count(0);
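+  // sleep_count counts delayed writes: DBImpl::DelayWrite:Sleep fires each
+  // time a write is slowed down by the soft limit.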
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::DelayWrite:Sleep", [&](void* arg) { sleep_count.fetch_add(1); });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
 
-  ReadOptions read_options;
-  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
-  int count = 0;
-  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
-    ASSERT_OK(iter->status());
-    ++count;
+  for (int i = 0; i < 3; i++) {
+    Put(Key(i), std::string(5000, 'x'));
+    Put(Key(100 - i), std::string(5000, 'x'));
+    Flush();
   }
-
-  ASSERT_EQ(1, count);
-
-  ASSERT_LT(TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME), 6000000);
-  ASSERT_GT(TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME), 3200000);
-}
-
-TEST_F(DBTest, MergeCompactionTimeTest) {
-  SetPerfLevel(kEnableTime);
-  Options options;
-  options = CurrentOptions(options);
-  options.compaction_filter_factory = std::make_shared<KeepFilterFactory>();
-  options.statistics = rocksdb::CreateDBStatistics();
-  options.merge_operator.reset(new DelayedMergeOperator(this));
-  options.compaction_style = kCompactionStyleUniversal;
-  DestroyAndReopen(options);
-
-  for (int i = 0; i < 1000; i++) {
-    ASSERT_OK(db_->Merge(WriteOptions(), "foo", "TEST"));
-    ASSERT_OK(Flush());
+  while (compaction_cnt.load() < 4 || NumTableFilesAtLevel(0) > 0) {
+    env_->SleepForMicroseconds(1000);
   }
-  dbfull()->TEST_WaitForFlushMemTable();
-  dbfull()->TEST_WaitForCompact();
+  // Now there is one L1 file, but it doesn't trigger soft_rate_limit
+  ASSERT_EQ(NumTableFilesAtLevel(1), 1);
+  ASSERT_EQ(sleep_count.load(), 0);
 
-  ASSERT_NE(TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME), 0);
-}
+  for (int i = 0; i < 3; i++) {
+    Put(Key(10 + i), std::string(5000, 'x'));
+    Put(Key(90 - i), std::string(5000, 'x'));
+    Flush();
+  }
+  while (compaction_cnt.load() < 8 || NumTableFilesAtLevel(0) > 0) {
+    env_->SleepForMicroseconds(1000);
+  }
+  ASSERT_EQ(NumTableFilesAtLevel(1), 1);
+  ASSERT_EQ(sleep_count.load(), 0);
 
-TEST_F(DBTest, FilterCompactionTimeTest) {
-  Options options;
-  options.compaction_filter_factory =
-      std::make_shared<DelayFilterFactory>(this);
-  options.disable_auto_compactions = true;
-  options.create_if_missing = true;
-  options.statistics = rocksdb::CreateDBStatistics();
-  options = CurrentOptions(options);
-  DestroyAndReopen(options);
+  // Slowdown is triggered now
+  for (int i = 0; i < 10; i++) {
+    Put(Key(i), std::string(100, 'x'));
+  }
+  ASSERT_GT(sleep_count.load(), 0);
 
-  // put some data
-  for (int table = 0; table < 4; ++table) {
-    for (int i = 0; i < 10 + table; ++i) {
-      Put(ToString(table * 100 + i), "val");
-    }
-    Flush();
+  {
+    MutexLock l(&mut);
+    compaction_cnt.store(7);
+    cv.SignalAll();
+  }
+  while (NumTableFilesAtLevel(1) > 0) {
+    env_->SleepForMicroseconds(1000);
   }
 
-  ASSERT_OK(db_->CompactRange(nullptr, nullptr));
-  ASSERT_EQ(0U, CountLiveFiles());
+  // Slowdown is not triggered any more.
+  sleep_count.store(0);
+  for (int i = 0; i < 10; i++) {
+    Put(Key(i), std::string(100, 'x'));
+  }
+  ASSERT_EQ(sleep_count.load(), 0);
 
-  Reopen(options);
+  // shrink the level base so L2 will hit the soft limit more easily.
+  ASSERT_OK(dbfull()->SetOptions({
+      {"max_bytes_for_level_base", "5000"},
+  }));
+  compaction_cnt.store(7);
+  Flush();
 
-  Iterator* itr = db_->NewIterator(ReadOptions());
-  itr->SeekToFirst();
-  ASSERT_NE(TestGetTickerCount(options, FILTER_OPERATION_TOTAL_TIME), 0);
-  delete itr;
-}
+  while (NumTableFilesAtLevel(0) == 0) {
+    env_->SleepForMicroseconds(1000);
+  }
 
-TEST_F(DBTest, TestLogCleanup) {
-  Options options = CurrentOptions();
-  options.write_buffer_size = 64 * 1024;  // very small
-  // only two memtables allowed ==> only two log files
-  options.max_write_buffer_number = 2;
-  Reopen(options);
+  // Slowdown is triggered now
+  for (int i = 0; i < 10; i++) {
+    Put(Key(i), std::string(100, 'x'));
+  }
+  ASSERT_GT(sleep_count.load(), 0);
 
-  for (int i = 0; i < 100000; ++i) {
-    Put(Key(i), "val");
-    // only 2 memtables will be alive, so logs_to_free needs to always be below
-    // 2
-    ASSERT_LT(dbfull()->TEST_LogsToFreeSize(), static_cast<size_t>(3));
+  {
+    MutexLock l(&mut);
+    compaction_cnt.store(7);
+    cv.SignalAll();
+  }
+
+  while (NumTableFilesAtLevel(2) != 0) {
+    env_->SleepForMicroseconds(1000);
+  }
+
+  // Slowdown is not triggered anymore
+  sleep_count.store(0);
+  for (int i = 0; i < 10; i++) {
+    Put(Key(i), std::string(100, 'x'));
   }
+  ASSERT_EQ(sleep_count.load(), 0);
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
 }
 
-TEST_F(DBTest, EmptyCompactedDB) {
-  Options options;
-  options.max_open_files = -1;
-  options = CurrentOptions(options);
-  Close();
-  ASSERT_OK(ReadOnlyReopen(options));
-  Status s = Put("new", "value");
-  ASSERT_TRUE(s.IsNotSupported());
-  Close();
+TEST_F(DBTest, FailWhenCompressionNotSupportedTest) {
+  CompressionType compressions[] = {kZlibCompression, kBZip2Compression,
+                                    kLZ4Compression,  kLZ4HCCompression};
+  for (int iter = 0; iter < 4; ++iter) {
+    if (!CompressionTypeSupported(compressions[iter])) {
+      // not supported, we should fail the Open()
+      Options options = CurrentOptions();
+      options.compression = compressions[iter];
+      ASSERT_TRUE(!TryReopen(options).ok());
+      // Check that CreateColumnFamily also fails
+      options.compression = kNoCompression;
+      ASSERT_OK(TryReopen(options));
+      ColumnFamilyOptions cf_options(options);
+      cf_options.compression = compressions[iter];
+      ColumnFamilyHandle* handle;
+      ASSERT_TRUE(!db_->CreateColumnFamily(cf_options, "name", &handle).ok());
+    }
+  }
 }
 
-TEST_F(DBTest, CompressLevelCompaction) {
+TEST_F(DBTest, RowCache) {
   Options options = CurrentOptions();
-  options.compaction_style = kCompactionStyleLevel;
-  options.write_buffer_size = 100 << 10;  // 100KB
-  options.level0_file_num_compaction_trigger = 2;
-  options.num_levels = 4;
-  options.max_bytes_for_level_base = 400 * 1024;
-  // First two levels have no compression, so that a trivial move between
-  // them will be allowed. Level 2 has Zlib compression so that a trivial
-  // move to level 3 will not be allowed
-  options.compression_per_level = {kNoCompression, kNoCompression,
-                                   kZlibCompression};
-  int matches = 0, didnt_match = 0, trivial_move = 0, non_trivial = 0;
-
-  rocksdb::SyncPoint::GetInstance()->SetCallBack(
-      "Compaction::InputCompressionMatchesOutput:Matches",
-      [&](void* arg) { matches++; });
-  rocksdb::SyncPoint::GetInstance()->SetCallBack(
-      "Compaction::InputCompressionMatchesOutput:DidntMatch",
-      [&](void* arg) { didnt_match++; });
-  rocksdb::SyncPoint::GetInstance()->SetCallBack(
-      "DBImpl::BackgroundCompaction:NonTrivial",
-      [&](void* arg) { non_trivial++; });
-  rocksdb::SyncPoint::GetInstance()->SetCallBack(
-      "DBImpl::BackgroundCompaction:TrivialMove",
-      [&](void* arg) { trivial_move++; });
-  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+  options.statistics = rocksdb::CreateDBStatistics();
+  options.row_cache = NewLRUCache(8192);
+  DestroyAndReopen(options);
 
-  Reopen(options);
+  ASSERT_OK(Put("foo", "bar"));
+  ASSERT_OK(Flush());
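+  // The first Get() misses the row cache and populates it; the second Get()
+  // for the same key is then served from the cache.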
 
-  Random rnd(301);
-  int key_idx = 0;
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 0);
+  ASSERT_EQ(Get("foo"), "bar");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1);
+  ASSERT_EQ(Get("foo"), "bar");
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 1);
+  ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1);
+}
 
-  // First three 110KB files are going to level 0
-  // After that, (100K, 200K)
-  for (int num = 0; num < 3; num++) {
-    GenerateNewFile(&rnd, &key_idx);
-  }
+// TODO(3.13): fix the issue of Seek() + Prev() which might not necessarily
+//             return the biggest key that is smaller than the seek key.
+TEST_F(DBTest, PrevAfterMerge) {
+  Options options;
+  options.create_if_missing = true;
+  options.merge_operator = MergeOperators::CreatePutOperator();
+  DestroyAndReopen(options);
 
-  // Another 110KB triggers a compaction to 400K file to fill up level 0
-  GenerateNewFile(&rnd, &key_idx);
-  ASSERT_EQ(4, GetSstFileCount(dbname_));
+  // write three entries with different keys using Merge()
+  WriteOptions wopts;
+  db_->Merge(wopts, "1", "data1");
+  db_->Merge(wopts, "2", "data2");
+  db_->Merge(wopts, "3", "data3");
 
-  // (1, 4)
-  GenerateNewFile(&rnd, &key_idx);
-  ASSERT_EQ("1,4", FilesPerLevel(0));
+  std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
 
-  // (1, 4, 1)
-  GenerateNewFile(&rnd, &key_idx);
-  ASSERT_EQ("1,4,1", FilesPerLevel(0));
+  it->Seek("2");
+  ASSERT_TRUE(it->Valid());
+  ASSERT_EQ("2", it->key().ToString());
 
-  // (1, 4, 2)
-  GenerateNewFile(&rnd, &key_idx);
-  ASSERT_EQ("1,4,2", FilesPerLevel(0));
+  it->Prev();
+  ASSERT_TRUE(it->Valid());
+  ASSERT_EQ("1", it->key().ToString());
+}
 
-  // (1, 4, 3)
-  GenerateNewFile(&rnd, &key_idx);
-  ASSERT_EQ("1,4,3", FilesPerLevel(0));
+TEST_F(DBTest, DeletingOldWalAfterDrop) {
+  rocksdb::SyncPoint::GetInstance()->LoadDependency(
+      { { "Test:AllowFlushes", "DBImpl::BGWorkFlush" },
+        { "DBImpl::BGWorkFlush:done", "Test:WaitForFlush"} });
+  rocksdb::SyncPoint::GetInstance()->ClearTrace();
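+  // The dependencies above hold background flushes until Test:AllowFlushes
+  // fires, and block Test:WaitForFlush until a flush has run to completion.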
 
-  // (1, 4, 4)
-  GenerateNewFile(&rnd, &key_idx);
-  ASSERT_EQ("1,4,4", FilesPerLevel(0));
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+  Options options = CurrentOptions();
+  options.max_total_wal_size = 8192;
+  options.compression = kNoCompression;
+  options.write_buffer_size = 1 << 20;
+  options.level0_file_num_compaction_trigger = (1<<30);
+  options.level0_slowdown_writes_trigger = (1<<30);
+  options.level0_stop_writes_trigger = (1<<30);
+  options.disable_auto_compactions = true;
+  DestroyAndReopen(options);
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
 
-  // (1, 4, 5)
-  GenerateNewFile(&rnd, &key_idx);
-  ASSERT_EQ("1,4,5", FilesPerLevel(0));
+  CreateColumnFamilies({"cf1", "cf2"}, options);
+  ASSERT_OK(Put(0, "key1", DummyString(8192)));
+  ASSERT_OK(Put(0, "key2", DummyString(8192)));
+  // the oldest WAL should now be marked as getting_flushed
+  ASSERT_OK(db_->DropColumnFamily(handles_[0]));
+  // all flushes should now do nothing because their CF is dropped
+  TEST_SYNC_POINT("Test:AllowFlushes");
+  TEST_SYNC_POINT("Test:WaitForFlush");
+  uint64_t lognum1 = dbfull()->TEST_LogfileNumber();
+  ASSERT_OK(Put(1, "key3", DummyString(8192)));
+  ASSERT_OK(Put(1, "key4", DummyString(8192)));
+  // a new WAL should have been created
+  uint64_t lognum2 = dbfull()->TEST_LogfileNumber();
+  EXPECT_GT(lognum2, lognum1);
+}
+
+TEST_F(DBTest, RateLimitedDelete) {
+  rocksdb::SyncPoint::GetInstance()->LoadDependency({
+      {"DBTest::RateLimitedDelete:1",
+       "DeleteSchedulerImpl::BackgroundEmptyTrash"},
+  });
 
-  // (1, 4, 6)
-  GenerateNewFile(&rnd, &key_idx);
-  ASSERT_EQ("1,4,6", FilesPerLevel(0));
+  std::vector<uint64_t> penalties;
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "DeleteSchedulerImpl::BackgroundEmptyTrash:Wait",
+      [&](void* arg) { penalties.push_back(*(static_cast<int*>(arg))); });
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
 
-  // (1, 4, 7)
-  GenerateNewFile(&rnd, &key_idx);
-  ASSERT_EQ("1,4,7", FilesPerLevel(0));
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.env = env_;
 
-  // (1, 4, 8)
-  GenerateNewFile(&rnd, &key_idx);
-  ASSERT_EQ("1,4,8", FilesPerLevel(0));
+  std::string trash_dir = test::TmpDir(env_) + "/trash";
+  int64_t rate_bytes_per_sec = 1024 * 10;  // 10 KB/s
+  Status s;
+  options.delete_scheduler.reset(NewDeleteScheduler(
+      env_, trash_dir, rate_bytes_per_sec, nullptr, false, &s));
+  ASSERT_OK(s);
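+  // The delete scheduler moves obsolete SST files into trash_dir, where a
+  // background thread deletes them at rate_bytes_per_sec.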
 
-  ASSERT_EQ(matches, 12);
-  // Currently, the test relies on the number of calls to
-  // InputCompressionMatchesOutput() per compaction.
-  const int kCallsToInputCompressionMatch = 2;
-  ASSERT_EQ(didnt_match, 8 * kCallsToInputCompressionMatch);
-  ASSERT_EQ(trivial_move, 12);
-  ASSERT_EQ(non_trivial, 8);
+  Destroy(last_options_);
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_OK(TryReopen(options));
+  // Create 4 files in L0
+  for (char v = 'a'; v <= 'd'; v++) {
+    ASSERT_OK(Put("Key2", DummyString(1024, v)));
+    ASSERT_OK(Put("Key3", DummyString(1024, v)));
+    ASSERT_OK(Put("Key4", DummyString(1024, v)));
+    ASSERT_OK(Put("Key1", DummyString(1024, v)));
+    ASSERT_OK(Put("Key4", DummyString(1024, v)));
+    ASSERT_OK(Flush());
+  }
+  // We created 4 sst files in L0
+  ASSERT_EQ("4", FilesPerLevel(0));
 
-  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+  std::vector<LiveFileMetaData> metadata;
+  db_->GetLiveFilesMetaData(&metadata);
 
-  for (int i = 0; i < key_idx; i++) {
-    auto v = Get(Key(i));
-    ASSERT_NE(v, "NOT_FOUND");
-    ASSERT_TRUE(v.size() == 1 || v.size() == 10000);
-  }
+  // Compaction will move the 4 files in L0 to trash and create 1 L1 file
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("0,1", FilesPerLevel(0));
 
-  Reopen(options);
+  uint64_t delete_start_time = env_->NowMicros();
+  // Hold BackgroundEmptyTrash
+  TEST_SYNC_POINT("DBTest::RateLimitedDelete:1");
+  options.delete_scheduler->WaitForEmptyTrash();
+  uint64_t time_spent_deleting = env_->NowMicros() - delete_start_time;
 
-  for (int i = 0; i < key_idx; i++) {
-    auto v = Get(Key(i));
-    ASSERT_NE(v, "NOT_FOUND");
-    ASSERT_TRUE(v.size() == 1 || v.size() == 10000);
+  uint64_t total_files_size = 0;
+  uint64_t expected_penalty = 0;
+  ASSERT_EQ(penalties.size(), metadata.size());
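+  // Each trash file adds size * 1e6 / rate_bytes_per_sec microseconds to the
+  // cumulative penalty reported by the scheduler.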
+  for (size_t i = 0; i < metadata.size(); i++) {
+    total_files_size += metadata[i].size;
+    expected_penalty = ((total_files_size * 1000000) / rate_bytes_per_sec);
+    ASSERT_EQ(expected_penalty, penalties[i]);
   }
+  ASSERT_GT(time_spent_deleting, expected_penalty * 0.9);
 
-  Destroy(options);
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
 }
 
-TEST_F(DBTest, SuggestCompactRangeTest) {
-  class CompactionFilterFactoryGetContext : public CompactionFilterFactory {
-   public:
-    virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
-        const CompactionFilter::Context& context) override {
-      saved_context = context;
-      std::unique_ptr<CompactionFilter> empty_filter;
-      return empty_filter;
-    }
-    const char* Name() const override {
-      return "CompactionFilterFactoryGetContext";
-    }
-    static bool IsManual(CompactionFilterFactory* compaction_filter_factory) {
-      return reinterpret_cast<CompactionFilterFactoryGetContext*>(
-          compaction_filter_factory)->saved_context.is_manual_compaction;
-    }
-    CompactionFilter::Context saved_context;
-  };
+// Create a DB with 2 db_paths, and generate multiple files in the 2
+// db_paths using CompactRangeOptions. Make sure that files deleted from the
+// first db_path went through the DeleteScheduler and that files in the
+// second path did not.
+TEST_F(DBTest, DeleteSchedulerMultipleDBPaths) {
+  int bg_delete_file = 0;
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "DeleteSchedulerImpl::DeleteTrashFile:DeleteFile",
+      [&](void* arg) { bg_delete_file++; });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
 
   Options options = CurrentOptions();
-  options.compaction_style = kCompactionStyleLevel;
-  options.compaction_filter_factory.reset(
-      new CompactionFilterFactoryGetContext());
-  options.write_buffer_size = 110 << 10;
-  options.level0_file_num_compaction_trigger = 4;
-  options.num_levels = 4;
-  options.compression = kNoCompression;
-  options.max_bytes_for_level_base = 450 << 10;
-  options.target_file_size_base = 98 << 10;
-  options.max_grandparent_overlap_factor = 1 << 20;  // inf
+  options.disable_auto_compactions = true;
+  options.db_paths.emplace_back(dbname_, 1024 * 100);
+  options.db_paths.emplace_back(dbname_ + "_2", 1024 * 100);
+  options.env = env_;
 
-  Reopen(options);
+  std::string trash_dir = test::TmpDir(env_) + "/trash";
+  int64_t rate_bytes_per_sec = 1024 * 1024;  // 1 MB/s
+  Status s;
+  options.delete_scheduler.reset(NewDeleteScheduler(
+      env_, trash_dir, rate_bytes_per_sec, nullptr, false, &s));
+  ASSERT_OK(s);
 
-  Random rnd(301);
+  DestroyAndReopen(options);
 
-  for (int num = 0; num < 3; num++) {
-    GenerateNewRandomFile(&rnd);
+  // Create 4 files in L0
+  for (int i = 0; i < 4; i++) {
+    ASSERT_OK(Put("Key" + ToString(i), DummyString(1024, 'A')));
+    ASSERT_OK(Flush());
   }
+  // We created 4 sst files in L0
+  ASSERT_EQ("4", FilesPerLevel(0));
+  // Compaction will delete files from L0 in the first db path and generate a
+  // new file in L1 in the second db path
+  CompactRangeOptions compact_options;
+  compact_options.target_path_id = 1;
+  Slice begin("Key0");
+  Slice end("Key3");
+  ASSERT_OK(db_->CompactRange(compact_options, &begin, &end));
+  ASSERT_EQ("0,1", FilesPerLevel(0));
 
-  GenerateNewRandomFile(&rnd);
-  ASSERT_EQ("0,4", FilesPerLevel(0));
-  ASSERT_TRUE(!CompactionFilterFactoryGetContext::IsManual(
-                   options.compaction_filter_factory.get()));
+  // Create 4 files in L0
+  for (int i = 4; i < 8; i++) {
+    ASSERT_OK(Put("Key" + ToString(i), DummyString(1024, 'B')));
+    ASSERT_OK(Flush());
+  }
+  ASSERT_EQ("4,1", FilesPerLevel(0));
 
-  GenerateNewRandomFile(&rnd);
-  ASSERT_EQ("1,4", FilesPerLevel(0));
+  // Compaction will delete files from L0 in the first db path and generate a
+  // new file in L1 in the second db path
+  begin = "Key4";
+  end  = "Key7";
+  ASSERT_OK(db_->CompactRange(compact_options, &begin, &end));
+  ASSERT_EQ("0,2", FilesPerLevel(0));
 
-  GenerateNewRandomFile(&rnd);
-  ASSERT_EQ("2,4", FilesPerLevel(0));
+  options.delete_scheduler->WaitForEmptyTrash();
+  ASSERT_EQ(bg_delete_file, 8);
 
-  GenerateNewRandomFile(&rnd);
-  ASSERT_EQ("3,4", FilesPerLevel(0));
+  compact_options.bottommost_level_compaction =
+      BottommostLevelCompaction::kForce;
+  ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+  ASSERT_EQ("0,1", FilesPerLevel(0));
 
-  GenerateNewRandomFile(&rnd);
-  ASSERT_EQ("0,4,4", FilesPerLevel(0));
+  options.delete_scheduler->WaitForEmptyTrash();
+  ASSERT_EQ(bg_delete_file, 8);
 
-  GenerateNewRandomFile(&rnd);
-  ASSERT_EQ("1,4,4", FilesPerLevel(0));
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
 
-  GenerateNewRandomFile(&rnd);
-  ASSERT_EQ("2,4,4", FilesPerLevel(0));
+TEST_F(DBTest, DestroyDBWithRateLimitedDelete) {
+  int bg_delete_file = 0;
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "DeleteSchedulerImpl::DeleteTrashFile:DeleteFile",
+      [&](void* arg) { bg_delete_file++; });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
 
-  GenerateNewRandomFile(&rnd);
-  ASSERT_EQ("3,4,4", FilesPerLevel(0));
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  options.env = env_;
+  DestroyAndReopen(options);
 
-  GenerateNewRandomFile(&rnd);
-  ASSERT_EQ("0,4,8", FilesPerLevel(0));
+  // Create 4 files in L0
+  for (int i = 0; i < 4; i++) {
+    ASSERT_OK(Put("Key" + ToString(i), DummyString(1024, 'A')));
+    ASSERT_OK(Flush());
+  }
+  // We created 4 sst files in L0
+  ASSERT_EQ("4", FilesPerLevel(0));
 
-  GenerateNewRandomFile(&rnd);
-  ASSERT_EQ("1,4,8", FilesPerLevel(0));
+  // Close the DB and destroy it using the DeleteScheduler
+  Close();
+  std::string trash_dir = test::TmpDir(env_) + "/trash";
+  int64_t rate_bytes_per_sec = 1024 * 1024;  // 1 MB/s
+  Status s;
+  options.delete_scheduler.reset(NewDeleteScheduler(
+      env_, trash_dir, rate_bytes_per_sec, nullptr, false, &s));
+  ASSERT_OK(s);
+  ASSERT_OK(DestroyDB(dbname_, options));
 
-  // compact it three times
-  for (int i = 0; i < 3; ++i) {
-    ASSERT_OK(experimental::SuggestCompactRange(db_, nullptr, nullptr));
-    dbfull()->TEST_WaitForCompact();
-  }
+  options.delete_scheduler->WaitForEmptyTrash();
+  // The 4 sst files should have been deleted through the delete_scheduler
+  ASSERT_EQ(bg_delete_file, 4);
+}
 
-  ASSERT_EQ("0,0,13", FilesPerLevel(0));
+TEST_F(DBTest, UnsupportedManualSync) {
+  DestroyAndReopen(CurrentOptions());
+  env_->is_wal_sync_thread_safe_.store(false);
+  Status s = db_->SyncWAL();
+  ASSERT_TRUE(s.IsNotSupported());
+}
 
-  GenerateNewRandomFile(&rnd);
-  ASSERT_EQ("1,0,13", FilesPerLevel(0));
+TEST_F(DBTest, OpenDBWithInfiniteMaxOpenFiles) {
+  // Open DB with infinite max open files
+  //  - First iteration use 1 thread to open files
+  //  - Second iteration use 5 threads to open files
+  for (int iter = 0; iter < 2; iter++) {
+    Options options;
+    options.create_if_missing = true;
+    options.write_buffer_size = 100000;
+    options.disable_auto_compactions = true;
+    options.max_open_files = -1;
+    if (iter == 0) {
+      options.max_file_opening_threads = 1;
+    } else {
+      options.max_file_opening_threads = 5;
+    }
+    options = CurrentOptions(options);
+    DestroyAndReopen(options);
 
-  // nonoverlapping with the file on level 0
-  Slice start("a"), end("b");
-  ASSERT_OK(experimental::SuggestCompactRange(db_, &start, &end));
-  dbfull()->TEST_WaitForCompact();
+    // Create 12 files in L0 (then move them to L2)
+    for (int i = 0; i < 12; i++) {
+      std::string k = "L2_" + Key(i);
+      ASSERT_OK(Put(k, k + std::string(1000, 'a')));
+      ASSERT_OK(Flush());
+    }
+    CompactRangeOptions compact_options;
+    compact_options.change_level = true;
+    compact_options.target_level = 2;
+    db_->CompactRange(compact_options, nullptr, nullptr);
 
-  // should not compact the level 0 file
-  ASSERT_EQ("1,0,13", FilesPerLevel(0));
+    // Create 12 files in L0
+    for (int i = 0; i < 12; i++) {
+      std::string k = "L0_" + Key(i);
+      ASSERT_OK(Put(k, k + std::string(1000, 'a')));
+      ASSERT_OK(Flush());
+    }
+    Close();
 
-  start = Slice("j");
-  end = Slice("m");
-  ASSERT_OK(experimental::SuggestCompactRange(db_, &start, &end));
-  dbfull()->TEST_WaitForCompact();
-  ASSERT_TRUE(CompactionFilterFactoryGetContext::IsManual(
-      options.compaction_filter_factory.get()));
+    // Reopening the DB will load all existing files
+    Reopen(options);
+    ASSERT_EQ("12,0,12", FilesPerLevel(0));
+    std::vector<std::vector<FileMetaData>> files;
+    dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files);
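+    // With max_open_files == -1 all table files are opened when the DB is
+    // opened, so every file should already hold a table reader handle.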
 
-  // now it should compact the level 0 file
-  ASSERT_EQ("0,1,13", FilesPerLevel(0));
+    for (const auto& level : files) {
+      for (const auto& file : level) {
+        ASSERT_TRUE(file.table_reader_handle != nullptr);
+      }
+    }
+
+    for (int i = 0; i < 12; i++) {
+      ASSERT_EQ(Get("L0_" + Key(i)), "L0_" + Key(i) + std::string(1000, 'a'));
+      ASSERT_EQ(Get("L2_" + Key(i)), "L2_" + Key(i) + std::string(1000, 'a'));
+    }
+  }
 }
 
-TEST_F(DBTest, PromoteL0) {
+TEST_F(DBTest, GetTotalSstFilesSize) {
   Options options = CurrentOptions();
   options.disable_auto_compactions = true;
-  options.write_buffer_size = 10 * 1024 * 1024;
+  options.compression = kNoCompression;
   DestroyAndReopen(options);
+  // Generate 5 files in L0
+  for (int i = 0; i < 5; i++) {
+    for (int j = 0; j < 10; j++) {
+      std::string val = "val_file_" + ToString(i);
+      ASSERT_OK(Put(Key(j), val));
+    }
+    Flush();
+  }
+  ASSERT_EQ("5", FilesPerLevel(0));
 
-  // non overlapping ranges
-  std::vector<std::pair<int32_t, int32_t>> ranges = {
-      {81, 160}, {0, 80}, {161, 240}, {241, 320}};
-
-  int32_t value_size = 10 * 1024;  // 10 KB
+  std::vector<LiveFileMetaData> live_files_meta;
+  dbfull()->GetLiveFilesMetaData(&live_files_meta);
+  ASSERT_EQ(live_files_meta.size(), 5);
+  uint64_t single_file_size = live_files_meta[0].size;
 
-  Random rnd(301);
-  std::map<int32_t, std::string> values;
-  for (const auto& range : ranges) {
-    for (int32_t j = range.first; j < range.second; j++) {
-      values[j] = RandomString(&rnd, value_size);
-      ASSERT_OK(Put(Key(j), values[j]));
-    }
-    ASSERT_OK(Flush());
+  uint64_t live_sst_files_size = 0;
+  uint64_t total_sst_files_size = 0;
+  for (const auto& file_meta : live_files_meta) {
+    live_sst_files_size += file_meta.size;
   }
 
-  int32_t level0_files = NumTableFilesAtLevel(0, 0);
-  ASSERT_EQ(level0_files, ranges.size());
-  ASSERT_EQ(NumTableFilesAtLevel(1, 0), 0);  // No files in L1
+  ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+                                       &total_sst_files_size));
+  // Live SST files = 5
+  // Total SST files = 5
+  ASSERT_EQ(live_sst_files_size, 5 * single_file_size);
+  ASSERT_EQ(total_sst_files_size, 5 * single_file_size);
 
-  // Promote L0 level to L2.
-  ASSERT_OK(experimental::PromoteL0(db_, db_->DefaultColumnFamily(), 2));
-  // We expect that all the files were trivially moved from L0 to L2
-  ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0);
-  ASSERT_EQ(NumTableFilesAtLevel(2, 0), level0_files);
+  // hold current version
+  std::unique_ptr<Iterator> iter1(dbfull()->NewIterator(ReadOptions()));
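+  // A live iterator pins the current version, so SST files that are later
+  // compacted away keep counting toward "rocksdb.total-sst-files-size"
+  // until the iterator is released.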
 
-  for (const auto& kv : values) {
-    ASSERT_EQ(Get(Key(kv.first)), kv.second);
+  // Compact the 5 files into 1 file in L1
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("0,1", FilesPerLevel(0));
+
+  live_files_meta.clear();
+  dbfull()->GetLiveFilesMetaData(&live_files_meta);
+  ASSERT_EQ(live_files_meta.size(), 1);
+
+  live_sst_files_size = 0;
+  total_sst_files_size = 0;
+  for (const auto& file_meta : live_files_meta) {
+    live_sst_files_size += file_meta.size;
   }
-}
+  ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+                                       &total_sst_files_size));
+  // Live SST files = 1 (compacted file)
+  // Total SST files = 6 (5 original files + compacted file)
+  ASSERT_EQ(live_sst_files_size, 1 * single_file_size);
+  ASSERT_EQ(total_sst_files_size, 6 * single_file_size);
 
-TEST_F(DBTest, PromoteL0Failure) {
+  // hold current version
+  std::unique_ptr<Iterator> iter2(dbfull()->NewIterator(ReadOptions()));
+
+  // Delete all keys and compact, this will delete all live files
+  for (int i = 0; i < 10; i++) {
+    ASSERT_OK(Delete(Key(i)));
+  }
+  Flush();
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("", FilesPerLevel(0));
+
+  live_files_meta.clear();
+  dbfull()->GetLiveFilesMetaData(&live_files_meta);
+  ASSERT_EQ(live_files_meta.size(), 0);
+
+  ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+                                       &total_sst_files_size));
+  // Live SST files = 0
+  // Total SST files = 6 (5 original files + compacted file)
+  ASSERT_EQ(total_sst_files_size, 6 * single_file_size);
+
+  iter1.reset();
+  ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+                                       &total_sst_files_size));
+  // Live SST files = 0
+  // Total SST files = 1 (compacted file)
+  ASSERT_EQ(total_sst_files_size, 1 * single_file_size);
+
+  iter2.reset();
+  ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+                                       &total_sst_files_size));
+  // Live SST files = 0
+  // Total SST files = 0
+  ASSERT_EQ(total_sst_files_size, 0);
+}
+
+TEST_F(DBTest, GetTotalSstFilesSizeVersionsFilesShared) {
   Options options = CurrentOptions();
   options.disable_auto_compactions = true;
-  options.write_buffer_size = 10 * 1024 * 1024;
+  options.compression = kNoCompression;
   DestroyAndReopen(options);
+  // Generate 5 files in L0
+  for (int i = 0; i < 5; i++) {
+    ASSERT_OK(Put(Key(i), "val"));
+    Flush();
+  }
+  ASSERT_EQ("5", FilesPerLevel(0));
 
-  // Produce two L0 files with overlapping ranges.
-  ASSERT_OK(Put(Key(0), ""));
-  ASSERT_OK(Put(Key(3), ""));
-  ASSERT_OK(Flush());
-  ASSERT_OK(Put(Key(1), ""));
-  ASSERT_OK(Flush());
+  std::vector<LiveFileMetaData> live_files_meta;
+  dbfull()->GetLiveFilesMetaData(&live_files_meta);
+  ASSERT_EQ(live_files_meta.size(), 5);
+  uint64_t single_file_size = live_files_meta[0].size;
 
-  Status status;
-  // Fails because L0 has overlapping files.
-  status = experimental::PromoteL0(db_, db_->DefaultColumnFamily());
-  ASSERT_TRUE(status.IsInvalidArgument());
+  uint64_t live_sst_files_size = 0;
+  uint64_t total_sst_files_size = 0;
+  for (const auto& file_meta : live_files_meta) {
+    live_sst_files_size += file_meta.size;
+  }
 
-  ASSERT_OK(db_->CompactRange(nullptr, nullptr));
-  // Now there is a file in L1.
-  ASSERT_GE(NumTableFilesAtLevel(1, 0), 1);
+  ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+                                       &total_sst_files_size));
 
-  ASSERT_OK(Put(Key(5), ""));
-  ASSERT_OK(Flush());
-  // Fails because L1 is non-empty.
-  status = experimental::PromoteL0(db_, db_->DefaultColumnFamily());
-  ASSERT_TRUE(status.IsInvalidArgument());
-}
+  // Live SST files = 5
+  // Total SST files = 5
+  ASSERT_EQ(live_sst_files_size, 5 * single_file_size);
+  ASSERT_EQ(total_sst_files_size, 5 * single_file_size);
 
-// Github issue #596
-TEST_F(DBTest, HugeNumberOfLevels) {
-  Options options = CurrentOptions();
-  options.write_buffer_size = 2 * 1024 * 1024;         // 2MB
-  options.max_bytes_for_level_base = 2 * 1024 * 1024;  // 2MB
-  options.num_levels = 12;
-  options.max_background_compactions = 10;
-  options.max_bytes_for_level_multiplier = 2;
-  options.level_compaction_dynamic_level_bytes = true;
-  DestroyAndReopen(options);
+  // hold current version
+  std::unique_ptr<Iterator> iter1(dbfull()->NewIterator(ReadOptions()));
 
-  Random rnd(301);
-  for (int i = 0; i < 300000; ++i) {
-    ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024)));
+  // Compaction will do trivial move from L0 to L1
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("0,5", FilesPerLevel(0));
+
+  live_files_meta.clear();
+  dbfull()->GetLiveFilesMetaData(&live_files_meta);
+  ASSERT_EQ(live_files_meta.size(), 5);
+
+  live_sst_files_size = 0;
+  total_sst_files_size = 0;
+  for (const auto& file_meta : live_files_meta) {
+    live_sst_files_size += file_meta.size;
   }
+  ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+                                       &total_sst_files_size));
+  // Live SST files = 5
+  // Total SST files = 5 (used in 2 versions)
+  ASSERT_EQ(live_sst_files_size, 5 * single_file_size);
+  ASSERT_EQ(total_sst_files_size, 5 * single_file_size);
 
-  ASSERT_OK(db_->CompactRange(nullptr, nullptr));
-}
+  // hold current version
+  std::unique_ptr<Iterator> iter2(dbfull()->NewIterator(ReadOptions()));
 
-// Github issue #595
-// Large write batch with column families
-TEST_F(DBTest, LargeBatchWithColumnFamilies) {
-  Options options;
-  options.env = env_;
-  options = CurrentOptions(options);
-  options.write_buffer_size = 100000;  // Small write buffer
-  CreateAndReopenWithCF({"pikachu"}, options);
-  int64_t j = 0;
+  // Delete all keys and compact, this will delete all live files
   for (int i = 0; i < 5; i++) {
-    for (int pass = 1; pass <= 3; pass++) {
-      WriteBatch batch;
-      size_t write_size = 1024 * 1024 * (5 + i);
-      fprintf(stderr, "prepare: %ld MB, pass:%d\n", (write_size / 1024 / 1024),
-              pass);
-      for (;;) {
-        std::string data(3000, j++ % 127 + 20);
-        data += std::to_string(j);
-        batch.Put(handles_[0], Slice(data), Slice(data));
-        if (batch.GetDataSize() > write_size) {
-          break;
-        }
-      }
-      fprintf(stderr, "write: %ld MB\n", (batch.GetDataSize() / 1024 / 1024));
-      ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
-      fprintf(stderr, "done\n");
-    }
+    ASSERT_OK(Delete(Key(i)));
   }
-  // make sure we can re-open it.
-  ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options));
+  Flush();
+  ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  ASSERT_EQ("", FilesPerLevel(0));
+
+  live_files_meta.clear();
+  dbfull()->GetLiveFilesMetaData(&live_files_meta);
+  ASSERT_EQ(live_files_meta.size(), 0);
+
+  ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+                                       &total_sst_files_size));
+  // Live SST files = 0
+  // Total SST files = 5 (used in 2 versions)
+  ASSERT_EQ(total_sst_files_size, 5 * single_file_size);
+
+  iter1.reset();
+  iter2.reset();
+
+  ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+                                       &total_sst_files_size));
+  // Live SST files = 0
+  // Total SST files = 0
+  ASSERT_EQ(total_sst_files_size, 0);
 }
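+
+// NOTE: illustrative sketch (not upstream code) of reading the property the
+// test above exercises; `db` stands for any open DB handle:
+//
+//   uint64_t total = 0;
+//   if (db->GetIntProperty("rocksdb.total-sst-files-size", &total)) {
+//     // `total` counts every SST referenced by any live version, so it can
+//     // stay above the live-files size while old iterators pin versions.
+//   }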
 
-// Make sure that Flushes can proceed in parallel with CompactRange()
-TEST_F(DBTest, FlushesInParallelWithCompactRange) {
-  // iter == 0 -- leveled
-  // iter == 1 -- leveled, but throw in a flush between two levels compacting
-  // iter == 2 -- universal
-  for (int iter = 0; iter < 3; ++iter) {
+TEST_F(DBTest, AddExternalSstFile) {
+  do {
+    std::string sst_files_folder = test::TmpDir(env_) + "/sst_files/";
+    env_->CreateDir(sst_files_folder);
     Options options = CurrentOptions();
-    if (iter < 2) {
-      options.compaction_style = kCompactionStyleLevel;
-    } else {
-      options.compaction_style = kCompactionStyleUniversal;
-    }
-    options.write_buffer_size = 110 << 10;
-    options.level0_file_num_compaction_trigger = 4;
-    options.num_levels = 4;
-    options.compression = kNoCompression;
-    options.max_bytes_for_level_base = 450 << 10;
-    options.target_file_size_base = 98 << 10;
-    options.max_write_buffer_number = 2;
+    options.env = env_;
+    const ImmutableCFOptions ioptions(options);
+
+    SstFileWriter sst_file_writer(EnvOptions(), ioptions, options.comparator);
+
+    // file1.sst (0 => 99)
+    std::string file1 = sst_files_folder + "file1.sst";
+    ASSERT_OK(sst_file_writer.Open(file1));
+    for (int k = 0; k < 100; k++) {
+      ASSERT_OK(sst_file_writer.Add(Key(k), Key(k) + "_val"));
+    }
+    ExternalSstFileInfo file1_info;
+    Status s = sst_file_writer.Finish(&file1_info);
+    ASSERT_TRUE(s.ok()) << s.ToString();
+    ASSERT_EQ(file1_info.file_path, file1);
+    ASSERT_EQ(file1_info.num_entries, 100);
+    ASSERT_EQ(file1_info.smallest_key, Key(0));
+    ASSERT_EQ(file1_info.largest_key, Key(99));
+    // sst_file_writer has already finished; we cannot add this value
+    s = sst_file_writer.Add(Key(100), "bad_val");
+    ASSERT_FALSE(s.ok()) << s.ToString();
+
+    // file2.sst (100 => 199)
+    std::string file2 = sst_files_folder + "file2.sst";
+    ASSERT_OK(sst_file_writer.Open(file2));
+    for (int k = 100; k < 200; k++) {
+      ASSERT_OK(sst_file_writer.Add(Key(k), Key(k) + "_val"));
+    }
+    // Cannot add this key because it is not after the last added key
+    s = sst_file_writer.Add(Key(99), "bad_val");
+    ASSERT_FALSE(s.ok()) << s.ToString();
+    ExternalSstFileInfo file2_info;
+    s = sst_file_writer.Finish(&file2_info);
+    ASSERT_TRUE(s.ok()) << s.ToString();
+    ASSERT_EQ(file2_info.file_path, file2);
+    ASSERT_EQ(file2_info.num_entries, 100);
+    ASSERT_EQ(file2_info.smallest_key, Key(100));
+    ASSERT_EQ(file2_info.largest_key, Key(199));
+
+    // file3.sst (195 => 299)
+    // This file values overlap with file2 values
+    std::string file3 = sst_files_folder + "file3.sst";
+    ASSERT_OK(sst_file_writer.Open(file3));
+    for (int k = 195; k < 300; k++) {
+      ASSERT_OK(sst_file_writer.Add(Key(k), Key(k) + "_val_overlap"));
+    }
+    ExternalSstFileInfo file3_info;
+    s = sst_file_writer.Finish(&file3_info);
+    ASSERT_TRUE(s.ok()) << s.ToString();
+    ASSERT_EQ(file3_info.file_path, file3);
+    ASSERT_EQ(file3_info.num_entries, 105);
+    ASSERT_EQ(file3_info.smallest_key, Key(195));
+    ASSERT_EQ(file3_info.largest_key, Key(299));
+
+    // file4.sst (30 => 39)
+    // This file values overlap with file1 values
+    std::string file4 = sst_files_folder + "file4.sst";
+    ASSERT_OK(sst_file_writer.Open(file4));
+    for (int k = 30; k < 40; k++) {
+      ASSERT_OK(sst_file_writer.Add(Key(k), Key(k) + "_val_overlap"));
+    }
+    ExternalSstFileInfo file4_info;
+    s = sst_file_writer.Finish(&file4_info);
+    ASSERT_TRUE(s.ok()) << s.ToString();
+    ASSERT_EQ(file4_info.file_path, file4);
+    ASSERT_EQ(file4_info.num_entries, 10);
+    ASSERT_EQ(file4_info.smallest_key, Key(30));
+    ASSERT_EQ(file4_info.largest_key, Key(39));
+
+    // file5.sst (400 => 499)
+    std::string file5 = sst_files_folder + "file5.sst";
+    ASSERT_OK(sst_file_writer.Open(file5));
+    for (int k = 400; k < 500; k++) {
+      ASSERT_OK(sst_file_writer.Add(Key(k), Key(k) + "_val"));
+    }
+    ExternalSstFileInfo file5_info;
+    s = sst_file_writer.Finish(&file5_info);
+    ASSERT_TRUE(s.ok()) << s.ToString();
+    ASSERT_EQ(file5_info.file_path, file5);
+    ASSERT_EQ(file5_info.num_entries, 100);
+    ASSERT_EQ(file5_info.smallest_key, Key(400));
+    ASSERT_EQ(file5_info.largest_key, Key(499));
 
     DestroyAndReopen(options);
+    // Add file using file path
+    s = db_->AddFile(file1);
+    ASSERT_TRUE(s.ok()) << s.ToString();
+    ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U);
+    for (int k = 0; k < 100; k++) {
+      ASSERT_EQ(Get(Key(k)), Key(k) + "_val");
+    }
 
-    Random rnd(301);
-    for (int num = 0; num < 14; num++) {
-      GenerateNewRandomFile(&rnd);
+    // Add file using file info
+    s = db_->AddFile(&file2_info);
+    ASSERT_TRUE(s.ok()) << s.ToString();
+    ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U);
+    for (int k = 0; k < 200; k++) {
+      ASSERT_EQ(Get(Key(k)), Key(k) + "_val");
     }
 
-    if (iter == 1) {
-    rocksdb::SyncPoint::GetInstance()->LoadDependency(
-        {{"DBImpl::RunManualCompaction()::1",
-          "DBTest::FlushesInParallelWithCompactRange:1"},
-         {"DBTest::FlushesInParallelWithCompactRange:2",
-          "DBImpl::RunManualCompaction()::2"}});
-    } else {
-      rocksdb::SyncPoint::GetInstance()->LoadDependency(
-          {{"CompactionJob::Run():Start",
-            "DBTest::FlushesInParallelWithCompactRange:1"},
-           {"DBTest::FlushesInParallelWithCompactRange:2",
-            "CompactionJob::Run():End"}});
+    // This file has overlapping values with the existing data
+    s = db_->AddFile(file3);
+    ASSERT_FALSE(s.ok()) << s.ToString();
+
+    // This file has overlapping values with the existing data
+    s = db_->AddFile(&file4_info);
+    ASSERT_FALSE(s.ok()) << s.ToString();
+
+    // Overwrite values of keys divisible by 5
+    for (int k = 0; k < 200; k += 5) {
+      ASSERT_OK(Put(Key(k), Key(k) + "_val_new"));
     }
-    rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+    ASSERT_NE(db_->GetLatestSequenceNumber(), 0U);
 
-    std::vector<std::thread> threads;
-    threads.emplace_back([&]() { Compact("a", "z"); });
+    // DB has values in the memtable now, so we cannot add files anymore
+    s = db_->AddFile(file5);
+    ASSERT_FALSE(s.ok()) << s.ToString();
 
-    TEST_SYNC_POINT("DBTest::FlushesInParallelWithCompactRange:1");
+    // Make sure values are correct before and after flush/compaction
+    for (int i = 0; i < 2; i++) {
+      for (int k = 0; k < 200; k++) {
+        std::string value = Key(k) + "_val";
+        if (k % 5 == 0) {
+          value += "_new";
+        }
+        ASSERT_EQ(Get(Key(k)), value);
+      }
+      ASSERT_OK(Flush());
+      ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+    }
 
-    // this has to start a flush. if flushes are blocked, this will try to
-    // create
-    // 3 memtables, and that will fail because max_write_buffer_number is 2
-    for (int num = 0; num < 3; num++) {
-      GenerateNewRandomFile(&rnd, /* nowait */ true);
+    // DB sequence number is not zero; we cannot add files anymore
+    s = db_->AddFile(file5);
+    ASSERT_FALSE(s.ok()) << s.ToString();
+  } while (ChangeOptions(kSkipPlainTable | kSkipUniversalCompaction |
+                         kSkipFIFOCompaction));
+}
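+
+// NOTE: illustrative sketch (not upstream code) of the bulk-load flow the
+// test above exercises; `db` and `options` are assumed to exist:
+//
+//   SstFileWriter writer(EnvOptions(), ImmutableCFOptions(options),
+//                        options.comparator);
+//   writer.Open("/tmp/bulk.sst");
+//   writer.Add("k1", "v1");          // keys must be added in sorted order
+//   ExternalSstFileInfo info;
+//   writer.Finish(&info);
+//   db->AddFile(&info);              // fails on overlap with existing data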
+
+TEST_F(DBTest, AddExternalSstFileNoCopy) {
+  std::string sst_files_folder = test::TmpDir(env_) + "/sst_files/";
+  env_->CreateDir(sst_files_folder);
+  Options options = CurrentOptions();
+  options.env = env_;
+  const ImmutableCFOptions ioptions(options);
+
+  SstFileWriter sst_file_writer(EnvOptions(), ioptions, options.comparator);
+
+  // file1.sst (0 => 99)
+  std::string file1 = sst_files_folder + "file1.sst";
+  ASSERT_OK(sst_file_writer.Open(file1));
+  for (int k = 0; k < 100; k++) {
+    ASSERT_OK(sst_file_writer.Add(Key(k), Key(k) + "_val"));
+  }
+  ExternalSstFileInfo file1_info;
+  Status s = sst_file_writer.Finish(&file1_info);
+  ASSERT_TRUE(s.ok()) << s.ToString();
+  ASSERT_EQ(file1_info.file_path, file1);
+  ASSERT_EQ(file1_info.num_entries, 100);
+  ASSERT_EQ(file1_info.smallest_key, Key(0));
+  ASSERT_EQ(file1_info.largest_key, Key(99));
+
+  // file2.sst (100 => 299)
+  std::string file2 = sst_files_folder + "file2.sst";
+  ASSERT_OK(sst_file_writer.Open(file2));
+  for (int k = 100; k < 300; k++) {
+    ASSERT_OK(sst_file_writer.Add(Key(k), Key(k) + "_val"));
+  }
+  ExternalSstFileInfo file2_info;
+  s = sst_file_writer.Finish(&file2_info);
+  ASSERT_TRUE(s.ok()) << s.ToString();
+  ASSERT_EQ(file2_info.file_path, file2);
+  ASSERT_EQ(file2_info.num_entries, 200);
+  ASSERT_EQ(file2_info.smallest_key, Key(100));
+  ASSERT_EQ(file2_info.largest_key, Key(299));
+
+  // file3.sst (110 => 124) .. overlap with file2.sst
+  std::string file3 = sst_files_folder + "file3.sst";
+  ASSERT_OK(sst_file_writer.Open(file3));
+  for (int k = 110; k < 125; k++) {
+    ASSERT_OK(sst_file_writer.Add(Key(k), Key(k) + "_val_overlap"));
+  }
+  ExternalSstFileInfo file3_info;
+  s = sst_file_writer.Finish(&file3_info);
+  ASSERT_TRUE(s.ok()) << s.ToString();
+  ASSERT_EQ(file3_info.file_path, file3);
+  ASSERT_EQ(file3_info.num_entries, 15);
+  ASSERT_EQ(file3_info.smallest_key, Key(110));
+  ASSERT_EQ(file3_info.largest_key, Key(124));
+
+  s = db_->AddFile(&file1_info, true /* move file */);
+  ASSERT_TRUE(s.ok()) << s.ToString();
+  ASSERT_EQ(Status::NotFound(), env_->FileExists(file1));
+
+  s = db_->AddFile(&file2_info, false /* copy file */);
+  ASSERT_TRUE(s.ok()) << s.ToString();
+  ASSERT_OK(env_->FileExists(file2));
+
+  // This file has overlapping values with the existing data
+  s = db_->AddFile(&file3_info, true /* move file */);
+  ASSERT_FALSE(s.ok()) << s.ToString();
+  ASSERT_OK(env_->FileExists(file3));
+
+  for (int k = 0; k < 300; k++) {
+    ASSERT_EQ(Get(Key(k)), Key(k) + "_val");
+  }
+}
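+
+// NOTE: illustrative sketch (not upstream code): AddFile's second argument
+// selects move versus copy, as verified above:
+//
+//   db->AddFile(&info, true);   // move: source file is gone on success
+//   db->AddFile(&info, false);  // copy: source file is left in place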
+
+TEST_F(DBTest, AddExternalSstFileMultiThreaded) {
+  std::string sst_files_folder = test::TmpDir(env_) + "/sst_files/";
+  // Bulk load 10 files, each containing 1000 keys
+  int num_files = 10;
+  int keys_per_file = 1000;
+
+  // Generate file names
+  std::vector<std::string> file_names;
+  for (int i = 0; i < num_files; i++) {
+    std::string file_name = "file_" + ToString(i) + ".sst";
+    file_names.push_back(sst_files_folder + file_name);
+  }
+
+  do {
+    env_->CreateDir(sst_files_folder);
+    Options options = CurrentOptions();
+    const ImmutableCFOptions ioptions(options);
+
+    std::atomic<int> thread_num(0);
+    std::function<void()> write_file_func = [&]() {
+      int file_idx = thread_num.fetch_add(1);
+      int range_start = file_idx * keys_per_file;
+      int range_end = range_start + keys_per_file;
+
+      SstFileWriter sst_file_writer(EnvOptions(), ioptions, options.comparator);
+
+      ASSERT_OK(sst_file_writer.Open(file_names[file_idx]));
+
+      for (int k = range_start; k < range_end; k++) {
+        ASSERT_OK(sst_file_writer.Add(Key(k), Key(k)));
+      }
+
+      Status s = sst_file_writer.Finish();
+      ASSERT_TRUE(s.ok()) << s.ToString();
+    };
+    // Write num_files files in parallel
+    std::vector<std::thread> sst_writer_threads;
+    for (int i = 0; i < num_files; ++i) {
+      sst_writer_threads.emplace_back(write_file_func);
     }
 
-    TEST_SYNC_POINT("DBTest::FlushesInParallelWithCompactRange:2");
+    for (auto& t : sst_writer_threads) {
+      t.join();
+    }
 
-    for (auto& t : threads) {
+    fprintf(stderr, "Wrote %d files (%d keys)\n", num_files,
+            num_files * keys_per_file);
+
+    thread_num.store(0);
+    std::atomic<int> files_added(0);
+    std::function<void()> load_file_func = [&]() {
+      // We intentionally add every file twice, and assert that it was added
+      // only once and the other add failed
+      int thread_id = thread_num.fetch_add(1);
+      int file_idx = thread_id / 2;
+      // sometimes we copy, sometimes we move (link); the result should be
+      // the same
+      bool move_file = (thread_id % 3 == 0);
+
+      Status s = db_->AddFile(file_names[file_idx], move_file);
+      if (s.ok()) {
+        files_added++;
+      }
+    };
+    // Bulk load num_files files in parallel
+    std::vector<std::thread> add_file_threads;
+    DestroyAndReopen(options);
+    for (int i = 0; i < num_files * 2; ++i) {
+      add_file_threads.emplace_back(load_file_func);
+    }
+
+    for (auto& t : add_file_threads) {
       t.join();
     }
-    rocksdb::SyncPoint::GetInstance()->DisableProcessing();
-  }
+    ASSERT_EQ(files_added.load(), num_files);
+    fprintf(stderr, "Loaded %d files (%d keys)\n", num_files,
+            num_files * keys_per_file);
+
+    // Overwrite values of keys divisible by 100
+    for (int k = 0; k < num_files * keys_per_file; k += 100) {
+      std::string key = Key(k);
+      Status s = Put(key, key + "_new");
+      ASSERT_TRUE(s.ok());
+    }
+
+    for (int i = 0; i < 2; i++) {
+      // Make sure the values are correct before and after flush/compaction
+      for (int k = 0; k < num_files * keys_per_file; ++k) {
+        std::string key = Key(k);
+        std::string value = (k % 100 == 0) ? (key + "_new") : key;
+        ASSERT_EQ(Get(key), value);
+      }
+      ASSERT_OK(Flush());
+      ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+    }
+
+    fprintf(stderr, "Verified %d values\n", num_files * keys_per_file);
+  } while (ChangeOptions(kSkipPlainTable | kSkipUniversalCompaction |
+                         kSkipFIFOCompaction));
 }
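+
+// NOTE: the invariant the multi-threaded test above relies on, in sketch
+// form (not upstream code): when several threads race AddFile() on the same
+// path, at most one call succeeds:
+//
+//   Status s = db_->AddFile(path, move_file);
+//   if (s.ok()) files_added++;   // ends up equal to the number of files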
 
-// This tests for a bug that could cause two level0 compactions running
-// concurrently
-TEST_F(DBTest, SuggestCompactRangeNoTwoLevel0Compactions) {
+// 1 Create some SST files by inserting K-V pairs into DB
+// 2 Close DB and change suffix from ".sst" to ".ldb" for every other SST file
+// 3 Open DB and check if all keys can be read
+TEST_F(DBTest, SSTsWithLdbSuffixHandling) {
   Options options = CurrentOptions();
-  options.compaction_style = kCompactionStyleLevel;
-  options.write_buffer_size = 110 << 10;
-  options.level0_file_num_compaction_trigger = 4;
+  options.write_buffer_size = 110 << 10;  // 110KB
   options.num_levels = 4;
-  options.compression = kNoCompression;
-  options.max_bytes_for_level_base = 450 << 10;
-  options.target_file_size_base = 98 << 10;
-  options.max_write_buffer_number = 2;
-  options.max_background_compactions = 2;
-
   DestroyAndReopen(options);
 
-  // fill up the DB
   Random rnd(301);
-  for (int num = 0; num < 10; num++) {
-    GenerateNewRandomFile(&rnd);
+  int key_id = 0;
+  for (int i = 0; i < 10; ++i) {
+    GenerateNewFile(&rnd, &key_id, false);
   }
-  db_->CompactRange(nullptr, nullptr);
-
-  rocksdb::SyncPoint::GetInstance()->LoadDependency(
-      {{"CompactionJob::Run():Start",
-        "DBTest::SuggestCompactRangeNoTwoLevel0Compactions:1"},
-       {"DBTest::SuggestCompactRangeNoTwoLevel0Compactions:2",
-        "CompactionJob::Run():End"}});
+  Flush();
+  Close();
+  int const num_files = GetSstFileCount(dbname_);
+  ASSERT_GT(num_files, 0);
 
-  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+  std::vector<std::string> filenames;
+  GetSstFiles(dbname_, &filenames);
+  int num_ldb_files = 0;
+  for (unsigned int i = 0; i < filenames.size(); ++i) {
+    if (i & 1) {
+      continue;
+    }
+    std::string const rdb_name = dbname_ + "/" + filenames[i];
+    std::string const ldb_name = Rocks2LevelTableFileName(rdb_name);
+    ASSERT_TRUE(env_->RenameFile(rdb_name, ldb_name).ok());
+    ++num_ldb_files;
+  }
+  ASSERT_GT(num_ldb_files, 0);
+  ASSERT_EQ(num_files, GetSstFileCount(dbname_));
 
-  // trigger L0 compaction
-  for (int num = 0; num < options.level0_file_num_compaction_trigger + 1;
-       num++) {
-    GenerateNewRandomFile(&rnd, /* nowait */ true);
+  Reopen(options);
+  for (int k = 0; k < key_id; ++k) {
+    ASSERT_NE("NOT_FOUND", Get(Key(k)));
   }
+  Destroy(options);
+}
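+
+// NOTE: illustrative sketch (not upstream code) of the rename step above;
+// Rocks2LevelTableFileName maps "<num>.sst" to the LevelDB-style
+// "<num>.ldb" name that the DB must still be able to open:
+//
+//   std::string ldb = Rocks2LevelTableFileName(sst_path);
+//   env_->RenameFile(sst_path, ldb);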
 
-  TEST_SYNC_POINT("DBTest::SuggestCompactRangeNoTwoLevel0Compactions:1");
+INSTANTIATE_TEST_CASE_P(DBTestWithParam, DBTestWithParam,
+                        ::testing::Values(1, 4));
 
-  GenerateNewRandomFile(&rnd, /* nowait */ true);
-  dbfull()->TEST_WaitForFlushMemTable();
-  ASSERT_OK(experimental::SuggestCompactRange(db_, nullptr, nullptr));
-  for (int num = 0; num < options.level0_file_num_compaction_trigger + 1;
-       num++) {
-    GenerateNewRandomFile(&rnd, /* nowait */ true);
+TEST_F(DBTest, PauseBackgroundWorkTest) {
+  Options options;
+  options.write_buffer_size = 100000;  // Small write buffer
+  options = CurrentOptions(options);
+  Reopen(options);
+
+  std::vector<std::thread> threads;
+  std::atomic<bool> done(false);
+  db_->PauseBackgroundWork();
+  threads.emplace_back([&]() {
+    Random rnd(301);
+    for (int i = 0; i < 10000; ++i) {
+      Put(RandomString(&rnd, 10), RandomString(&rnd, 10));
+    }
+    done.store(true);
+  });
+  env_->SleepForMicroseconds(200000);
+  // make sure the thread is not done
+  ASSERT_EQ(false, done.load());
+  db_->ContinueBackgroundWork();
+  for (auto& t : threads) {
+    t.join();
   }
+  // now it's done
+  ASSERT_EQ(true, done.load());
+}
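+
+// NOTE: illustrative sketch (not upstream code) of the pairing the test
+// above checks; writes stall once memtables fill, because flushes are part
+// of background work:
+//
+//   db_->PauseBackgroundWork();      // flushes and compactions stop
+//   // ... work that must not race background jobs ...
+//   db_->ContinueBackgroundWork();   // must balance the pause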
+
+// 1 Insert 2 K-V pairs into DB
+// 2 Call Get() for both keys - expect memtable bloom hit stat to be 2
+// 3 Call Get() for nonexistent key - expect memtable bloom miss stat to be 1
+// 4 Call Flush() to create SST
+// 5 Call Get() for both keys - expect SST bloom hit stat to be 2
+// 6 Call Get() for nonexistent key - expect SST bloom miss stat to be 1
+// Test both: block and plain SST
+TEST_P(BloomStatsTestWithParam, BloomStatsTest) {
+  std::string key1("AAAA");
+  std::string key2("RXDB");  // not in DB
+  std::string key3("ZBRA");
+  std::string value1("Value1");
+  std::string value3("Value3");
+
+  ASSERT_OK(Put(key1, value1, WriteOptions()));
+  ASSERT_OK(Put(key3, value3, WriteOptions()));
+
+  // check memtable bloom stats
+  ASSERT_EQ(value1, Get(key1));
+  ASSERT_EQ(1, perf_context.bloom_memtable_hit_count);
+  ASSERT_EQ(value3, Get(key3));
+  ASSERT_EQ(2, perf_context.bloom_memtable_hit_count);
+  ASSERT_EQ(0, perf_context.bloom_memtable_miss_count);
+
+  ASSERT_EQ("NOT_FOUND", Get(key2));
+  ASSERT_EQ(1, perf_context.bloom_memtable_miss_count);
+  ASSERT_EQ(2, perf_context.bloom_memtable_hit_count);
+
+  // sanity checks
+  ASSERT_EQ(0, perf_context.bloom_sst_hit_count);
+  ASSERT_EQ(0, perf_context.bloom_sst_miss_count);
 
-  TEST_SYNC_POINT("DBTest::SuggestCompactRangeNoTwoLevel0Compactions:2");
+  Flush();
+
+  // sanity checks
+  ASSERT_EQ(0, perf_context.bloom_sst_hit_count);
+  ASSERT_EQ(0, perf_context.bloom_sst_miss_count);
+
+  // check SST bloom stats
+  // NOTE: hits per get differs because of code paths differences
+  // in BlockBasedTable::Get()
+  int hits_per_get = use_block_table_ && !use_block_based_builder_ ? 2 : 1;
+  ASSERT_EQ(value1, Get(key1));
+  ASSERT_EQ(hits_per_get, perf_context.bloom_sst_hit_count);
+  ASSERT_EQ(value3, Get(key3));
+  ASSERT_EQ(2 * hits_per_get, perf_context.bloom_sst_hit_count);
+
+  ASSERT_EQ("NOT_FOUND", Get(key2));
+  ASSERT_EQ(1, perf_context.bloom_sst_miss_count);
+}
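+
+// NOTE: illustrative sketch (not upstream code) of reading the counters
+// asserted above; perf_context is the thread-local instance used here:
+//
+//   Get(key);
+//   uint64_t mem_hits = perf_context.bloom_memtable_hit_count;
+//   uint64_t sst_miss = perf_context.bloom_sst_miss_count;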
+
+// Same scenario as in BloomStatsTest but using an iterator
+TEST_P(BloomStatsTestWithParam, BloomStatsTestWithIter) {
+  std::string key1("AAAA");
+  std::string key2("RXDB");  // not in DB
+  std::string key3("ZBRA");
+  std::string value1("Value1");
+  std::string value3("Value3");
+
+  ASSERT_OK(Put(key1, value1, WriteOptions()));
+  ASSERT_OK(Put(key3, value3, WriteOptions()));
+
+  unique_ptr<Iterator> iter(dbfull()->NewIterator(ReadOptions()));
+
+  // check memtable bloom stats
+  iter->Seek(key1);
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(value1, iter->value().ToString());
+  ASSERT_EQ(1, perf_context.bloom_memtable_hit_count);
+  ASSERT_EQ(0, perf_context.bloom_memtable_miss_count);
+
+  iter->Seek(key3);
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(value3, iter->value().ToString());
+  ASSERT_EQ(2, perf_context.bloom_memtable_hit_count);
+  ASSERT_EQ(0, perf_context.bloom_memtable_miss_count);
+
+  iter->Seek(key2);
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(!iter->Valid());
+  ASSERT_EQ(1, perf_context.bloom_memtable_miss_count);
+  ASSERT_EQ(2, perf_context.bloom_memtable_hit_count);
+
+  Flush();
+
+  iter.reset(dbfull()->NewIterator(ReadOptions()));
+
+  // check SST bloom stats
+  iter->Seek(key1);
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(value1, iter->value().ToString());
+  ASSERT_EQ(1, perf_context.bloom_sst_hit_count);
+
+  iter->Seek(key3);
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(value3, iter->value().ToString());
+  ASSERT_EQ(2, perf_context.bloom_sst_hit_count);
+
+  iter->Seek(key2);
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(!iter->Valid());
+  ASSERT_EQ(1, perf_context.bloom_sst_miss_count);
+  ASSERT_EQ(2, perf_context.bloom_sst_hit_count);
 }
 
+INSTANTIATE_TEST_CASE_P(BloomStatsTestWithParam, BloomStatsTestWithParam,
+                        ::testing::Values(std::make_tuple(true, true),
+                                          std::make_tuple(true, false),
+                                          std::make_tuple(false, false)));
 }  // namespace rocksdb
 
+#endif
+
 int main(int argc, char** argv) {
+#if !(defined NDEBUG) || !defined(OS_WIN)
   rocksdb::port::InstallStackTraceHandler();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
+#else
+  return 0;
+#endif
 }
diff --git a/src/rocksdb/db/db_universal_compaction_test.cc b/src/rocksdb/db/db_universal_compaction_test.cc
new file mode 100644
index 0000000..8e18699
--- /dev/null
+++ b/src/rocksdb/db/db_universal_compaction_test.cc
@@ -0,0 +1,1223 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "port/stack_trace.h"
+#include "util/db_test_util.h"
+#if !(defined NDEBUG) || !defined(OS_WIN)
+#include "util/sync_point.h"
+
+namespace rocksdb {
+
+static std::string CompressibleString(Random* rnd, int len) {
+  std::string r;
+  test::CompressibleString(rnd, 0.8, len, &r);
+  return r;
+}
+
+class DBTestUniversalCompactionBase
+    : public DBTestBase,
+      public ::testing::WithParamInterface<int> {
+ public:
+  explicit DBTestUniversalCompactionBase(
+      const std::string& path) : DBTestBase(path) {}
+  virtual void SetUp() override { num_levels_ = GetParam(); }
+  int num_levels_;
+};
+
+class DBTestUniversalCompaction : public DBTestUniversalCompactionBase {
+ public:
+  DBTestUniversalCompaction() :
+      DBTestUniversalCompactionBase("/db_universal_compaction_test") {}
+};
+
+namespace {
+void VerifyCompactionResult(
+    const ColumnFamilyMetaData& cf_meta,
+    const std::set<std::string>& overlapping_file_numbers) {
+#ifndef NDEBUG
+  for (auto& level : cf_meta.levels) {
+    for (auto& file : level.files) {
+      assert(overlapping_file_numbers.find(file.name) ==
+             overlapping_file_numbers.end());
+    }
+  }
+#endif
+}
+
+class KeepFilter : public CompactionFilter {
+ public:
+  virtual bool Filter(int level, const Slice& key, const Slice& value,
+                      std::string* new_value, bool* value_changed) const
+      override {
+    return false;
+  }
+
+  virtual const char* Name() const override { return "KeepFilter"; }
+};
+
+class KeepFilterFactory : public CompactionFilterFactory {
+ public:
+  explicit KeepFilterFactory(bool check_context = false)
+      : check_context_(check_context) {}
+
+  virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& context) override {
+    if (check_context_) {
+      EXPECT_EQ(expect_full_compaction_.load(), context.is_full_compaction);
+      EXPECT_EQ(expect_manual_compaction_.load(), context.is_manual_compaction);
+    }
+    return std::unique_ptr<CompactionFilter>(new KeepFilter());
+  }
+
+  virtual const char* Name() const override { return "KeepFilterFactory"; }
+  bool check_context_;
+  std::atomic_bool expect_full_compaction_;
+  std::atomic_bool expect_manual_compaction_;
+};
+
+class DelayFilter : public CompactionFilter {
+ public:
+  explicit DelayFilter(DBTestBase* d) : db_test(d) {}
+  virtual bool Filter(int level, const Slice& key, const Slice& value,
+                      std::string* new_value,
+                      bool* value_changed) const override {
+    db_test->env_->addon_time_.fetch_add(1000);
+    return true;
+  }
+
+  virtual const char* Name() const override { return "DelayFilter"; }
+
+ private:
+  DBTestBase* db_test;
+};
+
+class DelayFilterFactory : public CompactionFilterFactory {
+ public:
+  explicit DelayFilterFactory(DBTestBase* d) : db_test(d) {}
+  virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& context) override {
+    return std::unique_ptr<CompactionFilter>(new DelayFilter(db_test));
+  }
+
+  virtual const char* Name() const override { return "DelayFilterFactory"; }
+
+ private:
+  DBTestBase* db_test;
+};
+}  // namespace
+
+// TODO(kailiu) The tests on UniversalCompaction have some issues:
+//  1. A lot of magic numbers ("11" or "12").
+//  2. Made assumption on the memtable flush conditions, which may change from
+//     time to time.
+TEST_P(DBTestUniversalCompaction, UniversalCompactionTrigger) {
+  Options options;
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = num_levels_;
+  options.write_buffer_size = 105 << 10;  // 105KB
+  options.arena_block_size = 4 << 10;
+  options.target_file_size_base = 32 << 10;  // 32KB
+  // trigger compaction if there are >= 4 files
+  options.level0_file_num_compaction_trigger = 4;
+  KeepFilterFactory* filter = new KeepFilterFactory(true);
+  filter->expect_manual_compaction_.store(false);
+  options.compaction_filter_factory.reset(filter);
+
+  options = CurrentOptions(options);
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "DBTestWritableFile.GetPreallocationStatus", [&](void* arg) {
+        ASSERT_TRUE(arg != nullptr);
+        size_t preallocation_size = *(static_cast<size_t*>(arg));
+        if (num_levels_ > 3) {
+          ASSERT_LE(preallocation_size, options.target_file_size_base * 1.1);
+        }
+      });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  Random rnd(301);
+  int key_idx = 0;
+
+  filter->expect_full_compaction_.store(true);
+  // Stage 1:
+  //   Generate a set of files at level 0, but don't trigger level-0
+  //   compaction.
+  for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
+       num++) {
+    // Write 100KB
+    GenerateNewFile(1, &rnd, &key_idx);
+  }
+
+  // Generate one more file at level-0, which should trigger level-0
+  // compaction.
+  GenerateNewFile(1, &rnd, &key_idx);
+  // Suppose each file flushed from the mem table has size 1. Now we compact
+  // level0_file_num_compaction_trigger (= 4) files and should have a big
+  // file of size 4.
+  ASSERT_EQ(NumSortedRuns(1), 1);
+
+  // Stage 2:
+  //   Now we have one file at level 0, with size 4. We also have some data in
+  //   mem table. Let's continue generating new files at level 0, but don't
+  //   trigger level-0 compaction.
+  //   First, clean up memtable before inserting new data. This will generate
+  //   a level-0 file, with size around 0.4 (according to previously written
+  //   data amount).
+  filter->expect_full_compaction_.store(false);
+  ASSERT_OK(Flush(1));
+  for (int num = 0; num < options.level0_file_num_compaction_trigger - 3;
+       num++) {
+    GenerateNewFile(1, &rnd, &key_idx);
+    ASSERT_EQ(NumSortedRuns(1), num + 3);
+  }
+
+  // Generate one more file at level-0, which should trigger level-0
+  // compaction.
+  GenerateNewFile(1, &rnd, &key_idx);
+  // Before compaction, we have 4 files at level 0, with size 4, 0.4, 1, 1.
+  // After compaction, we should have 2 files, with size 4, 2.4.
+  ASSERT_EQ(NumSortedRuns(1), 2);
+
+  // Stage 3:
+  //   Now we have 2 files at level 0, with size 4 and 2.4. Continue
+  //   generating new files at level 0.
+  for (int num = 0; num < options.level0_file_num_compaction_trigger - 3;
+       num++) {
+    GenerateNewFile(1, &rnd, &key_idx);
+    ASSERT_EQ(NumSortedRuns(1), num + 3);
+  }
+
+  // Generate one more file at level-0, which should trigger level-0
+  // compaction.
+  GenerateNewFile(1, &rnd, &key_idx);
+  // Before compaction, we have 4 files at level 0, with size 4, 2.4, 1, 1.
+  // After compaction, we should have 3 files, with size 4, 2.4, 2.
+  ASSERT_EQ(NumSortedRuns(1), 3);
+
+  // Stage 4:
+  //   Now we have 3 files at level 0, with size 4, 2.4, 2. Let's generate a
+  //   new file of size 1.
+  GenerateNewFile(1, &rnd, &key_idx);
+  dbfull()->TEST_WaitForCompact();
+  // Level-0 compaction is triggered, but no file will be picked up.
+  ASSERT_EQ(NumSortedRuns(1), 4);
+
+  // Stage 5:
+  //   Now we have 4 files at level 0, with size 4, 2.4, 2, 1. Let's generate
+  //   a new file of size 1.
+  filter->expect_full_compaction_.store(true);
+  GenerateNewFile(1, &rnd, &key_idx);
+  dbfull()->TEST_WaitForCompact();
+  // All files at level 0 will be compacted into a single one.
+  ASSERT_EQ(NumSortedRuns(1), 1);
+
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBTestUniversalCompaction, UniversalCompactionSizeAmplification) {
+  Options options;
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = num_levels_;
+  options.write_buffer_size = 100 << 10;     // 100KB
+  options.target_file_size_base = 32 << 10;  // 32KB
+  options.level0_file_num_compaction_trigger = 3;
+  options = CurrentOptions(options);
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  // Trigger compaction if size amplification exceeds 110%
+  options.compaction_options_universal.max_size_amplification_percent = 110;
+  options = CurrentOptions(options);
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+  Random rnd(301);
+  int key_idx = 0;
+
+  //   Generate two files in Level 0. Both files are approx the same size.
+  for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
+       num++) {
+    // Write 110KB (11 values, each 10K)
+    for (int i = 0; i < 11; i++) {
+      ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 10000)));
+      key_idx++;
+    }
+    dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+    ASSERT_EQ(NumSortedRuns(1), num + 1);
+  }
+  ASSERT_EQ(NumSortedRuns(1), 2);
+
+  // Flush whatever is remaining in memtable. This is typically
+  // small, which should not trigger size ratio based compaction
+  // but will instead trigger size amplification.
+  ASSERT_OK(Flush(1));
+
+  dbfull()->TEST_WaitForCompact();
+
+  // Verify that size amplification did occur
+  ASSERT_EQ(NumSortedRuns(1), 1);
+}
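+
+// NOTE: worked example (not upstream code) for the trigger above: size
+// amplification is roughly (total size - newest run size) / newest run
+// size. Two ~110KB runs plus a small flush push the ratio well past the
+// configured 110%, so a full compaction collapses everything into one
+// sorted run.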
+
+TEST_P(DBTestUniversalCompaction, CompactFilesOnUniversalCompaction) {
+  const int kTestKeySize = 16;
+  const int kTestValueSize = 984;
+  const int kEntrySize = kTestKeySize + kTestValueSize;
+  const int kEntriesPerBuffer = 10;
+
+  ChangeCompactOptions();
+  Options options;
+  options.create_if_missing = true;
+  options.write_buffer_size = kEntrySize * kEntriesPerBuffer;
+  options.compaction_style = kCompactionStyleLevel;
+  options.num_levels = 1;
+  options.target_file_size_base = options.write_buffer_size;
+  options.compression = kNoCompression;
+  options = CurrentOptions(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+  ASSERT_EQ(options.compaction_style, kCompactionStyleUniversal);
+  Random rnd(301);
+  for (int key = 1024 * kEntriesPerBuffer; key >= 0; --key) {
+    ASSERT_OK(Put(1, ToString(key), RandomString(&rnd, kTestValueSize)));
+  }
+  dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+  dbfull()->TEST_WaitForCompact();
+  ColumnFamilyMetaData cf_meta;
+  dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+  std::vector<std::string> compaction_input_file_names;
+  for (auto file : cf_meta.levels[0].files) {
+    if (rnd.OneIn(2)) {
+      compaction_input_file_names.push_back(file.name);
+    }
+  }
+
+  if (compaction_input_file_names.size() == 0) {
+    compaction_input_file_names.push_back(
+        cf_meta.levels[0].files[0].name);
+  }
+
+  // expect failure since universal compaction only allows L0 output
+  ASSERT_FALSE(dbfull()
+                   ->CompactFiles(CompactionOptions(), handles_[1],
+                                  compaction_input_file_names, 1)
+                   .ok());
+
+  // expect ok and verify the compacted files no longer exist.
+  ASSERT_OK(dbfull()->CompactFiles(
+      CompactionOptions(), handles_[1],
+      compaction_input_file_names, 0));
+
+  dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+  VerifyCompactionResult(
+      cf_meta,
+      std::set<std::string>(compaction_input_file_names.begin(),
+          compaction_input_file_names.end()));
+
+  compaction_input_file_names.clear();
+
+  // Pick the first and the last file, expect everything is
+  // compacted into one single file.
+  compaction_input_file_names.push_back(
+      cf_meta.levels[0].files[0].name);
+  compaction_input_file_names.push_back(
+      cf_meta.levels[0].files[
+          cf_meta.levels[0].files.size() - 1].name);
+  ASSERT_OK(dbfull()->CompactFiles(
+      CompactionOptions(), handles_[1],
+      compaction_input_file_names, 0));
+
+  dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+  ASSERT_EQ(cf_meta.levels[0].files.size(), 1U);
+}
+
+TEST_P(DBTestUniversalCompaction, UniversalCompactionTargetLevel) {
+  Options options;
+  options.compaction_style = kCompactionStyleUniversal;
+  options.write_buffer_size = 100 << 10;     // 100KB
+  options.num_levels = 7;
+  options.disable_auto_compactions = true;
+  options = CurrentOptions(options);
+  DestroyAndReopen(options);
+
+  // Generate 3 overlapping files
+  Random rnd(301);
+  for (int i = 0; i < 210; i++) {
+    ASSERT_OK(Put(Key(i), RandomString(&rnd, 100)));
+  }
+  ASSERT_OK(Flush());
+
+  for (int i = 200; i < 300; i++) {
+    ASSERT_OK(Put(Key(i), RandomString(&rnd, 100)));
+  }
+  ASSERT_OK(Flush());
+
+  for (int i = 250; i < 260; i++) {
+    ASSERT_OK(Put(Key(i), RandomString(&rnd, 100)));
+  }
+  ASSERT_OK(Flush());
+
+  ASSERT_EQ("3", FilesPerLevel(0));
+  // Compact all files into 1 file and put it in L4
+  CompactRangeOptions compact_options;
+  compact_options.change_level = true;
+  compact_options.target_level = 4;
+  db_->CompactRange(compact_options, nullptr, nullptr);
+  ASSERT_EQ("0,0,0,0,1", FilesPerLevel(0));
+}
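+
+// NOTE: illustrative sketch (not upstream code) of the change_level pattern
+// used above; nullptr begin/end means the whole key range:
+//
+//   CompactRangeOptions opts;
+//   opts.change_level = true;
+//   opts.target_level = 4;                     // place the output in L4
+//   db_->CompactRange(opts, nullptr, nullptr);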
+
+class DBTestUniversalCompactionMultiLevels
+    : public DBTestUniversalCompactionBase {
+ public:
+  DBTestUniversalCompactionMultiLevels() :
+      DBTestUniversalCompactionBase(
+          "/db_universal_compaction_multi_levels_test") {}
+};
+
+TEST_P(DBTestUniversalCompactionMultiLevels, UniversalCompactionMultiLevels) {
+  Options options;
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = num_levels_;
+  options.write_buffer_size = 100 << 10;  // 100KB
+  options.level0_file_num_compaction_trigger = 8;
+  options.max_background_compactions = 3;
+  options.target_file_size_base = 32 * 1024;
+  options = CurrentOptions(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  // Trigger compaction if size amplification exceeds 110%
+  options.compaction_options_universal.max_size_amplification_percent = 110;
+  options = CurrentOptions(options);
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+  Random rnd(301);
+  int num_keys = 100000;
+  for (int i = 0; i < num_keys * 2; i++) {
+    ASSERT_OK(Put(1, Key(i % num_keys), Key(i)));
+  }
+
+  dbfull()->TEST_WaitForCompact();
+
+  for (int i = num_keys; i < num_keys * 2; i++) {
+    ASSERT_EQ(Get(1, Key(i % num_keys)), Key(i));
+  }
+}
+// Tests universal compaction with trivial move enabled
+TEST_P(DBTestUniversalCompactionMultiLevels, UniversalCompactionTrivialMove) {
+  int32_t trivial_move = 0;
+  int32_t non_trivial_move = 0;
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:TrivialMove",
+      [&](void* arg) { trivial_move++; });
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:NonTrivial", [&](void* arg) {
+        non_trivial_move++;
+        ASSERT_TRUE(arg != nullptr);
+        int output_level = *(static_cast<int*>(arg));
+        ASSERT_EQ(output_level, 0);
+      });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  Options options;
+  options.compaction_style = kCompactionStyleUniversal;
+  options.compaction_options_universal.allow_trivial_move = true;
+  options.num_levels = 3;
+  options.write_buffer_size = 100 << 10;  // 100KB
+  options.level0_file_num_compaction_trigger = 3;
+  options.max_background_compactions = 2;
+  options.target_file_size_base = 32 * 1024;
+  options = CurrentOptions(options);
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  // Trigger compaction if size amplification exceeds 110%
+  options.compaction_options_universal.max_size_amplification_percent = 110;
+  options = CurrentOptions(options);
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+  Random rnd(301);
+  int num_keys = 150000;
+  for (int i = 0; i < num_keys; i++) {
+    ASSERT_OK(Put(1, Key(i), Key(i)));
+  }
+  std::vector<std::string> values;
+
+  ASSERT_OK(Flush(1));
+  dbfull()->TEST_WaitForCompact();
+
+  ASSERT_GT(trivial_move, 0);
+  ASSERT_GT(non_trivial_move, 0);
+
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+INSTANTIATE_TEST_CASE_P(DBTestUniversalCompactionMultiLevels,
+                        DBTestUniversalCompactionMultiLevels,
+                        ::testing::Values(3, 20));
+
+class DBTestUniversalCompactionParallel :
+    public DBTestUniversalCompactionBase {
+ public:
+  DBTestUniversalCompactionParallel() :
+      DBTestUniversalCompactionBase(
+          "/db_universal_compaction_prallel_test") {}
+};
+
+TEST_P(DBTestUniversalCompactionParallel, UniversalCompactionParallel) {
+  Options options;
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = num_levels_;
+  options.write_buffer_size = 1 << 10;  // 1KB
+  options.level0_file_num_compaction_trigger = 3;
+  options.max_background_compactions = 3;
+  options.max_background_flushes = 3;
+  options.target_file_size_base = 1 * 1024;
+  options.compaction_options_universal.max_size_amplification_percent = 110;
+  options = CurrentOptions(options);
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  // Delay every compaction so multiple compactions will happen.
+  std::atomic<int> num_compactions_running(0);
+  std::atomic<bool> has_parallel(false);
+  rocksdb::SyncPoint::GetInstance()->SetCallBack("CompactionJob::Run():Start",
+                                                 [&](void* arg) {
+    if (num_compactions_running.fetch_add(1) > 0) {
+      has_parallel.store(true);
+      return;
+    }
+    for (int nwait = 0; nwait < 20000; nwait++) {
+      if (has_parallel.load() || num_compactions_running.load() > 1) {
+        has_parallel.store(true);
+        break;
+      }
+      env_->SleepForMicroseconds(1000);
+    }
+  });
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "CompactionJob::Run():End",
+      [&](void* arg) { num_compactions_running.fetch_add(-1); });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  options = CurrentOptions(options);
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+  Random rnd(301);
+  int num_keys = 30000;
+  for (int i = 0; i < num_keys * 2; i++) {
+    ASSERT_OK(Put(1, Key(i % num_keys), Key(i)));
+  }
+  dbfull()->TEST_WaitForCompact();
+
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+  ASSERT_EQ(num_compactions_running.load(), 0);
+  ASSERT_TRUE(has_parallel.load());
+
+  for (int i = num_keys; i < num_keys * 2; i++) {
+    ASSERT_EQ(Get(1, Key(i % num_keys)), Key(i));
+  }
+
+  // Reopen and check.
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+  for (int i = num_keys; i < num_keys * 2; i++) {
+    ASSERT_EQ(Get(1, Key(i % num_keys)), Key(i));
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(DBTestUniversalCompactionParallel,
+                        DBTestUniversalCompactionParallel,
+                        ::testing::Values(1, 10));
+
+TEST_P(DBTestUniversalCompaction, UniversalCompactionOptions) {
+  Options options;
+  options.compaction_style = kCompactionStyleUniversal;
+  options.write_buffer_size = 105 << 10;    // 105KB
+  options.arena_block_size = 4 << 10;       // 4KB
+  options.target_file_size_base = 32 << 10;  // 32KB
+  options.level0_file_num_compaction_trigger = 4;
+  options.num_levels = num_levels_;
+  options.compaction_options_universal.compression_size_percent = -1;
+  options = CurrentOptions(options);
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  Random rnd(301);
+  int key_idx = 0;
+
+  for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) {
+    // Write 100KB (100 values, each 1K)
+    for (int i = 0; i < 100; i++) {
+      ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 990)));
+      key_idx++;
+    }
+    dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+
+    if (num < options.level0_file_num_compaction_trigger - 1) {
+      ASSERT_EQ(NumSortedRuns(1), num + 1);
+    }
+  }
+
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(NumSortedRuns(1), 1);
+}
+
+TEST_P(DBTestUniversalCompaction, UniversalCompactionStopStyleSimilarSize) {
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.write_buffer_size = 105 << 10;    // 105KB
+  options.arena_block_size = 4 << 10;       // 4KB
+  options.target_file_size_base = 32 << 10;  // 32KB
+  // trigger compaction if there are >= 4 files
+  options.level0_file_num_compaction_trigger = 4;
+  options.compaction_options_universal.size_ratio = 10;
+  options.compaction_options_universal.stop_style =
+      kCompactionStopStyleSimilarSize;
+  options.num_levels = num_levels_;
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  int key_idx = 0;
+
+  // Stage 1:
+  //   Generate a set of files at level 0, but don't trigger level-0
+  //   compaction.
+  for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
+       num++) {
+    // Write 100KB (100 values, each 1K)
+    for (int i = 0; i < 100; i++) {
+      ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 990)));
+      key_idx++;
+    }
+    dbfull()->TEST_WaitForFlushMemTable();
+    ASSERT_EQ(NumSortedRuns(), num + 1);
+  }
+
+  // Generate one more file at level-0, which should trigger level-0
+  // compaction.
+  for (int i = 0; i < 100; i++) {
+    ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 990)));
+    key_idx++;
+  }
+  dbfull()->TEST_WaitForCompact();
+  // Suppose each file flushed from the mem table has size 1. Now we compact
+  // level0_file_num_compaction_trigger (= 4) files and should have a big
+  // file of size 4.
+  ASSERT_EQ(NumSortedRuns(), 1);
+
+  // Stage 2:
+  //   Now we have one file at level 0, with size 4. We also have some data in
+  //   mem table. Let's continue generating new files at level 0, but don't
+  //   trigger level-0 compaction.
+  //   First, clean up memtable before inserting new data. This will generate
+  //   a level-0 file, with size around 0.4 (according to previously written
+  //   data amount).
+  dbfull()->Flush(FlushOptions());
+  for (int num = 0; num < options.level0_file_num_compaction_trigger - 3;
+       num++) {
+    // Write 100KB (100 values, each 1K)
+    for (int i = 0; i < 100; i++) {
+      ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 990)));
+      key_idx++;
+    }
+    dbfull()->TEST_WaitForFlushMemTable();
+    ASSERT_EQ(NumSortedRuns(), num + 3);
+  }
+
+  // Generate one more file at level-0, which should trigger level-0
+  // compaction.
+  for (int i = 0; i < 100; i++) {
+    ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 990)));
+    key_idx++;
+  }
+  dbfull()->TEST_WaitForCompact();
+  // Before compaction, we have 4 files at level 0, with size 4, 0.4, 1, 1.
+  // After compaction, we should have 3 files, with size 4, 0.4, 2.
+  ASSERT_EQ(NumSortedRuns(), 3);
+  // Stage 3:
+  //   Now we have 3 files at level 0, with size 4, 0.4, 2. Generate one
+  //   more file at level-0, which should trigger level-0 compaction.
+  for (int i = 0; i < 100; i++) {
+    ASSERT_OK(Put(Key(key_idx), RandomString(&rnd, 990)));
+    key_idx++;
+  }
+  dbfull()->TEST_WaitForCompact();
+  // Level-0 compaction is triggered, but no file will be picked up.
+  ASSERT_EQ(NumSortedRuns(), 4);
+}
+
+TEST_P(DBTestUniversalCompaction, UniversalCompactionCompressRatio1) {
+  if (!Snappy_Supported()) {
+    return;
+  }
+
+  Options options;
+  options.compaction_style = kCompactionStyleUniversal;
+  options.write_buffer_size = 100 << 10;     // 100KB
+  options.target_file_size_base = 32 << 10;  // 32KB
+  options.level0_file_num_compaction_trigger = 2;
+  options.num_levels = num_levels_;
+  options.compaction_options_universal.compression_size_percent = 70;
+  options = CurrentOptions(options);
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  int key_idx = 0;
+
+  // The first compaction (2) is compressed.
+  for (int num = 0; num < 2; num++) {
+    // Write 110KB (11 values, each 10K)
+    for (int i = 0; i < 11; i++) {
+      ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000)));
+      key_idx++;
+    }
+    dbfull()->TEST_WaitForFlushMemTable();
+    dbfull()->TEST_WaitForCompact();
+  }
+  ASSERT_LT(TotalSize(), 110000U * 2 * 0.9);
+
+  // The second compaction (4) is compressed
+  for (int num = 0; num < 2; num++) {
+    // Write 110KB (11 values, each 10K)
+    for (int i = 0; i < 11; i++) {
+      ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000)));
+      key_idx++;
+    }
+    dbfull()->TEST_WaitForFlushMemTable();
+    dbfull()->TEST_WaitForCompact();
+  }
+  ASSERT_LT(TotalSize(), 110000 * 4 * 0.9);
+
+  // The third compaction (2 4) is compressed since this time it is
+  // (1 1 3.2) and 3.2/5.2 doesn't reach ratio.
+  for (int num = 0; num < 2; num++) {
+    // Write 110KB (11 values, each 10K)
+    for (int i = 0; i < 11; i++) {
+      ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000)));
+      key_idx++;
+    }
+    dbfull()->TEST_WaitForFlushMemTable();
+    dbfull()->TEST_WaitForCompact();
+  }
+  ASSERT_LT(TotalSize(), 110000 * 6 * 0.9);
+
+  // When the compaction grows toward (2 4 8), the latest compaction
+  // output is not compressed.
+  for (int num = 0; num < 8; num++) {
+    // Write 110KB (11 values, each 10K)
+    for (int i = 0; i < 11; i++) {
+      ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000)));
+      key_idx++;
+    }
+    dbfull()->TEST_WaitForFlushMemTable();
+    dbfull()->TEST_WaitForCompact();
+  }
+  ASSERT_GT(TotalSize(), 110000 * 11 * 0.8 + 110000 * 2);
+}
+
+TEST_P(DBTestUniversalCompaction, UniversalCompactionCompressRatio2) {
+  if (!Snappy_Supported()) {
+    return;
+  }
+  Options options;
+  options.compaction_style = kCompactionStyleUniversal;
+  options.write_buffer_size = 100 << 10;     // 100KB
+  options.target_file_size_base = 32 << 10;  // 32KB
+  options.level0_file_num_compaction_trigger = 2;
+  options.num_levels = num_levels_;
+  options.compaction_options_universal.compression_size_percent = 95;
+  options = CurrentOptions(options);
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  int key_idx = 0;
+
+  // When the compaction grows toward (2 4 8), the latest compaction
+  // output is compressed, given the configured compression size percent.
+  for (int num = 0; num < 14; num++) {
+    // Write 120KB (12 values, each 10K)
+    for (int i = 0; i < 12; i++) {
+      ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000)));
+      key_idx++;
+    }
+    dbfull()->TEST_WaitForFlushMemTable();
+    dbfull()->TEST_WaitForCompact();
+  }
+  ASSERT_LT(TotalSize(), 120000U * 12 * 0.8 + 120000 * 2);
+}
+
+// Test that checks trivial move in universal compaction
+TEST_P(DBTestUniversalCompaction, UniversalCompactionTrivialMoveTest1) {
+  int32_t trivial_move = 0;
+  int32_t non_trivial_move = 0;
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:TrivialMove",
+      [&](void* arg) { trivial_move++; });
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:NonTrivial", [&](void* arg) {
+        non_trivial_move++;
+        ASSERT_TRUE(arg != nullptr);
+        int output_level = *(static_cast<int*>(arg));
+        ASSERT_EQ(output_level, 0);
+      });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  Options options;
+  options.compaction_style = kCompactionStyleUniversal;
+  options.compaction_options_universal.allow_trivial_move = true;
+  options.num_levels = 2;
+  options.write_buffer_size = 100 << 10;  // 100KB
+  options.level0_file_num_compaction_trigger = 3;
+  options.max_background_compactions = 1;
+  options.target_file_size_base = 32 * 1024;
+  options = CurrentOptions(options);
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  // Trigger compaction if size amplification exceeds 110%
+  options.compaction_options_universal.max_size_amplification_percent = 110;
+  options = CurrentOptions(options);
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+  Random rnd(301);
+  int num_keys = 250000;
+  for (int i = 0; i < num_keys; i++) {
+    ASSERT_OK(Put(1, Key(i), Key(i)));
+  }
+  std::vector<std::string> values;
+
+  ASSERT_OK(Flush(1));
+  dbfull()->TEST_WaitForCompact();
+
+  ASSERT_GT(trivial_move, 0);
+  ASSERT_GT(non_trivial_move, 0);
+
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
+// Test that checks trivial move in universal compaction
+TEST_P(DBTestUniversalCompaction, UniversalCompactionTrivialMoveTest2) {
+  int32_t trivial_move = 0;
+  int32_t non_trivial_move = 0;
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:TrivialMove",
+      [&](void* arg) { trivial_move++; });
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:NonTrivial",
+      [&](void* arg) { non_trivial_move++; });
+
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  Options options;
+  options.compaction_style = kCompactionStyleUniversal;
+  options.compaction_options_universal.allow_trivial_move = true;
+  options.num_levels = 15;
+  options.write_buffer_size = 100 << 10;  // 100KB
+  options.level0_file_num_compaction_trigger = 8;
+  options.max_background_compactions = 4;
+  options.target_file_size_base = 64 * 1024;
+  options = CurrentOptions(options);
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  // Trigger compaction if size amplification exceeds 110%
+  options.compaction_options_universal.max_size_amplification_percent = 110;
+  options = CurrentOptions(options);
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+  Random rnd(301);
+  int num_keys = 500000;
+  for (int i = 0; i < num_keys; i++) {
+    ASSERT_OK(Put(1, Key(i), Key(i)));
+  }
+  std::vector<std::string> values;
+
+  ASSERT_OK(Flush(1));
+  dbfull()->TEST_WaitForCompact();
+
+  ASSERT_GT(trivial_move, 0);
+  ASSERT_EQ(non_trivial_move, 0);
+
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
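+
+// NOTE: illustrative sketch (not upstream code) of the counting hook both
+// trivial-move tests install; sync points are a debug-build test facility:
+//
+//   rocksdb::SyncPoint::GetInstance()->SetCallBack(
+//       "DBImpl::BackgroundCompaction:TrivialMove",
+//       [&](void* arg) { trivial_move++; });
+//   rocksdb::SyncPoint::GetInstance()->EnableProcessing();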
+
+TEST_P(DBTestUniversalCompaction, UniversalCompactionFourPaths) {
+  Options options;
+  options.db_paths.emplace_back(dbname_, 300 * 1024);
+  options.db_paths.emplace_back(dbname_ + "_2", 300 * 1024);
+  options.db_paths.emplace_back(dbname_ + "_3", 500 * 1024);
+  options.db_paths.emplace_back(dbname_ + "_4", 1024 * 1024 * 1024);
+  options.compaction_style = kCompactionStyleUniversal;
+  options.write_buffer_size = 110 << 10;  // 105KB
+  options.arena_block_size = 4 << 10;
+  options.level0_file_num_compaction_trigger = 2;
+  options.num_levels = 1;
+  options = CurrentOptions(options);
+
+  std::vector<std::string> filenames;
+  env_->GetChildren(options.db_paths[1].path, &filenames);
+  // Delete archival files.
+  for (size_t i = 0; i < filenames.size(); ++i) {
+    env_->DeleteFile(options.db_paths[1].path + "/" + filenames[i]);
+  }
+  env_->DeleteDir(options.db_paths[1].path);
+  Reopen(options);
+
+  Random rnd(301);
+  int key_idx = 0;
+
+  // First three 110KB files are not going to second path.
+  // After that, (100K, 200K)
+  for (int num = 0; num < 3; num++) {
+    GenerateNewFile(&rnd, &key_idx);
+  }
+
+  // Another 110KB triggers a compaction to a 400K file in the second path
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path));
+
+  // (1, 4)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  // (1,1,4) -> (2, 4)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+  // (1, 2, 4)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  // (1, 1, 2, 4) -> (8)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[3].path));
+
+  // (1, 8)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[3].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  // (1, 1, 8) -> (2, 8)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[3].path));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+
+  // (1, 2, 8)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[3].path));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  // (1, 1, 2, 8) -> (4, 8)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[3].path));
+
+  // (1, 4, 8)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[3].path));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  for (int i = 0; i < key_idx; i++) {
+    auto v = Get(Key(i));
+    ASSERT_NE(v, "NOT_FOUND");
+    ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+  }
+
+  Reopen(options);
+
+  for (int i = 0; i < key_idx; i++) {
+    auto v = Get(Key(i));
+    ASSERT_NE(v, "NOT_FOUND");
+    ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+  }
+
+  Destroy(options);
+}
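+
+// NOTE: illustrative sketch (not upstream code) of the db_paths setup the
+// test above walks through; each path carries a soft size target, and files
+// spill to the next path once earlier targets are exceeded:
+//
+//   options.db_paths.emplace_back(dbname_, 300 * 1024);         // ~300KB
+//   options.db_paths.emplace_back(dbname_ + "_2", 300 * 1024);  // overflow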
+
+TEST_P(DBTestUniversalCompaction, IncreaseUniversalCompactionNumLevels) {
+  std::function<void(int)> verify_func = [&](int num_keys_in_db) {
+    std::string keys_in_db;
+    Iterator* iter = dbfull()->NewIterator(ReadOptions(), handles_[1]);
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      keys_in_db.append(iter->key().ToString());
+      keys_in_db.push_back(',');
+    }
+    delete iter;
+
+    std::string expected_keys;
+    for (int i = 0; i <= num_keys_in_db; i++) {
+      expected_keys.append(Key(i));
+      expected_keys.push_back(',');
+    }
+
+    ASSERT_EQ(keys_in_db, expected_keys);
+  };
+
+  Random rnd(301);
+  int max_key1 = 200;
+  int max_key2 = 600;
+  int max_key3 = 800;
+
+  // Stage 1: open a DB with universal compaction, num_levels=1
+  Options options = CurrentOptions();
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = 1;
+  options.write_buffer_size = 100 << 10;  // 100KB
+  options.level0_file_num_compaction_trigger = 3;
+  options = CurrentOptions(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  for (int i = 0; i <= max_key1; i++) {
+    // each value is 10K
+    ASSERT_OK(Put(1, Key(i), RandomString(&rnd, 10000)));
+  }
+  ASSERT_OK(Flush(1));
+  dbfull()->TEST_WaitForCompact();
+
+  int non_level0_num_files = 0;
+  for (int i = 1; i < options.num_levels; i++) {
+    non_level0_num_files += NumTableFilesAtLevel(i, 1);
+  }
+  ASSERT_EQ(non_level0_num_files, 0);
+
+  // Stage 2: reopen with universal compaction, num_levels=4
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = 4;
+  options = CurrentOptions(options);
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+  verify_func(max_key1);
+
+  // Insert more keys
+  for (int i = max_key1 + 1; i <= max_key2; i++) {
+    // each value is 10K
+    ASSERT_OK(Put(1, Key(i), RandomString(&rnd, 10000)));
+  }
+  ASSERT_OK(Flush(1));
+  dbfull()->TEST_WaitForCompact();
+
+  verify_func(max_key2);
+  // Compaction to non-L0 has happened.
+  ASSERT_GT(NumTableFilesAtLevel(options.num_levels - 1, 1), 0);
+
+  // Stage 3: Compact everything back to level 0 (still at num_levels=4),
+  // then reopen with num_levels=1.
+  options.num_levels = 4;
+  options.target_file_size_base = INT_MAX;
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+  // Compact all to level 0
+  CompactRangeOptions compact_options;
+  compact_options.change_level = true;
+  compact_options.target_level = 0;
+  dbfull()->CompactRange(compact_options, handles_[1], nullptr, nullptr);
+  // Need to restart it once to remove higher level records in manifest.
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+  // Final reopen
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = 1;
+  options = CurrentOptions(options);
+  ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+  // Insert more keys
+  for (int i = max_key2 + 1; i <= max_key3; i++) {
+    // each value is 10K
+    ASSERT_OK(Put(1, Key(i), RandomString(&rnd, 10000)));
+  }
+  ASSERT_OK(Flush(1));
+  dbfull()->TEST_WaitForCompact();
+  verify_func(max_key3);
+}
+
+
+TEST_P(DBTestUniversalCompaction, UniversalCompactionSecondPathRatio) {
+  if (!Snappy_Supported()) {
+    return;
+  }
+  Options options;
+  options.db_paths.emplace_back(dbname_, 500 * 1024);
+  options.db_paths.emplace_back(dbname_ + "_2", 1024 * 1024 * 1024);
+  options.compaction_style = kCompactionStyleUniversal;
+  options.write_buffer_size = 110 << 10;  // 110KB
+  options.arena_block_size = 4 << 10;
+  options.level0_file_num_compaction_trigger = 2;
+  options.num_levels = 1;
+  options = CurrentOptions(options);
+
+  std::vector<std::string> filenames;
+  env_->GetChildren(options.db_paths[1].path, &filenames);
+  // Delete archival files.
+  for (size_t i = 0; i < filenames.size(); ++i) {
+    env_->DeleteFile(options.db_paths[1].path + "/" + filenames[i]);
+  }
+  env_->DeleteDir(options.db_paths[1].path);
+  Reopen(options);
+
+  Random rnd(301);
+  int key_idx = 0;
+
+  // As above, the first three 110KB files stay in the primary path; the
+  // tuples below give the approximate sorted-run sizes in ~100KB units.
+  for (int num = 0; num < 3; num++) {
+    GenerateNewFile(&rnd, &key_idx);
+  }
+
+  // Another 110KB flush triggers a compaction into a 400K file in the
+  // second path.
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+
+  // (1, 4)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  // (1,1,4) -> (2, 4)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  // (1, 2, 4)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(2, GetSstFileCount(dbname_));
+
+  // (1, 1, 2, 4) -> (8)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+  // (1, 8)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  // (1, 1, 8) -> (2, 8)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  // (1, 2, 8)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(2, GetSstFileCount(dbname_));
+
+  // (1, 1, 2, 8) -> (4, 8)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+  // (1, 4, 8)
+  GenerateNewFile(&rnd, &key_idx);
+  ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path));
+  ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+  for (int i = 0; i < key_idx; i++) {
+    auto v = Get(Key(i));
+    ASSERT_NE(v, "NOT_FOUND");
+    ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+  }
+
+  Reopen(options);
+
+  for (int i = 0; i < key_idx; i++) {
+    auto v = Get(Key(i));
+    ASSERT_NE(v, "NOT_FOUND");
+    ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+  }
+
+  Destroy(options);
+}
+
+INSTANTIATE_TEST_CASE_P(UniversalCompactionNumLevels, DBTestUniversalCompaction,
+                        ::testing::Values(1, 3, 5));
+
+class DBTestUniversalManualCompactionOutputPathId
+    : public DBTestUniversalCompactionBase {
+ public:
+  DBTestUniversalManualCompactionOutputPathId() :
+      DBTestUniversalCompactionBase(
+          "/db_universal_compaction_manual_pid_test") {}
+};
+
+TEST_P(DBTestUniversalManualCompactionOutputPathId,
+       ManualCompactionOutputPathId) {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.db_paths.emplace_back(dbname_, 1000000000);
+  options.db_paths.emplace_back(dbname_ + "_2", 1000000000);
+  options.compaction_style = kCompactionStyleUniversal;
+  options.num_levels = num_levels_;
+  options.target_file_size_base = 1 << 30;  // Big size
+  options.level0_file_num_compaction_trigger = 10;
+  Destroy(options);
+  DestroyAndReopen(options);
+  CreateAndReopenWithCF({"pikachu"}, options);
+  MakeTables(3, "p", "q", 1);
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(2, TotalLiveFiles(1));
+  ASSERT_EQ(2, GetSstFileCount(options.db_paths[0].path));
+  ASSERT_EQ(0, GetSstFileCount(options.db_paths[1].path));
+
+  // Full compaction to DB path 1
+  CompactRangeOptions compact_options;
+  compact_options.target_path_id = 1;
+  db_->CompactRange(compact_options, handles_[1], nullptr, nullptr);
+  ASSERT_EQ(1, TotalLiveFiles(1));
+  ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+
+  ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options);
+  ASSERT_EQ(1, TotalLiveFiles(1));
+  ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+
+  MakeTables(1, "p", "q", 1);
+  ASSERT_EQ(2, TotalLiveFiles(1));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[0].path));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+
+  ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options);
+  ASSERT_EQ(2, TotalLiveFiles(1));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[0].path));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+
+  // Full compaction to DB path 0
+  compact_options.target_path_id = 0;
+  db_->CompactRange(compact_options, handles_[1], nullptr, nullptr);
+  ASSERT_EQ(1, TotalLiveFiles(1));
+  ASSERT_EQ(1, GetSstFileCount(options.db_paths[0].path));
+  ASSERT_EQ(0, GetSstFileCount(options.db_paths[1].path));
+
+  // Fail when compacting to an invalid path ID
+  compact_options.target_path_id = 2;
+  ASSERT_TRUE(db_->CompactRange(compact_options, handles_[1], nullptr, nullptr)
+                  .IsInvalidArgument());
+}
+
+INSTANTIATE_TEST_CASE_P(DBTestUniversalManualCompactionOutputPathId,
+                        DBTestUniversalManualCompactionOutputPathId,
+                        ::testing::Values(1, 8));
+
+}  // namespace rocksdb
+
+#endif  // !(defined NDEBUG) || !defined(OS_WIN)
+
+int main(int argc, char** argv) {
+#if !(defined NDEBUG) || !defined(OS_WIN)
+  rocksdb::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+#else
+  return 0;
+#endif
+}
diff --git a/src/rocksdb/db/db_wal_test.cc b/src/rocksdb/db/db_wal_test.cc
new file mode 100644
index 0000000..531021e
--- /dev/null
+++ b/src/rocksdb/db/db_wal_test.cc
@@ -0,0 +1,144 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "port/stack_trace.h"
+#include "util/db_test_util.h"
+#if !(defined NDEBUG) || !defined(OS_WIN)
+#include "util/sync_point.h"
+#endif
+
+namespace rocksdb {
+class DBWALTest : public DBTestBase {
+ public:
+  DBWALTest() : DBTestBase("/db_wal_test") {}
+};
+
+TEST_F(DBWALTest, WAL) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    WriteOptions writeOpt = WriteOptions();
+    writeOpt.disableWAL = true;
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1"));
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1"));
+
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    ASSERT_EQ("v1", Get(1, "foo"));
+    ASSERT_EQ("v1", Get(1, "bar"));
+
+    writeOpt.disableWAL = false;
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v2"));
+    writeOpt.disableWAL = true;
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v2"));
+
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    // Both values should be present.
+    ASSERT_EQ("v2", Get(1, "bar"));
+    ASSERT_EQ("v2", Get(1, "foo"));
+
+    writeOpt.disableWAL = true;
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v3"));
+    writeOpt.disableWAL = false;
+    ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v3"));
+
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    // Again, both values should be present.
+    ASSERT_EQ("v3", Get(1, "foo"));
+    ASSERT_EQ("v3", Get(1, "bar"));
+  } while (ChangeCompactOptions());
+}
+
+TEST_F(DBWALTest, RollLog) {
+  do {
+    CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+    ASSERT_OK(Put(1, "foo", "v1"));
+    ASSERT_OK(Put(1, "baz", "v5"));
+
+    ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    for (int i = 0; i < 10; i++) {
+      ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    }
+    ASSERT_OK(Put(1, "foo", "v4"));
+    for (int i = 0; i < 10; i++) {
+      ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    }
+  } while (ChangeOptions());
+}
+
+#if !(defined NDEBUG) || !defined(OS_WIN)
+TEST_F(DBWALTest, SyncWALNotBlockWrite) {
+  Options options = CurrentOptions();
+  options.max_write_buffer_number = 4;
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("foo1", "bar1"));
+  ASSERT_OK(Put("foo5", "bar5"));
+
+  rocksdb::SyncPoint::GetInstance()->LoadDependency({
+      {"WritableFileWriter::SyncWithoutFlush:1",
+       "DBWALTest::SyncWALNotBlockWrite:1"},
+      {"DBWALTest::SyncWALNotBlockWrite:2",
+       "WritableFileWriter::SyncWithoutFlush:2"},
+  });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  std::thread thread([&]() { ASSERT_OK(db_->SyncWAL()); });
+
+  TEST_SYNC_POINT("DBWALTest::SyncWALNotBlockWrite:1");
+  ASSERT_OK(Put("foo2", "bar2"));
+  ASSERT_OK(Put("foo3", "bar3"));
+  FlushOptions fo;
+  fo.wait = false;
+  ASSERT_OK(db_->Flush(fo));
+  ASSERT_OK(Put("foo4", "bar4"));
+
+  TEST_SYNC_POINT("DBWALTest::SyncWALNotBlockWrite:2");
+
+  thread.join();
+
+  ASSERT_EQ(Get("foo1"), "bar1");
+  ASSERT_EQ(Get("foo2"), "bar2");
+  ASSERT_EQ(Get("foo3"), "bar3");
+  ASSERT_EQ(Get("foo4"), "bar4");
+  ASSERT_EQ(Get("foo5"), "bar5");
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
+
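+// SyncPoint::LoadDependency takes {predecessor, successor} pairs: a thread
+// that reaches TEST_SYNC_POINT(successor) blocks until some thread has
+// passed the predecessor marker. In SyncWALNotBlockWrite this pins the Puts
+// and the non-waiting Flush inside the window where SyncWAL is still
+// running, showing that a WAL sync does not block concurrent writes.
+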
+TEST_F(DBWALTest, SyncWALNotWaitWrite) {
+  ASSERT_OK(Put("foo1", "bar1"));
+  ASSERT_OK(Put("foo3", "bar3"));
+
+  rocksdb::SyncPoint::GetInstance()->LoadDependency({
+      {"SpecialEnv::WalFile::Append:1", "DBWALTest::SyncWALNotWaitWrite:1"},
+      {"DBWALTest::SyncWALNotWaitWrite:2", "SpecialEnv::WalFile::Append:2"},
+  });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  std::thread thread([&]() { ASSERT_OK(Put("foo2", "bar2")); });
+  TEST_SYNC_POINT("DBWALTest::SyncWALNotWaitWrite:1");
+  ASSERT_OK(db_->SyncWAL());
+  TEST_SYNC_POINT("DBWALTest::SyncWALNotWaitWrite:2");
+
+  thread.join();
+
+  ASSERT_EQ(Get("foo1"), "bar1");
+  ASSERT_EQ(Get("foo2"), "bar2");
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
+#endif
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+#if !(defined NDEBUG) || !defined(OS_WIN)
+  rocksdb::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+#else
+  return 0;
+#endif
+}
diff --git a/src/rocksdb/db/dbformat.cc b/src/rocksdb/db/dbformat.cc
index f0bd9d0..eb19a7b 100644
--- a/src/rocksdb/db/dbformat.cc
+++ b/src/rocksdb/db/dbformat.cc
@@ -8,6 +8,11 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 #include "db/dbformat.h"
 
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include <inttypes.h>
 #include <stdio.h>
 #include "port/port.h"
 #include "util/coding.h"
@@ -17,10 +22,18 @@ namespace rocksdb {
 
 uint64_t PackSequenceAndType(uint64_t seq, ValueType t) {
   assert(seq <= kMaxSequenceNumber);
-  assert(t <= kValueTypeForSeek);
+  assert(IsValueType(t));
   return (seq << 8) | t;
 }
 
+void UnPackSequenceAndType(uint64_t packed, uint64_t* seq, ValueType* t) {
+  *seq = packed >> 8;
+  *t = static_cast<ValueType>(packed & 0xff);
+
+  assert(*seq <= kMaxSequenceNumber);
+  assert(IsValueType(*t));
+}
+
 void AppendInternalKey(std::string* result, const ParsedInternalKey& key) {
   result->append(key.user_key.data(), key.user_key.size());
   PutFixed64(result, PackSequenceAndType(key.sequence, key.type));
@@ -28,9 +41,8 @@ void AppendInternalKey(std::string* result, const ParsedInternalKey& key) {
 
 std::string ParsedInternalKey::DebugString(bool hex) const {
   char buf[50];
-  snprintf(buf, sizeof(buf), "' @ %llu : %d",
-           (unsigned long long) sequence,
-           int(type));
+  snprintf(buf, sizeof(buf), "' @ %" PRIu64 ": %d", sequence,
+           static_cast<int>(type));
   std::string result = "'";
   result += user_key.ToString(hex);
   result += buf;
diff --git a/src/rocksdb/db/dbformat.h b/src/rocksdb/db/dbformat.h
index f15a8c0..2f5d59e 100644
--- a/src/rocksdb/db/dbformat.h
+++ b/src/rocksdb/db/dbformat.h
@@ -33,13 +33,13 @@ enum ValueType : unsigned char {
   kTypeDeletion = 0x0,
   kTypeValue = 0x1,
   kTypeMerge = 0x2,
-  // Following types are used only in write ahead logs. They are not used in
-  // memtables or sst files:
-  kTypeLogData = 0x3,
-  kTypeColumnFamilyDeletion = 0x4,
-  kTypeColumnFamilyValue = 0x5,
-  kTypeColumnFamilyMerge = 0x6,
-  kMaxValue = 0x7F
+  kTypeLogData = 0x3,               // WAL only.
+  kTypeColumnFamilyDeletion = 0x4,  // WAL only.
+  kTypeColumnFamilyValue = 0x5,     // WAL only.
+  kTypeColumnFamilyMerge = 0x6,     // WAL only.
+  kTypeSingleDeletion = 0x7,
+  kTypeColumnFamilySingleDeletion = 0x8,  // WAL only.
+  kMaxValue = 0x7F                        // Not used for storing records.
 };
 
 // kValueTypeForSeek defines the ValueType that should be passed when
@@ -48,7 +48,13 @@ enum ValueType : unsigned char {
 // and the value type is embedded as the low 8 bits in the sequence
 // number in internal keys, we need to use the highest-numbered
 // ValueType, not the lowest).
-static const ValueType kValueTypeForSeek = kTypeMerge;
+static const ValueType kValueTypeForSeek = kTypeSingleDeletion;
+
+// Checks whether a type is a value type (i.e. a type used in memtables and sst
+// files).
+inline bool IsValueType(ValueType t) {
+  return t <= kTypeMerge || t == kTypeSingleDeletion;
+}
 
 // We leave eight bits empty at the bottom so a type and sequence#
 // can be packed together into 64-bits.
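+// Worked example of this packing (illustrative; mirrors
+// PackSequenceAndType/UnPackSequenceAndType in dbformat.cc):
+//   uint64_t packed = (uint64_t{5} << 8) | kTypeValue;     // == 0x501
+//   uint64_t seq = packed >> 8;                            // == 5
+//   ValueType t = static_cast<ValueType>(packed & 0xff);   // == kTypeValue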
@@ -71,8 +77,13 @@ inline size_t InternalKeyEncodingLength(const ParsedInternalKey& key) {
   return key.user_key.size() + 8;
 }
 
+// Pack a sequence number and a ValueType into a uint64_t
 extern uint64_t PackSequenceAndType(uint64_t seq, ValueType t);
 
+// Given the result of PackSequenceAndType, store the sequence number in *seq
+// and the ValueType in *t.
+extern void UnPackSequenceAndType(uint64_t packed, uint64_t* seq, ValueType* t);
+
 // Append the serialization of "key" to *result.
 extern void AppendInternalKey(std::string* result,
                               const ParsedInternalKey& key);
@@ -161,6 +172,7 @@ class InternalKey {
   }
 
   Slice user_key() const { return ExtractUserKey(rep_); }
+  size_t size() const { return rep_.size(); }
 
   void SetFrom(const ParsedInternalKey& p) {
     rep_.clear();
@@ -187,17 +199,19 @@ inline bool ParseInternalKey(const Slice& internal_key,
   result->type = static_cast<ValueType>(c);
   assert(result->type <= ValueType::kMaxValue);
   result->user_key = Slice(internal_key.data(), n - 8);
-  return (c <= static_cast<unsigned char>(kValueTypeForSeek));
+  return IsValueType(result->type);
 }
 
-// Update the sequence number in the internal key
-inline void UpdateInternalKey(char* internal_key,
-                              const size_t internal_key_size,
-                              uint64_t seq, ValueType t) {
-  assert(internal_key_size >= 8);
-  char* seqtype = internal_key + internal_key_size - 8;
+// Update the sequence number in the internal key.
+// Guarantees not to invalidate ikey.data().
+inline void UpdateInternalKey(std::string* ikey, uint64_t seq, ValueType t) {
+  size_t ikey_sz = ikey->size();
+  assert(ikey_sz >= 8);
   uint64_t newval = (seq << 8) | t;
-  EncodeFixed64(seqtype, newval);
+
+  // Note: Since C++11, strings are guaranteed to be stored contiguously and
+  // string::operator[]() is guaranteed not to change ikey.data().
+  EncodeFixed64(&(*ikey)[ikey_sz - 8], newval);
 }
 
 // Get the sequence number from the internal key
@@ -263,7 +277,12 @@ class IterKey {
 
   Slice GetKey() const { return Slice(key_, key_size_); }
 
-  size_t Size() { return key_size_; }
+  Slice GetUserKey() const {
+    assert(key_size_ >= 8);
+    return Slice(key_, key_size_ - 8);
+  }
+
+  size_t Size() const { return key_size_; }
 
   void Clear() { key_size_ = 0; }
 
@@ -283,7 +302,7 @@ class IterKey {
       char* p = new char[total_size];
       memcpy(p, key_, shared_len);
 
-      if (key_ != nullptr && key_ != space_) {
+      if (key_ != space_) {
         delete[] key_;
       }
 
@@ -295,11 +314,30 @@ class IterKey {
     memcpy(key_ + shared_len, non_shared_data, non_shared_len);
   }
 
-  void SetKey(const Slice& key) {
+  Slice SetKey(const Slice& key) {
     size_t size = key.size();
     EnlargeBufferIfNeeded(size);
     memcpy(key_, key.data(), size);
     key_size_ = size;
+    return Slice(key_, key_size_);
+  }
+
+  // Copies the content of key, updates the reference to the user key in ikey
+  // and returns a Slice referencing the new copy.
+  Slice SetKey(const Slice& key, ParsedInternalKey* ikey) {
+    size_t key_n = key.size();
+    assert(key_n >= 8);
+    SetKey(key);
+    ikey->user_key = Slice(key_, key_n - 8);
+    return Slice(key_, key_n);
+  }
+
+  // Update the sequence number in the internal key.  Guarantees not to
+  // invalidate slices to the key (and the user key).
+  void UpdateInternalKey(uint64_t seq, ValueType t) {
+    assert(key_size_ >= 8);
+    uint64_t newval = (seq << 8) | t;
+    EncodeFixed64(&key_[key_size_ - 8], newval);
   }
 
   void SetInternalKey(const Slice& key_prefix, const Slice& user_key,
@@ -350,10 +388,10 @@ class IterKey {
   char space_[32];  // Avoid allocation for short keys
 
   void ResetBuffer() {
-    if (key_ != nullptr && key_ != space_) {
+    if (key_ != space_) {
       delete[] key_;
+      key_ = space_;
     }
-    key_ = space_;
     buf_size_ = sizeof(space_);
     key_size_ = 0;
   }
diff --git a/src/rocksdb/db/dbformat_test.cc b/src/rocksdb/db/dbformat_test.cc
index 56e2927..0273dd0 100644
--- a/src/rocksdb/db/dbformat_test.cc
+++ b/src/rocksdb/db/dbformat_test.cc
@@ -149,6 +149,25 @@ TEST_F(FormatTest, IterKeyOperation) {
               "abcdefghijklmnopqrstuvwxyz"));
 }
 
+TEST_F(FormatTest, UpdateInternalKey) {
+  std::string user_key("abcdefghijklmnopqrstuvwxyz");
+  uint64_t new_seq = 0x123456;
+  ValueType new_val_type = kTypeDeletion;
+
+  std::string ikey;
+  AppendInternalKey(&ikey, ParsedInternalKey(user_key, 100U, kTypeValue));
+  size_t ikey_size = ikey.size();
+  UpdateInternalKey(&ikey, new_seq, new_val_type);
+  ASSERT_EQ(ikey_size, ikey.size());
+
+  Slice in(ikey);
+  ParsedInternalKey decoded;
+  ASSERT_TRUE(ParseInternalKey(in, &decoded));
+  ASSERT_EQ(user_key, decoded.user_key.ToString());
+  ASSERT_EQ(new_seq, decoded.sequence);
+  ASSERT_EQ(new_val_type, decoded.type);
+}
+
 }  // namespace rocksdb
 
 int main(int argc, char** argv) {
diff --git a/src/rocksdb/db/deletefile_test.cc b/src/rocksdb/db/deletefile_test.cc
index 83d7b0f..b4ddad5 100644
--- a/src/rocksdb/db/deletefile_test.cc
+++ b/src/rocksdb/db/deletefile_test.cc
@@ -7,6 +7,8 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
+#ifndef ROCKSDB_LITE
+
 #include "rocksdb/db.h"
 #include "db/db_impl.h"
 #include "db/filename.h"
@@ -36,7 +38,6 @@ class DeleteFileTest : public testing::Test {
     db_ = nullptr;
     env_ = Env::Default();
     options_.enable_thread_tracking = true;
-    options_.max_background_flushes = 0;
     options_.write_buffer_size = 1024*1024*1000;
     options_.target_file_size_base = 1024*1024*1000;
     options_.max_bytes_for_level_base = 1024*1024*1000;
@@ -117,10 +118,14 @@ class DeleteFileTest : public testing::Test {
     DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
     ASSERT_OK(dbi->TEST_FlushMemTable());
     ASSERT_OK(dbi->TEST_WaitForFlushMemTable());
+    for (int i = 0; i < 2; ++i) {
+      ASSERT_OK(dbi->TEST_CompactRange(i, nullptr, nullptr));
+    }
 
     AddKeys(50000, 10000);
     ASSERT_OK(dbi->TEST_FlushMemTable());
     ASSERT_OK(dbi->TEST_WaitForFlushMemTable());
+    ASSERT_OK(dbi->TEST_CompactRange(0, nullptr, nullptr));
   }
 
   void CheckFileTypeCounts(std::string& dir,
@@ -201,8 +206,11 @@ TEST_F(DeleteFileTest, PurgeObsoleteFilesTest) {
   // 2 ssts, 1 manifest
   CheckFileTypeCounts(dbname_, 0, 2, 1);
   std::string first("0"), last("999999");
+  CompactRangeOptions compact_options;
+  compact_options.change_level = true;
+  compact_options.target_level = 2;
   Slice first_slice(first), last_slice(last);
-  db_->CompactRange(&first_slice, &last_slice, true, 2);
+  db_->CompactRange(compact_options, &first_slice, &last_slice);
   // 1 sst after compaction
   CheckFileTypeCounts(dbname_, 0, 1, 1);
 
@@ -211,7 +219,7 @@ TEST_F(DeleteFileTest, PurgeObsoleteFilesTest) {
   Iterator *itr = 0;
   CreateTwoLevels();
   itr = db_->NewIterator(ReadOptions());
-  db_->CompactRange(&first_slice, &last_slice, true, 2);
+  db_->CompactRange(compact_options, &first_slice, &last_slice);
   // 3 sst after compaction with live iterator
   CheckFileTypeCounts(dbname_, 0, 3, 1);
   delete itr;
@@ -261,11 +269,11 @@ TEST_F(DeleteFileTest, DeleteLogFiles) {
   // Should not succeed because live logs are not allowed to be deleted
   std::unique_ptr<LogFile> alive_log = std::move(logfiles.back());
   ASSERT_EQ(alive_log->Type(), kAliveLogFile);
-  ASSERT_TRUE(env_->FileExists(options_.wal_dir + "/" + alive_log->PathName()));
+  ASSERT_OK(env_->FileExists(options_.wal_dir + "/" + alive_log->PathName()));
   fprintf(stdout, "Deleting alive log file %s\n",
           alive_log->PathName().c_str());
   ASSERT_TRUE(!db_->DeleteFile(alive_log->PathName()).ok());
-  ASSERT_TRUE(env_->FileExists(options_.wal_dir + "/" + alive_log->PathName()));
+  ASSERT_OK(env_->FileExists(options_.wal_dir + "/" + alive_log->PathName()));
   logfiles.clear();
 
   // Call Flush to bring about a new working log file and add more keys
@@ -279,13 +287,13 @@ TEST_F(DeleteFileTest, DeleteLogFiles) {
   ASSERT_GT(logfiles.size(), 0UL);
   std::unique_ptr<LogFile> archived_log = std::move(logfiles.front());
   ASSERT_EQ(archived_log->Type(), kArchivedLogFile);
-  ASSERT_TRUE(env_->FileExists(options_.wal_dir + "/" +
-        archived_log->PathName()));
+  ASSERT_OK(
+      env_->FileExists(options_.wal_dir + "/" + archived_log->PathName()));
   fprintf(stdout, "Deleting archived log file %s\n",
           archived_log->PathName().c_str());
   ASSERT_OK(db_->DeleteFile(archived_log->PathName()));
-  ASSERT_TRUE(!env_->FileExists(options_.wal_dir + "/" +
-        archived_log->PathName()));
+  ASSERT_EQ(Status::NotFound(), env_->FileExists(options_.wal_dir + "/" +
+                                                 archived_log->PathName()));
   CloseDB();
 }
 
@@ -365,3 +373,13 @@ int main(int argc, char** argv) {
   return RUN_ALL_TESTS();
 }
 
+#else
+#include <stdio.h>
+
+int main(int argc, char** argv) {
+  fprintf(stderr,
+          "SKIPPED as DBImpl::DeleteFile is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/event_helpers.cc b/src/rocksdb/db/event_helpers.cc
new file mode 100644
index 0000000..9035c0c
--- /dev/null
+++ b/src/rocksdb/db/event_helpers.cc
@@ -0,0 +1,108 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include "db/event_helpers.h"
+
+namespace rocksdb {
+
+namespace {
+inline double SafeDivide(double a, double b) { return b == 0.0 ? 0 : a / b; }
+}  // namespace
+
+void EventHelpers::AppendCurrentTime(JSONWriter* jwriter) {
+  *jwriter << "time_micros"
+           << std::chrono::duration_cast<std::chrono::microseconds>(
+                  std::chrono::system_clock::now().time_since_epoch()).count();
+}
+
+void EventHelpers::LogAndNotifyTableFileCreation(
+    EventLogger* event_logger,
+    const std::vector<std::shared_ptr<EventListener>>& listeners,
+    const FileDescriptor& fd, const TableFileCreationInfo& info) {
+  assert(event_logger);
+  JSONWriter jwriter;
+  AppendCurrentTime(&jwriter);
+  jwriter << "cf_name" << info.cf_name
+          << "job" << info.job_id
+          << "event" << "table_file_creation"
+          << "file_number" << fd.GetNumber()
+          << "file_size" << fd.GetFileSize();
+
+  // table_properties
+  {
+    jwriter << "table_properties";
+    jwriter.StartObject();
+
+    // basic properties:
+    jwriter << "data_size" << info.table_properties.data_size
+            << "index_size" << info.table_properties.index_size
+            << "filter_size" << info.table_properties.filter_size
+            << "raw_key_size" << info.table_properties.raw_key_size
+            << "raw_average_key_size" << SafeDivide(
+                info.table_properties.raw_key_size,
+                info.table_properties.num_entries)
+            << "raw_value_size" << info.table_properties.raw_value_size
+            << "raw_average_value_size" << SafeDivide(
+               info.table_properties.raw_value_size,
+               info.table_properties.num_entries)
+            << "num_data_blocks" << info.table_properties.num_data_blocks
+            << "num_entries" << info.table_properties.num_entries
+            << "filter_policy_name" <<
+                info.table_properties.filter_policy_name;
+
+    // user collected properties
+    for (const auto& prop : info.table_properties.user_collected_properties) {
+      jwriter << prop.first << prop.second;
+    }
+    jwriter.EndObject();
+  }
+  jwriter.EndObject();
+
+  event_logger->Log(jwriter);
+
+#ifndef ROCKSDB_LITE
+  if (listeners.size() == 0) {
+    return;
+  }
+
+  for (auto listener : listeners) {
+    listener->OnTableFileCreated(info);
+  }
+#endif  // !ROCKSDB_LITE
+}
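+// A representative event line as emitted above (all values are made up for
+// illustration):
+//   {"time_micros": 1453738800000000, "cf_name": "default", "job": 7,
+//    "event": "table_file_creation", "file_number": 12, "file_size": 4096,
+//    "table_properties": {"data_size": 3000, "index_size": 64, ...}}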
+
+void EventHelpers::LogAndNotifyTableFileDeletion(
+    EventLogger* event_logger, int job_id,
+    uint64_t file_number, const std::string& file_path,
+    const Status& status, const std::string& dbname,
+    const std::vector<std::shared_ptr<EventListener>>& listeners) {
+
+  JSONWriter jwriter;
+  AppendCurrentTime(&jwriter);
+
+  jwriter << "job" << job_id
+          << "event" << "table_file_deletion"
+          << "file_number" << file_number;
+  if (!status.ok()) {
+    jwriter << "status" << status.ToString();
+  }
+
+  jwriter.EndObject();
+
+  event_logger->Log(jwriter);
+
+#ifndef ROCKSDB_LITE
+  TableFileDeletionInfo info;
+  info.db_name = dbname;
+  info.job_id = job_id;
+  info.file_path = file_path;
+  info.status = status;
+  for (auto listener : listeners) {
+    listener->OnTableFileDeleted(info);
+  }
+#endif  // !ROCKSDB_LITE
+}
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/db/event_helpers.h b/src/rocksdb/db/event_helpers.h
new file mode 100644
index 0000000..a60bc9a
--- /dev/null
+++ b/src/rocksdb/db/event_helpers.h
@@ -0,0 +1,33 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/version_edit.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/table_properties.h"
+#include "util/event_logger.h"
+
+namespace rocksdb {
+
+class EventHelpers {
+ public:
+  static void AppendCurrentTime(JSONWriter* json_writer);
+  static void LogAndNotifyTableFileCreation(
+      EventLogger* event_logger,
+      const std::vector<std::shared_ptr<EventListener>>& listeners,
+      const FileDescriptor& fd, const TableFileCreationInfo& info);
+  static void LogAndNotifyTableFileDeletion(
+      EventLogger* event_logger, int job_id,
+      uint64_t file_number, const std::string& file_path,
+      const Status& status, const std::string& db_name,
+      const std::vector<std::shared_ptr<EventListener>>& listeners);
+};
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/db/event_logger_helpers.cc b/src/rocksdb/db/event_logger_helpers.cc
deleted file mode 100644
index 521b684..0000000
--- a/src/rocksdb/db/event_logger_helpers.cc
+++ /dev/null
@@ -1,46 +0,0 @@
-//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
-//  This source code is licensed under the BSD-style license found in the
-//  LICENSE file in the root directory of this source tree. An additional grant
-//  of patent rights can be found in the PATENTS file in the same directory.
-
-#include "db/event_logger_helpers.h"
-
-namespace rocksdb {
-
-namespace {
-inline double SafeDivide(double a, double b) { return b == 0.0 ? 0 : a / b; }
-}  // namespace
-
-void EventLoggerHelpers::LogTableFileCreation(
-    EventLogger* event_logger, int job_id, uint64_t file_number,
-    uint64_t file_size, const TableProperties& table_properties) {
-  auto stream = event_logger->Log();
-  stream << "job" << job_id << "event"
-         << "table_file_creation"
-         << "file_number" << file_number << "file_size" << file_size
-         << "table_properties";
-  stream.StartObject();
-
-  // basic properties:
-  stream << "data_size" << table_properties.data_size
-         << "index_size" << table_properties.index_size
-         << "filter_size" << table_properties.filter_size
-         << "raw_key_size" << table_properties.raw_key_size
-         << "raw_average_key_size" << SafeDivide(table_properties.raw_key_size,
-             table_properties.num_entries)
-         << "raw_value_size" << table_properties.raw_value_size
-         << "raw_average_value_size" << SafeDivide(
-             table_properties.raw_value_size, table_properties.num_entries)
-         << "num_data_blocks" << table_properties.num_data_blocks
-         << "num_entries" << table_properties.num_entries
-         << "filter_policy_name" << table_properties.filter_policy_name;
-
-  // user collected properties
-  for (const auto& prop : table_properties.user_collected_properties) {
-    stream << prop.first << prop.second;
-  }
-
-  stream.EndObject();
-}
-
-}  // namespace rocksdb
diff --git a/src/rocksdb/db/event_logger_helpers.h b/src/rocksdb/db/event_logger_helpers.h
deleted file mode 100644
index 86e9adc..0000000
--- a/src/rocksdb/db/event_logger_helpers.h
+++ /dev/null
@@ -1,18 +0,0 @@
-//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
-//  This source code is licensed under the BSD-style license found in the
-//  LICENSE file in the root directory of this source tree. An additional grant
-//  of patent rights can be found in the PATENTS file in the same directory.
-#pragma once
-
-#include "util/event_logger.h"
-#include "rocksdb/table_properties.h"
-
-namespace rocksdb {
-
-class EventLoggerHelpers {
- public:
-  static void LogTableFileCreation(EventLogger* event_logger, int job_id,
-                                   uint64_t file_number, uint64_t file_size,
-                                   const TableProperties& table_properties);
-};
-}  // namespace rocksdb
diff --git a/src/rocksdb/db/fault_injection_test.cc b/src/rocksdb/db/fault_injection_test.cc
index 6926e24..84a6e9a 100644
--- a/src/rocksdb/db/fault_injection_test.cc
+++ b/src/rocksdb/db/fault_injection_test.cc
@@ -11,6 +11,8 @@
 // the last "sync". It then checks for data loss errors by purposely dropping
 // file data (or entire files) not protected by a "sync".
 
+#if !(defined NDEBUG) || !defined(OS_WIN)
+
 #include <map>
 #include <set>
 #include "db/db_impl.h"
@@ -25,6 +27,7 @@
 #include "util/logging.h"
 #include "util/mock_env.h"
 #include "util/mutexlock.h"
+#include "util/sync_point.h"
 #include "util/testharness.h"
 #include "util/testutil.h"
 
@@ -77,9 +80,12 @@ Status Truncate(Env* env, const std::string& filename, uint64_t length) {
     return s;
   }
 
-  char* scratch = new char[length];
+  std::unique_ptr<char[]> scratch(new char[length]);
   rocksdb::Slice result;
-  s = orig_file->Read(length, &result, scratch);
+  s = orig_file->Read(length, &result, scratch.get());
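+  // On Windows the handle must be released before the truncated copy can
+  // be renamed over the original file, since open files cannot be
+  // replaced there.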
+#ifdef OS_WIN
+  orig_file.reset();
+#endif
   if (s.ok()) {
     std::string tmp_name = GetDirName(filename) + "/truncate.tmp";
     unique_ptr<WritableFile> tmp_file;
@@ -100,8 +106,6 @@ Status Truncate(Env* env, const std::string& filename, uint64_t length) {
             s.ToString().c_str());
   }
 
-  delete[] scratch;
-
   return s;
 }
 
@@ -128,7 +132,7 @@ struct FileState {
 
 }  // anonymous namespace
 
-// A wrapper around WritableFile which informs another Env whenever this file
+// A wrapper around WritableFile which informs the test Env whenever this file
 // is written to or sync'ed.
 class TestWritableFile : public WritableFile {
  public:
@@ -137,9 +141,11 @@ class TestWritableFile : public WritableFile {
                             FaultInjectionTestEnv* env);
   virtual ~TestWritableFile();
   virtual Status Append(const Slice& data) override;
+  virtual Status Truncate(uint64_t size) override {
+    return target_->Truncate(size);
+  }
   virtual Status Close() override;
   virtual Status Flush() override;
   virtual Status Sync() override;
+  virtual bool IsSyncThreadSafe() const override { return true; }
 
  private:
   FileState state_;
@@ -185,10 +191,21 @@ class FaultInjectionTestEnv : public EnvWrapper {
   Status NewWritableFile(const std::string& fname,
                          unique_ptr<WritableFile>* result,
                          const EnvOptions& soptions) override {
-    Status s = target()->NewWritableFile(fname, result, soptions);
+    if (!IsFilesystemActive()) {
+      return Status::Corruption("Not Active");
+    }
+    // Do not allow overwriting an existing file
+    Status s = target()->FileExists(fname);
+    if (s.ok()) {
+      return Status::Corruption("File already exists.");
+    } else if (!s.IsNotFound()) {
+      assert(s.IsIOError());
+      return s;
+    }
+    s = target()->NewWritableFile(fname, result, soptions);
     if (s.ok()) {
       result->reset(new TestWritableFile(fname, std::move(*result), this));
-      // WritableFile doesn't append to files, so if the same file is opened
+      // WritableFile doesn't append to files, so if the same file is opened
       // again then it will be truncated - so forget our saved state.
       UntrackFile(fname);
       MutexLock l(&mutex_);
@@ -201,6 +218,9 @@ class FaultInjectionTestEnv : public EnvWrapper {
   }
 
   virtual Status DeleteFile(const std::string& f) override {
+    if (!IsFilesystemActive()) {
+      return Status::Corruption("Not Active");
+    }
     Status s = EnvWrapper::DeleteFile(f);
     if (!s.ok()) {
       fprintf(stderr, "Cannot delete file %s: %s\n", f.c_str(),
@@ -215,6 +235,9 @@ class FaultInjectionTestEnv : public EnvWrapper {
 
   virtual Status RenameFile(const std::string& s,
                             const std::string& t) override {
+    if (!IsFilesystemActive()) {
+      return Status::Corruption("Not Active");
+    }
     Status ret = EnvWrapper::RenameFile(s, t);
 
     if (ret.ok()) {
@@ -373,8 +396,11 @@ TestWritableFile::~TestWritableFile() {
 }
 
 Status TestWritableFile::Append(const Slice& data) {
+  if (!env_->IsFilesystemActive()) {
+    return Status::Corruption("Not Active");
+  }
   Status s = target_->Append(data);
-  if (s.ok() && env_->IsFilesystemActive()) {
+  if (s.ok()) {
     state_.pos_ += data.size();
   }
   return s;
@@ -406,7 +432,8 @@ Status TestWritableFile::Sync() {
   return Status::OK();
 }
 
-class FaultInjectionTest : public testing::Test {
+class FaultInjectionTest : public testing::Test,
+                           public testing::WithParamInterface<bool> {
  protected:
   enum OptionConfig {
     kDefault,
@@ -423,6 +450,8 @@ class FaultInjectionTest : public testing::Test {
   // When need to make sure data is persistent, call DB::CompactRange()
   bool sync_use_compact_;
 
+  bool sequential_order_;
+
  protected:
  public:
   enum ExpectedVerifResult { kValExpectFound, kValExpectNoError };
@@ -449,6 +478,11 @@ class FaultInjectionTest : public testing::Test {
         db_(NULL) {
   }
 
+  ~FaultInjectionTest() {
+    rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+    rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks();
+  }
+
   bool ChangeOptions() {
     option_config_++;
     if (option_config_ >= kEnd) {
@@ -529,7 +563,10 @@ class FaultInjectionTest : public testing::Test {
     return s;
   }
 
-  void SetUp() override { ASSERT_OK(NewDB()); }
+  void SetUp() override {
+    sequential_order_ = GetParam();
+    ASSERT_OK(NewDB());
+  }
 
   void TearDown() override {
     CloseDB();
@@ -591,8 +628,15 @@ class FaultInjectionTest : public testing::Test {
 
   // Return the ith key
   Slice Key(int i, std::string* storage) const {
+    int num = i;
+    if (!sequential_order_) {
+      // Scramble the key order with a multiplicative hash
+      // (0x5bd1e995 is the MurmurHash2 multiplier).
+      const int m = 0x5bd1e995;
+      num *= m;
+      num ^= num << 24;
+    }
     char buf[100];
-    snprintf(buf, sizeof(buf), "%016d", i);
+    snprintf(buf, sizeof(buf), "%016d", num);
     storage->assign(buf, strlen(buf));
     return Slice(*storage);
   }
@@ -659,7 +703,7 @@ class FaultInjectionTest : public testing::Test {
 
     Build(write_options, 0, num_pre_sync);
     if (sync_use_compact_) {
-      db_->CompactRange(nullptr, nullptr);
+      db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
     }
     write_options.sync = false;
     Build(write_options, num_pre_sync, num_post_sync);
@@ -675,6 +719,10 @@ class FaultInjectionTest : public testing::Test {
     ASSERT_OK(Verify(0, num_pre_sync, FaultInjectionTest::kValExpectFound));
     ASSERT_OK(Verify(num_pre_sync, num_post_sync,
                      FaultInjectionTest::kValExpectNoError));
+    WaitCompactionFinish();
+    ASSERT_OK(Verify(0, num_pre_sync, FaultInjectionTest::kValExpectFound));
+    ASSERT_OK(Verify(num_pre_sync, num_post_sync,
+                     FaultInjectionTest::kValExpectNoError));
   }
 
   void NoWriteTestPreFault() {
@@ -685,9 +733,14 @@ class FaultInjectionTest : public testing::Test {
     ResetDBState(reset_method);
     ASSERT_OK(OpenDB());
   }
+
+  void WaitCompactionFinish() {
+    static_cast<DBImpl*>(db_)->TEST_WaitForCompact();
+    ASSERT_OK(db_->Put(WriteOptions(), "", ""));
+  }
 };
 
-TEST_F(FaultInjectionTest, FaultTest) {
+TEST_P(FaultInjectionTest, FaultTest) {
   do {
     Random rnd(301);
 
@@ -726,46 +779,95 @@ TEST_F(FaultInjectionTest, FaultTest) {
   } while (ChangeOptions());
 }
 
-class SleepingBackgroundTask {
- public:
-  SleepingBackgroundTask()
-      : bg_cv_(&mutex_), should_sleep_(true), done_with_sleep_(false) {}
-  void DoSleep() {
-    MutexLock l(&mutex_);
-    while (should_sleep_) {
-      bg_cv_.Wait();
-    }
-    done_with_sleep_ = true;
-    bg_cv_.SignalAll();
-  }
-  void WakeUp() {
-    MutexLock l(&mutex_);
-    should_sleep_ = false;
-    bg_cv_.SignalAll();
-    while (!done_with_sleep_) {
-      bg_cv_.Wait();
-    }
-  }
+// Regression test: the previous log file must be fsynced when sync is
+// forced after log rolling.
+TEST_P(FaultInjectionTest, WriteOptionSyncTest) {
+  test::SleepingBackgroundTask sleeping_task_low;
+  env_->SetBackgroundThreads(1, Env::HIGH);
+  // Block the job queue to prevent flush job from running.
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+                 Env::Priority::HIGH);
 
-  static void DoSleepTask(void* arg) {
-    reinterpret_cast<SleepingBackgroundTask*>(arg)->DoSleep();
+  WriteOptions write_options;
+  write_options.sync = false;
+
+  std::string key_space, value_space;
+  ASSERT_OK(
+      db_->Put(write_options, Key(1, &key_space), Value(1, &value_space)));
+  FlushOptions flush_options;
+  flush_options.wait = false;
+  ASSERT_OK(db_->Flush(flush_options));
+  write_options.sync = true;
+  ASSERT_OK(
+      db_->Put(write_options, Key(2, &key_space), Value(2, &value_space)));
+
+  env_->SetFilesystemActive(false);
+  NoWriteTestReopenWithFault(kResetDropAndDeleteUnsynced);
+  sleeping_task_low.WakeUp();
+
+  ASSERT_OK(OpenDB());
+  std::string val;
+  Value(2, &value_space);
+  ASSERT_OK(ReadValue(2, &val));
+  ASSERT_EQ(value_space, val);
+
+  Value(1, &value_space);
+  ASSERT_OK(ReadValue(1, &val));
+  ASSERT_EQ(value_space, val);
+}
+
+TEST_P(FaultInjectionTest, UninstalledCompaction) {
+  options_.target_file_size_base = 32 * 1024;
+  options_.write_buffer_size = 100 << 10;  // 100KB
+  options_.level0_file_num_compaction_trigger = 6;
+  options_.level0_stop_writes_trigger = 1 << 10;
+  options_.level0_slowdown_writes_trigger = 1 << 10;
+  options_.max_background_compactions = 1;
+  OpenDB();
+
+  if (!sequential_order_) {
+    rocksdb::SyncPoint::GetInstance()->LoadDependency({
+        {"FaultInjectionTest::FaultTest:0", "DBImpl::BGWorkCompaction"},
+        {"CompactionJob::Run():End", "FaultInjectionTest::FaultTest:1"},
+        {"FaultInjectionTest::FaultTest:2",
+         "DBImpl::BackgroundCompaction:NonTrivial:AfterRun"},
+    });
   }
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
 
- private:
-  port::Mutex mutex_;
-  port::CondVar bg_cv_;  // Signalled when background work finishes
-  bool should_sleep_;
-  bool done_with_sleep_;
-};
+  int kNumKeys = 1000;
+  Build(WriteOptions(), 0, kNumKeys);
+  FlushOptions flush_options;
+  flush_options.wait = true;
+  db_->Flush(flush_options);
+  ASSERT_OK(db_->Put(WriteOptions(), "", ""));
+  TEST_SYNC_POINT("FaultInjectionTest::FaultTest:0");
+  TEST_SYNC_POINT("FaultInjectionTest::FaultTest:1");
+  env_->SetFilesystemActive(false);
+  TEST_SYNC_POINT("FaultInjectionTest::FaultTest:2");
+  CloseDB();
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+  ResetDBState(kResetDropUnsyncedData);
+
+  std::atomic<bool> opened(false);
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::Open:Opened", [&](void* arg) { opened.store(true); });
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BGWorkCompaction",
+      [&](void* arg) { ASSERT_TRUE(opened.load()); });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_OK(OpenDB());
+  ASSERT_OK(Verify(0, kNumKeys, FaultInjectionTest::kValExpectFound));
+  WaitCompactionFinish();
+  ASSERT_OK(Verify(0, kNumKeys, FaultInjectionTest::kValExpectFound));
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+  rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
 
-// Disable the test because it is not passing.
-// Previous log file is not fsynced if sync is forced after log rolling.
-// TODO(FB internal task#6730880) Fix the bug
-TEST_F(FaultInjectionTest, DISABLED_WriteOptionSyncTest) {
-  SleepingBackgroundTask sleeping_task_low;
+TEST_P(FaultInjectionTest, ManualLogSyncTest) {
+  test::SleepingBackgroundTask sleeping_task_low;
   env_->SetBackgroundThreads(1, Env::HIGH);
   // Block the job queue to prevent flush job from running.
-  env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
                  Env::Priority::HIGH);
 
   WriteOptions write_options;
@@ -777,9 +879,9 @@ TEST_F(FaultInjectionTest, DISABLED_WriteOptionSyncTest) {
   FlushOptions flush_options;
   flush_options.wait = false;
   ASSERT_OK(db_->Flush(flush_options));
-  write_options.sync = true;
   ASSERT_OK(
       db_->Put(write_options, Key(2, &key_space), Value(2, &value_space)));
+  ASSERT_OK(db_->SyncWAL());
 
   env_->SetFilesystemActive(false);
   NoWriteTestReopenWithFault(kResetDropAndDeleteUnsynced);
@@ -796,9 +898,17 @@ TEST_F(FaultInjectionTest, DISABLED_WriteOptionSyncTest) {
   ASSERT_EQ(value_space, val);
 }
 
+INSTANTIATE_TEST_CASE_P(FaultTest, FaultInjectionTest, ::testing::Bool());
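+// (The bool parameter becomes sequential_order_ in SetUp(), so every test
+// in the suite runs once with sequential and once with scrambled keys.)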
+
 }  // namespace rocksdb
 
+#endif // #if !(defined NDEBUG) || !defined(OS_WIN)
+
 int main(int argc, char** argv) {
+#if !(defined NDEBUG) || !defined(OS_WIN)
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
+#else
+  return 0;
+#endif
 }
diff --git a/src/rocksdb/db/file_indexer.h b/src/rocksdb/db/file_indexer.h
index e673499..3a335be 100644
--- a/src/rocksdb/db/file_indexer.h
+++ b/src/rocksdb/db/file_indexer.h
@@ -12,6 +12,7 @@
 #include <functional>
 #include <limits>
 #include <vector>
+#include "port/port.h"
 #include "util/arena.h"
 #include "util/autovector.h"
 
@@ -58,7 +59,8 @@ class FileIndexer {
                    std::vector<FileMetaData*>* const files);
 
   enum {
-    kLevelMaxIndex = std::numeric_limits<int32_t>::max()
+    // MSVC version 1800 still does not have constexpr for ::max()
+    kLevelMaxIndex = rocksdb::port::kMaxInt32
   };
 
  private:
diff --git a/src/rocksdb/db/filename.cc b/src/rocksdb/db/filename.cc
index 160005d..e152037 100644
--- a/src/rocksdb/db/filename.cc
+++ b/src/rocksdb/db/filename.cc
@@ -18,11 +18,15 @@
 #include <vector>
 #include "db/dbformat.h"
 #include "rocksdb/env.h"
+#include "util/file_reader_writer.h"
 #include "util/logging.h"
 #include "util/stop_watch.h"
 
 namespace rocksdb {
 
+static const std::string kRocksDbTFileExt = "sst";
+static const std::string kLevelDbTFileExt = "ldb";
+
 // Given a path, flatten the path name by replacing all chars not in
 // {[0-9,a-z,A-Z,-,_,.]} with _. And append '_LOG\0' at the end.
 // Return the number of chars stored in dest not including the trailing '\0'.
@@ -77,7 +81,16 @@ std::string ArchivedLogFileName(const std::string& name, uint64_t number) {
 }
 
 std::string MakeTableFileName(const std::string& path, uint64_t number) {
-  return MakeFileName(path, number, "sst");
+  return MakeFileName(path, number, kRocksDbTFileExt.c_str());
+}
+
+std::string Rocks2LevelTableFileName(const std::string& fullname) {
+  assert(fullname.size() > kRocksDbTFileExt.size() + 1);
+  if (fullname.size() <= kRocksDbTFileExt.size() + 1) {
+    return "";
+  }
+  return fullname.substr(0, fullname.size() - kRocksDbTFileExt.size()) +
+         kLevelDbTFileExt;
 }
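+// For example, Rocks2LevelTableFileName("/a/000123.sst") returns
+// "/a/000123.ldb": the trailing "sst" is replaced with "ldb" and the dot
+// is preserved.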
 
 uint64_t TableFileNameToNumber(const std::string& name) {
@@ -103,8 +116,6 @@ std::string TableFileName(const std::vector<DbPath>& db_paths, uint64_t number,
   return MakeTableFileName(path, number);
 }
 
-const size_t kFormatFileNumberBufSize = 38;
-
 void FormatFileNumber(uint64_t number, uint32_t path_id, char* out_buf,
                       size_t out_buf_size) {
   if (path_id == 0) {
@@ -152,8 +163,9 @@ InfoLogPrefix::InfoLogPrefix(bool has_log_dir,
 
 std::string InfoLogFileName(const std::string& dbname,
     const std::string& db_path, const std::string& log_dir) {
-  if (log_dir.empty())
+  if (log_dir.empty()) {
     return dbname + "/LOG";
+  }
 
   InfoLogPrefix info_log_prefix(true, db_path);
   return log_dir + "/" + info_log_prefix.buf;
@@ -165,8 +177,9 @@ std::string OldInfoLogFileName(const std::string& dbname, uint64_t ts,
   char buf[50];
   snprintf(buf, sizeof(buf), "%llu", static_cast<unsigned long long>(ts));
 
-  if (log_dir.empty())
+  if (log_dir.empty()) {
     return dbname + "/LOG.old." + buf;
+  }
 
   InfoLogPrefix info_log_prefix(true, db_path);
   return log_dir + "/" + info_log_prefix.buf + ".old." + buf;
@@ -272,17 +285,23 @@ bool ParseFileName(const std::string& fname, uint64_t* number,
     if (!ConsumeDecimalNumber(&rest, &num)) {
       return false;
     }
+    if (rest.size() <= 1 || rest[0] != '.') {
+      return false;
+    }
+    rest.remove_prefix(1);
+
     Slice suffix = rest;
-    if (suffix == Slice(".log")) {
+    if (suffix == Slice("log")) {
       *type = kLogFile;
       if (log_type && !archive_dir_found) {
         *log_type = kAliveLogFile;
       }
     } else if (archive_dir_found) {
       return false; // Archive dir can contain only log files
-    } else if (suffix == Slice(".sst")) {
+    } else if (suffix == Slice(kRocksDbTFileExt) ||
+               suffix == Slice(kLevelDbTFileExt)) {
       *type = kTableFile;
-    } else if (suffix == Slice(".dbtmp")) {
+    } else if (suffix == Slice("dbtmp")) {
       *type = kTempFile;
     } else {
       return false;
@@ -330,15 +349,13 @@ Status SetIdentityFile(Env* env, const std::string& dbname) {
   return s;
 }
 
-Status SyncManifest(Env* env, const DBOptions* db_options, WritableFile* file) {
+Status SyncManifest(Env* env, const DBOptions* db_options,
+                    WritableFileWriter* file) {
   if (db_options->disableDataSync) {
     return Status::OK();
-  } else if (db_options->use_fsync) {
-    StopWatch sw(env, db_options->statistics.get(), MANIFEST_FILE_SYNC_MICROS);
-    return file->Fsync();
   } else {
     StopWatch sw(env, db_options->statistics.get(), MANIFEST_FILE_SYNC_MICROS);
-    return file->Sync();
+    return file->Sync(db_options->use_fsync);
   }
 }
 
diff --git a/src/rocksdb/db/filename.h b/src/rocksdb/db/filename.h
index 33f5ace..926f027 100644
--- a/src/rocksdb/db/filename.h
+++ b/src/rocksdb/db/filename.h
@@ -25,7 +25,7 @@ namespace rocksdb {
 
 class Env;
 class Directory;
-class WritableFile;
+class WritableFileWriter;
 
 enum FileType {
   kLogFile,
@@ -55,6 +55,10 @@ extern std::string ArchivedLogFileName(const std::string& dbname,
 
 extern std::string MakeTableFileName(const std::string& name, uint64_t number);
 
+// Return the name of sstable with LevelDB suffix
+// created from RocksDB sstable suffixed name
+extern std::string Rocks2LevelTableFileName(const std::string& fullname);
+
 // the reverse function of MakeTableFileName
 // TODO(yhchiang): could merge this function with ParseFileName()
 extern uint64_t TableFileNameToNumber(const std::string& name);
@@ -66,7 +70,7 @@ extern std::string TableFileName(const std::vector<DbPath>& db_paths,
                                  uint64_t number, uint32_t path_id);
 
 // Sufficient buffer size for FormatFileNumber.
-extern const size_t kFormatFileNumberBufSize;
+const size_t kFormatFileNumberBufSize = 38;
 
 extern void FormatFileNumber(uint64_t number, uint32_t path_id, char* out_buf,
                              size_t out_buf_size);
@@ -102,11 +106,13 @@ struct InfoLogPrefix {
 
 // Return the name of the info log file for "dbname".
 extern std::string InfoLogFileName(const std::string& dbname,
-    const std::string& db_path="", const std::string& log_dir="");
+                                   const std::string& db_path = "",
+                                   const std::string& log_dir = "");
 
 // Return the name of the old info log file for "dbname".
 extern std::string OldInfoLogFileName(const std::string& dbname, uint64_t ts,
-    const std::string& db_path="", const std::string& log_dir="");
+                                      const std::string& db_path = "",
+                                      const std::string& log_dir = "");
 
 // Return the name to use for a metadatabase. The result will be prefixed with
 // "dbname".
@@ -140,6 +146,6 @@ extern Status SetIdentityFile(Env* env, const std::string& dbname);
 
 // Sync manifest file `file`.
 extern Status SyncManifest(Env* env, const DBOptions* db_options,
-                           WritableFile* file);
+                           WritableFileWriter* file);
 
 }  // namespace rocksdb
diff --git a/src/rocksdb/db/flush_job.cc b/src/rocksdb/db/flush_job.cc
index 0f6c85f..410108a 100644
--- a/src/rocksdb/db/flush_job.cc
+++ b/src/rocksdb/db/flush_job.cc
@@ -14,13 +14,14 @@
 #endif
 
 #include <inttypes.h>
+
 #include <algorithm>
 #include <vector>
 
 #include "db/builder.h"
 #include "db/db_iter.h"
 #include "db/dbformat.h"
-#include "db/event_logger_helpers.h"
+#include "db/event_helpers.h"
 #include "db/filename.h"
 #include "db/log_reader.h"
 #include "db/log_writer.h"
@@ -28,8 +29,8 @@
 #include "db/memtable_list.h"
 #include "db/merge_context.h"
 #include "db/version_set.h"
-#include "port/port.h"
 #include "port/likely.h"
+#include "port/port.h"
 #include "rocksdb/db.h"
 #include "rocksdb/env.h"
 #include "rocksdb/statistics.h"
@@ -43,11 +44,11 @@
 #include "util/coding.h"
 #include "util/event_logger.h"
 #include "util/file_util.h"
-#include "util/logging.h"
+#include "util/iostats_context_imp.h"
 #include "util/log_buffer.h"
+#include "util/logging.h"
 #include "util/mutexlock.h"
 #include "util/perf_context_imp.h"
-#include "util/iostats_context_imp.h"
 #include "util/stop_watch.h"
 #include "util/sync_point.h"
 #include "util/thread_status_util.h"
@@ -60,9 +61,9 @@ FlushJob::FlushJob(const std::string& dbname, ColumnFamilyData* cfd,
                    const EnvOptions& env_options, VersionSet* versions,
                    InstrumentedMutex* db_mutex,
                    std::atomic<bool>* shutting_down,
-                   SequenceNumber newest_snapshot, JobContext* job_context,
-                   LogBuffer* log_buffer, Directory* db_directory,
-                   Directory* output_file_directory,
+                   std::vector<SequenceNumber> existing_snapshots,
+                   JobContext* job_context, LogBuffer* log_buffer,
+                   Directory* db_directory, Directory* output_file_directory,
                    CompressionType output_compression, Statistics* stats,
                    EventLogger* event_logger)
     : dbname_(dbname),
@@ -73,7 +74,7 @@ FlushJob::FlushJob(const std::string& dbname, ColumnFamilyData* cfd,
       versions_(versions),
       db_mutex_(db_mutex),
       shutting_down_(shutting_down),
-      newest_snapshot_(newest_snapshot),
+      existing_snapshots_(std::move(existing_snapshots)),
       job_context_(job_context),
       log_buffer_(log_buffer),
       db_directory_(db_directory),
@@ -87,7 +88,6 @@ FlushJob::FlushJob(const std::string& dbname, ColumnFamilyData* cfd,
 }
 
 FlushJob::~FlushJob() {
-  TEST_SYNC_POINT("FlushJob::~FlushJob()");
   ThreadStatusUtil::ResetThreadStatus();
 }
 
@@ -111,16 +111,15 @@ void FlushJob::ReportFlushInputSize(const autovector<MemTable*>& mems) {
 }
 
 void FlushJob::RecordFlushIOStats() {
-  ThreadStatusUtil::IncreaseThreadOperationProperty(
+  ThreadStatusUtil::SetThreadOperationProperty(
       ThreadStatus::FLUSH_BYTES_WRITTEN, IOSTATS(bytes_written));
-  IOSTATS_RESET(bytes_written);
 }
 
-Status FlushJob::Run(uint64_t* file_number) {
+Status FlushJob::Run(FileMetaData* file_meta) {
   AutoThreadOperationStageUpdater stage_run(
       ThreadStatus::STAGE_FLUSH_RUN);
   // Save the contents of the earliest memtable as a new Table
-  uint64_t fn;
+  FileMetaData meta;
   autovector<MemTable*> mems;
   cfd_->imm()->PickMemtablesToFlush(&mems);
   if (mems.empty()) {
@@ -143,7 +142,7 @@ Status FlushJob::Run(uint64_t* file_number) {
   edit->SetColumnFamily(cfd_->GetID());
 
   // This will release and re-acquire the mutex.
-  Status s = WriteLevel0Table(mems, edit, &fn);
+  Status s = WriteLevel0Table(mems, edit, &meta);
 
   if (s.ok() &&
       (shutting_down_->load(std::memory_order_acquire) || cfd_->IsDropped())) {
@@ -152,16 +151,18 @@ Status FlushJob::Run(uint64_t* file_number) {
   }
 
   if (!s.ok()) {
-    cfd_->imm()->RollbackMemtableFlush(mems, fn);
+    cfd_->imm()->RollbackMemtableFlush(mems, meta.fd.GetNumber());
   } else {
+    TEST_SYNC_POINT("FlushJob::InstallResults");
     // Replace immutable memtable with the generated Table
     s = cfd_->imm()->InstallMemtableFlushResults(
-        cfd_, mutable_cf_options_, mems, versions_, db_mutex_, fn,
-        &job_context_->memtables_to_free, db_directory_, log_buffer_);
+        cfd_, mutable_cf_options_, mems, versions_, db_mutex_,
+        meta.fd.GetNumber(), &job_context_->memtables_to_free, db_directory_,
+        log_buffer_);
   }
 
-  if (s.ok() && file_number != nullptr) {
-    *file_number = fn;
+  if (s.ok() && file_meta != nullptr) {
+    *file_meta = meta;
   }
   RecordFlushIOStats();
 
@@ -180,18 +181,14 @@ Status FlushJob::Run(uint64_t* file_number) {
 }
 
 Status FlushJob::WriteLevel0Table(const autovector<MemTable*>& mems,
-                                  VersionEdit* edit, uint64_t* filenumber) {
+                                  VersionEdit* edit, FileMetaData* meta) {
   AutoThreadOperationStageUpdater stage_updater(
       ThreadStatus::STAGE_FLUSH_WRITE_L0);
   db_mutex_->AssertHeld();
   const uint64_t start_micros = db_options_.env->NowMicros();
-  FileMetaData meta;
   // path 0 for level 0 file.
-  meta.fd = FileDescriptor(versions_->NewFileNumber(), 0, 0);
-  *filenumber = meta.fd.GetNumber();
+  meta->fd = FileDescriptor(versions_->NewFileNumber(), 0, 0);
 
-  const SequenceNumber earliest_seqno_in_memtable =
-      mems[0]->GetFirstSequenceNumber();
   Version* base = cfd_->current();
   base->Ref();  // it is likely that we do not need this reference
   Status s;
@@ -222,37 +219,48 @@ Status FlushJob::WriteLevel0Table(const autovector<MemTable*>& mems,
                          << total_num_entries << "num_deletes"
                          << total_num_deletes << "memory_usage"
                          << total_memory_usage;
-    TableProperties table_properties;
+
+    TableFileCreationInfo info;
     {
       ScopedArenaIterator iter(
           NewMergingIterator(&cfd_->internal_comparator(), &memtables[0],
                              static_cast<int>(memtables.size()), &arena));
       Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
           "[%s] [JOB %d] Level-0 flush table #%" PRIu64 ": started",
-          cfd_->GetName().c_str(), job_context_->job_id, meta.fd.GetNumber());
+          cfd_->GetName().c_str(), job_context_->job_id, meta->fd.GetNumber());
 
       TEST_SYNC_POINT_CALLBACK("FlushJob::WriteLevel0Table:output_compression",
                                &output_compression_);
-      s = BuildTable(dbname_, db_options_.env, *cfd_->ioptions(), env_options_,
-                     cfd_->table_cache(), iter.get(), &meta,
-                     cfd_->internal_comparator(),
-                     cfd_->int_tbl_prop_collector_factories(), newest_snapshot_,
-                     earliest_seqno_in_memtable, output_compression_,
-                     cfd_->ioptions()->compression_opts,
-                     mutable_cf_options_.paranoid_file_checks, Env::IO_HIGH,
-                     &table_properties);
+      s = BuildTable(
+          dbname_, db_options_.env, *cfd_->ioptions(), env_options_,
+          cfd_->table_cache(), iter.get(), meta, cfd_->internal_comparator(),
+          cfd_->int_tbl_prop_collector_factories(), existing_snapshots_,
+          output_compression_, cfd_->ioptions()->compression_opts,
+          mutable_cf_options_.paranoid_file_checks, cfd_->internal_stats(),
+          Env::IO_HIGH, &info.table_properties);
       LogFlush(db_options_.info_log);
     }
     Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
-        "[%s] [JOB %d] Level-0 flush table #%" PRIu64 ": %" PRIu64 " bytes %s",
-        cfd_->GetName().c_str(), job_context_->job_id, meta.fd.GetNumber(),
-        meta.fd.GetFileSize(), s.ToString().c_str());
+        "[%s] [JOB %d] Level-0 flush table #%" PRIu64 ": %" PRIu64
+        " bytes %s"
+        "%s",
+        cfd_->GetName().c_str(), job_context_->job_id, meta->fd.GetNumber(),
+        meta->fd.GetFileSize(), s.ToString().c_str(),
+        meta->marked_for_compaction ? " (needs compaction)" : "");
 
     // output to event logger
     if (s.ok()) {
-      EventLoggerHelpers::LogTableFileCreation(
-          event_logger_, job_context_->job_id, meta.fd.GetNumber(),
-          meta.fd.GetFileSize(), table_properties);
+      info.db_name = dbname_;
+      info.cf_name = cfd_->GetName();
+      info.file_path = TableFileName(db_options_.db_paths,
+                                     meta->fd.GetNumber(),
+                                     meta->fd.GetPathId());
+      info.file_size = meta->fd.GetFileSize();
+      info.job_id = job_context_->job_id;
+      EventHelpers::LogAndNotifyTableFileCreation(
+          event_logger_, db_options_.listeners,
+          meta->fd, info);
+      TEST_SYNC_POINT("FlushJob::LogAndNotifyTableFileCreation()");
     }
 
     if (!db_options_.disableDataSync && output_file_directory_ != nullptr) {
@@ -267,38 +275,25 @@ Status FlushJob::WriteLevel0Table(const autovector<MemTable*>& mems,
 
   // Note that if file_size is zero, the file has been deleted and
   // should not be added to the manifest.
-  int level = 0;
-  if (s.ok() && meta.fd.GetFileSize() > 0) {
-    const Slice min_user_key = meta.smallest.user_key();
-    const Slice max_user_key = meta.largest.user_key();
+  if (s.ok() && meta->fd.GetFileSize() > 0) {
     // if we have more than 1 background thread, then we cannot
     // insert files directly into higher levels because some other
     // threads could be concurrently producing compacted files for
     // that key range.
-    if (base != nullptr && db_options_.max_background_compactions <= 1 &&
-        db_options_.max_background_flushes == 0 &&
-        cfd_->ioptions()->compaction_style == kCompactionStyleLevel) {
-      level = base->storage_info()->PickLevelForMemTableOutput(
-          mutable_cf_options_, min_user_key, max_user_key);
-      // If level does not match path id, reset level back to 0
-      uint32_t fdpath = LevelCompactionPicker::GetPathId(
-          *cfd_->ioptions(), mutable_cf_options_, level);
-      if (fdpath != 0) {
-        level = 0;
-      }
-    }
-    edit->AddFile(level, meta.fd.GetNumber(), meta.fd.GetPathId(),
-                  meta.fd.GetFileSize(), meta.smallest, meta.largest,
-                  meta.smallest_seqno, meta.largest_seqno);
+    // Add file to L0
+    edit->AddFile(0 /* level */, meta->fd.GetNumber(), meta->fd.GetPathId(),
+                  meta->fd.GetFileSize(), meta->smallest, meta->largest,
+                  meta->smallest_seqno, meta->largest_seqno,
+                  meta->marked_for_compaction);
   }
 
   InternalStats::CompactionStats stats(1);
   stats.micros = db_options_.env->NowMicros() - start_micros;
-  stats.bytes_written = meta.fd.GetFileSize();
-  cfd_->internal_stats()->AddCompactionStats(level, stats);
+  stats.bytes_written = meta->fd.GetFileSize();
+  cfd_->internal_stats()->AddCompactionStats(0 /* level */, stats);
   cfd_->internal_stats()->AddCFStats(InternalStats::BYTES_FLUSHED,
-                                     meta.fd.GetFileSize());
-  RecordTick(stats_, COMPACT_WRITE_BYTES, meta.fd.GetFileSize());
+                                     meta->fd.GetFileSize());
+  RecordTick(stats_, COMPACT_WRITE_BYTES, meta->fd.GetFileSize());
   return s;
 }
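
FlushJob::Run() now returns the full FileMetaData of the flushed file instead of just a file number, so callers get the key range and sequence bounds along with the number. A hedged caller-side sketch (flush_job is assumed to be a constructed FlushJob):

    rocksdb::FileMetaData file_meta;
    rocksdb::Status s = flush_job.Run(&file_meta);
    if (s.ok()) {
      uint64_t file_number = file_meta.fd.GetNumber();  // the old out-parameter
      // file_meta.smallest / file_meta.largest and the seqno bounds are now
      // available to the caller as well.
    }
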
 
diff --git a/src/rocksdb/db/flush_job.h b/src/rocksdb/db/flush_job.h
index c504b14..14555ef 100644
--- a/src/rocksdb/db/flush_job.h
+++ b/src/rocksdb/db/flush_job.h
@@ -17,11 +17,11 @@
 #include <string>
 
 #include "db/dbformat.h"
-#include "db/log_writer.h"
-#include "db/snapshot.h"
 #include "db/column_family.h"
-#include "db/version_edit.h"
+#include "db/log_writer.h"
 #include "db/memtable_list.h"
+#include "db/snapshot_impl.h"
+#include "db/version_edit.h"
 #include "port/port.h"
 #include "rocksdb/db.h"
 #include "rocksdb/env.h"
@@ -57,21 +57,22 @@ class FlushJob {
            const MutableCFOptions& mutable_cf_options,
            const EnvOptions& env_options, VersionSet* versions,
            InstrumentedMutex* db_mutex, std::atomic<bool>* shutting_down,
-           SequenceNumber newest_snapshot, JobContext* job_context,
-           LogBuffer* log_buffer, Directory* db_directory,
-           Directory* output_file_directory, CompressionType output_compression,
-           Statistics* stats, EventLogger* event_logger);
+           std::vector<SequenceNumber> existing_snapshots,
+           JobContext* job_context, LogBuffer* log_buffer,
+           Directory* db_directory, Directory* output_file_directory,
+           CompressionType output_compression, Statistics* stats,
+           EventLogger* event_logger);
 
   ~FlushJob();
 
-  Status Run(uint64_t* file_number = nullptr);
+  Status Run(FileMetaData* file_meta = nullptr);
 
  private:
   void ReportStartedFlush();
   void ReportFlushInputSize(const autovector<MemTable*>& mems);
   void RecordFlushIOStats();
   Status WriteLevel0Table(const autovector<MemTable*>& mems, VersionEdit* edit,
-                          uint64_t* filenumber);
+                          FileMetaData* meta);
   const std::string& dbname_;
   ColumnFamilyData* cfd_;
   const DBOptions& db_options_;
@@ -80,7 +81,7 @@ class FlushJob {
   VersionSet* versions_;
   InstrumentedMutex* db_mutex_;
   std::atomic<bool>* shutting_down_;
-  SequenceNumber newest_snapshot_;
+  std::vector<SequenceNumber> existing_snapshots_;
   JobContext* job_context_;
   LogBuffer* log_buffer_;
   Directory* db_directory_;
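
Replacing newest_snapshot_ with the full existing_snapshots_ list lets the flush keep every key version that some live snapshot can still see, instead of approximating with a single sequence number. The visibility rule the new Snapshots test (next file) encodes, as a simplified sketch (real visibility works on snapshot intervals, not exact matches):

    #include <set>
    #include "rocksdb/types.h"  // rocksdb::SequenceNumber

    // Simplified: a write survives the flush if it is the newest write for its
    // key, or a snapshot sits at exactly its sequence number (the test's model).
    bool KeptByFlush(rocksdb::SequenceNumber seqno, bool newest_for_key,
                     const std::set<rocksdb::SequenceNumber>& snapshots) {
      return newest_for_key || snapshots.count(seqno) > 0;
    }
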
diff --git a/src/rocksdb/db/flush_job_test.cc b/src/rocksdb/db/flush_job_test.cc
index 6946ae0..d2c423c 100644
--- a/src/rocksdb/db/flush_job_test.cc
+++ b/src/rocksdb/db/flush_job_test.cc
@@ -3,6 +3,7 @@
 //  LICENSE file in the root directory of this source tree. An additional grant
 //  of patent rights can be found in the PATENTS file in the same directory.
 
+#include <algorithm>
 #include <map>
 #include <string>
 
@@ -11,6 +12,7 @@
 #include "db/version_set.h"
 #include "db/writebuffer.h"
 #include "rocksdb/cache.h"
+#include "util/file_reader_writer.h"
 #include "util/string_util.h"
 #include "util/testharness.h"
 #include "util/testutil.h"
@@ -56,8 +58,10 @@ class FlushJobTest : public testing::Test {
     Status s = env_->NewWritableFile(
         manifest, &file, env_->OptimizeForManifestWrite(env_options_));
     ASSERT_OK(s);
+    unique_ptr<WritableFileWriter> file_writer(
+        new WritableFileWriter(std::move(file), EnvOptions()));
     {
-      log::Writer log(std::move(file));
+      log::Writer log(std::move(file_writer));
       std::string record;
       new_db.EncodeTo(&record);
       s = log.AddRecord(record);
@@ -88,7 +92,7 @@ TEST_F(FlushJobTest, Empty) {
   FlushJob flush_job(dbname_, versions_->GetColumnFamilySet()->GetDefault(),
                      db_options_, *cfd->GetLatestMutableCFOptions(),
                      env_options_, versions_.get(), &mutex_, &shutting_down_,
-                     SequenceNumber(), &job_context, nullptr, nullptr, nullptr,
+                     {}, &job_context, nullptr, nullptr, nullptr,
                      kNoCompression, nullptr, &event_logger);
   ASSERT_OK(flush_job.Run());
   job_context.Clean();
@@ -97,23 +101,101 @@ TEST_F(FlushJobTest, Empty) {
 TEST_F(FlushJobTest, NonEmpty) {
   JobContext job_context(0);
   auto cfd = versions_->GetColumnFamilySet()->GetDefault();
-  auto new_mem = cfd->ConstructNewMemtable(*cfd->GetLatestMutableCFOptions());
+  auto new_mem = cfd->ConstructNewMemtable(*cfd->GetLatestMutableCFOptions(),
+                                           kMaxSequenceNumber);
   new_mem->Ref();
-  std::map<std::string, std::string> inserted_keys;
+  auto inserted_keys = mock::MakeMockFile();
+  // Test data:
+  //   seqno [    1,    2 ... 8998, 8999, 9000, 9001, 9002 ... 9999 ]
+  //   key   [ 1001, 1002 ... 9998, 9999,    0,    1,    2 ...  999 ]
+  // Expected:
+  //   smallest_key   = "0"
+  //   largest_key    = "9999"
+  //   smallest_seqno = 1
+  //   largest_seqno  = 9999
   for (int i = 1; i < 10000; ++i) {
-    std::string key(ToString(i));
-    std::string value("value" + ToString(i));
+    std::string key(ToString((i + 1000) % 10000));
+    std::string value("value" + key);
     new_mem->Add(SequenceNumber(i), kTypeValue, key, value);
     InternalKey internal_key(key, SequenceNumber(i), kTypeValue);
     inserted_keys.insert({internal_key.Encode().ToString(), value});
   }
-  cfd->imm()->Add(new_mem);
+
+  autovector<MemTable*> to_delete;
+  cfd->imm()->Add(new_mem, &to_delete);
+  for (auto& m : to_delete) {
+    delete m;
+  }
+
+  EventLogger event_logger(db_options_.info_log.get());
+  FlushJob flush_job(dbname_, versions_->GetColumnFamilySet()->GetDefault(),
+                     db_options_, *cfd->GetLatestMutableCFOptions(),
+                     env_options_, versions_.get(), &mutex_, &shutting_down_,
+                     {}, &job_context, nullptr, nullptr, nullptr,
+                     kNoCompression, nullptr, &event_logger);
+  FileMetaData fd;
+  mutex_.Lock();
+  ASSERT_OK(flush_job.Run(&fd));
+  mutex_.Unlock();
+  ASSERT_EQ(ToString(0), fd.smallest.user_key().ToString());
+  ASSERT_EQ(ToString(9999), fd.largest.user_key().ToString());
+  ASSERT_EQ(1, fd.smallest_seqno);
+  ASSERT_EQ(9999, fd.largest_seqno);
+  mock_table_factory_->AssertSingleFile(inserted_keys);
+  job_context.Clean();
+}
+
+TEST_F(FlushJobTest, Snapshots) {
+  JobContext job_context(0);
+  auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+  auto new_mem = cfd->ConstructNewMemtable(*cfd->GetLatestMutableCFOptions(),
+                                           kMaxSequenceNumber);
+
+  std::vector<SequenceNumber> snapshots;
+  std::set<SequenceNumber> snapshots_set;
+  int keys = 10000;
+  int max_inserts_per_keys = 8;
+
+  Random rnd(301);
+  for (int i = 0; i < keys / 2; ++i) {
+    snapshots.push_back(rnd.Uniform(keys * (max_inserts_per_keys / 2)) + 1);
+    snapshots_set.insert(snapshots.back());
+  }
+  std::sort(snapshots.begin(), snapshots.end());
+
+  new_mem->Ref();
+  SequenceNumber current_seqno = 0;
+  auto inserted_keys = mock::MakeMockFile();
+  for (int i = 1; i < keys; ++i) {
+    std::string key(ToString(i));
+    int insertions = rnd.Uniform(max_inserts_per_keys);
+    for (int j = 0; j < insertions; ++j) {
+      std::string value(test::RandomHumanReadableString(&rnd, 10));
+      auto seqno = ++current_seqno;
+      new_mem->Add(SequenceNumber(seqno), kTypeValue, key, value);
+      // a key is visible only if:
+      // 1. it's the last one written (j == insertions - 1)
+      // 2. there's a snapshot pointing at it
+      bool visible = (j == insertions - 1) ||
+                     (snapshots_set.find(seqno) != snapshots_set.end());
+      if (visible) {
+        InternalKey internal_key(key, seqno, kTypeValue);
+        inserted_keys.insert({internal_key.Encode().ToString(), value});
+      }
+    }
+  }
+
+  autovector<MemTable*> to_delete;
+  cfd->imm()->Add(new_mem, &to_delete);
+  for (auto& m : to_delete) {
+    delete m;
+  }
 
   EventLogger event_logger(db_options_.info_log.get());
   FlushJob flush_job(dbname_, versions_->GetColumnFamilySet()->GetDefault(),
                      db_options_, *cfd->GetLatestMutableCFOptions(),
                      env_options_, versions_.get(), &mutex_, &shutting_down_,
-                     SequenceNumber(), &job_context, nullptr, nullptr, nullptr,
+                     snapshots, &job_context, nullptr, nullptr, nullptr,
                      kNoCompression, nullptr, &event_logger);
   mutex_.Lock();
   ASSERT_OK(flush_job.Run());
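
The rotated key layout in the NonEmpty test (key = (i + 1000) % 10000 at seqno i) decouples key order from insertion order, which is what the smallest/largest assertions rely on. Working the mapping through the loop bounds:

    // seqno 1    -> key "1001"   (first insert)
    // seqno 8999 -> key "9999"   (largest user key)
    // seqno 9000 -> key "0"      (smallest user key)
    // seqno 9999 -> key "999"    (last insert)
    // => smallest key "0", largest key "9999",
    //    smallest_seqno 1, largest_seqno 9999, matching the ASSERT_EQs above.
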
diff --git a/src/rocksdb/db/forward_iterator.cc b/src/rocksdb/db/forward_iterator.cc
index b441019..c0d7647 100644
--- a/src/rocksdb/db/forward_iterator.cc
+++ b/src/rocksdb/db/forward_iterator.cc
@@ -19,6 +19,7 @@
 #include "rocksdb/slice_transform.h"
 #include "table/merger.h"
 #include "db/dbformat.h"
+#include "util/sync_point.h"
 
 namespace rocksdb {
 
@@ -47,7 +48,8 @@ class LevelIterator : public Iterator {
     assert(file_index_ < files_.size());
     file_iter_.reset(cfd_->table_cache()->NewIterator(
         read_options_, *(cfd_->soptions()), cfd_->internal_comparator(),
-        files_[file_index_]->fd, nullptr /* table_reader_ptr */, false));
+        files_[file_index_]->fd, nullptr /* table_reader_ptr */, nullptr,
+        false));
   }
   void SeekToLast() override {
     status_ = Status::NotSupported("LevelIterator::SeekToLast()");
@@ -115,7 +117,8 @@ class LevelIterator : public Iterator {
 };
 
 ForwardIterator::ForwardIterator(DBImpl* db, const ReadOptions& read_options,
-    ColumnFamilyData* cfd, SuperVersion* current_sv)
+                                 ColumnFamilyData* cfd,
+                                 SuperVersion* current_sv)
     : db_(db),
       read_options_(read_options),
       cfd_(cfd),
@@ -125,9 +128,11 @@ ForwardIterator::ForwardIterator(DBImpl* db, const ReadOptions& read_options,
       sv_(current_sv),
       mutable_iter_(nullptr),
       current_(nullptr),
+      valid_(false),
       status_(Status::OK()),
       immutable_status_(Status::OK()),
-      valid_(false),
+      has_iter_trimmed_for_upper_bound_(false),
+      current_over_upper_bound_(false),
       is_prev_set_(false),
       is_prev_inclusive_(false) {
   if (sv_) {
@@ -169,12 +174,14 @@ void ForwardIterator::Cleanup(bool release_sv) {
       if (job_context.HaveSomethingToDelete()) {
         db_->PurgeObsoleteFiles(job_context);
       }
+      job_context.Clean();
     }
   }
 }
 
 bool ForwardIterator::Valid() const {
-  return valid_;
+  // See UpdateCurrent().
+  return valid_ ? !current_over_upper_bound_ : false;
 }
 
 void ForwardIterator::SeekToFirst() {
@@ -187,7 +194,17 @@ void ForwardIterator::SeekToFirst() {
   SeekInternal(Slice(), true);
 }
 
+bool ForwardIterator::IsOverUpperBound(const Slice& internal_key) const {
+  return !(read_options_.iterate_upper_bound == nullptr ||
+           cfd_->internal_comparator().user_comparator()->Compare(
+               ExtractUserKey(internal_key),
+               *read_options_.iterate_upper_bound) < 0);
+}
+
 void ForwardIterator::Seek(const Slice& internal_key) {
+  if (IsOverUpperBound(internal_key)) {
+    valid_ = false;
+  }
   if (sv_ == nullptr ||
       sv_ ->version_number != cfd_->GetSuperVersionNumber()) {
     RebuildIterators(true);
@@ -210,11 +227,19 @@ void ForwardIterator::SeekInternal(const Slice& internal_key,
   // an option to turn it off.
   if (seek_to_first || NeedToSeekImmutable(internal_key)) {
     immutable_status_ = Status::OK();
+    if (has_iter_trimmed_for_upper_bound_) {
+      // Some iterators are trimmed. Need to rebuild.
+      RebuildIterators(true);
+      // Already seeked mutable iter, so seek again
+      seek_to_first ? mutable_iter_->SeekToFirst()
+                    : mutable_iter_->Seek(internal_key);
+    }
     {
       auto tmp = MinIterHeap(MinIterComparator(&cfd_->internal_comparator()));
       immutable_min_heap_.swap(tmp);
     }
-    for (auto* m : imm_iters_) {
+    for (size_t i = 0; i < imm_iters_.size(); i++) {
+      auto* m = imm_iters_[i];
       seek_to_first ? m->SeekToFirst() : m->Seek(internal_key);
       if (!m->status().ok()) {
         immutable_status_ = m->status();
@@ -230,6 +255,9 @@ void ForwardIterator::SeekInternal(const Slice& internal_key,
     const VersionStorageInfo* vstorage = sv_->current->storage_info();
     const std::vector<FileMetaData*>& l0 = vstorage->LevelFiles(0);
     for (uint32_t i = 0; i < l0.size(); ++i) {
+      if (!l0_iters_[i]) {
+        continue;
+      }
       if (seek_to_first) {
         l0_iters_[i]->SeekToFirst();
       } else {
@@ -237,6 +265,11 @@ void ForwardIterator::SeekInternal(const Slice& internal_key,
         // won't go over this file.
         if (user_comparator_->Compare(user_key,
               l0[i]->largest.user_key()) > 0) {
+          if (read_options_.iterate_upper_bound != nullptr) {
+            has_iter_trimmed_for_upper_bound_ = true;
+            delete l0_iters_[i];
+            l0_iters_[i] = nullptr;
+          }
           continue;
         }
         l0_iters_[i]->Seek(internal_key);
@@ -245,7 +278,13 @@ void ForwardIterator::SeekInternal(const Slice& internal_key,
       if (!l0_iters_[i]->status().ok()) {
         immutable_status_ = l0_iters_[i]->status();
       } else if (l0_iters_[i]->Valid()) {
-        immutable_min_heap_.push(l0_iters_[i]);
+        if (!IsOverUpperBound(l0_iters_[i]->key())) {
+          immutable_min_heap_.push(l0_iters_[i]);
+        } else {
+          has_iter_trimmed_for_upper_bound_ = true;
+          delete l0_iters_[i];
+          l0_iters_[i] = nullptr;
+        }
       }
     }
 
@@ -259,7 +298,9 @@ void ForwardIterator::SeekInternal(const Slice& internal_key,
         search_right_bound = FileIndexer::kLevelMaxIndex;
         continue;
       }
-      assert(level_iters_[level - 1] != nullptr);
+      if (level_iters_[level - 1] == nullptr) {
+        continue;
+      }
       uint32_t f_idx = 0;
       const auto& indexer = vstorage->file_indexer();
       if (!seek_to_first) {
@@ -293,14 +334,10 @@ void ForwardIterator::SeekInternal(const Slice& internal_key,
         if (f_idx < level_files.size()) {
           int cmp_smallest = user_comparator_->Compare(
               user_key, level_files[f_idx]->smallest.user_key());
-          int cmp_largest = -1;
-          if (cmp_smallest >= 0) {
-            cmp_smallest = user_comparator_->Compare(
-                user_key, level_files[f_idx]->smallest.user_key());
-          }
-          indexer.GetNextLevelIndex(level, f_idx,
-              cmp_smallest, cmp_largest,
-              &search_left_bound, &search_right_bound);
+          assert(user_comparator_->Compare(
+                     user_key, level_files[f_idx]->largest.user_key()) <= 0);
+          indexer.GetNextLevelIndex(level, f_idx, cmp_smallest, -1,
+                                    &search_left_bound, &search_right_bound);
         } else {
           indexer.GetNextLevelIndex(
               level, level_files.size() - 1,
@@ -317,7 +354,14 @@ void ForwardIterator::SeekInternal(const Slice& internal_key,
         if (!level_iters_[level - 1]->status().ok()) {
           immutable_status_ = level_iters_[level - 1]->status();
         } else if (level_iters_[level - 1]->Valid()) {
-          immutable_min_heap_.push(level_iters_[level - 1]);
+          if (!IsOverUpperBound(level_iters_[level - 1]->key())) {
+            immutable_min_heap_.push(level_iters_[level - 1]);
+          } else {
+            // Nothing in this level is interesting. Remove.
+            has_iter_trimmed_for_upper_bound_ = true;
+            delete level_iters_[level - 1];
+            level_iters_[level - 1] = nullptr;
+          }
         }
       }
     }
@@ -329,16 +373,20 @@ void ForwardIterator::SeekInternal(const Slice& internal_key,
       is_prev_set_ = true;
       is_prev_inclusive_ = true;
     }
+
+    TEST_SYNC_POINT_CALLBACK("ForwardIterator::SeekInternal:Immutable", this);
   } else if (current_ && current_ != mutable_iter_) {
     // current_ is one of immutable iterators, push it back to the heap
     immutable_min_heap_.push(current_);
   }
 
   UpdateCurrent();
+  TEST_SYNC_POINT_CALLBACK("ForwardIterator::SeekInternal:Return", this);
 }
 
 void ForwardIterator::Next() {
   assert(valid_);
+  bool update_prev_key = false;
 
   if (sv_ == nullptr ||
       sv_->version_number != cfd_->GetSuperVersionNumber()) {
@@ -353,14 +401,16 @@ void ForwardIterator::Next() {
   } else if (current_ != mutable_iter_) {
     // It is going to advance immutable iterator
 
-    bool update_prev_key = true;
     if (is_prev_set_ && prefix_extractor_) {
       // advance prev_key_ to current_ only if they share the same prefix
       update_prev_key =
         prefix_extractor_->Transform(prev_key_.GetKey()).compare(
           prefix_extractor_->Transform(current_->key())) == 0;
+    } else {
+      update_prev_key = true;
     }
 
+
     if (update_prev_key) {
       prev_key_.SetKey(current_->key());
       is_prev_set_ = true;
@@ -372,12 +422,21 @@ void ForwardIterator::Next() {
   if (current_ != mutable_iter_) {
     if (!current_->status().ok()) {
       immutable_status_ = current_->status();
-    } else if (current_->Valid()) {
+    } else if ((current_->Valid()) && (!IsOverUpperBound(current_->key()))) {
       immutable_min_heap_.push(current_);
+    } else {
+      if ((current_->Valid()) && (IsOverUpperBound(current_->key()))) {
+        // remove the current iterator
+        DeleteCurrentIter();
+        current_ = nullptr;
+      }
+      if (update_prev_key) {
+        mutable_iter_->Seek(prev_key_.GetKey());
+      }
     }
   }
-
   UpdateCurrent();
+  TEST_SYNC_POINT_CALLBACK("ForwardIterator::Next:Return", this);
 }
 
 Slice ForwardIterator::key() const {
@@ -409,11 +468,19 @@ void ForwardIterator::RebuildIterators(bool refresh_sv) {
   }
   mutable_iter_ = sv_->mem->NewIterator(read_options_, &arena_);
   sv_->imm->AddIterators(read_options_, &imm_iters_, &arena_);
+  has_iter_trimmed_for_upper_bound_ = false;
 
   const auto* vstorage = sv_->current->storage_info();
   const auto& l0_files = vstorage->LevelFiles(0);
   l0_iters_.reserve(l0_files.size());
   for (const auto* l0 : l0_files) {
+    if ((read_options_.iterate_upper_bound != nullptr) &&
+        cfd_->internal_comparator().user_comparator()->Compare(
+            l0->smallest.user_key(), *read_options_.iterate_upper_bound) > 0) {
+      has_iter_trimmed_for_upper_bound_ = true;
+      l0_iters_.push_back(nullptr);
+      continue;
+    }
     l0_iters_.push_back(cfd_->table_cache()->NewIterator(
         read_options_, *cfd_->soptions(), cfd_->internal_comparator(), l0->fd));
   }
@@ -421,8 +488,15 @@ void ForwardIterator::RebuildIterators(bool refresh_sv) {
   for (int32_t level = 1; level < vstorage->num_levels(); ++level) {
     const auto& level_files = vstorage->LevelFiles(level);
 
-    if (level_files.empty()) {
+    if ((level_files.empty()) ||
+        ((read_options_.iterate_upper_bound != nullptr) &&
+         (user_comparator_->Compare(*read_options_.iterate_upper_bound,
+                                    level_files[0]->smallest.user_key()) <
+          0))) {
       level_iters_.push_back(nullptr);
+      if (!level_files.empty()) {
+        has_iter_trimmed_for_upper_bound_ = true;
+      }
     } else {
       level_iters_.push_back(
           new LevelIterator(cfd_, read_options_, level_files));
@@ -437,7 +511,7 @@ void ForwardIterator::ResetIncompleteIterators() {
   const auto& l0_files = sv_->current->storage_info()->LevelFiles(0);
   for (uint32_t i = 0; i < l0_iters_.size(); ++i) {
     assert(i < l0_files.size());
-    if (!l0_iters_[i]->status().IsIncomplete()) {
+    if (!l0_iters_[i] || !l0_iters_[i]->status().IsIncomplete()) {
       continue;
     }
     delete l0_iters_[i];
@@ -481,6 +555,13 @@ void ForwardIterator::UpdateCurrent() {
   if (!status_.ok()) {
     status_ = Status::OK();
   }
+
+  // Upper bound doesn't apply to the memtable iterator. We want Valid() to
+  // return false when all iterators are over iterate_upper_bound, but can't
+  // just set valid_ to false, as that would effectively disable the tailing
+  // optimization (Seek() would be called on all immutable iterators regardless
+  // of whether the target key is greater than prev_key_).
+  current_over_upper_bound_ = valid_ && IsOverUpperBound(current_->key());
 }
 
 bool ForwardIterator::NeedToSeekImmutable(const Slice& target) {
@@ -516,6 +597,71 @@ bool ForwardIterator::NeedToSeekImmutable(const Slice& target) {
   return false;
 }
 
+void ForwardIterator::DeleteCurrentIter() {
+  const VersionStorageInfo* vstorage = sv_->current->storage_info();
+  const std::vector<FileMetaData*>& l0 = vstorage->LevelFiles(0);
+  for (uint32_t i = 0; i < l0.size(); ++i) {
+    if (!l0_iters_[i]) {
+      continue;
+    }
+    if (l0_iters_[i] == current_) {
+      has_iter_trimmed_for_upper_bound_ = true;
+      delete l0_iters_[i];
+      l0_iters_[i] = nullptr;
+      return;
+    }
+  }
+
+  for (int32_t level = 1; level < vstorage->num_levels(); ++level) {
+    if (level_iters_[level - 1] == nullptr) {
+      continue;
+    }
+    if (level_iters_[level - 1] == current_) {
+      has_iter_trimmed_for_upper_bound_ = true;
+      delete level_iters_[level - 1];
+      level_iters_[level - 1] = nullptr;
+    }
+  }
+}
+
+bool ForwardIterator::TEST_CheckDeletedIters(int* pdeleted_iters,
+                                             int* pnum_iters) {
+  bool retval = false;
+  int deleted_iters = 0;
+  int num_iters = 0;
+
+  const VersionStorageInfo* vstorage = sv_->current->storage_info();
+  const std::vector<FileMetaData*>& l0 = vstorage->LevelFiles(0);
+  for (uint32_t i = 0; i < l0.size(); ++i) {
+    if (!l0_iters_[i]) {
+      retval = true;
+      deleted_iters++;
+    } else {
+      num_iters++;
+    }
+  }
+
+  for (int32_t level = 1; level < vstorage->num_levels(); ++level) {
+    if ((level_iters_[level - 1] == nullptr) &&
+        (!vstorage->LevelFiles(level).empty())) {
+      retval = true;
+      deleted_iters++;
+    } else if (!vstorage->LevelFiles(level).empty()) {
+      num_iters++;
+    }
+  }
+  if ((!retval) && num_iters <= 1) {
+    retval = true;
+  }
+  if (pdeleted_iters) {
+    *pdeleted_iters = deleted_iters;
+  }
+  if (pnum_iters) {
+    *pnum_iters = num_iters;
+  }
+  return retval;
+}
+
 uint32_t ForwardIterator::FindFileInRange(
     const std::vector<FileMetaData*>& files, const Slice& internal_key,
     uint32_t left, uint32_t right) {
diff --git a/src/rocksdb/db/forward_iterator.h b/src/rocksdb/db/forward_iterator.h
index f72c9cb..e6ef0bd 100644
--- a/src/rocksdb/db/forward_iterator.h
+++ b/src/rocksdb/db/forward_iterator.h
@@ -70,6 +70,7 @@ class ForwardIterator : public Iterator {
   virtual Slice key() const override;
   virtual Slice value() const override;
   virtual Status status() const override;
+  bool TEST_CheckDeletedIters(int* deleted_iters, int* num_iters);
 
  private:
   void Cleanup(bool release_sv);
@@ -78,10 +79,13 @@ class ForwardIterator : public Iterator {
   void SeekInternal(const Slice& internal_key, bool seek_to_first);
   void UpdateCurrent();
   bool NeedToSeekImmutable(const Slice& internal_key);
+  void DeleteCurrentIter();
   uint32_t FindFileInRange(
     const std::vector<FileMetaData*>& files, const Slice& internal_key,
     uint32_t left, uint32_t right);
 
+  bool IsOverUpperBound(const Slice& internal_key) const;
+
   DBImpl* const db_;
   const ReadOptions read_options_;
   ColumnFamilyData* const cfd_;
@@ -95,14 +99,30 @@ class ForwardIterator : public Iterator {
   std::vector<Iterator*> l0_iters_;
   std::vector<LevelIterator*> level_iters_;
   Iterator* current_;
-  // internal iterator status
-  Status status_;
-  Status immutable_status_;
   bool valid_;
 
+  // Internal iterator status; set only by one of the unsupported methods.
+  Status status_;
+  // Status of immutable iterators, maintained here to avoid iterating over
+  // all of them in status().
+  Status immutable_status_;
+  // Indicates that at least one of the immutable iterators pointed to a key
+  // larger than iterate_upper_bound and was therefore destroyed. Seek() may
+  // need to rebuild such iterators.
+  bool has_iter_trimmed_for_upper_bound_;
+  // Is current key larger than iterate_upper_bound? If so, makes Valid()
+  // return false.
+  bool current_over_upper_bound_;
+
+  // Left endpoint of the range of keys that immutable iterators currently
+  // cover. When Seek() is called with a key that's within that range, immutable
+  // iterators don't need to be moved; see NeedToSeekImmutable(). This key is
+  // included in the range after a Seek(), but excluded when advancing the
+  // iterator using Next().
   IterKey prev_key_;
   bool is_prev_set_;
   bool is_prev_inclusive_;
+
   Arena arena_;
 };
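
TEST_CheckDeletedIters() exposes the trimming to tests: a test can grab the ForwardIterator* from the SeekInternal/Next sync-point callbacks added above and count trimmed versus live file iterators. A hedged sketch of the call (forward_iter is assumed to come from such a callback):

    int deleted_iters = 0, live_iters = 0;
    bool trimmed =
        forward_iter->TEST_CheckDeletedIters(&deleted_iters, &live_iters);
    // trimmed is true when at least one l0/level iterator was deleted for the
    // upper bound (or when at most one live iterator remains).
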
 
diff --git a/src/rocksdb/db/internal_stats.cc b/src/rocksdb/db/internal_stats.cc
index e6eb9fb..4e37c1d 100644
--- a/src/rocksdb/db/internal_stats.cc
+++ b/src/rocksdb/db/internal_stats.cc
@@ -13,6 +13,7 @@
 #endif
 
 #include <inttypes.h>
+#include <string>
 #include <algorithm>
 #include <vector>
 #include "db/column_family.h"
@@ -26,6 +27,7 @@ namespace rocksdb {
 namespace {
 const double kMB = 1048576.0;
 const double kGB = kMB * 1024;
+const double kMicrosInSec = 1000000.0;
 
 void PrintLevelStatsHeader(char* buf, size_t len, const std::string& cf_name) {
   snprintf(
@@ -45,15 +47,17 @@ void PrintLevelStats(char* buf, size_t len, const std::string& name,
     int num_files, int being_compacted, double total_file_size, double score,
     double w_amp, uint64_t stalls,
     const InternalStats::CompactionStats& stats) {
-  uint64_t bytes_read = stats.bytes_readn + stats.bytes_readnp1;
-  int64_t bytes_new = stats.bytes_written - stats.bytes_readnp1;
-  double elapsed = (stats.micros + 1) / 1000000.0;
+  uint64_t bytes_read =
+      stats.bytes_read_non_output_levels + stats.bytes_read_output_level;
+  int64_t bytes_new =
+      stats.bytes_written - stats.bytes_read_output_level;
+  double elapsed = (stats.micros + 1) / kMicrosInSec;
   std::string num_input_records = NumberToHumanString(stats.num_input_records);
   std::string num_dropped_records =
       NumberToHumanString(stats.num_dropped_records);
 
   snprintf(buf, len,
-           "%4s %6d/%-3d %8.0f %5.1f " /* Level, Files, Size(MB), Score */
+           "%4s %6d/%-3d %8.2f %5.1f " /* Level, Files, Size(MB), Score */
            "%8.1f "                    /* Read(GB) */
            "%7.1f "                    /* Rn(GB) */
            "%8.1f "                    /* Rnp1(GB) */
@@ -70,16 +74,15 @@ void PrintLevelStats(char* buf, size_t len, const std::string& name,
            " "      /* Stall(cnt) */
            "%7s "   /* KeyIn */
            "%6s\n", /* KeyDrop */
-           name.c_str(), num_files, being_compacted, total_file_size / kMB,
-           score, bytes_read / kGB, stats.bytes_readn / kGB,
-           stats.bytes_readnp1 / kGB, stats.bytes_written / kGB,
-           bytes_new / kGB, stats.bytes_moved / kGB,
-           w_amp, bytes_read / kMB / elapsed,
-           stats.bytes_written / kMB / elapsed, stats.micros / 1000000.0,
-           stats.count,
-           stats.count == 0 ? 0 : stats.micros / 1000000.0 / stats.count,
-           stalls,
-           num_input_records.c_str(), num_dropped_records.c_str());
+           name.c_str(),
+           num_files, being_compacted, total_file_size / kMB, score,
+           bytes_read / kGB, stats.bytes_read_non_output_levels / kGB,
+           stats.bytes_read_output_level / kGB, stats.bytes_written / kGB,
+           bytes_new / kGB, stats.bytes_moved / kGB, w_amp,
+           bytes_read / kMB / elapsed, stats.bytes_written / kMB / elapsed,
+           stats.micros / kMicrosInSec, stats.count,
+           stats.count == 0 ? 0 : stats.micros / kMicrosInSec / stats.count,
+           stalls, num_input_records.c_str(), num_dropped_records.c_str());
 }
 }
 
@@ -92,12 +95,16 @@ static const std::string cfstats = "cfstats";
 static const std::string dbstats = "dbstats";
 static const std::string levelstats = "levelstats";
 static const std::string num_immutable_mem_table = "num-immutable-mem-table";
+static const std::string num_immutable_mem_table_flushed =
+    "num-immutable-mem-table-flushed";
 static const std::string mem_table_flush_pending = "mem-table-flush-pending";
 static const std::string compaction_pending = "compaction-pending";
 static const std::string background_errors = "background-errors";
 static const std::string cur_size_active_mem_table =
                           "cur-size-active-mem-table";
-static const std::string cur_size_all_mem_tables = "cur-size-all-mem-tables";
+static const std::string cur_size_unflushed_mem_tables =
+    "cur-size-all-mem-tables";
+static const std::string cur_size_all_mem_tables = "size-all-mem-tables";
 static const std::string num_entries_active_mem_table =
                           "num-entries-active-mem-table";
 static const std::string num_entries_imm_mem_tables =
@@ -114,7 +121,15 @@ static const std::string is_file_deletions_enabled =
 static const std::string num_snapshots = "num-snapshots";
 static const std::string oldest_snapshot_time = "oldest-snapshot-time";
 static const std::string num_live_versions = "num-live-versions";
+static const std::string estimate_live_data_size = "estimate-live-data-size";
 static const std::string base_level = "base-level";
+static const std::string total_sst_files_size = "total-sst-files-size";
+static const std::string estimate_pending_comp_bytes =
+    "estimate-pending-compaction-bytes";
+static const std::string aggregated_table_properties =
+    "aggregated-table-properties";
+static const std::string aggregated_table_properties_at_level =
+    aggregated_table_properties + "-at-level";
 
 const std::string DB::Properties::kNumFilesAtLevelPrefix =
                       rocksdb_prefix + num_files_at_level_prefix;
@@ -133,7 +148,9 @@ const std::string DB::Properties::kBackgroundErrors =
 const std::string DB::Properties::kCurSizeActiveMemTable =
                       rocksdb_prefix + cur_size_active_mem_table;
 const std::string DB::Properties::kCurSizeAllMemTables =
-                      rocksdb_prefix + cur_size_all_mem_tables;
+    rocksdb_prefix + cur_size_unflushed_mem_tables;
+const std::string DB::Properties::kSizeAllMemTables =
+    rocksdb_prefix + cur_size_all_mem_tables;
 const std::string DB::Properties::kNumEntriesActiveMemTable =
                       rocksdb_prefix + num_entries_active_mem_table;
 const std::string DB::Properties::kNumEntriesImmMemTables =
@@ -154,6 +171,16 @@ const std::string DB::Properties::kOldestSnapshotTime =
                       rocksdb_prefix + oldest_snapshot_time;
 const std::string DB::Properties::kNumLiveVersions =
                       rocksdb_prefix + num_live_versions;
+const std::string DB::Properties::kEstimateLiveDataSize =
+                      rocksdb_prefix + estimate_live_data_size;
+const std::string DB::Properties::kTotalSstFilesSize =
+                      rocksdb_prefix + total_sst_files_size;
+const std::string DB::Properties::kEstimatePendingCompactionBytes =
+    rocksdb_prefix + estimate_pending_comp_bytes;
+const std::string DB::Properties::kAggregatedTableProperties =
+    rocksdb_prefix + aggregated_table_properties;
+const std::string DB::Properties::kAggregatedTablePropertiesAtLevel =
+    rocksdb_prefix + aggregated_table_properties_at_level;
 
 DBPropertyType GetPropertyType(const Slice& property, bool* is_int_property,
                                bool* need_out_of_mutex) {
@@ -180,11 +207,17 @@ DBPropertyType GetPropertyType(const Slice& property, bool* is_int_property,
     return kDBStats;
   } else if (in == sstables) {
     return kSsTables;
+  } else if (in == aggregated_table_properties) {
+    return kAggregatedTableProperties;
+  } else if (in.starts_with(aggregated_table_properties_at_level)) {
+    return kAggregatedTablePropertiesAtLevel;
   }
 
   *is_int_property = true;
   if (in == num_immutable_mem_table) {
     return kNumImmutableMemTable;
+  } else if (in == num_immutable_mem_table_flushed) {
+    return kNumImmutableMemTableFlushed;
   } else if (in == mem_table_flush_pending) {
     return kMemtableFlushPending;
   } else if (in == compaction_pending) {
@@ -193,8 +226,10 @@ DBPropertyType GetPropertyType(const Slice& property, bool* is_int_property,
     return kBackgroundErrors;
   } else if (in == cur_size_active_mem_table) {
     return kCurSizeActiveMemTable;
-  } else if (in == cur_size_all_mem_tables) {
+  } else if (in == cur_size_unflushed_mem_tables) {
     return kCurSizeAllMemTables;
+  } else if (in == cur_size_all_mem_tables) {
+    return kSizeAllMemTables;
   } else if (in == num_entries_active_mem_table) {
     return kNumEntriesInMutableMemtable;
   } else if (in == num_entries_imm_mem_tables) {
@@ -216,8 +251,15 @@ DBPropertyType GetPropertyType(const Slice& property, bool* is_int_property,
     return kOldestSnapshotTime;
   } else if (in == num_live_versions) {
     return kNumLiveVersions;
+  } else if (in == estimate_live_data_size) {
+    *need_out_of_mutex = true;
+    return kEstimateLiveDataSize;
   } else if (in == base_level) {
     return kBaseLevel;
+  } else if (in == total_sst_files_size) {
+    return kTotalSstFilesSize;
+  } else if (in == estimate_pending_comp_bytes) {
+    return kEstimatePendingCompactionBytes;
   }
   return kUnknown;
 }
@@ -226,15 +268,19 @@ bool InternalStats::GetIntPropertyOutOfMutex(DBPropertyType property_type,
                                              Version* version,
                                              uint64_t* value) const {
   assert(value != nullptr);
-  if (property_type != kEstimatedUsageByTableReaders) {
-    return false;
-  }
-  if (version == nullptr) {
-    *value = 0;
-  } else {
-    *value = version->GetMemoryUsageByTableReaders();
+  const auto* vstorage = cfd_->current()->storage_info();
+
+  switch (property_type) {
+    case kEstimatedUsageByTableReaders:
+      *value = (version == nullptr) ?
+        0 : version->GetMemoryUsageByTableReaders();
+      return true;
+    case kEstimateLiveDataSize:
+      *value = vstorage->EstimateLiveDataSize();
+      return true;
+    default:
+      return false;
   }
-  return true;
 }
 
 bool InternalStats::GetStringProperty(DBPropertyType property_type,
@@ -295,6 +341,32 @@ bool InternalStats::GetStringProperty(DBPropertyType property_type,
     case kSsTables:
       *value = current->DebugString();
       return true;
+    case kAggregatedTableProperties: {
+      std::shared_ptr<const TableProperties> tp;
+      auto s = cfd_->current()->GetAggregatedTableProperties(&tp);
+      if (!s.ok()) {
+        return false;
+      }
+      *value = tp->ToString();
+      return true;
+    }
+    case kAggregatedTablePropertiesAtLevel: {
+      in.remove_prefix(
+          DB::Properties::kAggregatedTablePropertiesAtLevel.length());
+      uint64_t level;
+      bool ok = ConsumeDecimalNumber(&in, &level) && in.empty();
+      if (!ok || static_cast<int>(level) >= number_levels_) {
+        return false;
+      }
+      std::shared_ptr<const TableProperties> tp;
+      auto s = cfd_->current()->GetAggregatedTableProperties(
+          &tp, static_cast<int>(level));
+      if (!s.ok()) {
+        return false;
+      }
+      *value = tp->ToString();
+      return true;
+    }
     default:
       return false;
   }
@@ -307,14 +379,17 @@ bool InternalStats::GetIntProperty(DBPropertyType property_type,
 
   switch (property_type) {
     case kNumImmutableMemTable:
-      *value = cfd_->imm()->size();
+      *value = cfd_->imm()->NumNotFlushed();
+      return true;
+    case kNumImmutableMemTableFlushed:
+      *value = cfd_->imm()->NumFlushed();
       return true;
     case kMemtableFlushPending:
       // Return number of mem tables that are ready to flush (made immutable)
       *value = (cfd_->imm()->IsFlushPending() ? 1 : 0);
       return true;
     case kCompactionPending:
-      // 1 if the system already determines at least one compacdtion is needed.
+      // 1 if the system already determines at least one compaction is needed.
       // 0 otherwise.
       *value = (cfd_->compaction_picker()->NeedsCompaction(vstorage) ? 1 : 0);
       return true;
@@ -329,6 +404,10 @@ bool InternalStats::GetIntProperty(DBPropertyType property_type,
     case kCurSizeAllMemTables:
       // Current size of the active memtable + immutable memtables
       *value = cfd_->mem()->ApproximateMemoryUsage() +
+               cfd_->imm()->ApproximateUnflushedMemTablesMemoryUsage();
+      return true;
+    case kSizeAllMemTables:
+      *value = cfd_->mem()->ApproximateMemoryUsage() +
                cfd_->imm()->ApproximateMemoryUsage();
       return true;
     case kNumEntriesInMutableMemtable:
@@ -366,14 +445,18 @@ bool InternalStats::GetIntProperty(DBPropertyType property_type,
     case kNumLiveVersions:
       *value = cfd_->GetNumLiveVersions();
       return true;
-#ifndef ROCKSDB_LITE
     case kIsFileDeletionEnabled:
       *value = db->IsFileDeletionsEnabled();
       return true;
-#endif
     case kBaseLevel:
       *value = vstorage->base_level();
       return true;
+    case kTotalSstFilesSize:
+      *value = cfd_->GetTotalSstFilesSize();
+      return true;
+    case kEstimatePendingCompactionBytes:
+      *value = vstorage->estimated_compaction_needed_bytes();
+      return true;
     default:
       return false;
   }
@@ -382,7 +465,7 @@ bool InternalStats::GetIntProperty(DBPropertyType property_type,
 void InternalStats::DumpDBStats(std::string* value) {
   char buf[1000];
   // DB-level stats, only available from default column family
-  double seconds_up = (env_->NowMicros() - started_at_ + 1) / 1000000.0;
+  double seconds_up = (env_->NowMicros() - started_at_ + 1) / kMicrosInSec;
   double interval_seconds_up = seconds_up - db_stats_snapshot_.seconds_up;
   snprintf(buf, sizeof(buf),
            "\n** DB Stats **\nUptime(secs): %.1f total, %.1f interval\n",
@@ -433,19 +516,17 @@ void InternalStats::DumpDBStats(std::string* value) {
   value->append(buf);
   // Compact
   for (int level = 0; level < number_levels_; level++) {
-    compact_bytes_read += comp_stats_[level].bytes_readnp1 +
-                          comp_stats_[level].bytes_readn;
+    compact_bytes_read += comp_stats_[level].bytes_read_output_level +
+                          comp_stats_[level].bytes_read_non_output_levels;
     compact_bytes_write += comp_stats_[level].bytes_written;
     compact_micros += comp_stats_[level].micros;
   }
   snprintf(buf, sizeof(buf),
            "Cumulative compaction: %.2f GB write, %.2f MB/s write, "
            "%.2f GB read, %.2f MB/s read, %.1f seconds\n",
-           compact_bytes_write / kGB,
-           compact_bytes_write / kMB / seconds_up,
-           compact_bytes_read / kGB,
-           compact_bytes_read / kMB / seconds_up,
-           compact_micros / 1000000.0);
+           compact_bytes_write / kGB, compact_bytes_write / kMB / seconds_up,
+           compact_bytes_read / kGB, compact_bytes_read / kMB / seconds_up,
+           compact_micros / kMicrosInSec);
   value->append(buf);
   // Stall
   AppendHumanMicros(write_stall_micros, human_micros, kHumanMicrosLen, true);
@@ -499,16 +580,15 @@ void InternalStats::DumpDBStats(std::string* value) {
   uint64_t interval_compact_micros =
       compact_micros - db_stats_snapshot_.compact_micros;
 
-  snprintf(buf, sizeof(buf),
-           "Interval compaction: %.2f GB write, %.2f MB/s write, "
-           "%.2f GB read, %.2f MB/s read, %.1f seconds\n",
-           interval_compact_bytes_write / kGB,
-           interval_compact_bytes_write / kMB /
-               std::max(interval_seconds_up, 0.001),
-           interval_compact_bytes_read / kGB,
-           interval_compact_bytes_read / kMB /
-               std::max(interval_seconds_up, 0.001),
-           interval_compact_micros / 1000000.0);
+  snprintf(
+      buf, sizeof(buf),
+      "Interval compaction: %.2f GB write, %.2f MB/s write, "
+      "%.2f GB read, %.2f MB/s read, %.1f seconds\n",
+      interval_compact_bytes_write / kGB,
+      interval_compact_bytes_write / kMB / std::max(interval_seconds_up, 0.001),
+      interval_compact_bytes_read / kGB,
+      interval_compact_bytes_read / kMB / std::max(interval_seconds_up, 0.001),
+      interval_compact_micros / kMicrosInSec);
   value->append(buf);
 
   // Stall
@@ -523,6 +603,16 @@ void InternalStats::DumpDBStats(std::string* value) {
                10000.0 / std::max(interval_seconds_up, 0.001));
   value->append(buf);
 
+  for (int level = 0; level < number_levels_; level++) {
+    if (!file_read_latency_[level].Empty()) {
+      char buf2[5000];
+      snprintf(buf2, sizeof(buf2),
+               "** Level %d read latency histogram (micros):\n%s\n", level,
+               file_read_latency_[level].ToString().c_str());
+      value->append(buf2);
+    }
+  }
+
   db_stats_snapshot_.seconds_up = seconds_up;
   db_stats_snapshot_.ingest_bytes = user_bytes_written;
   db_stats_snapshot_.write_other = write_other;
@@ -541,8 +631,7 @@ void InternalStats::DumpCFStats(std::string* value) {
   const VersionStorageInfo* vstorage = cfd_->current()->storage_info();
 
   int num_levels_to_check =
-      (cfd_->ioptions()->compaction_style != kCompactionStyleUniversal &&
-       cfd_->ioptions()->compaction_style != kCompactionStyleFIFO)
+      (cfd_->ioptions()->compaction_style != kCompactionStyleFIFO)
           ? vstorage->num_levels() - 1
           : 1;
 
@@ -555,7 +644,7 @@ void InternalStats::DumpCFStats(std::string* value) {
   }
   // Count # of files being compacted for each level
   std::vector<int> files_being_compacted(number_levels_, 0);
-  for (int level = 0; level < num_levels_to_check; ++level) {
+  for (int level = 0; level < number_levels_; ++level) {
     for (auto* f : vstorage->LevelFiles(level)) {
       if (f->being_compacted) {
         ++files_being_compacted[level];
@@ -580,21 +669,23 @@ void InternalStats::DumpCFStats(std::string* value) {
     total_files += files;
     total_files_being_compacted += files_being_compacted[level];
     if (comp_stats_[level].micros > 0 || files > 0) {
-      uint64_t stalls = level == 0 ?
-        (cf_stats_count_[LEVEL0_SLOWDOWN] +
-         cf_stats_count_[LEVEL0_NUM_FILES] +
-         cf_stats_count_[MEMTABLE_COMPACTION])
-        : (stall_leveln_slowdown_count_soft_[level] +
-           stall_leveln_slowdown_count_hard_[level]);
+      uint64_t stalls =
+          level == 0 ? (cf_stats_count_[LEVEL0_SLOWDOWN_TOTAL] +
+                        cf_stats_count_[LEVEL0_NUM_FILES_TOTAL] +
+                        cf_stats_count_[HARD_PENDING_COMPACTION_BYTES_LIMIT] +
+                        cf_stats_count_[MEMTABLE_COMPACTION])
+                     : (stall_leveln_slowdown_count_soft_[level] +
+                        stall_leveln_slowdown_count_hard_[level]);
 
       stats_sum.Add(comp_stats_[level]);
       total_file_size += vstorage->NumLevelBytes(level);
       total_stall_count += stalls;
       total_slowdown_count_soft += stall_leveln_slowdown_count_soft_[level];
       total_slowdown_count_hard += stall_leveln_slowdown_count_hard_[level];
-      double w_amp = (comp_stats_[level].bytes_readn == 0) ? 0.0
-          : comp_stats_[level].bytes_written /
-            static_cast<double>(comp_stats_[level].bytes_readn);
+      double w_amp =
+          (comp_stats_[level].bytes_read_non_output_levels == 0) ? 0.0
+          : static_cast<double>(comp_stats_[level].bytes_written) /
+            comp_stats_[level].bytes_read_non_output_levels;
       PrintLevelStats(buf, sizeof(buf), "L" + ToString(level), files,
                       files_being_compacted[level],
                       vstorage->NumLevelBytes(level), compaction_score[level],
@@ -626,15 +717,28 @@ void InternalStats::DumpCFStats(std::string* value) {
            curr_ingest / kGB, interval_ingest / kGB);
   value->append(buf);
 
-  snprintf(buf, sizeof(buf),
-           "Stalls(count): %" PRIu64 " level0_slowdown, "
-           "%" PRIu64 " level0_numfiles, %" PRIu64 " memtable_compaction, "
-           "%" PRIu64 " leveln_slowdown_soft, "
-           "%" PRIu64 " leveln_slowdown_hard\n",
-           cf_stats_count_[LEVEL0_SLOWDOWN],
-           cf_stats_count_[LEVEL0_NUM_FILES],
-           cf_stats_count_[MEMTABLE_COMPACTION],
-           total_slowdown_count_soft, total_slowdown_count_hard);
+  snprintf(buf, sizeof(buf), "Stalls(count): %" PRIu64
+                             " level0_slowdown, "
+                             "%" PRIu64
+                             " level0_slowdown_with_compaction, "
+                             "%" PRIu64
+                             " level0_numfiles, "
+                             "%" PRIu64
+                             " level0_numfiles_with_compaction, "
+                             "%" PRIu64
+                             " pending_compaction_bytes, "
+                             "%" PRIu64
+                             " memtable_compaction, "
+                             "%" PRIu64
+                             " leveln_slowdown_soft, "
+                             "%" PRIu64 " leveln_slowdown_hard\n",
+           cf_stats_count_[LEVEL0_SLOWDOWN_TOTAL],
+           cf_stats_count_[LEVEL0_SLOWDOWN_WITH_COMPACTION],
+           cf_stats_count_[LEVEL0_NUM_FILES_TOTAL],
+           cf_stats_count_[LEVEL0_NUM_FILES_WITH_COMPACTION],
+           cf_stats_count_[HARD_PENDING_COMPACTION_BYTES_LIMIT],
+           cf_stats_count_[MEMTABLE_COMPACTION], total_slowdown_count_soft,
+           total_slowdown_count_hard);
   value->append(buf);
 
   cf_stats_snapshot_.ingest_bytes = curr_ingest;
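
Note the property rename in this file: "rocksdb.cur-size-all-mem-tables" now covers only unflushed (active + immutable) memtables, while the new "rocksdb.size-all-mem-tables" also counts memtables that were already flushed but are still pinned in memory. A hedged query sketch (db is assumed to be an open rocksdb::DB*):

    uint64_t unflushed = 0, all = 0;
    db->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed);
    db->GetIntProperty("rocksdb.size-all-mem-tables", &all);
    // all >= unflushed; the difference is flushed-but-pinned memtable memory.
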
diff --git a/src/rocksdb/db/internal_stats.h b/src/rocksdb/db/internal_stats.h
index 55f1467..eeb226e 100644
--- a/src/rocksdb/db/internal_stats.h
+++ b/src/rocksdb/db/internal_stats.h
@@ -32,14 +32,19 @@ enum DBPropertyType : uint32_t {
 kStats,            // Return general statistics of both DB and CF
   kSsTables,         // Return a human readable string of current SST files
   kStartIntTypes,    // ---- Dummy value to indicate the start of integer values
-  kNumImmutableMemTable,   // Return number of immutable mem tables
-  kMemtableFlushPending,   // Return 1 if mem table flushing is pending,
-                           // otherwise 0.
+  kNumImmutableMemTable,         // Return number of immutable mem tables that
+                                 // have not been flushed.
+  kNumImmutableMemTableFlushed,  // Return number of immutable mem tables
+                                 // in memory that have already been flushed
+  kMemtableFlushPending,         // Return 1 if mem table flushing is pending,
+                                 // otherwise 0.
   kCompactionPending,      // Return 1 if a compaction is pending. Otherwise 0.
   kBackgroundErrors,       // Return accumulated background errors encountered.
   kCurSizeActiveMemTable,  // Return current size of the active memtable
-  kCurSizeAllMemTables,    // Return current size of all (active + immutable)
-                           // memtables
+  kCurSizeAllMemTables,    // Return current size of unflushed
+                           // (active + immutable) memtables
+  kSizeAllMemTables,       // Return current size of all (active + immutable
+                           // + pinned) memtables
   kNumEntriesInMutableMemtable,    // Return number of deletes in the mutable
                                    // memtable.
   kNumEntriesInImmutableMemtable,  // Return sum of number of entries in all
@@ -55,7 +60,15 @@ enum DBPropertyType : uint32_t {
   kNumSnapshots,                  // Number of snapshots in the system
   kOldestSnapshotTime,            // Unix timestamp of the first snapshot
   kNumLiveVersions,
-  kBaseLevel,  // The level that L0 data is compacted to
+  kEstimateLiveDataSize,            // Estimated amount of live data in bytes
+  kTotalSstFilesSize,               // Total size of all sst files.
+  kBaseLevel,                       // The level that L0 data is compacted to
+  kEstimatePendingCompactionBytes,  // Estimated bytes still to be compacted
+  kAggregatedTableProperties,  // Return a string that contains the aggregated
+                               // table properties.
+  kAggregatedTablePropertiesAtLevel,  // Return a string that contains the
+                                      // aggregated table properties at the
+                                      // specified level.
 };
 
 extern DBPropertyType GetPropertyType(const Slice& property,
@@ -67,9 +80,12 @@ extern DBPropertyType GetPropertyType(const Slice& property,
 class InternalStats {
  public:
   enum InternalCFStatsType {
-    LEVEL0_SLOWDOWN,
+    LEVEL0_SLOWDOWN_TOTAL,
+    LEVEL0_SLOWDOWN_WITH_COMPACTION,
     MEMTABLE_COMPACTION,
-    LEVEL0_NUM_FILES,
+    LEVEL0_NUM_FILES_TOTAL,
+    LEVEL0_NUM_FILES_WITH_COMPACTION,
+    HARD_PENDING_COMPACTION_BYTES_LIMIT,
     WRITE_STALLS_ENUM_MAX,
     BYTES_FLUSHED,
     INTERNAL_CF_STATS_ENUM_MAX,
@@ -94,6 +110,7 @@ class InternalStats {
         comp_stats_(num_levels),
         stall_leveln_slowdown_count_hard_(num_levels),
         stall_leveln_slowdown_count_soft_(num_levels),
+        file_read_latency_(num_levels),
         bg_error_count_(0),
         number_levels_(num_levels),
         env_(env),
@@ -117,26 +134,26 @@ class InternalStats {
   struct CompactionStats {
     uint64_t micros;
 
-    // Bytes read from level N during compaction between levels N and N+1
-    uint64_t bytes_readn;
+    // The number of bytes read from all non-output levels
+    uint64_t bytes_read_non_output_levels;
 
-    // Bytes read from level N+1 during compaction between levels N and N+1
-    uint64_t bytes_readnp1;
+    // The number of bytes read from the compaction output level.
+    uint64_t bytes_read_output_level;
 
-    // Total bytes written during compaction between levels N and N+1
+    // Total number of bytes written during compaction
     uint64_t bytes_written;
 
-    // Total bytes moved to this level
+    // Total number of bytes moved to the output level
     uint64_t bytes_moved;
 
-    // Files read from level N during compaction between levels N and N+1
-    int files_in_leveln;
+    // The number of compaction input files in all non-output levels.
+    int num_input_files_in_non_output_levels;
 
-    // Files read from level N+1 during compaction between levels N and N+1
-    int files_in_levelnp1;
+    // The number of compaction input files in the output level.
+    int num_input_files_in_output_level;
 
-    // Files written during compaction between levels N and N+1
-    int files_out_levelnp1;
+    // The number of compaction output files.
+    int num_output_files;
 
     // Total incoming entries during compaction between levels N and N+1
     uint64_t num_input_records;
@@ -150,39 +167,43 @@ class InternalStats {
 
     explicit CompactionStats(int _count = 0)
         : micros(0),
-          bytes_readn(0),
-          bytes_readnp1(0),
+          bytes_read_non_output_levels(0),
+          bytes_read_output_level(0),
           bytes_written(0),
           bytes_moved(0),
-          files_in_leveln(0),
-          files_in_levelnp1(0),
-          files_out_levelnp1(0),
+          num_input_files_in_non_output_levels(0),
+          num_input_files_in_output_level(0),
+          num_output_files(0),
           num_input_records(0),
           num_dropped_records(0),
           count(_count) {}
 
     explicit CompactionStats(const CompactionStats& c)
         : micros(c.micros),
-          bytes_readn(c.bytes_readn),
-          bytes_readnp1(c.bytes_readnp1),
+          bytes_read_non_output_levels(c.bytes_read_non_output_levels),
+          bytes_read_output_level(c.bytes_read_output_level),
           bytes_written(c.bytes_written),
           bytes_moved(c.bytes_moved),
-          files_in_leveln(c.files_in_leveln),
-          files_in_levelnp1(c.files_in_levelnp1),
-          files_out_levelnp1(c.files_out_levelnp1),
+          num_input_files_in_non_output_levels(
+              c.num_input_files_in_non_output_levels),
+          num_input_files_in_output_level(
+              c.num_input_files_in_output_level),
+          num_output_files(c.num_output_files),
           num_input_records(c.num_input_records),
           num_dropped_records(c.num_dropped_records),
           count(c.count) {}
 
     void Add(const CompactionStats& c) {
       this->micros += c.micros;
-      this->bytes_readn += c.bytes_readn;
-      this->bytes_readnp1 += c.bytes_readnp1;
+      this->bytes_read_non_output_levels += c.bytes_read_non_output_levels;
+      this->bytes_read_output_level += c.bytes_read_output_level;
       this->bytes_written += c.bytes_written;
       this->bytes_moved += c.bytes_moved;
-      this->files_in_leveln += c.files_in_leveln;
-      this->files_in_levelnp1 += c.files_in_levelnp1;
-      this->files_out_levelnp1 += c.files_out_levelnp1;
+      this->num_input_files_in_non_output_levels +=
+          c.num_input_files_in_non_output_levels;
+      this->num_input_files_in_output_level +=
+          c.num_input_files_in_output_level;
+      this->num_output_files += c.num_output_files;
       this->num_input_records += c.num_input_records;
       this->num_dropped_records += c.num_dropped_records;
       this->count += c.count;
@@ -190,13 +211,15 @@ class InternalStats {
 
     void Subtract(const CompactionStats& c) {
       this->micros -= c.micros;
-      this->bytes_readn -= c.bytes_readn;
-      this->bytes_readnp1 -= c.bytes_readnp1;
+      this->bytes_read_non_output_levels -= c.bytes_read_non_output_levels;
+      this->bytes_read_output_level -= c.bytes_read_output_level;
       this->bytes_written -= c.bytes_written;
       this->bytes_moved -= c.bytes_moved;
-      this->files_in_leveln -= c.files_in_leveln;
-      this->files_in_levelnp1 -= c.files_in_levelnp1;
-      this->files_out_levelnp1 -= c.files_out_levelnp1;
+      this->num_input_files_in_non_output_levels -=
+          c.num_input_files_in_non_output_levels;
+      this->num_input_files_in_output_level -=
+          c.num_input_files_in_output_level;
+      this->num_output_files -= c.num_output_files;
       this->num_input_records -= c.num_input_records;
       this->num_dropped_records -= c.num_dropped_records;
       this->count -= c.count;
@@ -228,6 +251,10 @@ class InternalStats {
     db_stats_[type] += value;
   }
 
+  HistogramImpl* GetFileReadHist(int level) {
+    return &file_read_latency_[level];
+  }
+
   uint64_t GetBackgroundErrorCount() const { return bg_error_count_; }
 
   uint64_t BumpAndGetBackgroundErrorCount() { return ++bg_error_count_; }
@@ -255,6 +282,7 @@ class InternalStats {
   // These count the number of microseconds for which MakeRoomForWrite stalls.
   std::vector<uint64_t> stall_leveln_slowdown_count_hard_;
   std::vector<uint64_t> stall_leveln_slowdown_count_soft_;
+  std::vector<HistogramImpl> file_read_latency_;
 
   // Used to compute per-interval statistics
   struct CFStatsSnapshot {
@@ -325,9 +353,12 @@ class InternalStats {
 class InternalStats {
  public:
   enum InternalCFStatsType {
-    LEVEL0_SLOWDOWN,
+    LEVEL0_SLOWDOWN_TOTAL,
+    LEVEL0_SLOWDOWN_WITH_COMPACTION,
     MEMTABLE_COMPACTION,
-    LEVEL0_NUM_FILES,
+    LEVEL0_NUM_FILES_TOTAL,
+    LEVEL0_NUM_FILES_WITH_COMPACTION,
+    HARD_PENDING_COMPACTION_BYTES_LIMIT,
     WRITE_STALLS_ENUM_MAX,
     BYTES_FLUSHED,
     INTERNAL_CF_STATS_ENUM_MAX,
@@ -349,13 +380,13 @@ class InternalStats {
 
   struct CompactionStats {
     uint64_t micros;
-    uint64_t bytes_readn;
-    uint64_t bytes_readnp1;
+    uint64_t bytes_read_non_output_levels;
+    uint64_t bytes_read_output_level;
     uint64_t bytes_written;
     uint64_t bytes_moved;
-    int files_in_leveln;
-    int files_in_levelnp1;
-    int files_out_levelnp1;
+    int num_input_files_in_non_output_levels;
+    int num_input_files_in_output_level;
+    int num_output_files;
     uint64_t num_input_records;
     uint64_t num_dropped_records;
     int count;
@@ -379,6 +410,8 @@ class InternalStats {
 
   void AddDBStats(InternalDBStatsType type, uint64_t value) {}
 
+  HistogramImpl* GetFileReadHist(int level) { return nullptr; }
+
   uint64_t GetBackgroundErrorCount() const { return 0; }
 
   uint64_t BumpAndGetBackgroundErrorCount() { return 0; }
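The new per-level file_read_latency_ histograms are exposed through
GetFileReadHist(level), with the ROCKSDB_LITE variant returning nullptr. A
hedged sketch of the consumer side; HistogramImpl::Add() is assumed from
util/histogram.h, and the timing plumbing is illustrative:

    // Record one file-read latency sample (in micros) for a given level.
    void RecordReadLatency(rocksdb::InternalStats* stats, int level,
                           uint64_t micros) {
      rocksdb::HistogramImpl* hist = stats->GetFileReadHist(level);
      if (hist != nullptr) {  // nullptr under ROCKSDB_LITE
        hist->Add(micros);
      }
    }
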
diff --git a/src/rocksdb/db/job_context.h b/src/rocksdb/db/job_context.h
index d028144..5a54e2d 100644
--- a/src/rocksdb/db/job_context.h
+++ b/src/rocksdb/db/job_context.h
@@ -83,6 +83,10 @@ struct JobContext {
     new_superversion = create_superversion ? new SuperVersion() : nullptr;
   }
 
+  // For a non-empty JobContext, Clean() has to be called at least once
+  // before destruction (see asserts in ~JobContext()). Should be called with
+  // unlocked DB mutex. Destructor doesn't call Clean() to avoid accidentally
+  // doing potentially slow Clean() with locked DB mutex.
   void Clean() {
     // free pending memtables
     for (auto m : memtables_to_free) {
@@ -109,6 +113,7 @@ struct JobContext {
     assert(memtables_to_free.size() == 0);
     assert(superversions_to_free.size() == 0);
     assert(new_superversion == nullptr);
+    assert(logs_to_free.size() == 0);
   }
 };
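
The comment above amounts to a small lifecycle contract: fill the JobContext
under the DB mutex, release the mutex, call Clean(), and only then let the
destructor run. A self-contained toy sketch of that pattern; the type and
mutex below are stand-ins, not the RocksDB API:

    #include <cassert>
    #include <mutex>
    #include <vector>

    // Toy stand-in for JobContext, mirroring its Clean()/destructor contract.
    struct ToyJobContext {
      std::vector<int*> to_free;
      void Clean() {
        for (int* p : to_free) delete p;
        to_free.clear();
      }
      ~ToyJobContext() { assert(to_free.empty()); }  // like ~JobContext()
    };

    void BackgroundWork(std::mutex& db_mutex) {
      ToyJobContext job;
      {
        std::lock_guard<std::mutex> l(db_mutex);
        job.to_free.push_back(new int(42));  // work queued under the mutex
      }
      job.Clean();  // potentially slow; runs with the mutex released
    }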
 
diff --git a/src/rocksdb/db/listener_test.cc b/src/rocksdb/db/listener_test.cc
index a605bff..ce683a5 100644
--- a/src/rocksdb/db/listener_test.cc
+++ b/src/rocksdb/db/listener_test.cc
@@ -2,8 +2,9 @@
 //  This source code is licensed under the BSD-style license found in the
 //  LICENSE file in the root directory of this source tree. An additional grant
 //  of patent rights can be found in the PATENTS file in the same directory.
-#include "db/dbformat.h"
+
 #include "db/db_impl.h"
+#include "db/dbformat.h"
 #include "db/filename.h"
 #include "db/version_set.h"
 #include "db/write_batch_internal.h"
@@ -12,25 +13,25 @@
 #include "rocksdb/db.h"
 #include "rocksdb/env.h"
 #include "rocksdb/filter_policy.h"
+#include "rocksdb/options.h"
 #include "rocksdb/perf_context.h"
 #include "rocksdb/slice.h"
 #include "rocksdb/slice_transform.h"
 #include "rocksdb/table.h"
-#include "rocksdb/options.h"
 #include "rocksdb/table_properties.h"
 #include "table/block_based_table_factory.h"
 #include "table/plain_table_factory.h"
 #include "util/hash.h"
 #include "util/hash_linklist_rep.h"
-#include "utilities/merge_operators.h"
 #include "util/logging.h"
 #include "util/mutexlock.h"
 #include "util/rate_limiter.h"
 #include "util/statistics.h"
 #include "util/string_util.h"
-#include "util/testharness.h"
 #include "util/sync_point.h"
+#include "util/testharness.h"
 #include "util/testutil.h"
+#include "utilities/merge_operators.h"
 
 #ifndef ROCKSDB_LITE
 
@@ -134,7 +135,7 @@ class EventListenerTest : public testing::Test {
     return db_->Put(wo, handles_[cf], k, v);
   }
 
-  Status Flush(int cf = 0) {
+  Status Flush(size_t cf = 0) {
     FlushOptions opt = FlushOptions();
     opt.wait = true;
     if (cf == 0) {
@@ -144,6 +145,8 @@ class EventListenerTest : public testing::Test {
     }
   }
 
+  const size_t k110KB = 110 << 10;
+
   DB* db_;
   std::string dbname_;
   std::vector<ColumnFamilyHandle*> handles_;
@@ -156,6 +159,8 @@ class TestCompactionListener : public EventListener {
     compacted_dbs_.push_back(db);
     ASSERT_GT(ci.input_files.size(), 0U);
     ASSERT_GT(ci.output_files.size(), 0U);
+    ASSERT_EQ(db->GetEnv()->GetThreadID(), ci.thread_id);
+    ASSERT_GT(ci.thread_id, 0U);
   }
 
   std::vector<DB*> compacted_dbs_;
@@ -177,7 +182,9 @@ TEST_F(EventListenerTest, OnSingleDBCompactionTest) {
   options.max_bytes_for_level_base = options.target_file_size_base * 2;
   options.max_bytes_for_level_multiplier = 2;
   options.compression = kNoCompression;
+#if ROCKSDB_USING_THREAD_STATUS
   options.enable_thread_tracking = true;
+#endif  // ROCKSDB_USING_THREAD_STATUS
   options.level0_file_num_compaction_trigger = kNumL0Files;
 
   TestCompactionListener* listener = new TestCompactionListener();
@@ -194,10 +201,11 @@ TEST_F(EventListenerTest, OnSingleDBCompactionTest) {
   ASSERT_OK(Put(6, "alyosha", std::string(90000, 'a')));
   ASSERT_OK(Put(7, "popovich", std::string(90000, 'p')));
   for (size_t i = 1; i < 8; ++i) {
-    ASSERT_OK(Flush(static_cast<int>(i)));
+    ASSERT_OK(Flush(i));
     const Slice kStart = "a";
     const Slice kEnd = "z";
-    ASSERT_OK(dbfull()->CompactRange(handles_[i], &kStart, &kEnd));
+    ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[i],
+                                     &kStart, &kEnd));
     dbfull()->TEST_WaitForFlushMemTable();
     dbfull()->TEST_WaitForCompact();
   }
@@ -208,33 +216,85 @@ TEST_F(EventListenerTest, OnSingleDBCompactionTest) {
   }
 }
 
+// This simple Listener can only handle one flush at a time.
 class TestFlushListener : public EventListener {
  public:
+  explicit TestFlushListener(Env* env)
+      : slowdown_count(0), stop_count(0), db_closed(), env_(env) {
+    db_closed = false;
+  }
+  void OnTableFileCreated(
+      const TableFileCreationInfo& info) override {
+    // remember the info for later checking the FlushJobInfo.
+    prev_fc_info_ = info;
+    ASSERT_GT(info.db_name.size(), 0U);
+    ASSERT_GT(info.cf_name.size(), 0U);
+    ASSERT_GT(info.file_path.size(), 0U);
+    ASSERT_GT(info.job_id, 0);
+    ASSERT_GT(info.table_properties.data_size, 0U);
+    ASSERT_GT(info.table_properties.raw_key_size, 0U);
+    ASSERT_GT(info.table_properties.raw_value_size, 0U);
+    ASSERT_GT(info.table_properties.num_data_blocks, 0U);
+    ASSERT_GT(info.table_properties.num_entries, 0U);
+
+#if ROCKSDB_USING_THREAD_STATUS
+    // Verify the id of the current thread that created this table
+    // file matches the id of any active flush or compaction thread.
+    uint64_t thread_id = env_->GetThreadID();
+    std::vector<ThreadStatus> thread_list;
+    ASSERT_OK(env_->GetThreadList(&thread_list));
+    bool found_match = false;
+    for (auto thread_status : thread_list) {
+      if (thread_status.operation_type == ThreadStatus::OP_FLUSH ||
+          thread_status.operation_type == ThreadStatus::OP_COMPACTION) {
+        if (thread_id == thread_status.thread_id) {
+          found_match = true;
+          break;
+        }
+      }
+    }
+    ASSERT_TRUE(found_match);
+#endif  // ROCKSDB_USING_THREAD_STATUS
+  }
+
   void OnFlushCompleted(
-      DB* db, const std::string& name,
-      const std::string& file_path,
-      bool triggered_writes_slowdown,
-      bool triggered_writes_stop) override {
+      DB* db, const FlushJobInfo& info) override {
     flushed_dbs_.push_back(db);
-    flushed_column_family_names_.push_back(name);
-    if (triggered_writes_slowdown) {
+    flushed_column_family_names_.push_back(info.cf_name);
+    if (info.triggered_writes_slowdown) {
       slowdown_count++;
     }
-    if (triggered_writes_stop) {
+    if (info.triggered_writes_stop) {
       stop_count++;
     }
+    // verify whether the previously created file matches the flushed file.
+    ASSERT_EQ(prev_fc_info_.db_name, db->GetName());
+    ASSERT_EQ(prev_fc_info_.cf_name, info.cf_name);
+    ASSERT_EQ(prev_fc_info_.job_id, info.job_id);
+    ASSERT_EQ(prev_fc_info_.file_path, info.file_path);
+    ASSERT_EQ(db->GetEnv()->GetThreadID(), info.thread_id);
+    ASSERT_GT(info.thread_id, 0U);
   }
 
   std::vector<std::string> flushed_column_family_names_;
   std::vector<DB*> flushed_dbs_;
   int slowdown_count;
   int stop_count;
+  bool db_closing;
+  std::atomic_bool db_closed;
+  TableFileCreationInfo prev_fc_info_;
+
+ protected:
+  Env* env_;
 };
 
 TEST_F(EventListenerTest, OnSingleDBFlushTest) {
   Options options;
-  options.write_buffer_size = 100000;
-  TestFlushListener* listener = new TestFlushListener();
+  options.write_buffer_size = k110KB;
+#if ROCKSDB_USING_THREAD_STATUS
+  options.enable_thread_tracking = true;
+#endif  // ROCKSDB_USING_THREAD_STATUS
+  TestFlushListener* listener = new TestFlushListener(options.env);
   options.listeners.emplace_back(listener);
   std::vector<std::string> cf_names = {
       "pikachu", "ilya", "muromec", "dobrynia",
@@ -249,7 +309,7 @@ TEST_F(EventListenerTest, OnSingleDBFlushTest) {
   ASSERT_OK(Put(6, "alyosha", std::string(90000, 'a')));
   ASSERT_OK(Put(7, "popovich", std::string(90000, 'p')));
   for (size_t i = 1; i < 8; ++i) {
-    ASSERT_OK(Flush(static_cast<int>(i)));
+    ASSERT_OK(Flush(i));
     dbfull()->TEST_WaitForFlushMemTable();
     ASSERT_EQ(listener->flushed_dbs_.size(), i);
     ASSERT_EQ(listener->flushed_column_family_names_.size(), i);
@@ -264,8 +324,11 @@ TEST_F(EventListenerTest, OnSingleDBFlushTest) {
 
 TEST_F(EventListenerTest, MultiCF) {
   Options options;
-  options.write_buffer_size = 100000;
-  TestFlushListener* listener = new TestFlushListener();
+  options.write_buffer_size = k110KB;
+#if ROCKSDB_USING_THREAD_STATUS
+  options.enable_thread_tracking = true;
+#endif  // ROCKSDB_USING_THREAD_STATUS
+  TestFlushListener* listener = new TestFlushListener(options.env);
   options.listeners.emplace_back(listener);
   std::vector<std::string> cf_names = {
       "pikachu", "ilya", "muromec", "dobrynia",
@@ -280,7 +343,7 @@ TEST_F(EventListenerTest, MultiCF) {
   ASSERT_OK(Put(6, "alyosha", std::string(90000, 'a')));
   ASSERT_OK(Put(7, "popovich", std::string(90000, 'p')));
   for (size_t i = 1; i < 8; ++i) {
-    ASSERT_OK(Flush(static_cast<int>(i)));
+    ASSERT_OK(Flush(i));
     ASSERT_EQ(listener->flushed_dbs_.size(), i);
     ASSERT_EQ(listener->flushed_column_family_names_.size(), i);
   }
@@ -293,18 +356,21 @@ TEST_F(EventListenerTest, MultiCF) {
 }
 
 TEST_F(EventListenerTest, MultiDBMultiListeners) {
+  Options options;
+#if ROCKSDB_USING_THREAD_STATUS
+  options.enable_thread_tracking = true;
+#endif  // ROCKSDB_USING_THREAD_STATUS
   std::vector<TestFlushListener*> listeners;
   const int kNumDBs = 5;
   const int kNumListeners = 10;
   for (int i = 0; i < kNumListeners; ++i) {
-    listeners.emplace_back(new TestFlushListener());
+    listeners.emplace_back(new TestFlushListener(options.env));
   }
 
   std::vector<std::string> cf_names = {
       "pikachu", "ilya", "muromec", "dobrynia",
       "nikitich", "alyosha", "popovich"};
 
-  Options options;
   options.create_if_missing = true;
   for (int i = 0; i < kNumListeners; ++i) {
     options.listeners.emplace_back(listeners[i]);
@@ -355,6 +421,7 @@ TEST_F(EventListenerTest, MultiDBMultiListeners) {
     }
   }
 
+
   for (auto handles : vec_handles) {
     for (auto h : handles) {
       delete h;
@@ -370,11 +437,17 @@ TEST_F(EventListenerTest, MultiDBMultiListeners) {
 
 TEST_F(EventListenerTest, DisableBGCompaction) {
   Options options;
-  TestFlushListener* listener = new TestFlushListener();
+#if ROCKSDB_USING_THREAD_STATUS
+  options.enable_thread_tracking = true;
+#endif  // ROCKSDB_USING_THREAD_STATUS
+  TestFlushListener* listener = new TestFlushListener(options.env);
+  const int kCompactionTrigger = 1;
   const int kSlowdownTrigger = 5;
-  const int kStopTrigger = 10;
+  const int kStopTrigger = 100;
+  options.level0_file_num_compaction_trigger = kCompactionTrigger;
   options.level0_slowdown_writes_trigger = kSlowdownTrigger;
   options.level0_stop_writes_trigger = kStopTrigger;
+  options.max_write_buffer_number = 10;
   options.listeners.emplace_back(listener);
   // BG compaction is disabled.  Number of L0 files will simply keep
   // increasing in this test.
@@ -383,17 +456,17 @@ TEST_F(EventListenerTest, DisableBGCompaction) {
   options.write_buffer_size = 100000;  // Small write buffer
 
   CreateAndReopenWithCF({"pikachu"}, &options);
-  WriteOptions wopts;
-  wopts.timeout_hint_us = 100000;
   ColumnFamilyMetaData cf_meta;
   db_->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+
   // keep writing until writes are forced to stop.
-  for (int i = 0; static_cast<int>(cf_meta.file_count) < kStopTrigger; ++i) {
-    Put(1, ToString(i), std::string(100000, 'x'), wopts);
+  for (int i = 0; static_cast<int>(cf_meta.file_count) < kSlowdownTrigger * 10;
+       ++i) {
+    Put(1, ToString(i), std::string(10000, 'x'), WriteOptions());
+    db_->Flush(FlushOptions(), handles_[1]);
     db_->GetColumnFamilyMetaData(handles_[1], &cf_meta);
   }
-  ASSERT_GE(listener->slowdown_count, kStopTrigger - kSlowdownTrigger);
-  ASSERT_GE(listener->stop_count, 1);
+  ASSERT_GE(listener->slowdown_count, kSlowdownTrigger * 9);
 }
 
 }  // namespace rocksdb
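
For reference, listeners like the ones exercised above are attached through
Options::listeners before the DB is opened. A minimal hedged sketch of that
wiring, using only the OnFlushCompleted() signature shown in this patch:

    #include <string>
    #include "rocksdb/db.h"
    #include "rocksdb/listener.h"

    class CountingListener : public rocksdb::EventListener {
     public:
      void OnFlushCompleted(rocksdb::DB* /*db*/,
                            const rocksdb::FlushJobInfo& /*info*/) override {
        ++flush_count_;  // called once per completed flush
      }
      int flush_count_ = 0;
    };

    rocksdb::Status OpenWithListener(const std::string& path,
                                     rocksdb::DB** db) {
      rocksdb::Options options;
      options.create_if_missing = true;
      options.listeners.emplace_back(new CountingListener());
      return rocksdb::DB::Open(options, path, db);
    }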
diff --git a/src/rocksdb/db/log_reader.cc b/src/rocksdb/db/log_reader.cc
index f6514cf..296f1d5 100644
--- a/src/rocksdb/db/log_reader.cc
+++ b/src/rocksdb/db/log_reader.cc
@@ -13,6 +13,7 @@
 #include "rocksdb/env.h"
 #include "util/coding.h"
 #include "util/crc32c.h"
+#include "util/file_reader_writer.h"
 
 namespace rocksdb {
 namespace log {
@@ -20,7 +21,7 @@ namespace log {
 Reader::Reporter::~Reporter() {
 }
 
-Reader::Reader(unique_ptr<SequentialFile>&& _file, Reporter* reporter,
+Reader::Reader(unique_ptr<SequentialFileReader>&& _file, Reporter* reporter,
                bool checksum, uint64_t initial_offset)
     : file_(std::move(_file)),
       reporter_(reporter),
@@ -61,7 +62,8 @@ bool Reader::SkipToInitialBlock() {
   return true;
 }
 
-bool Reader::ReadRecord(Slice* record, std::string* scratch) {
+bool Reader::ReadRecord(Slice* record, std::string* scratch,
+                        const bool report_eof_inconsistency) {
   if (last_record_offset_ < initial_offset_) {
     if (!SkipToInitialBlock()) {
       return false;
@@ -78,7 +80,8 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch) {
   Slice fragment;
   while (true) {
     uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size();
-    const unsigned int record_type = ReadPhysicalRecord(&fragment);
+    const unsigned int record_type =
+        ReadPhysicalRecord(&fragment, report_eof_inconsistency);
     switch (record_type) {
       case kFullType:
         if (in_fragmented_record && !scratch->empty()) {
@@ -130,6 +133,9 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch) {
 
       case kEof:
         if (in_fragmented_record) {
+          if (report_eof_inconsistency) {
+            ReportCorruption(scratch->size(), "error reading trailing data");
+          }
           // This can be caused by the writer dying immediately after
           //  writing a physical record but before completing the next; don't
           //  treat it as a corruption, just ignore the entire logical record.
@@ -238,7 +244,8 @@ void Reader::ReportDrop(size_t bytes, const Status& reason) {
   }
 }
 
-unsigned int Reader::ReadPhysicalRecord(Slice* result) {
+unsigned int Reader::ReadPhysicalRecord(Slice* result,
+                                        const bool report_eof_inconsistency) {
   while (true) {
     if (buffer_.size() < (size_t)kHeaderSize) {
       if (!eof_ && !read_error_) {
@@ -259,8 +266,11 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result) {
       } else {
         // Note that if buffer_ is non-empty, we have a truncated header at the
         //  end of the file, which can be caused by the writer crashing in the
-        //  middle of writing the header. Instead of considering this an error,
-        //  just report EOF.
+        //  middle of writing the header. Unless explicitly requested, we
+        //  don't consider this an error; we just report EOF.
+        if (buffer_.size() && report_eof_inconsistency) {
+          ReportCorruption(buffer_.size(), "truncated header");
+        }
         buffer_.clear();
         return kEof;
       }
@@ -281,7 +291,10 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result) {
       }
       // If the end of the file has been reached without reading |length| bytes
       // of payload, assume the writer died in the middle of writing the record.
-      // Don't report a corruption.
+      // Don't report a corruption unless requested.
+      if (drop_size && report_eof_inconsistency) {
+        ReportCorruption(drop_size, "truncated header");
+      }
       return kEof;
     }
 
diff --git a/src/rocksdb/db/log_reader.h b/src/rocksdb/db/log_reader.h
index a7cf45b..390696b 100644
--- a/src/rocksdb/db/log_reader.h
+++ b/src/rocksdb/db/log_reader.h
@@ -17,7 +17,7 @@
 
 namespace rocksdb {
 
-class SequentialFile;
+class SequentialFileReader;
 using std::unique_ptr;
 
 namespace log {
@@ -51,7 +51,7 @@ class Reader {
   //
   // The Reader will start reading at the first record located at physical
   // position >= initial_offset within the file.
-  Reader(unique_ptr<SequentialFile>&& file, Reporter* reporter,
+  Reader(unique_ptr<SequentialFileReader>&& file, Reporter* reporter,
          bool checksum, uint64_t initial_offset);
 
   ~Reader();
@@ -61,7 +61,8 @@ class Reader {
   // "*scratch" as temporary storage.  The contents filled in *record
   // will only be valid until the next mutating operation on this
   // reader or the next mutation to *scratch.
-  bool ReadRecord(Slice* record, std::string* scratch);
+  bool ReadRecord(Slice* record, std::string* scratch,
+                  bool report_eof_inconsistency = false);
 
   // Returns the physical offset of the last record returned by ReadRecord.
   //
@@ -80,10 +81,10 @@ class Reader {
   // block that was partially read.
   void UnmarkEOF();
 
-  SequentialFile* file() { return file_.get(); }
+  SequentialFileReader* file() { return file_.get(); }
 
  private:
-  const unique_ptr<SequentialFile> file_;
+  const unique_ptr<SequentialFileReader> file_;
   Reporter* const reporter_;
   bool const checksum_;
   char* const backing_store_;
@@ -120,7 +121,8 @@ class Reader {
   bool SkipToInitialBlock();
 
   // Return type, or one of the preceding special values
-  unsigned int ReadPhysicalRecord(Slice* result);
+  unsigned int ReadPhysicalRecord(Slice* result,
+                                  bool report_eof_inconsistency = false);
 
   // Reports dropped bytes to the reporter.
   // buffer_ must be updated to remove the dropped bytes prior to invocation.
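
The new report_eof_inconsistency flag only changes what happens at a torn
tail: with it set, truncated trailing data is surfaced to the Reporter as
corruption instead of being silently swallowed. A hedged sketch of a read
loop using the signature above (Reader construction elided):

    #include <string>
    #include "db/log_reader.h"
    #include "rocksdb/slice.h"

    // Drain all records; with strict == true a truncated tail is reported
    // to the Reader's Reporter as corruption rather than ignored.
    void DrainLog(rocksdb::log::Reader* reader, bool strict) {
      rocksdb::Slice record;
      std::string scratch;
      while (reader->ReadRecord(&record, &scratch, strict)) {
        // consume record ...
      }
    }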
diff --git a/src/rocksdb/db/log_test.cc b/src/rocksdb/db/log_test.cc
index 816e38d..5ab41f2 100644
--- a/src/rocksdb/db/log_test.cc
+++ b/src/rocksdb/db/log_test.cc
@@ -12,8 +12,10 @@
 #include "rocksdb/env.h"
 #include "util/coding.h"
 #include "util/crc32c.h"
+#include "util/file_reader_writer.h"
 #include "util/random.h"
 #include "util/testharness.h"
+#include "util/testutil.h"
 
 namespace rocksdb {
 namespace log {
@@ -43,46 +45,6 @@ static std::string RandomSkewedString(int i, Random* rnd) {
 
 class LogTest : public testing::Test {
  private:
-  class StringDest : public WritableFile {
-   public:
-    std::string contents_;
-
-    explicit StringDest(Slice& reader_contents) :
-      WritableFile(),
-      contents_(""),
-      reader_contents_(reader_contents),
-      last_flush_(0) {
-      reader_contents_ = Slice(contents_.data(), 0);
-    };
-
-    virtual Status Close() override { return Status::OK(); }
-    virtual Status Flush() override {
-      EXPECT_TRUE(reader_contents_.size() <= last_flush_);
-      size_t offset = last_flush_ - reader_contents_.size();
-      reader_contents_ = Slice(
-          contents_.data() + offset,
-          contents_.size() - offset);
-      last_flush_ = contents_.size();
-
-      return Status::OK();
-    }
-    virtual Status Sync() override { return Status::OK(); }
-    virtual Status Append(const Slice& slice) override {
-      contents_.append(slice.data(), slice.size());
-      return Status::OK();
-    }
-    void Drop(size_t bytes) {
-      contents_.resize(contents_.size() - bytes);
-      reader_contents_ = Slice(
-          reader_contents_.data(), reader_contents_.size() - bytes);
-      last_flush_ = contents_.size();
-    }
-
-   private:
-    Slice& reader_contents_;
-    size_t last_flush_;
-  };
-
   class StringSource : public SequentialFile {
    public:
     Slice& contents_;
@@ -163,26 +125,28 @@ class LogTest : public testing::Test {
   };
 
   std::string& dest_contents() {
-    auto dest = dynamic_cast<StringDest*>(writer_.file());
+    auto dest =
+      dynamic_cast<test::StringSink*>(writer_.file()->writable_file());
     assert(dest);
     return dest->contents_;
   }
 
   const std::string& dest_contents() const {
-    auto dest = dynamic_cast<const StringDest*>(writer_.file());
+    auto dest =
+      dynamic_cast<const test::StringSink*>(writer_.file()->writable_file());
     assert(dest);
     return dest->contents_;
   }
 
   void reset_source_contents() {
-    auto src = dynamic_cast<StringSource*>(reader_.file());
+    auto src = dynamic_cast<StringSource*>(reader_.file()->file());
     assert(src);
     src->contents_ = dest_contents();
   }
 
   Slice reader_contents_;
-  unique_ptr<StringDest> dest_holder_;
-  unique_ptr<StringSource> source_holder_;
+  unique_ptr<WritableFileWriter> dest_holder_;
+  unique_ptr<SequentialFileReader> source_holder_;
   ReportCollector report_;
   Writer writer_;
   Reader reader_;
@@ -192,13 +156,16 @@ class LogTest : public testing::Test {
   static uint64_t initial_offset_last_record_offsets_[];
 
  public:
-  LogTest() : reader_contents_(),
-              dest_holder_(new StringDest(reader_contents_)),
-              source_holder_(new StringSource(reader_contents_)),
-              writer_(std::move(dest_holder_)),
-              reader_(std::move(source_holder_), &report_, true/*checksum*/,
-                      0/*initial_offset*/) {
-  }
+  LogTest()
+      : reader_contents_(),
+        dest_holder_(
+            test::GetWritableFileWriter(
+              new test::StringSink(&reader_contents_))),
+        source_holder_(
+            test::GetSequentialFileReader(new StringSource(reader_contents_))),
+        writer_(std::move(dest_holder_)),
+        reader_(std::move(source_holder_), &report_, true /*checksum*/,
+                0 /*initial_offset*/) {}
 
   void Write(const std::string& msg) {
     writer_.AddRecord(Slice(msg));
@@ -208,10 +175,10 @@ class LogTest : public testing::Test {
     return dest_contents().size();
   }
 
-  std::string Read() {
+  std::string Read(const bool report_eof_inconsistency = false) {
     std::string scratch;
     Slice record;
-    if (reader_.ReadRecord(&record, &scratch)) {
+    if (reader_.ReadRecord(&record, &scratch, report_eof_inconsistency)) {
       return record.ToString();
     } else {
       return "EOF";
@@ -227,7 +194,8 @@ class LogTest : public testing::Test {
   }
 
   void ShrinkSize(int bytes) {
-    auto dest = dynamic_cast<StringDest*>(writer_.file());
+    auto dest =
+      dynamic_cast<test::StringSink*>(writer_.file()->writable_file());
     assert(dest);
     dest->Drop(bytes);
   }
@@ -240,7 +208,7 @@ class LogTest : public testing::Test {
   }
 
   void ForceError(size_t position = 0) {
-    auto src = dynamic_cast<StringSource*>(reader_.file());
+    auto src = dynamic_cast<StringSource*>(reader_.file()->file());
     src->force_error_ = true;
     src->force_error_position_ = position;
   }
@@ -254,13 +222,13 @@ class LogTest : public testing::Test {
   }
 
   void ForceEOF(size_t position = 0) {
-    auto src = dynamic_cast<StringSource*>(reader_.file());
+    auto src = dynamic_cast<StringSource*>(reader_.file()->file());
     src->force_eof_ = true;
     src->force_eof_position_ = position;
   }
 
   void UnmarkEOF() {
-    auto src = dynamic_cast<StringSource*>(reader_.file());
+    auto src = dynamic_cast<StringSource*>(reader_.file()->file());
     src->returned_partial_ = false;
     reader_.UnmarkEOF();
   }
@@ -288,10 +256,11 @@ class LogTest : public testing::Test {
 
   void CheckOffsetPastEndReturnsNoRecords(uint64_t offset_past_end) {
     WriteInitialOffsetLog();
-    unique_ptr<StringSource> source(new StringSource(reader_contents_));
+    unique_ptr<SequentialFileReader> file_reader(
+        test::GetSequentialFileReader(new StringSource(reader_contents_)));
     unique_ptr<Reader> offset_reader(
-      new Reader(std::move(source), &report_, true/*checksum*/,
-                 WrittenBytes() + offset_past_end));
+        new Reader(std::move(file_reader), &report_, true /*checksum*/,
+                   WrittenBytes() + offset_past_end));
     Slice record;
     std::string scratch;
     ASSERT_TRUE(!offset_reader->ReadRecord(&record, &scratch));
@@ -300,10 +269,10 @@ class LogTest : public testing::Test {
   void CheckInitialOffsetRecord(uint64_t initial_offset,
                                 int expected_record_offset) {
     WriteInitialOffsetLog();
-    unique_ptr<StringSource> source(new StringSource(reader_contents_));
-    unique_ptr<Reader> offset_reader(
-      new Reader(std::move(source), &report_, true/*checksum*/,
-                 initial_offset));
+    unique_ptr<SequentialFileReader> file_reader(
+        test::GetSequentialFileReader(new StringSource(reader_contents_)));
+    unique_ptr<Reader> offset_reader(new Reader(
+        std::move(file_reader), &report_, true /*checksum*/, initial_offset));
     Slice record;
     std::string scratch;
     ASSERT_TRUE(offset_reader->ReadRecord(&record, &scratch));
@@ -452,6 +421,15 @@ TEST_F(LogTest, TruncatedTrailingRecordIsIgnored) {
   ASSERT_EQ("", ReportMessage());
 }
 
+TEST_F(LogTest, TruncatedTrailingRecordIsNotIgnored) {
+  Write("foo");
+  ShrinkSize(4);  // Drop all payload as well as a header byte
+  ASSERT_EQ("EOF", Read(/*report_eof_inconsistency*/ true));
+  // The truncated last record is not ignored; it is reported as corruption.
+  ASSERT_GT(DroppedBytes(), 0U);
+  ASSERT_EQ("OK", MatchError("Corruption: truncated header"));
+}
+
 TEST_F(LogTest, BadLength) {
   const int kPayloadSize = kBlockSize - kHeaderSize;
   Write(BigString("bar", kPayloadSize));
@@ -471,6 +449,14 @@ TEST_F(LogTest, BadLengthAtEndIsIgnored) {
   ASSERT_EQ("", ReportMessage());
 }
 
+TEST_F(LogTest, BadLengthAtEndIsNotIgnored) {
+  Write("foo");
+  ShrinkSize(1);
+  ASSERT_EQ("EOF", Read(/*report_eof_inconsistency=*/true));
+  ASSERT_GT(DroppedBytes(), 0U);
+  ASSERT_EQ("OK", MatchError("Corruption: truncated header"));
+}
+
 TEST_F(LogTest, ChecksumMismatch) {
   Write("foo");
   IncrementByte(0, 10);
@@ -528,6 +514,15 @@ TEST_F(LogTest, MissingLastIsIgnored) {
   ASSERT_EQ(0U, DroppedBytes());
 }
 
+TEST_F(LogTest, MissingLastIsNotIgnored) {
+  Write(BigString("bar", kBlockSize));
+  // Remove the LAST block, including header.
+  ShrinkSize(14);
+  ASSERT_EQ("EOF", Read(/*report_eof_inconsistency=*/true));
+  ASSERT_GT(DroppedBytes(), 0U);
+  ASSERT_EQ("OK", MatchError("Corruption: error reading trailing data"));
+}
+
 TEST_F(LogTest, PartialLastIsIgnored) {
   Write(BigString("bar", kBlockSize));
   // Cause a bad record length in the LAST block.
@@ -537,6 +532,17 @@ TEST_F(LogTest, PartialLastIsIgnored) {
   ASSERT_EQ(0U, DroppedBytes());
 }
 
+TEST_F(LogTest, PartialLastIsNotIgnored) {
+  Write(BigString("bar", kBlockSize));
+  // Cause a bad record length in the LAST block.
+  ShrinkSize(1);
+  ASSERT_EQ("EOF", Read(/*report_eof_inconsistency=*/true));
+  ASSERT_GT(DroppedBytes(), 0U);
+  ASSERT_EQ("OK", MatchError(
+                      "Corruption: truncated headerCorruption: "
+                      "error reading trailing data"));
+}
+
 TEST_F(LogTest, ErrorJoinsRecords) {
   // Consider two fragmented records:
   //    first(R1) last(R1) first(R2) last(R2)
diff --git a/src/rocksdb/db/log_writer.cc b/src/rocksdb/db/log_writer.cc
index d78de5e..32d4afd 100644
--- a/src/rocksdb/db/log_writer.cc
+++ b/src/rocksdb/db/log_writer.cc
@@ -13,13 +13,13 @@
 #include "rocksdb/env.h"
 #include "util/coding.h"
 #include "util/crc32c.h"
+#include "util/file_reader_writer.h"
 
 namespace rocksdb {
 namespace log {
 
-Writer::Writer(unique_ptr<WritableFile>&& dest)
-    : dest_(std::move(dest)),
-      block_offset_(0) {
+Writer::Writer(unique_ptr<WritableFileWriter>&& dest)
+    : dest_(std::move(dest)), block_offset_(0) {
   for (int i = 0; i <= kMaxRecordType; i++) {
     char t = static_cast<char>(i);
     type_crc_[i] = crc32c::Value(&t, 1);
diff --git a/src/rocksdb/db/log_writer.h b/src/rocksdb/db/log_writer.h
index 46226ec..6b59bbd 100644
--- a/src/rocksdb/db/log_writer.h
+++ b/src/rocksdb/db/log_writer.h
@@ -6,17 +6,19 @@
 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
-
 #pragma once
-#include <memory>
+
 #include <stdint.h>
+
+#include <memory>
+
 #include "db/log_format.h"
 #include "rocksdb/slice.h"
 #include "rocksdb/status.h"
 
 namespace rocksdb {
 
-class WritableFile;
+class WritableFileWriter;
 
 using std::unique_ptr;
 
@@ -61,16 +63,16 @@ class Writer {
   // Create a writer that will append data to "*dest".
   // "*dest" must be initially empty.
   // "*dest" must remain live while this Writer is in use.
-  explicit Writer(unique_ptr<WritableFile>&& dest);
+  explicit Writer(unique_ptr<WritableFileWriter>&& dest);
   ~Writer();
 
   Status AddRecord(const Slice& slice);
 
-  WritableFile* file() { return dest_.get(); }
-  const WritableFile* file() const { return dest_.get(); }
+  WritableFileWriter* file() { return dest_.get(); }
+  const WritableFileWriter* file() const { return dest_.get(); }
 
  private:
-  unique_ptr<WritableFile> dest_;
+  unique_ptr<WritableFileWriter> dest_;
   int block_offset_;       // Current offset in block
 
   // crc32c values for all supported record types.  These are
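
With this change the Writer owns a WritableFileWriter instead of a raw
WritableFile. A hedged construction sketch; the WritableFileWriter
constructor signature is assumed from util/file_reader_writer.h in this
tree:

    #include <memory>
    #include <string>
    #include "db/log_writer.h"
    #include "rocksdb/env.h"
    #include "util/file_reader_writer.h"

    // Open fname and append a single record through the new writer stack.
    rocksdb::Status WriteOneRecord(rocksdb::Env* env,
                                   const std::string& fname) {
      std::unique_ptr<rocksdb::WritableFile> file;
      rocksdb::EnvOptions env_options;
      rocksdb::Status s = env->NewWritableFile(fname, &file, env_options);
      if (!s.ok()) {
        return s;
      }
      std::unique_ptr<rocksdb::WritableFileWriter> dest(
          new rocksdb::WritableFileWriter(std::move(file), env_options));
      rocksdb::log::Writer writer(std::move(dest));
      return writer.AddRecord(rocksdb::Slice("payload"));
    }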
diff --git a/src/rocksdb/db/managed_iterator.cc b/src/rocksdb/db/managed_iterator.cc
index 8dd5f4d..45faeba 100644
--- a/src/rocksdb/db/managed_iterator.cc
+++ b/src/rocksdb/db/managed_iterator.cc
@@ -93,6 +93,7 @@ ManagedIterator::~ManagedIterator() {
     snapshot_created_ = false;
     read_options_.snapshot = nullptr;
   }
+  UnLock();
 }
 
 bool ManagedIterator::Valid() const { return valid_; }
diff --git a/src/rocksdb/db/memtable.cc b/src/rocksdb/db/memtable.cc
index 76392d6..54c119e 100644
--- a/src/rocksdb/db/memtable.cc
+++ b/src/rocksdb/db/memtable.cc
@@ -54,7 +54,7 @@ MemTableOptions::MemTableOptions(
 MemTable::MemTable(const InternalKeyComparator& cmp,
                    const ImmutableCFOptions& ioptions,
                    const MutableCFOptions& mutable_cf_options,
-                   WriteBuffer* write_buffer)
+                   WriteBuffer* write_buffer, SequenceNumber earliest_seq)
     : comparator_(cmp),
       moptions_(ioptions, mutable_cf_options),
       refs_(0),
@@ -64,12 +64,14 @@ MemTable::MemTable(const InternalKeyComparator& cmp,
       table_(ioptions.memtable_factory->CreateMemTableRep(
           comparator_, &allocator_, ioptions.prefix_extractor,
           ioptions.info_log)),
+      data_size_(0),
       num_entries_(0),
       num_deletes_(0),
       flush_in_progress_(false),
       flush_completed_(false),
       file_number_(0),
       first_seqno_(0),
+      earliest_seqno_(earliest_seq),
       mem_next_logfile_number_(0),
       locks_(moptions_.inplace_update_support
                  ? moptions_.inplace_update_num_locks
@@ -110,7 +112,7 @@ bool MemTable::ShouldFlushNow() const {
   // In a lot of times, we cannot allocate arena blocks that exactly matches the
   // buffer size. Thus we have to decide if we should over-allocate or
   // under-allocate.
-  // This constant avariable can be interpreted as: if we still have more than
+  // This constant variable can be interpreted as: if we still have more than
   // "kAllowOverAllocationRatio * kArenaBlockSize" space left, we'd try to over
   // allocate one more block.
   const double kAllowOverAllocationRatio = 0.6;
@@ -228,10 +230,15 @@ class MemTableIterator: public Iterator {
   virtual void Seek(const Slice& k) override {
     PERF_TIMER_GUARD(seek_on_memtable_time);
     PERF_COUNTER_ADD(seek_on_memtable_count, 1);
-    if (bloom_ != nullptr &&
-        !bloom_->MayContain(prefix_extractor_->Transform(ExtractUserKey(k)))) {
-      valid_ = false;
-      return;
+    if (bloom_ != nullptr) {
+      if (!bloom_->MayContain(
+              prefix_extractor_->Transform(ExtractUserKey(k)))) {
+        PERF_COUNTER_ADD(bloom_memtable_miss_count, 1);
+        valid_ = false;
+        return;
+      } else {
+        PERF_COUNTER_ADD(bloom_memtable_hit_count, 1);
+      }
     }
     iter_->Seek(k, nullptr);
     valid_ = iter_->Valid();
@@ -289,6 +296,26 @@ port::RWMutex* MemTable::GetLock(const Slice& key) {
   return &locks_[hash(key) % locks_.size()];
 }
 
+uint64_t MemTable::ApproximateSize(const Slice& start_ikey,
+                                   const Slice& end_ikey) {
+  uint64_t entry_count = table_->ApproximateNumEntries(start_ikey, end_ikey);
+  if (entry_count == 0) {
+    return 0;
+  }
+  uint64_t n = num_entries_.load(std::memory_order_relaxed);
+  if (n == 0) {
+    return 0;
+  }
+  if (entry_count > n) {
+    // table_->ApproximateNumEntries() is just an estimate so it can be larger
+    // than actual entries we have. Cap it to entries we have to limit the
+    // inaccuracy.
+    entry_count = n;
+  }
+  uint64_t data_size = data_size_.load(std::memory_order_relaxed);
+  return entry_count * (data_size / n);
+}
+
 void MemTable::Add(SequenceNumber s, ValueType type,
                    const Slice& key, /* user key */
                    const Slice& value) {
@@ -309,13 +336,17 @@ void MemTable::Add(SequenceNumber s, ValueType type,
   char* p = EncodeVarint32(buf, internal_key_size);
   memcpy(p, key.data(), key_size);
   p += key_size;
-  EncodeFixed64(p, (s << 8) | type);
+  uint64_t packed = PackSequenceAndType(s, type);
+  EncodeFixed64(p, packed);
   p += 8;
   p = EncodeVarint32(p, val_size);
   memcpy(p, value.data(), val_size);
   assert((unsigned)(p + val_size - buf) == (unsigned)encoded_len);
   table_->Insert(handle);
-  num_entries_++;
+  num_entries_.store(num_entries_.load(std::memory_order_relaxed) + 1,
+                     std::memory_order_relaxed);
+  data_size_.store(data_size_.load(std::memory_order_relaxed) + encoded_len,
+                   std::memory_order_relaxed);
   if (type == kTypeDeletion) {
     num_deletes_++;
   }
@@ -329,6 +360,11 @@ void MemTable::Add(SequenceNumber s, ValueType type,
   assert(first_seqno_ == 0 || s > first_seqno_);
   if (first_seqno_ == 0) {
     first_seqno_ = s;
+
+    if (earliest_seqno_ == kMaxSequenceNumber) {
+      earliest_seqno_ = first_seqno_;
+    }
+    assert(first_seqno_ >= earliest_seqno_);
   }
 
   should_flush_ = ShouldFlushNow();
@@ -343,6 +379,7 @@ struct Saver {
   bool* found_final_value;  // Is value set correctly? Used by KeyMayExist
   bool* merge_in_progress;
   std::string* value;
+  SequenceNumber seq;
   const MergeOperator* merge_operator;
   // the merge operations encountered;
   MergeContext* merge_context;
@@ -372,11 +409,14 @@ static bool SaveValue(void* arg, const char* entry) {
   // all entries with overly large sequence numbers.
   uint32_t key_length;
   const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
-  if (s->mem->GetInternalKeyComparator().user_comparator()->Compare(
-          Slice(key_ptr, key_length - 8), s->key->user_key()) == 0) {
+  if (s->mem->GetInternalKeyComparator().user_comparator()->Equal(
+          Slice(key_ptr, key_length - 8), s->key->user_key())) {
     // Correct user key
     const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
-    switch (static_cast<ValueType>(tag & 0xff)) {
+    ValueType type;
+    UnPackSequenceAndType(tag, &s->seq, &type);
+
+    switch (type) {
       case kTypeValue: {
         if (s->inplace_update_support) {
           s->mem->GetLock(s->key->user_key())->ReadLock();
@@ -409,9 +449,10 @@ static bool SaveValue(void* arg, const char* entry) {
         *(s->found_final_value) = true;
         return false;
       }
-      case kTypeDeletion: {
+      case kTypeDeletion:
+      case kTypeSingleDeletion: {
         if (*(s->merge_in_progress)) {
-          assert(merge_operator);
+          assert(merge_operator != nullptr);
           *(s->status) = Status::OK();
           bool merge_success = false;
           {
@@ -461,7 +502,7 @@ static bool SaveValue(void* arg, const char* entry) {
 }
 
 bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
-                   MergeContext* merge_context) {
+                   MergeContext* merge_context, SequenceNumber* seq) {
   // The sequence number is updated synchronously in version_set.h
   if (IsEmpty()) {
     // Avoiding recording stats for speed.
@@ -472,18 +513,25 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
   Slice user_key = key.user_key();
   bool found_final_value = false;
   bool merge_in_progress = s->IsMergeInProgress();
-
-  if (prefix_bloom_ &&
-      !prefix_bloom_->MayContain(prefix_extractor_->Transform(user_key))) {
+  bool const may_contain =
+      nullptr == prefix_bloom_
+          ? false
+          : prefix_bloom_->MayContain(prefix_extractor_->Transform(user_key));
+  if (prefix_bloom_ && !may_contain) {
     // iter is null if prefix bloom says the key does not exist
+    PERF_COUNTER_ADD(bloom_memtable_miss_count, 1);
+    *seq = kMaxSequenceNumber;
   } else {
+    if (prefix_bloom_) {
+      PERF_COUNTER_ADD(bloom_memtable_hit_count, 1);
+    }
     Saver saver;
     saver.status = s;
     saver.found_final_value = &found_final_value;
     saver.merge_in_progress = &merge_in_progress;
     saver.key = &key;
     saver.value = value;
-    saver.status = s;
+    saver.seq = kMaxSequenceNumber;
     saver.mem = this;
     saver.merge_context = merge_context;
     saver.merge_operator = moptions_.merge_operator;
@@ -492,11 +540,13 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
     saver.statistics = moptions_.statistics;
     saver.env_ = env_;
     table_->Get(key, &saver, SaveValue);
+
+    *seq = saver.seq;
   }
 
   // No change to value, since we have not yet found a Put/Delete
   if (!found_final_value && merge_in_progress) {
-    *s = Status::MergeInProgress("");
+    *s = Status::MergeInProgress();
   }
   PERF_COUNTER_ADD(get_from_memtable_count, 1);
   return found_final_value;
@@ -525,11 +575,14 @@ void MemTable::Update(SequenceNumber seq,
     const char* entry = iter->key();
     uint32_t key_length = 0;
     const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
-    if (comparator_.comparator.user_comparator()->Compare(
-        Slice(key_ptr, key_length - 8), lkey.user_key()) == 0) {
+    if (comparator_.comparator.user_comparator()->Equal(
+            Slice(key_ptr, key_length - 8), lkey.user_key())) {
       // Correct user key
       const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
-      switch (static_cast<ValueType>(tag & 0xff)) {
+      ValueType type;
+      SequenceNumber unused;
+      UnPackSequenceAndType(tag, &unused, &type);
+      switch (type) {
         case kTypeValue: {
           Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length);
           uint32_t prev_size = static_cast<uint32_t>(prev_value.size());
@@ -583,11 +636,14 @@ bool MemTable::UpdateCallback(SequenceNumber seq,
     const char* entry = iter->key();
     uint32_t key_length = 0;
     const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
-    if (comparator_.comparator.user_comparator()->Compare(
-        Slice(key_ptr, key_length - 8), lkey.user_key()) == 0) {
+    if (comparator_.comparator.user_comparator()->Equal(
+            Slice(key_ptr, key_length - 8), lkey.user_key())) {
       // Correct user key
       const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
-      switch (static_cast<ValueType>(tag & 0xff)) {
+      ValueType type;
+      uint64_t unused;
+      UnPackSequenceAndType(tag, &unused, &type);
+      switch (type) {
         case kTypeValue: {
           Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length);
           uint32_t prev_size = static_cast<uint32_t>(prev_value.size());
@@ -651,13 +707,16 @@ size_t MemTable::CountSuccessiveMergeEntries(const LookupKey& key) {
     const char* entry = iter->key();
     uint32_t key_length = 0;
     const char* iter_key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
-    if (comparator_.comparator.user_comparator()->Compare(
-            Slice(iter_key_ptr, key_length - 8), key.user_key()) != 0) {
+    if (!comparator_.comparator.user_comparator()->Equal(
+            Slice(iter_key_ptr, key_length - 8), key.user_key())) {
       break;
     }
 
     const uint64_t tag = DecodeFixed64(iter_key_ptr + key_length - 8);
-    if (static_cast<ValueType>(tag & 0xff) != kTypeMerge) {
+    ValueType type;
+    uint64_t unused;
+    UnPackSequenceAndType(tag, &unused, &type);
+    if (type != kTypeMerge) {
       break;
     }
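
MemTable::ApproximateSize() above is a simple estimate: clamp the per-range
entry count to the number of entries actually present, then multiply by the
mean encoded entry size. A standalone sketch of that arithmetic (not the
MemTable member itself):

    #include <algorithm>
    #include <cstdint>

    // Mirrors the estimate: min(est_entries, total_entries) * avg entry size.
    uint64_t ApproximateRangeSize(uint64_t est_entries_in_range,
                                  uint64_t total_entries,
                                  uint64_t total_bytes) {
      if (est_entries_in_range == 0 || total_entries == 0) {
        return 0;
      }
      est_entries_in_range = std::min(est_entries_in_range, total_entries);
      return est_entries_in_range * (total_bytes / total_entries);
    }

    // e.g. 100 entries estimated in range, 1000 entries totalling 64000
    // bytes => 100 * (64000 / 1000) = 6400 bytes.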
 
diff --git a/src/rocksdb/db/memtable.h b/src/rocksdb/db/memtable.h
index aa26b32..f09082c 100644
--- a/src/rocksdb/db/memtable.h
+++ b/src/rocksdb/db/memtable.h
@@ -80,10 +80,17 @@ class MemTable {
 
   // MemTables are reference counted.  The initial reference count
   // is zero and the caller must call Ref() at least once.
+  //
+  // earliest_seq should be the current SequenceNumber in the db such that any
+  // key inserted into this memtable will have an equal or larger seq number.
+  // (When a db is first created, the earliest sequence number will be 0).
+  // If the earliest sequence number is not known, kMaxSequenceNumber may be
+  // used, but this may prevent some transactions from succeeding until the
+  // first key is inserted into the memtable.
   explicit MemTable(const InternalKeyComparator& comparator,
                     const ImmutableCFOptions& ioptions,
                     const MutableCFOptions& mutable_cf_options,
-                    WriteBuffer* write_buffer);
+                    WriteBuffer* write_buffer, SequenceNumber earliest_seq);
 
   // Do not delete this MemTable unless Unref() indicates it not in use.
   ~MemTable();
@@ -153,8 +160,19 @@ class MemTable {
   //   prepend the current merge operand to *operands.
   //   store MergeInProgress in s, and return false.
   // Else, return false.
+  // If any operation was found, its most recent sequence number
+  // will be stored in *seq on success (regardless of whether true/false is
+  // returned).  Otherwise, *seq will be set to kMaxSequenceNumber.
+  // On success, *s may be set to OK, NotFound, or MergeInProgress.  Any other
+  // status returned indicates a corruption or other unexpected error.
   bool Get(const LookupKey& key, std::string* value, Status* s,
-           MergeContext* merge_context);
+           MergeContext* merge_context, SequenceNumber* seq);
+
+  bool Get(const LookupKey& key, std::string* value, Status* s,
+           MergeContext* merge_context) {
+    SequenceNumber seq;
+    return Get(key, value, s, merge_context, &seq);
+  }
 
   // Attempts to update the new_value inplace, else does normal Add
   // Pseudocode
@@ -194,7 +212,9 @@ class MemTable {
   // Get total number of entries in the mem table.
   // REQUIRES: external synchronization to prevent simultaneous
   // operations on the same MemTable (unless this Memtable is immutable).
-  uint64_t num_entries() const { return num_entries_; }
+  uint64_t num_entries() const {
+    return num_entries_.load(std::memory_order_relaxed);
+  }
 
   // Get total number of deletes in the mem table.
   // REQUIRES: external synchronization to prevent simultaneous
@@ -215,6 +235,15 @@ class MemTable {
   // operations on the same MemTable (unless this Memtable is immutable).
   SequenceNumber GetFirstSequenceNumber() { return first_seqno_; }
 
+  // Returns the sequence number that is guaranteed to be smaller than or equal
+  // to the sequence number of any key that could be inserted into this
+  // memtable. It can then be assumed that any write with a larger (or equal)
+  // sequence number will be present in this memtable or a later memtable.
+  //
+  // If the earliest sequence number could not be determined,
+  // kMaxSequenceNumber will be returned.
+  SequenceNumber GetEarliestSequenceNumber() { return earliest_seqno_; }
+
   // Returns the next active logfile number when this memtable is about to
   // be flushed to storage
   // REQUIRES: external synchronization to prevent simultaneous
@@ -248,6 +277,8 @@ class MemTable {
     return table_->IsSnapshotSupported() && !moptions_.inplace_update_support;
   }
 
+  uint64_t ApproximateSize(const Slice& start_ikey, const Slice& end_ikey);
+
   // Get the lock associated for the key
   port::RWMutex* GetLock(const Slice& key);
 
@@ -273,7 +304,9 @@ class MemTable {
   MemTableAllocator allocator_;
   unique_ptr<MemTableRep> table_;
 
-  uint64_t num_entries_;
+  // Total data size of all data inserted
+  std::atomic<uint64_t> data_size_;
+  std::atomic<uint64_t> num_entries_;
   uint64_t num_deletes_;
 
   // These are used to manage memtable flushes to storage
@@ -288,6 +321,10 @@ class MemTable {
   // The sequence number of the kv that was inserted first
   SequenceNumber first_seqno_;
 
+  // The db sequence number at the time of creation or kMaxSequenceNumber
+  // if not set.
+  SequenceNumber earliest_seqno_;
+
   // The log files earlier than this number can be deleted.
   uint64_t mem_next_logfile_number_;
 
diff --git a/src/rocksdb/db/memtable_list.cc b/src/rocksdb/db/memtable_list.cc
index 54473dc..b2bbbd1 100644
--- a/src/rocksdb/db/memtable_list.cc
+++ b/src/rocksdb/db/memtable_list.cc
@@ -27,18 +27,48 @@ class InternalKeyComparator;
 class Mutex;
 class VersionSet;
 
-MemTableListVersion::MemTableListVersion(MemTableListVersion* old) {
+void MemTableListVersion::AddMemTable(MemTable* m) {
+  memlist_.push_front(m);
+  *parent_memtable_list_memory_usage_ += m->ApproximateMemoryUsage();
+}
+
+void MemTableListVersion::UnrefMemTable(autovector<MemTable*>* to_delete,
+                                        MemTable* m) {
+  if (m->Unref()) {
+    to_delete->push_back(m);
+    assert(*parent_memtable_list_memory_usage_ >= m->ApproximateMemoryUsage());
+    *parent_memtable_list_memory_usage_ -= m->ApproximateMemoryUsage();
+  }
+}
+
+MemTableListVersion::MemTableListVersion(
+    size_t* parent_memtable_list_memory_usage, MemTableListVersion* old)
+    : max_write_buffer_number_to_maintain_(
+          old->max_write_buffer_number_to_maintain_),
+      parent_memtable_list_memory_usage_(parent_memtable_list_memory_usage) {
   if (old != nullptr) {
     memlist_ = old->memlist_;
-    size_ = old->size_;
     for (auto& m : memlist_) {
       m->Ref();
     }
+
+    memlist_history_ = old->memlist_history_;
+    for (auto& m : memlist_history_) {
+      m->Ref();
+    }
   }
 }
 
+MemTableListVersion::MemTableListVersion(
+    size_t* parent_memtable_list_memory_usage,
+    int max_write_buffer_number_to_maintain)
+    : max_write_buffer_number_to_maintain_(max_write_buffer_number_to_maintain),
+      parent_memtable_list_memory_usage_(parent_memtable_list_memory_usage) {}
+
 void MemTableListVersion::Ref() { ++refs_; }
 
+// called by superversion::clean()
 void MemTableListVersion::Unref(autovector<MemTable*>* to_delete) {
   assert(refs_ >= 1);
   --refs_;
@@ -47,30 +77,61 @@ void MemTableListVersion::Unref(autovector<MemTable*>* to_delete) {
     // that refs_ will not be zero
     assert(to_delete != nullptr);
     for (const auto& m : memlist_) {
-      MemTable* x = m->Unref();
-      if (x != nullptr) {
-        to_delete->push_back(x);
-      }
+      UnrefMemTable(to_delete, m);
+    }
+    for (const auto& m : memlist_history_) {
+      UnrefMemTable(to_delete, m);
     }
     delete this;
   }
 }
 
-int MemTableListVersion::size() const { return size_; }
+int MemTableList::NumNotFlushed() const {
+  int size = static_cast<int>(current_->memlist_.size());
+  assert(num_flush_not_started_ <= size);
+  return size;
+}
 
-// Returns the total number of memtables in the list
-int MemTableList::size() const {
-  assert(num_flush_not_started_ <= current_->size_);
-  return current_->size_;
+int MemTableList::NumFlushed() const {
+  return static_cast<int>(current_->memlist_history_.size());
 }
 
 // Search all the memtables starting from the most recent one.
 // Return the most recent value found, if any.
 // Operands stores the list of merge operations to apply, so far.
 bool MemTableListVersion::Get(const LookupKey& key, std::string* value,
-                              Status* s, MergeContext* merge_context) {
-  for (auto& memtable : memlist_) {
-    if (memtable->Get(key, value, s, merge_context)) {
+                              Status* s, MergeContext* merge_context,
+                              SequenceNumber* seq) {
+  return GetFromList(&memlist_, key, value, s, merge_context, seq);
+}
+
+bool MemTableListVersion::GetFromHistory(const LookupKey& key,
+                                         std::string* value, Status* s,
+                                         MergeContext* merge_context,
+                                         SequenceNumber* seq) {
+  return GetFromList(&memlist_history_, key, value, s, merge_context, seq);
+}
+
+bool MemTableListVersion::GetFromList(std::list<MemTable*>* list,
+                                      const LookupKey& key, std::string* value,
+                                      Status* s, MergeContext* merge_context,
+                                      SequenceNumber* seq) {
+  *seq = kMaxSequenceNumber;
+
+  for (auto& memtable : *list) {
+    SequenceNumber current_seq = kMaxSequenceNumber;
+
+    bool done = memtable->Get(key, value, s, merge_context, &current_seq);
+    if (*seq == kMaxSequenceNumber) {
+      // Store the most recent sequence number of any operation on this key.
+      // Since we only care about the most recent change, we only need to
+      // return the first operation found when searching memtables in
+      // reverse-chronological order.
+      *seq = current_seq;
+    }
+
+    if (done) {
+      assert(*seq != kMaxSequenceNumber);
       return true;
     }
   }
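A standalone model of the sequence capture in GetFromList() above: memtables are scanned newest-first, and only the first memtable holding any operation for the key contributes to *seq (hypothetical simplified types; the real loop calls MemTable::Get):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    constexpr uint64_t kMax = UINT64_MAX;  // stands in for kMaxSequenceNumber

    // Each entry: the most recent seq for the key in one memtable, or kMax
    // when that memtable holds no operation for the key; ordered newest-first.
    uint64_t MostRecentSeq(const std::vector<uint64_t>& tables) {
      uint64_t seq = kMax;
      for (uint64_t current : tables) {
        if (seq == kMax) {
          seq = current;  // keep only the first (newest) observation
        }
      }
      return seq;
    }

    int main() {
      assert(MostRecentSeq({kMax, 42, 7}) == 42);   // newest hit shadows older ones
      assert(MostRecentSeq({kMax, kMax}) == kMax);  // key never touched
      return 0;
    }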
@@ -101,6 +162,15 @@ uint64_t MemTableListVersion::GetTotalNumEntries() const {
   return total_num;
 }
 
+uint64_t MemTableListVersion::ApproximateSize(const Slice& start_ikey,
+                                              const Slice& end_ikey) {
+  uint64_t total_size = 0;
+  for (auto& m : memlist_) {
+    total_size += m->ApproximateSize(start_ikey, end_ikey);
+  }
+  return total_size;
+}
+
 uint64_t MemTableListVersion::GetTotalNumDeletes() const {
   uint64_t total_num = 0;
   for (auto& m : memlist_) {
@@ -109,18 +179,49 @@ uint64_t MemTableListVersion::GetTotalNumDeletes() const {
   return total_num;
 }
 
+SequenceNumber MemTableListVersion::GetEarliestSequenceNumber(
+    bool include_history) const {
+  if (include_history && !memlist_history_.empty()) {
+    return memlist_history_.back()->GetEarliestSequenceNumber();
+  } else if (!memlist_.empty()) {
+    return memlist_.back()->GetEarliestSequenceNumber();
+  } else {
+    return kMaxSequenceNumber;
+  }
+}
+
 // caller is responsible for referencing m
-void MemTableListVersion::Add(MemTable* m) {
+void MemTableListVersion::Add(MemTable* m, autovector<MemTable*>* to_delete) {
   assert(refs_ == 1);  // only when refs_ == 1 is MemTableListVersion mutable
-  memlist_.push_front(m);
-  ++size_;
+  AddMemTable(m);
+
+  TrimHistory(to_delete);
 }
 
-// caller is responsible for unreferencing m
-void MemTableListVersion::Remove(MemTable* m) {
+// Removes m from list of memtables not flushed.  Caller should NOT Unref m.
+void MemTableListVersion::Remove(MemTable* m,
+                                 autovector<MemTable*>* to_delete) {
   assert(refs_ == 1);  // only when refs_ == 1 is MemTableListVersion mutable
   memlist_.remove(m);
-  --size_;
+
+  if (max_write_buffer_number_to_maintain_ > 0) {
+    memlist_history_.push_front(m);
+    TrimHistory(to_delete);
+  } else {
+    UnrefMemTable(to_delete, m);
+  }
+}
+
+// Make sure we don't use up too much space in history
+void MemTableListVersion::TrimHistory(autovector<MemTable*>* to_delete) {
+  while (memlist_.size() + memlist_history_.size() >
+             static_cast<size_t>(max_write_buffer_number_to_maintain_) &&
+         !memlist_history_.empty()) {
+    MemTable* x = memlist_history_.back();
+    memlist_history_.pop_back();
+
+    UnrefMemTable(to_delete, x);
+  }
 }
 
 // Returns true if there is at least one memtable on which flush has
@@ -223,20 +324,16 @@ Status MemTableList::InstallMemtableFlushResults(
 
     // All the later memtables that have the same filenum
     // are part of the same batch. They can be committed now.
-    uint64_t mem_id = 1;  // how many memtables has been flushed.
+    uint64_t mem_id = 1;  // how many memtables have been flushed.
     do {
       if (s.ok()) { // commit new state
         LogToBuffer(log_buffer, "[%s] Level-0 commit table #%" PRIu64
                                 ": memtable #%" PRIu64 " done",
                     cfd->GetName().c_str(), m->file_number_, mem_id);
-        current_->Remove(m);
         assert(m->file_number_ > 0);
-
-        if (m->Unref() != nullptr) {
-          to_delete->push_back(m);
-        }
+        current_->Remove(m, to_delete);
       } else {
-        //commit failed. setup state so that we can flush again.
+        // commit failed. setup state so that we can flush again.
         LogToBuffer(log_buffer, "Level-0 commit table #%" PRIu64
                                 ": memtable #%" PRIu64 " failed",
                     m->file_number_, mem_id);
@@ -256,15 +353,15 @@ Status MemTableList::InstallMemtableFlushResults(
 }
 
 // New memtables are inserted at the front of the list.
-void MemTableList::Add(MemTable* m) {
-  assert(current_->size_ >= num_flush_not_started_);
+void MemTableList::Add(MemTable* m, autovector<MemTable*>* to_delete) {
+  assert(static_cast<int>(current_->memlist_.size()) >= num_flush_not_started_);
   InstallNewVersion();
   // this method is used to move mutable memtable into an immutable list.
   // since mutable memtable is already refcounted by the DBImpl,
   // and when moving to the immutable list we don't unref it,
   // we don't have to ref the memtable here. we just take over the
   // reference from the DBImpl.
-  current_->Add(m);
+  current_->Add(m, to_delete);
   m->MarkImmutable();
   num_flush_not_started_++;
   if (num_flush_not_started_ == 1) {
@@ -273,7 +370,7 @@ void MemTableList::Add(MemTable* m) {
 }
 
 // Returns an estimate of the number of bytes of data in use.
-size_t MemTableList::ApproximateMemoryUsage() {
+size_t MemTableList::ApproximateUnflushedMemTablesMemoryUsage() {
   size_t total_size = 0;
   for (auto& memtable : current_->memlist_) {
     total_size += memtable->ApproximateMemoryUsage();
@@ -281,13 +378,15 @@ size_t MemTableList::ApproximateMemoryUsage() {
   return total_size;
 }
 
+size_t MemTableList::ApproximateMemoryUsage() { return current_memory_usage_; }
+
 void MemTableList::InstallNewVersion() {
   if (current_->refs_ == 1) {
     // we're the only one using the version, just keep using it
   } else {
     // somebody else holds the current version, we need to create new one
     MemTableListVersion* version = current_;
-    current_ = new MemTableListVersion(current_);
+    current_ = new MemTableListVersion(&current_memory_usage_, current_);
     current_->Ref();
     version->Unref();
   }
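A standalone sketch of the trimming rule introduced by TrimHistory(): unflushed memtables are never dropped, and the oldest history entries are evicted until the combined count fits the budget (hypothetical simplified types, no reference counting):

    #include <cassert>
    #include <list>

    // 'unflushed' mirrors memlist_, 'history' mirrors memlist_history_.
    void TrimHistory(std::list<int>& unflushed, std::list<int>& history,
                     int max_to_maintain) {
      while (unflushed.size() + history.size() >
                 static_cast<size_t>(max_to_maintain) &&
             !history.empty()) {
        history.pop_back();  // the real code also unrefs the dropped table
      }
    }

    int main() {
      std::list<int> unflushed = {3, 2};  // newest at front
      std::list<int> history = {1, 0};
      TrimHistory(unflushed, history, 3);
      assert(unflushed.size() == 2);  // never trimmed
      assert(history.size() == 1);    // oldest entry (0) was dropped
      return 0;
    }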
diff --git a/src/rocksdb/db/memtable_list.h b/src/rocksdb/db/memtable_list.h
index 7b75dfa..63e2773 100644
--- a/src/rocksdb/db/memtable_list.h
+++ b/src/rocksdb/db/memtable_list.h
@@ -10,17 +10,15 @@
 #include <vector>
 #include <set>
 #include <deque>
-#include "rocksdb/db.h"
-#include "rocksdb/options.h"
-#include "rocksdb/iterator.h"
 
 #include "db/dbformat.h"
 #include "db/filename.h"
-#include "db/skiplist.h"
 #include "db/memtable.h"
+#include "db/skiplist.h"
 #include "rocksdb/db.h"
 #include "rocksdb/iterator.h"
 #include "rocksdb/options.h"
+#include "rocksdb/types.h"
 #include "util/autovector.h"
 #include "util/instrumented_mutex.h"
 #include "util/log_buffer.h"
@@ -40,17 +38,40 @@ class MergeIteratorBuilder;
 // (such as holding the db mutex or being on the write thread).
 class MemTableListVersion {
  public:
-  explicit MemTableListVersion(MemTableListVersion* old = nullptr);
+  explicit MemTableListVersion(size_t* parent_memtable_list_memory_usage,
+                               MemTableListVersion* old = nullptr);
+  explicit MemTableListVersion(size_t* parent_memtable_list_memory_usage,
+                               int max_write_buffer_number_to_maintain);
 
   void Ref();
   void Unref(autovector<MemTable*>* to_delete = nullptr);
 
-  int size() const;
-
   // Search all the memtables starting from the most recent one.
   // Return the most recent value found, if any.
+  //
+  // If any operation was found for this key, its most recent sequence number
+  // will be stored in *seq on success (regardless of whether true/false is
+  // returned).  Otherwise, *seq will be set to kMaxSequenceNumber.
   bool Get(const LookupKey& key, std::string* value, Status* s,
-           MergeContext* merge_context);
+           MergeContext* merge_context, SequenceNumber* seq);
+
+  bool Get(const LookupKey& key, std::string* value, Status* s,
+           MergeContext* merge_context) {
+    SequenceNumber seq;
+    return Get(key, value, s, merge_context, &seq);
+  }
+
+  // Similar to Get(), but searches the Memtable history of memtables that
+  // have already been flushed.  Should only be used by in-memory-only
+  // queries (such as Transaction validation), as the history may contain
+  // writes that are also present in the SST files.
+  bool GetFromHistory(const LookupKey& key, std::string* value, Status* s,
+                      MergeContext* merge_context, SequenceNumber* seq);
+  bool GetFromHistory(const LookupKey& key, std::string* value, Status* s,
+                      MergeContext* merge_context) {
+    SequenceNumber seq;
+    return GetFromHistory(key, value, s, merge_context, &seq);
+  }
 
   void AddIterators(const ReadOptions& options,
                     std::vector<Iterator*>* iterator_list, Arena* arena);
@@ -62,16 +83,46 @@ class MemTableListVersion {
 
   uint64_t GetTotalNumDeletes() const;
 
+  uint64_t ApproximateSize(const Slice& start_ikey, const Slice& end_ikey);
+
+  // Returns the value of MemTable::GetEarliestSequenceNumber() on the most
+  // recent MemTable in this list or kMaxSequenceNumber if the list is empty.
+  // If include_history=true, will also search Memtables in MemTableList
+  // History.
+  SequenceNumber GetEarliestSequenceNumber(bool include_history = false) const;
+
  private:
-  // REQUIRE: m is mutable memtable
-  void Add(MemTable* m);
-  // REQUIRE: m is mutable memtable
-  void Remove(MemTable* m);
+  // REQUIRE: m is an immutable memtable
+  void Add(MemTable* m, autovector<MemTable*>* to_delete);
+  // REQUIRE: m is an immutable memtable
+  void Remove(MemTable* m, autovector<MemTable*>* to_delete);
+
+  void TrimHistory(autovector<MemTable*>* to_delete);
+
+  bool GetFromList(std::list<MemTable*>* list, const LookupKey& key,
+                   std::string* value, Status* s, MergeContext* merge_context,
+                   SequenceNumber* seq);
+
+  void AddMemTable(MemTable* m);
+
+  void UnrefMemTable(autovector<MemTable*>* to_delete, MemTable* m);
 
   friend class MemTableList;
+
+  // Immutable MemTables that have not yet been flushed.
   std::list<MemTable*> memlist_;
-  int size_ = 0;
+
+  // MemTables that have already been flushed
+  // (used during Transaction validation)
+  std::list<MemTable*> memlist_history_;
+
+  // Maximum number of MemTables to keep in memory (including both flushed
+  // and not-yet-flushed tables).
+  const int max_write_buffer_number_to_maintain_;
+
   int refs_ = 0;
+
+  size_t* parent_memtable_list_memory_usage_;
 };
 
 // This class stores references to all the immutable memtables.
@@ -88,14 +139,17 @@ class MemTableListVersion {
 class MemTableList {
  public:
   // A list of memtables.
-  explicit MemTableList(int min_write_buffer_number_to_merge)
+  explicit MemTableList(int min_write_buffer_number_to_merge,
+                        int max_write_buffer_number_to_maintain)
       : imm_flush_needed(false),
         min_write_buffer_number_to_merge_(min_write_buffer_number_to_merge),
-        current_(new MemTableListVersion()),
+        current_(new MemTableListVersion(&current_memory_usage_,
+                                         max_write_buffer_number_to_maintain)),
         num_flush_not_started_(0),
         commit_in_progress_(false),
         flush_requested_(false) {
     current_->Ref();
+    current_memory_usage_ = 0;
   }
 
   // Should not delete MemTableList without making sure MemTableList::current()
@@ -108,8 +162,13 @@ class MemTableList {
   // determine whether there is anything more to start flushing.
   std::atomic<bool> imm_flush_needed;
 
-  // Returns the total number of memtables in the list
-  int size() const;
+  // Returns the total number of memtables in the list that haven't yet
+  // been flushed and logged.
+  int NumNotFlushed() const;
+
+  // Returns total number of memtables in the list that have been
+  // completely flushed and logged.
+  int NumFlushed() const;
 
   // Returns true if there is at least one memtable on which flush has
   // not yet started.
@@ -133,11 +192,15 @@ class MemTableList {
 
   // New memtables are inserted at the front of the list.
   // Takes ownership of the reference held on *m by the caller of Add().
-  void Add(MemTable* m);
+  void Add(MemTable* m, autovector<MemTable*>* to_delete);
 
   // Returns an estimate of the number of bytes of data in use.
   size_t ApproximateMemoryUsage();
 
+  // Returns an estimate of the number of bytes of data used by
+  // the unflushed mem-tables.
+  size_t ApproximateUnflushedMemTablesMemoryUsage();
+
   // Request a flush of all existing memtables to storage.  This will
   // cause future calls to IsFlushPending() to return true if this list is
   // non-empty (regardless of the min_write_buffer_number_to_merge
@@ -149,11 +212,13 @@ class MemTableList {
   // MemTableList(const MemTableList&);
   // void operator=(const MemTableList&);
 
+  size_t* current_memory_usage() { return &current_memory_usage_; }
+
  private:
   // DB mutex held
   void InstallNewVersion();
 
-  int min_write_buffer_number_to_merge_;
+  const int min_write_buffer_number_to_merge_;
 
   MemTableListVersion* current_;
 
@@ -166,6 +231,8 @@ class MemTableList {
   // Requested a flush of all memtables to storage
   bool flush_requested_;
 
+  // The current memory usage.
+  size_t current_memory_usage_;
 };
 
 }  // namespace rocksdb
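The parent_memtable_list_memory_usage_ pointer shared between versions makes MemTableList::ApproximateMemoryUsage() a constant-time read of current_memory_usage_. A standalone model of that accounting (hypothetical simplified types):

    #include <cassert>
    #include <cstddef>

    struct Table { size_t bytes; };  // hypothetical stand-in for MemTable

    // Every version shares a pointer to the owning list's counter, so
    // adds and removes update one place instead of requiring a walk
    // over both lists.
    struct Version {
      size_t* parent_usage;
      void Add(const Table& t) { *parent_usage += t.bytes; }
      void Drop(const Table& t) {
        assert(*parent_usage >= t.bytes);  // mirrors the UnrefMemTable assert
        *parent_usage -= t.bytes;
      }
    };

    int main() {
      size_t current_memory_usage = 0;
      Version v{&current_memory_usage};
      v.Add(Table{128});
      v.Add(Table{64});
      v.Drop(Table{128});
      assert(current_memory_usage == 64);  // O(1) answer for the whole list
      return 0;
    }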
diff --git a/src/rocksdb/db/memtable_list_test.cc b/src/rocksdb/db/memtable_list_test.cc
index fc4e948..7bb8b3b 100644
--- a/src/rocksdb/db/memtable_list_test.cc
+++ b/src/rocksdb/db/memtable_list_test.cc
@@ -3,6 +3,7 @@
 //  LICENSE file in the root directory of this source tree. An additional grant
 //  of patent rights can be found in the PATENTS file in the same directory.
 
+#include <algorithm>
 #include <string>
 #include <vector>
 #include "db/memtable_list.h"
@@ -12,17 +13,12 @@
 #include "db/writebuffer.h"
 #include "rocksdb/db.h"
 #include "rocksdb/status.h"
+#include "util/testutil.h"
+#include "util/string_util.h"
 #include "util/testharness.h"
 
 namespace rocksdb {
 
-class DumbLogger : public Logger {
- public:
-  using Logger::Logv;
-  virtual void Logv(const char* format, va_list ap) override {}
-  virtual size_t GetLogFileSize() const override { return 0; }
-};
-
 class MemTableListTest : public testing::Test {
  public:
   std::string dbname;
@@ -56,7 +52,7 @@ class MemTableListTest : public testing::Test {
       MemTableList* list, const MutableCFOptions& mutable_cf_options,
       const autovector<MemTable*>& m, autovector<MemTable*>* to_delete) {
     // Create a mock Logger
-    DumbLogger logger;
+    test::NullLogger logger;
     LogBuffer log_buffer(DEBUG_LEVEL, &logger);
 
     // Create a mock VersionSet
@@ -64,7 +60,7 @@ class MemTableListTest : public testing::Test {
     EnvOptions env_options;
     shared_ptr<Cache> table_cache(NewLRUCache(50000, 16));
     WriteBuffer write_buffer(db_options.db_write_buffer_size);
-    WriteController write_controller;
+    WriteController write_controller(10000000u);
 
     CreateDB();
     VersionSet versions(dbname, &db_options, env_options, table_cache.get(),
@@ -92,9 +88,9 @@ class MemTableListTest : public testing::Test {
 
 TEST_F(MemTableListTest, Empty) {
   // Create an empty MemTableList and validate basic functions.
-  MemTableList list(1);
+  MemTableList list(1, 0);
 
-  ASSERT_EQ(0, list.size());
+  ASSERT_EQ(0, list.NumNotFlushed());
   ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
   ASSERT_FALSE(list.IsFlushPending());
 
@@ -110,12 +106,15 @@ TEST_F(MemTableListTest, Empty) {
 TEST_F(MemTableListTest, GetTest) {
   // Create MemTableList
   int min_write_buffer_number_to_merge = 2;
-  MemTableList list(min_write_buffer_number_to_merge);
+  int max_write_buffer_number_to_maintain = 0;
+  MemTableList list(min_write_buffer_number_to_merge,
+                    max_write_buffer_number_to_maintain);
 
   SequenceNumber seq = 1;
   std::string value;
   Status s;
   MergeContext merge_context;
+  autovector<MemTable*> to_delete;
 
   LookupKey lkey("key1", seq);
   bool found = list.current()->Get(lkey, &value, &s, &merge_context);
@@ -129,7 +128,8 @@ TEST_F(MemTableListTest, GetTest) {
 
   WriteBuffer wb(options.db_write_buffer_size);
   MemTable* mem =
-      new MemTable(cmp, ioptions, MutableCFOptions(options, ioptions), &wb);
+      new MemTable(cmp, ioptions, MutableCFOptions(options, ioptions), &wb,
+                   kMaxSequenceNumber);
   mem->Ref();
 
   // Write some keys to this memtable.
@@ -158,21 +158,22 @@ TEST_F(MemTableListTest, GetTest) {
   ASSERT_EQ(1, mem->num_deletes());
 
   // Add memtable to list
-  list.Add(mem);
+  list.Add(mem, &to_delete);
 
   SequenceNumber saved_seq = seq;
 
   // Create another memtable and write some keys to it
   WriteBuffer wb2(options.db_write_buffer_size);
   MemTable* mem2 =
-      new MemTable(cmp, ioptions, MutableCFOptions(options, ioptions), &wb2);
+      new MemTable(cmp, ioptions, MutableCFOptions(options, ioptions), &wb2,
+                   kMaxSequenceNumber);
   mem2->Ref();
 
   mem2->Add(++seq, kTypeDeletion, "key1", "");
   mem2->Add(++seq, kTypeValue, "key2", "value2.3");
 
   // Add second memtable to list
-  list.Add(mem2);
+  list.Add(mem2, &to_delete);
 
   // Fetch keys via MemTableList
   merge_context.Clear();
@@ -196,10 +197,185 @@ TEST_F(MemTableListTest, GetTest) {
   found = list.current()->Get(LookupKey("key2", 1), &value, &s, &merge_context);
   ASSERT_FALSE(found);
 
-  ASSERT_EQ(2, list.size());
+  ASSERT_EQ(2, list.NumNotFlushed());
+
+  list.current()->Unref(&to_delete);
+  for (MemTable* m : to_delete) {
+    delete m;
+  }
+}
+
+TEST_F(MemTableListTest, GetFromHistoryTest) {
+  // Create MemTableList
+  int min_write_buffer_number_to_merge = 2;
+  int max_write_buffer_number_to_maintain = 2;
+  MemTableList list(min_write_buffer_number_to_merge,
+                    max_write_buffer_number_to_maintain);
 
+  SequenceNumber seq = 1;
+  std::string value;
+  Status s;
+  MergeContext merge_context;
   autovector<MemTable*> to_delete;
+
+  LookupKey lkey("key1", seq);
+  bool found = list.current()->Get(lkey, &value, &s, &merge_context);
+  ASSERT_FALSE(found);
+
+  // Create a MemTable
+  InternalKeyComparator cmp(BytewiseComparator());
+  auto factory = std::make_shared<SkipListFactory>();
+  options.memtable_factory = factory;
+  ImmutableCFOptions ioptions(options);
+
+  WriteBuffer wb(options.db_write_buffer_size);
+  MemTable* mem =
+      new MemTable(cmp, ioptions, MutableCFOptions(options, ioptions), &wb,
+                   kMaxSequenceNumber);
+  mem->Ref();
+
+  // Write some keys to this memtable.
+  mem->Add(++seq, kTypeDeletion, "key1", "");
+  mem->Add(++seq, kTypeValue, "key2", "value2");
+  mem->Add(++seq, kTypeValue, "key2", "value2.2");
+
+  // Fetch the newly written keys
+  merge_context.Clear();
+  found = mem->Get(LookupKey("key1", seq), &value, &s, &merge_context);
+  // MemTable found out that this key is *not* found (at this sequence#)
+  ASSERT_TRUE(found && s.IsNotFound());
+
+  merge_context.Clear();
+  found = mem->Get(LookupKey("key2", seq), &value, &s, &merge_context);
+  ASSERT_TRUE(s.ok() && found);
+  ASSERT_EQ(value, "value2.2");
+
+  // Add memtable to list
+  list.Add(mem, &to_delete);
+  ASSERT_EQ(0, to_delete.size());
+
+  // Fetch keys via MemTableList
+  merge_context.Clear();
+  found =
+      list.current()->Get(LookupKey("key1", seq), &value, &s, &merge_context);
+  ASSERT_TRUE(found && s.IsNotFound());
+
+  merge_context.Clear();
+  found =
+      list.current()->Get(LookupKey("key2", seq), &value, &s, &merge_context);
+  ASSERT_TRUE(s.ok() && found);
+  ASSERT_EQ("value2.2", value);
+
+  // Flush this memtable from the list.
+  // (It will then be a part of the memtable history).
+  autovector<MemTable*> to_flush;
+  list.PickMemtablesToFlush(&to_flush);
+  ASSERT_EQ(1, to_flush.size());
+
+  s = Mock_InstallMemtableFlushResults(
+      &list, MutableCFOptions(options, ioptions), to_flush, &to_delete);
+  ASSERT_OK(s);
+  ASSERT_EQ(0, list.NumNotFlushed());
+  ASSERT_EQ(1, list.NumFlushed());
+  ASSERT_EQ(0, to_delete.size());
+
+  // Verify keys are no longer in MemTableList
+  merge_context.Clear();
+  found =
+      list.current()->Get(LookupKey("key1", seq), &value, &s, &merge_context);
+  ASSERT_FALSE(found);
+
+  merge_context.Clear();
+  found =
+      list.current()->Get(LookupKey("key2", seq), &value, &s, &merge_context);
+  ASSERT_FALSE(found);
+
+  // Verify keys are present in history
+  merge_context.Clear();
+  found = list.current()->GetFromHistory(LookupKey("key1", seq), &value, &s,
+                                         &merge_context);
+  ASSERT_TRUE(found && s.IsNotFound());
+
+  merge_context.Clear();
+  found = list.current()->GetFromHistory(LookupKey("key2", seq), &value, &s,
+                                         &merge_context);
+  ASSERT_TRUE(found);
+  ASSERT_EQ("value2.2", value);
+
+  // Create another memtable and write some keys to it
+  WriteBuffer wb2(options.db_write_buffer_size);
+  MemTable* mem2 =
+      new MemTable(cmp, ioptions, MutableCFOptions(options, ioptions), &wb2,
+                   kMaxSequenceNumber);
+  mem2->Ref();
+
+  mem2->Add(++seq, kTypeDeletion, "key1", "");
+  mem2->Add(++seq, kTypeValue, "key3", "value3");
+
+  // Add second memtable to list
+  list.Add(mem2, &to_delete);
+  ASSERT_EQ(0, to_delete.size());
+
+  to_flush.clear();
+  list.PickMemtablesToFlush(&to_flush);
+  ASSERT_EQ(1, to_flush.size());
+
+  // Flush second memtable
+  s = Mock_InstallMemtableFlushResults(
+      &list, MutableCFOptions(options, ioptions), to_flush, &to_delete);
+  ASSERT_OK(s);
+  ASSERT_EQ(0, list.NumNotFlushed());
+  ASSERT_EQ(2, list.NumFlushed());
+  ASSERT_EQ(0, to_delete.size());
+
+  // Add a third memtable to push the first memtable out of the history
+  WriteBuffer wb3(options.db_write_buffer_size);
+  MemTable* mem3 =
+      new MemTable(cmp, ioptions, MutableCFOptions(options, ioptions), &wb3,
+                   kMaxSequenceNumber);
+  mem3->Ref();
+  list.Add(mem3, &to_delete);
+  ASSERT_EQ(1, list.NumNotFlushed());
+  ASSERT_EQ(1, list.NumFlushed());
+  ASSERT_EQ(1, to_delete.size());
+
+  // Verify keys are no longer in MemTableList
+  merge_context.Clear();
+  found =
+      list.current()->Get(LookupKey("key1", seq), &value, &s, &merge_context);
+  ASSERT_FALSE(found);
+
+  merge_context.Clear();
+  found =
+      list.current()->Get(LookupKey("key2", seq), &value, &s, &merge_context);
+  ASSERT_FALSE(found);
+
+  merge_context.Clear();
+  found =
+      list.current()->Get(LookupKey("key3", seq), &value, &s, &merge_context);
+  ASSERT_FALSE(found);
+
+  // Verify that the second memtable's keys are in the history
+  merge_context.Clear();
+  found = list.current()->GetFromHistory(LookupKey("key1", seq), &value, &s,
+                                         &merge_context);
+  ASSERT_TRUE(found && s.IsNotFound());
+
+  merge_context.Clear();
+  found = list.current()->GetFromHistory(LookupKey("key3", seq), &value, &s,
+                                         &merge_context);
+  ASSERT_TRUE(found);
+  ASSERT_EQ("value3", value);
+
+  // Verify that key2 from the first memtable is no longer in the history
+  merge_context.Clear();
+  found =
+      list.current()->Get(LookupKey("key2", seq), &value, &s, &merge_context);
+  ASSERT_FALSE(found);
+
+  // Cleanup
   list.current()->Unref(&to_delete);
+  ASSERT_EQ(3, to_delete.size());
   for (MemTable* m : to_delete) {
     delete m;
   }
@@ -215,26 +391,30 @@ TEST_F(MemTableListTest, FlushPendingTest) {
   ImmutableCFOptions ioptions(options);
   InternalKeyComparator cmp(BytewiseComparator());
   WriteBuffer wb(options.db_write_buffer_size);
+  autovector<MemTable*> to_delete;
 
   // Create MemTableList
   int min_write_buffer_number_to_merge = 3;
-  MemTableList list(min_write_buffer_number_to_merge);
+  int max_write_buffer_number_to_maintain = 7;
+  MemTableList list(min_write_buffer_number_to_merge,
+                    max_write_buffer_number_to_maintain);
 
   // Create some MemTables
   std::vector<MemTable*> tables;
   MutableCFOptions mutable_cf_options(options, ioptions);
   for (int i = 0; i < num_tables; i++) {
-    MemTable* mem = new MemTable(cmp, ioptions, mutable_cf_options, &wb);
+    MemTable* mem = new MemTable(cmp, ioptions, mutable_cf_options, &wb,
+                                 kMaxSequenceNumber);
     mem->Ref();
 
     std::string value;
     MergeContext merge_context;
 
-    mem->Add(++seq, kTypeValue, "key1", std::to_string(i));
-    mem->Add(++seq, kTypeValue, "keyN" + std::to_string(i), "valueN");
-    mem->Add(++seq, kTypeValue, "keyX" + std::to_string(i), "value");
-    mem->Add(++seq, kTypeValue, "keyM" + std::to_string(i), "valueM");
-    mem->Add(++seq, kTypeDeletion, "keyX" + std::to_string(i), "");
+    mem->Add(++seq, kTypeValue, "key1", ToString(i));
+    mem->Add(++seq, kTypeValue, "keyN" + ToString(i), "valueN");
+    mem->Add(++seq, kTypeValue, "keyX" + ToString(i), "value");
+    mem->Add(++seq, kTypeValue, "keyM" + ToString(i), "valueM");
+    mem->Add(++seq, kTypeDeletion, "keyX" + ToString(i), "");
 
     tables.push_back(mem);
   }
@@ -264,9 +444,10 @@ TEST_F(MemTableListTest, FlushPendingTest) {
   ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
 
   // Add 2 tables
-  list.Add(tables[0]);
-  list.Add(tables[1]);
-  ASSERT_EQ(2, list.size());
+  list.Add(tables[0], &to_delete);
+  list.Add(tables[1], &to_delete);
+  ASSERT_EQ(2, list.NumNotFlushed());
+  ASSERT_EQ(0, to_delete.size());
 
   // Even though we have less than the minimum to flush, a flush is
   // pending since we had previously requested a flush and never called
@@ -277,7 +458,7 @@ TEST_F(MemTableListTest, FlushPendingTest) {
   // Pick tables to flush
   list.PickMemtablesToFlush(&to_flush);
   ASSERT_EQ(2, to_flush.size());
-  ASSERT_EQ(2, list.size());
+  ASSERT_EQ(2, list.NumNotFlushed());
   ASSERT_FALSE(list.IsFlushPending());
   ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
 
@@ -288,16 +469,17 @@ TEST_F(MemTableListTest, FlushPendingTest) {
   to_flush.clear();
 
   // Add another table
-  list.Add(tables[2]);
+  list.Add(tables[2], &to_delete);
   // We now have the minimum to flush regardless of whether FlushRequested()
   // was called.
   ASSERT_TRUE(list.IsFlushPending());
   ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+  ASSERT_EQ(0, to_delete.size());
 
   // Pick tables to flush
   list.PickMemtablesToFlush(&to_flush);
   ASSERT_EQ(3, to_flush.size());
-  ASSERT_EQ(3, list.size());
+  ASSERT_EQ(3, list.NumNotFlushed());
   ASSERT_FALSE(list.IsFlushPending());
   ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
 
@@ -305,14 +487,15 @@ TEST_F(MemTableListTest, FlushPendingTest) {
   autovector<MemTable*> to_flush2;
   list.PickMemtablesToFlush(&to_flush2);
   ASSERT_EQ(0, to_flush2.size());
-  ASSERT_EQ(3, list.size());
+  ASSERT_EQ(3, list.NumNotFlushed());
   ASSERT_FALSE(list.IsFlushPending());
   ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
 
   // Add another table
-  list.Add(tables[3]);
+  list.Add(tables[3], &to_delete);
   ASSERT_FALSE(list.IsFlushPending());
   ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+  ASSERT_EQ(0, to_delete.size());
 
   // Request a flush again
   list.FlushRequested();
@@ -322,7 +505,7 @@ TEST_F(MemTableListTest, FlushPendingTest) {
   // Pick tables to flush again
   list.PickMemtablesToFlush(&to_flush2);
   ASSERT_EQ(1, to_flush2.size());
-  ASSERT_EQ(4, list.size());
+  ASSERT_EQ(4, list.NumNotFlushed());
   ASSERT_FALSE(list.IsFlushPending());
   ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
 
@@ -333,29 +516,28 @@ TEST_F(MemTableListTest, FlushPendingTest) {
   to_flush.clear();
 
   // Add another table
-  list.Add(tables[4]);
-  ASSERT_EQ(5, list.size());
+  list.Add(tables[4], &to_delete);
+  ASSERT_EQ(5, list.NumNotFlushed());
   // We now have the minimum to flush regardless of whether FlushRequested() was called.
   ASSERT_TRUE(list.IsFlushPending());
   ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+  ASSERT_EQ(0, to_delete.size());
 
   // Pick tables to flush
   list.PickMemtablesToFlush(&to_flush);
   // Should pick 4 of 5 since 1 table has been picked in to_flush2
   ASSERT_EQ(4, to_flush.size());
-  ASSERT_EQ(5, list.size());
+  ASSERT_EQ(5, list.NumNotFlushed());
   ASSERT_FALSE(list.IsFlushPending());
   ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
 
   // Pick tables to flush again
   autovector<MemTable*> to_flush3;
   list.PickMemtablesToFlush(&to_flush3);
   ASSERT_EQ(0, to_flush3.size());  // nothing left that is not already being flushed
-  ASSERT_EQ(5, list.size());
+  ASSERT_EQ(5, list.NumNotFlushed());
   ASSERT_FALSE(list.IsFlushPending());
   ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
 
-  autovector<MemTable*> to_delete;
-
   // Flush the 4 memtables that were picked in to_flush
   s = Mock_InstallMemtableFlushResults(
       &list, MutableCFOptions(options, ioptions), to_flush, &to_delete);
@@ -366,17 +548,10 @@ TEST_F(MemTableListTest, FlushPendingTest) {
   // Current implementation will only commit memtables in the order they were
   // created.  So InstallMemtableFlushResults will install the first 3 tables
   // in to_flush and stop when it encounters a table not yet flushed.
-  ASSERT_EQ(3, to_delete.size());
-  ASSERT_EQ(2, list.size());
-
-  for (const auto& m : to_delete) {
-    // Refcount should be 0 after calling InstallMemtableFlushResults.
-    // Verify this, by Ref'ing then UnRef'ing:
-    m->Ref();
-    ASSERT_EQ(m, m->Unref());
-    delete m;
-  }
-  to_delete.clear();
+  ASSERT_EQ(2, list.NumNotFlushed());
+  int num_in_history = std::min(3, max_write_buffer_number_to_maintain);
+  ASSERT_EQ(num_in_history, list.NumFlushed());
+  ASSERT_EQ(5 - list.NumNotFlushed() - num_in_history, to_delete.size());
 
   // Request a flush again. Should be nothing to flush
   list.FlushRequested();
@@ -388,10 +563,12 @@ TEST_F(MemTableListTest, FlushPendingTest) {
       &list, MutableCFOptions(options, ioptions), to_flush2, &to_delete);
   ASSERT_OK(s);
 
-  // This will actually intall 2 tables.  The 1 we told it to flush, and also
+  // This will actually install 2 tables.  The 1 we told it to flush, and also
   // tables[4] which has been waiting for tables[3] to commit.
-  ASSERT_EQ(2, to_delete.size());
-  ASSERT_EQ(0, list.size());
+  ASSERT_EQ(0, list.NumNotFlushed());
+  num_in_history = std::min(5, max_write_buffer_number_to_maintain);
+  ASSERT_EQ(num_in_history, list.NumFlushed());
+  ASSERT_EQ(5 - list.NumNotFlushed() - num_in_history, to_delete.size());
 
   for (const auto& m : to_delete) {
     // Refcount should be 0 after calling InstallMemtableFlushResults.
@@ -403,7 +580,17 @@ TEST_F(MemTableListTest, FlushPendingTest) {
   to_delete.clear();
 
   list.current()->Unref(&to_delete);
-  ASSERT_EQ(0, to_delete.size());
+  int to_delete_size = std::min(5, max_write_buffer_number_to_maintain);
+  ASSERT_EQ(to_delete_size, to_delete.size());
+
+  for (const auto& m : to_delete) {
+    // Refcount should be 0 after calling InstallMemtableFlushResults.
+    // Verify this, by Ref'ing then UnRef'ing:
+    m->Ref();
+    ASSERT_EQ(m, m->Unref());
+    delete m;
+  }
+  to_delete.clear();
 }
 
 }  // namespace rocksdb
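The new assertions in FlushPendingTest check a simple conservation rule: every table added ends up unflushed, in history, or in to_delete. Checked standalone with the numbers from the first install above:

    #include <algorithm>
    #include <cassert>

    int main() {
      const int total = 5;            // tables created by the test
      const int max_to_maintain = 7;  // max_write_buffer_number_to_maintain
      const int not_flushed = 2;      // tables[3], tables[4] after 1st install
      const int flushed = 3;          // first three tables committed
      const int in_history = std::min(flushed, max_to_maintain);
      const int deleted = total - not_flushed - in_history;
      assert(in_history == 3);
      assert(deleted == 0);  // a budget of 7 keeps all three flushed tables
      return 0;
    }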
diff --git a/src/rocksdb/db/memtablerep_bench.cc b/src/rocksdb/db/memtablerep_bench.cc
index feb3723..a2a8722 100644
--- a/src/rocksdb/db/memtablerep_bench.cc
+++ b/src/rocksdb/db/memtablerep_bench.cc
@@ -132,6 +132,8 @@ DEFINE_int64(seed, 0,
              "Seed base for random number generators. "
              "When 0 it is deterministic.");
 
+static rocksdb::Env* FLAGS_env = rocksdb::Env::Default();
+
 namespace rocksdb {
 
 namespace {
@@ -310,9 +312,10 @@ class ReadBenchmarkThread : public BenchmarkThread {
     assert(callback_args != nullptr);
     uint32_t key_length;
     const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
-    if ((callback_args->comparator)->user_comparator()->Compare(
-            Slice(key_ptr, key_length - 8), callback_args->key->user_key()) ==
-        0) {
+    if ((callback_args->comparator)
+            ->user_comparator()
+            ->Equal(Slice(key_ptr, key_length - 8),
+                    callback_args->key->user_key())) {
       callback_args->found = true;
     }
     return false;
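The benchmark now asks the comparator for Equal() instead of Compare() == 0; an equality check can short-circuit (for example on length) before touching any bytes. A minimal sketch of that idea with a hypothetical comparator, not rocksdb's class:

    #include <cassert>
    #include <cstring>
    #include <string>

    struct BytewiseCmp {
      int Compare(const std::string& a, const std::string& b) const {
        return a.compare(b);
      }
      bool Equal(const std::string& a, const std::string& b) const {
        // Length check short-circuits before any byte comparison.
        return a.size() == b.size() &&
               std::memcmp(a.data(), b.data(), a.size()) == 0;
      }
    };

    int main() {
      BytewiseCmp cmp;
      assert(cmp.Equal("key", "key"));
      assert(!cmp.Equal("key", "key2"));  // rejected by the length check alone
      return 0;
    }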
diff --git a/src/rocksdb/db/merge_helper.cc b/src/rocksdb/db/merge_helper.cc
index cd4d456..f9cb67e 100644
--- a/src/rocksdb/db/merge_helper.cc
+++ b/src/rocksdb/db/merge_helper.cc
@@ -2,17 +2,18 @@
 //  This source code is licensed under the BSD-style license found in the
 //  LICENSE file in the root directory of this source tree. An additional grant
 //  of patent rights can be found in the PATENTS file in the same directory.
-//
-#include "merge_helper.h"
+
+#include "db/merge_helper.h"
+
+#include <stdio.h>
+#include <string>
+
 #include "db/dbformat.h"
 #include "rocksdb/comparator.h"
 #include "rocksdb/db.h"
 #include "rocksdb/merge_operator.h"
-#include "util/statistics.h"
-#include <string>
-#include <stdio.h>
 #include "util/perf_context_imp.h"
-#include "util/stop_watch.h"
+#include "util/statistics.h"
 
 namespace rocksdb {
 
@@ -39,8 +40,7 @@ Status MergeHelper::TimedFullMerge(const Slice& key, const Slice* value,
   bool success =
       merge_operator->FullMerge(key, value, operands, result, logger);
 
-  RecordTick(statistics, MERGE_OPERATION_TOTAL_TIME,
-             env != nullptr ? timer.ElapsedNanos() : 0);
+  RecordTick(statistics, MERGE_OPERATION_TOTAL_TIME, timer.ElapsedNanosSafe());
 
   if (!success) {
     RecordTick(statistics, NUMBER_MERGE_FAILURES);
@@ -56,136 +56,146 @@ Status MergeHelper::TimedFullMerge(const Slice& key, const Slice* value,
 //       keys_ stores the list of keys encountered while merging.
 //       operands_ stores the list of merge operands encountered while merging.
 //       keys_[i] corresponds to operands_[i] for each i.
-void MergeHelper::MergeUntil(Iterator* iter, SequenceNumber stop_before,
-                             bool at_bottom, Statistics* stats, int* steps,
-                             Env* env_) {
+Status MergeHelper::MergeUntil(Iterator* iter, const SequenceNumber stop_before,
+                               const bool at_bottom) {
   // Get a copy of the internal key, before it's invalidated by iter->Next()
   // Also maintain the list of merge operands seen.
   assert(HasOperator());
   keys_.clear();
   operands_.clear();
-  keys_.push_front(iter->key().ToString());
-  operands_.push_front(iter->value().ToString());
   assert(user_merge_operator_);
-
-  success_ = false;   // Will become true if we hit Put/Delete or bottom
+  bool first_key = true;
 
   // We need to parse the internal key again as the parsed key is
   // backed by the internal key!
   // Assume no internal key corruption as it has been successfully parsed
   // by the caller.
-  // Invariant: keys_.back() will not change. Hence, orig_ikey is always valid.
+  // original_key_is_iter variable is just caching the information:
+  // original_key_is_iter == (iter->key().ToString() == original_key)
+  bool original_key_is_iter = true;
+  std::string original_key = iter->key().ToString();
+  // Important:
+  // orig_ikey is backed by original_key if keys_.empty()
+  // orig_ikey is backed by keys_.back() if !keys_.empty()
   ParsedInternalKey orig_ikey;
-  ParseInternalKey(keys_.back(), &orig_ikey);
+  ParseInternalKey(original_key, &orig_ikey);
 
+  Status s;
   bool hit_the_next_user_key = false;
-  std::string merge_result;  // Temporary value for merge results
-  if (steps) {
-    ++(*steps);
-  }
-  for (iter->Next(); iter->Valid(); iter->Next()) {
+  for (; iter->Valid(); iter->Next(), original_key_is_iter = false) {
     ParsedInternalKey ikey;
-    assert(operands_.size() >= 1);        // Should be invariants!
     assert(keys_.size() == operands_.size());
 
     if (!ParseInternalKey(iter->key(), &ikey)) {
       // stop at corrupted key
       if (assert_valid_internal_key_) {
-        assert(!"corrupted internal key is not expected");
+        assert(!"Corrupted internal key not expected.");
+        return Status::Corruption("Corrupted internal key not expected.");
       }
       break;
-    }
-
-    if (user_comparator_->Compare(ikey.user_key, orig_ikey.user_key) != 0) {
+    } else if (first_key) {
+      assert(user_comparator_->Equal(ikey.user_key, orig_ikey.user_key));
+      first_key = false;
+    } else if (!user_comparator_->Equal(ikey.user_key, orig_ikey.user_key)) {
       // hit a different user key, stop right here
       hit_the_next_user_key = true;
       break;
-    }
-
-    if (stop_before && ikey.sequence <= stop_before) {
+    } else if (stop_before && ikey.sequence <= stop_before) {
       // hit an entry that's visible by the previous snapshot, can't touch that
       break;
     }
 
     // At this point we are guaranteed that we need to process this key.
 
-    if (kTypeDeletion == ikey.type) {
-      // hit a delete
-      //   => merge nullptr with operands_
-      //   => store result in operands_.back() (and update keys_.back())
-      //   => change the entry type to kTypeValue for keys_.back()
-      // We are done! Return a success if the merge passes.
-
-      Status s = TimedFullMerge(ikey.user_key, nullptr, operands_,
-                                user_merge_operator_, stats, env_, logger_,
-                                &merge_result);
-
-      // We store the result in keys_.back() and operands_.back()
-      // if nothing went wrong (i.e.: no operand corruption on disk)
-      if (s.ok()) {
-        std::string& original_key =
-            keys_.back();  // The original key encountered
-        orig_ikey.type = kTypeValue;
-        UpdateInternalKey(&original_key[0], original_key.size(),
-                          orig_ikey.sequence, orig_ikey.type);
-        swap(operands_.back(), merge_result);
-      }
-
-      // move iter to the next entry (before doing anything else)
-      iter->Next();
-      if (steps) {
-        ++(*steps);
+    assert(IsValueType(ikey.type));
+    if (ikey.type != kTypeMerge) {
+      if (ikey.type != kTypeValue && ikey.type != kTypeDeletion) {
+        // Merge operands can only be used with puts and deletions; single
+        // deletions are not supported.
+        assert(false);
+        // release build doesn't have asserts, so we return error status
+        return Status::InvalidArgument(
+            "Merge operands can only be used with puts and deletions; single "
+            "deletions are not supported.");
       }
-      return;
-    }
 
-    if (kTypeValue == ikey.type) {
-      // hit a put
-      //   => merge the put value with operands_
+      // hit a put/delete
+      //   => merge the put value or a nullptr with operands_
       //   => store result in operands_.back() (and update keys_.back())
       //   => change the entry type to kTypeValue for keys_.back()
       // We are done! Success!
+
+      // If there are no operands, just return the Status::OK(). That will cause
+      // the compaction iterator to write out the key we're currently at, which
+      // is the put/delete we just encountered.
+      if (keys_.empty()) {
+        return Status::OK();
+      }
+
+      // TODO(noetzli) If the merge operator returns false, we are currently
+      // (almost) silently dropping the put/delete. That's probably not what we
+      // want.
       const Slice val = iter->value();
-      Status s =
-          TimedFullMerge(ikey.user_key, &val, operands_, user_merge_operator_,
-                         stats, env_, logger_, &merge_result);
+      const Slice* val_ptr = (kTypeValue == ikey.type) ? &val : nullptr;
+      std::string merge_result;
+      s = TimedFullMerge(ikey.user_key, val_ptr, operands_,
+                         user_merge_operator_, stats_, env_, logger_,
+                         &merge_result);
 
       // We store the result in keys_.back() and operands_.back()
       // if nothing went wrong (i.e.: no operand corruption on disk)
       if (s.ok()) {
-        std::string& original_key =
-            keys_.back();  // The original key encountered
+        // The original key encountered
+        original_key = std::move(keys_.back());
         orig_ikey.type = kTypeValue;
-        UpdateInternalKey(&original_key[0], original_key.size(),
-                          orig_ikey.sequence, orig_ikey.type);
-        swap(operands_.back(), merge_result);
+        UpdateInternalKey(&original_key, orig_ikey.sequence, orig_ikey.type);
+        keys_.clear();
+        operands_.clear();
+        keys_.emplace_front(std::move(original_key));
+        operands_.emplace_front(std::move(merge_result));
       }
 
       // move iter to the next entry
       iter->Next();
-      if (steps) {
-        ++(*steps);
-      }
-      return;
-    }
-
-    if (kTypeMerge == ikey.type) {
+      return s;
+    } else {
       // hit a merge
+      //   => if there is a compaction filter, apply it.
       //   => merge the operand into the front of the operands_ list
-      //   => use the user's associative merge function to determine how.
+      //      if not filtered
       //   => then continue because we haven't yet seen a Put/Delete.
-      assert(!operands_.empty()); // Should have at least one element in it
-
-      // keep queuing keys and operands until we either meet a put / delete
+      //
+      // Keep queuing keys and operands until we either meet a put / delete
       // request or later did a partial merge.
-      keys_.push_front(iter->key().ToString());
-      operands_.push_front(iter->value().ToString());
-      if (steps) {
-        ++(*steps);
+
+      Slice value_slice = iter->value();
+      // add an operand to the list if:
+      // 1) it's included in one of the snapshots. in that case we *must* write
+      // it out, no matter what compaction filter says
+      // 2) it's not filtered by a compaction filter
+      if (ikey.sequence <= latest_snapshot_ ||
+          !FilterMerge(orig_ikey.user_key, value_slice)) {
+        if (original_key_is_iter) {
+          // this is just an optimization that saves us one memcpy
+          keys_.push_front(std::move(original_key));
+        } else {
+          keys_.push_front(iter->key().ToString());
+        }
+        if (keys_.size() == 1) {
+          // we need to re-anchor the orig_ikey because it was anchored by
+          // original_key before
+          ParseInternalKey(keys_.back(), &orig_ikey);
+        }
+        operands_.push_front(value_slice.ToString());
       }
     }
   }
 
+  if (operands_.size() == 0) {
+    // we filtered out all the merge operands
+    return Status::OK();
+  }
+
   // We are sure we have seen this key's entire history if we are at the
   // last level and exhausted all internal keys of this user key.
   // NOTE: !iter->Valid() does not necessarily mean we hit the
@@ -208,53 +218,88 @@ void MergeHelper::MergeUntil(Iterator* iter, SequenceNumber stop_before,
     assert(kTypeMerge == orig_ikey.type);
     assert(operands_.size() >= 1);
     assert(operands_.size() == keys_.size());
-    {
-      StopWatchNano timer(env_, stats != nullptr);
-      PERF_TIMER_GUARD(merge_operator_time_nanos);
-      success_ = user_merge_operator_->FullMerge(
-          orig_ikey.user_key, nullptr, operands_, &merge_result, logger_);
-      RecordTick(stats, MERGE_OPERATION_TOTAL_TIME,
-                 env_ != nullptr ? timer.ElapsedNanos() : 0);
-    }
-    if (success_) {
-      std::string& original_key = keys_.back();  // The original key encountered
+    std::string merge_result;
+    s = TimedFullMerge(orig_ikey.user_key, nullptr, operands_,
+                       user_merge_operator_, stats_, env_, logger_,
+                       &merge_result);
+    if (s.ok()) {
+      // The original key encountered
+      // We are certain that keys_ is not empty here (see the assertions a
+      // couple of lines above).
+      original_key = std::move(keys_.back());
       orig_ikey.type = kTypeValue;
-      UpdateInternalKey(&original_key[0], original_key.size(),
-                        orig_ikey.sequence, orig_ikey.type);
-
-      // The final value() is always stored in operands_.back()
-      swap(operands_.back(),merge_result);
-    } else {
-      RecordTick(stats, NUMBER_MERGE_FAILURES);
-      // Do nothing if not success_. Leave keys() and operands() as they are.
+      UpdateInternalKey(&original_key, orig_ikey.sequence, orig_ikey.type);
+      keys_.clear();
+      operands_.clear();
+      keys_.emplace_front(std::move(original_key));
+      operands_.emplace_front(std::move(merge_result));
     }
   } else {
     // We haven't seen the beginning of the key nor a Put/Delete.
     // Attempt to use the user's associative merge function to
     // merge the stacked merge operands into a single operand.
-
+    //
+    // TODO(noetzli) The docblock of MergeUntil suggests that a successful
+    // partial merge returns Status::OK(). Should we change the status code
+    // after a successful partial merge?
+    s = Status::MergeInProgress();
     if (operands_.size() >= 2 &&
         operands_.size() >= min_partial_merge_operands_) {
       bool merge_success = false;
+      std::string merge_result;
       {
-        StopWatchNano timer(env_, stats != nullptr);
+        StopWatchNano timer(env_, stats_ != nullptr);
         PERF_TIMER_GUARD(merge_operator_time_nanos);
         merge_success = user_merge_operator_->PartialMergeMulti(
             orig_ikey.user_key,
             std::deque<Slice>(operands_.begin(), operands_.end()),
             &merge_result, logger_);
-        RecordTick(stats, MERGE_OPERATION_TOTAL_TIME,
-                   env_ != nullptr ? timer.ElapsedNanos() : 0);
+        RecordTick(stats_, MERGE_OPERATION_TOTAL_TIME,
+                   timer.ElapsedNanosSafe());
       }
       if (merge_success) {
         // Merging of operands (associative merge) was successful.
         // Replace operands with the merge result
         operands_.clear();
-        operands_.push_front(std::move(merge_result));
+        operands_.emplace_front(std::move(merge_result));
         keys_.erase(keys_.begin(), keys_.end() - 1);
       }
     }
   }
+
+  return s;
+}
+
+MergeOutputIterator::MergeOutputIterator(const MergeHelper* merge_helper)
+    : merge_helper_(merge_helper) {
+  it_keys_ = merge_helper_->keys().rend();
+  it_values_ = merge_helper_->values().rend();
+}
+
+void MergeOutputIterator::SeekToFirst() {
+  const auto& keys = merge_helper_->keys();
+  const auto& values = merge_helper_->values();
+  assert(keys.size() == values.size());
+  it_keys_ = keys.rbegin();
+  it_values_ = values.rbegin();
+}
+
+void MergeOutputIterator::Next() {
+  ++it_keys_;
+  ++it_values_;
+}
+
+bool MergeHelper::FilterMerge(const Slice& user_key, const Slice& value_slice) {
+  if (compaction_filter_ == nullptr) {
+    return false;
+  }
+  if (stats_ != nullptr) {
+    filter_timer_.Start();
+  }
+  bool to_delete =
+      compaction_filter_->FilterMergeOperand(level_, user_key, value_slice);
+  total_filter_time_ += filter_timer_.ElapsedNanosSafe();
+  return to_delete;
 }
 
 } // namespace rocksdb
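The new MergeOutputIterator replays results oldest-first by walking the deques in reverse, since MergeUntil() stacks entries newest-at-front. A standalone model of that traversal (hypothetical simplified types):

    #include <cassert>
    #include <deque>
    #include <string>

    // keys().back() was the first key seen by the input iterator, so the
    // reverse walk emits entries in their original encounter order.
    class ReverseIter {
     public:
      explicit ReverseIter(const std::deque<std::string>* keys) : keys_(keys) {}
      void SeekToFirst() { it_ = keys_->rbegin(); }
      void Next() { ++it_; }
      bool Valid() const { return it_ != keys_->rend(); }
      const std::string& key() const { return *it_; }

     private:
      const std::deque<std::string>* keys_;
      std::deque<std::string>::const_reverse_iterator it_;
    };

    int main() {
      std::deque<std::string> keys = {"a@50", "a@40", "a@30"};  // newest first
      ReverseIter it(&keys);
      it.SeekToFirst();
      assert(it.Valid() && it.key() == "a@30");  // oldest emitted first
      it.Next();
      it.Next();
      it.Next();
      assert(!it.Valid());
      return 0;
    }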
diff --git a/src/rocksdb/db/merge_helper.h b/src/rocksdb/db/merge_helper.h
index 7722446..ade3d71 100644
--- a/src/rocksdb/db/merge_helper.h
+++ b/src/rocksdb/db/merge_helper.h
@@ -6,11 +6,14 @@
 #ifndef MERGE_HELPER_H
 #define MERGE_HELPER_H
 
-#include "db/dbformat.h"
-#include "rocksdb/slice.h"
-#include <string>
 #include <deque>
+#include <string>
+
+#include "db/dbformat.h"
+#include "rocksdb/compaction_filter.h"
 #include "rocksdb/env.h"
+#include "rocksdb/slice.h"
+#include "util/stop_watch.h"
 
 namespace rocksdb {
 
@@ -22,22 +25,36 @@ class Statistics;
 
 class MergeHelper {
  public:
-  MergeHelper(const Comparator* user_comparator,
-              const MergeOperator* user_merge_operator, Logger* logger,
+  MergeHelper(Env* env, const Comparator* user_comparator,
+              const MergeOperator* user_merge_operator,
+              const CompactionFilter* compaction_filter, Logger* logger,
               unsigned min_partial_merge_operands,
-              bool assert_valid_internal_key)
-      : user_comparator_(user_comparator),
+              bool assert_valid_internal_key, SequenceNumber latest_snapshot,
+              int level = 0, Statistics* stats = nullptr)
+      : env_(env),
+        user_comparator_(user_comparator),
         user_merge_operator_(user_merge_operator),
+        compaction_filter_(compaction_filter),
         logger_(logger),
         min_partial_merge_operands_(min_partial_merge_operands),
         assert_valid_internal_key_(assert_valid_internal_key),
+        latest_snapshot_(latest_snapshot),
+        level_(level),
         keys_(),
         operands_(),
-        success_(false) {}
+        filter_timer_(env_),
+        total_filter_time_(0U),
+        stats_(stats) {
+    assert(user_comparator_ != nullptr);
+  }
 
   // Wrapper around MergeOperator::FullMerge() that records perf statistics.
   // Result of merge will be written to result if status returned is OK.
   // If operands is empty, the value will simply be copied to result.
+  // Returns one of the following statuses:
+  // - OK: Entries were successfully merged.
+  // - Corruption: Merge operator reported unsuccessful merge.
+  // - NotSupported: Merge operator is missing.
   static Status TimedFullMerge(const Slice& key, const Slice* value,
                                const std::deque<std::string>& operands,
                                const MergeOperator* merge_operator,
@@ -56,18 +73,29 @@ class MergeHelper {
   //                   0 means no restriction
   // at_bottom:   (IN) true if the iterator covers the bottom level, which means
   //                   we could reach the start of the history of this user key.
-  void MergeUntil(Iterator* iter, SequenceNumber stop_before = 0,
-                  bool at_bottom = false, Statistics* stats = nullptr,
-                  int* steps = nullptr, Env* env_ = nullptr);
+  //
+  // Returns one of the following statuses:
+  // - OK: Entries were successfully merged.
+  // - MergeInProgress: Put/Delete not encountered and unable to merge operands.
+  // - Corruption: Merge operator reported unsuccessful merge or a corrupted
+  //   key has been encountered and not expected (applies only when compiling
+  //   with asserts removed).
+  //
+  // REQUIRED: The first key in the input is not corrupted.
+  Status MergeUntil(Iterator* iter, const SequenceNumber stop_before = 0,
+                    const bool at_bottom = false);
+
+  // Filters a merge operand using the compaction filter specified
+  // in the constructor. Returns true if the operand should be filtered out.
+  bool FilterMerge(const Slice& user_key, const Slice& value_slice);
 
   // Query the merge result
   // These are valid until the next MergeUntil call
   // If the merging was successful:
-  //   - IsSuccess() will be true
-  //   - key() will have the latest sequence number of the merges.
-  //           The type will be Put or Merge. See IMPORTANT 1 note, below.
-  //   - value() will be the result of merging all the operands together
-  //   - The user should ignore keys() and values().
+  //   - keys() contains a single element with the latest sequence number of
+  //     the merges. The type will be Put or Merge. See IMPORTANT 1 note, below.
+  //   - values() contains a single element with the result of merging all the
+  //     operands together
   //
   //   IMPORTANT 1: the key type could change after the MergeUntil call.
   //        Put/Delete + Merge + ... + Merge => Put
@@ -75,7 +103,6 @@ class MergeHelper {
   //
   // If the merge operator is not associative, and if a Put/Delete is not found
   // then the merging will be unsuccessful. In this case:
-  //   - IsSuccess() will be false
   //   - keys() contains the list of internal keys seen in order of iteration.
   //   - values() contains the list of values (merges) seen in the same order.
   //              values() is parallel to keys() so that the first entry in
@@ -83,34 +110,55 @@ class MergeHelper {
   //              and so on. These lists will be the same length.
   //              All of these pairs will be merges over the same user key.
   //              See IMPORTANT 2 note below.
-  //   - The user should ignore key() and value().
   //
   //   IMPORTANT 2: The entries were traversed in order from BACK to FRONT.
   //                So keys().back() was the first key seen by iterator.
   // TODO: Re-style this comment to be like the first one
-  bool IsSuccess() const { return success_; }
-  Slice key() const { assert(success_); return Slice(keys_.back()); }
-  Slice value() const { assert(success_); return Slice(operands_.back()); }
-  const std::deque<std::string>& keys() const {
-    assert(!success_); return keys_;
-  }
-  const std::deque<std::string>& values() const {
-    assert(!success_); return operands_;
-  }
+  const std::deque<std::string>& keys() const { return keys_; }
+  const std::deque<std::string>& values() const { return operands_; }
+  uint64_t TotalFilterTime() const { return total_filter_time_; }
   bool HasOperator() const { return user_merge_operator_ != nullptr; }
 
  private:
+  Env* env_;
   const Comparator* user_comparator_;
   const MergeOperator* user_merge_operator_;
+  const CompactionFilter* compaction_filter_;
   Logger* logger_;
   unsigned min_partial_merge_operands_;
   bool assert_valid_internal_key_; // enforce no internal key corruption?
+  SequenceNumber latest_snapshot_;
+  int level_;
 
   // the scratch area that holds the result of MergeUntil
   // valid up to the next MergeUntil call
   std::deque<std::string> keys_;    // Keeps track of the sequence of keys seen
   std::deque<std::string> operands_;  // Parallel with keys_; stores the values
-  bool success_;
+
+  StopWatchNano filter_timer_;
+  uint64_t total_filter_time_;
+  Statistics* stats_;
+};
+
+// MergeOutputIterator can be used to iterate over the result of a merge.
+class MergeOutputIterator {
+ public:
+  // The MergeOutputIterator is bound to a MergeHelper instance.
+  explicit MergeOutputIterator(const MergeHelper* merge_helper);
+
+  // Seeks to the first record in the output.
+  void SeekToFirst();
+  // Advances to the next record in the output.
+  void Next();
+
+  Slice key() { return Slice(*it_keys_); }
+  Slice value() { return Slice(*it_values_); }
+  bool Valid() { return it_keys_ != merge_helper_->keys().rend(); }
+
+ private:
+  const MergeHelper* merge_helper_;
+  std::deque<std::string>::const_reverse_iterator it_keys_;
+  std::deque<std::string>::const_reverse_iterator it_values_;
 };
 
 } // namespace rocksdb
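How a caller might branch on the three MergeUntil() outcomes documented in the header above, modeled standalone (hypothetical Status enum; rocksdb's Status carries the same cases):

    #include <cassert>
    #include <string>

    enum class Status { kOk, kMergeInProgress, kCorruption };

    std::string Describe(Status s) {
      switch (s) {
        case Status::kOk:
          return "merge completed: keys()/values() hold a single result";
        case Status::kMergeInProgress:
          return "no Put/Delete seen and partial merge failed: emit the "
                 "stacked operands as-is";
        case Status::kCorruption:
          return "corrupted internal key: abort the compaction";
      }
      return "";
    }

    int main() {
      assert(Describe(Status::kMergeInProgress).rfind("no Put/Delete", 0) == 0);
      return 0;
    }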
diff --git a/src/rocksdb/db/merge_helper_test.cc b/src/rocksdb/db/merge_helper_test.cc
new file mode 100644
index 0000000..2ef0d39
--- /dev/null
+++ b/src/rocksdb/db/merge_helper_test.cc
@@ -0,0 +1,289 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "db/merge_helper.h"
+#include "rocksdb/comparator.h"
+#include "util/coding.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+#include "utilities/merge_operators.h"
+
+namespace rocksdb {
+
+class MergeHelperTest : public testing::Test {
+ public:
+  MergeHelperTest() { env_ = Env::Default(); }
+
+  ~MergeHelperTest() = default;
+
+  Status Run(SequenceNumber stop_before, bool at_bottom,
+             SequenceNumber latest_snapshot = 0) {
+    iter_.reset(new test::VectorIterator(ks_, vs_));
+    iter_->SeekToFirst();
+    merge_helper_.reset(new MergeHelper(env_, BytewiseComparator(),
+                                        merge_op_.get(), filter_.get(), nullptr,
+                                        2U, false, latest_snapshot));
+    return merge_helper_->MergeUntil(iter_.get(), stop_before, at_bottom);
+  }
+
+  void AddKeyVal(const std::string& user_key, const SequenceNumber& seq,
+                 const ValueType& t, const std::string& val,
+                 bool corrupt = false) {
+    InternalKey ikey(user_key, seq, t);
+    if (corrupt) {
+      test::CorruptKeyType(&ikey);
+    }
+    ks_.push_back(ikey.Encode().ToString());
+    vs_.push_back(val);
+  }
+
+  Env* env_;
+  std::unique_ptr<test::VectorIterator> iter_;
+  std::shared_ptr<MergeOperator> merge_op_;
+  std::unique_ptr<MergeHelper> merge_helper_;
+  std::vector<std::string> ks_;
+  std::vector<std::string> vs_;
+  std::unique_ptr<test::FilterNumber> filter_;
+};
+
+// If MergeHelper encounters a new key on the last level, we know that
+// the key has no older history, so the merge can be completed.
+TEST_F(MergeHelperTest, MergeAtBottomSuccess) {
+  merge_op_ = MergeOperators::CreateUInt64AddOperator();
+
+  AddKeyVal("a", 20, kTypeMerge, test::EncodeInt(1U));
+  AddKeyVal("a", 10, kTypeMerge, test::EncodeInt(3U));
+  AddKeyVal("b", 10, kTypeMerge, test::EncodeInt(4U));  // <- iter_ after merge
+
+  ASSERT_TRUE(Run(0, true).ok());
+  ASSERT_EQ(ks_[2], iter_->key());
+  ASSERT_EQ(test::KeyStr("a", 20, kTypeValue), merge_helper_->keys()[0]);
+  ASSERT_EQ(test::EncodeInt(4U), merge_helper_->values()[0]);
+  ASSERT_EQ(1U, merge_helper_->keys().size());
+  ASSERT_EQ(1U, merge_helper_->values().size());
+}
+
+// Merging with a value results in a successful merge.
+TEST_F(MergeHelperTest, MergeValue) {
+  merge_op_ = MergeOperators::CreateUInt64AddOperator();
+
+  AddKeyVal("a", 40, kTypeMerge, test::EncodeInt(1U));
+  AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(3U));
+  AddKeyVal("a", 20, kTypeValue, test::EncodeInt(4U));  // <- iter_ after merge
+  AddKeyVal("a", 10, kTypeMerge, test::EncodeInt(1U));
+
+  ASSERT_TRUE(Run(0, false).ok());
+  ASSERT_EQ(ks_[3], iter_->key());
+  ASSERT_EQ(test::KeyStr("a", 40, kTypeValue), merge_helper_->keys()[0]);
+  ASSERT_EQ(test::EncodeInt(8U), merge_helper_->values()[0]);
+  ASSERT_EQ(1U, merge_helper_->keys().size());
+  ASSERT_EQ(1U, merge_helper_->values().size());
+}
+
+// Merging stops before a snapshot.
+TEST_F(MergeHelperTest, SnapshotBeforeValue) {
+  merge_op_ = MergeOperators::CreateUInt64AddOperator();
+
+  AddKeyVal("a", 50, kTypeMerge, test::EncodeInt(1U));
+  AddKeyVal("a", 40, kTypeMerge, test::EncodeInt(3U));  // <- iter_ after merge
+  AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(1U));
+  AddKeyVal("a", 20, kTypeValue, test::EncodeInt(4U));
+  AddKeyVal("a", 10, kTypeMerge, test::EncodeInt(1U));
+
+  ASSERT_TRUE(Run(31, true).IsMergeInProgress());
+  ASSERT_EQ(ks_[2], iter_->key());
+  ASSERT_EQ(test::KeyStr("a", 50, kTypeMerge), merge_helper_->keys()[0]);
+  ASSERT_EQ(test::EncodeInt(4U), merge_helper_->values()[0]);
+  ASSERT_EQ(1U, merge_helper_->keys().size());
+  ASSERT_EQ(1U, merge_helper_->values().size());
+}
+
+// MergeHelper preserves the operand stack for merge operators that
+// cannot do a partial merge.
+TEST_F(MergeHelperTest, NoPartialMerge) {
+  merge_op_ = MergeOperators::CreateStringAppendTESTOperator();
+
+  AddKeyVal("a", 50, kTypeMerge, "v2");
+  AddKeyVal("a", 40, kTypeMerge, "v");  // <- iter_ after merge
+  AddKeyVal("a", 30, kTypeMerge, "v");
+
+  ASSERT_TRUE(Run(31, true).IsMergeInProgress());
+  ASSERT_EQ(ks_[2], iter_->key());
+  ASSERT_EQ(test::KeyStr("a", 40, kTypeMerge), merge_helper_->keys()[0]);
+  ASSERT_EQ("v", merge_helper_->values()[0]);
+  ASSERT_EQ(test::KeyStr("a", 50, kTypeMerge), merge_helper_->keys()[1]);
+  ASSERT_EQ("v2", merge_helper_->values()[1]);
+  ASSERT_EQ(2U, merge_helper_->keys().size());
+  ASSERT_EQ(2U, merge_helper_->values().size());
+}
+
+// A single merge operand cannot be merged on its own.
+TEST_F(MergeHelperTest, SingleOperand) {
+  merge_op_ = MergeOperators::CreateUInt64AddOperator();
+
+  AddKeyVal("a", 50, kTypeMerge, test::EncodeInt(1U));
+
+  ASSERT_TRUE(Run(31, true).IsMergeInProgress());
+  ASSERT_FALSE(iter_->Valid());
+  ASSERT_EQ(test::KeyStr("a", 50, kTypeMerge), merge_helper_->keys()[0]);
+  ASSERT_EQ(test::EncodeInt(1U), merge_helper_->values()[0]);
+  ASSERT_EQ(1U, merge_helper_->keys().size());
+  ASSERT_EQ(1U, merge_helper_->values().size());
+}
+
+// Merging with a deletion turns the deletion into a value
+TEST_F(MergeHelperTest, MergeDeletion) {
+  merge_op_ = MergeOperators::CreateUInt64AddOperator();
+
+  AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(3U));
+  AddKeyVal("a", 20, kTypeDeletion, "");
+
+  ASSERT_TRUE(Run(15, false).ok());
+  ASSERT_FALSE(iter_->Valid());
+  ASSERT_EQ(test::KeyStr("a", 30, kTypeValue), merge_helper_->keys()[0]);
+  ASSERT_EQ(test::EncodeInt(3U), merge_helper_->values()[0]);
+  ASSERT_EQ(1U, merge_helper_->keys().size());
+  ASSERT_EQ(1U, merge_helper_->values().size());
+}
+
+// The merge helper stops upon encountering a corrupt key
+TEST_F(MergeHelperTest, CorruptKey) {
+  merge_op_ = MergeOperators::CreateUInt64AddOperator();
+
+  AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(3U));
+  AddKeyVal("a", 25, kTypeMerge, test::EncodeInt(1U));
+  // Corrupt key
+  AddKeyVal("a", 20, kTypeDeletion, "", true);  // <- iter_ after merge
+
+  ASSERT_TRUE(Run(15, false).IsMergeInProgress());
+  ASSERT_EQ(ks_[2], iter_->key());
+  ASSERT_EQ(test::KeyStr("a", 30, kTypeMerge), merge_helper_->keys()[0]);
+  ASSERT_EQ(test::EncodeInt(4U), merge_helper_->values()[0]);
+  ASSERT_EQ(1U, merge_helper_->keys().size());
+  ASSERT_EQ(1U, merge_helper_->values().size());
+}
+
+// The compaction filter is called on every merge operand
+TEST_F(MergeHelperTest, FilterMergeOperands) {
+  merge_op_ = MergeOperators::CreateUInt64AddOperator();
+  filter_.reset(new test::FilterNumber(5U));
+
+  AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(3U));
+  AddKeyVal("a", 29, kTypeMerge, test::EncodeInt(5U));  // Filtered
+  AddKeyVal("a", 28, kTypeMerge, test::EncodeInt(3U));
+  AddKeyVal("a", 27, kTypeMerge, test::EncodeInt(1U));
+  AddKeyVal("a", 26, kTypeMerge, test::EncodeInt(5U));  // Filtered
+  AddKeyVal("a", 25, kTypeValue, test::EncodeInt(1U));
+
+  ASSERT_TRUE(Run(15, false).ok());
+  ASSERT_FALSE(iter_->Valid());
+  MergeOutputIterator merge_output_iter(merge_helper_.get());
+  merge_output_iter.SeekToFirst();
+  ASSERT_EQ(test::KeyStr("a", 30, kTypeValue),
+            merge_output_iter.key().ToString());
+  ASSERT_EQ(test::EncodeInt(8U), merge_output_iter.value().ToString());
+  merge_output_iter.Next();
+  ASSERT_FALSE(merge_output_iter.Valid());
+}
+
+TEST_F(MergeHelperTest, FilterAllMergeOperands) {
+  merge_op_ = MergeOperators::CreateUInt64AddOperator();
+  filter_.reset(new test::FilterNumber(5U));
+
+  AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(5U));
+  AddKeyVal("a", 29, kTypeMerge, test::EncodeInt(5U));
+  AddKeyVal("a", 28, kTypeMerge, test::EncodeInt(5U));
+  AddKeyVal("a", 27, kTypeMerge, test::EncodeInt(5U));
+  AddKeyVal("a", 26, kTypeMerge, test::EncodeInt(5U));
+  AddKeyVal("a", 25, kTypeMerge, test::EncodeInt(5U));
+
+  // filtered out all
+  ASSERT_TRUE(Run(15, false).ok());
+  ASSERT_FALSE(iter_->Valid());
+  MergeOutputIterator merge_output_iter(merge_helper_.get());
+  merge_output_iter.SeekToFirst();
+  ASSERT_FALSE(merge_output_iter.Valid());
+
+  // one entry survives filtering because it is a deletion, not a merge operand
+  AddKeyVal("a", 24, kTypeDeletion, test::EncodeInt(5U));
+  AddKeyVal("b", 23, kTypeValue, test::EncodeInt(5U));
+  ASSERT_TRUE(Run(15, true).ok());
+  merge_output_iter = MergeOutputIterator(merge_helper_.get());
+  ASSERT_TRUE(iter_->Valid());
+  merge_output_iter.SeekToFirst();
+  ASSERT_FALSE(merge_output_iter.Valid());
+
+  // when all merge operands are filtered out, we leave the iterator pointing to
+  // the Put/Delete that survived
+  ASSERT_EQ(test::KeyStr("a", 24, kTypeDeletion), iter_->key().ToString());
+  ASSERT_EQ(test::EncodeInt(5U), iter_->value().ToString());
+}
+
+// Make sure that merge operands are filtered at the beginning
+TEST_F(MergeHelperTest, FilterFirstMergeOperand) {
+  merge_op_ = MergeOperators::CreateUInt64AddOperator();
+  filter_.reset(new test::FilterNumber(5U));
+
+  AddKeyVal("a", 31, kTypeMerge, test::EncodeInt(5U));  // Filtered
+  AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(5U));  // Filtered
+  AddKeyVal("a", 29, kTypeMerge, test::EncodeInt(2U));
+  AddKeyVal("a", 28, kTypeMerge, test::EncodeInt(1U));
+  AddKeyVal("a", 27, kTypeMerge, test::EncodeInt(3U));
+  AddKeyVal("a", 26, kTypeMerge, test::EncodeInt(5U));  // Filtered
+  AddKeyVal("a", 25, kTypeMerge, test::EncodeInt(5U));  // Filtered
+  AddKeyVal("b", 24, kTypeValue, test::EncodeInt(5U));  // next user key
+
+  ASSERT_OK(Run(15, true));
+  ASSERT_TRUE(iter_->Valid());
+  MergeOutputIterator merge_output_iter(merge_helper_.get());
+  merge_output_iter.SeekToFirst();
+  // sequence number is 29 here, because the first merge operand got filtered
+  // out
+  ASSERT_EQ(test::KeyStr("a", 29, kTypeValue),
+            merge_output_iter.key().ToString());
+  ASSERT_EQ(test::EncodeInt(6U), merge_output_iter.value().ToString());
+  merge_output_iter.Next();
+  ASSERT_FALSE(merge_output_iter.Valid());
+
+  // make sure that we're passing user keys into the filter
+  ASSERT_EQ("a", filter_->last_merge_operand_key());
+}
+
+// Make sure that merge operands are not filtered out if there's a snapshot
+// pointing at them
+TEST_F(MergeHelperTest, DontFilterMergeOperandsBeforeSnapshotTest) {
+  merge_op_ = MergeOperators::CreateUInt64AddOperator();
+  filter_.reset(new test::FilterNumber(5U));
+
+  AddKeyVal("a", 31, kTypeMerge, test::EncodeInt(5U));
+  AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(5U));
+  AddKeyVal("a", 29, kTypeMerge, test::EncodeInt(2U));
+  AddKeyVal("a", 28, kTypeMerge, test::EncodeInt(1U));
+  AddKeyVal("a", 27, kTypeMerge, test::EncodeInt(3U));
+  AddKeyVal("a", 26, kTypeMerge, test::EncodeInt(5U));
+  AddKeyVal("a", 25, kTypeMerge, test::EncodeInt(5U));
+  AddKeyVal("b", 24, kTypeValue, test::EncodeInt(5U));
+
+  ASSERT_OK(Run(15, true, 32));
+  ASSERT_TRUE(iter_->Valid());
+  MergeOutputIterator merge_output_iter(merge_helper_.get());
+  merge_output_iter.SeekToFirst();
+  ASSERT_EQ(test::KeyStr("a", 31, kTypeValue),
+            merge_output_iter.key().ToString());
+  ASSERT_EQ(test::EncodeInt(26U), merge_output_iter.value().ToString());
+  merge_output_iter.Next();
+  ASSERT_FALSE(merge_output_iter.Valid());
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
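
For readers of the tests above, a positional annotation of the MergeHelper
constructor call made in Run(); the argument meanings are inferred from the
members added to merge_helper.h earlier in this patch, not from a documented
public API:

    merge_helper_.reset(new MergeHelper(
        env_,                  // Env; drives the compaction filter stopwatch
        BytewiseComparator(),  // user key comparator
        merge_op_.get(),       // user merge operator
        filter_.get(),         // compaction filter; nullptr disables filtering
        nullptr,               // Logger
        2U,                    // min_partial_merge_operands
        false,                 // assert_valid_internal_key
        latest_snapshot));     // operands visible at this snapshot are kept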
diff --git a/src/rocksdb/db/merge_operator.cc b/src/rocksdb/db/merge_operator.cc
index a14df8a..c6645a9 100644
--- a/src/rocksdb/db/merge_operator.cc
+++ b/src/rocksdb/db/merge_operator.cc
@@ -20,11 +20,11 @@ bool MergeOperator::PartialMergeMulti(const Slice& key,
                                       Logger* logger) const {
   assert(operand_list.size() >= 2);
   // Simply loop through the operands
-  std::string temp_value;
   Slice temp_slice(operand_list[0]);
 
   for (size_t i = 1; i < operand_list.size(); ++i) {
     auto& operand = operand_list[i];
+    std::string temp_value;
     if (!PartialMerge(key, temp_slice, operand, &temp_value, logger)) {
       return false;
     }
@@ -48,9 +48,9 @@ bool AssociativeMergeOperator::FullMerge(
 
   // Simply loop through the operands
   Slice temp_existing;
-  std::string temp_value;
   for (const auto& operand : operand_list) {
     Slice value(operand);
+    std::string temp_value;
     if (!Merge(key, existing_value, value, &temp_value, logger)) {
       return false;
     }
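
The two hunks above move temp_value inside the loop so that every
Merge()/PartialMerge() call starts from an empty output string, which is
what the assert(new_value->empty()) checks added to merge_test.cc below
rely on. A minimal associative operator in the spirit of the tests'
CreateUInt64AddOperator, as a sketch (EncodeFixed64/DecodeFixed64 come
from the internal util/coding.h):

    #include <cassert>
    #include "rocksdb/merge_operator.h"
    #include "util/coding.h"  // EncodeFixed64 / DecodeFixed64

    class Uint64AddOperator : public rocksdb::AssociativeMergeOperator {
     public:
      virtual bool Merge(const rocksdb::Slice& key,
                         const rocksdb::Slice* existing_value,
                         const rocksdb::Slice& value, std::string* new_value,
                         rocksdb::Logger* logger) const override {
        assert(new_value->empty());  // relies on the scoping fix above
        uint64_t sum = 0;
        if (existing_value != nullptr && existing_value->size() == 8) {
          sum += rocksdb::DecodeFixed64(existing_value->data());
        }
        if (value.size() == 8) {
          sum += rocksdb::DecodeFixed64(value.data());
        }
        char buf[8];
        rocksdb::EncodeFixed64(buf, sum);
        new_value->assign(buf, 8);
        return true;
      }
      virtual const char* Name() const override { return "Uint64AddOperator"; }
    };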
diff --git a/src/rocksdb/db/merge_test.cc b/src/rocksdb/db/merge_test.cc
index 2fa7fae..192ea2f 100644
--- a/src/rocksdb/db/merge_test.cc
+++ b/src/rocksdb/db/merge_test.cc
@@ -7,6 +7,7 @@
 #include <memory>
 #include <iostream>
 
+#include "port/stack_trace.h"
 #include "rocksdb/cache.h"
 #include "rocksdb/comparator.h"
 #include "rocksdb/db.h"
@@ -41,6 +42,7 @@ class CountMergeOperator : public AssociativeMergeOperator {
                      const Slice& value,
                      std::string* new_value,
                      Logger* logger) const override {
+    assert(new_value->empty());
     ++num_merge_operator_calls;
     if (existing_value == nullptr) {
       new_value->assign(value.data(), value.size());
@@ -59,6 +61,7 @@ class CountMergeOperator : public AssociativeMergeOperator {
                                  const std::deque<Slice>& operand_list,
                                  std::string* new_value,
                                  Logger* logger) const override {
+    assert(new_value->empty());
     ++num_partial_merge_calls;
     return mergeOperator_->PartialMergeMulti(key, operand_list, new_value,
                                              logger);
@@ -84,6 +87,8 @@ std::shared_ptr<DB> OpenDb(const string& dbname, const bool ttl = false,
   options.min_partial_merge_operands = min_partial_merge_operands;
   Status s;
   DestroyDB(dbname, Options());
+// DBWithTTL is not supported in ROCKSDB_LITE
+#ifndef ROCKSDB_LITE
   if (ttl) {
     cout << "Opening database with TTL\n";
     DBWithTTL* db_with_ttl;
@@ -92,6 +97,10 @@ std::shared_ptr<DB> OpenDb(const string& dbname, const bool ttl = false,
   } else {
     s = DB::Open(options, dbname, &db);
   }
+#else
+  assert(!ttl);
+  s = DB::Open(options, dbname, &db);
+#endif  // !ROCKSDB_LITE
   if (!s.ok()) {
     cerr << s.ToString() << endl;
     assert(false);
@@ -294,7 +303,7 @@ void testCounters(Counters& counters, DB* db, bool test_compaction) {
     db->Flush(o);
 
     cout << "Compaction started ...\n";
-    db->CompactRange(nullptr, nullptr);
+    db->CompactRange(CompactRangeOptions(), nullptr, nullptr);
     cout << "Compaction ended\n";
 
     dumpDb(db);
@@ -341,7 +350,7 @@ void testPartialMerge(Counters* counters, DB* db, size_t max_merge,
     tmp_sum += i;
   }
   db->Flush(o);
-  db->CompactRange(nullptr, nullptr);
+  db->CompactRange(CompactRangeOptions(), nullptr, nullptr);
   ASSERT_EQ(tmp_sum, counters->assert_get("b"));
   if (count > max_merge) {
     // in this case, FullMerge should be called instead.
@@ -360,7 +369,7 @@ void testPartialMerge(Counters* counters, DB* db, size_t max_merge,
     tmp_sum += i;
   }
   db->Flush(o);
-  db->CompactRange(nullptr, nullptr);
+  db->CompactRange(CompactRangeOptions(), nullptr, nullptr);
   ASSERT_EQ(tmp_sum, counters->assert_get("c"));
   ASSERT_EQ(num_partial_merge_calls, 0U);
 }
@@ -467,7 +476,7 @@ void runTest(int argc, const string& dbname, const bool use_ttl = false) {
       counters.add("test-key", 1);
       counters.add("test-key", 1);
       counters.add("test-key", 1);
-      db->CompactRange(nullptr, nullptr);
+      db->CompactRange(CompactRangeOptions(), nullptr, nullptr);
     }
 
     DB* reopen_db;
@@ -498,8 +507,12 @@ void runTest(int argc, const string& dbname, const bool use_ttl = false) {
 
 int main(int argc, char *argv[]) {
   //TODO: Make this test like a general rocksdb unit-test
+  rocksdb::port::InstallStackTraceHandler();
   runTest(argc, test::TmpDir() + "/merge_testdb");
+// DBWithTTL is not supported in ROCKSDB_LITE
+#ifndef ROCKSDB_LITE
   runTest(argc, test::TmpDir() + "/merge_testdbttl", true); // Run test on TTL database
+#endif  // !ROCKSDB_LITE
   printf("Passed all tests!\n");
   return 0;
 }
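
The CompactRange() call sites above track the RocksDB API change that added
a CompactRangeOptions first parameter. A full-range compaction with default
options now looks like this (sketch, assuming an open DB* db):

    rocksdb::CompactRangeOptions cro;  // defaults: no level change requested
    rocksdb::Status s = db->CompactRange(cro, nullptr, nullptr);
    assert(s.ok());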
diff --git a/src/rocksdb/db/plain_table_db_test.cc b/src/rocksdb/db/plain_table_db_test.cc
index edcfde7..d9c0082 100644
--- a/src/rocksdb/db/plain_table_db_test.cc
+++ b/src/rocksdb/db/plain_table_db_test.cc
@@ -8,6 +8,9 @@
 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+
 #include <algorithm>
 #include <set>
 
@@ -24,6 +27,7 @@
 #include "rocksdb/table.h"
 #include "table/meta_blocks.h"
 #include "table/bloom_block.h"
+#include "table/table_builder.h"
 #include "table/plain_table_factory.h"
 #include "table/plain_table_reader.h"
 #include "util/hash.h"
@@ -38,28 +42,33 @@ using std::unique_ptr;
 
 namespace rocksdb {
 
-class PlainTableDBTest : public testing::Test {
+class PlainTableDBTest : public testing::Test,
+                         public testing::WithParamInterface<bool> {
  protected:
  private:
   std::string dbname_;
   Env* env_;
   DB* db_;
 
+  bool mmap_mode_;
   Options last_options_;
 
  public:
-  PlainTableDBTest() : env_(Env::Default()) {
-    dbname_ = test::TmpDir() + "/plain_table_db_test";
-    EXPECT_OK(DestroyDB(dbname_, Options()));
-    db_ = nullptr;
-    Reopen();
-  }
+  PlainTableDBTest() : env_(Env::Default()) {}
 
   ~PlainTableDBTest() {
     delete db_;
     EXPECT_OK(DestroyDB(dbname_, Options()));
   }
 
+  void SetUp() override {
+    mmap_mode_ = GetParam();
+    dbname_ = test::TmpDir() + "/plain_table_db_test";
+    EXPECT_OK(DestroyDB(dbname_, Options()));
+    db_ = nullptr;
+    Reopen();
+  }
+
   // Return the current option configuration.
   Options CurrentOptions() {
     Options options;
@@ -78,7 +87,7 @@ class PlainTableDBTest : public testing::Test {
     options.memtable_factory.reset(NewHashLinkListRepFactory(4, 0, 3, true));
 
     options.prefix_extractor.reset(NewFixedPrefixTransform(8));
-    options.allow_mmap_reads = true;
+    options.allow_mmap_reads = mmap_mode_;
     return options;
   }
 
@@ -183,7 +192,7 @@ class PlainTableDBTest : public testing::Test {
   }
 };
 
-TEST_F(PlainTableDBTest, Empty) {
+TEST_P(PlainTableDBTest, Empty) {
   ASSERT_TRUE(dbfull() != nullptr);
   ASSERT_EQ("NOT_FOUND", Get("0000000000000foo"));
 }
@@ -198,14 +207,13 @@ class TestPlainTableReader : public PlainTableReader {
                        int bloom_bits_per_key, double hash_table_ratio,
                        size_t index_sparseness,
                        const TableProperties* table_properties,
-                       unique_ptr<RandomAccessFile>&& file,
+                       unique_ptr<RandomAccessFileReader>&& file,
                        const ImmutableCFOptions& ioptions,
-                       bool* expect_bloom_not_match,
-                       bool store_index_in_file)
+                       bool* expect_bloom_not_match, bool store_index_in_file)
       : PlainTableReader(ioptions, std::move(file), env_options, icomparator,
                          encoding_type, file_size, table_properties),
         expect_bloom_not_match_(expect_bloom_not_match) {
-    Status s = MmapDataFile();
+    Status s = MmapDataIfNeeded();
     EXPECT_TRUE(s.ok());
 
     s = PopulateIndex(const_cast<TableProperties*>(table_properties),
@@ -254,27 +262,29 @@ class TestPlainTableFactory : public PlainTableFactory {
         store_index_in_file_(options.store_index_in_file),
         expect_bloom_not_match_(expect_bloom_not_match) {}
 
-  Status NewTableReader(const ImmutableCFOptions& ioptions,
-                        const EnvOptions& env_options,
-                        const InternalKeyComparator& internal_comparator,
-                        unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
+  Status NewTableReader(const TableReaderOptions& table_reader_options,
+                        unique_ptr<RandomAccessFileReader>&& file,
+                        uint64_t file_size,
                         unique_ptr<TableReader>* table) const override {
     TableProperties* props = nullptr;
-    auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber,
-                                 ioptions.env, ioptions.info_log, &props);
+    auto s =
+        ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber,
+                            table_reader_options.ioptions.env,
+                            table_reader_options.ioptions.info_log, &props);
     EXPECT_TRUE(s.ok());
 
     if (store_index_in_file_) {
       BlockHandle bloom_block_handle;
       s = FindMetaBlock(file.get(), file_size, kPlainTableMagicNumber,
-                        ioptions.env, BloomBlockBuilder::kBloomBlock,
-                        &bloom_block_handle);
+                        table_reader_options.ioptions.env,
+                        BloomBlockBuilder::kBloomBlock, &bloom_block_handle);
       EXPECT_TRUE(s.ok());
 
       BlockHandle index_block_handle;
-      s = FindMetaBlock(
-          file.get(), file_size, kPlainTableMagicNumber, ioptions.env,
-          PlainTableIndexBuilder::kPlainTableIndexBlock, &index_block_handle);
+      s = FindMetaBlock(file.get(), file_size, kPlainTableMagicNumber,
+                        table_reader_options.ioptions.env,
+                        PlainTableIndexBuilder::kPlainTableIndexBlock,
+                        &index_block_handle);
       EXPECT_TRUE(s.ok());
     }
 
@@ -286,9 +296,10 @@ class TestPlainTableFactory : public PlainTableFactory {
         DecodeFixed32(encoding_type_prop->second.c_str()));
 
     std::unique_ptr<PlainTableReader> new_reader(new TestPlainTableReader(
-        env_options, internal_comparator, encoding_type, file_size,
+        table_reader_options.env_options,
+        table_reader_options.internal_comparator, encoding_type, file_size,
         bloom_bits_per_key_, hash_table_ratio_, index_sparseness_, props,
-        std::move(file), ioptions, expect_bloom_not_match_,
+        std::move(file), table_reader_options.ioptions, expect_bloom_not_match_,
         store_index_in_file_));
 
     *table = std::move(new_reader);
@@ -303,7 +314,7 @@ class TestPlainTableFactory : public PlainTableFactory {
   bool* expect_bloom_not_match_;
 };
 
-TEST_F(PlainTableDBTest, Flush) {
+TEST_P(PlainTableDBTest, Flush) {
   for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
        huge_page_tlb_size += 2 * 1024 * 1024) {
     for (EncodingType encoding_type : {kPlain, kPrefix}) {
@@ -390,7 +401,7 @@ TEST_F(PlainTableDBTest, Flush) {
   }
 }
 
-TEST_F(PlainTableDBTest, Flush2) {
+TEST_P(PlainTableDBTest, Flush2) {
   for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
        huge_page_tlb_size += 2 * 1024 * 1024) {
     for (EncodingType encoding_type : {kPlain, kPrefix}) {
@@ -470,7 +481,7 @@ TEST_F(PlainTableDBTest, Flush2) {
   }
 }
 
-TEST_F(PlainTableDBTest, Iterator) {
+TEST_P(PlainTableDBTest, Iterator) {
   for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
        huge_page_tlb_size += 2 * 1024 * 1024) {
     for (EncodingType encoding_type : {kPlain, kPrefix}) {
@@ -604,7 +615,7 @@ std::string MakeLongKey(size_t length, char c) {
 }
 }  // namespace
 
-TEST_F(PlainTableDBTest, IteratorLargeKeys) {
+TEST_P(PlainTableDBTest, IteratorLargeKeys) {
   Options options = CurrentOptions();
 
   PlainTableOptions plain_table_options;
@@ -654,7 +665,7 @@ std::string MakeLongKeyWithPrefix(size_t length, char c) {
 }
 }  // namespace
 
-TEST_F(PlainTableDBTest, IteratorLargeKeysWithPrefix) {
+TEST_P(PlainTableDBTest, IteratorLargeKeysWithPrefix) {
   Options options = CurrentOptions();
 
   PlainTableOptions plain_table_options;
@@ -696,7 +707,7 @@ TEST_F(PlainTableDBTest, IteratorLargeKeysWithPrefix) {
   delete iter;
 }
 
-TEST_F(PlainTableDBTest, IteratorReverseSuffixComparator) {
+TEST_P(PlainTableDBTest, IteratorReverseSuffixComparator) {
   Options options = CurrentOptions();
   options.create_if_missing = true;
   // Set only one bucket to force bucket conflict.
@@ -765,7 +776,7 @@ TEST_F(PlainTableDBTest, IteratorReverseSuffixComparator) {
   delete iter;
 }
 
-TEST_F(PlainTableDBTest, HashBucketConflict) {
+TEST_P(PlainTableDBTest, HashBucketConflict) {
   for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
        huge_page_tlb_size += 2 * 1024 * 1024) {
     for (unsigned char i = 1; i <= 3; i++) {
@@ -858,7 +869,7 @@ TEST_F(PlainTableDBTest, HashBucketConflict) {
   }
 }
 
-TEST_F(PlainTableDBTest, HashBucketConflictReverseSuffixComparator) {
+TEST_P(PlainTableDBTest, HashBucketConflictReverseSuffixComparator) {
   for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
        huge_page_tlb_size += 2 * 1024 * 1024) {
     for (unsigned char i = 1; i <= 3; i++) {
@@ -951,7 +962,7 @@ TEST_F(PlainTableDBTest, HashBucketConflictReverseSuffixComparator) {
   }
 }
 
-TEST_F(PlainTableDBTest, NonExistingKeyToNonEmptyBucket) {
+TEST_P(PlainTableDBTest, NonExistingKeyToNonEmptyBucket) {
   Options options = CurrentOptions();
   options.create_if_missing = true;
   // Set only one bucket to force bucket conflict.
@@ -1007,11 +1018,10 @@ static std::string RandomString(Random* rnd, int len) {
   return r;
 }
 
-TEST_F(PlainTableDBTest, CompactionTrigger) {
+TEST_P(PlainTableDBTest, CompactionTrigger) {
   Options options = CurrentOptions();
-  options.write_buffer_size = 100 << 10; //100KB
+  options.write_buffer_size = 120 << 10;  // 120KB
   options.num_levels = 3;
-  options.max_mem_compaction_level = 0;
   options.level0_file_num_compaction_trigger = 3;
   Reopen(&options);
 
@@ -1020,11 +1030,12 @@ TEST_F(PlainTableDBTest, CompactionTrigger) {
   for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
       num++) {
     std::vector<std::string> values;
-    // Write 120KB (12 values, each 10K)
-    for (int i = 0; i < 12; i++) {
-      values.push_back(RandomString(&rnd, 10000));
+    // Write 120KB (10 values, each 12K)
+    for (int i = 0; i < 10; i++) {
+      values.push_back(RandomString(&rnd, 12000));
       ASSERT_OK(Put(Key(i), values[i]));
     }
+    ASSERT_OK(Put(Key(999), ""));
     dbfull()->TEST_WaitForFlushMemTable();
     ASSERT_EQ(NumTableFilesAtLevel(0), num + 1);
   }
@@ -1035,13 +1046,14 @@ TEST_F(PlainTableDBTest, CompactionTrigger) {
     values.push_back(RandomString(&rnd, 10000));
     ASSERT_OK(Put(Key(i), values[i]));
   }
+  ASSERT_OK(Put(Key(999), ""));
   dbfull()->TEST_WaitForCompact();
 
   ASSERT_EQ(NumTableFilesAtLevel(0), 0);
   ASSERT_EQ(NumTableFilesAtLevel(1), 1);
 }
 
-TEST_F(PlainTableDBTest, AdaptiveTable) {
+TEST_P(PlainTableDBTest, AdaptiveTable) {
   Options options = CurrentOptions();
   options.create_if_missing = true;
 
@@ -1084,9 +1096,21 @@ TEST_F(PlainTableDBTest, AdaptiveTable) {
   ASSERT_NE("v5", Get("3000000000000bar"));
 }
 
+INSTANTIATE_TEST_CASE_P(PlainTableDBTest, PlainTableDBTest, ::testing::Bool());
+
 }  // namespace rocksdb
 
 int main(int argc, char** argv) {
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
+
+#else
+#include <stdio.h>
+
+int main(int argc, char** argv) {
+  fprintf(stderr, "SKIPPED as plain table is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // !ROCKSDB_LITE
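
The conversion above from TEST_F to TEST_P runs every plain-table test
twice, once per mmap mode. The gtest machinery involved, shown in isolation
as an independent sketch:

    class MiniParamTest : public ::testing::TestWithParam<bool> {};

    TEST_P(MiniParamTest, SeesBothValues) {
      bool mmap_mode = GetParam();  // executed once with false, once with true
      (void)mmap_mode;
    }

    // ::testing::Bool() generates the {false, true} parameter set.
    INSTANTIATE_TEST_CASE_P(OnOff, MiniParamTest, ::testing::Bool());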
diff --git a/src/rocksdb/db/prefix_test.cc b/src/rocksdb/db/prefix_test.cc
index 3cc4e89..d095d44 100644
--- a/src/rocksdb/db/prefix_test.cc
+++ b/src/rocksdb/db/prefix_test.cc
@@ -3,11 +3,13 @@
 //  LICENSE file in the root directory of this source tree. An additional grant
 //  of patent rights can be found in the PATENTS file in the same directory.
 
+#ifndef ROCKSDB_LITE
+
 #ifndef GFLAGS
 #include <cstdio>
 int main() {
-  fprintf(stderr, "Please install gflags to run rocksdb tools\n");
-  return 1;
+  fprintf(stderr, "Please install gflags to run this test... Skipping...\n");
+  return 0;
 }
 #else
 
@@ -501,3 +503,15 @@ int main(int argc, char** argv) {
 }
 
 #endif  // GFLAGS
+
+#else
+#include <stdio.h>
+
+int main(int argc, char** argv) {
+  fprintf(stderr,
+          "SKIPPED as HashSkipList and HashLinkList are not supported in "
+          "ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/repair.cc b/src/rocksdb/db/repair.cc
index 8b15eaa..d1ef6db 100644
--- a/src/rocksdb/db/repair.cc
+++ b/src/rocksdb/db/repair.cc
@@ -81,6 +81,7 @@
 #include "rocksdb/env.h"
 #include "rocksdb/options.h"
 #include "rocksdb/immutable_options.h"
+#include "util/file_reader_writer.h"
 #include "util/scoped_arena_iterator.h"
 
 namespace rocksdb {
@@ -127,7 +128,7 @@ class Repairer {
       }
       Log(InfoLogLevel::WARN_LEVEL, options_.info_log,
           "**** Repaired rocksdb %s; "
-          "recovered %zu files; %" PRIu64
+          "recovered %" ROCKSDB_PRIszt " files; %" PRIu64
           "bytes. "
           "Some data may have been lost. "
           "****",
@@ -236,6 +237,8 @@ class Repairer {
     if (!status.ok()) {
       return status;
     }
+    unique_ptr<SequentialFileReader> lfile_reader(
+        new SequentialFileReader(std::move(lfile)));
 
     // Create the log reader.
     LogReporter reporter;
@@ -246,16 +249,17 @@ class Repairer {
     // corruptions cause entire commits to be skipped instead of
     // propagating bad information (like overly large sequence
     // numbers).
-    log::Reader reader(std::move(lfile), &reporter, true /*enable checksum*/,
-                       0/*initial_offset*/);
+    log::Reader reader(std::move(lfile_reader), &reporter,
+                       true /*enable checksum*/, 0 /*initial_offset*/);
 
     // Read all the records and add to a memtable
     std::string scratch;
     Slice record;
     WriteBatch batch;
     WriteBuffer wb(options_.db_write_buffer_size);
-    MemTable* mem = new MemTable(icmp_, ioptions_,
-                                 MutableCFOptions(options_, ioptions_), &wb);
+    MemTable* mem =
+        new MemTable(icmp_, ioptions_, MutableCFOptions(options_, ioptions_),
+                     &wb, kMaxSequenceNumber);
     auto cf_mems_default = new ColumnFamilyMemTablesDefault(mem);
     mem->Ref();
     int counter = 0;
@@ -288,8 +292,8 @@ class Repairer {
       ScopedArenaIterator iter(mem->NewIterator(ro, &arena));
       status = BuildTable(dbname_, env_, ioptions_, env_options_, table_cache_,
                           iter.get(), &meta, icmp_,
-                          &int_tbl_prop_collector_factories_, 0, 0,
-                          kNoCompression, CompressionOptions(), false);
+                          &int_tbl_prop_collector_factories_, {},
+                          kNoCompression, CompressionOptions(), false, nullptr);
     }
     delete mem->Unref();
     delete cf_mems_default;
@@ -377,8 +381,8 @@ class Repairer {
   Status WriteDescriptor() {
     std::string tmp = TempFileName(dbname_, 1);
     unique_ptr<WritableFile> file;
-    Status status = env_->NewWritableFile(
-        tmp, &file, env_->OptimizeForManifestWrite(env_options_));
+    EnvOptions env_options = env_->OptimizeForManifestWrite(env_options_);
+    Status status = env_->NewWritableFile(tmp, &file, env_options);
     if (!status.ok()) {
       return status;
     }
@@ -400,12 +404,15 @@ class Repairer {
       const TableInfo& t = tables_[i];
       edit_->AddFile(0, t.meta.fd.GetNumber(), t.meta.fd.GetPathId(),
                      t.meta.fd.GetFileSize(), t.meta.smallest, t.meta.largest,
-                     t.min_sequence, t.max_sequence);
+                     t.min_sequence, t.max_sequence,
+                     t.meta.marked_for_compaction);
     }
 
     //fprintf(stderr, "NewDescriptor:\n%s\n", edit_.DebugString().c_str());
     {
-      log::Writer log(std::move(file));
+      unique_ptr<WritableFileWriter> file_writer(
+          new WritableFileWriter(std::move(file), env_options));
+      log::Writer log(std::move(file_writer));
       std::string record;
       edit_->EncodeTo(&record);
       status = log.AddRecord(record);
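
The hunk above reflects log::Writer now taking a WritableFileWriter rather
than a bare WritableFile. The wrapping pattern in isolation (sketch;
tmp_name, base_env_options and record are assumed to be in scope):

    std::unique_ptr<rocksdb::WritableFile> file;
    rocksdb::EnvOptions opts = env->OptimizeForManifestWrite(base_env_options);
    rocksdb::Status s = env->NewWritableFile(tmp_name, &file, opts);
    if (s.ok()) {
      // The writer takes ownership of the file and adds buffering.
      std::unique_ptr<rocksdb::WritableFileWriter> writer(
          new rocksdb::WritableFileWriter(std::move(file), opts));
      rocksdb::log::Writer log(std::move(writer));
      s = log.AddRecord(record);
    }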
diff --git a/src/rocksdb/db/skiplist.h b/src/rocksdb/db/skiplist.h
index c1e3750..787fad5 100644
--- a/src/rocksdb/db/skiplist.h
+++ b/src/rocksdb/db/skiplist.h
@@ -59,6 +59,9 @@ class SkipList {
   // Returns true iff an entry that compares equal to key is in the list.
   bool Contains(const Key& key) const;
 
+  // Return estimated number of entries smaller than `key`.
+  uint64_t EstimateCount(const Key& key) const;
+
   // Iteration over the contents of a skip list
   class Iterator {
    public:
@@ -117,7 +120,10 @@ class SkipList {
   // values are ok.
   std::atomic<int> max_height_;  // Height of the entire list
 
-  // Used for optimizing sequential insert patterns
+  // Used for optimizing sequential insert patterns.  Tricky.  prev_[i] for
+  // i up to max_height_ is the predecessor of prev_[0] and prev_height_
+  // is the height of prev_[0].  prev_[0] can only be equal to head_ before
+  // the first insertion, in which case max_height_ and prev_height_ are 1.
   Node** prev_;
   int32_t prev_height_;
 
@@ -135,16 +141,15 @@ class SkipList {
   // Return true if key is greater than the data stored in "n"
   bool KeyIsAfterNode(const Key& key, Node* n) const;
 
-  // Return the earliest node that comes at or after key.
+  // Returns the earliest node with a key >= key.
   // Return nullptr if there is no such node.
-  //
-  // If prev is non-nullptr, fills prev[level] with pointer to previous
-  // node at "level" for every level in [0..max_height_-1].
-  Node* FindGreaterOrEqual(const Key& key, Node** prev) const;
+  Node* FindGreaterOrEqual(const Key& key) const;
 
   // Return the latest node with a key < key.
   // Return head_ if there is no such node.
-  Node* FindLessThan(const Key& key) const;
+  // Fills prev[level] with pointer to previous node at "level" for every
+  // level in [0..max_height_-1], if prev is non-null.
+  Node* FindLessThan(const Key& key, Node** prev = nullptr) const;
 
   // Return the last node in the list.
   // Return head_ if list is empty.
@@ -241,7 +246,7 @@ inline void SkipList<Key, Comparator>::Iterator::Prev() {
 
 template<typename Key, class Comparator>
 inline void SkipList<Key, Comparator>::Iterator::Seek(const Key& target) {
-  node_ = list_->FindGreaterOrEqual(target, nullptr);
+  node_ = list_->FindGreaterOrEqual(target);
 }
 
 template<typename Key, class Comparator>
@@ -277,36 +282,59 @@ bool SkipList<Key, Comparator>::KeyIsAfterNode(const Key& key, Node* n) const {
 
 template<typename Key, class Comparator>
 typename SkipList<Key, Comparator>::Node* SkipList<Key, Comparator>::
-  FindGreaterOrEqual(const Key& key, Node** prev) const {
-  // Use prev as an optimization hint and fallback to slow path
-  if (prev && !KeyIsAfterNode(key, prev[0]->Next(0))) {
-    Node* x = prev[0];
-    Node* next = x->Next(0);
-    if ((x == head_) || KeyIsAfterNode(key, x)) {
-      // Adjust all relevant insertion points to the previous entry
-      for (int i = 1; i < prev_height_; i++) {
-        prev[i] = x;
-      }
+  FindGreaterOrEqual(const Key& key) const {
+  // Note: It looks like we could reduce duplication by implementing
+  // this function as FindLessThan(key)->Next(0), but we wouldn't be able
+  // to exit early on equality and the result wouldn't even be correct.
+  // A concurrent insert might occur after FindLessThan(key) but before
+  // we get a chance to call Next(0).
+  Node* x = head_;
+  int level = GetMaxHeight() - 1;
+  Node* last_bigger = nullptr;
+  while (true) {
+    Node* next = x->Next(level);
+    // Make sure the lists are sorted
+    assert(x == head_ || next == nullptr || KeyIsAfterNode(next->key, x));
+    // Make sure we haven't overshot during our search
+    assert(x == head_ || KeyIsAfterNode(key, x));
+    int cmp = (next == nullptr || next == last_bigger)
+        ? 1 : compare_(next->key, key);
+    if (cmp == 0 || (cmp > 0 && level == 0)) {
       return next;
+    } else if (cmp < 0) {
+      // Keep searching in this list
+      x = next;
+    } else {
+      // Switch to next list, reuse compare_() result
+      last_bigger = next;
+      level--;
     }
   }
-  // Normal lookup
+}
+
+template<typename Key, class Comparator>
+typename SkipList<Key, Comparator>::Node*
+SkipList<Key, Comparator>::FindLessThan(const Key& key, Node** prev) const {
   Node* x = head_;
   int level = GetMaxHeight() - 1;
+  // KeyIsAfterNode(key, last_not_after) is definitely false
+  Node* last_not_after = nullptr;
   while (true) {
     Node* next = x->Next(level);
-    // Make sure the lists are sorted.
-    // If x points to head_ or next points nullptr, it is trivially satisfied.
-    assert((x == head_) || (next == nullptr) || KeyIsAfterNode(next->key, x));
-    if (KeyIsAfterNode(key, next)) {
+    assert(x == head_ || next == nullptr || KeyIsAfterNode(next->key, x));
+    assert(x == head_ || KeyIsAfterNode(key, x));
+    if (next != last_not_after && KeyIsAfterNode(key, next)) {
       // Keep searching in this list
       x = next;
     } else {
-      if (prev != nullptr) prev[level] = x;
+      if (prev != nullptr) {
+        prev[level] = x;
+      }
       if (level == 0) {
-        return next;
+        return x;
       } else {
-        // Switch to next list
+        // Switch to next list, reuse KeyIsAfterNode() result
+        last_not_after = next;
         level--;
       }
     }
@@ -314,14 +342,13 @@ typename SkipList<Key, Comparator>::Node* SkipList<Key, Comparator>::
 }
 
 template<typename Key, class Comparator>
-typename SkipList<Key, Comparator>::Node*
-SkipList<Key, Comparator>::FindLessThan(const Key& key) const {
+typename SkipList<Key, Comparator>::Node* SkipList<Key, Comparator>::FindLast()
+    const {
   Node* x = head_;
   int level = GetMaxHeight() - 1;
   while (true) {
-    assert(x == head_ || compare_(x->key, key) < 0);
     Node* next = x->Next(level);
-    if (next == nullptr || compare_(next->key, key) >= 0) {
+    if (next == nullptr) {
       if (level == 0) {
         return x;
       } else {
@@ -334,30 +361,34 @@ SkipList<Key, Comparator>::FindLessThan(const Key& key) const {
   }
 }
 
-template<typename Key, class Comparator>
-typename SkipList<Key, Comparator>::Node* SkipList<Key, Comparator>::FindLast()
-    const {
+template <typename Key, class Comparator>
+uint64_t SkipList<Key, Comparator>::EstimateCount(const Key& key) const {
+  uint64_t count = 0;
+
   Node* x = head_;
   int level = GetMaxHeight() - 1;
   while (true) {
+    assert(x == head_ || compare_(x->key, key) < 0);
     Node* next = x->Next(level);
-    if (next == nullptr) {
+    if (next == nullptr || compare_(next->key, key) >= 0) {
       if (level == 0) {
-        return x;
+        return count;
       } else {
         // Switch to next list
+        count *= kBranching_;
         level--;
       }
     } else {
       x = next;
+      count++;
     }
   }
 }
 
-template<typename Key, class Comparator>
+template <typename Key, class Comparator>
 SkipList<Key, Comparator>::SkipList(const Comparator cmp, Allocator* allocator,
-                                   int32_t max_height,
-                                   int32_t branching_factor)
+                                    int32_t max_height,
+                                    int32_t branching_factor)
     : kMaxHeight_(max_height),
       kBranching_(branching_factor),
       compare_(cmp),
@@ -381,12 +412,27 @@ SkipList<Key, Comparator>::SkipList(const Comparator cmp, Allocator* allocator,
 
 template<typename Key, class Comparator>
 void SkipList<Key, Comparator>::Insert(const Key& key) {
-  // TODO(opt): We can use a barrier-free variant of FindGreaterOrEqual()
-  // here since Insert() is externally synchronized.
-  Node* x = FindGreaterOrEqual(key, prev_);
+  // fast path for sequential insertion
+  if (!KeyIsAfterNode(key, prev_[0]->NoBarrier_Next(0)) &&
+      (prev_[0] == head_ || KeyIsAfterNode(key, prev_[0]))) {
+    assert(prev_[0] != head_ || (prev_height_ == 1 && GetMaxHeight() == 1));
+
+    // Outside of this method prev_[1..max_height_] is the predecessor
+    // of prev_[0], and prev_height_ is the height of prev_[0].  Inside
+    // Insert, prev_[0..max_height - 1] is the predecessor of key.
+    // Switch from the external state to the internal one.
+    for (int i = 1; i < prev_height_; i++) {
+      prev_[i] = prev_[0];
+    }
+  } else {
+    // TODO(opt): we could use a NoBarrier predecessor search as an
+    // optimization for architectures where memory_order_acquire needs
+    // a synchronization instruction.  Doesn't matter on x86
+    FindLessThan(key, prev_);
+  }
 
   // Our data structure does not allow duplicate insertion
-  assert(x == nullptr || !Equal(key, x->key));
+  assert(prev_[0]->Next(0) == nullptr || !Equal(key, prev_[0]->Next(0)->key));
 
   int height = RandomHeight();
   if (height > GetMaxHeight()) {
@@ -405,7 +451,7 @@ void SkipList<Key, Comparator>::Insert(const Key& key) {
     max_height_.store(height, std::memory_order_relaxed);
   }
 
-  x = NewNode(key, height);
+  Node* x = NewNode(key, height);
   for (int i = 0; i < height; i++) {
     // NoBarrier_SetNext() suffices since we will add a barrier when
     // we publish a pointer to "x" in prev[i].
@@ -418,7 +464,7 @@ void SkipList<Key, Comparator>::Insert(const Key& key) {
 
 template<typename Key, class Comparator>
 bool SkipList<Key, Comparator>::Contains(const Key& key) const {
-  Node* x = FindGreaterOrEqual(key, nullptr);
+  Node* x = FindGreaterOrEqual(key);
   if (x != nullptr && Equal(key, x->key)) {
     return true;
   } else {
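
The new EstimateCount() turns the usual skip-list descent into a rank
estimate: each node skipped at level l stands in for roughly kBranching_
nodes at level l-1, so the running count is multiplied by the branching
factor on every descent. A worked example, assuming the default branching
factor of 4 and a populated SkipList named list:

    // Suppose the search skips 2 nodes at level 2, 1 node at level 1,
    // and 3 nodes at level 0:
    //   ((2 * 4) + 1) * 4 + 3 = 39 entries estimated smaller than `key`.
    uint64_t approx_smaller = list.EstimateCount(key);  // estimate, not exact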
diff --git a/src/rocksdb/db/snapshot.h b/src/rocksdb/db/snapshot.h
deleted file mode 100644
index c6852f5..0000000
--- a/src/rocksdb/db/snapshot.h
+++ /dev/null
@@ -1,111 +0,0 @@
-//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
-//  This source code is licensed under the BSD-style license found in the
-//  LICENSE file in the root directory of this source tree. An additional grant
-//  of patent rights can be found in the PATENTS file in the same directory.
-//
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-
-#pragma once
-#include <vector>
-
-#include "rocksdb/db.h"
-
-namespace rocksdb {
-
-class SnapshotList;
-
-// Snapshots are kept in a doubly-linked list in the DB.
-// Each SnapshotImpl corresponds to a particular sequence number.
-class SnapshotImpl : public Snapshot {
- public:
-  SequenceNumber number_;  // const after creation
-
-  virtual SequenceNumber GetSequenceNumber() const override { return number_; }
-
- private:
-  friend class SnapshotList;
-
-  // SnapshotImpl is kept in a doubly-linked circular list
-  SnapshotImpl* prev_;
-  SnapshotImpl* next_;
-
-  SnapshotList* list_;                 // just for sanity checks
-
-  int64_t unix_time_;
-};
-
-class SnapshotList {
- public:
-  SnapshotList() {
-    list_.prev_ = &list_;
-    list_.next_ = &list_;
-    list_.number_ = 0xFFFFFFFFL;      // placeholder marker, for debugging
-    count_ = 0;
-  }
-
-  bool empty() const { return list_.next_ == &list_; }
-  SnapshotImpl* oldest() const { assert(!empty()); return list_.next_; }
-  SnapshotImpl* newest() const { assert(!empty()); return list_.prev_; }
-
-  const SnapshotImpl* New(SequenceNumber seq, uint64_t unix_time) {
-    SnapshotImpl* s = new SnapshotImpl;
-    s->number_ = seq;
-    s->unix_time_ = unix_time;
-    s->list_ = this;
-    s->next_ = &list_;
-    s->prev_ = list_.prev_;
-    s->prev_->next_ = s;
-    s->next_->prev_ = s;
-    count_++;
-    return s;
-  }
-
-  void Delete(const SnapshotImpl* s) {
-    assert(s->list_ == this);
-    s->prev_->next_ = s->next_;
-    s->next_->prev_ = s->prev_;
-    count_--;
-    delete s;
-  }
-
-  // retrieve all snapshot numbers. They are sorted in ascending order.
-  std::vector<SequenceNumber> GetAll() {
-    std::vector<SequenceNumber> ret;
-    if (empty()) {
-      return ret;
-    }
-    SnapshotImpl* s = &list_;
-    while (s->next_ != &list_) {
-      ret.push_back(s->next_->number_);
-      s = s->next_;
-    }
-    return ret;
-  }
-
-  // get the sequence number of the most recent snapshot
-  SequenceNumber GetNewest() {
-    if (empty()) {
-      return 0;
-    }
-    return newest()->number_;
-  }
-
-  int64_t GetOldestSnapshotTime() const {
-    if (empty()) {
-      return 0;
-    } else {
-      return oldest()->unix_time_;
-    }
-  }
-
-  uint64_t count() const { return count_; }
-
- private:
-  // Dummy head of doubly-linked list of snapshots
-  SnapshotImpl list_;
-  uint64_t count_;
-};
-
-}  // namespace rocksdb
diff --git a/src/rocksdb/db/snapshot_impl.cc b/src/rocksdb/db/snapshot_impl.cc
new file mode 100644
index 0000000..1546d68
--- /dev/null
+++ b/src/rocksdb/db/snapshot_impl.cc
@@ -0,0 +1,23 @@
+//  Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include "rocksdb/snapshot.h"
+
+#include "rocksdb/db.h"
+
+namespace rocksdb {
+
+ManagedSnapshot::ManagedSnapshot(DB* db) : db_(db),
+                                           snapshot_(db->GetSnapshot()) {}
+
+ManagedSnapshot::~ManagedSnapshot() {
+  if (snapshot_) {
+    db_->ReleaseSnapshot(snapshot_);
+  }
+}
+
+const Snapshot* ManagedSnapshot::snapshot() { return snapshot_; }
+
+}  // namespace rocksdb
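
The new ManagedSnapshot is a small RAII guard around
GetSnapshot()/ReleaseSnapshot(). Typical usage, as a sketch (db is an open
rocksdb::DB*):

    {
      rocksdb::ManagedSnapshot guard(db);
      rocksdb::ReadOptions read_options;
      read_options.snapshot = guard.snapshot();
      std::string value;
      rocksdb::Status s = db->Get(read_options, "some_key", &value);
    }  // ~ManagedSnapshot calls db->ReleaseSnapshot() here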
diff --git a/src/rocksdb/db/snapshot_impl.h b/src/rocksdb/db/snapshot_impl.h
new file mode 100644
index 0000000..b4d58fd
--- /dev/null
+++ b/src/rocksdb/db/snapshot_impl.h
@@ -0,0 +1,111 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <vector>
+
+#include "rocksdb/db.h"
+
+namespace rocksdb {
+
+class SnapshotList;
+
+// Snapshots are kept in a doubly-linked list in the DB.
+// Each SnapshotImpl corresponds to a particular sequence number.
+class SnapshotImpl : public Snapshot {
+ public:
+  SequenceNumber number_;  // const after creation
+
+  virtual SequenceNumber GetSequenceNumber() const override { return number_; }
+
+ private:
+  friend class SnapshotList;
+
+  // SnapshotImpl is kept in a doubly-linked circular list
+  SnapshotImpl* prev_;
+  SnapshotImpl* next_;
+
+  SnapshotList* list_;                 // just for sanity checks
+
+  int64_t unix_time_;
+};
+
+class SnapshotList {
+ public:
+  SnapshotList() {
+    list_.prev_ = &list_;
+    list_.next_ = &list_;
+    list_.number_ = 0xFFFFFFFFL;      // placeholder marker, for debugging
+    count_ = 0;
+  }
+
+  bool empty() const { return list_.next_ == &list_; }
+  SnapshotImpl* oldest() const { assert(!empty()); return list_.next_; }
+  SnapshotImpl* newest() const { assert(!empty()); return list_.prev_; }
+
+  const SnapshotImpl* New(SnapshotImpl* s, SequenceNumber seq,
+                          uint64_t unix_time) {
+    s->number_ = seq;
+    s->unix_time_ = unix_time;
+    s->list_ = this;
+    s->next_ = &list_;
+    s->prev_ = list_.prev_;
+    s->prev_->next_ = s;
+    s->next_->prev_ = s;
+    count_++;
+    return s;
+  }
+
+  // Unlinks the snapshot from the list but does not free it;
+  // the caller remains responsible for the object.
+  void Delete(const SnapshotImpl* s) {
+    assert(s->list_ == this);
+    s->prev_->next_ = s->next_;
+    s->next_->prev_ = s->prev_;
+    count_--;
+  }
+
+  // retrieve all snapshot numbers. They are sorted in ascending order.
+  std::vector<SequenceNumber> GetAll() {
+    std::vector<SequenceNumber> ret;
+    if (empty()) {
+      return ret;
+    }
+    SnapshotImpl* s = &list_;
+    while (s->next_ != &list_) {
+      ret.push_back(s->next_->number_);
+      s = s->next_;
+    }
+    return ret;
+  }
+
+  // get the sequence number of the most recent snapshot
+  SequenceNumber GetNewest() {
+    if (empty()) {
+      return 0;
+    }
+    return newest()->number_;
+  }
+
+  int64_t GetOldestSnapshotTime() const {
+    if (empty()) {
+      return 0;
+    } else {
+      return oldest()->unix_time_;
+    }
+  }
+
+  uint64_t count() const { return count_; }
+
+ private:
+  // Dummy head of doubly-linked list of snapshots
+  SnapshotImpl list_;
+  uint64_t count_;
+};
+
+}  // namespace rocksdb
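
Compared with the deleted db/snapshot.h, SnapshotList::New() now takes a
caller-allocated SnapshotImpl and Delete() only unlinks it, so allocation
and deallocation live with the caller (DBImpl in practice). The contract in
isolation (sketch):

    rocksdb::SnapshotList snapshots;
    rocksdb::SnapshotImpl* s = new rocksdb::SnapshotImpl;  // caller allocates
    snapshots.New(s, /*seq=*/42, /*unix_time=*/0);
    // ... snapshot in use ...
    snapshots.Delete(s);  // unlinks from the list only
    delete s;             // the caller frees the object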
diff --git a/src/rocksdb/db/table_cache.cc b/src/rocksdb/db/table_cache.cc
index e1b0ca8..b240fc7 100644
--- a/src/rocksdb/db/table_cache.cc
+++ b/src/rocksdb/db/table_cache.cc
@@ -9,21 +9,29 @@
 
 #include "db/table_cache.h"
 
+#include "db/dbformat.h"
 #include "db/filename.h"
 #include "db/version_edit.h"
 
 #include "rocksdb/statistics.h"
 #include "table/iterator_wrapper.h"
+#include "table/table_builder.h"
 #include "table/table_reader.h"
 #include "table/get_context.h"
 #include "util/coding.h"
+#include "util/file_reader_writer.h"
+#include "util/perf_context_imp.h"
 #include "util/stop_watch.h"
+#include "util/sync_point.h"
 
 namespace rocksdb {
 
+namespace {
+
+template <class T>
 static void DeleteEntry(const Slice& key, void* value) {
-  TableReader* table_reader = reinterpret_cast<TableReader*>(value);
-  delete table_reader;
+  T* typed_value = reinterpret_cast<T*>(value);
+  delete typed_value;
 }
 
 static void UnrefEntry(void* arg1, void* arg2) {
@@ -32,16 +40,37 @@ static void UnrefEntry(void* arg1, void* arg2) {
   cache->Release(h);
 }
 
+static void DeleteTableReader(void* arg1, void* arg2) {
+  TableReader* table_reader = reinterpret_cast<TableReader*>(arg1);
+  delete table_reader;
+}
+
 static Slice GetSliceForFileNumber(const uint64_t* file_number) {
   return Slice(reinterpret_cast<const char*>(file_number),
                sizeof(*file_number));
 }
 
+#ifndef ROCKSDB_LITE
+
+void AppendVarint64(IterKey* key, uint64_t v) {
+  char buf[10];
+  auto ptr = EncodeVarint64(buf, v);
+  key->TrimAppend(key->Size(), buf, ptr - buf);
+}
+
+#endif  // ROCKSDB_LITE
+
+}  // namespace
+
 TableCache::TableCache(const ImmutableCFOptions& ioptions,
                        const EnvOptions& env_options, Cache* const cache)
-    : ioptions_(ioptions),
-      env_options_(env_options),
-      cache_(cache) {}
+    : ioptions_(ioptions), env_options_(env_options), cache_(cache) {
+  if (ioptions_.row_cache) {
+    // If the same cache is shared by multiple instances, we need to
+    // disambiguate its entries.
+    PutVarint64(&row_cache_id_, ioptions_.row_cache->NewId());
+  }
+}
 
 TableCache::~TableCache() {
 }
@@ -54,41 +83,66 @@ void TableCache::ReleaseHandle(Cache::Handle* handle) {
   cache_->Release(handle);
 }
 
+Status TableCache::GetTableReader(
+    const EnvOptions& env_options,
+    const InternalKeyComparator& internal_comparator, const FileDescriptor& fd,
+    bool sequential_mode, bool record_read_stats, HistogramImpl* file_read_hist,
+    unique_ptr<TableReader>* table_reader) {
+  std::string fname =
+      TableFileName(ioptions_.db_paths, fd.GetNumber(), fd.GetPathId());
+  unique_ptr<RandomAccessFile> file;
+  Status s = ioptions_.env->NewRandomAccessFile(fname, &file, env_options);
+  if (sequential_mode && ioptions_.compaction_readahead_size > 0) {
+    file = NewReadaheadRandomAccessFile(std::move(file),
+                                        ioptions_.compaction_readahead_size);
+  }
+  RecordTick(ioptions_.statistics, NO_FILE_OPENS);
+  if (s.ok()) {
+    if (!sequential_mode && ioptions_.advise_random_on_open) {
+      file->Hint(RandomAccessFile::RANDOM);
+    }
+    StopWatch sw(ioptions_.env, ioptions_.statistics, TABLE_OPEN_IO_MICROS);
+    std::unique_ptr<RandomAccessFileReader> file_reader(
+        new RandomAccessFileReader(std::move(file), ioptions_.env,
+                                   ioptions_.statistics, record_read_stats,
+                                   file_read_hist));
+    s = ioptions_.table_factory->NewTableReader(
+        TableReaderOptions(ioptions_, env_options, internal_comparator),
+        std::move(file_reader), fd.GetFileSize(), table_reader);
+    TEST_SYNC_POINT("TableCache::GetTableReader:0");
+  }
+  return s;
+}
+
 Status TableCache::FindTable(const EnvOptions& env_options,
                              const InternalKeyComparator& internal_comparator,
                              const FileDescriptor& fd, Cache::Handle** handle,
-                             const bool no_io) {
+                             const bool no_io, bool record_read_stats,
+                             HistogramImpl* file_read_hist) {
+  PERF_TIMER_GUARD(find_table_nanos);
   Status s;
   uint64_t number = fd.GetNumber();
   Slice key = GetSliceForFileNumber(&number);
   *handle = cache_->Lookup(key);
+  TEST_SYNC_POINT_CALLBACK("TableCache::FindTable:0",
+                           const_cast<bool*>(&no_io));
+
   if (*handle == nullptr) {
-    if (no_io) { // Dont do IO and return a not-found status
+    if (no_io) {  // Don't do IO and return a not-found status
       return Status::Incomplete("Table not found in table_cache, no_io is set");
     }
-    std::string fname =
-        TableFileName(ioptions_.db_paths, fd.GetNumber(), fd.GetPathId());
-    unique_ptr<RandomAccessFile> file;
     unique_ptr<TableReader> table_reader;
-    s = ioptions_.env->NewRandomAccessFile(fname, &file, env_options);
-    RecordTick(ioptions_.statistics, NO_FILE_OPENS);
-    if (s.ok()) {
-      if (ioptions_.advise_random_on_open) {
-        file->Hint(RandomAccessFile::RANDOM);
-      }
-      StopWatch sw(ioptions_.env, ioptions_.statistics, TABLE_OPEN_IO_MICROS);
-      s = ioptions_.table_factory->NewTableReader(
-          ioptions_, env_options, internal_comparator, std::move(file),
-          fd.GetFileSize(), &table_reader);
-    }
-
+    s = GetTableReader(env_options, internal_comparator, fd,
+                       false /* sequential mode */, record_read_stats,
+                       file_read_hist, &table_reader);
     if (!s.ok()) {
       assert(table_reader == nullptr);
       RecordTick(ioptions_.statistics, NO_FILE_ERRORS);
       // We do not cache error results so that if the error is transient,
       // or somebody repairs the file, we recover automatically.
     } else {
-      *handle = cache_->Insert(key, table_reader.release(), 1, &DeleteEntry);
+      *handle = cache_->Insert(key, table_reader.release(), 1,
+                               &DeleteEntry<TableReader>);
     }
   }
   return s;
@@ -99,33 +153,56 @@ Iterator* TableCache::NewIterator(const ReadOptions& options,
                                   const InternalKeyComparator& icomparator,
                                   const FileDescriptor& fd,
                                   TableReader** table_reader_ptr,
+                                  HistogramImpl* file_read_hist,
                                   bool for_compaction, Arena* arena) {
+  PERF_TIMER_GUARD(new_table_iterator_nanos);
+
   if (table_reader_ptr != nullptr) {
     *table_reader_ptr = nullptr;
   }
-  TableReader* table_reader = fd.table_reader;
+
+  TableReader* table_reader = nullptr;
   Cache::Handle* handle = nullptr;
-  Status s;
-  if (table_reader == nullptr) {
-    s = FindTable(env_options, icomparator, fd, &handle,
-                  options.read_tier == kBlockCacheTier);
+  bool create_new_table_reader =
+      (for_compaction && ioptions_.new_table_reader_for_compaction_inputs);
+  if (create_new_table_reader) {
+    unique_ptr<TableReader> table_reader_unique_ptr;
+    Status s = GetTableReader(
+        env_options, icomparator, fd, /* sequential mode */ true,
+        /* record stats */ false, nullptr, &table_reader_unique_ptr);
     if (!s.ok()) {
       return NewErrorIterator(s, arena);
     }
-    table_reader = GetTableReaderFromHandle(handle);
+    table_reader = table_reader_unique_ptr.release();
+  } else {
+    table_reader = fd.table_reader;
+    if (table_reader == nullptr) {
+      Status s =
+          FindTable(env_options, icomparator, fd, &handle,
+                    options.read_tier == kBlockCacheTier /* no_io */,
+                    !for_compaction /* record read_stats */, file_read_hist);
+      if (!s.ok()) {
+        return NewErrorIterator(s, arena);
+      }
+      table_reader = GetTableReaderFromHandle(handle);
+    }
   }
 
   Iterator* result = table_reader->NewIterator(options, arena);
-  if (handle != nullptr) {
+
+  if (create_new_table_reader) {
+    assert(handle == nullptr);
+    result->RegisterCleanup(&DeleteTableReader, table_reader, nullptr);
+  } else if (handle != nullptr) {
     result->RegisterCleanup(&UnrefEntry, cache_, handle);
   }
-  if (table_reader_ptr != nullptr) {
-    *table_reader_ptr = table_reader;
-  }
 
   if (for_compaction) {
     table_reader->SetupForCompaction();
   }
+  if (table_reader_ptr != nullptr) {
+    *table_reader_ptr = table_reader;
+  }
 
   return result;
 }
@@ -133,27 +210,83 @@ Iterator* TableCache::NewIterator(const ReadOptions& options,
 Status TableCache::Get(const ReadOptions& options,
                        const InternalKeyComparator& internal_comparator,
                        const FileDescriptor& fd, const Slice& k,
-                       GetContext* get_context) {
+                       GetContext* get_context, HistogramImpl* file_read_hist) {
   TableReader* t = fd.table_reader;
   Status s;
   Cache::Handle* handle = nullptr;
+  std::string* row_cache_entry = nullptr;
+
+#ifndef ROCKSDB_LITE
+  IterKey row_cache_key;
+  std::string row_cache_entry_buffer;
+
+  if (ioptions_.row_cache) {
+    uint64_t fd_number = fd.GetNumber();
+    auto user_key = ExtractUserKey(k);
+    // We use the user key as cache key instead of the internal key,
+    // otherwise the whole cache would be invalidated every time the
+    // sequence key increases. However, to support caching snapshot
+    // reads, we append the sequence number (incremented by 1 to
+    // distinguish from 0) only in this case.
+    uint64_t seq_no =
+        options.snapshot == nullptr ? 0 : 1 + GetInternalKeySeqno(k);
+
+    // Compute row cache key.
+    row_cache_key.TrimAppend(row_cache_key.Size(), row_cache_id_.data(),
+                             row_cache_id_.size());
+    AppendVarint64(&row_cache_key, fd_number);
+    AppendVarint64(&row_cache_key, seq_no);
+    row_cache_key.TrimAppend(row_cache_key.Size(), user_key.data(),
+                             user_key.size());
+
+    if (auto row_handle = ioptions_.row_cache->Lookup(row_cache_key.GetKey())) {
+      auto found_row_cache_entry = static_cast<const std::string*>(
+          ioptions_.row_cache->Value(row_handle));
+      replayGetContextLog(*found_row_cache_entry, user_key, get_context);
+      ioptions_.row_cache->Release(row_handle);
+      RecordTick(ioptions_.statistics, ROW_CACHE_HIT);
+      return Status::OK();
+    }
+
+    // Not found, setting up the replay log.
+    RecordTick(ioptions_.statistics, ROW_CACHE_MISS);
+    row_cache_entry = &row_cache_entry_buffer;
+  }
+#endif  // ROCKSDB_LITE
+
   if (!t) {
     s = FindTable(env_options_, internal_comparator, fd, &handle,
-                  options.read_tier == kBlockCacheTier);
+                  options.read_tier == kBlockCacheTier /* no_io */,
+                  true /* record_read_stats */, file_read_hist);
     if (s.ok()) {
       t = GetTableReaderFromHandle(handle);
     }
   }
   if (s.ok()) {
+    get_context->SetReplayLog(row_cache_entry);  // nullptr if no cache.
     s = t->Get(options, k, get_context);
+    get_context->SetReplayLog(nullptr);
     if (handle != nullptr) {
       ReleaseHandle(handle);
     }
   } else if (options.read_tier && s.IsIncomplete()) {
-    // Couldnt find Table in cache but treat as kFound if no_io set
+    // Couldn't find Table in cache but treat as kFound if no_io set
     get_context->MarkKeyMayExist();
     return Status::OK();
   }
+
+#ifndef ROCKSDB_LITE
+  // Put the replay log in row cache only if something was found.
+  if (s.ok() && row_cache_entry && !row_cache_entry->empty()) {
+    size_t charge =
+        row_cache_key.Size() + row_cache_entry->size() + sizeof(std::string);
+    void* row_ptr = new std::string(std::move(*row_cache_entry));
+    auto row_handle = ioptions_.row_cache->Insert(
+        row_cache_key.GetKey(), row_ptr, charge, &DeleteEntry<std::string>);
+    ioptions_.row_cache->Release(row_handle);
+  }
+#endif  // ROCKSDB_LITE
+
   return s;
 }
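
The row cache introduced above keys each cached lookup by a cache-wide id,
the SST file number, a sequence number (0 unless reading from a snapshot,
in which case the sequence number plus one is used), and the user key. A
rough standalone sketch of that key layout, with a simple varint encoder in
the spirit of util/coding.h standing in for the real helpers:

    #include <cstdint>
    #include <iostream>
    #include <string>

    // Minimal varint64 encoder; the real one lives in util/coding.h.
    void AppendVarint64(std::string* dst, uint64_t v) {
      while (v >= 0x80) {
        dst->push_back(static_cast<char>((v & 0x7f) | 0x80));
        v >>= 7;
      }
      dst->push_back(static_cast<char>(v));
    }

    // Compose a row-cache key: cache id | file number | seq no | user key.
    std::string MakeRowCacheKey(const std::string& cache_id,
                                uint64_t file_number, uint64_t seq_no,
                                const std::string& user_key) {
      std::string key = cache_id;
      AppendVarint64(&key, file_number);
      AppendVarint64(&key, seq_no);  // 0 for non-snapshot reads
      key.append(user_key);
      return key;
    }

    int main() {
      std::string k = MakeRowCacheKey("cf7-", 42, 0, "user_key_1");
      std::cout << "key bytes: " << k.size() << "\n";
      return 0;
    }

Keying on the user key rather than the full internal key is what keeps the
cache stable as sequence numbers grow; only snapshot reads fold the
sequence number into the key.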
 
diff --git a/src/rocksdb/db/table_cache.h b/src/rocksdb/db/table_cache.h
index 76bb1c0..d9ae013 100644
--- a/src/rocksdb/db/table_cache.h
+++ b/src/rocksdb/db/table_cache.h
@@ -28,6 +28,7 @@ class Env;
 class Arena;
 struct FileDescriptor;
 class GetContext;
+class HistogramImpl;
 
 class TableCache {
  public:
@@ -46,6 +47,7 @@ class TableCache {
                         const InternalKeyComparator& internal_comparator,
                         const FileDescriptor& file_fd,
                         TableReader** table_reader_ptr = nullptr,
+                        HistogramImpl* file_read_hist = nullptr,
                         bool for_compaction = false, Arena* arena = nullptr);
 
   // If a seek to internal key "k" in specified file finds an entry,
@@ -54,7 +56,7 @@ class TableCache {
   Status Get(const ReadOptions& options,
              const InternalKeyComparator& internal_comparator,
              const FileDescriptor& file_fd, const Slice& k,
-             GetContext* get_context);
+             GetContext* get_context, HistogramImpl* file_read_hist = nullptr);
 
   // Evict any entry for the specified file number
   static void Evict(Cache* cache, uint64_t file_number);
@@ -63,7 +65,8 @@ class TableCache {
   Status FindTable(const EnvOptions& toptions,
                    const InternalKeyComparator& internal_comparator,
                    const FileDescriptor& file_fd, Cache::Handle**,
-                   const bool no_io = false);
+                   const bool no_io = false, bool record_read_stats = true,
+                   HistogramImpl* file_read_hist = nullptr);
 
   // Get TableReader from a cache handle.
   TableReader* GetTableReaderFromHandle(Cache::Handle* handle);
@@ -81,7 +84,7 @@ class TableCache {
                             bool no_io = false);
 
   // Return total memory usage of the table reader of the file.
-  // 0 of table reader of the file is not loaded.
+  // 0 if table reader of the file is not loaded.
   size_t GetMemoryUsageByTableReader(
       const EnvOptions& toptions,
       const InternalKeyComparator& internal_comparator,
@@ -91,9 +94,17 @@ class TableCache {
   void ReleaseHandle(Cache::Handle* handle);
 
  private:
+  // Build a table reader
+  Status GetTableReader(const EnvOptions& env_options,
+                        const InternalKeyComparator& internal_comparator,
+                        const FileDescriptor& fd, bool sequential_mode,
+                        bool record_read_stats, HistogramImpl* file_read_hist,
+                        unique_ptr<TableReader>* table_reader);
+
   const ImmutableCFOptions& ioptions_;
   const EnvOptions& env_options_;
   Cache* const cache_;
+  std::string row_cache_id_;
 };
 
 }  // namespace rocksdb
diff --git a/src/rocksdb/db/table_properties_collector.cc b/src/rocksdb/db/table_properties_collector.cc
index 2e0a679..c14ecec 100644
--- a/src/rocksdb/db/table_properties_collector.cc
+++ b/src/rocksdb/db/table_properties_collector.cc
@@ -19,7 +19,9 @@ Status InternalKeyPropertiesCollector::InternalAdd(const Slice& key,
     return Status::InvalidArgument("Invalid internal key");
   }
 
-  if (ikey.type == ValueType::kTypeDeletion) {
+  // Note: We count both deletions and single deletions here.
+  if (ikey.type == ValueType::kTypeDeletion ||
+      ikey.type == ValueType::kTypeSingleDeletion) {
     ++deleted_keys_;
   }
 
@@ -47,18 +49,22 @@ InternalKeyPropertiesCollector::GetReadableProperties() const {
 }
 
 namespace {
+
 EntryType GetEntryType(ValueType value_type) {
   switch (value_type) {
     case kTypeValue:
       return kEntryPut;
     case kTypeDeletion:
       return kEntryDelete;
+    case kTypeSingleDeletion:
+      return kEntrySingleDelete;
     case kTypeMerge:
       return kEntryMerge;
     default:
       return kEntryOther;
   }
 }
+
 }  // namespace
 
 Status UserKeyTablePropertiesCollector::InternalAdd(const Slice& key,
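
The collector change above counts single deletions alongside regular
deletions when tallying deleted keys. A condensed sketch of that logic,
with a trimmed-down ValueType enum standing in for the real one:

    #include <cstdint>
    #include <iostream>

    enum ValueType : uint8_t {
      kTypeValue, kTypeDeletion, kTypeSingleDeletion, kTypeMerge
    };

    struct DeletionCounter {
      uint64_t deleted_keys = 0;
      void Add(ValueType type) {
        // Both deletion flavors count toward the deleted-keys property.
        if (type == kTypeDeletion || type == kTypeSingleDeletion) {
          ++deleted_keys;
        }
      }
    };

    int main() {
      DeletionCounter c;
      for (ValueType t : {kTypeValue, kTypeDeletion, kTypeSingleDeletion}) {
        c.Add(t);
      }
      std::cout << c.deleted_keys << "\n";  // prints 2
      return 0;
    }
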
diff --git a/src/rocksdb/db/table_properties_collector.h b/src/rocksdb/db/table_properties_collector.h
index 79bf132..51c2ba9 100644
--- a/src/rocksdb/db/table_properties_collector.h
+++ b/src/rocksdb/db/table_properties_collector.h
@@ -32,9 +32,11 @@ class IntTblPropCollector {
                              uint64_t file_size) = 0;
 
   virtual UserCollectedProperties GetReadableProperties() const = 0;
+
+  virtual bool NeedCompact() const { return false; }
 };
 
-// Facrtory for internal table properties collector.
+// Factory for internal table properties collector.
 class IntTblPropCollectorFactory {
  public:
   virtual ~IntTblPropCollectorFactory() {}
@@ -98,6 +100,10 @@ class UserKeyTablePropertiesCollector : public IntTblPropCollector {
 
   UserCollectedProperties GetReadableProperties() const override;
 
+  virtual bool NeedCompact() const override {
+    return collector_->NeedCompact();
+  }
+
  protected:
   std::unique_ptr<TablePropertiesCollector> collector_;
 };
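
The NeedCompact() hook added above lets a properties collector flag a
freshly written SST file for compaction. A hypothetical collector sketch
(the heuristic and class are invented for illustration, not taken from the
patch):

    #include <cassert>
    #include <cstdint>

    // Hypothetical collector: request compaction when at least half of
    // the entries in the file are deletions.
    class DeletionRatioCollector {
     public:
      void AddDeletion() { ++deletes_; ++total_; }
      void AddPut() { ++total_; }
      bool NeedCompact() const {
        return total_ > 0 && deletes_ * 2 >= total_;
      }

     private:
      uint64_t deletes_ = 0;
      uint64_t total_ = 0;
    };

    int main() {
      DeletionRatioCollector c;
      c.AddPut();
      c.AddDeletion();
      assert(c.NeedCompact());
      return 0;
    }
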
diff --git a/src/rocksdb/db/table_properties_collector_test.cc b/src/rocksdb/db/table_properties_collector_test.cc
index 6f1a8d9..0eeed81 100644
--- a/src/rocksdb/db/table_properties_collector_test.cc
+++ b/src/rocksdb/db/table_properties_collector_test.cc
@@ -6,18 +6,20 @@
 #include <map>
 #include <memory>
 #include <string>
+#include <utility>
 #include <vector>
 
 #include "db/db_impl.h"
 #include "db/dbformat.h"
 #include "db/table_properties_collector.h"
-#include "rocksdb/table.h"
 #include "rocksdb/immutable_options.h"
+#include "rocksdb/table.h"
 #include "table/block_based_table_factory.h"
 #include "table/meta_blocks.h"
 #include "table/plain_table_factory.h"
 #include "table/table_builder.h"
 #include "util/coding.h"
+#include "util/file_reader_writer.h"
 #include "util/testharness.h"
 #include "util/testutil.h"
 
@@ -31,74 +33,17 @@ class TablePropertiesTest : public testing::Test,
   bool backward_mode_;
 };
 
-// TODO(kailiu) the following classes should be moved to some more general
-// places, so that other tests can also make use of them.
-// `FakeWritableFile` and `FakeRandomeAccessFile` bypass the real file system
-// and therefore enable us to quickly setup the tests.
-class FakeWritableFile : public WritableFile {
- public:
-  ~FakeWritableFile() { }
-
-  const std::string& contents() const { return contents_; }
-
-  virtual Status Close() override { return Status::OK(); }
-  virtual Status Flush() override { return Status::OK(); }
-  virtual Status Sync() override { return Status::OK(); }
-
-  virtual Status Append(const Slice& data) override {
-    contents_.append(data.data(), data.size());
-    return Status::OK();
-  }
-
- private:
-  std::string contents_;
-};
-
-
-class FakeRandomeAccessFile : public RandomAccessFile {
- public:
-  explicit FakeRandomeAccessFile(const Slice& contents)
-      : contents_(contents.data(), contents.size()) {
-  }
-
-  virtual ~FakeRandomeAccessFile() { }
-
-  uint64_t Size() const { return contents_.size(); }
-
-  virtual Status Read(uint64_t offset, size_t n, Slice* result,
-                      char* scratch) const override {
-    if (offset > contents_.size()) {
-      return Status::InvalidArgument("invalid Read offset");
-    }
-    if (offset + n > contents_.size()) {
-      n = contents_.size() - offset;
-    }
-    memcpy(scratch, &contents_[offset], n);
-    *result = Slice(scratch, n);
-    return Status::OK();
-  }
-
- private:
-  std::string contents_;
-};
-
-
-class DumbLogger : public Logger {
- public:
-  using Logger::Logv;
-  virtual void Logv(const char* format, va_list ap) override {}
-  virtual size_t GetLogFileSize() const override { return 0; }
-};
-
 // Utilities test functions
 namespace {
 void MakeBuilder(const Options& options, const ImmutableCFOptions& ioptions,
                  const InternalKeyComparator& internal_comparator,
                  const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
                      int_tbl_prop_collector_factories,
-                 std::unique_ptr<FakeWritableFile>* writable,
+                 std::unique_ptr<WritableFileWriter>* writable,
                  std::unique_ptr<TableBuilder>* builder) {
-  writable->reset(new FakeWritableFile);
+  unique_ptr<WritableFile> wf(new test::StringSink);
+  writable->reset(new WritableFileWriter(std::move(wf), EnvOptions()));
+
   builder->reset(NewTableBuilder(
       ioptions, internal_comparator, int_tbl_prop_collector_factories,
       writable->get(), options.compression, options.compression_opts));
@@ -114,16 +59,19 @@ class RegularKeysStartWithA: public TablePropertiesCollector {
      std::string encoded;
      std::string encoded_num_puts;
      std::string encoded_num_deletes;
+     std::string encoded_num_single_deletes;
      std::string encoded_num_size_changes;
      PutVarint32(&encoded, count_);
      PutVarint32(&encoded_num_puts, num_puts_);
      PutVarint32(&encoded_num_deletes, num_deletes_);
+     PutVarint32(&encoded_num_single_deletes, num_single_deletes_);
      PutVarint32(&encoded_num_size_changes, num_size_changes_);
      *properties = UserCollectedProperties{
          {"TablePropertiesTest", message_},
          {"Count", encoded},
          {"NumPuts", encoded_num_puts},
          {"NumDeletes", encoded_num_deletes},
+         {"NumSingleDeletes", encoded_num_single_deletes},
          {"NumSizeChanges", encoded_num_size_changes},
      };
      return Status::OK();
@@ -139,6 +87,8 @@ class RegularKeysStartWithA: public TablePropertiesCollector {
       num_puts_++;
     } else if (type == kEntryDelete) {
       num_deletes_++;
+    } else if (type == kEntrySingleDelete) {
+      num_single_deletes_++;
     }
     if (file_size < file_size_) {
       message_ = "File size should not decrease.";
@@ -158,6 +108,7 @@ class RegularKeysStartWithA: public TablePropertiesCollector {
   uint32_t count_ = 0;
   uint32_t num_puts_ = 0;
   uint32_t num_deletes_ = 0;
+  uint32_t num_single_deletes_ = 0;
   uint32_t num_size_changes_ = 0;
   uint64_t file_size_ = 0;
 };
@@ -267,29 +218,29 @@ class FlushBlockEveryThreePolicyFactory : public FlushBlockPolicyFactory {
   }
 };
 
-extern uint64_t kBlockBasedTableMagicNumber;
-extern uint64_t kPlainTableMagicNumber;
+extern const uint64_t kBlockBasedTableMagicNumber;
+extern const uint64_t kPlainTableMagicNumber;
 namespace {
 void TestCustomizedTablePropertiesCollector(
     bool backward_mode, uint64_t magic_number, bool test_int_tbl_prop_collector,
     const Options& options, const InternalKeyComparator& internal_comparator) {
-  const std::string kDeleteFlag = "D";
    // make sure the entries will be inserted in order.
-  std::map<std::string, std::string> kvs = {
-      {"About   ", "val5"},  // starts with 'A'
-      {"Abstract", "val2"},  // starts with 'A'
-      {"Around  ", "val7"},  // starts with 'A'
-      {"Beyond  ", "val3"},
-      {"Builder ", "val1"},
-      {"Love    ", kDeleteFlag},
-      {"Cancel  ", "val4"},
-      {"Find    ", "val6"},
-      {"Rocks   ", kDeleteFlag},
+  std::map<std::pair<std::string, ValueType>, std::string> kvs = {
+      {{"About   ", kTypeValue}, "val5"},  // starts with 'A'
+      {{"Abstract", kTypeValue}, "val2"},  // starts with 'A'
+      {{"Around  ", kTypeValue}, "val7"},  // starts with 'A'
+      {{"Beyond  ", kTypeValue}, "val3"},
+      {{"Builder ", kTypeValue}, "val1"},
+      {{"Love    ", kTypeDeletion}, ""},
+      {{"Cancel  ", kTypeValue}, "val4"},
+      {{"Find    ", kTypeValue}, "val6"},
+      {{"Rocks   ", kTypeDeletion}, ""},
+      {{"Foo     ", kTypeSingleDeletion}, ""},
   };
 
   // -- Step 1: build table
   std::unique_ptr<TableBuilder> builder;
-  std::unique_ptr<FakeWritableFile> writable;
+  std::unique_ptr<WritableFileWriter> writer;
   const ImmutableCFOptions ioptions(options);
   std::vector<std::unique_ptr<IntTblPropCollectorFactory>>
       int_tbl_prop_collector_factories;
@@ -300,58 +251,60 @@ void TestCustomizedTablePropertiesCollector(
     GetIntTblPropCollectorFactory(options, &int_tbl_prop_collector_factories);
   }
   MakeBuilder(options, ioptions, internal_comparator,
-              &int_tbl_prop_collector_factories, &writable, &builder);
+              &int_tbl_prop_collector_factories, &writer, &builder);
 
   SequenceNumber seqNum = 0U;
   for (const auto& kv : kvs) {
-    InternalKey ikey(kv.first, seqNum++, (kv.second != kDeleteFlag)
-                                             ? ValueType::kTypeValue
-                                             : ValueType::kTypeDeletion);
+    InternalKey ikey(kv.first.first, seqNum++, kv.first.second);
     builder->Add(ikey.Encode(), kv.second);
   }
   ASSERT_OK(builder->Finish());
+  writer->Flush();
 
   // -- Step 2: Read properties
-  FakeRandomeAccessFile readable(writable->contents());
+  test::StringSink* fwf =
+      static_cast<test::StringSink*>(writer->writable_file());
+  std::unique_ptr<RandomAccessFileReader> fake_file_reader(
+      test::GetRandomAccessFileReader(
+          new test::StringSource(fwf->contents())));
   TableProperties* props;
-  Status s = ReadTableProperties(
-      &readable,
-      writable->contents().size(),
-      magic_number,
-      Env::Default(),
-      nullptr,
-      &props
-  );
+  Status s = ReadTableProperties(fake_file_reader.get(), fwf->contents().size(),
+                                 magic_number, Env::Default(), nullptr, &props);
   std::unique_ptr<TableProperties> props_guard(props);
   ASSERT_OK(s);
 
   auto user_collected = props->user_collected_properties;
 
-  ASSERT_TRUE(user_collected.find("TablePropertiesTest") !=
-              user_collected.end());
+  ASSERT_NE(user_collected.find("TablePropertiesTest"), user_collected.end());
   ASSERT_EQ("Rocksdb", user_collected.at("TablePropertiesTest"));
 
   uint32_t starts_with_A = 0;
-  ASSERT_TRUE(user_collected.find("Count") != user_collected.end());
+  ASSERT_NE(user_collected.find("Count"), user_collected.end());
   Slice key(user_collected.at("Count"));
   ASSERT_TRUE(GetVarint32(&key, &starts_with_A));
   ASSERT_EQ(3u, starts_with_A);
 
   if (!backward_mode && !test_int_tbl_prop_collector) {
+    uint32_t num_puts;
+    ASSERT_NE(user_collected.find("NumPuts"), user_collected.end());
+    Slice key_puts(user_collected.at("NumPuts"));
+    ASSERT_TRUE(GetVarint32(&key_puts, &num_puts));
+    ASSERT_EQ(7u, num_puts);
+
     uint32_t num_deletes;
-    ASSERT_TRUE(user_collected.find("NumDeletes") != user_collected.end());
+    ASSERT_NE(user_collected.find("NumDeletes"), user_collected.end());
     Slice key_deletes(user_collected.at("NumDeletes"));
     ASSERT_TRUE(GetVarint32(&key_deletes, &num_deletes));
     ASSERT_EQ(2u, num_deletes);
 
-    uint32_t num_puts;
-    ASSERT_TRUE(user_collected.find("NumPuts") != user_collected.end());
-    Slice key_puts(user_collected.at("NumPuts"));
-    ASSERT_TRUE(GetVarint32(&key_puts, &num_puts));
-    ASSERT_EQ(7u, num_puts);
+    uint32_t num_single_deletes;
+    ASSERT_NE(user_collected.find("NumSingleDeletes"), user_collected.end());
+    Slice key_single_deletes(user_collected.at("NumSingleDeletes"));
+    ASSERT_TRUE(GetVarint32(&key_single_deletes, &num_single_deletes));
+    ASSERT_EQ(1u, num_single_deletes);
 
     uint32_t num_size_changes;
-    ASSERT_TRUE(user_collected.find("NumSizeChanges") != user_collected.end());
+    ASSERT_NE(user_collected.find("NumSizeChanges"), user_collected.end());
     Slice key_size_changes(user_collected.at("NumSizeChanges"));
     ASSERT_TRUE(GetVarint32(&key_size_changes, &num_size_changes));
     ASSERT_GE(num_size_changes, 2u);
@@ -363,10 +316,6 @@ TEST_P(TablePropertiesTest, CustomizedTablePropertiesCollector) {
   // Test properties collectors with internal keys or regular keys
   // for block based table
   for (bool encode_as_internal : { true, false }) {
-    if (!backward_mode_ && !encode_as_internal) {
-      continue;
-    }
-
     Options options;
     BlockBasedTableOptions table_options;
     table_options.flush_block_policy_factory =
@@ -383,6 +332,7 @@ TEST_P(TablePropertiesTest, CustomizedTablePropertiesCollector) {
                                            kBlockBasedTableMagicNumber,
                                            encode_as_internal, options, ikc);
 
+#ifndef ROCKSDB_LITE  // PlainTable is not supported in Lite
     // test plain table
     PlainTableOptions plain_table_options;
     plain_table_options.user_key_len = 8;
@@ -394,6 +344,7 @@ TEST_P(TablePropertiesTest, CustomizedTablePropertiesCollector) {
     TestCustomizedTablePropertiesCollector(backward_mode_,
                                            kPlainTableMagicNumber,
                                            encode_as_internal, options, ikc);
+#endif  // !ROCKSDB_LITE
   }
 }
 
@@ -409,10 +360,11 @@ void TestInternalKeyPropertiesCollector(
       InternalKey("X       ", 4, ValueType::kTypeDeletion),
       InternalKey("Y       ", 5, ValueType::kTypeDeletion),
       InternalKey("Z       ", 6, ValueType::kTypeDeletion),
+      InternalKey("a       ", 7, ValueType::kTypeSingleDeletion),
   };
 
   std::unique_ptr<TableBuilder> builder;
-  std::unique_ptr<FakeWritableFile> writable;
+  std::unique_ptr<WritableFileWriter> writable;
   Options options;
   test::PlainInternalKeyComparator pikc(options.comparator);
 
@@ -427,7 +379,7 @@ void TestInternalKeyPropertiesCollector(
     auto comparator = options.comparator;
     // HACK: Set options.info_log to avoid writing log in
     // SanitizeOptions().
-    options.info_log = std::make_shared<DumbLogger>();
+    options.info_log = std::make_shared<test::NullLogger>();
     options = SanitizeOptions("db",            // just a placeholder
                               &pikc,
                               options);
@@ -447,38 +399,49 @@ void TestInternalKeyPropertiesCollector(
     }
 
     ASSERT_OK(builder->Finish());
+    writable->Flush();
 
-    FakeRandomeAccessFile readable(writable->contents());
+    test::StringSink* fwf =
+        static_cast<test::StringSink*>(writable->writable_file());
+    unique_ptr<RandomAccessFileReader> reader(test::GetRandomAccessFileReader(
+        new test::StringSource(fwf->contents())));
     TableProperties* props;
     Status s =
-        ReadTableProperties(&readable, writable->contents().size(),
-                            magic_number, Env::Default(), nullptr, &props);
+        ReadTableProperties(reader.get(), fwf->contents().size(), magic_number,
+                            Env::Default(), nullptr, &props);
     ASSERT_OK(s);
 
     std::unique_ptr<TableProperties> props_guard(props);
     auto user_collected = props->user_collected_properties;
     uint64_t deleted = GetDeletedKeys(user_collected);
-    ASSERT_EQ(4u, deleted);
+    ASSERT_EQ(5u, deleted);  // deletes + single-deletes
 
     if (sanitized) {
       uint32_t starts_with_A = 0;
-      ASSERT_TRUE(user_collected.find("Count") != user_collected.end());
+      ASSERT_NE(user_collected.find("Count"), user_collected.end());
       Slice key(user_collected.at("Count"));
       ASSERT_TRUE(GetVarint32(&key, &starts_with_A));
       ASSERT_EQ(1u, starts_with_A);
 
       if (!backward_mode) {
+        uint32_t num_puts;
+        ASSERT_NE(user_collected.find("NumPuts"), user_collected.end());
+        Slice key_puts(user_collected.at("NumPuts"));
+        ASSERT_TRUE(GetVarint32(&key_puts, &num_puts));
+        ASSERT_EQ(3u, num_puts);
+
         uint32_t num_deletes;
-        ASSERT_TRUE(user_collected.find("NumDeletes") != user_collected.end());
+        ASSERT_NE(user_collected.find("NumDeletes"), user_collected.end());
         Slice key_deletes(user_collected.at("NumDeletes"));
         ASSERT_TRUE(GetVarint32(&key_deletes, &num_deletes));
         ASSERT_EQ(4u, num_deletes);
 
-        uint32_t num_puts;
-        ASSERT_TRUE(user_collected.find("NumPuts") != user_collected.end());
-        Slice key_puts(user_collected.at("NumPuts"));
-        ASSERT_TRUE(GetVarint32(&key_puts, &num_puts));
-        ASSERT_EQ(3u, num_puts);
+        uint32_t num_single_deletes;
+        ASSERT_NE(user_collected.find("NumSingleDeletes"),
+                  user_collected.end());
+        Slice key_single_deletes(user_collected.at("NumSingleDeletes"));
+        ASSERT_TRUE(GetVarint32(&key_single_deletes, &num_single_deletes));
+        ASSERT_EQ(1u, num_single_deletes);
       }
     }
   }
@@ -495,6 +458,7 @@ TEST_P(TablePropertiesTest, InternalKeyPropertiesCollector) {
         std::make_shared<BlockBasedTableFactory>());
   }
 
+#ifndef ROCKSDB_LITE  // PlainTable is not supported in Lite
   PlainTableOptions plain_table_options;
   plain_table_options.user_key_len = 8;
   plain_table_options.bloom_bits_per_key = 8;
@@ -503,6 +467,7 @@ TEST_P(TablePropertiesTest, InternalKeyPropertiesCollector) {
   TestInternalKeyPropertiesCollector(
       backward_mode_, kPlainTableMagicNumber, false /* not sanitize */,
       std::make_shared<PlainTableFactory>(plain_table_options));
+#endif  // !ROCKSDB_LITE
 }
 
 INSTANTIATE_TEST_CASE_P(InternalKeyPropertiesCollector, TablePropertiesTest,
diff --git a/src/rocksdb/db/transaction_log_impl.cc b/src/rocksdb/db/transaction_log_impl.cc
index b0bf6e4..23bd667 100644
--- a/src/rocksdb/db/transaction_log_impl.cc
+++ b/src/rocksdb/db/transaction_log_impl.cc
@@ -11,6 +11,7 @@
 #include <inttypes.h>
 #include "db/transaction_log_impl.h"
 #include "db/write_batch_internal.h"
+#include "util/file_reader_writer.h"
 
 namespace rocksdb {
 
@@ -40,23 +41,27 @@ TransactionLogIteratorImpl::TransactionLogIteratorImpl(
 }
 
 Status TransactionLogIteratorImpl::OpenLogFile(
-    const LogFile* logFile,
-    unique_ptr<SequentialFile>* file) {
+    const LogFile* logFile, unique_ptr<SequentialFileReader>* file_reader) {
   Env* env = options_->env;
+  unique_ptr<SequentialFile> file;
+  Status s;
   if (logFile->Type() == kArchivedLogFile) {
     std::string fname = ArchivedLogFileName(dir_, logFile->LogNumber());
-    return env->NewSequentialFile(fname, file, soptions_);
+    s = env->NewSequentialFile(fname, &file, soptions_);
   } else {
     std::string fname = LogFileName(dir_, logFile->LogNumber());
-    Status s = env->NewSequentialFile(fname, file, soptions_);
+    s = env->NewSequentialFile(fname, &file, soptions_);
     if (!s.ok()) {
       //  If we cannot open the file in the DB directory,
       //  try the archive dir, as it could have been moved in the meanwhile.
       fname = ArchivedLogFileName(dir_, logFile->LogNumber());
-      s = env->NewSequentialFile(fname, file, soptions_);
+      s = env->NewSequentialFile(fname, &file, soptions_);
     }
-    return s;
   }
+  if (s.ok()) {
+    file_reader->reset(new SequentialFileReader(std::move(file)));
+  }
+  return s;
 }
 
 BatchResult TransactionLogIteratorImpl::GetBatch()  {
@@ -251,7 +256,7 @@ void TransactionLogIteratorImpl::UpdateCurrentWriteBatch(const Slice& record) {
 }
 
 Status TransactionLogIteratorImpl::OpenLogReader(const LogFile* logFile) {
-  unique_ptr<SequentialFile> file;
+  unique_ptr<SequentialFileReader> file;
   Status s = OpenLogFile(logFile, &file);
   if (!s.ok()) {
     return s;
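
The refactor above funnels both open paths through one status and only
wraps the raw SequentialFile in a SequentialFileReader when an open
succeeded. The same open-with-fallback shape, reduced to a generic sketch
over std::ifstream (file names hypothetical):

    #include <fstream>
    #include <memory>
    #include <string>

    // Try the live WAL path first; if that fails, the file may have been
    // archived in the meanwhile, so retry under the archive directory.
    std::unique_ptr<std::ifstream> OpenLog(const std::string& primary,
                                           const std::string& archive) {
      auto f = std::make_unique<std::ifstream>(primary);
      if (!f->is_open()) {
        f = std::make_unique<std::ifstream>(archive);
      }
      if (!f->is_open()) {
        return nullptr;  // caller sees a single failure result
      }
      return f;
    }

    int main() {
      auto log = OpenLog("wal/000042.log", "archive/000042.log");
      return log ? 0 : 1;
    }
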
diff --git a/src/rocksdb/db/transaction_log_impl.h b/src/rocksdb/db/transaction_log_impl.h
index af06154..f89cc32 100644
--- a/src/rocksdb/db/transaction_log_impl.h
+++ b/src/rocksdb/db/transaction_log_impl.h
@@ -14,6 +14,7 @@
 #include "db/version_set.h"
 #include "db/log_reader.h"
 #include "db/filename.h"
+#include "port/port.h"
 
 namespace rocksdb {
 
@@ -83,13 +84,15 @@ class TransactionLogIteratorImpl : public TransactionLogIterator {
   size_t currentFileIndex_;
   std::unique_ptr<WriteBatch> currentBatch_;
   unique_ptr<log::Reader> currentLogReader_;
-  Status OpenLogFile(const LogFile* logFile, unique_ptr<SequentialFile>* file);
+  Status OpenLogFile(const LogFile* logFile,
+                     unique_ptr<SequentialFileReader>* file);
 
   struct LogReporter : public log::Reader::Reporter {
     Env* env;
     Logger* info_log;
     virtual void Corruption(size_t bytes, const Status& s) override {
-      Log(InfoLogLevel::ERROR_LEVEL, info_log, "dropping %zu bytes; %s", bytes,
+      Log(InfoLogLevel::ERROR_LEVEL, info_log,
+          "dropping %" ROCKSDB_PRIszt " bytes; %s", bytes,
           s.ToString().c_str());
     }
     virtual void Info(const char* s) {
diff --git a/src/rocksdb/db/version_builder.cc b/src/rocksdb/db/version_builder.cc
index c010ee4..7444bfc 100644
--- a/src/rocksdb/db/version_builder.cc
+++ b/src/rocksdb/db/version_builder.cc
@@ -15,12 +15,16 @@
 
 #include <inttypes.h>
 #include <algorithm>
+#include <atomic>
 #include <set>
+#include <thread>
 #include <unordered_map>
 #include <unordered_set>
+#include <utility>
 #include <vector>
 
 #include "db/dbformat.h"
+#include "db/internal_stats.h"
 #include "db/table_cache.h"
 #include "db/version_set.h"
 #include "table/table_reader.h"
@@ -278,21 +282,52 @@ class VersionBuilder::Rep {
     CheckConsistency(vstorage);
   }
 
-  void LoadTableHandlers() {
+  void LoadTableHandlers(InternalStats* internal_stats, int max_threads) {
     assert(table_cache_ != nullptr);
+    // <file metadata, level>
+    std::vector<std::pair<FileMetaData*, int>> files_meta;
     for (int level = 0; level < base_vstorage_->num_levels(); level++) {
       for (auto& file_meta_pair : levels_[level].added_files) {
         auto* file_meta = file_meta_pair.second;
         assert(!file_meta->table_reader_handle);
-        table_cache_->FindTable(
-            env_options_, *(base_vstorage_->InternalComparator()),
-            file_meta->fd, &file_meta->table_reader_handle, false);
+        files_meta.emplace_back(file_meta, level);
+      }
+    }
+
+    std::atomic<size_t> next_file_meta_idx(0);
+    std::function<void()> load_handlers_func = [&]() {
+      while (true) {
+        size_t file_idx = next_file_meta_idx.fetch_add(1);
+        if (file_idx >= files_meta.size()) {
+          break;
+        }
+
+        auto* file_meta = files_meta[file_idx].first;
+        int level = files_meta[file_idx].second;
+        table_cache_->FindTable(env_options_,
+                                *(base_vstorage_->InternalComparator()),
+                                file_meta->fd, &file_meta->table_reader_handle,
+                                false /* no_io */, true /* record_read_stats */,
+                                internal_stats->GetFileReadHist(level));
         if (file_meta->table_reader_handle != nullptr) {
           // Load table_reader
           file_meta->fd.table_reader = table_cache_->GetTableReaderFromHandle(
               file_meta->table_reader_handle);
         }
       }
+    };
+
+    if (max_threads <= 1) {
+      load_handlers_func();
+    } else {
+      std::vector<std::thread> threads;
+      for (int i = 0; i < max_threads; i++) {
+        threads.emplace_back(load_handlers_func);
+      }
+
+      for (auto& t : threads) {
+        t.join();
+      }
     }
   }
 
@@ -321,7 +356,10 @@ void VersionBuilder::Apply(VersionEdit* edit) { rep_->Apply(edit); }
 void VersionBuilder::SaveTo(VersionStorageInfo* vstorage) {
   rep_->SaveTo(vstorage);
 }
-void VersionBuilder::LoadTableHandlers() { rep_->LoadTableHandlers(); }
+void VersionBuilder::LoadTableHandlers(InternalStats* internal_stats,
+                                       int max_threads) {
+  rep_->LoadTableHandlers(internal_stats, max_threads);
+}
 void VersionBuilder::MaybeAddFile(VersionStorageInfo* vstorage, int level,
                                   FileMetaData* f) {
   rep_->MaybeAddFile(vstorage, level, f);
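
LoadTableHandlers above parallelizes table-reader loading with a shared
atomic index: each of max_threads workers claims the next file with
fetch_add instead of pre-partitioning the list, so fast threads naturally
pick up more work. A self-contained sketch of that pattern (the loaded
"files" are stand-ins for FindTable calls):

    #include <atomic>
    #include <iostream>
    #include <thread>
    #include <vector>

    int main() {
      std::vector<int> files(100);
      std::atomic<size_t> next_idx(0);
      std::atomic<int> loaded(0);

      // Each worker repeatedly claims the next unprocessed index until
      // the list is exhausted; no partitioning and no lock required.
      auto worker = [&]() {
        while (true) {
          size_t i = next_idx.fetch_add(1);
          if (i >= files.size()) break;
          loaded.fetch_add(1);  // stand-in for table_cache_->FindTable(...)
        }
      };

      const int max_threads = 4;
      std::vector<std::thread> threads;
      for (int i = 0; i < max_threads; i++) threads.emplace_back(worker);
      for (auto& t : threads) t.join();
      std::cout << loaded.load() << "\n";  // prints 100
      return 0;
    }
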
diff --git a/src/rocksdb/db/version_builder.h b/src/rocksdb/db/version_builder.h
index 452604f..c7ef279 100644
--- a/src/rocksdb/db/version_builder.h
+++ b/src/rocksdb/db/version_builder.h
@@ -16,6 +16,7 @@ class TableCache;
 class VersionStorageInfo;
 class VersionEdit;
 struct FileMetaData;
+class InternalStats;
 
 // A helper class so we can efficiently apply a whole sequence
 // of edits to a particular state without creating intermediate
@@ -30,7 +31,7 @@ class VersionBuilder {
                                   int level);
   void Apply(VersionEdit* edit);
   void SaveTo(VersionStorageInfo* vstorage);
-  void LoadTableHandlers();
+  void LoadTableHandlers(InternalStats* internal_stats, int max_threads = 1);
   void MaybeAddFile(VersionStorageInfo* vstorage, int level, FileMetaData* f);
 
  private:
diff --git a/src/rocksdb/db/version_builder_test.cc b/src/rocksdb/db/version_builder_test.cc
index 099bb78..66230ee 100644
--- a/src/rocksdb/db/version_builder_test.cc
+++ b/src/rocksdb/db/version_builder_test.cc
@@ -77,11 +77,12 @@ class VersionBuilderTest : public testing::Test {
   }
 
   void UpdateVersionStorageInfo() {
-    vstorage_.UpdateFilesBySize();
+    vstorage_.UpdateFilesByCompactionPri(mutable_cf_options_);
     vstorage_.UpdateNumNonEmptyLevels();
     vstorage_.GenerateFileIndexer();
     vstorage_.GenerateLevelFilesBrief();
     vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_);
+    vstorage_.GenerateLevel0NonOverlapping();
     vstorage_.SetFinalized();
   }
 };
@@ -114,7 +115,7 @@ TEST_F(VersionBuilderTest, ApplyAndSaveTo) {
 
   VersionEdit version_edit;
   version_edit.AddFile(2, 666, 0, 100U, GetInternalKey("301"),
-                       GetInternalKey("350"), 200, 200);
+                       GetInternalKey("350"), 200, 200, false);
   version_edit.DeleteFile(3, 27U);
 
   EnvOptions env_options;
@@ -148,7 +149,7 @@ TEST_F(VersionBuilderTest, ApplyAndSaveToDynamic) {
 
   VersionEdit version_edit;
   version_edit.AddFile(3, 666, 0, 100U, GetInternalKey("301"),
-                       GetInternalKey("350"), 200, 200);
+                       GetInternalKey("350"), 200, 200, false);
   version_edit.DeleteFile(0, 1U);
   version_edit.DeleteFile(0, 88U);
 
@@ -185,7 +186,7 @@ TEST_F(VersionBuilderTest, ApplyAndSaveToDynamic2) {
 
   VersionEdit version_edit;
   version_edit.AddFile(4, 666, 0, 100U, GetInternalKey("301"),
-                       GetInternalKey("350"), 200, 200);
+                       GetInternalKey("350"), 200, 200, false);
   version_edit.DeleteFile(0, 1U);
   version_edit.DeleteFile(0, 88U);
   version_edit.DeleteFile(4, 6U);
@@ -213,15 +214,15 @@ TEST_F(VersionBuilderTest, ApplyMultipleAndSaveTo) {
 
   VersionEdit version_edit;
   version_edit.AddFile(2, 666, 0, 100U, GetInternalKey("301"),
-                       GetInternalKey("350"), 200, 200);
+                       GetInternalKey("350"), 200, 200, false);
   version_edit.AddFile(2, 676, 0, 100U, GetInternalKey("401"),
-                       GetInternalKey("450"), 200, 200);
+                       GetInternalKey("450"), 200, 200, false);
   version_edit.AddFile(2, 636, 0, 100U, GetInternalKey("601"),
-                       GetInternalKey("650"), 200, 200);
+                       GetInternalKey("650"), 200, 200, false);
   version_edit.AddFile(2, 616, 0, 100U, GetInternalKey("501"),
-                       GetInternalKey("550"), 200, 200);
+                       GetInternalKey("550"), 200, 200, false);
   version_edit.AddFile(2, 606, 0, 100U, GetInternalKey("701"),
-                       GetInternalKey("750"), 200, 200);
+                       GetInternalKey("750"), 200, 200, false);
 
   EnvOptions env_options;
 
@@ -247,24 +248,24 @@ TEST_F(VersionBuilderTest, ApplyDeleteAndSaveTo) {
 
   VersionEdit version_edit;
   version_edit.AddFile(2, 666, 0, 100U, GetInternalKey("301"),
-                       GetInternalKey("350"), 200, 200);
+                       GetInternalKey("350"), 200, 200, false);
   version_edit.AddFile(2, 676, 0, 100U, GetInternalKey("401"),
-                       GetInternalKey("450"), 200, 200);
+                       GetInternalKey("450"), 200, 200, false);
   version_edit.AddFile(2, 636, 0, 100U, GetInternalKey("601"),
-                       GetInternalKey("650"), 200, 200);
+                       GetInternalKey("650"), 200, 200, false);
   version_edit.AddFile(2, 616, 0, 100U, GetInternalKey("501"),
-                       GetInternalKey("550"), 200, 200);
+                       GetInternalKey("550"), 200, 200, false);
   version_edit.AddFile(2, 606, 0, 100U, GetInternalKey("701"),
-                       GetInternalKey("750"), 200, 200);
+                       GetInternalKey("750"), 200, 200, false);
   version_builder.Apply(&version_edit);
 
   VersionEdit version_edit2;
   version_edit.AddFile(2, 808, 0, 100U, GetInternalKey("901"),
-                       GetInternalKey("950"), 200, 200);
+                       GetInternalKey("950"), 200, 200, false);
   version_edit2.DeleteFile(2, 616);
   version_edit2.DeleteFile(2, 636);
   version_edit.AddFile(2, 806, 0, 100U, GetInternalKey("801"),
-                       GetInternalKey("850"), 200, 200);
+                       GetInternalKey("850"), 200, 200, false);
   version_builder.Apply(&version_edit2);
 
   version_builder.SaveTo(&new_vstorage);
diff --git a/src/rocksdb/db/version_edit.cc b/src/rocksdb/db/version_edit.cc
index f7b2888..0c9efe4 100644
--- a/src/rocksdb/db/version_edit.cc
+++ b/src/rocksdb/db/version_edit.cc
@@ -11,6 +11,7 @@
 
 #include "db/version_set.h"
 #include "util/coding.h"
+#include "util/event_logger.h"
 #include "rocksdb/slice.h"
 
 namespace rocksdb {
@@ -359,7 +360,7 @@ std::string VersionEdit::DebugString(bool hex_key) const {
     AppendNumberTo(&r, prev_log_number_);
   }
   if (has_next_file_number_) {
-    r.append("\n  NextFile: ");
+    r.append("\n  NextFileNumber: ");
     AppendNumberTo(&r, next_file_number_);
   }
   if (has_last_sequence_) {
@@ -404,4 +405,75 @@ std::string VersionEdit::DebugString(bool hex_key) const {
   return r;
 }
 
+std::string VersionEdit::DebugJSON(int edit_num, bool hex_key) const {
+  JSONWriter jw;
+  jw << "EditNumber" << edit_num;
+
+  if (has_comparator_) {
+    jw << "Comparator" << comparator_;
+  }
+  if (has_log_number_) {
+    jw << "LogNumber" << log_number_;
+  }
+  if (has_prev_log_number_) {
+    jw << "PrevLogNumber" << prev_log_number_;
+  }
+  if (has_next_file_number_) {
+    jw << "NextFileNumber" << next_file_number_;
+  }
+  if (has_last_sequence_) {
+    jw << "LastSeq" << last_sequence_;
+  }
+
+  if (!deleted_files_.empty()) {
+    jw << "DeletedFiles";
+    jw.StartArray();
+
+    for (DeletedFileSet::const_iterator iter = deleted_files_.begin();
+         iter != deleted_files_.end();
+         ++iter) {
+      jw.StartArrayedObject();
+      jw << "Level" << iter->first;
+      jw << "FileNumber" << iter->second;
+      jw.EndArrayedObject();
+    }
+
+    jw.EndArray();
+  }
+
+  if (!new_files_.empty()) {
+    jw << "AddedFiles";
+    jw.StartArray();
+
+    for (size_t i = 0; i < new_files_.size(); i++) {
+      jw.StartArrayedObject();
+      jw << "Level" << new_files_[i].first;
+      const FileMetaData& f = new_files_[i].second;
+      jw << "FileNumber" << f.fd.GetNumber();
+      jw << "FileSize" << f.fd.GetFileSize();
+      jw << "SmallestIKey" << f.smallest.DebugString(hex_key);
+      jw << "LargestIKey" << f.largest.DebugString(hex_key);
+      jw.EndArrayedObject();
+    }
+
+    jw.EndArray();
+  }
+
+  jw << "ColumnFamily" << column_family_;
+
+  if (is_column_family_add_) {
+    jw << "ColumnFamilyAdd" << column_family_name_;
+  }
+  if (is_column_family_drop_) {
+    jw << "ColumnFamilyDrop" << column_family_name_;
+  }
+  if (has_max_column_family_) {
+    jw << "MaxColumnFamily" << max_column_family_;
+  }
+
+  jw.EndObject();
+
+  return jw.Get();
+}
+
 }  // namespace rocksdb
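
For a sense of what DebugJSON emits: a single-file edit would serialize to
roughly the following shape (illustrative output sketched from the writer
calls above, not captured from a real run):

    {"EditNumber": 1, "LogNumber": 7, "NextFileNumber": 667, "LastSeq": 200,
     "AddedFiles": [{"Level": 2, "FileNumber": 666, "FileSize": 100,
                     "SmallestIKey": "'301'", "LargestIKey": "'350'"}],
     "ColumnFamily": 0}
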
diff --git a/src/rocksdb/db/version_edit.h b/src/rocksdb/db/version_edit.h
index 6da4f5b..5c55840 100644
--- a/src/rocksdb/db/version_edit.h
+++ b/src/rocksdb/db/version_edit.h
@@ -93,6 +93,8 @@ struct FileMetaData {
   FileMetaData()
       : refs(0),
         being_compacted(false),
+        smallest_seqno(kMaxSequenceNumber),
+        largest_seqno(0),
         table_reader_handle(nullptr),
         compensated_file_size(0),
         num_entries(0),
@@ -101,6 +103,17 @@ struct FileMetaData {
         raw_value_size(0),
         init_stats_from_file(false),
         marked_for_compaction(false) {}
+
+  // REQUIRED: Keys must be given to the function in sorted order (it expects
+  // the last key to be the largest).
+  void UpdateBoundaries(const Slice& key, SequenceNumber seqno) {
+    if (smallest.size() == 0) {
+      smallest.DecodeFrom(key);
+    }
+    largest.DecodeFrom(key);
+    smallest_seqno = std::min(smallest_seqno, seqno);
+    largest_seqno = std::max(largest_seqno, seqno);
+  }
 };
 
 // A compressed copy of file meta data that just contain
@@ -169,7 +182,8 @@ class VersionEdit {
   void AddFile(int level, uint64_t file, uint32_t file_path_id,
                uint64_t file_size, const InternalKey& smallest,
                const InternalKey& largest, const SequenceNumber& smallest_seqno,
-               const SequenceNumber& largest_seqno) {
+               const SequenceNumber& largest_seqno,
+               bool marked_for_compaction) {
     assert(smallest_seqno <= largest_seqno);
     FileMetaData f;
     f.fd = FileDescriptor(file, file_path_id, file_size);
@@ -177,7 +191,13 @@ class VersionEdit {
     f.largest = largest;
     f.smallest_seqno = smallest_seqno;
     f.largest_seqno = largest_seqno;
-    new_files_.push_back(std::make_pair(level, f));
+    f.marked_for_compaction = marked_for_compaction;
+    new_files_.emplace_back(level, f);
+  }
+
+  void AddFile(int level, const FileMetaData& f) {
+    assert(f.smallest_seqno <= f.largest_seqno);
+    new_files_.emplace_back(level, f);
   }
 
   // Delete the specified "file" from the specified "level".
@@ -225,6 +245,7 @@ class VersionEdit {
   }
 
   std::string DebugString(bool hex_key = false) const;
+  std::string DebugJSON(int edit_num, bool hex_key = false) const;
 
  private:
   friend class VersionSet;
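
UpdateBoundaries above relies on keys arriving in sorted order: the first
key pins the smallest bound, every subsequent key overwrites the largest,
and the sequence-number range widens monotonically. The same invariant,
stripped down to plain strings for illustration:

    #include <algorithm>
    #include <cstdint>
    #include <iostream>
    #include <string>

    struct Bounds {
      std::string smallest, largest;
      uint64_t smallest_seqno = UINT64_MAX;
      uint64_t largest_seqno = 0;

      // Requires keys in sorted order: the first key is the smallest,
      // and the last key seen is always the largest.
      void Update(const std::string& key, uint64_t seqno) {
        if (smallest.empty()) smallest = key;
        largest = key;
        smallest_seqno = std::min(smallest_seqno, seqno);
        largest_seqno = std::max(largest_seqno, seqno);
      }
    };

    int main() {
      Bounds b;
      b.Update("apple", 7);
      b.Update("zebra", 3);
      std::cout << b.smallest << ".." << b.largest << " seq "
                << b.smallest_seqno << ".." << b.largest_seqno << "\n";
      return 0;  // prints apple..zebra seq 3..7
    }
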
diff --git a/src/rocksdb/db/version_edit_test.cc b/src/rocksdb/db/version_edit_test.cc
index 8b7b31b..4186e08 100644
--- a/src/rocksdb/db/version_edit_test.cc
+++ b/src/rocksdb/db/version_edit_test.cc
@@ -34,7 +34,7 @@ TEST_F(VersionEditTest, EncodeDecode) {
     edit.AddFile(3, kBig + 300 + i, kBig32Bit + 400 + i, 0,
                  InternalKey("foo", kBig + 500 + i, kTypeValue),
                  InternalKey("zoo", kBig + 600 + i, kTypeDeletion),
-                 kBig + 500 + i, kBig + 600 + i);
+                 kBig + 500 + i, kBig + 600 + i, false);
     edit.DeleteFile(4, kBig + 700 + i);
   }
 
@@ -47,10 +47,7 @@ TEST_F(VersionEditTest, EncodeDecode) {
 
 TEST_F(VersionEditTest, EncodeEmptyFile) {
   VersionEdit edit;
-  edit.AddFile(0, 0, 0, 0,
-               InternalKey(),
-               InternalKey(),
-               0, 0);
+  edit.AddFile(0, 0, 0, 0, InternalKey(), InternalKey(), 0, 0, false);
   std::string buffer;
   ASSERT_TRUE(!edit.EncodeTo(&buffer));
 }
diff --git a/src/rocksdb/db/version_set.cc b/src/rocksdb/db/version_set.cc
index 7cf010a..91471c4 100644
--- a/src/rocksdb/db/version_set.cc
+++ b/src/rocksdb/db/version_set.cc
@@ -24,6 +24,7 @@
 #include <string>
 
 #include "db/filename.h"
+#include "db/internal_stats.h"
 #include "db/log_reader.h"
 #include "db/log_writer.h"
 #include "db/memtable.h"
@@ -42,8 +43,10 @@
 #include "table/meta_blocks.h"
 #include "table/get_context.h"
 #include "util/coding.h"
+#include "util/file_reader_writer.h"
 #include "util/logging.h"
 #include "util/stop_watch.h"
+#include "util/sync_point.h"
 
 namespace rocksdb {
 
@@ -472,13 +475,18 @@ class LevelFileNumIterator : public Iterator {
 class LevelFileIteratorState : public TwoLevelIteratorState {
  public:
   LevelFileIteratorState(TableCache* table_cache,
-    const ReadOptions& read_options, const EnvOptions& env_options,
-    const InternalKeyComparator& icomparator, bool for_compaction,
-    bool prefix_enabled)
-    : TwoLevelIteratorState(prefix_enabled),
-      table_cache_(table_cache), read_options_(read_options),
-      env_options_(env_options), icomparator_(icomparator),
-      for_compaction_(for_compaction) {}
+                         const ReadOptions& read_options,
+                         const EnvOptions& env_options,
+                         const InternalKeyComparator& icomparator,
+                         HistogramImpl* file_read_hist, bool for_compaction,
+                         bool prefix_enabled)
+      : TwoLevelIteratorState(prefix_enabled),
+        table_cache_(table_cache),
+        read_options_(read_options),
+        env_options_(env_options),
+        icomparator_(icomparator),
+        file_read_hist_(file_read_hist),
+        for_compaction_(for_compaction) {}
 
   Iterator* NewSecondaryIterator(const Slice& meta_handle) override {
     if (meta_handle.size() != sizeof(FileDescriptor)) {
@@ -489,7 +497,8 @@ class LevelFileIteratorState : public TwoLevelIteratorState {
           reinterpret_cast<const FileDescriptor*>(meta_handle.data());
       return table_cache_->NewIterator(
           read_options_, env_options_, icomparator_, *fd,
-          nullptr /* don't need reference to table*/, for_compaction_);
+          nullptr /* don't need reference to table*/, file_read_hist_,
+          for_compaction_);
     }
   }
 
@@ -502,6 +511,7 @@ class LevelFileIteratorState : public TwoLevelIteratorState {
   const ReadOptions read_options_;
   const EnvOptions& env_options_;
   const InternalKeyComparator& icomparator_;
+  HistogramImpl* file_read_hist_;
   bool for_compaction_;
 };
 
@@ -566,10 +576,12 @@ Status Version::GetTableProperties(std::shared_ptr<const TableProperties>* tp,
   TableProperties* raw_table_properties;
   // By setting the magic number to kInvalidTableMagicNumber, we can
   // bypass the magic number check in the footer.
+  std::unique_ptr<RandomAccessFileReader> file_reader(
+      new RandomAccessFileReader(std::move(file)));
   s = ReadTableProperties(
-      file.get(), file_meta->fd.GetFileSize(),
-      Footer::kInvalidTableMagicNumber /* table's magic number */,
-      vset_->env_, ioptions->info_log, &raw_table_properties);
+      file_reader.get(), file_meta->fd.GetFileSize(),
+      Footer::kInvalidTableMagicNumber /* table's magic number */, vset_->env_,
+      ioptions->info_log, &raw_table_properties);
   if (!s.ok()) {
     return s;
   }
@@ -580,26 +592,58 @@ Status Version::GetTableProperties(std::shared_ptr<const TableProperties>* tp,
 }
 
 Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props) {
+  Status s;
   for (int level = 0; level < storage_info_.num_levels_; level++) {
-    for (const auto& file_meta : storage_info_.files_[level]) {
-      auto fname =
-          TableFileName(vset_->db_options_->db_paths, file_meta->fd.GetNumber(),
-                        file_meta->fd.GetPathId());
-      // 1. If the table is already present in table cache, load table
-      // properties from there.
-      std::shared_ptr<const TableProperties> table_properties;
-      Status s = GetTableProperties(&table_properties, file_meta, &fname);
-      if (s.ok()) {
-        props->insert({fname, table_properties});
-      } else {
-        return s;
-      }
+    s = GetPropertiesOfAllTables(props, level);
+    if (!s.ok()) {
+      return s;
     }
   }
 
   return Status::OK();
 }
 
+Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props,
+                                         int level) {
+  for (const auto& file_meta : storage_info_.files_[level]) {
+    auto fname =
+        TableFileName(vset_->db_options_->db_paths, file_meta->fd.GetNumber(),
+                      file_meta->fd.GetPathId());
+    // 1. If the table is already present in table cache, load table
+    // properties from there.
+    std::shared_ptr<const TableProperties> table_properties;
+    Status s = GetTableProperties(&table_properties, file_meta, &fname);
+    if (s.ok()) {
+      props->insert({fname, table_properties});
+    } else {
+      return s;
+    }
+  }
+
+  return Status::OK();
+}
+
+Status Version::GetAggregatedTableProperties(
+    std::shared_ptr<const TableProperties>* tp, int level) {
+  TablePropertiesCollection props;
+  Status s;
+  if (level < 0) {
+    s = GetPropertiesOfAllTables(&props);
+  } else {
+    s = GetPropertiesOfAllTables(&props, level);
+  }
+  if (!s.ok()) {
+    return s;
+  }
+
+  auto* new_tp = new TableProperties();
+  for (const auto& item : props) {
+    new_tp->Add(*item.second);
+  }
+  tp->reset(new_tp);
+  return Status::OK();
+}
+
 size_t Version::GetMemoryUsageByTableReaders() {
   size_t total_usage = 0;
   for (auto& file_level : storage_info_.level_files_brief_) {
@@ -694,12 +738,14 @@ void Version::AddIterators(const ReadOptions& read_options,
     return;
   }
 
+  auto* arena = merge_iter_builder->GetArena();
+
   // Merge all level zero files together since they may overlap
   for (size_t i = 0; i < storage_info_.LevelFilesBrief(0).num_files; i++) {
     const auto& file = storage_info_.LevelFilesBrief(0).files[i];
     merge_iter_builder->AddIterator(cfd_->table_cache()->NewIterator(
         read_options, soptions, cfd_->internal_comparator(), file.fd, nullptr,
-        false, merge_iter_builder->GetArena()));
+        cfd_->internal_stats()->GetFileReadHist(0), false, arena));
   }
 
   // For levels > 0, we can use a concatenating iterator that sequentially
@@ -707,14 +753,18 @@ void Version::AddIterators(const ReadOptions& read_options,
   // lazily.
   for (int level = 1; level < storage_info_.num_non_empty_levels(); level++) {
     if (storage_info_.LevelFilesBrief(level).num_files != 0) {
-      merge_iter_builder->AddIterator(NewTwoLevelIterator(
-          new LevelFileIteratorState(
-              cfd_->table_cache(), read_options, soptions,
-              cfd_->internal_comparator(), false /* for_compaction */,
-              cfd_->ioptions()->prefix_extractor != nullptr),
-          new LevelFileNumIterator(cfd_->internal_comparator(),
-                                   &storage_info_.LevelFilesBrief(level)),
-          merge_iter_builder->GetArena()));
+      auto* mem = arena->AllocateAligned(sizeof(LevelFileIteratorState));
+      auto* state = new (mem)
+          LevelFileIteratorState(cfd_->table_cache(), read_options, soptions,
+                                 cfd_->internal_comparator(),
+                                 cfd_->internal_stats()->GetFileReadHist(level),
+                                 false /* for_compaction */,
+                                 cfd_->ioptions()->prefix_extractor != nullptr);
+      mem = arena->AllocateAligned(sizeof(LevelFileNumIterator));
+      auto* first_level_iter = new (mem) LevelFileNumIterator(
+          cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level));
+      merge_iter_builder->AddIterator(
+          NewTwoLevelIterator(state, first_level_iter, arena, false));
     }
   }
 }
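
The AddIterators change above placement-news the iterator state and the
first-level iterator straight into the merge builder's arena, so their
lifetime is tied to the arena instead of requiring individual deletes. A
bare-bones sketch of that allocation pattern with a toy bump-pointer arena
(no bounds checking; the real Arena lives in util/arena.h):

    #include <cstddef>
    #include <iostream>
    #include <new>

    // Toy bump-pointer arena: aligned allocations, all freed at once
    // when the arena itself goes away.
    class Arena {
     public:
      void* AllocateAligned(size_t n) {
        const size_t a = alignof(std::max_align_t);
        size_t pad = (a - used_ % a) % a;
        void* p = buf_ + used_ + pad;
        used_ += pad + n;
        return p;
      }

     private:
      alignas(std::max_align_t) char buf_[4096];
      size_t used_ = 0;
    };

    struct IterState {
      int level;
      explicit IterState(int l) : level(l) {}
    };

    int main() {
      Arena arena;
      void* mem = arena.AllocateAligned(sizeof(IterState));
      auto* state = new (mem) IterState(3);  // freed with the arena
      std::cout << state->level << "\n";
      state->~IterState();  // trivial here; shown for completeness
      return 0;
    }
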
@@ -732,7 +782,8 @@ VersionStorageInfo::VersionStorageInfo(
       compaction_style_(compaction_style),
       files_(new std::vector<FileMetaData*>[num_levels_]),
       base_level_(num_levels_ == 1 ? -1 : 1),
-      files_by_size_(num_levels_),
+      files_by_compaction_pri_(num_levels_),
+      level0_non_overlapping_(false),
       next_file_to_compact_by_size_(num_levels_),
       compaction_score_(num_levels_),
       compaction_level_(num_levels_),
@@ -743,6 +794,7 @@ VersionStorageInfo::VersionStorageInfo(
       accumulated_num_non_deletions_(0),
       accumulated_num_deletions_(0),
       num_samples_(0),
+      estimated_compaction_needed_bytes_(0),
       finalized_(false) {
   if (ref_vstorage != nullptr) {
     accumulated_file_size_ = ref_vstorage->accumulated_file_size_;
@@ -801,8 +853,9 @@ void Version::Get(const ReadOptions& read_options,
       user_comparator(), internal_comparator());
   FdWithKeyRange* f = fp.GetNextFile();
   while (f != nullptr) {
-    *status = table_cache_->Get(read_options, *internal_comparator(), f->fd,
-                                ikey, &get_context);
+    *status = table_cache_->Get(
+        read_options, *internal_comparator(), f->fd, ikey, &get_context,
+        cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()));
     // TODO: examine the behavior for corrupted key
     if (!status->ok()) {
       return;
@@ -864,13 +917,16 @@ void VersionStorageInfo::GenerateLevelFilesBrief() {
   }
 }
 
-void Version::PrepareApply(const MutableCFOptions& mutable_cf_options) {
-  UpdateAccumulatedStats();
+void Version::PrepareApply(
+    const MutableCFOptions& mutable_cf_options,
+    bool update_stats) {
+  UpdateAccumulatedStats(update_stats);
   storage_info_.UpdateNumNonEmptyLevels();
   storage_info_.CalculateBaseBytes(*cfd_->ioptions(), mutable_cf_options);
-  storage_info_.UpdateFilesBySize();
+  storage_info_.UpdateFilesByCompactionPri(mutable_cf_options);
   storage_info_.GenerateFileIndexer();
   storage_info_.GenerateLevelFilesBrief();
+  storage_info_.GenerateLevel0NonOverlapping();
 }
 
 bool Version::MaybeInitializeFileMetaData(FileMetaData* file_meta) {
@@ -907,42 +963,45 @@ void VersionStorageInfo::UpdateAccumulatedStats(FileMetaData* file_meta) {
   num_samples_++;
 }
 
-void Version::UpdateAccumulatedStats() {
-  // maximum number of table properties loaded from files.
-  const int kMaxInitCount = 20;
-  int init_count = 0;
-  // here only the first kMaxInitCount files which haven't been
-  // initialized from file will be updated with num_deletions.
-  // The motivation here is to cap the maximum I/O per Version creation.
-  // The reason for choosing files from lower-level instead of higher-level
-  // is that such design is able to propagate the initialization from
-  // lower-level to higher-level:  When the num_deletions of lower-level
-  // files are updated, it will make the lower-level files have accurate
-  // compensated_file_size, making lower-level to higher-level compaction
-  // will be triggered, which creates higher-level files whose num_deletions
-  // will be updated here.
-  for (int level = 0;
-       level < storage_info_.num_levels_ && init_count < kMaxInitCount;
-       ++level) {
-    for (auto* file_meta : storage_info_.files_[level]) {
-      if (MaybeInitializeFileMetaData(file_meta)) {
-        // each FileMeta will be initialized only once.
-        storage_info_.UpdateAccumulatedStats(file_meta);
-        if (++init_count >= kMaxInitCount) {
-          break;
+void Version::UpdateAccumulatedStats(bool update_stats) {
+  if (update_stats) {
+    // maximum number of table properties loaded from files.
+    const int kMaxInitCount = 20;
+    int init_count = 0;
+    // here only the first kMaxInitCount files which haven't been
+    // initialized from file will be updated with num_deletions.
+    // The motivation here is to cap the maximum I/O per Version creation.
+    // The reason for choosing files from lower levels instead of higher
+    // levels is that this design propagates the initialization from lower
+    // levels to higher levels: once the num_deletions of lower-level files
+    // are updated, those files gain accurate compensated_file_size, so
+    // lower-level to higher-level compaction is triggered, which in turn
+    // creates higher-level files whose num_deletions will be updated here
+    // as well.
+    for (int level = 0;
+         level < storage_info_.num_levels_ && init_count < kMaxInitCount;
+         ++level) {
+      for (auto* file_meta : storage_info_.files_[level]) {
+        if (MaybeInitializeFileMetaData(file_meta)) {
+          // each FileMeta will be initialized only once.
+          storage_info_.UpdateAccumulatedStats(file_meta);
+          if (++init_count >= kMaxInitCount) {
+            break;
+          }
         }
       }
     }
-  }
-  // In case all sampled-files contain only deletion entries, then we
-  // load the table-property of a file in higher-level to initialize
-  // that value.
-  for (int level = storage_info_.num_levels_ - 1;
-       storage_info_.accumulated_raw_value_size_ == 0 && level >= 0; --level) {
-    for (int i = static_cast<int>(storage_info_.files_[level].size()) - 1;
-         storage_info_.accumulated_raw_value_size_ == 0 && i >= 0; --i) {
-      if (MaybeInitializeFileMetaData(storage_info_.files_[level][i])) {
-        storage_info_.UpdateAccumulatedStats(storage_info_.files_[level][i]);
+    // In case all sampled-files contain only deletion entries, then we
+    // load the table-property of a file in higher-level to initialize
+    // that value.
+    for (int level = storage_info_.num_levels_ - 1;
+         storage_info_.accumulated_raw_value_size_ == 0 && level >= 0;
+         --level) {
+      for (int i = static_cast<int>(storage_info_.files_[level].size()) - 1;
+           storage_info_.accumulated_raw_value_size_ == 0 && i >= 0; --i) {
+        if (MaybeInitializeFileMetaData(storage_info_.files_[level][i])) {
+          storage_info_.UpdateAccumulatedStats(storage_info_.files_[level][i]);
+        }
       }
     }
   }
@@ -973,8 +1032,8 @@ void VersionStorageInfo::ComputeCompensatedSizes() {
         // shape of LSM tree.
         if (file_meta->num_deletions * 2 >= file_meta->num_entries) {
           file_meta->compensated_file_size +=
-              (file_meta->num_deletions * 2 - file_meta->num_entries)
-              * average_value_size * kDeletionWeightOnCompaction;
+              (file_meta->num_deletions * 2 - file_meta->num_entries) *
+              average_value_size * kDeletionWeightOnCompaction;
         }
       }
     }
@@ -988,6 +1047,62 @@ int VersionStorageInfo::MaxInputLevel() const {
   return 0;
 }
 
+void VersionStorageInfo::EstimateCompactionBytesNeeded(
+    const MutableCFOptions& mutable_cf_options) {
+  // Only implemented for level-based compaction
+  if (compaction_style_ != kCompactionStyleLevel) {
+    return;
+  }
+
+  // Start from level 0: if level 0 qualifies for compaction to level 1,
+  // estimate the size of that compaction.
+  // Then move on to the next level and check whether it qualifies for
+  // compaction to the level after it. The size of a level is estimated as
+  // the actual size on the level plus the input bytes from the previous
+  // level, if any. If it exceeds the level target, take the excess bytes
+  // as compaction input and add the resulting compaction size to the total.
+  // Keep doing this for level 2, 3, etc., until the last level, and return
+  // the accumulated bytes.
+
+  size_t bytes_compact_to_next_level = 0;
+  // Level 0
+  bool level0_compact_triggered = false;
+  if (static_cast<int>(files_[0].size()) >
+      mutable_cf_options.level0_file_num_compaction_trigger) {
+    level0_compact_triggered = true;
+    for (auto* f : files_[0]) {
+      bytes_compact_to_next_level += f->fd.GetFileSize();
+    }
+    estimated_compaction_needed_bytes_ = bytes_compact_to_next_level;
+  } else {
+    estimated_compaction_needed_bytes_ = 0;
+  }
+
+  // Level 1 and up.
+  for (int level = base_level(); level <= MaxInputLevel(); level++) {
+    size_t level_size = 0;
+    for (auto* f : files_[level]) {
+      level_size += f->fd.GetFileSize();
+    }
+    if (level == base_level() && level0_compact_triggered) {
+      // Add base level size to compaction if level0 compaction triggered.
+      estimated_compaction_needed_bytes_ += level_size;
+    }
+    // Add size added by previous compaction
+    level_size += bytes_compact_to_next_level;
+    bytes_compact_to_next_level = 0;
+    size_t level_target = MaxBytesForLevel(level);
+    if (level_size > level_target) {
+      bytes_compact_to_next_level = level_size - level_target;
+      // Simplify to assume the actual compaction fan-out ratio is always
+      // mutable_cf_options.max_bytes_for_level_multiplier.
+      estimated_compaction_needed_bytes_ +=
+          bytes_compact_to_next_level *
+          (1 + mutable_cf_options.max_bytes_for_level_multiplier);
+    }
+  }
+}
+
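To make the accounting above concrete, here is a minimal standalone sketch of the same estimate over a toy set of level sizes. The sizes, targets, and multiplier are hypothetical; only the arithmetic mirrors EstimateCompactionBytesNeeded().

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main() {
      const double kMultiplier = 10.0;  // stands in for max_bytes_for_level_multiplier
      std::vector<uint64_t> size = {300, 1500, 8000};     // L1..L3 actual bytes (made up)
      std::vector<uint64_t> target = {256, 2560, 25600};  // L1..L3 target bytes (made up)

      uint64_t estimated = 0;
      uint64_t carried = 0;  // bytes pushed down from the previous level
      for (size_t i = 0; i < size.size(); ++i) {
        uint64_t level_size = size[i] + carried;
        carried = 0;
        if (level_size > target[i]) {
          carried = level_size - target[i];
          // Assume each compaction's fan-out equals the level multiplier.
          estimated += static_cast<uint64_t>(carried * (1 + kMultiplier));
        }
      }
      std::printf("estimated compaction bytes: %llu\n",
                  static_cast<unsigned long long>(estimated));  // 484 here
      return 0;
    }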
 void VersionStorageInfo::ComputeCompactionScore(
     const MutableCFOptions& mutable_cf_options,
     const CompactionOptionsFIFO& compaction_options_fifo) {
@@ -1030,13 +1145,6 @@ void VersionStorageInfo::ComputeCompactionScore(
       if (compaction_style_ == kCompactionStyleFIFO) {
         score = static_cast<double>(total_size) /
                 compaction_options_fifo.max_table_files_size;
-      } else if (num_sorted_runs >=
-                 mutable_cf_options.level0_stop_writes_trigger) {
-        // If we are slowing down writes, then we better compact that first
-        score = 1000000;
-      } else if (num_sorted_runs >=
-                 mutable_cf_options.level0_slowdown_writes_trigger) {
-        score = 10000;
       } else {
         score = static_cast<double>(num_sorted_runs) /
                 mutable_cf_options.level0_file_num_compaction_trigger;
@@ -1079,11 +1187,24 @@ void VersionStorageInfo::ComputeCompactionScore(
     }
   }
   ComputeFilesMarkedForCompaction();
+  EstimateCompactionBytesNeeded(mutable_cf_options);
 }
 
 void VersionStorageInfo::ComputeFilesMarkedForCompaction() {
   files_marked_for_compaction_.clear();
-  for (int level = 0; level <= MaxInputLevel(); level++) {
+  int last_qualify_level = 0;
+
+  // Do not include files from the last level that contains data:
+  // if the table properties collector suggests a file on the last level,
+  // we should not move it to a new level.
+  for (int level = num_levels() - 1; level >= 1; level--) {
+    if (!files_[level].empty()) {
+      last_qualify_level = level - 1;
+      break;
+    }
+  }
+
+  for (int level = 0; level <= last_qualify_level; level++) {
     for (auto* f : files_[level]) {
       if (!f->being_compacted && f->marked_for_compaction) {
         files_marked_for_compaction_.emplace_back(level, f);
@@ -1106,11 +1227,9 @@ bool CompareCompensatedSizeDescending(const Fsize& first, const Fsize& second) {
   return (first.file->compensated_file_size >
       second.file->compensated_file_size);
 }
-
 } // anonymous namespace
 
 void VersionStorageInfo::AddFile(int level, FileMetaData* f) {
-  assert(level < num_levels());
   auto* level_files = &files_[level];
   // Must not overlap
   assert(level <= 0 || level_files->empty() ||
@@ -1125,9 +1244,10 @@ void VersionStorageInfo::AddFile(int level, FileMetaData* f) {
 // following functions called:
 // 1. UpdateNumNonEmptyLevels();
 // 2. CalculateBaseBytes();
-// 3. UpdateFilesBySize();
+// 3. UpdateFilesByCompactionPri();
 // 4. GenerateFileIndexer();
 // 5. GenerateLevelFilesBrief();
+// 6. GenerateLevel0NonOverlapping();
 void VersionStorageInfo::SetFinalized() {
   finalized_ = true;
 #ifndef NDEBUG
@@ -1176,7 +1296,8 @@ void VersionStorageInfo::UpdateNumNonEmptyLevels() {
   }
 }
 
-void VersionStorageInfo::UpdateFilesBySize() {
+void VersionStorageInfo::UpdateFilesByCompactionPri(
+    const MutableCFOptions& mutable_cf_options) {
   if (compaction_style_ == kCompactionStyleFIFO ||
       compaction_style_ == kCompactionStyleUniversal) {
     // don't need this
@@ -1185,8 +1306,8 @@ void VersionStorageInfo::UpdateFilesBySize() {
   // No need to sort the highest level because it is never compacted.
   for (int level = 0; level < num_levels() - 1; level++) {
     const std::vector<FileMetaData*>& files = files_[level];
-    auto& files_by_size = files_by_size_[level];
-    assert(files_by_size.size() == 0);
+    auto& files_by_compaction_pri = files_by_compaction_pri_[level];
+    assert(files_by_compaction_pri.size() == 0);
 
     // populate a temp vector for sorting based on size
     std::vector<Fsize> temp(files.size());
@@ -1200,16 +1321,55 @@ void VersionStorageInfo::UpdateFilesBySize() {
     if (num > temp.size()) {
       num = temp.size();
     }
-    std::partial_sort(temp.begin(), temp.begin() + num, temp.end(),
-                      CompareCompensatedSizeDescending);
+    switch (mutable_cf_options.compaction_pri) {
+      case kCompactionPriByCompensatedSize:
+        std::partial_sort(temp.begin(), temp.begin() + num, temp.end(),
+                          CompareCompensatedSizeDescending);
+        break;
+      case kCompactionPriByLargestSeq:
+        std::sort(temp.begin(), temp.end(),
+                  [this](const Fsize& f1, const Fsize& f2) -> bool {
+                    return f1.file->largest_seqno < f2.file->largest_seqno;
+                  });
+        break;
+      default:
+        assert(false);
+    }
     assert(temp.size() == files.size());
 
-    // initialize files_by_size_
+    // initialize files_by_compaction_pri_
     for (unsigned int i = 0; i < temp.size(); i++) {
-      files_by_size.push_back(temp[i].index);
+      files_by_compaction_pri.push_back(temp[i].index);
     }
     next_file_to_compact_by_size_[level] = 0;
-    assert(files_[level].size() == files_by_size_[level].size());
+    assert(files_[level].size() == files_by_compaction_pri_[level].size());
+  }
+}
+
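The switch above makes the candidate ordering configurable per column family. As a hedged sketch (assuming the kCompactionPriByLargestSeq enum value from this diff is visible through rocksdb/options.h), selecting oldest-data-first ordering would look like:

    #include "rocksdb/options.h"

    // Hypothetical setup; the enum name is taken from this diff.
    rocksdb::Options MakeOptions() {
      rocksdb::Options options;
      options.compaction_style = rocksdb::kCompactionStyleLevel;
      // Pick files with the smallest largest_seqno first (oldest data first).
      options.compaction_pri = rocksdb::kCompactionPriByLargestSeq;
      return options;
    }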
+void VersionStorageInfo::GenerateLevel0NonOverlapping() {
+  assert(!finalized_);
+  level0_non_overlapping_ = true;
+  if (level_files_brief_.size() == 0) {
+    return;
+  }
+
+  // A copy of L0 files sorted by smallest key
+  std::vector<FdWithKeyRange> level0_sorted_file(
+      level_files_brief_[0].files,
+      level_files_brief_[0].files + level_files_brief_[0].num_files);
+  std::sort(level0_sorted_file.begin(), level0_sorted_file.end(),
+            [this](const FdWithKeyRange& f1, const FdWithKeyRange& f2) -> bool {
+              return internal_comparator_->Compare(f1.smallest_key,
+                                                   f2.smallest_key) < 0;
+            });
+
+  for (size_t i = 1; i < level0_sorted_file.size(); ++i) {
+    FdWithKeyRange& f = level0_sorted_file[i];
+    FdWithKeyRange& prev = level0_sorted_file[i - 1];
+    if (internal_comparator_->Compare(prev.largest_key, f.smallest_key) >= 0) {
+      level0_non_overlapping_ = false;
+      break;
+    }
   }
 }
 
@@ -1239,38 +1399,6 @@ bool VersionStorageInfo::OverlapInLevel(int level,
                                largest_user_key);
 }
 
-int VersionStorageInfo::PickLevelForMemTableOutput(
-    const MutableCFOptions& mutable_cf_options, const Slice& smallest_user_key,
-    const Slice& largest_user_key) {
-  int level = 0;
-  if (!OverlapInLevel(0, &smallest_user_key, &largest_user_key)) {
-    // Push to next level if there is no overlap in next level,
-    // and the #bytes overlapping in the level after that are limited.
-    InternalKey start;
-    start.SetMaxPossibleForUserKey(smallest_user_key);
-    InternalKey limit(largest_user_key, 0, static_cast<ValueType>(0));
-    std::vector<FileMetaData*> overlaps;
-    while (mutable_cf_options.max_mem_compaction_level > 0 &&
-           level < mutable_cf_options.max_mem_compaction_level) {
-      if (OverlapInLevel(level + 1, &smallest_user_key, &largest_user_key)) {
-        break;
-      }
-      if (level + 2 >= num_levels_) {
-        level++;
-        break;
-      }
-      GetOverlappingInputs(level + 2, &start, &limit, &overlaps);
-      const uint64_t sum = TotalFileSize(overlaps);
-      if (sum > mutable_cf_options.MaxGrandParentOverlapBytes(level)) {
-        break;
-      }
-      level++;
-    }
-  }
-
-  return level;
-}
-
 // Store in "*inputs" all files in "level" that overlap [begin,end]
 // If hint_index is specified, then it points to a file in the
 // overlapping range.
@@ -1465,7 +1593,7 @@ bool VersionStorageInfo::HasOverlappingUserKey(
         files[last_file].largest_key);
     const Slice first_key_after = ExtractUserKey(
         files[last_file+1].smallest_key);
-    if (user_cmp->Compare(last_key_in_input, first_key_after) == 0) {
+    if (user_cmp->Equal(last_key_in_input, first_key_after)) {
       // The last user key in input overlaps with the next file's first key
       return true;
     }
@@ -1480,7 +1608,7 @@ bool VersionStorageInfo::HasOverlappingUserKey(
         files[first_file].smallest_key);
     const Slice& last_key_before = ExtractUserKey(
         files[first_file-1].largest_key);
-    if (user_cmp->Compare(first_key_in_input, last_key_before) == 0) {
+    if (user_cmp->Equal(first_key_in_input, last_key_before)) {
       // The first user key in input overlaps with the previous file's last key
       return true;
     }
@@ -1516,8 +1644,15 @@ const char* VersionStorageInfo::LevelSummary(
     // overwrite the last space
     --len;
   }
-  snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len,
-           "] max score %.2f", compaction_score_[0]);
+  len += snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len,
+                  "] max score %.2f", compaction_score_[0]);
+
+  if (!files_marked_for_compaction_.empty()) {
+    snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len,
+             " (%" ROCKSDB_PRIszt " files need compaction)",
+             files_marked_for_compaction_.size());
+  }
+
   return scratch->buffer;
 }
 
@@ -1682,6 +1817,41 @@ void VersionStorageInfo::CalculateBaseBytes(const ImmutableCFOptions& ioptions,
   }
 }
 
+uint64_t VersionStorageInfo::EstimateLiveDataSize() const {
+  // Estimate the live data size by adding up, for each key range, the size
+  // of the deepest file covering it. Note: the estimate depends on the
+  // ordering of files in level 0 because files in level 0 can be overlapping.
+  uint64_t size = 0;
+
+  auto ikey_lt = [this](InternalKey* x, InternalKey* y) {
+    return internal_comparator_->Compare(*x, *y) < 0;
+  };
+  // (Ordered) map of largest keys in non-overlapping files
+  std::map<InternalKey*, FileMetaData*, decltype(ikey_lt)> ranges(ikey_lt);
+
+  for (int l = num_levels_ - 1; l >= 0; l--) {
+    bool found_end = false;
+    for (auto file : files_[l]) {
+      // Find the first file where the largest key is larger than the smallest
+      // key of the current file. If this file does not overlap with the
+      // current file, none of the files in the map does. If there is
+      // no potential overlap, we can safely insert the rest of this level
+      // (if the level is not 0) into the map without checking again because
+      // the elements in the level are sorted and non-overlapping.
+      auto lb = (found_end && l != 0) ?
+        ranges.end() : ranges.lower_bound(&file->smallest);
+      found_end = (lb == ranges.end());
+      if (found_end || internal_comparator_->Compare(
+            file->largest, (*lb).second->smallest) < 0) {
+          ranges.emplace_hint(lb, &file->largest, file);
+          size += file->fd.file_size;
+      }
+    }
+  }
+  return size;
+}
+
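The ordered-map idea above generalizes to any layered set of intervals scanned from the deepest layer outward. A minimal sketch with integer keys (all names and numbers here are illustrative, not part of the diff):

    #include <cstdio>
    #include <map>
    #include <vector>

    struct Interval { int smallest, largest, size; };

    int main() {
      // layers[0] is the shallowest; the deepest layer is scanned first.
      std::vector<std::vector<Interval>> layers = {
          {{4, 5, 1}},   // shadowed by the deeper interval below
          {{4, 7, 10}},  // counted
      };
      std::map<int, Interval> ranges;  // keyed by largest key
      long long size = 0;
      for (auto it = layers.rbegin(); it != layers.rend(); ++it) {
        for (const Interval& iv : *it) {
          // First stored interval whose largest key is >= iv.smallest.
          auto lb = ranges.lower_bound(iv.smallest);
          if (lb == ranges.end() || iv.largest < lb->second.smallest) {
            ranges.emplace_hint(lb, iv.largest, iv);
            size += iv.size;
          }
        }
      }
      std::printf("live size: %lld\n", size);  // prints 10
      return 0;
    }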
 void Version::AddLiveFiles(std::vector<FileDescriptor>* live) {
   for (int level = 0; level < storage_info_.num_levels(); level++) {
     const std::vector<FileMetaData*>& files = storage_info_.files_[level];
@@ -1821,7 +1991,8 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
     if (!manifest_writers_.empty()) {
       manifest_writers_.front()->cv.Signal();
     }
-    return Status::OK();
+    // this return path is reused to also signal a column family drop
+    return Status::ShutdownInProgress();
   }
 
   std::vector<VersionEdit*> batch_edits;
@@ -1882,11 +2053,13 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
 
     mu->Unlock();
 
+    TEST_SYNC_POINT("VersionSet::LogAndApply:WriteManifest");
     if (!edit->IsColumnFamilyManipulation() &&
         db_options_->max_open_files == -1) {
       // unlimited table cache. Pre-load table handle now.
       // Need to do it out of the mutex.
-      builder_guard->version_builder()->LoadTableHandlers();
+      builder_guard->version_builder()->LoadTableHandlers(
+          column_family_data->internal_stats());
     }
 
     // This is fine because everything inside of this block is serialized --
@@ -1896,20 +2069,24 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
       Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log,
           "Creating manifest %" PRIu64 "\n", pending_manifest_file_number_);
       unique_ptr<WritableFile> descriptor_file;
+      EnvOptions opt_env_opts = env_->OptimizeForManifestWrite(env_options_);
       s = env_->NewWritableFile(
           DescriptorFileName(dbname_, pending_manifest_file_number_),
-          &descriptor_file, env_->OptimizeForManifestWrite(env_options_));
+          &descriptor_file, opt_env_opts);
       if (s.ok()) {
         descriptor_file->SetPreallocationBlockSize(
             db_options_->manifest_preallocation_size);
-        descriptor_log_.reset(new log::Writer(std::move(descriptor_file)));
+
+        unique_ptr<WritableFileWriter> file_writer(
+            new WritableFileWriter(std::move(descriptor_file), opt_env_opts));
+        descriptor_log_.reset(new log::Writer(std::move(file_writer)));
         s = WriteSnapshot(descriptor_log_.get());
       }
     }
 
     if (!edit->IsColumnFamilyManipulation()) {
      // These are cpu-heavy operations, which should be called outside the mutex.
-      v->PrepareApply(mutable_cf_options);
+      v->PrepareApply(mutable_cf_options, true);
     }
 
     // Write new record to MANIFEST log
@@ -1977,6 +2154,11 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
       new_manifest_file_size = descriptor_log_->file()->GetFileSize();
     }
 
+    if (edit->is_column_family_drop_) {
+      TEST_SYNC_POINT("VersionSet::LogAndApply::ColumnFamilyDrop:1");
+      TEST_SYNC_POINT("VersionSet::LogAndApply::ColumnFamilyDrop:2");
+    }
+
     LogFlush(db_options_->info_log);
     mu->Lock();
   }
@@ -2116,11 +2298,16 @@ Status VersionSet::Recover(
       manifest_filename.c_str());
 
   manifest_filename = dbname_ + "/" + manifest_filename;
-  unique_ptr<SequentialFile> manifest_file;
-  s = env_->NewSequentialFile(manifest_filename, &manifest_file,
-                              env_options_);
-  if (!s.ok()) {
-    return s;
+  unique_ptr<SequentialFileReader> manifest_file_reader;
+  {
+    unique_ptr<SequentialFile> manifest_file;
+    s = env_->NewSequentialFile(manifest_filename, &manifest_file,
+                                env_options_);
+    if (!s.ok()) {
+      return s;
+    }
+    manifest_file_reader.reset(
+        new SequentialFileReader(std::move(manifest_file)));
   }
   uint64_t current_manifest_file_size;
   s = env_->GetFileSize(manifest_filename, &current_manifest_file_size);
@@ -2154,8 +2341,8 @@ Status VersionSet::Recover(
   {
     VersionSet::LogReporter reporter;
     reporter.status = &s;
-    log::Reader reader(std::move(manifest_file), &reporter, true /*checksum*/,
-                       0 /*initial_offset*/);
+    log::Reader reader(std::move(manifest_file_reader), &reporter,
+                       true /*checksum*/, 0 /*initial_offset*/);
     Slice record;
     std::string scratch;
     while (reader.ReadRecord(&record, &scratch) && s.ok()) {
@@ -2326,16 +2513,18 @@ Status VersionSet::Recover(
       auto* builder = builders_iter->second->version_builder();
 
       if (db_options_->max_open_files == -1) {
-      // unlimited table cache. Pre-load table handle now.
-      // Need to do it out of the mutex.
-        builder->LoadTableHandlers();
+        // unlimited table cache. Pre-load table handle now.
+        // Need to do it out of the mutex.
+        builder->LoadTableHandlers(cfd->internal_stats(),
+                                   db_options_->max_file_opening_threads);
       }
 
       Version* v = new Version(cfd, this, current_version_number_++);
       builder->SaveTo(v->storage_info());
 
       // Install recovered version
-      v->PrepareApply(*cfd->GetLatestMutableCFOptions());
+      v->PrepareApply(*cfd->GetLatestMutableCFOptions(),
+          !(db_options_->skip_stats_update_on_db_open));
       AppendVersion(cfd, v);
     }
 
@@ -2389,18 +2578,23 @@ Status VersionSet::ListColumnFamilies(std::vector<std::string>* column_families,
   current.resize(current.size() - 1);
 
   std::string dscname = dbname + "/" + current;
+
+  unique_ptr<SequentialFileReader> file_reader;
+  {
   unique_ptr<SequentialFile> file;
   s = env->NewSequentialFile(dscname, &file, soptions);
   if (!s.ok()) {
     return s;
   }
+  file_reader.reset(new SequentialFileReader(std::move(file)));
+  }
 
   std::map<uint32_t, std::string> column_family_names;
   // default column family is always implicitly there
   column_family_names.insert({0, kDefaultColumnFamilyName});
   VersionSet::LogReporter reporter;
   reporter.status = &s;
-  log::Reader reader(std::move(file), &reporter, true /*checksum*/,
+  log::Reader reader(std::move(file_reader), &reporter, true /*checksum*/,
                      0 /*initial_offset*/);
   Slice record;
   std::string scratch;
@@ -2452,7 +2646,7 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname,
   ColumnFamilyOptions cf_options(*options);
   std::shared_ptr<Cache> tc(NewLRUCache(options->max_open_files - 10,
                                         options->table_cache_numshardbits));
-  WriteController wc;
+  WriteController wc(options->delayed_write_rate);
   WriteBuffer wb(options->db_write_buffer_size);
   VersionSet versions(dbname, options, env_options, tc.get(), &wb, &wc);
   Status status;
@@ -2524,12 +2718,17 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname,
 }
 
 Status VersionSet::DumpManifest(Options& options, std::string& dscname,
-                                bool verbose, bool hex) {
+                                bool verbose, bool hex, bool json) {
   // Open the specified manifest file.
-  unique_ptr<SequentialFile> file;
-  Status s = options.env->NewSequentialFile(dscname, &file, env_options_);
-  if (!s.ok()) {
-    return s;
+  unique_ptr<SequentialFileReader> file_reader;
+  Status s;
+  {
+    unique_ptr<SequentialFile> file;
+    s = options.env->NewSequentialFile(dscname, &file, env_options_);
+    if (!s.ok()) {
+      return s;
+    }
+    file_reader.reset(new SequentialFileReader(std::move(file)));
   }
 
   bool have_prev_log_number = false;
@@ -2553,8 +2752,8 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname,
   {
     VersionSet::LogReporter reporter;
     reporter.status = &s;
-    log::Reader reader(std::move(file), &reporter, true/*checksum*/,
-                       0/*initial_offset*/);
+    log::Reader reader(std::move(file_reader), &reporter, true /*checksum*/,
+                       0 /*initial_offset*/);
     Slice record;
     std::string scratch;
     while (reader.ReadRecord(&record, &scratch) && s.ok()) {
@@ -2565,9 +2764,10 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname,
       }
 
       // Write out each individual edit
-      if (verbose) {
-        printf("*************************Edit[%d] = %s\n",
-                count, edit.DebugString(hex).c_str());
+      if (verbose && !json) {
+        printf("%s\n", edit.DebugString(hex).c_str());
+      } else if (json) {
+        printf("%s\n", edit.DebugJSON(count, hex).c_str());
       }
       count++;
 
@@ -2647,7 +2847,7 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname,
       }
     }
   }
-  file.reset();
+  file_reader.reset();
 
   if (s.ok()) {
     if (!have_next_file) {
@@ -2674,7 +2874,7 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname,
 
       Version* v = new Version(cfd, this, current_version_number_++);
       builder->SaveTo(v->storage_info());
-      v->PrepareApply(*cfd->GetLatestMutableCFOptions());
+      v->PrepareApply(*cfd->GetLatestMutableCFOptions(), false);
 
       printf("--------------- Column family \"%s\"  (ID %u) --------------\n",
              cfd->GetName().c_str(), (unsigned int)cfd->GetID());
@@ -2762,7 +2962,8 @@ Status VersionSet::WriteSnapshot(log::Writer* log) {
              cfd->current()->storage_info()->LevelFiles(level)) {
           edit.AddFile(level, f->fd.GetNumber(), f->fd.GetPathId(),
                        f->fd.GetFileSize(), f->smallest, f->largest,
-                       f->smallest_seqno, f->largest_seqno);
+                       f->smallest_seqno, f->largest_seqno,
+                       f->marked_for_compaction);
         }
       }
       edit.SetLogNumber(cfd->GetLogNumber());
@@ -2788,17 +2989,23 @@ bool VersionSet::ManifestContains(uint64_t manifest_file_num,
   std::string fname = DescriptorFileName(dbname_, manifest_file_num);
   Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log,
       "ManifestContains: checking %s\n", fname.c_str());
-  unique_ptr<SequentialFile> file;
-  Status s = env_->NewSequentialFile(fname, &file, env_options_);
-  if (!s.ok()) {
-    Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log,
-        "ManifestContains: %s\n", s.ToString().c_str());
-    Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log,
-        "ManifestContains: is unable to reopen the manifest file  %s",
-        fname.c_str());
-    return false;
+
+  unique_ptr<SequentialFileReader> file_reader;
+  Status s;
+  {
+    unique_ptr<SequentialFile> file;
+    s = env_->NewSequentialFile(fname, &file, env_options_);
+    if (!s.ok()) {
+      Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log,
+          "ManifestContains: %s\n", s.ToString().c_str());
+      Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log,
+          "ManifestContains: unable to reopen the manifest file %s",
+          fname.c_str());
+      return false;
+    }
+    file_reader.reset(new SequentialFileReader(std::move(file)));
   }
-  log::Reader reader(std::move(file), nullptr, true/*checksum*/, 0);
+  log::Reader reader(std::move(file_reader), nullptr, true /*checksum*/, 0);
   Slice r;
   std::string scratch;
   bool result = false;
@@ -2813,15 +3020,27 @@ bool VersionSet::ManifestContains(uint64_t manifest_file_num,
   return result;
 }
 
+// TODO(aekmekji): in CompactionJob::GenSubcompactionBoundaries(), this
+// function is called repeatedly with consecutive pairs of slices. For example
+// if the slice list is [a, b, c, d] this function is called with arguments
+// (a,b) then (b,c) then (c,d). Knowing this, an optimization is possible where
+// we avoid doing binary search for the keys b and c twice and instead somehow
+// maintain state of where they first appear in the files.
 uint64_t VersionSet::ApproximateSize(Version* v, const Slice& start,
-                                     const Slice& end) {
+                                     const Slice& end, int start_level,
+                                     int end_level) {
   // pre-condition
   assert(v->cfd_->internal_comparator().Compare(start, end) <= 0);
 
   uint64_t size = 0;
   const auto* vstorage = v->storage_info();
+  end_level = end_level == -1
+                  ? vstorage->num_non_empty_levels()
+                  : std::min(end_level, vstorage->num_non_empty_levels());
+
+  assert(start_level <= end_level);
 
-  for (int level = 0; level < vstorage->num_non_empty_levels(); level++) {
+  for (int level = start_level; level < end_level; level++) {
     const LevelFilesBrief& files_brief = vstorage->LevelFilesBrief(level);
     if (!files_brief.num_files) {
       // empty level, skip exploration
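Outside the VersionSet, this estimate is what backs the public size query. A hedged usage sketch (database handle and key range are hypothetical):

    #include "rocksdb/db.h"

    uint64_t EstimateRangeSize(rocksdb::DB* db) {
      rocksdb::Range range(rocksdb::Slice("key000"), rocksdb::Slice("key999"));
      uint64_t size = 0;
      db->GetApproximateSizes(&range, 1, &size);  // default column family
      return size;
    }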
@@ -2953,6 +3172,9 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) {
   read_options.verify_checksums =
     c->mutable_cf_options()->verify_checksums_in_compaction;
   read_options.fill_cache = false;
+  if (c->ShouldFormSubcompactions()) {
+    read_options.total_order_seek = true;
+  }
 
   // Level-0 files have to be merged together.  For other levels,
   // we will make a concatenating iterator per level.
@@ -2970,14 +3192,17 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) {
           list[num++] = cfd->table_cache()->NewIterator(
               read_options, env_options_compactions_,
               cfd->internal_comparator(), flevel->files[i].fd, nullptr,
+              nullptr, /* no per level latency histogram */
               true /* for compaction */);
         }
       } else {
         // Create concatenating iterator for the files from this level
-        list[num++] = NewTwoLevelIterator(new LevelFileIteratorState(
-              cfd->table_cache(), read_options, env_options_,
-              cfd->internal_comparator(), true /* for_compaction */,
-              false /* prefix enabled */),
+        list[num++] = NewTwoLevelIterator(
+            new LevelFileIteratorState(
+                cfd->table_cache(), read_options, env_options_,
+                cfd->internal_comparator(),
+                nullptr /* no per level latency histogram */,
+                true /* for_compaction */, false /* prefix enabled */),
             new LevelFileNumIterator(cfd->internal_comparator(),
                                      c->input_levels(which)));
       }
@@ -3122,7 +3347,8 @@ ColumnFamilyData* VersionSet::CreateColumnFamily(
   AppendVersion(new_cfd, v);
   // GetLatestMutableCFOptions() is safe here without mutex since the
   // cfd is not available to client
-  new_cfd->CreateNewMemtable(*new_cfd->GetLatestMutableCFOptions());
+  new_cfd->CreateNewMemtable(*new_cfd->GetLatestMutableCFOptions(),
+                             LastSequence());
   new_cfd->SetLogNumber(edit->log_number_);
   return new_cfd;
 }
@@ -3135,4 +3361,22 @@ uint64_t VersionSet::GetNumLiveVersions(Version* dummy_versions) {
   return count;
 }
 
+uint64_t VersionSet::GetTotalSstFilesSize(Version* dummy_versions) {
+  std::unordered_set<uint64_t> unique_files;
+  uint64_t total_files_size = 0;
+  for (Version* v = dummy_versions->next_; v != dummy_versions; v = v->next_) {
+    VersionStorageInfo* storage_info = v->storage_info();
+    for (int level = 0; level < storage_info->num_levels_; level++) {
+      for (const auto& file_meta : storage_info->LevelFiles(level)) {
+        if (unique_files.find(file_meta->fd.packed_number_and_path_id) ==
+            unique_files.end()) {
+          unique_files.insert(file_meta->fd.packed_number_and_path_id);
+          total_files_size += file_meta->fd.GetFileSize();
+        }
+      }
+    }
+  }
+  return total_files_size;
+}
+
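Because versions share files, the helper above deduplicates by packed file number before summing. As a hedged sketch, assuming this total is surfaced as the "rocksdb.total-sst-files-size" integer property:

    #include <cstdio>
    #include "rocksdb/db.h"

    void PrintSstSize(rocksdb::DB* db) {
      uint64_t bytes = 0;
      if (db->GetIntProperty("rocksdb.total-sst-files-size", &bytes)) {
        std::printf("sst bytes (deduped across versions): %llu\n",
                    static_cast<unsigned long long>(bytes));
      }
    }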
 }  // namespace rocksdb
diff --git a/src/rocksdb/db/version_set.h b/src/rocksdb/db/version_set.h
index 5c5f1fc..3964600 100644
--- a/src/rocksdb/db/version_set.h
+++ b/src/rocksdb/db/version_set.h
@@ -121,6 +121,10 @@ class VersionStorageInfo {
       const MutableCFOptions& mutable_cf_options,
       const CompactionOptionsFIFO& compaction_options_fifo);
 
+  // Estimate estimated_compaction_needed_bytes_
+  void EstimateCompactionBytesNeeded(
+      const MutableCFOptions& mutable_cf_options);
+
   // This computes files_marked_for_compaction_ and is called by
   // ComputeCompactionScore()
   void ComputeFilesMarkedForCompaction();
@@ -128,8 +132,14 @@ class VersionStorageInfo {
   // Generate level_files_brief_ from files_
   void GenerateLevelFilesBrief();
   // Sort all files for this version based on their file size and
-  // record results in files_by_size_. The largest files are listed first.
-  void UpdateFilesBySize();
+  // record results in files_by_compaction_pri_. The largest files are listed
+  // first.
+  void UpdateFilesByCompactionPri(const MutableCFOptions& mutable_cf_options);
+
+  void GenerateLevel0NonOverlapping();
+  bool level0_non_overlapping() const {
+    return level0_non_overlapping_;
+  }
 
   int MaxInputLevel() const;
 
@@ -181,12 +191,6 @@ class VersionStorageInfo {
   bool HasOverlappingUserKey(const std::vector<FileMetaData*>* inputs,
                              int level);
 
-  // Return the level at which we should place a new memtable compaction
-  // result that covers the range [smallest_user_key,largest_user_key].
-  int PickLevelForMemTableOutput(const MutableCFOptions& mutable_cf_options,
-                                 const Slice& smallest_user_key,
-                                 const Slice& largest_user_key);
-
   int num_levels() const { return num_levels_; }
 
   // REQUIRES: This version has been saved (see VersionSet::SaveTo)
@@ -223,9 +227,9 @@ class VersionStorageInfo {
   }
 
   // REQUIRES: This version has been saved (see VersionSet::SaveTo)
-  const std::vector<int>& FilesBySize(int level) const {
+  const std::vector<int>& FilesByCompactionPri(int level) const {
     assert(finalized_);
-    return files_by_size_[level];
+    return files_by_compaction_pri_[level];
   }
 
   // REQUIRES: This version has been saved (see VersionSet::SaveTo)
@@ -239,7 +243,7 @@ class VersionStorageInfo {
   int base_level() const { return base_level_; }
 
   // REQUIRES: lock is held
-  // Set the index that is used to offset into files_by_size_ to find
+  // Set the index that is used to offset into files_by_compaction_pri_ to find
   // the next compaction candidate file.
   void SetNextCompactionIndex(int level, int index) {
     next_file_to_compact_by_size_[level] = index;
@@ -256,7 +260,7 @@ class VersionStorageInfo {
     return file_indexer_;
   }
 
-  // Only the first few entries of files_by_size_ are sorted.
+  // Only the first few entries of files_by_compaction_pri_ are sorted.
   // There is no need to sort all the files because it is likely
   // that on a running system, we need to look at only the first
   // few largest files because a new version is created every few
@@ -296,7 +300,8 @@ class VersionStorageInfo {
 
   uint64_t GetEstimatedActiveKeys() const;
 
-  // re-initializes the index that is used to offset into files_by_size_
+  // re-initializes the index that is used to offset into
+  // files_by_compaction_pri_
   // to find the next compaction candidate file.
   void ResetNextCompactionIndex(int level) {
     next_file_to_compact_by_size_[level] = 0;
@@ -313,6 +318,13 @@ class VersionStorageInfo {
   void CalculateBaseBytes(const ImmutableCFOptions& ioptions,
                           const MutableCFOptions& options);
 
+  // Returns an estimate of the amount of live data in bytes.
+  uint64_t EstimateLiveDataSize() const;
+
+  uint64_t estimated_compaction_needed_bytes() const {
+    return estimated_compaction_needed_bytes_;
+  }
+
  private:
   const InternalKeyComparator* internal_comparator_;
   const Comparator* user_comparator_;
@@ -341,13 +353,16 @@ class VersionStorageInfo {
   // but files in each level are now sorted based on file
   // size. The file with the largest size is at the front.
   // This vector stores the index of the file from files_.
-  std::vector<std::vector<int>> files_by_size_;
+  std::vector<std::vector<int>> files_by_compaction_pri_;
+
+  // If true, files in L0 have keys with non-overlapping ranges
+  bool level0_non_overlapping_;
 
-  // An index into files_by_size_ that specifies the first
+  // An index into files_by_compaction_pri_ that specifies the first
   // file that is not yet compacted
   std::vector<int> next_file_to_compact_by_size_;
 
-  // Only the first few entries of files_by_size_ are sorted.
+  // Only the first few entries of files_by_compaction_pri_ are sorted.
   // There is no need to sort all the files because it is likely
   // that on a running system, we need to look at only the first
   // few largest files because a new version is created every few
@@ -384,6 +399,9 @@ class VersionStorageInfo {
   uint64_t accumulated_num_deletions_;
   // the number of samples
   uint64_t num_samples_;
+  // Estimated number of bytes that need to be compacted until all levels'
+  // sizes are down to their target sizes.
+  uint64_t estimated_compaction_needed_bytes_;
 
   bool finalized_;
 
@@ -412,7 +430,8 @@ class Version {
 
   // Loads some stats information from files. Call without mutex held. It needs
   // to be called before applying the version to the version set.
-  void PrepareApply(const MutableCFOptions& mutable_cf_options);
+  void PrepareApply(const MutableCFOptions& mutable_cf_options,
+                    bool update_stats);
 
   // Reference count management (so Versions do not disappear out from
   // under live iterators)
@@ -445,6 +464,14 @@ class Version {
   // tables' properties, represented as shared_ptr.
   Status GetPropertiesOfAllTables(TablePropertiesCollection* props);
 
+  Status GetPropertiesOfAllTables(TablePropertiesCollection* props, int level);
+
+  // REQUIRES: lock is held
+  // On success, "tp" will contain the table properties aggregated over all
+  // the sst files in this version.
+  Status GetAggregatedTableProperties(
+      std::shared_ptr<const TableProperties>* tp, int level = -1);
+
   uint64_t GetEstimatedActiveKeys() {
     return storage_info_.GetEstimatedActiveKeys();
   }
@@ -485,11 +512,12 @@ class Version {
 
   // Update the accumulated stats associated with the current version.
   // This accumulated stats will be used in compaction.
-  void UpdateAccumulatedStats();
+  void UpdateAccumulatedStats(bool update_stats);
 
   // Sort all files for this version based on their file size and
-  // record results in files_by_size_. The largest files are listed first.
-  void UpdateFilesBySize();
+  // record results in files_by_compaction_pri_. The largest files are listed
+  // first.
+  void UpdateFilesByCompactionPri();
 
   ColumnFamilyData* cfd_;  // ColumnFamilyData to which this Version belongs
   Logger* info_log_;
@@ -564,7 +592,7 @@ class VersionSet {
 
   // printf contents (for debugging)
   Status DumpManifest(Options& options, std::string& manifestFileName,
-                      bool verbose, bool hex = false);
+                      bool verbose, bool hex = false, bool json = false);
 
 #endif  // ROCKSDB_LITE
 
@@ -604,7 +632,9 @@ class VersionSet {
   uint64_t MinLogNumber() const {
     uint64_t min_log_num = std::numeric_limits<uint64_t>::max();
     for (auto cfd : *column_family_set_) {
-      if (min_log_num > cfd->GetLogNumber()) {
+      // It's safe to ignore dropped column families here:
+      // cfd->IsDropped() becomes true after the drop is persisted in MANIFEST.
+      if (min_log_num > cfd->GetLogNumber() && !cfd->IsDropped()) {
         min_log_num = cfd->GetLogNumber();
       }
     }
@@ -619,7 +649,10 @@ class VersionSet {
   void AddLiveFiles(std::vector<FileDescriptor>* live_list);
 
   // Return the approximate size of data to be scanned for range [start, end)
-  uint64_t ApproximateSize(Version* v, const Slice& start, const Slice& end);
+  // in levels [start_level, end_level). If end_level == -1 it will search
+  // through all non-empty levels.
+  uint64_t ApproximateSize(Version* v, const Slice& start, const Slice& end,
+                           int start_level = 0, int end_level = -1);
 
   // Return the size of the current manifest file
   uint64_t manifest_file_size() const { return manifest_file_size_; }
@@ -633,6 +666,7 @@ class VersionSet {
   Status GetMetadataForFile(uint64_t number, int* filelevel,
                             FileMetaData** metadata, ColumnFamilyData** cfd);
 
+  // This function doesn't support leveldb SST filenames
   void GetLiveFilesMetaData(std::vector<LiveFileMetaData> *metadata);
 
   void GetObsoleteFiles(std::vector<FileMetaData*>* files,
@@ -643,6 +677,8 @@ class VersionSet {
 
   static uint64_t GetNumLiveVersions(Version* dummy_versions);
 
+  static uint64_t GetTotalSstFilesSize(Version* dummy_versions);
+
  private:
   struct ManifestWriter;
 
diff --git a/src/rocksdb/db/version_set_test.cc b/src/rocksdb/db/version_set_test.cc
index 202bb1c..6e51382 100644
--- a/src/rocksdb/db/version_set_test.cc
+++ b/src/rocksdb/db/version_set_test.cc
@@ -234,6 +234,29 @@ TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicLargeLevel) {
   ASSERT_EQ(0, logger_->log_count);
 }
 
+TEST_F(VersionStorageInfoTest, EstimateLiveDataSize) {
+  // Test whether the overlaps are detected as expected
+  Add(1, 1U, "4", "7", 1U);  // Perfect overlap with last level
+  Add(2, 2U, "3", "5", 1U);  // Partial overlap with last level
+  Add(2, 3U, "6", "8", 1U);  // Partial overlap with last level
+  Add(3, 4U, "1", "9", 1U);  // Contains range of last level
+  Add(4, 5U, "4", "5", 1U);  // Inside range of last level
+  Add(4, 5U, "6", "7", 1U);  // Inside range of last level
+  Add(5, 6U, "4", "7", 10U);
+  ASSERT_EQ(10U, vstorage_.EstimateLiveDataSize());
+}
+
+TEST_F(VersionStorageInfoTest, EstimateLiveDataSize2) {
+  Add(0, 1U, "9", "9", 1U);  // Level 0 is not ordered
+  Add(0, 1U, "5", "6", 1U);  // Ignored because of [5,6] in l1
+  Add(1, 1U, "1", "2", 1U);  // Ignored because of [2,3] in l2
+  Add(1, 2U, "3", "4", 1U);  // Ignored because of [2,3] in l2
+  Add(1, 3U, "5", "6", 1U);
+  Add(2, 4U, "2", "3", 1U);
+  Add(3, 5U, "7", "8", 1U);
+  ASSERT_EQ(4U, vstorage_.EstimateLiveDataSize());
+}
+
 class FindLevelFileTest : public testing::Test {
  public:
   LevelFilesBrief file_level_;
diff --git a/src/rocksdb/db/wal_manager.cc b/src/rocksdb/db/wal_manager.cc
index 5651bae..37861ab 100644
--- a/src/rocksdb/db/wal_manager.cc
+++ b/src/rocksdb/db/wal_manager.cc
@@ -28,6 +28,7 @@
 #include "rocksdb/options.h"
 #include "rocksdb/write_batch.h"
 #include "util/coding.h"
+#include "util/file_reader_writer.h"
 #include "util/logging.h"
 #include "util/mutexlock.h"
 #include "util/sync_point.h"
@@ -58,11 +59,15 @@ Status WalManager::GetSortedWalFiles(VectorLogPtr& files) {
   files.clear();
   // list wal files in archive dir.
   std::string archivedir = ArchivalDirectory(db_options_.wal_dir);
-  if (env_->FileExists(archivedir)) {
+  Status exists = env_->FileExists(archivedir);
+  if (exists.ok()) {
     s = GetSortedWalsOfType(archivedir, files, kArchivedLogFile);
     if (!s.ok()) {
       return s;
     }
+  } else if (!exists.IsNotFound()) {
+    assert(exists.IsIOError());
+    return exists;
   }
 
   uint64_t latest_archived_log_number = 0;
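With Env::FileExists() now returning a Status instead of a bool, callers can separate a missing file from a genuine I/O failure, as the hunk above does. A small sketch of the intended pattern (directory name is hypothetical):

    #include "rocksdb/env.h"

    rocksdb::Status CheckArchiveDir(rocksdb::Env* env, const std::string& dir) {
      rocksdb::Status exists = env->FileExists(dir);
      if (exists.ok()) return rocksdb::Status::OK();          // present: proceed
      if (exists.IsNotFound()) return rocksdb::Status::OK();  // absent: treat as empty
      return exists;                                          // real I/O error: propagate
    }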
@@ -312,9 +317,9 @@ Status WalManager::GetSortedWalsOfType(const std::string& path,
       // re-try in case the alive log file has been moved to archive.
       std::string archived_file = ArchivedLogFileName(path, number);
       if (!s.ok() && log_type == kAliveLogFile &&
-          env_->FileExists(archived_file)) {
+          env_->FileExists(archived_file).ok()) {
         s = env_->GetFileSize(archived_file, &size_bytes);
-        if (!s.ok() && !env_->FileExists(archived_file)) {
+        if (!s.ok() && env_->FileExists(archived_file).IsNotFound()) {
           // oops, the file just got deleted from archived dir! move on
           s = Status::OK();
           continue;
@@ -379,7 +384,7 @@ Status WalManager::ReadFirstRecord(const WalFileType type,
   if (type == kAliveLogFile) {
     std::string fname = LogFileName(db_options_.wal_dir, number);
     s = ReadFirstLine(fname, sequence);
-    if (env_->FileExists(fname) && !s.ok()) {
+    if (env_->FileExists(fname).ok() && !s.ok()) {
     // return any error that is not caused by a non-existent file
       return s;
     }
@@ -393,7 +398,7 @@ Status WalManager::ReadFirstRecord(const WalFileType type,
     // maybe the file was deleted from archive dir. If that's the case, return
    // Status::OK(). The caller will identify this as an empty file because
     // *sequence == 0
-    if (!s.ok() && !env_->FileExists(archived_file)) {
+    if (!s.ok() && env_->FileExists(archived_file).IsNotFound()) {
       return Status::OK();
     }
   }
@@ -430,6 +435,8 @@ Status WalManager::ReadFirstLine(const std::string& fname,
 
   std::unique_ptr<SequentialFile> file;
   Status status = env_->NewSequentialFile(fname, &file, env_options_);
+  unique_ptr<SequentialFileReader> file_reader(
+      new SequentialFileReader(std::move(file)));
 
   if (!status.ok()) {
     return status;
@@ -441,7 +448,7 @@ Status WalManager::ReadFirstLine(const std::string& fname,
   reporter.fname = fname.c_str();
   reporter.status = &status;
   reporter.ignore_error = !db_options_.paranoid_checks;
-  log::Reader reader(std::move(file), &reporter, true /*checksum*/,
+  log::Reader reader(std::move(file_reader), &reporter, true /*checksum*/,
                      0 /*initial_offset*/);
   std::string scratch;
   Slice record;
diff --git a/src/rocksdb/db/wal_manager_test.cc b/src/rocksdb/db/wal_manager_test.cc
index 325f0d9..ec56c96 100644
--- a/src/rocksdb/db/wal_manager_test.cc
+++ b/src/rocksdb/db/wal_manager_test.cc
@@ -3,6 +3,8 @@
 //  LICENSE file in the root directory of this source tree. An additional grant
 //  of patent rights can be found in the PATENTS file in the same directory.
 
+#ifndef ROCKSDB_LITE
+
 #include <map>
 #include <string>
 
@@ -14,6 +16,7 @@
 #include "db/column_family.h"
 #include "db/version_set.h"
 #include "db/writebuffer.h"
+#include "util/file_reader_writer.h"
 #include "util/mock_env.h"
 #include "util/string_util.h"
 #include "util/testharness.h"
@@ -72,7 +75,9 @@ class WalManagerTest : public testing::Test {
     std::string fname = ArchivedLogFileName(dbname_, current_log_number_);
     unique_ptr<WritableFile> file;
     ASSERT_OK(env_->NewWritableFile(fname, &file, env_options_));
-    current_log_writer_.reset(new log::Writer(std::move(file)));
+    unique_ptr<WritableFileWriter> file_writer(
+        new WritableFileWriter(std::move(file), env_options_));
+    current_log_writer_.reset(new log::Writer(std::move(file_writer)));
   }
 
   void CreateArchiveLogs(int num_logs, int entries_per_log) {
@@ -120,7 +125,9 @@ TEST_F(WalManagerTest, ReadFirstRecordCache) {
   ASSERT_OK(wal_manager_->TEST_ReadFirstRecord(kAliveLogFile, 1, &s));
   ASSERT_EQ(s, 0U);
 
-  log::Writer writer(std::move(file));
+  unique_ptr<WritableFileWriter> file_writer(
+      new WritableFileWriter(std::move(file), EnvOptions()));
+  log::Writer writer(std::move(file_writer));
   WriteBatch batch;
   batch.Put("foo", "bar");
   WriteBatchInternal::SetSequence(&batch, 10);
@@ -287,3 +294,13 @@ int main(int argc, char** argv) {
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
+
+#else
+#include <stdio.h>
+
+int main(int argc, char** argv) {
+  fprintf(stderr, "SKIPPED as WalManager is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/write_batch.cc b/src/rocksdb/db/write_batch.cc
index 52956f8..53431b9 100644
--- a/src/rocksdb/db/write_batch.cc
+++ b/src/rocksdb/db/write_batch.cc
@@ -13,39 +13,58 @@
 //    data: record[count]
 // record :=
 //    kTypeValue varstring varstring
-//    kTypeMerge varstring varstring
 //    kTypeDeletion varstring
+//    kTypeSingleDeletion varstring
+//    kTypeMerge varstring varstring
 //    kTypeColumnFamilyValue varint32 varstring varstring
-//    kTypeColumnFamilyMerge varint32 varstring varstring
 //    kTypeColumnFamilyDeletion varint32 varstring varstring
+//    kTypeColumnFamilySingleDeletion varint32 varstring varstring
+//    kTypeColumnFamilyMerge varint32 varstring varstring
 // varstring :=
 //    len: varint32
 //    data: uint8[len]
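The grammar above maps directly onto the byte encoding. A standalone sketch that decodes one varstring (a varint32 length followed by that many raw bytes); this is a hand-rolled illustration, not RocksDB's own GetVarint32/GetLengthPrefixedSlice:

    #include <cstdint>
    #include <cstdio>
    #include <string>

    static bool DecodeVarstring(const uint8_t* p, size_t n, std::string* out) {
      uint32_t len = 0;
      size_t i = 0;
      bool done = false;
      for (int shift = 0; i < n && shift <= 28; shift += 7) {
        uint8_t byte = p[i++];
        len |= static_cast<uint32_t>(byte & 0x7f) << shift;
        if ((byte & 0x80) == 0) { done = true; break; }
      }
      if (!done || i + len > n) return false;  // malformed or truncated
      out->assign(reinterpret_cast<const char*>(p + i), len);
      return true;
    }

    int main() {
      const uint8_t buf[] = {0x03, 'f', 'o', 'o'};  // len=3, data="foo"
      std::string s;
      if (DecodeVarstring(buf, sizeof(buf), &s)) std::printf("%s\n", s.c_str());
      return 0;
    }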
 
 #include "rocksdb/write_batch.h"
-#include "rocksdb/merge_operator.h"
-#include "db/dbformat.h"
-#include "db/db_impl.h"
+
+#include <stack>
+#include <stdexcept>
+
 #include "db/column_family.h"
+#include "db/db_impl.h"
+#include "db/dbformat.h"
 #include "db/memtable.h"
-#include "db/snapshot.h"
+#include "db/snapshot_impl.h"
 #include "db/write_batch_internal.h"
+#include "rocksdb/merge_operator.h"
 #include "util/coding.h"
-#include "util/statistics.h"
-#include <stdexcept>
 #include "util/perf_context_imp.h"
+#include "util/statistics.h"
 
 namespace rocksdb {
 
 // WriteBatch header has an 8-byte sequence number followed by a 4-byte count.
 static const size_t kHeader = 12;
 
-WriteBatch::WriteBatch(size_t reserved_bytes) {
+struct SavePoint {
+  size_t size;  // size of rep_
+  int count;    // count of elements in rep_
+  SavePoint(size_t s, int c) : size(s), count(c) {}
+};
+
+struct SavePoints {
+  std::stack<SavePoint> stack;
+};
+
+WriteBatch::WriteBatch(size_t reserved_bytes) : save_points_(nullptr) {
   rep_.reserve((reserved_bytes > kHeader) ? reserved_bytes : kHeader);
   Clear();
 }
 
-WriteBatch::~WriteBatch() { }
+WriteBatch::~WriteBatch() {
+  if (save_points_ != nullptr) {
+    delete save_points_;
+  }
+}
 
 WriteBatch::Handler::~Handler() { }
 
@@ -61,6 +80,12 @@ bool WriteBatch::Handler::Continue() {
 void WriteBatch::Clear() {
   rep_.clear();
   rep_.resize(kHeader);
+
+  if (save_points_ != nullptr) {
+    while (!save_points_->stack.empty()) {
+      save_points_->stack.pop();
+    }
+  }
 }
 
 int WriteBatch::Count() const {
@@ -87,11 +112,13 @@ Status ReadRecordFromWriteBatch(Slice* input, char* tag,
       }
       break;
     case kTypeColumnFamilyDeletion:
+    case kTypeColumnFamilySingleDeletion:
       if (!GetVarint32(input, column_family)) {
         return Status::Corruption("bad WriteBatch Delete");
       }
     // intentional fallthrough
     case kTypeDeletion:
+    case kTypeSingleDeletion:
       if (!GetLengthPrefixedSlice(input, key)) {
         return Status::Corruption("bad WriteBatch Delete");
       }
@@ -150,6 +177,11 @@ Status WriteBatch::Iterate(Handler* handler) const {
         s = handler->DeleteCF(column_family, key);
         found++;
         break;
+      case kTypeColumnFamilySingleDeletion:
+      case kTypeSingleDeletion:
+        s = handler->SingleDeleteCF(column_family, key);
+        found++;
+        break;
       case kTypeColumnFamilyMerge:
       case kTypeMerge:
         s = handler->MergeCF(column_family, key, value);
@@ -188,6 +220,8 @@ void WriteBatchInternal::SetSequence(WriteBatch* b, SequenceNumber seq) {
   EncodeFixed64(&b->rep_[0], seq);
 }
 
+size_t WriteBatchInternal::GetFirstOffset(WriteBatch* b) { return kHeader; }
+
 void WriteBatchInternal::Put(WriteBatch* b, uint32_t column_family_id,
                              const Slice& key, const Slice& value) {
   WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
@@ -257,6 +291,40 @@ void WriteBatch::Delete(ColumnFamilyHandle* column_family,
   WriteBatchInternal::Delete(this, GetColumnFamilyID(column_family), key);
 }
 
+void WriteBatchInternal::SingleDelete(WriteBatch* b, uint32_t column_family_id,
+                                      const Slice& key) {
+  WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+  if (column_family_id == 0) {
+    b->rep_.push_back(static_cast<char>(kTypeSingleDeletion));
+  } else {
+    b->rep_.push_back(static_cast<char>(kTypeColumnFamilySingleDeletion));
+    PutVarint32(&b->rep_, column_family_id);
+  }
+  PutLengthPrefixedSlice(&b->rep_, key);
+}
+
+void WriteBatch::SingleDelete(ColumnFamilyHandle* column_family,
+                              const Slice& key) {
+  WriteBatchInternal::SingleDelete(this, GetColumnFamilyID(column_family), key);
+}
+
+void WriteBatchInternal::SingleDelete(WriteBatch* b, uint32_t column_family_id,
+                                      const SliceParts& key) {
+  WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+  if (column_family_id == 0) {
+    b->rep_.push_back(static_cast<char>(kTypeSingleDeletion));
+  } else {
+    b->rep_.push_back(static_cast<char>(kTypeColumnFamilySingleDeletion));
+    PutVarint32(&b->rep_, column_family_id);
+  }
+  PutLengthPrefixedSliceParts(&b->rep_, key);
+}
+
+void WriteBatch::SingleDelete(ColumnFamilyHandle* column_family,
+                              const SliceParts& key) {
+  WriteBatchInternal::SingleDelete(this, GetColumnFamilyID(column_family), key);
+}
+
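SingleDelete gets its own type tags so it can round-trip through the batch; it targets keys that were Put exactly once. A hedged usage sketch (key and value are made up):

    #include "rocksdb/write_batch.h"

    // Sketch: the key is written once and never overwritten or merged,
    // which is the contract implied by the dedicated SingleDelete tag.
    void WriteOnceThenRemove(rocksdb::WriteBatch* batch) {
      batch->Put("session:123", "open");
      // ... later, remove the single version of the key:
      batch->SingleDelete("session:123");
    }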
 void WriteBatchInternal::Merge(WriteBatch* b, uint32_t column_family_id,
                                const Slice& key, const Slice& value) {
   WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
@@ -275,11 +343,64 @@ void WriteBatch::Merge(ColumnFamilyHandle* column_family, const Slice& key,
   WriteBatchInternal::Merge(this, GetColumnFamilyID(column_family), key, value);
 }
 
+void WriteBatchInternal::Merge(WriteBatch* b, uint32_t column_family_id,
+                               const SliceParts& key,
+                               const SliceParts& value) {
+  WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+  if (column_family_id == 0) {
+    b->rep_.push_back(static_cast<char>(kTypeMerge));
+  } else {
+    b->rep_.push_back(static_cast<char>(kTypeColumnFamilyMerge));
+    PutVarint32(&b->rep_, column_family_id);
+  }
+  PutLengthPrefixedSliceParts(&b->rep_, key);
+  PutLengthPrefixedSliceParts(&b->rep_, value);
+}
+
+void WriteBatch::Merge(ColumnFamilyHandle* column_family,
+                       const SliceParts& key,
+                       const SliceParts& value) {
+  WriteBatchInternal::Merge(this, GetColumnFamilyID(column_family),
+                            key, value);
+}
+
 void WriteBatch::PutLogData(const Slice& blob) {
   rep_.push_back(static_cast<char>(kTypeLogData));
   PutLengthPrefixedSlice(&rep_, blob);
 }
 
+void WriteBatch::SetSavePoint() {
+  if (save_points_ == nullptr) {
+    save_points_ = new SavePoints();
+  }
+  // Record length and count of current batch of writes.
+  save_points_->stack.push(SavePoint(GetDataSize(), Count()));
+}
+
+Status WriteBatch::RollbackToSavePoint() {
+  if (save_points_ == nullptr || save_points_->stack.size() == 0) {
+    return Status::NotFound();
+  }
+
+  // Pop the most recent savepoint off the stack
+  SavePoint savepoint = save_points_->stack.top();
+  save_points_->stack.pop();
+
+  assert(savepoint.size <= rep_.size());
+
+  if (savepoint.size == rep_.size()) {
+    // No changes to rollback
+  } else if (savepoint.size == 0) {
+    // Rollback everything
+    Clear();
+  } else {
+    rep_.resize(savepoint.size);
+    WriteBatchInternal::SetCount(this, savepoint.count);
+  }
+
+  return Status::OK();
+}
+
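Savepoints nest: each SetSavePoint() pushes the batch's current size and count, and each RollbackToSavePoint() pops one level. A usage sketch:

    #include <cstdio>
    #include "rocksdb/write_batch.h"

    int main() {
      rocksdb::WriteBatch batch;
      batch.Put("a", "1");
      batch.SetSavePoint();
      batch.Put("b", "2");
      rocksdb::Status s = batch.RollbackToSavePoint();  // drops Put(b)
      std::printf("count after rollback: %d\n", batch.Count());  // 1
      s = batch.RollbackToSavePoint();  // stack is empty now
      std::printf("second rollback NotFound: %s\n", s.IsNotFound() ? "yes" : "no");
      return 0;
    }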
 namespace {
 // This class can *only* be used from a single-threaded write thread, because it
 // calls ColumnFamilyMemTablesImpl::Seek()
@@ -388,6 +509,66 @@ class MemTableInserter : public WriteBatch::Handler {
     return Status::OK();
   }
 
+  virtual Status DeleteCF(uint32_t column_family_id,
+                          const Slice& key) override {
+    Status seek_status;
+    if (!SeekToColumnFamily(column_family_id, &seek_status)) {
+      ++sequence_;
+      return seek_status;
+    }
+    MemTable* mem = cf_mems_->GetMemTable();
+    auto* moptions = mem->GetMemTableOptions();
+    if (!dont_filter_deletes_ && moptions->filter_deletes) {
+      SnapshotImpl read_from_snapshot;
+      read_from_snapshot.number_ = sequence_;
+      ReadOptions ropts;
+      ropts.snapshot = &read_from_snapshot;
+      std::string value;
+      auto cf_handle = cf_mems_->GetColumnFamilyHandle();
+      if (cf_handle == nullptr) {
+        cf_handle = db_->DefaultColumnFamily();
+      }
+      if (!db_->KeyMayExist(ropts, cf_handle, key, &value)) {
+        RecordTick(moptions->statistics, NUMBER_FILTERED_DELETES);
+        return Status::OK();
+      }
+    }
+    mem->Add(sequence_, kTypeDeletion, key, Slice());
+    sequence_++;
+    cf_mems_->CheckMemtableFull();
+    return Status::OK();
+  }
+
+  virtual Status SingleDeleteCF(uint32_t column_family_id,
+                                const Slice& key) override {
+    Status seek_status;
+    if (!SeekToColumnFamily(column_family_id, &seek_status)) {
+      ++sequence_;
+      return seek_status;
+    }
+    MemTable* mem = cf_mems_->GetMemTable();
+    auto* moptions = mem->GetMemTableOptions();
+    if (!dont_filter_deletes_ && moptions->filter_deletes) {
+      SnapshotImpl read_from_snapshot;
+      read_from_snapshot.number_ = sequence_;
+      ReadOptions ropts;
+      ropts.snapshot = &read_from_snapshot;
+      std::string value;
+      auto cf_handle = cf_mems_->GetColumnFamilyHandle();
+      if (cf_handle == nullptr) {
+        cf_handle = db_->DefaultColumnFamily();
+      }
+      if (!db_->KeyMayExist(ropts, cf_handle, key, &value)) {
+        RecordTick(moptions->statistics, NUMBER_FILTERED_DELETES);
+        return Status::OK();
+      }
+    }
+    mem->Add(sequence_, kTypeSingleDeletion, key, Slice());
+    sequence_++;
+    cf_mems_->CheckMemtableFull();
+    return Status::OK();
+  }
+
   virtual Status MergeCF(uint32_t column_family_id, const Slice& key,
                          const Slice& value) override {
     Status seek_status;
@@ -467,36 +648,6 @@ class MemTableInserter : public WriteBatch::Handler {
     cf_mems_->CheckMemtableFull();
     return Status::OK();
   }
-
-  virtual Status DeleteCF(uint32_t column_family_id,
-                          const Slice& key) override {
-    Status seek_status;
-    if (!SeekToColumnFamily(column_family_id, &seek_status)) {
-      ++sequence_;
-      return seek_status;
-    }
-    MemTable* mem = cf_mems_->GetMemTable();
-    auto* moptions = mem->GetMemTableOptions();
-    if (!dont_filter_deletes_ && moptions->filter_deletes) {
-      SnapshotImpl read_from_snapshot;
-      read_from_snapshot.number_ = sequence_;
-      ReadOptions ropts;
-      ropts.snapshot = &read_from_snapshot;
-      std::string value;
-      auto cf_handle = cf_mems_->GetColumnFamilyHandle();
-      if (cf_handle == nullptr) {
-        cf_handle = db_->DefaultColumnFamily();
-      }
-      if (!db_->KeyMayExist(ropts, cf_handle, key, &value)) {
-        RecordTick(moptions->statistics, NUMBER_FILTERED_DELETES);
-        return Status::OK();
-      }
-    }
-    mem->Add(sequence_, kTypeDeletion, key, Slice());
-    sequence_++;
-    cf_mems_->CheckMemtableFull();
-    return Status::OK();
-  }
 };
 }  // namespace
 
diff --git a/src/rocksdb/db/write_batch_base.cc b/src/rocksdb/db/write_batch_base.cc
index 5e3f5f0..9f7f00d 100644
--- a/src/rocksdb/db/write_batch_base.cc
+++ b/src/rocksdb/db/write_batch_base.cc
@@ -43,4 +43,34 @@ void WriteBatchBase::Delete(const SliceParts& key) {
   Delete(key_slice);
 }
 
+void WriteBatchBase::SingleDelete(ColumnFamilyHandle* column_family,
+                                  const SliceParts& key) {
+  std::string key_buf;
+  Slice key_slice(key, &key_buf);
+  SingleDelete(column_family, key_slice);
+}
+
+void WriteBatchBase::SingleDelete(const SliceParts& key) {
+  std::string key_buf;
+  Slice key_slice(key, &key_buf);
+  SingleDelete(key_slice);
+}
+
+void WriteBatchBase::Merge(ColumnFamilyHandle* column_family,
+                         const SliceParts& key, const SliceParts& value) {
+  std::string key_buf, value_buf;
+  Slice key_slice(key, &key_buf);
+  Slice value_slice(value, &value_buf);
+
+  Merge(column_family, key_slice, value_slice);
+}
+
+void WriteBatchBase::Merge(const SliceParts& key, const SliceParts& value) {
+  std::string key_buf, value_buf;
+  Slice key_slice(key, &key_buf);
+  Slice value_slice(value, &value_buf);
+
+  Merge(key_slice, value_slice);
+}
+
 }  // namespace rocksdb
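These overloads let callers hand over fragmented keys and values; WriteBatchBase glues the parts into one contiguous slice via a scratch buffer before delegating. A usage sketch (the fragments are hypothetical):

    #include "rocksdb/slice.h"
    #include "rocksdb/write_batch.h"

    void MergeFragmented(rocksdb::WriteBatch* batch) {
      rocksdb::Slice key_frags[2] = {rocksdb::Slice("user_"), rocksdb::Slice("42")};
      rocksdb::Slice val_frags[2] = {rocksdb::Slice("score:"), rocksdb::Slice("7")};
      rocksdb::SliceParts key(key_frags, 2);
      rocksdb::SliceParts value(val_frags, 2);
      batch->Merge(key, value);  // equivalent to Merge("user_42", "score:7")
    }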
diff --git a/src/rocksdb/db/write_batch_internal.h b/src/rocksdb/db/write_batch_internal.h
index 793c0d4..04db461 100644
--- a/src/rocksdb/db/write_batch_internal.h
+++ b/src/rocksdb/db/write_batch_internal.h
@@ -73,9 +73,18 @@ class WriteBatchInternal {
   static void Delete(WriteBatch* batch, uint32_t column_family_id,
                      const Slice& key);
 
+  static void SingleDelete(WriteBatch* batch, uint32_t column_family_id,
+                           const SliceParts& key);
+
+  static void SingleDelete(WriteBatch* batch, uint32_t column_family_id,
+                           const Slice& key);
+
   static void Merge(WriteBatch* batch, uint32_t column_family_id,
                     const Slice& key, const Slice& value);
 
+  static void Merge(WriteBatch* batch, uint32_t column_family_id,
+                    const SliceParts& key, const SliceParts& value);
+
   // Return the number of entries in the batch.
   static int Count(const WriteBatch* batch);
 
@@ -89,6 +98,10 @@ class WriteBatchInternal {
   // this batch.
   static void SetSequence(WriteBatch* batch, SequenceNumber seq);
 
+  // Returns the offset of the first entry in the batch.
+  // This offset is only valid if the batch is not empty.
+  static size_t GetFirstOffset(WriteBatch* batch);
+
   static Slice Contents(const WriteBatch* batch) {
     return Slice(batch->rep_);
   }
diff --git a/src/rocksdb/db/write_batch_test.cc b/src/rocksdb/db/write_batch_test.cc
index 649fb89..d8c6f8c 100644
--- a/src/rocksdb/db/write_batch_test.cc
+++ b/src/rocksdb/db/write_batch_test.cc
@@ -31,8 +31,9 @@ static std::string PrintContents(WriteBatch* b) {
   options.memtable_factory = factory;
   ImmutableCFOptions ioptions(options);
   WriteBuffer wb(options.db_write_buffer_size);
-  MemTable* mem = new MemTable(cmp, ioptions,
-                               MutableCFOptions(options, ioptions), &wb);
+  MemTable* mem =
+      new MemTable(cmp, ioptions, MutableCFOptions(options, ioptions), &wb,
+                   kMaxSequenceNumber);
   mem->Ref();
   std::string state;
   ColumnFamilyMemTablesDefault cf_mems_default(mem);
@@ -53,20 +54,26 @@ static std::string PrintContents(WriteBatch* b) {
         state.append(")");
         count++;
         break;
-      case kTypeMerge:
-        state.append("Merge(");
+      case kTypeDeletion:
+        state.append("Delete(");
         state.append(ikey.user_key.ToString());
-        state.append(", ");
-        state.append(iter->value().ToString());
         state.append(")");
         count++;
         break;
-      case kTypeDeletion:
-        state.append("Delete(");
+      case kTypeSingleDeletion:
+        state.append("SingleDelete(");
         state.append(ikey.user_key.ToString());
         state.append(")");
         count++;
         break;
+      case kTypeMerge:
+        state.append("Merge(");
+        state.append(ikey.user_key.ToString());
+        state.append(", ");
+        state.append(iter->value().ToString());
+        state.append(")");
+        count++;
+        break;
       default:
         assert(false);
         break;
@@ -150,6 +157,22 @@ TEST_F(WriteBatchTest, Append) {
   ASSERT_EQ(4, b1.Count());
 }
 
+TEST_F(WriteBatchTest, SingleDeletion) {
+  WriteBatch batch;
+  WriteBatchInternal::SetSequence(&batch, 100);
+  ASSERT_EQ("", PrintContents(&batch));
+  ASSERT_EQ(0, batch.Count());
+  batch.Put("a", "va");
+  ASSERT_EQ("Put(a, va)@100", PrintContents(&batch));
+  ASSERT_EQ(1, batch.Count());
+  batch.SingleDelete("a");
+  ASSERT_EQ(
+      "SingleDelete(a)@101"
+      "Put(a, va)@100",
+      PrintContents(&batch));
+  ASSERT_EQ(2, batch.Count());
+}
+
 namespace {
   struct TestHandler : public WriteBatch::Handler {
     std::string seen;
@@ -163,6 +186,26 @@ namespace {
       }
       return Status::OK();
     }
+    virtual Status DeleteCF(uint32_t column_family_id,
+                            const Slice& key) override {
+      if (column_family_id == 0) {
+        seen += "Delete(" + key.ToString() + ")";
+      } else {
+        seen += "DeleteCF(" + ToString(column_family_id) + ", " +
+                key.ToString() + ")";
+      }
+      return Status::OK();
+    }
+    virtual Status SingleDeleteCF(uint32_t column_family_id,
+                                  const Slice& key) override {
+      if (column_family_id == 0) {
+        seen += "SingleDelete(" + key.ToString() + ")";
+      } else {
+        seen += "SingleDeleteCF(" + ToString(column_family_id) + ", " +
+                key.ToString() + ")";
+      }
+      return Status::OK();
+    }
     virtual Status MergeCF(uint32_t column_family_id, const Slice& key,
                            const Slice& value) override {
       if (column_family_id == 0) {
@@ -176,47 +219,44 @@ namespace {
     virtual void LogData(const Slice& blob) override {
       seen += "LogData(" + blob.ToString() + ")";
     }
-    virtual Status DeleteCF(uint32_t column_family_id,
-                            const Slice& key) override {
-      if (column_family_id == 0) {
-        seen += "Delete(" + key.ToString() + ")";
-      } else {
-        seen += "DeleteCF(" + ToString(column_family_id) + ", " +
-                key.ToString() + ")";
-      }
-      return Status::OK();
-    }
   };
 }
 
-TEST_F(WriteBatchTest, MergeNotImplemented) {
+TEST_F(WriteBatchTest, PutNotImplemented) {
   WriteBatch batch;
-  batch.Merge(Slice("foo"), Slice("bar"));
+  batch.Put(Slice("k1"), Slice("v1"));
   ASSERT_EQ(1, batch.Count());
-  ASSERT_EQ("Merge(foo, bar)@0",
-            PrintContents(&batch));
+  ASSERT_EQ("Put(k1, v1)@0", PrintContents(&batch));
 
   WriteBatch::Handler handler;
   ASSERT_OK(batch.Iterate(&handler));
 }
 
-TEST_F(WriteBatchTest, PutNotImplemented) {
+TEST_F(WriteBatchTest, DeleteNotImplemented) {
   WriteBatch batch;
-  batch.Put(Slice("k1"), Slice("v1"));
+  batch.Delete(Slice("k2"));
   ASSERT_EQ(1, batch.Count());
-  ASSERT_EQ("Put(k1, v1)@0",
-            PrintContents(&batch));
+  ASSERT_EQ("Delete(k2)@0", PrintContents(&batch));
 
   WriteBatch::Handler handler;
   ASSERT_OK(batch.Iterate(&handler));
 }
 
-TEST_F(WriteBatchTest, DeleteNotImplemented) {
+TEST_F(WriteBatchTest, SingleDeleteNotImplemented) {
   WriteBatch batch;
-  batch.Delete(Slice("k2"));
+  batch.SingleDelete(Slice("k2"));
   ASSERT_EQ(1, batch.Count());
-  ASSERT_EQ("Delete(k2)@0",
-            PrintContents(&batch));
+  ASSERT_EQ("SingleDelete(k2)@0", PrintContents(&batch));
+
+  WriteBatch::Handler handler;
+  ASSERT_OK(batch.Iterate(&handler));
+}
+
+TEST_F(WriteBatchTest, MergeNotImplemented) {
+  WriteBatch batch;
+  batch.Merge(Slice("foo"), Slice("bar"));
+  ASSERT_EQ(1, batch.Count());
+  ASSERT_EQ("Merge(foo, bar)@0", PrintContents(&batch));
 
   WriteBatch::Handler handler;
   ASSERT_OK(batch.Iterate(&handler));
@@ -229,27 +269,31 @@ TEST_F(WriteBatchTest, Blob) {
   batch.Put(Slice("k3"), Slice("v3"));
   batch.PutLogData(Slice("blob1"));
   batch.Delete(Slice("k2"));
+  batch.SingleDelete(Slice("k3"));
   batch.PutLogData(Slice("blob2"));
   batch.Merge(Slice("foo"), Slice("bar"));
-  ASSERT_EQ(5, batch.Count());
-  ASSERT_EQ("Merge(foo, bar)@4"
-            "Put(k1, v1)@0"
-            "Delete(k2)@3"
-            "Put(k2, v2)@1"
-            "Put(k3, v3)@2",
-            PrintContents(&batch));
+  ASSERT_EQ(6, batch.Count());
+  ASSERT_EQ(
+      "Merge(foo, bar)@5"
+      "Put(k1, v1)@0"
+      "Delete(k2)@3"
+      "Put(k2, v2)@1"
+      "SingleDelete(k3)@4"
+      "Put(k3, v3)@2",
+      PrintContents(&batch));
 
   TestHandler handler;
   batch.Iterate(&handler);
   ASSERT_EQ(
-            "Put(k1, v1)"
-            "Put(k2, v2)"
-            "Put(k3, v3)"
-            "LogData(blob1)"
-            "Delete(k2)"
-            "LogData(blob2)"
-            "Merge(foo, bar)",
-            handler.seen);
+      "Put(k1, v1)"
+      "Put(k2, v2)"
+      "Put(k3, v3)"
+      "LogData(blob1)"
+      "Delete(k2)"
+      "SingleDelete(k3)"
+      "LogData(blob2)"
+      "Merge(foo, bar)",
+      handler.seen);
 }
 
 TEST_F(WriteBatchTest, Continue) {
@@ -262,6 +306,16 @@ TEST_F(WriteBatchTest, Continue) {
       ++num_seen;
       return TestHandler::PutCF(column_family_id, key, value);
     }
+    virtual Status DeleteCF(uint32_t column_family_id,
+                            const Slice& key) override {
+      ++num_seen;
+      return TestHandler::DeleteCF(column_family_id, key);
+    }
+    virtual Status SingleDeleteCF(uint32_t column_family_id,
+                                  const Slice& key) override {
+      ++num_seen;
+      return TestHandler::SingleDeleteCF(column_family_id, key);
+    }
     virtual Status MergeCF(uint32_t column_family_id, const Slice& key,
                            const Slice& value) override {
       ++num_seen;
@@ -271,27 +325,24 @@ TEST_F(WriteBatchTest, Continue) {
       ++num_seen;
       TestHandler::LogData(blob);
     }
-    virtual Status DeleteCF(uint32_t column_family_id,
-                            const Slice& key) override {
-      ++num_seen;
-      return TestHandler::DeleteCF(column_family_id, key);
-    }
-    virtual bool Continue() override {
-      return num_seen < 3;
-    }
+    virtual bool Continue() override { return num_seen < 5; }
   } handler;
 
   batch.Put(Slice("k1"), Slice("v1"));
+  batch.Put(Slice("k2"), Slice("v2"));
   batch.PutLogData(Slice("blob1"));
   batch.Delete(Slice("k1"));
+  batch.SingleDelete(Slice("k2"));
   batch.PutLogData(Slice("blob2"));
   batch.Merge(Slice("foo"), Slice("bar"));
   batch.Iterate(&handler);
   ASSERT_EQ(
-            "Put(k1, v1)"
-            "LogData(blob1)"
-            "Delete(k1)",
-            handler.seen);
+      "Put(k1, v1)"
+      "Put(k2, v2)"
+      "LogData(blob1)"
+      "Delete(k1)"
+      "SingleDelete(k2)",
+      handler.seen);
 }
 
 TEST_F(WriteBatchTest, PutGatherSlices) {
@@ -344,6 +395,7 @@ TEST_F(WriteBatchTest, ColumnFamiliesBatchTest) {
   batch.Put(&two, Slice("twofoo"), Slice("bar2"));
   batch.Put(&eight, Slice("eightfoo"), Slice("bar8"));
   batch.Delete(&eight, Slice("eightfoo"));
+  batch.SingleDelete(&two, Slice("twofoo"));
   batch.Merge(&three, Slice("threethree"), Slice("3three"));
   batch.Put(&zero, Slice("foo"), Slice("bar"));
   batch.Merge(Slice("omom"), Slice("nom"));
@@ -355,12 +407,14 @@ TEST_F(WriteBatchTest, ColumnFamiliesBatchTest) {
       "PutCF(2, twofoo, bar2)"
       "PutCF(8, eightfoo, bar8)"
       "DeleteCF(8, eightfoo)"
+      "SingleDeleteCF(2, twofoo)"
       "MergeCF(3, threethree, 3three)"
       "Put(foo, bar)"
       "Merge(omom, nom)",
       handler.seen);
 }
 
+#ifndef ROCKSDB_LITE
 TEST_F(WriteBatchTest, ColumnFamiliesBatchWithIndexTest) {
   WriteBatchWithIndex batch;
   ColumnFamilyHandleImplDummy zero(0), two(2), three(3), eight(8);
@@ -368,6 +422,7 @@ TEST_F(WriteBatchTest, ColumnFamiliesBatchWithIndexTest) {
   batch.Put(&two, Slice("twofoo"), Slice("bar2"));
   batch.Put(&eight, Slice("eightfoo"), Slice("bar8"));
   batch.Delete(&eight, Slice("eightfoo"));
+  batch.SingleDelete(&two, Slice("twofoo"));
   batch.Merge(&three, Slice("threethree"), Slice("3three"));
   batch.Put(&zero, Slice("foo"), Slice("bar"));
   batch.Merge(Slice("omom"), Slice("nom"));
@@ -392,6 +447,24 @@ TEST_F(WriteBatchTest, ColumnFamiliesBatchWithIndexTest) {
   ASSERT_OK(iter->status());
   ASSERT_TRUE(!iter->Valid());
 
+  iter.reset(batch.NewIterator(&two));
+  iter->Seek("twofoo");
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(WriteType::kPutRecord, iter->Entry().type);
+  ASSERT_EQ("twofoo", iter->Entry().key.ToString());
+  ASSERT_EQ("bar2", iter->Entry().value.ToString());
+
+  iter->Next();
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(WriteType::kSingleDeleteRecord, iter->Entry().type);
+  ASSERT_EQ("twofoo", iter->Entry().key.ToString());
+
+  iter->Next();
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(!iter->Valid());
+
   iter.reset(batch.NewIterator());
   iter->Seek("gggg");
   ASSERT_OK(iter->status());
@@ -437,11 +510,118 @@ TEST_F(WriteBatchTest, ColumnFamiliesBatchWithIndexTest) {
       "PutCF(2, twofoo, bar2)"
       "PutCF(8, eightfoo, bar8)"
       "DeleteCF(8, eightfoo)"
+      "SingleDeleteCF(2, twofoo)"
       "MergeCF(3, threethree, 3three)"
       "Put(foo, bar)"
       "Merge(omom, nom)",
       handler.seen);
 }
+#endif  // !ROCKSDB_LITE
+
+TEST_F(WriteBatchTest, SavePointTest) {
+  Status s;
+  WriteBatch batch;
+  batch.SetSavePoint();
+
+  batch.Put("A", "a");
+  batch.Put("B", "b");
+  batch.SetSavePoint();
+
+  batch.Put("C", "c");
+  batch.Delete("A");
+  batch.SetSavePoint();
+  batch.SetSavePoint();
+
+  ASSERT_OK(batch.RollbackToSavePoint());
+  ASSERT_EQ(
+      "Delete(A)@3"
+      "Put(A, a)@0"
+      "Put(B, b)@1"
+      "Put(C, c)@2",
+      PrintContents(&batch));
+
+  ASSERT_OK(batch.RollbackToSavePoint());
+  ASSERT_OK(batch.RollbackToSavePoint());
+  ASSERT_EQ(
+      "Put(A, a)@0"
+      "Put(B, b)@1",
+      PrintContents(&batch));
+
+  batch.Delete("A");
+  batch.Put("B", "bb");
+
+  ASSERT_OK(batch.RollbackToSavePoint());
+  ASSERT_EQ("", PrintContents(&batch));
+
+  s = batch.RollbackToSavePoint();
+  ASSERT_TRUE(s.IsNotFound());
+  ASSERT_EQ("", PrintContents(&batch));
+
+  batch.Put("D", "d");
+  batch.Delete("A");
+
+  batch.SetSavePoint();
+
+  batch.Put("A", "aaa");
+
+  ASSERT_OK(batch.RollbackToSavePoint());
+  ASSERT_EQ(
+      "Delete(A)@1"
+      "Put(D, d)@0",
+      PrintContents(&batch));
+
+  batch.SetSavePoint();
+
+  batch.Put("D", "d");
+  batch.Delete("A");
+
+  ASSERT_OK(batch.RollbackToSavePoint());
+  ASSERT_EQ(
+      "Delete(A)@1"
+      "Put(D, d)@0",
+      PrintContents(&batch));
+
+  s = batch.RollbackToSavePoint();
+  ASSERT_TRUE(s.IsNotFound());
+  ASSERT_EQ(
+      "Delete(A)@1"
+      "Put(D, d)@0",
+      PrintContents(&batch));
+
+  WriteBatch batch2;
+
+  s = batch2.RollbackToSavePoint();
+  ASSERT_TRUE(s.IsNotFound());
+  ASSERT_EQ("", PrintContents(&batch2));
+
+  batch2.Delete("A");
+  batch2.SetSavePoint();
+
+  s = batch2.RollbackToSavePoint();
+  ASSERT_OK(s);
+  ASSERT_EQ("Delete(A)@0", PrintContents(&batch2));
+
+  batch2.Clear();
+  ASSERT_EQ("", PrintContents(&batch2));
+
+  batch2.SetSavePoint();
+
+  batch2.Delete("B");
+  ASSERT_EQ("Delete(B)@0", PrintContents(&batch2));
+
+  batch2.SetSavePoint();
+  s = batch2.RollbackToSavePoint();
+  ASSERT_OK(s);
+  ASSERT_EQ("Delete(B)@0", PrintContents(&batch2));
+
+  s = batch2.RollbackToSavePoint();
+  ASSERT_OK(s);
+  ASSERT_EQ("", PrintContents(&batch2));
+
+  s = batch2.RollbackToSavePoint();
+  ASSERT_TRUE(s.IsNotFound());
+  ASSERT_EQ("", PrintContents(&batch2));
+}
 
 }  // namespace rocksdb
 
diff --git a/src/rocksdb/db/write_callback.h b/src/rocksdb/db/write_callback.h
new file mode 100644
index 0000000..7dcca96
--- /dev/null
+++ b/src/rocksdb/db/write_callback.h
@@ -0,0 +1,24 @@
+// Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+
+#include "rocksdb/status.h"
+
+namespace rocksdb {
+
+class DB;
+
+class WriteCallback {
+ public:
+  virtual ~WriteCallback() {}
+
+  // Will be called while on the write thread before the write executes.  If
+  // this function returns a non-OK status, the write will be aborted and this
+  // status will be returned to the caller of DB::Write().
+  virtual Status Callback(DB* db) = 0;
+};
+
+}  //  namespace rocksdb
diff --git a/src/rocksdb/db/write_callback_test.cc b/src/rocksdb/db/write_callback_test.cc
new file mode 100644
index 0000000..47b7cf7
--- /dev/null
+++ b/src/rocksdb/db/write_callback_test.cc
@@ -0,0 +1,129 @@
+//  Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef ROCKSDB_LITE
+
+#include <string>
+
+#include "db/db_impl.h"
+#include "db/write_callback.h"
+#include "rocksdb/db.h"
+#include "rocksdb/write_batch.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+
+using std::string;
+
+namespace rocksdb {
+
+class WriteCallbackTest : public testing::Test {
+ public:
+  string dbname;
+
+  WriteCallbackTest() {
+    dbname = test::TmpDir() + "/write_callback_testdb";
+  }
+};
+
+class WriteCallbackTestWriteCallback1 : public WriteCallback {
+ public:
+  bool was_called = false;
+
+  Status Callback(DB *db) override {
+    was_called = true;
+
+    // Make sure db is a DBImpl
+    DBImpl* db_impl = dynamic_cast<DBImpl*> (db);
+    if (db_impl == nullptr) {
+      return Status::InvalidArgument("");
+    }
+
+    return Status::OK();
+  }
+};
+
+class WriteCallbackTestWriteCallback2 : public WriteCallback {
+ public:
+  Status Callback(DB *db) override {
+    return Status::Busy();
+  }
+};
+
+TEST_F(WriteCallbackTest, WriteCallBackTest) {
+  Options options;
+  WriteOptions write_options;
+  ReadOptions read_options;
+  string value;
+  DB* db;
+  DBImpl* db_impl;
+
+  options.create_if_missing = true;
+  Status s = DB::Open(options, dbname, &db);
+  ASSERT_OK(s);
+
+  db_impl = dynamic_cast<DBImpl*> (db);
+  ASSERT_TRUE(db_impl);
+
+  WriteBatch wb;
+
+  wb.Put("a", "value.a");
+  wb.Delete("x");
+
+  // Test a simple Write
+  s = db->Write(write_options, &wb);
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, "a", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("value.a", value);
+
+  // Test WriteWithCallback
+  WriteCallbackTestWriteCallback1 callback1;
+  WriteBatch wb2;
+
+  wb2.Put("a", "value.a2");
+
+  s = db_impl->WriteWithCallback(write_options, &wb2, &callback1);
+  ASSERT_OK(s);
+  ASSERT_TRUE(callback1.was_called);
+
+  s = db->Get(read_options, "a", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("value.a2", value);
+
+  // Test WriteWithCallback for a callback that fails
+  WriteCallbackTestWriteCallback2 callback2;
+  WriteBatch wb3;
+
+  wb3.Put("a", "value.a3");
+
+  s = db_impl->WriteWithCallback(write_options, &wb3, &callback2);
+  ASSERT_NOK(s);
+
+  s = db->Get(read_options, "a", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("value.a2", value);
+
+  delete db;
+  DestroyDB(dbname, options);
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int argc, char** argv) {
+  fprintf(stderr,
+          "SKIPPED as WriteWithCallback is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/write_controller.cc b/src/rocksdb/db/write_controller.cc
index bb6f8ec..c26f6fb 100644
--- a/src/rocksdb/db/write_controller.cc
+++ b/src/rocksdb/db/write_controller.cc
@@ -5,7 +5,9 @@
 
 #include "db/write_controller.h"
 
+#include <atomic>
 #include <cassert>
+#include "rocksdb/env.h"
 
 namespace rocksdb {
 
@@ -14,15 +16,83 @@ std::unique_ptr<WriteControllerToken> WriteController::GetStopToken() {
   return std::unique_ptr<WriteControllerToken>(new StopWriteToken(this));
 }
 
-std::unique_ptr<WriteControllerToken> WriteController::GetDelayToken(
-    uint64_t delay_us) {
-  total_delay_us_ += delay_us;
-  return std::unique_ptr<WriteControllerToken>(
-      new DelayWriteToken(this, delay_us));
+std::unique_ptr<WriteControllerToken> WriteController::GetDelayToken() {
+  if (total_delayed_++ == 0) {
+    last_refill_time_ = 0;
+    bytes_left_ = 0;
+  }
+  return std::unique_ptr<WriteControllerToken>(new DelayWriteToken(this));
 }
 
 bool WriteController::IsStopped() const { return total_stopped_ > 0; }
-uint64_t WriteController::GetDelay() const { return total_delay_us_; }
+// This is called inside the DB mutex, so we can't sleep and need to
+// minimize how often we fetch the time.
+// If it turns out to be a performance issue, we can redesign the thread
+// synchronization model here.
+// The function trusts the caller to sleep for the number of micros returned.
+uint64_t WriteController::GetDelay(Env* env, uint64_t num_bytes) {
+  if (total_stopped_ > 0) {
+    return 0;
+  }
+  if (total_delayed_ == 0) {
+    return 0;
+  }
+
+  const uint64_t kMicrosPerSecond = 1000000;
+  const uint64_t kRefillInterval = 1024U;
+
+  if (bytes_left_ >= num_bytes) {
+    bytes_left_ -= num_bytes;
+    return 0;
+  }
+  // Inside the DB mutex, the time is fetched less than once per refill
+  // interval.
+  auto time_now = env->NowMicros();
+
+  uint64_t sleep_debt = 0;
+  uint64_t time_since_last_refill = 0;
+  if (last_refill_time_ != 0) {
+    if (last_refill_time_ > time_now) {
+      sleep_debt = last_refill_time_ - time_now;
+    } else {
+      time_since_last_refill = time_now - last_refill_time_;
+      bytes_left_ +=
+          static_cast<uint64_t>(static_cast<double>(time_since_last_refill) /
+                                kMicrosPerSecond * delayed_write_rate_);
+      if (time_since_last_refill >= kRefillInterval &&
+          bytes_left_ > num_bytes) {
+        // If the refill interval has already passed and we have enough
+        // bytes, return without extra sleeping.
+        last_refill_time_ = time_now;
+        bytes_left_ -= num_bytes;
+        return 0;
+      }
+    }
+  }
+
+  uint64_t single_refill_amount =
+      delayed_write_rate_ * kRefillInterval / kMicrosPerSecond;
+  if (bytes_left_ + single_refill_amount >= num_bytes) {
+    // Wait until the next refill interval.
+    // Never expire in less than one refill interval, to avoid fetching
+    // the time too often.
+    bytes_left_ = bytes_left_ + single_refill_amount - num_bytes;
+    last_refill_time_ = time_now + kRefillInterval;
+    return kRefillInterval + sleep_debt;
+  }
+
+  // Need to refill more than one interval's worth of bytes, so the caller
+  // must sleep longer.
+
+  // Sleep just until `num_bytes` is allowed.
+  uint64_t sleep_amount =
+      static_cast<uint64_t>(num_bytes /
+                            static_cast<long double>(delayed_write_rate_) *
+                            kMicrosPerSecond) +
+      sleep_debt;
+  last_refill_time_ = time_now + sleep_amount;
+  return sleep_amount;
+}
 
 StopWriteToken::~StopWriteToken() {
   assert(controller_->total_stopped_ >= 1);
@@ -30,8 +100,8 @@ StopWriteToken::~StopWriteToken() {
 }
 
 DelayWriteToken::~DelayWriteToken() {
-  assert(controller_->total_delay_us_ >= delay_us_);
-  controller_->total_delay_us_ -= delay_us_;
+  controller_->total_delayed_--;
+  assert(controller_->total_delayed_ >= 0);
 }
 
 }  // namespace rocksdb
diff --git a/src/rocksdb/db/write_controller.h b/src/rocksdb/db/write_controller.h
index 32e1d58..50e5a99 100644
--- a/src/rocksdb/db/write_controller.h
+++ b/src/rocksdb/db/write_controller.h
@@ -11,6 +11,7 @@
 
 namespace rocksdb {
 
+class Env;
 class WriteControllerToken;
 
 // WriteController is controlling write stalls in our write code-path. Write
@@ -19,20 +20,38 @@ class WriteControllerToken;
 // to be called while holding DB mutex
 class WriteController {
  public:
-  WriteController() : total_stopped_(0), total_delay_us_(0) {}
+  explicit WriteController(uint64_t delayed_write_rate = 1024u * 1024u * 32u)
+      : total_stopped_(0),
+        total_delayed_(0),
+        bytes_left_(0),
+        last_refill_time_(0) {
+    set_delayed_write_rate(delayed_write_rate);
+  }
   ~WriteController() = default;
 
   // When an actor (column family) requests a stop token, all writes will be
   // stopped until the stop token is released (deleted)
   std::unique_ptr<WriteControllerToken> GetStopToken();
   // When an actor (column family) requests a delay token, total delay for all
-  // writes will be increased by delay_us. The delay will last until delay token
-  // is released
-  std::unique_ptr<WriteControllerToken> GetDelayToken(uint64_t delay_us);
+  // writes to the DB will be controlled under the delayed write rate. Every
+  // write needs to call GetDelay() with the number of bytes it will write
+  // to the DB, and the call returns the number of microseconds to sleep.
+  std::unique_ptr<WriteControllerToken> GetDelayToken();
 
  // these two methods are querying the state of the WriteController
   bool IsStopped() const;
-  uint64_t GetDelay() const;
+  bool NeedsDelay() const { return total_delayed_ > 0; }
+  // Returns how many microseconds the caller needs to sleep after the call.
+  // num_bytes: the number of bytes to be written to the DB.
+  // Prerequisite: DB mutex held.
+  uint64_t GetDelay(Env* env, uint64_t num_bytes);
+  void set_delayed_write_rate(uint64_t delayed_write_rate) {
+    delayed_write_rate_ = delayed_write_rate;
+    if (delayed_write_rate_ == 0) {
+      // avoid dividing by zero
+      delayed_write_rate_ = 1U;
+    }
+  }
 
  private:
   friend class WriteControllerToken;
@@ -40,7 +59,10 @@ class WriteController {
   friend class DelayWriteToken;
 
   int total_stopped_;
-  uint64_t total_delay_us_;
+  int total_delayed_;
+  uint64_t bytes_left_;
+  uint64_t last_refill_time_;
+  uint64_t delayed_write_rate_;
 };
 
 class WriteControllerToken {
@@ -67,12 +89,9 @@ class StopWriteToken : public WriteControllerToken {
 
 class DelayWriteToken : public WriteControllerToken {
  public:
-  DelayWriteToken(WriteController* controller, uint64_t delay_us)
-      : WriteControllerToken(controller), delay_us_(delay_us) {}
+  explicit DelayWriteToken(WriteController* controller)
+      : WriteControllerToken(controller) {}
   virtual ~DelayWriteToken();
-
- private:
-  uint64_t delay_us_;
 };
 
 }  // namespace rocksdb
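
Under the new scheme a delay token no longer carries a fixed microsecond amount; the controller meters bytes against delayed_write_rate_ and tells each writer how long to sleep. A simplified caller-side sketch (the real consumer is DBImpl, which holds the DB mutex around GetDelay and sleeps outside it; the function and variable names here are made up):

    #include <cstdint>
    #include "db/write_controller.h"
    #include "rocksdb/env.h"

    void ThrottledWrite(rocksdb::WriteController* controller,
                        rocksdb::Env* env, uint64_t batch_bytes) {
      if (controller->NeedsDelay()) {
        // REQUIRES (per the header above): DB mutex held around this call.
        uint64_t sleep_micros = controller->GetDelay(env, batch_bytes);
        if (sleep_micros > 0) {
          env->SleepForMicroseconds(static_cast<int>(sleep_micros));
        }
      }
      // ... proceed with the actual write ...
    }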
diff --git a/src/rocksdb/db/write_controller_test.cc b/src/rocksdb/db/write_controller_test.cc
index 41f8313..aa8175d 100644
--- a/src/rocksdb/db/write_controller_test.cc
+++ b/src/rocksdb/db/write_controller_test.cc
@@ -5,14 +5,22 @@
 //
 #include "db/write_controller.h"
 
+#include "rocksdb/env.h"
 #include "util/testharness.h"
 
 namespace rocksdb {
 
 class WriteControllerTest : public testing::Test {};
 
+class TimeSetEnv : public EnvWrapper {
+ public:
+  explicit TimeSetEnv() : EnvWrapper(nullptr) {}
+  uint64_t now_micros_ = 6666;
+  virtual uint64_t NowMicros() override { return now_micros_; }
+};
+
 TEST_F(WriteControllerTest, SanityTest) {
-  WriteController controller;
+  WriteController controller(10000000u);
   auto stop_token_1 = controller.GetStopToken();
   auto stop_token_2 = controller.GetStopToken();
 
@@ -22,15 +30,66 @@ TEST_F(WriteControllerTest, SanityTest) {
   stop_token_2.reset();
   ASSERT_FALSE(controller.IsStopped());
 
-  auto delay_token_1 = controller.GetDelayToken(5);
-  ASSERT_EQ(static_cast<uint64_t>(5), controller.GetDelay());
-  auto delay_token_2 = controller.GetDelayToken(8);
-  ASSERT_EQ(static_cast<uint64_t>(13), controller.GetDelay());
+  TimeSetEnv env;
+
+  auto delay_token_1 = controller.GetDelayToken();
+  ASSERT_EQ(static_cast<uint64_t>(2000000),
+            controller.GetDelay(&env, 20000000u));
+
+  env.now_micros_ += 1999900u;  // sleep debt 100
+  auto delay_token_2 = controller.GetDelayToken();
+  // One refill: 10240 bytes allowed, 1000 used, 9240 left
+  ASSERT_EQ(static_cast<uint64_t>(1124), controller.GetDelay(&env, 1000u));
+  env.now_micros_ += 1124u;  // sleep debt 0
 
   delay_token_2.reset();
-  ASSERT_EQ(static_cast<uint64_t>(5), controller.GetDelay());
+  // 1000 used, 8240 left
+  ASSERT_EQ(static_cast<uint64_t>(0), controller.GetDelay(&env, 1000u));
+
+  env.now_micros_ += 100u;  // sleep credit 100
+  // 1000 used, 7240 left
+  ASSERT_EQ(static_cast<uint64_t>(0), controller.GetDelay(&env, 1000u));
+
+  env.now_micros_ += 100u;  // sleep credit 200
+  // One refill: 10240 filled, sleep credit generates 2000. 8000 used
+  //             7240 + 10240 + 2000 - 8000 = 11480 left
+  ASSERT_EQ(static_cast<uint64_t>(1024u), controller.GetDelay(&env, 8000u));
+
+  env.now_micros_ += 200u;  // sleep debt 824
+  // 1000 used, 10480 left.
+  ASSERT_EQ(static_cast<uint64_t>(0), controller.GetDelay(&env, 1000u));
+
+  env.now_micros_ += 200u;  // sleep debt 624
+  // Out of bound sleep, still 10480 left
+  ASSERT_EQ(static_cast<uint64_t>(3000624u),
+            controller.GetDelay(&env, 30000000u));
+
+  env.now_micros_ += 3000724u;  // sleep credit 100
+  // 6000 used, 4480 left.
+  ASSERT_EQ(static_cast<uint64_t>(0), controller.GetDelay(&env, 6000u));
+
+  env.now_micros_ += 200u;  // sleep credit 300
+  // One refill: 4480 balance + 3000 credit + 10240 refill
+  // Use 8000, 9720 left
+  ASSERT_EQ(static_cast<uint64_t>(1024u), controller.GetDelay(&env, 8000u));
+
+  env.now_micros_ += 3024u;  // sleep credit 2000
+
+  // 1720 left
+  ASSERT_EQ(static_cast<uint64_t>(0u), controller.GetDelay(&env, 8000u));
+
+  // 1720 balance + 20000 credit = 21720 left
+  // Use 8000, 13720 left
+  ASSERT_EQ(static_cast<uint64_t>(0u), controller.GetDelay(&env, 8000u));
+
+  // 5720 left
+  ASSERT_EQ(static_cast<uint64_t>(0u), controller.GetDelay(&env, 8000u));
+
+  // Need a refill
+  ASSERT_EQ(static_cast<uint64_t>(1024u), controller.GetDelay(&env, 9000u));
+
   delay_token_1.reset();
-  ASSERT_EQ(static_cast<uint64_t>(0), controller.GetDelay());
+  ASSERT_EQ(static_cast<uint64_t>(0), controller.GetDelay(&env, 30000000u));
   delay_token_1.reset();
   ASSERT_FALSE(controller.IsStopped());
 }
diff --git a/src/rocksdb/db/write_thread.cc b/src/rocksdb/db/write_thread.cc
index 052e120..9b66af2 100644
--- a/src/rocksdb/db/write_thread.cc
+++ b/src/rocksdb/db/write_thread.cc
@@ -7,122 +7,110 @@
 
 namespace rocksdb {
 
-Status WriteThread::EnterWriteThread(WriteThread::Writer* w,
-                                     uint64_t expiration_time) {
-  // the following code block pushes the current writer "w" into the writer
-  // queue "writers_" and wait until one of the following conditions met:
-  // 1. the job of "w" has been done by some other writers.
-  // 2. "w" becomes the first writer in "writers_"
-  // 3. "w" timed-out.
-  writers_.push_back(w);
-
-  bool timed_out = false;
-  while (!w->done && w != writers_.front()) {
-    if (expiration_time == 0) {
-      w->cv.Wait();
-    } else if (w->cv.TimedWait(expiration_time)) {
-      if (w->in_batch_group) {
-        // then it means the front writer is currently doing the
-        // write on behalf of this "timed-out" writer.  Then it
-        // should wait until the write completes.
-        expiration_time = 0;
-      } else {
-        timed_out = true;
-        break;
-      }
-    }
-  }
+void WriteThread::Await(Writer* w) {
+  std::unique_lock<std::mutex> guard(w->JoinMutex());
+  w->JoinCV().wait(guard, [w] { return w->joined; });
+}
+
+void WriteThread::MarkJoined(Writer* w) {
+  std::lock_guard<std::mutex> guard(w->JoinMutex());
+  assert(!w->joined);
+  w->joined = true;
+  w->JoinCV().notify_one();
+}
+
+void WriteThread::LinkOne(Writer* w, bool* wait_needed) {
+  assert(!w->joined && !w->done);
 
-  if (timed_out) {
-#ifndef NDEBUG
-    bool found = false;
-#endif
-    for (auto iter = writers_.begin(); iter != writers_.end(); iter++) {
-      if (*iter == w) {
-        writers_.erase(iter);
-#ifndef NDEBUG
-        found = true;
-#endif
-        break;
-      }
+  Writer* writers = newest_writer_.load(std::memory_order_relaxed);
+  while (true) {
+    w->link_older = writers;
+    if (writers != nullptr) {
+      w->CreateMutex();
     }
-#ifndef NDEBUG
-    assert(found);
-#endif
-    // writers_.front() might still be in cond_wait without a time-out.
-    // As a result, we need to signal it to wake it up.  Otherwise no
-    // one else will wake him up, and RocksDB will hang.
-    if (!writers_.empty()) {
-      writers_.front()->cv.Signal();
+    if (newest_writer_.compare_exchange_strong(writers, w)) {
+      // Success.
+      *wait_needed = (writers != nullptr);
+      return;
     }
-    return Status::TimedOut();
   }
-  return Status::OK();
 }
 
-void WriteThread::ExitWriteThread(WriteThread::Writer* w,
-                                  WriteThread::Writer* last_writer,
-                                  Status status) {
-  // Pop out the current writer and all writers being pushed before the
-  // current writer from the writer queue.
-  while (!writers_.empty()) {
-    Writer* ready = writers_.front();
-    writers_.pop_front();
-    if (ready != w) {
-      ready->status = status;
-      ready->done = true;
-      ready->cv.Signal();
+void WriteThread::CreateMissingNewerLinks(Writer* head) {
+  while (true) {
+    Writer* next = head->link_older;
+    if (next == nullptr || next->link_newer != nullptr) {
+      assert(next == nullptr || next->link_newer == head);
+      break;
     }
-    if (ready == last_writer) break;
+    next->link_newer = head;
+    head = next;
   }
+}
 
-  // Notify new head of write queue
-  if (!writers_.empty()) {
-    writers_.front()->cv.Signal();
+void WriteThread::JoinBatchGroup(Writer* w) {
+  assert(w->batch != nullptr);
+  bool wait_needed;
+  LinkOne(w, &wait_needed);
+  if (wait_needed) {
+    Await(w);
   }
 }
 
-// This function will be called only when the first writer succeeds.
-// All writers in the to-be-built batch group will be processed.
-//
-// REQUIRES: Writer list must be non-empty
-// REQUIRES: First writer must have a non-nullptr batch
-void WriteThread::BuildBatchGroup(WriteThread::Writer** last_writer,
-                                  autovector<WriteBatch*>* write_batch_group) {
-  assert(!writers_.empty());
-  Writer* first = writers_.front();
-  assert(first->batch != nullptr);
+size_t WriteThread::EnterAsBatchGroupLeader(
+    Writer* leader, WriteThread::Writer** last_writer,
+    autovector<WriteBatch*>* write_batch_group) {
+  assert(leader->link_older == nullptr);
+  assert(leader->batch != nullptr);
 
-  size_t size = WriteBatchInternal::ByteSize(first->batch);
-  write_batch_group->push_back(first->batch);
+  size_t size = WriteBatchInternal::ByteSize(leader->batch);
+  write_batch_group->push_back(leader->batch);
 
   // Allow the group to grow up to a maximum size, but if the
   // original write is small, limit the growth so we do not slow
   // down the small write too much.
   size_t max_size = 1 << 20;
-  if (size <= (128<<10)) {
-    max_size = size + (128<<10);
+  if (size <= (128 << 10)) {
+    max_size = size + (128 << 10);
+  }
+
+  *last_writer = leader;
+
+  if (leader->has_callback) {
+    // TODO(agiardullo): Batching not currently supported, as this write may
+    // fail if the callback function decides to abort this write.
+    return size;
   }
 
-  *last_writer = first;
-  std::deque<Writer*>::iterator iter = writers_.begin();
-  ++iter;  // Advance past "first"
-  for (; iter != writers_.end(); ++iter) {
-    Writer* w = *iter;
-    if (w->sync && !first->sync) {
+  Writer* newest_writer = newest_writer_.load(std::memory_order_acquire);
+
+  // This is safe regardless of any db mutex status of the caller. Previous
+  // calls to ExitAsBatchGroupLeader either didn't call
+  // CreateMissingNewerLinks (they emptied the list and then we added
+  // ourselves as leader) or had to explicitly wake us up (the list was
+  // non-empty when we added ourselves, so we have already received our
+  // MarkJoined).
+  CreateMissingNewerLinks(newest_writer);
+
+  // Tricky. Iteration start (leader) is exclusive and finish
+  // (newest_writer) is inclusive. Iteration goes from old to new.
+  Writer* w = leader;
+  while (w != newest_writer) {
+    w = w->link_newer;
+
+    if (w->sync && !leader->sync) {
       // Do not include a sync write into a batch handled by a non-sync write.
       break;
     }
 
-    if (!w->disableWAL && first->disableWAL) {
+    if (!w->disableWAL && leader->disableWAL) {
       // Do not include a write that needs WAL into a batch that has
       // WAL disabled.
       break;
     }
 
-    if (w->timeout_hint_us < first->timeout_hint_us) {
-      // Do not include those writes with shorter timeout.  Otherwise, we might
-      // execute a write that should instead be aborted because of timeout.
+    if (w->has_callback) {
+      // Do not include writes which may be aborted if the callback does not
+      // succeed.
       break;
     }
 
@@ -142,6 +130,71 @@ void WriteThread::BuildBatchGroup(WriteThread::Writer** last_writer,
     w->in_batch_group = true;
     *last_writer = w;
   }
+  return size;
+}
+
+void WriteThread::ExitAsBatchGroupLeader(Writer* leader, Writer* last_writer,
+                                         Status status) {
+  assert(leader->link_older == nullptr);
+
+  Writer* head = newest_writer_.load(std::memory_order_acquire);
+  if (head != last_writer ||
+      !newest_writer_.compare_exchange_strong(head, nullptr)) {
+    // Either w wasn't the head during the load(), or it was the head
+    // during the load() but somebody else pushed onto the list before
+    // we did the compare_exchange_strong (causing it to fail).  In the
+    // latter case compare_exchange_strong has the effect of re-reading
+    // its first param (head).  No need to retry a failing CAS, because
+    // only a departing leader (which we are at the moment) can remove
+    // nodes from the list.
+    assert(head != last_writer);
+
+    // After walking link_older starting from head (if not already done)
+    // we will be able to traverse w->link_newer below. This function
+    // can only be called from an active leader, only a leader can
+    // clear newest_writer_, we didn't, and only a clear newest_writer_
+    // could cause the next leader to start their work without a call
+    // to MarkJoined, so we can definitely conclude that no other leader
+    // work is going on here (with or without db mutex).
+    CreateMissingNewerLinks(head);
+    assert(last_writer->link_newer->link_older == last_writer);
+    last_writer->link_newer->link_older = nullptr;
+
+    // The next leader didn't self-identify, because newest_writer_ wasn't
+    // nullptr when they enqueued (we were definitely enqueued before them
+    // and are still in the list).  That means leader handoff occurs when
+    // we call MarkJoined.
+    MarkJoined(last_writer->link_newer);
+  }
+  // else nobody else was waiting, although there might already be a new
+  // leader now
+
+  while (last_writer != leader) {
+    last_writer->status = status;
+    last_writer->done = true;
+    // We must read link_older before calling MarkJoined, because as
+    // soon as it is marked the other thread's Await may return and
+    // deallocate the Writer.
+    auto next = last_writer->link_older;
+    MarkJoined(last_writer);
+    last_writer = next;
+  }
+}
+
+void WriteThread::EnterUnbatched(Writer* w, InstrumentedMutex* mu) {
+  assert(w->batch == nullptr);
+  bool wait_needed;
+  LinkOne(w, &wait_needed);
+  if (wait_needed) {
+    mu->Unlock();
+    Await(w);
+    mu->Lock();
+  }
+}
+
+void WriteThread::ExitUnbatched(Writer* w) {
+  Status dummy_status;
+  ExitAsBatchGroupLeader(w, w, dummy_status);
 }
 
 }  // namespace rocksdb
diff --git a/src/rocksdb/db/write_thread.h b/src/rocksdb/db/write_thread.h
index db35202..3a15ea8 100644
--- a/src/rocksdb/db/write_thread.h
+++ b/src/rocksdb/db/write_thread.h
@@ -5,77 +5,145 @@
 
 #pragma once
 
+#include <assert.h>
 #include <stdint.h>
-#include <deque>
-#include <limits>
+#include <atomic>
+#include <condition_variable>
+#include <mutex>
+#include <type_traits>
 #include "rocksdb/status.h"
 #include "db/write_batch_internal.h"
 #include "util/autovector.h"
-#include "port/port.h"
 #include "util/instrumented_mutex.h"
 
 namespace rocksdb {
 
 class WriteThread {
  public:
-  static const uint64_t kNoTimeOut = std::numeric_limits<uint64_t>::max();
-  // Information kept for every waiting writer
+  // Information kept for every waiting writer.
   struct Writer {
-    Status status;
     WriteBatch* batch;
     bool sync;
     bool disableWAL;
     bool in_batch_group;
     bool done;
-    uint64_t timeout_hint_us;
-    InstrumentedCondVar cv;
+    bool has_callback;
+    Status status;
+    bool made_waitable;  // records lazy construction of mutex and cv
+    bool joined;         // read/write only under JoinMutex() (or pre-link)
+    std::aligned_storage<sizeof(std::mutex)>::type join_mutex_bytes;
+    std::aligned_storage<sizeof(std::condition_variable)>::type join_cv_bytes;
+    Writer* link_older;  // read/write only before linking, or as leader
+    Writer* link_newer;  // lazy, read/write only before linking, or as leader
 
-    explicit Writer(InstrumentedMutex* mu)
+    Writer()
         : batch(nullptr),
           sync(false),
           disableWAL(false),
           in_batch_group(false),
           done(false),
-          timeout_hint_us(kNoTimeOut),
-          cv(mu) {}
+          has_callback(false),
+          made_waitable(false),
+          joined(false),
+          link_older(nullptr),
+          link_newer(nullptr) {}
+
+    ~Writer() {
+      if (made_waitable) {
+        JoinMutex().~mutex();
+        JoinCV().~condition_variable();
+      }
+    }
+
+    void CreateMutex() {
+      assert(!joined);
+      if (!made_waitable) {
+        made_waitable = true;
+        new (&join_mutex_bytes) std::mutex;
+        new (&join_cv_bytes) std::condition_variable;
+      }
+    }
+
+    // No other mutexes may be acquired while holding JoinMutex(); it is
+    // always last in the lock order.
+    std::mutex& JoinMutex() {
+      assert(made_waitable);
+      return *static_cast<std::mutex*>(static_cast<void*>(&join_mutex_bytes));
+    }
+
+    std::condition_variable& JoinCV() {
+      assert(made_waitable);
+      return *static_cast<std::condition_variable*>(
+          static_cast<void*>(&join_cv_bytes));
+    }
   };
 
-  WriteThread() = default;
-  ~WriteThread() = default;
+  WriteThread() : newest_writer_(nullptr) {}
 
-  // Before applying write operation (such as DBImpl::Write, DBImpl::Flush)
-  // thread should grab the mutex_ and be the first on writers queue.
-  // EnterWriteThread is used for it.
-  // Be aware! Writer's job can be done by other thread (see DBImpl::Write
-  // for examples), so check it via w.done before applying changes.
+  // IMPORTANT: None of the methods in this class rely on the db mutex
+  // for correctness. All of the methods except JoinBatchGroup and
+  // EnterUnbatched may be called either with or without the db mutex held.
+  // Correctness is maintained by ensuring that only a single thread is
+  // a leader at a time.
+
+  // Registers w as ready to become part of a batch group, and blocks
+  // until some other thread has completed the write (in which case
+  // w->done will be set to true) or this write has become the leader
+  // of a batch group (w->done will remain unset).  The db mutex SHOULD
+  // NOT be held when calling this function, because it will block.
+  // If !w->done then JoinBatchGroup should be followed by a call to
+  // EnterAsBatchGroupLeader and ExitAsBatchGroupLeader.
   //
-  // Writer* w:                writer to be placed in the queue
-  // uint64_t expiration_time: maximum time to be in the queue
-  // See also: ExitWriteThread
-  // REQUIRES: db mutex held
-  Status EnterWriteThread(Writer* w, uint64_t expiration_time);
+  // Writer* w:        Writer to be executed as part of a batch group
+  void JoinBatchGroup(Writer* w);
 
-  // After doing write job, we need to remove already used writers from
-  // writers_ queue and notify head of the queue about it.
-  // ExitWriteThread is used for this.
+  // Constructs a write batch group led by leader, which should be a
+  // Writer passed to JoinBatchGroup on the current thread.
   //
-  // Writer* w:           Writer, that was added by EnterWriteThread function
-  // Writer* last_writer: Since we can join a few Writers (as DBImpl::Write
-  //                      does)
-  //                      we should pass last_writer as a parameter to
-  //                      ExitWriteThread
-  //                      (if you don't touch other writers, just pass w)
-  // Status status:       Status of write operation
-  // See also: EnterWriteThread
+  // Writer* leader:         Writer passed to JoinBatchGroup, but !done
+  // Writer** last_writer:   Out-param for use by ExitAsBatchGroupLeader
+  // autovector<WriteBatch*>* write_batch_group: Out-param of group members
+  // returns:                Total batch group size
+  size_t EnterAsBatchGroupLeader(Writer* leader, Writer** last_writer,
+                                 autovector<WriteBatch*>* write_batch_group);
+
+  // Unlinks the Writer-s in a batch group, wakes up the non-leaders, and
+  // wakes up the next leader (if any).
+  //
+  // Writer* leader:         From EnterAsBatchGroupLeader
+  // Writer* last_writer:    Value of out-param of EnterAsBatchGroupLeader
+  // Status status:          Status of write operation
+  void ExitAsBatchGroupLeader(Writer* leader, Writer* last_writer,
+                              Status status);
+
+  // Waits for all preceding writers (unlocking mu while waiting), then
+  // registers w as the currently proceeding writer.
+  //
+  // Writer* w:              A Writer not eligible for batching
+  // InstrumentedMutex* mu:  The db mutex, to unlock while waiting
   // REQUIRES: db mutex held
-  void ExitWriteThread(Writer* w, Writer* last_writer, Status status);
+  void EnterUnbatched(Writer* w, InstrumentedMutex* mu);
 
-  void BuildBatchGroup(Writer** last_writer,
-                       autovector<WriteBatch*>* write_batch_group);
+  // Completes a Writer begun with EnterUnbatched, unblocking subsequent
+  // writers.
+  void ExitUnbatched(Writer* w);
 
  private:
-  // Queue of writers.
-  std::deque<Writer*> writers_;
+  // Points to the newest pending Writer.  Only the leader can remove
+  // elements; adding can be done lock-free by anybody.
+  std::atomic<Writer*> newest_writer_;
+
+  void Await(Writer* w);
+  void MarkJoined(Writer* w);
+
+  // Links w into the newest_writer_ list. Sets *wait_needed to false
+  // if w was linked directly into the leader position, true otherwise.
+  // Safe to call from multiple threads without external locking.
+  void LinkOne(Writer* w, bool* wait_needed);
+
+  // Computes any missing link_newer links.  Should not be called
+  // concurrently with itself.
+  void CreateMissingNewerLinks(Writer* head);
 };
 
 }  // namespace rocksdb
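
Putting the new interface together: a writer joins the lock-free list and either discovers its work was already done by a leader, or becomes the leader itself. A condensed sketch of the intended call sequence (WriteGroup is a hypothetical stand-in for writing the group to the WAL and memtable; the real caller is DBImpl::Write):

    #include "db/write_thread.h"
    #include "util/autovector.h"

    // Hypothetical helper that applies a batch group to the WAL/memtable.
    rocksdb::Status WriteGroup(
        const rocksdb::autovector<rocksdb::WriteBatch*>& group);

    rocksdb::Status WriteOne(rocksdb::WriteThread* write_thread,
                             rocksdb::WriteBatch* my_batch) {
      rocksdb::WriteThread::Writer w;
      w.batch = my_batch;
      write_thread->JoinBatchGroup(&w);
      if (w.done) {
        // A leader already executed this write on our behalf.
        return w.status;
      }
      // We are the leader: build a group, write it, then hand off.
      rocksdb::autovector<rocksdb::WriteBatch*> group;
      rocksdb::WriteThread::Writer* last_writer;
      write_thread->EnterAsBatchGroupLeader(&w, &last_writer, &group);
      rocksdb::Status status = WriteGroup(group);
      write_thread->ExitAsBatchGroupLeader(&w, last_writer, status);
      return status;
    }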
diff --git a/src/rocksdb/examples/.gitignore b/src/rocksdb/examples/.gitignore
index 5cb04d4..8c06e79 100644
--- a/src/rocksdb/examples/.gitignore
+++ b/src/rocksdb/examples/.gitignore
@@ -1,4 +1,7 @@
-column_families_example
-simple_example
 c_simple_example
+column_families_example
 compact_files_example
+compaction_filter_example
+optimistic_transaction_example
+simple_example
+transaction_example
diff --git a/src/rocksdb/examples/Makefile b/src/rocksdb/examples/Makefile
index 7bd88fb..fe82d11 100644
--- a/src/rocksdb/examples/Makefile
+++ b/src/rocksdb/examples/Makefile
@@ -1,23 +1,35 @@
 include ../make_config.mk
 
-.PHONY: clean
+.PHONY: clean librocksdb
 
-all: simple_example column_families_example compact_files_example c_simple_example
+all: simple_example column_families_example compact_files_example c_simple_example optimistic_transaction_example transaction_example
 
-simple_example: simple_example.cc
+simple_example: librocksdb simple_example.cc
 	$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++11 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
 
-column_families_example: column_families_example.cc
+column_families_example: librocksdb column_families_example.cc
 	$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++11 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
 
-compact_files_example: compact_files_example.cc
+compaction_filter_example: librocksdb compaction_filter_example.cc
+	$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++11 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
+
+compact_files_example: librocksdb compact_files_example.cc
 	$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++11 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
 
 .c.o:
 	$(CC) $(CFLAGS) -c $< -o $@ -I../include
 
-c_simple_example: c_simple_example.o
+c_simple_example: librocksdb c_simple_example.o
 	$(CXX) $@.o -o$@ ../librocksdb.a $(PLATFORM_LDFLAGS) $(EXEC_LDFLAGS)
 
+optimistic_transaction_example: librocksdb optimistic_transaction_example.cc
+	$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++11 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
+
+transaction_example: librocksdb transaction_example.cc
+	$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++11 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
+
 clean:
-	rm -rf ./simple_example ./column_families_example ./compact_files_example ./c_simple_example c_simple_example.o
+	rm -rf ./simple_example ./column_families_example ./compact_files_example ./compaction_filter_example ./c_simple_example c_simple_example.o ./optimistic_transaction_example ./transaction_example
+
+librocksdb:
+	cd .. && $(MAKE) librocksdb.a
diff --git a/src/rocksdb/examples/c_simple_example.c b/src/rocksdb/examples/c_simple_example.c
index 7a63827..ab19f3b 100644
--- a/src/rocksdb/examples/c_simple_example.c
+++ b/src/rocksdb/examples/c_simple_example.c
@@ -27,7 +27,7 @@ int main(int argc, char **argv) {
   db = rocksdb_open(options, DBPath, &err);
   assert(!err);
 
-  // open Backup Engine that we will use for backing up or database
+  // open Backup Engine that we will use for backing up our database
   be = rocksdb_backup_engine_open(options, DBBackupPath, &err);
   assert(!err);
 
diff --git a/src/rocksdb/examples/compact_files_example.cc b/src/rocksdb/examples/compact_files_example.cc
index 3e7638b..6c04566 100644
--- a/src/rocksdb/examples/compact_files_example.cc
+++ b/src/rocksdb/examples/compact_files_example.cc
@@ -14,7 +14,7 @@
 
 using namespace rocksdb;
 std::string kDBPath = "/tmp/rocksdb_compact_files_example";
-class CompactionTask;
+struct CompactionTask;
 
 // This is an example interface of external-compaction algorithm.
 // Compaction algorithm can be implemented outside the core-RocksDB
@@ -35,19 +35,19 @@ class Compactor : public EventListener {
 // Example structure that describes a compaction task.
 struct CompactionTask {
   CompactionTask(
-      DB* db, Compactor* compactor,
-      const std::string& column_family_name,
-      const std::vector<std::string>& input_file_names,
-      const int output_level,
-      const CompactionOptions& compact_options,
-      bool retry_on_fail)
-          : db(db),
-            compactor(compactor),
-            column_family_name(column_family_name),
-            input_file_names(input_file_names),
-            output_level(output_level),
-            compact_options(compact_options),
-            retry_on_fail(false) {}
+      DB* _db, Compactor* _compactor,
+      const std::string& _column_family_name,
+      const std::vector<std::string>& _input_file_names,
+      const int _output_level,
+      const CompactionOptions& _compact_options,
+      bool _retry_on_fail)
+          : db(_db),
+            compactor(_compactor),
+            column_family_name(_column_family_name),
+            input_file_names(_input_file_names),
+            output_level(_output_level),
+            compact_options(_compact_options),
+            retry_on_fail(_retry_on_fail) {}
   DB* db;
   Compactor* compactor;
   const std::string& column_family_name;
@@ -67,17 +67,14 @@ class FullCompactor : public Compactor {
         options_.target_file_size_base;
   }
 
-  // When flush happens, it determins whether to trigger compaction.
-  // If triggered_writes_stop is true, it will also set the retry
-  // flag of compaction-task to true.
+  // When flush happens, it determines whether to trigger compaction. If
+  // triggered_writes_stop is true, it will also set the retry flag of
+  // compaction-task to true.
   void OnFlushCompleted(
-      DB* db, const std::string& cf_name,
-      const std::string& file_path,
-      bool triggered_writes_slowdown,
-      bool triggered_writes_stop) override {
-    CompactionTask* task = PickCompaction(db, cf_name);
+      DB* db, const FlushJobInfo& info) override {
+    CompactionTask* task = PickCompaction(db, info.cf_name);
     if (task != nullptr) {
-      if (triggered_writes_stop) {
+      if (info.triggered_writes_stop) {
         task->retry_on_fail = true;
       }
       // Schedule compaction in a different thread.
@@ -111,7 +108,8 @@ class FullCompactor : public Compactor {
   }
 
   static void CompactFiles(void* arg) {
-    CompactionTask* task = reinterpret_cast<CompactionTask*>(arg);
+    std::unique_ptr<CompactionTask> task(
+        reinterpret_cast<CompactionTask*>(arg));
     assert(task);
     assert(task->db);
     Status s = task->db->CompactFiles(
@@ -127,8 +125,6 @@ class FullCompactor : public Compactor {
           task->db, task->column_family_name);
       task->compactor->ScheduleCompaction(new_task);
     }
-    // release the task
-    delete task;
   }
 
  private:
diff --git a/src/rocksdb/examples/compaction_filter_example.cc b/src/rocksdb/examples/compaction_filter_example.cc
new file mode 100644
index 0000000..050f461
--- /dev/null
+++ b/src/rocksdb/examples/compaction_filter_example.cc
@@ -0,0 +1,84 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#include <rocksdb/compaction_filter.h>
+#include <rocksdb/db.h>
+#include <rocksdb/merge_operator.h>
+#include <rocksdb/options.h>
+
+class MyMerge : public rocksdb::MergeOperator {
+ public:
+  bool FullMerge(const rocksdb::Slice& key,
+                 const rocksdb::Slice* existing_value,
+                 const std::deque<std::string>& operand_list,
+                 std::string* new_value,
+                 rocksdb::Logger* logger) const override {
+    new_value->clear();
+    if (existing_value != nullptr) {
+      new_value->assign(existing_value->data(), existing_value->size());
+    }
+    for (const std::string& m : operand_list) {
+      fprintf(stderr, "Merge(%s)\n", m.c_str());
+      assert(m != "bad");  // the compaction filter filters out bad values
+      new_value->assign(m);
+    }
+    return true;
+  }
+
+  const char* Name() const override { return "MyMerge"; }
+};
+
+class MyFilter : public rocksdb::CompactionFilter {
+ public:
+  bool Filter(int level, const rocksdb::Slice& key,
+              const rocksdb::Slice& existing_value, std::string* new_value,
+              bool* value_changed) const override {
+    fprintf(stderr, "Filter(%s)\n", key.ToString().c_str());
+    ++count_;
+    assert(*value_changed == false);
+    return false;
+  }
+
+  bool FilterMergeOperand(int level, const rocksdb::Slice& key,
+                          const rocksdb::Slice& existing_value) const override {
+    fprintf(stderr, "FilterMerge(%s)\n", key.ToString().c_str());
+    ++merge_count_;
+    return existing_value == "bad";
+  }
+
+  const char* Name() const override { return "MyFilter"; }
+
+  mutable int count_ = 0;
+  mutable int merge_count_ = 0;
+};
+
+int main() {
+  rocksdb::DB* raw_db;
+  rocksdb::Status status;
+
+  MyFilter filter;
+
+  system("rm -rf /tmp/rocksmergetest");
+  rocksdb::Options options;
+  options.create_if_missing = true;
+  options.merge_operator.reset(new MyMerge);
+  options.compaction_filter = &filter;
+  status = rocksdb::DB::Open(options, "/tmp/rocksmergetest", &raw_db);
+  assert(status.ok());
+  std::unique_ptr<rocksdb::DB> db(raw_db);
+
+  rocksdb::WriteOptions wopts;
+  db->Merge(wopts, "0", "bad");  // This is filtered out
+  db->Merge(wopts, "1", "data1");
+  db->Merge(wopts, "1", "bad");
+  db->Merge(wopts, "1", "data2");
+  db->Merge(wopts, "1", "bad");
+  db->Merge(wopts, "3", "data3");
+  db->CompactRange(rocksdb::CompactRangeOptions(), nullptr, nullptr);
+  fprintf(stderr, "filter.count_ = %d\n", filter.count_);
+  assert(filter.count_ == 1);
+  fprintf(stderr, "filter.merge_count_ = %d\n", filter.merge_count_);
+  assert(filter.merge_count_ == 5);
+}
diff --git a/src/rocksdb/examples/optimistic_transaction_example.cc b/src/rocksdb/examples/optimistic_transaction_example.cc
new file mode 100644
index 0000000..e9ab0e5
--- /dev/null
+++ b/src/rocksdb/examples/optimistic_transaction_example.cc
@@ -0,0 +1,142 @@
+// Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/utilities/transaction.h"
+#include "rocksdb/utilities/optimistic_transaction_db.h"
+
+using namespace rocksdb;
+
+std::string kDBPath = "/tmp/rocksdb_transaction_example";
+
+int main() {
+  // open DB
+  Options options;
+  options.create_if_missing = true;
+  DB* db;
+  OptimisticTransactionDB* txn_db;
+
+  Status s = OptimisticTransactionDB::Open(options, kDBPath, &txn_db);
+  assert(s.ok());
+  db = txn_db->GetBaseDB();
+
+  WriteOptions write_options;
+  ReadOptions read_options;
+  OptimisticTransactionOptions txn_options;
+  std::string value;
+
+  ////////////////////////////////////////////////////////
+  //
+  // Simple OptimisticTransaction Example ("Read Committed")
+  //
+  ////////////////////////////////////////////////////////
+
+  // Start a transaction
+  Transaction* txn = txn_db->BeginTransaction(write_options);
+  assert(txn);
+
+  // Read a key in this transaction
+  s = txn->Get(read_options, "abc", &value);
+  assert(s.IsNotFound());
+
+  // Write a key in this transaction
+  txn->Put("abc", "def");
+
+  // Read a key OUTSIDE this transaction. Does not affect txn.
+  s = db->Get(read_options, "abc", &value);
+
+  // Write a key OUTSIDE of this transaction.
+  // Does not affect txn since this is an unrelated key.  If we wrote key 'abc'
+  // here, the transaction would fail to commit.
+  s = db->Put(write_options, "xyz", "zzz");
+
+  // Commit transaction
+  s = txn->Commit();
+  assert(s.ok());
+  delete txn;
+
+  ////////////////////////////////////////////////////////
+  //
+  // "Repeatable Read" (Snapshot Isolation) Example
+  //   -- Using a single Snapshot
+  //
+  ////////////////////////////////////////////////////////
+
+  // Set a snapshot at start of transaction by setting set_snapshot=true
+  txn_options.set_snapshot = true;
+  txn = txn_db->BeginTransaction(write_options, txn_options);
+
+  const Snapshot* snapshot = txn->GetSnapshot();
+
+  // Write a key OUTSIDE of transaction
+  db->Put(write_options, "abc", "xyz");
+
+  // Read a key using the snapshot
+  read_options.snapshot = snapshot;
+  s = txn->GetForUpdate(read_options, "abc", &value);
+  assert(value == "def");
+
+  // Attempt to commit transaction
+  s = txn->Commit();
+
+  // Transaction could not commit since the write outside of the txn conflicted
+  // with the read!
+  assert(s.IsBusy());
+
+  delete txn;
+  // Clear snapshot from read options since it is no longer valid
+  read_options.snapshot = nullptr;
+  snapshot = nullptr;
+
+  ////////////////////////////////////////////////////////
+  //
+  // "Read Committed" (Monotonic Atomic Views) Example
+  //   --Using multiple Snapshots
+  //
+  ////////////////////////////////////////////////////////
+
+  // In this example, we set the snapshot multiple times.  This is probably
+  // only necessary if you need to implement very strict isolation
+  // requirements.
+
+  // Set a snapshot at start of transaction
+  txn_options.set_snapshot = true;
+  txn = txn_db->BeginTransaction(write_options, txn_options);
+
+  // Do some reads and writes to key "x"
+  read_options.snapshot = db->GetSnapshot();
+  s = txn->Get(read_options, "x", &value);
+  txn->Put("x", "x");
+
+  // Do a write outside of the transaction to key "y"
+  s = db->Put(write_options, "y", "y");
+
+  // Set a new snapshot in the transaction
+  txn->SetSnapshot();
+  read_options.snapshot = db->GetSnapshot();
+
+  // Do some reads and writes to key "y"
+  s = txn->GetForUpdate(read_options, "y", &value);
+  txn->Put("y", "y");
+
+  // Commit.  Since the snapshot was advanced, the write done outside of the
+  // transaction does not prevent this transaction from committing.
+  s = txn->Commit();
+  assert(s.ok());
+  delete txn;
+  // Clear snapshot from read options since it is no longer valid
+  read_options.snapshot = nullptr;
+
+  // Cleanup
+  delete txn_db;
+  DestroyDB(kDBPath, options);
+  return 0;
+}
+
+#endif  // ROCKSDB_LITE
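
Because an optimistic transaction validates only at Commit(), the IsBusy()
failure demonstrated above is normally handled with a retry loop.  A
minimal sketch under the same API (the Increment name and the "counter"
key are illustrative, not upstream code):

    // Sketch: retry an optimistic transaction until it commits cleanly.
    rocksdb::Status Increment(rocksdb::OptimisticTransactionDB* txn_db,
                              const rocksdb::WriteOptions& write_options) {
      rocksdb::Status s;
      do {
        rocksdb::Transaction* txn = txn_db->BeginTransaction(write_options);
        std::string value;
        s = txn->GetForUpdate(rocksdb::ReadOptions(), "counter", &value);
        if (s.ok() || s.IsNotFound()) {
          txn->Put("counter", value + "x");  // stand-in for a real update
          s = txn->Commit();  // returns IsBusy() on a write conflict
        }
        delete txn;
      } while (s.IsBusy());
      return s;
    }
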
diff --git a/src/rocksdb/examples/rocksdb_option_file_example.ini b/src/rocksdb/examples/rocksdb_option_file_example.ini
new file mode 100644
index 0000000..ce74f77
--- /dev/null
+++ b/src/rocksdb/examples/rocksdb_option_file_example.ini
@@ -0,0 +1,53 @@
+# This is a RocksDB option file.
+#
+# A typical RocksDB options file has three kinds of sections:
+# Version, DBOptions, and one or more CFOptions.  The RocksDB
+# options file in general follows the basic INI file format
+# with the following extensions / modifications:
+#
+#  * Escaped characters
+#    The following characters are escaped:
+#     - \n -- line feed - new line
+#     - \r -- carriage return
+#     - \\ -- backslash \
+#     - \: -- colon symbol :
+#     - \# -- hash tag #
+#  * Comments
+#    We support # style comments.  Comments can also appear at the
+#    end of a line.
+#  * Statements
+#    A statement is of the form option_name = value.
+#    Each statement contains a '=', and extra whitespace around it
+#    is allowed.  Multi-line statements are not supported, and each
+#    line can contain at most one statement.
+#  * Section
+#    Sections are of the form [SectionTitle "SectionArgument"],
+#    where the section argument is optional.
+#  * List
+#    We use a colon-separated string to represent a list.
+#    For instance, n1:n2:n3:n4 is a list containing four values.
+#
+# Below is an example of a RocksDB options file:
+[Version]
+  # The Version section stores the version information about rocksdb
+  # and the options file.  This is used to handle potential format
+  # changes in the future.
+  rocksdb_version=4.0.0  # We support "#" style comments.
+  options_file_version=1.0
+[DBOptions]
+  # The DBOptions section follows the Version section.
+  # The value of an option is assigned using a statement.
+  # Note that any option that is not set in the options file
+  # falls back to its default value.
+  max_open_files=12345
+  max_background_flushes=301
+[CFOptions "default"]
+  # A ColumnFamilyOptions section must follow the format
+  # [CFOptions "cf name"].  If a rocksdb instance
+  # has multiple column families, their CFOptions sections must
+  # appear in column family creation order.
+[CFOptions "the second column family"]
+  # Each column family must have one section in the RocksDB options
+  # file, even if all of its options are set to
+  # default values.
+[CFOptions "the third column family"]
diff --git a/src/rocksdb/examples/transaction_example.cc b/src/rocksdb/examples/transaction_example.cc
new file mode 100644
index 0000000..a7d5061
--- /dev/null
+++ b/src/rocksdb/examples/transaction_example.cc
@@ -0,0 +1,144 @@
+// Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/utilities/transaction.h"
+#include "rocksdb/utilities/transaction_db.h"
+
+using namespace rocksdb;
+
+std::string kDBPath = "/tmp/rocksdb_transaction_example";
+
+int main() {
+  // open DB
+  Options options;
+  TransactionDBOptions txn_db_options;
+  options.create_if_missing = true;
+  TransactionDB* txn_db;
+
+  Status s = TransactionDB::Open(options, txn_db_options, kDBPath, &txn_db);
+  assert(s.ok());
+
+  WriteOptions write_options;
+  ReadOptions read_options;
+  TransactionOptions txn_options;
+  std::string value;
+
+  ////////////////////////////////////////////////////////
+  //
+  // Simple Transaction Example ("Read Committed")
+  //
+  ////////////////////////////////////////////////////////
+
+  // Start a transaction
+  Transaction* txn = txn_db->BeginTransaction(write_options);
+  assert(txn);
+
+  // Read a key in this transaction
+  s = txn->Get(read_options, "abc", &value);
+  assert(s.IsNotFound());
+
+  // Write a key in this transaction
+  s = txn->Put("abc", "def");
+  assert(s.ok());
+
+  // Read a key OUTSIDE this transaction. Does not affect txn.
+  s = txn_db->Get(read_options, "abc", &value);
+
+  // Write a key OUTSIDE of this transaction.
+  // Does not affect txn since this is an unrelated key.  If we wrote key 'abc'
+  // here, the transaction would fail to commit.
+  s = txn_db->Put(write_options, "xyz", "zzz");
+
+  // Commit transaction
+  s = txn->Commit();
+  assert(s.ok());
+  delete txn;
+
+  ////////////////////////////////////////////////////////
+  //
+  // "Repeatable Read" (Snapshot Isolation) Example
+  //   -- Using a single Snapshot
+  //
+  ////////////////////////////////////////////////////////
+
+  // Set a snapshot at start of transaction by setting set_snapshot=true
+  txn_options.set_snapshot = true;
+  txn = txn_db->BeginTransaction(write_options, txn_options);
+
+  const Snapshot* snapshot = txn->GetSnapshot();
+
+  // Write a key OUTSIDE of transaction
+  s = txn_db->Put(write_options, "abc", "xyz");
+  assert(s.ok());
+
+  // Attempt to read a key using the snapshot.  This will fail since
+  // the previous write outside this txn conflicts with this read.
+  read_options.snapshot = snapshot;
+  s = txn->GetForUpdate(read_options, "abc", &value);
+  assert(s.IsBusy());
+
+  txn->Rollback();
+
+  delete txn;
+  // Clear snapshot from read options since it is no longer valid
+  read_options.snapshot = nullptr;
+  snapshot = nullptr;
+
+  ////////////////////////////////////////////////////////
+  //
+  // "Read Committed" (Monotonic Atomic Views) Example
+  //   --Using multiple Snapshots
+  //
+  ////////////////////////////////////////////////////////
+
+  // In this example, we set the snapshot multiple times.  This is probably
+  // only necessary if you need to implement very strict isolation
+  // requirements.
+
+  // Set a snapshot at start of transaction
+  txn_options.set_snapshot = true;
+  txn = txn_db->BeginTransaction(write_options, txn_options);
+
+  // Do some reads and writes to key "x"
+  read_options.snapshot = txn_db->GetSnapshot();
+  s = txn->Get(read_options, "x", &value);
+  txn->Put("x", "x");
+
+  // Do a write outside of the transaction to key "y"
+  s = txn_db->Put(write_options, "y", "y");
+
+  // Set a new snapshot in the transaction
+  txn->SetSnapshot();
+  txn->SetSavePoint();
+  read_options.snapshot = txn_db->GetSnapshot();
+
+  // Do some reads and writes to key "y"
+  // Since the snapshot was advanced, the write done outside of the
+  // transaction does not conflict.
+  s = txn->GetForUpdate(read_options, "y", &value);
+  txn->Put("y", "y");
+
+  // Decide we want to revert the last write from this transaction.
+  txn->RollbackToSavePoint();
+
+  // Commit.
+  s = txn->Commit();
+  assert(s.ok());
+  delete txn;
+  // Clear snapshot from read options since it is no longer valid
+  read_options.snapshot = nullptr;
+
+  // Cleanup
+  delete txn_db;
+  DestroyDB(kDBPath, options);
+  return 0;
+}
+
+#endif  // ROCKSDB_LITE
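
Unlike the optimistic variant, a TransactionDB takes locks as keys are
written, so conflicts surface on Put()/GetForUpdate() rather than at
Commit().  A small sketch of that behavior, assuming
TransactionOptions::lock_timeout is a wait budget in milliseconds as in
upstream RocksDB (the key and values are illustrative):

    // Sketch: a second transaction writing a key locked by the first
    // times out instead of blocking indefinitely.
    rocksdb::TransactionOptions topts;
    topts.lock_timeout = 100;  // assumed: lock wait budget in ms

    rocksdb::Transaction* t1 = txn_db->BeginTransaction(write_options);
    rocksdb::Transaction* t2 = txn_db->BeginTransaction(write_options, topts);

    assert(t1->Put("k", "v1").ok());  // t1 now holds the lock on "k"
    rocksdb::Status s2 = t2->Put("k", "v2");
    assert(s2.IsTimedOut());          // t2 gave up after lock_timeout

    t2->Rollback();
    assert(t1->Commit().ok());
    delete t1;
    delete t2;
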
diff --git a/src/rocksdb/hdfs/env_hdfs.h b/src/rocksdb/hdfs/env_hdfs.h
index cc94d52..e1e9430 100644
--- a/src/rocksdb/hdfs/env_hdfs.h
+++ b/src/rocksdb/hdfs/env_hdfs.h
@@ -7,9 +7,9 @@
 #pragma once
 #include <algorithm>
 #include <stdio.h>
-#include <sys/time.h>
 #include <time.h>
 #include <iostream>
+#include "port/sys_time.h"
 #include "rocksdb/env.h"
 #include "rocksdb/status.h"
 
@@ -66,14 +66,10 @@ class HdfsEnv : public Env {
                                  std::unique_ptr<WritableFile>* result,
                                  const EnvOptions& options);
 
-  virtual Status NewRandomRWFile(const std::string& fname,
-                                 std::unique_ptr<RandomRWFile>* result,
-                                 const EnvOptions& options);
-
   virtual Status NewDirectory(const std::string& name,
                               std::unique_ptr<Directory>* result);
 
-  virtual bool FileExists(const std::string& fname);
+  virtual Status FileExists(const std::string& fname);
 
   virtual Status GetChildren(const std::string& path,
                              std::vector<std::string>* result);
@@ -164,6 +160,10 @@ class HdfsEnv : public Env {
     return (uint64_t)pthread_self();
   }
 
+  virtual uint64_t GetThreadID() const override {
+    return HdfsEnv::gettid();
+  }
+
  private:
   std::string fsname_;  // string of the form "hdfs://hostname:port/"
   hdfsFS fileSys_;      //  a single FileSystem object for all files
@@ -264,18 +264,14 @@ class HdfsEnv : public Env {
     return notsup;
   }
 
-  virtual Status NewRandomRWFile(const std::string& fname,
-                                 unique_ptr<RandomRWFile>* result,
-                                 const EnvOptions& options) override {
-    return notsup;
-  }
-
   virtual Status NewDirectory(const std::string& name,
                               unique_ptr<Directory>* result) override {
     return notsup;
   }
 
-  virtual bool FileExists(const std::string& fname) override { return false; }
+  virtual Status FileExists(const std::string& fname) override {
+    return notsup;
+  }
 
   virtual Status GetChildren(const std::string& path,
                              std::vector<std::string>* result) override {
@@ -360,6 +356,10 @@ class HdfsEnv : public Env {
   virtual void IncBackgroundThreadsIfNeeded(int number, Priority pri) override {
   }
   virtual std::string TimeToString(uint64_t number) override { return ""; }
+
+  virtual uint64_t GetThreadID() const override {
+    return 0;
+  }
 };
 }
 
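The FileExists() change in this header (bool -> Status) lets callers
distinguish "definitely absent" from "could not check".  A hedged
call-site sketch (env and fname are illustrative):

    // Sketch: Status-based existence check against a rocksdb::Env*.
    rocksdb::Status s = env->FileExists(fname);
    if (s.ok()) {
      // file exists
    } else if (s.IsNotFound()) {
      // file is definitely absent
    } else {
      // I/O or permission error: propagate s rather than guessing
    }
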
diff --git a/src/rocksdb/include/rocksdb/c.h b/src/rocksdb/include/rocksdb/c.h
index 9b92068..782d10b 100644
--- a/src/rocksdb/include/rocksdb/c.h
+++ b/src/rocksdb/include/rocksdb/c.h
@@ -44,6 +44,22 @@
 #ifndef STORAGE_ROCKSDB_INCLUDE_C_H_
 #define STORAGE_ROCKSDB_INCLUDE_C_H_
 
+#pragma once
+
+#ifdef _WIN32
+#ifdef ROCKSDB_DLL
+#ifdef ROCKSDB_LIBRARY_EXPORTS
+#define ROCKSDB_LIBRARY_API __declspec(dllexport)
+#else
+#define ROCKSDB_LIBRARY_API __declspec(dllimport)
+#endif
+#else
+#define ROCKSDB_LIBRARY_API
+#endif
+#else
+#define ROCKSDB_LIBRARY_API
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
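
The ROCKSDB_LIBRARY_API macro added above expands to
__declspec(dllexport) when building a Windows DLL, __declspec(dllimport)
when consuming one, and to nothing otherwise.  A sketch of the consumer
side (in practice the define comes from a compiler flag; placing it
before the include here is only for illustration):

    /* Sketch: a client of the rocksdb Windows DLL defines ROCKSDB_DLL
       but not ROCKSDB_LIBRARY_EXPORTS, so every declaration in c.h is
       marked __declspec(dllimport). */
    #define ROCKSDB_DLL
    #include <rocksdb/c.h>
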
@@ -64,10 +80,6 @@ typedef struct rocksdb_compactionfiltercontext_t
     rocksdb_compactionfiltercontext_t;
 typedef struct rocksdb_compactionfilterfactory_t
     rocksdb_compactionfilterfactory_t;
-typedef struct rocksdb_compactionfilterv2_t
-    rocksdb_compactionfilterv2_t;
-typedef struct rocksdb_compactionfilterfactoryv2_t
-    rocksdb_compactionfilterfactoryv2_t;
 typedef struct rocksdb_comparator_t      rocksdb_comparator_t;
 typedef struct rocksdb_env_t             rocksdb_env_t;
 typedef struct rocksdb_fifo_compaction_options_t rocksdb_fifo_compaction_options_t;
@@ -96,522 +108,566 @@ typedef struct rocksdb_column_family_handle_t rocksdb_column_family_handle_t;
 
 /* DB operations */
 
-extern rocksdb_t* rocksdb_open(
-    const rocksdb_options_t* options,
-    const char* name,
-    char** errptr);
+extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open(
+    const rocksdb_options_t* options, const char* name, char** errptr);
 
-extern rocksdb_t* rocksdb_open_for_read_only(
-    const rocksdb_options_t* options,
-    const char* name,
-    unsigned char error_if_log_file_exist,
-    char** errptr);
+extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_for_read_only(
+    const rocksdb_options_t* options, const char* name,
+    unsigned char error_if_log_file_exist, char** errptr);
 
-extern rocksdb_backup_engine_t* rocksdb_backup_engine_open(
-    const rocksdb_options_t* options,
-    const char* path,
-    char** errptr);
+extern ROCKSDB_LIBRARY_API rocksdb_backup_engine_t* rocksdb_backup_engine_open(
+    const rocksdb_options_t* options, const char* path, char** errptr);
 
-extern void rocksdb_backup_engine_create_new_backup(
-    rocksdb_backup_engine_t* be,
-    rocksdb_t* db,
-    char** errptr);
+extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_create_new_backup(
+    rocksdb_backup_engine_t* be, rocksdb_t* db, char** errptr);
 
-extern rocksdb_restore_options_t* rocksdb_restore_options_create();
-extern void rocksdb_restore_options_destroy(rocksdb_restore_options_t* opt);
-extern void rocksdb_restore_options_set_keep_log_files(
+extern ROCKSDB_LIBRARY_API rocksdb_restore_options_t*
+rocksdb_restore_options_create();
+extern ROCKSDB_LIBRARY_API void rocksdb_restore_options_destroy(
+    rocksdb_restore_options_t* opt);
+extern ROCKSDB_LIBRARY_API void rocksdb_restore_options_set_keep_log_files(
     rocksdb_restore_options_t* opt, int v);
 
-extern void rocksdb_backup_engine_restore_db_from_latest_backup(
-    rocksdb_backup_engine_t *be,
-    const char* db_dir,
-    const char* wal_dir,
-    const rocksdb_restore_options_t *restore_options,
-    char** errptr);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_backup_engine_restore_db_from_latest_backup(
+    rocksdb_backup_engine_t* be, const char* db_dir, const char* wal_dir,
+    const rocksdb_restore_options_t* restore_options, char** errptr);
 
-extern const rocksdb_backup_engine_info_t* rocksdb_backup_engine_get_backup_info(
-    rocksdb_backup_engine_t* be);
+extern ROCKSDB_LIBRARY_API const rocksdb_backup_engine_info_t*
+rocksdb_backup_engine_get_backup_info(rocksdb_backup_engine_t* be);
 
-extern int rocksdb_backup_engine_info_count(
+extern ROCKSDB_LIBRARY_API int rocksdb_backup_engine_info_count(
     const rocksdb_backup_engine_info_t* info);
 
-extern int64_t rocksdb_backup_engine_info_timestamp(
-    const rocksdb_backup_engine_info_t* info,
-    int index);
+extern ROCKSDB_LIBRARY_API int64_t
+rocksdb_backup_engine_info_timestamp(const rocksdb_backup_engine_info_t* info,
+                                     int index);
 
-extern uint32_t rocksdb_backup_engine_info_backup_id(
-    const rocksdb_backup_engine_info_t* info,
-    int index);
+extern ROCKSDB_LIBRARY_API uint32_t
+rocksdb_backup_engine_info_backup_id(const rocksdb_backup_engine_info_t* info,
+                                     int index);
 
-extern uint64_t rocksdb_backup_engine_info_size(
-    const rocksdb_backup_engine_info_t* info,
-    int index);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_backup_engine_info_size(const rocksdb_backup_engine_info_t* info,
+                                int index);
 
-extern uint32_t rocksdb_backup_engine_info_number_files(
-    const rocksdb_backup_engine_info_t* info,
-    int index);
+extern ROCKSDB_LIBRARY_API uint32_t rocksdb_backup_engine_info_number_files(
+    const rocksdb_backup_engine_info_t* info, int index);
 
-extern void rocksdb_backup_engine_info_destroy(
-    const rocksdb_backup_engine_info_t *info);
+extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_info_destroy(
+    const rocksdb_backup_engine_info_t* info);
 
-extern void rocksdb_backup_engine_close(
+extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_close(
     rocksdb_backup_engine_t* be);
 
-extern rocksdb_t* rocksdb_open_column_families(
-    const rocksdb_options_t* options,
-    const char* name,
-    int num_column_families,
+extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_column_families(
+    const rocksdb_options_t* options, const char* name, int num_column_families,
     const char** column_family_names,
     const rocksdb_options_t** column_family_options,
-    rocksdb_column_family_handle_t** column_family_handles,
-    char** errptr);
+    rocksdb_column_family_handle_t** column_family_handles, char** errptr);
 
-extern rocksdb_t* rocksdb_open_for_read_only_column_families(
-    const rocksdb_options_t* options,
-    const char* name,
-    int num_column_families,
+extern ROCKSDB_LIBRARY_API rocksdb_t*
+rocksdb_open_for_read_only_column_families(
+    const rocksdb_options_t* options, const char* name, int num_column_families,
     const char** column_family_names,
     const rocksdb_options_t** column_family_options,
     rocksdb_column_family_handle_t** column_family_handles,
-    unsigned char error_if_log_file_exist,
-    char** errptr);
+    unsigned char error_if_log_file_exist, char** errptr);
 
-char** rocksdb_list_column_families(
-    const rocksdb_options_t* options,
-    const char* name,
-    size_t* lencf,
+extern ROCKSDB_LIBRARY_API char** rocksdb_list_column_families(
+    const rocksdb_options_t* options, const char* name, size_t* lencf,
     char** errptr);
-void rocksdb_list_column_families_destroy(char** list, size_t len);
 
-extern rocksdb_column_family_handle_t* rocksdb_create_column_family(
-    rocksdb_t* db,
-    const rocksdb_options_t* column_family_options,
-    const char* column_family_name,
-    char** errptr);
+extern ROCKSDB_LIBRARY_API void rocksdb_list_column_families_destroy(
+    char** list, size_t len);
 
-extern void rocksdb_drop_column_family(
-    rocksdb_t* db,
-    rocksdb_column_family_handle_t* handle,
-    char** errptr);
+extern ROCKSDB_LIBRARY_API rocksdb_column_family_handle_t*
+rocksdb_create_column_family(rocksdb_t* db,
+                             const rocksdb_options_t* column_family_options,
+                             const char* column_family_name, char** errptr);
 
-extern void rocksdb_column_family_handle_destroy(rocksdb_column_family_handle_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_drop_column_family(
+    rocksdb_t* db, rocksdb_column_family_handle_t* handle, char** errptr);
 
-extern void rocksdb_close(rocksdb_t* db);
+extern ROCKSDB_LIBRARY_API void rocksdb_column_family_handle_destroy(
+    rocksdb_column_family_handle_t*);
 
-extern void rocksdb_put(
-    rocksdb_t* db,
-    const rocksdb_writeoptions_t* options,
-    const char* key, size_t keylen,
-    const char* val, size_t vallen,
-    char** errptr);
+extern ROCKSDB_LIBRARY_API void rocksdb_close(rocksdb_t* db);
 
-extern void rocksdb_put_cf(
-    rocksdb_t* db,
-    const rocksdb_writeoptions_t* options,
-    rocksdb_column_family_handle_t* column_family,
-    const char* key, size_t keylen,
-    const char* val, size_t vallen,
-    char** errptr);
+extern ROCKSDB_LIBRARY_API void rocksdb_put(
+    rocksdb_t* db, const rocksdb_writeoptions_t* options, const char* key,
+    size_t keylen, const char* val, size_t vallen, char** errptr);
 
-extern void rocksdb_delete(
-    rocksdb_t* db,
-    const rocksdb_writeoptions_t* options,
-    const char* key, size_t keylen,
-    char** errptr);
+extern ROCKSDB_LIBRARY_API void rocksdb_put_cf(
+    rocksdb_t* db, const rocksdb_writeoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    size_t keylen, const char* val, size_t vallen, char** errptr);
 
-void rocksdb_delete_cf(
-    rocksdb_t* db,
-    const rocksdb_writeoptions_t* options,
-    rocksdb_column_family_handle_t* column_family,
-    const char* key, size_t keylen,
-    char** errptr);
+extern ROCKSDB_LIBRARY_API void rocksdb_delete(
+    rocksdb_t* db, const rocksdb_writeoptions_t* options, const char* key,
+    size_t keylen, char** errptr);
 
-extern void rocksdb_merge(
-    rocksdb_t* db,
-    const rocksdb_writeoptions_t* options,
-    const char* key, size_t keylen,
-    const char* val, size_t vallen,
-    char** errptr);
+extern ROCKSDB_LIBRARY_API void rocksdb_delete_cf(
+    rocksdb_t* db, const rocksdb_writeoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    size_t keylen, char** errptr);
 
-extern void rocksdb_merge_cf(
-    rocksdb_t* db,
-    const rocksdb_writeoptions_t* options,
-    rocksdb_column_family_handle_t* column_family,
-    const char* key, size_t keylen,
-    const char* val, size_t vallen,
-    char** errptr);
+extern ROCKSDB_LIBRARY_API void rocksdb_merge(
+    rocksdb_t* db, const rocksdb_writeoptions_t* options, const char* key,
+    size_t keylen, const char* val, size_t vallen, char** errptr);
 
-extern void rocksdb_write(
-    rocksdb_t* db,
-    const rocksdb_writeoptions_t* options,
-    rocksdb_writebatch_t* batch,
-    char** errptr);
+extern ROCKSDB_LIBRARY_API void rocksdb_merge_cf(
+    rocksdb_t* db, const rocksdb_writeoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    size_t keylen, const char* val, size_t vallen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_write(
+    rocksdb_t* db, const rocksdb_writeoptions_t* options,
+    rocksdb_writebatch_t* batch, char** errptr);
 
 /* Returns NULL if not found.  A malloc()ed array otherwise.
    Stores the length of the array in *vallen. */
-extern char* rocksdb_get(
-    rocksdb_t* db,
-    const rocksdb_readoptions_t* options,
-    const char* key, size_t keylen,
-    size_t* vallen,
-    char** errptr);
-
-extern char* rocksdb_get_cf(
-    rocksdb_t* db,
-    const rocksdb_readoptions_t* options,
-    rocksdb_column_family_handle_t* column_family,
-    const char* key, size_t keylen,
-    size_t* vallen,
-    char** errptr);
-
-extern rocksdb_iterator_t* rocksdb_create_iterator(
-    rocksdb_t* db,
-    const rocksdb_readoptions_t* options);
-
-extern rocksdb_iterator_t* rocksdb_create_iterator_cf(
-    rocksdb_t* db,
-    const rocksdb_readoptions_t* options,
+extern ROCKSDB_LIBRARY_API char* rocksdb_get(
+    rocksdb_t* db, const rocksdb_readoptions_t* options, const char* key,
+    size_t keylen, size_t* vallen, char** errptr);
+
+extern ROCKSDB_LIBRARY_API char* rocksdb_get_cf(
+    rocksdb_t* db, const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    size_t keylen, size_t* vallen, char** errptr);
+
+// if values_list[i] == NULL and errs[i] == NULL,
+// then we got status.IsNotFound(), which we will not return.
+// all errors except status.ok() and status.IsNotFound() are returned.
+//
+// errs, values_list and values_list_sizes must be num_keys in length,
+// allocated by the caller.
+// errs is a list of strings as opposed to the conventional one error,
+// where errs[i] is the status for retrieval of keys_list[i].
+// each non-NULL errs entry is a malloc()ed, null terminated string.
+// each non-NULL values_list entry is a malloc()ed array, with
+// the length for each stored in values_list_sizes[i].
+extern ROCKSDB_LIBRARY_API void rocksdb_multi_get(
+    rocksdb_t* db, const rocksdb_readoptions_t* options, size_t num_keys,
+    const char* const* keys_list, const size_t* keys_list_sizes,
+    char** values_list, size_t* values_list_sizes, char** errs);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_multi_get_cf(
+    rocksdb_t* db, const rocksdb_readoptions_t* options,
+    const rocksdb_column_family_handle_t* const* column_families,
+    size_t num_keys, const char* const* keys_list,
+    const size_t* keys_list_sizes, char** values_list,
+    size_t* values_list_sizes, char** errs);
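+// Usage sketch (illustrative, not part of the upstream header): the
+// caller owns every malloc()ed value and error string, e.g.
+//   const char* keys[2] = {"a", "b"};
+//   const size_t key_sizes[2] = {1, 1};
+//   char* vals[2]; size_t val_sizes[2]; char* errs[2];
+//   rocksdb_multi_get(db, ropts, 2, keys, key_sizes, vals, val_sizes, errs);
+//   for (int i = 0; i < 2; i++) { free(vals[i]); free(errs[i]); }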
+
+extern ROCKSDB_LIBRARY_API rocksdb_iterator_t* rocksdb_create_iterator(
+    rocksdb_t* db, const rocksdb_readoptions_t* options);
+
+extern ROCKSDB_LIBRARY_API rocksdb_iterator_t* rocksdb_create_iterator_cf(
+    rocksdb_t* db, const rocksdb_readoptions_t* options,
     rocksdb_column_family_handle_t* column_family);
 
-extern const rocksdb_snapshot_t* rocksdb_create_snapshot(
+extern ROCKSDB_LIBRARY_API const rocksdb_snapshot_t* rocksdb_create_snapshot(
     rocksdb_t* db);
 
-extern void rocksdb_release_snapshot(
-    rocksdb_t* db,
-    const rocksdb_snapshot_t* snapshot);
+extern ROCKSDB_LIBRARY_API void rocksdb_release_snapshot(
+    rocksdb_t* db, const rocksdb_snapshot_t* snapshot);
 
 /* Returns NULL if property name is unknown.
    Else returns a pointer to a malloc()-ed null-terminated value. */
-extern char* rocksdb_property_value(
-    rocksdb_t* db,
-    const char* propname);
+extern ROCKSDB_LIBRARY_API char* rocksdb_property_value(rocksdb_t* db,
+                                                        const char* propname);
 
-extern char* rocksdb_property_value_cf(
-    rocksdb_t* db,
-    rocksdb_column_family_handle_t* column_family,
+extern ROCKSDB_LIBRARY_API char* rocksdb_property_value_cf(
+    rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
     const char* propname);
 
-extern void rocksdb_approximate_sizes(
-    rocksdb_t* db,
-    int num_ranges,
-    const char* const* range_start_key, const size_t* range_start_key_len,
-    const char* const* range_limit_key, const size_t* range_limit_key_len,
-    uint64_t* sizes);
-
-extern void rocksdb_approximate_sizes_cf(
-    rocksdb_t* db,
-    rocksdb_column_family_handle_t* column_family,
-    int num_ranges,
-    const char* const* range_start_key, const size_t* range_start_key_len,
-    const char* const* range_limit_key, const size_t* range_limit_key_len,
-    uint64_t* sizes);
-
-extern void rocksdb_compact_range(
-    rocksdb_t* db,
-    const char* start_key, size_t start_key_len,
-    const char* limit_key, size_t limit_key_len);
-
-extern void rocksdb_compact_range_cf(
-    rocksdb_t* db,
-    rocksdb_column_family_handle_t* column_family,
-    const char* start_key, size_t start_key_len,
-    const char* limit_key, size_t limit_key_len);
-
-extern void rocksdb_delete_file(
-    rocksdb_t* db,
-    const char* name);
-
-extern const rocksdb_livefiles_t* rocksdb_livefiles(
+extern ROCKSDB_LIBRARY_API void rocksdb_approximate_sizes(
+    rocksdb_t* db, int num_ranges, const char* const* range_start_key,
+    const size_t* range_start_key_len, const char* const* range_limit_key,
+    const size_t* range_limit_key_len, uint64_t* sizes);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_approximate_sizes_cf(
+    rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
+    int num_ranges, const char* const* range_start_key,
+    const size_t* range_start_key_len, const char* const* range_limit_key,
+    const size_t* range_limit_key_len, uint64_t* sizes);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_compact_range(rocksdb_t* db,
+                                                      const char* start_key,
+                                                      size_t start_key_len,
+                                                      const char* limit_key,
+                                                      size_t limit_key_len);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_compact_range_cf(
+    rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
+    const char* start_key, size_t start_key_len, const char* limit_key,
+    size_t limit_key_len);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_delete_file(rocksdb_t* db,
+                                                    const char* name);
+
+extern ROCKSDB_LIBRARY_API const rocksdb_livefiles_t* rocksdb_livefiles(
     rocksdb_t* db);
 
-extern void rocksdb_flush(
-    rocksdb_t* db,
-    const rocksdb_flushoptions_t* options,
-    char** errptr);
+extern ROCKSDB_LIBRARY_API void rocksdb_flush(
+    rocksdb_t* db, const rocksdb_flushoptions_t* options, char** errptr);
 
-extern void rocksdb_disable_file_deletions(
-    rocksdb_t* db,
-    char** errptr);
+extern ROCKSDB_LIBRARY_API void rocksdb_disable_file_deletions(rocksdb_t* db,
+                                                               char** errptr);
 
-extern void rocksdb_enable_file_deletions(
-    rocksdb_t* db,
-    unsigned char force,
-    char** errptr);
+extern ROCKSDB_LIBRARY_API void rocksdb_enable_file_deletions(
+    rocksdb_t* db, unsigned char force, char** errptr);
 
 /* Management operations */
 
-extern void rocksdb_destroy_db(
-    const rocksdb_options_t* options,
-    const char* name,
-    char** errptr);
+extern ROCKSDB_LIBRARY_API void rocksdb_destroy_db(
+    const rocksdb_options_t* options, const char* name, char** errptr);
 
-extern void rocksdb_repair_db(
-    const rocksdb_options_t* options,
-    const char* name,
-    char** errptr);
+extern ROCKSDB_LIBRARY_API void rocksdb_repair_db(
+    const rocksdb_options_t* options, const char* name, char** errptr);
 
 /* Iterator */
 
-extern void rocksdb_iter_destroy(rocksdb_iterator_t*);
-extern unsigned char rocksdb_iter_valid(const rocksdb_iterator_t*);
-extern void rocksdb_iter_seek_to_first(rocksdb_iterator_t*);
-extern void rocksdb_iter_seek_to_last(rocksdb_iterator_t*);
-extern void rocksdb_iter_seek(rocksdb_iterator_t*, const char* k, size_t klen);
-extern void rocksdb_iter_next(rocksdb_iterator_t*);
-extern void rocksdb_iter_prev(rocksdb_iterator_t*);
-extern const char* rocksdb_iter_key(const rocksdb_iterator_t*, size_t* klen);
-extern const char* rocksdb_iter_value(const rocksdb_iterator_t*, size_t* vlen);
-extern void rocksdb_iter_get_error(const rocksdb_iterator_t*, char** errptr);
+extern ROCKSDB_LIBRARY_API void rocksdb_iter_destroy(rocksdb_iterator_t*);
+extern ROCKSDB_LIBRARY_API unsigned char rocksdb_iter_valid(
+    const rocksdb_iterator_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_iter_seek_to_first(rocksdb_iterator_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_iter_seek_to_last(rocksdb_iterator_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_iter_seek(rocksdb_iterator_t*,
+                                                  const char* k, size_t klen);
+extern ROCKSDB_LIBRARY_API void rocksdb_iter_next(rocksdb_iterator_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_iter_prev(rocksdb_iterator_t*);
+extern ROCKSDB_LIBRARY_API const char* rocksdb_iter_key(
+    const rocksdb_iterator_t*, size_t* klen);
+extern ROCKSDB_LIBRARY_API const char* rocksdb_iter_value(
+    const rocksdb_iterator_t*, size_t* vlen);
+extern ROCKSDB_LIBRARY_API void rocksdb_iter_get_error(
+    const rocksdb_iterator_t*, char** errptr);
 
 /* Write batch */
 
-extern rocksdb_writebatch_t* rocksdb_writebatch_create();
-extern rocksdb_writebatch_t* rocksdb_writebatch_create_from(const char* rep,
-                                                            size_t size);
-extern void rocksdb_writebatch_destroy(rocksdb_writebatch_t*);
-extern void rocksdb_writebatch_clear(rocksdb_writebatch_t*);
-extern int rocksdb_writebatch_count(rocksdb_writebatch_t*);
-extern void rocksdb_writebatch_put(
-    rocksdb_writebatch_t*,
-    const char* key, size_t klen,
-    const char* val, size_t vlen);
-extern void rocksdb_writebatch_put_cf(
-    rocksdb_writebatch_t*,
-    rocksdb_column_family_handle_t* column_family,
-    const char* key, size_t klen,
-    const char* val, size_t vlen);
-extern void rocksdb_writebatch_merge(
-    rocksdb_writebatch_t*,
-    const char* key, size_t klen,
-    const char* val, size_t vlen);
-extern void rocksdb_writebatch_merge_cf(
-    rocksdb_writebatch_t*,
-    rocksdb_column_family_handle_t* column_family,
-    const char* key, size_t klen,
-    const char* val, size_t vlen);
-extern void rocksdb_writebatch_delete(
-    rocksdb_writebatch_t*,
+extern ROCKSDB_LIBRARY_API rocksdb_writebatch_t* rocksdb_writebatch_create();
+extern ROCKSDB_LIBRARY_API rocksdb_writebatch_t* rocksdb_writebatch_create_from(
+    const char* rep, size_t size);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_destroy(
+    rocksdb_writebatch_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_clear(rocksdb_writebatch_t*);
+extern ROCKSDB_LIBRARY_API int rocksdb_writebatch_count(rocksdb_writebatch_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_put(rocksdb_writebatch_t*,
+                                                       const char* key,
+                                                       size_t klen,
+                                                       const char* val,
+                                                       size_t vlen);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_put_cf(
+    rocksdb_writebatch_t*, rocksdb_column_family_handle_t* column_family,
+    const char* key, size_t klen, const char* val, size_t vlen);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_putv(
+    rocksdb_writebatch_t* b, int num_keys, const char* const* keys_list,
+    const size_t* keys_list_sizes, int num_values,
+    const char* const* values_list, const size_t* values_list_sizes);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_putv_cf(
+    rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
+    int num_keys, const char* const* keys_list, const size_t* keys_list_sizes,
+    int num_values, const char* const* values_list,
+    const size_t* values_list_sizes);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_merge(rocksdb_writebatch_t*,
+                                                         const char* key,
+                                                         size_t klen,
+                                                         const char* val,
+                                                         size_t vlen);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_merge_cf(
+    rocksdb_writebatch_t*, rocksdb_column_family_handle_t* column_family,
+    const char* key, size_t klen, const char* val, size_t vlen);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_mergev(
+    rocksdb_writebatch_t* b, int num_keys, const char* const* keys_list,
+    const size_t* keys_list_sizes, int num_values,
+    const char* const* values_list, const size_t* values_list_sizes);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_mergev_cf(
+    rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
+    int num_keys, const char* const* keys_list, const size_t* keys_list_sizes,
+    int num_values, const char* const* values_list,
+    const size_t* values_list_sizes);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_delete(rocksdb_writebatch_t*,
+                                                          const char* key,
+                                                          size_t klen);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_delete_cf(
+    rocksdb_writebatch_t*, rocksdb_column_family_handle_t* column_family,
     const char* key, size_t klen);
-extern void rocksdb_writebatch_delete_cf(
-    rocksdb_writebatch_t*,
-    rocksdb_column_family_handle_t* column_family,
-    const char* key, size_t klen);
-extern void rocksdb_writebatch_iterate(
-    rocksdb_writebatch_t*,
-    void* state,
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_deletev(
+    rocksdb_writebatch_t* b, int num_keys, const char* const* keys_list,
+    const size_t* keys_list_sizes);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_deletev_cf(
+    rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
+    int num_keys, const char* const* keys_list, const size_t* keys_list_sizes);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_put_log_data(
+    rocksdb_writebatch_t*, const char* blob, size_t len);
+extern ROCKSDB_LIBRARY_API void rocksdb_writebatch_iterate(
+    rocksdb_writebatch_t*, void* state,
     void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
     void (*deleted)(void*, const char* k, size_t klen));
-extern const char* rocksdb_writebatch_data(rocksdb_writebatch_t*, size_t *size);
+extern ROCKSDB_LIBRARY_API const char* rocksdb_writebatch_data(
+    rocksdb_writebatch_t*, size_t* size);
 
 /* Block based table options */
 
-extern rocksdb_block_based_table_options_t*
-    rocksdb_block_based_options_create();
-extern void rocksdb_block_based_options_destroy(
+extern ROCKSDB_LIBRARY_API rocksdb_block_based_table_options_t*
+rocksdb_block_based_options_create();
+extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_destroy(
     rocksdb_block_based_table_options_t* options);
-extern void rocksdb_block_based_options_set_block_size(
+extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_block_size(
     rocksdb_block_based_table_options_t* options, size_t block_size);
-extern void rocksdb_block_based_options_set_block_size_deviation(
+extern ROCKSDB_LIBRARY_API void
+rocksdb_block_based_options_set_block_size_deviation(
     rocksdb_block_based_table_options_t* options, int block_size_deviation);
-extern void rocksdb_block_based_options_set_block_restart_interval(
+extern ROCKSDB_LIBRARY_API void
+rocksdb_block_based_options_set_block_restart_interval(
     rocksdb_block_based_table_options_t* options, int block_restart_interval);
-extern void rocksdb_block_based_options_set_filter_policy(
+extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_filter_policy(
     rocksdb_block_based_table_options_t* options,
     rocksdb_filterpolicy_t* filter_policy);
-extern void rocksdb_block_based_options_set_no_block_cache(
-    rocksdb_block_based_table_options_t* options,
-    unsigned char no_block_cache);
-extern void rocksdb_block_based_options_set_block_cache(
+extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_no_block_cache(
+    rocksdb_block_based_table_options_t* options, unsigned char no_block_cache);
+extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_block_cache(
     rocksdb_block_based_table_options_t* options, rocksdb_cache_t* block_cache);
-extern void rocksdb_block_based_options_set_block_cache_compressed(
+extern ROCKSDB_LIBRARY_API void
+rocksdb_block_based_options_set_block_cache_compressed(
     rocksdb_block_based_table_options_t* options,
     rocksdb_cache_t* block_cache_compressed);
-extern void rocksdb_block_based_options_set_whole_key_filtering(
+extern ROCKSDB_LIBRARY_API void
+rocksdb_block_based_options_set_whole_key_filtering(
+    rocksdb_block_based_table_options_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_format_version(
+    rocksdb_block_based_table_options_t*, int);
+enum {
+  rocksdb_block_based_table_index_type_binary_search = 0,
+  rocksdb_block_based_table_index_type_hash_search = 1,
+};
+extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_index_type(
+    rocksdb_block_based_table_options_t*, int);  // uses one of the above enums
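+// Usage sketch (illustrative, not part of the upstream header):
+//   rocksdb_block_based_options_set_index_type(
+//       table_opts, rocksdb_block_based_table_index_type_hash_search);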
+extern ROCKSDB_LIBRARY_API void
+rocksdb_block_based_options_set_hash_index_allow_collision(
     rocksdb_block_based_table_options_t*, unsigned char);
-extern void rocksdb_options_set_block_based_table_factory(
-    rocksdb_options_t *opt, rocksdb_block_based_table_options_t* table_options);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_block_based_options_set_cache_index_and_filter_blocks(
+    rocksdb_block_based_table_options_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_block_based_table_factory(
+    rocksdb_options_t* opt, rocksdb_block_based_table_options_t* table_options);
 
 /* Cuckoo table options */
 
-extern rocksdb_cuckoo_table_options_t*
-    rocksdb_cuckoo_options_create();
-extern void rocksdb_cuckoo_options_destroy(
+extern ROCKSDB_LIBRARY_API rocksdb_cuckoo_table_options_t*
+rocksdb_cuckoo_options_create();
+extern ROCKSDB_LIBRARY_API void rocksdb_cuckoo_options_destroy(
     rocksdb_cuckoo_table_options_t* options);
-extern void rocksdb_cuckoo_options_set_hash_ratio(
+extern ROCKSDB_LIBRARY_API void rocksdb_cuckoo_options_set_hash_ratio(
     rocksdb_cuckoo_table_options_t* options, double v);
-extern void rocksdb_cuckoo_options_set_max_search_depth(
+extern ROCKSDB_LIBRARY_API void rocksdb_cuckoo_options_set_max_search_depth(
     rocksdb_cuckoo_table_options_t* options, uint32_t v);
-extern void rocksdb_cuckoo_options_set_cuckoo_block_size(
+extern ROCKSDB_LIBRARY_API void rocksdb_cuckoo_options_set_cuckoo_block_size(
     rocksdb_cuckoo_table_options_t* options, uint32_t v);
-extern void rocksdb_cuckoo_options_set_identity_as_first_hash(
+extern ROCKSDB_LIBRARY_API void
+rocksdb_cuckoo_options_set_identity_as_first_hash(
     rocksdb_cuckoo_table_options_t* options, unsigned char v);
-extern void rocksdb_cuckoo_options_set_use_module_hash(
+extern ROCKSDB_LIBRARY_API void rocksdb_cuckoo_options_set_use_module_hash(
     rocksdb_cuckoo_table_options_t* options, unsigned char v);
-extern void rocksdb_options_set_cuckoo_table_factory(
-    rocksdb_options_t *opt, rocksdb_cuckoo_table_options_t* table_options);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_cuckoo_table_factory(
+    rocksdb_options_t* opt, rocksdb_cuckoo_table_options_t* table_options);
 
 /* Options */
 
-extern rocksdb_options_t* rocksdb_options_create();
-extern void rocksdb_options_destroy(rocksdb_options_t*);
-extern void rocksdb_options_increase_parallelism(
+extern ROCKSDB_LIBRARY_API rocksdb_options_t* rocksdb_options_create();
+extern ROCKSDB_LIBRARY_API void rocksdb_options_destroy(rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_increase_parallelism(
     rocksdb_options_t* opt, int total_threads);
-extern void rocksdb_options_optimize_for_point_lookup(
+extern ROCKSDB_LIBRARY_API void rocksdb_options_optimize_for_point_lookup(
     rocksdb_options_t* opt, uint64_t block_cache_size_mb);
-extern void rocksdb_options_optimize_level_style_compaction(
+extern ROCKSDB_LIBRARY_API void rocksdb_options_optimize_level_style_compaction(
     rocksdb_options_t* opt, uint64_t memtable_memory_budget);
-extern void rocksdb_options_optimize_universal_style_compaction(
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_optimize_universal_style_compaction(
     rocksdb_options_t* opt, uint64_t memtable_memory_budget);
-extern void rocksdb_options_set_compaction_filter(
-    rocksdb_options_t*,
-    rocksdb_compactionfilter_t*);
-extern void rocksdb_options_set_compaction_filter_factory(
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compaction_filter(
+    rocksdb_options_t*, rocksdb_compactionfilter_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compaction_filter_factory(
     rocksdb_options_t*, rocksdb_compactionfilterfactory_t*);
-extern void rocksdb_options_set_compaction_filter_factory_v2(
-    rocksdb_options_t*,
-    rocksdb_compactionfilterfactoryv2_t*);
-extern void rocksdb_options_set_comparator(
-    rocksdb_options_t*,
-    rocksdb_comparator_t*);
-extern void rocksdb_options_set_merge_operator(
-    rocksdb_options_t*,
-    rocksdb_mergeoperator_t*);
-extern void rocksdb_options_set_compression_per_level(
-  rocksdb_options_t* opt,
-  int* level_values,
-  size_t num_levels);
-extern void rocksdb_options_set_create_if_missing(
-    rocksdb_options_t*, unsigned char);
-extern void rocksdb_options_set_create_missing_column_families(
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_comparator(
+    rocksdb_options_t*, rocksdb_comparator_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_merge_operator(
+    rocksdb_options_t*, rocksdb_mergeoperator_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_uint64add_merge_operator(
+    rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compression_per_level(
+    rocksdb_options_t* opt, int* level_values, size_t num_levels);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_create_if_missing(
     rocksdb_options_t*, unsigned char);
-extern void rocksdb_options_set_error_if_exists(
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_create_missing_column_families(rocksdb_options_t*,
+                                                   unsigned char);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_error_if_exists(
     rocksdb_options_t*, unsigned char);
-extern void rocksdb_options_set_paranoid_checks(
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_paranoid_checks(
     rocksdb_options_t*, unsigned char);
-extern void rocksdb_options_set_env(rocksdb_options_t*, rocksdb_env_t*);
-extern void rocksdb_options_set_info_log(rocksdb_options_t*, rocksdb_logger_t*);
-extern void rocksdb_options_set_info_log_level(rocksdb_options_t*, int);
-extern void rocksdb_options_set_write_buffer_size(rocksdb_options_t*, size_t);
-extern void rocksdb_options_set_max_open_files(rocksdb_options_t*, int);
-extern void rocksdb_options_set_max_total_wal_size(rocksdb_options_t* opt, uint64_t n);
-extern void rocksdb_options_set_compression_options(
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_env(rocksdb_options_t*,
+                                                        rocksdb_env_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_info_log(rocksdb_options_t*,
+                                                             rocksdb_logger_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_info_log_level(
+    rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_write_buffer_size(
+    rocksdb_options_t*, size_t);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_open_files(
+    rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_total_wal_size(
+    rocksdb_options_t* opt, uint64_t n);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compression_options(
     rocksdb_options_t*, int, int, int);
-extern void rocksdb_options_set_prefix_extractor(
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_prefix_extractor(
     rocksdb_options_t*, rocksdb_slicetransform_t*);
-extern void rocksdb_options_set_num_levels(rocksdb_options_t*, int);
-extern void rocksdb_options_set_level0_file_num_compaction_trigger(
-    rocksdb_options_t*, int);
-extern void rocksdb_options_set_level0_slowdown_writes_trigger(
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_num_levels(
     rocksdb_options_t*, int);
-extern void rocksdb_options_set_level0_stop_writes_trigger(
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_level0_file_num_compaction_trigger(rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_level0_slowdown_writes_trigger(rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_level0_stop_writes_trigger(
     rocksdb_options_t*, int);
-extern void rocksdb_options_set_max_mem_compaction_level(
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_mem_compaction_level(
     rocksdb_options_t*, int);
-extern void rocksdb_options_set_target_file_size_base(
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_target_file_size_base(
     rocksdb_options_t*, uint64_t);
-extern void rocksdb_options_set_target_file_size_multiplier(
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_target_file_size_multiplier(
     rocksdb_options_t*, int);
-extern void rocksdb_options_set_max_bytes_for_level_base(
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_bytes_for_level_base(
     rocksdb_options_t*, uint64_t);
-extern void rocksdb_options_set_max_bytes_for_level_multiplier(
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_max_bytes_for_level_multiplier(rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_expanded_compaction_factor(
     rocksdb_options_t*, int);
-extern void rocksdb_options_set_expanded_compaction_factor(
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_max_grandparent_overlap_factor(rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_max_bytes_for_level_multiplier_additional(
+    rocksdb_options_t*, int* level_values, size_t num_levels);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_enable_statistics(
+    rocksdb_options_t*);
+
+/* returns a pointer to a malloc()-ed, null terminated string */
+extern ROCKSDB_LIBRARY_API char* rocksdb_options_statistics_get_string(
+    rocksdb_options_t* opt);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_write_buffer_number(
     rocksdb_options_t*, int);
-extern void rocksdb_options_set_max_grandparent_overlap_factor(
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_min_write_buffer_number_to_merge(rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_max_write_buffer_number_to_maintain(rocksdb_options_t*,
+                                                        int);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_background_compactions(
     rocksdb_options_t*, int);
-extern void rocksdb_options_set_max_bytes_for_level_multiplier_additional(
-    rocksdb_options_t*, int* level_values, size_t num_levels);
-extern void rocksdb_options_enable_statistics(rocksdb_options_t*);
-
-extern void rocksdb_options_set_max_write_buffer_number(rocksdb_options_t*, int);
-extern void rocksdb_options_set_min_write_buffer_number_to_merge(rocksdb_options_t*, int);
-extern void rocksdb_options_set_max_background_compactions(rocksdb_options_t*, int);
-extern void rocksdb_options_set_max_background_flushes(rocksdb_options_t*, int);
-extern void rocksdb_options_set_max_log_file_size(rocksdb_options_t*, size_t);
-extern void rocksdb_options_set_log_file_time_to_roll(rocksdb_options_t*, size_t);
-extern void rocksdb_options_set_keep_log_file_num(rocksdb_options_t*, size_t);
-extern void rocksdb_options_set_soft_rate_limit(rocksdb_options_t*, double);
-extern void rocksdb_options_set_hard_rate_limit(rocksdb_options_t*, double);
-extern void rocksdb_options_set_rate_limit_delay_max_milliseconds(
-    rocksdb_options_t*, unsigned int);
-extern void rocksdb_options_set_max_manifest_file_size(
-    rocksdb_options_t*, size_t);
-extern void rocksdb_options_set_no_block_cache(
-    rocksdb_options_t*, unsigned char);
-extern void rocksdb_options_set_table_cache_numshardbits(
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_background_flushes(
     rocksdb_options_t*, int);
-extern void rocksdb_options_set_table_cache_remove_scan_count_limit(
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_log_file_size(
+    rocksdb_options_t*, size_t);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_log_file_time_to_roll(
+    rocksdb_options_t*, size_t);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_keep_log_file_num(
+    rocksdb_options_t*, size_t);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_soft_rate_limit(
+    rocksdb_options_t*, double);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_hard_rate_limit(
+    rocksdb_options_t*, double);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_rate_limit_delay_max_milliseconds(rocksdb_options_t*,
+                                                      unsigned int);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_manifest_file_size(
+    rocksdb_options_t*, size_t);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_table_cache_numshardbits(
     rocksdb_options_t*, int);
-extern void rocksdb_options_set_arena_block_size(
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_table_cache_remove_scan_count_limit(rocksdb_options_t*,
+                                                        int);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_arena_block_size(
     rocksdb_options_t*, size_t);
-extern void rocksdb_options_set_use_fsync(
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_use_fsync(
     rocksdb_options_t*, int);
-extern void rocksdb_options_set_db_log_dir(
-    rocksdb_options_t*, const char*);
-extern void rocksdb_options_set_wal_dir(
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_db_log_dir(
     rocksdb_options_t*, const char*);
-extern void rocksdb_options_set_WAL_ttl_seconds(
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_wal_dir(rocksdb_options_t*,
+                                                            const char*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_WAL_ttl_seconds(
     rocksdb_options_t*, uint64_t);
-extern void rocksdb_options_set_WAL_size_limit_MB(
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_WAL_size_limit_MB(
     rocksdb_options_t*, uint64_t);
-extern void rocksdb_options_set_manifest_preallocation_size(
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_manifest_preallocation_size(
     rocksdb_options_t*, size_t);
-extern void rocksdb_options_set_purge_redundant_kvs_while_flush(
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_purge_redundant_kvs_while_flush(rocksdb_options_t*,
+                                                    unsigned char);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_allow_os_buffer(
     rocksdb_options_t*, unsigned char);
-extern void rocksdb_options_set_allow_os_buffer(
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_allow_mmap_reads(
     rocksdb_options_t*, unsigned char);
-extern void rocksdb_options_set_allow_mmap_reads(
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_allow_mmap_writes(
     rocksdb_options_t*, unsigned char);
-extern void rocksdb_options_set_allow_mmap_writes(
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_is_fd_close_on_exec(
     rocksdb_options_t*, unsigned char);
-extern void rocksdb_options_set_is_fd_close_on_exec(
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_skip_log_error_on_recovery(
     rocksdb_options_t*, unsigned char);
-extern void rocksdb_options_set_skip_log_error_on_recovery(
-    rocksdb_options_t*, unsigned char);
-extern void rocksdb_options_set_stats_dump_period_sec(
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_stats_dump_period_sec(
     rocksdb_options_t*, unsigned int);
-extern void rocksdb_options_set_block_size_deviation(
-    rocksdb_options_t*, int);
-extern void rocksdb_options_set_advise_random_on_open(
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_advise_random_on_open(
     rocksdb_options_t*, unsigned char);
-extern void rocksdb_options_set_access_hint_on_compaction_start(
-    rocksdb_options_t*, int);
-extern void rocksdb_options_set_use_adaptive_mutex(
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_access_hint_on_compaction_start(rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_use_adaptive_mutex(
     rocksdb_options_t*, unsigned char);
-extern void rocksdb_options_set_bytes_per_sync(
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_bytes_per_sync(
     rocksdb_options_t*, uint64_t);
-extern void rocksdb_options_set_verify_checksums_in_compaction(
-    rocksdb_options_t*, unsigned char);
-extern void rocksdb_options_set_filter_deletes(
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_verify_checksums_in_compaction(rocksdb_options_t*,
+                                                   unsigned char);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_filter_deletes(
     rocksdb_options_t*, unsigned char);
-extern void rocksdb_options_set_max_sequential_skip_in_iterations(
-    rocksdb_options_t*, uint64_t);
-extern void rocksdb_options_set_disable_data_sync(rocksdb_options_t*, int);
-extern void rocksdb_options_set_disable_auto_compactions(rocksdb_options_t*, int);
-extern void rocksdb_options_set_delete_obsolete_files_period_micros(
-    rocksdb_options_t*, uint64_t);
-extern void rocksdb_options_set_source_compaction_factor(rocksdb_options_t*, int);
-extern void rocksdb_options_prepare_for_bulk_load(rocksdb_options_t*);
-extern void rocksdb_options_set_memtable_vector_rep(rocksdb_options_t*);
-extern void rocksdb_options_set_hash_skip_list_rep(rocksdb_options_t*, size_t, int32_t, int32_t);
-extern void rocksdb_options_set_hash_link_list_rep(rocksdb_options_t*, size_t);
-extern void rocksdb_options_set_plain_table_factory(rocksdb_options_t*, uint32_t, int, double, size_t);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_max_sequential_skip_in_iterations(rocksdb_options_t*,
+                                                      uint64_t);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_disable_data_sync(
+    rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_disable_auto_compactions(
+    rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_delete_obsolete_files_period_micros(rocksdb_options_t*,
+                                                        uint64_t);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_source_compaction_factor(
+    rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_prepare_for_bulk_load(
+    rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_memtable_vector_rep(
+    rocksdb_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_hash_skip_list_rep(
+    rocksdb_options_t*, size_t, int32_t, int32_t);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_hash_link_list_rep(
+    rocksdb_options_t*, size_t);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_plain_table_factory(
+    rocksdb_options_t*, uint32_t, int, double, size_t);
 
-extern void rocksdb_options_set_min_level_to_compress(rocksdb_options_t* opt, int level);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_min_level_to_compress(
+    rocksdb_options_t* opt, int level);
 
-extern void rocksdb_options_set_memtable_prefix_bloom_bits(
-    rocksdb_options_t*, uint32_t);
-extern void rocksdb_options_set_memtable_prefix_bloom_probes(
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_memtable_prefix_bloom_bits(
     rocksdb_options_t*, uint32_t);
-extern void rocksdb_options_set_max_successive_merges(
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_memtable_prefix_bloom_probes(rocksdb_options_t*, uint32_t);
+
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_successive_merges(
     rocksdb_options_t*, size_t);
-extern void rocksdb_options_set_min_partial_merge_operands(
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_min_partial_merge_operands(
     rocksdb_options_t*, uint32_t);
-extern void rocksdb_options_set_bloom_locality(
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_bloom_locality(
     rocksdb_options_t*, uint32_t);
-extern void rocksdb_options_set_inplace_update_support(
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_inplace_update_support(
     rocksdb_options_t*, unsigned char);
-extern void rocksdb_options_set_inplace_update_num_locks(
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_inplace_update_num_locks(
     rocksdb_options_t*, size_t);
 
 enum {
@@ -622,206 +678,175 @@ enum {
   rocksdb_lz4_compression = 4,
   rocksdb_lz4hc_compression = 5
 };
-extern void rocksdb_options_set_compression(rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compression(
+    rocksdb_options_t*, int);
 
 enum {
   rocksdb_level_compaction = 0,
   rocksdb_universal_compaction = 1,
   rocksdb_fifo_compaction = 2
 };
-extern void rocksdb_options_set_compaction_style(rocksdb_options_t*, int);
-extern void rocksdb_options_set_universal_compaction_options(rocksdb_options_t*, rocksdb_universal_compaction_options_t*);
-extern void rocksdb_options_set_fifo_compaction_options(rocksdb_options_t* opt,
-    rocksdb_fifo_compaction_options_t* fifo);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compaction_style(
+    rocksdb_options_t*, int);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_options_set_universal_compaction_options(
+    rocksdb_options_t*, rocksdb_universal_compaction_options_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_options_set_fifo_compaction_options(
+    rocksdb_options_t* opt, rocksdb_fifo_compaction_options_t* fifo);
 
 /* Compaction Filter */
 
-extern rocksdb_compactionfilter_t* rocksdb_compactionfilter_create(
-    void* state,
-    void (*destructor)(void*),
-    unsigned char (*filter)(
-        void*,
-        int level,
-        const char* key, size_t key_length,
-        const char* existing_value, size_t value_length,
-        char** new_value, size_t *new_value_length,
-        unsigned char* value_changed),
+extern ROCKSDB_LIBRARY_API rocksdb_compactionfilter_t*
+rocksdb_compactionfilter_create(
+    void* state, void (*destructor)(void*),
+    unsigned char (*filter)(void*, int level, const char* key,
+                            size_t key_length, const char* existing_value,
+                            size_t value_length, char** new_value,
+                            size_t* new_value_length,
+                            unsigned char* value_changed),
     const char* (*name)(void*));
-extern void rocksdb_compactionfilter_destroy(rocksdb_compactionfilter_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_compactionfilter_destroy(
+    rocksdb_compactionfilter_t*);
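
For illustration, a minimal sketch of the callback shapes rocksdb_compactionfilter_create() expects, matching the signature above (all names here are hypothetical; a no-op destructor is passed rather than NULL on the assumption that the wrapper invokes it unconditionally):

    static void noop_destructor(void* state) { (void)state; }

    static const char* filter_name(void* state) {
      (void)state;
      return "drop_empty_values";
    }

    /* Return nonzero to drop the entry from the compaction output. */
    static unsigned char drop_empty(void* state, int level, const char* key,
                                    size_t key_length,
                                    const char* existing_value,
                                    size_t value_length, char** new_value,
                                    size_t* new_value_length,
                                    unsigned char* value_changed) {
      (void)state; (void)level; (void)key; (void)key_length;
      (void)existing_value; (void)new_value; (void)new_value_length;
      *value_changed = 0;        /* we never rewrite the value */
      return value_length == 0;  /* drop entries with empty values */
    }

    rocksdb_compactionfilter_t* filter = rocksdb_compactionfilter_create(
        NULL /* state */, noop_destructor, drop_empty, filter_name);

The filter-policy, merge-operator, comparator, and slice-transform constructors below follow the same state + callbacks + name pattern.
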
 
 /* Compaction Filter Context */
 
-extern unsigned char rocksdb_compactionfiltercontext_is_full_compaction(
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_compactionfiltercontext_is_full_compaction(
     rocksdb_compactionfiltercontext_t* context);
 
-extern unsigned char rocksdb_compactionfiltercontext_is_manual_compaction(
+extern ROCKSDB_LIBRARY_API unsigned char
+rocksdb_compactionfiltercontext_is_manual_compaction(
     rocksdb_compactionfiltercontext_t* context);
 
 /* Compaction Filter Factory */
 
-extern rocksdb_compactionfilterfactory_t*
-    rocksdb_compactionfilterfactory_create(
-        void* state, void (*destructor)(void*),
-        rocksdb_compactionfilter_t* (*create_compaction_filter)(
-            void*, rocksdb_compactionfiltercontext_t* context),
-        const char* (*name)(void*));
-extern void rocksdb_compactionfilterfactory_destroy(
-    rocksdb_compactionfilterfactory_t*);
-
-/* Compaction Filter V2 */
-
-extern rocksdb_compactionfilterv2_t* rocksdb_compactionfilterv2_create(
-    void* state,
-    void (*destructor)(void*),
-    // num_keys specifies the number of array entries in every *list parameter.
-    // New values added to the new_values_list should be malloc'd and will be
-    // freed by the caller. Specify true in the to_delete_list to remove an
-    // entry during compaction; false to keep it.
-    void (*filter)(
-        void*, int level, size_t num_keys,
-        const char* const* keys_list, const size_t* keys_list_sizes,
-        const char* const* existing_values_list, const size_t* existing_values_list_sizes,
-        char** new_values_list, size_t* new_values_list_sizes,
-        unsigned char* to_delete_list),
+extern ROCKSDB_LIBRARY_API rocksdb_compactionfilterfactory_t*
+rocksdb_compactionfilterfactory_create(
+    void* state, void (*destructor)(void*),
+    rocksdb_compactionfilter_t* (*create_compaction_filter)(
+        void*, rocksdb_compactionfiltercontext_t* context),
     const char* (*name)(void*));
-extern void rocksdb_compactionfilterv2_destroy(rocksdb_compactionfilterv2_t*);
-
-/* Compaction Filter Factory V2 */
-
-extern rocksdb_compactionfilterfactoryv2_t* rocksdb_compactionfilterfactoryv2_create(
-    void* state,
-    rocksdb_slicetransform_t* prefix_extractor,
-    void (*destructor)(void*),
-    rocksdb_compactionfilterv2_t* (*create_compaction_filter_v2)(
-        void*, const rocksdb_compactionfiltercontext_t* context),
-    const char* (*name)(void*));
-extern void rocksdb_compactionfilterfactoryv2_destroy(rocksdb_compactionfilterfactoryv2_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_compactionfilterfactory_destroy(
+    rocksdb_compactionfilterfactory_t*);
 
 /* Comparator */
 
-extern rocksdb_comparator_t* rocksdb_comparator_create(
-    void* state,
-    void (*destructor)(void*),
-    int (*compare)(
-        void*,
-        const char* a, size_t alen,
-        const char* b, size_t blen),
+extern ROCKSDB_LIBRARY_API rocksdb_comparator_t* rocksdb_comparator_create(
+    void* state, void (*destructor)(void*),
+    int (*compare)(void*, const char* a, size_t alen, const char* b,
+                   size_t blen),
     const char* (*name)(void*));
-extern void rocksdb_comparator_destroy(rocksdb_comparator_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_comparator_destroy(
+    rocksdb_comparator_t*);
 
 /* Filter policy */
 
-extern rocksdb_filterpolicy_t* rocksdb_filterpolicy_create(
-    void* state,
-    void (*destructor)(void*),
-    char* (*create_filter)(
-        void*,
-        const char* const* key_array, const size_t* key_length_array,
-        int num_keys,
-        size_t* filter_length),
-    unsigned char (*key_may_match)(
-        void*,
-        const char* key, size_t length,
-        const char* filter, size_t filter_length),
-    void (*delete_filter)(
-        void*,
-        const char* filter, size_t filter_length),
+extern ROCKSDB_LIBRARY_API rocksdb_filterpolicy_t* rocksdb_filterpolicy_create(
+    void* state, void (*destructor)(void*),
+    char* (*create_filter)(void*, const char* const* key_array,
+                           const size_t* key_length_array, int num_keys,
+                           size_t* filter_length),
+    unsigned char (*key_may_match)(void*, const char* key, size_t length,
+                                   const char* filter, size_t filter_length),
+    void (*delete_filter)(void*, const char* filter, size_t filter_length),
     const char* (*name)(void*));
-extern void rocksdb_filterpolicy_destroy(rocksdb_filterpolicy_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_filterpolicy_destroy(
+    rocksdb_filterpolicy_t*);
 
-extern rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom(
-    int bits_per_key);
+extern ROCKSDB_LIBRARY_API rocksdb_filterpolicy_t*
+rocksdb_filterpolicy_create_bloom(int bits_per_key);
 
 /* Merge Operator */
 
-extern rocksdb_mergeoperator_t* rocksdb_mergeoperator_create(
-    void* state,
-    void (*destructor)(void*),
-    char* (*full_merge)(
-        void*,
-        const char* key, size_t key_length,
-        const char* existing_value, size_t existing_value_length,
-        const char* const* operands_list, const size_t* operands_list_length,
-        int num_operands,
-        unsigned char* success, size_t* new_value_length),
-    char* (*partial_merge)(
-        void*,
-        const char* key, size_t key_length,
-        const char* const* operands_list, const size_t* operands_list_length,
-        int num_operands,
-        unsigned char* success, size_t* new_value_length),
-    void (*delete_value)(
-        void*,
-        const char* value, size_t value_length),
+extern ROCKSDB_LIBRARY_API rocksdb_mergeoperator_t*
+rocksdb_mergeoperator_create(
+    void* state, void (*destructor)(void*),
+    char* (*full_merge)(void*, const char* key, size_t key_length,
+                        const char* existing_value,
+                        size_t existing_value_length,
+                        const char* const* operands_list,
+                        const size_t* operands_list_length, int num_operands,
+                        unsigned char* success, size_t* new_value_length),
+    char* (*partial_merge)(void*, const char* key, size_t key_length,
+                           const char* const* operands_list,
+                           const size_t* operands_list_length, int num_operands,
+                           unsigned char* success, size_t* new_value_length),
+    void (*delete_value)(void*, const char* value, size_t value_length),
     const char* (*name)(void*));
-extern void rocksdb_mergeoperator_destroy(rocksdb_mergeoperator_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_mergeoperator_destroy(
+    rocksdb_mergeoperator_t*);
 
 /* Read options */
 
-extern rocksdb_readoptions_t* rocksdb_readoptions_create();
-extern void rocksdb_readoptions_destroy(rocksdb_readoptions_t*);
-extern void rocksdb_readoptions_set_verify_checksums(
-    rocksdb_readoptions_t*,
-    unsigned char);
-extern void rocksdb_readoptions_set_fill_cache(
+extern ROCKSDB_LIBRARY_API rocksdb_readoptions_t* rocksdb_readoptions_create();
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_destroy(
+    rocksdb_readoptions_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_verify_checksums(
+    rocksdb_readoptions_t*, unsigned char);
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_fill_cache(
     rocksdb_readoptions_t*, unsigned char);
-extern void rocksdb_readoptions_set_snapshot(
-    rocksdb_readoptions_t*,
-    const rocksdb_snapshot_t*);
-extern void rocksdb_readoptions_set_iterate_upper_bound(
-    rocksdb_readoptions_t*,
-    const char* key,
-    size_t keylen);
-extern void rocksdb_readoptions_set_read_tier(
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_snapshot(
+    rocksdb_readoptions_t*, const rocksdb_snapshot_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_iterate_upper_bound(
+    rocksdb_readoptions_t*, const char* key, size_t keylen);
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_read_tier(
     rocksdb_readoptions_t*, int);
-extern void rocksdb_readoptions_set_tailing(
+extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_tailing(
     rocksdb_readoptions_t*, unsigned char);
 
 /* Write options */
 
-extern rocksdb_writeoptions_t* rocksdb_writeoptions_create();
-extern void rocksdb_writeoptions_destroy(rocksdb_writeoptions_t*);
-extern void rocksdb_writeoptions_set_sync(
+extern ROCKSDB_LIBRARY_API rocksdb_writeoptions_t*
+rocksdb_writeoptions_create();
+extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_destroy(
+    rocksdb_writeoptions_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_set_sync(
     rocksdb_writeoptions_t*, unsigned char);
-extern void rocksdb_writeoptions_disable_WAL(rocksdb_writeoptions_t* opt, int disable);
+extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_disable_WAL(
+    rocksdb_writeoptions_t* opt, int disable);
 
 /* Flush options */
 
-extern rocksdb_flushoptions_t* rocksdb_flushoptions_create();
-extern void rocksdb_flushoptions_destroy(rocksdb_flushoptions_t*);
-extern void rocksdb_flushoptions_set_wait(
+extern ROCKSDB_LIBRARY_API rocksdb_flushoptions_t*
+rocksdb_flushoptions_create();
+extern ROCKSDB_LIBRARY_API void rocksdb_flushoptions_destroy(
+    rocksdb_flushoptions_t*);
+extern ROCKSDB_LIBRARY_API void rocksdb_flushoptions_set_wait(
     rocksdb_flushoptions_t*, unsigned char);
 
 /* Cache */
 
-extern rocksdb_cache_t* rocksdb_cache_create_lru(size_t capacity);
-extern void rocksdb_cache_destroy(rocksdb_cache_t* cache);
+extern ROCKSDB_LIBRARY_API rocksdb_cache_t* rocksdb_cache_create_lru(
+    size_t capacity);
+extern ROCKSDB_LIBRARY_API void rocksdb_cache_destroy(rocksdb_cache_t* cache);
 
 /* Env */
 
-extern rocksdb_env_t* rocksdb_create_default_env();
-extern void rocksdb_env_set_background_threads(rocksdb_env_t* env, int n);
-extern void rocksdb_env_set_high_priority_background_threads(rocksdb_env_t* env, int n);
-extern void rocksdb_env_destroy(rocksdb_env_t*);
+extern ROCKSDB_LIBRARY_API rocksdb_env_t* rocksdb_create_default_env();
+extern ROCKSDB_LIBRARY_API void rocksdb_env_set_background_threads(
+    rocksdb_env_t* env, int n);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_env_set_high_priority_background_threads(rocksdb_env_t* env, int n);
+extern ROCKSDB_LIBRARY_API void rocksdb_env_join_all_threads(
+    rocksdb_env_t* env);
+extern ROCKSDB_LIBRARY_API void rocksdb_env_destroy(rocksdb_env_t*);
 
 /* SliceTransform */
 
-extern rocksdb_slicetransform_t* rocksdb_slicetransform_create(
-    void* state,
-    void (*destructor)(void*),
-    char* (*transform)(
-        void*,
-        const char* key, size_t length,
-        size_t* dst_length),
-    unsigned char (*in_domain)(
-        void*,
-        const char* key, size_t length),
-    unsigned char (*in_range)(
-        void*,
-        const char* key, size_t length),
+extern ROCKSDB_LIBRARY_API rocksdb_slicetransform_t*
+rocksdb_slicetransform_create(
+    void* state, void (*destructor)(void*),
+    char* (*transform)(void*, const char* key, size_t length,
+                       size_t* dst_length),
+    unsigned char (*in_domain)(void*, const char* key, size_t length),
+    unsigned char (*in_range)(void*, const char* key, size_t length),
     const char* (*name)(void*));
-extern rocksdb_slicetransform_t* rocksdb_slicetransform_create_fixed_prefix(size_t);
-extern void rocksdb_slicetransform_destroy(rocksdb_slicetransform_t*);
+extern ROCKSDB_LIBRARY_API rocksdb_slicetransform_t*
+    rocksdb_slicetransform_create_fixed_prefix(size_t);
+extern ROCKSDB_LIBRARY_API rocksdb_slicetransform_t*
+rocksdb_slicetransform_create_noop();
+extern ROCKSDB_LIBRARY_API void rocksdb_slicetransform_destroy(
+    rocksdb_slicetransform_t*);
 
 /* Universal Compaction options */
 
@@ -830,49 +855,61 @@ enum {
   rocksdb_total_size_compaction_stop_style = 1
 };
 
-extern rocksdb_universal_compaction_options_t* rocksdb_universal_compaction_options_create() ;
-extern void rocksdb_universal_compaction_options_set_size_ratio(
-  rocksdb_universal_compaction_options_t*, int);
-extern void rocksdb_universal_compaction_options_set_min_merge_width(
-  rocksdb_universal_compaction_options_t*, int);
-extern void rocksdb_universal_compaction_options_set_max_merge_width(
-  rocksdb_universal_compaction_options_t*, int);
-extern void rocksdb_universal_compaction_options_set_max_size_amplification_percent(
-  rocksdb_universal_compaction_options_t*, int);
-extern void rocksdb_universal_compaction_options_set_compression_size_percent(
-  rocksdb_universal_compaction_options_t*, int);
-extern void rocksdb_universal_compaction_options_set_stop_style(
-  rocksdb_universal_compaction_options_t*, int);
-extern void rocksdb_universal_compaction_options_destroy(
-  rocksdb_universal_compaction_options_t*);
-
-extern rocksdb_fifo_compaction_options_t* rocksdb_fifo_compaction_options_create();
-extern void rocksdb_fifo_compaction_options_set_max_table_files_size(
+extern ROCKSDB_LIBRARY_API rocksdb_universal_compaction_options_t*
+rocksdb_universal_compaction_options_create();
+extern ROCKSDB_LIBRARY_API void
+rocksdb_universal_compaction_options_set_size_ratio(
+    rocksdb_universal_compaction_options_t*, int);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_universal_compaction_options_set_min_merge_width(
+    rocksdb_universal_compaction_options_t*, int);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_universal_compaction_options_set_max_merge_width(
+    rocksdb_universal_compaction_options_t*, int);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_universal_compaction_options_set_max_size_amplification_percent(
+    rocksdb_universal_compaction_options_t*, int);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_universal_compaction_options_set_compression_size_percent(
+    rocksdb_universal_compaction_options_t*, int);
+extern ROCKSDB_LIBRARY_API void
+rocksdb_universal_compaction_options_set_stop_style(
+    rocksdb_universal_compaction_options_t*, int);
+extern ROCKSDB_LIBRARY_API void rocksdb_universal_compaction_options_destroy(
+    rocksdb_universal_compaction_options_t*);
+
+extern ROCKSDB_LIBRARY_API rocksdb_fifo_compaction_options_t*
+rocksdb_fifo_compaction_options_create();
+extern ROCKSDB_LIBRARY_API void
+rocksdb_fifo_compaction_options_set_max_table_files_size(
     rocksdb_fifo_compaction_options_t* fifo_opts, uint64_t size);
-extern void rocksdb_fifo_compaction_options_destroy(
+extern ROCKSDB_LIBRARY_API void rocksdb_fifo_compaction_options_destroy(
     rocksdb_fifo_compaction_options_t* fifo_opts);
 
-extern int rocksdb_livefiles_count(
-  const rocksdb_livefiles_t*);
-extern const char* rocksdb_livefiles_name(
-  const rocksdb_livefiles_t*,
-  int index);
-extern int rocksdb_livefiles_level(
-  const rocksdb_livefiles_t*,
-  int index);
-extern size_t rocksdb_livefiles_size(
-  const rocksdb_livefiles_t*,
-  int index);
-extern const char* rocksdb_livefiles_smallestkey(
-  const rocksdb_livefiles_t*,
-  int index,
-  size_t* size);
-extern const char* rocksdb_livefiles_largestkey(
-  const rocksdb_livefiles_t*,
-  int index,
-  size_t* size);
-extern void rocksdb_livefiles_destroy(
-  const rocksdb_livefiles_t*);
+extern ROCKSDB_LIBRARY_API int rocksdb_livefiles_count(
+    const rocksdb_livefiles_t*);
+extern ROCKSDB_LIBRARY_API const char* rocksdb_livefiles_name(
+    const rocksdb_livefiles_t*, int index);
+extern ROCKSDB_LIBRARY_API int rocksdb_livefiles_level(
+    const rocksdb_livefiles_t*, int index);
+extern ROCKSDB_LIBRARY_API size_t
+rocksdb_livefiles_size(const rocksdb_livefiles_t*, int index);
+extern ROCKSDB_LIBRARY_API const char* rocksdb_livefiles_smallestkey(
+    const rocksdb_livefiles_t*, int index, size_t* size);
+extern ROCKSDB_LIBRARY_API const char* rocksdb_livefiles_largestkey(
+    const rocksdb_livefiles_t*, int index, size_t* size);
+extern ROCKSDB_LIBRARY_API void rocksdb_livefiles_destroy(
+    const rocksdb_livefiles_t*);
+
+/* Utility Helpers */
+
+extern ROCKSDB_LIBRARY_API void rocksdb_get_options_from_string(
+    const rocksdb_options_t* base_options, const char* opts_str,
+    rocksdb_options_t* new_options, char** errptr);
+
+// Referring to convention (3), this should be used by the client
+// to free memory that was malloc()ed by the library.
+extern ROCKSDB_LIBRARY_API void rocksdb_free(void* ptr);
 
 #ifdef __cplusplus
 }  /* end extern "C" */
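
Taken together, the newly exported setters compose in the usual create/set/open sequence. A sketch of a round trip through the C API (error handling trimmed; the db path and option values are illustrative, and the open/put/get entry points are assumed from earlier in this header):

    #include <stdlib.h>
    #include "rocksdb/c.h"

    int main(void) {
      char* err = NULL;
      rocksdb_options_t* opts = rocksdb_options_create();
      rocksdb_options_set_create_if_missing(opts, 1);
      rocksdb_options_set_compression(opts, rocksdb_snappy_compression);
      rocksdb_options_set_compaction_style(opts, rocksdb_level_compaction);

      rocksdb_t* db = rocksdb_open(opts, "/tmp/example_db", &err);

      rocksdb_writeoptions_t* wopts = rocksdb_writeoptions_create();
      rocksdb_put(db, wopts, "key", 3, "value", 5, &err);

      rocksdb_readoptions_t* ropts = rocksdb_readoptions_create();
      size_t vlen = 0;
      char* val = rocksdb_get(db, ropts, "key", 3, &vlen, &err);
      rocksdb_free(val);  /* convention (3): the library malloc()ed this */

      rocksdb_readoptions_destroy(ropts);
      rocksdb_writeoptions_destroy(wopts);
      rocksdb_close(db);
      rocksdb_options_destroy(opts);
      return 0;
    }
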
diff --git a/src/rocksdb/include/rocksdb/cache.h b/src/rocksdb/include/rocksdb/cache.h
index c5c7f01..4e4b0e1 100644
--- a/src/rocksdb/include/rocksdb/cache.h
+++ b/src/rocksdb/include/rocksdb/cache.h
@@ -104,6 +104,12 @@ class Cache {
   // returns the memory size for the entries residing in the cache.
   virtual size_t GetUsage() const = 0;
 
+  // returns the memory size for a specific entry in the cache.
+  virtual size_t GetUsage(Handle* handle) const = 0;
+
+  // returns the memory size for the entries in use by the system
+  virtual size_t GetPinnedUsage() const = 0;
+
   // Call this on shutdown if you want to speed it up. Cache will disown
   // any underlying data and will not free it on delete. This call will leak
   // memory - call this only if you're shutting down the process.
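
The two new accessors distinguish total residency from memory pinned by outstanding handles. A sketch (the function name is hypothetical):

    #include <cstdio>
    #include <memory>
    #include "rocksdb/cache.h"

    void ReportCacheUsage(const std::shared_ptr<rocksdb::Cache>& cache) {
      // Bytes for all entries currently resident in the cache.
      size_t used = cache->GetUsage();
      // Bytes for entries still referenced by live handles.
      size_t pinned = cache->GetPinnedUsage();
      std::printf("cache: %zu bytes used, %zu bytes pinned\n", used, pinned);
    }
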
diff --git a/src/rocksdb/include/rocksdb/compaction_filter.h b/src/rocksdb/include/rocksdb/compaction_filter.h
index dce69d2..698753c 100644
--- a/src/rocksdb/include/rocksdb/compaction_filter.h
+++ b/src/rocksdb/include/rocksdb/compaction_filter.h
@@ -51,10 +51,24 @@ class CompactionFilter {
   // output of the compaction.  The application can inspect
   // the existing value of the key and make decision based on it.
   //
+  // Key-values that result from a merge operation during compaction are not
+  // passed into this function. Currently, when you have a mix of Put()s and
+  // Merge()s on the same key, we only guarantee to process the merge operands
+  // through the compaction filters. Put()s might or might not be processed.
+  //
   // When the value is to be preserved, the application has the option
   // to modify the existing_value and pass it back through new_value.
   // value_changed needs to be set to true in this case.
   //
+  // If you use the snapshot feature of RocksDB (i.e. call the GetSnapshot()
+  // API on a DB* object), CompactionFilter might not be very useful for you.
+  // Due to guarantees we need to maintain, the compaction process will not
+  // call Filter() on any keys that were written before the latest snapshot.
+  // In other words,
+  // compaction will only call Filter() on keys written after your most recent
+  // call to GetSnapshot(). In most cases, Filter() will not be called very
+  // often. This is something we're fixing. See the discussion at:
+  // https://www.facebook.com/groups/mysqlonrocksdb/permalink/999723240091865/
+  //
   // If multithreaded compaction is being used *and* a single CompactionFilter
   // instance was supplied via Options::compaction_filter, this method may be
   // called from different threads concurrently.  The application must ensure
@@ -64,44 +78,23 @@ class CompactionFilter {
   // be used by a single thread that is doing the compaction run, and this
   // call does not need to be thread-safe.  However, multiple filters may be
   // in existence and operating concurrently.
+  //
+  // The last paragraph is not true if you set max_subcompactions to more than
+  // 1. In that case, subcompactions from multiple threads may call a single
+  // CompactionFilter concurrently.
   virtual bool Filter(int level,
                       const Slice& key,
                       const Slice& existing_value,
                       std::string* new_value,
                       bool* value_changed) const = 0;
 
-  // Returns a name that identifies this compaction filter.
-  // The name will be printed to LOG file on start up for diagnosis.
-  virtual const char* Name() const = 0;
-};
-
-// CompactionFilterV2 that buffers kv pairs sharing the same prefix and let
-// application layer to make individual decisions for all the kv pairs in the
-// buffer.
-class CompactionFilterV2 {
- public:
-  virtual ~CompactionFilterV2() {}
-
-  // The compaction process invokes this method for all the kv pairs
-  // sharing the same prefix. It is a "roll-up" version of CompactionFilter.
-  //
-  // Each entry in the return vector indicates if the corresponding kv should
-  // be preserved in the output of this compaction run. The application can
-  // inspect the existing values of the keys and make decision based on it.
-  //
-  // When a value is to be preserved, the application has the option
-  // to modify the entry in existing_values and pass it back through an entry
-  // in new_values. A corresponding values_changed entry needs to be set to
-  // true in this case. Note that the new_values vector contains only changed
-  // values, i.e. new_values.size() <= values_changed.size().
-  //
-  typedef std::vector<Slice> SliceVector;
-  virtual std::vector<bool> Filter(int level,
-                                   const SliceVector& keys,
-                                   const SliceVector& existing_values,
-                                   std::vector<std::string>* new_values,
-                                   std::vector<bool>* values_changed)
-    const = 0;
+  // The compaction process invokes this method on every merge operand. If this
+  // method returns true, the merge operand will be ignored and not written out
+  // in the compaction output.
+  virtual bool FilterMergeOperand(int level, const Slice& key,
+                                  const Slice& operand) const {
+    return false;
+  }
 
   // Returns a name that identifies this compaction filter.
   // The name will be printed to LOG file on start up for diagnosis.
@@ -135,65 +128,6 @@ class DefaultCompactionFilterFactory : public CompactionFilterFactory {
   }
 };
 
-// Each compaction will create a new CompactionFilterV2
-//
-// CompactionFilterFactoryV2 enables application to specify a prefix and use
-// CompactionFilterV2 to filter kv-pairs in batches. Each batch contains all
-// the kv-pairs sharing the same prefix.
-//
-// This is useful for applications that require grouping kv-pairs in
-// compaction filter to make a purge/no-purge decision. For example, if the
-// key prefix is user id and the rest of key represents the type of value.
-// This batching filter will come in handy if the application's compaction
-// filter requires knowledge of all types of values for any user id.
-//
-class CompactionFilterFactoryV2 {
- public:
-  // NOTE: CompactionFilterFactoryV2 will not delete prefix_extractor
-  explicit CompactionFilterFactoryV2(const SliceTransform* prefix_extractor)
-    : prefix_extractor_(prefix_extractor) { }
-
-  virtual ~CompactionFilterFactoryV2() { }
-
-  virtual std::unique_ptr<CompactionFilterV2> CreateCompactionFilterV2(
-    const CompactionFilterContext& context) = 0;
-
-  // Returns a name that identifies this compaction filter factory.
-  virtual const char* Name() const = 0;
-
-  const SliceTransform* GetPrefixExtractor() const {
-    return prefix_extractor_;
-  }
-
-  void SetPrefixExtractor(const SliceTransform* prefix_extractor) {
-    prefix_extractor_ = prefix_extractor;
-  }
-
- private:
-  // Prefix extractor for compaction filter v2
-  // Keys sharing the same prefix will be buffered internally.
-  // Client can implement a Filter callback function to operate on the buffer
-  const SliceTransform* prefix_extractor_;
-};
-
-// Default implementation of CompactionFilterFactoryV2 which does not
-// return any filter
-class DefaultCompactionFilterFactoryV2 : public CompactionFilterFactoryV2 {
- public:
-  explicit DefaultCompactionFilterFactoryV2()
-      : CompactionFilterFactoryV2(nullptr) { }
-
-  virtual std::unique_ptr<CompactionFilterV2>
-  CreateCompactionFilterV2(
-      const CompactionFilterContext& context) override {
-    return std::unique_ptr<CompactionFilterV2>(nullptr);
-  }
-
-  virtual const char* Name() const override {
-    return "DefaultCompactionFilterFactoryV2";
-  }
-};
-
 }  // namespace rocksdb
 
 #endif  // STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_
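
With the V2 interfaces removed, a single CompactionFilter now covers both regular entries and, via the new FilterMergeOperand() hook, merge operands. A sketch of a filter that drops empty values and empty operands (the class name is hypothetical):

    #include <string>
    #include "rocksdb/compaction_filter.h"
    #include "rocksdb/slice.h"

    class DropEmptyFilter : public rocksdb::CompactionFilter {
     public:
      // Return true to remove the entry from the compaction output.
      bool Filter(int /*level*/, const rocksdb::Slice& /*key*/,
                  const rocksdb::Slice& existing_value,
                  std::string* /*new_value*/,
                  bool* /*value_changed*/) const override {
        return existing_value.empty();
      }

      // Return true to drop a merge operand before it is written out.
      bool FilterMergeOperand(int /*level*/, const rocksdb::Slice& /*key*/,
                              const rocksdb::Slice& operand) const override {
        return operand.empty();
      }

      const char* Name() const override { return "DropEmptyFilter"; }
    };
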
diff --git a/src/rocksdb/include/rocksdb/compaction_job_stats.h b/src/rocksdb/include/rocksdb/compaction_job_stats.h
new file mode 100644
index 0000000..5331900
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/compaction_job_stats.h
@@ -0,0 +1,85 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+#include <stddef.h>
+#include <stdint.h>
+#include <string>
+
+namespace rocksdb {
+struct CompactionJobStats {
+  CompactionJobStats() { Reset(); }
+  void Reset();
+  // Aggregate the CompactionJobStats from another instance with this one
+  void Add(const CompactionJobStats& stats);
+
+  // the elapsed time of this compaction in microseconds.
+  uint64_t elapsed_micros;
+
+  // the number of compaction input records.
+  uint64_t num_input_records;
+  // the number of compaction input files.
+  size_t num_input_files;
+  // the number of compaction input files at the output level.
+  size_t num_input_files_at_output_level;
+
+  // the number of compaction output records.
+  uint64_t num_output_records;
+  // the number of compaction output files.
+  size_t num_output_files;
+
+  // true if the compaction is a manual compaction
+  bool is_manual_compaction;
+
+  // the size of the compaction input in bytes.
+  uint64_t total_input_bytes;
+  // the size of the compaction output in bytes.
+  uint64_t total_output_bytes;
+
+  // number of records being replaced by a newer record associated with the
+  // same key. This could be a new value or a deletion entry for that key, so
+  // this field sums up all updated and deleted keys.
+  uint64_t num_records_replaced;
+
+  // the sum of the uncompressed input keys in bytes.
+  uint64_t total_input_raw_key_bytes;
+  // the sum of the uncompressed input values in bytes.
+  uint64_t total_input_raw_value_bytes;
+
+  // the number of deletion entries before compaction. Deletion entries
+  // can disappear after compaction because they expired.
+  uint64_t num_input_deletion_records;
+  // number of deletion records that were found obsolete and discarded
+  // because it is not possible to delete any more keys with this entry
+  // (i.e. all possible deletions resulting from it have been completed)
+  uint64_t num_expired_deletion_records;
+
+  // number of corrupt keys (ParseInternalKey returned false when applied to
+  // the key) encountered and written out.
+  uint64_t num_corrupt_keys;
+
+  // The following counters are only populated if
+  // options.compaction_measure_io_stats = true;
+
+  // Time spent on file's Append() call.
+  uint64_t file_write_nanos;
+
+  // Time spent on sync file range.
+  uint64_t file_range_sync_nanos;
+
+  // Time spent on file fsync.
+  uint64_t file_fsync_nanos;
+
+  // Time spent on preparing file write (fallocate, etc.)
+  uint64_t file_prepare_write_nanos;
+
+  // 0-terminated strings storing the first 8 bytes of the smallest and
+  // largest key in the output.
+  static const size_t kMaxPrefixLength = 8;
+
+  std::string smallest_output_key_prefix;
+  std::string largest_output_key_prefix;
+};
+}  // namespace rocksdb
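
Because Add() folds one instance into another, per-job stats can be rolled up over time; a sketch of a trivial aggregator (names hypothetical):

    #include "rocksdb/compaction_job_stats.h"

    struct CompactionTotals {
      rocksdb::CompactionJobStats totals;  // constructor calls Reset()

      // Fold one finished job's counters into the running totals.
      void Observe(const rocksdb::CompactionJobStats& job) { totals.Add(job); }

      // Bytes written per byte read across all observed compactions.
      double WriteAmplification() const {
        if (totals.total_input_bytes == 0) return 0.0;
        return static_cast<double>(totals.total_output_bytes) /
               static_cast<double>(totals.total_input_bytes);
      }
    };
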
diff --git a/src/rocksdb/include/rocksdb/comparator.h b/src/rocksdb/include/rocksdb/comparator.h
index 5b7dc10..8fc2710 100644
--- a/src/rocksdb/include/rocksdb/comparator.h
+++ b/src/rocksdb/include/rocksdb/comparator.h
@@ -29,6 +29,15 @@ class Comparator {
   //   > 0 iff "a" > "b"
   virtual int Compare(const Slice& a, const Slice& b) const = 0;
 
+  // Compares two slices for equality. The following invariant should always
+  // hold (and is the default implementation):
+  //   Equal(a, b) iff Compare(a, b) == 0
+  // Override only if equality comparisons can be done more efficiently than
+  // three-way comparisons.
+  virtual bool Equal(const Slice& a, const Slice& b) const {
+    return Compare(a, b) == 0;
+  }
+
   // The name of the comparator.  Used to check for comparator
   // mismatches (i.e., a DB created with one comparator is
   // accessed using a different comparator).
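
Since Equal() defaults to Compare(a, b) == 0, an override only pays off when equality is cheaper than a three-way compare. A sketch that rejects on length mismatch first (the class name is hypothetical; FindShortestSeparator/FindShortSuccessor, the other pure virtuals in this header, are stubbed):

    #include <cstring>
    #include <string>
    #include "rocksdb/comparator.h"
    #include "rocksdb/slice.h"

    class BytewiseWithFastEqual : public rocksdb::Comparator {
     public:
      int Compare(const rocksdb::Slice& a,
                  const rocksdb::Slice& b) const override {
        return a.compare(b);
      }
      // Cheaper than three-way Compare(): bail out on length mismatch first.
      bool Equal(const rocksdb::Slice& a,
                 const rocksdb::Slice& b) const override {
        return a.size() == b.size() &&
               std::memcmp(a.data(), b.data(), a.size()) == 0;
      }
      const char* Name() const override { return "BytewiseWithFastEqual"; }
      // No-op key shortening keeps this sketch minimal.
      void FindShortestSeparator(std::string*,
                                 const rocksdb::Slice&) const override {}
      void FindShortSuccessor(std::string*) const override {}
    };
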
diff --git a/src/rocksdb/include/rocksdb/convenience.h b/src/rocksdb/include/rocksdb/convenience.h
new file mode 100644
index 0000000..db59727
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/convenience.h
@@ -0,0 +1,83 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+
+#include <unordered_map>
+#include <string>
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+
+namespace rocksdb {
+
+#ifndef ROCKSDB_LITE
+// Take a map of option names and option values, apply them to the
+// base_options, and return the new options as a result.
+//
+// If input_strings_escaped is set to true, then each escaped character
+// prefixed by '\' in the values of the opts_map will be further
+// converted back to the raw string before being assigned to the
+// associated options.
+Status GetColumnFamilyOptionsFromMap(
+    const ColumnFamilyOptions& base_options,
+    const std::unordered_map<std::string, std::string>& opts_map,
+    ColumnFamilyOptions* new_options, bool input_strings_escaped = false);
+
+// Take a map of option names and option values, apply them to the
+// base_options, and return the new options as a result.
+//
+// If input_strings_escaped is set to true, then each escaped character
+// prefixed by '\' in the values of the opts_map will be further
+// converted back to the raw string before being assigned to the
+// associated options.
+Status GetDBOptionsFromMap(
+    const DBOptions& base_options,
+    const std::unordered_map<std::string, std::string>& opts_map,
+    DBOptions* new_options, bool input_strings_escaped = false);
+
+Status GetBlockBasedTableOptionsFromMap(
+    const BlockBasedTableOptions& table_options,
+    const std::unordered_map<std::string, std::string>& opts_map,
+    BlockBasedTableOptions* new_table_options);
+
+// Take a string representation of option names and values, apply them to the
+// base_options, and return the new options as a result. The string has the
+// following format:
+//   "write_buffer_size=1024;max_write_buffer_number=2"
+// Nested options config is also possible. For example, you can define
+// BlockBasedTableOptions as part of the string for block-based table factory:
+//   "write_buffer_size=1024;block_based_table_factory={block_size=4k};"
+//   "max_write_buffer_num=2"
+Status GetColumnFamilyOptionsFromString(
+    const ColumnFamilyOptions& base_options,
+    const std::string& opts_str,
+    ColumnFamilyOptions* new_options);
+
+Status GetDBOptionsFromString(
+    const DBOptions& base_options,
+    const std::string& opts_str,
+    DBOptions* new_options);
+
+Status GetStringFromDBOptions(std::string* opts_str,
+                              const DBOptions& db_options,
+                              const std::string& delimiter = ";  ");
+
+Status GetStringFromColumnFamilyOptions(std::string* opts_str,
+                                        const ColumnFamilyOptions& db_options,
+                                        const std::string& delimiter = ";  ");
+
+Status GetBlockBasedTableOptionsFromString(
+    const BlockBasedTableOptions& table_options,
+    const std::string& opts_str,
+    BlockBasedTableOptions* new_table_options);
+
+Status GetOptionsFromString(const Options& base_options,
+                            const std::string& opts_str, Options* new_options);
+
+// Request stopping background work; if wait is true, wait until it's done.
+void CancelAllBackgroundWork(DB* db, bool wait = false);
+#endif  // ROCKSDB_LITE
+
+}  // namespace rocksdb
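
A sketch of the string form documented above, starting from default options (the option names follow the header's own example):

    #include <cassert>
    #include "rocksdb/convenience.h"

    rocksdb::Options TunedOptions() {
      rocksdb::Options base;   // defaults
      rocksdb::Options tuned;
      rocksdb::Status s = rocksdb::GetOptionsFromString(
          base,
          "write_buffer_size=1024;max_write_buffer_number=2;"
          "block_based_table_factory={block_size=4k}",
          &tuned);
      assert(s.ok());
      return tuned;
    }
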
diff --git a/src/rocksdb/include/rocksdb/db.h b/src/rocksdb/include/rocksdb/db.h
index e5b4838..5a49638 100644
--- a/src/rocksdb/include/rocksdb/db.h
+++ b/src/rocksdb/include/rocksdb/db.h
@@ -22,8 +22,15 @@
 #include "rocksdb/types.h"
 #include "rocksdb/transaction_log.h"
 #include "rocksdb/listener.h"
+#include "rocksdb/snapshot.h"
 #include "rocksdb/thread_status.h"
 
+#ifdef _WIN32
+// Windows API macro interference
+#undef DeleteFile
+#endif
+
+
 namespace rocksdb {
 
 struct Options;
@@ -33,7 +40,9 @@ struct ReadOptions;
 struct WriteOptions;
 struct FlushOptions;
 struct CompactionOptions;
+struct CompactRangeOptions;
 struct TableProperties;
+struct ExternalSstFileInfo;
 class WriteBatch;
 class Env;
 class EventListener;
@@ -61,18 +70,6 @@ struct ColumnFamilyDescriptor {
 static const int kMajorVersion = __ROCKSDB_MAJOR__;
 static const int kMinorVersion = __ROCKSDB_MINOR__;
 
-// Abstract handle to particular state of a DB.
-// A Snapshot is an immutable object and can therefore be safely
-// accessed from multiple threads without any external synchronization.
-class Snapshot {
- public:
-  // returns Snapshot's sequence number
-  virtual SequenceNumber GetSequenceNumber() const = 0;
-
- protected:
-  virtual ~Snapshot();
-};
-
 // A range of keys
 struct Range {
   Slice start;          // Included in the range
@@ -188,6 +185,17 @@ class DB {
     return Delete(options, DefaultColumnFamily(), key);
   }
 
+  // Remove the database entry for "key". Requires that the key exists
+  // and was not overwritten. Returns OK on success, and a non-OK status
+  // on error.  It is not an error if "key" did not exist in the database.
+  // Note: consider setting options.sync = true.
+  virtual Status SingleDelete(const WriteOptions& options,
+                              ColumnFamilyHandle* column_family,
+                              const Slice& key) = 0;
+  virtual Status SingleDelete(const WriteOptions& options, const Slice& key) {
+    return SingleDelete(options, DefaultColumnFamily(), key);
+  }
+
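
SingleDelete() carries a stricter contract than Delete(): the key must have been Put() once and never overwritten. A sketch of the intended pairing (db is an open DB*; the key is hypothetical):

    rocksdb::WriteOptions wopts;
    db->Put(wopts, "session:42", "payload");  // written exactly once
    // ... the key is read but never rewritten ...
    db->SingleDelete(wopts, "session:42");    // cheaper tombstone than Delete()
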
   // Merge the database entry for "key" with "value".  Returns OK on success,
   // and a non-OK status on error. The semantics of this operation is
   // determined by the user provided merge_operator when opening DB.
@@ -316,21 +324,34 @@ class DB {
   //  "rocksdb.compaction-pending" - 1 if at least one compaction is pending
   //  "rocksdb.background-errors" - accumulated number of background errors
   //  "rocksdb.cur-size-active-mem-table"
-  //  "rocksdb.cur-size-all-mem-tables"
-  //  "rocksdb.num-entries-active-mem-table"
-  //  "rocksdb.num-entries-imm-mem-tables"
-  //  "rocksdb.num-deletes-active-mem-table"
-  //  "rocksdb.num-deletes-imm-mem-tables"
-  //  "rocksdb.estimate-num-keys" - estimated keys in the column family
-  //  "rocksdb.estimate-table-readers-mem" - estimated memory used for reding
-  //      SST tables, that is not counted as a part of block cache.
-  //  "rocksdb.is-file-deletions-enabled"
-  //  "rocksdb.num-snapshots"
-  //  "rocksdb.oldest-snapshot-time"
-  //  "rocksdb.num-live-versions" - `version` is an internal data structure.
-  //      See version_set.h for details. More live versions often mean more SST
-  //      files are held from being deleted, by iterators or unfinished
-  //      compactions.
+  //  "rocksdb.size-all-mem-tables"
+  //  "rocksdb.num-entries-active-mem-table"
+  //  "rocksdb.num-entries-imm-mem-tables"
+  //  "rocksdb.num-deletes-active-mem-table"
+  //  "rocksdb.num-deletes-imm-mem-tables"
+  //  "rocksdb.estimate-num-keys" - estimated keys in the column family
+  //  "rocksdb.estimate-table-readers-mem" - estimated memory used for reading
+  //      SST tables, that is not counted as a part of block cache.
+  //  "rocksdb.is-file-deletions-enabled"
+  //  "rocksdb.num-snapshots"
+  //  "rocksdb.oldest-snapshot-time"
+  //  "rocksdb.num-live-versions" - `version` is an internal data structure.
+  //      See version_set.h for details. More live versions often mean more SST
+  //      files are held from being deleted, by iterators or unfinished
+  //      compactions.
+  //  "rocksdb.estimate-live-data-size"
+  //  "rocksdb.total-sst-files-size" - total size of all used sst files; this
+  //      may slow down online queries if there are too many files.
+  //  "rocksdb.base-level"
+  //  "rocksdb.estimate-pending-compaction-bytes" - estimated total number of
+  //      bytes compaction needs to rewrite the data to get all levels down
+  //      to under target size. Not valid for compactions other than
+  //      level-based.
+  //  "rocksdb.aggregated-table-properties" - returns a string representation
+  //      of the aggregated table properties of the target column family.
+  //  "rocksdb.aggregated-table-properties-at-level<N>" - same as the previous
+  //      one but only returns the aggregated table properties of the
+  //      specified level "N" at the target column family, where "N" should be
+  //      replaced by the target level.
 #ifndef ROCKSDB_LITE
   struct Properties {
     static const std::string kNumFilesAtLevelPrefix;
@@ -344,6 +365,7 @@ class DB {
     static const std::string kBackgroundErrors;
     static const std::string kCurSizeActiveMemTable;
     static const std::string kCurSizeAllMemTables;
+    static const std::string kSizeAllMemTables;
     static const std::string kNumEntriesActiveMemTable;
     static const std::string kNumEntriesImmMemTables;
     static const std::string kNumDeletesActiveMemTable;
@@ -354,6 +376,11 @@ class DB {
     static const std::string kNumSnapshots;
     static const std::string kOldestSnapshotTime;
     static const std::string kNumLiveVersions;
+    static const std::string kEstimateLiveDataSize;
+    static const std::string kTotalSstFilesSize;
+    static const std::string kEstimatePendingCompactionBytes;
+    static const std::string kAggregatedTableProperties;
+    static const std::string kAggregatedTablePropertiesAtLevel;
   };
 #endif /* ROCKSDB_LITE */
 
@@ -372,6 +399,7 @@ class DB {
   //  "rocksdb.background-errors"
   //  "rocksdb.cur-size-active-mem-table"
   //  "rocksdb.cur-size-all-mem-tables"
+  //  "rocksdb.size-all-mem-tables"
   //  "rocksdb.num-entries-active-mem-table"
   //  "rocksdb.num-entries-imm-mem-tables"
   //  "rocksdb.num-deletes-active-mem-table"
@@ -382,6 +410,10 @@ class DB {
   //  "rocksdb.num-snapshots"
   //  "rocksdb.oldest-snapshot-time"
   //  "rocksdb.num-live-versions"
+  //  "rocksdb.estimate-live-data-size"
+  //  "rocksdb.total-sst-files-size"
+  //  "rocksdb.base-level"
+  //  "rocksdb.estimate-pending-compaction-bytes"
   virtual bool GetIntProperty(ColumnFamilyHandle* column_family,
                               const Slice& property, uint64_t* value) = 0;
   virtual bool GetIntProperty(const Slice& property, uint64_t* value) {
@@ -395,12 +427,16 @@ class DB {
   // if the user data compresses by a factor of ten, the returned
   // sizes will be one-tenth the size of the corresponding user data size.
   //
-  // The results may not include the sizes of recently written data.
+  // If include_memtable is set to true, then the result will also
+  // include recently written data in the mem-tables if
+  // the mem-table type supports it.
   virtual void GetApproximateSizes(ColumnFamilyHandle* column_family,
-                                   const Range* range, int n,
-                                   uint64_t* sizes) = 0;
-  virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes) {
-    GetApproximateSizes(DefaultColumnFamily(), range, n, sizes);
+                                   const Range* range, int n, uint64_t* sizes,
+                                   bool include_memtable = false) = 0;
+  virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes,
+                                   bool include_memtable = false) {
+    GetApproximateSizes(DefaultColumnFamily(), range, n, sizes,
+                        include_memtable);
   }
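
With the new include_memtable flag, the estimate can also cover recent unflushed writes; a sketch over a single range (db is an open DB*; the bounds are hypothetical, and the two-argument Range constructor is assumed):

    rocksdb::Range r("a", "z");   // estimate for keys in ["a", "z")
    uint64_t bytes = 0;
    db->GetApproximateSizes(&r, 1, &bytes, /*include_memtable=*/true);
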
 
   // Compact the underlying storage for the key range [*begin,*end].
@@ -413,25 +449,52 @@ class DB {
   // begin==nullptr is treated as a key before all keys in the database.
   // end==nullptr is treated as a key after all keys in the database.
   // Therefore the following call will compact the entire database:
-  //    db->CompactRange(nullptr, nullptr);
+  //    db->CompactRange(options, nullptr, nullptr);
   // Note that after the entire database is compacted, all data are pushed
-  // down to the last level containing any data. If the total data size
-  // after compaction is reduced, that level might not be appropriate for
-  // hosting all the files. In this case, client could set reduce_level
-  // to true, to move the files back to the minimum level capable of holding
-  // the data set or a given level (specified by non-negative target_level).
-  // Compaction outputs should be placed in options.db_paths[target_path_id].
-  // Behavior is undefined if target_path_id is out of range.
-  virtual Status CompactRange(ColumnFamilyHandle* column_family,
-                              const Slice* begin, const Slice* end,
-                              bool reduce_level = false, int target_level = -1,
-                              uint32_t target_path_id = 0) = 0;
-  virtual Status CompactRange(const Slice* begin, const Slice* end,
-                              bool reduce_level = false, int target_level = -1,
-                              uint32_t target_path_id = 0) {
-    return CompactRange(DefaultColumnFamily(), begin, end, reduce_level,
-                        target_level, target_path_id);
+  // down to the last level containing any data. If the total data size after
+  // compaction is reduced, that level might not be appropriate for hosting all
+  // the files. In this case, client could set options.change_level to true, to
+  // move the files back to the minimum level capable of holding the data set
+  // or a given level (specified by non-negative options.target_level).
+  virtual Status CompactRange(const CompactRangeOptions& options,
+                              ColumnFamilyHandle* column_family,
+                              const Slice* begin, const Slice* end) = 0;
+  virtual Status CompactRange(const CompactRangeOptions& options,
+                              const Slice* begin, const Slice* end) {
+    return CompactRange(options, DefaultColumnFamily(), begin, end);
   }
+
+#if defined(__GNUC__) || defined(__clang__)
+  __attribute__((deprecated))
+#elif _WIN32
+  __declspec(deprecated)
+#endif
+  virtual Status
+  CompactRange(ColumnFamilyHandle* column_family, const Slice* begin,
+               const Slice* end, bool change_level = false,
+               int target_level = -1, uint32_t target_path_id = 0) {
+    CompactRangeOptions options;
+    options.change_level = change_level;
+    options.target_level = target_level;
+    options.target_path_id = target_path_id;
+    return CompactRange(options, column_family, begin, end);
+  }
+#if defined(__GNUC__) || defined(__clang__)
+  __attribute__((deprecated))
+#elif _WIN32
+  __declspec(deprecated)
+#endif
+  virtual Status
+  CompactRange(const Slice* begin, const Slice* end,
+               bool change_level = false, int target_level = -1,
+               uint32_t target_path_id = 0) {
+    CompactRangeOptions options;
+    options.change_level = change_level;
+    options.target_level = target_level;
+    options.target_path_id = target_path_id;
+    return CompactRange(options, DefaultColumnFamily(), begin, end);
+  }
+
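
The deprecated shims above show exactly how the old flags map onto the new struct; calling the new form directly, a whole-database compaction that then relocates output to the lowest viable level looks like this (sketch; db is an open DB*):

    rocksdb::CompactRangeOptions cro;
    cro.change_level = true;  // move files back down after compacting
    cro.target_level = -1;    // -1: pick the minimum level that can hold them
    db->CompactRange(cro, nullptr, nullptr);  // nullptr bounds = entire DB
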
   virtual Status SetOptions(ColumnFamilyHandle* column_family,
       const std::unordered_map<std::string, std::string>& new_options) {
     return Status::NotSupported("Not implemented");
@@ -441,10 +504,10 @@ class DB {
     return SetOptions(DefaultColumnFamily(), new_options);
   }
 
-  // CompactFiles() inputs a list of files specified by file numbers
-  // and compacts them to the specified level.  Note that the behavior
-  // is different from CompactRange in that CompactFiles() will
-  // perform the compaction job using the CURRENT thread.
+  // CompactFiles() inputs a list of files specified by file numbers and
+  // compacts them to the specified level. Note that the behavior is different
+  // from CompactRange() in that CompactFiles() performs the compaction job
+  // using the CURRENT thread.
   //
   // @see GetDataBaseMetaData
   // @see GetColumnFamilyMetaData
@@ -461,6 +524,13 @@ class DB {
     return CompactFiles(compact_options, DefaultColumnFamily(),
                         input_file_names, output_level, output_path_id);
   }
+
+  // This function will wait until all currently running background processes
+  // finish. After it returns, no background process will be run until
+  // ContinueBackgroundWork() is called.
+  virtual Status PauseBackgroundWork() = 0;
+  virtual Status ContinueBackgroundWork() = 0;
+
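
A sketch of bracketing external work (say, a filesystem-level snapshot) with the pause/continue pair (db is an open DB*):

    rocksdb::Status s = db->PauseBackgroundWork();  // waits for running jobs
    if (s.ok()) {
      // ... do the work while no flush/compaction can run ...
      db->ContinueBackgroundWork();                 // re-enable background jobs
    }
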
   // Number of levels used for this DB.
   virtual int NumberLevels(ColumnFamilyHandle* column_family) = 0;
   virtual int NumberLevels() { return NumberLevels(DefaultColumnFamily()); }
@@ -504,6 +574,12 @@ class DB {
     return Flush(options, DefaultColumnFamily());
   }
 
+  // Sync the WAL. Note that Write() followed by SyncWAL() is not exactly the
+  // same as Write() with sync=true: in the latter case the changes won't be
+  // visible until the sync is done.
+  // Currently only works if allow_mmap_writes = false in Options.
+  virtual Status SyncWAL() = 0;
+
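
Per the comment above, an unsynced Write() followed by SyncWAL() differs from sync=true mainly in when the change becomes visible; a sketch (db is an open DB* with allow_mmap_writes = false):

    rocksdb::WriteOptions wopts;  // wopts.sync remains false
    db->Put(wopts, "k", "v");     // visible to readers right away
    db->SyncWAL();                // now also durable in the WAL
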
   // The sequence number of the most recent transaction.
   virtual SequenceNumber GetLatestSequenceNumber() const = 0;
 
@@ -561,6 +637,8 @@ class DB {
       const TransactionLogIterator::ReadOptions&
           read_options = TransactionLogIterator::ReadOptions()) = 0;
 
+// Windows API macro interference
+#undef DeleteFile
   // Delete the file name from the db directory and update the internal state to
   // reflect that. Supports deletion of sst and log files only. 'name' must be
   // path relative to the db directory. eg. 000001.sst, /archive/000003.log
@@ -585,12 +663,42 @@ class DB {
       ColumnFamilyMetaData* metadata) {
     GetColumnFamilyMetaData(DefaultColumnFamily(), metadata);
   }
+
+  // Load the table file located at "file_path" into "column_family". A
+  // pointer to ExternalSstFileInfo can be used instead of "file_path" to do a
+  // blind add that won't need to read the file; move_file can be set to true
+  // to move the file instead of copying it.
+  //
+  // Current requirements:
+  // (1) Memtable is empty.
+  // (2) All existing files (if any) have sequence number = 0.
+  // (3) The key range in the loaded table file doesn't overlap with the key
+  //     ranges of existing files.
+  // (4) No other writes happen during the AddFile call, otherwise the
+  //     DB may get corrupted.
+  // (5) The database has at least 2 levels.
+  virtual Status AddFile(ColumnFamilyHandle* column_family,
+                         const std::string& file_path,
+                         bool move_file = false) = 0;
+  virtual Status AddFile(const std::string& file_path, bool move_file = false) {
+    return AddFile(DefaultColumnFamily(), file_path, move_file);
+  }
+
+  // Load table file with information "file_info" into "column_family"
+  virtual Status AddFile(ColumnFamilyHandle* column_family,
+                         const ExternalSstFileInfo* file_info,
+                         bool move_file = false) = 0;
+  virtual Status AddFile(const ExternalSstFileInfo* file_info,
+                         bool move_file = false) {
+    return AddFile(DefaultColumnFamily(), file_info, move_file);
+  }
+
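
A sketch of the bulk-load path under requirements (1)-(5) above (db is an open DB*; the .sst path is hypothetical):

    rocksdb::Status s =
        db->AddFile("/bulk/ready-000001.sst", true /* move_file */);
    if (!s.ok()) {
      // e.g. key ranges overlapped existing files; fall back to Put()s.
    }
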
 #endif  // ROCKSDB_LITE
 
   // Sets the globally unique ID created at database creation time by invoking
   // Env::GenerateUniqueId(), in identity. Returns Status::OK if identity could
   // be set properly
-  virtual Status GetDbIdentity(std::string& identity) = 0;
+  virtual Status GetDbIdentity(std::string& identity) const = 0;
 
   // Returns default column family handle
   virtual ColumnFamilyHandle* DefaultColumnFamily() const = 0;
@@ -603,6 +711,9 @@ class DB {
   }
 #endif  // ROCKSDB_LITE
 
+  // Needed for StackableDB
+  virtual DB* GetRootDB() { return this; }
+
  private:
   // No copying allowed
   DB(const DB&);
diff --git a/src/rocksdb/include/rocksdb/db_dump_tool.h b/src/rocksdb/include/rocksdb/db_dump_tool.h
new file mode 100644
index 0000000..67575a9
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/db_dump_tool.h
@@ -0,0 +1,45 @@
+//  Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <string>
+
+#include "rocksdb/db.h"
+
+namespace rocksdb {
+
+struct DumpOptions {
+  // Database that will be dumped
+  std::string db_path;
+  // File location that will contain dump output
+  std::string dump_location;
+  // Don't include the db information header in the dump
+  bool anonymous = false;
+};
+
+class DbDumpTool {
+ public:
+  bool Run(const DumpOptions& dump_options,
+           rocksdb::Options options = rocksdb::Options());
+};
+
+struct UndumpOptions {
+  // Database that we will load the dumped file into
+  std::string db_path;
+  // File location of the dumped file that will be loaded
+  std::string dump_location;
+  // Compact the db after loading the dumped file
+  bool compact_db = false;
+};
+
+class DbUndumpTool {
+ public:
+  bool Run(const UndumpOptions& undump_options,
+           rocksdb::Options options = rocksdb::Options());
+};
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
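
A sketch of a dump/undump round trip with the two tools (paths hypothetical):

    #include "rocksdb/db_dump_tool.h"

    bool RoundTrip() {
      rocksdb::DumpOptions dump;
      dump.db_path = "/data/source_db";
      dump.dump_location = "/backups/db.dump";

      rocksdb::UndumpOptions undump;
      undump.db_path = "/data/restored_db";
      undump.dump_location = "/backups/db.dump";
      undump.compact_db = true;  // compact after loading the dump

      rocksdb::DbDumpTool dumper;
      rocksdb::DbUndumpTool undumper;
      return dumper.Run(dump) && undumper.Run(undump);
    }
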
diff --git a/src/rocksdb/include/rocksdb/delete_scheduler.h b/src/rocksdb/include/rocksdb/delete_scheduler.h
new file mode 100644
index 0000000..788d592
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/delete_scheduler.h
@@ -0,0 +1,66 @@
+//  Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+
+#include <map>
+#include <string>
+
+#include "rocksdb/status.h"
+
+namespace rocksdb {
+
+class Env;
+class Logger;
+
+// DeleteScheduler allows the DB to enforce a rate limit on file deletion.
+// Instead of deleting files immediately, files are moved to trash_dir
+// and deleted in a background thread that applies a sleep penalty between
+// deletes if they are happening at a rate faster than rate_bytes_per_sec.
+//
+// Rate limiting can be turned off by setting rate_bytes_per_sec = 0; in this
+// case DeleteScheduler will delete files immediately.
+class DeleteScheduler {
+ public:
+  virtual ~DeleteScheduler() {}
+
+  // Return delete rate limit in bytes per second
+  virtual int64_t GetRateBytesPerSecond() = 0;
+
+  // Move the file to the trash directory and schedule its deletion
+  virtual Status DeleteFile(const std::string& fname) = 0;
+
+  // Return a map containing errors that happened in the background thread
+  // file_path => error status
+  virtual std::map<std::string, Status> GetBackgroundErrors() = 0;
+
+  // Wait for all files being deleted in the background to finish, or for
+  // the destructor to be called.
+  virtual void WaitForEmptyTrash() = 0;
+};
+
+// Create a new DeleteScheduler that can be shared among multiple RocksDB
+// instances to control the file deletion rate.
+//
+// @env: Pointer to Env object, please see "rocksdb/env.h".
+// @trash_dir: Path to the directory where deleted files will be moved
+//    to be deleted in a background thread while applying rate limiting. If
+//    this directory doesn't exist, it will be created. This directory should
+//    not be used by any other process or any other DeleteScheduler.
+// @rate_bytes_per_sec: How many bytes should be deleted per second. If this
+//    value is set to 1024 (1 KB/sec) and we deleted a file of size 4 KB
+//    in 1 second, we will wait for another 3 seconds before we delete other
+//    files. Set to 0 to disable rate limiting.
+// @info_log: If not nullptr, info_log will be used to log errors.
+// @delete_exisitng_trash: If set to true, the newly created DeleteScheduler
+//    will delete files that already exist in trash_dir.
+// @status: If not nullptr, status will contain any errors that happened while
+//    creating the missing trash_dir or deleting existing files in the trash.
+extern DeleteScheduler* NewDeleteScheduler(
+    Env* env, const std::string& trash_dir, int64_t rate_bytes_per_sec,
+    std::shared_ptr<Logger> info_log = nullptr,
+    bool delete_exisitng_trash = true, Status* status = nullptr);
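+
+// Editor's sketch (hypothetical, not part of upstream RocksDB): limiting
+// deletes to 1 MB/s across two DB instances sharing one scheduler:
+//
+//   Status s;
+//   std::shared_ptr<DeleteScheduler> ds(
+//       NewDeleteScheduler(Env::Default(), "/data/trash", 1024 * 1024,
+//                          nullptr, true, &s));
+//   db_options1.delete_scheduler = ds;
+//   db_options2.delete_scheduler = ds;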
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/include/rocksdb/env.h b/src/rocksdb/include/rocksdb/env.h
index 2fb9242..57c60f0 100644
--- a/src/rocksdb/include/rocksdb/env.h
+++ b/src/rocksdb/include/rocksdb/env.h
@@ -17,15 +17,21 @@
 #ifndef STORAGE_ROCKSDB_INCLUDE_ENV_H_
 #define STORAGE_ROCKSDB_INCLUDE_ENV_H_
 
+#include <stdint.h>
 #include <cstdarg>
-#include <string>
-#include <memory>
 #include <limits>
+#include <memory>
+#include <string>
 #include <vector>
-#include <stdint.h>
 #include "rocksdb/status.h"
 #include "rocksdb/thread_status.h"
 
+#ifdef _WIN32
+// Windows API macro interference
+#undef DeleteFile
+#undef GetCurrentTime
+#endif
+
 namespace rocksdb {
 
 class FileLock;
@@ -34,11 +40,11 @@ class RandomAccessFile;
 class SequentialFile;
 class Slice;
 class WritableFile;
-class RandomRWFile;
 class Directory;
 struct DBOptions;
 class RateLimiter;
 class ThreadStatusUpdater;
+struct ThreadStatus;
 
 using std::unique_ptr;
 using std::shared_ptr;
@@ -62,6 +68,9 @@ struct EnvOptions {
    // If true, then use mmap to write data
   bool use_mmap_writes = true;
 
+  // If false, fallocate() calls are bypassed
+  bool allow_fallocate = true;
+
   // If true, set the FD_CLOEXEC on open fd.
   bool set_fd_cloexec = true;
 
@@ -130,15 +139,6 @@ class Env {
                                  unique_ptr<WritableFile>* result,
                                  const EnvOptions& options) = 0;
 
-  // Create an object that both reads and writes to a file on
-  // specified offsets (random access). If file already exists,
-  // does not overwrite it. On success, stores a pointer to the
-  // new file in *result and returns OK. On failure stores nullptr
-  // in *result and returns non-OK.
-  virtual Status NewRandomRWFile(const std::string& fname,
-                                 unique_ptr<RandomRWFile>* result,
-                                 const EnvOptions& options) = 0;
-
   // Create an object that represents a directory. Will fail if directory
   // doesn't exist. If the directory exists, it will open the directory
   // and create a new Directory object.
@@ -149,8 +149,12 @@ class Env {
   virtual Status NewDirectory(const std::string& name,
                               unique_ptr<Directory>* result) = 0;
 
-  // Returns true iff the named file exists.
-  virtual bool FileExists(const std::string& fname) = 0;
+  // Returns OK if the named file exists.
+  //         NotFound if the named file does not exist,
+  //                  the calling process does not have permission to determine
+  //                  whether this file exists, or if the path is invalid.
+  //         IOError if an I/O error was encountered.
+  virtual Status FileExists(const std::string& fname) = 0;
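+  //
+  // Editor's sketch (hypothetical, not part of upstream RocksDB): callers
+  // migrating from the old bool-returning API can now distinguish "absent"
+  // from "error":
+  //
+  //   Status s = env->FileExists(fname);
+  //   if (s.ok()) { /* exists */ }
+  //   else if (s.IsNotFound()) { /* absent or undeterminable */ }
+  //   else { /* I/O error */ }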
 
   // Store in *result the names of the children of the specified directory.
   // The names are relative to "dir".
@@ -256,6 +260,8 @@ class Env {
 
   // Returns the number of micro-seconds since some fixed point in time. Only
   // useful for computing deltas of time.
+  // However, it is often used as a system clock, e.g. in GenericRateLimiter
+  // and other places, so a port needs to return system time in order to work.
   virtual uint64_t NowMicros() = 0;
 
   // Returns the number of nano-seconds since some fixed point in time. Only
@@ -320,6 +326,9 @@ class Env {
     return thread_status_updater_;
   }
 
+  // Returns the ID of the current thread.
+  virtual uint64_t GetThreadID() const;
+
  protected:
   // The pointer to an internal structure that will update the
   // status of each thread.
@@ -387,6 +396,12 @@ class RandomAccessFile {
   virtual Status Read(uint64_t offset, size_t n, Slice* result,
                       char* scratch) const = 0;
 
+  // Used by the file_reader_writer to decide if the ReadAhead wrapper
+  // should simply forward the call and not enact buffering or locking.
+  virtual bool ShouldForwardRawRequest() const {
+    return false;
+  }
+
   // Tries to get an unique ID for this file that will be the same each time
   // the file is opened (and will stay the same while the file is open).
   // Furthermore, it tries to make this ID at most "max_size" bytes. If such an
@@ -407,7 +422,6 @@ class RandomAccessFile {
               // compatibility.
   };
 
-
   enum AccessPattern { NORMAL, RANDOM, SEQUENTIAL, WILLNEED, DONTNEED };
 
   virtual void Hint(AccessPattern pattern) {}
@@ -432,7 +446,35 @@ class WritableFile {
   }
   virtual ~WritableFile();
 
+  // Indicates if the class makes use of unbuffered I/O
+  virtual bool UseOSBuffer() const {
+    return true;
+  }
+
+  const size_t c_DefaultPageSize = 4 * 1024;
+
+  // This is needed when you want to allocate
+  // AlignedBuffer for use with file I/O classes
+  // Used for unbuffered file I/O when UseOSBuffer() returns false
+  virtual size_t GetRequiredBufferAlignment() const {
+    return c_DefaultPageSize;
+  }
+
   virtual Status Append(const Slice& data) = 0;
+
+  // Positioned write for unbuffered access; the default forwards
+  // to a simple append, as most of the tests are buffered by default.
+  virtual Status PositionedAppend(const Slice& /* data */, uint64_t /* offset */) {
+    return Status::NotSupported();
+  }
+
+  // Truncate is necessary to trim the file to the correct size
+  // before closing. It is not always possible to keep track of the file
+  // size due to whole-page writes. The behavior is undefined if called
+  // with other writes to follow.
+  virtual Status Truncate(uint64_t size) {
+    return Status::OK();
+  }
   virtual Status Close() = 0;
   virtual Status Flush() = 0;
   virtual Status Sync() = 0; // sync data
@@ -447,6 +489,16 @@ class WritableFile {
     return Sync();
   }
 
+  // true if Sync() and Fsync() are safe to call concurrently with Append()
+  // and Flush().
+  virtual bool IsSyncThreadSafe() const {
+    return false;
+  }
+
+  // Indicates to the upper layers whether the current WritableFile
+  // implementation uses direct I/O.
+  virtual bool UseDirectIO() const { return false; }
+
   /*
    * Change the priority in rate limiter if rate limiting is enabled.
    * If rate limiting is not enabled, this call has no effect.
@@ -455,6 +507,8 @@ class WritableFile {
     io_priority_ = pri;
   }
 
+  virtual Env::IOPriority GetIOPriority() { return io_priority_; }
+
   /*
    * Get the size of valid data in the file.
    */
@@ -491,7 +545,14 @@ class WritableFile {
     return Status::NotSupported("InvalidateCache not supported.");
   }
 
- protected:
+  // Sync a file range with disk.
+  // offset is the starting byte of the file range to be synchronized.
+  // nbytes specifies the length of the range to be synchronized.
+  // This asks the OS to initiate flushing the cached data to disk,
+  // without waiting for completion.
+  // Default implementation does nothing.
+  virtual Status RangeSync(off_t offset, off_t nbytes) { return Status::OK(); }
+
   // PrepareWrite performs any necessary preparation for a write
   // before the write actually occurs.  This allows for pre-allocation
   // of space on devices where it can result in less file
@@ -516,6 +577,7 @@ class WritableFile {
     }
   }
 
+ protected:
   /*
    * Pre-allocate space for a file.
    */
@@ -523,16 +585,6 @@ class WritableFile {
     return Status::OK();
   }
 
-  // Sync a file range with disk.
-  // offset is the starting byte of the file range to be synchronized.
-  // nbytes specifies the length of the range to be synchronized.
-  // This asks the OS to initiate flushing the cached data to disk,
-  // without waiting for completion.
-  // Default implementation does nothing.
-  virtual Status RangeSync(off_t offset, off_t nbytes) {
-    return Status::OK();
-  }
-
   size_t preallocation_block_size() { return preallocation_block_size_; }
 
  private:
@@ -543,56 +595,9 @@ class WritableFile {
   void operator=(const WritableFile&);
 
  protected:
-  Env::IOPriority io_priority_;
-};
+  friend class WritableFileWrapper;
 
-// A file abstraction for random reading and writing.
-class RandomRWFile {
- public:
-  RandomRWFile() {}
-  virtual ~RandomRWFile() {}
-
-  // Write data from Slice data to file starting from offset
-  // Returns IOError on failure, but does not guarantee
-  // atomicity of a write.  Returns OK status on success.
-  //
-  // Safe for concurrent use.
-  virtual Status Write(uint64_t offset, const Slice& data) = 0;
-  // Read up to "n" bytes from the file starting at "offset".
-  // "scratch[0..n-1]" may be written by this routine.  Sets "*result"
-  // to the data that was read (including if fewer than "n" bytes were
-  // successfully read).  May set "*result" to point at data in
-  // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when
-  // "*result" is used.  If an error was encountered, returns a non-OK
-  // status.
-  //
-  // Safe for concurrent use by multiple threads.
-  virtual Status Read(uint64_t offset, size_t n, Slice* result,
-                      char* scratch) const = 0;
-  virtual Status Close() = 0; // closes the file
-  virtual Status Sync() = 0; // sync data
-
-  /*
-   * Sync data and/or metadata as well.
-   * By default, sync only data.
-   * Override this method for environments where we need to sync
-   * metadata as well.
-   */
-  virtual Status Fsync() {
-    return Sync();
-  }
-
-  /*
-   * Pre-allocate space for a file.
-   */
-  virtual Status Allocate(off_t offset, off_t len) {
-    return Status::OK();
-  }
-
- private:
-  // No copying allowed
-  RandomRWFile(const RandomRWFile&);
-  void operator=(const RandomRWFile&);
+  Env::IOPriority io_priority_;
 };
 
 // Directory object represents collection of files and implements
@@ -600,7 +605,7 @@ class RandomRWFile {
 class Directory {
  public:
   virtual ~Directory() {}
-  // Fsync directory
+  // Fsync directory. Can be called concurrently from multiple threads.
   virtual Status Fsync() = 0;
 };
 
@@ -610,6 +615,7 @@ enum InfoLogLevel : unsigned char {
   WARN_LEVEL,
   ERROR_LEVEL,
   FATAL_LEVEL,
+  HEADER_LEVEL,
   NUM_INFO_LOG_LEVELS,
 };
 
@@ -638,27 +644,8 @@ class Logger {
   // and format.  Any log with level under the internal log level
   // of *this (see @SetInfoLogLevel and @GetInfoLogLevel) will not be
   // printed.
-  virtual void Logv(const InfoLogLevel log_level, const char* format, va_list ap) {
-    static const char* kInfoLogLevelNames[5] = {"DEBUG", "INFO", "WARN",
-                                                "ERROR", "FATAL"};
-    if (log_level < log_level_) {
-      return;
-    }
+  virtual void Logv(const InfoLogLevel log_level, const char* format, va_list ap);
 
-    if (log_level == InfoLogLevel::INFO_LEVEL) {
-      // Doesn't print log level if it is INFO level.
-      // This is to avoid unexpected performance regression after we add
-      // the feature of log level. All the logs before we add the feature
-      // are INFO level. We don't want to add extra costs to those existing
-      // logging.
-      Logv(format, ap);
-    } else {
-      char new_format[500];
-      snprintf(new_format, sizeof(new_format) - 1, "[%s] %s",
-               kInfoLogLevelNames[log_level], format);
-      Logv(new_format, ap);
-    }
-  }
   virtual size_t GetLogFileSize() const { return kDoNotSupportGetLogFileSize; }
   // Flush to the OS buffers
   virtual void Flush() {}
@@ -762,15 +749,11 @@ class EnvWrapper : public Env {
                          const EnvOptions& options) override {
     return target_->NewWritableFile(f, r, options);
   }
-  Status NewRandomRWFile(const std::string& f, unique_ptr<RandomRWFile>* r,
-                         const EnvOptions& options) override {
-    return target_->NewRandomRWFile(f, r, options);
-  }
   virtual Status NewDirectory(const std::string& name,
                               unique_ptr<Directory>* result) override {
     return target_->NewDirectory(name, result);
   }
-  bool FileExists(const std::string& f) override {
+  Status FileExists(const std::string& f) override {
     return target_->FileExists(f);
   }
   Status GetChildren(const std::string& dir,
@@ -874,10 +857,61 @@ class EnvWrapper : public Env {
     return target_->GetThreadStatusUpdater();
   }
 
+  uint64_t GetThreadID() const override {
+    return target_->GetThreadID();
+  }
+
  private:
   Env* target_;
 };
 
+// An implementation of WritableFile that forwards all calls to another
+// WritableFile. May be useful to clients who wish to override just part of the
+// functionality of another WritableFile.
+// It's declared as friend of WritableFile to allow forwarding calls to
+// protected virtual methods.
+class WritableFileWrapper : public WritableFile {
+ public:
+  explicit WritableFileWrapper(WritableFile* t) : target_(t) { }
+
+  Status Append(const Slice& data) override { return target_->Append(data); }
+  Status PositionedAppend(const Slice& data, uint64_t offset) override {
+    return target_->PositionedAppend(data, offset);
+  }
+  Status Truncate(uint64_t size) override { return target_->Truncate(size); }
+  Status Close() override { return target_->Close(); }
+  Status Flush() override { return target_->Flush(); }
+  Status Sync() override { return target_->Sync(); }
+  Status Fsync() override { return target_->Fsync(); }
+  bool IsSyncThreadSafe() const override { return target_->IsSyncThreadSafe(); }
+  void SetIOPriority(Env::IOPriority pri) override {
+    target_->SetIOPriority(pri);
+  }
+  Env::IOPriority GetIOPriority() override { return target_->GetIOPriority(); }
+  uint64_t GetFileSize() override { return target_->GetFileSize(); }
+  void GetPreallocationStatus(size_t* block_size,
+                              size_t* last_allocated_block) override {
+    target_->GetPreallocationStatus(block_size, last_allocated_block);
+  }
+  size_t GetUniqueId(char* id, size_t max_size) const override {
+    return target_->GetUniqueId(id, max_size);
+  }
+  Status InvalidateCache(size_t offset, size_t length) override {
+    return target_->InvalidateCache(offset, length);
+  }
+
+ protected:
+  Status Allocate(off_t offset, off_t len) override {
+    return target_->Allocate(offset, len);
+  }
+  Status RangeSync(off_t offset, off_t nbytes) override {
+    return target_->RangeSync(offset, nbytes);
+  }
+
+ private:
+  WritableFile* target_;
+};
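+
+// Editor's sketch (hypothetical, not part of upstream RocksDB): a wrapper
+// that counts appended bytes while forwarding everything else:
+//
+//   class CountingFile : public WritableFileWrapper {
+//    public:
+//     explicit CountingFile(WritableFile* t) : WritableFileWrapper(t) {}
+//     Status Append(const Slice& data) override {
+//       bytes_written_ += data.size();
+//       return WritableFileWrapper::Append(data);
+//     }
+//     uint64_t bytes_written_ = 0;
+//   };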
+
 // Returns a new environment that stores its data in memory and delegates
 // all non-file-storage tasks to base_env. The caller must delete the result
 // when it is no longer needed.
diff --git a/src/rocksdb/include/rocksdb/immutable_options.h b/src/rocksdb/include/rocksdb/immutable_options.h
index 1551d26..589f14e 100644
--- a/src/rocksdb/include/rocksdb/immutable_options.h
+++ b/src/rocksdb/include/rocksdb/immutable_options.h
@@ -35,8 +35,6 @@ struct ImmutableCFOptions {
 
   CompactionFilterFactory* compaction_filter_factory;
 
-  CompactionFilterFactoryV2* compaction_filter_factory_v2;
-
   bool inplace_update_support;
 
   UpdateStatus (*inplace_callback)(char* existing_value,
@@ -91,15 +89,19 @@ struct ImmutableCFOptions {
 
   Options::AccessHint access_hint_on_compaction_start;
 
+  bool new_table_reader_for_compaction_inputs;
+
+  size_t compaction_readahead_size;
+
   int num_levels;
 
   bool optimize_filters_for_hits;
 
-#ifndef ROCKSDB_LITE
   // A vector of EventListeners which call-back functions will be called
   // when specific RocksDB event happens.
   std::vector<std::shared_ptr<EventListener>> listeners;
-#endif  // ROCKSDB_LITE
+
+  std::shared_ptr<Cache> row_cache;
 };
 
 }  // namespace rocksdb
diff --git a/src/rocksdb/include/rocksdb/iostats_context.h b/src/rocksdb/include/rocksdb/iostats_context.h
index e06ee17..e81092b 100644
--- a/src/rocksdb/include/rocksdb/iostats_context.h
+++ b/src/rocksdb/include/rocksdb/iostats_context.h
@@ -2,14 +2,16 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree. An additional grant
 // of patent rights can be found in the PATENTS file in the same directory.
-
-#ifndef INCLUDE_ROCKSDB_IOSTATS_CONTEXT_H_
-#define INCLUDE_ROCKSDB_IOSTATS_CONTEXT_H_
+#pragma once
 
 #include <stdint.h>
 #include <string>
 
+#include "rocksdb/perf_level.h"
+
 // A thread local context for gathering io-stats efficiently and transparently.
+// Use SetPerfLevel(PerfLevel::kEnableTime) to enable time stats.
+
 namespace rocksdb {
 
 struct IOStatsContext {
@@ -25,12 +27,31 @@ struct IOStatsContext {
   uint64_t bytes_written;
   // number of bytes that has been read.
   uint64_t bytes_read;
+
+  // time spent in open() and fopen().
+  uint64_t open_nanos;
+  // time spent in fallocate().
+  uint64_t allocate_nanos;
+  // time spent in write() and pwrite().
+  uint64_t write_nanos;
+  // time spent in read() and pread()
+  uint64_t read_nanos;
+  // time spent in sync_file_range().
+  uint64_t range_sync_nanos;
+  // time spent in fsync
+  uint64_t fsync_nanos;
+  // time spent in preparing write (fallocate etc).
+  uint64_t prepare_write_nanos;
+  // time spent in Logger::Logv().
+  uint64_t logger_nanos;
 };
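+
+// Editor's sketch (hypothetical, not part of upstream RocksDB): sampling the
+// write syscall time around a Put(), assuming time stats are enabled:
+//
+//   rocksdb::SetPerfLevel(rocksdb::kEnableTime);
+//   uint64_t before = rocksdb::iostats_context.write_nanos;
+//   db->Put(rocksdb::WriteOptions(), "key", "value");
+//   uint64_t spent = rocksdb::iostats_context.write_nanos - before;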
 
 #ifndef IOS_CROSS_COMPILE
+# ifdef _WIN32
+extern __declspec(thread) IOStatsContext iostats_context;
+# else
 extern __thread IOStatsContext iostats_context;
+# endif
 #endif  // IOS_CROSS_COMPILE
 
 }  // namespace rocksdb
-
-#endif  // INCLUDE_ROCKSDB_IOSTATS_CONTEXT_H_
diff --git a/src/rocksdb/include/rocksdb/listener.h b/src/rocksdb/include/rocksdb/listener.h
index 7f70d1c..f693d5c 100644
--- a/src/rocksdb/include/rocksdb/listener.h
+++ b/src/rocksdb/include/rocksdb/listener.h
@@ -4,28 +4,100 @@
 
 #pragma once
 
-#ifndef ROCKSDB_LITE
-
 #include <string>
 #include <vector>
+#include "rocksdb/compaction_job_stats.h"
 #include "rocksdb/status.h"
+#include "rocksdb/table_properties.h"
 
 namespace rocksdb {
 
 class DB;
 class Status;
+struct CompactionJobStats;
+
+struct TableFileCreationInfo {
+  TableFileCreationInfo() = default;
+  explicit TableFileCreationInfo(TableProperties&& prop) :
+      table_properties(prop) {}
+  // the name of the database where the file was created
+  std::string db_name;
+  // the name of the column family where the file was created.
+  std::string cf_name;
+  // the path to the created file.
+  std::string file_path;
+  // the size of the file.
+  uint64_t file_size;
+  // the id of the job (which could be flush or compaction) that
+  // created the file.
+  int job_id;
+  // Detailed properties of the created file.
+  TableProperties table_properties;
+};
+
+
+#ifndef ROCKSDB_LITE
+
+struct TableFileDeletionInfo {
+  // The name of the database where the file was deleted.
+  std::string db_name;
+  // The path to the deleted file.
+  std::string file_path;
+  // The id of the job which deleted the file.
+  int job_id;
+  // The status indicating whether the deletion was successful or not.
+  Status status;
+};
+
+struct FlushJobInfo {
+  // the name of the column family
+  std::string cf_name;
+  // the path to the newly created file
+  std::string file_path;
+  // the id of the thread that completed this flush job.
+  uint64_t thread_id;
+  // the job id, which is unique in the same thread.
+  int job_id;
+  // If true, then rocksdb is currently slowing down all writes to prevent
+  // creating too many Level 0 files, as compaction does not seem able to
+  // keep up with the write request speed.  This indicates that there are
+  // too many files in Level 0.
+  bool triggered_writes_slowdown;
+  // If true, then rocksdb is currently blocking any writes to prevent
+  // creating more L0 files.  This indicates that there are too many
+  // files in level 0.  Compactions should try to compact L0 files down
+  // to lower levels as soon as possible.
+  bool triggered_writes_stop;
+  // The smallest sequence number in the newly created file
+  SequenceNumber smallest_seqno;
+  // The largest sequence number in the newly created file
+  SequenceNumber largest_seqno;
+};
 
 struct CompactionJobInfo {
+  CompactionJobInfo() = default;
+  explicit CompactionJobInfo(const CompactionJobStats& _stats) :
+      stats(_stats) {}
+
   // the name of the column family where the compaction happened.
   std::string cf_name;
   // the status indicating whether the compaction was successful or not.
   Status status;
+  // the id of the thread that completed this compaction job.
+  uint64_t thread_id;
+  // the job id, which is unique in the same thread.
+  int job_id;
+  // the smallest input level of the compaction.
+  int base_input_level;
   // the output level of the compaction.
   int output_level;
   // the names of the compaction input files.
   std::vector<std::string> input_files;
   // the names of the compaction output files.
   std::vector<std::string> output_files;
+  // Detailed information about this compaction.
+  CompactionJobStats stats;
 };
 
 // EventListener class contains a set of call-back functions that will
@@ -66,24 +138,22 @@ class EventListener {
  // Note that this function must be implemented in a way such that
   // it should not run for an extended period of time before the function
   // returns.  Otherwise, RocksDB may be blocked.
-  //
-  // @param db a pointer to the rocksdb instance which just flushed
-  //     a memtable to disk.
-  // @param column_family_id the id of the flushed column family.
-  // @param file_path the path to the newly created file.
-  // @param triggered_writes_slowdown true when rocksdb is currently
-  //     slowing-down all writes to prevent creating too many Level 0
-  //     files as compaction seems not able to catch up the write request
-  //     speed.  This indicates that there're too many files in Level 0.
-  // @param triggered_writes_stop true when rocksdb is currently blocking
-  //     any writes to prevent creating more L0 files.  This indicates that
-  //     there're too many files in level 0.  Compactions should try to
-  //     compact L0 files down to lower levels as soon as possible.
   virtual void OnFlushCompleted(
-      DB* db, const std::string& column_family_name,
-      const std::string& file_path,
-      bool triggered_writes_slowdown,
-      bool triggered_writes_stop) {}
+      DB* db, const FlushJobInfo& flush_job_info) {}
+
+  // A call-back function for RocksDB which will be called whenever
+  // an SST file is deleted.  Unlike OnCompactionCompleted and
+  // OnFlushCompleted, this call-back is designed for an external logging
+  // service and thus only provides string parameters instead
+  // of a pointer to the DB.  Applications that build logic based
+  // on file creations and deletions are advised to implement
+  // OnFlushCompleted and OnCompactionCompleted instead.
+  //
+  // Note that if applications would like to use the passed reference
+  // outside this function call, they should make copies from the
+  // returned value.
+  virtual void OnTableFileDeleted(
+      const TableFileDeletionInfo& info) {}
 
   // A call-back function for RocksDB which will be called whenever
   // a registered RocksDB compacts a file. The default implementation
@@ -99,9 +169,29 @@ class EventListener {
   //  after this function is returned, and must be copied if it is needed
   //  outside of this function.
   virtual void OnCompactionCompleted(DB *db, const CompactionJobInfo& ci) {}
+
+  // A call-back function for RocksDB which will be called whenever
+  // an SST file is created.  Unlike OnCompactionCompleted and
+  // OnFlushCompleted, this call-back is designed for an external logging
+  // service and thus only provides string parameters instead
+  // of a pointer to the DB.  Applications that build logic based
+  // on file creations and deletions are advised to implement
+  // OnFlushCompleted and OnCompactionCompleted instead.
+  //
+  // Note that if applications would like to use the passed reference
+  // outside this function call, they should make copies from the
+  // returned value.
+  virtual void OnTableFileCreated(
+      const TableFileCreationInfo& info) {}
+
   virtual ~EventListener() {}
 };
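+
+// Editor's sketch (hypothetical, not part of upstream RocksDB): a listener
+// that reacts to finished flushes, registered through DBOptions::listeners:
+//
+//   class FlushLogger : public rocksdb::EventListener {
+//     void OnFlushCompleted(rocksdb::DB* db,
+//                           const rocksdb::FlushJobInfo& info) override {
+//       // e.g. record info.cf_name and info.file_path somewhere
+//     }
+//   };
+//   options.listeners.push_back(std::make_shared<FlushLogger>());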
 
-}  // namespace rocksdb
+#else
+
+class EventListener {
+};
 
 #endif  // ROCKSDB_LITE
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/include/rocksdb/memtablerep.h b/src/rocksdb/include/rocksdb/memtablerep.h
index c369e88..f02c2d0 100644
--- a/src/rocksdb/include/rocksdb/memtablerep.h
+++ b/src/rocksdb/include/rocksdb/memtablerep.h
@@ -103,6 +103,11 @@ class MemTableRep {
   virtual void Get(const LookupKey& k, void* callback_args,
                    bool (*callback_func)(void* arg, const char* entry));
 
+  virtual uint64_t ApproximateNumEntries(const Slice& start_ikey,
+                                         const Slice& end_key) {
+    return 0;
+  }
+
   // Report an approximation of how much memory has been used other than memory
   // that was allocated through the allocator.
   virtual size_t ApproximateMemoryUsage() = 0;
diff --git a/src/rocksdb/include/rocksdb/merge_operator.h b/src/rocksdb/include/rocksdb/merge_operator.h
index 2ae64c1..05b66f2 100644
--- a/src/rocksdb/include/rocksdb/merge_operator.h
+++ b/src/rocksdb/include/rocksdb/merge_operator.h
@@ -6,9 +6,10 @@
 #ifndef STORAGE_ROCKSDB_INCLUDE_MERGE_OPERATOR_H_
 #define STORAGE_ROCKSDB_INCLUDE_MERGE_OPERATOR_H_
 
+#include <deque>
 #include <memory>
 #include <string>
-#include <deque>
+
 #include "rocksdb/slice.h"
 
 namespace rocksdb {
@@ -54,7 +55,8 @@ class MergeOperator {
   //                   merge operation semantics
   // existing: (IN)    null indicates that the key does not exist before this op
   // operand_list:(IN) the sequence of merge operations to apply, front() first.
-  // new_value:(OUT)   Client is responsible for filling the merge result here
+  // new_value:(OUT)   Client is responsible for filling the merge result here.
+  // The string that new_value is pointing to will be empty.
   // logger:   (IN)    Client could use this to log errors during merge.
   //
   // Return true on success.
@@ -80,6 +82,8 @@ class MergeOperator {
   // DB::Merge(key, *new_value) would yield the same result as a call
   // to DB::Merge(key, left_op) followed by DB::Merge(key, right_op).
   //
+  // The string that new_value is pointing to will be empty.
+  //
   // The default implementation of PartialMergeMulti will use this function
   // as a helper, for backward compatibility.  Any successor class of
   // MergeOperator should either implement PartialMerge or PartialMergeMulti,
@@ -116,6 +120,8 @@ class MergeOperator {
  // the same result as sequential individual calls to DB::Merge(key, operand)
   // for each operand in operand_list from front() to back().
   //
+  // The string that new_value is pointing to will be empty.
+  //
   // The PartialMergeMulti function will be called only when the list of
   // operands are long enough. The minimum amount of operands that will be
   // passed to the function are specified by the "min_partial_merge_operands"
@@ -147,7 +153,8 @@ class AssociativeMergeOperator : public MergeOperator {
   // key:           (IN) The key that's associated with this merge operation.
   // existing_value:(IN) null indicates the key does not exist before this op
   // value:         (IN) the value to update/merge the existing_value with
-  // new_value:    (OUT) Client is responsible for filling the merge result here
+  // new_value:    (OUT) Client is responsible for filling the merge result
+  // here. The string that new_value is pointing to will be empty.
   // logger:        (IN) Client could use this to log errors during merge.
   //
   // Return true on success.
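+  //
+  // Editor's sketch (hypothetical, not part of upstream RocksDB): a counter
+  // stored as a decimal string, merged associatively:
+  //
+  //   class CounterMerge : public AssociativeMergeOperator {
+  //    public:
+  //     bool Merge(const Slice& key, const Slice* existing_value,
+  //                const Slice& value, std::string* new_value,
+  //                Logger* logger) const override {
+  //       uint64_t base =
+  //           existing_value ? std::stoull(existing_value->ToString()) : 0;
+  //       *new_value = std::to_string(base + std::stoull(value.ToString()));
+  //       return true;
+  //     }
+  //     const char* Name() const override { return "CounterMerge"; }
+  //   };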
diff --git a/src/rocksdb/include/rocksdb/metadata.h b/src/rocksdb/include/rocksdb/metadata.h
index e026fa9..7cdf4a1 100644
--- a/src/rocksdb/include/rocksdb/metadata.h
+++ b/src/rocksdb/include/rocksdb/metadata.h
@@ -3,14 +3,16 @@
 // LICENSE file in the root directory of this source tree. An additional grant
 // of patent rights can be found in the PATENTS file in the same directory.
 
+#pragma once
+
+#include <stdint.h>
+
 #include <limits>
 #include <string>
 #include <vector>
 
 #include "rocksdb/types.h"
 
-#pragma once
-
 namespace rocksdb {
 struct ColumnFamilyMetaData;
 struct LevelMetaData;
diff --git a/src/rocksdb/include/rocksdb/options.h b/src/rocksdb/include/rocksdb/options.h
index ea11b81..16aa378 100644
--- a/src/rocksdb/include/rocksdb/options.h
+++ b/src/rocksdb/include/rocksdb/options.h
@@ -15,19 +15,21 @@
 #include <memory>
 #include <vector>
 #include <limits>
-#include <stdint.h>
 #include <unordered_map>
 
 #include "rocksdb/version.h"
 #include "rocksdb/listener.h"
 #include "rocksdb/universal_compaction.h"
 
+#ifdef max
+#undef max
+#endif
+
 namespace rocksdb {
 
 class Cache;
 class CompactionFilter;
 class CompactionFilterFactory;
-class CompactionFilterFactoryV2;
 class Comparator;
 class Env;
 enum InfoLogLevel : unsigned char;
@@ -39,6 +41,7 @@ class TableFactory;
 class MemTableRepFactory;
 class TablePropertiesCollectorFactory;
 class RateLimiter;
+class DeleteScheduler;
 class Slice;
 class SliceTransform;
 class Statistics;
@@ -51,16 +54,16 @@ class InternalKeyComparator;
 enum CompressionType : char {
   // NOTE: do not change the values of existing entries, as these are
   // part of the persistent format on disk.
-  kNoCompression = 0x0, kSnappyCompression = 0x1, kZlibCompression = 0x2,
-  kBZip2Compression = 0x3, kLZ4Compression = 0x4, kLZ4HCCompression = 0x5
+  kNoCompression = 0x0,
+  kSnappyCompression = 0x1,
+  kZlibCompression = 0x2,
+  kBZip2Compression = 0x3,
+  kLZ4Compression = 0x4,
+  kLZ4HCCompression = 0x5,
+  // zstd format is not finalized yet so it's subject to changes.
+  kZSTDNotFinalCompression = 0x40,
 };
 
-// returns true if RocksDB was correctly linked with compression library and
-// supports the compression type
-extern bool CompressionTypeSupported(CompressionType compression_type);
-// Returns a human-readable name of the compression type
-extern const char* CompressionTypeToString(CompressionType compression_type);
-
 enum CompactionStyle : char {
   // level based compaction style
   kCompactionStyleLevel = 0x0,
@@ -76,6 +79,36 @@ enum CompactionStyle : char {
   kCompactionStyleNone = 0x3,
 };
 
+enum CompactionPri : char {
+  // Slightly prioritize larger files, by size compensated by #deletes
+  kCompactionPriByCompensatedSize = 0x0,
+  // First compact files whose data is oldest.
+  kCompactionPriByLargestSeq = 0x1,
+};
+
+enum class WALRecoveryMode : char {
+  // Original LevelDB recovery
+  // We tolerate an incomplete record in the trailing data on all logs
+  // Use case : This is the legacy behavior (default)
+  kTolerateCorruptedTailRecords = 0x00,
+  // Recover from a clean shutdown
+  // We don't expect to find any corruption in the WAL
+  // Use case : Ideal for unit tests and the rare applications that
+  // require a high consistency guarantee
+  kAbsoluteConsistency = 0x01,
+  // Recover to point-in-time consistency
+  // We stop WAL playback on discovering a WAL inconsistency
+  // Use case : Ideal for systems with a disk-controller cache, such as a
+  // hard disk or an SSD without a supercapacitor, that may lose related data
+  kPointInTimeRecovery = 0x02,
+  // Recovery after a disaster
+  // We ignore any corruption in the WAL and try to salvage as much data as
+  // possible
+  // Use case : Ideal for a last-ditch effort to recover data, or for systems
+  // that operate on low-grade, unrelated data
+  kSkipAnyCorruptedRecords = 0x03,
+};
+
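+// Editor's sketch (hypothetical, not part of upstream RocksDB): selecting
+// point-in-time recovery via DBOptions::wal_recovery_mode (declared below):
+//
+//   rocksdb::Options opts;
+//   opts.wal_recovery_mode = rocksdb::WALRecoveryMode::kPointInTimeRecovery;
+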
 struct CompactionOptionsFIFO {
   // once the total sum of table files reaches this, we will delete the oldest
   // table file
@@ -189,15 +222,9 @@ struct ColumnFamilyOptions {
   // compaction is being used, each created CompactionFilter will only be used
   // from a single thread and so does not need to be thread-safe.
   //
-  // Default: a factory that doesn't provide any object
+  // Default: nullptr
   std::shared_ptr<CompactionFilterFactory> compaction_filter_factory;
 
-  // Version TWO of the compaction_filter_factory
-  // It supports rolling compaction
-  //
-  // Default: a factory that doesn't provide any object
-  std::shared_ptr<CompactionFilterFactoryV2> compaction_filter_factory_v2;
-
   // -------------------
   // Parameters that affect performance
 
@@ -238,11 +265,29 @@ struct ColumnFamilyOptions {
   // individual write buffers.  Default: 1
   int min_write_buffer_number_to_merge;
 
+  // The total maximum number of write buffers to maintain in memory including
+  // copies of buffers that have already been flushed.  Unlike
+  // max_write_buffer_number, this parameter does not affect flushing.
+  // This controls the minimum amount of write history that will be available
+  // in memory for conflict checking when Transactions are used.
+  // If this value is too low, some transactions may fail at commit time due
+  // to not being able to determine whether there were any write conflicts.
+  //
+  // Setting this value to 0 will cause write buffers to be freed immediately
+  // after they are flushed.
+  // If this value is set to -1, 'max_write_buffer_number' will be used.
+  //
+  // Default:
+  // If using a TransactionDB/OptimisticTransactionDB, the default value will
+  // be set to the value of 'max_write_buffer_number' if it is not explicitly
+  // set by the user.  Otherwise, the default is 0.
+  int max_write_buffer_number_to_maintain;
+
   // Compress blocks using the specified compression algorithm.  This
   // parameter can be changed dynamically.
   //
-  // Default: kSnappyCompression, which gives lightweight but fast
-  // compression.
+  // Default: kSnappyCompression, if it's supported. If snappy is not linked
+  // with the library, the default is kNoCompression.
   //
   // Typical speeds of kSnappyCompression on an Intel(R) Core(TM)2 2.4GHz:
   //    ~200-500MB/s compression
@@ -319,14 +364,7 @@ struct ColumnFamilyOptions {
   // Dynamically changeable through SetOptions() API
   int level0_stop_writes_trigger;
 
-  // Maximum level to which a new compacted memtable is pushed if it
-  // does not create overlap.  We try to push to level 2 to avoid the
-  // relatively expensive level 0=>1 compactions and to avoid some
-  // expensive manifest file operations.  We do not push all the way to
-  // the largest level since that can generate a lot of wasted disk
-  // space if the same key space is being repeatedly overwritten.
-  //
-  // Dynamically changeable through SetOptions() API
+  // This does not do anything anymore. Deprecated.
   int max_mem_compaction_level;
 
   // Target file size for compaction.
@@ -461,30 +499,29 @@ struct ColumnFamilyOptions {
   // Dynamically changeable through SetOptions() API
   int max_grandparent_overlap_factor;
 
-  // Puts are delayed 0-1 ms when any level has a compaction score that exceeds
-  // soft_rate_limit. This is ignored when == 0.0.
-  // CONSTRAINT: soft_rate_limit <= hard_rate_limit. If this constraint does not
-  // hold, RocksDB will set soft_rate_limit = hard_rate_limit
+  // Puts are delayed to options.delayed_write_rate when any level has a
+  // compaction score that exceeds soft_rate_limit. This is ignored when == 0.0.
   //
   // Default: 0 (disabled)
   //
   // Dynamically changeable through SetOptions() API
   double soft_rate_limit;
 
-  // Puts are delayed 1ms at a time when any level has a compaction score that
-  // exceeds hard_rate_limit. This is ignored when <= 1.0.
+  // DEPRECATED -- this option is no longer used
+  double hard_rate_limit;
+
+  // All writes are stopped if the estimated bytes that need to be compacted
+  // exceed this threshold.
   //
   // Default: 0 (disabled)
-  //
-  // Dynamically changeable through SetOptions() API
-  double hard_rate_limit;
+  uint64_t hard_pending_compaction_bytes_limit;
 
  // DEPRECATED -- this option is no longer used
   unsigned int rate_limit_delay_max_milliseconds;
 
   // size of one block in arena memory allocation.
-  // If <= 0, a proper value is automatically calculated (usually 1/10 of
-  // writer_buffer_size).
+  // If <= 0, a proper value is automatically calculated (usually 1/8 of
+  // write_buffer_size, rounded up to a multiple of 4KB).
   //
  // There are two additional restrictions on the specified size:
   // (1) size should be in the range of [4096, 2 << 30] and
@@ -505,13 +542,18 @@ struct ColumnFamilyOptions {
   // Dynamically changeable through SetOptions() API
   bool disable_auto_compactions;
 
-  // Purge duplicate/deleted keys when a memtable is flushed to storage.
-  // Default: true
+  // DEPRECATED
+  // Does not have any effect.
   bool purge_redundant_kvs_while_flush;
 
   // The compaction style. Default: kCompactionStyleLevel
   CompactionStyle compaction_style;
 
+  // If compaction_style is kCompactionStyleLevel, this determines, for each
+  // level, which files are prioritized to be picked for compaction.
+  // Default: kCompactionPriByCompensatedSize
+  CompactionPri compaction_pri;
+
   // If true, compaction will verify checksum on every read that happens
   // as part of compaction
   //
@@ -709,11 +751,9 @@ struct ColumnFamilyOptions {
   // Default: false
   bool paranoid_file_checks;
 
-#ifndef ROCKSDB_LITE
-  // A vector of EventListeners which call-back functions will be called
-  // when specific RocksDB event happens.
-  std::vector<std::shared_ptr<EventListener>> listeners;
-#endif  // ROCKSDB_LITE
+  // Measure IO stats in compactions, if true.
+  // Default: false
+  bool compaction_measure_io_stats;
 
   // Create ColumnFamilyOptions with default values for all fields
   ColumnFamilyOptions();
@@ -766,6 +806,13 @@ struct DBOptions {
   // Default: nullptr
   std::shared_ptr<RateLimiter> rate_limiter;
 
+  // Used to control the file deletion rate; can be shared among multiple
+  // RocksDB instances. delete_scheduler is only used to delete table files that
+  // need to be deleted from the first db_path (db_name if db_paths is empty);
+  // other file types and other db_paths won't be affected by delete_scheduler.
+  // Default: nullptr (disabled)
+  std::shared_ptr<DeleteScheduler> delete_scheduler;
+
   // Any internal progress/error information generated by the db will
   // be written to info_log if it is non-nullptr, or to a file stored
   // in the same directory as the DB contents if info_log is nullptr.
@@ -779,9 +826,14 @@ struct DBOptions {
   // files opened are always kept open. You can estimate number of files based
   // on target_file_size_base and target_file_size_multiplier for level-based
   // compaction. For universal-style compaction, you can usually set it to -1.
-  // Default: 5000
+  // Default: 5000 or ulimit value of max open files (whichever is smaller)
   int max_open_files;
 
+  // If max_open_files is -1, DB will open all files on DB::Open(). You can
+  // use this option to increase the number of threads used to open the files.
+  // Default: 1
+  int max_file_opening_threads;
+
   // Once write-ahead logs exceed this size, we will start forcing the flush of
   // column families whose memtables are backed by the oldest live WAL file
   // (i.e. the ones that are causing all the space amplification). If set to 0
@@ -865,6 +917,12 @@ struct DBOptions {
   // Default: 1
   int max_background_compactions;
 
+  // This value represents the maximum number of threads that will
+  // concurrently perform a compaction job by breaking it into multiple,
+  // smaller ones that are run simultaneously.
+  // Default: 1 (i.e. no subcompactions)
+  uint32_t max_subcompactions;
+
   // Maximum number of concurrent background memtable flush jobs, submitted to
   // the HIGH priority thread pool.
   //
@@ -938,9 +996,14 @@ struct DBOptions {
   // Allow the OS to mmap file for reading sst tables. Default: false
   bool allow_mmap_reads;
 
-  // Allow the OS to mmap file for writing. Default: false
+  // Allow the OS to mmap file for writing.
+  // DB::SyncWAL() only works if this is set to false.
+  // Default: false
   bool allow_mmap_writes;
 
+  // If false, fallocate() calls are bypassed
+  bool allow_fallocate;
+
   // Disable child process inherit open files. Default: true
   bool is_fd_close_on_exec;
 
@@ -948,7 +1011,7 @@ struct DBOptions {
   bool skip_log_error_on_recovery;
 
   // if not zero, dump rocksdb.stats to LOG every stats_dump_period_sec
-  // Default: 3600 (1 hour)
+  // Default: 600 (10 min)
   unsigned int stats_dump_period_sec;
 
   // If set true, will hint the underlying file system that the file
@@ -979,6 +1042,28 @@ struct DBOptions {
   };
   AccessHint access_hint_on_compaction_start;
 
+  // If true, always create a new file descriptor and new table reader
+  // for compaction inputs. Turning this parameter on may introduce extra
+  // memory usage in the table reader, if it allocates extra memory
+  // for indexes. This allows file descriptor prefetch options
+  // to be set for compaction input files without impacting the file
+  // descriptors for the same file used by user queries.
+  // It is suggested to enable
+  // BlockBasedTableOptions.cache_index_and_filter_blocks for this mode if
+  // using a block-based table.
+  //
+  // Default: false
+  bool new_table_reader_for_compaction_inputs;
+
+  // If non-zero, we perform bigger reads when doing compaction. If you're
+  // running RocksDB on spinning disks, you should set this to at least 2MB.
+  // That way RocksDB's compaction is doing sequential instead of random reads.
+  //
+  // When non-zero, we also force new_table_reader_for_compaction_inputs to
+  // true.
+  //
+  // Default: 0
+  size_t compaction_readahead_size;
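+
+  // Editor's sketch (hypothetical, not part of upstream RocksDB): a setting
+  // for spinning disks, per the guidance above:
+  //
+  //   options.compaction_readahead_size = 2 * 1024 * 1024;  // 2 MB reads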
+
   // Use adaptive mutex, which spins in the user space before resorting
   // to kernel. This could reduce context switch when the mutex is not
   // heavily contended. However, if the mutex is hot, we could end up
@@ -994,7 +1079,9 @@ struct DBOptions {
   void Dump(Logger* log) const;
 
   // Allows OS to incrementally sync files to disk while they are being
-  // written, asynchronously, in the background.
+  // written, asynchronously, in the background. This operation can be used
+  // to smooth out write I/Os over time. Users shouldn't rely on it for a
+  // persistence guarantee.
   // Issue one request for every bytes_per_sync written. 0 turns it off.
   // Default: 0
   //
@@ -1009,19 +1096,46 @@ struct DBOptions {
   // Default: 0, turned off
   uint64_t wal_bytes_per_sync;
 
+  // A vector of EventListeners which call-back functions will be called
+  // when specific RocksDB event happens.
+  std::vector<std::shared_ptr<EventListener>> listeners;
+
   // If true, then the status of the threads involved in this DB will
   // be tracked and available via GetThreadList() API.
   //
   // Default: false
   bool enable_thread_tracking;
+
+  // The limited write rate to DB if soft_rate_limit or
+  // level0_slowdown_writes_trigger is triggered. It is calculated using the
+  // size of user write requests before compression.
+  // Unit: byte per second.
+  //
+  // Default: 1MB/s
+  uint64_t delayed_write_rate;
+
+  // If true, then DB::Open() will not update the statistics used to optimize
+  // compaction decisions by loading table properties from many files.
+  // Turning this update off will improve DB::Open() time, especially in a
+  // disk environment.
+  //
+  // Default: false
+  bool skip_stats_update_on_db_open;
+
+  // Recovery mode to control the consistency while replaying WAL
+  // Default: kTolerateCorruptedTailRecords
+  WALRecoveryMode wal_recovery_mode;
+
+  // A global cache for table-level rows.
+  // Default: nullptr (disabled)
+  // Not supported in ROCKSDB_LITE mode!
+  std::shared_ptr<Cache> row_cache;
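+
+  // Editor's sketch (hypothetical, not part of upstream RocksDB): enabling a
+  // 64 MB row cache, assuming NewLRUCache() from rocksdb/cache.h:
+  //
+  //   options.row_cache = rocksdb::NewLRUCache(64 << 20);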
 };
 
 // Options to control the behavior of a database (passed to DB::Open)
 struct Options : public DBOptions, public ColumnFamilyOptions {
   // Create an Options object with default values for all fields.
-  Options() :
-    DBOptions(),
-    ColumnFamilyOptions() {}
+  Options() : DBOptions(), ColumnFamilyOptions() {}
 
   Options(const DBOptions& db_options,
           const ColumnFamilyOptions& column_family_options)
@@ -1029,6 +1143,8 @@ struct Options : public DBOptions, public ColumnFamilyOptions {
 
   void Dump(Logger* log) const;
 
+  void DumpCFOptions(Logger* log) const;
+
   // Set appropriate parameters for bulk loading.
   // The reason that this is a function that returns "this" instead of a
   // constructor is to enable chaining of multiple similar calls in the future.
@@ -1160,15 +1276,7 @@ struct WriteOptions {
  // and the write may get lost after a crash.
   bool disableWAL;
 
-  // If non-zero, then associated write waiting longer than the specified
-  // time MAY be aborted and returns Status::TimedOut. A write that takes
-  // less than the specified time is guaranteed to not fail with
-  // Status::TimedOut.
-  //
-  // The number of times a write call encounters a timeout is recorded in
-  // Statistics.WRITE_TIMEDOUT
-  //
-  // Default: 0
+  // The option is deprecated. It's not used anymore.
   uint64_t timeout_hint_us;
 
   // If true and if user is trying to write to column families that don't exist
@@ -1217,6 +1325,35 @@ struct CompactionOptions {
       : compression(kSnappyCompression),
         output_file_size_limit(std::numeric_limits<uint64_t>::max()) {}
 };
+
+// For level-based compaction, we can configure whether we want to skip or
+// force the bottommost-level compaction.
+enum class BottommostLevelCompaction {
+  // Skip bottommost level compaction
+  kSkip,
+  // Only compact bottommost level if there is a compaction filter
+  // This is the default option
+  kIfHaveCompactionFilter,
+  // Always compact bottommost level
+  kForce,
+};
+
+// CompactRangeOptions is used by CompactRange() call.
+struct CompactRangeOptions {
+  // If true, compacted files will be moved to the minimum level capable
+  // of holding the data, or to the given level (specified by a non-negative
+  // target_level).
+  bool change_level = false;
+  // If change_level is true and target_level has a non-negative value,
+  // compacted files will be moved to target_level.
+  int target_level = -1;
+  // Compaction outputs will be placed in options.db_paths[target_path_id].
+  // Behavior is undefined if target_path_id is out of range.
+  uint32_t target_path_id = 0;
+  // By default level based compaction will only compact the bottommost level
+  // if there is a compaction filter
+  BottommostLevelCompaction bottommost_level_compaction =
+      BottommostLevelCompaction::kIfHaveCompactionFilter;
+};
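+
+// Editor's sketch (hypothetical, not part of upstream RocksDB): forcing a
+// bottommost-level compaction over the whole key range, assuming the
+// CompactRange() overload that takes CompactRangeOptions:
+//
+//   rocksdb::CompactRangeOptions cro;
+//   cro.bottommost_level_compaction = rocksdb::BottommostLevelCompaction::kForce;
+//   db->CompactRange(cro, nullptr, nullptr);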
 }  // namespace rocksdb
 
 #endif  // STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_
diff --git a/src/rocksdb/include/rocksdb/perf_context.h b/src/rocksdb/include/rocksdb/perf_context.h
index 3b8145a..a7c993c 100644
--- a/src/rocksdb/include/rocksdb/perf_context.h
+++ b/src/rocksdb/include/rocksdb/perf_context.h
@@ -9,22 +9,13 @@
 #include <stdint.h>
 #include <string>
 
-namespace rocksdb {
-
-enum PerfLevel {
-  kDisable        = 0,  // disable perf stats
-  kEnableCount    = 1,  // enable only count stats
-  kEnableTime     = 2   // enable time stats too
-};
+#include "rocksdb/perf_level.h"
 
-// set the perf stats level
-void SetPerfLevel(PerfLevel level);
-
-// get current perf stats level
-PerfLevel GetPerfLevel();
+namespace rocksdb {
 
 // A thread local context for gathering performance counter efficiently
 // and transparently.
+// Use SetPerfLevel(PerfLevel::kEnableTime) to enable time stats.
 
 struct PerfContext {
 
@@ -42,7 +33,7 @@ struct PerfContext {
   // total number of internal keys skipped over during iteration (overwritten or
   // deleted, to be more specific, hidden by a put or delete of the same key)
   uint64_t internal_key_skipped_count;
-  // total number of deletes skipped over during iteration
+  // total number of deletes and single deletes skipped over during iteration
   uint64_t internal_delete_skipped_count;
 
   uint64_t get_snapshot_time;          // total time spent on getting snapshot
@@ -64,20 +55,48 @@ struct PerfContext {
   uint64_t seek_internal_seek_time;
   // total time spent on iterating internal entries to find the next user entry
   uint64_t find_next_user_entry_time;
-  // total time spent on pre or post processing when writing a record
-  uint64_t write_pre_and_post_process_time;
-  uint64_t write_wal_time;            // total time spent on writing to WAL
+
+  // total time spent on writing to WAL
+  uint64_t write_wal_time;
   // total time spent on writing to mem tables
   uint64_t write_memtable_time;
+  // total time spent on delaying write
+  uint64_t write_delay_time;
+  // total time spent on writing a record, excluding the above three times
+  uint64_t write_pre_and_post_process_time;
+
   uint64_t db_mutex_lock_nanos;      // time spent on acquiring DB mutex.
   // Time spent on waiting with a condition variable created with DB mutex.
   uint64_t db_condition_wait_nanos;
   // Time spent on merge operator.
   uint64_t merge_operator_time_nanos;
+
+  // Time spent on reading index block from block cache or SST file
+  uint64_t read_index_block_nanos;
+  // Time spent on reading filter block from block cache or SST file
+  uint64_t read_filter_block_nanos;
+  // Time spent on creating data block iterator
+  uint64_t new_table_block_iter_nanos;
+  // Time spent on creating an iterator of an SST file.
+  uint64_t new_table_iterator_nanos;
+  // Time spent on seeking a key in data/index blocks
+  uint64_t block_seek_nanos;
+  // Time spent on finding or creating a table reader
+  uint64_t find_table_nanos;
+  // total number of mem table bloom hits
+  uint64_t bloom_memtable_hit_count;
+  // total number of mem table bloom misses
+  uint64_t bloom_memtable_miss_count;
+  // total number of SST table bloom hits
+  uint64_t bloom_sst_hit_count;
+  // total number of SST table bloom misses
+  uint64_t bloom_sst_miss_count;
 };
 
 #if defined(NPERF_CONTEXT) || defined(IOS_CROSS_COMPILE)
 extern PerfContext perf_context;
+#elif _WIN32
+extern __declspec(thread) PerfContext perf_context;
 #else
 extern __thread PerfContext perf_context;
 #endif
diff --git a/src/rocksdb/include/rocksdb/perf_level.h b/src/rocksdb/include/rocksdb/perf_level.h
new file mode 100644
index 0000000..fee8ce1
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/perf_level.h
@@ -0,0 +1,30 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef INCLUDE_ROCKSDB_PERF_LEVEL_H_
+#define INCLUDE_ROCKSDB_PERF_LEVEL_H_
+
+#include <stdint.h>
+#include <string>
+
+namespace rocksdb {
+
+// How much perf stats to collect. Affects perf_context and iostats_context.
+
+enum PerfLevel {
+  kDisable        = 0,  // disable perf stats
+  kEnableCount    = 1,  // enable only count stats
+  kEnableTime     = 2   // enable time stats too
+};
+
+// set the perf stats level for current thread
+void SetPerfLevel(PerfLevel level);
+
+// get current perf stats level for current thread
+PerfLevel GetPerfLevel();
+
+}  // namespace rocksdb
+
+#endif  // INCLUDE_ROCKSDB_PERF_LEVEL_H_
diff --git a/src/rocksdb/include/rocksdb/rate_limiter.h b/src/rocksdb/include/rocksdb/rate_limiter.h
index 44c1bdf..ae3ab8f 100644
--- a/src/rocksdb/include/rocksdb/rate_limiter.h
+++ b/src/rocksdb/include/rocksdb/rate_limiter.h
@@ -23,7 +23,7 @@ class RateLimiter {
 
   // Request for token to write bytes. If this request can not be satisfied,
   // the call is blocked. Caller is responsible to make sure
-  // bytes < GetSingleBurstBytes()
+  // bytes <= GetSingleBurstBytes()
   virtual void Request(const int64_t bytes, const Env::IOPriority pri) = 0;
 
   // Max bytes can be granted in a single burst
diff --git a/src/rocksdb/include/rocksdb/slice.h b/src/rocksdb/include/rocksdb/slice.h
index 7019c90..ae3139c 100644
--- a/src/rocksdb/include/rocksdb/slice.h
+++ b/src/rocksdb/include/rocksdb/slice.h
@@ -74,19 +74,7 @@ class Slice {
   }
 
   // Return a string that contains the copy of the referenced data.
-  std::string ToString(bool hex = false) const {
-    if (hex) {
-      std::string result;
-      char buf[10];
-      for (size_t i = 0; i < size_; i++) {
-        snprintf(buf, 10, "%02X", (unsigned char)data_[i]);
-        result += buf;
-      }
-      return result;
-    } else {
-      return std::string(data_, size_);
-    }
-  }
+  std::string ToString(bool hex = false) const;
 
   // Three-way comparison.  Returns value:
   //   <  0 iff "*this" <  "b",
@@ -100,6 +88,9 @@ class Slice {
             (memcmp(data_, x.data_, x.size_) == 0));
   }
 
+  // Compares two slices and returns the offset of the first byte at which
+  // they differ
+  size_t difference_offset(const Slice& b) const;
+
  // private: make these public for rocksdbjni access
   const char* data_;
   size_t size_;
@@ -137,6 +128,15 @@ inline int Slice::compare(const Slice& b) const {
   return r;
 }
 
+inline size_t Slice::difference_offset(const Slice& b) const {
+  size_t off = 0;
+  const size_t len = (size_ < b.size_) ? size_ : b.size_;
+  for (; off < len; off++) {
+    if (data_[off] != b.data_[off]) break;
+  }
+  return off;
+}
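+
+// Editor's note (illustration, not part of upstream RocksDB): for example,
+// Slice("abcdef").difference_offset(Slice("abczzz")) == 3; if one slice is a
+// prefix of the other, the length of the shorter slice is returned.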
+
 }  // namespace rocksdb
 
 #endif  // STORAGE_ROCKSDB_INCLUDE_SLICE_H_
diff --git a/src/rocksdb/include/rocksdb/snapshot.h b/src/rocksdb/include/rocksdb/snapshot.h
new file mode 100644
index 0000000..aad675b
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/snapshot.h
@@ -0,0 +1,45 @@
+//  Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+
+#include "rocksdb/types.h"
+
+namespace rocksdb {
+
+class DB;
+
+// Abstract handle to particular state of a DB.
+// A Snapshot is an immutable object and can therefore be safely
+// accessed from multiple threads without any external synchronization.
+//
+// To create a Snapshot, call DB::GetSnapshot().
+// To destroy a Snapshot, call DB::ReleaseSnapshot(snapshot).
+class Snapshot {
+ public:
+  // returns Snapshot's sequence number
+  virtual SequenceNumber GetSequenceNumber() const = 0;
+
+ protected:
+  virtual ~Snapshot();
+};
+
+// Simple RAII wrapper class for Snapshot.
+// Constructing this object will create a snapshot.  Destroying it will
+// release the snapshot.
+class ManagedSnapshot {
+ public:
+  explicit ManagedSnapshot(DB* db);
+
+  ~ManagedSnapshot();
+
+  const Snapshot* snapshot();
+
+ private:
+  DB* db_;
+  const Snapshot* snapshot_;
+};
+
+}  // namespace rocksdb
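
A sketch of the RAII wrapper in use, assuming an already-open DB handle:

#include <string>
#include "rocksdb/db.h"
#include "rocksdb/snapshot.h"

void ReadAtSnapshot(rocksdb::DB* db, const rocksdb::Slice& key,
                    std::string* value) {
  rocksdb::ManagedSnapshot managed(db);  // acquires via DB::GetSnapshot()
  rocksdb::ReadOptions read_options;
  read_options.snapshot = managed.snapshot();
  db->Get(read_options, key, value);
}  // destructor releases the snapshot even on an early return
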
diff --git a/src/rocksdb/include/rocksdb/sst_file_writer.h b/src/rocksdb/include/rocksdb/sst_file_writer.h
new file mode 100644
index 0000000..eb2f894
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/sst_file_writer.h
@@ -0,0 +1,77 @@
+//  Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+#include <string>
+#include "rocksdb/env.h"
+#include "rocksdb/immutable_options.h"
+#include "rocksdb/types.h"
+
+namespace rocksdb {
+
+class Comparator;
+
+// Table Properties that are specific to tables created by SstFileWriter.
+struct ExternalSstFilePropertyNames {
+  // The value of this property is a fixed int32 number.
+  static const std::string kVersion;
+};
+
+// ExternalSstFileInfo includes information about sst files created
+// using SstFileWriter
+struct ExternalSstFileInfo {
+  ExternalSstFileInfo() {}
+  ExternalSstFileInfo(const std::string& _file_path,
+                      const std::string& _smallest_key,
+                      const std::string& _largest_key,
+                      SequenceNumber _sequence_number, uint64_t _file_size,
+                      uint64_t _num_entries, int32_t _version)
+      : file_path(_file_path),
+        smallest_key(_smallest_key),
+        largest_key(_largest_key),
+        sequence_number(_sequence_number),
+        file_size(_file_size),
+        num_entries(_num_entries),
+        version(_version) {}
+
+  std::string file_path;           // external sst file path
+  std::string smallest_key;        // smallest user key in file
+  std::string largest_key;         // largest user key in file
+  SequenceNumber sequence_number;  // sequence number of all keys in file
+  uint64_t file_size;              // file size in bytes
+  uint64_t num_entries;            // number of entries in file
+  int32_t version;                 // file version
+};
+
+// SstFileWriter is used to create sst files that can be added to the database later
+// All keys in files generated by SstFileWriter will have sequence number = 0
+class SstFileWriter {
+ public:
+  SstFileWriter(const EnvOptions& env_options,
+                const ImmutableCFOptions& ioptions,
+                const Comparator* user_comparator);
+
+  ~SstFileWriter();
+
+  // Prepare SstFileWriter to write into file located at "file_path".
+  Status Open(const std::string& file_path);
+
+  // Add key, value to currently opened file
+  // REQUIRES: key is after any previously added key according to comparator.
+  Status Add(const Slice& user_key, const Slice& value);
+
+  // Finalize writing to sst file and close file.
+  //
+  // An optional ExternalSstFileInfo pointer can be passed to the function
+  // which will be populated with information about the created sst file
+  Status Finish(ExternalSstFileInfo* file_info = nullptr);
+
+ private:
+  class SstFileWriterPropertiesCollectorFactory;
+  class SstFileWriterPropertiesCollector;
+  struct Rep;
+  Rep* rep_;
+};
+}  // namespace rocksdb
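
A sketch of the intended workflow; the ImmutableCFOptions(const Options&)
constructor assumed here comes from rocksdb/immutable_options.h:

#include <string>
#include "rocksdb/immutable_options.h"
#include "rocksdb/options.h"
#include "rocksdb/sst_file_writer.h"

rocksdb::Status WriteTwoKeys(const rocksdb::Options& options,
                             const std::string& path,
                             rocksdb::ExternalSstFileInfo* info) {
  rocksdb::EnvOptions env_options;
  rocksdb::ImmutableCFOptions ioptions(options);
  rocksdb::SstFileWriter writer(env_options, ioptions, options.comparator);
  rocksdb::Status s = writer.Open(path);
  if (!s.ok()) return s;
  s = writer.Add("key1", "value1");     // keys must arrive in comparator order
  if (s.ok()) s = writer.Add("key2", "value2");
  if (s.ok()) s = writer.Finish(info);  // info describes the finished file
  return s;
}

The finished file can then be ingested via DB::AddFile(), which the table.h
and stackable_db.h hunks later in this patch wire up.
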
diff --git a/src/rocksdb/include/rocksdb/statistics.h b/src/rocksdb/include/rocksdb/statistics.h
index 4e06bf6..9a21fe1 100644
--- a/src/rocksdb/include/rocksdb/statistics.h
+++ b/src/rocksdb/include/rocksdb/statistics.h
@@ -45,6 +45,10 @@ enum Tickers : uint32_t {
   BLOCK_CACHE_DATA_MISS,
   // # of times cache hit when accessing data block from block cache.
   BLOCK_CACHE_DATA_HIT,
+  // # of bytes read from cache.
+  BLOCK_CACHE_BYTES_READ,
+  // # of bytes written into cache.
+  BLOCK_CACHE_BYTES_WRITE,
   // # of times bloom filter has avoided file reads.
   BLOOM_FILTER_USEFUL,
 
@@ -74,9 +78,25 @@ enum Tickers : uint32_t {
   NUMBER_KEYS_READ,
   // Number keys updated, if inplace update is enabled
   NUMBER_KEYS_UPDATED,
-  // Bytes written / read
+  // The number of uncompressed bytes issued by DB::Put(), DB::Delete(),
+  // DB::Merge(), and DB::Write().
   BYTES_WRITTEN,
+  // The number of uncompressed bytes read from DB::Get().  It could be
+  // either from memtables, cache, or table files.
+  // For the number of logical bytes read from DB::MultiGet(),
+  // please use NUMBER_MULTIGET_BYTES_READ.
   BYTES_READ,
+  // The number of calls to seek/next/prev
+  NUMBER_DB_SEEK,
+  NUMBER_DB_NEXT,
+  NUMBER_DB_PREV,
+  // The number of calls to seek/next/prev that returned data
+  NUMBER_DB_SEEK_FOUND,
+  NUMBER_DB_NEXT_FOUND,
+  NUMBER_DB_PREV_FOUND,
+  // The number of uncompressed bytes read from an iterator.
+  // Includes size of key and value.
+  ITER_BYTES_READ,
   NO_FILE_CLOSES,
   NO_FILE_OPENS,
   NO_FILE_ERRORS,
@@ -141,6 +161,11 @@ enum Tickers : uint32_t {
   NUMBER_BLOCK_NOT_COMPRESSED,
   MERGE_OPERATION_TOTAL_TIME,
   FILTER_OPERATION_TOTAL_TIME,
+
+  // Row cache.
+  ROW_CACHE_HIT,
+  ROW_CACHE_MISS,
+
   TICKER_ENUM_MAX
 };
 
@@ -156,6 +181,8 @@ const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
     {BLOCK_CACHE_FILTER_HIT, "rocksdb.block.cache.filter.hit"},
     {BLOCK_CACHE_DATA_MISS, "rocksdb.block.cache.data.miss"},
     {BLOCK_CACHE_DATA_HIT, "rocksdb.block.cache.data.hit"},
+    {BLOCK_CACHE_BYTES_READ, "rocksdb.block.cache.bytes.read"},
+    {BLOCK_CACHE_BYTES_WRITE, "rocksdb.block.cache.bytes.write"},
     {BLOOM_FILTER_USEFUL, "rocksdb.bloom.filter.useful"},
     {MEMTABLE_HIT, "rocksdb.memtable.hit"},
     {MEMTABLE_MISS, "rocksdb.memtable.miss"},
@@ -170,6 +197,13 @@ const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
     {NUMBER_KEYS_UPDATED, "rocksdb.number.keys.updated"},
     {BYTES_WRITTEN, "rocksdb.bytes.written"},
     {BYTES_READ, "rocksdb.bytes.read"},
+    {NUMBER_DB_SEEK, "rocksdb.number.db.seek"},
+    {NUMBER_DB_NEXT, "rocksdb.number.db.next"},
+    {NUMBER_DB_PREV, "rocksdb.number.db.prev"},
+    {NUMBER_DB_SEEK_FOUND, "rocksdb.number.db.seek.found"},
+    {NUMBER_DB_NEXT_FOUND, "rocksdb.number.db.next.found"},
+    {NUMBER_DB_PREV_FOUND, "rocksdb.number.db.prev.found"},
+    {ITER_BYTES_READ, "rocksdb.db.iter.bytes.read"},
     {NO_FILE_CLOSES, "rocksdb.no.file.closes"},
     {NO_FILE_OPENS, "rocksdb.no.file.opens"},
     {NO_FILE_ERRORS, "rocksdb.no.file.errors"},
@@ -196,7 +230,6 @@ const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
     {WAL_FILE_BYTES, "rocksdb.wal.bytes"},
     {WRITE_DONE_BY_SELF, "rocksdb.write.self"},
     {WRITE_DONE_BY_OTHER, "rocksdb.write.other"},
-    {WRITE_TIMEDOUT, "rocksdb.write.timedout"},
     {WRITE_WITH_WAL, "rocksdb.write.wal"},
     {FLUSH_WRITE_BYTES, "rocksdb.flush.write.bytes"},
     {COMPACT_READ_BYTES, "rocksdb.compact.read.bytes"},
@@ -209,6 +242,8 @@ const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
     {NUMBER_BLOCK_NOT_COMPRESSED, "rocksdb.number.block.not_compressed"},
     {MERGE_OPERATION_TOTAL_TIME, "rocksdb.merge.operation.time.nanos"},
     {FILTER_OPERATION_TOTAL_TIME, "rocksdb.filter.operation.time.nanos"},
+    {ROW_CACHE_HIT, "rocksdb.row.cache.hit"},
+    {ROW_CACHE_MISS, "rocksdb.row.cache.miss"},
 };
 
 /**
@@ -222,6 +257,7 @@ enum Histograms : uint32_t {
   DB_GET = 0,
   DB_WRITE,
   COMPACTION_TIME,
+  SUBCOMPACTION_SETUP_TIME,
   TABLE_SYNC_MICROS,
   COMPACTION_OUTFILE_SYNC_MICROS,
   WAL_FILE_SYNC_MICROS,
@@ -240,30 +276,36 @@ enum Histograms : uint32_t {
   NUM_FILES_IN_SINGLE_COMPACTION,
   DB_SEEK,
   WRITE_STALL,
+  SST_READ_MICROS,
+  // The number of subcompactions actually scheduled during a compaction
+  NUM_SUBCOMPACTIONS_SCHEDULED,
   HISTOGRAM_ENUM_MAX,  // TODO(ldemailly): enforce HistogramsNameMap match
 };
 
 const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap = {
-  { DB_GET, "rocksdb.db.get.micros" },
-  { DB_WRITE, "rocksdb.db.write.micros" },
-  { COMPACTION_TIME, "rocksdb.compaction.times.micros" },
-  { TABLE_SYNC_MICROS, "rocksdb.table.sync.micros" },
-  { COMPACTION_OUTFILE_SYNC_MICROS, "rocksdb.compaction.outfile.sync.micros" },
-  { WAL_FILE_SYNC_MICROS, "rocksdb.wal.file.sync.micros" },
-  { MANIFEST_FILE_SYNC_MICROS, "rocksdb.manifest.file.sync.micros" },
-  { TABLE_OPEN_IO_MICROS, "rocksdb.table.open.io.micros" },
-  { DB_MULTIGET, "rocksdb.db.multiget.micros" },
-  { READ_BLOCK_COMPACTION_MICROS, "rocksdb.read.block.compaction.micros" },
-  { READ_BLOCK_GET_MICROS, "rocksdb.read.block.get.micros" },
-  { WRITE_RAW_BLOCK_MICROS, "rocksdb.write.raw.block.micros" },
-  { STALL_L0_SLOWDOWN_COUNT, "rocksdb.l0.slowdown.count"},
-  { STALL_MEMTABLE_COMPACTION_COUNT, "rocksdb.memtable.compaction.count"},
-  { STALL_L0_NUM_FILES_COUNT, "rocksdb.num.files.stall.count"},
-  { HARD_RATE_LIMIT_DELAY_COUNT, "rocksdb.hard.rate.limit.delay.count"},
-  { SOFT_RATE_LIMIT_DELAY_COUNT, "rocksdb.soft.rate.limit.delay.count"},
-  { NUM_FILES_IN_SINGLE_COMPACTION, "rocksdb.numfiles.in.singlecompaction" },
-  { DB_SEEK, "rocksdb.db.seek.micros" },
-  { WRITE_STALL, "rocksdb.db.write.stall" },
+    {DB_GET, "rocksdb.db.get.micros"},
+    {DB_WRITE, "rocksdb.db.write.micros"},
+    {COMPACTION_TIME, "rocksdb.compaction.times.micros"},
+    {SUBCOMPACTION_SETUP_TIME, "rocksdb.subcompaction.setup.times.micros"},
+    {TABLE_SYNC_MICROS, "rocksdb.table.sync.micros"},
+    {COMPACTION_OUTFILE_SYNC_MICROS, "rocksdb.compaction.outfile.sync.micros"},
+    {WAL_FILE_SYNC_MICROS, "rocksdb.wal.file.sync.micros"},
+    {MANIFEST_FILE_SYNC_MICROS, "rocksdb.manifest.file.sync.micros"},
+    {TABLE_OPEN_IO_MICROS, "rocksdb.table.open.io.micros"},
+    {DB_MULTIGET, "rocksdb.db.multiget.micros"},
+    {READ_BLOCK_COMPACTION_MICROS, "rocksdb.read.block.compaction.micros"},
+    {READ_BLOCK_GET_MICROS, "rocksdb.read.block.get.micros"},
+    {WRITE_RAW_BLOCK_MICROS, "rocksdb.write.raw.block.micros"},
+    {STALL_L0_SLOWDOWN_COUNT, "rocksdb.l0.slowdown.count"},
+    {STALL_MEMTABLE_COMPACTION_COUNT, "rocksdb.memtable.compaction.count"},
+    {STALL_L0_NUM_FILES_COUNT, "rocksdb.num.files.stall.count"},
+    {HARD_RATE_LIMIT_DELAY_COUNT, "rocksdb.hard.rate.limit.delay.count"},
+    {SOFT_RATE_LIMIT_DELAY_COUNT, "rocksdb.soft.rate.limit.delay.count"},
+    {NUM_FILES_IN_SINGLE_COMPACTION, "rocksdb.numfiles.in.singlecompaction"},
+    {DB_SEEK, "rocksdb.db.seek.micros"},
+    {WRITE_STALL, "rocksdb.db.write.stall"},
+    {SST_READ_MICROS, "rocksdb.sst.read.micros"},
+    {NUM_SUBCOMPACTIONS_SCHEDULED, "rocksdb.num.subcompactions.scheduled"},
 };
 
 struct HistogramData {
@@ -282,7 +324,7 @@ class Statistics {
   virtual uint64_t getTickerCount(uint32_t tickerType) const = 0;
   virtual void histogramData(uint32_t type,
                              HistogramData* const data) const = 0;
-
+  virtual std::string getHistogramString(uint32_t type) const { return ""; }
   virtual void recordTick(uint32_t tickerType, uint64_t count = 0) = 0;
   virtual void setTickerCount(uint32_t tickerType, uint64_t count) = 0;
   virtual void measureTime(uint32_t histogramType, uint64_t time) = 0;
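
A sketch of reading the new iterator tickers through this interface, using
the stock CreateDBStatistics() factory from the same header:

#include <cstdio>
#include "rocksdb/options.h"
#include "rocksdb/statistics.h"

void ReportIterStats(const rocksdb::Options& options) {
  // Assumes options.statistics was set to rocksdb::CreateDBStatistics()
  // before the DB was opened, and some iterators have since run.
  uint64_t seeks = options.statistics->getTickerCount(rocksdb::NUMBER_DB_SEEK);
  uint64_t found =
      options.statistics->getTickerCount(rocksdb::NUMBER_DB_SEEK_FOUND);
  uint64_t bytes = options.statistics->getTickerCount(rocksdb::ITER_BYTES_READ);
  std::printf("seek: %llu issued, %llu found, %llu bytes read via iterators\n",
              (unsigned long long)seeks, (unsigned long long)found,
              (unsigned long long)bytes);
}
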
diff --git a/src/rocksdb/include/rocksdb/status.h b/src/rocksdb/include/rocksdb/status.h
index 177d705..e8e7970 100644
--- a/src/rocksdb/include/rocksdb/status.h
+++ b/src/rocksdb/include/rocksdb/status.h
@@ -25,12 +25,43 @@ namespace rocksdb {
 class Status {
  public:
   // Create a success status.
-  Status() : code_(kOk), state_(nullptr) { }
+  Status() : code_(kOk), subcode_(kNone), state_(nullptr) {}
   ~Status() { delete[] state_; }
 
   // Copy the specified status.
   Status(const Status& s);
   void operator=(const Status& s);
+  bool operator==(const Status& rhs) const;
+  bool operator!=(const Status& rhs) const;
+
+  enum Code {
+    kOk = 0,
+    kNotFound = 1,
+    kCorruption = 2,
+    kNotSupported = 3,
+    kInvalidArgument = 4,
+    kIOError = 5,
+    kMergeInProgress = 6,
+    kIncomplete = 7,
+    kShutdownInProgress = 8,
+    kTimedOut = 9,
+    kAborted = 10,
+    kBusy = 11,
+    kExpired = 12,
+    kTryAgain = 13
+  };
+
+  Code code() const { return code_; }
+
+  enum SubCode {
+    kNone = 0,
+    kMutexTimeout = 1,
+    kLockTimeout = 2,
+    kLockLimit = 3,
+    kMaxSubCode
+  };
+
+  SubCode subcode() const { return subcode_; }
 
   // Return a success status.
   static Status OK() { return Status(); }
@@ -40,45 +71,78 @@ class Status {
     return Status(kNotFound, msg, msg2);
   }
   // Fast path for not found without malloc;
-  static Status NotFound() {
-    return Status(kNotFound);
-  }
+  static Status NotFound(SubCode msg = kNone) { return Status(kNotFound, msg); }
+
   static Status Corruption(const Slice& msg, const Slice& msg2 = Slice()) {
     return Status(kCorruption, msg, msg2);
   }
+  static Status Corruption(SubCode msg = kNone) {
+    return Status(kCorruption, msg);
+  }
+
   static Status NotSupported(const Slice& msg, const Slice& msg2 = Slice()) {
     return Status(kNotSupported, msg, msg2);
   }
+  static Status NotSupported(SubCode msg = kNone) {
+    return Status(kNotSupported, msg);
+  }
+
   static Status InvalidArgument(const Slice& msg, const Slice& msg2 = Slice()) {
     return Status(kInvalidArgument, msg, msg2);
   }
+  static Status InvalidArgument(SubCode msg = kNone) {
+    return Status(kInvalidArgument, msg);
+  }
+
   static Status IOError(const Slice& msg, const Slice& msg2 = Slice()) {
     return Status(kIOError, msg, msg2);
   }
+  static Status IOError(SubCode msg = kNone) { return Status(kIOError, msg); }
+
   static Status MergeInProgress(const Slice& msg, const Slice& msg2 = Slice()) {
     return Status(kMergeInProgress, msg, msg2);
   }
+  static Status MergeInProgress(SubCode msg = kNone) {
+    return Status(kMergeInProgress, msg);
+  }
+
   static Status Incomplete(const Slice& msg, const Slice& msg2 = Slice()) {
     return Status(kIncomplete, msg, msg2);
   }
-  static Status ShutdownInProgress() {
-    return Status(kShutdownInProgress);
+  static Status Incomplete(SubCode msg = kNone) {
+    return Status(kIncomplete, msg);
+  }
+
+  static Status ShutdownInProgress(SubCode msg = kNone) {
+    return Status(kShutdownInProgress, msg);
   }
   static Status ShutdownInProgress(const Slice& msg,
                                    const Slice& msg2 = Slice()) {
     return Status(kShutdownInProgress, msg, msg2);
   }
-  static Status TimedOut() {
-    return Status(kTimedOut);
+  static Status Aborted(SubCode msg = kNone) { return Status(kAborted, msg); }
+  static Status Aborted(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kAborted, msg, msg2);
   }
+
+  static Status Busy(SubCode msg = kNone) { return Status(kBusy, msg); }
+  static Status Busy(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kBusy, msg, msg2);
+  }
+
+  static Status TimedOut(SubCode msg = kNone) { return Status(kTimedOut, msg); }
   static Status TimedOut(const Slice& msg, const Slice& msg2 = Slice()) {
     return Status(kTimedOut, msg, msg2);
   }
-  static Status Aborted() {
-    return Status(kAborted);
+
+  static Status Expired(SubCode msg = kNone) { return Status(kExpired, msg); }
+  static Status Expired(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kExpired, msg, msg2);
   }
-  static Status Aborted(const Slice& msg, const Slice& msg2 = Slice()) {
-    return Status(kAborted, msg, msg2);
+
+  static Status TryAgain(SubCode msg = kNone) { return Status(kTryAgain, msg); }
+  static Status TryAgain(const Slice& msg, const Slice& msg2 = Slice()) {
+    return Status(kTryAgain, msg, msg2);
   }
 
   // Returns true iff the status indicates success.
@@ -112,27 +176,22 @@ class Status {
 
   bool IsAborted() const { return code() == kAborted; }
 
+  // Returns true iff the status indicates that a resource is Busy and
+  // temporarily could not be acquired.
+  bool IsBusy() const { return code() == kBusy; }
+
+  // Returns true iff the status indicates that the operation has expired.
+  bool IsExpired() const { return code() == kExpired; }
+
+  // Returns true iff the status indicates a TryAgain error.
+  // This usually means that the operation failed, but may succeed if
+  // re-attempted.
+  bool IsTryAgain() const { return code() == kTryAgain; }
+
   // Return a string representation of this status suitable for printing.
   // Returns the string "OK" for success.
   std::string ToString() const;
 
-  enum Code {
-    kOk = 0,
-    kNotFound = 1,
-    kCorruption = 2,
-    kNotSupported = 3,
-    kInvalidArgument = 4,
-    kIOError = 5,
-    kMergeInProgress = 6,
-    kIncomplete = 7,
-    kShutdownInProgress = 8,
-    kTimedOut = 9,
-    kAborted = 10
-  };
-
-  Code code() const {
-    return code_;
-  }
  private:
   // A nullptr state_ (which is always the case for OK) means the message
   // is empty.
@@ -140,27 +199,40 @@ class Status {
   //    state_[0..3] == length of message
   //    state_[4..]  == message
   Code code_;
+  SubCode subcode_;
   const char* state_;
 
-  explicit Status(Code _code) : code_(_code), state_(nullptr) {}
+  static const char* msgs[static_cast<int>(kMaxSubCode)];
+
+  explicit Status(Code _code, SubCode _subcode = kNone)
+      : code_(_code), subcode_(_subcode), state_(nullptr) {}
+
   Status(Code _code, const Slice& msg, const Slice& msg2);
   static const char* CopyState(const char* s);
 };
 
-inline Status::Status(const Status& s) {
-  code_ = s.code_;
+inline Status::Status(const Status& s) : code_(s.code_), subcode_(s.subcode_) {
   state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_);
 }
 inline void Status::operator=(const Status& s) {
   // The following condition catches both aliasing (when this == &s),
   // and the common case where both s and *this are ok.
   code_ = s.code_;
+  subcode_ = s.subcode_;
   if (state_ != s.state_) {
     delete[] state_;
     state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_);
   }
 }
 
+inline bool Status::operator==(const Status& rhs) const {
+  return (code_ == rhs.code_);
+}
+
+inline bool Status::operator!=(const Status& rhs) const {
+  return !(*this == rhs);
+}
+
 }  // namespace rocksdb
 
 #endif  // STORAGE_ROCKSDB_INCLUDE_STATUS_H_
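
With subcodes, callers can tell lock-related timeouts apart; a small sketch
(note that operator== above intentionally compares only the code, not the
subcode):

#include "rocksdb/status.h"

const char* DescribeTimeout(const rocksdb::Status& s) {
  if (!s.IsTimedOut()) return "not a timeout";
  switch (s.subcode()) {
    case rocksdb::Status::kMutexTimeout: return "timed out on a mutex";
    case rocksdb::Status::kLockTimeout:  return "timed out on a row lock";
    case rocksdb::Status::kLockLimit:    return "too many locks acquired";
    default:                             return "timed out";
  }
}
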
diff --git a/src/rocksdb/include/rocksdb/table.h b/src/rocksdb/include/rocksdb/table.h
index b84363a..e52b580 100644
--- a/src/rocksdb/include/rocksdb/table.h
+++ b/src/rocksdb/include/rocksdb/table.h
@@ -31,10 +31,11 @@ namespace rocksdb {
 // -- Block-based Table
 class FlushBlockPolicyFactory;
 class RandomAccessFile;
+struct TableReaderOptions;
 struct TableBuilderOptions;
 class TableBuilder;
 class TableReader;
-class WritableFile;
+class WritableFileWriter;
 struct EnvOptions;
 struct Options;
 
@@ -315,6 +316,8 @@ extern TableFactory* NewCuckooTableFactory(
 
 #endif  // ROCKSDB_LITE
 
+class RandomAccessFileReader;
+
 // A base class for table factories.
 class TableFactory {
  public:
@@ -333,22 +336,22 @@ class TableFactory {
   // in parameter file. It's the caller's responsibility to make sure
   // file is in the correct format.
   //
-  // NewTableReader() is called in two places:
+  // NewTableReader() is called in three places:
   // (1) TableCache::FindTable() calls the function when table cache miss
   //     and cache the table object returned.
-  // (1) SstFileReader (for SST Dump) opens the table and dump the table
+  // (2) SstFileReader (for SST Dump) opens the table and dumps the table
+  //     contents using the iterator of the table.
-  // ImmutableCFOptions is a subset of Options that can not be altered.
-  // EnvOptions is a subset of Options that will be used by Env.
-  // Multiple configured can be accessed from there, including and not
-  // limited to block cache and key comparators.
-  // file is a file handler to handle the file for the table
-  // file_size is the physical file size of the file
-  // table_reader is the output table reader
+  // (3) DBImpl::AddFile() calls this function to read the contents of
+  //     the sst file it's attempting to add
+  //
+  // table_reader_options is a TableReaderOptions which contains all the
+  //    needed parameters and configuration to open the table.
+  // file is a file handler to handle the file for the table.
+  // file_size is the physical file size of the file.
+  // table_reader is the output table reader.
   virtual Status NewTableReader(
-      const ImmutableCFOptions& ioptions, const EnvOptions& env_options,
-      const InternalKeyComparator& internal_comparator,
-      unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
+      const TableReaderOptions& table_reader_options,
+      unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
       unique_ptr<TableReader>* table_reader) const = 0;
 
   // Return a table builder to write to a file for this table type.
@@ -372,7 +375,7 @@ class TableFactory {
   // to use in this table.
   virtual TableBuilder* NewTableBuilder(
       const TableBuilderOptions& table_builder_options,
-      WritableFile* file) const = 0;
+      WritableFileWriter* file) const = 0;
 
   // Sanitizes the specified DB Options and ColumnFamilyOptions.
   //
diff --git a/src/rocksdb/include/rocksdb/table_properties.h b/src/rocksdb/include/rocksdb/table_properties.h
index 8572021..2850074 100644
--- a/src/rocksdb/include/rocksdb/table_properties.h
+++ b/src/rocksdb/include/rocksdb/table_properties.h
@@ -3,6 +3,7 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 #pragma once
 
+#include <stdint.h>
 #include <string>
 #include <map>
 #include "rocksdb/status.h"
@@ -24,7 +25,7 @@ namespace rocksdb {
 //      ++pos) {
 //   ...
 // }
-typedef std::map<const std::string, std::string> UserCollectedProperties;
+typedef std::map<std::string, std::string> UserCollectedProperties;
 
 // TableProperties contains a bunch of read-only properties of its associated
 // table.
@@ -60,6 +61,10 @@ struct TableProperties {
   //   @prop_delim: delimiter for each property.
   std::string ToString(const std::string& prop_delim = "; ",
                        const std::string& kv_delim = "=") const;
+
+  // Aggregate the numerical member variables of the specified
+  // TableProperties.
+  void Add(const TableProperties& tp);
 };
 
 // table properties' human-readable names in the property block.
@@ -81,15 +86,16 @@ extern const std::string kPropertiesBlock;
 enum EntryType {
   kEntryPut,
   kEntryDelete,
+  kEntrySingleDelete,
   kEntryMerge,
   kEntryOther,
 };
 
 // `TablePropertiesCollector` provides the mechanism for users to collect
-// their own interested properties. This class is essentially a collection
-// of callback functions that will be invoked during table building.
-// It is construced with TablePropertiesCollectorFactory. The methods don't
-// need to be thread-safe, as we will create exactly one
+// their own properties that they are interested in. This class is essentially
+// a collection of callback functions that will be invoked during table
+// building. It is constructed with TablePropertiesCollectorFactory. The methods
+// don't need to be thread-safe, as we will create exactly one
 // TablePropertiesCollector object per table and then call it sequentially
 class TablePropertiesCollector {
  public:
@@ -113,7 +119,7 @@ class TablePropertiesCollector {
   virtual Status AddUserKey(const Slice& key, const Slice& value,
                             EntryType type, SequenceNumber seq,
                             uint64_t file_size) {
-    // For backward-compatible.
+    // For backwards-compatibility.
     return Add(key, value);
   }
 
@@ -129,6 +135,9 @@ class TablePropertiesCollector {
 
   // The name of the properties collector can be used for debugging purposes.
   virtual const char* Name() const = 0;
+
+  // EXPERIMENTAL: Returns whether the output file should be further compacted
+  virtual bool NeedCompact() const { return false; }
 };
 
 // Constructs TablePropertiesCollector. Internals create a new
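
A sketch of a user-defined collector built on these callbacks; the deletion
threshold is an arbitrary illustration:

#include <string>
#include "rocksdb/table_properties.h"

class DeletionCountingCollector : public rocksdb::TablePropertiesCollector {
 public:
  rocksdb::Status AddUserKey(const rocksdb::Slice& key,
                             const rocksdb::Slice& value,
                             rocksdb::EntryType type,
                             rocksdb::SequenceNumber seq,
                             uint64_t file_size) override {
    if (type == rocksdb::kEntryDelete) ++deletions_;
    return rocksdb::Status::OK();
  }
  // Legacy callback, retained in case the base class declares it pure.
  rocksdb::Status Add(const rocksdb::Slice& key,
                      const rocksdb::Slice& value) override {
    return rocksdb::Status::OK();
  }
  rocksdb::Status Finish(rocksdb::UserCollectedProperties* props) override {
    props->insert({"example.deletion.count", std::to_string(deletions_)});
    return rocksdb::Status::OK();
  }
  rocksdb::UserCollectedProperties GetReadableProperties() const override {
    return {{"example.deletion.count", std::to_string(deletions_)}};
  }
  const char* Name() const override { return "DeletionCountingCollector"; }
  // Ask for further compaction of deletion-heavy output files (EXPERIMENTAL).
  bool NeedCompact() const override { return deletions_ > 10000; }

 private:
  uint64_t deletions_ = 0;
};
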
diff --git a/src/rocksdb/include/rocksdb/thread_status.h b/src/rocksdb/include/rocksdb/thread_status.h
index 67346b8..d8a61b4 100644
--- a/src/rocksdb/include/rocksdb/thread_status.h
+++ b/src/rocksdb/include/rocksdb/thread_status.h
@@ -13,6 +13,7 @@
 
 #pragma once
 
+#include <stdint.h>
 #include <cstddef>
 #include <map>
 #include <string>
@@ -31,7 +32,11 @@ namespace rocksdb {
 
 // TODO(yhchiang): remove this function once c++14 is available
 //                 as std::max will be able to cover this.
-constexpr int constexpr_max(int a, int b) { return a > b ? a : b; }
+// The current MS compiler does not support constexpr
+template <int A, int B>
+struct constexpr_max {
+  static const int result = (A > B) ? A : B;
+};
 
 // A structure that describes the current status of a thread.
 // The status of active threads can be fetched using
@@ -62,7 +67,6 @@ struct ThreadStatus {
     STAGE_COMPACTION_PREPARE,
     STAGE_COMPACTION_RUN,
     STAGE_COMPACTION_PROCESS_KV,
-    STAGE_COMPACTION_FILTER_V2,
     STAGE_COMPACTION_INSTALL,
     STAGE_COMPACTION_SYNC_FILE,
     STAGE_PICK_MEMTABLES_TO_FLUSH,
@@ -91,7 +95,7 @@ struct ThreadStatus {
   // The maximum number of properties of an operation.
   // This number should be set to the biggest NUM_XXX_PROPERTIES.
   static const int kNumOperationProperties =
-      constexpr_max(NUM_COMPACTION_PROPERTIES, NUM_FLUSH_PROPERTIES);
+      constexpr_max<NUM_COMPACTION_PROPERTIES, NUM_FLUSH_PROPERTIES>::result;
 
   // The type used to refer to a thread state.
   // A state describes lower-level action of a thread
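
The template form trades the C++11 constexpr function for a compile-time
constant the MS toolchain of the day can evaluate; for instance:

#include "rocksdb/thread_status.h"

static_assert(rocksdb::constexpr_max<3, 5>::result == 5,
              "larger operand selected at compile time");
// Like the constexpr function it replaces, the result is usable as an
// array bound or as the value of a static const member.
static char scratch[rocksdb::constexpr_max<16, 64>::result];
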
diff --git a/src/rocksdb/include/rocksdb/transaction_log.h b/src/rocksdb/include/rocksdb/transaction_log.h
index 30443bb..1b80b9a 100644
--- a/src/rocksdb/include/rocksdb/transaction_log.h
+++ b/src/rocksdb/include/rocksdb/transaction_log.h
@@ -58,6 +58,27 @@ class LogFile {
 struct BatchResult {
   SequenceNumber sequence = 0;
   std::unique_ptr<WriteBatch> writeBatchPtr;
+
+  // Add an empty constructor and destructor for the rule of five.
+  // However, preserve the original semantics and prohibit copying,
+  // as the unique_ptr member is not copyable.
+  BatchResult() {}
+
+  ~BatchResult() {}
+
+  BatchResult(const BatchResult&) = delete;
+
+  BatchResult& operator=(const BatchResult&) = delete;
+
+  BatchResult(BatchResult&& bResult)
+      : sequence(std::move(bResult.sequence)),
+        writeBatchPtr(std::move(bResult.writeBatchPtr)) {}
+
+  BatchResult& operator=(BatchResult&& bResult) {
+    sequence = std::move(bResult.sequence);
+    writeBatchPtr = std::move(bResult.writeBatchPtr);
+    return *this;
+  }
 };
 
 // A TransactionLogIterator is used to iterate over the transactions in a db.
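
Since BatchResult is now movable but still non-copyable, pulling batches out
of the WAL iterator (DB::GetUpdatesSince from rocksdb/db.h) looks like this
sketch:

#include <memory>
#include "rocksdb/db.h"
#include "rocksdb/transaction_log.h"

rocksdb::Status ScanUpdates(rocksdb::DB* db, rocksdb::SequenceNumber since) {
  std::unique_ptr<rocksdb::TransactionLogIterator> iter;
  rocksdb::Status s = db->GetUpdatesSince(since, &iter);
  if (!s.ok()) return s;
  for (; iter->Valid(); iter->Next()) {
    rocksdb::BatchResult batch = iter->GetBatch();  // moved out, never copied
    // batch.sequence and *batch.writeBatchPtr are now owned by this scope.
  }
  return iter->status();
}
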
diff --git a/src/rocksdb/include/rocksdb/universal_compaction.h b/src/rocksdb/include/rocksdb/universal_compaction.h
index 229e50b..e0f9f83 100644
--- a/src/rocksdb/include/rocksdb/universal_compaction.h
+++ b/src/rocksdb/include/rocksdb/universal_compaction.h
@@ -69,6 +69,11 @@ class CompactionOptionsUniversal {
   // Default: kCompactionStopStyleTotalSize
   CompactionStopStyle stop_style;
 
+  // Option to optimize universal multi-level compaction by enabling
+  // trivial moves for non-overlapping files.
+  // Default: false
+  bool allow_trivial_move;
+
   // Default set of parameters
   CompactionOptionsUniversal()
       : size_ratio(1),
@@ -76,7 +81,8 @@ class CompactionOptionsUniversal {
         max_merge_width(UINT_MAX),
         max_size_amplification_percent(200),
         compression_size_percent(-1),
-        stop_style(kCompactionStopStyleTotalSize) {}
+        stop_style(kCompactionStopStyleTotalSize),
+        allow_trivial_move(false) {}
 };
 
 }  // namespace rocksdb
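
Opting in is a single assignment on the options; a sketch:

#include "rocksdb/options.h"

rocksdb::Options MakeUniversalOptions() {
  rocksdb::Options options;
  options.compaction_style = rocksdb::kCompactionStyleUniversal;
  // Move non-overlapping input files to the output level instead of
  // rewriting them.
  options.compaction_options_universal.allow_trivial_move = true;
  return options;
}
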
diff --git a/src/rocksdb/include/rocksdb/utilities/backupable_db.h b/src/rocksdb/include/rocksdb/utilities/backupable_db.h
index 956ab3d..5c32750 100644
--- a/src/rocksdb/include/rocksdb/utilities/backupable_db.h
+++ b/src/rocksdb/include/rocksdb/utilities/backupable_db.h
@@ -18,6 +18,7 @@
 #include <string>
 #include <map>
 #include <vector>
+#include <functional>
 
 #include "rocksdb/utilities/stackable_db.h"
 
@@ -87,16 +88,25 @@ struct BackupableDBOptions {
   // *turn it on only if you know what you're doing*
   bool share_files_with_checksum;
 
+  // Up to this many background threads will copy files for CreateNewBackup()
+  // and RestoreDBFromBackup()
+  // Default: 1
+  int max_background_operations;
+
+  // During backup, the user can get a callback every time the next
+  // callback_trigger_interval_size bytes have been copied.
+  // Default: 4194304
+  uint64_t callback_trigger_interval_size;
+
   void Dump(Logger* logger) const;
 
-  explicit BackupableDBOptions(const std::string& _backup_dir,
-                               Env* _backup_env = nullptr,
-                               bool _share_table_files = true,
-                               Logger* _info_log = nullptr, bool _sync = true,
-                               bool _destroy_old_data = false,
-                               bool _backup_log_files = true,
-                               uint64_t _backup_rate_limit = 0,
-                               uint64_t _restore_rate_limit = 0)
+  explicit BackupableDBOptions(
+      const std::string& _backup_dir, Env* _backup_env = nullptr,
+      bool _share_table_files = true, Logger* _info_log = nullptr,
+      bool _sync = true, bool _destroy_old_data = false,
+      bool _backup_log_files = true, uint64_t _backup_rate_limit = 0,
+      uint64_t _restore_rate_limit = 0, int _max_background_operations = 1,
+      uint64_t _callback_trigger_interval_size = 4 * 1024 * 1024)
       : backup_dir(_backup_dir),
         backup_env(_backup_env),
         share_table_files(_share_table_files),
@@ -106,7 +116,9 @@ struct BackupableDBOptions {
         backup_log_files(_backup_log_files),
         backup_rate_limit(_backup_rate_limit),
         restore_rate_limit(_restore_rate_limit),
-        share_files_with_checksum(false) {
+        share_files_with_checksum(false),
+        max_background_operations(_max_background_operations),
+        callback_trigger_interval_size(_callback_trigger_interval_size) {
     assert(share_table_files || !share_files_with_checksum);
   }
 };
@@ -171,10 +183,6 @@ class BackupEngineReadOnly {
  public:
   virtual ~BackupEngineReadOnly() {}
 
-  static BackupEngineReadOnly* NewReadOnlyBackupEngine(
-      Env* db_env, const BackupableDBOptions& options)
-      __attribute__((deprecated("Please use Open() instead")));
-
   static Status Open(Env* db_env, const BackupableDBOptions& options,
                      BackupEngineReadOnly** backup_engine_ptr);
 
@@ -194,6 +202,11 @@ class BackupEngineReadOnly {
   virtual Status RestoreDBFromLatestBackup(
       const std::string& db_dir, const std::string& wal_dir,
       const RestoreOptions& restore_options = RestoreOptions()) = 0;
+
+  // Checks that each file exists and that the size of the file matches our
+  // expectations. It does not check file checksums.
+  // Returns Status::OK() if all checks pass.
+  virtual Status VerifyBackup(BackupID backup_id) = 0;
 };
 
 // Please see the documentation in BackupableDB and RestoreBackupableDB
@@ -201,15 +214,13 @@ class BackupEngine {
  public:
   virtual ~BackupEngine() {}
 
-  static BackupEngine* NewBackupEngine(Env* db_env,
-                                       const BackupableDBOptions& options)
-    __attribute__((deprecated("Please use Open() instead")));
-
   static Status Open(Env* db_env,
                      const BackupableDBOptions& options,
                      BackupEngine** backup_engine_ptr);
 
-  virtual Status CreateNewBackup(DB* db, bool flush_before_backup = false) = 0;
+  virtual Status CreateNewBackup(
+      DB* db, bool flush_before_backup = false,
+      std::function<void()> progress_callback = []() {}) = 0;
   virtual Status PurgeOldBackups(uint32_t num_backups_to_keep) = 0;
   virtual Status DeleteBackup(BackupID backup_id) = 0;
   virtual void StopBackup() = 0;
@@ -224,6 +235,11 @@ class BackupEngine {
       const std::string& db_dir, const std::string& wal_dir,
       const RestoreOptions& restore_options = RestoreOptions()) = 0;
 
+  // Checks that each file exists and that the size of the file matches our
+  // expectations. It does not check file checksums.
+  // Returns Status::OK() if all checks pass.
+  virtual Status VerifyBackup(BackupID backup_id) = 0;
+
   virtual Status GarbageCollect() = 0;
 };
 
@@ -265,6 +281,7 @@ class BackupableDB : public StackableDB {
 
  private:
   BackupEngine* backup_engine_;
+  Status status_;
 };
 
 // Use this class to access information about backups and restore from them
@@ -310,6 +327,7 @@ class RestoreBackupableDB {
 
  private:
   BackupEngine* backup_engine_;
+  Status status_;
 };
 
 }  // namespace rocksdb
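
A sketch combining the three additions: parallel copies, a progress callback
at the configured byte interval, and post-backup verification (the backup ID
of 1 assumes this is the first backup in the directory):

#include <cstdio>
#include <string>
#include "rocksdb/utilities/backupable_db.h"

rocksdb::Status BackupAndVerify(rocksdb::DB* db, rocksdb::Env* env,
                                const std::string& backup_dir) {
  rocksdb::BackupableDBOptions options(backup_dir);
  options.max_background_operations = 4;             // copy with 4 threads
  options.callback_trigger_interval_size = 8 << 20;  // callback every 8 MB
  rocksdb::BackupEngine* engine = nullptr;
  rocksdb::Status s = rocksdb::BackupEngine::Open(env, options, &engine);
  if (!s.ok()) return s;
  s = engine->CreateNewBackup(db, true /* flush_before_backup */,
                              [] { std::printf("another 8 MB copied\n"); });
  if (s.ok()) s = engine->VerifyBackup(1);  // sizes only, not checksums
  delete engine;
  return s;
}
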
diff --git a/src/rocksdb/include/rocksdb/utilities/checkpoint.h b/src/rocksdb/include/rocksdb/utilities/checkpoint.h
index b60f4eb..b2d5458 100644
--- a/src/rocksdb/include/rocksdb/utilities/checkpoint.h
+++ b/src/rocksdb/include/rocksdb/utilities/checkpoint.h
@@ -6,6 +6,7 @@
 // A checkpoint is an openable snapshot of a database at a point in time.
 
 #pragma once
+#ifndef ROCKSDB_LITE
 
 #include <string>
 #include "rocksdb/status.h"
@@ -32,3 +33,4 @@ class Checkpoint {
 };
 
 }  // namespace rocksdb
+#endif  // !ROCKSDB_LITE
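
For context, the API now guarded by ROCKSDB_LITE is used roughly as follows:

#include <string>
#include "rocksdb/db.h"
#include "rocksdb/utilities/checkpoint.h"

rocksdb::Status TakeCheckpoint(rocksdb::DB* db, const std::string& dir) {
  rocksdb::Checkpoint* checkpoint = nullptr;
  rocksdb::Status s = rocksdb::Checkpoint::Create(db, &checkpoint);
  if (s.ok()) {
    s = checkpoint->CreateCheckpoint(dir);  // an openable snapshot of the DB
  }
  delete checkpoint;
  return s;
}
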
diff --git a/src/rocksdb/include/rocksdb/utilities/convenience.h b/src/rocksdb/include/rocksdb/utilities/convenience.h
index 1c1057d..fae420b 100644
--- a/src/rocksdb/include/rocksdb/utilities/convenience.h
+++ b/src/rocksdb/include/rocksdb/utilities/convenience.h
@@ -5,59 +5,6 @@
 
 #pragma once
 
-#include <unordered_map>
-#include <string>
-#include "rocksdb/options.h"
-#include "rocksdb/table.h"
+// This file was moved to rocksdb/convenience.h
 
-namespace rocksdb {
-
-#ifndef ROCKSDB_LITE
-// Take a map of option name and option value, apply them into the
-// base_options, and return the new options as a result
-Status GetColumnFamilyOptionsFromMap(
-    const ColumnFamilyOptions& base_options,
-    const std::unordered_map<std::string, std::string>& opts_map,
-    ColumnFamilyOptions* new_options);
-
-Status GetDBOptionsFromMap(
-    const DBOptions& base_options,
-    const std::unordered_map<std::string, std::string>& opts_map,
-    DBOptions* new_options);
-
-Status GetBlockBasedTableOptionsFromMap(
-    const BlockBasedTableOptions& table_options,
-    const std::unordered_map<std::string, std::string>& opts_map,
-    BlockBasedTableOptions* new_table_options);
-
-// Take a string representation of option names and  values, apply them into the
-// base_options, and return the new options as a result. The string has the
-// following format:
-//   "write_buffer_size=1024;max_write_buffer_number=2"
-// Nested options config is also possible. For example, you can define
-// BlockBasedTableOptions as part of the string for block-based table factory:
-//   "write_buffer_size=1024;block_based_table_factory={block_size=4k};"
-//   "max_write_buffer_num=2"
-Status GetColumnFamilyOptionsFromString(
-    const ColumnFamilyOptions& base_options,
-    const std::string& opts_str,
-    ColumnFamilyOptions* new_options);
-
-Status GetDBOptionsFromString(
-    const DBOptions& base_options,
-    const std::string& opts_str,
-    DBOptions* new_options);
-
-Status GetBlockBasedTableOptionsFromString(
-    const BlockBasedTableOptions& table_options,
-    const std::string& opts_str,
-    BlockBasedTableOptions* new_table_options);
-
-Status GetOptionsFromString(const Options& base_options,
-                            const std::string& opts_str, Options* new_options);
-
-/// Request stopping background work, if wait is true wait until it's done
-void CancelAllBackgroundWork(DB* db, bool wait = false);
-#endif  // ROCKSDB_LITE
-
-}  // namespace rocksdb
+#include "rocksdb/convenience.h"
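
The relocated helpers keep the documented string format, including nested
block-based table options; a sketch against the new header:

#include "rocksdb/convenience.h"
#include "rocksdb/options.h"

rocksdb::Status TuneFromString(const rocksdb::Options& base,
                               rocksdb::Options* out) {
  return rocksdb::GetOptionsFromString(
      base,
      "write_buffer_size=1024;max_write_buffer_number=2;"
      "block_based_table_factory={block_size=4k}",
      out);
}
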
diff --git a/src/rocksdb/include/rocksdb/utilities/info_log_finder.h b/src/rocksdb/include/rocksdb/utilities/info_log_finder.h
new file mode 100644
index 0000000..916c54c
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/info_log_finder.h
@@ -0,0 +1,19 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+
+namespace rocksdb {
+
+// This function can be used to list the information log files,
+// given the db pointer.
+Status GetInfoLogList(DB* db, std::vector<std::string>* info_log_list);
+}  // namespace rocksdb
diff --git a/src/rocksdb/include/rocksdb/utilities/optimistic_transaction_db.h b/src/rocksdb/include/rocksdb/utilities/optimistic_transaction_db.h
new file mode 100644
index 0000000..772e645
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/optimistic_transaction_db.h
@@ -0,0 +1,72 @@
+//  Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <string>
+#include <vector>
+
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+
+namespace rocksdb {
+
+class Transaction;
+
+// Database with Transaction support.
+//
+// See optimistic_transaction.h and examples/transaction_example.cc
+
+// Options to use when starting an Optimistic Transaction
+struct OptimisticTransactionOptions {
+  // Setting set_snapshot=true is the same as calling SetSnapshot().
+  bool set_snapshot = false;
+
+  // Should be set if the DB has a non-default comparator.
+  // See comment in WriteBatchWithIndex constructor.
+  const Comparator* cmp = BytewiseComparator();
+};
+
+class OptimisticTransactionDB {
+ public:
+  // Open an OptimisticTransactionDB similar to DB::Open().
+  static Status Open(const Options& options, const std::string& dbname,
+                     OptimisticTransactionDB** dbptr);
+
+  static Status Open(const DBOptions& db_options, const std::string& dbname,
+                     const std::vector<ColumnFamilyDescriptor>& column_families,
+                     std::vector<ColumnFamilyHandle*>* handles,
+                     OptimisticTransactionDB** dbptr);
+
+  virtual ~OptimisticTransactionDB() {}
+
+  // Starts a new Transaction.  Passing set_snapshot=true has the same effect
+  // as calling SetSnapshot().
+  //
+  // Caller should delete the returned transaction after calling
+  // Commit() or Rollback().
+  virtual Transaction* BeginTransaction(
+      const WriteOptions& write_options,
+      const OptimisticTransactionOptions&
+          txn_options = OptimisticTransactionOptions()) = 0;
+
+  // Return the underlying Database that was opened
+  virtual DB* GetBaseDB() = 0;
+
+ protected:
+  // To create an OptimisticTransactionDB, call Open()
+  explicit OptimisticTransactionDB(DB* db) {}
+  OptimisticTransactionDB() {}
+
+ private:
+  // No copying allowed
+  OptimisticTransactionDB(const OptimisticTransactionDB&);
+  void operator=(const OptimisticTransactionDB&);
+};
+
+}  // namespace rocksdb
+
+#endif  // ROCKSDB_LITE
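
A minimal optimistic round trip, per the comments above (a Busy status from
Commit() signals a write conflict):

#include <string>
#include "rocksdb/utilities/optimistic_transaction_db.h"
#include "rocksdb/utilities/transaction.h"

rocksdb::Status PutOnce(const std::string& dbname) {
  rocksdb::Options options;
  options.create_if_missing = true;
  rocksdb::OptimisticTransactionDB* txn_db = nullptr;
  rocksdb::Status s =
      rocksdb::OptimisticTransactionDB::Open(options, dbname, &txn_db);
  if (!s.ok()) return s;
  rocksdb::Transaction* txn =
      txn_db->BeginTransaction(rocksdb::WriteOptions());
  txn->Put("key", "value");  // conflict-checked at Commit() time
  s = txn->Commit();
  delete txn;
  delete txn_db;
  return s;
}
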
diff --git a/src/rocksdb/include/rocksdb/utilities/spatial_db.h b/src/rocksdb/include/rocksdb/utilities/spatial_db.h
index 1beb5c7..50abbf4 100644
--- a/src/rocksdb/include/rocksdb/utilities/spatial_db.h
+++ b/src/rocksdb/include/rocksdb/utilities/spatial_db.h
@@ -57,34 +57,57 @@ struct Variant {
     new (&data_.s) std::string(s);
   }
 
-  Variant(const Variant& v);
+  Variant(const Variant& v) : type_(v.type_) { Init(v, data_); }
 
-  ~Variant() {
-    if (type_ == kString) {
-      using std::string;
-      (&data_.s)->~string();
-    }
-  }
+  Variant& operator=(const Variant& v);
+
+  Variant(Variant&& rhs) : type_(kNull) { *this = std::move(rhs); }
+
+  Variant& operator=(Variant&& v);
+
+  ~Variant() { Destroy(type_, data_); }
 
   Type type() const { return type_; }
   bool get_bool() const { return data_.b; }
   uint64_t get_int() const { return data_.i; }
   double get_double() const { return data_.d; }
-  const std::string& get_string() const { return data_.s; }
+  const std::string& get_string() const { return *GetStringPtr(data_); }
 
-  bool operator==(const Variant& other);
-  bool operator!=(const Variant& other);
+  bool operator==(const Variant& other) const;
+  bool operator!=(const Variant& other) const { return !(*this == other); }
 
  private:
   Type type_;
+
   union Data {
-    Data() {}
-    ~Data() {}
     bool b;
     uint64_t i;
     double d;
-    std::string s;
+    // The current version of the MS compiler is not C++11 compliant, so we
+    // can not put a std::string directly in the union; we store raw bytes
+    // and construct the string in place instead.
+    char s[sizeof(std::string)];
   } data_;
+
+  // Avoid type_punned aliasing problem
+  static std::string* GetStringPtr(Data& d) {
+    void* p = d.s;
+    return reinterpret_cast<std::string*>(p);
+  }
+
+  static const std::string* GetStringPtr(const Data& d) {
+    const void* p = d.s;
+    return reinterpret_cast<const std::string*>(p);
+  }
+
+  static void Init(const Variant&, Data&);
+
+  static void Destroy(Type t, Data& d) {
+    if (t == kString) {
+      using std::string;
+      GetStringPtr(d)->~string();
+    }
+  }
 };
 
 // FeatureSet is a map of key-value pairs. One feature set is associated with
diff --git a/src/rocksdb/include/rocksdb/utilities/stackable_db.h b/src/rocksdb/include/rocksdb/utilities/stackable_db.h
index 158aa32..aef192b 100644
--- a/src/rocksdb/include/rocksdb/utilities/stackable_db.h
+++ b/src/rocksdb/include/rocksdb/utilities/stackable_db.h
@@ -6,6 +6,12 @@
 #include <string>
 #include "rocksdb/db.h"
 
+#ifdef _WIN32
+// Windows API macro interference
+#undef DeleteFile
+#endif
+
+
 namespace rocksdb {
 
 // This class contains APIs to stack rocksdb wrappers. E.g., stack TTL over a base db
@@ -22,6 +28,8 @@ class StackableDB : public DB {
     return db_;
   }
 
+  virtual DB* GetRootDB() override { return db_->GetRootDB(); }
+
   virtual Status CreateColumnFamily(const ColumnFamilyOptions& options,
                                     const std::string& column_family_name,
                                     ColumnFamilyHandle** handle) override {
@@ -55,6 +63,18 @@ class StackableDB : public DB {
     return db_->MultiGet(options, column_family, keys, values);
   }
 
+  using DB::AddFile;
+  virtual Status AddFile(ColumnFamilyHandle* column_family,
+                         const ExternalSstFileInfo* file_info,
+                         bool move_file) override {
+    return db_->AddFile(column_family, file_info, move_file);
+  }
+  virtual Status AddFile(ColumnFamilyHandle* column_family,
+                         const std::string& file_path,
+                         bool move_file) override {
+    return db_->AddFile(column_family, file_path, move_file);
+  }
+
   using DB::KeyMayExist;
   virtual bool KeyMayExist(const ReadOptions& options,
                            ColumnFamilyHandle* column_family, const Slice& key,
@@ -70,6 +90,13 @@ class StackableDB : public DB {
     return db_->Delete(wopts, column_family, key);
   }
 
+  using DB::SingleDelete;
+  virtual Status SingleDelete(const WriteOptions& wopts,
+                              ColumnFamilyHandle* column_family,
+                              const Slice& key) override {
+    return db_->SingleDelete(wopts, column_family, key);
+  }
+
   using DB::Merge;
   virtual Status Merge(const WriteOptions& options,
                        ColumnFamilyHandle* column_family, const Slice& key,
@@ -119,18 +146,16 @@ class StackableDB : public DB {
 
   using DB::GetApproximateSizes;
   virtual void GetApproximateSizes(ColumnFamilyHandle* column_family,
-                                   const Range* r, int n,
-                                   uint64_t* sizes) override {
+                                   const Range* r, int n, uint64_t* sizes,
+                                   bool include_memtable = false) override {
       return db_->GetApproximateSizes(column_family, r, n, sizes);
   }
 
   using DB::CompactRange;
-  virtual Status CompactRange(ColumnFamilyHandle* column_family,
-                              const Slice* begin, const Slice* end,
-                              bool reduce_level = false, int target_level = -1,
-                              uint32_t target_path_id = 0) override {
-    return db_->CompactRange(column_family, begin, end, reduce_level,
-                             target_level, target_path_id);
+  virtual Status CompactRange(const CompactRangeOptions& options,
+                              ColumnFamilyHandle* column_family,
+                              const Slice* begin, const Slice* end) override {
+    return db_->CompactRange(options, column_family, begin, end);
   }
 
   using DB::CompactFiles;
@@ -144,6 +169,13 @@ class StackableDB : public DB {
         output_level, output_path_id);
   }
 
+  virtual Status PauseBackgroundWork() override {
+    return db_->PauseBackgroundWork();
+  }
+  virtual Status ContinueBackgroundWork() override {
+    return db_->ContinueBackgroundWork();
+  }
+
   using DB::NumberLevels;
   virtual int NumberLevels(ColumnFamilyHandle* column_family) override {
     return db_->NumberLevels(column_family);
@@ -186,6 +218,10 @@ class StackableDB : public DB {
     return db_->Flush(fopts, column_family);
   }
 
+  virtual Status SyncWAL() override {
+    return db_->SyncWAL();
+  }
+
 #ifndef ROCKSDB_LITE
 
   virtual Status DisableFileDeletions() override {
@@ -226,7 +262,7 @@ class StackableDB : public DB {
     return db_->DeleteFile(name);
   }
 
-  virtual Status GetDbIdentity(std::string& identity) override {
+  virtual Status GetDbIdentity(std::string& identity) const override {
     return db_->GetDbIdentity(identity);
   }
 
diff --git a/src/rocksdb/include/rocksdb/utilities/table_properties_collectors.h b/src/rocksdb/include/rocksdb/utilities/table_properties_collectors.h
new file mode 100644
index 0000000..d31baf9
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/table_properties_collectors.h
@@ -0,0 +1,29 @@
+//  Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+#ifndef ROCKSDB_LITE
+#include <memory>
+
+#include "rocksdb/table_properties.h"
+
+namespace rocksdb {
+
+// Creates a factory for a table property collector that marks an SST
+// file as needing compaction when it observes at least "D" deletion
+// entries in any "N" consecutive entries.
+//
+// @param sliding_window_size "N". Note that this number will be
+//     rounded up to the smallest multiple of 128 that is no less
+//     than the specified size.
+// @param deletion_trigger "D".  Note that even when "N" is changed,
+//     the specified number for "D" will not be changed.
+extern std::shared_ptr<TablePropertiesCollectorFactory>
+    NewCompactOnDeletionCollectorFactory(
+        size_t sliding_window_size,
+        size_t deletion_trigger);
+}  // namespace rocksdb
+
+#endif  // !ROCKSDB_LITE
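
Wiring the factory into column family options, with "N" and "D" chosen
purely for illustration:

#include "rocksdb/options.h"
#include "rocksdb/utilities/table_properties_collectors.h"

void EnableDeletionTriggeredCompaction(rocksdb::ColumnFamilyOptions* cf_opts) {
  // Mark a file as needing compaction once 1024 deletions ("D") appear in
  // any window of 16384 consecutive entries ("N", a multiple of 128).
  cf_opts->table_properties_collector_factories.emplace_back(
      rocksdb::NewCompactOnDeletionCollectorFactory(16384, 1024));
}
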
diff --git a/src/rocksdb/include/rocksdb/utilities/transaction.h b/src/rocksdb/include/rocksdb/utilities/transaction.h
new file mode 100644
index 0000000..6c2640a
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/transaction.h
@@ -0,0 +1,307 @@
+// Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <string>
+#include <vector>
+
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/status.h"
+
+namespace rocksdb {
+
+class Iterator;
+class TransactionDB;
+class WriteBatchWithIndex;
+
+// Provides BEGIN/COMMIT/ROLLBACK transactions.
+//
+// To use transactions, you must first create either an OptimisticTransactionDB
+// or a TransactionDB.  See examples/[optimistic_]transaction_example.cc for
+// more information.
+//
+// To create a transaction, use [Optimistic]TransactionDB::BeginTransaction().
+//
+// It is up to the caller to synchronize access to this object.
+//
+// See examples/transaction_example.cc for some simple examples.
+//
+// TODO(agiardullo): Not yet implemented
+//  -PerfContext statistics
+//  -Support for using Transactions with DBWithTTL
+class Transaction {
+ public:
+  virtual ~Transaction() {}
+
+  // If a transaction has a snapshot set, the transaction will ensure that
+  // any keys successfully written (or fetched via GetForUpdate()) have not
+  // been modified outside of this transaction since the time the snapshot was
+  // set.
+  // If a snapshot has not been set, the transaction guarantees that keys have
+  // not been modified since the time each key was first written (or fetched via
+  // GetForUpdate()).
+  //
+  // Using SetSnapshot() will provide stricter isolation guarantees at the
+  // expense of potentially more transaction failures due to conflicts with
+  // other writes.
+  //
+  // Calling SetSnapshot() has no effect on keys written before this function
+  // has been called.
+  //
+  // SetSnapshot() may be called multiple times if you would like to change
+  // the snapshot used for different operations in this transaction.
+  //
+  // Calling SetSnapshot will not affect the version of data returned by Get()
+  // methods.  See Transaction::Get() for more details.
+  virtual void SetSnapshot() = 0;
+
+  // Returns the Snapshot created by the last call to SetSnapshot().
+  //
+  // REQUIRED: The returned Snapshot is only valid up until the next time
+  // SetSnapshot() is called or the Transaction is deleted.
+  virtual const Snapshot* GetSnapshot() const = 0;
+
+  // Write all batched keys to the db atomically.
+  //
+  // Returns OK on success.
+  //
+  // May return any error status that could be returned by DB:Write().
+  //
+  // If this transaction was created by an OptimisticTransactionDB(),
+  // Status::Busy() may be returned if the transaction could not guarantee
+  // that there are no write conflicts.  Status::TryAgain() may be returned
+  // if the memtable history size is not large enough
+  //  (See max_write_buffer_number_to_maintain).
+  //
+  // If this transaction was created by a TransactionDB(), Status::Expired()
+  // may be returned if this transaction has lived for longer than
+  // TransactionOptions.expiration.
+  virtual Status Commit() = 0;
+
+  // Discard all batched writes in this transaction.
+  virtual void Rollback() = 0;
+
+  // Records the state of the transaction for future calls to
+  // RollbackToSavePoint().  May be called multiple times to set multiple save
+  // points.
+  virtual void SetSavePoint() = 0;
+
+  // Undo all operations in this transaction (Put, Merge, Delete, PutLogData)
+  // since the most recent call to SetSavePoint() and removes the most recent
+  // SetSavePoint().
+  // If there is no previous call to SetSavePoint(), returns Status::NotFound()
+  virtual Status RollbackToSavePoint() = 0;
+
+  // This function is similar to DB::Get() except it will also read pending
+  // changes in this transaction.  Currently, this function will return
+  // Status::MergeInProgress if the most recent write to the queried key in
+  // this batch is a Merge.
+  //
+  // If read_options.snapshot is not set, the current version of the key will
+  // be read.  Calling SetSnapshot() does not affect the version of the data
+  // returned.
+  //
+  // Note that setting read_options.snapshot will affect what is read from the
+  // DB but will NOT change which keys are read from this transaction (the keys
+  // in this transaction do not yet belong to any snapshot and will be fetched
+  // regardless).
+  virtual Status Get(const ReadOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     std::string* value) = 0;
+
+  virtual Status Get(const ReadOptions& options, const Slice& key,
+                     std::string* value) = 0;
+
+  virtual std::vector<Status> MultiGet(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_family,
+      const std::vector<Slice>& keys, std::vector<std::string>* values) = 0;
+
+  virtual std::vector<Status> MultiGet(const ReadOptions& options,
+                                       const std::vector<Slice>& keys,
+                                       std::vector<std::string>* values) = 0;
+
+  // Read this key and ensure that this transaction will only
+  // be able to be committed if this key is not written outside this
+  // transaction after it has first been read (or after the snapshot if a
+  // snapshot is set in this transaction).  The transaction behavior is the
+  // same regardless of whether the key exists or not.
+  //
+  // Note: Currently, this function will return Status::MergeInProgress
+  // if the most recent write to the queried key in this batch is a Merge.
+  //
+  // The values returned by this function are similar to Transaction::Get().
+  // If value==nullptr, then this function will not read any data, but will
+  // still ensure that this key cannot be written to by outside of this
+  // transaction.
+  //
+  // If this transaction was created by an OptimisticTransaction, GetForUpdate()
+  // could cause Commit() to fail.  Otherwise, it could return any error
+  // that could be returned by DB::Get().
+  //
+  // If this transaction was created by a TransactionDB, it can return
+  // Status::OK() on success,
+  // Status::Busy() if there is a write conflict,
+  // Status::TimedOut() if a lock could not be acquired,
+  // Status::TryAgain() if the memtable history size is not large enough
+  //  (See max_write_buffer_number_to_maintain)
+  // Status::MergeInProgress() if merge operations cannot be resolved.
+  // or other errors if this key could not be read.
+  virtual Status GetForUpdate(const ReadOptions& options,
+                              ColumnFamilyHandle* column_family,
+                              const Slice& key, std::string* value) = 0;
+
+  virtual Status GetForUpdate(const ReadOptions& options, const Slice& key,
+                              std::string* value) = 0;
+
+  virtual std::vector<Status> MultiGetForUpdate(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_family,
+      const std::vector<Slice>& keys, std::vector<std::string>* values) = 0;
+
+  virtual std::vector<Status> MultiGetForUpdate(
+      const ReadOptions& options, const std::vector<Slice>& keys,
+      std::vector<std::string>* values) = 0;
+
+  // Returns an iterator that will iterate on all keys in the default
+  // column family including both keys in the DB and uncommitted keys in this
+  // transaction.
+  //
+  // Setting read_options.snapshot will affect what is read from the
+  // DB but will NOT change which keys are read from this transaction (the keys
+  // in this transaction do not yet belong to any snapshot and will be fetched
+  // regardless).
+  //
+  // Caller is responsible for deleting the returned Iterator.
+  //
+  // The returned iterator is only valid until Commit(), Rollback(), or
+  // RollbackToSavePoint() is called.
+  // NOTE: Transaction::Put/Merge/Delete will currently invalidate this
+  // iterator until the following issue is fixed:
+  // https://github.com/facebook/rocksdb/issues/616
+  virtual Iterator* GetIterator(const ReadOptions& read_options) = 0;
+
+  virtual Iterator* GetIterator(const ReadOptions& read_options,
+                                ColumnFamilyHandle* column_family) = 0;
+
+  // Put, Merge, Delete, and SingleDelete behave similarly to the corresponding
+  // functions in WriteBatch, but will also do conflict checking on the
+  // keys being written.
+  //
+  // If this Transaction was created on an OptimisticTransactionDB, these
+  // functions should always return Status::OK().
+  //
+  // If this Transaction was created on a TransactionDB, the status returned
+  // can be:
+  // Status::OK() on success,
+  // Status::Busy() if there is a write conflict,
+  // Status::TimedOut() if a lock could not be acquired,
+  // Status::TryAgain() if the memtable history size is not large enough
+  //  (see max_write_buffer_number_to_maintain),
+  // or other errors on unexpected failures.
+  virtual Status Put(ColumnFamilyHandle* column_family, const Slice& key,
+                     const Slice& value) = 0;
+  virtual Status Put(const Slice& key, const Slice& value) = 0;
+  virtual Status Put(ColumnFamilyHandle* column_family, const SliceParts& key,
+                     const SliceParts& value) = 0;
+  virtual Status Put(const SliceParts& key, const SliceParts& value) = 0;
+
+  virtual Status Merge(ColumnFamilyHandle* column_family, const Slice& key,
+                       const Slice& value) = 0;
+  virtual Status Merge(const Slice& key, const Slice& value) = 0;
+
+  virtual Status Delete(ColumnFamilyHandle* column_family,
+                        const Slice& key) = 0;
+  virtual Status Delete(const Slice& key) = 0;
+  virtual Status Delete(ColumnFamilyHandle* column_family,
+                        const SliceParts& key) = 0;
+  virtual Status Delete(const SliceParts& key) = 0;
+
+  virtual Status SingleDelete(ColumnFamilyHandle* column_family,
+                              const Slice& key) = 0;
+  virtual Status SingleDelete(const Slice& key) = 0;
+  virtual Status SingleDelete(ColumnFamilyHandle* column_family,
+                              const SliceParts& key) = 0;
+  virtual Status SingleDelete(const SliceParts& key) = 0;
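+
+  // Illustrative sketch (not part of the upstream header): with a
+  // pessimistic (TransactionDB) transaction, each write may fail to acquire
+  // its lock, so the status should be checked.
+  //
+  //   Status s = txn->Put("a", "1");
+  //   if (s.IsBusy() || s.IsTimedOut()) {
+  //     txn->Rollback();  // another writer holds a conflicting lock
+  //   }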
+
+  // PutUntracked() will write a Put to the batch of operations to be committed
+  // in this transaction.  This write will only happen if this transaction
+  // gets committed successfully.  But unlike Transaction::Put(),
+  // no conflict checking will be done for this key.
+  //
+  // If this Transaction was created on a TransactionDB, this function will
+  // still acquire locks necessary to make sure this write doesn't cause
+  // conflicts in other transactions and may return Status::Busy().
+  virtual Status PutUntracked(ColumnFamilyHandle* column_family,
+                              const Slice& key, const Slice& value) = 0;
+  virtual Status PutUntracked(const Slice& key, const Slice& value) = 0;
+  virtual Status PutUntracked(ColumnFamilyHandle* column_family,
+                              const SliceParts& key,
+                              const SliceParts& value) = 0;
+  virtual Status PutUntracked(const SliceParts& key,
+                              const SliceParts& value) = 0;
+
+  virtual Status MergeUntracked(ColumnFamilyHandle* column_family,
+                                const Slice& key, const Slice& value) = 0;
+  virtual Status MergeUntracked(const Slice& key, const Slice& value) = 0;
+
+  virtual Status DeleteUntracked(ColumnFamilyHandle* column_family,
+                                 const Slice& key) = 0;
+
+  virtual Status DeleteUntracked(const Slice& key) = 0;
+  virtual Status DeleteUntracked(ColumnFamilyHandle* column_family,
+                                 const SliceParts& key) = 0;
+  virtual Status DeleteUntracked(const SliceParts& key) = 0;
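+
+  // Illustrative sketch (not part of the upstream header): an untracked
+  // write that skips this transaction's own conflict checking.
+  //
+  //   Status s = txn->PutUntracked("stats-counter", "42");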
+
+  // Similar to WriteBatch::PutLogData
+  virtual void PutLogData(const Slice& blob) = 0;
+
+  // Returns the number of distinct Keys being tracked by this transaction.
+  // If this transaction was created by a TransactionDB, this is the number of
+  // keys that are currently locked by this transaction.
+  // If this transaction was created by an OptimisticTransactionDB, this is the
+  // number of keys that need to be checked for conflicts at commit time.
+  virtual uint64_t GetNumKeys() const = 0;
+
+  // Returns the number of Puts/Deletes/Merges that have been applied to this
+  // transaction so far.
+  virtual uint64_t GetNumPuts() const = 0;
+  virtual uint64_t GetNumDeletes() const = 0;
+  virtual uint64_t GetNumMerges() const = 0;
+
+  // Returns the elapsed time in milliseconds since this Transaction began.
+  virtual uint64_t GetElapsedTime() const = 0;
+
+  // Fetch the underlying write batch that contains all pending changes to be
+  // committed.
+  //
+  // Note:  You should not write or delete anything from the batch directly;
+  // use only the functions in the Transaction class to write to this
+  // transaction.
+  virtual WriteBatchWithIndex* GetWriteBatch() = 0;
+
+  // Change the value of TransactionOptions.lock_timeout (in milliseconds) for
+  // this transaction.
+  // Has no effect on OptimisticTransactions.
+  virtual void SetLockTimeout(int64_t timeout) = 0;
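+
+  // Illustrative sketch (not part of the upstream header):
+  //
+  //   txn->SetLockTimeout(50);  // wait at most 50 ms per lock attempt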
+
+ protected:
+  explicit Transaction(const TransactionDB* db) {}
+  Transaction() {}
+
+ private:
+  // No copying allowed
+  Transaction(const Transaction&);
+  void operator=(const Transaction&);
+};
+
+}  // namespace rocksdb
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/transaction_db.h b/src/rocksdb/include/rocksdb/utilities/transaction_db.h
new file mode 100644
index 0000000..f9023fc
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/transaction_db.h
@@ -0,0 +1,137 @@
+//  Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <string>
+#include <vector>
+
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/utilities/stackable_db.h"
+#include "rocksdb/utilities/transaction.h"
+
+// Database with Transaction support.
+//
+// See transaction.h and examples/transaction_example.cc
+
+namespace rocksdb {
+
+class TransactionDBMutexFactory;
+
+struct TransactionDBOptions {
+  // Specifies the maximum number of keys that can be locked at the same time
+  // per column family.
+  // If the number of locked keys is greater than max_num_locks, transaction
+  // writes (or GetForUpdate) will return an error.
+  // If this value is not positive, no limit will be enforced.
+  int64_t max_num_locks = -1;
+
+  // Increasing this value will increase the concurrency by dividing the lock
+  // table (per column family) into more sub-tables, each with its own
+  // separate mutex.
+  size_t num_stripes = 16;
+
+  // If positive, specifies the default wait timeout in milliseconds when
+  // a transaction attempts to lock a key if not specified by
+  // TransactionOptions::lock_timeout.
+  //
+  // If 0, no waiting is done if a lock cannot instantly be acquired.
+  // If negative, there is no timeout.  Not using a timeout is not recommended
+  // as it can lead to deadlocks.  Currently, there is no deadlock detection
+  // to recover from a deadlock.
+  int64_t transaction_lock_timeout = 1000;  // 1 second
+
+  // If positive, specifies the wait timeout in milliseconds when writing a key
+  // OUTSIDE of a transaction (ie by calling DB::Put(),Merge(),Delete(),Write()
+  // directly).
+  // If 0, no waiting is done if a lock cannot instantly be acquired.
+  // If negative, there is no timeout and the write will block indefinitely
+  // when acquiring a lock.
+  //
+  // Not using a timeout can lead to deadlocks.  Currently, there
+  // is no deadlock detection to recover from a deadlock.  While DB writes
+  // cannot deadlock with other DB writes, they can deadlock with a transaction.
+  // A negative timeout should only be used if all transactions have a small
+  // expiration set.
+  int64_t default_lock_timeout = 1000;  // 1 second
+
+  // If set, the TransactionDB will use this implementation of a mutex and
+  // condition variable for all transaction locking instead of the default
+  // mutex/condvar implementation.
+  std::shared_ptr<TransactionDBMutexFactory> custom_mutex_factory;
+};
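+
+// Illustrative sketch (not part of the upstream header): tightening the
+// locking limits before opening a TransactionDB; the values are arbitrary
+// examples.
+//
+//   TransactionDBOptions txn_db_options;
+//   txn_db_options.max_num_locks = 10000;
+//   txn_db_options.transaction_lock_timeout = 100;  // 100 ms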
+
+struct TransactionOptions {
+  // Setting set_snapshot=true is the same as calling
+  // Transaction::SetSnapshot().
+  bool set_snapshot = false;
+
+
+  // TODO(agiardullo): TransactionDB does not yet support comparators that
+  // allow two non-equal keys to be equivalent.  I.e., cmp->Compare(a,b)
+  // should only return 0 if a.compare(b) returns 0.
+
+
+  // If positive, specifies the wait timeout in milliseconds when
+  // a transaction attempts to lock a key.
+  //
+  // If 0, no waiting is done if a lock cannot instantly be acquired.
+  // If negative, TransactionDBOptions::transaction_lock_timeout will be used.
+  int64_t lock_timeout = -1;
+
+  // Expiration duration in milliseconds.  If non-negative, transactions that
+  // last longer than this many milliseconds will fail to commit.  If not set,
+  // a forgotten transaction that is never committed, rolled back, or deleted
+  // will never relinquish any locks it holds.  This could prevent keys
+  // from being written by other writers.
+  //
+  // TODO(agiardullo):  Improve performance of checking expiration time.
+  int64_t expiration = -1;
+};
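+
+// Illustrative sketch (not part of the upstream header): a transaction
+// with a snapshot and a 500 ms expiration.
+//
+//   TransactionOptions txn_options;
+//   txn_options.set_snapshot = true;
+//   txn_options.expiration = 500;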
+
+class TransactionDB : public StackableDB {
+ public:
+  // Open a TransactionDB similar to DB::Open().
+  static Status Open(const Options& options,
+                     const TransactionDBOptions& txn_db_options,
+                     const std::string& dbname, TransactionDB** dbptr);
+
+  static Status Open(const DBOptions& db_options,
+                     const TransactionDBOptions& txn_db_options,
+                     const std::string& dbname,
+                     const std::vector<ColumnFamilyDescriptor>& column_families,
+                     std::vector<ColumnFamilyHandle*>* handles,
+                     TransactionDB** dbptr);
+
+  virtual ~TransactionDB() {}
+
+  // Starts a new Transaction.  Passing set_snapshot=true has the same effect
+  // as calling Transaction::SetSnapshot().
+  //
+  // Caller should delete the returned transaction after calling
+  // Transaction::Commit() or Transaction::Rollback().
+  virtual Transaction* BeginTransaction(
+      const WriteOptions& write_options,
+      const TransactionOptions& txn_options = TransactionOptions()) = 0;
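+
+  // Illustrative sketch (not part of the upstream header): opening a
+  // TransactionDB and committing a transaction; "/tmp/testdb" is an
+  // arbitrary example path.
+  //
+  //   Options options;
+  //   options.create_if_missing = true;
+  //   TransactionDB* txn_db;
+  //   Status s = TransactionDB::Open(options, TransactionDBOptions(),
+  //                                  "/tmp/testdb", &txn_db);
+  //   Transaction* txn = txn_db->BeginTransaction(WriteOptions());
+  //   txn->Put("key", "value");
+  //   s = txn->Commit();
+  //   delete txn;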
+
+ protected:
+  // To create a TransactionDB, call Open()
+  explicit TransactionDB(DB* db) : StackableDB(db) {}
+
+ private:
+  // No copying allowed
+  TransactionDB(const TransactionDB&);
+  void operator=(const TransactionDB&);
+};
+
+}  // namespace rocksdb
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/transaction_db_mutex.h b/src/rocksdb/include/rocksdb/utilities/transaction_db_mutex.h
new file mode 100644
index 0000000..773ebc1
--- /dev/null
+++ b/src/rocksdb/include/rocksdb/utilities/transaction_db_mutex.h
@@ -0,0 +1,92 @@
+//  Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <memory>
+
+#include "rocksdb/status.h"
+
+namespace rocksdb {
+
+// The TransactionDBMutex and TransactionDBCondVar APIs allow applications to
+// implement custom mutexes and condition variables to be used by a
+// TransactionDB when locking keys.
+//
+// To open a TransactionDB with a custom TransactionDBMutexFactory, set
+// TransactionDBOptions.custom_mutex_factory.
+
+class TransactionDBMutex {
+ public:
+  virtual ~TransactionDBMutex() {}
+
+  // Attempt to acquire lock.  Return OK on success, or other Status on failure.
+  // If returned status is OK, TransactionDB will eventually call UnLock().
+  virtual Status Lock() = 0;
+
+  // Attempt to acquire lock.  If timeout is non-negative, the operation
+  // should fail after this many microseconds.
+  // Returns OK on success,
+  //         TimedOut if timed out,
+  //         or other Status on failure.
+  // If returned status is OK, TransactionDB will eventually call UnLock().
+  virtual Status TryLockFor(int64_t timeout_time) = 0;
+
+  // Unlock Mutex that was successfully locked by Lock() or TryLockFor()
+  virtual void UnLock() = 0;
+};
+
+class TransactionDBCondVar {
+ public:
+  virtual ~TransactionDBCondVar() {}
+
+  // Block current thread until condition variable is notified by a call to
+  // Notify() or NotifyAll().  Wait() will be called with mutex locked.
+  // Returns OK if notified.
+  // Returns non-OK if TransactionDB should stop waiting and fail the operation.
+  // May return OK spuriously even if not notified.
+  virtual Status Wait(std::shared_ptr<TransactionDBMutex> mutex) = 0;
+
+  // Block current thread until condition variable is notified by a call to
+  // Notify() or NotifyAll(), or if the timeout is reached.
+  // Wait() will be called with mutex locked.
+  //
+  // If timeout is non-negative, the operation should fail after this many
+  // microseconds.
+  // If implementing a custom version of this class, the implementation may
+  // choose to ignore the timeout.
+  //
+  // Returns OK if notified.
+  // Returns TimedOut if timeout is reached.
+  // Returns other status if TransactionDB should otherwise stop waiting and
+  //  fail the operation.
+  // May return OK spuriously even if not notified.
+  virtual Status WaitFor(std::shared_ptr<TransactionDBMutex> mutex,
+                         int64_t timeout_time) = 0;
+
+  // If any threads are waiting on *this, unblock at least one of the
+  // waiting threads.
+  virtual void Notify() = 0;
+
+  // Unblocks all threads waiting on *this.
+  virtual void NotifyAll() = 0;
+};
+
+// Factory class that can allocate mutexes and condition variables.
+class TransactionDBMutexFactory {
+ public:
+  // Create a TransactionDBMutex object.
+  virtual std::shared_ptr<TransactionDBMutex> AllocateMutex() = 0;
+
+  // Create a TransactionDBCondVar object.
+  virtual std::shared_ptr<TransactionDBCondVar> AllocateCondVar() = 0;
+
+  virtual ~TransactionDBMutexFactory() {}
+};
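+
+// Illustrative sketch (not part of the upstream header): installing a
+// custom factory; MyMutexFactory is a hypothetical subclass implementing
+// AllocateMutex() and AllocateCondVar().
+//
+//   TransactionDBOptions txn_db_options;
+//   txn_db_options.custom_mutex_factory =
+//       std::make_shared<MyMutexFactory>();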
+
+}  // namespace rocksdb
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/utilities/utility_db.h b/src/rocksdb/include/rocksdb/utilities/utility_db.h
index f4db665..a34a638 100644
--- a/src/rocksdb/include/rocksdb/utilities/utility_db.h
+++ b/src/rocksdb/include/rocksdb/utilities/utility_db.h
@@ -19,7 +19,12 @@ class UtilityDB {
   // This function is here only for backwards compatibility. Please use the
   // functions defined in DBWithTTL (rocksdb/utilities/db_ttl.h)
   // (deprecated)
-  __attribute__((deprecated)) static Status OpenTtlDB(const Options& options,
+#if defined(__GNUC__) || defined(__clang__)
+  __attribute__((deprecated))
+#elif defined(_WIN32)
+  __declspec(deprecated)
+#endif
+    static Status OpenTtlDB(const Options& options,
                                                       const std::string& name,
                                                       StackableDB** dbptr,
                                                       int32_t ttl = 0,
diff --git a/src/rocksdb/include/rocksdb/utilities/write_batch_with_index.h b/src/rocksdb/include/rocksdb/utilities/write_batch_with_index.h
index 7c17534..1e41e78 100644
--- a/src/rocksdb/include/rocksdb/utilities/write_batch_with_index.h
+++ b/src/rocksdb/include/rocksdb/utilities/write_batch_with_index.h
@@ -8,9 +8,10 @@
 //
 // A WriteBatchWithIndex with a binary searchable index built for all the keys
 // inserted.
-
 #pragma once
 
+#ifndef ROCKSDB_LITE
+
 #include <string>
 
 #include "rocksdb/comparator.h"
@@ -28,10 +29,16 @@ class DB;
 struct ReadOptions;
 struct DBOptions;
 
-enum WriteType { kPutRecord, kMergeRecord, kDeleteRecord, kLogDataRecord };
+enum WriteType {
+  kPutRecord,
+  kMergeRecord,
+  kDeleteRecord,
+  kSingleDeleteRecord,
+  kLogDataRecord
+};
 
-// an entry for Put, Merge or Delete entry for write batches. Used in
-// WBWIIterator.
+// An entry for a Put, Merge, Delete, or SingleDelete operation in a write
+// batch.  Used in WBWIIterator.
 struct WriteEntry {
   WriteType type;
   Slice key;
@@ -55,15 +62,17 @@ class WBWIIterator {
 
   virtual void Prev() = 0;
 
-  virtual const WriteEntry& Entry() const = 0;
+  // The returned WriteEntry is only valid until the next mutation of the
+  // WriteBatchWithIndex.
+  virtual WriteEntry Entry() const = 0;
 
   virtual Status status() const = 0;
 };
 
 // A WriteBatchWithIndex with a binary searchable index built for all the keys
 // inserted.
-// In Put(), Merge() or Delete(), the same function of the wrapped will be
-// called. At the same time, indexes will be built.
+// In Put(), Merge(), Delete(), or SingleDelete(), the same function of the
+// wrapped batch will be called. At the same time, indexes will be built.
 // By calling GetWriteBatch(), a user will get the WriteBatch for the data
 // they inserted, which can be used for DB::Write().
 // A user can call NewIterator() to create an iterator.
@@ -98,6 +107,11 @@ class WriteBatchWithIndex : public WriteBatchBase {
   void Delete(ColumnFamilyHandle* column_family, const Slice& key) override;
   void Delete(const Slice& key) override;
 
+  using WriteBatchBase::SingleDelete;
+  void SingleDelete(ColumnFamilyHandle* column_family,
+                    const Slice& key) override;
+  void SingleDelete(const Slice& key) override;
+
   using WriteBatchBase::PutLogData;
   void PutLogData(const Slice& blob) override;
 
@@ -112,12 +126,21 @@ class WriteBatchWithIndex : public WriteBatchBase {
   // order given by index_comparator. For multiple updates on the same key,
   // each update will be returned as a separate entry, in the order of update
   // time.
+  //
+  // The returned iterator should be deleted by the caller.
   WBWIIterator* NewIterator(ColumnFamilyHandle* column_family);
   // Create an iterator of the default column family.
   WBWIIterator* NewIterator();
 
   // Will create a new Iterator that will use WBWIIterator as a delta and
-  // base_iterator as base
+  // base_iterator as base.
+  //
+  // This function is only supported if the WriteBatchWithIndex was
+  // constructed with overwrite_key=true.
+  //
+  // The returned iterator should be deleted by the caller.
+  // The base_iterator is now 'owned' by the returned iterator. Deleting the
+  // returned iterator will also delete the base_iterator.
   Iterator* NewIteratorWithBase(ColumnFamilyHandle* column_family,
                                 Iterator* base_iterator);
   // default column family
@@ -132,7 +155,7 @@ class WriteBatchWithIndex : public WriteBatchBase {
 
   // Similar to previous function but does not require a column_family.
   // Note:  An InvalidArgument status will be returned if there are any Merge
-  // operators for this key.
+  // operators for this key.  Use the previous method instead.
   Status GetFromBatch(const DBOptions& options, const Slice& key,
                       std::string* value) {
     return GetFromBatch(nullptr, options, key, value);
@@ -154,9 +177,28 @@ class WriteBatchWithIndex : public WriteBatchBase {
                            ColumnFamilyHandle* column_family, const Slice& key,
                            std::string* value);
 
+  // Records the state of the batch for future calls to RollbackToSavePoint().
+  // May be called multiple times to set multiple save points.
+  void SetSavePoint() override;
+
+  // Removes all entries in this batch (Put, Merge, Delete, SingleDelete,
+  // PutLogData) since the most recent call to SetSavePoint() and removes the
+  // most recent save point.
+  // If there is no previous call to SetSavePoint(), behaves the same as
+  // Clear().
+  //
+  // Calling RollbackToSavePoint invalidates any open iterators on this batch.
+  //
+  // Returns Status::OK() on success,
+  //         Status::NotFound() if no previous call to SetSavePoint(),
+  //         or other Status on corruption.
+  Status RollbackToSavePoint() override;
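+
+  // Illustrative sketch (not part of the upstream header): partially
+  // rolling back a batch with a save point.
+  //
+  //   WriteBatchWithIndex batch;
+  //   batch.Put("a", "1");
+  //   batch.SetSavePoint();
+  //   batch.Put("b", "2");
+  //   batch.RollbackToSavePoint();  // drops "b"; "a" remains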
+
  private:
   struct Rep;
   Rep* rep;
 };
 
 }  // namespace rocksdb
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/include/rocksdb/version.h b/src/rocksdb/include/rocksdb/version.h
index 26086b2..86a1939 100644
--- a/src/rocksdb/include/rocksdb/version.h
+++ b/src/rocksdb/include/rocksdb/version.h
@@ -4,9 +4,9 @@
 // of patent rights can be found in the PATENTS file in the same directory.
 #pragma once
 
-#define ROCKSDB_MAJOR 3
-#define ROCKSDB_MINOR 11
-#define ROCKSDB_PATCH 2
+#define ROCKSDB_MAJOR 4
+#define ROCKSDB_MINOR 1
+#define ROCKSDB_PATCH 0
 
 // Do not use these. We made the mistake of declaring macros starting with
 // double underscore. Now we have to live with our choice. We'll deprecate these
diff --git a/src/rocksdb/include/rocksdb/write_batch.h b/src/rocksdb/include/rocksdb/write_batch.h
index c096ae1..a097f21 100644
--- a/src/rocksdb/include/rocksdb/write_batch.h
+++ b/src/rocksdb/include/rocksdb/write_batch.h
@@ -25,7 +25,9 @@
 #ifndef STORAGE_ROCKSDB_INCLUDE_WRITE_BATCH_H_
 #define STORAGE_ROCKSDB_INCLUDE_WRITE_BATCH_H_
 
+#include <stack>
 #include <string>
+#include <stdint.h>
 #include "rocksdb/status.h"
 #include "rocksdb/write_batch_base.h"
 
@@ -33,6 +35,7 @@ namespace rocksdb {
 
 class Slice;
 class ColumnFamilyHandle;
+struct SavePoints;
 struct SliceParts;
 
 class WriteBatch : public WriteBatchBase {
@@ -57,6 +60,30 @@ class WriteBatch : public WriteBatchBase {
     Put(nullptr, key, value);
   }
 
+  using WriteBatchBase::Delete;
+  // If the database contains a mapping for "key", erase it.  Else do nothing.
+  void Delete(ColumnFamilyHandle* column_family, const Slice& key) override;
+  void Delete(const Slice& key) override { Delete(nullptr, key); }
+
+  // variant that takes SliceParts
+  void Delete(ColumnFamilyHandle* column_family,
+              const SliceParts& key) override;
+  void Delete(const SliceParts& key) override { Delete(nullptr, key); }
+
+  using WriteBatchBase::SingleDelete;
+  // If the database contains a mapping for "key", erase it. Expects that the
+  // key was not overwritten. Else do nothing.
+  void SingleDelete(ColumnFamilyHandle* column_family,
+                    const Slice& key) override;
+  void SingleDelete(const Slice& key) override { SingleDelete(nullptr, key); }
+
+  // variant that takes SliceParts
+  void SingleDelete(ColumnFamilyHandle* column_family,
+                    const SliceParts& key) override;
+  void SingleDelete(const SliceParts& key) override {
+    SingleDelete(nullptr, key);
+  }
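+
+  // Illustrative sketch (not part of the upstream header): SingleDelete
+  // assumes the key was Put() exactly once and not overwritten.
+  //
+  //   WriteBatch batch;
+  //   batch.Put("k", "v");      // single version of "k"
+  //   batch.SingleDelete("k");  // ok: "k" was never overwritten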
+
   using WriteBatchBase::Merge;
   // Merge "value" with the existing value of "key" in the database.
   // "key->merge(existing, value)"
@@ -66,15 +93,12 @@ class WriteBatch : public WriteBatchBase {
     Merge(nullptr, key, value);
   }
 
-  using WriteBatchBase::Delete;
-  // If the database contains a mapping for "key", erase it.  Else do nothing.
-  void Delete(ColumnFamilyHandle* column_family, const Slice& key) override;
-  void Delete(const Slice& key) override { Delete(nullptr, key); }
-
   // variant that takes SliceParts
-  void Delete(ColumnFamilyHandle* column_family,
-              const SliceParts& key) override;
-  void Delete(const SliceParts& key) override { Delete(nullptr, key); }
+  void Merge(ColumnFamilyHandle* column_family, const SliceParts& key,
+             const SliceParts& value) override;
+  void Merge(const SliceParts& key, const SliceParts& value) override {
+    Merge(nullptr, key, value);
+  }
 
   using WriteBatchBase::PutLogData;
   // Append a blob of arbitrary size to the records in this batch. The blob will
@@ -93,6 +117,17 @@ class WriteBatch : public WriteBatchBase {
   // Clear all updates buffered in this batch.
   void Clear() override;
 
+  // Records the state of the batch for future calls to RollbackToSavePoint().
+  // May be called multiple times to set multiple save points.
+  void SetSavePoint() override;
+
+  // Removes all entries in this batch (Put, Merge, Delete, PutLogData) since
+  // the most recent call to SetSavePoint() and removes the most recent save
+  // point.
+  // If there is no previous call to SetSavePoint(), Status::NotFound()
+  // will be returned.  Otherwise returns Status::OK().
+  Status RollbackToSavePoint() override;
+
   // Support for iterating over the contents of a batch.
   class Handler {
    public:
@@ -114,6 +149,26 @@ class WriteBatch : public WriteBatchBase {
     }
     virtual void Put(const Slice& key, const Slice& value) {}
 
+    virtual Status DeleteCF(uint32_t column_family_id, const Slice& key) {
+      if (column_family_id == 0) {
+        Delete(key);
+        return Status::OK();
+      }
+      return Status::InvalidArgument(
+          "non-default column family and DeleteCF not implemented");
+    }
+    virtual void Delete(const Slice& key) {}
+
+    virtual Status SingleDeleteCF(uint32_t column_family_id, const Slice& key) {
+      if (column_family_id == 0) {
+        SingleDelete(key);
+        return Status::OK();
+      }
+      return Status::InvalidArgument(
+          "non-default column family and SingleDeleteCF not implemented");
+    }
+    virtual void SingleDelete(const Slice& key) {}
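+
+    // Illustrative sketch (not part of the upstream header): a Handler
+    // that counts deletions while iterating a batch.
+    //
+    //   struct DeleteCounter : public WriteBatch::Handler {
+    //     int deletes = 0;
+    //     void Put(const Slice&, const Slice&) override {}
+    //     void Delete(const Slice&) override { ++deletes; }
+    //   };
+    //   DeleteCounter counter;
+    //   batch.Iterate(&counter);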
+
     // Merge and LogData are not pure virtual. Otherwise, we would break
     // existing clients of Handler on a source code level. The default
     // implementation of Merge does nothing.
@@ -130,15 +185,6 @@ class WriteBatch : public WriteBatchBase {
 
     // The default implementation of LogData does nothing.
     virtual void LogData(const Slice& blob);
-    virtual Status DeleteCF(uint32_t column_family_id, const Slice& key) {
-      if (column_family_id == 0) {
-        Delete(key);
-        return Status::OK();
-      }
-      return Status::InvalidArgument(
-          "non-default column family and DeleteCF not implemented");
-    }
-    virtual void Delete(const Slice& key) {}
 
     // Continue is called by WriteBatch::Iterate. If it returns false,
     // iteration is halted. Otherwise, it continues iterating. The default
@@ -160,10 +206,12 @@ class WriteBatch : public WriteBatchBase {
   WriteBatch* GetWriteBatch() override { return this; }
 
   // Constructor with a serialized string object
-  explicit WriteBatch(std::string rep): rep_(rep) {}
+  explicit WriteBatch(const std::string& rep)
+      : save_points_(nullptr), rep_(rep) {}
 
  private:
   friend class WriteBatchInternal;
+  SavePoints* save_points_;
 
  protected:
   std::string rep_;  // See comment in write_batch.cc for the format of rep_
diff --git a/src/rocksdb/include/rocksdb/write_batch_base.h b/src/rocksdb/include/rocksdb/write_batch_base.h
index a218cc1..c408375 100644
--- a/src/rocksdb/include/rocksdb/write_batch_base.h
+++ b/src/rocksdb/include/rocksdb/write_batch_base.h
@@ -11,6 +11,7 @@
 namespace rocksdb {
 
 class Slice;
+class Status;
 class ColumnFamilyHandle;
 class WriteBatch;
 struct SliceParts;
@@ -40,6 +41,11 @@ class WriteBatchBase {
                      const Slice& value) = 0;
   virtual void Merge(const Slice& key, const Slice& value) = 0;
 
+  // variant that takes SliceParts
+  virtual void Merge(ColumnFamilyHandle* column_family, const SliceParts& key,
+                     const SliceParts& value);
+  virtual void Merge(const SliceParts& key, const SliceParts& value);
+
   // If the database contains a mapping for "key", erase it.  Else do nothing.
   virtual void Delete(ColumnFamilyHandle* column_family, const Slice& key) = 0;
   virtual void Delete(const Slice& key) = 0;
@@ -48,6 +54,17 @@ class WriteBatchBase {
   virtual void Delete(ColumnFamilyHandle* column_family, const SliceParts& key);
   virtual void Delete(const SliceParts& key);
 
+  // If the database contains a mapping for "key", erase it. Expects that the
+  // key was not overwritten. Else do nothing.
+  virtual void SingleDelete(ColumnFamilyHandle* column_family,
+                            const Slice& key) = 0;
+  virtual void SingleDelete(const Slice& key) = 0;
+
+  // variant that takes SliceParts
+  virtual void SingleDelete(ColumnFamilyHandle* column_family,
+                            const SliceParts& key);
+  virtual void SingleDelete(const SliceParts& key);
+
   // Append a blob of arbitrary size to the records in this batch. The blob will
   // be stored in the transaction log but not in any other file. In particular,
   // it will not be persisted to the SST files. When iterating over this
@@ -67,6 +84,16 @@ class WriteBatchBase {
   // converting any WriteBatchBase(eg WriteBatchWithIndex) into a basic
   // WriteBatch.
   virtual WriteBatch* GetWriteBatch() = 0;
+
+  // Records the state of the batch for future calls to RollbackToSavePoint().
+  // May be called multiple times to set multiple save points.
+  virtual void SetSavePoint() = 0;
+
+  // Removes all entries in this batch (Put, Merge, Delete, PutLogData) since
+  // the most recent call to SetSavePoint() and removes the most recent save
+  // point.
+  // If there is no previous call to SetSavePoint(), behaves the same as
+  // Clear().
+  virtual Status RollbackToSavePoint() = 0;
 };
 
 }  // namespace rocksdb
diff --git a/src/rocksdb/include/utilities/backupable_db.h b/src/rocksdb/include/utilities/backupable_db.h
deleted file mode 100644
index 43d5a5c..0000000
--- a/src/rocksdb/include/utilities/backupable_db.h
+++ /dev/null
@@ -1,12 +0,0 @@
-//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
-//  This source code is licensed under the BSD-style license found in the
-//  LICENSE file in the root directory of this source tree. An additional grant
-//  of patent rights can be found in the PATENTS file in the same directory.
-//
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-
-#pragma once
-#warning This file was moved to rocksdb/utilities/backupable_db.h
-#include "rocksdb/utilities/backupable_db.h"
diff --git a/src/rocksdb/include/utilities/db_ttl.h b/src/rocksdb/include/utilities/db_ttl.h
deleted file mode 100644
index c3d5c2b..0000000
--- a/src/rocksdb/include/utilities/db_ttl.h
+++ /dev/null
@@ -1,8 +0,0 @@
-//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
-//  This source code is licensed under the BSD-style license found in the
-//  LICENSE file in the root directory of this source tree. An additional grant
-//  of patent rights can be found in the PATENTS file in the same directory.
-
-#pragma once
-#warning This file was moved to rocksdb/utilities/db_ttl.h
-#include "rocksdb/utilities/db_ttl.h"
diff --git a/src/rocksdb/include/utilities/document_db.h b/src/rocksdb/include/utilities/document_db.h
deleted file mode 100644
index 1d1330b..0000000
--- a/src/rocksdb/include/utilities/document_db.h
+++ /dev/null
@@ -1,8 +0,0 @@
-//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
-//  This source code is licensed under the BSD-style license found in the
-//  LICENSE file in the root directory of this source tree. An additional grant
-//  of patent rights can be found in the PATENTS file in the same directory.
-
-#pragma once
-#warning This file was moved to rocksdb/utilities/document_db.h
-#include "rocksdb/utilities/document_db.h"
diff --git a/src/rocksdb/include/utilities/geo_db.h b/src/rocksdb/include/utilities/geo_db.h
deleted file mode 100644
index 48957d4..0000000
--- a/src/rocksdb/include/utilities/geo_db.h
+++ /dev/null
@@ -1,8 +0,0 @@
-//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
-//  This source code is licensed under the BSD-style license found in the
-//  LICENSE file in the root directory of this source tree. An additional grant
-//  of patent rights can be found in the PATENTS file in the same directory.
-
-#pragma once
-#warning This file was moved to rocksdb/utilities/geo_db.h
-#include "rocksdb/utilities/geo_db.h"
diff --git a/src/rocksdb/include/utilities/json_document.h b/src/rocksdb/include/utilities/json_document.h
deleted file mode 100644
index f3f9396..0000000
--- a/src/rocksdb/include/utilities/json_document.h
+++ /dev/null
@@ -1,7 +0,0 @@
-//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
-//  This source code is licensed under the BSD-style license found in the
-//  LICENSE file in the root directory of this source tree. An additional grant
-//  of patent rights can be found in the PATENTS file in the same directory.
-#pragma once
-#warning This file was moved to rocksdb/utilities/json_document.h
-#include "rocksdb/utilities/json_document.h"
diff --git a/src/rocksdb/include/utilities/stackable_db.h b/src/rocksdb/include/utilities/stackable_db.h
deleted file mode 100644
index 435818d..0000000
--- a/src/rocksdb/include/utilities/stackable_db.h
+++ /dev/null
@@ -1,7 +0,0 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-
-#pragma once
-#warning This file was moved to rocksdb/utilities/stackable_db.h
-#include "rocksdb/utilities/stackable_db.h"
diff --git a/src/rocksdb/include/utilities/utility_db.h b/src/rocksdb/include/utilities/utility_db.h
deleted file mode 100644
index 4a8bbae..0000000
--- a/src/rocksdb/include/utilities/utility_db.h
+++ /dev/null
@@ -1,7 +0,0 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-
-#pragma once
-#warning This file was moved to rocksdb/utilities/utility_db.h
-#include "rocksdb/utilities/utility_db.h"
diff --git a/src/rocksdb/java/HISTORY-JAVA.md b/src/rocksdb/java/HISTORY-JAVA.md
new file mode 100644
index 0000000..731886a
--- /dev/null
+++ b/src/rocksdb/java/HISTORY-JAVA.md
@@ -0,0 +1,86 @@
+# RocksJava Change Log
+
+## 3.13 (8/4/2015)
+### New Features
+* Exposed BackupEngine API.
+* Added CappedPrefixExtractor support.  To use such an extractor, simply call useCappedPrefixExtractor in either Options or ColumnFamilyOptions.
+* Added RemoveEmptyValueCompactionFilter.
+
+## 3.10.0 (3/24/2015)
+### New Features
+* Added compression per level API.
+* MemEnv is now available in RocksJava via RocksMemEnv class.
+* lz4 compression is now included in rocksjava static library when running `make rocksdbjavastatic`.
+
+### Public API Changes
+* Overflowing a size_t when setting rocksdb options now throws an IllegalArgumentException, which removes the necessity for a developer to catch these Exceptions explicitly.
+* The set and get functions for tableCacheRemoveScanCountLimit are deprecated.
+
+
+## By 01/31/2015
+### New Features
+* WriteBatchWithIndex support.
+* Iterator support for WriteBatch and WriteBatchWithIndex.
+* GetUpdatesSince support.
+* Snapshots now carry information about the related sequence number.
+* TTL DB support.
+
+## By 11/14/2014
+### New Features
+* Full support for Column Family.
+* Slice and Comparator support.
+* Default merge operator support.
+* RateLimiter support.
+
+## By 06/15/2014
+### New Features
+* Added basic Java binding for rocksdb::Env such that multiple RocksDB instances can share the same thread pool and environment.
+* Added RestoreBackupableDB
+
+## By 05/30/2014
+### Internal Framework Improvement
+* Added disOwnNativeHandle to RocksObject, which allows a RocksObject to give up the ownership of its native handle.  This method is useful when sharing and transferring the ownership of RocksDB C++ resources.
+
+## By 05/15/2014
+### New Features
+* Added RocksObject --- the base class of all RocksDB classes which holds some RocksDB resources in the C++ side.
+* Use environmental variable JAVA_HOME in Makefile for RocksJava
+### Public API changes
+* Renamed org.rocksdb.Iterator to org.rocksdb.RocksIterator to avoid potential conflicts with the Java built-in Iterator.
+
+## By 04/30/2014
+### New Features
+* Added Java binding for MultiGet.
+* Added static method RocksDB.loadLibrary(), which loads necessary library files.
+* Added Java bindings for 60+ rocksdb::Options.
+* Added Java binding for BloomFilter.
+* Added Java binding for ReadOptions.
+* Added Java binding for memtables.
+* Added Java binding for sst formats.
+* Added Java binding for RocksDB Iterator which enables sequential scan operation.
+* Added Java binding for Statistics
+* Added Java binding for BackupableDB.
+
+### DB Benchmark
+* Added filluniquerandom, readseq benchmark.
+* 70+ command-line options.
+* Enabled BloomFilter configuration.
+
+## By 04/15/2014
+### New Features
+* Added Java binding for WriteOptions.
+* Added Java binding for WriteBatch, which enables batch-write.
+* Added Java binding for rocksdb::Options.
+* Added Java binding for block cache.
+* Added Java version DB Benchmark.
+
+### DB Benchmark
+* Added readwhilewriting benchmark.
+
+### Internal Framework Improvement
+* Avoid a potential byte-array-copy between c++ and Java in RocksDB.get.
+* Added SizeUnit in org.rocksdb.util to store consts like KB and GB.
+
+### 03/28/2014
+* RocksJava project started.
+* Added Java binding for RocksDB, which supports Open, Close, Get and Put.
diff --git a/src/rocksdb/java/Makefile b/src/rocksdb/java/Makefile
new file mode 100644
index 0000000..abc8f73
--- /dev/null
+++ b/src/rocksdb/java/Makefile
@@ -0,0 +1,190 @@
+NATIVE_JAVA_CLASSES = org.rocksdb.AbstractCompactionFilter\
+        org.rocksdb.AbstractComparator\
+	org.rocksdb.AbstractSlice\
+	org.rocksdb.BackupEngine\
+	org.rocksdb.BackupableDB\
+	org.rocksdb.BackupableDBOptions\
+	org.rocksdb.BlockBasedTableConfig\
+	org.rocksdb.BloomFilter\
+	org.rocksdb.Checkpoint\
+	org.rocksdb.ColumnFamilyHandle\
+	org.rocksdb.ColumnFamilyOptions\
+	org.rocksdb.Comparator\
+	org.rocksdb.ComparatorOptions\
+	org.rocksdb.DBOptions\
+	org.rocksdb.DirectComparator\
+	org.rocksdb.DirectSlice\
+	org.rocksdb.Env\
+	org.rocksdb.FlushOptions\
+	org.rocksdb.Filter\
+	org.rocksdb.GenericRateLimiterConfig\
+	org.rocksdb.HashLinkedListMemTableConfig\
+	org.rocksdb.HashSkipListMemTableConfig\
+	org.rocksdb.Logger\
+	org.rocksdb.MergeOperator\
+	org.rocksdb.Options\
+	org.rocksdb.PlainTableConfig\
+	org.rocksdb.ReadOptions\
+	org.rocksdb.RemoveEmptyValueCompactionFilter\
+	org.rocksdb.RestoreBackupableDB\
+	org.rocksdb.RestoreOptions\
+	org.rocksdb.RocksDB\
+	org.rocksdb.RocksEnv\
+	org.rocksdb.RocksIterator\
+	org.rocksdb.RocksMemEnv\
+	org.rocksdb.SkipListMemTableConfig\
+	org.rocksdb.Slice\
+	org.rocksdb.Statistics\
+	org.rocksdb.TransactionLogIterator\
+	org.rocksdb.TtlDB\
+	org.rocksdb.VectorMemTableConfig\
+	org.rocksdb.Snapshot\
+	org.rocksdb.StringAppendOperator\
+	org.rocksdb.WriteBatch\
+	org.rocksdb.WriteBatch.Handler\
+	org.rocksdb.WriteOptions\
+	org.rocksdb.WriteBatchWithIndex\
+	org.rocksdb.WBWIRocksIterator
+
+NATIVE_JAVA_TEST_CLASSES = org.rocksdb.WriteBatchTest\
+    org.rocksdb.WriteBatchTestInternalHelper
+
+ROCKSDB_MAJOR = $(shell egrep "ROCKSDB_MAJOR.[0-9]" ../include/rocksdb/version.h | cut -d ' ' -f 3)
+ROCKSDB_MINOR = $(shell egrep "ROCKSDB_MINOR.[0-9]" ../include/rocksdb/version.h | cut -d ' ' -f 3)
+ROCKSDB_PATCH = $(shell egrep "ROCKSDB_PATCH.[0-9]" ../include/rocksdb/version.h | cut -d ' ' -f 3)
+
+NATIVE_INCLUDE = ./include
+ARCH := $(shell getconf LONG_BIT)
+ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux$(ARCH).jar
+ifeq ($(PLATFORM), OS_MACOSX)
+ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-osx.jar
+endif
+
+JAVA_TESTS = org.rocksdb.BackupableDBOptionsTest\
+        org.rocksdb.BackupEngineTest\
+	org.rocksdb.BackupableDBTest\
+	org.rocksdb.BlockBasedTableConfigTest\
+	org.rocksdb.CheckPointTest\
+	org.rocksdb.ColumnFamilyOptionsTest\
+	org.rocksdb.ColumnFamilyTest\
+	org.rocksdb.ComparatorOptionsTest\
+	org.rocksdb.ComparatorTest\
+	org.rocksdb.CompressionOptionsTest\
+	org.rocksdb.DBOptionsTest\
+	org.rocksdb.DirectComparatorTest\
+	org.rocksdb.DirectSliceTest\
+	org.rocksdb.util.EnvironmentTest\
+	org.rocksdb.FilterTest\
+	org.rocksdb.FlushTest\
+	org.rocksdb.InfoLogLevelTest\
+	org.rocksdb.KeyMayExistTest\
+        org.rocksdb.LoggerTest\
+	org.rocksdb.MemTableTest\
+	org.rocksdb.MergeTest\
+	org.rocksdb.MixedOptionsTest\
+        org.rocksdb.NativeLibraryLoaderTest\
+	org.rocksdb.OptionsTest\
+	org.rocksdb.PlainTableConfigTest\
+	org.rocksdb.ReadOnlyTest\
+	org.rocksdb.ReadOptionsTest\
+	org.rocksdb.RocksDBTest\
+	org.rocksdb.RocksEnvTest\
+	org.rocksdb.RocksIteratorTest\
+	org.rocksdb.RocksMemEnvTest\
+	org.rocksdb.util.SizeUnitTest\
+	org.rocksdb.SliceTest\
+	org.rocksdb.SnapshotTest\
+	org.rocksdb.TransactionLogIteratorTest\
+	org.rocksdb.TtlDBTest\
+	org.rocksdb.StatisticsCollectorTest\
+	org.rocksdb.WriteBatchHandlerTest\
+	org.rocksdb.WriteBatchTest\
+	org.rocksdb.WriteOptionsTest\
+	org.rocksdb.WriteBatchWithIndexTest
+
+MAIN_SRC = src/main/java
+TEST_SRC = src/test/java
+OUTPUT = target
+MAIN_CLASSES = $(OUTPUT)/classes
+TEST_CLASSES = $(OUTPUT)/test-classes
+JAVADOC = $(OUTPUT)/apidocs
+
+BENCHMARK_MAIN_SRC = benchmark/src/main/java
+BENCHMARK_OUTPUT = benchmark/target
+BENCHMARK_MAIN_CLASSES = $(BENCHMARK_OUTPUT)/classes
+
+SAMPLES_MAIN_SRC = samples/src/main/java
+SAMPLES_OUTPUT = samples/target
+SAMPLES_MAIN_CLASSES = $(SAMPLES_OUTPUT)/classes
+
+JAVA_TEST_LIBDIR = test-libs
+JAVA_JUNIT_JAR = $(JAVA_TEST_LIBDIR)/junit-4.12.jar
+JAVA_HAMCR_JAR = $(JAVA_TEST_LIBDIR)/hamcrest-core-1.3.jar
+JAVA_MOCKITO_JAR = $(JAVA_TEST_LIBDIR)/mockito-all-1.10.19.jar
+JAVA_CGLIB_JAR = $(JAVA_TEST_LIBDIR)/cglib-2.2.2.jar
+JAVA_ASSERTJ_JAR = $(JAVA_TEST_LIBDIR)/assertj-core-1.7.1.jar
+JAVA_TESTCLASSPATH = $(JAVA_JUNIT_JAR):$(JAVA_HAMCR_JAR):$(JAVA_MOCKITO_JAR):$(JAVA_CGLIB_JAR):$(JAVA_ASSERTJ_JAR)
+
+MVN_LOCAL = ~/.m2/repository
+
+clean:
+	$(AM_V_at)rm -rf include/*
+	$(AM_V_at)rm -rf test-libs/
+	$(AM_V_at)rm -rf $(OUTPUT)
+	$(AM_V_at)rm -rf $(BENCHMARK_OUTPUT)
+	$(AM_V_at)rm -rf $(SAMPLES_OUTPUT)
+
+
+javadocs:
+	$(AM_V_GEN)mkdir -p $(JAVADOC)
+	$(AM_V_at)javadoc -d $(JAVADOC) -sourcepath $(MAIN_SRC) -subpackages org
+
+javalib: java java_test javadocs
+
+java:
+	$(AM_V_GEN)mkdir -p $(MAIN_CLASSES)
+	$(AM_V_at)javac -d $(MAIN_CLASSES)\
+		$(MAIN_SRC)/org/rocksdb/util/*.java\
+		$(MAIN_SRC)/org/rocksdb/*.java
+	$(AM_V_at)@cp ../HISTORY.md ./HISTORY-CPP.md
+	$(AM_V_at)@rm -f ./HISTORY-CPP.md
+	$(AM_V_at)javah -cp $(MAIN_CLASSES) -d $(NATIVE_INCLUDE) -jni $(NATIVE_JAVA_CLASSES)
+
+sample: java
+	$(AM_V_GEN)mkdir -p $(SAMPLES_MAIN_CLASSES)
+	$(AM_V_at)javac -cp $(MAIN_CLASSES) -d $(SAMPLES_MAIN_CLASSES) $(SAMPLES_MAIN_SRC)/RocksDBSample.java
+	$(AM_V_at)@rm -rf /tmp/rocksdbjni
+	$(AM_V_at)@rm -rf /tmp/rocksdbjni_not_found
+	java -ea -Xcheck:jni -Djava.library.path=target -cp $(MAIN_CLASSES):$(SAMPLES_MAIN_CLASSES) RocksDBSample /tmp/rocksdbjni
+	$(AM_V_at)@rm -rf /tmp/rocksdbjni
+	$(AM_V_at)@rm -rf /tmp/rocksdbjni_not_found
+
+column_family_sample: java
+	$(AM_V_GEN)mkdir -p $(SAMPLES_MAIN_CLASSES)
+	$(AM_V_at)javac -cp $(MAIN_CLASSES) -d $(SAMPLES_MAIN_CLASSES) $(SAMPLES_MAIN_SRC)/RocksDBColumnFamilySample.java
+	$(AM_V_at)@rm -rf /tmp/rocksdbjni
+	java -ea -Xcheck:jni -Djava.library.path=target -cp $(MAIN_CLASSES):$(SAMPLES_MAIN_CLASSES) RocksDBColumnFamilySample /tmp/rocksdbjni
+	$(AM_V_at)@rm -rf /tmp/rocksdbjni
+
+resolve_test_deps:
+	test -s "$(JAVA_TEST_LIBDIR)" || mkdir -p "$(JAVA_TEST_LIBDIR)"
+	test -s "$(JAVA_JUNIT_JAR)" || cp $(MVN_LOCAL)/junit/junit/4.12/junit-4.12.jar $(JAVA_TEST_LIBDIR) || curl -k -L -o $(JAVA_JUNIT_JAR) http://search.maven.org/remotecontent?filepath=junit/junit/4.12/junit-4.12.jar
+	test -s "$(JAVA_HAMCR_JAR)" || cp $(MVN_LOCAL)/org/hamcrest/hamcrest-core/1.3/hamcrest-core-1.3.jar $(JAVA_TEST_LIBDIR) || curl -k -L -o $(JAVA_HAMCR_JAR) http://search.maven.org/remotecontent?filepath=org/hamcrest/hamcrest-core/1.3/hamcrest-core-1.3.jar
+	test -s "$(JAVA_MOCKITO_JAR)" || cp $(MVN_LOCAL)/org/mockito/mockito-all/1.10.19/mockito-all-1.10.19.jar $(JAVA_TEST_LIBDIR) || curl -k -L -o "$(JAVA_MOCKITO_JAR)" http://search.maven.org/remotecontent?filepath=org/mockito/mockito-all/1.10.19/mockito-all-1.10.19.jar
+	test -s "$(JAVA_CGLIB_JAR)" || cp $(MVN_LOCAL)/cglib/cglib/2.2.2/cglib-2.2.2.jar $(JAVA_TEST_LIBDIR) || curl -k -L -o "$(JAVA_CGLIB_JAR)" http://search.maven.org/remotecontent?filepath=cglib/cglib/2.2.2/cglib-2.2.2.jar
+	test -s "$(JAVA_ASSERTJ_JAR)" || cp $(MVN_LOCAL)/org/assertj/assertj-core/1.7.1/assertj-core-1.7.1.jar $(JAVA_TEST_LIBDIR) || curl -k -L -o "$(JAVA_ASSERTJ_JAR)" http://central.maven.org/maven2/org/assertj/assertj-core/1.7.1/assertj-core-1.7.1.jar
+
+java_test: resolve_test_deps
+	$(AM_V_GEN)mkdir -p $(TEST_CLASSES)
+	$(AM_V_at)javac -cp $(MAIN_CLASSES):$(JAVA_TESTCLASSPATH) -d $(TEST_CLASSES)\
+		$(TEST_SRC)/org/rocksdb/test/*.java\
+		$(TEST_SRC)/org/rocksdb/util/*.java\
+		$(TEST_SRC)/org/rocksdb/*.java
+	$(AM_V_at)javah -cp $(MAIN_CLASSES):$(TEST_CLASSES) -d $(NATIVE_INCLUDE) -jni $(NATIVE_JAVA_TEST_CLASSES)
+
+test: java resolve_test_deps java_test
+	java -ea -Xcheck:jni -Djava.library.path=target -cp "$(MAIN_CLASSES):$(TEST_CLASSES):$(JAVA_TESTCLASSPATH):target/*" org.rocksdb.test.RocksJunitRunner $(JAVA_TESTS)
+
+db_bench: java
+	$(AM_V_GEN)mkdir -p $(BENCHMARK_MAIN_CLASSES)
+	$(AM_V_at)javac -cp $(MAIN_CLASSES) -d $(BENCHMARK_MAIN_CLASSES) $(BENCHMARK_MAIN_SRC)/org/rocksdb/benchmark/*.java
diff --git a/src/rocksdb/java/RELEASE.md b/src/rocksdb/java/RELEASE.md
new file mode 100644
index 0000000..084460c
--- /dev/null
+++ b/src/rocksdb/java/RELEASE.md
@@ -0,0 +1,54 @@
+## Cross-building
+
+RocksDB can be built as a single self-contained cross-platform JAR. The cross-platform jar can be used on any 64-bit OSX system, 32-bit Linux system, or 64-bit Linux system.
+
+Building a cross-platform JAR requires:
+
+ * [Vagrant](https://www.vagrantup.com/)
+ * [Virtualbox](https://www.virtualbox.org/)
+ * A Mac OSX machine that can compile RocksDB.
+ * Java 7 set as JAVA_HOME.
+
+Once you have these items, run this make command from RocksDB's root source directory:
+
+    make jclean clean rocksdbjavastaticrelease
+
+This command will build RocksDB natively on OSX, and will then spin up two Vagrant Virtualbox Ubuntu images to build RocksDB for both 32-bit and 64-bit Linux. 
+
+You can find all native binaries and JARs in the java directory upon completion:
+
+    librocksdbjni-linux32.so
+    librocksdbjni-linux64.so
+    librocksdbjni-osx.jnilib
+    rocksdbjni-3.5.0-javadoc.jar
+    rocksdbjni-3.5.0-linux32.jar
+    rocksdbjni-3.5.0-linux64.jar
+    rocksdbjni-3.5.0-osx.jar
+    rocksdbjni-3.5.0-sources.jar
+    rocksdbjni-3.5.0.jar
+
+## Maven publication
+
+Set ~/.m2/settings.xml to contain:
+
+    <settings xmlns="http://maven.apache.org/SETTINGS/1.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/SETTINGS/1.0.0 http://maven.apache.org/xsd/settings-1.0.0.xsd">
+      <servers>
+        <server>
+          <id>sonatype-nexus-staging</id>
+          <username>your-sonatype-jira-username</username>
+          <password>your-sonatype-jira-password</password>
+        </server>
+      </servers>
+    </settings>
+
+From RocksDB's root directory, first build the Java static JARs:
+
+    make jclean clean rocksdbjavastaticpublish
+
+This command will [stage the JAR artifacts on the Sonatype staging repository](http://central.sonatype.org/pages/manual-staging-bundle-creation-and-deployment.html). To release the staged artifacts:
+
+1. Go to [https://oss.sonatype.org/#stagingRepositories](https://oss.sonatype.org/#stagingRepositories) and search for "rocksdb" in the upper right hand search box.
+2. Select the rocksdb staging repository, and inspect its contents.
+3. If all is well, follow [these steps](https://oss.sonatype.org/#stagingRepositories) to close the repository and release it.
+
+After the release has occurred, the artifacts will be synced to Maven central within 24-48 hours.
diff --git a/src/rocksdb/java/benchmark/src/main/java/org/rocksdb/benchmark/DbBenchmark.java b/src/rocksdb/java/benchmark/src/main/java/org/rocksdb/benchmark/DbBenchmark.java
new file mode 100644
index 0000000..14eea09
--- /dev/null
+++ b/src/rocksdb/java/benchmark/src/main/java/org/rocksdb/benchmark/DbBenchmark.java
@@ -0,0 +1,1624 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+/**
+ * Copyright (C) 2011 the original author or authors.
+ * See the notice.md file distributed with this work for additional
+ * information regarding copyright ownership.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.rocksdb.benchmark;
+
+import java.lang.Runnable;
+import java.lang.Math;
+import java.io.File;
+import java.nio.ByteBuffer;
+import java.util.Collection;
+import java.util.Date;
+import java.util.EnumMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+import java.util.concurrent.TimeUnit;
+import java.util.Arrays;
+import java.util.ArrayList;
+import java.util.concurrent.Callable;
+import java.util.concurrent.Executors;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import org.rocksdb.*;
+import org.rocksdb.RocksMemEnv;
+import org.rocksdb.util.SizeUnit;
+
+class Stats {
+  int id_;
+  long start_;
+  long finish_;
+  double seconds_;
+  long done_;
+  long found_;
+  long lastOpTime_;
+  long nextReport_;
+  long bytes_;
+  StringBuilder message_;
+  boolean excludeFromMerge_;
+
+  // TODO(yhchiang): use the following arguments:
+  //   (Long)Flag.stats_interval
+  //   (Integer)Flag.stats_per_interval
+
+  Stats(int id) {
+    id_ = id;
+    nextReport_ = 100;
+    done_ = 0;
+    bytes_ = 0;
+    seconds_ = 0;
+    start_ = System.nanoTime();
+    lastOpTime_ = start_;
+    finish_ = start_;
+    found_ = 0;
+    message_ = new StringBuilder("");
+    excludeFromMerge_ = false;
+  }
+
+  void merge(final Stats other) {
+    if (other.excludeFromMerge_) {
+      return;
+    }
+
+    done_ += other.done_;
+    found_ += other.found_;
+    bytes_ += other.bytes_;
+    seconds_ += other.seconds_;
+    if (other.start_ < start_) start_ = other.start_;
+    if (other.finish_ > finish_) finish_ = other.finish_;
+
+    // Just keep the messages from one thread
+    if (message_.length() == 0) {
+      message_ = other.message_;
+    }
+  }
+
+  void stop() {
+    finish_ = System.nanoTime();
+    seconds_ = (double) (finish_ - start_) * 1e-9;
+  }
+
+  void addMessage(String msg) {
+    if (message_.length() > 0) {
+      message_.append(" ");
+    }
+    message_.append(msg);
+  }
+
+  void setId(int id) { id_ = id; }
+  void setExcludeFromMerge() { excludeFromMerge_ = true; }
+
+  void finishedSingleOp(int bytes) {
+    done_++;
+    lastOpTime_ = System.nanoTime();
+    bytes_ += bytes;
+    if (done_ >= nextReport_) {
+      if (nextReport_ < 1000) {
+        nextReport_ += 100;
+      } else if (nextReport_ < 5000) {
+        nextReport_ += 500;
+      } else if (nextReport_ < 10000) {
+        nextReport_ += 1000;
+      } else if (nextReport_ < 50000) {
+        nextReport_ += 5000;
+      } else if (nextReport_ < 100000) {
+        nextReport_ += 10000;
+      } else if (nextReport_ < 500000) {
+        nextReport_ += 50000;
+      } else {
+        nextReport_ += 100000;
+      }
+      System.err.printf("... Task %s finished %d ops%30s\r", id_, done_, "");
+    }
+  }
+
+  void report(String name) {
+    // Pretend at least one op was done in case we are running a benchmark
+    // that does not call finishedSingleOp().
+    if (done_ < 1) done_ = 1;
+
+    StringBuilder extra = new StringBuilder("");
+    if (bytes_ > 0) {
+      // Rate is computed on actual elapsed time, not the sum of per-thread
+      // elapsed times.
+      double elapsed = (finish_ - start_) * 1e-9;
+      extra.append(String.format("%6.1f MB/s", (bytes_ / 1048576.0) / elapsed));
+    }
+    extra.append(message_.toString());
+    double elapsed = (finish_ - start_);
+    double throughput = (double) done_ / (elapsed * 1e-9);
+
+    System.out.format("%-12s : %11.3f micros/op %d ops/sec;%s%s\n",
+            name, (elapsed * 1e-6) / done_,
+            (long) throughput, (extra.length() == 0 ? "" : " "), extra.toString());
+  }
+}
+
+public class DbBenchmark {
+  enum Order {
+    SEQUENTIAL,
+    RANDOM
+  }
+
+  enum DBState {
+    FRESH,
+    EXISTING
+  }
+
+  static {
+    RocksDB.loadLibrary();
+  }
+
+  abstract class BenchmarkTask implements Callable<Stats> {
+    // TODO(yhchiang): use (Integer)Flag.perf_level.
+    public BenchmarkTask(
+        int tid, long randSeed, long numEntries, long keyRange) {
+      tid_ = tid;
+      rand_ = new Random(randSeed + tid * 1000);
+      numEntries_ = numEntries;
+      keyRange_ = keyRange;
+      stats_ = new Stats(tid);
+    }
+
+    @Override public Stats call() throws RocksDBException {
+      stats_.start_ = System.nanoTime();
+      runTask();
+      stats_.finish_ = System.nanoTime();
+      return stats_;
+    }
+
+    abstract protected void runTask() throws RocksDBException;
+
+    protected int tid_;
+    protected Random rand_;
+    protected long numEntries_;
+    protected long keyRange_;
+    protected Stats stats_;
+
+    protected void getFixedKey(byte[] key, long sn) {
+      generateKeyFromLong(key, sn);
+    }
+
+    protected void getRandomKey(byte[] key, long range) {
+      generateKeyFromLong(key, Math.abs(rand_.nextLong() % range));
+    }
+  }
+
+  abstract class WriteTask extends BenchmarkTask {
+    public WriteTask(
+        int tid, long randSeed, long numEntries, long keyRange,
+        WriteOptions writeOpt, long entriesPerBatch) {
+      super(tid, randSeed, numEntries, keyRange);
+      writeOpt_ = writeOpt;
+      entriesPerBatch_ = entriesPerBatch;
+      maxWritesPerSecond_ = -1;
+    }
+
+    public WriteTask(
+        int tid, long randSeed, long numEntries, long keyRange,
+        WriteOptions writeOpt, long entriesPerBatch, long maxWritesPerSecond) {
+      super(tid, randSeed, numEntries, keyRange);
+      writeOpt_ = writeOpt;
+      entriesPerBatch_ = entriesPerBatch;
+      maxWritesPerSecond_ = maxWritesPerSecond;
+    }
+
+    @Override public void runTask() throws RocksDBException {
+      if (numEntries_ != DbBenchmark.this.num_) {
+        stats_.message_.append(String.format(" (%d ops)", numEntries_));
+      }
+      byte[] key = new byte[keySize_];
+      byte[] value = new byte[valueSize_];
+
+      try {
+        if (entriesPerBatch_ == 1) {
+          for (long i = 0; i < numEntries_; ++i) {
+            getKey(key, i, keyRange_);
+            DbBenchmark.this.gen_.generate(value);
+            db_.put(writeOpt_, key, value);
+            stats_.finishedSingleOp(keySize_ + valueSize_);
+            writeRateControl(i);
+            if (isFinished()) {
+              return;
+            }
+          }
+        } else {
+          for (long i = 0; i < numEntries_; i += entriesPerBatch_) {
+            WriteBatch batch = new WriteBatch();
+            for (long j = 0; j < entriesPerBatch_; j++) {
+              getKey(key, i + j, keyRange_);
+              DbBenchmark.this.gen_.generate(value);
+              batch.put(key, value);
+              stats_.finishedSingleOp(keySize_ + valueSize_);
+            }
+            db_.write(writeOpt_, batch);
+            batch.dispose();
+            writeRateControl(i);
+            if (isFinished()) {
+              return;
+            }
+          }
+        }
+      } catch (InterruptedException e) {
+        // thread has been terminated.
+      }
+    }
+
+    protected void writeRateControl(long writeCount)
+        throws InterruptedException {
+      if (maxWritesPerSecond_ <= 0) return;
+      long minInterval =
+          writeCount * TimeUnit.SECONDS.toNanos(1) / maxWritesPerSecond_;
+      long interval = System.nanoTime() - stats_.start_;
+      if (minInterval - interval > TimeUnit.MILLISECONDS.toNanos(1)) {
+        TimeUnit.NANOSECONDS.sleep(minInterval - interval);
+      }
+    }
+
+    abstract protected void getKey(byte[] key, long id, long range);
+    protected WriteOptions writeOpt_;
+    protected long entriesPerBatch_;
+    protected long maxWritesPerSecond_;
+  }
+
+  class WriteSequentialTask extends WriteTask {
+    public WriteSequentialTask(
+        int tid, long randSeed, long numEntries, long keyRange,
+        WriteOptions writeOpt, long entriesPerBatch) {
+      super(tid, randSeed, numEntries, keyRange,
+            writeOpt, entriesPerBatch);
+    }
+    public WriteSequentialTask(
+        int tid, long randSeed, long numEntries, long keyRange,
+        WriteOptions writeOpt, long entriesPerBatch,
+        long maxWritesPerSecond) {
+      super(tid, randSeed, numEntries, keyRange,
+            writeOpt, entriesPerBatch,
+            maxWritesPerSecond);
+    }
+    @Override protected void getKey(byte[] key, long id, long range) {
+      getFixedKey(key, id);
+    }
+  }
+
+  class WriteRandomTask extends WriteTask {
+    public WriteRandomTask(
+        int tid, long randSeed, long numEntries, long keyRange,
+        WriteOptions writeOpt, long entriesPerBatch) {
+      super(tid, randSeed, numEntries, keyRange,
+            writeOpt, entriesPerBatch);
+    }
+    public WriteRandomTask(
+        int tid, long randSeed, long numEntries, long keyRange,
+        WriteOptions writeOpt, long entriesPerBatch,
+        long maxWritesPerSecond) {
+      super(tid, randSeed, numEntries, keyRange,
+            writeOpt, entriesPerBatch,
+            maxWritesPerSecond);
+    }
+    @Override protected void getKey(byte[] key, long id, long range) {
+      getRandomKey(key, range);
+    }
+  }
+
+  class WriteUniqueRandomTask extends WriteTask {
+    static final int MAX_BUFFER_SIZE = 10000000;
+    public WriteUniqueRandomTask(
+        int tid, long randSeed, long numEntries, long keyRange,
+        WriteOptions writeOpt, long entriesPerBatch) {
+      super(tid, randSeed, numEntries, keyRange,
+            writeOpt, entriesPerBatch);
+      initRandomKeySequence();
+    }
+    public WriteUniqueRandomTask(
+        int tid, long randSeed, long numEntries, long keyRange,
+        WriteOptions writeOpt, long entriesPerBatch,
+        long maxWritesPerSecond) {
+      super(tid, randSeed, numEntries, keyRange,
+            writeOpt, entriesPerBatch,
+            maxWritesPerSecond);
+      initRandomKeySequence();
+    }
+    @Override protected void getKey(byte[] key, long id, long range) {
+      generateKeyFromLong(key, nextUniqueRandom());
+    }
+
+    protected void initRandomKeySequence() {
+      bufferSize_ = MAX_BUFFER_SIZE;
+      if (bufferSize_ > keyRange_) {
+        bufferSize_ = (int) keyRange_;
+      }
+      currentKeyCount_ = bufferSize_;
+      keyBuffer_ = new long[MAX_BUFFER_SIZE];
+      for (int k = 0; k < bufferSize_; ++k) {
+        keyBuffer_[k] = k;
+      }
+    }
+
+    /**
+     * Semi-randomly return the next unique key.  It is guaranteed to be
+     * fully random if keyRange_ <= MAX_BUFFER_SIZE.
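+     * <p>
+     * The approach is sampling without replacement: keyBuffer_ holds
+     * candidate keys, a random slot is returned, and that slot is then
+     * refilled with the next not-yet-issued key (or with the last element,
+     * shrinking the buffer), so no key is returned twice.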
+     */
+    long nextUniqueRandom() {
+      if (bufferSize_ == 0) {
+        System.err.println("bufferSize_ == 0.");
+        return 0;
+      }
+      int r = rand_.nextInt(bufferSize_);
+      // randomly pick one from the keyBuffer
+      long randKey = keyBuffer_[r];
+      if (currentKeyCount_ < keyRange_) {
+        // if we have not yet inserted all keys, insert next new key to [r].
+        keyBuffer_[r] = currentKeyCount_++;
+      } else {
+        // move the last element to [r] and decrease the size by 1.
+        keyBuffer_[r] = keyBuffer_[--bufferSize_];
+      }
+      return randKey;
+    }
+
+    int bufferSize_;
+    long currentKeyCount_;
+    long[] keyBuffer_;
+  }
+
+  class ReadRandomTask extends BenchmarkTask {
+    public ReadRandomTask(
+        int tid, long randSeed, long numEntries, long keyRange) {
+      super(tid, randSeed, numEntries, keyRange);
+    }
+    @Override public void runTask() throws RocksDBException {
+      byte[] key = new byte[keySize_];
+      byte[] value = new byte[valueSize_];
+      for (long i = 0; i < numEntries_; i++) {
+        getRandomKey(key, keyRange_);
+        int len = db_.get(key, value);
+        if (len != RocksDB.NOT_FOUND) {
+          stats_.found_++;
+          stats_.finishedSingleOp(keySize_ + valueSize_);
+        } else {
+          stats_.finishedSingleOp(keySize_);
+        }
+        if (isFinished()) {
+          return;
+        }
+      }
+    }
+  }
+
+  class ReadSequentialTask extends BenchmarkTask {
+    public ReadSequentialTask(
+        int tid, long randSeed, long numEntries, long keyRange) {
+      super(tid, randSeed, numEntries, keyRange);
+    }
+    @Override public void runTask() throws RocksDBException {
+      RocksIterator iter = db_.newIterator();
+      long i;
+      for (iter.seekToFirst(), i = 0;
+           iter.isValid() && i < numEntries_;
+           iter.next(), ++i) {
+        stats_.found_++;
+        stats_.finishedSingleOp(iter.key().length + iter.value().length);
+        if (isFinished()) {
+          iter.dispose();
+          return;
+        }
+      }
+      iter.dispose();
+    }
+  }
+
+  public DbBenchmark(Map<Flag, Object> flags) throws Exception {
+    benchmarks_ = (List<String>) flags.get(Flag.benchmarks);
+    num_ = (Integer) flags.get(Flag.num);
+    threadNum_ = (Integer) flags.get(Flag.threads);
+    reads_ = (Integer) (flags.get(Flag.reads) == null ?
+        flags.get(Flag.num) : flags.get(Flag.reads));
+    keySize_ = (Integer) flags.get(Flag.key_size);
+    valueSize_ = (Integer) flags.get(Flag.value_size);
+    compressionRatio_ = (Double) flags.get(Flag.compression_ratio);
+    useExisting_ = (Boolean) flags.get(Flag.use_existing_db);
+    randSeed_ = (Long) flags.get(Flag.seed);
+    databaseDir_ = (String) flags.get(Flag.db);
+    writesPerSeconds_ = (Integer) flags.get(Flag.writes_per_second);
+    memtable_ = (String) flags.get(Flag.memtablerep);
+    maxWriteBufferNumber_ = (Integer) flags.get(Flag.max_write_buffer_number);
+    prefixSize_ = (Integer) flags.get(Flag.prefix_size);
+    keysPerPrefix_ = (Integer) flags.get(Flag.keys_per_prefix);
+    hashBucketCount_ = (Long) flags.get(Flag.hash_bucket_count);
+    usePlainTable_ = (Boolean) flags.get(Flag.use_plain_table);
+    useMemenv_ = (Boolean) flags.get(Flag.use_mem_env);
+    flags_ = flags;
+    finishLock_ = new Object();
+    // options.setPrefixSize((Integer)flags_.get(Flag.prefix_size));
+    // options.setKeysPerPrefix((Long)flags_.get(Flag.keys_per_prefix));
+    compressionType_ = (String) flags.get(Flag.compression_type);
+    compression_ = CompressionType.NO_COMPRESSION;
+    try {
+      if (compressionType_ != null) {
+        final CompressionType compressionType =
+            CompressionType.getCompressionType(compressionType_);
+        if (compressionType != null &&
+            compressionType != CompressionType.NO_COMPRESSION) {
+          System.loadLibrary(compressionType.getLibraryName());
+        }
+      }
+    } catch (UnsatisfiedLinkError e) {
+      System.err.format("Unable to load %s library: %s%n" +
+                        "No compression will be used.%n",
+          compressionType_, e.toString());
+      compressionType_ = "none";
+    }
+    gen_ = new RandomGenerator(randSeed_, compressionRatio_);
+  }
+
+  private void prepareReadOptions(ReadOptions options) {
+    options.setVerifyChecksums((Boolean)flags_.get(Flag.verify_checksum));
+    options.setTailing((Boolean)flags_.get(Flag.use_tailing_iterator));
+  }
+
+  private void prepareWriteOptions(WriteOptions options) {
+    options.setSync((Boolean)flags_.get(Flag.sync));
+    options.setDisableWAL((Boolean)flags_.get(Flag.disable_wal));
+  }
+
+  private void prepareOptions(Options options) throws RocksDBException {
+    options.setCreateIfMissing(!useExisting_);
+    if (useMemenv_) {
+      options.setEnv(new RocksMemEnv());
+    }
+    switch (memtable_) {
+      case "skip_list":
+        options.setMemTableConfig(new SkipListMemTableConfig());
+        break;
+      case "vector":
+        options.setMemTableConfig(new VectorMemTableConfig());
+        break;
+      case "hash_linkedlist":
+        options.setMemTableConfig(
+            new HashLinkedListMemTableConfig()
+                .setBucketCount(hashBucketCount_));
+        options.useFixedLengthPrefixExtractor(prefixSize_);
+        break;
+      case "hash_skiplist":
+      case "prefix_hash":
+        options.setMemTableConfig(
+            new HashSkipListMemTableConfig()
+                .setBucketCount(hashBucketCount_));
+        options.useFixedLengthPrefixExtractor(prefixSize_);
+        break;
+      default:
+        System.err.format(
+            "unable to detect the specified memtable, " +
+                "use the default memtable factory %s%n",
+            options.memTableFactoryName());
+        break;
+    }
+    if (usePlainTable_) {
+      options.setTableFormatConfig(
+          new PlainTableConfig().setKeySize(keySize_));
+    } else {
+      BlockBasedTableConfig table_options = new BlockBasedTableConfig();
+      table_options.setBlockSize((Long)flags_.get(Flag.block_size))
+                   .setBlockCacheSize((Long)flags_.get(Flag.cache_size))
+                   .setCacheNumShardBits(
+                      (Integer)flags_.get(Flag.cache_numshardbits));
+      options.setTableFormatConfig(table_options);
+    }
+    options.setWriteBufferSize(
+        (Long)flags_.get(Flag.write_buffer_size));
+    options.setMaxWriteBufferNumber(
+        (Integer)flags_.get(Flag.max_write_buffer_number));
+    options.setMaxBackgroundCompactions(
+        (Integer)flags_.get(Flag.max_background_compactions));
+    options.getEnv().setBackgroundThreads(
+        (Integer)flags_.get(Flag.max_background_compactions));
+    options.setMaxBackgroundFlushes(
+        (Integer)flags_.get(Flag.max_background_flushes));
+    options.setMaxOpenFiles(
+        (Integer)flags_.get(Flag.open_files));
+    options.setDisableDataSync(
+        (Boolean)flags_.get(Flag.disable_data_sync));
+    options.setUseFsync(
+        (Boolean)flags_.get(Flag.use_fsync));
+    options.setWalDir(
+        (String)flags_.get(Flag.wal_dir));
+    options.setDeleteObsoleteFilesPeriodMicros(
+        (Integer)flags_.get(Flag.delete_obsolete_files_period_micros));
+    options.setTableCacheNumshardbits(
+        (Integer)flags_.get(Flag.table_cache_numshardbits));
+    options.setAllowMmapReads(
+        (Boolean)flags_.get(Flag.mmap_read));
+    options.setAllowMmapWrites(
+        (Boolean)flags_.get(Flag.mmap_write));
+    options.setAdviseRandomOnOpen(
+        (Boolean)flags_.get(Flag.advise_random_on_open));
+    options.setUseAdaptiveMutex(
+        (Boolean)flags_.get(Flag.use_adaptive_mutex));
+    options.setBytesPerSync(
+        (Long)flags_.get(Flag.bytes_per_sync));
+    options.setBloomLocality(
+        (Integer)flags_.get(Flag.bloom_locality));
+    options.setMinWriteBufferNumberToMerge(
+        (Integer)flags_.get(Flag.min_write_buffer_number_to_merge));
+    options.setMemtablePrefixBloomBits(
+        (Integer)flags_.get(Flag.memtable_bloom_bits));
+    options.setNumLevels(
+        (Integer)flags_.get(Flag.num_levels));
+    options.setTargetFileSizeBase(
+        (Integer)flags_.get(Flag.target_file_size_base));
+    options.setTargetFileSizeMultiplier(
+        (Integer)flags_.get(Flag.target_file_size_multiplier));
+    options.setMaxBytesForLevelBase(
+        (Integer)flags_.get(Flag.max_bytes_for_level_base));
+    options.setMaxBytesForLevelMultiplier(
+        (Integer)flags_.get(Flag.max_bytes_for_level_multiplier));
+    options.setLevelZeroStopWritesTrigger(
+        (Integer)flags_.get(Flag.level0_stop_writes_trigger));
+    options.setLevelZeroSlowdownWritesTrigger(
+        (Integer)flags_.get(Flag.level0_slowdown_writes_trigger));
+    options.setLevelZeroFileNumCompactionTrigger(
+        (Integer)flags_.get(Flag.level0_file_num_compaction_trigger));
+    options.setSoftRateLimit(
+        (Double)flags_.get(Flag.soft_rate_limit));
+    options.setHardRateLimit(
+        (Double)flags_.get(Flag.hard_rate_limit));
+    options.setRateLimitDelayMaxMilliseconds(
+        (Integer)flags_.get(Flag.rate_limit_delay_max_milliseconds));
+    options.setMaxGrandparentOverlapFactor(
+        (Integer)flags_.get(Flag.max_grandparent_overlap_factor));
+    options.setDisableAutoCompactions(
+        (Boolean)flags_.get(Flag.disable_auto_compactions));
+    options.setSourceCompactionFactor(
+        (Integer)flags_.get(Flag.source_compaction_factor));
+    options.setFilterDeletes(
+        (Boolean)flags_.get(Flag.filter_deletes));
+    options.setMaxSuccessiveMerges(
+        (Integer)flags_.get(Flag.max_successive_merges));
+    options.setWalTtlSeconds((Long)flags_.get(Flag.wal_ttl_seconds));
+    options.setWalSizeLimitMB((Long)flags_.get(Flag.wal_size_limit_MB));
+    /* TODO(yhchiang): enable the following parameters
+    options.setCompressionType((String)flags_.get(Flag.compression_type));
+    options.setCompressionLevel((Integer)flags_.get(Flag.compression_level));
+    options.setMinLevelToCompress((Integer)flags_.get(Flag.min_level_to_compress));
+    options.setHdfs((String)flags_.get(Flag.hdfs)); // env
+    options.setStatistics((Boolean)flags_.get(Flag.statistics));
+    options.setUniversalSizeRatio(
+        (Integer)flags_.get(Flag.universal_size_ratio));
+    options.setUniversalMinMergeWidth(
+        (Integer)flags_.get(Flag.universal_min_merge_width));
+    options.setUniversalMaxMergeWidth(
+        (Integer)flags_.get(Flag.universal_max_merge_width));
+    options.setUniversalMaxSizeAmplificationPercent(
+        (Integer)flags_.get(Flag.universal_max_size_amplification_percent));
+    options.setUniversalCompressionSizePercent(
+        (Integer)flags_.get(Flag.universal_compression_size_percent));
+    // TODO(yhchiang): add RocksDB.openForReadOnly() to enable Flag.readonly
+    // TODO(yhchiang): enable Flag.merge_operator by switch
+    options.setAccessHintOnCompactionStart(
+        (String)flags_.get(Flag.compaction_fadvice));
+    // available values of fadvice are "NONE", "NORMAL", "SEQUENTIAL", "WILLNEED" for fadvice
+    */
+  }
+
+  private void run() throws RocksDBException {
+    if (!useExisting_) {
+      destroyDb();
+    }
+    Options options = new Options();
+    prepareOptions(options);
+    open(options);
+
+    printHeader(options);
+
+    for (String benchmark : benchmarks_) {
+      List<Callable<Stats>> tasks = new ArrayList<Callable<Stats>>();
+      List<Callable<Stats>> bgTasks = new ArrayList<Callable<Stats>>();
+      WriteOptions writeOpt = new WriteOptions();
+      prepareWriteOptions(writeOpt);
+      ReadOptions readOpt = new ReadOptions();
+      prepareReadOptions(readOpt);
+      int currentTaskId = 0;
+      boolean known = true;
+
+      switch (benchmark) {
+        case "fillseq":
+          tasks.add(new WriteSequentialTask(
+              currentTaskId++, randSeed_, num_, num_, writeOpt, 1));
+          break;
+        case "fillbatch":
+          tasks.add(new WriteRandomTask(
+              currentTaskId++, randSeed_, num_ / 1000, num_, writeOpt, 1000));
+          break;
+        case "fillrandom":
+          tasks.add(new WriteRandomTask(
+              currentTaskId++, randSeed_, num_, num_, writeOpt, 1));
+          break;
+        case "filluniquerandom":
+          tasks.add(new WriteUniqueRandomTask(
+              currentTaskId++, randSeed_, num_, num_, writeOpt, 1));
+          break;
+        case "fillsync":
+          writeOpt.setSync(true);
+          tasks.add(new WriteRandomTask(
+              currentTaskId++, randSeed_, num_ / 1000, num_ / 1000,
+              writeOpt, 1));
+          break;
+        case "readseq":
+          for (int t = 0; t < threadNum_; ++t) {
+            tasks.add(new ReadSequentialTask(
+                currentTaskId++, randSeed_, reads_ / threadNum_, num_));
+          }
+          break;
+        case "readrandom":
+          for (int t = 0; t < threadNum_; ++t) {
+            tasks.add(new ReadRandomTask(
+                currentTaskId++, randSeed_, reads_ / threadNum_, num_));
+          }
+          break;
+        case "readwhilewriting":
+          WriteTask writeTask = new WriteRandomTask(
+              -1, randSeed_, Long.MAX_VALUE, num_, writeOpt, 1, writesPerSeconds_);
+          writeTask.stats_.setExcludeFromMerge();
+          bgTasks.add(writeTask);
+          for (int t = 0; t < threadNum_; ++t) {
+            tasks.add(new ReadRandomTask(
+                currentTaskId++, randSeed_, reads_ / threadNum_, num_));
+          }
+          break;
+        case "readhot":
+          for (int t = 0; t < threadNum_; ++t) {
+            tasks.add(new ReadRandomTask(
+                currentTaskId++, randSeed_, reads_ / threadNum_, num_ / 100));
+          }
+          break;
+        case "delete":
+          destroyDb();
+          open(options);
+          break;
+        default:
+          known = false;
+          System.err.println("Unknown benchmark: " + benchmark);
+          break;
+      }
+      if (known) {
+        ExecutorService executor = Executors.newCachedThreadPool();
+        ExecutorService bgExecutor = Executors.newCachedThreadPool();
+        try {
+          // measure only the main executor time
+          List<Future<Stats>> bgResults = new ArrayList<Future<Stats>>();
+          for (Callable<Stats> bgTask : bgTasks) {
+            bgResults.add(bgExecutor.submit(bgTask));
+          }
+          start();
+          List<Future<Stats>> results = executor.invokeAll(tasks);
+          executor.shutdown();
+          boolean finished = executor.awaitTermination(10, TimeUnit.SECONDS);
+          if (!finished) {
+            System.out.format(
+                "Benchmark %s was not finished before timeout.",
+                benchmark);
+            executor.shutdownNow();
+          }
+          setFinished(true);
+          bgExecutor.shutdown();
+          finished = bgExecutor.awaitTermination(10, TimeUnit.SECONDS);
+          if (!finished) {
+            System.out.format(
+                "Benchmark %s was not finished before timeout.",
+                benchmark);
+            bgExecutor.shutdownNow();
+          }
+
+          stop(benchmark, results, currentTaskId);
+        } catch (InterruptedException e) {
+          System.err.println(e);
+        }
+      }
+      writeOpt.dispose();
+      readOpt.dispose();
+    }
+    options.dispose();
+    db_.close();
+  }
+
+  private void printHeader(Options options) {
+    System.out.printf("Keys:     %d bytes each\n", keySize_);
+    System.out.printf("Values:   %d bytes each (%d bytes after compression)\n",
+        valueSize_,
+        (int) (valueSize_ * compressionRatio_ + 0.5));
+    System.out.printf("Entries:  %d\n", num_);
+    System.out.printf("RawSize:  %.1f MB (estimated)\n",
+        ((double)(keySize_ + valueSize_) * num_) / SizeUnit.MB);
+    System.out.printf("FileSize: %.1f MB (estimated)\n",
+        (((keySize_ + valueSize_ * compressionRatio_) * num_) / SizeUnit.MB));
+    System.out.format("Memtable Factory: %s%n", options.memTableFactoryName());
+    System.out.format("Prefix:   %d bytes%n", prefixSize_);
+    System.out.format("Compression: %s%n", compressionType_);
+    printWarnings();
+    System.out.printf("------------------------------------------------\n");
+  }
+
+  void printWarnings() {
+    boolean assertsEnabled = false;
+    assert assertsEnabled = true; // Intentional side effect!!!
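+    // The assignment executes only when assertions are enabled (-ea), so
+    // assertsEnabled is true exactly in that case.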
+    if (assertsEnabled) {
+      System.out.printf(
+          "WARNING: Assertions are enabled; benchmarks unnecessarily slow\n");
+    }
+  }
+
+  private void open(Options options) throws RocksDBException {
+    db_ = RocksDB.open(options, databaseDir_);
+  }
+
+  private void start() {
+    setFinished(false);
+    startTime_ = System.nanoTime();
+  }
+
+  private void stop(
+      String benchmark, List<Future<Stats>> results, int concurrentThreads) {
+    long endTime = System.nanoTime();
+    double elapsedSeconds =
+        1.0d * (endTime - startTime_) / TimeUnit.SECONDS.toNanos(1);
+
+    Stats stats = new Stats(-1);
+    int taskFinishedCount = 0;
+    for (Future<Stats> result : results) {
+      if (result.isDone()) {
+        try {
+          Stats taskStats = result.get(3, TimeUnit.SECONDS);
+          if (!result.isCancelled()) {
+            taskFinishedCount++;
+          }
+          stats.merge(taskStats);
+        } catch (Exception e) {
+          // then it's not successful, the output will indicate this
+        }
+      }
+    }
+    String extra = "";
+    if (benchmark.indexOf("read") >= 0) {
+      extra = String.format(" %d / %d found; ", stats.found_, stats.done_);
+    } else {
+      extra = String.format(" %d ops done; ", stats.done_);
+    }
+
+    System.out.printf(
+        "%-16s : %11.5f micros/op; %6.1f MB/s;%s %d / %d task(s) finished.\n",
+        benchmark, elapsedSeconds / stats.done_ * 1e6,
+        (stats.bytes_ / 1048576.0) / elapsedSeconds, extra,
+        taskFinishedCount, concurrentThreads);
+  }
+
+  public void generateKeyFromLong(byte[] slice, long n) {
+    assert(n >= 0);
+    int startPos = 0;
+
+    if (keysPerPrefix_ > 0) {
+      long numPrefix = (num_ + keysPerPrefix_ - 1) / keysPerPrefix_;
+      long prefix = n % numPrefix;
+      int bytesToFill = Math.min(prefixSize_, 8);
+      for (int i = 0; i < bytesToFill; ++i) {
+        slice[i] = (byte) (prefix % 256);
+        prefix /= 256;
+      }
+      // Pad the prefix out to prefixSize_ when it is longer than 8 bytes.
+      for (int i = bytesToFill; i < prefixSize_; ++i) {
+        slice[i] = '0';
+      }
+      startPos = prefixSize_;
+    }
+
+    for (int i = slice.length - 1; i >= startPos; --i) {
+      slice[i] = (byte) ('0' + (n % 10));
+      n /= 10;
+    }
+  }
+
+  private void destroyDb() {
+    if (db_ != null) {
+      db_.close();
+    }
+    // TODO(yhchiang): develop our own FileUtil
+    // FileUtil.deleteDir(databaseDir_);
+  }
+
+  private void printStats() {
+  }
+
+  static void printHelp() {
+    System.out.println("usage:");
+    for (Flag flag : Flag.values()) {
+      System.out.format("  --%s%n\t%s%n",
+          flag.name(),
+          flag.desc());
+      if (flag.getDefaultValue() != null) {
+        System.out.format("\tDEFAULT: %s%n",
+            flag.getDefaultValue().toString());
+      }
+    }
+  }
+
+  public static void main(String[] args) throws Exception {
+    Map<Flag, Object> flags = new EnumMap<Flag, Object>(Flag.class);
+    for (Flag flag : Flag.values()) {
+      if (flag.getDefaultValue() != null) {
+        flags.put(flag, flag.getDefaultValue());
+      }
+    }
+    for (String arg : args) {
+      boolean valid = false;
+      if (arg.equals("--help") || arg.equals("-h")) {
+        printHelp();
+        System.exit(0);
+      }
+      if (arg.startsWith("--")) {
+        try {
+          String[] parts = arg.substring(2).split("=");
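+          // Expected form: --name=value. split("=") drops anything after a
+          // second '='; a bare --name stores null as the flag's value.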
+          if (parts.length >= 1) {
+            Flag key = Flag.valueOf(parts[0]);
+            if (key != null) {
+              Object value = null;
+              if (parts.length >= 2) {
+                value = key.parseValue(parts[1]);
+              }
+              flags.put(key, value);
+              valid = true;
+            }
+          }
+        }
+        catch (Exception e) {
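+          // Ignore parse errors; the argument is reported as invalid below.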
+        }
+      }
+      if (!valid) {
+        System.err.println("Invalid argument " + arg);
+        System.exit(1);
+      }
+    }
+    new DbBenchmark(flags).run();
+  }
+
+  private enum Flag {
+    benchmarks(
+        Arrays.asList(
+            "fillseq",
+            "readrandom",
+            "fillrandom"),
+        "Comma-separated list of operations to run in the specified order\n" +
+        "\tActual benchmarks:\n" +
+        "\t\tfillseq          -- write N values in sequential key order in async mode.\n" +
+        "\t\tfillrandom       -- write N values in random key order in async mode.\n" +
+        "\t\tfillbatch        -- write N/1000 batch where each batch has 1000 values\n" +
+        "\t\t                   in random key order in sync mode.\n" +
+        "\t\tfillsync         -- write N/100 values in random key order in sync mode.\n" +
+        "\t\tfill100K         -- write N/1000 100K values in random order in async mode.\n" +
+        "\t\treadseq          -- read N times sequentially.\n" +
+        "\t\treadrandom       -- read N times in random order.\n" +
+        "\t\treadhot          -- read N times in random order from 1% section of DB.\n" +
+        "\t\treadwhilewriting -- measure the read performance of multiple readers\n" +
+        "\t\t                   with a bg single writer.  The write rate of the bg\n" +
+        "\t\t                   is capped by --writes_per_second.\n" +
+        "\tMeta Operations:\n" +
+        "\t\tdelete            -- delete DB") {
+      @Override public Object parseValue(String value) {
+        return new ArrayList<String>(Arrays.asList(value.split(",")));
+      }
+    },
+    compression_ratio(0.5d,
+        "Arrange to generate values that shrink to this fraction of\n" +
+        "\ttheir original size after compression.") {
+      @Override public Object parseValue(String value) {
+        return Double.parseDouble(value);
+      }
+    },
+    use_existing_db(false,
+        "If true, do not destroy the existing database.  If you set this\n" +
+        "\tflag and also specify a benchmark that wants a fresh database,\n" +
+        "\tthat benchmark will fail.") {
+      @Override public Object parseValue(String value) {
+        return parseBoolean(value);
+      }
+    },
+    num(1000000,
+        "Number of key/values to place in database.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    threads(1,
+        "Number of concurrent threads to run.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    reads(null,
+        "Number of read operations to do.  If negative, do --nums reads.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    key_size(16,
+        "The size of each key in bytes.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    value_size(100,
+        "The size of each value in bytes.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    write_buffer_size(4 * SizeUnit.MB,
+        "Number of bytes to buffer in memtable before compacting\n" +
+        "\t(initialized to default value by 'main'.)") {
+      @Override public Object parseValue(String value) {
+        return Long.parseLong(value);
+      }
+    },
+    max_write_buffer_number(2,
+             "The number of in-memory memtables. Each memtable is of size\n" +
+             "\twrite_buffer_size.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    prefix_size(0, "Controls the prefix size for HashSkipList, HashLinkedList,\n" +
+                   "\tand plain table.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    keys_per_prefix(0, "Controls the average number of keys generated\n" +
+             "\tper prefix, 0 means no special handling of the prefix,\n" +
+             "\ti.e. use the prefix comes with the generated random number.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    memtablerep("skip_list",
+        "The memtable format.  Available options are\n" +
+        "\tskip_list,\n" +
+        "\tvector,\n" +
+        "\thash_linkedlist,\n" +
+        "\thash_skiplist (prefix_hash.)") {
+      @Override public Object parseValue(String value) {
+        return value;
+      }
+    },
+    hash_bucket_count(SizeUnit.MB,
+        "The number of hash buckets used in the hash-bucket-based\n" +
+        "\tmemtables.  Memtables that currently support this argument are\n" +
+        "\thash_linkedlist and hash_skiplist.") {
+      @Override public Object parseValue(String value) {
+        return Long.parseLong(value);
+      }
+    },
+    writes_per_second(10000,
+        "The write-rate of the background writer used in the\n" +
+        "\t`readwhilewriting` benchmark.  Non-positive number indicates\n" +
+        "\tusing an unbounded write-rate in `readwhilewriting` benchmark.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    use_plain_table(false,
+        "Use plain-table sst format.") {
+      @Override public Object parseValue(String value) {
+        return parseBoolean(value);
+      }
+    },
+    cache_size(-1L,
+        "Number of bytes to use as a cache of uncompressed data.\n" +
+        "\tNegative means use default settings.") {
+      @Override public Object parseValue(String value) {
+        return Long.parseLong(value);
+      }
+    },
+    seed(0L,
+        "Seed base for random number generators.") {
+      @Override public Object parseValue(String value) {
+        return Long.parseLong(value);
+      }
+    },
+    num_levels(7,
+        "The total number of levels.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    numdistinct(1000,
+        "Number of distinct keys to use. Used in RandomWithVerify to\n" +
+        "\tread/write on fewer keys so that gets are more likely to find the\n" +
+        "\tkey and puts are more likely to update the same key.") {
+      @Override public Object parseValue(String value) {
+        return Long.parseLong(value);
+      }
+    },
+    merge_keys(-1,
+        "Number of distinct keys to use for MergeRandom and\n" +
+        "\tReadRandomMergeRandom.\n" +
+        "\tIf negative, there will be FLAGS_num keys.") {
+      @Override public Object parseValue(String value) {
+        return Long.parseLong(value);
+      }
+    },
+    bloom_locality(0,"Control bloom filter probes locality.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    duration(0,"Time in seconds for the random-ops tests to run.\n" +
+        "\tWhen 0 then num & reads determine the test duration.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    num_multi_db(0,
+        "Number of DBs used in the benchmark. 0 means single DB.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    histogram(false,"Print histogram of operation timings.") {
+      @Override public Object parseValue(String value) {
+        return parseBoolean(value);
+      }
+    },
+    min_write_buffer_number_to_merge(
+        defaultOptions_.minWriteBufferNumberToMerge(),
+        "The minimum number of write buffers that will be merged together\n" +
+        "\tbefore writing to storage. This is cheap because it is an\n" +
+        "\tin-memory merge. If this feature is not enabled, then all these\n" +
+        "\twrite buffers are flushed to L0 as separate files and this\n" +
+        "\tincreases read amplification because a get request has to check\n" +
+        "\tin all of these files. Also, an in-memory merge may result in\n" +
+        "\twriting less data to storage if there are duplicate records\n" +
+        "\tin each of these individual write buffers.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    max_background_compactions(
+        defaultOptions_.maxBackgroundCompactions(),
+        "The maximum number of concurrent background compactions\n" +
+        "\tthat can occur in parallel.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    max_background_flushes(
+        defaultOptions_.maxBackgroundFlushes(),
+        "The maximum number of concurrent background flushes\n" +
+        "\tthat can occur in parallel.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    /* TODO(yhchiang): enable the following
+    compaction_style((int32_t) defaultOptions_.compactionStyle(),
+        "style of compaction: level-based vs universal.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },*/
+    universal_size_ratio(0,
+        "Percentage flexibility while comparing file size\n" +
+        "\t(for universal compaction only).") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    universal_min_merge_width(0,"The minimum number of files in a\n" +
+        "\tsingle compaction run (for universal compaction only).") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    universal_max_merge_width(0,"The max number of files to compact\n" +
+        "\tin universal style compaction.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    universal_max_size_amplification_percent(0,
+        "The max size amplification for universal style compaction.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    universal_compression_size_percent(-1,
+        "The percentage of the database to compress for universal\n" +
+        "\tcompaction. -1 means compress everything.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    block_size(defaultBlockBasedTableOptions_.blockSize(),
+        "Number of bytes in a block.") {
+      @Override public Object parseValue(String value) {
+        return Long.parseLong(value);
+      }
+    },
+    compressed_cache_size(-1,
+        "Number of bytes to use as a cache of compressed data.") {
+      @Override public Object parseValue(String value) {
+        return Long.parseLong(value);
+      }
+    },
+    open_files(defaultOptions_.maxOpenFiles(),
+        "Maximum number of files to keep open at the same time\n" +
+        "\t(use default if == 0)") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    bloom_bits(-1,"Bloom filter bits per key. Negative means\n" +
+        "\tuse default settings.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    memtable_bloom_bits(0,"Bloom filter bits per key for memtable.\n" +
+        "\tNegative means no bloom filter.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    cache_numshardbits(-1,"Number of shards for the block cache\n" +
+        "\tis 2 ** cache_numshardbits. Negative means use default settings.\n" +
+        "\tThis is applied only if FLAGS_cache_size is non-negative.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    verify_checksum(false,"Verify checksum for every block read\n" +
+        "\tfrom storage.") {
+      @Override public Object parseValue(String value) {
+        return parseBoolean(value);
+      }
+    },
+    statistics(false,"Database statistics.") {
+      @Override public Object parseValue(String value) {
+        return parseBoolean(value);
+      }
+    },
+    writes(-1,"Number of write operations to do. If negative, do\n" +
+        "\t--num reads.") {
+      @Override public Object parseValue(String value) {
+        return Long.parseLong(value);
+      }
+    },
+    sync(false,"Sync all writes to disk.") {
+      @Override public Object parseValue(String value) {
+        return parseBoolean(value);
+      }
+    },
+    disable_data_sync(false,"If true, do not wait until data is\n" +
+        "\tsynced to disk.") {
+      @Override public Object parseValue(String value) {
+        return parseBoolean(value);
+      }
+    },
+    use_fsync(false,"If true, issue fsync instead of fdatasync.") {
+      @Override public Object parseValue(String value) {
+        return parseBoolean(value);
+      }
+    },
+    disable_wal(false,"If true, do not write WAL for write.") {
+      @Override public Object parseValue(String value) {
+        return parseBoolean(value);
+      }
+    },
+    wal_dir("", "If not empty, use the given dir for WAL.") {
+      @Override public Object parseValue(String value) {
+        return value;
+      }
+    },
+    target_file_size_base(2 * 1048576,"Target file size at level-1") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    target_file_size_multiplier(1,
+        "A multiplier to compute target level-N file size (N >= 2)") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    max_bytes_for_level_base(10 * 1048576,
+      "Max bytes for level-1") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    max_bytes_for_level_multiplier(10,
+        "A multiplier to compute max bytes for level-N (N >= 2)") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    level0_stop_writes_trigger(12,"Number of files in level-0\n" +
+        "\tthat will trigger put stop.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    level0_slowdown_writes_trigger(8,"Number of files in level-0\n" +
+        "\tthat will slow down writes.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    level0_file_num_compaction_trigger(4,"Number of files in level-0\n" +
+        "\twhen compactions start.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    readwritepercent(90,"Ratio of reads to reads/writes (expressed\n" +
+        "\tas percentage) for the ReadRandomWriteRandom workload. The\n" +
+        "\tdefault value 90 means 90% operations out of all reads and writes\n" +
+        "\toperations are reads. In other words, 9 gets for every 1 put.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    mergereadpercent(70,"Ratio of merges to merges&reads (expressed\n" +
+        "\tas percentage) for the ReadRandomMergeRandom workload. The\n" +
+        "\tdefault value 70 means 70% out of all read and merge operations\n" +
+        "\tare merges. In other words, 7 merges for every 3 gets.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    deletepercent(2,"Percentage of deletes out of reads/writes/\n" +
+        "\tdeletes (used in RandomWithVerify only). RandomWithVerify\n" +
+        "\tcalculates writepercent as (100 - FLAGS_readwritepercent -\n" +
+        "\tdeletepercent), so deletepercent must be smaller than (100 -\n" +
+        "\tFLAGS_readwritepercent)") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    delete_obsolete_files_period_micros(0,"Option to delete\n" +
+        "\tobsolete files periodically. 0 means that obsolete files are\n" +
+        "\tdeleted after every compaction run.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    compression_type("snappy",
+        "Algorithm used to compress the database.") {
+      @Override public Object parseValue(String value) {
+        return value;
+      }
+    },
+    compression_level(-1,
+        "Compression level. For zlib this should be -1 for the\n" +
+        "\tdefault level, or between 0 and 9.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    min_level_to_compress(-1,"If non-negative, compression starts\n" +
+        "\tfrom this level. Levels with number < min_level_to_compress are\n" +
+        "\tnot compressed. Otherwise, apply compression_type to\n" +
+        "\tall levels.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    table_cache_numshardbits(4,"") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    stats_interval(0,"Stats are reported every N operations when\n" +
+        "\tthis is greater than zero. When 0 the interval grows over time.") {
+      @Override public Object parseValue(String value) {
+        return Long.parseLong(value);
+      }
+    },
+    stats_per_interval(0,"Reports additional stats per interval when\n" +
+        "\tthis is greater than 0.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    perf_level(0,"Level of perf collection.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    soft_rate_limit(0.0,"") {
+      @Override public Object parseValue(String value) {
+        return Double.parseDouble(value);
+      }
+    },
+    hard_rate_limit(0.0,"When not equal to 0 this makes threads\n" +
+        "\tsleep at each stats reporting interval until the compaction\n" +
+        "\tscore for all levels is less than or equal to this value.") {
+      @Override public Object parseValue(String value) {
+        return Double.parseDouble(value);
+      }
+    },
+    rate_limit_delay_max_milliseconds(1000,
+        "When hard_rate_limit is set then this is the max time a put will\n" +
+        "\tbe stalled.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    max_grandparent_overlap_factor(10,"Control maximum bytes of\n" +
+        "\toverlaps in grandparent (i.e., level+2) before we stop building a\n" +
+        "\tsingle file in a level->level+1 compaction.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    readonly(false,"Run read only benchmarks.") {
+      @Override public Object parseValue(String value) {
+        return parseBoolean(value);
+      }
+    },
+    disable_auto_compactions(false,"Do not auto trigger compactions.") {
+      @Override public Object parseValue(String value) {
+        return parseBoolean(value);
+      }
+    },
+    source_compaction_factor(1,"Cap the size of data in level-K for\n" +
+        "\ta compaction run that compacts Level-K with Level-(K+1) (for\n" +
+        "\tK >= 1)") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    wal_ttl_seconds(0L,"Set the TTL for the WAL Files in seconds.") {
+      @Override public Object parseValue(String value) {
+        return Long.parseLong(value);
+      }
+    },
+    wal_size_limit_MB(0L,"Set the size limit for the WAL Files\n" +
+        "\tin MB.") {
+      @Override public Object parseValue(String value) {
+        return Long.parseLong(value);
+      }
+    },
+    /* TODO(yhchiang): enable the following
+    bufferedio(rocksdb::EnvOptions().use_os_buffer,
+        "Allow buffered io using OS buffers.") {
+      @Override public Object parseValue(String value) {
+        return parseBoolean(value);
+      }
+    },
+    */
+    mmap_read(false,
+        "Allow reads to occur via mmap-ing files.") {
+      @Override public Object parseValue(String value) {
+        return parseBoolean(value);
+      }
+    },
+    mmap_write(false,
+        "Allow writes to occur via mmap-ing files.") {
+      @Override public Object parseValue(String value) {
+        return parseBoolean(value);
+      }
+    },
+    advise_random_on_open(defaultOptions_.adviseRandomOnOpen(),
+        "Advise random access on table file open.") {
+      @Override public Object parseValue(String value) {
+        return parseBoolean(value);
+      }
+    },
+    compaction_fadvice("NORMAL",
+      "Access pattern advice when a file is compacted.") {
+      @Override public Object parseValue(String value) {
+        return value;
+      }
+    },
+    use_tailing_iterator(false,
+        "Use tailing iterator to access a series of keys instead of get.") {
+      @Override public Object parseValue(String value) {
+        return parseBoolean(value);
+      }
+    },
+    use_adaptive_mutex(defaultOptions_.useAdaptiveMutex(),
+        "Use adaptive mutex.") {
+      @Override public Object parseValue(String value) {
+        return parseBoolean(value);
+      }
+    },
+    bytes_per_sync(defaultOptions_.bytesPerSync(),
+        "Allows OS to incrementally sync files to disk while they are\n" +
+        "\tbeing written, in the background. Issue one request for every\n" +
+        "\tbytes_per_sync written. 0 turns it off.") {
+      @Override public Object parseValue(String value) {
+        return Long.parseLong(value);
+      }
+    },
+    filter_deletes(false,"If true, deletes use the bloom filter and drop\n" +
+        "\tthe delete if the key is not present.") {
+      @Override public Object parseValue(String value) {
+        return parseBoolean(value);
+      }
+    },
+    max_successive_merges(0,"Maximum number of successive merge\n" +
+        "\toperations on a key in the memtable.") {
+      @Override public Object parseValue(String value) {
+        return Integer.parseInt(value);
+      }
+    },
+    db("/tmp/rocksdbjni-bench",
+       "Use the db with the following name.") {
+      @Override public Object parseValue(String value) {
+        return value;
+      }
+    },
+    use_mem_env(false, "Use RocksMemEnv instead of default filesystem based\n" +
+        "environment.") {
+      @Override public Object parseValue(String value) {
+        return parseBoolean(value);
+      }
+    };
+
+    private Flag(Object defaultValue, String desc) {
+      defaultValue_ = defaultValue;
+      desc_ = desc;
+    }
+
+    public Object getDefaultValue() {
+      return defaultValue_;
+    }
+
+    public String desc() {
+      return desc_;
+    }
+
+    public boolean parseBoolean(String value) {
+      if (value.equals("1")) {
+        return true;
+      } else if (value.equals("0")) {
+        return false;
+      }
+      return Boolean.parseBoolean(value);
+    }
+
+    protected abstract Object parseValue(String value);
+
+    private final Object defaultValue_;
+    private final String desc_;
+  }
+
+  private static class RandomGenerator {
+    private final byte[] data_;
+    private int dataLength_;
+    private int position_;
+    private double compressionRatio_;
+    Random rand_;
+
+    private RandomGenerator(long seed, double compressionRatio) {
+      // We use a limited amount of data over and over again and ensure
+      // that it is larger than the compression window (32KB), and also
+      // large enough to serve all typical value sizes we want to write.
+      byte[] value = new byte[100];
+      rand_ = new Random(seed);
+      dataLength_ = value.length * 10000;
+      data_ = new byte[dataLength_];
+      compressionRatio_ = compressionRatio;
+      int pos = 0;
+      while (pos < dataLength_) {
+        compressibleBytes(value);
+        System.arraycopy(value, 0, data_, pos,
+                         Math.min(value.length, dataLength_ - pos));
+        pos += value.length;
+      }
+    }
+
+    private void compressibleBytes(byte[] value) {
+      int baseLength = value.length;
+      if (compressionRatio_ < 1.0d) {
+        baseLength = (int) (compressionRatio_ * value.length + 0.5);
+      }
+      if (baseLength <= 0) {
+        baseLength = 1;
+      }
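+      // Fill a random printable "base" of baseLength bytes, then tile copies
+      // of it to the end of value; the repetition is what makes the data
+      // compress to roughly compressionRatio_.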
+      int pos;
+      for (pos = 0; pos < baseLength; ++pos) {
+        value[pos] = (byte) (' ' + rand_.nextInt(95));  // ' ' .. '~'
+      }
+      while (pos < value.length) {
+        System.arraycopy(value, 0, value, pos,
+                         Math.min(baseLength, value.length - pos));
+        pos += baseLength;
+      }
+    }
+
+    private void generate(byte[] value) {
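+      // Serve value.length bytes out of the pre-generated pool, wrapping back
+      // to the start when the window would run past the end of data_.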
+      if (position_ + value.length > data_.length) {
+        position_ = 0;
+        assert(value.length <= data_.length);
+      }
+      position_ += value.length;
+      System.arraycopy(data_, position_ - value.length,
+                       value, 0, value.length);
+    }
+  }
+
+  boolean isFinished() {
+    synchronized(finishLock_) {
+      return isFinished_;
+    }
+  }
+
+  void setFinished(boolean flag) {
+    synchronized(finishLock_) {
+      isFinished_ = flag;
+    }
+  }
+
+  RocksDB db_;
+  final List<String> benchmarks_;
+  final int num_;
+  final int reads_;
+  final int keySize_;
+  final int valueSize_;
+  final int threadNum_;
+  final int writesPerSeconds_;
+  final long randSeed_;
+  final boolean useExisting_;
+  final String databaseDir_;
+  double compressionRatio_;
+  RandomGenerator gen_;
+  long startTime_;
+
+  // env
+  boolean useMemenv_;
+
+  // memtable related
+  final int maxWriteBufferNumber_;
+  final int prefixSize_;
+  final int keysPerPrefix_;
+  final String memtable_;
+  final long hashBucketCount_;
+
+  // sst format related
+  boolean usePlainTable_;
+
+  Object finishLock_;
+  boolean isFinished_;
+  Map<Flag, Object> flags_;
+  // A static member lives for the lifetime of the process, so we let its
+  // underlying C++ pointer be disposed in its finalizer.
+  static Options defaultOptions_ = new Options();
+  static BlockBasedTableConfig defaultBlockBasedTableOptions_ =
+    new BlockBasedTableConfig();
+  String compressionType_;
+  CompressionType compression_;
+}
diff --git a/src/rocksdb/java/crossbuild/Vagrantfile b/src/rocksdb/java/crossbuild/Vagrantfile
new file mode 100644
index 0000000..8a52b92
--- /dev/null
+++ b/src/rocksdb/java/crossbuild/Vagrantfile
@@ -0,0 +1,26 @@
+# -*- mode: ruby -*-
+# vi: set ft=ruby :
+
+# Vagrantfile API/syntax version. Don't touch unless you know what you're doing!
+VAGRANTFILE_API_VERSION = "2"
+
+Vagrant.configure(VAGRANTFILE_API_VERSION) do |config|
+
+  config.vm.define "linux32" do |linux32|
+    linux32.vm.box = "hansode/centos-5.6-i386"
+  end
+
+  config.vm.define "linux64" do |linux64|
+    linux64.vm.box = "hansode/centos-5.6-x86_64"
+  end
+
+  config.vm.provider "virtualbox" do |v|
+    v.memory = 2048
+    v.cpus = 4
+  end
+
+  config.vm.provision :shell, path: "build-linux-centos.sh"
+  config.vm.synced_folder "../", "/rocksdb-build"
+  config.vm.synced_folder "../..", "/rocksdb", type: "rsync"
+  config.vm.boot_timeout = 1200
+end
diff --git a/src/rocksdb/java/crossbuild/build-linux-centos.sh b/src/rocksdb/java/crossbuild/build-linux-centos.sh
new file mode 100755
index 0000000..f2b7948
--- /dev/null
+++ b/src/rocksdb/java/crossbuild/build-linux-centos.sh
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+# install all required packages for rocksdb that are available through yum
+ARCH=$(uname -i)
+sudo yum -y install openssl java-1.7.0-openjdk-devel.$ARCH zlib zlib-devel bzip2 bzip2-devel
+
+# install gcc/g++ 4.8.2 via CERN (http://linux.web.cern.ch/linux/devtoolset/)
+sudo wget -O /etc/yum.repos.d/slc5-devtoolset.repo http://linuxsoft.cern.ch/cern/devtoolset/slc5-devtoolset.repo
+sudo wget -O /etc/pki/rpm-gpg/RPM-GPG-KEY-cern http://ftp.mirrorservice.org/sites/ftp.scientificlinux.org/linux/scientific/51/i386/RPM-GPG-KEYs/RPM-GPG-KEY-cern
+sudo yum -y install devtoolset-2
+
+wget http://gflags.googlecode.com/files/gflags-1.6.tar.gz
+tar xvfz gflags-1.6.tar.gz; cd gflags-1.6; scl enable devtoolset-2 ./configure; scl enable devtoolset-2 make; sudo make install
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib
+
+# set java home so we can build rocksdb jars
+export JAVA_HOME=/usr/lib/jvm/java-1.7.0
+
+# build rocksdb
+cd /rocksdb
+scl enable devtoolset-2 'make jclean clean'
+scl enable devtoolset-2 'make rocksdbjavastatic'
+cp /rocksdb/java/target/librocksdbjni-* /rocksdb-build
+cp /rocksdb/java/target/rocksdbjni-* /rocksdb-build
+
diff --git a/src/rocksdb/java/crossbuild/build-linux.sh b/src/rocksdb/java/crossbuild/build-linux.sh
new file mode 100755
index 0000000..48d1c28
--- /dev/null
+++ b/src/rocksdb/java/crossbuild/build-linux.sh
@@ -0,0 +1,14 @@
+#!/usr/bin/env bash
+# install all required packages for rocksdb
+sudo apt-get update
+sudo apt-get -y install git make gcc g++ libgflags-dev libsnappy-dev zlib1g-dev libbz2-dev default-jdk
+
+# set java home so we can build rocksdb jars
+export JAVA_HOME=$(echo /usr/lib/jvm/java-7-openjdk*)
+cd /rocksdb
+make jclean clean
+make -j 4 rocksdbjavastatic
+cp /rocksdb/java/target/librocksdbjni-* /rocksdb-build
+cp /rocksdb/java/target/rocksdbjni-* /rocksdb-build
+sudo shutdown -h now
+
diff --git a/src/rocksdb/java/jdb_bench.sh b/src/rocksdb/java/jdb_bench.sh
new file mode 100755
index 0000000..9665de7
--- /dev/null
+++ b/src/rocksdb/java/jdb_bench.sh
@@ -0,0 +1,10 @@
+PLATFORM=64
+if [ `getconf LONG_BIT` != "64" ]
+then
+  PLATFORM=32
+fi
+
+ROCKS_JAR=`find target -name 'rocksdbjni*.jar'`
+
+echo "Running benchmark in $PLATFORM-Bit mode."
+java -server -d$PLATFORM -XX:NewSize=4m -XX:+AggressiveOpts -Djava.library.path=target -cp "${ROCKS_JAR}:benchmark/target/classes" org.rocksdb.benchmark.DbBenchmark "$@"
diff --git a/src/rocksdb/java/rocksjni.pom b/src/rocksdb/java/rocksjni.pom
new file mode 100644
index 0000000..0512df6
--- /dev/null
+++ b/src/rocksdb/java/rocksjni.pom
@@ -0,0 +1,145 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project
+        xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"
+        xmlns="http://maven.apache.org/POM/4.0.0"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+    <modelVersion>4.0.0</modelVersion>
+    <name>RocksDB JNI</name>
+    <url>http://rocksdb.org/</url>
+    <groupId>org.rocksdb</groupId>
+    <artifactId>rocksdbjni</artifactId>
+    <!-- Version will be automatically replaced -->
+    <version>-</version>
+    <description>RocksDB fat jar that contains .so files for linux32 and linux64, and jnilib files
+        for Mac OSX.
+    </description>
+    <licenses>
+        <license>
+            <name>Apache License 2.0</name>
+            <url>http://www.apache.org/licenses/LICENSE-2.0.html</url>
+            <distribution>repo</distribution>
+        </license>
+    </licenses>
+    <scm>
+        <connection>scm:git:git://github.com/dropwizard/metrics.git</connection>
+        <developerConnection>scm:git:git at github.com:dropwizard/metrics.git</developerConnection>
+        <url>http://github.com/dropwizard/metrics/</url>
+        <tag>HEAD</tag>
+    </scm>
+    <developers>
+        <developer>
+            <name>Facebook</name>
+            <email>help at facebook.com</email>
+            <timezone>America/New_York</timezone>
+            <roles>
+                <role>architect</role>
+            </roles>
+        </developer>
+    </developers>
+
+    <properties>
+        <project.build.source>1.7</project.build.source>
+        <project.build.target>1.7</project.build.target>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-compiler-plugin</artifactId>
+                <version>3.2</version>
+                <configuration>
+                    <source>${project.build.source}</source>
+                    <target>${project.build.target}</target>
+                    <encoding>${project.build.sourceEncoding}</encoding>
+                </configuration>
+            </plugin>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-surefire-plugin</artifactId>
+                <version>2.18.1</version>
+                <configuration>
+                    <argLine>${argLine} -ea -Xcheck:jni -Djava.library.path=${project.build.directory}</argLine>
+                    <useManifestOnlyJar>false</useManifestOnlyJar>  
+                    <useSystemClassLoader>false</useSystemClassLoader>
+                    <additionalClasspathElements>
+                        <additionalClasspathElement>${project.build.directory}/*</additionalClasspathElement>
+                    </additionalClasspathElements>
+                </configuration>
+            </plugin>
+            <plugin>
+                <groupId>org.jacoco</groupId>
+                <artifactId>jacoco-maven-plugin</artifactId>
+                <version>0.7.2.201409121644</version>
+                <executions>
+                    <execution>
+                        <goals>
+                            <goal>prepare-agent</goal>
+                        </goals>
+                    </execution>
+                    <execution>
+                        <id>report</id>
+                        <phase>prepare-package</phase>
+                        <goals>
+                            <goal>report</goal>
+                        </goals>
+                    </execution>
+                </executions>
+            </plugin>
+            <plugin>
+                <groupId>org.codehaus.gmaven</groupId>
+                <artifactId>groovy-maven-plugin</artifactId>
+                <version>2.0</version>
+                <executions>
+                    <execution>
+                        <phase>process-classes</phase>
+                        <goals>
+                            <goal>execute</goal>
+                        </goals>
+                        <configuration>
+                            <defaults>
+                                <name>Xenu</name>
+                            </defaults>
+                            <source>
+                                String fileContents = new File("${project.basedir}/../include/rocksdb/version.h").getText('UTF-8')
+                                matcher = (fileContents =~ /(?s).*ROCKSDB_MAJOR ([0-9]+).*?/)
+                                String major_version = matcher.getAt(0).getAt(1)
+                                matcher = (fileContents =~ /(?s).*ROCKSDB_MINOR ([0-9]+).*?/)
+                                String minor_version = matcher.getAt(0).getAt(1)
+                                matcher = (fileContents =~ /(?s).*ROCKSDB_PATCH ([0-9]+).*?/)
+                                String patch_version = matcher.getAt(0).getAt(1)
+                                String version = String.format('%s.%s.%s', major_version, minor_version, patch_version)
+                                // Set version to be used in pom.properties
+                                project.version = version
+                                // Set version to be set as jar name
+                                project.build.finalName = project.artifactId + "-" + version
+                            </source>
+                        </configuration>
+                    </execution>
+                </executions>
+            </plugin>
+        </plugins>
+    </build>
+
+    <dependencies>
+        <dependency>
+            <groupId>junit</groupId>
+            <artifactId>junit</artifactId>
+            <version>4.12</version>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.assertj</groupId>
+            <artifactId>assertj-core</artifactId>
+            <version>1.7.1</version>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.mockito</groupId>
+            <artifactId>mockito-all</artifactId>
+            <version>1.10.19</version>
+            <scope>test</scope>
+        </dependency>
+    </dependencies>
+</project>
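For reference, the groovy-maven-plugin execution above derives the artifact version by regex-matching the ROCKSDB_MAJOR, ROCKSDB_MINOR and ROCKSDB_PATCH defines in include/rocksdb/version.h. A minimal Java sketch of the same extraction (the class name is invented for illustration; the header path comes from the pom):

    import java.nio.file.Files;
    import java.nio.file.Paths;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    public class VersionFromHeader {
        public static void main(String[] args) throws Exception {
            String header = new String(
                Files.readAllBytes(Paths.get("include/rocksdb/version.h")), "UTF-8");
            // Same idea as the Groovy matcher: capture the digits after each macro.
            String version = String.format("%s.%s.%s",
                find(header, "ROCKSDB_MAJOR"),
                find(header, "ROCKSDB_MINOR"),
                find(header, "ROCKSDB_PATCH"));
            System.out.println(version);  // e.g. "4.1.0"
        }

        private static String find(String text, String macro) {
            Matcher m = Pattern.compile(macro + " ([0-9]+)").matcher(text);
            if (!m.find()) {
                throw new IllegalStateException(macro + " not found");
            }
            return m.group(1);
        }
    }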
diff --git a/src/rocksdb/java/rocksjni/backupablejni.cc b/src/rocksdb/java/rocksjni/backupablejni.cc
new file mode 100644
index 0000000..d26e46e
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/backupablejni.cc
@@ -0,0 +1,330 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// This file implements the "bridge" between Java and C++ and enables
+// calling c++ rocksdb::BackupableDB and rocksdb::BackupableDBOptions methods
+// from Java side.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <jni.h>
+#include <string>
+#include <vector>
+
+#include "include/org_rocksdb_BackupableDB.h"
+#include "include/org_rocksdb_BackupableDBOptions.h"
+#include "rocksjni/portal.h"
+#include "rocksdb/utilities/backupable_db.h"
+
+/*
+ * Class:     org_rocksdb_BackupableDB
+ * Method:    open
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_BackupableDB_open(
+    JNIEnv* env, jobject jbdb, jlong jdb_handle, jlong jopt_handle) {
+  auto db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  auto opt = reinterpret_cast<rocksdb::BackupableDBOptions*>(jopt_handle);
+  auto bdb = new rocksdb::BackupableDB(db, *opt);
+
+  // as BackupableDB extends RocksDB on the java side, we can reuse
+  // the RocksDB portal here.
+  rocksdb::RocksDBJni::setHandle(env, jbdb, bdb);
+}
+
+/*
+ * Class:     org_rocksdb_BackupableDB
+ * Method:    createNewBackup
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_BackupableDB_createNewBackup(
+    JNIEnv* env, jobject jbdb, jlong jhandle, jboolean jflag) {
+  rocksdb::Status s =
+      reinterpret_cast<rocksdb::BackupableDB*>(jhandle)->CreateNewBackup(jflag);
+  if (!s.ok()) {
+    rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_BackupableDB
+ * Method:    purgeOldBackups
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_BackupableDB_purgeOldBackups(
+    JNIEnv* env, jobject jbdb, jlong jhandle, jint jnumBackupsToKeep) {
+  rocksdb::Status s =
+      reinterpret_cast<rocksdb::BackupableDB*>(jhandle)->
+      PurgeOldBackups(jnumBackupsToKeep);
+  if (!s.ok()) {
+    rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_BackupableDB
+ * Method:    deleteBackup0
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_BackupableDB_deleteBackup0(JNIEnv* env,
+    jobject jobj, jlong jhandle, jint jbackup_id) {
+  auto rdb = reinterpret_cast<rocksdb::BackupableDB*>(jhandle);
+  rocksdb::Status s = rdb->DeleteBackup(jbackup_id);
+
+  if (!s.ok()) {
+    rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_BackupableDB
+ * Method:    getBackupInfo
+ * Signature: (J)Ljava/util/List;
+ */
+jobject Java_org_rocksdb_BackupableDB_getBackupInfo(
+    JNIEnv* env, jobject jbdb, jlong jhandle) {
+  std::vector<rocksdb::BackupInfo> backup_infos;
+  reinterpret_cast<rocksdb::BackupableDB*>(jhandle)->
+      GetBackupInfo(&backup_infos);
+  return rocksdb::BackupInfoListJni::getBackupInfo(env,
+      backup_infos);
+}
+
+/*
+ * Class:     org_rocksdb_BackupableDB
+ * Method:    getCorruptedBackups
+ * Signature: (J)[I
+ */
+jintArray Java_org_rocksdb_BackupableDB_getCorruptedBackups(
+    JNIEnv* env, jobject jbdb, jlong jhandle) {
+  std::vector<rocksdb::BackupID> backup_ids;
+  reinterpret_cast<rocksdb::BackupableDB*>(jhandle)->
+      GetCorruptedBackups(&backup_ids);
+  // store backup ids in an int array
+  const std::vector<rocksdb::BackupID>::size_type
+      kIdSize = backup_ids.size();
+  int int_backup_ids[kIdSize];
+  for (std::vector<rocksdb::BackupID>::size_type i = 0;
+      i != kIdSize; i++) {
+    int_backup_ids[i] = backup_ids[i];
+  }
+  // Store ints in java array
+  jintArray ret_backup_ids;
+  // It's OK to lose precision here (64->32)
+  jsize ret_backup_ids_size = static_cast<jsize>(kIdSize);
+  ret_backup_ids = env->NewIntArray(ret_backup_ids_size);
+  env->SetIntArrayRegion(ret_backup_ids, 0, ret_backup_ids_size,
+      int_backup_ids);
+  return ret_backup_ids;
+}
+
+/*
+ * Class:     org_rocksdb_BackupableDB
+ * Method:    garbageCollect
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_BackupableDB_garbageCollect(JNIEnv* env,
+    jobject jobj, jlong jhandle) {
+  auto db = reinterpret_cast<rocksdb::BackupableDB*>(jhandle);
+  rocksdb::Status s = db->GarbageCollect();
+
+  if (!s.ok()) {
+    rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
+
+///////////////////////////////////////////////////////////////////////////
+// BackupDBOptions
+
+/*
+ * Class:     org_rocksdb_BackupableDBOptions
+ * Method:    newBackupableDBOptions
+ * Signature: (Ljava/lang/String;)V
+ */
+void Java_org_rocksdb_BackupableDBOptions_newBackupableDBOptions(
+    JNIEnv* env, jobject jobj, jstring jpath) {
+  const char* cpath = env->GetStringUTFChars(jpath, 0);
+  auto bopt = new rocksdb::BackupableDBOptions(cpath);
+  env->ReleaseStringUTFChars(jpath, cpath);
+  rocksdb::BackupableDBOptionsJni::setHandle(env, jobj, bopt);
+}
+
+/*
+ * Class:     org_rocksdb_BackupableDBOptions
+ * Method:    backupDir
+ * Signature: (J)Ljava/lang/String;
+ */
+jstring Java_org_rocksdb_BackupableDBOptions_backupDir(
+    JNIEnv* env, jobject jopt, jlong jhandle) {
+  auto bopt = reinterpret_cast<rocksdb::BackupableDBOptions*>(jhandle);
+  return env->NewStringUTF(bopt->backup_dir.c_str());
+}
+
+/*
+ * Class:     org_rocksdb_BackupableDBOptions
+ * Method:    setShareTableFiles
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_BackupableDBOptions_setShareTableFiles(
+    JNIEnv* env, jobject jobj, jlong jhandle, jboolean flag) {
+  auto bopt = reinterpret_cast<rocksdb::BackupableDBOptions*>(jhandle);
+  bopt->share_table_files = flag;
+}
+
+/*
+ * Class:     org_rocksdb_BackupableDBOptions
+ * Method:    shareTableFiles
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_BackupableDBOptions_shareTableFiles(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  auto bopt = reinterpret_cast<rocksdb::BackupableDBOptions*>(jhandle);
+  return bopt->share_table_files;
+}
+
+/*
+ * Class:     org_rocksdb_BackupableDBOptions
+ * Method:    setSync
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_BackupableDBOptions_setSync(
+    JNIEnv* env, jobject jobj, jlong jhandle, jboolean flag) {
+  auto bopt = reinterpret_cast<rocksdb::BackupableDBOptions*>(jhandle);
+  bopt->sync = flag;
+}
+
+/*
+ * Class:     org_rocksdb_BackupableDBOptions
+ * Method:    sync
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_BackupableDBOptions_sync(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  auto bopt = reinterpret_cast<rocksdb::BackupableDBOptions*>(jhandle);
+  return bopt->sync;
+}
+
+/*
+ * Class:     org_rocksdb_BackupableDBOptions
+ * Method:    setDestroyOldData
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_BackupableDBOptions_setDestroyOldData(
+    JNIEnv* env, jobject jobj, jlong jhandle, jboolean flag) {
+  auto bopt = reinterpret_cast<rocksdb::BackupableDBOptions*>(jhandle);
+  bopt->destroy_old_data = flag;
+}
+
+/*
+ * Class:     org_rocksdb_BackupableDBOptions
+ * Method:    destroyOldData
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_BackupableDBOptions_destroyOldData(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  auto bopt = reinterpret_cast<rocksdb::BackupableDBOptions*>(jhandle);
+  return bopt->destroy_old_data;
+}
+
+/*
+ * Class:     org_rocksdb_BackupableDBOptions
+ * Method:    setBackupLogFiles
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_BackupableDBOptions_setBackupLogFiles(
+    JNIEnv* env, jobject jobj, jlong jhandle, jboolean flag) {
+  auto bopt = reinterpret_cast<rocksdb::BackupableDBOptions*>(jhandle);
+  bopt->backup_log_files = flag;
+}
+
+/*
+ * Class:     org_rocksdb_BackupableDBOptions
+ * Method:    backupLogFiles
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_BackupableDBOptions_backupLogFiles(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  auto bopt = reinterpret_cast<rocksdb::BackupableDBOptions*>(jhandle);
+  return bopt->backup_log_files;
+}
+
+/*
+ * Class:     org_rocksdb_BackupableDBOptions
+ * Method:    setBackupRateLimit
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_BackupableDBOptions_setBackupRateLimit(
+    JNIEnv* env, jobject jobj, jlong jhandle, jlong jbackup_rate_limit) {
+  auto bopt = reinterpret_cast<rocksdb::BackupableDBOptions*>(jhandle);
+  bopt->backup_rate_limit = jbackup_rate_limit;
+}
+
+/*
+ * Class:     org_rocksdb_BackupableDBOptions
+ * Method:    backupRateLimit
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_BackupableDBOptions_backupRateLimit(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  auto bopt = reinterpret_cast<rocksdb::BackupableDBOptions*>(jhandle);
+  return bopt->backup_rate_limit;
+}
+
+/*
+ * Class:     org_rocksdb_BackupableDBOptions
+ * Method:    setRestoreRateLimit
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_BackupableDBOptions_setRestoreRateLimit(
+    JNIEnv* env, jobject jobj, jlong jhandle, jlong jrestore_rate_limit) {
+  auto bopt = reinterpret_cast<rocksdb::BackupableDBOptions*>(jhandle);
+  bopt->restore_rate_limit = jrestore_rate_limit;
+}
+
+/*
+ * Class:     org_rocksdb_BackupableDBOptions
+ * Method:    restoreRateLimit
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_BackupableDBOptions_restoreRateLimit(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  auto bopt = reinterpret_cast<rocksdb::BackupableDBOptions*>(jhandle);
+  return bopt->restore_rate_limit;
+}
+
+/*
+ * Class:     org_rocksdb_BackupableDBOptions
+ * Method:    setShareFilesWithChecksum
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_BackupableDBOptions_setShareFilesWithChecksum(
+    JNIEnv* env, jobject jobj, jlong jhandle, jboolean flag) {
+  auto bopt = reinterpret_cast<rocksdb::BackupableDBOptions*>(jhandle);
+  bopt->share_files_with_checksum = flag;
+}
+
+/*
+ * Class:     org_rocksdb_BackupableDBOptions
+ * Method:    shareFilesWithChecksum
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_BackupableDBOptions_shareFilesWithChecksum(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  auto bopt = reinterpret_cast<rocksdb::BackupableDBOptions*>(jhandle);
+  return bopt->share_files_with_checksum;
+}
+
+/*
+ * Class:     org_rocksdb_BackupableDBOptions
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_BackupableDBOptions_disposeInternal(
+    JNIEnv* env, jobject jopt, jlong jhandle) {
+  auto bopt = reinterpret_cast<rocksdb::BackupableDBOptions*>(jhandle);
+  assert(bopt);
+  delete bopt;
+  rocksdb::BackupableDBOptionsJni::setHandle(env, jopt, nullptr);
+}
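On the Java side these natives back org.rocksdb.BackupableDB and org.rocksdb.BackupableDBOptions. A minimal usage sketch, assuming the Java API of this vintage (paths are hypothetical; cleanup uses the dispose() idiom rather than try-with-resources):

    import org.rocksdb.*;

    public class BackupExample {
        static { RocksDB.loadLibrary(); }

        public static void main(String[] args) throws RocksDBException {
            Options opts = new Options().setCreateIfMissing(true);
            BackupableDBOptions bopts =
                new BackupableDBOptions("/tmp/rocksdb-backups");
            // Opens the DB and wraps it with backup support (the open native above).
            BackupableDB bdb = BackupableDB.open(opts, bopts, "/tmp/rocksdb-data");
            bdb.put("key".getBytes(), "value".getBytes());
            bdb.createNewBackup(true);  // true: flush memtable before backing up
            bdb.purgeOldBackups(3);     // keep at most the three newest backups
            bdb.dispose();
            bopts.dispose();
            opts.dispose();
        }
    }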
diff --git a/src/rocksdb/java/rocksjni/backupenginejni.cc b/src/rocksdb/java/rocksjni/backupenginejni.cc
new file mode 100644
index 0000000..750ab96
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/backupenginejni.cc
@@ -0,0 +1,216 @@
+// Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// This file implements the "bridge" between Java and C++ and enables
+// calling C++ rocksdb::BackupEngine methods from the Java side.
+
+#include <jni.h>
+#include <vector>
+
+#include "include/org_rocksdb_BackupEngine.h"
+#include "rocksdb/utilities/backupable_db.h"
+#include "rocksjni/portal.h"
+
+/*
+ * Class:     org_rocksdb_BackupEngine
+ * Method:    open
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_BackupEngine_open(
+    JNIEnv* env, jobject jbe, jlong env_handle,
+    jlong backupable_db_options_handle) {
+  auto* rocks_env = reinterpret_cast<rocksdb::Env*>(env_handle);
+  auto* backupable_db_options =
+      reinterpret_cast<rocksdb::BackupableDBOptions*>(
+      backupable_db_options_handle);
+  rocksdb::BackupEngine* backup_engine;
+  auto status = rocksdb::BackupEngine::Open(rocks_env,
+      *backupable_db_options, &backup_engine);
+
+  if (status.ok()) {
+    rocksdb::BackupEngineJni::setHandle(env, jbe, backup_engine);
+    return;
+  }
+
+  rocksdb::RocksDBExceptionJni::ThrowNew(env, status);
+}
+
+/*
+ * Class:     org_rocksdb_BackupEngine
+ * Method:    createNewBackup
+ * Signature: (JJZ)V
+ */
+void Java_org_rocksdb_BackupEngine_createNewBackup(
+    JNIEnv* env, jobject jbe, jlong jbe_handle, jlong db_handle,
+    jboolean jflush_before_backup) {
+  auto* db = reinterpret_cast<rocksdb::DB*>(db_handle);
+  auto* backup_engine = reinterpret_cast<rocksdb::BackupEngine*>(jbe_handle);
+  auto status = backup_engine->CreateNewBackup(db,
+      static_cast<bool>(jflush_before_backup));
+
+  if (status.ok()) {
+    return;
+  }
+
+  rocksdb::RocksDBExceptionJni::ThrowNew(env, status);
+}
+
+/*
+ * Class:     org_rocksdb_BackupEngine
+ * Method:    getBackupInfo
+ * Signature: (J)Ljava/util/List;
+ */
+jobject Java_org_rocksdb_BackupEngine_getBackupInfo(
+    JNIEnv* env, jobject jbe, jlong jbe_handle) {
+  auto* backup_engine = reinterpret_cast<rocksdb::BackupEngine*>(jbe_handle);
+  std::vector<rocksdb::BackupInfo> backup_infos;
+  backup_engine->GetBackupInfo(&backup_infos);
+  return rocksdb::BackupInfoListJni::getBackupInfo(env, backup_infos);
+}
+
+/*
+ * Class:     org_rocksdb_BackupEngine
+ * Method:    getCorruptedBackups
+ * Signature: (J)[I
+ */
+jintArray Java_org_rocksdb_BackupEngine_getCorruptedBackups(
+    JNIEnv* env, jobject jbe, jlong jbe_handle) {
+  auto* backup_engine = reinterpret_cast<rocksdb::BackupEngine*>(jbe_handle);
+  std::vector<rocksdb::BackupID> backup_ids;
+  backup_engine->GetCorruptedBackups(&backup_ids);
+  // store backupids in int array
+  const std::vector<rocksdb::BackupID>::size_type
+      kIdSize = backup_ids.size();
+  int int_backup_ids[kIdSize];
+  for (std::vector<rocksdb::BackupID>::size_type i = 0;
+      i != kIdSize; i++) {
+    int_backup_ids[i] = backup_ids[i];
+  }
+  // Store ints in java array
+  jintArray ret_backup_ids;
+  // Its ok to loose precision here (64->32)
+  jsize ret_backup_ids_size = static_cast<jsize>(kIdSize);
+  ret_backup_ids = env->NewIntArray(ret_backup_ids_size);
+  env->SetIntArrayRegion(ret_backup_ids, 0, ret_backup_ids_size,
+      int_backup_ids);
+  return ret_backup_ids;
+}
+
+/*
+ * Class:     org_rocksdb_BackupEngine
+ * Method:    garbageCollect
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_BackupEngine_garbageCollect(
+    JNIEnv* env, jobject jbe, jlong jbe_handle) {
+  auto* backup_engine = reinterpret_cast<rocksdb::BackupEngine*>(jbe_handle);
+  auto status = backup_engine->GarbageCollect();
+
+  if (status.ok()) {
+    return;
+  }
+
+  rocksdb::RocksDBExceptionJni::ThrowNew(env, status);
+}
+
+/*
+ * Class:     org_rocksdb_BackupEngine
+ * Method:    purgeOldBackups
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_BackupEngine_purgeOldBackups(
+    JNIEnv* env, jobject jbe, jlong jbe_handle, jint jnum_backups_to_keep) {
+  auto* backup_engine = reinterpret_cast<rocksdb::BackupEngine*>(jbe_handle);
+  auto status =
+      backup_engine->
+          PurgeOldBackups(static_cast<uint32_t>(jnum_backups_to_keep));
+
+  if (status.ok()) {
+    return;
+  }
+
+  rocksdb::RocksDBExceptionJni::ThrowNew(env, status);
+}
+
+/*
+ * Class:     org_rocksdb_BackupEngine
+ * Method:    deleteBackup
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_BackupEngine_deleteBackup(
+    JNIEnv* env, jobject jbe, jlong jbe_handle, jint jbackup_id) {
+  auto* backup_engine = reinterpret_cast<rocksdb::BackupEngine*>(jbe_handle);
+  auto status =
+      backup_engine->DeleteBackup(static_cast<rocksdb::BackupID>(jbackup_id));
+
+  if (status.ok()) {
+    return;
+  }
+
+  rocksdb::RocksDBExceptionJni::ThrowNew(env, status);
+}
+
+/*
+ * Class:     org_rocksdb_BackupEngine
+ * Method:    restoreDbFromBackup
+ * Signature: (JILjava/lang/String;Ljava/lang/String;J)V
+ */
+void Java_org_rocksdb_BackupEngine_restoreDbFromBackup(
+    JNIEnv* env, jobject jbe, jlong jbe_handle, jint jbackup_id,
+    jstring jdb_dir, jstring jwal_dir, jlong jrestore_options_handle) {
+  auto* backup_engine = reinterpret_cast<rocksdb::BackupEngine*>(jbe_handle);
+  const char* db_dir = env->GetStringUTFChars(jdb_dir, 0);
+  const char* wal_dir = env->GetStringUTFChars(jwal_dir, 0);
+  auto* restore_options =
+      reinterpret_cast<rocksdb::RestoreOptions*>(jrestore_options_handle);
+  auto status =
+      backup_engine->RestoreDBFromBackup(
+          static_cast<rocksdb::BackupID>(jbackup_id), db_dir, wal_dir,
+          *restore_options);
+  env->ReleaseStringUTFChars(jwal_dir, wal_dir);
+  env->ReleaseStringUTFChars(jdb_dir, db_dir);
+
+  if (status.ok()) {
+    return;
+  }
+
+  rocksdb::RocksDBExceptionJni::ThrowNew(env, status);
+}
+
+/*
+ * Class:     org_rocksdb_BackupEngine
+ * Method:    restoreDbFromLatestBackup
+ * Signature: (JLjava/lang/String;Ljava/lang/String;J)V
+ */
+void Java_org_rocksdb_BackupEngine_restoreDbFromLatestBackup(
+    JNIEnv* env, jobject jbe, jlong jbe_handle, jstring jdb_dir,
+    jstring jwal_dir, jlong jrestore_options_handle) {
+  auto* backup_engine = reinterpret_cast<rocksdb::BackupEngine*>(jbe_handle);
+  const char* db_dir = env->GetStringUTFChars(jdb_dir, 0);
+  const char* wal_dir = env->GetStringUTFChars(jwal_dir, 0);
+  auto* restore_options =
+      reinterpret_cast<rocksdb::RestoreOptions*>(jrestore_options_handle);
+  auto status =
+      backup_engine->RestoreDBFromLatestBackup(db_dir, wal_dir,
+          *restore_options);
+  env->ReleaseStringUTFChars(jwal_dir, wal_dir);
+  env->ReleaseStringUTFChars(jdb_dir, db_dir);
+
+  if (status.ok()) {
+    return;
+  }
+
+  rocksdb::RocksDBExceptionJni::ThrowNew(env, status);
+}
+
+/*
+ * Class:     org_rocksdb_BackupEngine
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_BackupEngine_disposeInternal(
+    JNIEnv* env, jobject jbe, jlong jbe_handle) {
+  delete reinterpret_cast<rocksdb::BackupEngine*>(jbe_handle);
+}
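BackupEngine is the engine-style counterpart: it is opened against an Env plus BackupableDBOptions and operates on an independently opened RocksDB. A sketch of the Java half, assuming the API of this vintage (hypothetical paths; RestoreOptions(false) is taken to mean "do not keep log files"):

    import org.rocksdb.*;

    public class BackupEngineExample {
        static { RocksDB.loadLibrary(); }

        public static void main(String[] args) throws RocksDBException {
            Options opts = new Options().setCreateIfMissing(true);
            RocksDB db = RocksDB.open(opts, "/tmp/rocksdb-data");
            BackupableDBOptions bopts =
                new BackupableDBOptions("/tmp/rocksdb-backups");
            BackupEngine engine = BackupEngine.open(Env.getDefault(), bopts);
            engine.createNewBackup(db, true);  // flush, then back up
            db.dispose();
            // Restore the most recent backup into a fresh directory.
            engine.restoreDbFromLatestBackup(
                "/tmp/rocksdb-restored", "/tmp/rocksdb-restored",
                new RestoreOptions(false));
            engine.dispose();
            bopts.dispose();
            opts.dispose();
        }
    }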
diff --git a/src/rocksdb/java/rocksjni/checkpoint.cc b/src/rocksdb/java/rocksjni/checkpoint.cc
new file mode 100644
index 0000000..72a40be
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/checkpoint.cc
@@ -0,0 +1,61 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// This file implements the "bridge" between Java and C++ and enables
+// calling c++ rocksdb::Checkpoint methods from Java side.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <jni.h>
+#include <string>
+
+#include "include/org_rocksdb_Checkpoint.h"
+#include "rocksjni/portal.h"
+#include "rocksdb/db.h"
+#include "rocksdb/utilities/checkpoint.h"
+/*
+ * Class:     org_rocksdb_Checkpoint
+ * Method:    newCheckpoint
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Checkpoint_newCheckpoint(JNIEnv* env,
+    jclass jclazz, jlong jdb_handle) {
+  auto db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  rocksdb::Checkpoint* checkpoint;
+  rocksdb::Checkpoint::Create(db, &checkpoint);
+  return reinterpret_cast<jlong>(checkpoint);
+}
+
+/*
+ * Class:     org_rocksdb_Checkpoint
+ * Method:    dispose
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_Checkpoint_disposeInternal(JNIEnv* env, jobject jobj,
+    jlong jhandle) {
+  auto checkpoint = reinterpret_cast<rocksdb::Checkpoint*>(jhandle);
+  assert(checkpoint);
+  delete checkpoint;
+}
+
+/*
+ * Class:     org_rocksdb_Checkpoint
+ * Method:    createCheckpoint
+ * Signature: (JLjava/lang/String;)V
+ */
+void Java_org_rocksdb_Checkpoint_createCheckpoint(
+    JNIEnv* env, jobject jobj, jlong jcheckpoint_handle,
+    jstring jcheckpoint_path) {
+  auto checkpoint = reinterpret_cast<rocksdb::Checkpoint*>(
+      jcheckpoint_handle);
+  const char* checkpoint_path = env->GetStringUTFChars(
+      jcheckpoint_path, 0);
+  rocksdb::Status s = checkpoint->CreateCheckpoint(
+      checkpoint_path);
+  env->ReleaseStringUTFChars(jcheckpoint_path, checkpoint_path);
+  if (!s.ok()) {
+      rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
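Checkpoints give a consistent, openable snapshot of a live DB, hard-linking its files where possible. A minimal sketch of the Java caller, assuming the API of this vintage (hypothetical paths; the target directory must not already exist):

    import org.rocksdb.*;

    public class CheckpointExample {
        static { RocksDB.loadLibrary(); }

        public static void main(String[] args) throws RocksDBException {
            Options opts = new Options().setCreateIfMissing(true);
            RocksDB db = RocksDB.open(opts, "/tmp/rocksdb-data");
            db.put("k".getBytes(), "v".getBytes());
            Checkpoint cp = Checkpoint.create(db);  // the newCheckpoint native above
            cp.createCheckpoint("/tmp/rocksdb-checkpoint");
            cp.dispose();
            db.dispose();
            opts.dispose();
        }
    }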
diff --git a/src/rocksdb/java/rocksjni/columnfamilyhandle.cc b/src/rocksdb/java/rocksjni/columnfamilyhandle.cc
new file mode 100644
index 0000000..be3b4c8
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/columnfamilyhandle.cc
@@ -0,0 +1,25 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// This file implements the "bridge" between Java and C++ and enables
+// calling C++ rocksdb::ColumnFamilyHandle methods from the Java side.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <jni.h>
+
+#include "include/org_rocksdb_ColumnFamilyHandle.h"
+#include "rocksjni/portal.h"
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyHandle
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_ColumnFamilyHandle_disposeInternal(
+    JNIEnv* env, jobject jobj, jlong handle) {
+  auto it = reinterpret_cast<rocksdb::ColumnFamilyHandle*>(handle);
+  delete it;
+}
diff --git a/src/rocksdb/java/rocksjni/compaction_filter.cc b/src/rocksdb/java/rocksjni/compaction_filter.cc
new file mode 100644
index 0000000..5fa52c0
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/compaction_filter.cc
@@ -0,0 +1,24 @@
+// Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// This file implements the "bridge" between Java and C++ for
+// rocksdb::CompactionFilter.
+
+#include <jni.h>
+
+#include "rocksdb/compaction_filter.h"
+
+// <editor-fold desc="org.rocksdb.AbstractCompactionFilter">
+
+/*
+ * Class:     org_rocksdb_AbstractCompactionFilter
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_AbstractCompactionFilter_disposeInternal(
+    JNIEnv* env, jobject jobj, jlong handle) {
+  delete reinterpret_cast<rocksdb::CompactionFilter*>(handle);
+}
+// </editor-fold>
diff --git a/src/rocksdb/java/rocksjni/comparator.cc b/src/rocksdb/java/rocksjni/comparator.cc
new file mode 100644
index 0000000..1963762
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/comparator.cc
@@ -0,0 +1,66 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// This file implements the "bridge" between Java and C++ for
+// rocksdb::Comparator.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <jni.h>
+#include <string>
+#include <functional>
+
+#include "include/org_rocksdb_AbstractComparator.h"
+#include "include/org_rocksdb_Comparator.h"
+#include "include/org_rocksdb_DirectComparator.h"
+#include "rocksjni/comparatorjnicallback.h"
+#include "rocksjni/portal.h"
+
+// <editor-fold desc="org.rocksdb.AbstractComparator>
+
+/*
+ * Class:     org_rocksdb_AbstractComparator
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_AbstractComparator_disposeInternal(
+    JNIEnv* env, jobject jobj, jlong handle) {
+  delete reinterpret_cast<rocksdb::BaseComparatorJniCallback*>(handle);
+}
+// </editor-fold>
+
+// <editor-fold desc="org.rocksdb.Comparator>
+
+/*
+ * Class:     org_rocksdb_Comparator
+ * Method:    createNewComparator0
+ * Signature: ()V
+ */
+void Java_org_rocksdb_Comparator_createNewComparator0(
+    JNIEnv* env, jobject jobj, jlong copt_handle) {
+  const rocksdb::ComparatorJniCallbackOptions* copt =
+    reinterpret_cast<rocksdb::ComparatorJniCallbackOptions*>(copt_handle);
+  const rocksdb::ComparatorJniCallback* c =
+    new rocksdb::ComparatorJniCallback(env, jobj, copt);
+  rocksdb::AbstractComparatorJni::setHandle(env, jobj, c);
+}
+// </editor-fold>
+
+// <editor-fold desc="org.rocksdb.DirectComparator>
+
+/*
+ * Class:     org_rocksdb_DirectComparator
+ * Method:    createNewDirectComparator0
+ * Signature: ()V
+ */
+void Java_org_rocksdb_DirectComparator_createNewDirectComparator0(
+    JNIEnv* env, jobject jobj, jlong copt_handle) {
+  const rocksdb::ComparatorJniCallbackOptions* copt =
+    reinterpret_cast<rocksdb::ComparatorJniCallbackOptions*>(copt_handle);
+  const rocksdb::DirectComparatorJniCallback* c =
+    new rocksdb::DirectComparatorJniCallback(env, jobj, copt);
+  rocksdb::AbstractComparatorJni::setHandle(env, jobj, c);
+}
+// </editor-fold>
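These natives let a comparator written in Java drive key ordering in the C++ engine via the callback classes that follow. A sketch of what such a Java comparator might look like (reverse lexicographic order; the class name is invented for illustration):

    import org.rocksdb.*;

    // compare() is invoked from C++ through the ComparatorJniCallback bridge.
    public class ReverseComparator extends Comparator {
        public ReverseComparator(ComparatorOptions copt) { super(copt); }

        @Override
        public String name() { return "rocksdb.java.ReverseComparator"; }

        @Override
        public int compare(Slice a, Slice b) {
            // Reverse of the natural string ordering of the two keys.
            return b.toString().compareTo(a.toString());
        }
    }

It would be installed with something like options.setComparator(new ReverseComparator(new ComparatorOptions())); note that the per-callback mutexes described in comparatorjnicallback.h make a Java comparator noticeably slower than a native one.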
diff --git a/src/rocksdb/java/rocksjni/comparatorjnicallback.cc b/src/rocksdb/java/rocksjni/comparatorjnicallback.cc
new file mode 100644
index 0000000..a85b450
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/comparatorjnicallback.cc
@@ -0,0 +1,176 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// This file implements the callback "bridge" between Java and C++ for
+// rocksdb::Comparator.
+
+#include "rocksjni/comparatorjnicallback.h"
+#include "rocksjni/portal.h"
+
+namespace rocksdb {
+BaseComparatorJniCallback::BaseComparatorJniCallback(
+    JNIEnv* env, jobject jComparator,
+    const ComparatorJniCallbackOptions* copt)
+    : mtx_compare(new port::Mutex(copt->use_adaptive_mutex)),
+    mtx_findShortestSeparator(new port::Mutex(copt->use_adaptive_mutex)) {
+  // Note: Comparator methods may be accessed by multiple threads,
+  // so we ref the JVM, not the env
+  const jint rs __attribute__((unused)) = env->GetJavaVM(&m_jvm);
+  assert(rs == JNI_OK);
+
+  // Note: we want to access the Java Comparator instance
+  // across multiple method calls, so we create a global ref
+  m_jComparator = env->NewGlobalRef(jComparator);
+
+  // Note: The name of a Comparator will not change during its lifetime,
+  // so we cache it in a global var
+  jmethodID jNameMethodId = AbstractComparatorJni::getNameMethodId(env);
+  jstring jsName = (jstring)env->CallObjectMethod(m_jComparator, jNameMethodId);
+  m_name = JniUtil::copyString(env, jsName);  // also releases jsName
+
+  m_jCompareMethodId = AbstractComparatorJni::getCompareMethodId(env);
+  m_jFindShortestSeparatorMethodId =
+    AbstractComparatorJni::getFindShortestSeparatorMethodId(env);
+  m_jFindShortSuccessorMethodId =
+    AbstractComparatorJni::getFindShortSuccessorMethodId(env);
+}
+
+/**
+ * Attach/Get a JNIEnv for the current native thread
+ */
+JNIEnv* BaseComparatorJniCallback::getJniEnv() const {
+  JNIEnv *env;
+  jint rs __attribute__((unused)) =
+      m_jvm->AttachCurrentThread(reinterpret_cast<void**>(&env), NULL);
+  assert(rs == JNI_OK);
+  return env;
+}
+
+const char* BaseComparatorJniCallback::Name() const {
+  return m_name.c_str();
+}
+
+int BaseComparatorJniCallback::Compare(const Slice& a, const Slice& b) const {
+  JNIEnv* m_env = getJniEnv();
+
+  // TODO(adamretter): slice objects can potentially be cached using thread
+  // local variables to avoid locking. Could make this configurable depending on
+  // performance.
+  mtx_compare->Lock();
+
+  AbstractSliceJni::setHandle(m_env, m_jSliceA, &a);
+  AbstractSliceJni::setHandle(m_env, m_jSliceB, &b);
+  jint result =
+    m_env->CallIntMethod(m_jComparator, m_jCompareMethodId, m_jSliceA,
+      m_jSliceB);
+
+  mtx_compare->Unlock();
+
+  m_jvm->DetachCurrentThread();
+
+  return result;
+}
+
+void BaseComparatorJniCallback::FindShortestSeparator(
+  std::string* start, const Slice& limit) const {
+  if (start == nullptr) {
+    return;
+  }
+
+  JNIEnv* m_env = getJniEnv();
+
+  const char* startUtf = start->c_str();
+  jstring jsStart = m_env->NewStringUTF(startUtf);
+
+  // TODO(adamretter): slice object can potentially be cached using thread local
+  // variable to avoid locking. Could make this configurable depending on
+  // performance.
+  mtx_findShortestSeparator->Lock();
+
+  AbstractSliceJni::setHandle(m_env, m_jSliceLimit, &limit);
+  jstring jsResultStart =
+    (jstring)m_env->CallObjectMethod(m_jComparator,
+      m_jFindShortestSeparatorMethodId, jsStart, m_jSliceLimit);
+
+  mtx_findShortestSeparator->Unlock();
+
+  m_env->DeleteLocalRef(jsStart);
+
+  if (jsResultStart != nullptr) {
+    // update start with result
+    *start =
+      JniUtil::copyString(m_env, jsResultStart);  // also releases jsResultStart
+  }
+
+  m_jvm->DetachCurrentThread();
+}
+
+void BaseComparatorJniCallback::FindShortSuccessor(std::string* key) const {
+  if (key == nullptr) {
+    return;
+  }
+
+  JNIEnv* m_env = getJniEnv();
+
+  const char* keyUtf = key->c_str();
+  jstring jsKey = m_env->NewStringUTF(keyUtf);
+
+  jstring jsResultKey =
+    (jstring)m_env->CallObjectMethod(m_jComparator,
+      m_jFindShortSuccessorMethodId, jsKey);
+
+  m_env->DeleteLocalRef(jsKey);
+
+  if (jsResultKey != nullptr) {
+    // updates key with result, also releases jsResultKey.
+    *key = JniUtil::copyString(m_env, jsResultKey);
+  }
+
+  m_jvm->DetachCurrentThread();
+}
+
+BaseComparatorJniCallback::~BaseComparatorJniCallback() {
+  JNIEnv* m_env = getJniEnv();
+
+  m_env->DeleteGlobalRef(m_jComparator);
+
+  // Note: we do not need to explicitly detach here, as this destructor is
+  // effectively called from the Java class's disposeInternal method and so
+  // already runs on an attached thread; getJniEnv above is just a no-op
+  // Attach that returns the env, so no matching m_jvm->DetachCurrentThread()
+  // call is required.
+}
+
+ComparatorJniCallback::ComparatorJniCallback(
+    JNIEnv* env, jobject jComparator,
+    const ComparatorJniCallbackOptions* copt) :
+    BaseComparatorJniCallback(env, jComparator, copt) {
+  m_jSliceA = env->NewGlobalRef(SliceJni::construct0(env));
+  m_jSliceB = env->NewGlobalRef(SliceJni::construct0(env));
+  m_jSliceLimit = env->NewGlobalRef(SliceJni::construct0(env));
+}
+
+ComparatorJniCallback::~ComparatorJniCallback() {
+  JNIEnv* m_env = getJniEnv();
+  m_env->DeleteGlobalRef(m_jSliceA);
+  m_env->DeleteGlobalRef(m_jSliceB);
+  m_env->DeleteGlobalRef(m_jSliceLimit);
+}
+
+DirectComparatorJniCallback::DirectComparatorJniCallback(
+    JNIEnv* env, jobject jComparator,
+    const ComparatorJniCallbackOptions* copt) :
+    BaseComparatorJniCallback(env, jComparator, copt) {
+  m_jSliceA = env->NewGlobalRef(DirectSliceJni::construct0(env));
+  m_jSliceB = env->NewGlobalRef(DirectSliceJni::construct0(env));
+  m_jSliceLimit = env->NewGlobalRef(DirectSliceJni::construct0(env));
+}
+
+DirectComparatorJniCallback::~DirectComparatorJniCallback() {
+  JNIEnv* m_env = getJniEnv();
+  m_env->DeleteGlobalRef(m_jSliceA);
+  m_env->DeleteGlobalRef(m_jSliceB);
+  m_env->DeleteGlobalRef(m_jSliceLimit);
+}
+}  // namespace rocksdb
diff --git a/src/rocksdb/java/rocksjni/comparatorjnicallback.h b/src/rocksdb/java/rocksjni/comparatorjnicallback.h
new file mode 100644
index 0000000..65b986c
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/comparatorjnicallback.h
@@ -0,0 +1,95 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// This file implements the callback "bridge" between Java and C++ for
+// rocksdb::Comparator and rocksdb::DirectComparator.
+
+#ifndef JAVA_ROCKSJNI_COMPARATORJNICALLBACK_H_
+#define JAVA_ROCKSJNI_COMPARATORJNICALLBACK_H_
+
+#include <jni.h>
+#include <string>
+#include "rocksdb/comparator.h"
+#include "rocksdb/slice.h"
+#include "port/port.h"
+
+namespace rocksdb {
+
+struct ComparatorJniCallbackOptions {
+  // Use adaptive mutex, which spins in the user space before resorting
+  // to kernel. This could reduce context switch when the mutex is not
+  // heavily contended. However, if the mutex is hot, we could end up
+  // wasting spin time.
+  // Default: false
+  bool use_adaptive_mutex;
+
+  ComparatorJniCallbackOptions() : use_adaptive_mutex(false) {
+  }
+};
+
+/**
+ * This class acts as a bridge between C++
+ * and Java. The methods in this class will be
+ * called back from the RocksDB storage engine (C++);
+ * they then invoke the appropriate Java method, which
+ * enables Comparators to be implemented in Java.
+ *
+ * The design of this Comparator caches the Java Slice
+ * objects that are used in the compare and findShortestSeparator
+ * method callbacks. Instead of creating new objects for each callback
+ * to those functions, reuse via setHandle is a lot
+ * faster. Unfortunately this means that we have to
+ * introduce independent locking in regions of each of those methods,
+ * via the mutexes mtx_compare and mtx_findShortestSeparator respectively.
+ */
+class BaseComparatorJniCallback : public Comparator {
+ public:
+    BaseComparatorJniCallback(
+      JNIEnv* env, jobject jComparator,
+      const ComparatorJniCallbackOptions* copt);
+    virtual ~BaseComparatorJniCallback();
+    virtual const char* Name() const;
+    virtual int Compare(const Slice& a, const Slice& b) const;
+    virtual void FindShortestSeparator(
+      std::string* start, const Slice& limit) const;
+    virtual void FindShortSuccessor(std::string* key) const;
+
+ private:
+    // used for synchronisation in compare method
+    port::Mutex* mtx_compare;
+    // used for synchronisation in findShortestSeparator method
+    port::Mutex* mtx_findShortestSeparator;
+    JavaVM* m_jvm;
+    jobject m_jComparator;
+    std::string m_name;
+    jmethodID m_jCompareMethodId;
+    jmethodID m_jFindShortestSeparatorMethodId;
+    jmethodID m_jFindShortSuccessorMethodId;
+
+ protected:
+    JNIEnv* getJniEnv() const;
+    jobject m_jSliceA;
+    jobject m_jSliceB;
+    jobject m_jSliceLimit;
+};
+
+class ComparatorJniCallback : public BaseComparatorJniCallback {
+ public:
+      ComparatorJniCallback(
+        JNIEnv* env, jobject jComparator,
+        const ComparatorJniCallbackOptions* copt);
+      ~ComparatorJniCallback();
+};
+
+class DirectComparatorJniCallback : public BaseComparatorJniCallback {
+ public:
+      DirectComparatorJniCallback(
+        JNIEnv* env, jobject jComparator,
+        const ComparatorJniCallbackOptions* copt);
+      ~DirectComparatorJniCallback();
+};
+}  // namespace rocksdb
+
+#endif  // JAVA_ROCKSJNI_COMPARATORJNICALLBACK_H_
diff --git a/src/rocksdb/java/rocksjni/env.cc b/src/rocksdb/java/rocksjni/env.cc
new file mode 100644
index 0000000..b50d5ae
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/env.cc
@@ -0,0 +1,79 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// This file implements the "bridge" between Java and C++ and enables
+// calling c++ rocksdb::Env methods from Java side.
+
+#include "include/org_rocksdb_Env.h"
+#include "include/org_rocksdb_RocksEnv.h"
+#include "include/org_rocksdb_RocksMemEnv.h"
+#include "rocksdb/env.h"
+
+/*
+ * Class:     org_rocksdb_Env
+ * Method:    getDefaultEnvInternal
+ * Signature: ()J
+ */
+jlong Java_org_rocksdb_Env_getDefaultEnvInternal(
+    JNIEnv* env, jclass jclazz) {
+  return reinterpret_cast<jlong>(rocksdb::Env::Default());
+}
+
+/*
+ * Class:     org_rocksdb_Env
+ * Method:    setBackgroundThreads
+ * Signature: (JII)V
+ */
+void Java_org_rocksdb_Env_setBackgroundThreads(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jint num, jint priority) {
+  auto* rocks_env = reinterpret_cast<rocksdb::Env*>(jhandle);
+  switch (priority) {
+    case org_rocksdb_Env_FLUSH_POOL:
+      rocks_env->SetBackgroundThreads(num, rocksdb::Env::Priority::LOW);
+      break;
+    case org_rocksdb_Env_COMPACTION_POOL:
+      rocks_env->SetBackgroundThreads(num, rocksdb::Env::Priority::HIGH);
+      break;
+  }
+}
+
+/*
+ * Class:     org_rocksdb_Env
+ * Method:    getThreadPoolQueueLen
+ * Signature: (JI)I
+ */
+jint Java_org_rocksdb_Env_getThreadPoolQueueLen(
+    JNIEnv* env, jobject jobj, jlong jhandle, jint pool_id) {
+  auto* rocks_env = reinterpret_cast<rocksdb::Env*>(jhandle);
+  switch (pool_id) {
+    case org_rocksdb_RocksEnv_FLUSH_POOL:
+      return rocks_env->GetThreadPoolQueueLen(rocksdb::Env::Priority::LOW);
+    case org_rocksdb_RocksEnv_COMPACTION_POOL:
+      return rocks_env->GetThreadPoolQueueLen(rocksdb::Env::Priority::HIGH);
+  }
+  return 0;
+}
+
+/*
+ * Class:     org_rocksdb_RocksMemEnv
+ * Method:    createMemEnv
+ * Signature: ()J
+ */
+jlong Java_org_rocksdb_RocksMemEnv_createMemEnv(
+    JNIEnv* env, jclass jclazz) {
+  return reinterpret_cast<jlong>(rocksdb::NewMemEnv(
+      rocksdb::Env::Default()));
+}
+
+/*
+ * Class:     org_rocksdb_RocksMemEnv
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_RocksMemEnv_disposeInternal(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  delete reinterpret_cast<rocksdb::Env*>(jhandle);
+}
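RocksMemEnv wraps NewMemEnv(), keeping all database files in memory while still exercising the normal code paths. A usage sketch, assuming the Java API of this vintage (the path is never created on disk):

    import org.rocksdb.*;

    public class MemEnvExample {
        static { RocksDB.loadLibrary(); }

        public static void main(String[] args) throws RocksDBException {
            Env memEnv = new RocksMemEnv();   // the createMemEnv native above
            Options opts = new Options()
                .setCreateIfMissing(true)
                .setEnv(memEnv);
            RocksDB db = RocksDB.open(opts, "/in-memory/only");
            db.put("k".getBytes(), "v".getBytes());
            db.dispose();
            opts.dispose();
            memEnv.dispose();
        }
    }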
diff --git a/src/rocksdb/java/rocksjni/filter.cc b/src/rocksdb/java/rocksjni/filter.cc
new file mode 100644
index 0000000..2ce17d4
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/filter.cc
@@ -0,0 +1,46 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// This file implements the "bridge" between Java and C++ for
+// rocksdb::FilterPolicy.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <jni.h>
+#include <string>
+
+#include "include/org_rocksdb_Filter.h"
+#include "include/org_rocksdb_BloomFilter.h"
+#include "rocksjni/portal.h"
+#include "rocksdb/filter_policy.h"
+
+/*
+ * Class:     org_rocksdb_BloomFilter
+ * Method:    createBloomFilter
+ * Signature: (IZ)V
+ */
+void Java_org_rocksdb_BloomFilter_createNewBloomFilter(
+    JNIEnv* env, jobject jobj, jint bits_per_key,
+    jboolean use_block_base_builder) {
+  rocksdb::FilterPolicy* fp = const_cast<rocksdb::FilterPolicy *>(
+      rocksdb::NewBloomFilterPolicy(bits_per_key, use_block_base_builder));
+  std::shared_ptr<rocksdb::FilterPolicy> *pFilterPolicy =
+      new std::shared_ptr<rocksdb::FilterPolicy>;
+  *pFilterPolicy = std::shared_ptr<rocksdb::FilterPolicy>(fp);
+  rocksdb::FilterJni::setHandle(env, jobj, pFilterPolicy);
+}
+
+/*
+ * Class:     org_rocksdb_Filter
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_Filter_disposeInternal(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+
+  std::shared_ptr<rocksdb::FilterPolicy> *handle =
+      reinterpret_cast<std::shared_ptr<rocksdb::FilterPolicy> *>(jhandle);
+  handle->reset();
+}
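The filter policy handle produced here is attached to the table format, not to Options directly. A minimal sketch, assuming the Java API of this vintage (10 bits per key; false selects the full-format rather than block-based filter builder, per the JNI flag above):

    import org.rocksdb.*;

    public class BloomFilterExample {
        static { RocksDB.loadLibrary(); }

        public static void main(String[] args) throws RocksDBException {
            BlockBasedTableConfig table = new BlockBasedTableConfig()
                .setFilter(new BloomFilter(10, false));
            Options opts = new Options()
                .setCreateIfMissing(true)
                .setTableFormatConfig(table);
            RocksDB db = RocksDB.open(opts, "/tmp/rocksdb-bloom");
            db.dispose();
            opts.dispose();
        }
    }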
diff --git a/src/rocksdb/java/rocksjni/iterator.cc b/src/rocksdb/java/rocksjni/iterator.cc
new file mode 100644
index 0000000..e9eb0bb
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/iterator.cc
@@ -0,0 +1,144 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// This file implements the "bridge" between Java and C++ and enables
+// calling c++ rocksdb::Iterator methods from Java side.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <jni.h>
+
+#include "include/org_rocksdb_RocksIterator.h"
+#include "rocksjni/portal.h"
+#include "rocksdb/iterator.h"
+
+/*
+ * Class:     org_rocksdb_RocksIterator
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_RocksIterator_disposeInternal(
+    JNIEnv* env, jobject jobj, jlong handle) {
+  auto it = reinterpret_cast<rocksdb::Iterator*>(handle);
+  delete it;
+}
+
+/*
+ * Class:     org_rocksdb_RocksIterator
+ * Method:    isValid0
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_RocksIterator_isValid0(
+    JNIEnv* env, jobject jobj, jlong handle) {
+  return reinterpret_cast<rocksdb::Iterator*>(handle)->Valid();
+}
+
+/*
+ * Class:     org_rocksdb_RocksIterator
+ * Method:    seekToFirst0
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_RocksIterator_seekToFirst0(
+    JNIEnv* env, jobject jobj, jlong handle) {
+  reinterpret_cast<rocksdb::Iterator*>(handle)->SeekToFirst();
+}
+
+/*
+ * Class:     org_rocksdb_RocksIterator
+ * Method:    seekToLast0
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_RocksIterator_seekToLast0(
+    JNIEnv* env, jobject jobj, jlong handle) {
+  reinterpret_cast<rocksdb::Iterator*>(handle)->SeekToLast();
+}
+
+/*
+ * Class:     org_rocksdb_RocksIterator
+ * Method:    next0
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_RocksIterator_next0(
+    JNIEnv* env, jobject jobj, jlong handle) {
+  reinterpret_cast<rocksdb::Iterator*>(handle)->Next();
+}
+
+/*
+ * Class:     org_rocksdb_RocksIterator
+ * Method:    prev0
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_RocksIterator_prev0(
+    JNIEnv* env, jobject jobj, jlong handle) {
+  reinterpret_cast<rocksdb::Iterator*>(handle)->Prev();
+}
+
+/*
+ * Class:     org_rocksdb_RocksIterator
+ * Method:    seek0
+ * Signature: (J[BI)V
+ */
+void Java_org_rocksdb_RocksIterator_seek0(
+    JNIEnv* env, jobject jobj, jlong handle,
+    jbyteArray jtarget, jint jtarget_len) {
+  auto it = reinterpret_cast<rocksdb::Iterator*>(handle);
+  jbyte* target = env->GetByteArrayElements(jtarget, 0);
+  rocksdb::Slice target_slice(
+      reinterpret_cast<char*>(target), jtarget_len);
+
+  it->Seek(target_slice);
+
+  env->ReleaseByteArrayElements(jtarget, target, JNI_ABORT);
+}
+
+/*
+ * Class:     org_rocksdb_RocksIterator
+ * Method:    status0
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_RocksIterator_status0(
+    JNIEnv* env, jobject jobj, jlong handle) {
+  auto it = reinterpret_cast<rocksdb::Iterator*>(handle);
+  rocksdb::Status s = it->status();
+
+  if (s.ok()) {
+    return;
+  }
+
+  rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+}
+
+/*
+ * Class:     org_rocksdb_RocksIterator
+ * Method:    key0
+ * Signature: (J)[B
+ */
+jbyteArray Java_org_rocksdb_RocksIterator_key0(
+    JNIEnv* env, jobject jobj, jlong handle) {
+  auto it = reinterpret_cast<rocksdb::Iterator*>(handle);
+  rocksdb::Slice key_slice = it->key();
+
+  jbyteArray jkey = env->NewByteArray(static_cast<jsize>(key_slice.size()));
+  env->SetByteArrayRegion(jkey, 0, static_cast<jsize>(key_slice.size()),
+                          reinterpret_cast<const jbyte*>(key_slice.data()));
+  return jkey;
+}
+
+/*
+ * Class:     org_rocksdb_RocksIterator
+ * Method:    value0
+ * Signature: (J)[B
+ */
+jbyteArray Java_org_rocksdb_RocksIterator_value0(
+    JNIEnv* env, jobject jobj, jlong handle) {
+  auto it = reinterpret_cast<rocksdb::Iterator*>(handle);
+  rocksdb::Slice value_slice = it->value();
+
+  jbyteArray jkeyValue =
+      env->NewByteArray(static_cast<jsize>(value_slice.size()));
+  env->SetByteArrayRegion(jkeyValue, 0, static_cast<jsize>(value_slice.size()),
+                          reinterpret_cast<const jbyte*>(value_slice.data()));
+  return jkeyValue;
+}
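Taken together these natives back the usual Java iteration loop. A minimal sketch (status() surfaces any deferred iteration error as a RocksDBException):

    import org.rocksdb.*;

    public class IteratorExample {
        static { RocksDB.loadLibrary(); }

        public static void main(String[] args) throws RocksDBException {
            Options opts = new Options().setCreateIfMissing(true);
            RocksDB db = RocksDB.open(opts, "/tmp/rocksdb-iter");
            db.put("a".getBytes(), "1".getBytes());
            db.put("b".getBytes(), "2".getBytes());
            RocksIterator it = db.newIterator();
            for (it.seekToFirst(); it.isValid(); it.next()) {
                System.out.println(new String(it.key()) + " -> "
                    + new String(it.value()));
            }
            it.status();   // throws if iteration hit an error
            it.dispose();
            db.dispose();
            opts.dispose();
        }
    }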
diff --git a/src/rocksdb/java/rocksjni/loggerjnicallback.cc b/src/rocksdb/java/rocksjni/loggerjnicallback.cc
new file mode 100644
index 0000000..71e50b9
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/loggerjnicallback.cc
@@ -0,0 +1,195 @@
+// Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// This file implements the callback "bridge" between Java and C++ for
+// rocksdb::Logger.
+
+#include "include/org_rocksdb_Logger.h"
+
+#include "rocksjni/loggerjnicallback.h"
+#include "rocksjni/portal.h"
+
+namespace rocksdb {
+
+LoggerJniCallback::LoggerJniCallback(
+    JNIEnv* env, jobject jlogger) {
+  const jint rs __attribute__((unused)) = env->GetJavaVM(&m_jvm);
+  assert(rs == JNI_OK);
+
+  // Note: we want to access the Java Logger instance
+  // across multiple method calls, so we create a global ref
+  m_jLogger = env->NewGlobalRef(jlogger);
+  m_jLogMethodId = LoggerJni::getLogMethodId(env);
+}
+
+/**
+ * Get JNIEnv for current native thread
+ */
+JNIEnv* LoggerJniCallback::getJniEnv() const {
+  JNIEnv *env;
+  jint rs __attribute__((unused)) =
+      m_jvm->AttachCurrentThread(reinterpret_cast<void**>(&env), NULL);
+  assert(rs == JNI_OK);
+  return env;
+}
+
+void LoggerJniCallback::Logv(const char* format, va_list ap) {
+  // Implemented only because the base method is virtual; it is unused
+  // here, as we need to know the log level (see the overload below).
+}
+
+void LoggerJniCallback::Logv(const InfoLogLevel log_level,
+    const char* format, va_list ap) {
+  if (GetInfoLogLevel() <= log_level) {
+    JNIEnv* env = getJniEnv();
+
+    // determine InfoLogLevel java enum instance
+    jobject jlog_level;
+    switch (log_level) {
+      case rocksdb::InfoLogLevel::DEBUG_LEVEL:
+        jlog_level = InfoLogLevelJni::DEBUG_LEVEL(env);
+        break;
+      case rocksdb::InfoLogLevel::INFO_LEVEL:
+        jlog_level = InfoLogLevelJni::INFO_LEVEL(env);
+        break;
+      case rocksdb::InfoLogLevel::ERROR_LEVEL:
+        jlog_level = InfoLogLevelJni::ERROR_LEVEL(env);
+        break;
+      case rocksdb::InfoLogLevel::FATAL_LEVEL:
+        jlog_level = InfoLogLevelJni::FATAL_LEVEL(env);
+        break;
+      default:
+        jlog_level = InfoLogLevelJni::FATAL_LEVEL(env);
+        break;
+    }
+
+    // We try twice: the first time with a fixed-size stack allocated buffer,
+    // and the second time with a much larger dynamically allocated buffer.
+    char buffer[500];
+    for (int iter = 0; iter < 2; iter++) {
+      char* base;
+      int bufsize;
+      if (iter == 0) {
+        bufsize = sizeof(buffer);
+        base = buffer;
+      } else {
+        bufsize = 30000;
+        base = new char[bufsize];
+      }
+      char* p = base;
+      char* limit = base + bufsize;
+      // Print the message
+      if (p < limit) {
+        va_list backup_ap;
+        va_copy(backup_ap, ap);
+        p += vsnprintf(p, limit - p, format, backup_ap);
+        va_end(backup_ap);
+      }
+      // Truncate to available space if necessary
+      if (p >= limit) {
+        if (iter == 0) {
+          continue;       // Try again with larger buffer
+        } else {
+          p = limit - 1;
+        }
+      }
+      assert(p < limit);
+      *p++ = '\0';
+
+      // pass java string to callback handler
+      env->CallVoidMethod(
+          m_jLogger,
+          m_jLogMethodId,
+          jlog_level,
+          env->NewStringUTF(base));
+
+      if (base != buffer) {
+        delete[] base;
+      }
+      break;
+    }
+    m_jvm->DetachCurrentThread();
+  }
+}
+
+LoggerJniCallback::~LoggerJniCallback() {
+  JNIEnv* env = getJniEnv();
+  env->DeleteGlobalRef(m_jLogger);
+  m_jvm->DetachCurrentThread();
+}
+
+}  // namespace rocksdb
+
+/*
+ * Class:     org_rocksdb_Logger
+ * Method:    createNewLoggerOptions
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_Logger_createNewLoggerOptions(
+    JNIEnv* env, jobject jobj, jlong joptions) {
+  rocksdb::LoggerJniCallback* c =
+      new rocksdb::LoggerJniCallback(env, jobj);
+  // set log level
+  c->SetInfoLogLevel(reinterpret_cast<rocksdb::Options*>
+      (joptions)->info_log_level);
+  std::shared_ptr<rocksdb::LoggerJniCallback> *pLoggerJniCallback =
+      new std::shared_ptr<rocksdb::LoggerJniCallback>;
+  *pLoggerJniCallback = std::shared_ptr<rocksdb::LoggerJniCallback>(c);
+  rocksdb::LoggerJni::setHandle(env, jobj, pLoggerJniCallback);
+}
+
+/*
+ * Class:     org_rocksdb_Logger
+ * Method:    createNewLoggerDbOptions
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_Logger_createNewLoggerDbOptions(
+    JNIEnv* env, jobject jobj, jlong jdb_options) {
+  rocksdb::LoggerJniCallback* c =
+      new rocksdb::LoggerJniCallback(env, jobj);
+  // set log level
+  c->SetInfoLogLevel(reinterpret_cast<rocksdb::DBOptions*>
+      (jdb_options)->info_log_level);
+  std::shared_ptr<rocksdb::LoggerJniCallback> *pLoggerJniCallback =
+      new std::shared_ptr<rocksdb::LoggerJniCallback>;
+  *pLoggerJniCallback = std::shared_ptr<rocksdb::LoggerJniCallback>(c);
+  rocksdb::LoggerJni::setHandle(env, jobj, pLoggerJniCallback);
+}
+
+/*
+ * Class:     org_rocksdb_Logger
+ * Method:    setInfoLogLevel
+ * Signature: (JB)V
+ */
+void Java_org_rocksdb_Logger_setInfoLogLevel(
+    JNIEnv* env, jobject jobj, jlong jhandle, jbyte jlog_level) {
+  std::shared_ptr<rocksdb::LoggerJniCallback> *handle =
+      reinterpret_cast<std::shared_ptr<rocksdb::LoggerJniCallback> *>(jhandle);
+  (*handle)->SetInfoLogLevel(static_cast<rocksdb::InfoLogLevel>(jlog_level));
+}
+
+/*
+ * Class:     org_rocksdb_Logger
+ * Method:    infoLogLevel
+ * Signature: (J)B
+ */
+jbyte Java_org_rocksdb_Logger_infoLogLevel(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  std::shared_ptr<rocksdb::LoggerJniCallback> *handle =
+      reinterpret_cast<std::shared_ptr<rocksdb::LoggerJniCallback> *>(jhandle);
+  return static_cast<jbyte>((*handle)->GetInfoLogLevel());
+}
+
+/*
+ * Class:     org_rocksdb_Logger
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_Logger_disposeInternal(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  std::shared_ptr<rocksdb::LoggerJniCallback> *handle =
+      reinterpret_cast<std::shared_ptr<rocksdb::LoggerJniCallback> *>(jhandle);
+  handle->reset();
+}
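This callback routes the engine's info log through a Java object, filtering by level on the C++ side before crossing the JNI boundary. A sketch of the Java half, assuming the abstract org.rocksdb.Logger of this vintage, whose subclasses override log():

    import org.rocksdb.*;

    public class CustomLoggerExample {
        static { RocksDB.loadLibrary(); }

        public static void main(String[] args) throws RocksDBException {
            Options opts = new Options()
                .setCreateIfMissing(true)
                .setInfoLogLevel(InfoLogLevel.INFO_LEVEL);
            Logger logger = new Logger(opts) {   // the createNewLoggerOptions native
                @Override
                protected void log(InfoLogLevel level, String msg) {
                    System.out.println("[" + level + "] " + msg);
                }
            };
            opts.setLogger(logger);
            RocksDB db = RocksDB.open(opts, "/tmp/rocksdb-logged");
            db.dispose();
            logger.dispose();
            opts.dispose();
        }
    }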
diff --git a/src/rocksdb/java/rocksjni/loggerjnicallback.h b/src/rocksdb/java/rocksjni/loggerjnicallback.h
new file mode 100644
index 0000000..3936252
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/loggerjnicallback.h
@@ -0,0 +1,44 @@
+// Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// This file implements the callback "bridge" between Java and C++ for
+// rocksdb::Logger
+
+#ifndef JAVA_ROCKSJNI_LOGGERJNICALLBACK_H_
+#define JAVA_ROCKSJNI_LOGGERJNICALLBACK_H_
+
+#include <jni.h>
+#include <string>
+#include "port/port.h"
+#include "rocksdb/env.h"
+
+namespace rocksdb {
+
+  class LoggerJniCallback : public Logger {
+   public:
+     LoggerJniCallback(JNIEnv* env, jobject jLogger);
+     virtual ~LoggerJniCallback();
+
+     using Logger::SetInfoLogLevel;
+     using Logger::GetInfoLogLevel;
+     // Write an entry to the log file with the specified format.
+     virtual void Logv(const char* format, va_list ap);
+     // Write an entry to the log file with the specified log level
+     // and format.  Any log with level under the internal log level
+     // of *this (see @SetInfoLogLevel and @GetInfoLogLevel) will not be
+     // printed.
+     virtual void Logv(const InfoLogLevel log_level,
+         const char* format, va_list ap);
+
+   protected:
+     JNIEnv* getJniEnv() const;
+   private:
+     JavaVM* m_jvm;
+     jobject m_jLogger;
+     jmethodID m_jLogMethodId;
+  };
+}  // namespace rocksdb
+
+#endif  // JAVA_ROCKSJNI_LOGGERJNICALLBACK_H_
diff --git a/src/rocksdb/java/rocksjni/memtablejni.cc b/src/rocksdb/java/rocksjni/memtablejni.cc
new file mode 100644
index 0000000..ce27f97
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/memtablejni.cc
@@ -0,0 +1,90 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// This file implements the "bridge" between Java and C++ for MemTables.
+
+#include "rocksjni/portal.h"
+#include "include/org_rocksdb_HashSkipListMemTableConfig.h"
+#include "include/org_rocksdb_HashLinkedListMemTableConfig.h"
+#include "include/org_rocksdb_VectorMemTableConfig.h"
+#include "include/org_rocksdb_SkipListMemTableConfig.h"
+#include "rocksdb/memtablerep.h"
+
+/*
+ * Class:     org_rocksdb_HashSkipListMemTableConfig
+ * Method:    newMemTableFactoryHandle
+ * Signature: (JII)J
+ */
+jlong Java_org_rocksdb_HashSkipListMemTableConfig_newMemTableFactoryHandle(
+    JNIEnv* env, jobject jobj, jlong jbucket_count,
+    jint jheight, jint jbranching_factor) {
+  rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(jbucket_count);
+  if (s.ok()) {
+    return reinterpret_cast<jlong>(rocksdb::NewHashSkipListRepFactory(
+        static_cast<size_t>(jbucket_count),
+        static_cast<int32_t>(jheight),
+        static_cast<int32_t>(jbranching_factor)));
+  }
+  rocksdb::IllegalArgumentExceptionJni::ThrowNew(env, s);
+  return 0;
+}
+
+/*
+ * Class:     org_rocksdb_HashLinkedListMemTableConfig
+ * Method:    newMemTableFactoryHandle
+ * Signature: (JJIZI)J
+ */
+jlong Java_org_rocksdb_HashLinkedListMemTableConfig_newMemTableFactoryHandle(
+    JNIEnv* env, jobject jobj, jlong jbucket_count, jlong jhuge_page_tlb_size,
+    jint jbucket_entries_logging_threshold,
+    jboolean jif_log_bucket_dist_when_flush, jint jthreshold_use_skiplist) {
+  rocksdb::Status statusBucketCount =
+      rocksdb::check_if_jlong_fits_size_t(jbucket_count);
+  rocksdb::Status statusHugePageTlb =
+      rocksdb::check_if_jlong_fits_size_t(jhuge_page_tlb_size);
+  if (statusBucketCount.ok() && statusHugePageTlb.ok()) {
+    return reinterpret_cast<jlong>(rocksdb::NewHashLinkListRepFactory(
+        static_cast<size_t>(jbucket_count),
+        static_cast<size_t>(jhuge_page_tlb_size),
+        static_cast<int32_t>(jbucket_entries_logging_threshold),
+        static_cast<bool>(jif_log_bucket_dist_when_flush),
+        static_cast<int32_t>(jthreshold_use_skiplist)));
+  }
+  rocksdb::IllegalArgumentExceptionJni::ThrowNew(env,
+      !statusBucketCount.ok() ? statusBucketCount : statusHugePageTlb);
+  return 0;
+}
+
+/*
+ * Class:     org_rocksdb_VectorMemTableConfig
+ * Method:    newMemTableFactoryHandle
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_VectorMemTableConfig_newMemTableFactoryHandle(
+    JNIEnv* env, jobject jobj, jlong jreserved_size) {
+  rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(jreserved_size);
+  if (s.ok()) {
+    return reinterpret_cast<jlong>(new rocksdb::VectorRepFactory(
+        static_cast<size_t>(jreserved_size)));
+  }
+  rocksdb::IllegalArgumentExceptionJni::ThrowNew(env, s);
+  return 0;
+}
+
+/*
+ * Class:     org_rocksdb_SkipListMemTableConfig
+ * Method:    newMemTableFactoryHandle0
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_SkipListMemTableConfig_newMemTableFactoryHandle0(
+    JNIEnv* env, jobject jobj, jlong jlookahead) {
+  rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(jlookahead);
+  if (s.ok()) {
+    return reinterpret_cast<jlong>(new rocksdb::SkipListFactory(
+        static_cast<size_t>(jlookahead)));
+  }
+  rocksdb::IllegalArgumentExceptionJni::ThrowNew(env, s);
+  return 0;
+}
diff --git a/src/rocksdb/java/rocksjni/merge_operator.cc b/src/rocksdb/java/rocksjni/merge_operator.cc
new file mode 100644
index 0000000..68fe9b6
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/merge_operator.cc
@@ -0,0 +1,37 @@
+// Copyright (c) 2014, Vlad Balan (vlad.gm at gmail.com).  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// This file implements the "bridge" between Java and C++
+// for rocksdb::MergeOperator.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <jni.h>
+#include <string>
+#include <memory>
+
+#include "include/org_rocksdb_StringAppendOperator.h"
+#include "rocksjni/portal.h"
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/table.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/merge_operator.h"
+#include "utilities/merge_operators.h"
+
+/*
+ * Class:     org_rocksdb_StringAppendOperator
+ * Method:    newMergeOperatorHandleImpl
+ * Signature: ()J
+ */
+jlong Java_org_rocksdb_StringAppendOperator_newMergeOperatorHandleImpl(
+    JNIEnv* env, jobject jobj) {
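+  // heap-allocate a shared_ptr so its address can travel through Java as a
+  // jlong handle; setMergeOperator() later copies it into Options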
+  auto* op = new std::shared_ptr<rocksdb::MergeOperator>(
+      rocksdb::MergeOperators::CreateFromStringId("stringappend"));
+  return reinterpret_cast<jlong>(op);
+}
diff --git a/src/rocksdb/java/rocksjni/options.cc b/src/rocksdb/java/rocksjni/options.cc
new file mode 100644
index 0000000..216fa5e
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/options.cc
@@ -0,0 +1,4089 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// This file implements the "bridge" between Java and C++ for rocksdb::Options.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <jni.h>
+#include <strings.h>
+#include <memory>
+
+#include "include/org_rocksdb_Options.h"
+#include "include/org_rocksdb_DBOptions.h"
+#include "include/org_rocksdb_ColumnFamilyOptions.h"
+#include "include/org_rocksdb_WriteOptions.h"
+#include "include/org_rocksdb_ReadOptions.h"
+#include "include/org_rocksdb_ComparatorOptions.h"
+#include "include/org_rocksdb_FlushOptions.h"
+
+#include "rocksjni/comparatorjnicallback.h"
+#include "rocksjni/portal.h"
+
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/table.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/rate_limiter.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/merge_operator.h"
+#include "utilities/merge_operators.h"
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    newOptions
+ * Signature: ()V
+ */
+void Java_org_rocksdb_Options_newOptions__(JNIEnv* env, jobject jobj) {
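+  // ownership passes to the Java Options object; the instance is freed in
+  // Java_org_rocksdb_Options_disposeInternal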
+  rocksdb::Options* op = new rocksdb::Options();
+  rocksdb::OptionsJni::setHandle(env, jobj, op);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    newOptions
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_newOptions__JJ(JNIEnv* env, jobject jobj,
+    jlong jdboptions, jlong jcfoptions) {
+  auto dbOpt = reinterpret_cast<const rocksdb::DBOptions*>(jdboptions);
+  auto cfOpt = reinterpret_cast<const rocksdb::ColumnFamilyOptions*>(
+      jcfoptions);
+  rocksdb::Options* op = new rocksdb::Options(*dbOpt, *cfOpt);
+  rocksdb::OptionsJni::setHandle(env, jobj, op);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_Options_disposeInternal(
+    JNIEnv* env, jobject jobj, jlong handle) {
+  delete reinterpret_cast<rocksdb::Options*>(handle);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setIncreaseParallelism
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setIncreaseParallelism(
+    JNIEnv* env, jobject jobj, jlong jhandle, jint totalThreads) {
+  reinterpret_cast<rocksdb::Options*>
+      (jhandle)->IncreaseParallelism(static_cast<int>(totalThreads));
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setCreateIfMissing
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setCreateIfMissing(
+    JNIEnv* env, jobject jobj, jlong jhandle, jboolean flag) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->create_if_missing = flag;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    createIfMissing
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_createIfMissing(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->create_if_missing;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setCreateMissingColumnFamilies
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setCreateMissingColumnFamilies(
+    JNIEnv* env, jobject jobj, jlong jhandle, jboolean flag) {
+  reinterpret_cast<rocksdb::Options*>
+      (jhandle)->create_missing_column_families = flag;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    createMissingColumnFamilies
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_createMissingColumnFamilies(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>
+      (jhandle)->create_missing_column_families;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setComparatorHandle
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setComparatorHandle__JI(
+    JNIEnv* env, jobject jobj, jlong jhandle, jint builtinComparator) {
+  switch (builtinComparator) {
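+    // 1 selects the reverse-bytewise comparator; any other value falls
+    // back to the default bytewise comparator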
+    case 1:
+      reinterpret_cast<rocksdb::Options*>(jhandle)->comparator =
+          rocksdb::ReverseBytewiseComparator();
+      break;
+    default:
+      reinterpret_cast<rocksdb::Options*>(jhandle)->comparator =
+          rocksdb::BytewiseComparator();
+      break;
+  }
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setComparatorHandle
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setComparatorHandle__JJ(
+    JNIEnv* env, jobject jobj, jlong jopt_handle, jlong jcomparator_handle) {
+  reinterpret_cast<rocksdb::Options*>(jopt_handle)->comparator =
+      reinterpret_cast<rocksdb::Comparator*>(jcomparator_handle);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMergeOperatorName
+ * Signature: (JLjava/lang/String;)V
+ */
+void Java_org_rocksdb_Options_setMergeOperatorName(
+    JNIEnv* env, jobject jobj, jlong jhandle, jstring jop_name) {
+  auto options = reinterpret_cast<rocksdb::Options*>(jhandle);
+  const char* op_name = env->GetStringUTFChars(jop_name, 0);
+  options->merge_operator = rocksdb::MergeOperators::CreateFromStringId(
+        op_name);
+  env->ReleaseStringUTFChars(jop_name, op_name);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMergeOperator
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setMergeOperator(
+  JNIEnv* env, jobject jobj, jlong jhandle, jlong mergeOperatorHandle) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->merge_operator =
+    *(reinterpret_cast<std::shared_ptr<rocksdb::MergeOperator>*>
+      (mergeOperatorHandle));
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setWriteBufferSize
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setWriteBufferSize(
+    JNIEnv* env, jobject jobj, jlong jhandle, jlong jwrite_buffer_size) {
+  rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(jwrite_buffer_size);
+  if (s.ok()) {
+    reinterpret_cast<rocksdb::Options*>(jhandle)->write_buffer_size =
+        jwrite_buffer_size;
+  } else {
+    rocksdb::IllegalArgumentExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    writeBufferSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_writeBufferSize(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->write_buffer_size;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMaxWriteBufferNumber
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setMaxWriteBufferNumber(
+    JNIEnv* env, jobject jobj, jlong jhandle, jint jmax_write_buffer_number) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->max_write_buffer_number =
+          jmax_write_buffer_number;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    createStatistics
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_Options_createStatistics(
+    JNIEnv* env, jobject jobj, jlong jOptHandle) {
+  reinterpret_cast<rocksdb::Options*>(jOptHandle)->statistics =
+      rocksdb::CreateDBStatistics();
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    statisticsPtr
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_statisticsPtr(
+    JNIEnv* env, jobject jobj, jlong jOptHandle) {
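+  // the returned jlong is a non-owning raw pointer; the shared_ptr held by
+  // Options keeps the Statistics object alive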
+  auto st = reinterpret_cast<rocksdb::Options*>(jOptHandle)->statistics.get();
+  return reinterpret_cast<jlong>(st);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    maxWriteBufferNumber
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_maxWriteBufferNumber(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->max_write_buffer_number;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    errorIfExists
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_errorIfExists(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->error_if_exists;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setErrorIfExists
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setErrorIfExists(
+    JNIEnv* env, jobject jobj, jlong jhandle, jboolean error_if_exists) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->error_if_exists =
+      static_cast<bool>(error_if_exists);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    paranoidChecks
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_paranoidChecks(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->paranoid_checks;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setParanoidChecks
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setParanoidChecks(
+    JNIEnv* env, jobject jobj, jlong jhandle, jboolean paranoid_checks) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->paranoid_checks =
+      static_cast<bool>(paranoid_checks);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setEnv
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setEnv(
+    JNIEnv* env, jobject jobj, jlong jhandle, jlong jenv) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->env =
+      reinterpret_cast<rocksdb::Env*>(jenv);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMaxTotalWalSize
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setMaxTotalWalSize(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jlong jmax_total_wal_size) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->max_total_wal_size =
+      static_cast<uint64_t>(jmax_total_wal_size);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    maxTotalWalSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_maxTotalWalSize(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->
+      max_total_wal_size;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    maxOpenFiles
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_maxOpenFiles(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->max_open_files;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMaxOpenFiles
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setMaxOpenFiles(
+    JNIEnv* env, jobject jobj, jlong jhandle, jint max_open_files) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->max_open_files =
+      static_cast<int>(max_open_files);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    disableDataSync
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_disableDataSync(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->disableDataSync;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setDisableDataSync
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setDisableDataSync(
+    JNIEnv* env, jobject jobj, jlong jhandle, jboolean disableDataSync) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->disableDataSync =
+      static_cast<bool>(disableDataSync);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    useFsync
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_useFsync(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->use_fsync;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setUseFsync
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setUseFsync(
+    JNIEnv* env, jobject jobj, jlong jhandle, jboolean use_fsync) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->use_fsync =
+      static_cast<bool>(use_fsync);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    dbLogDir
+ * Signature: (J)Ljava/lang/String;
+ */
+jstring Java_org_rocksdb_Options_dbLogDir(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return env->NewStringUTF(
+      reinterpret_cast<rocksdb::Options*>(jhandle)->db_log_dir.c_str());
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setDbLogDir
+ * Signature: (JLjava/lang/String;)V
+ */
+void Java_org_rocksdb_Options_setDbLogDir(
+    JNIEnv* env, jobject jobj, jlong jhandle, jstring jdb_log_dir) {
+  const char* log_dir = env->GetStringUTFChars(jdb_log_dir, 0);
+  reinterpret_cast<rocksdb::Options*>(jhandle)->db_log_dir.assign(log_dir);
+  env->ReleaseStringUTFChars(jdb_log_dir, log_dir);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    walDir
+ * Signature: (J)Ljava/lang/String;
+ */
+jstring Java_org_rocksdb_Options_walDir(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return env->NewStringUTF(
+      reinterpret_cast<rocksdb::Options*>(jhandle)->wal_dir.c_str());
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setWalDir
+ * Signature: (JLjava/lang/String;)V
+ */
+void Java_org_rocksdb_Options_setWalDir(
+    JNIEnv* env, jobject jobj, jlong jhandle, jstring jwal_dir) {
+  const char* wal_dir = env->GetStringUTFChars(jwal_dir, 0);
+  reinterpret_cast<rocksdb::Options*>(jhandle)->wal_dir.assign(wal_dir);
+  env->ReleaseStringUTFChars(jwal_dir, wal_dir);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    deleteObsoleteFilesPeriodMicros
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_deleteObsoleteFilesPeriodMicros(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)
+      ->delete_obsolete_files_period_micros;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setDeleteObsoleteFilesPeriodMicros
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setDeleteObsoleteFilesPeriodMicros(
+    JNIEnv* env, jobject jobj, jlong jhandle, jlong micros) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)
+      ->delete_obsolete_files_period_micros =
+          static_cast<int64_t>(micros);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    maxBackgroundCompactions
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_maxBackgroundCompactions(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(
+      jhandle)->max_background_compactions;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMaxBackgroundCompactions
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setMaxBackgroundCompactions(
+    JNIEnv* env, jobject jobj, jlong jhandle, jint max) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)
+      ->max_background_compactions = static_cast<int>(max);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    maxBackgroundFlushes
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_maxBackgroundFlushes(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->max_background_flushes;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMaxBackgroundFlushes
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setMaxBackgroundFlushes(
+    JNIEnv* env, jobject jobj, jlong jhandle, jint max_background_flushes) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->max_background_flushes =
+      static_cast<int>(max_background_flushes);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    maxLogFileSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_maxLogFileSize(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->max_log_file_size;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMaxLogFileSize
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setMaxLogFileSize(
+    JNIEnv* env, jobject jobj, jlong jhandle, jlong max_log_file_size) {
+  rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(max_log_file_size);
+  if (s.ok()) {
+    reinterpret_cast<rocksdb::Options*>(jhandle)->max_log_file_size =
+        max_log_file_size;
+  } else {
+    rocksdb::IllegalArgumentExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    logFileTimeToRoll
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_logFileTimeToRoll(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->log_file_time_to_roll;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setLogFileTimeToRoll
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setLogFileTimeToRoll(
+    JNIEnv* env, jobject jobj, jlong jhandle, jlong log_file_time_to_roll) {
+  rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(
+      log_file_time_to_roll);
+  if (s.ok()) {
+    reinterpret_cast<rocksdb::Options*>(jhandle)->log_file_time_to_roll =
+        log_file_time_to_roll;
+  } else {
+    rocksdb::IllegalArgumentExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    keepLogFileNum
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_keepLogFileNum(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->keep_log_file_num;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setKeepLogFileNum
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setKeepLogFileNum(
+    JNIEnv* env, jobject jobj, jlong jhandle, jlong keep_log_file_num) {
+  rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(keep_log_file_num);
+  if (s.ok()) {
+    reinterpret_cast<rocksdb::Options*>(jhandle)->keep_log_file_num =
+        keep_log_file_num;
+  } else {
+    rocksdb::IllegalArgumentExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    maxManifestFileSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_maxManifestFileSize(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->max_manifest_file_size;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    memTableFactoryName
+ * Signature: (J)Ljava/lang/String;
+ */
+jstring Java_org_rocksdb_Options_memTableFactoryName(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  auto opt = reinterpret_cast<rocksdb::Options*>(jhandle);
+  rocksdb::MemTableRepFactory* tf = opt->memtable_factory.get();
+
+  // Should never be nullptr.
+  // Default memtable factory is SkipListFactory
+  assert(tf);
+
+  // temporary fix for the historical typo in the factory name
+  if (strcmp(tf->Name(), "HashLinkListRepFactory") == 0) {
+    return env->NewStringUTF("HashLinkedListRepFactory");
+  }
+
+  return env->NewStringUTF(tf->Name());
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMaxManifestFileSize
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setMaxManifestFileSize(
+    JNIEnv* env, jobject jobj, jlong jhandle, jlong max_manifest_file_size) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->max_manifest_file_size =
+      static_cast<int64_t>(max_manifest_file_size);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMemTableFactory
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setMemTableFactory(
+    JNIEnv* env, jobject jobj, jlong jhandle, jlong jfactory_handle) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->memtable_factory.reset(
+      reinterpret_cast<rocksdb::MemTableRepFactory*>(jfactory_handle));
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setRateLimiter
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setRateLimiter(
+    JNIEnv* env, jobject jobj, jlong jhandle, jlong jrate_limiter_handle) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->rate_limiter.reset(
+      reinterpret_cast<rocksdb::RateLimiter*>(jrate_limiter_handle));
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setLogger
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setLogger(
+    JNIEnv* env, jobject jobj, jlong jhandle, jlong jlogger_handle) {
+  std::shared_ptr<rocksdb::LoggerJniCallback> *pLogger =
+      reinterpret_cast<std::shared_ptr<rocksdb::LoggerJniCallback> *>(
+          jlogger_handle);
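+  // copying the shared_ptr means Options shares ownership of the callback
+  // with the Java Logger object that owns jlogger_handle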
+  reinterpret_cast<rocksdb::Options*>(jhandle)->info_log = *pLogger;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setInfoLogLevel
+ * Signature: (JB)V
+ */
+void Java_org_rocksdb_Options_setInfoLogLevel(
+    JNIEnv* env, jobject jobj, jlong jhandle, jbyte jlog_level) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->info_log_level =
+      static_cast<rocksdb::InfoLogLevel>(jlog_level);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    infoLogLevel
+ * Signature: (J)B
+ */
+jbyte Java_org_rocksdb_Options_infoLogLevel(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return static_cast<jbyte>(
+      reinterpret_cast<rocksdb::Options*>(jhandle)->info_log_level);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    tableCacheNumshardbits
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_tableCacheNumshardbits(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->table_cache_numshardbits;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setTableCacheNumshardbits
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setTableCacheNumshardbits(
+    JNIEnv* env, jobject jobj, jlong jhandle, jint table_cache_numshardbits) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->table_cache_numshardbits =
+      static_cast<int>(table_cache_numshardbits);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    useFixedLengthPrefixExtractor
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_useFixedLengthPrefixExtractor(
+    JNIEnv* env, jobject jobj, jlong jhandle, jint jprefix_length) {
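+  // prefix_extractor is a shared_ptr, so Options takes ownership of the
+  // newly created transform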
+  reinterpret_cast<rocksdb::Options*>(jhandle)->prefix_extractor.reset(
+      rocksdb::NewFixedPrefixTransform(
+          static_cast<int>(jprefix_length)));
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    useCappedPrefixExtractor
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_useCappedPrefixExtractor(
+    JNIEnv* env, jobject jobj, jlong jhandle, jint jprefix_length) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->prefix_extractor.reset(
+      rocksdb::NewCappedPrefixTransform(
+          static_cast<int>(jprefix_length)));
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    walTtlSeconds
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_walTtlSeconds(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->WAL_ttl_seconds;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setWalTtlSeconds
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setWalTtlSeconds(
+    JNIEnv* env, jobject jobj, jlong jhandle, jlong WAL_ttl_seconds) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->WAL_ttl_seconds =
+      static_cast<int64_t>(WAL_ttl_seconds);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    walSizeLimitMB
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_walSizeLimitMB(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->WAL_size_limit_MB;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setWalSizeLimitMB
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setWalSizeLimitMB(
+    JNIEnv* env, jobject jobj, jlong jhandle, jlong WAL_size_limit_MB) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->WAL_size_limit_MB =
+      static_cast<int64_t>(WAL_size_limit_MB);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    manifestPreallocationSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_manifestPreallocationSize(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)
+      ->manifest_preallocation_size;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setManifestPreallocationSize
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setManifestPreallocationSize(
+    JNIEnv* env, jobject jobj, jlong jhandle, jlong preallocation_size) {
+  rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(preallocation_size);
+  if (s.ok()) {
+    reinterpret_cast<rocksdb::Options*>(jhandle)->manifest_preallocation_size =
+        preallocation_size;
+  } else {
+    rocksdb::IllegalArgumentExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    allowOsBuffer
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_allowOsBuffer(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->allow_os_buffer;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setAllowOsBuffer
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setAllowOsBuffer(
+    JNIEnv* env, jobject jobj, jlong jhandle, jboolean allow_os_buffer) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->allow_os_buffer =
+      static_cast<bool>(allow_os_buffer);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setTableFactory
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setTableFactory(
+    JNIEnv* env, jobject jobj, jlong jhandle, jlong jfactory_handle) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->table_factory.reset(
+      reinterpret_cast<rocksdb::TableFactory*>(jfactory_handle));
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    allowMmapReads
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_allowMmapReads(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->allow_mmap_reads;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setAllowMmapReads
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setAllowMmapReads(
+    JNIEnv* env, jobject jobj, jlong jhandle, jboolean allow_mmap_reads) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->allow_mmap_reads =
+      static_cast<bool>(allow_mmap_reads);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    allowMmapWrites
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_allowMmapWrites(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->allow_mmap_writes;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setAllowMmapWrites
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setAllowMmapWrites(
+    JNIEnv* env, jobject jobj, jlong jhandle, jboolean allow_mmap_writes) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->allow_mmap_writes =
+      static_cast<bool>(allow_mmap_writes);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    isFdCloseOnExec
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_isFdCloseOnExec(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->is_fd_close_on_exec;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setIsFdCloseOnExec
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setIsFdCloseOnExec(
+    JNIEnv* env, jobject jobj, jlong jhandle, jboolean is_fd_close_on_exec) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->is_fd_close_on_exec =
+      static_cast<bool>(is_fd_close_on_exec);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    statsDumpPeriodSec
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_statsDumpPeriodSec(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->stats_dump_period_sec;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setStatsDumpPeriodSec
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setStatsDumpPeriodSec(
+    JNIEnv* env, jobject jobj, jlong jhandle, jint stats_dump_period_sec) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->stats_dump_period_sec =
+      static_cast<int>(stats_dump_period_sec);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    adviseRandomOnOpen
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_adviseRandomOnOpen(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->advise_random_on_open;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setAdviseRandomOnOpen
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setAdviseRandomOnOpen(
+    JNIEnv* env, jobject jobj, jlong jhandle, jboolean advise_random_on_open) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->advise_random_on_open =
+      static_cast<bool>(advise_random_on_open);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    useAdaptiveMutex
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_useAdaptiveMutex(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->use_adaptive_mutex;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setUseAdaptiveMutex
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setUseAdaptiveMutex(
+    JNIEnv* env, jobject jobj, jlong jhandle, jboolean use_adaptive_mutex) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->use_adaptive_mutex =
+      static_cast<bool>(use_adaptive_mutex);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    bytesPerSync
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_bytesPerSync(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->bytes_per_sync;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setBytesPerSync
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setBytesPerSync(
+    JNIEnv* env, jobject jobj, jlong jhandle, jlong bytes_per_sync) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->bytes_per_sync =
+      static_cast<int64_t>(bytes_per_sync);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    tableFactoryName
+ * Signature: (J)Ljava/lang/String;
+ */
+jstring Java_org_rocksdb_Options_tableFactoryName(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  auto opt = reinterpret_cast<rocksdb::Options*>(jhandle);
+  rocksdb::TableFactory* tf = opt->table_factory.get();
+
+  // Should never be nullptr.
+  // Default table factory is BlockBasedTableFactory.
+  assert(tf);
+
+  return env->NewStringUTF(tf->Name());
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    minWriteBufferNumberToMerge
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_minWriteBufferNumberToMerge(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(
+      jhandle)->min_write_buffer_number_to_merge;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMinWriteBufferNumberToMerge
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setMinWriteBufferNumberToMerge(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jint jmin_write_buffer_number_to_merge) {
+  reinterpret_cast<rocksdb::Options*>(
+      jhandle)->min_write_buffer_number_to_merge =
+          static_cast<int>(jmin_write_buffer_number_to_merge);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    maxWriteBufferNumberToMaintain
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_maxWriteBufferNumberToMaintain(JNIEnv* env,
+                                                             jobject jobj,
+                                                             jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)
+      ->max_write_buffer_number_to_maintain;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMaxWriteBufferNumberToMaintain
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setMaxWriteBufferNumberToMaintain(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jint jmax_write_buffer_number_to_maintain) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)
+      ->max_write_buffer_number_to_maintain =
+      static_cast<int>(jmax_write_buffer_number_to_maintain);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setCompressionType
+ * Signature: (JB)V
+ */
+void Java_org_rocksdb_Options_setCompressionType(
+    JNIEnv* env, jobject jobj, jlong jhandle, jbyte compression) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->compression =
+      static_cast<rocksdb::CompressionType>(compression);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    compressionType
+ * Signature: (J)B
+ */
+jbyte Java_org_rocksdb_Options_compressionType(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->compression;
+}
+
+/*
+ * Helper method to convert a Java list to a CompressionType
+ * vector.
+ */
+std::vector<rocksdb::CompressionType> rocksdb_compression_vector_helper(
+    JNIEnv* env, jobject jcompressionLevels) {
+  std::vector<rocksdb::CompressionType> compressionLevels;
+  // iterate over compressionLevels
+  jobject iteratorObj = env->CallObjectMethod(
+        jcompressionLevels, rocksdb::ListJni::getIteratorMethod(env));
+  while (env->CallBooleanMethod(
+    iteratorObj, rocksdb::ListJni::getHasNextMethod(env)) == JNI_TRUE) {
+    // get compression
+    jobject jcompression_obj = env->CallObjectMethod(iteratorObj,
+        rocksdb::ListJni::getNextMethod(env));
+    jbyte jcompression = env->CallByteMethod(jcompression_obj,
+        rocksdb::ByteJni::getByteValueMethod(env));
+    compressionLevels.push_back(static_cast<rocksdb::CompressionType>(
+        jcompression));
+  }
+  return compressionLevels;
+}
+
+/*
+ * Helper method to convert a CompressionType vector to a Java
+ * List.
+ */
+jobject rocksdb_compression_list_helper(JNIEnv* env,
+    std::vector<rocksdb::CompressionType> compressionLevels) {
+  jclass jListClazz = env->FindClass("java/util/ArrayList");
+  jmethodID midList = rocksdb::ListJni::getArrayListConstructorMethodId(
+      env, jListClazz);
+  jobject jcompressionLevels = env->NewObject(jListClazz,
+    midList, compressionLevels.size());
+  // look up java/lang/Byte and its constructor once, outside the loop
+  jclass jByteClazz = env->FindClass("java/lang/Byte");
+  jmethodID midByte = env->GetMethodID(jByteClazz, "<init>", "(B)V");
+  // box each CompressionType as a Byte and append it to the Java list
+  for (std::vector<rocksdb::CompressionType>::size_type i = 0;
+        i != compressionLevels.size(); i++) {
+    jobject obj = env->NewObject(jByteClazz, midByte,
+        compressionLevels[i]);
+    env->CallBooleanMethod(jcompressionLevels,
+        rocksdb::ListJni::getListAddMethodId(env), obj);
+  }
+  return jcompressionLevels;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setCompressionPerLevel
+ * Signature: (JLjava/util/List;)V
+ */
+void Java_org_rocksdb_Options_setCompressionPerLevel(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jobject jcompressionLevels) {
+  auto* options = reinterpret_cast<rocksdb::Options*>(jhandle);
+  std::vector<rocksdb::CompressionType> compressionLevels =
+      rocksdb_compression_vector_helper(env, jcompressionLevels);
+  options->compression_per_level = compressionLevels;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    compressionPerLevel
+ * Signature: (J)Ljava/util/List;
+ */
+jobject Java_org_rocksdb_Options_compressionPerLevel(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  auto* options = reinterpret_cast<rocksdb::Options*>(jhandle);
+  return rocksdb_compression_list_helper(env,
+      options->compression_per_level);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setCompactionStyle
+ * Signature: (JB)V
+ */
+void Java_org_rocksdb_Options_setCompactionStyle(
+    JNIEnv* env, jobject jobj, jlong jhandle, jbyte compaction_style) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->compaction_style =
+      static_cast<rocksdb::CompactionStyle>(compaction_style);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    compactionStyle
+ * Signature: (J)B
+ */
+jbyte Java_org_rocksdb_Options_compactionStyle(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->compaction_style;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMaxTableFilesSizeFIFO
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setMaxTableFilesSizeFIFO(
+    JNIEnv* env, jobject jobj, jlong jhandle, jlong jmax_table_files_size) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)
+      ->compaction_options_fifo.max_table_files_size =
+          static_cast<uint64_t>(jmax_table_files_size);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    maxTableFilesSizeFIFO
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_maxTableFilesSizeFIFO(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)
+      ->compaction_options_fifo.max_table_files_size;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    numLevels
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_numLevels(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->num_levels;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setNumLevels
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setNumLevels(
+    JNIEnv* env, jobject jobj, jlong jhandle, jint jnum_levels) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->num_levels =
+      static_cast<int>(jnum_levels);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    levelZeroFileNumCompactionTrigger
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_levelZeroFileNumCompactionTrigger(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(
+      jhandle)->level0_file_num_compaction_trigger;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setLevelZeroFileNumCompactionTrigger
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setLevelZeroFileNumCompactionTrigger(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jint jlevel0_file_num_compaction_trigger) {
+  reinterpret_cast<rocksdb::Options*>(
+      jhandle)->level0_file_num_compaction_trigger =
+          static_cast<int>(jlevel0_file_num_compaction_trigger);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    levelZeroSlowdownWritesTrigger
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_levelZeroSlowdownWritesTrigger(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(
+      jhandle)->level0_slowdown_writes_trigger;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setLevelZeroSlowdownWritesTrigger
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setLevelZeroSlowdownWritesTrigger(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jint jlevel0_slowdown_writes_trigger) {
+  reinterpret_cast<rocksdb::Options*>(
+      jhandle)->level0_slowdown_writes_trigger =
+          static_cast<int>(jlevel0_slowdown_writes_trigger);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    levelZeroStopWritesTrigger
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_levelZeroStopWritesTrigger(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(
+      jhandle)->level0_stop_writes_trigger;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setLevelZeroStopWritesTrigger
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setLevelZeroStopWritesTrigger(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jint jlevel0_stop_writes_trigger) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->level0_stop_writes_trigger =
+      static_cast<int>(jlevel0_stop_writes_trigger);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    targetFileSizeBase
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_targetFileSizeBase(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->target_file_size_base;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setTargetFileSizeBase
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setTargetFileSizeBase(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jlong jtarget_file_size_base) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->target_file_size_base =
+      static_cast<uint64_t>(jtarget_file_size_base);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    targetFileSizeMultiplier
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_targetFileSizeMultiplier(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(
+      jhandle)->target_file_size_multiplier;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setTargetFileSizeMultiplier
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setTargetFileSizeMultiplier(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jint jtarget_file_size_multiplier) {
+  reinterpret_cast<rocksdb::Options*>(
+      jhandle)->target_file_size_multiplier =
+          static_cast<int>(jtarget_file_size_multiplier);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    maxBytesForLevelBase
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_maxBytesForLevelBase(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(
+      jhandle)->max_bytes_for_level_base;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMaxBytesForLevelBase
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setMaxBytesForLevelBase(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jlong jmax_bytes_for_level_base) {
+  reinterpret_cast<rocksdb::Options*>(
+      jhandle)->max_bytes_for_level_base =
+          static_cast<int64_t>(jmax_bytes_for_level_base);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    levelCompactionDynamicLevelBytes
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_levelCompactionDynamicLevelBytes(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(
+      jhandle)->level_compaction_dynamic_level_bytes;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setLevelCompactionDynamicLevelBytes
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setLevelCompactionDynamicLevelBytes(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jboolean jenable_dynamic_level_bytes) {
+  reinterpret_cast<rocksdb::Options*>(
+      jhandle)->level_compaction_dynamic_level_bytes =
+          static_cast<bool>(jenable_dynamic_level_bytes);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    maxBytesForLevelMultiplier
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_maxBytesForLevelMultiplier(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(
+      jhandle)->max_bytes_for_level_multiplier;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMaxBytesForLevelMultiplier
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setMaxBytesForLevelMultiplier(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jint jmax_bytes_for_level_multiplier) {
+  reinterpret_cast<rocksdb::Options*>(
+      jhandle)->max_bytes_for_level_multiplier =
+          static_cast<int>(jmax_bytes_for_level_multiplier);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    expandedCompactionFactor
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_expandedCompactionFactor(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(
+      jhandle)->expanded_compaction_factor;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setExpandedCompactionFactor
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setExpandedCompactionFactor(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jint jexpanded_compaction_factor) {
+  reinterpret_cast<rocksdb::Options*>(
+      jhandle)->expanded_compaction_factor =
+          static_cast<int>(jexpanded_compaction_factor);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    sourceCompactionFactor
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_sourceCompactionFactor(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(
+      jhandle)->source_compaction_factor;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setSourceCompactionFactor
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setSourceCompactionFactor(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jint jsource_compaction_factor) {
+  reinterpret_cast<rocksdb::Options*>(
+      jhandle)->source_compaction_factor =
+          static_cast<int>(jsource_compaction_factor);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    maxGrandparentOverlapFactor
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_maxGrandparentOverlapFactor(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(
+      jhandle)->max_grandparent_overlap_factor;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMaxGrandparentOverlapFactor
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setMaxGrandparentOverlapFactor(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jint jmax_grandparent_overlap_factor) {
+  reinterpret_cast<rocksdb::Options*>(
+      jhandle)->max_grandparent_overlap_factor =
+          static_cast<int>(jmax_grandparent_overlap_factor);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    softRateLimit
+ * Signature: (J)D
+ */
+jdouble Java_org_rocksdb_Options_softRateLimit(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->soft_rate_limit;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setSoftRateLimit
+ * Signature: (JD)V
+ */
+void Java_org_rocksdb_Options_setSoftRateLimit(
+    JNIEnv* env, jobject jobj, jlong jhandle, jdouble jsoft_rate_limit) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->soft_rate_limit =
+      static_cast<double>(jsoft_rate_limit);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    hardRateLimit
+ * Signature: (J)D
+ */
+jdouble Java_org_rocksdb_Options_hardRateLimit(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->hard_rate_limit;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setHardRateLimit
+ * Signature: (JD)V
+ */
+void Java_org_rocksdb_Options_setHardRateLimit(
+    JNIEnv* env, jobject jobj, jlong jhandle, jdouble jhard_rate_limit) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->hard_rate_limit =
+      static_cast<double>(jhard_rate_limit);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    rateLimitDelayMaxMilliseconds
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_rateLimitDelayMaxMilliseconds(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(
+      jhandle)->rate_limit_delay_max_milliseconds;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setRateLimitDelayMaxMilliseconds
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setRateLimitDelayMaxMilliseconds(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jint jrate_limit_delay_max_milliseconds) {
+  reinterpret_cast<rocksdb::Options*>(
+      jhandle)->rate_limit_delay_max_milliseconds =
+          static_cast<int>(jrate_limit_delay_max_milliseconds);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    arenaBlockSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_arenaBlockSize(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->arena_block_size;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setArenaBlockSize
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setArenaBlockSize(
+    JNIEnv* env, jobject jobj, jlong jhandle, jlong jarena_block_size) {
+  rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(jarena_block_size);
+  if (s.ok()) {
+    reinterpret_cast<rocksdb::Options*>(jhandle)->arena_block_size =
+        jarena_block_size;
+  } else {
+    rocksdb::IllegalArgumentExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    disableAutoCompactions
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_disableAutoCompactions(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(
+      jhandle)->disable_auto_compactions;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setDisableAutoCompactions
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setDisableAutoCompactions(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jboolean jdisable_auto_compactions) {
+  reinterpret_cast<rocksdb::Options*>(
+      jhandle)->disable_auto_compactions =
+          static_cast<bool>(jdisable_auto_compactions);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    purgeRedundantKvsWhileFlush
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_purgeRedundantKvsWhileFlush(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(
+      jhandle)->purge_redundant_kvs_while_flush;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setPurgeRedundantKvsWhileFlush
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setPurgeRedundantKvsWhileFlush(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jboolean jpurge_redundant_kvs_while_flush) {
+  reinterpret_cast<rocksdb::Options*>(
+      jhandle)->purge_redundant_kvs_while_flush =
+          static_cast<bool>(jpurge_redundant_kvs_while_flush);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    verifyChecksumsInCompaction
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_verifyChecksumsInCompaction(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(
+      jhandle)->verify_checksums_in_compaction;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setVerifyChecksumsInCompaction
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setVerifyChecksumsInCompaction(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jboolean jverify_checksums_in_compaction) {
+  reinterpret_cast<rocksdb::Options*>(
+      jhandle)->verify_checksums_in_compaction =
+          static_cast<bool>(jverify_checksums_in_compaction);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    filterDeletes
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_filterDeletes(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->filter_deletes;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setFilterDeletes
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setFilterDeletes(
+    JNIEnv* env, jobject jobj, jlong jhandle, jboolean jfilter_deletes) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->filter_deletes =
+      static_cast<bool>(jfilter_deletes);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    maxSequentialSkipInIterations
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_maxSequentialSkipInIterations(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(
+      jhandle)->max_sequential_skip_in_iterations;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMaxSequentialSkipInIterations
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setMaxSequentialSkipInIterations(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jlong jmax_sequential_skip_in_iterations) {
+  reinterpret_cast<rocksdb::Options*>(
+      jhandle)->max_sequential_skip_in_iterations =
+          static_cast<int64_t>(jmax_sequential_skip_in_iterations);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    inplaceUpdateSupport
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_inplaceUpdateSupport(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(
+      jhandle)->inplace_update_support;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setInplaceUpdateSupport
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setInplaceUpdateSupport(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jboolean jinplace_update_support) {
+  reinterpret_cast<rocksdb::Options*>(
+      jhandle)->inplace_update_support =
+          static_cast<bool>(jinplace_update_support);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    inplaceUpdateNumLocks
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_inplaceUpdateNumLocks(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(
+      jhandle)->inplace_update_num_locks;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setInplaceUpdateNumLocks
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setInplaceUpdateNumLocks(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jlong jinplace_update_num_locks) {
+  rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(
+      jinplace_update_num_locks);
+  if (s.ok()) {
+    reinterpret_cast<rocksdb::Options*>(jhandle)->inplace_update_num_locks =
+        jinplace_update_num_locks;
+  } else {
+    rocksdb::IllegalArgumentExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    memtablePrefixBloomBits
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_memtablePrefixBloomBits(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(
+      jhandle)->memtable_prefix_bloom_bits;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMemtablePrefixBloomBits
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setMemtablePrefixBloomBits(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jint jmemtable_prefix_bloom_bits) {
+  reinterpret_cast<rocksdb::Options*>(
+      jhandle)->memtable_prefix_bloom_bits =
+          static_cast<int32_t>(jmemtable_prefix_bloom_bits);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    memtablePrefixBloomProbes
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_memtablePrefixBloomProbes(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(
+      jhandle)->memtable_prefix_bloom_probes;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMemtablePrefixBloomProbes
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setMemtablePrefixBloomProbes(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jint jmemtable_prefix_bloom_probes) {
+  reinterpret_cast<rocksdb::Options*>(
+      jhandle)->memtable_prefix_bloom_probes =
+          static_cast<int32_t>(jmemtable_prefix_bloom_probes);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    bloomLocality
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_bloomLocality(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->bloom_locality;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setBloomLocality
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setBloomLocality(
+    JNIEnv* env, jobject jobj, jlong jhandle, jint jbloom_locality) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->bloom_locality =
+      static_cast<int32_t>(jbloom_locality);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    maxSuccessiveMerges
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Options_maxSuccessiveMerges(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->max_successive_merges;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMaxSuccessiveMerges
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_setMaxSuccessiveMerges(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jlong jmax_successive_merges) {
+  rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(
+      jmax_successive_merges);
+  if (s.ok()) {
+    reinterpret_cast<rocksdb::Options*>(jhandle)->max_successive_merges =
+        jmax_successive_merges;
+  } else {
+    rocksdb::IllegalArgumentExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    minPartialMergeOperands
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_Options_minPartialMergeOperands(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(
+      jhandle)->min_partial_merge_operands;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setMinPartialMergeOperands
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_Options_setMinPartialMergeOperands(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jint jmin_partial_merge_operands) {
+  reinterpret_cast<rocksdb::Options*>(
+      jhandle)->min_partial_merge_operands =
+          static_cast<int32_t>(jmin_partial_merge_operands);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    optimizeFiltersForHits
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Options_optimizeFiltersForHits(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(
+      jhandle)->optimize_filters_for_hits;
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    setOptimizeFiltersForHits
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_Options_setOptimizeFiltersForHits(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jboolean joptimize_filters_for_hits) {
+  reinterpret_cast<rocksdb::Options*>(
+      jhandle)->optimize_filters_for_hits =
+          static_cast<bool>(joptimize_filters_for_hits);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    optimizeForPointLookup
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_optimizeForPointLookup(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jlong block_cache_size_mb) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->
+      OptimizeForPointLookup(block_cache_size_mb);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    optimizeLevelStyleCompaction
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_optimizeLevelStyleCompaction(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jlong memtable_memory_budget) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->
+      OptimizeLevelStyleCompaction(memtable_memory_budget);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    optimizeUniversalStyleCompaction
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Options_optimizeUniversalStyleCompaction(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jlong memtable_memory_budget) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->
+      OptimizeUniversalStyleCompaction(memtable_memory_budget);
+}
+
+/*
+ * Class:     org_rocksdb_Options
+ * Method:    prepareForBulkLoad
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_Options_prepareForBulkLoad(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->
+      PrepareForBulkLoad();
+}
+
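+// Note on the pattern used throughout this file: each Java method passes
+// the native object's address as a jlong handle, which is
+// reinterpret_cast back to the owning C++ type. A minimal Java-side
+// sketch of how such a handle is usually carried (hypothetical names,
+// for illustration only):
+//
+//   // nativeHandle_ stores the C++ Options* created by the native
+//   // constructor; every wrapper method passes it back down.
+//   public Options setMaxSuccessiveMerges(final long n) {
+//     setMaxSuccessiveMerges(nativeHandle_, n);  // native (JJ)V entry
+//     return this;
+//   }
+//   private native void setMaxSuccessiveMerges(long handle, long n);
+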
+//////////////////////////////////////////////////////////////////////////////
+// rocksdb::ColumnFamilyOptions
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    newColumnFamilyOptions
+ * Signature: ()V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_newColumnFamilyOptions(
+    JNIEnv* env, jobject jobj) {
+  rocksdb::ColumnFamilyOptions* op = new rocksdb::ColumnFamilyOptions();
+  rocksdb::ColumnFamilyOptionsJni::setHandle(env, jobj, op);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    getColumnFamilyOptionsFromProps
+ * Signature: (Ljava/lang/String;)J
+ */
+jlong Java_org_rocksdb_ColumnFamilyOptions_getColumnFamilyOptionsFromProps(
+    JNIEnv* env, jclass jclazz, jstring jopt_string) {
+  jlong ret_value = 0;
+  rocksdb::ColumnFamilyOptions* cf_options =
+      new rocksdb::ColumnFamilyOptions();
+  const char* opt_string = env->GetStringUTFChars(jopt_string, 0);
+  rocksdb::Status status = rocksdb::GetColumnFamilyOptionsFromString(
+      rocksdb::ColumnFamilyOptions(), opt_string, cf_options);
+  env->ReleaseStringUTFChars(jopt_string, opt_string);
+  // Check if ColumnFamilyOptions creation was possible.
+  if (status.ok()) {
+    ret_value = reinterpret_cast<jlong>(cf_options);
+  } else {
+    // If the operation failed, the ColumnFamilyOptions must be deleted
+    // to prevent a memory leak.
+    delete cf_options;
+  }
+  return ret_value;
+}
+
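+// GetColumnFamilyOptionsFromString (used above) parses a
+// semicolon-separated "key=value" string. A plausible input, assuming
+// option names match the C++ fields exposed elsewhere in this file
+// (not verified here):
+//
+//   "write_buffer_size=4194304;max_write_buffer_number=4"
+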
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_disposeInternal(
+    JNIEnv* env, jobject jobj, jlong handle) {
+  delete reinterpret_cast<rocksdb::ColumnFamilyOptions*>(handle);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    optimizeForPointLookup
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_optimizeForPointLookup(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jlong block_cache_size_mb) {
+  reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)->
+      OptimizeForPointLookup(block_cache_size_mb);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    optimizeLevelStyleCompaction
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_optimizeLevelStyleCompaction(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jlong memtable_memory_budget) {
+  reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)->
+      OptimizeLevelStyleCompaction(memtable_memory_budget);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    optimizeUniversalStyleCompaction
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_optimizeUniversalStyleCompaction(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jlong memtable_memory_budget) {
+  reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)->
+      OptimizeUniversalStyleCompaction(memtable_memory_budget);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setComparatorHandle
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setComparatorHandle__JI(
+    JNIEnv* env, jobject jobj, jlong jhandle, jint builtinComparator) {
+  switch (builtinComparator) {
+    case 1:
+      reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)->comparator =
+          rocksdb::ReverseBytewiseComparator();
+      break;
+    default:
+      reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)->comparator =
+          rocksdb::BytewiseComparator();
+      break;
+  }
+}
+
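+// In the switch above, a builtinComparator value of 1 selects the
+// reverse bytewise comparator and any other value falls back to the
+// default bytewise comparator; presumably the Java side passes the
+// ordinal of a builtin-comparator enum here (assumption, not verified).
+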
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setComparatorHandle
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setComparatorHandle__JJ(
+    JNIEnv* env, jobject jobj, jlong jopt_handle, jlong jcomparator_handle) {
+  reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jopt_handle)->comparator =
+      reinterpret_cast<rocksdb::Comparator*>(jcomparator_handle);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setMergeOperatorName
+ * Signature: (JLjava/lang/String;)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setMergeOperatorName(
+    JNIEnv* env, jobject jobj, jlong jhandle, jstring jop_name) {
+  auto options = reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle);
+  const char* op_name = env->GetStringUTFChars(jop_name, 0);
+  options->merge_operator = rocksdb::MergeOperators::CreateFromStringId(
+        op_name);
+  env->ReleaseStringUTFChars(jop_name, op_name);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setMergeOperator
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setMergeOperator(
+    JNIEnv* env, jobject jobj, jlong jhandle, jlong mergeOperatorHandle) {
+  reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)->merge_operator =
+      *(reinterpret_cast<std::shared_ptr<rocksdb::MergeOperator>*>(
+          mergeOperatorHandle));
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setCompactionFilterHandle
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setCompactionFilterHandle__JJ(
+    JNIEnv* env, jobject jobj, jlong jopt_handle,
+    jlong jcompactionfilter_handle) {
+  reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jopt_handle)->
+      compaction_filter = reinterpret_cast<rocksdb::CompactionFilter*>
+        (jcompactionfilter_handle);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setWriteBufferSize
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setWriteBufferSize(
+    JNIEnv* env, jobject jobj, jlong jhandle, jlong jwrite_buffer_size) {
+  rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(jwrite_buffer_size);
+  if (s.ok()) {
+    reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)->
+        write_buffer_size = jwrite_buffer_size;
+  } else {
+    rocksdb::IllegalArgumentExceptionJni::ThrowNew(env, s);
+  }
+}
+
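+// check_if_jlong_fits_size_t (used above and by the other size_t-typed
+// setters) guards against a 64-bit Java long overflowing a narrower
+// size_t. A minimal sketch of such a guard, assuming the helper is
+// defined elsewhere in these bindings roughly as:
+//
+//   rocksdb::Status check_if_jlong_fits_size_t(const jlong& jvalue) {
+//     // A negative jlong also fails this test once widened to uint64_t.
+//     if (static_cast<uint64_t>(jvalue) >
+//         std::numeric_limits<size_t>::max()) {
+//       return rocksdb::Status::InvalidArgument("value overflows size_t");
+//     }
+//     return rocksdb::Status::OK();
+//   }
+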
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    writeBufferSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_ColumnFamilyOptions_writeBufferSize(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)->
+      write_buffer_size;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setMaxWriteBufferNumber
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setMaxWriteBufferNumber(
+    JNIEnv* env, jobject jobj, jlong jhandle, jint jmax_write_buffer_number) {
+  reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)->
+      max_write_buffer_number = jmax_write_buffer_number;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    maxWriteBufferNumber
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_ColumnFamilyOptions_maxWriteBufferNumber(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)->
+      max_write_buffer_number;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setMemTableFactory
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setMemTableFactory(
+    JNIEnv* env, jobject jobj, jlong jhandle, jlong jfactory_handle) {
+  reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)->
+      memtable_factory.reset(
+      reinterpret_cast<rocksdb::MemTableRepFactory*>(jfactory_handle));
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    memTableFactoryName
+ * Signature: (J)Ljava/lang/String;
+ */
+jstring Java_org_rocksdb_ColumnFamilyOptions_memTableFactoryName(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  auto opt = reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle);
+  rocksdb::MemTableRepFactory* tf = opt->memtable_factory.get();
+
+  // Should never be nullptr.
+  // Default memtable factory is SkipListFactory
+  assert(tf);
+
+  // Temporary fix for the historical typo.
+  if (strcmp(tf->Name(), "HashLinkListRepFactory") == 0) {
+    return env->NewStringUTF("HashLinkedListRepFactory");
+  }
+
+  return env->NewStringUTF(tf->Name());
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    useFixedLengthPrefixExtractor
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_useFixedLengthPrefixExtractor(
+    JNIEnv* env, jobject jobj, jlong jhandle, jint jprefix_length) {
+  reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)->
+      prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(
+          static_cast<int>(jprefix_length)));
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    useCappedPrefixExtractor
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_useCappedPrefixExtractor(
+    JNIEnv* env, jobject jobj, jlong jhandle, jint jprefix_length) {
+  reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)->
+      prefix_extractor.reset(rocksdb::NewCappedPrefixTransform(
+          static_cast<int>(jprefix_length)));
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setTableFactory
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setTableFactory(
+    JNIEnv* env, jobject jobj, jlong jhandle, jlong jfactory_handle) {
+  reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)->
+      table_factory.reset(reinterpret_cast<rocksdb::TableFactory*>(
+      jfactory_handle));
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    tableFactoryName
+ * Signature: (J)Ljava/lang/String;
+ */
+jstring Java_org_rocksdb_ColumnFamilyOptions_tableFactoryName(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  auto opt = reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle);
+  rocksdb::TableFactory* tf = opt->table_factory.get();
+
+  // Should never be nullptr.
+  // Default table factory is BlockBasedTableFactory.
+  assert(tf);
+
+  return env->NewStringUTF(tf->Name());
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    minWriteBufferNumberToMerge
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_ColumnFamilyOptions_minWriteBufferNumberToMerge(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(
+      jhandle)->min_write_buffer_number_to_merge;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setMinWriteBufferNumberToMerge
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setMinWriteBufferNumberToMerge(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jint jmin_write_buffer_number_to_merge) {
+  reinterpret_cast<rocksdb::ColumnFamilyOptions*>(
+      jhandle)->min_write_buffer_number_to_merge =
+          static_cast<int>(jmin_write_buffer_number_to_merge);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    maxWriteBufferNumberToMaintain
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_ColumnFamilyOptions_maxWriteBufferNumberToMaintain(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)
+      ->max_write_buffer_number_to_maintain;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setMaxWriteBufferNumberToMaintain
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setMaxWriteBufferNumberToMaintain(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jint jmax_write_buffer_number_to_maintain) {
+  reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)
+      ->max_write_buffer_number_to_maintain =
+      static_cast<int>(jmax_write_buffer_number_to_maintain);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setCompressionType
+ * Signature: (JB)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setCompressionType(
+    JNIEnv* env, jobject jobj, jlong jhandle, jbyte compression) {
+  reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)->
+      compression = static_cast<rocksdb::CompressionType>(compression);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    compressionType
+ * Signature: (J)B
+ */
+jbyte Java_org_rocksdb_ColumnFamilyOptions_compressionType(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)->
+      compression;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setCompressionPerLevel
+ * Signature: (JLjava/util/List;)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setCompressionPerLevel(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jobject jcompressionLevels) {
+  auto* options = reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle);
+  std::vector<rocksdb::CompressionType> compressionLevels =
+      rocksdb_compression_vector_helper(env, jcompressionLevels);
+  options->compression_per_level = compressionLevels;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    compressionPerLevel
+ * Signature: (J)Ljava/util/List;
+ */
+jobject Java_org_rocksdb_ColumnFamilyOptions_compressionPerLevel(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  auto* options = reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle);
+  return rocksdb_compression_list_helper(env,
+      options->compression_per_level);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setCompactionStyle
+ * Signature: (JB)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setCompactionStyle(
+    JNIEnv* env, jobject jobj, jlong jhandle, jbyte compaction_style) {
+  reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)->compaction_style =
+      static_cast<rocksdb::CompactionStyle>(compaction_style);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    compactionStyle
+ * Signature: (J)B
+ */
+jbyte Java_org_rocksdb_ColumnFamilyOptions_compactionStyle(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::ColumnFamilyOptions*>
+      (jhandle)->compaction_style;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setMaxTableFilesSizeFIFO
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setMaxTableFilesSizeFIFO(
+    JNIEnv* env, jobject jobj, jlong jhandle, jlong jmax_table_files_size) {
+  reinterpret_cast<rocksdb::ColumnFamilyOptions*>(
+      jhandle)->compaction_options_fifo.max_table_files_size =
+          static_cast<uint64_t>(jmax_table_files_size);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    maxTableFilesSizeFIFO
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_ColumnFamilyOptions_maxTableFilesSizeFIFO(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(
+      jhandle)->compaction_options_fifo.max_table_files_size;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    numLevels
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_ColumnFamilyOptions_numLevels(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)->num_levels;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setNumLevels
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setNumLevels(
+    JNIEnv* env, jobject jobj, jlong jhandle, jint jnum_levels) {
+  reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)->num_levels =
+      static_cast<int>(jnum_levels);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    levelZeroFileNumCompactionTrigger
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_ColumnFamilyOptions_levelZeroFileNumCompactionTrigger(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(
+      jhandle)->level0_file_num_compaction_trigger;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setLevelZeroFileNumCompactionTrigger
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setLevelZeroFileNumCompactionTrigger(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jint jlevel0_file_num_compaction_trigger) {
+  reinterpret_cast<rocksdb::ColumnFamilyOptions*>(
+      jhandle)->level0_file_num_compaction_trigger =
+          static_cast<int>(jlevel0_file_num_compaction_trigger);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    levelZeroSlowdownWritesTrigger
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_ColumnFamilyOptions_levelZeroSlowdownWritesTrigger(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(
+      jhandle)->level0_slowdown_writes_trigger;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setLevelZeroSlowdownWritesTrigger
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setLevelZeroSlowdownWritesTrigger(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jint jlevel0_slowdown_writes_trigger) {
+  reinterpret_cast<rocksdb::ColumnFamilyOptions*>(
+      jhandle)->level0_slowdown_writes_trigger =
+          static_cast<int>(jlevel0_slowdown_writes_trigger);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    levelZeroStopWritesTrigger
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_ColumnFamilyOptions_levelZeroStopWritesTrigger(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(
+      jhandle)->level0_stop_writes_trigger;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setLevelZeroStopWritesTrigger
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setLevelZeroStopWritesTrigger(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jint jlevel0_stop_writes_trigger) {
+  reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)->
+      level0_stop_writes_trigger = static_cast<int>(
+      jlevel0_stop_writes_trigger);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    maxMemCompactionLevel
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_ColumnFamilyOptions_maxMemCompactionLevel(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return 0;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setMaxMemCompactionLevel
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setMaxMemCompactionLevel(
+    JNIEnv* env, jobject jobj, jlong jhandle, jint jmax_mem_compaction_level) {}
+
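+// Both maxMemCompactionLevel entry points above are deliberate no-ops:
+// the underlying max_mem_compaction_level option appears to have been
+// removed upstream, so the getter hard-codes 0 and the setter does
+// nothing, keeping the JNI surface for API compatibility.
+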
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    targetFileSizeBase
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_ColumnFamilyOptions_targetFileSizeBase(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)->
+      target_file_size_base;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setTargetFileSizeBase
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setTargetFileSizeBase(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jlong jtarget_file_size_base) {
+  reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)->
+      target_file_size_base = static_cast<uint64_t>(jtarget_file_size_base);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    targetFileSizeMultiplier
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_ColumnFamilyOptions_targetFileSizeMultiplier(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(
+      jhandle)->target_file_size_multiplier;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setTargetFileSizeMultiplier
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setTargetFileSizeMultiplier(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jint jtarget_file_size_multiplier) {
+  reinterpret_cast<rocksdb::ColumnFamilyOptions*>(
+      jhandle)->target_file_size_multiplier =
+          static_cast<int>(jtarget_file_size_multiplier);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    maxBytesForLevelBase
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_ColumnFamilyOptions_maxBytesForLevelBase(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(
+      jhandle)->max_bytes_for_level_base;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setMaxBytesForLevelBase
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setMaxBytesForLevelBase(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jlong jmax_bytes_for_level_base) {
+  reinterpret_cast<rocksdb::ColumnFamilyOptions*>(
+      jhandle)->max_bytes_for_level_base =
+          static_cast<int64_t>(jmax_bytes_for_level_base);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    levelCompactionDynamicLevelBytes
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_ColumnFamilyOptions_levelCompactionDynamicLevelBytes(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(
+      jhandle)->level_compaction_dynamic_level_bytes;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setLevelCompactionDynamicLevelBytes
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setLevelCompactionDynamicLevelBytes(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jboolean jenable_dynamic_level_bytes) {
+  reinterpret_cast<rocksdb::ColumnFamilyOptions*>(
+      jhandle)->level_compaction_dynamic_level_bytes =
+          static_cast<bool>(jenable_dynamic_level_bytes);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    maxBytesForLevelMultiplier
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_ColumnFamilyOptions_maxBytesForLevelMultiplier(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(
+      jhandle)->max_bytes_for_level_multiplier;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setMaxBytesForLevelMultiplier
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setMaxBytesForLevelMultiplier(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jint jmax_bytes_for_level_multiplier) {
+  reinterpret_cast<rocksdb::ColumnFamilyOptions*>(
+      jhandle)->max_bytes_for_level_multiplier =
+          static_cast<int>(jmax_bytes_for_level_multiplier);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    expandedCompactionFactor
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_ColumnFamilyOptions_expandedCompactionFactor(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(
+      jhandle)->expanded_compaction_factor;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setExpandedCompactionFactor
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setExpandedCompactionFactor(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jint jexpanded_compaction_factor) {
+  reinterpret_cast<rocksdb::ColumnFamilyOptions*>(
+      jhandle)->expanded_compaction_factor =
+          static_cast<int>(jexpanded_compaction_factor);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    sourceCompactionFactor
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_ColumnFamilyOptions_sourceCompactionFactor(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(
+      jhandle)->source_compaction_factor;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setSourceCompactionFactor
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setSourceCompactionFactor(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jint jsource_compaction_factor) {
+  reinterpret_cast<rocksdb::ColumnFamilyOptions*>(
+      jhandle)->source_compaction_factor =
+          static_cast<int>(jsource_compaction_factor);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    maxGrandparentOverlapFactor
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_ColumnFamilyOptions_maxGrandparentOverlapFactor(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(
+      jhandle)->max_grandparent_overlap_factor;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setMaxGrandparentOverlapFactor
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setMaxGrandparentOverlapFactor(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jint jmax_grandparent_overlap_factor) {
+  reinterpret_cast<rocksdb::ColumnFamilyOptions*>(
+      jhandle)->max_grandparent_overlap_factor =
+          static_cast<int>(jmax_grandparent_overlap_factor);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    softRateLimit
+ * Signature: (J)D
+ */
+jdouble Java_org_rocksdb_ColumnFamilyOptions_softRateLimit(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)->
+      soft_rate_limit;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setSoftRateLimit
+ * Signature: (JD)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setSoftRateLimit(
+    JNIEnv* env, jobject jobj, jlong jhandle, jdouble jsoft_rate_limit) {
+  reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)->soft_rate_limit =
+      static_cast<double>(jsoft_rate_limit);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    hardRateLimit
+ * Signature: (J)D
+ */
+jdouble Java_org_rocksdb_ColumnFamilyOptions_hardRateLimit(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)->
+      hard_rate_limit;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setHardRateLimit
+ * Signature: (JD)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setHardRateLimit(
+    JNIEnv* env, jobject jobj, jlong jhandle, jdouble jhard_rate_limit) {
+  reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)->hard_rate_limit =
+      static_cast<double>(jhard_rate_limit);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    rateLimitDelayMaxMilliseconds
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_ColumnFamilyOptions_rateLimitDelayMaxMilliseconds(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(
+      jhandle)->rate_limit_delay_max_milliseconds;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setRateLimitDelayMaxMilliseconds
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setRateLimitDelayMaxMilliseconds(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jint jrate_limit_delay_max_milliseconds) {
+  reinterpret_cast<rocksdb::ColumnFamilyOptions*>(
+      jhandle)->rate_limit_delay_max_milliseconds =
+          static_cast<int>(jrate_limit_delay_max_milliseconds);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    arenaBlockSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_ColumnFamilyOptions_arenaBlockSize(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)->
+      arena_block_size;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setArenaBlockSize
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setArenaBlockSize(
+    JNIEnv* env, jobject jobj, jlong jhandle, jlong jarena_block_size) {
+  rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(jarena_block_size);
+  if (s.ok()) {
+    reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)->
+        arena_block_size = jarena_block_size;
+  } else {
+    rocksdb::IllegalArgumentExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    disableAutoCompactions
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_ColumnFamilyOptions_disableAutoCompactions(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(
+      jhandle)->disable_auto_compactions;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setDisableAutoCompactions
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setDisableAutoCompactions(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jboolean jdisable_auto_compactions) {
+  reinterpret_cast<rocksdb::ColumnFamilyOptions*>(
+      jhandle)->disable_auto_compactions =
+          static_cast<bool>(jdisable_auto_compactions);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    purgeRedundantKvsWhileFlush
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_ColumnFamilyOptions_purgeRedundantKvsWhileFlush(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(
+      jhandle)->purge_redundant_kvs_while_flush;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setPurgeRedundantKvsWhileFlush
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setPurgeRedundantKvsWhileFlush(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jboolean jpurge_redundant_kvs_while_flush) {
+  reinterpret_cast<rocksdb::ColumnFamilyOptions*>(
+      jhandle)->purge_redundant_kvs_while_flush =
+          static_cast<bool>(jpurge_redundant_kvs_while_flush);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    verifyChecksumsInCompaction
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_ColumnFamilyOptions_verifyChecksumsInCompaction(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(
+      jhandle)->verify_checksums_in_compaction;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setVerifyChecksumsInCompaction
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setVerifyChecksumsInCompaction(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jboolean jverify_checksums_in_compaction) {
+  reinterpret_cast<rocksdb::ColumnFamilyOptions*>(
+      jhandle)->verify_checksums_in_compaction =
+          static_cast<bool>(jverify_checksums_in_compaction);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    filterDeletes
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_ColumnFamilyOptions_filterDeletes(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)->
+      filter_deletes;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setFilterDeletes
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setFilterDeletes(
+    JNIEnv* env, jobject jobj, jlong jhandle, jboolean jfilter_deletes) {
+  reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)->filter_deletes =
+      static_cast<bool>(jfilter_deletes);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    maxSequentialSkipInIterations
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_ColumnFamilyOptions_maxSequentialSkipInIterations(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(
+      jhandle)->max_sequential_skip_in_iterations;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setMaxSequentialSkipInIterations
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setMaxSequentialSkipInIterations(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jlong jmax_sequential_skip_in_iterations) {
+  reinterpret_cast<rocksdb::ColumnFamilyOptions*>(
+      jhandle)->max_sequential_skip_in_iterations =
+          static_cast<int64_t>(jmax_sequential_skip_in_iterations);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    inplaceUpdateSupport
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_ColumnFamilyOptions_inplaceUpdateSupport(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(
+      jhandle)->inplace_update_support;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setInplaceUpdateSupport
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setInplaceUpdateSupport(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jboolean jinplace_update_support) {
+  reinterpret_cast<rocksdb::ColumnFamilyOptions*>(
+      jhandle)->inplace_update_support =
+          static_cast<bool>(jinplace_update_support);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    inplaceUpdateNumLocks
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_ColumnFamilyOptions_inplaceUpdateNumLocks(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(
+      jhandle)->inplace_update_num_locks;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setInplaceUpdateNumLocks
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setInplaceUpdateNumLocks(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jlong jinplace_update_num_locks) {
+  rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(
+      jinplace_update_num_locks);
+  if (s.ok()) {
+    reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)->
+        inplace_update_num_locks = jinplace_update_num_locks;
+  } else {
+    rocksdb::IllegalArgumentExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    memtablePrefixBloomBits
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_ColumnFamilyOptions_memtablePrefixBloomBits(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(
+      jhandle)->memtable_prefix_bloom_bits;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setMemtablePrefixBloomBits
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setMemtablePrefixBloomBits(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jint jmemtable_prefix_bloom_bits) {
+  reinterpret_cast<rocksdb::ColumnFamilyOptions*>(
+      jhandle)->memtable_prefix_bloom_bits =
+          static_cast<int32_t>(jmemtable_prefix_bloom_bits);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    memtablePrefixBloomProbes
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_ColumnFamilyOptions_memtablePrefixBloomProbes(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(
+      jhandle)->memtable_prefix_bloom_probes;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setMemtablePrefixBloomProbes
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setMemtablePrefixBloomProbes(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jint jmemtable_prefix_bloom_probes) {
+  reinterpret_cast<rocksdb::ColumnFamilyOptions*>(
+      jhandle)->memtable_prefix_bloom_probes =
+          static_cast<int32_t>(jmemtable_prefix_bloom_probes);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    bloomLocality
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_ColumnFamilyOptions_bloomLocality(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)->
+      bloom_locality;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setBloomLocality
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setBloomLocality(
+    JNIEnv* env, jobject jobj, jlong jhandle, jint jbloom_locality) {
+  reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)->bloom_locality =
+      static_cast<int32_t>(jbloom_locality);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    maxSuccessiveMerges
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_ColumnFamilyOptions_maxSuccessiveMerges(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)->
+      max_successive_merges;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setMaxSuccessiveMerges
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setMaxSuccessiveMerges(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jlong jmax_successive_merges) {
+  rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(
+      jmax_successive_merges);
+  if (s.ok()) {
+    reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)->
+        max_successive_merges = jmax_successive_merges;
+  } else {
+    rocksdb::IllegalArgumentExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    minPartialMergeOperands
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_ColumnFamilyOptions_minPartialMergeOperands(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(
+      jhandle)->min_partial_merge_operands;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setMinPartialMergeOperands
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setMinPartialMergeOperands(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jint jmin_partial_merge_operands) {
+  reinterpret_cast<rocksdb::ColumnFamilyOptions*>(
+      jhandle)->min_partial_merge_operands =
+          static_cast<int32_t>(jmin_partial_merge_operands);
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    optimizeFiltersForHits
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_ColumnFamilyOptions_optimizeFiltersForHits(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(
+      jhandle)->optimize_filters_for_hits;
+}
+
+/*
+ * Class:     org_rocksdb_ColumnFamilyOptions
+ * Method:    setOptimizeFiltersForHits
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_ColumnFamilyOptions_setOptimizeFiltersForHits(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jboolean joptimize_filters_for_hits) {
+  reinterpret_cast<rocksdb::ColumnFamilyOptions*>(
+      jhandle)->optimize_filters_for_hits =
+          static_cast<bool>(joptimize_filters_for_hits);
+}
+
+/////////////////////////////////////////////////////////////////////
+// rocksdb::DBOptions
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    newDBOptions
+ * Signature: ()V
+ */
+void Java_org_rocksdb_DBOptions_newDBOptions(JNIEnv* env,
+    jobject jobj) {
+  rocksdb::DBOptions* dbop = new rocksdb::DBOptions();
+  rocksdb::DBOptionsJni::setHandle(env, jobj, dbop);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    getDBOptionsFromProps
+ * Signature: (Ljava/lang/String;)J
+ */
+jlong Java_org_rocksdb_DBOptions_getDBOptionsFromProps(
+    JNIEnv* env, jclass jclazz, jstring jopt_string) {
+  jlong ret_value = 0;
+  rocksdb::DBOptions* db_options =
+      new rocksdb::DBOptions();
+  const char* opt_string = env->GetStringUTFChars(jopt_string, 0);
+  rocksdb::Status status = rocksdb::GetDBOptionsFromString(
+      rocksdb::DBOptions(), opt_string, db_options);
+  env->ReleaseStringUTFChars(jopt_string, opt_string);
+  // Check if DBOptions creation was possible.
+  if (status.ok()) {
+    ret_value = reinterpret_cast<jlong>(db_options);
+  } else {
+    // If the operation failed, the DBOptions must be deleted
+    // to prevent a memory leak.
+    delete db_options;
+  }
+  return ret_value;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_DBOptions_disposeInternal(
+    JNIEnv* env, jobject jobj, jlong handle) {
+  delete reinterpret_cast<rocksdb::DBOptions*>(handle);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setIncreaseParallelism
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_DBOptions_setIncreaseParallelism(
+    JNIEnv* env, jobject jobj, jlong jhandle, jint totalThreads) {
+  reinterpret_cast<rocksdb::DBOptions*>
+      (jhandle)->IncreaseParallelism(static_cast<int>(totalThreads));
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setCreateIfMissing
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_DBOptions_setCreateIfMissing(
+    JNIEnv* env, jobject jobj, jlong jhandle, jboolean flag) {
+  reinterpret_cast<rocksdb::DBOptions*>(jhandle)->
+      create_if_missing = flag;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    createIfMissing
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_DBOptions_createIfMissing(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::DBOptions*>(jhandle)->create_if_missing;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setCreateMissingColumnFamilies
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_DBOptions_setCreateMissingColumnFamilies(
+    JNIEnv* env, jobject jobj, jlong jhandle, jboolean flag) {
+  reinterpret_cast<rocksdb::DBOptions*>
+      (jhandle)->create_missing_column_families = flag;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    createMissingColumnFamilies
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_DBOptions_createMissingColumnFamilies(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::DBOptions*>
+      (jhandle)->create_missing_column_families;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setErrorIfExists
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_DBOptions_setErrorIfExists(
+    JNIEnv* env, jobject jobj, jlong jhandle, jboolean error_if_exists) {
+  reinterpret_cast<rocksdb::DBOptions*>(jhandle)->error_if_exists =
+      static_cast<bool>(error_if_exists);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    errorIfExists
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_DBOptions_errorIfExists(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::DBOptions*>(jhandle)->error_if_exists;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setParanoidChecks
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_DBOptions_setParanoidChecks(
+    JNIEnv* env, jobject jobj, jlong jhandle, jboolean paranoid_checks) {
+  reinterpret_cast<rocksdb::DBOptions*>(jhandle)->paranoid_checks =
+      static_cast<bool>(paranoid_checks);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    paranoidChecks
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_DBOptions_paranoidChecks(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::DBOptions*>(jhandle)->paranoid_checks;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setRateLimiter
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_DBOptions_setRateLimiter(
+    JNIEnv* env, jobject jobj, jlong jhandle, jlong jrate_limiter_handle) {
+  reinterpret_cast<rocksdb::DBOptions*>(jhandle)->rate_limiter.reset(
+      reinterpret_cast<rocksdb::RateLimiter*>(jrate_limiter_handle));
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setLogger
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_DBOptions_setLogger(
+    JNIEnv* env, jobject jobj, jlong jhandle, jlong jlogger_handle) {
+  std::shared_ptr<rocksdb::LoggerJniCallback> *pLogger =
+      reinterpret_cast<std::shared_ptr<rocksdb::LoggerJniCallback> *>(
+          jlogger_handle);
+  reinterpret_cast<rocksdb::DBOptions*>(jhandle)->info_log = *pLogger;
+}
+
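+// setLogger copies the shared_ptr rather than adopting a raw pointer,
+// so the Java-side LoggerJniCallback remains alive for as long as these
+// DBOptions (and any DB opened with them) hold info_log.
+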
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setInfoLogLevel
+ * Signature: (JB)V
+ */
+void Java_org_rocksdb_DBOptions_setInfoLogLevel(
+    JNIEnv* env, jobject jobj, jlong jhandle, jbyte jlog_level) {
+  reinterpret_cast<rocksdb::DBOptions*>(jhandle)->info_log_level =
+      static_cast<rocksdb::InfoLogLevel>(jlog_level);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    infoLogLevel
+ * Signature: (J)B
+ */
+jbyte Java_org_rocksdb_DBOptions_infoLogLevel(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return static_cast<jbyte>(
+      reinterpret_cast<rocksdb::DBOptions*>(jhandle)->info_log_level);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setMaxTotalWalSize
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_DBOptions_setMaxTotalWalSize(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jlong jmax_total_wal_size) {
+  reinterpret_cast<rocksdb::DBOptions*>(jhandle)->max_total_wal_size =
+      static_cast<uint64_t>(jmax_total_wal_size);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    maxTotalWalSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_DBOptions_maxTotalWalSize(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::DBOptions*>(jhandle)->
+      max_total_wal_size;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setMaxOpenFiles
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_DBOptions_setMaxOpenFiles(
+    JNIEnv* env, jobject jobj, jlong jhandle, jint max_open_files) {
+  reinterpret_cast<rocksdb::DBOptions*>(jhandle)->max_open_files =
+      static_cast<int>(max_open_files);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    maxOpenFiles
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_DBOptions_maxOpenFiles(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::DBOptions*>(jhandle)->max_open_files;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    createStatistics
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_DBOptions_createStatistics(
+    JNIEnv* env, jobject jobj, jlong jOptHandle) {
+  reinterpret_cast<rocksdb::DBOptions*>(jOptHandle)->statistics =
+      rocksdb::CreateDBStatistics();
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    statisticsPtr
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_DBOptions_statisticsPtr(
+    JNIEnv* env, jobject jobj, jlong jOptHandle) {
+  auto st = reinterpret_cast<rocksdb::DBOptions*>(jOptHandle)->
+      statistics.get();
+  return reinterpret_cast<jlong>(st);
+}
+
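+// statisticsPtr returns a non-owning raw pointer (as a jlong) into the
+// Statistics object held by the options' shared_ptr; the Java caller
+// should not use it beyond the lifetime of the owning DBOptions
+// (assumption based on the .get() above).
+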
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setDisableDataSync
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_DBOptions_setDisableDataSync(
+    JNIEnv* env, jobject jobj, jlong jhandle, jboolean disableDataSync) {
+  reinterpret_cast<rocksdb::DBOptions*>(jhandle)->disableDataSync =
+      static_cast<bool>(disableDataSync);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    disableDataSync
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_DBOptions_disableDataSync(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::DBOptions*>(jhandle)->disableDataSync;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setUseFsync
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_DBOptions_setUseFsync(
+    JNIEnv* env, jobject jobj, jlong jhandle, jboolean use_fsync) {
+  reinterpret_cast<rocksdb::DBOptions*>(jhandle)->use_fsync =
+      static_cast<bool>(use_fsync);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    useFsync
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_DBOptions_useFsync(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::DBOptions*>(jhandle)->use_fsync;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setDbLogDir
+ * Signature: (JLjava/lang/String;)V
+ */
+void Java_org_rocksdb_DBOptions_setDbLogDir(
+    JNIEnv* env, jobject jobj, jlong jhandle, jstring jdb_log_dir) {
+  const char* log_dir = env->GetStringUTFChars(jdb_log_dir, 0);
+  reinterpret_cast<rocksdb::DBOptions*>(jhandle)->db_log_dir.assign(log_dir);
+  env->ReleaseStringUTFChars(jdb_log_dir, log_dir);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    dbLogDir
+ * Signature: (J)Ljava/lang/String;
+ */
+jstring Java_org_rocksdb_DBOptions_dbLogDir(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return env->NewStringUTF(
+      reinterpret_cast<rocksdb::DBOptions*>(jhandle)->db_log_dir.c_str());
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setWalDir
+ * Signature: (JLjava/lang/String;)V
+ */
+void Java_org_rocksdb_DBOptions_setWalDir(
+    JNIEnv* env, jobject jobj, jlong jhandle, jstring jwal_dir) {
+  const char* wal_dir = env->GetStringUTFChars(jwal_dir, 0);
+  reinterpret_cast<rocksdb::DBOptions*>(jhandle)->wal_dir.assign(wal_dir);
+  env->ReleaseStringUTFChars(jwal_dir, wal_dir);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    walDir
+ * Signature: (J)Ljava/lang/String;
+ */
+jstring Java_org_rocksdb_DBOptions_walDir(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return env->NewStringUTF(
+      reinterpret_cast<rocksdb::DBOptions*>(jhandle)->wal_dir.c_str());
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setDeleteObsoleteFilesPeriodMicros
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_DBOptions_setDeleteObsoleteFilesPeriodMicros(
+    JNIEnv* env, jobject jobj, jlong jhandle, jlong micros) {
+  reinterpret_cast<rocksdb::DBOptions*>(jhandle)
+      ->delete_obsolete_files_period_micros =
+          static_cast<int64_t>(micros);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    deleteObsoleteFilesPeriodMicros
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_DBOptions_deleteObsoleteFilesPeriodMicros(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::DBOptions*>(jhandle)
+      ->delete_obsolete_files_period_micros;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setMaxBackgroundCompactions
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_DBOptions_setMaxBackgroundCompactions(
+    JNIEnv* env, jobject jobj, jlong jhandle, jint max) {
+  reinterpret_cast<rocksdb::DBOptions*>(jhandle)
+      ->max_background_compactions = static_cast<int>(max);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    maxBackgroundCompactions
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_DBOptions_maxBackgroundCompactions(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::DBOptions*>(
+      jhandle)->max_background_compactions;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setMaxBackgroundFlushes
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_DBOptions_setMaxBackgroundFlushes(
+    JNIEnv* env, jobject jobj, jlong jhandle, jint max_background_flushes) {
+  reinterpret_cast<rocksdb::DBOptions*>(jhandle)->max_background_flushes =
+      static_cast<int>(max_background_flushes);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    maxBackgroundFlushes
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_DBOptions_maxBackgroundFlushes(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::DBOptions*>(jhandle)->
+      max_background_flushes;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setMaxLogFileSize
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_DBOptions_setMaxLogFileSize(
+    JNIEnv* env, jobject jobj, jlong jhandle, jlong max_log_file_size) {
+  rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(max_log_file_size);
+  if (s.ok()) {
+    reinterpret_cast<rocksdb::DBOptions*>(jhandle)->max_log_file_size =
+        max_log_file_size;
+  } else {
+    rocksdb::IllegalArgumentExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    maxLogFileSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_DBOptions_maxLogFileSize(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::DBOptions*>(jhandle)->max_log_file_size;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setLogFileTimeToRoll
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_DBOptions_setLogFileTimeToRoll(
+    JNIEnv* env, jobject jobj, jlong jhandle, jlong log_file_time_to_roll) {
+  rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(
+      log_file_time_to_roll);
+  if (s.ok()) {
+    reinterpret_cast<rocksdb::DBOptions*>(jhandle)->log_file_time_to_roll =
+        log_file_time_to_roll;
+  } else {
+    rocksdb::IllegalArgumentExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    logFileTimeToRoll
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_DBOptions_logFileTimeToRoll(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::DBOptions*>(jhandle)->log_file_time_to_roll;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setKeepLogFileNum
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_DBOptions_setKeepLogFileNum(
+    JNIEnv* env, jobject jobj, jlong jhandle, jlong keep_log_file_num) {
+  rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(keep_log_file_num);
+  if (s.ok()) {
+    reinterpret_cast<rocksdb::DBOptions*>(jhandle)->keep_log_file_num =
+        keep_log_file_num;
+  } else {
+    rocksdb::IllegalArgumentExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    keepLogFileNum
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_DBOptions_keepLogFileNum(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::DBOptions*>(jhandle)->keep_log_file_num;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setMaxManifestFileSize
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_DBOptions_setMaxManifestFileSize(
+    JNIEnv* env, jobject jobj, jlong jhandle, jlong max_manifest_file_size) {
+  reinterpret_cast<rocksdb::DBOptions*>(jhandle)->max_manifest_file_size =
+      static_cast<int64_t>(max_manifest_file_size);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    maxManifestFileSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_DBOptions_maxManifestFileSize(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::DBOptions*>(jhandle)->
+      max_manifest_file_size;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setTableCacheNumshardbits
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_DBOptions_setTableCacheNumshardbits(
+    JNIEnv* env, jobject jobj, jlong jhandle, jint table_cache_numshardbits) {
+  reinterpret_cast<rocksdb::DBOptions*>(jhandle)->table_cache_numshardbits =
+      static_cast<int>(table_cache_numshardbits);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    tableCacheNumshardbits
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_DBOptions_tableCacheNumshardbits(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::DBOptions*>(jhandle)->
+      table_cache_numshardbits;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setWalTtlSeconds
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_DBOptions_setWalTtlSeconds(
+    JNIEnv* env, jobject jobj, jlong jhandle, jlong WAL_ttl_seconds) {
+  reinterpret_cast<rocksdb::DBOptions*>(jhandle)->WAL_ttl_seconds =
+      static_cast<int64_t>(WAL_ttl_seconds);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    walTtlSeconds
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_DBOptions_walTtlSeconds(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::DBOptions*>(jhandle)->WAL_ttl_seconds;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setWalSizeLimitMB
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_DBOptions_setWalSizeLimitMB(
+    JNIEnv* env, jobject jobj, jlong jhandle, jlong WAL_size_limit_MB) {
+  reinterpret_cast<rocksdb::DBOptions*>(jhandle)->WAL_size_limit_MB =
+      static_cast<int64_t>(WAL_size_limit_MB);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    walSizeLimitMB
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_DBOptions_walSizeLimitMB(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::DBOptions*>(jhandle)->WAL_size_limit_MB;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setManifestPreallocationSize
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_DBOptions_setManifestPreallocationSize(
+    JNIEnv* env, jobject jobj, jlong jhandle, jlong preallocation_size) {
+  rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(preallocation_size);
+  if (s.ok()) {
+    reinterpret_cast<rocksdb::DBOptions*>(jhandle)->
+        manifest_preallocation_size = preallocation_size;
+  } else {
+    rocksdb::IllegalArgumentExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    manifestPreallocationSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_DBOptions_manifestPreallocationSize(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::DBOptions*>(jhandle)
+      ->manifest_preallocation_size;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setAllowOsBuffer
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_DBOptions_setAllowOsBuffer(
+    JNIEnv* env, jobject jobj, jlong jhandle, jboolean allow_os_buffer) {
+  reinterpret_cast<rocksdb::DBOptions*>(jhandle)->allow_os_buffer =
+      static_cast<bool>(allow_os_buffer);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    allowOsBuffer
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_DBOptions_allowOsBuffer(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::DBOptions*>(jhandle)->allow_os_buffer;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setAllowMmapReads
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_DBOptions_setAllowMmapReads(
+    JNIEnv* env, jobject jobj, jlong jhandle, jboolean allow_mmap_reads) {
+  reinterpret_cast<rocksdb::DBOptions*>(jhandle)->allow_mmap_reads =
+      static_cast<bool>(allow_mmap_reads);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    allowMmapReads
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_DBOptions_allowMmapReads(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::DBOptions*>(jhandle)->allow_mmap_reads;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setAllowMmapWrites
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_DBOptions_setAllowMmapWrites(
+    JNIEnv* env, jobject jobj, jlong jhandle, jboolean allow_mmap_writes) {
+  reinterpret_cast<rocksdb::DBOptions*>(jhandle)->allow_mmap_writes =
+      static_cast<bool>(allow_mmap_writes);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    allowMmapWrites
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_DBOptions_allowMmapWrites(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::DBOptions*>(jhandle)->allow_mmap_writes;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setIsFdCloseOnExec
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_DBOptions_setIsFdCloseOnExec(
+    JNIEnv* env, jobject jobj, jlong jhandle, jboolean is_fd_close_on_exec) {
+  reinterpret_cast<rocksdb::DBOptions*>(jhandle)->is_fd_close_on_exec =
+      static_cast<bool>(is_fd_close_on_exec);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    isFdCloseOnExec
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_DBOptions_isFdCloseOnExec(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::DBOptions*>(jhandle)->is_fd_close_on_exec;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setStatsDumpPeriodSec
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_DBOptions_setStatsDumpPeriodSec(
+    JNIEnv* env, jobject jobj, jlong jhandle, jint stats_dump_period_sec) {
+  reinterpret_cast<rocksdb::DBOptions*>(jhandle)->stats_dump_period_sec =
+      static_cast<int>(stats_dump_period_sec);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    statsDumpPeriodSec
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_DBOptions_statsDumpPeriodSec(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::DBOptions*>(jhandle)->stats_dump_period_sec;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setAdviseRandomOnOpen
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_DBOptions_setAdviseRandomOnOpen(
+    JNIEnv* env, jobject jobj, jlong jhandle, jboolean advise_random_on_open) {
+  reinterpret_cast<rocksdb::DBOptions*>(jhandle)->advise_random_on_open =
+      static_cast<bool>(advise_random_on_open);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    adviseRandomOnOpen
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_DBOptions_adviseRandomOnOpen(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::DBOptions*>(jhandle)->advise_random_on_open;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setUseAdaptiveMutex
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_DBOptions_setUseAdaptiveMutex(
+    JNIEnv* env, jobject jobj, jlong jhandle, jboolean use_adaptive_mutex) {
+  reinterpret_cast<rocksdb::DBOptions*>(jhandle)->use_adaptive_mutex =
+      static_cast<bool>(use_adaptive_mutex);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    useAdaptiveMutex
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_DBOptions_useAdaptiveMutex(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::DBOptions*>(jhandle)->use_adaptive_mutex;
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    setBytesPerSync
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_DBOptions_setBytesPerSync(
+    JNIEnv* env, jobject jobj, jlong jhandle, jlong bytes_per_sync) {
+  reinterpret_cast<rocksdb::DBOptions*>(jhandle)->bytes_per_sync =
+      static_cast<int64_t>(bytes_per_sync);
+}
+
+/*
+ * Class:     org_rocksdb_DBOptions
+ * Method:    bytesPerSync
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_DBOptions_bytesPerSync(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::DBOptions*>(jhandle)->bytes_per_sync;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// rocksdb::WriteOptions
+
+/*
+ * Class:     org_rocksdb_WriteOptions
+ * Method:    newWriteOptions
+ * Signature: ()V
+ */
+void Java_org_rocksdb_WriteOptions_newWriteOptions(
+    JNIEnv* env, jobject jwrite_options) {
+  rocksdb::WriteOptions* op = new rocksdb::WriteOptions();
+  rocksdb::WriteOptionsJni::setHandle(env, jwrite_options, op);
+}
+
+/*
+ * Class:     org_rocksdb_WriteOptions
+ * Method:    disposeInternal
+ * Signature: ()V
+ */
+void Java_org_rocksdb_WriteOptions_disposeInternal(
+    JNIEnv* env, jobject jwrite_options, jlong jhandle) {
+  auto write_options = reinterpret_cast<rocksdb::WriteOptions*>(jhandle);
+  delete write_options;
+
+  rocksdb::WriteOptionsJni::setHandle(env, jwrite_options, nullptr);
+}
+
+/*
+ * Class:     org_rocksdb_WriteOptions
+ * Method:    setSync
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_WriteOptions_setSync(
+  JNIEnv* env, jobject jwrite_options, jlong jhandle, jboolean jflag) {
+  reinterpret_cast<rocksdb::WriteOptions*>(jhandle)->sync = jflag;
+}
+
+/*
+ * Class:     org_rocksdb_WriteOptions
+ * Method:    sync
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_WriteOptions_sync(
+    JNIEnv* env, jobject jwrite_options, jlong jhandle) {
+  return reinterpret_cast<rocksdb::WriteOptions*>(jhandle)->sync;
+}
+
+/*
+ * Class:     org_rocksdb_WriteOptions
+ * Method:    setDisableWAL
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_WriteOptions_setDisableWAL(
+    JNIEnv* env, jobject jwrite_options, jlong jhandle, jboolean jflag) {
+  reinterpret_cast<rocksdb::WriteOptions*>(jhandle)->disableWAL = jflag;
+}
+
+/*
+ * Class:     org_rocksdb_WriteOptions
+ * Method:    disableWAL
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_WriteOptions_disableWAL(
+    JNIEnv* env, jobject jwrite_options, jlong jhandle) {
+  return reinterpret_cast<rocksdb::WriteOptions*>(jhandle)->disableWAL;
+}
+
+/////////////////////////////////////////////////////////////////////
+// rocksdb::ReadOptions
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    newReadOptions
+ * Signature: ()V
+ */
+void Java_org_rocksdb_ReadOptions_newReadOptions(
+    JNIEnv* env, jobject jobj) {
+  auto read_opt = new rocksdb::ReadOptions();
+  rocksdb::ReadOptionsJni::setHandle(env, jobj, read_opt);
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_ReadOptions_disposeInternal(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  delete reinterpret_cast<rocksdb::ReadOptions*>(jhandle);
+  rocksdb::ReadOptionsJni::setHandle(env, jobj, nullptr);
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    setVerifyChecksums
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_ReadOptions_setVerifyChecksums(
+    JNIEnv* env, jobject jobj, jlong jhandle,
+    jboolean jverify_checksums) {
+  reinterpret_cast<rocksdb::ReadOptions*>(jhandle)->verify_checksums =
+      static_cast<bool>(jverify_checksums);
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    verifyChecksums
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_ReadOptions_verifyChecksums(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::ReadOptions*>(
+      jhandle)->verify_checksums;
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    setFillCache
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_ReadOptions_setFillCache(
+    JNIEnv* env, jobject jobj, jlong jhandle, jboolean jfill_cache) {
+  reinterpret_cast<rocksdb::ReadOptions*>(jhandle)->fill_cache =
+      static_cast<bool>(jfill_cache);
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    fillCache
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_ReadOptions_fillCache(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::ReadOptions*>(jhandle)->fill_cache;
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    setTailing
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_ReadOptions_setTailing(
+    JNIEnv* env, jobject jobj, jlong jhandle, jboolean jtailing) {
+  reinterpret_cast<rocksdb::ReadOptions*>(jhandle)->tailing =
+      static_cast<bool>(jtailing);
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    tailing
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_ReadOptions_tailing(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::ReadOptions*>(jhandle)->tailing;
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    setSnapshot
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_ReadOptions_setSnapshot(
+    JNIEnv* env, jobject jobj, jlong jhandle, jlong jsnapshot) {
+  reinterpret_cast<rocksdb::ReadOptions*>(jhandle)->snapshot =
+      reinterpret_cast<rocksdb::Snapshot*>(jsnapshot);
+}
+
+/*
+ * Class:     org_rocksdb_ReadOptions
+ * Method:    snapshot
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_ReadOptions_snapshot(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  auto& snapshot =
+      reinterpret_cast<rocksdb::ReadOptions*>(jhandle)->snapshot;
+  return reinterpret_cast<jlong>(snapshot);
+}
+
+/////////////////////////////////////////////////////////////////////
+// rocksdb::ComparatorOptions
+
+/*
+ * Class:     org_rocksdb_ComparatorOptions
+ * Method:    newComparatorOptions
+ * Signature: ()V
+ */
+void Java_org_rocksdb_ComparatorOptions_newComparatorOptions(
+    JNIEnv* env, jobject jobj) {
+  auto comparator_opt = new rocksdb::ComparatorJniCallbackOptions();
+  rocksdb::ComparatorOptionsJni::setHandle(env, jobj, comparator_opt);
+}
+
+/*
+ * Class:     org_rocksdb_ComparatorOptions
+ * Method:    useAdaptiveMutex
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_ComparatorOptions_useAdaptiveMutex(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::ComparatorJniCallbackOptions*>(jhandle)
+    ->use_adaptive_mutex;
+}
+
+/*
+ * Class:     org_rocksdb_ComparatorOptions
+ * Method:    setUseAdaptiveMutex
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_ComparatorOptions_setUseAdaptiveMutex(
+    JNIEnv* env, jobject jobj, jlong jhandle, jboolean juse_adaptive_mutex) {
+  reinterpret_cast<rocksdb::ComparatorJniCallbackOptions*>(jhandle)
+    ->use_adaptive_mutex = static_cast<bool>(juse_adaptive_mutex);
+}
+
+/*
+ * Class:     org_rocksdb_ComparatorOptions
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_ComparatorOptions_disposeInternal(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  delete reinterpret_cast<rocksdb::ComparatorJniCallbackOptions*>(jhandle);
+  rocksdb::ComparatorOptionsJni::setHandle(env, jobj, nullptr);
+}
+
+/////////////////////////////////////////////////////////////////////
+// rocksdb::FlushOptions
+
+/*
+ * Class:     org_rocksdb_FlushOptions
+ * Method:    newFlushOptions
+ * Signature: ()V
+ */
+void Java_org_rocksdb_FlushOptions_newFlushOptions(
+    JNIEnv* env, jobject jobj) {
+  auto flush_opt = new rocksdb::FlushOptions();
+  rocksdb::FlushOptionsJni::setHandle(env, jobj, flush_opt);
+}
+
+/*
+ * Class:     org_rocksdb_FlushOptions
+ * Method:    setWaitForFlush
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_FlushOptions_setWaitForFlush(
+    JNIEnv* env, jobject jobj, jlong jhandle, jboolean jwait) {
+  reinterpret_cast<rocksdb::FlushOptions*>(jhandle)
+    ->wait = static_cast<bool>(jwait);
+}
+
+/*
+ * Class:     org_rocksdb_FlushOptions
+ * Method:    waitForFlush
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_FlushOptions_waitForFlush(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  return reinterpret_cast<rocksdb::FlushOptions*>(jhandle)
+    ->wait;
+}
+
+/*
+ * Class:     org_rocksdb_FlushOptions
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_FlushOptions_disposeInternal(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  delete reinterpret_cast<rocksdb::FlushOptions*>(jhandle);
+  rocksdb::FlushOptionsJni::setHandle(env, jobj, nullptr);
+}
diff --git a/src/rocksdb/java/rocksjni/portal.h b/src/rocksdb/java/rocksjni/portal.h
new file mode 100644
index 0000000..804bbc6
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/portal.h
@@ -0,0 +1,833 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+// This file caches frequently used class, method, and field IDs and
+// provides an efficient portal (i.e., a set of static functions) for
+// accessing Java code from C++.
+
+#ifndef JAVA_ROCKSJNI_PORTAL_H_
+#define JAVA_ROCKSJNI_PORTAL_H_
+
+#include <jni.h>
+#include <limits>
+#include <string>
+#include <vector>
+
+#include "rocksdb/db.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/status.h"
+#include "rocksdb/utilities/backupable_db.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "rocksjni/comparatorjnicallback.h"
+#include "rocksjni/loggerjnicallback.h"
+#include "rocksjni/writebatchhandlerjnicallback.h"
+
+namespace rocksdb {
+
+// Detect if jlong overflows size_t
+inline Status check_if_jlong_fits_size_t(const jlong& jvalue) {
+  Status s = Status::OK();
+  if (static_cast<uint64_t>(jvalue) > std::numeric_limits<size_t>::max()) {
+    s = Status::InvalidArgument(Slice("jlong value overflows size_t."));
+  }
+  return s;
+}
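+
+// A worked example (illustrative only, not part of the original code): on a
+// 32-bit build, where size_t is 32 bits wide, passing jlong(1) << 40 yields
+// an InvalidArgument status, whereas a typical 64-bit build accepts any
+// jlong, since the unsigned comparison can never exceed a 64-bit size_t.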
+
+// Native class template
+template<class PTR, class DERIVED> class RocksDBNativeClass {
+ public:
+  // Get the java class id
+  static jclass getJClass(JNIEnv* env, const char* jclazz_name) {
+    jclass jclazz = env->FindClass(jclazz_name);
+    assert(jclazz != nullptr);
+    return jclazz;
+  }
+
+  // Get the field id of the member variable that stores
+  // the native pointer
+  static jfieldID getHandleFieldID(JNIEnv* env) {
+    static jfieldID fid = env->GetFieldID(
+        DERIVED::getJClass(env), "nativeHandle_", "J");
+    assert(fid != nullptr);
+    return fid;
+  }
+
+  // Get the pointer from Java
+  static PTR getHandle(JNIEnv* env, jobject jobj) {
+    return reinterpret_cast<PTR>(
+        env->GetLongField(jobj, getHandleFieldID(env)));
+  }
+
+  // Pass the pointer to the java side.
+  static void setHandle(JNIEnv* env, jobject jdb, PTR ptr) {
+    env->SetLongField(
+        jdb, getHandleFieldID(env),
+        reinterpret_cast<jlong>(ptr));
+  }
+};
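+
+// A minimal usage sketch (illustrative only, not part of the original code)
+// of how a native method round-trips a pointer through this template, using
+// the WriteOptionsJni portal defined later in this header:
+//
+//   auto* op = new rocksdb::WriteOptions();
+//   WriteOptionsJni::setHandle(env, jwrite_options, op);  // publish to Java
+//   auto* same = WriteOptionsJni::getHandle(env, jwrite_options);
+//   assert(same == op);
+//   delete same;                                          // dispose
+//   WriteOptionsJni::setHandle(env, jwrite_options, nullptr);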
+
+// Java Exception template
+template<class DERIVED> class RocksDBJavaException {
+ public:
+  // Get the java class id
+  static jclass getJClass(JNIEnv* env, const char* jclazz_name) {
+    jclass jclazz = env->FindClass(jclazz_name);
+    assert(jclazz != nullptr);
+    return jclazz;
+  }
+
+  // Create and throw a java exception by converting the input
+  // Status.
+  //
+  // If s.ok() is true, this function will not throw any
+  // exception.
+  static void ThrowNew(JNIEnv* env, Status s) {
+    if (s.ok()) {
+      return;
+    }
+    jstring msg = env->NewStringUTF(s.ToString().c_str());
+    // get the constructor id of the derived exception class
+    static jmethodID mid = env->GetMethodID(
+        DERIVED::getJClass(env), "<init>", "(Ljava/lang/String;)V");
+    assert(mid != nullptr);
+
+    env->Throw((jthrowable)env->NewObject(DERIVED::getJClass(env),
+        mid, msg));
+  }
+};
+
+// The portal class for org.rocksdb.RocksDB
+class RocksDBJni : public RocksDBNativeClass<rocksdb::DB*, RocksDBJni> {
+ public:
+  // Get the java class id of org.rocksdb.RocksDB.
+  static jclass getJClass(JNIEnv* env) {
+    return RocksDBNativeClass::getJClass(env, "org/rocksdb/RocksDB");
+  }
+};
+
+// The portal class for org.rocksdb.RocksDBException
+class RocksDBExceptionJni :
+    public RocksDBJavaException<RocksDBExceptionJni> {
+ public:
+  // Get the java class id of org.rocksdb.RocksDBException
+  static jclass getJClass(JNIEnv* env) {
+    return RocksDBJavaException::getJClass(env,
+        "org/rocksdb/RocksDBException");
+  }
+};
+
+// The portal class for java.lang.IllegalArgumentException
+class IllegalArgumentExceptionJni :
+    public RocksDBJavaException<IllegalArgumentExceptionJni> {
+ public:
+  // Get the java class id of java.lang.IllegalArgumentException
+  static jclass getJClass(JNIEnv* env) {
+    return RocksDBJavaException::getJClass(env,
+        "java/lang/IllegalArgumentException");
+  }
+};
+
+
+// The portal class for org.rocksdb.Options
+class OptionsJni : public RocksDBNativeClass<
+    rocksdb::Options*, OptionsJni> {
+ public:
+  static jclass getJClass(JNIEnv* env) {
+    return RocksDBNativeClass::getJClass(env, "org/rocksdb/Options");
+  }
+};
+
+// The portal class for org.rocksdb.DBOptions
+class DBOptionsJni : public RocksDBNativeClass<
+    rocksdb::DBOptions*, DBOptionsJni> {
+ public:
+  static jclass getJClass(JNIEnv* env) {
+    return RocksDBNativeClass::getJClass(env, "org/rocksdb/DBOptions");
+  }
+};
+
+class ColumnFamilyDescriptorJni {
+ public:
+  // Get the java class id of org.rocksdb.ColumnFamilyDescriptor
+  static jclass getColumnFamilyDescriptorClass(JNIEnv* env) {
+    jclass jclazz = env->FindClass("org/rocksdb/ColumnFamilyDescriptor");
+    assert(jclazz != nullptr);
+    return jclazz;
+  }
+
+  // Get the java method id of columnFamilyName
+  static jmethodID getColumnFamilyNameMethod(JNIEnv* env) {
+    static jmethodID mid = env->GetMethodID(
+        getColumnFamilyDescriptorClass(env),
+        "columnFamilyName", "()[B");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  // Get the java method id of columnFamilyOptions
+  static jmethodID getColumnFamilyOptionsMethod(JNIEnv* env) {
+    static jmethodID mid = env->GetMethodID(
+        getColumnFamilyDescriptorClass(env),
+        "columnFamilyOptions", "()Lorg/rocksdb/ColumnFamilyOptions;");
+    assert(mid != nullptr);
+    return mid;
+  }
+};
+
+// The portal class for org.rocksdb.ColumnFamilyOptions
+class ColumnFamilyOptionsJni : public RocksDBNativeClass<
+    rocksdb::ColumnFamilyOptions*, ColumnFamilyOptionsJni> {
+ public:
+  static jclass getJClass(JNIEnv* env) {
+    return RocksDBNativeClass::getJClass(env,
+        "org/rocksdb/ColumnFamilyOptions");
+  }
+};
+
+// The portal class for org.rocksdb.WriteOptions
+class WriteOptionsJni : public RocksDBNativeClass<
+    rocksdb::WriteOptions*, WriteOptionsJni> {
+ public:
+  static jclass getJClass(JNIEnv* env) {
+    return RocksDBNativeClass::getJClass(env,
+        "org/rocksdb/WriteOptions");
+  }
+};
+
+// The portal class for org.rocksdb.ReadOptions
+class ReadOptionsJni : public RocksDBNativeClass<
+    rocksdb::ReadOptions*, ReadOptionsJni> {
+ public:
+  static jclass getJClass(JNIEnv* env) {
+    return RocksDBNativeClass::getJClass(env,
+        "org/rocksdb/ReadOptions");
+  }
+};
+
+// The portal class for org.rocksdb.WriteBatch
+class WriteBatchJni : public RocksDBNativeClass<
+    rocksdb::WriteBatch*, WriteBatchJni> {
+ public:
+  static jclass getJClass(JNIEnv* env) {
+    return RocksDBNativeClass::getJClass(env,
+        "org/rocksdb/WriteBatch");
+  }
+};
+
+// The portal class for org.rocksdb.WriteBatch.Handler
+class WriteBatchHandlerJni : public RocksDBNativeClass<
+    const rocksdb::WriteBatchHandlerJniCallback*,
+    WriteBatchHandlerJni> {
+ public:
+  static jclass getJClass(JNIEnv* env) {
+    return RocksDBNativeClass::getJClass(env,
+        "org/rocksdb/WriteBatch$Handler");
+  }
+
+  // Get the java method `put` of org.rocksdb.WriteBatch.Handler.
+  static jmethodID getPutMethodId(JNIEnv* env) {
+    static jmethodID mid = env->GetMethodID(
+        getJClass(env), "put", "([B[B)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  // Get the java method `merge` of org.rocksdb.WriteBatch.Handler.
+  static jmethodID getMergeMethodId(JNIEnv* env) {
+    static jmethodID mid = env->GetMethodID(
+        getJClass(env), "merge", "([B[B)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  // Get the java method `delete` of org.rocksdb.WriteBatch.Handler.
+  static jmethodID getDeleteMethodId(JNIEnv* env) {
+    static jmethodID mid = env->GetMethodID(
+        getJClass(env), "delete", "([B)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  // Get the java method `logData` of org.rocksdb.WriteBatch.Handler.
+  static jmethodID getLogDataMethodId(JNIEnv* env) {
+    static jmethodID mid = env->GetMethodID(
+        getJClass(env), "logData", "([B)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  // Get the java method `shouldContinue` of org.rocksdb.WriteBatch.Handler.
+  static jmethodID getContinueMethodId(JNIEnv* env) {
+    static jmethodID mid = env->GetMethodID(
+        getJClass(env), "shouldContinue", "()Z");
+    assert(mid != nullptr);
+    return mid;
+  }
+};
+
+// The portal class for org.rocksdb.WriteBatchWithIndex
+class WriteBatchWithIndexJni : public RocksDBNativeClass<
+    rocksdb::WriteBatchWithIndex*, WriteBatchWithIndexJni> {
+ public:
+  static jclass getJClass(JNIEnv* env) {
+    return RocksDBNativeClass::getJClass(env,
+        "org/rocksdb/WriteBatchWithIndex");
+  }
+};
+
+class HistogramDataJni {
+ public:
+  static jmethodID getConstructorMethodId(JNIEnv* env, jclass jclazz) {
+    static jmethodID mid = env->GetMethodID(jclazz, "<init>", "(DDDDD)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+};
+
+// The portal class for org.rocksdb.BackupableDBOptions
+class BackupableDBOptionsJni : public RocksDBNativeClass<
+    rocksdb::BackupableDBOptions*, BackupableDBOptionsJni> {
+ public:
+  static jclass getJClass(JNIEnv* env) {
+    return RocksDBNativeClass::getJClass(env,
+        "org/rocksdb/BackupableDBOptions");
+  }
+};
+
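+// The portal class for org.rocksdb.BackupEngine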
+class BackupEngineJni : public RocksDBNativeClass<
+    rocksdb::BackupEngine*, BackupEngineJni> {
+ public:
+  static jclass getJClass(JNIEnv* env) {
+    return RocksDBNativeClass::getJClass(env,
+        "org/rocksdb/BackupEngine");
+  }
+};
+
+// The portal class for org.rocksdb.RocksIterator
+class IteratorJni : public RocksDBNativeClass<
+    rocksdb::Iterator*, IteratorJni> {
+ public:
+  static jclass getJClass(JNIEnv* env) {
+    return RocksDBNativeClass::getJClass(env,
+        "org/rocksdb/RocksIterator");
+  }
+};
+
+// The portal class for org.rocksdb.Filter
+class FilterJni : public RocksDBNativeClass<
+    std::shared_ptr<rocksdb::FilterPolicy>*, FilterJni> {
+ public:
+  static jclass getJClass(JNIEnv* env) {
+    return RocksDBNativeClass::getJClass(env,
+        "org/rocksdb/Filter");
+  }
+};
+
+// The portal class for org.rocksdb.ColumnFamilyHandle
+class ColumnFamilyHandleJni : public RocksDBNativeClass<
+    rocksdb::ColumnFamilyHandle*, ColumnFamilyHandleJni> {
+ public:
+  static jclass getJClass(JNIEnv* env) {
+    return RocksDBNativeClass::getJClass(env,
+        "org/rocksdb/ColumnFamilyHandle");
+  }
+};
+
+// The portal class for org.rocksdb.FlushOptions
+class FlushOptionsJni : public RocksDBNativeClass<
+    rocksdb::FlushOptions*, FlushOptionsJni> {
+ public:
+  static jclass getJClass(JNIEnv* env) {
+    return RocksDBNativeClass::getJClass(env,
+        "org/rocksdb/FlushOptions");
+  }
+};
+
+// The portal class for org.rocksdb.ComparatorOptions
+class ComparatorOptionsJni : public RocksDBNativeClass<
+    rocksdb::ComparatorJniCallbackOptions*, ComparatorOptionsJni> {
+ public:
+  static jclass getJClass(JNIEnv* env) {
+    return RocksDBNativeClass::getJClass(env,
+        "org/rocksdb/ComparatorOptions");
+  }
+};
+
+// The portal class for org.rocksdb.AbstractComparator
+class AbstractComparatorJni : public RocksDBNativeClass<
+    const rocksdb::BaseComparatorJniCallback*,
+    AbstractComparatorJni> {
+ public:
+  static jclass getJClass(JNIEnv* env) {
+    return RocksDBNativeClass::getJClass(env,
+        "org/rocksdb/AbstractComparator");
+  }
+
+  // Get the java method `name` of org.rocksdb.AbstractComparator.
+  static jmethodID getNameMethodId(JNIEnv* env) {
+    static jmethodID mid = env->GetMethodID(
+        getJClass(env), "name", "()Ljava/lang/String;");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  // Get the java method `compare` of org.rocksdb.AbstractComparator.
+  static jmethodID getCompareMethodId(JNIEnv* env) {
+    static jmethodID mid = env->GetMethodID(getJClass(env),
+      "compare",
+      "(Lorg/rocksdb/AbstractSlice;Lorg/rocksdb/AbstractSlice;)I");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  // Get the java method `findShortestSeparator` of
+  // org.rocksdb.AbstractComparator.
+  static jmethodID getFindShortestSeparatorMethodId(JNIEnv* env) {
+    static jmethodID mid = env->GetMethodID(getJClass(env),
+      "findShortestSeparator",
+      "(Ljava/lang/String;Lorg/rocksdb/AbstractSlice;)Ljava/lang/String;");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  // Get the java method `findShortSuccessor` of
+  // org.rocksdb.AbstractComparator.
+  static jmethodID getFindShortSuccessorMethodId(JNIEnv* env) {
+    static jmethodID mid = env->GetMethodID(getJClass(env),
+      "findShortSuccessor",
+      "(Ljava/lang/String;)Ljava/lang/String;");
+    assert(mid != nullptr);
+    return mid;
+  }
+};
+
+// The portal class for org.rocksdb.AbstractSlice
+class AbstractSliceJni : public RocksDBNativeClass<
+    const rocksdb::Slice*, AbstractSliceJni> {
+ public:
+  static jclass getJClass(JNIEnv* env) {
+    return RocksDBNativeClass::getJClass(env,
+        "org/rocksdb/AbstractSlice");
+  }
+};
+
+class SliceJni {
+ public:
+  // Get the java class id of org.rocksdb.Slice.
+  static jclass getJClass(JNIEnv* env) {
+    jclass jclazz = env->FindClass("org/rocksdb/Slice");
+    assert(jclazz != nullptr);
+    return jclazz;
+  }
+
+  static jobject construct0(JNIEnv* env) {
+    static jmethodID mid = env->GetMethodID(getJClass(env), "<init>", "()V");
+    assert(mid != nullptr);
+    return env->NewObject(getJClass(env), mid);
+  }
+};
+
+class DirectSliceJni {
+ public:
+  // Get the java class id of org.rocksdb.DirectSlice.
+  static jclass getJClass(JNIEnv* env) {
+    jclass jclazz = env->FindClass("org/rocksdb/DirectSlice");
+    assert(jclazz != nullptr);
+    return jclazz;
+  }
+
+  static jobject construct0(JNIEnv* env) {
+    static jmethodID mid = env->GetMethodID(getJClass(env), "<init>", "()V");
+    assert(mid != nullptr);
+    return env->NewObject(getJClass(env), mid);
+  }
+};
+
+class ListJni {
+ public:
+  // Get the java class id of java.util.List.
+  static jclass getListClass(JNIEnv* env) {
+    jclass jclazz = env->FindClass("java/util/List");
+    assert(jclazz != nullptr);
+    return jclazz;
+  }
+
+  // Get the java class id of java.util.ArrayList.
+  static jclass getArrayListClass(JNIEnv* env) {
+    jclass jclazz = env->FindClass("java/util/ArrayList");
+    assert(jclazz != nullptr);
+    return jclazz;
+  }
+
+  // Get the java class id of java.util.Iterator.
+  static jclass getIteratorClass(JNIEnv* env) {
+    jclass jclazz = env->FindClass("java/util/Iterator");
+    assert(jclazz != nullptr);
+    return jclazz;
+  }
+
+  // Get the java method id of java.util.List.iterator().
+  static jmethodID getIteratorMethod(JNIEnv* env) {
+    static jmethodID mid = env->GetMethodID(
+        getListClass(env), "iterator", "()Ljava/util/Iterator;");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  // Get the java method id of java.util.Iterator.hasNext().
+  static jmethodID getHasNextMethod(JNIEnv* env) {
+    static jmethodID mid = env->GetMethodID(
+        getIteratorClass(env), "hasNext", "()Z");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  // Get the java method id of java.util.Iterator.next().
+  static jmethodID getNextMethod(JNIEnv* env) {
+    static jmethodID mid = env->GetMethodID(
+        getIteratorClass(env), "next", "()Ljava/lang/Object;");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  // Get the java method id of arrayList constructor.
+  static jmethodID getArrayListConstructorMethodId(JNIEnv* env, jclass jclazz) {
+    static jmethodID mid = env->GetMethodID(
+        jclazz, "<init>", "(I)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  // Get the java method id of java.util.List.add().
+  static jmethodID getListAddMethodId(JNIEnv* env) {
+    static jmethodID mid = env->GetMethodID(
+        getListClass(env), "add", "(Ljava/lang/Object;)Z");
+    assert(mid != nullptr);
+    return mid;
+  }
+};
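+
+// A minimal sketch (illustrative only, not part of the original code) of
+// iterating a java.util.List from C++ with the cached ids above, assuming
+// `jlist` references a java.util.List:
+//
+//   jobject it = env->CallObjectMethod(jlist,
+//       ListJni::getIteratorMethod(env));
+//   while (env->CallBooleanMethod(it,
+//       ListJni::getHasNextMethod(env)) == JNI_TRUE) {
+//     jobject element = env->CallObjectMethod(it,
+//         ListJni::getNextMethod(env));
+//     // ... use element ...
+//   }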
+
+class ByteJni {
+ public:
+  // Get the java class id of java.lang.Byte.
+  static jclass getByteClass(JNIEnv* env) {
+    jclass jclazz = env->FindClass("java/lang/Byte");
+    assert(jclazz != nullptr);
+    return jclazz;
+  }
+
+  // Get the java method id of java.lang.Byte.byteValue.
+  static jmethodID getByteValueMethod(JNIEnv* env) {
+    static jmethodID mid = env->GetMethodID(
+        getByteClass(env), "byteValue", "()B");
+    assert(mid != nullptr);
+    return mid;
+  }
+};
+
+class BackupInfoJni {
+ public:
+  // Get the java class id of org.rocksdb.BackupInfo.
+  static jclass getJClass(JNIEnv* env) {
+    jclass jclazz = env->FindClass("org/rocksdb/BackupInfo");
+    assert(jclazz != nullptr);
+    return jclazz;
+  }
+
+  static jobject construct0(JNIEnv* env, uint32_t backup_id, int64_t timestamp,
+      uint64_t size, uint32_t number_files) {
+    static jmethodID mid = env->GetMethodID(getJClass(env), "<init>",
+        "(IJJI)V");
+    assert(mid != nullptr);
+    return env->NewObject(getJClass(env), mid,
+        backup_id, timestamp, size, number_files);
+  }
+};
+
+class BackupInfoListJni {
+ public:
+  static jobject getBackupInfo(JNIEnv* env,
+      std::vector<BackupInfo> backup_infos) {
+    jclass jclazz = env->FindClass("java/util/ArrayList");
+    jmethodID mid = rocksdb::ListJni::getArrayListConstructorMethodId(
+        env, jclazz);
+    jobject jbackup_info_handle_list = env->NewObject(jclazz, mid,
+        backup_infos.size());
+    // insert in java list
+    for (std::vector<rocksdb::BackupInfo>::size_type i = 0;
+        i != backup_infos.size(); i++) {
+      rocksdb::BackupInfo backup_info = backup_infos[i];
+      jobject obj = rocksdb::BackupInfoJni::construct0(env,
+          backup_info.backup_id,
+          backup_info.timestamp,
+          backup_info.size,
+          backup_info.number_files);
+      env->CallBooleanMethod(jbackup_info_handle_list,
+          rocksdb::ListJni::getListAddMethodId(env), obj);
+    }
+    return jbackup_info_handle_list;
+  }
+};
+
+class WBWIRocksIteratorJni {
+ public:
+    // Get the java class id of org.rocksdb.WBWIRocksIterator.
+    static jclass getJClass(JNIEnv* env) {
+      static jclass jclazz = env->FindClass("org/rocksdb/WBWIRocksIterator");
+      assert(jclazz != nullptr);
+      return jclazz;
+    }
+
+    static jfieldID getWriteEntryField(JNIEnv* env) {
+      static jfieldID fid =
+          env->GetFieldID(getJClass(env), "entry",
+          "Lorg/rocksdb/WBWIRocksIterator$WriteEntry;");
+      assert(fid != nullptr);
+      return fid;
+    }
+
+    static jobject getWriteEntry(JNIEnv* env, jobject jwbwi_rocks_iterator) {
+      jobject jwe =
+          env->GetObjectField(jwbwi_rocks_iterator, getWriteEntryField(env));
+      assert(jwe != nullptr);
+      return jwe;
+    }
+};
+
+class WriteTypeJni {
+ public:
+    // Get the PUT enum field of org.rocksdb.WBWIRocksIterator.WriteType
+    static jobject PUT(JNIEnv* env) {
+      return getEnum(env, "PUT");
+    }
+
+    // Get the MERGE enum field of org.rocksdb.WBWIRocksIterator.WriteType
+    static jobject MERGE(JNIEnv* env) {
+      return getEnum(env, "MERGE");
+    }
+
+    // Get the DELETE enum field of org.rocksdb.WBWIRocksIterator.WriteType
+    static jobject DELETE(JNIEnv* env) {
+      return getEnum(env, "DELETE");
+    }
+
+    // Get the LOG enum field of org.rocksdb.WBWIRocksIterator.WriteType
+    static jobject LOG(JNIEnv* env) {
+      return getEnum(env, "LOG");
+    }
+
+ private:
+    // Get the java class id of org.rocksdb.WBWIRocksIterator.WriteType.
+    static jclass getJClass(JNIEnv* env) {
+      jclass jclazz = env->FindClass("org/rocksdb/WBWIRocksIterator$WriteType");
+      assert(jclazz != nullptr);
+      return jclazz;
+    }
+
+    // Get an enum field of org.rocksdb.WBWIRocksIterator.WriteType
+    static jobject getEnum(JNIEnv* env, const char name[]) {
+      jclass jclazz = getJClass(env);
+      jfieldID jfid =
+          env->GetStaticFieldID(jclazz, name,
+          "Lorg/rocksdb/WBWIRocksIterator$WriteType;");
+      assert(jfid != nullptr);
+      return env->GetStaticObjectField(jclazz, jfid);
+    }
+};
+
+class WriteEntryJni {
+ public:
+    // Get the java class id of org.rocksdb.WBWIRocksIterator.WriteEntry.
+    static jclass getJClass(JNIEnv* env) {
+      // FindClass returns a local reference, so it is not cached in a static
+      jclass jclazz =
+          env->FindClass("org/rocksdb/WBWIRocksIterator$WriteEntry");
+      assert(jclazz != nullptr);
+      return jclazz;
+    }
+
+    static void setWriteType(JNIEnv* env, jobject jwrite_entry,
+        WriteType write_type) {
+      jobject jwrite_type;
+      switch (write_type) {
+        case kPutRecord:
+          jwrite_type = WriteTypeJni::PUT(env);
+          break;
+
+        case kMergeRecord:
+          jwrite_type = WriteTypeJni::MERGE(env);
+          break;
+
+        case kDeleteRecord:
+          jwrite_type = WriteTypeJni::DELETE(env);
+          break;
+
+        case kLogDataRecord:
+          jwrite_type = WriteTypeJni::LOG(env);
+          break;
+
+        default:
+          jwrite_type = nullptr;
+      }
+      assert(jwrite_type != nullptr);
+      env->SetObjectField(jwrite_entry, getWriteTypeField(env), jwrite_type);
+    }
+
+    static void setKey(JNIEnv* env, jobject jwrite_entry,
+        const rocksdb::Slice* slice) {
+      jobject jkey = env->GetObjectField(jwrite_entry, getKeyField(env));
+      AbstractSliceJni::setHandle(env, jkey, slice);
+    }
+
+    static void setValue(JNIEnv* env, jobject jwrite_entry,
+        const rocksdb::Slice* slice) {
+      jobject jvalue = env->GetObjectField(jwrite_entry, getValueField(env));
+      AbstractSliceJni::setHandle(env, jvalue, slice);
+    }
+
+ private:
+    static jfieldID getWriteTypeField(JNIEnv* env) {
+      static jfieldID fid = env->GetFieldID(
+          getJClass(env), "type", "Lorg/rocksdb/WBWIRocksIterator$WriteType;");
+      assert(fid != nullptr);
+      return fid;
+    }
+
+    static jfieldID getKeyField(JNIEnv* env) {
+      static jfieldID fid = env->GetFieldID(
+          getJClass(env), "key", "Lorg/rocksdb/DirectSlice;");
+      assert(fid != nullptr);
+      return fid;
+    }
+
+    static jfieldID getValueField(JNIEnv* env) {
+      static jfieldID fid = env->GetFieldID(
+          getJClass(env), "value", "Lorg/rocksdb/DirectSlice;");
+      assert(fid != nullptr);
+      return fid;
+    }
+};
+
+class InfoLogLevelJni {
+ public:
+    // Get the DEBUG_LEVEL enum field of org.rocksdb.InfoLogLevel
+    static jobject DEBUG_LEVEL(JNIEnv* env) {
+      return getEnum(env, "DEBUG_LEVEL");
+    }
+
+    // Get the INFO_LEVEL enum field of org.rocksdb.InfoLogLevel
+    static jobject INFO_LEVEL(JNIEnv* env) {
+      return getEnum(env, "INFO_LEVEL");
+    }
+
+    // Get the WARN_LEVEL enum field of org.rocksdb.InfoLogLevel
+    static jobject WARN_LEVEL(JNIEnv* env) {
+      return getEnum(env, "WARN_LEVEL");
+    }
+
+    // Get the ERROR_LEVEL enum field of org.rocksdb.InfoLogLevel
+    static jobject ERROR_LEVEL(JNIEnv* env) {
+      return getEnum(env, "ERROR_LEVEL");
+    }
+
+    // Get the FATAL_LEVEL enum field of org.rocksdb.InfoLogLevel
+    static jobject FATAL_LEVEL(JNIEnv* env) {
+      return getEnum(env, "FATAL_LEVEL");
+    }
+
+ private:
+    // Get the java class id of org.rocksdb.InfoLogLevel.
+    static jclass getJClass(JNIEnv* env) {
+      jclass jclazz = env->FindClass("org/rocksdb/InfoLogLevel");
+      assert(jclazz != nullptr);
+      return jclazz;
+    }
+
+    // Get an enum field of org.rocksdb.InfoLogLevel
+    static jobject getEnum(JNIEnv* env, const char name[]) {
+      jclass jclazz = getJClass(env);
+      jfieldID jfid =
+          env->GetStaticFieldID(jclazz, name,
+          "Lorg/rocksdb/InfoLogLevel;");
+      assert(jfid != nullptr);
+      return env->GetStaticObjectField(jclazz, jfid);
+    }
+};
+
+// The portal class for org.rocksdb.Logger
+class LoggerJni : public RocksDBNativeClass<
+    std::shared_ptr<rocksdb::LoggerJniCallback>*, LoggerJni> {
+ public:
+  static jclass getJClass(JNIEnv* env) {
+    return RocksDBNativeClass::getJClass(env,
+        "org/rocksdb/Logger");
+  }
+
+  // Get the java method `log` of org.rocksdb.Logger.
+  static jmethodID getLogMethodId(JNIEnv* env) {
+    static jmethodID mid = env->GetMethodID(
+        getJClass(env), "log",
+        "(Lorg/rocksdb/InfoLogLevel;Ljava/lang/String;)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+};
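+
+// A minimal sketch (illustrative only, not part of the original code) of how
+// a native callback might invoke the cached method id, assuming `jlogger` is
+// a global reference to an org.rocksdb.Logger instance and `msg` is a
+// std::string:
+//
+//   jstring jmsg = env->NewStringUTF(msg.c_str());
+//   env->CallVoidMethod(jlogger, LoggerJni::getLogMethodId(env),
+//       InfoLogLevelJni::INFO_LEVEL(env), jmsg);
+//   env->DeleteLocalRef(jmsg);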
+
+class JniUtil {
+ public:
+    /*
+     * Copies a jstring to a std::string and releases the
+     * temporary UTF-8 buffer obtained from the JVM
+     */
+    static std::string copyString(JNIEnv* env, jstring js) {
+      const char *utf = env->GetStringUTFChars(js, nullptr);
+      std::string name(utf);
+      env->ReleaseStringUTFChars(js, utf);
+      return name;
+    }
+
+    /*
+     * Helper for operations on a key and value,
+     * for example WriteBatch->Put
+     *
+     * TODO(AR) could be extended to cover returning rocksdb::Status
+     * from `op` and used for RocksDB->Put etc.
+     */
+    static void kv_op(
+        std::function<void(rocksdb::Slice, rocksdb::Slice)> op,
+        JNIEnv* env, jobject jobj,
+        jbyteArray jkey, jint jkey_len,
+        jbyteArray jentry_value, jint jentry_value_len) {
+      jbyte* key = env->GetByteArrayElements(jkey, nullptr);
+      jbyte* value = env->GetByteArrayElements(jentry_value, nullptr);
+      rocksdb::Slice key_slice(reinterpret_cast<char*>(key), jkey_len);
+      rocksdb::Slice value_slice(reinterpret_cast<char*>(value),
+          jentry_value_len);
+
+      op(key_slice, value_slice);
+
+      env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
+      env->ReleaseByteArrayElements(jentry_value, value, JNI_ABORT);
+    }
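+
+    /*
+     * A minimal usage sketch (illustrative only, not part of the original
+     * code): a WriteBatch `put` native method could forward its byte
+     * arrays through kv_op with a lambda, e.g.
+     *
+     *   auto* wb = reinterpret_cast<rocksdb::WriteBatch*>(jwb_handle);
+     *   JniUtil::kv_op(
+     *       [&wb](rocksdb::Slice key, rocksdb::Slice value) {
+     *         wb->Put(key, value);
+     *       }, env, jobj, jkey, jkey_len, jentry_value, jentry_value_len);
+     */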
+
+    /*
+     * Helper for operations on a key,
+     * for example WriteBatch->Delete
+     *
+     * TODO(AR) could be extended to cover returning rocksdb::Status
+     * from `op` and used for RocksDB->Delete etc.
+     */
+    static void k_op(
+        std::function<void(rocksdb::Slice)> op,
+        JNIEnv* env, jobject jobj,
+        jbyteArray jkey, jint jkey_len) {
+      jbyte* key = env->GetByteArrayElements(jkey, nullptr);
+      rocksdb::Slice key_slice(reinterpret_cast<char*>(key), jkey_len);
+
+      op(key_slice);
+
+      env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
+    }
+};
+
+}  // namespace rocksdb
+#endif  // JAVA_ROCKSJNI_PORTAL_H_
diff --git a/src/rocksdb/java/rocksjni/ratelimiterjni.cc b/src/rocksdb/java/rocksjni/ratelimiterjni.cc
new file mode 100644
index 0000000..ab6160e
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/ratelimiterjni.cc
@@ -0,0 +1,24 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// This file implements the "bridge" between Java and C++ for RateLimiter.
+
+#include "rocksjni/portal.h"
+#include "include/org_rocksdb_GenericRateLimiterConfig.h"
+#include "rocksdb/rate_limiter.h"
+
+/*
+ * Class:     org_rocksdb_GenericRateLimiterConfig
+ * Method:    newRateLimiterHandle
+ * Signature: (JJI)J
+ */
+jlong Java_org_rocksdb_GenericRateLimiterConfig_newRateLimiterHandle(
+    JNIEnv* env, jobject jobj, jlong jrate_bytes_per_second,
+    jlong jrefill_period_micros, jint jfairness) {
+  return reinterpret_cast<jlong>(rocksdb::NewGenericRateLimiter(
+      static_cast<int64_t>(jrate_bytes_per_second),
+      static_cast<int64_t>(jrefill_period_micros),
+      static_cast<int32_t>(jfairness)));
+}
diff --git a/src/rocksdb/java/rocksjni/remove_emptyvalue_compactionfilterjni.cc b/src/rocksdb/java/rocksjni/remove_emptyvalue_compactionfilterjni.cc
new file mode 100644
index 0000000..e442d8d
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/remove_emptyvalue_compactionfilterjni.cc
@@ -0,0 +1,27 @@
+// Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#include <jni.h>
+
+#include "include/org_rocksdb_RemoveEmptyValueCompactionFilter.h"
+#include "utilities/compaction_filters/remove_emptyvalue_compactionfilter.h"
+
+
+/*
+ * Class:     org_rocksdb_RemoveEmptyValueCompactionFilter
+ * Method:    createNewRemoveEmptyValueCompactionFilter0
+ * Signature: ()V
+ */
+void Java_org_rocksdb_RemoveEmptyValueCompactionFilter_createNewRemoveEmptyValueCompactionFilter0(
+    JNIEnv* env, jobject jobj) {
+  const rocksdb::RemoveEmptyValueCompactionFilter* compaction_filter =
+      new rocksdb::RemoveEmptyValueCompactionFilter();
+
+  // set the native handle to our native compaction filter
+  // (FindClass returns a local reference, so it is not cached in a static;
+  // the jfieldID, by contrast, remains valid while the class stays loaded)
+  jclass jclazz =
+      env->FindClass("org/rocksdb/RemoveEmptyValueCompactionFilter");
+  static jfieldID fid = env->GetFieldID(jclazz, "nativeHandle_", "J");
+  env->SetLongField(jobj, fid, reinterpret_cast<jlong>(compaction_filter));
+}
diff --git a/src/rocksdb/java/rocksjni/restorejni.cc b/src/rocksdb/java/rocksjni/restorejni.cc
new file mode 100644
index 0000000..a234163
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/restorejni.cc
@@ -0,0 +1,203 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// This file implements the "bridge" between Java and C++ and enables
+// calling c++ rocksdb::RestoreBackupableDB and rocksdb::RestoreOptions methods
+// from Java side.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <jni.h>
+#include <string>
+
+#include "include/org_rocksdb_RestoreOptions.h"
+#include "include/org_rocksdb_RestoreBackupableDB.h"
+#include "rocksjni/portal.h"
+#include "rocksdb/utilities/backupable_db.h"
+
+/*
+ * Class:     org_rocksdb_RestoreOptions
+ * Method:    newRestoreOptions
+ * Signature: (Z)J
+ */
+jlong Java_org_rocksdb_RestoreOptions_newRestoreOptions(JNIEnv* env,
+    jobject jobj, jboolean keep_log_files) {
+  auto ropt = new rocksdb::RestoreOptions(keep_log_files);
+  return reinterpret_cast<jlong>(ropt);
+}
+
+/*
+ * Class:     org_rocksdb_RestoreOptions
+ * Method:    dispose
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_RestoreOptions_dispose(JNIEnv* env, jobject jobj,
+    jlong jhandle) {
+  auto ropt = reinterpret_cast<rocksdb::RestoreOptions*>(jhandle);
+  assert(ropt);
+  delete ropt;
+}
+
+/*
+ * Class:     org_rocksdb_RestoreBackupableDB
+ * Method:    newRestoreBackupableDB
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_RestoreBackupableDB_newRestoreBackupableDB(JNIEnv* env,
+    jobject jobj, jlong jopt_handle) {
+  auto opt = reinterpret_cast<rocksdb::BackupableDBOptions*>(jopt_handle);
+  auto rdb = new rocksdb::RestoreBackupableDB(rocksdb::Env::Default(), *opt);
+  return reinterpret_cast<jlong>(rdb);
+}
+
+/*
+ * Class:     org_rocksdb_RestoreBackupableDB
+ * Method:    restoreDBFromBackup0
+ * Signature: (JJLjava/lang/String;Ljava/lang/String;J)V
+ */
+void Java_org_rocksdb_RestoreBackupableDB_restoreDBFromBackup0(JNIEnv* env,
+    jobject jobj, jlong jhandle, jlong jbackup_id, jstring jdb_dir,
+    jstring jwal_dir, jlong jopt_handle) {
+  auto opt = reinterpret_cast<rocksdb::RestoreOptions*>(jopt_handle);
+
+  const char* cdb_dir = env->GetStringUTFChars(jdb_dir, 0);
+  const char* cwal_dir = env->GetStringUTFChars(jwal_dir, 0);
+
+  auto rdb = reinterpret_cast<rocksdb::RestoreBackupableDB*>(jhandle);
+  rocksdb::Status s = rdb->RestoreDBFromBackup(
+      static_cast<rocksdb::BackupID>(jbackup_id), cdb_dir, cwal_dir, *opt);
+
+  env->ReleaseStringUTFChars(jdb_dir, cdb_dir);
+  env->ReleaseStringUTFChars(jwal_dir, cwal_dir);
+
+  if (!s.ok()) {
+    rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_RestoreBackupableDB
+ * Method:    restoreDBFromLatestBackup0
+ * Signature: (JLjava/lang/String;Ljava/lang/String;J)V
+ */
+void Java_org_rocksdb_RestoreBackupableDB_restoreDBFromLatestBackup0(
+    JNIEnv* env, jobject jobj, jlong jhandle, jstring jdb_dir, jstring jwal_dir,
+    jlong jopt_handle) {
+  auto opt = reinterpret_cast<rocksdb::RestoreOptions*>(jopt_handle);
+
+  const char* cdb_dir = env->GetStringUTFChars(jdb_dir, 0);
+  const char* cwal_dir = env->GetStringUTFChars(jwal_dir, 0);
+
+  auto rdb = reinterpret_cast<rocksdb::RestoreBackupableDB*>(jhandle);
+  rocksdb::Status s =
+      rdb->RestoreDBFromLatestBackup(cdb_dir, cwal_dir, *opt);
+
+  env->ReleaseStringUTFChars(jdb_dir, cdb_dir);
+  env->ReleaseStringUTFChars(jwal_dir, cwal_dir);
+
+  if (!s.ok()) {
+    rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_RestoreBackupableDB
+ * Method:    purgeOldBackups0
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_RestoreBackupableDB_purgeOldBackups0(JNIEnv* env,
+    jobject jobj, jlong jhandle, jint jnum_backups_to_keep) {
+  auto rdb = reinterpret_cast<rocksdb::RestoreBackupableDB*>(jhandle);
+  rocksdb::Status s = rdb->PurgeOldBackups(jnum_backups_to_keep);
+
+  if (!s.ok()) {
+    rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_RestoreBackupableDB
+ * Method:    deleteBackup0
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_RestoreBackupableDB_deleteBackup0(JNIEnv* env,
+    jobject jobj, jlong jhandle, jint jbackup_id) {
+  auto rdb = reinterpret_cast<rocksdb::RestoreBackupableDB*>(jhandle);
+  rocksdb::Status s = rdb->DeleteBackup(jbackup_id);
+
+  if (!s.ok()) {
+    rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_RestoreBackupableDB
+ * Method:    getBackupInfo
+ * Signature: (J)Ljava/util/List;
+ */
+jobject Java_org_rocksdb_RestoreBackupableDB_getBackupInfo(
+    JNIEnv* env, jobject jbdb, jlong jhandle) {
+  std::vector<rocksdb::BackupInfo> backup_infos;
+  reinterpret_cast<rocksdb::RestoreBackupableDB*>(jhandle)->
+      GetBackupInfo(&backup_infos);
+  return rocksdb::BackupInfoListJni::getBackupInfo(env,
+      backup_infos);
+}
+
+/*
+ * Class:     org_rocksdb_RestoreBackupableDB
+ * Method:    getCorruptedBackups
+ * Signature: (J)[I
+ */
+jintArray Java_org_rocksdb_RestoreBackupableDB_getCorruptedBackups(
+    JNIEnv* env, jobject jbdb, jlong jhandle) {
+  std::vector<rocksdb::BackupID> backup_ids;
+  reinterpret_cast<rocksdb::RestoreBackupableDB*>(jhandle)->
+      GetCorruptedBackups(&backup_ids);
+  // store backup ids in an int array
+  const std::vector<rocksdb::BackupID>::size_type
+        kIdSize = backup_ids.size();
+
+  // use a std::vector instead of a variable-length array, which is a
+  // compiler extension rather than standard C++
+  std::vector<jint> int_backup_ids(kIdSize);
+  for (std::vector<rocksdb::BackupID>::size_type i = 0;
+      i != kIdSize; i++) {
+    int_backup_ids[i] = backup_ids[i];
+  }
+  // Store ints in java array
+  jintArray ret_backup_ids;
+  // It's ok to lose precision here (64->32)
+  jsize ret_backup_ids_size = static_cast<jsize>(kIdSize);
+  ret_backup_ids = env->NewIntArray(ret_backup_ids_size);
+  env->SetIntArrayRegion(ret_backup_ids, 0, ret_backup_ids_size,
+      int_backup_ids.data());
+  return ret_backup_ids;
+}
+
+/*
+ * Class:     org_rocksdb_RestoreBackupableDB
+ * Method:    garbageCollect
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_RestoreBackupableDB_garbageCollect(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  auto db = reinterpret_cast<rocksdb::RestoreBackupableDB*>(
+      jhandle);
+  rocksdb::Status s = db->GarbageCollect();
+
+  if (!s.ok()) {
+    rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_RestoreBackupableDB
+ * Method:    dispose
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_RestoreBackupableDB_dispose(JNIEnv* env, jobject jobj,
+    jlong jhandle) {
+  auto rdb = reinterpret_cast<rocksdb::RestoreBackupableDB*>(jhandle);
+  assert(rdb);
+  delete rdb;
+}
diff --git a/src/rocksdb/java/rocksjni/rocksjni.cc b/src/rocksdb/java/rocksjni/rocksjni.cc
new file mode 100644
index 0000000..221e7ff
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/rocksjni.cc
@@ -0,0 +1,1653 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// This file implements the "bridge" between Java and C++ and enables
+// calling c++ rocksdb::DB methods from Java side.
+
+#include <jni.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "include/org_rocksdb_RocksDB.h"
+#include "rocksdb/db.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/types.h"
+#include "rocksjni/portal.h"
+
+//////////////////////////////////////////////////////////////////////////////
+// rocksdb::DB::Open
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    open
+ * Signature: (JLjava/lang/String;)V
+ */
+void Java_org_rocksdb_RocksDB_open__JLjava_lang_String_2(
+    JNIEnv* env, jobject jdb, jlong jopt_handle, jstring jdb_path) {
+  auto opt = reinterpret_cast<rocksdb::Options*>(jopt_handle);
+  rocksdb::DB* db = nullptr;
+  const char* db_path = env->GetStringUTFChars(jdb_path, 0);
+  rocksdb::Status s = rocksdb::DB::Open(*opt, db_path, &db);
+  env->ReleaseStringUTFChars(jdb_path, db_path);
+
+  if (s.ok()) {
+    rocksdb::RocksDBJni::setHandle(env, jdb, db);
+    return;
+  }
+  rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    openROnly
+ * Signature: (JLjava/lang/String;)V
+ */
+void Java_org_rocksdb_RocksDB_openROnly__JLjava_lang_String_2(
+    JNIEnv* env, jobject jdb, jlong jopt_handle, jstring jdb_path) {
+  auto opt = reinterpret_cast<rocksdb::Options*>(jopt_handle);
+  rocksdb::DB* db = nullptr;
+  const char* db_path = env->GetStringUTFChars(jdb_path, 0);
+  rocksdb::Status s = rocksdb::DB::OpenForReadOnly(*opt,
+      db_path, &db);
+  env->ReleaseStringUTFChars(jdb_path, db_path);
+
+  if (s.ok()) {
+    rocksdb::RocksDBJni::setHandle(env, jdb, db);
+    return;
+  }
+  rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    openROnly
+ * Signature: (JLjava/lang/String;Ljava/util/List;I)Ljava/util/List;
+ */
+jobject
+    Java_org_rocksdb_RocksDB_openROnly__JLjava_lang_String_2Ljava_util_List_2I(
+    JNIEnv* env, jobject jdb, jlong jopt_handle, jstring jdb_path,
+    jobject jcfdesc_list, jint jcfdesc_count) {
+  auto opt = reinterpret_cast<rocksdb::Options*>(jopt_handle);
+  rocksdb::DB* db = nullptr;
+  const char* db_path = env->GetStringUTFChars(jdb_path, 0);
+
+  std::vector<jbyte*> cfnames_to_free;
+  std::vector<jbyteArray> jcfnames_for_free;
+
+  std::vector<rocksdb::ColumnFamilyDescriptor> column_families;
+  std::vector<rocksdb::ColumnFamilyHandle* > handles;
+  // get iterator for ColumnFamilyDescriptors
+  jobject iteratorObj = env->CallObjectMethod(
+      jcfdesc_list, rocksdb::ListJni::getIteratorMethod(env));
+
+  // iterate over ColumnFamilyDescriptors
+  while (env->CallBooleanMethod(
+      iteratorObj, rocksdb::ListJni::getHasNextMethod(env)) == JNI_TRUE) {
+      // get ColumnFamilyDescriptor
+      jobject jcf_descriptor = env->CallObjectMethod(iteratorObj,
+          rocksdb::ListJni::getNextMethod(env));
+      // get ColumnFamilyName
+      jbyteArray cf_name_in_byte_array = static_cast<jbyteArray>(
+          env->CallObjectMethod(jcf_descriptor,
+          rocksdb::ColumnFamilyDescriptorJni::getColumnFamilyNameMethod(
+              env)));
+      // get CF Options
+      jobject jcf_opt_obj = env->CallObjectMethod(jcf_descriptor,
+          rocksdb::ColumnFamilyDescriptorJni::getColumnFamilyOptionsMethod(
+              env));
+      rocksdb::ColumnFamilyOptions* cfOptions =
+          rocksdb::ColumnFamilyOptionsJni::getHandle(env, jcf_opt_obj);
+
+      jbyte* cfname = env->GetByteArrayElements(cf_name_in_byte_array, 0);
+      const int len = env->GetArrayLength(cf_name_in_byte_array);
+
+      // free allocated cfnames after call to open
+      cfnames_to_free.push_back(cfname);
+      jcfnames_for_free.push_back(cf_name_in_byte_array);
+      column_families.push_back(rocksdb::ColumnFamilyDescriptor(
+          std::string(reinterpret_cast<char *>(cfname), len), *cfOptions));
+  }
+
+  rocksdb::Status s = rocksdb::DB::OpenForReadOnly(*opt,
+      db_path, column_families, &handles, &db);
+  env->ReleaseStringUTFChars(jdb_path, db_path);
+  // free jbyte allocations
+  for (std::vector<jbyte*>::size_type i = 0;
+      i != cfnames_to_free.size(); i++) {
+    // free cfnames
+    env->ReleaseByteArrayElements(jcfnames_for_free[i], cfnames_to_free[i], 0);
+  }
+
+  // check if open operation was successful
+  if (s.ok()) {
+    rocksdb::RocksDBJni::setHandle(env, jdb, db);
+    jclass jListClazz = env->FindClass("java/util/ArrayList");
+    jmethodID midList = rocksdb::ListJni::getArrayListConstructorMethodId(
+        env, jListClazz);
+    jobject jcfhandle_list = env->NewObject(jListClazz,
+        midList, handles.size());
+    // insert in java list
+    for (std::vector<rocksdb::ColumnFamilyHandle*>::size_type i = 0;
+        i != handles.size(); i++) {
+      // jlong must be boxed as Long; Java collections cannot hold primitives
+      jclass jLongClazz = env->FindClass("java/lang/Long");
+      jmethodID midLong = env->GetMethodID(jLongClazz, "<init>", "(J)V");
+      jobject obj = env->NewObject(jLongClazz, midLong,
+          reinterpret_cast<jlong>(handles[i]));
+      env->CallBooleanMethod(jcfhandle_list,
+          rocksdb::ListJni::getListAddMethodId(env), obj);
+    }
+
+    return jcfhandle_list;
+  }
+  rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+  return nullptr;
+}
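+
+// Illustrative sketch, not part of the upstream file: the boxing loop above
+// creates local references (the Long class, each boxed Long) on every
+// iteration, and the JVM only guarantees a small per-call budget of
+// local-reference slots. A long-running loop would normally return the
+// slots explicitly, along these lines:
+namespace localref_sketch {
+inline void box_and_add(JNIEnv* env, jobject list, jmethodID list_add,
+    jclass long_clazz, jmethodID long_ctor, jlong v) {
+  jobject boxed = env->NewObject(long_clazz, long_ctor, v);
+  env->CallBooleanMethod(list, list_add, boxed);
+  env->DeleteLocalRef(boxed);  // give the slot back to the local-ref budget
+}
+}  // namespace localref_sketch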
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    open
+ * Signature: (JLjava/lang/String;Ljava/util/List;I)Ljava/util/List;
+ */
+jobject Java_org_rocksdb_RocksDB_open__JLjava_lang_String_2Ljava_util_List_2I(
+    JNIEnv* env, jobject jdb, jlong jopt_handle, jstring jdb_path,
+    jobject jcfdesc_list, jint jcfdesc_count) {
+  auto opt = reinterpret_cast<rocksdb::Options*>(jopt_handle);
+  rocksdb::DB* db = nullptr;
+  const char* db_path = env->GetStringUTFChars(jdb_path, 0);
+
+  std::vector<jbyte*> cfnames_to_free;
+  std::vector<jbyteArray> jcfnames_for_free;
+
+  std::vector<rocksdb::ColumnFamilyDescriptor> column_families;
+  std::vector<rocksdb::ColumnFamilyHandle* > handles;
+  // get iterator for ColumnFamilyDescriptors
+  jobject iteratorObj = env->CallObjectMethod(
+      jcfdesc_list, rocksdb::ListJni::getIteratorMethod(env));
+
+  // iterate over ColumnFamilyDescriptors
+  while (env->CallBooleanMethod(
+      iteratorObj, rocksdb::ListJni::getHasNextMethod(env)) == JNI_TRUE) {
+      // get ColumnFamilyDescriptor
+      jobject jcf_descriptor = env->CallObjectMethod(iteratorObj,
+          rocksdb::ListJni::getNextMethod(env));
+      // get ColumnFamilyName
+      jbyteArray cf_name_in_byte_array = static_cast<jbyteArray>(
+          env->CallObjectMethod(jcf_descriptor,
+          rocksdb::ColumnFamilyDescriptorJni::getColumnFamilyNameMethod(
+              env)));
+      // get CF Options
+      jobject jcf_opt_obj = env->CallObjectMethod(jcf_descriptor,
+          rocksdb::ColumnFamilyDescriptorJni::getColumnFamilyOptionsMethod(
+              env));
+      rocksdb::ColumnFamilyOptions* cfOptions =
+          rocksdb::ColumnFamilyOptionsJni::getHandle(env, jcf_opt_obj);
+
+      jbyte* cfname = env->GetByteArrayElements(cf_name_in_byte_array, 0);
+      const int len = env->GetArrayLength(cf_name_in_byte_array);
+
+      // free allocated cfnames after call to open
+      cfnames_to_free.push_back(cfname);
+      jcfnames_for_free.push_back(cf_name_in_byte_array);
+      column_families.push_back(rocksdb::ColumnFamilyDescriptor(
+          std::string(reinterpret_cast<char *>(cfname), len), *cfOptions));
+  }
+
+  rocksdb::Status s = rocksdb::DB::Open(*opt, db_path, column_families,
+      &handles, &db);
+  env->ReleaseStringUTFChars(jdb_path, db_path);
+  // free jbyte allocations
+  for (std::vector<jbyte*>::size_type i = 0;
+      i != cfnames_to_free.size(); i++) {
+    // free cfnames
+    env->ReleaseByteArrayElements(jcfnames_for_free[i], cfnames_to_free[i], 0);
+  }
+
+  // check if open operation was successful
+  if (s.ok()) {
+    rocksdb::RocksDBJni::setHandle(env, jdb, db);
+    jclass jListClazz = env->FindClass("java/util/ArrayList");
+    jmethodID midList = rocksdb::ListJni::getArrayListConstructorMethodId(
+        env, jListClazz);
+    jobject jcfhandle_list = env->NewObject(jListClazz,
+        midList, handles.size());
+    // insert in java list
+    for (std::vector<rocksdb::ColumnFamilyHandle*>::size_type i = 0;
+        i != handles.size(); i++) {
+      // jlong must be boxed as Long; Java collections cannot hold primitives
+      jclass jLongClazz = env->FindClass("java/lang/Long");
+      jmethodID midLong = env->GetMethodID(jLongClazz, "<init>", "(J)V");
+      jobject obj = env->NewObject(jLongClazz, midLong,
+          reinterpret_cast<jlong>(handles[i]));
+      env->CallBooleanMethod(jcfhandle_list,
+          rocksdb::ListJni::getListAddMethodId(env), obj);
+    }
+
+    return jcfhandle_list;
+  }
+  rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+  return nullptr;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// rocksdb::DB::ListColumnFamilies
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    listColumnFamilies
+ * Signature: (JLjava/lang/String;)Ljava/util/List;
+ */
+jobject Java_org_rocksdb_RocksDB_listColumnFamilies(
+    JNIEnv* env, jclass jclazz, jlong jopt_handle, jstring jdb_path) {
+  std::vector<std::string> column_family_names;
+  auto opt = reinterpret_cast<rocksdb::Options*>(jopt_handle);
+  const char* db_path = env->GetStringUTFChars(jdb_path, 0);
+  jobject jvalue_list = nullptr;
+
+  rocksdb::Status s = rocksdb::DB::ListColumnFamilies(*opt, db_path,
+      &column_family_names);
+  env->ReleaseStringUTFChars(jdb_path, db_path);
+  if (s.ok()) {
+    // do not cache the jclass: FindClass returns a call-local reference
+    jclass jListClazz = env->FindClass("java/util/ArrayList");
+    jmethodID mid = rocksdb::ListJni::getArrayListConstructorMethodId(env,
+        jListClazz);
+    jvalue_list = env->NewObject(jListClazz, mid, column_family_names.size());
+
+    for (std::vector<std::string>::size_type i = 0;
+        i < column_family_names.size(); i++) {
+      jbyteArray jcf_value =
+          env->NewByteArray(static_cast<jsize>(column_family_names[i].size()));
+      env->SetByteArrayRegion(
+          jcf_value, 0, static_cast<jsize>(column_family_names[i].size()),
+          reinterpret_cast<const jbyte*>(column_family_names[i].data()));
+      env->CallBooleanMethod(jvalue_list,
+          rocksdb::ListJni::getListAddMethodId(env), jcf_value);
+    }
+  }
+  return jvalue_list;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// rocksdb::DB::Put
+
+void rocksdb_put_helper(
+    JNIEnv* env, rocksdb::DB* db, const rocksdb::WriteOptions& write_options,
+    rocksdb::ColumnFamilyHandle* cf_handle, jbyteArray jkey, jint jkey_len,
+    jbyteArray jentry_value, jint jentry_value_len) {
+
+  jbyte* key = env->GetByteArrayElements(jkey, 0);
+  jbyte* value = env->GetByteArrayElements(jentry_value, 0);
+  rocksdb::Slice key_slice(reinterpret_cast<char*>(key), jkey_len);
+  rocksdb::Slice value_slice(reinterpret_cast<char*>(value),
+      jentry_value_len);
+
+  rocksdb::Status s;
+  if (cf_handle != nullptr) {
+    s = db->Put(write_options, cf_handle, key_slice, value_slice);
+  } else {
+    // backwards compatibility
+    s = db->Put(write_options, key_slice, value_slice);
+  }
+
+  // Release the pinned key and value buffers. JNI_ABORT frees the native
+  // copies without writing them back to the Java arrays, which is correct
+  // here because they were only read.
+  env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
+  env->ReleaseByteArrayElements(jentry_value, value, JNI_ABORT);
+
+  if (s.ok()) {
+    return;
+  }
+  rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+}
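+
+// Illustrative sketch, not part of the upstream file: GetByteArrayElements
+// may pin the Java array or hand back a copy, and the mode argument of
+// ReleaseByteArrayElements decides what happens to that buffer. For the
+// read-only access above, JNI_ABORT is the cheapest correct choice:
+namespace release_mode_sketch {
+inline void read_only_access(JNIEnv* env, jbyteArray arr) {
+  jbyte* p = env->GetByteArrayElements(arr, nullptr);
+  // ... read from p only, never write ...
+  // mode 0          -> copy back into the Java array, then free the buffer
+  // mode JNI_COMMIT -> copy back, keep the buffer pinned
+  // mode JNI_ABORT  -> free the buffer without copying anything back
+  env->ReleaseByteArrayElements(arr, p, JNI_ABORT);
+}
+}  // namespace release_mode_sketch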
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    put
+ * Signature: (J[BI[BI)V
+ */
+void Java_org_rocksdb_RocksDB_put__J_3BI_3BI(
+    JNIEnv* env, jobject jdb, jlong jdb_handle,
+    jbyteArray jkey, jint jkey_len,
+    jbyteArray jentry_value, jint jentry_value_len) {
+  auto db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  static const rocksdb::WriteOptions default_write_options =
+      rocksdb::WriteOptions();
+
+  rocksdb_put_helper(env, db, default_write_options, nullptr,
+                     jkey, jkey_len,
+                     jentry_value, jentry_value_len);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    put
+ * Signature: (J[BI[BIJ)V
+ */
+void Java_org_rocksdb_RocksDB_put__J_3BI_3BIJ(
+    JNIEnv* env, jobject jdb, jlong jdb_handle,
+    jbyteArray jkey, jint jkey_len,
+    jbyteArray jentry_value, jint jentry_value_len, jlong jcf_handle) {
+  auto db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  static const rocksdb::WriteOptions default_write_options =
+      rocksdb::WriteOptions();
+  auto cf_handle = reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcf_handle);
+  if (cf_handle != nullptr) {
+    rocksdb_put_helper(env, db, default_write_options, cf_handle,
+        jkey, jkey_len, jentry_value, jentry_value_len);
+  } else {
+    rocksdb::RocksDBExceptionJni::ThrowNew(env,
+        rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle."));
+  }
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    put
+ * Signature: (JJ[BI[BI)V
+ */
+void Java_org_rocksdb_RocksDB_put__JJ_3BI_3BI(
+    JNIEnv* env, jobject jdb,
+    jlong jdb_handle, jlong jwrite_options_handle,
+    jbyteArray jkey, jint jkey_len,
+    jbyteArray jentry_value, jint jentry_value_len) {
+  auto db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  auto write_options = reinterpret_cast<rocksdb::WriteOptions*>(
+      jwrite_options_handle);
+
+  rocksdb_put_helper(env, db, *write_options, nullptr,
+                     jkey, jkey_len,
+                     jentry_value, jentry_value_len);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    put
+ * Signature: (JJ[BI[BIJ)V
+ */
+void Java_org_rocksdb_RocksDB_put__JJ_3BI_3BIJ(
+    JNIEnv* env, jobject jdb,
+    jlong jdb_handle, jlong jwrite_options_handle,
+    jbyteArray jkey, jint jkey_len,
+    jbyteArray jentry_value, jint jentry_value_len, jlong jcf_handle) {
+  auto db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  auto write_options = reinterpret_cast<rocksdb::WriteOptions*>(
+      jwrite_options_handle);
+  auto cf_handle = reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcf_handle);
+  if (cf_handle != nullptr) {
+    rocksdb_put_helper(env, db, *write_options, cf_handle,
+        jkey, jkey_len, jentry_value, jentry_value_len);
+  } else {
+    rocksdb::RocksDBExceptionJni::ThrowNew(env,
+        rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle."));
+  }
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// rocksdb::DB::Write
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    write0
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_RocksDB_write0(
+    JNIEnv* env, jobject jdb,
+    jlong jwrite_options_handle, jlong jwb_handle) {
+  rocksdb::DB* db = rocksdb::RocksDBJni::getHandle(env, jdb);
+  auto* write_options = reinterpret_cast<rocksdb::WriteOptions*>(
+      jwrite_options_handle);
+  auto* wb = reinterpret_cast<rocksdb::WriteBatch*>(jwb_handle);
+
+  rocksdb::Status s = db->Write(*write_options, wb);
+
+  if (!s.ok()) {
+    rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    write1
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_RocksDB_write1(
+    JNIEnv* env, jobject jdb,
+    jlong jwrite_options_handle, jlong jwbwi_handle) {
+  rocksdb::DB* db = rocksdb::RocksDBJni::getHandle(env, jdb);
+  auto* write_options = reinterpret_cast<rocksdb::WriteOptions*>(
+      jwrite_options_handle);
+  auto* wbwi = reinterpret_cast<rocksdb::WriteBatchWithIndex*>(jwbwi_handle);
+  auto* wb = wbwi->GetWriteBatch();
+
+  rocksdb::Status s = db->Write(*write_options, wb);
+
+  if (!s.ok()) {
+    rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
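+
+// Illustrative sketch, not part of the upstream file: both entry points
+// above funnel into DB::Write, which applies a whole batch atomically.
+// Assumes rocksdb/write_batch.h for the WriteBatch definition:
+#include "rocksdb/write_batch.h"
+namespace write_batch_sketch {
+inline rocksdb::Status atomic_move(rocksdb::DB* db,
+    const rocksdb::Slice& from, const rocksdb::Slice& to,
+    const rocksdb::Slice& value) {
+  rocksdb::WriteBatch batch;
+  batch.Delete(from);
+  batch.Put(to, value);
+  return db->Write(rocksdb::WriteOptions(), &batch);  // all or nothing
+}
+}  // namespace write_batch_sketch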
+
+//////////////////////////////////////////////////////////////////////////////
+// rocksdb::DB::KeyMayExist
+jboolean key_may_exist_helper(JNIEnv* env, rocksdb::DB* db,
+    const rocksdb::ReadOptions& read_opt,
+    rocksdb::ColumnFamilyHandle* cf_handle, jbyteArray jkey, jint jkey_len,
+    jobject jstring_buffer) {
+  std::string value;
+  bool value_found = false;
+  jboolean isCopy;
+  jbyte* key = env->GetByteArrayElements(jkey, &isCopy);
+  rocksdb::Slice key_slice(reinterpret_cast<char*>(key), jkey_len);
+  bool keyMayExist;
+  if (cf_handle != nullptr) {
+    keyMayExist = db->KeyMayExist(read_opt, cf_handle, key_slice,
+        &value, &value_found);
+  } else {
+    keyMayExist = db->KeyMayExist(read_opt, key_slice,
+        &value, &value_found);
+  }
+
+  if (value_found && !value.empty()) {
+    jclass clazz = env->GetObjectClass(jstring_buffer);
+    jmethodID mid = env->GetMethodID(clazz, "append",
+        "(Ljava/lang/String;)Ljava/lang/StringBuffer;");
+    jstring new_value_str = env->NewStringUTF(value.c_str());
+    env->CallObjectMethod(jstring_buffer, mid, new_value_str);
+  }
+  env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
+  return static_cast<jboolean>(keyMayExist);
+}
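+
+// Illustrative sketch, not part of the upstream file: KeyMayExist is a
+// filter-backed hint, so a true result does not prove the key exists. A
+// caller needing certainty follows up with Get(), along these lines:
+namespace key_may_exist_sketch {
+inline bool definitely_has(rocksdb::DB* db, const rocksdb::Slice& key) {
+  std::string value;
+  bool value_found = false;
+  if (!db->KeyMayExist(rocksdb::ReadOptions(), key, &value, &value_found)) {
+    return false;  // definitely absent, no I/O needed
+  }
+  if (value_found) {
+    return true;   // the hint already materialized the value
+  }
+  return db->Get(rocksdb::ReadOptions(), key, &value).ok();
+}
+}  // namespace key_may_exist_sketch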
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    keyMayExist
+ * Signature: ([BILjava/lang/StringBuffer;)Z
+ */
+jboolean Java_org_rocksdb_RocksDB_keyMayExist___3BILjava_lang_StringBuffer_2(
+    JNIEnv* env, jobject jdb, jbyteArray jkey, jint jkey_len,
+    jobject jstring_buffer) {
+  rocksdb::DB* db = rocksdb::RocksDBJni::getHandle(env, jdb);
+  return key_may_exist_helper(env, db, rocksdb::ReadOptions(),
+      nullptr, jkey, jkey_len, jstring_buffer);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    keyMayExist
+ * Signature: ([BIJLjava/lang/StringBuffer;)Z
+ */
+jboolean Java_org_rocksdb_RocksDB_keyMayExist___3BIJLjava_lang_StringBuffer_2(
+    JNIEnv* env, jobject jdb, jbyteArray jkey, jint jkey_len,
+    jlong jcf_handle, jobject jstring_buffer) {
+  rocksdb::DB* db = rocksdb::RocksDBJni::getHandle(env, jdb);
+  auto cf_handle = reinterpret_cast<rocksdb::ColumnFamilyHandle*>(
+      jcf_handle);
+  if (cf_handle != nullptr) {
+    return key_may_exist_helper(env, db, rocksdb::ReadOptions(),
+        cf_handle, jkey, jkey_len, jstring_buffer);
+  } else {
+    rocksdb::RocksDBExceptionJni::ThrowNew(env,
+        rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle."));
+  }
+  // the pending Java exception makes this return value unobservable
+  return true;
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    keyMayExist
+ * Signature: (J[BILjava/lang/StringBuffer;)Z
+ */
+jboolean Java_org_rocksdb_RocksDB_keyMayExist__J_3BILjava_lang_StringBuffer_2(
+    JNIEnv* env, jobject jdb, jlong jread_options_handle,
+    jbyteArray jkey, jint jkey_len, jobject jstring_buffer) {
+  rocksdb::DB* db = rocksdb::RocksDBJni::getHandle(env, jdb);
+  auto& read_options = *reinterpret_cast<rocksdb::ReadOptions*>(
+      jread_options_handle);
+  return key_may_exist_helper(env, db, read_options,
+      nullptr, jkey, jkey_len, jstring_buffer);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    keyMayExist
+ * Signature: (J[BIJLjava/lang/StringBuffer;)Z
+ */
+jboolean Java_org_rocksdb_RocksDB_keyMayExist__J_3BIJLjava_lang_StringBuffer_2(
+    JNIEnv* env, jobject jdb, jlong jread_options_handle,
+    jbyteArray jkey, jint jkey_len, jlong jcf_handle, jobject jstring_buffer) {
+  rocksdb::DB* db = rocksdb::RocksDBJni::getHandle(env, jdb);
+  auto& read_options = *reinterpret_cast<rocksdb::ReadOptions*>(
+      jread_options_handle);
+  auto cf_handle = reinterpret_cast<rocksdb::ColumnFamilyHandle*>(
+      jcf_handle);
+  if (cf_handle != nullptr) {
+    return key_may_exist_helper(env, db, read_options, cf_handle,
+        jkey, jkey_len, jstring_buffer);
+  } else {
+    rocksdb::RocksDBExceptionJni::ThrowNew(env,
+        rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle."));
+  }
+  // the pending Java exception makes this return value unobservable
+  return true;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// rocksdb::DB::Get
+
+jbyteArray rocksdb_get_helper(
+    JNIEnv* env, rocksdb::DB* db, const rocksdb::ReadOptions& read_opt,
+    rocksdb::ColumnFamilyHandle* column_family_handle, jbyteArray jkey,
+    jint jkey_len) {
+  jboolean isCopy;
+  jbyte* key = env->GetByteArrayElements(jkey, &isCopy);
+  rocksdb::Slice key_slice(
+      reinterpret_cast<char*>(key), jkey_len);
+
+  std::string value;
+  rocksdb::Status s;
+  if (column_family_handle != nullptr) {
+    s = db->Get(read_opt, column_family_handle, key_slice, &value);
+  } else {
+    // backwards compatibility
+    s = db->Get(read_opt, key_slice, &value);
+  }
+
+  // Release the pinned key buffer. JNI_ABORT frees the native copy without
+  // writing it back to the Java array, which is correct for read-only use.
+  env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
+
+  if (s.IsNotFound()) {
+    return nullptr;
+  }
+
+  if (s.ok()) {
+    jbyteArray jret_value = env->NewByteArray(static_cast<jsize>(value.size()));
+    env->SetByteArrayRegion(jret_value, 0, static_cast<jsize>(value.size()),
+                            reinterpret_cast<const jbyte*>(value.c_str()));
+    return jret_value;
+  }
+  rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+
+  return nullptr;
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    get
+ * Signature: (J[BI)[B
+ */
+jbyteArray Java_org_rocksdb_RocksDB_get__J_3BI(
+    JNIEnv* env, jobject jdb, jlong jdb_handle,
+    jbyteArray jkey, jint jkey_len) {
+  return rocksdb_get_helper(env,
+      reinterpret_cast<rocksdb::DB*>(jdb_handle),
+      rocksdb::ReadOptions(), nullptr,
+      jkey, jkey_len);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    get
+ * Signature: (J[BIJ)[B
+ */
+jbyteArray Java_org_rocksdb_RocksDB_get__J_3BIJ(
+    JNIEnv* env, jobject jdb, jlong jdb_handle,
+    jbyteArray jkey, jint jkey_len, jlong jcf_handle) {
+  auto db_handle = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  auto cf_handle = reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcf_handle);
+  if (cf_handle != nullptr) {
+    return rocksdb_get_helper(env, db_handle, rocksdb::ReadOptions(),
+        cf_handle, jkey, jkey_len);
+  } else {
+    rocksdb::RocksDBExceptionJni::ThrowNew(env,
+        rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle."));
+    // the pending Java exception makes this return value unobservable
+    return env->NewByteArray(0);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    get
+ * Signature: (JJ[BI)[B
+ */
+jbyteArray Java_org_rocksdb_RocksDB_get__JJ_3BI(
+    JNIEnv* env, jobject jdb, jlong jdb_handle, jlong jropt_handle,
+    jbyteArray jkey, jint jkey_len) {
+  return rocksdb_get_helper(env,
+      reinterpret_cast<rocksdb::DB*>(jdb_handle),
+      *reinterpret_cast<rocksdb::ReadOptions*>(jropt_handle), nullptr,
+      jkey, jkey_len);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    get
+ * Signature: (JJ[BIJ)[B
+ */
+jbyteArray Java_org_rocksdb_RocksDB_get__JJ_3BIJ(
+    JNIEnv* env, jobject jdb, jlong jdb_handle, jlong jropt_handle,
+    jbyteArray jkey, jint jkey_len, jlong jcf_handle) {
+  auto db_handle = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  auto& ro_opt = *reinterpret_cast<rocksdb::ReadOptions*>(jropt_handle);
+  auto cf_handle = reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcf_handle);
+  if (cf_handle != nullptr) {
+    return rocksdb_get_helper(env, db_handle, ro_opt, cf_handle,
+        jkey, jkey_len);
+  } else {
+    rocksdb::RocksDBExceptionJni::ThrowNew(env,
+        rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle."));
+    // the pending Java exception makes this return value unobservable
+    return env->NewByteArray(0);
+  }
+}
+
+jint rocksdb_get_helper(
+    JNIEnv* env, rocksdb::DB* db, const rocksdb::ReadOptions& read_options,
+    rocksdb::ColumnFamilyHandle* column_family_handle, jbyteArray jkey,
+    jint jkey_len, jbyteArray jentry_value, jint jentry_value_len) {
+  static const int kNotFound = -1;
+  static const int kStatusError = -2;
+
+  jbyte* key = env->GetByteArrayElements(jkey, 0);
+  rocksdb::Slice key_slice(
+      reinterpret_cast<char*>(key), jkey_len);
+
+  // TODO(yhchiang): we might save one memory allocation here by adding
+  // a DB::Get() function which takes preallocated jbyte* as input.
+  std::string cvalue;
+  rocksdb::Status s;
+  if (column_family_handle != nullptr) {
+    s = db->Get(read_options, column_family_handle, key_slice, &cvalue);
+  } else {
+    // backwards compatibility
+    s = db->Get(read_options, key_slice, &cvalue);
+  }
+
+  // Release the pinned key buffer. JNI_ABORT frees the native copy without
+  // writing it back to the Java array, which is correct for read-only use.
+  env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
+
+  if (s.IsNotFound()) {
+    return kNotFound;
+  } else if (!s.ok()) {
+    // ThrowNew only marks a Java exception as pending; it does not unwind
+    // the C++ stack, so execution continues past this statement.
+    rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+
+    // Return a sentinel to satisfy the compiler; the Java side never sees
+    // it because the pending exception is raised on return.
+    return kStatusError;
+  }
+
+  int cvalue_len = static_cast<int>(cvalue.size());
+  int length = std::min(jentry_value_len, cvalue_len);
+
+  env->SetByteArrayRegion(
+      jentry_value, 0, length,
+      reinterpret_cast<const jbyte*>(cvalue.c_str()));
+  return cvalue_len;
+}
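+
+// Illustrative sketch, not part of the upstream file: the helper above
+// implements a caller-supplied-buffer contract. It copies at most
+// jentry_value_len bytes but always reports the full value length, so the
+// caller can detect truncation and retry with a larger array:
+namespace buffer_contract_sketch {
+// Returns the full value size; copies at most buf_len bytes into buf.
+inline int copy_with_truncation(const std::string& value,
+    char* buf, int buf_len) {
+  const int value_len = static_cast<int>(value.size());
+  const int n = value_len < buf_len ? value_len : buf_len;
+  for (int i = 0; i < n; ++i) {
+    buf[i] = value[i];
+  }
+  return value_len;  // a result > buf_len tells the caller it was truncated
+}
+}  // namespace buffer_contract_sketch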
+
+// MultiGet helper with optional column family handles
+jobject multi_get_helper(JNIEnv* env, jobject jdb, rocksdb::DB* db,
+    const rocksdb::ReadOptions& rOpt, jobject jkey_list, jint jkeys_count,
+    jobject jcfhandle_list) {
+  std::vector<rocksdb::Slice> keys;
+  std::vector<jbyte*> keys_to_free;
+  std::vector<rocksdb::ColumnFamilyHandle*> cf_handles;
+
+  if (jcfhandle_list != nullptr) {
+    // get cf iterator
+    jobject cfIteratorObj = env->CallObjectMethod(
+        jcfhandle_list, rocksdb::ListJni::getIteratorMethod(env));
+
+    // iterate over the column family handles
+    while (env->CallBooleanMethod(
+        cfIteratorObj, rocksdb::ListJni::getHasNextMethod(env)) == JNI_TRUE) {
+      jobject jobj = env->CallObjectMethod(
+          cfIteratorObj, rocksdb::ListJni::getNextMethod(env));
+      rocksdb::ColumnFamilyHandle* cfHandle =
+          rocksdb::ColumnFamilyHandleJni::getHandle(env, jobj);
+      cf_handles.push_back(cfHandle);
+    }
+  }
+
+  // Process key list
+  // get iterator
+  jobject iteratorObj = env->CallObjectMethod(
+      jkey_list, rocksdb::ListJni::getIteratorMethod(env));
+
+  // iterate over keys and convert java byte array to slice
+  while (env->CallBooleanMethod(
+      iteratorObj, rocksdb::ListJni::getHasNextMethod(env)) == JNI_TRUE) {
+    jbyteArray jkey = (jbyteArray) env->CallObjectMethod(
+       iteratorObj, rocksdb::ListJni::getNextMethod(env));
+    jint key_length = env->GetArrayLength(jkey);
+
+    jbyte* key = new jbyte[key_length];
+    env->GetByteArrayRegion(jkey, 0, key_length, key);
+    // store allocated jbyte to free it after multiGet call
+    keys_to_free.push_back(key);
+
+    rocksdb::Slice key_slice(
+      reinterpret_cast<char*>(key), key_length);
+    keys.push_back(key_slice);
+  }
+
+  std::vector<std::string> values;
+  std::vector<rocksdb::Status> s;
+  if (cf_handles.size() == 0) {
+    s = db->MultiGet(rOpt, keys, &values);
+  } else {
+    s = db->MultiGet(rOpt, cf_handles, keys, &values);
+  }
+
+  // do not cache the jclass: FindClass returns a call-local reference
+  jclass jclazz = env->FindClass("java/util/ArrayList");
+  jmethodID mid = rocksdb::ListJni::getArrayListConstructorMethodId(
+      env, jclazz);
+  jobject jvalue_list = env->NewObject(jclazz, mid, jkeys_count);
+
+  // insert in java list
+  for (std::vector<rocksdb::Status>::size_type i = 0; i != s.size(); i++) {
+    if (s[i].ok()) {
+      jbyteArray jentry_value =
+          env->NewByteArray(static_cast<jsize>(values[i].size()));
+      env->SetByteArrayRegion(
+          jentry_value, 0, static_cast<jsize>(values[i].size()),
+          reinterpret_cast<const jbyte*>(values[i].c_str()));
+      env->CallBooleanMethod(
+          jvalue_list, rocksdb::ListJni::getListAddMethodId(env),
+              jentry_value);
+    } else {
+      env->CallBooleanMethod(
+          jvalue_list, rocksdb::ListJni::getListAddMethodId(env), nullptr);
+    }
+  }
+  // free up allocated byte arrays
+  for (std::vector<jbyte*>::size_type i = 0; i != keys_to_free.size(); i++) {
+    delete[] keys_to_free[i];
+  }
+  keys_to_free.clear();
+  return jvalue_list;
+}
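+
+// Illustrative sketch, not part of the upstream file: MultiGet returns one
+// Status per key, in key order, which is why the loop above maps misses to
+// null list entries instead of failing the whole call:
+namespace multiget_sketch {
+inline size_t count_hits(rocksdb::DB* db,
+    const std::vector<rocksdb::Slice>& keys) {
+  std::vector<std::string> values;
+  std::vector<rocksdb::Status> statuses =
+      db->MultiGet(rocksdb::ReadOptions(), keys, &values);
+  size_t hits = 0;
+  for (std::vector<rocksdb::Status>::size_type i = 0;
+      i != statuses.size(); i++) {
+    if (statuses[i].ok()) {
+      hits++;
+    }
+  }
+  return hits;
+}
+}  // namespace multiget_sketch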
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    multiGet
+ * Signature: (JLjava/util/List;I)Ljava/util/List;
+ */
+jobject Java_org_rocksdb_RocksDB_multiGet__JLjava_util_List_2I(
+    JNIEnv* env, jobject jdb, jlong jdb_handle,
+    jobject jkey_list, jint jkeys_count) {
+  return multi_get_helper(env, jdb, reinterpret_cast<rocksdb::DB*>(jdb_handle),
+      rocksdb::ReadOptions(), jkey_list, jkeys_count, nullptr);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    multiGet
+ * Signature: (JLjava/util/List;ILjava/util/List;)Ljava/util/List;
+ */
+jobject
+    Java_org_rocksdb_RocksDB_multiGet__JLjava_util_List_2ILjava_util_List_2(
+    JNIEnv* env, jobject jdb, jlong jdb_handle,
+    jobject jkey_list, jint jkeys_count, jobject jcfhandle_list) {
+  return multi_get_helper(env, jdb, reinterpret_cast<rocksdb::DB*>(jdb_handle),
+      rocksdb::ReadOptions(), jkey_list, jkeys_count, jcfhandle_list);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    multiGet
+ * Signature: (JJLjava/util/List;I)Ljava/util/List;
+ */
+jobject Java_org_rocksdb_RocksDB_multiGet__JJLjava_util_List_2I(
+    JNIEnv* env, jobject jdb, jlong jdb_handle,
+    jlong jropt_handle, jobject jkey_list, jint jkeys_count) {
+  return multi_get_helper(env, jdb, reinterpret_cast<rocksdb::DB*>(jdb_handle),
+      *reinterpret_cast<rocksdb::ReadOptions*>(jropt_handle), jkey_list,
+      jkeys_count, nullptr);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    multiGet
+ * Signature: (JJLjava/util/List;ILjava/util/List;)Ljava/util/List;
+ */
+jobject
+    Java_org_rocksdb_RocksDB_multiGet__JJLjava_util_List_2ILjava_util_List_2(
+    JNIEnv* env, jobject jdb, jlong jdb_handle,
+    jlong jropt_handle, jobject jkey_list, jint jkeys_count,
+    jobject jcfhandle_list) {
+  return multi_get_helper(env, jdb, reinterpret_cast<rocksdb::DB*>(jdb_handle),
+      *reinterpret_cast<rocksdb::ReadOptions*>(jropt_handle), jkey_list,
+      jkeys_count, jcfhandle_list);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    get
+ * Signature: (J[BI[BI)I
+ */
+jint Java_org_rocksdb_RocksDB_get__J_3BI_3BI(
+    JNIEnv* env, jobject jdb, jlong jdb_handle,
+    jbyteArray jkey, jint jkey_len,
+    jbyteArray jentry_value, jint jentry_value_len) {
+  return rocksdb_get_helper(env,
+      reinterpret_cast<rocksdb::DB*>(jdb_handle),
+      rocksdb::ReadOptions(), nullptr,
+      jkey, jkey_len, jentry_value, jentry_value_len);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    get
+ * Signature: (J[BI[BIJ)I
+ */
+jint Java_org_rocksdb_RocksDB_get__J_3BI_3BIJ(
+    JNIEnv* env, jobject jdb, jlong jdb_handle,
+    jbyteArray jkey, jint jkey_len,
+    jbyteArray jentry_value, jint jentry_value_len, jlong jcf_handle) {
+  auto db_handle = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  auto cf_handle = reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcf_handle);
+  if (cf_handle != nullptr) {
+    return rocksdb_get_helper(env, db_handle, rocksdb::ReadOptions(), cf_handle,
+        jkey, jkey_len, jentry_value, jentry_value_len);
+  } else {
+    rocksdb::RocksDBExceptionJni::ThrowNew(env,
+        rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle."));
+    // the pending Java exception makes this return value unobservable
+    return 0;
+  }
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    get
+ * Signature: (JJ[BI[BI)I
+ */
+jint Java_org_rocksdb_RocksDB_get__JJ_3BI_3BI(
+    JNIEnv* env, jobject jdb, jlong jdb_handle, jlong jropt_handle,
+    jbyteArray jkey, jint jkey_len,
+    jbyteArray jentry_value, jint jentry_value_len) {
+  return rocksdb_get_helper(env,
+      reinterpret_cast<rocksdb::DB*>(jdb_handle),
+      *reinterpret_cast<rocksdb::ReadOptions*>(jropt_handle),
+      nullptr, jkey, jkey_len, jentry_value, jentry_value_len);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    get
+ * Signature: (JJ[BI[BIJ)I
+ */
+jint Java_org_rocksdb_RocksDB_get__JJ_3BI_3BIJ(
+    JNIEnv* env, jobject jdb, jlong jdb_handle, jlong jropt_handle,
+    jbyteArray jkey, jint jkey_len,
+    jbyteArray jentry_value, jint jentry_value_len, jlong jcf_handle) {
+  auto db_handle = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  auto& ro_opt = *reinterpret_cast<rocksdb::ReadOptions*>(jropt_handle);
+  auto cf_handle = reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcf_handle);
+  if (cf_handle != nullptr) {
+    return rocksdb_get_helper(env, db_handle, ro_opt, cf_handle, jkey,
+        jkey_len, jentry_value, jentry_value_len);
+  } else {
+    rocksdb::RocksDBExceptionJni::ThrowNew(env,
+        rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle."));
+    // the pending Java exception makes this return value unobservable
+    return 0;
+  }
+}
+//////////////////////////////////////////////////////////////////////////////
+// rocksdb::DB::Delete()
+void rocksdb_remove_helper(
+    JNIEnv* env, rocksdb::DB* db, const rocksdb::WriteOptions& write_options,
+    rocksdb::ColumnFamilyHandle* cf_handle, jbyteArray jkey, jint jkey_len) {
+  jbyte* key = env->GetByteArrayElements(jkey, 0);
+  rocksdb::Slice key_slice(reinterpret_cast<char*>(key), jkey_len);
+
+  rocksdb::Status s;
+  if (cf_handle != nullptr) {
+    s = db->Delete(write_options, cf_handle, key_slice);
+  } else {
+    // backwards compatibility
+    s = db->Delete(write_options, key_slice);
+  }
+  // Release the pinned key buffer. JNI_ABORT frees the native copy without
+  // writing it back to the Java array, which is correct for read-only use.
+  env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
+
+  if (!s.ok()) {
+    rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    remove
+ * Signature: (J[BI)V
+ */
+void Java_org_rocksdb_RocksDB_remove__J_3BI(
+    JNIEnv* env, jobject jdb, jlong jdb_handle,
+    jbyteArray jkey, jint jkey_len) {
+  auto db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  static const rocksdb::WriteOptions default_write_options =
+      rocksdb::WriteOptions();
+  rocksdb_remove_helper(env, db, default_write_options, nullptr,
+      jkey, jkey_len);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    remove
+ * Signature: (J[BIJ)V
+ */
+void Java_org_rocksdb_RocksDB_remove__J_3BIJ(
+    JNIEnv* env, jobject jdb, jlong jdb_handle,
+    jbyteArray jkey, jint jkey_len, jlong jcf_handle) {
+  auto db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  static const rocksdb::WriteOptions default_write_options =
+      rocksdb::WriteOptions();
+  auto cf_handle = reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcf_handle);
+  if (cf_handle != nullptr) {
+    rocksdb_remove_helper(env, db, default_write_options, cf_handle,
+        jkey, jkey_len);
+  } else {
+    rocksdb::RocksDBExceptionJni::ThrowNew(env,
+        rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle."));
+  }
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    remove
+ * Signature: (JJ[BI)V
+ */
+void Java_org_rocksdb_RocksDB_remove__JJ_3BI(
+    JNIEnv* env, jobject jdb, jlong jdb_handle,
+    jlong jwrite_options, jbyteArray jkey, jint jkey_len) {
+  auto db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  auto write_options = reinterpret_cast<rocksdb::WriteOptions*>(jwrite_options);
+  rocksdb_remove_helper(env, db, *write_options, nullptr, jkey, jkey_len);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    remove
+ * Signature: (JJ[BIJ)V
+ */
+void Java_org_rocksdb_RocksDB_remove__JJ_3BIJ(
+    JNIEnv* env, jobject jdb, jlong jdb_handle,
+    jlong jwrite_options, jbyteArray jkey, jint jkey_len,
+    jlong jcf_handle) {
+  auto db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  auto write_options = reinterpret_cast<rocksdb::WriteOptions*>(jwrite_options);
+  auto cf_handle = reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcf_handle);
+  if (cf_handle != nullptr) {
+    rocksdb_remove_helper(env, db, *write_options, cf_handle, jkey, jkey_len);
+  } else {
+    rocksdb::RocksDBExceptionJni::ThrowNew(env,
+        rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle."));
+  }
+}
+//////////////////////////////////////////////////////////////////////////////
+// rocksdb::DB::Merge
+
+void rocksdb_merge_helper(
+    JNIEnv* env, rocksdb::DB* db, const rocksdb::WriteOptions& write_options,
+    rocksdb::ColumnFamilyHandle* cf_handle, jbyteArray jkey, jint jkey_len,
+    jbyteArray jentry_value, jint jentry_value_len) {
+
+  jbyte* key = env->GetByteArrayElements(jkey, 0);
+  jbyte* value = env->GetByteArrayElements(jentry_value, 0);
+  rocksdb::Slice key_slice(reinterpret_cast<char*>(key), jkey_len);
+  rocksdb::Slice value_slice(reinterpret_cast<char*>(value),
+      jentry_value_len);
+
+  rocksdb::Status s;
+  if (cf_handle != nullptr) {
+    s = db->Merge(write_options, cf_handle, key_slice, value_slice);
+  } else {
+    s = db->Merge(write_options, key_slice, value_slice);
+  }
+
+  // Release the pinned key and value buffers. JNI_ABORT frees the native
+  // copies without writing them back to the Java arrays, which is correct
+  // here because they were only read.
+  env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
+  env->ReleaseByteArrayElements(jentry_value, value, JNI_ABORT);
+
+  if (s.ok()) {
+    return;
+  }
+  rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+}
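+
+// Illustrative sketch, not part of the upstream file: Merge only succeeds
+// when the database was opened with a merge operator configured; otherwise
+// the Status carries an error that the helper above surfaces as a Java
+// exception. A guard on the C++ side would look like this:
+namespace merge_sketch {
+inline rocksdb::Status checked_merge(rocksdb::DB* db,
+    const rocksdb::Options& opt, const rocksdb::Slice& key,
+    const rocksdb::Slice& value) {
+  if (opt.merge_operator == nullptr) {
+    return rocksdb::Status::InvalidArgument("no merge operator configured");
+  }
+  return db->Merge(rocksdb::WriteOptions(), key, value);
+}
+}  // namespace merge_sketch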
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    merge
+ * Signature: (J[BI[BI)V
+ */
+void Java_org_rocksdb_RocksDB_merge__J_3BI_3BI(
+    JNIEnv* env, jobject jdb, jlong jdb_handle,
+    jbyteArray jkey, jint jkey_len,
+    jbyteArray jentry_value, jint jentry_value_len) {
+  auto db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  static const rocksdb::WriteOptions default_write_options =
+      rocksdb::WriteOptions();
+
+  rocksdb_merge_helper(env, db, default_write_options,
+      nullptr, jkey, jkey_len, jentry_value, jentry_value_len);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    merge
+ * Signature: (J[BI[BIJ)V
+ */
+void Java_org_rocksdb_RocksDB_merge__J_3BI_3BIJ(
+    JNIEnv* env, jobject jdb, jlong jdb_handle,
+    jbyteArray jkey, jint jkey_len,
+    jbyteArray jentry_value, jint jentry_value_len, jlong jcf_handle) {
+  auto db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  static const rocksdb::WriteOptions default_write_options =
+      rocksdb::WriteOptions();
+  auto cf_handle = reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcf_handle);
+  if (cf_handle != nullptr) {
+    rocksdb_merge_helper(env, db, default_write_options,
+        cf_handle, jkey, jkey_len, jentry_value, jentry_value_len);
+  } else {
+    rocksdb::RocksDBExceptionJni::ThrowNew(env,
+        rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle."));
+  }
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    merge
+ * Signature: (JJ[BI[BI)V
+ */
+void Java_org_rocksdb_RocksDB_merge__JJ_3BI_3BI(
+    JNIEnv* env, jobject jdb,
+    jlong jdb_handle, jlong jwrite_options_handle,
+    jbyteArray jkey, jint jkey_len,
+    jbyteArray jentry_value, jint jentry_value_len) {
+  auto db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  auto write_options = reinterpret_cast<rocksdb::WriteOptions*>(
+      jwrite_options_handle);
+
+  rocksdb_merge_helper(env, db, *write_options,
+      nullptr, jkey, jkey_len, jentry_value, jentry_value_len);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    merge
+ * Signature: (JJ[BI[BIJ)V
+ */
+void Java_org_rocksdb_RocksDB_merge__JJ_3BI_3BIJ(
+    JNIEnv* env, jobject jdb,
+    jlong jdb_handle, jlong jwrite_options_handle,
+    jbyteArray jkey, jint jkey_len,
+    jbyteArray jentry_value, jint jentry_value_len, jlong jcf_handle) {
+  auto db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  auto write_options = reinterpret_cast<rocksdb::WriteOptions*>(
+      jwrite_options_handle);
+  auto cf_handle = reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcf_handle);
+  if (cf_handle != nullptr) {
+    rocksdb_merge_helper(env, db, *write_options,
+        cf_handle, jkey, jkey_len, jentry_value, jentry_value_len);
+  } else {
+    rocksdb::RocksDBExceptionJni::ThrowNew(env,
+        rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle."));
+  }
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// rocksdb::DB::~DB()
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_RocksDB_disposeInternal(
+    JNIEnv* env, jobject java_db, jlong jhandle) {
+  delete reinterpret_cast<rocksdb::DB*>(jhandle);
+}
+
+jlong rocksdb_iterator_helper(
+    rocksdb::DB* db, rocksdb::ReadOptions read_options,
+    rocksdb::ColumnFamilyHandle* cf_handle) {
+  rocksdb::Iterator* iterator = nullptr;
+  if (cf_handle != nullptr) {
+    iterator = db->NewIterator(read_options, cf_handle);
+  } else {
+    iterator = db->NewIterator(read_options);
+  }
+  return reinterpret_cast<jlong>(iterator);
+}
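+
+// Illustrative sketch, not part of the upstream file: NewIterator transfers
+// ownership of a heap-allocated Iterator to the caller, which is what lets
+// the helper above park the raw pointer in a jlong until the Java wrapper's
+// dispose call deletes it. The same lifetime in pure C++, for contrast:
+namespace iterator_sketch {
+inline size_t count_keys(rocksdb::DB* db) {
+  std::unique_ptr<rocksdb::Iterator> it(
+      db->NewIterator(rocksdb::ReadOptions()));
+  size_t n = 0;
+  for (it->SeekToFirst(); it->Valid(); it->Next()) {
+    n++;
+  }
+  return n;  // unique_ptr deletes here; the JNI path defers to dispose()
+}
+}  // namespace iterator_sketch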
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    iterator
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_RocksDB_iterator__J(
+    JNIEnv* env, jobject jdb, jlong db_handle) {
+  auto db = reinterpret_cast<rocksdb::DB*>(db_handle);
+  return rocksdb_iterator_helper(db, rocksdb::ReadOptions(),
+      nullptr);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    iterator
+ * Signature: (JJ)J
+ */
+jlong Java_org_rocksdb_RocksDB_iterator__JJ(
+    JNIEnv* env, jobject jdb, jlong db_handle,
+    jlong jread_options_handle) {
+  auto db = reinterpret_cast<rocksdb::DB*>(db_handle);
+  auto& read_options = *reinterpret_cast<rocksdb::ReadOptions*>(
+      jread_options_handle);
+  return rocksdb_iterator_helper(db, read_options,
+      nullptr);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    iteratorCF
+ * Signature: (JJ)J
+ */
+jlong Java_org_rocksdb_RocksDB_iteratorCF__JJ(
+    JNIEnv* env, jobject jdb, jlong db_handle, jlong jcf_handle) {
+  auto db = reinterpret_cast<rocksdb::DB*>(db_handle);
+  auto cf_handle = reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcf_handle);
+  return rocksdb_iterator_helper(db, rocksdb::ReadOptions(),
+        cf_handle);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    iteratorCF
+ * Signature: (JJJ)J
+ */
+jlong Java_org_rocksdb_RocksDB_iteratorCF__JJJ(
+    JNIEnv* env, jobject jdb, jlong db_handle, jlong jcf_handle,
+    jlong jread_options_handle) {
+  auto db = reinterpret_cast<rocksdb::DB*>(db_handle);
+  auto cf_handle = reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcf_handle);
+  auto& read_options = *reinterpret_cast<rocksdb::ReadOptions*>(
+      jread_options_handle);
+  return rocksdb_iterator_helper(db, read_options,
+        cf_handle);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    iterators
+ * Signature: (JLjava/util/List;J)[J
+ */
+jlongArray Java_org_rocksdb_RocksDB_iterators(
+    JNIEnv* env, jobject jdb, jlong db_handle, jobject jcfhandle_list,
+    jlong jread_options_handle) {
+  auto db = reinterpret_cast<rocksdb::DB*>(db_handle);
+  auto& read_options = *reinterpret_cast<rocksdb::ReadOptions*>(
+        jread_options_handle);
+  std::vector<rocksdb::ColumnFamilyHandle*> cf_handles;
+  std::vector<rocksdb::Iterator*> iterators;
+
+  if (jcfhandle_list != nullptr) {
+    // get cf iterator
+    jobject cfIteratorObj = env->CallObjectMethod(
+        jcfhandle_list, rocksdb::ListJni::getIteratorMethod(env));
+
+    // iterate over the column family handles
+    while (env->CallBooleanMethod(
+        cfIteratorObj, rocksdb::ListJni::getHasNextMethod(env)) == JNI_TRUE) {
+      jobject jobj = env->CallObjectMethod(
+          cfIteratorObj, rocksdb::ListJni::getNextMethod(env));
+      rocksdb::ColumnFamilyHandle* cfHandle =
+          rocksdb::ColumnFamilyHandleJni::getHandle(env, jobj);
+      cf_handles.push_back(cfHandle);
+    }
+  }
+
+  rocksdb::Status s = db->NewIterators(read_options,
+      cf_handles, &iterators);
+  if (s.ok()) {
+    jlongArray jLongArray =
+        env->NewLongArray(static_cast<jsize>(iterators.size()));
+    for (std::vector<rocksdb::Iterator*>::size_type i = 0; i < iterators.size();
+         i++) {
+      env->SetLongArrayRegion(jLongArray, static_cast<jsize>(i), 1,
+                              reinterpret_cast<const jlong*>(&iterators[i]));
+    }
+    return jLongArray;
+  }
+  rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+  return env->NewLongArray(0);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    getDefaultColumnFamily
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_RocksDB_getDefaultColumnFamily(
+    JNIEnv* env, jobject jobj, jlong jdb_handle) {
+  auto* db_handle = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  auto* cf_handle = db_handle->DefaultColumnFamily();
+  return reinterpret_cast<jlong>(cf_handle);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    createColumnFamily
+ * Signature: (JLorg/rocksdb/ColumnFamilyDescriptor;)J
+ */
+jlong Java_org_rocksdb_RocksDB_createColumnFamily(
+    JNIEnv* env, jobject jdb, jlong jdb_handle,
+    jobject jcf_descriptor) {
+  rocksdb::ColumnFamilyHandle* handle;
+  auto db_handle = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+
+  // get ColumnFamilyName
+  jbyteArray byteArray = static_cast<jbyteArray>(env->CallObjectMethod(
+      jcf_descriptor,
+      rocksdb::ColumnFamilyDescriptorJni::getColumnFamilyNameMethod(
+          env)));
+  // get CF Options
+  jobject jcf_opt_obj = env->CallObjectMethod(jcf_descriptor,
+      rocksdb::ColumnFamilyDescriptorJni::getColumnFamilyOptionsMethod(
+          env));
+  rocksdb::ColumnFamilyOptions* cfOptions =
+      rocksdb::ColumnFamilyOptionsJni::getHandle(env, jcf_opt_obj);
+
+  jbyte* cfname = env->GetByteArrayElements(byteArray, 0);
+  const int len = env->GetArrayLength(byteArray);
+
+  rocksdb::Status s = db_handle->CreateColumnFamily(
+      *cfOptions, std::string(reinterpret_cast<char *>(cfname), len), &handle);
+  env->ReleaseByteArrayElements(byteArray, cfname, 0);
+
+  if (s.ok()) {
+    return reinterpret_cast<jlong>(handle);
+  }
+  rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+  return 0;
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    dropColumnFamily
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_RocksDB_dropColumnFamily(
+    JNIEnv* env, jobject jdb, jlong jdb_handle, jlong jcf_handle) {
+  auto cf_handle = reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcf_handle);
+  auto db_handle = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  rocksdb::Status s = db_handle->DropColumnFamily(cf_handle);
+  if (!s.ok()) {
+    rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Method:    getSnapshot
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_RocksDB_getSnapshot(
+    JNIEnv* env, jobject jdb, jlong db_handle) {
+  auto db = reinterpret_cast<rocksdb::DB*>(db_handle);
+  const rocksdb::Snapshot* snapshot = db->GetSnapshot();
+  return reinterpret_cast<jlong>(snapshot);
+}
+
+/*
+ * Method:    releaseSnapshot
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_RocksDB_releaseSnapshot(
+    JNIEnv* env, jobject jdb, jlong db_handle, jlong snapshot_handle) {
+  auto db = reinterpret_cast<rocksdb::DB*>(db_handle);
+  auto snapshot = reinterpret_cast<rocksdb::Snapshot*>(snapshot_handle);
+  db->ReleaseSnapshot(snapshot);
+}
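+
+// Illustrative sketch, not part of the upstream file: a snapshot pins a
+// consistent view of the database until it is released, and the two entry
+// points above are the acquire/release halves of this C++ idiom:
+namespace snapshot_sketch {
+inline rocksdb::Status read_at_snapshot(rocksdb::DB* db,
+    const rocksdb::Slice& key, std::string* value) {
+  const rocksdb::Snapshot* snapshot = db->GetSnapshot();
+  rocksdb::ReadOptions read_options;
+  read_options.snapshot = snapshot;  // reads ignore writes made after this
+  rocksdb::Status s = db->Get(read_options, key, value);
+  db->ReleaseSnapshot(snapshot);     // always pair with GetSnapshot
+  return s;
+}
+}  // namespace snapshot_sketch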
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    getProperty0
+ * Signature: (JLjava/lang/String;I)Ljava/lang/String;
+ */
+jstring Java_org_rocksdb_RocksDB_getProperty0__JLjava_lang_String_2I(
+    JNIEnv* env, jobject jdb, jlong db_handle, jstring jproperty,
+    jint jproperty_len) {
+  auto db = reinterpret_cast<rocksdb::DB*>(db_handle);
+
+  const char* property = env->GetStringUTFChars(jproperty, 0);
+  rocksdb::Slice property_slice(property, jproperty_len);
+
+  std::string property_value;
+  bool retCode = db->GetProperty(property_slice, &property_value);
+  env->ReleaseStringUTFChars(jproperty, property);
+
+  if (!retCode) {
+    rocksdb::RocksDBExceptionJni::ThrowNew(env, rocksdb::Status::NotFound());
+  }
+
+  return env->NewStringUTF(property_value.data());
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    getProperty0
+ * Signature: (JJLjava/lang/String;I)Ljava/lang/String;
+ */
+jstring Java_org_rocksdb_RocksDB_getProperty0__JJLjava_lang_String_2I(
+    JNIEnv* env, jobject jdb, jlong db_handle, jlong jcf_handle,
+    jstring jproperty, jint jproperty_len) {
+  auto db = reinterpret_cast<rocksdb::DB*>(db_handle);
+  auto cf_handle = reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcf_handle);
+
+  const char* property = env->GetStringUTFChars(jproperty, 0);
+  rocksdb::Slice property_slice(property, jproperty_len);
+
+  std::string property_value;
+  bool retCode = db->GetProperty(cf_handle, property_slice, &property_value);
+  env->ReleaseStringUTFChars(jproperty, property);
+
+  if (!retCode) {
+    rocksdb::RocksDBExceptionJni::ThrowNew(env, rocksdb::Status::NotFound());
+  }
+
+  return env->NewStringUTF(property_value.data());
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    getLongProperty
+ * Signature: (JLjava/lang/String;I)J
+ */
+jlong Java_org_rocksdb_RocksDB_getLongProperty__JLjava_lang_String_2I(
+    JNIEnv* env, jobject jdb, jlong db_handle, jstring jproperty,
+    jint jproperty_len) {
+  auto db = reinterpret_cast<rocksdb::DB*>(db_handle);
+
+  const char* property = env->GetStringUTFChars(jproperty, 0);
+  rocksdb::Slice property_slice(property, jproperty_len);
+
+  uint64_t property_value = 0;
+  bool retCode = db->GetIntProperty(property_slice, &property_value);
+  env->ReleaseStringUTFChars(jproperty, property);
+
+  if (!retCode) {
+    rocksdb::RocksDBExceptionJni::ThrowNew(env, rocksdb::Status::NotFound());
+  }
+  return property_value;
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    getLongProperty
+ * Signature: (JJLjava/lang/String;I)J
+ */
+jlong Java_org_rocksdb_RocksDB_getLongProperty__JJLjava_lang_String_2I(
+    JNIEnv* env, jobject jdb, jlong db_handle, jlong jcf_handle,
+    jstring jproperty, jint jproperty_len) {
+  auto db = reinterpret_cast<rocksdb::DB*>(db_handle);
+  auto cf_handle = reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcf_handle);
+
+  const char* property = env->GetStringUTFChars(jproperty, 0);
+  rocksdb::Slice property_slice(property, jproperty_len);
+
+  uint64_t property_value = 0;
+  bool retCode = db->GetIntProperty(cf_handle, property_slice, &property_value);
+  env->ReleaseStringUTFChars(jproperty, property);
+
+  if (!retCode) {
+    rocksdb::RocksDBExceptionJni::ThrowNew(env, rocksdb::Status::NotFound());
+  }
+  return property_value;
+}
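+
+// Illustrative sketch, not part of the upstream file: properties are
+// addressed by well-known string names. Assuming the stock
+// "rocksdb.estimate-num-keys" property, the integer variant is used so:
+namespace property_sketch {
+inline uint64_t estimated_keys(rocksdb::DB* db) {
+  uint64_t n = 0;
+  // GetIntProperty returns false for unknown names, leaving n at 0
+  db->GetIntProperty("rocksdb.estimate-num-keys", &n);
+  return n;
+}
+}  // namespace property_sketch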
+
+//////////////////////////////////////////////////////////////////////////////
+// rocksdb::DB::Flush
+
+void rocksdb_flush_helper(
+    JNIEnv* env, rocksdb::DB* db, const rocksdb::FlushOptions& flush_options,
+    rocksdb::ColumnFamilyHandle* column_family_handle) {
+  rocksdb::Status s;
+  if (column_family_handle != nullptr) {
+    s = db->Flush(flush_options, column_family_handle);
+  } else {
+    s = db->Flush(flush_options);
+  }
+  if (!s.ok()) {
+    rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    flush
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_RocksDB_flush__JJ(
+    JNIEnv* env, jobject jdb, jlong jdb_handle,
+    jlong jflush_options) {
+  auto db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  auto flush_options = reinterpret_cast<rocksdb::FlushOptions*>(jflush_options);
+  rocksdb_flush_helper(env, db, *flush_options, nullptr);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    flush
+ * Signature: (JJJ)V
+ */
+void Java_org_rocksdb_RocksDB_flush__JJJ(
+    JNIEnv* env, jobject jdb, jlong jdb_handle,
+    jlong jflush_options, jlong jcf_handle) {
+  auto db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  auto flush_options = reinterpret_cast<rocksdb::FlushOptions*>(jflush_options);
+  auto cf_handle = reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcf_handle);
+  rocksdb_flush_helper(env, db, *flush_options, cf_handle);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// rocksdb::DB::CompactRange - Full
+
+void rocksdb_compactrange_helper(JNIEnv* env, rocksdb::DB* db,
+    rocksdb::ColumnFamilyHandle* cf_handle, jboolean jreduce_level,
+    jint jtarget_level, jint jtarget_path_id) {
+
+  rocksdb::Status s;
+  rocksdb::CompactRangeOptions compact_options;
+  compact_options.change_level = jreduce_level;
+  compact_options.target_level = jtarget_level;
+  compact_options.target_path_id = static_cast<uint32_t>(jtarget_path_id);
+  if (cf_handle != nullptr) {
+    s = db->CompactRange(compact_options, cf_handle, nullptr, nullptr);
+  } else {
+    // backwards compatibility
+    s = db->CompactRange(compact_options, nullptr, nullptr);
+  }
+
+  if (s.ok()) {
+    return;
+  }
+  rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    compactRange0
+ * Signature: (JZII)V
+ */
+void Java_org_rocksdb_RocksDB_compactRange0__JZII(JNIEnv* env,
+    jobject jdb, jlong jdb_handle, jboolean jreduce_level,
+    jint jtarget_level, jint jtarget_path_id) {
+  auto db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  rocksdb_compactrange_helper(env, db, nullptr, jreduce_level,
+      jtarget_level, jtarget_path_id);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    compactRange
+ * Signature: (JZIIJ)V
+ */
+void Java_org_rocksdb_RocksDB_compactRange__JZIIJ(
+    JNIEnv* env, jobject jdb, jlong jdb_handle,
+     jboolean jreduce_level, jint jtarget_level,
+     jint jtarget_path_id, jlong jcf_handle) {
+  auto db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  auto cf_handle = reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcf_handle);
+  rocksdb_compactrange_helper(env, db, cf_handle, jreduce_level,
+      jtarget_level, jtarget_path_id);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// rocksdb::DB::CompactRange - Range
+
+void rocksdb_compactrange_helper(JNIEnv* env, rocksdb::DB* db,
+    rocksdb::ColumnFamilyHandle* cf_handle, jbyteArray jbegin, jint jbegin_len,
+    jbyteArray jend, jint jend_len, jboolean jreduce_level, jint jtarget_level,
+    jint jtarget_path_id) {
+
+  jbyte* begin = env->GetByteArrayElements(jbegin, 0);
+  jbyte* end = env->GetByteArrayElements(jend, 0);
+  const rocksdb::Slice begin_slice(reinterpret_cast<char*>(begin), jbegin_len);
+  const rocksdb::Slice end_slice(reinterpret_cast<char*>(end), jend_len);
+
+  rocksdb::Status s;
+  rocksdb::CompactRangeOptions compact_options;
+  compact_options.change_level = jreduce_level;
+  compact_options.target_level = jtarget_level;
+  compact_options.target_path_id = static_cast<uint32_t>(jtarget_path_id);
+  if (cf_handle != nullptr) {
+    s = db->CompactRange(compact_options, cf_handle, &begin_slice, &end_slice);
+  } else {
+    // backwards compatibility
+    s = db->CompactRange(compact_options, &begin_slice, &end_slice);
+  }
+
+  env->ReleaseByteArrayElements(jbegin, begin, JNI_ABORT);
+  env->ReleaseByteArrayElements(jend, end, JNI_ABORT);
+
+  if (s.ok()) {
+    return;
+  }
+  rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    compactRange0
+ * Signature: (J[BI[BIZII)V
+ */
+void Java_org_rocksdb_RocksDB_compactRange0__J_3BI_3BIZII(JNIEnv* env,
+    jobject jdb, jlong jdb_handle, jbyteArray jbegin, jint jbegin_len,
+    jbyteArray jend, jint jend_len, jboolean jreduce_level,
+    jint jtarget_level, jint jtarget_path_id) {
+  auto db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  rocksdb_compactrange_helper(env, db, nullptr, jbegin, jbegin_len,
+      jend, jend_len, jreduce_level, jtarget_level, jtarget_path_id);
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    compactRange
+ * Signature: (JJ[BI[BIZII)V
+ */
+void Java_org_rocksdb_RocksDB_compactRange__J_3BI_3BIZIIJ(
+    JNIEnv* env, jobject jdb, jlong jdb_handle, jbyteArray jbegin,
+    jint jbegin_len, jbyteArray jend, jint jend_len,
+    jboolean jreduce_level, jint jtarget_level,
+    jint jtarget_path_id, jlong jcf_handle) {
+  auto db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  auto cf_handle = reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcf_handle);
+  rocksdb_compactrange_helper(env, db, cf_handle, jbegin, jbegin_len,
+      jend, jend_len, jreduce_level, jtarget_level, jtarget_path_id);
+}
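+
+// Illustrative sketch, not part of the upstream file: passing nullptr for
+// both endpoints (the "Full" variant above) compacts the whole keyspace,
+// while real slices narrow the work to one key interval:
+namespace compact_sketch {
+inline rocksdb::Status compact_interval(rocksdb::DB* db,
+    const std::string& lo, const std::string& hi) {
+  rocksdb::CompactRangeOptions compact_options;  // defaults: keep levels
+  const rocksdb::Slice begin(lo);
+  const rocksdb::Slice end(hi);
+  return db->CompactRange(compact_options, &begin, &end);
+}
+}  // namespace compact_sketch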
+
+//////////////////////////////////////////////////////////////////////////////
+// rocksdb::DB::GetLatestSequenceNumber
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    getLatestSequenceNumber
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_RocksDB_getLatestSequenceNumber(JNIEnv* env,
+    jobject jdb, jlong jdb_handle) {
+  auto* db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  return db->GetLatestSequenceNumber();
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// rocksdb::DB enable/disable file deletions
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    disableFileDeletions
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_RocksDB_disableFileDeletions(JNIEnv* env,
+    jobject jdb, jlong jdb_handle) {
+  auto* db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  rocksdb::Status s = db->DisableFileDeletions();
+  if (!s.ok()) {
+    rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    enableFileDeletions
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_RocksDB_enableFileDeletions(JNIEnv* env,
+    jobject jdb, jlong jdb_handle, jboolean jforce) {
+  auto* db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  rocksdb::Status s = db->EnableFileDeletions(jforce);
+  if (!s.ok()) {
+    rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// rocksdb::DB::GetUpdatesSince
+
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    getUpdatesSince
+ * Signature: (JJ)J
+ */
+jlong Java_org_rocksdb_RocksDB_getUpdatesSince(JNIEnv* env,
+    jobject jdb, jlong jdb_handle, jlong jsequence_number) {
+  auto* db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  rocksdb::SequenceNumber sequence_number =
+      static_cast<rocksdb::SequenceNumber>(jsequence_number);
+  std::unique_ptr<rocksdb::TransactionLogIterator> iter;
+  rocksdb::Status s = db->GetUpdatesSince(sequence_number, &iter);
+  if (s.ok()) {
+    return reinterpret_cast<jlong>(iter.release());
+  }
+  rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+  return 0;
+}
diff --git a/src/rocksdb/java/rocksjni/slice.cc b/src/rocksdb/java/rocksjni/slice.cc
new file mode 100644
index 0000000..8111173
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/slice.cc
@@ -0,0 +1,259 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// This file implements the "bridge" between Java and C++ for
+// rocksdb::Slice.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <jni.h>
+#include <string>
+
+#include "include/org_rocksdb_AbstractSlice.h"
+#include "include/org_rocksdb_Slice.h"
+#include "include/org_rocksdb_DirectSlice.h"
+#include "rocksdb/slice.h"
+#include "rocksjni/portal.h"
+
+// <editor-fold desc="org.rocksdb.AbstractSlice>
+
+/*
+ * Class:     org_rocksdb_AbstractSlice
+ * Method:    createNewSliceFromString
+ * Signature: (Ljava/lang/String;)V
+ */
+void Java_org_rocksdb_AbstractSlice_createNewSliceFromString(
+    JNIEnv* env, jobject jobj, jstring jstr) {
+
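+  // copy the UTF-8 bytes out of the JVM-managed string so the new Slice
+  // can outlive the Get/ReleaseStringUTFChars pair below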
+  const auto* str = env->GetStringUTFChars(jstr, 0);
+  const size_t len = strlen(str);
+  char* buf = new char[len + 1];
+  memcpy(buf, str, len);
+  buf[len] = 0;
+  env->ReleaseStringUTFChars(jstr, str);
+
+  const auto* slice = new rocksdb::Slice(buf);
+  rocksdb::AbstractSliceJni::setHandle(env, jobj, slice);
+}
+
+/*
+ * Class:     org_rocksdb_AbstractSlice
+ * Method:    size0
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_AbstractSlice_size0(
+    JNIEnv* env, jobject jobj, jlong handle) {
+  const auto* slice = reinterpret_cast<rocksdb::Slice*>(handle);
+  return static_cast<jint>(slice->size());
+}
+
+/*
+ * Class:     org_rocksdb_AbstractSlice
+ * Method:    empty0
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_AbstractSlice_empty0(
+    JNIEnv* env, jobject jobj, jlong handle) {
+  const auto* slice = reinterpret_cast<rocksdb::Slice*>(handle);
+  return slice->empty();
+}
+
+/*
+ * Class:     org_rocksdb_AbstractSlice
+ * Method:    toString0
+ * Signature: (JZ)Ljava/lang/String;
+ */
+jstring Java_org_rocksdb_AbstractSlice_toString0(
+    JNIEnv* env, jobject jobj, jlong handle, jboolean hex) {
+  const auto* slice = reinterpret_cast<rocksdb::Slice*>(handle);
+  const std::string s = slice->ToString(hex);
+  return env->NewStringUTF(s.c_str());
+}
+
+/*
+ * Class:     org_rocksdb_AbstractSlice
+ * Method:    compare0
+ * Signature: (JJ)I
+ */
+jint Java_org_rocksdb_AbstractSlice_compare0(
+    JNIEnv* env, jobject jobj, jlong handle, jlong otherHandle) {
+  const auto* slice = reinterpret_cast<rocksdb::Slice*>(handle);
+  const auto* otherSlice =
+    reinterpret_cast<rocksdb::Slice*>(otherHandle);
+  return slice->compare(*otherSlice);
+}
+
+/*
+ * Class:     org_rocksdb_AbstractSlice
+ * Method:    startsWith0
+ * Signature: (JJ)Z
+ */
+jboolean Java_org_rocksdb_AbstractSlice_startsWith0(
+    JNIEnv* env, jobject jobj, jlong handle, jlong otherHandle) {
+  const auto* slice = reinterpret_cast<rocksdb::Slice*>(handle);
+  const auto* otherSlice =
+    reinterpret_cast<rocksdb::Slice*>(otherHandle);
+  return slice->starts_with(*otherSlice);
+}
+
+/*
+ * Class:     org_rocksdb_AbstractSlice
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_AbstractSlice_disposeInternal(
+    JNIEnv* env, jobject jobj, jlong handle) {
+  delete reinterpret_cast<rocksdb::Slice*>(handle);
+}
+
+// </editor-fold>
+
+// <editor-fold desc="org.rocksdb.Slice>
+
+/*
+ * Class:     org_rocksdb_Slice
+ * Method:    createNewSlice0
+ * Signature: ([BI)V
+ */
+void Java_org_rocksdb_Slice_createNewSlice0(
+    JNIEnv * env, jobject jobj, jbyteArray data, jint offset) {
+
+  const jsize dataSize = env->GetArrayLength(data);
+  const int len = dataSize - offset;
+  jbyte* ptrData = new jbyte[len];
+  env->GetByteArrayRegion(data, offset, len, ptrData);
+
+  const auto* slice = new rocksdb::Slice((const char*)ptrData, len);
+  rocksdb::AbstractSliceJni::setHandle(env, jobj, slice);
+}
+
+/*
+ * Class:     org_rocksdb_Slice
+ * Method:    createNewSlice1
+ * Signature: ([B)V
+ */
+void Java_org_rocksdb_Slice_createNewSlice1(
+    JNIEnv * env, jobject jobj, jbyteArray data) {
+
+  const int len = env->GetArrayLength(data) + 1;
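+  // len includes one extra byte so the copied buffer can be
+  // NUL-terminated below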
+
+  jboolean isCopy;
+  jbyte* ptrData = env->GetByteArrayElements(data, &isCopy);
+  char* buf = new char[len];
+
+  memcpy(buf, ptrData, len - 1);
+  buf[len-1]='\0';
+
+  const auto* slice =
+      new rocksdb::Slice(buf, len - 1);
+
+  rocksdb::AbstractSliceJni::setHandle(env, jobj, slice);
+  env->ReleaseByteArrayElements(data, ptrData, JNI_ABORT);
+  // NOTE: buf will be deleted in the org.rocksdb.Slice#dispose method
+}
+
+/*
+ * Class:     org_rocksdb_Slice
+ * Method:    data0
+ * Signature: (J)[B
+ */
+jbyteArray Java_org_rocksdb_Slice_data0(
+    JNIEnv* env, jobject jobj, jlong handle) {
+  const auto* slice = reinterpret_cast<rocksdb::Slice*>(handle);
+  const int len = static_cast<int>(slice->size());
+  const jbyteArray data = env->NewByteArray(len);
+  env->SetByteArrayRegion(data, 0, len,
+    reinterpret_cast<const jbyte*>(slice->data()));
+  return data;
+}
+
+/*
+ * Class:     org_rocksdb_Slice
+ * Method:    disposeInternalBuf
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_Slice_disposeInternalBuf(
+    JNIEnv * env, jobject jobj, jlong handle) {
+  const auto* slice = reinterpret_cast<rocksdb::Slice*>(handle);
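+  // org.rocksdb.Slice owns the backing buffer allocated in
+  // createNewSlice0/createNewSlice1, so it is freed here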
+  delete [] slice->data_;
+}
+
+// </editor-fold>
+
+// <editor-fold desc="org.rocksdb.DirectSlice>
+
+/*
+ * Class:     org_rocksdb_DirectSlice
+ * Method:    createNewDirectSlice0
+ * Signature: (Ljava/nio/ByteBuffer;I)V
+ */
+void Java_org_rocksdb_DirectSlice_createNewDirectSlice0(
+    JNIEnv* env, jobject jobj, jobject data, jint length) {
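+  // the Slice wraps the direct ByteBuffer's memory without copying, so
+  // the Java side must keep the buffer alive while the slice is in use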
+  const auto* ptrData =
+     reinterpret_cast<char*>(env->GetDirectBufferAddress(data));
+  const auto* slice = new rocksdb::Slice(ptrData, length);
+  rocksdb::AbstractSliceJni::setHandle(env, jobj, slice);
+}
+
+/*
+ * Class:     org_rocksdb_DirectSlice
+ * Method:    createNewDirectSlice1
+ * Signature: (Ljava/nio/ByteBuffer;)V
+ */
+void Java_org_rocksdb_DirectSlice_createNewDirectSlice1(
+    JNIEnv* env, jobject jobj, jobject data) {
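+  // note: this Slice constructor takes the length from strlen(), so the
+  // buffer contents are assumed to be NUL-terminated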
+  const auto* ptrData =
+    reinterpret_cast<char*>(env->GetDirectBufferAddress(data));
+  const auto* slice = new rocksdb::Slice(ptrData);
+  rocksdb::AbstractSliceJni::setHandle(env, jobj, slice);
+}
+
+/*
+ * Class:     org_rocksdb_DirectSlice
+ * Method:    data0
+ * Signature: (J)Ljava/lang/Object;
+ */
+jobject Java_org_rocksdb_DirectSlice_data0(
+    JNIEnv* env, jobject jobj, jlong handle) {
+  const auto* slice = reinterpret_cast<rocksdb::Slice*>(handle);
+  return env->NewDirectByteBuffer(const_cast<char*>(slice->data()),
+    slice->size());
+}
+
+/*
+ * Class:     org_rocksdb_DirectSlice
+ * Method:    get0
+ * Signature: (JI)B
+ */
+jbyte Java_org_rocksdb_DirectSlice_get0(
+    JNIEnv* env, jobject jobj, jlong handle, jint offset) {
+  const auto* slice = reinterpret_cast<rocksdb::Slice*>(handle);
+  return (*slice)[offset];
+}
+
+/*
+ * Class:     org_rocksdb_DirectSlice
+ * Method:    clear0
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_DirectSlice_clear0(
+    JNIEnv* env, jobject jobj, jlong handle) {
+  auto* slice = reinterpret_cast<rocksdb::Slice*>(handle);
+  delete [] slice->data_;
+  slice->clear();
+}
+
+/*
+ * Class:     org_rocksdb_DirectSlice
+ * Method:    removePrefix0
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_DirectSlice_removePrefix0(
+    JNIEnv* env, jobject jobj, jlong handle, jint length) {
+  auto* slice = reinterpret_cast<rocksdb::Slice*>(handle);
+  slice->remove_prefix(length);
+}
+
+// </editor-fold>
diff --git a/src/rocksdb/java/rocksjni/snapshot.cc b/src/rocksdb/java/rocksjni/snapshot.cc
new file mode 100644
index 0000000..cd10c97
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/snapshot.cc
@@ -0,0 +1,26 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// This file implements the "bridge" between Java and C++.
+
+#include <jni.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "include/org_rocksdb_Snapshot.h"
+#include "rocksdb/db.h"
+#include "rocksjni/portal.h"
+
+/*
+ * Class:     org_rocksdb_Snapshot
+ * Method:    getSequenceNumber
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Snapshot_getSequenceNumber(JNIEnv* env,
+    jobject jobj, jlong jsnapshot_handle) {
+  auto* snapshot = reinterpret_cast<rocksdb::Snapshot*>(
+      jsnapshot_handle);
+  return snapshot->GetSequenceNumber();
+}
diff --git a/src/rocksdb/java/rocksjni/statistics.cc b/src/rocksdb/java/rocksjni/statistics.cc
new file mode 100644
index 0000000..bf170c6
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/statistics.cc
@@ -0,0 +1,50 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// This file implements the "bridge" between Java and C++ and enables
+// calling C++ rocksdb::Statistics methods from the Java side.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <jni.h>
+
+#include "include/org_rocksdb_Statistics.h"
+#include "rocksjni/portal.h"
+#include "rocksdb/statistics.h"
+
+/*
+ * Class:     org_rocksdb_Statistics
+ * Method:    getTickerCount0
+ * Signature: (IJ)J
+ */
+jlong Java_org_rocksdb_Statistics_getTickerCount0(
+    JNIEnv* env, jobject jobj, int tickerType, jlong handle) {
+  auto st = reinterpret_cast<rocksdb::Statistics*>(handle);
+  assert(st != nullptr);
+
+  return st->getTickerCount(static_cast<rocksdb::Tickers>(tickerType));
+}
+
+/*
+ * Class:     org_rocksdb_Statistics
+ * Method:    geHistogramData0
+ * Signature: (IJ)Lorg/rocksdb/HistogramData;
+ */
+jobject Java_org_rocksdb_Statistics_geHistogramData0(
+  JNIEnv* env, jobject jobj, int histogramType, jlong handle) {
+  auto st = reinterpret_cast<rocksdb::Statistics*>(handle);
+  assert(st != nullptr);
+
+  rocksdb::HistogramData data;
+  st->histogramData(static_cast<rocksdb::Histograms>(histogramType),
+    &data);
+
+  // Don't cache or reuse the class pointer: FindClass returns a local
+  // reference that is only valid for the duration of this native call
+  jclass jclazz = env->FindClass("org/rocksdb/HistogramData");
+  jmethodID mid = rocksdb::HistogramDataJni::getConstructorMethodId(
+      env, jclazz);
+  return env->NewObject(jclazz, mid, data.median, data.percentile95,
+      data.percentile99, data.average, data.standard_deviation);
+}
diff --git a/src/rocksdb/java/rocksjni/table.cc b/src/rocksdb/java/rocksjni/table.cc
new file mode 100644
index 0000000..e78e7e0
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/table.cc
@@ -0,0 +1,89 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// This file implements the "bridge" between Java and C++ for rocksdb::Options.
+
+#include <jni.h>
+#include "include/org_rocksdb_PlainTableConfig.h"
+#include "include/org_rocksdb_BlockBasedTableConfig.h"
+#include "rocksdb/table.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/filter_policy.h"
+
+/*
+ * Class:     org_rocksdb_PlainTableConfig
+ * Method:    newTableFactoryHandle
+ * Signature: (IIDIIBZZ)J
+ */
+jlong Java_org_rocksdb_PlainTableConfig_newTableFactoryHandle(
+    JNIEnv* env, jobject jobj, jint jkey_size, jint jbloom_bits_per_key,
+    jdouble jhash_table_ratio, jint jindex_sparseness,
+    jint jhuge_page_tlb_size, jbyte jencoding_type,
+    jboolean jfull_scan_mode, jboolean jstore_index_in_file) {
+  rocksdb::PlainTableOptions options = rocksdb::PlainTableOptions();
+  options.user_key_len = jkey_size;
+  options.bloom_bits_per_key = jbloom_bits_per_key;
+  options.hash_table_ratio = jhash_table_ratio;
+  options.index_sparseness = jindex_sparseness;
+  options.huge_page_tlb_size = jhuge_page_tlb_size;
+  options.encoding_type = static_cast<rocksdb::EncodingType>(
+      jencoding_type);
+  options.full_scan_mode = jfull_scan_mode;
+  options.store_index_in_file = jstore_index_in_file;
+  return reinterpret_cast<jlong>(rocksdb::NewPlainTableFactory(options));
+}
+
+/*
+ * Class:     org_rocksdb_BlockBasedTableConfig
+ * Method:    newTableFactoryHandle
+ * Signature: (ZJIJIIZJZZJIBBI)J
+ */
+jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle(
+    JNIEnv* env, jobject jobj, jboolean no_block_cache, jlong block_cache_size,
+    jint block_cache_num_shardbits, jlong block_size, jint block_size_deviation,
+    jint block_restart_interval, jboolean whole_key_filtering,
+    jlong jfilterPolicy, jboolean cache_index_and_filter_blocks,
+    jboolean hash_index_allow_collision, jlong block_cache_compressed_size,
+    jint block_cache_compressed_num_shard_bits, jbyte jchecksum_type,
+    jbyte jindex_type, jint jformat_version) {
+  rocksdb::BlockBasedTableOptions options;
+  options.no_block_cache = no_block_cache;
+
+  if (!no_block_cache && block_cache_size > 0) {
+    if (block_cache_num_shardbits > 0) {
+      options.block_cache =
+          rocksdb::NewLRUCache(block_cache_size, block_cache_num_shardbits);
+    } else {
+      options.block_cache = rocksdb::NewLRUCache(block_cache_size);
+    }
+  }
+  options.block_size = block_size;
+  options.block_size_deviation = block_size_deviation;
+  options.block_restart_interval = block_restart_interval;
+  options.whole_key_filtering = whole_key_filtering;
+  if (jfilterPolicy > 0) {
+    std::shared_ptr<rocksdb::FilterPolicy> *pFilterPolicy =
+        reinterpret_cast<std::shared_ptr<rocksdb::FilterPolicy> *>(
+            jfilterPolicy);
+    options.filter_policy = *pFilterPolicy;
+  }
+  options.cache_index_and_filter_blocks = cache_index_and_filter_blocks;
+  options.hash_index_allow_collision = hash_index_allow_collision;
+  if (block_cache_compressed_size > 0) {
+    // note: the compressed cache must be assigned to
+    // block_cache_compressed; assigning it to block_cache would
+    // silently overwrite the uncompressed cache configured above
+    if (block_cache_compressed_num_shard_bits > 0) {
+      options.block_cache_compressed =
+          rocksdb::NewLRUCache(block_cache_compressed_size,
+              block_cache_compressed_num_shard_bits);
+    } else {
+      options.block_cache_compressed =
+          rocksdb::NewLRUCache(block_cache_compressed_size);
+    }
+  }
+  options.checksum = static_cast<rocksdb::ChecksumType>(jchecksum_type);
+  options.index_type = static_cast<
+      rocksdb::BlockBasedTableOptions::IndexType>(jindex_type);
+  options.format_version = jformat_version;
+
+  return reinterpret_cast<jlong>(rocksdb::NewBlockBasedTableFactory(options));
+}
diff --git a/src/rocksdb/java/rocksjni/transaction_log.cc b/src/rocksdb/java/rocksjni/transaction_log.cc
new file mode 100644
index 0000000..1d3d7c1
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/transaction_log.cc
@@ -0,0 +1,78 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// This file implements the "bridge" between Java and C++ and enables
+// calling C++ rocksdb::TransactionLogIterator methods from the Java side.
+
+#include <jni.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "include/org_rocksdb_TransactionLogIterator.h"
+#include "rocksdb/transaction_log.h"
+#include "rocksjni/portal.h"
+
+/*
+ * Class:     org_rocksdb_TransactionLogIterator
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_TransactionLogIterator_disposeInternal(
+    JNIEnv* env, jobject jobj, jlong handle) {
+  delete reinterpret_cast<rocksdb::TransactionLogIterator*>(handle);
+}
+
+/*
+ * Class:     org_rocksdb_TransactionLogIterator
+ * Method:    isValid
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_TransactionLogIterator_isValid(
+    JNIEnv* env, jobject jobj, jlong handle) {
+  return reinterpret_cast<rocksdb::TransactionLogIterator*>(handle)->Valid();
+}
+
+/*
+ * Class:     org_rocksdb_TransactionLogIterator
+ * Method:    next
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_TransactionLogIterator_next(
+    JNIEnv* env, jobject jobj, jlong handle) {
+  reinterpret_cast<rocksdb::TransactionLogIterator*>(handle)->Next();
+}
+
+/*
+ * Class:     org_rocksdb_TransactionLogIterator
+ * Method:    status
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_TransactionLogIterator_status(
+    JNIEnv* env, jobject jobj, jlong handle) {
+  rocksdb::Status s = reinterpret_cast<
+      rocksdb::TransactionLogIterator*>(handle)->status();
+  if (!s.ok()) {
+    rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class:     org_rocksdb_TransactionLogIterator
+ * Method:    getBatch
+ * Signature: (J)Lorg/rocksdb/TransactionLogIterator$BatchResult;
+ */
+jobject Java_org_rocksdb_TransactionLogIterator_getBatch(
+    JNIEnv* env, jobject jobj, jlong handle) {
+  rocksdb::BatchResult batch_result =
+      reinterpret_cast<rocksdb::TransactionLogIterator*>(handle)->GetBatch();
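+  // BatchResult owns the WriteBatch: releasing the unique_ptr below
+  // hands the raw pointer to the Java object, which must dispose it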
+  jclass jclazz = env->FindClass(
+      "org/rocksdb/TransactionLogIterator$BatchResult");
+  assert(jclazz != nullptr);
+  jmethodID mid = env->GetMethodID(
+      jclazz, "<init>", "(Lorg/rocksdb/TransactionLogIterator;JJ)V");
+  assert(mid != nullptr);
+  return env->NewObject(jclazz, mid, jobj,
+      batch_result.sequence, batch_result.writeBatchPtr.release());
+}
diff --git a/src/rocksdb/java/rocksjni/ttl.cc b/src/rocksdb/java/rocksjni/ttl.cc
new file mode 100644
index 0000000..ec5b419
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/ttl.cc
@@ -0,0 +1,183 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// This file implements the "bridge" between Java and C++ and enables
+// calling C++ rocksdb::TtlDB methods from the Java side.
+
+#include <jni.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string>
+#include <vector>
+
+#include "include/org_rocksdb_TtlDB.h"
+#include "rocksdb/utilities/db_ttl.h"
+#include "rocksjni/portal.h"
+
+/*
+ * Class:     org_rocksdb_TtlDB
+ * Method:    open
+ * Signature: (JLjava/lang/String;IZ)V
+ */
+void Java_org_rocksdb_TtlDB_open(JNIEnv* env,
+    jobject jttldb, jlong joptions_handle, jstring jdb_path,
+    jint jttl, jboolean jread_only) {
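+  // DBWithTTL lazily discards entries older than jttl seconds during
+  // compaction; a non-positive ttl effectively disables expiry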
+  auto* opt = reinterpret_cast<rocksdb::Options*>(joptions_handle);
+  rocksdb::DBWithTTL* db = nullptr;
+  const char* db_path = env->GetStringUTFChars(jdb_path, 0);
+  rocksdb::Status s = rocksdb::DBWithTTL::Open(*opt, db_path, &db,
+      jttl, jread_only);
+  env->ReleaseStringUTFChars(jdb_path, db_path);
+
+  // as TTLDB extends RocksDB on the java side, we can reuse
+  // the RocksDB portal here.
+  if (s.ok()) {
+      rocksdb::RocksDBJni::setHandle(env, jttldb, db);
+      return;
+  }
+  rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+}
+
+/*
+ * Class:     org_rocksdb_TtlDB
+ * Method:    openCF
+ * Signature: (JLjava/lang/String;Ljava/util/List;
+ *    ILjava/util/List;Z)Ljava/util/List;
+ */
+jobject Java_org_rocksdb_TtlDB_openCF(
+    JNIEnv* env, jobject jdb, jlong jopt_handle, jstring jdb_path,
+    jobject jcfdesc_list, jint jcfdesc_count, jobject jttl_list,
+    jboolean jread_only) {
+  auto* opt = reinterpret_cast<rocksdb::Options*>(jopt_handle);
+  rocksdb::DBWithTTL* db = nullptr;
+  const char* db_path = env->GetStringUTFChars(jdb_path, 0);
+
+  std::vector<jbyte*> cfnames_to_free;
+  std::vector<jbyteArray> jcfnames_for_free;
+
+  std::vector<rocksdb::ColumnFamilyDescriptor> column_families;
+  std::vector<int32_t> ttl_values;
+  std::vector<rocksdb::ColumnFamilyHandle* > handles;
+  // get iterator for ColumnFamilyDescriptors
+  jobject iteratorObj = env->CallObjectMethod(
+      jcfdesc_list, rocksdb::ListJni::getIteratorMethod(env));
+
+  // iterate over ColumnFamilyDescriptors
+  while (env->CallBooleanMethod(
+      iteratorObj, rocksdb::ListJni::getHasNextMethod(env)) == JNI_TRUE) {
+      // get ColumnFamilyDescriptor
+      jobject jcf_descriptor = env->CallObjectMethod(iteratorObj,
+          rocksdb::ListJni::getNextMethod(env));
+      // get ColumnFamilyName
+      jbyteArray byteArray = static_cast<jbyteArray>(env->CallObjectMethod(
+          jcf_descriptor,
+          rocksdb::ColumnFamilyDescriptorJni::getColumnFamilyNameMethod(
+              env)));
+      // get CF Options
+      jobject jcf_opt_obj = env->CallObjectMethod(jcf_descriptor,
+          rocksdb::ColumnFamilyDescriptorJni::getColumnFamilyOptionsMethod(
+              env));
+      rocksdb::ColumnFamilyOptions* cfOptions =
+          rocksdb::ColumnFamilyOptionsJni::getHandle(env, jcf_opt_obj);
+
+      jbyte* cfname = env->GetByteArrayElements(byteArray, 0);
+      const int len = env->GetArrayLength(byteArray);
+
+      // free allocated cfnames after call to open
+      cfnames_to_free.push_back(cfname);
+      jcfnames_for_free.push_back(byteArray);
+      column_families.push_back(rocksdb::ColumnFamilyDescriptor(
+          std::string(reinterpret_cast<char *>(cfname), len), *cfOptions));
+  }
+  // get iterator for TTL values
+  iteratorObj = env->CallObjectMethod(
+        jttl_list, rocksdb::ListJni::getIteratorMethod(env));
+  // iterate over TTL values
+  while (env->CallBooleanMethod(
+      iteratorObj, rocksdb::ListJni::getHasNextMethod(env)) == JNI_TRUE) {
+     // get TTL object
+     jobject jttl_object = env->CallObjectMethod(iteratorObj,
+       rocksdb::ListJni::getNextMethod(env));
+     // get Integer value
+     jclass jIntClazz = env->FindClass("java/lang/Integer");
+     jmethodID getVal = env->GetMethodID(jIntClazz, "intValue", "()I");
+     ttl_values.push_back(env->CallIntMethod(jttl_object, getVal));
+  }
+  rocksdb::Status s = rocksdb::DBWithTTL::Open(*opt, db_path, column_families,
+      &handles, &db, ttl_values, jread_only);
+
+  env->ReleaseStringUTFChars(jdb_path, db_path);
+  // free jbyte allocations
+  for (std::vector<jbyte*>::size_type i = 0;
+      i != cfnames_to_free.size(); i++) {
+    // free cfnames
+    env->ReleaseByteArrayElements(jcfnames_for_free[i], cfnames_to_free[i], 0);
+  }
+
+  // check if open operation was successful
+  if (s.ok()) {
+    rocksdb::RocksDBJni::setHandle(env, jdb, db);
+    jclass jListClazz = env->FindClass("java/util/ArrayList");
+    jmethodID midList = rocksdb::ListJni::getArrayListConstructorMethodId(
+        env, jListClazz);
+    jobject jcfhandle_list = env->NewObject(jListClazz,
+        midList, handles.size());
+    // insert in java list
+    for (std::vector<rocksdb::ColumnFamilyHandle*>::size_type i = 0;
+        i != handles.size(); i++) {
+      // jlong must be converted to Long due to collections restrictions
+      jclass jLongClazz = env->FindClass("java/lang/Long");
+      jmethodID midLong = env->GetMethodID(jLongClazz, "<init>", "(J)V");
+      jobject obj = env->NewObject(jLongClazz, midLong,
+          reinterpret_cast<jlong>(handles[i]));
+      env->CallBooleanMethod(jcfhandle_list,
+          rocksdb::ListJni::getListAddMethodId(env), obj);
+    }
+
+    return jcfhandle_list;
+  }
+  rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+  return nullptr;
+}
+
+/*
+ * Class:     org_rocksdb_TtlDB
+ * Method:    createColumnFamilyWithTtl
+ * Signature: (JLorg/rocksdb/ColumnFamilyDescriptor;I)J
+ */
+jlong Java_org_rocksdb_TtlDB_createColumnFamilyWithTtl(
+    JNIEnv* env, jobject jobj, jlong jdb_handle,
+    jobject jcf_descriptor, jint jttl) {
+  rocksdb::ColumnFamilyHandle* handle;
+  auto* db_handle = reinterpret_cast<rocksdb::DBWithTTL*>(jdb_handle);
+
+  // get ColumnFamilyName
+  jbyteArray byteArray = static_cast<jbyteArray>(env->CallObjectMethod(
+      jcf_descriptor,
+      rocksdb::ColumnFamilyDescriptorJni::getColumnFamilyNameMethod(
+          env)));
+  // get CF Options
+  jobject jcf_opt_obj = env->CallObjectMethod(jcf_descriptor,
+      rocksdb::ColumnFamilyDescriptorJni::getColumnFamilyOptionsMethod(
+      env));
+  rocksdb::ColumnFamilyOptions* cfOptions =
+      rocksdb::ColumnFamilyOptionsJni::getHandle(env, jcf_opt_obj);
+
+  jbyte* cfname = env->GetByteArrayElements(byteArray, 0);
+  const int len = env->GetArrayLength(byteArray);
+
+  rocksdb::Status s = db_handle->CreateColumnFamilyWithTtl(
+      *cfOptions, std::string(reinterpret_cast<char *>(cfname),
+          len), &handle, jttl);
+  env->ReleaseByteArrayElements(byteArray, cfname, 0);
+
+  if (s.ok()) {
+    return reinterpret_cast<jlong>(handle);
+  }
+  rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+  return 0;
+}
diff --git a/src/rocksdb/java/rocksjni/write_batch.cc b/src/rocksdb/java/rocksjni/write_batch.cc
new file mode 100644
index 0000000..aa0c230
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/write_batch.cc
@@ -0,0 +1,238 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// This file implements the "bridge" between Java and C++ and enables
+// calling C++ rocksdb::WriteBatch methods from the Java side.
+#include <memory>
+
+#include "include/org_rocksdb_WriteBatch.h"
+#include "include/org_rocksdb_WriteBatch_Handler.h"
+#include "rocksjni/portal.h"
+#include "rocksjni/writebatchhandlerjnicallback.h"
+#include "rocksdb/db.h"
+#include "rocksdb/immutable_options.h"
+#include "db/memtable.h"
+#include "rocksdb/write_batch.h"
+#include "rocksdb/status.h"
+#include "db/write_batch_internal.h"
+#include "db/writebuffer.h"
+#include "rocksdb/env.h"
+#include "rocksdb/memtablerep.h"
+#include "util/logging.h"
+#include "util/scoped_arena_iterator.h"
+#include "util/testharness.h"
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    newWriteBatch
+ * Signature: (I)V
+ */
+void Java_org_rocksdb_WriteBatch_newWriteBatch(
+    JNIEnv* env, jobject jobj, jint jreserved_bytes) {
+  rocksdb::WriteBatch* wb = new rocksdb::WriteBatch(
+      static_cast<size_t>(jreserved_bytes));
+
+  rocksdb::WriteBatchJni::setHandle(env, jobj, wb);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    count0
+ * Signature: ()I
+ */
+jint Java_org_rocksdb_WriteBatch_count0(JNIEnv* env, jobject jobj) {
+  rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj);
+  assert(wb != nullptr);
+
+  return static_cast<jint>(wb->Count());
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    clear0
+ * Signature: ()V
+ */
+void Java_org_rocksdb_WriteBatch_clear0(JNIEnv* env, jobject jobj) {
+  rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj);
+  assert(wb != nullptr);
+
+  wb->Clear();
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    put
+ * Signature: ([BI[BI)V
+ */
+void Java_org_rocksdb_WriteBatch_put___3BI_3BI(
+    JNIEnv* env, jobject jobj,
+    jbyteArray jkey, jint jkey_len,
+    jbyteArray jentry_value, jint jentry_value_len) {
+  auto* wb = rocksdb::WriteBatchJni::getHandle(env, jobj);
+  assert(wb != nullptr);
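+  // JniUtil::kv_op materializes the key/value byte arrays from the JVM,
+  // wraps them in temporary Slices, applies the lambda, and releases the
+  // arrays afterwards; the same pattern is used for the methods below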
+  auto put = [&wb] (rocksdb::Slice key, rocksdb::Slice value) {
+    wb->Put(key, value);
+  };
+  rocksdb::JniUtil::kv_op(put, env, jobj, jkey, jkey_len, jentry_value,
+      jentry_value_len);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    put
+ * Signature: ([BI[BIJ)V
+ */
+void Java_org_rocksdb_WriteBatch_put___3BI_3BIJ(
+    JNIEnv* env, jobject jobj,
+    jbyteArray jkey, jint jkey_len,
+    jbyteArray jentry_value, jint jentry_value_len, jlong jcf_handle) {
+  auto* wb = rocksdb::WriteBatchJni::getHandle(env, jobj);
+  assert(wb != nullptr);
+  auto* cf_handle = reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcf_handle);
+  assert(cf_handle != nullptr);
+  auto put = [&wb, &cf_handle] (rocksdb::Slice key, rocksdb::Slice value) {
+    wb->Put(cf_handle, key, value);
+  };
+  rocksdb::JniUtil::kv_op(put, env, jobj, jkey, jkey_len, jentry_value,
+      jentry_value_len);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    merge
+ * Signature: ([BI[BI)V
+ */
+void Java_org_rocksdb_WriteBatch_merge___3BI_3BI(
+    JNIEnv* env, jobject jobj,
+    jbyteArray jkey, jint jkey_len,
+    jbyteArray jentry_value, jint jentry_value_len) {
+  auto* wb = rocksdb::WriteBatchJni::getHandle(env, jobj);
+  assert(wb != nullptr);
+  auto merge = [&wb] (rocksdb::Slice key, rocksdb::Slice value) {
+    wb->Merge(key, value);
+  };
+  rocksdb::JniUtil::kv_op(merge, env, jobj, jkey, jkey_len, jentry_value,
+      jentry_value_len);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    merge
+ * Signature: ([BI[BIJ)V
+ */
+void Java_org_rocksdb_WriteBatch_merge___3BI_3BIJ(
+    JNIEnv* env, jobject jobj,
+    jbyteArray jkey, jint jkey_len,
+    jbyteArray jentry_value, jint jentry_value_len, jlong jcf_handle) {
+  auto* wb = rocksdb::WriteBatchJni::getHandle(env, jobj);
+  assert(wb != nullptr);
+  auto* cf_handle = reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcf_handle);
+  assert(cf_handle != nullptr);
+  auto merge = [&wb, &cf_handle] (rocksdb::Slice key, rocksdb::Slice value) {
+    wb->Merge(cf_handle, key, value);
+  };
+  rocksdb::JniUtil::kv_op(merge, env, jobj, jkey, jkey_len, jentry_value,
+      jentry_value_len);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    remove
+ * Signature: ([BI)V
+ */
+void Java_org_rocksdb_WriteBatch_remove___3BI(
+    JNIEnv* env, jobject jobj,
+    jbyteArray jkey, jint jkey_len) {
+  auto* wb = rocksdb::WriteBatchJni::getHandle(env, jobj);
+  assert(wb != nullptr);
+  auto remove = [&wb] (rocksdb::Slice key) {
+    wb->Delete(key);
+  };
+  rocksdb::JniUtil::k_op(remove, env, jobj, jkey, jkey_len);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    remove
+ * Signature: ([BIJ)V
+ */
+void Java_org_rocksdb_WriteBatch_remove___3BIJ(
+    JNIEnv* env, jobject jobj,
+    jbyteArray jkey, jint jkey_len, jlong jcf_handle) {
+  auto* wb = rocksdb::WriteBatchJni::getHandle(env, jobj);
+  assert(wb != nullptr);
+  auto* cf_handle = reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcf_handle);
+  assert(cf_handle != nullptr);
+  auto remove = [&wb, &cf_handle] (rocksdb::Slice key) {
+    wb->Delete(cf_handle, key);
+  };
+  rocksdb::JniUtil::k_op(remove, env, jobj, jkey, jkey_len);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    putLogData
+ * Signature: ([BI)V
+ */
+void Java_org_rocksdb_WriteBatch_putLogData(
+    JNIEnv* env, jobject jobj, jbyteArray jblob, jint jblob_len) {
+  auto* wb = rocksdb::WriteBatchJni::getHandle(env, jobj);
+  assert(wb != nullptr);
+  auto putLogData = [&wb] (rocksdb::Slice blob) {
+    wb->PutLogData(blob);
+  };
+  rocksdb::JniUtil::k_op(putLogData, env, jobj, jblob, jblob_len);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    iterate
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_WriteBatch_iterate(
+    JNIEnv* env , jobject jobj, jlong handlerHandle) {
+  rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj);
+  assert(wb != nullptr);
+
+  rocksdb::Status s = wb->Iterate(
+    reinterpret_cast<rocksdb::WriteBatchHandlerJniCallback*>(handlerHandle));
+
+  if (s.ok()) {
+    return;
+  }
+  rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_WriteBatch_disposeInternal(
+    JNIEnv* env, jobject jobj, jlong handle) {
+  delete reinterpret_cast<rocksdb::WriteBatch*>(handle);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch_Handler
+ * Method:    createNewHandler0
+ * Signature: ()V
+ */
+void Java_org_rocksdb_WriteBatch_00024Handler_createNewHandler0(
+    JNIEnv* env, jobject jobj) {
+  const rocksdb::WriteBatchHandlerJniCallback* h =
+    new rocksdb::WriteBatchHandlerJniCallback(env, jobj);
+  rocksdb::WriteBatchHandlerJni::setHandle(env, jobj, h);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatch_Handler
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_WriteBatch_00024Handler_disposeInternal(
+    JNIEnv* env, jobject jobj, jlong handle) {
+  delete reinterpret_cast<rocksdb::WriteBatchHandlerJniCallback*>(handle);
+}
diff --git a/src/rocksdb/java/rocksjni/write_batch_test.cc b/src/rocksdb/java/rocksjni/write_batch_test.cc
new file mode 100644
index 0000000..d540291
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/write_batch_test.cc
@@ -0,0 +1,148 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// This file implements the "bridge" between Java and C++ and enables
+// testing the C++ rocksdb::WriteBatch methods from the Java side.
+#include <memory>
+
+#include "db/memtable.h"
+#include "db/write_batch_internal.h"
+#include "db/writebuffer.h"
+#include "include/org_rocksdb_WriteBatch.h"
+#include "include/org_rocksdb_WriteBatch_Handler.h"
+#include "include/org_rocksdb_WriteBatchTest.h"
+#include "include/org_rocksdb_WriteBatchTestInternalHelper.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/immutable_options.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/status.h"
+#include "rocksdb/write_batch.h"
+#include "rocksjni/portal.h"
+#include "util/logging.h"
+#include "util/scoped_arena_iterator.h"
+#include "util/testharness.h"
+
+/*
+ * Class:     org_rocksdb_WriteBatchTest
+ * Method:    getContents
+ * Signature: (Lorg/rocksdb/WriteBatch;)[B
+ */
+jbyteArray Java_org_rocksdb_WriteBatchTest_getContents(
+    JNIEnv* env, jclass jclazz, jobject jobj) {
+  rocksdb::WriteBatch* b = rocksdb::WriteBatchJni::getHandle(env, jobj);
+  assert(b != nullptr);
+
+  // TODO: currently the following code is copied directly from
+  // db/write_batch_test.cc. It could be implemented in Java once
+  // all the necessary components can be accessed via the JNI API.
+
+  rocksdb::InternalKeyComparator cmp(rocksdb::BytewiseComparator());
+  auto factory = std::make_shared<rocksdb::SkipListFactory>();
+  rocksdb::Options options;
+  rocksdb::WriteBuffer wb(options.db_write_buffer_size);
+  options.memtable_factory = factory;
+  rocksdb::MemTable* mem = new rocksdb::MemTable(
+      cmp, rocksdb::ImmutableCFOptions(options),
+      rocksdb::MutableCFOptions(options, rocksdb::ImmutableCFOptions(options)),
+      &wb, rocksdb::kMaxSequenceNumber);
+  mem->Ref();
+  std::string state;
+  rocksdb::ColumnFamilyMemTablesDefault cf_mems_default(mem);
+  rocksdb::Status s =
+      rocksdb::WriteBatchInternal::InsertInto(b, &cf_mems_default);
+  int count = 0;
+  rocksdb::Arena arena;
+  rocksdb::ScopedArenaIterator iter(mem->NewIterator(
+      rocksdb::ReadOptions(), &arena));
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    rocksdb::ParsedInternalKey ikey;
+    memset(reinterpret_cast<void*>(&ikey), 0, sizeof(ikey));
+    // keep the parse outside assert() so it still executes in
+    // release (NDEBUG) builds
+    const bool parsed = rocksdb::ParseInternalKey(iter->key(), &ikey);
+    assert(parsed);
+    (void)parsed;
+    switch (ikey.type) {
+      case rocksdb::kTypeValue:
+        state.append("Put(");
+        state.append(ikey.user_key.ToString());
+        state.append(", ");
+        state.append(iter->value().ToString());
+        state.append(")");
+        count++;
+        break;
+      case rocksdb::kTypeMerge:
+        state.append("Merge(");
+        state.append(ikey.user_key.ToString());
+        state.append(", ");
+        state.append(iter->value().ToString());
+        state.append(")");
+        count++;
+        break;
+      case rocksdb::kTypeDeletion:
+        state.append("Delete(");
+        state.append(ikey.user_key.ToString());
+        state.append(")");
+        count++;
+        break;
+      default:
+        assert(false);
+        break;
+    }
+    state.append("@");
+    state.append(rocksdb::NumberToString(ikey.sequence));
+  }
+  if (!s.ok()) {
+    state.append(s.ToString());
+  } else if (count != rocksdb::WriteBatchInternal::Count(b)) {
+    state.append("CountMismatch()");
+  }
+  delete mem->Unref();
+
+  jbyteArray jstate = env->NewByteArray(static_cast<jsize>(state.size()));
+  env->SetByteArrayRegion(jstate, 0, static_cast<jsize>(state.size()),
+                          reinterpret_cast<const jbyte*>(state.c_str()));
+
+  return jstate;
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatchTestInternalHelper
+ * Method:    setSequence
+ * Signature: (Lorg/rocksdb/WriteBatch;J)V
+ */
+void Java_org_rocksdb_WriteBatchTestInternalHelper_setSequence(
+    JNIEnv* env, jclass jclazz, jobject jobj, jlong jsn) {
+  rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj);
+  assert(wb != nullptr);
+
+  rocksdb::WriteBatchInternal::SetSequence(
+      wb, static_cast<rocksdb::SequenceNumber>(jsn));
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatchTestInternalHelper
+ * Method:    sequence
+ * Signature: (Lorg/rocksdb/WriteBatch;)J
+ */
+jlong Java_org_rocksdb_WriteBatchTestInternalHelper_sequence(
+    JNIEnv* env, jclass jclazz, jobject jobj) {
+  rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj);
+  assert(wb != nullptr);
+
+  return static_cast<jlong>(rocksdb::WriteBatchInternal::Sequence(wb));
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatchTestInternalHelper
+ * Method:    append
+ * Signature: (Lorg/rocksdb/WriteBatch;Lorg/rocksdb/WriteBatch;)V
+ */
+void Java_org_rocksdb_WriteBatchTestInternalHelper_append(
+    JNIEnv* env, jclass jclazz, jobject jwb1, jobject jwb2) {
+  rocksdb::WriteBatch* wb1 = rocksdb::WriteBatchJni::getHandle(env, jwb1);
+  assert(wb1 != nullptr);
+  rocksdb::WriteBatch* wb2 = rocksdb::WriteBatchJni::getHandle(env, jwb2);
+  assert(wb2 != nullptr);
+
+  rocksdb::WriteBatchInternal::Append(wb1, wb2);
+}
diff --git a/src/rocksdb/java/rocksjni/write_batch_with_index.cc b/src/rocksdb/java/rocksjni/write_batch_with_index.cc
new file mode 100644
index 0000000..7c57a0e
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/write_batch_with_index.cc
@@ -0,0 +1,386 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// This file implements the "bridge" between Java and C++ and enables
+// calling C++ rocksdb::WriteBatchWithIndex methods from the Java side.
+
+#include "include/org_rocksdb_WBWIRocksIterator.h"
+#include "include/org_rocksdb_WriteBatchWithIndex.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "rocksjni/portal.h"
+
+/*
+ * Class:     org_rocksdb_WriteBatchWithIndex
+ * Method:    newWriteBatchWithIndex
+ * Signature: ()V
+ */
+void Java_org_rocksdb_WriteBatchWithIndex_newWriteBatchWithIndex__(
+    JNIEnv* env, jobject jobj) {
+  rocksdb::WriteBatchWithIndex* wbwi = new rocksdb::WriteBatchWithIndex();
+  rocksdb::WriteBatchWithIndexJni::setHandle(env, jobj, wbwi);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatchWithIndex
+ * Method:    newWriteBatchWithIndex
+ * Signature: (Z)V
+ */
+void Java_org_rocksdb_WriteBatchWithIndex_newWriteBatchWithIndex__Z(
+    JNIEnv* env, jobject jobj, jboolean joverwrite_key) {
+  rocksdb::WriteBatchWithIndex* wbwi =
+      new rocksdb::WriteBatchWithIndex(rocksdb::BytewiseComparator(), 0,
+      static_cast<bool>(joverwrite_key));
+  rocksdb::WriteBatchWithIndexJni::setHandle(env, jobj, wbwi);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatchWithIndex
+ * Method:    newWriteBatchWithIndex
+ * Signature: (JIZ)V
+ */
+void Java_org_rocksdb_WriteBatchWithIndex_newWriteBatchWithIndex__JIZ(
+    JNIEnv* env, jobject jobj, jlong jfallback_index_comparator_handle,
+    jint jreserved_bytes, jboolean joverwrite_key) {
+  rocksdb::WriteBatchWithIndex* wbwi =
+      new rocksdb::WriteBatchWithIndex(
+      reinterpret_cast<rocksdb::Comparator*>(jfallback_index_comparator_handle),
+      static_cast<size_t>(jreserved_bytes), static_cast<bool>(joverwrite_key));
+  rocksdb::WriteBatchWithIndexJni::setHandle(env, jobj, wbwi);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatchWithIndex
+ * Method:    count0
+ * Signature: ()I
+ */
+jint Java_org_rocksdb_WriteBatchWithIndex_count0(
+    JNIEnv* env, jobject jobj) {
+  rocksdb::WriteBatchWithIndex* wbwi =
+      rocksdb::WriteBatchWithIndexJni::getHandle(env, jobj);
+  assert(wbwi != nullptr);
+
+  return static_cast<jint>(wbwi->GetWriteBatch()->Count());
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatchWithIndex
+ * Method:    put
+ * Signature: ([BI[BI)V
+ */
+void Java_org_rocksdb_WriteBatchWithIndex_put___3BI_3BI(
+    JNIEnv* env, jobject jobj, jbyteArray jkey, jint jkey_len,
+    jbyteArray jentry_value, jint jentry_value_len) {
+  auto* wbwi =
+      rocksdb::WriteBatchWithIndexJni::getHandle(env, jobj);
+  assert(wbwi != nullptr);
+  auto put = [&wbwi] (rocksdb::Slice key, rocksdb::Slice value) {
+    wbwi->Put(key, value);
+  };
+  rocksdb::JniUtil::kv_op(put, env, jobj, jkey, jkey_len, jentry_value,
+      jentry_value_len);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatchWithIndex
+ * Method:    put
+ * Signature: ([BI[BIJ)V
+ */
+void Java_org_rocksdb_WriteBatchWithIndex_put___3BI_3BIJ(
+    JNIEnv* env, jobject jobj, jbyteArray jkey, jint jkey_len,
+    jbyteArray jentry_value, jint jentry_value_len, jlong jcf_handle) {
+  auto* wbwi =
+      rocksdb::WriteBatchWithIndexJni::getHandle(env, jobj);
+  assert(wbwi != nullptr);
+  auto* cf_handle = reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcf_handle);
+  assert(cf_handle != nullptr);
+  auto put = [&wbwi, &cf_handle] (rocksdb::Slice key, rocksdb::Slice value) {
+    wbwi->Put(cf_handle, key, value);
+  };
+  rocksdb::JniUtil::kv_op(put, env, jobj, jkey, jkey_len, jentry_value,
+      jentry_value_len);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatchWithIndex
+ * Method:    merge
+ * Signature: ([BI[BI)V
+ */
+void Java_org_rocksdb_WriteBatchWithIndex_merge___3BI_3BI(
+    JNIEnv* env, jobject jobj, jbyteArray jkey, jint jkey_len,
+    jbyteArray jentry_value, jint jentry_value_len) {
+  auto* wbwi =
+      rocksdb::WriteBatchWithIndexJni::getHandle(env, jobj);
+  assert(wbwi != nullptr);
+  auto merge = [&wbwi] (rocksdb::Slice key, rocksdb::Slice value) {
+    wbwi->Merge(key, value);
+  };
+  rocksdb::JniUtil::kv_op(merge, env, jobj, jkey, jkey_len, jentry_value,
+      jentry_value_len);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatchWithIndex
+ * Method:    merge
+ * Signature: ([BI[BIJ)V
+ */
+void Java_org_rocksdb_WriteBatchWithIndex_merge___3BI_3BIJ(
+    JNIEnv* env, jobject jobj, jbyteArray jkey, jint jkey_len,
+    jbyteArray jentry_value, jint jentry_value_len, jlong jcf_handle) {
+  auto* wbwi =
+      rocksdb::WriteBatchWithIndexJni::getHandle(env, jobj);
+  assert(wbwi != nullptr);
+  auto* cf_handle = reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcf_handle);
+  assert(cf_handle != nullptr);
+  auto merge = [&wbwi, &cf_handle] (rocksdb::Slice key, rocksdb::Slice value) {
+    wbwi->Merge(cf_handle, key, value);
+  };
+  rocksdb::JniUtil::kv_op(merge, env, jobj, jkey, jkey_len, jentry_value,
+      jentry_value_len);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatchWithIndex
+ * Method:    remove
+ * Signature: ([BI)V
+ */
+void Java_org_rocksdb_WriteBatchWithIndex_remove___3BI(
+    JNIEnv* env, jobject jobj, jbyteArray jkey, jint jkey_len) {
+  auto* wbwi =
+      rocksdb::WriteBatchWithIndexJni::getHandle(env, jobj);
+  assert(wbwi != nullptr);
+  auto remove = [&wbwi] (rocksdb::Slice key) {
+    wbwi->Delete(key);
+  };
+  rocksdb::JniUtil::k_op(remove, env, jobj, jkey, jkey_len);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatchWithIndex
+ * Method:    remove
+ * Signature: ([BIJ)V
+ */
+void Java_org_rocksdb_WriteBatchWithIndex_remove___3BIJ(
+    JNIEnv* env, jobject jobj,
+    jbyteArray jkey, jint jkey_len, jlong jcf_handle) {
+  auto* wbwi =
+      rocksdb::WriteBatchWithIndexJni::getHandle(env, jobj);
+  assert(wbwi != nullptr);
+  auto* cf_handle = reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcf_handle);
+  assert(cf_handle != nullptr);
+  auto remove = [&wbwi, &cf_handle] (rocksdb::Slice key) {
+    wbwi->Delete(cf_handle, key);
+  };
+  rocksdb::JniUtil::k_op(remove, env, jobj, jkey, jkey_len);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatchWithIndex
+ * Method:    putLogData
+ * Signature: ([BI)V
+ */
+void Java_org_rocksdb_WriteBatchWithIndex_putLogData(
+    JNIEnv* env, jobject jobj, jbyteArray jblob, jint jblob_len) {
+  auto* wbwi =
+      rocksdb::WriteBatchWithIndexJni::getHandle(env, jobj);
+  assert(wbwi != nullptr);
+  auto putLogData = [&wbwi] (rocksdb::Slice blob) {
+    wbwi->PutLogData(blob);
+  };
+  rocksdb::JniUtil::k_op(putLogData, env, jobj, jblob, jblob_len);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatchWithIndex
+ * Method:    clear0
+ * Signature: ()V
+ */
+void Java_org_rocksdb_WriteBatchWithIndex_clear0(
+    JNIEnv* env, jobject jobj) {
+  rocksdb::WriteBatchWithIndex* wbwi =
+      rocksdb::WriteBatchWithIndexJni::getHandle(env, jobj);
+  assert(wbwi != nullptr);
+
+  wbwi->GetWriteBatch()->Clear();
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatchWithIndex
+ * Method:    iterator0
+ * Signature: ()J
+ */
+jlong Java_org_rocksdb_WriteBatchWithIndex_iterator0(
+    JNIEnv* env, jobject jobj) {
+  rocksdb::WriteBatchWithIndex* wbwi =
+      rocksdb::WriteBatchWithIndexJni::getHandle(env, jobj);
+  rocksdb::WBWIIterator* wbwi_iterator = wbwi->NewIterator();
+  return reinterpret_cast<jlong>(wbwi_iterator);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatchWithIndex
+ * Method:    iterator1
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_WriteBatchWithIndex_iterator1(
+    JNIEnv* env, jobject jobj, jlong jcf_handle) {
+  rocksdb::WriteBatchWithIndex* wbwi =
+      rocksdb::WriteBatchWithIndexJni::getHandle(env, jobj);
+  auto* cf_handle = reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcf_handle);
+  rocksdb::WBWIIterator* wbwi_iterator = wbwi->NewIterator(cf_handle);
+  return reinterpret_cast<jlong>(wbwi_iterator);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatchWithIndex
+ * Method:    iteratorWithBase
+ * Signature: (JJ)J
+ */
+jlong Java_org_rocksdb_WriteBatchWithIndex_iteratorWithBase(
+    JNIEnv* env, jobject jobj, jlong jcf_handle, jlong jbi_handle) {
+  rocksdb::WriteBatchWithIndex* wbwi =
+      rocksdb::WriteBatchWithIndexJni::getHandle(env, jobj);
+  auto* cf_handle = reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcf_handle);
+  auto* base_iterator = reinterpret_cast<rocksdb::Iterator*>(jbi_handle);
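+  // the returned iterator overlays the batch's index on top of the
+  // supplied base (database) iterator, giving a read-your-own-writes view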
+  auto* iterator = wbwi->NewIteratorWithBase(cf_handle, base_iterator);
+  return reinterpret_cast<jlong>(iterator);
+}
+
+/*
+ * Class:     org_rocksdb_WriteBatchWithIndex
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_WriteBatchWithIndex_disposeInternal(
+    JNIEnv* env, jobject jobj, jlong handle) {
+  auto* wbwi = reinterpret_cast<rocksdb::WriteBatchWithIndex*>(handle);
+  delete wbwi;
+}
+
+/* WBWIRocksIterator below */
+
+/*
+ * Class:     org_rocksdb_WBWIRocksIterator
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_WBWIRocksIterator_disposeInternal(
+    JNIEnv* env, jobject jobj, jlong handle) {
+  auto* it = reinterpret_cast<rocksdb::WBWIIterator*>(handle);
+  delete it;
+}
+
+/*
+ * Class:     org_rocksdb_WBWIRocksIterator
+ * Method:    isValid0
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_WBWIRocksIterator_isValid0(
+    JNIEnv* env, jobject jobj, jlong handle) {
+  return reinterpret_cast<rocksdb::WBWIIterator*>(handle)->Valid();
+}
+
+/*
+ * Class:     org_rocksdb_WBWIRocksIterator
+ * Method:    seekToFirst0
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_WBWIRocksIterator_seekToFirst0(
+    JNIEnv* env, jobject jobj, jlong handle) {
+  reinterpret_cast<rocksdb::WBWIIterator*>(handle)->SeekToFirst();
+}
+
+/*
+ * Class:     org_rocksdb_WBWIRocksIterator
+ * Method:    seekToLast0
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_WBWIRocksIterator_seekToLast0(
+    JNIEnv* env, jobject jobj, jlong handle) {
+  reinterpret_cast<rocksdb::WBWIIterator*>(handle)->SeekToLast();
+}
+
+/*
+ * Class:     org_rocksdb_WBWIRocksIterator
+ * Method:    next0
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_WBWIRocksIterator_next0(
+    JNIEnv* env, jobject jobj, jlong handle) {
+  reinterpret_cast<rocksdb::WBWIIterator*>(handle)->Next();
+}
+
+/*
+ * Class:     org_rocksdb_WBWIRocksIterator
+ * Method:    prev0
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_WBWIRocksIterator_prev0(
+    JNIEnv* env, jobject jobj, jlong handle) {
+  reinterpret_cast<rocksdb::WBWIIterator*>(handle)->Prev();
+}
+
+/*
+ * Class:     org_rocksdb_WBWIRocksIterator
+ * Method:    seek0
+ * Signature: (J[BI)V
+ */
+void Java_org_rocksdb_WBWIRocksIterator_seek0(
+    JNIEnv* env, jobject jobj, jlong handle, jbyteArray jtarget,
+    jint jtarget_len) {
+  auto* it = reinterpret_cast<rocksdb::WBWIIterator*>(handle);
+  jbyte* target = env->GetByteArrayElements(jtarget, 0);
+  rocksdb::Slice target_slice(
+      reinterpret_cast<char*>(target), jtarget_len);
+
+  it->Seek(target_slice);
+
+  env->ReleaseByteArrayElements(jtarget, target, JNI_ABORT);
+}
+
+/*
+ * Class:     org_rocksdb_WBWIRocksIterator
+ * Method:    status0
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_WBWIRocksIterator_status0(
+    JNIEnv* env, jobject jobj, jlong handle) {
+  auto* it = reinterpret_cast<rocksdb::WBWIIterator*>(handle);
+  rocksdb::Status s = it->status();
+
+  if (s.ok()) {
+    return;
+  }
+
+  rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+}
+
+/*
+ * Class:     org_rocksdb_WBWIRocksIterator
+ * Method:    entry1
+ * Signature: (JLorg/rocksdb/WBWIRocksIterator$WriteEntry;)V
+ */
+void Java_org_rocksdb_WBWIRocksIterator_entry1(
+    JNIEnv* env, jobject jobj, jlong handle, jobject jwrite_entry) {
+  auto* it = reinterpret_cast<rocksdb::WBWIIterator*>(handle);
+  const rocksdb::WriteEntry& we = it->Entry();
+  jobject jwe = rocksdb::WBWIRocksIteratorJni::getWriteEntry(env, jobj);
+  rocksdb::WriteEntryJni::setWriteType(env, jwe, we.type);
+
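+  // copy the key (and value below) into buffers owned by the Java
+  // WriteEntry, since the iterator's internal slices are invalidated
+  // when it advances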
+  char* buf = new char[we.key.size()];
+  memcpy(buf, we.key.data(), we.key.size());
+  auto* key_slice = new rocksdb::Slice(buf, we.key.size());
+  rocksdb::WriteEntryJni::setKey(env, jwe, key_slice);
+
+  if (we.type == rocksdb::kDeleteRecord || we.type == rocksdb::kLogDataRecord) {
+    // set native handle of value slice to null if no value available
+    rocksdb::WriteEntryJni::setValue(env, jwe, nullptr);
+  } else {
+    char* value_buf = new char[we.value.size()];
+    memcpy(value_buf, we.value.data(), we.value.size());
+    auto* value_slice = new rocksdb::Slice(value_buf, we.value.size());
+    rocksdb::WriteEntryJni::setValue(env, jwe, value_slice);
+  }
+}
diff --git a/src/rocksdb/java/rocksjni/writebatchhandlerjnicallback.cc b/src/rocksdb/java/rocksjni/writebatchhandlerjnicallback.cc
new file mode 100644
index 0000000..b12e355
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/writebatchhandlerjnicallback.cc
@@ -0,0 +1,104 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// This file implements the callback "bridge" between Java and C++ for
+// rocksdb::WriteBatch::Handler.
+
+#include "rocksjni/writebatchhandlerjnicallback.h"
+#include "rocksjni/portal.h"
+
+namespace rocksdb {
+WriteBatchHandlerJniCallback::WriteBatchHandlerJniCallback(
+    JNIEnv* env, jobject jWriteBatchHandler)
+    : m_env(env) {
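+  // note: the cached JNIEnv is only valid on the thread that created
+  // this callback, so the handler must be iterated on that same thread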
+
+  // Note: we want to access the Java WriteBatchHandler instance
+  // across multiple method calls, so we create a global ref
+  m_jWriteBatchHandler = env->NewGlobalRef(jWriteBatchHandler);
+
+  m_jPutMethodId = WriteBatchHandlerJni::getPutMethodId(env);
+  m_jMergeMethodId = WriteBatchHandlerJni::getMergeMethodId(env);
+  m_jDeleteMethodId = WriteBatchHandlerJni::getDeleteMethodId(env);
+  m_jLogDataMethodId = WriteBatchHandlerJni::getLogDataMethodId(env);
+  m_jContinueMethodId = WriteBatchHandlerJni::getContinueMethodId(env);
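+  // unlike local jobject references, jmethodIDs stay valid for as long
+  // as the class remains loaded, so caching them here is safe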
+}
+
+void WriteBatchHandlerJniCallback::Put(const Slice& key, const Slice& value) {
+  const jbyteArray j_key = sliceToJArray(key);
+  const jbyteArray j_value = sliceToJArray(value);
+
+  m_env->CallVoidMethod(
+      m_jWriteBatchHandler,
+      m_jPutMethodId,
+      j_key,
+      j_value);
+
+  m_env->DeleteLocalRef(j_value);
+  m_env->DeleteLocalRef(j_key);
+}
+
+void WriteBatchHandlerJniCallback::Merge(const Slice& key, const Slice& value) {
+  const jbyteArray j_key = sliceToJArray(key);
+  const jbyteArray j_value = sliceToJArray(value);
+
+  m_env->CallVoidMethod(
+      m_jWriteBatchHandler,
+      m_jMergeMethodId,
+      j_key,
+      j_value);
+
+  m_env->DeleteLocalRef(j_value);
+  m_env->DeleteLocalRef(j_key);
+}
+
+void WriteBatchHandlerJniCallback::Delete(const Slice& key) {
+  const jbyteArray j_key = sliceToJArray(key);
+
+  m_env->CallVoidMethod(
+      m_jWriteBatchHandler,
+      m_jDeleteMethodId,
+      j_key);
+
+  m_env->DeleteLocalRef(j_key);
+}
+
+void WriteBatchHandlerJniCallback::LogData(const Slice& blob) {
+  const jbyteArray j_blob = sliceToJArray(blob);
+
+  m_env->CallVoidMethod(
+      m_jWriteBatchHandler,
+      m_jLogDataMethodId,
+      j_blob);
+
+  m_env->DeleteLocalRef(j_blob);
+}
+
+bool WriteBatchHandlerJniCallback::Continue() {
+  jboolean jContinue = m_env->CallBooleanMethod(
+      m_jWriteBatchHandler,
+      m_jContinueMethodId);
+
+  return static_cast<bool>(jContinue == JNI_TRUE);
+}
+
+/*
+ * Creates a Java Byte Array from the data in a Slice
+ *
+ * When calling this function
+ * you must remember to call env->DeleteLocalRef
+ * on the result after you have finished with it
+ */
+jbyteArray WriteBatchHandlerJniCallback::sliceToJArray(const Slice& s) {
+  jbyteArray ja = m_env->NewByteArray(static_cast<jsize>(s.size()));
+  m_env->SetByteArrayRegion(
+      ja, 0, static_cast<jsize>(s.size()),
+      reinterpret_cast<const jbyte*>(s.data()));
+  return ja;
+}
+
+WriteBatchHandlerJniCallback::~WriteBatchHandlerJniCallback() {
+  m_env->DeleteGlobalRef(m_jWriteBatchHandler);
+}
+}  // namespace rocksdb
diff --git a/src/rocksdb/java/rocksjni/writebatchhandlerjnicallback.h b/src/rocksdb/java/rocksjni/writebatchhandlerjnicallback.h
new file mode 100644
index 0000000..9a2a47e
--- /dev/null
+++ b/src/rocksdb/java/rocksjni/writebatchhandlerjnicallback.h
@@ -0,0 +1,46 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// This file implements the callback "bridge" between Java and C++ for
+// rocksdb::WriteBatch::Handler.
+
+#ifndef JAVA_ROCKSJNI_WRITEBATCHHANDLERJNICALLBACK_H_
+#define JAVA_ROCKSJNI_WRITEBATCHHANDLERJNICALLBACK_H_
+
+#include <jni.h>
+#include "rocksdb/write_batch.h"
+
+namespace rocksdb {
+/**
+ * This class acts as a bridge between C++
+ * and Java. The methods in this class will be
+ * called back from the RocksDB storage engine (C++)
+ * which calls the appropriate Java method.
+ * This enables Write Batch Handlers to be implemented in Java.
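+ *
+ * A minimal sketch of the Java side (hedged: the exact method names and
+ * abstract/default split are defined by the org.rocksdb bindings, not by
+ * this header):
+ *
+ *   WriteBatch.Handler handler = new WriteBatch.Handler() {
+ *     @Override public void put(byte[] key, byte[] value) { ... }
+ *     @Override public void merge(byte[] key, byte[] value) { ... }
+ *     @Override public void delete(byte[] key) { ... }
+ *     @Override public void logData(byte[] blob) { ... }
+ *   };
+ *   writeBatch.iterate(handler);  // drives the callbacks below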
+ */
+class WriteBatchHandlerJniCallback : public WriteBatch::Handler {
+ public:
+    WriteBatchHandlerJniCallback(
+      JNIEnv* env, jobject jWriteBatchHandler);
+    ~WriteBatchHandlerJniCallback();
+    void Put(const Slice& key, const Slice& value);
+    void Merge(const Slice& key, const Slice& value);
+    void Delete(const Slice& key);
+    void LogData(const Slice& blob);
+    bool Continue();
+
+ private:
+    JNIEnv* m_env;
+    jobject m_jWriteBatchHandler;
+    jbyteArray sliceToJArray(const Slice& s);
+    jmethodID m_jPutMethodId;
+    jmethodID m_jMergeMethodId;
+    jmethodID m_jDeleteMethodId;
+    jmethodID m_jLogDataMethodId;
+    jmethodID m_jContinueMethodId;
+};
+}  // namespace rocksdb
+
+#endif  // JAVA_ROCKSJNI_WRITEBATCHHANDLERJNICALLBACK_H_
diff --git a/src/rocksdb/java/samples/src/main/java/RocksDBColumnFamilySample.java b/src/rocksdb/java/samples/src/main/java/RocksDBColumnFamilySample.java
new file mode 100644
index 0000000..da9f4d2
--- /dev/null
+++ b/src/rocksdb/java/samples/src/main/java/RocksDBColumnFamilySample.java
@@ -0,0 +1,95 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+import org.rocksdb.*;
+
+import java.util.ArrayList;
+import java.util.List;
+
+public class RocksDBColumnFamilySample {
+  static {
+    RocksDB.loadLibrary();
+  }
+
+  public static void main(String[] args) throws RocksDBException {
+    if (args.length < 1) {
+      System.out.println(
+          "usage: RocksDBColumnFamilySample db_path");
+      return;
+    }
+    String db_path = args[0];
+
+    System.out.println("RocksDBColumnFamilySample");
+    RocksDB db = null;
+    Options options = null;
+    ColumnFamilyHandle columnFamilyHandle = null;
+    WriteBatch wb = null;
+    try {
+      options = new Options().setCreateIfMissing(true);
+      db = RocksDB.open(options, db_path);
+      assert(db != null);
+
+      // create column family
+      columnFamilyHandle = db.createColumnFamily(
+          new ColumnFamilyDescriptor("new_cf".getBytes(),
+          new ColumnFamilyOptions()));
+      assert(columnFamilyHandle != null);
+
+    } finally {
+      if (columnFamilyHandle != null) {
+        columnFamilyHandle.dispose();
+      }
+      if (db != null) {
+        db.close();
+        db = null;
+      }
+      if (options != null) {
+        options.dispose();
+      }
+    }
+
+    // open DB with two column families
+    List<ColumnFamilyDescriptor> columnFamilyDescriptors = new ArrayList<>();
+    // have to open default column family
+    columnFamilyDescriptors.add(new ColumnFamilyDescriptor(
+        RocksDB.DEFAULT_COLUMN_FAMILY, new ColumnFamilyOptions()));
+    // open the new one, too
+    columnFamilyDescriptors.add(new ColumnFamilyDescriptor(
+        "new_cf".getBytes(), new ColumnFamilyOptions()));
+    List<ColumnFamilyHandle> columnFamilyHandles = new ArrayList<>();
+    try {
+      db = RocksDB.open(new DBOptions(), db_path,
+          columnFamilyDescriptors, columnFamilyHandles);
+      assert(db != null);
+
+      // put and get from the default column family (handle index 0)
+      db.put(columnFamilyHandles.get(0), new WriteOptions(),
+          "key".getBytes(), "value".getBytes());
+      String value = new String(db.get(columnFamilyHandles.get(0),
+          "key".getBytes()));
+
+      // atomic write
+      wb = new WriteBatch();
+      wb.put(columnFamilyHandles.get(0), "key2".getBytes(), "value2".getBytes());
+      wb.put(columnFamilyHandles.get(1), "key3".getBytes(), "value3".getBytes());
+      wb.remove(columnFamilyHandles.get(0), "key".getBytes());
+      db.write(new WriteOptions(), wb);
+
+      // drop column family
+      db.dropColumnFamily(columnFamilyHandles.get(1));
+
+    } finally {
+      for (ColumnFamilyHandle handle : columnFamilyHandles){
+        handle.dispose();
+      }
+      if (db != null) {
+        db.close();
+      }
+      if (wb != null) {
+        wb.dispose();
+      }
+    }
+  }
+}
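
The sample above hard-codes its descriptor list; when the set of column
families is not known in advance, the static RocksDB.listColumnFamilies()
helper (assumed available in the Java API imported by this commit) can
build it dynamically. A minimal sketch:

    import java.util.List;
    import org.rocksdb.Options;
    import org.rocksdb.RocksDB;
    import org.rocksdb.RocksDBException;

    // Sketch: print the column families an existing DB contains, so the
    // descriptors passed to RocksDB.open() can be derived at runtime.
    public class ListColumnFamilies {
      static {
        RocksDB.loadLibrary();
      }

      public static void main(String[] args) throws RocksDBException {
        Options options = new Options();
        List<byte[]> names = RocksDB.listColumnFamilies(options, args[0]);
        for (byte[] name : names) {
          System.out.println(new String(name));
        }
        options.dispose();
      }
    }
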
diff --git a/src/rocksdb/java/samples/src/main/java/RocksDBSample.java b/src/rocksdb/java/samples/src/main/java/RocksDBSample.java
new file mode 100644
index 0000000..402fd8f
--- /dev/null
+++ b/src/rocksdb/java/samples/src/main/java/RocksDBSample.java
@@ -0,0 +1,312 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+import java.lang.IllegalArgumentException;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+import java.util.ArrayList;
+import org.rocksdb.*;
+import org.rocksdb.util.SizeUnit;
+import java.io.IOException;
+
+public class RocksDBSample {
+  static {
+    RocksDB.loadLibrary();
+  }
+
+  public static void main(String[] args) {
+    if (args.length < 1) {
+      System.out.println("usage: RocksDBSample db_path");
+      return;
+    }
+    String db_path = args[0];
+    String db_path_not_found = db_path + "_not_found";
+
+    System.out.println("RocksDBSample");
+    RocksDB db = null;
+    Options options = new Options();
+    try {
+      db = RocksDB.open(options, db_path_not_found);
+      assert(false);
+    } catch (RocksDBException e) {
+      System.out.format("caught the expceted exception -- %s\n", e);
+      assert(db == null);
+    }
+
+    try {
+      options.setCreateIfMissing(true)
+          .createStatistics()
+          .setWriteBufferSize(8 * SizeUnit.KB)
+          .setMaxWriteBufferNumber(3)
+          .setMaxBackgroundCompactions(10)
+          .setCompressionType(CompressionType.SNAPPY_COMPRESSION)
+          .setCompactionStyle(CompactionStyle.UNIVERSAL);
+    } catch (IllegalArgumentException e) {
+      assert(false);
+    }
+
+    Statistics stats = options.statisticsPtr();
+
+    assert(options.createIfMissing() == true);
+    assert(options.writeBufferSize() == 8 * SizeUnit.KB);
+    assert(options.maxWriteBufferNumber() == 3);
+    assert(options.maxBackgroundCompactions() == 10);
+    assert(options.compressionType() == CompressionType.SNAPPY_COMPRESSION);
+    assert(options.compactionStyle() == CompactionStyle.UNIVERSAL);
+
+    assert(options.memTableFactoryName().equals("SkipListFactory"));
+    options.setMemTableConfig(
+        new HashSkipListMemTableConfig()
+            .setHeight(4)
+            .setBranchingFactor(4)
+            .setBucketCount(2000000));
+    assert(options.memTableFactoryName().equals("HashSkipListRepFactory"));
+
+    options.setMemTableConfig(
+        new HashLinkedListMemTableConfig()
+            .setBucketCount(100000));
+    assert(options.memTableFactoryName().equals("HashLinkedListRepFactory"));
+
+    options.setMemTableConfig(
+        new VectorMemTableConfig().setReservedSize(10000));
+    assert(options.memTableFactoryName().equals("VectorRepFactory"));
+
+    options.setMemTableConfig(new SkipListMemTableConfig());
+    assert(options.memTableFactoryName().equals("SkipListFactory"));
+
+    options.setTableFormatConfig(new PlainTableConfig());
+    // Plain-Table requires mmap read
+    options.setAllowMmapReads(true);
+    assert(options.tableFactoryName().equals("PlainTable"));
+
+    options.setRateLimiterConfig(new GenericRateLimiterConfig(10000000,
+        10000, 10));
+    options.setRateLimiterConfig(new GenericRateLimiterConfig(10000000));
+
+
+    Filter bloomFilter = new BloomFilter(10);
+    BlockBasedTableConfig table_options = new BlockBasedTableConfig();
+    table_options.setBlockCacheSize(64 * SizeUnit.KB)
+                 .setFilter(bloomFilter)
+                 .setCacheNumShardBits(6)
+                 .setBlockSizeDeviation(5)
+                 .setBlockRestartInterval(10)
+                 .setCacheIndexAndFilterBlocks(true)
+                 .setHashIndexAllowCollision(false)
+                 .setBlockCacheCompressedSize(64 * SizeUnit.KB)
+                 .setBlockCacheCompressedNumShardBits(10);
+
+    assert(table_options.blockCacheSize() == 64 * SizeUnit.KB);
+    assert(table_options.cacheNumShardBits() == 6);
+    assert(table_options.blockSizeDeviation() == 5);
+    assert(table_options.blockRestartInterval() == 10);
+    assert(table_options.cacheIndexAndFilterBlocks() == true);
+    assert(table_options.hashIndexAllowCollision() == false);
+    assert(table_options.blockCacheCompressedSize() == 64 * SizeUnit.KB);
+    assert(table_options.blockCacheCompressedNumShardBits() == 10);
+
+    options.setTableFormatConfig(table_options);
+    assert(options.tableFactoryName().equals("BlockBasedTable"));
+
+    try {
+      db = RocksDB.open(options, db_path);
+      db.put("hello".getBytes(), "world".getBytes());
+      byte[] value = db.get("hello".getBytes());
+      assert("world".equals(new String(value)));
+      String str = db.getProperty("rocksdb.stats");
+      assert(str != null && !str.equals(""));
+    } catch (RocksDBException e) {
+      System.out.format("[ERROR] caught the unexpceted exception -- %s\n", e);
+      assert(db == null);
+      assert(false);
+    }
+    // be sure to release the c++ pointer
+    db.close();
+
+    ReadOptions readOptions = new ReadOptions();
+    readOptions.setFillCache(false);
+
+    try {
+      db = RocksDB.open(options, db_path);
+      db.put("hello".getBytes(), "world".getBytes());
+      byte[] value = db.get("hello".getBytes());
+      System.out.format("Get('hello') = %s\n",
+          new String(value));
+
+      for (int i = 1; i <= 9; ++i) {
+        for (int j = 1; j <= 9; ++j) {
+          db.put(String.format("%dx%d", i, j).getBytes(),
+                 String.format("%d", i * j).getBytes());
+        }
+      }
+
+      for (int i = 1; i <= 9; ++i) {
+        for (int j = 1; j <= 9; ++j) {
+          System.out.format("%s ", new String(db.get(
+              String.format("%dx%d", i, j).getBytes())));
+        }
+        System.out.println("");
+      }
+
+      // write batch test
+      WriteOptions writeOpt = new WriteOptions();
+      for (int i = 10; i <= 19; ++i) {
+        WriteBatch batch = new WriteBatch();
+        for (int j = 10; j <= 19; ++j) {
+          batch.put(String.format("%dx%d", i, j).getBytes(),
+                    String.format("%d", i * j).getBytes());
+        }
+        db.write(writeOpt, batch);
+        batch.dispose();
+      }
+      for (int i = 10; i <= 19; ++i) {
+        for (int j = 10; j <= 19; ++j) {
+          assert(new String(
+              db.get(String.format("%dx%d", i, j).getBytes())).equals(
+                  String.format("%d", i * j)));
+          System.out.format("%s ", new String(db.get(
+              String.format("%dx%d", i, j).getBytes())));
+        }
+        System.out.println("");
+      }
+      writeOpt.dispose();
+
+      value = db.get("1x1".getBytes());
+      assert(value != null);
+      value = db.get("world".getBytes());
+      assert(value == null);
+      value = db.get(readOptions, "world".getBytes());
+      assert(value == null);
+
+      byte[] testKey = "asdf".getBytes();
+      byte[] testValue =
+          "asdfghjkl;'?><MNBVCXZQWERTYUIOP{+_)(*&^%$#@".getBytes();
+      db.put(testKey, testValue);
+      byte[] testResult = db.get(testKey);
+      assert(testResult != null);
+      assert(Arrays.equals(testValue, testResult));
+      assert(new String(testValue).equals(new String(testResult)));
+      testResult = db.get(readOptions, testKey);
+      assert(testResult != null);
+      assert(Arrays.equals(testValue, testResult));
+      assert(new String(testValue).equals(new String(testResult)));
+
+      byte[] insufficientArray = new byte[10];
+      byte[] enoughArray = new byte[50];
+      int len;
+      len = db.get(testKey, insufficientArray);
+      assert(len > insufficientArray.length);
+      len = db.get("asdfjkl;".getBytes(), enoughArray);
+      assert(len == RocksDB.NOT_FOUND);
+      len = db.get(testKey, enoughArray);
+      assert(len == testValue.length);
+
+      len = db.get(readOptions, testKey, insufficientArray);
+      assert(len > insufficientArray.length);
+      len = db.get(readOptions, "asdfjkl;".getBytes(), enoughArray);
+      assert(len == RocksDB.NOT_FOUND);
+      len = db.get(readOptions, testKey, enoughArray);
+      assert(len == testValue.length);
+
+      db.remove(testKey);
+      len = db.get(testKey, enoughArray);
+      assert(len == RocksDB.NOT_FOUND);
+
+      // repeat the test with WriteOptions
+      WriteOptions writeOpts = new WriteOptions();
+      writeOpts.setSync(true);
+      writeOpts.setDisableWAL(true);
+      db.put(writeOpts, testKey, testValue);
+      len = db.get(testKey, enoughArray);
+      assert(len == testValue.length);
+      assert(new String(testValue).equals(
+          new String(enoughArray, 0, len)));
+      writeOpts.dispose();
+
+      try {
+        for (TickerType statsType : TickerType.values()) {
+          stats.getTickerCount(statsType);
+        }
+        System.out.println("getTickerCount() passed.");
+      } catch (Exception e) {
+        System.out.println("Failed in call to getTickerCount()");
+        assert(false); //Should never reach here.
+      }
+
+      try {
+        for (HistogramType histogramType : HistogramType.values()) {
+          HistogramData data = stats.geHistogramData(histogramType);
+        }
+        System.out.println("geHistogramData() passed.");
+      } catch (Exception e) {
+        System.out.println("Failed in call to geHistogramData()");
+        assert(false); //Should never reach here.
+      }
+
+      RocksIterator iterator = db.newIterator();
+
+      boolean seekToFirstPassed = false;
+      for (iterator.seekToFirst(); iterator.isValid(); iterator.next()) {
+        iterator.status();
+        assert(iterator.key() != null);
+        assert(iterator.value() != null);
+        seekToFirstPassed = true;
+      }
+      if(seekToFirstPassed) {
+        System.out.println("iterator seekToFirst tests passed.");
+      }
+
+      boolean seekToLastPassed = false;
+      for (iterator.seekToLast(); iterator.isValid(); iterator.prev()) {
+        iterator.status();
+        assert(iterator.key() != null);
+        assert(iterator.value() != null);
+        seekToLastPassed = true;
+      }
+
+      if(seekToLastPassed) {
+        System.out.println("iterator seekToLastPassed tests passed.");
+      }
+
+      iterator.seekToFirst();
+      iterator.seek(iterator.key());
+      assert(iterator.key() != null);
+      assert(iterator.value() != null);
+
+      System.out.println("iterator seek test passed.");
+
+      iterator.dispose();
+      System.out.println("iterator tests passed.");
+
+      iterator = db.newIterator();
+      List<byte[]> keys = new ArrayList<byte[]>();
+      for (iterator.seekToLast(); iterator.isValid(); iterator.prev()) {
+        keys.add(iterator.key());
+      }
+      iterator.dispose();
+
+      Map<byte[], byte[]> values = db.multiGet(keys);
+      assert(values.size() == keys.size());
+      for(byte[] value1 : values.values()) {
+        assert(value1 != null);
+      }
+
+      values = db.multiGet(new ReadOptions(), keys);
+      assert(values.size() == keys.size());
+      for(byte[] value1 : values.values()) {
+        assert(value1 != null);
+      }
+    } catch (RocksDBException e) {
+      System.err.println(e);
+    }
+    if (db != null) {
+      db.close();
+    }
+    // be sure to dispose c++ pointers
+    options.dispose();
+    readOptions.dispose();
+  }
+}
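
One feature the sample does not exercise is point-in-time reads. A short
sketch, assuming the Snapshot/getSnapshot()/releaseSnapshot() API present
in this import:

    import org.rocksdb.*;

    public class SnapshotSketch {
      static {
        RocksDB.loadLibrary();
      }

      public static void main(String[] args) throws RocksDBException {
        Options options = new Options().setCreateIfMissing(true);
        RocksDB db = RocksDB.open(options, args[0]);
        db.put("k".getBytes(), "v1".getBytes());

        Snapshot snapshot = db.getSnapshot();
        ReadOptions ro = new ReadOptions().setSnapshot(snapshot);
        db.put("k".getBytes(), "v2".getBytes());   // written after the snapshot
        // reads through ro still see the pre-snapshot value "v1"
        System.out.println(new String(db.get(ro, "k".getBytes())));

        db.releaseSnapshot(snapshot);
        ro.dispose();
        db.close();
        options.dispose();
      }
    }
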
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/AbstractCompactionFilter.java b/src/rocksdb/java/src/main/java/org/rocksdb/AbstractCompactionFilter.java
new file mode 100644
index 0000000..2b78ded
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/AbstractCompactionFilter.java
@@ -0,0 +1,29 @@
+// Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+package org.rocksdb;
+
+/**
+ * A CompactionFilter allows an application to modify/delete a key-value at
+ * the time of compaction.
+ *
+ * At present we just permit an overriding Java class to wrap a C++ implementation.
+ */
+public abstract class AbstractCompactionFilter<T extends AbstractSlice<?>>
+    extends RocksObject {
+
+  /**
+   * Deletes the underlying C++ compaction filter pointer.
+   *
+   * Note that this function should be called only after all
+   * RocksDB instances referencing the compaction filter are closed.
+   * Otherwise undefined behavior will occur.
+   */
+  @Override protected void disposeInternal() {
+    assert(isInitialized());
+    disposeInternal(nativeHandle_);
+  }
+
+  private native void disposeInternal(long handle);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/AbstractComparator.java b/src/rocksdb/java/src/main/java/org/rocksdb/AbstractComparator.java
new file mode 100644
index 0000000..c2412d7
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/AbstractComparator.java
@@ -0,0 +1,100 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+/**
+ * Comparators are used by RocksDB to determine
+ * the ordering of keys.
+ *
+ * This class should not be extended directly; implementers
+ * should instead extend either of the public abstract classes:
+ *   @see org.rocksdb.Comparator
+ *   @see org.rocksdb.DirectComparator
+ */
+public abstract class AbstractComparator<T extends AbstractSlice<?>>
+    extends RocksObject {
+
+  /**
+   * The name of the comparator.  Used to check for comparator
+   * mismatches (i.e., a DB created with one comparator is
+   * accessed using a different comparator).
+   *
+   * A new name should be used whenever
+   * the comparator implementation changes in a way that will cause
+   * the relative ordering of any two keys to change.
+   *
+   * Names starting with "rocksdb." are reserved and should not be used.
+   *
+   * @return The name of this comparator implementation
+   */
+  public abstract String name();
+
+  /**
+   * Three-way key comparison
+   *
+   *  @param a Slice access to first key
+   *  @param b Slice access to second key
+   *
+   *  @return Should return either:
+   *    1) < 0 if "a" < "b"
+   *    2) == 0 if "a" == "b"
+   *    3) > 0 if "a" > "b"
+   */
+  public abstract int compare(final T a, final T b);
+
+  /**
+   * <p>Used to reduce the space requirements
+   * for internal data structures like index blocks.</p>
+   *
+   * <p>If start < limit, you may return a new start which is a
+   * shorter string in [start, limit).</p>
+   *
+   * <p>Simple comparator implementations may return null if they
+   * wish to use start unchanged. i.e., an implementation of
+   * this method that does nothing is correct.</p>
+   *
+   * @param start String
+   * @param limit of type T
+   *
+   * @return a shorter start, or null
+   */
+  public String findShortestSeparator(final String start, final T limit) {
+      return null;
+  }
+
+  /**
+   * <p>Used to reduce the space requirements
+   * for internal data structures like index blocks.</p>
+   *
+   * <p>You may return a new short key (key1) where
+   * key1 ≥ key.</p>
+   *
+   * <p>Simple comparator implementations may return null if they
+   * wish to leave the key unchanged. i.e., an implementation of
+   * this method that does nothing is correct.</p>
+   *
+   * @param key String
+   *
+   * @return a shorter key, or null
+   */
+  public String findShortSuccessor(final String key) {
+      return null;
+  }
+
+  /**
+   * Deletes underlying C++ comparator pointer.
+   *
+   * Note that this function should be called only after all
+   * RocksDB instances referencing the comparator are closed.
+   * Otherwise undefined behavior will occur.
+   */
+  @Override protected void disposeInternal() {
+    assert(isInitialized());
+    disposeInternal(nativeHandle_);
+  }
+
+  private native void disposeInternal(long handle);
+}
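
A concrete illustration of the contract above, sketched against the public
org.rocksdb.Comparator subclass (which pairs AbstractComparator with Slice
keys); the class and its ComparatorOptions constructor are assumed from the
Java API in this commit:

    import org.rocksdb.Comparator;
    import org.rocksdb.ComparatorOptions;
    import org.rocksdb.Slice;

    // Sketch: reverse the built-in bytewise ordering by swapping the
    // arguments of the three-way compare. Plug in via, e.g.,
    //   new Options().setComparator(
    //       new ReverseBytewiseComparator(new ComparatorOptions()));
    public class ReverseBytewiseComparator extends Comparator {
      public ReverseBytewiseComparator(ComparatorOptions copt) {
        super(copt);
      }

      @Override
      public String name() {
        // must stay stable for the lifetime of the DB, and must not
        // start with the reserved "rocksdb." prefix
        return "example.ReverseBytewise";
      }

      @Override
      public int compare(Slice a, Slice b) {
        return b.compare(a);   // reversed three-way comparison
      }
    }
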
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/AbstractRocksIterator.java b/src/rocksdb/java/src/main/java/org/rocksdb/AbstractRocksIterator.java
new file mode 100644
index 0000000..f3f89a6
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/AbstractRocksIterator.java
@@ -0,0 +1,106 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+/**
+ * Base class implementation for Rocks Iterators
+ * in the Java API
+ *
+ * <p>Multiple threads can invoke const methods on a RocksIterator without
+ * external synchronization, but if any of the threads may call a
+ * non-const method, all threads accessing the same RocksIterator must use
+ * external synchronization.</p>
+ *
+ * @param <P> The type of the Parent Object from which the Rocks Iterator was
+ *          created. This is used by disposeInternal to avoid double-free
+ *          issues with the underlying C++ object.
+ * @see org.rocksdb.RocksObject
+ */
+public abstract class AbstractRocksIterator<P extends RocksObject>
+    extends RocksObject implements RocksIteratorInterface {
+  final P parent_;
+
+  protected AbstractRocksIterator(final P parent,
+      final long nativeHandle) {
+    super();
+    nativeHandle_ = nativeHandle;
+    // parent must point to a valid RocksDB instance.
+    assert (parent != null);
+    // RocksIterator must hold a reference to the related parent instance
+    // to guarantee that while a GC cycle starts RocksIterator instances
+    // are freed prior to parent instances.
+    parent_ = parent;
+  }
+
+  @Override
+  public boolean isValid() {
+    assert (isInitialized());
+    return isValid0(nativeHandle_);
+  }
+
+  @Override
+  public void seekToFirst() {
+    assert (isInitialized());
+    seekToFirst0(nativeHandle_);
+  }
+
+  @Override
+  public void seekToLast() {
+    assert (isInitialized());
+    seekToLast0(nativeHandle_);
+  }
+
+  @Override
+  public void seek(byte[] target) {
+    assert (isInitialized());
+    seek0(nativeHandle_, target, target.length);
+  }
+
+  @Override
+  public void next() {
+    assert (isInitialized());
+    next0(nativeHandle_);
+  }
+
+  @Override
+  public void prev() {
+    assert (isInitialized());
+    prev0(nativeHandle_);
+  }
+
+  @Override
+  public void status() throws RocksDBException {
+    assert (isInitialized());
+    status0(nativeHandle_);
+  }
+
+  /**
+   * <p>Deletes underlying C++ iterator pointer.</p>
+   *
+   * <p>Note: the underlying handle can only be safely deleted if the parent
+   * instance related to a certain RocksIterator is still valid and initialized.
+   * Therefore {@code disposeInternal()} checks if the parent is initialized
+   * before freeing the native handle.</p>
+   */
+  @Override
+  protected void disposeInternal() {
+    synchronized (parent_) {
+      assert (isInitialized());
+      if (parent_.isInitialized()) {
+        disposeInternal(nativeHandle_);
+      }
+    }
+  }
+
+  abstract void disposeInternal(long handle);
+  abstract boolean isValid0(long handle);
+  abstract void seekToFirst0(long handle);
+  abstract void seekToLast0(long handle);
+  abstract void next0(long handle);
+  abstract void prev0(long handle);
+  abstract void seek0(long handle, byte[] target, int targetLen);
+  abstract void status0(long handle) throws RocksDBException;
+}
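
The disposal ordering documented above matters in practice: an iterator
must release its native handle while its parent DB is still open. A short
sketch of the safe pattern:

    import org.rocksdb.*;

    public class IteratorLifecycle {
      static {
        RocksDB.loadLibrary();
      }

      public static void main(String[] args) throws RocksDBException {
        Options options = new Options().setCreateIfMissing(true);
        RocksDB db = RocksDB.open(options, args[0]);
        db.put("k1".getBytes(), "v1".getBytes());

        RocksIterator it = db.newIterator();
        try {
          for (it.seekToFirst(); it.isValid(); it.next()) {
            System.out.println(new String(it.key())
                + " => " + new String(it.value()));
          }
        } finally {
          it.dispose();   // free the C++ iterator before closing the DB
        }

        db.close();
        options.dispose();
      }
    }
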
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/AbstractSlice.java b/src/rocksdb/java/src/main/java/org/rocksdb/AbstractSlice.java
new file mode 100644
index 0000000..a37bd02
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/AbstractSlice.java
@@ -0,0 +1,171 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+/**
+ * Slices are used by RocksDB to provide
+ * efficient access to keys and values.
+ *
+ * This class is package private, implementers
+ * should extend either of the public abstract classes:
+ *   @see org.rocksdb.Slice
+ *   @see org.rocksdb.DirectSlice
+ *
+ * Regarding the lifecycle of Java Slices in RocksDB:
+ *   At present when you configure a Comparator from Java, it creates an
+ *   instance of a C++ BaseComparatorJniCallback subclass and
+ *   passes that to RocksDB as the comparator. That subclass of
+ *   BaseComparatorJniCallback creates the Java
+ *   @see org.rocksdb.AbstractSlice subclass Objects. When you dispose
+ *   the Java @see org.rocksdb.AbstractComparator subclass, it disposes the
+ *   C++ BaseComparatorJniCallback subclass, which in turn destroys the
+ *   Java @see org.rocksdb.AbstractSlice subclass Objects.
+ */
+abstract class AbstractSlice<T> extends RocksObject {
+
+  /**
+   * Returns the data of the slice.
+   *
+   * @return The slice data. Note, the type of access is
+   *   determined by the subclass
+   *   @see org.rocksdb.AbstractSlice#data0(long)
+   */
+  public T data() {
+    assert (isInitialized());
+    return data0(nativeHandle_);
+  }
+
+  /**
+   * Access to the data is provided by the
+   * subtype as it needs to handle the
+   * generic typing.
+   *
+   * @param handle The address of the underlying
+   *   native object.
+   *
+   * @return Java typed access to the data.
+   */
+  protected abstract T data0(long handle);
+
+  /**
+   * Return the length (in bytes) of the data.
+   *
+   * @return The length in bytes.
+   */
+  public int size() {
+    assert (isInitialized());
+    return size0(nativeHandle_);
+  }
+
+  /**
+   * Return true if the length of the
+   * data is zero.
+   *
+   * @return true if there is no data, false otherwise.
+   */
+  public boolean empty() {
+    assert (isInitialized());
+    return empty0(nativeHandle_);
+  }
+
+  /**
+   * Creates a string representation of the data
+   *
+   * @param hex When true, the representation
+   *   will be encoded in hexadecimal.
+   *
+   * @return The string representation of the data.
+   */
+  public String toString(final boolean hex) {
+    assert (isInitialized());
+    return toString0(nativeHandle_, hex);
+  }
+
+  @Override
+  public String toString() {
+    return toString(false);
+  }
+
+  /**
+   * Three-way key comparison
+   *
+   *  @param other A slice to compare against
+   *
+   *  @return Should return either:
+   *    1) < 0 if this < other
+   *    2) == 0 if this == other
+   *    3) > 0 if this > other
+   */
+  public int compare(final AbstractSlice<?> other) {
+    assert (other != null);
+    assert (isInitialized());
+    return compare0(nativeHandle_, other.nativeHandle_);
+  }
+
+  @Override
+  public int hashCode() {
+    return toString().hashCode();
+  }
+
+  /**
+   * If other is a slice object, then
+   * we defer to {@link #compare(AbstractSlice) compare}
+   * to check equality, otherwise we return false.
+   *
+   * @param other Object to test for equality
+   *
+   * @return true when {@code this.compare(other) == 0},
+   *   false otherwise.
+   */
+  @Override
+  public boolean equals(final Object other) {
+    if (other != null && other instanceof AbstractSlice) {
+      return compare((AbstractSlice<?>)other) == 0;
+    } else {
+      return false;
+    }
+  }
+
+  /**
+   * Determines whether this slice starts with
+   * another slice
+   *
+   * @param prefix Another slice which may or may not
+   *   be a prefix of this slice.
+   *
+   * @return true when this slice starts with the
+   *   {@code prefix} slice
+   */
+  public boolean startsWith(final AbstractSlice<?> prefix) {
+    if (prefix != null) {
+      assert (isInitialized());
+      return startsWith0(nativeHandle_, prefix.nativeHandle_);
+    } else {
+      return false;
+    }
+  }
+
+  /**
+   * Deletes underlying C++ slice pointer.
+   * Note that this function should be called only after all
+   * RocksDB instances referencing the slice are closed.
+   * Otherwise undefined behavior will occur.
+   */
+  @Override
+  protected void disposeInternal() {
+    assert(isInitialized());
+    disposeInternal(nativeHandle_);
+  }
+
+  protected native void createNewSliceFromString(String str);
+  private native int size0(long handle);
+  private native boolean empty0(long handle);
+  private native String toString0(long handle, boolean hex);
+  private native int compare0(long handle, long otherHandle);
+  private native boolean startsWith0(long handle, long otherHandle);
+  private native void disposeInternal(long handle);
+
+}
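
The compare()/startsWith() contract above can be seen directly through the
concrete org.rocksdb.Slice subclass (its String constructor is assumed from
this import). A minimal sketch:

    import org.rocksdb.RocksDB;
    import org.rocksdb.Slice;

    public class SliceSketch {
      static {
        RocksDB.loadLibrary();   // Slice is backed by a native object
      }

      public static void main(String[] args) {
        Slice key = new Slice("user:42");
        Slice prefix = new Slice("user:");
        System.out.println(key.startsWith(prefix));    // true
        System.out.println(key.compare(prefix) > 0);   // "user:42" sorts after "user:"
        key.dispose();
        prefix.dispose();
      }
    }
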
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/AbstractWriteBatch.java b/src/rocksdb/java/src/main/java/org/rocksdb/AbstractWriteBatch.java
new file mode 100644
index 0000000..b380c5d
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/AbstractWriteBatch.java
@@ -0,0 +1,92 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+public abstract class AbstractWriteBatch extends RocksObject implements WriteBatchInterface {
+
+  @Override
+  public int count() {
+    assert (isInitialized());
+    return count0();
+  }
+
+  @Override
+  public void put(byte[] key, byte[] value) {
+    assert (isInitialized());
+    put(key, key.length, value, value.length);
+  }
+
+  @Override
+  public void put(ColumnFamilyHandle columnFamilyHandle, byte[] key, byte[] value) {
+    assert (isInitialized());
+    put(key, key.length, value, value.length, columnFamilyHandle.nativeHandle_);
+  }
+
+  @Override
+  public void merge(byte[] key, byte[] value) {
+    assert (isInitialized());
+    merge(key, key.length, value, value.length);
+  }
+
+  @Override
+  public void merge(ColumnFamilyHandle columnFamilyHandle, byte[] key, byte[] value) {
+    assert (isInitialized());
+    merge(key, key.length, value, value.length, columnFamilyHandle.nativeHandle_);
+  }
+
+  @Override
+  public void remove(byte[] key) {
+    assert (isInitialized());
+    remove(key, key.length);
+  }
+
+  @Override
+  public void remove(ColumnFamilyHandle columnFamilyHandle, byte[] key) {
+    assert (isInitialized());
+    remove(key, key.length, columnFamilyHandle.nativeHandle_);
+  }
+
+  @Override
+  public void putLogData(byte[] blob) {
+    assert (isInitialized());
+    putLogData(blob, blob.length);
+  }
+
+  @Override
+  public void clear() {
+    assert (isInitialized());
+    clear0();
+  }
+
+  /**
+   * Deletes the underlying C++ pointer.
+   */
+  @Override
+  protected void disposeInternal() {
+    assert (isInitialized());
+    disposeInternal(nativeHandle_);
+  }
+
+  abstract void disposeInternal(long handle);
+
+  abstract int count0();
+
+  abstract void put(byte[] key, int keyLen, byte[] value, int valueLen);
+
+  abstract void put(byte[] key, int keyLen, byte[] value, int valueLen, long cfHandle);
+
+  abstract void merge(byte[] key, int keyLen, byte[] value, int valueLen);
+
+  abstract void merge(byte[] key, int keyLen, byte[] value, int valueLen, long cfHandle);
+
+  abstract void remove(byte[] key, int keyLen);
+
+  abstract void remove(byte[] key, int keyLen, long cfHandle);
+
+  abstract void putLogData(byte[] blob, int blobLen);
+
+  abstract void clear0();
+}
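
A minimal sketch of the bookkeeping methods defined above, exercised through
the concrete WriteBatch subclass:

    import org.rocksdb.RocksDB;
    import org.rocksdb.WriteBatch;

    public class BatchCount {
      static {
        RocksDB.loadLibrary();
      }

      public static void main(String[] args) {
        WriteBatch wb = new WriteBatch();
        wb.put("a".getBytes(), "1".getBytes());
        wb.remove("b".getBytes());
        System.out.println(wb.count());   // 2 queued operations
        wb.clear();
        System.out.println(wb.count());   // 0 after clear()
        wb.dispose();
      }
    }
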
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/BackupEngine.java b/src/rocksdb/java/src/main/java/org/rocksdb/BackupEngine.java
new file mode 100644
index 0000000..2f944e5
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/BackupEngine.java
@@ -0,0 +1,222 @@
+// Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+package org.rocksdb;
+
+import java.util.List;
+
+/**
+ * BackupEngine allows you to back up
+ * and restore the database.
+ *
+ * Be aware that `new BackupEngine` takes time proportional to the number
+ * of existing backups, so if you back up to a slow filesystem (like HDFS)
+ * and you have a lot of backups, opening the engine can take some time.
+ * That's why we recommend limiting the number of backups.
+ * We also recommend keeping the BackupEngine alive rather than recreating
+ * it every time you need to do a backup.
+ */
+public class BackupEngine extends RocksObject implements AutoCloseable {
+
+  protected BackupEngine() {
+    super();
+  }
+
+  /**
+   * Opens a new Backup Engine
+   *
+   * @param env The environment that the backup engine should operate within
+   * @param options Any options for the backup engine
+   *
+   * @return A new BackupEngine instance
+   */
+  public static BackupEngine open(final Env env,
+      final BackupableDBOptions options) throws RocksDBException {
+    final BackupEngine be = new BackupEngine();
+    be.open(env.nativeHandle_, options.nativeHandle_);
+    return be;
+  }
+
+  /**
+   * Captures the state of the database in the latest backup
+   *
+   * Just a convenience for {@link #createNewBackup(RocksDB, boolean)} with
+   * the flushBeforeBackup parameter set to false
+   *
+   * @param db The database to backup
+   *
+   * Note - This method is not thread safe
+   */
+  public void createNewBackup(final RocksDB db) throws RocksDBException {
+    createNewBackup(db, false);
+  }
+
+  /**
+   * Captures the state of the database in the latest backup
+   *
+   * @param db The database to backup
+   * @param flushBeforeBackup When true, the Backup Engine will first issue a
+   *                          memtable flush and only then copy the DB files to
+   *                          the backup directory. Doing so will prevent log
+   *                          files from being copied to the backup directory
+   *                          (since flush will delete them).
+   *                          When false, the Backup Engine will not issue a
+   *                          flush before starting the backup. In that case,
+   *                          the backup will also include log files
+   *                          corresponding to live memtables. The backup will
+   *                          always be consistent with the current state of the
+   *                          database regardless of the flushBeforeBackup
+   *                          parameter.
+   *
+   * Note - This method is not thread safe
+   */
+  public void createNewBackup(
+      final RocksDB db, final boolean flushBeforeBackup)
+      throws RocksDBException {
+    assert (isInitialized());
+    createNewBackup(nativeHandle_, db.nativeHandle_, flushBeforeBackup);
+  }
+
+  /**
+   * Gets information about the available
+   * backups
+   *
+   * @return A list of information about each available backup
+   */
+  public List<BackupInfo> getBackupInfo() {
+    assert (isInitialized());
+    return getBackupInfo(nativeHandle_);
+  }
+
+  /**
+   * <p>Returns a list of corrupted backup ids. If there
+   * is no corrupted backup the method will return an
+   * empty list.</p>
+   *
+   * @return array of backup ids as int ids.
+   */
+  public int[] getCorruptedBackups() {
+    assert(isInitialized());
+    return getCorruptedBackups(nativeHandle_);
+  }
+
+  /**
+   * <p>Will delete all the files we don't need anymore. It will
+   * do the full scan of the files/ directory and delete all the
+   * files that are not referenced.</p>
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public void garbageCollect() throws RocksDBException {
+    assert(isInitialized());
+    garbageCollect(nativeHandle_);
+  }
+
+  /**
+   * Deletes old backups, keeping just the latest numBackupsToKeep
+   *
+   * @param numBackupsToKeep The latest n backups to keep
+   */
+  public void purgeOldBackups(
+      final int numBackupsToKeep) throws RocksDBException {
+    assert (isInitialized());
+    purgeOldBackups(nativeHandle_, numBackupsToKeep);
+  }
+
+  /**
+   * Deletes a backup
+   *
+   * @param backupId The id of the backup to delete
+   */
+  public void deleteBackup(final int backupId) throws RocksDBException {
+    assert (isInitialized());
+    deleteBackup(nativeHandle_, backupId);
+  }
+
+  /**
+   * Restore the database from a backup
+   *
+   * IMPORTANT: if options.share_table_files == true and you restore the DB
+   * from some backup that is not the latest, and you start creating new
+   * backups from the new DB, they will probably fail!
+   *
+   * Example: Let's say you have backups 1, 2, 3, 4, 5 and you restore 3.
+   * If you add new data to the DB and try creating a new backup now, the
+   * database will diverge from backups 4 and 5 and the new backup will fail.
+   * If you want to create a new backup, you will first have to delete backups 4
+   * and 5.
+   *
+   * @param backupId The id of the backup to restore
+   * @param dbDir The directory to restore the backup to, i.e. where your
+   *              database is
+   * @param walDir The location of the log files for your database,
+   *               often the same as dbDir
+   * @param restoreOptions Options for controlling the restore
+   */
+  public void restoreDbFromBackup(
+      final int backupId, final String dbDir, final String walDir,
+      final RestoreOptions restoreOptions) throws RocksDBException {
+    assert (isInitialized());
+    restoreDbFromBackup(nativeHandle_, backupId, dbDir, walDir,
+        restoreOptions.nativeHandle_);
+  }
+
+  /**
+   * Restore the database from the latest backup
+   *
+   * @param dbDir The directory to restore the backup to, i.e. where your database is
+   * @param walDir The location of the log files for your database, often the same as dbDir
+   * @param restoreOptions Options for controlling the restore
+   */
+  public void restoreDbFromLatestBackup(
+      final String dbDir, final String walDir,
+      final RestoreOptions restoreOptions) throws RocksDBException {
+    assert (isInitialized());
+    restoreDbFromLatestBackup(nativeHandle_, dbDir, walDir,
+        restoreOptions.nativeHandle_);
+  }
+
+  /**
+   * Close the Backup Engine
+   */
+  @Override
+  public void close() throws RocksDBException {
+    dispose();
+  }
+
+  @Override
+  protected void disposeInternal() {
+    assert (isInitialized());
+    disposeInternal(nativeHandle_);
+  }
+
+  private native void open(final long env, final long backupableDbOptions)
+      throws RocksDBException;
+
+  private native void createNewBackup(final long handle, final long dbHandle,
+      final boolean flushBeforeBackup) throws RocksDBException;
+
+  private native List<BackupInfo> getBackupInfo(final long handle);
+
+  private native int[] getCorruptedBackups(final long handle);
+
+  private native void garbageCollect(final long handle) throws RocksDBException;
+
+  private native void purgeOldBackups(final long handle,
+      final int numBackupsToKeep) throws RocksDBException;
+
+  private native void deleteBackup(final long handle, final int backupId)
+      throws RocksDBException;
+
+  private native void restoreDbFromBackup(final long handle, final int backupId,
+      final String dbDir, final String walDir, final long restoreOptionsHandle)
+      throws RocksDBException;
+
+  private native void restoreDbFromLatestBackup(final long handle,
+      final String dbDir, final String walDir, final long restoreOptionsHandle)
+      throws RocksDBException;
+
+  private native void disposeInternal(final long handle);
+}
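
A sketch of the long-lived engine pattern the class comment recommends.
The paths are hypothetical, and the backup directory must already exist
and be writable (see BackupableDBOptions):

    import org.rocksdb.*;

    public class BackupSketch {
      static {
        RocksDB.loadLibrary();
      }

      public static void main(String[] args) throws RocksDBException {
        Options options = new Options().setCreateIfMissing(true);
        RocksDB db = RocksDB.open(options, "/tmp/rocksdb-data");
        BackupableDBOptions bopt =
            new BackupableDBOptions("/tmp/rocksdb-backups");
        BackupEngine engine = BackupEngine.open(Env.getDefault(), bopt);
        try {
          engine.createNewBackup(db, true);       // flush memtables first
          for (BackupInfo info : engine.getBackupInfo()) {
            System.out.println(info.backupId() + " @ " + info.timestamp());
          }
          engine.purgeOldBackups(5);              // keep only the latest five
        } finally {
          engine.close();
          bopt.dispose();
          db.close();
          options.dispose();
        }
      }
    }
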
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/BackupInfo.java b/src/rocksdb/java/src/main/java/org/rocksdb/BackupInfo.java
new file mode 100644
index 0000000..48a52a7
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/BackupInfo.java
@@ -0,0 +1,67 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+package org.rocksdb;
+
+/**
+ * Instances of this class describe a Backup made by
+ * {@link org.rocksdb.BackupableDB}.
+ */
+public class BackupInfo {
+
+  /**
+   * Package private constructor used to create instances
+   * of BackupInfo by {@link org.rocksdb.BackupableDB} and
+   * {@link org.rocksdb.RestoreBackupableDB}.
+   *
+   * @param backupId id of backup
+   * @param timestamp timestamp of backup
+   * @param size size of backup
+   * @param numberFiles number of files related to this backup.
+   */
+  BackupInfo(final int backupId, final long timestamp, final long size,
+      final int numberFiles) {
+    backupId_ = backupId;
+    timestamp_ = timestamp;
+    size_ = size;
+    numberFiles_ = numberFiles;
+  }
+
+  /**
+   *
+   * @return the backup id.
+   */
+  public int backupId() {
+    return backupId_;
+  }
+
+  /**
+   *
+   * @return the timestamp of the backup.
+   */
+  public long timestamp() {
+    return timestamp_;
+  }
+
+  /**
+   *
+   * @return the size of the backup
+   */
+  public long size() {
+    return size_;
+  }
+
+  /**
+   *
+   * @return the number of files of this backup.
+   */
+  public int numberFiles() {
+    return numberFiles_;
+  }
+
+  private int backupId_;
+  private long timestamp_;
+  private long size_;
+  private int numberFiles_;
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/BackupableDB.java b/src/rocksdb/java/src/main/java/org/rocksdb/BackupableDB.java
new file mode 100644
index 0000000..f2646d2
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/BackupableDB.java
@@ -0,0 +1,166 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+import java.util.List;
+
+/**
+ * <p>A subclass of RocksDB which supports
+ * backup-related operations.</p>
+ *
+ * @see org.rocksdb.BackupableDBOptions
+ */
+public class BackupableDB extends RocksDB {
+  /**
+   * <p>Open a {@code BackupableDB} under the specified path.
+   * Note that the backup path should be set properly in the
+   * input BackupableDBOptions.</p>
+   *
+   * @param opt {@link org.rocksdb.Options} to set for the database.
+   * @param bopt {@link org.rocksdb.BackupableDBOptions} to use.
+   * @param db_path Path to store data to. The path for storing the backup should be
+   *     specified in the {@link org.rocksdb.BackupableDBOptions}.
+   *
+   * @return {@link BackupableDB} reference to the opened database.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public static BackupableDB open(
+      final Options opt, final BackupableDBOptions bopt, final String db_path)
+      throws RocksDBException {
+
+    RocksDB db = RocksDB.open(opt, db_path);
+    BackupableDB bdb = new BackupableDB();
+    bdb.open(db.nativeHandle_, bopt.nativeHandle_);
+
+    // Prevent the RocksDB object from attempting to delete
+    // the underlying C++ DB object.
+    db.disOwnNativeHandle();
+
+    return bdb;
+  }
+
+  /**
+   * <p>Captures the state of the database in the latest backup.
+   * Note that this function is not thread-safe.</p>
+   *
+   * @param flushBeforeBackup if true, then all data will be flushed
+   *     before creating backup.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public void createNewBackup(final boolean flushBeforeBackup)
+      throws RocksDBException {
+    assert(isInitialized());
+    createNewBackup(nativeHandle_, flushBeforeBackup);
+  }
+
+  /**
+   * <p>Deletes old backups, keeping latest numBackupsToKeep alive.</p>
+   *
+   * @param numBackupsToKeep Number of latest backups to keep.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public void purgeOldBackups(final int numBackupsToKeep)
+      throws RocksDBException {
+    assert(isInitialized());
+    purgeOldBackups(nativeHandle_, numBackupsToKeep);
+  }
+
+  /**
+   * <p>Deletes a specific backup.</p>
+   *
+   * @param backupId of backup to delete.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public void deleteBackup(final int backupId) throws RocksDBException {
+    assert(isInitialized());
+    deleteBackup0(nativeHandle_, backupId);
+  }
+
+  /**
+   * <p>Returns a list of {@link BackupInfo} instances, which describe
+   * already made backups.</p>
+   *
+   * @return List of {@link BackupInfo} instances.
+   */
+  public List<BackupInfo> getBackupInfos() {
+    assert(isInitialized());
+    return getBackupInfo(nativeHandle_);
+  }
+
+  /**
+   * <p>Returns a list of corrupted backup ids. If there
+   * is no corrupted backup the method will return an
+   * empty list.</p>
+   *
+   * @return array of backup ids as int ids.
+   */
+  public int[] getCorruptedBackups() {
+    assert(isInitialized());
+    return getCorruptedBackups(nativeHandle_);
+  }
+
+  /**
+   * <p>Will delete all the files we don't need anymore. It will
+   * do the full scan of the files/ directory and delete all the
+   * files that are not referenced.</p>
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public void garbageCollect() throws RocksDBException {
+    assert(isInitialized());
+    garbageCollect(nativeHandle_);
+  }
+
+  /**
+   * <p>Close the BackupableDB instance and release resource.</p>
+   *
+   * <p>Internally, {@link BackupableDB} owns the {@code rocksdb::DB}
+   * pointer to its associated {@link org.rocksdb.RocksDB}.
+   * The release of that RocksDB pointer is handled in the destructor
+   * of the c++ {@code rocksdb::BackupableDB} and should be transparent
+   * to Java developers.</p>
+   */
+  @Override public synchronized void close() {
+    if (isInitialized()) {
+      super.close();
+    }
+  }
+
+  /**
+   * <p>A protected constructor that will be used in the static
+   * factory method {@link #open(Options, BackupableDBOptions, String)}.
+   * </p>
+   */
+  protected BackupableDB() {
+    super();
+  }
+
+  @Override protected void finalize() throws Throwable {
+    close();
+    super.finalize();
+  }
+
+  protected native void open(long rocksDBHandle, long backupDBOptionsHandle);
+  protected native void createNewBackup(long handle, boolean flag)
+      throws RocksDBException;
+  protected native void purgeOldBackups(long handle, int numBackupsToKeep)
+      throws RocksDBException;
+  private native void deleteBackup0(long nativeHandle, int backupId)
+      throws RocksDBException;
+  protected native List<BackupInfo> getBackupInfo(long handle);
+  private native int[] getCorruptedBackups(long handle);
+  private native void garbageCollect(long handle)
+      throws RocksDBException;
+}
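
For contrast with the standalone BackupEngine, a sketch of this combined
wrapper in use (paths are hypothetical; the backup directory must already
exist and be writable):

    import org.rocksdb.*;

    public class BackupableDBSketch {
      static {
        RocksDB.loadLibrary();
      }

      public static void main(String[] args) throws RocksDBException {
        Options opt = new Options().setCreateIfMissing(true);
        BackupableDBOptions bopt =
            new BackupableDBOptions("/tmp/rocksdb-backups");
        BackupableDB bdb = BackupableDB.open(opt, bopt, "/tmp/rocksdb-data");
        try {
          bdb.put("k".getBytes(), "v".getBytes());
          bdb.createNewBackup(true);              // flush, then back up
          System.out.println(bdb.getBackupInfos().size()
              + " backup(s) on disk");
        } finally {
          bdb.close();
          bopt.dispose();
          opt.dispose();
        }
      }
    }
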
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/BackupableDBOptions.java b/src/rocksdb/java/src/main/java/org/rocksdb/BackupableDBOptions.java
new file mode 100644
index 0000000..17a0afc
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/BackupableDBOptions.java
@@ -0,0 +1,271 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+import java.io.File;
+import java.nio.file.Path;
+
+/**
+ * <p>BackupableDBOptions to control the behavior of a backupable database.
+ * It will be used during the creation of a {@link org.rocksdb.BackupableDB}.
+ * </p>
+ * <p>Note that dispose() must be called before a BackupableDBOptions instance
+ * goes out of scope, to release the memory allocated in C++.</p>
+ *
+ * @see org.rocksdb.BackupableDB
+ */
+public class BackupableDBOptions extends RocksObject {
+
+  /**
+   * <p>BackupableDBOptions constructor.</p>
+   *
+   * @param path Where to keep the backup files. Has to be different from the db name.
+   *     Best to set this to {@code <db name> + "/backups"}.
+   * @throws java.lang.IllegalArgumentException if illegal path is used.
+   */
+  public BackupableDBOptions(final String path) {
+    super();
+    File backupPath = path == null ? null : new File(path);
+    if (backupPath == null || !backupPath.isDirectory() || !backupPath.canWrite()) {
+      throw new IllegalArgumentException("Illegal path provided.");
+    }
+    newBackupableDBOptions(path);
+  }
+
+  /**
+   * <p>Returns the path to the BackupableDB directory.</p>
+   *
+   * @return the path to the BackupableDB directory.
+   */
+  public String backupDir() {
+    assert(isInitialized());
+    return backupDir(nativeHandle_);
+  }
+
+  /**
+   * <p>Share table files between backups.</p>
+   *
+   * @param shareTableFiles If {@code share_table_files == true}, backup will assume
+   *     that table files with the same name have the same contents. This enables incremental
+   *     backups and avoids unnecessary data copies. If {@code share_table_files == false},
+   *     each backup will be on its own and will not share any data with other backups.
+   *
+   * <p>Default: true</p>
+   *
+   * @return instance of current BackupableDBOptions.
+   */
+  public BackupableDBOptions setShareTableFiles(final boolean shareTableFiles) {
+    assert(isInitialized());
+    setShareTableFiles(nativeHandle_, shareTableFiles);
+    return this;
+  }
+
+  /**
+   * <p>Share table files between backups.</p>
+   *
+   * @return boolean value indicating if SST files will be shared between
+   *     backups.
+   */
+  public boolean shareTableFiles() {
+    assert(isInitialized());
+    return shareTableFiles(nativeHandle_);
+  }
+
+  /**
+   * <p>Set synchronous backups.</p>
+   *
+   * @param sync If {@code sync == true}, we can guarantee you'll get a consistent backup
+   *     even on a machine crash/reboot. The backup process is slower with sync enabled.
+   *     If {@code sync == false}, we don't guarantee anything on machine reboot.
+   *     However, chances are some of the backups are consistent.
+   *
+   * <p>Default: true</p>
+   *
+   * @return instance of current BackupableDBOptions.
+   */
+  public BackupableDBOptions setSync(final boolean sync) {
+    assert(isInitialized());
+    setSync(nativeHandle_, sync);
+    return this;
+  }
+
+  /**
+   * <p>Returns whether synchronous backups are activated.</p>
+   *
+   * @return boolean value if synchronous backups are configured.
+   */
+  public boolean sync() {
+    assert(isInitialized());
+    return sync(nativeHandle_);
+  }
+
+  /**
+   * <p>Set if old data will be destroyed.</p>
+   *
+   * @param destroyOldData If true, it will delete whatever backups there are already.
+   *
+   * <p>Default: false</p>
+   *
+   * @return instance of current BackupableDBOptions.
+   */
+  public BackupableDBOptions setDestroyOldData(final boolean destroyOldData) {
+    assert(isInitialized());
+    setDestroyOldData(nativeHandle_, destroyOldData);
+    return this;
+  }
+
+  /**
+   * <p>Returns whether old data will be destroyed when performing new backups.</p>
+   *
+   * @return boolean value indicating if old data will be destroyed.
+   */
+  public boolean destroyOldData() {
+    assert(isInitialized());
+    return destroyOldData(nativeHandle_);
+  }
+
+  /**
+   * <p>Set if log files shall be persisted.</p>
+   *
+   * @param backupLogFiles If false, we won't back up log files. This option can be
+   *     useful for backing up in-memory databases where log files are persisted, but
+   *     table files are in memory.
+   *
+   * <p>Default: true</p>
+   *
+   * @return instance of current BackupableDBOptions.
+   */
+  public BackupableDBOptions setBackupLogFiles(final boolean backupLogFiles) {
+    assert(isInitialized());
+    setBackupLogFiles(nativeHandle_, backupLogFiles);
+    return this;
+  }
+
+  /**
+   * <p>Returns whether log files shall be persisted.</p>
+   *
+   * @return boolean value indicating if log files will be persisted.
+   */
+  public boolean backupLogFiles() {
+    assert(isInitialized());
+    return backupLogFiles(nativeHandle_);
+  }
+
+  /**
+   * <p>Set backup rate limit.</p>
+   *
+   * @param backupRateLimit Max bytes that can be transferred in a second during backup.
+   *     If 0 or negative, then go as fast as you can.
+   *
+   * <p>Default: 0</p>
+   *
+   * @return instance of current BackupableDBOptions.
+   */
+  public BackupableDBOptions setBackupRateLimit(long backupRateLimit) {
+    assert(isInitialized());
+    backupRateLimit = (backupRateLimit <= 0) ? 0 : backupRateLimit;
+    setBackupRateLimit(nativeHandle_, backupRateLimit);
+    return this;
+  }
+
+  /**
+   * <p>Returns the backup rate limit, which describes the max bytes that can be transferred
+   * in a second during backup.</p>
+   *
+   * @return numerical value describing the backup transfer limit in bytes per second.
+   */
+  public long backupRateLimit() {
+    assert(isInitialized());
+    return backupRateLimit(nativeHandle_);
+  }
+
+  /**
+   * <p>Set restore rate limit.</p>
+   *
+   * @param restoreRateLimit Max bytes that can be transferred in a second during restore.
+   *     If 0 or negative, then go as fast as you can.
+   *
+   * <p>Default: 0</p>
+   *
+   * @return instance of current BackupableDBOptions.
+   */
+  public BackupableDBOptions setRestoreRateLimit(long restoreRateLimit) {
+    assert(isInitialized());
+    restoreRateLimit = (restoreRateLimit <= 0) ? 0 : restoreRateLimit;
+    setRestoreRateLimit(nativeHandle_, restoreRateLimit);
+    return this;
+  }
+
+  /**
+   * <p>Returns the restore rate limit, which describes the max bytes that can be transferred
+   * in a second during restore.</p>
+   *
+   * @return numerical value describing the restore transfer limit in bytes per second.
+   */
+  public long restoreRateLimit() {
+    assert(isInitialized());
+    return restoreRateLimit(nativeHandle_);
+  }
+
+  /**
+   * <p>Only used if share_table_files is set to true. If true, will consider that
+   * backups can come from different databases, hence an SST file is not uniquely
+   * identified by its name, but by the triple (file name, crc32, file length).</p>
+   *
+   * @param shareFilesWithChecksum boolean value indicating if SST files are stored
+   *     using the triple (file name, crc32, file length) and not its name.
+   *
+   * <p>Note: this is an experimental option, and you'll need to set it manually;
+   * turn it on only if you know what you're doing.</p>
+   *
+   * <p>Default: false</p>
+   *
+   * @return instance of current BackupableDBOptions.
+   */
+  public BackupableDBOptions setShareFilesWithChecksum(
+      final boolean shareFilesWithChecksum) {
+    assert(isInitialized());
+    setShareFilesWithChecksum(nativeHandle_, shareFilesWithChecksum);
+    return this;
+  }
+
+  /**
+   * <p>Returns whether share files with checksum is active.</p>
+   *
+   * @return boolean value indicating if share files with checksum
+   *     is active.
+   */
+  public boolean shareFilesWithChecksum() {
+    assert(isInitialized());
+    return shareFilesWithChecksum(nativeHandle_);
+  }
+
+  /**
+   * Release the memory allocated for the current instance
+   * in the c++ side.
+   */
+  @Override protected void disposeInternal() {
+    disposeInternal(nativeHandle_);
+  }
+
+  private native void newBackupableDBOptions(String path);
+  private native String backupDir(long handle);
+  private native void setShareTableFiles(long handle, boolean flag);
+  private native boolean shareTableFiles(long handle);
+  private native void setSync(long handle, boolean flag);
+  private native boolean sync(long handle);
+  private native void setDestroyOldData(long handle, boolean flag);
+  private native boolean destroyOldData(long handle);
+  private native void setBackupLogFiles(long handle, boolean flag);
+  private native boolean backupLogFiles(long handle);
+  private native void setBackupRateLimit(long handle, long rateLimit);
+  private native long backupRateLimit(long handle);
+  private native void setRestoreRateLimit(long handle, long rateLimit);
+  private native long restoreRateLimit(long handle);
+  private native void setShareFilesWithChecksum(long handle, boolean flag);
+  private native boolean shareFilesWithChecksum(long handle);
+  private native void disposeInternal(long handle);
+}
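
Since every setter above returns the options instance, configuration chains
fluently. A short sketch (the backup directory is hypothetical and must
already exist, or the constructor throws IllegalArgumentException):

    import org.rocksdb.BackupableDBOptions;
    import org.rocksdb.RocksDB;

    public class BackupOptionsSketch {
      static {
        RocksDB.loadLibrary();
      }

      public static void main(String[] args) {
        BackupableDBOptions bopt =
            new BackupableDBOptions("/tmp/rocksdb-backups")
                .setShareTableFiles(true)               // incremental (default)
                .setSync(false)                         // faster, not crash-safe
                .setBackupRateLimit(10 * 1024 * 1024);  // throttle to ~10 MB/s
        System.out.println("backup dir: " + bopt.backupDir());
        bopt.dispose();
      }
    }
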
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java b/src/rocksdb/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java
new file mode 100644
index 0000000..c3c6309
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java
@@ -0,0 +1,425 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+package org.rocksdb;
+
+/**
+ * The config for block based table sst format.
+ *
+ * BlockBasedTable is RocksDB's default SST file format.
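+ *
+ * <p>A minimal configuration sketch (the sizes and values shown are
+ * illustrative only, not tuning advice):</p>
+ * <pre>{@code
+ * BlockBasedTableConfig tableConfig = new BlockBasedTableConfig()
+ *     .setBlockSize(16 * 1024)
+ *     .setBlockCacheSize(64 * 1024 * 1024)
+ *     .setFilter(new BloomFilter(10));
+ * ColumnFamilyOptions cfOptions = new ColumnFamilyOptions()
+ *     .setTableFormatConfig(tableConfig);
+ * }</pre>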
+ */
+public class BlockBasedTableConfig extends TableFormatConfig {
+
+  public BlockBasedTableConfig() {
+    noBlockCache_ = false;
+    blockCacheSize_ = 8 * 1024 * 1024;
+    blockCacheNumShardBits_ = 0;
+    blockSize_ = 4 * 1024;
+    blockSizeDeviation_ = 10;
+    blockRestartInterval_ = 16;
+    wholeKeyFiltering_ = true;
+    filter_ = null;
+    cacheIndexAndFilterBlocks_ = false;
+    hashIndexAllowCollision_ = true;
+    blockCacheCompressedSize_ = 0;
+    blockCacheCompressedNumShardBits_ = 0;
+    checksumType_ = ChecksumType.kCRC32c;
+    indexType_ = IndexType.kBinarySearch;
+    formatVersion_ = 0;
+  }
+
+  /**
+   * Disable block cache. If this is set to true,
+   * then no block cache should be used, and the block_cache should
+   * point to a {@code nullptr} object.
+   * Default: false
+   *
+   * @param noBlockCache true to disable the block cache
+   * @return the reference to the current config.
+   */
+  public BlockBasedTableConfig setNoBlockCache(final boolean noBlockCache) {
+    noBlockCache_ = noBlockCache;
+    return this;
+  }
+
+  /**
+   * @return if block cache is disabled
+   */
+  public boolean noBlockCache() {
+    return noBlockCache_;
+  }
+
+  /**
+   * Set the amount of cache in bytes that will be used by RocksDB.
+   * If cacheSize is non-positive, then cache will not be used.
+   * DEFAULT: 8M
+   *
+   * @param blockCacheSize block cache size in bytes
+   * @return the reference to the current config.
+   */
+  public BlockBasedTableConfig setBlockCacheSize(final long blockCacheSize) {
+    blockCacheSize_ = blockCacheSize;
+    return this;
+  }
+
+  /**
+   * @return block cache size in bytes
+   */
+  public long blockCacheSize() {
+    return blockCacheSize_;
+  }
+
+  /**
+   * Controls the number of shards for the block cache.
+   * This is applied only if cacheSize is set to non-negative.
+   *
+   * @param blockCacheNumShardBits the number of shard bits. The resulting
+   *     number of shards would be 2 ^ numShardBits.  Any negative
+   *     number means use default settings.
+   * @return the reference to the current option.
+   */
+  public BlockBasedTableConfig setCacheNumShardBits(
+      final int blockCacheNumShardBits) {
+    blockCacheNumShardBits_ = blockCacheNumShardBits;
+    return this;
+  }
+
+  /**
+   * Returns the number of shard bits used in the block cache.
+   * The resulting number of shards would be 2 ^ (returned value).
+   * Any negative number means use default settings.
+   *
+   * @return the number of shard bits used in the block cache.
+   */
+  public int cacheNumShardBits() {
+    return blockCacheNumShardBits_;
+  }
+
+  /**
+   * Approximate size of user data packed per block.  Note that the
+   * block size specified here corresponds to uncompressed data.  The
+   * actual size of the unit read from disk may be smaller if
+   * compression is enabled.  This parameter can be changed dynamically.
+   * Default: 4K
+   *
+   * @param blockSize block size in bytes
+   * @return the reference to the current config.
+   */
+  public BlockBasedTableConfig setBlockSize(final long blockSize) {
+    blockSize_ = blockSize;
+    return this;
+  }
+
+  /**
+   * @return block size in bytes
+   */
+  public long blockSize() {
+    return blockSize_;
+  }
+
+  /**
+   * This is used to close a block before it reaches the configured
+   * 'block_size'. If the percentage of free space in the current block is less
+   * than this specified number and adding a new record to the block will
+   * exceed the configured block size, then this block will be closed and the
+   * new record will be written to the next block.
+   * Default is 10.
+   *
+   * @param blockSizeDeviation the deviation to block size allowed
+   * @return the reference to the current config.
+   */
+  public BlockBasedTableConfig setBlockSizeDeviation(
+      final int blockSizeDeviation) {
+    blockSizeDeviation_ = blockSizeDeviation;
+    return this;
+  }
+
+  /**
+   * @return the block size deviation.
+   */
+  public int blockSizeDeviation() {
+    return blockSizeDeviation_;
+  }
+
+  /**
+   * Set block restart interval
+   *
+   * @param restartInterval block restart interval.
+   * @return the reference to the current config.
+   */
+  public BlockBasedTableConfig setBlockRestartInterval(
+      final int restartInterval) {
+    blockRestartInterval_ = restartInterval;
+    return this;
+  }
+
+  /**
+   * @return block restart interval
+   */
+  public int blockRestartInterval() {
+    return blockRestartInterval_;
+  }
+
+  /**
+   * If true, place whole keys in the filter (not just prefixes).
+   * This must generally be true for gets to be efficient.
+   * Default: true
+   *
+   * @param wholeKeyFiltering true to enable whole key filtering
+   * @return the reference to the current config.
+   */
+  public BlockBasedTableConfig setWholeKeyFiltering(
+      final boolean wholeKeyFiltering) {
+    wholeKeyFiltering_ = wholeKeyFiltering;
+    return this;
+  }
+
+  /**
+   * @return if whole key filtering is enabled
+   */
+  public boolean wholeKeyFiltering() {
+    return wholeKeyFiltering_;
+  }
+
+  /**
+   * Use the specified filter policy to reduce disk reads.
+   *
+   * {@link org.rocksdb.Filter} should not be disposed before the options
+   * instances using this filter are disposed. If the {@link Filter#dispose()}
+   * function is not called, the filter object will be GC'd automatically.
+   *
+   * {@link org.rocksdb.Filter} instance can be re-used in multiple options
+   * instances.
+   *
+   * @param filter {@link org.rocksdb.Filter} Filter Policy java instance.
+   * @return the reference to the current config.
+   */
+  public BlockBasedTableConfig setFilter(
+      final Filter filter) {
+    filter_ = filter;
+    return this;
+  }
+
+  /**
+   * Indicates if index/filter blocks are put into the block cache.
+   * If not specified, each "table reader" object will pre-load the
+   * index/filter block during table initialization.
+   *
+   * @return if index and filter blocks should be put in block cache.
+   */
+  public boolean cacheIndexAndFilterBlocks() {
+    return cacheIndexAndFilterBlocks_;
+  }
+
+  /**
+   * Indicates if index/filter blocks should be put into the block cache.
+   * If not specified, each "table reader" object will pre-load the
+   * index/filter block during table initialization.
+   *
+   * @param cacheIndexAndFilterBlocks true if index and filter blocks should
+   *     be put in the block cache.
+   * @return the reference to the current config.
+   */
+  public BlockBasedTableConfig setCacheIndexAndFilterBlocks(
+      final boolean cacheIndexAndFilterBlocks) {
+    cacheIndexAndFilterBlocks_ = cacheIndexAndFilterBlocks;
+    return this;
+  }
+
+  /**
+   * Influences the behavior when kHashSearch is used.
+   * If false, stores a precise prefix-to-block-range mapping;
+   * if true, does not store the prefix and allows prefix hash collisions
+   * (less memory consumption).
+   *
+   * @return if hash collisions should be allowed.
+   */
+  public boolean hashIndexAllowCollision() {
+    return hashIndexAllowCollision_;
+  }
+
+  /**
+   * Influences the behavior when kHashSearch is used.
+   * If false, stores a precise prefix-to-block-range mapping;
+   * if true, does not store the prefix and allows prefix hash collisions
+   * (less memory consumption).
+   *
+   * @param hashIndexAllowCollision true if hash collisions should be allowed.
+   * @return the reference to the current config.
+   */
+  public BlockBasedTableConfig setHashIndexAllowCollision(
+      final boolean hashIndexAllowCollision) {
+    hashIndexAllowCollision_ = hashIndexAllowCollision;
+    return this;
+  }
+
+  /**
+   * Size of compressed block cache. If 0, then block_cache_compressed is set
+   * to null.
+   *
+   * @return size of compressed block cache.
+   */
+  public long blockCacheCompressedSize() {
+    return blockCacheCompressedSize_;
+  }
+
+  /**
+   * Size of compressed block cache. If 0, then block_cache_compressed is set
+   * to null.
+   *
+   * @param blockCacheCompressedSize size of the compressed block cache in bytes.
+   * @return the reference to the current config.
+   */
+  public BlockBasedTableConfig setBlockCacheCompressedSize(
+      final long blockCacheCompressedSize) {
+    blockCacheCompressedSize_ = blockCacheCompressedSize;
+    return this;
+  }
+
+  /**
+   * Controls the number of shards for the block compressed cache.
+   * This is applied only if blockCompressedCacheSize is set to non-negative.
+   *
+   * @return the number of shard bits.  The resulting
+   *     number of shards would be 2 ^ numShardBits.  Any negative
+   *     number means use default settings.
+   */
+  public int blockCacheCompressedNumShardBits() {
+    return blockCacheCompressedNumShardBits_;
+  }
+
+  /**
+   * Controls the number of shards for the block compressed cache.
+   * This is applied only if blockCompressedCacheSize is set to non-negative.
+   *
+   * @param blockCacheCompressedNumShardBits the number of shard bits.  The resulting
+   *     number of shards would be 2 ^ numShardBits.  Any negative
+   *     number means use default settings.
+   * @return the reference to the current option.
+   */
+  public BlockBasedTableConfig setBlockCacheCompressedNumShardBits(
+      final int blockCacheCompressedNumShardBits) {
+    blockCacheCompressedNumShardBits_ = blockCacheCompressedNumShardBits;
+    return this;
+  }
+
+  /**
+   * Sets the checksum type to be used with this table.
+   *
+   * @param checksumType {@link org.rocksdb.ChecksumType} value.
+   * @return the reference to the current option.
+   */
+  public BlockBasedTableConfig setChecksumType(
+      final ChecksumType checksumType) {
+    checksumType_ = checksumType;
+    return this;
+  }
+
+  /**
+   *
+   * @return the currently set checksum type
+   */
+  public ChecksumType checksumType() {
+    return checksumType_;
+  }
+
+  /**
+   * Sets the index type to be used with this table.
+   *
+   * @param indexType {@link org.rocksdb.IndexType} value
+   * @return the reference to the current option.
+   */
+  public BlockBasedTableConfig setIndexType(
+      final IndexType indexType) {
+    indexType_ = indexType;
+    return this;
+  }
+
+  /**
+   *
+   * @return the currently set index type
+   */
+  public IndexType indexType() {
+    return indexType_;
+  }
+
+  /**
+   * <p>We currently have three versions:</p>
+   *
+   * <ul>
+   * <li><strong>0</strong> - This version is currently written
+   * out by all RocksDB versions by default. Can be read by very old
+   * RocksDB versions. Doesn't support changing the checksum type (default
+   * is CRC32).</li>
+   * <li><strong>1</strong> - Can be read by RocksDB versions since 3.0.
+   * Supports non-default checksum types, like xxHash. It is written by RocksDB
+   * when BlockBasedTableOptions::checksum is something other than kCRC32c.
+   * (version 0 is silently upconverted)</li>
+   * <li><strong>2</strong> - Can be read by RocksDB versions since 3.10.
+   * Changes the way we encode compressed blocks with LZ4, BZip2 and Zlib
+   * compression. If you don't plan to run RocksDB before version 3.10,
+   * you should probably use this.</li>
+   * </ul>
+   * <p> This option only affects newly written tables. When reading existing
+   * tables, the information about version is read from the footer.</p>
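+   *
+   * <p>For example (a sketch; version 2 is only safe if no pre-3.10 RocksDB
+   * will ever read the resulting files):</p>
+   * <pre>{@code
+   * BlockBasedTableConfig tableConfig = new BlockBasedTableConfig()
+   *     .setFormatVersion(2);
+   * }</pre>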
+   *
+   * @param formatVersion integer representing the version to be used.
+   * @return the reference to the current option.
+   */
+  public BlockBasedTableConfig setFormatVersion(
+      final int formatVersion) {
+    assert(formatVersion >= 0 && formatVersion <= 2);
+    formatVersion_ = formatVersion;
+    return this;
+  }
+
+  /**
+   *
+   * @return the currently configured format version.
+   * See also: {@link #setFormatVersion(int)}.
+   */
+  public int formatVersion() {
+    return formatVersion_;
+  }
+
+  @Override protected long newTableFactoryHandle() {
+    long filterHandle = 0;
+    if (filter_ != null) {
+      filterHandle = filter_.nativeHandle_;
+    }
+
+    return newTableFactoryHandle(noBlockCache_, blockCacheSize_,
+        blockCacheNumShardBits_, blockSize_, blockSizeDeviation_,
+        blockRestartInterval_, wholeKeyFiltering_,
+        filterHandle, cacheIndexAndFilterBlocks_,
+        hashIndexAllowCollision_, blockCacheCompressedSize_,
+        blockCacheCompressedNumShardBits_,
+        checksumType_.getValue(), indexType_.getValue(),
+        formatVersion_);
+  }
+
+  private native long newTableFactoryHandle(
+      boolean noBlockCache, long blockCacheSize, int blockCacheNumShardBits,
+      long blockSize, int blockSizeDeviation, int blockRestartInterval,
+      boolean wholeKeyFiltering, long filterPolicyHandle,
+      boolean cacheIndexAndFilterBlocks, boolean hashIndexAllowCollision,
+      long blockCacheCompressedSize, int blockCacheCompressedNumShardBits,
+      byte checkSumType, byte indexType, int formatVersion);
+
+  private boolean cacheIndexAndFilterBlocks_;
+  private IndexType indexType_;
+  private boolean hashIndexAllowCollision_;
+  private ChecksumType checksumType_;
+  private boolean noBlockCache_;
+  private long blockSize_;
+  private long blockCacheSize_;
+  private int blockCacheNumShardBits_;
+  private long blockCacheCompressedSize_;
+  private int blockCacheCompressedNumShardBits_;
+  private int blockSizeDeviation_;
+  private int blockRestartInterval_;
+  private Filter filter_;
+  private boolean wholeKeyFiltering_;
+  private int formatVersion_;
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/BloomFilter.java b/src/rocksdb/java/src/main/java/org/rocksdb/BloomFilter.java
new file mode 100644
index 0000000..67c45d7
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/BloomFilter.java
@@ -0,0 +1,89 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+/**
+ * Bloom filter policy that uses a bloom filter with approximately
+ * the specified number of bits per key.
+ *
+ * <p>
+ * Note: if you are using a custom comparator that ignores some parts
+ * of the keys being compared, you must not use this {@code BloomFilter}
+ * and must provide your own FilterPolicy that also ignores the
+ * corresponding parts of the keys. For example, if the comparator
+ * ignores trailing spaces, it would be incorrect to use a
+ * FilterPolicy (like {@code BloomFilter}) that does not ignore
+ * trailing spaces in keys.</p>
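+ *
+ * <p>A minimal usage sketch (10 bits/key is illustrative and matches the
+ * default; the filter is attached through a {@link BlockBasedTableConfig}):</p>
+ * <pre>{@code
+ * BlockBasedTableConfig tableConfig = new BlockBasedTableConfig()
+ *     .setFilter(new BloomFilter(10));
+ * }</pre>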
+ */
+public class BloomFilter extends Filter {
+
+  private static final int DEFAULT_BITS_PER_KEY = 10;
+  private static final boolean DEFAULT_MODE = true;
+  private final int bitsPerKey_;
+  private final boolean useBlockBasedMode_;
+
+  /**
+   * BloomFilter constructor
+   *
+   * <p>
+   * Callers must delete the result after any database that is using the
+   * result has been closed.</p>
+   */
+  public BloomFilter() {
+    this(DEFAULT_BITS_PER_KEY, DEFAULT_MODE);
+  }
+
+  /**
+   * BloomFilter constructor
+   *
+   * <p>
+   * bits_per_key: bits per key in bloom filter. A good value for bits_per_key
+   * is 10, which yields a filter with ~ 1% false positive rate.
+   * </p>
+   * <p>
+   * Callers must delete the result after any database that is using the
+   * result has been closed.</p>
+   *
+   * @param bitsPerKey number of bits to use
+   */
+  public BloomFilter(final int bitsPerKey) {
+    this(bitsPerKey, DEFAULT_MODE);
+  }
+
+  /**
+   * BloomFilter constructor
+   *
+   * <p>
+   * bits_per_key: bits per key in bloom filter. A good value for bits_per_key
+   * is 10, which yields a filter with ~ 1% false positive rate.
+   * <p><strong>default bits_per_key</strong>: 10</p>
+   *
+   * <p>use_block_based_builder: use block based filter rather than full filter.
+   * If you want to build a full filter, it needs to be set to false.
+   * </p>
+   * <p><strong>default mode: block based filter</strong></p>
+   * <p>
+   * Callers must delete the result after any database that is using the
+   * result has been closed.</p>
+   *
+   * @param bitsPerKey number of bits to use
+   * @param useBlockBasedMode use block based mode or full filter mode
+   */
+  public BloomFilter(final int bitsPerKey, final boolean useBlockBasedMode) {
+    super();
+    bitsPerKey_ = bitsPerKey;
+    useBlockBasedMode_ = useBlockBasedMode;
+    createNewFilter();
+  }
+
+  @Override
+  protected final void createNewFilter() {
+    createNewBloomFilter(bitsPerKey_, useBlockBasedMode_);
+  }
+
+  private native void createNewBloomFilter(int bitsPerKey,
+      boolean useBlockBasedMode);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/BuiltinComparator.java b/src/rocksdb/java/src/main/java/org/rocksdb/BuiltinComparator.java
new file mode 100644
index 0000000..ee92e8d
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/BuiltinComparator.java
@@ -0,0 +1,20 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+/**
+ * Builtin RocksDB comparators
+ *
+ * <ol>
+ *   <li>BYTEWISE_COMPARATOR - Sorts all keys in ascending bytewise
+ *   order.</li>
+ *   <li>REVERSE_BYTEWISE_COMPARATOR - Sorts all keys in descending bytewise
+ *   order</li>
+ * </ol>
+ */
+public enum BuiltinComparator {
+  BYTEWISE_COMPARATOR, REVERSE_BYTEWISE_COMPARATOR
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/Checkpoint.java b/src/rocksdb/java/src/main/java/org/rocksdb/Checkpoint.java
new file mode 100644
index 0000000..816ecea
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/Checkpoint.java
@@ -0,0 +1,72 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+/**
+ * Provides Checkpoint functionality. Checkpoints
+ * provide persistent snapshots of RocksDB databases.
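+ *
+ * <p>A minimal usage sketch (assumes {@code db} is an open and initialized
+ * {@link RocksDB} instance; the path is illustrative):</p>
+ * <pre>{@code
+ * Checkpoint checkpoint = Checkpoint.create(db);
+ * checkpoint.createCheckpoint("/tmp/my-snapshot"); // throws RocksDBException on failure
+ * checkpoint.dispose();
+ * }</pre>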
+ */
+public class Checkpoint extends RocksObject {
+
+  /**
+   * Creates a Checkpoint object to be used for creating open-able
+   * snapshots.
+   *
+   * @param db {@link RocksDB} instance.
+   * @return a Checkpoint instance.
+   *
+   * @throws java.lang.IllegalArgumentException if {@link RocksDB}
+   *     instance is null.
+   * @throws java.lang.IllegalStateException if {@link RocksDB}
+   *     instance is not initialized.
+   */
+  public static Checkpoint create(final RocksDB db) {
+    if (db == null) {
+      throw new IllegalArgumentException(
+          "RocksDB instance shall not be null.");
+    } else if (!db.isInitialized()) {
+      throw new IllegalStateException(
+          "RocksDB instance must be initialized.");
+    }
+    Checkpoint checkpoint = new Checkpoint(db);
+    return checkpoint;
+  }
+
+  /**
+   * <p>Builds an open-able snapshot of RocksDB. It accepts an output
+   * directory on the same disk and creates under that directory:
+   * (1) hard-linked SST files pointing to existing live SST files, and
+   * (2) copies of the manifest and other required files.</p>
+   *
+   * @param checkpointPath path to the folder where the snapshot is going
+   *     to be stored.
+   * @throws RocksDBException thrown if an error occurs within the native
+   *     part of the library.
+   */
+  public void createCheckpoint(final String checkpointPath)
+      throws RocksDBException {
+    createCheckpoint(nativeHandle_, checkpointPath);
+  }
+
+  @Override
+  protected void disposeInternal() {
+    disposeInternal(nativeHandle_);
+  }
+
+  private Checkpoint(RocksDB db) {
+    super();
+    nativeHandle_ = newCheckpoint(db.nativeHandle_);
+    db_ = db;
+  }
+
+  private RocksDB db_;
+
+  private static native long newCheckpoint(long dbHandle);
+  private native void disposeInternal(long handle);
+
+  private native void createCheckpoint(long handle, String checkpointPath)
+      throws RocksDBException;
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/ChecksumType.java b/src/rocksdb/java/src/main/java/org/rocksdb/ChecksumType.java
new file mode 100644
index 0000000..e685376
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/ChecksumType.java
@@ -0,0 +1,39 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+/**
+ * Checksum types used in conjunction with BlockBasedTable.
+ */
+public enum ChecksumType {
+  /**
+   * Not implemented yet.
+   */
+  kNoChecksum((byte) 0),
+  /**
+   * CRC32 Checksum
+   */
+  kCRC32c((byte) 1),
+  /**
+   * XX Hash
+   */
+  kxxHash((byte) 2);
+
+  /**
+   * Returns the byte value of the enumeration's value.
+   *
+   * @return byte representation
+   */
+  public byte getValue() {
+    return value_;
+  }
+
+  private ChecksumType(byte value) {
+    value_ = value;
+  }
+
+  private final byte value_;
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyDescriptor.java b/src/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyDescriptor.java
new file mode 100644
index 0000000..8def05e
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyDescriptor.java
@@ -0,0 +1,61 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+/**
+ * <p>Describes a column family with a
+ * name and respective Options.</p>
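+ *
+ * <p>A minimal usage sketch (the family name is illustrative):</p>
+ * <pre>{@code
+ * ColumnFamilyDescriptor descriptor = new ColumnFamilyDescriptor(
+ *     "my_cf".getBytes(), new ColumnFamilyOptions());
+ * }</pre>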
+ */
+public class ColumnFamilyDescriptor {
+
+  /**
+   * <p>Creates a new Column Family using a name and default
+   * options.</p>
+   *
+   * @param columnFamilyName name of column family.
+   * @since 3.10.0
+   */
+  public ColumnFamilyDescriptor(final byte[] columnFamilyName) {
+    this(columnFamilyName, new ColumnFamilyOptions());
+  }
+
+  /**
+   * <p>Creates a new Column Family using a name and custom
+   * options.</p>
+   *
+   * @param columnFamilyName name of column family.
+   * @param columnFamilyOptions options to be used with
+   *     column family.
+   * @since 3.10.0
+   */
+  public ColumnFamilyDescriptor(final byte[] columnFamilyName,
+      final ColumnFamilyOptions columnFamilyOptions) {
+    columnFamilyName_ = columnFamilyName;
+    columnFamilyOptions_ = columnFamilyOptions;
+  }
+
+  /**
+   * Retrieve name of column family.
+   *
+   * @return column family name.
+   * @since 3.10.0
+   */
+  public byte[] columnFamilyName() {
+    return columnFamilyName_;
+  }
+
+  /**
+   * Retrieve assigned options instance.
+   *
+   * @return Options instance assigned to this instance.
+   */
+  public ColumnFamilyOptions columnFamilyOptions() {
+    return columnFamilyOptions_;
+  }
+
+  private final byte[] columnFamilyName_;
+  private final ColumnFamilyOptions columnFamilyOptions_;
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyHandle.java b/src/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyHandle.java
new file mode 100644
index 0000000..613cb89
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyHandle.java
@@ -0,0 +1,45 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+/**
+ * ColumnFamilyHandle class to hold handles to underlying rocksdb
+ * ColumnFamily Pointers.
+ */
+public class ColumnFamilyHandle extends RocksObject {
+  ColumnFamilyHandle(final RocksDB rocksDB,
+      final long nativeHandle) {
+    super();
+    nativeHandle_ = nativeHandle;
+    // rocksDB must point to a valid RocksDB instance;
+    assert(rocksDB != null);
+    // ColumnFamilyHandle must hold a reference to the related RocksDB instance
+    // to guarantee that while a GC cycle starts ColumnFamilyHandle instances
+    // are freed prior to RocksDB instances.
+    rocksDB_ = rocksDB;
+  }
+
+  /**
+   * <p>Deletes the underlying C++ column family handle pointer.</p>
+   *
+   * <p>Note: the underlying handle can only be safely deleted if the RocksDB
+   * instance related to a certain ColumnFamilyHandle is still valid and
+   * initialized. Therefore {@code disposeInternal()} checks if the RocksDB
+   * instance is initialized before freeing the native handle.</p>
+   */
+  @Override protected void disposeInternal() {
+    synchronized (rocksDB_) {
+      assert (isInitialized());
+      if (rocksDB_.isInitialized()) {
+        disposeInternal(nativeHandle_);
+      }
+    }
+  }
+
+  private native void disposeInternal(long handle);
+
+  private final RocksDB rocksDB_;
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java b/src/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java
new file mode 100644
index 0000000..4304f58
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java
@@ -0,0 +1,820 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Properties;
+
+/**
+ * ColumnFamilyOptions to control the behavior of a database.  It will be used
+ * during the creation of a {@link org.rocksdb.RocksDB} (i.e., RocksDB.open()).
+ *
+ * If the {@link #dispose()} function is not called, then it will be GC'd automatically
+ * and native resources will be released as part of the process.
+ */
+public class ColumnFamilyOptions extends RocksObject
+    implements ColumnFamilyOptionsInterface {
+  static {
+    RocksDB.loadLibrary();
+  }
+
+  /**
+   * Construct ColumnFamilyOptions.
+   *
+   * This constructor will create (by allocating a block of memory)
+   * an {@code rocksdb::ColumnFamilyOptions} on the c++ side.
+   */
+  public ColumnFamilyOptions() {
+    super();
+    newColumnFamilyOptions();
+  }
+
+  /**
+   * <p>Method to get an options instance using pre-configured
+   * property values. If one or more values are undefined in
+   * the context of RocksDB, the method will return a null
+   * value.</p>
+   *
+   * <p><strong>Note</strong>: Property keys can be derived from
+   * getter methods within the options class. Example: the method
+   * {@code writeBufferSize()} has a property key:
+   * {@code write_buffer_size}.</p>
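+   *
+   * <p>For example (a sketch; the property key shown corresponds to
+   * {@code writeBufferSize()}, and the value is illustrative):</p>
+   * <pre>{@code
+   * Properties props = new Properties();
+   * props.setProperty("write_buffer_size", "67108864");
+   * ColumnFamilyOptions cfOptions =
+   *     ColumnFamilyOptions.getColumnFamilyOptionsFromProps(props);
+   * }</pre>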
+   *
+   * @param properties {@link java.util.Properties} instance.
+   *
+   * @return {@link org.rocksdb.ColumnFamilyOptions} instance
+   *     or null.
+   *
+   * @throws java.lang.IllegalArgumentException if null or empty
+   *     {@link Properties} instance is passed to the method call.
+   */
+  public static ColumnFamilyOptions getColumnFamilyOptionsFromProps(
+      final Properties properties) {
+    if (properties == null || properties.size() == 0) {
+      throw new IllegalArgumentException(
+          "Properties value must contain at least one value.");
+    }
+    ColumnFamilyOptions columnFamilyOptions = null;
+    StringBuilder stringBuilder = new StringBuilder();
+    for (final String name : properties.stringPropertyNames()){
+      stringBuilder.append(name);
+      stringBuilder.append("=");
+      stringBuilder.append(properties.getProperty(name));
+      stringBuilder.append(";");
+    }
+    long handle = getColumnFamilyOptionsFromProps(
+        stringBuilder.toString());
+    if (handle != 0){
+      columnFamilyOptions = new ColumnFamilyOptions(handle);
+    }
+    return columnFamilyOptions;
+  }
+
+  @Override
+  public ColumnFamilyOptions optimizeForPointLookup(
+      final long blockCacheSizeMb) {
+    optimizeForPointLookup(nativeHandle_,
+        blockCacheSizeMb);
+    return this;
+  }
+
+  @Override
+  public ColumnFamilyOptions optimizeLevelStyleCompaction() {
+    optimizeLevelStyleCompaction(nativeHandle_,
+        DEFAULT_COMPACTION_MEMTABLE_MEMORY_BUDGET);
+    return this;
+  }
+
+  @Override
+  public ColumnFamilyOptions optimizeLevelStyleCompaction(
+      final long memtableMemoryBudget) {
+    optimizeLevelStyleCompaction(nativeHandle_,
+        memtableMemoryBudget);
+    return this;
+  }
+
+  @Override
+  public ColumnFamilyOptions optimizeUniversalStyleCompaction() {
+    optimizeUniversalStyleCompaction(nativeHandle_,
+        DEFAULT_COMPACTION_MEMTABLE_MEMORY_BUDGET);
+    return this;
+  }
+
+  @Override
+  public ColumnFamilyOptions optimizeUniversalStyleCompaction(
+      final long memtableMemoryBudget) {
+    optimizeUniversalStyleCompaction(nativeHandle_,
+        memtableMemoryBudget);
+    return this;
+  }
+
+  @Override
+  public ColumnFamilyOptions setComparator(final BuiltinComparator builtinComparator) {
+    assert(isInitialized());
+    setComparatorHandle(nativeHandle_, builtinComparator.ordinal());
+    return this;
+  }
+
+  @Override
+  public ColumnFamilyOptions setComparator(
+      final AbstractComparator<? extends AbstractSlice<?>> comparator) {
+    assert (isInitialized());
+    setComparatorHandle(nativeHandle_, comparator.nativeHandle_);
+    comparator_ = comparator;
+    return this;
+  }
+
+  @Override
+  public ColumnFamilyOptions setMergeOperatorName(final String name) {
+    assert (isInitialized());
+    if (name == null) {
+      throw new IllegalArgumentException(
+          "Merge operator name must not be null.");
+    }
+    setMergeOperatorName(nativeHandle_, name);
+    return this;
+  }
+
+  @Override
+  public ColumnFamilyOptions setMergeOperator(final MergeOperator mergeOperator) {
+    setMergeOperator(nativeHandle_, mergeOperator.newMergeOperatorHandle());
+    return this;
+  }
+
+  public ColumnFamilyOptions setCompactionFilter(
+        final AbstractCompactionFilter<? extends AbstractSlice<?>> compactionFilter) {
+    setCompactionFilterHandle(nativeHandle_, compactionFilter.nativeHandle_);
+    compactionFilter_ = compactionFilter;
+    return this;
+  }
+
+  @Override
+  public ColumnFamilyOptions setWriteBufferSize(final long writeBufferSize) {
+    assert(isInitialized());
+    setWriteBufferSize(nativeHandle_, writeBufferSize);
+    return this;
+  }
+
+  @Override
+  public long writeBufferSize()  {
+    assert(isInitialized());
+    return writeBufferSize(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setMaxWriteBufferNumber(
+      final int maxWriteBufferNumber) {
+    assert(isInitialized());
+    setMaxWriteBufferNumber(nativeHandle_, maxWriteBufferNumber);
+    return this;
+  }
+
+  @Override
+  public int maxWriteBufferNumber() {
+    assert(isInitialized());
+    return maxWriteBufferNumber(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setMinWriteBufferNumberToMerge(
+      final int minWriteBufferNumberToMerge) {
+    setMinWriteBufferNumberToMerge(nativeHandle_, minWriteBufferNumberToMerge);
+    return this;
+  }
+
+  @Override
+  public int minWriteBufferNumberToMerge() {
+    return minWriteBufferNumberToMerge(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions useFixedLengthPrefixExtractor(final int n) {
+    assert(isInitialized());
+    useFixedLengthPrefixExtractor(nativeHandle_, n);
+    return this;
+  }
+
+  @Override
+  public ColumnFamilyOptions useCappedPrefixExtractor(final int n) {
+    assert(isInitialized());
+    useCappedPrefixExtractor(nativeHandle_, n);
+    return this;
+  }
+
+  @Override
+  public ColumnFamilyOptions setCompressionType(final CompressionType compressionType) {
+    setCompressionType(nativeHandle_, compressionType.getValue());
+    return this;
+  }
+
+  @Override
+  public CompressionType compressionType() {
+    return CompressionType.values()[compressionType(nativeHandle_)];
+  }
+
+  @Override
+  public ColumnFamilyOptions setCompressionPerLevel(
+      final List<CompressionType> compressionLevels) {
+    final List<Byte> byteCompressionTypes = new ArrayList<>(
+        compressionLevels.size());
+    for (final CompressionType compressionLevel : compressionLevels) {
+      byteCompressionTypes.add(compressionLevel.getValue());
+    }
+    setCompressionPerLevel(nativeHandle_, byteCompressionTypes);
+    return this;
+  }
+
+  @Override
+  public List<CompressionType> compressionPerLevel() {
+    final List<Byte> byteCompressionTypes =
+        compressionPerLevel(nativeHandle_);
+    final List<CompressionType> compressionLevels = new ArrayList<>();
+    for (final Byte byteCompressionType : byteCompressionTypes) {
+      compressionLevels.add(CompressionType.getCompressionType(
+          byteCompressionType));
+    }
+    return compressionLevels;
+  }
+
+  @Override
+  public ColumnFamilyOptions setNumLevels(final int numLevels) {
+    setNumLevels(nativeHandle_, numLevels);
+    return this;
+  }
+
+  @Override
+  public int numLevels() {
+    return numLevels(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setLevelZeroFileNumCompactionTrigger(
+      final int numFiles) {
+    setLevelZeroFileNumCompactionTrigger(
+        nativeHandle_, numFiles);
+    return this;
+  }
+
+  @Override
+  public int levelZeroFileNumCompactionTrigger() {
+    return levelZeroFileNumCompactionTrigger(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setLevelZeroSlowdownWritesTrigger(
+      final int numFiles) {
+    setLevelZeroSlowdownWritesTrigger(nativeHandle_, numFiles);
+    return this;
+  }
+
+  @Override
+  public int levelZeroSlowdownWritesTrigger() {
+    return levelZeroSlowdownWritesTrigger(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setLevelZeroStopWritesTrigger(final int numFiles) {
+    setLevelZeroStopWritesTrigger(nativeHandle_, numFiles);
+    return this;
+  }
+
+  @Override
+  public int levelZeroStopWritesTrigger() {
+    return levelZeroStopWritesTrigger(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setMaxMemCompactionLevel(
+      final int maxMemCompactionLevel) {
+    return this;
+  }
+
+  @Override
+  public int maxMemCompactionLevel() {
+    return 0;
+  }
+
+  @Override
+  public ColumnFamilyOptions setTargetFileSizeBase(
+      final long targetFileSizeBase) {
+    setTargetFileSizeBase(nativeHandle_, targetFileSizeBase);
+    return this;
+  }
+
+  @Override
+  public long targetFileSizeBase() {
+    return targetFileSizeBase(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setTargetFileSizeMultiplier(
+      final int multiplier) {
+    setTargetFileSizeMultiplier(nativeHandle_, multiplier);
+    return this;
+  }
+
+  @Override
+  public int targetFileSizeMultiplier() {
+    return targetFileSizeMultiplier(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setMaxBytesForLevelBase(
+      final long maxBytesForLevelBase) {
+    setMaxBytesForLevelBase(nativeHandle_, maxBytesForLevelBase);
+    return this;
+  }
+
+  @Override
+  public long maxBytesForLevelBase() {
+    return maxBytesForLevelBase(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setLevelCompactionDynamicLevelBytes(
+      final boolean enableLevelCompactionDynamicLevelBytes) {
+    setLevelCompactionDynamicLevelBytes(nativeHandle_,
+        enableLevelCompactionDynamicLevelBytes);
+    return this;
+  }
+
+  @Override
+  public boolean levelCompactionDynamicLevelBytes() {
+    return levelCompactionDynamicLevelBytes(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setMaxBytesForLevelMultiplier(
+      final int multiplier) {
+    setMaxBytesForLevelMultiplier(nativeHandle_, multiplier);
+    return this;
+  }
+
+  @Override
+  public int maxBytesForLevelMultiplier() {
+    return maxBytesForLevelMultiplier(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setExpandedCompactionFactor(
+      final int expandedCompactionFactor) {
+    setExpandedCompactionFactor(nativeHandle_, expandedCompactionFactor);
+    return this;
+  }
+
+  @Override
+  public int expandedCompactionFactor() {
+    return expandedCompactionFactor(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setSourceCompactionFactor(
+      final int sourceCompactionFactor) {
+    setSourceCompactionFactor(nativeHandle_, sourceCompactionFactor);
+    return this;
+  }
+
+  @Override
+  public int sourceCompactionFactor() {
+    return sourceCompactionFactor(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setMaxGrandparentOverlapFactor(
+      final int maxGrandparentOverlapFactor) {
+    setMaxGrandparentOverlapFactor(nativeHandle_, maxGrandparentOverlapFactor);
+    return this;
+  }
+
+  @Override
+  public int maxGrandparentOverlapFactor() {
+    return maxGrandparentOverlapFactor(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setSoftRateLimit(
+      final double softRateLimit) {
+    setSoftRateLimit(nativeHandle_, softRateLimit);
+    return this;
+  }
+
+  @Override
+  public double softRateLimit() {
+    return softRateLimit(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setHardRateLimit(
+      final double hardRateLimit) {
+    setHardRateLimit(nativeHandle_, hardRateLimit);
+    return this;
+  }
+
+  @Override
+  public double hardRateLimit() {
+    return hardRateLimit(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setRateLimitDelayMaxMilliseconds(
+      final int rateLimitDelayMaxMilliseconds) {
+    setRateLimitDelayMaxMilliseconds(
+        nativeHandle_, rateLimitDelayMaxMilliseconds);
+    return this;
+  }
+
+  @Override
+  public int rateLimitDelayMaxMilliseconds() {
+    return rateLimitDelayMaxMilliseconds(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setArenaBlockSize(
+      final long arenaBlockSize) {
+    setArenaBlockSize(nativeHandle_, arenaBlockSize);
+    return this;
+  }
+
+  @Override
+  public long arenaBlockSize() {
+    return arenaBlockSize(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setDisableAutoCompactions(
+      final boolean disableAutoCompactions) {
+    setDisableAutoCompactions(nativeHandle_, disableAutoCompactions);
+    return this;
+  }
+
+  @Override
+  public boolean disableAutoCompactions() {
+    return disableAutoCompactions(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setPurgeRedundantKvsWhileFlush(
+      final boolean purgeRedundantKvsWhileFlush) {
+    setPurgeRedundantKvsWhileFlush(
+        nativeHandle_, purgeRedundantKvsWhileFlush);
+    return this;
+  }
+
+  @Override
+  public boolean purgeRedundantKvsWhileFlush() {
+    return purgeRedundantKvsWhileFlush(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setCompactionStyle(
+      final CompactionStyle compactionStyle) {
+    setCompactionStyle(nativeHandle_, compactionStyle.getValue());
+    return this;
+  }
+
+  @Override
+  public CompactionStyle compactionStyle() {
+    return CompactionStyle.values()[compactionStyle(nativeHandle_)];
+  }
+
+  @Override
+  public ColumnFamilyOptions setMaxTableFilesSizeFIFO(
+      final long maxTableFilesSize) {
+    assert(maxTableFilesSize > 0); // unsigned native type
+    assert(isInitialized());
+    setMaxTableFilesSizeFIFO(nativeHandle_, maxTableFilesSize);
+    return this;
+  }
+
+  @Override
+  public long maxTableFilesSizeFIFO() {
+    return maxTableFilesSizeFIFO(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setVerifyChecksumsInCompaction(
+      final boolean verifyChecksumsInCompaction) {
+    setVerifyChecksumsInCompaction(
+        nativeHandle_, verifyChecksumsInCompaction);
+    return this;
+  }
+
+  @Override
+  public boolean verifyChecksumsInCompaction() {
+    return verifyChecksumsInCompaction(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setFilterDeletes(
+      final boolean filterDeletes) {
+    setFilterDeletes(nativeHandle_, filterDeletes);
+    return this;
+  }
+
+  @Override
+  public boolean filterDeletes() {
+    return filterDeletes(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setMaxSequentialSkipInIterations(
+      final long maxSequentialSkipInIterations) {
+    setMaxSequentialSkipInIterations(nativeHandle_, maxSequentialSkipInIterations);
+    return this;
+  }
+
+  @Override
+  public long maxSequentialSkipInIterations() {
+    return maxSequentialSkipInIterations(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setMemTableConfig(
+      final MemTableConfig config) {
+    memTableConfig_ = config;
+    setMemTableFactory(nativeHandle_, config.newMemTableFactoryHandle());
+    return this;
+  }
+
+  @Override
+  public String memTableFactoryName() {
+    assert(isInitialized());
+    return memTableFactoryName(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setTableFormatConfig(
+      final TableFormatConfig config) {
+    tableFormatConfig_ = config;
+    setTableFactory(nativeHandle_, config.newTableFactoryHandle());
+    return this;
+  }
+
+  @Override
+  public String tableFactoryName() {
+    assert(isInitialized());
+    return tableFactoryName(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setInplaceUpdateSupport(
+      final boolean inplaceUpdateSupport) {
+    setInplaceUpdateSupport(nativeHandle_, inplaceUpdateSupport);
+    return this;
+  }
+
+  @Override
+  public boolean inplaceUpdateSupport() {
+    return inplaceUpdateSupport(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setInplaceUpdateNumLocks(
+      final long inplaceUpdateNumLocks) {
+    setInplaceUpdateNumLocks(nativeHandle_, inplaceUpdateNumLocks);
+    return this;
+  }
+
+  @Override
+  public long inplaceUpdateNumLocks() {
+    return inplaceUpdateNumLocks(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setMemtablePrefixBloomBits(
+      final int memtablePrefixBloomBits) {
+    setMemtablePrefixBloomBits(nativeHandle_, memtablePrefixBloomBits);
+    return this;
+  }
+
+  @Override
+  public int memtablePrefixBloomBits() {
+    return memtablePrefixBloomBits(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setMemtablePrefixBloomProbes(
+      final int memtablePrefixBloomProbes) {
+    setMemtablePrefixBloomProbes(nativeHandle_, memtablePrefixBloomProbes);
+    return this;
+  }
+
+  @Override
+  public int memtablePrefixBloomProbes() {
+    return memtablePrefixBloomProbes(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setBloomLocality(int bloomLocality) {
+    setBloomLocality(nativeHandle_, bloomLocality);
+    return this;
+  }
+
+  @Override
+  public int bloomLocality() {
+    return bloomLocality(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setMaxSuccessiveMerges(
+      final long maxSuccessiveMerges) {
+    setMaxSuccessiveMerges(nativeHandle_, maxSuccessiveMerges);
+    return this;
+  }
+
+  @Override
+  public long maxSuccessiveMerges() {
+    return maxSuccessiveMerges(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setMinPartialMergeOperands(
+      final int minPartialMergeOperands) {
+    setMinPartialMergeOperands(nativeHandle_, minPartialMergeOperands);
+    return this;
+  }
+
+  @Override
+  public int minPartialMergeOperands() {
+    return minPartialMergeOperands(nativeHandle_);
+  }
+
+  @Override
+  public ColumnFamilyOptions setOptimizeFiltersForHits(
+      final boolean optimizeFiltersForHits) {
+    setOptimizeFiltersForHits(nativeHandle_, optimizeFiltersForHits);
+    return this;
+  }
+
+  @Override
+  public boolean optimizeFiltersForHits() {
+    return optimizeFiltersForHits(nativeHandle_);
+  }
+
+  /**
+   * Release the memory allocated for the current instance
+   * in the c++ side.
+   */
+  @Override protected void disposeInternal() {
+    assert(isInitialized());
+    disposeInternal(nativeHandle_);
+  }
+
+  /**
+   * <p>Private constructor to be used by
+   * {@link #getColumnFamilyOptionsFromProps(java.util.Properties)}</p>
+   *
+   * @param handle native handle to ColumnFamilyOptions instance.
+   */
+  private ColumnFamilyOptions(final long handle) {
+    super();
+    nativeHandle_ = handle;
+  }
+
+  private static native long getColumnFamilyOptionsFromProps(
+      String optString);
+
+  private native void newColumnFamilyOptions();
+  private native void disposeInternal(long handle);
+
+  private native void optimizeForPointLookup(long handle,
+      long blockCacheSizeMb);
+  private native void optimizeLevelStyleCompaction(long handle,
+      long memtableMemoryBudget);
+  private native void optimizeUniversalStyleCompaction(long handle,
+      long memtableMemoryBudget);
+  private native void setComparatorHandle(long handle, int builtinComparator);
+  private native void setComparatorHandle(long optHandle, long comparatorHandle);
+  private native void setMergeOperatorName(
+      long handle, String name);
+  private native void setMergeOperator(
+      long handle, long mergeOperatorHandle);
+  private native void setCompactionFilterHandle(long handle, long compactionFilterHandle);
+  private native void setWriteBufferSize(long handle, long writeBufferSize)
+      throws IllegalArgumentException;
+  private native long writeBufferSize(long handle);
+  private native void setMaxWriteBufferNumber(
+      long handle, int maxWriteBufferNumber);
+  private native int maxWriteBufferNumber(long handle);
+  private native void setMinWriteBufferNumberToMerge(
+      long handle, int minWriteBufferNumberToMerge);
+  private native int minWriteBufferNumberToMerge(long handle);
+  private native void setCompressionType(long handle, byte compressionType);
+  private native byte compressionType(long handle);
+  private native void setCompressionPerLevel(long handle,
+      List<Byte> compressionLevels);
+  private native List<Byte> compressionPerLevel(long handle);
+  private native void useFixedLengthPrefixExtractor(
+      long handle, int prefixLength);
+  private native void useCappedPrefixExtractor(
+      long handle, int prefixLength);
+  private native void setNumLevels(
+      long handle, int numLevels);
+  private native int numLevels(long handle);
+  private native void setLevelZeroFileNumCompactionTrigger(
+      long handle, int numFiles);
+  private native int levelZeroFileNumCompactionTrigger(long handle);
+  private native void setLevelZeroSlowdownWritesTrigger(
+      long handle, int numFiles);
+  private native int levelZeroSlowdownWritesTrigger(long handle);
+  private native void setLevelZeroStopWritesTrigger(
+      long handle, int numFiles);
+  private native int levelZeroStopWritesTrigger(long handle);
+  private native void setTargetFileSizeBase(
+      long handle, long targetFileSizeBase);
+  private native long targetFileSizeBase(long handle);
+  private native void setTargetFileSizeMultiplier(
+      long handle, int multiplier);
+  private native int targetFileSizeMultiplier(long handle);
+  private native void setMaxBytesForLevelBase(
+      long handle, long maxBytesForLevelBase);
+  private native long maxBytesForLevelBase(long handle);
+  private native void setLevelCompactionDynamicLevelBytes(
+      long handle, boolean enableLevelCompactionDynamicLevelBytes);
+  private native boolean levelCompactionDynamicLevelBytes(
+      long handle);
+  private native void setMaxBytesForLevelMultiplier(
+      long handle, int multiplier);
+  private native int maxBytesForLevelMultiplier(long handle);
+  private native void setExpandedCompactionFactor(
+      long handle, int expandedCompactionFactor);
+  private native int expandedCompactionFactor(long handle);
+  private native void setSourceCompactionFactor(
+      long handle, int sourceCompactionFactor);
+  private native int sourceCompactionFactor(long handle);
+  private native void setMaxGrandparentOverlapFactor(
+      long handle, int maxGrandparentOverlapFactor);
+  private native int maxGrandparentOverlapFactor(long handle);
+  private native void setSoftRateLimit(
+      long handle, double softRateLimit);
+  private native double softRateLimit(long handle);
+  private native void setHardRateLimit(
+      long handle, double hardRateLimit);
+  private native double hardRateLimit(long handle);
+  private native void setRateLimitDelayMaxMilliseconds(
+      long handle, int rateLimitDelayMaxMilliseconds);
+  private native int rateLimitDelayMaxMilliseconds(long handle);
+  private native void setArenaBlockSize(
+      long handle, long arenaBlockSize)
+      throws IllegalArgumentException;
+  private native long arenaBlockSize(long handle);
+  private native void setDisableAutoCompactions(
+      long handle, boolean disableAutoCompactions);
+  private native boolean disableAutoCompactions(long handle);
+  private native void setCompactionStyle(long handle, byte compactionStyle);
+  private native byte compactionStyle(long handle);
+  private native void setMaxTableFilesSizeFIFO(
+      long handle, long max_table_files_size);
+  private native long maxTableFilesSizeFIFO(long handle);
+  private native void setPurgeRedundantKvsWhileFlush(
+      long handle, boolean purgeRedundantKvsWhileFlush);
+  private native boolean purgeRedundantKvsWhileFlush(long handle);
+  private native void setVerifyChecksumsInCompaction(
+      long handle, boolean verifyChecksumsInCompaction);
+  private native boolean verifyChecksumsInCompaction(long handle);
+  private native void setFilterDeletes(
+      long handle, boolean filterDeletes);
+  private native boolean filterDeletes(long handle);
+  private native void setMaxSequentialSkipInIterations(
+      long handle, long maxSequentialSkipInIterations);
+  private native long maxSequentialSkipInIterations(long handle);
+  private native void setMemTableFactory(long handle, long factoryHandle);
+  private native String memTableFactoryName(long handle);
+  private native void setTableFactory(long handle, long factoryHandle);
+  private native String tableFactoryName(long handle);
+  private native void setInplaceUpdateSupport(
+      long handle, boolean inplaceUpdateSupport);
+  private native boolean inplaceUpdateSupport(long handle);
+  private native void setInplaceUpdateNumLocks(
+      long handle, long inplaceUpdateNumLocks)
+      throws IllegalArgumentException;
+  private native long inplaceUpdateNumLocks(long handle);
+  private native void setMemtablePrefixBloomBits(
+      long handle, int memtablePrefixBloomBits);
+  private native int memtablePrefixBloomBits(long handle);
+  private native void setMemtablePrefixBloomProbes(
+      long handle, int memtablePrefixBloomProbes);
+  private native int memtablePrefixBloomProbes(long handle);
+  private native void setBloomLocality(
+      long handle, int bloomLocality);
+  private native int bloomLocality(long handle);
+  private native void setMaxSuccessiveMerges(
+      long handle, long maxSuccessiveMerges)
+      throws IllegalArgumentException;
+  private native long maxSuccessiveMerges(long handle);
+  private native void setMinPartialMergeOperands(
+      long handle, int minPartialMergeOperands);
+  private native int minPartialMergeOperands(long handle);
+  private native void setOptimizeFiltersForHits(long handle,
+      boolean optimizeFiltersForHits);
+  private native boolean optimizeFiltersForHits(long handle);
+
+  MemTableConfig memTableConfig_;
+  TableFormatConfig tableFormatConfig_;
+  AbstractComparator<? extends AbstractSlice<?>> comparator_;
+  AbstractCompactionFilter<? extends AbstractSlice<?>> compactionFilter_;
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyOptionsInterface.java b/src/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyOptionsInterface.java
new file mode 100644
index 0000000..1c7a5a1
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyOptionsInterface.java
@@ -0,0 +1,1182 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+import java.util.List;
+
+public interface ColumnFamilyOptionsInterface {
+
+  /**
+   * Use this if you don't need to keep the data sorted, i.e. you'll never use
+   * an iterator, only Put() and Get() API calls
+   *
+   * @param blockCacheSizeMb Block cache size in MB
+   * @return the instance of the current Object.
+   */
+  Object optimizeForPointLookup(long blockCacheSizeMb);
+
+  /**
+   * <p>Default values for some parameters in ColumnFamilyOptions are not
+   * optimized for heavy workloads and big datasets, which means you might
+   * observe write stalls under some conditions. As a starting point for tuning
+   * RocksDB options, use the following for level style compaction.</p>
+   *
+   * <p>Make sure to also call IncreaseParallelism(), which will provide the
+   * biggest performance gains.</p>
+   * <p>Note: we might use more memory than memtable_memory_budget during high
+   * write rate period</p>
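+   *
+   * <p>For example (a sketch; further tuning is workload specific):</p>
+   * <pre>{@code
+   * ColumnFamilyOptions cfOptions = new ColumnFamilyOptions();
+   * cfOptions.optimizeLevelStyleCompaction();
+   * }</pre>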
+   *
+   * @return the instance of the current Object.
+   */
+  Object optimizeLevelStyleCompaction();
+
+  /**
+   * <p>Default values for some parameters in ColumnFamilyOptions are not
+   * optimized for heavy workloads and big datasets, which means you might
+   * observe write stalls under some conditions. As a starting point for tuning
+   * RocksDB options, use the following for level style compaction.</p>
+   *
+   * <p>Make sure to also call IncreaseParallelism(), which will provide the
+   * biggest performance gains.</p>
+   * <p>Note: we might use more memory than memtable_memory_budget during high
+   * write rate period</p>
+   *
+   * @param memtableMemoryBudget memory budget in bytes
+   * @return the instance of the current Object.
+   */
+  Object optimizeLevelStyleCompaction(long memtableMemoryBudget);
+
+  /**
+   * <p>Default values for some parameters in ColumnFamilyOptions are not
+   * optimized for heavy workloads and big datasets, which means you might
+   * observe write stalls under some conditions. As a starting point for tuning
+   * RocksDB options, use the following for universal style compaction.</p>
+   *
+   * <p>Universal style compaction is focused on reducing Write Amplification
+   * Factor for big data sets, but increases Space Amplification.</p>
+   *
+   * <p>Make sure to also call IncreaseParallelism(), which will provide the
+   * biggest performance gains.</p>
+   *
+   * <p>Note: we might use more memory than memtable_memory_budget during high
+   * write rate period</p>
+   *
+   * @return the instance of the current Object.
+   */
+  Object optimizeUniversalStyleCompaction();
+
+  /**
+   * <p>Default values for some parameters in ColumnFamilyOptions are not
+   * optimized for heavy workloads and big datasets, which means you might
+   * observe write stalls under some conditions. As a starting point for tuning
+   * RocksDB options, use the following for universal style compaction.</p>
+   *
+   * <p>Universal style compaction is focused on reducing Write Amplification
+   * Factor for big data sets, but increases Space Amplification.</p>
+   *
+   * <p>Make sure to also call IncreaseParallelism(), which will provide the
+   * biggest performance gains.</p>
+   *
+   * <p>Note: we might use more memory than memtable_memory_budget during high
+   * write rate period</p>
+   *
+   * @param memtableMemoryBudget memory budget in bytes
+   * @return the instance of the current Object.
+   */
+  Object optimizeUniversalStyleCompaction(long memtableMemoryBudget);
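+
+  // Illustrative sketch, not part of the upstream file: tuning a column
+  // family for level-style compaction. Assumes ColumnFamilyOptions
+  // implements this interface, as elsewhere in the org.rocksdb package.
+  //
+  //   ColumnFamilyOptions cfOpts = new ColumnFamilyOptions();
+  //   cfOpts.optimizeLevelStyleCompaction(512 * 1024 * 1024); // 512MB budget
+  //   // Pair with DBOptions#setIncreaseParallelism(int) for the biggest
+  //   // performance gains, as noted above.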
+
+  /**
+   * Set {@link BuiltinComparator} to be used with RocksDB.
+   *
+   * Note: Comparator can be set once upon database creation.
+   *
+   * Default: BytewiseComparator.
+   * @param builtinComparator a {@link BuiltinComparator} type.
+   * @return the instance of the current Object.
+   */
+  Object setComparator(BuiltinComparator builtinComparator);
+
+  /**
+   * Use the specified comparator for key ordering.
+   *
+   * The comparator must not be disposed before the options instances that use
+   * it are disposed. If dispose() is never called, the comparator object will
+   * be GC'd automatically.
+   *
+   * Comparator instance can be re-used in multiple options instances.
+   *
+   * @param comparator java instance.
+   * @return the instance of the current Object.
+   */
+  Object setComparator(AbstractComparator<? extends AbstractSlice<?>> comparator);
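+
+  // Illustrative sketch, not part of the upstream file: installing a custom
+  // comparator. Assumes the name()/compare(Slice, Slice) methods declared
+  // abstract by AbstractComparator, as used by org.rocksdb.Comparator.
+  //
+  //   Comparator cmp = new Comparator(new ComparatorOptions()) {
+  //     @Override public String name() { return "example.bytewise"; }
+  //     @Override public int compare(Slice a, Slice b) {
+  //       return a.toString().compareTo(b.toString());
+  //     }
+  //   };
+  //   columnFamilyOptions.setComparator(cmp); // keep cmp alive until the
+  //   // options instance is disposed (see note above).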
+
+  /**
+   * <p>Set the merge operator to be used for merging two merge operands
+   * of the same key. The merge function is invoked during
+   * compaction and at lookup time, if multiple key/value pairs belonging
+   * to the same key are found in the database.</p>
+   *
+   * @param name the name of the merge function, as defined by
+   * the MergeOperators factory (see utilities/MergeOperators.h).
+   * The merge function is specified by name and must be one of the
+   * standard merge operators provided by RocksDB. The available
+   * operators are "put", "uint64add", "stringappend" and "stringappendtest".
+   * @return the instance of the current Object.
+   */
+  Object setMergeOperatorName(String name);
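+
+  // Usage sketch, not part of the upstream file: counters via the built-in
+  // "uint64add" operator named above.
+  //
+  //   columnFamilyOptions.setMergeOperatorName("uint64add");
+  //   // db.merge(key, value) will then add 64-bit integer operands during
+  //   // compaction and lookup instead of overwriting the old value.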
+
+  /**
+   * <p>Set the merge operator to be used for merging two different key/value
+   * pairs that share the same key. The merge function is invoked during
+   * compaction and at lookup time, if multiple key/value pairs belonging
+   * to the same key are found in the database.</p>
+   *
+   * @param mergeOperator {@link MergeOperator} instance.
+   * @return the instance of the current Object.
+   */
+  Object setMergeOperator(MergeOperator mergeOperator);
+
+  /**
+   * Amount of data to build up in memory (backed by an unsorted log
+   * on disk) before converting to a sorted on-disk file.
+   *
+   * Larger values increase performance, especially during bulk loads.
+   * Up to {@code max_write_buffer_number} write buffers may be held in memory
+   * at the same time, so you may wish to adjust this parameter
+   * to control memory usage.
+   *
+   * Also, a larger write buffer will result in a longer recovery time
+   * the next time the database is opened.
+   *
+   * Default: 4MB
+   * @param writeBufferSize the size of write buffer.
+   * @return the instance of the current Object.
+   * @throws java.lang.IllegalArgumentException thrown on 32-Bit platforms
+   *   while overflowing the underlying platform specific value.
+   */
+  Object setWriteBufferSize(long writeBufferSize);
+
+  /**
+   * Returns the size of the write buffer.
+   *
+   * @return the size of the write buffer.
+   * @see #setWriteBufferSize(long)
+   */
+  long writeBufferSize();
+
+  /**
+   * The maximum number of write buffers that are built up in memory.
+   * The default is 2, so that when 1 write buffer is being flushed to
+   * storage, new writes can continue to the other write buffer.
+   * Default: 2
+   *
+   * @param maxWriteBufferNumber maximum number of write buffers.
+   * @return the instance of the current Object.
+   */
+  Object setMaxWriteBufferNumber(
+      int maxWriteBufferNumber);
+
+  /**
+   * Returns maximum number of write buffers.
+   *
+   * @return maximum number of write buffers.
+   * @see #setMaxWriteBufferNumber(int)
+   */
+  int maxWriteBufferNumber();
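+
+  // Worked sketch, not part of the upstream file: bounding memtable memory.
+  // With the values below, up to 4 write buffers of 64MB each may be held
+  // in memory, i.e. roughly 256MB of memtable memory per column family.
+  //
+  //   columnFamilyOptions.setWriteBufferSize(64 * 1024 * 1024);
+  //   columnFamilyOptions.setMaxWriteBufferNumber(4);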
+
+  /**
+   * The minimum number of write buffers that will be merged together
+   * before writing to storage.  If set to 1, then
+   * all write buffers are flushed to L0 as individual files and this increases
+   * read amplification because a get request has to check in all of these
+   * files. Also, an in-memory merge may result in writing less
+   * data to storage if there are duplicate records in each of these
+   * individual write buffers.  Default: 1
+   *
+   * @param minWriteBufferNumberToMerge the minimum number of write buffers
+   *     that will be merged together.
+   * @return the reference to the current option.
+   */
+  Object setMinWriteBufferNumberToMerge(
+      int minWriteBufferNumberToMerge);
+
+  /**
+   * The minimum number of write buffers that will be merged together
+   * before writing to storage.  If set to 1, then
+   * all write buffers are flushed to L0 as individual files and this increases
+   * read amplification because a get request has to check in all of these
+   * files. Also, an in-memory merge may result in writing less
+   * data to storage if there are duplicate records in each of these
+   * individual write buffers.  Default: 1
+   *
+   * @return the minimum number of write buffers that will be merged together.
+   */
+  int minWriteBufferNumberToMerge();
+
+  /**
+   * This prefix-extractor uses the first n bytes of a key as its prefix.
+   *
+   * In some hash-based memtable representations such as HashLinkedList
+   * and HashSkipList, prefixes are used to partition the keys into
+   * several buckets.  The prefix extractor specifies how to
+   * extract the prefix given a key.
+   *
+   * @param n use the first n bytes of a key as its prefix.
+   * @return the reference to the current option.
+   */
+  Object useFixedLengthPrefixExtractor(int n);
+
+
+  /**
+   * Same as the fixed-length prefix extractor, except that when the slice is
+   * shorter than the fixed length, the full key is used.
+   *
+   * @param n use the first n bytes of a key as its prefix.
+   * @return the reference to the current option.
+   */
+  Object useCappedPrefixExtractor(int n);
+
+  /**
+   * Compress blocks using the specified compression algorithm.  This
+   * parameter can be changed dynamically.
+   *
+   * Default: SNAPPY_COMPRESSION, which gives lightweight but fast compression.
+   *
+   * @param compressionType Compression Type.
+   * @return the reference to the current option.
+   */
+  Object setCompressionType(CompressionType compressionType);
+
+  /**
+   * Compress blocks using the specified compression algorithm.  This
+   * parameter can be changed dynamically.
+   *
+   * Default: SNAPPY_COMPRESSION, which gives lightweight but fast compression.
+   *
+   * @return Compression type.
+   */
+  CompressionType compressionType();
+
+  /**
+   * <p>Different levels can have different compression
+   * policies. There are cases where most lower levels
+   * would like to use quick compression algorithms while
+   * the higher levels (which have more data) use
+   * compression algorithms that have better compression
+   * but could be slower. This array, if non-empty, should
+   * have an entry for each level of the database;
+   * these override the value specified in the previous
+   * field 'compression'.</p>
+   *
+   * <strong>NOTICE</strong>
+   * <p>If {@code level_compaction_dynamic_level_bytes=true},
+   * {@code compression_per_level[0]} still determines {@code L0},
+   * but other elements of the array are based on base level
+   * (the level {@code L0} files are merged to), and may not
+   * match the level users see in the info log metadata.
+   * </p>
+   * <p>If {@code L0} files are merged to level {@code n},
+   * then, for {@code i>0}, {@code compression_per_level[i]}
+   * determines compaction type for level {@code n+i-1}.</p>
+   *
+   * <strong>Example</strong>
+   * <p>For example, if we have 5 levels, and we determine to
+   * merge {@code L0} data to {@code L4} (which means {@code L1..L3}
+   * will be empty), then new files written to {@code L4} will use
+   * compression type {@code compression_per_level[1]}.</p>
+   *
+   * <p>If {@code L0} is later merged to {@code L2}, data going to
+   * {@code L2} will be compressed according to
+   * {@code compression_per_level[1]}, {@code L3} using
+   * {@code compression_per_level[2]} and {@code L4} using
+   * {@code compression_per_level[3]}. The compression type for each
+   * level can change as data grows.</p>
+   *
+   * <p><strong>Default:</strong> empty</p>
+   *
+   * @param compressionLevels list of
+   *     {@link org.rocksdb.CompressionType} instances.
+   *
+   * @return the reference to the current option.
+   */
+  Object setCompressionPerLevel(
+      List<CompressionType> compressionLevels);
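+
+  // Illustrative sketch, not part of the upstream file: cheap compression
+  // on the small, hot levels and stronger compression on the big, cold
+  // ones, assuming a database configured with setNumLevels(3).
+  //
+  //   List<CompressionType> perLevel = new java.util.ArrayList<>();
+  //   perLevel.add(CompressionType.NO_COMPRESSION);     // L0
+  //   perLevel.add(CompressionType.SNAPPY_COMPRESSION); // L1
+  //   perLevel.add(CompressionType.ZLIB_COMPRESSION);   // L2
+  //   columnFamilyOptions.setCompressionPerLevel(perLevel);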
+
+  /**
+   * <p>Return the currently set {@link org.rocksdb.CompressionType}
+   * per level.</p>
+   *
+   * <p>See: {@link #setCompressionPerLevel(java.util.List)}</p>
+   *
+   * @return list of {@link org.rocksdb.CompressionType}
+   *     instances.
+   */
+  List<CompressionType> compressionPerLevel();
+
+  /**
+   * Set the number of levels for this database.
+   * If level-style compaction is used, then this number determines
+   * the total number of levels.
+   *
+   * @param numLevels the number of levels.
+   * @return the reference to the current option.
+   */
+  Object setNumLevels(int numLevels);
+
+  /**
+   * If level-style compaction is used, then this number determines
+   * the total number of levels.
+   *
+   * @return the number of levels.
+   */
+  int numLevels();
+
+  /**
+   * Number of files to trigger level-0 compaction. A value < 0 means that
+   * level-0 compaction will not be triggered by the number of files at all.
+   * Default: 4
+   *
+   * @param numFiles the number of files in level-0 to trigger compaction.
+   * @return the reference to the current option.
+   */
+  Object setLevelZeroFileNumCompactionTrigger(
+      int numFiles);
+
+  /**
+   * The number of files in level 0 to trigger compaction from level-0 to
+   * level-1.  A value < 0 means that level-0 compaction will not be
+   * triggered by the number of files at all.
+   * Default: 4
+   *
+   * @return the number of files in level 0 to trigger compaction.
+   */
+  int levelZeroFileNumCompactionTrigger();
+
+  /**
+   * Soft limit on number of level-0 files. We start slowing down writes at this
+   * point. A value < 0 means that no write slowdown will be triggered by
+   * the number of files in level-0.
+   *
+   * @param numFiles soft limit on number of level-0 files.
+   * @return the reference to the current option.
+   */
+  Object setLevelZeroSlowdownWritesTrigger(
+      int numFiles);
+
+  /**
+   * Soft limit on the number of level-0 files. We start slowing down writes
+   * at this point. A value < 0 means that no write slowdown will be
+   * triggered by the number of files in level-0.
+   *
+   * @return the soft limit on the number of level-0 files.
+   */
+  int levelZeroSlowdownWritesTrigger();
+
+  /**
+   * Maximum number of level-0 files.  We stop writes at this point.
+   *
+   * @param numFiles the hard limit of the number of level-0 files.
+   * @return the reference to the current option.
+   */
+  Object setLevelZeroStopWritesTrigger(int numFiles);
+
+  /**
+   * Maximum number of level-0 files.  We stop writes at this point.
+   *
+   * @return the hard limit of the number of level-0 files.
+   */
+  int levelZeroStopWritesTrigger();
+
+  /**
+   * This does nothing anymore. Deprecated.
+   *
+   * @param maxMemCompactionLevel Unused.
+   *
+   * @return the reference to the current option.
+   */
+  @Deprecated
+  Object setMaxMemCompactionLevel(
+      int maxMemCompactionLevel);
+
+  /**
+   * This does nothing anymore. Deprecated.
+   *
+   * @return Always returns 0.
+   */
+  @Deprecated
+  int maxMemCompactionLevel();
+
+  /**
+   * The target file size for compaction.
+   * This targetFileSizeBase determines a level-1 file size.
+   * Target file size for level L can be calculated by
+   * targetFileSizeBase * (targetFileSizeMultiplier ^ (L-1))
+   * For example, if targetFileSizeBase is 2MB and
+   * targetFileSizeMultiplier is 10, then each file on level-1 will
+   * be 2MB, and each file on level-2 will be 20MB,
+   * and each file on level-3 will be 200MB.
+   * By default, targetFileSizeBase is 2MB.
+   *
+   * @param targetFileSizeBase the target size of a level-1 file.
+   * @return the reference to the current option.
+   *
+   * @see #setTargetFileSizeMultiplier(int)
+   */
+  Object setTargetFileSizeBase(long targetFileSizeBase);
+
+  /**
+   * The target file size for compaction.
+   * This targetFileSizeBase determines a level-1 file size.
+   * Target file size for level L can be calculated by
+   * targetFileSizeBase * (targetFileSizeMultiplier ^ (L-1))
+   * For example, if targetFileSizeBase is 2MB and
+   * targetFileSizeMultiplier is 10, then each file on level-1 will
+   * be 2MB, and each file on level-2 will be 20MB,
+   * and each file on level-3 will be 200MB.
+   * By default, targetFileSizeBase is 2MB.
+   *
+   * @return the target size of a level-1 file.
+   *
+   * @see #targetFileSizeMultiplier()
+   */
+  long targetFileSizeBase();
+
+  /**
+   * targetFileSizeMultiplier defines the size ratio between a
+   * level-L file and level-(L+1) file.
+   * By default target_file_size_multiplier is 1, meaning
+   * files in different levels have the same target.
+   *
+   * @param multiplier the size ratio between a level-(L+1) file
+   *     and level-L file.
+   * @return the reference to the current option.
+   */
+  Object setTargetFileSizeMultiplier(int multiplier);
+
+  /**
+   * targetFileSizeMultiplier defines the size ratio between a
+   * level-(L+1) file and level-L file.
+   * By default targetFileSizeMultiplier is 1, meaning
+   * files in different levels have the same target.
+   *
+   * @return the size ratio between a level-(L+1) file and level-L file.
+   */
+  int targetFileSizeMultiplier();
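+
+  // Worked sketch, not part of the upstream file: with a 2MB base and a
+  // multiplier of 4, target file sizes follow
+  // targetFileSizeBase * targetFileSizeMultiplier^(L-1):
+  //
+  //   columnFamilyOptions.setTargetFileSizeBase(2 * 1024 * 1024); // 2MB
+  //   columnFamilyOptions.setTargetFileSizeMultiplier(4);
+  //   // level-1 files ~2MB, level-2 ~8MB, level-3 ~32MB, and so on.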
+
+  /**
+   * The upper-bound of the total size of level-1 files in bytes.
+   * Maximum number of bytes for level L can be calculated as
+   * (maxBytesForLevelBase) * (maxBytesForLevelMultiplier ^ (L-1))
+   * For example, if maxBytesForLevelBase is 20MB, and if
+   * max_bytes_for_level_multiplier is 10, total data size for level-1
+   * will be 20MB, total file size for level-2 will be 200MB,
+   * and total file size for level-3 will be 2GB.
+   * By default, 'maxBytesForLevelBase' is 10MB.
+   *
+   * @param maxBytesForLevelBase maximum bytes for level base.
+   *
+   * @return the reference to the current option.
+   * @see #setMaxBytesForLevelMultiplier(int)
+   */
+  Object setMaxBytesForLevelBase(
+      long maxBytesForLevelBase);
+
+  /**
+   * The upper-bound of the total size of level-1 files in bytes.
+   * Maximum number of bytes for level L can be calculated as
+   * (maxBytesForLevelBase) * (maxBytesForLevelMultiplier ^ (L-1))
+   * For example, if maxBytesForLevelBase is 20MB, and if
+   * max_bytes_for_level_multiplier is 10, total data size for level-1
+   * will be 20MB, total file size for level-2 will be 200MB,
+   * and total file size for level-3 will be 2GB.
+   * By default, 'maxBytesForLevelBase' is 10MB.
+   *
+   * @return the upper-bound of the total size of level-1 files
+   *     in bytes.
+   * @see #maxBytesForLevelMultiplier()
+   */
+  long maxBytesForLevelBase();
+
+  /**
+   * <p>If {@code true}, RocksDB will pick target size of each level
+   * dynamically. We will pick a base level b >= 1. L0 will be
+   * directly merged into level b, instead of always into level 1.
+   * Levels 1 to b-1 need to be empty. We try to pick b and its target
+   * size so that</p>
+   *
+   * <ol>
+   * <li>target size is in the range of
+   *   (max_bytes_for_level_base / max_bytes_for_level_multiplier,
+   *    max_bytes_for_level_base]</li>
+   * <li>target size of the last level (level num_levels-1) equals the actual
+   *    size of the level.</li>
+   * </ol>
+   *
+   * <p>At the same time max_bytes_for_level_multiplier and
+   * max_bytes_for_level_multiplier_additional are still satisfied.</p>
+   *
+   * <p>With this option on, starting from an empty DB, we make the last level
+   * the base level, which means merging L0 data into the last level, until it
+   * exceeds max_bytes_for_level_base. Then we make the second-last level the
+   * base level and start merging L0 data into it, with its target size set to
+   * {@code 1/max_bytes_for_level_multiplier} of the last level's actual size.
+   * As data accumulates further, the base level moves to the third-last
+   * level, and so on.</p>
+   *
+   * <h2>Example</h2>
+   * <p>For example, assume {@code max_bytes_for_level_multiplier=10},
+   * {@code num_levels=6}, and {@code max_bytes_for_level_base=10MB}.</p>
+   *
+   * <p>Target sizes of levels 1 to 5 start with:</p>
+   * {@code [- - - - 10MB]}
+   * <p>with the base level being level 5. Target sizes of levels 1 to 4 are
+   * not applicable because those levels will not be used.
+   * Once the size of level 5 grows beyond 10MB, say to 11MB, we move the
+   * base level to level 4 and the targets look like:</p>
+   * {@code [- - - 1.1MB 11MB]}
+   * <p>As data accumulates, size targets are tuned based on the actual data
+   * size of level 5. When level 5 has 50MB of data, the targets look like:</p>
+   * {@code [- - - 5MB 50MB]}
+   * <p>Once level 5's actual size exceeds 100MB, say at 101MB, keeping
+   * level 4 as the base level would require its target size to be 10.1MB,
+   * which doesn't satisfy the target size range. So we make level 3
+   * the base level and the target sizes of the levels look like:</p>
+   * {@code [- - 1.01MB 10.1MB 101MB]}
+   * <p>In the same way, as level 5 grows further, all levels' targets grow,
+   * like</p>
+   * {@code [- - 5MB 50MB 500MB]}
+   * <p>Once level 5 exceeds 1000MB, say at 1001MB, we make level 2 the
+   * base level and the levels' target sizes become:</p>
+   * {@code [- 1.001MB 10.01MB 100.1MB 1001MB]}
+   * <p>and so on...</p>
+   *
+   * <p>By doing this, we give {@code max_bytes_for_level_multiplier} priority
+   * over {@code max_bytes_for_level_base}, for a more predictable LSM tree
+   * shape. This is useful for limiting worst-case space amplification.</p>
+   *
+   * <p>{@code max_bytes_for_level_multiplier_additional} is ignored with
+   * this flag on.</p>
+   *
+   * <p>Turning this feature on or off for an existing DB can cause unexpected
+   * LSM tree structure so it's not recommended.</p>
+   *
+   * <p><strong>Caution</strong>: this option is experimental</p>
+   *
+   * <p>Default: false</p>
+   *
+   * @param enableLevelCompactionDynamicLevelBytes boolean value indicating
+   *     if {@code LevelCompactionDynamicLevelBytes} shall be enabled.
+   * @return the reference to the current option.
+   */
+  Object setLevelCompactionDynamicLevelBytes(
+      boolean enableLevelCompactionDynamicLevelBytes);
+
+  /**
+   * <p>Return if {@code LevelCompactionDynamicLevelBytes} is enabled.
+   * </p>
+   *
+   * <p>For further information see
+   * {@link #setLevelCompactionDynamicLevelBytes(boolean)}</p>
+   *
+   * @return boolean value indicating if
+   *    {@code levelCompactionDynamicLevelBytes} is enabled.
+   */
+  boolean levelCompactionDynamicLevelBytes();
+
+  /**
+   * The ratio between the total size of level-(L+1) files and the total
+   * size of level-L files for all L.
+   * DEFAULT: 10
+   *
+   * @param multiplier the ratio between the total size of level-(L+1)
+   *     files and the total size of level-L files for all L.
+   * @return the reference to the current option.
+   * @see #setMaxBytesForLevelBase(long)
+   */
+  Object setMaxBytesForLevelMultiplier(int multiplier);
+
+  /**
+   * The ratio between the total size of level-(L+1) files and the total
+   * size of level-L files for all L.
+   * DEFAULT: 10
+   *
+   * @return the ratio between the total size of level-(L+1) files and
+   *     the total size of level-L files for all L.
+   * @see #maxBytesForLevelBase()
+   */
+  int maxBytesForLevelMultiplier();
+
+  /**
+   * Maximum number of bytes in all compacted files.  We avoid expanding
+   * the lower level file set of a compaction if it would make the
+   * total compaction cover more than
+   * (expanded_compaction_factor * targetFileSizeLevel()) bytes.
+   *
+   * @param expandedCompactionFactor the maximum number of bytes in all
+   *     compacted files.
+   * @return the reference to the current option.
+   * @see #setSourceCompactionFactor(int)
+   */
+  Object setExpandedCompactionFactor(int expandedCompactionFactor);
+
+  /**
+   * Maximum number of bytes in all compacted files.  We avoid expanding
+   * the lower level file set of a compaction if it would make the
+   * total compaction cover more than
+   * (expanded_compaction_factor * targetFileSizeLevel()) bytes.
+   *
+   * @return the maximum number of bytes in all compacted files.
+   * @see #sourceCompactionFactor()
+   */
+  int expandedCompactionFactor();
+
+  /**
+   * Maximum number of bytes in all source files to be compacted in a
+   * single compaction run. We avoid picking too many files in the
+   * source level so that the total source bytes for the compaction
+   * do not exceed
+   * (source_compaction_factor * targetFileSizeLevel()) bytes.
+   * Default: 1, i.e. pick at most maxfilesize of data as the source of
+   * a compaction.
+   *
+   * @param sourceCompactionFactor the maximum number of bytes in all
+   *     source files to be compacted in a single compaction run.
+   * @return the reference to the current option.
+   * @see #setExpandedCompactionFactor(int)
+   */
+  Object setSourceCompactionFactor(int sourceCompactionFactor);
+
+  /**
+   * Maximum number of bytes in all source files to be compacted in a
+   * single compaction run. We avoid picking too many files in the
+   * source level so that the total source bytes for the compaction
+   * do not exceed
+   * (source_compaction_factor * targetFileSizeLevel()) bytes.
+   * Default: 1, i.e. pick at most maxfilesize of data as the source of
+   * a compaction.
+   *
+   * @return the maximum number of bytes in all source files to be compacted.
+   * @see #expandedCompactionFactor()
+   */
+  int sourceCompactionFactor();
+
+  /**
+   * Control maximum bytes of overlaps in grandparent (i.e., level+2) before we
+   * stop building a single file in a level->level+1 compaction.
+   *
+   * @param maxGrandparentOverlapFactor maximum bytes of overlaps in
+   *     "grandparent" level.
+   * @return the reference to the current option.
+   */
+  Object setMaxGrandparentOverlapFactor(
+      int maxGrandparentOverlapFactor);
+
+  /**
+   * Control maximum bytes of overlaps in grandparent (i.e., level+2) before we
+   * stop building a single file in a level->level+1 compaction.
+   *
+   * @return maximum bytes of overlaps in "grandparent" level.
+   */
+  int maxGrandparentOverlapFactor();
+
+  /**
+   * Puts are delayed 0-1 ms when any level has a compaction score that exceeds
+   * soft_rate_limit. This is ignored when == 0.0.
+   * CONSTRAINT: soft_rate_limit ≤ hard_rate_limit. If this constraint does not
+   * hold, RocksDB will set soft_rate_limit = hard_rate_limit.
+   * Default: 0 (disabled)
+   *
+   * @param softRateLimit the soft-rate-limit of a compaction score
+   *     for put delay.
+   * @return the reference to the current option.
+   */
+  Object setSoftRateLimit(double softRateLimit);
+
+  /**
+   * Puts are delayed 0-1 ms when any level has a compaction score that exceeds
+   * soft_rate_limit. This is ignored when == 0.0.
+   * CONSTRAINT: soft_rate_limit ≤ hard_rate_limit. If this constraint does not
+   * hold, RocksDB will set soft_rate_limit = hard_rate_limit.
+   * Default: 0 (disabled)
+   *
+   * @return soft-rate-limit for put delay.
+   */
+  double softRateLimit();
+
+  /**
+   * Puts are delayed 1ms at a time when any level has a compaction score that
+   * exceeds hard_rate_limit. This is ignored when ≤ 1.0.
+   * Default: 0 (disabled)
+   *
+   * @param hardRateLimit the hard-rate-limit of a compaction score for put
+   *     delay.
+   * @return the reference to the current option.
+   */
+  Object setHardRateLimit(double hardRateLimit);
+
+  /**
+   * Puts are delayed 1ms at a time when any level has a compaction score that
+   * exceeds hard_rate_limit. This is ignored when ≤ 1.0.
+   * Default: 0 (disabled)
+   *
+   * @return the hard-rate-limit of a compaction score for put delay.
+   */
+  double hardRateLimit();
+
+  /**
+   * The maximum time interval a put will be stalled when hard_rate_limit
+   * is enforced. If 0, then there is no limit.
+   * Default: 1000
+   *
+   * @param rateLimitDelayMaxMilliseconds the maximum time interval a put
+   *     will be stalled.
+   * @return the reference to the current option.
+   */
+  Object setRateLimitDelayMaxMilliseconds(
+      int rateLimitDelayMaxMilliseconds);
+
+  /**
+   * The maximum time interval a put will be stalled when hard_rate_limit
+   * is enforced.  If 0, then there is no limit.
+   * Default: 1000
+   *
+   * @return the maximum time interval a put will be stalled when
+   *     hard_rate_limit is enforced.
+   */
+  int rateLimitDelayMaxMilliseconds();
+
+  /**
+   * The size of one block in arena memory allocation.
+   * If ≤ 0, a proper value is automatically calculated (usually 1/10 of
+   * write_buffer_size).
+   *
+   * There are two additional restrictions on the specified size:
+   * (1) the size should be in the range [4096, 2 << 30], and
+   * (2) it should be a multiple of the CPU word size (which helps with
+   * memory alignment).
+   *
+   * We'll automatically check and adjust the size number to make sure it
+   * conforms to the restrictions.
+   * Default: 0
+   *
+   * @param arenaBlockSize the size of an arena block
+   * @return the reference to the current option.
+   * @throws java.lang.IllegalArgumentException thrown on 32-Bit platforms
+   *   while overflowing the underlying platform specific value.
+   */
+  Object setArenaBlockSize(long arenaBlockSize);
+
+  /**
+   * The size of one block in arena memory allocation.
+   * If ≤ 0, a proper value is automatically calculated (usually 1/10 of
+   * write_buffer_size).
+   *
+   * There are two additional restrictions on the specified size:
+   * (1) the size should be in the range [4096, 2 << 30], and
+   * (2) it should be a multiple of the CPU word size (which helps with
+   * memory alignment).
+   *
+   * We'll automatically check and adjust the size number to make sure it
+   * conforms to the restrictions.
+   * Default: 0
+   *
+   * @return the size of an arena block
+   */
+  long arenaBlockSize();
+
+  /**
+   * Disable automatic compactions. Manual compactions can still
+   * be issued on this column family
+   *
+   * @param disableAutoCompactions true if auto-compactions are disabled.
+   * @return the reference to the current option.
+   */
+  Object setDisableAutoCompactions(boolean disableAutoCompactions);
+
+  /**
+   * Disable automatic compactions. Manual compactions can still
+   * be issued on this column family
+   *
+   * @return true if auto-compactions are disabled.
+   */
+  boolean disableAutoCompactions();
+
+  /**
+   * Purge duplicate/deleted keys when a memtable is flushed to storage.
+   * Default: true
+   *
+   * @param purgeRedundantKvsWhileFlush true if purging keys is enabled.
+   * @return the reference to the current option.
+   */
+  Object setPurgeRedundantKvsWhileFlush(
+      boolean purgeRedundantKvsWhileFlush);
+
+  /**
+   * Purge duplicate/deleted keys when a memtable is flushed to storage.
+   * Default: true
+   *
+   * @return true if purging keys is enabled.
+   */
+  boolean purgeRedundantKvsWhileFlush();
+
+  /**
+   * Set compaction style for DB.
+   *
+   * Default: LEVEL.
+   *
+   * @param compactionStyle Compaction style.
+   * @return the reference to the current option.
+   */
+  Object setCompactionStyle(CompactionStyle compactionStyle);
+
+  /**
+   * Compaction style for DB.
+   *
+   * @return Compaction style.
+   */
+  CompactionStyle compactionStyle();
+
+  /**
+   * FIFO compaction option.
+   * The oldest table file will be deleted
+   * once the sum of table files reaches this size.
+   * The default value is 1GB (1 * 1024 * 1024 * 1024).
+   *
+   * @param maxTableFilesSize the size limit of the total sum of table files.
+   * @return the instance of the current Object.
+   */
+  Object setMaxTableFilesSizeFIFO(long maxTableFilesSize);
+
+  /**
+   * FIFO compaction option.
+   * The oldest table file will be deleted
+   * once the sum of table files reaches this size.
+   * The default value is 1GB (1 * 1024 * 1024 * 1024).
+   *
+   * @return the size limit of the total sum of table files.
+   */
+  long maxTableFilesSizeFIFO();
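+
+  // Illustrative sketch, not part of the upstream file: FIFO compaction for
+  // TTL-like event-log data, capping the total table file size at 2GB.
+  //
+  //   columnFamilyOptions.setCompactionStyle(CompactionStyle.FIFO);
+  //   columnFamilyOptions.setMaxTableFilesSizeFIFO(2L * 1024 * 1024 * 1024);
+  //   // Once the sum of table file sizes exceeds 2GB, the oldest table
+  //   // files are deleted first.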
+
+  /**
+   * If true, compaction will verify checksum on every read that happens
+   * as part of compaction.
+   * Default: true
+   *
+   * @param verifyChecksumsInCompaction true if compaction verifies
+   *     checksum on every read.
+   * @return the reference to the current option.
+   */
+  Object setVerifyChecksumsInCompaction(
+      boolean verifyChecksumsInCompaction);
+
+  /**
+   * If true, compaction will verify checksum on every read that happens
+   * as part of compaction.
+   * Default: true
+   *
+   * @return true if compaction verifies checksum on every read.
+   */
+  boolean verifyChecksumsInCompaction();
+
+  /**
+   * Use KeyMayExist API to filter deletes when this is true.
+   * If KeyMayExist returns false, i.e. the key definitely does not exist, then
+   * the delete is a noop. KeyMayExist only incurs an in-memory lookup.
+   * This optimization avoids writing the delete to storage when appropriate.
+   * Default: false
+   *
+   * @param filterDeletes true if filter-deletes behavior is on.
+   * @return the reference to the current option.
+   */
+  Object setFilterDeletes(boolean filterDeletes);
+
+  /**
+   * Use KeyMayExist API to filter deletes when this is true.
+   * If KeyMayExist returns false, i.e. the key definitely does not exist, then
+   * the delete is a noop. KeyMayExist only incurs an in-memory lookup.
+   * This optimization avoids writing the delete to storage when appropriate.
+   * Default: false
+   *
+   * @return true if filter-deletes behavior is on.
+   */
+  boolean filterDeletes();
+
+  /**
+   * An iteration->Next() sequentially skips over keys with the same
+   * user-key unless this option is set. This number specifies the number
+   * of keys (with the same userkey) that will be sequentially
+   * skipped before a reseek is issued.
+   * Default: 8
+   *
+   * @param maxSequentialSkipInIterations the number of keys that can
+   *     be skipped in an iteration.
+   * @return the reference to the current option.
+   */
+  Object setMaxSequentialSkipInIterations(long maxSequentialSkipInIterations);
+
+  /**
+   * An iteration->Next() sequentially skips over keys with the same
+   * user-key unless this option is set. This number specifies the number
+   * of keys (with the same userkey) that will be sequentially
+   * skipped before a reseek is issued.
+   * Default: 8
+   *
+   * @return the number of keys that can be skipped in an iteration.
+   */
+  long maxSequentialSkipInIterations();
+
+  /**
+   * Set the config for mem-table.
+   *
+   * @param config the mem-table config.
+   * @return the instance of the current Object.
+   * @throws java.lang.IllegalArgumentException thrown on 32-Bit platforms
+   *   while overflowing the underlying platform specific value.
+   */
+  Object setMemTableConfig(MemTableConfig config);
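+
+  // Illustrative sketch, not part of the upstream file: switching the
+  // memtable representation. HashSkipListMemTableConfig is assumed to exist
+  // in this package, as in the RocksDB Java API; hash-based memtables
+  // require a prefix extractor (see useFixedLengthPrefixExtractor above).
+  //
+  //   columnFamilyOptions.useFixedLengthPrefixExtractor(8);
+  //   columnFamilyOptions.setMemTableConfig(new HashSkipListMemTableConfig());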
+
+  /**
+   * Returns the name of the current mem table representation.
+   * The memtable format can be set using setMemTableConfig.
+   *
+   * @return the name of the currently-used memtable factory.
+   * @see #setMemTableConfig(org.rocksdb.MemTableConfig)
+   */
+  String memTableFactoryName();
+
+  /**
+   * Set the config for table format.
+   *
+   * @param config the table format config.
+   * @return the reference to the current Options.
+   */
+  Object setTableFormatConfig(TableFormatConfig config);
+
+  /**
+   * @return the name of the currently used table factory.
+   */
+  String tableFactoryName();
+
+  /**
+   * Allows thread-safe inplace updates.
+   * If inplace_callback function is not set,
+   *   Put(key, new_value) will update the existing_value in place iff
+   *   * the key exists in the current memtable
+   *   * sizeof(new_value) ≤ sizeof(existing_value)
+   *   * the existing_value for that key is a put, i.e. kTypeValue
+   * If inplace_callback function is set, check doc for inplace_callback.
+   * Default: false.
+   *
+   * @param inplaceUpdateSupport true if thread-safe inplace updates
+   *     are allowed.
+   * @return the reference to the current option.
+   */
+  Object setInplaceUpdateSupport(boolean inplaceUpdateSupport);
+
+  /**
+   * Allows thread-safe inplace updates.
+   * If inplace_callback function is not set,
+   *   Put(key, new_value) will update the existing_value in place iff
+   *   * the key exists in the current memtable
+   *   * sizeof(new_value) ≤ sizeof(existing_value)
+   *   * the existing_value for that key is a put, i.e. kTypeValue
+   * If inplace_callback function is set, check doc for inplace_callback.
+   * Default: false.
+   *
+   * @return true if thread-safe inplace updates are allowed.
+   */
+  boolean inplaceUpdateSupport();
+
+  /**
+   * Number of locks used for inplace update
+   * Default: 10000, if inplace_update_support = true, else 0.
+   *
+   * @param inplaceUpdateNumLocks the number of locks used for
+   *     inplace updates.
+   * @return the reference to the current option.
+   * @throws java.lang.IllegalArgumentException thrown on 32-Bit platforms
+   *   while overflowing the underlying platform specific value.
+   */
+  Object setInplaceUpdateNumLocks(long inplaceUpdateNumLocks);
+
+  /**
+   * Number of locks used for inplace update
+   * Default: 10000, if inplace_update_support = true, else 0.
+   *
+   * @return the number of locks used for inplace update.
+   */
+  long inplaceUpdateNumLocks();
+
+  /**
+   * Sets the number of bits used in the prefix bloom filter.
+   *
+   * This value will be used only when a prefix-extractor is specified.
+   *
+   * @param memtablePrefixBloomBits the number of bits used in the
+   *     prefix bloom filter.
+   * @return the reference to the current option.
+   */
+  Object setMemtablePrefixBloomBits(int memtablePrefixBloomBits);
+
+  /**
+   * Returns the number of bits used in the prefix bloom filter.
+   *
+   * This value will be used only when a prefix-extractor is specified.
+   *
+   * @return the number of bloom-bits.
+   * @see #useFixedLengthPrefixExtractor(int)
+   */
+  int memtablePrefixBloomBits();
+
+  /**
+   * The number of hash probes per key used in the mem-table.
+   *
+   * @param memtablePrefixBloomProbes the number of hash probes per key.
+   * @return the reference to the current option.
+   */
+  Object setMemtablePrefixBloomProbes(int memtablePrefixBloomProbes);
+
+  /**
+   * The number of hash probes per key used in the mem-table.
+   *
+   * @return the number of hash probes per key.
+   */
+  int memtablePrefixBloomProbes();
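+
+  // Illustrative sketch, not part of the upstream file: enabling the
+  // memtable prefix bloom filter. It only takes effect once a
+  // prefix extractor has been configured (see above).
+  //
+  //   columnFamilyOptions.useFixedLengthPrefixExtractor(4);
+  //   columnFamilyOptions.setMemtablePrefixBloomBits(100000);
+  //   columnFamilyOptions.setMemtablePrefixBloomProbes(6);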
+
+  /**
+   * Control locality of bloom filter probes to improve cache miss rate.
+   * This option only applies to memtable prefix bloom and plaintable
+   * prefix bloom. It essentially limits the max number of cache lines each
+   * bloom filter check can touch.
+   * This optimization is turned off when set to 0. The number should never
+   * be greater than the number of probes. This option can boost performance
+   * for in-memory workloads but should be used with care since it can cause
+   * a higher false-positive rate.
+   * Default: 0
+   *
+   * @param bloomLocality the level of locality of bloom-filter probes.
+   * @return the reference to the current option.
+   */
+  Object setBloomLocality(int bloomLocality);
+
+  /**
+   * Control locality of bloom filter probes to improve cache miss rate.
+   * This option only applies to memtable prefix bloom and plaintable
+   * prefix bloom. It essentially limits the max number of cache lines each
+   * bloom filter check can touch.
+   * This optimization is turned off when set to 0. The number should never
+   * be greater than the number of probes. This option can boost performance
+   * for in-memory workloads but should be used with care since it can cause
+   * a higher false-positive rate.
+   * Default: 0
+   *
+   * @return the level of locality of bloom-filter probes.
+   * @see #setMemtablePrefixBloomProbes(int)
+   */
+  int bloomLocality();
+
+  /**
+   * Maximum number of successive merge operations on a key in the memtable.
+   *
+   * When a merge operation is added to the memtable and the maximum number of
+   * successive merges is reached, the value of the key will be calculated and
+   * inserted into the memtable instead of the merge operation. This will
+   * ensure that there are never more than max_successive_merges merge
+   * operations in the memtable.
+   *
+   * Default: 0 (disabled)
+   *
+   * @param maxSuccessiveMerges the maximum number of successive merges.
+   * @return the reference to the current option.
+   * @throws java.lang.IllegalArgumentException thrown on 32-Bit platforms
+   *   while overflowing the underlying platform specific value.
+   */
+  Object setMaxSuccessiveMerges(long maxSuccessiveMerges);
+
+  /**
+   * Maximum number of successive merge operations on a key in the memtable.
+   *
+   * When a merge operation is added to the memtable and the maximum number of
+   * successive merges is reached, the value of the key will be calculated and
+   * inserted into the memtable instead of the merge operation. This will
+   * ensure that there are never more than max_successive_merges merge
+   * operations in the memtable.
+   *
+   * Default: 0 (disabled)
+   *
+   * @return the maximum number of successive merges.
+   */
+  long maxSuccessiveMerges();
+
+  /**
+   * The number of partial merge operands to accumulate before partial
+   * merge will be performed. Partial merge will not be called
+   * if the number of values to merge is less than min_partial_merge_operands.
+   *
+   * If min_partial_merge_operands < 2, then it will be treated as 2.
+   *
+   * Default: 2
+   *
+   * @param minPartialMergeOperands min partial merge operands
+   * @return the reference to the current option.
+   */
+  Object setMinPartialMergeOperands(int minPartialMergeOperands);
+
+  /**
+   * The number of partial merge operands to accumulate before partial
+   * merge will be performed. Partial merge will not be called
+   * if the number of values to merge is less than min_partial_merge_operands.
+   *
+   * If min_partial_merge_operands < 2, then it will be treated as 2.
+   *
+   * Default: 2
+   *
+   * @return min partial merge operands
+   */
+  int minPartialMergeOperands();
+
+  /**
+   * <p>This flag specifies that the implementation should optimize the filters
+   * mainly for cases where keys are found rather than also optimizing for keys
+   * that are missed. This would be used in cases where the application knows
+   * that there are very few misses, or where the performance in the case of
+   * misses is not important.</p>
+   *
+   * <p>For now, this flag allows us to not store filters for the last level,
+   * i.e. the largest level which contains data of the LSM store. For keys
+   * which are hits, the filters in this level are not useful because we will
+   * search for the data anyway.</p>
+   *
+   * <p><strong>NOTE</strong>: the filters in other levels are still useful
+   * even for key hits because they tell us whether to look in that level or
+   * go to the higher level.</p>
+   *
+   * <p>Default: false</p>
+   *
+   * @param optimizeFiltersForHits boolean value indicating if this flag is set.
+   * @return the reference to the current option.
+   */
+  Object setOptimizeFiltersForHits(boolean optimizeFiltersForHits);
+
+  /**
+   * <p>Returns the current state of the {@code optimize_filters_for_hits}
+   * setting.</p>
+   *
+   * @return boolean value indicating if the flag
+   *     {@code optimize_filters_for_hits} was set.
+   */
+  boolean optimizeFiltersForHits();
+
+  /**
+   * Default memtable memory budget used with the following methods:
+   *
+   * <ol>
+   *   <li>{@link #optimizeLevelStyleCompaction()}</li>
+   *   <li>{@link #optimizeUniversalStyleCompaction()}</li>
+   * </ol>
+   */
+  long DEFAULT_COMPACTION_MEMTABLE_MEMORY_BUDGET = 512 * 1024 * 1024;
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/CompactionStyle.java b/src/rocksdb/java/src/main/java/org/rocksdb/CompactionStyle.java
new file mode 100644
index 0000000..7606439
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/CompactionStyle.java
@@ -0,0 +1,52 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+/**
+ * Enum CompactionStyle
+ *
+ * RocksDB supports different styles of compaction. Available
+ * compaction styles can be chosen using this enumeration.
+ *
+ * <ol>
+ *   <li><strong>LEVEL</strong> - Level based Compaction style</li>
+ *   <li><strong>UNIVERSAL</strong> - Universal Compaction Style is a
+ *   compaction style, targeting the use cases requiring lower write
+ *   amplification, trading off read amplification and space
+ *   amplification.</li>
+ *   <li><strong>FIFO</strong> - FIFO compaction style is the simplest
+ *   compaction strategy. It is suited for keeping event log data with
+ *   very low overhead (query log for example). It periodically deletes
+ *   the old data, so it's basically a TTL compaction style.</li>
+ * </ol>
+ *
+ * @see <a
+ * href="https://github.com/facebook/rocksdb/wiki/Universal-Compaction">
+ * Universal Compaction</a>
+ * @see <a
+ * href="https://github.com/facebook/rocksdb/wiki/FIFO-compaction-style">
+ * FIFO Compaction</a>
+ */
+public enum CompactionStyle {
+  LEVEL((byte) 0),
+  UNIVERSAL((byte) 1),
+  FIFO((byte) 2);
+
+  private final byte value_;
+
+  private CompactionStyle(byte value) {
+    value_ = value;
+  }
+
+  /**
+   * Returns the byte value of the enumeration's value.
+   *
+   * @return byte representation
+   */
+  public byte getValue() {
+    return value_;
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/Comparator.java b/src/rocksdb/java/src/main/java/org/rocksdb/Comparator.java
new file mode 100644
index 0000000..c8e050b
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/Comparator.java
@@ -0,0 +1,24 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+/**
+ * Base class for comparators which will receive
+ * byte[] based access via org.rocksdb.Slice in their
+ * compare method implementation.
+ *
+ * byte[] based slices perform better when small keys
+ * are involved. When using larger keys consider
+ * using {@link org.rocksdb.DirectComparator}.
+ */
+public abstract class Comparator extends AbstractComparator<Slice> {
+  public Comparator(final ComparatorOptions copt) {
+    super();
+    createNewComparator0(copt.nativeHandle_);
+  }
+
+  private native void createNewComparator0(final long comparatorOptionsHandle);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/ComparatorOptions.java b/src/rocksdb/java/src/main/java/org/rocksdb/ComparatorOptions.java
new file mode 100644
index 0000000..f0ba520
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/ComparatorOptions.java
@@ -0,0 +1,57 @@
+package org.rocksdb;
+
+/**
+ * This class controls the behaviour
+ * of Java implementations of
+ * AbstractComparator.
+ *
+ * Note that dispose() must be called before a ComparatorOptions
+ * instance becomes out-of-scope to release the allocated memory in C++.
+ */
+public class ComparatorOptions extends RocksObject {
+  public ComparatorOptions() {
+    super();
+    newComparatorOptions();
+  }
+
+  /**
+   * Use adaptive mutex, which spins in the user space before resorting
+   * to kernel. This could reduce context switch when the mutex is not
+   * heavily contended. However, if the mutex is hot, we could end up
+   * wasting spin time.
+   * Default: false
+   *
+   * @return true if adaptive mutex is used.
+   */
+  public boolean useAdaptiveMutex() {
+    assert(isInitialized());
+    return useAdaptiveMutex(nativeHandle_);
+  }
+
+  /**
+   * Use adaptive mutex, which spins in the user space before resorting
+   * to kernel. This could reduce context switch when the mutex is not
+   * heavily contended. However, if the mutex is hot, we could end up
+   * wasting spin time.
+   * Default: false
+   *
+   * @param useAdaptiveMutex true if adaptive mutex is used.
+   * @return the reference to the current comparator options.
+   */
+  public ComparatorOptions setUseAdaptiveMutex(final boolean useAdaptiveMutex) {
+    assert (isInitialized());
+    setUseAdaptiveMutex(nativeHandle_, useAdaptiveMutex);
+    return this;
+  }
+
+  @Override protected void disposeInternal() {
+    assert(isInitialized());
+    disposeInternal(nativeHandle_);
+  }
+
+  private native void newComparatorOptions();
+  private native boolean useAdaptiveMutex(final long handle);
+  private native void setUseAdaptiveMutex(final long handle,
+      final boolean useAdaptiveMutex);
+  private native void disposeInternal(long handle);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/CompressionType.java b/src/rocksdb/java/src/main/java/org/rocksdb/CompressionType.java
new file mode 100644
index 0000000..ec0c42f
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/CompressionType.java
@@ -0,0 +1,94 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+/**
+ * Enum CompressionType
+ *
+ * <p>DB contents are stored in a set of blocks, each of which holds a
+ * sequence of key,value pairs. Each block may be compressed before
+ * being stored in a file. The following enum describes which
+ * compression method (if any) is used to compress a block.</p>
+ */
+public enum CompressionType {
+
+  NO_COMPRESSION((byte) 0, null),
+  SNAPPY_COMPRESSION((byte) 1, "snappy"),
+  ZLIB_COMPRESSION((byte) 2, "z"),
+  BZLIB2_COMPRESSION((byte) 3, "bzip2"),
+  LZ4_COMPRESSION((byte) 4, "lz4"),
+  LZ4HC_COMPRESSION((byte) 5, "lz4hc");
+
+  /**
+   * <p>Get the CompressionType enumeration value by
+   * passing the library name to this method.</p>
+   *
+   * <p>If the library cannot be found, the enumeration
+   * value {@code NO_COMPRESSION} will be returned.</p>
+   *
+   * @param libraryName compression library name.
+   *
+   * @return CompressionType instance.
+   */
+  public static CompressionType getCompressionType(String libraryName) {
+    if (libraryName != null) {
+      for (CompressionType compressionType : CompressionType.values()) {
+        if (compressionType.getLibraryName() != null &&
+            compressionType.getLibraryName().equals(libraryName)) {
+          return compressionType;
+        }
+      }
+    }
+    return CompressionType.NO_COMPRESSION;
+  }
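+
+  // Usage sketch, not part of the upstream file:
+  //
+  //   CompressionType byName = CompressionType.getCompressionType("snappy");
+  //   // byName == SNAPPY_COMPRESSION
+  //   CompressionType byId = CompressionType.getCompressionType((byte) 2);
+  //   // byId == ZLIB_COMPRESSION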
+
+  /**
+   * <p>Get the CompressionType enumeration value by
+   * passing the byte identifier to this method.</p>
+   *
+   * <p>If the byte identifier cannot be matched, the enumeration
+   * value {@code NO_COMPRESSION} will be returned.</p>
+   *
+   * @param byteIdentifier of CompressionType.
+   *
+   * @return CompressionType instance.
+   */
+  public static CompressionType getCompressionType(byte byteIdentifier) {
+    for (CompressionType compressionType : CompressionType.values()) {
+      if (compressionType.getValue() == byteIdentifier) {
+        return compressionType;
+      }
+    }
+    return CompressionType.NO_COMPRESSION;
+  }
+
+  /**
+   * <p>Returns the byte value of the enumeration's value.</p>
+   *
+   * @return byte representation
+   */
+  public byte getValue() {
+    return value_;
+  }
+
+  /**
+   * <p>Returns the library name of the compression type
+   * identified by the enumeration value.</p>
+   *
+   * @return library name
+   */
+  public String getLibraryName() {
+    return libraryName_;
+  }
+
+  private CompressionType(byte value, final String libraryName) {
+    value_ = value;
+    libraryName_ = libraryName;
+  }
+
+  private final byte value_;
+  private final String libraryName_;
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/DBOptions.java b/src/rocksdb/java/src/main/java/org/rocksdb/DBOptions.java
new file mode 100644
index 0000000..85aad1e
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/DBOptions.java
@@ -0,0 +1,655 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+import java.util.Properties;
+
+/**
+ * DBOptions to control the behavior of a database.  It will be used
+ * during the creation of a {@link org.rocksdb.RocksDB} (i.e., RocksDB.open()).
+ *
+ * If the {@link #dispose()} function is not called, the object will be GC'd
+ * automatically and native resources will be released as part of that process.
+ */
+public class DBOptions extends RocksObject implements DBOptionsInterface {
+  static {
+    RocksDB.loadLibrary();
+  }
+
+  /**
+   * Construct DBOptions.
+   *
+   * This constructor will create (by allocating a block of memory)
+   * a {@code rocksdb::DBOptions} on the C++ side.
+   */
+  public DBOptions() {
+    super();
+    numShardBits_ = DEFAULT_NUM_SHARD_BITS;
+    newDBOptions();
+  }
+
+  /**
+   * <p>Method to get an options instance using pre-configured
+   * property values. If one or more values are undefined in
+   * the context of RocksDB, the method will return a null
+   * value.</p>
+   *
+   * <p><strong>Note</strong>: Property keys can be derived from
+   * getter methods within the options class. Example: the method
+   * {@code allowMmapReads()} has a property key:
+   * {@code allow_mmap_reads}.</p>
+   *
+   * @param properties {@link java.util.Properties} instance.
+   *
+   * @return a {@link org.rocksdb.DBOptions} instance
+   *     or null.
+   *
+   * @throws java.lang.IllegalArgumentException if null or empty
+   *     {@link java.util.Properties} instance is passed to the method call.
+   */
+  public static DBOptions getDBOptionsFromProps(
+      final Properties properties) {
+    if (properties == null || properties.size() == 0) {
+      throw new IllegalArgumentException(
+          "Properties value must contain at least one value.");
+    }
+    DBOptions dbOptions = null;
+    StringBuilder stringBuilder = new StringBuilder();
+    for (final String name : properties.stringPropertyNames()){
+      stringBuilder.append(name);
+      stringBuilder.append("=");
+      stringBuilder.append(properties.getProperty(name));
+      stringBuilder.append(";");
+    }
+    long handle = getDBOptionsFromProps(
+        stringBuilder.toString());
+    if (handle != 0){
+      dbOptions = new DBOptions(handle);
+    }
+    return dbOptions;
+  }
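+
+  // Usage sketch, not part of the upstream file: property keys mirror the
+  // getter names, e.g. allowMmapReads() -> "allow_mmap_reads" (see the
+  // note above).
+  //
+  //   Properties props = new Properties();
+  //   props.setProperty("allow_mmap_reads", "true");
+  //   props.setProperty("max_open_files", "512");
+  //   DBOptions opts = DBOptions.getDBOptionsFromProps(props);
+  //   // opts is null if any key or value is unknown to RocksDB.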
+
+  @Override
+  public DBOptions setIncreaseParallelism(
+      final int totalThreads) {
+    assert (isInitialized());
+    setIncreaseParallelism(nativeHandle_, totalThreads);
+    return this;
+  }
+
+  @Override
+  public DBOptions setCreateIfMissing(final boolean flag) {
+    assert(isInitialized());
+    setCreateIfMissing(nativeHandle_, flag);
+    return this;
+  }
+
+  @Override
+  public boolean createIfMissing() {
+    assert(isInitialized());
+    return createIfMissing(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setCreateMissingColumnFamilies(
+      final boolean flag) {
+    assert(isInitialized());
+    setCreateMissingColumnFamilies(nativeHandle_, flag);
+    return this;
+  }
+
+  @Override
+  public boolean createMissingColumnFamilies() {
+    assert(isInitialized());
+    return createMissingColumnFamilies(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setErrorIfExists(
+      final boolean errorIfExists) {
+    assert(isInitialized());
+    setErrorIfExists(nativeHandle_, errorIfExists);
+    return this;
+  }
+
+  @Override
+  public boolean errorIfExists() {
+    assert(isInitialized());
+    return errorIfExists(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setParanoidChecks(
+      final boolean paranoidChecks) {
+    assert(isInitialized());
+    setParanoidChecks(nativeHandle_, paranoidChecks);
+    return this;
+  }
+
+  @Override
+  public boolean paranoidChecks() {
+    assert(isInitialized());
+    return paranoidChecks(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setRateLimiterConfig(
+      final RateLimiterConfig config) {
+    assert(isInitialized());
+    rateLimiterConfig_ = config;
+    setRateLimiter(nativeHandle_, config.newRateLimiterHandle());
+    return this;
+  }
+
+  @Override
+  public DBOptions setLogger(final Logger logger) {
+    assert(isInitialized());
+    setLogger(nativeHandle_, logger.nativeHandle_);
+    return this;
+  }
+
+  @Override
+  public DBOptions setInfoLogLevel(
+      final InfoLogLevel infoLogLevel) {
+    assert(isInitialized());
+    setInfoLogLevel(nativeHandle_, infoLogLevel.getValue());
+    return this;
+  }
+
+  @Override
+  public InfoLogLevel infoLogLevel() {
+    assert(isInitialized());
+    return InfoLogLevel.getInfoLogLevel(
+        infoLogLevel(nativeHandle_));
+  }
+
+  @Override
+  public DBOptions setMaxOpenFiles(
+      final int maxOpenFiles) {
+    assert(isInitialized());
+    setMaxOpenFiles(nativeHandle_, maxOpenFiles);
+    return this;
+  }
+
+  @Override
+  public int maxOpenFiles() {
+    assert(isInitialized());
+    return maxOpenFiles(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setMaxTotalWalSize(
+      final long maxTotalWalSize) {
+    assert(isInitialized());
+    setMaxTotalWalSize(nativeHandle_, maxTotalWalSize);
+    return this;
+  }
+
+  @Override
+  public long maxTotalWalSize() {
+    assert(isInitialized());
+    return maxTotalWalSize(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions createStatistics() {
+    assert(isInitialized());
+    createStatistics(nativeHandle_);
+    return this;
+  }
+
+  @Override
+  public Statistics statisticsPtr() {
+    assert(isInitialized());
+
+    long statsPtr = statisticsPtr(nativeHandle_);
+    if(statsPtr == 0) {
+      createStatistics();
+      statsPtr = statisticsPtr(nativeHandle_);
+    }
+
+    return new Statistics(statsPtr);
+  }
+
+  @Override
+  public DBOptions setDisableDataSync(
+      final boolean disableDataSync) {
+    assert(isInitialized());
+    setDisableDataSync(nativeHandle_, disableDataSync);
+    return this;
+  }
+
+  @Override
+  public boolean disableDataSync() {
+    assert(isInitialized());
+    return disableDataSync(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setUseFsync(
+      final boolean useFsync) {
+    assert(isInitialized());
+    setUseFsync(nativeHandle_, useFsync);
+    return this;
+  }
+
+  @Override
+  public boolean useFsync() {
+    assert(isInitialized());
+    return useFsync(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setDbLogDir(
+      final String dbLogDir) {
+    assert(isInitialized());
+    setDbLogDir(nativeHandle_, dbLogDir);
+    return this;
+  }
+
+  @Override
+  public String dbLogDir() {
+    assert(isInitialized());
+    return dbLogDir(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setWalDir(
+      final String walDir) {
+    assert(isInitialized());
+    setWalDir(nativeHandle_, walDir);
+    return this;
+  }
+
+  @Override
+  public String walDir() {
+    assert(isInitialized());
+    return walDir(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setDeleteObsoleteFilesPeriodMicros(
+      final long micros) {
+    assert(isInitialized());
+    setDeleteObsoleteFilesPeriodMicros(nativeHandle_, micros);
+    return this;
+  }
+
+  @Override
+  public long deleteObsoleteFilesPeriodMicros() {
+    assert(isInitialized());
+    return deleteObsoleteFilesPeriodMicros(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setMaxBackgroundCompactions(
+      final int maxBackgroundCompactions) {
+    assert(isInitialized());
+    setMaxBackgroundCompactions(nativeHandle_, maxBackgroundCompactions);
+    return this;
+  }
+
+  @Override
+  public int maxBackgroundCompactions() {
+    assert(isInitialized());
+    return maxBackgroundCompactions(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setMaxBackgroundFlushes(
+      final int maxBackgroundFlushes) {
+    assert(isInitialized());
+    setMaxBackgroundFlushes(nativeHandle_, maxBackgroundFlushes);
+    return this;
+  }
+
+  @Override
+  public int maxBackgroundFlushes() {
+    assert(isInitialized());
+    return maxBackgroundFlushes(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setMaxLogFileSize(
+      final long maxLogFileSize) {
+    assert(isInitialized());
+    setMaxLogFileSize(nativeHandle_, maxLogFileSize);
+    return this;
+  }
+
+  @Override
+  public long maxLogFileSize() {
+    assert(isInitialized());
+    return maxLogFileSize(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setLogFileTimeToRoll(
+      final long logFileTimeToRoll) {
+    assert(isInitialized());
+    setLogFileTimeToRoll(nativeHandle_, logFileTimeToRoll);
+    return this;
+  }
+
+  @Override
+  public long logFileTimeToRoll() {
+    assert(isInitialized());
+    return logFileTimeToRoll(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setKeepLogFileNum(
+      final long keepLogFileNum) {
+    assert(isInitialized());
+    setKeepLogFileNum(nativeHandle_, keepLogFileNum);
+    return this;
+  }
+
+  @Override
+  public long keepLogFileNum() {
+    assert(isInitialized());
+    return keepLogFileNum(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setMaxManifestFileSize(
+      final long maxManifestFileSize) {
+    assert(isInitialized());
+    setMaxManifestFileSize(nativeHandle_, maxManifestFileSize);
+    return this;
+  }
+
+  @Override
+  public long maxManifestFileSize() {
+    assert(isInitialized());
+    return maxManifestFileSize(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setTableCacheNumshardbits(
+      final int tableCacheNumshardbits) {
+    assert(isInitialized());
+    setTableCacheNumshardbits(nativeHandle_, tableCacheNumshardbits);
+    return this;
+  }
+
+  @Override
+  public int tableCacheNumshardbits() {
+    assert(isInitialized());
+    return tableCacheNumshardbits(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setWalTtlSeconds(
+      final long walTtlSeconds) {
+    assert(isInitialized());
+    setWalTtlSeconds(nativeHandle_, walTtlSeconds);
+    return this;
+  }
+
+  @Override
+  public long walTtlSeconds() {
+    assert(isInitialized());
+    return walTtlSeconds(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setWalSizeLimitMB(
+      final long sizeLimitMB) {
+    assert(isInitialized());
+    setWalSizeLimitMB(nativeHandle_, sizeLimitMB);
+    return this;
+  }
+
+  @Override
+  public long walSizeLimitMB() {
+    assert(isInitialized());
+    return walSizeLimitMB(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setManifestPreallocationSize(
+      final long size) {
+    assert(isInitialized());
+    setManifestPreallocationSize(nativeHandle_, size);
+    return this;
+  }
+
+  @Override
+  public long manifestPreallocationSize() {
+    assert(isInitialized());
+    return manifestPreallocationSize(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setAllowOsBuffer(
+      final boolean allowOsBuffer) {
+    assert(isInitialized());
+    setAllowOsBuffer(nativeHandle_, allowOsBuffer);
+    return this;
+  }
+
+  @Override
+  public boolean allowOsBuffer() {
+    assert(isInitialized());
+    return allowOsBuffer(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setAllowMmapReads(
+      final boolean allowMmapReads) {
+    assert(isInitialized());
+    setAllowMmapReads(nativeHandle_, allowMmapReads);
+    return this;
+  }
+
+  @Override
+  public boolean allowMmapReads() {
+    assert(isInitialized());
+    return allowMmapReads(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setAllowMmapWrites(
+      final boolean allowMmapWrites) {
+    assert(isInitialized());
+    setAllowMmapWrites(nativeHandle_, allowMmapWrites);
+    return this;
+  }
+
+  @Override
+  public boolean allowMmapWrites() {
+    assert(isInitialized());
+    return allowMmapWrites(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setIsFdCloseOnExec(
+      final boolean isFdCloseOnExec) {
+    assert(isInitialized());
+    setIsFdCloseOnExec(nativeHandle_, isFdCloseOnExec);
+    return this;
+  }
+
+  @Override
+  public boolean isFdCloseOnExec() {
+    assert(isInitialized());
+    return isFdCloseOnExec(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setStatsDumpPeriodSec(
+      final int statsDumpPeriodSec) {
+    assert(isInitialized());
+    setStatsDumpPeriodSec(nativeHandle_, statsDumpPeriodSec);
+    return this;
+  }
+
+  @Override
+  public int statsDumpPeriodSec() {
+    assert(isInitialized());
+    return statsDumpPeriodSec(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setAdviseRandomOnOpen(
+      final boolean adviseRandomOnOpen) {
+    assert(isInitialized());
+    setAdviseRandomOnOpen(nativeHandle_, adviseRandomOnOpen);
+    return this;
+  }
+
+  @Override
+  public boolean adviseRandomOnOpen() {
+    assert(isInitialized());
+    return adviseRandomOnOpen(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setUseAdaptiveMutex(
+      final boolean useAdaptiveMutex) {
+    assert(isInitialized());
+    setUseAdaptiveMutex(nativeHandle_, useAdaptiveMutex);
+    return this;
+  }
+
+  @Override
+  public boolean useAdaptiveMutex() {
+    assert(isInitialized());
+    return useAdaptiveMutex(nativeHandle_);
+  }
+
+  @Override
+  public DBOptions setBytesPerSync(
+      final long bytesPerSync) {
+    assert(isInitialized());
+    setBytesPerSync(nativeHandle_, bytesPerSync);
+    return this;
+  }
+
+  @Override
+  public long bytesPerSync() {
+    assert(isInitialized());
+    return bytesPerSync(nativeHandle_);
+  }
+
+  /**
+   * Release the memory allocated for the current instance
+   * on the C++ side.
+   */
+  @Override protected void disposeInternal() {
+    assert(isInitialized());
+    disposeInternal(nativeHandle_);
+  }
+
+  static final int DEFAULT_NUM_SHARD_BITS = -1;
+
+  /**
+   * <p>Private constructor to be used by
+   * {@link #getDBOptionsFromProps(java.util.Properties)}</p>
+   *
+   * @param handle native handle to DBOptions instance.
+   */
+  private DBOptions(final long handle) {
+    super();
+    nativeHandle_ = handle;
+  }
+
+  private static native long getDBOptionsFromProps(
+      String optString);
+
+  private native void newDBOptions();
+  private native void disposeInternal(long handle);
+
+  private native void setIncreaseParallelism(long handle, int totalThreads);
+  private native void setCreateIfMissing(long handle, boolean flag);
+  private native boolean createIfMissing(long handle);
+  private native void setCreateMissingColumnFamilies(
+      long handle, boolean flag);
+  private native boolean createMissingColumnFamilies(long handle);
+  private native void setErrorIfExists(long handle, boolean errorIfExists);
+  private native boolean errorIfExists(long handle);
+  private native void setParanoidChecks(
+      long handle, boolean paranoidChecks);
+  private native boolean paranoidChecks(long handle);
+  private native void setRateLimiter(long handle,
+      long rateLimiterHandle);
+  private native void setLogger(long handle,
+      long loggerHandle);
+  private native void setInfoLogLevel(long handle, byte logLevel);
+  private native byte infoLogLevel(long handle);
+  private native void setMaxOpenFiles(long handle, int maxOpenFiles);
+  private native int maxOpenFiles(long handle);
+  private native void setMaxTotalWalSize(long handle,
+      long maxTotalWalSize);
+  private native long maxTotalWalSize(long handle);
+  private native void createStatistics(long optHandle);
+  private native long statisticsPtr(long optHandle);
+  private native void setDisableDataSync(long handle, boolean disableDataSync);
+  private native boolean disableDataSync(long handle);
+  private native boolean useFsync(long handle);
+  private native void setUseFsync(long handle, boolean useFsync);
+  private native void setDbLogDir(long handle, String dbLogDir);
+  private native String dbLogDir(long handle);
+  private native void setWalDir(long handle, String walDir);
+  private native String walDir(long handle);
+  private native void setDeleteObsoleteFilesPeriodMicros(
+      long handle, long micros);
+  private native long deleteObsoleteFilesPeriodMicros(long handle);
+  private native void setMaxBackgroundCompactions(
+      long handle, int maxBackgroundCompactions);
+  private native int maxBackgroundCompactions(long handle);
+  private native void setMaxBackgroundFlushes(
+      long handle, int maxBackgroundFlushes);
+  private native int maxBackgroundFlushes(long handle);
+  private native void setMaxLogFileSize(long handle, long maxLogFileSize)
+      throws IllegalArgumentException;
+  private native long maxLogFileSize(long handle);
+  private native void setLogFileTimeToRoll(
+      long handle, long logFileTimeToRoll) throws IllegalArgumentException;
+  private native long logFileTimeToRoll(long handle);
+  private native void setKeepLogFileNum(long handle, long keepLogFileNum)
+      throws IllegalArgumentException;
+  private native long keepLogFileNum(long handle);
+  private native void setMaxManifestFileSize(
+      long handle, long maxManifestFileSize);
+  private native long maxManifestFileSize(long handle);
+  private native void setTableCacheNumshardbits(
+      long handle, int tableCacheNumshardbits);
+  private native int tableCacheNumshardbits(long handle);
+  private native void setWalTtlSeconds(long handle, long walTtlSeconds);
+  private native long walTtlSeconds(long handle);
+  private native void setWalSizeLimitMB(long handle, long sizeLimitMB);
+  private native long walSizeLimitMB(long handle);
+  private native void setManifestPreallocationSize(
+      long handle, long size) throws IllegalArgumentException;
+  private native long manifestPreallocationSize(long handle);
+  private native void setAllowOsBuffer(
+      long handle, boolean allowOsBuffer);
+  private native boolean allowOsBuffer(long handle);
+  private native void setAllowMmapReads(
+      long handle, boolean allowMmapReads);
+  private native boolean allowMmapReads(long handle);
+  private native void setAllowMmapWrites(
+      long handle, boolean allowMmapWrites);
+  private native boolean allowMmapWrites(long handle);
+  private native void setIsFdCloseOnExec(
+      long handle, boolean isFdCloseOnExec);
+  private native boolean isFdCloseOnExec(long handle);
+  private native void setStatsDumpPeriodSec(
+      long handle, int statsDumpPeriodSec);
+  private native int statsDumpPeriodSec(long handle);
+  private native void setAdviseRandomOnOpen(
+      long handle, boolean adviseRandomOnOpen);
+  private native boolean adviseRandomOnOpen(long handle);
+  private native void setUseAdaptiveMutex(
+      long handle, boolean useAdaptiveMutex);
+  private native boolean useAdaptiveMutex(long handle);
+  private native void setBytesPerSync(
+      long handle, long bytesPerSync);
+  private native long bytesPerSync(long handle);
+
+  int numShardBits_;
+  RateLimiterConfig rateLimiterConfig_;
+}
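
A minimal usage sketch for the DBOptions builder above (illustration only,
not part of the diff; it assumes the RocksDB JNI library is available and
loaded via RocksDB.loadLibrary()):

    import org.rocksdb.DBOptions;
    import org.rocksdb.InfoLogLevel;
    import org.rocksdb.RocksDB;

    public class DBOptionsSketch {
      static {
        RocksDB.loadLibrary(); // the native library must be loaded first
      }

      public static void main(final String[] args) {
        final DBOptions opts = new DBOptions()
            .setCreateIfMissing(true)         // create the DB on first open
            .setMaxOpenFiles(512)             // cap the table-file cache
            .setMaxBackgroundCompactions(2)   // jobs in the LOW priority pool
            .setInfoLogLevel(InfoLogLevel.WARN_LEVEL);
        // pass opts to RocksDB.open(...) together with column family
        // descriptors, then release the native handle when done:
        opts.dispose();
      }
    }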
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/DBOptionsInterface.java b/src/rocksdb/java/src/main/java/org/rocksdb/DBOptionsInterface.java
new file mode 100644
index 0000000..f710105
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/DBOptionsInterface.java
@@ -0,0 +1,764 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+public interface DBOptionsInterface {
+
+  /**
+   * <p>By default, RocksDB uses only one background thread for flush and
+   * compaction. Calling this function will set it up such that a total of
+   * `total_threads` threads is used.</p>
+   *
+   * <p>You almost definitely want to call this function if your system is
+   * bottlenecked by RocksDB.</p>
+   *
+   * @param totalThreads The total number of threads to be used by RocksDB.
+   *     A good value is the number of cores.
+   *
+   * @return the instance of the current Options
+   */
+  Object setIncreaseParallelism(int totalThreads);
+
+  /**
+   * If this value is set to true, then the database will be created
+   * if it is missing during {@code RocksDB.open()}.
+   * Default: false
+   *
+   * @param flag a flag indicating whether to create the database if it
+   *     is missing when opened via
+   *     {@link RocksDB#open(org.rocksdb.Options, String)}.
+   * @return the instance of the current Options
+   * @see RocksDB#open(org.rocksdb.Options, String)
+   */
+  Object setCreateIfMissing(boolean flag);
+
+  /**
+   * Return true if the create_if_missing flag is set to true.
+   * If true, the database will be created if it is missing.
+   *
+   * @return true if the createIfMissing option is set to true.
+   * @see #setCreateIfMissing(boolean)
+   */
+  boolean createIfMissing();
+
+  /**
+   * <p>If true, missing column families will be automatically created</p>
+   *
+   * <p>Default: false</p>
+   *
+   * @param flag a flag indicating if missing column families shall be
+   *     created automatically.
+   * @return the instance of the current Object.
+   */
+  Object setCreateMissingColumnFamilies(boolean flag);
+
+  /**
+   * Return true if the create_missing_column_families flag is set
+   * to true. If true, missing column families will be created on open.
+   *
+   * @return true if the createMissingColumnFamilies flag is set to
+   *     true.
+   * @see #setCreateMissingColumnFamilies(boolean)
+   */
+  boolean createMissingColumnFamilies();
+
+  /**
+   * If true, an error will be thrown during RocksDB.open() if the
+   * database already exists.
+   * Default: false
+   *
+   * @param errorIfExists if true, an exception will be thrown
+   *     during {@code RocksDB.open()} if the database already exists.
+   * @return the reference to the current option.
+   * @see RocksDB#open(org.rocksdb.Options, String)
+   */
+  Object setErrorIfExists(boolean errorIfExists);
+
+  /**
+   * If true, an error will be thrown during RocksDB.open() if the
+   * database already exists.
+   *
+   * @return true if an error will be raised during {@code RocksDB.open()}
+   *    when the specified database already exists.
+   */
+  boolean errorIfExists();
+
+  /**
+   * If true, the implementation will do aggressive checking of the
+   * data it is processing and will stop early if it detects any
+   * errors.  This may have unforeseen ramifications: for example, a
+   * corruption of one DB entry may cause a large number of entries to
+   * become unreadable or for the entire DB to become unopenable.
+   * If any of the  writes to the database fails (Put, Delete, Merge, Write),
+   * the database will switch to read-only mode and fail all other
+   * Write operations.
+   * Default: true
+   *
+   * @param paranoidChecks a flag to indicate whether paranoid-check
+   *     is on.
+   * @return the reference to the current option.
+   */
+  Object setParanoidChecks(boolean paranoidChecks);
+
+  /**
+   * If true, the implementation will do aggressive checking of the
+   * data it is processing and will stop early if it detects any
+   * errors.  This may have unforeseen ramifications: for example, a
+   * corruption of one DB entry may cause a large number of entries to
+   * become unreadable or for the entire DB to become unopenable.
+   * If any of the  writes to the database fails (Put, Delete, Merge, Write),
+   * the database will switch to read-only mode and fail all other
+   * Write operations.
+   *
+   * @return a boolean indicating whether paranoid-check is on.
+   */
+  boolean paranoidChecks();
+
+  /**
+   * Use to control write rate of flush and compaction. Flush has higher
+   * priority than compaction. Rate limiting is disabled if nullptr.
+   * Default: nullptr
+   *
+   * @param config rate limiter config.
+   * @return the instance of the current Object.
+   */
+  Object setRateLimiterConfig(RateLimiterConfig config);
+
+  /**
+   * <p>Any internal progress/error information generated by
+   * the db will be written to the Logger if it is non-nullptr,
+   * or to a file stored in the same directory as the DB
+   * contents if info_log is nullptr.</p>
+   *
+   * <p>Default: nullptr</p>
+   *
+   * @param logger {@link Logger} instance.
+   * @return the instance of the current Object.
+   */
+  Object setLogger(Logger logger);
+
+  /**
+   * <p>Sets the RocksDB log level. Default level is INFO.</p>
+   *
+   * @param infoLogLevel log level to set.
+   * @return the instance of the current Object.
+   */
+  Object setInfoLogLevel(InfoLogLevel infoLogLevel);
+
+  /**
+   * <p>Returns currently set log level.</p>
+   * @return {@link org.rocksdb.InfoLogLevel} instance.
+   */
+  InfoLogLevel infoLogLevel();
+
+  /**
+   * Number of open files that can be used by the DB.  You may need to
+   * increase this if your database has a large working set. Value -1 means
+   * files opened are always kept open. You can estimate the number of files based
+   * on {@code target_file_size_base} and {@code target_file_size_multiplier}
+   * for level-based compaction. For universal-style compaction, you can usually
+   * set it to -1.
+   * Default: 5000
+   *
+   * @param maxOpenFiles the maximum number of open files.
+   * @return the instance of the current Object.
+   */
+  Object setMaxOpenFiles(int maxOpenFiles);
+
+  /**
+   * Number of open files that can be used by the DB.  You may need to
+   * increase this if your database has a large working set. Value -1 means
+   * files opened are always kept open. You can estimate the number of files based
+   * on {@code target_file_size_base} and {@code target_file_size_multiplier}
+   * for level-based compaction. For universal-style compaction, you can usually
+   * set it to -1.
+   *
+   * @return the maximum number of open files.
+   */
+  int maxOpenFiles();
+
+  /**
+   * <p>Once write-ahead logs exceed this size, we will start forcing the
+   * flush of column families whose memtables are backed by the oldest live
+   * WAL file (i.e. the ones that are causing all the space amplification).
+   * </p>
+   * <p>If set to 0 (default), we will dynamically choose the WAL size limit to
+   * be [sum of all write_buffer_size * max_write_buffer_number] * 2</p>
+   * <p>Default: 0</p>
+   *
+   * @param maxTotalWalSize max total wal size.
+   * @return the instance of the current Object.
+   */
+  Object setMaxTotalWalSize(long maxTotalWalSize);
+
+  /**
+   * <p>Returns the max total wal size. Once write-ahead logs exceed this size,
+   * we will start forcing the flush of column families whose memtables are
+   * backed by the oldest live WAL file (i.e. the ones that are causing all
+   * the space amplification).</p>
+   *
+   * <p>If set to 0 (default), we will dynamically choose the WAL size limit
+   * to be [sum of all write_buffer_size * max_write_buffer_number] * 2
+   * </p>
+   *
+   * @return max total wal size
+   */
+  long maxTotalWalSize();
+
+  /**
+   * <p>Creates a statistics object which collects metrics about database operations.
+   * Statistics objects should not be shared between DB instances as
+   * they do not use any locks to prevent concurrent updates.</p>
+   *
+   * @return the instance of the current Object.
+   * @see RocksDB#open(org.rocksdb.Options, String)
+   */
+  Object createStatistics();
+
+  /**
+   * <p>Returns statistics object. Calls {@link #createStatistics()} if
+   * C++ returns {@code nullptr} for statistics.</p>
+   *
+   * @return the instance of the statistics object.
+   * @see #createStatistics()
+   */
+  Statistics statisticsPtr();
+
+  /**
+   * <p>If true, then the contents of manifest and data files are
+   * not synced to stable storage. Their contents remain in the
+   * OS buffers till the OS decides to flush them.</p>
+   *
+   * <p>This option is good for bulk-loading of data.</p>
+   *
+   * <p>Once the bulk-loading is complete, please issue a sync to
+   * the OS to flush all dirty buffers to stable storage.</p>
+   *
+   * <p>Default: false</p>
+   *
+   * @param disableDataSync a boolean flag to specify whether to
+   *     disable data sync.
+   * @return the instance of the current Object.
+   */
+  Object setDisableDataSync(boolean disableDataSync);
+
+  /**
+   * If true, then the contents of data files are not synced
+   * to stable storage. Their contents remain in the OS buffers till the
+   * OS decides to flush them. This option is good for bulk-loading
+   * of data. Once the bulk-loading is complete, please issue a
+   * sync to the OS to flush all dirty buffers to stable storage.
+   *
+   * @return if true, then data-sync is disabled.
+   */
+  boolean disableDataSync();
+
+  /**
+   * <p>If true, then every store to stable storage will issue an fsync.</p>
+   * <p>If false, then every store to stable storage will issue an fdatasync.
+   * This parameter should be set to true while storing data to
+   * a filesystem like ext3, which can lose files after a reboot.</p>
+   * <p>Default: false</p>
+   *
+   * @param useFsync a boolean flag to specify whether to use fsync
+   * @return the instance of the current Object.
+   */
+  Object setUseFsync(boolean useFsync);
+
+  /**
+   * <p>If true, then every store to stable storage will issue an fsync.</p>
+   * <p>If false, then every store to stable storage will issue an fdatasync.
+   * This parameter should be set to true while storing data to
+   * a filesystem like ext3, which can lose files after a reboot.</p>
+   *
+   * @return boolean value indicating if fsync is used.
+   */
+  boolean useFsync();
+
+  /**
+   * This specifies the info LOG dir.
+   * If it is empty, the log files will be in the same dir as data.
+   * If it is non-empty, the log files will be in the specified dir,
+   * and the db data dir's absolute path will be used as the log file
+   * name's prefix.
+   *
+   * @param dbLogDir the path to the info log directory
+   * @return the instance of the current Object.
+   */
+  Object setDbLogDir(String dbLogDir);
+
+  /**
+   * Returns the directory of info log.
+   *
+   * If it is empty, the log files will be in the same dir as data.
+   * If it is non-empty, the log files will be in the specified dir,
+   * and the db data dir's absolute path will be used as the log file
+   * name's prefix.
+   *
+   * @return the path to the info log directory
+   */
+  String dbLogDir();
+
+  /**
+   * This specifies the absolute dir path for write-ahead logs (WAL).
+   * If it is empty, the log files will be in the same dir as data
+   *   (dbname is used as the data dir by default).
+   * If it is non-empty, the log files will be kept in the specified dir.
+   * When destroying the db,
+   *   all log files in wal_dir and the dir itself are deleted.
+   *
+   * @param walDir the path to the write-ahead-log directory.
+   * @return the instance of the current Object.
+   */
+  Object setWalDir(String walDir);
+
+  /**
+   * Returns the path to the write-ahead-logs (WAL) directory.
+   *
+   * If it is empty, the log files will be in the same dir as data
+   *   (dbname is used as the data dir by default).
+   * If it is non-empty, the log files will be kept in the specified dir.
+   * When destroying the db,
+   *   all log files in wal_dir and the dir itself are deleted.
+   *
+   * @return the path to the write-ahead-logs (WAL) directory.
+   */
+  String walDir();
+
+  /**
+   * The periodicity at which obsolete files get deleted. The default
+   * value is 6 hours. The files that go out of scope during the compaction
+   * process will still get deleted automatically on every compaction,
+   * regardless of this setting.
+   *
+   * @param micros the time interval in micros
+   * @return the instance of the current Object.
+   */
+  Object setDeleteObsoleteFilesPeriodMicros(long micros);
+
+  /**
+   * The periodicity at which obsolete files get deleted. The default
+   * value is 6 hours. The files that go out of scope during the compaction
+   * process will still get deleted automatically on every compaction,
+   * regardless of this setting.
+   *
+   * @return the time interval in micros when obsolete files will be deleted.
+   */
+  long deleteObsoleteFilesPeriodMicros();
+
+  /**
+   * Specifies the maximum number of concurrent background compaction jobs,
+   * submitted to the default LOW priority thread pool.
+   * If you're increasing this, also consider increasing the number of
+   * threads in the LOW priority thread pool.
+   * Default: 1
+   *
+   * @param maxBackgroundCompactions the maximum number of background
+   *     compaction jobs.
+   * @return the instance of the current Object.
+   *
+   * @see RocksEnv#setBackgroundThreads(int)
+   * @see RocksEnv#setBackgroundThreads(int, int)
+   * @see #maxBackgroundFlushes()
+   */
+  Object setMaxBackgroundCompactions(int maxBackgroundCompactions);
+
+  /**
+   * Returns the maximum number of concurrent background compaction jobs,
+   * submitted to the default LOW priority thread pool.
+   * When increasing this number, we may also want to consider increasing
+   * the number of threads in the LOW priority thread pool.
+   * Default: 1
+   *
+   * @return the maximum number of concurrent background compaction jobs.
+   * @see RocksEnv#setBackgroundThreads(int)
+   * @see RocksEnv#setBackgroundThreads(int, int)
+   */
+  int maxBackgroundCompactions();
+
+  /**
+   * Specifies the maximum number of concurrent background flush jobs.
+   * If you're increasing this, also consider increasing the number of
+   * threads in the HIGH priority thread pool.
+   * Default: 1
+   *
+   * @param maxBackgroundFlushes number of max concurrent flush jobs
+   * @return the instance of the current Object.
+   *
+   * @see RocksEnv#setBackgroundThreads(int)
+   * @see RocksEnv#setBackgroundThreads(int, int)
+   * @see #maxBackgroundCompactions()
+   */
+  Object setMaxBackgroundFlushes(int maxBackgroundFlushes);
+
+  /**
+   * Returns the maximum number of concurrent background flush jobs.
+   * If you're increasing this, also consider increasing the number of
+   * threads in the HIGH priority thread pool.
+   * Default: 1
+   *
+   * @return the maximum number of concurrent background flush jobs.
+   * @see RocksEnv#setBackgroundThreads(int)
+   * @see RocksEnv#setBackgroundThreads(int, int)
+   */
+  int maxBackgroundFlushes();
+
+  /**
+   * Specifies the maximum size of an info log file. If the current log file
+   * is larger than `max_log_file_size`, a new info log file will
+   * be created.
+   * If 0, all logs will be written to one log file.
+   *
+   * @param maxLogFileSize the maximum size of an info log file.
+   * @return the instance of the current Object.
+   * @throws java.lang.IllegalArgumentException thrown on 32-Bit platforms
+   *   while overflowing the underlying platform specific value.
+   */
+  Object setMaxLogFileSize(long maxLogFileSize);
+
+  /**
+   * Returns the maximum size of an info log file. If the current log file
+   * is larger than this size, a new info log file will be created.
+   * If 0, all logs will be written to one log file.
+   *
+   * @return the maximum size of the info log file.
+   */
+  long maxLogFileSize();
+
+  /**
+   * Specifies the time interval for the info log file to roll (in seconds).
+   * If specified with a non-zero value, the log file will be rolled
+   * if it has been active longer than `log_file_time_to_roll`.
+   * Default: 0 (disabled)
+   *
+   * @param logFileTimeToRoll the time interval in seconds.
+   * @return the instance of the current Object.
+   * @throws java.lang.IllegalArgumentException thrown on 32-Bit platforms
+   *   while overflowing the underlying platform specific value.
+   */
+  Object setLogFileTimeToRoll(long logFileTimeToRoll);
+
+  /**
+   * Returns the time interval for the info log file to roll (in seconds).
+   * If specified with a non-zero value, the log file will be rolled
+   * if it has been active longer than `log_file_time_to_roll`.
+   * Default: 0 (disabled)
+   *
+   * @return the time interval in seconds.
+   */
+  long logFileTimeToRoll();
+
+  /**
+   * Specifies the maximum number of info log files to be kept.
+   * Default: 1000
+   *
+   * @param keepLogFileNum the maximum number of info log files to be kept.
+   * @return the instance of the current Object.
+   * @throws java.lang.IllegalArgumentException thrown on 32-Bit platforms
+   *   while overflowing the underlying platform specific value.
+   */
+  Object setKeepLogFileNum(long keepLogFileNum);
+
+  /**
+   * Returns the maximum number of info log files to be kept.
+   * Default: 1000
+   *
+   * @return the maximum number of info log files to be kept.
+   */
+  long keepLogFileNum();
+
+  /**
+   * The manifest file is rolled over on reaching this limit.
+   * The older manifest file will be deleted.
+   * The default value is MAX_INT so that roll-over does not take place.
+   *
+   * @param maxManifestFileSize the size limit of a manifest file.
+   * @return the instance of the current Object.
+   */
+  Object setMaxManifestFileSize(long maxManifestFileSize);
+
+  /**
+   * The manifest file is rolled over on reaching this limit.
+   * The older manifest file will be deleted.
+   * The default value is MAX_INT so that roll-over does not take place.
+   *
+   * @return the size limit of a manifest file.
+   */
+  long maxManifestFileSize();
+
+  /**
+   * Number of shards used for table cache.
+   *
+   * @param tableCacheNumshardbits the number of shards
+   * @return the instance of the current Object.
+   */
+  Object setTableCacheNumshardbits(int tableCacheNumshardbits);
+
+  /**
+   * Number of shards used for table cache.
+   *
+   * @return the number of shards used for table cache.
+   */
+  int tableCacheNumshardbits();
+
+  /**
+   * {@link #walTtlSeconds()} and {@link #walSizeLimitMB()} affect how archived logs
+   * will be deleted.
+   * <ol>
+   * <li>If both set to 0, logs will be deleted asap and will not get into
+   * the archive.</li>
+   * <li>If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0,
+   *    WAL files will be checked every 10 min and if total size is greater
+   *    than WAL_size_limit_MB, they will be deleted starting with the
+   *    earliest until size_limit is met. All empty files will be deleted.</li>
+   * <li>If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then
+   *    WAL files will be checked every WAL_ttl_seconds / 2 and those that
+   *    are older than WAL_ttl_seconds will be deleted.</li>
+   * <li>If both are not 0, WAL files will be checked every 10 min and both
+   *    checks will be performed with ttl being first.</li>
+   * </ol>
+   *
+   * @param walTtlSeconds the ttl seconds
+   * @return the instance of the current Object.
+   * @see #setWalSizeLimitMB(long)
+   */
+  Object setWalTtlSeconds(long walTtlSeconds);
+
+  /**
+   * walTtlSeconds() and walSizeLimitMB() affect how archived logs
+   * will be deleted.
+   * <ol>
+   * <li>If both set to 0, logs will be deleted asap and will not get into
+   * the archive.</li>
+   * <li>If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0,
+   * WAL files will be checked every 10 min and if total size is greater
+   * than WAL_size_limit_MB, they will be deleted starting with the
+   * earliest until size_limit is met. All empty files will be deleted.</li>
+   * <li>If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then
+   * WAL files will be checked every WAL_ttl_seconds / 2 and those that
+   * are older than WAL_ttl_seconds will be deleted.</li>
+   * <li>If both are not 0, WAL files will be checked every 10 min and both
+   * checks will be performed with ttl being first.</li>
+   * </ol>
+   *
+   * @return the wal-ttl seconds
+   * @see #walSizeLimitMB()
+   */
+  long walTtlSeconds();
+
+  /**
+   * walTtlSeconds() and walSizeLimitMB() affect how archived logs
+   * will be deleted.
+   * <ol>
+   * <li>If both set to 0, logs will be deleted asap and will not get into
+   *    the archive.</li>
+   * <li>If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0,
+   *    WAL files will be checked every 10 min and if total size is greater
+   *    than WAL_size_limit_MB, they will be deleted starting with the
+   *    earliest until size_limit is met. All empty files will be deleted.</li>
+   * <li>If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then
+   *    WAL files will be checked every WAL_ttl_seconds / 2 and those that
+   *    are older than WAL_ttl_seconds will be deleted.</li>
+   * <li>If both are not 0, WAL files will be checked every 10 min and both
+   *    checks will be performed with ttl being first.</li>
+   * </ol>
+   *
+   * @param sizeLimitMB size limit in mega-bytes.
+   * @return the instance of the current Object.
+   * @see #setWalTtlSeconds(long)
+   */
+  Object setWalSizeLimitMB(long sizeLimitMB);
+
+  /**
+   * {@link #walTtlSeconds()} and {@link #walSizeLimitMB()} affect how archived logs
+   * will be deleted.
+   * <ol>
+   * <li>If both set to 0, logs will be deleted asap and will not get into
+   *    the archive.</li>
+   * <li>If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0,
+   *    WAL files will be checked every 10 min and if total size is greater
+   *    than WAL_size_limit_MB, they will be deleted starting with the
+   *    earliest until size_limit is met. All empty files will be deleted.</li>
+   * <li>If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then
+   *    WAL files will be checked every WAL_ttl_seconds / 2 and those that
+   *    are older than WAL_ttl_seconds will be deleted.</li>
+   * <li>If both are not 0, WAL files will be checked every 10 min and both
+   *    checks will be performed with ttl being first.</li>
+   * </ol>
+   * @return size limit in mega-bytes.
+   * @see #walTtlSeconds()
+   */
+  long walSizeLimitMB();
+
+  /**
+   * Number of bytes to preallocate (via fallocate) the manifest
+   * files.  Default is 4 MB, which is reasonable to reduce random IO
+   * as well as prevent overallocation for mounts that preallocate
+   * large amounts of data (such as xfs's allocsize option).
+   *
+   * @param size the size in byte
+   * @return the instance of the current Object.
+   * @throws java.lang.IllegalArgumentException thrown on 32-Bit platforms
+   *   while overflowing the underlying platform specific value.
+   */
+  Object setManifestPreallocationSize(long size);
+
+  /**
+   * Number of bytes to preallocate (via fallocate) the manifest
+   * files.  Default is 4 MB, which is reasonable to reduce random IO
+   * as well as prevent overallocation for mounts that preallocate
+   * large amounts of data (such as xfs's allocsize option).
+   *
+   * @return size in bytes.
+   */
+  long manifestPreallocationSize();
+
+  /**
+   * Data being read from file storage may be buffered in the OS.
+   * Default: true
+   *
+   * @param allowOsBuffer if true, then OS buffering is allowed.
+   * @return the instance of the current Object.
+   */
+  Object setAllowOsBuffer(boolean allowOsBuffer);
+
+  /**
+   * Data being read from file storage may be buffered in the OS.
+   * Default: true
+   *
+   * @return if true, then OS buffering is allowed.
+   */
+  boolean allowOsBuffer();
+
+  /**
+   * Allow the OS to mmap files for reading sst tables.
+   * Default: false
+   *
+   * @param allowMmapReads true if mmap reads are allowed.
+   * @return the instance of the current Object.
+   */
+  Object setAllowMmapReads(boolean allowMmapReads);
+
+  /**
+   * Allow the OS to mmap files for reading sst tables.
+   * Default: false
+   *
+   * @return true if mmap reads are allowed.
+   */
+  boolean allowMmapReads();
+
+  /**
+   * Allow the OS to mmap files for writing. Default: false
+   *
+   * @param allowMmapWrites true if mmap writes are allowed.
+   * @return the instance of the current Object.
+   */
+  Object setAllowMmapWrites(boolean allowMmapWrites);
+
+  /**
+   * Allow the OS to mmap files for writing. Default: false
+   *
+   * @return true if mmap writes are allowed.
+   */
+  boolean allowMmapWrites();
+
+  /**
+   * Disable child processes from inheriting open files. Default: true
+   *
+   * @param isFdCloseOnExec true if child process inheriting open
+   *     files is disabled.
+   * @return the instance of the current Object.
+   */
+  Object setIsFdCloseOnExec(boolean isFdCloseOnExec);
+
+  /**
+   * Disable child processes from inheriting open files. Default: true
+   *
+   * @return true if child process inheriting open files is disabled.
+   */
+  boolean isFdCloseOnExec();
+
+  /**
+   * If not zero, dump rocksdb.stats to LOG every stats_dump_period_sec
+   * Default: 3600 (1 hour)
+   *
+   * @param statsDumpPeriodSec time interval in seconds.
+   * @return the instance of the current Object.
+   */
+  Object setStatsDumpPeriodSec(int statsDumpPeriodSec);
+
+  /**
+   * If not zero, dump rocksdb.stats to LOG every stats_dump_period_sec
+   * Default: 3600 (1 hour)
+   *
+   * @return time interval in seconds.
+   */
+  int statsDumpPeriodSec();
+
+  /**
+   * If set to true, will hint the underlying file system that the file
+   * access pattern is random when an sst file is opened.
+   * Default: true
+   *
+   * @param adviseRandomOnOpen true if hinting random access is on.
+   * @return the instance of the current Object.
+   */
+  Object setAdviseRandomOnOpen(boolean adviseRandomOnOpen);
+
+  /**
+   * If set to true, will hint the underlying file system that the file
+   * access pattern is random when an sst file is opened.
+   * Default: true
+   *
+   * @return true if hinting random access is on.
+   */
+  boolean adviseRandomOnOpen();
+
+  /**
+   * Use adaptive mutex, which spins in the user space before resorting
+   * to the kernel. This could reduce context switches when the mutex is not
+   * heavily contended. However, if the mutex is hot, we could end up
+   * wasting spin time.
+   * Default: false
+   *
+   * @param useAdaptiveMutex true if adaptive mutex is used.
+   * @return the instance of the current Object.
+   */
+  Object setUseAdaptiveMutex(boolean useAdaptiveMutex);
+
+  /**
+   * Use adaptive mutex, which spins in the user space before resorting
+   * to the kernel. This could reduce context switches when the mutex is not
+   * heavily contended. However, if the mutex is hot, we could end up
+   * wasting spin time.
+   * Default: false
+   *
+   * @return true if adaptive mutex is used.
+   */
+  boolean useAdaptiveMutex();
+
+  /**
+   * Allows the OS to incrementally sync files to disk while they are being
+   * written, asynchronously, in the background.
+   * Issue one request for every bytes_per_sync written. 0 turns it off.
+   * Default: 0
+   *
+   * @param bytesPerSync size in bytes
+   * @return the instance of the current Object.
+   */
+  Object setBytesPerSync(long bytesPerSync);
+
+  /**
+   * Allows the OS to incrementally sync files to disk while they are being
+   * written, asynchronously, in the background.
+   * Issue one request for every bytes_per_sync written. 0 turns it off.
+   * Default: 0
+   *
+   * @return size in bytes
+   */
+  long bytesPerSync();
+}
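
The WAL archival rules in the walTtlSeconds()/walSizeLimitMB() javadoc above
reduce to a short sketch (illustration only, same assumptions as before):

    import org.rocksdb.DBOptions;
    import org.rocksdb.RocksDB;

    public class WalArchivalSketch {
      static { RocksDB.loadLibrary(); }

      public static void main(final String[] args) {
        final DBOptions opts = new DBOptions()
            // archived WALs are checked roughly every ttl/2 and
            // dropped once older than one hour
            .setWalTtlSeconds(3600)
            // 0 disables size-based trimming; the TTL rule alone applies
            .setWalSizeLimitMB(0);
        opts.dispose();
      }
    }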
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/DirectComparator.java b/src/rocksdb/java/src/main/java/org/rocksdb/DirectComparator.java
new file mode 100644
index 0000000..47f4d72
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/DirectComparator.java
@@ -0,0 +1,24 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+/**
+ * Base class for comparators which will receive
+ * ByteBuffer based access via org.rocksdb.DirectSlice
+ * in their compare method implementation.
+ *
+ * ByteBuffer based slices perform better when large keys
+ * are involved. When using smaller keys consider
+ * using {@link org.rocksdb.Comparator}.
+ */
+public abstract class DirectComparator extends AbstractComparator<DirectSlice> {
+  public DirectComparator(final ComparatorOptions copt) {
+    super();
+    createNewDirectComparator0(copt.nativeHandle_);
+  }
+
+  private native void createNewDirectComparator0(final long comparatorOptionsHandle);
+}
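
A sketch of a concrete subclass, assuming the name()/compare() methods
inherited from AbstractComparator (note that ByteBuffer.compareTo() orders
signed bytes, unlike RocksDB's default unsigned bytewise comparator):

    import org.rocksdb.ComparatorOptions;
    import org.rocksdb.DirectComparator;
    import org.rocksdb.DirectSlice;

    public class LexDirectComparator extends DirectComparator {
      public LexDirectComparator(final ComparatorOptions copt) {
        super(copt);
      }

      @Override
      public String name() {
        return "example.LexDirectComparator";
      }

      @Override
      public int compare(final DirectSlice a, final DirectSlice b) {
        // signed byte-wise order; a production comparator would
        // compare the bytes as unsigned values instead
        return a.data().compareTo(b.data());
      }
    }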
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/DirectSlice.java b/src/rocksdb/java/src/main/java/org/rocksdb/DirectSlice.java
new file mode 100644
index 0000000..765b015
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/DirectSlice.java
@@ -0,0 +1,118 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+import java.nio.ByteBuffer;
+
+/**
+ * Base class for slices which will receive direct
+ * ByteBuffer based access to the underlying data.
+ *
+ * ByteBuffer backed slices typically perform better with
+ * larger keys and values. When using smaller keys and
+ * values consider using {@link org.rocksdb.Slice}.
+ */
+public class DirectSlice extends AbstractSlice<ByteBuffer> {
+  //TODO(AR) only needed by WriteBatchWithIndexTest until JDK8
+  public final static DirectSlice NONE = new DirectSlice();
+
+  /**
+   * Called from JNI to construct a new Java DirectSlice
+   * without an underlying C++ object set
+   * at creation time.
+   *
+   * Note: You should be aware that
+   * {@link org.rocksdb.RocksObject#disOwnNativeHandle()} is intentionally
+   * called from the default DirectSlice constructor, and that it is marked as
+   * package-private. This is so that developers cannot construct their own default
+   * DirectSlice objects (at present). As developers cannot construct their own
+   * DirectSlice objects through this, they are not creating underlying C++
+   * DirectSlice objects, and so there is nothing to free (dispose) from Java.
+   */
+  DirectSlice() {
+    super();
+    disOwnNativeHandle();
+  }
+
+  /**
+   * Constructs a slice
+   * where the data is taken from
+   * a String.
+   *
+   * @param str The string
+   */
+  public DirectSlice(final String str) {
+    super();
+    createNewSliceFromString(str);
+  }
+
+  /**
+   * Constructs a slice where the data is
+   * read from the provided
+   * ByteBuffer up to a certain length
+   *
+   * @param data The buffer containing the data
+   * @param length The length of the data to use for the slice
+   */
+  public DirectSlice(final ByteBuffer data, final int length) {
+    super();
+    assert(data.isDirect());
+    createNewDirectSlice0(data, length);
+  }
+
+  /**
+   * Constructs a slice where the data is
+   * read from the provided
+   * ByteBuffer
+   *
+   * @param data The buffer containing the data
+   */
+  public DirectSlice(final ByteBuffer data) {
+    super();
+    assert(data.isDirect());
+    createNewDirectSlice1(data);
+  }
+
+  /**
+   * Retrieves the byte at a specific offset
+   * from the underlying data
+   *
+   * @param offset The (zero-based) offset of the byte to retrieve
+   *
+   * @return the requested byte
+   */
+  public byte get(int offset) {
+    assert (isInitialized());
+    return get0(nativeHandle_, offset);
+  }
+
+  /**
+   * Clears the backing slice
+   */
+  public void clear() {
+    assert (isInitialized());
+    clear0(nativeHandle_);
+  }
+
+  /**
+   * Drops the specified {@code n}
+   * number of bytes from the start
+   * of the backing slice
+   *
+   * @param n The number of bytes to drop
+   */
+  public void removePrefix(final int n) {
+    assert (isInitialized());
+    removePrefix0(nativeHandle_, n);
+  }
+
+  private native void createNewDirectSlice0(ByteBuffer data, int length);
+  private native void createNewDirectSlice1(ByteBuffer data);
+  @Override protected final native ByteBuffer data0(long handle);
+  private native byte get0(long handle, int offset);
+  private native void clear0(long handle);
+  private native void removePrefix0(long handle, int length);
+}
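
A minimal sketch of the direct-buffer requirement enforced by the asserts
above (illustration only; assumes a loaded JNI library):

    import java.nio.ByteBuffer;
    import org.rocksdb.DirectSlice;
    import org.rocksdb.RocksDB;

    public class DirectSliceSketch {
      static { RocksDB.loadLibrary(); }

      public static void main(final String[] args) {
        final ByteBuffer buf = ByteBuffer.allocateDirect(4); // heap buffers fail the assert
        buf.put(new byte[]{'k', 'e', 'y', '1'});
        final DirectSlice slice = new DirectSlice(buf, 4);
        System.out.println((char) slice.get(0)); // 'k'
        slice.removePrefix(3);                   // drop "key", leaving "1"
        slice.dispose();                         // free the native slice
      }
    }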
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/EncodingType.java b/src/rocksdb/java/src/main/java/org/rocksdb/EncodingType.java
new file mode 100644
index 0000000..d639542
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/EncodingType.java
@@ -0,0 +1,55 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+/**
+ * EncodingType
+ *
+ * <p>The value will determine how to encode keys
+ * when writing to a new SST file.</p>
+ *
+ * <p>This value will be stored
+ * inside the SST file which will be used when reading from
+ * the file, which makes it possible for users to choose
+ * different encoding type when reopening a DB. Files with
+ * different encoding types can co-exist in the same DB and
+ * can be read.</p>
+ */
+public enum EncodingType {
+  /**
+   * Always write full keys without any special encoding.
+   */
+  kPlain((byte) 0),
+  /**
+   * <p>Find opportunity to write the same prefix once for multiple rows.
+   * In some cases, when a key follows a previous key with the same prefix,
+   * instead of writing out the full key, it just writes out the size of the
+   * shared prefix, as well as other bytes, to save some bytes.</p>
+   *
+   * <p>When using this option, the user is required to use the same prefix
+   * extractor to make sure the same prefix will be extracted from the same key.
+   * The Name() value of the prefix extractor will be stored in the file. When
+   * reopening the file, the name of the options.prefix_extractor given will be
+   * bitwise compared to the prefix extractors stored in the file. An error
+   * will be returned if the two don't match.</p>
+   */
+  kPrefix((byte) 1);
+
+  /**
+   * Returns the byte value of the enumeration's value.
+   *
+   * @return byte representation
+   */
+  public byte getValue() {
+    return value_;
+  }
+
+  private EncodingType(byte value) {
+    value_ = value;
+  }
+
+  private final byte value_;
+}
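
As a hedged example of where this enum is consumed: kPrefix is typically
paired with the plain-table format and a prefix extractor (PlainTableConfig,
setTableFormatConfig and useFixedLengthPrefixExtractor are assumed from the
same Java API):

    import org.rocksdb.EncodingType;
    import org.rocksdb.Options;
    import org.rocksdb.PlainTableConfig;
    import org.rocksdb.RocksDB;

    public class EncodingTypeSketch {
      static { RocksDB.loadLibrary(); }

      public static void main(final String[] args) {
        final Options opts = new Options()
            .setCreateIfMissing(true)
            // kPrefix requires a stable prefix extractor (see the javadoc above)
            .useFixedLengthPrefixExtractor(8)
            .setTableFormatConfig(
                new PlainTableConfig().setEncodingType(EncodingType.kPrefix));
        opts.dispose();
      }
    }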
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/Env.java b/src/rocksdb/java/src/main/java/org/rocksdb/Env.java
new file mode 100644
index 0000000..929a394
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/Env.java
@@ -0,0 +1,92 @@
+// Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+/**
+ * Base class for all Env implementations in RocksDB.
+ */
+public abstract class Env extends RocksObject {
+  public static final int FLUSH_POOL = 0;
+  public static final int COMPACTION_POOL = 1;
+
+  /**
+   * <p>Returns the default environment suitable for the current operating
+   * system.</p>
+   *
+   * <p>The result of {@code getDefault()} is a singleton whose ownership
+   * belongs to rocksdb c++.  As a result, the returned RocksEnv will not
+   * have the ownership of its c++ resource, and calling its dispose()
+   * will be no-op.</p>
+   *
+   * @return the default {@link org.rocksdb.RocksEnv} instance.
+   */
+  public static Env getDefault() {
+    return default_env_;
+  }
+
+  /**
+   * <p>Sets the number of background worker threads of the flush pool
+   * for this environment.</p>
+   * <p>Default number: 1</p>
+   *
+   * @param num the number of threads
+   *
+   * @return current {@link RocksEnv} instance.
+   */
+  public Env setBackgroundThreads(final int num) {
+    return setBackgroundThreads(num, FLUSH_POOL);
+  }
+
+  /**
+   * <p>Sets the number of background worker threads of the specified thread
+   * pool for this environment.</p>
+   *
+   * @param num the number of threads
+   * @param poolID the id specifying a thread pool.  Should be either
+   *     FLUSH_POOL or COMPACTION_POOL.
+   *
+   * <p>Default number: 1</p>
+   * @return current {@link RocksEnv} instance.
+   */
+  public Env setBackgroundThreads(final int num, final int poolID) {
+    setBackgroundThreads(nativeHandle_, num, poolID);
+    return this;
+  }
+
+  /**
+   * <p>Returns the length of the queue associated with the specified
+   * thread pool.</p>
+   *
+   * @param poolID the id specifying a thread pool.  Should be either
+   *     FLUSH_POOL or COMPACTION_POOL.
+   *
+   * @return the thread pool queue length.
+   */
+  public int getThreadPoolQueueLen(final int poolID) {
+    return getThreadPoolQueueLen(nativeHandle_, poolID);
+  }
+
+  protected Env() {
+    super();
+  }
+
+  static {
+    default_env_ = new RocksEnv(getDefaultEnvInternal());
+  }
+
+  /**
+   * <p>The static default Env. The ownership of its native handle
+   * belongs to rocksdb c++ and is not able to be released on the Java
+   * side.</p>
+   */
+  static Env default_env_;
+
+  private static native long getDefaultEnvInternal();
+  private native void setBackgroundThreads(
+      long handle, int num, int priority);
+  private native int getThreadPoolQueueLen(long handle, int poolID);
+}
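
A short sketch of the thread-pool knobs above, using the shared default Env
(whose native handle, per the comment, is never released from Java):

    import org.rocksdb.Env;
    import org.rocksdb.RocksDB;

    public class EnvSketch {
      static { RocksDB.loadLibrary(); }

      public static void main(final String[] args) {
        final Env env = Env.getDefault();
        env.setBackgroundThreads(4, Env.COMPACTION_POOL); // grow the compaction pool
        env.setBackgroundThreads(2);                      // FLUSH_POOL by default
        System.out.println("compaction queue length: "
            + env.getThreadPoolQueueLen(Env.COMPACTION_POOL));
      }
    }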
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/Filter.java b/src/rocksdb/java/src/main/java/org/rocksdb/Filter.java
new file mode 100644
index 0000000..ce5c41f
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/Filter.java
@@ -0,0 +1,31 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+/**
+ * Filters are stored in rocksdb and are consulted automatically
+ * by rocksdb to decide whether or not to read some
+ * information from disk. In many cases, a filter can cut down the
+ * number of disk seeks from a handful to a single disk seek per
+ * DB::Get() call.
+ */
+public abstract class Filter extends RocksObject {
+  protected abstract void createNewFilter();
+
+  /**
+   * Deletes underlying C++ filter pointer.
+   *
+   * Note that this function should be called only after all
+   * RocksDB instances referencing the filter are closed.
+   * Otherwise an undefined behavior will occur.
+   */
+  @Override protected void disposeInternal() {
+    assert(isInitialized());
+    disposeInternal(nativeHandle_);
+  }
+
+  private native void disposeInternal(long handle);
+}
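
The usual concrete subclass is a Bloom filter; a hedged sketch wiring one
into the block-based table format (BloomFilter and BlockBasedTableConfig are
assumed from the same package):

    import org.rocksdb.BlockBasedTableConfig;
    import org.rocksdb.BloomFilter;
    import org.rocksdb.Options;
    import org.rocksdb.RocksDB;

    public class FilterSketch {
      static { RocksDB.loadLibrary(); }

      public static void main(final String[] args) {
        final BloomFilter bloom = new BloomFilter(10); // roughly 10 bits per key
        final Options opts = new Options().setTableFormatConfig(
            new BlockBasedTableConfig().setFilter(bloom));
        // per the javadoc above, dispose the filter only after every
        // DB instance referencing it has been closed
        opts.dispose();
      }
    }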
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/FlushOptions.java b/src/rocksdb/java/src/main/java/org/rocksdb/FlushOptions.java
new file mode 100644
index 0000000..9ddf95f
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/FlushOptions.java
@@ -0,0 +1,51 @@
+package org.rocksdb;
+
+/**
+ * FlushOptions to be passed to flush operations of
+ * {@link org.rocksdb.RocksDB}.
+ */
+public class FlushOptions extends RocksObject {
+
+  /**
+   * Construct a new instance of FlushOptions.
+   */
+  public FlushOptions() {
+    super();
+    newFlushOptions();
+  }
+
+  /**
+   * Set if the flush operation shall block until it terminates.
+   *
+   * @param waitForFlush boolean value indicating if the flush
+   *     operation waits for termination of the flush process.
+   *
+   * @return instance of current FlushOptions.
+   */
+  public FlushOptions setWaitForFlush(final boolean waitForFlush) {
+    assert(isInitialized());
+    setWaitForFlush(nativeHandle_, waitForFlush);
+    return this;
+  }
+
+  /**
+   * Wait for flush to finish.
+   *
+   * @return boolean value indicating if the flush operation
+   *     waits for termination of the flush process.
+   */
+  public boolean waitForFlush() {
+    assert(isInitialized());
+    return waitForFlush(nativeHandle_);
+  }
+
+  @Override protected void disposeInternal() {
+    disposeInternal(nativeHandle_);
+  }
+
+  private native void newFlushOptions();
+  private native void disposeInternal(long handle);
+  private native void setWaitForFlush(long handle,
+      boolean wait);
+  private native boolean waitForFlush(long handle);
+}
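
A minimal sketch of a blocking flush, assuming an open RocksDB handle and
the RocksDB.flush(FlushOptions) method from the same API:

    import org.rocksdb.FlushOptions;
    import org.rocksdb.RocksDB;
    import org.rocksdb.RocksDBException;

    public class FlushSketch {
      static void flushNow(final RocksDB db) throws RocksDBException {
        final FlushOptions fo = new FlushOptions().setWaitForFlush(true);
        try {
          db.flush(fo); // blocks until the memtable is persisted to an SST file
        } finally {
          fo.dispose();
        }
      }
    }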
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/GenericRateLimiterConfig.java b/src/rocksdb/java/src/main/java/org/rocksdb/GenericRateLimiterConfig.java
new file mode 100644
index 0000000..89951c5
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/GenericRateLimiterConfig.java
@@ -0,0 +1,66 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+package org.rocksdb;
+
+/**
+ * Config for rate limiter, which is used to control write rate of flush and
+ * compaction.
+ *
+ * @see RateLimiterConfig
+ */
+public class GenericRateLimiterConfig extends RateLimiterConfig {
+  private static final long DEFAULT_REFILL_PERIOD_MICROS = (100 * 1000);
+  private static final int DEFAULT_FAIRNESS = 10;
+
+  /**
+   * GenericRateLimiterConfig constructor
+   *
+   * @param rateBytesPerSecond this is the only parameter you want to set
+   *     most of the time. It controls the total write rate of compaction
+   *     and flush in bytes per second. Currently, RocksDB does not enforce
+   *     a rate limit for anything other than flush and compaction, e.g. writes to the WAL.
+   * @param refillPeriodMicros this controls how often tokens are refilled. For example,
+   *     when rate_bytes_per_sec is set to 10MB/s and refill_period_us is set to
+   *     100ms, then 1MB is refilled every 100ms internally. A larger value can lead to
+   *     burstier writes while a smaller value introduces more CPU overhead.
+   *     The default should work for most cases.
+   * @param fairness RateLimiter accepts high-pri requests and low-pri requests.
+   *     A low-pri request is usually blocked in favor of a high-pri request. Currently,
+   *     RocksDB assigns low-pri to requests from compaction and high-pri to requests
+   *     from flush. Low-pri requests can get blocked if flush requests come in
+   *     continuously. This fairness parameter grants low-pri requests permission by
+   *     fairness chance even though high-pri requests exist to avoid starvation.
+   *     You should be good leaving it at the default of 10.
+   */
+  public GenericRateLimiterConfig(final long rateBytesPerSecond,
+      final long refillPeriodMicros, final int fairness) {
+    rateBytesPerSecond_ = rateBytesPerSecond;
+    refillPeriodMicros_ = refillPeriodMicros;
+    fairness_ = fairness;
+  }
+
+  /**
+   * GenericRateLimiterConfig constructor
+   *
+   * @param rateBytesPerSecond this is the only parameter you want to set
+   *     most of the time. It controls the total write rate of compaction
+   *     and flush in bytes per second. Currently, RocksDB does not enforce
+   *     a rate limit for anything other than flush and compaction, e.g. writes to the WAL.
+   */
+  public GenericRateLimiterConfig(final long rateBytesPerSecond) {
+    this(rateBytesPerSecond, DEFAULT_REFILL_PERIOD_MICROS, DEFAULT_FAIRNESS);
+  }
+
+  @Override protected long newRateLimiterHandle() {
+    return newRateLimiterHandle(rateBytesPerSecond_, refillPeriodMicros_,
+        fairness_);
+  }
+
+  private native long newRateLimiterHandle(long rateBytesPerSecond,
+      long refillPeriodMicros, int fairness);
+  private final long rateBytesPerSecond_;
+  private final long refillPeriodMicros_;
+  private final int fairness_;
+}
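
Tying this back to DBOptions.setRateLimiterConfig() above, a sketch capping
flush and compaction writes at 32 MiB/s (illustration only):

    import org.rocksdb.DBOptions;
    import org.rocksdb.GenericRateLimiterConfig;
    import org.rocksdb.RocksDB;

    public class RateLimiterSketch {
      static { RocksDB.loadLibrary(); }

      public static void main(final String[] args) {
        final DBOptions opts = new DBOptions().setRateLimiterConfig(
            new GenericRateLimiterConfig(32L * 1024 * 1024)); // bytes per second
        opts.dispose();
      }
    }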
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/HashLinkedListMemTableConfig.java b/src/rocksdb/java/src/main/java/org/rocksdb/HashLinkedListMemTableConfig.java
new file mode 100644
index 0000000..d56c46c
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/HashLinkedListMemTableConfig.java
@@ -0,0 +1,173 @@
+package org.rocksdb;
+
+/**
+ * The config for hash linked-list memtable representation.
+ * Such a memtable contains a fixed-size array of buckets, where
+ * each bucket points to a sorted singly-linked
+ * list (or null if the bucket is empty).
+ *
+ * Note that since this mem-table representation relies on the
+ * key prefix, it is required to invoke one of the usePrefixExtractor
+ * functions to specify how to extract the key prefix from a key.
+ * If a proper prefix extractor is not set, then RocksDB will
+ * use the default memtable representation (SkipList) instead
+ * and post a warning in the LOG.
+ */
+public class HashLinkedListMemTableConfig extends MemTableConfig {
+  public static final long DEFAULT_BUCKET_COUNT = 50000;
+  public static final long DEFAULT_HUGE_PAGE_TLB_SIZE = 0;
+  public static final int DEFAULT_BUCKET_ENTRIES_LOG_THRES = 4096;
+  public static final boolean
+      DEFAULT_IF_LOG_BUCKET_DIST_WHEN_FLUSH = true;
+  public static final int DEFAULT_THRESHOLD_USE_SKIPLIST = 256;
+
+  /**
+   * HashLinkedListMemTableConfig constructor
+   */
+  public HashLinkedListMemTableConfig() {
+    bucketCount_ = DEFAULT_BUCKET_COUNT;
+    hugePageTlbSize_ = DEFAULT_HUGE_PAGE_TLB_SIZE;
+    bucketEntriesLoggingThreshold_ = DEFAULT_BUCKET_ENTRIES_LOG_THRES;
+    ifLogBucketDistWhenFlush_ = DEFAULT_IF_LOG_BUCKET_DIST_WHEN_FLUSH;
+    thresholdUseSkiplist_ = DEFAULT_THRESHOLD_USE_SKIPLIST;
+  }
+
+  /**
+   * Set the number of buckets in the fixed-size array used
+   * in the hash linked-list mem-table.
+   *
+   * @param count the number of hash buckets.
+   * @return the reference to the current HashLinkedListMemTableConfig.
+   */
+  public HashLinkedListMemTableConfig setBucketCount(
+      final long count) {
+    bucketCount_ = count;
+    return this;
+  }
+
+  /**
+   * Returns the number of buckets that will be used in the memtable
+   * created based on this config.
+   *
+   * @return the number of buckets
+   */
+  public long bucketCount() {
+    return bucketCount_;
+  }
+
+  /**
+   * <p>Set the size of the huge page TLB, or allocate the hash table
+   * from malloc if {@code size <= 0}.</p>
+   *
+   * <p>The user needs to reserve huge pages for it to be allocated,
+   * e.g.: {@code sysctl -w vm.nr_hugepages=20}</p>
+   *
+   * <p>See the Linux kernel's Documentation/vm/hugetlbpage.txt.</p>
+   *
+   * @param size the size of the huge page TLB; if {@code size <= 0},
+   *     the hash table bytes are allocated from malloc
+   * @return the reference to the current HashLinkedListMemTableConfig.
+   */
+  public HashLinkedListMemTableConfig setHugePageTlbSize(
+      final long size) {
+    hugePageTlbSize_ = size;
+    return this;
+  }
+
+  /**
+   * Returns the size value of hugePageTlbSize.
+   *
+   * @return the hugePageTlbSize.
+   */
+  public long hugePageTlbSize() {
+    return hugePageTlbSize_;
+  }
+
+  /**
+   * If the number of entries in one bucket exceeds this threshold, log
+   * about it.
+   *
+   * @param threshold number of entries in a single bucket before
+   *     logging starts.
+   * @return the reference to the current HashLinkedListMemTableConfig.
+   */
+  public HashLinkedListMemTableConfig
+      setBucketEntriesLoggingThreshold(final int threshold) {
+    bucketEntriesLoggingThreshold_ = threshold;
+    return this;
+  }
+
+  /**
+   * Returns the maximum number of entries in one bucket before
+   * logging starts.
+   *
+   * @return maximum number of entries in one bucket before logging
+   *     starts.
+   */
+  public int bucketEntriesLoggingThreshold() {
+    return bucketEntriesLoggingThreshold_;
+  }
+
+  /**
+   * If true, the distribution of the number of entries will be logged.
+   *
+   * @param logDistribution boolean parameter indicating whether the
+   *     distribution of the number of entries shall be logged.
+   * @return the reference to the current HashLinkedListMemTableConfig.
+   */
+  public HashLinkedListMemTableConfig
+      setIfLogBucketDistWhenFlush(final boolean logDistribution) {
+    ifLogBucketDistWhenFlush_ = logDistribution;
+    return this;
+  }
+
+  /**
+   * Returns information about logging the distribution of the
+   * number of entries on flush.
+   *
+   * @return whether the distribution of the number of entries shall be logged.
+   */
+  public boolean ifLogBucketDistWhenFlush() {
+    return ifLogBucketDistWhenFlush_;
+  }
+
+  /**
+   * Set the maximum number of entries in one bucket. Exceeding this
+   * value leads to a switch from LinkedList to SkipList.
+   *
+   * @param threshold maximum number of entries before SkipList is
+   *     used.
+   * @return the reference to the current HashLinkedListMemTableConfig.
+   */
+  public HashLinkedListMemTableConfig
+      setThresholdUseSkiplist(final int threshold) {
+    thresholdUseSkiplist_ = threshold;
+    return this;
+  }
+
+  /**
+   * Returns entries per bucket threshold before LinkedList is
+   * replaced by SkipList usage for that bucket.
+   *
+   * @return entries per bucket threshold before SkipList is used.
+   */
+  public int thresholdUseSkiplist() {
+    return thresholdUseSkiplist_;
+  }
+
+  @Override protected long newMemTableFactoryHandle() {
+    return newMemTableFactoryHandle(bucketCount_, hugePageTlbSize_,
+        bucketEntriesLoggingThreshold_, ifLogBucketDistWhenFlush_,
+        thresholdUseSkiplist_);
+  }
+
+  private native long newMemTableFactoryHandle(long bucketCount,
+      long hugePageTlbSize, int bucketEntriesLoggingThreshold,
+      boolean ifLogBucketDistWhenFlush, int thresholdUseSkiplist)
+      throws IllegalArgumentException;
+
+  private long bucketCount_;
+  private long hugePageTlbSize_;
+  private int bucketEntriesLoggingThreshold_;
+  private boolean ifLogBucketDistWhenFlush_;
+  private int thresholdUseSkiplist_;
+}
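
Per the class comment, this memtable only takes effect when a prefix
extractor is configured. A minimal sketch using the Options methods defined
later in this import (the bucket count and prefix length are illustrative
values):

    import org.rocksdb.HashLinkedListMemTableConfig;
    import org.rocksdb.Options;

    final class HashLinkedListExample {
      static Options hashLinkedListOptions() {
        return new Options()
            // Hash linked-list memtable with an illustrative bucket count.
            .setMemTableConfig(
                new HashLinkedListMemTableConfig().setBucketCount(100000))
            // Required: without a prefix extractor RocksDB silently falls
            // back to the default SkipList memtable.
            .useFixedLengthPrefixExtractor(8);
      }
    }
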
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/HashSkipListMemTableConfig.java b/src/rocksdb/java/src/main/java/org/rocksdb/HashSkipListMemTableConfig.java
new file mode 100644
index 0000000..fe1779b
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/HashSkipListMemTableConfig.java
@@ -0,0 +1,105 @@
+package org.rocksdb;
+
+/**
+ * The config for hash skip-list mem-table representation.
+ * Such a mem-table representation contains a fixed-size array of
+ * buckets, where each bucket points to a skiplist (or null if the
+ * bucket is empty).
+ *
+ * Note that since this mem-table representation relies on the
+ * key prefix, it is required to invoke one of the usePrefixExtractor
+ * functions to specify how to extract the key prefix from a key.
+ * If a proper prefix extractor is not set, then RocksDB will
+ * use the default memtable representation (SkipList) instead
+ * and post a warning in the LOG.
+ */
+public class HashSkipListMemTableConfig extends MemTableConfig {
+  public static final int DEFAULT_BUCKET_COUNT = 1000000;
+  public static final int DEFAULT_BRANCHING_FACTOR = 4;
+  public static final int DEFAULT_HEIGHT = 4;
+
+  /**
+   * HashSkipListMemTableConfig constructor
+   */
+  public HashSkipListMemTableConfig() {
+    bucketCount_ = DEFAULT_BUCKET_COUNT;
+    branchingFactor_ = DEFAULT_BRANCHING_FACTOR;
+    height_ = DEFAULT_HEIGHT;
+  }
+
+  /**
+   * Set the number of hash buckets used in the hash skiplist memtable.
+   * Default = 1000000.
+   *
+   * @param count the number of hash buckets used in the hash
+   *    skiplist memtable.
+   * @return the reference to the current HashSkipListMemTableConfig.
+   */
+  public HashSkipListMemTableConfig setBucketCount(
+      final long count) {
+    bucketCount_ = count;
+    return this;
+  }
+
+  /**
+   * @return the number of hash buckets
+   */
+  public long bucketCount() {
+    return bucketCount_;
+  }
+
+  /**
+   * Set the height of the skip list.  Default = 4.
+   *
+   * @param height height to set.
+   *
+   * @return the reference to the current HashSkipListMemTableConfig.
+   */
+  public HashSkipListMemTableConfig setHeight(final int height) {
+    height_ = height;
+    return this;
+  }
+
+  /**
+   * @return the height of the skip list.
+   */
+  public int height() {
+    return height_;
+  }
+
+  /**
+   * Set the branching factor used in the hash skip-list memtable.
+   * This factor controls the probabilistic size ratio between adjacent
+   * links in the skip list.
+   *
+   * @param bf the probabilistic size ratio between adjacent links
+   *     in the skip list.
+   * @return the reference to the current HashSkipListMemTableConfig.
+   */
+  public HashSkipListMemTableConfig setBranchingFactor(
+      final int bf) {
+    branchingFactor_ = bf;
+    return this;
+  }
+
+  /**
+   * @return branching factor, the probabilistic size ratio between
+   *     adjacent links in the skip list.
+   */
+  public int branchingFactor() {
+    return branchingFactor_;
+  }
+
+  @Override protected long newMemTableFactoryHandle() {
+    return newMemTableFactoryHandle(
+        bucketCount_, height_, branchingFactor_);
+  }
+
+  private native long newMemTableFactoryHandle(
+      long bucketCount, int height, int branchingFactor)
+      throws IllegalArgumentException;
+
+  private long bucketCount_;
+  private int branchingFactor_;
+  private int height_;
+}
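
The same caveat applies here: a prefix extractor is mandatory for this
memtable to be used. A minimal sketch, this time with the capped prefix
extractor from this import (all numeric values are illustrative):

    import org.rocksdb.HashSkipListMemTableConfig;
    import org.rocksdb.Options;

    final class HashSkipListExample {
      static Options hashSkipListOptions() {
        return new Options()
            .setMemTableConfig(new HashSkipListMemTableConfig()
                .setBucketCount(1000000)
                .setHeight(4)
                .setBranchingFactor(4))
            .useCappedPrefixExtractor(8);
      }
    }
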
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/HistogramData.java b/src/rocksdb/java/src/main/java/org/rocksdb/HistogramData.java
new file mode 100644
index 0000000..020a9c9
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/HistogramData.java
@@ -0,0 +1,44 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+public class HistogramData {
+  private final double median_;
+  private final double percentile95_;
+  private final double percentile99_;
+  private final double average_;
+  private final double standardDeviation_;
+
+  public HistogramData(final double median, final double percentile95,
+      final double percentile99, final double average,
+      final double standardDeviation) {
+    median_ = median;
+    percentile95_ = percentile95;
+    percentile99_ = percentile99;
+    average_ = average;
+    standardDeviation_ = standardDeviation;
+  }
+
+  public double getMedian() {
+    return median_;
+  }
+
+  public double getPercentile95() {
+    return percentile95_;
+  }
+
+  public double getPercentile99() {
+    return percentile99_;
+  }
+
+  public double getAverage() {
+    return average_;
+  }
+
+  public double getStandardDeviation() {
+    return standardDeviation_;
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/HistogramType.java b/src/rocksdb/java/src/main/java/org/rocksdb/HistogramType.java
new file mode 100644
index 0000000..9b45481
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/HistogramType.java
@@ -0,0 +1,40 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+public enum HistogramType {
+  DB_GET(0),
+  DB_WRITE(1),
+  COMPACTION_TIME(2),
+  TABLE_SYNC_MICROS(3),
+  COMPACTION_OUTFILE_SYNC_MICROS(4),
+  WAL_FILE_SYNC_MICROS(5),
+  MANIFEST_FILE_SYNC_MICROS(6),
+  // Time spent in IO during table open
+  TABLE_OPEN_IO_MICROS(7),
+  DB_MULTIGET(8),
+  READ_BLOCK_COMPACTION_MICROS(9),
+  READ_BLOCK_GET_MICROS(10),
+  WRITE_RAW_BLOCK_MICROS(11),
+  STALL_L0_SLOWDOWN_COUNT(12),
+  STALL_MEMTABLE_COMPACTION_COUNT(13),
+  STALL_L0_NUM_FILES_COUNT(14),
+  HARD_RATE_LIMIT_DELAY_COUNT(15),
+  SOFT_RATE_LIMIT_DELAY_COUNT(16),
+  NUM_FILES_IN_SINGLE_COMPACTION(17),
+  DB_SEEK(18),
+  WRITE_STALL(19);
+
+  private final int value_;
+
+  private HistogramType(int value) {
+    value_ = value;
+  }
+
+  public int getValue() {
+    return value_;
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/IndexType.java b/src/rocksdb/java/src/main/java/org/rocksdb/IndexType.java
new file mode 100644
index 0000000..f3c1045
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/IndexType.java
@@ -0,0 +1,37 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+/**
+ * IndexType used in conjunction with BlockBasedTable.
+ */
+public enum IndexType {
+  /**
+   * A space efficient index block that is optimized for
+   * binary-search-based index.
+   */
+  kBinarySearch((byte) 0),
+  /**
+   * The hash index, if enabled, will do the hash lookup when
+   * {@code Options.prefix_extractor} is provided.
+   */
+  kHashSearch((byte) 1);
+
+  /**
+   * Returns the byte value of the enumeration value
+   *
+   * @return byte representation
+   */
+  public byte getValue() {
+    return value_;
+  }
+
+  private IndexType(byte value) {
+    value_ = value;
+  }
+
+  private final byte value_;
+}
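
A short sketch of selecting the hash index. It assumes the
BlockBasedTableConfig class from this same import (not shown in this excerpt)
exposes a setIndexType(IndexType) setter, as in upstream RocksJava; per the
kHashSearch comment, a prefix extractor must also be configured.

    import org.rocksdb.BlockBasedTableConfig;
    import org.rocksdb.IndexType;
    import org.rocksdb.Options;

    final class IndexTypeExample {
      static Options hashIndexOptions() {
        return new Options()
            .setTableFormatConfig(new BlockBasedTableConfig()
                .setIndexType(IndexType.kHashSearch))
            // kHashSearch does hash lookups on the key prefix, so an
            // extractor is needed (length 8 is an illustrative value).
            .useFixedLengthPrefixExtractor(8);
      }
    }
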
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/InfoLogLevel.java b/src/rocksdb/java/src/main/java/org/rocksdb/InfoLogLevel.java
new file mode 100644
index 0000000..e67063c
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/InfoLogLevel.java
@@ -0,0 +1,47 @@
+package org.rocksdb;
+
+/**
+ * RocksDB log levels.
+ */
+public enum InfoLogLevel {
+  DEBUG_LEVEL((byte)0),
+  INFO_LEVEL((byte)1),
+  WARN_LEVEL((byte)2),
+  ERROR_LEVEL((byte)3),
+  FATAL_LEVEL((byte)4),
+  NUM_INFO_LOG_LEVELS((byte)5);
+
+  private final byte value_;
+
+  private InfoLogLevel(byte value) {
+    value_ = value;
+  }
+
+  /**
+   * Returns the byte value of the enumeration value
+   *
+   * @return byte representation
+   */
+  public byte getValue() {
+    return value_;
+  }
+
+  /**
+   * Get InfoLogLevel by byte value.
+   *
+   * @param value byte representation of InfoLogLevel.
+   *
+   * @return {@link org.rocksdb.InfoLogLevel} instance.
+   * @throws java.lang.IllegalArgumentException if an invalid
+   *     value is provided.
+   */
+  public static InfoLogLevel getInfoLogLevel(byte value) {
+    for (InfoLogLevel infoLogLevel : InfoLogLevel.values()) {
+      if (infoLogLevel.getValue() == value){
+        return infoLogLevel;
+      }
+    }
+    throw new IllegalArgumentException(
+        "Illegal value provided for InfoLogLevel.");
+  }
+}
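
The level is applied through Options.setInfoLogLevel(), defined later in this
import. A one-line sketch:

    import org.rocksdb.InfoLogLevel;
    import org.rocksdb.Options;

    final class LogLevelExample {
      static Options quietOptions() {
        // Suppress DEBUG/INFO output; see the Logger class below for why
        // verbose levels are costly in production.
        return new Options().setInfoLogLevel(InfoLogLevel.WARN_LEVEL);
      }
    }
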
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/Logger.java b/src/rocksdb/java/src/main/java/org/rocksdb/Logger.java
new file mode 100644
index 0000000..05c53b5
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/Logger.java
@@ -0,0 +1,108 @@
+// Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+/**
+ * <p>This class provides a custom logger functionality
+ * in Java which wraps {@code RocksDB} logging facilities.
+ * </p>
+ *
+ * <p>Using this class, RocksDB can log through common
+ * Java logging APIs such as Log4j or SLF4J without keeping
+ * database logs in the filesystem.</p>
+ *
+ * <strong>Performance</strong>
+ * <p>There are performance penalties to using a Java
+ * {@code Logger} implementation within production code.
+ * </p>
+ *
+ * <p>
+ * A log level can be set using {@link org.rocksdb.Options} or
+ * {@link Logger#setInfoLogLevel(InfoLogLevel)}. The set log level
+ * influences the underlying native code. Each log message is
+ * checked against the configured log level, and only messages that
+ * pass that check cause native allocations to be made and data
+ * structures to be created.
+ * </p>
+ *
+ * <p>Every log message emitted by native code triggers an
+ * expensive native-to-Java transition, so the preferred
+ * setting for production use is either
+ * {@link org.rocksdb.InfoLogLevel#ERROR_LEVEL} or
+ * {@link org.rocksdb.InfoLogLevel#FATAL_LEVEL}.
+ * </p>
+ */
+public abstract class Logger extends RocksObject {
+
+  /**
+   * <p>Logger constructor.</p>
+   *
+   * <p><strong>Important:</strong> the log level set within
+   * the {@link org.rocksdb.Options} instance will be used as
+   * maximum log level of RocksDB.</p>
+   *
+   * @param options {@link org.rocksdb.Options} instance.
+   */
+  public Logger(final Options options) {
+    createNewLoggerOptions(options.nativeHandle_);
+  }
+
+  /**
+   * <p>Logger constructor.</p>
+   *
+   * <p><strong>Important:</strong> the log level set within
+   * the {@link org.rocksdb.DBOptions} instance will be used
+   * as maximum log level of RocksDB.</p>
+   *
+   * @param dboptions {@link org.rocksdb.DBOptions} instance.
+   */
+  public Logger(final DBOptions dboptions) {
+    createNewLoggerDbOptions(dboptions.nativeHandle_);
+  }
+
+  /**
+   * Set the {@link org.rocksdb.InfoLogLevel} of this Logger.
+   *
+   * @param infoLogLevel {@link org.rocksdb.InfoLogLevel} instance.
+   */
+  public void setInfoLogLevel(final InfoLogLevel infoLogLevel) {
+    setInfoLogLevel(nativeHandle_, infoLogLevel.getValue());
+  }
+
+  /**
+   * Return the logger's log level.
+   *
+   * @return {@link org.rocksdb.InfoLogLevel} instance.
+   */
+  public InfoLogLevel infoLogLevel() {
+    return InfoLogLevel.getInfoLogLevel(
+        infoLogLevel(nativeHandle_));
+  }
+
+  protected abstract void log(InfoLogLevel infoLogLevel,
+      String logMsg);
+
+  /**
+   * Deletes the underlying C++ logger pointer.
+   * Note that this function should be called only after all
+   * RocksDB instances referencing the logger are closed.
+   * Otherwise undefined behavior will occur.
+   */
+  @Override
+  protected void disposeInternal() {
+    assert(isInitialized());
+    disposeInternal(nativeHandle_);
+  }
+
+  protected native void createNewLoggerOptions(
+      long options);
+  protected native void createNewLoggerDbOptions(
+      long dbOptions);
+  protected native void setInfoLogLevel(long handle,
+      byte infoLogLevel);
+  protected native byte infoLogLevel(long handle);
+  private native void disposeInternal(long handle);
+}
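
A minimal sketch of a concrete subclass, routing native log messages to a
Java logging API (System.err stands in for Log4j/SLF4J here); it is
registered via Options.setLogger() from this import:

    import org.rocksdb.InfoLogLevel;
    import org.rocksdb.Logger;
    import org.rocksdb.Options;

    final class StdErrLogger extends Logger {
      StdErrLogger(final Options options) {
        super(options);
      }

      @Override
      protected void log(final InfoLogLevel infoLogLevel,
          final String logMsg) {
        // Forward each native log line to the Java side.
        System.err.println("[" + infoLogLevel + "] " + logMsg);
      }
    }

Usage: options.setLogger(new StdErrLogger(options));
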
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/MemTableConfig.java b/src/rocksdb/java/src/main/java/org/rocksdb/MemTableConfig.java
new file mode 100644
index 0000000..7c34826
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/MemTableConfig.java
@@ -0,0 +1,29 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+package org.rocksdb;
+
+/**
+ * MemTableConfig is used to configure the internal mem-table of a RocksDB
+ * instance. Each memtable representation has one such sub-class so that
+ * Java developers can use it.
+ *
+ * To make RocksDB use a specific MemTable format, the associated
+ * MemTableConfig should be properly set and passed into Options
+ * via Options.setMemTableConfig(), and the db opened using that Options.
+ *
+ * @see Options
+ */
+public abstract class MemTableConfig {
+  /**
+   * This function should only be called by Options.setMemTableConfig(),
+   * which will create a C++ shared-pointer to the C++ MemTableRepFactory
+   * that is associated with the Java MemTableConfig.
+   *
+   * @see Options#setMemTableConfig(MemTableConfig)
+   *
+   * @return native handle address to native memory table instance.
+   */
+  protected abstract long newMemTableFactoryHandle();
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/MergeOperator.java b/src/rocksdb/java/src/main/java/org/rocksdb/MergeOperator.java
new file mode 100644
index 0000000..3abea02
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/MergeOperator.java
@@ -0,0 +1,15 @@
+// Copyright (c) 2014, Vlad Balan (vlad.gm at gmail.com).  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+/**
+ * MergeOperator holds an operator to be applied when compacting
+ * two merge operands held under the same key in order to obtain a single
+ * value.
+ */
+public interface MergeOperator {
+    long newMergeOperatorHandle();
+}
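
Implementations hand a native operator handle to Options.setMergeOperator();
alternatively, Options.setMergeOperatorName() (both appear later in this
import) selects a built-in operator by name. A sketch, assuming the
"uint64add" operator name is registered on the native side as in upstream
RocksDB:

    import org.rocksdb.Options;

    final class MergeOperatorExample {
      static Options counterOptions() {
        // "uint64add" merges values by 64-bit addition, which makes
        // merge() usable as an atomic counter increment.
        return new Options().setMergeOperatorName("uint64add");
      }
    }
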
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/NativeLibraryLoader.java b/src/rocksdb/java/src/main/java/org/rocksdb/NativeLibraryLoader.java
new file mode 100644
index 0000000..dca9b31
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/NativeLibraryLoader.java
@@ -0,0 +1,114 @@
+package org.rocksdb;
+
+import java.io.*;
+import java.nio.file.Files;
+import java.nio.file.StandardCopyOption;
+
+import org.rocksdb.util.Environment;
+
+/**
+ * This class is used to load the RocksDB shared library from within the jar.
+ * The shared library is extracted to a temp folder and loaded from there.
+ */
+public class NativeLibraryLoader {
+  //singleton
+  private static final NativeLibraryLoader instance = new NativeLibraryLoader();
+  private static boolean initialized = false;
+
+  private static final String sharedLibraryName = Environment.getSharedLibraryName("rocksdb");
+  private static final String jniLibraryName = Environment.getJniLibraryName("rocksdb");
+  private static final String jniLibraryFileName = Environment.getJniLibraryFileName("rocksdb");
+  private static final String tempFilePrefix = "librocksdbjni";
+  private static final String tempFileSuffix = "." + Environment.getJniLibraryExtension();
+
+  /**
+   * Get a reference to the NativeLibraryLoader
+   *
+   * @return The NativeLibraryLoader
+   */
+  public static NativeLibraryLoader getInstance() {
+    return instance;
+  }
+
+  /**
+   * First attempts to load the library from <i>java.library.path</i>;
+   * if that fails, it falls back to extracting
+   * the library from the classpath using
+   * {@link org.rocksdb.NativeLibraryLoader#loadLibraryFromJar(java.lang.String)}.
+   *
+   * @param tmpDir A temporary directory to use
+   *   to copy the native library to when loading from the classpath.
+   *   If null, or the empty string, we rely on Java's
+   *   {@link java.io.File#createTempFile(String, String)}
+   *   function to provide a temporary location.
+   *   The temporary file will be registered for deletion
+   *   on exit.
+   *
+   * @throws java.io.IOException if a filesystem operation fails.
+   */
+  public synchronized void loadLibrary(final String tmpDir) throws IOException {
+    try {
+        System.loadLibrary(sharedLibraryName);
+    } catch(final UnsatisfiedLinkError ule1) {
+      try {
+        System.loadLibrary(jniLibraryName);
+      } catch(final UnsatisfiedLinkError ule2) {
+        loadLibraryFromJar(tmpDir);
+      }
+    }
+  }
+
+  /**
+   * Attempts to extract the native RocksDB library
+   * from the classpath and load it
+   *
+   * @param tmpDir A temporary directory to use
+   *   to copy the native library to. If null,
+   *   or the empty string, we rely on Java's
+   *   {@link java.io.File#createTempFile(String, String)}
+   *   function to provide a temporary location.
+   *   The temporary file will be registered for deletion
+   *   on exit.
+   *
+   * @throws java.io.IOException if a filesystem operation fails.
+   */
+  void loadLibraryFromJar(final String tmpDir)
+      throws IOException {
+    if (!initialized) {
+      final File temp;
+      if (tmpDir == null || tmpDir.equals("")) {
+        temp = File.createTempFile(tempFilePrefix, tempFileSuffix);
+      } else {
+        temp = new File(tmpDir, jniLibraryFileName);
+        if (!temp.createNewFile()) {
+          throw new RuntimeException("File: " + temp.getAbsolutePath()
+              + " could not be created.");
+        }
+      }
+
+      if (!temp.exists()) {
+        throw new RuntimeException("File " + temp.getAbsolutePath() + " does not exist.");
+      } else {
+        temp.deleteOnExit();
+      }
+
+      // attempt to copy the library from the Jar file to the temp destination
+      try (final InputStream is = getClass().getClassLoader().
+          getResourceAsStream(jniLibraryFileName)) {
+        if (is == null) {
+          throw new RuntimeException(jniLibraryFileName + " was not found inside JAR.");
+        } else {
+          Files.copy(is, temp.toPath(), StandardCopyOption.REPLACE_EXISTING);
+        }
+      }
+
+      System.load(temp.getAbsolutePath());
+      initialized = true;
+    }
+  }
+  /**
+   * Private constructor to disallow instantiation
+   */
+  private NativeLibraryLoader() {
+  }
+}
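
Callers normally do not use this class directly; the Options class below
calls RocksDB.loadLibrary() in its static initializer, which in upstream
RocksJava delegates to this loader. For completeness, a direct-use sketch
grounded in the API above:

    import java.io.IOException;
    import org.rocksdb.NativeLibraryLoader;

    final class LoaderExample {
      static void load() throws IOException {
        // Passing null lets File.createTempFile() pick the location; the
        // extracted library is registered for deletion on JVM exit.
        NativeLibraryLoader.getInstance().loadLibrary(null);
      }
    }
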
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/Options.java b/src/rocksdb/java/src/main/java/org/rocksdb/Options.java
new file mode 100644
index 0000000..771de0a
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/Options.java
@@ -0,0 +1,1328 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Options to control the behavior of a database.  It will be used
+ * during the creation of a {@link org.rocksdb.RocksDB} (i.e., RocksDB.open()).
+ *
+ * If the {@link #dispose()} function is not called, then it will be GC'd
+ * automatically and native resources will be released as part of the process.
+ */
+public class Options extends RocksObject
+    implements DBOptionsInterface, ColumnFamilyOptionsInterface {
+  static {
+    RocksDB.loadLibrary();
+  }
+  /**
+   * Construct options for opening a RocksDB.
+   *
+   * This constructor will create (by allocating a block of memory)
+   * a {@code rocksdb::Options} on the C++ side.
+   */
+  public Options() {
+    super();
+    newOptions();
+    env_ = Env.getDefault();
+  }
+
+  /**
+   * Construct options for opening a RocksDB, reusing database options
+   * and column family options.
+   *
+   * @param dbOptions {@link org.rocksdb.DBOptions} instance
+   * @param columnFamilyOptions {@link org.rocksdb.ColumnFamilyOptions}
+   *     instance
+   */
+  public Options(final DBOptions dbOptions,
+      final ColumnFamilyOptions columnFamilyOptions) {
+    super();
+    newOptions(dbOptions.nativeHandle_, columnFamilyOptions.nativeHandle_);
+    env_ = Env.getDefault();
+  }
+
+  @Override
+  public Options setIncreaseParallelism(final int totalThreads) {
+    assert(isInitialized());
+    setIncreaseParallelism(nativeHandle_, totalThreads);
+    return this;
+  }
+
+  @Override
+  public Options setCreateIfMissing(final boolean flag) {
+    assert(isInitialized());
+    setCreateIfMissing(nativeHandle_, flag);
+    return this;
+  }
+
+  @Override
+  public Options setCreateMissingColumnFamilies(final boolean flag) {
+    assert(isInitialized());
+    setCreateMissingColumnFamilies(nativeHandle_, flag);
+    return this;
+  }
+
+  /**
+   * Use the specified object to interact with the environment,
+   * e.g. to read/write files, schedule background work, etc.
+   * Default: {@link Env#getDefault()}
+   *
+   * @param env {@link Env} instance.
+   * @return the instance of the current Options.
+   */
+  public Options setEnv(final Env env) {
+    assert(isInitialized());
+    setEnv(nativeHandle_, env.nativeHandle_);
+    env_ = env;
+    return this;
+  }
+
+  /**
+   * Returns the {@link Env} instance set in these Options.
+   *
+   * @return {@link Env} instance set in the Options.
+   */
+  public Env getEnv() {
+    return env_;
+  }
+
+  /**
+   * <p>Set appropriate parameters for bulk loading.
+   * The reason that this is a function that returns "this" instead of a
+   * constructor is to enable chaining of multiple similar calls in the future.
+   * </p>
+   *
+   * <p>All data will be in level 0 without any automatic compaction.
+   * It's recommended to manually call CompactRange(NULL, NULL) before reading
+   * from the database, because otherwise the read can be very slow.</p>
+   *
+   * @return the instance of the current Options.
+   */
+  public Options prepareForBulkLoad() {
+    prepareForBulkLoad(nativeHandle_);
+    return this;
+  }
+
+  @Override
+  public boolean createIfMissing() {
+    assert(isInitialized());
+    return createIfMissing(nativeHandle_);
+  }
+
+  @Override
+  public boolean createMissingColumnFamilies() {
+    assert(isInitialized());
+    return createMissingColumnFamilies(nativeHandle_);
+  }
+
+  @Override
+  public Options optimizeForPointLookup(
+      long blockCacheSizeMb) {
+    optimizeForPointLookup(nativeHandle_,
+        blockCacheSizeMb);
+    return this;
+  }
+
+  @Override
+  public Options optimizeLevelStyleCompaction() {
+    optimizeLevelStyleCompaction(nativeHandle_,
+        DEFAULT_COMPACTION_MEMTABLE_MEMORY_BUDGET);
+    return this;
+  }
+
+  @Override
+  public Options optimizeLevelStyleCompaction(
+      long memtableMemoryBudget) {
+    optimizeLevelStyleCompaction(nativeHandle_,
+        memtableMemoryBudget);
+    return this;
+  }
+
+  @Override
+  public Options optimizeUniversalStyleCompaction() {
+    optimizeUniversalStyleCompaction(nativeHandle_,
+        DEFAULT_COMPACTION_MEMTABLE_MEMORY_BUDGET);
+    return this;
+  }
+
+  @Override
+  public Options optimizeUniversalStyleCompaction(
+      final long memtableMemoryBudget) {
+    optimizeUniversalStyleCompaction(nativeHandle_,
+        memtableMemoryBudget);
+    return this;
+  }
+
+  @Override
+  public Options setComparator(final BuiltinComparator builtinComparator) {
+    assert(isInitialized());
+    setComparatorHandle(nativeHandle_, builtinComparator.ordinal());
+    return this;
+  }
+
+  @Override
+  public Options setComparator(
+      final AbstractComparator<? extends AbstractSlice<?>> comparator) {
+    assert (isInitialized());
+    setComparatorHandle(nativeHandle_, comparator.nativeHandle_);
+    comparator_ = comparator;
+    return this;
+  }
+
+  @Override
+  public Options setMergeOperatorName(final String name) {
+    assert (isInitialized());
+    if (name == null) {
+      throw new IllegalArgumentException(
+          "Merge operator name must not be null.");
+    }
+    setMergeOperatorName(nativeHandle_, name);
+    return this;
+  }
+
+  @Override
+  public Options setMergeOperator(final MergeOperator mergeOperator) {
+    setMergeOperator(nativeHandle_, mergeOperator.newMergeOperatorHandle());
+    return this;
+  }
+
+  @Override
+  public Options setWriteBufferSize(final long writeBufferSize) {
+    assert(isInitialized());
+    setWriteBufferSize(nativeHandle_, writeBufferSize);
+    return this;
+  }
+
+  @Override
+  public long writeBufferSize()  {
+    assert(isInitialized());
+    return writeBufferSize(nativeHandle_);
+  }
+
+  @Override
+  public Options setMaxWriteBufferNumber(final int maxWriteBufferNumber) {
+    assert(isInitialized());
+    setMaxWriteBufferNumber(nativeHandle_, maxWriteBufferNumber);
+    return this;
+  }
+
+  @Override
+  public int maxWriteBufferNumber() {
+    assert(isInitialized());
+    return maxWriteBufferNumber(nativeHandle_);
+  }
+
+  @Override
+  public boolean errorIfExists() {
+    assert(isInitialized());
+    return errorIfExists(nativeHandle_);
+  }
+
+  @Override
+  public Options setErrorIfExists(final boolean errorIfExists) {
+    assert(isInitialized());
+    setErrorIfExists(nativeHandle_, errorIfExists);
+    return this;
+  }
+
+  @Override
+  public boolean paranoidChecks() {
+    assert(isInitialized());
+    return paranoidChecks(nativeHandle_);
+  }
+
+  @Override
+  public Options setParanoidChecks(final boolean paranoidChecks) {
+    assert(isInitialized());
+    setParanoidChecks(nativeHandle_, paranoidChecks);
+    return this;
+  }
+
+  @Override
+  public int maxOpenFiles() {
+    assert(isInitialized());
+    return maxOpenFiles(nativeHandle_);
+  }
+
+  @Override
+  public Options setMaxTotalWalSize(final long maxTotalWalSize) {
+    assert(isInitialized());
+    setMaxTotalWalSize(nativeHandle_, maxTotalWalSize);
+    return this;
+  }
+
+  @Override
+  public long maxTotalWalSize() {
+    assert(isInitialized());
+    return maxTotalWalSize(nativeHandle_);
+  }
+
+  @Override
+  public Options setMaxOpenFiles(final int maxOpenFiles) {
+    assert(isInitialized());
+    setMaxOpenFiles(nativeHandle_, maxOpenFiles);
+    return this;
+  }
+
+  @Override
+  public boolean disableDataSync() {
+    assert(isInitialized());
+    return disableDataSync(nativeHandle_);
+  }
+
+  @Override
+  public Options setDisableDataSync(final boolean disableDataSync) {
+    assert(isInitialized());
+    setDisableDataSync(nativeHandle_, disableDataSync);
+    return this;
+  }
+
+  @Override
+  public boolean useFsync() {
+    assert(isInitialized());
+    return useFsync(nativeHandle_);
+  }
+
+  @Override
+  public Options setUseFsync(final boolean useFsync) {
+    assert(isInitialized());
+    setUseFsync(nativeHandle_, useFsync);
+    return this;
+  }
+
+  @Override
+  public String dbLogDir() {
+    assert(isInitialized());
+    return dbLogDir(nativeHandle_);
+  }
+
+  @Override
+  public Options setDbLogDir(final String dbLogDir) {
+    assert(isInitialized());
+    setDbLogDir(nativeHandle_, dbLogDir);
+    return this;
+  }
+
+  @Override
+  public String walDir() {
+    assert(isInitialized());
+    return walDir(nativeHandle_);
+  }
+
+  @Override
+  public Options setWalDir(final String walDir) {
+    assert(isInitialized());
+    setWalDir(nativeHandle_, walDir);
+    return this;
+  }
+
+  @Override
+  public long deleteObsoleteFilesPeriodMicros() {
+    assert(isInitialized());
+    return deleteObsoleteFilesPeriodMicros(nativeHandle_);
+  }
+
+  @Override
+  public Options setDeleteObsoleteFilesPeriodMicros(
+      final long micros) {
+    assert(isInitialized());
+    setDeleteObsoleteFilesPeriodMicros(nativeHandle_, micros);
+    return this;
+  }
+
+  @Override
+  public int maxBackgroundCompactions() {
+    assert(isInitialized());
+    return maxBackgroundCompactions(nativeHandle_);
+  }
+
+  @Override
+  public Options createStatistics() {
+    assert(isInitialized());
+    createStatistics(nativeHandle_);
+    return this;
+  }
+
+  @Override
+  public Statistics statisticsPtr() {
+    assert(isInitialized());
+
+    long statsPtr = statisticsPtr(nativeHandle_);
+    if(statsPtr == 0) {
+      createStatistics();
+      statsPtr = statisticsPtr(nativeHandle_);
+    }
+
+    return new Statistics(statsPtr);
+  }
+
+  @Override
+  public Options setMaxBackgroundCompactions(
+      final int maxBackgroundCompactions) {
+    assert(isInitialized());
+    setMaxBackgroundCompactions(nativeHandle_, maxBackgroundCompactions);
+    return this;
+  }
+
+  @Override
+  public int maxBackgroundFlushes() {
+    assert(isInitialized());
+    return maxBackgroundFlushes(nativeHandle_);
+  }
+
+  @Override
+  public Options setMaxBackgroundFlushes(
+      final int maxBackgroundFlushes) {
+    assert(isInitialized());
+    setMaxBackgroundFlushes(nativeHandle_, maxBackgroundFlushes);
+    return this;
+  }
+
+  @Override
+  public long maxLogFileSize() {
+    assert(isInitialized());
+    return maxLogFileSize(nativeHandle_);
+  }
+
+  @Override
+  public Options setMaxLogFileSize(final long maxLogFileSize) {
+    assert(isInitialized());
+    setMaxLogFileSize(nativeHandle_, maxLogFileSize);
+    return this;
+  }
+
+  @Override
+  public long logFileTimeToRoll() {
+    assert(isInitialized());
+    return logFileTimeToRoll(nativeHandle_);
+  }
+
+  @Override
+  public Options setLogFileTimeToRoll(final long logFileTimeToRoll) {
+    assert(isInitialized());
+    setLogFileTimeToRoll(nativeHandle_, logFileTimeToRoll);
+    return this;
+  }
+
+  @Override
+  public long keepLogFileNum() {
+    assert(isInitialized());
+    return keepLogFileNum(nativeHandle_);
+  }
+
+  @Override
+  public Options setKeepLogFileNum(final long keepLogFileNum) {
+    assert(isInitialized());
+    setKeepLogFileNum(nativeHandle_, keepLogFileNum);
+    return this;
+  }
+
+  @Override
+  public long maxManifestFileSize() {
+    assert(isInitialized());
+    return maxManifestFileSize(nativeHandle_);
+  }
+
+  @Override
+  public Options setMaxManifestFileSize(
+      final long maxManifestFileSize) {
+    assert(isInitialized());
+    setMaxManifestFileSize(nativeHandle_, maxManifestFileSize);
+    return this;
+  }
+
+  @Override
+  public Options setMaxTableFilesSizeFIFO(
+    final long maxTableFilesSize) {
+    assert(maxTableFilesSize > 0); // unsigned native type
+    assert(isInitialized());
+    setMaxTableFilesSizeFIFO(nativeHandle_, maxTableFilesSize);
+    return this;
+  }
+
+  @Override
+  public long maxTableFilesSizeFIFO() {
+    return maxTableFilesSizeFIFO(nativeHandle_);
+  }
+
+  @Override
+  public int tableCacheNumshardbits() {
+    assert(isInitialized());
+    return tableCacheNumshardbits(nativeHandle_);
+  }
+
+  @Override
+  public Options setTableCacheNumshardbits(
+      final int tableCacheNumshardbits) {
+    assert(isInitialized());
+    setTableCacheNumshardbits(nativeHandle_, tableCacheNumshardbits);
+    return this;
+  }
+
+  @Override
+  public long walTtlSeconds() {
+    assert(isInitialized());
+    return walTtlSeconds(nativeHandle_);
+  }
+
+  @Override
+  public Options setWalTtlSeconds(final long walTtlSeconds) {
+    assert(isInitialized());
+    setWalTtlSeconds(nativeHandle_, walTtlSeconds);
+    return this;
+  }
+
+  @Override
+  public long walSizeLimitMB() {
+    assert(isInitialized());
+    return walSizeLimitMB(nativeHandle_);
+  }
+
+  @Override
+  public Options setWalSizeLimitMB(final long sizeLimitMB) {
+    assert(isInitialized());
+    setWalSizeLimitMB(nativeHandle_, sizeLimitMB);
+    return this;
+  }
+
+  @Override
+  public long manifestPreallocationSize() {
+    assert(isInitialized());
+    return manifestPreallocationSize(nativeHandle_);
+  }
+
+  @Override
+  public Options setManifestPreallocationSize(final long size) {
+    assert(isInitialized());
+    setManifestPreallocationSize(nativeHandle_, size);
+    return this;
+  }
+
+  @Override
+  public boolean allowOsBuffer() {
+    assert(isInitialized());
+    return allowOsBuffer(nativeHandle_);
+  }
+
+  @Override
+  public Options setAllowOsBuffer(final boolean allowOsBuffer) {
+    assert(isInitialized());
+    setAllowOsBuffer(nativeHandle_, allowOsBuffer);
+    return this;
+  }
+
+  @Override
+  public boolean allowMmapReads() {
+    assert(isInitialized());
+    return allowMmapReads(nativeHandle_);
+  }
+
+  @Override
+  public Options setAllowMmapReads(final boolean allowMmapReads) {
+    assert(isInitialized());
+    setAllowMmapReads(nativeHandle_, allowMmapReads);
+    return this;
+  }
+
+  @Override
+  public boolean allowMmapWrites() {
+    assert(isInitialized());
+    return allowMmapWrites(nativeHandle_);
+  }
+
+  @Override
+  public Options setAllowMmapWrites(final boolean allowMmapWrites) {
+    assert(isInitialized());
+    setAllowMmapWrites(nativeHandle_, allowMmapWrites);
+    return this;
+  }
+
+  @Override
+  public boolean isFdCloseOnExec() {
+    assert(isInitialized());
+    return isFdCloseOnExec(nativeHandle_);
+  }
+
+  @Override
+  public Options setIsFdCloseOnExec(final boolean isFdCloseOnExec) {
+    assert(isInitialized());
+    setIsFdCloseOnExec(nativeHandle_, isFdCloseOnExec);
+    return this;
+  }
+
+  @Override
+  public int statsDumpPeriodSec() {
+    assert(isInitialized());
+    return statsDumpPeriodSec(nativeHandle_);
+  }
+
+  @Override
+  public Options setStatsDumpPeriodSec(final int statsDumpPeriodSec) {
+    assert(isInitialized());
+    setStatsDumpPeriodSec(nativeHandle_, statsDumpPeriodSec);
+    return this;
+  }
+
+  @Override
+  public boolean adviseRandomOnOpen() {
+    return adviseRandomOnOpen(nativeHandle_);
+  }
+
+  @Override
+  public Options setAdviseRandomOnOpen(final boolean adviseRandomOnOpen) {
+    assert(isInitialized());
+    setAdviseRandomOnOpen(nativeHandle_, adviseRandomOnOpen);
+    return this;
+  }
+
+  @Override
+  public boolean useAdaptiveMutex() {
+    assert(isInitialized());
+    return useAdaptiveMutex(nativeHandle_);
+  }
+
+  @Override
+  public Options setUseAdaptiveMutex(final boolean useAdaptiveMutex) {
+    assert(isInitialized());
+    setUseAdaptiveMutex(nativeHandle_, useAdaptiveMutex);
+    return this;
+  }
+
+  @Override
+  public long bytesPerSync() {
+    return bytesPerSync(nativeHandle_);
+  }
+
+  @Override
+  public Options setBytesPerSync(final long bytesPerSync) {
+    assert(isInitialized());
+    setBytesPerSync(nativeHandle_, bytesPerSync);
+    return this;
+  }
+
+  @Override
+  public Options setMemTableConfig(final MemTableConfig config) {
+    memTableConfig_ = config;
+    setMemTableFactory(nativeHandle_, config.newMemTableFactoryHandle());
+    return this;
+  }
+
+  @Override
+  public Options setRateLimiterConfig(final RateLimiterConfig config) {
+    rateLimiterConfig_ = config;
+    setRateLimiter(nativeHandle_, config.newRateLimiterHandle());
+    return this;
+  }
+
+  @Override
+  public Options setLogger(final Logger logger) {
+    assert(isInitialized());
+    setLogger(nativeHandle_, logger.nativeHandle_);
+    return this;
+  }
+
+  @Override
+  public Options setInfoLogLevel(final InfoLogLevel infoLogLevel) {
+    assert(isInitialized());
+    setInfoLogLevel(nativeHandle_, infoLogLevel.getValue());
+    return this;
+  }
+
+  @Override
+  public InfoLogLevel infoLogLevel() {
+    assert(isInitialized());
+    return InfoLogLevel.getInfoLogLevel(
+        infoLogLevel(nativeHandle_));
+  }
+
+  @Override
+  public String memTableFactoryName() {
+    assert(isInitialized());
+    return memTableFactoryName(nativeHandle_);
+  }
+
+  @Override
+  public Options setTableFormatConfig(final TableFormatConfig config) {
+    tableFormatConfig_ = config;
+    setTableFactory(nativeHandle_, config.newTableFactoryHandle());
+    return this;
+  }
+
+  @Override
+  public String tableFactoryName() {
+    assert(isInitialized());
+    return tableFactoryName(nativeHandle_);
+  }
+
+  @Override
+  public Options useFixedLengthPrefixExtractor(final int n) {
+    assert(isInitialized());
+    useFixedLengthPrefixExtractor(nativeHandle_, n);
+    return this;
+  }
+
+  @Override
+  public Options useCappedPrefixExtractor(final int n) {
+    assert(isInitialized());
+    useCappedPrefixExtractor(nativeHandle_, n);
+    return this;
+  }
+
+  @Override
+  public CompressionType compressionType() {
+    return CompressionType.values()[compressionType(nativeHandle_)];
+  }
+
+  @Override
+  public Options setCompressionPerLevel(final List<CompressionType> compressionLevels) {
+    final List<Byte> byteCompressionTypes = new ArrayList<>(
+        compressionLevels.size());
+    for (final CompressionType compressionLevel : compressionLevels) {
+      byteCompressionTypes.add(compressionLevel.getValue());
+    }
+    setCompressionPerLevel(nativeHandle_, byteCompressionTypes);
+    return this;
+  }
+
+  @Override
+  public List<CompressionType> compressionPerLevel() {
+    final List<Byte> byteCompressionTypes =
+        compressionPerLevel(nativeHandle_);
+    final List<CompressionType> compressionLevels = new ArrayList<>();
+    for (final Byte byteCompressionType : byteCompressionTypes) {
+      compressionLevels.add(CompressionType.getCompressionType(
+          byteCompressionType));
+    }
+    return compressionLevels;
+  }
+
+  @Override
+  public Options setCompressionType(CompressionType compressionType) {
+    setCompressionType(nativeHandle_, compressionType.getValue());
+    return this;
+  }
+
+  @Override
+  public CompactionStyle compactionStyle() {
+    return CompactionStyle.values()[compactionStyle(nativeHandle_)];
+  }
+
+  @Override
+  public Options setCompactionStyle(
+      final CompactionStyle compactionStyle) {
+    setCompactionStyle(nativeHandle_, compactionStyle.getValue());
+    return this;
+  }
+
+  @Override
+  public int numLevels() {
+    return numLevels(nativeHandle_);
+  }
+
+  @Override
+  public Options setNumLevels(int numLevels) {
+    setNumLevels(nativeHandle_, numLevels);
+    return this;
+  }
+
+  @Override
+  public int levelZeroFileNumCompactionTrigger() {
+    return levelZeroFileNumCompactionTrigger(nativeHandle_);
+  }
+
+  @Override
+  public Options setLevelZeroFileNumCompactionTrigger(
+      final int numFiles) {
+    setLevelZeroFileNumCompactionTrigger(
+        nativeHandle_, numFiles);
+    return this;
+  }
+
+  @Override
+  public int levelZeroSlowdownWritesTrigger() {
+    return levelZeroSlowdownWritesTrigger(nativeHandle_);
+  }
+
+  @Override
+  public Options setLevelZeroSlowdownWritesTrigger(
+      final int numFiles) {
+    setLevelZeroSlowdownWritesTrigger(nativeHandle_, numFiles);
+    return this;
+  }
+
+  @Override
+  public int levelZeroStopWritesTrigger() {
+    return levelZeroStopWritesTrigger(nativeHandle_);
+  }
+
+  @Override
+  public Options setLevelZeroStopWritesTrigger(
+      final int numFiles) {
+    setLevelZeroStopWritesTrigger(nativeHandle_, numFiles);
+    return this;
+  }
+
+  @Override
+  public int maxMemCompactionLevel() {
+    return 0;
+  }
+
+  @Override
+  public Options setMaxMemCompactionLevel(
+      final int maxMemCompactionLevel) {
+    return this;
+  }
+
+  @Override
+  public long targetFileSizeBase() {
+    return targetFileSizeBase(nativeHandle_);
+  }
+
+  @Override
+  public Options setTargetFileSizeBase(long targetFileSizeBase) {
+    setTargetFileSizeBase(nativeHandle_, targetFileSizeBase);
+    return this;
+  }
+
+  @Override
+  public int targetFileSizeMultiplier() {
+    return targetFileSizeMultiplier(nativeHandle_);
+  }
+
+  @Override
+  public Options setTargetFileSizeMultiplier(int multiplier) {
+    setTargetFileSizeMultiplier(nativeHandle_, multiplier);
+    return this;
+  }
+
+  @Override
+  public Options setMaxBytesForLevelBase(final long maxBytesForLevelBase) {
+    setMaxBytesForLevelBase(nativeHandle_, maxBytesForLevelBase);
+    return this;
+  }
+
+  @Override
+  public long maxBytesForLevelBase() {
+    return maxBytesForLevelBase(nativeHandle_);
+  }
+
+  @Override
+  public Options setLevelCompactionDynamicLevelBytes(
+      final boolean enableLevelCompactionDynamicLevelBytes) {
+    setLevelCompactionDynamicLevelBytes(nativeHandle_,
+        enableLevelCompactionDynamicLevelBytes);
+    return this;
+  }
+
+  @Override
+  public boolean levelCompactionDynamicLevelBytes() {
+    return levelCompactionDynamicLevelBytes(nativeHandle_);
+  }
+
+  @Override
+  public int maxBytesForLevelMultiplier() {
+    return maxBytesForLevelMultiplier(nativeHandle_);
+  }
+
+  @Override
+  public Options setMaxBytesForLevelMultiplier(final int multiplier) {
+    setMaxBytesForLevelMultiplier(nativeHandle_, multiplier);
+    return this;
+  }
+
+  @Override
+  public int expandedCompactionFactor() {
+    return expandedCompactionFactor(nativeHandle_);
+  }
+
+  @Override
+  public Options setExpandedCompactionFactor(
+      final int expandedCompactionFactor) {
+    setExpandedCompactionFactor(nativeHandle_, expandedCompactionFactor);
+    return this;
+  }
+
+  @Override
+  public int sourceCompactionFactor() {
+    return sourceCompactionFactor(nativeHandle_);
+  }
+
+  @Override
+  public Options setSourceCompactionFactor(int sourceCompactionFactor) {
+    setSourceCompactionFactor(nativeHandle_, sourceCompactionFactor);
+    return this;
+  }
+
+  @Override
+  public int maxGrandparentOverlapFactor() {
+    return maxGrandparentOverlapFactor(nativeHandle_);
+  }
+
+  @Override
+  public Options setMaxGrandparentOverlapFactor(
+      final int maxGrandparentOverlapFactor) {
+    setMaxGrandparentOverlapFactor(nativeHandle_, maxGrandparentOverlapFactor);
+    return this;
+  }
+
+  @Override
+  public double softRateLimit() {
+    return softRateLimit(nativeHandle_);
+  }
+
+  @Override
+  public Options setSoftRateLimit(final double softRateLimit) {
+    setSoftRateLimit(nativeHandle_, softRateLimit);
+    return this;
+  }
+
+  @Override
+  public double hardRateLimit() {
+    return hardRateLimit(nativeHandle_);
+  }
+
+  @Override
+  public Options setHardRateLimit(double hardRateLimit) {
+    setHardRateLimit(nativeHandle_, hardRateLimit);
+    return this;
+  }
+
+  @Override
+  public int rateLimitDelayMaxMilliseconds() {
+    return rateLimitDelayMaxMilliseconds(nativeHandle_);
+  }
+
+  @Override
+  public Options setRateLimitDelayMaxMilliseconds(
+      final int rateLimitDelayMaxMilliseconds) {
+    setRateLimitDelayMaxMilliseconds(
+        nativeHandle_, rateLimitDelayMaxMilliseconds);
+    return this;
+  }
+
+  @Override
+  public long arenaBlockSize() {
+    return arenaBlockSize(nativeHandle_);
+  }
+
+  @Override
+  public Options setArenaBlockSize(final long arenaBlockSize) {
+    setArenaBlockSize(nativeHandle_, arenaBlockSize);
+    return this;
+  }
+
+  @Override
+  public boolean disableAutoCompactions() {
+    return disableAutoCompactions(nativeHandle_);
+  }
+
+  @Override
+  public Options setDisableAutoCompactions(
+      final boolean disableAutoCompactions) {
+    setDisableAutoCompactions(nativeHandle_, disableAutoCompactions);
+    return this;
+  }
+
+  @Override
+  public boolean purgeRedundantKvsWhileFlush() {
+    return purgeRedundantKvsWhileFlush(nativeHandle_);
+  }
+
+  @Override
+  public Options setPurgeRedundantKvsWhileFlush(
+      final boolean purgeRedundantKvsWhileFlush) {
+    setPurgeRedundantKvsWhileFlush(
+        nativeHandle_, purgeRedundantKvsWhileFlush);
+    return this;
+  }
+
+  @Override
+  public boolean verifyChecksumsInCompaction() {
+    return verifyChecksumsInCompaction(nativeHandle_);
+  }
+
+  @Override
+  public Options setVerifyChecksumsInCompaction(
+      final boolean verifyChecksumsInCompaction) {
+    setVerifyChecksumsInCompaction(
+        nativeHandle_, verifyChecksumsInCompaction);
+    return this;
+  }
+
+  @Override
+  public boolean filterDeletes() {
+    return filterDeletes(nativeHandle_);
+  }
+
+  @Override
+  public Options setFilterDeletes(
+      final boolean filterDeletes) {
+    setFilterDeletes(nativeHandle_, filterDeletes);
+    return this;
+  }
+
+  @Override
+  public long maxSequentialSkipInIterations() {
+    return maxSequentialSkipInIterations(nativeHandle_);
+  }
+
+  @Override
+  public Options setMaxSequentialSkipInIterations(
+      final long maxSequentialSkipInIterations) {
+    setMaxSequentialSkipInIterations(nativeHandle_, maxSequentialSkipInIterations);
+    return this;
+  }
+
+  @Override
+  public boolean inplaceUpdateSupport() {
+    return inplaceUpdateSupport(nativeHandle_);
+  }
+
+  @Override
+  public Options setInplaceUpdateSupport(
+      final boolean inplaceUpdateSupport) {
+    setInplaceUpdateSupport(nativeHandle_, inplaceUpdateSupport);
+    return this;
+  }
+
+  @Override
+  public long inplaceUpdateNumLocks() {
+    return inplaceUpdateNumLocks(nativeHandle_);
+  }
+
+  @Override
+  public Options setInplaceUpdateNumLocks(
+      final long inplaceUpdateNumLocks) {
+    setInplaceUpdateNumLocks(nativeHandle_, inplaceUpdateNumLocks);
+    return this;
+  }
+
+  @Override
+  public int memtablePrefixBloomBits() {
+    return memtablePrefixBloomBits(nativeHandle_);
+  }
+
+  @Override
+  public Options setMemtablePrefixBloomBits(
+      final int memtablePrefixBloomBits) {
+    setMemtablePrefixBloomBits(nativeHandle_, memtablePrefixBloomBits);
+    return this;
+  }
+
+  @Override
+  public int memtablePrefixBloomProbes() {
+    return memtablePrefixBloomProbes(nativeHandle_);
+  }
+
+  @Override
+  public Options setMemtablePrefixBloomProbes(
+      final int memtablePrefixBloomProbes) {
+    setMemtablePrefixBloomProbes(nativeHandle_, memtablePrefixBloomProbes);
+    return this;
+  }
+
+  @Override
+  public int bloomLocality() {
+    return bloomLocality(nativeHandle_);
+  }
+
+  @Override
+  public Options setBloomLocality(final int bloomLocality) {
+    setBloomLocality(nativeHandle_, bloomLocality);
+    return this;
+  }
+
+  @Override
+  public long maxSuccessiveMerges() {
+    return maxSuccessiveMerges(nativeHandle_);
+  }
+
+  @Override
+  public Options setMaxSuccessiveMerges(long maxSuccessiveMerges) {
+    setMaxSuccessiveMerges(nativeHandle_, maxSuccessiveMerges);
+    return this;
+  }
+
+  @Override
+  public int minWriteBufferNumberToMerge() {
+    return minWriteBufferNumberToMerge(nativeHandle_);
+  }
+
+  @Override
+  public Options setMinWriteBufferNumberToMerge(
+      final int minWriteBufferNumberToMerge) {
+    setMinWriteBufferNumberToMerge(nativeHandle_, minWriteBufferNumberToMerge);
+    return this;
+  }
+
+  @Override
+  public int minPartialMergeOperands() {
+    return minPartialMergeOperands(nativeHandle_);
+  }
+
+  @Override
+  public Options setMinPartialMergeOperands(
+      final int minPartialMergeOperands) {
+    setMinPartialMergeOperands(nativeHandle_, minPartialMergeOperands);
+    return this;
+  }
+
+  @Override
+  public Options setOptimizeFiltersForHits(
+      final boolean optimizeFiltersForHits) {
+    setOptimizeFiltersForHits(nativeHandle_, optimizeFiltersForHits);
+    return this;
+  }
+
+  @Override
+  public boolean optimizeFiltersForHits() {
+    return optimizeFiltersForHits(nativeHandle_);
+  }
+
+  /**
+   * Release the memory allocated for the current instance
+   * on the C++ side.
+   */
+  @Override protected void disposeInternal() {
+    assert(isInitialized());
+    disposeInternal(nativeHandle_);
+  }
+
+  private native void newOptions();
+  private native void newOptions(long dbOptHandle,
+      long cfOptHandle);
+  private native void disposeInternal(long handle);
+  private native void setEnv(long optHandle, long envHandle);
+  private native void prepareForBulkLoad(long handle);
+
+  // DB native handles
+  private native void setIncreaseParallelism(long handle, int totalThreads);
+  private native void setCreateIfMissing(long handle, boolean flag);
+  private native boolean createIfMissing(long handle);
+  private native void setCreateMissingColumnFamilies(
+      long handle, boolean flag);
+  private native boolean createMissingColumnFamilies(long handle);
+  private native void setErrorIfExists(long handle, boolean errorIfExists);
+  private native boolean errorIfExists(long handle);
+  private native void setParanoidChecks(
+      long handle, boolean paranoidChecks);
+  private native boolean paranoidChecks(long handle);
+  private native void setRateLimiter(long handle,
+      long rateLimiterHandle);
+  private native void setLogger(long handle,
+      long loggerHandle);
+  private native void setInfoLogLevel(long handle, byte logLevel);
+  private native byte infoLogLevel(long handle);
+  private native void setMaxOpenFiles(long handle, int maxOpenFiles);
+  private native int maxOpenFiles(long handle);
+  private native void setMaxTotalWalSize(long handle,
+      long maxTotalWalSize);
+  private native long maxTotalWalSize(long handle);
+  private native void createStatistics(long optHandle);
+  private native long statisticsPtr(long optHandle);
+  private native void setDisableDataSync(long handle, boolean disableDataSync);
+  private native boolean disableDataSync(long handle);
+  private native boolean useFsync(long handle);
+  private native void setUseFsync(long handle, boolean useFsync);
+  private native void setDbLogDir(long handle, String dbLogDir);
+  private native String dbLogDir(long handle);
+  private native void setWalDir(long handle, String walDir);
+  private native String walDir(long handle);
+  private native void setDeleteObsoleteFilesPeriodMicros(
+      long handle, long micros);
+  private native long deleteObsoleteFilesPeriodMicros(long handle);
+  private native void setMaxBackgroundCompactions(
+      long handle, int maxBackgroundCompactions);
+  private native int maxBackgroundCompactions(long handle);
+  private native void setMaxBackgroundFlushes(
+      long handle, int maxBackgroundFlushes);
+  private native int maxBackgroundFlushes(long handle);
+  private native void setMaxLogFileSize(long handle, long maxLogFileSize)
+      throws IllegalArgumentException;
+  private native long maxLogFileSize(long handle);
+  private native void setLogFileTimeToRoll(
+      long handle, long logFileTimeToRoll) throws IllegalArgumentException;
+  private native long logFileTimeToRoll(long handle);
+  private native void setKeepLogFileNum(long handle, long keepLogFileNum)
+      throws IllegalArgumentException;
+  private native long keepLogFileNum(long handle);
+  private native void setMaxManifestFileSize(
+      long handle, long maxManifestFileSize);
+  private native long maxManifestFileSize(long handle);
+  private native void setMaxTableFilesSizeFIFO(
+      long handle, long maxTableFilesSize);
+  private native long maxTableFilesSizeFIFO(long handle);
+  private native void setTableCacheNumshardbits(
+      long handle, int tableCacheNumshardbits);
+  private native int tableCacheNumshardbits(long handle);
+  private native void setWalTtlSeconds(long handle, long walTtlSeconds);
+  private native long walTtlSeconds(long handle);
+  private native void setWalSizeLimitMB(long handle, long sizeLimitMB);
+  private native long walSizeLimitMB(long handle);
+  private native void setManifestPreallocationSize(
+      long handle, long size) throws IllegalArgumentException;
+  private native long manifestPreallocationSize(long handle);
+  private native void setAllowOsBuffer(
+      long handle, boolean allowOsBuffer);
+  private native boolean allowOsBuffer(long handle);
+  private native void setAllowMmapReads(
+      long handle, boolean allowMmapReads);
+  private native boolean allowMmapReads(long handle);
+  private native void setAllowMmapWrites(
+      long handle, boolean allowMmapWrites);
+  private native boolean allowMmapWrites(long handle);
+  private native void setIsFdCloseOnExec(
+      long handle, boolean isFdCloseOnExec);
+  private native boolean isFdCloseOnExec(long handle);
+  private native void setStatsDumpPeriodSec(
+      long handle, int statsDumpPeriodSec);
+  private native int statsDumpPeriodSec(long handle);
+  private native void setAdviseRandomOnOpen(
+      long handle, boolean adviseRandomOnOpen);
+  private native boolean adviseRandomOnOpen(long handle);
+  private native void setUseAdaptiveMutex(
+      long handle, boolean useAdaptiveMutex);
+  private native boolean useAdaptiveMutex(long handle);
+  private native void setBytesPerSync(
+      long handle, long bytesPerSync);
+  private native long bytesPerSync(long handle);
+  // CF native handles
+  private native void optimizeForPointLookup(long handle,
+      long blockCacheSizeMb);
+  private native void optimizeLevelStyleCompaction(long handle,
+      long memtableMemoryBudget);
+  private native void optimizeUniversalStyleCompaction(long handle,
+      long memtableMemoryBudget);
+  private native void setComparatorHandle(long handle, int builtinComparator);
+  private native void setComparatorHandle(long optHandle, long comparatorHandle);
+  private native void setMergeOperatorName(
+      long handle, String name);
+  private native void setMergeOperator(
+      long handle, long mergeOperatorHandle);
+  private native void setWriteBufferSize(long handle, long writeBufferSize)
+      throws IllegalArgumentException;
+  private native long writeBufferSize(long handle);
+  private native void setMaxWriteBufferNumber(
+      long handle, int maxWriteBufferNumber);
+  private native int maxWriteBufferNumber(long handle);
+  private native void setMinWriteBufferNumberToMerge(
+      long handle, int minWriteBufferNumberToMerge);
+  private native int minWriteBufferNumberToMerge(long handle);
+  private native void setCompressionType(long handle, byte compressionType);
+  private native byte compressionType(long handle);
+  private native void setCompressionPerLevel(long handle,
+      List<Byte> compressionLevels);
+  private native List<Byte> compressionPerLevel(long handle);
+  private native void useFixedLengthPrefixExtractor(
+      long handle, int prefixLength);
+  private native void useCappedPrefixExtractor(
+      long handle, int prefixLength);
+  private native void setNumLevels(
+      long handle, int numLevels);
+  private native int numLevels(long handle);
+  private native void setLevelZeroFileNumCompactionTrigger(
+      long handle, int numFiles);
+  private native int levelZeroFileNumCompactionTrigger(long handle);
+  private native void setLevelZeroSlowdownWritesTrigger(
+      long handle, int numFiles);
+  private native int levelZeroSlowdownWritesTrigger(long handle);
+  private native void setLevelZeroStopWritesTrigger(
+      long handle, int numFiles);
+  private native int levelZeroStopWritesTrigger(long handle);
+  private native void setTargetFileSizeBase(
+      long handle, long targetFileSizeBase);
+  private native long targetFileSizeBase(long handle);
+  private native void setTargetFileSizeMultiplier(
+      long handle, int multiplier);
+  private native int targetFileSizeMultiplier(long handle);
+  private native void setMaxBytesForLevelBase(
+      long handle, long maxBytesForLevelBase);
+  private native long maxBytesForLevelBase(long handle);
+  private native void setLevelCompactionDynamicLevelBytes(
+      long handle, boolean enableLevelCompactionDynamicLevelBytes);
+  private native boolean levelCompactionDynamicLevelBytes(
+      long handle);
+  private native void setMaxBytesForLevelMultiplier(
+      long handle, int multiplier);
+  private native int maxBytesForLevelMultiplier(long handle);
+  private native void setExpandedCompactionFactor(
+      long handle, int expandedCompactionFactor);
+  private native int expandedCompactionFactor(long handle);
+  private native void setSourceCompactionFactor(
+      long handle, int sourceCompactionFactor);
+  private native int sourceCompactionFactor(long handle);
+  private native void setMaxGrandparentOverlapFactor(
+      long handle, int maxGrandparentOverlapFactor);
+  private native int maxGrandparentOverlapFactor(long handle);
+  private native void setSoftRateLimit(
+      long handle, double softRateLimit);
+  private native double softRateLimit(long handle);
+  private native void setHardRateLimit(
+      long handle, double hardRateLimit);
+  private native double hardRateLimit(long handle);
+  private native void setRateLimitDelayMaxMilliseconds(
+      long handle, int rateLimitDelayMaxMilliseconds);
+  private native int rateLimitDelayMaxMilliseconds(long handle);
+  private native void setArenaBlockSize(
+      long handle, long arenaBlockSize) throws IllegalArgumentException;
+  private native long arenaBlockSize(long handle);
+  private native void setDisableAutoCompactions(
+      long handle, boolean disableAutoCompactions);
+  private native boolean disableAutoCompactions(long handle);
+  private native void setCompactionStyle(long handle, byte compactionStyle);
+  private native byte compactionStyle(long handle);
+  private native void setPurgeRedundantKvsWhileFlush(
+      long handle, boolean purgeRedundantKvsWhileFlush);
+  private native boolean purgeRedundantKvsWhileFlush(long handle);
+  private native void setVerifyChecksumsInCompaction(
+      long handle, boolean verifyChecksumsInCompaction);
+  private native boolean verifyChecksumsInCompaction(long handle);
+  private native void setFilterDeletes(
+      long handle, boolean filterDeletes);
+  private native boolean filterDeletes(long handle);
+  private native void setMaxSequentialSkipInIterations(
+      long handle, long maxSequentialSkipInIterations);
+  private native long maxSequentialSkipInIterations(long handle);
+  private native void setMemTableFactory(long handle, long factoryHandle);
+  private native String memTableFactoryName(long handle);
+  private native void setTableFactory(long handle, long factoryHandle);
+  private native String tableFactoryName(long handle);
+  private native void setInplaceUpdateSupport(
+      long handle, boolean inplaceUpdateSupport);
+  private native boolean inplaceUpdateSupport(long handle);
+  private native void setInplaceUpdateNumLocks(
+      long handle, long inplaceUpdateNumLocks)
+      throws IllegalArgumentException;
+  private native long inplaceUpdateNumLocks(long handle);
+  private native void setMemtablePrefixBloomBits(
+      long handle, int memtablePrefixBloomBits);
+  private native int memtablePrefixBloomBits(long handle);
+  private native void setMemtablePrefixBloomProbes(
+      long handle, int memtablePrefixBloomProbes);
+  private native int memtablePrefixBloomProbes(long handle);
+  private native void setBloomLocality(
+      long handle, int bloomLocality);
+  private native int bloomLocality(long handle);
+  private native void setMaxSuccessiveMerges(
+      long handle, long maxSuccessiveMerges)
+      throws IllegalArgumentException;
+  private native long maxSuccessiveMerges(long handle);
+  private native void setMinPartialMergeOperands(
+      long handle, int minPartialMergeOperands);
+  private native int minPartialMergeOperands(long handle);
+  private native void setOptimizeFiltersForHits(long handle,
+      boolean optimizeFiltersForHits);
+  private native boolean optimizeFiltersForHits(long handle);
+  // instance variables
+  Env env_;
+  MemTableConfig memTableConfig_;
+  TableFormatConfig tableFormatConfig_;
+  RateLimiterConfig rateLimiterConfig_;
+  AbstractComparator<? extends AbstractSlice<?>> comparator_;
+}
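
A minimal usage sketch of the fluent setters above (the option values are
illustrative only); as the Javadoc notes, dispose() must release the native
allocation:

    Options options = new Options()
        .setOptimizeFiltersForHits(true)   // see setOptimizeFiltersForHits above
        .setMinPartialMergeOperands(2);
    try {
      assert options.optimizeFiltersForHits();
    } finally {
      options.dispose();                   // frees the C++ Options object
    }
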
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/PlainTableConfig.java b/src/rocksdb/java/src/main/java/org/rocksdb/PlainTableConfig.java
new file mode 100644
index 0000000..3a41bea
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/PlainTableConfig.java
@@ -0,0 +1,251 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+package org.rocksdb;
+
+/**
+ * The config for plain table sst format.
+ *
+ * <p>PlainTable is a RocksDB SST file format optimized for low query
+ * latency on pure-memory or very low-latency media.</p>
+ *
+ * <p>It also supports a prefix hash feature.</p>
+ */
+public class PlainTableConfig extends TableFormatConfig {
+  public static final int VARIABLE_LENGTH = 0;
+  public static final int DEFAULT_BLOOM_BITS_PER_KEY = 10;
+  public static final double DEFAULT_HASH_TABLE_RATIO = 0.75;
+  public static final int DEFAULT_INDEX_SPARSENESS = 16;
+  public static final int DEFAULT_HUGE_TLB_SIZE = 0;
+  public static final EncodingType DEFAULT_ENCODING_TYPE =
+      EncodingType.kPlain;
+  public static final boolean DEFAULT_FULL_SCAN_MODE = false;
+  public static final boolean DEFAULT_STORE_INDEX_IN_FILE
+      = false;
+
+  public PlainTableConfig() {
+    keySize_ = VARIABLE_LENGTH;
+    bloomBitsPerKey_ = DEFAULT_BLOOM_BITS_PER_KEY;
+    hashTableRatio_ = DEFAULT_HASH_TABLE_RATIO;
+    indexSparseness_ = DEFAULT_INDEX_SPARSENESS;
+    hugePageTlbSize_ = DEFAULT_HUGE_TLB_SIZE;
+    encodingType_ = DEFAULT_ENCODING_TYPE;
+    fullScanMode_ = DEFAULT_FULL_SCAN_MODE;
+    storeIndexInFile_ = DEFAULT_STORE_INDEX_IN_FILE;
+  }
+
+  /**
+   * <p>Set the length of the user key. If it is set to
+   * VARIABLE_LENGTH, then it indicates that the user keys are
+   * of variable length.</p>
+   *
+   * <p>Otherwise, all the keys need to have the same length
+   * in bytes.</p>
+   *
+   * <p>DEFAULT: VARIABLE_LENGTH</p>
+   *
+   * @param keySize the length of the user key.
+   * @return the reference to the current config.
+   */
+  public PlainTableConfig setKeySize(int keySize) {
+    keySize_ = keySize;
+    return this;
+  }
+
+  /**
+   * @return the specified size of the user key.  If VARIABLE_LENGTH,
+   *     then it indicates a variable-length key.
+   */
+  public int keySize() {
+    return keySize_;
+  }
+
+  /**
+   * Set the number of bits per key used by the internal bloom filter
+   * in the plain table sst format.
+   *
+   * @param bitsPerKey the number of bits per key for the bloom filter.
+   * @return the reference to the current config.
+   */
+  public PlainTableConfig setBloomBitsPerKey(int bitsPerKey) {
+    bloomBitsPerKey_ = bitsPerKey;
+    return this;
+  }
+
+  /**
+   * @return the number of bits per key used for the bloom filter.
+   */
+  public int bloomBitsPerKey() {
+    return bloomBitsPerKey_;
+  }
+
+  /**
+   * hashTableRatio is the desired utilization of the hash table used
+   * for prefix hashing.  The ideal ratio would be the number of
+   * prefixes / the number of hash buckets.  If this value is set to
+   * zero, then the hash table will not be used.
+   *
+   * @param ratio the hash table ratio.
+   * @return the reference to the current config.
+   */
+  public PlainTableConfig setHashTableRatio(double ratio) {
+    hashTableRatio_ = ratio;
+    return this;
+  }
+
+  /**
+   * @return the hash table ratio.
+   */
+  public double hashTableRatio() {
+    return hashTableRatio_;
+  }
+
+  /**
+   * Index sparseness determines the index interval for keys inside the
+   * same prefix.  This number is equal to the maximum number of linear
+   * searches required after the hash and binary search.  If it's set to 0,
+   * then each key will be indexed.
+   *
+   * @param sparseness the index sparseness.
+   * @return the reference to the current config.
+   */
+  public PlainTableConfig setIndexSparseness(int sparseness) {
+    indexSparseness_ = sparseness;
+    return this;
+  }
+
+  /**
+   * @return the index sparseness.
+   */
+  public int indexSparseness() {
+    return indexSparseness_;
+  }
+
+  /**
+   * <p>huge_page_tlb_size: if ≤ 0, allocate hash indexes and bloom
+   * filters from malloc; otherwise from the huge page TLB.</p>
+   *
+   * <p>The user needs to reserve huge pages for it to be allocated,
+   * like: {@code sysctl -w vm.nr_hugepages=20}</p>
+   *
+   * <p>See linux doc Documentation/vm/hugetlbpage.txt</p>
+   *
+   * @param hugePageTlbSize huge page tlb size
+   * @return the reference to the current config.
+   */
+  public PlainTableConfig setHugePageTlbSize(int hugePageTlbSize) {
+    this.hugePageTlbSize_ = hugePageTlbSize;
+    return this;
+  }
+
+  /**
+   * Returns the value for huge page tlb size
+   *
+   * @return hugePageTlbSize
+   */
+  public int hugePageTlbSize() {
+    return hugePageTlbSize_;
+  }
+
+  /**
+   * Sets the encoding type.
+   *
+   * <p>This setting determines how to encode
+   * the keys. See enum {@link EncodingType} for
+   * the choices.</p>
+   *
+   * <p>The value will determine how to encode keys
+   * when writing to a new SST file. This value will be stored
+   * inside the SST file which will be used when reading from
+   * the file, which makes it possible for users to choose
+   * different encoding type when reopening a DB. Files with
+   * different encoding types can co-exist in the same DB and
+   * can be read.</p>
+   *
+   * @param encodingType {@link org.rocksdb.EncodingType} value.
+   * @return the reference to the current config.
+   */
+  public PlainTableConfig setEncodingType(EncodingType encodingType) {
+    this.encodingType_ = encodingType;
+    return this;
+  }
+
+  /**
+   * Returns the active EncodingType
+   *
+   * @return currently set encoding type
+   */
+  public EncodingType encodingType() {
+    return encodingType_;
+  }
+
+  /**
+   * Set full scan mode. If true, the whole file will be read
+   * record by record without using the index.
+   *
+   * @param fullScanMode boolean value indicating if full
+   *     scan mode shall be enabled.
+   * @return the reference to the current config.
+   */
+  public PlainTableConfig setFullScanMode(boolean fullScanMode) {
+    this.fullScanMode_ = fullScanMode;
+    return this;
+  }
+
+  /**
+   * Returns whether full scan mode is active.
+   * @return boolean value indicating if the full scan mode is
+   *     enabled.
+   */
+  public boolean fullScanMode() {
+    return fullScanMode_;
+  }
+
+  /**
+   * <p>If set to true, the plain table index and bloom
+   * filter are computed during file building and stored in the file.
+   * When reading the file, the index will be mmapped instead
+   * of being recomputed.</p>
+   *
+   * @param storeIndexInFile value indicating if index shall
+   *     be stored in a file
+   * @return the reference to the current config.
+   */
+  public PlainTableConfig setStoreIndexInFile(boolean storeIndexInFile) {
+    this.storeIndexInFile_ = storeIndexInFile;
+    return this;
+  }
+
+  /**
+   * Return a boolean value indicating if index shall be stored
+   * in a file.
+   *
+   * @return currently set value for store index in file.
+   */
+  public boolean storeIndexInFile() {
+    return storeIndexInFile_;
+  }
+
+  @Override protected long newTableFactoryHandle() {
+    return newTableFactoryHandle(keySize_, bloomBitsPerKey_,
+        hashTableRatio_, indexSparseness_, hugePageTlbSize_,
+        encodingType_.getValue(), fullScanMode_,
+        storeIndexInFile_);
+  }
+
+  private native long newTableFactoryHandle(
+      int keySize, int bloomBitsPerKey,
+      double hashTableRatio, int indexSparseness,
+      int hugePageTlbSize, byte encodingType,
+      boolean fullScanMode, boolean storeIndexInFile);
+
+  private int keySize_;
+  private int bloomBitsPerKey_;
+  private double hashTableRatio_;
+  private int indexSparseness_;
+  private int hugePageTlbSize_;
+  private EncodingType encodingType_;
+  private boolean fullScanMode_;
+  private boolean storeIndexInFile_;
+}
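
A hedged sketch of wiring the config above into Options; this assumes the
Options#setTableFormatConfig(TableFormatConfig) setter from the same binding
(not shown in this hunk), and the sizes are illustrative:

    PlainTableConfig plainTable = new PlainTableConfig()
        .setKeySize(16)                  // fixed 16-byte user keys
        .setBloomBitsPerKey(10)
        .setHashTableRatio(0.75)
        .setStoreIndexInFile(true);      // mmap the precomputed index on reads
    Options options = new Options()
        .setCreateIfMissing(true)
        .setAllowMmapReads(true)         // PlainTable targets mmap-able media
        .setTableFormatConfig(plainTable);
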
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/RateLimiterConfig.java b/src/rocksdb/java/src/main/java/org/rocksdb/RateLimiterConfig.java
new file mode 100644
index 0000000..09d1c7a
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/RateLimiterConfig.java
@@ -0,0 +1,23 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+package org.rocksdb;
+
+/**
+ * Config for rate limiter, which is used to control write rate of flush and
+ * compaction.
+ */
+public abstract class RateLimiterConfig {
+  /**
+   * This function should only be called by
+   * {@link org.rocksdb.DBOptions#setRateLimiter(long, long)}, which will
+   * create a C++ shared pointer to the C++ {@code RateLimiter} that is associated
+   * with a Java RateLimiterConfig.
+   *
+   * @see org.rocksdb.DBOptions#setRateLimiter(long, long)
+   *
+   * @return native handle address to rate limiter instance.
+   */
+  protected abstract long newRateLimiterHandle();
+}
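
As a sketch only: a concrete configuration, assuming the
GenericRateLimiterConfig implementation and the Options#setRateLimiterConfig
setter that accompany this class in the binding, capping flush/compaction
writes at roughly 10 MB/s:

    RateLimiterConfig rateLimiter =
        new GenericRateLimiterConfig(10 * 1024 * 1024);  // bytes per second
    Options options = new Options()
        .setRateLimiterConfig(rateLimiter);
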
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/ReadOptions.java b/src/rocksdb/java/src/main/java/org/rocksdb/ReadOptions.java
new file mode 100644
index 0000000..a72a6e0
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/ReadOptions.java
@@ -0,0 +1,163 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+/**
+ * The class that controls read (get) behavior.
+ *
+ * Note that dispose() must be called before a ReadOptions instance
+ * becomes out-of-scope to release the allocated memory on the C++ side.
+ */
+public class ReadOptions extends RocksObject {
+  public ReadOptions() {
+    super();
+    newReadOptions();
+  }
+  private native void newReadOptions();
+
+  /**
+   * If true, all data read from underlying storage will be
+   * verified against corresponding checksums.
+   * Default: true
+   *
+   * @return true if checksum verification is on.
+   */
+  public boolean verifyChecksums() {
+    assert(isInitialized());
+    return verifyChecksums(nativeHandle_);
+  }
+  private native boolean verifyChecksums(long handle);
+
+  /**
+   * If true, all data read from underlying storage will be
+   * verified against corresponding checksums.
+   * Default: true
+   *
+   * @param verifyChecksums if true, then checksum verification
+   *     will be performed on every read.
+   * @return the reference to the current ReadOptions.
+   */
+  public ReadOptions setVerifyChecksums(
+      final boolean verifyChecksums) {
+    assert(isInitialized());
+    setVerifyChecksums(nativeHandle_, verifyChecksums);
+    return this;
+  }
+  private native void setVerifyChecksums(
+      long handle, boolean verifyChecksums);
+
+  // TODO(yhchiang): this option seems to be block-based table only.
+  //                 move this to a better place?
+  /**
+   * Fill the cache when loading the block-based sst formatted db.
+   * Callers may wish to set this field to false for bulk scans.
+   * Default: true
+   *
+   * @return true if the fill-cache behavior is on.
+   */
+  public boolean fillCache() {
+    assert(isInitialized());
+    return fillCache(nativeHandle_);
+  }
+  private native boolean fillCache(long handle);
+
+  /**
+   * Fill the cache when loading the block-based sst formatted db.
+   * Callers may wish to set this field to false for bulk scans.
+   * Default: true
+   *
+   * @param fillCache if true, then fill-cache behavior will be
+   *     performed.
+   * @return the reference to the current ReadOptions.
+   */
+  public ReadOptions setFillCache(final boolean fillCache) {
+    assert(isInitialized());
+    setFillCache(nativeHandle_, fillCache);
+    return this;
+  }
+  private native void setFillCache(
+      long handle, boolean fillCache);
+
+  /**
+   * <p>If "snapshot" is non-nullptr, read as of the supplied snapshot
+   * (which must belong to the DB that is being read and which must
+   * not have been released).  If "snapshot" is nullptr, use an implicit
+   * snapshot of the state at the beginning of this read operation.</p>
+   * <p>Default: null</p>
+   *
+   * @param snapshot {@link Snapshot} instance
+   * @return the reference to the current ReadOptions.
+   */
+  public ReadOptions setSnapshot(final Snapshot snapshot) {
+    assert(isInitialized());
+    if (snapshot != null) {
+      setSnapshot(nativeHandle_, snapshot.nativeHandle_);
+    } else {
+      setSnapshot(nativeHandle_, 0L);
+    }
+    return this;
+  }
+  private native void setSnapshot(long handle, long snapshotHandle);
+
+  /**
+   * Returns the currently assigned Snapshot instance.
+   *
+   * @return the Snapshot assigned to this instance; if no Snapshot
+   *     is assigned, null.
+   */
+  public Snapshot snapshot() {
+    assert(isInitialized());
+    long snapshotHandle = snapshot(nativeHandle_);
+    if (snapshotHandle != 0) {
+      return new Snapshot(snapshotHandle);
+    }
+    return null;
+  }
+  private native long snapshot(long handle);
+
+  /**
+   * Specify to create a tailing iterator -- a special iterator that has a
+   * view of the complete database (i.e. it can also be used to read newly
+   * added data) and is optimized for sequential reads. It will return records
+   * that were inserted into the database after the creation of the iterator.
+   * Default: false
+   *
+   * Not supported in {@code ROCKSDB_LITE} mode!
+   *
+   * @return true if tailing iterator is enabled.
+   */
+  public boolean tailing() {
+    assert(isInitialized());
+    return tailing(nativeHandle_);
+  }
+  private native boolean tailing(long handle);
+
+  /**
+   * Specify to create a tailing iterator -- a special iterator that has a
+   * view of the complete database (i.e. it can also be used to read newly
+   * added data) and is optimized for sequential reads. It will return records
+   * that were inserted into the database after the creation of the iterator.
+   * Default: false
+   *
+   * Not supported in {@code ROCKSDB_LITE} mode!
+   *
+   * @param tailing if true, then tailing iterator will be enabled.
+   * @return the reference to the current ReadOptions.
+   */
+  public ReadOptions setTailing(final boolean tailing) {
+    assert(isInitialized());
+    setTailing(nativeHandle_, tailing);
+    return this;
+  }
+  private native void setTailing(
+      long handle, boolean tailing);
+
+  @Override protected void disposeInternal() {
+    disposeInternal(nativeHandle_);
+  }
+  private native void disposeInternal(long handle);
+
+}
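
A short sketch of the bulk-scan pattern the fillCache() Javadoc recommends;
dispose() releases the native handle, as with Options:

    ReadOptions readOptions = new ReadOptions()
        .setVerifyChecksums(true)
        .setFillCache(false);    // avoid polluting the block cache during scans
    try {
      // pass readOptions to read calls, e.g. db.get(readOptions, key, value)
    } finally {
      readOptions.dispose();
    }
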
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/RemoveEmptyValueCompactionFilter.java b/src/rocksdb/java/src/main/java/org/rocksdb/RemoveEmptyValueCompactionFilter.java
new file mode 100644
index 0000000..61c4613
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/RemoveEmptyValueCompactionFilter.java
@@ -0,0 +1,18 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+/**
+ * Just a Java wrapper around EmptyValueCompactionFilter implemented in C++
+ */
+public class RemoveEmptyValueCompactionFilter extends AbstractCompactionFilter<Slice> {
+  public RemoveEmptyValueCompactionFilter() {
+    super();
+    createNewRemoveEmptyValueCompactionFilter0();
+  }
+
+  private native void createNewRemoveEmptyValueCompactionFilter0();
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/RestoreBackupableDB.java b/src/rocksdb/java/src/main/java/org/rocksdb/RestoreBackupableDB.java
new file mode 100644
index 0000000..5a3b2fc
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/RestoreBackupableDB.java
@@ -0,0 +1,166 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+import java.util.List;
+
+/**
+ * <p>This class is used to access information about backups and
+ * restore from them.</p>
+ *
+ * <p>Note: {@code dispose()} must be called before this instance
+ * becomes out-of-scope to release the allocated
+ * memory on the C++ side.</p>
+ *
+ */
+public class RestoreBackupableDB extends RocksObject {
+  /**
+   * <p>Construct a new RestoreBackupableDB instance.</p>
+   *
+   * @param options {@link org.rocksdb.BackupableDBOptions} instance
+   */
+  public RestoreBackupableDB(final BackupableDBOptions options) {
+    super();
+    nativeHandle_ = newRestoreBackupableDB(options.nativeHandle_);
+  }
+
+  /**
+   * <p>Restore from backup with backup_id.</p>
+   *
+   * <p><strong>Important</strong>: If options_.share_table_files == true
+   * and you restore the DB from some backup that is not the latest, and you
+   * start creating new backups from the new DB, they will probably
+   * fail.</p>
+   *
+   * <p><strong>Example</strong>: Let's say you have backups 1, 2, 3, 4, 5
+   * and you restore 3. If you add new data to the DB and try creating a new
+   * backup now, the database will diverge from backups 4 and 5 and the new
+   * backup will fail. If you want to create a new backup, you will first have
+   * to delete backups 4 and 5.</p>
+   *
+   * @param backupId id pointing to backup
+   * @param dbDir database directory to restore to
+   * @param walDir directory where wal files are located
+   * @param restoreOptions {@link org.rocksdb.RestoreOptions} instance.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public void restoreDBFromBackup(final long backupId, final String dbDir,
+      final String walDir, final RestoreOptions restoreOptions)
+      throws RocksDBException {
+    assert(isInitialized());
+    restoreDBFromBackup0(nativeHandle_, backupId, dbDir, walDir,
+        restoreOptions.nativeHandle_);
+  }
+
+  /**
+   * <p>Restore from the latest backup.</p>
+   *
+   * @param dbDir database directory to restore to
+   * @param walDir directory where wal files are located
+   * @param restoreOptions {@link org.rocksdb.RestoreOptions} instance
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public void restoreDBFromLatestBackup(final String dbDir,
+      final String walDir, final RestoreOptions restoreOptions)
+      throws RocksDBException {
+    assert(isInitialized());
+    restoreDBFromLatestBackup0(nativeHandle_, dbDir, walDir,
+        restoreOptions.nativeHandle_);
+  }
+
+  /**
+   * <p>Deletes old backups, keeping the latest numBackupsToKeep alive.</p>
+   *
+   * @param numBackupsToKeep number of latest backups to keep
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public void purgeOldBackups(final int numBackupsToKeep)
+      throws RocksDBException {
+    assert(isInitialized());
+    purgeOldBackups0(nativeHandle_, numBackupsToKeep);
+  }
+
+  /**
+   * <p>Deletes a specific backup.</p>
+   *
+   * @param backupId id of the backup to delete.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public void deleteBackup(final int backupId)
+      throws RocksDBException {
+    assert(isInitialized());
+    deleteBackup0(nativeHandle_, backupId);
+  }
+
+  /**
+   * <p>Returns a list of {@link BackupInfo} instances, which describe
+   * already made backups.</p>
+   *
+   * @return List of {@link BackupInfo} instances.
+   */
+  public List<BackupInfo> getBackupInfos() {
+    assert(isInitialized());
+    return getBackupInfo(nativeHandle_);
+  }
+
+  /**
+   * <p>Returns a list of corrupted backup ids. If there
+   * is no corrupted backup the method will return an
+   * empty list.</p>
+   *
+   * @return array of corrupted backup ids.
+   */
+  public int[] getCorruptedBackups() {
+    assert(isInitialized());
+    return getCorruptedBackups(nativeHandle_);
+  }
+
+  /**
+   * <p>Will delete all the files we don't need anymore. It will
+   * do a full scan of the files/ directory and delete all the
+   * files that are not referenced.</p>
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public void garbageCollect() throws RocksDBException {
+    assert(isInitialized());
+    garbageCollect(nativeHandle_);
+  }
+
+  /**
+   * <p>Release the memory allocated for the current instance
+   * on the C++ side.</p>
+   */
+  @Override public synchronized void disposeInternal() {
+    dispose(nativeHandle_);
+  }
+
+  private native long newRestoreBackupableDB(long options);
+  private native void restoreDBFromBackup0(long nativeHandle, long backupId,
+      String dbDir, String walDir, long restoreOptions)
+      throws RocksDBException;
+  private native void restoreDBFromLatestBackup0(long nativeHandle,
+      String dbDir, String walDir, long restoreOptions)
+      throws RocksDBException;
+  private native void purgeOldBackups0(long nativeHandle, int numBackupsToKeep)
+      throws RocksDBException;
+  private native void deleteBackup0(long nativeHandle, int backupId)
+      throws RocksDBException;
+  private native List<BackupInfo> getBackupInfo(long handle);
+  private native int[] getCorruptedBackups(long handle);
+  private native void garbageCollect(long handle)
+      throws RocksDBException;
+  private native void dispose(long nativeHandle);
+}
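
A hedged end-to-end sketch of the restore flow above; the paths are
placeholders and BackupableDBOptions is the companion class named in the
constructor Javadoc:

    BackupableDBOptions backupOptions =
        new BackupableDBOptions("/path/to/backups");
    RestoreBackupableDB restore = new RestoreBackupableDB(backupOptions);
    RestoreOptions restoreOptions = new RestoreOptions(false);
    try {
      restore.restoreDBFromLatestBackup(
          "/path/to/db", "/path/to/db", restoreOptions);  // dbDir == walDir
    } catch (RocksDBException e) {
      // report the failure from the native side
    } finally {
      restoreOptions.dispose();
      restore.dispose();
      backupOptions.dispose();
    }
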
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/RestoreOptions.java b/src/rocksdb/java/src/main/java/org/rocksdb/RestoreOptions.java
new file mode 100644
index 0000000..d98167a
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/RestoreOptions.java
@@ -0,0 +1,41 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+/**
+ * RestoreOptions to control the behavior of restore.
+ *
+ * Note that dispose() must be called before this instance becomes out-of-scope
+ * to release the allocated memory on the C++ side.
+ *
+ */
+public class RestoreOptions extends RocksObject {
+  /**
+   * Constructor
+   *
+   * @param keepLogFiles If true, restore won't overwrite the existing log files in wal_dir. It
+   *     will also move all log files from archive directory to wal_dir. Use this
+   *     option in combination with BackupableDBOptions::backup_log_files = false
+   *     for persisting in-memory databases.
+   *     Default: false
+   */
+  public RestoreOptions(final boolean keepLogFiles) {
+    super();
+    nativeHandle_ = newRestoreOptions(keepLogFiles);
+  }
+
+  /**
+   * Release the memory allocated for the current instance
+   * on the C++ side.
+   */
+  @Override public synchronized void disposeInternal() {
+    assert(isInitialized());
+    dispose(nativeHandle_);
+  }
+
+  private native long newRestoreOptions(boolean keepLogFiles);
+  private native void dispose(long handle);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/RocksDB.java b/src/rocksdb/java/src/main/java/org/rocksdb/RocksDB.java
new file mode 100644
index 0000000..2af55c4
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/RocksDB.java
@@ -0,0 +1,1824 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+import java.util.*;
+import java.io.IOException;
+import org.rocksdb.util.Environment;
+
+/**
+ * A RocksDB is a persistent ordered map from keys to values.  It is safe for
+ * concurrent access from multiple threads without any external synchronization.
+ * All methods of this class could potentially throw RocksDBException, which
+ * indicates that something went wrong on the RocksDB library side and the call failed.
+ */
+public class RocksDB extends RocksObject {
+  public static final byte[] DEFAULT_COLUMN_FAMILY = "default".getBytes();
+  public static final int NOT_FOUND = -1;
+
+  static {
+    RocksDB.loadLibrary();
+  }
+
+  /**
+   * Loads the necessary library files.
+   * Calling this method twice will have no effect.
+   * By default the method extracts the shared library for loading into
+   * java.io.tmpdir; however, you can override this temporary location by
+   * setting the environment variable ROCKSDB_SHAREDLIB_DIR.
+   */
+  public static synchronized void loadLibrary() {
+    String tmpDir = System.getenv("ROCKSDB_SHAREDLIB_DIR");
+    // loading possibly necessary libraries.
+    for (CompressionType compressionType : CompressionType.values()) {
+      try {
+        if (compressionType.getLibraryName() != null) {
+          System.loadLibrary(compressionType.getLibraryName());
+        }
+      } catch (UnsatisfiedLinkError e) {
+        // since it may be optional, we ignore its loading failure here.
+      }
+    }
+    try {
+      NativeLibraryLoader.getInstance().loadLibrary(tmpDir);
+    } catch (IOException e) {
+      throw new RuntimeException("Unable to load the RocksDB shared library: "
+          + e);
+    }
+  }
+
+  /**
+   * Tries to load the necessary library files from the given list of
+   * directories.
+   *
+   * @param paths a list of strings where each describes a directory
+   *     of a library.
+   */
+  public static synchronized void loadLibrary(final List<String> paths) {
+    for (CompressionType compressionType : CompressionType.values()) {
+      if (compressionType.equals(CompressionType.NO_COMPRESSION)) {
+        continue;
+      }
+      for (String path : paths) {
+        try {
+          System.load(path + "/" + Environment.getSharedLibraryFileName(
+              compressionType.getLibraryName()));
+          break;
+        } catch (UnsatisfiedLinkError e) {
+          // since they are optional, we ignore loading failures.
+        }
+      }
+    }
+    boolean success = false;
+    UnsatisfiedLinkError err = null;
+    for (String path : paths) {
+      try {
+        System.load(path + "/" + Environment.getJniLibraryFileName("rocksdbjni"));
+        success = true;
+        break;
+      } catch (UnsatisfiedLinkError e) {
+        err = e;
+      }
+    }
+    if (!success) {
+      throw err;
+    }
+  }
+
+  /**
+   * The factory constructor of RocksDB that opens a RocksDB instance given
+   * the path to the database using the default options with createIfMissing
+   * set to true.
+   *
+   * @param path the path to the rocksdb.
+   * @return a {@link RocksDB} instance on success, null if the specified
+   *     {@link RocksDB} can not be opened.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   * @see Options#setCreateIfMissing(boolean)
+   */
+  public static RocksDB open(final String path) throws RocksDBException {
+    // This allows use of the rocksjni default Options instead of
+    // the C++ one.
+    Options options = new Options();
+    options.setCreateIfMissing(true);
+    return open(options, path);
+  }
+
+  /**
+   * The factory constructor of RocksDB that opens a RocksDB instance given
+   * the path to the database using the specified options and db path and a list
+   * of column family names.
+   * <p>
+   * If opened in read-write mode every existing column family name must be passed
+   * within the list to this method.</p>
+   * <p>
+   * If opened in read-only mode only a subset of the existing column families may
+   * be passed to this method.</p>
+   * <p>
+   * Options instance *should* not be disposed before all DBs using this options
+   * instance have been closed. If the user doesn't call options dispose explicitly,
+   * then this options instance will be GC'd automatically.</p>
+   * <p>
+   * ColumnFamily handles are disposed when the RocksDB instance is disposed.
+   * </p>
+   *
+   * @param path the path to the rocksdb.
+   * @param columnFamilyDescriptors list of column family descriptors
+   * @param columnFamilyHandles will be filled with ColumnFamilyHandle instances
+   *     on open.
+   * @return a {@link RocksDB} instance on success, null if the specified
+   *     {@link RocksDB} can not be opened.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   * @see DBOptions#setCreateIfMissing(boolean)
+   */
+  public static RocksDB open(final String path,
+      final List<ColumnFamilyDescriptor> columnFamilyDescriptors,
+      final List<ColumnFamilyHandle> columnFamilyHandles)
+      throws RocksDBException {
+    // This allows use of the rocksjni default Options instead of
+    // the C++ one.
+    DBOptions options = new DBOptions();
+    return open(options, path, columnFamilyDescriptors, columnFamilyHandles);
+  }
+
+  /**
+   * The factory constructor of RocksDB that opens a RocksDB instance given
+   * the path to the database using the specified options and db path.
+   *
+   * <p>
+   * Options instance *should* not be disposed before all DBs using this options
+   * instance have been closed. If the user doesn't call options dispose explicitly,
+   * then this options instance will be GC'd automatically.</p>
+   * <p>
+   * Options instance can be re-used to open multiple DBs if DB statistics are
+   * not used. If DB statistics are required, then it's recommended to open the DB
+   * with a new Options instance, as the underlying native statistics instance does
+   * not use any locks to prevent concurrent updates.</p>
+   *
+   * @param options {@link org.rocksdb.Options} instance.
+   * @param path the path to the rocksdb.
+   * @return a {@link RocksDB} instance on success, null if the specified
+   *     {@link RocksDB} can not be opened.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   *
+   * @see Options#setCreateIfMissing(boolean)
+   */
+  public static RocksDB open(final Options options, final String path)
+      throws RocksDBException {
+    // when a non-default Options is used, keeping an Options reference
+    // in RocksDB prevents Java from GC'ing it during the lifetime of
+    // the currently-created RocksDB.
+    RocksDB db = new RocksDB();
+    db.open(options.nativeHandle_, path);
+
+    db.storeOptionsInstance(options);
+    return db;
+  }
+
+  /**
+   * The factory constructor of RocksDB that opens a RocksDB instance given
+   * the path to the database using the specified options and db path and a list
+   * of column family names.
+   * <p>
+   * If opened in read-write mode every existing column family name must be passed
+   * within the list to this method.</p>
+   * <p>
+   * If opened in read-only mode only a subset of the existing column families may
+   * be passed to this method.</p>
+   * <p>
+   * Options instance *should* not be disposed before all DBs using this options
+   * instance have been closed. If the user doesn't call options dispose explicitly,
+   * then this options instance will be GC'd automatically.</p>
+   * <p>
+   * Options instance can be re-used to open multiple DBs if DB statistics are
+   * not used. If DB statistics are required, then it's recommended to open the DB
+   * with a new Options instance, as the underlying native statistics instance does
+   * not use any locks to prevent concurrent updates.</p>
+   * <p>
+   * ColumnFamily handles are disposed when the RocksDB instance is disposed.</p>
+   *
+   * @param options {@link org.rocksdb.DBOptions} instance.
+   * @param path the path to the rocksdb.
+   * @param columnFamilyDescriptors list of column family descriptors
+   * @param columnFamilyHandles will be filled with ColumnFamilyHandle instances
+   *     on open.
+   * @return a {@link RocksDB} instance on success, null if the specified
+   *     {@link RocksDB} can not be opened.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   *
+   * @see DBOptions#setCreateIfMissing(boolean)
+   */
+  public static RocksDB open(final DBOptions options, final String path,
+      final List<ColumnFamilyDescriptor> columnFamilyDescriptors,
+      final List<ColumnFamilyHandle> columnFamilyHandles)
+      throws RocksDBException {
+    RocksDB db = new RocksDB();
+    List<Long> cfReferences = db.open(options.nativeHandle_, path,
+        columnFamilyDescriptors, columnFamilyDescriptors.size());
+    for (int i = 0; i < columnFamilyDescriptors.size(); i++) {
+      columnFamilyHandles.add(new ColumnFamilyHandle(db, cfReferences.get(i)));
+    }
+    db.storeOptionsInstance(options);
+    return db;
+  }
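+
+  // Usage sketch (illustrative only; "users" is a hypothetical column
+  // family): the "default" family must always be listed, and the handles
+  // list is populated positionally, mirroring the loop above.
+  //
+  //   final List<ColumnFamilyDescriptor> descriptors = Arrays.asList(
+  //       new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY),
+  //       new ColumnFamilyDescriptor("users".getBytes()));
+  //   final List<ColumnFamilyHandle> handles = new ArrayList<>();
+  //   final DBOptions opts = new DBOptions()
+  //       .setCreateIfMissing(true)
+  //       .setCreateMissingColumnFamilies(true);
+  //   final RocksDB db = RocksDB.open(opts, "/path/to/db", descriptors, handles);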
+
+  /**
+   * The factory constructor of RocksDB that opens a RocksDB instance in
+   * Read-Only mode given the path to the database using the default
+   * options.
+   *
+   * @param path the path to the RocksDB.
+   * @return a {@link RocksDB} instance on success, null if the specified
+   *     {@link RocksDB} can not be opened.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public static RocksDB openReadOnly(final String path)
+      throws RocksDBException {
+    // This allows use of the rocksjni default Options instead of
+    // the C++ one.
+    Options options = new Options();
+    return openReadOnly(options, path);
+  }
+
+  /**
+   * The factory constructor of RocksDB that opens a RocksDB instance in
+   * Read-Only mode given the path to the database using the default
+   * options.
+   *
+   * @param path the path to the RocksDB.
+   * @param columnFamilyDescriptors list of column family descriptors
+   * @param columnFamilyHandles will be filled with ColumnFamilyHandle instances
+   *     on open.
+   * @return a {@link RocksDB} instance on success, null if the specified
+   *     {@link RocksDB} can not be opened.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public static RocksDB openReadOnly(final String path,
+      final List<ColumnFamilyDescriptor> columnFamilyDescriptors,
+      final List<ColumnFamilyHandle> columnFamilyHandles)
+      throws RocksDBException {
+    // This allows use of the rocksjni default Options instead of
+    // the C++ one.
+    DBOptions options = new DBOptions();
+    return openReadOnly(options, path, columnFamilyDescriptors,
+        columnFamilyHandles);
+  }
+
+  /**
+   * The factory constructor of RocksDB that opens a RocksDB instance in
+   * Read-Only mode given the path to the database using the specified
+   * options and db path.
+   *
+   * Options instance *should* not be disposed before all DBs using this options
+   * instance have been closed. If the user doesn't call options dispose explicitly,
+   * then this options instance will be GC'd automatically.
+   *
+   * @param options {@link Options} instance.
+   * @param path the path to the RocksDB.
+   * @return a {@link RocksDB} instance on success, null if the specified
+   *     {@link RocksDB} can not be opened.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public static RocksDB openReadOnly(final Options options, final String path)
+      throws RocksDBException {
+    // when a non-default Options is used, keeping an Options reference
+    // in RocksDB prevents Java from GC'ing it during the lifetime of
+    // the currently-created RocksDB.
+    RocksDB db = new RocksDB();
+    db.openROnly(options.nativeHandle_, path);
+
+    db.storeOptionsInstance(options);
+    return db;
+  }
+
+  /**
+   * The factory constructor of RocksDB that opens a RocksDB instance in
+   * Read-Only mode given the path to the database using the specified
+   * options and db path.
+   *
+   * <p>This open method allows opening RocksDB using a subset of available
+   * column families.</p>
+   * <p>Options instance *should* not be disposed before all DBs using this
+   * options instance have been closed. If the user doesn't call options dispose
+   * explicitly, then this options instance will be GC'd automatically.</p>
+   *
+   * @param options {@link DBOptions} instance.
+   * @param path the path to the RocksDB.
+   * @param columnFamilyDescriptors list of column family descriptors
+   * @param columnFamilyHandles will be filled with ColumnFamilyHandle instances
+   *     on open.
+   * @return a {@link RocksDB} instance on success, null if the specified
+   *     {@link RocksDB} can not be opened.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public static RocksDB openReadOnly(final DBOptions options, final String path,
+      final List<ColumnFamilyDescriptor> columnFamilyDescriptors,
+      final List<ColumnFamilyHandle> columnFamilyHandles)
+      throws RocksDBException {
+    // when a non-default Options is used, keeping an Options reference
+    // in RocksDB prevents Java from GC'ing it during the lifetime of
+    // the currently-created RocksDB.
+    RocksDB db = new RocksDB();
+    List<Long> cfReferences = db.openROnly(options.nativeHandle_, path,
+        columnFamilyDescriptors, columnFamilyDescriptors.size());
+    for (int i = 0; i < columnFamilyDescriptors.size(); i++) {
+      columnFamilyHandles.add(new ColumnFamilyHandle(db, cfReferences.get(i)));
+    }
+
+    db.storeOptionsInstance(options);
+    return db;
+  }
+
+  /**
+   * Static method to determine all available column families for a
+   * rocksdb database identified by path
+   *
+   * @param options Options for opening the database
+   * @param path Absolute path to rocksdb database
+   * @return list of column family names as byte arrays
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public static List<byte[]> listColumnFamilies(final Options options,
+      final String path) throws RocksDBException {
+    return RocksDB.listColumnFamilies(options.nativeHandle_, path);
+  }
+
+  private void storeOptionsInstance(DBOptionsInterface options) {
+    options_ = options;
+  }
+
+  @Override protected void disposeInternal() {
+    synchronized (this) {
+      assert (isInitialized());
+      disposeInternal(nativeHandle_);
+    }
+  }
+
+  /**
+   * Close the RocksDB instance.
+   * This function is equivalent to dispose().
+   */
+  public void close() {
+    dispose();
+  }
+
+  /**
+   * Set the database entry for "key" to "value".
+   *
+   * @param key the specified key to be inserted.
+   * @param value the value associated with the specified key.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public void put(final byte[] key, final byte[] value) throws RocksDBException {
+    put(nativeHandle_, key, key.length, value, value.length);
+  }
+
+  /**
+   * Set the database entry for "key" to "value" in the specified
+   * column family.
+   *
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+   *     instance
+   * @param key the specified key to be inserted.
+   * @param value the value associated with the specified key.
+   *
+   * @throws IllegalArgumentException if column family is not present
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public void put(final ColumnFamilyHandle columnFamilyHandle,
+      final byte[] key, final byte[] value) throws RocksDBException {
+    put(nativeHandle_, key, key.length, value, value.length,
+        columnFamilyHandle.nativeHandle_);
+  }
+
+  /**
+   * Set the database entry for "key" to "value".
+   *
+   * @param writeOpts {@link org.rocksdb.WriteOptions} instance.
+   * @param key the specified key to be inserted.
+   * @param value the value associated with the specified key.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public void put(final WriteOptions writeOpts, final byte[] key,
+      final byte[] value) throws RocksDBException {
+    put(nativeHandle_, writeOpts.nativeHandle_,
+        key, key.length, value, value.length);
+  }
+
+  /**
+   * Set the database entry for "key" to "value" for the specified
+   * column family.
+   *
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+   *     instance
+   * @param writeOpts {@link org.rocksdb.WriteOptions} instance.
+   * @param key the specified key to be inserted.
+   * @param value the value associated with the specified key.
+   *
+   * @throws IllegalArgumentException if column family is not present
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public void put(final ColumnFamilyHandle columnFamilyHandle,
+      final WriteOptions writeOpts, final byte[] key,
+      final byte[] value) throws RocksDBException {
+    put(nativeHandle_, writeOpts.nativeHandle_, key, key.length, value, value.length,
+        columnFamilyHandle.nativeHandle_);
+  }
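+
+  // Usage sketch for the put() overloads above (keys/values illustrative):
+  //
+  //   db.put("key".getBytes(), "value".getBytes());
+  //   db.put(columnFamilyHandle, new WriteOptions(),
+  //       "key".getBytes(), "value".getBytes());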
+
+  /**
+   * If the key definitely does not exist in the database, then this method
+   * returns false, else true.
+   *
+   * This check is potentially lighter-weight than invoking DB::Get(). One way
+   * to make this lighter weight is to avoid doing any IOs.
+   *
+   * @param key byte array of a key to search for
+   * @param value StringBuffer instance which is an out parameter if a value is
+   *    found in block-cache.
+   * @return false if the key definitely does not exist, true if it might.
+   */
+  public boolean keyMayExist(final byte[] key, final StringBuffer value) {
+    return keyMayExist(key, key.length, value);
+  }
+
+  /**
+   * If the key definitely does not exist in the database, then this method
+   * returns false, else true.
+   *
+   * This check is potentially lighter-weight than invoking DB::Get(). One way
+   * to make this lighter weight is to avoid doing any IOs.
+   *
+   * @param columnFamilyHandle {@link ColumnFamilyHandle} instance
+   * @param key byte array of a key to search for
+   * @param value StringBuffer instance which is an out parameter if a value is
+   *    found in block-cache.
+   * @return false if the key definitely does not exist, true if it might.
+   */
+  public boolean keyMayExist(final ColumnFamilyHandle columnFamilyHandle,
+      final byte[] key, final StringBuffer value) {
+    return keyMayExist(key, key.length, columnFamilyHandle.nativeHandle_,
+        value);
+  }
+
+  /**
+   * If the key definitely does not exist in the database, then this method
+   * returns false, else true.
+   *
+   * This check is potentially lighter-weight than invoking DB::Get(). One way
+   * to make this lighter weight is to avoid doing any IOs.
+   *
+   * @param readOptions {@link ReadOptions} instance
+   * @param key byte array of a key to search for
+   * @param value StringBuffer instance which is an out parameter if a value is
+   *    found in block-cache.
+   * @return false if the key definitely does not exist, true if it might.
+   */
+  public boolean keyMayExist(final ReadOptions readOptions,
+      final byte[] key, final StringBuffer value) {
+    return keyMayExist(readOptions.nativeHandle_,
+        key, key.length, value);
+  }
+
+  /**
+   * If the key definitely does not exist in the database, then this method
+   * returns false, else true.
+   *
+   * This check is potentially lighter-weight than invoking DB::Get(). One way
+   * to make this lighter weight is to avoid doing any IOs.
+   *
+   * @param readOptions {@link ReadOptions} instance
+   * @param columnFamilyHandle {@link ColumnFamilyHandle} instance
+   * @param key byte array of a key to search for
+   * @param value StringBuffer instance which is an out parameter if a value is
+   *    found in block-cache.
+   * @return false if the key definitely does not exist, true if it might.
+   */
+  public boolean keyMayExist(final ReadOptions readOptions,
+      final ColumnFamilyHandle columnFamilyHandle, final byte[] key,
+      final StringBuffer value) {
+    return keyMayExist(readOptions.nativeHandle_,
+        key, key.length, columnFamilyHandle.nativeHandle_,
+        value);
+  }
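+
+  // Note: keyMayExist() can return a false positive but never a false
+  // negative, so a false result can safely short-circuit a full get():
+  //
+  //   final StringBuffer inCache = new StringBuffer();
+  //   if (!db.keyMayExist("key".getBytes(), inCache)) {
+  //     // key is definitely absent; skip the get()
+  //   }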
+
+  /**
+   * Apply the specified updates to the database.
+   *
+   * @param writeOpts WriteOptions instance
+   * @param updates WriteBatch instance
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public void write(final WriteOptions writeOpts, final WriteBatch updates)
+      throws RocksDBException {
+    write0(writeOpts.nativeHandle_, updates.nativeHandle_);
+  }
+
+  /**
+   * Apply the specified updates to the database.
+   *
+   * @param writeOpts WriteOptions instance
+   * @param updates WriteBatchWithIndex instance
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public void write(final WriteOptions writeOpts,
+      final WriteBatchWithIndex updates) throws RocksDBException {
+    write1(writeOpts.nativeHandle_, updates.nativeHandle_);
+  }
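+
+  // Usage sketch: batching several updates into one atomic write, assuming
+  // the WriteBatch put()/remove() methods of this binding:
+  //
+  //   final WriteBatch batch = new WriteBatch();
+  //   batch.put("k1".getBytes(), "v1".getBytes());
+  //   batch.remove("k2".getBytes());
+  //   db.write(new WriteOptions(), batch);
+  //   batch.dispose();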
+
+  /**
+   * Add merge operand for key/value pair.
+   *
+   * @param key the specified key to be merged.
+   * @param value the value to be merged with the current value for
+   * the specified key.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public void merge(final byte[] key, final byte[] value) throws RocksDBException {
+    merge(nativeHandle_, key, key.length, value, value.length);
+  }
+
+  /**
+   * Add merge operand for key/value pair in a ColumnFamily.
+   *
+   * @param columnFamilyHandle {@link ColumnFamilyHandle} instance
+   * @param key the specified key to be merged.
+   * @param value the value to be merged with the current value for
+   * the specified key.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public void merge(final ColumnFamilyHandle columnFamilyHandle,
+      final byte[] key, final byte[] value) throws RocksDBException {
+    merge(nativeHandle_, key, key.length, value, value.length,
+        columnFamilyHandle.nativeHandle_);
+  }
+
+  /**
+   * Add merge operand for key/value pair.
+   *
+   * @param writeOpts {@link WriteOptions} for this write.
+   * @param key the specified key to be merged.
+   * @param value the value to be merged with the current value for
+   * the specified key.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public void merge(final WriteOptions writeOpts, final byte[] key,
+      final byte[] value) throws RocksDBException {
+    merge(nativeHandle_, writeOpts.nativeHandle_,
+        key, key.length, value, value.length);
+  }
+
+  /**
+   * Add merge operand for key/value pair.
+   *
+   * @param columnFamilyHandle {@link ColumnFamilyHandle} instance
+   * @param writeOpts {@link WriteOptions} for this write.
+   * @param key the specified key to be merged.
+   * @param value the value to be merged with the current value for
+   * the specified key.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public void merge(final ColumnFamilyHandle columnFamilyHandle,
+      final WriteOptions writeOpts, final byte[] key,
+      final byte[] value) throws RocksDBException {
+    merge(nativeHandle_, writeOpts.nativeHandle_,
+        key, key.length, value, value.length,
+        columnFamilyHandle.nativeHandle_);
+  }
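+
+  // A minimal usage sketch (illustrative only, not upstream code): merge()
+  // requires a merge operator on the Options the database was opened with;
+  // the built-in string-append operator is assumed to be available here.
+  //
+  //   Options opt = new Options().setCreateIfMissing(true)
+  //       .setMergeOperatorName("stringappend");
+  //   RocksDB db = RocksDB.open(opt, path);
+  //   db.merge("key".getBytes(), "aa".getBytes());
+  //   db.merge("key".getBytes(), "bb".getBytes());
+  //   // db.get("key".getBytes()) now yields "aa,bb"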
+
+  /**
+   * Get the value associated with the specified key.
+   *
+   * @param key the key to retrieve the value.
+   * @param value the out-value to receive the retrieved value.
+   * @return The size of the actual value that matches the specified
+   *     {@code key} in bytes.  If the return value is greater than the
+   *     length of {@code value}, then it indicates that the size of the
+   *     input buffer {@code value} is insufficient and a partial result will
+   *     be returned.  RocksDB.NOT_FOUND will be returned if the value is
+   *     not found.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public int get(final byte[] key, final byte[] value) throws RocksDBException {
+    return get(nativeHandle_, key, key.length, value, value.length);
+  }
+
+  /**
+   * Get the value associated with the specified key within column family.
+   *
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+   *     instance
+   * @param key the key to retrieve the value.
+   * @param value the out-value to receive the retrieved value.
+   * @return The size of the actual value that matches the specified
+   *     {@code key} in bytes.  If the return value is greater than the
+   *     length of {@code value}, then it indicates that the size of the
+   *     input buffer {@code value} is insufficient and a partial result will
+   *     be returned.  RocksDB.NOT_FOUND will be returned if the value is
+   *     not found.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public int get(final ColumnFamilyHandle columnFamilyHandle, final byte[] key,
+      final byte[] value) throws RocksDBException, IllegalArgumentException {
+    return get(nativeHandle_, key, key.length, value, value.length,
+        columnFamilyHandle.nativeHandle_);
+  }
+
+  /**
+   * Get the value associated with the specified key.
+   *
+   * @param opt {@link org.rocksdb.ReadOptions} instance.
+   * @param key the key to retrieve the value.
+   * @param value the out-value to receive the retrieved value.
+   * @return The size of the actual value that matches the specified
+   *     {@code key} in bytes.  If the return value is greater than the
+   *     length of {@code value}, then it indicates that the size of the
+   *     input buffer {@code value} is insufficient and a partial result will
+   *     be returned.  RocksDB.NOT_FOUND will be returned if the value is
+   *     not found.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public int get(final ReadOptions opt, final byte[] key,
+      final byte[] value) throws RocksDBException {
+    return get(nativeHandle_, opt.nativeHandle_,
+               key, key.length, value, value.length);
+  }
+
+  /**
+   * Get the value associated with the specified key within column family.
+   *
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+   *     instance
+   * @param opt {@link org.rocksdb.ReadOptions} instance.
+   * @param key the key to retrieve the value.
+   * @param value the out-value to receive the retrieved value.
+   * @return The size of the actual value that matches the specified
+   *     {@code key} in bytes.  If the return value is greater than the
+   *     length of {@code value}, then it indicates that the size of the
+   *     input buffer {@code value} is insufficient and a partial result will
+   *     be returned.  RocksDB.NOT_FOUND will be returned if the value is
+   *     not found.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public int get(final ColumnFamilyHandle columnFamilyHandle,
+      final ReadOptions opt, final byte[] key, final byte[] value)
+      throws RocksDBException {
+    return get(nativeHandle_, opt.nativeHandle_, key, key.length, value,
+        value.length, columnFamilyHandle.nativeHandle_);
+  }
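+
+  // A minimal usage sketch (illustrative only, not upstream code):
+  // interpreting the size/status return value of the buffer-based get(),
+  // assuming an open instance "db".
+  //
+  //   byte[] buffer = new byte[64];
+  //   int size = db.get("key".getBytes(), buffer);
+  //   if (size == RocksDB.NOT_FOUND) {
+  //     // no such key
+  //   } else if (size > buffer.length) {
+  //     // buffer holds only a partial result; "size" is the full value
+  //     // length, so retry with new byte[size]
+  //   }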
+
+  /**
+   * The simplified version of get which returns a new byte array storing
+   * the value associated with the specified input key if any.  null will be
+   * returned if the specified key is not found.
+   *
+   * @param key the key to retrieve the value.
+   * @return a byte array storing the value associated with the input key if
+   *     any.  null if it does not find the specified key.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public byte[] get(final byte[] key) throws RocksDBException {
+    return get(nativeHandle_, key, key.length);
+  }
+
+  /**
+   * The simplified version of get which returns a new byte array storing
+   * the value associated with the specified input key if any.  null will be
+   * returned if the specified key is not found.
+   *
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+   *     instance
+   * @param key the key to retrieve the value.
+   * @return a byte array storing the value associated with the input key if
+   *     any.  null if it does not find the specified key.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public byte[] get(final ColumnFamilyHandle columnFamilyHandle, final byte[] key)
+      throws RocksDBException {
+    return get(nativeHandle_, key, key.length, columnFamilyHandle.nativeHandle_);
+  }
+
+  /**
+   * The simplified version of get which returns a new byte array storing
+   * the value associated with the specified input key if any.  null will be
+   * returned if the specified key is not found.
+   *
+   * @param key the key to retrieve the value.
+   * @param opt Read options.
+   * @return a byte array storing the value associated with the input key if
+   *     any.  null if it does not find the specified key.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public byte[] get(final ReadOptions opt, final byte[] key)
+      throws RocksDBException {
+    return get(nativeHandle_, opt.nativeHandle_, key, key.length);
+  }
+
+  /**
+   * The simplified version of get which returns a new byte array storing
+   * the value associated with the specified input key if any.  null will be
+   * returned if the specified key is not found.
+   *
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+   *     instance
+   * @param key the key to retrieve the value.
+   * @param opt Read options.
+   * @return a byte array storing the value associated with the input key if
+   *     any.  null if it does not find the specified key.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public byte[] get(final ColumnFamilyHandle columnFamilyHandle,
+      final ReadOptions opt, final byte[] key) throws RocksDBException {
+    return get(nativeHandle_, opt.nativeHandle_, key, key.length,
+        columnFamilyHandle.nativeHandle_);
+  }
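+
+  // A minimal usage sketch (illustrative only, not upstream code): the
+  // allocating get() returns null for missing keys, assuming an open "db".
+  //
+  //   byte[] value = db.get("key".getBytes());
+  //   if (value != null) {
+  //     System.out.println(new String(value));
+  //   }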
+
+  /**
+   * Returns a map of keys for which values were found in DB.
+   *
+   * @param keys List of keys for which values need to be retrieved.
+   * @return Map where the key is the key passed by the user and the value
+   * is the corresponding value in the DB.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public Map<byte[], byte[]> multiGet(final List<byte[]> keys)
+      throws RocksDBException {
+    assert(keys.size() != 0);
+
+    List<byte[]> values = multiGet(
+        nativeHandle_, keys, keys.size());
+
+    Map<byte[], byte[]> keyValueMap = new HashMap<>();
+    for (int i = 0; i < values.size(); i++) {
+      if (values.get(i) == null) {
+        continue;
+      }
+
+      keyValueMap.put(keys.get(i), values.get(i));
+    }
+
+    return keyValueMap;
+  }
+
+  /**
+   * Returns a map of keys for which values were found in DB.
+   * <p>
+   * Note: Every key needs to have a related column family name in
+   * {@code columnFamilyHandleList}.
+   * </p>
+   *
+   * @param columnFamilyHandleList {@link java.util.List} containing
+   *     {@link org.rocksdb.ColumnFamilyHandle} instances.
+   * @param keys List of keys for which values need to be retrieved.
+   * @return Map where the key is the key passed by the user and the value
+   * is the corresponding value in the DB.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   * @throws IllegalArgumentException thrown if the size of passed keys is not
+   *    equal to the amount of passed column family handles.
+   */
+  public Map<byte[], byte[]> multiGet(final List<ColumnFamilyHandle> columnFamilyHandleList,
+      final List<byte[]> keys) throws RocksDBException, IllegalArgumentException {
+    assert(keys.size() != 0);
+    // Check that the number of keys matches the number of column family
+    // handles; otherwise the native call can cause a segmentation fault.
+    if (keys.size() != columnFamilyHandleList.size()) {
+      throw new IllegalArgumentException(
+          "For each key there must be a ColumnFamilyHandle.");
+    }
+    List<byte[]> values = multiGet(nativeHandle_, keys, keys.size(),
+        columnFamilyHandleList);
+
+    Map<byte[], byte[]> keyValueMap = new HashMap<>();
+    for (int i = 0; i < values.size(); i++) {
+      if (values.get(i) == null) {
+        continue;
+      }
+      keyValueMap.put(keys.get(i), values.get(i));
+    }
+    return keyValueMap;
+  }
+
+  /**
+   * Returns a map of keys for which values were found in DB.
+   *
+   * @param opt Read options.
+   * @param keys List of keys for which values need to be retrieved.
+   * @return Map where the key is the key passed by the user and the value
+   * is the corresponding value in the DB.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public Map<byte[], byte[]> multiGet(final ReadOptions opt,
+      final List<byte[]> keys) throws RocksDBException {
+    assert(keys.size() != 0);
+
+    List<byte[]> values = multiGet(
+        nativeHandle_, opt.nativeHandle_, keys, keys.size());
+
+    Map<byte[], byte[]> keyValueMap = new HashMap<>();
+    for (int i = 0; i < values.size(); i++) {
+      if (values.get(i) == null) {
+        continue;
+      }
+
+      keyValueMap.put(keys.get(i), values.get(i));
+    }
+
+    return keyValueMap;
+  }
+
+  /**
+   * Returns a map of keys for which values were found in DB.
+   * <p>
+   * Note: Every key needs to have a related column family name in
+   * {@code columnFamilyHandleList}.
+   * </p>
+   *
+   * @param opt Read options.
+   * @param columnFamilyHandleList {@link java.util.List} containing
+   *     {@link org.rocksdb.ColumnFamilyHandle} instances.
+   * @param keys List of keys for which values need to be retrieved.
+   * @return Map where the key is the key passed by the user and the value
+   * is the corresponding value in the DB.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   * @throws IllegalArgumentException thrown if the size of passed keys is not
+   *    equal to the amount of passed column family handles.
+   */
+  public Map<byte[], byte[]> multiGet(final ReadOptions opt,
+      final List<ColumnFamilyHandle> columnFamilyHandleList,
+      final List<byte[]> keys) throws RocksDBException {
+    assert(keys.size() != 0);
+    // Check that the number of keys matches the number of column family
+    // handles; otherwise the native call can cause a segmentation fault.
+    if (keys.size() != columnFamilyHandleList.size()) {
+      throw new IllegalArgumentException(
+          "For each key there must be a ColumnFamilyHandle.");
+    }
+
+    List<byte[]> values = multiGet(nativeHandle_, opt.nativeHandle_,
+        keys, keys.size(), columnFamilyHandleList);
+
+    Map<byte[], byte[]> keyValueMap = new HashMap<>();
+    for (int i = 0; i < values.size(); i++) {
+      if (values.get(i) == null) {
+        continue;
+      }
+      keyValueMap.put(keys.get(i), values.get(i));
+    }
+
+    return keyValueMap;
+  }
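+
+  // A minimal usage sketch (illustrative only, not upstream code): keys
+  // absent from the database are simply omitted from the returned map,
+  // assuming an open "db" and java.util.Arrays imported.
+  //
+  //   List<byte[]> keys = Arrays.asList(
+  //       "k1".getBytes(), "k2".getBytes(), "missing".getBytes());
+  //   Map<byte[], byte[]> found = db.multiGet(keys);
+  //   // "found" contains entries only for the keys that exist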
+
+  /**
+   * Remove the database entry (if any) for "key".  It is not an error
+   * if "key" did not exist in the database.
+   *
+   * @param key Key to delete within database
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public void remove(final byte[] key) throws RocksDBException {
+    remove(nativeHandle_, key, key.length);
+  }
+
+  /**
+   * Remove the database entry (if any) for "key".  It is not an error
+   * if "key" did not exist in the database.
+   *
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+   *     instance
+   * @param key Key to delete within database
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public void remove(final ColumnFamilyHandle columnFamilyHandle, final byte[] key)
+      throws RocksDBException {
+    remove(nativeHandle_, key, key.length, columnFamilyHandle.nativeHandle_);
+  }
+
+  /**
+   * Remove the database entry (if any) for "key".  It is not an error
+   * if "key" did not exist in the database.
+   *
+   * @param writeOpt WriteOptions to be used with delete operation
+   * @param key Key to delete within database
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public void remove(final WriteOptions writeOpt, final byte[] key)
+      throws RocksDBException {
+    remove(nativeHandle_, writeOpt.nativeHandle_, key, key.length);
+  }
+
+  /**
+   * Remove the database entry (if any) for "key".  It is not an error
+   * if "key" did not exist in the database.
+   *
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+   *     instance
+   * @param writeOpt WriteOptions to be used with delete operation
+   * @param key Key to delete within database
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public void remove(final ColumnFamilyHandle columnFamilyHandle,
+      final WriteOptions writeOpt, final byte[] key)
+      throws RocksDBException {
+    remove(nativeHandle_, writeOpt.nativeHandle_, key, key.length,
+        columnFamilyHandle.nativeHandle_);
+  }
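+
+  // A minimal usage sketch (illustrative only, not upstream code): removal
+  // is idempotent, assuming an open instance "db".
+  //
+  //   db.remove("key".getBytes());
+  //   db.remove("key".getBytes());  // not an error; the key is already gone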
+
+  /**
+   * DB implementations can export properties about their state
+   * via this method on a per column family level.
+   *
+   * <p>If {@code property} is a valid property understood by this DB
+   * implementation, its current value is returned.</p>
+   *
+   * <p>Valid property names include:
+   * <ul>
+   * <li>"rocksdb.num-files-at-level<N>" - return the number of files at level <N>,
+   *     where <N> is an ASCII representation of a level number (e.g. "0").</li>
+   * <li>"rocksdb.stats" - returns a multi-line string that describes statistics
+   *     about the internal operation of the DB.</li>
+   * <li>"rocksdb.sstables" - returns a multi-line string that describes all
+   *    of the sstables that make up the db contents.</li>
+   * </ul>
+   *
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+   *     instance
+   * @param property to be fetched. See above for examples
+   * @return property value
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public String getProperty(final ColumnFamilyHandle columnFamilyHandle,
+      final String property) throws RocksDBException {
+    return getProperty0(nativeHandle_, columnFamilyHandle.nativeHandle_, property,
+        property.length());
+  }
+
+  /**
+   * DB implementations can export properties about their state
+   * via this method.  If {@code property} is a valid property understood
+   * by this DB implementation, its current value is returned.
+   *
+   * <p>Valid property names include:
+   * <ul>
+   * <li>"rocksdb.num-files-at-level<N>" - return the number of files at level <N>,
+   *     where <N> is an ASCII representation of a level number (e.g. "0").</li>
+   * <li>"rocksdb.stats" - returns a multi-line string that describes statistics
+   *     about the internal operation of the DB.</li>
+   * <li>"rocksdb.sstables" - returns a multi-line string that describes all
+   *    of the sstables that make up the db contents.</li>
+   * </ul>
+   *
+   * @param property to be fetched. See above for examples
+   * @return property value
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public String getProperty(final String property) throws RocksDBException {
+    return getProperty0(nativeHandle_, property, property.length());
+  }
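+
+  // A minimal usage sketch (illustrative only, not upstream code): dumping
+  // the properties listed above, assuming an open instance "db".
+  //
+  //   System.out.println(db.getProperty("rocksdb.stats"));
+  //   System.out.println(db.getProperty("rocksdb.num-files-at-level0"));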
+
+  /**
+   * <p> Similar to GetProperty(), but only works for a subset of properties whose
+   * return value is a numerical value. Return the value as long.</p>
+   *
+   * <p><strong>Note</strong>: As the returned property is of type
+   * {@code uint64_t} on the C++ side, the returned value can be negative
+   * because Java 7 supports only signed long values.</p>
+   *
+   * <p><strong>Java 7</strong>: To mitigate the problem of the
+   * non-existent unsigned long type, values should be encapsulated using
+   * {@link java.math.BigInteger} to reflect the correct value. The correct
+   * behavior is guaranteed if {@code 2^64} is added to negative values.</p>
+   *
+   * <p><strong>Java 8</strong>: In Java 8 the value should be treated as
+   * unsigned long using provided methods of type {@link Long}.</p>
+   *
+   * @param property to be fetched.
+   *
+   * @return numerical property value.
+   *
+   * @throws RocksDBException if an error happens in the underlying native code.
+   */
+  public long getLongProperty(final String property) throws RocksDBException {
+    return getLongProperty(nativeHandle_, property, property.length());
+  }
+
+  /**
+   * <p> Similar to GetProperty(), but only works for a subset of properties whose
+   * return value is a numerical value. Return the value as long.</p>
+   *
+   * <p><strong>Note</strong>: As the returned property is of type
+   * {@code uint64_t} on the C++ side, the returned value can be negative
+   * because Java 7 supports only signed long values.</p>
+   *
+   * <p><strong>Java 7</strong>: To mitigate the problem of the
+   * non-existent unsigned long type, values should be encapsulated using
+   * {@link java.math.BigInteger} to reflect the correct value. The correct
+   * behavior is guaranteed if {@code 2^64} is added to negative values.</p>
+   *
+   * <p><strong>Java 8</strong>: In Java 8 the value should be treated as
+   * unsigned long using provided methods of type {@link Long}.</p>
+   *
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+   *     instance
+   * @param property to be fetched.
+   *
+   * @return numerical property value
+   *
+   * @throws RocksDBException if an error happens in the underlying native code.
+   */
+  public long getLongProperty(final ColumnFamilyHandle columnFamilyHandle,
+      final String property) throws RocksDBException {
+    return getLongProperty(nativeHandle_, columnFamilyHandle.nativeHandle_, property,
+        property.length());
+  }
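+
+  // A minimal usage sketch (illustrative only, not upstream code): the
+  // Java 7 correction described above, assuming an open "db" and that
+  // "rocksdb.estimate-num-keys" is a numerical property in this build.
+  //
+  //   long raw = db.getLongProperty("rocksdb.estimate-num-keys");
+  //   java.math.BigInteger unsigned = java.math.BigInteger.valueOf(raw);
+  //   if (raw < 0) {
+  //     unsigned = unsigned.add(java.math.BigInteger.ONE.shiftLeft(64));
+  //   }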
+
+  /**
+   * <p>Return a heap-allocated iterator over the contents of the
+   * database. The result of newIterator() is initially invalid
+   * (caller must call one of the Seek methods on the iterator
+   * before using it).</p>
+   *
+   * <p>Caller should close the iterator when it is no longer needed.
+   * The returned iterator should be closed before this db is closed.
+   * </p>
+   *
+   * @return instance of iterator object.
+   */
+  public RocksIterator newIterator() {
+    return new RocksIterator(this, iterator(nativeHandle_));
+  }
+
+  /**
+   * <p>Return a heap-allocated iterator over the contents of the
+   * database. The result of newIterator() is initially invalid
+   * (caller must call one of the Seek methods on the iterator
+   * before using it).</p>
+   *
+   * <p>Caller should close the iterator when it is no longer needed.
+   * The returned iterator should be closed before this db is closed.
+   * </p>
+   *
+   * @param readOptions {@link ReadOptions} instance.
+   * @return instance of iterator object.
+   */
+  public RocksIterator newIterator(final ReadOptions readOptions) {
+    return new RocksIterator(this, iterator(nativeHandle_,
+        readOptions.nativeHandle_));
+  }
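+
+  // A minimal usage sketch (illustrative only, not upstream code): a full
+  // forward scan, assuming an open instance "db".
+  //
+  //   RocksIterator it = db.newIterator();
+  //   for (it.seekToFirst(); it.isValid(); it.next()) {
+  //     byte[] key = it.key();
+  //     byte[] value = it.value();
+  //   }
+  //   it.dispose();  // close the iterator before closing the db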
+
+  /**
+   * <p>Return a handle to the current DB state. Iterators created with
+   * this handle will all observe a stable snapshot of the current DB
+   * state. The caller must call ReleaseSnapshot(result) when the
+   * snapshot is no longer needed.</p>
+   *
+   * <p>null will be returned if the DB fails to take a snapshot or does
+   * not support snapshots.</p>
+   *
+   * @return Snapshot {@link Snapshot} instance
+   */
+  public Snapshot getSnapshot() {
+    long snapshotHandle = getSnapshot(nativeHandle_);
+    if (snapshotHandle != 0) {
+      return new Snapshot(snapshotHandle);
+    }
+    return null;
+  }
+
+  /**
+   * Release a previously acquired snapshot.  The caller must not
+   * use "snapshot" after this call.
+   *
+   * @param snapshot {@link Snapshot} instance
+   */
+  public void releaseSnapshot(final Snapshot snapshot) {
+    if (snapshot != null) {
+      releaseSnapshot(nativeHandle_, snapshot.nativeHandle_);
+    }
+  }
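+
+  // A minimal usage sketch (illustrative only, not upstream code): reading
+  // a stable view of the database via ReadOptions, assuming an open "db".
+  //
+  //   Snapshot snapshot = db.getSnapshot();
+  //   ReadOptions ro = new ReadOptions().setSnapshot(snapshot);
+  //   byte[] old = db.get(ro, "key".getBytes());  // pre-snapshot state
+  //   db.releaseSnapshot(snapshot);
+  //   ro.dispose();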
+
+  /**
+   * <p>Return a heap-allocated iterator over the contents of the
+   * database. The result of newIterator() is initially invalid
+   * (caller must call one of the Seek methods on the iterator
+   * before using it).</p>
+   *
+   * <p>Caller should close the iterator when it is no longer needed.
+   * The returned iterator should be closed before this db is closed.
+   * </p>
+   *
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+   *     instance
+   * @return instance of iterator object.
+   */
+  public RocksIterator newIterator(final ColumnFamilyHandle columnFamilyHandle) {
+    return new RocksIterator(this, iteratorCF(nativeHandle_,
+        columnFamilyHandle.nativeHandle_));
+  }
+
+  /**
+   * <p>Return a heap-allocated iterator over the contents of the
+   * database. The result of newIterator() is initially invalid
+   * (caller must call one of the Seek methods on the iterator
+   * before using it).</p>
+   *
+   * <p>Caller should close the iterator when it is no longer needed.
+   * The returned iterator should be closed before this db is closed.
+   * </p>
+   *
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+   *     instance
+   * @param readOptions {@link ReadOptions} instance.
+   * @return instance of iterator object.
+   */
+  public RocksIterator newIterator(final ColumnFamilyHandle columnFamilyHandle,
+      final ReadOptions readOptions) {
+    return new RocksIterator(this, iteratorCF(nativeHandle_,
+        columnFamilyHandle.nativeHandle_, readOptions.nativeHandle_));
+  }
+
+  /**
+   * Returns iterators from a consistent database state across multiple
+   * column families. Iterators are heap allocated and need to be deleted
+   * before the db is deleted.
+   *
+   * @param columnFamilyHandleList {@link java.util.List} containing
+   *     {@link org.rocksdb.ColumnFamilyHandle} instances.
+   * @return {@link java.util.List} containing {@link org.rocksdb.RocksIterator}
+   *     instances
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public List<RocksIterator> newIterators(
+      final List<ColumnFamilyHandle> columnFamilyHandleList) throws RocksDBException {
+    return newIterators(columnFamilyHandleList, new ReadOptions());
+  }
+
+  /**
+   * Returns iterators from a consistent database state across multiple
+   * column families. Iterators are heap allocated and need to be deleted
+   * before the db is deleted.
+   *
+   * @param columnFamilyHandleList {@link java.util.List} containing
+   *     {@link org.rocksdb.ColumnFamilyHandle} instances.
+   * @param readOptions {@link ReadOptions} instance.
+   * @return {@link java.util.List} containing {@link org.rocksdb.RocksIterator}
+   *     instances
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public List<RocksIterator> newIterators(
+      final List<ColumnFamilyHandle> columnFamilyHandleList,
+      final ReadOptions readOptions) throws RocksDBException {
+    List<RocksIterator> iterators =
+        new ArrayList<>(columnFamilyHandleList.size());
+
+    long[] iteratorRefs = iterators(nativeHandle_, columnFamilyHandleList,
+        readOptions.nativeHandle_);
+    for (int i = 0; i < columnFamilyHandleList.size(); i++) {
+      iterators.add(new RocksIterator(this, iteratorRefs[i]));
+    }
+    return iterators;
+  }
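+
+  // A minimal usage sketch (illustrative only, not upstream code):
+  // consistent iteration across column families, assuming an open "db"
+  // and handles "cf1" and "cf2" obtained elsewhere.
+  //
+  //   List<RocksIterator> its = db.newIterators(Arrays.asList(cf1, cf2));
+  //   // its.get(0) scans cf1 and its.get(1) scans cf2, both observing the
+  //   // same database state; dispose each iterator before closing the db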
+
+  /**
+   * Gets the handle for the default column family
+   *
+   * @return The handle of the default column family
+   */
+  public ColumnFamilyHandle getDefaultColumnFamily() {
+    ColumnFamilyHandle cfHandle = new ColumnFamilyHandle(this,
+        getDefaultColumnFamily(nativeHandle_));
+    cfHandle.disOwnNativeHandle();
+    return cfHandle;
+  }
+
+  /**
+   * Creates a new column family with the name given in the descriptor and
+   * allocates a ColumnFamilyHandle within an internal structure.
+   * The ColumnFamilyHandle is automatically disposed with DB disposal.
+   *
+   * @param columnFamilyDescriptor column family to be created.
+   * @return {@link org.rocksdb.ColumnFamilyHandle} instance.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public ColumnFamilyHandle createColumnFamily(
+      final ColumnFamilyDescriptor columnFamilyDescriptor)
+      throws RocksDBException {
+    return new ColumnFamilyHandle(this, createColumnFamily(nativeHandle_,
+        columnFamilyDescriptor));
+  }
+
+  /**
+   * Drops the column family identified by the given handle. Internal
+   * handles to this column family will be disposed. If the column family
+   * is not known, removal will fail.
+   *
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+   *     instance
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public void dropColumnFamily(final ColumnFamilyHandle columnFamilyHandle)
+      throws RocksDBException, IllegalArgumentException {
+    // throws RocksDBException if something goes wrong
+    dropColumnFamily(nativeHandle_, columnFamilyHandle.nativeHandle_);
+    // After the drop the native handle is not valid anymore
+    columnFamilyHandle.nativeHandle_ = 0;
+  }
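+
+  // A minimal usage sketch (illustrative only, not upstream code): creating
+  // and dropping a column family at runtime, assuming an open "db" and a
+  // ColumnFamilyDescriptor constructor taking a byte[] name.
+  //
+  //   ColumnFamilyHandle cf = db.createColumnFamily(
+  //       new ColumnFamilyDescriptor("new_cf".getBytes()));
+  //   db.put(cf, "key".getBytes(), "value".getBytes());
+  //   db.dropColumnFamily(cf);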
+
+  /**
+   * <p>Flush all memory table data.</p>
+   *
+   * <p>Note: it must be ensured that the FlushOptions instance
+   * is not GC'ed before this method finishes. If the wait parameter is
+   * set to false, flush processing is asynchronous.</p>
+   *
+   * @param flushOptions {@link org.rocksdb.FlushOptions} instance.
+   * @throws RocksDBException thrown if an error occurs within the native
+   *     part of the library.
+   */
+  public void flush(final FlushOptions flushOptions)
+      throws RocksDBException {
+    flush(nativeHandle_, flushOptions.nativeHandle_);
+  }
+
+  /**
+   * <p>Flush all memory table data.</p>
+   *
+   * <p>Note: it must be ensured that the FlushOptions instance
+   * is not GC'ed before this method finishes. If the wait parameter is
+   * set to false, flush processing is asynchronous.</p>
+   *
+   * @param flushOptions {@link org.rocksdb.FlushOptions} instance.
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} instance.
+   * @throws RocksDBException thrown if an error occurs within the native
+   *     part of the library.
+   */
+  public void flush(final FlushOptions flushOptions,
+      final ColumnFamilyHandle columnFamilyHandle) throws RocksDBException {
+    flush(nativeHandle_, flushOptions.nativeHandle_,
+        columnFamilyHandle.nativeHandle_);
+  }
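+
+  // A minimal usage sketch (illustrative only, not upstream code): a
+  // synchronous flush of the memtable, assuming an open instance "db".
+  //
+  //   FlushOptions fo = new FlushOptions().setWaitForFlush(true);
+  //   db.flush(fo);  // returns once the memtable has been persisted
+  //   fo.dispose();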
+
+  /**
+   * <p>Range compaction of database.</p>
+   * <p><strong>Note</strong>: After the database has been compacted,
+   * all data will have been pushed down to the last level containing
+   * any data.</p>
+   *
+   * <p><strong>See also</strong></p>
+   * <ul>
+   * <li>{@link #compactRange(boolean, int, int)}</li>
+   * <li>{@link #compactRange(byte[], byte[])}</li>
+   * <li>{@link #compactRange(byte[], byte[], boolean, int, int)}</li>
+   * </ul>
+   *
+   * @throws RocksDBException thrown if an error occurs within the native
+   *     part of the library.
+   */
+  public void compactRange() throws RocksDBException {
+    compactRange0(nativeHandle_, false, -1, 0);
+  }
+
+  /**
+   * <p>Range compaction of database.</p>
+   * <p><strong>Note</strong>: After the database has been compacted,
+   * all data will have been pushed down to the last level containing
+   * any data.</p>
+   *
+   * <p><strong>See also</strong></p>
+   * <ul>
+   * <li>{@link #compactRange()}</li>
+   * <li>{@link #compactRange(boolean, int, int)}</li>
+   * <li>{@link #compactRange(byte[], byte[], boolean, int, int)}</li>
+   * </ul>
+   *
+   * @param begin start of key range (included in range)
+   * @param end end of key range (excluded from range)
+   *
+   * @throws RocksDBException thrown if an error occurs within the native
+   *     part of the library.
+   */
+  public void compactRange(final byte[] begin, final byte[] end)
+      throws RocksDBException {
+    compactRange0(nativeHandle_, begin, begin.length, end,
+        end.length, false, -1, 0);
+  }
+
+  /**
+   * <p>Range compaction of database.</p>
+   * <p><strong>Note</strong>: After the database has been compacted,
+   * all data will have been pushed down to the last level containing
+   * any data.</p>
+   *
+   * <p>Compaction outputs should be placed in
+   * {@code options.db_paths[target_path_id]}. Behavior is undefined if
+   * target_path_id is out of range.</p>
+   *
+   * <p><strong>See also</strong></p>
+   * <ul>
+   * <li>{@link #compactRange()}</li>
+   * <li>{@link #compactRange(byte[], byte[])}</li>
+   * <li>{@link #compactRange(byte[], byte[], boolean, int, int)}</li>
+   * </ul>
+   *
+   * @param reduce_level reduce level after compaction
+   * @param target_level target level to compact to
+   * @param target_path_id the target path id of output path
+   *
+   * @throws RocksDBException thrown if an error occurs within the native
+   *     part of the library.
+   */
+  public void compactRange(final boolean reduce_level,
+      final int target_level, final int target_path_id)
+      throws RocksDBException {
+    compactRange0(nativeHandle_, reduce_level,
+        target_level, target_path_id);
+  }
+
+
+  /**
+   * <p>Range compaction of database.</p>
+   * <p><strong>Note</strong>: After the database has been compacted,
+   * all data will have been pushed down to the last level containing
+   * any data.</p>
+   *
+   * <p>Compaction outputs should be placed in
+   * {@code options.db_paths[target_path_id]}. Behavior is undefined if
+   * target_path_id is out of range.</p>
+   *
+   * <p><strong>See also</strong></p>
+   * <ul>
+   * <li>{@link #compactRange()}</li>
+   * <li>{@link #compactRange(boolean, int, int)}</li>
+   * <li>{@link #compactRange(byte[], byte[])}</li>
+   * </ul>
+   *
+   * @param begin start of key range (included in range)
+   * @param end end of key range (excluded from range)
+   * @param reduce_level reduce level after compaction
+   * @param target_level target level to compact to
+   * @param target_path_id the target path id of output path
+   *
+   * @throws RocksDBException thrown if an error occurs within the native
+   *     part of the library.
+   */
+  public void compactRange(final byte[] begin, final byte[] end,
+      final boolean reduce_level, final int target_level,
+      final int target_path_id) throws RocksDBException {
+    compactRange0(nativeHandle_, begin, begin.length, end, end.length,
+        reduce_level, target_level, target_path_id);
+  }
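+
+  // A minimal usage sketch (illustrative only, not upstream code): full and
+  // ranged manual compaction, assuming an open instance "db".
+  //
+  //   db.compactRange();  // compact everything down to the last level
+  //   db.compactRange("a".getBytes(), "m".getBytes());  // keys in ["a","m")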
+
+  /**
+   * <p>Range compaction of column family.</p>
+   * <p><strong>Note</strong>: After the database has been compacted,
+   * all data will have been pushed down to the last level containing
+   * any data.</p>
+   *
+   * <p><strong>See also</strong></p>
+   * <ul>
+   * <li>
+   *   {@link #compactRange(ColumnFamilyHandle, boolean, int, int)}
+   * </li>
+   * <li>
+   *   {@link #compactRange(ColumnFamilyHandle, byte[], byte[])}
+   * </li>
+   * <li>
+   *   {@link #compactRange(ColumnFamilyHandle, byte[], byte[],
+   *   boolean, int, int)}
+   * </li>
+   * </ul>
+   *
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+   *     instance.
+   *
+   * @throws RocksDBException thrown if an error occurs within the native
+   *     part of the library.
+   */
+  public void compactRange(final ColumnFamilyHandle columnFamilyHandle)
+      throws RocksDBException {
+    compactRange(nativeHandle_, false, -1, 0,
+        columnFamilyHandle.nativeHandle_);
+  }
+
+  /**
+   * <p>Range compaction of column family.</p>
+   * <p><strong>Note</strong>: After the database has been compacted,
+   * all data will have been pushed down to the last level containing
+   * any data.</p>
+   *
+   * <p><strong>See also</strong></p>
+   * <ul>
+   * <li>{@link #compactRange(ColumnFamilyHandle)}</li>
+   * <li>
+   *   {@link #compactRange(ColumnFamilyHandle, boolean, int, int)}
+   * </li>
+   * <li>
+   *   {@link #compactRange(ColumnFamilyHandle, byte[], byte[],
+   *   boolean, int, int)}
+   * </li>
+   * </ul>
+   *
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+   *     instance.
+   * @param begin start of key range (included in range)
+   * @param end end of key range (excluded from range)
+   *
+   * @throws RocksDBException thrown if an error occurs within the native
+   *     part of the library.
+   */
+  public void compactRange(final ColumnFamilyHandle columnFamilyHandle,
+      final byte[] begin, final byte[] end) throws RocksDBException {
+    compactRange(nativeHandle_, begin, begin.length, end, end.length,
+        false, -1, 0, columnFamilyHandle.nativeHandle_);
+  }
+
+  /**
+   * <p>Range compaction of column family.</p>
+   * <p><strong>Note</strong>: After the database has been compacted,
+   * all data will have been pushed down to the last level containing
+   * any data.</p>
+   *
+   * <p>Compaction outputs should be placed in
+   * {@code options.db_paths[target_path_id]}. Behavior is undefined if
+   * target_path_id is out of range.</p>
+   *
+   * <p><strong>See also</strong></p>
+   * <ul>
+   * <li>{@link #compactRange(ColumnFamilyHandle)}</li>
+   * <li>
+   *   {@link #compactRange(ColumnFamilyHandle, byte[], byte[])}
+   * </li>
+   * <li>
+   *   {@link #compactRange(ColumnFamilyHandle, byte[], byte[],
+   *   boolean, int, int)}
+   * </li>
+   * </ul>
+   *
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+   *     instance.
+   * @param reduce_level reduce level after compaction
+   * @param target_level target level to compact to
+   * @param target_path_id the target path id of output path
+   *
+   * @throws RocksDBException thrown if an error occurs within the native
+   *     part of the library.
+   */
+  public void compactRange(final ColumnFamilyHandle columnFamilyHandle,
+      final boolean reduce_level, final int target_level,
+      final int target_path_id) throws RocksDBException {
+    compactRange(nativeHandle_, reduce_level, target_level,
+        target_path_id, columnFamilyHandle.nativeHandle_);
+  }
+
+  /**
+   * <p>Range compaction of column family.</p>
+   * <p><strong>Note</strong>: After the database has been compacted,
+   * all data will have been pushed down to the last level containing
+   * any data.</p>
+   *
+   * <p>Compaction outputs should be placed in
+   * {@code options.db_paths[target_path_id]}. Behavior is undefined if
+   * target_path_id is out of range.</p>
+   *
+   * <p><strong>See also</strong></p>
+   * <ul>
+   * <li>{@link #compactRange(ColumnFamilyHandle)}</li>
+   * <li>
+   *   {@link #compactRange(ColumnFamilyHandle, boolean, int, int)}
+   * </li>
+   * <li>
+   *   {@link #compactRange(ColumnFamilyHandle, byte[], byte[])}
+   * </li>
+   * </ul>
+   *
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+   *     instance.
+   * @param begin start of key range (included in range)
+   * @param end end of key range (excluded from range)
+   * @param reduce_level reduce level after compaction
+   * @param target_level target level to compact to
+   * @param target_path_id the target path id of output path
+   *
+   * @throws RocksDBException thrown if an error occurs within the native
+   *     part of the library.
+   */
+  public void compactRange(final ColumnFamilyHandle columnFamilyHandle,
+      final byte[] begin, final byte[] end, final boolean reduce_level,
+      final int target_level, final int target_path_id)
+      throws RocksDBException {
+    compactRange(nativeHandle_, begin, begin.length, end, end.length,
+        reduce_level, target_level, target_path_id,
+        columnFamilyHandle.nativeHandle_);
+  }
+
+  /**
+   * <p>The sequence number of the most recent transaction.</p>
+   *
+   * @return sequence number of the most
+   *     recent transaction.
+   */
+  public long getLatestSequenceNumber() {
+    return getLatestSequenceNumber(nativeHandle_);
+  }
+
+  /**
+   * <p>Prevent file deletions. Compactions will continue to occur,
+   * but no obsolete files will be deleted. Calling this multiple
+   * times has the same effect as calling it once.</p>
+   *
+   * @throws RocksDBException thrown if operation was not performed
+   *     successfully.
+   */
+  public void disableFileDeletions() throws RocksDBException {
+    disableFileDeletions(nativeHandle_);
+  }
+
+  /**
+   * <p>Allow compactions to delete obsolete files.
+   * If force == true, the call to EnableFileDeletions()
+   * will guarantee that file deletions are enabled after
+   * the call, even if DisableFileDeletions() was called
+   * multiple times before.</p>
+   *
+   * <p>If force == false, EnableFileDeletions will only
+   * enable file deletion after it's been called at least
+   * as many times as DisableFileDeletions(), enabling
+   * the two methods to be called by two threads
+   * concurrently without synchronization
+   * -- i.e., file deletions will be enabled only after both
+   * threads call EnableFileDeletions()</p>
+   *
+   * @param force boolean value described above.
+   *
+   * @throws RocksDBException thrown if operation was not performed
+   *     successfully.
+   */
+  public void enableFileDeletions(final boolean force)
+      throws RocksDBException {
+    enableFileDeletions(nativeHandle_, force);
+  }
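+
+  // A minimal usage sketch (illustrative only, not upstream code): pairing
+  // the two calls around a raw file copy of the DB directory, assuming an
+  // open instance "db".
+  //
+  //   db.disableFileDeletions();
+  //   try {
+  //     // copy the live SST/WAL files elsewhere
+  //   } finally {
+  //     db.enableFileDeletions(false);  // balances the disable call
+  //   }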
+
+  /**
+   * <p>Returns an iterator that is positioned at a write-batch containing
+   * seq_number. If the sequence number is non-existent, it returns an iterator
+   * at the first available seq_no after the requested seq_no.</p>
+   *
+   * <p>WAL_ttl_seconds or WAL_size_limit_MB must be set to large values to
+   * use this API, else the WAL files will get
+   * cleared aggressively and the iterator might keep getting invalidated
+   * before an update is read.</p>
+   *
+   * @param sequenceNumber sequence number offset
+   *
+   * @return {@link org.rocksdb.TransactionLogIterator} instance.
+   *
+   * @throws org.rocksdb.RocksDBException if iterator cannot be retrieved
+   *     from native-side.
+   */
+  public TransactionLogIterator getUpdatesSince(final long sequenceNumber)
+      throws RocksDBException {
+    return new TransactionLogIterator(
+        getUpdatesSince(nativeHandle_, sequenceNumber));
+  }
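+
+  // A minimal usage sketch (illustrative only, not upstream code): tailing
+  // the WAL from a sequence number, assuming an open "db" with WAL
+  // retention configured as noted above.
+  //
+  //   TransactionLogIterator logIt = db.getUpdatesSince(0);
+  //   while (logIt.isValid()) {
+  //     TransactionLogIterator.BatchResult result = logIt.getBatch();
+  //     // result.sequenceNumber() and result.writeBatch() hold the update
+  //     logIt.next();
+  //   }
+  //   logIt.dispose();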
+
+  /**
+   * Protected constructor.
+   */
+  protected RocksDB() {
+    super();
+  }
+
+  // native methods
+  protected native void open(
+      long optionsHandle, String path) throws RocksDBException;
+  protected native List<Long> open(long optionsHandle, String path,
+      List<ColumnFamilyDescriptor> columnFamilyDescriptors,
+      int columnFamilyDescriptorsLength)
+      throws RocksDBException;
+  protected native static List<byte[]> listColumnFamilies(
+      long optionsHandle, String path) throws RocksDBException;
+  protected native void openROnly(
+      long optionsHandle, String path) throws RocksDBException;
+  protected native List<Long> openROnly(
+      long optionsHandle, String path,
+      List<ColumnFamilyDescriptor> columnFamilyDescriptors,
+      int columnFamilyDescriptorsLength) throws RocksDBException;
+  protected native void put(
+      long handle, byte[] key, int keyLen,
+      byte[] value, int valueLen) throws RocksDBException;
+  protected native void put(
+      long handle, byte[] key, int keyLen,
+      byte[] value, int valueLen, long cfHandle) throws RocksDBException;
+  protected native void put(
+      long handle, long writeOptHandle,
+      byte[] key, int keyLen,
+      byte[] value, int valueLen) throws RocksDBException;
+  protected native void put(
+      long handle, long writeOptHandle,
+      byte[] key, int keyLen,
+      byte[] value, int valueLen, long cfHandle) throws RocksDBException;
+  protected native void write0(
+      long writeOptHandle, long wbHandle) throws RocksDBException;
+  protected native void write1(
+      long writeOptHandle, long wbwiHandle) throws RocksDBException;
+  protected native boolean keyMayExist(byte[] key, int keyLen,
+      StringBuffer stringBuffer);
+  protected native boolean keyMayExist(byte[] key, int keyLen,
+      long cfHandle, StringBuffer stringBuffer);
+  protected native boolean keyMayExist(long optionsHandle, byte[] key, int keyLen,
+      StringBuffer stringBuffer);
+  protected native boolean keyMayExist(long optionsHandle, byte[] key, int keyLen,
+      long cfHandle, StringBuffer stringBuffer);
+  protected native void merge(
+      long handle, byte[] key, int keyLen,
+      byte[] value, int valueLen) throws RocksDBException;
+  protected native void merge(
+      long handle, byte[] key, int keyLen,
+      byte[] value, int valueLen, long cfHandle) throws RocksDBException;
+  protected native void merge(
+      long handle, long writeOptHandle,
+      byte[] key, int keyLen,
+      byte[] value, int valueLen) throws RocksDBException;
+  protected native void merge(
+      long handle, long writeOptHandle,
+      byte[] key, int keyLen,
+      byte[] value, int valueLen, long cfHandle) throws RocksDBException;
+  protected native int get(
+      long handle, byte[] key, int keyLen,
+      byte[] value, int valueLen) throws RocksDBException;
+  protected native int get(
+      long handle, byte[] key, int keyLen,
+      byte[] value, int valueLen, long cfHandle) throws RocksDBException;
+  protected native int get(
+      long handle, long readOptHandle, byte[] key, int keyLen,
+      byte[] value, int valueLen) throws RocksDBException;
+  protected native int get(
+      long handle, long readOptHandle, byte[] key, int keyLen,
+      byte[] value, int valueLen, long cfHandle) throws RocksDBException;
+  protected native List<byte[]> multiGet(
+      long dbHandle, List<byte[]> keys, int keysCount);
+  protected native List<byte[]> multiGet(
+      long dbHandle, List<byte[]> keys, int keysCount, List<ColumnFamilyHandle>
+      cfHandles);
+  protected native List<byte[]> multiGet(
+      long dbHandle, long rOptHandle, List<byte[]> keys, int keysCount);
+  protected native List<byte[]> multiGet(
+      long dbHandle, long rOptHandle, List<byte[]> keys, int keysCount,
+      List<ColumnFamilyHandle> cfHandles);
+  protected native byte[] get(
+      long handle, byte[] key, int keyLen) throws RocksDBException;
+  protected native byte[] get(
+      long handle, byte[] key, int keyLen, long cfHandle) throws RocksDBException;
+  protected native byte[] get(
+      long handle, long readOptHandle,
+      byte[] key, int keyLen) throws RocksDBException;
+  protected native byte[] get(
+      long handle, long readOptHandle,
+      byte[] key, int keyLen, long cfHandle) throws RocksDBException;
+  protected native void remove(
+      long handle, byte[] key, int keyLen) throws RocksDBException;
+  protected native void remove(
+      long handle, byte[] key, int keyLen, long cfHandle) throws RocksDBException;
+  protected native void remove(
+      long handle, long writeOptHandle,
+      byte[] key, int keyLen) throws RocksDBException;
+  protected native void remove(
+      long handle, long writeOptHandle,
+      byte[] key, int keyLen, long cfHandle) throws RocksDBException;
+  protected native String getProperty0(long nativeHandle,
+      String property, int propertyLength) throws RocksDBException;
+  protected native String getProperty0(long nativeHandle, long cfHandle,
+      String property, int propertyLength) throws RocksDBException;
+  protected native long getLongProperty(long nativeHandle,
+      String property, int propertyLength) throws RocksDBException;
+  protected native long getLongProperty(long nativeHandle, long cfHandle,
+      String property, int propertyLength) throws RocksDBException;
+  protected native long iterator(long handle);
+  protected native long iterator(long handle, long readOptHandle);
+  protected native long iteratorCF(long handle, long cfHandle);
+  protected native long iteratorCF(long handle, long cfHandle,
+      long readOptHandle);
+  protected native long[] iterators(long handle,
+      List<ColumnFamilyHandle> columnFamilyNames, long readOptHandle)
+      throws RocksDBException;
+  protected native long getSnapshot(long nativeHandle);
+  protected native void releaseSnapshot(
+      long nativeHandle, long snapshotHandle);
+  private native void disposeInternal(long handle);
+  private native long getDefaultColumnFamily(long handle);
+  private native long createColumnFamily(long handle,
+      ColumnFamilyDescriptor columnFamilyDescriptor) throws RocksDBException;
+  private native void dropColumnFamily(long handle, long cfHandle) throws RocksDBException;
+  private native void flush(long handle, long flushOptHandle)
+      throws RocksDBException;
+  private native void flush(long handle, long flushOptHandle,
+      long cfHandle) throws RocksDBException;
+  private native void compactRange0(long handle, boolean reduce_level, int target_level,
+      int target_path_id) throws RocksDBException;
+  private native void compactRange0(long handle, byte[] begin, int beginLen, byte[] end,
+      int endLen, boolean reduce_level, int target_level, int target_path_id)
+      throws RocksDBException;
+  private native void compactRange(long handle, boolean reduce_level, int target_level,
+      int target_path_id, long cfHandle) throws RocksDBException;
+  private native void compactRange(long handle, byte[] begin, int beginLen, byte[] end,
+      int endLen, boolean reduce_level, int target_level, int target_path_id,
+      long cfHandle) throws RocksDBException;
+  private native long getLatestSequenceNumber(long handle);
+  private native void disableFileDeletions(long handle)
+      throws RocksDBException;
+  private native void enableFileDeletions(long handle,
+      boolean force) throws RocksDBException;
+  private native long getUpdatesSince(long handle, long sequenceNumber)
+      throws RocksDBException;
+
+  protected DBOptionsInterface options_;
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/RocksDBException.java b/src/rocksdb/java/src/main/java/org/rocksdb/RocksDBException.java
new file mode 100644
index 0000000..a65d401
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/RocksDBException.java
@@ -0,0 +1,21 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+/**
+ * A RocksDBException encapsulates the error of an operation.  This exception
+ * type is used to describe an internal error from the c++ rocksdb library.
+ */
+public class RocksDBException extends Exception {
+  /**
+   * Constructs a RocksDBException with the specified error message.
+   *
+   * @param msg the specified error message.
+   */
+  public RocksDBException(final String msg) {
+    super(msg);
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/RocksEnv.java b/src/rocksdb/java/src/main/java/org/rocksdb/RocksEnv.java
new file mode 100644
index 0000000..4c399ea
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/RocksEnv.java
@@ -0,0 +1,43 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+/**
+ * <p>A RocksEnv is an interface used by the rocksdb implementation to access
+ * operating system functionality like the filesystem etc.</p>
+ *
+ * <p>All Env implementations are safe for concurrent access from
+ * multiple threads without any external synchronization.</p>
+ */
+public class RocksEnv extends Env {
+
+  /**
+   * <p>Package-private constructor that uses the specified native handle
+   * to construct a RocksEnv.</p>
+   *
+   * <p>Note that the ownership of the input handle
+   * belongs to the caller, and the newly created RocksEnv will not take
+   * the ownership of the input handle.  As a result, calling
+   * {@code dispose()} on the created RocksEnv will be a no-op.</p>
+   */
+  RocksEnv(final long handle) {
+    super();
+    nativeHandle_ = handle;
+    disOwnNativeHandle();
+  }
+
+  /**
+   * <p>The helper function of {@link #dispose()} which all subclasses of
+   * {@link RocksObject} must implement to release their associated C++
+   * resource.</p>
+   *
+   * <p><strong>Note:</strong> this class is used to access the default
+   * RocksEnv with RocksJava. The default env allocation is managed
+   * by C++.</p>
+   */
+  @Override protected void disposeInternal() {
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/RocksIterator.java b/src/rocksdb/java/src/main/java/org/rocksdb/RocksIterator.java
new file mode 100644
index 0000000..bb9a6e6
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/RocksIterator.java
@@ -0,0 +1,64 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+/**
+ * <p>An iterator that yields a sequence of key/value pairs from a source.
+ * Multiple implementations are provided by this library.
+ * In particular, iterators are provided
+ * to access the contents of a Table or a DB.</p>
+ *
+ * <p>Multiple threads can invoke const methods on a RocksIterator without
+ * external synchronization, but if any of the threads may call a
+ * non-const method, all threads accessing the same RocksIterator must use
+ * external synchronization.</p>
+ *
+ * @see org.rocksdb.RocksObject
+ */
+public class RocksIterator extends AbstractRocksIterator<RocksDB> {
+  protected RocksIterator(RocksDB rocksDB, long nativeHandle) {
+    super(rocksDB, nativeHandle);
+  }
+
+  /**
+   * <p>Return the key for the current entry.  The underlying storage for
+   * the returned slice is valid only until the next modification of
+   * the iterator.</p>
+   *
+   * <p>REQUIRES: {@link #isValid()}</p>
+   *
+   * @return key for the current entry.
+   */
+  public byte[] key() {
+    assert(isInitialized());
+    return key0(nativeHandle_);
+  }
+
+  /**
+   * <p>Return the value for the current entry.  The underlying storage for
+   * the returned slice is valid only until the next modification of
+   * the iterator.</p>
+   *
+   * <p>REQUIRES: {@link #isValid()}</p>
+   *
+   * @return value for the current entry.
+   */
+  public byte[] value() {
+    assert(isInitialized());
+    return value0(nativeHandle_);
+  }
+
+  @Override final native void disposeInternal(long handle);
+  @Override final native boolean isValid0(long handle);
+  @Override final native void seekToFirst0(long handle);
+  @Override final native void seekToLast0(long handle);
+  @Override final native void next0(long handle);
+  @Override final native void prev0(long handle);
+  @Override final native void seek0(long handle, byte[] target, int targetLen);
+  @Override final native void status0(long handle) throws RocksDBException;
+
+  private native byte[] key0(long handle);
+  private native byte[] value0(long handle);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/RocksIteratorInterface.java b/src/rocksdb/java/src/main/java/org/rocksdb/RocksIteratorInterface.java
new file mode 100644
index 0000000..fce8fe3
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/RocksIteratorInterface.java
@@ -0,0 +1,80 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+/**
+ * <p>Defines the interface for an Iterator which provides
+ * access to data one entry at a time. Multiple implementations
+ * are provided by this library.  In particular, iterators are provided
+ * to access the contents of a DB and Write Batch.</p>
+ *
+ * <p>Multiple threads can invoke const methods on a RocksIterator without
+ * external synchronization, but if any of the threads may call a
+ * non-const method, all threads accessing the same RocksIterator must use
+ * external synchronization.</p>
+ *
+ * @see org.rocksdb.RocksObject
+ */
+public interface RocksIteratorInterface {
+
+  /**
+   * <p>An iterator is either positioned at an entry, or
+   * not valid.  This method returns true if the iterator is valid.</p>
+   *
+   * @return true if iterator is valid.
+   */
+  boolean isValid();
+
+  /**
+   * <p>Position at the first entry in the source.  The iterator is Valid()
+   * after this call if the source is not empty.</p>
+   */
+  void seekToFirst();
+
+  /**
+   * <p>Position at the last entry in the source.  The iterator is
+   * valid after this call if the source is not empty.</p>
+   */
+  void seekToLast();
+
+  /**
+   * <p>Position at the first entry in the source whose key is at or
+   * past target.</p>
+   *
+   * <p>The iterator is valid after this call if the source contains
+   * a key that comes at or past target.</p>
+   *
+   * @param target byte array describing a key or a
+   *               key prefix to seek for.
+   */
+  void seek(byte[] target);
+
+  /**
+   * <p>Moves to the next entry in the source.  After this call,
+   * {@link #isValid()} is true if the iterator was not positioned at the
+   * last entry in the source.</p>
+   *
+   * <p>REQUIRES: {@link #isValid()}</p>
+   */
+  void next();
+
+  /**
+   * <p>Moves to the previous entry in the source.  After this call,
+   * {@link #isValid()} is true if the iterator was not positioned at the
+   * first entry in the source.</p>
+   *
+   * <p>REQUIRES: {@link #isValid()}</p>
+   */
+  void prev();
+
+  /**
+   * <p>If an error has occurred, throw it as a {@link RocksDBException}.
+   * If non-blocking IO is requested and this operation cannot be
+   * satisfied without doing some IO, this surfaces as an incomplete
+   * status (Status::Incomplete()).</p>
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *                          native library.
+   */
+  void status() throws RocksDBException;
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/RocksMemEnv.java b/src/rocksdb/java/src/main/java/org/rocksdb/RocksMemEnv.java
new file mode 100644
index 0000000..54c9f99
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/RocksMemEnv.java
@@ -0,0 +1,33 @@
+// Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+/**
+ * RocksDB memory environment.
+ */
+public class RocksMemEnv extends Env {
+
+  /**
+   * <p>Creates a new RocksDB environment that stores its data
+   * in memory and delegates all non-file-storage tasks to the
+   * default base environment. The caller must dispose of the
+   * result when it is no longer needed.</p>
+   *
+   * <p>The base environment must remain live while the result is in use.</p>
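+   *
+   * <p>A minimal sketch (illustrative only; it assumes
+   * {@code Options#setEnv(Env)} is used to wire the environment in):</p>
+   * <pre>{@code
+   * RocksMemEnv env = new RocksMemEnv();
+   * Options options = new Options()
+   *     .setCreateIfMissing(true)
+   *     .setEnv(env);
+   * RocksDB db = RocksDB.open(options, "/in/memory/path");
+   * // ... use db, then close it before disposing of the env
+   * db.close();
+   * env.dispose();
+   * }</pre>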
+   */
+  public RocksMemEnv() {
+    super();
+    nativeHandle_ = createMemEnv();
+  }
+
+  @Override
+  protected void disposeInternal() {
+    disposeInternal(nativeHandle_);
+  }
+
+  private static native long createMemEnv();
+  private native void disposeInternal(long handle);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/RocksObject.java b/src/rocksdb/java/src/main/java/org/rocksdb/RocksObject.java
new file mode 100644
index 0000000..6e24a13
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/RocksObject.java
@@ -0,0 +1,125 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+/**
+ * RocksObject is the base class of all RocksDB classes that hold a pointer to
+ * some C++ {@code rocksdb} object.
+ *
+ * <p>
+ * RocksObject has a {@code dispose()} function, which releases its associated
+ * C++ resource.</p>
+ * <p>
+ * This function can either be called manually or invoked automatically
+ * during the regular Java GC process. However, since Java may wrongly assume
+ * that a RocksObject contains only a long member variable and is therefore
+ * small, it may give {@code RocksObject} low priority in the GC process. For
+ * this reason, it is suggested to call {@code dispose()} manually. That said,
+ * it is safe to let a {@code RocksObject} go out of scope without manually
+ * calling {@code dispose()}, as {@code dispose()} will be called from the
+ * finalizer during the regular GC process.</p>
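+ *
+ * <p>A minimal disposal sketch (illustrative only):</p>
+ * <pre>{@code
+ * Options options = new Options();
+ * try {
+ *   // ... use the options ...
+ * } finally {
+ *   options.dispose(); // releases the C++ resource deterministically
+ * }
+ * }</pre>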
+ */
+public abstract class RocksObject {
+  protected RocksObject() {
+    nativeHandle_ = 0;
+    owningHandle_ = true;
+  }
+
+  /**
+   * Manually release the C++ object pointed to by the native handle.
+   * <p>
+   * Note that {@code dispose()} will also be called during the GC process
+   * if it was not called before its {@code RocksObject} went out of scope.
+   * However, since Java may wrongly assume those objects are small in
+   * that they seem to hold only a long variable, they might have low
+   * priority in the GC process.  To prevent this, it is suggested to
+   * call {@code dispose()} manually.
+   * </p>
+   * <p>
+   * Note that once an instance of {@code RocksObject} has been disposed,
+   * calling its functions will lead to undefined behavior.
+   * </p>
+   */
+  public final synchronized void dispose() {
+    if (isOwningNativeHandle() && isInitialized()) {
+      disposeInternal();
+    }
+    nativeHandle_ = 0;
+    disOwnNativeHandle();
+  }
+
+  /**
+   * The helper function of {@code dispose()} which all subclasses of
+   * {@code RocksObject} must implement to release their associated
+   * C++ resource.
+   */
+  protected abstract void disposeInternal();
+
+  /**
+   * Revoke ownership of the native object.
+   * <p>
+   * This will prevent the object from attempting to delete the underlying
+   * native object in its finalizer. This must be used when another object
+   * takes over ownership of the native object or both will attempt to delete
+   * the underlying object when garbage collected.
+   * <p>
+   * When {@code disOwnNativeHandle()} is called, {@code dispose()} will simply set
+   * {@code nativeHandle_} to 0 without releasing its associated C++ resource.
+   * As a result, using this function incorrectly may cause a memory leak. The
+   * call also does not affect the return value of {@code isInitialized()}.
+   * </p>
+   * @see #dispose()
+   * @see #isInitialized()
+   */
+  protected void disOwnNativeHandle() {
+    owningHandle_ = false;
+  }
+
+  /**
+   * Returns true if the current {@code RocksObject} is responsible for
+   * releasing its native handle.
+   *
+   * @return true if the current {@code RocksObject} is responsible for
+   *     releasing its native handle.
+   *
+   * @see #disOwnNativeHandle()
+   * @see #dispose()
+   */
+  protected boolean isOwningNativeHandle() {
+    return owningHandle_;
+  }
+
+  /**
+   * Returns true if the associated native handle has been initialized.
+   *
+   * @return true if the associated native handle has been initialized.
+   *
+   * @see #dispose()
+   */
+  protected boolean isInitialized() {
+    return (nativeHandle_ != 0);
+  }
+
+  /**
+   * Simply calls {@code dispose()} to release its C++ resource if it has
+   * not yet been released.
+   */
+  @Override protected void finalize() throws Throwable {
+    dispose();
+    super.finalize();
+  }
+
+  /**
+   * A long variable holding the C++ pointer to the underlying RocksDB C++ object.
+   */
+  protected long nativeHandle_;
+
+  /**
+   * A flag indicating whether the current {@code RocksObject} is responsible
+   * for releasing the C++ object stored in its {@code nativeHandle_}.
+   */
+  private boolean owningHandle_;
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/SkipListMemTableConfig.java b/src/rocksdb/java/src/main/java/org/rocksdb/SkipListMemTableConfig.java
new file mode 100644
index 0000000..e31e199
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/SkipListMemTableConfig.java
@@ -0,0 +1,50 @@
+package org.rocksdb;
+
+/**
+ * The config for skip-list memtable representation.
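+ *
+ * <p>A configuration sketch (illustrative only; it assumes the usual
+ * {@code Options#setMemTableConfig} wiring):</p>
+ * <pre>{@code
+ * Options options = new Options().setMemTableConfig(
+ *     new SkipListMemTableConfig().setLookahead(2));
+ * }</pre>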
+ */
+public class SkipListMemTableConfig extends MemTableConfig {
+
+  public static final long DEFAULT_LOOKAHEAD = 0;
+
+  /**
+   * SkipListMemTableConfig constructor
+   */
+  public SkipListMemTableConfig() {
+    lookahead_ = DEFAULT_LOOKAHEAD;
+  }
+
+  /**
+   * Sets lookahead for SkipList
+   *
+   * @param lookahead If non-zero, each iterator's seek operation
+   *     will start the search from the previously visited record
+   *     (doing at most 'lookahead' steps). This is an
+   *     optimization for the access pattern including many
+   *     seeks with consecutive keys.
+   * @return the current instance of SkipListMemTableConfig
+   */
+  public SkipListMemTableConfig setLookahead(final long lookahead) {
+    lookahead_ = lookahead;
+    return this;
+  }
+
+  /**
+   * Returns the currently set lookahead value.
+   *
+   * @return lookahead value
+   */
+  public long lookahead() {
+    return lookahead_;
+  }
+
+  @Override protected long newMemTableFactoryHandle() {
+    return newMemTableFactoryHandle0(lookahead_);
+  }
+
+  private native long newMemTableFactoryHandle0(long lookahead)
+      throws IllegalArgumentException;
+
+  private long lookahead_;
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/Slice.java b/src/rocksdb/java/src/main/java/org/rocksdb/Slice.java
new file mode 100644
index 0000000..d26490e
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/Slice.java
@@ -0,0 +1,88 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+/**
+ * <p>Base class for slices which will receive
+ * byte[] based access to the underlying data.</p>
+ *
+ * <p>byte[] backed slices typically perform better with
+ * small keys and values. When using larger keys and
+ * values, consider using {@link org.rocksdb.DirectSlice}.</p>
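+ *
+ * <p>A minimal sketch (illustrative only):</p>
+ * <pre>{@code
+ * Slice slice = new Slice("key-prefix");
+ * // ... hand the slice to an API that consumes slices ...
+ * slice.dispose();
+ * }</pre>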
+ */
+public class Slice extends AbstractSlice<byte[]> {
+  /**
+   * <p>Called from JNI to construct a new Java Slice
+   * without an underlying C++ object set
+   * at creation time.</p>
+   *
+   * <p>Note: You should be aware that
+   * {@link org.rocksdb.RocksObject#disOwnNativeHandle()} is intentionally
+   * called from the default Slice constructor, and that it is marked as
+   * private. This is so that developers cannot construct their own default
+   * Slice objects (at present). As developers cannot construct their own
+   * Slice objects through this, they are not creating underlying C++ Slice
+   * objects, and so there is nothing to free (dispose) from Java.</p>
+   */
+  private Slice() {
+    super();
+    disOwnNativeHandle();
+  }
+
+  /**
+   * <p>Constructs a slice where the data is taken from
+   * a String.</p>
+   *
+   * @param str String value.
+   */
+  public Slice(final String str) {
+    super();
+    createNewSliceFromString(str);
+  }
+
+  /**
+   * <p>Constructs a slice where the data is a copy of
+   * the byte array from a specific offset.</p>
+   *
+   * @param data byte array.
+   * @param offset offset within the byte array.
+   */
+  public Slice(final byte[] data, final int offset) {
+    super();
+    createNewSlice0(data, offset);
+  }
+
+  /**
+   * <p>Constructs a slice where the data is a copy of
+   * the byte array.</p>
+   *
+   * @param data byte array.
+   */
+  public Slice(final byte[] data) {
+    super();
+    createNewSlice1(data);
+  }
+
+  /**
+   * <p>Deletes underlying C++ slice pointer
+   * and any buffered data.</p>
+   *
+   * <p>
+   * Note that this function should be called only after all
+   * RocksDB instances referencing the slice are closed.
+   * Otherwise an undefined behavior will occur.</p>
+   */
+  @Override
+  protected void disposeInternal() {
+    disposeInternalBuf(nativeHandle_);
+    super.disposeInternal();
+  }
+
+  @Override protected final native byte[] data0(long handle);
+  private native void createNewSlice0(byte[] data, int offset);
+  private native void createNewSlice1(byte[] data);
+  private native void disposeInternalBuf(long handle);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/Snapshot.java b/src/rocksdb/java/src/main/java/org/rocksdb/Snapshot.java
new file mode 100644
index 0000000..7ef5c38
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/Snapshot.java
@@ -0,0 +1,37 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+/**
+ * Snapshot of a database.
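+ *
+ * <p>A usage sketch (illustrative only; it assumes an open {@code RocksDB}
+ * instance named {@code db}):</p>
+ * <pre>{@code
+ * Snapshot snapshot = db.getSnapshot();
+ * ReadOptions readOptions = new ReadOptions().setSnapshot(snapshot);
+ * byte[] value = db.get(readOptions, "key".getBytes());
+ * db.releaseSnapshot(snapshot);
+ * }</pre>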
+ */
+public class Snapshot extends RocksObject {
+  Snapshot(final long nativeHandle) {
+    super();
+    nativeHandle_ = nativeHandle;
+  }
+
+  /**
+   * Return the associated sequence number.
+   *
+   * @return the associated sequence number of
+   *     this snapshot.
+   */
+  public long getSequenceNumber() {
+    assert(isInitialized());
+    return getSequenceNumber(nativeHandle_);
+  }
+
+  /**
+   * Don't release the C++ Snapshot pointer. The pointer
+   * to the snapshot is released by the database
+   * instance.
+   */
+  @Override protected void disposeInternal() {
+  }
+
+  private native long getSequenceNumber(long handle);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/Statistics.java b/src/rocksdb/java/src/main/java/org/rocksdb/Statistics.java
new file mode 100644
index 0000000..a099444
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/Statistics.java
@@ -0,0 +1,37 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+/**
+ * Statistics to analyze the performance of a db. The pointer to the
+ * statistics object is managed by the Options class.
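+ *
+ * <p>A usage sketch (illustrative only; it assumes statistics were enabled
+ * with {@code options.createStatistics()} before the DB was opened, and that
+ * {@code options.statisticsPtr()} exposes this object):</p>
+ * <pre>{@code
+ * Statistics stats = options.statisticsPtr();
+ * long cacheMisses = stats.getTickerCount(TickerType.BLOCK_CACHE_MISS);
+ * }</pre>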
+ */
+public class Statistics {
+
+  private final long statsHandle_;
+
+  public Statistics(final long statsHandle) {
+    statsHandle_ = statsHandle;
+  }
+
+  public long getTickerCount(TickerType tickerType) {
+    assert(isInitialized());
+    return getTickerCount0(tickerType.getValue(), statsHandle_);
+  }
+
+  public HistogramData geHistogramData(final HistogramType histogramType) {
+    assert(isInitialized());
+    return geHistogramData0(
+        histogramType.getValue(), statsHandle_);
+  }
+
+  private boolean isInitialized() {
+    return (statsHandle_ != 0);
+  }
+
+  private native long getTickerCount0(int tickerType, long handle);
+  private native HistogramData geHistogramData0(int histogramType, long handle);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/StatisticsCollector.java b/src/rocksdb/java/src/main/java/org/rocksdb/StatisticsCollector.java
new file mode 100644
index 0000000..4f1577c
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/StatisticsCollector.java
@@ -0,0 +1,107 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+import java.util.List;
+import java.util.concurrent.Executors;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.TimeUnit;
+
+/**
+ * <p>Helper class to collect DB statistics periodically at the interval
+ * specified in the constructor. The callback function (provided in the
+ * constructor) is called with every statistics collection.</p>
+ *
+ * <p>Callers should call {@code start()} to begin statistics collection and
+ * {@code shutDown()} to stop it; {@code shutDown()} must be called before the
+ * statistics reference (provided in the constructor) is disposed.</p>
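+ *
+ * <p>A collection sketch (illustrative only; it assumes statistics were
+ * enabled with {@code options.createStatistics()} before the DB was opened,
+ * that {@code callback} is a user-supplied StatisticsCollectorCallback, and
+ * that {@code java.util.Collections} is imported):</p>
+ * <pre>{@code
+ * StatsCollectorInput input =
+ *     new StatsCollectorInput(options.statisticsPtr(), callback);
+ * StatisticsCollector collector = new StatisticsCollector(
+ *     Collections.singletonList(input), 1000); // collect once per second
+ * collector.start();
+ * // ... later ...
+ * collector.shutDown(5000);
+ * }</pre>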
+ */
+public class StatisticsCollector {
+  private final List<StatsCollectorInput> _statsCollectorInputList;
+  private final ExecutorService _executorService;
+  private final int _statsCollectionInterval;
+  private volatile boolean _isRunning = true;
+
+  /**
+   * Constructor for statistics collector.
+   *
+   * @param statsCollectorInputList List of statistics collector input.
+   * @param statsCollectionIntervalInMilliSeconds Statistics collection time
+   *        period (specified in milliseconds).
+   */
+  public StatisticsCollector(
+      final List<StatsCollectorInput> statsCollectorInputList,
+      final int statsCollectionIntervalInMilliSeconds) {
+    _statsCollectorInputList = statsCollectorInputList;
+    _statsCollectionInterval = statsCollectionIntervalInMilliSeconds;
+
+    _executorService = Executors.newSingleThreadExecutor();
+  }
+
+  public void start() {
+    _executorService.submit(collectStatistics());
+  }
+
+  /**
+   * Shuts down statistics collector.
+   *
+   * @param shutdownTimeout Time in milliseconds to wait for shutdown before
+   *        killing the collection process.
+   * @throws java.lang.InterruptedException thrown if the wait for
+   *        termination is interrupted.
+   */
+  public void shutDown(final int shutdownTimeout) throws InterruptedException {
+    _isRunning = false;
+
+    _executorService.shutdownNow();
+    // Wait for collectStatistics runnable to finish so that disposal of
+    // statistics does not cause any exceptions to be thrown.
+    _executorService.awaitTermination(shutdownTimeout, TimeUnit.MILLISECONDS);
+  }
+
+  private Runnable collectStatistics() {
+    return new Runnable() {
+
+      @Override
+      public void run() {
+        while (_isRunning) {
+          try {
+            if(Thread.currentThread().isInterrupted()) {
+              break;
+            }
+            for(StatsCollectorInput statsCollectorInput :
+                _statsCollectorInputList) {
+              Statistics statistics = statsCollectorInput.getStatistics();
+              StatisticsCollectorCallback statsCallback =
+                  statsCollectorInput.getCallback();
+
+              // Collect ticker data
+              for(TickerType ticker : TickerType.values()) {
+                long tickerValue = statistics.getTickerCount(ticker);
+                statsCallback.tickerCallback(ticker, tickerValue);
+              }
+
+              // Collect histogram data
+              for(HistogramType histogramType : HistogramType.values()) {
+                HistogramData histogramData =
+                    statistics.geHistogramData(histogramType);
+                statsCallback.histogramCallback(histogramType, histogramData);
+              }
+
+              Thread.sleep(_statsCollectionInterval);
+            }
+          }
+          catch (InterruptedException e) {
+            Thread.currentThread().interrupt();
+            break;
+          }
+          catch (Exception e) {
+            throw new RuntimeException("Error while calculating statistics", e);
+          }
+        }
+      }
+    };
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/StatisticsCollectorCallback.java b/src/rocksdb/java/src/main/java/org/rocksdb/StatisticsCollectorCallback.java
new file mode 100644
index 0000000..2ce92c5
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/StatisticsCollectorCallback.java
@@ -0,0 +1,32 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+/**
+ * Callback interface provided to StatisticsCollector.
+ *
+ * Thread safety:
+ * StatisticsCollector doesn't make any guarantees about thread safety.
+ * If the same StatisticsCollectorCallback reference is passed to multiple
+ * StatisticsCollector instances, then it is the responsibility of the
+ * user to make the StatisticsCollectorCallback implementation thread-safe.
+ *
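+ * <p>An implementation sketch (illustrative only):</p>
+ * <pre>{@code
+ * StatisticsCollectorCallback callback = new StatisticsCollectorCallback() {
+ *   public void tickerCallback(TickerType tickerType, long tickerCount) {
+ *     System.out.println(tickerType + ": " + tickerCount);
+ *   }
+ *   public void histogramCallback(HistogramType histType,
+ *       HistogramData histData) {
+ *     System.out.println(histType + " median: " + histData.getMedian());
+ *   }
+ * };
+ * }</pre>
+ *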
+ */
+public interface StatisticsCollectorCallback {
+  /**
+   * Callback function to get ticker values.
+   * @param tickerType Ticker type.
+   * @param tickerCount Value of ticker type.
+   */
+  void tickerCallback(TickerType tickerType, long tickerCount);
+
+  /**
+   * Callback function to get histogram values.
+   * @param histType Histogram type.
+   * @param histData Histogram data.
+   */
+  void histogramCallback(HistogramType histType, HistogramData histData);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/StatsCollectorInput.java b/src/rocksdb/java/src/main/java/org/rocksdb/StatsCollectorInput.java
new file mode 100644
index 0000000..0e842c2
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/StatsCollectorInput.java
@@ -0,0 +1,35 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+/**
+ * Contains all information necessary to collect statistics from one instance
+ * of DB statistics.
+ */
+public class StatsCollectorInput {
+  private final Statistics _statistics;
+  private final StatisticsCollectorCallback _statsCallback;
+
+  /**
+   * Constructor for StatsCollectorInput.
+   *
+   * @param statistics Reference of DB statistics.
+   * @param statsCallback Reference of statistics callback interface.
+   */
+  public StatsCollectorInput(final Statistics statistics,
+      final StatisticsCollectorCallback statsCallback) {
+    _statistics = statistics;
+    _statsCallback = statsCallback;
+  }
+
+  public Statistics getStatistics() {
+    return _statistics;
+  }
+
+  public StatisticsCollectorCallback getCallback() {
+    return _statsCallback;
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/StringAppendOperator.java b/src/rocksdb/java/src/main/java/org/rocksdb/StringAppendOperator.java
new file mode 100644
index 0000000..52cd43e
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/StringAppendOperator.java
@@ -0,0 +1,17 @@
+// Copyright (c) 2014, Vlad Balan (vlad.gm at gmail.com).  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+/**
+ * StringAppendOperator is a merge operator that concatenates
+ * two strings.
+ */
+public class StringAppendOperator implements MergeOperator {
+    @Override public long newMergeOperatorHandle() {
+        return newMergeOperatorHandleImpl();
+    }
+    private native long newMergeOperatorHandleImpl();
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/TableFormatConfig.java b/src/rocksdb/java/src/main/java/org/rocksdb/TableFormatConfig.java
new file mode 100644
index 0000000..58a533b
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/TableFormatConfig.java
@@ -0,0 +1,22 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+package org.rocksdb;
+
+/**
+ * TableFormatConfig is used to configure the internal Table format of a RocksDB.
+ * To make a RocksDB use a specific Table format, its associated
+ * TableFormatConfig should be properly set and passed to Options via
+ * Options.setTableFormatConfig(), and the db should be opened using that Options.
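+ *
+ * <p>A configuration sketch (illustrative only; it assumes the block-based
+ * table format via {@code BlockBasedTableConfig}):</p>
+ * <pre>{@code
+ * Options options = new Options().setTableFormatConfig(
+ *     new BlockBasedTableConfig().setBlockSize(16 * 1024));
+ * }</pre>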
+ */
+public abstract class TableFormatConfig {
+  /**
+   * <p>This function should only be called by Options.setTableFormatConfig(),
+   * which will create a c++ shared-pointer to the c++ TableFactory
+   * that is associated with the Java TableFormatConfig.</p>
+   *
+   * @return native handle address to native table instance.
+   */
+  protected abstract long newTableFactoryHandle();
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/TickerType.java b/src/rocksdb/java/src/main/java/org/rocksdb/TickerType.java
new file mode 100644
index 0000000..180fbf4
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/TickerType.java
@@ -0,0 +1,137 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+public enum TickerType {
+  // total block cache misses
+  // REQUIRES: BLOCK_CACHE_MISS == BLOCK_CACHE_INDEX_MISS +
+  //                               BLOCK_CACHE_FILTER_MISS +
+  //                               BLOCK_CACHE_DATA_MISS;
+  BLOCK_CACHE_MISS(0),
+  // total block cache hit
+  // REQUIRES: BLOCK_CACHE_HIT == BLOCK_CACHE_INDEX_HIT +
+  //                              BLOCK_CACHE_FILTER_HIT +
+  //                              BLOCK_CACHE_DATA_HIT;
+  BLOCK_CACHE_HIT(1),
+  // # of blocks added to block cache.
+  BLOCK_CACHE_ADD(2),
+  // # of times cache miss when accessing index block from block cache.
+  BLOCK_CACHE_INDEX_MISS(3),
+  // # of times cache hit when accessing index block from block cache.
+  BLOCK_CACHE_INDEX_HIT(4),
+  // # of times cache miss when accessing filter block from block cache.
+  BLOCK_CACHE_FILTER_MISS(5),
+  // # of times cache hit when accessing filter block from block cache.
+  BLOCK_CACHE_FILTER_HIT(6),
+  // # of times cache miss when accessing data block from block cache.
+  BLOCK_CACHE_DATA_MISS(7),
+  // # of times cache hit when accessing data block from block cache.
+  BLOCK_CACHE_DATA_HIT(8),
+  // # of times bloom filter has avoided file reads.
+  BLOOM_FILTER_USEFUL(9),
+
+  // # of memtable hits.
+  MEMTABLE_HIT(10),
+  // # of memtable misses.
+  MEMTABLE_MISS(11),
+
+  // # of Get() queries served by L0
+  GET_HIT_L0(12),
+  // # of Get() queries served by L1
+  GET_HIT_L1(13),
+  // # of Get() queries served by L2 and up
+  GET_HIT_L2_AND_UP(14),
+
+  /**
+   * COMPACTION_KEY_DROP_* count the reasons for key drop during compaction
+   * There are 3 reasons currently.
+   */
+  COMPACTION_KEY_DROP_NEWER_ENTRY(15),  // key was written with a newer value.
+  COMPACTION_KEY_DROP_OBSOLETE(16),     // The key is obsolete.
+  COMPACTION_KEY_DROP_USER(17),  // user compaction function has dropped the key.
+
+  // Number of keys written to the database via the Put and Write calls
+  NUMBER_KEYS_WRITTEN(18),
+  // Number of keys read
+  NUMBER_KEYS_READ(19),
+  // Number of keys updated, if in-place update is enabled
+  NUMBER_KEYS_UPDATED(20),
+  // Bytes written / read
+  BYTES_WRITTEN(21),
+  BYTES_READ(22),
+  NO_FILE_CLOSES(23),
+  NO_FILE_OPENS(24),
+  NO_FILE_ERRORS(25),
+  // Time the system had to wait to do L0-L1 compactions
+  STALL_L0_SLOWDOWN_MICROS(26),
+  // Time system had to wait to move memtable to L1.
+  STALL_MEMTABLE_COMPACTION_MICROS(27),
+  // write throttle because of too many files in L0
+  STALL_L0_NUM_FILES_MICROS(28),
+  // Writer has to wait for compaction or flush to finish.
+  STALL_MICROS(29),
+  // The wait time for db mutex.
+  DB_MUTEX_WAIT_MICROS(30),
+  RATE_LIMIT_DELAY_MILLIS(31),
+  NO_ITERATORS(32),  // number of iterators currently open
+
+  // Number of MultiGet calls, keys read, and bytes read
+  NUMBER_MULTIGET_CALLS(33),
+  NUMBER_MULTIGET_KEYS_READ(34),
+  NUMBER_MULTIGET_BYTES_READ(35),
+
+  // Number of delete records that were not required to be
+  // written to storage because the key did not exist
+  NUMBER_FILTERED_DELETES(36),
+  NUMBER_MERGE_FAILURES(37),
+  SEQUENCE_NUMBER(38),
+
+  // number of times bloom was checked before creating iterator on a
+  // file, and the number of times the check was useful in avoiding
+  // iterator creation (and thus likely IOPs).
+  BLOOM_FILTER_PREFIX_CHECKED(39),
+  BLOOM_FILTER_PREFIX_USEFUL(40),
+
+  // Number of times we had to reseek inside an iteration to skip
+  // over large number of keys with same userkey.
+  NUMBER_OF_RESEEKS_IN_ITERATION(41),
+
+  // Record the number of calls to GetUpdatesSince. Useful to keep track of
+  // transaction log iterator refreshes
+  GET_UPDATES_SINCE_CALLS(42),
+  BLOCK_CACHE_COMPRESSED_MISS(43),  // miss in the compressed block cache
+  BLOCK_CACHE_COMPRESSED_HIT(44),   // hit in the compressed block cache
+  WAL_FILE_SYNCED(45),              // Number of times WAL sync is done
+  WAL_FILE_BYTES(46),               // Number of bytes written to WAL
+
+  // Writes can be processed by requesting thread or by the thread at the
+  // head of the writers queue.
+  WRITE_DONE_BY_SELF(47),
+  WRITE_DONE_BY_OTHER(48),
+  WRITE_TIMEDOUT(49),       // Number of writes that timed out.
+  WRITE_WITH_WAL(50),       // Number of Write calls that request WAL
+  COMPACT_READ_BYTES(51),   // Bytes read during compaction
+  COMPACT_WRITE_BYTES(52),  // Bytes written during compaction
+  FLUSH_WRITE_BYTES(53),    // Bytes written during flush
+
+  // Number of table properties loaded directly from file, without creating
+  // table reader object.
+  NUMBER_DIRECT_LOAD_TABLE_PROPERTIES(54),
+  NUMBER_SUPERVERSION_ACQUIRES(55),
+  NUMBER_SUPERVERSION_RELEASES(56),
+  NUMBER_SUPERVERSION_CLEANUPS(57),
+  NUMBER_BLOCK_NOT_COMPRESSED(58);
+
+  private final int value_;
+
+  private TickerType(int value) {
+    value_ = value;
+  }
+
+  public int getValue() {
+    return value_;
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/TransactionLogIterator.java b/src/rocksdb/java/src/main/java/org/rocksdb/TransactionLogIterator.java
new file mode 100644
index 0000000..36f7e2c
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/TransactionLogIterator.java
@@ -0,0 +1,116 @@
+package org.rocksdb;
+
+/**
+ * <p>A TransactionLogIterator is used to iterate over the transactions in a db.
+ * One run of the iterator is continuous, i.e. the iterator will stop at the
+ * beginning of any gap in sequences.</p>
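+ *
+ * <p>An iteration sketch (illustrative only; it assumes an open
+ * {@code RocksDB} instance named {@code db} whose WAL files are still
+ * available):</p>
+ * <pre>{@code
+ * TransactionLogIterator logIter = db.getUpdatesSince(0);
+ * while (logIter.isValid()) {
+ *   TransactionLogIterator.BatchResult result = logIter.getBatch();
+ *   long sequenceNumber = result.sequenceNumber();
+ *   WriteBatch batch = result.writeBatch();
+ *   // ... inspect the batch ...
+ *   logIter.next();
+ * }
+ * logIter.dispose();
+ * }</pre>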
+ */
+public class TransactionLogIterator extends RocksObject {
+
+  /**
+   * <p>An iterator is either positioned at a WriteBatch
+   * or not valid. This method returns true if the iterator
+   * is valid. Data can only be read from a valid iterator.</p>
+   *
+   * @return true if iterator position is valid.
+   */
+  public boolean isValid() {
+    return isValid(nativeHandle_);
+  }
+
+  /**
+   * <p>Moves the iterator to the next WriteBatch.
+   * <strong>REQUIRES</strong>: Valid() to be true.</p>
+   */
+  public void next() {
+    next(nativeHandle_);
+  }
+
+  /**
+   * <p>Throws RocksDBException if something went wrong.</p>
+   *
+   * @throws org.rocksdb.RocksDBException if something went
+   *     wrong in the underlying C++ code.
+   */
+  public void status() throws RocksDBException {
+    status(nativeHandle_);
+  }
+
+  /**
+   * <p>If iterator position is valid, return the current
+   * write_batch and the sequence number of the earliest
+   * transaction contained in the batch.</p>
+   *
+   * <p>ONLY use if Valid() is true and status() is OK.</p>
+   *
+   * @return {@link org.rocksdb.TransactionLogIterator.BatchResult}
+   *     instance.
+   */
+  public BatchResult getBatch() {
+    assert(isValid());
+    return getBatch(nativeHandle_);
+  }
+
+  /**
+   * <p>TransactionLogIterator constructor.</p>
+   *
+   * @param nativeHandle address of the native instance.
+   */
+  TransactionLogIterator(final long nativeHandle) {
+    super();
+    nativeHandle_ = nativeHandle;
+  }
+
+  @Override protected void disposeInternal() {
+    disposeInternal(nativeHandle_);
+  }
+
+  /**
+   * <p>BatchResult represents a data structure returned
+   * by a TransactionLogIterator containing a sequence
+   * number and a {@link WriteBatch} instance.</p>
+   */
+  public final class BatchResult {
+    /**
+     * <p>Constructor of BatchResult class.</p>
+     *
+     * @param sequenceNumber related to this BatchResult instance.
+     * @param nativeHandle to {@link org.rocksdb.WriteBatch}
+     *     native instance.
+     */
+    public BatchResult(final long sequenceNumber,
+        final long nativeHandle) {
+      sequenceNumber_ = sequenceNumber;
+      writeBatch_ = new WriteBatch(nativeHandle);
+    }
+
+    /**
+     * <p>Return sequence number related to this BatchResult.</p>
+     *
+     * @return Sequence number.
+     */
+    public long sequenceNumber() {
+      return sequenceNumber_;
+    }
+
+    /**
+     * <p>Return contained {@link org.rocksdb.WriteBatch}
+     * instance</p>
+     *
+     * @return {@link org.rocksdb.WriteBatch} instance.
+     */
+    public WriteBatch writeBatch() {
+      return writeBatch_;
+    }
+
+    private final long sequenceNumber_;
+    private final WriteBatch writeBatch_;
+  }
+
+  private native void disposeInternal(long handle);
+  private native boolean isValid(long handle);
+  private native void next(long handle);
+  private native void status(long handle)
+      throws RocksDBException;
+  private native BatchResult getBatch(long handle);
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/TtlDB.java b/src/rocksdb/java/src/main/java/org/rocksdb/TtlDB.java
new file mode 100644
index 0000000..de6dea9
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/TtlDB.java
@@ -0,0 +1,197 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+import java.util.List;
+
+/**
+ * Database with TTL support.
+ *
+ * <p><strong>Use case</strong></p>
+ * <p>This API should be used to open the db when key-values inserted are
+ * meant to be removed from the db in a non-strict 'ttl' amount of time.
+ * Therefore, this guarantees that key-values inserted will remain in the
+ * db for at least ttl amount of time, and the db will make efforts to remove
+ * the key-values as soon as possible after ttl seconds of their insertion.
+ * </p>
+ *
+ * <p><strong>Behaviour</strong></p>
+ * <ul>
+ * <li>TTL is accepted in seconds.</li>
+ * <li>(int32_t)Timestamp(creation) is suffixed to values in Put internally.</li>
+ * <li>Expired TTL values are deleted in compaction only: (Timestamp + ttl &lt; time_now).</li>
+ * <li>Get/Iterator may return expired entries (compaction has not yet run on them).</li>
+ * <li>Different TTLs may be used during different Opens.</li>
+ * </ul>
+ *
+ * <p><strong>Example</strong></p>
+ * <ul>
+ * <li>Open1 at t=0 with ttl=4 and insert k1,k2, close at t=2</li>
+ * <li>Open2 at t=3 with ttl=5. Now k1,k2 should be deleted at t>=5</li>
+ * </ul>
+ *
+ * <p>
+ * read_only=true opens in the usual read-only mode. Compactions will not be
+ * triggered (neither manual nor automatic), so no expired entries are removed.
+ * </p>
+ *
+ * <p><strong>Constraints</strong></p>
+ * <p>Not specifying/passing or non-positive TTL behaves
+ * like TTL = infinity</p>
+ *
+ * <p><strong>!!!WARNING!!!</strong></p>
+ * <p>Calling DB::Open directly to re-open a db created by this API will get
+ * corrupt values (timestamp suffixed) and no ttl effect will be present
+ * during the second Open, so use this API consistently to open the db.
+ * Be careful when passing a ttl with a small positive value because the
+ * whole database may be deleted in a small amount of time.</p>
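+ *
+ * <p>An opening sketch (illustrative only):</p>
+ * <pre>{@code
+ * Options options = new Options().setCreateIfMissing(true);
+ * TtlDB ttlDb = TtlDB.open(options, "/path/to/db", 3600, false); // 1h TTL
+ * ttlDb.put("key".getBytes(), "value".getBytes());
+ * ttlDb.close();
+ * }</pre>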
+ */
+public class TtlDB extends RocksDB {
+
+  /**
+   * <p>Opens a TtlDB.</p>
+   *
+   * <p>Database is opened in read-write mode without default TTL.</p>
+   *
+   * @param options {@link org.rocksdb.Options} instance.
+   * @param db_path path to database.
+   *
+   * @return TtlDB instance.
+   *
+   * @throws RocksDBException thrown if an error occurs within the native
+   *     part of the library.
+   */
+  public static TtlDB open(final Options options, final String db_path)
+      throws RocksDBException {
+    return open(options, db_path, 0, false);
+  }
+
+  /**
+   * <p>Opens a TtlDB.</p>
+   *
+   * @param options {@link org.rocksdb.Options} instance.
+   * @param db_path path to database.
+   * @param ttl time to live for new entries.
+   * @param readOnly boolean value indicating if the database is
+   *     opened read-only.
+   *
+   * @return TtlDB instance.
+   *
+   * @throws RocksDBException thrown if an error occurs within the native
+   *     part of the library.
+   */
+  public static TtlDB open(final Options options, final String db_path,
+      final int ttl, final boolean readOnly) throws RocksDBException {
+    TtlDB ttldb = new TtlDB();
+    ttldb.open(options.nativeHandle_, db_path, ttl, readOnly);
+    return ttldb;
+  }
+
+  /**
+   * <p>Opens a TtlDB.</p>
+   *
+   * @param options {@link org.rocksdb.Options} instance.
+   * @param db_path path to database.
+   * @param columnFamilyDescriptors list of column family descriptors
+   * @param columnFamilyHandles will be filled with ColumnFamilyHandle instances
+   *     on open.
+   * @param ttlValues time to live values per column family handle
+   * @param readOnly boolean value indicating if the database is
+   *     opened read-only.
+   *
+   * @return TtlDB instance.
+   *
+   * @throws RocksDBException thrown if an error occurs within the native
+   *     part of the library.
+   * @throws java.lang.IllegalArgumentException when there is not a ttl value
+   *     per given column family handle.
+   */
+  public static TtlDB open(final DBOptions options, final String db_path,
+      final List<ColumnFamilyDescriptor> columnFamilyDescriptors,
+      final List<ColumnFamilyHandle> columnFamilyHandles,
+      final List<Integer> ttlValues, final boolean readOnly)
+      throws RocksDBException {
+    if (columnFamilyDescriptors.size() != ttlValues.size()) {
+      throw new IllegalArgumentException("There must be a ttl value per column" +
+          "family handle.");
+    }
+    TtlDB ttlDB = new TtlDB();
+    List<Long> cfReferences = ttlDB.openCF(options.nativeHandle_, db_path,
+        columnFamilyDescriptors, columnFamilyDescriptors.size(),
+        ttlValues, readOnly);
+    for (int i=0; i<columnFamilyDescriptors.size(); i++) {
+      columnFamilyHandles.add(new ColumnFamilyHandle(ttlDB, cfReferences.get(i)));
+    }
+    return ttlDB;
+  }
+
+  /**
+   * <p>Creates a new ttl based column family with a name defined
+   * in given ColumnFamilyDescriptor and allocates a
+   * ColumnFamilyHandle within an internal structure.</p>
+   *
+   * <p>The ColumnFamilyHandle is automatically disposed with DB
+   * disposal.</p>
+   *
+   * @param columnFamilyDescriptor column family to be created.
+   * @param ttl TTL to set for this column family.
+   *
+   * @return {@link org.rocksdb.ColumnFamilyHandle} instance.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *    native library.
+   */
+  public ColumnFamilyHandle createColumnFamilyWithTtl(
+      final ColumnFamilyDescriptor columnFamilyDescriptor,
+      final int ttl) throws RocksDBException {
+    assert(isInitialized());
+    return new ColumnFamilyHandle(this,
+        createColumnFamilyWithTtl(nativeHandle_,
+            columnFamilyDescriptor, ttl));
+  }
+
+  /**
+   * <p>Close the TtlDB instance and release resource.</p>
+   *
+   * <p>Internally, TtlDB owns the {@code rocksdb::DB} pointer
+   * to its associated {@link org.rocksdb.RocksDB}. The release
+   * of that RocksDB pointer is handled in the destructor of the
+   * c++ {@code rocksdb::TtlDB} and should be transparent to
+   * Java developers.</p>
+   */
+  @Override public synchronized void close() {
+    if (isInitialized()) {
+      super.close();
+    }
+  }
+
+  /**
+   * <p>A protected constructor that will be used in the static
+   * factory method
+   * {@link #open(Options, String, int, boolean)}
+   * and
+   * {@link #open(DBOptions, String, java.util.List, java.util.List,
+   * java.util.List, boolean)}.
+   * </p>
+   */
+  protected TtlDB() {
+    super();
+  }
+
+  @Override protected void finalize() throws Throwable {
+    close();
+    super.finalize();
+  }
+
+  private native void open(long optionsHandle, String db_path, int ttl,
+      boolean readOnly) throws RocksDBException;
+  private native List<Long> openCF(long optionsHandle, String db_path,
+      List<ColumnFamilyDescriptor> columnFamilyDescriptors,
+      int columnFamilyDescriptorsLength, List<Integer> ttlValues,
+      boolean readOnly) throws RocksDBException;
+  private native long createColumnFamilyWithTtl(long handle,
+      ColumnFamilyDescriptor columnFamilyDescriptor, int ttl)
+      throws RocksDBException;
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/VectorMemTableConfig.java b/src/rocksdb/java/src/main/java/org/rocksdb/VectorMemTableConfig.java
new file mode 100644
index 0000000..3783402
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/VectorMemTableConfig.java
@@ -0,0 +1,45 @@
+package org.rocksdb;
+
+/**
+ * The config for vector memtable representation.
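+ *
+ * <p>A configuration sketch (illustrative only; it assumes the usual
+ * {@code Options#setMemTableConfig} wiring):</p>
+ * <pre>{@code
+ * Options options = new Options().setMemTableConfig(
+ *     new VectorMemTableConfig().setReservedSize(10000));
+ * }</pre>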
+ */
+public class VectorMemTableConfig extends MemTableConfig {
+  public static final int DEFAULT_RESERVED_SIZE = 0;
+
+  /**
+   * VectorMemTableConfig constructor
+   */
+  public VectorMemTableConfig() {
+    reservedSize_ = DEFAULT_RESERVED_SIZE;
+  }
+
+  /**
+   * Set the initial size of the vector that will be used
+   * by the memtable created based on this config.
+   *
+   * @param size the initial size of the vector.
+   * @return the reference to the current config.
+   */
+  public VectorMemTableConfig setReservedSize(final int size) {
+    reservedSize_ = size;
+    return this;
+  }
+
+  /**
+   * Returns the initial size of the vector used by the memtable
+   * created based on this config.
+   *
+   * @return the initial size of the vector.
+   */
+  public int reservedSize() {
+    return reservedSize_;
+  }
+
+  @Override protected long newMemTableFactoryHandle() {
+    return newMemTableFactoryHandle(reservedSize_);
+  }
+
+  private native long newMemTableFactoryHandle(long reservedSize)
+      throws IllegalArgumentException;
+  private int reservedSize_;
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/WBWIRocksIterator.java b/src/rocksdb/java/src/main/java/org/rocksdb/WBWIRocksIterator.java
new file mode 100644
index 0000000..f42f549
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/WBWIRocksIterator.java
@@ -0,0 +1,149 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+public class WBWIRocksIterator extends AbstractRocksIterator<WriteBatchWithIndex> {
+  private final WriteEntry entry = new WriteEntry();
+
+  protected WBWIRocksIterator(final WriteBatchWithIndex wbwi, final long nativeHandle) {
+    super(wbwi, nativeHandle);
+  }
+
+  /**
+   * Get the current entry.
+   *
+   * The WriteEntry is only valid
+   * until the iterator is repositioned.
+   * If you want to keep the WriteEntry across iterator
+   * movements, you must make a copy of its data!
+   *
+   * @return The WriteEntry of the current entry
+   */
+  public WriteEntry entry() {
+    assert(isInitialized());
+    assert(entry != null);
+    entry1(nativeHandle_, entry);
+    return entry;
+  }
+
+  @Override final native void disposeInternal(long handle);
+  @Override final native boolean isValid0(long handle);
+  @Override final native void seekToFirst0(long handle);
+  @Override final native void seekToLast0(long handle);
+  @Override final native void next0(long handle);
+  @Override final native void prev0(long handle);
+  @Override final native void seek0(long handle, byte[] target, int targetLen);
+  @Override final native void status0(long handle) throws RocksDBException;
+
+  private native void entry1(long handle, WriteEntry entry);
+
+  /**
+   * Enumeration of the Write operation
+   * that created the record in the Write Batch
+   */
+  public enum WriteType {
+    PUT,
+    MERGE,
+    DELETE,
+    LOG
+  }
+
+  /**
+   * Represents an entry returned by
+   * {@link org.rocksdb.WBWIRocksIterator#entry()}
+   *
+   * It is worth noting that a WriteEntry with
+   * the type {@link org.rocksdb.WBWIRocksIterator.WriteType#DELETE}
+   * or {@link org.rocksdb.WBWIRocksIterator.WriteType#LOG}
+   * will not have a value.
+   */
+  public static class WriteEntry {
+    WriteType type = null;
+    final DirectSlice key;
+    final DirectSlice value;
+
+    /**
+     * Intentionally private as this
+     * should only be instantiated in
+     * this manner by the outer WBWIRocksIterator
+     * class; The class members are then modified
+     * by calling {@link org.rocksdb.WBWIRocksIterator#entry()}
+     */
+    private WriteEntry() {
+      key = new DirectSlice();
+      value = new DirectSlice();
+    }
+
+    public WriteEntry(WriteType type, DirectSlice key, DirectSlice value) {
+      this.type = type;
+      this.key = key;
+      this.value = value;
+    }
+
+    /**
+     * Returns the type of the Write Entry
+     *
+     * @return the WriteType of the WriteEntry
+     */
+    public WriteType getType() {
+      return type;
+    }
+
+    /**
+     * Returns the key of the Write Entry
+     *
+     * @return The slice containing the key
+     * of the WriteEntry
+     */
+    public DirectSlice getKey() {
+      return key;
+    }
+
+    /**
+     * Returns the value of the Write Entry
+     *
+     * @return The slice containing the value of
+     * the WriteEntry or null if the WriteEntry has
+     * no value
+     */
+    public DirectSlice getValue() {
+      if(!value.isInitialized()) {
+        return null; //TODO(AR) migrate to JDK8 java.util.Optional#empty()
+      } else {
+        return value;
+      }
+    }
+
+    /**
+     * Generates a hash code for the Write Entry. NOTE: The hash code is based
+     * on the string representation of the key, so it may not work correctly
+     * with exotic custom comparators.
+     *
+     * @return The hash code for the Write Entry
+     */
+    @Override
+    public int hashCode() {
+      return (key == null) ? 0 : key.hashCode();
+    }
+
+    @Override
+    public boolean equals(Object other) {
+      if(other == null) {
+        return false;
+      } else if (this == other) {
+        return true;
+      } else if(other instanceof WriteEntry) {
+        final WriteEntry otherWriteEntry = (WriteEntry)other;
+        return type.equals(otherWriteEntry.type)
+            && key.equals(otherWriteEntry.key)
+            && (value.isInitialized() ? value.equals(otherWriteEntry.value)
+                : !otherWriteEntry.value.isInitialized());
+      } else {
+        return false;
+      }
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/WriteBatch.java b/src/rocksdb/java/src/main/java/org/rocksdb/WriteBatch.java
new file mode 100644
index 0000000..960d122
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/WriteBatch.java
@@ -0,0 +1,126 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+/**
+ * WriteBatch holds a collection of updates to apply atomically to a DB.
+ *
+ * The updates are applied in the order in which they are added
+ * to the WriteBatch.  For example, the value of "key" will be "v3"
+ * after the following batch is written:
+ *
+ *    batch.put("key", "v1");
+ *    batch.remove("key");
+ *    batch.put("key", "v2");
+ *    batch.put("key", "v3");
+ *
+ * Multiple threads can invoke const methods on a WriteBatch without
+ * external synchronization, but if any of the threads may call a
+ * non-const method, all threads accessing the same WriteBatch must use
+ * external synchronization.
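+ *
+ * <p>A write sketch (illustrative only; it assumes an open {@code RocksDB}
+ * instance named {@code db}):</p>
+ * <pre>{@code
+ * WriteBatch batch = new WriteBatch();
+ * batch.put("key".getBytes(), "v1".getBytes());
+ * batch.remove("key".getBytes());
+ * batch.put("key".getBytes(), "v3".getBytes());
+ * db.write(new WriteOptions(), batch); // applied atomically
+ * batch.dispose();
+ * }</pre>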
+ */
+public class WriteBatch extends AbstractWriteBatch {
+  /**
+   * Constructs a WriteBatch instance.
+   */
+  public WriteBatch() {
+    super();
+    newWriteBatch(0);
+  }
+
+  /**
+   * Constructs a WriteBatch instance with a given size.
+   *
+   * @param reserved_bytes reserved size for WriteBatch
+   */
+  public WriteBatch(final int reserved_bytes) {
+    nativeHandle_ = 0;
+    newWriteBatch(reserved_bytes);
+  }
+
+  /**
+   * Support for iterating over the contents of a batch.
+   *
+   * @param handler A handler that is called back for each
+   *                update present in the batch
+   *
+   * @throws RocksDBException If we cannot iterate over the batch
+   */
+  public void iterate(final Handler handler) throws RocksDBException {
+    iterate(handler.nativeHandle_);
+  }
+
+  /**
+   * <p>Private WriteBatch constructor which is used to construct
+   * WriteBatch instances from C++ side. As the reference to this
+   * object is also managed from C++ side the handle will be disowned.</p>
+   *
+   * @param nativeHandle address of native instance.
+   */
+  WriteBatch(final long nativeHandle) {
+    super();
+    disOwnNativeHandle();
+    nativeHandle_ = nativeHandle;
+  }
+
+  @Override final native void disposeInternal(long handle);
+  @Override final native int count0();
+  @Override final native void put(byte[] key, int keyLen, byte[] value, int valueLen);
+  @Override final native void put(byte[] key, int keyLen, byte[] value, int valueLen,
+      long cfHandle);
+  @Override final native void merge(byte[] key, int keyLen, byte[] value, int valueLen);
+  @Override final native void merge(byte[] key, int keyLen, byte[] value, int valueLen,
+      long cfHandle);
+  @Override final native void remove(byte[] key, int keyLen);
+  @Override final native void remove(byte[] key, int keyLen, long cfHandle);
+  @Override final native void putLogData(byte[] blob, int blobLen);
+  @Override final native void clear0();
+
+  private native void newWriteBatch(int reserved_bytes);
+  private native void iterate(long handlerHandle) throws RocksDBException;
+
+  /**
+   * Handler callback for iterating over the contents of a batch.
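+   *
+   * <p>An implementation sketch (illustrative only; {@code batch} is an
+   * existing WriteBatch and the {@code handle*} methods stand in for
+   * hypothetical user callbacks):</p>
+   * <pre>{@code
+   * batch.iterate(new WriteBatch.Handler() {
+   *   public void put(byte[] key, byte[] value) { handlePut(key, value); }
+   *   public void merge(byte[] key, byte[] value) { handleMerge(key, value); }
+   *   public void delete(byte[] key) { handleDelete(key); }
+   *   public void logData(byte[] blob) { handleLogData(blob); }
+   * });
+   * }</pre>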
+   */
+  public static abstract class Handler extends RocksObject {
+    public Handler() {
+      super();
+      createNewHandler0();
+    }
+
+    public abstract void put(byte[] key, byte[] value);
+    public abstract void merge(byte[] key, byte[] value);
+    public abstract void delete(byte[] key);
+    public abstract void logData(byte[] blob);
+
+    /**
+     * shouldContinue is called by the underlying iterator
+     * WriteBatch::Iterate. If it returns false,
+     * iteration is halted. Otherwise, it continues
+     * iterating. The default implementation always
+     * returns true.
+     *
+     * @return boolean value indicating whether the
+     *     iteration should continue.
+     */
+    public boolean shouldContinue() {
+      return true;
+    }
+
+    /**
+     * Deletes underlying C++ handler pointer.
+     */
+    @Override
+    protected void disposeInternal() {
+      assert(isInitialized());
+      disposeInternal(nativeHandle_);
+    }
+
+    private native void createNewHandler0();
+    private native void disposeInternal(long handle);
+  }
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/WriteBatchInterface.java b/src/rocksdb/java/src/main/java/org/rocksdb/WriteBatchInterface.java
new file mode 100644
index 0000000..d5c24ec
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/WriteBatchInterface.java
@@ -0,0 +1,98 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+/**
+ * <p>Defines the interface for a Write Batch which
+ * holds a collection of updates to apply atomically to a DB.</p>
+ */
+public interface WriteBatchInterface {
+
+    /**
+     * Returns the number of updates in the batch.
+     *
+     * @return number of items in WriteBatch
+     */
+    int count();
+
+    /**
+     * <p>Store the mapping "key->value" in the database.</p>
+     *
+     * @param key the specified key to be inserted.
+     * @param value the value associated with the specified key.
+     */
+    void put(byte[] key, byte[] value);
+
+    /**
+     * <p>Store the mapping "key->value" within given column
+     * family.</p>
+     *
+     * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+     *     instance
+     * @param key the specified key to be inserted.
+     * @param value the value associated with the specified key.
+     */
+    void put(ColumnFamilyHandle columnFamilyHandle,
+                    byte[] key, byte[] value);
+
+    /**
+     * <p>Merge "value" with the existing value of "key" in the database.
+     * "key->merge(existing, value)"</p>
+     *
+     * @param key the specified key to be merged.
+     * @param value the value to be merged with the current value for
+     * the specified key.
+     */
+    void merge(byte[] key, byte[] value);
+
+    /**
+     * <p>Merge "value" with the existing value of "key" in given column family.
+     * "key->merge(existing, value)"</p>
+     *
+     * @param columnFamilyHandle {@link ColumnFamilyHandle} instance
+     * @param key the specified key to be merged.
+     * @param value the value to be merged with the current value for
+     * the specified key.
+     */
+    void merge(ColumnFamilyHandle columnFamilyHandle,
+                      byte[] key, byte[] value);
+
+    /**
+     * <p>If the database contains a mapping for "key", erase it.  Else do nothing.</p>
+     *
+     * @param key Key to delete within database
+     */
+    void remove(byte[] key);
+
+    /**
+     * <p>If column family contains a mapping for "key", erase it.  Else do nothing.</p>
+     *
+     * @param columnFamilyHandle {@link ColumnFamilyHandle} instance
+     * @param key Key to delete within database
+     */
+    void remove(ColumnFamilyHandle columnFamilyHandle, byte[] key);
+
+    /**
+     * Append a blob of arbitrary size to the records in this batch. The blob will
+     * be stored in the transaction log but not in any other file. In particular,
+     * it will not be persisted to the SST files. When iterating over this
+     * WriteBatch, WriteBatch::Handler::LogData will be called with the contents
+     * of the blob as it is encountered. Blobs, puts, deletes, and merges will be
+     * encountered in the same order in which they were inserted. The blob will
+     * NOT consume sequence number(s) and will NOT increase the count of the batch.
+     *
+     * Example application: add timestamps to the transaction log for use in
+     * replication.
+     *
+     * @param blob binary object to be inserted
+     */
+    void putLogData(byte[] blob);
+
+    /**
+     * Clear all updates buffered in this batch
+     */
+    void clear();
+}
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/WriteBatchWithIndex.java b/src/rocksdb/java/src/main/java/org/rocksdb/WriteBatchWithIndex.java
new file mode 100644
index 0000000..bde037b
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/WriteBatchWithIndex.java
@@ -0,0 +1,149 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+/**
+ * Similar to {@link org.rocksdb.WriteBatch} but with a binary searchable
+ * index built for all the keys inserted.
+ *
+ * Calling put, merge, remove or putLogData calls the same function
+ * as with {@link org.rocksdb.WriteBatch} whilst also building an index.
+ *
+ * A user can call {@link org.rocksdb.WriteBatchWithIndex#newIterator()} to create an iterator
+ * over the write batch or
+ * {@link org.rocksdb.WriteBatchWithIndex#newIteratorWithBase(org.rocksdb.RocksIterator)} to
+ * get an iterator for the database with Read-Your-Own-Writes like capability.
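+ *
+ * <p>A read-your-own-writes sketch (illustrative only; it assumes an open
+ * {@code RocksDB} instance named {@code db}):</p>
+ * <pre>{@code
+ * WriteBatchWithIndex wbwi = new WriteBatchWithIndex(true);
+ * wbwi.put("key".getBytes(), "new-value".getBytes());
+ * // iterates over the DB state with the batched update overlaid
+ * RocksIterator it = wbwi.newIteratorWithBase(db.newIterator());
+ * }</pre>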
+ */
+public class WriteBatchWithIndex extends AbstractWriteBatch {
+  /**
+   * Creates a WriteBatchWithIndex where no bytes
+   * are reserved up-front, bytewise comparison is
+   * used for fallback key comparisons,
+   * and duplicate keys operations are retained
+   */
+  public WriteBatchWithIndex() {
+    super();
+    newWriteBatchWithIndex();
+  }
+
+  /**
+   * Creates a WriteBatchWithIndex where no bytes
+   * are reserved up-front, bytewise comparison is
+   * used for fallback key comparisons, and duplicate key
+   * assignment is determined by the constructor argument
+   *
+   * @param overwriteKey if true, overwrite the key in the index when
+   *   inserting a duplicate key, in this way an iterator will never
+   *   show two entries with the same key.
+   */
+  public WriteBatchWithIndex(final boolean overwriteKey) {
+    super();
+    newWriteBatchWithIndex(overwriteKey);
+  }
+
+  /**
+   * Creates a WriteBatchWithIndex
+   *
+   * @param fallbackIndexComparator We fall back to this comparator
+   *  to compare keys within a column family if we cannot determine
+   *  the column family and so look up its comparator.
+   *
+   * @param reservedBytes reserved bytes in underlying WriteBatch
+   *
+   * @param overwriteKey if true, overwrite the key in the index when
+   *   inserting a duplicate key; in this way an iterator will never
+   *   show two entries with the same key.
+   */
+  public WriteBatchWithIndex(final AbstractComparator<? extends AbstractSlice<?>>
+      fallbackIndexComparator, final int reservedBytes, final boolean overwriteKey) {
+    super();
+    newWriteBatchWithIndex(fallbackIndexComparator.nativeHandle_, reservedBytes, overwriteKey);
+  }
+
+  /**
+   * Creates an iterator over a column family. A user can call
+   * {@link org.rocksdb.RocksIteratorInterface#seek(byte[])} to
+   * seek to the first entry at or after a given key. Keys will be iterated in the
+   * order given by index_comparator. For multiple updates on the same key,
+   * each update will be returned as a separate entry, in the order of update
+   * time.
+   *
+   * @param columnFamilyHandle The column family to iterate over
+   * @return An iterator for the Write Batch contents, restricted to the column family
+   */
+  public WBWIRocksIterator newIterator(final ColumnFamilyHandle columnFamilyHandle) {
+    return new WBWIRocksIterator(this, iterator1(columnFamilyHandle.nativeHandle_));
+  }
+
+  /**
+   * Creates an iterator over the default column family. A user can call
+   * {@link org.rocksdb.RocksIteratorInterface#seek(byte[])} to
+   * seek to the first entry at or after a given key. Keys will be iterated in the
+   * order given by index_comparator. For multiple updates on the same key,
+   * each update will be returned as a separate entry, in the order of update
+   * time.
+   *
+   * @return An iterator for the Write Batch contents
+   */
+  public WBWIRocksIterator newIterator() {
+    return new WBWIRocksIterator(this, iterator0());
+  }
+
+  /**
+   * Provides Read-Your-Own-Writes like functionality by
+   * creating a new Iterator that will use {@link org.rocksdb.WBWIRocksIterator}
+   * as a delta and baseIterator as a base
+   *
+   * @param columnFamilyHandle The column family to iterate over
+   * @param baseIterator The base iterator, e.g. {@link org.rocksdb.RocksDB#newIterator()}
+   * @return An iterator which shows a view comprised of both the database point-in-time
+   * from baseIterator and modifications made in this write batch.
+   */
+  public RocksIterator newIteratorWithBase(final ColumnFamilyHandle columnFamilyHandle,
+      final RocksIterator baseIterator) {
+    RocksIterator iterator = new RocksIterator(
+        baseIterator.parent_,
+        iteratorWithBase(columnFamilyHandle.nativeHandle_, baseIterator.nativeHandle_));
+    // when the iterator is deleted it will also delete the baseIterator
+    baseIterator.disOwnNativeHandle();
+    return iterator;
+  }
+
+  /**
+   * Provides Read-Your-Own-Writes like functionality by
+   * creating a new Iterator that will use {@link org.rocksdb.WBWIRocksIterator}
+   * as a delta and baseIterator as a base. Operates on the default column family.
+   *
+   * @param baseIterator The base iterator, e.g. {@link org.rocksdb.RocksDB#newIterator()}
+   * @return An iterator which shows a view comprised of both the database point-in-time
+   * from baseIterator and modifications made in this write batch.
+   */
+  public RocksIterator newIteratorWithBase(final RocksIterator baseIterator) {
+    return newIteratorWithBase(baseIterator.parent_.getDefaultColumnFamily(), baseIterator);
+  }
+
+  @Override final native void disposeInternal(long handle);
+  @Override final native int count0();
+  @Override final native void put(byte[] key, int keyLen, byte[] value, int valueLen);
+  @Override final native void put(byte[] key, int keyLen, byte[] value, int valueLen,
+      long cfHandle);
+  @Override final native void merge(byte[] key, int keyLen, byte[] value, int valueLen);
+  @Override final native void merge(byte[] key, int keyLen, byte[] value, int valueLen,
+      long cfHandle);
+  @Override final native void remove(byte[] key, int keyLen);
+  @Override final native void remove(byte[] key, int keyLen, long cfHandle);
+  @Override final native void putLogData(byte[] blob, int blobLen);
+  @Override final native void clear0();
+
+  private native void newWriteBatchWithIndex();
+  private native void newWriteBatchWithIndex(boolean overwriteKey);
+  private native void newWriteBatchWithIndex(long fallbackIndexComparatorHandle, int reservedBytes,
+      boolean overwriteKey);
+  private native long iterator0();
+  private native long iterator1(long cfHandle);
+  private native long iteratorWithBase(long cfHandle, long baseIteratorHandle);
+}
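To make the Read-Your-Own-Writes wiring above concrete, here is a minimal sketch
(hypothetical class name and database path): an uncommitted put buffered in the
WriteBatchWithIndex shows up alongside committed entries when iterating through
newIteratorWithBase(). Note that the wrapper takes ownership of the base iterator
via disOwnNativeHandle(), so only the wrapper is disposed:

    import org.rocksdb.Options;
    import org.rocksdb.RocksDB;
    import org.rocksdb.RocksDBException;
    import org.rocksdb.RocksIterator;
    import org.rocksdb.WriteBatchWithIndex;

    public class ReadYourOwnWritesSketch {
      public static void main(final String[] args) throws RocksDBException {
        RocksDB.loadLibrary();
        final Options opt = new Options().setCreateIfMissing(true);
        final RocksDB db = RocksDB.open(opt, "/tmp/wbwi-sketch");
        final WriteBatchWithIndex wbwi = new WriteBatchWithIndex(true);
        try {
          db.put("committed".getBytes(), "v1".getBytes());
          wbwi.put("pending".getBytes(), "v2".getBytes()); // buffered, not yet written
          final RocksIterator it = wbwi.newIteratorWithBase(db.newIterator());
          for (it.seekToFirst(); it.isValid(); it.next()) {
            // prints both "committed" and "pending"
            System.out.println(new String(it.key()));
          }
          // disposing the wrapper also releases the disowned base iterator
          it.dispose();
        } finally {
          wbwi.dispose();
          db.close();
          opt.dispose();
        }
      }
    }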
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/WriteOptions.java b/src/rocksdb/java/src/main/java/org/rocksdb/WriteOptions.java
new file mode 100644
index 0000000..c27dc9b
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/WriteOptions.java
@@ -0,0 +1,106 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+/**
+ * Options that control write operations.
+ *
+ * Note that developers should call WriteOptions.dispose() to release the
+ * C++ side memory before a WriteOptions instance goes out of scope.
+ */
+public class WriteOptions extends RocksObject {
+  /**
+   * Construct WriteOptions instance.
+   */
+  public WriteOptions() {
+    super();
+    newWriteOptions();
+  }
+
+  @Override protected void disposeInternal() {
+    assert(isInitialized());
+    disposeInternal(nativeHandle_);
+  }
+
+  /**
+   * If true, the write will be flushed from the operating system
+   * buffer cache (by calling WritableFile::Sync()) before the write
+   * is considered complete.  If this flag is true, writes will be
+   * slower.
+   *
+   * If this flag is false, and the machine crashes, some recent
+   * writes may be lost.  Note that if it is just the process that
+   * crashes (i.e., the machine does not reboot), no writes will be
+   * lost even if sync==false.
+   *
+   * In other words, a DB write with sync==false has similar
+   * crash semantics as the "write()" system call.  A DB write
+   * with sync==true has similar crash semantics to a "write()"
+   * system call followed by "fdatasync()".
+   *
+   * Default: false
+   *
+   * @param flag a boolean flag to indicate whether a write
+   *     should be synchronized.
+   * @return the instance of the current WriteOptions.
+   */
+  public WriteOptions setSync(final boolean flag) {
+    setSync(nativeHandle_, flag);
+    return this;
+  }
+
+  /**
+   * If true, the write will be flushed from the operating system
+   * buffer cache (by calling WritableFile::Sync()) before the write
+   * is considered complete.  If this flag is true, writes will be
+   * slower.
+   *
+   * If this flag is false, and the machine crashes, some recent
+   * writes may be lost.  Note that if it is just the process that
+   * crashes (i.e., the machine does not reboot), no writes will be
+   * lost even if sync==false.
+   *
+   * In other words, a DB write with sync==false has similar
+   * crash semantics as the "write()" system call.  A DB write
+   * with sync==true has similar crash semantics to a "write()"
+   * system call followed by "fdatasync()".
+   *
+   * @return boolean value indicating if sync is active.
+   */
+  public boolean sync() {
+    return sync(nativeHandle_);
+  }
+
+  /**
+   * If true, writes will not first go to the write ahead log,
+   * and the write may be lost after a crash.
+   *
+   * @param flag a boolean flag to specify whether to disable
+   *     write-ahead-log on writes.
+   * @return the instance of the current WriteOptions.
+   */
+  public WriteOptions setDisableWAL(final boolean flag) {
+    setDisableWAL(nativeHandle_, flag);
+    return this;
+  }
+
+  /**
+   * If true, writes will not first go to the write ahead log,
+   * and the write may be lost after a crash.
+   *
+   * @return boolean value indicating if WAL is disabled.
+   */
+  public boolean disableWAL() {
+    return disableWAL(nativeHandle_);
+  }
+
+  private native void newWriteOptions();
+  private native void setSync(long handle, boolean flag);
+  private native boolean sync(long handle);
+  private native void setDisableWAL(long handle, boolean flag);
+  private native boolean disableWAL(long handle);
+  private native void disposeInternal(long handle);
+}
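A minimal sketch of the two durability knobs above (hypothetical class name and
database path): setSync(true) buys write()+fdatasync() crash semantics at the cost
of latency, while setDisableWAL(true) trades crash safety for speed:

    import org.rocksdb.Options;
    import org.rocksdb.RocksDB;
    import org.rocksdb.RocksDBException;
    import org.rocksdb.WriteOptions;

    public class WriteOptionsSketch {
      public static void main(final String[] args) throws RocksDBException {
        RocksDB.loadLibrary();
        final Options opt = new Options().setCreateIfMissing(true);
        final RocksDB db = RocksDB.open(opt, "/tmp/writeopts-sketch");
        final WriteOptions durable = new WriteOptions().setSync(true);
        final WriteOptions fast = new WriteOptions().setDisableWAL(true);
        try {
          // survives a machine crash
          db.put(durable, "balance".getBytes(), "100".getBytes());
          // fastest, but may be lost on any crash: skips the write-ahead log
          db.put(fast, "cache-entry".getBytes(), "tmp".getBytes());
        } finally {
          fast.dispose();
          durable.dispose();
          db.close();
          opt.dispose();
        }
      }
    }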
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/util/Environment.java b/src/rocksdb/java/src/main/java/org/rocksdb/util/Environment.java
new file mode 100644
index 0000000..f65b92a
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/util/Environment.java
@@ -0,0 +1,59 @@
+package org.rocksdb.util;
+
+public class Environment {
+  private static final String OS = System.getProperty("os.name").toLowerCase();
+  private static final String ARCH = System.getProperty("os.arch").toLowerCase();
+
+  public static boolean isWindows() {
+    return (OS.contains("win"));
+  }
+
+  public static boolean isMac() {
+    return (OS.contains("mac"));
+  }
+
+  public static boolean isUnix() {
+    return (OS.contains("nix") ||
+        OS.contains("nux") ||
+        OS.contains("aix"));
+  }
+
+  public static boolean is64Bit() {
+    return ARCH.contains("64");
+  }
+
+  public static String getSharedLibraryName(final String name) {
+    return name + "jni";
+  }
+
+  public static String getSharedLibraryFileName(final String name) {
+    return appendLibOsSuffix("lib" + getSharedLibraryName(name), true);
+  }
+
+  public static String getJniLibraryName(final String name) {
+    if (isUnix()) {
+      final String arch = (is64Bit()) ? "64" : "32";
+      return String.format("%sjni-linux%s", name, arch);
+    } else if (isMac()) {
+      return String.format("%sjni-osx", name);
+    }
+    throw new UnsupportedOperationException("unsupported OS: " + OS);
+  }
+
+  public static String getJniLibraryFileName(final String name) {
+    return appendLibOsSuffix("lib" + getJniLibraryName(name), false);
+  }
+
+  private static String appendLibOsSuffix(final String libraryFileName, final boolean shared) {
+    if (isUnix()) {
+      return libraryFileName + ".so";
+    } else if (isMac()) {
+      return libraryFileName + (shared ? ".dylib" : ".jnilib");
+    }
+    throw new UnsupportedOperationException("unsupported OS: " + OS);
+  }
+
+  public static String getJniLibraryExtension() {
+    return (isMac()) ? ".jnilib" : ".so";
+  }
+}
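Environment is pure Java, so the name resolution above can be checked without
loading the native library; a minimal sketch (hypothetical class name) prints the
platform-specific file names. On Windows the JNI helpers deliberately throw
UnsupportedOperationException:

    import org.rocksdb.util.Environment;

    public class EnvironmentSketch {
      public static void main(final String[] args) {
        // e.g. librocksdbjni.so on Linux, librocksdbjni.dylib on OS X
        System.out.println(Environment.getSharedLibraryFileName("rocksdb"));
        // e.g. librocksdbjni-linux64.so or librocksdbjni-osx.jnilib
        System.out.println(Environment.getJniLibraryFileName("rocksdb"));
        System.out.println(Environment.getJniLibraryExtension());
      }
    }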
diff --git a/src/rocksdb/java/src/main/java/org/rocksdb/util/SizeUnit.java b/src/rocksdb/java/src/main/java/org/rocksdb/util/SizeUnit.java
new file mode 100644
index 0000000..8d50cd1
--- /dev/null
+++ b/src/rocksdb/java/src/main/java/org/rocksdb/util/SizeUnit.java
@@ -0,0 +1,16 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb.util;
+
+public class SizeUnit {
+  public static final long KB = 1024L;
+  public static final long MB = KB * KB;
+  public static final long GB = KB * MB;
+  public static final long TB = KB * GB;
+  public static final long PB = KB * TB;
+
+  private SizeUnit() {}
+}
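A minimal sketch of how these constants keep size-based options readable
(hypothetical class name; setWriteBufferSize() and setMaxWriteBufferNumber() are
existing Options setters):

    import org.rocksdb.Options;
    import org.rocksdb.RocksDB;
    import org.rocksdb.util.SizeUnit;

    public class SizeUnitSketch {
      public static void main(final String[] args) {
        RocksDB.loadLibrary();
        final Options opt = new Options()
            .setWriteBufferSize(8 * SizeUnit.KB)  // 8192 bytes, spelled readably
            .setMaxWriteBufferNumber(3);
        // ... open a database with these options ...
        opt.dispose();
      }
    }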
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/AbstractComparatorTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/AbstractComparatorTest.java
new file mode 100644
index 0000000..a776351
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/AbstractComparatorTest.java
@@ -0,0 +1,217 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+import java.io.IOException;
+import java.nio.file.*;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.rocksdb.Types.byteToInt;
+import static org.rocksdb.Types.intToByte;
+
+/**
+ * Abstract tests for both Comparator and DirectComparator
+ */
+public abstract class AbstractComparatorTest {
+
+  /**
+   * Get a comparator which will expect Integer keys
+   * and determine an ascending order
+   *
+   * @return An integer ascending order key comparator
+   */
+  public abstract AbstractComparator getAscendingIntKeyComparator();
+
+  /**
+   * Test which stores random keys into the database
+   * using the comparator from {@link #getAscendingIntKeyComparator()},
+   * then checks that these keys are read back in
+   * ascending order
+   *
+   * @param db_path A path where we can store database
+   *                files temporarily
+   *
+   * @throws java.io.IOException if an IO error happens.
+   * @throws RocksDBException if an error occurs within the native
+   *     part of the library.
+   */
+  public void testRoundtrip(final Path db_path) throws IOException, RocksDBException {
+
+    Options opt = null;
+    RocksDB db = null;
+
+    try {
+      opt = new Options();
+      opt.setCreateIfMissing(true);
+      opt.setComparator(getAscendingIntKeyComparator());
+
+      // store 10,000 random integer keys
+      final int ITERATIONS = 10000;
+
+      db = RocksDB.open(opt, db_path.toString());
+      final Random random = new Random();
+      for (int i = 0; i < ITERATIONS; i++) {
+        final byte[] key = intToByte(random.nextInt());
+        if (i > 0 && db.get(key) != null) { // does key already exist (avoid duplicates)
+          i--; // generate a different key
+        } else {
+          db.put(key, "value".getBytes());
+        }
+      }
+      db.close();
+
+      // re-open db and read from start to end
+      // integer keys should be in ascending
+      // order as defined by SimpleIntComparator
+      db = RocksDB.open(opt, db_path.toString());
+      final RocksIterator it = db.newIterator();
+      int lastKey = Integer.MIN_VALUE;
+      int count = 0;
+      for (it.seekToFirst(); it.isValid(); it.next()) {
+        final int thisKey = byteToInt(it.key());
+        assertThat(thisKey).isGreaterThan(lastKey);
+        lastKey = thisKey;
+        count++;
+      }
+      it.dispose();
+      db.close();
+
+      assertThat(count).isEqualTo(ITERATIONS);
+
+    } finally {
+      if (db != null) {
+        db.close();
+      }
+
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  /**
+   * Test which stores random keys into a column family
+   * in the database
+   * using the comparator from {@link #getAscendingIntKeyComparator()},
+   * then checks that these keys are read back in
+   * ascending order
+   *
+   * @param db_path A path where we can store database
+   *                files temporarily
+   *
+   * @throws java.io.IOException if an IO error happens.
+   * @throws RocksDBException if an error occurs within the native
+   *     part of the library.
+   */
+  public void testRoundtripCf(final Path db_path) throws IOException,
+      RocksDBException {
+
+    DBOptions opt = null;
+    RocksDB db = null;
+    List<ColumnFamilyDescriptor> cfDescriptors =
+        new ArrayList<>();
+    cfDescriptors.add(new ColumnFamilyDescriptor(
+        RocksDB.DEFAULT_COLUMN_FAMILY));
+    cfDescriptors.add(new ColumnFamilyDescriptor("new_cf".getBytes(),
+        new ColumnFamilyOptions().setComparator(
+            getAscendingIntKeyComparator())));
+    List<ColumnFamilyHandle> cfHandles = new ArrayList<>();
+    try {
+      opt = new DBOptions().
+          setCreateIfMissing(true).
+          setCreateMissingColumnFamilies(true);
+
+      // store 10,000 random integer keys
+      final int ITERATIONS = 10000;
+
+      db = RocksDB.open(opt, db_path.toString(), cfDescriptors, cfHandles);
+      assertThat(cfDescriptors.size()).isEqualTo(2);
+      assertThat(cfHandles.size()).isEqualTo(2);
+
+      final Random random = new Random();
+      for (int i = 0; i < ITERATIONS; i++) {
+        final byte[] key = intToByte(random.nextInt());
+        if (i > 0 && db.get(cfHandles.get(1), key) != null) {
+          // does key already exist (avoid duplicates)
+          i--; // generate a different key
+        } else {
+          db.put(cfHandles.get(1), key, "value".getBytes());
+        }
+      }
+      for (ColumnFamilyHandle handle : cfHandles) {
+        handle.dispose();
+      }
+      cfHandles.clear();
+      db.close();
+
+      // re-open db and read from start to end
+      // integer keys should be in ascending
+      // order as defined by SimpleIntComparator
+      db = RocksDB.open(opt, db_path.toString(), cfDescriptors, cfHandles);
+      assertThat(cfDescriptors.size()).isEqualTo(2);
+      assertThat(cfHandles.size()).isEqualTo(2);
+      final RocksIterator it = db.newIterator(cfHandles.get(1));
+      int lastKey = Integer.MIN_VALUE;
+      int count = 0;
+      for (it.seekToFirst(); it.isValid(); it.next()) {
+        final int thisKey = byteToInt(it.key());
+        assertThat(thisKey).isGreaterThan(lastKey);
+        lastKey = thisKey;
+        count++;
+      }
+
+      it.dispose();
+      for (ColumnFamilyHandle handle : cfHandles) {
+        handle.dispose();
+      }
+      cfHandles.clear();
+      db.close();
+      assertThat(count).isEqualTo(ITERATIONS);
+
+    } finally {
+      for (ColumnFamilyHandle handle : cfHandles) {
+        handle.dispose();
+      }
+
+      if (db != null) {
+        db.close();
+      }
+
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  /**
+   * Compares integer keys
+   * so that they are in ascending order
+   *
+   * @param a four bytes representing an integer key
+   * @param b four bytes representing an integer key
+   *
+   * @return negative if a < b, 0 if a == b, positive otherwise
+   */
+  protected final int compareIntKeys(final byte[] a, final byte[] b) {
+
+    final int iA = byteToInt(a);
+    final int iB = byteToInt(b);
+
+    // protect against int key calculation overflow
+    final double diff = (double)iA - iB;
+    final int result;
+    if (diff < Integer.MIN_VALUE) {
+      result = Integer.MIN_VALUE;
+    } else if(diff > Integer.MAX_VALUE) {
+      result = Integer.MAX_VALUE;
+    } else {
+      result = (int)diff;
+    }
+
+    return result;
+  }
+}
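Why compareIntKeys() widens to double before subtracting: plain int subtraction
can overflow and flip the sign of the comparison. A standalone sketch
(hypothetical class name) demonstrating the failure mode it guards against:

    public class OverflowSketch {
      public static void main(final String[] args) {
        final int a = Integer.MAX_VALUE;
        final int b = -1;
        // naive comparator result overflows to a negative number,
        // so a would incorrectly sort before b
        System.out.println(a - b);           // -2147483648
        // the clamped double difference keeps the correct sign
        final double diff = (double) a - b;  // 2147483648.0
        System.out.println(
            diff > Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) diff);
      }
    }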
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/BackupEngineTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/BackupEngineTest.java
new file mode 100644
index 0000000..48dff19
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/BackupEngineTest.java
@@ -0,0 +1,305 @@
+// Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+import java.util.List;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class BackupEngineTest {
+
+  @ClassRule
+  public static final RocksMemoryResource rocksMemoryResource =
+      new RocksMemoryResource();
+
+  @Rule
+  public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  @Rule
+  public TemporaryFolder backupFolder = new TemporaryFolder();
+
+  @Test
+  public void backupDb() throws RocksDBException {
+    Options opt = null;
+    RocksDB db = null;
+    try {
+      opt = new Options().setCreateIfMissing(true);
+      // Open empty database.
+      db = RocksDB.open(opt,
+          dbFolder.getRoot().getAbsolutePath());
+      // Fill database with some test values
+      prepareDatabase(db);
+      // Create two backups
+      BackupableDBOptions bopt = null;
+      try {
+        bopt = new BackupableDBOptions(
+          backupFolder.getRoot().getAbsolutePath());
+        try (final BackupEngine be = BackupEngine.open(opt.getEnv(), bopt)) {
+          be.createNewBackup(db, false);
+          be.createNewBackup(db, true);
+          verifyNumberOfValidBackups(be, 2);
+        }
+      } finally {
+        if (bopt != null) {
+          bopt.dispose();
+        }
+      }
+    } finally {
+      if (db != null) {
+        db.close();
+      }
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void deleteBackup() throws RocksDBException {
+    Options opt = null;
+    RocksDB db = null;
+    try {
+      opt = new Options().setCreateIfMissing(true);
+      // Open empty database.
+      db = RocksDB.open(opt,
+          dbFolder.getRoot().getAbsolutePath());
+      // Fill database with some test values
+      prepareDatabase(db);
+      // Create two backups
+      BackupableDBOptions bopt = null;
+      try {
+        bopt = new BackupableDBOptions(
+            backupFolder.getRoot().getAbsolutePath());
+        try (final BackupEngine be = BackupEngine.open(opt.getEnv(), bopt)) {
+          be.createNewBackup(db, false);
+          be.createNewBackup(db, true);
+          final List<BackupInfo> backupInfo =
+              verifyNumberOfValidBackups(be, 2);
+          // Delete the first backup
+          be.deleteBackup(backupInfo.get(0).backupId());
+          final List<BackupInfo> newBackupInfo =
+              verifyNumberOfValidBackups(be, 1);
+
+          // The second backup must remain.
+          assertThat(newBackupInfo.get(0).backupId()).
+              isEqualTo(backupInfo.get(1).backupId());
+        }
+      } finally {
+        if (bopt != null) {
+          bopt.dispose();
+        }
+      }
+    } finally {
+      if (db != null) {
+        db.close();
+      }
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void purgeOldBackups() throws RocksDBException {
+    Options opt = null;
+    RocksDB db = null;
+    try {
+      opt = new Options().setCreateIfMissing(true);
+      // Open empty database.
+      db = RocksDB.open(opt,
+          dbFolder.getRoot().getAbsolutePath());
+      // Fill database with some test values
+      prepareDatabase(db);
+      // Create four backups
+      BackupableDBOptions bopt = null;
+      try {
+        bopt = new BackupableDBOptions(
+            backupFolder.getRoot().getAbsolutePath());
+        try (final BackupEngine be = BackupEngine.open(opt.getEnv(), bopt)) {
+          be.createNewBackup(db, false);
+          be.createNewBackup(db, true);
+          be.createNewBackup(db, true);
+          be.createNewBackup(db, true);
+          final List<BackupInfo> backupInfo =
+              verifyNumberOfValidBackups(be, 4);
+          // Delete everything except the latest backup
+          be.purgeOldBackups(1);
+          final List<BackupInfo> newBackupInfo =
+              verifyNumberOfValidBackups(be, 1);
+          // The latest backup must remain.
+          assertThat(newBackupInfo.get(0).backupId()).
+              isEqualTo(backupInfo.get(3).backupId());
+        }
+      } finally {
+        if (bopt != null) {
+          bopt.dispose();
+        }
+      }
+    } finally {
+      if (db != null) {
+        db.close();
+      }
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void restoreLatestBackup()
+      throws RocksDBException {
+    Options opt = null;
+    RocksDB db = null;
+    try {
+      opt = new Options().setCreateIfMissing(true);
+      // Open empty database.
+      db = RocksDB.open(opt,
+          dbFolder.getRoot().getAbsolutePath());
+      // Fill database with some test values
+      prepareDatabase(db);
+      BackupableDBOptions bopt = null;
+      try {
+        bopt = new BackupableDBOptions(
+            backupFolder.getRoot().getAbsolutePath());
+        try (final BackupEngine be = BackupEngine.open(opt.getEnv(), bopt)) {
+          be.createNewBackup(db, true);
+          verifyNumberOfValidBackups(be, 1);
+          db.put("key1".getBytes(), "valueV2".getBytes());
+          db.put("key2".getBytes(), "valueV2".getBytes());
+          be.createNewBackup(db, true);
+          verifyNumberOfValidBackups(be, 2);
+          db.put("key1".getBytes(), "valueV3".getBytes());
+          db.put("key2".getBytes(), "valueV3".getBytes());
+          assertThat(new String(db.get("key1".getBytes()))).endsWith("V3");
+          assertThat(new String(db.get("key2".getBytes()))).endsWith("V3");
+
+          db.close();
+
+          verifyNumberOfValidBackups(be, 2);
+          // restore db from latest backup
+          be.restoreDbFromLatestBackup(dbFolder.getRoot().getAbsolutePath(),
+              dbFolder.getRoot().getAbsolutePath(),
+              new RestoreOptions(false));
+          // Open database again.
+          db = RocksDB.open(opt,
+              dbFolder.getRoot().getAbsolutePath());
+          // Values must have suffix V2 because of restoring latest backup.
+          assertThat(new String(db.get("key1".getBytes()))).endsWith("V2");
+          assertThat(new String(db.get("key2".getBytes()))).endsWith("V2");
+        }
+      } finally {
+        if (bopt != null) {
+          bopt.dispose();
+        }
+      }
+    } finally {
+      if (db != null) {
+        db.close();
+      }
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void restoreFromBackup()
+      throws RocksDBException {
+    Options opt = null;
+    RocksDB db = null;
+    try {
+      opt = new Options().setCreateIfMissing(true);
+      // Open empty database.
+      db = RocksDB.open(opt,
+          dbFolder.getRoot().getAbsolutePath());
+      // Fill database with some test values
+      prepareDatabase(db);
+      BackupableDBOptions bopt = null;
+      try {
+        bopt = new BackupableDBOptions(
+            backupFolder.getRoot().getAbsolutePath());
+        try (final BackupEngine be = BackupEngine.open(opt.getEnv(), bopt)) {
+          be.createNewBackup(db, true);
+          verifyNumberOfValidBackups(be, 1);
+          db.put("key1".getBytes(), "valueV2".getBytes());
+          db.put("key2".getBytes(), "valueV2".getBytes());
+          be.createNewBackup(db, true);
+          verifyNumberOfValidBackups(be, 2);
+          db.put("key1".getBytes(), "valueV3".getBytes());
+          db.put("key2".getBytes(), "valueV3".getBytes());
+          assertThat(new String(db.get("key1".getBytes()))).endsWith("V3");
+          assertThat(new String(db.get("key2".getBytes()))).endsWith("V3");
+
+          // close the database
+          db.close();
+
+          // restore the backup
+          List<BackupInfo> backupInfo = verifyNumberOfValidBackups(be, 2);
+          // restore db from first backup
+          be.restoreDbFromBackup(backupInfo.get(0).backupId(),
+              dbFolder.getRoot().getAbsolutePath(),
+              dbFolder.getRoot().getAbsolutePath(),
+              new RestoreOptions(false));
+          // Open database again.
+          db = RocksDB.open(opt,
+              dbFolder.getRoot().getAbsolutePath());
+          // Values must have suffix V1 because we restored the first backup.
+          assertThat(new String(db.get("key1".getBytes()))).endsWith("V1");
+          assertThat(new String(db.get("key2".getBytes()))).endsWith("V1");
+        }
+      } finally {
+        if (bopt != null) {
+          bopt.dispose();
+        }
+      }
+    } finally {
+      if (db != null) {
+        db.close();
+      }
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  /**
+   * Verify backups.
+   *
+   * @param be {@link BackupEngine} instance.
+   * @param expectedNumberOfBackups the expected number of valid backups
+   * @return the {@link BackupInfo} instances of the valid backups
+   * @throws RocksDBException thrown if an error occurs within the native
+   *     part of the library.
+   */
+  private List<BackupInfo> verifyNumberOfValidBackups(final BackupEngine be,
+      final int expectedNumberOfBackups) throws RocksDBException {
+    // Verify that backups exist
+    assertThat(be.getCorruptedBackups().length).
+        isEqualTo(0);
+    be.garbageCollect();
+    final List<BackupInfo> backupInfo = be.getBackupInfo();
+    assertThat(backupInfo.size()).
+        isEqualTo(expectedNumberOfBackups);
+    return backupInfo;
+  }
+
+  /**
+   * Fill database with some test values.
+   *
+   * @param db {@link RocksDB} instance.
+   * @throws RocksDBException thrown if an error occurs within the native
+   *     part of the library.
+   */
+  private void prepareDatabase(final RocksDB db)
+      throws RocksDBException {
+    db.put("key1".getBytes(), "valueV1".getBytes());
+    db.put("key2".getBytes(), "valueV1".getBytes());
+  }
+}
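Outside of JUnit, the same BackupEngine flow reduces to the following minimal
sketch (hypothetical class name and paths): back up, keep writing, then roll the
database directory back to the last backup:

    import org.rocksdb.BackupEngine;
    import org.rocksdb.BackupableDBOptions;
    import org.rocksdb.Options;
    import org.rocksdb.RestoreOptions;
    import org.rocksdb.RocksDB;
    import org.rocksdb.RocksDBException;

    public class BackupSketch {
      public static void main(final String[] args) throws RocksDBException {
        RocksDB.loadLibrary();
        final Options opt = new Options().setCreateIfMissing(true);
        final BackupableDBOptions bopt =
            new BackupableDBOptions("/tmp/backup-sketch-bk");
        RocksDB db = RocksDB.open(opt, "/tmp/backup-sketch-db");
        try (final BackupEngine be = BackupEngine.open(opt.getEnv(), bopt)) {
          db.put("key".getBytes(), "v1".getBytes());
          be.createNewBackup(db, true);   // flush memtables before backing up
          db.put("key".getBytes(), "v2".getBytes());
          db.close();
          be.restoreDbFromLatestBackup("/tmp/backup-sketch-db",
              "/tmp/backup-sketch-db", new RestoreOptions(false));
          db = RocksDB.open(opt, "/tmp/backup-sketch-db");
          System.out.println(new String(db.get("key".getBytes())));  // "v1"
        } finally {
          db.close();
          bopt.dispose();
          opt.dispose();
        }
      }
    }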
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/BackupableDBOptionsTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/BackupableDBOptionsTest.java
new file mode 100644
index 0000000..6fe3bd2
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/BackupableDBOptionsTest.java
@@ -0,0 +1,283 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.ExpectedException;
+
+import java.util.Random;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class BackupableDBOptionsTest {
+
+  private static final String ARBITRARY_PATH = "/tmp";
+
+  @ClassRule
+  public static final RocksMemoryResource rocksMemoryResource =
+      new RocksMemoryResource();
+
+  @Rule
+  public ExpectedException exception = ExpectedException.none();
+
+  public static final Random rand = PlatformRandomHelper.
+      getPlatformSpecificRandomFactory();
+
+  @Test
+  public void backupDir() {
+    BackupableDBOptions backupableDBOptions = null;
+    try {
+      backupableDBOptions = new BackupableDBOptions(ARBITRARY_PATH);
+      assertThat(backupableDBOptions.backupDir()).
+          isEqualTo(ARBITRARY_PATH);
+    } finally {
+      if (backupableDBOptions != null) {
+        backupableDBOptions.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void shareTableFiles() {
+    BackupableDBOptions backupableDBOptions = null;
+    try {
+      backupableDBOptions = new BackupableDBOptions(ARBITRARY_PATH);
+      boolean value = rand.nextBoolean();
+      backupableDBOptions.setShareTableFiles(value);
+      assertThat(backupableDBOptions.shareTableFiles()).
+          isEqualTo(value);
+    } finally {
+      if (backupableDBOptions != null) {
+        backupableDBOptions.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void sync() {
+    BackupableDBOptions backupableDBOptions = null;
+    try {
+      backupableDBOptions = new BackupableDBOptions(ARBITRARY_PATH);
+      boolean value = rand.nextBoolean();
+      backupableDBOptions.setSync(value);
+      assertThat(backupableDBOptions.sync()).isEqualTo(value);
+    } finally {
+      if (backupableDBOptions != null) {
+        backupableDBOptions.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void destroyOldData() {
+    BackupableDBOptions backupableDBOptions = null;
+    try {
+      backupableDBOptions = new BackupableDBOptions(ARBITRARY_PATH);
+      boolean value = rand.nextBoolean();
+      backupableDBOptions.setDestroyOldData(value);
+      assertThat(backupableDBOptions.destroyOldData()).
+          isEqualTo(value);
+    } finally {
+      if (backupableDBOptions != null) {
+        backupableDBOptions.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void backupLogFiles() {
+    BackupableDBOptions backupableDBOptions = null;
+    try {
+      backupableDBOptions = new BackupableDBOptions(ARBITRARY_PATH);
+      boolean value = rand.nextBoolean();
+      backupableDBOptions.setBackupLogFiles(value);
+      assertThat(backupableDBOptions.backupLogFiles()).
+          isEqualTo(value);
+    } finally {
+      if (backupableDBOptions != null) {
+        backupableDBOptions.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void backupRateLimit() {
+    BackupableDBOptions backupableDBOptions = null;
+    try {
+      backupableDBOptions = new BackupableDBOptions(ARBITRARY_PATH);
+      long value = Math.abs(rand.nextLong());
+      backupableDBOptions.setBackupRateLimit(value);
+      assertThat(backupableDBOptions.backupRateLimit()).
+          isEqualTo(value);
+      // negative will be mapped to 0
+      backupableDBOptions.setBackupRateLimit(-1);
+      assertThat(backupableDBOptions.backupRateLimit()).
+          isEqualTo(0);
+    } finally {
+      if (backupableDBOptions != null) {
+        backupableDBOptions.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void restoreRateLimit() {
+    BackupableDBOptions backupableDBOptions = null;
+    try {
+      backupableDBOptions = new BackupableDBOptions(ARBITRARY_PATH);
+      long value = Math.abs(rand.nextLong());
+      backupableDBOptions.setRestoreRateLimit(value);
+      assertThat(backupableDBOptions.restoreRateLimit()).
+          isEqualTo(value);
+      // negative will be mapped to 0
+      backupableDBOptions.setRestoreRateLimit(-1);
+      assertThat(backupableDBOptions.restoreRateLimit()).
+          isEqualTo(0);
+    } finally {
+      if (backupableDBOptions != null) {
+        backupableDBOptions.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void shareFilesWithChecksum() {
+    BackupableDBOptions backupableDBOptions = null;
+    try {
+      backupableDBOptions = new BackupableDBOptions(ARBITRARY_PATH);
+      boolean value = rand.nextBoolean();
+      backupableDBOptions.setShareFilesWithChecksum(value);
+      assertThat(backupableDBOptions.shareFilesWithChecksum()).
+          isEqualTo(value);
+    } finally {
+      if (backupableDBOptions != null) {
+        backupableDBOptions.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void failBackupDirIsNull() {
+    exception.expect(IllegalArgumentException.class);
+    new BackupableDBOptions(null);
+  }
+
+  @Test
+  public void failBackupDirIfDisposed(){
+    BackupableDBOptions options = setupUninitializedBackupableDBOptions(
+        exception);
+    options.backupDir();
+  }
+
+  @Test
+  public void failSetShareTableFilesIfDisposed(){
+    BackupableDBOptions options = setupUninitializedBackupableDBOptions(
+        exception);
+    options.setShareTableFiles(true);
+  }
+
+  @Test
+  public void failShareTableFilesIfDisposed(){
+    BackupableDBOptions options = setupUninitializedBackupableDBOptions(
+        exception);
+    options.shareTableFiles();
+  }
+
+  @Test
+  public void failSetSyncIfDisposed(){
+    BackupableDBOptions options = setupUninitializedBackupableDBOptions(
+        exception);
+    options.setSync(true);
+  }
+
+  @Test
+  public void failSyncIfDisposed(){
+    BackupableDBOptions options = setupUninitializedBackupableDBOptions(
+        exception);
+    options.sync();
+  }
+
+  @Test
+  public void failSetDestroyOldDataIfDisposed(){
+    BackupableDBOptions options = setupUninitializedBackupableDBOptions(
+        exception);
+    options.setDestroyOldData(true);
+  }
+
+  @Test
+  public void failDestroyOldDataIfDisposed(){
+    BackupableDBOptions options = setupUninitializedBackupableDBOptions(
+        exception);
+    options.destroyOldData();
+  }
+
+  @Test
+  public void failSetBackupLogFilesIfDisposed(){
+    BackupableDBOptions options = setupUninitializedBackupableDBOptions(
+        exception);
+    options.setBackupLogFiles(true);
+  }
+
+  @Test
+  public void failBackupLogFilesIfDisposed(){
+    BackupableDBOptions options = setupUninitializedBackupableDBOptions(
+        exception);
+    options.backupLogFiles();
+  }
+
+  @Test
+  public void failSetBackupRateLimitIfDisposed(){
+    BackupableDBOptions options = setupUninitializedBackupableDBOptions(
+        exception);
+    options.setBackupRateLimit(1);
+  }
+
+  @Test
+  public void failBackupRateLimitIfDisposed(){
+    BackupableDBOptions options = setupUninitializedBackupableDBOptions(
+        exception);
+    options.backupRateLimit();
+  }
+
+  @Test
+  public void failSetRestoreRateLimitIfDisposed(){
+    BackupableDBOptions options = setupUninitializedBackupableDBOptions(
+        exception);
+    options.setRestoreRateLimit(1);
+  }
+
+  @Test
+  public void failRestoreRateLimitIfDisposed(){
+    BackupableDBOptions options = setupUninitializedBackupableDBOptions(
+        exception);
+    options.restoreRateLimit();
+  }
+
+  @Test
+  public void failSetShareFilesWithChecksumIfDisposed(){
+    BackupableDBOptions options = setupUninitializedBackupableDBOptions(
+        exception);
+    options.setShareFilesWithChecksum(true);
+  }
+
+  @Test
+  public void failShareFilesWithChecksumIfDisposed(){
+    BackupableDBOptions options = setupUninitializedBackupableDBOptions(
+        exception);
+    options.shareFilesWithChecksum();
+  }
+
+  private BackupableDBOptions setupUninitializedBackupableDBOptions(
+      ExpectedException exception) {
+    BackupableDBOptions backupableDBOptions =
+        new BackupableDBOptions(ARBITRARY_PATH);
+    backupableDBOptions.dispose();
+    exception.expect(AssertionError.class);
+    return backupableDBOptions;
+  }
+}
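The failXxxIfDisposed tests above all rely on the isInitialized() assertion in
the option accessors, which only fires when the JVM runs with assertions enabled
(-ea), as the RocksJava test suite does. A minimal sketch (hypothetical class
name) of the same use-after-dispose behavior outside JUnit:

    import org.rocksdb.BackupableDBOptions;
    import org.rocksdb.RocksDB;

    public class DisposeSketch {
      public static void main(final String[] args) {
        RocksDB.loadLibrary();
        final BackupableDBOptions bopt = new BackupableDBOptions("/tmp");
        bopt.dispose();           // releases the native handle
        try {
          bopt.backupDir();       // further use trips the isInitialized() assert
        } catch (final AssertionError expected) {
          System.out.println("use after dispose() is rejected: " + expected);
        }
      }
    }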
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/BackupableDBTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/BackupableDBTest.java
new file mode 100644
index 0000000..3f358bd
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/BackupableDBTest.java
@@ -0,0 +1,425 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+import java.util.List;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class BackupableDBTest {
+
+  @ClassRule
+  public static final RocksMemoryResource rocksMemoryResource =
+      new RocksMemoryResource();
+
+  @Rule
+  public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  @Rule
+  public TemporaryFolder backupFolder = new TemporaryFolder();
+
+  @Test
+  public void backupDb() throws RocksDBException {
+    Options opt = null;
+    BackupableDBOptions bopt = null;
+    BackupableDB bdb = null;
+    try {
+      opt = new Options().setCreateIfMissing(true);
+      bopt = new BackupableDBOptions(
+          backupFolder.getRoot().getAbsolutePath());
+      assertThat(bopt.backupDir()).isEqualTo(
+          backupFolder.getRoot().getAbsolutePath());
+      // Open empty database.
+      bdb = BackupableDB.open(opt, bopt,
+          dbFolder.getRoot().getAbsolutePath());
+      // Fill database with some test values
+      prepareDatabase(bdb);
+      // Create two backups
+      bdb.createNewBackup(false);
+      bdb.createNewBackup(true);
+      verifyNumberOfValidBackups(bdb, 2);
+    } finally {
+      if (bdb != null) {
+        bdb.close();
+      }
+      if (bopt != null) {
+        bopt.dispose();
+      }
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void deleteBackup() throws RocksDBException {
+    Options opt = null;
+    BackupableDBOptions bopt = null;
+    BackupableDB bdb = null;
+    try {
+      opt = new Options().setCreateIfMissing(true);
+      bopt = new BackupableDBOptions(
+          backupFolder.getRoot().getAbsolutePath());
+      assertThat(bopt.backupDir()).isEqualTo(
+          backupFolder.getRoot().getAbsolutePath());
+      // Open empty database.
+      bdb = BackupableDB.open(opt, bopt,
+          dbFolder.getRoot().getAbsolutePath());
+      // Fill database with some test values
+      prepareDatabase(bdb);
+      // Create two backups
+      bdb.createNewBackup(false);
+      bdb.createNewBackup(true);
+      List<BackupInfo> backupInfo =
+          verifyNumberOfValidBackups(bdb, 2);
+      // Delete the first backup
+      bdb.deleteBackup(backupInfo.get(0).backupId());
+      List<BackupInfo> newBackupInfo =
+          verifyNumberOfValidBackups(bdb, 1);
+      // The second backup must remain.
+      assertThat(newBackupInfo.get(0).backupId()).
+          isEqualTo(backupInfo.get(1).backupId());
+    } finally {
+      if (bdb != null) {
+        bdb.close();
+      }
+      if (bopt != null) {
+        bopt.dispose();
+      }
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void deleteBackupWithRestoreBackupableDB()
+      throws RocksDBException {
+    Options opt = null;
+    BackupableDBOptions bopt = null;
+    BackupableDB bdb = null;
+    RestoreBackupableDB rdb = null;
+    try {
+      opt = new Options().setCreateIfMissing(true);
+      bopt = new BackupableDBOptions(
+          backupFolder.getRoot().getAbsolutePath());
+      assertThat(bopt.backupDir()).isEqualTo(
+          backupFolder.getRoot().getAbsolutePath());
+      // Open empty database.
+      bdb = BackupableDB.open(opt, bopt,
+          dbFolder.getRoot().getAbsolutePath());
+      // Fill database with some test values
+      prepareDatabase(bdb);
+      // Create two backups
+      bdb.createNewBackup(false);
+      bdb.createNewBackup(true);
+      List<BackupInfo> backupInfo =
+          verifyNumberOfValidBackups(bdb, 2);
+      // init RestoreBackupableDB
+      rdb = new RestoreBackupableDB(bopt);
+      // Delete the first backup
+      rdb.deleteBackup(backupInfo.get(0).backupId());
+      // Fetch backup info using RestoreBackupableDB
+      List<BackupInfo> newBackupInfo = verifyNumberOfValidBackups(rdb, 1);
+      // The second backup must remain.
+      assertThat(newBackupInfo.get(0).backupId()).
+          isEqualTo(backupInfo.get(1).backupId());
+    } finally {
+      if (bdb != null) {
+        bdb.close();
+      }
+      if (rdb != null) {
+        rdb.dispose();
+      }
+      if (bopt != null) {
+        bopt.dispose();
+      }
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void purgeOldBackups() throws RocksDBException {
+    Options opt = null;
+    BackupableDBOptions bopt = null;
+    BackupableDB bdb = null;
+    try {
+      opt = new Options().setCreateIfMissing(true);
+      bopt = new BackupableDBOptions(
+          backupFolder.getRoot().getAbsolutePath());
+      assertThat(bopt.backupDir()).isEqualTo(
+          backupFolder.getRoot().getAbsolutePath());
+      // Open empty database.
+      bdb = BackupableDB.open(opt, bopt,
+          dbFolder.getRoot().getAbsolutePath());
+      // Fill database with some test values
+      prepareDatabase(bdb);
+      // Create two backups
+      bdb.createNewBackup(false);
+      bdb.createNewBackup(true);
+      bdb.createNewBackup(true);
+      bdb.createNewBackup(true);
+      List<BackupInfo> backupInfo =
+          verifyNumberOfValidBackups(bdb, 4);
+      // Delete everything except the latest backup
+      bdb.purgeOldBackups(1);
+      List<BackupInfo> newBackupInfo =
+          verifyNumberOfValidBackups(bdb, 1);
+      // The latest backup must remain.
+      assertThat(newBackupInfo.get(0).backupId()).
+          isEqualTo(backupInfo.get(3).backupId());
+    } finally {
+      if (bdb != null) {
+        bdb.close();
+      }
+      if (bopt != null) {
+        bopt.dispose();
+      }
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void purgeOldBackupsWithRestoreBackupableDb()
+      throws RocksDBException {
+    Options opt = null;
+    BackupableDBOptions bopt = null;
+    BackupableDB bdb = null;
+    RestoreBackupableDB rdb = null;
+    try {
+      opt = new Options().setCreateIfMissing(true);
+      bopt = new BackupableDBOptions(
+          backupFolder.getRoot().getAbsolutePath());
+      assertThat(bopt.backupDir()).isEqualTo(
+          backupFolder.getRoot().getAbsolutePath());
+      // Open empty database.
+      bdb = BackupableDB.open(opt, bopt,
+          dbFolder.getRoot().getAbsolutePath());
+      // Fill database with some test values
+      prepareDatabase(bdb);
+      // Create two backups
+      bdb.createNewBackup(false);
+      bdb.createNewBackup(true);
+      bdb.createNewBackup(true);
+      bdb.createNewBackup(true);
+      List<BackupInfo> infos = verifyNumberOfValidBackups(bdb, 4);
+      assertThat(infos.get(1).size()).
+          isEqualTo(infos.get(2).size());
+      assertThat(infos.get(1).numberFiles()).
+          isEqualTo(infos.get(2).numberFiles());
+      long maxTimeBeforePurge = Long.MIN_VALUE;
+      for (BackupInfo backupInfo : infos) {
+        if (maxTimeBeforePurge < backupInfo.timestamp()) {
+          maxTimeBeforePurge = backupInfo.timestamp();
+        }
+      }
+      // init RestoreBackupableDB
+      rdb = new RestoreBackupableDB(bopt);
+      // the same number of backups must
+      // exist using RestoreBackupableDB.
+      verifyNumberOfValidBackups(rdb, 4);
+      rdb.purgeOldBackups(1);
+      infos = verifyNumberOfValidBackups(rdb, 1);
+      assertThat(infos.get(0).timestamp()).
+          isEqualTo(maxTimeBeforePurge);
+    } finally {
+      if (bdb != null) {
+        bdb.close();
+      }
+      if (rdb != null) {
+        rdb.dispose();
+      }
+      if (bopt != null) {
+        bopt.dispose();
+      }
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void restoreLatestBackup()
+      throws RocksDBException {
+    Options opt = null;
+    BackupableDBOptions bopt = null;
+    BackupableDB bdb = null;
+    RestoreBackupableDB rdb = null;
+    try {
+      opt = new Options().setCreateIfMissing(true);
+      bopt = new BackupableDBOptions(
+          backupFolder.getRoot().getAbsolutePath());
+      assertThat(bopt.backupDir()).isEqualTo(
+          backupFolder.getRoot().getAbsolutePath());
+      // Open empty database.
+      bdb = BackupableDB.open(opt, bopt,
+          dbFolder.getRoot().getAbsolutePath());
+      // Fill database with some test values
+      prepareDatabase(bdb);
+      bdb.createNewBackup(true);
+      verifyNumberOfValidBackups(bdb, 1);
+      bdb.put("key1".getBytes(), "valueV2".getBytes());
+      bdb.put("key2".getBytes(), "valueV2".getBytes());
+      bdb.createNewBackup(true);
+      verifyNumberOfValidBackups(bdb, 2);
+      bdb.put("key1".getBytes(), "valueV3".getBytes());
+      bdb.put("key2".getBytes(), "valueV3".getBytes());
+      assertThat(new String(bdb.get("key1".getBytes()))).endsWith("V3");
+      assertThat(new String(bdb.get("key2".getBytes()))).endsWith("V3");
+      bdb.close();
+
+      // init RestoreBackupableDB
+      rdb = new RestoreBackupableDB(bopt);
+      verifyNumberOfValidBackups(rdb, 2);
+      // restore db from latest backup
+      rdb.restoreDBFromLatestBackup(dbFolder.getRoot().getAbsolutePath(),
+          dbFolder.getRoot().getAbsolutePath(),
+          new RestoreOptions(false));
+      // Open database again.
+      bdb = BackupableDB.open(opt, bopt,
+          dbFolder.getRoot().getAbsolutePath());
+      // Values must have suffix V2 because of restoring latest backup.
+      assertThat(new String(bdb.get("key1".getBytes()))).endsWith("V2");
+      assertThat(new String(bdb.get("key2".getBytes()))).endsWith("V2");
+    } finally {
+      if (bdb != null) {
+        bdb.close();
+      }
+      if (rdb != null) {
+        rdb.dispose();
+      }
+      if (bopt != null) {
+        bopt.dispose();
+      }
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void restoreFromBackup()
+      throws RocksDBException {
+    Options opt = null;
+    BackupableDBOptions bopt = null;
+    BackupableDB bdb = null;
+    RestoreBackupableDB rdb = null;
+    try {
+      opt = new Options().setCreateIfMissing(true);
+      bopt = new BackupableDBOptions(
+          backupFolder.getRoot().getAbsolutePath());
+      assertThat(bopt.backupDir()).isEqualTo(
+          backupFolder.getRoot().getAbsolutePath());
+      // Open empty database.
+      bdb = BackupableDB.open(opt, bopt,
+          dbFolder.getRoot().getAbsolutePath());
+      // Fill database with some test values
+      prepareDatabase(bdb);
+      bdb.createNewBackup(true);
+      verifyNumberOfValidBackups(bdb, 1);
+      bdb.put("key1".getBytes(), "valueV2".getBytes());
+      bdb.put("key2".getBytes(), "valueV2".getBytes());
+      bdb.createNewBackup(true);
+      verifyNumberOfValidBackups(bdb, 2);
+      bdb.put("key1".getBytes(), "valueV3".getBytes());
+      bdb.put("key2".getBytes(), "valueV3".getBytes());
+      assertThat(new String(bdb.get("key1".getBytes()))).endsWith("V3");
+      assertThat(new String(bdb.get("key2".getBytes()))).endsWith("V3");
+      bdb.close();
+
+      // init RestoreBackupableDB
+      rdb = new RestoreBackupableDB(bopt);
+      List<BackupInfo> backupInfo = verifyNumberOfValidBackups(rdb, 2);
+      // restore db from first backup
+      rdb.restoreDBFromBackup(backupInfo.get(0).backupId(),
+          dbFolder.getRoot().getAbsolutePath(),
+          dbFolder.getRoot().getAbsolutePath(),
+          new RestoreOptions(false));
+      // Open database again.
+      bdb = BackupableDB.open(opt, bopt,
+          dbFolder.getRoot().getAbsolutePath());
+      // Values must have suffix V1 because we restored the first backup.
+      assertThat(new String(bdb.get("key1".getBytes()))).endsWith("V1");
+      assertThat(new String(bdb.get("key2".getBytes()))).endsWith("V1");
+    } finally {
+      if (bdb != null) {
+        bdb.close();
+      }
+      if (rdb != null) {
+        rdb.dispose();
+      }
+      if (bopt != null) {
+        bopt.dispose();
+      }
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  /**
+   * Verify backups.
+   *
+   * @param bdb {@link BackupableDB} instance.
+   * @param expectedNumberOfBackups the expected number of valid backups
+   * @return the {@link BackupInfo} instances of the valid backups
+   * @throws RocksDBException thrown if an error occurs within the native
+   *     part of the library.
+   */
+  private List<BackupInfo> verifyNumberOfValidBackups(BackupableDB bdb,
+     int expectedNumberOfBackups) throws RocksDBException {
+    // Verify that backups exist
+    assertThat(bdb.getCorruptedBackups().length).
+        isEqualTo(0);
+    bdb.garbageCollect();
+    List<BackupInfo> backupInfo = bdb.getBackupInfos();
+    assertThat(backupInfo.size()).
+        isEqualTo(expectedNumberOfBackups);
+    return backupInfo;
+  }
+
+  /**
+   * Verify backups.
+   *
+   * @param rdb {@link RestoreBackupableDB} instance.
+   * @param expectedNumberOfBackups the expected number of valid backups
+   * @return the {@link BackupInfo} instances of the valid backups
+   * @throws RocksDBException thrown if an error occurs within the native
+   *     part of the library.
+   */
+  private List<BackupInfo> verifyNumberOfValidBackups(
+      RestoreBackupableDB rdb, int expectedNumberOfBackups)
+      throws RocksDBException {
+    // Verify that backups exist
+    assertThat(rdb.getCorruptedBackups().length).
+        isEqualTo(0);
+    rdb.garbageCollect();
+    List<BackupInfo> backupInfo = rdb.getBackupInfos();
+    assertThat(backupInfo.size()).
+        isEqualTo(expectedNumberOfBackups);
+    return backupInfo;
+  }
+
+  /**
+   * Fill database with some test values.
+   *
+   * @param db {@link RocksDB} instance.
+   * @throws RocksDBException thrown if an error occurs within the native
+   *     part of the library.
+   */
+  private void prepareDatabase(RocksDB db)
+      throws RocksDBException {
+    db.put("key1".getBytes(), "valueV1".getBytes());
+    db.put("key2".getBytes(), "valueV1".getBytes());
+  }
+}
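Unlike BackupableDB, RestoreBackupableDB restores without opening the database
itself, which is exactly how the tests above use it. A minimal sketch
(hypothetical class name and paths; it assumes backups already exist under the
backup directory):

    import org.rocksdb.BackupableDBOptions;
    import org.rocksdb.RestoreBackupableDB;
    import org.rocksdb.RestoreOptions;
    import org.rocksdb.RocksDB;
    import org.rocksdb.RocksDBException;

    public class RestoreSketch {
      public static void main(final String[] args) throws RocksDBException {
        RocksDB.loadLibrary();
        final BackupableDBOptions bopt =
            new BackupableDBOptions("/tmp/backup-sketch-bk");
        final RestoreBackupableDB rdb = new RestoreBackupableDB(bopt);
        final RestoreOptions ropt = new RestoreOptions(false); // drop log files
        try {
          // db dir and wal dir point at the same location here
          rdb.restoreDBFromLatestBackup("/tmp/restore-sketch-db",
              "/tmp/restore-sketch-db", ropt);
        } finally {
          ropt.dispose();
          rdb.dispose();
          bopt.dispose();
        }
      }
    }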
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java
new file mode 100644
index 0000000..aacf440
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java
@@ -0,0 +1,185 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+import org.junit.ClassRule;
+import org.junit.Test;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class BlockBasedTableConfigTest {
+
+  @ClassRule
+  public static final RocksMemoryResource rocksMemoryResource =
+      new RocksMemoryResource();
+
+  @Test
+  public void noBlockCache() {
+    BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig();
+    blockBasedTableConfig.setNoBlockCache(true);
+    assertThat(blockBasedTableConfig.noBlockCache()).isTrue();
+  }
+
+  @Test
+  public void blockCacheSize() {
+    BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig();
+    blockBasedTableConfig.setBlockCacheSize(8 * 1024);
+    assertThat(blockBasedTableConfig.blockCacheSize()).
+        isEqualTo(8 * 1024);
+  }
+
+  @Test
+  public void blockSizeDeviation() {
+    BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig();
+    blockBasedTableConfig.setBlockSizeDeviation(12);
+    assertThat(blockBasedTableConfig.blockSizeDeviation()).
+        isEqualTo(12);
+  }
+
+  @Test
+  public void blockRestartInterval() {
+    BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig();
+    blockBasedTableConfig.setBlockRestartInterval(15);
+    assertThat(blockBasedTableConfig.blockRestartInterval()).
+        isEqualTo(15);
+  }
+
+  @Test
+  public void wholeKeyFiltering() {
+    BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig();
+    blockBasedTableConfig.setWholeKeyFiltering(false);
+    assertThat(blockBasedTableConfig.wholeKeyFiltering()).
+        isFalse();
+  }
+
+  @Test
+  public void cacheIndexAndFilterBlocks() {
+    BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig();
+    blockBasedTableConfig.setCacheIndexAndFilterBlocks(true);
+    assertThat(blockBasedTableConfig.cacheIndexAndFilterBlocks()).
+        isTrue();
+  }
+
+  @Test
+  public void hashIndexAllowCollision() {
+    BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig();
+    blockBasedTableConfig.setHashIndexAllowCollision(false);
+    assertThat(blockBasedTableConfig.hashIndexAllowCollision()).
+        isFalse();
+  }
+
+  @Test
+  public void blockCacheCompressedSize() {
+    BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig();
+    blockBasedTableConfig.setBlockCacheCompressedSize(40);
+    assertThat(blockBasedTableConfig.blockCacheCompressedSize()).
+        isEqualTo(40);
+  }
+
+  @Test
+  public void checksumType() {
+    BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig();
+    assertThat(ChecksumType.values().length).isEqualTo(3);
+    assertThat(ChecksumType.valueOf("kxxHash")).
+        isEqualTo(ChecksumType.kxxHash);
+    blockBasedTableConfig.setChecksumType(ChecksumType.kNoChecksum);
+    blockBasedTableConfig.setChecksumType(ChecksumType.kxxHash);
+    assertThat(blockBasedTableConfig.checksumType()).
+        isEqualTo(ChecksumType.kxxHash);
+  }
+
+  @Test
+  public void indexType() {
+    BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig();
+    assertThat(IndexType.values().length).isEqualTo(2);
+    blockBasedTableConfig.setIndexType(IndexType.kHashSearch);
+    assertThat(blockBasedTableConfig.indexType()).
+        isEqualTo(IndexType.kHashSearch);
+    assertThat(IndexType.valueOf("kBinarySearch")).isNotNull();
+    blockBasedTableConfig.setIndexType(IndexType.valueOf("kBinarySearch"));
+    assertThat(blockBasedTableConfig.indexType()).
+        isEqualTo(IndexType.kBinarySearch);
+  }
+
+  @Test
+  public void blockCacheCompressedNumShardBits() {
+    BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig();
+    blockBasedTableConfig.setBlockCacheCompressedNumShardBits(4);
+    assertThat(blockBasedTableConfig.blockCacheCompressedNumShardBits()).
+        isEqualTo(4);
+  }
+
+  @Test
+  public void cacheNumShardBits() {
+    BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig();
+    blockBasedTableConfig.setCacheNumShardBits(5);
+    assertThat(blockBasedTableConfig.cacheNumShardBits()).
+        isEqualTo(5);
+  }
+
+  @Test
+  public void blockSize() {
+    BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig();
+    blockBasedTableConfig.setBlockSize(10);
+    assertThat(blockBasedTableConfig.blockSize()).isEqualTo(10);
+  }
+
+  @Test
+  public void blockBasedTableWithFilter() {
+    Options options = null;
+    try {
+      options = new Options();
+      options.setTableFormatConfig(
+          new BlockBasedTableConfig().setFilter(
+              new BloomFilter(10)));
+      assertThat(options.tableFactoryName()).
+          isEqualTo("BlockBasedTable");
+    } finally {
+      if (options != null) {
+        options.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void blockBasedTableWithoutFilter() {
+    Options options = null;
+    try {
+      options = new Options();
+      options.setTableFormatConfig(
+          new BlockBasedTableConfig().setFilter(null));
+      assertThat(options.tableFactoryName()).
+          isEqualTo("BlockBasedTable");
+    } finally {
+      if (options != null) {
+        options.dispose();
+      }
+    }
+  }
+
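+  // Format versions 0 through 2 round-trip; out-of-range values are
+  // rejected by an assertion, as the two failure tests below verify.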
+  @Test
+  public void blockBasedTableFormatVersion() {
+    BlockBasedTableConfig config = new BlockBasedTableConfig();
+    for (int version = 0; version <= 2; version++) {
+      config.setFormatVersion(version);
+      assertThat(config.formatVersion()).isEqualTo(version);
+    }
+  }
+
+  @Test(expected = AssertionError.class)
+  public void blockBasedTableFormatVersionFailNegative() {
+    BlockBasedTableConfig config = new BlockBasedTableConfig();
+    config.setFormatVersion(-1);
+  }
+
+  @Test(expected = AssertionError.class)
+  public void blockBasedTableFormatVersionFailIllegalVersion() {
+    BlockBasedTableConfig config = new BlockBasedTableConfig();
+    config.setFormatVersion(3);
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/CheckPointTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/CheckPointTest.java
new file mode 100644
index 0000000..3081e58
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/CheckPointTest.java
@@ -0,0 +1,97 @@
+package org.rocksdb;
+
+
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class CheckPointTest {
+
+  @ClassRule
+  public static final RocksMemoryResource rocksMemoryResource =
+      new RocksMemoryResource();
+
+  @Rule
+  public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  @Rule
+  public TemporaryFolder checkpointFolder = new TemporaryFolder();
+
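+  // Takes two checkpoints at different points in time, then reopens each
+  // snapshot as a standalone database and verifies it contains exactly the
+  // keys that existed when that checkpoint was created.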
+  @Test
+  public void checkPoint() throws RocksDBException {
+    RocksDB db = null;
+    Options options = null;
+    Checkpoint checkpoint = null;
+    try {
+      options = new Options().
+          setCreateIfMissing(true);
+      db = RocksDB.open(options,
+          dbFolder.getRoot().getAbsolutePath());
+      db.put("key".getBytes(), "value".getBytes());
+      checkpoint = Checkpoint.create(db);
+      checkpoint.createCheckpoint(checkpointFolder.
+          getRoot().getAbsolutePath() + "/snapshot1");
+      db.put("key2".getBytes(), "value2".getBytes());
+      checkpoint.createCheckpoint(checkpointFolder.
+          getRoot().getAbsolutePath() + "/snapshot2");
+      db.close();
+      db = RocksDB.open(options,
+          checkpointFolder.getRoot().getAbsolutePath() +
+              "/snapshot1");
+      assertThat(new String(db.get("key".getBytes()))).
+          isEqualTo("value");
+      assertThat(db.get("key2".getBytes())).isNull();
+      db.close();
+      db = RocksDB.open(options,
+          checkpointFolder.getRoot().getAbsolutePath() +
+              "/snapshot2");
+      assertThat(new String(db.get("key".getBytes()))).
+          isEqualTo("value");
+      assertThat(new String(db.get("key2".getBytes()))).
+          isEqualTo("value2");
+    } finally {
+      if (db != null) {
+        db.close();
+      }
+      if (options != null) {
+        options.dispose();
+      }
+      if (checkpoint != null) {
+        checkpoint.dispose();
+      }
+    }
+  }
+
+  @Test(expected = IllegalArgumentException.class)
+  public void failIfDbIsNull() {
+    Checkpoint.create(null);
+  }
+
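+  // Creating a Checkpoint from an already-disposed database handle fails.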
+  @Test(expected = IllegalStateException.class)
+  public void failIfDbNotInitialized() throws RocksDBException {
+    RocksDB db = RocksDB.open(dbFolder.getRoot().getAbsolutePath());
+    db.dispose();
+    Checkpoint.create(db);
+  }
+
+  @Test(expected = RocksDBException.class)
+  public void failWithIllegalPath() throws RocksDBException {
+    RocksDB db = null;
+    Checkpoint checkpoint = null;
+    try {
+      db = RocksDB.open(dbFolder.getRoot().getAbsolutePath());
+      checkpoint = Checkpoint.create(db);
+      checkpoint.createCheckpoint("/Z:///:\\C:\\TZ/-");
+    } finally {
+      if (db != null) {
+        db.close();
+      }
+      if (checkpoint != null) {
+        checkpoint.dispose();
+      }
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/ColumnFamilyOptionsTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/ColumnFamilyOptionsTest.java
new file mode 100644
index 0000000..af72161
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/ColumnFamilyOptionsTest.java
@@ -0,0 +1,745 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+import org.junit.ClassRule;
+import org.junit.Test;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Properties;
+import java.util.Random;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class ColumnFamilyOptionsTest {
+
+  @ClassRule
+  public static final RocksMemoryResource rocksMemoryResource =
+      new RocksMemoryResource();
+
+  public static final Random rand = PlatformRandomHelper.
+      getPlatformSpecificRandomFactory();
+
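+  // ColumnFamilyOptions can be built from String-valued Properties; the
+  // factory returns null when no recognized option key is present.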
+  @Test
+  public void getColumnFamilyOptionsFromProps() {
+    ColumnFamilyOptions opt = null;
+    try {
+      // setup sample properties
+      Properties properties = new Properties();
+      properties.put("write_buffer_size", "112");
+      properties.put("max_write_buffer_number", "13");
+      opt = ColumnFamilyOptions.
+          getColumnFamilyOptionsFromProps(properties);
+      assertThat(opt).isNotNull();
+      assertThat(String.valueOf(opt.writeBufferSize())).
+          isEqualTo(properties.get("write_buffer_size"));
+      assertThat(String.valueOf(opt.maxWriteBufferNumber())).
+          isEqualTo(properties.get("max_write_buffer_number"));
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void failColumnFamilyOptionsFromPropsWithIllegalValue() {
+    ColumnFamilyOptions opt = null;
+    try {
+      // setup sample properties
+      Properties properties = new Properties();
+      properties.put("tomato", "1024");
+      properties.put("burger", "2");
+      opt = ColumnFamilyOptions.
+          getColumnFamilyOptionsFromProps(properties);
+      assertThat(opt).isNull();
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test(expected = IllegalArgumentException.class)
+  public void failColumnFamilyOptionsFromPropsWithNullValue() {
+    ColumnFamilyOptions.getColumnFamilyOptionsFromProps(null);
+  }
+
+  @Test(expected = IllegalArgumentException.class)
+  public void failColumnFamilyOptionsFromPropsWithEmptyProps() {
+    ColumnFamilyOptions.getColumnFamilyOptionsFromProps(
+        new Properties());
+  }
+
+  @Test
+  public void writeBufferSize() throws RocksDBException {
+    ColumnFamilyOptions opt = null;
+    try {
+      opt = new ColumnFamilyOptions();
+      long longValue = rand.nextLong();
+      opt.setWriteBufferSize(longValue);
+      assertThat(opt.writeBufferSize()).isEqualTo(longValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void maxWriteBufferNumber() {
+    ColumnFamilyOptions opt = null;
+    try {
+      opt = new ColumnFamilyOptions();
+      int intValue = rand.nextInt();
+      opt.setMaxWriteBufferNumber(intValue);
+      assertThat(opt.maxWriteBufferNumber()).isEqualTo(intValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void minWriteBufferNumberToMerge() {
+    ColumnFamilyOptions opt = null;
+    try {
+      opt = new ColumnFamilyOptions();
+      int intValue = rand.nextInt();
+      opt.setMinWriteBufferNumberToMerge(intValue);
+      assertThat(opt.minWriteBufferNumberToMerge()).isEqualTo(intValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void numLevels() {
+    ColumnFamilyOptions opt = null;
+    try {
+      opt = new ColumnFamilyOptions();
+      int intValue = rand.nextInt();
+      opt.setNumLevels(intValue);
+      assertThat(opt.numLevels()).isEqualTo(intValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void levelZeroFileNumCompactionTrigger() {
+    ColumnFamilyOptions opt = null;
+    try {
+      opt = new ColumnFamilyOptions();
+      int intValue = rand.nextInt();
+      opt.setLevelZeroFileNumCompactionTrigger(intValue);
+      assertThat(opt.levelZeroFileNumCompactionTrigger()).isEqualTo(intValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void levelZeroSlowdownWritesTrigger() {
+    ColumnFamilyOptions opt = null;
+    try {
+      opt = new ColumnFamilyOptions();
+      int intValue = rand.nextInt();
+      opt.setLevelZeroSlowdownWritesTrigger(intValue);
+      assertThat(opt.levelZeroSlowdownWritesTrigger()).isEqualTo(intValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void levelZeroStopWritesTrigger() {
+    ColumnFamilyOptions opt = null;
+    try {
+      opt = new ColumnFamilyOptions();
+      int intValue = rand.nextInt();
+      opt.setLevelZeroStopWritesTrigger(intValue);
+      assertThat(opt.levelZeroStopWritesTrigger()).isEqualTo(intValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void targetFileSizeBase() {
+    ColumnFamilyOptions opt = null;
+    try {
+      opt = new ColumnFamilyOptions();
+      long longValue = rand.nextLong();
+      opt.setTargetFileSizeBase(longValue);
+      assertThat(opt.targetFileSizeBase()).isEqualTo(longValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void targetFileSizeMultiplier() {
+    ColumnFamilyOptions opt = null;
+    try {
+      opt = new ColumnFamilyOptions();
+      int intValue = rand.nextInt();
+      opt.setTargetFileSizeMultiplier(intValue);
+      assertThat(opt.targetFileSizeMultiplier()).isEqualTo(intValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void maxBytesForLevelBase() {
+    ColumnFamilyOptions opt = null;
+    try {
+      opt = new ColumnFamilyOptions();
+      long longValue = rand.nextLong();
+      opt.setMaxBytesForLevelBase(longValue);
+      assertThat(opt.maxBytesForLevelBase()).isEqualTo(longValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void levelCompactionDynamicLevelBytes() {
+    ColumnFamilyOptions opt = null;
+    try {
+      opt = new ColumnFamilyOptions();
+      final boolean boolValue = rand.nextBoolean();
+      opt.setLevelCompactionDynamicLevelBytes(boolValue);
+      assertThat(opt.levelCompactionDynamicLevelBytes())
+          .isEqualTo(boolValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void maxBytesForLevelMultiplier() {
+    ColumnFamilyOptions opt = null;
+    try {
+      opt = new ColumnFamilyOptions();
+      int intValue = rand.nextInt();
+      opt.setMaxBytesForLevelMultiplier(intValue);
+      assertThat(opt.maxBytesForLevelMultiplier()).isEqualTo(intValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void expandedCompactionFactor() {
+    ColumnFamilyOptions opt = null;
+    try {
+      opt = new ColumnFamilyOptions();
+      int intValue = rand.nextInt();
+      opt.setExpandedCompactionFactor(intValue);
+      assertThat(opt.expandedCompactionFactor()).isEqualTo(intValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void sourceCompactionFactor() {
+    ColumnFamilyOptions opt = null;
+    try {
+      opt = new ColumnFamilyOptions();
+      int intValue = rand.nextInt();
+      opt.setSourceCompactionFactor(intValue);
+      assertThat(opt.sourceCompactionFactor()).isEqualTo(intValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void maxGrandparentOverlapFactor() {
+    ColumnFamilyOptions opt = null;
+    try {
+      opt = new ColumnFamilyOptions();
+      int intValue = rand.nextInt();
+      opt.setMaxGrandparentOverlapFactor(intValue);
+      assertThat(opt.maxGrandparentOverlapFactor()).isEqualTo(intValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void softRateLimit() {
+    ColumnFamilyOptions opt = null;
+    try {
+      opt = new ColumnFamilyOptions();
+      double doubleValue = rand.nextDouble();
+      opt.setSoftRateLimit(doubleValue);
+      assertThat(opt.softRateLimit()).isEqualTo(doubleValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void hardRateLimit() {
+    ColumnFamilyOptions opt = null;
+    try {
+      opt = new ColumnFamilyOptions();
+      double doubleValue = rand.nextDouble();
+      opt.setHardRateLimit(doubleValue);
+      assertThat(opt.hardRateLimit()).isEqualTo(doubleValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void rateLimitDelayMaxMilliseconds() {
+    ColumnFamilyOptions opt = null;
+    try {
+      opt = new ColumnFamilyOptions();
+      int intValue = rand.nextInt();
+      opt.setRateLimitDelayMaxMilliseconds(intValue);
+      assertThat(opt.rateLimitDelayMaxMilliseconds()).isEqualTo(intValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void arenaBlockSize() throws RocksDBException {
+    ColumnFamilyOptions opt = null;
+    try {
+      opt = new ColumnFamilyOptions();
+      long longValue = rand.nextLong();
+      opt.setArenaBlockSize(longValue);
+      assertThat(opt.arenaBlockSize()).isEqualTo(longValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void disableAutoCompactions() {
+    ColumnFamilyOptions opt = null;
+    try {
+      opt = new ColumnFamilyOptions();
+      boolean boolValue = rand.nextBoolean();
+      opt.setDisableAutoCompactions(boolValue);
+      assertThat(opt.disableAutoCompactions()).isEqualTo(boolValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void purgeRedundantKvsWhileFlush() {
+    ColumnFamilyOptions opt = null;
+    try {
+      opt = new ColumnFamilyOptions();
+      boolean boolValue = rand.nextBoolean();
+      opt.setPurgeRedundantKvsWhileFlush(boolValue);
+      assertThat(opt.purgeRedundantKvsWhileFlush()).isEqualTo(boolValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void verifyChecksumsInCompaction() {
+    ColumnFamilyOptions opt = null;
+    try {
+      opt = new ColumnFamilyOptions();
+      boolean boolValue = rand.nextBoolean();
+      opt.setVerifyChecksumsInCompaction(boolValue);
+      assertThat(opt.verifyChecksumsInCompaction()).isEqualTo(boolValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void filterDeletes() {
+    ColumnFamilyOptions opt = null;
+    try {
+      opt = new ColumnFamilyOptions();
+      boolean boolValue = rand.nextBoolean();
+      opt.setFilterDeletes(boolValue);
+      assertThat(opt.filterDeletes()).isEqualTo(boolValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void maxSequentialSkipInIterations() {
+    ColumnFamilyOptions opt = null;
+    try {
+      opt = new ColumnFamilyOptions();
+      long longValue = rand.nextLong();
+      opt.setMaxSequentialSkipInIterations(longValue);
+      assertThat(opt.maxSequentialSkipInIterations()).isEqualTo(longValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void inplaceUpdateSupport() {
+    ColumnFamilyOptions opt = null;
+    try {
+      opt = new ColumnFamilyOptions();
+      boolean boolValue = rand.nextBoolean();
+      opt.setInplaceUpdateSupport(boolValue);
+      assertThat(opt.inplaceUpdateSupport()).isEqualTo(boolValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void inplaceUpdateNumLocks() throws RocksDBException {
+    ColumnFamilyOptions opt = null;
+    try {
+      opt = new ColumnFamilyOptions();
+      long longValue = rand.nextLong();
+      opt.setInplaceUpdateNumLocks(longValue);
+      assertThat(opt.inplaceUpdateNumLocks()).isEqualTo(longValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void memtablePrefixBloomBits() {
+    ColumnFamilyOptions opt = null;
+    try {
+      opt = new ColumnFamilyOptions();
+      int intValue = rand.nextInt();
+      opt.setMemtablePrefixBloomBits(intValue);
+      assertThat(opt.memtablePrefixBloomBits()).isEqualTo(intValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void memtablePrefixBloomProbes() {
+    ColumnFamilyOptions opt = null;
+    try {
+      int intValue = rand.nextInt();
+      opt = new ColumnFamilyOptions();
+      opt.setMemtablePrefixBloomProbes(intValue);
+      assertThat(opt.memtablePrefixBloomProbes()).isEqualTo(intValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void bloomLocality() {
+    ColumnFamilyOptions opt = null;
+    try {
+      int intValue = rand.nextInt();
+      opt = new ColumnFamilyOptions();
+      opt.setBloomLocality(intValue);
+      assertThat(opt.bloomLocality()).isEqualTo(intValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void maxSuccessiveMerges() throws RocksDBException {
+    ColumnFamilyOptions opt = null;
+    try {
+      long longValue = rand.nextLong();
+      opt = new ColumnFamilyOptions();
+      opt.setMaxSuccessiveMerges(longValue);
+      assertThat(opt.maxSuccessiveMerges()).isEqualTo(longValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void minPartialMergeOperands() {
+    ColumnFamilyOptions opt = null;
+    try {
+      int intValue = rand.nextInt();
+      opt = new ColumnFamilyOptions();
+      opt.setMinPartialMergeOperands(intValue);
+      assertThat(opt.minPartialMergeOperands()).isEqualTo(intValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void optimizeFiltersForHits() {
+    ColumnFamilyOptions opt = null;
+    try {
+      boolean aBoolean = rand.nextBoolean();
+      opt = new ColumnFamilyOptions();
+      opt.setOptimizeFiltersForHits(aBoolean);
+      assertThat(opt.optimizeFiltersForHits()).isEqualTo(aBoolean);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void memTable() throws RocksDBException {
+    ColumnFamilyOptions opt = null;
+    try {
+      opt = new ColumnFamilyOptions();
+      opt.setMemTableConfig(new HashLinkedListMemTableConfig());
+      assertThat(opt.memTableFactoryName()).
+          isEqualTo("HashLinkedListRepFactory");
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void comparator() throws RocksDBException {
+    ColumnFamilyOptions opt = null;
+    try {
+      opt = new ColumnFamilyOptions();
+      opt.setComparator(BuiltinComparator.BYTEWISE_COMPARATOR);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void linkageOfPrepMethods() {
+    ColumnFamilyOptions options = null;
+    try {
+      options = new ColumnFamilyOptions();
+      options.optimizeUniversalStyleCompaction();
+      options.optimizeUniversalStyleCompaction(4000);
+      options.optimizeLevelStyleCompaction();
+      options.optimizeLevelStyleCompaction(3000);
+      options.optimizeForPointLookup(10);
+    } finally {
+      if (options != null) {
+        options.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void shouldSetTestPrefixExtractor() {
+    ColumnFamilyOptions options = null;
+    try {
+      options = new ColumnFamilyOptions();
+      options.useFixedLengthPrefixExtractor(100);
+      options.useFixedLengthPrefixExtractor(10);
+    } finally {
+      if (options != null) {
+        options.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void shouldSetTestCappedPrefixExtractor() {
+    ColumnFamilyOptions options = null;
+    try {
+      options = new ColumnFamilyOptions();
+      options.useCappedPrefixExtractor(100);
+      options.useCappedPrefixExtractor(10);
+    } finally {
+      if (options != null) {
+        options.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void compressionTypes() {
+    ColumnFamilyOptions columnFamilyOptions = null;
+    try {
+      columnFamilyOptions = new ColumnFamilyOptions();
+      for (CompressionType compressionType :
+          CompressionType.values()) {
+        columnFamilyOptions.setCompressionType(compressionType);
+        assertThat(columnFamilyOptions.compressionType()).
+            isEqualTo(compressionType);
+        assertThat(CompressionType.valueOf("NO_COMPRESSION")).
+            isEqualTo(CompressionType.NO_COMPRESSION);
+      }
+    } finally {
+      if (columnFamilyOptions != null) {
+        columnFamilyOptions.dispose();
+      }
+    }
+  }
+
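+  // compressionPerLevel() starts out empty and, once set, reflects one
+  // CompressionType entry per level.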
+  @Test
+  public void compressionPerLevel() {
+    ColumnFamilyOptions columnFamilyOptions = null;
+    try {
+      columnFamilyOptions = new ColumnFamilyOptions();
+      assertThat(columnFamilyOptions.compressionPerLevel()).isEmpty();
+      List<CompressionType> compressionTypeList = new ArrayList<>();
+      for (int i = 0; i < columnFamilyOptions.numLevels(); i++) {
+        compressionTypeList.add(CompressionType.NO_COMPRESSION);
+      }
+      columnFamilyOptions.setCompressionPerLevel(compressionTypeList);
+      compressionTypeList = columnFamilyOptions.compressionPerLevel();
+      for (CompressionType compressionType : compressionTypeList) {
+        assertThat(compressionType).isEqualTo(
+            CompressionType.NO_COMPRESSION);
+      }
+    } finally {
+      if (columnFamilyOptions != null) {
+        columnFamilyOptions.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void differentCompressionsPerLevel() {
+    ColumnFamilyOptions columnFamilyOptions = null;
+    try {
+      columnFamilyOptions = new ColumnFamilyOptions();
+      columnFamilyOptions.setNumLevels(3);
+
+      assertThat(columnFamilyOptions.compressionPerLevel()).isEmpty();
+      List<CompressionType> compressionTypeList = new ArrayList<>();
+
+      compressionTypeList.add(CompressionType.BZLIB2_COMPRESSION);
+      compressionTypeList.add(CompressionType.SNAPPY_COMPRESSION);
+      compressionTypeList.add(CompressionType.LZ4_COMPRESSION);
+
+      columnFamilyOptions.setCompressionPerLevel(compressionTypeList);
+      compressionTypeList = columnFamilyOptions.compressionPerLevel();
+
+      assertThat(compressionTypeList.size()).isEqualTo(3);
+      assertThat(compressionTypeList).
+          containsExactly(
+              CompressionType.BZLIB2_COMPRESSION,
+              CompressionType.SNAPPY_COMPRESSION,
+              CompressionType.LZ4_COMPRESSION);
+
+    } finally {
+      if (columnFamilyOptions != null) {
+        columnFamilyOptions.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void compactionStyles() {
+    ColumnFamilyOptions columnFamilyOptions = null;
+    try {
+      columnFamilyOptions = new ColumnFamilyOptions();
+      for (CompactionStyle compactionStyle :
+          CompactionStyle.values()) {
+        columnFamilyOptions.setCompactionStyle(compactionStyle);
+        assertThat(columnFamilyOptions.compactionStyle()).
+            isEqualTo(compactionStyle);
+        assertThat(CompactionStyle.valueOf("FIFO")).
+            isEqualTo(CompactionStyle.FIFO);
+      }
+    } finally {
+      if (columnFamilyOptions != null) {
+        columnFamilyOptions.dispose();
+      }
+      }
+    }
+  }
+
+  @Test
+  public void maxTableFilesSizeFIFO() {
+    ColumnFamilyOptions opt = null;
+    try {
+      opt = new ColumnFamilyOptions();
+      long longValue = rand.nextLong();
+      // Size has to be positive
+      longValue = (longValue < 0) ? -longValue : longValue;
+      longValue = (longValue == 0) ? longValue + 1 : longValue;
+      opt.setMaxTableFilesSizeFIFO(longValue);
+      assertThat(opt.maxTableFilesSizeFIFO()).
+          isEqualTo(longValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/ColumnFamilyTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/ColumnFamilyTest.java
new file mode 100644
index 0000000..decdbbc
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/ColumnFamilyTest.java
@@ -0,0 +1,746 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+import java.util.*;
+
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class ColumnFamilyTest {
+
+  @ClassRule
+  public static final RocksMemoryResource rocksMemoryResource =
+      new RocksMemoryResource();
+
+  @Rule
+  public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  @Test
+  public void listColumnFamilies() throws RocksDBException {
+    RocksDB db = null;
+    Options options = null;
+    try {
+      options = new Options();
+      options.setCreateIfMissing(true);
+
+      db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath());
+      // Test listColumnFamilies
+      List<byte[]> columnFamilyNames;
+      columnFamilyNames = RocksDB.listColumnFamilies(options, dbFolder.getRoot().getAbsolutePath());
+      assertThat(columnFamilyNames).isNotNull();
+      assertThat(columnFamilyNames.size()).isGreaterThan(0);
+      assertThat(columnFamilyNames.size()).isEqualTo(1);
+      assertThat(new String(columnFamilyNames.get(0))).isEqualTo("default");
+    } finally {
+      if (db != null) {
+        db.close();
+      }
+      if (options != null) {
+        options.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void defaultColumnFamily() throws RocksDBException {
+    RocksDB db = null;
+    Options options = null;
+    ColumnFamilyHandle cfh;
+    try {
+      options = new Options().setCreateIfMissing(true);
+
+      db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath());
+      cfh = db.getDefaultColumnFamily();
+      assertThat(cfh).isNotNull();
+
+      final byte[] key = "key".getBytes();
+      final byte[] value = "value".getBytes();
+
+      db.put(cfh, key, value);
+
+      final byte[] actualValue = db.get(cfh, key);
+
+      assertThat(cfh).isNotNull();
+      assertThat(actualValue).isEqualTo(value);
+    } finally {
+      if (db != null) {
+        db.close();
+      }
+      if (options != null) {
+        options.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void createColumnFamily() throws RocksDBException {
+    RocksDB db = null;
+    Options options = null;
+    ColumnFamilyHandle columnFamilyHandle = null;
+    try {
+      options = new Options();
+      options.setCreateIfMissing(true);
+
+      db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath());
+      columnFamilyHandle = db.createColumnFamily(
+          new ColumnFamilyDescriptor("new_cf".getBytes(), new ColumnFamilyOptions()));
+
+      List<byte[]> columnFamilyNames;
+      columnFamilyNames = RocksDB.listColumnFamilies(options, dbFolder.getRoot().getAbsolutePath());
+      assertThat(columnFamilyNames).isNotNull();
+      assertThat(columnFamilyNames.size()).isGreaterThan(0);
+      assertThat(columnFamilyNames.size()).isEqualTo(2);
+      assertThat(new String(columnFamilyNames.get(0))).isEqualTo("default");
+      assertThat(new String(columnFamilyNames.get(1))).isEqualTo("new_cf");
+    } finally {
+      if (columnFamilyHandle != null) {
+        columnFamilyHandle.dispose();
+      }
+      if (db != null) {
+        db.close();
+      }
+      if (options != null) {
+        options.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void openWithColumnFamilies() throws RocksDBException {
+    RocksDB db = null;
+    DBOptions options = null;
+    List<ColumnFamilyDescriptor> cfNames =
+        new ArrayList<>();
+    List<ColumnFamilyHandle> columnFamilyHandleList =
+        new ArrayList<>();
+    try {
+      options = new DBOptions();
+      options.setCreateIfMissing(true);
+      options.setCreateMissingColumnFamilies(true);
+      // Test open database with column family names
+      cfNames.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY));
+      cfNames.add(new ColumnFamilyDescriptor("new_cf".getBytes()));
+
+      db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(),
+          cfNames, columnFamilyHandleList);
+      assertThat(columnFamilyHandleList.size()).isEqualTo(2);
+      db.put("dfkey1".getBytes(), "dfvalue".getBytes());
+      db.put(columnFamilyHandleList.get(0), "dfkey2".getBytes(),
+          "dfvalue".getBytes());
+      db.put(columnFamilyHandleList.get(1), "newcfkey1".getBytes(),
+          "newcfvalue".getBytes());
+
+      String retVal = new String(db.get(columnFamilyHandleList.get(1),
+          "newcfkey1".getBytes()));
+      assertThat(retVal).isEqualTo("newcfvalue");
+      assertThat((db.get(columnFamilyHandleList.get(1),
+          "dfkey1".getBytes()))).isNull();
+      db.remove(columnFamilyHandleList.get(1), "newcfkey1".getBytes());
+      assertThat((db.get(columnFamilyHandleList.get(1),
+          "newcfkey1".getBytes()))).isNull();
+      db.remove(columnFamilyHandleList.get(0), new WriteOptions(),
+          "dfkey2".getBytes());
+      assertThat(db.get(columnFamilyHandleList.get(0), new ReadOptions(),
+          "dfkey2".getBytes())).isNull();
+    } finally {
+      for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) {
+        columnFamilyHandle.dispose();
+      }
+      if (db != null) {
+        db.close();
+      }
+      if (options != null) {
+        options.dispose();
+      }
+    }
+  }
+
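+  // get() with a caller-supplied output buffer returns NOT_FOUND for
+  // missing keys and truncates values larger than the buffer.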
+  @Test
+  public void getWithOutValueAndCf() throws RocksDBException {
+    RocksDB db = null;
+    DBOptions options = null;
+    List<ColumnFamilyDescriptor> cfDescriptors =
+        new ArrayList<>();
+    List<ColumnFamilyHandle> columnFamilyHandleList =
+        new ArrayList<>();
+    try {
+      options = new DBOptions();
+      options.setCreateIfMissing(true);
+      options.setCreateMissingColumnFamilies(true);
+      // Test open database with column family names
+      cfDescriptors.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY));
+      db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(),
+          cfDescriptors, columnFamilyHandleList);
+      db.put(columnFamilyHandleList.get(0), new WriteOptions(),
+          "key1".getBytes(), "value".getBytes());
+      db.put("key2".getBytes(), "12345678".getBytes());
+      byte[] outValue = new byte[5];
+      // not found value
+      int getResult = db.get("keyNotFound".getBytes(), outValue);
+      assertThat(getResult).isEqualTo(RocksDB.NOT_FOUND);
+      // found value which fits in outValue
+      getResult = db.get(columnFamilyHandleList.get(0), "key1".getBytes(), outValue);
+      assertThat(getResult).isNotEqualTo(RocksDB.NOT_FOUND);
+      assertThat(outValue).isEqualTo("value".getBytes());
+      // found value which fits partially
+      getResult = db.get(columnFamilyHandleList.get(0), new ReadOptions(),
+          "key2".getBytes(), outValue);
+      assertThat(getResult).isNotEqualTo(RocksDB.NOT_FOUND);
+      assertThat(outValue).isEqualTo("12345".getBytes());
+    } finally {
+      for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) {
+        columnFamilyHandle.dispose();
+      }
+      if (db != null) {
+        db.close();
+      }
+      if (options != null) {
+        options.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void createWriteDropColumnFamily() throws RocksDBException {
+    RocksDB db = null;
+    DBOptions opt = null;
+    ColumnFamilyHandle tmpColumnFamilyHandle = null;
+    List<ColumnFamilyDescriptor> cfNames =
+        new ArrayList<>();
+    List<ColumnFamilyHandle> columnFamilyHandleList =
+        new ArrayList<>();
+    try {
+      opt = new DBOptions();
+      opt.setCreateIfMissing(true);
+      opt.setCreateMissingColumnFamilies(true);
+      cfNames.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY));
+      cfNames.add(new ColumnFamilyDescriptor("new_cf".getBytes()));
+
+      db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath(),
+          cfNames, columnFamilyHandleList);
+      tmpColumnFamilyHandle = db.createColumnFamily(
+          new ColumnFamilyDescriptor("tmpCF".getBytes(), new ColumnFamilyOptions()));
+      db.put(tmpColumnFamilyHandle, "key".getBytes(), "value".getBytes());
+      db.dropColumnFamily(tmpColumnFamilyHandle);
+      tmpColumnFamilyHandle.dispose();
+    } finally {
+      for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) {
+        columnFamilyHandle.dispose();
+      }
+      if (tmpColumnFamilyHandle != null) {
+        tmpColumnFamilyHandle.dispose();
+      }
+      if (db != null) {
+        db.close();
+      }
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
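+  // Applies puts, merges and removes to both the default and a
+  // user-created column family through a single atomic WriteBatch.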
+  @Test
+  public void writeBatch() throws RocksDBException {
+    RocksDB db = null;
+    DBOptions opt = null;
+    List<ColumnFamilyDescriptor> cfNames =
+        new ArrayList<>();
+    List<ColumnFamilyHandle> columnFamilyHandleList =
+        new ArrayList<>();
+    try {
+      opt = new DBOptions();
+      opt.setCreateIfMissing(true);
+      opt.setCreateMissingColumnFamilies(true);
+
+      cfNames.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY,
+          new ColumnFamilyOptions().setMergeOperator(new StringAppendOperator())));
+      cfNames.add(new ColumnFamilyDescriptor("new_cf".getBytes()));
+
+      db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath(),
+          cfNames, columnFamilyHandleList);
+
+      WriteBatch writeBatch = new WriteBatch();
+      WriteOptions writeOpt = new WriteOptions();
+      writeBatch.put("key".getBytes(), "value".getBytes());
+      writeBatch.put(db.getDefaultColumnFamily(),
+          "mergeKey".getBytes(), "merge".getBytes());
+      writeBatch.merge(db.getDefaultColumnFamily(), "mergeKey".getBytes(),
+          "merge".getBytes());
+      writeBatch.put(columnFamilyHandleList.get(1), "newcfkey".getBytes(),
+          "value".getBytes());
+      writeBatch.put(columnFamilyHandleList.get(1), "newcfkey2".getBytes(),
+          "value2".getBytes());
+      writeBatch.remove("xyz".getBytes());
+      writeBatch.remove(columnFamilyHandleList.get(1), "xyz".getBytes());
+      db.write(writeOpt, writeBatch);
+      writeBatch.dispose();
+      assertThat(db.get(columnFamilyHandleList.get(1),
+          "xyz".getBytes())).isNull();
+      assertThat(new String(db.get(columnFamilyHandleList.get(1),
+          "newcfkey".getBytes()))).isEqualTo("value");
+      assertThat(new String(db.get(columnFamilyHandleList.get(1),
+          "newcfkey2".getBytes()))).isEqualTo("value2");
+      assertThat(new String(db.get("key".getBytes()))).isEqualTo("value");
+      // check if key is merged
+      assertThat(new String(db.get(db.getDefaultColumnFamily(),
+          "mergeKey".getBytes()))).isEqualTo("merge,merge");
+    } finally {
+      for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) {
+        columnFamilyHandle.dispose();
+      }
+      if (db != null) {
+        db.close();
+      }
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void iteratorOnColumnFamily() throws RocksDBException {
+    RocksDB db = null;
+    DBOptions options = null;
+    RocksIterator rocksIterator = null;
+    List<ColumnFamilyDescriptor> cfNames =
+        new ArrayList<>();
+    List<ColumnFamilyHandle> columnFamilyHandleList =
+        new ArrayList<>();
+    try {
+      options = new DBOptions();
+      options.setCreateIfMissing(true);
+      options.setCreateMissingColumnFamilies(true);
+
+      cfNames.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY));
+      cfNames.add(new ColumnFamilyDescriptor("new_cf".getBytes()));
+
+      db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(),
+          cfNames, columnFamilyHandleList);
+      db.put(columnFamilyHandleList.get(1), "newcfkey".getBytes(),
+          "value".getBytes());
+      db.put(columnFamilyHandleList.get(1), "newcfkey2".getBytes(),
+          "value2".getBytes());
+      rocksIterator = db.newIterator(
+          columnFamilyHandleList.get(1));
+      rocksIterator.seekToFirst();
+      Map<String, String> refMap = new HashMap<>();
+      refMap.put("newcfkey", "value");
+      refMap.put("newcfkey2", "value2");
+      int i = 0;
+      while (rocksIterator.isValid()) {
+        i++;
+        assertThat(refMap.get(new String(rocksIterator.key()))).
+            isEqualTo(new String(rocksIterator.value()));
+        rocksIterator.next();
+      }
+      assertThat(i).isEqualTo(2);
+      rocksIterator.dispose();
+    } finally {
+      if (rocksIterator != null) {
+        rocksIterator.dispose();
+      }
+      for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) {
+        columnFamilyHandle.dispose();
+      }
+      if (db != null) {
+        db.close();
+      }
+      if (options != null) {
+        options.dispose();
+      }
+    }
+  }
+
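+  // multiGet() pairs each key with the column family handle at the same
+  // index and returns a map containing only the keys that were found.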
+  @Test
+  public void multiGet() throws RocksDBException {
+    RocksDB db = null;
+    DBOptions options = null;
+    List<ColumnFamilyDescriptor> cfDescriptors =
+        new ArrayList<>();
+    List<ColumnFamilyHandle> columnFamilyHandleList =
+        new ArrayList<>();
+    try {
+      options = new DBOptions();
+      options.setCreateIfMissing(true);
+      options.setCreateMissingColumnFamilies(true);
+
+      cfDescriptors.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY));
+      cfDescriptors.add(new ColumnFamilyDescriptor("new_cf".getBytes()));
+
+      db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(),
+          cfDescriptors, columnFamilyHandleList);
+      db.put(columnFamilyHandleList.get(0), "key".getBytes(), "value".getBytes());
+      db.put(columnFamilyHandleList.get(1), "newcfkey".getBytes(), "value".getBytes());
+
+      List<byte[]> keys = new ArrayList<>();
+      keys.add("key".getBytes());
+      keys.add("newcfkey".getBytes());
+      Map<byte[], byte[]> retValues = db.multiGet(columnFamilyHandleList, keys);
+      assertThat(retValues.size()).isEqualTo(2);
+      assertThat(new String(retValues.get(keys.get(0))))
+          .isEqualTo("value");
+      assertThat(new String(retValues.get(keys.get(1))))
+          .isEqualTo("value");
+      retValues = db.multiGet(new ReadOptions(), columnFamilyHandleList, keys);
+      assertThat(retValues.size()).isEqualTo(2);
+      assertThat(new String(retValues.get(keys.get(0))))
+          .isEqualTo("value");
+      assertThat(new String(retValues.get(keys.get(1))))
+          .isEqualTo("value");
+    } finally {
+      for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) {
+        columnFamilyHandle.dispose();
+      }
+      if (db != null) {
+        db.close();
+      }
+      if (options != null) {
+        options.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void properties() throws RocksDBException {
+    RocksDB db = null;
+    DBOptions options = null;
+    List<ColumnFamilyDescriptor> cfNames =
+        new ArrayList<>();
+    List<ColumnFamilyHandle> columnFamilyHandleList =
+        new ArrayList<>();
+    try {
+      options = new DBOptions();
+      options.setCreateIfMissing(true);
+      options.setCreateMissingColumnFamilies(true);
+
+      cfNames.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY));
+      cfNames.add(new ColumnFamilyDescriptor("new_cf".getBytes()));
+
+      db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(),
+          cfNames, columnFamilyHandleList);
+      assertThat(db.getProperty("rocksdb.estimate-num-keys")).
+          isNotNull();
+      assertThat(db.getLongProperty(columnFamilyHandleList.get(0),
+          "rocksdb.estimate-num-keys")).isGreaterThanOrEqualTo(0);
+      assertThat(db.getProperty("rocksdb.stats")).isNotNull();
+      assertThat(db.getProperty(columnFamilyHandleList.get(0),
+          "rocksdb.sstables")).isNotNull();
+      assertThat(db.getProperty(columnFamilyHandleList.get(1),
+          "rocksdb.estimate-num-keys")).isNotNull();
+      assertThat(db.getProperty(columnFamilyHandleList.get(1),
+          "rocksdb.stats")).isNotNull();
+      assertThat(db.getProperty(columnFamilyHandleList.get(1),
+          "rocksdb.sstables")).isNotNull();
+    } finally {
+      for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) {
+        columnFamilyHandle.dispose();
+      }
+      if (db != null) {
+        db.close();
+      }
+      if (options != null) {
+        options.dispose();
+      }
+    }
+  }
+
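+  // newIterators() returns one iterator per supplied column family handle,
+  // in the same order as the handles.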
+  @Test
+  public void iterators() throws RocksDBException {
+    RocksDB db = null;
+    DBOptions options = null;
+    List<ColumnFamilyDescriptor> cfNames =
+        new ArrayList<>();
+    List<ColumnFamilyHandle> columnFamilyHandleList =
+        new ArrayList<>();
+    List<RocksIterator> iterators = null;
+    try {
+      options = new DBOptions();
+      options.setCreateIfMissing(true);
+      options.setCreateMissingColumnFamilies(true);
+
+      cfNames.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY));
+      cfNames.add(new ColumnFamilyDescriptor("new_cf".getBytes()));
+
+      db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(),
+          cfNames, columnFamilyHandleList);
+      iterators = db.newIterators(columnFamilyHandleList);
+      assertThat(iterators.size()).isEqualTo(2);
+      RocksIterator iter = iterators.get(0);
+      iter.seekToFirst();
+      Map<String, String> defRefMap = new HashMap<>();
+      defRefMap.put("dfkey1", "dfvalue");
+      defRefMap.put("key", "value");
+      while (iter.isValid()) {
+        assertThat(defRefMap.get(new String(iter.key()))).
+            isEqualTo(new String(iter.value()));
+        iter.next();
+      }
+      // iterate over new_cf key/value pairs
+      Map<String, String> cfRefMap = new HashMap<>();
+      cfRefMap.put("newcfkey", "value");
+      cfRefMap.put("newcfkey2", "value2");
+      iter = iterators.get(1);
+      iter.seekToFirst();
+      while (iter.isValid()) {
+        assertThat(cfRefMap.get(new String(iter.key()))).
+            isEqualTo(new String(iter.value()));
+        iter.next();
+      }
+    } finally {
+      if (iterators != null) {
+        for (RocksIterator rocksIterator : iterators) {
+          rocksIterator.dispose();
+        }
+      }
+      for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) {
+        columnFamilyHandle.dispose();
+      }
+      if (db != null) {
+        db.close();
+      }
+      if (options != null) {
+        options.dispose();
+      }
+    }
+  }
+
+  @Test(expected = RocksDBException.class)
+  public void failPutDisposedCF() throws RocksDBException {
+    RocksDB db = null;
+    DBOptions options = null;
+    List<ColumnFamilyDescriptor> cfNames =
+        new ArrayList<>();
+    List<ColumnFamilyHandle> columnFamilyHandleList =
+        new ArrayList<>();
+    try {
+      options = new DBOptions();
+      options.setCreateIfMissing(true);
+
+      cfNames.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY));
+      cfNames.add(new ColumnFamilyDescriptor("new_cf".getBytes()));
+
+      db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(),
+          cfNames, columnFamilyHandleList);
+      db.dropColumnFamily(columnFamilyHandleList.get(1));
+      db.put(columnFamilyHandleList.get(1), "key".getBytes(), "value".getBytes());
+    } finally {
+      for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) {
+        columnFamilyHandle.dispose();
+      }
+      if (db != null) {
+        db.close();
+      }
+      if (options != null) {
+        options.dispose();
+      }
+    }
+  }
+
+  @Test(expected = RocksDBException.class)
+  public void failRemoveDisposedCF() throws RocksDBException {
+    RocksDB db = null;
+    DBOptions options = null;
+    List<ColumnFamilyDescriptor> cfNames =
+        new ArrayList<>();
+    List<ColumnFamilyHandle> columnFamilyHandleList =
+        new ArrayList<>();
+    try {
+      options = new DBOptions();
+      options.setCreateIfMissing(true);
+
+      cfNames.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY));
+      cfNames.add(new ColumnFamilyDescriptor("new_cf".getBytes()));
+
+      db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(),
+          cfNames, columnFamilyHandleList);
+      db.dropColumnFamily(columnFamilyHandleList.get(1));
+      db.remove(columnFamilyHandleList.get(1), "key".getBytes());
+    } finally {
+      for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) {
+        columnFamilyHandle.dispose();
+      }
+      if (db != null) {
+        db.close();
+      }
+      if (options != null) {
+        options.dispose();
+      }
+    }
+  }
+
+  @Test(expected = RocksDBException.class)
+  public void failGetDisposedCF() throws RocksDBException {
+    RocksDB db = null;
+    DBOptions options = null;
+    List<ColumnFamilyDescriptor> cfNames =
+        new ArrayList<>();
+    List<ColumnFamilyHandle> columnFamilyHandleList =
+        new ArrayList<>();
+    try {
+      options = new DBOptions();
+      options.setCreateIfMissing(true);
+
+      cfNames.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY));
+      cfNames.add(new ColumnFamilyDescriptor("new_cf".getBytes()));
+
+      db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(),
+          cfNames, columnFamilyHandleList);
+      db.dropColumnFamily(columnFamilyHandleList.get(1));
+      db.get(columnFamilyHandleList.get(1), "key".getBytes());
+    } finally {
+      for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) {
+        columnFamilyHandle.dispose();
+      }
+      if (db != null) {
+        db.close();
+      }
+      if (options != null) {
+        options.dispose();
+      }
+    }
+  }
+
+  @Test(expected = RocksDBException.class)
+  public void failMultiGetWithoutCorrectNumberOfCF() throws RocksDBException {
+    RocksDB db = null;
+    DBOptions options = null;
+    List<ColumnFamilyDescriptor> cfNames =
+        new ArrayList<>();
+    List<ColumnFamilyHandle> columnFamilyHandleList =
+        new ArrayList<>();
+    try {
+      options = new DBOptions();
+      options.setCreateIfMissing(true);
+
+      cfNames.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY));
+      cfNames.add(new ColumnFamilyDescriptor("new_cf".getBytes()));
+
+      db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(),
+          cfNames, columnFamilyHandleList);
+      List<byte[]> keys = new ArrayList<>();
+      keys.add("key".getBytes());
+      keys.add("newcfkey".getBytes());
+      List<ColumnFamilyHandle> cfCustomList = new ArrayList<>();
+      db.multiGet(cfCustomList, keys);
+
+    } finally {
+      for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) {
+        columnFamilyHandle.dispose();
+      }
+      if (db != null) {
+        db.close();
+      }
+      if (options != null) {
+        options.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void testByteCreateColumnFamily() throws RocksDBException {
+    RocksDB db = null;
+    Options options = null;
+    ColumnFamilyHandle cf1 = null, cf2 = null, cf3 = null;
+    try {
+      options = new Options().setCreateIfMissing(true);
+      db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath());
+
+      byte[] b0 = new byte[] { (byte)0x00 };
+      byte[] b1 = new byte[] { (byte)0x01 };
+      byte[] b2 = new byte[] { (byte)0x02 };
+      cf1 = db.createColumnFamily(new ColumnFamilyDescriptor(b0));
+      cf2 = db.createColumnFamily(new ColumnFamilyDescriptor(b1));
+      List<byte[]> families = RocksDB.listColumnFamilies(options, dbFolder.getRoot().getAbsolutePath());
+      assertThat(families).contains("default".getBytes(), b0, b1);
+      cf3 = db.createColumnFamily(new ColumnFamilyDescriptor(b2));
+    } finally {
+      if (cf1 != null) {
+        cf1.dispose();
+      }
+      if (cf2 != null) {
+        cf2.dispose();
+      }
+      if (cf3 != null) {
+        cf3.dispose();
+      }
+      if (db != null) {
+        db.close();
+      }
+      if (options != null) {
+        options.dispose();
+      }
+    }
+  }
+
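+  // Column family names are raw byte arrays, so embedded zero bytes are
+  // legal and survive a round trip through listColumnFamilies().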
+  @Test
+  public void testCFNamesWithZeroBytes() throws RocksDBException {
+    RocksDB db = null;
+    Options options = null;
+    ColumnFamilyHandle cf1 = null, cf2 = null;
+    try {
+      options = new Options().setCreateIfMissing(true);
+      db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath());
+
+      byte[] b0 = new byte[] { 0, 0 };
+      byte[] b1 = new byte[] { 0, 1 };
+      cf1 = db.createColumnFamily(new ColumnFamilyDescriptor(b0));
+      cf2 = db.createColumnFamily(new ColumnFamilyDescriptor(b1));
+      List<byte[]> families = RocksDB.listColumnFamilies(options, dbFolder.getRoot().getAbsolutePath());
+      assertThat(families).contains("default".getBytes(), b0, b1);
+    } finally {
+      if (cf1 != null) {
+        cf1.dispose();
+      }
+      if (cf2 != null) {
+        cf2.dispose();
+      }
+      if (db != null) {
+        db.close();
+      }
+      if (options != null) {
+        options.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void testCFNameSimplifiedChinese() throws RocksDBException {
+    RocksDB db = null;
+    Options options = null;
+    ColumnFamilyHandle columnFamilyHandle = null;
+    try {
+      options = new Options().setCreateIfMissing(true);
+      db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath());
+      final String simplifiedChinese = "\u7b80\u4f53\u5b57";
+      columnFamilyHandle = db.createColumnFamily(
+          new ColumnFamilyDescriptor(simplifiedChinese.getBytes()));
+
+      List<byte[]> families = RocksDB.listColumnFamilies(options, dbFolder.getRoot().getAbsolutePath());
+      assertThat(families).contains("default".getBytes(), simplifiedChinese.getBytes());
+    } finally {
+      if (columnFamilyHandle != null) {
+        columnFamilyHandle.dispose();
+      }
+      if (db != null) {
+        db.close();
+      }
+      if (options != null) {
+        options.dispose();
+      }
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/ComparatorOptionsTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/ComparatorOptionsTest.java
new file mode 100644
index 0000000..4f8a7d1
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/ComparatorOptionsTest.java
@@ -0,0 +1,35 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+import org.junit.ClassRule;
+import org.junit.Test;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class ComparatorOptionsTest {
+
+  @ClassRule
+  public static final RocksMemoryResource rocksMemoryResource =
+      new RocksMemoryResource();
+
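+  // Verifies that the useAdaptiveMutex flag round-trips in both directions.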
+  @Test
+  public void comparatorOptions() {
+    final ComparatorOptions copt = new ComparatorOptions();
+
+    assertThat(copt).isNotNull();
+
+    { // UseAdaptiveMutex test
+      copt.setUseAdaptiveMutex(true);
+      assertThat(copt.useAdaptiveMutex()).isTrue();
+
+      copt.setUseAdaptiveMutex(false);
+      assertThat(copt.useAdaptiveMutex()).isFalse();
+    }
+
+    copt.dispose();
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/ComparatorTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/ComparatorTest.java
new file mode 100644
index 0000000..e689a9c
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/ComparatorTest.java
@@ -0,0 +1,227 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+import java.io.IOException;
+import java.nio.file.FileSystems;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class ComparatorTest {
+
+  @ClassRule
+  public static final RocksMemoryResource rocksMemoryResource =
+      new RocksMemoryResource();
+
+  @Rule
+  public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  @Test
+  public void javaComparator() throws IOException, RocksDBException {
+
+    final AbstractComparatorTest comparatorTest = new AbstractComparatorTest() {
+      @Override
+      public AbstractComparator getAscendingIntKeyComparator() {
+        return new Comparator(new ComparatorOptions()) {
+
+          @Override
+          public String name() {
+            return "test.AscendingIntKeyComparator";
+          }
+
+          @Override
+          public int compare(final Slice a, final Slice b) {
+            return compareIntKeys(a.data(), b.data());
+          }
+        };
+      }
+    };
+
+    // test the round-tripability of keys written and read with the Comparator
+    comparatorTest.testRoundtrip(FileSystems.getDefault().getPath(
+        dbFolder.getRoot().getAbsolutePath()));
+  }
+
+  @Test
+  public void javaComparatorCf() throws IOException, RocksDBException {
+
+    final AbstractComparatorTest comparatorTest = new AbstractComparatorTest() {
+      @Override
+      public AbstractComparator getAscendingIntKeyComparator() {
+        return new Comparator(new ComparatorOptions()) {
+
+          @Override
+          public String name() {
+            return "test.AscendingIntKeyComparator";
+          }
+
+          @Override
+          public int compare(final Slice a, final Slice b) {
+            return compareIntKeys(a.data(), b.data());
+          }
+        };
+      }
+    };
+
+    // test the round-tripability of keys written and read with the Comparator
+    comparatorTest.testRoundtripCf(FileSystems.getDefault().getPath(
+        dbFolder.getRoot().getAbsolutePath()));
+  }
+
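+  // With BYTEWISE_COMPARATOR, iteration visits keys in ascending
+  // lexicographic byte order.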
+  @Test
+  public void builtinForwardComparator()
+      throws RocksDBException {
+    Options options = null;
+    RocksDB rocksDB = null;
+    RocksIterator rocksIterator = null;
+    try {
+      options = new Options();
+      options.setCreateIfMissing(true);
+      options.setComparator(BuiltinComparator.BYTEWISE_COMPARATOR);
+      rocksDB = RocksDB.open(options,
+          dbFolder.getRoot().getAbsolutePath());
+
+      rocksDB.put("abc1".getBytes(), "abc1".getBytes());
+      rocksDB.put("abc2".getBytes(), "abc2".getBytes());
+      rocksDB.put("abc3".getBytes(), "abc3".getBytes());
+
+      rocksIterator = rocksDB.newIterator();
+      // Iterate over keys using an iterator
+      rocksIterator.seekToFirst();
+      assertThat(rocksIterator.isValid()).isTrue();
+      assertThat(rocksIterator.key()).isEqualTo(
+          "abc1".getBytes());
+      assertThat(rocksIterator.value()).isEqualTo(
+          "abc1".getBytes());
+      rocksIterator.next();
+      assertThat(rocksIterator.isValid()).isTrue();
+      assertThat(rocksIterator.key()).isEqualTo(
+          "abc2".getBytes());
+      assertThat(rocksIterator.value()).isEqualTo(
+          "abc2".getBytes());
+      rocksIterator.next();
+      assertThat(rocksIterator.isValid()).isTrue();
+      assertThat(rocksIterator.key()).isEqualTo(
+          "abc3".getBytes());
+      assertThat(rocksIterator.value()).isEqualTo(
+          "abc3".getBytes());
+      rocksIterator.next();
+      assertThat(rocksIterator.isValid()).isFalse();
+      // Get last one
+      rocksIterator.seekToLast();
+      assertThat(rocksIterator.isValid()).isTrue();
+      assertThat(rocksIterator.key()).isEqualTo(
+          "abc3".getBytes());
+      assertThat(rocksIterator.value()).isEqualTo(
+          "abc3".getBytes());
+      // Seek for abc
+      rocksIterator.seek("abc".getBytes());
+      assertThat(rocksIterator.isValid()).isTrue();
+      assertThat(rocksIterator.key()).isEqualTo(
+          "abc1".getBytes());
+      assertThat(rocksIterator.value()).isEqualTo(
+          "abc1".getBytes());
+
+    } finally {
+      if (rocksIterator != null) {
+        rocksIterator.dispose();
+      }
+      if (rocksDB != null) {
+        rocksDB.close();
+      }
+      if (options != null) {
+        options.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void builtinReverseComparator()
+      throws RocksDBException {
+    Options options = null;
+    RocksDB rocksDB = null;
+    RocksIterator rocksIterator = null;
+    try {
+      options = new Options();
+      options.setCreateIfMissing(true);
+      options.setComparator(
+          BuiltinComparator.REVERSE_BYTEWISE_COMPARATOR);
+      rocksDB = RocksDB.open(options,
+          dbFolder.getRoot().getAbsolutePath());
+
+      rocksDB.put("abc1".getBytes(), "abc1".getBytes());
+      rocksDB.put("abc2".getBytes(), "abc2".getBytes());
+      rocksDB.put("abc3".getBytes(), "abc3".getBytes());
+
+      rocksIterator = rocksDB.newIterator();
+      // Iterate over keys using an iterator
+      rocksIterator.seekToFirst();
+      assertThat(rocksIterator.isValid()).isTrue();
+      assertThat(rocksIterator.key()).isEqualTo(
+          "abc3".getBytes());
+      assertThat(rocksIterator.value()).isEqualTo(
+          "abc3".getBytes());
+      rocksIterator.next();
+      assertThat(rocksIterator.isValid()).isTrue();
+      assertThat(rocksIterator.key()).isEqualTo(
+          "abc2".getBytes());
+      assertThat(rocksIterator.value()).isEqualTo(
+          "abc2".getBytes());
+      rocksIterator.next();
+      assertThat(rocksIterator.isValid()).isTrue();
+      assertThat(rocksIterator.key()).isEqualTo(
+          "abc1".getBytes());
+      assertThat(rocksIterator.value()).isEqualTo(
+          "abc1".getBytes());
+      rocksIterator.next();
+      assertThat(rocksIterator.isValid()).isFalse();
+      // Get last one
+      rocksIterator.seekToLast();
+      assertThat(rocksIterator.isValid()).isTrue();
+      assertThat(rocksIterator.key()).isEqualTo(
+          "abc1".getBytes());
+      assertThat(rocksIterator.value()).isEqualTo(
+          "abc1".getBytes());
+      // Invalid: "abc" sorts after "abc1", the last key in reverse order
+      rocksIterator.seek("abc".getBytes());
+      assertThat(rocksIterator.isValid()).isFalse();
+      // Seeking to "abc999" lands on "abc3", the next
+      // key in reverse bytewise order
+      rocksIterator.seek("abc999".getBytes());
+      assertThat(rocksIterator.key()).isEqualTo(
+          "abc3".getBytes());
+      assertThat(rocksIterator.value()).isEqualTo(
+          "abc3".getBytes());
+    } finally {
+      if (rocksIterator != null) {
+        rocksIterator.dispose();
+      }
+      if (rocksDB != null) {
+        rocksDB.close();
+      }
+      if (options != null) {
+        options.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void builtinComparatorEnum() {
+    assertThat(BuiltinComparator.BYTEWISE_COMPARATOR.ordinal())
+        .isEqualTo(0);
+    assertThat(
+        BuiltinComparator.REVERSE_BYTEWISE_COMPARATOR.ordinal())
+        .isEqualTo(1);
+    assertThat(BuiltinComparator.values().length).isEqualTo(2);
+    assertThat(BuiltinComparator.valueOf("BYTEWISE_COMPARATOR")).
+        isEqualTo(BuiltinComparator.BYTEWISE_COMPARATOR);
+  }
+}
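The subclassing pattern exercised above is how an application installs a custom key ordering. Below is a minimal sketch; the class name, database path, and reverse ordering are illustrative assumptions, and it presumes Options exposes the setComparator(AbstractComparator) overload that AbstractComparatorTest relies on internally.

    import org.rocksdb.*;

    public class ReverseOrderExample {
      public static void main(final String[] args) throws RocksDBException {
        RocksDB.loadLibrary();
        final Options options = new Options().setCreateIfMissing(true);
        // Subclass Comparator exactly as ComparatorTest does.
        final Comparator reverse = new Comparator(new ComparatorOptions()) {
          @Override
          public String name() {
            return "example.ReverseComparator";
          }

          @Override
          public int compare(final Slice a, final Slice b) {
            // Negate the natural ordering so iteration runs backwards.
            return -a.toString().compareTo(b.toString());
          }
        };
        options.setComparator(reverse);
        final RocksDB db = RocksDB.open(options, "/tmp/reverse-example");
        try {
          db.put("a".getBytes(), "1".getBytes());
          db.put("b".getBytes(), "2".getBytes());
          // seekToFirst() on an iterator now yields "b" before "a".
        } finally {
          db.close();
          options.dispose();
        }
      }
    }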
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/CompressionOptionsTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/CompressionOptionsTest.java
new file mode 100644
index 0000000..bff4d5f
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/CompressionOptionsTest.java
@@ -0,0 +1,21 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+import org.junit.Test;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class CompressionOptionsTest {
+  @Test
+  public void getCompressionType() {
+    for (CompressionType compressionType : CompressionType.values()) {
+      String libraryName = compressionType.getLibraryName();
+      assertThat(CompressionType.getCompressionType(libraryName)).
+          isEqualTo(compressionType);
+    }
+  }
+}
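The loop above checks that every enum constant survives a round-trip through its library name. Spelled out for one constant, as a sketch assuming the usual mapping in which SNAPPY_COMPRESSION reports the library name "snappy":

    import org.rocksdb.CompressionType;

    public class CompressionLookupExample {
      public static void main(final String[] args) {
        // Resolve an enum constant from its native library name.
        final CompressionType type =
            CompressionType.getCompressionType("snappy");
        if (type != CompressionType.SNAPPY_COMPRESSION) {
          throw new AssertionError("unexpected mapping: " + type);
        }
      }
    }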
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/DBOptionsTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/DBOptionsTest.java
new file mode 100644
index 0000000..98ba4ce
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/DBOptionsTest.java
@@ -0,0 +1,570 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+import org.junit.ClassRule;
+import org.junit.Test;
+
+import java.util.Properties;
+import java.util.Random;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class DBOptionsTest {
+
+  @ClassRule
+  public static final RocksMemoryResource rocksMemoryResource =
+      new RocksMemoryResource();
+
+  public static final Random rand = PlatformRandomHelper.
+      getPlatformSpecificRandomFactory();
+
+  @Test
+  public void getDBOptionsFromProps() {
+    DBOptions opt = null;
+    try {
+      // setup sample properties
+      Properties properties = new Properties();
+      properties.put("allow_mmap_reads", "true");
+      properties.put("bytes_per_sync", "13");
+      opt = DBOptions.getDBOptionsFromProps(properties);
+      assertThat(opt).isNotNull();
+      assertThat(String.valueOf(opt.allowMmapReads())).
+          isEqualTo(properties.get("allow_mmap_reads"));
+      assertThat(String.valueOf(opt.bytesPerSync())).
+          isEqualTo(properties.get("bytes_per_sync"));
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void failDBOptionsFromPropsWithIllegalValue() {
+    DBOptions opt = null;
+    try {
+      // setup sample properties
+      Properties properties = new Properties();
+      properties.put("tomato", "1024");
+      properties.put("burger", "2");
+      opt = DBOptions.
+          getDBOptionsFromProps(properties);
+      assertThat(opt).isNull();
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test(expected = IllegalArgumentException.class)
+  public void failDBOptionsFromPropsWithNullValue() {
+    DBOptions.getDBOptionsFromProps(null);
+  }
+
+  @Test(expected = IllegalArgumentException.class)
+  public void failDBOptionsFromPropsWithEmptyProps() {
+    DBOptions.getDBOptionsFromProps(
+        new Properties());
+  }
+
+  @Test
+  public void setIncreaseParallelism() {
+    DBOptions opt = null;
+    try {
+      opt = new DBOptions();
+      final int threads = Runtime.getRuntime().availableProcessors() * 2;
+      opt.setIncreaseParallelism(threads);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void createIfMissing() {
+    DBOptions opt = null;
+    try {
+      opt = new DBOptions();
+      boolean boolValue = rand.nextBoolean();
+      opt.setCreateIfMissing(boolValue);
+      assertThat(opt.createIfMissing()).
+          isEqualTo(boolValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void createMissingColumnFamilies() {
+    DBOptions opt = null;
+    try {
+      opt = new DBOptions();
+      boolean boolValue = rand.nextBoolean();
+      opt.setCreateMissingColumnFamilies(boolValue);
+      assertThat(opt.createMissingColumnFamilies()).
+          isEqualTo(boolValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void errorIfExists() {
+    DBOptions opt = null;
+    try {
+      opt = new DBOptions();
+      boolean boolValue = rand.nextBoolean();
+      opt.setErrorIfExists(boolValue);
+      assertThat(opt.errorIfExists()).isEqualTo(boolValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void paranoidChecks() {
+    DBOptions opt = null;
+    try {
+      opt = new DBOptions();
+      boolean boolValue = rand.nextBoolean();
+      opt.setParanoidChecks(boolValue);
+      assertThat(opt.paranoidChecks()).
+          isEqualTo(boolValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void maxTotalWalSize() {
+    DBOptions opt = null;
+    try {
+      opt = new DBOptions();
+      long longValue = rand.nextLong();
+      opt.setMaxTotalWalSize(longValue);
+      assertThat(opt.maxTotalWalSize()).
+          isEqualTo(longValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void maxOpenFiles() {
+    DBOptions opt = null;
+    try {
+      opt = new DBOptions();
+      int intValue = rand.nextInt();
+      opt.setMaxOpenFiles(intValue);
+      assertThat(opt.maxOpenFiles()).isEqualTo(intValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void disableDataSync() {
+    DBOptions opt = null;
+    try {
+      opt = new DBOptions();
+      boolean boolValue = rand.nextBoolean();
+      opt.setDisableDataSync(boolValue);
+      assertThat(opt.disableDataSync()).
+          isEqualTo(boolValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void useFsync() {
+    DBOptions opt = null;
+    try {
+      opt = new DBOptions();
+      boolean boolValue = rand.nextBoolean();
+      opt.setUseFsync(boolValue);
+      assertThat(opt.useFsync()).isEqualTo(boolValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void dbLogDir() {
+    DBOptions opt = null;
+    try {
+      opt = new DBOptions();
+      String str = "path/to/DbLogDir";
+      opt.setDbLogDir(str);
+      assertThat(opt.dbLogDir()).isEqualTo(str);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void walDir() {
+    DBOptions opt = null;
+    try {
+      opt = new DBOptions();
+      String str = "path/to/WalDir";
+      opt.setWalDir(str);
+      assertThat(opt.walDir()).isEqualTo(str);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void deleteObsoleteFilesPeriodMicros() {
+    DBOptions opt = null;
+    try {
+      opt = new DBOptions();
+      long longValue = rand.nextLong();
+      opt.setDeleteObsoleteFilesPeriodMicros(longValue);
+      assertThat(opt.deleteObsoleteFilesPeriodMicros()).
+          isEqualTo(longValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void maxBackgroundCompactions() {
+    DBOptions opt = null;
+    try {
+      opt = new DBOptions();
+      int intValue = rand.nextInt();
+      opt.setMaxBackgroundCompactions(intValue);
+      assertThat(opt.maxBackgroundCompactions()).
+          isEqualTo(intValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void maxBackgroundFlushes() {
+    DBOptions opt = null;
+    try {
+      opt = new DBOptions();
+      int intValue = rand.nextInt();
+      opt.setMaxBackgroundFlushes(intValue);
+      assertThat(opt.maxBackgroundFlushes()).
+          isEqualTo(intValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void maxLogFileSize() throws RocksDBException {
+    DBOptions opt = null;
+    try {
+      opt = new DBOptions();
+      long longValue = rand.nextLong();
+      opt.setMaxLogFileSize(longValue);
+      assertThat(opt.maxLogFileSize()).isEqualTo(longValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void logFileTimeToRoll() throws RocksDBException {
+    DBOptions opt = null;
+    try {
+      opt = new DBOptions();
+      long longValue = rand.nextLong();
+      opt.setLogFileTimeToRoll(longValue);
+      assertThat(opt.logFileTimeToRoll()).
+          isEqualTo(longValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void keepLogFileNum() throws RocksDBException {
+    DBOptions opt = null;
+    try {
+      opt = new DBOptions();
+      long longValue = rand.nextLong();
+      opt.setKeepLogFileNum(longValue);
+      assertThat(opt.keepLogFileNum()).isEqualTo(longValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void maxManifestFileSize() {
+    DBOptions opt = null;
+    try {
+      opt = new DBOptions();
+      long longValue = rand.nextLong();
+      opt.setMaxManifestFileSize(longValue);
+      assertThat(opt.maxManifestFileSize()).
+          isEqualTo(longValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void tableCacheNumshardbits() {
+    DBOptions opt = null;
+    try {
+      opt = new DBOptions();
+      int intValue = rand.nextInt();
+      opt.setTableCacheNumshardbits(intValue);
+      assertThat(opt.tableCacheNumshardbits()).
+          isEqualTo(intValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void walSizeLimitMB() {
+    DBOptions opt = null;
+    try {
+      opt = new DBOptions();
+      long longValue = rand.nextLong();
+      opt.setWalSizeLimitMB(longValue);
+      assertThat(opt.walSizeLimitMB()).isEqualTo(longValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void walTtlSeconds() {
+    DBOptions opt = null;
+    try {
+      opt = new DBOptions();
+      long longValue = rand.nextLong();
+      opt.setWalTtlSeconds(longValue);
+      assertThat(opt.walTtlSeconds()).isEqualTo(longValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void manifestPreallocationSize() throws RocksDBException {
+    DBOptions opt = null;
+    try {
+      opt = new DBOptions();
+      long longValue = rand.nextLong();
+      opt.setManifestPreallocationSize(longValue);
+      assertThat(opt.manifestPreallocationSize()).
+          isEqualTo(longValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void allowOsBuffer() {
+    DBOptions opt = null;
+    try {
+      opt = new DBOptions();
+      boolean boolValue = rand.nextBoolean();
+      opt.setAllowOsBuffer(boolValue);
+      assertThat(opt.allowOsBuffer()).isEqualTo(boolValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void allowMmapReads() {
+    DBOptions opt = null;
+    try {
+      opt = new DBOptions();
+      boolean boolValue = rand.nextBoolean();
+      opt.setAllowMmapReads(boolValue);
+      assertThat(opt.allowMmapReads()).isEqualTo(boolValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void allowMmapWrites() {
+    DBOptions opt = null;
+    try {
+      opt = new DBOptions();
+      boolean boolValue = rand.nextBoolean();
+      opt.setAllowMmapWrites(boolValue);
+      assertThat(opt.allowMmapWrites()).isEqualTo(boolValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void isFdCloseOnExec() {
+    DBOptions opt = null;
+    try {
+      opt = new DBOptions();
+      boolean boolValue = rand.nextBoolean();
+      opt.setIsFdCloseOnExec(boolValue);
+      assertThat(opt.isFdCloseOnExec()).isEqualTo(boolValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void statsDumpPeriodSec() {
+    DBOptions opt = null;
+    try {
+      opt = new DBOptions();
+      int intValue = rand.nextInt();
+      opt.setStatsDumpPeriodSec(intValue);
+      assertThat(opt.statsDumpPeriodSec()).isEqualTo(intValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void adviseRandomOnOpen() {
+    DBOptions opt = null;
+    try {
+      opt = new DBOptions();
+      boolean boolValue = rand.nextBoolean();
+      opt.setAdviseRandomOnOpen(boolValue);
+      assertThat(opt.adviseRandomOnOpen()).isEqualTo(boolValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void useAdaptiveMutex() {
+    DBOptions opt = null;
+    try {
+      opt = new DBOptions();
+      boolean boolValue = rand.nextBoolean();
+      opt.setUseAdaptiveMutex(boolValue);
+      assertThat(opt.useAdaptiveMutex()).isEqualTo(boolValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void bytesPerSync() {
+    DBOptions opt = null;
+    try {
+      opt = new DBOptions();
+      long longValue = rand.nextLong();
+      opt.setBytesPerSync(longValue);
+      assertThat(opt.bytesPerSync()).isEqualTo(longValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void rateLimiterConfig() {
+    DBOptions options = null;
+    DBOptions anotherOptions = null;
+    try {
+      options = new DBOptions();
+      RateLimiterConfig rateLimiterConfig =
+          new GenericRateLimiterConfig(1000, 100 * 1000, 1);
+      options.setRateLimiterConfig(rateLimiterConfig);
+      // Test with parameter initialization
+      anotherOptions = new DBOptions();
+      anotherOptions.setRateLimiterConfig(
+          new GenericRateLimiterConfig(1000));
+    } finally {
+      if (options != null) {
+        options.dispose();
+      }
+      if (anotherOptions != null) {
+        anotherOptions.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void statistics() {
+    DBOptions options = new DBOptions();
+    Statistics statistics = options.createStatistics().
+        statisticsPtr();
+    assertThat(statistics).isNotNull();
+
+    DBOptions anotherOptions = new DBOptions();
+    statistics = anotherOptions.statisticsPtr();
+    assertThat(statistics).isNotNull();
+  }
+}
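As the tests above show, getDBOptionsFromProps returns null for unrecognised keys rather than throwing, so callers should check the result. A sketch of loading options from disk; the file name and the option keys inside it are illustrative assumptions:

    import java.io.FileInputStream;
    import java.io.IOException;
    import java.util.Properties;

    import org.rocksdb.DBOptions;
    import org.rocksdb.RocksDB;

    public class OptionsFromPropsExample {
      public static void main(final String[] args) throws IOException {
        RocksDB.loadLibrary();
        final Properties props = new Properties();
        try (FileInputStream in = new FileInputStream("rocksdb.properties")) {
          props.load(in);
        }
        final DBOptions opts = DBOptions.getDBOptionsFromProps(props);
        if (opts == null) {
          // Mirrors failDBOptionsFromPropsWithIllegalValue above.
          throw new IllegalStateException("unrecognised option key");
        }
        try {
          // ... open a database with opts here ...
        } finally {
          opts.dispose();
        }
      }
    }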
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/DirectComparatorTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/DirectComparatorTest.java
new file mode 100644
index 0000000..be84d66
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/DirectComparatorTest.java
@@ -0,0 +1,52 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+import java.io.IOException;
+import java.nio.file.FileSystems;
+
+public class DirectComparatorTest {
+  @ClassRule
+  public static final RocksMemoryResource rocksMemoryResource =
+      new RocksMemoryResource();
+
+  @Rule
+  public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  @Test
+  public void directComparator() throws IOException, RocksDBException {
+
+    final AbstractComparatorTest comparatorTest = new AbstractComparatorTest() {
+      @Override
+      public AbstractComparator getAscendingIntKeyComparator() {
+        return new DirectComparator(new ComparatorOptions()) {
+
+          @Override
+          public String name() {
+            return "test.AscendingIntKeyDirectComparator";
+          }
+
+          @Override
+          public int compare(final DirectSlice a, final DirectSlice b) {
+            final byte[] ax = new byte[4], bx = new byte[4];
+            a.data().get(ax);
+            b.data().get(bx);
+            return compareIntKeys(ax, bx);
+          }
+        };
+      }
+    };
+
+    // verify that keys written through the DirectComparator read back unchanged
+    comparatorTest.testRoundtrip(FileSystems.getDefault().getPath(
+        dbFolder.getRoot().getAbsolutePath()));
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/DirectSliceTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/DirectSliceTest.java
new file mode 100644
index 0000000..123eed2
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/DirectSliceTest.java
@@ -0,0 +1,106 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+package org.rocksdb;
+
+import org.junit.ClassRule;
+import org.junit.Test;
+
+import java.nio.ByteBuffer;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class DirectSliceTest {
+  @ClassRule
+  public static final RocksMemoryResource rocksMemoryResource =
+      new RocksMemoryResource();
+
+  @Test
+  public void directSlice() {
+    DirectSlice directSlice = null;
+    DirectSlice otherSlice = null;
+    try {
+      directSlice = new DirectSlice("abc");
+      otherSlice = new DirectSlice("abc");
+      assertThat(directSlice.toString()).isEqualTo("abc");
+      // clear first slice
+      directSlice.clear();
+      assertThat(directSlice.toString()).isEmpty();
+      // get first char in otherslice
+      assertThat(otherSlice.get(0)).isEqualTo("a".getBytes()[0]);
+      // remove prefix
+      otherSlice.removePrefix(1);
+      assertThat(otherSlice.toString()).isEqualTo("bc");
+    } finally {
+      if (directSlice != null) {
+        directSlice.dispose();
+      }
+      if (otherSlice != null) {
+        otherSlice.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void directSliceWithByteBuffer() {
+    DirectSlice directSlice = null;
+    try {
+      byte[] data = "Some text".getBytes();
+      ByteBuffer buffer = ByteBuffer.allocateDirect(data.length + 1);
+      buffer.put(data);
+      buffer.put(data.length, (byte)0);
+
+      directSlice = new DirectSlice(buffer);
+      assertThat(directSlice.toString()).isEqualTo("Some text");
+    } finally {
+      if (directSlice != null) {
+        directSlice.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void directSliceWithByteBufferAndLength() {
+    DirectSlice directSlice = null;
+    try {
+      byte[] data = "Some text".getBytes();
+      ByteBuffer buffer = ByteBuffer.allocateDirect(data.length);
+      buffer.put(data);
+      directSlice = new DirectSlice(buffer, 4);
+      assertThat(directSlice.toString()).isEqualTo("Some");
+    } finally {
+      if (directSlice != null) {
+        directSlice.dispose();
+      }
+    }
+  }
+
+  @Test(expected = AssertionError.class)
+  public void directSliceInitWithoutDirectAllocation() {
+    DirectSlice directSlice = null;
+    try {
+      byte[] data = "Some text".getBytes();
+      ByteBuffer buffer = ByteBuffer.wrap(data);
+      directSlice = new DirectSlice(buffer);
+    } finally {
+      if (directSlice != null) {
+        directSlice.dispose();
+      }
+    }
+  }
+
+  @Test(expected = AssertionError.class)
+  public void directSlicePrefixInitWithoutDirectAllocation() {
+    DirectSlice directSlice = null;
+    try {
+      byte[] data = "Some text".getBytes();
+      ByteBuffer buffer = ByteBuffer.wrap(data);
+      directSlice = new DirectSlice(buffer, 4);
+    } finally {
+      if (directSlice != null) {
+        directSlice.dispose();
+      }
+    }
+  }
+}
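As the two expected-AssertionError tests show, DirectSlice accepts only off-heap buffers. A sketch of the allocateDirect pattern the passing tests rely on; the class name and sample string are assumptions:

    import java.nio.ByteBuffer;

    import org.rocksdb.DirectSlice;
    import org.rocksdb.RocksDB;

    public class DirectSliceExample {
      public static void main(final String[] args) {
        RocksDB.loadLibrary();
        final byte[] data = "hello world".getBytes();
        // Must be a direct buffer; ByteBuffer.wrap(data) would trip the
        // AssertionError exercised by the tests above.
        final ByteBuffer buffer = ByteBuffer.allocateDirect(data.length);
        buffer.put(data);
        final DirectSlice slice = new DirectSlice(buffer, 5);
        try {
          System.out.println(slice.toString()); // prints "hello"
        } finally {
          slice.dispose();
        }
      }
    }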
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/FilterTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/FilterTest.java
new file mode 100644
index 0000000..36ce379
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/FilterTest.java
@@ -0,0 +1,47 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+import org.junit.ClassRule;
+import org.junit.Test;
+
+public class FilterTest {
+
+  @ClassRule
+  public static final RocksMemoryResource rocksMemoryResource =
+      new RocksMemoryResource();
+
+  @Test
+  public void filter() {
+    Options options = null;
+    try {
+      options = new Options();
+      // test table config
+      options.setTableFormatConfig(new BlockBasedTableConfig().
+          setFilter(new BloomFilter()));
+      options.dispose();
+      System.gc();
+      System.runFinalization();
+      // new Bloom filter
+      options = new Options();
+      BlockBasedTableConfig blockConfig = new BlockBasedTableConfig();
+      blockConfig.setFilter(new BloomFilter());
+      options.setTableFormatConfig(blockConfig);
+      BloomFilter bloomFilter = new BloomFilter(10);
+      blockConfig.setFilter(bloomFilter);
+      options.setTableFormatConfig(blockConfig);
+      System.gc();
+      System.runFinalization();
+      blockConfig.setFilter(new BloomFilter(10, false));
+      options.setTableFormatConfig(blockConfig);
+
+    } finally {
+      if (options != null) {
+        options.dispose();
+      }
+    }
+  }
+}
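A sketch of the configuration this test drives: attaching a Bloom filter to the block-based table format lets point lookups skip SST files that cannot contain the key. The bits-per-key value and the path are assumptions:

    import org.rocksdb.*;

    public class BloomFilterExample {
      public static void main(final String[] args) throws RocksDBException {
        RocksDB.loadLibrary();
        final Options options = new Options().setCreateIfMissing(true);
        // ~10 bits per key is the conventional space/accuracy trade-off.
        final BlockBasedTableConfig tableConfig =
            new BlockBasedTableConfig().setFilter(new BloomFilter(10));
        options.setTableFormatConfig(tableConfig);
        final RocksDB db = RocksDB.open(options, "/tmp/bloom-example");
        try {
          db.put("key".getBytes(), "value".getBytes());
        } finally {
          db.close();
          options.dispose();
        }
      }
    }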
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/FlushTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/FlushTest.java
new file mode 100644
index 0000000..94a32d3
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/FlushTest.java
@@ -0,0 +1,64 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+package org.rocksdb;
+
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class FlushTest {
+
+  @ClassRule
+  public static final RocksMemoryResource rocksMemoryResource =
+      new RocksMemoryResource();
+
+  @Rule
+  public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  @Test
+  public void flush() throws RocksDBException {
+    RocksDB db = null;
+    Options options = null;
+    WriteOptions wOpt = null;
+    FlushOptions flushOptions = null;
+    try {
+      options = new Options();
+      // Setup options
+      options.setCreateIfMissing(true);
+      options.setMaxWriteBufferNumber(10);
+      options.setMinWriteBufferNumberToMerge(10);
+      wOpt = new WriteOptions();
+      flushOptions = new FlushOptions();
+      flushOptions.setWaitForFlush(true);
+      assertThat(flushOptions.waitForFlush()).isTrue();
+      wOpt.setDisableWAL(true);
+      db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath());
+      db.put(wOpt, "key1".getBytes(), "value1".getBytes());
+      db.put(wOpt, "key2".getBytes(), "value2".getBytes());
+      db.put(wOpt, "key3".getBytes(), "value3".getBytes());
+      db.put(wOpt, "key4".getBytes(), "value4".getBytes());
+      assertThat(db.getProperty("rocksdb.num-entries-active-mem-table")).isEqualTo("4");
+      db.flush(flushOptions);
+      assertThat(db.getProperty("rocksdb.num-entries-active-mem-table")).
+          isEqualTo("0");
+    } finally {
+      if (flushOptions != null) {
+        flushOptions.dispose();
+      }
+      if (db != null) {
+        db.close();
+      }
+      if (options != null) {
+        options.dispose();
+      }
+      if (wOpt != null) {
+        wOpt.dispose();
+      }
+    }
+  }
+}
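Because the test disables the WAL, the explicit flush is what makes the writes durable. A condensed sketch of that pattern, with an assumed path and class name:

    import org.rocksdb.*;

    public class FlushExample {
      public static void main(final String[] args) throws RocksDBException {
        RocksDB.loadLibrary();
        final Options options = new Options().setCreateIfMissing(true);
        final FlushOptions flushOptions = new FlushOptions();
        flushOptions.setWaitForFlush(true);
        final WriteOptions writeOptions = new WriteOptions();
        writeOptions.setDisableWAL(true); // unflushed writes may be lost
        final RocksDB db = RocksDB.open(options, "/tmp/flush-example");
        try {
          db.put(writeOptions, "key".getBytes(), "value".getBytes());
          db.flush(flushOptions); // block until the memtable is on disk
        } finally {
          db.close();
          writeOptions.dispose();
          flushOptions.dispose();
          options.dispose();
        }
      }
    }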
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/InfoLogLevelTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/InfoLogLevelTest.java
new file mode 100644
index 0000000..630666b
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/InfoLogLevelTest.java
@@ -0,0 +1,134 @@
+package org.rocksdb;
+
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+import java.io.IOException;
+
+import static java.nio.file.Files.readAllBytes;
+import static java.nio.file.Paths.get;
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class InfoLogLevelTest {
+
+  @ClassRule
+  public static final RocksMemoryResource rocksMemoryResource =
+      new RocksMemoryResource();
+
+  @Rule
+  public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  @Test
+  public void testInfoLogLevel() throws RocksDBException,
+      IOException {
+    RocksDB db = null;
+    try {
+      db = RocksDB.open(dbFolder.getRoot().getAbsolutePath());
+      db.put("key".getBytes(), "value".getBytes());
+      assertThat(getLogContentsWithoutHeader()).isNotEmpty();
+    } finally {
+      if (db != null) {
+        db.close();
+      }
+    }
+  }
+
+  @Test
+  public void testFatalLogLevel() throws RocksDBException,
+      IOException {
+    RocksDB db = null;
+    Options options = null;
+    try {
+      options = new Options().
+          setCreateIfMissing(true).
+          setInfoLogLevel(InfoLogLevel.FATAL_LEVEL);
+      assertThat(options.infoLogLevel()).
+          isEqualTo(InfoLogLevel.FATAL_LEVEL);
+      db = RocksDB.open(options,
+          dbFolder.getRoot().getAbsolutePath());
+      db.put("key".getBytes(), "value".getBytes());
+      // As InfoLogLevel is set to FATAL_LEVEL, here we expect the log
+      // content to be empty.
+      assertThat(getLogContentsWithoutHeader()).isEmpty();
+    } finally {
+      if (db != null) {
+        db.close();
+      }
+      if (options != null) {
+        options.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void testFatalLogLevelWithDBOptions()
+      throws RocksDBException, IOException {
+    RocksDB db = null;
+    Options options = null;
+    DBOptions dbOptions = null;
+    try {
+      dbOptions = new DBOptions().
+          setInfoLogLevel(InfoLogLevel.FATAL_LEVEL);
+      options = new Options(dbOptions,
+          new ColumnFamilyOptions()).
+          setCreateIfMissing(true);
+      assertThat(dbOptions.infoLogLevel()).
+          isEqualTo(InfoLogLevel.FATAL_LEVEL);
+      assertThat(options.infoLogLevel()).
+          isEqualTo(InfoLogLevel.FATAL_LEVEL);
+      db = RocksDB.open(options,
+          dbFolder.getRoot().getAbsolutePath());
+      db.put("key".getBytes(), "value".getBytes());
+      assertThat(getLogContentsWithoutHeader()).isEmpty();
+    } finally {
+      if (db != null) {
+        db.close();
+      }
+      if (options != null) {
+        options.dispose();
+      }
+      if (dbOptions != null) {
+        dbOptions.dispose();
+      }
+    }
+  }
+
+  @Test(expected = IllegalArgumentException.class)
+  public void failIfIllegalByteValueProvided() {
+    InfoLogLevel.getInfoLogLevel((byte)-1);
+  }
+
+  @Test
+  public void valueOf() {
+    assertThat(InfoLogLevel.valueOf("DEBUG_LEVEL")).
+        isEqualTo(InfoLogLevel.DEBUG_LEVEL);
+  }
+
+  /**
+   * Read LOG file contents into String.
+   *
+   * @return LOG file contents as String.
+   * @throws IOException if file is not found.
+   */
+  private String getLogContentsWithoutHeader() throws IOException {
+    final String separator = System.getProperty("line.separator");
+    final String[] lines = new String(readAllBytes(get(
+        dbFolder.getRoot().getAbsolutePath() + "/LOG"))).split(separator);
+
+    int firstNonHeader = lines.length;
+    // Identify the last line of the header
+    for (int i = lines.length - 1; i >= 0; --i) {
+      if (lines[i].indexOf("Options.") >= 0 && lines[i].indexOf(':') >= 0) {
+        firstNonHeader = i + 1;
+        break;
+      }
+    }
+    StringBuilder builder = new StringBuilder();
+    for (int i = firstNonHeader; i < lines.length; ++i) {
+      builder.append(lines[i]).append(separator);
+    }
+    }
+    return builder.toString();
+  }
+}
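A sketch of tuning LOG verbosity at open time; WARN_LEVEL is assumed to sit between the INFO_LEVEL and FATAL_LEVEL constants the tests use, and the path is illustrative:

    import org.rocksdb.*;

    public class LogLevelExample {
      public static void main(final String[] args) throws RocksDBException {
        RocksDB.loadLibrary();
        final Options options = new Options()
            .setCreateIfMissing(true)
            .setInfoLogLevel(InfoLogLevel.WARN_LEVEL);
        final RocksDB db = RocksDB.open(options, "/tmp/loglevel-example");
        try {
          db.put("key".getBytes(), "value".getBytes()); // not logged
        } finally {
          db.close();
          options.dispose();
        }
      }
    }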
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/KeyMayExistTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/KeyMayExistTest.java
new file mode 100644
index 0000000..b670cad
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/KeyMayExistTest.java
@@ -0,0 +1,95 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+package org.rocksdb;
+
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class KeyMayExistTest {
+
+  @ClassRule
+  public static final RocksMemoryResource rocksMemoryResource =
+      new RocksMemoryResource();
+
+  @Rule
+  public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  @Test
+  public void keyMayExist() throws RocksDBException {
+    RocksDB db = null;
+    DBOptions options = null;
+    List<ColumnFamilyDescriptor> cfDescriptors =
+        new ArrayList<>();
+    List<ColumnFamilyHandle> columnFamilyHandleList =
+        new ArrayList<>();
+    try {
+      options = new DBOptions();
+      options.setCreateIfMissing(true)
+          .setCreateMissingColumnFamilies(true);
+
+      // open database using cf names
+      cfDescriptors.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY));
+      cfDescriptors.add(new ColumnFamilyDescriptor("new_cf".getBytes()));
+      db = RocksDB.open(options,
+          dbFolder.getRoot().getAbsolutePath(),
+          cfDescriptors, columnFamilyHandleList);
+      assertThat(columnFamilyHandleList.size()).
+          isEqualTo(2);
+      db.put("key".getBytes(), "value".getBytes());
+      // Test without column family
+      StringBuffer retValue = new StringBuffer();
+      boolean exists = db.keyMayExist("key".getBytes(), retValue);
+      assertThat(exists).isTrue();
+      assertThat(retValue.toString()).
+          isEqualTo("value");
+
+      // Test without column family but with readOptions
+      retValue = new StringBuffer();
+      exists = db.keyMayExist(new ReadOptions(), "key".getBytes(),
+          retValue);
+      assertThat(exists).isTrue();
+      assertThat(retValue.toString()).
+          isEqualTo("value");
+
+      // Test with column family
+      retValue = new StringBuffer();
+      exists = db.keyMayExist(columnFamilyHandleList.get(0), "key".getBytes(),
+          retValue);
+      assertThat(exists).isTrue();
+      assertThat(retValue.toString()).
+          isEqualTo("value");
+
+      // Test with column family and readOptions
+      retValue = new StringBuffer();
+      exists = db.keyMayExist(new ReadOptions(),
+          columnFamilyHandleList.get(0), "key".getBytes(),
+          retValue);
+      assertThat(exists).isTrue();
+      assertThat(retValue.toString()).
+          isEqualTo("value");
+
+      // KeyMayExist in CF1 must return false
+      assertThat(db.keyMayExist(columnFamilyHandleList.get(1),
+          "key".getBytes(), retValue)).isFalse();
+    } finally {
+      for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) {
+        columnFamilyHandle.dispose();
+      }
+      if (db != null) {
+        db.close();
+      }
+      if (options != null) {
+        options.dispose();
+      }
+    }
+  }
+}
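keyMayExist is a cheap negative filter: false definitively rules the key out, while true can be a false positive unless the StringBuffer was filled from in-memory data. A sketch with an assumed path:

    import org.rocksdb.*;

    public class KeyMayExistExample {
      public static void main(final String[] args) throws RocksDBException {
        RocksDB.loadLibrary();
        final Options options = new Options().setCreateIfMissing(true);
        final RocksDB db = RocksDB.open(options, "/tmp/kme-example");
        try {
          db.put("present".getBytes(), "value".getBytes());
          final StringBuffer value = new StringBuffer();
          if (!db.keyMayExist("absent".getBytes(), value)) {
            System.out.println("definitely absent, full get skipped");
          } else if (value.length() > 0) {
            System.out.println("found in memory: " + value);
          } else {
            System.out.println("maybe present, a get is still needed");
          }
        } finally {
          db.close();
          options.dispose();
        }
      }
    }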
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/LoggerTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/LoggerTest.java
new file mode 100644
index 0000000..2eff319
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/LoggerTest.java
@@ -0,0 +1,218 @@
+package org.rocksdb;
+
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class LoggerTest {
+  @ClassRule
+  public static final RocksMemoryResource rocksMemoryResource =
+      new RocksMemoryResource();
+
+  @Rule
+  public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  private final AtomicInteger logMessageCounter = new AtomicInteger();
+
+  @Test
+  public void customLogger() throws RocksDBException {
+    RocksDB db = null;
+    logMessageCounter.set(0);
+    try {
+      // Setup options
+      final Options options = new Options().
+          setInfoLogLevel(InfoLogLevel.DEBUG_LEVEL).
+          setCreateIfMissing(true);
+
+      // Create new logger with max log level passed by options
+      Logger logger = new Logger(options) {
+        @Override
+        protected void log(InfoLogLevel infoLogLevel, String logMsg) {
+          assertThat(logMsg).isNotNull();
+          assertThat(logMsg.length()).isGreaterThan(0);
+          logMessageCounter.incrementAndGet();
+        }
+      };
+
+      // Set custom logger to options
+      options.setLogger(logger);
+
+      db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath());
+
+      // there should be more than zero received log messages in
+      // debug level.
+      assertThat(logMessageCounter.get()).isGreaterThan(0);
+    } finally {
+      if (db != null) {
+        db.close();
+      }
+    }
+    logMessageCounter.set(0);
+  }
+
+  @Test
+  public void fatalLogger() throws RocksDBException {
+    RocksDB db = null;
+    logMessageCounter.set(0);
+
+    try {
+      // Setup options
+      final Options options = new Options().
+          setInfoLogLevel(InfoLogLevel.FATAL_LEVEL).
+          setCreateIfMissing(true);
+
+      // Create new logger with max log level passed by options
+      Logger logger = new Logger(options) {
+        @Override
+        protected void log(InfoLogLevel infoLogLevel, String logMsg) {
+          assertThat(logMsg).isNotNull();
+          assertThat(logMsg.length()).isGreaterThan(0);
+          logMessageCounter.incrementAndGet();
+        }
+      };
+
+      // Set custom logger to options
+      options.setLogger(logger);
+
+      db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath());
+
+      // there should be zero messages
+      // using fatal level as log level.
+      assertThat(logMessageCounter.get()).isEqualTo(0);
+    } finally {
+      if (db != null) {
+        db.close();
+      }
+    }
+    logMessageCounter.set(0);
+  }
+
+  @Test
+  public void dbOptionsLogger() throws RocksDBException {
+    RocksDB db = null;
+    Logger logger = null;
+    List<ColumnFamilyHandle> cfHandles = new ArrayList<>();
+    List<ColumnFamilyDescriptor> cfDescriptors = new ArrayList<>();
+    cfDescriptors.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY));
+
+    logMessageCounter.set(0);
+    try {
+      // Setup options
+      final DBOptions options = new DBOptions().
+          setInfoLogLevel(InfoLogLevel.FATAL_LEVEL).
+          setCreateIfMissing(true);
+
+      // Create new logger with max log level passed by options
+      logger = new Logger(options) {
+        @Override
+        protected void log(InfoLogLevel infoLogLevel, String logMsg) {
+          assertThat(logMsg).isNotNull();
+          assertThat(logMsg.length()).isGreaterThan(0);
+          logMessageCounter.incrementAndGet();
+        }
+      };
+
+      // Set custom logger to options
+      options.setLogger(logger);
+      db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(),
+          cfDescriptors, cfHandles);
+      // there should be zero messages
+      // using fatal level as log level.
+      assertThat(logMessageCounter.get()).isEqualTo(0);
+      logMessageCounter.set(0);
+    } finally {
+      for (ColumnFamilyHandle columnFamilyHandle : cfHandles) {
+        columnFamilyHandle.dispose();
+      }
+      if (db != null) {
+        db.close();
+      }
+      if (logger != null) {
+        logger.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void setInfoLogLevel() {
+    Logger logger = null;
+    try {
+      // Setup options
+      final Options options = new Options().
+          setInfoLogLevel(InfoLogLevel.FATAL_LEVEL).
+          setCreateIfMissing(true);
+
+      // Create new logger with max log level passed by options
+      logger = new Logger(options) {
+        @Override
+        protected void log(InfoLogLevel infoLogLevel, String logMsg) {
+          assertThat(logMsg).isNotNull();
+          assertThat(logMsg.length()).isGreaterThan(0);
+          logMessageCounter.incrementAndGet();
+        }
+      };
+      assertThat(logger.infoLogLevel()).
+          isEqualTo(InfoLogLevel.FATAL_LEVEL);
+      logger.setInfoLogLevel(InfoLogLevel.DEBUG_LEVEL);
+      assertThat(logger.infoLogLevel()).
+          isEqualTo(InfoLogLevel.DEBUG_LEVEL);
+    } finally {
+      if (logger != null) {
+        logger.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void changeLogLevelAtRuntime() throws RocksDBException {
+    RocksDB db = null;
+    logMessageCounter.set(0);
+
+    try {
+      // Setup options
+      final Options options = new Options().
+          setInfoLogLevel(InfoLogLevel.FATAL_LEVEL).
+          setCreateIfMissing(true);
+
+      // Create new logger with max log level passed by options
+      Logger logger = new Logger(options) {
+        @Override
+        protected void log(InfoLogLevel infoLogLevel, String logMsg) {
+          assertThat(logMsg).isNotNull();
+          assertThat(logMsg.length()).isGreaterThan(0);
+          logMessageCounter.incrementAndGet();
+        }
+      };
+
+      // Set custom logger to options
+      options.setLogger(logger);
+      db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath());
+
+      // there should be zero messages
+      // using fatal level as log level.
+      assertThat(logMessageCounter.get()).isEqualTo(0);
+
+      // change log level to debug level
+      logger.setInfoLogLevel(InfoLogLevel.DEBUG_LEVEL);
+
+      db.put("key".getBytes(), "value".getBytes());
+      db.flush(new FlushOptions().setWaitForFlush(true));
+
+      // messages shall be received due to previous actions.
+      assertThat(logMessageCounter.get()).isNotEqualTo(0);
+
+    } finally {
+      if (db != null) {
+        db.close();
+      }
+    }
+    logMessageCounter.set(0);
+  }
+}
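The same subclassing pattern lets an application route the native log into its own logging stack. A sketch that simply forwards to standard error; the class name and path are assumptions:

    import org.rocksdb.*;

    public class ForwardingLoggerExample {
      public static void main(final String[] args) throws RocksDBException {
        RocksDB.loadLibrary();
        final Options options = new Options()
            .setCreateIfMissing(true)
            .setInfoLogLevel(InfoLogLevel.INFO_LEVEL);
        final Logger logger = new Logger(options) {
          @Override
          protected void log(final InfoLogLevel level, final String msg) {
            System.err.println("[rocksdb/" + level + "] " + msg);
          }
        };
        options.setLogger(logger);
        final RocksDB db = RocksDB.open(options, "/tmp/logger-example");
        try {
          db.put("key".getBytes(), "value".getBytes());
        } finally {
          db.close();
          logger.dispose();
          options.dispose();
        }
      }
    }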
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/MemTableTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/MemTableTest.java
new file mode 100644
index 0000000..bfc898c
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/MemTableTest.java
@@ -0,0 +1,137 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+import org.junit.ClassRule;
+import org.junit.Test;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class MemTableTest {
+
+  @ClassRule
+  public static final RocksMemoryResource rocksMemoryResource =
+      new RocksMemoryResource();
+
+  @Test
+  public void hashSkipListMemTable() throws RocksDBException {
+    Options options = null;
+    try {
+      options = new Options();
+      // Test HashSkipListMemTableConfig
+      HashSkipListMemTableConfig memTableConfig =
+          new HashSkipListMemTableConfig();
+      assertThat(memTableConfig.bucketCount()).
+          isEqualTo(1000000);
+      memTableConfig.setBucketCount(2000000);
+      assertThat(memTableConfig.bucketCount()).
+          isEqualTo(2000000);
+      assertThat(memTableConfig.height()).
+          isEqualTo(4);
+      memTableConfig.setHeight(5);
+      assertThat(memTableConfig.height()).
+          isEqualTo(5);
+      assertThat(memTableConfig.branchingFactor()).
+          isEqualTo(4);
+      memTableConfig.setBranchingFactor(6);
+      assertThat(memTableConfig.branchingFactor()).
+          isEqualTo(6);
+      options.setMemTableConfig(memTableConfig);
+    } finally {
+      if (options != null) {
+        options.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void skipListMemTable() throws RocksDBException {
+    Options options = null;
+    try {
+      options = new Options();
+      SkipListMemTableConfig skipMemTableConfig =
+          new SkipListMemTableConfig();
+      assertThat(skipMemTableConfig.lookahead()).
+          isEqualTo(0);
+      skipMemTableConfig.setLookahead(20);
+      assertThat(skipMemTableConfig.lookahead()).
+          isEqualTo(20);
+      options.setMemTableConfig(skipMemTableConfig);
+      options.dispose();
+    } finally {
+      if (options != null) {
+        options.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void hashLinkedListMemTable() throws RocksDBException {
+    Options options = null;
+    try {
+      options = new Options();
+      HashLinkedListMemTableConfig hashLinkedListMemTableConfig =
+          new HashLinkedListMemTableConfig();
+      assertThat(hashLinkedListMemTableConfig.bucketCount()).
+          isEqualTo(50000);
+      hashLinkedListMemTableConfig.setBucketCount(100000);
+      assertThat(hashLinkedListMemTableConfig.bucketCount()).
+          isEqualTo(100000);
+      assertThat(hashLinkedListMemTableConfig.hugePageTlbSize()).
+          isEqualTo(0);
+      hashLinkedListMemTableConfig.setHugePageTlbSize(1);
+      assertThat(hashLinkedListMemTableConfig.hugePageTlbSize()).
+          isEqualTo(1);
+      assertThat(hashLinkedListMemTableConfig.
+          bucketEntriesLoggingThreshold()).
+          isEqualTo(4096);
+      hashLinkedListMemTableConfig.
+          setBucketEntriesLoggingThreshold(200);
+      assertThat(hashLinkedListMemTableConfig.
+          bucketEntriesLoggingThreshold()).
+          isEqualTo(200);
+      assertThat(hashLinkedListMemTableConfig.
+          ifLogBucketDistWhenFlush()).isTrue();
+      hashLinkedListMemTableConfig.
+          setIfLogBucketDistWhenFlush(false);
+      assertThat(hashLinkedListMemTableConfig.
+          ifLogBucketDistWhenFlush()).isFalse();
+      assertThat(hashLinkedListMemTableConfig.
+          thresholdUseSkiplist()).
+          isEqualTo(256);
+      hashLinkedListMemTableConfig.setThresholdUseSkiplist(29);
+      assertThat(hashLinkedListMemTableConfig.
+          thresholdUseSkiplist()).
+          isEqualTo(29);
+      options.setMemTableConfig(hashLinkedListMemTableConfig);
+    } finally {
+      if (options != null) {
+        options.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void vectorMemTable() throws RocksDBException {
+    Options options = null;
+    try {
+      options = new Options();
+      VectorMemTableConfig vectorMemTableConfig =
+          new VectorMemTableConfig();
+      assertThat(vectorMemTableConfig.reservedSize()).
+          isEqualTo(0);
+      vectorMemTableConfig.setReservedSize(123);
+      assertThat(vectorMemTableConfig.reservedSize()).
+          isEqualTo(123);
+      options.setMemTableConfig(vectorMemTableConfig);
+      options.dispose();
+    }  finally {
+      if (options != null) {
+        options.dispose();
+      }
+    }
+  }
+}
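The configs above are mutually exclusive choices for the in-memory write buffer. A sketch installing the plain skip-list variant with the lookahead value from the test; the path is an assumption:

    import org.rocksdb.*;

    public class MemTableExample {
      public static void main(final String[] args) throws RocksDBException {
        RocksDB.loadLibrary();
        final Options options = new Options().setCreateIfMissing(true);
        final SkipListMemTableConfig memTableConfig =
            new SkipListMemTableConfig();
        memTableConfig.setLookahead(20);
        options.setMemTableConfig(memTableConfig);
        final RocksDB db = RocksDB.open(options, "/tmp/memtable-example");
        try {
          db.put("key".getBytes(), "value".getBytes());
        } finally {
          db.close();
          options.dispose();
        }
      }
    }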
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/MergeTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/MergeTest.java
new file mode 100644
index 0000000..a5f8e1f
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/MergeTest.java
@@ -0,0 +1,302 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+import java.util.List;
+import java.util.ArrayList;
+
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class MergeTest {
+
+  @ClassRule
+  public static final RocksMemoryResource rocksMemoryResource =
+      new RocksMemoryResource();
+
+  @Rule
+  public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  @Test
+  public void stringOption()
+      throws InterruptedException, RocksDBException {
+    RocksDB db = null;
+    Options opt = null;
+    try {
+      String dbPathString =
+          dbFolder.getRoot().getAbsolutePath();
+      opt = new Options();
+      opt.setCreateIfMissing(true);
+      opt.setMergeOperatorName("stringappend");
+
+      db = RocksDB.open(opt, dbPathString);
+      // writing aa under key
+      db.put("key".getBytes(), "aa".getBytes());
+      // merge bb under key
+      db.merge("key".getBytes(), "bb".getBytes());
+
+      byte[] value = db.get("key".getBytes());
+      String strValue = new String(value);
+      assertThat(strValue).isEqualTo("aa,bb");
+    } finally {
+      if (db != null) {
+        db.close();
+      }
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void cFStringOption()
+      throws InterruptedException, RocksDBException {
+    RocksDB db = null;
+    DBOptions opt = null;
+    List<ColumnFamilyHandle> columnFamilyHandleList =
+        new ArrayList<>();
+    try {
+      String dbPathString =
+          dbFolder.getRoot().getAbsolutePath();
+      opt = new DBOptions();
+      opt.setCreateIfMissing(true);
+      opt.setCreateMissingColumnFamilies(true);
+
+      List<ColumnFamilyDescriptor> cfDescriptors =
+          new ArrayList<>();
+      cfDescriptors.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY,
+          new ColumnFamilyOptions().setMergeOperatorName(
+              "stringappend")));
+      cfDescriptors.add(new ColumnFamilyDescriptor("new_cf".getBytes(),
+          new ColumnFamilyOptions().setMergeOperatorName(
+              "stringappend")));
+      db = RocksDB.open(opt, dbPathString,
+          cfDescriptors, columnFamilyHandleList);
+
+      // writing aa under key
+      db.put(columnFamilyHandleList.get(1),
+          "cfkey".getBytes(), "aa".getBytes());
+      // merge bb under key
+      db.merge(columnFamilyHandleList.get(1),
+          "cfkey".getBytes(), "bb".getBytes());
+
+      byte[] value = db.get(columnFamilyHandleList.get(1), "cfkey".getBytes());
+      String strValue = new String(value);
+      assertThat(strValue).isEqualTo("aa,bb");
+    } finally {
+      for (ColumnFamilyHandle handle : columnFamilyHandleList) {
+        handle.dispose();
+      }
+      if (db != null) {
+        db.close();
+      }
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void operatorOption()
+      throws InterruptedException, RocksDBException {
+    RocksDB db = null;
+    Options opt = null;
+    try {
+      String dbPathString =
+          dbFolder.getRoot().getAbsolutePath();
+      opt = new Options();
+      opt.setCreateIfMissing(true);
+
+      StringAppendOperator stringAppendOperator = new StringAppendOperator();
+      opt.setMergeOperator(stringAppendOperator);
+
+      db = RocksDB.open(opt, dbPathString);
+      // Writing aa under key
+      db.put("key".getBytes(), "aa".getBytes());
+
+      // Merging bb under key
+      db.merge("key".getBytes(), "bb".getBytes());
+
+      byte[] value = db.get("key".getBytes());
+      String strValue = new String(value);
+
+      assertThat(strValue).isEqualTo("aa,bb");
+    } finally {
+      if (db != null) {
+        db.close();
+      }
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void cFOperatorOption()
+      throws InterruptedException, RocksDBException {
+    RocksDB db = null;
+    DBOptions opt = null;
+    ColumnFamilyHandle cfHandle = null;
+    List<ColumnFamilyDescriptor> cfDescriptors =
+        new ArrayList<>();
+    List<ColumnFamilyHandle> columnFamilyHandleList =
+        new ArrayList<>();
+    try {
+      String dbPathString =
+          dbFolder.getRoot().getAbsolutePath();
+      opt = new DBOptions();
+      opt.setCreateIfMissing(true);
+      opt.setCreateMissingColumnFamilies(true);
+      StringAppendOperator stringAppendOperator = new StringAppendOperator();
+
+      cfDescriptors.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY,
+          new ColumnFamilyOptions().setMergeOperator(
+              stringAppendOperator)));
+      cfDescriptors.add(new ColumnFamilyDescriptor("new_cf".getBytes(),
+          new ColumnFamilyOptions().setMergeOperator(
+              stringAppendOperator)));
+      db = RocksDB.open(opt, dbPathString,
+          cfDescriptors, columnFamilyHandleList);
+
+      // writing aa under key
+      db.put(columnFamilyHandleList.get(1),
+          "cfkey".getBytes(), "aa".getBytes());
+      // merge bb under key
+      db.merge(columnFamilyHandleList.get(1),
+          "cfkey".getBytes(), "bb".getBytes());
+      byte[] value = db.get(columnFamilyHandleList.get(1), "cfkey".getBytes());
+      String strValue = new String(value);
+
+      // Test also with createColumnFamily
+      cfHandle = db.createColumnFamily(
+          new ColumnFamilyDescriptor("new_cf2".getBytes(),
+              new ColumnFamilyOptions().setMergeOperator(stringAppendOperator)));
+      // writing xx under cfkey2
+      db.put(cfHandle, "cfkey2".getBytes(), "xx".getBytes());
+      // merge yy under cfkey2
+      db.merge(cfHandle, new WriteOptions(), "cfkey2".getBytes(), "yy".getBytes());
+      value = db.get(cfHandle, "cfkey2".getBytes());
+      String strValueTmpCf = new String(value);
+
+      assertThat(strValue).isEqualTo("aa,bb");
+      assertThat(strValueTmpCf).isEqualTo("xx,yy");
+    } finally {
+      for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) {
+        columnFamilyHandle.dispose();
+      }
+      if (cfHandle != null) {
+        cfHandle.dispose();
+      }
+      if (db != null) {
+        db.close();
+      }
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void operatorGcBehaviour()
+      throws RocksDBException {
+    Options opt = null;
+    RocksDB db = null;
+    try {
+      String dbPathString =
+          dbFolder.getRoot().getAbsolutePath();
+      opt = new Options();
+      opt.setCreateIfMissing(true);
+      StringAppendOperator stringAppendOperator = new StringAppendOperator();
+      opt.setMergeOperator(stringAppendOperator);
+      db = RocksDB.open(opt, dbPathString);
+      db.close();
+      opt.dispose();
+      System.gc();
+      System.runFinalization();
+      // test reuse
+      opt = new Options();
+      opt.setMergeOperator(stringAppendOperator);
+      db = RocksDB.open(opt, db_path_string);
+      db.close();
+      opt.dispose();
+      System.gc();
+      System.runFinalization();
+      // test param init
+      opt = new Options();
+      opt.setMergeOperator(new StringAppendOperator());
+      db = RocksDB.open(opt, db_path_string);
+      db.close();
+      opt.dispose();
+      System.gc();
+      System.runFinalization();
+      // test replace one with another merge operator instance
+      opt = new Options();
+      opt.setMergeOperator(stringAppendOperator);
+      StringAppendOperator newStringAppendOperator = new StringAppendOperator();
+      opt.setMergeOperator(newStringAppendOperator);
+      db = RocksDB.open(opt, db_path_string);
+      db.close();
+      opt.dispose();
+    } finally {
+      if (db != null) {
+        db.close();
+      }
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
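+  // An empty operator name is accepted without error, in contrast to the
+  // null name rejected in the two tests that follow.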
+  @Test
+  public void emptyStringInSetMergeOperatorByName() {
+    Options opt = null;
+    ColumnFamilyOptions cOpt = null;
+    try {
+      opt = new Options();
+      cOpt = new ColumnFamilyOptions();
+      opt.setMergeOperatorName("");
+      cOpt.setMergeOperatorName("");
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+      if (cOpt != null) {
+        cOpt.dispose();
+      }
+    }
+  }
+
+  @Test(expected = IllegalArgumentException.class)
+  public void nullStringInSetMergeOperatorByNameOptions() {
+    Options opt = null;
+    try {
+      opt = new Options();
+      opt.setMergeOperatorName(null);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test(expected = IllegalArgumentException.class)
+  public void
+      nullStringInSetMergeOperatorByNameColumnFamilyOptions() {
+    ColumnFamilyOptions opt = null;
+    try {
+      opt = new ColumnFamilyOptions();
+      opt.setMergeOperatorName(null);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/MixedOptionsTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/MixedOptionsTest.java
new file mode 100644
index 0000000..f095e99
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/MixedOptionsTest.java
@@ -0,0 +1,56 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+import org.junit.ClassRule;
+import org.junit.Test;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class MixedOptionsTest {
+
+  @ClassRule
+  public static final RocksMemoryResource rocksMemoryResource =
+      new RocksMemoryResource();
+
+  @Test
+  public void mixedOptionsTest(){
+    // Set a table factory and check the names
+    ColumnFamilyOptions cfOptions = new ColumnFamilyOptions();
+    cfOptions.setTableFormatConfig(new BlockBasedTableConfig().
+        setFilter(new BloomFilter()));
+    assertThat(cfOptions.tableFactoryName()).isEqualTo(
+        "BlockBasedTable");
+    cfOptions.setTableFormatConfig(new PlainTableConfig());
+    assertThat(cfOptions.tableFactoryName()).isEqualTo("PlainTable");
+    // Build an Options object from separate db options and
+    // column-family options
+    DBOptions dbOptions = new DBOptions();
+    Options options = new Options(dbOptions, cfOptions);
+    assertThat(options.tableFactoryName()).isEqualTo("PlainTable");
+    // Free instances
+    options.dispose();
+    options = null;
+    cfOptions.dispose();
+    cfOptions = null;
+    dbOptions.dispose();
+    dbOptions = null;
+    System.gc();
+    System.runFinalization();
+    // Test Optimize for statements
+    cfOptions = new ColumnFamilyOptions();
+    cfOptions.optimizeUniversalStyleCompaction();
+    cfOptions.optimizeLevelStyleCompaction();
+    cfOptions.optimizeForPointLookup(1024);
+    options = new Options();
+    options.optimizeLevelStyleCompaction();
+    options.optimizeLevelStyleCompaction(400);
+    options.optimizeUniversalStyleCompaction();
+    options.optimizeUniversalStyleCompaction(400);
+    options.optimizeForPointLookup(1024);
+    options.prepareForBulkLoad();
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/NativeLibraryLoaderTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/NativeLibraryLoaderTest.java
new file mode 100644
index 0000000..7d9322a
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/NativeLibraryLoaderTest.java
@@ -0,0 +1,31 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+package org.rocksdb;
+
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+import org.rocksdb.util.Environment;
+
+import java.io.IOException;
+import java.nio.file.*;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class NativeLibraryLoaderTest {
+
+  @Rule
+  public TemporaryFolder temporaryFolder = new TemporaryFolder();
+
+  @Test
+  public void tempFolder() throws IOException {
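+    // Extract the bundled native library from the jar into the temporary
+    // folder, then check that the platform-specific file was written there.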
+    NativeLibraryLoader.getInstance().loadLibraryFromJar(
+        temporaryFolder.getRoot().getAbsolutePath());
+    Path path = Paths.get(temporaryFolder.getRoot().getAbsolutePath(),
+        Environment.getJniLibraryFileName("rocksdb"));
+    assertThat(Files.exists(path)).isTrue();
+    assertThat(Files.isReadable(path)).isTrue();
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/OptionsTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/OptionsTest.java
new file mode 100644
index 0000000..1c1dfc6
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/OptionsTest.java
@@ -0,0 +1,1208 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+import org.junit.ClassRule;
+import org.junit.Test;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+
+public class OptionsTest {
+
+  @ClassRule
+  public static final RocksMemoryResource rocksMemoryResource =
+      new RocksMemoryResource();
+
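+  // The platform-aware Random keeps generated values within a 32-bit
+  // range on 32-bit JVMs, where size_t-valued native setters would
+  // otherwise be fed values that overflow.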
+  public static final Random rand = PlatformRandomHelper.
+      getPlatformSpecificRandomFactory();
+
+  @Test
+  public void setIncreaseParallelism() {
+    Options opt = null;
+    try {
+      opt = new Options();
+      final int threads = Runtime.getRuntime().availableProcessors() * 2;
+      opt.setIncreaseParallelism(threads);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void writeBufferSize() throws RocksDBException {
+    Options opt = null;
+    try {
+      opt = new Options();
+      long longValue = rand.nextLong();
+      opt.setWriteBufferSize(longValue);
+      assertThat(opt.writeBufferSize()).isEqualTo(longValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void maxWriteBufferNumber() {
+    Options opt = null;
+    try {
+      opt = new Options();
+      int intValue = rand.nextInt();
+      opt.setMaxWriteBufferNumber(intValue);
+      assertThat(opt.maxWriteBufferNumber()).isEqualTo(intValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void minWriteBufferNumberToMerge() {
+    Options opt = null;
+    try {
+      opt = new Options();
+      int intValue = rand.nextInt();
+      opt.setMinWriteBufferNumberToMerge(intValue);
+      assertThat(opt.minWriteBufferNumberToMerge()).isEqualTo(intValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void numLevels() {
+    Options opt = null;
+    try {
+      opt = new Options();
+      int intValue = rand.nextInt();
+      opt.setNumLevels(intValue);
+      assertThat(opt.numLevels()).isEqualTo(intValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void levelZeroFileNumCompactionTrigger() {
+    Options opt = null;
+    try {
+      opt = new Options();
+      int intValue = rand.nextInt();
+      opt.setLevelZeroFileNumCompactionTrigger(intValue);
+      assertThat(opt.levelZeroFileNumCompactionTrigger()).isEqualTo(intValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void levelZeroSlowdownWritesTrigger() {
+    Options opt = null;
+    try {
+      opt = new Options();
+      int intValue = rand.nextInt();
+      opt.setLevelZeroSlowdownWritesTrigger(intValue);
+      assertThat(opt.levelZeroSlowdownWritesTrigger()).isEqualTo(intValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void levelZeroStopWritesTrigger() {
+    Options opt = null;
+    try {
+      opt = new Options();
+      int intValue = rand.nextInt();
+      opt.setLevelZeroStopWritesTrigger(intValue);
+      assertThat(opt.levelZeroStopWritesTrigger()).isEqualTo(intValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void targetFileSizeBase() {
+    Options opt = null;
+    try {
+      opt = new Options();
+      long longValue = rand.nextLong();
+      opt.setTargetFileSizeBase(longValue);
+      assertThat(opt.targetFileSizeBase()).isEqualTo(longValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void targetFileSizeMultiplier() {
+    Options opt = null;
+    try {
+      opt = new Options();
+      int intValue = rand.nextInt();
+      opt.setTargetFileSizeMultiplier(intValue);
+      assertThat(opt.targetFileSizeMultiplier()).isEqualTo(intValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void maxBytesForLevelBase() {
+    Options opt = null;
+    try {
+      opt = new Options();
+      long longValue = rand.nextLong();
+      opt.setMaxBytesForLevelBase(longValue);
+      assertThat(opt.maxBytesForLevelBase()).isEqualTo(longValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void levelCompactionDynamicLevelBytes() {
+    Options opt = null;
+    try {
+      opt = new Options();
+      final boolean boolValue = rand.nextBoolean();
+      opt.setLevelCompactionDynamicLevelBytes(boolValue);
+      assertThat(opt.levelCompactionDynamicLevelBytes())
+          .isEqualTo(boolValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void maxBytesForLevelMultiplier() {
+    Options opt = null;
+    try {
+      opt = new Options();
+      int intValue = rand.nextInt();
+      opt.setMaxBytesForLevelMultiplier(intValue);
+      assertThat(opt.maxBytesForLevelMultiplier()).isEqualTo(intValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void expandedCompactionFactor() {
+    Options opt = null;
+    try {
+      opt = new Options();
+      int intValue = rand.nextInt();
+      opt.setExpandedCompactionFactor(intValue);
+      assertThat(opt.expandedCompactionFactor()).isEqualTo(intValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void sourceCompactionFactor() {
+    Options opt = null;
+    try {
+      opt = new Options();
+      int intValue = rand.nextInt();
+      opt.setSourceCompactionFactor(intValue);
+      assertThat(opt.sourceCompactionFactor()).isEqualTo(intValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void maxGrandparentOverlapFactor() {
+    Options opt = null;
+    try {
+      opt = new Options();
+      int intValue = rand.nextInt();
+      opt.setMaxGrandparentOverlapFactor(intValue);
+      assertThat(opt.maxGrandparentOverlapFactor()).isEqualTo(intValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void softRateLimit() {
+    Options opt = null;
+    try {
+      opt = new Options();
+      double doubleValue = rand.nextDouble();
+      opt.setSoftRateLimit(doubleValue);
+      assertThat(opt.softRateLimit()).isEqualTo(doubleValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void hardRateLimit() {
+    Options opt = null;
+    try {
+      opt = new Options();
+      double doubleValue = rand.nextDouble();
+      opt.setHardRateLimit(doubleValue);
+      assertThat(opt.hardRateLimit()).isEqualTo(doubleValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void rateLimitDelayMaxMilliseconds() {
+    Options opt = null;
+    try {
+      opt = new Options();
+      int intValue = rand.nextInt();
+      opt.setRateLimitDelayMaxMilliseconds(intValue);
+      assertThat(opt.rateLimitDelayMaxMilliseconds()).isEqualTo(intValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void arenaBlockSize() throws RocksDBException {
+    Options opt = null;
+    try {
+      opt = new Options();
+      long longValue = rand.nextLong();
+      opt.setArenaBlockSize(longValue);
+      assertThat(opt.arenaBlockSize()).isEqualTo(longValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void disableAutoCompactions() {
+    Options opt = null;
+    try {
+      opt = new Options();
+      boolean boolValue = rand.nextBoolean();
+      opt.setDisableAutoCompactions(boolValue);
+      assertThat(opt.disableAutoCompactions()).isEqualTo(boolValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void purgeRedundantKvsWhileFlush() {
+    Options opt = null;
+    try {
+      opt = new Options();
+      boolean boolValue = rand.nextBoolean();
+      opt.setPurgeRedundantKvsWhileFlush(boolValue);
+      assertThat(opt.purgeRedundantKvsWhileFlush()).isEqualTo(boolValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void verifyChecksumsInCompaction() {
+    Options opt = null;
+    try {
+      opt = new Options();
+      boolean boolValue = rand.nextBoolean();
+      opt.setVerifyChecksumsInCompaction(boolValue);
+      assertThat(opt.verifyChecksumsInCompaction()).isEqualTo(boolValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void filterDeletes() {
+    Options opt = null;
+    try {
+      opt = new Options();
+      boolean boolValue = rand.nextBoolean();
+      opt.setFilterDeletes(boolValue);
+      assertThat(opt.filterDeletes()).isEqualTo(boolValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void maxSequentialSkipInIterations() {
+    Options opt = null;
+    try {
+      opt = new Options();
+      long longValue = rand.nextLong();
+      opt.setMaxSequentialSkipInIterations(longValue);
+      assertThat(opt.maxSequentialSkipInIterations()).isEqualTo(longValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void inplaceUpdateSupport() {
+    Options opt = null;
+    try {
+      opt = new Options();
+      boolean boolValue = rand.nextBoolean();
+      opt.setInplaceUpdateSupport(boolValue);
+      assertThat(opt.inplaceUpdateSupport()).isEqualTo(boolValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void inplaceUpdateNumLocks() throws RocksDBException {
+    Options opt = null;
+    try {
+      opt = new Options();
+      long longValue = rand.nextLong();
+      opt.setInplaceUpdateNumLocks(longValue);
+      assertThat(opt.inplaceUpdateNumLocks()).isEqualTo(longValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void memtablePrefixBloomBits() {
+    Options opt = null;
+    try {
+      opt = new Options();
+      int intValue = rand.nextInt();
+      opt.setMemtablePrefixBloomBits(intValue);
+      assertThat(opt.memtablePrefixBloomBits()).isEqualTo(intValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void memtablePrefixBloomProbes() {
+    Options opt = null;
+    try {
+      int intValue = rand.nextInt();
+      opt = new Options();
+      opt.setMemtablePrefixBloomProbes(intValue);
+      assertThat(opt.memtablePrefixBloomProbes()).isEqualTo(intValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void bloomLocality() {
+    Options opt = null;
+    try {
+      int intValue = rand.nextInt();
+      opt = new Options();
+      opt.setBloomLocality(intValue);
+      assertThat(opt.bloomLocality()).isEqualTo(intValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void maxSuccessiveMerges() throws RocksDBException {
+    Options opt = null;
+    try {
+      long longValue = rand.nextLong();
+      opt = new Options();
+      opt.setMaxSuccessiveMerges(longValue);
+      assertThat(opt.maxSuccessiveMerges()).isEqualTo(longValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void minPartialMergeOperands() {
+    Options opt = null;
+    try {
+      int intValue = rand.nextInt();
+      opt = new Options();
+      opt.setMinPartialMergeOperands(intValue);
+      assertThat(opt.minPartialMergeOperands()).isEqualTo(intValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void optimizeFiltersForHits() {
+    Options opt = null;
+    try {
+      boolean aBoolean = rand.nextBoolean();
+      opt = new Options();
+      opt.setOptimizeFiltersForHits(aBoolean);
+      assertThat(opt.optimizeFiltersForHits()).isEqualTo(aBoolean);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void createIfMissing() {
+    Options opt = null;
+    try {
+      opt = new Options();
+      boolean boolValue = rand.nextBoolean();
+      opt.setCreateIfMissing(boolValue);
+      assertThat(opt.createIfMissing()).
+          isEqualTo(boolValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void createMissingColumnFamilies() {
+    Options opt = null;
+    try {
+      opt = new Options();
+      boolean boolValue = rand.nextBoolean();
+      opt.setCreateMissingColumnFamilies(boolValue);
+      assertThat(opt.createMissingColumnFamilies()).
+          isEqualTo(boolValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void errorIfExists() {
+    Options opt = null;
+    try {
+      opt = new Options();
+      boolean boolValue = rand.nextBoolean();
+      opt.setErrorIfExists(boolValue);
+      assertThat(opt.errorIfExists()).isEqualTo(boolValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void paranoidChecks() {
+    Options opt = null;
+    try {
+      opt = new Options();
+      boolean boolValue = rand.nextBoolean();
+      opt.setParanoidChecks(boolValue);
+      assertThat(opt.paranoidChecks()).
+          isEqualTo(boolValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void maxTotalWalSize() {
+    Options opt = null;
+    try {
+      opt = new Options();
+      long longValue = rand.nextLong();
+      opt.setMaxTotalWalSize(longValue);
+      assertThat(opt.maxTotalWalSize()).
+          isEqualTo(longValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void maxOpenFiles() {
+    Options opt = null;
+    try {
+      opt = new Options();
+      int intValue = rand.nextInt();
+      opt.setMaxOpenFiles(intValue);
+      assertThat(opt.maxOpenFiles()).isEqualTo(intValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void disableDataSync() {
+    Options opt = null;
+    try {
+      opt = new Options();
+      boolean boolValue = rand.nextBoolean();
+      opt.setDisableDataSync(boolValue);
+      assertThat(opt.disableDataSync()).
+          isEqualTo(boolValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void useFsync() {
+    Options opt = null;
+    try {
+      opt = new Options();
+      boolean boolValue = rand.nextBoolean();
+      opt.setUseFsync(boolValue);
+      assertThat(opt.useFsync()).isEqualTo(boolValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void dbLogDir() {
+    Options opt = null;
+    try {
+      opt = new Options();
+      String str = "path/to/DbLogDir";
+      opt.setDbLogDir(str);
+      assertThat(opt.dbLogDir()).isEqualTo(str);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void walDir() {
+    Options opt = null;
+    try {
+      opt = new Options();
+      String str = "path/to/WalDir";
+      opt.setWalDir(str);
+      assertThat(opt.walDir()).isEqualTo(str);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void deleteObsoleteFilesPeriodMicros() {
+    Options opt = null;
+    try {
+      opt = new Options();
+      long longValue = rand.nextLong();
+      opt.setDeleteObsoleteFilesPeriodMicros(longValue);
+      assertThat(opt.deleteObsoleteFilesPeriodMicros()).
+          isEqualTo(longValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void maxBackgroundCompactions() {
+    Options opt = null;
+    try {
+      opt = new Options();
+      int intValue = rand.nextInt();
+      opt.setMaxBackgroundCompactions(intValue);
+      assertThat(opt.maxBackgroundCompactions()).
+          isEqualTo(intValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void maxBackgroundFlushes() {
+    Options opt = null;
+    try {
+      opt = new Options();
+      int intValue = rand.nextInt();
+      opt.setMaxBackgroundFlushes(intValue);
+      assertThat(opt.maxBackgroundFlushes()).
+          isEqualTo(intValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void maxLogFileSize() throws RocksDBException {
+    Options opt = null;
+    try {
+      opt = new Options();
+      long longValue = rand.nextLong();
+      opt.setMaxLogFileSize(longValue);
+      assertThat(opt.maxLogFileSize()).isEqualTo(longValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void logFileTimeToRoll() throws RocksDBException {
+    Options opt = null;
+    try {
+      opt = new Options();
+      long longValue = rand.nextLong();
+      opt.setLogFileTimeToRoll(longValue);
+      assertThat(opt.logFileTimeToRoll()).
+          isEqualTo(longValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void keepLogFileNum() throws RocksDBException {
+    Options opt = null;
+    try {
+      opt = new Options();
+      long longValue = rand.nextLong();
+      opt.setKeepLogFileNum(longValue);
+      assertThat(opt.keepLogFileNum()).isEqualTo(longValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void maxManifestFileSize() {
+    Options opt = null;
+    try {
+      opt = new Options();
+      long longValue = rand.nextLong();
+      opt.setMaxManifestFileSize(longValue);
+      assertThat(opt.maxManifestFileSize()).
+          isEqualTo(longValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void tableCacheNumshardbits() {
+    Options opt = null;
+    try {
+      opt = new Options();
+      int intValue = rand.nextInt();
+      opt.setTableCacheNumshardbits(intValue);
+      assertThat(opt.tableCacheNumshardbits()).
+          isEqualTo(intValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void walSizeLimitMB() {
+    Options opt = null;
+    try {
+      opt = new Options();
+      long longValue = rand.nextLong();
+      opt.setWalSizeLimitMB(longValue);
+      assertThat(opt.walSizeLimitMB()).isEqualTo(longValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void walTtlSeconds() {
+    Options opt = null;
+    try {
+      opt = new Options();
+      long longValue = rand.nextLong();
+      opt.setWalTtlSeconds(longValue);
+      assertThat(opt.walTtlSeconds()).isEqualTo(longValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void manifestPreallocationSize() throws RocksDBException {
+    Options opt = null;
+    try {
+      opt = new Options();
+      long longValue = rand.nextLong();
+      opt.setManifestPreallocationSize(longValue);
+      assertThat(opt.manifestPreallocationSize()).
+          isEqualTo(longValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void allowOsBuffer() {
+    Options opt = null;
+    try {
+      opt = new Options();
+      boolean boolValue = rand.nextBoolean();
+      opt.setAllowOsBuffer(boolValue);
+      assertThat(opt.allowOsBuffer()).isEqualTo(boolValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void allowMmapReads() {
+    Options opt = null;
+    try {
+      opt = new Options();
+      boolean boolValue = rand.nextBoolean();
+      opt.setAllowMmapReads(boolValue);
+      assertThat(opt.allowMmapReads()).isEqualTo(boolValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void allowMmapWrites() {
+    Options opt = null;
+    try {
+      opt = new Options();
+      boolean boolValue = rand.nextBoolean();
+      opt.setAllowMmapWrites(boolValue);
+      assertThat(opt.allowMmapWrites()).isEqualTo(boolValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void isFdCloseOnExec() {
+    Options opt = null;
+    try {
+      opt = new Options();
+      boolean boolValue = rand.nextBoolean();
+      opt.setIsFdCloseOnExec(boolValue);
+      assertThat(opt.isFdCloseOnExec()).isEqualTo(boolValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void statsDumpPeriodSec() {
+    Options opt = null;
+    try {
+      opt = new Options();
+      int intValue = rand.nextInt();
+      opt.setStatsDumpPeriodSec(intValue);
+      assertThat(opt.statsDumpPeriodSec()).isEqualTo(intValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void adviseRandomOnOpen() {
+    Options opt = null;
+    try {
+      opt = new Options();
+      boolean boolValue = rand.nextBoolean();
+      opt.setAdviseRandomOnOpen(boolValue);
+      assertThat(opt.adviseRandomOnOpen()).isEqualTo(boolValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void useAdaptiveMutex() {
+    Options opt = null;
+    try {
+      opt = new Options();
+      boolean boolValue = rand.nextBoolean();
+      opt.setUseAdaptiveMutex(boolValue);
+      assertThat(opt.useAdaptiveMutex()).isEqualTo(boolValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void bytesPerSync() {
+    Options opt = null;
+    try {
+      opt = new Options();
+      long longValue = rand.nextLong();
+      opt.setBytesPerSync(longValue);
+      assertThat(opt.bytesPerSync()).isEqualTo(longValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void env() {
+    Options options = null;
+    try {
+      options = new Options();
+      Env env = Env.getDefault();
+      options.setEnv(env);
+      assertThat(options.getEnv()).isSameAs(env);
+    } finally {
+      if (options != null) {
+        options.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void linkageOfPrepMethods() {
+    Options options = null;
+    try {
+      options = new Options();
+      options.optimizeUniversalStyleCompaction();
+      options.optimizeUniversalStyleCompaction(4000);
+      options.optimizeLevelStyleCompaction();
+      options.optimizeLevelStyleCompaction(3000);
+      options.optimizeForPointLookup(10);
+      options.prepareForBulkLoad();
+    } finally {
+      if (options != null) {
+        options.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void compressionTypes() {
+    Options options = null;
+    try {
+      options = new Options();
+      for (CompressionType compressionType :
+          CompressionType.values()) {
+        options.setCompressionType(compressionType);
+        assertThat(options.compressionType()).
+            isEqualTo(compressionType);
+        assertThat(CompressionType.valueOf("NO_COMPRESSION")).
+            isEqualTo(CompressionType.NO_COMPRESSION);
+      }
+    } finally {
+      if (options != null) {
+        options.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void compressionPerLevel() {
+    ColumnFamilyOptions columnFamilyOptions = null;
+    try {
+      columnFamilyOptions = new ColumnFamilyOptions();
+      assertThat(columnFamilyOptions.compressionPerLevel()).isEmpty();
+      List<CompressionType> compressionTypeList =
+          new ArrayList<>();
+      for (int i=0; i < columnFamilyOptions.numLevels(); i++) {
+        compressionTypeList.add(CompressionType.NO_COMPRESSION);
+      }
+      columnFamilyOptions.setCompressionPerLevel(compressionTypeList);
+      compressionTypeList = columnFamilyOptions.compressionPerLevel();
+      for (final CompressionType compressionType : compressionTypeList) {
+        assertThat(compressionType).isEqualTo(
+            CompressionType.NO_COMPRESSION);
+      }
+    } finally {
+      if (columnFamilyOptions != null) {
+        columnFamilyOptions.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void differentCompressionsPerLevel() {
+    ColumnFamilyOptions columnFamilyOptions = null;
+    try {
+      columnFamilyOptions = new ColumnFamilyOptions();
+      columnFamilyOptions.setNumLevels(3);
+
+      assertThat(columnFamilyOptions.compressionPerLevel()).isEmpty();
+      List<CompressionType> compressionTypeList = new ArrayList<>();
+
+      compressionTypeList.add(CompressionType.BZLIB2_COMPRESSION);
+      compressionTypeList.add(CompressionType.SNAPPY_COMPRESSION);
+      compressionTypeList.add(CompressionType.LZ4_COMPRESSION);
+
+      columnFamilyOptions.setCompressionPerLevel(compressionTypeList);
+      compressionTypeList = columnFamilyOptions.compressionPerLevel();
+
+      assertThat(compressionTypeList.size()).isEqualTo(3);
+      assertThat(compressionTypeList).
+          containsExactly(
+              CompressionType.BZLIB2_COMPRESSION,
+              CompressionType.SNAPPY_COMPRESSION,
+              CompressionType.LZ4_COMPRESSION);
+
+    } finally {
+      if (columnFamilyOptions != null) {
+        columnFamilyOptions.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void compactionStyles() {
+    Options options = null;
+    try {
+      options = new Options();
+      for (CompactionStyle compactionStyle :
+          CompactionStyle.values()) {
+        options.setCompactionStyle(compactionStyle);
+        assertThat(options.compactionStyle()).
+            isEqualTo(compactionStyle);
+        assertThat(CompactionStyle.valueOf("FIFO")).
+            isEqualTo(CompactionStyle.FIFO);
+      }
+    } finally {
+      if (options != null) {
+        options.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void maxTableFilesSizeFIFO() {
+    Options opt = null;
+    try {
+      opt = new Options();
+      long longValue = rand.nextLong();
+      // Size has to be positive
+      longValue = (longValue < 0) ? -longValue : longValue;
+      longValue = (longValue == 0) ? longValue + 1 : longValue;
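+      // Note: Long.MIN_VALUE negates to itself and would slip through as
+      // a negative; the one-in-2^64 chance is ignored here.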
+      opt.setMaxTableFilesSizeFIFO(longValue);
+      assertThat(opt.maxTableFilesSizeFIFO()).
+          isEqualTo(longValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void rateLimiterConfig() {
+    Options options = null;
+    Options anotherOptions = null;
+    RateLimiterConfig rateLimiterConfig;
+    try {
+      options = new Options();
+      rateLimiterConfig = new GenericRateLimiterConfig(1000, 100 * 1000, 1);
+      options.setRateLimiterConfig(rateLimiterConfig);
+      // Test with parameter initialization
+      anotherOptions = new Options();
+      anotherOptions.setRateLimiterConfig(
+          new GenericRateLimiterConfig(1000));
+    } finally {
+      if (options != null) {
+        options.dispose();
+      }
+      if (anotherOptions != null) {
+        anotherOptions.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void shouldSetTestPrefixExtractor() {
+    Options options = null;
+    try {
+      options = new Options();
+      options.useFixedLengthPrefixExtractor(100);
+      options.useFixedLengthPrefixExtractor(10);
+    } finally {
+      if (options != null) {
+        options.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void shouldSetTestCappedPrefixExtractor() {
+    Options options = null;
+    try {
+      options = new Options();
+      options.useCappedPrefixExtractor(100);
+      options.useCappedPrefixExtractor(10);
+    } finally {
+      if (options != null) {
+        options.dispose();
+      }
+    }
+  }
+
+
+  @Test
+  public void shouldTestMemTableFactoryName()
+      throws RocksDBException {
+    Options options = null;
+    try {
+      options = new Options();
+      options.setMemTableConfig(new VectorMemTableConfig());
+      assertThat(options.memTableFactoryName()).
+          isEqualTo("VectorRepFactory");
+      options.setMemTableConfig(
+          new HashLinkedListMemTableConfig());
+      assertThat(options.memTableFactoryName()).
+          isEqualTo("HashLinkedListRepFactory");
+    } finally {
+      if (options != null) {
+        options.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void statistics() {
+    Options options = null;
+    Options anotherOptions = null;
+    try {
+      options = new Options();
+      Statistics statistics = options.createStatistics().
+          statisticsPtr();
+      assertThat(statistics).isNotNull();
+      anotherOptions = new Options();
+      statistics = anotherOptions.statisticsPtr();
+      assertThat(statistics).isNotNull();
+    } finally {
+      if (options != null) {
+        options.dispose();
+      }
+      if (anotherOptions != null) {
+        anotherOptions.dispose();
+      }
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/PlainTableConfigTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/PlainTableConfigTest.java
new file mode 100644
index 0000000..850b050
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/PlainTableConfigTest.java
@@ -0,0 +1,95 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+import org.junit.ClassRule;
+import org.junit.Test;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class PlainTableConfigTest {
+
+  @ClassRule
+  public static final RocksMemoryResource rocksMemoryResource =
+      new RocksMemoryResource();
+
+  @Test
+  public void keySize() {
+    PlainTableConfig plainTableConfig = new PlainTableConfig();
+    plainTableConfig.setKeySize(5);
+    assertThat(plainTableConfig.keySize()).
+        isEqualTo(5);
+  }
+
+  @Test
+  public void bloomBitsPerKey() {
+    PlainTableConfig plainTableConfig = new PlainTableConfig();
+    plainTableConfig.setBloomBitsPerKey(11);
+    assertThat(plainTableConfig.bloomBitsPerKey()).
+        isEqualTo(11);
+  }
+
+  @Test
+  public void hashTableRatio() {
+    PlainTableConfig plainTableConfig = new PlainTableConfig();
+    plainTableConfig.setHashTableRatio(0.95);
+    assertThat(plainTableConfig.hashTableRatio()).
+        isEqualTo(0.95);
+  }
+
+  @Test
+  public void indexSparseness() {
+    PlainTableConfig plainTableConfig = new PlainTableConfig();
+    plainTableConfig.setIndexSparseness(18);
+    assertThat(plainTableConfig.indexSparseness()).
+        isEqualTo(18);
+  }
+
+  @Test
+  public void hugePageTlbSize() {
+    PlainTableConfig plainTableConfig = new PlainTableConfig();
+    plainTableConfig.setHugePageTlbSize(1);
+    assertThat(plainTableConfig.hugePageTlbSize()).
+        isEqualTo(1);
+  }
+
+  @Test
+  public void encodingType() {
+    PlainTableConfig plainTableConfig = new PlainTableConfig();
+    plainTableConfig.setEncodingType(EncodingType.kPrefix);
+    assertThat(plainTableConfig.encodingType()).isEqualTo(
+        EncodingType.kPrefix);
+  }
+
+  @Test
+  public void fullScanMode() {
+    PlainTableConfig plainTableConfig = new PlainTableConfig();
+    plainTableConfig.setFullScanMode(true);
+    assertThat(plainTableConfig.fullScanMode()).isTrue();
+  }
+
+  @Test
+  public void storeIndexInFile() {
+    PlainTableConfig plainTableConfig = new PlainTableConfig();
+    plainTableConfig.setStoreIndexInFile(true);
+    assertThat(plainTableConfig.storeIndexInFile()).
+        isTrue();
+  }
+
+  @Test
+  public void plainTableConfig() {
+    Options opt = null;
+    try {
+      opt = new Options();
+      PlainTableConfig plainTableConfig = new PlainTableConfig();
+      opt.setTableFormatConfig(plainTableConfig);
+      assertThat(opt.tableFactoryName()).isEqualTo("PlainTable");
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/PlatformRandomHelper.java b/src/rocksdb/java/src/test/java/org/rocksdb/PlatformRandomHelper.java
new file mode 100644
index 0000000..0155ce2
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/PlatformRandomHelper.java
@@ -0,0 +1,58 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+import java.util.Random;
+
+/**
+ * Helper class to get the appropriate Random instance depending
+ * on the current platform architecture (32-bit vs. 64-bit).
+ */
+public class PlatformRandomHelper {
+    /**
+     * Determine whether the OS is 32-bit or 64-bit.
+     *
+     * @return boolean value indicating if the operating system is 64-bit.
+     */
+    public static boolean isOs64Bit(){
+      boolean is64Bit;
+      if (System.getProperty("os.name").contains("Windows")) {
+        is64Bit = (System.getenv("ProgramFiles(x86)") != null);
+      } else {
+        is64Bit = (System.getProperty("os.arch").contains("64"));
+      }
+      return is64Bit;
+    }
+
+    /**
+     * Factory to get a platform specific Random instance
+     *
+     * @return {@link java.util.Random} instance.
+     */
+    public static Random getPlatformSpecificRandomFactory(){
+      if (isOs64Bit()) {
+        return new Random();
+      }
+      return new Random32Bit();
+    }
+
+    /**
+     * Random32Bit overrides {@code nextLong} to provide random
+     * numbers that fit into a 32-bit size_t. This workaround is
+     * necessary because Java versions before Java 8 offer no
+     * unsigned integer support.
+     */
+    private static class Random32Bit extends Random {
+      @Override
+      public long nextLong() {
+        return this.nextInt(Integer.MAX_VALUE);
+      }
+    }
+
+    /**
+     * Utility class constructor
+     */
+    private PlatformRandomHelper() { }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/ReadOnlyTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/ReadOnlyTest.java
new file mode 100644
index 0000000..70ea75d
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/ReadOnlyTest.java
@@ -0,0 +1,365 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+package org.rocksdb;
+
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class ReadOnlyTest {
+
+  @ClassRule
+  public static final RocksMemoryResource rocksMemoryResource =
+      new RocksMemoryResource();
+
+  @Rule
+  public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  @Test
+  public void readOnlyOpen() throws RocksDBException {
+    RocksDB db = null;
+    RocksDB db2 = null;
+    RocksDB db3 = null;
+    Options options = null;
+    List<ColumnFamilyHandle> columnFamilyHandleList =
+        new ArrayList<>();
+    List<ColumnFamilyHandle> readOnlyColumnFamilyHandleList =
+        new ArrayList<>();
+    List<ColumnFamilyHandle> readOnlyColumnFamilyHandleList2 =
+        new ArrayList<>();
+    try {
+      options = new Options();
+      options.setCreateIfMissing(true);
+
+      db = RocksDB.open(options,
+          dbFolder.getRoot().getAbsolutePath());
+      db.put("key".getBytes(), "value".getBytes());
+      db2 = RocksDB.openReadOnly(
+          dbFolder.getRoot().getAbsolutePath());
+      assertThat("value").
+          isEqualTo(new String(db2.get("key".getBytes())));
+      db.close();
+      db2.close();
+
+      List<ColumnFamilyDescriptor> cfDescriptors = new ArrayList<>();
+      cfDescriptors.add(
+          new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY,
+              new ColumnFamilyOptions()));
+
+      db = RocksDB.open(
+          dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList);
+      columnFamilyHandleList.add(db.createColumnFamily(
+          new ColumnFamilyDescriptor("new_cf".getBytes(), new ColumnFamilyOptions())));
+      columnFamilyHandleList.add(db.createColumnFamily(
+          new ColumnFamilyDescriptor("new_cf2".getBytes(), new ColumnFamilyOptions())));
+      db.put(columnFamilyHandleList.get(2), "key2".getBytes(),
+          "value2".getBytes());
+
+      db2 = RocksDB.openReadOnly(
+          dbFolder.getRoot().getAbsolutePath(), cfDescriptors,
+          readOnlyColumnFamilyHandleList);
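+      // db2 attached only the default column family, so "key2" (stored in
+      // new_cf2) is not visible through it.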
+      assertThat(db2.get("key2".getBytes())).isNull();
+      assertThat(db2.get(readOnlyColumnFamilyHandleList.get(0), "key2".getBytes())).
+          isNull();
+      cfDescriptors.clear();
+      cfDescriptors.add(
+          new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY,
+              new ColumnFamilyOptions()));
+      cfDescriptors.add(
+          new ColumnFamilyDescriptor("new_cf2".getBytes(), new ColumnFamilyOptions()));
+      db3 = RocksDB.openReadOnly(
+          dbFolder.getRoot().getAbsolutePath(), cfDescriptors, readOnlyColumnFamilyHandleList2);
+      assertThat(new String(db3.get(readOnlyColumnFamilyHandleList2.get(1),
+          "key2".getBytes()))).isEqualTo("value2");
+    } finally {
+      for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) {
+        columnFamilyHandle.dispose();
+      }
+      if (db != null) {
+        db.close();
+      }
+      for (ColumnFamilyHandle columnFamilyHandle : readOnlyColumnFamilyHandleList) {
+        columnFamilyHandle.dispose();
+      }
+      if (db2 != null) {
+        db2.close();
+      }
+      for (ColumnFamilyHandle columnFamilyHandle : readOnlyColumnFamilyHandleList2) {
+        columnFamilyHandle.dispose();
+      }
+      if (db3 != null) {
+        db3.close();
+      }
+      if (options != null) {
+        options.dispose();
+      }
+    }
+  }
+
+  @Test(expected = RocksDBException.class)
+  public void failToWriteInReadOnly() throws RocksDBException {
+    RocksDB db = null;
+    RocksDB rDb = null;
+    Options options = null;
+    List<ColumnFamilyDescriptor> cfDescriptors = new ArrayList<>();
+    List<ColumnFamilyHandle> readOnlyColumnFamilyHandleList =
+        new ArrayList<>();
+    try {
+
+      cfDescriptors.add(
+          new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY,
+              new ColumnFamilyOptions()));
+
+      options = new Options();
+      options.setCreateIfMissing(true);
+
+      db = RocksDB.open(options,
+          dbFolder.getRoot().getAbsolutePath());
+      db.close();
+      rDb = RocksDB.openReadOnly(
+          dbFolder.getRoot().getAbsolutePath(), cfDescriptors,
+          readOnlyColumnFamilyHandleList);
+
+      // test that put fails in readonly mode
+      rDb.put("key".getBytes(), "value".getBytes());
+    } finally {
+      for (ColumnFamilyHandle columnFamilyHandle : readOnlyColumnFamilyHandleList) {
+        columnFamilyHandle.dispose();
+      }
+      if (db != null) {
+        db.close();
+      }
+      if (rDb != null) {
+        rDb.close();
+      }
+      if (options != null) {
+        options.dispose();
+      }
+    }
+  }
+
+  @Test(expected = RocksDBException.class)
+  public void failToCFWriteInReadOnly() throws RocksDBException {
+    RocksDB db = null;
+    RocksDB rDb = null;
+    Options options = null;
+    List<ColumnFamilyDescriptor> cfDescriptors = new ArrayList<>();
+    List<ColumnFamilyHandle> readOnlyColumnFamilyHandleList =
+        new ArrayList<>();
+    try {
+      cfDescriptors.add(
+          new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY,
+              new ColumnFamilyOptions()));
+
+      options = new Options();
+      options.setCreateIfMissing(true);
+
+      db = RocksDB.open(options,
+          dbFolder.getRoot().getAbsolutePath());
+      db.close();
+      rDb = RocksDB.openReadOnly(
+          dbFolder.getRoot().getAbsolutePath(), cfDescriptors,
+          readOnlyColumnFamilyHandleList);
+
+      rDb.put(readOnlyColumnFamilyHandleList.get(0),
+          "key".getBytes(), "value".getBytes());
+    } finally {
+      for (ColumnFamilyHandle columnFamilyHandle : readOnlyColumnFamilyHandleList) {
+        columnFamilyHandle.dispose();
+      }
+      if (db != null) {
+        db.close();
+      }
+      if (rDb != null) {
+        rDb.close();
+      }
+      if (options != null) {
+        options.dispose();
+      }
+    }
+  }
+
+  @Test(expected = RocksDBException.class)
+  public void failToRemoveInReadOnly() throws RocksDBException {
+    RocksDB db = null;
+    RocksDB rDb = null;
+    Options options = null;
+    List<ColumnFamilyDescriptor> cfDescriptors = new ArrayList<>();
+    List<ColumnFamilyHandle> readOnlyColumnFamilyHandleList =
+        new ArrayList<>();
+    try {
+      cfDescriptors.add(
+          new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY,
+              new ColumnFamilyOptions()));
+
+      options = new Options();
+      options.setCreateIfMissing(true);
+
+      db = RocksDB.open(options,
+          dbFolder.getRoot().getAbsolutePath());
+      db.close();
+      rDb = RocksDB.openReadOnly(
+          dbFolder.getRoot().getAbsolutePath(), cfDescriptors,
+          readOnlyColumnFamilyHandleList);
+
+      rDb.remove("key".getBytes());
+    } finally {
+      for (ColumnFamilyHandle columnFamilyHandle : readOnlyColumnFamilyHandleList) {
+        columnFamilyHandle.dispose();
+      }
+      if (db != null) {
+        db.close();
+      }
+      if (rDb != null) {
+        rDb.close();
+      }
+      if (options != null) {
+        options.dispose();
+      }
+    }
+  }
+
+  @Test(expected = RocksDBException.class)
+  public void failToCFRemoveInReadOnly() throws RocksDBException {
+    RocksDB db = null;
+    RocksDB rDb = null;
+    Options options = null;
+    List<ColumnFamilyDescriptor> cfDescriptors = new ArrayList<>();
+    List<ColumnFamilyHandle> readOnlyColumnFamilyHandleList =
+        new ArrayList<>();
+    try {
+      cfDescriptors.add(
+          new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY,
+              new ColumnFamilyOptions()));
+
+      options = new Options();
+      options.setCreateIfMissing(true);
+
+      db = RocksDB.open(options,
+          dbFolder.getRoot().getAbsolutePath());
+      db.close();
+
+      rDb = RocksDB.openReadOnly(
+          dbFolder.getRoot().getAbsolutePath(), cfDescriptors,
+          readOnlyColumnFamilyHandleList);
+
+      rDb.remove(readOnlyColumnFamilyHandleList.get(0),
+          "key".getBytes());
+    } finally {
+      for (ColumnFamilyHandle columnFamilyHandle : readOnlyColumnFamilyHandleList) {
+        columnFamilyHandle.dispose();
+      }
+      if (db != null) {
+        db.close();
+      }
+      if (rDb != null) {
+        rDb.close();
+      }
+      if (options != null) {
+        options.dispose();
+      }
+    }
+  }
+
+  @Test(expected = RocksDBException.class)
+  public void failToWriteBatchReadOnly() throws RocksDBException {
+    RocksDB db = null;
+    RocksDB rDb = null;
+    Options options = null;
+    WriteBatch wb = null;
+    List<ColumnFamilyDescriptor> cfDescriptors = new ArrayList<>();
+    List<ColumnFamilyHandle> readOnlyColumnFamilyHandleList =
+        new ArrayList<>();
+    try {
+
+      cfDescriptors.add(
+          new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY,
+              new ColumnFamilyOptions()));
+
+      options = new Options();
+      options.setCreateIfMissing(true);
+
+      db = RocksDB.open(options,
+          dbFolder.getRoot().getAbsolutePath());
+      db.close();
+
+      rDb = RocksDB.openReadOnly(
+          dbFolder.getRoot().getAbsolutePath(), cfDescriptors,
+          readOnlyColumnFamilyHandleList);
+
+      wb = new WriteBatch();
+      wb.put("key".getBytes(), "value".getBytes());
+      rDb.write(new WriteOptions(), wb);
+    } finally {
+      for (ColumnFamilyHandle columnFamilyHandle : readOnlyColumnFamilyHandleList) {
+        columnFamilyHandle.dispose();
+      }
+      if (db != null) {
+        db.close();
+      }
+      if (rDb != null) {
+        rDb.close();
+      }
+      if (options != null) {
+        options.dispose();
+      }
+      if (wb != null) {
+        wb.dispose();
+      }
+    }
+  }
+
+  @Test(expected = RocksDBException.class)
+  public void failToCFWriteBatchReadOnly() throws RocksDBException {
+    RocksDB db = null;
+    RocksDB rDb = null;
+    Options options = null;
+    WriteBatch wb = null;
+    List<ColumnFamilyDescriptor> cfDescriptors = new ArrayList<>();
+    List<ColumnFamilyHandle> readOnlyColumnFamilyHandleList =
+        new ArrayList<>();
+    try {
+
+      cfDescriptors.add(
+          new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY,
+              new ColumnFamilyOptions()));
+
+      options = new Options();
+      options.setCreateIfMissing(true);
+
+      db = RocksDB.open(options,
+          dbFolder.getRoot().getAbsolutePath());
+      db.close();
+
+      rDb = RocksDB.openReadOnly(
+          dbFolder.getRoot().getAbsolutePath(), cfDescriptors,
+          readOnlyColumnFamilyHandleList);
+
+      wb = new WriteBatch();
+      wb.put(readOnlyColumnFamilyHandleList.get(0),
+          "key".getBytes(), "value".getBytes());
+      rDb.write(new WriteOptions(), wb);
+    } finally {
+      for (ColumnFamilyHandle columnFamilyHandle : readOnlyColumnFamilyHandleList) {
+        columnFamilyHandle.dispose();
+      }
+      if (db != null) {
+        db.close();
+      }
+      if (rDb != null) {
+        rDb.close();
+      }
+      if (options != null) {
+        options.dispose();
+      }
+      if (wb != null) {
+        wb.dispose();
+      }
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/ReadOptionsTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/ReadOptionsTest.java
new file mode 100644
index 0000000..af88ce3
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/ReadOptionsTest.java
@@ -0,0 +1,151 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+import java.util.Random;
+
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.ExpectedException;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class ReadOptionsTest {
+
+  @ClassRule
+  public static final RocksMemoryResource rocksMemoryResource =
+      new RocksMemoryResource();
+
+  @Rule
+  public ExpectedException exception = ExpectedException.none();
+
+  @Test
+  public void verifyChecksum(){
+    ReadOptions opt = null;
+    try {
+      opt = new ReadOptions();
+      Random rand = new Random();
+      boolean boolValue = rand.nextBoolean();
+      opt.setVerifyChecksums(boolValue);
+      assertThat(opt.verifyChecksums()).isEqualTo(boolValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void fillCache(){
+    ReadOptions opt = null;
+    try {
+      opt = new ReadOptions();
+      Random rand = new Random();
+      boolean boolValue = rand.nextBoolean();
+      opt.setFillCache(boolValue);
+      assertThat(opt.fillCache()).isEqualTo(boolValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void tailing(){
+    ReadOptions opt = null;
+    try {
+      opt = new ReadOptions();
+      Random rand = new Random();
+      boolean boolValue = rand.nextBoolean();
+      opt.setTailing(boolValue);
+      assertThat(opt.tailing()).isEqualTo(boolValue);
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void snapshot(){
+    ReadOptions opt = null;
+    try {
+      opt = new ReadOptions();
+      opt.setSnapshot(null);
+      assertThat(opt.snapshot()).isNull();
+    } finally {
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void failSetVerifyChecksumUninitialized(){
+    ReadOptions readOptions = setupUninitializedReadOptions(
+        exception);
+    readOptions.setVerifyChecksums(true);
+  }
+
+  @Test
+  public void failVerifyChecksumUninitialized(){
+    ReadOptions readOptions = setupUninitializedReadOptions(
+        exception);
+    readOptions.verifyChecksums();
+  }
+
+  @Test
+  public void failSetFillCacheUninitialized(){
+    ReadOptions readOptions = setupUninitializedReadOptions(
+        exception);
+    readOptions.setFillCache(true);
+  }
+
+  @Test
+  public void failFillCacheUninitialized(){
+    ReadOptions readOptions = setupUninitializedReadOptions(
+        exception);
+    readOptions.fillCache();
+  }
+
+  @Test
+  public void failSetTailingUninitialized(){
+    ReadOptions readOptions = setupUninitializedReadOptions(
+        exception);
+    readOptions.setTailing(true);
+  }
+
+  @Test
+  public void failTailingUninitialized(){
+    ReadOptions readOptions = setupUninitializedReadOptions(
+        exception);
+    readOptions.tailing();
+  }
+
+  @Test
+  public void failSetSnapshotUninitialized(){
+    ReadOptions readOptions = setupUninitializedReadOptions(
+        exception);
+    readOptions.setSnapshot(null);
+  }
+
+  @Test
+  public void failSnapshotUninitialized(){
+    ReadOptions readOptions = setupUninitializedReadOptions(
+        exception);
+    readOptions.snapshot();
+  }
+
+  private ReadOptions setupUninitializedReadOptions(
+      ExpectedException exception) {
+    ReadOptions readOptions = new ReadOptions();
+    readOptions.dispose();
+    exception.expect(AssertionError.class);
+    return readOptions;
+  }
+}
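
The fail*Uninitialized tests above all pin down one invariant: once
dispose() has been called on a ReadOptions instance, any further accessor
trips an AssertionError. A minimal sketch of the guarded lifecycle the
positive tests follow (not part of the patch; class name and flag value
are illustrative):

    import org.rocksdb.ReadOptions;
    import org.rocksdb.RocksDB;

    public class ReadOptionsSketch {
      public static void main(String[] args) {
        RocksDB.loadLibrary();          // load the native library first
        ReadOptions opt = null;
        try {
          opt = new ReadOptions();
          opt.setVerifyChecksums(true); // configure before handing to reads
        } finally {
          if (opt != null) {
            opt.dispose();              // frees the native handle; accessors
          }                             // fail from this point on
        }
      }
    }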
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/RocksDBTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/RocksDBTest.java
new file mode 100644
index 0000000..31d2c52
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/RocksDBTest.java
@@ -0,0 +1,809 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+package org.rocksdb;
+
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class RocksDBTest {
+
+  @ClassRule
+  public static final RocksMemoryResource rocksMemoryResource =
+      new RocksMemoryResource();
+
+  @Rule
+  public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  public static final Random rand = PlatformRandomHelper.
+      getPlatformSpecificRandomFactory();
+
+  @Test
+  public void open() throws RocksDBException {
+    RocksDB db = null;
+    Options opt = null;
+    try {
+      db = RocksDB.open(dbFolder.getRoot().getAbsolutePath());
+      db.close();
+      opt = new Options();
+      opt.setCreateIfMissing(true);
+      db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath());
+    } finally {
+      if (db != null) {
+        db.close();
+      }
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void put() throws RocksDBException {
+    RocksDB db = null;
+    WriteOptions opt = null;
+    try {
+      db = RocksDB.open(dbFolder.getRoot().getAbsolutePath());
+      db.put("key1".getBytes(), "value".getBytes());
+      opt = new WriteOptions();
+      db.put(opt, "key2".getBytes(), "12345678".getBytes());
+      assertThat(db.get("key1".getBytes())).isEqualTo(
+          "value".getBytes());
+      assertThat(db.get("key2".getBytes())).isEqualTo(
+          "12345678".getBytes());
+    } finally {
+      if (db != null) {
+        db.close();
+      }
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void write() throws RocksDBException {
+    RocksDB db = null;
+    Options options = null;
+    WriteBatch wb1 = null;
+    WriteBatch wb2 = null;
+    WriteOptions opts = null;
+    try {
+      options = new Options().
+          setMergeOperator(new StringAppendOperator()).
+          setCreateIfMissing(true);
+      db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath());
+      opts = new WriteOptions();
+      wb1 = new WriteBatch();
+      wb1.put("key1".getBytes(), "aa".getBytes());
+      wb1.merge("key1".getBytes(), "bb".getBytes());
+      wb2 = new WriteBatch();
+      wb2.put("key2".getBytes(), "xx".getBytes());
+      wb2.merge("key2".getBytes(), "yy".getBytes());
+      db.write(opts, wb1);
+      db.write(opts, wb2);
+      assertThat(db.get("key1".getBytes())).isEqualTo(
+          "aa,bb".getBytes());
+      assertThat(db.get("key2".getBytes())).isEqualTo(
+          "xx,yy".getBytes());
+    } finally {
+      if (db != null) {
+        db.close();
+      }
+      if (wb1 != null) {
+        wb1.dispose();
+      }
+      if (wb2 != null) {
+        wb2.dispose();
+      }
+      if (options != null) {
+        options.dispose();
+      }
+      if (opts != null) {
+        opts.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void getWithOutValue() throws RocksDBException {
+    RocksDB db = null;
+    try {
+      db = RocksDB.open(dbFolder.getRoot().getAbsolutePath());
+      db.put("key1".getBytes(), "value".getBytes());
+      db.put("key2".getBytes(), "12345678".getBytes());
+      byte[] outValue = new byte[5];
+      // value not found
+      int getResult = db.get("keyNotFound".getBytes(), outValue);
+      assertThat(getResult).isEqualTo(RocksDB.NOT_FOUND);
+      // found value which fits in outValue
+      getResult = db.get("key1".getBytes(), outValue);
+      assertThat(getResult).isNotEqualTo(RocksDB.NOT_FOUND);
+      assertThat(outValue).isEqualTo("value".getBytes());
+      // found value which fits partially
+      getResult = db.get("key2".getBytes(), outValue);
+      assertThat(getResult).isNotEqualTo(RocksDB.NOT_FOUND);
+      assertThat(outValue).isEqualTo("12345".getBytes());
+    } finally {
+      if (db != null) {
+        db.close();
+      }
+    }
+  }
+
+  @Test
+  public void getWithOutValueReadOptions() throws RocksDBException {
+    RocksDB db = null;
+    ReadOptions rOpt = null;
+    try {
+      db = RocksDB.open(dbFolder.getRoot().getAbsolutePath());
+      rOpt = new ReadOptions();
+      db.put("key1".getBytes(), "value".getBytes());
+      db.put("key2".getBytes(), "12345678".getBytes());
+      byte[] outValue = new byte[5];
+      // value not found
+      int getResult = db.get(rOpt, "keyNotFound".getBytes(),
+          outValue);
+      assertThat(getResult).isEqualTo(RocksDB.NOT_FOUND);
+      // found value which fits in outValue
+      getResult = db.get(rOpt, "key1".getBytes(), outValue);
+      assertThat(getResult).isNotEqualTo(RocksDB.NOT_FOUND);
+      assertThat(outValue).isEqualTo("value".getBytes());
+      // found value which fits partially
+      getResult = db.get(rOpt, "key2".getBytes(), outValue);
+      assertThat(getResult).isNotEqualTo(RocksDB.NOT_FOUND);
+      assertThat(outValue).isEqualTo("12345".getBytes());
+    } finally {
+      if (db != null) {
+        db.close();
+      }
+      if (rOpt != null) {
+        rOpt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void multiGet() throws RocksDBException {
+    RocksDB db = null;
+    ReadOptions rOpt = null;
+    try {
+      db = RocksDB.open(dbFolder.getRoot().getAbsolutePath());
+      rOpt = new ReadOptions();
+      db.put("key1".getBytes(), "value".getBytes());
+      db.put("key2".getBytes(), "12345678".getBytes());
+      List<byte[]> lookupKeys = new ArrayList<byte[]>() {{
+        add("key1".getBytes());
+        add("key2".getBytes());
+      }};
+      Map<byte[], byte[]> results = db.multiGet(lookupKeys);
+      assertThat(results).isNotNull();
+      assertThat(results.values()).isNotNull();
+      assertThat(results.values()).
+          contains("value".getBytes(), "12345678".getBytes());
+      // test same method with ReadOptions
+      results = db.multiGet(rOpt, lookupKeys);
+      assertThat(results).isNotNull();
+      assertThat(results.values()).isNotNull();
+      assertThat(results.values()).
+          contains("value".getBytes(), "12345678".getBytes());
+
+      // remove an existing key by index; remove("key2".getBytes())
+      // would be a no-op here because byte[] compares by identity
+      lookupKeys.remove(1);
+      // add a non-existing key
+      lookupKeys.add("key3".getBytes());
+      results = db.multiGet(lookupKeys);
+      assertThat(results).isNotNull();
+      assertThat(results.values()).isNotNull();
+      assertThat(results.values()).
+          contains("value".getBytes());
+      // test same call with readOptions
+      results = db.multiGet(rOpt, lookupKeys);
+      assertThat(results).isNotNull();
+      assertThat(results.values()).isNotNull();
+      assertThat(results.values()).
+          contains("value".getBytes());
+    } finally {
+      if (db != null) {
+        db.close();
+      }
+      if (rOpt != null) {
+        rOpt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void merge() throws RocksDBException {
+    RocksDB db = null;
+    Options opt = null;
+    WriteOptions wOpt = null;
+    try {
+      opt = new Options().
+          setCreateIfMissing(true).
+          setMergeOperator(new StringAppendOperator());
+      wOpt = new WriteOptions();
+      db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath());
+      db.put("key1".getBytes(), "value".getBytes());
+      assertThat(db.get("key1".getBytes())).isEqualTo(
+          "value".getBytes());
+      // merge key1 with another value portion
+      db.merge("key1".getBytes(), "value2".getBytes());
+      assertThat(db.get("key1".getBytes())).isEqualTo(
+          "value,value2".getBytes());
+      // merge key1 with another value portion
+      db.merge(wOpt, "key1".getBytes(), "value3".getBytes());
+      assertThat(db.get("key1".getBytes())).isEqualTo(
+          "value,value2,value3".getBytes());
+      // merging on a non-existent key inserts the value
+      db.merge(wOpt, "key2".getBytes(), "xxxx".getBytes());
+      assertThat(db.get("key2".getBytes())).isEqualTo(
+          "xxxx".getBytes());
+    } finally {
+      if (db != null) {
+        db.close();
+      }
+      if (opt != null) {
+        opt.dispose();
+      }
+      if (wOpt != null) {
+        wOpt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void remove() throws RocksDBException {
+    RocksDB db = null;
+    WriteOptions wOpt = null;
+    try {
+      wOpt = new WriteOptions();
+      db = RocksDB.open(dbFolder.getRoot().getAbsolutePath());
+      db.put("key1".getBytes(), "value".getBytes());
+      db.put("key2".getBytes(), "12345678".getBytes());
+      assertThat(db.get("key1".getBytes())).isEqualTo(
+          "value".getBytes());
+      assertThat(db.get("key2".getBytes())).isEqualTo(
+          "12345678".getBytes());
+      db.remove("key1".getBytes());
+      db.remove(wOpt, "key2".getBytes());
+      assertThat(db.get("key1".getBytes())).isNull();
+      assertThat(db.get("key2".getBytes())).isNull();
+    } finally {
+      if (db != null) {
+        db.close();
+      }
+      if (wOpt != null) {
+        wOpt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void getLongProperty() throws RocksDBException {
+    RocksDB db = null;
+    Options options = null;
+    WriteOptions wOpt = null;
+    try {
+      options = new Options();
+      wOpt = new WriteOptions();
+      // Setup options
+      options.setCreateIfMissing(true);
+      options.setMaxWriteBufferNumber(10);
+      options.setMinWriteBufferNumberToMerge(10);
+      wOpt.setDisableWAL(true);
+      db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath());
+      db.put(wOpt, "key1".getBytes(), "value1".getBytes());
+      db.put(wOpt, "key2".getBytes(), "value2".getBytes());
+      db.put(wOpt, "key3".getBytes(), "value3".getBytes());
+      db.put(wOpt, "key4".getBytes(), "value4".getBytes());
+      assertThat(db.getLongProperty("rocksdb.num-entries-active-mem-table")).isGreaterThan(0);
+      assertThat(db.getLongProperty("rocksdb.cur-size-active-mem-table")).isGreaterThan(0);
+    } finally {
+      if (db != null) {
+        db.close();
+      }
+      if (options != null) {
+        options.dispose();
+      }
+      if (wOpt != null) {
+        wOpt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void fullCompactRange() throws RocksDBException {
+    RocksDB db = null;
+    Options opt = null;
+    try {
+      opt = new Options().
+          setCreateIfMissing(true).
+          setDisableAutoCompactions(true).
+          setCompactionStyle(CompactionStyle.LEVEL).
+          setNumLevels(4).
+          setWriteBufferSize(100 << 10).
+          setLevelZeroFileNumCompactionTrigger(3).
+          setTargetFileSizeBase(200 << 10).
+          setTargetFileSizeMultiplier(1).
+          setMaxBytesForLevelBase(500 << 10).
+          setMaxBytesForLevelMultiplier(1).
+          setDisableAutoCompactions(false);
+      // open database
+      db = RocksDB.open(opt,
+          dbFolder.getRoot().getAbsolutePath());
+      // fill database with key/value pairs
+      byte[] b = new byte[10000];
+      for (int i = 0; i < 200; i++) {
+        rand.nextBytes(b);
+        db.put((String.valueOf(i)).getBytes(), b);
+      }
+      db.compactRange();
+    } finally {
+      if (db != null) {
+        db.close();
+      }
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void fullCompactRangeColumnFamily()
+      throws RocksDBException {
+    RocksDB db = null;
+    DBOptions opt = null;
+    List<ColumnFamilyHandle> columnFamilyHandles =
+        new ArrayList<>();
+    try {
+      opt = new DBOptions().
+          setCreateIfMissing(true).
+          setCreateMissingColumnFamilies(true);
+      List<ColumnFamilyDescriptor> columnFamilyDescriptors =
+          new ArrayList<>();
+      columnFamilyDescriptors.add(new ColumnFamilyDescriptor(
+          RocksDB.DEFAULT_COLUMN_FAMILY));
+      columnFamilyDescriptors.add(new ColumnFamilyDescriptor(
+          "new_cf".getBytes(),
+          new ColumnFamilyOptions().
+              setDisableAutoCompactions(true).
+              setCompactionStyle(CompactionStyle.LEVEL).
+              setNumLevels(4).
+              setWriteBufferSize(100 << 10).
+              setLevelZeroFileNumCompactionTrigger(3).
+              setTargetFileSizeBase(200 << 10).
+              setTargetFileSizeMultiplier(1).
+              setMaxBytesForLevelBase(500 << 10).
+              setMaxBytesForLevelMultiplier(1).
+              setDisableAutoCompactions(false)));
+      // open database
+      db = RocksDB.open(opt,
+          dbFolder.getRoot().getAbsolutePath(),
+          columnFamilyDescriptors,
+          columnFamilyHandles);
+      // fill database with key/value pairs
+      byte[] b = new byte[10000];
+      for (int i = 0; i < 200; i++) {
+        rand.nextBytes(b);
+        db.put(columnFamilyHandles.get(1),
+            String.valueOf(i).getBytes(), b);
+      }
+      db.compactRange(columnFamilyHandles.get(1));
+    } finally {
+      for (ColumnFamilyHandle handle : columnFamilyHandles) {
+        handle.dispose();
+      }
+      if (db != null) {
+        db.close();
+      }
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void compactRangeWithKeys()
+      throws RocksDBException {
+    RocksDB db = null;
+    Options opt = null;
+    try {
+      opt = new Options().
+          setCreateIfMissing(true).
+          setDisableAutoCompactions(true).
+          setCompactionStyle(CompactionStyle.LEVEL).
+          setNumLevels(4).
+          setWriteBufferSize(100 << 10).
+          setLevelZeroFileNumCompactionTrigger(3).
+          setTargetFileSizeBase(200 << 10).
+          setTargetFileSizeMultiplier(1).
+          setMaxBytesForLevelBase(500 << 10).
+          setMaxBytesForLevelMultiplier(1).
+          setDisableAutoCompactions(false);
+      // open database
+      db = RocksDB.open(opt,
+          dbFolder.getRoot().getAbsolutePath());
+      // fill database with key/value pairs
+      byte[] b = new byte[10000];
+      for (int i = 0; i < 200; i++) {
+        rand.nextBytes(b);
+        db.put((String.valueOf(i)).getBytes(), b);
+      }
+      db.compactRange("0".getBytes(), "201".getBytes());
+    } finally {
+      if (db != null) {
+        db.close();
+      }
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void compactRangeWithKeysReduce()
+      throws RocksDBException {
+    RocksDB db = null;
+    Options opt = null;
+    try {
+      opt = new Options().
+          setCreateIfMissing(true).
+          setDisableAutoCompactions(true).
+          setCompactionStyle(CompactionStyle.LEVEL).
+          setNumLevels(4).
+          setWriteBufferSize(100 << 10).
+          setLevelZeroFileNumCompactionTrigger(3).
+          setTargetFileSizeBase(200 << 10).
+          setTargetFileSizeMultiplier(1).
+          setMaxBytesForLevelBase(500 << 10).
+          setMaxBytesForLevelMultiplier(1).
+          setDisableAutoCompactions(false);
+      // open database
+      db = RocksDB.open(opt,
+          dbFolder.getRoot().getAbsolutePath());
+      // fill database with key/value pairs
+      byte[] b = new byte[10000];
+      for (int i = 0; i < 200; i++) {
+        rand.nextBytes(b);
+        db.put((String.valueOf(i)).getBytes(), b);
+      }
+      db.flush(new FlushOptions().setWaitForFlush(true));
+      db.compactRange("0".getBytes(), "201".getBytes(),
+          true, -1, 0);
+    } finally {
+      if (db != null) {
+        db.close();
+      }
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void compactRangeWithKeysColumnFamily()
+      throws RocksDBException {
+    RocksDB db = null;
+    DBOptions opt = null;
+    List<ColumnFamilyHandle> columnFamilyHandles =
+        new ArrayList<>();
+    try {
+      opt = new DBOptions().
+          setCreateIfMissing(true).
+          setCreateMissingColumnFamilies(true);
+      List<ColumnFamilyDescriptor> columnFamilyDescriptors =
+          new ArrayList<>();
+      columnFamilyDescriptors.add(new ColumnFamilyDescriptor(
+          RocksDB.DEFAULT_COLUMN_FAMILY));
+      columnFamilyDescriptors.add(new ColumnFamilyDescriptor(
+          "new_cf".getBytes(),
+          new ColumnFamilyOptions().
+              setDisableAutoCompactions(true).
+              setCompactionStyle(CompactionStyle.LEVEL).
+              setNumLevels(4).
+              setWriteBufferSize(100 << 10).
+              setLevelZeroFileNumCompactionTrigger(3).
+              setTargetFileSizeBase(200 << 10).
+              setTargetFileSizeMultiplier(1).
+              setMaxBytesForLevelBase(500 << 10).
+              setMaxBytesForLevelMultiplier(1).
+              setDisableAutoCompactions(false)));
+      // open database
+      db = RocksDB.open(opt,
+          dbFolder.getRoot().getAbsolutePath(),
+          columnFamilyDescriptors,
+          columnFamilyHandles);
+      // fill database with key/value pairs
+      byte[] b = new byte[10000];
+      for (int i = 0; i < 200; i++) {
+        rand.nextBytes(b);
+        db.put(columnFamilyHandles.get(1),
+            String.valueOf(i).getBytes(), b);
+      }
+      db.compactRange(columnFamilyHandles.get(1),
+          "0".getBytes(), "201".getBytes());
+    } finally {
+      for (ColumnFamilyHandle handle : columnFamilyHandles) {
+        handle.dispose();
+      }
+      if (db != null) {
+        db.close();
+      }
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void compactRangeWithKeysReduceColumnFamily()
+      throws RocksDBException {
+    RocksDB db = null;
+    DBOptions opt = null;
+    List<ColumnFamilyHandle> columnFamilyHandles =
+        new ArrayList<>();
+    try {
+      opt = new DBOptions().
+          setCreateIfMissing(true).
+          setCreateMissingColumnFamilies(true);
+      List<ColumnFamilyDescriptor> columnFamilyDescriptors =
+          new ArrayList<>();
+      columnFamilyDescriptors.add(new ColumnFamilyDescriptor(
+          RocksDB.DEFAULT_COLUMN_FAMILY));
+      columnFamilyDescriptors.add(new ColumnFamilyDescriptor(
+          "new_cf".getBytes(),
+          new ColumnFamilyOptions().
+              setDisableAutoCompactions(true).
+              setCompactionStyle(CompactionStyle.LEVEL).
+              setNumLevels(4).
+              setWriteBufferSize(100 << 10).
+              setLevelZeroFileNumCompactionTrigger(3).
+              setTargetFileSizeBase(200 << 10).
+              setTargetFileSizeMultiplier(1).
+              setMaxBytesForLevelBase(500 << 10).
+              setMaxBytesForLevelMultiplier(1).
+              setDisableAutoCompactions(false)));
+      // open database
+      db = RocksDB.open(opt,
+          dbFolder.getRoot().getAbsolutePath(),
+          columnFamilyDescriptors,
+          columnFamilyHandles);
+      // fill database with key/value pairs
+      byte[] b = new byte[10000];
+      for (int i = 0; i < 200; i++) {
+        rand.nextBytes(b);
+        db.put(columnFamilyHandles.get(1),
+            String.valueOf(i).getBytes(), b);
+      }
+      db.compactRange(columnFamilyHandles.get(1), "0".getBytes(),
+          "201".getBytes(), true, -1, 0);
+    } finally {
+      for (ColumnFamilyHandle handle : columnFamilyHandles) {
+        handle.dispose();
+      }
+      if (db != null) {
+        db.close();
+      }
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void compactRangeToLevel()
+      throws RocksDBException, InterruptedException {
+    RocksDB db = null;
+    Options opt = null;
+    try {
+      final int NUM_KEYS_PER_L0_FILE = 100;
+      final int KEY_SIZE = 20;
+      final int VALUE_SIZE = 300;
+      final int L0_FILE_SIZE =
+          NUM_KEYS_PER_L0_FILE * (KEY_SIZE + VALUE_SIZE);
+      final int NUM_L0_FILES = 10;
+      final int TEST_SCALE = 5;
+      final int KEY_INTERVAL = 100;
+      opt = new Options().
+          setCreateIfMissing(true).
+          setCompactionStyle(CompactionStyle.LEVEL).
+          setNumLevels(5).
+          // a write buffer slightly bigger than an L0 file,
+          // so that a manual flush always happens before a
+          // background flush kicks in.
+          setWriteBufferSize(L0_FILE_SIZE * 2).
+          // Disable auto L0 -> L1 compaction
+          setLevelZeroFileNumCompactionTrigger(20).
+          setTargetFileSizeBase(L0_FILE_SIZE * 100).
+          setTargetFileSizeMultiplier(1).
+          // To disable auto compaction
+          setMaxBytesForLevelBase(NUM_L0_FILES * L0_FILE_SIZE * 100).
+          setMaxBytesForLevelMultiplier(2).
+          setDisableAutoCompactions(true);
+      db = RocksDB.open(opt,
+          dbFolder.getRoot().getAbsolutePath());
+      // fill database with key/value pairs
+      byte[] value = new byte[VALUE_SIZE];
+      int int_key = 0;
+      for (int round = 0; round < TEST_SCALE; ++round) {
+        int initial_key = int_key;
+        for (int f = 1; f <= NUM_L0_FILES; ++f) {
+          for (int i = 0; i < NUM_KEYS_PER_L0_FILE; ++i) {
+            int_key += KEY_INTERVAL;
+            rand.nextBytes(value);
+
+            db.put(String.format("%020d", int_key).getBytes(),
+                   value);
+          }
+          db.flush(new FlushOptions().setWaitForFlush(true));
+          // Make sure we created one more L0 file.
+          assertThat(
+              db.getProperty("rocksdb.num-files-at-level0")).
+              isEqualTo("" + f);
+        }
+
+        // Compact all L0 files we just created
+        db.compactRange(
+            String.format("%020d", initial_key).getBytes(),
+            String.format("%020d", int_key - 1).getBytes());
+        // Make sure there are no L0 files left.
+        assertThat(
+            db.getProperty("rocksdb.num-files-at-level0")).
+            isEqualTo("0");
+        // Make sure there are some L1 files.
+        // We only check != 0 rather than a specific number,
+        // so the test makes no assumptions about how
+        // compaction works.
+        assertThat(
+            db.getProperty("rocksdb.num-files-at-level1")).
+            isNotEqualTo("0");
+        // Because we only compacted the keys issued in this
+        // round, there shouldn't be any L1 -> L2 compaction,
+        // so we expect zero L2 files here.
+        assertThat(
+            db.getProperty("rocksdb.num-files-at-level2")).
+            isEqualTo("0");
+      }
+    } finally {
+      if (db != null) {
+        db.close();
+      }
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void compactRangeToLevelColumnFamily()
+      throws RocksDBException {
+    RocksDB db = null;
+    DBOptions opt = null;
+    List<ColumnFamilyHandle> columnFamilyHandles =
+        new ArrayList<>();
+    try {
+      final int NUM_KEYS_PER_L0_FILE = 100;
+      final int KEY_SIZE = 20;
+      final int VALUE_SIZE = 300;
+      final int L0_FILE_SIZE =
+          NUM_KEYS_PER_L0_FILE * (KEY_SIZE + VALUE_SIZE);
+      final int NUM_L0_FILES = 10;
+      final int TEST_SCALE = 5;
+      final int KEY_INTERVAL = 100;
+      opt = new DBOptions().
+          setCreateIfMissing(true).
+          setCreateMissingColumnFamilies(true);
+      List<ColumnFamilyDescriptor> columnFamilyDescriptors =
+          new ArrayList<>();
+      columnFamilyDescriptors.add(new ColumnFamilyDescriptor(
+          RocksDB.DEFAULT_COLUMN_FAMILY));
+      columnFamilyDescriptors.add(new ColumnFamilyDescriptor(
+          "new_cf".getBytes(),
+          new ColumnFamilyOptions().
+              setCompactionStyle(CompactionStyle.LEVEL).
+              setNumLevels(5).
+              // a write buffer slightly bigger than an L0 file,
+              // so that a manual flush always happens before a
+              // background flush kicks in.
+              setWriteBufferSize(L0_FILE_SIZE * 2).
+              // Disable auto L0 -> L1 compaction
+              setLevelZeroFileNumCompactionTrigger(20).
+              setTargetFileSizeBase(L0_FILE_SIZE * 100).
+              setTargetFileSizeMultiplier(1).
+              // To disable auto compaction
+              setMaxBytesForLevelBase(NUM_L0_FILES * L0_FILE_SIZE * 100).
+              setMaxBytesForLevelMultiplier(2).
+              setDisableAutoCompactions(true)));
+      // open database
+      db = RocksDB.open(opt,
+          dbFolder.getRoot().getAbsolutePath(),
+          columnFamilyDescriptors,
+          columnFamilyHandles);
+      // fill database with key/value pairs
+      byte[] value = new byte[VALUE_SIZE];
+      int int_key = 0;
+      for (int round = 0; round < TEST_SCALE; ++round) {
+        int initial_key = int_key;
+        for (int f = 1; f <= NUM_L0_FILES; ++f) {
+          for (int i = 0; i < NUM_KEYS_PER_L0_FILE; ++i) {
+            int_key += KEY_INTERVAL;
+            rand.nextBytes(value);
+
+            db.put(columnFamilyHandles.get(1),
+                   String.format("%020d", int_key).getBytes(),
+                   value);
+          }
+          db.flush(new FlushOptions().setWaitForFlush(true),
+                   columnFamilyHandles.get(1));
+          // Make sure we created one more L0 file.
+          assertThat(
+              db.getProperty(columnFamilyHandles.get(1),
+                             "rocksdb.num-files-at-level0")).
+              isEqualTo("" + f);
+        }
+
+        // Compact all L0 files we just created
+        db.compactRange(
+            columnFamilyHandles.get(1),
+            String.format("%020d", initial_key).getBytes(),
+            String.format("%020d", int_key - 1).getBytes());
+        // Make sure there are no L0 files left.
+        assertThat(
+            db.getProperty(columnFamilyHandles.get(1),
+                           "rocksdb.num-files-at-level0")).
+            isEqualTo("0");
+        // Make sure there are some L1 files.
+        // We only check != 0 rather than a specific number,
+        // so the test makes no assumptions about how
+        // compaction works.
+        assertThat(
+            db.getProperty(columnFamilyHandles.get(1),
+                           "rocksdb.num-files-at-level1")).
+            isNotEqualTo("0");
+        // Because we only compacted the keys issued in this
+        // round, there shouldn't be any L1 -> L2 compaction,
+        // so we expect zero L2 files here.
+        assertThat(
+            db.getProperty(columnFamilyHandles.get(1),
+                           "rocksdb.num-files-at-level2")).
+            isEqualTo("0");
+      }
+    } finally {
+      for (ColumnFamilyHandle handle : columnFamilyHandles) {
+        handle.dispose();
+      }
+      if (db != null) {
+        db.close();
+      }
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void enableDisableFileDeletions() throws RocksDBException {
+    RocksDB db = null;
+    Options options = null;
+    try {
+      options = new Options().setCreateIfMissing(true);
+      db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath());
+      db.disableFileDeletions();
+      db.enableFileDeletions(false);
+      db.disableFileDeletions();
+      db.enableFileDeletions(true);
+    } finally {
+      if (db != null) {
+        db.close();
+      }
+      if (options != null) {
+        options.dispose();
+      }
+    }
+  }
+}
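
Taken together, the RocksDBTest cases walk the full basic lifecycle: open,
put/get/merge/remove, write batches, property queries, and manual
compaction. The skeleton they all share looks roughly like this outside
JUnit (a sketch, not part of the patch; the database path is illustrative):

    import org.rocksdb.Options;
    import org.rocksdb.RocksDB;
    import org.rocksdb.RocksDBException;

    public class BasicLifecycleSketch {
      public static void main(String[] args) throws RocksDBException {
        RocksDB.loadLibrary();
        RocksDB db = null;
        Options opt = null;
        try {
          opt = new Options().setCreateIfMissing(true);
          db = RocksDB.open(opt, "/tmp/rocksdb-sketch");
          db.put("key".getBytes(), "value".getBytes());
          byte[] value = db.get("key".getBytes()); // null when absent
          System.out.println(new String(value));
        } finally {
          if (db != null) {
            db.close();      // close the db before disposing the options
          }
          if (opt != null) {
            opt.dispose();
          }
        }
      }
    }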
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/RocksEnvTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/RocksEnvTest.java
new file mode 100644
index 0000000..5914e6e
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/RocksEnvTest.java
@@ -0,0 +1,38 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+import org.junit.ClassRule;
+import org.junit.Test;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class RocksEnvTest {
+
+  @ClassRule
+  public static final RocksMemoryResource rocksMemoryResource =
+      new RocksMemoryResource();
+
+  @Test
+  public void rocksEnv(){
+    Env rocksEnv = RocksEnv.getDefault();
+    rocksEnv.setBackgroundThreads(5);
+    // default rocksenv will always return zero for flush pool
+    // no matter what was set via setBackgroundThreads
+    assertThat(rocksEnv.getThreadPoolQueueLen(RocksEnv.FLUSH_POOL)).
+        isEqualTo(0);
+    rocksEnv.setBackgroundThreads(5, RocksEnv.FLUSH_POOL);
+    // default rocksenv will always return zero for flush pool
+    // no matter what was set via setBackgroundThreads
+    assertThat(rocksEnv.getThreadPoolQueueLen(RocksEnv.FLUSH_POOL)).
+        isEqualTo(0);
+    rocksEnv.setBackgroundThreads(5, RocksEnv.COMPACTION_POOL);
+    // default rocksenv will always return zero for compaction pool
+    // no matter what was set via setBackgroundThreads
+    assertThat(rocksEnv.getThreadPoolQueueLen(RocksEnv.COMPACTION_POOL)).
+        isEqualTo(0);
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/RocksIteratorTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/RocksIteratorTest.java
new file mode 100644
index 0000000..170170f
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/RocksIteratorTest.java
@@ -0,0 +1,72 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+package org.rocksdb;
+
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class RocksIteratorTest {
+
+  @ClassRule
+  public static final RocksMemoryResource rocksMemoryResource =
+      new RocksMemoryResource();
+
+  @Rule
+  public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  @Test
+  public void rocksIterator() throws RocksDBException {
+    RocksDB db = null;
+    Options options = null;
+    RocksIterator iterator = null;
+    try {
+      options = new Options();
+      options.setCreateIfMissing(true)
+          .setCreateMissingColumnFamilies(true);
+      db = RocksDB.open(options,
+          dbFolder.getRoot().getAbsolutePath());
+      db.put("key1".getBytes(), "value1".getBytes());
+      db.put("key2".getBytes(), "value2".getBytes());
+
+      iterator = db.newIterator();
+
+      iterator.seekToFirst();
+      assertThat(iterator.isValid()).isTrue();
+      assertThat(iterator.key()).isEqualTo("key1".getBytes());
+      assertThat(iterator.value()).isEqualTo("value1".getBytes());
+      iterator.next();
+      assertThat(iterator.isValid()).isTrue();
+      assertThat(iterator.key()).isEqualTo("key2".getBytes());
+      assertThat(iterator.value()).isEqualTo("value2".getBytes());
+      iterator.next();
+      assertThat(iterator.isValid()).isFalse();
+      iterator.seekToLast();
+      iterator.prev();
+      assertThat(iterator.isValid()).isTrue();
+      assertThat(iterator.key()).isEqualTo("key1".getBytes());
+      assertThat(iterator.value()).isEqualTo("value1".getBytes());
+      iterator.seekToFirst();
+      iterator.seekToLast();
+      assertThat(iterator.isValid()).isTrue();
+      assertThat(iterator.key()).isEqualTo("key2".getBytes());
+      assertThat(iterator.value()).isEqualTo("value2".getBytes());
+      iterator.status();
+    } finally {
+      if (iterator != null) {
+        iterator.dispose();
+      }
+      if (db != null) {
+        db.close();
+      }
+      if (options != null) {
+        options.dispose();
+      }
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/RocksMemEnvTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/RocksMemEnvTest.java
new file mode 100644
index 0000000..d2791c9
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/RocksMemEnvTest.java
@@ -0,0 +1,196 @@
+// Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class RocksMemEnvTest {
+
+  @ClassRule
+  public static final RocksMemoryResource rocksMemoryResource =
+      new RocksMemoryResource();
+
+  @Test
+  public void memEnvFillAndReopen() throws RocksDBException {
+
+    final byte[][] keys = {
+        "aaa".getBytes(),
+        "bbb".getBytes(),
+        "ccc".getBytes()
+    };
+
+    final byte[][] values = {
+        "foo".getBytes(),
+        "bar".getBytes(),
+        "baz".getBytes()
+    };
+
+    Env env = null;
+    Options options = null;
+    RocksDB db = null;
+    FlushOptions flushOptions = null;
+    try {
+      env = new RocksMemEnv();
+      options = new Options().
+          setCreateIfMissing(true).
+          setEnv(env);
+      flushOptions = new FlushOptions().
+          setWaitForFlush(true);
+      db = RocksDB.open(options, "dir/db");
+
+      // write key/value pairs using MemEnv
+      for (int i=0; i < keys.length; i++) {
+        db.put(keys[i], values[i]);
+      }
+
+      // read key/value pairs using MemEnv
+      for (int i=0; i < keys.length; i++) {
+        assertThat(db.get(keys[i])).isEqualTo(values[i]);
+      }
+
+      // Check iterator access
+      RocksIterator iterator = db.newIterator();
+      iterator.seekToFirst();
+      for (int i=0; i < keys.length; i++) {
+        assertThat(iterator.isValid()).isTrue();
+        assertThat(iterator.key()).isEqualTo(keys[i]);
+        assertThat(iterator.value()).isEqualTo(values[i]);
+        iterator.next();
+      }
+      // reached end of database
+      assertThat(iterator.isValid()).isFalse();
+      iterator.dispose();
+
+      // flush
+      db.flush(flushOptions);
+
+      // read key/value pairs after flush using MemEnv
+      for (int i=0; i < keys.length; i++) {
+        assertThat(db.get(keys[i])).isEqualTo(values[i]);
+      }
+
+      db.close();
+      options.setCreateIfMissing(false);
+
+      // After reopening, the values should still be in the mem env,
+      // as long as the env itself has not been freed.
+      db = RocksDB.open(options, "dir/db");
+      // read key/value pairs using MemEnv
+      for (int i=0; i < keys.length; i++) {
+        assertThat(db.get(keys[i])).isEqualTo(values[i]);
+      }
+
+    } finally {
+      if (db != null) {
+        db.close();
+      }
+      if (options != null) {
+        options.dispose();
+      }
+      if (flushOptions != null) {
+        flushOptions.dispose();
+      }
+      if (env != null) {
+        env.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void multipleDatabaseInstances() throws RocksDBException {
+    // db - keys
+    final byte[][] keys = {
+        "aaa".getBytes(),
+        "bbb".getBytes(),
+        "ccc".getBytes()
+    };
+    // otherDb - keys
+    final byte[][] otherKeys = {
+        "111".getBytes(),
+        "222".getBytes(),
+        "333".getBytes()
+    };
+    // values
+    final byte[][] values = {
+        "foo".getBytes(),
+        "bar".getBytes(),
+        "baz".getBytes()
+    };
+
+    Env env = null;
+    Options options = null;
+    RocksDB db = null, otherDb = null;
+
+    try {
+      env = new RocksMemEnv();
+      options = new Options().
+          setCreateIfMissing(true).
+          setEnv(env);
+      db = RocksDB.open(options, "dir/db");
+      otherDb = RocksDB.open(options, "dir/otherDb");
+
+      // write key/value pairs using MemEnv
+      // to db and to otherDb.
+      for (int i=0; i < keys.length; i++) {
+        db.put(keys[i], values[i]);
+        otherDb.put(otherKeys[i], values[i]);
+      }
+
+      // verify that each db only sees its own key/value pairs
+      for (int i=0; i < keys.length; i++) {
+        // verify db
+        assertThat(db.get(otherKeys[i])).isNull();
+        assertThat(db.get(keys[i])).isEqualTo(values[i]);
+
+        // verify otherDb
+        assertThat(otherDb.get(keys[i])).isNull();
+        assertThat(otherDb.get(otherKeys[i])).isEqualTo(values[i]);
+      }
+    } finally {
+      if (db != null) {
+        db.close();
+      }
+      if (otherDb != null) {
+        otherDb.close();
+      }
+      if (options != null) {
+        options.dispose();
+      }
+      if (env != null) {
+        env.dispose();
+      }
+    }
+  }
+
+  @Test(expected = RocksDBException.class)
+  public void createIfMissingFalse() throws RocksDBException {
+    Env env = null;
+    Options options = null;
+    RocksDB db = null;
+
+    try {
+      env = new RocksMemEnv();
+      options = new Options().
+          setCreateIfMissing(false).
+          setEnv(env);
+      // this should throw a RocksDBException because the
+      // db dir does not exist.
+      db = RocksDB.open(options, "db/dir");
+    } finally {
+      if (options != null) {
+        options.dispose();
+      }
+      if (env != null) {
+        env.dispose();
+      }
+    }
+  }
+}
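
The point these tests rely on is that a RocksMemEnv keeps all database
files in memory for as long as the env object itself is alive, so a
database can be closed and reopened against a purely virtual path. In
condensed form (a sketch, not part of the patch):

    import org.rocksdb.*;

    public class MemEnvSketch {
      public static void main(String[] args) throws RocksDBException {
        RocksDB.loadLibrary();
        Env env = new RocksMemEnv();
        Options options = new Options().setCreateIfMissing(true).setEnv(env);
        RocksDB db = RocksDB.open(options, "dir/db"); // never touches disk
        try {
          db.put("k".getBytes(), "v".getBytes());
        } finally {
          db.close();
          options.dispose();
          env.dispose();     // this also frees the in-memory files
        }
      }
    }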
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/RocksMemoryResource.java b/src/rocksdb/java/src/test/java/org/rocksdb/RocksMemoryResource.java
new file mode 100644
index 0000000..de9ba0d
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/RocksMemoryResource.java
@@ -0,0 +1,20 @@
+package org.rocksdb;
+
+import org.junit.rules.ExternalResource;
+
+/**
+ * Resource to trigger garbage collection after each test
+ * run.
+ */
+public class RocksMemoryResource extends ExternalResource {
+
+  static {
+    RocksDB.loadLibrary();
+  }
+
+  @Override
+  protected void after() {
+    System.gc();
+    System.runFinalization();
+  }
+}
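
Every test class in this suite wires the resource in the same way; for
reference, the minimal wiring is (illustrative class name):

    import org.junit.ClassRule;

    public class SomeRocksTest {
      // the static block in RocksMemoryResource loads the native library
      // once; after() then triggers GC and finalization when the tests
      // in the class have finished
      @ClassRule
      public static final RocksMemoryResource rocksMemoryResource =
          new RocksMemoryResource();
    }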
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/SliceTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/SliceTest.java
new file mode 100644
index 0000000..fbd602b
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/SliceTest.java
@@ -0,0 +1,105 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+package org.rocksdb;
+
+import org.junit.ClassRule;
+import org.junit.Test;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class SliceTest {
+
+  @ClassRule
+  public static final RocksMemoryResource rocksMemoryResource =
+      new RocksMemoryResource();
+
+  @Test
+  public void slice() {
+    Slice slice = null;
+    Slice otherSlice = null;
+    Slice thirdSlice = null;
+    try {
+      slice = new Slice("testSlice");
+      assertThat(slice.empty()).isFalse();
+      assertThat(slice.size()).isEqualTo(9);
+      assertThat(slice.data()).isEqualTo("testSlice".getBytes());
+
+      otherSlice = new Slice("otherSlice".getBytes());
+      assertThat(otherSlice.data()).isEqualTo("otherSlice".getBytes());
+
+      thirdSlice = new Slice("otherSlice".getBytes(), 5);
+      assertThat(thirdSlice.data()).isEqualTo("Slice".getBytes());
+    } finally {
+      if (slice != null) {
+        slice.dispose();
+      }
+      if (otherSlice != null) {
+        otherSlice.dispose();
+      }
+      if (thirdSlice != null) {
+        thirdSlice.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void sliceEquals() {
+    Slice slice = null;
+    Slice slice2 = null;
+    try {
+      slice = new Slice("abc");
+      slice2 = new Slice("abc");
+      assertThat(slice.equals(slice2)).isTrue();
+      assertThat(slice.hashCode() == slice2.hashCode()).isTrue();
+    } finally {
+      if (slice != null) {
+        slice.dispose();
+      }
+      if (slice2 != null) {
+        slice2.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void sliceStartWith() {
+    Slice slice = null;
+    Slice match = null;
+    Slice noMatch = null;
+    try {
+      slice = new Slice("matchpoint");
+      match = new Slice("mat");
+      noMatch = new Slice("nomatch");
+
+      //assertThat(slice.startsWith(match)).isTrue();
+      assertThat(slice.startsWith(noMatch)).isFalse();
+    } finally {
+      if (slice != null) {
+        slice.dispose();
+      }
+      if (match != null) {
+        match.dispose();
+      }
+      if (noMatch != null) {
+        noMatch.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void sliceToString() {
+    Slice slice = null;
+    try {
+      slice = new Slice("stringTest");
+      assertThat(slice.toString()).isEqualTo("stringTest");
+      assertThat(slice.toString(true)).isNotEqualTo("");
+    } finally {
+      if (slice != null) {
+        slice.dispose();
+      }
+    }
+  }
+}
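
One detail worth noting from the slice() test above: the (byte[], int)
constructor treats the int as an offset into the buffer, which is why
new Slice("otherSlice".getBytes(), 5) carries only the trailing "Slice".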
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/SnapshotTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/SnapshotTest.java
new file mode 100644
index 0000000..87ccdbc
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/SnapshotTest.java
@@ -0,0 +1,217 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+package org.rocksdb;
+
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class SnapshotTest {
+
+  @ClassRule
+  public static final RocksMemoryResource rocksMemoryResource =
+      new RocksMemoryResource();
+
+  @Rule
+  public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  @Test
+  public void snapshots() throws RocksDBException {
+    RocksDB db = null;
+    Options options = null;
+    ReadOptions readOptions = null;
+    try {
+
+      options = new Options();
+      options.setCreateIfMissing(true);
+
+      db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath());
+      db.put("key".getBytes(), "value".getBytes());
+      // Get new Snapshot of database
+      Snapshot snapshot = db.getSnapshot();
+      assertThat(snapshot.getSequenceNumber()).isGreaterThan(0);
+      assertThat(snapshot.getSequenceNumber()).isEqualTo(1);
+      readOptions = new ReadOptions();
+      // set snapshot in ReadOptions
+      readOptions.setSnapshot(snapshot);
+      // retrieve key value pair
+      assertThat(new String(db.get("key".getBytes()))).
+          isEqualTo("value");
+      // retrieve key value pair created before
+      // the snapshot was made
+      assertThat(new String(db.get(readOptions,
+          "key".getBytes()))).isEqualTo("value");
+      // add new key/value pair
+      db.put("newkey".getBytes(), "newvalue".getBytes());
+      // without a snapshot, the latest db entries
+      // are visible
+      assertThat(new String(db.get("newkey".getBytes()))).
+          isEqualTo("newvalue");
+      // the snapshot was created before newkey was added
+      assertThat(db.get(readOptions, "newkey".getBytes())).
+          isNull();
+      // Retrieve snapshot from read options
+      Snapshot sameSnapshot = readOptions.snapshot();
+      readOptions.setSnapshot(sameSnapshot);
+      // results must be the same with a new Snapshot
+      // instance backed by the same native pointer
+      assertThat(new String(db.get(readOptions,
+          "key".getBytes()))).isEqualTo("value");
+      // update key value pair to newvalue
+      db.put("key".getBytes(), "newvalue".getBytes());
+      // read with previously created snapshot will
+      // read previous version of key value pair
+      assertThat(new String(db.get(readOptions,
+          "key".getBytes()))).isEqualTo("value");
+      // reading newkey through the snapshot must
+      // return null
+      assertThat(db.get(readOptions, "newkey".getBytes())).
+          isNull();
+      // setting the snapshot in ReadOptions to null means
+      // no Snapshot is used for subsequent reads.
+      readOptions.setSnapshot(null);
+      assertThat(new String(db.get(readOptions,
+          "newkey".getBytes()))).isEqualTo("newvalue");
+      // release Snapshot
+      db.releaseSnapshot(snapshot);
+    } finally {
+      if (db != null) {
+        db.close();
+      }
+      if (options != null) {
+        options.dispose();
+      }
+      if (readOptions != null) {
+        readOptions.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void iteratorWithSnapshot() throws RocksDBException {
+    RocksDB db = null;
+    Options options = null;
+    ReadOptions readOptions = null;
+    RocksIterator iterator = null;
+    RocksIterator snapshotIterator = null;
+    try {
+
+      options = new Options();
+      options.setCreateIfMissing(true);
+
+      db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath());
+      db.put("key".getBytes(), "value".getBytes());
+      // Get new Snapshot of database
+      Snapshot snapshot = db.getSnapshot();
+      readOptions = new ReadOptions();
+      // set snapshot in ReadOptions
+      readOptions.setSnapshot(snapshot);
+      db.put("key2".getBytes(), "value2".getBytes());
+
+      // iterate over current state of db
+      iterator = db.newIterator();
+      iterator.seekToFirst();
+      assertThat(iterator.isValid()).isTrue();
+      assertThat(iterator.key()).isEqualTo("key".getBytes());
+      iterator.next();
+      assertThat(iterator.isValid()).isTrue();
+      assertThat(iterator.key()).isEqualTo("key2".getBytes());
+      iterator.next();
+      assertThat(iterator.isValid()).isFalse();
+
+      // iterate using a snapshot
+      snapshotIterator = db.newIterator(readOptions);
+      snapshotIterator.seekToFirst();
+      assertThat(snapshotIterator.isValid()).isTrue();
+      assertThat(snapshotIterator.key()).isEqualTo("key".getBytes());
+      snapshotIterator.next();
+      assertThat(snapshotIterator.isValid()).isFalse();
+
+      // release Snapshot
+      db.releaseSnapshot(snapshot);
+    } finally {
+      if (iterator != null) {
+        iterator.dispose();
+      }
+      if (snapshotIterator != null) {
+        snapshotIterator.dispose();
+      }
+      if (db != null) {
+        db.close();
+      }
+      if (options != null) {
+        options.dispose();
+      }
+      if (readOptions != null) {
+        readOptions.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void iteratorWithSnapshotOnColumnFamily() throws RocksDBException {
+    RocksDB db = null;
+    Options options = null;
+    ReadOptions readOptions = null;
+    RocksIterator iterator = null;
+    RocksIterator snapshotIterator = null;
+    try {
+
+      options = new Options();
+      options.setCreateIfMissing(true);
+
+      db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath());
+      db.put("key".getBytes(), "value".getBytes());
+      // Get new Snapshot of database
+      Snapshot snapshot = db.getSnapshot();
+      readOptions = new ReadOptions();
+      // set snapshot in ReadOptions
+      readOptions.setSnapshot(snapshot);
+      db.put("key2".getBytes(), "value2".getBytes());
+
+      // iterate over current state of column family
+      iterator = db.newIterator(db.getDefaultColumnFamily());
+      iterator.seekToFirst();
+      assertThat(iterator.isValid()).isTrue();
+      assertThat(iterator.key()).isEqualTo("key".getBytes());
+      iterator.next();
+      assertThat(iterator.isValid()).isTrue();
+      assertThat(iterator.key()).isEqualTo("key2".getBytes());
+      iterator.next();
+      assertThat(iterator.isValid()).isFalse();
+
+      // iterate using a snapshot on default column family
+      snapshotIterator = db.newIterator(db.getDefaultColumnFamily(),
+          readOptions);
+      snapshotIterator.seekToFirst();
+      assertThat(snapshotIterator.isValid()).isTrue();
+      assertThat(snapshotIterator.key()).isEqualTo("key".getBytes());
+      snapshotIterator.next();
+      assertThat(snapshotIterator.isValid()).isFalse();
+
+      // release Snapshot
+      db.releaseSnapshot(snapshot);
+    } finally {
+      if (iterator != null) {
+        iterator.dispose();
+      }
+      if (snapshotIterator != null) {
+        snapshotIterator.dispose();
+      }
+      if (db != null) {
+        db.close();
+      }
+      if (options != null) {
+        options.dispose();
+      }
+      if (readOptions != null) {
+        readOptions.dispose();
+      }
+    }
+  }
+}
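
The behaviour these tests establish: a Snapshot pins the database state at
its creation point, and any ReadOptions carrying it reads as of that
point, regardless of later writes. Reduced to its essentials (a sketch,
not part of the patch; the path is illustrative):

    import org.rocksdb.*;

    public class SnapshotSketch {
      public static void main(String[] args) throws RocksDBException {
        RocksDB.loadLibrary();
        Options options = new Options().setCreateIfMissing(true);
        RocksDB db = RocksDB.open(options, "/tmp/snapshot-sketch");
        ReadOptions readOptions = new ReadOptions();
        try {
          db.put("key".getBytes(), "v1".getBytes());
          Snapshot snapshot = db.getSnapshot();
          readOptions.setSnapshot(snapshot);
          db.put("key".getBytes(), "v2".getBytes());
          // the live view sees v2, the snapshot still sees v1
          System.out.println(new String(db.get("key".getBytes())));
          System.out.println(new String(db.get(readOptions, "key".getBytes())));
          db.releaseSnapshot(snapshot);
        } finally {
          db.close();
          readOptions.dispose();
          options.dispose();
        }
      }
    }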
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/StatisticsCollectorTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/StatisticsCollectorTest.java
new file mode 100644
index 0000000..927826d
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/StatisticsCollectorTest.java
@@ -0,0 +1,60 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+import java.util.Collections;
+
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class StatisticsCollectorTest {
+
+  @ClassRule
+  public static final RocksMemoryResource rocksMemoryResource =
+      new RocksMemoryResource();
+
+  @Rule
+  public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  @Test
+  public void statisticsCollector()
+      throws InterruptedException, RocksDBException {
+    Options opt = null;
+    RocksDB db = null;
+    try {
+      opt = new Options().createStatistics().setCreateIfMissing(true);
+      Statistics stats = opt.statisticsPtr();
+
+      db = RocksDB.open(opt,
+          dbFolder.getRoot().getAbsolutePath());
+
+      StatsCallbackMock callback = new StatsCallbackMock();
+      StatsCollectorInput statsInput = new StatsCollectorInput(stats, callback);
+
+      StatisticsCollector statsCollector = new StatisticsCollector(
+          Collections.singletonList(statsInput), 100);
+      statsCollector.start();
+
+      Thread.sleep(1000);
+
+      assertThat(callback.tickerCallbackCount).isGreaterThan(0);
+      assertThat(callback.histCallbackCount).isGreaterThan(0);
+
+      statsCollector.shutDown(1000);
+    } finally {
+      if (db != null) {
+        db.close();
+      }
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/StatsCallbackMock.java b/src/rocksdb/java/src/test/java/org/rocksdb/StatsCallbackMock.java
new file mode 100644
index 0000000..3c5800e
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/StatsCallbackMock.java
@@ -0,0 +1,20 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+public class StatsCallbackMock implements StatisticsCollectorCallback {
+  public int tickerCallbackCount = 0;
+  public int histCallbackCount = 0;
+
+  public void tickerCallback(TickerType tickerType, long tickerCount) {
+    tickerCallbackCount++;
+  }
+
+  public void histogramCallback(HistogramType histType,
+      HistogramData histData) {
+    histCallbackCount++;
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/TransactionLogIteratorTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/TransactionLogIteratorTest.java
new file mode 100644
index 0000000..1de2efd
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/TransactionLogIteratorTest.java
@@ -0,0 +1,182 @@
+package org.rocksdb;
+
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class TransactionLogIteratorTest {
+  @ClassRule
+  public static final RocksMemoryResource rocksMemoryResource =
+      new RocksMemoryResource();
+
+  @Rule
+  public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  @Test
+  public void transactionLogIterator() throws RocksDBException {
+    RocksDB db = null;
+    Options options = null;
+    TransactionLogIterator transactionLogIterator = null;
+    try {
+      options = new Options().
+          setCreateIfMissing(true);
+      db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath());
+      transactionLogIterator = db.getUpdatesSince(0);
+    } finally {
+      if (transactionLogIterator != null) {
+        transactionLogIterator.dispose();
+      }
+      if (db != null) {
+        db.close();
+      }
+      if (options != null) {
+        options.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void getBatch() throws RocksDBException {
+    final int numberOfPuts = 5;
+    RocksDB db = null;
+    Options options = null;
+    ColumnFamilyHandle cfHandle = null;
+    TransactionLogIterator transactionLogIterator = null;
+    try {
+      options = new Options().
+          setCreateIfMissing(true).
+          setWalTtlSeconds(1000).
+          setWalSizeLimitMB(10);
+
+      db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath());
+
+      for (int i = 0; i < numberOfPuts; i++){
+        db.put(String.valueOf(i).getBytes(),
+            String.valueOf(i).getBytes());
+      }
+      db.flush(new FlushOptions().setWaitForFlush(true));
+
+      // the latest sequence number is 5 because 5 puts
+      // were written beforehand
+      assertThat(db.getLatestSequenceNumber()).
+          isEqualTo(numberOfPuts);
+
+      // insert 5 writes into a cf
+      cfHandle = db.createColumnFamily(
+          new ColumnFamilyDescriptor("new_cf".getBytes()));
+
+      for (int i = 0; i < numberOfPuts; i++){
+        db.put(cfHandle, String.valueOf(i).getBytes(),
+            String.valueOf(i).getBytes());
+      }
+      // the latest sequence number is 10 because
+      // (5 + 5) puts were written beforehand
+      assertThat(db.getLatestSequenceNumber()).
+          isEqualTo(numberOfPuts + numberOfPuts);
+
+      // Get updates since the beginning
+      transactionLogIterator = db.getUpdatesSince(0);
+      assertThat(transactionLogIterator.isValid()).isTrue();
+      transactionLogIterator.status();
+
+      // The first sequence number is 1
+      TransactionLogIterator.BatchResult batchResult =
+          transactionLogIterator.getBatch();
+      assertThat(batchResult.sequenceNumber()).isEqualTo(1);
+    } finally {
+      if (transactionLogIterator != null) {
+        transactionLogIterator.dispose();
+      }
+      if (cfHandle != null) {
+        cfHandle.dispose();
+      }
+      if (db != null) {
+        db.close();
+      }
+      if (options != null) {
+        options.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void transactionLogIteratorStallAtLastRecord() throws RocksDBException {
+    RocksDB db = null;
+    Options options = null;
+    TransactionLogIterator transactionLogIterator = null;
+    try {
+      options = new Options().
+          setCreateIfMissing(true).
+          setWalTtlSeconds(1000).
+          setWalSizeLimitMB(10);
+
+      db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath());
+      db.put("key1".getBytes(), "value1".getBytes());
+      // Get updates since the beginning
+      transactionLogIterator = db.getUpdatesSince(0);
+      transactionLogIterator.status();
+      assertThat(transactionLogIterator.isValid()).isTrue();
+      transactionLogIterator.next();
+      assertThat(transactionLogIterator.isValid()).isFalse();
+      transactionLogIterator.status();
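+      // writing another record un-stalls the iterator; the next call to next() advances to it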
+      db.put("key2".getBytes(), "value2".getBytes());
+      transactionLogIterator.next();
+      transactionLogIterator.status();
+      assertThat(transactionLogIterator.isValid()).isTrue();
+
+    } finally {
+      if (transactionLogIterator != null) {
+        transactionLogIterator.dispose();
+      }
+      if (db != null) {
+        db.close();
+      }
+      if (options != null) {
+        options.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void transactionLogIteratorCheckAfterRestart() throws RocksDBException {
+    final int numberOfKeys = 2;
+    RocksDB db = null;
+    Options options = null;
+    TransactionLogIterator transactionLogIterator = null;
+    try {
+      options = new Options().
+          setCreateIfMissing(true).
+          setWalTtlSeconds(1000).
+          setWalSizeLimitMB(10);
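+      // the generous WAL TTL keeps archived log files around, so updates survive the reopen below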
+
+      db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath());
+      db.put("key1".getBytes(), "value1".getBytes());
+      db.put("key2".getBytes(), "value2".getBytes());
+      db.flush(new FlushOptions().setWaitForFlush(true));
+      // reopen
+      db.close();
+      db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath());
+      assertThat(db.getLatestSequenceNumber()).isEqualTo(numberOfKeys);
+
+      transactionLogIterator = db.getUpdatesSince(0);
+      for (int i = 0; i < numberOfKeys; i++) {
+        transactionLogIterator.status();
+        assertThat(transactionLogIterator.isValid()).isTrue();
+        transactionLogIterator.next();
+      }
+    } finally {
+      if (transactionLogIterator != null) {
+        transactionLogIterator.dispose();
+      }
+      if (db != null) {
+        db.close();
+      }
+      if (options != null) {
+        options.dispose();
+      }
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/TtlDBTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/TtlDBTest.java
new file mode 100644
index 0000000..c60b1d5
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/TtlDBTest.java
@@ -0,0 +1,166 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.TimeUnit;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class TtlDBTest {
+
+  @ClassRule
+  public static final RocksMemoryResource rocksMemoryResource =
+      new RocksMemoryResource();
+
+  @Rule
+  public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  @Test
+  public void ttlDBOpen() throws RocksDBException,
+      InterruptedException {
+    Options options = null;
+    TtlDB ttlDB = null;
+    try {
+      options = new Options().
+          setCreateIfMissing(true).
+          setMaxGrandparentOverlapFactor(0);
+      ttlDB = TtlDB.open(options,
+          dbFolder.getRoot().getAbsolutePath());
+      ttlDB.put("key".getBytes(), "value".getBytes());
+      assertThat(ttlDB.get("key".getBytes())).
+          isEqualTo("value".getBytes());
+      assertThat(ttlDB.get("key".getBytes())).isNotNull();
+    } finally {
+      if (ttlDB != null) {
+        ttlDB.close();
+      }
+      if (options != null) {
+        options.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void ttlDBOpenWithTtl() throws RocksDBException,
+      InterruptedException {
+    Options options = null;
+    TtlDB ttlDB = null;
+    try {
+      options = new Options().
+          setCreateIfMissing(true).
+          setMaxGrandparentOverlapFactor(0);
+      ttlDB = TtlDB.open(options, dbFolder.getRoot().getAbsolutePath(),
+          1, false);
+      ttlDB.put("key".getBytes(), "value".getBytes());
+      assertThat(ttlDB.get("key".getBytes())).
+          isEqualTo("value".getBytes());
+      TimeUnit.SECONDS.sleep(2);
+
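+      // TtlDB drops expired entries via its compaction filter, so force a compaction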
+      ttlDB.compactRange();
+      assertThat(ttlDB.get("key".getBytes())).isNull();
+    } finally {
+      if (ttlDB != null) {
+        ttlDB.close();
+      }
+      if (options != null) {
+        options.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void ttlDbOpenWithColumnFamilies() throws RocksDBException, InterruptedException {
+    DBOptions dbOptions = null;
+    TtlDB ttlDB = null;
+    List<ColumnFamilyDescriptor> cfNames =
+        new ArrayList<>();
+    List<ColumnFamilyHandle> columnFamilyHandleList =
+        new ArrayList<>();
+    cfNames.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY));
+    cfNames.add(new ColumnFamilyDescriptor("new_cf".getBytes()));
+    List<Integer> ttlValues = new ArrayList<>();
+    // Default column family with infinite lifetime
+    ttlValues.add(0);
+    // new column family with 1 second ttl
+    ttlValues.add(1);
+
+    try {
+      dbOptions = new DBOptions().
+          setCreateMissingColumnFamilies(true).
+          setCreateIfMissing(true);
+      ttlDB = TtlDB.open(dbOptions, dbFolder.getRoot().getAbsolutePath(),
+          cfNames, columnFamilyHandleList, ttlValues, false);
+
+      ttlDB.put("key".getBytes(), "value".getBytes());
+      assertThat(ttlDB.get("key".getBytes())).
+          isEqualTo("value".getBytes());
+      ttlDB.put(columnFamilyHandleList.get(1), "key".getBytes(),
+          "value".getBytes());
+      assertThat(ttlDB.get(columnFamilyHandleList.get(1),
+          "key".getBytes())).isEqualTo("value".getBytes());
+      TimeUnit.SECONDS.sleep(2);
+
+      ttlDB.compactRange();
+      ttlDB.compactRange(columnFamilyHandleList.get(1));
+
+      assertThat(ttlDB.get("key".getBytes())).isNotNull();
+      assertThat(ttlDB.get(columnFamilyHandleList.get(1),
+          "key".getBytes())).isNull();
+
+    } finally {
+      for (ColumnFamilyHandle columnFamilyHandle :
+          columnFamilyHandleList) {
+        columnFamilyHandle.dispose();
+      }
+      if (ttlDB != null) {
+        ttlDB.close();
+      }
+      if (dbOptions != null) {
+        dbOptions.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void createTtlColumnFamily() throws RocksDBException,
+      InterruptedException {
+    Options options = null;
+    TtlDB ttlDB = null;
+    ColumnFamilyHandle columnFamilyHandle = null;
+    try {
+      options = new Options().setCreateIfMissing(true);
+      ttlDB = TtlDB.open(options,
+          dbFolder.getRoot().getAbsolutePath());
+      columnFamilyHandle = ttlDB.createColumnFamilyWithTtl(
+          new ColumnFamilyDescriptor("new_cf".getBytes()), 1);
+      ttlDB.put(columnFamilyHandle, "key".getBytes(),
+          "value".getBytes());
+      assertThat(ttlDB.get(columnFamilyHandle, "key".getBytes())).
+          isEqualTo("value".getBytes());
+      TimeUnit.SECONDS.sleep(2);
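+      // as above, the compaction filter is what actually purges the expired entry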
+      ttlDB.compactRange(columnFamilyHandle);
+      assertThat(ttlDB.get(columnFamilyHandle, "key".getBytes())).isNull();
+    } finally {
+      if (columnFamilyHandle != null) {
+        columnFamilyHandle.dispose();
+      }
+      if (ttlDB != null) {
+        ttlDB.close();
+      }
+      if (options != null) {
+        options.dispose();
+      }
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/Types.java b/src/rocksdb/java/src/test/java/org/rocksdb/Types.java
new file mode 100644
index 0000000..5ad35f4
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/Types.java
@@ -0,0 +1,43 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+/**
+ * Simple type conversion methods
+ * for use in tests
+ */
+public class Types {
+
+  /**
+   * Convert first 4 bytes of a byte array to an int
+   *
+   * @param data The byte array
+   *
+   * @return An integer
+   */
+  public static int byteToInt(final byte[] data) {
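+    // assemble the int from four little-endian bytes (data[0] is least significant)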
+    return (data[0] & 0xff) |
+        ((data[1] & 0xff) << 8) |
+        ((data[2] & 0xff) << 16) |
+        ((data[3] & 0xff) << 24);
+  }
+
+  /**
+   * Convert an int to 4 bytes
+   *
+   * @param v The int
+   *
+   * @return A byte array containing 4 bytes
+   */
+  public static byte[] intToByte(final int v) {
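+    // split the int into four little-endian bytes (the inverse of byteToInt)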
+    return new byte[] {
+        (byte)((v >>> 0) & 0xff),
+        (byte)((v >>> 8) & 0xff),
+        (byte)((v >>> 16) & 0xff),
+        (byte)((v >>> 24) & 0xff)
+    };
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/WriteBatchHandlerTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/WriteBatchHandlerTest.java
new file mode 100644
index 0000000..b09cc92
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/WriteBatchHandlerTest.java
@@ -0,0 +1,170 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import org.junit.ClassRule;
+import org.junit.Test;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+
+public class WriteBatchHandlerTest {
+  @ClassRule
+  public static final RocksMemoryResource rocksMemoryResource =
+      new RocksMemoryResource();
+
+  @Test
+  public void writeBatchHandler() throws IOException, RocksDBException {
+    WriteBatch batch = null;
+    CapturingWriteBatchHandler handler = null;
+    try {
+      // setup test data
+      final List<Tuple<Action, Tuple<byte[], byte[]>>> testEvents = new ArrayList<>();
+      testEvents.add(new Tuple<>(Action.DELETE,
+          new Tuple<byte[], byte[]>("k0".getBytes(), null)));
+      testEvents.add(new Tuple<>(Action.PUT,
+          new Tuple<>("k1".getBytes(), "v1".getBytes())));
+      testEvents.add(new Tuple<>(Action.PUT,
+          new Tuple<>("k2".getBytes(), "v2".getBytes())));
+      testEvents.add(new Tuple<>(Action.PUT,
+          new Tuple<>("k3".getBytes(), "v3".getBytes())));
+      testEvents.add(new Tuple<>(Action.LOG,
+          new Tuple<byte[], byte[]>(null, "log1".getBytes())));
+      testEvents.add(new Tuple<>(Action.MERGE,
+          new Tuple<>("k2".getBytes(), "v22".getBytes())));
+      testEvents.add(new Tuple<>(Action.DELETE,
+          new Tuple<byte[], byte[]>("k3".getBytes(), null)));
+
+      // load test data to the write batch
+      batch = new WriteBatch();
+      for (final Tuple<Action, Tuple<byte[], byte[]>> testEvent : testEvents) {
+        final Tuple<byte[], byte[]> data = testEvent.value;
+        switch (testEvent.key) {
+
+          case PUT:
+            batch.put(data.key, data.value);
+            break;
+
+          case MERGE:
+            batch.merge(data.key, data.value);
+            break;
+
+          case DELETE:
+            batch.remove(data.key);
+            break;
+
+          case LOG:
+            batch.putLogData(data.value);
+            break;
+        }
+      }
+
+      // attempt to read test data back from the WriteBatch by iterating with a handler
+      handler = new CapturingWriteBatchHandler();
+      batch.iterate(handler);
+
+      // compare the results to the test data
+      final List<Tuple<Action, Tuple<byte[], byte[]>>> actualEvents = handler.getEvents();
+      assertThat(actualEvents.size()).isEqualTo(testEvents.size());
+
+      for (int i = 0; i < testEvents.size(); i++) {
+        assertThat(equals(testEvents.get(i), actualEvents.get(i))).isTrue();
+      }
+    } finally {
+      if (handler != null) {
+        handler.dispose();
+      }
+      if (batch != null) {
+        batch.dispose();
+      }
+    }
+  }
+
+  private static boolean equals(final Tuple<Action, Tuple<byte[], byte[]>> expected,
+                                final Tuple<Action, Tuple<byte[], byte[]>> actual) {
+    if (!expected.key.equals(actual.key)) {
+      return false;
+    }
+
+    final Tuple<byte[], byte[]> expectedData = expected.value;
+    final Tuple<byte[], byte[]> actualData = actual.value;
+
+    return equals(expectedData.key, actualData.key)
+        && equals(expectedData.value, actualData.value);
+  }
+
+  private static boolean equals(byte[] expected, byte[] actual) {
+    if (expected != null) {
+      return Arrays.equals(expected, actual);
+    } else {
+      return actual == null;
+    }
+  }
+
+  private static class Tuple<K, V> {
+    public final K key;
+    public final V value;
+
+    public Tuple(final K key, final V value) {
+      this.key = key;
+      this.value = value;
+    }
+  }
+
+  /**
+   * Enumeration of Write Batch
+   * event actions
+   */
+  private enum Action {
+    PUT,
+    MERGE,
+    DELETE,
+    LOG
+  }
+
+  /**
+   * A simple WriteBatch Handler which adds a record
+   * of each event that it receives to a list
+   */
+  private static class CapturingWriteBatchHandler extends WriteBatch.Handler {
+
+    private final List<Tuple<Action, Tuple<byte[], byte[]>>> events = new ArrayList<>();
+
+    /**
+     * Returns a copy of the current events list
+     *
+     * @return a list of the events which have happened up to now
+     */
+    public List<Tuple<Action, Tuple<byte[], byte[]>>> getEvents() {
+      return new ArrayList<>(events);
+    }
+
+    @Override
+    public void put(final byte[] key, final byte[] value) {
+      events.add(new Tuple<>(Action.PUT, new Tuple<>(key, value)));
+    }
+
+    @Override
+    public void merge(final byte[] key, final byte[] value) {
+      events.add(new Tuple<>(Action.MERGE, new Tuple<>(key, value)));
+    }
+
+    @Override
+    public void delete(final byte[] key) {
+      events.add(new Tuple<>(Action.DELETE, new Tuple<byte[], byte[]>(key, null)));
+    }
+
+    @Override
+    public void logData(final byte[] blob) {
+      events.add(new Tuple<>(Action.LOG, new Tuple<byte[], byte[]>(null, blob)));
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/WriteBatchTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/WriteBatchTest.java
new file mode 100644
index 0000000..89a9d54
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/WriteBatchTest.java
@@ -0,0 +1,123 @@
+//  Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+package org.rocksdb;
+
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+import java.io.UnsupportedEncodingException;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+/**
+ * This class mimics db/write_batch_test.cc
+ * from the C++ RocksDB library.
+ *
+ * Not ported yet:
+ *
+ * Continue();
+ * PutGatherSlices();
+ */
+public class WriteBatchTest {
+  @ClassRule
+  public static final RocksMemoryResource rocksMemoryResource =
+      new RocksMemoryResource();
+
+  @Rule
+  public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  @Test
+  public void emptyWriteBatch() {
+    WriteBatch batch = new WriteBatch();
+    assertThat(batch.count()).isEqualTo(0);
+  }
+
+  @Test
+  public void multipleBatchOperations()
+      throws UnsupportedEncodingException {
+    WriteBatch batch =  new WriteBatch();
+    batch.put("foo".getBytes("US-ASCII"), "bar".getBytes("US-ASCII"));
+    batch.remove("box".getBytes("US-ASCII"));
+    batch.put("baz".getBytes("US-ASCII"), "boo".getBytes("US-ASCII"));
+    WriteBatchTestInternalHelper.setSequence(batch, 100);
+    assertThat(WriteBatchTestInternalHelper.sequence(batch)).
+        isNotNull().
+        isEqualTo(100);
+    assertThat(batch.count()).isEqualTo(3);
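+    // getContents() lists entries sorted by key; sequence numbers follow insertion order from the base (100)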
+    assertThat(new String(getContents(batch), "US-ASCII")).
+        isEqualTo("Put(baz, boo)@102" +
+                  "Delete(box)@101" +
+                  "Put(foo, bar)@100");
+  }
+
+  @Test
+  public void testAppendOperation()
+      throws UnsupportedEncodingException {
+    WriteBatch b1 = new WriteBatch();
+    WriteBatch b2 = new WriteBatch();
+    WriteBatchTestInternalHelper.setSequence(b1, 200);
+    WriteBatchTestInternalHelper.setSequence(b2, 300);
+    WriteBatchTestInternalHelper.append(b1, b2);
+    assertThat(getContents(b1).length).isEqualTo(0);
+    assertThat(b1.count()).isEqualTo(0);
+    b2.put("a".getBytes("US-ASCII"), "va".getBytes("US-ASCII"));
+    WriteBatchTestInternalHelper.append(b1, b2);
+    assertThat("Put(a, va)@200".equals(new String(getContents(b1), "US-ASCII")));
+    assertThat(b1.count()).isEqualTo(1);
+    b2.clear();
+    b2.put("b".getBytes("US-ASCII"), "vb".getBytes("US-ASCII"));
+    WriteBatchTestInternalHelper.append(b1, b2);
+    assertThat(("Put(a, va)@200" +
+            "Put(b, vb)@201")
+                .equals(new String(getContents(b1), "US-ASCII"))).isTrue();
+    assertThat(b1.count()).isEqualTo(2);
+    b2.remove("foo".getBytes("US-ASCII"));
+    WriteBatchTestInternalHelper.append(b1, b2);
+    assertThat(("Put(a, va)@200" +
+        "Put(b, vb)@202" +
+        "Put(b, vb)@201" +
+        "Delete(foo)@203")
+        .equals(new String(getContents(b1), "US-ASCII"))).isTrue();
+    assertThat(b1.count()).isEqualTo(4);
+  }
+
+  @Test
+  public void blobOperation()
+      throws UnsupportedEncodingException {
+    WriteBatch batch = new WriteBatch();
+    batch.put("k1".getBytes("US-ASCII"), "v1".getBytes("US-ASCII"));
+    batch.put("k2".getBytes("US-ASCII"), "v2".getBytes("US-ASCII"));
+    batch.put("k3".getBytes("US-ASCII"), "v3".getBytes("US-ASCII"));
+    batch.putLogData("blob1".getBytes("US-ASCII"));
+    batch.remove("k2".getBytes("US-ASCII"));
+    batch.putLogData("blob2".getBytes("US-ASCII"));
+    batch.merge("foo".getBytes("US-ASCII"), "bar".getBytes("US-ASCII"));
+    assertThat(batch.count()).isEqualTo(5);
+    assertThat(("Merge(foo, bar)@4" +
+            "Put(k1, v1)@0" +
+            "Delete(k2)@3" +
+            "Put(k2, v2)@1" +
+            "Put(k3, v3)@2")
+               .equals(new String(getContents(batch), "US-ASCII"))).isTrue();
+  }
+
+  static native byte[] getContents(WriteBatch batch);
+}
+
+/**
+ * Package-private class which provides a Java API to access
+ * the C++ WriteBatchInternal.
+ */
+class WriteBatchTestInternalHelper {
+  static native void setSequence(WriteBatch batch, long sn);
+  static native long sequence(WriteBatch batch);
+  static native void append(WriteBatch b1, WriteBatch b2);
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/WriteBatchWithIndexTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/WriteBatchWithIndexTest.java
new file mode 100644
index 0000000..b0c729a
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/WriteBatchWithIndexTest.java
@@ -0,0 +1,268 @@
+//  Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+package org.rocksdb;
+
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+import java.nio.ByteBuffer;
+import java.util.ArrayDeque;
+import java.util.Deque;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+
+public class WriteBatchWithIndexTest {
+
+  @ClassRule
+  public static final RocksMemoryResource rocksMemoryResource =
+      new RocksMemoryResource();
+
+  @Rule
+  public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  @Test
+  public void readYourOwnWrites() throws RocksDBException {
+    RocksDB db = null;
+    Options options = null;
+    try {
+      options = new Options();
+      // Setup options
+      options.setCreateIfMissing(true);
+      db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath());
+
+      final byte[] k1 = "key1".getBytes();
+      final byte[] v1 = "value1".getBytes();
+      final byte[] k2 = "key2".getBytes();
+      final byte[] v2 = "value2".getBytes();
+
+      db.put(k1, v1);
+      db.put(k2, v2);
+
+      final WriteBatchWithIndex wbwi = new WriteBatchWithIndex(true);
+
+      RocksIterator base = null;
+      RocksIterator it = null;
+      try {
+        base = db.newIterator();
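+        // newIteratorWithBase overlays the batch's pending writes on top of the base DB iterator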
+        it = wbwi.newIteratorWithBase(base);
+
+        it.seek(k1);
+        assertThat(it.isValid()).isTrue();
+        assertThat(it.key()).isEqualTo(k1);
+        assertThat(it.value()).isEqualTo(v1);
+
+        it.seek(k2);
+        assertThat(it.isValid()).isTrue();
+        assertThat(it.key()).isEqualTo(k2);
+        assertThat(it.value()).isEqualTo(v2);
+
+        //put data to the write batch and make sure we can read it.
+        final byte[] k3 = "key3".getBytes();
+        final byte[] v3 = "value3".getBytes();
+        wbwi.put(k3, v3);
+        it.seek(k3);
+        assertThat(it.isValid()).isTrue();
+        assertThat(it.key()).isEqualTo(k3);
+        assertThat(it.value()).isEqualTo(v3);
+
+        //update k2 in the write batch and check the value
+        final byte[] v2Other = "otherValue2".getBytes();
+        wbwi.put(k2, v2Other);
+        it.seek(k2);
+        assertThat(it.isValid()).isTrue();
+        assertThat(it.key()).isEqualTo(k2);
+        assertThat(it.value()).isEqualTo(v2Other);
+
+        //remove k1 and make sure we can read back the write
+        wbwi.remove(k1);
+        it.seek(k1);
+        assertThat(it.key()).isNotEqualTo(k1);
+
+        //reinsert k1 and make sure we see the new value
+        final byte[] v1Other = "otherValue1".getBytes();
+        wbwi.put(k1, v1Other);
+        it.seek(k1);
+        assertThat(it.isValid()).isTrue();
+        assertThat(it.key()).isEqualTo(k1);
+        assertThat(it.value()).isEqualTo(v1Other);
+      } finally {
+        if (it != null) {
+          it.dispose();
+        }
+        if (base != null) {
+          base.dispose();
+        }
+      }
+
+    } finally {
+      if (db != null) {
+        db.close();
+      }
+      if (options != null) {
+        options.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void write_writeBatchWithIndex() throws RocksDBException {
+    RocksDB db = null;
+    Options options = null;
+    try {
+      options = new Options();
+      // Setup options
+      options.setCreateIfMissing(true);
+      db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath());
+
+      final byte[] k1 = "key1".getBytes();
+      final byte[] v1 = "value1".getBytes();
+      final byte[] k2 = "key2".getBytes();
+      final byte[] v2 = "value2".getBytes();
+
+      WriteBatchWithIndex wbwi = null;
+
+      try {
+        wbwi = new WriteBatchWithIndex();
+
+        wbwi.put(k1, v1);
+        wbwi.put(k2, v2);
+
+        db.write(new WriteOptions(), wbwi);
+      } finally {
+        if(wbwi != null) {
+          wbwi.dispose();
+        }
+      }
+
+      assertThat(db.get(k1)).isEqualTo(v1);
+      assertThat(db.get(k2)).isEqualTo(v2);
+
+    } finally {
+      if (db != null) {
+        db.close();
+      }
+      if (options != null) {
+        options.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void iterator() throws RocksDBException {
+    final WriteBatchWithIndex wbwi = new WriteBatchWithIndex(true);
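+    // 'true' enables overwrite_key: the iterator sees only the most recent update for each key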
+
+    final String k1 = "key1";
+    final String v1 = "value1";
+    final String k2 = "key2";
+    final String v2 = "value2";
+    final String k3 = "key3";
+    final String v3 = "value3";
+    final byte[] k1b = k1.getBytes();
+    final byte[] v1b = v1.getBytes();
+    final byte[] k2b = k2.getBytes();
+    final byte[] v2b = v2.getBytes();
+    final byte[] k3b = k3.getBytes();
+    final byte[] v3b = v3.getBytes();
+
+    //add put records
+    wbwi.put(k1b, v1b);
+    wbwi.put(k2b, v2b);
+    wbwi.put(k3b, v3b);
+
+    //add a deletion record
+    final String k4 = "key4";
+    final byte[] k4b = k4.getBytes();
+    wbwi.remove(k4b);
+
+    WBWIRocksIterator.WriteEntry[] expected = {
+        new WBWIRocksIterator.WriteEntry(WBWIRocksIterator.WriteType.PUT,
+            new DirectSlice(k1), new DirectSlice(v1)),
+        new WBWIRocksIterator.WriteEntry(WBWIRocksIterator.WriteType.PUT,
+            new DirectSlice(k2), new DirectSlice(v2)),
+        new WBWIRocksIterator.WriteEntry(WBWIRocksIterator.WriteType.PUT,
+            new DirectSlice(k3), new DirectSlice(v3)),
+        new WBWIRocksIterator.WriteEntry(WBWIRocksIterator.WriteType.DELETE,
+            new DirectSlice(k4), DirectSlice.NONE)
+    };
+
+    WBWIRocksIterator it = null;
+    try {
+      it = wbwi.newIterator();
+
+      //direct access - seek to key offsets
+      final int[] testOffsets = {2, 0, 1, 3};
+
+      for(int i = 0; i < testOffsets.length; i++) {
+        final int testOffset = testOffsets[i];
+        final byte[] key = toArray(expected[testOffset].getKey().data());
+
+        it.seek(key);
+        assertThat(it.isValid()).isTrue();
+        assertThat(it.entry().equals(expected[testOffset])).isTrue();
+      }
+
+      //forward iterative access
+      int i = 0;
+      for(it.seekToFirst(); it.isValid(); it.next()) {
+        assertThat(it.entry().equals(expected[i++])).isTrue();
+      }
+
+      //reverse iterative access
+      i = expected.length - 1;
+      for(it.seekToLast(); it.isValid(); it.prev()) {
+        assertThat(it.entry().equals(expected[i--])).isTrue();
+      }
+
+    } finally {
+      if(it != null) {
+        it.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void zeroByteTests() {
+    final WriteBatchWithIndex wbwi = new WriteBatchWithIndex(true);
+    byte[] zeroByteValue = new byte[] { 0, 0 };
+
+    //add zero byte value
+    wbwi.put(zeroByteValue, zeroByteValue);
+
+    ByteBuffer buffer = ByteBuffer.allocateDirect(zeroByteValue.length);
+    buffer.put(zeroByteValue);
+
+    WBWIRocksIterator.WriteEntry[] expected = {
+        new WBWIRocksIterator.WriteEntry(WBWIRocksIterator.WriteType.PUT,
+            new DirectSlice(buffer, zeroByteValue.length),
+            new DirectSlice(buffer, zeroByteValue.length))
+    };
+    WBWIRocksIterator it = null;
+    try {
+      it = wbwi.newIterator();
+      it.seekToFirst();
+      assertThat(it.entry().equals(expected[0])).isTrue();
+      assertThat(it.entry().hashCode() == expected[0].hashCode()).isTrue();
+    } finally {
+      if(it != null) {
+        it.dispose();
+      }
+    }
+  }
+
+  private byte[] toArray(final ByteBuffer buf) {
+    final byte[] ary = new byte[buf.remaining()];
+    buf.get(ary);
+    return ary;
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/WriteOptionsTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/WriteOptionsTest.java
new file mode 100644
index 0000000..4d8e6d9
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/WriteOptionsTest.java
@@ -0,0 +1,31 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+package org.rocksdb;
+
+import org.junit.ClassRule;
+import org.junit.Test;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class WriteOptionsTest {
+
+  @ClassRule
+  public static final RocksMemoryResource rocksMemoryResource =
+      new RocksMemoryResource();
+
+  @Test
+  public void writeOptions(){
+    WriteOptions writeOptions = new WriteOptions();
+    writeOptions.setDisableWAL(true);
+    assertThat(writeOptions.disableWAL()).isTrue();
+    writeOptions.setDisableWAL(false);
+    assertThat(writeOptions.disableWAL()).isFalse();
+    writeOptions.setSync(true);
+    assertThat(writeOptions.sync()).isTrue();
+    writeOptions.setSync(false);
+    assertThat(writeOptions.sync()).isFalse();
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/test/RocksJunitRunner.java b/src/rocksdb/java/src/test/java/org/rocksdb/test/RocksJunitRunner.java
new file mode 100644
index 0000000..c800574
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/test/RocksJunitRunner.java
@@ -0,0 +1,68 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+package org.rocksdb.test;
+
+import org.junit.internal.JUnitSystem;
+import org.junit.internal.RealSystem;
+import org.junit.internal.TextListener;
+import org.junit.runner.Description;
+import org.junit.runner.JUnitCore;
+import org.junit.runner.Result;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Custom JUnit runner which also prints the test class
+ * and each executed method to standard output.
+ */
+public class RocksJunitRunner {
+
+  /**
+   * Listener which overrides the default behaviour
+   * to print the class and method of each test to standard output.
+   */
+  static class RocksJunitListener extends TextListener {
+
+    /**
+     * RocksJunitListener constructor
+     *
+     * @param system JUnitSystem
+     */
+    public RocksJunitListener(JUnitSystem system) {
+      super(system);
+    }
+
+    @Override
+    public void testStarted(Description description) {
+       System.out.format("Run: %s testing now -> %s \n",
+           description.getClassName(),
+           description.getMethodName());
+    }
+  }
+
+  /**
+   * Main method to execute tests
+   *
+   * @param args Test classes as String names
+   */
+  public static void main(String[] args){
+    JUnitCore runner = new JUnitCore();
+    final JUnitSystem system = new RealSystem();
+    runner.addListener(new RocksJunitListener(system));
+    try {
+      List<Class<?>> classes = new ArrayList<>();
+      for (String arg : args) {
+        classes.add(Class.forName(arg));
+      }
+      final Result result = runner.run(classes.toArray(new Class[classes.size()]));
+      if(!result.wasSuccessful()) {
+        System.exit(-1);
+      }
+    } catch (ClassNotFoundException e) {
+      e.printStackTrace();
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/util/EnvironmentTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/util/EnvironmentTest.java
new file mode 100644
index 0000000..c7160de
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/util/EnvironmentTest.java
@@ -0,0 +1,171 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+package org.rocksdb.util;
+
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import java.lang.reflect.Field;
+import java.lang.reflect.Modifier;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class EnvironmentTest {
+  private final static String ARCH_FIELD_NAME = "ARCH";
+  private final static String OS_FIELD_NAME = "OS";
+
+  private static String INITIAL_OS;
+  private static String INITIAL_ARCH;
+
+  @BeforeClass
+  public static void saveState() {
+    INITIAL_ARCH = getEnvironmentClassField(ARCH_FIELD_NAME);
+    INITIAL_OS = getEnvironmentClassField(OS_FIELD_NAME);
+  }
+
+  @Test
+  public void mac32() {
+    setEnvironmentClassFields("mac", "32");
+    assertThat(Environment.isWindows()).isFalse();
+    assertThat(Environment.getJniLibraryExtension()).
+        isEqualTo(".jnilib");
+    assertThat(Environment.getJniLibraryFileName("rocksdb")).
+        isEqualTo("librocksdbjni-osx.jnilib");
+    assertThat(Environment.getSharedLibraryFileName("rocksdb")).
+        isEqualTo("librocksdbjni.dylib");
+  }
+
+  @Test
+  public void mac64() {
+    setEnvironmentClassFields("mac", "64");
+    assertThat(Environment.isWindows()).isFalse();
+    assertThat(Environment.getJniLibraryExtension()).
+        isEqualTo(".jnilib");
+    assertThat(Environment.getJniLibraryFileName("rocksdb")).
+        isEqualTo("librocksdbjni-osx.jnilib");
+    assertThat(Environment.getSharedLibraryFileName("rocksdb")).
+        isEqualTo("librocksdbjni.dylib");
+  }
+
+  @Test
+  public void nix32() {
+    // Linux
+    setEnvironmentClassFields("Linux", "32");
+    assertThat(Environment.isWindows()).isFalse();
+    assertThat(Environment.getJniLibraryExtension()).
+        isEqualTo(".so");
+    assertThat(Environment.getJniLibraryFileName("rocksdb")).
+        isEqualTo("librocksdbjni-linux32.so");
+    assertThat(Environment.getSharedLibraryFileName("rocksdb")).
+        isEqualTo("librocksdbjni.so");
+    // UNIX
+    setEnvironmentClassFields("Unix", "32");
+    assertThat(Environment.isWindows()).isFalse();
+    assertThat(Environment.getJniLibraryExtension()).
+        isEqualTo(".so");
+    assertThat(Environment.getJniLibraryFileName("rocksdb")).
+        isEqualTo("librocksdbjni-linux32.so");
+    assertThat(Environment.getSharedLibraryFileName("rocksdb")).
+        isEqualTo("librocksdbjni.so");
+    // AIX
+    setEnvironmentClassFields("aix", "32");
+    assertThat(Environment.isWindows()).isFalse();
+    assertThat(Environment.getJniLibraryExtension()).
+        isEqualTo(".so");
+    assertThat(Environment.getJniLibraryFileName("rocksdb")).
+        isEqualTo("librocksdbjni-linux32.so");
+    assertThat(Environment.getSharedLibraryFileName("rocksdb")).
+        isEqualTo("librocksdbjni.so");
+  }
+
+  @Test
+  public void nix64() {
+    setEnvironmentClassFields("Linux", "x64");
+    assertThat(Environment.isWindows()).isFalse();
+    assertThat(Environment.getJniLibraryExtension()).
+        isEqualTo(".so");
+    assertThat(Environment.getJniLibraryFileName("rocksdb")).
+        isEqualTo("librocksdbjni-linux64.so");
+    assertThat(Environment.getSharedLibraryFileName("rocksdb")).
+        isEqualTo("librocksdbjni.so");
+    // UNIX
+    setEnvironmentClassFields("Unix", "x64");
+    assertThat(Environment.isWindows()).isFalse();
+    assertThat(Environment.getJniLibraryExtension()).
+        isEqualTo(".so");
+    assertThat(Environment.getJniLibraryFileName("rocksdb")).
+        isEqualTo("librocksdbjni-linux64.so");
+    assertThat(Environment.getSharedLibraryFileName("rocksdb")).
+        isEqualTo("librocksdbjni.so");
+    // AIX
+    setEnvironmentClassFields("aix", "x64");
+    assertThat(Environment.isWindows()).isFalse();
+    assertThat(Environment.getJniLibraryExtension()).
+        isEqualTo(".so");
+    assertThat(Environment.getJniLibraryFileName("rocksdb")).
+        isEqualTo("librocksdbjni-linux64.so");
+    assertThat(Environment.getSharedLibraryFileName("rocksdb")).
+        isEqualTo("librocksdbjni.so");
+  }
+
+  @Test
+  public void detectWindows(){
+    setEnvironmentClassFields("win", "x64");
+    assertThat(Environment.isWindows()).isTrue();
+  }
+
+  @Test(expected = UnsupportedOperationException.class)
+  public void failWinJniLibraryName(){
+    setEnvironmentClassFields("win", "x64");
+    Environment.getJniLibraryFileName("rocksdb");
+  }
+
+  @Test(expected = UnsupportedOperationException.class)
+  public void failWinSharedLibrary(){
+    setEnvironmentClassFields("win", "x64");
+    Environment.getSharedLibraryFileName("rocksdb");
+  }
+
+  private void setEnvironmentClassFields(String osName,
+      String osArch) {
+    setEnvironmentClassField(OS_FIELD_NAME, osName);
+    setEnvironmentClassField(ARCH_FIELD_NAME, osArch);
+  }
+
+  @AfterClass
+  public static void restoreState() {
+    setEnvironmentClassField(OS_FIELD_NAME, INITIAL_OS);
+    setEnvironmentClassField(ARCH_FIELD_NAME, INITIAL_ARCH);
+  }
+
+  private static String getEnvironmentClassField(String fieldName) {
+    final Field field;
+    try {
+      field = Environment.class.getDeclaredField(fieldName);
+      field.setAccessible(true);
+      final Field modifiersField = Field.class.getDeclaredField("modifiers");
+      modifiersField.setAccessible(true);
+      modifiersField.setInt(field, field.getModifiers() & ~Modifier.FINAL);
+      return (String)field.get(null);
+    } catch (NoSuchFieldException | IllegalAccessException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  private static void setEnvironmentClassField(String fieldName, String value) {
+    final Field field;
+    try {
+      field = Environment.class.getDeclaredField(fieldName);
+      field.setAccessible(true);
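+      // strip the FINAL modifier so the static field can be rewritten via reflection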
+      final Field modifiersField = Field.class.getDeclaredField("modifiers");
+      modifiersField.setAccessible(true);
+      modifiersField.setInt(field, field.getModifiers() & ~Modifier.FINAL);
+      field.set(null, value);
+    } catch (NoSuchFieldException | IllegalAccessException e) {
+      throw new RuntimeException(e);
+    }
+  }
+}
diff --git a/src/rocksdb/java/src/test/java/org/rocksdb/util/SizeUnitTest.java b/src/rocksdb/java/src/test/java/org/rocksdb/util/SizeUnitTest.java
new file mode 100644
index 0000000..517e1b2
--- /dev/null
+++ b/src/rocksdb/java/src/test/java/org/rocksdb/util/SizeUnitTest.java
@@ -0,0 +1,27 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+package org.rocksdb.util;
+
+import org.junit.Test;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class SizeUnitTest {
+
+  public static final long COMPUTATION_UNIT = 1024L;
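+  // the SizeUnit constants are binary (base-1024) multiples of one another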
+
+  @Test
+  public void sizeUnit() {
+    assertThat(SizeUnit.KB).isEqualTo(COMPUTATION_UNIT);
+    assertThat(SizeUnit.MB).isEqualTo(
+        SizeUnit.KB * COMPUTATION_UNIT);
+    assertThat(SizeUnit.GB).isEqualTo(
+        SizeUnit.MB * COMPUTATION_UNIT);
+    assertThat(SizeUnit.TB).isEqualTo(
+        SizeUnit.GB * COMPUTATION_UNIT);
+    assertThat(SizeUnit.PB).isEqualTo(
+        SizeUnit.TB * COMPUTATION_UNIT);
+  }
+}
diff --git a/src/rocksdb/m4/libtool.m4 b/src/rocksdb/m4/libtool.m4
deleted file mode 100644
index d7c043f..0000000
--- a/src/rocksdb/m4/libtool.m4
+++ /dev/null
@@ -1,7997 +0,0 @@
-# libtool.m4 - Configure libtool for the host system. -*-Autoconf-*-
-#
-#   Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2003, 2004, 2005,
-#                 2006, 2007, 2008, 2009, 2010, 2011 Free Software
-#                 Foundation, Inc.
-#   Written by Gordon Matzigkeit, 1996
-#
-# This file is free software; the Free Software Foundation gives
-# unlimited permission to copy and/or distribute it, with or without
-# modifications, as long as this notice is preserved.
-
-m4_define([_LT_COPYING], [dnl
-#   Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2003, 2004, 2005,
-#                 2006, 2007, 2008, 2009, 2010, 2011 Free Software
-#                 Foundation, Inc.
-#   Written by Gordon Matzigkeit, 1996
-#
-#   This file is part of GNU Libtool.
-#
-# GNU Libtool is free software; you can redistribute it and/or
-# modify it under the terms of the GNU General Public License as
-# published by the Free Software Foundation; either version 2 of
-# the License, or (at your option) any later version.
-#
-# As a special exception to the GNU General Public License,
-# if you distribute this file as part of a program or library that
-# is built using GNU Libtool, you may include this file under the
-# same distribution terms that you use for the rest of that program.
-#
-# GNU Libtool is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with GNU Libtool; see the file COPYING.  If not, a copy
-# can be downloaded from http://www.gnu.org/licenses/gpl.html, or
-# obtained by writing to the Free Software Foundation, Inc.,
-# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
-])
-
-# serial 57 LT_INIT
-
-
-# LT_PREREQ(VERSION)
-# ------------------
-# Complain and exit if this libtool version is less than VERSION.
-m4_defun([LT_PREREQ],
-[m4_if(m4_version_compare(m4_defn([LT_PACKAGE_VERSION]), [$1]), -1,
-       [m4_default([$3],
-		   [m4_fatal([Libtool version $1 or higher is required],
-		             63)])],
-       [$2])])
-
-
-# _LT_CHECK_BUILDDIR
-# ------------------
-# Complain if the absolute build directory name contains unusual characters
-m4_defun([_LT_CHECK_BUILDDIR],
-[case `pwd` in
-  *\ * | *\	*)
-    AC_MSG_WARN([Libtool does not cope well with whitespace in `pwd`]) ;;
-esac
-])
-
-
-# LT_INIT([OPTIONS])
-# ------------------
-AC_DEFUN([LT_INIT],
-[AC_PREREQ([2.58])dnl We use AC_INCLUDES_DEFAULT
-AC_REQUIRE([AC_CONFIG_AUX_DIR_DEFAULT])dnl
-AC_BEFORE([$0], [LT_LANG])dnl
-AC_BEFORE([$0], [LT_OUTPUT])dnl
-AC_BEFORE([$0], [LTDL_INIT])dnl
-m4_require([_LT_CHECK_BUILDDIR])dnl
-
-dnl Autoconf doesn't catch unexpanded LT_ macros by default:
-m4_pattern_forbid([^_?LT_[A-Z_]+$])dnl
-m4_pattern_allow([^(_LT_EOF|LT_DLGLOBAL|LT_DLLAZY_OR_NOW|LT_MULTI_MODULE)$])dnl
-dnl aclocal doesn't pull ltoptions.m4, ltsugar.m4, or ltversion.m4
-dnl unless we require an AC_DEFUNed macro:
-AC_REQUIRE([LTOPTIONS_VERSION])dnl
-AC_REQUIRE([LTSUGAR_VERSION])dnl
-AC_REQUIRE([LTVERSION_VERSION])dnl
-AC_REQUIRE([LTOBSOLETE_VERSION])dnl
-m4_require([_LT_PROG_LTMAIN])dnl
-
-_LT_SHELL_INIT([SHELL=${CONFIG_SHELL-/bin/sh}])
-
-dnl Parse OPTIONS
-_LT_SET_OPTIONS([$0], [$1])
-
-# This can be used to rebuild libtool when needed
-LIBTOOL_DEPS="$ltmain"
-
-# Always use our own libtool.
-LIBTOOL='$(SHELL) $(top_builddir)/libtool'
-AC_SUBST(LIBTOOL)dnl
-
-_LT_SETUP
-
-# Only expand once:
-m4_define([LT_INIT])
-])# LT_INIT
-
-# Old names:
-AU_ALIAS([AC_PROG_LIBTOOL], [LT_INIT])
-AU_ALIAS([AM_PROG_LIBTOOL], [LT_INIT])
-dnl aclocal-1.4 backwards compatibility:
-dnl AC_DEFUN([AC_PROG_LIBTOOL], [])
-dnl AC_DEFUN([AM_PROG_LIBTOOL], [])
-
-
-# _LT_CC_BASENAME(CC)
-# -------------------
-# Calculate cc_basename.  Skip known compiler wrappers and cross-prefix.
-m4_defun([_LT_CC_BASENAME],
-[for cc_temp in $1""; do
-  case $cc_temp in
-    compile | *[[\\/]]compile | ccache | *[[\\/]]ccache ) ;;
-    distcc | *[[\\/]]distcc | purify | *[[\\/]]purify ) ;;
-    \-*) ;;
-    *) break;;
-  esac
-done
-cc_basename=`$ECHO "$cc_temp" | $SED "s%.*/%%; s%^$host_alias-%%"`
-])
-
-
-# _LT_FILEUTILS_DEFAULTS
-# ----------------------
-# It is okay to use these file commands and assume they have been set
-# sensibly after `m4_require([_LT_FILEUTILS_DEFAULTS])'.
-m4_defun([_LT_FILEUTILS_DEFAULTS],
-[: ${CP="cp -f"}
-: ${MV="mv -f"}
-: ${RM="rm -f"}
-])# _LT_FILEUTILS_DEFAULTS
-
-
-# _LT_SETUP
-# ---------
-m4_defun([_LT_SETUP],
-[AC_REQUIRE([AC_CANONICAL_HOST])dnl
-AC_REQUIRE([AC_CANONICAL_BUILD])dnl
-AC_REQUIRE([_LT_PREPARE_SED_QUOTE_VARS])dnl
-AC_REQUIRE([_LT_PROG_ECHO_BACKSLASH])dnl
-
-_LT_DECL([], [PATH_SEPARATOR], [1], [The PATH separator for the build system])dnl
-dnl
-_LT_DECL([], [host_alias], [0], [The host system])dnl
-_LT_DECL([], [host], [0])dnl
-_LT_DECL([], [host_os], [0])dnl
-dnl
-_LT_DECL([], [build_alias], [0], [The build system])dnl
-_LT_DECL([], [build], [0])dnl
-_LT_DECL([], [build_os], [0])dnl
-dnl
-AC_REQUIRE([AC_PROG_CC])dnl
-AC_REQUIRE([LT_PATH_LD])dnl
-AC_REQUIRE([LT_PATH_NM])dnl
-dnl
-AC_REQUIRE([AC_PROG_LN_S])dnl
-test -z "$LN_S" && LN_S="ln -s"
-_LT_DECL([], [LN_S], [1], [Whether we need soft or hard links])dnl
-dnl
-AC_REQUIRE([LT_CMD_MAX_LEN])dnl
-_LT_DECL([objext], [ac_objext], [0], [Object file suffix (normally "o")])dnl
-_LT_DECL([], [exeext], [0], [Executable file suffix (normally "")])dnl
-dnl
-m4_require([_LT_FILEUTILS_DEFAULTS])dnl
-m4_require([_LT_CHECK_SHELL_FEATURES])dnl
-m4_require([_LT_PATH_CONVERSION_FUNCTIONS])dnl
-m4_require([_LT_CMD_RELOAD])dnl
-m4_require([_LT_CHECK_MAGIC_METHOD])dnl
-m4_require([_LT_CHECK_SHAREDLIB_FROM_LINKLIB])dnl
-m4_require([_LT_CMD_OLD_ARCHIVE])dnl
-m4_require([_LT_CMD_GLOBAL_SYMBOLS])dnl
-m4_require([_LT_WITH_SYSROOT])dnl
-
-_LT_CONFIG_LIBTOOL_INIT([
-# See if we are running on zsh, and set the options which allow our
-# commands through without removal of \ escapes INIT.
-if test -n "\${ZSH_VERSION+set}" ; then
-   setopt NO_GLOB_SUBST
-fi
-])
-if test -n "${ZSH_VERSION+set}" ; then
-   setopt NO_GLOB_SUBST
-fi
-
-_LT_CHECK_OBJDIR
-
-m4_require([_LT_TAG_COMPILER])dnl
-
-case $host_os in
-aix3*)
-  # AIX sometimes has problems with the GCC collect2 program.  For some
-  # reason, if we set the COLLECT_NAMES environment variable, the problems
-  # vanish in a puff of smoke.
-  if test "X${COLLECT_NAMES+set}" != Xset; then
-    COLLECT_NAMES=
-    export COLLECT_NAMES
-  fi
-  ;;
-esac
-
-# Global variables:
-ofile=libtool
-can_build_shared=yes
-
-# All known linkers require a `.a' archive for static linking (except MSVC,
-# which needs '.lib').
-libext=a
-
-with_gnu_ld="$lt_cv_prog_gnu_ld"
-
-old_CC="$CC"
-old_CFLAGS="$CFLAGS"
-
-# Set sane defaults for various variables
-test -z "$CC" && CC=cc
-test -z "$LTCC" && LTCC=$CC
-test -z "$LTCFLAGS" && LTCFLAGS=$CFLAGS
-test -z "$LD" && LD=ld
-test -z "$ac_objext" && ac_objext=o
-
-_LT_CC_BASENAME([$compiler])
-
-# Only perform the check for file, if the check method requires it
-test -z "$MAGIC_CMD" && MAGIC_CMD=file
-case $deplibs_check_method in
-file_magic*)
-  if test "$file_magic_cmd" = '$MAGIC_CMD'; then
-    _LT_PATH_MAGIC
-  fi
-  ;;
-esac
-
-# Use C for the default configuration in the libtool script
-LT_SUPPORTED_TAG([CC])
-_LT_LANG_C_CONFIG
-_LT_LANG_DEFAULT_CONFIG
-_LT_CONFIG_COMMANDS
-])# _LT_SETUP
-
-
-# _LT_PREPARE_SED_QUOTE_VARS
-# --------------------------
-# Define a few sed substitutions that help us do robust quoting.
-m4_defun([_LT_PREPARE_SED_QUOTE_VARS],
-[# Backslashify metacharacters that are still active within
-# double-quoted strings.
-sed_quote_subst='s/\([["`$\\]]\)/\\\1/g'
-
-# Same as above, but do not quote variable references.
-double_quote_subst='s/\([["`\\]]\)/\\\1/g'
-
-# Sed substitution to delay expansion of an escaped shell variable in a
-# double_quote_subst'ed string.
-delay_variable_subst='s/\\\\\\\\\\\$/\\\\\\$/g'
-
-# Sed substitution to delay expansion of an escaped single quote.
-delay_single_quote_subst='s/'\''/'\'\\\\\\\'\''/g'
-
-# Sed substitution to avoid accidental globbing in evaled expressions
-no_glob_subst='s/\*/\\\*/g'
-])
-
-# _LT_PROG_LTMAIN
-# ---------------
-# Note that this code is called both from `configure', and `config.status'
-# now that we use AC_CONFIG_COMMANDS to generate libtool.  Notably,
-# `config.status' has no value for ac_aux_dir unless we are using Automake,
-# so we pass a copy along to make sure it has a sensible value anyway.
-m4_defun([_LT_PROG_LTMAIN],
-[m4_ifdef([AC_REQUIRE_AUX_FILE], [AC_REQUIRE_AUX_FILE([ltmain.sh])])dnl
-_LT_CONFIG_LIBTOOL_INIT([ac_aux_dir='$ac_aux_dir'])
-ltmain="$ac_aux_dir/ltmain.sh"
-])# _LT_PROG_LTMAIN
-
-
-## ------------------------------------- ##
-## Accumulate code for creating libtool. ##
-## ------------------------------------- ##
-
-# So that we can recreate a full libtool script including additional
-# tags, we accumulate the chunks of code to send to AC_CONFIG_COMMANDS
-# in macros and then make a single call at the end using the `libtool'
-# label.
-
-
-# _LT_CONFIG_LIBTOOL_INIT([INIT-COMMANDS])
-# ----------------------------------------
-# Register INIT-COMMANDS to be passed to AC_CONFIG_COMMANDS later.
-m4_define([_LT_CONFIG_LIBTOOL_INIT],
-[m4_ifval([$1],
-          [m4_append([_LT_OUTPUT_LIBTOOL_INIT],
-                     [$1
-])])])
-
-# Initialize.
-m4_define([_LT_OUTPUT_LIBTOOL_INIT])
-
-
-# _LT_CONFIG_LIBTOOL([COMMANDS])
-# ------------------------------
-# Register COMMANDS to be passed to AC_CONFIG_COMMANDS later.
-m4_define([_LT_CONFIG_LIBTOOL],
-[m4_ifval([$1],
-          [m4_append([_LT_OUTPUT_LIBTOOL_COMMANDS],
-                     [$1
-])])])
-
-# Initialize.
-m4_define([_LT_OUTPUT_LIBTOOL_COMMANDS])
-
-
-# _LT_CONFIG_SAVE_COMMANDS([COMMANDS], [INIT_COMMANDS])
-# -----------------------------------------------------
-m4_defun([_LT_CONFIG_SAVE_COMMANDS],
-[_LT_CONFIG_LIBTOOL([$1])
-_LT_CONFIG_LIBTOOL_INIT([$2])
-])
-
-
-# _LT_FORMAT_COMMENT([COMMENT])
-# -----------------------------
-# Add leading comment marks to the start of each line, and a trailing
-# full-stop to the whole comment if one is not present already.
-m4_define([_LT_FORMAT_COMMENT],
-[m4_ifval([$1], [
-m4_bpatsubst([m4_bpatsubst([$1], [^ *], [# ])],
-              [['`$\]], [\\\&])]m4_bmatch([$1], [[!?.]$], [], [.])
-)])
-
-
-
-## ------------------------ ##
-## FIXME: Eliminate VARNAME ##
-## ------------------------ ##
-
-
-# _LT_DECL([CONFIGNAME], VARNAME, VALUE, [DESCRIPTION], [IS-TAGGED?])
-# -------------------------------------------------------------------
-# CONFIGNAME is the name given to the value in the libtool script.
-# VARNAME is the (base) name used in the configure script.
-# VALUE may be 0, 1 or 2 for a computed quote escaped value based on
-# VARNAME.  Any other value will be used directly.
-m4_define([_LT_DECL],
-[lt_if_append_uniq([lt_decl_varnames], [$2], [, ],
-    [lt_dict_add_subkey([lt_decl_dict], [$2], [libtool_name],
-	[m4_ifval([$1], [$1], [$2])])
-    lt_dict_add_subkey([lt_decl_dict], [$2], [value], [$3])
-    m4_ifval([$4],
-	[lt_dict_add_subkey([lt_decl_dict], [$2], [description], [$4])])
-    lt_dict_add_subkey([lt_decl_dict], [$2],
-	[tagged?], [m4_ifval([$5], [yes], [no])])])
-])
-
-
-# _LT_TAGDECL([CONFIGNAME], VARNAME, VALUE, [DESCRIPTION])
-# --------------------------------------------------------
-m4_define([_LT_TAGDECL], [_LT_DECL([$1], [$2], [$3], [$4], [yes])])
-
-
-# lt_decl_tag_varnames([SEPARATOR], [VARNAME1...])
-# ------------------------------------------------
-m4_define([lt_decl_tag_varnames],
-[_lt_decl_filter([tagged?], [yes], $@)])
-
-
-# _lt_decl_filter(SUBKEY, VALUE, [SEPARATOR], [VARNAME1..])
-# ---------------------------------------------------------
-m4_define([_lt_decl_filter],
-[m4_case([$#],
-  [0], [m4_fatal([$0: too few arguments: $#])],
-  [1], [m4_fatal([$0: too few arguments: $#: $1])],
-  [2], [lt_dict_filter([lt_decl_dict], [$1], [$2], [], lt_decl_varnames)],
-  [3], [lt_dict_filter([lt_decl_dict], [$1], [$2], [$3], lt_decl_varnames)],
-  [lt_dict_filter([lt_decl_dict], $@)])[]dnl
-])
-
-
-# lt_decl_quote_varnames([SEPARATOR], [VARNAME1...])
-# --------------------------------------------------
-m4_define([lt_decl_quote_varnames],
-[_lt_decl_filter([value], [1], $@)])
-
-
-# lt_decl_dquote_varnames([SEPARATOR], [VARNAME1...])
-# ---------------------------------------------------
-m4_define([lt_decl_dquote_varnames],
-[_lt_decl_filter([value], [2], $@)])
-
-
-# lt_decl_varnames_tagged([SEPARATOR], [VARNAME1...])
-# ---------------------------------------------------
-m4_define([lt_decl_varnames_tagged],
-[m4_assert([$# <= 2])dnl
-_$0(m4_quote(m4_default([$1], [[, ]])),
-    m4_ifval([$2], [[$2]], [m4_dquote(lt_decl_tag_varnames)]),
-    m4_split(m4_normalize(m4_quote(_LT_TAGS)), [ ]))])
-m4_define([_lt_decl_varnames_tagged],
-[m4_ifval([$3], [lt_combine([$1], [$2], [_], $3)])])
-
-
-# lt_decl_all_varnames([SEPARATOR], [VARNAME1...])
-# ------------------------------------------------
-m4_define([lt_decl_all_varnames],
-[_$0(m4_quote(m4_default([$1], [[, ]])),
-     m4_if([$2], [],
-	   m4_quote(lt_decl_varnames),
-	m4_quote(m4_shift($@))))[]dnl
-])
-m4_define([_lt_decl_all_varnames],
-[lt_join($@, lt_decl_varnames_tagged([$1],
-			lt_decl_tag_varnames([[, ]], m4_shift($@))))dnl
-])
-
-
-# _LT_CONFIG_STATUS_DECLARE([VARNAME])
-# ------------------------------------
-# Quote a variable value, and forward it to `config.status' so that its
-# declaration there will have the same value as in `configure'.  VARNAME
-# must have a single quote delimited value for this to work.
-m4_define([_LT_CONFIG_STATUS_DECLARE],
-[$1='`$ECHO "$][$1" | $SED "$delay_single_quote_subst"`'])
-
-
-# _LT_CONFIG_STATUS_DECLARATIONS
-# ------------------------------
-# We delimit libtool config variables with single quotes, so when
-# we write them to config.status, we have to be sure to quote all
-# embedded single quotes properly.  In configure, this macro expands
-# each variable declared with _LT_DECL (and _LT_TAGDECL) into:
-#
-#    <var>='`$ECHO "$<var>" | $SED "$delay_single_quote_subst"`'
-m4_defun([_LT_CONFIG_STATUS_DECLARATIONS],
-[m4_foreach([_lt_var], m4_quote(lt_decl_all_varnames),
-    [m4_n([_LT_CONFIG_STATUS_DECLARE(_lt_var)])])])
-
-
-# _LT_LIBTOOL_TAGS
-# ----------------
-# Output comment and list of tags supported by the script
-m4_defun([_LT_LIBTOOL_TAGS],
-[_LT_FORMAT_COMMENT([The names of the tagged configurations supported by this script])dnl
-available_tags="_LT_TAGS"dnl
-])
-
-
-# _LT_LIBTOOL_DECLARE(VARNAME, [TAG])
-# -----------------------------------
-# Extract the dictionary values for VARNAME (optionally with TAG) and
-# expand to a commented shell variable setting:
-#
-#    # Some comment about what VAR is for.
-#    visible_name=$lt_internal_name
-m4_define([_LT_LIBTOOL_DECLARE],
-[_LT_FORMAT_COMMENT(m4_quote(lt_dict_fetch([lt_decl_dict], [$1],
-					   [description])))[]dnl
-m4_pushdef([_libtool_name],
-    m4_quote(lt_dict_fetch([lt_decl_dict], [$1], [libtool_name])))[]dnl
-m4_case(m4_quote(lt_dict_fetch([lt_decl_dict], [$1], [value])),
-    [0], [_libtool_name=[$]$1],
-    [1], [_libtool_name=$lt_[]$1],
-    [2], [_libtool_name=$lt_[]$1],
-    [_libtool_name=lt_dict_fetch([lt_decl_dict], [$1], [value])])[]dnl
-m4_ifval([$2], [_$2])[]m4_popdef([_libtool_name])[]dnl
-])
-
-
-# _LT_LIBTOOL_CONFIG_VARS
-# -----------------------
-# Produce commented declarations of non-tagged libtool config variables
-# suitable for insertion in the LIBTOOL CONFIG section of the `libtool'
-# script.  Tagged libtool config variables (even for the LIBTOOL CONFIG
-# section) are produced by _LT_LIBTOOL_TAG_VARS.
-m4_defun([_LT_LIBTOOL_CONFIG_VARS],
-[m4_foreach([_lt_var],
-    m4_quote(_lt_decl_filter([tagged?], [no], [], lt_decl_varnames)),
-    [m4_n([_LT_LIBTOOL_DECLARE(_lt_var)])])])
-
-
-# _LT_LIBTOOL_TAG_VARS(TAG)
-# -------------------------
-m4_define([_LT_LIBTOOL_TAG_VARS],
-[m4_foreach([_lt_var], m4_quote(lt_decl_tag_varnames),
-    [m4_n([_LT_LIBTOOL_DECLARE(_lt_var, [$1])])])])
-
-
-# _LT_TAGVAR(VARNAME, [TAGNAME])
-# ------------------------------
-m4_define([_LT_TAGVAR], [m4_ifval([$2], [$1_$2], [$1])])
-
-
-# _LT_CONFIG_COMMANDS
-# -------------------
-# Send accumulated output to $CONFIG_STATUS.  Thanks to the lists of
-# variables for single and double quote escaping we saved from calls
-# to _LT_DECL, we can put quote escaped variables declarations
-# into `config.status', and then the shell code to quote escape them in
-# for loops in `config.status'.  Finally, any additional code accumulated
-# from calls to _LT_CONFIG_LIBTOOL_INIT is expanded.
-m4_defun([_LT_CONFIG_COMMANDS],
-[AC_PROVIDE_IFELSE([LT_OUTPUT],
-	dnl If the libtool generation code has been placed in $CONFIG_LT,
-	dnl instead of duplicating it all over again into config.status,
-	dnl then we will have config.status run $CONFIG_LT later, so it
-	dnl needs to know what name is stored there:
-        [AC_CONFIG_COMMANDS([libtool],
-            [$SHELL $CONFIG_LT || AS_EXIT(1)], [CONFIG_LT='$CONFIG_LT'])],
-    dnl If the libtool generation code is destined for config.status,
-    dnl expand the accumulated commands and init code now:
-    [AC_CONFIG_COMMANDS([libtool],
-        [_LT_OUTPUT_LIBTOOL_COMMANDS], [_LT_OUTPUT_LIBTOOL_COMMANDS_INIT])])
-])#_LT_CONFIG_COMMANDS
-
-
-# Initialize.
-m4_define([_LT_OUTPUT_LIBTOOL_COMMANDS_INIT],
-[
-
-# The HP-UX ksh and POSIX shell print the target directory to stdout
-# if CDPATH is set.
-(unset CDPATH) >/dev/null 2>&1 && unset CDPATH
-
-sed_quote_subst='$sed_quote_subst'
-double_quote_subst='$double_quote_subst'
-delay_variable_subst='$delay_variable_subst'
-_LT_CONFIG_STATUS_DECLARATIONS
-LTCC='$LTCC'
-LTCFLAGS='$LTCFLAGS'
-compiler='$compiler_DEFAULT'
-
-# A function that is used when there is no print builtin or printf.
-func_fallback_echo ()
-{
-  eval 'cat <<_LTECHO_EOF
-\$[]1
-_LTECHO_EOF'
-}
-
-# Quote evaled strings.
-for var in lt_decl_all_varnames([[ \
-]], lt_decl_quote_varnames); do
-    case \`eval \\\\\$ECHO \\\\""\\\\\$\$var"\\\\"\` in
-    *[[\\\\\\\`\\"\\\$]]*)
-      eval "lt_\$var=\\\\\\"\\\`\\\$ECHO \\"\\\$\$var\\" | \\\$SED \\"\\\$sed_quote_subst\\"\\\`\\\\\\""
-      ;;
-    *)
-      eval "lt_\$var=\\\\\\"\\\$\$var\\\\\\""
-      ;;
-    esac
-done
-
-# Double-quote double-evaled strings.
-for var in lt_decl_all_varnames([[ \
-]], lt_decl_dquote_varnames); do
-    case \`eval \\\\\$ECHO \\\\""\\\\\$\$var"\\\\"\` in
-    *[[\\\\\\\`\\"\\\$]]*)
-      eval "lt_\$var=\\\\\\"\\\`\\\$ECHO \\"\\\$\$var\\" | \\\$SED -e \\"\\\$double_quote_subst\\" -e \\"\\\$sed_quote_subst\\" -e \\"\\\$delay_variable_subst\\"\\\`\\\\\\""
-      ;;
-    *)
-      eval "lt_\$var=\\\\\\"\\\$\$var\\\\\\""
-      ;;
-    esac
-done
-
-_LT_OUTPUT_LIBTOOL_INIT
-])
-
-# _LT_GENERATED_FILE_INIT(FILE, [COMMENT])
-# ----------------------------------------
-# Generate a child script FILE with all initialization necessary to
-# reuse the environment learned by the parent script, and make the
-# file executable.  If COMMENT is supplied, it is inserted after the
-# `#!' sequence but before initialization text begins.  After this
-# macro, additional text can be appended to FILE to form the body of
-# the child script.  The macro ends with non-zero status if the
-# file could not be fully written (such as if the disk is full).
-m4_ifdef([AS_INIT_GENERATED],
-[m4_defun([_LT_GENERATED_FILE_INIT],[AS_INIT_GENERATED($@)])],
-[m4_defun([_LT_GENERATED_FILE_INIT],
-[m4_require([AS_PREPARE])]dnl
-[m4_pushdef([AS_MESSAGE_LOG_FD])]dnl
-[lt_write_fail=0
-cat >$1 <<_ASEOF || lt_write_fail=1
-#! $SHELL
-# Generated by $as_me.
-$2
-SHELL=\${CONFIG_SHELL-$SHELL}
-export SHELL
-_ASEOF
-cat >>$1 <<\_ASEOF || lt_write_fail=1
-AS_SHELL_SANITIZE
-_AS_PREPARE
-exec AS_MESSAGE_FD>&1
-_ASEOF
-test $lt_write_fail = 0 && chmod +x $1[]dnl
-m4_popdef([AS_MESSAGE_LOG_FD])])])# _LT_GENERATED_FILE_INIT
-
-# LT_OUTPUT
-# ---------
-# This macro allows early generation of the libtool script (before
-# AC_OUTPUT is called), in case it is used in configure for compilation
-# tests.
-AC_DEFUN([LT_OUTPUT],
-[: ${CONFIG_LT=./config.lt}
-AC_MSG_NOTICE([creating $CONFIG_LT])
-_LT_GENERATED_FILE_INIT(["$CONFIG_LT"],
-[# Run this file to recreate a libtool stub with the current configuration.])
-
-cat >>"$CONFIG_LT" <<\_LTEOF
-lt_cl_silent=false
-exec AS_MESSAGE_LOG_FD>>config.log
-{
-  echo
-  AS_BOX([Running $as_me.])
-} >&AS_MESSAGE_LOG_FD
-
-lt_cl_help="\
-\`$as_me' creates a local libtool stub from the current configuration,
-for use in further configure time tests before the real libtool is
-generated.
-
-Usage: $[0] [[OPTIONS]]
-
-  -h, --help      print this help, then exit
-  -V, --version   print version number, then exit
-  -q, --quiet     do not print progress messages
-  -d, --debug     don't remove temporary files
-
-Report bugs to <bug-libtool at gnu.org>."
-
-lt_cl_version="\
-m4_ifset([AC_PACKAGE_NAME], [AC_PACKAGE_NAME ])config.lt[]dnl
-m4_ifset([AC_PACKAGE_VERSION], [ AC_PACKAGE_VERSION])
-configured by $[0], generated by m4_PACKAGE_STRING.
-
-Copyright (C) 2011 Free Software Foundation, Inc.
-This config.lt script is free software; the Free Software Foundation
-gives unlimited permission to copy, distribute and modify it."
-
-while test $[#] != 0
-do
-  case $[1] in
-    --version | --v* | -V )
-      echo "$lt_cl_version"; exit 0 ;;
-    --help | --h* | -h )
-      echo "$lt_cl_help"; exit 0 ;;
-    --debug | --d* | -d )
-      debug=: ;;
-    --quiet | --q* | --silent | --s* | -q )
-      lt_cl_silent=: ;;
-
-    -*) AC_MSG_ERROR([unrecognized option: $[1]
-Try \`$[0] --help' for more information.]) ;;
-
-    *) AC_MSG_ERROR([unrecognized argument: $[1]
-Try \`$[0] --help' for more information.]) ;;
-  esac
-  shift
-done
-
-if $lt_cl_silent; then
-  exec AS_MESSAGE_FD>/dev/null
-fi
-_LTEOF
-
-cat >>"$CONFIG_LT" <<_LTEOF
-_LT_OUTPUT_LIBTOOL_COMMANDS_INIT
-_LTEOF
-
-cat >>"$CONFIG_LT" <<\_LTEOF
-AC_MSG_NOTICE([creating $ofile])
-_LT_OUTPUT_LIBTOOL_COMMANDS
-AS_EXIT(0)
-_LTEOF
-chmod +x "$CONFIG_LT"
-
-# configure is writing to config.log, but config.lt does its own redirection,
-# appending to config.log, which fails on DOS, as config.log is still kept
-# open by configure.  Here we exec the FD to /dev/null, effectively closing
-# config.log, so it can be properly (re)opened and appended to by config.lt.
-lt_cl_success=:
-test "$silent" = yes &&
-  lt_config_lt_args="$lt_config_lt_args --quiet"
-exec AS_MESSAGE_LOG_FD>/dev/null
-$SHELL "$CONFIG_LT" $lt_config_lt_args || lt_cl_success=false
-exec AS_MESSAGE_LOG_FD>>config.log
-$lt_cl_success || AS_EXIT(1)
-])# LT_OUTPUT
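
A sketch of where LT_OUTPUT sits in a configure.ac; AC_INIT, AC_PROG_CC,
LT_INIT and AC_OUTPUT are the standard macros, and the package details are
invented:

    AC_INIT([demo], [1.0])
    AC_PROG_CC
    LT_INIT
    LT_OUTPUT     # writes and runs ./config.lt, so ./libtool exists from here on
    # ... compile/link tests that need ./libtool can run here ...
    AC_OUTPUT
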
-
-
-# _LT_CONFIG(TAG)
-# ---------------
-# If TAG is the built-in tag, create an initial libtool script with a
-# default configuration from the untagged config vars.  Otherwise add code
-# to config.status for appending the configuration named by TAG from the
-# matching tagged config vars.
-m4_defun([_LT_CONFIG],
-[m4_require([_LT_FILEUTILS_DEFAULTS])dnl
-_LT_CONFIG_SAVE_COMMANDS([
-  m4_define([_LT_TAG], m4_if([$1], [], [C], [$1]))dnl
-  m4_if(_LT_TAG, [C], [
-    # See if we are running on zsh, and set the options which allow our
-    # commands through without removal of \ escapes.
-    if test -n "${ZSH_VERSION+set}" ; then
-      setopt NO_GLOB_SUBST
-    fi
-
-    cfgfile="${ofile}T"
-    trap "$RM \"$cfgfile\"; exit 1" 1 2 15
-    $RM "$cfgfile"
-
-    cat <<_LT_EOF >> "$cfgfile"
-#! $SHELL
-
-# `$ECHO "$ofile" | sed 's%^.*/%%'` - Provide generalized library-building support services.
-# Generated automatically by $as_me ($PACKAGE$TIMESTAMP) $VERSION
-# Libtool was configured on host `(hostname || uname -n) 2>/dev/null | sed 1q`:
-# NOTE: Changes made to this file will be lost: look at ltmain.sh.
-#
-_LT_COPYING
-_LT_LIBTOOL_TAGS
-
-# ### BEGIN LIBTOOL CONFIG
-_LT_LIBTOOL_CONFIG_VARS
-_LT_LIBTOOL_TAG_VARS
-# ### END LIBTOOL CONFIG
-
-_LT_EOF
-
-  case $host_os in
-  aix3*)
-    cat <<\_LT_EOF >> "$cfgfile"
-# AIX sometimes has problems with the GCC collect2 program.  For some
-# reason, if we set the COLLECT_NAMES environment variable, the problems
-# vanish in a puff of smoke.
-if test "X${COLLECT_NAMES+set}" != Xset; then
-  COLLECT_NAMES=
-  export COLLECT_NAMES
-fi
-_LT_EOF
-    ;;
-  esac
-
-  _LT_PROG_LTMAIN
-
-  # We use sed instead of cat because bash on DJGPP gets confused if
-  # it finds mixed CR/LF and LF-only lines.  Since sed operates in
-  # text mode, it properly converts lines to CR/LF.  This bash problem
-  # is reportedly fixed, but why not run on old versions too?
-  sed '$q' "$ltmain" >> "$cfgfile" \
-     || (rm -f "$cfgfile"; exit 1)
-
-  _LT_PROG_REPLACE_SHELLFNS
-
-   mv -f "$cfgfile" "$ofile" ||
-    (rm -f "$ofile" && cp "$cfgfile" "$ofile" && rm -f "$cfgfile")
-  chmod +x "$ofile"
-],
-[cat <<_LT_EOF >> "$ofile"
-
-dnl Unfortunately we have to use $1 here, since _LT_TAG is not expanded
-dnl in a comment (ie after a #).
-# ### BEGIN LIBTOOL TAG CONFIG: $1
-_LT_LIBTOOL_TAG_VARS(_LT_TAG)
-# ### END LIBTOOL TAG CONFIG: $1
-_LT_EOF
-])dnl /m4_if
-],
-[m4_if([$1], [], [
-    PACKAGE='$PACKAGE'
-    VERSION='$VERSION'
-    TIMESTAMP='$TIMESTAMP'
-    RM='$RM'
-    ofile='$ofile'], [])
-])dnl /_LT_CONFIG_SAVE_COMMANDS
-])# _LT_CONFIG
-
-
-# LT_SUPPORTED_TAG(TAG)
-# ---------------------
-# Trace this macro to discover what tags are supported by the libtool
-# --tag option, using:
-#    autoconf --trace 'LT_SUPPORTED_TAG:$1'
-AC_DEFUN([LT_SUPPORTED_TAG], [])
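
As the header comment says, the tag list can be recovered from a
bootstrapped source tree by tracing this macro, for example:

    autoconf --trace 'LT_SUPPORTED_TAG:$1'

The output is one tag name per line and depends on which LT_LANG calls the
configure.ac makes.
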
-
-
-# C support is built-in for now
-m4_define([_LT_LANG_C_enabled], [])
-m4_define([_LT_TAGS], [])
-
-
-# LT_LANG(LANG)
-# -------------
-# Enable libtool support for the given language if not already enabled.
-AC_DEFUN([LT_LANG],
-[AC_BEFORE([$0], [LT_OUTPUT])dnl
-m4_case([$1],
-  [C],			[_LT_LANG(C)],
-  [C++],		[_LT_LANG(CXX)],
-  [Go],			[_LT_LANG(GO)],
-  [Java],		[_LT_LANG(GCJ)],
-  [Fortran 77],		[_LT_LANG(F77)],
-  [Fortran],		[_LT_LANG(FC)],
-  [Windows Resource],	[_LT_LANG(RC)],
-  [m4_ifdef([_LT_LANG_]$1[_CONFIG],
-    [_LT_LANG($1)],
-    [m4_fatal([$0: unsupported language: "$1"])])])dnl
-])# LT_LANG
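
For illustration, a configure.ac fragment enabling extra tags explicitly;
the language names are exactly the ones accepted by the case above:

    LT_INIT
    LT_LANG([C++])
    LT_LANG([Fortran 77])
    LT_LANG([Go])
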
-
-
-# _LT_LANG(LANGNAME)
-# ------------------
-m4_defun([_LT_LANG],
-[m4_ifdef([_LT_LANG_]$1[_enabled], [],
-  [LT_SUPPORTED_TAG([$1])dnl
-  m4_append([_LT_TAGS], [$1 ])dnl
-  m4_define([_LT_LANG_]$1[_enabled], [])dnl
-  _LT_LANG_$1_CONFIG($1)])dnl
-])# _LT_LANG
-
-
-m4_ifndef([AC_PROG_GO], [
-############################################################
-# NOTE: This macro has been submitted for inclusion into   #
-#  GNU Autoconf as AC_PROG_GO.  When it is available in    #
-#  a released version of Autoconf we should remove this    #
-#  macro and use it instead.                               #
-############################################################
-m4_defun([AC_PROG_GO],
-[AC_LANG_PUSH(Go)dnl
-AC_ARG_VAR([GOC],     [Go compiler command])dnl
-AC_ARG_VAR([GOFLAGS], [Go compiler flags])dnl
-_AC_ARG_VAR_LDFLAGS()dnl
-AC_CHECK_TOOL(GOC, gccgo)
-if test -z "$GOC"; then
-  if test -n "$ac_tool_prefix"; then
-    AC_CHECK_PROG(GOC, [${ac_tool_prefix}gccgo], [${ac_tool_prefix}gccgo])
-  fi
-fi
-if test -z "$GOC"; then
-  AC_CHECK_PROG(GOC, gccgo, gccgo, false)
-fi
-])#m4_defun
-])#m4_ifndef
-
-
-# _LT_LANG_DEFAULT_CONFIG
-# -----------------------
-m4_defun([_LT_LANG_DEFAULT_CONFIG],
-[AC_PROVIDE_IFELSE([AC_PROG_CXX],
-  [LT_LANG(CXX)],
-  [m4_define([AC_PROG_CXX], defn([AC_PROG_CXX])[LT_LANG(CXX)])])
-
-AC_PROVIDE_IFELSE([AC_PROG_F77],
-  [LT_LANG(F77)],
-  [m4_define([AC_PROG_F77], defn([AC_PROG_F77])[LT_LANG(F77)])])
-
-AC_PROVIDE_IFELSE([AC_PROG_FC],
-  [LT_LANG(FC)],
-  [m4_define([AC_PROG_FC], defn([AC_PROG_FC])[LT_LANG(FC)])])
-
-dnl The call to [A][M_PROG_GCJ] is quoted like that to stop aclocal
-dnl pulling things in needlessly.
-AC_PROVIDE_IFELSE([AC_PROG_GCJ],
-  [LT_LANG(GCJ)],
-  [AC_PROVIDE_IFELSE([A][M_PROG_GCJ],
-    [LT_LANG(GCJ)],
-    [AC_PROVIDE_IFELSE([LT_PROG_GCJ],
-      [LT_LANG(GCJ)],
-      [m4_ifdef([AC_PROG_GCJ],
-	[m4_define([AC_PROG_GCJ], defn([AC_PROG_GCJ])[LT_LANG(GCJ)])])
-       m4_ifdef([A][M_PROG_GCJ],
-	[m4_define([A][M_PROG_GCJ], defn([A][M_PROG_GCJ])[LT_LANG(GCJ)])])
-       m4_ifdef([LT_PROG_GCJ],
-	[m4_define([LT_PROG_GCJ], defn([LT_PROG_GCJ])[LT_LANG(GCJ)])])])])])
-
-AC_PROVIDE_IFELSE([AC_PROG_GO],
-  [LT_LANG(GO)],
-  [m4_define([AC_PROG_GO], defn([AC_PROG_GO])[LT_LANG(GO)])])
-
-AC_PROVIDE_IFELSE([LT_PROG_RC],
-  [LT_LANG(RC)],
-  [m4_define([LT_PROG_RC], defn([LT_PROG_RC])[LT_LANG(RC)])])
-])# _LT_LANG_DEFAULT_CONFIG
-
-# Obsolete macros:
-AU_DEFUN([AC_LIBTOOL_CXX], [LT_LANG(C++)])
-AU_DEFUN([AC_LIBTOOL_F77], [LT_LANG(Fortran 77)])
-AU_DEFUN([AC_LIBTOOL_FC], [LT_LANG(Fortran)])
-AU_DEFUN([AC_LIBTOOL_GCJ], [LT_LANG(Java)])
-AU_DEFUN([AC_LIBTOOL_RC], [LT_LANG(Windows Resource)])
-dnl aclocal-1.4 backwards compatibility:
-dnl AC_DEFUN([AC_LIBTOOL_CXX], [])
-dnl AC_DEFUN([AC_LIBTOOL_F77], [])
-dnl AC_DEFUN([AC_LIBTOOL_FC], [])
-dnl AC_DEFUN([AC_LIBTOOL_GCJ], [])
-dnl AC_DEFUN([AC_LIBTOOL_RC], [])
-
-
-# _LT_TAG_COMPILER
-# ----------------
-m4_defun([_LT_TAG_COMPILER],
-[AC_REQUIRE([AC_PROG_CC])dnl
-
-_LT_DECL([LTCC], [CC], [1], [A C compiler])dnl
-_LT_DECL([LTCFLAGS], [CFLAGS], [1], [LTCC compiler flags])dnl
-_LT_TAGDECL([CC], [compiler], [1], [A language specific compiler])dnl
-_LT_TAGDECL([with_gcc], [GCC], [0], [Is the compiler the GNU compiler?])dnl
-
-# If no C compiler was specified, use CC.
-LTCC=${LTCC-"$CC"}
-
-# If no C compiler flags were specified, use CFLAGS.
-LTCFLAGS=${LTCFLAGS-"$CFLAGS"}
-
-# Allow CC to be a program name with arguments.
-compiler=$CC
-])# _LT_TAG_COMPILER
-
-
-# _LT_COMPILER_BOILERPLATE
-# ------------------------
-# Check for compiler boilerplate output or warnings with
-# the simple compiler test code.
-m4_defun([_LT_COMPILER_BOILERPLATE],
-[m4_require([_LT_DECL_SED])dnl
-ac_outfile=conftest.$ac_objext
-echo "$lt_simple_compile_test_code" >conftest.$ac_ext
-eval "$ac_compile" 2>&1 >/dev/null | $SED '/^$/d; /^ *+/d' >conftest.err
-_lt_compiler_boilerplate=`cat conftest.err`
-$RM conftest*
-])# _LT_COMPILER_BOILERPLATE
-
-
-# _LT_LINKER_BOILERPLATE
-# ----------------------
-# Check for linker boilerplate output or warnings with
-# the simple link test code.
-m4_defun([_LT_LINKER_BOILERPLATE],
-[m4_require([_LT_DECL_SED])dnl
-ac_outfile=conftest.$ac_objext
-echo "$lt_simple_link_test_code" >conftest.$ac_ext
-eval "$ac_link" 2>&1 >/dev/null | $SED '/^$/d; /^ *+/d' >conftest.err
-_lt_linker_boilerplate=`cat conftest.err`
-$RM -r conftest*
-])# _LT_LINKER_BOILERPLATE
-
-# _LT_REQUIRED_DARWIN_CHECKS
-# --------------------------
-m4_defun_once([_LT_REQUIRED_DARWIN_CHECKS],[
-  case $host_os in
-    rhapsody* | darwin*)
-    AC_CHECK_TOOL([DSYMUTIL], [dsymutil], [:])
-    AC_CHECK_TOOL([NMEDIT], [nmedit], [:])
-    AC_CHECK_TOOL([LIPO], [lipo], [:])
-    AC_CHECK_TOOL([OTOOL], [otool], [:])
-    AC_CHECK_TOOL([OTOOL64], [otool64], [:])
-    _LT_DECL([], [DSYMUTIL], [1],
-      [Tool to manipulate archived DWARF debug symbol files on Mac OS X])
-    _LT_DECL([], [NMEDIT], [1],
-      [Tool to change global to local symbols on Mac OS X])
-    _LT_DECL([], [LIPO], [1],
-      [Tool to manipulate fat objects and archives on Mac OS X])
-    _LT_DECL([], [OTOOL], [1],
-      [ldd/readelf-like tool for Mach-O binaries on Mac OS X])
-    _LT_DECL([], [OTOOL64], [1],
-      [ldd/readelf-like tool for 64-bit Mach-O binaries on Mac OS X 10.4])
-
-    AC_CACHE_CHECK([for -single_module linker flag],[lt_cv_apple_cc_single_mod],
-      [lt_cv_apple_cc_single_mod=no
-      if test -z "${LT_MULTI_MODULE}"; then
-	# By default we will add the -single_module flag. You can override
-	# by either setting the environment variable LT_MULTI_MODULE
-	# non-empty at configure time, or by adding -multi_module to the
-	# link flags.
-	rm -rf libconftest.dylib*
-	echo "int foo(void){return 1;}" > conftest.c
-	echo "$LTCC $LTCFLAGS $LDFLAGS -o libconftest.dylib \
--dynamiclib -Wl,-single_module conftest.c" >&AS_MESSAGE_LOG_FD
-	$LTCC $LTCFLAGS $LDFLAGS -o libconftest.dylib \
-	  -dynamiclib -Wl,-single_module conftest.c 2>conftest.err
-        _lt_result=$?
-	# If there is a non-empty error log, and "single_module"
-	# appears in it, assume the flag caused a linker warning
-        if test -s conftest.err && $GREP single_module conftest.err; then
-	  cat conftest.err >&AS_MESSAGE_LOG_FD
-	# Otherwise, if the output was created with a 0 exit code from
-	# the compiler, it worked.
-	elif test -f libconftest.dylib && test $_lt_result -eq 0; then
-	  lt_cv_apple_cc_single_mod=yes
-	else
-	  cat conftest.err >&AS_MESSAGE_LOG_FD
-	fi
-	rm -rf libconftest.dylib*
-	rm -f conftest.*
-      fi])
-
-    AC_CACHE_CHECK([for -exported_symbols_list linker flag],
-      [lt_cv_ld_exported_symbols_list],
-      [lt_cv_ld_exported_symbols_list=no
-      save_LDFLAGS=$LDFLAGS
-      echo "_main" > conftest.sym
-      LDFLAGS="$LDFLAGS -Wl,-exported_symbols_list,conftest.sym"
-      AC_LINK_IFELSE([AC_LANG_PROGRAM([],[])],
-	[lt_cv_ld_exported_symbols_list=yes],
-	[lt_cv_ld_exported_symbols_list=no])
-	LDFLAGS="$save_LDFLAGS"
-    ])
-
-    AC_CACHE_CHECK([for -force_load linker flag],[lt_cv_ld_force_load],
-      [lt_cv_ld_force_load=no
-      cat > conftest.c << _LT_EOF
-int forced_loaded() { return 2;}
-_LT_EOF
-      echo "$LTCC $LTCFLAGS -c -o conftest.o conftest.c" >&AS_MESSAGE_LOG_FD
-      $LTCC $LTCFLAGS -c -o conftest.o conftest.c 2>&AS_MESSAGE_LOG_FD
-      echo "$AR cru libconftest.a conftest.o" >&AS_MESSAGE_LOG_FD
-      $AR cru libconftest.a conftest.o 2>&AS_MESSAGE_LOG_FD
-      echo "$RANLIB libconftest.a" >&AS_MESSAGE_LOG_FD
-      $RANLIB libconftest.a 2>&AS_MESSAGE_LOG_FD
-      cat > conftest.c << _LT_EOF
-int main() { return 0;}
-_LT_EOF
-      echo "$LTCC $LTCFLAGS $LDFLAGS -o conftest conftest.c -Wl,-force_load,./libconftest.a" >&AS_MESSAGE_LOG_FD
-      $LTCC $LTCFLAGS $LDFLAGS -o conftest conftest.c -Wl,-force_load,./libconftest.a 2>conftest.err
-      _lt_result=$?
-      if test -s conftest.err && $GREP force_load conftest.err; then
-	cat conftest.err >&AS_MESSAGE_LOG_FD
-      elif test -f conftest && test $_lt_result -eq 0 && $GREP forced_load conftest >/dev/null 2>&1 ; then
-	lt_cv_ld_force_load=yes
-      else
-	cat conftest.err >&AS_MESSAGE_LOG_FD
-      fi
-        rm -f conftest.err libconftest.a conftest conftest.c
-        rm -rf conftest.dSYM
-    ])
-    case $host_os in
-    rhapsody* | darwin1.[[012]])
-      _lt_dar_allow_undefined='${wl}-undefined ${wl}suppress' ;;
-    darwin1.*)
-      _lt_dar_allow_undefined='${wl}-flat_namespace ${wl}-undefined ${wl}suppress' ;;
-    darwin*) # darwin 5.x on
-      # On 10.5 or later, the deployment target defaults to the OS
-      # version if on x86; on 10.4, the deployment target defaults
-      # to 10.4.  Don't you love it?
-      case ${MACOSX_DEPLOYMENT_TARGET-10.0},$host in
-	10.0,*86*-darwin8*|10.0,*-darwin[[91]]*)
-	  _lt_dar_allow_undefined='${wl}-undefined ${wl}dynamic_lookup' ;;
-	10.[[012]]*)
-	  _lt_dar_allow_undefined='${wl}-flat_namespace ${wl}-undefined ${wl}suppress' ;;
-	10.*)
-	  _lt_dar_allow_undefined='${wl}-undefined ${wl}dynamic_lookup' ;;
-      esac
-    ;;
-  esac
-    if test "$lt_cv_apple_cc_single_mod" = "yes"; then
-      _lt_dar_single_mod='$single_module'
-    fi
-    if test "$lt_cv_ld_exported_symbols_list" = "yes"; then
-      _lt_dar_export_syms=' ${wl}-exported_symbols_list,$output_objdir/${libname}-symbols.expsym'
-    else
-      _lt_dar_export_syms='~$NMEDIT -s $output_objdir/${libname}-symbols.expsym ${lib}'
-    fi
-    if test "$DSYMUTIL" != ":" && test "$lt_cv_ld_force_load" = "no"; then
-      _lt_dsymutil='~$DSYMUTIL $lib || :'
-    else
-      _lt_dsymutil=
-    fi
-    ;;
-  esac
-])
-
-
-# _LT_DARWIN_LINKER_FEATURES([TAG])
-# ---------------------------------
-# Checks for linker and compiler features on darwin
-m4_defun([_LT_DARWIN_LINKER_FEATURES],
-[
-  m4_require([_LT_REQUIRED_DARWIN_CHECKS])
-  _LT_TAGVAR(archive_cmds_need_lc, $1)=no
-  _LT_TAGVAR(hardcode_direct, $1)=no
-  _LT_TAGVAR(hardcode_automatic, $1)=yes
-  _LT_TAGVAR(hardcode_shlibpath_var, $1)=unsupported
-  if test "$lt_cv_ld_force_load" = "yes"; then
-    _LT_TAGVAR(whole_archive_flag_spec, $1)='`for conv in $convenience\"\"; do test  -n \"$conv\" && new_convenience=\"$new_convenience ${wl}-force_load,$conv\"; done; func_echo_all \"$new_convenience\"`'
-    m4_case([$1], [F77], [_LT_TAGVAR(compiler_needs_object, $1)=yes],
-                  [FC],  [_LT_TAGVAR(compiler_needs_object, $1)=yes])
-  else
-    _LT_TAGVAR(whole_archive_flag_spec, $1)=''
-  fi
-  _LT_TAGVAR(link_all_deplibs, $1)=yes
-  _LT_TAGVAR(allow_undefined_flag, $1)="$_lt_dar_allow_undefined"
-  case $cc_basename in
-     ifort*) _lt_dar_can_shared=yes ;;
-     *) _lt_dar_can_shared=$GCC ;;
-  esac
-  if test "$_lt_dar_can_shared" = "yes"; then
-    output_verbose_link_cmd=func_echo_all
-    _LT_TAGVAR(archive_cmds, $1)="\$CC -dynamiclib \$allow_undefined_flag -o \$lib \$libobjs \$deplibs \$compiler_flags -install_name \$rpath/\$soname \$verstring $_lt_dar_single_mod${_lt_dsymutil}"
-    _LT_TAGVAR(module_cmds, $1)="\$CC \$allow_undefined_flag -o \$lib -bundle \$libobjs \$deplibs \$compiler_flags${_lt_dsymutil}"
-    _LT_TAGVAR(archive_expsym_cmds, $1)="sed 's,^,_,' < \$export_symbols > \$output_objdir/\${libname}-symbols.expsym~\$CC -dynamiclib \$allow_undefined_flag -o \$lib \$libobjs \$deplibs \$compiler_flags -install_name \$rpath/\$soname \$verstring ${_lt_dar_single_mod}${_lt_dar_export_syms}${_lt_dsymutil}"
-    _LT_TAGVAR(module_expsym_cmds, $1)="sed -e 's,^,_,' < \$export_symbols > \$output_objdir/\${libname}-symbols.expsym~\$CC \$allow_undefined_flag -o \$lib -bundle \$libobjs \$deplibs \$compiler_flags${_lt_dar_export_syms}${_lt_dsymutil}"
-    m4_if([$1], [CXX],
-[   if test "$lt_cv_apple_cc_single_mod" != "yes"; then
-      _LT_TAGVAR(archive_cmds, $1)="\$CC -r -keep_private_externs -nostdlib -o \${lib}-master.o \$libobjs~\$CC -dynamiclib \$allow_undefined_flag -o \$lib \${lib}-master.o \$deplibs \$compiler_flags -install_name \$rpath/\$soname \$verstring${_lt_dsymutil}"
-      _LT_TAGVAR(archive_expsym_cmds, $1)="sed 's,^,_,' < \$export_symbols > \$output_objdir/\${libname}-symbols.expsym~\$CC -r -keep_private_externs -nostdlib -o \${lib}-master.o \$libobjs~\$CC -dynamiclib \$allow_undefined_flag -o \$lib \${lib}-master.o \$deplibs \$compiler_flags -install_name \$rpath/\$soname \$verstring${_lt_dar_export_syms}${_lt_dsymutil}"
-    fi
-],[])
-  else
-  _LT_TAGVAR(ld_shlibs, $1)=no
-  fi
-])
-
-# _LT_SYS_MODULE_PATH_AIX([TAGNAME])
-# ----------------------------------
-# Links a minimal program and checks the executable
-# for the system default hardcoded library path. In most cases,
-# this is /usr/lib:/lib, but when the MPI compilers are used
-# the locations of the communication and MPI libs are included too.
-# If we don't find anything, use the default library path according
-# to the AIX ld manual.
-# Store the results from the different compilers for each TAGNAME.
-# Allow overriding them for all tags through lt_cv_aix_libpath.
-m4_defun([_LT_SYS_MODULE_PATH_AIX],
-[m4_require([_LT_DECL_SED])dnl
-if test "${lt_cv_aix_libpath+set}" = set; then
-  aix_libpath=$lt_cv_aix_libpath
-else
-  AC_CACHE_VAL([_LT_TAGVAR([lt_cv_aix_libpath_], [$1])],
-  [AC_LINK_IFELSE([AC_LANG_PROGRAM],[
-  lt_aix_libpath_sed='[
-      /Import File Strings/,/^$/ {
-	  /^0/ {
-	      s/^0  *\([^ ]*\) *$/\1/
-	      p
-	  }
-      }]'
-  _LT_TAGVAR([lt_cv_aix_libpath_], [$1])=`dump -H conftest$ac_exeext 2>/dev/null | $SED -n -e "$lt_aix_libpath_sed"`
-  # Check for a 64-bit object if we didn't find anything.
-  if test -z "$_LT_TAGVAR([lt_cv_aix_libpath_], [$1])"; then
-    _LT_TAGVAR([lt_cv_aix_libpath_], [$1])=`dump -HX64 conftest$ac_exeext 2>/dev/null | $SED -n -e "$lt_aix_libpath_sed"`
-  fi],[])
-  if test -z "$_LT_TAGVAR([lt_cv_aix_libpath_], [$1])"; then
-    _LT_TAGVAR([lt_cv_aix_libpath_], [$1])="/usr/lib:/lib"
-  fi
-  ])
-  aix_libpath=$_LT_TAGVAR([lt_cv_aix_libpath_], [$1])
-fi
-])# _LT_SYS_MODULE_PATH_AIX
-
-
-# _LT_SHELL_INIT(ARG)
-# -------------------
-m4_define([_LT_SHELL_INIT],
-[m4_divert_text([M4SH-INIT], [$1
-])])# _LT_SHELL_INIT
-
-
-
-# _LT_PROG_ECHO_BACKSLASH
-# -----------------------
-# Find how we can fake an echo command that does not interpret backslash.
-# In particular, with Autoconf 2.60 or later we add some code to the start
-# of the generated configure script which will find a shell with a builtin
-# printf (which we can use as an echo command).
-m4_defun([_LT_PROG_ECHO_BACKSLASH],
-[ECHO='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\'
-ECHO=$ECHO$ECHO$ECHO$ECHO$ECHO
-ECHO=$ECHO$ECHO$ECHO$ECHO$ECHO$ECHO
-
-AC_MSG_CHECKING([how to print strings])
-# Test print first, because it will be a builtin if present.
-if test "X`( print -r -- -n ) 2>/dev/null`" = X-n && \
-   test "X`print -r -- $ECHO 2>/dev/null`" = "X$ECHO"; then
-  ECHO='print -r --'
-elif test "X`printf %s $ECHO 2>/dev/null`" = "X$ECHO"; then
-  ECHO='printf %s\n'
-else
-  # Use this function as a fallback that always works.
-  func_fallback_echo ()
-  {
-    eval 'cat <<_LTECHO_EOF
-$[]1
-_LTECHO_EOF'
-  }
-  ECHO='func_fallback_echo'
-fi
-
-# func_echo_all arg...
-# Invoke $ECHO with all args, space-separated.
-func_echo_all ()
-{
-    $ECHO "$*"
-}
-
-case "$ECHO" in
-  printf*) AC_MSG_RESULT([printf]) ;;
-  print*) AC_MSG_RESULT([print -r]) ;;
-  *) AC_MSG_RESULT([cat]) ;;
-esac
-
-m4_ifdef([_AS_DETECT_SUGGESTED],
-[_AS_DETECT_SUGGESTED([
-  test -n "${ZSH_VERSION+set}${BASH_VERSION+set}" || (
-    ECHO='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\'
-    ECHO=$ECHO$ECHO$ECHO$ECHO$ECHO
-    ECHO=$ECHO$ECHO$ECHO$ECHO$ECHO$ECHO
-    PATH=/empty FPATH=/empty; export PATH FPATH
-    test "X`printf %s $ECHO`" = "X$ECHO" \
-      || test "X`print -r -- $ECHO`" = "X$ECHO" )])])
-
-_LT_DECL([], [SHELL], [1], [Shell to use when invoking shell scripts])
-_LT_DECL([], [ECHO], [1], [An echo program that protects backslashes])
-])# _LT_PROG_ECHO_BACKSLASH
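
The portability gap this works around is easy to reproduce; a two-line
sketch:

    s='a\nb'
    echo "$s"            # some shells expand the \n, others print it literally
    printf '%s\n' "$s"   # printf %s always prints the string verbatim
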
-
-
-# _LT_WITH_SYSROOT
-# ----------------
-AC_DEFUN([_LT_WITH_SYSROOT],
-[AC_MSG_CHECKING([for sysroot])
-AC_ARG_WITH([sysroot],
-[  --with-sysroot[=DIR] Search for dependent libraries within DIR
-                        (or the compiler's sysroot if not specified).],
-[], [with_sysroot=no])
-
-dnl lt_sysroot will always be passed unquoted.  We quote it here
-dnl in case the user passed a directory name.
-lt_sysroot=
-case ${with_sysroot} in #(
- yes)
-   if test "$GCC" = yes; then
-     lt_sysroot=`$CC --print-sysroot 2>/dev/null`
-   fi
-   ;; #(
- /*)
-   lt_sysroot=`echo "$with_sysroot" | sed -e "$sed_quote_subst"`
-   ;; #(
- no|'')
-   ;; #(
- *)
-   AC_MSG_RESULT([${with_sysroot}])
-   AC_MSG_ERROR([The sysroot must be an absolute path.])
-   ;;
-esac
-
- AC_MSG_RESULT([${lt_sysroot:-no}])
-_LT_DECL([], [lt_sysroot], [0], [The root where to search for ]dnl
-[dependent libraries, and in which our libraries should be installed.])])
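
Example invocations matching the case branches above; the directory is
hypothetical:

    ./configure --with-sysroot                      # query `$CC --print-sysroot` (GCC only)
    ./configure --with-sysroot=/opt/cross/sysroot   # any absolute path is accepted
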
-
-# _LT_ENABLE_LOCK
-# ---------------
-m4_defun([_LT_ENABLE_LOCK],
-[AC_ARG_ENABLE([libtool-lock],
-  [AS_HELP_STRING([--disable-libtool-lock],
-    [avoid locking (might break parallel builds)])])
-test "x$enable_libtool_lock" != xno && enable_libtool_lock=yes
-
-# Some flags need to be propagated to the compiler or linker for good
-# libtool support.
-case $host in
-ia64-*-hpux*)
-  # Find out which ABI we are using.
-  echo 'int i;' > conftest.$ac_ext
-  if AC_TRY_EVAL(ac_compile); then
-    case `/usr/bin/file conftest.$ac_objext` in
-      *ELF-32*)
-	HPUX_IA64_MODE="32"
-	;;
-      *ELF-64*)
-	HPUX_IA64_MODE="64"
-	;;
-    esac
-  fi
-  rm -rf conftest*
-  ;;
-*-*-irix6*)
-  # Find out which ABI we are using.
-  echo '[#]line '$LINENO' "configure"' > conftest.$ac_ext
-  if AC_TRY_EVAL(ac_compile); then
-    if test "$lt_cv_prog_gnu_ld" = yes; then
-      case `/usr/bin/file conftest.$ac_objext` in
-	*32-bit*)
-	  LD="${LD-ld} -melf32bsmip"
-	  ;;
-	*N32*)
-	  LD="${LD-ld} -melf32bmipn32"
-	  ;;
-	*64-bit*)
-	  LD="${LD-ld} -melf64bmip"
-	;;
-      esac
-    else
-      case `/usr/bin/file conftest.$ac_objext` in
-	*32-bit*)
-	  LD="${LD-ld} -32"
-	  ;;
-	*N32*)
-	  LD="${LD-ld} -n32"
-	  ;;
-	*64-bit*)
-	  LD="${LD-ld} -64"
-	  ;;
-      esac
-    fi
-  fi
-  rm -rf conftest*
-  ;;
-
-x86_64-*kfreebsd*-gnu|x86_64-*linux*|powerpc*-*linux*| \
-s390*-*linux*|s390*-*tpf*|sparc*-*linux*)
-  # Find out which ABI we are using.
-  echo 'int i;' > conftest.$ac_ext
-  if AC_TRY_EVAL(ac_compile); then
-    case `/usr/bin/file conftest.o` in
-      *32-bit*)
-	case $host in
-	  x86_64-*kfreebsd*-gnu)
-	    LD="${LD-ld} -m elf_i386_fbsd"
-	    ;;
-	  x86_64-*linux*)
-	    case `/usr/bin/file conftest.o` in
-	      *x86-64*)
-		LD="${LD-ld} -m elf32_x86_64"
-		;;
-	      *)
-		LD="${LD-ld} -m elf_i386"
-		;;
-	    esac
-	    ;;
-	  powerpc64le-*)
-	    LD="${LD-ld} -m elf32lppclinux"
-	    ;;
-	  powerpc64-*)
-	    LD="${LD-ld} -m elf32ppclinux"
-	    ;;
-	  s390x-*linux*)
-	    LD="${LD-ld} -m elf_s390"
-	    ;;
-	  sparc64-*linux*)
-	    LD="${LD-ld} -m elf32_sparc"
-	    ;;
-	esac
-	;;
-      *64-bit*)
-	case $host in
-	  x86_64-*kfreebsd*-gnu)
-	    LD="${LD-ld} -m elf_x86_64_fbsd"
-	    ;;
-	  x86_64-*linux*)
-	    LD="${LD-ld} -m elf_x86_64"
-	    ;;
-	  powerpcle-*)
-	    LD="${LD-ld} -m elf64lppc"
-	    ;;
-	  powerpc-*)
-	    LD="${LD-ld} -m elf64ppc"
-	    ;;
-	  s390*-*linux*|s390*-*tpf*)
-	    LD="${LD-ld} -m elf64_s390"
-	    ;;
-	  sparc*-*linux*)
-	    LD="${LD-ld} -m elf64_sparc"
-	    ;;
-	esac
-	;;
-    esac
-  fi
-  rm -rf conftest*
-  ;;
-
-*-*-sco3.2v5*)
-  # On SCO OpenServer 5, we need -belf to get full-featured binaries.
-  SAVE_CFLAGS="$CFLAGS"
-  CFLAGS="$CFLAGS -belf"
-  AC_CACHE_CHECK([whether the C compiler needs -belf], lt_cv_cc_needs_belf,
-    [AC_LANG_PUSH(C)
-     AC_LINK_IFELSE([AC_LANG_PROGRAM([[]],[[]])],[lt_cv_cc_needs_belf=yes],[lt_cv_cc_needs_belf=no])
-     AC_LANG_POP])
-  if test x"$lt_cv_cc_needs_belf" != x"yes"; then
-    # this is probably gcc 2.8.0, egcs 1.0 or newer; no need for -belf
-    CFLAGS="$SAVE_CFLAGS"
-  fi
-  ;;
-*-*solaris*)
-  # Find out which ABI we are using.
-  echo 'int i;' > conftest.$ac_ext
-  if AC_TRY_EVAL(ac_compile); then
-    case `/usr/bin/file conftest.o` in
-    *64-bit*)
-      case $lt_cv_prog_gnu_ld in
-      yes*)
-        case $host in
-        i?86-*-solaris*)
-          LD="${LD-ld} -m elf_x86_64"
-          ;;
-        sparc*-*-solaris*)
-          LD="${LD-ld} -m elf64_sparc"
-          ;;
-        esac
-        # GNU ld 2.21 introduced _sol2 emulations.  Use them if available.
-        if ${LD-ld} -V | grep _sol2 >/dev/null 2>&1; then
-          LD="${LD-ld}_sol2"
-        fi
-        ;;
-      *)
-	if ${LD-ld} -64 -r -o conftest2.o conftest.o >/dev/null 2>&1; then
-	  LD="${LD-ld} -64"
-	fi
-	;;
-      esac
-      ;;
-    esac
-  fi
-  rm -rf conftest*
-  ;;
-esac
-
-need_locks="$enable_libtool_lock"
-])# _LT_ENABLE_LOCK
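
Every branch above uses the same probe: compile a trivial object and
classify it with /usr/bin/file. In isolation, with cc standing in for the
configured compiler:

    echo 'int i;' > conftest.c
    cc -c conftest.c
    case `/usr/bin/file conftest.o` in
      *32-bit*) echo "32-bit objects: select a 32-bit linker emulation" ;;
      *64-bit*) echo "64-bit objects: select a 64-bit linker emulation" ;;
    esac
    rm -f conftest.c conftest.o
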
-
-
-# _LT_PROG_AR
-# -----------
-m4_defun([_LT_PROG_AR],
-[AC_CHECK_TOOLS(AR, [ar], false)
-: ${AR=ar}
-: ${AR_FLAGS=cru}
-_LT_DECL([], [AR], [1], [The archiver])
-_LT_DECL([], [AR_FLAGS], [1], [Flags to create an archive])
-
-AC_CACHE_CHECK([for archiver @FILE support], [lt_cv_ar_at_file],
-  [lt_cv_ar_at_file=no
-   AC_COMPILE_IFELSE([AC_LANG_PROGRAM],
-     [echo conftest.$ac_objext > conftest.lst
-      lt_ar_try='$AR $AR_FLAGS libconftest.a @conftest.lst >&AS_MESSAGE_LOG_FD'
-      AC_TRY_EVAL([lt_ar_try])
-      if test "$ac_status" -eq 0; then
-	# Ensure the archiver fails upon bogus file names.
-	rm -f conftest.$ac_objext libconftest.a
-	AC_TRY_EVAL([lt_ar_try])
-	if test "$ac_status" -ne 0; then
-          lt_cv_ar_at_file=@
-        fi
-      fi
-      rm -f conftest.* libconftest.a
-     ])
-  ])
-
-if test "x$lt_cv_ar_at_file" = xno; then
-  archiver_list_spec=
-else
-  archiver_list_spec=$lt_cv_ar_at_file
-fi
-_LT_DECL([], [archiver_list_spec], [1],
-  [How to feed a file listing to the archiver])
-])# _LT_PROG_AR
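
When the probe succeeds, archiver_list_spec becomes "@" and object lists can
be passed through a response file, sidestepping command-line length limits.
A sketch with hypothetical file names:

    printf '%s\n' a.o b.o c.o > objects.lst
    ar cru libdemo.a @objects.lst
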
-
-
-# _LT_CMD_OLD_ARCHIVE
-# -------------------
-m4_defun([_LT_CMD_OLD_ARCHIVE],
-[_LT_PROG_AR
-
-AC_CHECK_TOOL(STRIP, strip, :)
-test -z "$STRIP" && STRIP=:
-_LT_DECL([], [STRIP], [1], [A symbol stripping program])
-
-AC_CHECK_TOOL(RANLIB, ranlib, :)
-test -z "$RANLIB" && RANLIB=:
-_LT_DECL([], [RANLIB], [1],
-    [Commands used to install an old-style archive])
-
-# Determine commands to create old-style static archives.
-old_archive_cmds='$AR $AR_FLAGS $oldlib$oldobjs'
-old_postinstall_cmds='chmod 644 $oldlib'
-old_postuninstall_cmds=
-
-if test -n "$RANLIB"; then
-  case $host_os in
-  openbsd*)
-    old_postinstall_cmds="$old_postinstall_cmds~\$RANLIB -t \$tool_oldlib"
-    ;;
-  *)
-    old_postinstall_cmds="$old_postinstall_cmds~\$RANLIB \$tool_oldlib"
-    ;;
-  esac
-  old_archive_cmds="$old_archive_cmds~\$RANLIB \$tool_oldlib"
-fi
-
-case $host_os in
-  darwin*)
-    lock_old_archive_extraction=yes ;;
-  *)
-    lock_old_archive_extraction=no ;;
-esac
-_LT_DECL([], [old_postinstall_cmds], [2])
-_LT_DECL([], [old_postuninstall_cmds], [2])
-_LT_TAGDECL([], [old_archive_cmds], [2],
-    [Commands used to build an old-style archive])
-_LT_DECL([], [lock_old_archive_extraction], [0],
-    [Whether to use a lock for old archive extraction])
-])# _LT_CMD_OLD_ARCHIVE
-
-
-# _LT_COMPILER_OPTION(MESSAGE, VARIABLE-NAME, FLAGS,
-#		[OUTPUT-FILE], [ACTION-SUCCESS], [ACTION-FAILURE])
-# ----------------------------------------------------------------
-# Check whether the given compiler option works
-AC_DEFUN([_LT_COMPILER_OPTION],
-[m4_require([_LT_FILEUTILS_DEFAULTS])dnl
-m4_require([_LT_DECL_SED])dnl
-AC_CACHE_CHECK([$1], [$2],
-  [$2=no
-   m4_if([$4], , [ac_outfile=conftest.$ac_objext], [ac_outfile=$4])
-   echo "$lt_simple_compile_test_code" > conftest.$ac_ext
-   lt_compiler_flag="$3"
-   # Insert the option either (1) after the last *FLAGS variable, or
-   # (2) before a word containing "conftest.", or (3) at the end.
-   # Note that $ac_compile itself does not contain backslashes and begins
-   # with a dollar sign (not a hyphen), so the echo should work correctly.
-   # The option is referenced via a variable to avoid confusing sed.
-   lt_compile=`echo "$ac_compile" | $SED \
-   -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
-   -e 's: [[^ ]]*conftest\.: $lt_compiler_flag&:; t' \
-   -e 's:$: $lt_compiler_flag:'`
-   (eval echo "\"\$as_me:$LINENO: $lt_compile\"" >&AS_MESSAGE_LOG_FD)
-   (eval "$lt_compile" 2>conftest.err)
-   ac_status=$?
-   cat conftest.err >&AS_MESSAGE_LOG_FD
-   echo "$as_me:$LINENO: \$? = $ac_status" >&AS_MESSAGE_LOG_FD
-   if (exit $ac_status) && test -s "$ac_outfile"; then
-     # The compiler can only warn and ignore the option if not recognized
-     # So say no if there are warnings other than the usual output.
-     $ECHO "$_lt_compiler_boilerplate" | $SED '/^$/d' >conftest.exp
-     $SED '/^$/d; /^ *+/d' conftest.err >conftest.er2
-     if test ! -s conftest.er2 || diff conftest.exp conftest.er2 >/dev/null; then
-       $2=yes
-     fi
-   fi
-   $RM conftest*
-])
-
-if test x"[$]$2" = xyes; then
-    m4_if([$5], , :, [$5])
-else
-    m4_if([$6], , :, [$6])
-fi
-])# _LT_COMPILER_OPTION
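
Stripped of the m4 plumbing, the check above (together with the boilerplate
captured earlier) is a compare-against-baseline pattern. A standalone
sketch; cc and -fsome-flag are placeholders:

    echo 'int main(void){return 0;}' > probe.c
    cc -c probe.c 2>baseline.err            # noise a bare compile emits
    cc -c -fsome-flag probe.c 2>flag.err
    if test -s probe.o && diff baseline.err flag.err >/dev/null; then
      echo "option accepted (no new diagnostics)"
    else
      echo "option rejected or produced warnings"
    fi
    rm -f probe.c probe.o baseline.err flag.err
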
-
-# Old name:
-AU_ALIAS([AC_LIBTOOL_COMPILER_OPTION], [_LT_COMPILER_OPTION])
-dnl aclocal-1.4 backwards compatibility:
-dnl AC_DEFUN([AC_LIBTOOL_COMPILER_OPTION], [])
-
-
-# _LT_LINKER_OPTION(MESSAGE, VARIABLE-NAME, FLAGS,
-#                  [ACTION-SUCCESS], [ACTION-FAILURE])
-# ----------------------------------------------------
-# Check whether the given linker option works
-AC_DEFUN([_LT_LINKER_OPTION],
-[m4_require([_LT_FILEUTILS_DEFAULTS])dnl
-m4_require([_LT_DECL_SED])dnl
-AC_CACHE_CHECK([$1], [$2],
-  [$2=no
-   save_LDFLAGS="$LDFLAGS"
-   LDFLAGS="$LDFLAGS $3"
-   echo "$lt_simple_link_test_code" > conftest.$ac_ext
-   if (eval $ac_link 2>conftest.err) && test -s conftest$ac_exeext; then
-     # The linker can only warn and ignore the option if not recognized
-     # So say no if there are warnings
-     if test -s conftest.err; then
-       # Append any errors to the config.log.
-       cat conftest.err 1>&AS_MESSAGE_LOG_FD
-       $ECHO "$_lt_linker_boilerplate" | $SED '/^$/d' > conftest.exp
-       $SED '/^$/d; /^ *+/d' conftest.err >conftest.er2
-       if diff conftest.exp conftest.er2 >/dev/null; then
-         $2=yes
-       fi
-     else
-       $2=yes
-     fi
-   fi
-   $RM -r conftest*
-   LDFLAGS="$save_LDFLAGS"
-])
-
-if test x"[$]$2" = xyes; then
-    m4_if([$4], , :, [$4])
-else
-    m4_if([$5], , :, [$5])
-fi
-])# _LT_LINKER_OPTION
-
-# Old name:
-AU_ALIAS([AC_LIBTOOL_LINKER_OPTION], [_LT_LINKER_OPTION])
-dnl aclocal-1.4 backwards compatibility:
-dnl AC_DEFUN([AC_LIBTOOL_LINKER_OPTION], [])
-
-
-# LT_CMD_MAX_LEN
-# --------------
-AC_DEFUN([LT_CMD_MAX_LEN],
-[AC_REQUIRE([AC_CANONICAL_HOST])dnl
-# find the maximum length of command line arguments
-AC_MSG_CHECKING([the maximum length of command line arguments])
-AC_CACHE_VAL([lt_cv_sys_max_cmd_len], [dnl
-  i=0
-  teststring="ABCD"
-
-  case $build_os in
-  msdosdjgpp*)
-    # On DJGPP, this test can blow up pretty badly due to problems in libc
-    # (any single argument exceeding 2000 bytes causes a buffer overrun
-    # during glob expansion).  Even if it were fixed, the result of this
-    # check would be larger than it should be.
-    lt_cv_sys_max_cmd_len=12288;    # 12K is about right
-    ;;
-
-  gnu*)
-    # Under GNU Hurd, this test is not required because there is
-    # no limit to the length of command line arguments.
-    # Libtool will interpret -1 as no limit whatsoever
-    lt_cv_sys_max_cmd_len=-1;
-    ;;
-
-  cygwin* | mingw* | cegcc*)
-    # On Win9x/ME, this test blows up -- it succeeds, but takes
-    # about 5 minutes as the teststring grows exponentially.
-    # Worse, since 9x/ME are not pre-emptively multitasking,
-    # you end up with a "frozen" computer, even though with patience
-    # the test eventually succeeds (with a max line length of 256k).
-    # Instead, let's just punt: use the minimum linelength reported by
-    # all of the supported platforms: 8192 (on NT/2K/XP).
-    lt_cv_sys_max_cmd_len=8192;
-    ;;
-
-  mint*)
-    # On MiNT this can take a long time and run out of memory.
-    lt_cv_sys_max_cmd_len=8192;
-    ;;
-
-  amigaos*)
-    # On AmigaOS with pdksh, this test takes hours, literally.
-    # So we just punt and use a minimum line length of 8192.
-    lt_cv_sys_max_cmd_len=8192;
-    ;;
-
-  netbsd* | freebsd* | openbsd* | darwin* | dragonfly*)
-    # This has been around since 386BSD, at least.  Likely further.
-    if test -x /sbin/sysctl; then
-      lt_cv_sys_max_cmd_len=`/sbin/sysctl -n kern.argmax`
-    elif test -x /usr/sbin/sysctl; then
-      lt_cv_sys_max_cmd_len=`/usr/sbin/sysctl -n kern.argmax`
-    else
-      lt_cv_sys_max_cmd_len=65536	# usable default for all BSDs
-    fi
-    # And add a safety zone
-    lt_cv_sys_max_cmd_len=`expr $lt_cv_sys_max_cmd_len \/ 4`
-    lt_cv_sys_max_cmd_len=`expr $lt_cv_sys_max_cmd_len \* 3`
-    ;;
-
-  interix*)
-    # We know the value 262144 and hardcode it with a safety zone (like BSD)
-    lt_cv_sys_max_cmd_len=196608
-    ;;
-
-  os2*)
-    # The test takes a long time on OS/2.
-    lt_cv_sys_max_cmd_len=8192
-    ;;
-
-  osf*)
-    # Dr. Hans Ekkehard Plesser reports seeing a kernel panic running configure
-    # due to this test when exec_disable_arg_limit is 1 on Tru64. It is not
-    # nice to cause kernel panics so let's avoid the loop below.
-    # First set a reasonable default.
-    lt_cv_sys_max_cmd_len=16384
-    #
-    if test -x /sbin/sysconfig; then
-      case `/sbin/sysconfig -q proc exec_disable_arg_limit` in
-        *1*) lt_cv_sys_max_cmd_len=-1 ;;
-      esac
-    fi
-    ;;
-  sco3.2v5*)
-    lt_cv_sys_max_cmd_len=102400
-    ;;
-  sysv5* | sco5v6* | sysv4.2uw2*)
-    kargmax=`grep ARG_MAX /etc/conf/cf.d/stune 2>/dev/null`
-    if test -n "$kargmax"; then
-      lt_cv_sys_max_cmd_len=`echo $kargmax | sed 's/.*[[	 ]]//'`
-    else
-      lt_cv_sys_max_cmd_len=32768
-    fi
-    ;;
-  *)
-    lt_cv_sys_max_cmd_len=`(getconf ARG_MAX) 2> /dev/null`
-    if test -n "$lt_cv_sys_max_cmd_len" && \
-	test undefined != "$lt_cv_sys_max_cmd_len"; then
-      lt_cv_sys_max_cmd_len=`expr $lt_cv_sys_max_cmd_len \/ 4`
-      lt_cv_sys_max_cmd_len=`expr $lt_cv_sys_max_cmd_len \* 3`
-    else
-      # Make teststring a little bigger before we do anything with it.
-      # A 1K string should be a reasonable start.
-      for i in 1 2 3 4 5 6 7 8 ; do
-        teststring=$teststring$teststring
-      done
-      SHELL=${SHELL-${CONFIG_SHELL-/bin/sh}}
-      # If test is not a shell built-in, we'll probably end up computing a
-      # maximum length that is only half of the actual maximum length, but
-      # we can't tell.
-      while { test "X"`env echo "$teststring$teststring" 2>/dev/null` \
-	         = "X$teststring$teststring"; } >/dev/null 2>&1 &&
-	      test $i != 17 # 1/2 MB should be enough
-      do
-        i=`expr $i + 1`
-        teststring=$teststring$teststring
-      done
-      # Only check the string length outside the loop.
-      lt_cv_sys_max_cmd_len=`expr "X$teststring" : ".*" 2>&1`
-      teststring=
-      # Add a significant safety factor because C++ compilers can tack on
-      # massive amounts of additional arguments before passing them to the
-      # linker.  It appears as though 1/2 is a usable value.
-      lt_cv_sys_max_cmd_len=`expr $lt_cv_sys_max_cmd_len \/ 2`
-    fi
-    ;;
-  esac
-])
-if test -n $lt_cv_sys_max_cmd_len ; then
-  AC_MSG_RESULT($lt_cv_sys_max_cmd_len)
-else
-  AC_MSG_RESULT(none)
-fi
-max_cmd_len=$lt_cv_sys_max_cmd_len
-_LT_DECL([], [max_cmd_len], [0],
-    [What is the maximum length of a command?])
-])# LT_CMD_MAX_LEN
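
On most modern POSIX systems the generic branch reduces to a getconf call
plus the same three-quarters safety margin; a sketch:

    len=`getconf ARG_MAX 2>/dev/null`
    if test -n "$len" && test "$len" != undefined; then
      echo "usable command length: `expr $len / 4 \* 3`"
    fi
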
-
-# Old name:
-AU_ALIAS([AC_LIBTOOL_SYS_MAX_CMD_LEN], [LT_CMD_MAX_LEN])
-dnl aclocal-1.4 backwards compatibility:
-dnl AC_DEFUN([AC_LIBTOOL_SYS_MAX_CMD_LEN], [])
-
-
-# _LT_HEADER_DLFCN
-# ----------------
-m4_defun([_LT_HEADER_DLFCN],
-[AC_CHECK_HEADERS([dlfcn.h], [], [], [AC_INCLUDES_DEFAULT])dnl
-])# _LT_HEADER_DLFCN
-
-
-# _LT_TRY_DLOPEN_SELF (ACTION-IF-TRUE, ACTION-IF-TRUE-W-USCORE,
-#                      ACTION-IF-FALSE, ACTION-IF-CROSS-COMPILING)
-# ----------------------------------------------------------------
-m4_defun([_LT_TRY_DLOPEN_SELF],
-[m4_require([_LT_HEADER_DLFCN])dnl
-if test "$cross_compiling" = yes; then :
-  [$4]
-else
-  lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
-  lt_status=$lt_dlunknown
-  cat > conftest.$ac_ext <<_LT_EOF
-[#line $LINENO "configure"
-#include "confdefs.h"
-
-#if HAVE_DLFCN_H
-#include <dlfcn.h>
-#endif
-
-#include <stdio.h>
-
-#ifdef RTLD_GLOBAL
-#  define LT_DLGLOBAL		RTLD_GLOBAL
-#else
-#  ifdef DL_GLOBAL
-#    define LT_DLGLOBAL		DL_GLOBAL
-#  else
-#    define LT_DLGLOBAL		0
-#  endif
-#endif
-
-/* We may have to define LT_DLLAZY_OR_NOW in the command line if we
-   find out it does not work on some platform. */
-#ifndef LT_DLLAZY_OR_NOW
-#  ifdef RTLD_LAZY
-#    define LT_DLLAZY_OR_NOW		RTLD_LAZY
-#  else
-#    ifdef DL_LAZY
-#      define LT_DLLAZY_OR_NOW		DL_LAZY
-#    else
-#      ifdef RTLD_NOW
-#        define LT_DLLAZY_OR_NOW	RTLD_NOW
-#      else
-#        ifdef DL_NOW
-#          define LT_DLLAZY_OR_NOW	DL_NOW
-#        else
-#          define LT_DLLAZY_OR_NOW	0
-#        endif
-#      endif
-#    endif
-#  endif
-#endif
-
-/* When -fvisibility=hidden is used, assume the code has been annotated
-   correspondingly for the symbols needed.  */
-#if defined(__GNUC__) && (((__GNUC__ == 3) && (__GNUC_MINOR__ >= 3)) || (__GNUC__ > 3))
-int fnord () __attribute__((visibility("default")));
-#endif
-
-int fnord () { return 42; }
-int main ()
-{
-  void *self = dlopen (0, LT_DLGLOBAL|LT_DLLAZY_OR_NOW);
-  int status = $lt_dlunknown;
-
-  if (self)
-    {
-      if (dlsym (self,"fnord"))       status = $lt_dlno_uscore;
-      else
-        {
-	  if (dlsym( self,"_fnord"))  status = $lt_dlneed_uscore;
-          else puts (dlerror ());
-	}
-      /* dlclose (self); */
-    }
-  else
-    puts (dlerror ());
-
-  return status;
-}]
-_LT_EOF
-  if AC_TRY_EVAL(ac_link) && test -s conftest${ac_exeext} 2>/dev/null; then
-    (./conftest; exit; ) >&AS_MESSAGE_LOG_FD 2>/dev/null
-    lt_status=$?
-    case x$lt_status in
-      x$lt_dlno_uscore) $1 ;;
-      x$lt_dlneed_uscore) $2 ;;
-      x$lt_dlunknown|x*) $3 ;;
-    esac
-  else :
-    # compilation failed
-    $3
-  fi
-fi
-rm -fr conftest*
-])# _LT_TRY_DLOPEN_SELF
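
Compiled outside configure, the probe's exit status encodes the result,
using the lt_dl* values above (0 unknown, 1 no underscore, 2 underscore
needed); the file names here are hypothetical:

    cc -o selftest selftest.c -ldl     # selftest.c holds the program above
    ./selftest
    case $? in
      1) echo "dlopen(self) works; symbols need no underscore" ;;
      2) echo "dlopen(self) works; symbols need a leading underscore" ;;
      *) echo "result unknown" ;;
    esac
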
-
-
-# LT_SYS_DLOPEN_SELF
-# ------------------
-AC_DEFUN([LT_SYS_DLOPEN_SELF],
-[m4_require([_LT_HEADER_DLFCN])dnl
-if test "x$enable_dlopen" != xyes; then
-  enable_dlopen=unknown
-  enable_dlopen_self=unknown
-  enable_dlopen_self_static=unknown
-else
-  lt_cv_dlopen=no
-  lt_cv_dlopen_libs=
-
-  case $host_os in
-  beos*)
-    lt_cv_dlopen="load_add_on"
-    lt_cv_dlopen_libs=
-    lt_cv_dlopen_self=yes
-    ;;
-
-  mingw* | pw32* | cegcc*)
-    lt_cv_dlopen="LoadLibrary"
-    lt_cv_dlopen_libs=
-    ;;
-
-  cygwin*)
-    lt_cv_dlopen="dlopen"
-    lt_cv_dlopen_libs=
-    ;;
-
-  darwin*)
-  # if libdl is installed we need to link against it
-    AC_CHECK_LIB([dl], [dlopen],
-		[lt_cv_dlopen="dlopen" lt_cv_dlopen_libs="-ldl"],[
-    lt_cv_dlopen="dyld"
-    lt_cv_dlopen_libs=
-    lt_cv_dlopen_self=yes
-    ])
-    ;;
-
-  *)
-    AC_CHECK_FUNC([shl_load],
-	  [lt_cv_dlopen="shl_load"],
-      [AC_CHECK_LIB([dld], [shl_load],
-	    [lt_cv_dlopen="shl_load" lt_cv_dlopen_libs="-ldld"],
-	[AC_CHECK_FUNC([dlopen],
-	      [lt_cv_dlopen="dlopen"],
-	  [AC_CHECK_LIB([dl], [dlopen],
-		[lt_cv_dlopen="dlopen" lt_cv_dlopen_libs="-ldl"],
-	    [AC_CHECK_LIB([svld], [dlopen],
-		  [lt_cv_dlopen="dlopen" lt_cv_dlopen_libs="-lsvld"],
-	      [AC_CHECK_LIB([dld], [dld_link],
-		    [lt_cv_dlopen="dld_link" lt_cv_dlopen_libs="-ldld"])
-	      ])
-	    ])
-	  ])
-	])
-      ])
-    ;;
-  esac
-
-  if test "x$lt_cv_dlopen" != xno; then
-    enable_dlopen=yes
-  else
-    enable_dlopen=no
-  fi
-
-  case $lt_cv_dlopen in
-  dlopen)
-    save_CPPFLAGS="$CPPFLAGS"
-    test "x$ac_cv_header_dlfcn_h" = xyes && CPPFLAGS="$CPPFLAGS -DHAVE_DLFCN_H"
-
-    save_LDFLAGS="$LDFLAGS"
-    wl=$lt_prog_compiler_wl eval LDFLAGS=\"\$LDFLAGS $export_dynamic_flag_spec\"
-
-    save_LIBS="$LIBS"
-    LIBS="$lt_cv_dlopen_libs $LIBS"
-
-    AC_CACHE_CHECK([whether a program can dlopen itself],
-	  lt_cv_dlopen_self, [dnl
-	  _LT_TRY_DLOPEN_SELF(
-	    lt_cv_dlopen_self=yes, lt_cv_dlopen_self=yes,
-	    lt_cv_dlopen_self=no, lt_cv_dlopen_self=cross)
-    ])
-
-    if test "x$lt_cv_dlopen_self" = xyes; then
-      wl=$lt_prog_compiler_wl eval LDFLAGS=\"\$LDFLAGS $lt_prog_compiler_static\"
-      AC_CACHE_CHECK([whether a statically linked program can dlopen itself],
-	  lt_cv_dlopen_self_static, [dnl
-	  _LT_TRY_DLOPEN_SELF(
-	    lt_cv_dlopen_self_static=yes, lt_cv_dlopen_self_static=yes,
-	    lt_cv_dlopen_self_static=no,  lt_cv_dlopen_self_static=cross)
-      ])
-    fi
-
-    CPPFLAGS="$save_CPPFLAGS"
-    LDFLAGS="$save_LDFLAGS"
-    LIBS="$save_LIBS"
-    ;;
-  esac
-
-  case $lt_cv_dlopen_self in
-  yes|no) enable_dlopen_self=$lt_cv_dlopen_self ;;
-  *) enable_dlopen_self=unknown ;;
-  esac
-
-  case $lt_cv_dlopen_self_static in
-  yes|no) enable_dlopen_self_static=$lt_cv_dlopen_self_static ;;
-  *) enable_dlopen_self_static=unknown ;;
-  esac
-fi
-_LT_DECL([dlopen_support], [enable_dlopen], [0],
-	 [Whether dlopen is supported])
-_LT_DECL([dlopen_self], [enable_dlopen_self], [0],
-	 [Whether dlopen of programs is supported])
-_LT_DECL([dlopen_self_static], [enable_dlopen_self_static], [0],
-	 [Whether dlopen of statically linked programs is supported])
-])# LT_SYS_DLOPEN_SELF
-
-# Old name:
-AU_ALIAS([AC_LIBTOOL_DLOPEN_SELF], [LT_SYS_DLOPEN_SELF])
-dnl aclocal-1.4 backwards compatibility:
-dnl AC_DEFUN([AC_LIBTOOL_DLOPEN_SELF], [])
-
-
-# _LT_COMPILER_C_O([TAGNAME])
-# ---------------------------
-# Check to see if options -c and -o are simultaneously supported by the compiler.
-# This macro does not hard code the compiler like AC_PROG_CC_C_O.
-m4_defun([_LT_COMPILER_C_O],
-[m4_require([_LT_DECL_SED])dnl
-m4_require([_LT_FILEUTILS_DEFAULTS])dnl
-m4_require([_LT_TAG_COMPILER])dnl
-AC_CACHE_CHECK([if $compiler supports -c -o file.$ac_objext],
-  [_LT_TAGVAR(lt_cv_prog_compiler_c_o, $1)],
-  [_LT_TAGVAR(lt_cv_prog_compiler_c_o, $1)=no
-   $RM -r conftest 2>/dev/null
-   mkdir conftest
-   cd conftest
-   mkdir out
-   echo "$lt_simple_compile_test_code" > conftest.$ac_ext
-
-   lt_compiler_flag="-o out/conftest2.$ac_objext"
-   # Insert the option either (1) after the last *FLAGS variable, or
-   # (2) before a word containing "conftest.", or (3) at the end.
-   # Note that $ac_compile itself does not contain backslashes and begins
-   # with a dollar sign (not a hyphen), so the echo should work correctly.
-   lt_compile=`echo "$ac_compile" | $SED \
-   -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
-   -e 's: [[^ ]]*conftest\.: $lt_compiler_flag&:; t' \
-   -e 's:$: $lt_compiler_flag:'`
-   (eval echo "\"\$as_me:$LINENO: $lt_compile\"" >&AS_MESSAGE_LOG_FD)
-   (eval "$lt_compile" 2>out/conftest.err)
-   ac_status=$?
-   cat out/conftest.err >&AS_MESSAGE_LOG_FD
-   echo "$as_me:$LINENO: \$? = $ac_status" >&AS_MESSAGE_LOG_FD
-   if (exit $ac_status) && test -s out/conftest2.$ac_objext
-   then
-     # The compiler can only warn and ignore the option if not recognized
-     # So say no if there are warnings
-     $ECHO "$_lt_compiler_boilerplate" | $SED '/^$/d' > out/conftest.exp
-     $SED '/^$/d; /^ *+/d' out/conftest.err >out/conftest.er2
-     if test ! -s out/conftest.er2 || diff out/conftest.exp out/conftest.er2 >/dev/null; then
-       _LT_TAGVAR(lt_cv_prog_compiler_c_o, $1)=yes
-     fi
-   fi
-   chmod u+w . 2>&AS_MESSAGE_LOG_FD
-   $RM conftest*
-   # SGI C++ compiler will create directory out/ii_files/ for
-   # template instantiation
-   test -d out/ii_files && $RM out/ii_files/* && rmdir out/ii_files
-   $RM out/* && rmdir out
-   cd ..
-   $RM -r conftest
-   $RM conftest*
-])
-_LT_TAGDECL([compiler_c_o], [lt_cv_prog_compiler_c_o], [1],
-	[Does compiler simultaneously support -c and -o options?])
-])# _LT_COMPILER_C_O
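
The property under test, reduced to a few lines with cc as a placeholder:

    echo 'int main(void){return 0;}' > t.c
    mkdir -p out
    if cc -c -o out/t.o t.c 2>/dev/null && test -s out/t.o; then
      echo "compiler accepts -c together with -o"
    fi
    rm -rf t.c out
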
-
-
-# _LT_COMPILER_FILE_LOCKS([TAGNAME])
-# ----------------------------------
-# Check to see if we can do hard links to lock some files if needed
-m4_defun([_LT_COMPILER_FILE_LOCKS],
-[m4_require([_LT_ENABLE_LOCK])dnl
-m4_require([_LT_FILEUTILS_DEFAULTS])dnl
-_LT_COMPILER_C_O([$1])
-
-hard_links="nottested"
-if test "$_LT_TAGVAR(lt_cv_prog_compiler_c_o, $1)" = no && test "$need_locks" != no; then
-  # do not overwrite the value of need_locks provided by the user
-  AC_MSG_CHECKING([if we can lock with hard links])
-  hard_links=yes
-  $RM conftest*
-  ln conftest.a conftest.b 2>/dev/null && hard_links=no
-  touch conftest.a
-  ln conftest.a conftest.b 2>&5 || hard_links=no
-  ln conftest.a conftest.b 2>/dev/null && hard_links=no
-  AC_MSG_RESULT([$hard_links])
-  if test "$hard_links" = no; then
-    AC_MSG_WARN([`$CC' does not support `-c -o', so `make -j' may be unsafe])
-    need_locks=warn
-  fi
-else
-  need_locks=no
-fi
-_LT_DECL([], [need_locks], [1], [Must we lock files when doing compilation?])
-])# _LT_COMPILER_FILE_LOCKS
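
The probe relies on ln refusing to clobber an existing target, which is
what makes a hard link usable as a mutex; in isolation:

    rm -f lock.a lock.b
    touch lock.a
    if ln lock.a lock.b 2>/dev/null && ! ln lock.a lock.b 2>/dev/null; then
      echo "hard links can serve as compile locks"
    fi
    rm -f lock.a lock.b
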
-
-
-# _LT_CHECK_OBJDIR
-# ----------------
-m4_defun([_LT_CHECK_OBJDIR],
-[AC_CACHE_CHECK([for objdir], [lt_cv_objdir],
-[rm -f .libs 2>/dev/null
-mkdir .libs 2>/dev/null
-if test -d .libs; then
-  lt_cv_objdir=.libs
-else
-  # MS-DOS does not allow filenames that begin with a dot.
-  lt_cv_objdir=_libs
-fi
-rmdir .libs 2>/dev/null])
-objdir=$lt_cv_objdir
-_LT_DECL([], [objdir], [0],
-         [The name of the directory that contains temporary libtool files])dnl
-m4_pattern_allow([LT_OBJDIR])dnl
-AC_DEFINE_UNQUOTED(LT_OBJDIR, "$lt_cv_objdir/",
-  [Define to the sub-directory in which libtool stores uninstalled libraries.])
-])# _LT_CHECK_OBJDIR
-
-
-# _LT_LINKER_HARDCODE_LIBPATH([TAGNAME])
-# --------------------------------------
-# Check hardcoding attributes.
-m4_defun([_LT_LINKER_HARDCODE_LIBPATH],
-[AC_MSG_CHECKING([how to hardcode library paths into programs])
-_LT_TAGVAR(hardcode_action, $1)=
-if test -n "$_LT_TAGVAR(hardcode_libdir_flag_spec, $1)" ||
-   test -n "$_LT_TAGVAR(runpath_var, $1)" ||
-   test "X$_LT_TAGVAR(hardcode_automatic, $1)" = "Xyes" ; then
-
-  # We can hardcode non-existent directories.
-  if test "$_LT_TAGVAR(hardcode_direct, $1)" != no &&
-     # If the only mechanism to avoid hardcoding is shlibpath_var, we
-     # have to relink, otherwise we might link with an installed library
-     # when we should be linking with a yet-to-be-installed one
-     ## test "$_LT_TAGVAR(hardcode_shlibpath_var, $1)" != no &&
-     test "$_LT_TAGVAR(hardcode_minus_L, $1)" != no; then
-    # Linking always hardcodes the temporary library directory.
-    _LT_TAGVAR(hardcode_action, $1)=relink
-  else
-    # We can link without hardcoding, and we can hardcode non-existent dirs.
-    _LT_TAGVAR(hardcode_action, $1)=immediate
-  fi
-else
-  # We cannot hardcode anything, or else we can only hardcode existing
-  # directories.
-  _LT_TAGVAR(hardcode_action, $1)=unsupported
-fi
-AC_MSG_RESULT([$_LT_TAGVAR(hardcode_action, $1)])
-
-if test "$_LT_TAGVAR(hardcode_action, $1)" = relink ||
-   test "$_LT_TAGVAR(inherit_rpath, $1)" = yes; then
-  # Fast installation is not supported
-  enable_fast_install=no
-elif test "$shlibpath_overrides_runpath" = yes ||
-     test "$enable_shared" = no; then
-  # Fast installation is not necessary
-  enable_fast_install=needless
-fi
-_LT_TAGDECL([], [hardcode_action], [0],
-    [How to hardcode a shared library path into an executable])
-])# _LT_LINKER_HARDCODE_LIBPATH
-
-
-# _LT_CMD_STRIPLIB
-# ----------------
-m4_defun([_LT_CMD_STRIPLIB],
-[m4_require([_LT_DECL_EGREP])
-striplib=
-old_striplib=
-AC_MSG_CHECKING([whether stripping libraries is possible])
-if test -n "$STRIP" && $STRIP -V 2>&1 | $GREP "GNU strip" >/dev/null; then
-  test -z "$old_striplib" && old_striplib="$STRIP --strip-debug"
-  test -z "$striplib" && striplib="$STRIP --strip-unneeded"
-  AC_MSG_RESULT([yes])
-else
-# FIXME - insert some real tests, host_os isn't really good enough
-  case $host_os in
-  darwin*)
-    if test -n "$STRIP" ; then
-      striplib="$STRIP -x"
-      old_striplib="$STRIP -S"
-      AC_MSG_RESULT([yes])
-    else
-      AC_MSG_RESULT([no])
-    fi
-    ;;
-  *)
-    AC_MSG_RESULT([no])
-    ;;
-  esac
-fi
-_LT_DECL([], [old_striplib], [1], [Commands to strip libraries])
-_LT_DECL([], [striplib], [1])
-])# _LT_CMD_STRIPLIB
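
With GNU binutils the two commands declared above amount to the following;
the library names are hypothetical:

    strip --strip-debug libdemo.a        # old_striplib: keep symbols, drop debug info
    strip --strip-unneeded libdemo.so    # striplib: drop all symbols not needed for relocation
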
-
-
-# _LT_SYS_DYNAMIC_LINKER([TAG])
-# -----------------------------
-# PORTME Fill in your ld.so characteristics
-m4_defun([_LT_SYS_DYNAMIC_LINKER],
-[AC_REQUIRE([AC_CANONICAL_HOST])dnl
-m4_require([_LT_DECL_EGREP])dnl
-m4_require([_LT_FILEUTILS_DEFAULTS])dnl
-m4_require([_LT_DECL_OBJDUMP])dnl
-m4_require([_LT_DECL_SED])dnl
-m4_require([_LT_CHECK_SHELL_FEATURES])dnl
-AC_MSG_CHECKING([dynamic linker characteristics])
-m4_if([$1],
-	[], [
-if test "$GCC" = yes; then
-  case $host_os in
-    darwin*) lt_awk_arg="/^libraries:/,/LR/" ;;
-    *) lt_awk_arg="/^libraries:/" ;;
-  esac
-  case $host_os in
-    mingw* | cegcc*) lt_sed_strip_eq="s,=\([[A-Za-z]]:\),\1,g" ;;
-    *) lt_sed_strip_eq="s,=/,/,g" ;;
-  esac
-  lt_search_path_spec=`$CC -print-search-dirs | awk $lt_awk_arg | $SED -e "s/^libraries://" -e $lt_sed_strip_eq`
-  case $lt_search_path_spec in
-  *\;*)
-    # if the path contains ";" then we assume it to be the separator
-    # otherwise default to the standard path separator (i.e. ":") - it is
-    # assumed that no part of a normal pathname contains ";" but that
-    # should be okay in the real world where ";" in dirpaths is itself problematic.
-    lt_search_path_spec=`$ECHO "$lt_search_path_spec" | $SED 's/;/ /g'`
-    ;;
-  *)
-    lt_search_path_spec=`$ECHO "$lt_search_path_spec" | $SED "s/$PATH_SEPARATOR/ /g"`
-    ;;
-  esac
-  # OK, now that we have the path separated by spaces, we can step through it
-  # and add multilib dir if necessary.
-  lt_tmp_lt_search_path_spec=
-  lt_multi_os_dir=`$CC $CPPFLAGS $CFLAGS $LDFLAGS -print-multi-os-directory 2>/dev/null`
-  for lt_sys_path in $lt_search_path_spec; do
-    if test -d "$lt_sys_path/$lt_multi_os_dir"; then
-      lt_tmp_lt_search_path_spec="$lt_tmp_lt_search_path_spec $lt_sys_path/$lt_multi_os_dir"
-    else
-      test -d "$lt_sys_path" && \
-	lt_tmp_lt_search_path_spec="$lt_tmp_lt_search_path_spec $lt_sys_path"
-    fi
-  done
-  lt_search_path_spec=`$ECHO "$lt_tmp_lt_search_path_spec" | awk '
-BEGIN {RS=" "; FS="/|\n";} {
-  lt_foo="";
-  lt_count=0;
-  for (lt_i = NF; lt_i > 0; lt_i--) {
-    if ($lt_i != "" && $lt_i != ".") {
-      if ($lt_i == "..") {
-        lt_count++;
-      } else {
-        if (lt_count == 0) {
-          lt_foo="/" $lt_i lt_foo;
-        } else {
-          lt_count--;
-        }
-      }
-    }
-  }
-  if (lt_foo != "") { lt_freq[[lt_foo]]++; }
-  if (lt_freq[[lt_foo]] == 1) { print lt_foo; }
-}'`
-  # AWK program above erroneously prepends '/' to C:/dos/paths
-  # for these hosts.
-  case $host_os in
-    mingw* | cegcc*) lt_search_path_spec=`$ECHO "$lt_search_path_spec" |\
-      $SED 's,/\([[A-Za-z]]:\),\1,g'` ;;
-  esac
-  sys_lib_search_path_spec=`$ECHO "$lt_search_path_spec" | $lt_NL2SP`
-else
-  sys_lib_search_path_spec="/lib /usr/lib /usr/local/lib"
-fi])
-library_names_spec=
-libname_spec='lib$name'
-soname_spec=
-shrext_cmds=".so"
-postinstall_cmds=
-postuninstall_cmds=
-finish_cmds=
-finish_eval=
-shlibpath_var=
-shlibpath_overrides_runpath=unknown
-version_type=none
-dynamic_linker="$host_os ld.so"
-sys_lib_dlsearch_path_spec="/lib /usr/lib"
-need_lib_prefix=unknown
-hardcode_into_libs=no
-
-# when you set need_version to no, make sure it does not cause -set_version
-# flags to be left without arguments
-need_version=unknown
-
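-# Illustrative note (not part of upstream libtool): the *_spec variables set
-# above and in the case statement below are shell templates that libtool
-# later evals for each library.  With, say, libname=libfoo, release empty,
-# shared_ext=.so, versuffix=.1.2.3 and major=.1, a typical ELF spec such as
-#   '${libname}${release}${shared_ext}$versuffix ... $libname${shared_ext}'
-# expands to "libfoo.so.1.2.3 libfoo.so.1 libfoo.so"; the first name is the
-# real file and the rest are symlinks (see the _LT_DECL comments at the end).
-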
-case $host_os in
-aix3*)
-  version_type=linux # correct to gnu/linux during the next big refactor
-  library_names_spec='${libname}${release}${shared_ext}$versuffix $libname.a'
-  shlibpath_var=LIBPATH
-
-  # AIX 3 has no versioning support, so we append a major version to the name.
-  soname_spec='${libname}${release}${shared_ext}$major'
-  ;;
-
-aix[[4-9]]*)
-  version_type=linux # correct to gnu/linux during the next big refactor
-  need_lib_prefix=no
-  need_version=no
-  hardcode_into_libs=yes
-  if test "$host_cpu" = ia64; then
-    # AIX 5 supports IA64
-    library_names_spec='${libname}${release}${shared_ext}$major ${libname}${release}${shared_ext}$versuffix $libname${shared_ext}'
-    shlibpath_var=LD_LIBRARY_PATH
-  else
-    # With GCC up to 2.95.x, collect2 would create an import file
-    # for dependence libraries.  The import file would start with
-    # the line `#! .'.  This would cause the generated library to
-    # depend on `.', always an invalid library.  This was fixed in
-    # development snapshots of GCC prior to 3.0.
-    case $host_os in
-      aix4 | aix4.[[01]] | aix4.[[01]].*)
-      if { echo '#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 97)'
-	   echo ' yes '
-	   echo '#endif'; } | ${CC} -E - | $GREP yes > /dev/null; then
-	:
-      else
-	can_build_shared=no
-      fi
-      ;;
-    esac
-    # AIX (on Power*) has no versioning support, so currently we cannot hardcode the
-    # correct soname into the executable. We could probably add versioning support
-    # to collect2, so additional links may be useful in the future.
-    if test "$aix_use_runtimelinking" = yes; then
-      # If using run time linking (on AIX 4.2 or later) use lib<name>.so
-      # instead of lib<name>.a to let people know that these are not
-      # typical AIX shared libraries.
-      library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
-    else
-      # We preserve .a as extension for shared libraries through AIX4.2
-      # and later when we are not doing run time linking.
-      library_names_spec='${libname}${release}.a $libname.a'
-      soname_spec='${libname}${release}${shared_ext}$major'
-    fi
-    shlibpath_var=LIBPATH
-  fi
-  ;;
-
-amigaos*)
-  case $host_cpu in
-  powerpc)
-    # Since July 2007 AmigaOS4 officially supports .so libraries.
-    # When compiling the executable, add -use-dynld -Lsobjs: to the compile line.
-    library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
-    ;;
-  m68k)
-    library_names_spec='$libname.ixlibrary $libname.a'
-    # Create ${libname}_ixlibrary.a entries in /sys/libs.
-    finish_eval='for lib in `ls $libdir/*.ixlibrary 2>/dev/null`; do libname=`func_echo_all "$lib" | $SED '\''s%^.*/\([[^/]]*\)\.ixlibrary$%\1%'\''`; test $RM /sys/libs/${libname}_ixlibrary.a; $show "cd /sys/libs && $LN_S $lib ${libname}_ixlibrary.a"; cd /sys/libs && $LN_S $lib ${libname}_ixlibrary.a || exit 1; done'
-    ;;
-  esac
-  ;;
-
-beos*)
-  library_names_spec='${libname}${shared_ext}'
-  dynamic_linker="$host_os ld.so"
-  shlibpath_var=LIBRARY_PATH
-  ;;
-
-bsdi[[45]]*)
-  version_type=linux # correct to gnu/linux during the next big refactor
-  need_version=no
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
-  soname_spec='${libname}${release}${shared_ext}$major'
-  finish_cmds='PATH="\$PATH:/sbin" ldconfig $libdir'
-  shlibpath_var=LD_LIBRARY_PATH
-  sys_lib_search_path_spec="/shlib /usr/lib /usr/X11/lib /usr/contrib/lib /lib /usr/local/lib"
-  sys_lib_dlsearch_path_spec="/shlib /usr/lib /usr/local/lib"
-  # the default ld.so.conf also contains /usr/contrib/lib and
-  # /usr/X11R6/lib (/usr/X11 is a link to /usr/X11R6), but let us allow
-  # libtool to hard-code these into programs
-  ;;
-
-cygwin* | mingw* | pw32* | cegcc*)
-  version_type=windows
-  shrext_cmds=".dll"
-  need_version=no
-  need_lib_prefix=no
-
-  case $GCC,$cc_basename in
-  yes,*)
-    # gcc
-    library_names_spec='$libname.dll.a'
-    # DLL is installed to $(libdir)/../bin by postinstall_cmds
-    postinstall_cmds='base_file=`basename \${file}`~
-      dlpath=`$SHELL 2>&1 -c '\''. $dir/'\''\${base_file}'\''i; echo \$dlname'\''`~
-      dldir=$destdir/`dirname \$dlpath`~
-      test -d \$dldir || mkdir -p \$dldir~
-      $install_prog $dir/$dlname \$dldir/$dlname~
-      chmod a+x \$dldir/$dlname~
-      if test -n '\''$stripme'\'' && test -n '\''$striplib'\''; then
-        eval '\''$striplib \$dldir/$dlname'\'' || exit \$?;
-      fi'
-    postuninstall_cmds='dldll=`$SHELL 2>&1 -c '\''. $file; echo \$dlname'\''`~
-      dlpath=$dir/\$dldll~
-       $RM \$dlpath'
-    shlibpath_overrides_runpath=yes
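-    # Note (illustrative, not upstream): in libtool's *_cmds variables the
-    # '~' character separates commands that ltmain.sh executes in sequence,
-    # so the postinstall_cmds above run roughly as:
-    #   base_file=`basename $file`; dlpath=...; mkdir -p $dldir;
-    #   $install_prog ...; chmod a+x ...; optionally $striplib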
-
-    case $host_os in
-    cygwin*)
-      # Cygwin DLLs use 'cyg' prefix rather than 'lib'
-      soname_spec='`echo ${libname} | sed -e 's/^lib/cyg/'``echo ${release} | $SED -e 's/[[.]]/-/g'`${versuffix}${shared_ext}'
-m4_if([$1], [],[
-      sys_lib_search_path_spec="$sys_lib_search_path_spec /usr/lib/w32api"])
-      ;;
-    mingw* | cegcc*)
-      # MinGW DLLs use traditional 'lib' prefix
-      soname_spec='${libname}`echo ${release} | $SED -e 's/[[.]]/-/g'`${versuffix}${shared_ext}'
-      ;;
-    pw32*)
-      # pw32 DLLs use 'pw' prefix rather than 'lib'
-      library_names_spec='`echo ${libname} | sed -e 's/^lib/pw/'``echo ${release} | $SED -e 's/[[.]]/-/g'`${versuffix}${shared_ext}'
-      ;;
-    esac
-    dynamic_linker='Win32 ld.exe'
-    ;;
-
-  *,cl*)
-    # Native MSVC
-    libname_spec='$name'
-    soname_spec='${libname}`echo ${release} | $SED -e 's/[[.]]/-/g'`${versuffix}${shared_ext}'
-    library_names_spec='${libname}.dll.lib'
-
-    case $build_os in
-    mingw*)
-      sys_lib_search_path_spec=
-      lt_save_ifs=$IFS
-      IFS=';'
-      for lt_path in $LIB
-      do
-        IFS=$lt_save_ifs
-        # Let DOS variable expansion print the short 8.3 style file name.
-        lt_path=`cd "$lt_path" 2>/dev/null && cmd //C "for %i in (".") do @echo %~si"`
-        sys_lib_search_path_spec="$sys_lib_search_path_spec $lt_path"
-      done
-      IFS=$lt_save_ifs
-      # Convert to MSYS style.
-      sys_lib_search_path_spec=`$ECHO "$sys_lib_search_path_spec" | sed -e 's|\\\\|/|g' -e 's| \\([[a-zA-Z]]\\):| /\\1|g' -e 's|^ ||'`
-      ;;
-    cygwin*)
-      # Convert to unix form, then to dos form, then back to unix form
-      # but this time dos style (no spaces!) so that the unix form looks
-      # like /cygdrive/c/PROGRA~1:/cygdr...
-      sys_lib_search_path_spec=`cygpath --path --unix "$LIB"`
-      sys_lib_search_path_spec=`cygpath --path --dos "$sys_lib_search_path_spec" 2>/dev/null`
-      sys_lib_search_path_spec=`cygpath --path --unix "$sys_lib_search_path_spec" | $SED -e "s/$PATH_SEPARATOR/ /g"`
-      ;;
-    *)
-      sys_lib_search_path_spec="$LIB"
-      if $ECHO "$sys_lib_search_path_spec" | [$GREP ';[c-zC-Z]:/' >/dev/null]; then
-        # It is most probably a Windows format PATH.
-        sys_lib_search_path_spec=`$ECHO "$sys_lib_search_path_spec" | $SED -e 's/;/ /g'`
-      else
-        sys_lib_search_path_spec=`$ECHO "$sys_lib_search_path_spec" | $SED -e "s/$PATH_SEPARATOR/ /g"`
-      fi
-      # FIXME: find the short name or the path components, as spaces are
-      # common. (e.g. "Program Files" -> "PROGRA~1")
-      ;;
-    esac
-
-    # DLL is installed to $(libdir)/../bin by postinstall_cmds
-    postinstall_cmds='base_file=`basename \${file}`~
-      dlpath=`$SHELL 2>&1 -c '\''. $dir/'\''\${base_file}'\''i; echo \$dlname'\''`~
-      dldir=$destdir/`dirname \$dlpath`~
-      test -d \$dldir || mkdir -p \$dldir~
-      $install_prog $dir/$dlname \$dldir/$dlname'
-    postuninstall_cmds='dldll=`$SHELL 2>&1 -c '\''. $file; echo \$dlname'\''`~
-      dlpath=$dir/\$dldll~
-       $RM \$dlpath'
-    shlibpath_overrides_runpath=yes
-    dynamic_linker='Win32 link.exe'
-    ;;
-
-  *)
-    # Assume MSVC wrapper
-    library_names_spec='${libname}`echo ${release} | $SED -e 's/[[.]]/-/g'`${versuffix}${shared_ext} $libname.lib'
-    dynamic_linker='Win32 ld.exe'
-    ;;
-  esac
-  # FIXME: first we should search . and the directory the executable is in
-  shlibpath_var=PATH
-  ;;
-
-darwin* | rhapsody*)
-  dynamic_linker="$host_os dyld"
-  version_type=darwin
-  need_lib_prefix=no
-  need_version=no
-  library_names_spec='${libname}${release}${major}$shared_ext ${libname}$shared_ext'
-  soname_spec='${libname}${release}${major}$shared_ext'
-  shlibpath_overrides_runpath=yes
-  shlibpath_var=DYLD_LIBRARY_PATH
-  shrext_cmds='`test .$module = .yes && echo .so || echo .dylib`'
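-  # Illustrative note (not upstream): shrext_cmds is eval'd per library, so
-  # on Darwin a loadable module ($module = yes) keeps the ".so" suffix while
-  # an ordinary shared library gets ".dylib", e.g. libfoo.1.dylib.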
-m4_if([$1], [],[
-  sys_lib_search_path_spec="$sys_lib_search_path_spec /usr/local/lib"])
-  sys_lib_dlsearch_path_spec='/usr/local/lib /lib /usr/lib'
-  ;;
-
-dgux*)
-  version_type=linux # correct to gnu/linux during the next big refactor
-  need_lib_prefix=no
-  need_version=no
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname$shared_ext'
-  soname_spec='${libname}${release}${shared_ext}$major'
-  shlibpath_var=LD_LIBRARY_PATH
-  ;;
-
-freebsd* | dragonfly*)
-  # DragonFly does not have aout.  When/if they implement a new
-  # versioning mechanism, adjust this.
-  if test -x /usr/bin/objformat; then
-    objformat=`/usr/bin/objformat`
-  else
-    case $host_os in
-    freebsd[[23]].*) objformat=aout ;;
-    *) objformat=elf ;;
-    esac
-  fi
-  version_type=freebsd-$objformat
-  case $version_type in
-    freebsd-elf*)
-      library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext} $libname${shared_ext}'
-      need_version=no
-      need_lib_prefix=no
-      ;;
-    freebsd-*)
-      library_names_spec='${libname}${release}${shared_ext}$versuffix $libname${shared_ext}$versuffix'
-      need_version=yes
-      ;;
-  esac
-  shlibpath_var=LD_LIBRARY_PATH
-  case $host_os in
-  freebsd2.*)
-    shlibpath_overrides_runpath=yes
-    ;;
-  freebsd3.[[01]]* | freebsdelf3.[[01]]*)
-    shlibpath_overrides_runpath=yes
-    hardcode_into_libs=yes
-    ;;
-  freebsd3.[[2-9]]* | freebsdelf3.[[2-9]]* | \
-  freebsd4.[[0-5]] | freebsdelf4.[[0-5]] | freebsd4.1.1 | freebsdelf4.1.1)
-    shlibpath_overrides_runpath=no
-    hardcode_into_libs=yes
-    ;;
-  *) # from 4.6 on, and DragonFly
-    shlibpath_overrides_runpath=yes
-    hardcode_into_libs=yes
-    ;;
-  esac
-  ;;
-
-haiku*)
-  version_type=linux # correct to gnu/linux during the next big refactor
-  need_lib_prefix=no
-  need_version=no
-  dynamic_linker="$host_os runtime_loader"
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}${major} ${libname}${shared_ext}'
-  soname_spec='${libname}${release}${shared_ext}$major'
-  shlibpath_var=LIBRARY_PATH
-  shlibpath_overrides_runpath=yes
-  sys_lib_dlsearch_path_spec='/boot/home/config/lib /boot/common/lib /boot/system/lib'
-  hardcode_into_libs=yes
-  ;;
-
-hpux9* | hpux10* | hpux11*)
-  # Give a soname corresponding to the major version so that dld.sl refuses to
-  # link against other versions.
-  version_type=sunos
-  need_lib_prefix=no
-  need_version=no
-  case $host_cpu in
-  ia64*)
-    shrext_cmds='.so'
-    hardcode_into_libs=yes
-    dynamic_linker="$host_os dld.so"
-    shlibpath_var=LD_LIBRARY_PATH
-    shlibpath_overrides_runpath=yes # Unless +noenvvar is specified.
-    library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
-    soname_spec='${libname}${release}${shared_ext}$major'
-    if test "X$HPUX_IA64_MODE" = X32; then
-      sys_lib_search_path_spec="/usr/lib/hpux32 /usr/local/lib/hpux32 /usr/local/lib"
-    else
-      sys_lib_search_path_spec="/usr/lib/hpux64 /usr/local/lib/hpux64"
-    fi
-    sys_lib_dlsearch_path_spec=$sys_lib_search_path_spec
-    ;;
-  hppa*64*)
-    shrext_cmds='.sl'
-    hardcode_into_libs=yes
-    dynamic_linker="$host_os dld.sl"
-    shlibpath_var=LD_LIBRARY_PATH # How should we handle SHLIB_PATH
-    shlibpath_overrides_runpath=yes # Unless +noenvvar is specified.
-    library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
-    soname_spec='${libname}${release}${shared_ext}$major'
-    sys_lib_search_path_spec="/usr/lib/pa20_64 /usr/ccs/lib/pa20_64"
-    sys_lib_dlsearch_path_spec=$sys_lib_search_path_spec
-    ;;
-  *)
-    shrext_cmds='.sl'
-    dynamic_linker="$host_os dld.sl"
-    shlibpath_var=SHLIB_PATH
-    shlibpath_overrides_runpath=no # +s is required to enable SHLIB_PATH
-    library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
-    soname_spec='${libname}${release}${shared_ext}$major'
-    ;;
-  esac
-  # HP-UX runs *really* slowly unless shared libraries are mode 555, ...
-  postinstall_cmds='chmod 555 $lib'
-  # or fails outright, so override atomically:
-  install_override_mode=555
-  ;;
-
-interix[[3-9]]*)
-  version_type=linux # correct to gnu/linux during the next big refactor
-  need_lib_prefix=no
-  need_version=no
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major ${libname}${shared_ext}'
-  soname_spec='${libname}${release}${shared_ext}$major'
-  dynamic_linker='Interix 3.x ld.so.1 (PE, like ELF)'
-  shlibpath_var=LD_LIBRARY_PATH
-  shlibpath_overrides_runpath=no
-  hardcode_into_libs=yes
-  ;;
-
-irix5* | irix6* | nonstopux*)
-  case $host_os in
-    nonstopux*) version_type=nonstopux ;;
-    *)
-	if test "$lt_cv_prog_gnu_ld" = yes; then
-		version_type=linux # correct to gnu/linux during the next big refactor
-	else
-		version_type=irix
-	fi ;;
-  esac
-  need_lib_prefix=no
-  need_version=no
-  soname_spec='${libname}${release}${shared_ext}$major'
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major ${libname}${release}${shared_ext} $libname${shared_ext}'
-  case $host_os in
-  irix5* | nonstopux*)
-    libsuff= shlibsuff=
-    ;;
-  *)
-    case $LD in # libtool.m4 will add one of these switches to LD
-    *-32|*"-32 "|*-melf32bsmip|*"-melf32bsmip ")
-      libsuff= shlibsuff= libmagic=32-bit;;
-    *-n32|*"-n32 "|*-melf32bmipn32|*"-melf32bmipn32 ")
-      libsuff=32 shlibsuff=N32 libmagic=N32;;
-    *-64|*"-64 "|*-melf64bmip|*"-melf64bmip ")
-      libsuff=64 shlibsuff=64 libmagic=64-bit;;
-    *) libsuff= shlibsuff= libmagic=never-match;;
-    esac
-    ;;
-  esac
-  shlibpath_var=LD_LIBRARY${shlibsuff}_PATH
-  shlibpath_overrides_runpath=no
-  sys_lib_search_path_spec="/usr/lib${libsuff} /lib${libsuff} /usr/local/lib${libsuff}"
-  sys_lib_dlsearch_path_spec="/usr/lib${libsuff} /lib${libsuff}"
-  hardcode_into_libs=yes
-  ;;
-
-# No shared lib support for Linux oldld, aout, or coff.
-linux*oldld* | linux*aout* | linux*coff*)
-  dynamic_linker=no
-  ;;
-
-# This must be glibc/ELF.
-linux* | k*bsd*-gnu | kopensolaris*-gnu | gnu*)
-  version_type=linux # correct to gnu/linux during the next big refactor
-  need_lib_prefix=no
-  need_version=no
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
-  soname_spec='${libname}${release}${shared_ext}$major'
-  finish_cmds='PATH="\$PATH:/sbin" ldconfig -n $libdir'
-  shlibpath_var=LD_LIBRARY_PATH
-  shlibpath_overrides_runpath=no
-
-  # Some binutils ld are patched to set DT_RUNPATH
-  AC_CACHE_VAL([lt_cv_shlibpath_overrides_runpath],
-    [lt_cv_shlibpath_overrides_runpath=no
-    save_LDFLAGS=$LDFLAGS
-    save_libdir=$libdir
-    eval "libdir=/foo; wl=\"$_LT_TAGVAR(lt_prog_compiler_wl, $1)\"; \
-	 LDFLAGS=\"\$LDFLAGS $_LT_TAGVAR(hardcode_libdir_flag_spec, $1)\""
-    AC_LINK_IFELSE([AC_LANG_PROGRAM([],[])],
-      [AS_IF([ ($OBJDUMP -p conftest$ac_exeext) 2>/dev/null | grep "RUNPATH.*$libdir" >/dev/null],
-	 [lt_cv_shlibpath_overrides_runpath=yes])])
-    LDFLAGS=$save_LDFLAGS
-    libdir=$save_libdir
-    ])
-  shlibpath_overrides_runpath=$lt_cv_shlibpath_overrides_runpath
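-  # Background note (added for clarity, not upstream): with the older
-  # DT_RPATH tag the hard-coded path wins over LD_LIBRARY_PATH, whereas with
-  # DT_RUNPATH the environment variable is searched first -- hence the probe
-  # above links against a fake libdir and greps objdump output for RUNPATH.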
-
-  # This implies no fast_install, which is unacceptable.
-  # Some rework will be needed to allow for fast_install
-  # before this can be enabled.
-  hardcode_into_libs=yes
-
-  # Append ld.so.conf contents to the search path
-  if test -f /etc/ld.so.conf; then
-    lt_ld_extra=`awk '/^include / { system(sprintf("cd /etc; cat %s 2>/dev/null", \[$]2)); skip = 1; } { if (!skip) print \[$]0; skip = 0; }' < /etc/ld.so.conf | $SED -e 's/#.*//;/^[	 ]*hwcap[	 ]/d;s/[:,	]/ /g;s/=[^=]*$//;s/=[^= ]* / /g;s/"//g;/^$/d' | tr '\n' ' '`
-    sys_lib_dlsearch_path_spec="/lib /usr/lib $lt_ld_extra"
-  fi
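-  # Example (illustrative, not upstream): given an /etc/ld.so.conf such as
-  #   include /etc/ld.so.conf.d/*.conf
-  #   /opt/lib
-  # the awk/sed pipeline above inlines the included fragments, strips
-  # comments and hwcap lines, and leaves lt_ld_extra as a single
-  # space-separated line (here "/opt/lib" plus the included directories).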
-
-  # We used to test for /lib/ld.so.1 and disable shared libraries on
-  # powerpc, because MkLinux only supported shared libraries with the
-  # GNU dynamic linker.  Since this was broken with cross compilers,
-  # most powerpc-linux boxes support dynamic linking these days and
-  # people can always --disable-shared, the test was removed, and we
-  # assume the GNU/Linux dynamic linker is in use.
-  dynamic_linker='GNU/Linux ld.so'
-  ;;
-
-netbsdelf*-gnu)
-  version_type=linux
-  need_lib_prefix=no
-  need_version=no
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major ${libname}${shared_ext}'
-  soname_spec='${libname}${release}${shared_ext}$major'
-  shlibpath_var=LD_LIBRARY_PATH
-  shlibpath_overrides_runpath=no
-  hardcode_into_libs=yes
-  dynamic_linker='NetBSD ld.elf_so'
-  ;;
-
-netbsd*)
-  version_type=sunos
-  need_lib_prefix=no
-  need_version=no
-  if echo __ELF__ | $CC -E - | $GREP __ELF__ >/dev/null; then
-    library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${shared_ext}$versuffix'
-    finish_cmds='PATH="\$PATH:/sbin" ldconfig -m $libdir'
-    dynamic_linker='NetBSD (a.out) ld.so'
-  else
-    library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major ${libname}${shared_ext}'
-    soname_spec='${libname}${release}${shared_ext}$major'
-    dynamic_linker='NetBSD ld.elf_so'
-  fi
-  shlibpath_var=LD_LIBRARY_PATH
-  shlibpath_overrides_runpath=yes
-  hardcode_into_libs=yes
-  ;;
-
-newsos6)
-  version_type=linux # correct to gnu/linux during the next big refactor
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
-  shlibpath_var=LD_LIBRARY_PATH
-  shlibpath_overrides_runpath=yes
-  ;;
-
-*nto* | *qnx*)
-  version_type=qnx
-  need_lib_prefix=no
-  need_version=no
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
-  soname_spec='${libname}${release}${shared_ext}$major'
-  shlibpath_var=LD_LIBRARY_PATH
-  shlibpath_overrides_runpath=no
-  hardcode_into_libs=yes
-  dynamic_linker='ldqnx.so'
-  ;;
-
-openbsd*)
-  version_type=sunos
-  sys_lib_dlsearch_path_spec="/usr/lib"
-  need_lib_prefix=no
-  # Some older versions of OpenBSD (3.3 at least) *do* need versioned libs.
-  case $host_os in
-    openbsd3.3 | openbsd3.3.*)	need_version=yes ;;
-    *)				need_version=no  ;;
-  esac
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${shared_ext}$versuffix'
-  finish_cmds='PATH="\$PATH:/sbin" ldconfig -m $libdir'
-  shlibpath_var=LD_LIBRARY_PATH
-  if test -z "`echo __ELF__ | $CC -E - | $GREP __ELF__`" || test "$host_os-$host_cpu" = "openbsd2.8-powerpc"; then
-    case $host_os in
-      openbsd2.[[89]] | openbsd2.[[89]].*)
-	shlibpath_overrides_runpath=no
-	;;
-      *)
-	shlibpath_overrides_runpath=yes
-	;;
-    esac
-  else
-    shlibpath_overrides_runpath=yes
-  fi
-  ;;
-
-os2*)
-  libname_spec='$name'
-  shrext_cmds=".dll"
-  need_lib_prefix=no
-  library_names_spec='$libname${shared_ext} $libname.a'
-  dynamic_linker='OS/2 ld.exe'
-  shlibpath_var=LIBPATH
-  ;;
-
-osf3* | osf4* | osf5*)
-  version_type=osf
-  need_lib_prefix=no
-  need_version=no
-  soname_spec='${libname}${release}${shared_ext}$major'
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
-  shlibpath_var=LD_LIBRARY_PATH
-  sys_lib_search_path_spec="/usr/shlib /usr/ccs/lib /usr/lib/cmplrs/cc /usr/lib /usr/local/lib /var/shlib"
-  sys_lib_dlsearch_path_spec="$sys_lib_search_path_spec"
-  ;;
-
-rdos*)
-  dynamic_linker=no
-  ;;
-
-solaris*)
-  version_type=linux # correct to gnu/linux during the next big refactor
-  need_lib_prefix=no
-  need_version=no
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
-  soname_spec='${libname}${release}${shared_ext}$major'
-  shlibpath_var=LD_LIBRARY_PATH
-  shlibpath_overrides_runpath=yes
-  hardcode_into_libs=yes
-  # ldd complains unless libraries are executable
-  postinstall_cmds='chmod +x $lib'
-  ;;
-
-sunos4*)
-  version_type=sunos
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${shared_ext}$versuffix'
-  finish_cmds='PATH="\$PATH:/usr/etc" ldconfig $libdir'
-  shlibpath_var=LD_LIBRARY_PATH
-  shlibpath_overrides_runpath=yes
-  if test "$with_gnu_ld" = yes; then
-    need_lib_prefix=no
-  fi
-  need_version=yes
-  ;;
-
-sysv4 | sysv4.3*)
-  version_type=linux # correct to gnu/linux during the next big refactor
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
-  soname_spec='${libname}${release}${shared_ext}$major'
-  shlibpath_var=LD_LIBRARY_PATH
-  case $host_vendor in
-    sni)
-      shlibpath_overrides_runpath=no
-      need_lib_prefix=no
-      runpath_var=LD_RUN_PATH
-      ;;
-    siemens)
-      need_lib_prefix=no
-      ;;
-    motorola)
-      need_lib_prefix=no
-      need_version=no
-      shlibpath_overrides_runpath=no
-      sys_lib_search_path_spec='/lib /usr/lib /usr/ccs/lib'
-      ;;
-  esac
-  ;;
-
-sysv4*MP*)
-  if test -d /usr/nec; then
-    version_type=linux # correct to gnu/linux during the next big refactor
-    library_names_spec='$libname${shared_ext}.$versuffix $libname${shared_ext}.$major $libname${shared_ext}'
-    soname_spec='$libname${shared_ext}.$major'
-    shlibpath_var=LD_LIBRARY_PATH
-  fi
-  ;;
-
-sysv5* | sco3.2v5* | sco5v6* | unixware* | OpenUNIX* | sysv4*uw2*)
-  version_type=freebsd-elf
-  need_lib_prefix=no
-  need_version=no
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext} $libname${shared_ext}'
-  soname_spec='${libname}${release}${shared_ext}$major'
-  shlibpath_var=LD_LIBRARY_PATH
-  shlibpath_overrides_runpath=yes
-  hardcode_into_libs=yes
-  if test "$with_gnu_ld" = yes; then
-    sys_lib_search_path_spec='/usr/local/lib /usr/gnu/lib /usr/ccs/lib /usr/lib /lib'
-  else
-    sys_lib_search_path_spec='/usr/ccs/lib /usr/lib'
-    case $host_os in
-      sco3.2v5*)
-        sys_lib_search_path_spec="$sys_lib_search_path_spec /lib"
-	;;
-    esac
-  fi
-  sys_lib_dlsearch_path_spec='/usr/lib'
-  ;;
-
-tpf*)
-  # TPF is a cross-target only.  Preferred cross-host = GNU/Linux.
-  version_type=linux # correct to gnu/linux during the next big refactor
-  need_lib_prefix=no
-  need_version=no
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
-  shlibpath_var=LD_LIBRARY_PATH
-  shlibpath_overrides_runpath=no
-  hardcode_into_libs=yes
-  ;;
-
-uts4*)
-  version_type=linux # correct to gnu/linux during the next big refactor
-  library_names_spec='${libname}${release}${shared_ext}$versuffix ${libname}${release}${shared_ext}$major $libname${shared_ext}'
-  soname_spec='${libname}${release}${shared_ext}$major'
-  shlibpath_var=LD_LIBRARY_PATH
-  ;;
-
-*)
-  dynamic_linker=no
-  ;;
-esac
-AC_MSG_RESULT([$dynamic_linker])
-test "$dynamic_linker" = no && can_build_shared=no
-
-variables_saved_for_relink="PATH $shlibpath_var $runpath_var"
-if test "$GCC" = yes; then
-  variables_saved_for_relink="$variables_saved_for_relink GCC_EXEC_PREFIX COMPILER_PATH LIBRARY_PATH"
-fi
-
-if test "${lt_cv_sys_lib_search_path_spec+set}" = set; then
-  sys_lib_search_path_spec="$lt_cv_sys_lib_search_path_spec"
-fi
-if test "${lt_cv_sys_lib_dlsearch_path_spec+set}" = set; then
-  sys_lib_dlsearch_path_spec="$lt_cv_sys_lib_dlsearch_path_spec"
-fi
-
-_LT_DECL([], [variables_saved_for_relink], [1],
-    [Variables whose values should be saved in libtool wrapper scripts and
-    restored at link time])
-_LT_DECL([], [need_lib_prefix], [0],
-    [Do we need the "lib" prefix for modules?])
-_LT_DECL([], [need_version], [0], [Do we need a version for libraries?])
-_LT_DECL([], [version_type], [0], [Library versioning type])
-_LT_DECL([], [runpath_var], [0],  [Shared library runtime path variable])
-_LT_DECL([], [shlibpath_var], [0],[Shared library path variable])
-_LT_DECL([], [shlibpath_overrides_runpath], [0],
-    [Is shlibpath searched before the hard-coded library search path?])
-_LT_DECL([], [libname_spec], [1], [Format of library name prefix])
-_LT_DECL([], [library_names_spec], [1],
-    [[List of archive names.  First name is the real one, the rest are links.
-    The last name is the one that the linker finds with -lNAME]])
-_LT_DECL([], [soname_spec], [1],
-    [[The coded name of the library, if different from the real name]])
-_LT_DECL([], [install_override_mode], [1],
-    [Permission mode override for installation of shared libraries])
-_LT_DECL([], [postinstall_cmds], [2],
-    [Command to use after installation of a shared archive])
-_LT_DECL([], [postuninstall_cmds], [2],
-    [Command to use after uninstallation of a shared archive])
-_LT_DECL([], [finish_cmds], [2],
-    [Commands used to finish a libtool library installation in a directory])
-_LT_DECL([], [finish_eval], [1],
-    [[As "finish_cmds", except a single script fragment to be evaled but
-    not shown]])
-_LT_DECL([], [hardcode_into_libs], [0],
-    [Whether we should hardcode library paths into libraries])
-_LT_DECL([], [sys_lib_search_path_spec], [2],
-    [Compile-time system search path for libraries])
-_LT_DECL([], [sys_lib_dlsearch_path_spec], [2],
-    [Run-time system search path for libraries])
-])# _LT_SYS_DYNAMIC_LINKER
-
-
-# _LT_PATH_TOOL_PREFIX(TOOL)
-# --------------------------
-# find a file program which can recognize a shared library
-AC_DEFUN([_LT_PATH_TOOL_PREFIX],
-[m4_require([_LT_DECL_EGREP])dnl
-AC_MSG_CHECKING([for $1])
-AC_CACHE_VAL(lt_cv_path_MAGIC_CMD,
-[case $MAGIC_CMD in
-[[\\/*] |  ?:[\\/]*])
-  lt_cv_path_MAGIC_CMD="$MAGIC_CMD" # Let the user override the test with a path.
-  ;;
-*)
-  lt_save_MAGIC_CMD="$MAGIC_CMD"
-  lt_save_ifs="$IFS"; IFS=$PATH_SEPARATOR
-dnl $ac_dummy forces splitting on constant user-supplied paths.
-dnl POSIX.2 word splitting is done only on the output of word expansions,
-dnl not every word.  This closes a longstanding sh security hole.
-  ac_dummy="m4_if([$2], , $PATH, [$2])"
-  for ac_dir in $ac_dummy; do
-    IFS="$lt_save_ifs"
-    test -z "$ac_dir" && ac_dir=.
-    if test -f $ac_dir/$1; then
-      lt_cv_path_MAGIC_CMD="$ac_dir/$1"
-      if test -n "$file_magic_test_file"; then
-	case $deplibs_check_method in
-	"file_magic "*)
-	  file_magic_regex=`expr "$deplibs_check_method" : "file_magic \(.*\)"`
-	  MAGIC_CMD="$lt_cv_path_MAGIC_CMD"
-	  if eval $file_magic_cmd \$file_magic_test_file 2> /dev/null |
-	    $EGREP "$file_magic_regex" > /dev/null; then
-	    :
-	  else
-	    cat <<_LT_EOF 1>&2
-
-*** Warning: the command libtool uses to detect shared libraries,
-*** $file_magic_cmd, produces output that libtool cannot recognize.
-*** The result is that libtool may fail to recognize shared libraries
-*** as such.  This will affect the creation of libtool libraries that
-*** depend on shared libraries, but programs linked with such libtool
-*** libraries will work regardless of this problem.  Nevertheless, you
-*** may want to report the problem to your system manager and/or to
-*** bug-libtool@gnu.org
-
-_LT_EOF
-	  fi ;;
-	esac
-      fi
-      break
-    fi
-  done
-  IFS="$lt_save_ifs"
-  MAGIC_CMD="$lt_save_MAGIC_CMD"
-  ;;
-esac])
-MAGIC_CMD="$lt_cv_path_MAGIC_CMD"
-if test -n "$MAGIC_CMD"; then
-  AC_MSG_RESULT($MAGIC_CMD)
-else
-  AC_MSG_RESULT(no)
-fi
-_LT_DECL([], [MAGIC_CMD], [0],
-	 [Used to examine libraries when file_magic_cmd begins with "file"])dnl
-])# _LT_PATH_TOOL_PREFIX
-
-# Old name:
-AU_ALIAS([AC_PATH_TOOL_PREFIX], [_LT_PATH_TOOL_PREFIX])
-dnl aclocal-1.4 backwards compatibility:
-dnl AC_DEFUN([AC_PATH_TOOL_PREFIX], [])
-
-
-# _LT_PATH_MAGIC
-# --------------
-# find a file program which can recognize a shared library
-m4_defun([_LT_PATH_MAGIC],
-[_LT_PATH_TOOL_PREFIX(${ac_tool_prefix}file, /usr/bin$PATH_SEPARATOR$PATH)
-if test -z "$lt_cv_path_MAGIC_CMD"; then
-  if test -n "$ac_tool_prefix"; then
-    _LT_PATH_TOOL_PREFIX(file, /usr/bin$PATH_SEPARATOR$PATH)
-  else
-    MAGIC_CMD=:
-  fi
-fi
-])# _LT_PATH_MAGIC
-
-
-# LT_PATH_LD
-# ----------
-# find the pathname to the GNU or non-GNU linker
-AC_DEFUN([LT_PATH_LD],
-[AC_REQUIRE([AC_PROG_CC])dnl
-AC_REQUIRE([AC_CANONICAL_HOST])dnl
-AC_REQUIRE([AC_CANONICAL_BUILD])dnl
-m4_require([_LT_DECL_SED])dnl
-m4_require([_LT_DECL_EGREP])dnl
-m4_require([_LT_PROG_ECHO_BACKSLASH])dnl
-
-AC_ARG_WITH([gnu-ld],
-    [AS_HELP_STRING([--with-gnu-ld],
-	[assume the C compiler uses GNU ld @<:@default=no@:>@])],
-    [test "$withval" = no || with_gnu_ld=yes],
-    [with_gnu_ld=no])dnl
-
-ac_prog=ld
-if test "$GCC" = yes; then
-  # Check if gcc -print-prog-name=ld gives a path.
-  AC_MSG_CHECKING([for ld used by $CC])
-  case $host in
-  *-*-mingw*)
-    # gcc leaves a trailing carriage return which upsets mingw
-    ac_prog=`($CC -print-prog-name=ld) 2>&5 | tr -d '\015'` ;;
-  *)
-    ac_prog=`($CC -print-prog-name=ld) 2>&5` ;;
-  esac
-  case $ac_prog in
-    # Accept absolute paths.
-    [[\\/]]* | ?:[[\\/]]*)
-      re_direlt='/[[^/]][[^/]]*/\.\./'
-      # Canonicalize the pathname of ld
-      ac_prog=`$ECHO "$ac_prog"| $SED 's%\\\\%/%g'`
-      while $ECHO "$ac_prog" | $GREP "$re_direlt" > /dev/null 2>&1; do
-	ac_prog=`$ECHO $ac_prog| $SED "s%$re_direlt%/%"`
-      done
-      test -z "$LD" && LD="$ac_prog"
-      ;;
-  "")
-    # If it fails, then pretend we aren't using GCC.
-    ac_prog=ld
-    ;;
-  *)
-    # If it is relative, then search for the first ld in PATH.
-    with_gnu_ld=unknown
-    ;;
-  esac
-elif test "$with_gnu_ld" = yes; then
-  AC_MSG_CHECKING([for GNU ld])
-else
-  AC_MSG_CHECKING([for non-GNU ld])
-fi
-AC_CACHE_VAL(lt_cv_path_LD,
-[if test -z "$LD"; then
-  lt_save_ifs="$IFS"; IFS=$PATH_SEPARATOR
-  for ac_dir in $PATH; do
-    IFS="$lt_save_ifs"
-    test -z "$ac_dir" && ac_dir=.
-    if test -f "$ac_dir/$ac_prog" || test -f "$ac_dir/$ac_prog$ac_exeext"; then
-      lt_cv_path_LD="$ac_dir/$ac_prog"
-      # Check to see if the program is GNU ld.  I'd rather use --version,
-      # but apparently some variants of GNU ld only accept -v.
-      # Break only if it was the GNU/non-GNU ld that we prefer.
-      case `"$lt_cv_path_LD" -v 2>&1 </dev/null` in
-      *GNU* | *'with BFD'*)
-	test "$with_gnu_ld" != no && break
-	;;
-      *)
-	test "$with_gnu_ld" != yes && break
-	;;
-      esac
-    fi
-  done
-  IFS="$lt_save_ifs"
-else
-  lt_cv_path_LD="$LD" # Let the user override the test with a path.
-fi])
-LD="$lt_cv_path_LD"
-if test -n "$LD"; then
-  AC_MSG_RESULT($LD)
-else
-  AC_MSG_RESULT(no)
-fi
-test -z "$LD" && AC_MSG_ERROR([no acceptable ld found in \$PATH])
-_LT_PATH_LD_GNU
-AC_SUBST([LD])
-
-_LT_TAGDECL([], [LD], [1], [The linker used to build libraries])
-])# LT_PATH_LD
-
-# Old names:
-AU_ALIAS([AM_PROG_LD], [LT_PATH_LD])
-AU_ALIAS([AC_PROG_LD], [LT_PATH_LD])
-dnl aclocal-1.4 backwards compatibility:
-dnl AC_DEFUN([AM_PROG_LD], [])
-dnl AC_DEFUN([AC_PROG_LD], [])
-
-
-# _LT_PATH_LD_GNU
-# ---------------
-m4_defun([_LT_PATH_LD_GNU],
-[AC_CACHE_CHECK([if the linker ($LD) is GNU ld], lt_cv_prog_gnu_ld,
-[# I'd rather use --version here, but apparently some GNU lds only accept -v.
-case `$LD -v 2>&1 </dev/null` in
-*GNU* | *'with BFD'*)
-  lt_cv_prog_gnu_ld=yes
-  ;;
-*)
-  lt_cv_prog_gnu_ld=no
-  ;;
-esac])
-with_gnu_ld=$lt_cv_prog_gnu_ld
-])# _LT_PATH_LD_GNU
-
-
-# _LT_CMD_RELOAD
-# --------------
-# find reload flag for linker
-#   -- PORTME Some linkers may need a different reload flag.
-m4_defun([_LT_CMD_RELOAD],
-[AC_CACHE_CHECK([for $LD option to reload object files],
-  lt_cv_ld_reload_flag,
-  [lt_cv_ld_reload_flag='-r'])
-reload_flag=$lt_cv_ld_reload_flag
-case $reload_flag in
-"" | " "*) ;;
-*) reload_flag=" $reload_flag" ;;
-esac
-reload_cmds='$LD$reload_flag -o $output$reload_objs'
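-# Illustrative note (not upstream): with the default reload_flag of '-r',
-# reload_cmds expands to something like
-#   ld -r -o combined.o a.o b.o
-# i.e. a relocatable (partial) link that merges several objects into one.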
-case $host_os in
-  cygwin* | mingw* | pw32* | cegcc*)
-    if test "$GCC" != yes; then
-      reload_cmds=false
-    fi
-    ;;
-  darwin*)
-    if test "$GCC" = yes; then
-      reload_cmds='$LTCC $LTCFLAGS -nostdlib ${wl}-r -o $output$reload_objs'
-    else
-      reload_cmds='$LD$reload_flag -o $output$reload_objs'
-    fi
-    ;;
-esac
-_LT_TAGDECL([], [reload_flag], [1], [How to create reloadable object files])dnl
-_LT_TAGDECL([], [reload_cmds], [2])dnl
-])# _LT_CMD_RELOAD
-
-
-# _LT_CHECK_MAGIC_METHOD
-# ----------------------
-# how to check for library dependencies
-#  -- PORTME fill in with the dynamic library characteristics
-m4_defun([_LT_CHECK_MAGIC_METHOD],
-[m4_require([_LT_DECL_EGREP])
-m4_require([_LT_DECL_OBJDUMP])
-AC_CACHE_CHECK([how to recognize dependent libraries],
-lt_cv_deplibs_check_method,
-[lt_cv_file_magic_cmd='$MAGIC_CMD'
-lt_cv_file_magic_test_file=
-lt_cv_deplibs_check_method='unknown'
-# Need to set the preceding variable on all platforms that support
-# interlibrary dependencies.
-# 'none' -- dependencies not supported.
-# `unknown' -- same as none, but documents that we really don't know.
-# 'pass_all' -- all dependencies passed with no checks.
-# 'test_compile' -- check by making a test program.
-# 'file_magic [[regex]]' -- check by looking for files in the library path
-# which respond to the $file_magic_cmd with a given extended regex.
-# If you have `file' or equivalent on your system and you're not sure
-# whether `pass_all' will *always* work, you probably want this one.
-
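-# Example of the file_magic method (illustrative, not upstream): for the
-# bsdi entry below, libtool effectively runs
-#   /usr/bin/file -L /shlib/libc.so | $EGREP "$file_magic_regex"
-# and treats a candidate as a shared library only if the regex matches.
-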
-case $host_os in
-aix[[4-9]]*)
-  lt_cv_deplibs_check_method=pass_all
-  ;;
-
-beos*)
-  lt_cv_deplibs_check_method=pass_all
-  ;;
-
-bsdi[[45]]*)
-  lt_cv_deplibs_check_method='file_magic ELF [[0-9]][[0-9]]*-bit [[ML]]SB (shared object|dynamic lib)'
-  lt_cv_file_magic_cmd='/usr/bin/file -L'
-  lt_cv_file_magic_test_file=/shlib/libc.so
-  ;;
-
-cygwin*)
-  # func_win32_libid is a shell function defined in ltmain.sh
-  lt_cv_deplibs_check_method='file_magic ^x86 archive import|^x86 DLL'
-  lt_cv_file_magic_cmd='func_win32_libid'
-  ;;
-
-mingw* | pw32*)
-  # Base MSYS/MinGW does not provide the 'file' command needed by the
-  # func_win32_libid shell function, so use a weaker test based on 'objdump'
-  # unless we find 'file', for example because we are cross-compiling.
-  # func_win32_libid assumes BSD nm, so disallow it if using MS dumpbin.
-  if ( test "$lt_cv_nm_interface" = "BSD nm" && file / ) >/dev/null 2>&1; then
-    lt_cv_deplibs_check_method='file_magic ^x86 archive import|^x86 DLL'
-    lt_cv_file_magic_cmd='func_win32_libid'
-  else
-    # Keep this pattern in sync with the one in func_win32_libid.
-    lt_cv_deplibs_check_method='file_magic file format (pei*-i386(.*architecture: i386)?|pe-arm-wince|pe-x86-64)'
-    lt_cv_file_magic_cmd='$OBJDUMP -f'
-  fi
-  ;;
-
-cegcc*)
-  # use the weaker test based on 'objdump'. See mingw*.
-  lt_cv_deplibs_check_method='file_magic file format pe-arm-.*little(.*architecture: arm)?'
-  lt_cv_file_magic_cmd='$OBJDUMP -f'
-  ;;
-
-darwin* | rhapsody*)
-  lt_cv_deplibs_check_method=pass_all
-  ;;
-
-freebsd* | dragonfly*)
-  if echo __ELF__ | $CC -E - | $GREP __ELF__ > /dev/null; then
-    case $host_cpu in
-    i*86 )
-      # Not sure whether the presence of OpenBSD here was a mistake.
-      # Let's accept both of them until this is cleared up.
-      lt_cv_deplibs_check_method='file_magic (FreeBSD|OpenBSD|DragonFly)/i[[3-9]]86 (compact )?demand paged shared library'
-      lt_cv_file_magic_cmd=/usr/bin/file
-      lt_cv_file_magic_test_file=`echo /usr/lib/libc.so.*`
-      ;;
-    esac
-  else
-    lt_cv_deplibs_check_method=pass_all
-  fi
-  ;;
-
-haiku*)
-  lt_cv_deplibs_check_method=pass_all
-  ;;
-
-hpux10.20* | hpux11*)
-  lt_cv_file_magic_cmd=/usr/bin/file
-  case $host_cpu in
-  ia64*)
-    lt_cv_deplibs_check_method='file_magic (s[[0-9]][[0-9]][[0-9]]|ELF-[[0-9]][[0-9]]) shared object file - IA64'
-    lt_cv_file_magic_test_file=/usr/lib/hpux32/libc.so
-    ;;
-  hppa*64*)
-    [lt_cv_deplibs_check_method='file_magic (s[0-9][0-9][0-9]|ELF[ -][0-9][0-9])(-bit)?( [LM]SB)? shared object( file)?[, -]* PA-RISC [0-9]\.[0-9]']
-    lt_cv_file_magic_test_file=/usr/lib/pa20_64/libc.sl
-    ;;
-  *)
-    lt_cv_deplibs_check_method='file_magic (s[[0-9]][[0-9]][[0-9]]|PA-RISC[[0-9]]\.[[0-9]]) shared library'
-    lt_cv_file_magic_test_file=/usr/lib/libc.sl
-    ;;
-  esac
-  ;;
-
-interix[[3-9]]*)
-  # PIC code is broken on Interix 3.x; that's why we match |\.a rather than |_pic\.a here
-  lt_cv_deplibs_check_method='match_pattern /lib[[^/]]+(\.so|\.a)$'
-  ;;
-
-irix5* | irix6* | nonstopux*)
-  case $LD in
-  *-32|*"-32 ") libmagic=32-bit;;
-  *-n32|*"-n32 ") libmagic=N32;;
-  *-64|*"-64 ") libmagic=64-bit;;
-  *) libmagic=never-match;;
-  esac
-  lt_cv_deplibs_check_method=pass_all
-  ;;
-
-# This must be glibc/ELF.
-linux* | k*bsd*-gnu | kopensolaris*-gnu | gnu*)
-  lt_cv_deplibs_check_method=pass_all
-  ;;
-
-netbsd* | netbsdelf*-gnu)
-  if echo __ELF__ | $CC -E - | $GREP __ELF__ > /dev/null; then
-    lt_cv_deplibs_check_method='match_pattern /lib[[^/]]+(\.so\.[[0-9]]+\.[[0-9]]+|_pic\.a)$'
-  else
-    lt_cv_deplibs_check_method='match_pattern /lib[[^/]]+(\.so|_pic\.a)$'
-  fi
-  ;;
-
-newos6*)
-  lt_cv_deplibs_check_method='file_magic ELF [[0-9]][[0-9]]*-bit [[ML]]SB (executable|dynamic lib)'
-  lt_cv_file_magic_cmd=/usr/bin/file
-  lt_cv_file_magic_test_file=/usr/lib/libnls.so
-  ;;
-
-*nto* | *qnx*)
-  lt_cv_deplibs_check_method=pass_all
-  ;;
-
-openbsd*)
-  if test -z "`echo __ELF__ | $CC -E - | $GREP __ELF__`" || test "$host_os-$host_cpu" = "openbsd2.8-powerpc"; then
-    lt_cv_deplibs_check_method='match_pattern /lib[[^/]]+(\.so\.[[0-9]]+\.[[0-9]]+|\.so|_pic\.a)$'
-  else
-    lt_cv_deplibs_check_method='match_pattern /lib[[^/]]+(\.so\.[[0-9]]+\.[[0-9]]+|_pic\.a)$'
-  fi
-  ;;
-
-osf3* | osf4* | osf5*)
-  lt_cv_deplibs_check_method=pass_all
-  ;;
-
-rdos*)
-  lt_cv_deplibs_check_method=pass_all
-  ;;
-
-solaris*)
-  lt_cv_deplibs_check_method=pass_all
-  ;;
-
-sysv5* | sco3.2v5* | sco5v6* | unixware* | OpenUNIX* | sysv4*uw2*)
-  lt_cv_deplibs_check_method=pass_all
-  ;;
-
-sysv4 | sysv4.3*)
-  case $host_vendor in
-  motorola)
-    lt_cv_deplibs_check_method='file_magic ELF [[0-9]][[0-9]]*-bit [[ML]]SB (shared object|dynamic lib) M[[0-9]][[0-9]]* Version [[0-9]]'
-    lt_cv_file_magic_test_file=`echo /usr/lib/libc.so*`
-    ;;
-  ncr)
-    lt_cv_deplibs_check_method=pass_all
-    ;;
-  sequent)
-    lt_cv_file_magic_cmd='/bin/file'
-    lt_cv_deplibs_check_method='file_magic ELF [[0-9]][[0-9]]*-bit [[LM]]SB (shared object|dynamic lib )'
-    ;;
-  sni)
-    lt_cv_file_magic_cmd='/bin/file'
-    lt_cv_deplibs_check_method="file_magic ELF [[0-9]][[0-9]]*-bit [[LM]]SB dynamic lib"
-    lt_cv_file_magic_test_file=/lib/libc.so
-    ;;
-  siemens)
-    lt_cv_deplibs_check_method=pass_all
-    ;;
-  pc)
-    lt_cv_deplibs_check_method=pass_all
-    ;;
-  esac
-  ;;
-
-tpf*)
-  lt_cv_deplibs_check_method=pass_all
-  ;;
-esac
-])
-
-file_magic_glob=
-want_nocaseglob=no
-if test "$build" = "$host"; then
-  case $host_os in
-  mingw* | pw32*)
-    if ( shopt | grep nocaseglob ) >/dev/null 2>&1; then
-      want_nocaseglob=yes
-    else
-      file_magic_glob=`echo aAbBcCdDeEfFgGhHiIjJkKlLmMnNoOpPqQrRsStTuUvVwWxXyYzZ | $SED -e "s/\(..\)/s\/[[\1]]\/[[\1]]\/g;/g"`
-    fi
-    ;;
-  esac
-fi
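-# Illustrative note (not upstream): on hosts without bash's nocaseglob, the
-# generated file_magic_glob is a sed script of the form
-#   s/[aA]/[aA]/g; s/[bB]/[bB]/g; ...
-# which rewrites each letter of a filename pattern into a bracket class, so
-# e.g. "foo.dll" becomes "[fF][oO][oO].[dD][lL][lL]" and matches "FOO.DLL".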
-
-file_magic_cmd=$lt_cv_file_magic_cmd
-deplibs_check_method=$lt_cv_deplibs_check_method
-test -z "$deplibs_check_method" && deplibs_check_method=unknown
-
-_LT_DECL([], [deplibs_check_method], [1],
-    [Method to check whether dependent libraries are shared objects])
-_LT_DECL([], [file_magic_cmd], [1],
-    [Command to use when deplibs_check_method = "file_magic"])
-_LT_DECL([], [file_magic_glob], [1],
-    [How to find potential files when deplibs_check_method = "file_magic"])
-_LT_DECL([], [want_nocaseglob], [1],
-    [Find potential files using nocaseglob when deplibs_check_method = "file_magic"])
-])# _LT_CHECK_MAGIC_METHOD
-
-
-# LT_PATH_NM
-# ----------
-# find the pathname to a BSD- or MS-compatible name lister
-AC_DEFUN([LT_PATH_NM],
-[AC_REQUIRE([AC_PROG_CC])dnl
-AC_CACHE_CHECK([for BSD- or MS-compatible name lister (nm)], lt_cv_path_NM,
-[if test -n "$NM"; then
-  # Let the user override the test.
-  lt_cv_path_NM="$NM"
-else
-  lt_nm_to_check="${ac_tool_prefix}nm"
-  if test -n "$ac_tool_prefix" && test "$build" = "$host"; then
-    lt_nm_to_check="$lt_nm_to_check nm"
-  fi
-  for lt_tmp_nm in $lt_nm_to_check; do
-    lt_save_ifs="$IFS"; IFS=$PATH_SEPARATOR
-    for ac_dir in $PATH /usr/ccs/bin/elf /usr/ccs/bin /usr/ucb /bin; do
-      IFS="$lt_save_ifs"
-      test -z "$ac_dir" && ac_dir=.
-      tmp_nm="$ac_dir/$lt_tmp_nm"
-      if test -f "$tmp_nm" || test -f "$tmp_nm$ac_exeext" ; then
-	# Check to see if the nm accepts a BSD-compat flag.
-	# Adding the `sed 1q' prevents false positives on HP-UX, which says:
-	#   nm: unknown option "B" ignored
-	# Tru64's nm complains that /dev/null is an invalid object file
-	case `"$tmp_nm" -B /dev/null 2>&1 | sed '1q'` in
-	*/dev/null* | *'Invalid file or object type'*)
-	  lt_cv_path_NM="$tmp_nm -B"
-	  break
-	  ;;
-	*)
-	  case `"$tmp_nm" -p /dev/null 2>&1 | sed '1q'` in
-	  */dev/null*)
-	    lt_cv_path_NM="$tmp_nm -p"
-	    break
-	    ;;
-	  *)
-	    lt_cv_path_NM=${lt_cv_path_NM="$tmp_nm"} # keep the first match, but
-	    continue # so that we can try to find one that supports BSD flags
-	    ;;
-	  esac
-	  ;;
-	esac
-      fi
-    done
-    IFS="$lt_save_ifs"
-  done
-  : ${lt_cv_path_NM=no}
-fi])
-if test "$lt_cv_path_NM" != "no"; then
-  NM="$lt_cv_path_NM"
-else
-  # Didn't find any BSD-compatible name lister, so look for dumpbin.
-  if test -n "$DUMPBIN"; then :
-    # Let the user override the test.
-  else
-    AC_CHECK_TOOLS(DUMPBIN, [dumpbin "link -dump"], :)
-    case `$DUMPBIN -symbols /dev/null 2>&1 | sed '1q'` in
-    *COFF*)
-      DUMPBIN="$DUMPBIN -symbols"
-      ;;
-    *)
-      DUMPBIN=:
-      ;;
-    esac
-  fi
-  AC_SUBST([DUMPBIN])
-  if test "$DUMPBIN" != ":"; then
-    NM="$DUMPBIN"
-  fi
-fi
-test -z "$NM" && NM=nm
-AC_SUBST([NM])
-_LT_DECL([], [NM], [1], [A BSD- or MS-compatible name lister])dnl
-
-AC_CACHE_CHECK([the name lister ($NM) interface], [lt_cv_nm_interface],
-  [lt_cv_nm_interface="BSD nm"
-  echo "int some_variable = 0;" > conftest.$ac_ext
-  (eval echo "\"\$as_me:$LINENO: $ac_compile\"" >&AS_MESSAGE_LOG_FD)
-  (eval "$ac_compile" 2>conftest.err)
-  cat conftest.err >&AS_MESSAGE_LOG_FD
-  (eval echo "\"\$as_me:$LINENO: $NM \\\"conftest.$ac_objext\\\"\"" >&AS_MESSAGE_LOG_FD)
-  (eval "$NM \"conftest.$ac_objext\"" 2>conftest.err > conftest.out)
-  cat conftest.err >&AS_MESSAGE_LOG_FD
-  (eval echo "\"\$as_me:$LINENO: output\"" >&AS_MESSAGE_LOG_FD)
-  cat conftest.out >&AS_MESSAGE_LOG_FD
-  if $GREP 'External.*some_variable' conftest.out > /dev/null; then
-    lt_cv_nm_interface="MS dumpbin"
-  fi
-  rm -f conftest*])
-])# LT_PATH_NM
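-# Illustrative note (not upstream): the interface probe above keys on output
-# style.  A BSD-style nm prints lines such as
-#   0000000000000000 D some_variable
-# while MS dumpbin -symbols prints lines containing "External | some_variable",
-# which is what the $GREP 'External.*some_variable' test detects.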
-
-# Old names:
-AU_ALIAS([AM_PROG_NM], [LT_PATH_NM])
-AU_ALIAS([AC_PROG_NM], [LT_PATH_NM])
-dnl aclocal-1.4 backwards compatibility:
-dnl AC_DEFUN([AM_PROG_NM], [])
-dnl AC_DEFUN([AC_PROG_NM], [])
-
-# _LT_CHECK_SHAREDLIB_FROM_LINKLIB
-# --------------------------------
-# how to determine the name of the shared library
-# associated with a specific link library.
-#  -- PORTME fill in with the dynamic library characteristics
-m4_defun([_LT_CHECK_SHAREDLIB_FROM_LINKLIB],
-[m4_require([_LT_DECL_EGREP])
-m4_require([_LT_DECL_OBJDUMP])
-m4_require([_LT_DECL_DLLTOOL])
-AC_CACHE_CHECK([how to associate runtime and link libraries],
-lt_cv_sharedlib_from_linklib_cmd,
-[lt_cv_sharedlib_from_linklib_cmd='unknown'
-
-case $host_os in
-cygwin* | mingw* | pw32* | cegcc*)
-  # two different shell functions defined in ltmain.sh
-  # decide which to use based on capabilities of $DLLTOOL
-  case `$DLLTOOL --help 2>&1` in
-  *--identify-strict*)
-    lt_cv_sharedlib_from_linklib_cmd=func_cygming_dll_for_implib
-    ;;
-  *)
-    lt_cv_sharedlib_from_linklib_cmd=func_cygming_dll_for_implib_fallback
-    ;;
-  esac
-  ;;
-*)
-  # fallback: assume linklib IS sharedlib
-  lt_cv_sharedlib_from_linklib_cmd="$ECHO"
-  ;;
-esac
-])
-sharedlib_from_linklib_cmd=$lt_cv_sharedlib_from_linklib_cmd
-test -z "$sharedlib_from_linklib_cmd" && sharedlib_from_linklib_cmd=$ECHO
-
-_LT_DECL([], [sharedlib_from_linklib_cmd], [1],
-    [Command to associate shared and link libraries])
-])# _LT_CHECK_SHAREDLIB_FROM_LINKLIB
-
-
-# _LT_PATH_MANIFEST_TOOL
-# ----------------------
-# locate the manifest tool
-m4_defun([_LT_PATH_MANIFEST_TOOL],
-[AC_CHECK_TOOL(MANIFEST_TOOL, mt, :)
-test -z "$MANIFEST_TOOL" && MANIFEST_TOOL=mt
-AC_CACHE_CHECK([if $MANIFEST_TOOL is a manifest tool], [lt_cv_path_manifest_tool],
-  [lt_cv_path_manifest_tool=no
-  echo "$as_me:$LINENO: $MANIFEST_TOOL '-?'" >&AS_MESSAGE_LOG_FD
-  $MANIFEST_TOOL '-?' 2>conftest.err > conftest.out
-  cat conftest.err >&AS_MESSAGE_LOG_FD
-  if $GREP 'Manifest Tool' conftest.out > /dev/null; then
-    lt_cv_path_manifest_tool=yes
-  fi
-  rm -f conftest*])
-if test "x$lt_cv_path_manifest_tool" != xyes; then
-  MANIFEST_TOOL=:
-fi
-_LT_DECL([], [MANIFEST_TOOL], [1], [Manifest tool])dnl
-])# _LT_PATH_MANIFEST_TOOL
-
-
-# LT_LIB_M
-# --------
-# check for math library
-AC_DEFUN([LT_LIB_M],
-[AC_REQUIRE([AC_CANONICAL_HOST])dnl
-LIBM=
-case $host in
-*-*-beos* | *-*-cegcc* | *-*-cygwin* | *-*-haiku* | *-*-pw32* | *-*-darwin*)
-  # These systems don't have libm, or don't need it.
-  ;;
-*-ncr-sysv4.3*)
-  AC_CHECK_LIB(mw, _mwvalidcheckl, LIBM="-lmw")
-  AC_CHECK_LIB(m, cos, LIBM="$LIBM -lm")
-  ;;
-*)
-  AC_CHECK_LIB(m, cos, LIBM="-lm")
-  ;;
-esac
-AC_SUBST([LIBM])
-])# LT_LIB_M
-
-# Old name:
-AU_ALIAS([AC_CHECK_LIBM], [LT_LIB_M])
-dnl aclocal-1.4 backwards compatibility:
-dnl AC_DEFUN([AC_CHECK_LIBM], [])
-
-
-# _LT_COMPILER_NO_RTTI([TAGNAME])
-# -------------------------------
-m4_defun([_LT_COMPILER_NO_RTTI],
-[m4_require([_LT_TAG_COMPILER])dnl
-
-_LT_TAGVAR(lt_prog_compiler_no_builtin_flag, $1)=
-
-if test "$GCC" = yes; then
-  case $cc_basename in
-  nvcc*)
-    _LT_TAGVAR(lt_prog_compiler_no_builtin_flag, $1)=' -Xcompiler -fno-builtin' ;;
-  *)
-    _LT_TAGVAR(lt_prog_compiler_no_builtin_flag, $1)=' -fno-builtin' ;;
-  esac
-
-  _LT_COMPILER_OPTION([if $compiler supports -fno-rtti -fno-exceptions],
-    lt_cv_prog_compiler_rtti_exceptions,
-    [-fno-rtti -fno-exceptions], [],
-    [_LT_TAGVAR(lt_prog_compiler_no_builtin_flag, $1)="$_LT_TAGVAR(lt_prog_compiler_no_builtin_flag, $1) -fno-rtti -fno-exceptions"])
-fi
-_LT_TAGDECL([no_builtin_flag], [lt_prog_compiler_no_builtin_flag], [1],
-	[Compiler flag to turn off builtin functions])
-])# _LT_COMPILER_NO_RTTI
-
-
-# _LT_CMD_GLOBAL_SYMBOLS
-# ----------------------
-m4_defun([_LT_CMD_GLOBAL_SYMBOLS],
-[AC_REQUIRE([AC_CANONICAL_HOST])dnl
-AC_REQUIRE([AC_PROG_CC])dnl
-AC_REQUIRE([AC_PROG_AWK])dnl
-AC_REQUIRE([LT_PATH_NM])dnl
-AC_REQUIRE([LT_PATH_LD])dnl
-m4_require([_LT_DECL_SED])dnl
-m4_require([_LT_DECL_EGREP])dnl
-m4_require([_LT_TAG_COMPILER])dnl
-
-# Check for command to grab the raw symbol name followed by C symbol from nm.
-AC_MSG_CHECKING([command to parse $NM output from $compiler object])
-AC_CACHE_VAL([lt_cv_sys_global_symbol_pipe],
-[
-# These are sane defaults that work on at least a few old systems.
-# [They come from Ultrix.  What could be older than Ultrix?!! ;)]
-
-# Character class describing NM global symbol codes.
-symcode='[[BCDEGRST]]'
-
-# Regexp to match symbols that can be accessed directly from C.
-sympat='\([[_A-Za-z]][[_A-Za-z0-9]]*\)'
-
-# Define system-specific variables.
-case $host_os in
-aix*)
-  symcode='[[BCDT]]'
-  ;;
-cygwin* | mingw* | pw32* | cegcc*)
-  symcode='[[ABCDGISTW]]'
-  ;;
-hpux*)
-  if test "$host_cpu" = ia64; then
-    symcode='[[ABCDEGRST]]'
-  fi
-  ;;
-irix* | nonstopux*)
-  symcode='[[BCDEGRST]]'
-  ;;
-osf*)
-  symcode='[[BCDEGQRST]]'
-  ;;
-solaris*)
-  symcode='[[BDRT]]'
-  ;;
-sco3.2v5*)
-  symcode='[[DT]]'
-  ;;
-sysv4.2uw2*)
-  symcode='[[DT]]'
-  ;;
-sysv5* | sco5v6* | unixware* | OpenUNIX*)
-  symcode='[[ABDT]]'
-  ;;
-sysv4)
-  symcode='[[DFNSTU]]'
-  ;;
-esac
-
-# If we're using GNU nm, then use its standard symbol codes.
-case `$NM -V 2>&1` in
-*GNU* | *'with BFD'*)
-  symcode='[[ABCDGIRSTW]]' ;;
-esac
-
-# Transform an extracted symbol line into a proper C declaration.
-# Some systems (esp. on ia64) link data and code symbols differently,
-# so use this general approach.
-lt_cv_sys_global_symbol_to_cdecl="sed -n -e 's/^T .* \(.*\)$/extern int \1();/p' -e 's/^$symcode* .* \(.*\)$/extern char \1;/p'"
-
-# Transform an extracted symbol line into symbol name and symbol address
-lt_cv_sys_global_symbol_to_c_name_address="sed -n -e 's/^: \([[^ ]]*\)[[ ]]*$/  {\\\"\1\\\", (void *) 0},/p' -e 's/^$symcode* \([[^ ]]*\) \([[^ ]]*\)$/  {\"\2\", (void *) \&\2},/p'"
-lt_cv_sys_global_symbol_to_c_name_address_lib_prefix="sed -n -e 's/^: \([[^ ]]*\)[[ ]]*$/  {\\\"\1\\\", (void *) 0},/p' -e 's/^$symcode* \([[^ ]]*\) \(lib[[^ ]]*\)$/  {\"\2\", (void *) \&\2},/p' -e 's/^$symcode* \([[^ ]]*\) \([[^ ]]*\)$/  {\"lib\2\", (void *) \&\2},/p'"
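-# Worked example (illustrative, not upstream): given an nm output line like
-#   0000000000601040 D nm_test_var
-# the symbol pipe emits "D nm_test_var nm_test_var"; to_cdecl then yields
-# "extern char nm_test_var;" and to_c_name_address yields
-#   {"nm_test_var", (void *) &nm_test_var},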
-
-# Handle CRLF in mingw tool chain
-opt_cr=
-case $build_os in
-mingw*)
-  opt_cr=`$ECHO 'x\{0,1\}' | tr x '\015'` # option cr in regexp
-  ;;
-esac
-
-# Try without a prefix underscore, then with it.
-for ac_symprfx in "" "_"; do
-
-  # Transform symcode, sympat, and symprfx into a raw symbol and a C symbol.
-  symxfrm="\\1 $ac_symprfx\\2 \\2"
-
-  # Write the raw and C identifiers.
-  if test "$lt_cv_nm_interface" = "MS dumpbin"; then
-    # Fake it for dumpbin and say T for any non-static function
-    # and D for any global variable.
-    # Also find C++ and __fastcall symbols from MSVC++,
-    # which start with @ or ?.
-    lt_cv_sys_global_symbol_pipe="$AWK ['"\
-"     {last_section=section; section=\$ 3};"\
-"     /^COFF SYMBOL TABLE/{for(i in hide) delete hide[i]};"\
-"     /Section length .*#relocs.*(pick any)/{hide[last_section]=1};"\
-"     \$ 0!~/External *\|/{next};"\
-"     / 0+ UNDEF /{next}; / UNDEF \([^|]\)*()/{next};"\
-"     {if(hide[section]) next};"\
-"     {f=0}; \$ 0~/\(\).*\|/{f=1}; {printf f ? \"T \" : \"D \"};"\
-"     {split(\$ 0, a, /\||\r/); split(a[2], s)};"\
-"     s[1]~/^[@?]/{print s[1], s[1]; next};"\
-"     s[1]~prfx {split(s[1],t,\"@\"); print t[1], substr(t[1],length(prfx))}"\
-"     ' prfx=^$ac_symprfx]"
-  else
-    lt_cv_sys_global_symbol_pipe="sed -n -e 's/^.*[[	 ]]\($symcode$symcode*\)[[	 ]][[	 ]]*$ac_symprfx$sympat$opt_cr$/$symxfrm/p'"
-  fi
-  lt_cv_sys_global_symbol_pipe="$lt_cv_sys_global_symbol_pipe | sed '/ __gnu_lto/d'"
-
-  # Check to see that the pipe works correctly.
-  pipe_works=no
-
-  rm -f conftest*
-  cat > conftest.$ac_ext <<_LT_EOF
-#ifdef __cplusplus
-extern "C" {
-#endif
-char nm_test_var;
-void nm_test_func(void);
-void nm_test_func(void){}
-#ifdef __cplusplus
-}
-#endif
-int main(){nm_test_var='a';nm_test_func();return(0);}
-_LT_EOF
-
-  if AC_TRY_EVAL(ac_compile); then
-    # Now try to grab the symbols.
-    nlist=conftest.nm
-    if AC_TRY_EVAL(NM conftest.$ac_objext \| "$lt_cv_sys_global_symbol_pipe" \> $nlist) && test -s "$nlist"; then
-      # Try sorting and uniquifying the output.
-      if sort "$nlist" | uniq > "$nlist"T; then
-	mv -f "$nlist"T "$nlist"
-      else
-	rm -f "$nlist"T
-      fi
-
-      # Make sure that we snagged all the symbols we need.
-      if $GREP ' nm_test_var$' "$nlist" >/dev/null; then
-	if $GREP ' nm_test_func$' "$nlist" >/dev/null; then
-	  cat <<_LT_EOF > conftest.$ac_ext
-/* Keep this code in sync between libtool.m4, ltmain, lt_system.h, and tests.  */
-#if defined(_WIN32) || defined(__CYGWIN__) || defined(_WIN32_WCE)
-/* DATA imports from DLLs on WIN32 can't be const, because runtime
-   relocations are performed -- see ld's documentation on pseudo-relocs.  */
-# define LT@&t@_DLSYM_CONST
-#elif defined(__osf__)
-/* This system does not cope well with relocations in const data.  */
-# define LT@&t@_DLSYM_CONST
-#else
-# define LT@&t@_DLSYM_CONST const
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-_LT_EOF
-	  # Now generate the symbol file.
-	  eval "$lt_cv_sys_global_symbol_to_cdecl"' < "$nlist" | $GREP -v main >> conftest.$ac_ext'
-
-	  cat <<_LT_EOF >> conftest.$ac_ext
-
-/* The mapping between symbol names and symbols.  */
-LT@&t@_DLSYM_CONST struct {
-  const char *name;
-  void       *address;
-}
-lt__PROGRAM__LTX_preloaded_symbols[[]] =
-{
-  { "@PROGRAM@", (void *) 0 },
-_LT_EOF
-	  $SED "s/^$symcode$symcode* \(.*\) \(.*\)$/  {\"\2\", (void *) \&\2},/" < "$nlist" | $GREP -v main >> conftest.$ac_ext
-	  cat <<\_LT_EOF >> conftest.$ac_ext
-  {0, (void *) 0}
-};
-
-/* This works around a problem in the FreeBSD linker.  */
-#ifdef FREEBSD_WORKAROUND
-static const void *lt_preloaded_setup() {
-  return lt__PROGRAM__LTX_preloaded_symbols;
-}
-#endif
-
-#ifdef __cplusplus
-}
-#endif
-_LT_EOF
-	  # Now try linking the two files.
-	  mv conftest.$ac_objext conftstm.$ac_objext
-	  lt_globsym_save_LIBS=$LIBS
-	  lt_globsym_save_CFLAGS=$CFLAGS
-	  LIBS="conftstm.$ac_objext"
-	  CFLAGS="$CFLAGS$_LT_TAGVAR(lt_prog_compiler_no_builtin_flag, $1)"
-	  if AC_TRY_EVAL(ac_link) && test -s conftest${ac_exeext}; then
-	    pipe_works=yes
-	  fi
-	  LIBS=$lt_globsym_save_LIBS
-	  CFLAGS=$lt_globsym_save_CFLAGS
-	else
-	  echo "cannot find nm_test_func in $nlist" >&AS_MESSAGE_LOG_FD
-	fi
-      else
-	echo "cannot find nm_test_var in $nlist" >&AS_MESSAGE_LOG_FD
-      fi
-    else
-      echo "cannot run $lt_cv_sys_global_symbol_pipe" >&AS_MESSAGE_LOG_FD
-    fi
-  else
-    echo "$progname: failed program was:" >&AS_MESSAGE_LOG_FD
-    cat conftest.$ac_ext >&5
-  fi
-  rm -rf conftest* conftst*
-
-  # Do not use the global_symbol_pipe unless it works.
-  if test "$pipe_works" = yes; then
-    break
-  else
-    lt_cv_sys_global_symbol_pipe=
-  fi
-done
-])
-if test -z "$lt_cv_sys_global_symbol_pipe"; then
-  lt_cv_sys_global_symbol_to_cdecl=
-fi
-if test -z "$lt_cv_sys_global_symbol_pipe$lt_cv_sys_global_symbol_to_cdecl"; then
-  AC_MSG_RESULT(failed)
-else
-  AC_MSG_RESULT(ok)
-fi
-
-# Response file support.
-if test "$lt_cv_nm_interface" = "MS dumpbin"; then
-  nm_file_list_spec='@'
-elif $NM --help 2>/dev/null | grep '[[@]]FILE' >/dev/null; then
-  nm_file_list_spec='@'
-fi
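-# Illustrative note (not upstream): when nm_file_list_spec='@', a long list
-# of input objects can be passed via a response file instead of the command
-# line, e.g.
-#   $NM @objects.txt
-# where objects.txt names one input file per line.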
-
-_LT_DECL([global_symbol_pipe], [lt_cv_sys_global_symbol_pipe], [1],
-    [Take the output of nm and produce a listing of raw symbols and C names])
-_LT_DECL([global_symbol_to_cdecl], [lt_cv_sys_global_symbol_to_cdecl], [1],
-    [Transform the output of nm in a proper C declaration])
-_LT_DECL([global_symbol_to_c_name_address],
-    [lt_cv_sys_global_symbol_to_c_name_address], [1],
-    [Transform the output of nm in a C name address pair])
-_LT_DECL([global_symbol_to_c_name_address_lib_prefix],
-    [lt_cv_sys_global_symbol_to_c_name_address_lib_prefix], [1],
-    [Transform the output of nm in a C name address pair when lib prefix is needed])
-_LT_DECL([], [nm_file_list_spec], [1],
-    [Specify filename containing input files for $NM])
-]) # _LT_CMD_GLOBAL_SYMBOLS
-
-
-# _LT_COMPILER_PIC([TAGNAME])
-# ---------------------------
-m4_defun([_LT_COMPILER_PIC],
-[m4_require([_LT_TAG_COMPILER])dnl
-_LT_TAGVAR(lt_prog_compiler_wl, $1)=
-_LT_TAGVAR(lt_prog_compiler_pic, $1)=
-_LT_TAGVAR(lt_prog_compiler_static, $1)=
-
-m4_if([$1], [CXX], [
-  # C++ specific cases for pic, static, wl, etc.
-  if test "$GXX" = yes; then
-    _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
-    _LT_TAGVAR(lt_prog_compiler_static, $1)='-static'
-
-    case $host_os in
-    aix*)
-      # All AIX code is PIC.
-      if test "$host_cpu" = ia64; then
-	# AIX 5 now supports IA64 processor
-	_LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic'
-      fi
-      ;;
-
-    amigaos*)
-      case $host_cpu in
-      powerpc)
-            # see comment about AmigaOS4 .so support
-            _LT_TAGVAR(lt_prog_compiler_pic, $1)='-fPIC'
-        ;;
-      m68k)
-            # FIXME: we need at least 68020 code to build shared libraries, but
-            # adding the `-m68020' flag to GCC prevents building anything better,
-            # like `-m68040'.
-            _LT_TAGVAR(lt_prog_compiler_pic, $1)='-m68020 -resident32 -malways-restore-a4'
-        ;;
-      esac
-      ;;
-
-    beos* | irix5* | irix6* | nonstopux* | osf3* | osf4* | osf5*)
-      # PIC is the default for these OSes.
-      ;;
-    mingw* | cygwin* | os2* | pw32* | cegcc*)
-      # This hack is so that the source file can tell whether it is being
-      # built for inclusion in a dll (and should export symbols for example).
-      # Although the cygwin gcc ignores -fPIC, still need this for old-style
-      # (--disable-auto-import) libraries
-      m4_if([$1], [GCJ], [],
-	[_LT_TAGVAR(lt_prog_compiler_pic, $1)='-DDLL_EXPORT'])
-      ;;
-    darwin* | rhapsody*)
-      # PIC is the default on this platform
-      # Common symbols not allowed in MH_DYLIB files
-      _LT_TAGVAR(lt_prog_compiler_pic, $1)='-fno-common'
-      ;;
-    *djgpp*)
-      # DJGPP does not support shared libraries at all
-      _LT_TAGVAR(lt_prog_compiler_pic, $1)=
-      ;;
-    haiku*)
-      # PIC is the default for Haiku.
-      # The "-static" flag exists, but is broken.
-      _LT_TAGVAR(lt_prog_compiler_static, $1)=
-      ;;
-    interix[[3-9]]*)
-      # Interix 3.x gcc -fpic/-fPIC options generate broken code.
-      # Instead, we relocate shared libraries at runtime.
-      ;;
-    sysv4*MP*)
-      if test -d /usr/nec; then
-	_LT_TAGVAR(lt_prog_compiler_pic, $1)=-Kconform_pic
-      fi
-      ;;
-    hpux*)
-      # PIC is the default for 64-bit PA HP-UX, but not for 32-bit
-      # PA HP-UX.  On IA64 HP-UX, PIC is the default but the pic flag
-      # sets the default TLS model and affects inlining.
-      case $host_cpu in
-      hppa*64*)
-	;;
-      *)
-	_LT_TAGVAR(lt_prog_compiler_pic, $1)='-fPIC'
-	;;
-      esac
-      ;;
-    *qnx* | *nto*)
-      # QNX uses GNU C++, but need to define -shared option too, otherwise
-      # it will coredump.
-      _LT_TAGVAR(lt_prog_compiler_pic, $1)='-fPIC -shared'
-      ;;
-    *)
-      _LT_TAGVAR(lt_prog_compiler_pic, $1)='-fPIC'
-      ;;
-    esac
-  else
-    case $host_os in
-      aix[[4-9]]*)
-	# All AIX code is PIC.
-	if test "$host_cpu" = ia64; then
-	  # AIX 5 now supports IA64 processor
-	  _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic'
-	else
-	  _LT_TAGVAR(lt_prog_compiler_static, $1)='-bnso -bI:/lib/syscalls.exp'
-	fi
-	;;
-      chorus*)
-	case $cc_basename in
-	cxch68*)
-	  # Green Hills C++ Compiler
-	  # _LT_TAGVAR(lt_prog_compiler_static, $1)="--no_auto_instantiation -u __main -u __premain -u _abort -r $COOL_DIR/lib/libOrb.a $MVME_DIR/lib/CC/libC.a $MVME_DIR/lib/classix/libcx.s.a"
-	  ;;
-	esac
-	;;
-      mingw* | cygwin* | os2* | pw32* | cegcc*)
-	# This hack is so that the source file can tell whether it is being
-	# built for inclusion in a dll (and should export symbols for example).
-	m4_if([$1], [GCJ], [],
-	  [_LT_TAGVAR(lt_prog_compiler_pic, $1)='-DDLL_EXPORT'])
-	;;
-      dgux*)
-	case $cc_basename in
-	  ec++*)
-	    _LT_TAGVAR(lt_prog_compiler_pic, $1)='-KPIC'
-	    ;;
-	  ghcx*)
-	    # Green Hills C++ Compiler
-	    _LT_TAGVAR(lt_prog_compiler_pic, $1)='-pic'
-	    ;;
-	  *)
-	    ;;
-	esac
-	;;
-      freebsd* | dragonfly*)
-	# FreeBSD uses GNU C++
-	;;
-      hpux9* | hpux10* | hpux11*)
-	case $cc_basename in
-	  CC*)
-	    _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
-	    _LT_TAGVAR(lt_prog_compiler_static, $1)='${wl}-a ${wl}archive'
-	    if test "$host_cpu" != ia64; then
-	      _LT_TAGVAR(lt_prog_compiler_pic, $1)='+Z'
-	    fi
-	    ;;
-	  aCC*)
-	    _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
-	    _LT_TAGVAR(lt_prog_compiler_static, $1)='${wl}-a ${wl}archive'
-	    case $host_cpu in
-	    hppa*64*|ia64*)
-	      # +Z the default
-	      ;;
-	    *)
-	      _LT_TAGVAR(lt_prog_compiler_pic, $1)='+Z'
-	      ;;
-	    esac
-	    ;;
-	  *)
-	    ;;
-	esac
-	;;
-      interix*)
-	# This is c89, which is MS Visual C++ (no shared libs)
-	# Anyone wants to do a port?
-	;;
-      irix5* | irix6* | nonstopux*)
-	case $cc_basename in
-	  CC*)
-	    _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
-	    _LT_TAGVAR(lt_prog_compiler_static, $1)='-non_shared'
-	    # CC pic flag -KPIC is the default.
-	    ;;
-	  *)
-	    ;;
-	esac
-	;;
-      linux* | k*bsd*-gnu | kopensolaris*-gnu | gnu*)
-	case $cc_basename in
-	  KCC*)
-	    # KAI C++ Compiler
-	    _LT_TAGVAR(lt_prog_compiler_wl, $1)='--backend -Wl,'
-	    _LT_TAGVAR(lt_prog_compiler_pic, $1)='-fPIC'
-	    ;;
-	  ecpc* )
-	    # old Intel C++ for x86_64 which still supported -KPIC.
-	    _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
-	    _LT_TAGVAR(lt_prog_compiler_pic, $1)='-KPIC'
-	    _LT_TAGVAR(lt_prog_compiler_static, $1)='-static'
-	    ;;
-	  icpc* )
-	    # Intel C++, used to be incompatible with GCC.
-	    # ICC 10 doesn't accept -KPIC any more.
-	    _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
-	    _LT_TAGVAR(lt_prog_compiler_pic, $1)='-fPIC'
-	    _LT_TAGVAR(lt_prog_compiler_static, $1)='-static'
-	    ;;
-	  pgCC* | pgcpp*)
-	    # Portland Group C++ compiler
-	    _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
-	    _LT_TAGVAR(lt_prog_compiler_pic, $1)='-fpic'
-	    _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic'
-	    ;;
-	  cxx*)
-	    # Compaq C++
-	    # Make sure the PIC flag is empty.  It appears that all Alpha
-	    # Linux and Compaq Tru64 Unix objects are PIC.
-	    _LT_TAGVAR(lt_prog_compiler_pic, $1)=
-	    _LT_TAGVAR(lt_prog_compiler_static, $1)='-non_shared'
-	    ;;
-	  xlc* | xlC* | bgxl[[cC]]* | mpixl[[cC]]*)
-	    # IBM XL 8.0, 9.0 on PPC and BlueGene
-	    _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
-	    _LT_TAGVAR(lt_prog_compiler_pic, $1)='-qpic'
-	    _LT_TAGVAR(lt_prog_compiler_static, $1)='-qstaticlink'
-	    ;;
-	  *)
-	    case `$CC -V 2>&1 | sed 5q` in
-	    *Sun\ C*)
-	      # Sun C++ 5.9
-	      _LT_TAGVAR(lt_prog_compiler_pic, $1)='-KPIC'
-	      _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic'
-	      _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Qoption ld '
-	      ;;
-	    esac
-	    ;;
-	esac
-	;;
-      lynxos*)
-	;;
-      m88k*)
-	;;
-      mvs*)
-	case $cc_basename in
-	  cxx*)
-	    _LT_TAGVAR(lt_prog_compiler_pic, $1)='-W c,exportall'
-	    ;;
-	  *)
-	    ;;
-	esac
-	;;
-      netbsd* | netbsdelf*-gnu)
-	;;
-      *qnx* | *nto*)
-        # QNX uses GNU C++, but need to define -shared option too, otherwise
-        # it will coredump.
-        _LT_TAGVAR(lt_prog_compiler_pic, $1)='-fPIC -shared'
-        ;;
-      osf3* | osf4* | osf5*)
-	case $cc_basename in
-	  KCC*)
-	    _LT_TAGVAR(lt_prog_compiler_wl, $1)='--backend -Wl,'
-	    ;;
-	  RCC*)
-	    # Rational C++ 2.4.1
-	    _LT_TAGVAR(lt_prog_compiler_pic, $1)='-pic'
-	    ;;
-	  cxx*)
-	    # Digital/Compaq C++
-	    _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
-	    # Make sure the PIC flag is empty.  It appears that all Alpha
-	    # Linux and Compaq Tru64 Unix objects are PIC.
-	    _LT_TAGVAR(lt_prog_compiler_pic, $1)=
-	    _LT_TAGVAR(lt_prog_compiler_static, $1)='-non_shared'
-	    ;;
-	  *)
-	    ;;
-	esac
-	;;
-      psos*)
-	;;
-      solaris*)
-	case $cc_basename in
-	  CC* | sunCC*)
-	    # Sun C++ 4.2, 5.x and Centerline C++
-	    _LT_TAGVAR(lt_prog_compiler_pic, $1)='-KPIC'
-	    _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic'
-	    _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Qoption ld '
-	    ;;
-	  gcx*)
-	    # Green Hills C++ Compiler
-	    _LT_TAGVAR(lt_prog_compiler_pic, $1)='-PIC'
-	    ;;
-	  *)
-	    ;;
-	esac
-	;;
-      sunos4*)
-	case $cc_basename in
-	  CC*)
-	    # Sun C++ 4.x
-	    _LT_TAGVAR(lt_prog_compiler_pic, $1)='-pic'
-	    _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic'
-	    ;;
-	  lcc*)
-	    # Lucid
-	    _LT_TAGVAR(lt_prog_compiler_pic, $1)='-pic'
-	    ;;
-	  *)
-	    ;;
-	esac
-	;;
-      sysv5* | unixware* | sco3.2v5* | sco5v6* | OpenUNIX*)
-	case $cc_basename in
-	  CC*)
-	    _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
-	    _LT_TAGVAR(lt_prog_compiler_pic, $1)='-KPIC'
-	    _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic'
-	    ;;
-	esac
-	;;
-      tandem*)
-	case $cc_basename in
-	  NCC*)
-	    # NonStop-UX NCC 3.20
-	    _LT_TAGVAR(lt_prog_compiler_pic, $1)='-KPIC'
-	    ;;
-	  *)
-	    ;;
-	esac
-	;;
-      vxworks*)
-	;;
-      *)
-	_LT_TAGVAR(lt_prog_compiler_can_build_shared, $1)=no
-	;;
-    esac
-  fi
-],
-[
-  if test "$GCC" = yes; then
-    _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
-    _LT_TAGVAR(lt_prog_compiler_static, $1)='-static'
-
-    case $host_os in
-      aix*)
-      # All AIX code is PIC.
-      if test "$host_cpu" = ia64; then
-	# AIX 5 now supports IA64 processor
-	_LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic'
-      fi
-      ;;
-
-    amigaos*)
-      case $host_cpu in
-      powerpc)
-            # see comment about AmigaOS4 .so support
-            _LT_TAGVAR(lt_prog_compiler_pic, $1)='-fPIC'
-        ;;
-      m68k)
-            # FIXME: we need at least 68020 code to build shared libraries, but
-            # adding the `-m68020' flag to GCC prevents building anything better,
-            # like `-m68040'.
-            _LT_TAGVAR(lt_prog_compiler_pic, $1)='-m68020 -resident32 -malways-restore-a4'
-        ;;
-      esac
-      ;;
-
-    beos* | irix5* | irix6* | nonstopux* | osf3* | osf4* | osf5*)
-      # PIC is the default for these OSes.
-      ;;
-
-    mingw* | cygwin* | pw32* | os2* | cegcc*)
-      # This hack is so that the source file can tell whether it is being
-      # built for inclusion in a dll (and should export symbols for example).
-      # Although the cygwin gcc ignores -fPIC, still need this for old-style
-      # (--disable-auto-import) libraries
-      m4_if([$1], [GCJ], [],
-	[_LT_TAGVAR(lt_prog_compiler_pic, $1)='-DDLL_EXPORT'])
-      ;;
-
-    darwin* | rhapsody*)
-      # PIC is the default on this platform
-      # Common symbols not allowed in MH_DYLIB files
-      _LT_TAGVAR(lt_prog_compiler_pic, $1)='-fno-common'
-      ;;
-
-    haiku*)
-      # PIC is the default for Haiku.
-      # The "-static" flag exists, but is broken.
-      _LT_TAGVAR(lt_prog_compiler_static, $1)=
-      ;;
-
-    hpux*)
-      # PIC is the default for 64-bit PA HP-UX, but not for 32-bit
-      # PA HP-UX.  On IA64 HP-UX, PIC is the default but the pic flag
-      # sets the default TLS model and affects inlining.
-      case $host_cpu in
-      hppa*64*)
-	# +Z the default
-	;;
-      *)
-	_LT_TAGVAR(lt_prog_compiler_pic, $1)='-fPIC'
-	;;
-      esac
-      ;;
-
-    interix[[3-9]]*)
-      # Interix 3.x gcc -fpic/-fPIC options generate broken code.
-      # Instead, we relocate shared libraries at runtime.
-      ;;
-
-    msdosdjgpp*)
-      # Just because we use GCC doesn't mean we suddenly get shared libraries
-      # on systems that don't support them.
-      _LT_TAGVAR(lt_prog_compiler_can_build_shared, $1)=no
-      enable_shared=no
-      ;;
-
-    *nto* | *qnx*)
-      # QNX uses GNU C++, but need to define -shared option too, otherwise
-      # it will coredump.
-      _LT_TAGVAR(lt_prog_compiler_pic, $1)='-fPIC -shared'
-      ;;
-
-    sysv4*MP*)
-      if test -d /usr/nec; then
-	_LT_TAGVAR(lt_prog_compiler_pic, $1)=-Kconform_pic
-      fi
-      ;;
-
-    *)
-      _LT_TAGVAR(lt_prog_compiler_pic, $1)='-fPIC'
-      ;;
-    esac
-
-    case $cc_basename in
-    nvcc*) # Cuda Compiler Driver 2.2
-      _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Xlinker '
-      if test -n "$_LT_TAGVAR(lt_prog_compiler_pic, $1)"; then
-        _LT_TAGVAR(lt_prog_compiler_pic, $1)="-Xcompiler $_LT_TAGVAR(lt_prog_compiler_pic, $1)"
-      fi
-      ;;
-    esac
-  else
-    # PORTME Check for flag to pass linker flags through the system compiler.
-    case $host_os in
-    aix*)
-      _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
-      if test "$host_cpu" = ia64; then
-	# AIX 5 now supports IA64 processor
-	_LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic'
-      else
-	_LT_TAGVAR(lt_prog_compiler_static, $1)='-bnso -bI:/lib/syscalls.exp'
-      fi
-      ;;
-
-    mingw* | cygwin* | pw32* | os2* | cegcc*)
-      # This hack is so that the source file can tell whether it is being
-      # built for inclusion in a dll (and should export symbols for example).
-      m4_if([$1], [GCJ], [],
-	[_LT_TAGVAR(lt_prog_compiler_pic, $1)='-DDLL_EXPORT'])
-      ;;
-
-    hpux9* | hpux10* | hpux11*)
-      _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
-      # PIC is the default for IA64 HP-UX and 64-bit HP-UX, but
-      # not for PA HP-UX.
-      case $host_cpu in
-      hppa*64*|ia64*)
-	# +Z the default
-	;;
-      *)
-	_LT_TAGVAR(lt_prog_compiler_pic, $1)='+Z'
-	;;
-      esac
-      # Is there a better lt_prog_compiler_static that works with the bundled CC?
-      _LT_TAGVAR(lt_prog_compiler_static, $1)='${wl}-a ${wl}archive'
-      ;;
-
-    irix5* | irix6* | nonstopux*)
-      _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
-      # PIC (with -KPIC) is the default.
-      _LT_TAGVAR(lt_prog_compiler_static, $1)='-non_shared'
-      ;;
-
-    linux* | k*bsd*-gnu | kopensolaris*-gnu | gnu*)
-      case $cc_basename in
-      # old Intel for x86_64 which still supported -KPIC.
-      ecc*)
-	_LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
-	_LT_TAGVAR(lt_prog_compiler_pic, $1)='-KPIC'
-	_LT_TAGVAR(lt_prog_compiler_static, $1)='-static'
-        ;;
-      # icc used to be incompatible with GCC.
-      # ICC 10 doesn't accept -KPIC any more.
-      icc* | ifort*)
-	_LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
-	_LT_TAGVAR(lt_prog_compiler_pic, $1)='-fPIC'
-	_LT_TAGVAR(lt_prog_compiler_static, $1)='-static'
-        ;;
-      # Lahey Fortran 8.1.
-      lf95*)
-	_LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
-	_LT_TAGVAR(lt_prog_compiler_pic, $1)='--shared'
-	_LT_TAGVAR(lt_prog_compiler_static, $1)='--static'
-	;;
-      nagfor*)
-	# NAG Fortran compiler
-	_LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,-Wl,,'
-	_LT_TAGVAR(lt_prog_compiler_pic, $1)='-PIC'
-	_LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic'
-	;;
-      pgcc* | pgf77* | pgf90* | pgf95* | pgfortran*)
-        # Portland Group compilers (*not* the Pentium gcc compiler,
-	# which looks to be a dead project)
-	_LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
-	_LT_TAGVAR(lt_prog_compiler_pic, $1)='-fpic'
-	_LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic'
-        ;;
-      ccc*)
-        _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
-        # All Alpha code is PIC.
-        _LT_TAGVAR(lt_prog_compiler_static, $1)='-non_shared'
-        ;;
-      xl* | bgxl* | bgf* | mpixl*)
-	# IBM XL C 8.0/Fortran 10.1, 11.1 on PPC and BlueGene
-	_LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
-	_LT_TAGVAR(lt_prog_compiler_pic, $1)='-qpic'
-	_LT_TAGVAR(lt_prog_compiler_static, $1)='-qstaticlink'
-	;;
-      *)
-	case `$CC -V 2>&1 | sed 5q` in
-	*Sun\ Ceres\ Fortran* | *Sun*Fortran*\ [[1-7]].* | *Sun*Fortran*\ 8.[[0-3]]*)
-	  # Sun Fortran 8.3 passes all unrecognized flags to the linker
-	  _LT_TAGVAR(lt_prog_compiler_pic, $1)='-KPIC'
-	  _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic'
-	  _LT_TAGVAR(lt_prog_compiler_wl, $1)=''
-	  ;;
-	*Sun\ F* | *Sun*Fortran*)
-	  _LT_TAGVAR(lt_prog_compiler_pic, $1)='-KPIC'
-	  _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic'
-	  _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Qoption ld '
-	  ;;
-	*Sun\ C*)
-	  # Sun C 5.9
-	  _LT_TAGVAR(lt_prog_compiler_pic, $1)='-KPIC'
-	  _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic'
-	  _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
-	  ;;
-        *Intel*\ [[CF]]*Compiler*)
-	  _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
-	  _LT_TAGVAR(lt_prog_compiler_pic, $1)='-fPIC'
-	  _LT_TAGVAR(lt_prog_compiler_static, $1)='-static'
-	  ;;
-	*Portland\ Group*)
-	  _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
-	  _LT_TAGVAR(lt_prog_compiler_pic, $1)='-fpic'
-	  _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic'
-	  ;;
-	esac
-	;;
-      esac
-      ;;
-
-    newsos6)
-      _LT_TAGVAR(lt_prog_compiler_pic, $1)='-KPIC'
-      _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic'
-      ;;
-
-    *nto* | *qnx*)
-      # QNX uses GNU C++, but need to define -shared option too, otherwise
-      # it will coredump.
-      _LT_TAGVAR(lt_prog_compiler_pic, $1)='-fPIC -shared'
-      ;;
-
-    osf3* | osf4* | osf5*)
-      _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
-      # All OSF/1 code is PIC.
-      _LT_TAGVAR(lt_prog_compiler_static, $1)='-non_shared'
-      ;;
-
-    rdos*)
-      _LT_TAGVAR(lt_prog_compiler_static, $1)='-non_shared'
-      ;;
-
-    solaris*)
-      _LT_TAGVAR(lt_prog_compiler_pic, $1)='-KPIC'
-      _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic'
-      case $cc_basename in
-      f77* | f90* | f95* | sunf77* | sunf90* | sunf95*)
-	_LT_TAGVAR(lt_prog_compiler_wl, $1)='-Qoption ld ';;
-      *)
-	_LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,';;
-      esac
-      ;;
-
-    sunos4*)
-      _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Qoption ld '
-      _LT_TAGVAR(lt_prog_compiler_pic, $1)='-PIC'
-      _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic'
-      ;;
-
-    sysv4 | sysv4.2uw2* | sysv4.3*)
-      _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
-      _LT_TAGVAR(lt_prog_compiler_pic, $1)='-KPIC'
-      _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic'
-      ;;
-
-    sysv4*MP*)
-      if test -d /usr/nec ;then
-	_LT_TAGVAR(lt_prog_compiler_pic, $1)='-Kconform_pic'
-	_LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic'
-      fi
-      ;;
-
-    sysv5* | unixware* | sco3.2v5* | sco5v6* | OpenUNIX*)
-      _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
-      _LT_TAGVAR(lt_prog_compiler_pic, $1)='-KPIC'
-      _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic'
-      ;;
-
-    unicos*)
-      _LT_TAGVAR(lt_prog_compiler_wl, $1)='-Wl,'
-      _LT_TAGVAR(lt_prog_compiler_can_build_shared, $1)=no
-      ;;
-
-    uts4*)
-      _LT_TAGVAR(lt_prog_compiler_pic, $1)='-pic'
-      _LT_TAGVAR(lt_prog_compiler_static, $1)='-Bstatic'
-      ;;
-
-    *)
-      _LT_TAGVAR(lt_prog_compiler_can_build_shared, $1)=no
-      ;;
-    esac
-  fi
-])
-case $host_os in
-  # For platforms which do not support PIC, -DPIC is meaningless:
-  *djgpp*)
-    _LT_TAGVAR(lt_prog_compiler_pic, $1)=
-    ;;
-  *)
-    _LT_TAGVAR(lt_prog_compiler_pic, $1)="$_LT_TAGVAR(lt_prog_compiler_pic, $1)@&t at m4_if([$1],[],[ -DPIC],[m4_if([$1],[CXX],[ -DPIC],[])])"
-    ;;
-esac
-
-AC_CACHE_CHECK([for $compiler option to produce PIC],
-  [_LT_TAGVAR(lt_cv_prog_compiler_pic, $1)],
-  [_LT_TAGVAR(lt_cv_prog_compiler_pic, $1)=$_LT_TAGVAR(lt_prog_compiler_pic, $1)])
-_LT_TAGVAR(lt_prog_compiler_pic, $1)=$_LT_TAGVAR(lt_cv_prog_compiler_pic, $1)
-
-#
-# Check to make sure the PIC flag actually works.
-#
-if test -n "$_LT_TAGVAR(lt_prog_compiler_pic, $1)"; then
-  _LT_COMPILER_OPTION([if $compiler PIC flag $_LT_TAGVAR(lt_prog_compiler_pic, $1) works],
-    [_LT_TAGVAR(lt_cv_prog_compiler_pic_works, $1)],
-    [$_LT_TAGVAR(lt_prog_compiler_pic, $1)@&t@m4_if([$1],[],[ -DPIC],[m4_if([$1],[CXX],[ -DPIC],[])])], [],
-    [case $_LT_TAGVAR(lt_prog_compiler_pic, $1) in
-     "" | " "*) ;;
-     *) _LT_TAGVAR(lt_prog_compiler_pic, $1)=" $_LT_TAGVAR(lt_prog_compiler_pic, $1)" ;;
-     esac],
-    [_LT_TAGVAR(lt_prog_compiler_pic, $1)=
-     _LT_TAGVAR(lt_prog_compiler_can_build_shared, $1)=no])
-fi
-_LT_TAGDECL([pic_flag], [lt_prog_compiler_pic], [1],
-	[Additional compiler flags for building library objects])
-
-_LT_TAGDECL([wl], [lt_prog_compiler_wl], [1],
-	[How to pass a linker flag through the compiler])
-#
-# Check to make sure the static flag actually works.
-#
-wl=$_LT_TAGVAR(lt_prog_compiler_wl, $1) eval lt_tmp_static_flag=\"$_LT_TAGVAR(lt_prog_compiler_static, $1)\"
-_LT_LINKER_OPTION([if $compiler static flag $lt_tmp_static_flag works],
-  _LT_TAGVAR(lt_cv_prog_compiler_static_works, $1),
-  $lt_tmp_static_flag,
-  [],
-  [_LT_TAGVAR(lt_prog_compiler_static, $1)=])
-_LT_TAGDECL([link_static_flag], [lt_prog_compiler_static], [1],
-	[Compiler flag to prevent dynamic linking])
-])# _LT_COMPILER_PIC
-
-
-# _LT_LINKER_SHLIBS([TAGNAME])
-# ----------------------------
-# See if the linker supports building shared libraries.
-m4_defun([_LT_LINKER_SHLIBS],
-[AC_REQUIRE([LT_PATH_LD])dnl
-AC_REQUIRE([LT_PATH_NM])dnl
-m4_require([_LT_PATH_MANIFEST_TOOL])dnl
-m4_require([_LT_FILEUTILS_DEFAULTS])dnl
-m4_require([_LT_DECL_EGREP])dnl
-m4_require([_LT_DECL_SED])dnl
-m4_require([_LT_CMD_GLOBAL_SYMBOLS])dnl
-m4_require([_LT_TAG_COMPILER])dnl
-AC_MSG_CHECKING([whether the $compiler linker ($LD) supports shared libraries])
-m4_if([$1], [CXX], [
-  _LT_TAGVAR(export_symbols_cmds, $1)='$NM $libobjs $convenience | $global_symbol_pipe | $SED '\''s/.* //'\'' | sort | uniq > $export_symbols'
-  _LT_TAGVAR(exclude_expsyms, $1)=['_GLOBAL_OFFSET_TABLE_|_GLOBAL__F[ID]_.*']
-  case $host_os in
-  aix[[4-9]]*)
-    # If we're using GNU nm, then we don't want the "-C" option.
-    # -C means demangle to AIX nm, but means don't demangle with GNU nm
-    # Also, AIX nm treats weak defined symbols like other global defined
-    # symbols, whereas GNU nm marks them as "W".
-    if $NM -V 2>&1 | $GREP 'GNU' > /dev/null; then
-      _LT_TAGVAR(export_symbols_cmds, $1)='$NM -Bpg $libobjs $convenience | awk '\''{ if (((\$ 2 == "T") || (\$ 2 == "D") || (\$ 2 == "B") || (\$ 2 == "W")) && ([substr](\$ 3,1,1) != ".")) { print \$ 3 } }'\'' | sort -u > $export_symbols'
-    else
-      _LT_TAGVAR(export_symbols_cmds, $1)='$NM -BCpg $libobjs $convenience | awk '\''{ if (((\$ 2 == "T") || (\$ 2 == "D") || (\$ 2 == "B")) && ([substr](\$ 3,1,1) != ".")) { print \$ 3 } }'\'' | sort -u > $export_symbols'
-    fi
-    ;;
-  pw32*)
-    _LT_TAGVAR(export_symbols_cmds, $1)="$ltdll_cmds"
-    ;;
-  cygwin* | mingw* | cegcc*)
-    case $cc_basename in
-    cl*)
-      _LT_TAGVAR(exclude_expsyms, $1)='_NULL_IMPORT_DESCRIPTOR|_IMPORT_DESCRIPTOR_.*'
-      ;;
-    *)
-      _LT_TAGVAR(export_symbols_cmds, $1)='$NM $libobjs $convenience | $global_symbol_pipe | $SED -e '\''/^[[BCDGRS]][[ ]]/s/.*[[ ]]\([[^ ]]*\)/\1 DATA/;s/^.*[[ ]]__nm__\([[^ ]]*\)[[ ]][[^ ]]*/\1 DATA/;/^I[[ ]]/d;/^[[AITW]][[ ]]/s/.* //'\'' | sort | uniq > $export_symbols'
-      _LT_TAGVAR(exclude_expsyms, $1)=['[_]+GLOBAL_OFFSET_TABLE_|[_]+GLOBAL__[FID]_.*|[_]+head_[A-Za-z0-9_]+_dll|[A-Za-z0-9_]+_dll_iname']
-      ;;
-    esac
-    ;;
-  linux* | k*bsd*-gnu | gnu*)
-    _LT_TAGVAR(link_all_deplibs, $1)=no
-    ;;
-  *)
-    _LT_TAGVAR(export_symbols_cmds, $1)='$NM $libobjs $convenience | $global_symbol_pipe | $SED '\''s/.* //'\'' | sort | uniq > $export_symbols'
-    ;;
-  esac
-], [
-  runpath_var=
-  _LT_TAGVAR(allow_undefined_flag, $1)=
-  _LT_TAGVAR(always_export_symbols, $1)=no
-  _LT_TAGVAR(archive_cmds, $1)=
-  _LT_TAGVAR(archive_expsym_cmds, $1)=
-  _LT_TAGVAR(compiler_needs_object, $1)=no
-  _LT_TAGVAR(enable_shared_with_static_runtimes, $1)=no
-  _LT_TAGVAR(export_dynamic_flag_spec, $1)=
-  _LT_TAGVAR(export_symbols_cmds, $1)='$NM $libobjs $convenience | $global_symbol_pipe | $SED '\''s/.* //'\'' | sort | uniq > $export_symbols'
-  _LT_TAGVAR(hardcode_automatic, $1)=no
-  _LT_TAGVAR(hardcode_direct, $1)=no
-  _LT_TAGVAR(hardcode_direct_absolute, $1)=no
-  _LT_TAGVAR(hardcode_libdir_flag_spec, $1)=
-  _LT_TAGVAR(hardcode_libdir_separator, $1)=
-  _LT_TAGVAR(hardcode_minus_L, $1)=no
-  _LT_TAGVAR(hardcode_shlibpath_var, $1)=unsupported
-  _LT_TAGVAR(inherit_rpath, $1)=no
-  _LT_TAGVAR(link_all_deplibs, $1)=unknown
-  _LT_TAGVAR(module_cmds, $1)=
-  _LT_TAGVAR(module_expsym_cmds, $1)=
-  _LT_TAGVAR(old_archive_from_new_cmds, $1)=
-  _LT_TAGVAR(old_archive_from_expsyms_cmds, $1)=
-  _LT_TAGVAR(thread_safe_flag_spec, $1)=
-  _LT_TAGVAR(whole_archive_flag_spec, $1)=
-  # include_expsyms should be a list of space-separated symbols to be *always*
-  # included in the symbol list
-  _LT_TAGVAR(include_expsyms, $1)=
-  # exclude_expsyms can be an extended regexp of symbols to exclude
-  # it will be wrapped by ` (' and `)$', so one must not match beginning or
-  # end of line.  Example: `a|bc|.*d.*' will exclude the symbols `a' and `bc',
-  # as well as any symbol that contains `d'.
-  _LT_TAGVAR(exclude_expsyms, $1)=['_GLOBAL_OFFSET_TABLE_|_GLOBAL__F[ID]_.*']
-  # Although _GLOBAL_OFFSET_TABLE_ is a valid symbol C name, most a.out
-  # platforms (ab)use it in PIC code, but their linkers get confused if
-  # the symbol is explicitly referenced.  Since portable code cannot
-  # rely on this symbol name, it's probably fine to never include it in
-  # preloaded symbol tables.
-  # Exclude shared library initialization/finalization symbols.
-dnl Note also adjust exclude_expsyms for C++ above.
-  extract_expsyms_cmds=
-
-  case $host_os in
-  cygwin* | mingw* | pw32* | cegcc*)
-    # FIXME: the MSVC++ port hasn't been tested in a loooong time
-    # When not using gcc, we currently assume that we are using
-    # Microsoft Visual C++.
-    if test "$GCC" != yes; then
-      with_gnu_ld=no
-    fi
-    ;;
-  interix*)
-    # we just hope/assume this is gcc and not c89 (= MSVC++)
-    with_gnu_ld=yes
-    ;;
-  openbsd*)
-    with_gnu_ld=no
-    ;;
-  linux* | k*bsd*-gnu | gnu*)
-    _LT_TAGVAR(link_all_deplibs, $1)=no
-    ;;
-  esac
-
-  _LT_TAGVAR(ld_shlibs, $1)=yes
-
-  # On some targets, GNU ld is compatible enough with the native linker
-  # that we're better off using the native interface for both.
-  lt_use_gnu_ld_interface=no
-  if test "$with_gnu_ld" = yes; then
-    case $host_os in
-      aix*)
-	# The AIX port of GNU ld has always aspired to compatibility
-	# with the native linker.  However, as the warning in the GNU ld
-	# block says, versions before 2.19.5* couldn't really create working
-	# shared libraries, regardless of the interface used.
-	case `$LD -v 2>&1` in
-	  *\ \(GNU\ Binutils\)\ 2.19.5*) ;;
-	  *\ \(GNU\ Binutils\)\ 2.[[2-9]]*) ;;
-	  *\ \(GNU\ Binutils\)\ [[3-9]]*) ;;
-	  *)
-	    lt_use_gnu_ld_interface=yes
-	    ;;
-	esac
-	;;
-      *)
-	lt_use_gnu_ld_interface=yes
-	;;
-    esac
-  fi
-
-  if test "$lt_use_gnu_ld_interface" = yes; then
-    # If archive_cmds runs LD, not CC, wlarc should be empty
-    wlarc='${wl}'
-
-    # Set some defaults for GNU ld with shared library support. These
-    # are reset later if shared libraries are not supported. Putting them
-    # here allows them to be overridden if necessary.
-    runpath_var=LD_RUN_PATH
-    _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-rpath ${wl}$libdir'
-    _LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}--export-dynamic'
-    # ancient GNU ld didn't support --whole-archive et. al.
-    if $LD --help 2>&1 | $GREP 'no-whole-archive' > /dev/null; then
-      _LT_TAGVAR(whole_archive_flag_spec, $1)="$wlarc"'--whole-archive$convenience '"$wlarc"'--no-whole-archive'
-    else
-      _LT_TAGVAR(whole_archive_flag_spec, $1)=
-    fi
-    supports_anon_versioning=no
-    case `$LD -v 2>&1` in
-      *GNU\ gold*) supports_anon_versioning=yes ;;
-      *\ [[01]].* | *\ 2.[[0-9]].* | *\ 2.10.*) ;; # catch versions < 2.11
-      *\ 2.11.93.0.2\ *) supports_anon_versioning=yes ;; # RH7.3 ...
-      *\ 2.11.92.0.12\ *) supports_anon_versioning=yes ;; # Mandrake 8.2 ...
-      *\ 2.11.*) ;; # other 2.11 versions
-      *) supports_anon_versioning=yes ;;
-    esac
-
-    # See if GNU ld supports shared libraries.
-    case $host_os in
-    aix[[3-9]]*)
-      # On AIX/PPC, the GNU linker is very broken
-      if test "$host_cpu" != ia64; then
-	_LT_TAGVAR(ld_shlibs, $1)=no
-	cat <<_LT_EOF 1>&2
-
-*** Warning: the GNU linker, at least up to release 2.19, is reported
-*** to be unable to reliably create shared libraries on AIX.
-*** Therefore, libtool is disabling shared libraries support.  If you
-*** really care for shared libraries, you may want to install binutils
-*** 2.20 or above, or modify your PATH so that a non-GNU linker is found.
-*** You will then need to restart the configuration process.
-
-_LT_EOF
-      fi
-      ;;
-
-    amigaos*)
-      case $host_cpu in
-      powerpc)
-            # see comment about AmigaOS4 .so support
-            _LT_TAGVAR(archive_cmds, $1)='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
-            _LT_TAGVAR(archive_expsym_cmds, $1)=''
-        ;;
-      m68k)
-            _LT_TAGVAR(archive_cmds, $1)='$RM $output_objdir/a2ixlibrary.data~$ECHO "#define NAME $libname" > $output_objdir/a2ixlibrary.data~$ECHO "#define LIBRARY_ID 1" >> $output_objdir/a2ixlibrary.data~$ECHO "#define VERSION $major" >> $output_objdir/a2ixlibrary.data~$ECHO "#define REVISION $revision" >> $output_objdir/a2ixlibrary.data~$AR $AR_FLAGS $lib $libobjs~$RANLIB $lib~(cd $output_objdir && a2ixlibrary -32)'
-            _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-L$libdir'
-            _LT_TAGVAR(hardcode_minus_L, $1)=yes
-        ;;
-      esac
-      ;;
-
-    beos*)
-      if $LD --help 2>&1 | $GREP ': supported targets:.* elf' > /dev/null; then
-	_LT_TAGVAR(allow_undefined_flag, $1)=unsupported
-	# Joseph Beckenbach <jrb3@best.com> says some releases of gcc
-	# support --undefined.  This deserves some investigation.  FIXME
-	_LT_TAGVAR(archive_cmds, $1)='$CC -nostart $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
-      else
-	_LT_TAGVAR(ld_shlibs, $1)=no
-      fi
-      ;;
-
-    cygwin* | mingw* | pw32* | cegcc*)
-      # _LT_TAGVAR(hardcode_libdir_flag_spec, $1) is actually meaningless,
-      # as there is no search path for DLLs.
-      _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-L$libdir'
-      _LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}--export-all-symbols'
-      _LT_TAGVAR(allow_undefined_flag, $1)=unsupported
-      _LT_TAGVAR(always_export_symbols, $1)=no
-      _LT_TAGVAR(enable_shared_with_static_runtimes, $1)=yes
-      _LT_TAGVAR(export_symbols_cmds, $1)='$NM $libobjs $convenience | $global_symbol_pipe | $SED -e '\''/^[[BCDGRS]][[ ]]/s/.*[[ ]]\([[^ ]]*\)/\1 DATA/;s/^.*[[ ]]__nm__\([[^ ]]*\)[[ ]][[^ ]]*/\1 DATA/;/^I[[ ]]/d;/^[[AITW]][[ ]]/s/.* //'\'' | sort | uniq > $export_symbols'
-      _LT_TAGVAR(exclude_expsyms, $1)=['[_]+GLOBAL_OFFSET_TABLE_|[_]+GLOBAL__[FID]_.*|[_]+head_[A-Za-z0-9_]+_dll|[A-Za-z0-9_]+_dll_iname']
-
-      if $LD --help 2>&1 | $GREP 'auto-import' > /dev/null; then
-        _LT_TAGVAR(archive_cmds, $1)='$CC -shared $libobjs $deplibs $compiler_flags -o $output_objdir/$soname ${wl}--enable-auto-image-base -Xlinker --out-implib -Xlinker $lib'
-	# If the export-symbols file already is a .def file (1st line
-	# is EXPORTS), use it as is; otherwise, prepend...
-	_LT_TAGVAR(archive_expsym_cmds, $1)='if test "x`$SED 1q $export_symbols`" = xEXPORTS; then
-	  cp $export_symbols $output_objdir/$soname.def;
-	else
-	  echo EXPORTS > $output_objdir/$soname.def;
-	  cat $export_symbols >> $output_objdir/$soname.def;
-	fi~
-	$CC -shared $output_objdir/$soname.def $libobjs $deplibs $compiler_flags -o $output_objdir/$soname ${wl}--enable-auto-image-base -Xlinker --out-implib -Xlinker $lib'
-      else
-	_LT_TAGVAR(ld_shlibs, $1)=no
-      fi
-      ;;
-
-    haiku*)
-      _LT_TAGVAR(archive_cmds, $1)='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
-      _LT_TAGVAR(link_all_deplibs, $1)=yes
-      ;;
-
-    interix[[3-9]]*)
-      _LT_TAGVAR(hardcode_direct, $1)=no
-      _LT_TAGVAR(hardcode_shlibpath_var, $1)=no
-      _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-rpath,$libdir'
-      _LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}-E'
-      # Hack: On Interix 3.x, we cannot compile PIC because of a broken gcc.
-      # Instead, shared libraries are loaded at an image base (0x10000000 by
-      # default) and relocated if they conflict, which is a slow very memory
-      # consuming and fragmenting process.  To avoid this, we pick a random,
-      # 256 KiB-aligned image base between 0x50000000 and 0x6FFC0000 at link
-      # time.  Moving up from 0x10000000 also allows more sbrk(2) space.
-      _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-h,$soname ${wl}--image-base,`expr ${RANDOM-$$} % 4096 / 2 \* 262144 + 1342177280` -o $lib'
-      _LT_TAGVAR(archive_expsym_cmds, $1)='sed "s,^,_," $export_symbols >$output_objdir/$soname.expsym~$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-h,$soname ${wl}--retain-symbols-file,$output_objdir/$soname.expsym ${wl}--image-base,`expr ${RANDOM-$$} % 4096 / 2 \* 262144 + 1342177280` -o $lib'
-      ;;
-
-    gnu* | linux* | tpf* | k*bsd*-gnu | kopensolaris*-gnu)
-      tmp_diet=no
-      if test "$host_os" = linux-dietlibc; then
-	case $cc_basename in
-	  diet\ *) tmp_diet=yes;;	# linux-dietlibc with static linking (!diet-dyn)
-	esac
-      fi
-      if $LD --help 2>&1 | $EGREP ': supported targets:.* elf' > /dev/null \
-	 && test "$tmp_diet" = no
-      then
-	tmp_addflag=' $pic_flag'
-	tmp_sharedflag='-shared'
-	case $cc_basename,$host_cpu in
-        pgcc*)				# Portland Group C compiler
-	  _LT_TAGVAR(whole_archive_flag_spec, $1)='${wl}--whole-archive`for conv in $convenience\"\"; do test  -n \"$conv\" && new_convenience=\"$new_convenience,$conv\"; done; func_echo_all \"$new_convenience\"` ${wl}--no-whole-archive'
-	  tmp_addflag=' $pic_flag'
-	  ;;
-	pgf77* | pgf90* | pgf95* | pgfortran*)
-					# Portland Group f77 and f90 compilers
-	  _LT_TAGVAR(whole_archive_flag_spec, $1)='${wl}--whole-archive`for conv in $convenience\"\"; do test  -n \"$conv\" && new_convenience=\"$new_convenience,$conv\"; done; func_echo_all \"$new_convenience\"` ${wl}--no-whole-archive'
-	  tmp_addflag=' $pic_flag -Mnomain' ;;
-	ecc*,ia64* | icc*,ia64*)	# Intel C compiler on ia64
-	  tmp_addflag=' -i_dynamic' ;;
-	efc*,ia64* | ifort*,ia64*)	# Intel Fortran compiler on ia64
-	  tmp_addflag=' -i_dynamic -nofor_main' ;;
-	ifc* | ifort*)			# Intel Fortran compiler
-	  tmp_addflag=' -nofor_main' ;;
-	lf95*)				# Lahey Fortran 8.1
-	  _LT_TAGVAR(whole_archive_flag_spec, $1)=
-	  tmp_sharedflag='--shared' ;;
-	xl[[cC]]* | bgxl[[cC]]* | mpixl[[cC]]*) # IBM XL C 8.0 on PPC (deal with xlf below)
-	  tmp_sharedflag='-qmkshrobj'
-	  tmp_addflag= ;;
-	nvcc*)	# Cuda Compiler Driver 2.2
-	  _LT_TAGVAR(whole_archive_flag_spec, $1)='${wl}--whole-archive`for conv in $convenience\"\"; do test  -n \"$conv\" && new_convenience=\"$new_convenience,$conv\"; done; func_echo_all \"$new_convenience\"` ${wl}--no-whole-archive'
-	  _LT_TAGVAR(compiler_needs_object, $1)=yes
-	  ;;
-	esac
-	case `$CC -V 2>&1 | sed 5q` in
-	*Sun\ C*)			# Sun C 5.9
-	  _LT_TAGVAR(whole_archive_flag_spec, $1)='${wl}--whole-archive`new_convenience=; for conv in $convenience\"\"; do test -z \"$conv\" || new_convenience=\"$new_convenience,$conv\"; done; func_echo_all \"$new_convenience\"` ${wl}--no-whole-archive'
-	  _LT_TAGVAR(compiler_needs_object, $1)=yes
-	  tmp_sharedflag='-G' ;;
-	*Sun\ F*)			# Sun Fortran 8.3
-	  tmp_sharedflag='-G' ;;
-	esac
-	_LT_TAGVAR(archive_cmds, $1)='$CC '"$tmp_sharedflag""$tmp_addflag"' $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
-
-        if test "x$supports_anon_versioning" = xyes; then
-          _LT_TAGVAR(archive_expsym_cmds, $1)='echo "{ global:" > $output_objdir/$libname.ver~
-	    cat $export_symbols | sed -e "s/\(.*\)/\1;/" >> $output_objdir/$libname.ver~
-	    echo "local: *; };" >> $output_objdir/$libname.ver~
-	    $CC '"$tmp_sharedflag""$tmp_addflag"' $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname ${wl}-version-script ${wl}$output_objdir/$libname.ver -o $lib'
-        fi
-
-	case $cc_basename in
-	xlf* | bgf* | bgxlf* | mpixlf*)
-	  # IBM XL Fortran 10.1 on PPC cannot create shared libs itself
-	  _LT_TAGVAR(whole_archive_flag_spec, $1)='--whole-archive$convenience --no-whole-archive'
-	  _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-rpath ${wl}$libdir'
-	  _LT_TAGVAR(archive_cmds, $1)='$LD -shared $libobjs $deplibs $linker_flags -soname $soname -o $lib'
-	  if test "x$supports_anon_versioning" = xyes; then
-	    _LT_TAGVAR(archive_expsym_cmds, $1)='echo "{ global:" > $output_objdir/$libname.ver~
-	      cat $export_symbols | sed -e "s/\(.*\)/\1;/" >> $output_objdir/$libname.ver~
-	      echo "local: *; };" >> $output_objdir/$libname.ver~
-	      $LD -shared $libobjs $deplibs $linker_flags -soname $soname -version-script $output_objdir/$libname.ver -o $lib'
-	  fi
-	  ;;
-	esac
-      else
-        _LT_TAGVAR(ld_shlibs, $1)=no
-      fi
-      ;;
-
-    netbsd* | netbsdelf*-gnu)
-      if echo __ELF__ | $CC -E - | $GREP __ELF__ >/dev/null; then
-	_LT_TAGVAR(archive_cmds, $1)='$LD -Bshareable $libobjs $deplibs $linker_flags -o $lib'
-	wlarc=
-      else
-	_LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
-	_LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
-      fi
-      ;;
-
-    solaris*)
-      if $LD -v 2>&1 | $GREP 'BFD 2\.8' > /dev/null; then
-	_LT_TAGVAR(ld_shlibs, $1)=no
-	cat <<_LT_EOF 1>&2
-
-*** Warning: The releases 2.8.* of the GNU linker cannot reliably
-*** create shared libraries on Solaris systems.  Therefore, libtool
-*** is disabling shared libraries support.  We urge you to upgrade GNU
-*** binutils to release 2.9.1 or newer.  Another option is to modify
-*** your PATH or compiler configuration so that the native linker is
-*** used, and then restart.
-
-_LT_EOF
-      elif $LD --help 2>&1 | $GREP ': supported targets:.* elf' > /dev/null; then
-	_LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
-	_LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
-      else
-	_LT_TAGVAR(ld_shlibs, $1)=no
-      fi
-      ;;
-
-    sysv5* | sco3.2v5* | sco5v6* | unixware* | OpenUNIX*)
-      case `$LD -v 2>&1` in
-        *\ [[01]].* | *\ 2.[[0-9]].* | *\ 2.1[[0-5]].*)
-	_LT_TAGVAR(ld_shlibs, $1)=no
-	cat <<_LT_EOF 1>&2
-
-*** Warning: Releases of the GNU linker prior to 2.16.91.0.3 can not
-*** reliably create shared libraries on SCO systems.  Therefore, libtool
-*** is disabling shared libraries support.  We urge you to upgrade GNU
-*** binutils to release 2.16.91.0.3 or newer.  Another option is to modify
-*** your PATH or compiler configuration so that the native linker is
-*** used, and then restart.
-
-_LT_EOF
-	;;
-	*)
-	  # For security reasons, it is highly recommended that you always
-	  # use absolute paths for naming shared libraries, and exclude the
-	  # DT_RUNPATH tag from executables and libraries.  But doing so
-	  # requires that you compile everything twice, which is a pain.
-	  if $LD --help 2>&1 | $GREP ': supported targets:.* elf' > /dev/null; then
-	    _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-rpath ${wl}$libdir'
-	    _LT_TAGVAR(archive_cmds, $1)='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
-	    _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
-	  else
-	    _LT_TAGVAR(ld_shlibs, $1)=no
-	  fi
-	;;
-      esac
-      ;;
-
-    sunos4*)
-      _LT_TAGVAR(archive_cmds, $1)='$LD -assert pure-text -Bshareable -o $lib $libobjs $deplibs $linker_flags'
-      wlarc=
-      _LT_TAGVAR(hardcode_direct, $1)=yes
-      _LT_TAGVAR(hardcode_shlibpath_var, $1)=no
-      ;;
-
-    *)
-      if $LD --help 2>&1 | $GREP ': supported targets:.* elf' > /dev/null; then
-	_LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
-	_LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
-      else
-	_LT_TAGVAR(ld_shlibs, $1)=no
-      fi
-      ;;
-    esac
-
-    if test "$_LT_TAGVAR(ld_shlibs, $1)" = no; then
-      runpath_var=
-      _LT_TAGVAR(hardcode_libdir_flag_spec, $1)=
-      _LT_TAGVAR(export_dynamic_flag_spec, $1)=
-      _LT_TAGVAR(whole_archive_flag_spec, $1)=
-    fi
-  else
-    # PORTME fill in a description of your system's linker (not GNU ld)
-    case $host_os in
-    aix3*)
-      _LT_TAGVAR(allow_undefined_flag, $1)=unsupported
-      _LT_TAGVAR(always_export_symbols, $1)=yes
-      _LT_TAGVAR(archive_expsym_cmds, $1)='$LD -o $output_objdir/$soname $libobjs $deplibs $linker_flags -bE:$export_symbols -T512 -H512 -bM:SRE~$AR $AR_FLAGS $lib $output_objdir/$soname'
-      # Note: this linker hardcodes the directories in LIBPATH if there
-      # are no directories specified by -L.
-      _LT_TAGVAR(hardcode_minus_L, $1)=yes
-      if test "$GCC" = yes && test -z "$lt_prog_compiler_static"; then
-	# Neither direct hardcoding nor static linking is supported with a
-	# broken collect2.
-	_LT_TAGVAR(hardcode_direct, $1)=unsupported
-      fi
-      ;;
-
-    aix[[4-9]]*)
-      if test "$host_cpu" = ia64; then
-	# On IA64, the linker does run time linking by default, so we don't
-	# have to do anything special.
-	aix_use_runtimelinking=no
-	exp_sym_flag='-Bexport'
-	no_entry_flag=""
-      else
-	# If we're using GNU nm, then we don't want the "-C" option.
-	# -C means demangle to AIX nm, but means don't demangle with GNU nm
-	# Also, AIX nm treats weak defined symbols like other global
-	# defined symbols, whereas GNU nm marks them as "W".
-	if $NM -V 2>&1 | $GREP 'GNU' > /dev/null; then
-	  _LT_TAGVAR(export_symbols_cmds, $1)='$NM -Bpg $libobjs $convenience | awk '\''{ if (((\$ 2 == "T") || (\$ 2 == "D") || (\$ 2 == "B") || (\$ 2 == "W")) && ([substr](\$ 3,1,1) != ".")) { print \$ 3 } }'\'' | sort -u > $export_symbols'
-	else
-	  _LT_TAGVAR(export_symbols_cmds, $1)='$NM -BCpg $libobjs $convenience | awk '\''{ if (((\$ 2 == "T") || (\$ 2 == "D") || (\$ 2 == "B")) && ([substr](\$ 3,1,1) != ".")) { print \$ 3 } }'\'' | sort -u > $export_symbols'
-	fi
-	aix_use_runtimelinking=no
-
-	# Test if we are trying to use run time linking or normal
-	# AIX style linking. If -brtl is somewhere in LDFLAGS, we
-	# need to do runtime linking.
-	case $host_os in aix4.[[23]]|aix4.[[23]].*|aix[[5-9]]*)
-	  for ld_flag in $LDFLAGS; do
-	  if (test $ld_flag = "-brtl" || test $ld_flag = "-Wl,-brtl"); then
-	    aix_use_runtimelinking=yes
-	    break
-	  fi
-	  done
-	  ;;
-	esac
-
-	exp_sym_flag='-bexport'
-	no_entry_flag='-bnoentry'
-      fi
-
-      # When large executables or shared objects are built, AIX ld can
-      # have problems creating the table of contents.  If linking a library
-      # or program results in "error TOC overflow" add -mminimal-toc to
-      # CXXFLAGS/CFLAGS for g++/gcc.  In the cases where that is not
-      # enough to fix the problem, add -Wl,-bbigtoc to LDFLAGS.
-
-      _LT_TAGVAR(archive_cmds, $1)=''
-      _LT_TAGVAR(hardcode_direct, $1)=yes
-      _LT_TAGVAR(hardcode_direct_absolute, $1)=yes
-      _LT_TAGVAR(hardcode_libdir_separator, $1)=':'
-      _LT_TAGVAR(link_all_deplibs, $1)=yes
-      _LT_TAGVAR(file_list_spec, $1)='${wl}-f,'
-
-      if test "$GCC" = yes; then
-	case $host_os in aix4.[[012]]|aix4.[[012]].*)
-	# We only want to do this on AIX 4.2 and lower, the check
-	# below for broken collect2 doesn't work under 4.3+
-	  collect2name=`${CC} -print-prog-name=collect2`
-	  if test -f "$collect2name" &&
-	   strings "$collect2name" | $GREP resolve_lib_name >/dev/null
-	  then
-	  # We have reworked collect2
-	  :
-	  else
-	  # We have old collect2
-	  _LT_TAGVAR(hardcode_direct, $1)=unsupported
-	  # It fails to find uninstalled libraries when the uninstalled
-	  # path is not listed in the libpath.  Setting hardcode_minus_L
-	  # to unsupported forces relinking
-	  _LT_TAGVAR(hardcode_minus_L, $1)=yes
-	  _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-L$libdir'
-	  _LT_TAGVAR(hardcode_libdir_separator, $1)=
-	  fi
-	  ;;
-	esac
-	shared_flag='-shared'
-	if test "$aix_use_runtimelinking" = yes; then
-	  shared_flag="$shared_flag "'${wl}-G'
-	fi
-	_LT_TAGVAR(link_all_deplibs, $1)=no
-      else
-	# not using gcc
-	if test "$host_cpu" = ia64; then
-	# VisualAge C++, Version 5.5 for AIX 5L for IA-64, Beta 3 Release
-	# chokes on -Wl,-G. The following line is correct:
-	  shared_flag='-G'
-	else
-	  if test "$aix_use_runtimelinking" = yes; then
-	    shared_flag='${wl}-G'
-	  else
-	    shared_flag='${wl}-bM:SRE'
-	  fi
-	fi
-      fi
-
-      _LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}-bexpall'
-      # It seems that -bexpall does not export symbols beginning with
-      # underscore (_), so it is better to generate a list of symbols to export.
-      _LT_TAGVAR(always_export_symbols, $1)=yes
-      if test "$aix_use_runtimelinking" = yes; then
-	# Warning - without using the other runtime loading flags (-brtl),
-	# -berok will link without error, but may produce a broken library.
-	_LT_TAGVAR(allow_undefined_flag, $1)='-berok'
-        # Determine the default libpath from the value encoded in an
-        # empty executable.
-        _LT_SYS_MODULE_PATH_AIX([$1])
-        _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-blibpath:$libdir:'"$aix_libpath"
-        _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -o $output_objdir/$soname $libobjs $deplibs '"\${wl}$no_entry_flag"' $compiler_flags `if test "x${allow_undefined_flag}" != "x"; then func_echo_all "${wl}${allow_undefined_flag}"; else :; fi` '"\${wl}$exp_sym_flag:\$export_symbols $shared_flag"
-      else
-	if test "$host_cpu" = ia64; then
-	  _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-R $libdir:/usr/lib:/lib'
-	  _LT_TAGVAR(allow_undefined_flag, $1)="-z nodefs"
-	  _LT_TAGVAR(archive_expsym_cmds, $1)="\$CC $shared_flag"' -o $output_objdir/$soname $libobjs $deplibs '"\${wl}$no_entry_flag"' $compiler_flags ${wl}${allow_undefined_flag} '"\${wl}$exp_sym_flag:\$export_symbols"
-	else
-	 # Determine the default libpath from the value encoded in an
-	 # empty executable.
-	 _LT_SYS_MODULE_PATH_AIX([$1])
-	 _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-blibpath:$libdir:'"$aix_libpath"
-	  # Warning - without using the other run time loading flags,
-	  # -berok will link without error, but may produce a broken library.
-	  _LT_TAGVAR(no_undefined_flag, $1)=' ${wl}-bernotok'
-	  _LT_TAGVAR(allow_undefined_flag, $1)=' ${wl}-berok'
-	  if test "$with_gnu_ld" = yes; then
-	    # We only use this code for GNU lds that support --whole-archive.
-	    _LT_TAGVAR(whole_archive_flag_spec, $1)='${wl}--whole-archive$convenience ${wl}--no-whole-archive'
-	  else
-	    # Exported symbols can be pulled into shared objects from archives
-	    _LT_TAGVAR(whole_archive_flag_spec, $1)='$convenience'
-	  fi
-	  _LT_TAGVAR(archive_cmds_need_lc, $1)=yes
-	  # This is similar to how AIX traditionally builds its shared libraries.
-	  _LT_TAGVAR(archive_expsym_cmds, $1)="\$CC $shared_flag"' -o $output_objdir/$soname $libobjs $deplibs ${wl}-bnoentry $compiler_flags ${wl}-bE:$export_symbols${allow_undefined_flag}~$AR $AR_FLAGS $output_objdir/$libname$release.a $output_objdir/$soname'
-	fi
-      fi
-      ;;
-
-    amigaos*)
-      case $host_cpu in
-      powerpc)
-            # see comment about AmigaOS4 .so support
-            _LT_TAGVAR(archive_cmds, $1)='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
-            _LT_TAGVAR(archive_expsym_cmds, $1)=''
-        ;;
-      m68k)
-            _LT_TAGVAR(archive_cmds, $1)='$RM $output_objdir/a2ixlibrary.data~$ECHO "#define NAME $libname" > $output_objdir/a2ixlibrary.data~$ECHO "#define LIBRARY_ID 1" >> $output_objdir/a2ixlibrary.data~$ECHO "#define VERSION $major" >> $output_objdir/a2ixlibrary.data~$ECHO "#define REVISION $revision" >> $output_objdir/a2ixlibrary.data~$AR $AR_FLAGS $lib $libobjs~$RANLIB $lib~(cd $output_objdir && a2ixlibrary -32)'
-            _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-L$libdir'
-            _LT_TAGVAR(hardcode_minus_L, $1)=yes
-        ;;
-      esac
-      ;;
-
-    bsdi[[45]]*)
-      _LT_TAGVAR(export_dynamic_flag_spec, $1)=-rdynamic
-      ;;
-
-    cygwin* | mingw* | pw32* | cegcc*)
-      # When not using gcc, we currently assume that we are using
-      # Microsoft Visual C++.
-      # hardcode_libdir_flag_spec is actually meaningless, as there is
-      # no search path for DLLs.
-      case $cc_basename in
-      cl*)
-	# Native MSVC
-	_LT_TAGVAR(hardcode_libdir_flag_spec, $1)=' '
-	_LT_TAGVAR(allow_undefined_flag, $1)=unsupported
-	_LT_TAGVAR(always_export_symbols, $1)=yes
-	_LT_TAGVAR(file_list_spec, $1)='@'
-	# Tell ltmain to make .lib files, not .a files.
-	libext=lib
-	# Tell ltmain to make .dll files, not .so files.
-	shrext_cmds=".dll"
-	# FIXME: Setting linknames here is a bad hack.
-	_LT_TAGVAR(archive_cmds, $1)='$CC -o $output_objdir/$soname $libobjs $compiler_flags $deplibs -Wl,-dll~linknames='
-	_LT_TAGVAR(archive_expsym_cmds, $1)='if test "x`$SED 1q $export_symbols`" = xEXPORTS; then
-	    sed -n -e 's/\\\\\\\(.*\\\\\\\)/-link\\\ -EXPORT:\\\\\\\1/' -e '1\\\!p' < $export_symbols > $output_objdir/$soname.exp;
-	  else
-	    sed -e 's/\\\\\\\(.*\\\\\\\)/-link\\\ -EXPORT:\\\\\\\1/' < $export_symbols > $output_objdir/$soname.exp;
-	  fi~
-	  $CC -o $tool_output_objdir$soname $libobjs $compiler_flags $deplibs "@$tool_output_objdir$soname.exp" -Wl,-DLL,-IMPLIB:"$tool_output_objdir$libname.dll.lib"~
-	  linknames='
-	# The linker will not automatically build a static lib if we build a DLL.
-	# _LT_TAGVAR(old_archive_from_new_cmds, $1)='true'
-	_LT_TAGVAR(enable_shared_with_static_runtimes, $1)=yes
-	_LT_TAGVAR(exclude_expsyms, $1)='_NULL_IMPORT_DESCRIPTOR|_IMPORT_DESCRIPTOR_.*'
-	_LT_TAGVAR(export_symbols_cmds, $1)='$NM $libobjs $convenience | $global_symbol_pipe | $SED -e '\''/^[[BCDGRS]][[ ]]/s/.*[[ ]]\([[^ ]]*\)/\1,DATA/'\'' | $SED -e '\''/^[[AITW]][[ ]]/s/.*[[ ]]//'\'' | sort | uniq > $export_symbols'
-	# Don't use ranlib
-	_LT_TAGVAR(old_postinstall_cmds, $1)='chmod 644 $oldlib'
-	_LT_TAGVAR(postlink_cmds, $1)='lt_outputfile="@OUTPUT@"~
-	  lt_tool_outputfile="@TOOL_OUTPUT@"~
-	  case $lt_outputfile in
-	    *.exe|*.EXE) ;;
-	    *)
-	      lt_outputfile="$lt_outputfile.exe"
-	      lt_tool_outputfile="$lt_tool_outputfile.exe"
-	      ;;
-	  esac~
-	  if test "$MANIFEST_TOOL" != ":" && test -f "$lt_outputfile.manifest"; then
-	    $MANIFEST_TOOL -manifest "$lt_tool_outputfile.manifest" -outputresource:"$lt_tool_outputfile" || exit 1;
-	    $RM "$lt_outputfile.manifest";
-	  fi'
-	;;
-      *)
-	# Assume MSVC wrapper
-	_LT_TAGVAR(hardcode_libdir_flag_spec, $1)=' '
-	_LT_TAGVAR(allow_undefined_flag, $1)=unsupported
-	# Tell ltmain to make .lib files, not .a files.
-	libext=lib
-	# Tell ltmain to make .dll files, not .so files.
-	shrext_cmds=".dll"
-	# FIXME: Setting linknames here is a bad hack.
-	_LT_TAGVAR(archive_cmds, $1)='$CC -o $lib $libobjs $compiler_flags `func_echo_all "$deplibs" | $SED '\''s/ -lc$//'\''` -link -dll~linknames='
-	# The linker will automatically build a .lib file if we build a DLL.
-	_LT_TAGVAR(old_archive_from_new_cmds, $1)='true'
-	# FIXME: Should let the user specify the lib program.
-	_LT_TAGVAR(old_archive_cmds, $1)='lib -OUT:$oldlib$oldobjs$old_deplibs'
-	_LT_TAGVAR(enable_shared_with_static_runtimes, $1)=yes
-	;;
-      esac
-      ;;
-
-    darwin* | rhapsody*)
-      _LT_DARWIN_LINKER_FEATURES($1)
-      ;;
-
-    dgux*)
-      _LT_TAGVAR(archive_cmds, $1)='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
-      _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-L$libdir'
-      _LT_TAGVAR(hardcode_shlibpath_var, $1)=no
-      ;;
-
-    # FreeBSD 2.2.[012] allows us to include c++rt0.o to get C++ constructor
-    # support.  Future versions do this automatically, but an explicit c++rt0.o
-    # does not break anything, and helps significantly (at the cost of a little
-    # extra space).
-    freebsd2.2*)
-      _LT_TAGVAR(archive_cmds, $1)='$LD -Bshareable -o $lib $libobjs $deplibs $linker_flags /usr/lib/c++rt0.o'
-      _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-R$libdir'
-      _LT_TAGVAR(hardcode_direct, $1)=yes
-      _LT_TAGVAR(hardcode_shlibpath_var, $1)=no
-      ;;
-
-    # Unfortunately, older versions of FreeBSD 2 do not have this feature.
-    freebsd2.*)
-      _LT_TAGVAR(archive_cmds, $1)='$LD -Bshareable -o $lib $libobjs $deplibs $linker_flags'
-      _LT_TAGVAR(hardcode_direct, $1)=yes
-      _LT_TAGVAR(hardcode_minus_L, $1)=yes
-      _LT_TAGVAR(hardcode_shlibpath_var, $1)=no
-      ;;
-
-    # FreeBSD 3 and greater uses gcc -shared to do shared libraries.
-    freebsd* | dragonfly*)
-      _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag -o $lib $libobjs $deplibs $compiler_flags'
-      _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-R$libdir'
-      _LT_TAGVAR(hardcode_direct, $1)=yes
-      _LT_TAGVAR(hardcode_shlibpath_var, $1)=no
-      ;;
-
-    hpux9*)
-      if test "$GCC" = yes; then
-	_LT_TAGVAR(archive_cmds, $1)='$RM $output_objdir/$soname~$CC -shared $pic_flag ${wl}+b ${wl}$install_libdir -o $output_objdir/$soname $libobjs $deplibs $compiler_flags~test $output_objdir/$soname = $lib || mv $output_objdir/$soname $lib'
-      else
-	_LT_TAGVAR(archive_cmds, $1)='$RM $output_objdir/$soname~$LD -b +b $install_libdir -o $output_objdir/$soname $libobjs $deplibs $linker_flags~test $output_objdir/$soname = $lib || mv $output_objdir/$soname $lib'
-      fi
-      _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}+b ${wl}$libdir'
-      _LT_TAGVAR(hardcode_libdir_separator, $1)=:
-      _LT_TAGVAR(hardcode_direct, $1)=yes
-
-      # hardcode_minus_L: Not really in the search PATH,
-      # but as the default location of the library.
-      _LT_TAGVAR(hardcode_minus_L, $1)=yes
-      _LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}-E'
-      ;;
-
-    hpux10*)
-      if test "$GCC" = yes && test "$with_gnu_ld" = no; then
-	_LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag ${wl}+h ${wl}$soname ${wl}+b ${wl}$install_libdir -o $lib $libobjs $deplibs $compiler_flags'
-      else
-	_LT_TAGVAR(archive_cmds, $1)='$LD -b +h $soname +b $install_libdir -o $lib $libobjs $deplibs $linker_flags'
-      fi
-      if test "$with_gnu_ld" = no; then
-	_LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}+b ${wl}$libdir'
-	_LT_TAGVAR(hardcode_libdir_separator, $1)=:
-	_LT_TAGVAR(hardcode_direct, $1)=yes
-	_LT_TAGVAR(hardcode_direct_absolute, $1)=yes
-	_LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}-E'
-	# hardcode_minus_L: Not really in the search PATH,
-	# but as the default location of the library.
-	_LT_TAGVAR(hardcode_minus_L, $1)=yes
-      fi
-      ;;
-
-    hpux11*)
-      if test "$GCC" = yes && test "$with_gnu_ld" = no; then
-	case $host_cpu in
-	hppa*64*)
-	  _LT_TAGVAR(archive_cmds, $1)='$CC -shared ${wl}+h ${wl}$soname -o $lib $libobjs $deplibs $compiler_flags'
-	  ;;
-	ia64*)
-	  _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag ${wl}+h ${wl}$soname ${wl}+nodefaultrpath -o $lib $libobjs $deplibs $compiler_flags'
-	  ;;
-	*)
-	  _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag ${wl}+h ${wl}$soname ${wl}+b ${wl}$install_libdir -o $lib $libobjs $deplibs $compiler_flags'
-	  ;;
-	esac
-      else
-	case $host_cpu in
-	hppa*64*)
-	  _LT_TAGVAR(archive_cmds, $1)='$CC -b ${wl}+h ${wl}$soname -o $lib $libobjs $deplibs $compiler_flags'
-	  ;;
-	ia64*)
-	  _LT_TAGVAR(archive_cmds, $1)='$CC -b ${wl}+h ${wl}$soname ${wl}+nodefaultrpath -o $lib $libobjs $deplibs $compiler_flags'
-	  ;;
-	*)
-	m4_if($1, [], [
-	  # Older versions of the 11.00 compiler do not understand -b yet
-	  # (HP92453-01 A.11.01.20 doesn't, HP92453-01 B.11.X.35175-35176.GP does)
-	  _LT_LINKER_OPTION([if $CC understands -b],
-	    _LT_TAGVAR(lt_cv_prog_compiler__b, $1), [-b],
-	    [_LT_TAGVAR(archive_cmds, $1)='$CC -b ${wl}+h ${wl}$soname ${wl}+b ${wl}$install_libdir -o $lib $libobjs $deplibs $compiler_flags'],
-	    [_LT_TAGVAR(archive_cmds, $1)='$LD -b +h $soname +b $install_libdir -o $lib $libobjs $deplibs $linker_flags'])],
-	  [_LT_TAGVAR(archive_cmds, $1)='$CC -b ${wl}+h ${wl}$soname ${wl}+b ${wl}$install_libdir -o $lib $libobjs $deplibs $compiler_flags'])
-	  ;;
-	esac
-      fi
-      if test "$with_gnu_ld" = no; then
-	_LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}+b ${wl}$libdir'
-	_LT_TAGVAR(hardcode_libdir_separator, $1)=:
-
-	case $host_cpu in
-	hppa*64*|ia64*)
-	  _LT_TAGVAR(hardcode_direct, $1)=no
-	  _LT_TAGVAR(hardcode_shlibpath_var, $1)=no
-	  ;;
-	*)
-	  _LT_TAGVAR(hardcode_direct, $1)=yes
-	  _LT_TAGVAR(hardcode_direct_absolute, $1)=yes
-	  _LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}-E'
-
-	  # hardcode_minus_L: Not really in the search PATH,
-	  # but as the default location of the library.
-	  _LT_TAGVAR(hardcode_minus_L, $1)=yes
-	  ;;
-	esac
-      fi
-      ;;
-
-    irix5* | irix6* | nonstopux*)
-      if test "$GCC" = yes; then
-	_LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-soname ${wl}$soname `test -n "$verstring" && func_echo_all "${wl}-set_version ${wl}$verstring"` ${wl}-update_registry ${wl}${output_objdir}/so_locations -o $lib'
-	# Try to use the -exported_symbol ld option; if it does not
-	# work, assume that -exports_file does not work either and
-	# implicitly export all symbols.
-	# This should be the same for all languages, so no per-tag cache variable.
-	AC_CACHE_CHECK([whether the $host_os linker accepts -exported_symbol],
-	  [lt_cv_irix_exported_symbol],
-	  [save_LDFLAGS="$LDFLAGS"
-	   LDFLAGS="$LDFLAGS -shared ${wl}-exported_symbol ${wl}foo ${wl}-update_registry ${wl}/dev/null"
-	   AC_LINK_IFELSE(
-	     [AC_LANG_SOURCE(
-	        [AC_LANG_CASE([C], [[int foo (void) { return 0; }]],
-			      [C++], [[int foo (void) { return 0; }]],
-			      [Fortran 77], [[
-      subroutine foo
-      end]],
-			      [Fortran], [[
-      subroutine foo
-      end]])])],
-	      [lt_cv_irix_exported_symbol=yes],
-	      [lt_cv_irix_exported_symbol=no])
-           LDFLAGS="$save_LDFLAGS"])
-	if test "$lt_cv_irix_exported_symbol" = yes; then
-          _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-soname ${wl}$soname `test -n "$verstring" && func_echo_all "${wl}-set_version ${wl}$verstring"` ${wl}-update_registry ${wl}${output_objdir}/so_locations ${wl}-exports_file ${wl}$export_symbols -o $lib'
-	fi
-      else
-	_LT_TAGVAR(archive_cmds, $1)='$CC -shared $libobjs $deplibs $compiler_flags -soname $soname `test -n "$verstring" && func_echo_all "-set_version $verstring"` -update_registry ${output_objdir}/so_locations -o $lib'
-	_LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $libobjs $deplibs $compiler_flags -soname $soname `test -n "$verstring" && func_echo_all "-set_version $verstring"` -update_registry ${output_objdir}/so_locations -exports_file $export_symbols -o $lib'
-      fi
-      _LT_TAGVAR(archive_cmds_need_lc, $1)='no'
-      _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-rpath ${wl}$libdir'
-      _LT_TAGVAR(hardcode_libdir_separator, $1)=:
-      _LT_TAGVAR(inherit_rpath, $1)=yes
-      _LT_TAGVAR(link_all_deplibs, $1)=yes
-      ;;
-
-    netbsd* | netbsdelf*-gnu)
-      if echo __ELF__ | $CC -E - | $GREP __ELF__ >/dev/null; then
-	_LT_TAGVAR(archive_cmds, $1)='$LD -Bshareable -o $lib $libobjs $deplibs $linker_flags'  # a.out
-      else
-	_LT_TAGVAR(archive_cmds, $1)='$LD -shared -o $lib $libobjs $deplibs $linker_flags'      # ELF
-      fi
-      _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-R$libdir'
-      _LT_TAGVAR(hardcode_direct, $1)=yes
-      _LT_TAGVAR(hardcode_shlibpath_var, $1)=no
-      ;;
-
-    newsos6)
-      _LT_TAGVAR(archive_cmds, $1)='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
-      _LT_TAGVAR(hardcode_direct, $1)=yes
-      _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-rpath ${wl}$libdir'
-      _LT_TAGVAR(hardcode_libdir_separator, $1)=:
-      _LT_TAGVAR(hardcode_shlibpath_var, $1)=no
-      ;;
-
-    *nto* | *qnx*)
-      ;;
-
-    openbsd*)
-      if test -f /usr/libexec/ld.so; then
-	_LT_TAGVAR(hardcode_direct, $1)=yes
-	_LT_TAGVAR(hardcode_shlibpath_var, $1)=no
-	_LT_TAGVAR(hardcode_direct_absolute, $1)=yes
-	if test -z "`echo __ELF__ | $CC -E - | $GREP __ELF__`" || test "$host_os-$host_cpu" = "openbsd2.8-powerpc"; then
-	  _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag -o $lib $libobjs $deplibs $compiler_flags'
-	  _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $pic_flag -o $lib $libobjs $deplibs $compiler_flags ${wl}-retain-symbols-file,$export_symbols'
-	  _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-rpath,$libdir'
-	  _LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}-E'
-	else
-	  case $host_os in
-	   openbsd[[01]].* | openbsd2.[[0-7]] | openbsd2.[[0-7]].*)
-	     _LT_TAGVAR(archive_cmds, $1)='$LD -Bshareable -o $lib $libobjs $deplibs $linker_flags'
-	     _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-R$libdir'
-	     ;;
-	   *)
-	     _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag -o $lib $libobjs $deplibs $compiler_flags'
-	     _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-rpath,$libdir'
-	     ;;
-	  esac
-	fi
-      else
-	_LT_TAGVAR(ld_shlibs, $1)=no
-      fi
-      ;;
-
-    os2*)
-      _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-L$libdir'
-      _LT_TAGVAR(hardcode_minus_L, $1)=yes
-      _LT_TAGVAR(allow_undefined_flag, $1)=unsupported
-      _LT_TAGVAR(archive_cmds, $1)='$ECHO "LIBRARY $libname INITINSTANCE" > $output_objdir/$libname.def~$ECHO "DESCRIPTION \"$libname\"" >> $output_objdir/$libname.def~echo DATA >> $output_objdir/$libname.def~echo " SINGLE NONSHARED" >> $output_objdir/$libname.def~echo EXPORTS >> $output_objdir/$libname.def~emxexp $libobjs >> $output_objdir/$libname.def~$CC -Zdll -Zcrtdll -o $lib $libobjs $deplibs $compiler_flags $output_objdir/$libname.def'
-      _LT_TAGVAR(old_archive_from_new_cmds, $1)='emximp -o $output_objdir/$libname.a $output_objdir/$libname.def'
-      ;;
-
-    osf3*)
-      if test "$GCC" = yes; then
-	_LT_TAGVAR(allow_undefined_flag, $1)=' ${wl}-expect_unresolved ${wl}\*'
-	_LT_TAGVAR(archive_cmds, $1)='$CC -shared${allow_undefined_flag} $libobjs $deplibs $compiler_flags ${wl}-soname ${wl}$soname `test -n "$verstring" && func_echo_all "${wl}-set_version ${wl}$verstring"` ${wl}-update_registry ${wl}${output_objdir}/so_locations -o $lib'
-      else
-	_LT_TAGVAR(allow_undefined_flag, $1)=' -expect_unresolved \*'
-	_LT_TAGVAR(archive_cmds, $1)='$CC -shared${allow_undefined_flag} $libobjs $deplibs $compiler_flags -soname $soname `test -n "$verstring" && func_echo_all "-set_version $verstring"` -update_registry ${output_objdir}/so_locations -o $lib'
-      fi
-      _LT_TAGVAR(archive_cmds_need_lc, $1)='no'
-      _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-rpath ${wl}$libdir'
-      _LT_TAGVAR(hardcode_libdir_separator, $1)=:
-      ;;
-
-    osf4* | osf5*)	# as osf3* with the addition of -msym flag
-      if test "$GCC" = yes; then
-	_LT_TAGVAR(allow_undefined_flag, $1)=' ${wl}-expect_unresolved ${wl}\*'
-	_LT_TAGVAR(archive_cmds, $1)='$CC -shared${allow_undefined_flag} $pic_flag $libobjs $deplibs $compiler_flags ${wl}-msym ${wl}-soname ${wl}$soname `test -n "$verstring" && func_echo_all "${wl}-set_version ${wl}$verstring"` ${wl}-update_registry ${wl}${output_objdir}/so_locations -o $lib'
-	_LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-rpath ${wl}$libdir'
-      else
-	_LT_TAGVAR(allow_undefined_flag, $1)=' -expect_unresolved \*'
-	_LT_TAGVAR(archive_cmds, $1)='$CC -shared${allow_undefined_flag} $libobjs $deplibs $compiler_flags -msym -soname $soname `test -n "$verstring" && func_echo_all "-set_version $verstring"` -update_registry ${output_objdir}/so_locations -o $lib'
-	_LT_TAGVAR(archive_expsym_cmds, $1)='for i in `cat $export_symbols`; do printf "%s %s\\n" -exported_symbol "\$i" >> $lib.exp; done; printf "%s\\n" "-hidden">> $lib.exp~
-	$CC -shared${allow_undefined_flag} ${wl}-input ${wl}$lib.exp $compiler_flags $libobjs $deplibs -soname $soname `test -n "$verstring" && $ECHO "-set_version $verstring"` -update_registry ${output_objdir}/so_locations -o $lib~$RM $lib.exp'
-
-	# Both the C and C++ compilers support -rpath directly
-	_LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-rpath $libdir'
-      fi
-      _LT_TAGVAR(archive_cmds_need_lc, $1)='no'
-      _LT_TAGVAR(hardcode_libdir_separator, $1)=:
-      ;;
-
-    solaris*)
-      _LT_TAGVAR(no_undefined_flag, $1)=' -z defs'
-      if test "$GCC" = yes; then
-	wlarc='${wl}'
-	_LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag ${wl}-z ${wl}text ${wl}-h ${wl}$soname -o $lib $libobjs $deplibs $compiler_flags'
-	_LT_TAGVAR(archive_expsym_cmds, $1)='echo "{ global:" > $lib.exp~cat $export_symbols | $SED -e "s/\(.*\)/\1;/" >> $lib.exp~echo "local: *; };" >> $lib.exp~
-	  $CC -shared $pic_flag ${wl}-z ${wl}text ${wl}-M ${wl}$lib.exp ${wl}-h ${wl}$soname -o $lib $libobjs $deplibs $compiler_flags~$RM $lib.exp'
-      else
-	case `$CC -V 2>&1` in
-	*"Compilers 5.0"*)
-	  wlarc=''
-	  _LT_TAGVAR(archive_cmds, $1)='$LD -G${allow_undefined_flag} -h $soname -o $lib $libobjs $deplibs $linker_flags'
-	  _LT_TAGVAR(archive_expsym_cmds, $1)='echo "{ global:" > $lib.exp~cat $export_symbols | $SED -e "s/\(.*\)/\1;/" >> $lib.exp~echo "local: *; };" >> $lib.exp~
-	  $LD -G${allow_undefined_flag} -M $lib.exp -h $soname -o $lib $libobjs $deplibs $linker_flags~$RM $lib.exp'
-	  ;;
-	*)
-	  wlarc='${wl}'
-	  _LT_TAGVAR(archive_cmds, $1)='$CC -G${allow_undefined_flag} -h $soname -o $lib $libobjs $deplibs $compiler_flags'
-	  _LT_TAGVAR(archive_expsym_cmds, $1)='echo "{ global:" > $lib.exp~cat $export_symbols | $SED -e "s/\(.*\)/\1;/" >> $lib.exp~echo "local: *; };" >> $lib.exp~
-	  $CC -G${allow_undefined_flag} -M $lib.exp -h $soname -o $lib $libobjs $deplibs $compiler_flags~$RM $lib.exp'
-	  ;;
-	esac
-      fi
-      _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-R$libdir'
-      _LT_TAGVAR(hardcode_shlibpath_var, $1)=no
-      case $host_os in
-      solaris2.[[0-5]] | solaris2.[[0-5]].*) ;;
-      *)
-	# The compiler driver will combine and reorder linker options,
-	# but understands `-z linker_flag'.  GCC discards it without `$wl',
-	# but is careful enough not to reorder.
-	# Supported since Solaris 2.6 (maybe 2.5.1?)
-	if test "$GCC" = yes; then
-	  _LT_TAGVAR(whole_archive_flag_spec, $1)='${wl}-z ${wl}allextract$convenience ${wl}-z ${wl}defaultextract'
-	else
-	  _LT_TAGVAR(whole_archive_flag_spec, $1)='-z allextract$convenience -z defaultextract'
-	fi
-	;;
-      esac
-      _LT_TAGVAR(link_all_deplibs, $1)=yes
-      ;;
-
-    sunos4*)
-      if test "x$host_vendor" = xsequent; then
-	# Use $CC to link under sequent, because it throws in some extra .o
-	# files that make .init and .fini sections work.
-	_LT_TAGVAR(archive_cmds, $1)='$CC -G ${wl}-h $soname -o $lib $libobjs $deplibs $compiler_flags'
-      else
-	_LT_TAGVAR(archive_cmds, $1)='$LD -assert pure-text -Bstatic -o $lib $libobjs $deplibs $linker_flags'
-      fi
-      _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-L$libdir'
-      _LT_TAGVAR(hardcode_direct, $1)=yes
-      _LT_TAGVAR(hardcode_minus_L, $1)=yes
-      _LT_TAGVAR(hardcode_shlibpath_var, $1)=no
-      ;;
-
-    sysv4)
-      case $host_vendor in
-	sni)
-	  _LT_TAGVAR(archive_cmds, $1)='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
-	  _LT_TAGVAR(hardcode_direct, $1)=yes # is this really true???
-	;;
-	siemens)
-	  ## LD is ld; it makes a PLAMLIB.
-	  ## CC just makes a GrossModule.
-	  _LT_TAGVAR(archive_cmds, $1)='$LD -G -o $lib $libobjs $deplibs $linker_flags'
-	  _LT_TAGVAR(reload_cmds, $1)='$CC -r -o $output$reload_objs'
-	  _LT_TAGVAR(hardcode_direct, $1)=no
-        ;;
-	motorola)
-	  _LT_TAGVAR(archive_cmds, $1)='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
-	  _LT_TAGVAR(hardcode_direct, $1)=no # Motorola manual says yes, but my tests say they lie
-	;;
-      esac
-      runpath_var='LD_RUN_PATH'
-      _LT_TAGVAR(hardcode_shlibpath_var, $1)=no
-      ;;
-
-    sysv4.3*)
-      _LT_TAGVAR(archive_cmds, $1)='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
-      _LT_TAGVAR(hardcode_shlibpath_var, $1)=no
-      _LT_TAGVAR(export_dynamic_flag_spec, $1)='-Bexport'
-      ;;
-
-    sysv4*MP*)
-      if test -d /usr/nec; then
-	_LT_TAGVAR(archive_cmds, $1)='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
-	_LT_TAGVAR(hardcode_shlibpath_var, $1)=no
-	runpath_var=LD_RUN_PATH
-	hardcode_runpath_var=yes
-	_LT_TAGVAR(ld_shlibs, $1)=yes
-      fi
-      ;;
-
-    sysv4*uw2* | sysv5OpenUNIX* | sysv5UnixWare7.[[01]].[[10]]* | unixware7* | sco3.2v5.0.[[024]]*)
-      _LT_TAGVAR(no_undefined_flag, $1)='${wl}-z,text'
-      _LT_TAGVAR(archive_cmds_need_lc, $1)=no
-      _LT_TAGVAR(hardcode_shlibpath_var, $1)=no
-      runpath_var='LD_RUN_PATH'
-
-      if test "$GCC" = yes; then
-	_LT_TAGVAR(archive_cmds, $1)='$CC -shared ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
-	_LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared ${wl}-Bexport:$export_symbols ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
-      else
-	_LT_TAGVAR(archive_cmds, $1)='$CC -G ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
-	_LT_TAGVAR(archive_expsym_cmds, $1)='$CC -G ${wl}-Bexport:$export_symbols ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
-      fi
-      ;;
-
-    sysv5* | sco3.2v5* | sco5v6*)
-      # Note: We can NOT use -z defs as we might desire, because we do not
-      # link with -lc, and that would cause any symbols used from libc to
-      # always be unresolved, which means just about no library would
-      # ever link correctly.  If we're not using GNU ld we use -z text
-      # though, which does catch some bad symbols but isn't as heavy-handed
-      # as -z defs.
-      _LT_TAGVAR(no_undefined_flag, $1)='${wl}-z,text'
-      _LT_TAGVAR(allow_undefined_flag, $1)='${wl}-z,nodefs'
-      _LT_TAGVAR(archive_cmds_need_lc, $1)=no
-      _LT_TAGVAR(hardcode_shlibpath_var, $1)=no
-      _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-R,$libdir'
-      _LT_TAGVAR(hardcode_libdir_separator, $1)=':'
-      _LT_TAGVAR(link_all_deplibs, $1)=yes
-      _LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}-Bexport'
-      runpath_var='LD_RUN_PATH'
-
-      if test "$GCC" = yes; then
-	_LT_TAGVAR(archive_cmds, $1)='$CC -shared ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
-	_LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared ${wl}-Bexport:$export_symbols ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
-      else
-	_LT_TAGVAR(archive_cmds, $1)='$CC -G ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
-	_LT_TAGVAR(archive_expsym_cmds, $1)='$CC -G ${wl}-Bexport:$export_symbols ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
-      fi
-      ;;
-
-    uts4*)
-      _LT_TAGVAR(archive_cmds, $1)='$LD -G -h $soname -o $lib $libobjs $deplibs $linker_flags'
-      _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-L$libdir'
-      _LT_TAGVAR(hardcode_shlibpath_var, $1)=no
-      ;;
-
-    *)
-      _LT_TAGVAR(ld_shlibs, $1)=no
-      ;;
-    esac
-
-    if test x$host_vendor = xsni; then
-      case $host in
-      sysv4 | sysv4.2uw2* | sysv4.3* | sysv5*)
-	_LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}-Blargedynsym'
-	;;
-      esac
-    fi
-  fi
-])
-AC_MSG_RESULT([$_LT_TAGVAR(ld_shlibs, $1)])
-test "$_LT_TAGVAR(ld_shlibs, $1)" = no && can_build_shared=no
-
-_LT_TAGVAR(with_gnu_ld, $1)=$with_gnu_ld
-
-_LT_DECL([], [libext], [0], [Old archive suffix (normally "a")])dnl
-_LT_DECL([], [shrext_cmds], [1], [Shared library suffix (normally ".so")])dnl
-_LT_DECL([], [extract_expsyms_cmds], [2],
-    [The commands to extract the exported symbol list from a shared archive])
-
-#
-# Do we need to explicitly link libc?
-#
-case "x$_LT_TAGVAR(archive_cmds_need_lc, $1)" in
-x|xyes)
-  # Assume -lc should be added
-  _LT_TAGVAR(archive_cmds_need_lc, $1)=yes
-
-  if test "$enable_shared" = yes && test "$GCC" = yes; then
-    case $_LT_TAGVAR(archive_cmds, $1) in
-    *'~'*)
-      # FIXME: we may have to deal with multi-command sequences.
-      ;;
-    '$CC '*)
-      # Test whether the compiler implicitly links with -lc since on some
-      # systems, -lgcc has to come before -lc. If gcc already passes -lc
-      # to ld, don't add -lc before -lgcc.
-      AC_CACHE_CHECK([whether -lc should be explicitly linked in],
-	[lt_cv_]_LT_TAGVAR(archive_cmds_need_lc, $1),
-	[$RM conftest*
-	echo "$lt_simple_compile_test_code" > conftest.$ac_ext
-
-	if AC_TRY_EVAL(ac_compile) 2>conftest.err; then
-	  soname=conftest
-	  lib=conftest
-	  libobjs=conftest.$ac_objext
-	  deplibs=
-	  wl=$_LT_TAGVAR(lt_prog_compiler_wl, $1)
-	  pic_flag=$_LT_TAGVAR(lt_prog_compiler_pic, $1)
-	  compiler_flags=-v
-	  linker_flags=-v
-	  verstring=
-	  output_objdir=.
-	  libname=conftest
-	  lt_save_allow_undefined_flag=$_LT_TAGVAR(allow_undefined_flag, $1)
-	  _LT_TAGVAR(allow_undefined_flag, $1)=
-	  if AC_TRY_EVAL(_LT_TAGVAR(archive_cmds, $1) 2\>\&1 \| $GREP \" -lc \" \>/dev/null 2\>\&1)
-	  then
-	    lt_cv_[]_LT_TAGVAR(archive_cmds_need_lc, $1)=no
-	  else
-	    lt_cv_[]_LT_TAGVAR(archive_cmds_need_lc, $1)=yes
-	  fi
-	  _LT_TAGVAR(allow_undefined_flag, $1)=$lt_save_allow_undefined_flag
-	else
-	  cat conftest.err 1>&5
-	fi
-	$RM conftest*
-	])
-      _LT_TAGVAR(archive_cmds_need_lc, $1)=$lt_cv_[]_LT_TAGVAR(archive_cmds_need_lc, $1)
-      ;;
-    esac
-  fi
-  ;;
-esac
-
-_LT_TAGDECL([build_libtool_need_lc], [archive_cmds_need_lc], [0],
-    [Whether or not to add -lc for building shared libraries])
-_LT_TAGDECL([allow_libtool_libs_with_static_runtimes],
-    [enable_shared_with_static_runtimes], [0],
-    [Whether or not to disallow shared libs when runtime libs are static])
-_LT_TAGDECL([], [export_dynamic_flag_spec], [1],
-    [Compiler flag to allow reflexive dlopens])
-_LT_TAGDECL([], [whole_archive_flag_spec], [1],
-    [Compiler flag to generate shared objects directly from archives])
-_LT_TAGDECL([], [compiler_needs_object], [1],
-    [Whether the compiler copes with passing no objects directly])
-_LT_TAGDECL([], [old_archive_from_new_cmds], [2],
-    [Create an old-style archive from a shared archive])
-_LT_TAGDECL([], [old_archive_from_expsyms_cmds], [2],
-    [Create a temporary old-style archive to link instead of a shared archive])
-_LT_TAGDECL([], [archive_cmds], [2], [Commands used to build a shared archive])
-_LT_TAGDECL([], [archive_expsym_cmds], [2])
-_LT_TAGDECL([], [module_cmds], [2],
-    [Commands used to build a loadable module if different from building
-    a shared archive.])
-_LT_TAGDECL([], [module_expsym_cmds], [2])
-_LT_TAGDECL([], [with_gnu_ld], [1],
-    [Whether we are building with GNU ld or not])
-_LT_TAGDECL([], [allow_undefined_flag], [1],
-    [Flag that allows shared libraries with undefined symbols to be built])
-_LT_TAGDECL([], [no_undefined_flag], [1],
-    [Flag that enforces no undefined symbols])
-_LT_TAGDECL([], [hardcode_libdir_flag_spec], [1],
-    [Flag to hardcode $libdir into a binary during linking.
-    This must work even if $libdir does not exist])
-_LT_TAGDECL([], [hardcode_libdir_separator], [1],
-    [Whether we need a single "-rpath" flag with a separated argument])
-_LT_TAGDECL([], [hardcode_direct], [0],
-    [Set to "yes" if using DIR/libNAME${shared_ext} during linking hardcodes
-    DIR into the resulting binary])
-_LT_TAGDECL([], [hardcode_direct_absolute], [0],
-    [Set to "yes" if using DIR/libNAME${shared_ext} during linking hardcodes
-    DIR into the resulting binary and the resulting library dependency is
-    "absolute", i.e. impossible to change by setting ${shlibpath_var} if the
-    library is relocated])
-_LT_TAGDECL([], [hardcode_minus_L], [0],
-    [Set to "yes" if using the -LDIR flag during linking hardcodes DIR
-    into the resulting binary])
-_LT_TAGDECL([], [hardcode_shlibpath_var], [0],
-    [Set to "yes" if using SHLIBPATH_VAR=DIR during linking hardcodes DIR
-    into the resulting binary])
-_LT_TAGDECL([], [hardcode_automatic], [0],
-    [Set to "yes" if building a shared library automatically hardcodes DIR
-    into the library and all subsequent libraries and executables linked
-    against it])
-_LT_TAGDECL([], [inherit_rpath], [0],
-    [Set to yes if linker adds runtime paths of dependent libraries
-    to runtime path list])
-_LT_TAGDECL([], [link_all_deplibs], [0],
-    [Whether libtool must link a program against all its dependency libraries])
-_LT_TAGDECL([], [always_export_symbols], [0],
-    [Set to "yes" if exported symbols are required])
-_LT_TAGDECL([], [export_symbols_cmds], [2],
-    [The commands to list exported symbols])
-_LT_TAGDECL([], [exclude_expsyms], [1],
-    [Symbols that should not be listed in the preloaded symbols])
-_LT_TAGDECL([], [include_expsyms], [1],
-    [Symbols that must always be exported])
-_LT_TAGDECL([], [prelink_cmds], [2],
-    [Commands necessary for linking programs (against libraries) with templates])
-_LT_TAGDECL([], [postlink_cmds], [2],
-    [Commands necessary for finishing linking programs])
-_LT_TAGDECL([], [file_list_spec], [1],
-    [Specify filename containing input files])
-dnl FIXME: Not yet implemented
-dnl _LT_TAGDECL([], [thread_safe_flag_spec], [1],
-dnl    [Compiler flag to generate thread safe objects])
-])# _LT_LINKER_SHLIBS
-
-
-# _LT_LANG_C_CONFIG([TAG])
-# ------------------------
-# Ensure that the configuration variables for a C compiler are suitably
-# defined.  These variables are subsequently used by _LT_CONFIG to write
-# the compiler configuration to `libtool'.
-m4_defun([_LT_LANG_C_CONFIG],
-[m4_require([_LT_DECL_EGREP])dnl
-lt_save_CC="$CC"
-AC_LANG_PUSH(C)
-
-# Source file extension for C test sources.
-ac_ext=c
-
-# Object file extension for compiled C test sources.
-objext=o
-_LT_TAGVAR(objext, $1)=$objext
-
-# Code to be used in simple compile tests
-lt_simple_compile_test_code="int some_variable = 0;"
-
-# Code to be used in simple link tests
-lt_simple_link_test_code='int main(){return(0);}'
-
-_LT_TAG_COMPILER
-# Save the default compiler, since it gets overwritten when the other
-# tags are being tested, and _LT_TAGVAR(compiler, []) is a NOP.
-compiler_DEFAULT=$CC
-
-# save warnings/boilerplate of simple test code
-_LT_COMPILER_BOILERPLATE
-_LT_LINKER_BOILERPLATE
-
-## CAVEAT EMPTOR:
-## There is no encapsulation within the following macros, do not change
-## the running order or otherwise move them around unless you know exactly
-## what you are doing...
-if test -n "$compiler"; then
-  _LT_COMPILER_NO_RTTI($1)
-  _LT_COMPILER_PIC($1)
-  _LT_COMPILER_C_O($1)
-  _LT_COMPILER_FILE_LOCKS($1)
-  _LT_LINKER_SHLIBS($1)
-  _LT_SYS_DYNAMIC_LINKER($1)
-  _LT_LINKER_HARDCODE_LIBPATH($1)
-  LT_SYS_DLOPEN_SELF
-  _LT_CMD_STRIPLIB
-
-  # Report which library types will actually be built
-  AC_MSG_CHECKING([if libtool supports shared libraries])
-  AC_MSG_RESULT([$can_build_shared])
-
-  AC_MSG_CHECKING([whether to build shared libraries])
-  test "$can_build_shared" = "no" && enable_shared=no
-
-  # On AIX, shared libraries and static libraries use the same namespace, and
-  # are all built from PIC.
-  case $host_os in
-  aix3*)
-    test "$enable_shared" = yes && enable_static=no
-    if test -n "$RANLIB"; then
-      archive_cmds="$archive_cmds~\$RANLIB \$lib"
-      postinstall_cmds='$RANLIB $lib'
-    fi
-    ;;
-
-  aix[[4-9]]*)
-    if test "$host_cpu" != ia64 && test "$aix_use_runtimelinking" = no ; then
-      test "$enable_shared" = yes && enable_static=no
-    fi
-    ;;
-  esac
-  AC_MSG_RESULT([$enable_shared])
-
-  AC_MSG_CHECKING([whether to build static libraries])
-  # Make sure either enable_shared or enable_static is yes.
-  test "$enable_shared" = yes || enable_static=yes
-  AC_MSG_RESULT([$enable_static])
-
-  _LT_CONFIG($1)
-fi
-AC_LANG_POP
-CC="$lt_save_CC"
-])# _LT_LANG_C_CONFIG
-
-
-# _LT_LANG_CXX_CONFIG([TAG])
-# --------------------------
-# Ensure that the configuration variables for a C++ compiler are suitably
-# defined.  These variables are subsequently used by _LT_CONFIG to write
-# the compiler configuration to `libtool'.
-m4_defun([_LT_LANG_CXX_CONFIG],
-[m4_require([_LT_FILEUTILS_DEFAULTS])dnl
-m4_require([_LT_DECL_EGREP])dnl
-m4_require([_LT_PATH_MANIFEST_TOOL])dnl
-if test -n "$CXX" && ( test "X$CXX" != "Xno" &&
-    ( (test "X$CXX" = "Xg++" && `g++ -v >/dev/null 2>&1` ) ||
-    (test "X$CXX" != "Xg++"))) ; then
-  AC_PROG_CXXCPP
-else
-  _lt_caught_CXX_error=yes
-fi
-
-AC_LANG_PUSH(C++)
-_LT_TAGVAR(archive_cmds_need_lc, $1)=no
-_LT_TAGVAR(allow_undefined_flag, $1)=
-_LT_TAGVAR(always_export_symbols, $1)=no
-_LT_TAGVAR(archive_expsym_cmds, $1)=
-_LT_TAGVAR(compiler_needs_object, $1)=no
-_LT_TAGVAR(export_dynamic_flag_spec, $1)=
-_LT_TAGVAR(hardcode_direct, $1)=no
-_LT_TAGVAR(hardcode_direct_absolute, $1)=no
-_LT_TAGVAR(hardcode_libdir_flag_spec, $1)=
-_LT_TAGVAR(hardcode_libdir_separator, $1)=
-_LT_TAGVAR(hardcode_minus_L, $1)=no
-_LT_TAGVAR(hardcode_shlibpath_var, $1)=unsupported
-_LT_TAGVAR(hardcode_automatic, $1)=no
-_LT_TAGVAR(inherit_rpath, $1)=no
-_LT_TAGVAR(module_cmds, $1)=
-_LT_TAGVAR(module_expsym_cmds, $1)=
-_LT_TAGVAR(link_all_deplibs, $1)=unknown
-_LT_TAGVAR(old_archive_cmds, $1)=$old_archive_cmds
-_LT_TAGVAR(reload_flag, $1)=$reload_flag
-_LT_TAGVAR(reload_cmds, $1)=$reload_cmds
-_LT_TAGVAR(no_undefined_flag, $1)=
-_LT_TAGVAR(whole_archive_flag_spec, $1)=
-_LT_TAGVAR(enable_shared_with_static_runtimes, $1)=no
-
-# Source file extension for C++ test sources.
-ac_ext=cpp
-
-# Object file extension for compiled C++ test sources.
-objext=o
-_LT_TAGVAR(objext, $1)=$objext
-
-# No sense in running all these tests if we already determined that
-# the CXX compiler isn't working.  Some variables (like enable_shared)
-# are currently assumed to apply to all compilers on this platform,
-# and will be corrupted by setting them based on a non-working compiler.
-if test "$_lt_caught_CXX_error" != yes; then
-  # Code to be used in simple compile tests
-  lt_simple_compile_test_code="int some_variable = 0;"
-
-  # Code to be used in simple link tests
-  lt_simple_link_test_code='int main(int, char *[[]]) { return(0); }'
-
-  # ltmain only uses $CC for tagged configurations so make sure $CC is set.
-  _LT_TAG_COMPILER
-
-  # save warnings/boilerplate of simple test code
-  _LT_COMPILER_BOILERPLATE
-  _LT_LINKER_BOILERPLATE
-
-  # Allow CC to be a program name with arguments.
-  lt_save_CC=$CC
-  lt_save_CFLAGS=$CFLAGS
-  lt_save_LD=$LD
-  lt_save_GCC=$GCC
-  GCC=$GXX
-  lt_save_with_gnu_ld=$with_gnu_ld
-  lt_save_path_LD=$lt_cv_path_LD
-  if test -n "${lt_cv_prog_gnu_ldcxx+set}"; then
-    lt_cv_prog_gnu_ld=$lt_cv_prog_gnu_ldcxx
-  else
-    $as_unset lt_cv_prog_gnu_ld
-  fi
-  if test -n "${lt_cv_path_LDCXX+set}"; then
-    lt_cv_path_LD=$lt_cv_path_LDCXX
-  else
-    $as_unset lt_cv_path_LD
-  fi
-  test -z "${LDCXX+set}" || LD=$LDCXX
-  CC=${CXX-"c++"}
-  CFLAGS=$CXXFLAGS
-  compiler=$CC
-  _LT_TAGVAR(compiler, $1)=$CC
-  _LT_CC_BASENAME([$compiler])
-
-  if test -n "$compiler"; then
-    # We don't want -fno-exception when compiling C++ code, so set the
-    # no_builtin_flag separately
-    if test "$GXX" = yes; then
-      _LT_TAGVAR(lt_prog_compiler_no_builtin_flag, $1)=' -fno-builtin'
-    else
-      _LT_TAGVAR(lt_prog_compiler_no_builtin_flag, $1)=
-    fi
-
-    if test "$GXX" = yes; then
-      # Set up default GNU C++ configuration
-
-      LT_PATH_LD
-
-      # Check if GNU C++ uses GNU ld as the underlying linker, since the
-      # archiving commands below assume that GNU ld is being used.
-      if test "$with_gnu_ld" = yes; then
-        _LT_TAGVAR(archive_cmds, $1)='$CC $pic_flag -shared -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname $wl$soname -o $lib'
-        _LT_TAGVAR(archive_expsym_cmds, $1)='$CC $pic_flag -shared -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
-
-        _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-rpath ${wl}$libdir'
-        _LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}--export-dynamic'
-
-        # If archive_cmds runs LD, not CC, wlarc should be empty
-        # XXX I think wlarc can be eliminated in ltcf-cxx, but I need to
-        #     investigate it a little bit more. (MM)
-        wlarc='${wl}'
-
-        # ancient GNU ld didn't support --whole-archive et al.
-        if eval "`$CC -print-prog-name=ld` --help 2>&1" |
-	  $GREP 'no-whole-archive' > /dev/null; then
-          _LT_TAGVAR(whole_archive_flag_spec, $1)="$wlarc"'--whole-archive$convenience '"$wlarc"'--no-whole-archive'
-        else
-          _LT_TAGVAR(whole_archive_flag_spec, $1)=
-        fi
-      else
-        with_gnu_ld=no
-        wlarc=
-
-        # A generic and very simple default shared library creation
-        # command for GNU C++ for the case where it uses the native
-        # linker, instead of GNU ld.  If possible, this setting should be
-        # overridden to take advantage of the native linker features on
-        # the platform it is being used on.
-        _LT_TAGVAR(archive_cmds, $1)='$CC -shared -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -o $lib'
-      fi
-
-      # Commands to make compiler produce verbose output that lists
-      # what "hidden" libraries, object files and flags are used when
-      # linking a shared library.
-      output_verbose_link_cmd='$CC -shared $CFLAGS -v conftest.$objext 2>&1 | $GREP -v "^Configured with:" | $GREP "\-L"'
-
-    else
-      GXX=no
-      with_gnu_ld=no
-      wlarc=
-    fi
-
-    # PORTME: fill in a description of your system's C++ link characteristics
-    AC_MSG_CHECKING([whether the $compiler linker ($LD) supports shared libraries])
-    _LT_TAGVAR(ld_shlibs, $1)=yes
-    case $host_os in
-      aix3*)
-        # FIXME: insert proper C++ library support
-        _LT_TAGVAR(ld_shlibs, $1)=no
-        ;;
-      aix[[4-9]]*)
-        if test "$host_cpu" = ia64; then
-          # On IA64, the linker does run time linking by default, so we don't
-          # have to do anything special.
-          aix_use_runtimelinking=no
-          exp_sym_flag='-Bexport'
-          no_entry_flag=""
-        else
-          aix_use_runtimelinking=no
-
-          # Test if we are trying to use run time linking or normal
-          # AIX style linking. If -brtl is somewhere in LDFLAGS, we
-          # need to do runtime linking.
-          case $host_os in aix4.[[23]]|aix4.[[23]].*|aix[[5-9]]*)
-	    for ld_flag in $LDFLAGS; do
-	      case $ld_flag in
-	      *-brtl*)
-	        aix_use_runtimelinking=yes
-	        break
-	        ;;
-	      esac
-	    done
-	    ;;
-          esac
-
-          exp_sym_flag='-bexport'
-          no_entry_flag='-bnoentry'
-        fi
-
-        # When large executables or shared objects are built, AIX ld can
-        # have problems creating the table of contents.  If linking a library
-        # or program results in "error TOC overflow" add -mminimal-toc to
-        # CXXFLAGS/CFLAGS for g++/gcc.  In the cases where that is not
-        # enough to fix the problem, add -Wl,-bbigtoc to LDFLAGS.
-
-        _LT_TAGVAR(archive_cmds, $1)=''
-        _LT_TAGVAR(hardcode_direct, $1)=yes
-        _LT_TAGVAR(hardcode_direct_absolute, $1)=yes
-        _LT_TAGVAR(hardcode_libdir_separator, $1)=':'
-        _LT_TAGVAR(link_all_deplibs, $1)=yes
-        _LT_TAGVAR(file_list_spec, $1)='${wl}-f,'
-
-        if test "$GXX" = yes; then
-          case $host_os in aix4.[[012]]|aix4.[[012]].*)
-          # We only want to do this on AIX 4.2 and lower; the check
-          # below for broken collect2 doesn't work under 4.3+
-	  collect2name=`${CC} -print-prog-name=collect2`
-	  if test -f "$collect2name" &&
-	     strings "$collect2name" | $GREP resolve_lib_name >/dev/null
-	  then
-	    # We have reworked collect2
-	    :
-	  else
-	    # We have old collect2
-	    _LT_TAGVAR(hardcode_direct, $1)=unsupported
-	    # It fails to find uninstalled libraries when the uninstalled
-	    # path is not listed in the libpath.  Setting hardcode_minus_L
-	    # to unsupported forces relinking
-	    _LT_TAGVAR(hardcode_minus_L, $1)=yes
-	    _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-L$libdir'
-	    _LT_TAGVAR(hardcode_libdir_separator, $1)=
-	  fi
-          esac
-          shared_flag='-shared'
-	  if test "$aix_use_runtimelinking" = yes; then
-	    shared_flag="$shared_flag "'${wl}-G'
-	  fi
-        else
-          # not using gcc
-          if test "$host_cpu" = ia64; then
-	  # VisualAge C++, Version 5.5 for AIX 5L for IA-64, Beta 3 Release
-	  # chokes on -Wl,-G. The following line is correct:
-	  shared_flag='-G'
-          else
-	    if test "$aix_use_runtimelinking" = yes; then
-	      shared_flag='${wl}-G'
-	    else
-	      shared_flag='${wl}-bM:SRE'
-	    fi
-          fi
-        fi
-
-        _LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}-bexpall'
-        # It seems that -bexpall does not export symbols beginning with
-        # underscore (_), so it is better to generate a list of symbols to
-        # export.
-        _LT_TAGVAR(always_export_symbols, $1)=yes
-        if test "$aix_use_runtimelinking" = yes; then
-          # Warning - without using the other runtime loading flags (-brtl),
-          # -berok will link without error, but may produce a broken library.
-          _LT_TAGVAR(allow_undefined_flag, $1)='-berok'
-          # Determine the default libpath from the value encoded in an empty
-          # executable.
-          _LT_SYS_MODULE_PATH_AIX([$1])
-          _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-blibpath:$libdir:'"$aix_libpath"
-
-          _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -o $output_objdir/$soname $libobjs $deplibs '"\${wl}$no_entry_flag"' $compiler_flags `if test "x${allow_undefined_flag}" != "x"; then func_echo_all "${wl}${allow_undefined_flag}"; else :; fi` '"\${wl}$exp_sym_flag:\$export_symbols $shared_flag"
-        else
-          if test "$host_cpu" = ia64; then
-	    _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-R $libdir:/usr/lib:/lib'
-	    _LT_TAGVAR(allow_undefined_flag, $1)="-z nodefs"
-	    _LT_TAGVAR(archive_expsym_cmds, $1)="\$CC $shared_flag"' -o $output_objdir/$soname $libobjs $deplibs '"\${wl}$no_entry_flag"' $compiler_flags ${wl}${allow_undefined_flag} '"\${wl}$exp_sym_flag:\$export_symbols"
-          else
-	    # Determine the default libpath from the value encoded in an
-	    # empty executable.
-	    _LT_SYS_MODULE_PATH_AIX([$1])
-	    _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-blibpath:$libdir:'"$aix_libpath"
-	    # Warning - without using the other run time loading flags,
-	    # -berok will link without error, but may produce a broken library.
-	    _LT_TAGVAR(no_undefined_flag, $1)=' ${wl}-bernotok'
-	    _LT_TAGVAR(allow_undefined_flag, $1)=' ${wl}-berok'
-	    if test "$with_gnu_ld" = yes; then
-	      # We only use this code for GNU lds that support --whole-archive.
-	      _LT_TAGVAR(whole_archive_flag_spec, $1)='${wl}--whole-archive$convenience ${wl}--no-whole-archive'
-	    else
-	      # Exported symbols can be pulled into shared objects from archives
-	      _LT_TAGVAR(whole_archive_flag_spec, $1)='$convenience'
-	    fi
-	    _LT_TAGVAR(archive_cmds_need_lc, $1)=yes
-	    # This is similar to how AIX traditionally builds its shared
-	    # libraries.
-	    _LT_TAGVAR(archive_expsym_cmds, $1)="\$CC $shared_flag"' -o $output_objdir/$soname $libobjs $deplibs ${wl}-bnoentry $compiler_flags ${wl}-bE:$export_symbols${allow_undefined_flag}~$AR $AR_FLAGS $output_objdir/$libname$release.a $output_objdir/$soname'
-          fi
-        fi
-        ;;
-
-      beos*)
-	if $LD --help 2>&1 | $GREP ': supported targets:.* elf' > /dev/null; then
-	  _LT_TAGVAR(allow_undefined_flag, $1)=unsupported
-	  # Joseph Beckenbach <jrb3 at best.com> says some releases of gcc
-	  # support --undefined.  This deserves some investigation.  FIXME
-	  _LT_TAGVAR(archive_cmds, $1)='$CC -nostart $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
-	else
-	  _LT_TAGVAR(ld_shlibs, $1)=no
-	fi
-	;;
-
-      chorus*)
-        case $cc_basename in
-          *)
-	  # FIXME: insert proper C++ library support
-	  _LT_TAGVAR(ld_shlibs, $1)=no
-	  ;;
-        esac
-        ;;
-
-      cygwin* | mingw* | pw32* | cegcc*)
-	case $GXX,$cc_basename in
-	,cl* | no,cl*)
-	  # Native MSVC
-	  # hardcode_libdir_flag_spec is actually meaningless, as there is
-	  # no search path for DLLs.
-	  _LT_TAGVAR(hardcode_libdir_flag_spec, $1)=' '
-	  _LT_TAGVAR(allow_undefined_flag, $1)=unsupported
-	  _LT_TAGVAR(always_export_symbols, $1)=yes
-	  _LT_TAGVAR(file_list_spec, $1)='@'
-	  # Tell ltmain to make .lib files, not .a files.
-	  libext=lib
-	  # Tell ltmain to make .dll files, not .so files.
-	  shrext_cmds=".dll"
-	  # FIXME: Setting linknames here is a bad hack.
-	  _LT_TAGVAR(archive_cmds, $1)='$CC -o $output_objdir/$soname $libobjs $compiler_flags $deplibs -Wl,-dll~linknames='
-	  _LT_TAGVAR(archive_expsym_cmds, $1)='if test "x`$SED 1q $export_symbols`" = xEXPORTS; then
-	      $SED -n -e 's/\\\\\\\(.*\\\\\\\)/-link\\\ -EXPORT:\\\\\\\1/' -e '1\\\!p' < $export_symbols > $output_objdir/$soname.exp;
-	    else
-	      $SED -e 's/\\\\\\\(.*\\\\\\\)/-link\\\ -EXPORT:\\\\\\\1/' < $export_symbols > $output_objdir/$soname.exp;
-	    fi~
-	    $CC -o $tool_output_objdir$soname $libobjs $compiler_flags $deplibs "@$tool_output_objdir$soname.exp" -Wl,-DLL,-IMPLIB:"$tool_output_objdir$libname.dll.lib"~
-	    linknames='
-	  # The linker will not automatically build a static lib if we build a DLL.
-	  # _LT_TAGVAR(old_archive_from_new_cmds, $1)='true'
-	  _LT_TAGVAR(enable_shared_with_static_runtimes, $1)=yes
-	  # Don't use ranlib
-	  _LT_TAGVAR(old_postinstall_cmds, $1)='chmod 644 $oldlib'
-	  _LT_TAGVAR(postlink_cmds, $1)='lt_outputfile="@OUTPUT@"~
-	    lt_tool_outputfile="@TOOL_OUTPUT@"~
-	    case $lt_outputfile in
-	      *.exe|*.EXE) ;;
-	      *)
-		lt_outputfile="$lt_outputfile.exe"
-		lt_tool_outputfile="$lt_tool_outputfile.exe"
-		;;
-	    esac~
-	    func_to_tool_file "$lt_outputfile"~
-	    if test "$MANIFEST_TOOL" != ":" && test -f "$lt_outputfile.manifest"; then
-	      $MANIFEST_TOOL -manifest "$lt_tool_outputfile.manifest" -outputresource:"$lt_tool_outputfile" || exit 1;
-	      $RM "$lt_outputfile.manifest";
-	    fi'
-	  ;;
-	*)
-	  # g++
-	  # _LT_TAGVAR(hardcode_libdir_flag_spec, $1) is actually meaningless,
-	  # as there is no search path for DLLs.
-	  _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-L$libdir'
-	  _LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}--export-all-symbols'
-	  _LT_TAGVAR(allow_undefined_flag, $1)=unsupported
-	  _LT_TAGVAR(always_export_symbols, $1)=no
-	  _LT_TAGVAR(enable_shared_with_static_runtimes, $1)=yes
-
-	  if $LD --help 2>&1 | $GREP 'auto-import' > /dev/null; then
-	    _LT_TAGVAR(archive_cmds, $1)='$CC -shared -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -o $output_objdir/$soname ${wl}--enable-auto-image-base -Xlinker --out-implib -Xlinker $lib'
-	    # If the export-symbols file already is a .def file (1st line
-	    # is EXPORTS), use it as is; otherwise, prepend...
-	    _LT_TAGVAR(archive_expsym_cmds, $1)='if test "x`$SED 1q $export_symbols`" = xEXPORTS; then
-	      cp $export_symbols $output_objdir/$soname.def;
-	    else
-	      echo EXPORTS > $output_objdir/$soname.def;
-	      cat $export_symbols >> $output_objdir/$soname.def;
-	    fi~
-	    $CC -shared -nostdlib $output_objdir/$soname.def $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -o $output_objdir/$soname ${wl}--enable-auto-image-base -Xlinker --out-implib -Xlinker $lib'
-	  else
-	    _LT_TAGVAR(ld_shlibs, $1)=no
-	  fi
-	  ;;
-	esac
-	;;
-      darwin* | rhapsody*)
-        _LT_DARWIN_LINKER_FEATURES($1)
-	;;
-
-      dgux*)
-        case $cc_basename in
-          ec++*)
-	    # FIXME: insert proper C++ library support
-	    _LT_TAGVAR(ld_shlibs, $1)=no
-	    ;;
-          ghcx*)
-	    # Green Hills C++ Compiler
-	    # FIXME: insert proper C++ library support
-	    _LT_TAGVAR(ld_shlibs, $1)=no
-	    ;;
-          *)
-	    # FIXME: insert proper C++ library support
-	    _LT_TAGVAR(ld_shlibs, $1)=no
-	    ;;
-        esac
-        ;;
-
-      freebsd2.*)
-        # C++ shared libraries were reported to be fairly broken before
-        # the switch to ELF
-        _LT_TAGVAR(ld_shlibs, $1)=no
-        ;;
-
-      freebsd-elf*)
-        _LT_TAGVAR(archive_cmds_need_lc, $1)=no
-        ;;
-
-      freebsd* | dragonfly*)
-        # FreeBSD 3 and later use GNU C++ and GNU ld with standard ELF
-        # conventions
-        _LT_TAGVAR(ld_shlibs, $1)=yes
-        ;;
-
-      haiku*)
-        _LT_TAGVAR(archive_cmds, $1)='$CC -shared $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
-        _LT_TAGVAR(link_all_deplibs, $1)=yes
-        ;;
-
-      hpux9*)
-        _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}+b ${wl}$libdir'
-        _LT_TAGVAR(hardcode_libdir_separator, $1)=:
-        _LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}-E'
-        _LT_TAGVAR(hardcode_direct, $1)=yes
-        _LT_TAGVAR(hardcode_minus_L, $1)=yes # Not in the search PATH,
-				             # but as the default
-				             # location of the library.
-
-        case $cc_basename in
-          CC*)
-            # FIXME: insert proper C++ library support
-            _LT_TAGVAR(ld_shlibs, $1)=no
-            ;;
-          aCC*)
-            _LT_TAGVAR(archive_cmds, $1)='$RM $output_objdir/$soname~$CC -b ${wl}+b ${wl}$install_libdir -o $output_objdir/$soname $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags~test $output_objdir/$soname = $lib || mv $output_objdir/$soname $lib'
-            # Commands to make compiler produce verbose output that lists
-            # what "hidden" libraries, object files and flags are used when
-            # linking a shared library.
-            #
-            # There doesn't appear to be a way to prevent this compiler from
-            # explicitly linking system object files so we need to strip them
-            # from the output so that they don't get included in the library
-            # dependencies.
-            output_verbose_link_cmd='templist=`($CC -b $CFLAGS -v conftest.$objext 2>&1) | $EGREP "\-L"`; list=""; for z in $templist; do case $z in conftest.$objext) list="$list $z";; *.$objext);; *) list="$list $z";;esac; done; func_echo_all "$list"'
-            ;;
-          *)
-            if test "$GXX" = yes; then
-              _LT_TAGVAR(archive_cmds, $1)='$RM $output_objdir/$soname~$CC -shared -nostdlib $pic_flag ${wl}+b ${wl}$install_libdir -o $output_objdir/$soname $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags~test $output_objdir/$soname = $lib || mv $output_objdir/$soname $lib'
-            else
-              # FIXME: insert proper C++ library support
-              _LT_TAGVAR(ld_shlibs, $1)=no
-            fi
-            ;;
-        esac
-        ;;
-
-      hpux10*|hpux11*)
-        if test $with_gnu_ld = no; then
-	  _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}+b ${wl}$libdir'
-	  _LT_TAGVAR(hardcode_libdir_separator, $1)=:
-
-          case $host_cpu in
-            hppa*64*|ia64*)
-              ;;
-            *)
-	      _LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}-E'
-              ;;
-          esac
-        fi
-        case $host_cpu in
-          hppa*64*|ia64*)
-            _LT_TAGVAR(hardcode_direct, $1)=no
-            _LT_TAGVAR(hardcode_shlibpath_var, $1)=no
-            ;;
-          *)
-            _LT_TAGVAR(hardcode_direct, $1)=yes
-            _LT_TAGVAR(hardcode_direct_absolute, $1)=yes
-            _LT_TAGVAR(hardcode_minus_L, $1)=yes # Not in the search PATH,
-					         # but as the default
-					         # location of the library.
-            ;;
-        esac
-
-        case $cc_basename in
-          CC*)
-	    # FIXME: insert proper C++ library support
-	    _LT_TAGVAR(ld_shlibs, $1)=no
-	    ;;
-          aCC*)
-	    case $host_cpu in
-	      hppa*64*)
-	        _LT_TAGVAR(archive_cmds, $1)='$CC -b ${wl}+h ${wl}$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags'
-	        ;;
-	      ia64*)
-	        _LT_TAGVAR(archive_cmds, $1)='$CC -b ${wl}+h ${wl}$soname ${wl}+nodefaultrpath -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags'
-	        ;;
-	      *)
-	        _LT_TAGVAR(archive_cmds, $1)='$CC -b ${wl}+h ${wl}$soname ${wl}+b ${wl}$install_libdir -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags'
-	        ;;
-	    esac
-	    # Commands to make compiler produce verbose output that lists
-	    # what "hidden" libraries, object files and flags are used when
-	    # linking a shared library.
-	    #
-	    # There doesn't appear to be a way to prevent this compiler from
-	    # explicitly linking system object files so we need to strip them
-	    # from the output so that they don't get included in the library
-	    # dependencies.
-	    output_verbose_link_cmd='templist=`($CC -b $CFLAGS -v conftest.$objext 2>&1) | $GREP "\-L"`; list=""; for z in $templist; do case $z in conftest.$objext) list="$list $z";; *.$objext);; *) list="$list $z";;esac; done; func_echo_all "$list"'
-	    ;;
-          *)
-	    if test "$GXX" = yes; then
-	      if test $with_gnu_ld = no; then
-	        case $host_cpu in
-	          hppa*64*)
-	            _LT_TAGVAR(archive_cmds, $1)='$CC -shared -nostdlib -fPIC ${wl}+h ${wl}$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags'
-	            ;;
-	          ia64*)
-	            _LT_TAGVAR(archive_cmds, $1)='$CC -shared -nostdlib $pic_flag ${wl}+h ${wl}$soname ${wl}+nodefaultrpath -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags'
-	            ;;
-	          *)
-	            _LT_TAGVAR(archive_cmds, $1)='$CC -shared -nostdlib $pic_flag ${wl}+h ${wl}$soname ${wl}+b ${wl}$install_libdir -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags'
-	            ;;
-	        esac
-	      fi
-	    else
-	      # FIXME: insert proper C++ library support
-	      _LT_TAGVAR(ld_shlibs, $1)=no
-	    fi
-	    ;;
-        esac
-        ;;
-
-      interix[[3-9]]*)
-	_LT_TAGVAR(hardcode_direct, $1)=no
-	_LT_TAGVAR(hardcode_shlibpath_var, $1)=no
-	_LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-rpath,$libdir'
-	_LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}-E'
-	# Hack: On Interix 3.x, we cannot compile PIC because of a broken gcc.
-	# Instead, shared libraries are loaded at an image base (0x10000000 by
-	# default) and relocated if they conflict, which is a slow, very memory-
-	# consuming and fragmenting process.  To avoid this, we pick a random,
-	# 256 KiB-aligned image base between 0x50000000 and 0x6FFC0000 at link
-	# time.  Moving up from 0x10000000 also allows more sbrk(2) space.
-	_LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-h,$soname ${wl}--image-base,`expr ${RANDOM-$$} % 4096 / 2 \* 262144 + 1342177280` -o $lib'
-	_LT_TAGVAR(archive_expsym_cmds, $1)='sed "s,^,_," $export_symbols >$output_objdir/$soname.expsym~$CC -shared $pic_flag $libobjs $deplibs $compiler_flags ${wl}-h,$soname ${wl}--retain-symbols-file,$output_objdir/$soname.expsym ${wl}--image-base,`expr ${RANDOM-$$} % 4096 / 2 \* 262144 + 1342177280` -o $lib'
-	;;
-      irix5* | irix6*)
-        case $cc_basename in
-          CC*)
-	    # SGI C++
-	    _LT_TAGVAR(archive_cmds, $1)='$CC -shared -all -multigot $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -soname $soname `test -n "$verstring" && func_echo_all "-set_version $verstring"` -update_registry ${output_objdir}/so_locations -o $lib'
-
-	    # Archives containing C++ object files must be created using
-	    # "CC -ar", where "CC" is the IRIX C++ compiler.  This is
-	    # necessary to make sure instantiated templates are included
-	    # in the archive.
-	    _LT_TAGVAR(old_archive_cmds, $1)='$CC -ar -WR,-u -o $oldlib $oldobjs'
-	    ;;
-          *)
-	    if test "$GXX" = yes; then
-	      if test "$with_gnu_ld" = no; then
-	        _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname ${wl}$soname `test -n "$verstring" && func_echo_all "${wl}-set_version ${wl}$verstring"` ${wl}-update_registry ${wl}${output_objdir}/so_locations -o $lib'
-	      else
-	        _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag -nostdlib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname ${wl}$soname `test -n "$verstring" && func_echo_all "${wl}-set_version ${wl}$verstring"` -o $lib'
-	      fi
-	    fi
-	    _LT_TAGVAR(link_all_deplibs, $1)=yes
-	    ;;
-        esac
-        _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-rpath ${wl}$libdir'
-        _LT_TAGVAR(hardcode_libdir_separator, $1)=:
-        _LT_TAGVAR(inherit_rpath, $1)=yes
-        ;;
-
-      linux* | k*bsd*-gnu | kopensolaris*-gnu | gnu*)
-        case $cc_basename in
-          KCC*)
-	    # Kuck and Associates, Inc. (KAI) C++ Compiler
-
-	    # KCC will only create a shared library if the output file
-	    # ends with ".so" (or ".sl" for HP-UX), so rename the library
-	    # to its proper name (with version) after linking.
-	    _LT_TAGVAR(archive_cmds, $1)='tempext=`echo $shared_ext | $SED -e '\''s/\([[^()0-9A-Za-z{}]]\)/\\\\\1/g'\''`; templib=`echo $lib | $SED -e "s/\${tempext}\..*/.so/"`; $CC $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags --soname $soname -o \$templib; mv \$templib $lib'
-	    _LT_TAGVAR(archive_expsym_cmds, $1)='tempext=`echo $shared_ext | $SED -e '\''s/\([[^()0-9A-Za-z{}]]\)/\\\\\1/g'\''`; templib=`echo $lib | $SED -e "s/\${tempext}\..*/.so/"`; $CC $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags --soname $soname -o \$templib ${wl}-retain-symbols-file,$export_symbols; mv \$templib $lib'
-	    # Commands to make compiler produce verbose output that lists
-	    # what "hidden" libraries, object files and flags are used when
-	    # linking a shared library.
-	    #
-	    # There doesn't appear to be a way to prevent this compiler from
-	    # explicitly linking system object files so we need to strip them
-	    # from the output so that they don't get included in the library
-	    # dependencies.
-	    output_verbose_link_cmd='templist=`$CC $CFLAGS -v conftest.$objext -o libconftest$shared_ext 2>&1 | $GREP "ld"`; rm -f libconftest$shared_ext; list=""; for z in $templist; do case $z in conftest.$objext) list="$list $z";; *.$objext);; *) list="$list $z";;esac; done; func_echo_all "$list"'
-
-	    _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-rpath,$libdir'
-	    _LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}--export-dynamic'
-
-	    # Archives containing C++ object files must be created using
-	    # "CC -Bstatic", where "CC" is the KAI C++ compiler.
-	    _LT_TAGVAR(old_archive_cmds, $1)='$CC -Bstatic -o $oldlib $oldobjs'
-	    ;;
-	  icpc* | ecpc* )
-	    # Intel C++
-	    with_gnu_ld=yes
-	    # version 8.0 and above of icpc choke on multiply defined symbols
-	    # if we add $predep_objects and $postdep_objects; however, 7.1 and
-	    # earlier do not add the objects themselves.
-	    case `$CC -V 2>&1` in
-	      *"Version 7."*)
-	        _LT_TAGVAR(archive_cmds, $1)='$CC -shared $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname $wl$soname -o $lib'
-		_LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
-		;;
-	      *)  # Version 8.0 or newer
-	        tmp_idyn=
-	        case $host_cpu in
-		  ia64*) tmp_idyn=' -i_dynamic';;
-		esac
-	        _LT_TAGVAR(archive_cmds, $1)='$CC -shared'"$tmp_idyn"' $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
-		_LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared'"$tmp_idyn"' $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname ${wl}-retain-symbols-file $wl$export_symbols -o $lib'
-		;;
-	    esac
-	    _LT_TAGVAR(archive_cmds_need_lc, $1)=no
-	    _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-rpath,$libdir'
-	    _LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}--export-dynamic'
-	    _LT_TAGVAR(whole_archive_flag_spec, $1)='${wl}--whole-archive$convenience ${wl}--no-whole-archive'
-	    ;;
-          pgCC* | pgcpp*)
-            # Portland Group C++ compiler
-	    case `$CC -V` in
-	    *pgCC\ [[1-5]].* | *pgcpp\ [[1-5]].*)
-	      _LT_TAGVAR(prelink_cmds, $1)='tpldir=Template.dir~
-		rm -rf $tpldir~
-		$CC --prelink_objects --instantiation_dir $tpldir $objs $libobjs $compile_deplibs~
-		compile_command="$compile_command `find $tpldir -name \*.o | sort | $NL2SP`"'
-	      _LT_TAGVAR(old_archive_cmds, $1)='tpldir=Template.dir~
-		rm -rf $tpldir~
-		$CC --prelink_objects --instantiation_dir $tpldir $oldobjs$old_deplibs~
-		$AR $AR_FLAGS $oldlib$oldobjs$old_deplibs `find $tpldir -name \*.o | sort | $NL2SP`~
-		$RANLIB $oldlib'
-	      _LT_TAGVAR(archive_cmds, $1)='tpldir=Template.dir~
-		rm -rf $tpldir~
-		$CC --prelink_objects --instantiation_dir $tpldir $predep_objects $libobjs $deplibs $convenience $postdep_objects~
-		$CC -shared $pic_flag $predep_objects $libobjs $deplibs `find $tpldir -name \*.o | sort | $NL2SP` $postdep_objects $compiler_flags ${wl}-soname ${wl}$soname -o $lib'
-	      _LT_TAGVAR(archive_expsym_cmds, $1)='tpldir=Template.dir~
-		rm -rf $tpldir~
-		$CC --prelink_objects --instantiation_dir $tpldir $predep_objects $libobjs $deplibs $convenience $postdep_objects~
-		$CC -shared $pic_flag $predep_objects $libobjs $deplibs `find $tpldir -name \*.o | sort | $NL2SP` $postdep_objects $compiler_flags ${wl}-soname ${wl}$soname ${wl}-retain-symbols-file ${wl}$export_symbols -o $lib'
-	      ;;
-	    *) # Version 6 and above use weak symbols
-	      _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname ${wl}$soname -o $lib'
-	      _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $pic_flag $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname ${wl}$soname ${wl}-retain-symbols-file ${wl}$export_symbols -o $lib'
-	      ;;
-	    esac
-
-	    _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}--rpath ${wl}$libdir'
-	    _LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}--export-dynamic'
-	    _LT_TAGVAR(whole_archive_flag_spec, $1)='${wl}--whole-archive`for conv in $convenience\"\"; do test  -n \"$conv\" && new_convenience=\"$new_convenience,$conv\"; done; func_echo_all \"$new_convenience\"` ${wl}--no-whole-archive'
-            ;;
-	  cxx*)
-	    # Compaq C++
-	    _LT_TAGVAR(archive_cmds, $1)='$CC -shared $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname $wl$soname -o $lib'
-	    _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname $wl$soname  -o $lib ${wl}-retain-symbols-file $wl$export_symbols'
-
-	    runpath_var=LD_RUN_PATH
-	    _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-rpath $libdir'
-	    _LT_TAGVAR(hardcode_libdir_separator, $1)=:
-
-	    # Commands to make compiler produce verbose output that lists
-	    # what "hidden" libraries, object files and flags are used when
-	    # linking a shared library.
-	    #
-	    # There doesn't appear to be a way to prevent this compiler from
-	    # explicitly linking system object files so we need to strip them
-	    # from the output so that they don't get included in the library
-	    # dependencies.
-	    output_verbose_link_cmd='templist=`$CC -shared $CFLAGS -v conftest.$objext 2>&1 | $GREP "ld"`; templist=`func_echo_all "$templist" | $SED "s/\(^.*ld.*\)\( .*ld .*$\)/\1/"`; list=""; for z in $templist; do case $z in conftest.$objext) list="$list $z";; *.$objext);; *) list="$list $z";;esac; done; func_echo_all "X$list" | $Xsed'
-	    ;;
-	  xl* | mpixl* | bgxl*)
-	    # IBM XL 8.0 on PPC, with GNU ld
-	    _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-rpath ${wl}$libdir'
-	    _LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}--export-dynamic'
-	    _LT_TAGVAR(archive_cmds, $1)='$CC -qmkshrobj $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname -o $lib'
-	    if test "x$supports_anon_versioning" = xyes; then
-	      _LT_TAGVAR(archive_expsym_cmds, $1)='echo "{ global:" > $output_objdir/$libname.ver~
-		cat $export_symbols | sed -e "s/\(.*\)/\1;/" >> $output_objdir/$libname.ver~
-		echo "local: *; };" >> $output_objdir/$libname.ver~
-		$CC -qmkshrobj $libobjs $deplibs $compiler_flags ${wl}-soname $wl$soname ${wl}-version-script ${wl}$output_objdir/$libname.ver -o $lib'
-	    fi
-	    ;;
-	  *)
-	    case `$CC -V 2>&1 | sed 5q` in
-	    *Sun\ C*)
-	      # Sun C++ 5.9
-	      _LT_TAGVAR(no_undefined_flag, $1)=' -zdefs'
-	      _LT_TAGVAR(archive_cmds, $1)='$CC -G${allow_undefined_flag} -h$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags'
-	      _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -G${allow_undefined_flag} -h$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-retain-symbols-file ${wl}$export_symbols'
-	      _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-R$libdir'
-	      _LT_TAGVAR(whole_archive_flag_spec, $1)='${wl}--whole-archive`new_convenience=; for conv in $convenience\"\"; do test -z \"$conv\" || new_convenience=\"$new_convenience,$conv\"; done; func_echo_all \"$new_convenience\"` ${wl}--no-whole-archive'
-	      _LT_TAGVAR(compiler_needs_object, $1)=yes
-
-	      # Not sure whether something based on
-	      # $CC $CFLAGS -v conftest.$objext -o libconftest$shared_ext 2>&1
-	      # would be better.
-	      output_verbose_link_cmd='func_echo_all'
-
-	      # Archives containing C++ object files must be created using
-	      # "CC -xar", where "CC" is the Sun C++ compiler.  This is
-	      # necessary to make sure instantiated templates are included
-	      # in the archive.
-	      _LT_TAGVAR(old_archive_cmds, $1)='$CC -xar -o $oldlib $oldobjs'
-	      ;;
-	    esac
-	    ;;
-	esac
-	;;
-
-      lynxos*)
-        # FIXME: insert proper C++ library support
-	_LT_TAGVAR(ld_shlibs, $1)=no
-	;;
-
-      m88k*)
-        # FIXME: insert proper C++ library support
-        _LT_TAGVAR(ld_shlibs, $1)=no
-	;;
-
-      mvs*)
-        case $cc_basename in
-          cxx*)
-	    # FIXME: insert proper C++ library support
-	    _LT_TAGVAR(ld_shlibs, $1)=no
-	    ;;
-	  *)
-	    # FIXME: insert proper C++ library support
-	    _LT_TAGVAR(ld_shlibs, $1)=no
-	    ;;
-	esac
-	;;
-
-      netbsd*)
-        if echo __ELF__ | $CC -E - | $GREP __ELF__ >/dev/null; then
-	  _LT_TAGVAR(archive_cmds, $1)='$LD -Bshareable  -o $lib $predep_objects $libobjs $deplibs $postdep_objects $linker_flags'
-	  wlarc=
-	  _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-R$libdir'
-	  _LT_TAGVAR(hardcode_direct, $1)=yes
-	  _LT_TAGVAR(hardcode_shlibpath_var, $1)=no
-	fi
-	# Work around some broken pre-1.5 toolchains
-	output_verbose_link_cmd='$CC -shared $CFLAGS -v conftest.$objext 2>&1 | $GREP conftest.$objext | $SED -e "s:-lgcc -lc -lgcc::"'
-	;;
-
-      *nto* | *qnx*)
-        _LT_TAGVAR(ld_shlibs, $1)=yes
-	;;
-
-      openbsd2*)
-        # C++ shared libraries are fairly broken
-	_LT_TAGVAR(ld_shlibs, $1)=no
-	;;
-
-      openbsd*)
-	if test -f /usr/libexec/ld.so; then
-	  _LT_TAGVAR(hardcode_direct, $1)=yes
-	  _LT_TAGVAR(hardcode_shlibpath_var, $1)=no
-	  _LT_TAGVAR(hardcode_direct_absolute, $1)=yes
-	  _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -o $lib'
-	  _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-rpath,$libdir'
-	  if test -z "`echo __ELF__ | $CC -E - | grep __ELF__`" || test "$host_os-$host_cpu" = "openbsd2.8-powerpc"; then
-	    _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared $pic_flag $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-retain-symbols-file,$export_symbols -o $lib'
-	    _LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}-E'
-	    _LT_TAGVAR(whole_archive_flag_spec, $1)="$wlarc"'--whole-archive$convenience '"$wlarc"'--no-whole-archive'
-	  fi
-	  output_verbose_link_cmd=func_echo_all
-	else
-	  _LT_TAGVAR(ld_shlibs, $1)=no
-	fi
-	;;
-
-      osf3* | osf4* | osf5*)
-        case $cc_basename in
-          KCC*)
-	    # Kuck and Associates, Inc. (KAI) C++ Compiler
-
-	    # KCC will only create a shared library if the output file
-	    # ends with ".so" (or ".sl" for HP-UX), so rename the library
-	    # to its proper name (with version) after linking.
-	    _LT_TAGVAR(archive_cmds, $1)='tempext=`echo $shared_ext | $SED -e '\''s/\([[^()0-9A-Za-z{}]]\)/\\\\\1/g'\''`; templib=`echo "$lib" | $SED -e "s/\${tempext}\..*/.so/"`; $CC $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags --soname $soname -o \$templib; mv \$templib $lib'
-
-	    _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-rpath,$libdir'
-	    _LT_TAGVAR(hardcode_libdir_separator, $1)=:
-
-	    # Archives containing C++ object files must be created using
-	    # the KAI C++ compiler.
-	    case $host in
-	      osf3*) _LT_TAGVAR(old_archive_cmds, $1)='$CC -Bstatic -o $oldlib $oldobjs' ;;
-	      *) _LT_TAGVAR(old_archive_cmds, $1)='$CC -o $oldlib $oldobjs' ;;
-	    esac
-	    ;;
-          RCC*)
-	    # Rational C++ 2.4.1
-	    # FIXME: insert proper C++ library support
-	    _LT_TAGVAR(ld_shlibs, $1)=no
-	    ;;
-          cxx*)
-	    case $host in
-	      osf3*)
-	        _LT_TAGVAR(allow_undefined_flag, $1)=' ${wl}-expect_unresolved ${wl}\*'
-	        _LT_TAGVAR(archive_cmds, $1)='$CC -shared${allow_undefined_flag} $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname $soname `test -n "$verstring" && func_echo_all "${wl}-set_version $verstring"` -update_registry ${output_objdir}/so_locations -o $lib'
-	        _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-rpath ${wl}$libdir'
-		;;
-	      *)
-	        _LT_TAGVAR(allow_undefined_flag, $1)=' -expect_unresolved \*'
-	        _LT_TAGVAR(archive_cmds, $1)='$CC -shared${allow_undefined_flag} $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -msym -soname $soname `test -n "$verstring" && func_echo_all "-set_version $verstring"` -update_registry ${output_objdir}/so_locations -o $lib'
-	        _LT_TAGVAR(archive_expsym_cmds, $1)='for i in `cat $export_symbols`; do printf "%s %s\\n" -exported_symbol "\$i" >> $lib.exp; done~
-	          echo "-hidden">> $lib.exp~
-	          $CC -shared$allow_undefined_flag $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags -msym -soname $soname ${wl}-input ${wl}$lib.exp  `test -n "$verstring" && $ECHO "-set_version $verstring"` -update_registry ${output_objdir}/so_locations -o $lib~
-	          $RM $lib.exp'
-	        _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-rpath $libdir'
-		;;
-	    esac
-
-	    _LT_TAGVAR(hardcode_libdir_separator, $1)=:
-
-	    # Commands to make compiler produce verbose output that lists
-	    # what "hidden" libraries, object files and flags are used when
-	    # linking a shared library.
-	    #
-	    # There doesn't appear to be a way to prevent this compiler from
-	    # explicitly linking system object files so we need to strip them
-	    # from the output so that they don't get included in the library
-	    # dependencies.
-	    output_verbose_link_cmd='templist=`$CC -shared $CFLAGS -v conftest.$objext 2>&1 | $GREP "ld" | $GREP -v "ld:"`; templist=`func_echo_all "$templist" | $SED "s/\(^.*ld.*\)\( .*ld.*$\)/\1/"`; list=""; for z in $templist; do case $z in conftest.$objext) list="$list $z";; *.$objext);; *) list="$list $z";;esac; done; func_echo_all "$list"'
-	    ;;
-	  *)
-	    if test "$GXX" = yes && test "$with_gnu_ld" = no; then
-	      _LT_TAGVAR(allow_undefined_flag, $1)=' ${wl}-expect_unresolved ${wl}\*'
-	      case $host in
-	        osf3*)
-	          _LT_TAGVAR(archive_cmds, $1)='$CC -shared -nostdlib ${allow_undefined_flag} $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-soname ${wl}$soname `test -n "$verstring" && func_echo_all "${wl}-set_version ${wl}$verstring"` ${wl}-update_registry ${wl}${output_objdir}/so_locations -o $lib'
-		  ;;
-	        *)
-	          _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag -nostdlib ${allow_undefined_flag} $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-msym ${wl}-soname ${wl}$soname `test -n "$verstring" && func_echo_all "${wl}-set_version ${wl}$verstring"` ${wl}-update_registry ${wl}${output_objdir}/so_locations -o $lib'
-		  ;;
-	      esac
-
-	      _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-rpath ${wl}$libdir'
-	      _LT_TAGVAR(hardcode_libdir_separator, $1)=:
-
-	      # Commands to make compiler produce verbose output that lists
-	      # what "hidden" libraries, object files and flags are used when
-	      # linking a shared library.
-	      output_verbose_link_cmd='$CC -shared $CFLAGS -v conftest.$objext 2>&1 | $GREP -v "^Configured with:" | $GREP "\-L"'
-
-	    else
-	      # FIXME: insert proper C++ library support
-	      _LT_TAGVAR(ld_shlibs, $1)=no
-	    fi
-	    ;;
-        esac
-        ;;
-
-      psos*)
-        # FIXME: insert proper C++ library support
-        _LT_TAGVAR(ld_shlibs, $1)=no
-        ;;
-
-      sunos4*)
-        case $cc_basename in
-          CC*)
-	    # Sun C++ 4.x
-	    # FIXME: insert proper C++ library support
-	    _LT_TAGVAR(ld_shlibs, $1)=no
-	    ;;
-          lcc*)
-	    # Lucid
-	    # FIXME: insert proper C++ library support
-	    _LT_TAGVAR(ld_shlibs, $1)=no
-	    ;;
-          *)
-	    # FIXME: insert proper C++ library support
-	    _LT_TAGVAR(ld_shlibs, $1)=no
-	    ;;
-        esac
-        ;;
-
-      solaris*)
-        case $cc_basename in
-          CC* | sunCC*)
-	    # Sun C++ 4.2, 5.x and Centerline C++
-            _LT_TAGVAR(archive_cmds_need_lc,$1)=yes
-	    _LT_TAGVAR(no_undefined_flag, $1)=' -zdefs'
-	    _LT_TAGVAR(archive_cmds, $1)='$CC -G${allow_undefined_flag}  -h$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags'
-	    _LT_TAGVAR(archive_expsym_cmds, $1)='echo "{ global:" > $lib.exp~cat $export_symbols | $SED -e "s/\(.*\)/\1;/" >> $lib.exp~echo "local: *; };" >> $lib.exp~
-	      $CC -G${allow_undefined_flag} ${wl}-M ${wl}$lib.exp -h$soname -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags~$RM $lib.exp'
-
-	    _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='-R$libdir'
-	    _LT_TAGVAR(hardcode_shlibpath_var, $1)=no
-	    case $host_os in
-	      solaris2.[[0-5]] | solaris2.[[0-5]].*) ;;
-	      *)
-		# The compiler driver will combine and reorder linker options,
-		# but understands `-z linker_flag'.
-	        # Supported since Solaris 2.6 (maybe 2.5.1?)
-		_LT_TAGVAR(whole_archive_flag_spec, $1)='-z allextract$convenience -z defaultextract'
-	        ;;
-	    esac
-	    _LT_TAGVAR(link_all_deplibs, $1)=yes
-
-	    output_verbose_link_cmd='func_echo_all'
-
-	    # Archives containing C++ object files must be created using
-	    # "CC -xar", where "CC" is the Sun C++ compiler.  This is
-	    # necessary to make sure instantiated templates are included
-	    # in the archive.
-	    _LT_TAGVAR(old_archive_cmds, $1)='$CC -xar -o $oldlib $oldobjs'
-	    ;;
-          gcx*)
-	    # Green Hills C++ Compiler
-	    _LT_TAGVAR(archive_cmds, $1)='$CC -shared $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-h $wl$soname -o $lib'
-
-	    # The C++ compiler must be used to create the archive.
-	    _LT_TAGVAR(old_archive_cmds, $1)='$CC $LDFLAGS -archive -o $oldlib $oldobjs'
-	    ;;
-          *)
-	    # GNU C++ compiler with Solaris linker
-	    if test "$GXX" = yes && test "$with_gnu_ld" = no; then
-	      _LT_TAGVAR(no_undefined_flag, $1)=' ${wl}-z ${wl}defs'
-	      if $CC --version | $GREP -v '^2\.7' > /dev/null; then
-	        _LT_TAGVAR(archive_cmds, $1)='$CC -shared $pic_flag -nostdlib $LDFLAGS $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-h $wl$soname -o $lib'
-	        _LT_TAGVAR(archive_expsym_cmds, $1)='echo "{ global:" > $lib.exp~cat $export_symbols | $SED -e "s/\(.*\)/\1;/" >> $lib.exp~echo "local: *; };" >> $lib.exp~
-		  $CC -shared $pic_flag -nostdlib ${wl}-M $wl$lib.exp -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags~$RM $lib.exp'
-
-	        # Commands to make compiler produce verbose output that lists
-	        # what "hidden" libraries, object files and flags are used when
-	        # linking a shared library.
-	        output_verbose_link_cmd='$CC -shared $CFLAGS -v conftest.$objext 2>&1 | $GREP -v "^Configured with:" | $GREP "\-L"'
-	      else
-	        # g++ 2.7 appears to require `-G' NOT `-shared' on this
-	        # platform.
-	        _LT_TAGVAR(archive_cmds, $1)='$CC -G -nostdlib $LDFLAGS $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags ${wl}-h $wl$soname -o $lib'
-	        _LT_TAGVAR(archive_expsym_cmds, $1)='echo "{ global:" > $lib.exp~cat $export_symbols | $SED -e "s/\(.*\)/\1;/" >> $lib.exp~echo "local: *; };" >> $lib.exp~
-		  $CC -G -nostdlib ${wl}-M $wl$lib.exp -o $lib $predep_objects $libobjs $deplibs $postdep_objects $compiler_flags~$RM $lib.exp'
-
-	        # Commands to make compiler produce verbose output that lists
-	        # what "hidden" libraries, object files and flags are used when
-	        # linking a shared library.
-	        output_verbose_link_cmd='$CC -G $CFLAGS -v conftest.$objext 2>&1 | $GREP -v "^Configured with:" | $GREP "\-L"'
-	      fi
-
-	      _LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-R $wl$libdir'
-	      case $host_os in
-		solaris2.[[0-5]] | solaris2.[[0-5]].*) ;;
-		*)
-		  _LT_TAGVAR(whole_archive_flag_spec, $1)='${wl}-z ${wl}allextract$convenience ${wl}-z ${wl}defaultextract'
-		  ;;
-	      esac
-	    fi
-	    ;;
-        esac
-        ;;
-
-    sysv4*uw2* | sysv5OpenUNIX* | sysv5UnixWare7.[[01]].[[10]]* | unixware7* | sco3.2v5.0.[[024]]*)
-      _LT_TAGVAR(no_undefined_flag, $1)='${wl}-z,text'
-      _LT_TAGVAR(archive_cmds_need_lc, $1)=no
-      _LT_TAGVAR(hardcode_shlibpath_var, $1)=no
-      runpath_var='LD_RUN_PATH'
-
-      case $cc_basename in
-        CC*)
-	  _LT_TAGVAR(archive_cmds, $1)='$CC -G ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
-	  _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -G ${wl}-Bexport:$export_symbols ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
-	  ;;
-	*)
-	  _LT_TAGVAR(archive_cmds, $1)='$CC -shared ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
-	  _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared ${wl}-Bexport:$export_symbols ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
-	  ;;
-      esac
-      ;;
-
-      sysv5* | sco3.2v5* | sco5v6*)
-	# Note: We can NOT use -z defs as we might desire, because we do not
-	# link with -lc, and that would cause any symbols used from libc to
-	# always be unresolved, which means just about no library would
-	# ever link correctly.  If we're not using GNU ld we use -z text
-	# though, which does catch some bad symbols but isn't as heavy-handed
-	# as -z defs.
-	_LT_TAGVAR(no_undefined_flag, $1)='${wl}-z,text'
-	_LT_TAGVAR(allow_undefined_flag, $1)='${wl}-z,nodefs'
-	_LT_TAGVAR(archive_cmds_need_lc, $1)=no
-	_LT_TAGVAR(hardcode_shlibpath_var, $1)=no
-	_LT_TAGVAR(hardcode_libdir_flag_spec, $1)='${wl}-R,$libdir'
-	_LT_TAGVAR(hardcode_libdir_separator, $1)=':'
-	_LT_TAGVAR(link_all_deplibs, $1)=yes
-	_LT_TAGVAR(export_dynamic_flag_spec, $1)='${wl}-Bexport'
-	runpath_var='LD_RUN_PATH'
-
-	case $cc_basename in
-          CC*)
-	    _LT_TAGVAR(archive_cmds, $1)='$CC -G ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
-	    _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -G ${wl}-Bexport:$export_symbols ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
-	    _LT_TAGVAR(old_archive_cmds, $1)='$CC -Tprelink_objects $oldobjs~
-	      '"$_LT_TAGVAR(old_archive_cmds, $1)"
-	    _LT_TAGVAR(reload_cmds, $1)='$CC -Tprelink_objects $reload_objs~
-	      '"$_LT_TAGVAR(reload_cmds, $1)"
-	    ;;
-	  *)
-	    _LT_TAGVAR(archive_cmds, $1)='$CC -shared ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
-	    _LT_TAGVAR(archive_expsym_cmds, $1)='$CC -shared ${wl}-Bexport:$export_symbols ${wl}-h,$soname -o $lib $libobjs $deplibs $compiler_flags'
-	    ;;
-	esac
-      ;;
-
-      tandem*)
-        case $cc_basename in
-          NCC*)
-	    # NonStop-UX NCC 3.20
-	    # FIXME: insert proper C++ library support
-	    _LT_TAGVAR(ld_shlibs, $1)=no
-	    ;;
-          *)
-	    # FIXME: insert proper C++ library support
-	    _LT_TAGVAR(ld_shlibs, $1)=no
-	    ;;
-        esac
-        ;;
-
-      vxworks*)
-        # FIXME: insert proper C++ library support
-        _LT_TAGVAR(ld_shlibs, $1)=no
-        ;;
-
-      *)
-        # FIXME: insert proper C++ library support
-        _LT_TAGVAR(ld_shlibs, $1)=no
-        ;;
-    esac
-
-    AC_MSG_RESULT([$_LT_TAGVAR(ld_shlibs, $1)])
-    test "$_LT_TAGVAR(ld_shlibs, $1)" = no && can_build_shared=no
-
-    _LT_TAGVAR(GCC, $1)="$GXX"
-    _LT_TAGVAR(LD, $1)="$LD"
-
-    ## CAVEAT EMPTOR:
-    ## There is no encapsulation within the following macros, do not change
-    ## the running order or otherwise move them around unless you know exactly
-    ## what you are doing...
-    _LT_SYS_HIDDEN_LIBDEPS($1)
-    _LT_COMPILER_PIC($1)
-    _LT_COMPILER_C_O($1)
-    _LT_COMPILER_FILE_LOCKS($1)
-    _LT_LINKER_SHLIBS($1)
-    _LT_SYS_DYNAMIC_LINKER($1)
-    _LT_LINKER_HARDCODE_LIBPATH($1)
-
-    _LT_CONFIG($1)
-  fi # test -n "$compiler"
-
-  CC=$lt_save_CC
-  CFLAGS=$lt_save_CFLAGS
-  LDCXX=$LD
-  LD=$lt_save_LD
-  GCC=$lt_save_GCC
-  with_gnu_ld=$lt_save_with_gnu_ld
-  lt_cv_path_LDCXX=$lt_cv_path_LD
-  lt_cv_path_LD=$lt_save_path_LD
-  lt_cv_prog_gnu_ldcxx=$lt_cv_prog_gnu_ld
-  lt_cv_prog_gnu_ld=$lt_save_with_gnu_ld
-fi # test "$_lt_caught_CXX_error" != yes
-
-AC_LANG_POP
-])# _LT_LANG_CXX_CONFIG
-
-
-# _LT_FUNC_STRIPNAME_CNF
-# ----------------------
-# func_stripname_cnf prefix suffix name
-# strip PREFIX and SUFFIX off of NAME.
-# PREFIX and SUFFIX must not contain globbing or regex special
-# characters, hashes, or percent signs, but SUFFIX may contain a leading
-# dot (in which case that matches only a dot).
-#
-# This function is identical to the (non-XSI) version of func_stripname,
-# except this one can be used by m4 code that may be executed by configure,
-# rather than the libtool script.
-m4_defun([_LT_FUNC_STRIPNAME_CNF],[dnl
-AC_REQUIRE([_LT_DECL_SED])
-AC_REQUIRE([_LT_PROG_ECHO_BACKSLASH])
-func_stripname_cnf ()
-{
-  case ${2} in
-  .*) func_stripname_result=`$ECHO "${3}" | $SED "s%^${1}%%; s%\\\\${2}\$%%"`;;
-  *)  func_stripname_result=`$ECHO "${3}" | $SED "s%^${1}%%; s%${2}\$%%"`;;
-  esac
-} # func_stripname_cnf
-])# _LT_FUNC_STRIPNAME_CNF
-
-# _LT_SYS_HIDDEN_LIBDEPS([TAGNAME])
-# ---------------------------------
-# Figure out "hidden" library dependencies from verbose
-# compiler output when linking a shared library.
-# Parse the compiler output and extract the necessary
-# objects, libraries and library flags.
-m4_defun([_LT_SYS_HIDDEN_LIBDEPS],
-[m4_require([_LT_FILEUTILS_DEFAULTS])dnl
-AC_REQUIRE([_LT_FUNC_STRIPNAME_CNF])dnl
-# Dependencies to place before and after the object being linked:
-_LT_TAGVAR(predep_objects, $1)=
-_LT_TAGVAR(postdep_objects, $1)=
-_LT_TAGVAR(predeps, $1)=
-_LT_TAGVAR(postdeps, $1)=
-_LT_TAGVAR(compiler_lib_search_path, $1)=
-
-dnl we can't use the lt_simple_compile_test_code here,
-dnl because it contains code intended for an executable,
-dnl not a library.  It's possible we should let each
-dnl tag define a new lt_????_link_test_code variable,
-dnl but it's only used here...
-m4_if([$1], [], [cat > conftest.$ac_ext <<_LT_EOF
-int a;
-void foo (void) { a = 0; }
-_LT_EOF
-], [$1], [CXX], [cat > conftest.$ac_ext <<_LT_EOF
-class Foo
-{
-public:
-  Foo (void) { a = 0; }
-private:
-  int a;
-};
-_LT_EOF
-], [$1], [F77], [cat > conftest.$ac_ext <<_LT_EOF
-      subroutine foo
-      implicit none
-      integer*4 a
-      a=0
-      return
-      end
-_LT_EOF
-], [$1], [FC], [cat > conftest.$ac_ext <<_LT_EOF
-      subroutine foo
-      implicit none
-      integer a
-      a=0
-      return
-      end
-_LT_EOF
-], [$1], [GCJ], [cat > conftest.$ac_ext <<_LT_EOF
-public class foo {
-  private int a;
-  public void bar (void) {
-    a = 0;
-  }
-};
-_LT_EOF
-], [$1], [GO], [cat > conftest.$ac_ext <<_LT_EOF
-package foo
-func foo() {
-}
-_LT_EOF
-])
-
-_lt_libdeps_save_CFLAGS=$CFLAGS
-case "$CC $CFLAGS " in #(
-*\ -flto*\ *) CFLAGS="$CFLAGS -fno-lto" ;;
-*\ -fwhopr*\ *) CFLAGS="$CFLAGS -fno-whopr" ;;
-*\ -fuse-linker-plugin*\ *) CFLAGS="$CFLAGS -fno-use-linker-plugin" ;;
-esac
-
-dnl Parse the compiler output and extract the necessary
-dnl objects, libraries and library flags.
-if AC_TRY_EVAL(ac_compile); then
-  # Parse the compiler output and extract the necessary
-  # objects, libraries and library flags.
-
-  # Sentinel used to keep track of whether or not we are before
-  # the conftest object file.
-  pre_test_object_deps_done=no
-
-  for p in `eval "$output_verbose_link_cmd"`; do
-    case ${prev}${p} in
-
-    -L* | -R* | -l*)
-       # Some compilers place space between "-{L,R}" and the path.
-       # Remove the space.
-       if test $p = "-L" ||
-          test $p = "-R"; then
-	 prev=$p
-	 continue
-       fi
-
-       # Expand the sysroot to ease extracting the directories later.
-       if test -z "$prev"; then
-         case $p in
-         -L*) func_stripname_cnf '-L' '' "$p"; prev=-L; p=$func_stripname_result ;;
-         -R*) func_stripname_cnf '-R' '' "$p"; prev=-R; p=$func_stripname_result ;;
-         -l*) func_stripname_cnf '-l' '' "$p"; prev=-l; p=$func_stripname_result ;;
-         esac
-       fi
-       case $p in
-       =*) func_stripname_cnf '=' '' "$p"; p=$lt_sysroot$func_stripname_result ;;
-       esac
-       if test "$pre_test_object_deps_done" = no; then
-	 case ${prev} in
-	 -L | -R)
-	   # Internal compiler library paths should come after those
-	   # provided by the user.  The postdeps already come after the
-	   # user supplied libs so there is no need to process them.
-	   if test -z "$_LT_TAGVAR(compiler_lib_search_path, $1)"; then
-	     _LT_TAGVAR(compiler_lib_search_path, $1)="${prev}${p}"
-	   else
-	     _LT_TAGVAR(compiler_lib_search_path, $1)="${_LT_TAGVAR(compiler_lib_search_path, $1)} ${prev}${p}"
-	   fi
-	   ;;
-	 # The "-l" case would never come before the object being
-	 # linked, so don't bother handling this case.
-	 esac
-       else
-	 if test -z "$_LT_TAGVAR(postdeps, $1)"; then
-	   _LT_TAGVAR(postdeps, $1)="${prev}${p}"
-	 else
-	   _LT_TAGVAR(postdeps, $1)="${_LT_TAGVAR(postdeps, $1)} ${prev}${p}"
-	 fi
-       fi
-       prev=
-       ;;
-
-    *.lto.$objext) ;; # Ignore GCC LTO objects
-    *.$objext)
-       # This assumes that the test object file only shows up
-       # once in the compiler output.
-       if test "$p" = "conftest.$objext"; then
-	 pre_test_object_deps_done=yes
-	 continue
-       fi
-
-       if test "$pre_test_object_deps_done" = no; then
-	 if test -z "$_LT_TAGVAR(predep_objects, $1)"; then
-	   _LT_TAGVAR(predep_objects, $1)="$p"
-	 else
-	   _LT_TAGVAR(predep_objects, $1)="$_LT_TAGVAR(predep_objects, $1) $p"
-	 fi
-       else
-	 if test -z "$_LT_TAGVAR(postdep_objects, $1)"; then
-	   _LT_TAGVAR(postdep_objects, $1)="$p"
-	 else
-	   _LT_TAGVAR(postdep_objects, $1)="$_LT_TAGVAR(postdep_objects, $1) $p"
-	 fi
-       fi
-       ;;
-
-    *) ;; # Ignore the rest.
-
-    esac
-  done
-
-  # Clean up.
-  rm -f a.out a.exe
-else
-  echo "libtool.m4: error: problem compiling $1 test program"
-fi
-
-$RM -f conftest.$objext
-CFLAGS=$_lt_libdeps_save_CFLAGS
-
-# PORTME: override above test on systems where it is broken
-m4_if([$1], [CXX],
-[case $host_os in
-interix[[3-9]]*)
-  # Interix 3.5 installs completely hosed .la files for C++, so rather than
-  # hack all around it, let's just trust "g++" to DTRT.
-  _LT_TAGVAR(predep_objects,$1)=
-  _LT_TAGVAR(postdep_objects,$1)=
-  _LT_TAGVAR(postdeps,$1)=
-  ;;
-
-linux*)
-  case `$CC -V 2>&1 | sed 5q` in
-  *Sun\ C*)
-    # Sun C++ 5.9
-
-    # The more standards-conforming stlport4 library is
-    # incompatible with the Cstd library. Avoid specifying
-    # it if it's in CXXFLAGS. Ignore libCrun as
-    # -library=stlport4 depends on it.
-    case " $CXX $CXXFLAGS " in
-    *" -library=stlport4 "*)
-      solaris_use_stlport4=yes
-      ;;
-    esac
-
-    if test "$solaris_use_stlport4" != yes; then
-      _LT_TAGVAR(postdeps,$1)='-library=Cstd -library=Crun'
-    fi
-    ;;
-  esac
-  ;;
-
-solaris*)
-  case $cc_basename in
-  CC* | sunCC*)
-    # The more standards-conforming stlport4 library is
-    # incompatible with the Cstd library. Avoid specifying
-    # it if it's in CXXFLAGS. Ignore libCrun as
-    # -library=stlport4 depends on it.
-    case " $CXX $CXXFLAGS " in
-    *" -library=stlport4 "*)
-      solaris_use_stlport4=yes
-      ;;
-    esac
-
-    # Adding this requires a known-good setup of shared libraries for
-    # Sun compiler versions before 5.6, else PIC objects from an old
-    # archive will be linked into the output, leading to subtle bugs.
-    if test "$solaris_use_stlport4" != yes; then
-      _LT_TAGVAR(postdeps,$1)='-library=Cstd -library=Crun'
-    fi
-    ;;
-  esac
-  ;;
-esac
-])
-
-case " $_LT_TAGVAR(postdeps, $1) " in
-*" -lc "*) _LT_TAGVAR(archive_cmds_need_lc, $1)=no ;;
-esac
- _LT_TAGVAR(compiler_lib_search_dirs, $1)=
-if test -n "${_LT_TAGVAR(compiler_lib_search_path, $1)}"; then
- _LT_TAGVAR(compiler_lib_search_dirs, $1)=`echo " ${_LT_TAGVAR(compiler_lib_search_path, $1)}" | ${SED} -e 's! -L! !g' -e 's!^ !!'`
-fi
-_LT_TAGDECL([], [compiler_lib_search_dirs], [1],
-    [The directories searched by this compiler when creating a shared library])
-_LT_TAGDECL([], [predep_objects], [1],
-    [Dependencies to place before and after the objects being linked to
-    create a shared library])
-_LT_TAGDECL([], [postdep_objects], [1])
-_LT_TAGDECL([], [predeps], [1])
-_LT_TAGDECL([], [postdeps], [1])
-_LT_TAGDECL([], [compiler_lib_search_path], [1],
-    [The library search path used internally by the compiler when linking
-    a shared library])
-])# _LT_SYS_HIDDEN_LIBDEPS
-
-
-# _LT_LANG_F77_CONFIG([TAG])
-# --------------------------
-# Ensure that the configuration variables for a Fortran 77 compiler are
-# suitably defined.  These variables are subsequently used by _LT_CONFIG
-# to write the compiler configuration to `libtool'.
-m4_defun([_LT_LANG_F77_CONFIG],
-[AC_LANG_PUSH(Fortran 77)
-if test -z "$F77" || test "X$F77" = "Xno"; then
-  _lt_disable_F77=yes
-fi
-
-_LT_TAGVAR(archive_cmds_need_lc, $1)=no
-_LT_TAGVAR(allow_undefined_flag, $1)=
-_LT_TAGVAR(always_export_symbols, $1)=no
-_LT_TAGVAR(archive_expsym_cmds, $1)=
-_LT_TAGVAR(export_dynamic_flag_spec, $1)=
-_LT_TAGVAR(hardcode_direct, $1)=no
-_LT_TAGVAR(hardcode_direct_absolute, $1)=no
-_LT_TAGVAR(hardcode_libdir_flag_spec, $1)=
-_LT_TAGVAR(hardcode_libdir_separator, $1)=
-_LT_TAGVAR(hardcode_minus_L, $1)=no
-_LT_TAGVAR(hardcode_automatic, $1)=no
-_LT_TAGVAR(inherit_rpath, $1)=no
-_LT_TAGVAR(module_cmds, $1)=
-_LT_TAGVAR(module_expsym_cmds, $1)=
-_LT_TAGVAR(link_all_deplibs, $1)=unknown
-_LT_TAGVAR(old_archive_cmds, $1)=$old_archive_cmds
-_LT_TAGVAR(reload_flag, $1)=$reload_flag
-_LT_TAGVAR(reload_cmds, $1)=$reload_cmds
-_LT_TAGVAR(no_undefined_flag, $1)=
-_LT_TAGVAR(whole_archive_flag_spec, $1)=
-_LT_TAGVAR(enable_shared_with_static_runtimes, $1)=no
-
-# Source file extension for f77 test sources.
-ac_ext=f
-
-# Object file extension for compiled f77 test sources.
-objext=o
-_LT_TAGVAR(objext, $1)=$objext
-
-# No sense in running all these tests if we already determined that
-# the F77 compiler isn't working.  Some variables (like enable_shared)
-# are currently assumed to apply to all compilers on this platform,
-# and will be corrupted by setting them based on a non-working compiler.
-if test "$_lt_disable_F77" != yes; then
-  # Code to be used in simple compile tests
-  lt_simple_compile_test_code="\
-      subroutine t
-      return
-      end
-"
-
-  # Code to be used in simple link tests
-  lt_simple_link_test_code="\
-      program t
-      end
-"
-
-  # ltmain only uses $CC for tagged configurations so make sure $CC is set.
-  _LT_TAG_COMPILER
-
-  # save warnings/boilerplate of simple test code
-  _LT_COMPILER_BOILERPLATE
-  _LT_LINKER_BOILERPLATE
-
-  # Allow CC to be a program name with arguments.
-  lt_save_CC="$CC"
-  lt_save_GCC=$GCC
-  lt_save_CFLAGS=$CFLAGS
-  CC=${F77-"f77"}
-  CFLAGS=$FFLAGS
-  compiler=$CC
-  _LT_TAGVAR(compiler, $1)=$CC
-  _LT_CC_BASENAME([$compiler])
-  GCC=$G77
-  if test -n "$compiler"; then
-    AC_MSG_CHECKING([if libtool supports shared libraries])
-    AC_MSG_RESULT([$can_build_shared])
-
-    AC_MSG_CHECKING([whether to build shared libraries])
-    test "$can_build_shared" = "no" && enable_shared=no
-
-    # On AIX, shared libraries and static libraries use the same namespace, and
-    # are all built from PIC.
-    case $host_os in
-      aix3*)
-        test "$enable_shared" = yes && enable_static=no
-        if test -n "$RANLIB"; then
-          archive_cmds="$archive_cmds~\$RANLIB \$lib"
-          postinstall_cmds='$RANLIB $lib'
-        fi
-        ;;
-      aix[[4-9]]*)
-	if test "$host_cpu" != ia64 && test "$aix_use_runtimelinking" = no ; then
-	  test "$enable_shared" = yes && enable_static=no
-	fi
-        ;;
-    esac
-    AC_MSG_RESULT([$enable_shared])
-
-    AC_MSG_CHECKING([whether to build static libraries])
-    # Make sure either enable_shared or enable_static is yes.
-    test "$enable_shared" = yes || enable_static=yes
-    AC_MSG_RESULT([$enable_static])
-
-    _LT_TAGVAR(GCC, $1)="$G77"
-    _LT_TAGVAR(LD, $1)="$LD"
-
-    ## CAVEAT EMPTOR:
-    ## There is no encapsulation within the following macros, do not change
-    ## the running order or otherwise move them around unless you know exactly
-    ## what you are doing...
-    _LT_COMPILER_PIC($1)
-    _LT_COMPILER_C_O($1)
-    _LT_COMPILER_FILE_LOCKS($1)
-    _LT_LINKER_SHLIBS($1)
-    _LT_SYS_DYNAMIC_LINKER($1)
-    _LT_LINKER_HARDCODE_LIBPATH($1)
-
-    _LT_CONFIG($1)
-  fi # test -n "$compiler"
-
-  GCC=$lt_save_GCC
-  CC="$lt_save_CC"
-  CFLAGS="$lt_save_CFLAGS"
-fi # test "$_lt_disable_F77" != yes
-
-AC_LANG_POP
-])# _LT_LANG_F77_CONFIG
-
-
-# _LT_LANG_FC_CONFIG([TAG])
-# -------------------------
-# Ensure that the configuration variables for a Fortran compiler are
-# suitably defined.  These variables are subsequently used by _LT_CONFIG
-# to write the compiler configuration to `libtool'.
-m4_defun([_LT_LANG_FC_CONFIG],
-[AC_LANG_PUSH(Fortran)
-
-if test -z "$FC" || test "X$FC" = "Xno"; then
-  _lt_disable_FC=yes
-fi
-
-_LT_TAGVAR(archive_cmds_need_lc, $1)=no
-_LT_TAGVAR(allow_undefined_flag, $1)=
-_LT_TAGVAR(always_export_symbols, $1)=no
-_LT_TAGVAR(archive_expsym_cmds, $1)=
-_LT_TAGVAR(export_dynamic_flag_spec, $1)=
-_LT_TAGVAR(hardcode_direct, $1)=no
-_LT_TAGVAR(hardcode_direct_absolute, $1)=no
-_LT_TAGVAR(hardcode_libdir_flag_spec, $1)=
-_LT_TAGVAR(hardcode_libdir_separator, $1)=
-_LT_TAGVAR(hardcode_minus_L, $1)=no
-_LT_TAGVAR(hardcode_automatic, $1)=no
-_LT_TAGVAR(inherit_rpath, $1)=no
-_LT_TAGVAR(module_cmds, $1)=
-_LT_TAGVAR(module_expsym_cmds, $1)=
-_LT_TAGVAR(link_all_deplibs, $1)=unknown
-_LT_TAGVAR(old_archive_cmds, $1)=$old_archive_cmds
-_LT_TAGVAR(reload_flag, $1)=$reload_flag
-_LT_TAGVAR(reload_cmds, $1)=$reload_cmds
-_LT_TAGVAR(no_undefined_flag, $1)=
-_LT_TAGVAR(whole_archive_flag_spec, $1)=
-_LT_TAGVAR(enable_shared_with_static_runtimes, $1)=no
-
-# Source file extension for fc test sources.
-ac_ext=${ac_fc_srcext-f}
-
-# Object file extension for compiled fc test sources.
-objext=o
-_LT_TAGVAR(objext, $1)=$objext
-
-# No sense in running all these tests if we already determined that
-# the FC compiler isn't working.  Some variables (like enable_shared)
-# are currently assumed to apply to all compilers on this platform,
-# and will be corrupted by setting them based on a non-working compiler.
-if test "$_lt_disable_FC" != yes; then
-  # Code to be used in simple compile tests
-  lt_simple_compile_test_code="\
-      subroutine t
-      return
-      end
-"
-
-  # Code to be used in simple link tests
-  lt_simple_link_test_code="\
-      program t
-      end
-"
-
-  # ltmain only uses $CC for tagged configurations so make sure $CC is set.
-  _LT_TAG_COMPILER
-
-  # save warnings/boilerplate of simple test code
-  _LT_COMPILER_BOILERPLATE
-  _LT_LINKER_BOILERPLATE
-
-  # Allow CC to be a program name with arguments.
-  lt_save_CC="$CC"
-  lt_save_GCC=$GCC
-  lt_save_CFLAGS=$CFLAGS
-  CC=${FC-"f95"}
-  CFLAGS=$FCFLAGS
-  compiler=$CC
-  GCC=$ac_cv_fc_compiler_gnu
-
-  _LT_TAGVAR(compiler, $1)=$CC
-  _LT_CC_BASENAME([$compiler])
-
-  if test -n "$compiler"; then
-    AC_MSG_CHECKING([if libtool supports shared libraries])
-    AC_MSG_RESULT([$can_build_shared])
-
-    AC_MSG_CHECKING([whether to build shared libraries])
-    test "$can_build_shared" = "no" && enable_shared=no
-
-    # On AIX, shared libraries and static libraries use the same namespace, and
-    # are all built from PIC.
-    case $host_os in
-      aix3*)
-        test "$enable_shared" = yes && enable_static=no
-        if test -n "$RANLIB"; then
-          archive_cmds="$archive_cmds~\$RANLIB \$lib"
-          postinstall_cmds='$RANLIB $lib'
-        fi
-        ;;
-      aix[[4-9]]*)
-	if test "$host_cpu" != ia64 && test "$aix_use_runtimelinking" = no ; then
-	  test "$enable_shared" = yes && enable_static=no
-	fi
-        ;;
-    esac
-    AC_MSG_RESULT([$enable_shared])
-
-    AC_MSG_CHECKING([whether to build static libraries])
-    # Make sure either enable_shared or enable_static is yes.
-    test "$enable_shared" = yes || enable_static=yes
-    AC_MSG_RESULT([$enable_static])
-
-    _LT_TAGVAR(GCC, $1)="$ac_cv_fc_compiler_gnu"
-    _LT_TAGVAR(LD, $1)="$LD"
-
-    ## CAVEAT EMPTOR:
-    ## There is no encapsulation within the following macros, do not change
-    ## the running order or otherwise move them around unless you know exactly
-    ## what you are doing...
-    _LT_SYS_HIDDEN_LIBDEPS($1)
-    _LT_COMPILER_PIC($1)
-    _LT_COMPILER_C_O($1)
-    _LT_COMPILER_FILE_LOCKS($1)
-    _LT_LINKER_SHLIBS($1)
-    _LT_SYS_DYNAMIC_LINKER($1)
-    _LT_LINKER_HARDCODE_LIBPATH($1)
-
-    _LT_CONFIG($1)
-  fi # test -n "$compiler"
-
-  GCC=$lt_save_GCC
-  CC=$lt_save_CC
-  CFLAGS=$lt_save_CFLAGS
-fi # test "$_lt_disable_FC" != yes
-
-AC_LANG_POP
-])# _LT_LANG_FC_CONFIG
-
-
-# _LT_LANG_GCJ_CONFIG([TAG])
-# --------------------------
-# Ensure that the configuration variables for the GNU Java Compiler compiler
-# are suitably defined.  These variables are subsequently used by _LT_CONFIG
-# to write the compiler configuration to `libtool'.
-m4_defun([_LT_LANG_GCJ_CONFIG],
-[AC_REQUIRE([LT_PROG_GCJ])dnl
-AC_LANG_SAVE
-
-# Source file extension for Java test sources.
-ac_ext=java
-
-# Object file extension for compiled Java test sources.
-objext=o
-_LT_TAGVAR(objext, $1)=$objext
-
-# Code to be used in simple compile tests
-lt_simple_compile_test_code="class foo {}"
-
-# Code to be used in simple link tests
-lt_simple_link_test_code='public class conftest { public static void main(String[[]] argv) {}; }'
-
-# ltmain only uses $CC for tagged configurations so make sure $CC is set.
-_LT_TAG_COMPILER
-
-# save warnings/boilerplate of simple test code
-_LT_COMPILER_BOILERPLATE
-_LT_LINKER_BOILERPLATE
-
-# Allow CC to be a program name with arguments.
-lt_save_CC=$CC
-lt_save_CFLAGS=$CFLAGS
-lt_save_GCC=$GCC
-GCC=yes
-CC=${GCJ-"gcj"}
-CFLAGS=$GCJFLAGS
-compiler=$CC
-_LT_TAGVAR(compiler, $1)=$CC
-_LT_TAGVAR(LD, $1)="$LD"
-_LT_CC_BASENAME([$compiler])
-
-# GCJ did not exist at the time GCC didn't implicitly link libc in.
-_LT_TAGVAR(archive_cmds_need_lc, $1)=no
-
-_LT_TAGVAR(old_archive_cmds, $1)=$old_archive_cmds
-_LT_TAGVAR(reload_flag, $1)=$reload_flag
-_LT_TAGVAR(reload_cmds, $1)=$reload_cmds
-
-## CAVEAT EMPTOR:
-## There is no encapsulation within the following macros, do not change
-## the running order or otherwise move them around unless you know exactly
-## what you are doing...
-if test -n "$compiler"; then
-  _LT_COMPILER_NO_RTTI($1)
-  _LT_COMPILER_PIC($1)
-  _LT_COMPILER_C_O($1)
-  _LT_COMPILER_FILE_LOCKS($1)
-  _LT_LINKER_SHLIBS($1)
-  _LT_LINKER_HARDCODE_LIBPATH($1)
-
-  _LT_CONFIG($1)
-fi
-
-AC_LANG_RESTORE
-
-GCC=$lt_save_GCC
-CC=$lt_save_CC
-CFLAGS=$lt_save_CFLAGS
-])# _LT_LANG_GCJ_CONFIG
-
-
-# _LT_LANG_GO_CONFIG([TAG])
-# --------------------------
-# Ensure that the configuration variables for the GNU Go compiler
-# are suitably defined.  These variables are subsequently used by _LT_CONFIG
-# to write the compiler configuration to `libtool'.
-m4_defun([_LT_LANG_GO_CONFIG],
-[AC_REQUIRE([LT_PROG_GO])dnl
-AC_LANG_SAVE
-
-# Source file extension for Go test sources.
-ac_ext=go
-
-# Object file extension for compiled Go test sources.
-objext=o
-_LT_TAGVAR(objext, $1)=$objext
-
-# Code to be used in simple compile tests
-lt_simple_compile_test_code="package main; func main() { }"
-
-# Code to be used in simple link tests
-lt_simple_link_test_code='package main; func main() { }'
-
-# ltmain only uses $CC for tagged configurations so make sure $CC is set.
-_LT_TAG_COMPILER
-
-# save warnings/boilerplate of simple test code
-_LT_COMPILER_BOILERPLATE
-_LT_LINKER_BOILERPLATE
-
-# Allow CC to be a program name with arguments.
-lt_save_CC=$CC
-lt_save_CFLAGS=$CFLAGS
-lt_save_GCC=$GCC
-GCC=yes
-CC=${GOC-"gccgo"}
-CFLAGS=$GOFLAGS
-compiler=$CC
-_LT_TAGVAR(compiler, $1)=$CC
-_LT_TAGVAR(LD, $1)="$LD"
-_LT_CC_BASENAME([$compiler])
-
-# Go did not exist at the time GCC didn't implicitly link libc in.
-_LT_TAGVAR(archive_cmds_need_lc, $1)=no
-
-_LT_TAGVAR(old_archive_cmds, $1)=$old_archive_cmds
-_LT_TAGVAR(reload_flag, $1)=$reload_flag
-_LT_TAGVAR(reload_cmds, $1)=$reload_cmds
-
-## CAVEAT EMPTOR:
-## There is no encapsulation within the following macros, do not change
-## the running order or otherwise move them around unless you know exactly
-## what you are doing...
-if test -n "$compiler"; then
-  _LT_COMPILER_NO_RTTI($1)
-  _LT_COMPILER_PIC($1)
-  _LT_COMPILER_C_O($1)
-  _LT_COMPILER_FILE_LOCKS($1)
-  _LT_LINKER_SHLIBS($1)
-  _LT_LINKER_HARDCODE_LIBPATH($1)
-
-  _LT_CONFIG($1)
-fi
-
-AC_LANG_RESTORE
-
-GCC=$lt_save_GCC
-CC=$lt_save_CC
-CFLAGS=$lt_save_CFLAGS
-])# _LT_LANG_GO_CONFIG
-
-
-# _LT_LANG_RC_CONFIG([TAG])
-# -------------------------
-# Ensure that the configuration variables for the Windows resource compiler
-# are suitably defined.  These variables are subsequently used by _LT_CONFIG
-# to write the compiler configuration to `libtool'.
-m4_defun([_LT_LANG_RC_CONFIG],
-[AC_REQUIRE([LT_PROG_RC])dnl
-AC_LANG_SAVE
-
-# Source file extension for RC test sources.
-ac_ext=rc
-
-# Object file extension for compiled RC test sources.
-objext=o
-_LT_TAGVAR(objext, $1)=$objext
-
-# Code to be used in simple compile tests
-lt_simple_compile_test_code='sample MENU { MENUITEM "&Soup", 100, CHECKED }'
-
-# Code to be used in simple link tests
-lt_simple_link_test_code="$lt_simple_compile_test_code"
-
-# ltmain only uses $CC for tagged configurations so make sure $CC is set.
-_LT_TAG_COMPILER
-
-# save warnings/boilerplate of simple test code
-_LT_COMPILER_BOILERPLATE
-_LT_LINKER_BOILERPLATE
-
-# Allow CC to be a program name with arguments.
-lt_save_CC="$CC"
-lt_save_CFLAGS=$CFLAGS
-lt_save_GCC=$GCC
-GCC=
-CC=${RC-"windres"}
-CFLAGS=
-compiler=$CC
-_LT_TAGVAR(compiler, $1)=$CC
-_LT_CC_BASENAME([$compiler])
-_LT_TAGVAR(lt_cv_prog_compiler_c_o, $1)=yes
-
-if test -n "$compiler"; then
-  :
-  _LT_CONFIG($1)
-fi
-
-GCC=$lt_save_GCC
-AC_LANG_RESTORE
-CC=$lt_save_CC
-CFLAGS=$lt_save_CFLAGS
-])# _LT_LANG_RC_CONFIG
-
-
-# LT_PROG_GCJ
-# -----------
-AC_DEFUN([LT_PROG_GCJ],
-[m4_ifdef([AC_PROG_GCJ], [AC_PROG_GCJ],
-  [m4_ifdef([A][M_PROG_GCJ], [A][M_PROG_GCJ],
-    [AC_CHECK_TOOL(GCJ, gcj,)
-      test "x${GCJFLAGS+set}" = xset || GCJFLAGS="-g -O2"
-      AC_SUBST(GCJFLAGS)])])[]dnl
-])
-
-# Old name:
-AU_ALIAS([LT_AC_PROG_GCJ], [LT_PROG_GCJ])
-dnl aclocal-1.4 backwards compatibility:
-dnl AC_DEFUN([LT_AC_PROG_GCJ], [])
-
-
-# LT_PROG_GO
-# ----------
-AC_DEFUN([LT_PROG_GO],
-[AC_CHECK_TOOL(GOC, gccgo,)
-])
-
-
-# LT_PROG_RC
-# ----------
-AC_DEFUN([LT_PROG_RC],
-[AC_CHECK_TOOL(RC, windres,)
-])
-
-# Old name:
-AU_ALIAS([LT_AC_PROG_RC], [LT_PROG_RC])
-dnl aclocal-1.4 backwards compatibility:
-dnl AC_DEFUN([LT_AC_PROG_RC], [])
-
-
-# _LT_DECL_EGREP
-# --------------
-# If we don't have a new enough Autoconf to choose the best grep
-# available, choose the one first in the user's PATH.
-m4_defun([_LT_DECL_EGREP],
-[AC_REQUIRE([AC_PROG_EGREP])dnl
-AC_REQUIRE([AC_PROG_FGREP])dnl
-test -z "$GREP" && GREP=grep
-_LT_DECL([], [GREP], [1], [A grep program that handles long lines])
-_LT_DECL([], [EGREP], [1], [An ERE matcher])
-_LT_DECL([], [FGREP], [1], [A literal string matcher])
-dnl Non-bleeding-edge autoconf doesn't subst GREP, so do it here too
-AC_SUBST([GREP])
-])
-
-
-# _LT_DECL_OBJDUMP
-# ----------------
-# If we don't have a new enough Autoconf to choose the best objdump
-# available, choose the one first in the user's PATH.
-m4_defun([_LT_DECL_OBJDUMP],
-[AC_CHECK_TOOL(OBJDUMP, objdump, false)
-test -z "$OBJDUMP" && OBJDUMP=objdump
-_LT_DECL([], [OBJDUMP], [1], [An object symbol dumper])
-AC_SUBST([OBJDUMP])
-])
-
-# _LT_DECL_DLLTOOL
-# ----------------
-# Ensure DLLTOOL variable is set.
-m4_defun([_LT_DECL_DLLTOOL],
-[AC_CHECK_TOOL(DLLTOOL, dlltool, false)
-test -z "$DLLTOOL" && DLLTOOL=dlltool
-_LT_DECL([], [DLLTOOL], [1], [DLL creation program])
-AC_SUBST([DLLTOOL])
-])
-
-# _LT_DECL_SED
-# ------------
-# Check for a fully-functional sed program, that truncates
-# as few characters as possible.  Prefer GNU sed if found.
-m4_defun([_LT_DECL_SED],
-[AC_PROG_SED
-test -z "$SED" && SED=sed
-Xsed="$SED -e 1s/^X//"
-_LT_DECL([], [SED], [1], [A sed program that does not truncate output])
-_LT_DECL([], [Xsed], ["\$SED -e 1s/^X//"],
-    [Sed that helps us avoid accidentally triggering echo(1) options like -n])
-])# _LT_DECL_SED
-
-m4_ifndef([AC_PROG_SED], [
-############################################################
-# NOTE: This macro has been submitted for inclusion into   #
-#  GNU Autoconf as AC_PROG_SED.  When it is available in   #
-#  a released version of Autoconf we should remove this    #
-#  macro and use it instead.                               #
-############################################################
-
-m4_defun([AC_PROG_SED],
-[AC_MSG_CHECKING([for a sed that does not truncate output])
-AC_CACHE_VAL(lt_cv_path_SED,
-[# Loop through the user's path and test for sed and gsed.
-# Then use that list of sed's as ones to test for truncation.
-as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
-for as_dir in $PATH
-do
-  IFS=$as_save_IFS
-  test -z "$as_dir" && as_dir=.
-  for lt_ac_prog in sed gsed; do
-    for ac_exec_ext in '' $ac_executable_extensions; do
-      if $as_executable_p "$as_dir/$lt_ac_prog$ac_exec_ext"; then
-        lt_ac_sed_list="$lt_ac_sed_list $as_dir/$lt_ac_prog$ac_exec_ext"
-      fi
-    done
-  done
-done
-IFS=$as_save_IFS
-lt_ac_max=0
-lt_ac_count=0
-# Add /usr/xpg4/bin/sed as it is typically found on Solaris
-# along with /bin/sed that truncates output.
-for lt_ac_sed in $lt_ac_sed_list /usr/xpg4/bin/sed; do
-  test ! -f $lt_ac_sed && continue
-  cat /dev/null > conftest.in
-  lt_ac_count=0
-  echo $ECHO_N "0123456789$ECHO_C" >conftest.in
-  # Check for GNU sed and select it if it is found.
-  if "$lt_ac_sed" --version 2>&1 < /dev/null | grep 'GNU' > /dev/null; then
-    lt_cv_path_SED=$lt_ac_sed
-    break
-  fi
-  while true; do
-    cat conftest.in conftest.in >conftest.tmp
-    mv conftest.tmp conftest.in
-    cp conftest.in conftest.nl
-    echo >>conftest.nl
-    $lt_ac_sed -e 's/a$//' < conftest.nl >conftest.out || break
-    cmp -s conftest.out conftest.nl || break
-    # 10000 chars as input seems more than enough
-    test $lt_ac_count -gt 10 && break
-    lt_ac_count=`expr $lt_ac_count + 1`
-    if test $lt_ac_count -gt $lt_ac_max; then
-      lt_ac_max=$lt_ac_count
-      lt_cv_path_SED=$lt_ac_sed
-    fi
-  done
-done
-])
-SED=$lt_cv_path_SED
-AC_SUBST([SED])
-AC_MSG_RESULT([$SED])
-])#AC_PROG_SED
-])#m4_ifndef
-
-# Old name:
-AU_ALIAS([LT_AC_PROG_SED], [AC_PROG_SED])
-dnl aclocal-1.4 backwards compatibility:
-dnl AC_DEFUN([LT_AC_PROG_SED], [])
-
-
-# _LT_CHECK_SHELL_FEATURES
-# ------------------------
-# Find out whether the shell is Bourne or XSI compatible,
-# or has some other useful features.
-m4_defun([_LT_CHECK_SHELL_FEATURES],
-[AC_MSG_CHECKING([whether the shell understands some XSI constructs])
-# Try some XSI features
-xsi_shell=no
-( _lt_dummy="a/b/c"
-  test "${_lt_dummy##*/},${_lt_dummy%/*},${_lt_dummy#??}"${_lt_dummy%"$_lt_dummy"}, \
-      = c,a/b,b/c, \
-    && eval 'test $(( 1 + 1 )) -eq 2 \
-    && test "${#_lt_dummy}" -eq 5' ) >/dev/null 2>&1 \
-  && xsi_shell=yes
-AC_MSG_RESULT([$xsi_shell])
-_LT_CONFIG_LIBTOOL_INIT([xsi_shell='$xsi_shell'])
-
-AC_MSG_CHECKING([whether the shell understands "+="])
-lt_shell_append=no
-( foo=bar; set foo baz; eval "$[1]+=\$[2]" && test "$foo" = barbaz ) \
-    >/dev/null 2>&1 \
-  && lt_shell_append=yes
-AC_MSG_RESULT([$lt_shell_append])
-_LT_CONFIG_LIBTOOL_INIT([lt_shell_append='$lt_shell_append'])
-
-if ( (MAIL=60; unset MAIL) || exit) >/dev/null 2>&1; then
-  lt_unset=unset
-else
-  lt_unset=false
-fi
-_LT_DECL([], [lt_unset], [0], [whether the shell understands "unset"])dnl
-
-# test EBCDIC or ASCII
-case `echo X|tr X '\101'` in
- A) # ASCII based system
-    # \n is not interpreted correctly by Solaris 8 /usr/ucb/tr
-  lt_SP2NL='tr \040 \012'
-  lt_NL2SP='tr \015\012 \040\040'
-  ;;
- *) # EBCDIC based system
-  lt_SP2NL='tr \100 \n'
-  lt_NL2SP='tr \r\n \100\100'
-  ;;
-esac
-_LT_DECL([SP2NL], [lt_SP2NL], [1], [turn spaces into newlines])dnl
-_LT_DECL([NL2SP], [lt_NL2SP], [1], [turn newlines into spaces])dnl
-])# _LT_CHECK_SHELL_FEATURES
-
-
-# _LT_PROG_FUNCTION_REPLACE (FUNCNAME, REPLACEMENT-BODY)
-# ------------------------------------------------------
-# In `$cfgfile', look for function FUNCNAME delimited by `^FUNCNAME ()$' and
-# '^} FUNCNAME ', and replace its body with REPLACEMENT-BODY.
-m4_defun([_LT_PROG_FUNCTION_REPLACE],
-[dnl {
-sed -e '/^$1 ()$/,/^} # $1 /c\
-$1 ()\
-{\
-m4_bpatsubsts([$2], [$], [\\], [^\([	 ]\)], [\\\1])
-} # Extended-shell $1 implementation' "$cfgfile" > $cfgfile.tmp \
-  && mv -f "$cfgfile.tmp" "$cfgfile" \
-    || (rm -f "$cfgfile" && cp "$cfgfile.tmp" "$cfgfile" && rm -f "$cfgfile.tmp")
-test 0 -eq $? || _lt_function_replace_fail=:
-])
-
-
-# _LT_PROG_REPLACE_SHELLFNS
-# -------------------------
-# Replace existing portable implementations of several shell functions with
-# equivalent extended shell implementations where those features are available.
-m4_defun([_LT_PROG_REPLACE_SHELLFNS],
-[if test x"$xsi_shell" = xyes; then
-  _LT_PROG_FUNCTION_REPLACE([func_dirname], [dnl
-    case ${1} in
-      */*) func_dirname_result="${1%/*}${2}" ;;
-      *  ) func_dirname_result="${3}" ;;
-    esac])
-
-  _LT_PROG_FUNCTION_REPLACE([func_basename], [dnl
-    func_basename_result="${1##*/}"])
-
-  _LT_PROG_FUNCTION_REPLACE([func_dirname_and_basename], [dnl
-    case ${1} in
-      */*) func_dirname_result="${1%/*}${2}" ;;
-      *  ) func_dirname_result="${3}" ;;
-    esac
-    func_basename_result="${1##*/}"])
-
-  _LT_PROG_FUNCTION_REPLACE([func_stripname], [dnl
-    # pdksh 5.2.14 does not do ${X%$Y} correctly if both X and Y are
-    # positional parameters, so assign one to ordinary parameter first.
-    func_stripname_result=${3}
-    func_stripname_result=${func_stripname_result#"${1}"}
-    func_stripname_result=${func_stripname_result%"${2}"}])
-
-  _LT_PROG_FUNCTION_REPLACE([func_split_long_opt], [dnl
-    func_split_long_opt_name=${1%%=*}
-    func_split_long_opt_arg=${1#*=}])
-
-  _LT_PROG_FUNCTION_REPLACE([func_split_short_opt], [dnl
-    func_split_short_opt_arg=${1#??}
-    func_split_short_opt_name=${1%"$func_split_short_opt_arg"}])
-
-  _LT_PROG_FUNCTION_REPLACE([func_lo2o], [dnl
-    case ${1} in
-      *.lo) func_lo2o_result=${1%.lo}.${objext} ;;
-      *)    func_lo2o_result=${1} ;;
-    esac])
-
-  _LT_PROG_FUNCTION_REPLACE([func_xform], [    func_xform_result=${1%.*}.lo])
-
-  _LT_PROG_FUNCTION_REPLACE([func_arith], [    func_arith_result=$(( $[*] ))])
-
-  _LT_PROG_FUNCTION_REPLACE([func_len], [    func_len_result=${#1}])
-fi
-
-if test x"$lt_shell_append" = xyes; then
-  _LT_PROG_FUNCTION_REPLACE([func_append], [    eval "${1}+=\\${2}"])
-
-  _LT_PROG_FUNCTION_REPLACE([func_append_quoted], [dnl
-    func_quote_for_eval "${2}"
-dnl m4 expansion turns \\\\ into \\, and then the shell eval turns that into \
-    eval "${1}+=\\\\ \\$func_quote_for_eval_result"])
-
-  # Save a `func_append' function call where possible by direct use of '+='
-  sed -e 's%func_append \([[a-zA-Z_]]\{1,\}\) "%\1+="%g' $cfgfile > $cfgfile.tmp \
-    && mv -f "$cfgfile.tmp" "$cfgfile" \
-      || (rm -f "$cfgfile" && cp "$cfgfile.tmp" "$cfgfile" && rm -f "$cfgfile.tmp")
-  test 0 -eq $? || _lt_function_replace_fail=:
-else
-  # Save a `func_append' function call even when '+=' is not available
-  sed -e 's%func_append \([[a-zA-Z_]]\{1,\}\) "%\1="$\1%g' $cfgfile > $cfgfile.tmp \
-    && mv -f "$cfgfile.tmp" "$cfgfile" \
-      || (rm -f "$cfgfile" && cp "$cfgfile.tmp" "$cfgfile" && rm -f "$cfgfile.tmp")
-  test 0 -eq $? || _lt_function_replace_fail=:
-fi
-
-if test x"$_lt_function_replace_fail" = x":"; then
-  AC_MSG_WARN([Unable to substitute extended shell functions in $ofile])
-fi
-])
-
-# _LT_PATH_CONVERSION_FUNCTIONS
-# -----------------------------
-# Determine which file name conversion functions should be used by
-# func_to_host_file (and, implicitly, by func_to_host_path).  These are needed
-# for certain cross-compile configurations and native mingw.
-m4_defun([_LT_PATH_CONVERSION_FUNCTIONS],
-[AC_REQUIRE([AC_CANONICAL_HOST])dnl
-AC_REQUIRE([AC_CANONICAL_BUILD])dnl
-AC_MSG_CHECKING([how to convert $build file names to $host format])
-AC_CACHE_VAL(lt_cv_to_host_file_cmd,
-[case $host in
-  *-*-mingw* )
-    case $build in
-      *-*-mingw* ) # actually msys
-        lt_cv_to_host_file_cmd=func_convert_file_msys_to_w32
-        ;;
-      *-*-cygwin* )
-        lt_cv_to_host_file_cmd=func_convert_file_cygwin_to_w32
-        ;;
-      * ) # otherwise, assume *nix
-        lt_cv_to_host_file_cmd=func_convert_file_nix_to_w32
-        ;;
-    esac
-    ;;
-  *-*-cygwin* )
-    case $build in
-      *-*-mingw* ) # actually msys
-        lt_cv_to_host_file_cmd=func_convert_file_msys_to_cygwin
-        ;;
-      *-*-cygwin* )
-        lt_cv_to_host_file_cmd=func_convert_file_noop
-        ;;
-      * ) # otherwise, assume *nix
-        lt_cv_to_host_file_cmd=func_convert_file_nix_to_cygwin
-        ;;
-    esac
-    ;;
-  * ) # unhandled hosts (and "normal" native builds)
-    lt_cv_to_host_file_cmd=func_convert_file_noop
-    ;;
-esac
-])
-to_host_file_cmd=$lt_cv_to_host_file_cmd
-AC_MSG_RESULT([$lt_cv_to_host_file_cmd])
-_LT_DECL([to_host_file_cmd], [lt_cv_to_host_file_cmd],
-         [0], [convert $build file names to $host format])dnl
-
-AC_MSG_CHECKING([how to convert $build file names to toolchain format])
-AC_CACHE_VAL(lt_cv_to_tool_file_cmd,
-[# assume ordinary cross tools, or native build.
-lt_cv_to_tool_file_cmd=func_convert_file_noop
-case $host in
-  *-*-mingw* )
-    case $build in
-      *-*-mingw* ) # actually msys
-        lt_cv_to_tool_file_cmd=func_convert_file_msys_to_w32
-        ;;
-    esac
-    ;;
-esac
-])
-to_tool_file_cmd=$lt_cv_to_tool_file_cmd
-AC_MSG_RESULT([$lt_cv_to_tool_file_cmd])
-_LT_DECL([to_tool_file_cmd], [lt_cv_to_tool_file_cmd],
-         [0], [convert $build files to toolchain format])dnl
-])# _LT_PATH_CONVERSION_FUNCTIONS
diff --git a/src/rocksdb/m4/ltoptions.m4 b/src/rocksdb/m4/ltoptions.m4
deleted file mode 100644
index 5d9acd8..0000000
--- a/src/rocksdb/m4/ltoptions.m4
+++ /dev/null
@@ -1,384 +0,0 @@
-# Helper functions for option handling.                    -*- Autoconf -*-
-#
-#   Copyright (C) 2004, 2005, 2007, 2008, 2009 Free Software Foundation,
-#   Inc.
-#   Written by Gary V. Vaughan, 2004
-#
-# This file is free software; the Free Software Foundation gives
-# unlimited permission to copy and/or distribute it, with or without
-# modifications, as long as this notice is preserved.
-
-# serial 7 ltoptions.m4
-
-# This is to help aclocal find these macros, as it can't see m4_define.
-AC_DEFUN([LTOPTIONS_VERSION], [m4_if([1])])
-
-
-# _LT_MANGLE_OPTION(MACRO-NAME, OPTION-NAME)
-# ------------------------------------------
-m4_define([_LT_MANGLE_OPTION],
-[[_LT_OPTION_]m4_bpatsubst($1__$2, [[^a-zA-Z0-9_]], [_])])
-
-
-# _LT_SET_OPTION(MACRO-NAME, OPTION-NAME)
-# ---------------------------------------
-# Set option OPTION-NAME for macro MACRO-NAME, and if there is a
-# matching handler defined, dispatch to it.  Other OPTION-NAMEs are
-# saved as a flag.
-m4_define([_LT_SET_OPTION],
-[m4_define(_LT_MANGLE_OPTION([$1], [$2]))dnl
-m4_ifdef(_LT_MANGLE_DEFUN([$1], [$2]),
-        _LT_MANGLE_DEFUN([$1], [$2]),
-    [m4_warning([Unknown $1 option `$2'])])[]dnl
-])
-
-
-# _LT_IF_OPTION(MACRO-NAME, OPTION-NAME, IF-SET, [IF-NOT-SET])
-# ------------------------------------------------------------
-# Execute IF-SET if OPTION is set, IF-NOT-SET otherwise.
-m4_define([_LT_IF_OPTION],
-[m4_ifdef(_LT_MANGLE_OPTION([$1], [$2]), [$3], [$4])])
-
-
-# _LT_UNLESS_OPTIONS(MACRO-NAME, OPTION-LIST, IF-NOT-SET)
-# -------------------------------------------------------
-# Execute IF-NOT-SET unless all options in OPTION-LIST for MACRO-NAME
-# are set.
-m4_define([_LT_UNLESS_OPTIONS],
-[m4_foreach([_LT_Option], m4_split(m4_normalize([$2])),
-	    [m4_ifdef(_LT_MANGLE_OPTION([$1], _LT_Option),
-		      [m4_define([$0_found])])])[]dnl
-m4_ifdef([$0_found], [m4_undefine([$0_found])], [$3
-])[]dnl
-])
-
-
-# _LT_SET_OPTIONS(MACRO-NAME, OPTION-LIST)
-# ----------------------------------------
-# OPTION-LIST is a space-separated list of Libtool options associated
-# with MACRO-NAME.  If any OPTION has a matching handler declared with
-# LT_OPTION_DEFINE, dispatch to that macro; otherwise complain about
-# the unknown option and exit.
-m4_defun([_LT_SET_OPTIONS],
-[# Set options
-m4_foreach([_LT_Option], m4_split(m4_normalize([$2])),
-    [_LT_SET_OPTION([$1], _LT_Option)])
-
-m4_if([$1],[LT_INIT],[
-  dnl
-  dnl Simply set some default values (i.e off) if boolean options were not
-  dnl specified:
-  _LT_UNLESS_OPTIONS([LT_INIT], [dlopen], [enable_dlopen=no
-  ])
-  _LT_UNLESS_OPTIONS([LT_INIT], [win32-dll], [enable_win32_dll=no
-  ])
-  dnl
-  dnl If no reference was made to various pairs of opposing options, then
-  dnl we run the default mode handler for the pair.  For example, if neither
-  dnl `shared' nor `disable-shared' was passed, we enable building of shared
-  dnl archives by default:
-  _LT_UNLESS_OPTIONS([LT_INIT], [shared disable-shared], [_LT_ENABLE_SHARED])
-  _LT_UNLESS_OPTIONS([LT_INIT], [static disable-static], [_LT_ENABLE_STATIC])
-  _LT_UNLESS_OPTIONS([LT_INIT], [pic-only no-pic], [_LT_WITH_PIC])
-  _LT_UNLESS_OPTIONS([LT_INIT], [fast-install disable-fast-install],
-  		   [_LT_ENABLE_FAST_INSTALL])
-  ])
-])# _LT_SET_OPTIONS
-
-
-## --------------------------------- ##
-## Macros to handle LT_INIT options. ##
-## --------------------------------- ##
-
-# _LT_MANGLE_DEFUN(MACRO-NAME, OPTION-NAME)
-# -----------------------------------------
-m4_define([_LT_MANGLE_DEFUN],
-[[_LT_OPTION_DEFUN_]m4_bpatsubst(m4_toupper([$1__$2]), [[^A-Z0-9_]], [_])])
-
-
-# LT_OPTION_DEFINE(MACRO-NAME, OPTION-NAME, CODE)
-# -----------------------------------------------
-m4_define([LT_OPTION_DEFINE],
-[m4_define(_LT_MANGLE_DEFUN([$1], [$2]), [$3])[]dnl
-])# LT_OPTION_DEFINE
-
-
-# dlopen
-# ------
-LT_OPTION_DEFINE([LT_INIT], [dlopen], [enable_dlopen=yes
-])
-
-AU_DEFUN([AC_LIBTOOL_DLOPEN],
-[_LT_SET_OPTION([LT_INIT], [dlopen])
-AC_DIAGNOSE([obsolete],
-[$0: Remove this warning and the call to _LT_SET_OPTION when you
-put the `dlopen' option into LT_INIT's first parameter.])
-])
-
-dnl aclocal-1.4 backwards compatibility:
-dnl AC_DEFUN([AC_LIBTOOL_DLOPEN], [])
-
-
-# win32-dll
-# ---------
-# Declare package support for building win32 dll's.
-LT_OPTION_DEFINE([LT_INIT], [win32-dll],
-[enable_win32_dll=yes
-
-case $host in
-*-*-cygwin* | *-*-mingw* | *-*-pw32* | *-*-cegcc*)
-  AC_CHECK_TOOL(AS, as, false)
-  AC_CHECK_TOOL(DLLTOOL, dlltool, false)
-  AC_CHECK_TOOL(OBJDUMP, objdump, false)
-  ;;
-esac
-
-test -z "$AS" && AS=as
-_LT_DECL([], [AS],      [1], [Assembler program])dnl
-
-test -z "$DLLTOOL" && DLLTOOL=dlltool
-_LT_DECL([], [DLLTOOL], [1], [DLL creation program])dnl
-
-test -z "$OBJDUMP" && OBJDUMP=objdump
-_LT_DECL([], [OBJDUMP], [1], [Object dumper program])dnl
-])# win32-dll
-
-AU_DEFUN([AC_LIBTOOL_WIN32_DLL],
-[AC_REQUIRE([AC_CANONICAL_HOST])dnl
-_LT_SET_OPTION([LT_INIT], [win32-dll])
-AC_DIAGNOSE([obsolete],
-[$0: Remove this warning and the call to _LT_SET_OPTION when you
-put the `win32-dll' option into LT_INIT's first parameter.])
-])
-
-dnl aclocal-1.4 backwards compatibility:
-dnl AC_DEFUN([AC_LIBTOOL_WIN32_DLL], [])
-
-
-# _LT_ENABLE_SHARED([DEFAULT])
-# ----------------------------
-# implement the --enable-shared flag, and support the `shared' and
-# `disable-shared' LT_INIT options.
-# DEFAULT is either `yes' or `no'.  If omitted, it defaults to `yes'.
-m4_define([_LT_ENABLE_SHARED],
-[m4_define([_LT_ENABLE_SHARED_DEFAULT], [m4_if($1, no, no, yes)])dnl
-AC_ARG_ENABLE([shared],
-    [AS_HELP_STRING([--enable-shared@<:@=PKGS@:>@],
-	[build shared libraries @<:@default=]_LT_ENABLE_SHARED_DEFAULT[@:>@])],
-    [p=${PACKAGE-default}
-    case $enableval in
-    yes) enable_shared=yes ;;
-    no) enable_shared=no ;;
-    *)
-      enable_shared=no
-      # Look at the argument we got.  We use all the common list separators.
-      lt_save_ifs="$IFS"; IFS="${IFS}$PATH_SEPARATOR,"
-      for pkg in $enableval; do
-	IFS="$lt_save_ifs"
-	if test "X$pkg" = "X$p"; then
-	  enable_shared=yes
-	fi
-      done
-      IFS="$lt_save_ifs"
-      ;;
-    esac],
-    [enable_shared=]_LT_ENABLE_SHARED_DEFAULT)
-
-    _LT_DECL([build_libtool_libs], [enable_shared], [0],
-	[Whether or not to build shared libraries])
-])# _LT_ENABLE_SHARED
-
-LT_OPTION_DEFINE([LT_INIT], [shared], [_LT_ENABLE_SHARED([yes])])
-LT_OPTION_DEFINE([LT_INIT], [disable-shared], [_LT_ENABLE_SHARED([no])])
-
-# Old names:
-AC_DEFUN([AC_ENABLE_SHARED],
-[_LT_SET_OPTION([LT_INIT], m4_if([$1], [no], [disable-])[shared])
-])
-
-AC_DEFUN([AC_DISABLE_SHARED],
-[_LT_SET_OPTION([LT_INIT], [disable-shared])
-])
-
-AU_DEFUN([AM_ENABLE_SHARED], [AC_ENABLE_SHARED($@)])
-AU_DEFUN([AM_DISABLE_SHARED], [AC_DISABLE_SHARED($@)])
-
-dnl aclocal-1.4 backwards compatibility:
-dnl AC_DEFUN([AM_ENABLE_SHARED], [])
-dnl AC_DEFUN([AM_DISABLE_SHARED], [])
-
-
-
-# _LT_ENABLE_STATIC([DEFAULT])
-# ----------------------------
-# implement the --enable-static flag, and support the `static' and
-# `disable-static' LT_INIT options.
-# DEFAULT is either `yes' or `no'.  If omitted, it defaults to `yes'.
-m4_define([_LT_ENABLE_STATIC],
-[m4_define([_LT_ENABLE_STATIC_DEFAULT], [m4_if($1, no, no, yes)])dnl
-AC_ARG_ENABLE([static],
-    [AS_HELP_STRING([--enable-static@<:@=PKGS@:>@],
-	[build static libraries @<:@default=]_LT_ENABLE_STATIC_DEFAULT[@:>@])],
-    [p=${PACKAGE-default}
-    case $enableval in
-    yes) enable_static=yes ;;
-    no) enable_static=no ;;
-    *)
-      enable_static=no
-      # Look at the argument we got.  We use all the common list separators.
-      lt_save_ifs="$IFS"; IFS="${IFS}$PATH_SEPARATOR,"
-      for pkg in $enableval; do
-	IFS="$lt_save_ifs"
-	if test "X$pkg" = "X$p"; then
-	  enable_static=yes
-	fi
-      done
-      IFS="$lt_save_ifs"
-      ;;
-    esac],
-    [enable_static=]_LT_ENABLE_STATIC_DEFAULT)
-
-    _LT_DECL([build_old_libs], [enable_static], [0],
-	[Whether or not to build static libraries])
-])# _LT_ENABLE_STATIC
-
-LT_OPTION_DEFINE([LT_INIT], [static], [_LT_ENABLE_STATIC([yes])])
-LT_OPTION_DEFINE([LT_INIT], [disable-static], [_LT_ENABLE_STATIC([no])])
-
-# Old names:
-AC_DEFUN([AC_ENABLE_STATIC],
-[_LT_SET_OPTION([LT_INIT], m4_if([$1], [no], [disable-])[static])
-])
-
-AC_DEFUN([AC_DISABLE_STATIC],
-[_LT_SET_OPTION([LT_INIT], [disable-static])
-])
-
-AU_DEFUN([AM_ENABLE_STATIC], [AC_ENABLE_STATIC($@)])
-AU_DEFUN([AM_DISABLE_STATIC], [AC_DISABLE_STATIC($@)])
-
-dnl aclocal-1.4 backwards compatibility:
-dnl AC_DEFUN([AM_ENABLE_STATIC], [])
-dnl AC_DEFUN([AM_DISABLE_STATIC], [])
-
-
-
-# _LT_ENABLE_FAST_INSTALL([DEFAULT])
-# ----------------------------------
-# implement the --enable-fast-install flag, and support the `fast-install'
-# and `disable-fast-install' LT_INIT options.
-# DEFAULT is either `yes' or `no'.  If omitted, it defaults to `yes'.
-m4_define([_LT_ENABLE_FAST_INSTALL],
-[m4_define([_LT_ENABLE_FAST_INSTALL_DEFAULT], [m4_if($1, no, no, yes)])dnl
-AC_ARG_ENABLE([fast-install],
-    [AS_HELP_STRING([--enable-fast-install@<:@=PKGS@:>@],
-    [optimize for fast installation @<:@default=]_LT_ENABLE_FAST_INSTALL_DEFAULT[@:>@])],
-    [p=${PACKAGE-default}
-    case $enableval in
-    yes) enable_fast_install=yes ;;
-    no) enable_fast_install=no ;;
-    *)
-      enable_fast_install=no
-      # Look at the argument we got.  We use all the common list separators.
-      lt_save_ifs="$IFS"; IFS="${IFS}$PATH_SEPARATOR,"
-      for pkg in $enableval; do
-	IFS="$lt_save_ifs"
-	if test "X$pkg" = "X$p"; then
-	  enable_fast_install=yes
-	fi
-      done
-      IFS="$lt_save_ifs"
-      ;;
-    esac],
-    [enable_fast_install=]_LT_ENABLE_FAST_INSTALL_DEFAULT)
-
-_LT_DECL([fast_install], [enable_fast_install], [0],
-	 [Whether or not to optimize for fast installation])dnl
-])# _LT_ENABLE_FAST_INSTALL
-
-LT_OPTION_DEFINE([LT_INIT], [fast-install], [_LT_ENABLE_FAST_INSTALL([yes])])
-LT_OPTION_DEFINE([LT_INIT], [disable-fast-install], [_LT_ENABLE_FAST_INSTALL([no])])
-
-# Old names:
-AU_DEFUN([AC_ENABLE_FAST_INSTALL],
-[_LT_SET_OPTION([LT_INIT], m4_if([$1], [no], [disable-])[fast-install])
-AC_DIAGNOSE([obsolete],
-[$0: Remove this warning and the call to _LT_SET_OPTION when you put
-the `fast-install' option into LT_INIT's first parameter.])
-])
-
-AU_DEFUN([AC_DISABLE_FAST_INSTALL],
-[_LT_SET_OPTION([LT_INIT], [disable-fast-install])
-AC_DIAGNOSE([obsolete],
-[$0: Remove this warning and the call to _LT_SET_OPTION when you put
-the `disable-fast-install' option into LT_INIT's first parameter.])
-])
-
-dnl aclocal-1.4 backwards compatibility:
-dnl AC_DEFUN([AC_ENABLE_FAST_INSTALL], [])
-dnl AC_DEFUN([AM_DISABLE_FAST_INSTALL], [])
-
-
-# _LT_WITH_PIC([MODE])
-# --------------------
-# implement the --with-pic flag, and support the `pic-only' and `no-pic'
-# LT_INIT options.
-# MODE is either `yes' or `no'.  If omitted, it defaults to `both'.
-m4_define([_LT_WITH_PIC],
-[AC_ARG_WITH([pic],
-    [AS_HELP_STRING([--with-pic@<:@=PKGS@:>@],
-	[try to use only PIC/non-PIC objects @<:@default=use both@:>@])],
-    [lt_p=${PACKAGE-default}
-    case $withval in
-    yes|no) pic_mode=$withval ;;
-    *)
-      pic_mode=default
-      # Look at the argument we got.  We use all the common list separators.
-      lt_save_ifs="$IFS"; IFS="${IFS}$PATH_SEPARATOR,"
-      for lt_pkg in $withval; do
-	IFS="$lt_save_ifs"
-	if test "X$lt_pkg" = "X$lt_p"; then
-	  pic_mode=yes
-	fi
-      done
-      IFS="$lt_save_ifs"
-      ;;
-    esac],
-    [pic_mode=default])
-
-test -z "$pic_mode" && pic_mode=m4_default([$1], [default])
-
-_LT_DECL([], [pic_mode], [0], [What type of objects to build])dnl
-])# _LT_WITH_PIC
-
-LT_OPTION_DEFINE([LT_INIT], [pic-only], [_LT_WITH_PIC([yes])])
-LT_OPTION_DEFINE([LT_INIT], [no-pic], [_LT_WITH_PIC([no])])
-
-# Old name:
-AU_DEFUN([AC_LIBTOOL_PICMODE],
-[_LT_SET_OPTION([LT_INIT], [pic-only])
-AC_DIAGNOSE([obsolete],
-[$0: Remove this warning and the call to _LT_SET_OPTION when you
-put the `pic-only' option into LT_INIT's first parameter.])
-])
-
-dnl aclocal-1.4 backwards compatibility:
-dnl AC_DEFUN([AC_LIBTOOL_PICMODE], [])
-
-## ----------------- ##
-## LTDL_INIT Options ##
-## ----------------- ##
-
-m4_define([_LTDL_MODE], [])
-LT_OPTION_DEFINE([LTDL_INIT], [nonrecursive],
-		 [m4_define([_LTDL_MODE], [nonrecursive])])
-LT_OPTION_DEFINE([LTDL_INIT], [recursive],
-		 [m4_define([_LTDL_MODE], [recursive])])
-LT_OPTION_DEFINE([LTDL_INIT], [subproject],
-		 [m4_define([_LTDL_MODE], [subproject])])
-
-m4_define([_LTDL_TYPE], [])
-LT_OPTION_DEFINE([LTDL_INIT], [installable],
-		 [m4_define([_LTDL_TYPE], [installable])])
-LT_OPTION_DEFINE([LTDL_INIT], [convenience],
-		 [m4_define([_LTDL_TYPE], [convenience])])
diff --git a/src/rocksdb/m4/ltsugar.m4 b/src/rocksdb/m4/ltsugar.m4
deleted file mode 100644
index 9000a05..0000000
--- a/src/rocksdb/m4/ltsugar.m4
+++ /dev/null
@@ -1,123 +0,0 @@
-# ltsugar.m4 -- libtool m4 base layer.                         -*-Autoconf-*-
-#
-# Copyright (C) 2004, 2005, 2007, 2008 Free Software Foundation, Inc.
-# Written by Gary V. Vaughan, 2004
-#
-# This file is free software; the Free Software Foundation gives
-# unlimited permission to copy and/or distribute it, with or without
-# modifications, as long as this notice is preserved.
-
-# serial 6 ltsugar.m4
-
-# This is to help aclocal find these macros, as it can't see m4_define.
-AC_DEFUN([LTSUGAR_VERSION], [m4_if([0.1])])
-
-
-# lt_join(SEP, ARG1, [ARG2...])
-# -----------------------------
-# Produce ARG1SEPARG2...SEPARGn, omitting [] arguments and their
-# associated separator.
-# Needed until we can rely on m4_join from Autoconf 2.62, since all earlier
-# versions in m4sugar had bugs.
-m4_define([lt_join],
-[m4_if([$#], [1], [],
-       [$#], [2], [[$2]],
-       [m4_if([$2], [], [], [[$2]_])$0([$1], m4_shift(m4_shift($@)))])])
-m4_define([_lt_join],
-[m4_if([$#$2], [2], [],
-       [m4_if([$2], [], [], [[$1$2]])$0([$1], m4_shift(m4_shift($@)))])])
-
-
-# lt_car(LIST)
-# lt_cdr(LIST)
-# ------------
-# Manipulate m4 lists.
-# These macros are necessary as long as we still need to support
-# Autoconf-2.59, which quotes differently.
-m4_define([lt_car], [[$1]])
-m4_define([lt_cdr],
-[m4_if([$#], 0, [m4_fatal([$0: cannot be called without arguments])],
-       [$#], 1, [],
-       [m4_dquote(m4_shift($@))])])
-m4_define([lt_unquote], $1)
-
-
-# lt_append(MACRO-NAME, STRING, [SEPARATOR])
-# ------------------------------------------
-# Redefine MACRO-NAME to hold its former content plus `SEPARATOR'`STRING'.
-# Note that neither SEPARATOR nor STRING are expanded; they are appended
-# to MACRO-NAME as is (leaving the expansion for when MACRO-NAME is invoked).
-# No SEPARATOR is output if MACRO-NAME was previously undefined (as
-# opposed to defined but empty).
-#
-# This macro is needed until we can rely on Autoconf 2.62, since earlier
-# versions of m4sugar mistakenly expanded SEPARATOR but not STRING.
-m4_define([lt_append],
-[m4_define([$1],
-	   m4_ifdef([$1], [m4_defn([$1])[$3]])[$2])])
-
-
-
-# lt_combine(SEP, PREFIX-LIST, INFIX, SUFFIX1, [SUFFIX2...])
-# ----------------------------------------------------------
-# Produce a SEP delimited list of all paired combinations of elements of
-# PREFIX-LIST with SUFFIX1 through SUFFIXn.  Each element of the list
-# has the form PREFIXmINFIXSUFFIXn.
-# Needed until we can rely on m4_combine added in Autoconf 2.62.
-m4_define([lt_combine],
-[m4_if(m4_eval([$# > 3]), [1],
-       [m4_pushdef([_Lt_sep], [m4_define([_Lt_sep], m4_defn([lt_car]))])]]dnl
-[[m4_foreach([_Lt_prefix], [$2],
-	     [m4_foreach([_Lt_suffix],
-		]m4_dquote(m4_dquote(m4_shift(m4_shift(m4_shift($@)))))[,
-	[_Lt_sep([$1])[]m4_defn([_Lt_prefix])[$3]m4_defn([_Lt_suffix])])])])])
-
-
-# lt_if_append_uniq(MACRO-NAME, VARNAME, [SEPARATOR], [UNIQ], [NOT-UNIQ])
-# -----------------------------------------------------------------------
-# Iff MACRO-NAME does not yet contain VARNAME, then append it (delimited
-# by SEPARATOR if supplied) and expand UNIQ, else NOT-UNIQ.
-m4_define([lt_if_append_uniq],
-[m4_ifdef([$1],
-	  [m4_if(m4_index([$3]m4_defn([$1])[$3], [$3$2$3]), [-1],
-		 [lt_append([$1], [$2], [$3])$4],
-		 [$5])],
-	  [lt_append([$1], [$2], [$3])$4])])
-
-
-# lt_dict_add(DICT, KEY, VALUE)
-# -----------------------------
-m4_define([lt_dict_add],
-[m4_define([$1($2)], [$3])])
-
-
-# lt_dict_add_subkey(DICT, KEY, SUBKEY, VALUE)
-# --------------------------------------------
-m4_define([lt_dict_add_subkey],
-[m4_define([$1($2:$3)], [$4])])
-
-
-# lt_dict_fetch(DICT, KEY, [SUBKEY])
-# ----------------------------------
-m4_define([lt_dict_fetch],
-[m4_ifval([$3],
-	m4_ifdef([$1($2:$3)], [m4_defn([$1($2:$3)])]),
-    m4_ifdef([$1($2)], [m4_defn([$1($2)])]))])
-
-
-# lt_if_dict_fetch(DICT, KEY, [SUBKEY], VALUE, IF-TRUE, [IF-FALSE])
-# -----------------------------------------------------------------
-m4_define([lt_if_dict_fetch],
-[m4_if(lt_dict_fetch([$1], [$2], [$3]), [$4],
-	[$5],
-    [$6])])
-
-
-# lt_dict_filter(DICT, [SUBKEY], VALUE, [SEPARATOR], KEY, [...])
-# --------------------------------------------------------------
-m4_define([lt_dict_filter],
-[m4_if([$5], [], [],
-  [lt_join(m4_quote(m4_default([$4], [[, ]])),
-           lt_unquote(m4_split(m4_normalize(m4_foreach(_Lt_key, lt_car([m4_shiftn(4, $@)]),
-		      [lt_if_dict_fetch([$1], _Lt_key, [$2], [$3], [_Lt_key ])])))))])[]dnl
-])
diff --git a/src/rocksdb/m4/ltversion.m4 b/src/rocksdb/m4/ltversion.m4
deleted file mode 100644
index 07a8602..0000000
--- a/src/rocksdb/m4/ltversion.m4
+++ /dev/null
@@ -1,23 +0,0 @@
-# ltversion.m4 -- version numbers			-*- Autoconf -*-
-#
-#   Copyright (C) 2004 Free Software Foundation, Inc.
-#   Written by Scott James Remnant, 2004
-#
-# This file is free software; the Free Software Foundation gives
-# unlimited permission to copy and/or distribute it, with or without
-# modifications, as long as this notice is preserved.
-
-# @configure_input@
-
-# serial 3337 ltversion.m4
-# This file is part of GNU Libtool
-
-m4_define([LT_PACKAGE_VERSION], [2.4.2])
-m4_define([LT_PACKAGE_REVISION], [1.3337])
-
-AC_DEFUN([LTVERSION_VERSION],
-[macro_version='2.4.2'
-macro_revision='1.3337'
-_LT_DECL(, macro_version, 0, [Which release of libtool.m4 was used?])
-_LT_DECL(, macro_revision, 0)
-])
diff --git a/src/rocksdb/m4/lt~obsolete.m4 b/src/rocksdb/m4/lt~obsolete.m4
deleted file mode 100644
index c573da9..0000000
--- a/src/rocksdb/m4/lt~obsolete.m4
+++ /dev/null
@@ -1,98 +0,0 @@
-# lt~obsolete.m4 -- aclocal satisfying obsolete definitions.    -*-Autoconf-*-
-#
-#   Copyright (C) 2004, 2005, 2007, 2009 Free Software Foundation, Inc.
-#   Written by Scott James Remnant, 2004.
-#
-# This file is free software; the Free Software Foundation gives
-# unlimited permission to copy and/or distribute it, with or without
-# modifications, as long as this notice is preserved.
-
-# serial 5 lt~obsolete.m4
-
-# These exist entirely to fool aclocal when bootstrapping libtool.
-#
-# In the past libtool.m4 has provided macros via AC_DEFUN (or AU_DEFUN)
-# which have later been changed to m4_define as they aren't part of the
-# exported API, or moved to Autoconf or Automake where they belong.
-#
-# The trouble is, aclocal is a bit thick.  It'll see the old AC_DEFUN
-# in /usr/share/aclocal/libtool.m4 and remember it, then when it sees us
-# using a macro with the same name in our local m4/libtool.m4 it'll
-# pull the old libtool.m4 in (it doesn't see our shiny new m4_define
-# and doesn't know about Autoconf macros at all.)
-#
-# So we provide this file, which has a silly filename so it's always
-# included after everything else.  This provides aclocal with the
-# AC_DEFUNs it wants, but when m4 processes it, it doesn't do anything
-# because those macros already exist, or will be overwritten later.
-# We use AC_DEFUN over AU_DEFUN for compatibility with aclocal-1.6. 
-#
-# Anytime we withdraw an AC_DEFUN or AU_DEFUN, remember to add it here.
-# Yes, that means every name once taken will need to remain here until
-# we give up compatibility with versions before 1.7, at which point
-# we need to keep only those names which we still refer to.
-
-# This is to help aclocal find these macros, as it can't see m4_define.
-AC_DEFUN([LTOBSOLETE_VERSION], [m4_if([1])])
-
-m4_ifndef([AC_LIBTOOL_LINKER_OPTION],	[AC_DEFUN([AC_LIBTOOL_LINKER_OPTION])])
-m4_ifndef([AC_PROG_EGREP],		[AC_DEFUN([AC_PROG_EGREP])])
-m4_ifndef([_LT_AC_PROG_ECHO_BACKSLASH],	[AC_DEFUN([_LT_AC_PROG_ECHO_BACKSLASH])])
-m4_ifndef([_LT_AC_SHELL_INIT],		[AC_DEFUN([_LT_AC_SHELL_INIT])])
-m4_ifndef([_LT_AC_SYS_LIBPATH_AIX],	[AC_DEFUN([_LT_AC_SYS_LIBPATH_AIX])])
-m4_ifndef([_LT_PROG_LTMAIN],		[AC_DEFUN([_LT_PROG_LTMAIN])])
-m4_ifndef([_LT_AC_TAGVAR],		[AC_DEFUN([_LT_AC_TAGVAR])])
-m4_ifndef([AC_LTDL_ENABLE_INSTALL],	[AC_DEFUN([AC_LTDL_ENABLE_INSTALL])])
-m4_ifndef([AC_LTDL_PREOPEN],		[AC_DEFUN([AC_LTDL_PREOPEN])])
-m4_ifndef([_LT_AC_SYS_COMPILER],	[AC_DEFUN([_LT_AC_SYS_COMPILER])])
-m4_ifndef([_LT_AC_LOCK],		[AC_DEFUN([_LT_AC_LOCK])])
-m4_ifndef([AC_LIBTOOL_SYS_OLD_ARCHIVE],	[AC_DEFUN([AC_LIBTOOL_SYS_OLD_ARCHIVE])])
-m4_ifndef([_LT_AC_TRY_DLOPEN_SELF],	[AC_DEFUN([_LT_AC_TRY_DLOPEN_SELF])])
-m4_ifndef([AC_LIBTOOL_PROG_CC_C_O],	[AC_DEFUN([AC_LIBTOOL_PROG_CC_C_O])])
-m4_ifndef([AC_LIBTOOL_SYS_HARD_LINK_LOCKS], [AC_DEFUN([AC_LIBTOOL_SYS_HARD_LINK_LOCKS])])
-m4_ifndef([AC_LIBTOOL_OBJDIR],		[AC_DEFUN([AC_LIBTOOL_OBJDIR])])
-m4_ifndef([AC_LTDL_OBJDIR],		[AC_DEFUN([AC_LTDL_OBJDIR])])
-m4_ifndef([AC_LIBTOOL_PROG_LD_HARDCODE_LIBPATH], [AC_DEFUN([AC_LIBTOOL_PROG_LD_HARDCODE_LIBPATH])])
-m4_ifndef([AC_LIBTOOL_SYS_LIB_STRIP],	[AC_DEFUN([AC_LIBTOOL_SYS_LIB_STRIP])])
-m4_ifndef([AC_PATH_MAGIC],		[AC_DEFUN([AC_PATH_MAGIC])])
-m4_ifndef([AC_PROG_LD_GNU],		[AC_DEFUN([AC_PROG_LD_GNU])])
-m4_ifndef([AC_PROG_LD_RELOAD_FLAG],	[AC_DEFUN([AC_PROG_LD_RELOAD_FLAG])])
-m4_ifndef([AC_DEPLIBS_CHECK_METHOD],	[AC_DEFUN([AC_DEPLIBS_CHECK_METHOD])])
-m4_ifndef([AC_LIBTOOL_PROG_COMPILER_NO_RTTI], [AC_DEFUN([AC_LIBTOOL_PROG_COMPILER_NO_RTTI])])
-m4_ifndef([AC_LIBTOOL_SYS_GLOBAL_SYMBOL_PIPE], [AC_DEFUN([AC_LIBTOOL_SYS_GLOBAL_SYMBOL_PIPE])])
-m4_ifndef([AC_LIBTOOL_PROG_COMPILER_PIC], [AC_DEFUN([AC_LIBTOOL_PROG_COMPILER_PIC])])
-m4_ifndef([AC_LIBTOOL_PROG_LD_SHLIBS],	[AC_DEFUN([AC_LIBTOOL_PROG_LD_SHLIBS])])
-m4_ifndef([AC_LIBTOOL_POSTDEP_PREDEP],	[AC_DEFUN([AC_LIBTOOL_POSTDEP_PREDEP])])
-m4_ifndef([LT_AC_PROG_EGREP],		[AC_DEFUN([LT_AC_PROG_EGREP])])
-m4_ifndef([LT_AC_PROG_SED],		[AC_DEFUN([LT_AC_PROG_SED])])
-m4_ifndef([_LT_CC_BASENAME],		[AC_DEFUN([_LT_CC_BASENAME])])
-m4_ifndef([_LT_COMPILER_BOILERPLATE],	[AC_DEFUN([_LT_COMPILER_BOILERPLATE])])
-m4_ifndef([_LT_LINKER_BOILERPLATE],	[AC_DEFUN([_LT_LINKER_BOILERPLATE])])
-m4_ifndef([_AC_PROG_LIBTOOL],		[AC_DEFUN([_AC_PROG_LIBTOOL])])
-m4_ifndef([AC_LIBTOOL_SETUP],		[AC_DEFUN([AC_LIBTOOL_SETUP])])
-m4_ifndef([_LT_AC_CHECK_DLFCN],		[AC_DEFUN([_LT_AC_CHECK_DLFCN])])
-m4_ifndef([AC_LIBTOOL_SYS_DYNAMIC_LINKER],	[AC_DEFUN([AC_LIBTOOL_SYS_DYNAMIC_LINKER])])
-m4_ifndef([_LT_AC_TAGCONFIG],		[AC_DEFUN([_LT_AC_TAGCONFIG])])
-m4_ifndef([AC_DISABLE_FAST_INSTALL],	[AC_DEFUN([AC_DISABLE_FAST_INSTALL])])
-m4_ifndef([_LT_AC_LANG_CXX],		[AC_DEFUN([_LT_AC_LANG_CXX])])
-m4_ifndef([_LT_AC_LANG_F77],		[AC_DEFUN([_LT_AC_LANG_F77])])
-m4_ifndef([_LT_AC_LANG_GCJ],		[AC_DEFUN([_LT_AC_LANG_GCJ])])
-m4_ifndef([AC_LIBTOOL_LANG_C_CONFIG],	[AC_DEFUN([AC_LIBTOOL_LANG_C_CONFIG])])
-m4_ifndef([_LT_AC_LANG_C_CONFIG],	[AC_DEFUN([_LT_AC_LANG_C_CONFIG])])
-m4_ifndef([AC_LIBTOOL_LANG_CXX_CONFIG],	[AC_DEFUN([AC_LIBTOOL_LANG_CXX_CONFIG])])
-m4_ifndef([_LT_AC_LANG_CXX_CONFIG],	[AC_DEFUN([_LT_AC_LANG_CXX_CONFIG])])
-m4_ifndef([AC_LIBTOOL_LANG_F77_CONFIG],	[AC_DEFUN([AC_LIBTOOL_LANG_F77_CONFIG])])
-m4_ifndef([_LT_AC_LANG_F77_CONFIG],	[AC_DEFUN([_LT_AC_LANG_F77_CONFIG])])
-m4_ifndef([AC_LIBTOOL_LANG_GCJ_CONFIG],	[AC_DEFUN([AC_LIBTOOL_LANG_GCJ_CONFIG])])
-m4_ifndef([_LT_AC_LANG_GCJ_CONFIG],	[AC_DEFUN([_LT_AC_LANG_GCJ_CONFIG])])
-m4_ifndef([AC_LIBTOOL_LANG_RC_CONFIG],	[AC_DEFUN([AC_LIBTOOL_LANG_RC_CONFIG])])
-m4_ifndef([_LT_AC_LANG_RC_CONFIG],	[AC_DEFUN([_LT_AC_LANG_RC_CONFIG])])
-m4_ifndef([AC_LIBTOOL_CONFIG],		[AC_DEFUN([AC_LIBTOOL_CONFIG])])
-m4_ifndef([_LT_AC_FILE_LTDLL_C],	[AC_DEFUN([_LT_AC_FILE_LTDLL_C])])
-m4_ifndef([_LT_REQUIRED_DARWIN_CHECKS],	[AC_DEFUN([_LT_REQUIRED_DARWIN_CHECKS])])
-m4_ifndef([_LT_AC_PROG_CXXCPP],		[AC_DEFUN([_LT_AC_PROG_CXXCPP])])
-m4_ifndef([_LT_PREPARE_SED_QUOTE_VARS],	[AC_DEFUN([_LT_PREPARE_SED_QUOTE_VARS])])
-m4_ifndef([_LT_PROG_ECHO_BACKSLASH],	[AC_DEFUN([_LT_PROG_ECHO_BACKSLASH])])
-m4_ifndef([_LT_PROG_F77],		[AC_DEFUN([_LT_PROG_F77])])
-m4_ifndef([_LT_PROG_FC],		[AC_DEFUN([_LT_PROG_FC])])
-m4_ifndef([_LT_PROG_CXX],		[AC_DEFUN([_LT_PROG_CXX])])
diff --git a/src/rocksdb/port/dirent.h b/src/rocksdb/port/dirent.h
new file mode 100644
index 0000000..ee4ded1
--- /dev/null
+++ b/src/rocksdb/port/dirent.h
@@ -0,0 +1,47 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// See port_example.h for documentation for the following types/functions.
+
+#ifndef STORAGE_LEVELDB_PORT_DIRENT_H_
+#define STORAGE_LEVELDB_PORT_DIRENT_H_
+
+#ifdef ROCKSDB_PLATFORM_POSIX
+#include <dirent.h>
+#include <sys/types.h>
+#elif defined(OS_WIN)
+
+namespace rocksdb {
+namespace port {
+
+struct dirent {
+  char d_name[_MAX_PATH]; /* filename */
+};
+
+struct DIR;
+
+DIR* opendir(const char* name);
+
+dirent* readdir(DIR* dirp);
+
+int closedir(DIR* dirp);
+
+}  // namespace port
+
+using port::dirent;
+using port::DIR;
+using port::opendir;
+using port::readdir;
+using port::closedir;
+
+}  // namespace rocksdb
+
+#endif  // OS_WIN
+
+#endif  // STORAGE_LEVELDB_PORT_DIRENT_H_
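
As an illustrative aside (not part of the upstream diff): a minimal sketch of
how the shim above is consumed on the OS_WIN branch, assuming the
opendir/readdir/closedir declarations are implemented in env_win.cc as they
are later in this commit. The helper name ListDir is hypothetical.

    // Hypothetical helper: list directory entries through the port shim.
    #include <string>
    #include <vector>
    #include "port/dirent.h"

    std::vector<std::string> ListDir(const std::string& path) {
      std::vector<std::string> names;
      rocksdb::DIR* d = rocksdb::opendir(path.c_str());
      if (d == nullptr) {
        return names;  // the caller decides how to surface the failure
      }
      while (rocksdb::dirent* entry = rocksdb::readdir(d)) {
        names.push_back(entry->d_name);  // d_name is a fixed _MAX_PATH buffer
      }
      rocksdb::closedir(d);
      return names;
    }
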
diff --git a/src/rocksdb/port/port.h b/src/rocksdb/port/port.h
index bc4b6a1..e949cd2 100644
--- a/src/rocksdb/port/port.h
+++ b/src/rocksdb/port/port.h
@@ -16,5 +16,7 @@
 // of what the new port_<platform>.h file must provide.
 #if defined(ROCKSDB_PLATFORM_POSIX)
 #include "port/port_posix.h"
+#elif defined(OS_WIN)
+#include "port/win/port_win.h"
 #endif
 
diff --git a/src/rocksdb/port/port_posix.cc b/src/rocksdb/port/port_posix.cc
index a8cffcc..773c6f1 100644
--- a/src/rocksdb/port/port_posix.cc
+++ b/src/rocksdb/port/port_posix.cc
@@ -9,11 +9,14 @@
 
 #include "port/port_posix.h"
 
-#include <stdio.h>
 #include <assert.h>
 #include <errno.h>
-#include <sys/time.h>
+#include <signal.h>
+#include <stdio.h>
 #include <string.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <unistd.h>
 #include <cstdlib>
 #include "util/logging.h"
 
@@ -133,5 +136,26 @@ void InitOnce(OnceType* once, void (*initializer)()) {
   PthreadCall("once", pthread_once(once, initializer));
 }
 
+void Crash(const std::string& srcfile, int srcline) {
+  fprintf(stdout, "Crashing at %s:%d\n", srcfile.c_str(), srcline);
+  fflush(stdout);
+  kill(getpid(), SIGTERM);
+}
+
+int GetMaxOpenFiles() {
+#if defined(RLIMIT_NOFILE)
+  struct rlimit no_files_limit;
+  if (getrlimit(RLIMIT_NOFILE, &no_files_limit) != 0) {
+    return -1;
+  }
+  // protect against overflow
+  if (no_files_limit.rlim_cur >= std::numeric_limits<int>::max()) {
+    return std::numeric_limits<int>::max();
+  }
+  return static_cast<int>(no_files_limit.rlim_cur);
+#endif
+  return -1;
+}
+
 }  // namespace port
 }  // namespace rocksdb
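
An illustrative aside (not part of the diff): how a caller might consume the
new GetMaxOpenFiles(), given its convention of returning -1 when the limit is
unknown. EffectiveMaxOpenFiles is a hypothetical helper.

    // Hypothetical helper: clamp a configured open-file budget to the
    // process limit reported by the port layer.
    #include "port/port_posix.h"

    int EffectiveMaxOpenFiles(int requested) {
      int system_limit = rocksdb::port::GetMaxOpenFiles();
      if (system_limit < 0) {
        return requested;  // limit unknown on this platform
      }
      return requested < system_limit ? requested : system_limit;
    }
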
diff --git a/src/rocksdb/port/port_posix.h b/src/rocksdb/port/port_posix.h
index dbb6e17..efb72ee 100644
--- a/src/rocksdb/port/port_posix.h
+++ b/src/rocksdb/port/port_posix.h
@@ -11,6 +11,13 @@
 
 #pragma once
 
+// size_t printf format specifier, named in the manner of the C99 standard
+// formatting macros such as PRIu64. On POSIX we can in fact simply use the
+// C99 "zu" length modifier directly.
+#define ROCKSDB_PRIszt "zu"
+
+#define ROCKSDB_NOEXCEPT noexcept
+
 #undef PLATFORM_IS_LITTLE_ENDIAN
 #if defined(OS_MACOSX)
   #include <machine/endian.h>
@@ -48,7 +55,7 @@
 
 #if defined(OS_MACOSX) || defined(OS_SOLARIS) || defined(OS_FREEBSD) ||\
     defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLYBSD) ||\
-    defined(OS_ANDROID)
+    defined(OS_ANDROID) || defined(CYGWIN)
 // Use fread/fwrite/fflush on platforms without _unlocked variants
 #define fread_unlocked fread
 #define fwrite_unlocked fwrite
@@ -67,9 +74,16 @@
 #define fdatasync fsync
 #endif
 
+#include <limits>
+
 namespace rocksdb {
 namespace port {
 
+// Used by db/file_indexer.h for kLevelMaxIndex
+const int kMaxInt32 = std::numeric_limits<int32_t>::max();
+const uint64_t kMaxUint64 = std::numeric_limits<uint64_t>::max();
+const size_t kMaxSizet = std::numeric_limits<size_t>::max();
+
 static const bool kLittleEndian = PLATFORM_IS_LITTLE_ENDIAN;
 #undef PLATFORM_IS_LITTLE_ENDIAN
 
@@ -139,6 +153,10 @@ extern void InitOnce(OnceType* once, void (*initializer)());
 
 #define PREFETCH(addr, rw, locality) __builtin_prefetch(addr, rw, locality)
 
+extern void Crash(const std::string& srcfile, int srcline);
+
+extern int GetMaxOpenFiles();
+
 } // namespace port
 } // namespace rocksdb
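
An illustrative aside (not part of the diff): what the new ROCKSDB_PRIszt
macro looks like at a call site. ReportBytes is a hypothetical function; the
Windows build can define the macro to the MSVC-specific length modifier
instead.

    // Hypothetical call site for the portable size_t format specifier.
    #include <cstdio>
    #include "port/port_posix.h"  // ROCKSDB_PRIszt is "zu" here

    void ReportBytes(size_t n) {
      // Expands to printf("flushed %zu bytes\n", n) on POSIX builds.
      printf("flushed %" ROCKSDB_PRIszt " bytes\n", n);
    }
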
 
diff --git a/src/rocksdb/port/sys_time.h b/src/rocksdb/port/sys_time.h
new file mode 100644
index 0000000..6c23d8e
--- /dev/null
+++ b/src/rocksdb/port/sys_time.h
@@ -0,0 +1,48 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// This file is a portable substitute for sys/time.h which does not exist on
+// Windows
+
+#ifndef STORAGE_LEVELDB_PORT_SYS_TIME_H_
+#define STORAGE_LEVELDB_PORT_SYS_TIME_H_
+
+#if defined(OS_WIN) && defined(_MSC_VER)
+
+#include <time.h>
+
+namespace rocksdb {
+
+namespace port {
+
+// Avoid including winsock2.h for this definition
+typedef struct timeval {
+  long tv_sec;
+  long tv_usec;
+} timeval;
+
+void gettimeofday(struct timeval* tv, struct timezone* tz);
+
+inline struct tm* localtime_r(const time_t* timep, struct tm* result) {
+  errno_t ret = localtime_s(result, timep);
+  return (ret == 0) ? result : NULL;
+}
+}
+
+using port::timeval;
+using port::gettimeofday;
+using port::localtime_r;
+}
+
+#else
+#include <time.h>
+#include <sys/time.h>
+#endif
+
+#endif  // STORAGE_LEVELDB_PORT_SYS_TIME_H_
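
An illustrative aside (not part of the diff): one call site that compiles
unchanged on POSIX and under MSVC thanks to this shim. PrintNow is
hypothetical.

    // Hypothetical call site: format a wall-clock timestamp portably.
    #include <cstdio>
    #include "port/sys_time.h"

    void PrintNow() {
      struct timeval tv;
      gettimeofday(&tv, nullptr);  // port::gettimeofday on the MSVC build
      time_t seconds = tv.tv_sec;
      struct tm t;
      if (localtime_r(&seconds, &t) != nullptr) {  // port::localtime_r on MSVC
        printf("%04d-%02d-%02d %02d:%02d:%02d\n",
               t.tm_year + 1900, t.tm_mon + 1, t.tm_mday,
               t.tm_hour, t.tm_min, t.tm_sec);
      }
    }
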
diff --git a/src/rocksdb/port/util_logger.h b/src/rocksdb/port/util_logger.h
new file mode 100644
index 0000000..dbb6717
--- /dev/null
+++ b/src/rocksdb/port/util_logger.h
@@ -0,0 +1,23 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_PORT_UTIL_LOGGER_H_
+#define STORAGE_LEVELDB_PORT_UTIL_LOGGER_H_
+
+// Include the appropriate platform specific file below.  If you are
+// porting to a new platform, see "port_example.h" for documentation
+// of what the new port_<platform>.h file must provide.
+
+#if defined(ROCKSDB_PLATFORM_POSIX)
+#include "util/posix_logger.h"
+#elif defined(OS_WIN)
+#include "port/win/win_logger.h"
+#endif
+
+#endif  // STORAGE_LEVELDB_PORT_UTIL_LOGGER_H_
diff --git a/src/rocksdb/port/win/env_win.cc b/src/rocksdb/port/win/env_win.cc
new file mode 100644
index 0000000..4584732
--- /dev/null
+++ b/src/rocksdb/port/win/env_win.cc
@@ -0,0 +1,2099 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <algorithm>
+#include <deque>
+#include <thread>
+#include <ctime>
+
+#include <errno.h>
+#include <process.h>
+#include <io.h>
+#include <direct.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include "rocksdb/env.h"
+#include "rocksdb/slice.h"
+
+#include "port/port.h"
+#include "port/dirent.h"
+#include "port/win/win_logger.h"
+
+#include "util/random.h"
+#include "util/iostats_context_imp.h"
+#include "util/rate_limiter.h"
+#include "util/sync_point.h"
+#include "util/aligned_buffer.h"
+
+#include "util/thread_status_updater.h"
+#include "util/thread_status_util.h"
+
+#include <Rpc.h>  // For UUID generation
+#include <Windows.h>
+
+namespace rocksdb {
+
+std::string GetWindowsErrSz(DWORD err) {
+  LPSTR lpMsgBuf;
+  FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM |
+                     FORMAT_MESSAGE_IGNORE_INSERTS,
+                 NULL, err,
+                 0,  // Default language
+                 reinterpret_cast<LPSTR>(&lpMsgBuf), 0, NULL);
+
+  std::string Err = lpMsgBuf;
+  LocalFree(lpMsgBuf);
+  return Err;
+}
+
+namespace {
+
+const size_t c_OneMB = (1 << 20);
+
+ThreadStatusUpdater* CreateThreadStatusUpdater() {
+  return new ThreadStatusUpdater();
+}
+
+// A stand-in for fadvise: Windows has no equivalent, so this stub simply
+// does nothing and reports success.
+int Fadvise(int fd, off_t offset, size_t len, int advice) {
+  return 0;  // simply do nothing.
+}
+
+inline Status IOErrorFromWindowsError(const std::string& context, DWORD err) {
+  return Status::IOError(context, GetWindowsErrSz(err));
+}
+
+inline Status IOErrorFromLastWindowsError(const std::string& context) {
+  return IOErrorFromWindowsError(context, GetLastError());
+}
+
+inline Status IOError(const std::string& context, int err_number) {
+  return Status::IOError(context, strerror(err_number));
+}
+
+// TODO(sdong): temporary logging to help debugging. Remove it once
+// the feature has proven stable.
+inline void PrintThreadInfo(size_t thread_id, size_t terminatingId) {
+  fprintf(stdout, "Bg thread %Iu terminates %Iu\n", thread_id, terminatingId);
+}
+
+// returns the ID of the current process
+inline int current_process_id() { return _getpid(); }
+
+// RAII helpers for HANDLEs
+const auto CloseHandleFunc = [](HANDLE h) { ::CloseHandle(h); };
+typedef std::unique_ptr<void, decltype(CloseHandleFunc)> UniqueCloseHandlePtr;
+
+// We preserve the original POSIX names of these interfaces to denote the
+// idea behind them: all reads and writes happen at a caller-specified
+// offset, and the POSIX pread/pwrite calls do not change the position of
+// the file pointer (judging from the man page and errno, they perform the
+// seek and the restore atomically). WriteFile()/ReadFile() with an
+// OVERLAPPED offset do not have this capability: the pointer is advanced
+// to the position after the transfer, which is fine for writes because
+// they are (or should be) sequential. Because all reads and writes happen
+// at the specified offset, callers should in theory not rely on the
+// current file offset.
+SSIZE_T pwrite(HANDLE hFile, const char* src, size_t numBytes,
+               uint64_t offset) {
+  OVERLAPPED overlapped = {0};
+  ULARGE_INTEGER offsetUnion;
+  offsetUnion.QuadPart = offset;
+
+  overlapped.Offset = offsetUnion.LowPart;
+  overlapped.OffsetHigh = offsetUnion.HighPart;
+
+  SSIZE_T result = 0;
+
+  unsigned long bytesWritten = 0;
+
+  if (FALSE == WriteFile(hFile, src, numBytes, &bytesWritten, &overlapped)) {
+    result = -1;
+  } else {
+    result = bytesWritten;
+  }
+
+  return result;
+}
+
+// See comments for pwrite above
+SSIZE_T pread(HANDLE hFile, char* src, size_t numBytes, uint64_t offset) {
+  OVERLAPPED overlapped = {0};
+  ULARGE_INTEGER offsetUnion;
+  offsetUnion.QuadPart = offset;
+
+  overlapped.Offset = offsetUnion.LowPart;
+  overlapped.OffsetHigh = offsetUnion.HighPart;
+
+  SSIZE_T result = 0;
+
+  unsigned long bytesRead = 0;
+
+  if (FALSE == ReadFile(hFile, src, numBytes, &bytesRead, &overlapped)) {
+    return -1;
+  } else {
+    result = bytesRead;
+  }
+
+  return result;
+}
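
An illustrative aside (not part of the diff): the positional semantics
described above, from a caller's point of view. RoundTrip is hypothetical and
would live in this file, since the wrappers sit in an anonymous namespace.

    // Hypothetical caller: offsets are always explicit, and the handle's
    // current file pointer is never consulted.
    void RoundTrip(HANDLE h) {
      const char msg[] = "hello";
      char buf[sizeof(msg)] = {0};
      SSIZE_T written = pwrite(h, msg, sizeof(msg), /*offset=*/4096);
      SSIZE_T read = pread(h, buf, sizeof(msg), /*offset=*/4096);
      // Both return -1 on failure, otherwise the byte count transferred.
      assert(written == sizeof(msg) && read == sizeof(msg));
    }
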
+
+// Note: the two helpers below do not set errno. They are used only in this
+// file, on Windows handles, so errno is not necessary, and translating
+// GetLastError() to errno is a sad business anyway.
+inline int fsync(HANDLE hFile) {
+  if (!FlushFileBuffers(hFile)) {
+    return -1;
+  }
+
+  return 0;
+}
+
+// SetFileInformationByHandle() can pre-allocate space quickly.
+// However, this does not change the end-of-file position unless the file
+// is truncated, and the pre-allocated space is not considered zero-filled.
+inline Status fallocate(const std::string& filename, HANDLE hFile,
+                        uint64_t to_size) {
+  Status status;
+
+  FILE_ALLOCATION_INFO alloc_info;
+  alloc_info.AllocationSize.QuadPart = to_size;
+
+  if (!SetFileInformationByHandle(hFile, FileAllocationInfo, &alloc_info,
+                                  sizeof(FILE_ALLOCATION_INFO))) {
+    auto lastError = GetLastError();
+    status = IOErrorFromWindowsError(
+        "Failed to pre-allocate space: " + filename, lastError);
+  }
+
+  return status;
+}
+
+inline Status ftruncate(const std::string& filename, HANDLE hFile,
+                        uint64_t toSize) {
+  Status status;
+
+  FILE_END_OF_FILE_INFO end_of_file;
+  end_of_file.EndOfFile.QuadPart = toSize;
+
+  if (!SetFileInformationByHandle(hFile, FileEndOfFileInfo, &end_of_file,
+                                  sizeof(FILE_END_OF_FILE_INFO))) {
+    auto lastError = GetLastError();
+    status = IOErrorFromWindowsError("Failed to Set end of file: " + filename,
+                                     lastError);
+  }
+
+  return status;
+}
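
An illustrative aside (not part of the diff): the reserve-then-trim pattern
these two helpers enable, used below by WinMmapFile and WinWritableFile.
FinishFile is hypothetical.

    // Hypothetical sequence: reserve space generously while writing, then
    // trim the file to the exact byte count before the handle is closed.
    Status FinishFile(const std::string& name, HANDLE h,
                      uint64_t bytes_written, uint64_t bytes_reserved) {
      Status s;
      if (bytes_reserved > bytes_written) {
        s = ftruncate(name, h, bytes_written);  // SetEndOfFile underneath
      }
      return s;
    }
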
+
+// mmap() based random-access
+class WinMmapReadableFile : public RandomAccessFile {
+  const std::string fileName_;
+  HANDLE hFile_;
+  HANDLE hMap_;
+
+  const void* mapped_region_;
+  const size_t length_;
+
+ public:
+  // mapped_region_[0,length-1] contains the mmapped contents of the file.
+  WinMmapReadableFile(const std::string& fileName, HANDLE hFile, HANDLE hMap,
+                      const void* mapped_region, size_t length)
+      : fileName_(fileName),
+        hFile_(hFile),
+        hMap_(hMap),
+        mapped_region_(mapped_region),
+        length_(length) {}
+
+  ~WinMmapReadableFile() {
+    BOOL ret = ::UnmapViewOfFile(mapped_region_);
+    assert(ret);
+
+    ret = ::CloseHandle(hMap_);
+    assert(ret);
+
+    ret = ::CloseHandle(hFile_);
+    assert(ret);
+  }
+
+  virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                      char* scratch) const override {
+    Status s;
+
+    if (offset > length_) {
+      *result = Slice();
+      return IOError(fileName_, EINVAL);
+    } else if (offset + n > length_) {
+      n = length_ - offset;
+    }
+    *result =
+        Slice(reinterpret_cast<const char*>(mapped_region_) + offset, n);
+    return s;
+  }
+
+  virtual Status InvalidateCache(size_t offset, size_t length) override {
+    return Status::OK();
+  }
+};
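
An illustrative aside (not part of the diff): the handles this class expects
to take ownership of; the real factory is NewRandomAccessFile further down in
this file. MapWholeFile is hypothetical.

    // Hypothetical construction for a read-only file of known size; the
    // class takes ownership of hFile, hMap, and the mapped view.
    std::unique_ptr<RandomAccessFile> MapWholeFile(const std::string& fname,
                                                   HANDLE hFile,
                                                   size_t length) {
      HANDLE hMap = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
      if (hMap == NULL) {
        return nullptr;
      }
      const void* base = ::MapViewOfFile(hMap, FILE_MAP_READ, 0, 0, length);
      if (base == nullptr) {
        ::CloseHandle(hMap);
        return nullptr;
      }
      return std::unique_ptr<RandomAccessFile>(
          new WinMmapReadableFile(fname, hFile, hMap, base, length));
    }
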
+
+// We preallocate up to an extra megabyte and use memcpy to append new
+// data to the file.  This is safe since we either properly close the
+// file before reading from it, or for log files, the reading code
+// knows enough to skip zero suffixes.
+class WinMmapFile : public WritableFile {
+ private:
+  const std::string filename_;
+  HANDLE hFile_;
+  HANDLE hMap_;
+
+  const size_t page_size_;  // We flush the mapping view in page_size
+                            // increments; this may turn out to be either
+                            // the memory page size or the SSD page size
+  const size_t
+      allocation_granularity_;  // View must start at such a granularity
+  size_t mapping_size_;         // The mapping is kept at a specific size so
+                                // that the file remains expandable
+  size_t view_size_;            // How much memory to map into a view at a time
+
+  char* mapped_begin_;  // Must begin at the file offset that is aligned with
+                        // allocation_granularity_
+  char* mapped_end_;
+  char* dst_;  // Where to write next  (in range [mapped_begin_,mapped_end_])
+  char* last_sync_;  // Where have we synced up to
+
+  uint64_t file_offset_;  // Offset of mapped_begin_ in file
+
+  // Do we have unsynced writes?
+  bool pending_sync_;
+
+  // Can only truncate or reserve to a sector-size-aligned value when
+  // used on files that are opened with unbuffered I/O
+  Status TruncateFile(uint64_t toSize) {
+    return ftruncate(filename_, hFile_, toSize);
+  }
+
+  // Can only truncate or reserve to a sector-size-aligned value when
+  // used on files that are opened with unbuffered I/O.
+  // Normally this is not a problem here, since buffering is not disabled
+  // for memory-mapped files.
+  Status ReserveFileSpace(uint64_t toSize) {
+    IOSTATS_TIMER_GUARD(allocate_nanos);
+    return fallocate(filename_, hFile_, toSize);
+  }
+
+  Status UnmapCurrentRegion() {
+    Status status;
+
+    if (mapped_begin_ != nullptr) {
+      if (!::UnmapViewOfFile(mapped_begin_)) {
+        status = IOErrorFromWindowsError(
+            "Failed to unmap file view: " + filename_, GetLastError());
+      }
+
+      // UnmapViewOfFile automatically sends the data (but not the metadata)
+      // to disk, which is good: it provides a rough equivalent of
+      // fdatasync() on Linux, so we do not need a separate metadata flag.
+      pending_sync_ = false;
+      mapped_begin_ = nullptr;
+      mapped_end_ = nullptr;
+      dst_ = nullptr;
+      last_sync_ = nullptr;
+
+      // Move on to the next portion of the file
+      file_offset_ += view_size_;
+
+      // Increase the amount we map the next time, but capped at 1MB
+      view_size_ *= 2;
+      view_size_ = std::min(view_size_, c_OneMB);
+    }
+
+    return status;
+  }
+
+  Status MapNewRegion() {
+    Status status;
+
+    assert(mapped_begin_ == nullptr);
+
+    size_t minMappingSize = file_offset_ + view_size_;
+
+    // Check whether we need to create a new mapping because we want to
+    // write beyond the current one, i.e. the mapping view is now too short.
+    // CreateFileMapping extends the file automatically when the mapping
+    // size exceeds the current file length, which reserves the space and
+    // makes writing faster; however, Windows cannot map an empty file, so
+    // the first time around we must extend the file ourselves.
+    if (hMap_ == NULL || minMappingSize > mapping_size_) {
+      if (NULL == hMap_) {
+        // Creating mapping for the first time so reserve the space on disk
+        status = ReserveFileSpace(minMappingSize);
+        if (!status.ok()) {
+          return status;
+        }
+      }
+
+      if (hMap_) {
+        // Unmap the previous one
+        BOOL ret = ::CloseHandle(hMap_);
+        assert(ret);
+        hMap_ = NULL;
+      }
+
+      // Calculate the new mapping size which will hopefully reserve space for
+      // several consecutive sliding views
+      // Query preallocation block size if set
+      size_t preallocationBlockSize = 0;
+      size_t lastAllocatedBlockSize = 0;  // Not used
+      GetPreallocationStatus(&preallocationBlockSize, &lastAllocatedBlockSize);
+
+      if (preallocationBlockSize) {
+        preallocationBlockSize =
+            Roundup(preallocationBlockSize, allocation_granularity_);
+      } else {
+        preallocationBlockSize = 2 * view_size_;
+      }
+
+      mapping_size_ += preallocationBlockSize;
+
+      ULARGE_INTEGER mappingSize;
+      mappingSize.QuadPart = mapping_size_;
+
+      hMap_ = CreateFileMappingA(
+          hFile_,
+          NULL,                  // Security attributes
+          PAGE_READWRITE,        // There is not a write only mode for mapping
+          mappingSize.HighPart,  // Enable mapping the whole file but the actual
+                                 // amount mapped is determined by MapViewOfFile
+          mappingSize.LowPart,
+          NULL);  // Mapping name
+
+      if (NULL == hMap_) {
+        return IOErrorFromWindowsError(
+            "WindowsMmapFile failed to create file mapping for: " + filename_,
+            GetLastError());
+      }
+    }
+
+    ULARGE_INTEGER offset;
+    offset.QuadPart = file_offset_;
+
+    // View must begin at a granularity-aligned offset
+    mapped_begin_ = reinterpret_cast<char*>(
+        MapViewOfFileEx(hMap_, FILE_MAP_WRITE, offset.HighPart, offset.LowPart,
+                        view_size_, NULL));
+
+    if (!mapped_begin_) {
+      status = IOErrorFromWindowsError(
+          "WindowsMmapFile failed to map file view: " + filename_,
+          GetLastError());
+    } else {
+      mapped_end_ = mapped_begin_ + view_size_;
+      dst_ = mapped_begin_;
+      last_sync_ = mapped_begin_;
+      pending_sync_ = false;
+    }
+    return status;
+  }
+
+ public:
+  WinMmapFile(const std::string& fname, HANDLE hFile, size_t page_size,
+              size_t allocation_granularity, const EnvOptions& options)
+      : filename_(fname),
+        hFile_(hFile),
+        hMap_(NULL),
+        page_size_(page_size),
+        allocation_granularity_(allocation_granularity),
+        mapping_size_(0),
+        view_size_(0),
+        mapped_begin_(nullptr),
+        mapped_end_(nullptr),
+        dst_(nullptr),
+        last_sync_(nullptr),
+        file_offset_(0),
+        pending_sync_(false) {
+    // Allocation granularity must be obtained from GetSystemInfo() and must be
+    // a power of two.
+    assert(allocation_granularity > 0);
+    assert((allocation_granularity & (allocation_granularity - 1)) == 0);
+
+    assert(page_size > 0);
+    assert((page_size & (page_size - 1)) == 0);
+
+    // Only for memory mapped writes
+    assert(options.use_mmap_writes);
+
+    // Make sure buffering is not disabled. Buffering is ignored for mapping
+    // purposes, but disabling it also restricts where the file position may
+    // be moved. That matters less for reserving space, which is likely a
+    // multiple of allocation_granularity, but we also want Close() to be
+    // able to truncate the file at an arbitrary position, so that we do not
+    // have to fill the tail with zeros.
+    assert(options.use_os_buffer);
+
+    // View size must be a multiple of both allocation_granularity AND the
+    // page size
+    if ((allocation_granularity_ % page_size_) == 0) {
+      view_size_ = 2 * allocation_granularity;
+    } else if ((page_size_ % allocation_granularity_) == 0) {
+      view_size_ = 2 * page_size_;
+    } else {
+      // Neither is a multiple of the other; we could fall back to their
+      // product, but this case is not expected to occur.
+      assert(false);
+    }
+  }
+
+  ~WinMmapFile() {
+    if (hFile_) {
+      this->Close();
+    }
+  }
+
+  virtual Status Append(const Slice& data) override {
+    const char* src = data.data();
+    size_t left = data.size();
+
+    while (left > 0) {
+      assert(mapped_begin_ <= dst_);
+      size_t avail = mapped_end_ - dst_;
+
+      if (avail == 0) {
+        Status s = UnmapCurrentRegion();
+        if (s.ok()) {
+          s = MapNewRegion();
+        }
+
+        if (!s.ok()) {
+          return s;
+        }
+      }
+
+      size_t n = std::min(left, avail);
+      memcpy(dst_, src, n);
+      dst_ += n;
+      src += n;
+      left -= n;
+      pending_sync_ = true;
+    }
+
+    return Status::OK();
+  }
+
+  // A no-op: Close() already takes care of truncating to the precise
+  // size and needs no additional information.
+  virtual Status Truncate(uint64_t size) override {
+    return Status::OK();
+  }
+
+  virtual Status Close() override {
+    Status s;
+
+    assert(NULL != hFile_);
+
+    // We truncate to the precise size so that there is no uninitialized
+    // data at the end. SetEndOfFile, which we use, does not write zeros,
+    // which is what we want.
+    uint64_t targetSize = GetFileSize();
+
+    s = UnmapCurrentRegion();
+
+    if (NULL != hMap_) {
+      BOOL ret = ::CloseHandle(hMap_);
+      if (!ret && s.ok()) {
+        auto lastError = GetLastError();
+        s = IOErrorFromWindowsError(
+            "Failed to Close mapping for file: " + filename_, lastError);
+      }
+
+      hMap_ = NULL;
+    }
+
+    TruncateFile(targetSize);
+
+    BOOL ret = ::CloseHandle(hFile_);
+    hFile_ = NULL;
+
+    if (!ret && s.ok()) {
+      auto lastError = GetLastError();
+      s = IOErrorFromWindowsError(
+          "Failed to close file map handle: " + filename_, lastError);
+    }
+
+    return s;
+  }
+
+  virtual Status Flush() override { return Status::OK(); }
+
+  // Flush only data
+  virtual Status Sync() override {
+    Status s;
+
+    // Some writes occurred since last sync
+    if (pending_sync_) {
+      assert(mapped_begin_);
+      assert(dst_);
+      assert(dst_ > mapped_begin_);
+      assert(dst_ < mapped_end_);
+
+      size_t page_begin =
+          TruncateToPageBoundary(page_size_, last_sync_ - mapped_begin_);
+      size_t page_end =
+          TruncateToPageBoundary(page_size_, dst_ - mapped_begin_ - 1);
+      last_sync_ = dst_;
+
+      // Flush only whole pages; the range is rounded out to page boundaries
+      if (!::FlushViewOfFile(mapped_begin_ + page_begin,
+                             (page_end - page_begin) + page_size_)) {
+        s = IOErrorFromWindowsError("Failed to FlushViewOfFile: " + filename_,
+                                    GetLastError());
+      }
+
+      pending_sync_ = false;
+    }
+
+    return s;
+  }
+
+  /**
+  * Flush data as well as metadata to stable storage.
+  */
+  virtual Status Fsync() override {
+    Status s;
+
+    // Flush metadata if pending
+    const bool pending = pending_sync_;
+
+    s = Sync();
+
+    // Flush metadata
+    if (s.ok() && pending) {
+      if (!::FlushFileBuffers(hFile_)) {
+        s = IOErrorFromWindowsError("Failed to FlushFileBuffers: " + filename_,
+                                    GetLastError());
+      }
+    }
+
+    return s;
+  }
+
+  /**
+  * Get the size of valid data in the file. This will not match the
+  * size that is returned from the filesystem because we use mmap
+  * to extend file by map_size every time.
+  */
+  virtual uint64_t GetFileSize() override {
+    size_t used = dst_ - mapped_begin_;
+    return file_offset_ + used;
+  }
+
+  virtual Status InvalidateCache(size_t offset, size_t length) override {
+    return Status::OK();
+  }
+
+  virtual Status Allocate(off_t offset, off_t len) override {
+    return Status::OK();
+  }
+};
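
An illustrative aside (not part of the diff): a worked trace of the
sliding-view policy in UnmapCurrentRegion()/MapNewRegion() above, assuming a
64 KB allocation granularity and 4 KB pages (so the constructor picks
view_size_ = 128 KB).

    // Successive unmap/map cycles advance the view and double its size,
    // capped at c_OneMB:
    //   view 1: file_offset_ = 0 KB,    view_size_ = 128 KB
    //   view 2: file_offset_ = 128 KB,  view_size_ = 256 KB
    //   view 3: file_offset_ = 384 KB,  view_size_ = 512 KB
    //   view 4: file_offset_ = 896 KB,  view_size_ = 1 MB
    //   view 5: file_offset_ = 1920 KB, view_size_ = 1 MB  (and so on)
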
+
+class WinSequentialFile : public SequentialFile {
+ private:
+  const std::string filename_;
+  HANDLE file_;
+
+  // There is no equivalent of advising away buffered pages as in POSIX.
+  // To implement this flag we would need to do unbuffered reads, which
+  // would have to be aligned (and it is not clear that the buffer passed
+  // in is guaranteed to be aligned).
+  // Hence we currently ignore this flag. It is used only in a few cases
+  // which should not be perf critical.
+  // If perf evaluation finds this to be a problem, we can look into
+  // implementing it.
+  bool use_os_buffer_;
+
+ public:
+  WinSequentialFile(const std::string& fname, HANDLE f,
+                    const EnvOptions& options)
+      : filename_(fname),
+        file_(f),
+        use_os_buffer_(options.use_os_buffer) {}
+
+  virtual ~WinSequentialFile() {
+    assert(file_ != INVALID_HANDLE_VALUE);
+    CloseHandle(file_);
+  }
+
+  virtual Status Read(size_t n, Slice* result, char* scratch) override {
+    Status s;
+    size_t r = 0;
+
+    // The Windows ReadFile API accepts a DWORD byte count. Reading more
+    // than UINT_MAX would require a loop, but that case is highly
+    // unlikely, so we simply reject it.
+    if (n > UINT_MAX) {
+      return IOErrorFromWindowsError(filename_, ERROR_INVALID_PARAMETER);
+    }
+
+    DWORD bytesToRead = static_cast<DWORD>(n); //cast is safe due to the check above
+    DWORD bytesRead = 0;
+    BOOL ret = ReadFile(file_, scratch, bytesToRead, &bytesRead, NULL);
+    if (ret == TRUE) {
+      r = bytesRead;
+    } else {
+      return IOErrorFromWindowsError(filename_, GetLastError());
+    }
+
+    *result = Slice(scratch, r);
+
+    return s;
+  }
+
+  virtual Status Skip(uint64_t n) override {
+    // Can't handle more than the signed 64-bit max, as SetFilePointerEx
+    // accepts a signed 64-bit integer; n this large is highly unlikely.
+    if (n > _I64_MAX) {
+      return IOErrorFromWindowsError(filename_, ERROR_INVALID_PARAMETER);
+    }
+
+    LARGE_INTEGER li;
+    li.QuadPart = static_cast<int64_t>(n); //cast is safe due to the check above
+    BOOL ret = SetFilePointerEx(file_, li, NULL, FILE_CURRENT);
+    if (ret == FALSE) {
+      return IOErrorFromWindowsError(filename_, GetLastError());
+    }
+    return Status::OK();
+  }
+
+  virtual Status InvalidateCache(size_t offset, size_t length) override {
+    return Status::OK();
+  }
+};
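
An illustrative aside (not part of the diff): this class is reached through
the generic Env API declared in rocksdb/env.h. ReadHead is a hypothetical
caller.

    // Hypothetical caller, going through the generic Env interface.
    rocksdb::Status ReadHead(rocksdb::Env* env, const std::string& fname) {
      std::unique_ptr<rocksdb::SequentialFile> file;
      rocksdb::Status s =
          env->NewSequentialFile(fname, &file, rocksdb::EnvOptions());
      if (!s.ok()) {
        return s;
      }
      char scratch[4096];
      rocksdb::Slice chunk;
      // Dispatches to WinSequentialFile::Read on the Windows Env.
      return file->Read(sizeof(scratch), &chunk, scratch);
    }
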
+
+// pread() based random-access
+class WinRandomAccessFile : public RandomAccessFile {
+  const std::string filename_;
+  HANDLE hFile_;
+  const bool use_os_buffer_;
+  mutable std::mutex buffer_mut_;
+  mutable AlignedBuffer buffer_;
+  mutable uint64_t
+      buffered_start_;  // file offset of the data currently buffered
+
+ public:
+  WinRandomAccessFile(const std::string& fname, HANDLE hFile, size_t alignment,
+                      const EnvOptions& options)
+      : filename_(fname),
+        hFile_(hFile),
+        use_os_buffer_(options.use_os_buffer),
+        buffer_(),
+        buffered_start_(0) {
+    assert(!options.use_mmap_reads);
+
+    // Unbuffered access, use internal buffer for reads
+    if (!use_os_buffer_) {
+      buffer_.Alignment(alignment);
+      // Random read, no need in a big buffer
+      // We read things in database blocks which are likely to be similar to
+      // the alignment we use.
+      buffer_.AllocateNewBuffer(alignment * 2);
+    }
+  }
+
+  virtual ~WinRandomAccessFile() {
+    if (hFile_ != NULL && hFile_ != INVALID_HANDLE_VALUE) {
+      ::CloseHandle(hFile_);
+    }
+  }
+
+  virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                      char* scratch) const override {
+    Status s;
+    SSIZE_T r = -1;
+    size_t left = n;
+    char* dest = scratch;
+
+    // When in unbuffered mode we need to make the following changes:
+    // - use our own aligned buffer
+    // - always read at offsets that are a multiple of the alignment
+    if (!use_os_buffer_) {
+      std::lock_guard<std::mutex> lg(buffer_mut_);
+
+      // Let's see if at least some of the requested data is already
+      // in the buffer
+      if (offset >= buffered_start_ &&
+          offset < (buffered_start_ + buffer_.CurrentSize())) {
+        size_t buffer_offset = offset - buffered_start_;
+        r = buffer_.Read(dest, buffer_offset, left);
+        assert(r >= 0);
+
+        left -= size_t(r);
+        offset += r;
+        dest += r;
+      }
+
+      // Still some left or none was buffered
+      if (left > 0) {
+        // Figure out the start/end offset for reading and amount to read
+        const size_t alignment = buffer_.Alignment();
+        const size_t start_page_start =
+            TruncateToPageBoundary(alignment, offset);
+        const size_t end_page_start =
+            TruncateToPageBoundary(alignment, offset + left - 1);
+        const size_t actual_bytes_toread =
+            (end_page_start - start_page_start) + alignment;
+
+        if (buffer_.Capacity() < actual_bytes_toread) {
+          buffer_.AllocateNewBuffer(actual_bytes_toread);
+        } else {
+          buffer_.Clear();
+        }
+
+        SSIZE_T read = 0;
+        read = pread(hFile_, buffer_.Destination(), actual_bytes_toread,
+                      start_page_start);
+
+        if (read > 0) {
+          buffer_.Size(read);
+          buffered_start_ = start_page_start;
+
+          // Let's figure out how much we read from the user's standpoint
+          if ((buffered_start_ + uint64_t(read)) > offset) {
+            size_t buffer_offset = offset - buffered_start_;
+            r = buffer_.Read(dest, buffer_offset, left);
+          } else {
+            r = 0;
+          }
+          left -= r;
+        } else {
+          r = read;
+        }
+      }
+
+    } else {
+      r = pread(hFile_, scratch, left, offset);
+      if (r > 0) {
+        left -= r;
+      }
+    }
+
+    *result = Slice(scratch, (r < 0) ? 0 : n - left);
+
+    if (r < 0) {
+      s = IOErrorFromLastWindowsError(filename_);
+    }
+    return s;
+  }
+
+  virtual bool ShouldForwardRawRequest() const override {
+    return true;
+  }
+
+  virtual void Hint(AccessPattern pattern) override {}
+
+  virtual Status InvalidateCache(size_t offset, size_t length) override {
+    return Status::OK();
+  }
+};
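
An illustrative aside (not part of the diff): a worked instance of the
page-rounding arithmetic in Read() above, assuming TruncateToPageBoundary(a, x)
rounds x down to a multiple of a, with alignment = 4096.

    // Request: offset = 5000, left = 5000 (caller range [5000, 10000)):
    //   start_page_start    = TruncateToPageBoundary(4096, 5000) = 4096
    //   end_page_start      = TruncateToPageBoundary(4096, 9999) = 8192
    //   actual_bytes_toread = (8192 - 4096) + 4096               = 8192
    // The pread covers the aligned range [4096, 12288), which fully
    // contains the caller's range; the extra bytes stay in buffer_ for
    // subsequent reads.
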
+
+// This is a sequential write class. Like the others, it is modeled on the
+// original POSIX class, but we also add support for unbuffered I/O on
+// Windows: the original buffer is used as an alignment buffer to write
+// directly to the file with no OS buffering. Unbuffered access requires the
+// provided buffer to be aligned to the physical sector size (SSD page size)
+// and all SetFilePointer() operations to occur with the same alignment. We
+// therefore always write to the drive in sector/page-size increments and
+// leave the tail either for the next write or for Close(), at which point
+// we pad with zeros. No padding is required for buffered access.
+class WinWritableFile : public WritableFile {
+ private:
+  const std::string filename_;
+  HANDLE            hFile_;
+  const bool        use_os_buffer_;  // If false, the file was opened for
+                                     // unbuffered (no OS cache) access
+  const uint64_t    alignment_;
+  uint64_t          filesize_;       // How much data is actually written to disk
+  uint64_t          reservedsize_;   // How far we have reserved space
+
+ public:
+  WinWritableFile(const std::string& fname, HANDLE hFile, size_t alignment,
+                  size_t capacity, const EnvOptions& options)
+      : filename_(fname),
+        hFile_(hFile),
+        use_os_buffer_(options.use_os_buffer),
+        alignment_(alignment),
+        filesize_(0),
+        reservedsize_(0) {
+    assert(!options.use_mmap_writes);
+  }
+
+  ~WinWritableFile() {
+    if (NULL != hFile_ && INVALID_HANDLE_VALUE != hFile_) {
+      WinWritableFile::Close();
+    }
+  }
+
+  // Indicates if the class makes use of unbuffered I/O
+  virtual bool UseOSBuffer() const override {
+    return use_os_buffer_;
+  }
+
+  virtual size_t GetRequiredBufferAlignment() const override {
+    return alignment_;
+  }
+
+  virtual Status Append(const Slice& data) override {
+
+    // Used for buffered access ONLY
+    assert(use_os_buffer_);
+    assert(data.size() < std::numeric_limits<int>::max());
+
+    Status s;
+
+    DWORD bytesWritten = 0;
+    if (!WriteFile(hFile_, data.data(),
+        data.size(), &bytesWritten, NULL)) {
+      auto lastError = GetLastError();
+      s = IOErrorFromWindowsError(
+        "Failed to WriteFile: " + filename_,
+        lastError);
+    } else {
+      assert(size_t(bytesWritten) == data.size());
+      filesize_ += data.size();
+    }
+
+    return s;
+  }
+
+  virtual Status PositionedAppend(const Slice& data, uint64_t offset) override {
+    Status s;
+
+    SSIZE_T ret = pwrite(hFile_, data.data(), 
+      data.size(), offset);
+
+    // Error break
+    if (ret < 0) {
+      auto lastError = GetLastError();
+      s = IOErrorFromWindowsError(
+        "Failed to pwrite for: " + filename_, lastError);
+    } else {
+      // With positional write it is not clear at all
+      // if this actually extends the filesize
+      assert(size_t(ret) == data.size());
+      filesize_ += data.size();
+    }
+    return s;
+  }
+
+  // Need to implement this so the file is truncated correctly
+  // in both buffered and unbuffered modes
+  virtual Status Truncate(uint64_t size) override {
+    Status s =  ftruncate(filename_, hFile_, size);
+    if (s.ok()) {
+      filesize_ = size;
+    }
+    return s;
+  }
+
+  virtual Status Close() override {
+
+    Status s;
+
+    assert(INVALID_HANDLE_VALUE != hFile_);
+
+    if (fsync(hFile_) < 0) {
+      auto lastError = GetLastError();
+      s = IOErrorFromWindowsError("fsync failed at Close() for: " + filename_,
+        lastError);
+    }
+
+    if (FALSE == ::CloseHandle(hFile_)) {
+      auto lastError = GetLastError();
+      s = IOErrorFromWindowsError("CloseHandle failed for: " + filename_,
+                                  lastError);
+    }
+
+    hFile_ = INVALID_HANDLE_VALUE;
+    return s;
+  }
+
+  // Write out the cached data to the OS cache.
+  // This is now taken care of by the WritableFileWriter.
+  virtual Status Flush() override {
+    return Status::OK();
+  }
+
+  virtual Status Sync() override {
+    Status s;
+    // Calls flush buffers
+    if (fsync(hFile_) < 0) {
+      auto lastError = GetLastError();
+      s = IOErrorFromWindowsError("fsync failed at Sync() for: " + filename_,
+                                  lastError);
+    }
+    return s;
+  }
+
+  virtual Status Fsync() override { return Sync(); }
+
+  virtual uint64_t GetFileSize() override {
+    // This duplicates the accounting in WritableFileWriter, and the size
+    // will be wrong when unbuffered access is used. However, tests
+    // implement their own writable files and do not use WritableFileWrapper,
+    // so we need to squeeze a square peg through a round hole here.
+    return filesize_;
+  }
+
+  virtual Status Allocate(off_t offset, off_t len) override {
+    Status status;
+    TEST_KILL_RANDOM(rocksdb_kill_odds);
+
+    // Make sure we reserve an aligned amount of space; the reservation
+    // block size is driven from outside, so check here whether the current
+    // reservation already suffices.
+    size_t spaceToReserve = Roundup(offset + len, alignment_);
+    // Nothing to do
+    if (spaceToReserve <= reservedsize_) {
+      return status;
+    }
+
+    IOSTATS_TIMER_GUARD(allocate_nanos);
+    status = fallocate(filename_, hFile_, spaceToReserve);
+    if (status.ok()) {
+      reservedsize_ = spaceToReserve;
+    }
+    return status;
+  }
+};
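
An illustrative aside (not part of the diff): a worked instance of the
reservation rounding in Allocate() above, assuming Roundup(x, a) rounds x up
to a multiple of a, with alignment_ = 4096.

    // Allocate(offset = 10000, len = 3000):
    //   spaceToReserve = Roundup(13000, 4096) = 16384  -> fallocate(16384)
    // A later Allocate(12000, 2000) computes Roundup(14000, 4096) = 16384,
    // which is <= reservedsize_, so no second fallocate call is made.
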
+
+class WinDirectory : public Directory {
+ public:
+  WinDirectory() {}
+
+  virtual Status Fsync() override { return Status::OK(); }
+};
+
+class WinFileLock : public FileLock {
+ public:
+  explicit WinFileLock(HANDLE hFile) : hFile_(hFile) {
+    assert(hFile != NULL);
+    assert(hFile != INVALID_HANDLE_VALUE);
+  }
+
+  ~WinFileLock() {
+    BOOL ret = ::CloseHandle(hFile_);
+    assert(ret);
+  }
+
+ private:
+  HANDLE hFile_;
+};
+
+namespace {
+
+void WinthreadCall(const char* label, std::error_code result) {
+  if (0 != result.value()) {
+    fprintf(stderr, "pthread %s: %s\n", label, strerror(result.value()));
+    abort();
+  }
+}
+}
+
+class WinEnv : public Env {
+ public:
+  WinEnv();
+
+  virtual ~WinEnv() {
+    for (auto& th : threads_to_join_) {
+      th.join();
+    }
+
+    threads_to_join_.clear();
+
+    for (auto& thpool : thread_pools_) {
+      thpool.JoinAllThreads();
+    }
+    // All threads must be joined before the deletion of
+    // thread_status_updater_.
+    delete thread_status_updater_;
+  }
+
+  virtual Status DeleteFile(const std::string& fname) override {
+    Status result;
+
+    if (_unlink(fname.c_str())) {
+      result = IOError("Failed to delete: " + fname, errno);
+    }
+
+    return result;
+  }
+
+  Status GetCurrentTime(int64_t* unix_time) override {
+    time_t time = std::time(nullptr);
+    if (time == (time_t)(-1)) {
+      return Status::NotSupported("Failed to get time");
+    }
+
+    *unix_time = time;
+    return Status::OK();
+  }
+
+  virtual Status NewSequentialFile(const std::string& fname,
+                                   std::unique_ptr<SequentialFile>* result,
+                                   const EnvOptions& options) override {
+    Status s;
+
+    result->reset();
+
+    // The corruption test needs to rename and delete files of this kind
+    // while they are still open with another handle. For that reason we
+    // allow shared write and delete (which also allows rename).
+    HANDLE hFile = INVALID_HANDLE_VALUE;
+    {
+      IOSTATS_TIMER_GUARD(open_nanos);
+      hFile = CreateFileA(
+          fname.c_str(), GENERIC_READ,
+          FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, NULL,
+          OPEN_EXISTING,  // Original fopen mode is "rb"
+          FILE_ATTRIBUTE_NORMAL, NULL);
+    }
+
+    if (INVALID_HANDLE_VALUE == hFile) {
+      auto lastError = GetLastError();
+      s = IOErrorFromWindowsError("Failed to open NewSequentialFile" + fname,
+                                  lastError);
+    } else {
+      result->reset(new WinSequentialFile(fname, hFile, options));
+    }
+    return s;
+  }
+
+  virtual Status NewRandomAccessFile(const std::string& fname,
+                                     std::unique_ptr<RandomAccessFile>* result,
+                                     const EnvOptions& options) override {
+    result->reset();
+    Status s;
+
+    // Open the file for read-only random access
+    // The random-access hint disables read-ahead, as the system otherwise
+    // reads too much data
+    DWORD fileFlags = FILE_ATTRIBUTE_READONLY;
+
+    if (!options.use_os_buffer && !options.use_mmap_reads) {
+      fileFlags |= FILE_FLAG_NO_BUFFERING;
+    } else {
+      fileFlags |= FILE_FLAG_RANDOM_ACCESS;
+    }
+
+    // Shared access is necessary for the corruption test to pass;
+    // almost all tests would work, with a possible exception of fault_injection
+    HANDLE hFile = 0;
+    {
+      IOSTATS_TIMER_GUARD(open_nanos);
+      hFile =
+          CreateFileA(fname.c_str(), GENERIC_READ,
+                      FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
+                      NULL, OPEN_EXISTING, fileFlags, NULL);
+    }
+
+    if (INVALID_HANDLE_VALUE == hFile) {
+      auto lastError = GetLastError();
+      return IOErrorFromWindowsError(
+          "NewRandomAccessFile failed to Create/Open: " + fname, lastError);
+    }
+
+    UniqueCloseHandlePtr fileGuard(hFile, CloseHandleFunc);
+
+    // CAUTION! This will map the entire file into the process address space
+    if (options.use_mmap_reads && sizeof(void*) >= 8) {
+      // Use mmap when virtual address-space is plentiful.
+      uint64_t fileSize;
+
+      s = GetFileSize(fname, &fileSize);
+
+      if (s.ok()) {
+        // Will not map empty files
+        if (fileSize == 0) {
+          return IOError(
+              "NewRandomAccessFile failed to map empty file: " + fname, EINVAL);
+        }
+
+        HANDLE hMap = CreateFileMappingA(hFile, NULL, PAGE_READONLY,
+                                         0,  // Whole file at its present length
+                                         0,
+                                         NULL);  // Mapping name
+
+        if (!hMap) {
+          auto lastError = GetLastError();
+          return IOErrorFromWindowsError(
+              "Failed to create file mapping for NewRandomAccessFile: " + fname,
+              lastError);
+        }
+
+        UniqueCloseHandlePtr mapGuard(hMap, CloseHandleFunc);
+
+        const void* mapped_region =
+            MapViewOfFileEx(hMap, FILE_MAP_READ,
+                            0,  // High DWORD of access start
+                            0,  // Low DWORD
+                            fileSize,
+                            NULL);  // Let the OS choose the mapping
+
+        if (!mapped_region) {
+          auto lastError = GetLastError();
+          return IOErrorFromWindowsError(
+              "Failed to MapViewOfFile for NewRandomAccessFile: " + fname,
+              lastError);
+        }
+
+        result->reset(new WinMmapReadableFile(fname, hFile, hMap, mapped_region,
+                                              fileSize));
+
+        mapGuard.release();
+        fileGuard.release();
+      }
+    } else {
+      result->reset(new WinRandomAccessFile(fname, hFile, page_size_, options));
+      fileGuard.release();
+    }
+    return s;
+  }
+
+  virtual Status NewWritableFile(const std::string& fname,
+                                 std::unique_ptr<WritableFile>* result,
+                                 const EnvOptions& options) override {
+    const size_t c_BufferCapacity = 64 * 1024;
+
+    EnvOptions local_options(options);
+
+    result->reset();
+    Status s;
+
+    DWORD fileFlags = FILE_ATTRIBUTE_NORMAL;
+
+    if (!local_options.use_os_buffer && !local_options.use_mmap_writes) {
+      fileFlags = FILE_FLAG_NO_BUFFERING;
+    }
+
+    // Desired access. We only want to write here, but if we want to
+    // memory-map the file then there is no write-only mode, so we have to
+    // create it Read/Write. However, MapViewOfFile itself can specify
+    // write-only access.
+    DWORD desired_access = GENERIC_WRITE;
+    DWORD shared_mode = FILE_SHARE_READ;
+
+    if (local_options.use_mmap_writes) {
+      desired_access |= GENERIC_READ;
+    } else {
+      // Adding this solely for tests to pass (fault_injection_test,
+      // wal_manager_test).
+      shared_mode |= (FILE_SHARE_WRITE | FILE_SHARE_DELETE);
+    }
+
+    HANDLE hFile = 0;
+    {
+      IOSTATS_TIMER_GUARD(open_nanos);
+      hFile = CreateFileA(
+          fname.c_str(),
+          desired_access,  // Access desired
+          shared_mode,
+          NULL,           // Security attributes
+          CREATE_ALWAYS,  // Posix env says O_CREAT | O_RDWR | O_TRUNC
+          fileFlags,      // Flags
+          NULL);          // Template File
+    }
+
+    if (INVALID_HANDLE_VALUE == hFile) {
+      auto lastError = GetLastError();
+      return IOErrorFromWindowsError(
+          "Failed to create a NewWriteableFile: " + fname, lastError);
+    }
+
+    if (options.use_mmap_writes) {
+      // We usually do not use mmapping on SSD and thus we pass memory
+      // page_size
+      result->reset(new WinMmapFile(fname, hFile, page_size_,
+                                    allocation_granularity_, local_options));
+    } else {
+      // Here we want the buffer allocation to be aligned by the SSD page size
+      // and to be a multiple of it
+      result->reset(new WinWritableFile(fname, hFile, page_size_,
+                                        c_BufferCapacity, local_options));
+    }
+    return s;
+  }
+
+  virtual Status NewDirectory(const std::string& name,
+                              std::unique_ptr<Directory>* result) override {
+    Status s;
+    // Must be nullptr on failure
+    result->reset();
+    // Must fail if directory does not exist
+    if (!DirExists(name)) {
+      s = IOError("Directory does not exist: " + name, EEXIST);
+    } else {
+      IOSTATS_TIMER_GUARD(open_nanos);
+      result->reset(new WinDirectory);
+    }
+    return s;
+  }
+
+  virtual Status FileExists(const std::string& fname) override {
+    // F_OK == 0
+    const int F_OK_ = 0;
+    return _access(fname.c_str(), F_OK_) == 0 ? Status::OK()
+                                              : Status::NotFound();
+  }
+
+  virtual Status GetChildren(const std::string& dir,
+                             std::vector<std::string>* result) override {
+    std::vector<std::string> output;
+
+    Status status;
+
+    auto CloseDir = [](DIR* p) { closedir(p); };
+    std::unique_ptr<DIR, decltype(CloseDir)> dirp(opendir(dir.c_str()),
+                                                  CloseDir);
+
+    if (!dirp) {
+      status = IOError(dir, errno);
+    } else {
+      if (result->capacity() > 0) {
+        output.reserve(result->capacity());
+      }
+
+      struct dirent* ent = readdir(dirp.get());
+      while (ent) {
+        output.push_back(ent->d_name);
+        ent = readdir(dirp.get());
+      }
+    }
+
+    output.swap(*result);
+
+    return status;
+  }
+
+  virtual Status CreateDir(const std::string& name) override {
+    Status result;
+
+    if (_mkdir(name.c_str()) != 0) {
+      auto code = errno;
+      result = IOError("Failed to create dir: " + name, code);
+    }
+
+    return result;
+  }
+
+  virtual Status CreateDirIfMissing(const std::string& name) override {
+    Status result;
+
+    if (DirExists(name)) {
+      return result;
+    }
+
+    if (_mkdir(name.c_str()) != 0) {
+      if (errno == EEXIST) {
+        result =
+            Status::IOError("`" + name + "' exists but is not a directory");
+      } else {
+        auto code = errno;
+        result = IOError("Failed to create dir: " + name, code);
+      }
+    }
+
+    return result;
+  }
+
+  virtual Status DeleteDir(const std::string& name) override {
+    Status result;
+    if (_rmdir(name.c_str()) != 0) {
+      auto code = errno;
+      result = IOError("Failed to remove dir: " + name, code);
+    }
+    return result;
+  }
+
+  virtual Status GetFileSize(const std::string& fname,
+                             uint64_t* size) override {
+    Status s;
+
+    WIN32_FILE_ATTRIBUTE_DATA attrs;
+    if (GetFileAttributesExA(fname.c_str(), GetFileExInfoStandard, &attrs)) {
+      ULARGE_INTEGER file_size;
+      file_size.HighPart = attrs.nFileSizeHigh;
+      file_size.LowPart = attrs.nFileSizeLow;
+      *size = file_size.QuadPart;
+    } else {
+      auto lastError = GetLastError();
+      s = IOErrorFromWindowsError("Can not get size for: " + fname, lastError);
+    }
+    return s;
+  }
+
+  static inline uint64_t FileTimeToUnixTime(const FILETIME& ftTime) {
+    const uint64_t c_FileTimePerSecond = 10000000U;
+    // UNIX epoch starts on 1970-01-01T00:00:00Z
+    // Windows FILETIME starts on 1601-01-01T00:00:00Z
+    // Therefore, we need to subtract the below number of seconds from
+    // the seconds that we obtain from FILETIME with an obvious loss of
+    // precision
+    const uint64_t c_SecondBeforeUnixEpoch = 11644473600U;
+
+    ULARGE_INTEGER li;
+    li.HighPart = ftTime.dwHighDateTime;
+    li.LowPart = ftTime.dwLowDateTime;
+
+    uint64_t result =
+        (li.QuadPart / c_FileTimePerSecond) - c_SecondBeforeUnixEpoch;
+    return result;
+  }
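+
+  // Worked example (hypothetical input): the FILETIME tick count for
+  // 1970-01-01T00:00:01Z is (11644473600 + 1) * 10^7 ticks, so
+  //
+  //   (116444736010000000 / 10000000) - 11644473600 == 1
+  //
+  // i.e. one second past the UNIX epoch, with sub-second ticks discarded.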
+
+  virtual Status GetFileModificationTime(const std::string& fname,
+                                         uint64_t* file_mtime) override {
+    Status s;
+
+    WIN32_FILE_ATTRIBUTE_DATA attrs;
+    if (GetFileAttributesExA(fname.c_str(), GetFileExInfoStandard, &attrs)) {
+      *file_mtime = FileTimeToUnixTime(attrs.ftLastWriteTime);
+    } else {
+      auto lastError = GetLastError();
+      s = IOErrorFromWindowsError(
+          "Can not get file modification time for: " + fname, lastError);
+      *file_mtime = 0;
+    }
+
+    return s;
+  }
+
+  virtual Status RenameFile(const std::string& src,
+                            const std::string& target) override {
+    Status result;
+
+    // rename() is not capable of replacing the existing file as it is on
+    // Linux, so use the OS API directly
+    if (!MoveFileExA(src.c_str(), target.c_str(), MOVEFILE_REPLACE_EXISTING)) {
+      DWORD lastError = GetLastError();
+
+      std::string text("Failed to rename: ");
+      text.append(src).append(" to: ").append(target);
+
+      result = IOErrorFromWindowsError(text, lastError);
+    }
+
+    return result;
+  }
+
+  virtual Status LinkFile(const std::string& src,
+                          const std::string& target) override {
+    Status result;
+
+    if (!CreateHardLinkA(target.c_str(), src.c_str(), NULL)) {
+      DWORD lastError = GetLastError();
+
+      std::string text("Failed to link: ");
+      text.append(src).append(" to: ").append(target);
+
+      result = IOErrorFromWindowsError(text, lastError);
+    }
+
+    return result;
+  }
+
+  virtual Status LockFile(const std::string& lockFname,
+                          FileLock** lock) override {
+    assert(lock != nullptr);
+
+    *lock = NULL;
+    Status result;
+
+    // No-sharing, this is a LOCK file
+    const DWORD ExclusiveAccessON = 0;
+
+    // Obtain exclusive access to the LOCK file
+    // Previously, instead of NORMAL attr we set DELETE on close and that worked
+    // well except with fault_injection test that insists on deleting it.
+    HANDLE hFile = 0;
+    {
+      IOSTATS_TIMER_GUARD(open_nanos);
+      hFile = CreateFileA(lockFname.c_str(), (GENERIC_READ | GENERIC_WRITE),
+                          ExclusiveAccessON, NULL, CREATE_ALWAYS,
+                          FILE_ATTRIBUTE_NORMAL, NULL);
+    }
+
+    if (INVALID_HANDLE_VALUE == hFile) {
+      auto lastError = GetLastError();
+      result = IOErrorFromWindowsError(
+          "Failed to create lock file: " + lockFname, lastError);
+    } else {
+      *lock = new WinFileLock(hFile);
+    }
+
+    return result;
+  }
+
+  virtual Status UnlockFile(FileLock* lock) override {
+    Status result;
+
+    assert(lock != nullptr);
+
+    delete lock;
+
+    return result;
+  }
+
+  virtual void Schedule(void (*function)(void*), void* arg, Priority pri = LOW,
+                        void* tag = nullptr) override;
+
+  virtual int UnSchedule(void* arg, Priority pri) override;
+
+  virtual void StartThread(void (*function)(void* arg), void* arg) override;
+
+  virtual void WaitForJoin() override;
+
+  virtual unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const override;
+
+  virtual Status GetTestDirectory(std::string* result) override {
+    std::string output;
+
+    const char* env = getenv("TEST_TMPDIR");
+    if (env && env[0] != '\0') {
+      output = env;
+      CreateDir(output);
+    } else {
+      env = getenv("TMP");
+
+      if (env && env[0] != '\0') {
+        output = env;
+      } else {
+        output = "c:\\tmp";
+      }
+
+      CreateDir(output);
+    }
+
+    output.append("\\testrocksdb-");
+    output.append(std::to_string(_getpid()));
+
+    CreateDir(output);
+
+    output.swap(*result);
+
+    return Status::OK();
+  }
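+
+  // For illustration: with TEST_TMPDIR unset and TMP=c:\Temp (hypothetical
+  // values), the directory produced is c:\Temp\testrocksdb-<pid>.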
+
+  virtual Status GetThreadList(
+      std::vector<ThreadStatus>* thread_list) override {
+    assert(thread_status_updater_);
+    return thread_status_updater_->GetThreadList(thread_list);
+  }
+
+  static uint64_t gettid() {
+    uint64_t thread_id = GetCurrentThreadId();
+    return thread_id;
+  }
+
+  virtual uint64_t GetThreadID() const override { return gettid(); }
+
+  virtual Status NewLogger(const std::string& fname,
+                           std::shared_ptr<Logger>* result) override {
+    Status s;
+
+    result->reset();
+
+    HANDLE hFile = 0;
+    {
+      IOSTATS_TIMER_GUARD(open_nanos);
+      hFile = CreateFileA(
+          fname.c_str(), GENERIC_WRITE,
+          FILE_SHARE_READ | FILE_SHARE_DELETE,  // In RocksDb log files are
+                                                // renamed and deleted before
+                                                // they are closed. This enables
+                                                // doing so.
+          NULL,
+          CREATE_ALWAYS,  // Original fopen mode is "w"
+          FILE_ATTRIBUTE_NORMAL, NULL);
+    }
+
+    if (INVALID_HANDLE_VALUE == hFile) {
+      auto lastError = GetLastError();
+      s = IOErrorFromWindowsError("Failed to open LogFile" + fname, lastError);
+    } else {
+      {
+        // With log files we want to set the true creation time as of now,
+        // because the system for some reason caches the attributes of the
+        // previous file that was just renamed away from this name, which
+        // makes auto_roll_logger_test fail
+        FILETIME ft;
+        GetSystemTimeAsFileTime(&ft);
+        // Set creation, last access and last write time to the same value
+        SetFileTime(hFile, &ft, &ft, &ft);
+      }
+      result->reset(new WinLogger(&WinEnv::gettid, this, hFile));
+    }
+    return s;
+  }
+
+  virtual uint64_t NowMicros() override {
+    // All std::chrono clocks on Windows proved to return values that may
+    // repeat, which is not good enough for some uses.
+    const int64_t c_UnixEpochStartTicks = 116444736000000000i64;
+    const int64_t c_FtToMicroSec = 10;
+
+    // This interface needs to return system time and not
+    // just any microseconds because it is often used as an argument
+    // to TimedWait() on condition variable
+    FILETIME ftSystemTime;
+    GetSystemTimePreciseAsFileTime(&ftSystemTime);
+
+    LARGE_INTEGER li;
+    li.LowPart = ftSystemTime.dwLowDateTime;
+    li.HighPart = ftSystemTime.dwHighDateTime;
+    // Subtract unix epoch start
+    li.QuadPart -= c_UnixEpochStartTicks;
+    // Convert to microsecs
+    li.QuadPart /= c_FtToMicroSec;
+    return li.QuadPart;
+  }
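+
+  // Note on the constants above: c_UnixEpochStartTicks is just the
+  // 11644473600-second epoch gap expressed in 100 ns FILETIME ticks,
+  // 11644473600 * 10^7 == 116444736000000000, and dividing the shifted
+  // tick count by c_FtToMicroSec (10) converts 100 ns units to microseconds.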
+
+  virtual uint64_t NowNanos() override {
+    // All std::chrono clocks on Windows have the same resolution, which is
+    // only good enough for microseconds but not nanoseconds.
+    // On Windows 8 and Windows 2012 Server
+    // GetSystemTimePreciseAsFileTime(&current_time) can be used
+    LARGE_INTEGER li;
+    QueryPerformanceCounter(&li);
+    // Convert to nanoseconds first to avoid loss of precision
+    // and divide by frequency
+    li.QuadPart *= std::nano::den;
+    li.QuadPart /= perf_counter_frequency_;
+    return li.QuadPart;
+  }
+
+  virtual void SleepForMicroseconds(int micros) override {
+    std::this_thread::sleep_for(std::chrono::microseconds(micros));
+  }
+
+  virtual Status GetHostName(char* name, uint64_t len) override {
+    Status s;
+    DWORD nSize = static_cast<DWORD>(len);
+
+    if (!::GetComputerNameA(name, &nSize)) {
+      auto lastError = GetLastError();
+      s = IOErrorFromWindowsError("GetHostName", lastError);
+    } else {
+      name[nSize] = 0;
+    }
+
+    return s;
+  }
+
+  virtual Status GetCurrTime(int64_t* unix_time) {
+    Status s;
+
+    time_t ret = time(nullptr);
+    if (ret == (time_t)-1) {
+      *unix_time = 0;
+      s = IOError("GetCurrTime", errno);
+    } else {
+      *unix_time = (int64_t)ret;
+    }
+
+    return s;
+  }
+
+  virtual Status GetAbsolutePath(const std::string& db_path,
+                                 std::string* output_path) override {
+    // Check if we already have an absolute path: one that starts with a
+    // slash, or one that starts with a non-dot and has a drive-letter colon
+    if ((!db_path.empty() && (db_path[0] == '/' || db_path[0] == '\\')) ||
+        (db_path.size() > 2 && db_path[0] != '.' &&
+         ((db_path[1] == ':' && db_path[2] == '\\') ||
+          (db_path[1] == ':' && db_path[2] == '/')))) {
+      *output_path = db_path;
+      return Status::OK();
+    }
+
+    std::string result;
+    result.resize(_MAX_PATH);
+
+    char* ret = _getcwd(&result[0], _MAX_PATH);
+    if (ret == nullptr) {
+      return Status::IOError("Failed to get current working directory",
+                             strerror(errno));
+    }
+
+    result.resize(strlen(result.data()));
+
+    result.swap(*output_path);
+    return Status::OK();
+  }
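+
+  // For illustration, inputs the check above treats as already absolute
+  // (hypothetical paths): \db, /db, c:\db and c:/db; whereas db, .\db
+  // and ..\db fall through to the _getcwd() branch.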
+
+  // Allow increasing the number of worker threads.
+  virtual void SetBackgroundThreads(int num, Priority pri) override {
+    assert(pri >= Priority::LOW && pri <= Priority::HIGH);
+    thread_pools_[pri].SetBackgroundThreads(num);
+  }
+
+  virtual void IncBackgroundThreadsIfNeeded(int num, Priority pri) override {
+    assert(pri >= Priority::LOW && pri <= Priority::HIGH);
+    thread_pools_[pri].IncBackgroundThreadsIfNeeded(num);
+  }
+
+  virtual std::string TimeToString(uint64_t secondsSince1970) override {
+    std::string result;
+
+    const time_t seconds = secondsSince1970;
+    const int maxsize = 64;
+
+    struct tm t;
+    errno_t ret = localtime_s(&t, &seconds);
+
+    if (ret) {
+      result = std::to_string(seconds);
+    } else {
+      result.resize(maxsize);
+      char* p = &result[0];
+
+      int len = snprintf(p, maxsize, "%04d/%02d/%02d-%02d:%02d:%02d ",
+                         t.tm_year + 1900, t.tm_mon + 1, t.tm_mday, t.tm_hour,
+                         t.tm_min, t.tm_sec);
+      assert(len > 0);
+
+      result.resize(len);
+    }
+
+    return result;
+  }
+
+  EnvOptions OptimizeForLogWrite(const EnvOptions& env_options,
+                                 const DBOptions& db_options) const override {
+    EnvOptions optimized = env_options;
+    optimized.use_mmap_writes = false;
+    optimized.bytes_per_sync = db_options.wal_bytes_per_sync;
+    optimized.use_os_buffer =
+        true;  // This is because we flush only whole pages on unbuffered io and
+               // the last records are not guaranteed to be flushed.
+    // TODO(icanadi) it's faster if fallocate_with_keep_size is false, but it
+    // breaks TransactionLogIteratorStallAtLastRecord unit test. Fix the unit
+    // test and make this false
+    optimized.fallocate_with_keep_size = true;
+    return optimized;
+  }
+
+  EnvOptions OptimizeForManifestWrite(
+      const EnvOptions& env_options) const override {
+    EnvOptions optimized = env_options;
+    optimized.use_mmap_writes = false;
+    optimized.use_os_buffer = true;
+    optimized.fallocate_with_keep_size = true;
+    return optimized;
+  }
+
+ private:
+  // Returns true iff the named directory exists and is a directory.
+  virtual bool DirExists(const std::string& dname) {
+    WIN32_FILE_ATTRIBUTE_DATA attrs;
+    if (GetFileAttributesExA(dname.c_str(), GetFileExInfoStandard, &attrs)) {
+      return 0 != (attrs.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY);
+    }
+    return false;
+  }
+
+  bool SupportsFastAllocate(const std::string& /* path */) { return false; }
+
+  class ThreadPool {
+   public:
+    ThreadPool()
+        : total_threads_limit_(1),
+          bgthreads_(0),
+          queue_(),
+          queue_len_(0U),
+          exit_all_threads_(false),
+          low_io_priority_(false),
+          env_(nullptr) {}
+
+    ~ThreadPool() { assert(bgthreads_.size() == 0U); }
+
+    void JoinAllThreads() {
+      {
+        std::lock_guard<std::mutex> lock(mu_);
+        assert(!exit_all_threads_);
+        exit_all_threads_ = true;
+        bgsignal_.notify_all();
+      }
+
+      for (std::thread& th : bgthreads_) {
+        th.join();
+      }
+
+      // Subject to the assert in the dtor
+      bgthreads_.clear();
+    }
+
+    void SetHostEnv(Env* env) { env_ = env; }
+
+    // Return true if there is at least one thread that needs to terminate.
+    bool HasExcessiveThread() const {
+      return bgthreads_.size() > total_threads_limit_;
+    }
+
+    // Return true iff the current thread is the excessive thread to terminate.
+    // Always terminate the running thread that was added last, even if
+    // more than one thread needs to terminate.
+    bool IsLastExcessiveThread(size_t thread_id) const {
+      return HasExcessiveThread() && thread_id == bgthreads_.size() - 1;
+    }
+
+    // Returns true if this is one of the threads to terminate.
+    bool IsExcessiveThread(size_t thread_id) const {
+      return thread_id >= total_threads_limit_;
+    }
+
+    // Return the thread priority.
+    // This would allow its member-thread to know its priority.
+    Env::Priority GetThreadPriority() { return priority_; }
+
+    // Set the thread priority.
+    void SetThreadPriority(Env::Priority priority) { priority_ = priority; }
+
+    void BGThread(size_t thread_id) {
+      while (true) {
+        // Wait until there is an item that is ready to run
+        std::unique_lock<std::mutex> uniqueLock(mu_);
+
+        // Stop waiting if the thread needs to do work or needs to terminate.
+        while (!exit_all_threads_ && !IsLastExcessiveThread(thread_id) &&
+               (queue_.empty() || IsExcessiveThread(thread_id))) {
+          bgsignal_.wait(uniqueLock);
+        }
+
+        if (exit_all_threads_) {
+          // mechanism to let BG threads exit safely
+          uniqueLock.unlock();
+          break;
+        }
+
+        if (IsLastExcessiveThread(thread_id)) {
+          // Current thread is the last generated one and is excessive.
+          // We always terminate excessive thread in the reverse order of
+          // generation time.
+          std::thread& terminating_thread = bgthreads_.back();
+          auto tid = terminating_thread.get_id();
+          // Ensure that this thread is ours
+          assert(tid == std::this_thread::get_id());
+          terminating_thread.detach();
+          bgthreads_.pop_back();
+
+          if (HasExcessiveThread()) {
+            // There is still at least one more excessive thread to terminate.
+            WakeUpAllThreads();
+          }
+
+          uniqueLock.unlock();
+
+          PrintThreadInfo(thread_id, gettid());
+          break;
+        }
+
+        void (*function)(void*) = queue_.front().function;
+        void* arg = queue_.front().arg;
+        queue_.pop_front();
+        queue_len_.store(queue_.size(), std::memory_order_relaxed);
+
+        uniqueLock.unlock();
+        (*function)(arg);
+      }
+    }
+
+    // Helper struct for passing arguments when creating threads.
+    struct BGThreadMetadata {
+      ThreadPool* thread_pool_;
+      size_t thread_id_;  // Ordinal of this thread in the pool.
+
+      BGThreadMetadata(ThreadPool* thread_pool, size_t thread_id)
+          : thread_pool_(thread_pool), thread_id_(thread_id) {}
+    };
+
+    static void* BGThreadWrapper(void* arg) {
+      std::unique_ptr<BGThreadMetadata> meta(
+          reinterpret_cast<BGThreadMetadata*>(arg));
+
+      size_t thread_id = meta->thread_id_;
+      ThreadPool* tp = meta->thread_pool_;
+
+#if ROCKSDB_USING_THREAD_STATUS
+      // for thread-status
+      ThreadStatusUtil::RegisterThread(
+          tp->env_, (tp->GetThreadPriority() == Env::Priority::HIGH
+                         ? ThreadStatus::HIGH_PRIORITY
+                         : ThreadStatus::LOW_PRIORITY));
+#endif
+      tp->BGThread(thread_id);
+#if ROCKSDB_USING_THREAD_STATUS
+      ThreadStatusUtil::UnregisterThread();
+#endif
+      return nullptr;
+    }
+
+    void WakeUpAllThreads() { bgsignal_.notify_all(); }
+
+    void SetBackgroundThreadsInternal(size_t num, bool allow_reduce) {
+      std::lock_guard<std::mutex> lg(mu_);
+
+      if (exit_all_threads_) {
+        return;
+      }
+
+      if (num > total_threads_limit_ ||
+          (num < total_threads_limit_ && allow_reduce)) {
+        total_threads_limit_ = std::max(size_t(1), num);
+        WakeUpAllThreads();
+        StartBGThreads();
+      }
+      assert(total_threads_limit_ > 0);
+    }
+
+    void IncBackgroundThreadsIfNeeded(int num) {
+      SetBackgroundThreadsInternal(num, false);
+    }
+
+    void SetBackgroundThreads(int num) {
+      SetBackgroundThreadsInternal(num, true);
+    }
+
+    void StartBGThreads() {
+      // Start background threads if necessary
+      while (bgthreads_.size() < total_threads_limit_) {
+        std::thread p_t(&ThreadPool::BGThreadWrapper,
+                        new BGThreadMetadata(this, bgthreads_.size()));
+        bgthreads_.push_back(std::move(p_t));
+      }
+    }
+
+    void Schedule(void (*function)(void* arg1), void* arg, void* tag) {
+      std::lock_guard<std::mutex> lg(mu_);
+
+      if (exit_all_threads_) {
+        return;
+      }
+
+      StartBGThreads();
+
+      // Add to priority queue
+      queue_.push_back(BGItem());
+      queue_.back().function = function;
+      queue_.back().arg = arg;
+      queue_.back().tag = tag;
+      queue_len_.store(queue_.size(), std::memory_order_relaxed);
+
+      if (!HasExcessiveThread()) {
+        // Wake up at least one waiting thread.
+        bgsignal_.notify_one();
+      } else {
+        // Need to wake up all threads to make sure the one woken
+        // up is not the one to terminate.
+        WakeUpAllThreads();
+      }
+    }
+
+    int UnSchedule(void* arg) {
+      int count = 0;
+
+      std::lock_guard<std::mutex> lg(mu_);
+
+      // Remove from priority queue
+      BGQueue::iterator it = queue_.begin();
+      while (it != queue_.end()) {
+        if (arg == (*it).tag) {
+          it = queue_.erase(it);
+          count++;
+        } else {
+          ++it;
+        }
+      }
+
+      queue_len_.store(queue_.size(), std::memory_order_relaxed);
+
+      return count;
+    }
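+
+    // Usage sketch (hypothetical caller): Schedule() stamps each item with
+    // a tag, and UnSchedule() drops every queued-but-not-yet-running item
+    // whose tag matches:
+    //
+    //   pool.Schedule(&DoWork, &ctx, /* tag */ &ctx);
+    //   int dropped = pool.UnSchedule(&ctx);  // running items are unaffected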
+
+    unsigned int GetQueueLen() const {
+      return static_cast<unsigned int>(
+          queue_len_.load(std::memory_order_relaxed));
+    }
+
+   private:
+    // Entry per Schedule() call
+    struct BGItem {
+      void* arg;
+      void (*function)(void*);
+      void* tag;
+    };
+
+    typedef std::deque<BGItem> BGQueue;
+
+    std::mutex mu_;
+    std::condition_variable bgsignal_;
+    size_t total_threads_limit_;
+    std::vector<std::thread> bgthreads_;
+    BGQueue queue_;
+    std::atomic_size_t queue_len_;  // Queue length. Used for stats reporting
+    bool exit_all_threads_;
+    bool low_io_priority_;
+    Env::Priority priority_;
+    Env* env_;
+  };
+
+  bool checkedDiskForMmap_;
+  bool forceMmapOff;  // do we override Env options?
+  size_t page_size_;
+  size_t allocation_granularity_;
+  uint64_t perf_counter_frequency_;
+  std::vector<ThreadPool> thread_pools_;
+  mutable std::mutex mu_;
+  std::vector<std::thread> threads_to_join_;
+};
+
+WinEnv::WinEnv()
+    : checkedDiskForMmap_(false),
+      forceMmapOff(false),
+      page_size_(4 * 1024),
+      allocation_granularity_(page_size_),
+      perf_counter_frequency_(0),
+      thread_pools_(Priority::TOTAL) {
+  SYSTEM_INFO sinfo;
+  GetSystemInfo(&sinfo);
+
+  page_size_ = sinfo.dwPageSize;
+  allocation_granularity_ = sinfo.dwAllocationGranularity;
+
+  {
+    LARGE_INTEGER qpf;
+    BOOL ret = QueryPerformanceFrequency(&qpf);
+    assert(ret == TRUE);
+    perf_counter_frequency_ = qpf.QuadPart;
+  }
+
+  for (int pool_id = 0; pool_id < Env::Priority::TOTAL; ++pool_id) {
+    thread_pools_[pool_id].SetThreadPriority(
+        static_cast<Env::Priority>(pool_id));
+    // This allows later initializing the thread-local-env of each thread.
+    thread_pools_[pool_id].SetHostEnv(this);
+  }
+
+  // Protected member of the base class
+  thread_status_updater_ = CreateThreadStatusUpdater();
+}
+
+void WinEnv::Schedule(void (*function)(void*), void* arg, Priority pri,
+                      void* tag) {
+  assert(pri >= Priority::LOW && pri <= Priority::HIGH);
+  thread_pools_[pri].Schedule(function, arg, tag);
+}
+
+int WinEnv::UnSchedule(void* arg, Priority pri) {
+  return thread_pools_[pri].UnSchedule(arg);
+}
+
+unsigned int WinEnv::GetThreadPoolQueueLen(Priority pri) const {
+  assert(pri >= Priority::LOW && pri <= Priority::HIGH);
+  return thread_pools_[pri].GetQueueLen();
+}
+
+namespace {
+struct StartThreadState {
+  void (*user_function)(void*);
+  void* arg;
+};
+}
+
+static void* StartThreadWrapper(void* arg) {
+  std::unique_ptr<StartThreadState> state(
+      reinterpret_cast<StartThreadState*>(arg));
+  state->user_function(state->arg);
+  return nullptr;
+}
+
+void WinEnv::StartThread(void (*function)(void* arg), void* arg) {
+  StartThreadState* state = new StartThreadState;
+  state->user_function = function;
+  state->arg = arg;
+  try {
+    std::thread th(&StartThreadWrapper, state);
+
+    std::lock_guard<std::mutex> lg(mu_);
+    threads_to_join_.push_back(std::move(th));
+
+  } catch (const std::system_error& ex) {
+    WinthreadCall("start thread", ex.code());
+  }
+}
+
+void WinEnv::WaitForJoin() {
+  for (auto& th : threads_to_join_) {
+    th.join();
+  }
+
+  threads_to_join_.clear();
+}
+
+}  // namespace
+
+std::string Env::GenerateUniqueId() {
+  std::string result;
+
+  UUID uuid;
+  UuidCreateSequential(&uuid);
+
+  RPC_CSTR rpc_str;
+  auto status = UuidToStringA(&uuid, &rpc_str);
+  assert(status == RPC_S_OK);
+
+  result = reinterpret_cast<char*>(rpc_str);
+
+  status = RpcStringFreeA(&rpc_str);
+  assert(status == RPC_S_OK);
+
+  return result;
+}
+
+// We choose to create this on the heap and use std::once for the following
+// reasons:
+// 1) The currently available MS compiler does not implement atomic C++11
+//    initialization of function-local statics.
+// 2) We choose not to destroy the env, because joining the threads from the
+//    system loader, which destroys the statics (same as from DllMain),
+//    creates a system-loader deadlock; in this manner any remaining threads
+//    are terminated OK.
+namespace {
+std::once_flag winenv_once_flag;
+Env* envptr;
+}  // namespace
+
+Env* Env::Default() {
+  std::call_once(winenv_once_flag, []() { envptr = new WinEnv(); });
+  return envptr;
+}
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/port/win/port_win.cc b/src/rocksdb/port/win/port_win.cc
new file mode 100644
index 0000000..2aaeada
--- /dev/null
+++ b/src/rocksdb/port/win/port_win.cc
@@ -0,0 +1,315 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#if !defined(OS_WIN) && !defined(WIN32) && !defined(_WIN32)
+#error Windows Specific Code
+#endif
+
+#include "port/win/port_win.h"
+
+#include <io.h>
+#include "port/dirent.h"
+#include "port/sys_time.h"
+
+#include <cstdlib>
+#include <stdio.h>
+#include <assert.h>
+#include <string.h>
+
+#include <memory>
+#include <exception>
+#include <chrono>
+
+#include "util/logging.h"
+
+namespace rocksdb {
+namespace port {
+
+void gettimeofday(struct timeval* tv, struct timezone* /* tz */) {
+  using namespace std::chrono;
+
+  microseconds usNow(
+      duration_cast<microseconds>(system_clock::now().time_since_epoch()));
+
+  seconds secNow(duration_cast<seconds>(usNow));
+
+  tv->tv_sec = secNow.count();
+  tv->tv_usec = usNow.count() - duration_cast<microseconds>(secNow).count();
+}
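+
+// For illustration: an epoch offset of 1446144000.500000 s splits into
+// tv_sec == 1446144000 and tv_usec == 500000; tv_usec is always the
+// sub-second remainder in [0, 1000000).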
+
+Mutex::Mutex(bool adaptive) : lock(m_mutex, std::defer_lock) {}
+
+Mutex::~Mutex() {}
+
+void Mutex::Lock() {
+  lock.lock();
+#ifndef NDEBUG
+  locked_ = true;
+#endif
+}
+
+void Mutex::Unlock() {
+#ifndef NDEBUG
+  locked_ = false;
+#endif
+  lock.unlock();
+}
+
+void Mutex::AssertHeld() {
+#ifndef NDEBUG
+  assert(locked_);
+#endif
+}
+
+CondVar::CondVar(Mutex* mu) : mu_(mu) {}
+
+CondVar::~CondVar() {}
+
+void CondVar::Wait() {
+#ifndef NDEBUG
+  mu_->locked_ = false;
+#endif
+  cv_.wait(mu_->getLock());
+#ifndef NDEBUG
+  mu_->locked_ = true;
+#endif
+}
+
+bool CondVar::TimedWait(uint64_t abs_time_us) {
+#ifndef NDEBUG
+  mu_->locked_ = false;
+#endif
+
+  using namespace std::chrono;
+
+  // The MSVC++ library implements wait_until in terms of wait_for, so
+  // there is no absolute wait anyway.
+  microseconds usAbsTime(abs_time_us);
+
+  microseconds usNow(
+      duration_cast<microseconds>(system_clock::now().time_since_epoch()));
+  microseconds relTimeUs =
+      (usAbsTime > usNow) ? (usAbsTime - usNow) : microseconds::zero();
+
+  std::cv_status cvStatus = cv_.wait_for(mu_->getLock(), relTimeUs);
+
+#ifndef NDEBUG
+  mu_->locked_ = true;
+#endif
+
+  if (cvStatus == std::cv_status::timeout) {
+    return true;
+  }
+
+  return false;
+}
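+
+// Usage sketch for the conversion above (hypothetical caller): the deadline
+// is passed as absolute microseconds since the epoch, e.g.
+//
+//   uint64_t abs_us = Env::Default()->NowMicros() + 100000;  // 100 ms out
+//   bool timed_out = cv.TimedWait(abs_us);  // true on timeout
+//
+// and TimedWait() re-derives the relative wait internally.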
+
+void CondVar::Signal() { cv_.notify_one(); }
+
+void CondVar::SignalAll() { cv_.notify_all(); }
+
+void InitOnce(OnceType* once, void (*initializer)()) {
+  std::call_once(*once, initializer);
+}
+
+// Private structure, exposed only by pointer
+struct DIR {
+  intptr_t handle_;
+  bool firstread_;
+  struct __finddata64_t data_;
+  dirent entry_;
+
+  DIR() : handle_(-1), firstread_(true) {}
+
+  DIR(const DIR&) = delete;
+  DIR& operator=(const DIR&) = delete;
+
+  ~DIR() {
+    if (-1 != handle_) {
+      _findclose(handle_);
+    }
+  }
+};
+
+DIR* opendir(const char* name) {
+  if (!name || *name == 0) {
+    errno = ENOENT;
+    return nullptr;
+  }
+
+  std::string pattern(name);
+  pattern.append("\\").append("*");
+
+  std::unique_ptr<DIR> dir(new DIR);
+
+  dir->handle_ = _findfirst64(pattern.c_str(), &dir->data_);
+
+  if (dir->handle_ == -1) {
+    return nullptr;
+  }
+
+  strncpy_s(dir->entry_.d_name, dir->data_.name, strlen(dir->data_.name));
+
+  return dir.release();
+}
+
+struct dirent* readdir(DIR* dirp) {
+  if (!dirp || dirp->handle_ == -1) {
+    errno = EBADF;
+    return nullptr;
+  }
+
+  if (dirp->firstread_) {
+    dirp->firstread_ = false;
+    return &dirp->entry_;
+  }
+
+  auto ret = _findnext64(dirp->handle_, &dirp->data_);
+
+  if (ret != 0) {
+    return nullptr;
+  }
+
+  strncpy_s(dirp->entry_.d_name, dirp->data_.name, strlen(dirp->data_.name));
+
+  return &dirp->entry_;
+}
+
+int closedir(DIR* dirp) {
+  delete dirp;
+  return 0;
+}
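+
+// Usage sketch mirroring WinEnv::GetChildren() (hypothetical caller):
+//
+//   DIR* d = opendir("c:\\db");
+//   if (d != nullptr) {
+//     for (dirent* e = readdir(d); e != nullptr; e = readdir(d)) {
+//       // e->d_name holds the entry name, including "." and ".."
+//     }
+//     closedir(d);
+//   }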
+
+int truncate(const char* path, int64_t len) {
+  if (path == nullptr) {
+    errno = EFAULT;
+    return -1;
+  }
+
+  if (len < 0) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  HANDLE hFile =
+      CreateFile(path, GENERIC_READ | GENERIC_WRITE,
+                 FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
+                 NULL,           // Security attrs
+                 OPEN_EXISTING,  // Truncate existing file only
+                 FILE_ATTRIBUTE_NORMAL, NULL);
+
+  if (INVALID_HANDLE_VALUE == hFile) {
+    auto lastError = GetLastError();
+    if (lastError == ERROR_FILE_NOT_FOUND) {
+      errno = ENOENT;
+    } else if (lastError == ERROR_ACCESS_DENIED) {
+      errno = EACCES;
+    } else {
+      errno = EIO;
+    }
+    return -1;
+  }
+
+  int result = 0;
+  FILE_END_OF_FILE_INFO end_of_file;
+  end_of_file.EndOfFile.QuadPart = len;
+
+  if (!SetFileInformationByHandle(hFile, FileEndOfFileInfo, &end_of_file,
+                                  sizeof(FILE_END_OF_FILE_INFO))) {
+    errno = EIO;
+    result = -1;
+  }
+
+  CloseHandle(hFile);
+  return result;
+}
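+
+// Usage sketch (hypothetical path): shrink or extend a file in place,
+// remembering that extension here does not guarantee zero-filled space:
+//
+//   if (rocksdb::port::truncate("c:\\db\\000001.log", 4096) != 0) {
+//     // errno was mapped to ENOENT, EACCES, EINVAL or EIO above
+//   }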
+
+void Crash(const std::string& srcfile, int srcline) {
+  fprintf(stdout, "Crashing at %s:%d\n", srcfile.c_str(), srcline);
+  fflush(stdout);
+  abort();
+}
+
+int GetMaxOpenFiles() { return -1; }
+
+}  // namespace port
+}  // namespace rocksdb
+
+#ifdef JEMALLOC
+
+#include "jemalloc/jemalloc.h"
+
+namespace rocksdb {
+
+namespace port {
+
+__declspec(noinline) void WINAPI InitializeJemalloc() {
+  je_init();
+  atexit(je_uninit);
+}
+
+}  // port
+}  // rocksdb
+
+extern "C" {
+
+#ifdef _WIN64
+
+#pragma comment(linker, "/INCLUDE:p_rocksdb_init_jemalloc")
+
+typedef void(WINAPI* CRT_Startup_Routine)(void);
+
+// .CRT section is merged with .rdata on x64 so it must be constant data.
+// must be of external linkage
+// We put this into XCT since we want to run this earlier than C++ static
+// constructors
+// which are placed into XCU
+#pragma const_seg(".CRT$XCT")
+extern const CRT_Startup_Routine p_rocksdb_init_jemalloc;
+const CRT_Startup_Routine p_rocksdb_init_jemalloc =
+    rocksdb::port::InitializeJemalloc;
+#pragma const_seg()
+
+#else  // _WIN64
+
+// x86 untested
+
+#pragma comment(linker, "/INCLUDE:_p_rocksdb_init_jemalloc")
+
+#pragma section(".CRT$XCT", read)
+JEMALLOC_SECTION(".CRT$XCT") JEMALLOC_ATTR(used) static const void(
+    WINAPI* p_rocksdb_init_jemalloc)(void) = rocksdb::port::InitializeJemalloc;
+
+#endif  // _WIN64
+
+}  // extern "C"
+
+// Global operators to be replaced by a linker
+
+void* operator new(size_t size) {
+  void* p = je_malloc(size);
+  if (!p) {
+    throw std::bad_alloc();
+  }
+  return p;
+}
+
+void* operator new[](size_t size) {
+  void* p = je_malloc(size);
+  if (!p) {
+    throw std::bad_alloc();
+  }
+  return p;
+}
+
+void operator delete(void* p) { je_free(p); }
+
+void operator delete[](void* p) { je_free(p); }
+
+#endif  // JEMALLOC
diff --git a/src/rocksdb/port/win/port_win.h b/src/rocksdb/port/win/port_win.h
new file mode 100644
index 0000000..1f517fb
--- /dev/null
+++ b/src/rocksdb/port/win/port_win.h
@@ -0,0 +1,250 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// See port_example.h for documentation for the following types/functions.
+
+#ifndef STORAGE_LEVELDB_PORT_PORT_WIN_H_
+#define STORAGE_LEVELDB_PORT_PORT_WIN_H_
+
+// Always want minimum headers
+#ifndef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#endif
+
+// Assume little-endian everywhere
+#undef PLATFORM_IS_LITTLE_ENDIAN
+#define PLATFORM_IS_LITTLE_ENDIAN true
+
+#include <windows.h>
+#include <string>
+#include <string.h>
+#include <mutex>
+#include <condition_variable>
+
+#include <stdint.h>
+
+#include "rocksdb/options.h"
+
+#undef min
+#undef max
+#undef DeleteFile
+#undef GetCurrentTime
+
+
+#ifndef strcasecmp
+#define strcasecmp _stricmp
+#endif
+
+// defined in stdio.h
+#ifndef snprintf
+#define snprintf _snprintf
+#endif
+
+#undef GetCurrentTime
+#undef DeleteFile
+
+typedef SSIZE_T ssize_t;
+
+// size_t printf formatting, named in the manner of C99 standard formatting
+// strings such as PRIu64; in fact, we could use that one
+#ifndef ROCKSDB_PRIszt
+#define ROCKSDB_PRIszt "Iu"
+#endif
+
+#define ROCKSDB_NOEXCEPT
+
+#define __attribute__(A)
+
+#ifdef ZLIB
+#include <zlib.h>
+#endif
+
+#ifdef BZIP2
+#include <bzlib.h>
+#endif
+
+#if defined(LZ4)
+#include <lz4.h>
+#include <lz4hc.h>
+#endif
+
+#ifdef SNAPPY
+#include <snappy.h>
+#endif
+
+// Thread-local storage: map GCC-style __thread to the MSVC equivalent
+// (C++11 also has thread_local)
+#ifndef __thread
+#define __thread __declspec(thread)
+#endif
+
+#ifndef PLATFORM_IS_LITTLE_ENDIAN
+#define PLATFORM_IS_LITTLE_ENDIAN (__BYTE_ORDER == __LITTLE_ENDIAN)
+#endif
+
+namespace rocksdb {
+
+#define PREFETCH(addr, rw, locality)
+std::string GetWindowsErrSz(DWORD err);
+
+namespace port {
+
+// For use at db/file_indexer.h kLevelMaxIndex
+const int kMaxInt32 = INT32_MAX;
+const uint64_t kMaxUint64 = UINT64_MAX;
+// std::numeric_limits<size_t>::max() is not constexpr just yet
+// therefore, use the same limits
+#ifdef _WIN64
+const size_t kMaxSizet = UINT64_MAX;
+#else
+const size_t kMaxSizet = UINT_MAX;
+#endif
+
+const bool kLittleEndian = true;
+
+class CondVar;
+
+class Mutex {
+ public:
+  /* implicit */ Mutex(bool adaptive = false);
+  ~Mutex();
+
+  void Lock();
+  void Unlock();
+
+  // this will assert if the mutex is not locked
+  // it does NOT verify that the mutex is held by the calling thread
+  void AssertHeld();
+
+  std::unique_lock<std::mutex>& getLock() { return lock; }
+
+ private:
+  friend class CondVar;
+  std::mutex m_mutex;
+  std::unique_lock<std::mutex> lock;
+#ifndef NDEBUG
+  bool locked_;
+#endif
+
+  // No copying
+  Mutex(const Mutex&);
+  void operator=(const Mutex&);
+};
+
+class RWMutex {
+ public:
+  RWMutex() { InitializeSRWLock(&srwLock_); }
+
+  void ReadLock() { AcquireSRWLockShared(&srwLock_); }
+
+  void WriteLock() { AcquireSRWLockExclusive(&srwLock_); }
+
+  void ReadUnlock() { ReleaseSRWLockShared(&srwLock_); }
+
+  void WriteUnlock() { ReleaseSRWLockExclusive(&srwLock_); }
+
+  // Empty as in POSIX
+  void AssertHeld() {}
+
+ private:
+  SRWLOCK srwLock_;
+  // No copying allowed
+  RWMutex(const RWMutex&);
+  void operator=(const RWMutex&);
+};
+
+class CondVar {
+ public:
+  explicit CondVar(Mutex* mu);
+  ~CondVar();
+  void Wait();
+  bool TimedWait(uint64_t expiration_time);
+  void Signal();
+  void SignalAll();
+
+ private:
+  std::condition_variable cv_;
+  Mutex* mu_;
+};
+
+typedef std::once_flag OnceType;
+#define LEVELDB_ONCE_INIT std::once_flag::once_flag();
+extern void InitOnce(OnceType* once, void (*initializer)());
+
+#define CACHE_LINE_SIZE 64U
+
+#ifdef min
+#undef min
+#endif
+#ifdef max
+#undef max
+#endif
+
+// For Thread Local Storage abstraction
+typedef DWORD pthread_key_t;
+
+inline int pthread_key_create(pthread_key_t* key, void (*destructor)(void*)) {
+  // Not used
+  (void)destructor;
+
+  pthread_key_t k = TlsAlloc();
+  if (TLS_OUT_OF_INDEXES == k) {
+    return ENOMEM;
+  }
+
+  *key = k;
+  return 0;
+}
+
+inline int pthread_key_delete(pthread_key_t key) {
+  if (!TlsFree(key)) {
+    return EINVAL;
+  }
+  return 0;
+}
+
+inline int pthread_setspecific(pthread_key_t key, const void* value) {
+  if (!TlsSetValue(key, const_cast<void*>(value))) {
+    return ENOMEM;
+  }
+  return 0;
+}
+
+inline void* pthread_getspecific(pthread_key_t key) {
+  void* result = TlsGetValue(key);
+  if (!result) {
+    if (GetLastError() != ERROR_SUCCESS) {
+      errno = EINVAL;
+    } else {
+      errno = NOERROR;
+    }
+  }
+  return result;
+}
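+
+// Usage sketch of the TLS shim above (hypothetical key): the surface mirrors
+// POSIX closely enough for the call sites in this port,
+//
+//   pthread_key_t key;
+//   if (pthread_key_create(&key, nullptr) == 0) {
+//     pthread_setspecific(key, &some_value);
+//     void* v = pthread_getspecific(key);  // &some_value on this thread
+//     pthread_key_delete(key);
+//   }
+//
+// with the caveat that TlsAlloc()-based keys never run the destructor.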
+
+// UNIX equivalent, although the errno numbers will be off;
+// implemented using the C runtime. Note, this does not
+// fill the space with zeros in case the file is extended.
+int truncate(const char* path, int64_t length);
+void Crash(const std::string& srcfile, int srcline);
+extern int GetMaxOpenFiles();
+
+}  // namespace port
+
+using port::pthread_key_t;
+using port::pthread_key_create;
+using port::pthread_key_delete;
+using port::pthread_setspecific;
+using port::pthread_getspecific;
+using port::truncate;
+
+}  // namespace rocksdb
+
+#endif  // STORAGE_LEVELDB_PORT_PORT_WIN_H_
diff --git a/src/rocksdb/port/win/stdint.h b/src/rocksdb/port/win/stdint.h
deleted file mode 100644
index 39edd0d..0000000
--- a/src/rocksdb/port/win/stdint.h
+++ /dev/null
@@ -1,24 +0,0 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-
-// MSVC didn't ship with this file until the 2010 version.
-
-#ifndef STORAGE_LEVELDB_PORT_WIN_STDINT_H_
-#define STORAGE_LEVELDB_PORT_WIN_STDINT_H_
-
-#if !defined(_MSC_VER)
-#error This file should only be included when compiling with MSVC.
-#endif
-
-// Define C99 equivalent types.
-typedef signed char           int8_t;
-typedef signed short          int16_t;
-typedef signed int            int32_t;
-typedef signed long long      int64_t;
-typedef unsigned char         uint8_t;
-typedef unsigned short        uint16_t;
-typedef unsigned int          uint32_t;
-typedef unsigned long long    uint64_t;
-
-#endif  // STORAGE_LEVELDB_PORT_WIN_STDINT_H_
diff --git a/src/rocksdb/port/win/win_logger.cc b/src/rocksdb/port/win/win_logger.cc
new file mode 100644
index 0000000..e91930d
--- /dev/null
+++ b/src/rocksdb/port/win/win_logger.cc
@@ -0,0 +1,154 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Logger implementation that can be shared by all environments
+// where enough posix functionality is available.
+
+#include <stdint.h>
+#include <algorithm>
+#include <stdio.h>
+#include <time.h>
+#include <fcntl.h>
+#include <atomic>
+
+#include "rocksdb/env.h"
+
+#include <Windows.h>
+
+#include "port/win/win_logger.h"
+#include "port/sys_time.h"
+#include "util/iostats_context_imp.h"
+
+namespace rocksdb {
+
+WinLogger::WinLogger(uint64_t (*gettid)(), Env* env, HANDLE file,
+                     const InfoLogLevel log_level)
+    : Logger(log_level),
+      gettid_(gettid),
+      log_size_(0),
+      last_flush_micros_(0),
+      env_(env),
+      flush_pending_(false),
+      file_(file) {}
+
+void WinLogger::DebugWriter(const char* str, int len) {
+  DWORD bytesWritten = 0;
+  BOOL ret = WriteFile(file_, str, len, &bytesWritten, NULL);
+  if (ret == FALSE) {
+    std::string errSz = GetWindowsErrSz(GetLastError());
+    fprintf(stderr, "%s", errSz.c_str());
+  }
+}
+
+WinLogger::~WinLogger() { close(); }
+
+void WinLogger::close() { CloseHandle(file_); }
+
+void WinLogger::Flush() {
+  if (flush_pending_) {
+    flush_pending_ = false;
+    // With the Windows API, writes go to OS buffers directly, so no fflush
+    // is needed, unlike with the C runtime API. We don't flush all the way
+    // to disk for perf reasons.
+  }
+
+  last_flush_micros_ = env_->NowMicros();
+}
+
+void WinLogger::Logv(const char* format, va_list ap) {
+  IOSTATS_TIMER_GUARD(logger_nanos);
+
+  const uint64_t thread_id = (*gettid_)();
+
+  // We try twice: the first time with a fixed-size stack allocated buffer,
+  // and the second time with a much larger dynamically allocated buffer.
+  char buffer[500];
+  std::unique_ptr<char[]> largeBuffer;
+  for (int iter = 0; iter < 2; ++iter) {
+    char* base;
+    int bufsize;
+    if (iter == 0) {
+      bufsize = sizeof(buffer);
+      base = buffer;
+    } else {
+      bufsize = 30000;
+      largeBuffer.reset(new char[bufsize]);
+      base = largeBuffer.get();
+    }
+
+    char* p = base;
+    char* limit = base + bufsize;
+
+    struct timeval now_tv;
+    gettimeofday(&now_tv, nullptr);
+    const time_t seconds = now_tv.tv_sec;
+    struct tm t;
+    localtime_s(&t, &seconds);
+    p += snprintf(p, limit - p, "%04d/%02d/%02d-%02d:%02d:%02d.%06d %llx ",
+                  t.tm_year + 1900, t.tm_mon + 1, t.tm_mday, t.tm_hour,
+                  t.tm_min, t.tm_sec, static_cast<int>(now_tv.tv_usec),
+                  static_cast<long long unsigned int>(thread_id));
+
+    // Print the message
+    if (p < limit) {
+      va_list backup_ap;
+      va_copy(backup_ap, ap);
+      int done = vsnprintf(p, limit - p, format, backup_ap);
+      // End the copied list before any early exit from the loop body
+      va_end(backup_ap);
+      if (done > 0) {
+        p += done;
+      } else {
+        continue;  // formatting error
+      }
+    }
+
+    // Truncate to available space if necessary
+    if (p >= limit) {
+      if (iter == 0) {
+        continue;  // Try again with larger buffer
+      } else {
+        p = limit - 1;
+      }
+    }
+
+    // Add newline if necessary
+    if (p == base || p[-1] != '\n') {
+      *p++ = '\n';
+    }
+
+    assert(p <= limit);
+    const size_t write_size = p - base;
+
+    DWORD bytesWritten = 0;
+    BOOL ret = WriteFile(file_, base, write_size, &bytesWritten, NULL);
+    if (ret == FALSE) {
+      std::string errSz = GetWindowsErrSz(GetLastError());
+      fprintf(stderr, "%s", errSz.c_str());
+    }
+
+    flush_pending_ = true;
+    assert(bytesWritten == write_size);
+    if (bytesWritten > 0) {
+      log_size_ += write_size;
+    }
+
+    uint64_t now_micros =
+        static_cast<uint64_t>(now_tv.tv_sec) * 1000000 + now_tv.tv_usec;
+    if (now_micros - last_flush_micros_ >= flush_every_seconds_ * 1000000) {
+      flush_pending_ = false;
+      // With the Windows API, writes go to OS buffers directly, so no
+      // fflush is needed, unlike with the C runtime API. We don't flush all
+      // the way to disk for perf reasons.
+      last_flush_micros_ = now_micros;
+    }
+    break;
+  }
+}
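+
+// For illustration: a 600-byte message overflows the 500-byte stack buffer,
+// so the first pass hits p >= limit and retries; the second pass formats
+// into the 30000-byte heap buffer, and anything longer than that is
+// truncated to limit - 1 before the trailing newline is appended.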
+
+size_t WinLogger::GetLogFileSize() const { return log_size_; }
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/port/win/win_logger.h b/src/rocksdb/port/win/win_logger.h
new file mode 100644
index 0000000..67e4590
--- /dev/null
+++ b/src/rocksdb/port/win/win_logger.h
@@ -0,0 +1,57 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Logger implementation that can be shared by all environments
+// where enough posix functionality is available.
+
+#pragma once
+
+#include <atomic>
+
+#include "rocksdb/env.h"
+
+namespace rocksdb {
+
+class Env;
+
+const int kDebugLogChunkSize = 128 * 1024;
+
+class WinLogger : public rocksdb::Logger {
+ public:
+  WinLogger(uint64_t (*gettid)(), Env* env, HANDLE file,
+            const InfoLogLevel log_level = InfoLogLevel::ERROR_LEVEL);
+
+  virtual ~WinLogger();
+
+  WinLogger(const WinLogger&) = delete;
+
+  WinLogger& operator=(const WinLogger&) = delete;
+
+  void close();
+
+  void Flush() override;
+
+  void Logv(const char* format, va_list ap) override;
+
+  size_t GetLogFileSize() const override;
+
+  void DebugWriter(const char* str, int len);
+
+ private:
+  HANDLE file_;
+  uint64_t (*gettid_)();  // Return the thread id for the current thread
+  std::atomic_size_t log_size_;
+  std::atomic_uint_fast64_t last_flush_micros_;
+  Env* env_;
+  bool flush_pending_;
+
+  const static uint64_t flush_every_seconds_ = 5;
+};
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/src.mk b/src/rocksdb/src.mk
new file mode 100644
index 0000000..5e9ed16
--- /dev/null
+++ b/src/rocksdb/src.mk
@@ -0,0 +1,312 @@
+# These are the sources from which librocksdb.a is built:
+LIB_SOURCES =                                                   \
+  db/builder.cc                                                 \
+  db/c.cc                                                       \
+  db/column_family.cc                                           \
+  db/compacted_db_impl.cc                                       \
+  db/compaction.cc                                              \
+  db/compaction_iterator.cc                                     \
+  db/compaction_job.cc                                          \
+  db/compaction_picker.cc                                       \
+  db/convenience.cc                                             \
+  db/db_filesnapshot.cc                                         \
+  db/dbformat.cc                                                \
+  db/db_impl.cc                                                 \
+  db/db_impl_debug.cc                                           \
+  db/db_impl_readonly.cc                                        \
+  db/db_impl_experimental.cc                                    \
+  db/db_iter.cc                                                 \
+  db/experimental.cc                                            \
+  db/event_helpers.cc                                           \
+  db/file_indexer.cc                                            \
+  db/filename.cc                                                \
+  db/flush_job.cc                                               \
+  db/flush_scheduler.cc                                         \
+  db/forward_iterator.cc                                        \
+  db/internal_stats.cc                                          \
+  db/log_reader.cc                                              \
+  db/log_writer.cc                                              \
+  db/managed_iterator.cc                                        \
+  db/memtable_allocator.cc                                      \
+  db/memtable.cc                                                \
+  db/memtable_list.cc                                           \
+  db/merge_helper.cc                                            \
+  db/merge_operator.cc                                          \
+  db/repair.cc                                                  \
+  db/slice.cc                                                   \
+  db/snapshot_impl.cc                                           \
+  db/table_cache.cc                                             \
+  db/table_properties_collector.cc                              \
+  db/transaction_log_impl.cc                                    \
+  db/version_builder.cc                                         \
+  db/version_edit.cc                                            \
+  db/version_set.cc                                             \
+  db/wal_manager.cc                                             \
+  db/write_batch.cc                                             \
+  db/write_batch_base.cc                                        \
+  db/write_controller.cc                                        \
+  db/write_thread.cc                                            \
+  port/stack_trace.cc                                           \
+  port/port_posix.cc                                            \
+  table/adaptive_table_factory.cc                               \
+  table/block_based_filter_block.cc                             \
+  table/block_based_table_builder.cc                            \
+  table/block_based_table_factory.cc                            \
+  table/block_based_table_reader.cc                             \
+  table/block_builder.cc                                        \
+  table/block.cc                                                \
+  table/block_hash_index.cc                                     \
+  table/block_prefix_index.cc                                   \
+  table/bloom_block.cc                                          \
+  table/cuckoo_table_builder.cc                                 \
+  table/cuckoo_table_factory.cc                                 \
+  table/cuckoo_table_reader.cc                                  \
+  table/flush_block_policy.cc                                   \
+  table/format.cc                                               \
+  table/full_filter_block.cc                                    \
+  table/get_context.cc                                          \
+  table/iterator.cc                                             \
+  table/merger.cc                                               \
+  table/meta_blocks.cc                                          \
+  table/sst_file_writer.cc                                      \
+  table/plain_table_builder.cc                                  \
+  table/plain_table_factory.cc                                  \
+  table/plain_table_index.cc                                    \
+  table/plain_table_key_coding.cc                               \
+  table/plain_table_reader.cc                                   \
+  table/table_properties.cc                                     \
+  table/two_level_iterator.cc                                   \
+  tools/dump/db_dump_tool.cc                                    \
+  util/arena.cc                                                 \
+  util/auto_roll_logger.cc                                      \
+  util/bloom.cc                                                 \
+  util/build_version.cc                                         \
+  util/cache.cc                                                 \
+  util/coding.cc                                                \
+  util/comparator.cc                                            \
+  util/compaction_job_stats_impl.cc                             \
+  util/crc32c.cc                                                \
+  util/db_info_dumper.cc                                        \
+  util/delete_scheduler_impl.cc                                 \
+  util/dynamic_bloom.cc                                         \
+  util/env.cc                                                   \
+  util/env_hdfs.cc                                              \
+  util/env_posix.cc                                             \
+  util/file_util.cc                                             \
+  util/file_reader_writer.cc                                    \
+  util/filter_policy.cc                                         \
+  util/hash.cc                                                  \
+  util/hash_cuckoo_rep.cc                                       \
+  util/hash_linklist_rep.cc                                     \
+  util/hash_skiplist_rep.cc                                     \
+  util/histogram.cc                                             \
+  util/instrumented_mutex.cc                                    \
+  util/iostats_context.cc                                       \
+  utilities/backupable/backupable_db.cc                         \
+  utilities/convenience/info_log_finder.cc                      \
+  utilities/checkpoint/checkpoint.cc                            \
+  utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc    \
+  utilities/document/document_db.cc                             \
+  utilities/document/json_document_builder.cc                   \
+  utilities/document/json_document.cc                           \
+  utilities/flashcache/flashcache.cc                            \
+  utilities/geodb/geodb_impl.cc                                 \
+  utilities/leveldb_options/leveldb_options.cc                  \
+  utilities/merge_operators/put.cc                              \
+  utilities/merge_operators/string_append/stringappend2.cc      \
+  utilities/merge_operators/string_append/stringappend.cc       \
+  utilities/merge_operators/uint64add.cc                        \
+  utilities/redis/redis_lists.cc                                \
+  utilities/spatialdb/spatial_db.cc                             \
+  utilities/table_properties_collectors/compact_on_deletion_collector.cc \
+  utilities/transactions/optimistic_transaction_impl.cc         \
+  utilities/transactions/optimistic_transaction_db_impl.cc      \
+  utilities/transactions/transaction_base.cc                    \
+  utilities/transactions/transaction_db_impl.cc                 \
+  utilities/transactions/transaction_db_mutex_impl.cc           \
+  utilities/transactions/transaction_lock_mgr.cc                \
+  utilities/transactions/transaction_impl.cc                    \
+  utilities/transactions/transaction_util.cc                    \
+  utilities/ttl/db_ttl_impl.cc                                  \
+  utilities/write_batch_with_index/write_batch_with_index.cc    \
+  utilities/write_batch_with_index/write_batch_with_index_internal.cc    \
+  util/event_logger.cc                                          \
+  util/log_buffer.cc                                            \
+  util/logging.cc                                               \
+  util/memenv.cc                                                \
+  util/murmurhash.cc                                            \
+  util/mutable_cf_options.cc                                    \
+  util/options_builder.cc                                       \
+  util/options.cc                                               \
+  util/options_helper.cc                                        \
+  util/options_parser.cc                                        \
+  util/perf_context.cc                                          \
+  util/perf_level.cc                                            \
+  util/rate_limiter.cc                                          \
+  util/skiplistrep.cc                                           \
+  util/slice.cc                                                 \
+  util/statistics.cc                                            \
+  util/status.cc                                                \
+  util/status_message.cc                                        \
+  util/string_util.cc                                           \
+  util/sync_point.cc                                            \
+  util/thread_local.cc                                          \
+  util/thread_status_impl.cc                                    \
+  util/thread_status_updater.cc                                 \
+  util/thread_status_updater_debug.cc                           \
+  util/thread_status_util.cc                                    \
+  util/thread_status_util_debug.cc                              \
+  util/vectorrep.cc                                             \
+  util/xfunc.cc                                                 \
+  util/xxhash.cc                                                \
+
+TOOL_SOURCES = \
+  util/ldb_cmd.cc                                               \
+  util/ldb_tool.cc                                              \
+  util/sst_dump_tool.cc                                         \
+
+MOCK_SOURCES = \
+  table/mock_table.cc \
+  util/mock_env.cc
+
+TEST_BENCH_SOURCES =                                                    \
+  third-party/gtest-1.7.0/fused-src/gtest/gtest-all.cc                  \
+  db/column_family_test.cc                                              \
+  db/compaction_job_test.cc                                             \
+  db/compaction_job_stats_test.cc                                       \
+  db/compaction_picker_test.cc                                          \
+  db/comparator_db_test.cc                                              \
+  db/corruption_test.cc                                                 \
+  db/cuckoo_table_db_test.cc                                            \
+  db/db_bench.cc                                                        \
+  db/dbformat_test.cc                                                   \
+  db/db_iter_test.cc                                                    \
+  db/db_test.cc                                                         \
+  db/db_compaction_filter_test.cc                                       \
+  db/db_compaction_test.cc                                              \
+  db/db_dynamic_level_test.cc                                           \
+  db/db_inplace_update_test.cc                                          \
+  db/db_log_iter_test.cc                                                \
+  db/db_universal_compaction_test.cc                                    \
+  db/db_tailing_iter_test.cc                                            \
+  db/db_wal_test.cc                                                     \
+  db/deletefile_test.cc                                                 \
+  db/fault_injection_test.cc                                            \
+  db/file_indexer_test.cc                                               \
+  db/filename_test.cc                                                   \
+  db/flush_job_test.cc                                                  \
+  db/listener_test.cc                                                   \
+  db/log_and_apply_bench.cc                                             \
+  db/log_test.cc                                                        \
+  db/memtablerep_bench.cc                                               \
+  db/merge_test.cc                                                      \
+  db/perf_context_test.cc                                               \
+  db/plain_table_db_test.cc                                             \
+  db/prefix_test.cc                                                     \
+  db/skiplist_test.cc                                                   \
+  db/table_properties_collector_test.cc                                 \
+  db/version_builder_test.cc                                            \
+  db/version_edit_test.cc                                               \
+  db/version_set_test.cc                                                \
+  db/wal_manager_test.cc                                                \
+  db/write_batch_test.cc                                                \
+  db/write_controller_test.cc                                           \
+  db/write_callback_test.cc                                             \
+  table/block_based_filter_block_test.cc                                \
+  table/block_hash_index_test.cc                                        \
+  table/block_test.cc                                                   \
+  table/cuckoo_table_builder_test.cc                                    \
+  table/cuckoo_table_reader_test.cc                                     \
+  table/full_filter_block_test.cc                                       \
+  table/merger_test.cc                                                  \
+  table/table_reader_bench.cc                                           \
+  table/table_test.cc                                                   \
+  tools/db_sanity_test.cc                                               \
+  tools/reduce_levels_test.cc                                           \
+  util/arena_test.cc                                                    \
+  util/auto_roll_logger_test.cc                                         \
+  util/autovector_test.cc                                               \
+  util/benchharness.cc                                                  \
+  util/benchharness_test.cc                                             \
+  util/bloom_test.cc                                                    \
+  util/cache_bench.cc                                                   \
+  util/cache_test.cc                                                    \
+  util/coding_test.cc                                                   \
+  util/crc32c_test.cc                                                   \
+  util/db_test_util.cc                                                  \
+  util/dynamic_bloom_test.cc                                            \
+  util/env_test.cc                                                      \
+  util/filelock_test.cc                                                 \
+  util/histogram_test.cc                                                \
+  utilities/backupable/backupable_db_test.cc                            \
+  utilities/checkpoint/checkpoint_test.cc                               \
+  utilities/document/document_db_test.cc                                \
+  utilities/document/json_document_test.cc                              \
+  utilities/geodb/geodb_test.cc                                         \
+  utilities/merge_operators/string_append/stringappend_test.cc          \
+  utilities/redis/redis_lists_test.cc                                   \
+  utilities/spatialdb/spatial_db_test.cc                                \
+  utilities/table_properties_collectors/compact_on_deletion_collector_test.cc  \
+  utilities/transactions/optimistic_transaction_test.cc                 \
+  utilities/transactions/transaction_test.cc                            \
+  utilities/ttl/ttl_test.cc                                             \
+  utilities/write_batch_with_index/write_batch_with_index_test.cc       \
+  util/log_write_bench.cc                                               \
+  util/manual_compaction_test.cc                                        \
+  util/memenv_test.cc                                                   \
+  util/mock_env_test.cc                                                 \
+  util/options_test.cc                                                  \
+  util/event_logger_test.cc                                             \
+  util/rate_limiter_test.cc                                             \
+  util/slice_transform_test.cc                                          \
+  util/sst_dump_test.cc                                                 \
+  util/testharness.cc                                                   \
+  util/testutil.cc                                                      \
+  util/thread_list_test.cc                                              \
+  util/thread_local_test.cc                                             \
+  util/ldb_cmd_test.cc
+
+JNI_NATIVE_SOURCES =                                          \
+  java/rocksjni/backupenginejni.cc                            \
+  java/rocksjni/backupablejni.cc                              \
+  java/rocksjni/checkpoint.cc                                 \
+  java/rocksjni/columnfamilyhandle.cc                         \
+  java/rocksjni/compaction_filter.cc                          \
+  java/rocksjni/comparator.cc                                 \
+  java/rocksjni/comparatorjnicallback.cc                      \
+  java/rocksjni/env.cc                                        \
+  java/rocksjni/filter.cc                                     \
+  java/rocksjni/iterator.cc                                   \
+  java/rocksjni/loggerjnicallback.cc                          \
+  java/rocksjni/memtablejni.cc                                \
+  java/rocksjni/merge_operator.cc                             \
+  java/rocksjni/options.cc                                    \
+  java/rocksjni/ratelimiterjni.cc                             \
+  java/rocksjni/remove_emptyvalue_compactionfilterjni.cc      \
+  java/rocksjni/restorejni.cc                                 \
+  java/rocksjni/rocksjni.cc                                   \
+  java/rocksjni/slice.cc                                      \
+  java/rocksjni/snapshot.cc                                   \
+  java/rocksjni/statistics.cc                                 \
+  java/rocksjni/table.cc                                      \
+  java/rocksjni/transaction_log.cc                            \
+  java/rocksjni/ttl.cc                                        \
+  java/rocksjni/write_batch.cc                                \
+  java/rocksjni/writebatchhandlerjnicallback.cc               \
+  java/rocksjni/write_batch_test.cc                           \
+  java/rocksjni/write_batch_with_index.cc
+
+# Currently, we do not generate dependencies for
+# java/rocksjni/write_batch_test.cc, because its dependency,
+# java/include/org_rocksdb_WriteBatch.h, is generated.
+# TODO/FIXME: fix the above.  Otherwise, the current rules would fail:
+#   java/rocksjni/write_batch_test.cc:13:44: fatal error: include/org_rocksdb_WriteBatch.h: No such file or directory
+#    #include "include/org_rocksdb_WriteBatch.h"
+
+# These are the xfunc tests that are run:
+XFUNC_TESTS =                                                   \
+  "managed_new"                                                 \
+  "managed_xftest_dropold"                                      \
+  "managed_xftest_release"                                      \
+  "inplace_lock_test"                                           \
+  "transaction"
diff --git a/src/rocksdb/table/adaptive_table_factory.cc b/src/rocksdb/table/adaptive_table_factory.cc
index dcc8406..c589c07 100644
--- a/src/rocksdb/table/adaptive_table_factory.cc
+++ b/src/rocksdb/table/adaptive_table_factory.cc
@@ -5,7 +5,9 @@
 #ifndef ROCKSDB_LITE
 #include "table/adaptive_table_factory.h"
 
+#include "table/table_builder.h"
 #include "table/format.h"
+#include "port/port.h"
 
 namespace rocksdb {
 
@@ -39,9 +41,9 @@ extern const uint64_t kLegacyBlockBasedTableMagicNumber;
 extern const uint64_t kCuckooTableMagicNumber;
 
 Status AdaptiveTableFactory::NewTableReader(
-    const ImmutableCFOptions& ioptions, const EnvOptions& env_options,
-    const InternalKeyComparator& icomp, unique_ptr<RandomAccessFile>&& file,
-    uint64_t file_size, unique_ptr<TableReader>* table) const {
+    const TableReaderOptions& table_reader_options,
+    unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
+    unique_ptr<TableReader>* table) const {
   Footer footer;
   auto s = ReadFooterFromFile(file.get(), file_size, &footer);
   if (!s.ok()) {
@@ -50,14 +52,14 @@ Status AdaptiveTableFactory::NewTableReader(
   if (footer.table_magic_number() == kPlainTableMagicNumber ||
       footer.table_magic_number() == kLegacyPlainTableMagicNumber) {
     return plain_table_factory_->NewTableReader(
-        ioptions, env_options, icomp, std::move(file), file_size, table);
+        table_reader_options, std::move(file), file_size, table);
   } else if (footer.table_magic_number() == kBlockBasedTableMagicNumber ||
       footer.table_magic_number() == kLegacyBlockBasedTableMagicNumber) {
     return block_based_table_factory_->NewTableReader(
-        ioptions, env_options, icomp, std::move(file), file_size, table);
+        table_reader_options, std::move(file), file_size, table);
   } else if (footer.table_magic_number() == kCuckooTableMagicNumber) {
     return cuckoo_table_factory_->NewTableReader(
-        ioptions, env_options, icomp, std::move(file), file_size, table);
+        table_reader_options, std::move(file), file_size, table);
   } else {
     return Status::NotSupported("Unidentified table format");
   }
@@ -65,7 +67,7 @@ Status AdaptiveTableFactory::NewTableReader(
 
 TableBuilder* AdaptiveTableFactory::NewTableBuilder(
     const TableBuilderOptions& table_builder_options,
-    WritableFile* file) const {
+    WritableFileWriter* file) const {
   return table_factory_to_write_->NewTableBuilder(table_builder_options, file);
 }
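
AdaptiveTableFactory::NewTableReader above is pure dispatch: it reads the
footer, inspects the per-format magic number, and forwards the open call to
the concrete factory that wrote the file. The same idea in miniature, with
stand-in format names instead of real factories (the block-based and cuckoo
constants appear elsewhere in this patch; the plain-table value below is a
placeholder):

    #include <cstdint>
    #include <map>
    #include <string>

    // Map a table file's footer magic number to the format that wrote it.
    std::string FormatForMagic(uint64_t magic) {
      static const std::map<uint64_t, std::string> kFormats = {
          {0x88e241b785f4cff7ull, "BlockBasedTable"},  // from this patch
          {0x926789d0c5f17873ull, "CuckooTable"},      // from this patch
          {0x0123456789abcdefull, "PlainTable"},       // placeholder value
      };
      auto it = kFormats.find(magic);
      return it == kFormats.end() ? "NotSupported: unidentified table format"
                                  : it->second;
    }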
 
diff --git a/src/rocksdb/table/adaptive_table_factory.h b/src/rocksdb/table/adaptive_table_factory.h
index aa0f827..dfcae14 100644
--- a/src/rocksdb/table/adaptive_table_factory.h
+++ b/src/rocksdb/table/adaptive_table_factory.h
@@ -33,23 +33,18 @@ class AdaptiveTableFactory : public TableFactory {
 
   const char* Name() const override { return "AdaptiveTableFactory"; }
 
-  Status NewTableReader(
-      const ImmutableCFOptions& ioptions, const EnvOptions& env_options,
-      const InternalKeyComparator& internal_comparator,
-      unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
-      unique_ptr<TableReader>* table) const override;
+  Status NewTableReader(const TableReaderOptions& table_reader_options,
+                        unique_ptr<RandomAccessFileReader>&& file,
+                        uint64_t file_size,
+                        unique_ptr<TableReader>* table) const override;
 
   TableBuilder* NewTableBuilder(
       const TableBuilderOptions& table_builder_options,
-      WritableFile* file) const override;
+      WritableFileWriter* file) const override;
 
   // Sanitizes the specified DB Options.
   Status SanitizeOptions(const DBOptions& db_opts,
                          const ColumnFamilyOptions& cf_opts) const override {
-    if (db_opts.allow_mmap_reads == false) {
-      return Status::NotSupported(
-          "AdaptiveTable with allow_mmap_reads == false is not supported.");
-    }
     return Status::OK();
   }
 
diff --git a/src/rocksdb/table/block.cc b/src/rocksdb/table/block.cc
index 6a5ede6..99c76f6 100644
--- a/src/rocksdb/table/block.cc
+++ b/src/rocksdb/table/block.cc
@@ -22,6 +22,7 @@
 #include "table/block_prefix_index.h"
 #include "util/coding.h"
 #include "util/logging.h"
+#include "util/perf_context_imp.h"
 
 namespace rocksdb {
 
@@ -82,6 +83,7 @@ void BlockIter::Prev() {
 }
 
 void BlockIter::Seek(const Slice& target) {
+  PERF_TIMER_GUARD(block_seek_nanos);
   if (data_ == nullptr) {  // Not init yet
     return;
   }
@@ -359,7 +361,7 @@ void Block::SetBlockPrefixIndex(BlockPrefixIndex* prefix_index) {
 }
 
 size_t Block::ApproximateMemoryUsage() const {
-  size_t usage = size();
+  size_t usage = usable_size();
   if (hash_index_) {
     usage += hash_index_->ApproximateMemoryUsage();
   }
diff --git a/src/rocksdb/table/block.h b/src/rocksdb/table/block.h
index 0187489..2ce48d3 100644
--- a/src/rocksdb/table/block.h
+++ b/src/rocksdb/table/block.h
@@ -10,6 +10,9 @@
 #pragma once
 #include <stddef.h>
 #include <stdint.h>
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+#include <malloc.h>
+#endif
 
 #include "rocksdb/iterator.h"
 #include "rocksdb/options.h"
@@ -37,6 +40,14 @@ class Block {
   size_t size() const { return size_; }
   const char* data() const { return data_; }
   bool cachable() const { return contents_.cachable; }
+  size_t usable_size() const {
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+    if (contents_.allocation.get() != nullptr) {
+      return malloc_usable_size(contents_.allocation.get());
+    }
+#endif  // ROCKSDB_MALLOC_USABLE_SIZE
+    return size_;
+  }
   uint32_t NumRestarts() const;
   CompressionType compression_type() const {
     return contents_.compression_type;
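
usable_size() exists so that cache charging (see the Insert() call sites
changed later in this patch) reflects what the allocator actually handed out
rather than the block's logical size; allocators round requests up to size
classes, so malloc_usable_size() can report more than was asked for. A
glibc-only demonstration of the primitive:

    #include <cstdio>
    #include <cstdlib>
    #include <malloc.h>  // malloc_usable_size: GNU extension, not portable

    int main() {
      void* p = std::malloc(100);
      // Usually prints a value >= 100 (e.g. 104 or 112), depending on the
      // allocator's internal size classes.
      std::printf("requested 100, usable %zu\n", malloc_usable_size(p));
      std::free(p);
      return 0;
    }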
diff --git a/src/rocksdb/table/block_based_filter_block.cc b/src/rocksdb/table/block_based_filter_block.cc
index cd56028..c33d485 100644
--- a/src/rocksdb/table/block_based_filter_block.cc
+++ b/src/rocksdb/table/block_based_filter_block.cc
@@ -7,12 +7,13 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
-#include <algorithm>
 #include "table/block_based_filter_block.h"
+#include <algorithm>
 
 #include "db/dbformat.h"
 #include "rocksdb/filter_policy.h"
 #include "util/coding.h"
+#include "util/perf_context_imp.h"
 #include "util/string_util.h"
 
 namespace rocksdb {
@@ -219,7 +220,14 @@ bool BlockBasedFilterBlockReader::MayMatch(const Slice& entry,
     uint32_t limit = DecodeFixed32(offset_ + index * 4 + 4);
     if (start <= limit && limit <= (uint32_t)(offset_ - data_)) {
       Slice filter = Slice(data_ + start, limit - start);
-      return policy_->KeyMayMatch(entry, filter);
+      bool const may_match = policy_->KeyMayMatch(entry, filter);
+      if (may_match) {
+        PERF_COUNTER_ADD(bloom_sst_hit_count, 1);
+        return true;
+      } else {
+        PERF_COUNTER_ADD(bloom_sst_miss_count, 1);
+        return false;
+      }
     } else if (start == limit) {
       // Empty filters do not match any entries
       return false;
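
The new counters follow Bloom-filter semantics: KeyMayMatch() returning true
may be a false positive, while false is definitive, so bloom_sst_hit_count
counts probes the filter let through and bloom_sst_miss_count counts reads it
short-circuited. A toy single-probe illustration (real RocksDB filters hash
each key several times):

    #include <bitset>
    #include <functional>
    #include <string>

    struct ToyBloom {
      std::bitset<1024> bits;
      void Add(const std::string& k) {
        bits[std::hash<std::string>{}(k) % bits.size()] = true;
      }
      bool KeyMayMatch(const std::string& k) const {
        // true can be a false positive; false never is
        return bits[std::hash<std::string>{}(k) % bits.size()];
      }
    };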
diff --git a/src/rocksdb/table/block_based_table_builder.cc b/src/rocksdb/table/block_based_table_builder.cc
index 201f128..e135315 100644
--- a/src/rocksdb/table/block_based_table_builder.cc
+++ b/src/rocksdb/table/block_based_table_builder.cc
@@ -360,6 +360,13 @@ Slice CompressBlock(const Slice& raw,
         return *compressed_output;
       }
       break;     // fall back to no compression.
+    case kZSTDNotFinalCompression:
+      if (ZSTD_Compress(compression_options, raw.data(), raw.size(),
+                        compressed_output) &&
+          GoodCompressionRatio(compressed_output->size(), raw.size())) {
+        return *compressed_output;
+      }
+      break;     // fall back to no compression.
     default: {}  // Do not recognize this compression type
   }
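
Each case above keeps the compressed output only when GoodCompressionRatio()
approves it; otherwise the block is stored uncompressed. The check, defined
earlier in this file (reproduced here from memory, so treat it as a sketch),
requires compression to save at least one eighth of the raw size:

    #include <cstddef>

    bool GoodCompressionRatio(size_t compressed_size, size_t raw_size) {
      // e.g. a 4096-byte block must shrink below 3584 bytes to be kept
      return compressed_size < raw_size - (raw_size / 8u);
    }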
 
@@ -374,12 +381,15 @@ Slice CompressBlock(const Slice& raw,
 // kBlockBasedTableMagicNumber was picked by running
 //    echo rocksdb.table.block_based | sha1sum
 // and taking the leading 64 bits.
-// Please note that kBlockBasedTableMagicNumber may also be accessed by
-// other .cc files so it have to be explicitly declared with "extern".
-extern const uint64_t kBlockBasedTableMagicNumber = 0x88e241b785f4cff7ull;
+// Please note that kBlockBasedTableMagicNumber may also be accessed by
+// other .cc files; for that reason we declare it extern in the header,
+// but to get storage allocated it must be defined without extern in
+// exactly one place.
+const uint64_t kBlockBasedTableMagicNumber = 0x88e241b785f4cff7ull;
 // We also support reading and writing legacy block based table format (for
 // backwards compatibility)
-extern const uint64_t kLegacyBlockBasedTableMagicNumber = 0xdb4775248b80fb57ull;
+const uint64_t kLegacyBlockBasedTableMagicNumber = 0xdb4775248b80fb57ull;
 
 // A collector that collects properties of interest to block-based table.
 // For now this class looks heavy-weight since we only write one additional
@@ -434,7 +444,7 @@ struct BlockBasedTableBuilder::Rep {
   const ImmutableCFOptions ioptions;
   const BlockBasedTableOptions table_options;
   const InternalKeyComparator& internal_comparator;
-  WritableFile* file;
+  WritableFileWriter* file;
   uint64_t offset = 0;
   Status status;
   BlockBuilder data_block;
@@ -464,7 +474,7 @@ struct BlockBasedTableBuilder::Rep {
       const InternalKeyComparator& icomparator,
       const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
           int_tbl_prop_collector_factories,
-      WritableFile* f, const CompressionType _compression_type,
+      WritableFileWriter* f, const CompressionType _compression_type,
       const CompressionOptions& _compression_opts, const bool skip_filters)
       : ioptions(_ioptions),
         table_options(table_opt),
@@ -499,7 +509,7 @@ BlockBasedTableBuilder::BlockBasedTableBuilder(
     const InternalKeyComparator& internal_comparator,
     const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
         int_tbl_prop_collector_factories,
-    WritableFile* file, const CompressionType compression_type,
+    WritableFileWriter* file, const CompressionType compression_type,
     const CompressionOptions& compression_opts, const bool skip_filters) {
   BlockBasedTableOptions sanitized_table_options(table_options);
   if (sanitized_table_options.format_version == 0 &&
@@ -521,7 +531,7 @@ BlockBasedTableBuilder::BlockBasedTableBuilder(
   }
   if (table_options.block_cache_compressed.get() != nullptr) {
     BlockBasedTable::GenerateCachePrefix(
-        table_options.block_cache_compressed.get(), file,
+        table_options.block_cache_compressed.get(), file->writable_file(),
         &rep_->compressed_cache_key_prefix[0],
         &rep_->compressed_cache_key_prefix_size);
   }
@@ -703,8 +713,8 @@ Status BlockBasedTableBuilder::InsertBlockInCache(const Slice& block_contents,
               (end - r->compressed_cache_key_prefix));
 
     // Insert into compressed block cache.
-    cache_handle = block_cache_compressed->Insert(key, block, block->size(),
-                                                  &DeleteCachedBlock);
+    cache_handle = block_cache_compressed->Insert(
+        key, block, block->usable_size(), &DeleteCachedBlock);
     block_cache_compressed->Release(cache_handle);
 
     // Invalidate OS cache.
@@ -848,6 +858,15 @@ uint64_t BlockBasedTableBuilder::FileSize() const {
   return rep_->offset;
 }
 
+bool BlockBasedTableBuilder::NeedCompact() const {
+  for (const auto& collector : rep_->table_properties_collectors) {
+    if (collector->NeedCompact()) {
+      return true;
+    }
+  }
+  return false;
+}
+
 TableProperties BlockBasedTableBuilder::GetTableProperties() const {
   TableProperties ret = rep_->props;
   for (const auto& collector : rep_->table_properties_collectors) {
diff --git a/src/rocksdb/table/block_based_table_builder.h b/src/rocksdb/table/block_based_table_builder.h
index 716a4e9..ce86820 100644
--- a/src/rocksdb/table/block_based_table_builder.h
+++ b/src/rocksdb/table/block_based_table_builder.h
@@ -26,6 +26,9 @@ class BlockHandle;
 class WritableFile;
 struct BlockBasedTableOptions;
 
+extern const uint64_t kBlockBasedTableMagicNumber;
+extern const uint64_t kLegacyBlockBasedTableMagicNumber;
+
 class BlockBasedTableBuilder : public TableBuilder {
  public:
   // Create a builder that will store the contents of the table it is
@@ -37,7 +40,7 @@ class BlockBasedTableBuilder : public TableBuilder {
       const InternalKeyComparator& internal_comparator,
       const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
           int_tbl_prop_collector_factories,
-      WritableFile* file, const CompressionType compression_type,
+      WritableFileWriter* file, const CompressionType compression_type,
       const CompressionOptions& compression_opts, const bool skip_filters);
 
   // REQUIRES: Either Finish() or Abandon() has been called.
@@ -70,6 +73,8 @@ class BlockBasedTableBuilder : public TableBuilder {
   // Finish() call, returns the size of the final generated file.
   uint64_t FileSize() const override;
 
+  bool NeedCompact() const override;
+
   // Get table properties
   TableProperties GetTableProperties() const override;
 
diff --git a/src/rocksdb/table/block_based_table_factory.cc b/src/rocksdb/table/block_based_table_factory.cc
index f87660c..ea910c6 100644
--- a/src/rocksdb/table/block_based_table_factory.cc
+++ b/src/rocksdb/table/block_based_table_factory.cc
@@ -42,18 +42,27 @@ BlockBasedTableFactory::BlockBasedTableFactory(
 }
 
 Status BlockBasedTableFactory::NewTableReader(
-    const ImmutableCFOptions& ioptions, const EnvOptions& soptions,
-    const InternalKeyComparator& internal_comparator,
-    unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
+    const TableReaderOptions& table_reader_options,
+    unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
+    unique_ptr<TableReader>* table_reader) const {
+  return NewTableReader(table_reader_options, std::move(file), file_size,
+                        table_reader,
+                        /*prefetch_index_and_filter=*/true);
+}
+
+Status BlockBasedTableFactory::NewTableReader(
+    const TableReaderOptions& table_reader_options,
+    unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
     unique_ptr<TableReader>* table_reader, const bool prefetch_enabled) const {
-  return BlockBasedTable::Open(ioptions, soptions, table_options_,
-                               internal_comparator, std::move(file), file_size,
-                               table_reader, prefetch_enabled);
+  return BlockBasedTable::Open(
+      table_reader_options.ioptions, table_reader_options.env_options,
+      table_options_, table_reader_options.internal_comparator, std::move(file),
+      file_size, table_reader, prefetch_enabled);
 }
 
 TableBuilder* BlockBasedTableFactory::NewTableBuilder(
     const TableBuilderOptions& table_builder_options,
-    WritableFile* file) const {
+    WritableFileWriter* file) const {
   auto table_builder = new BlockBasedTableBuilder(
       table_builder_options.ioptions, table_options_,
       table_builder_options.internal_comparator,
@@ -115,7 +124,7 @@ std::string BlockBasedTableFactory::GetPrintableTableOptions() const {
            table_options_.block_cache.get());
   ret.append(buffer);
   if (table_options_.block_cache) {
-    snprintf(buffer, kBufferSize, "  block_cache_size: %zd\n",
+    snprintf(buffer, kBufferSize, "  block_cache_size: %" ROCKSDB_PRIszt "\n",
              table_options_.block_cache->GetCapacity());
     ret.append(buffer);
   }
@@ -123,11 +132,12 @@ std::string BlockBasedTableFactory::GetPrintableTableOptions() const {
            table_options_.block_cache_compressed.get());
   ret.append(buffer);
   if (table_options_.block_cache_compressed) {
-    snprintf(buffer, kBufferSize, "  block_cache_compressed_size: %zd\n",
+    snprintf(buffer, kBufferSize,
+             "  block_cache_compressed_size: %" ROCKSDB_PRIszt "\n",
              table_options_.block_cache_compressed->GetCapacity());
     ret.append(buffer);
   }
-  snprintf(buffer, kBufferSize, "  block_size: %zd\n",
+  snprintf(buffer, kBufferSize, "  block_size: %" ROCKSDB_PRIszt "\n",
            table_options_.block_size);
   ret.append(buffer);
   snprintf(buffer, kBufferSize, "  block_size_deviation: %d\n",
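
The %zd to ROCKSDB_PRIszt changes are a portability shim: older MSVC C
runtimes do not accept the C99 'z' length modifier, so RocksDB routes size_t
formatting through a macro. Presumably it expands to "zu" on POSIX and an
MSVC-specific spelling such as "Iu" on Windows; a sketch of the idea with a
hypothetical macro name:

    #include <cstddef>
    #include <cstdio>

    #ifdef _MSC_VER
    #define MY_PRIszt "Iu"  // MSVC length modifier for size_t
    #else
    #define MY_PRIszt "zu"  // C99
    #endif

    int main() {
      size_t block_size = 4096;
      std::printf("block_size: %" MY_PRIszt "\n", block_size);
      return 0;
    }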
diff --git a/src/rocksdb/table/block_based_table_factory.h b/src/rocksdb/table/block_based_table_factory.h
index 6394926..8bdd4cd 100644
--- a/src/rocksdb/table/block_based_table_factory.h
+++ b/src/rocksdb/table/block_based_table_factory.h
@@ -33,28 +33,22 @@ class BlockBasedTableFactory : public TableFactory {
 
   const char* Name() const override { return "BlockBasedTable"; }
 
-  Status NewTableReader(const ImmutableCFOptions& ioptions,
-                        const EnvOptions& soptions,
-                        const InternalKeyComparator& internal_comparator,
-                        unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
-                        unique_ptr<TableReader>* table_reader) const override {
-    return NewTableReader(ioptions, soptions, internal_comparator,
-                          std::move(file), file_size, table_reader,
-                          /*prefetch_index_and_filter=*/true);
-  }
+  Status NewTableReader(const TableReaderOptions& table_reader_options,
+                        unique_ptr<RandomAccessFileReader>&& file,
+                        uint64_t file_size,
+                        unique_ptr<TableReader>* table_reader) const override;
 
   // This is a variant of the virtual member function NewTableReader, with
   // the added capability to disable pre-fetching of blocks in
   // BlockBasedTable::Open
-  Status NewTableReader(const ImmutableCFOptions& ioptions,
-                        const EnvOptions& soptions,
-                        const InternalKeyComparator& internal_comparator,
-                        unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
+  Status NewTableReader(const TableReaderOptions& table_reader_options,
+                        unique_ptr<RandomAccessFileReader>&& file,
+                        uint64_t file_size,
                         unique_ptr<TableReader>* table_reader,
                         bool prefetch_index_and_filter) const;
 
   TableBuilder* NewTableBuilder(
       const TableBuilderOptions& table_builder_options,
-      WritableFile* file) const override;
+      WritableFileWriter* file) const override;
 
   // Sanitizes the specified DB Options.
   Status SanitizeOptions(const DBOptions& db_opts,
diff --git a/src/rocksdb/table/block_based_table_reader.cc b/src/rocksdb/table/block_based_table_reader.cc
index ed7fb0b..b113272 100644
--- a/src/rocksdb/table/block_based_table_reader.cc
+++ b/src/rocksdb/table/block_based_table_reader.cc
@@ -37,6 +37,7 @@
 #include "table/get_context.h"
 
 #include "util/coding.h"
+#include "util/file_reader_writer.h"
 #include "util/perf_context_imp.h"
 #include "util/stop_watch.h"
 #include "util/string_util.h"
@@ -62,7 +63,7 @@ const size_t kMaxCacheKeyPrefixSize __attribute__((unused)) =
 // The only relevant option is options.verify_checksums for now.
 // On failure return non-OK.
 // On success fill *result and return OK - caller owns *result
-Status ReadBlockFromFile(RandomAccessFile* file, const Footer& footer,
+Status ReadBlockFromFile(RandomAccessFileReader* file, const Footer& footer,
                          const ReadOptions& options, const BlockHandle& handle,
                          std::unique_ptr<Block>* result, Env* env,
                          bool do_uncompress = true) {
@@ -116,6 +117,9 @@ Cache::Handle* GetEntryFromCache(Cache* block_cache, const Slice& key,
     PERF_COUNTER_ADD(block_cache_hit_count, 1);
     // overall cache hit
     RecordTick(statistics, BLOCK_CACHE_HIT);
+    // total bytes read from cache
+    RecordTick(statistics, BLOCK_CACHE_BYTES_READ,
+               block_cache->GetUsage(cache_handle));
     // block-type specific cache hit
     RecordTick(statistics, block_cache_hit_ticker);
   } else {
@@ -147,6 +151,8 @@ class BlockBasedTable::IndexReader {
 
   // The size of the index.
   virtual size_t size() const = 0;
+  // Memory usage of the index block
+  virtual size_t usable_size() const = 0;
 
   // Report an approximation of how much memory has been used other than memory
   // that was allocated in block cache.
@@ -165,7 +171,7 @@ class BinarySearchIndexReader : public IndexReader {
   // `BinarySearchIndexReader`.
   // On success, index_reader will be populated; otherwise it will remain
   // unmodified.
-  static Status Create(RandomAccessFile* file, const Footer& footer,
+  static Status Create(RandomAccessFileReader* file, const Footer& footer,
                        const BlockHandle& index_handle, Env* env,
                        const Comparator* comparator,
                        IndexReader** index_reader) {
@@ -187,6 +193,9 @@ class BinarySearchIndexReader : public IndexReader {
   }
 
   virtual size_t size() const override { return index_block_->size(); }
+  virtual size_t usable_size() const override {
+    return index_block_->usable_size();
+  }
 
   virtual size_t ApproximateMemoryUsage() const override {
     assert(index_block_);
@@ -207,8 +216,8 @@ class BinarySearchIndexReader : public IndexReader {
 class HashIndexReader : public IndexReader {
  public:
   static Status Create(const SliceTransform* hash_key_extractor,
-                       const Footer& footer, RandomAccessFile* file, Env* env,
-                       const Comparator* comparator,
+                       const Footer& footer, RandomAccessFileReader* file,
+                       Env* env, const Comparator* comparator,
                        const BlockHandle& index_handle,
                        Iterator* meta_index_iter, IndexReader** index_reader,
                        bool hash_index_allow_collision) {
@@ -295,6 +304,9 @@ class HashIndexReader : public IndexReader {
   }
 
   virtual size_t size() const override { return index_block_->size(); }
+  virtual size_t usable_size() const override {
+    return index_block_->usable_size();
+  }
 
   virtual size_t ApproximateMemoryUsage() const override {
     assert(index_block_);
@@ -330,6 +342,7 @@ struct BlockBasedTable::Rep {
         table_options(_table_opt),
         filter_policy(_table_opt.filter_policy.get()),
         internal_comparator(_internal_comparator),
+        filter_type(FilterType::kNoFilter),
         whole_key_filtering(_table_opt.whole_key_filtering),
         prefix_filtering(true) {}
 
@@ -339,7 +352,7 @@ struct BlockBasedTable::Rep {
   const FilterPolicy* const filter_policy;
   const InternalKeyComparator& internal_comparator;
   Status status;
-  unique_ptr<RandomAccessFile> file;
+  unique_ptr<RandomAccessFileReader> file;
   char cache_key_prefix[kMaxCacheKeyPrefixSize];
   size_t cache_key_prefix_size = 0;
   char compressed_cache_key_prefix[kMaxCacheKeyPrefixSize];
@@ -353,6 +366,14 @@ struct BlockBasedTable::Rep {
   unique_ptr<IndexReader> index_reader;
   unique_ptr<FilterBlockReader> filter;
 
+  enum class FilterType {
+    kNoFilter,
+    kFullFilter,
+    kBlockFilter,
+  };
+  FilterType filter_type;
+  BlockHandle filter_handle;
+
   std::shared_ptr<const TableProperties> table_properties;
   BlockBasedTableOptions::IndexType index_type;
   bool hash_index_allow_collision;
@@ -397,13 +418,12 @@ void BlockBasedTable::SetupCacheKeyPrefix(Rep* rep) {
   rep->cache_key_prefix_size = 0;
   rep->compressed_cache_key_prefix_size = 0;
   if (rep->table_options.block_cache != nullptr) {
-    GenerateCachePrefix(rep->table_options.block_cache.get(), rep->file.get(),
-                        &rep->cache_key_prefix[0],
-                        &rep->cache_key_prefix_size);
+    GenerateCachePrefix(rep->table_options.block_cache.get(), rep->file->file(),
+                        &rep->cache_key_prefix[0], &rep->cache_key_prefix_size);
   }
   if (rep->table_options.block_cache_compressed != nullptr) {
     GenerateCachePrefix(rep->table_options.block_cache_compressed.get(),
-                        rep->file.get(), &rep->compressed_cache_key_prefix[0],
+                        rep->file->file(), &rep->compressed_cache_key_prefix[0],
                         &rep->compressed_cache_key_prefix_size);
   }
 }
@@ -461,7 +481,7 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions,
                              const EnvOptions& env_options,
                              const BlockBasedTableOptions& table_options,
                              const InternalKeyComparator& internal_comparator,
-                             unique_ptr<RandomAccessFile>&& file,
+                             unique_ptr<RandomAccessFileReader>&& file,
                              uint64_t file_size,
                              unique_ptr<TableReader>* table_reader,
                              const bool prefetch_index_and_filter) {
@@ -498,6 +518,21 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions,
     return s;
   }
 
+  // Find filter handle and filter type
+  if (rep->filter_policy) {
+    for (auto prefix : {kFullFilterBlockPrefix, kFilterBlockPrefix}) {
+      std::string filter_block_key = prefix;
+      filter_block_key.append(rep->filter_policy->Name());
+      if (FindMetaBlock(meta_iter.get(), filter_block_key, &rep->filter_handle)
+              .ok()) {
+        rep->filter_type = (prefix == kFullFilterBlockPrefix)
+                               ? Rep::FilterType::kFullFilter
+                               : Rep::FilterType::kBlockFilter;
+        break;
+      }
+    }
+  }
+
   // Read the properties
   bool found_properties_block = true;
   s = SeekToPropertiesBlock(meta_iter.get(), &found_properties_block);
@@ -565,7 +600,7 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions,
 
         // Set filter block
         if (rep->filter_policy) {
-          rep->filter.reset(ReadFilter(rep, meta_iter.get(), nullptr));
+          rep->filter.reset(ReadFilter(rep, nullptr));
         }
       } else {
         delete index_reader;
@@ -585,13 +620,13 @@ void BlockBasedTable::SetupForCompaction() {
     case Options::NONE:
       break;
     case Options::NORMAL:
-      rep_->file->Hint(RandomAccessFile::NORMAL);
+      rep_->file->file()->Hint(RandomAccessFile::NORMAL);
       break;
     case Options::SEQUENTIAL:
-      rep_->file->Hint(RandomAccessFile::SEQUENTIAL);
+      rep_->file->file()->Hint(RandomAccessFile::SEQUENTIAL);
       break;
     case Options::WILLNEED:
-      rep_->file->Hint(RandomAccessFile::WILLNEED);
+      rep_->file->file()->Hint(RandomAccessFile::WILLNEED);
       break;
     default:
       assert(false);
@@ -702,9 +737,9 @@ Status BlockBasedTable::GetDataBlockFromCache(
     assert(block->value->compression_type() == kNoCompression);
     if (block_cache != nullptr && block->value->cachable() &&
         read_options.fill_cache) {
-      block->cache_handle =
-          block_cache->Insert(block_cache_key, block->value,
-                              block->value->size(), &DeleteCachedEntry<Block>);
+      block->cache_handle = block_cache->Insert(block_cache_key, block->value,
+                                                block->value->usable_size(),
+                                                &DeleteCachedEntry<Block>);
       assert(reinterpret_cast<Block*>(
                  block_cache->Value(block->cache_handle)) == block->value);
     }
@@ -747,7 +782,7 @@ Status BlockBasedTable::PutDataBlockToCache(
   if (block_cache_compressed != nullptr && raw_block != nullptr &&
       raw_block->cachable()) {
     auto cache_handle = block_cache_compressed->Insert(
-        compressed_block_cache_key, raw_block, raw_block->size(),
+        compressed_block_cache_key, raw_block, raw_block->usable_size(),
         &DeleteCachedEntry<Block>);
     block_cache_compressed->Release(cache_handle);
     RecordTick(statistics, BLOCK_CACHE_COMPRESSED_MISS);
@@ -759,10 +794,12 @@ Status BlockBasedTable::PutDataBlockToCache(
   // insert into uncompressed block cache
   assert((block->value->compression_type() == kNoCompression));
   if (block_cache != nullptr && block->value->cachable()) {
-    block->cache_handle =
-        block_cache->Insert(block_cache_key, block->value, block->value->size(),
-                            &DeleteCachedEntry<Block>);
+    block->cache_handle = block_cache->Insert(block_cache_key, block->value,
+                                              block->value->usable_size(),
+                                              &DeleteCachedEntry<Block>);
     RecordTick(statistics, BLOCK_CACHE_ADD);
+    RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE,
+               block->value->usable_size());
     assert(reinterpret_cast<Block*>(block_cache->Value(block->cache_handle)) ==
            block->value);
   }
@@ -770,45 +807,43 @@ Status BlockBasedTable::PutDataBlockToCache(
   return s;
 }
 
-FilterBlockReader* BlockBasedTable::ReadFilter(
-    Rep* rep, Iterator* meta_index_iter, size_t* filter_size) {
+FilterBlockReader* BlockBasedTable::ReadFilter(Rep* rep, size_t* filter_size) {
   // TODO: We might want to unify with ReadBlockFromFile() if we start
   // requiring checksum verification in Table::Open.
-  for (auto prefix : {kFullFilterBlockPrefix, kFilterBlockPrefix}) {
-    std::string filter_block_key = prefix;
-    filter_block_key.append(rep->filter_policy->Name());
-    BlockHandle handle;
-    if (FindMetaBlock(meta_index_iter, filter_block_key, &handle).ok()) {
-      BlockContents block;
-      if (!ReadBlockContents(rep->file.get(), rep->footer, ReadOptions(),
-                             handle, &block, rep->ioptions.env, false).ok()) {
-        // Error reading the block
-        return nullptr;
-      }
-
-      if (filter_size) {
-        *filter_size = block.data.size();
-      }
-
-      assert(rep->filter_policy);
-      if (kFilterBlockPrefix == prefix) {
-        return new BlockBasedFilterBlockReader(
-            rep->prefix_filtering ? rep->ioptions.prefix_extractor : nullptr,
-            rep->table_options, rep->whole_key_filtering, std::move(block));
-      } else if (kFullFilterBlockPrefix == prefix) {
-        auto filter_bits_reader = rep->filter_policy->
-            GetFilterBitsReader(block.data);
-        if (filter_bits_reader != nullptr) {
-          return new FullFilterBlockReader(
-              rep->prefix_filtering ? rep->ioptions.prefix_extractor : nullptr,
-              rep->whole_key_filtering, std::move(block), filter_bits_reader);
-        }
-      } else {
-        assert(false);
-        return nullptr;
-      }
+  if (rep->filter_type == Rep::FilterType::kNoFilter) {
+    return nullptr;
+  }
+  BlockContents block;
+  if (!ReadBlockContents(rep->file.get(), rep->footer, ReadOptions(),
+                         rep->filter_handle, &block, rep->ioptions.env,
+                         false).ok()) {
+    // Error reading the block
+    return nullptr;
+  }
+
+  if (filter_size) {
+    *filter_size = block.data.size();
+  }
+
+  assert(rep->filter_policy);
+
+  if (rep->filter_type == Rep::FilterType::kBlockFilter) {
+    return new BlockBasedFilterBlockReader(
+        rep->prefix_filtering ? rep->ioptions.prefix_extractor : nullptr,
+        rep->table_options, rep->whole_key_filtering, std::move(block));
+  } else if (rep->filter_type == Rep::FilterType::kFullFilter) {
+    auto filter_bits_reader =
+        rep->filter_policy->GetFilterBitsReader(block.data);
+    if (filter_bits_reader != nullptr) {
+      return new FullFilterBlockReader(
+          rep->prefix_filtering ? rep->ioptions.prefix_extractor : nullptr,
+          rep->whole_key_filtering, std::move(block), filter_bits_reader);
     }
   }
+
+  // filter_type is either kNoFilter (in which case we returned at the first
+  // check above), kBlockFilter, or kFullFilter, so execution can never
+  // reach this point.
+  assert(false);
   return nullptr;
 }
 
@@ -822,6 +857,8 @@ BlockBasedTable::CachableEntry<FilterBlockReader> BlockBasedTable::GetFilter(
     return {rep_->filter.get(), nullptr /* cache handle */};
   }
 
+  PERF_TIMER_GUARD(read_filter_block_nanos);
+
   Cache* block_cache = rep_->table_options.block_cache.get();
   if (rep_->filter_policy == nullptr /* do not use filter */ ||
       block_cache == nullptr /* no block cache at all */) {
@@ -848,18 +885,13 @@ BlockBasedTable::CachableEntry<FilterBlockReader> BlockBasedTable::GetFilter(
     return CachableEntry<FilterBlockReader>();
   } else {
     size_t filter_size = 0;
-    std::unique_ptr<Block> meta;
-    std::unique_ptr<Iterator> iter;
-    auto s = ReadMetaBlock(rep_, &meta, &iter);
-
-    if (s.ok()) {
-      filter = ReadFilter(rep_, iter.get(), &filter_size);
-      if (filter != nullptr) {
-        assert(filter_size > 0);
-        cache_handle = block_cache->Insert(
-            key, filter, filter_size, &DeleteCachedEntry<FilterBlockReader>);
-        RecordTick(statistics, BLOCK_CACHE_ADD);
-      }
+    filter = ReadFilter(rep_, &filter_size);
+    if (filter != nullptr) {
+      assert(filter_size > 0);
+      cache_handle = block_cache->Insert(key, filter, filter_size,
+                                         &DeleteCachedEntry<FilterBlockReader>);
+      RecordTick(statistics, BLOCK_CACHE_ADD);
+      RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, filter_size);
     }
   }
 
@@ -873,6 +905,7 @@ Iterator* BlockBasedTable::NewIndexIterator(const ReadOptions& read_options,
     return rep_->index_reader->NewIterator(
         input_iter, read_options.total_order_seek);
   }
+  PERF_TIMER_GUARD(read_index_block_nanos);
 
   bool no_io = read_options.read_tier == kBlockCacheTier;
   Cache* block_cache = rep_->table_options.block_cache.get();
@@ -913,9 +946,12 @@ Iterator* BlockBasedTable::NewIndexIterator(const ReadOptions& read_options,
       }
     }
 
-    cache_handle = block_cache->Insert(key, index_reader, index_reader->size(),
-                                       &DeleteCachedEntry<IndexReader>);
+    cache_handle =
+        block_cache->Insert(key, index_reader, index_reader->usable_size(),
+                            &DeleteCachedEntry<IndexReader>);
     RecordTick(statistics, BLOCK_CACHE_ADD);
+    RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE,
+               index_reader->usable_size());
   }
 
   assert(cache_handle);
@@ -932,6 +968,8 @@ Iterator* BlockBasedTable::NewIndexIterator(const ReadOptions& read_options,
 Iterator* BlockBasedTable::NewDataBlockIterator(Rep* rep,
     const ReadOptions& ro, const Slice& index_value,
     BlockIter* input_iter) {
+  PERF_TIMER_GUARD(new_table_block_iter_nanos);
+
   const bool no_io = (ro.read_tier == kBlockCacheTier);
   Cache* block_cache = rep->table_options.block_cache.get();
   Cache* block_cache_compressed =
diff --git a/src/rocksdb/table/block_based_table_reader.h b/src/rocksdb/table/block_based_table_reader.h
index 727a0d6..d81f610 100644
--- a/src/rocksdb/table/block_based_table_reader.h
+++ b/src/rocksdb/table/block_based_table_reader.h
@@ -21,6 +21,7 @@
 #include "table/table_reader.h"
 #include "table/table_properties_internal.h"
 #include "util/coding.h"
+#include "util/file_reader_writer.h"
 
 namespace rocksdb {
 
@@ -69,8 +70,8 @@ class BlockBasedTable : public TableReader {
                      const EnvOptions& env_options,
                      const BlockBasedTableOptions& table_options,
                      const InternalKeyComparator& internal_key_comparator,
-                     unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
-                     unique_ptr<TableReader>* table_reader,
+                     unique_ptr<RandomAccessFileReader>&& file,
+                     uint64_t file_size, unique_ptr<TableReader>* table_reader,
                      bool prefetch_index_and_filter = true);
 
   bool PrefixMayMatch(const Slice& internal_key);
@@ -198,10 +199,7 @@ class BlockBasedTable : public TableReader {
       std::unique_ptr<Iterator>* iter);
 
   // Create the filter from the filter block.
-  static FilterBlockReader* ReadFilter(
-      Rep* rep,
-      Iterator* meta_index_iter,
-      size_t* filter_size = nullptr);
+  static FilterBlockReader* ReadFilter(Rep* rep, size_t* filter_size = nullptr);
 
   static void SetupCacheKeyPrefix(Rep* rep);
 
diff --git a/src/rocksdb/table/block_hash_index.cc b/src/rocksdb/table/block_hash_index.cc
index 02ebcbc..fd13296 100644
--- a/src/rocksdb/table/block_hash_index.cc
+++ b/src/rocksdb/table/block_hash_index.cc
@@ -132,9 +132,9 @@ bool BlockHashIndex::Add(const Slice& prefix, uint32_t restart_index,
   auto prefix_to_insert = prefix;
   if (kOwnPrefixes) {
     auto prefix_ptr = arena_.Allocate(prefix.size());
-    std::copy(prefix.data() /* begin */,
-              prefix.data() + prefix.size() /* end */,
-              prefix_ptr /* destination */);
+    // MSVC reports warning C4996 ("function call with parameters that may
+    // be unsafe") when std::copy is used with a raw pointer as the output
+    // iterator, so use memcpy instead.
+    memcpy(prefix_ptr, prefix.data(), prefix.size());
     prefix_to_insert = Slice(prefix_ptr, prefix.size());
   }
   auto result = restart_indices_.insert(
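
For trivially copyable bytes the two calls are equivalent; memcpy simply
avoids MSVC's checked-iterator diagnostic. Side by side:

    #include <algorithm>
    #include <cstddef>
    #include <cstring>

    void CopyBytes(char* dst, const char* src, size_t n) {
      // std::copy(src, src + n, dst);  // may warn C4996 under MSVC
      std::memcpy(dst, src, n);         // same effect, no warning
    }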
diff --git a/src/rocksdb/table/block_prefix_index.h b/src/rocksdb/table/block_prefix_index.h
index 662bc09..bc36c48 100644
--- a/src/rocksdb/table/block_prefix_index.h
+++ b/src/rocksdb/table/block_prefix_index.h
@@ -4,6 +4,7 @@
 // of patent rights can be found in the PATENTS file in the same directory.
 #pragma once
 
+#include <stdint.h>
 #include "rocksdb/status.h"
 
 namespace rocksdb {
diff --git a/src/rocksdb/table/cuckoo_table_builder.cc b/src/rocksdb/table/cuckoo_table_builder.cc
index 1aa1e07..946a8b5 100644
--- a/src/rocksdb/table/cuckoo_table_builder.cc
+++ b/src/rocksdb/table/cuckoo_table_builder.cc
@@ -20,6 +20,7 @@
 #include "table/format.h"
 #include "table/meta_blocks.h"
 #include "util/autovector.h"
+#include "util/file_reader_writer.h"
 #include "util/random.h"
 #include "util/string_util.h"
 
@@ -47,7 +48,7 @@ const std::string CuckooTablePropertyNames::kUserKeyLength =
 extern const uint64_t kCuckooTableMagicNumber = 0x926789d0c5f17873ull;
 
 CuckooTableBuilder::CuckooTableBuilder(
-    WritableFile* file, double max_hash_table_ratio,
+    WritableFileWriter* file, double max_hash_table_ratio,
     uint32_t max_num_hash_table, uint32_t max_search_depth,
     const Comparator* user_comparator, uint32_t cuckoo_block_size,
     bool use_module_hash, bool identity_as_first_hash,
diff --git a/src/rocksdb/table/cuckoo_table_builder.h b/src/rocksdb/table/cuckoo_table_builder.h
index 6b5a180..093e1c2 100644
--- a/src/rocksdb/table/cuckoo_table_builder.h
+++ b/src/rocksdb/table/cuckoo_table_builder.h
@@ -10,6 +10,7 @@
 #include <string>
 #include <utility>
 #include <vector>
+#include "port/port.h"
 #include "rocksdb/status.h"
 #include "table/table_builder.h"
 #include "rocksdb/table.h"
@@ -20,12 +21,13 @@ namespace rocksdb {
 
 class CuckooTableBuilder: public TableBuilder {
  public:
-  CuckooTableBuilder(
-      WritableFile* file, double max_hash_table_ratio,
-      uint32_t max_num_hash_func, uint32_t max_search_depth,
-      const Comparator* user_comparator, uint32_t cuckoo_block_size,
-      bool use_module_hash, bool identity_as_first_hash,
-      uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t));
+  CuckooTableBuilder(WritableFileWriter* file, double max_hash_table_ratio,
+                     uint32_t max_num_hash_func, uint32_t max_search_depth,
+                     const Comparator* user_comparator,
+                     uint32_t cuckoo_block_size, bool use_module_hash,
+                     bool identity_as_first_hash,
+                     uint64_t (*get_slice_hash)(const Slice&, uint32_t,
+                                                uint64_t));
 
   // REQUIRES: Either Finish() or Abandon() has been called.
   ~CuckooTableBuilder() {}
@@ -68,7 +70,7 @@ class CuckooTableBuilder: public TableBuilder {
     // We assume number of items is <= 2^32.
     uint32_t make_space_for_key_call_id;
   };
-  static const uint32_t kMaxVectorIdx = std::numeric_limits<int32_t>::max();
+  static const uint32_t kMaxVectorIdx = port::kMaxInt32;
 
   bool MakeSpaceForKey(const autovector<uint64_t>& hash_vals,
                        const uint32_t call_id,
@@ -81,7 +83,7 @@ class CuckooTableBuilder: public TableBuilder {
   inline Slice GetValue(uint64_t idx) const;
 
   uint32_t num_hash_func_;
-  WritableFile* file_;
+  WritableFileWriter* file_;
   const double max_hash_table_ratio_;
   const uint32_t max_num_hash_func_;
   const uint32_t max_search_depth_;
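
The hunks above are one instance of a migration that recurs throughout this
patch: builders stop writing to a raw WritableFile and go through a
WritableFileWriter wrapper instead. A hedged sketch of the wrapping pattern
the updated tests use, taking only the constructor and Close() shapes from
the hunks (the helper name is illustrative):

    #include <memory>
    #include <string>
    #include "rocksdb/env.h"
    #include "util/file_reader_writer.h"

    // Open a file via Env and hand back the wrapped writer the builders
    // now expect; callers later invoke writer->Close() instead of closing
    // the raw file.
    rocksdb::Status OpenWrappedWriter(
        rocksdb::Env* env, const std::string& fname,
        std::unique_ptr<rocksdb::WritableFileWriter>* out) {
      std::unique_ptr<rocksdb::WritableFile> raw;
      rocksdb::EnvOptions opts;
      rocksdb::Status s = env->NewWritableFile(fname, &raw, opts);
      if (s.ok()) {
        out->reset(new rocksdb::WritableFileWriter(std::move(raw), opts));
      }
      return s;
    }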
diff --git a/src/rocksdb/table/cuckoo_table_builder_test.cc b/src/rocksdb/table/cuckoo_table_builder_test.cc
index cab5daf..2ee87fb 100644
--- a/src/rocksdb/table/cuckoo_table_builder_test.cc
+++ b/src/rocksdb/table/cuckoo_table_builder_test.cc
@@ -3,6 +3,8 @@
 // LICENSE file in the root directory of this source tree. An additional grant
 // of patent rights can be found in the PATENTS file in the same directory.
 
+#ifndef ROCKSDB_LITE
+
 #include <vector>
 #include <string>
 #include <map>
@@ -10,6 +12,7 @@
 
 #include "table/meta_blocks.h"
 #include "table/cuckoo_table_builder.h"
+#include "util/file_reader_writer.h"
 #include "util/testharness.h"
 #include "util/testutil.h"
 
@@ -48,8 +51,11 @@ class CuckooBuilderTest : public testing::Test {
 
     // Assert Table Properties.
     TableProperties* props = nullptr;
-    ASSERT_OK(ReadTableProperties(read_file.get(), read_file_size,
-          kCuckooTableMagicNumber, env_, nullptr, &props));
+    unique_ptr<RandomAccessFileReader> file_reader(
+        new RandomAccessFileReader(std::move(read_file)));
+    ASSERT_OK(ReadTableProperties(file_reader.get(), read_file_size,
+                                  kCuckooTableMagicNumber, env_, nullptr,
+                                  &props));
     // Check unused bucket.
     std::string unused_key = props->user_collected_properties[
       CuckooTablePropertyNames::kEmptyKey];
@@ -90,8 +96,8 @@ class CuckooBuilderTest : public testing::Test {
     size_t bucket_size = expected_unused_bucket.size();
     for (uint32_t i = 0; i < table_size + cuckoo_block_size - 1; ++i) {
       Slice read_slice;
-      ASSERT_OK(read_file->Read(i*bucket_size, bucket_size,
-            &read_slice, nullptr));
+      ASSERT_OK(file_reader->Read(i * bucket_size, bucket_size, &read_slice,
+                                  nullptr));
       size_t key_idx =
           std::find(expected_locations.begin(), expected_locations.end(), i) -
           expected_locations.begin();
@@ -104,7 +110,7 @@ class CuckooBuilderTest : public testing::Test {
       }
     }
     for (auto key_found : keys_found) {
-      // Check that all keys were found.
+      // Check that all keys were found.
       ASSERT_TRUE(key_found);
     }
   }
@@ -133,12 +139,15 @@ TEST_F(CuckooBuilderTest, SuccessWithEmptyFile) {
   unique_ptr<WritableFile> writable_file;
   fname = test::TmpDir() + "/EmptyFile";
   ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_));
-  CuckooTableBuilder builder(writable_file.get(), kHashTableRatio,
-      4, 100, BytewiseComparator(), 1, false, false, GetSliceHash);
+  unique_ptr<WritableFileWriter> file_writer(
+      new WritableFileWriter(std::move(writable_file), EnvOptions()));
+  CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, 4, 100,
+                             BytewiseComparator(), 1, false, false,
+                             GetSliceHash);
   ASSERT_OK(builder.status());
   ASSERT_EQ(0UL, builder.FileSize());
   ASSERT_OK(builder.Finish());
-  ASSERT_OK(writable_file->Close());
+  ASSERT_OK(file_writer->Close());
   CheckFileContents({}, {}, {}, "", 2, 2, false);
 }
 
@@ -146,12 +155,15 @@ TEST_F(CuckooBuilderTest, WriteSuccessNoCollisionFullKey) {
   uint32_t num_hash_fun = 4;
   std::vector<std::string> user_keys = {"key01", "key02", "key03", "key04"};
   std::vector<std::string> values = {"v01", "v02", "v03", "v04"};
-  hash_map = {
-    {user_keys[0], {0, 1, 2, 3}},
-    {user_keys[1], {1, 2, 3, 4}},
-    {user_keys[2], {2, 3, 4, 5}},
-    {user_keys[3], {3, 4, 5, 6}}
-  };
+  // Need to have a temporary variable here as VS compiler does not currently
+  // support operator= with initializer_list as a parameter
+  std::unordered_map<std::string, std::vector<uint64_t>> hm = {
+      {user_keys[0], {0, 1, 2, 3}},
+      {user_keys[1], {1, 2, 3, 4}},
+      {user_keys[2], {2, 3, 4, 5}},
+      {user_keys[3], {3, 4, 5, 6}}};
+  hash_map = std::move(hm);
+
   std::vector<uint64_t> expected_locations = {0, 1, 2, 3};
   std::vector<std::string> keys;
   for (auto& user_key : user_keys) {
@@ -162,8 +174,11 @@ TEST_F(CuckooBuilderTest, WriteSuccessNoCollisionFullKey) {
   unique_ptr<WritableFile> writable_file;
   fname = test::TmpDir() + "/NoCollisionFullKey";
   ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_));
-  CuckooTableBuilder builder(writable_file.get(), kHashTableRatio,
-      num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash);
+  unique_ptr<WritableFileWriter> file_writer(
+      new WritableFileWriter(std::move(writable_file), EnvOptions()));
+  CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun,
+                             100, BytewiseComparator(), 1, false, false,
+                             GetSliceHash);
   ASSERT_OK(builder.status());
   for (uint32_t i = 0; i < user_keys.size(); i++) {
     builder.Add(Slice(keys[i]), Slice(values[i]));
@@ -173,7 +188,7 @@ TEST_F(CuckooBuilderTest, WriteSuccessNoCollisionFullKey) {
   size_t bucket_size = keys[0].size() + values[0].size();
   ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize());
   ASSERT_OK(builder.Finish());
-  ASSERT_OK(writable_file->Close());
+  ASSERT_OK(file_writer->Close());
   ASSERT_LE(expected_table_size * bucket_size, builder.FileSize());
 
   std::string expected_unused_bucket = GetInternalKey("key00", true);
@@ -186,12 +201,16 @@ TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionFullKey) {
   uint32_t num_hash_fun = 4;
   std::vector<std::string> user_keys = {"key01", "key02", "key03", "key04"};
   std::vector<std::string> values = {"v01", "v02", "v03", "v04"};
-  hash_map = {
-    {user_keys[0], {0, 1, 2, 3}},
-    {user_keys[1], {0, 1, 2, 3}},
-    {user_keys[2], {0, 1, 2, 3}},
-    {user_keys[3], {0, 1, 2, 3}},
+  // Need to have a temporary variable here as VS compiler does not currently
+  // support operator= with initializer_list as a parameter
+  std::unordered_map<std::string, std::vector<uint64_t>> hm = {
+      {user_keys[0], {0, 1, 2, 3}},
+      {user_keys[1], {0, 1, 2, 3}},
+      {user_keys[2], {0, 1, 2, 3}},
+      {user_keys[3], {0, 1, 2, 3}},
   };
+  hash_map = std::move(hm);
+
   std::vector<uint64_t> expected_locations = {0, 1, 2, 3};
   std::vector<std::string> keys;
   for (auto& user_key : user_keys) {
@@ -202,8 +221,11 @@ TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionFullKey) {
   unique_ptr<WritableFile> writable_file;
   fname = test::TmpDir() + "/WithCollisionFullKey";
   ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_));
-  CuckooTableBuilder builder(writable_file.get(), kHashTableRatio,
-      num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash);
+  unique_ptr<WritableFileWriter> file_writer(
+      new WritableFileWriter(std::move(writable_file), EnvOptions()));
+  CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun,
+                             100, BytewiseComparator(), 1, false, false,
+                             GetSliceHash);
   ASSERT_OK(builder.status());
   for (uint32_t i = 0; i < user_keys.size(); i++) {
     builder.Add(Slice(keys[i]), Slice(values[i]));
@@ -213,7 +235,7 @@ TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionFullKey) {
   size_t bucket_size = keys[0].size() + values[0].size();
   ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize());
   ASSERT_OK(builder.Finish());
-  ASSERT_OK(writable_file->Close());
+  ASSERT_OK(file_writer->Close());
   ASSERT_LE(expected_table_size * bucket_size, builder.FileSize());
 
   std::string expected_unused_bucket = GetInternalKey("key00", true);
@@ -226,12 +248,16 @@ TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionAndCuckooBlock) {
   uint32_t num_hash_fun = 4;
   std::vector<std::string> user_keys = {"key01", "key02", "key03", "key04"};
   std::vector<std::string> values = {"v01", "v02", "v03", "v04"};
-  hash_map = {
-    {user_keys[0], {0, 1, 2, 3}},
-    {user_keys[1], {0, 1, 2, 3}},
-    {user_keys[2], {0, 1, 2, 3}},
-    {user_keys[3], {0, 1, 2, 3}},
+  // Need to have a temporary variable here as VS compiler does not currently
+  // support operator= with initializer_list as a parameter
+  std::unordered_map<std::string, std::vector<uint64_t>> hm = {
+      {user_keys[0], {0, 1, 2, 3}},
+      {user_keys[1], {0, 1, 2, 3}},
+      {user_keys[2], {0, 1, 2, 3}},
+      {user_keys[3], {0, 1, 2, 3}},
   };
+  hash_map = std::move(hm);
+
   std::vector<uint64_t> expected_locations = {0, 1, 2, 3};
   std::vector<std::string> keys;
   for (auto& user_key : user_keys) {
@@ -243,9 +269,11 @@ TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionAndCuckooBlock) {
   uint32_t cuckoo_block_size = 2;
   fname = test::TmpDir() + "/WithCollisionFullKey2";
   ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_));
-  CuckooTableBuilder builder(writable_file.get(), kHashTableRatio,
-      num_hash_fun, 100, BytewiseComparator(), cuckoo_block_size,
-      false, false, GetSliceHash);
+  unique_ptr<WritableFileWriter> file_writer(
+      new WritableFileWriter(std::move(writable_file), EnvOptions()));
+  CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun,
+                             100, BytewiseComparator(), cuckoo_block_size,
+                             false, false, GetSliceHash);
   ASSERT_OK(builder.status());
   for (uint32_t i = 0; i < user_keys.size(); i++) {
     builder.Add(Slice(keys[i]), Slice(values[i]));
@@ -255,7 +283,7 @@ TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionAndCuckooBlock) {
   size_t bucket_size = keys[0].size() + values[0].size();
   ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize());
   ASSERT_OK(builder.Finish());
-  ASSERT_OK(writable_file->Close());
+  ASSERT_OK(file_writer->Close());
   ASSERT_LE(expected_table_size * bucket_size, builder.FileSize());
 
   std::string expected_unused_bucket = GetInternalKey("key00", true);
@@ -272,13 +300,17 @@ TEST_F(CuckooBuilderTest, WithCollisionPathFullKey) {
   std::vector<std::string> user_keys = {"key01", "key02", "key03",
     "key04", "key05"};
   std::vector<std::string> values = {"v01", "v02", "v03", "v04", "v05"};
-  hash_map = {
-    {user_keys[0], {0, 1}},
-    {user_keys[1], {1, 2}},
-    {user_keys[2], {2, 3}},
-    {user_keys[3], {3, 4}},
-    {user_keys[4], {0, 2}},
+  // Need to have a temporary variable here as VS compiler does not currently
+  // support operator= with initializer_list as a parameter
+  std::unordered_map<std::string, std::vector<uint64_t>> hm = {
+      {user_keys[0], {0, 1}},
+      {user_keys[1], {1, 2}},
+      {user_keys[2], {2, 3}},
+      {user_keys[3], {3, 4}},
+      {user_keys[4], {0, 2}},
   };
+  hash_map = std::move(hm);
+
   std::vector<uint64_t> expected_locations = {0, 1, 3, 4, 2};
   std::vector<std::string> keys;
   for (auto& user_key : user_keys) {
@@ -289,8 +321,11 @@ TEST_F(CuckooBuilderTest, WithCollisionPathFullKey) {
   unique_ptr<WritableFile> writable_file;
   fname = test::TmpDir() + "/WithCollisionPathFullKey";
   ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_));
-  CuckooTableBuilder builder(writable_file.get(), kHashTableRatio,
-      num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash);
+  unique_ptr<WritableFileWriter> file_writer(
+      new WritableFileWriter(std::move(writable_file), EnvOptions()));
+  CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun,
+                             100, BytewiseComparator(), 1, false, false,
+                             GetSliceHash);
   ASSERT_OK(builder.status());
   for (uint32_t i = 0; i < user_keys.size(); i++) {
     builder.Add(Slice(keys[i]), Slice(values[i]));
@@ -300,7 +335,7 @@ TEST_F(CuckooBuilderTest, WithCollisionPathFullKey) {
   size_t bucket_size = keys[0].size() + values[0].size();
   ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize());
   ASSERT_OK(builder.Finish());
-  ASSERT_OK(writable_file->Close());
+  ASSERT_OK(file_writer->Close());
   ASSERT_LE(expected_table_size * bucket_size, builder.FileSize());
 
   std::string expected_unused_bucket = GetInternalKey("key00", true);
@@ -314,13 +349,17 @@ TEST_F(CuckooBuilderTest, WithCollisionPathFullKeyAndCuckooBlock) {
   std::vector<std::string> user_keys = {"key01", "key02", "key03",
     "key04", "key05"};
   std::vector<std::string> values = {"v01", "v02", "v03", "v04", "v05"};
-  hash_map = {
-    {user_keys[0], {0, 1}},
-    {user_keys[1], {1, 2}},
-    {user_keys[2], {3, 4}},
-    {user_keys[3], {4, 5}},
-    {user_keys[4], {0, 3}},
+  // Need to have a temporary variable here as VS compiler does not currently
+  // support operator= with initializer_list as a parameter
+  std::unordered_map<std::string, std::vector<uint64_t>> hm = {
+      {user_keys[0], {0, 1}},
+      {user_keys[1], {1, 2}},
+      {user_keys[2], {3, 4}},
+      {user_keys[3], {4, 5}},
+      {user_keys[4], {0, 3}},
   };
+  hash_map = std::move(hm);
+
   std::vector<uint64_t> expected_locations = {2, 1, 3, 4, 0};
   std::vector<std::string> keys;
   for (auto& user_key : user_keys) {
@@ -331,8 +370,11 @@ TEST_F(CuckooBuilderTest, WithCollisionPathFullKeyAndCuckooBlock) {
   unique_ptr<WritableFile> writable_file;
   fname = test::TmpDir() + "/WithCollisionPathFullKeyAndCuckooBlock";
   ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_));
-  CuckooTableBuilder builder(writable_file.get(), kHashTableRatio,
-      num_hash_fun, 100, BytewiseComparator(), 2, false, false, GetSliceHash);
+  unique_ptr<WritableFileWriter> file_writer(
+      new WritableFileWriter(std::move(writable_file), EnvOptions()));
+  CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun,
+                             100, BytewiseComparator(), 2, false, false,
+                             GetSliceHash);
   ASSERT_OK(builder.status());
   for (uint32_t i = 0; i < user_keys.size(); i++) {
     builder.Add(Slice(keys[i]), Slice(values[i]));
@@ -342,7 +384,7 @@ TEST_F(CuckooBuilderTest, WithCollisionPathFullKeyAndCuckooBlock) {
   size_t bucket_size = keys[0].size() + values[0].size();
   ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize());
   ASSERT_OK(builder.Finish());
-  ASSERT_OK(writable_file->Close());
+  ASSERT_OK(file_writer->Close());
   ASSERT_LE(expected_table_size * bucket_size, builder.FileSize());
 
   std::string expected_unused_bucket = GetInternalKey("key00", true);
@@ -355,20 +397,26 @@ TEST_F(CuckooBuilderTest, WriteSuccessNoCollisionUserKey) {
   uint32_t num_hash_fun = 4;
   std::vector<std::string> user_keys = {"key01", "key02", "key03", "key04"};
   std::vector<std::string> values = {"v01", "v02", "v03", "v04"};
-  hash_map = {
-    {user_keys[0], {0, 1, 2, 3}},
-    {user_keys[1], {1, 2, 3, 4}},
-    {user_keys[2], {2, 3, 4, 5}},
-    {user_keys[3], {3, 4, 5, 6}}
-  };
+  // Need to have a temporary variable here as VS compiler does not currently
+  // support operator= with initializer_list as a parameter
+  std::unordered_map<std::string, std::vector<uint64_t>> hm = {
+      {user_keys[0], {0, 1, 2, 3}},
+      {user_keys[1], {1, 2, 3, 4}},
+      {user_keys[2], {2, 3, 4, 5}},
+      {user_keys[3], {3, 4, 5, 6}}};
+  hash_map = std::move(hm);
+
   std::vector<uint64_t> expected_locations = {0, 1, 2, 3};
   uint64_t expected_table_size = NextPowOf2(user_keys.size() / kHashTableRatio);
 
   unique_ptr<WritableFile> writable_file;
   fname = test::TmpDir() + "/NoCollisionUserKey";
   ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_));
-  CuckooTableBuilder builder(writable_file.get(), kHashTableRatio,
-      num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash);
+  unique_ptr<WritableFileWriter> file_writer(
+      new WritableFileWriter(std::move(writable_file), EnvOptions()));
+  CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun,
+                             100, BytewiseComparator(), 1, false, false,
+                             GetSliceHash);
   ASSERT_OK(builder.status());
   for (uint32_t i = 0; i < user_keys.size(); i++) {
     builder.Add(Slice(GetInternalKey(user_keys[i], true)), Slice(values[i]));
@@ -378,7 +426,7 @@ TEST_F(CuckooBuilderTest, WriteSuccessNoCollisionUserKey) {
   size_t bucket_size = user_keys[0].size() + values[0].size();
   ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize());
   ASSERT_OK(builder.Finish());
-  ASSERT_OK(writable_file->Close());
+  ASSERT_OK(file_writer->Close());
   ASSERT_LE(expected_table_size * bucket_size, builder.FileSize());
 
   std::string expected_unused_bucket = "key00";
@@ -391,20 +439,27 @@ TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionUserKey) {
   uint32_t num_hash_fun = 4;
   std::vector<std::string> user_keys = {"key01", "key02", "key03", "key04"};
   std::vector<std::string> values = {"v01", "v02", "v03", "v04"};
-  hash_map = {
-    {user_keys[0], {0, 1, 2, 3}},
-    {user_keys[1], {0, 1, 2, 3}},
-    {user_keys[2], {0, 1, 2, 3}},
-    {user_keys[3], {0, 1, 2, 3}},
+  // Need to have a temporary variable here as VS compiler does not currently
+  // support operator= with initializer_list as a parameter
+  std::unordered_map<std::string, std::vector<uint64_t>> hm = {
+      {user_keys[0], {0, 1, 2, 3}},
+      {user_keys[1], {0, 1, 2, 3}},
+      {user_keys[2], {0, 1, 2, 3}},
+      {user_keys[3], {0, 1, 2, 3}},
   };
+  hash_map = std::move(hm);
+
   std::vector<uint64_t> expected_locations = {0, 1, 2, 3};
   uint64_t expected_table_size = NextPowOf2(user_keys.size() / kHashTableRatio);
 
   unique_ptr<WritableFile> writable_file;
   fname = test::TmpDir() + "/WithCollisionUserKey";
   ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_));
-  CuckooTableBuilder builder(writable_file.get(), kHashTableRatio,
-      num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash);
+  unique_ptr<WritableFileWriter> file_writer(
+      new WritableFileWriter(std::move(writable_file), EnvOptions()));
+  CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun,
+                             100, BytewiseComparator(), 1, false, false,
+                             GetSliceHash);
   ASSERT_OK(builder.status());
   for (uint32_t i = 0; i < user_keys.size(); i++) {
     builder.Add(Slice(GetInternalKey(user_keys[i], true)), Slice(values[i]));
@@ -414,7 +469,7 @@ TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionUserKey) {
   size_t bucket_size = user_keys[0].size() + values[0].size();
   ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize());
   ASSERT_OK(builder.Finish());
-  ASSERT_OK(writable_file->Close());
+  ASSERT_OK(file_writer->Close());
   ASSERT_LE(expected_table_size * bucket_size, builder.FileSize());
 
   std::string expected_unused_bucket = "key00";
@@ -428,21 +483,28 @@ TEST_F(CuckooBuilderTest, WithCollisionPathUserKey) {
   std::vector<std::string> user_keys = {"key01", "key02", "key03",
     "key04", "key05"};
   std::vector<std::string> values = {"v01", "v02", "v03", "v04", "v05"};
-  hash_map = {
-    {user_keys[0], {0, 1}},
-    {user_keys[1], {1, 2}},
-    {user_keys[2], {2, 3}},
-    {user_keys[3], {3, 4}},
-    {user_keys[4], {0, 2}},
+  // Need to have a temporary variable here as VS compiler does not currently
+  // support operator= with initializer_list as a parameter
+  std::unordered_map<std::string, std::vector<uint64_t>> hm = {
+      {user_keys[0], {0, 1}},
+      {user_keys[1], {1, 2}},
+      {user_keys[2], {2, 3}},
+      {user_keys[3], {3, 4}},
+      {user_keys[4], {0, 2}},
   };
+  hash_map = std::move(hm);
+
   std::vector<uint64_t> expected_locations = {0, 1, 3, 4, 2};
   uint64_t expected_table_size = NextPowOf2(user_keys.size() / kHashTableRatio);
 
   unique_ptr<WritableFile> writable_file;
   fname = test::TmpDir() + "/WithCollisionPathUserKey";
   ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_));
-  CuckooTableBuilder builder(writable_file.get(), kHashTableRatio,
-      num_hash_fun, 2, BytewiseComparator(), 1, false, false, GetSliceHash);
+  unique_ptr<WritableFileWriter> file_writer(
+      new WritableFileWriter(std::move(writable_file), EnvOptions()));
+  CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun,
+                             2, BytewiseComparator(), 1, false, false,
+                             GetSliceHash);
   ASSERT_OK(builder.status());
   for (uint32_t i = 0; i < user_keys.size(); i++) {
     builder.Add(Slice(GetInternalKey(user_keys[i], true)), Slice(values[i]));
@@ -452,7 +514,7 @@ TEST_F(CuckooBuilderTest, WithCollisionPathUserKey) {
   size_t bucket_size = user_keys[0].size() + values[0].size();
   ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize());
   ASSERT_OK(builder.Finish());
-  ASSERT_OK(writable_file->Close());
+  ASSERT_OK(file_writer->Close());
   ASSERT_LE(expected_table_size * bucket_size, builder.FileSize());
 
   std::string expected_unused_bucket = "key00";
@@ -468,19 +530,25 @@ TEST_F(CuckooBuilderTest, FailWhenCollisionPathTooLong) {
   uint32_t num_hash_fun = 2;
   std::vector<std::string> user_keys = {"key01", "key02", "key03",
     "key04", "key05"};
-  hash_map = {
-    {user_keys[0], {0, 1}},
-    {user_keys[1], {1, 2}},
-    {user_keys[2], {2, 3}},
-    {user_keys[3], {3, 4}},
-    {user_keys[4], {0, 1}},
+  // Need to have a temporary variable here as VS compiler does not currently
+  // support operator= with initializer_list as a parameter
+  std::unordered_map<std::string, std::vector<uint64_t>> hm = {
+      {user_keys[0], {0, 1}},
+      {user_keys[1], {1, 2}},
+      {user_keys[2], {2, 3}},
+      {user_keys[3], {3, 4}},
+      {user_keys[4], {0, 1}},
   };
+  hash_map = std::move(hm);
 
   unique_ptr<WritableFile> writable_file;
   fname = test::TmpDir() + "/WithCollisionPathUserKey";
   ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_));
-  CuckooTableBuilder builder(writable_file.get(), kHashTableRatio,
-      num_hash_fun, 2, BytewiseComparator(), 1, false, false, GetSliceHash);
+  unique_ptr<WritableFileWriter> file_writer(
+      new WritableFileWriter(std::move(writable_file), EnvOptions()));
+  CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun,
+                             2, BytewiseComparator(), 1, false, false,
+                             GetSliceHash);
   ASSERT_OK(builder.status());
   for (uint32_t i = 0; i < user_keys.size(); i++) {
     builder.Add(Slice(GetInternalKey(user_keys[i], false)), Slice("value"));
@@ -488,19 +556,26 @@ TEST_F(CuckooBuilderTest, FailWhenCollisionPathTooLong) {
     ASSERT_OK(builder.status());
   }
   ASSERT_TRUE(builder.Finish().IsNotSupported());
-  ASSERT_OK(writable_file->Close());
+  ASSERT_OK(file_writer->Close());
 }
 
 TEST_F(CuckooBuilderTest, FailWhenSameKeyInserted) {
-  hash_map = {{"repeatedkey", {0, 1, 2, 3}}};
+  // Need to have a temporary variable here as VS compiler does not currently
+  // support operator= with initializer_list as a parameter
+  std::unordered_map<std::string, std::vector<uint64_t>> hm = {
+      {"repeatedkey", {0, 1, 2, 3}}};
+  hash_map = std::move(hm);
   uint32_t num_hash_fun = 4;
   std::string user_key = "repeatedkey";
 
   unique_ptr<WritableFile> writable_file;
   fname = test::TmpDir() + "/FailWhenSameKeyInserted";
   ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_));
-  CuckooTableBuilder builder(writable_file.get(), kHashTableRatio,
-      num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash);
+  unique_ptr<WritableFileWriter> file_writer(
+      new WritableFileWriter(std::move(writable_file), EnvOptions()));
+  CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun,
+                             100, BytewiseComparator(), 1, false, false,
+                             GetSliceHash);
   ASSERT_OK(builder.status());
 
   builder.Add(Slice(GetInternalKey(user_key, false)), Slice("value1"));
@@ -511,7 +586,7 @@ TEST_F(CuckooBuilderTest, FailWhenSameKeyInserted) {
   ASSERT_OK(builder.status());
 
   ASSERT_TRUE(builder.Finish().IsNotSupported());
-  ASSERT_OK(writable_file->Close());
+  ASSERT_OK(file_writer->Close());
 }
 }  // namespace rocksdb
 
@@ -519,3 +594,13 @@ int main(int argc, char** argv) {
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
+
+#else
+#include <stdio.h>
+
+int main(int argc, char** argv) {
+  fprintf(stderr, "SKIPPED as Cuckoo table is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/table/cuckoo_table_factory.cc b/src/rocksdb/table/cuckoo_table_factory.cc
index 17aa1d7..16bf3fb 100644
--- a/src/rocksdb/table/cuckoo_table_factory.cc
+++ b/src/rocksdb/table/cuckoo_table_factory.cc
@@ -12,12 +12,13 @@
 
 namespace rocksdb {
 
-Status CuckooTableFactory::NewTableReader(const ImmutableCFOptions& ioptions,
-    const EnvOptions& env_options, const InternalKeyComparator& icomp,
-    std::unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
+Status CuckooTableFactory::NewTableReader(
+    const TableReaderOptions& table_reader_options,
+    unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
     std::unique_ptr<TableReader>* table) const {
-  std::unique_ptr<CuckooTableReader> new_reader(new CuckooTableReader(ioptions,
-      std::move(file), file_size, icomp.user_comparator(), nullptr));
+  std::unique_ptr<CuckooTableReader> new_reader(new CuckooTableReader(
+      table_reader_options.ioptions, std::move(file), file_size,
+      table_reader_options.internal_comparator.user_comparator(), nullptr));
   Status s = new_reader->status();
   if (s.ok()) {
     *table = std::move(new_reader);
@@ -27,7 +28,7 @@ Status CuckooTableFactory::NewTableReader(const ImmutableCFOptions& ioptions,
 
 TableBuilder* CuckooTableFactory::NewTableBuilder(
     const TableBuilderOptions& table_builder_options,
-    WritableFile* file) const {
+    WritableFileWriter* file) const {
   // Ignore the skipFilters flag; it does not apply to this file format.
   //
 
diff --git a/src/rocksdb/table/cuckoo_table_factory.h b/src/rocksdb/table/cuckoo_table_factory.h
index 0b3729e..394e834 100644
--- a/src/rocksdb/table/cuckoo_table_factory.h
+++ b/src/rocksdb/table/cuckoo_table_factory.h
@@ -18,12 +18,14 @@ static inline uint64_t CuckooHash(
     const Slice& user_key, uint32_t hash_cnt, bool use_module_hash,
     uint64_t table_size_, bool identity_as_first_hash,
     uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t)) {
-#ifndef NDEBUG
-  // This part is used only in unit tests.
+#if !defined NDEBUG || defined OS_WIN
+  // This part is used only in unit tests, but we have to keep it for the
+  // Windows build because we run tests in both debug and release modes there.
   if (get_slice_hash != nullptr) {
     return get_slice_hash(user_key, hash_cnt, table_size_);
   }
 #endif
+
   uint64_t value = 0;
   if (hash_cnt == 0 && identity_as_first_hash) {
     value = (*reinterpret_cast<const int64_t*>(user_key.data()));
@@ -53,15 +55,14 @@ class CuckooTableFactory : public TableFactory {
 
   const char* Name() const override { return "CuckooTable"; }
 
-  Status NewTableReader(
-      const ImmutableCFOptions& ioptions, const EnvOptions& env_options,
-      const InternalKeyComparator& internal_comparator,
-      unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
-      unique_ptr<TableReader>* table) const override;
+  Status NewTableReader(const TableReaderOptions& table_reader_options,
+                        unique_ptr<RandomAccessFileReader>&& file,
+                        uint64_t file_size,
+                        unique_ptr<TableReader>* table) const override;
 
   TableBuilder* NewTableBuilder(
       const TableBuilderOptions& table_builder_options,
-      WritableFile* file) const override;
+      WritableFileWriter* file) const override;
 
   // Sanitizes the specified DB Options.
   Status SanitizeOptions(const DBOptions& db_opts,
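
NewTableReader also moves from a flat parameter list to a parameter object.
A sketch of the shape implied by the hunks above, showing only the two
members this diff actually dereferences; the struct name suffix marks it as
illustrative:

    // Illustrative only; the real TableReaderOptions lives in RocksDB's
    // table headers and may carry further members (e.g. the env_options
    // that disappeared from the flat signature).
    struct TableReaderOptionsSketch {
      const rocksdb::ImmutableCFOptions& ioptions;
      const rocksdb::InternalKeyComparator& internal_comparator;
    };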
diff --git a/src/rocksdb/table/cuckoo_table_reader.cc b/src/rocksdb/table/cuckoo_table_reader.cc
index 7f017ec..8c0329c 100644
--- a/src/rocksdb/table/cuckoo_table_reader.cc
+++ b/src/rocksdb/table/cuckoo_table_reader.cc
@@ -33,8 +33,7 @@ extern const uint64_t kCuckooTableMagicNumber;
 
 CuckooTableReader::CuckooTableReader(
     const ImmutableCFOptions& ioptions,
-    std::unique_ptr<RandomAccessFile>&& file,
-    uint64_t file_size,
+    std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
     const Comparator* comparator,
     uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t))
     : file_(std::move(file)),
@@ -138,13 +137,13 @@ Status CuckooTableReader::Get(const ReadOptions& readOptions, const Slice& key,
     const char* bucket = &file_data_.data()[offset];
     for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_;
          ++block_idx, bucket += bucket_length_) {
-      if (ucomp_->Compare(Slice(unused_key_.data(), user_key.size()),
-                          Slice(bucket, user_key.size())) == 0) {
+      if (ucomp_->Equal(Slice(unused_key_.data(), user_key.size()),
+                        Slice(bucket, user_key.size()))) {
         return Status::OK();
       }
       // Here, we compare only the user key part as we support only one entry
       // per user key and we don't support snapshots.
-      if (ucomp_->Compare(user_key, Slice(bucket, user_key.size())) == 0) {
+      if (ucomp_->Equal(user_key, Slice(bucket, user_key.size()))) {
         Slice value(bucket + key_length_, value_length_);
         if (is_last_level_) {
           get_context->SaveValue(value);
diff --git a/src/rocksdb/table/cuckoo_table_reader.h b/src/rocksdb/table/cuckoo_table_reader.h
index 4f00a9e..6643be0 100644
--- a/src/rocksdb/table/cuckoo_table_reader.h
+++ b/src/rocksdb/table/cuckoo_table_reader.h
@@ -18,6 +18,7 @@
 #include "rocksdb/env.h"
 #include "rocksdb/options.h"
 #include "table/table_reader.h"
+#include "util/file_reader_writer.h"
 
 namespace rocksdb {
 
@@ -26,12 +27,11 @@ class TableReader;
 
 class CuckooTableReader: public TableReader {
  public:
-  CuckooTableReader(
-      const ImmutableCFOptions& ioptions,
-      std::unique_ptr<RandomAccessFile>&& file,
-      uint64_t file_size,
-      const Comparator* user_comparator,
-      uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t));
+  CuckooTableReader(const ImmutableCFOptions& ioptions,
+                    std::unique_ptr<RandomAccessFileReader>&& file,
+                    uint64_t file_size, const Comparator* user_comparator,
+                    uint64_t (*get_slice_hash)(const Slice&, uint32_t,
+                                               uint64_t));
   ~CuckooTableReader() {}
 
   std::shared_ptr<const TableProperties> GetTableProperties() const override {
@@ -57,7 +57,7 @@ class CuckooTableReader: public TableReader {
  private:
   friend class CuckooTableIterator;
   void LoadAllKeys(std::vector<std::pair<Slice, uint32_t>>* key_to_bucket_id);
-  std::unique_ptr<RandomAccessFile> file_;
+  std::unique_ptr<RandomAccessFileReader> file_;
   Slice file_data_;
   bool is_last_level_;
   bool identity_as_first_hash_;
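
The read side mirrors the writer change: table readers now take a
RandomAccessFileReader, so raw files from Env get wrapped first. A hedged
sketch based on the constructor shape visible in the test hunks (the helper
name is illustrative):

    #include <memory>
    #include <string>
    #include "rocksdb/env.h"
    #include "util/file_reader_writer.h"

    // Open a file for random access and wrap it the way the updated
    // readers expect.
    rocksdb::Status OpenWrappedReader(
        rocksdb::Env* env, const std::string& fname,
        std::unique_ptr<rocksdb::RandomAccessFileReader>* out) {
      std::unique_ptr<rocksdb::RandomAccessFile> raw;
      rocksdb::Status s =
          env->NewRandomAccessFile(fname, &raw, rocksdb::EnvOptions());
      if (s.ok()) {
        out->reset(new rocksdb::RandomAccessFileReader(std::move(raw)));
      }
      return s;
    }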
diff --git a/src/rocksdb/table/cuckoo_table_reader_test.cc b/src/rocksdb/table/cuckoo_table_reader_test.cc
index 660261a..f10fcc5 100644
--- a/src/rocksdb/table/cuckoo_table_reader_test.cc
+++ b/src/rocksdb/table/cuckoo_table_reader_test.cc
@@ -3,11 +3,13 @@
 // LICENSE file in the root directory of this source tree. An additional grant
 // of patent rights can be found in the PATENTS file in the same directory.
 
+#ifndef ROCKSDB_LITE
+
 #ifndef GFLAGS
 #include <cstdio>
 int main() {
-  fprintf(stderr, "Please install gflags to run this test\n");
-  return 1;
+  fprintf(stderr, "Please install gflags to run this test... Skipping...\n");
+  return 0;
 }
 #else
 
@@ -62,7 +64,6 @@ uint64_t GetSliceHash(const Slice& s, uint32_t index,
     uint64_t max_num_buckets) {
   return hash_map[s.ToString()][index];
 }
-
 }  // namespace
 
 class CuckooReaderTest : public testing::Test {
@@ -94,9 +95,11 @@ class CuckooReaderTest : public testing::Test {
       const Comparator* ucomp = BytewiseComparator()) {
     std::unique_ptr<WritableFile> writable_file;
     ASSERT_OK(env->NewWritableFile(fname, &writable_file, env_options));
-    CuckooTableBuilder builder(
-        writable_file.get(), 0.9, kNumHashFunc, 100, ucomp, 2,
-        false, false, GetSliceHash);
+    unique_ptr<WritableFileWriter> file_writer(
+        new WritableFileWriter(std::move(writable_file), env_options));
+
+    CuckooTableBuilder builder(file_writer.get(), 0.9, kNumHashFunc, 100, ucomp,
+                               2, false, false, GetSliceHash);
     ASSERT_OK(builder.status());
     for (uint32_t key_idx = 0; key_idx < num_items; ++key_idx) {
       builder.Add(Slice(keys[key_idx]), Slice(values[key_idx]));
@@ -106,18 +109,16 @@ class CuckooReaderTest : public testing::Test {
     ASSERT_OK(builder.Finish());
     ASSERT_EQ(num_items, builder.NumEntries());
     file_size = builder.FileSize();
-    ASSERT_OK(writable_file->Close());
+    ASSERT_OK(file_writer->Close());
 
     // Check reader now.
     std::unique_ptr<RandomAccessFile> read_file;
     ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options));
+    unique_ptr<RandomAccessFileReader> file_reader(
+        new RandomAccessFileReader(std::move(read_file)));
     const ImmutableCFOptions ioptions(options);
-    CuckooTableReader reader(
-        ioptions,
-        std::move(read_file),
-        file_size,
-        ucomp,
-        GetSliceHash);
+    CuckooTableReader reader(ioptions, std::move(file_reader), file_size, ucomp,
+                             GetSliceHash);
     ASSERT_OK(reader.status());
     // Assume no merge/deletion
     for (uint32_t i = 0; i < num_items; ++i) {
@@ -141,13 +142,11 @@ class CuckooReaderTest : public testing::Test {
   void CheckIterator(const Comparator* ucomp = BytewiseComparator()) {
     std::unique_ptr<RandomAccessFile> read_file;
     ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options));
+    unique_ptr<RandomAccessFileReader> file_reader(
+        new RandomAccessFileReader(std::move(read_file)));
     const ImmutableCFOptions ioptions(options);
-    CuckooTableReader reader(
-        ioptions,
-        std::move(read_file),
-        file_size,
-        ucomp,
-        GetSliceHash);
+    CuckooTableReader reader(ioptions, std::move(file_reader), file_size, ucomp,
+                             GetSliceHash);
     ASSERT_OK(reader.status());
     Iterator* it = reader.NewIterator(ReadOptions(), nullptr);
     ASSERT_OK(it->status());
@@ -321,13 +320,11 @@ TEST_F(CuckooReaderTest, WhenKeyNotFound) {
   CreateCuckooFileAndCheckReader();
   std::unique_ptr<RandomAccessFile> read_file;
   ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options));
+  unique_ptr<RandomAccessFileReader> file_reader(
+      new RandomAccessFileReader(std::move(read_file)));
   const ImmutableCFOptions ioptions(options);
-  CuckooTableReader reader(
-      ioptions,
-      std::move(read_file),
-      file_size,
-      ucmp,
-      GetSliceHash);
+  CuckooTableReader reader(ioptions, std::move(file_reader), file_size, ucmp,
+                           GetSliceHash);
   ASSERT_OK(reader.status());
   // Search for a key with colliding hash values.
   std::string not_found_user_key = "key" + NumToStr(num_items);
@@ -406,10 +403,11 @@ void WriteFile(const std::vector<std::string>& keys,
 
   std::unique_ptr<WritableFile> writable_file;
   ASSERT_OK(env->NewWritableFile(fname, &writable_file, env_options));
-  CuckooTableBuilder builder(
-      writable_file.get(), hash_ratio,
-      64, 1000, test::Uint64Comparator(), 5,
-      false, FLAGS_identity_as_first_hash, nullptr);
+  unique_ptr<WritableFileWriter> file_writer(
+      new WritableFileWriter(std::move(writable_file), env_options));
+  CuckooTableBuilder builder(file_writer.get(), hash_ratio, 64, 1000,
+                             test::Uint64Comparator(), 5, false,
+                             FLAGS_identity_as_first_hash, nullptr);
   ASSERT_OK(builder.status());
   for (uint64_t key_idx = 0; key_idx < num; ++key_idx) {
     // Value is just a part of key.
@@ -419,17 +417,18 @@ void WriteFile(const std::vector<std::string>& keys,
   }
   ASSERT_OK(builder.Finish());
   ASSERT_EQ(num, builder.NumEntries());
-  ASSERT_OK(writable_file->Close());
+  ASSERT_OK(file_writer->Close());
 
   uint64_t file_size;
   env->GetFileSize(fname, &file_size);
   std::unique_ptr<RandomAccessFile> read_file;
   ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options));
+  unique_ptr<RandomAccessFileReader> file_reader(
+      new RandomAccessFileReader(std::move(read_file)));
 
   const ImmutableCFOptions ioptions(options);
-  CuckooTableReader reader(
-      ioptions, std::move(read_file), file_size,
-      test::Uint64Comparator(), nullptr);
+  CuckooTableReader reader(ioptions, std::move(file_reader), file_size,
+                           test::Uint64Comparator(), nullptr);
   ASSERT_OK(reader.status());
   ReadOptions r_options;
   std::string value;
@@ -455,11 +454,12 @@ void ReadKeys(uint64_t num, uint32_t batch_size) {
   env->GetFileSize(fname, &file_size);
   std::unique_ptr<RandomAccessFile> read_file;
   ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options));
+  unique_ptr<RandomAccessFileReader> file_reader(
+      new RandomAccessFileReader(std::move(read_file)));
 
   const ImmutableCFOptions ioptions(options);
-  CuckooTableReader reader(
-      ioptions, std::move(read_file), file_size, test::Uint64Comparator(),
-      nullptr);
+  CuckooTableReader reader(ioptions, std::move(file_reader), file_size,
+                           test::Uint64Comparator(), nullptr);
   ASSERT_OK(reader.status());
   const UserCollectedProperties user_props =
     reader.GetTableProperties()->user_collected_properties;
@@ -522,7 +522,8 @@ TEST_F(CuckooReaderTest, TestReadPerformance) {
       "WARNING: Not compiled with DNDEBUG. Performance tests may be slow.\n");
 #endif
   for (uint64_t num : nums) {
-    if (FLAGS_write || !Env::Default()->FileExists(GetFileName(num))) {
+    if (FLAGS_write ||
+        Env::Default()->FileExists(GetFileName(num)).IsNotFound()) {
       std::vector<std::string> all_keys;
       GetKeys(num, &all_keys);
       WriteFile(all_keys, num, hash_ratio);
@@ -544,3 +545,13 @@ int main(int argc, char** argv) {
 }
 
 #endif  // GFLAGS.
+
+#else
+#include <stdio.h>
+
+int main(int argc, char** argv) {
+  fprintf(stderr, "SKIPPED as Cuckoo table is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // ROCKSDB_LITE
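
One behavioral change in this file is easy to miss: Env::FileExists now
returns a Status instead of a bool, so "missing" is detected via
IsNotFound(). A short sketch of the new idiom (helper name illustrative):

    #include <string>
    #include "rocksdb/env.h"

    // Returns true only for a definite "not found"; an OK status means the
    // file exists, and any other status is a real error the caller may
    // want to surface separately.
    bool FileIsMissing(rocksdb::Env* env, const std::string& fname) {
      return env->FileExists(fname).IsNotFound();
    }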
diff --git a/src/rocksdb/table/format.cc b/src/rocksdb/table/format.cc
index ccc345f..a58bbee 100644
--- a/src/rocksdb/table/format.cc
+++ b/src/rocksdb/table/format.cc
@@ -17,6 +17,7 @@
 #include "util/coding.h"
 #include "util/compression.h"
 #include "util/crc32c.h"
+#include "util/file_reader_writer.h"
 #include "util/perf_context_imp.h"
 #include "util/string_util.h"
 #include "util/xxhash.h"
@@ -210,7 +211,7 @@ std::string Footer::ToString() const {
   return result;
 }
 
-Status ReadFooterFromFile(RandomAccessFile* file, uint64_t file_size,
+Status ReadFooterFromFile(RandomAccessFileReader* file, uint64_t file_size,
                           Footer* footer, uint64_t enforce_table_magic_number) {
   if (file_size < Footer::kMinEncodedLength) {
     return Status::Corruption("file is too short to be an sstable");
@@ -249,9 +250,9 @@ namespace {
 // Read a block and check its CRC
 // contents is the result of reading.
 // According to the implementation of file->Read, contents may not point to buf
-Status ReadBlock(RandomAccessFile* file, const Footer& footer,
-                  const ReadOptions& options, const BlockHandle& handle,
-                  Slice* contents,  /* result of reading */ char* buf) {
+Status ReadBlock(RandomAccessFileReader* file, const Footer& footer,
+                 const ReadOptions& options, const BlockHandle& handle,
+                 Slice* contents, /* result of reading */ char* buf) {
   size_t n = static_cast<size_t>(handle.size());
   Status s;
 
@@ -299,7 +300,7 @@ Status ReadBlock(RandomAccessFile* file, const Footer& footer,
 
 }  // namespace
 
-Status ReadBlockContents(RandomAccessFile* file, const Footer& footer,
+Status ReadBlockContents(RandomAccessFileReader* file, const Footer& footer,
                          const ReadOptions& options, const BlockHandle& handle,
                          BlockContents* contents, Env* env,
                          bool decompression_requested) {
@@ -425,6 +426,17 @@ Status UncompressBlockContents(const char* data, size_t n,
       *contents =
           BlockContents(std::move(ubuf), decompress_size, true, kNoCompression);
       break;
+    case kZSTDNotFinalCompression:
+      ubuf =
+          std::unique_ptr<char[]>(ZSTD_Uncompress(data, n, &decompress_size));
+      if (!ubuf) {
+        static char zstd_corrupt_msg[] =
+            "ZSTD not supported or corrupted ZSTD compressed block contents";
+        return Status::Corruption(zstd_corrupt_msg);
+      }
+      *contents =
+          BlockContents(std::move(ubuf), decompress_size, true, kNoCompression);
+      break;
     default:
       return Status::Corruption("bad block type");
   }
diff --git a/src/rocksdb/table/format.h b/src/rocksdb/table/format.h
index 900a071..74ec808 100644
--- a/src/rocksdb/table/format.h
+++ b/src/rocksdb/table/format.h
@@ -166,7 +166,7 @@ class Footer {
 // Read the footer from file
 // If enforce_table_magic_number != 0, ReadFooterFromFile() will return
 // corruption if table_magic number is not equal to enforce_table_magic_number
-Status ReadFooterFromFile(RandomAccessFile* file, uint64_t file_size,
+Status ReadFooterFromFile(RandomAccessFileReader* file, uint64_t file_size,
                           Footer* footer,
                           uint64_t enforce_table_magic_number = 0);
 
@@ -191,11 +191,22 @@ struct BlockContents {
         cachable(_cachable),
         compression_type(_compression_type),
         allocation(std::move(_data)) {}
+
+  BlockContents(BlockContents&& other) { *this = std::move(other); }
+
+  BlockContents& operator=(BlockContents&& other) {
+    data = std::move(other.data);
+    cachable = other.cachable;
+    compression_type = other.compression_type;
+    allocation = std::move(other.allocation);
+    return *this;
+  }
 };
 
 // Read the block identified by "handle" from "file".  On failure
 // return non-OK.  On success fill *result and return OK.
-extern Status ReadBlockContents(RandomAccessFile* file, const Footer& footer,
+extern Status ReadBlockContents(RandomAccessFileReader* file,
+                                const Footer& footer,
                                 const ReadOptions& options,
                                 const BlockHandle& handle,
                                 BlockContents* contents, Env* env,
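
The explicit move constructor and move assignment added to BlockContents
follow a pattern needed for older compilers: MSVC before 2015 does not
implicitly generate move operations, so a type holding a move-only member
(here the unique_ptr allocation) spells them out. A minimal sketch of the
same pattern, assuming that motivation:

    #include <cstddef>
    #include <memory>
    #include <utility>

    // Holder owns a heap buffer through unique_ptr; defining the move
    // operations by hand keeps it movable even where implicit move
    // generation is unavailable.
    struct Holder {
      std::unique_ptr<char[]> buf;
      size_t len = 0;
      Holder() = default;
      Holder(Holder&& other) { *this = std::move(other); }
      Holder& operator=(Holder&& other) {
        buf = std::move(other.buf);
        len = other.len;
        return *this;
      }
    };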
diff --git a/src/rocksdb/table/full_filter_block.cc b/src/rocksdb/table/full_filter_block.cc
index b3afdac..3744d41 100644
--- a/src/rocksdb/table/full_filter_block.cc
+++ b/src/rocksdb/table/full_filter_block.cc
@@ -8,6 +8,7 @@
 #include "rocksdb/filter_policy.h"
 #include "port/port.h"
 #include "util/coding.h"
+#include "util/perf_context_imp.h"
 
 namespace rocksdb {
 
@@ -89,7 +90,13 @@ bool FullFilterBlockReader::PrefixMayMatch(const Slice& prefix,
 
 bool FullFilterBlockReader::MayMatch(const Slice& entry) {
   if (contents_.size() != 0)  {
-    return filter_bits_reader_->MayMatch(entry);
+    if (filter_bits_reader_->MayMatch(entry)) {
+      PERF_COUNTER_ADD(bloom_sst_hit_count, 1);
+      return true;
+    } else {
+      PERF_COUNTER_ADD(bloom_sst_miss_count, 1);
+      return false;
+    }
   }
   return true;  // stay consistent with the block-based filter
 }
diff --git a/src/rocksdb/table/get_context.cc b/src/rocksdb/table/get_context.cc
index e83aa1d..609ca30 100644
--- a/src/rocksdb/table/get_context.cc
+++ b/src/rocksdb/table/get_context.cc
@@ -12,6 +12,24 @@
 
 namespace rocksdb {
 
+namespace {
+
+void appendToReplayLog(std::string* replay_log, ValueType type, Slice value) {
+#ifndef ROCKSDB_LITE
+  if (replay_log) {
+    if (replay_log->empty()) {
+      // Optimization: in the common case of only one operation in the
+      // log, we allocate the exact amount of space needed.
+      replay_log->reserve(1 + VarintLength(value.size()) + value.size());
+    }
+    replay_log->push_back(type);
+    PutLengthPrefixedSlice(replay_log, value);
+  }
+#endif  // ROCKSDB_LITE
+}
+
+}  // namespace
+
 GetContext::GetContext(const Comparator* ucmp,
                        const MergeOperator* merge_operator, Logger* logger,
                        Statistics* statistics, GetState init_state,
@@ -26,7 +44,8 @@ GetContext::GetContext(const Comparator* ucmp,
       value_(ret_value),
       value_found_(value_found),
       merge_context_(merge_context),
-      env_(env) {}
+      env_(env),
+      replay_log_(nullptr) {}
 
 // Called from TableCache::Get and Table::Get when file/block in which
 // key may exist are not there in TableCache/BlockCache respectively. In this
@@ -41,6 +60,9 @@ void GetContext::MarkKeyMayExist() {
 }
 
 void GetContext::SaveValue(const Slice& value) {
+  assert(state_ == kNotFound);
+  appendToReplayLog(replay_log_, kTypeValue, value);
+
   state_ = kFound;
   value_->assign(value.data(), value.size());
 }
@@ -49,7 +71,9 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key,
                            const Slice& value) {
   assert((state_ != kMerge && parsed_key.type != kTypeMerge) ||
          merge_context_ != nullptr);
-  if (ucmp_->Compare(parsed_key.user_key, user_key_) == 0) {
+  if (ucmp_->Equal(parsed_key.user_key, user_key_)) {
+    appendToReplayLog(replay_log_, parsed_key.type, value);
+
     // Key matches. Process it
     switch (parsed_key.type) {
       case kTypeValue:
@@ -68,7 +92,7 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key,
                 user_key_, &value, merge_context_->GetOperands(), value_,
                 logger_);
             RecordTick(statistics_, MERGE_OPERATION_TOTAL_TIME,
-                       env_ != nullptr ? timer.ElapsedNanos() : 0);
+                       timer.ElapsedNanosSafe());
           }
           if (!merge_success) {
             RecordTick(statistics_, NUMBER_MERGE_FAILURES);
@@ -78,6 +102,9 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key,
         return false;
 
       case kTypeDeletion:
+      case kTypeSingleDeletion:
+        // TODO(noetzli): Verify correctness once merge of single-deletes
+        // is supported
         assert(state_ == kNotFound || state_ == kMerge);
         if (kNotFound == state_) {
           state_ = kDeleted;
@@ -91,7 +118,7 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key,
                 user_key_, nullptr, merge_context_->GetOperands(), value_,
                 logger_);
             RecordTick(statistics_, MERGE_OPERATION_TOTAL_TIME,
-                       env_ != nullptr ? timer.ElapsedNanos() : 0);
+                       timer.ElapsedNanosSafe());
           }
           if (!merge_success) {
             RecordTick(statistics_, NUMBER_MERGE_FAILURES);
@@ -116,4 +143,23 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key,
   return false;
 }
 
+void replayGetContextLog(const Slice& replay_log, const Slice& user_key,
+                         GetContext* get_context) {
+#ifndef ROCKSDB_LITE
+  Slice s = replay_log;
+  while (s.size()) {
+    auto type = static_cast<ValueType>(*s.data());
+    s.remove_prefix(1);
+    Slice value;
+    bool ret = GetLengthPrefixedSlice(&s, &value);
+    assert(ret);
+    (void)ret;
+    // Sequence number is ignored in SaveValue, so we just pass 0.
+    get_context->SaveValue(ParsedInternalKey(user_key, 0, type), value);
+  }
+#else   // ROCKSDB_LITE
+  assert(false);
+#endif  // ROCKSDB_LITE
+}
+
 }  // namespace rocksdb
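
A standalone sketch of the replay-log record layout used by
appendToReplayLog above: one ValueType byte, then a varint32 length, then
the value bytes. RocksDB's PutLengthPrefixedSlice/GetLengthPrefixedSlice
from util/coding.h produce this format; the helpers below only mimic it for
illustration:

    #include <cassert>
    #include <cstdint>
    #include <string>

    // Little-endian base-128 varint, as in util/coding.h: low 7 bits per
    // byte, high bit set on every byte except the last.
    static void PutVarint32(std::string* dst, uint32_t v) {
      while (v >= 128) {
        dst->push_back(static_cast<char>(v | 128));
        v >>= 7;
      }
      dst->push_back(static_cast<char>(v));
    }

    static void AppendRecord(std::string* log, char type,
                             const std::string& value) {
      log->push_back(type);                                // ValueType tag
      PutVarint32(log, static_cast<uint32_t>(value.size()));
      log->append(value);                                  // payload
    }

    int main() {
      std::string log;
      AppendRecord(&log, /*type=*/1, "v01");
      // 1 tag byte + 1 length byte + 3 payload bytes: exactly the size the
      // single-operation reserve() optimization above accounts for.
      assert(log.size() == 1 + 1 + 3);
      return 0;
    }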
diff --git a/src/rocksdb/table/get_context.h b/src/rocksdb/table/get_context.h
index 700f23a..2c2dd8e 100644
--- a/src/rocksdb/table/get_context.h
+++ b/src/rocksdb/table/get_context.h
@@ -31,6 +31,11 @@ class GetContext {
   bool SaveValue(const ParsedInternalKey& parsed_key, const Slice& value);
   GetState State() const { return state_; }
 
+  // If a non-null string is passed, all the SaveValue calls will be
+  // logged into the string. The operations can then be replayed on
+  // another GetContext with replayGetContextLog.
+  void SetReplayLog(std::string* replay_log) { replay_log_ = replay_log; }
+
  private:
   const Comparator* ucmp_;
   const MergeOperator* merge_operator_;
@@ -44,6 +49,10 @@ class GetContext {
   bool* value_found_;  // Is value set correctly? Used by KeyMayExist
   MergeContext* merge_context_;
   Env* env_;
+  std::string* replay_log_;
 };
 
+void replayGetContextLog(const Slice& replay_log, const Slice& user_key,
+                         GetContext* get_context);
+
 }  // namespace rocksdb
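
A hedged usage sketch for the new hooks, assuming an in-tree caller (both
GetContext instances and the surrounding plumbing are placeholders):

    #include <string>
    #include "table/get_context.h"

    // Record every SaveValue call made while one lookup runs, then replay
    // the identical sequence into a second context.
    void RecordAndReplay(rocksdb::GetContext* source,
                         rocksdb::GetContext* target,
                         const rocksdb::Slice& user_key) {
      std::string replay_log;
      source->SetReplayLog(&replay_log);  // start recording
      // ... a table reader drives source->SaveValue(...) here ...
      source->SetReplayLog(nullptr);      // stop recording
      rocksdb::replayGetContextLog(rocksdb::Slice(replay_log), user_key,
                                   target);
    }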
diff --git a/src/rocksdb/table/iter_heap.h b/src/rocksdb/table/iter_heap.h
index 9569d36..5343175 100644
--- a/src/rocksdb/table/iter_heap.h
+++ b/src/rocksdb/table/iter_heap.h
@@ -5,36 +5,34 @@
 //
 
 #pragma once
-#include <queue>
 
 #include "rocksdb/comparator.h"
 #include "table/iterator_wrapper.h"
 
 namespace rocksdb {
 
-// Return the max of two keys.
+// When used with std::priority_queue, this comparison functor puts the
+// iterator with the max/largest key on top.
 class MaxIteratorComparator {
  public:
   MaxIteratorComparator(const Comparator* comparator) :
     comparator_(comparator) {}
 
-  bool operator()(IteratorWrapper* a, IteratorWrapper* b) {
-    return comparator_->Compare(a->key(), b->key()) <= 0;
+  bool operator()(IteratorWrapper* a, IteratorWrapper* b) const {
+    return comparator_->Compare(a->key(), b->key()) < 0;
   }
  private:
   const Comparator* comparator_;
 };
 
-// Return the max of two keys.
+// When used with std::priority_queue, this comparison functor puts the
+// iterator with the min/smallest key on top.
 class MinIteratorComparator {
  public:
-  // if maxHeap is set comparator returns the max value.
-  // else returns the min Value.
-  // Can use to create a minHeap or a maxHeap.
   MinIteratorComparator(const Comparator* comparator) :
     comparator_(comparator) {}
 
-  bool operator()(IteratorWrapper* a, IteratorWrapper* b) {
+  bool operator()(IteratorWrapper* a, IteratorWrapper* b) const {
     return comparator_->Compare(a->key(), b->key()) > 0;
   }
  private:
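
The change from "Compare(...) <= 0" to "Compare(...) < 0" in
MaxIteratorComparator is a correctness fix, not a cosmetic one: a heap
comparator must be a strict weak ordering, and "a <= b" is reflexive
(cmp(x, x) returns true), which violates that requirement for
std::priority_queue and BinaryHeap alike. A self-contained illustration
with ints standing in for iterator keys:

    #include <cassert>
    #include <queue>
    #include <vector>

    int main() {
      // Strictly-less comparator: irreflexive, hence a valid strict weak
      // ordering. With std::priority_queue this yields a max-heap.
      auto strictly_less = [](int a, int b) { return a < b; };
      std::priority_queue<int, std::vector<int>, decltype(strictly_less)>
          max_heap(strictly_less);
      for (int v : {3, 1, 2}) max_heap.push(v);
      assert(max_heap.top() == 3);  // largest key on top
      return 0;
    }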
diff --git a/src/rocksdb/table/merger.cc b/src/rocksdb/table/merger.cc
index b418b88..242587e 100644
--- a/src/rocksdb/table/merger.cc
+++ b/src/rocksdb/table/merger.cc
@@ -10,7 +10,6 @@
 #include "table/merger.h"
 
 #include <vector>
-#include <queue>
 
 #include "rocksdb/comparator.h"
 #include "rocksdb/iterator.h"
@@ -18,28 +17,17 @@
 #include "table/iter_heap.h"
 #include "table/iterator_wrapper.h"
 #include "util/arena.h"
+#include "util/heap.h"
 #include "util/stop_watch.h"
+#include "util/sync_point.h"
 #include "util/perf_context_imp.h"
 #include "util/autovector.h"
 
 namespace rocksdb {
 // Without an anonymous namespace here, we would trip -Wmissing-prototypes
 namespace {
-typedef std::priority_queue<IteratorWrapper*, std::vector<IteratorWrapper*>,
-                            MaxIteratorComparator> MergerMaxIterHeap;
-
-typedef std::priority_queue<IteratorWrapper*, std::vector<IteratorWrapper*>,
-                            MinIteratorComparator> MergerMinIterHeap;
-
-// Return's a new MaxHeap of IteratorWrapper's using the provided Comparator.
-MergerMaxIterHeap NewMergerMaxIterHeap(const Comparator* comparator) {
-  return MergerMaxIterHeap(MaxIteratorComparator(comparator));
-}
-
-// Return's a new MinHeap of IteratorWrapper's using the provided Comparator.
-MergerMinIterHeap NewMergerMinIterHeap(const Comparator* comparator) {
-  return MergerMinIterHeap(MinIteratorComparator(comparator));
-}
+typedef BinaryHeap<IteratorWrapper*, MaxIteratorComparator> MergerMaxIterHeap;
+typedef BinaryHeap<IteratorWrapper*, MinIteratorComparator> MergerMinIterHeap;
 }  // namespace
 
 const size_t kNumIterReserve = 4;
@@ -51,10 +39,8 @@ class MergingIterator : public Iterator {
       : is_arena_mode_(is_arena_mode),
         comparator_(comparator),
         current_(nullptr),
-        use_heap_(true),
         direction_(kForward),
-        maxHeap_(NewMergerMaxIterHeap(comparator_)),
-        minHeap_(NewMergerMinIterHeap(comparator_)) {
+        minHeap_(comparator_) {
     children_.resize(n);
     for (int i = 0; i < n; i++) {
       children_[i].Set(children[i]);
@@ -64,6 +50,7 @@ class MergingIterator : public Iterator {
         minHeap_.push(&child);
       }
     }
+    current_ = CurrentForward();
   }
 
   virtual void AddIterator(Iterator* iter) {
@@ -72,6 +59,7 @@ class MergingIterator : public Iterator {
     auto new_wrapper = children_.back();
     if (new_wrapper.Valid()) {
       minHeap_.push(&new_wrapper);
+      current_ = CurrentForward();
     }
   }
 
@@ -91,27 +79,25 @@ class MergingIterator : public Iterator {
         minHeap_.push(&child);
       }
     }
-    FindSmallest();
     direction_ = kForward;
+    current_ = CurrentForward();
   }
 
   virtual void SeekToLast() override {
     ClearHeaps();
+    InitMaxHeap();
     for (auto& child : children_) {
       child.SeekToLast();
       if (child.Valid()) {
-        maxHeap_.push(&child);
+        maxHeap_->push(&child);
       }
     }
-    FindLargest();
     direction_ = kReverse;
+    current_ = CurrentReverse();
   }
 
   virtual void Seek(const Slice& target) override {
-    // Invalidate the heap.
-    use_heap_ = false;
-    IteratorWrapper* first_child = nullptr;
-
+    ClearHeaps();
     for (auto& child : children_) {
       {
         PERF_TIMER_GUARD(seek_child_seek_time);
@@ -120,36 +106,15 @@ class MergingIterator : public Iterator {
       PERF_COUNTER_ADD(seek_child_seek_count, 1);
 
       if (child.Valid()) {
-        // This child has valid key
-        if (!use_heap_) {
-          if (first_child == nullptr) {
-            // It's the first child has valid key. Only put it int
-            // current_. Now the values in the heap should be invalid.
-            first_child = &child;
-          } else {
-            // We have more than one children with valid keys. Initialize
-            // the heap and put the first child into the heap.
-            PERF_TIMER_GUARD(seek_min_heap_time);
-            ClearHeaps();
-            minHeap_.push(first_child);
-          }
-        }
-        if (use_heap_) {
-          PERF_TIMER_GUARD(seek_min_heap_time);
-          minHeap_.push(&child);
-        }
+        PERF_TIMER_GUARD(seek_min_heap_time);
+        minHeap_.push(&child);
       }
     }
-    if (use_heap_) {
-      // If heap is valid, need to put the smallest key to curent_.
+    direction_ = kForward;
+    {
       PERF_TIMER_GUARD(seek_min_heap_time);
-      FindSmallest();
-    } else {
-      // The heap is not valid, then the current_ iterator is the first
-      // one, or null if there is no first child.
-      current_ = first_child;
+      current_ = CurrentForward();
     }
-    direction_ = kForward;
   }
 
   virtual void Next() override {
@@ -157,71 +122,98 @@ class MergingIterator : public Iterator {
 
     // Ensure that all children are positioned after key().
     // If we are moving in the forward direction, it is already
-    // true for all of the non-current_ children since current_ is
-    // the smallest child and key() == current_->key().  Otherwise,
-    // we explicitly position the non-current_ children.
+    // true for all of the non-current children since current_ is
+    // the smallest child and key() == current_->key().
     if (direction_ != kForward) {
+      // Otherwise, advance the non-current children.  We advance current_
+      // just after the if-block.
       ClearHeaps();
       for (auto& child : children_) {
         if (&child != current_) {
           child.Seek(key());
-          if (child.Valid() &&
-              comparator_->Compare(key(), child.key()) == 0) {
+          if (child.Valid() && comparator_->Equal(key(), child.key())) {
             child.Next();
           }
-          if (child.Valid()) {
-            minHeap_.push(&child);
-          }
+        }
+        if (child.Valid()) {
+          minHeap_.push(&child);
         }
       }
       direction_ = kForward;
+      // The loop advanced all non-current children to be > key() so current_
+      // should still be strictly the smallest key.
+      assert(current_ == CurrentForward());
     }
 
+    // For the heap modifications below to be correct, current_ must be the
+    // current top of the heap.
+    assert(current_ == CurrentForward());
+
    // current_ points to the current record; move the iterator forward.
-    // and if it is valid add it to the heap.
     current_->Next();
-    if (use_heap_) {
-      if (current_->Valid()) {
-        minHeap_.push(current_);
-      }
-      FindSmallest();
-    } else if (!current_->Valid()) {
-      current_ = nullptr;
+    if (current_->Valid()) {
+      // current is still valid after the Next() call above.  Call
+      // replace_top() to restore the heap property.  When the same child
+      // iterator yields a sequence of keys, this is cheap.
+      minHeap_.replace_top(current_);
+    } else {
+      // current stopped being valid, remove it from the heap.
+      minHeap_.pop();
     }
+    current_ = CurrentForward();
   }
 
   virtual void Prev() override {
     assert(Valid());
     // Ensure that all children are positioned before key().
     // If we are moving in the reverse direction, it is already
-    // true for all of the non-current_ children since current_ is
-    // the largest child and key() == current_->key().  Otherwise,
-    // we explicitly position the non-current_ children.
+    // true for all of the non-current children since current_ is
+    // the largest child and key() == current_->key().
     if (direction_ != kReverse) {
+      // Otherwise, retreat the non-current children.  We retreat current_
+      // just after the if-block.
       ClearHeaps();
+      InitMaxHeap();
       for (auto& child : children_) {
         if (&child != current_) {
           child.Seek(key());
           if (child.Valid()) {
             // Child is at first entry >= key().  Step back one to be < key()
+            TEST_SYNC_POINT_CALLBACK("MergeIterator::Prev:BeforePrev", &child);
             child.Prev();
           } else {
             // Child has no entries >= key().  Position at last entry.
+            TEST_SYNC_POINT("MergeIterator::Prev:BeforeSeekToLast");
             child.SeekToLast();
           }
-          if (child.Valid()) {
-            maxHeap_.push(&child);
-          }
+        }
+        if (child.Valid()) {
+          maxHeap_->push(&child);
         }
       }
       direction_ = kReverse;
+      // Note that we don't do assert(current_ == CurrentReverse()) here
+      // because it is possible to have some keys larger than the seek-key
+      // inserted between Seek() and SeekToLast(), which makes current_ not
+      // equal to CurrentReverse().
+      current_ = CurrentReverse();
     }
 
+    // For the heap modifications below to be correct, current_ must be the
+    // current top of the heap.
+    assert(current_ == CurrentReverse());
+
     current_->Prev();
     if (current_->Valid()) {
-      maxHeap_.push(current_);
+      // current is still valid after the Prev() call above.  Call
+      // replace_top() to restore the heap property.  When the same child
+      // iterator yields a sequence of keys, this is cheap.
+      maxHeap_->replace_top(current_);
+    } else {
+      // current stopped being valid, remove it from the heap.
+      maxHeap_->pop();
     }
-    FindLargest();
+    current_ = CurrentReverse();
   }
 
   virtual Slice key() const override {
@@ -246,56 +238,54 @@ class MergingIterator : public Iterator {
   }
 
  private:
-  void FindSmallest();
-  void FindLargest();
+  // Clears heaps for both directions, used when changing direction or seeking
   void ClearHeaps();
+  // Ensures that maxHeap_ is initialized when starting to go in the reverse
+  // direction
+  void InitMaxHeap();
 
   bool is_arena_mode_;
   const Comparator* comparator_;
   autovector<IteratorWrapper, kNumIterReserve> children_;
+
+  // Cached pointer to child iterator with the current key, or nullptr if no
+  // child iterators are valid.  This is the top of minHeap_ or maxHeap_
+  // depending on the direction.
   IteratorWrapper* current_;
-  // If the value is true, both of iterators in the heap and current_
-  // contain valid rows. If it is false, only current_ can possibly contain
-  // valid rows.
-  // This flag is always true for reverse direction, as we always use heap for
-  // the reverse iterating case.
-  bool use_heap_;
   // Which direction is the iterator moving?
   enum Direction {
     kForward,
     kReverse
   };
   Direction direction_;
-  MergerMaxIterHeap maxHeap_;
   MergerMinIterHeap minHeap_;
-};
+  // Max heap is used for reverse iteration, which is way less common than
+  // forward.  Lazily initialize it to save memory.
+  std::unique_ptr<MergerMaxIterHeap> maxHeap_;
 
-void MergingIterator::FindSmallest() {
-  assert(use_heap_);
-  if (minHeap_.empty()) {
-    current_ = nullptr;
-  } else {
-    current_ = minHeap_.top();
-    assert(current_->Valid());
-    minHeap_.pop();
+  IteratorWrapper* CurrentForward() const {
+    assert(direction_ == kForward);
+    return !minHeap_.empty() ? minHeap_.top() : nullptr;
   }
-}
 
-void MergingIterator::FindLargest() {
-  assert(use_heap_);
-  if (maxHeap_.empty()) {
-    current_ = nullptr;
-  } else {
-    current_ = maxHeap_.top();
-    assert(current_->Valid());
-    maxHeap_.pop();
+  IteratorWrapper* CurrentReverse() const {
+    assert(direction_ == kReverse);
+    assert(maxHeap_);
+    return !maxHeap_->empty() ? maxHeap_->top() : nullptr;
   }
-}
+};
 
 void MergingIterator::ClearHeaps() {
-  use_heap_ = true;
-  maxHeap_ = NewMergerMaxIterHeap(comparator_);
-  minHeap_ = NewMergerMinIterHeap(comparator_);
+  minHeap_.clear();
+  if (maxHeap_) {
+    maxHeap_->clear();
+  }
+}
+
+void MergingIterator::InitMaxHeap() {
+  if (!maxHeap_) {
+    maxHeap_.reset(new MergerMaxIterHeap(comparator_));
+  }
 }
 
 Iterator* NewMergingIterator(const Comparator* cmp, Iterator** list, int n,
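
A quick aside on the replace_top() idiom the rewritten Next()/Prev() lean on:
overwriting the heap's root and sifting it down once is roughly half the work
of a pop() followed by a push(), which sifts twice. The following is a minimal
stand-in sketch with the same push/pop/top/replace_top/clear surface as the
BinaryHeap referenced above; it is not rocksdb's util/heap.h implementation:

    #include <cassert>
    #include <cstddef>
    #include <cstdio>
    #include <functional>
    #include <utility>
    #include <vector>

    // Compare follows the std::priority_queue convention: std::greater<T>
    // puts the smallest element at top(), the shape minHeap_ needs.
    template <typename T, typename Compare>
    class BinaryHeapSketch {
     public:
      bool empty() const { return data_.empty(); }
      const T& top() const { return data_.front(); }
      void clear() { data_.clear(); }
      void push(const T& v) {
        data_.push_back(v);
        sift_up(data_.size() - 1);
      }
      void pop() {
        data_.front() = data_.back();
        data_.pop_back();
        if (!data_.empty()) sift_down(0);
      }
      // The operation Next()/Prev() now rely on: overwrite the root and
      // restore the heap property with a single sift-down.  pop() then push()
      // would sift twice, so this is cheaper when one child iterator yields a
      // run of adjacent keys.
      void replace_top(const T& v) {
        assert(!data_.empty());
        data_.front() = v;
        sift_down(0);
      }

     private:
      void sift_up(std::size_t i) {
        while (i > 0) {
          std::size_t parent = (i - 1) / 2;
          if (!cmp_(data_[parent], data_[i])) break;  // heap property holds
          std::swap(data_[parent], data_[i]);
          i = parent;
        }
      }
      void sift_down(std::size_t i) {
        for (;;) {
          std::size_t l = 2 * i + 1, r = l + 1, best = i;
          if (l < data_.size() && cmp_(data_[best], data_[l])) best = l;
          if (r < data_.size() && cmp_(data_[best], data_[r])) best = r;
          if (best == i) break;
          std::swap(data_[i], data_[best]);
          i = best;
        }
      }
      Compare cmp_;
      std::vector<T> data_;
    };

    int main() {
      BinaryHeapSketch<int, std::greater<int>> h;
      h.push(5);
      h.push(1);
      h.push(3);
      h.replace_top(4);              // the child that yielded 1 now yields 4
      std::printf("%d\n", h.top());  // prints 3
      return 0;
    }

With std::greater<int> the smallest element sits at top(), mirroring how
minHeap_ keeps the child holding the smallest key on top; when that child
yields the next key in a run, replace_top() sifts only that one element.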
diff --git a/src/rocksdb/table/merger_test.cc b/src/rocksdb/table/merger_test.cc
index 1085ce4..562c0ae 100644
--- a/src/rocksdb/table/merger_test.cc
+++ b/src/rocksdb/table/merger_test.cc
@@ -5,45 +5,13 @@
 
 #include <vector>
 #include <string>
-#include <algorithm>
 
-#include "rocksdb/iterator.h"
 #include "table/merger.h"
 #include "util/testharness.h"
 #include "util/testutil.h"
 
 namespace rocksdb {
 
-class VectorIterator : public Iterator {
- public:
-  explicit VectorIterator(const std::vector<std::string>& keys)
-      : keys_(keys), current_(keys.size()) {
-    std::sort(keys_.begin(), keys_.end());
-  }
-
-  virtual bool Valid() const override { return current_ < keys_.size(); }
-
-  virtual void SeekToFirst() override { current_ = 0; }
-  virtual void SeekToLast() override { current_ = keys_.size() - 1; }
-
-  virtual void Seek(const Slice& target) override {
-    current_ = std::lower_bound(keys_.begin(), keys_.end(), target.ToString()) -
-               keys_.begin();
-  }
-
-  virtual void Next() override { current_++; }
-  virtual void Prev() override { current_--; }
-
-  virtual Slice key() const override { return Slice(keys_[current_]); }
-  virtual Slice value() const override { return Slice(); }
-
-  virtual Status status() const override { return Status::OK(); }
-
- private:
-  std::vector<std::string> keys_;
-  size_t current_;
-};
-
 class MergerTest : public testing::Test {
  public:
   MergerTest()
@@ -123,14 +91,14 @@ class MergerTest : public testing::Test {
     std::vector<Iterator*> small_iterators;
     for (size_t i = 0; i < num_iterators; ++i) {
       auto strings = GenerateStrings(strings_per_iterator, letters_per_string);
-      small_iterators.push_back(new VectorIterator(strings));
+      small_iterators.push_back(new test::VectorIterator(strings));
       all_keys_.insert(all_keys_.end(), strings.begin(), strings.end());
     }
 
     merging_iterator_.reset(
         NewMergingIterator(BytewiseComparator(), &small_iterators[0],
                            static_cast<int>(small_iterators.size())));
-    single_iterator_.reset(new VectorIterator(all_keys_));
+    single_iterator_.reset(new test::VectorIterator(all_keys_));
   }
 
   Random rnd_;
diff --git a/src/rocksdb/table/meta_blocks.cc b/src/rocksdb/table/meta_blocks.cc
index 6fad808..7bcdf75 100644
--- a/src/rocksdb/table/meta_blocks.cc
+++ b/src/rocksdb/table/meta_blocks.cc
@@ -8,9 +8,9 @@
 #include <string>
 
 #include "db/table_properties_collector.h"
-#include "table/block.h"
 #include "rocksdb/table.h"
 #include "rocksdb/table_properties.h"
+#include "table/block.h"
 #include "table/format.h"
 #include "table/table_properties_internal.h"
 #include "util/coding.h"
@@ -129,9 +129,9 @@ bool NotifyCollectTableCollectorsOnFinish(
   return all_succeeded;
 }
 
-Status ReadProperties(const Slice &handle_value, RandomAccessFile *file,
-                      const Footer &footer, Env *env, Logger *logger,
-                      TableProperties **table_properties) {
+Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file,
+                      const Footer& footer, Env* env, Logger* logger,
+                      TableProperties** table_properties) {
   assert(table_properties);
 
   Slice v = handle_value;
@@ -217,7 +217,7 @@ Status ReadProperties(const Slice &handle_value, RandomAccessFile *file,
   return s;
 }
 
-Status ReadTableProperties(RandomAccessFile* file, uint64_t file_size,
+Status ReadTableProperties(RandomAccessFileReader* file, uint64_t file_size,
                            uint64_t table_magic_number, Env* env,
                            Logger* info_log, TableProperties** properties) {
   // -- Read metaindex block
@@ -271,7 +271,7 @@ Status FindMetaBlock(Iterator* meta_index_iter,
   }
 }
 
-Status FindMetaBlock(RandomAccessFile* file, uint64_t file_size,
+Status FindMetaBlock(RandomAccessFileReader* file, uint64_t file_size,
                      uint64_t table_magic_number, Env* env,
                      const std::string& meta_block_name,
                      BlockHandle* block_handle) {
@@ -298,7 +298,7 @@ Status FindMetaBlock(RandomAccessFile* file, uint64_t file_size,
   return FindMetaBlock(meta_iter.get(), meta_block_name, block_handle);
 }
 
-Status ReadMetaBlock(RandomAccessFile* file, uint64_t file_size,
+Status ReadMetaBlock(RandomAccessFileReader* file, uint64_t file_size,
                      uint64_t table_magic_number, Env* env,
                      const std::string& meta_block_name,
                      BlockContents* contents) {
diff --git a/src/rocksdb/table/meta_blocks.h b/src/rocksdb/table/meta_blocks.h
index 7ac3cb0..005bcaa 100644
--- a/src/rocksdb/table/meta_blocks.h
+++ b/src/rocksdb/table/meta_blocks.h
@@ -6,8 +6,8 @@
 
 #include <map>
 #include <memory>
-#include <vector>
 #include <string>
+#include <vector>
 
 #include "db/builder.h"
 #include "db/table_properties_collector.h"
@@ -16,6 +16,7 @@
 #include "rocksdb/slice.h"
 #include "table/block_builder.h"
 #include "table/format.h"
+#include "util/stl_wrappers.h"
 
 namespace rocksdb {
 
@@ -27,22 +28,6 @@ class Logger;
 class RandomAccessFile;
 struct TableProperties;
 
-// An STL style comparator that does the bytewise comparator comparasion
-// internally.
-struct BytewiseLessThan {
-  bool operator()(const std::string& key1, const std::string& key2) const {
-    // smaller entries will be placed in front.
-    return comparator->Compare(key1, key2) <= 0;
-  }
-
-  const Comparator* comparator = BytewiseComparator();
-};
-
-// When writing to a block that requires entries to be sorted by
-// `BytewiseComparator`, we can buffer the content to `BytewiseSortedMap`
-// before writng to store.
-typedef std::map<std::string, std::string, BytewiseLessThan> BytewiseSortedMap;
-
 class MetaIndexBuilder {
  public:
   MetaIndexBuilder(const MetaIndexBuilder&) = delete;
@@ -57,7 +42,7 @@ class MetaIndexBuilder {
 
  private:
   // store the sorted key/handle of the metablocks.
-  BytewiseSortedMap meta_block_handles_;
+  stl_wrappers::KVMap meta_block_handles_;
   std::unique_ptr<BlockBuilder> meta_index_block_;
 };
 
@@ -78,7 +63,7 @@ class PropertyBlockBuilder {
 
  private:
   std::unique_ptr<BlockBuilder> properties_block_;
-  BytewiseSortedMap props_;
+  stl_wrappers::KVMap props_;
 };
 
 // Where we encounter any error during user-defined statistics collection,
@@ -107,26 +92,25 @@ bool NotifyCollectTableCollectorsOnFinish(
 // @returns a status to indicate if the operation succeeded. On success,
 //          *table_properties will point to a heap-allocated TableProperties
 //          object, otherwise value of `table_properties` will not be modified.
-Status ReadProperties(const Slice &handle_value, RandomAccessFile *file,
-                      const Footer &footer, Env *env, Logger *logger,
-                      TableProperties **table_properties);
+Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file,
+                      const Footer& footer, Env* env, Logger* logger,
+                      TableProperties** table_properties);
 
 // Directly read the properties from the properties block of a plain table.
 // @returns a status to indicate if the operation succeeded. On success,
 //          *table_properties will point to a heap-allocated TableProperties
 //          object, otherwise value of `table_properties` will not be modified.
-Status ReadTableProperties(RandomAccessFile* file, uint64_t file_size,
+Status ReadTableProperties(RandomAccessFileReader* file, uint64_t file_size,
                            uint64_t table_magic_number, Env* env,
                            Logger* info_log, TableProperties** properties);
 
-
 // Find the meta block from the meta index block.
 Status FindMetaBlock(Iterator* meta_index_iter,
                      const std::string& meta_block_name,
                      BlockHandle* block_handle);
 
 // Find the meta block
-Status FindMetaBlock(RandomAccessFile* file, uint64_t file_size,
+Status FindMetaBlock(RandomAccessFileReader* file, uint64_t file_size,
                      uint64_t table_magic_number, Env* env,
                      const std::string& meta_block_name,
                      BlockHandle* block_handle);
@@ -134,7 +118,7 @@ Status FindMetaBlock(RandomAccessFile* file, uint64_t file_size,
 // Read the specified meta block with name meta_block_name
 // from `file` and initialize `contents` with contents of this block.
 // Return Status::OK in case of success.
-Status ReadMetaBlock(RandomAccessFile* file, uint64_t file_size,
+Status ReadMetaBlock(RandomAccessFileReader* file, uint64_t file_size,
                      uint64_t table_magic_number, Env* env,
                      const std::string& meta_block_name,
                      BlockContents* contents);
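
One detail worth flagging in the hunk above: the removed BytewiseLessThan
compared with `Compare(key1, key2) <= 0`, which is not a strict weak ordering
(two equal keys would each rank before the other), so it violated std::map's
comparator contract. Switching to stl_wrappers::KVMap retires that as well.
A conforming STL-style adapter presumably looks like the sketch below; the
LessOfComparator/KVMap shape is assumed from its call sites in this patch,
not copied from util/stl_wrappers.h:

    #include <map>
    #include <string>

    #include "rocksdb/comparator.h"  // rocksdb::Comparator, BytewiseComparator()

    namespace stl_wrappers_sketch {

    // STL-style adapter over a rocksdb::Comparator.  The strict `< 0`
    // matters: returning true for equal keys (as `<= 0` did) hands std::map
    // a comparator that is not a strict weak ordering.
    struct LessOfComparator {
      explicit LessOfComparator(const rocksdb::Comparator* c =
                                    rocksdb::BytewiseComparator())
          : cmp(c) {}
      bool operator()(const std::string& a, const std::string& b) const {
        return cmp->Compare(a, b) < 0;
      }
      const rocksdb::Comparator* cmp;
    };

    typedef std::map<std::string, std::string, LessOfComparator> KVMap;

    }  // namespace stl_wrappers_sketch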
diff --git a/src/rocksdb/table/mock_table.cc b/src/rocksdb/table/mock_table.cc
index 90e2079..ff56d63 100644
--- a/src/rocksdb/table/mock_table.cc
+++ b/src/rocksdb/table/mock_table.cc
@@ -5,16 +5,29 @@
 //  LICENSE file in the root directory of this source tree. An additional grant
 //  of patent rights can be found in the PATENTS file in the same directory.
 
-#include "rocksdb/table_properties.h"
 #include "table/mock_table.h"
-#include "table/get_context.h"
+
 #include "db/dbformat.h"
 #include "port/port.h"
+#include "rocksdb/table_properties.h"
+#include "table/get_context.h"
 #include "util/coding.h"
+#include "util/file_reader_writer.h"
 
 namespace rocksdb {
 namespace mock {
 
+namespace {
+
+const InternalKeyComparator icmp_(BytewiseComparator());
+
+}  // namespace
+
+stl_wrappers::KVMap MakeMockFile(
+    std::initializer_list<std::pair<const std::string, std::string>> l) {
+  return stl_wrappers::KVMap(l, stl_wrappers::LessOfComparator(&icmp_));
+}
+
 Iterator* MockTableReader::NewIterator(const ReadOptions&, Arena* arena) {
   return new MockTableIterator(table_);
 }
@@ -43,9 +56,8 @@ std::shared_ptr<const TableProperties> MockTableReader::GetTableProperties()
 MockTableFactory::MockTableFactory() : next_id_(1) {}
 
 Status MockTableFactory::NewTableReader(
-    const ImmutableCFOptions& ioptions, const EnvOptions& env_options,
-    const InternalKeyComparator& internal_key,
-    unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
+    const TableReaderOptions& table_reader_options,
+    unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
     unique_ptr<TableReader>* table_reader) const {
   uint32_t id = GetIDFromFile(file.get());
 
@@ -63,14 +75,14 @@ Status MockTableFactory::NewTableReader(
 
 TableBuilder* MockTableFactory::NewTableBuilder(
     const TableBuilderOptions& table_builder_options,
-    WritableFile* file) const {
-  uint32_t id = GetAndWriteNextID(file);
+    WritableFileWriter* file) const {
+  uint32_t id = GetAndWriteNextID(file->writable_file());
 
   return new MockTableBuilder(id, &file_system_);
 }
 
 Status MockTableFactory::CreateMockTable(Env* env, const std::string& fname,
-                                         MockFileContents file_contents) {
+                                         stl_wrappers::KVMap file_contents) {
   std::unique_ptr<WritableFile> file;
   auto s = env->NewWritableFile(fname, &file, EnvOptions());
   if (!s.ok()) {
@@ -90,7 +102,7 @@ uint32_t MockTableFactory::GetAndWriteNextID(WritableFile* file) const {
   return next_id;
 }
 
-uint32_t MockTableFactory::GetIDFromFile(RandomAccessFile* file) const {
+uint32_t MockTableFactory::GetIDFromFile(RandomAccessFileReader* file) const {
   char buf[4];
   Slice result;
   file->Read(0, 4, &result, buf);
@@ -98,16 +110,29 @@ uint32_t MockTableFactory::GetIDFromFile(RandomAccessFile* file) const {
   return DecodeFixed32(buf);
 }
 
-void MockTableFactory::AssertSingleFile(const MockFileContents& file_contents) {
+void MockTableFactory::AssertSingleFile(
+    const stl_wrappers::KVMap& file_contents) {
   ASSERT_EQ(file_system_.files.size(), 1U);
   ASSERT_TRUE(file_contents == file_system_.files.begin()->second);
 }
 
-void MockTableFactory::AssertLatestFile(const MockFileContents& file_contents) {
+void MockTableFactory::AssertLatestFile(
+    const stl_wrappers::KVMap& file_contents) {
   ASSERT_GE(file_system_.files.size(), 1U);
   auto latest = file_system_.files.end();
   --latest;
-  ASSERT_TRUE(file_contents == latest->second);
+
+  if (file_contents != latest->second) {
+    std::cout << "Wrong content! Content of latest file:" << std::endl;
+    for (const auto& kv : latest->second) {
+      ParsedInternalKey ikey;
+      std::string key, value;
+      std::tie(key, value) = kv;
+      ParseInternalKey(Slice(key), &ikey);
+      std::cout << ikey.DebugString(false) << " -> " << value << std::endl;
+    }
+    ASSERT_TRUE(false);
+  }
 }
 
 }  // namespace mock
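
For context, the MakeMockFile() helper introduced above exists so that test
files are ordered by the InternalKeyComparator rather than by plain
std::string comparison. Below is a hypothetical usage sketch built only from
calls shown in this patch; the file name and the key strings are placeholders,
since real internal keys append an encoded sequence number and value type:

    #include <cassert>
    #include <string>

    #include "rocksdb/env.h"
    #include "table/mock_table.h"

    void WriteAndCheckMockFile(rocksdb::mock::MockTableFactory* factory,
                               rocksdb::Env* env) {
      rocksdb::stl_wrappers::KVMap contents = rocksdb::mock::MakeMockFile(
          {{"ikey-a", "value-a"}, {"ikey-b", "value-b"}});
      rocksdb::Status s =
          factory->CreateMockTable(env, "/hypothetical/000001.sst", contents);
      assert(s.ok());
      factory->AssertLatestFile(contents);  // gtest ASSERTs fire on mismatch
    }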
diff --git a/src/rocksdb/table/mock_table.h b/src/rocksdb/table/mock_table.h
index ef38575..322a51d 100644
--- a/src/rocksdb/table/mock_table.h
+++ b/src/rocksdb/table/mock_table.h
@@ -5,35 +5,39 @@
 //  LICENSE file in the root directory of this source tree. An additional grant
 //  of patent rights can be found in the PATENTS file in the same directory.
 #pragma once
+
 #include <algorithm>
-#include <set>
-#include <memory>
 #include <atomic>
 #include <map>
+#include <memory>
+#include <set>
 #include <string>
+#include <utility>
 
+#include "port/port.h"
+#include "rocksdb/comparator.h"
 #include "rocksdb/table.h"
-#include "table/table_reader.h"
 #include "table/table_builder.h"
-#include "port/port.h"
+#include "table/table_reader.h"
 #include "util/mutexlock.h"
+#include "util/stl_wrappers.h"
 #include "util/testharness.h"
 #include "util/testutil.h"
 
 namespace rocksdb {
 namespace mock {
 
-typedef std::map<std::string, std::string> MockFileContents;
-// NOTE this currently only supports bitwise comparator
+stl_wrappers::KVMap MakeMockFile(
+    std::initializer_list<std::pair<const std::string, std::string>> l = {});
 
 struct MockTableFileSystem {
   port::Mutex mutex;
-  std::map<uint32_t, MockFileContents> files;
+  std::map<uint32_t, stl_wrappers::KVMap> files;
 };
 
 class MockTableReader : public TableReader {
  public:
-  explicit MockTableReader(const MockFileContents& table) : table_(table) {}
+  explicit MockTableReader(const stl_wrappers::KVMap& table) : table_(table) {}
 
   Iterator* NewIterator(const ReadOptions&, Arena* arena) override;
 
@@ -51,12 +55,12 @@ class MockTableReader : public TableReader {
   ~MockTableReader() {}
 
  private:
-  const MockFileContents& table_;
+  const stl_wrappers::KVMap& table_;
 };
 
 class MockTableIterator : public Iterator {
  public:
-  explicit MockTableIterator(const MockFileContents& table) : table_(table) {
+  explicit MockTableIterator(const stl_wrappers::KVMap& table) : table_(table) {
     itr_ = table_.end();
   }
 
@@ -91,14 +95,16 @@ class MockTableIterator : public Iterator {
   Status status() const override { return Status::OK(); }
 
  private:
-  const MockFileContents& table_;
-  MockFileContents::const_iterator itr_;
+  const stl_wrappers::KVMap& table_;
+  stl_wrappers::KVMap::const_iterator itr_;
 };
 
 class MockTableBuilder : public TableBuilder {
  public:
   MockTableBuilder(uint32_t id, MockTableFileSystem* file_system)
-      : id_(id), file_system_(file_system) {}
+      : id_(id), file_system_(file_system) {
+    table_ = MakeMockFile({});
+  }
 
   // REQUIRES: Either Finish() or Abandon() has been called.
   ~MockTableBuilder() {}
@@ -132,27 +138,26 @@ class MockTableBuilder : public TableBuilder {
  private:
   uint32_t id_;
   MockTableFileSystem* file_system_;
-  MockFileContents table_;
+  stl_wrappers::KVMap table_;
 };
 
 class MockTableFactory : public TableFactory {
  public:
   MockTableFactory();
   const char* Name() const override { return "MockTable"; }
-  Status NewTableReader(const ImmutableCFOptions& ioptions,
-                               const EnvOptions& env_options,
-                               const InternalKeyComparator& internal_key,
-                               unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
-                               unique_ptr<TableReader>* table_reader) const override;
+  Status NewTableReader(const TableReaderOptions& table_reader_options,
+                        unique_ptr<RandomAccessFileReader>&& file,
+                        uint64_t file_size,
+                        unique_ptr<TableReader>* table_reader) const override;
   TableBuilder* NewTableBuilder(
       const TableBuilderOptions& table_builder_options,
-      WritableFile* file) const override;
+      WritableFileWriter* file) const override;
 
   // This function will directly create a mock table instead of going through
-  // MockTableBuilder. MockFileContents has to have a format of <internal_key,
-  // value>. Those key-value pairs will then be inserted into the mock table
+  // MockTableBuilder. file_contents has to have a format of <internal_key,
+  // value>. Those key-value pairs will then be inserted into the mock table.
   Status CreateMockTable(Env* env, const std::string& fname,
-                         MockFileContents file_contents);
+                         stl_wrappers::KVMap file_contents);
 
   virtual Status SanitizeOptions(
       const DBOptions& db_opts,
@@ -166,12 +171,12 @@ class MockTableFactory : public TableFactory {
 
   // This function will assert that only a single file exists and that the
   // contents are equal to file_contents
-  void AssertSingleFile(const MockFileContents& file_contents);
-  void AssertLatestFile(const MockFileContents& file_contents);
+  void AssertSingleFile(const stl_wrappers::KVMap& file_contents);
+  void AssertLatestFile(const stl_wrappers::KVMap& file_contents);
 
  private:
   uint32_t GetAndWriteNextID(WritableFile* file) const;
-  uint32_t GetIDFromFile(RandomAccessFile* file) const;
+  uint32_t GetIDFromFile(RandomAccessFileReader* file) const;
 
   mutable MockTableFileSystem file_system_;
   mutable std::atomic<uint32_t> next_id_;
diff --git a/src/rocksdb/table/plain_table_builder.cc b/src/rocksdb/table/plain_table_builder.cc
index 25e1b85..e16224a 100644
--- a/src/rocksdb/table/plain_table_builder.cc
+++ b/src/rocksdb/table/plain_table_builder.cc
@@ -26,6 +26,7 @@
 #include "table/meta_blocks.h"
 #include "util/coding.h"
 #include "util/crc32c.h"
+#include "util/file_reader_writer.h"
 #include "util/stop_watch.h"
 
 namespace rocksdb {
@@ -35,11 +36,8 @@ namespace {
 // a utility that helps writing block content to the file
 //   @offset will advance if @block_contents was successfully written.
 //   @block_handle the block handle this particular block.
-Status WriteBlock(
-    const Slice& block_contents,
-    WritableFile* file,
-    uint64_t* offset,
-    BlockHandle* block_handle) {
+Status WriteBlock(const Slice& block_contents, WritableFileWriter* file,
+                  uint64_t* offset, BlockHandle* block_handle) {
   block_handle->set_offset(*offset);
   block_handle->set_size(block_contents.size());
   Status s = file->Append(block_contents);
@@ -62,7 +60,7 @@ PlainTableBuilder::PlainTableBuilder(
     const ImmutableCFOptions& ioptions,
     const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
         int_tbl_prop_collector_factories,
-    WritableFile* file, uint32_t user_key_len, EncodingType encoding_type,
+    WritableFileWriter* file, uint32_t user_key_len, EncodingType encoding_type,
     size_t index_sparseness, uint32_t bloom_bits_per_key, uint32_t num_probes,
     size_t huge_page_tlb_size, double hash_table_ratio,
     bool store_index_in_file)
diff --git a/src/rocksdb/table/plain_table_builder.h b/src/rocksdb/table/plain_table_builder.h
index f542d2f..75ec3fa 100644
--- a/src/rocksdb/table/plain_table_builder.h
+++ b/src/rocksdb/table/plain_table_builder.h
@@ -34,10 +34,11 @@ class PlainTableBuilder: public TableBuilder {
       const ImmutableCFOptions& ioptions,
       const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
           int_tbl_prop_collector_factories,
-      WritableFile* file, uint32_t user_key_size, EncodingType encoding_type,
-      size_t index_sparseness, uint32_t bloom_bits_per_key,
-      uint32_t num_probes = 6, size_t huge_page_tlb_size = 0,
-      double hash_table_ratio = 0, bool store_index_in_file = false);
+      WritableFileWriter* file, uint32_t user_key_size,
+      EncodingType encoding_type, size_t index_sparseness,
+      uint32_t bloom_bits_per_key, uint32_t num_probes = 6,
+      size_t huge_page_tlb_size = 0, double hash_table_ratio = 0,
+      bool store_index_in_file = false);
 
   // REQUIRES: Either Finish() or Abandon() has been called.
   ~PlainTableBuilder();
@@ -82,7 +83,7 @@ class PlainTableBuilder: public TableBuilder {
   BloomBlockBuilder bloom_block_;
   std::unique_ptr<PlainTableIndexBuilder> index_builder_;
 
-  WritableFile* file_;
+  WritableFileWriter* file_;
   uint64_t offset_ = 0;
   uint32_t bloom_bits_per_key_;
   size_t huge_page_tlb_size_;
diff --git a/src/rocksdb/table/plain_table_factory.cc b/src/rocksdb/table/plain_table_factory.cc
index 5f19c3b..6e86ff5 100644
--- a/src/rocksdb/table/plain_table_factory.cc
+++ b/src/rocksdb/table/plain_table_factory.cc
@@ -14,21 +14,20 @@
 
 namespace rocksdb {
 
-Status PlainTableFactory::NewTableReader(const ImmutableCFOptions& ioptions,
-                                         const EnvOptions& env_options,
-                                         const InternalKeyComparator& icomp,
-                                         unique_ptr<RandomAccessFile>&& file,
-                                         uint64_t file_size,
-                                         unique_ptr<TableReader>* table) const {
-  return PlainTableReader::Open(ioptions, env_options, icomp, std::move(file),
-                                file_size, table, bloom_bits_per_key_,
-                                hash_table_ratio_, index_sparseness_,
-                                huge_page_tlb_size_, full_scan_mode_);
+Status PlainTableFactory::NewTableReader(
+    const TableReaderOptions& table_reader_options,
+    unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
+    unique_ptr<TableReader>* table) const {
+  return PlainTableReader::Open(
+      table_reader_options.ioptions, table_reader_options.env_options,
+      table_reader_options.internal_comparator, std::move(file), file_size,
+      table, bloom_bits_per_key_, hash_table_ratio_, index_sparseness_,
+      huge_page_tlb_size_, full_scan_mode_);
 }
 
 TableBuilder* PlainTableFactory::NewTableBuilder(
     const TableBuilderOptions& table_builder_options,
-    WritableFile* file) const {
+    WritableFileWriter* file) const {
   // Ignore the skip_filters flag. PlainTable format is optimized for small
   // in-memory dbs. The skip_filters optimization is not useful for plain
   // tables
@@ -55,10 +54,10 @@ std::string PlainTableFactory::GetPrintableTableOptions() const {
   snprintf(buffer, kBufferSize, "  hash_table_ratio: %lf\n",
            hash_table_ratio_);
   ret.append(buffer);
-  snprintf(buffer, kBufferSize, "  index_sparseness: %zu\n",
+  snprintf(buffer, kBufferSize, "  index_sparseness: %" ROCKSDB_PRIszt "\n",
            index_sparseness_);
   ret.append(buffer);
-  snprintf(buffer, kBufferSize, "  huge_page_tlb_size: %zu\n",
+  snprintf(buffer, kBufferSize, "  huge_page_tlb_size: %" ROCKSDB_PRIszt "\n",
            huge_page_tlb_size_);
   ret.append(buffer);
   snprintf(buffer, kBufferSize, "  encoding_type: %d\n",
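
The %zu -> ROCKSDB_PRIszt substitutions above are a portability fix: older
MSVC printf runtimes did not understand %zu, so size_t formatting is routed
through a macro. A minimal sketch of the pattern; the branch values are an
assumption, as the real definition lives in rocksdb's port headers:

    #include <cstddef>
    #include <cstdio>
    #include <string>

    // Sketch only: rocksdb defines the real macro in its port/ headers.  The
    // MSVC branch reflects that runtime's historic lack of %zu support.
    #if defined(_MSC_VER) && _MSC_VER < 1900
    #define ROCKSDB_PRIszt_SKETCH "Iu"
    #else
    #define ROCKSDB_PRIszt_SKETCH "zu"
    #endif

    std::string FormatSparseness(std::size_t index_sparseness) {
      char buffer[64];
      std::snprintf(buffer, sizeof(buffer),
                    "  index_sparseness: %" ROCKSDB_PRIszt_SKETCH "\n",
                    index_sparseness);
      return buffer;
    }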
diff --git a/src/rocksdb/table/plain_table_factory.h b/src/rocksdb/table/plain_table_factory.h
index 730e134..539e753 100644
--- a/src/rocksdb/table/plain_table_factory.h
+++ b/src/rocksdb/table/plain_table_factory.h
@@ -153,14 +153,13 @@ class PlainTableFactory : public TableFactory {
         full_scan_mode_(options.full_scan_mode),
         store_index_in_file_(options.store_index_in_file) {}
   const char* Name() const override { return "PlainTable"; }
-  Status NewTableReader(
-      const ImmutableCFOptions& options, const EnvOptions& soptions,
-      const InternalKeyComparator& internal_comparator,
-      unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
-      unique_ptr<TableReader>* table) const override;
+  Status NewTableReader(const TableReaderOptions& table_reader_options,
+                        unique_ptr<RandomAccessFileReader>&& file,
+                        uint64_t file_size,
+                        unique_ptr<TableReader>* table) const override;
   TableBuilder* NewTableBuilder(
       const TableBuilderOptions& table_builder_options,
-      WritableFile* file) const override;
+      WritableFileWriter* file) const override;
 
   std::string GetPrintableTableOptions() const override;
 
@@ -169,10 +168,6 @@ class PlainTableFactory : public TableFactory {
   // Sanitizes the specified DB Options.
   Status SanitizeOptions(const DBOptions& db_opts,
                          const ColumnFamilyOptions& cf_opts) const override {
-    if (db_opts.allow_mmap_reads == false) {
-      return Status::NotSupported(
-          "PlainTable with allow_mmap_reads == false is not supported.");
-    }
     return Status::OK();
   }
 
diff --git a/src/rocksdb/table/plain_table_index.cc b/src/rocksdb/table/plain_table_index.cc
index 7ca451e..3e422c3 100644
--- a/src/rocksdb/table/plain_table_index.cc
+++ b/src/rocksdb/table/plain_table_index.cc
@@ -203,8 +203,8 @@ Slice PlainTableIndexBuilder::FillIndexes(
   assert(sub_index_offset == sub_index_size_);
 
   Log(InfoLogLevel::DEBUG_LEVEL, ioptions_.info_log,
-      "hash table size: %d, suffix_map length %zu",
-      index_size_, sub_index_size_);
+      "hash table size: %d, suffix_map length %" ROCKSDB_PRIszt, index_size_,
+      sub_index_size_);
   return Slice(allocated, GetTotalSize());
 }
 
diff --git a/src/rocksdb/table/plain_table_key_coding.cc b/src/rocksdb/table/plain_table_key_coding.cc
index 4f09b50..057c7f9 100644
--- a/src/rocksdb/table/plain_table_key_coding.cc
+++ b/src/rocksdb/table/plain_table_key_coding.cc
@@ -6,19 +6,23 @@
 #ifndef ROCKSDB_LITE
 #include "table/plain_table_key_coding.h"
 
-#include "table/plain_table_factory.h"
+#include <algorithm>
+#include <string>
 #include "db/dbformat.h"
+#include "table/plain_table_reader.h"
+#include "table/plain_table_factory.h"
+#include "util/file_reader_writer.h"
 
 namespace rocksdb {
 
-namespace {
-
 enum PlainTableEntryType : unsigned char {
   kFullKey = 0,
   kPrefixFromPreviousKey = 1,
   kKeySuffix = 2,
 };
 
+namespace {
+
 // Control byte:
 // The first two bits indicate the type of entry.
 // The remaining six bits hold the inlined size. If all six are 1 (0x3F), overflow bytes
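
Concretely: the control byte packs the entry type into its top two bits and an
inline size into its low six, and a low field of all ones (0x3F) means a
varint32 overflow follows, to be added to kSizeInlineLimit, which is exactly
what the new DecodeSize() below does. A standalone sketch of the byte-level
rule, with the 0x3F value inferred from the comment above:

    #include <cstdint>

    // Mirrors the enum in this file; names are suffixed to mark them as
    // sketch-local rather than the upstream definitions.
    enum SketchEntryType : unsigned char {
      kSketchFullKey = 0,
      kSketchPrefixFromPreviousKey = 1,
      kSketchKeySuffix = 2,
    };

    const unsigned char kSketchSizeInlineLimit = 0x3F;  // low six bits set

    // Decode only the control byte.  Returns true when the size was fully
    // inline; false means the low six bits saturated and a varint32 with the
    // overflow follows (the caller adds it to the limit, as DecodeSize does).
    inline bool DecodeControlByte(unsigned char control, SketchEntryType* type,
                                  uint32_t* inline_size) {
      *type = static_cast<SketchEntryType>(
          (control & ~kSketchSizeInlineLimit) >> 6);
      *inline_size = control & kSketchSizeInlineLimit;
      return *inline_size < kSketchSizeInlineLimit;
    }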
@@ -41,30 +45,43 @@ size_t EncodeSize(PlainTableEntryType type, uint32_t key_size,
     return ptr - out_buffer;
   }
 }
+}  // namespace
 
-// Return position after the size byte(s). nullptr means error
-const char* DecodeSize(const char* offset, const char* limit,
-                       PlainTableEntryType* entry_type, uint32_t* key_size) {
-  assert(offset < limit);
+// Fill bytes_read with number of bytes read.
+inline Status PlainTableKeyDecoder::DecodeSize(uint32_t start_offset,
+                                               PlainTableEntryType* entry_type,
+                                               uint32_t* key_size,
+                                               uint32_t* bytes_read) {
+  Slice next_byte_slice;
+  bool success = file_reader_.Read(start_offset, 1, &next_byte_slice);
+  if (!success) {
+    return file_reader_.status();
+  }
   *entry_type = static_cast<PlainTableEntryType>(
-      (static_cast<unsigned char>(offset[0]) & ~kSizeInlineLimit) >> 6);
-  char inline_key_size = offset[0] & kSizeInlineLimit;
+      (static_cast<unsigned char>(next_byte_slice[0]) & ~kSizeInlineLimit) >>
+      6);
+  char inline_key_size = next_byte_slice[0] & kSizeInlineLimit;
   if (inline_key_size < kSizeInlineLimit) {
     *key_size = inline_key_size;
-    return offset + 1;
+    *bytes_read = 1;
+    return Status::OK();
   } else {
     uint32_t extra_size;
-    const char* ptr = GetVarint32Ptr(offset + 1, limit, &extra_size);
-    if (ptr == nullptr) {
-      return nullptr;
+    uint32_t tmp_bytes_read;
+    success = file_reader_.ReadVarint32(start_offset + 1, &extra_size,
+                                        &tmp_bytes_read);
+    if (!success) {
+      return file_reader_.status();
     }
+    assert(tmp_bytes_read > 0);
     *key_size = kSizeInlineLimit + extra_size;
-    return ptr;
+    *bytes_read = tmp_bytes_read + 1;
+    return Status::OK();
   }
 }
-}  // namespace
 
-Status PlainTableKeyEncoder::AppendKey(const Slice& key, WritableFile* file,
+Status PlainTableKeyEncoder::AppendKey(const Slice& key,
+                                       WritableFileWriter* file,
                                        uint64_t* offset, char* meta_bytes_buf,
                                        size_t* meta_bytes_buf_size) {
   ParsedInternalKey parsed_key;
@@ -147,28 +164,101 @@ Status PlainTableKeyEncoder::AppendKey(const Slice& key, WritableFile* file,
   return Status::OK();
 }
 
-namespace {
-Status ReadInternalKey(const char* key_ptr, const char* limit,
-                       uint32_t user_key_size, ParsedInternalKey* parsed_key,
-                       size_t* bytes_read, bool* internal_key_valid,
-                       Slice* internal_key) {
-  if (key_ptr + user_key_size + 1 >= limit) {
-    return Status::Corruption("Unexpected EOF when reading the next key");
+inline bool PlainTableKeyDecoder::FileReader::Read(uint32_t file_offset,
+                                                   uint32_t len, Slice* out) {
+  if (file_info_->is_mmap_mode) {
+    assert(file_offset + len <= file_info_->data_end_offset);
+    *out = Slice(file_info_->file_data.data() + file_offset, len);
+    return true;
+  } else {
+    return ReadNonMmap(file_offset, len, out);
   }
-  if (*(key_ptr + user_key_size) == PlainTableFactory::kValueTypeSeqId0) {
+}
+
+bool PlainTableKeyDecoder::FileReader::ReadNonMmap(uint32_t file_offset,
+                                                   uint32_t len, Slice* out) {
+  const uint32_t kPrefetchSize = 256u;
+  if (file_offset < buf_start_offset_ ||
+      file_offset + len > buf_start_offset_ + buf_len_) {
+    // Load buffer
+    assert(file_offset + len <= file_info_->data_end_offset);
+    uint32_t size_to_read = std::min(file_info_->data_end_offset - file_offset,
+                                     std::max(kPrefetchSize, len));
+    if (size_to_read > buf_capacity_) {
+      buf_.reset(new char[size_to_read]);
+      buf_capacity_ = size_to_read;
+      buf_len_ = 0;
+    }
+    Slice read_result;
+    Status s = file_info_->file->Read(file_offset, size_to_read, &read_result,
+                                      buf_.get());
+    if (!s.ok()) {
+      status_ = s;
+      return false;
+    }
+    buf_start_offset_ = file_offset;
+    buf_len_ = size_to_read;
+  }
+  *out = Slice(buf_.get() + (file_offset - buf_start_offset_), len);
+  return true;
+}
+
+inline bool PlainTableKeyDecoder::FileReader::ReadVarint32(
+    uint32_t offset, uint32_t* out, uint32_t* bytes_read) {
+  if (file_info_->is_mmap_mode) {
+    const char* start = file_info_->file_data.data() + offset;
+    const char* limit =
+        file_info_->file_data.data() + file_info_->data_end_offset;
+    const char* key_ptr = GetVarint32Ptr(start, limit, out);
+    assert(key_ptr != nullptr);
+    *bytes_read = static_cast<uint32_t>(key_ptr - start);
+    return true;
+  } else {
+    return ReadVarint32NonMmap(offset, out, bytes_read);
+  }
+}
+
+bool PlainTableKeyDecoder::FileReader::ReadVarint32NonMmap(
+    uint32_t offset, uint32_t* out, uint32_t* bytes_read) {
+  const char* start;
+  const char* limit;
+  const uint32_t kMaxVarInt32Size = 6u;
+  uint32_t bytes_to_read =
+      std::min(file_info_->data_end_offset - offset, kMaxVarInt32Size);
+  Slice bytes;
+  if (!Read(offset, bytes_to_read, &bytes)) {
+    return false;
+  }
+  start = bytes.data();
+  limit = bytes.data() + bytes.size();
+
+  const char* key_ptr = GetVarint32Ptr(start, limit, out);
+  *bytes_read =
+      (key_ptr != nullptr) ? static_cast<uint32_t>(key_ptr - start) : 0;
+  return true;
+}
+
+Status PlainTableKeyDecoder::ReadInternalKey(
+    uint32_t file_offset, uint32_t user_key_size, ParsedInternalKey* parsed_key,
+    uint32_t* bytes_read, bool* internal_key_valid, Slice* internal_key) {
+  Slice tmp_slice;
+  bool success = file_reader_.Read(file_offset, user_key_size + 1, &tmp_slice);
+  if (!success) {
+    return file_reader_.status();
+  }
+  if (tmp_slice[user_key_size] == PlainTableFactory::kValueTypeSeqId0) {
     // Special encoding for the row with seqID=0
-    parsed_key->user_key = Slice(key_ptr, user_key_size);
+    parsed_key->user_key = Slice(tmp_slice.data(), user_key_size);
     parsed_key->sequence = 0;
     parsed_key->type = kTypeValue;
     *bytes_read += user_key_size + 1;
     *internal_key_valid = false;
   } else {
-    if (key_ptr + user_key_size + 8 >= limit) {
-      return Status::Corruption(
-          "Unexpected EOF when reading internal bytes of the next key");
+    success = file_reader_.Read(file_offset, user_key_size + 8, internal_key);
+    if (!success) {
+      return file_reader_.status();
     }
     *internal_key_valid = true;
-    *internal_key = Slice(key_ptr, user_key_size + 8);
     if (!ParseInternalKey(*internal_key, parsed_key)) {
       return Status::Corruption(
           Slice("Incorrect value type found when reading the next key"));
@@ -177,36 +267,44 @@ Status ReadInternalKey(const char* key_ptr, const char* limit,
   }
   return Status::OK();
 }
-}  // namespace
 
-Status PlainTableKeyDecoder::NextPlainEncodingKey(
-    const char* start, const char* limit, ParsedInternalKey* parsed_key,
-    Slice* internal_key, size_t* bytes_read, bool* seekable) {
-  const char* key_ptr = start;
+Status PlainTableKeyDecoder::NextPlainEncodingKey(uint32_t start_offset,
+                                                  ParsedInternalKey* parsed_key,
+                                                  Slice* internal_key,
+                                                  uint32_t* bytes_read,
+                                                  bool* seekable) {
   uint32_t user_key_size = 0;
+  Status s;
   if (fixed_user_key_len_ != kPlainTableVariableLength) {
     user_key_size = fixed_user_key_len_;
-    key_ptr = start;
   } else {
     uint32_t tmp_size = 0;
-    key_ptr = GetVarint32Ptr(start, limit, &tmp_size);
-    if (key_ptr == nullptr) {
-      return Status::Corruption(
-          "Unexpected EOF when reading the next key's size");
+    uint32_t tmp_read;
+    bool success =
+        file_reader_.ReadVarint32(start_offset, &tmp_size, &tmp_read);
+    if (!success) {
+      return file_reader_.status();
     }
+    assert(tmp_read > 0);
     user_key_size = tmp_size;
-    *bytes_read = key_ptr - start;
+    *bytes_read = tmp_read;
   }
   // dummy initial value to avoid a compiler complaint
   bool decoded_internal_key_valid = true;
   Slice decoded_internal_key;
-  Status s =
-      ReadInternalKey(key_ptr, limit, user_key_size, parsed_key, bytes_read,
-                      &decoded_internal_key_valid, &decoded_internal_key);
+  s = ReadInternalKey(start_offset + *bytes_read, user_key_size, parsed_key,
+                      bytes_read, &decoded_internal_key_valid,
+                      &decoded_internal_key);
   if (!s.ok()) {
     return s;
   }
-  if (internal_key != nullptr) {
+  if (!file_reader_.file_info_->is_mmap_mode) {
+    cur_key_.SetInternalKey(*parsed_key);
+    parsed_key->user_key = Slice(cur_key_.GetKey().data(), user_key_size);
+    if (internal_key != nullptr) {
+      *internal_key = cur_key_.GetKey();
+    }
+  } else if (internal_key != nullptr) {
     if (decoded_internal_key_valid) {
       *internal_key = decoded_internal_key;
     } else {
@@ -219,41 +317,55 @@ Status PlainTableKeyDecoder::NextPlainEncodingKey(
 }
 
 Status PlainTableKeyDecoder::NextPrefixEncodingKey(
-    const char* start, const char* limit, ParsedInternalKey* parsed_key,
-    Slice* internal_key, size_t* bytes_read, bool* seekable) {
-  const char* key_ptr = start;
+    uint32_t start_offset, ParsedInternalKey* parsed_key, Slice* internal_key,
+    uint32_t* bytes_read, bool* seekable) {
   PlainTableEntryType entry_type;
 
   bool expect_suffix = false;
+  Status s;
   do {
     uint32_t size = 0;
     // dummy initial value to avoid a compiler complaint
     bool decoded_internal_key_valid = true;
-    const char* pos = DecodeSize(key_ptr, limit, &entry_type, &size);
-    if (pos == nullptr) {
+    uint32_t my_bytes_read = 0;
+    s = DecodeSize(start_offset + *bytes_read, &entry_type, &size,
+                   &my_bytes_read);
+    if (!s.ok()) {
+      return s;
+    }
+    if (my_bytes_read == 0) {
       return Status::Corruption("Unexpected EOF when reading size of the key");
     }
-    *bytes_read += pos - key_ptr;
-    key_ptr = pos;
+    *bytes_read += my_bytes_read;
 
     switch (entry_type) {
       case kFullKey: {
         expect_suffix = false;
         Slice decoded_internal_key;
-        Status s =
-            ReadInternalKey(key_ptr, limit, size, parsed_key, bytes_read,
-                            &decoded_internal_key_valid, &decoded_internal_key);
+        s = ReadInternalKey(start_offset + *bytes_read, size, parsed_key,
+                            bytes_read, &decoded_internal_key_valid,
+                            &decoded_internal_key);
         if (!s.ok()) {
           return s;
         }
-        saved_user_key_ = parsed_key->user_key;
-        if (internal_key != nullptr) {
-          if (decoded_internal_key_valid) {
-            *internal_key = decoded_internal_key;
-          } else {
-            cur_key_.SetInternalKey(*parsed_key);
+        if (!file_reader_.file_info_->is_mmap_mode ||
+            (internal_key != nullptr && !decoded_internal_key_valid)) {
+            // In non-mmap mode, we always need to make a copy of keys
+            // returned to users, because after reading the value for the
+            // key, the key might become invalid.
+          cur_key_.SetInternalKey(*parsed_key);
+          saved_user_key_ = cur_key_.GetKey();
+          if (!file_reader_.file_info_->is_mmap_mode) {
+            parsed_key->user_key = Slice(cur_key_.GetKey().data(), size);
+          }
+          if (internal_key != nullptr) {
             *internal_key = cur_key_.GetKey();
           }
+        } else {
+          if (internal_key != nullptr) {
+            *internal_key = decoded_internal_key;
+          }
+          saved_user_key_ = parsed_key->user_key;
         }
         break;
       }
@@ -274,20 +386,33 @@ Status PlainTableKeyDecoder::NextPrefixEncodingKey(
         if (seekable != nullptr) {
           *seekable = false;
         }
-        cur_key_.Reserve(prefix_len_ + size);
 
         Slice tmp_slice;
-        Status s = ReadInternalKey(key_ptr, limit, size, parsed_key, bytes_read,
-                                   &decoded_internal_key_valid, &tmp_slice);
+        s = ReadInternalKey(start_offset + *bytes_read, size, parsed_key,
+                            bytes_read, &decoded_internal_key_valid,
+                            &tmp_slice);
         if (!s.ok()) {
           return s;
         }
-        cur_key_.SetInternalKey(Slice(saved_user_key_.data(), prefix_len_),
-                                *parsed_key);
-        assert(
-            prefix_extractor_ == nullptr ||
-            prefix_extractor_->Transform(ExtractUserKey(cur_key_.GetKey())) ==
-                Slice(saved_user_key_.data(), prefix_len_));
+        if (!file_reader_.file_info_->is_mmap_mode) {
+          // In non-mmap mode, we need to make a copy of keys returned to
+          // users, because after reading the value for the key, the key
+          // might become invalid.
+          // saved_user_key_ points to cur_key_. We are making a copy of
+          // the prefix part to another string, and construct the current
+          // key from the prefix part and the suffix part back to cur_key_.
+          std::string tmp =
+              Slice(saved_user_key_.data(), prefix_len_).ToString();
+          cur_key_.Reserve(prefix_len_ + size);
+          cur_key_.SetInternalKey(tmp, *parsed_key);
+          parsed_key->user_key =
+              Slice(cur_key_.GetKey().data(), prefix_len_ + size);
+          saved_user_key_ = cur_key_.GetKey();
+        } else {
+          cur_key_.Reserve(prefix_len_ + size);
+          cur_key_.SetInternalKey(Slice(saved_user_key_.data(), prefix_len_),
+                                  *parsed_key);
+        }
         parsed_key->user_key = ExtractUserKey(cur_key_.GetKey());
         if (internal_key != nullptr) {
           *internal_key = cur_key_.GetKey();
@@ -295,29 +420,61 @@ Status PlainTableKeyDecoder::NextPrefixEncodingKey(
         break;
       }
       default:
-        return Status::Corruption("Identified size flag.");
+        return Status::Corruption("Un-identified size flag.");
     }
   } while (expect_suffix);  // Another round if suffix is expected.
   return Status::OK();
 }
 
-Status PlainTableKeyDecoder::NextKey(const char* start, const char* limit,
+Status PlainTableKeyDecoder::NextKey(uint32_t start_offset,
                                      ParsedInternalKey* parsed_key,
-                                     Slice* internal_key, size_t* bytes_read,
-                                     bool* seekable) {
+                                     Slice* internal_key, Slice* value,
+                                     uint32_t* bytes_read, bool* seekable) {
+  assert(value != nullptr);
+  Status s = NextKeyNoValue(start_offset, parsed_key, internal_key, bytes_read,
+                            seekable);
+  if (s.ok()) {
+    assert(bytes_read != nullptr);
+    uint32_t value_size;
+    uint32_t value_size_bytes;
+    bool success = file_reader_.ReadVarint32(start_offset + *bytes_read,
+                                             &value_size, &value_size_bytes);
+    if (!success) {
+      return file_reader_.status();
+    }
+    if (value_size_bytes == 0) {
+      return Status::Corruption(
+          "Unexpected EOF when reading the next value's size.");
+    }
+    *bytes_read += value_size_bytes;
+    success = file_reader_.Read(start_offset + *bytes_read, value_size, value);
+    if (!success) {
+      return file_reader_.status();
+    }
+    *bytes_read += value_size;
+  }
+  return s;
+}
+
+Status PlainTableKeyDecoder::NextKeyNoValue(uint32_t start_offset,
+                                            ParsedInternalKey* parsed_key,
+                                            Slice* internal_key,
+                                            uint32_t* bytes_read,
+                                            bool* seekable) {
   *bytes_read = 0;
   if (seekable != nullptr) {
     *seekable = true;
   }
+  Status s;
   if (encoding_type_ == kPlain) {
-    return NextPlainEncodingKey(start, limit, parsed_key, internal_key,
+    return NextPlainEncodingKey(start_offset, parsed_key, internal_key,
                                 bytes_read, seekable);
   } else {
     assert(encoding_type_ == kPrefix);
-    return NextPrefixEncodingKey(start, limit, parsed_key, internal_key,
+    return NextPrefixEncodingKey(start_offset, parsed_key, internal_key,
                                  bytes_read, seekable);
   }
 }
 
 }  // namespace rocksdb
-#endif  // ROCKSDB_LITE
+#endif  // ROCKSDB_LIT
diff --git a/src/rocksdb/table/plain_table_key_coding.h b/src/rocksdb/table/plain_table_key_coding.h
index 9047087..a98010d 100644
--- a/src/rocksdb/table/plain_table_key_coding.h
+++ b/src/rocksdb/table/plain_table_key_coding.h
@@ -13,6 +13,8 @@ namespace rocksdb {
 
 class WritableFile;
 struct ParsedInternalKey;
+struct PlainTableReaderFileInfo;
+enum PlainTableEntryType : unsigned char;
 
 // Helper class to write out a key to an output file
 // Actual data format of the key is documented in plain_table_factory.h
@@ -34,7 +36,7 @@ class PlainTableKeyEncoder {
   // meta_bytes_buf: buffer for extra meta bytes
   // meta_bytes_buf_size: offset to append extra meta bytes. Will be updated
   //                      if meta_bytes_buf is updated.
-  Status AppendKey(const Slice& key, WritableFile* file, uint64_t* offset,
+  Status AppendKey(const Slice& key, WritableFileWriter* file, uint64_t* offset,
                    char* meta_bytes_buf, size_t* meta_bytes_buf_size);
 
   // Return actual encoding type to be picked
@@ -53,10 +55,12 @@ class PlainTableKeyEncoder {
 // Actual data format of the key is documented in plain_table_factory.h
 class PlainTableKeyDecoder {
  public:
-  explicit PlainTableKeyDecoder(EncodingType encoding_type,
+  explicit PlainTableKeyDecoder(const PlainTableReaderFileInfo* file_info,
+                                EncodingType encoding_type,
                                 uint32_t user_key_len,
                                 const SliceTransform* prefix_extractor)
-      : encoding_type_(encoding_type),
+      : file_reader_(file_info),
+        encoding_type_(encoding_type),
         prefix_len_(0),
         fixed_user_key_len_(user_key_len),
         prefix_extractor_(prefix_extractor),
@@ -70,9 +74,51 @@ class PlainTableKeyDecoder {
   // bytes_read: how many bytes read from start. Output
   // seekable: whether key can be read from this place. Used when building
   //           indexes. Output.
-  Status NextKey(const char* start, const char* limit,
-                 ParsedInternalKey* parsed_key, Slice* internal_key,
-                 size_t* bytes_read, bool* seekable = nullptr);
+  Status NextKey(uint32_t start_offset, ParsedInternalKey* parsed_key,
+                 Slice* internal_key, Slice* value, uint32_t* bytes_read,
+                 bool* seekable = nullptr);
+
+  Status NextKeyNoValue(uint32_t start_offset, ParsedInternalKey* parsed_key,
+                        Slice* internal_key, uint32_t* bytes_read,
+                        bool* seekable = nullptr);
+
+  class FileReader {
+   public:
+    explicit FileReader(const PlainTableReaderFileInfo* file_info)
+        : file_info_(file_info),
+          buf_start_offset_(0),
+          buf_len_(0),
+          buf_capacity_(0) {}
+    // In mmap mode, the results point into the mmapped area of the file, so
+    // they remain valid until the file is closed.
+    // In non-mmap mode, the results point to an internal buffer. If the caller
+    // makes another read call, the results will no longer be valid, so callers
+    // should make a copy when needed.
+    // If return false, status code is stored in status_.
+    inline bool Read(uint32_t file_offset, uint32_t len, Slice* output);
+
+    // If return false, status code is stored in status_.
+    bool ReadNonMmap(uint32_t file_offset, uint32_t len, Slice* output);
+
+    // *bytes_read == 0 means EOF. A false return means failure; the status is
+    // saved in status_. We avoid returning Status directly, to save copying
+    // the status object and match the previous performance of mmap mode.
+    inline bool ReadVarint32(uint32_t offset, uint32_t* output,
+                             uint32_t* bytes_read);
+
+    bool ReadVarint32NonMmap(uint32_t offset, uint32_t* output,
+                             uint32_t* bytes_read);
+
+    Status status() const { return status_; }
+
+    const PlainTableReaderFileInfo* file_info_;
+    std::unique_ptr<char[]> buf_;
+    uint32_t buf_start_offset_;
+    uint32_t buf_len_;
+    uint32_t buf_capacity_;
+    Status status_;
+  };
+  FileReader file_reader_;
   EncodingType encoding_type_;
   uint32_t prefix_len_;
   uint32_t fixed_user_key_len_;
@@ -82,14 +128,20 @@ class PlainTableKeyDecoder {
   bool in_prefix_;
 
  private:
-  Status NextPlainEncodingKey(const char* start, const char* limit,
+  Status NextPlainEncodingKey(uint32_t start_offset,
                               ParsedInternalKey* parsed_key,
-                              Slice* internal_key, size_t* bytes_read,
+                              Slice* internal_key, uint32_t* bytes_read,
                               bool* seekable = nullptr);
-  Status NextPrefixEncodingKey(const char* start, const char* limit,
+  Status NextPrefixEncodingKey(uint32_t start_offset,
                                ParsedInternalKey* parsed_key,
-                               Slice* internal_key, size_t* bytes_read,
+                               Slice* internal_key, uint32_t* bytes_read,
                                bool* seekable = nullptr);
+  Status ReadInternalKey(uint32_t file_offset, uint32_t user_key_size,
+                         ParsedInternalKey* parsed_key, uint32_t* bytes_read,
+                         bool* internal_key_valid, Slice* internal_key);
+  inline Status DecodeSize(uint32_t start_offset,
+                           PlainTableEntryType* entry_type, uint32_t* key_size,
+                           uint32_t* bytes_read);
 };
 
 }  // namespace rocksdb
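
The FileReader helper declared above hides the mmap/non-mmap split behind one
Read() call. A minimal sketch of that dispatch, assuming the member names
declared in the class (an illustration, not the verbatim inline
implementation):

    // Sketch of FileReader::Read: in mmap mode the output slice points
    // directly into the mapped file data; otherwise fall back to
    // ReadNonMmap(), which fills the reader's internal buffer.
    inline bool PlainTableKeyDecoder::FileReader::Read(uint32_t file_offset,
                                                       uint32_t len,
                                                       Slice* output) {
      if (file_info_->is_mmap_mode) {
        assert(file_offset + len <= file_info_->data_end_offset);
        *output = Slice(file_info_->file_data.data() + file_offset, len);
        return true;
      }
      return ReadNonMmap(file_offset, len, output);
    }
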
diff --git a/src/rocksdb/table/plain_table_reader.cc b/src/rocksdb/table/plain_table_reader.cc
index c409204..1aabbb9 100644
--- a/src/rocksdb/table/plain_table_reader.cc
+++ b/src/rocksdb/table/plain_table_reader.cc
@@ -90,7 +90,7 @@ class PlainTableIterator : public Iterator {
 
 extern const uint64_t kPlainTableMagicNumber;
 PlainTableReader::PlainTableReader(const ImmutableCFOptions& ioptions,
-                                   unique_ptr<RandomAccessFile>&& file,
+                                   unique_ptr<RandomAccessFileReader>&& file,
                                    const EnvOptions& storage_options,
                                    const InternalKeyComparator& icomparator,
                                    EncodingType encoding_type,
@@ -99,13 +99,13 @@ PlainTableReader::PlainTableReader(const ImmutableCFOptions& ioptions,
     : internal_comparator_(icomparator),
       encoding_type_(encoding_type),
       full_scan_mode_(false),
-      data_end_offset_(static_cast<uint32_t>(table_properties->data_size)),
       user_key_len_(static_cast<uint32_t>(table_properties->fixed_key_len)),
       prefix_extractor_(ioptions.prefix_extractor),
       enable_bloom_(false),
       bloom_(6, nullptr),
+      file_info_(std::move(file), storage_options,
+                 static_cast<uint32_t>(table_properties->data_size)),
       ioptions_(ioptions),
-      file_(std::move(file)),
       file_size_(file_size),
       table_properties_(nullptr) {}
 
@@ -115,13 +115,12 @@ PlainTableReader::~PlainTableReader() {
 Status PlainTableReader::Open(const ImmutableCFOptions& ioptions,
                               const EnvOptions& env_options,
                               const InternalKeyComparator& internal_comparator,
-                              unique_ptr<RandomAccessFile>&& file,
+                              unique_ptr<RandomAccessFileReader>&& file,
                               uint64_t file_size,
                               unique_ptr<TableReader>* table_reader,
                               const int bloom_bits_per_key,
                               double hash_table_ratio, size_t index_sparseness,
                               size_t huge_page_tlb_size, bool full_scan_mode) {
-  assert(ioptions.allow_mmap_reads);
   if (file_size > PlainTableIndex::kMaxFileSize) {
     return Status::NotSupported("File is too large for PlainTableReader!");
   }
@@ -163,7 +162,7 @@ Status PlainTableReader::Open(const ImmutableCFOptions& ioptions,
       ioptions, std::move(file), env_options, internal_comparator,
       encoding_type, file_size, props));
 
-  s = new_reader->MmapDataFile();
+  s = new_reader->MmapDataIfNeeded();
   if (!s.ok()) {
     return s;
   }
@@ -204,13 +203,14 @@ Iterator* PlainTableReader::NewIterator(const ReadOptions& options,
 Status PlainTableReader::PopulateIndexRecordList(
     PlainTableIndexBuilder* index_builder, vector<uint32_t>* prefix_hashes) {
   Slice prev_key_prefix_slice;
+  std::string prev_key_prefix_buf;
   uint32_t pos = data_start_offset_;
 
   bool is_first_record = true;
   Slice key_prefix_slice;
-  PlainTableKeyDecoder decoder(encoding_type_, user_key_len_,
+  PlainTableKeyDecoder decoder(&file_info_, encoding_type_, user_key_len_,
                                ioptions_.prefix_extractor);
-  while (pos < data_end_offset_) {
+  while (pos < file_info_.data_end_offset) {
     uint32_t key_offset = pos;
     ParsedInternalKey key;
     Slice value_slice;
@@ -228,7 +228,12 @@ Status PlainTableReader::PopulateIndexRecordList(
         if (!is_first_record) {
           prefix_hashes->push_back(GetSliceHash(prev_key_prefix_slice));
         }
-        prev_key_prefix_slice = key_prefix_slice;
+        if (file_info_.is_mmap_mode) {
+          prev_key_prefix_slice = key_prefix_slice;
+        } else {
+          prev_key_prefix_buf = key_prefix_slice.ToString();
+          prev_key_prefix_slice = prev_key_prefix_buf;
+        }
       }
     }
 
@@ -268,9 +273,12 @@ void PlainTableReader::FillBloom(vector<uint32_t>* prefix_hashes) {
   }
 }
 
-Status PlainTableReader::MmapDataFile() {
-  // Get mmapped memory to file_data_.
-  return file_->Read(0, file_size_, &file_data_, nullptr);
+Status PlainTableReader::MmapDataIfNeeded() {
+  if (file_info_.is_mmap_mode) {
+    // Get mmapped memory.
+    return file_info_.file->Read(0, file_size_, &file_info_.file_data, nullptr);
+  }
+  return Status::OK();
 }
 
 Status PlainTableReader::PopulateIndex(TableProperties* props,
@@ -282,31 +290,37 @@ Status PlainTableReader::PopulateIndex(TableProperties* props,
   table_properties_.reset(props);
 
   BlockContents bloom_block_contents;
-  auto s = ReadMetaBlock(file_.get(), file_size_, kPlainTableMagicNumber,
-                         ioptions_.env, BloomBlockBuilder::kBloomBlock,
-                         &bloom_block_contents);
+  auto s = ReadMetaBlock(file_info_.file.get(), file_size_,
+                         kPlainTableMagicNumber, ioptions_.env,
+                         BloomBlockBuilder::kBloomBlock, &bloom_block_contents);
   bool index_in_file = s.ok();
 
   BlockContents index_block_contents;
-  s = ReadMetaBlock(file_.get(), file_size_, kPlainTableMagicNumber,
-      ioptions_.env, PlainTableIndexBuilder::kPlainTableIndexBlock,
-      &index_block_contents);
+  s = ReadMetaBlock(
+      file_info_.file.get(), file_size_, kPlainTableMagicNumber, ioptions_.env,
+      PlainTableIndexBuilder::kPlainTableIndexBlock, &index_block_contents);
 
   index_in_file &= s.ok();
 
   Slice* bloom_block;
   if (index_in_file) {
+    // If bloom_block_contents.allocation is not empty (which will be the case
+    // for non-mmap mode), it holds the allocated memory for the bloom block.
+    // It needs to be kept alive to keep `bloom_block` valid.
+    bloom_block_alloc_ = std::move(bloom_block_contents.allocation);
     bloom_block = &bloom_block_contents.data;
   } else {
     bloom_block = nullptr;
   }
 
   // index_in_file == true only if there are kBloomBlock and
-  // kPlainTableIndexBlock
-  // in file
-
+  // kPlainTableIndexBlock in file
   Slice* index_block;
   if (index_in_file) {
+    // If index_block_contents.allocation is not empty (which will be the case
+    // for non-mmap mode), it holds the allocated memory for the index block.
+    // It needs to be kept alive to keep `index_block` valid.
+    index_block_alloc_ = std::move(index_block_contents.allocation);
     index_block = &index_block_contents.data;
   } else {
     index_block = nullptr;
@@ -401,7 +415,7 @@ Status PlainTableReader::GetOffset(const Slice& target, const Slice& prefix,
   uint32_t prefix_index_offset;
   auto res = index_.GetOffset(prefix_hash, &prefix_index_offset);
   if (res == PlainTableIndex::kNoPrefixForBucket) {
-    *offset = data_end_offset_;
+    *offset = file_info_.data_end_offset;
     return Status::OK();
   } else if (res == PlainTableIndex::kDirectToFile) {
     *offset = prefix_index_offset;
@@ -420,16 +434,15 @@ Status PlainTableReader::GetOffset(const Slice& target, const Slice& prefix,
     return Status::Corruption(Slice());
   }
 
+  PlainTableKeyDecoder decoder(&file_info_, encoding_type_, user_key_len_,
+                               ioptions_.prefix_extractor);
+
   // The key is between [low, high). Do a binary search within it.
   while (high - low > 1) {
     uint32_t mid = (high + low) / 2;
     uint32_t file_offset = GetFixed32Element(base_ptr, mid);
-    size_t tmp;
-    Status s = PlainTableKeyDecoder(encoding_type_, user_key_len_,
-                                    ioptions_.prefix_extractor)
-                   .NextKey(file_data_.data() + file_offset,
-                            file_data_.data() + data_end_offset_, &mid_key,
-                            nullptr, &tmp);
+    uint32_t tmp;
+    Status s = decoder.NextKeyNoValue(file_offset, &mid_key, nullptr, &tmp);
     if (!s.ok()) {
       return s;
     }
@@ -452,13 +465,9 @@ Status PlainTableReader::GetOffset(const Slice& target, const Slice& prefix,
   // prefix as target. We need to rule out one of them to avoid going
   // to the wrong prefix.
   ParsedInternalKey low_key;
-  size_t tmp;
+  uint32_t tmp;
   uint32_t low_key_offset = GetFixed32Element(base_ptr, low);
-  Status s = PlainTableKeyDecoder(encoding_type_, user_key_len_,
-                                  ioptions_.prefix_extractor)
-                 .NextKey(file_data_.data() + low_key_offset,
-                          file_data_.data() + data_end_offset_, &low_key,
-                          nullptr, &tmp);
+  Status s = decoder.NextKeyNoValue(low_key_offset, &low_key, nullptr, &tmp);
   if (!s.ok()) {
     return s;
   }
@@ -473,50 +482,45 @@ Status PlainTableReader::GetOffset(const Slice& target, const Slice& prefix,
   } else {
     // target is larger than a key of the last prefix in this bucket
     // but with a different prefix. Key does not exist.
-    *offset = data_end_offset_;
+    *offset = file_info_.data_end_offset;
   }
   return Status::OK();
 }
 
 bool PlainTableReader::MatchBloom(uint32_t hash) const {
-  return !enable_bloom_ || bloom_.MayContainHash(hash);
-}
+  if (!enable_bloom_) {
+    return true;
+  }
 
+  if (bloom_.MayContainHash(hash)) {
+    PERF_COUNTER_ADD(bloom_sst_hit_count, 1);
+    return true;
+  } else {
+    PERF_COUNTER_ADD(bloom_sst_miss_count, 1);
+    return false;
+  }
+}
 
 Status PlainTableReader::Next(PlainTableKeyDecoder* decoder, uint32_t* offset,
                               ParsedInternalKey* parsed_key,
                               Slice* internal_key, Slice* value,
                               bool* seekable) const {
-  if (*offset == data_end_offset_) {
-    *offset = data_end_offset_;
+  if (*offset == file_info_.data_end_offset) {
+    *offset = file_info_.data_end_offset;
     return Status::OK();
   }
 
-  if (*offset > data_end_offset_) {
+  if (*offset > file_info_.data_end_offset) {
     return Status::Corruption("Offset is out of file size");
   }
 
-  const char* start = file_data_.data() + *offset;
-  size_t bytes_for_key;
-  Status s =
-      decoder->NextKey(start, file_data_.data() + data_end_offset_, parsed_key,
-                       internal_key, &bytes_for_key, seekable);
+  uint32_t bytes_read;
+  Status s = decoder->NextKey(*offset, parsed_key, internal_key, value,
+                              &bytes_read, seekable);
   if (!s.ok()) {
     return s;
   }
-  uint32_t value_size;
-  const char* value_ptr = GetVarint32Ptr(
-      start + bytes_for_key, file_data_.data() + data_end_offset_, &value_size);
-  if (value_ptr == nullptr) {
-    return Status::Corruption(
-        "Unexpected EOF when reading the next value's size.");
-  }
-  *offset = *offset + static_cast<uint32_t>(value_ptr - start) + value_size;
-  if (*offset > data_end_offset_) {
-    return Status::Corruption("Unexpected EOF when reading the next value. ");
-  }
-  *value = Slice(value_ptr, value_size);
-
+  *offset = *offset + bytes_read;
   return Status::OK();
 }
 
@@ -556,6 +560,7 @@ Status PlainTableReader::Get(const ReadOptions& ro, const Slice& target,
   bool prefix_match;
   Status s =
       GetOffset(target, prefix_slice, prefix_hash, prefix_match, &offset);
+
   if (!s.ok()) {
     return s;
   }
@@ -565,9 +570,9 @@ Status PlainTableReader::Get(const ReadOptions& ro, const Slice& target,
     return Status::Corruption(Slice());
   }
   Slice found_value;
-  PlainTableKeyDecoder decoder(encoding_type_, user_key_len_,
+  PlainTableKeyDecoder decoder(&file_info_, encoding_type_, user_key_len_,
                                ioptions_.prefix_extractor);
-  while (offset < data_end_offset_) {
+  while (offset < file_info_.data_end_offset) {
     s = Next(&decoder, &offset, &found_key, nullptr, &found_value);
     if (!s.ok()) {
       return s;
@@ -598,24 +603,24 @@ uint64_t PlainTableReader::ApproximateOffsetOf(const Slice& key) {
 PlainTableIterator::PlainTableIterator(PlainTableReader* table,
                                        bool use_prefix_seek)
     : table_(table),
-      decoder_(table_->encoding_type_, table_->user_key_len_,
-               table_->prefix_extractor_),
+      decoder_(&table_->file_info_, table_->encoding_type_,
+               table_->user_key_len_, table_->prefix_extractor_),
       use_prefix_seek_(use_prefix_seek) {
-  next_offset_ = offset_ = table_->data_end_offset_;
+  next_offset_ = offset_ = table_->file_info_.data_end_offset;
 }
 
 PlainTableIterator::~PlainTableIterator() {
 }
 
 bool PlainTableIterator::Valid() const {
-  return offset_ < table_->data_end_offset_
-      && offset_ >= table_->data_start_offset_;
+  return offset_ < table_->file_info_.data_end_offset &&
+         offset_ >= table_->data_start_offset_;
 }
 
 void PlainTableIterator::SeekToFirst() {
   next_offset_ = table_->data_start_offset_;
-  if (next_offset_ >= table_->data_end_offset_) {
-    next_offset_ = offset_ = table_->data_end_offset_;
+  if (next_offset_ >= table_->file_info_.data_end_offset) {
+    next_offset_ = offset_ = table_->file_info_.data_end_offset;
   } else {
     Next();
   }
@@ -633,14 +638,14 @@ void PlainTableIterator::Seek(const Slice& target) {
     if (table_->full_scan_mode_) {
       status_ =
           Status::InvalidArgument("Seek() is not allowed in full scan mode.");
-      offset_ = next_offset_ = table_->data_end_offset_;
+      offset_ = next_offset_ = table_->file_info_.data_end_offset;
       return;
     } else if (table_->GetIndexSize() > 1) {
       assert(false);
       status_ = Status::NotSupported(
           "PlainTable cannot issue non-prefix seek unless in total order "
           "mode.");
-      offset_ = next_offset_ = table_->data_end_offset_;
+      offset_ = next_offset_ = table_->file_info_.data_end_offset;
       return;
     }
   }
@@ -651,7 +656,7 @@ void PlainTableIterator::Seek(const Slice& target) {
   if (!table_->IsTotalOrderMode()) {
     prefix_hash = GetSliceHash(prefix_slice);
     if (!table_->MatchBloom(prefix_hash)) {
-      offset_ = next_offset_ = table_->data_end_offset_;
+      offset_ = next_offset_ = table_->file_info_.data_end_offset;
       return;
     }
   }
@@ -659,16 +664,16 @@ void PlainTableIterator::Seek(const Slice& target) {
   status_ = table_->GetOffset(target, prefix_slice, prefix_hash, prefix_match,
                               &next_offset_);
   if (!status_.ok()) {
-    offset_ = next_offset_ = table_->data_end_offset_;
+    offset_ = next_offset_ = table_->file_info_.data_end_offset;
     return;
   }
 
-  if (next_offset_ < table_-> data_end_offset_) {
+  if (next_offset_ < table_->file_info_.data_end_offset) {
     for (Next(); status_.ok() && Valid(); Next()) {
       if (!prefix_match) {
         // Need to verify the first key's prefix
         if (table_->GetPrefix(key()) != prefix_slice) {
-          offset_ = next_offset_ = table_->data_end_offset_;
+          offset_ = next_offset_ = table_->file_info_.data_end_offset;
           break;
         }
         prefix_match = true;
@@ -678,19 +683,19 @@ void PlainTableIterator::Seek(const Slice& target) {
       }
     }
   } else {
-    offset_ = table_->data_end_offset_;
+    offset_ = table_->file_info_.data_end_offset;
   }
 }
 
 void PlainTableIterator::Next() {
   offset_ = next_offset_;
-  if (offset_ < table_->data_end_offset_) {
+  if (offset_ < table_->file_info_.data_end_offset) {
     Slice tmp_slice;
     ParsedInternalKey parsed_key;
     status_ =
         table_->Next(&decoder_, &next_offset_, &parsed_key, &key_, &value_);
     if (!status_.ok()) {
-      offset_ = next_offset_ = table_->data_end_offset_;
+      offset_ = next_offset_ = table_->file_info_.data_end_offset;
     }
   }
 }
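
The reader-side changes above all funnel through the new offset-based
NextKey(). A hedged sketch of a full scan in the style of
PopulateIndexRecordList() (the loop and variable names are illustrative):

    // Scan all entries with the offset-based decoder. Assumes a constructed
    // PlainTableKeyDecoder `decoder` and the data range
    // [data_start_offset, data_end_offset) from PlainTableReaderFileInfo.
    uint32_t pos = data_start_offset;
    while (pos < data_end_offset) {
      ParsedInternalKey parsed_key;
      Slice internal_key;
      Slice value;
      uint32_t bytes_read = 0;
      Status s = decoder.NextKey(pos, &parsed_key, &internal_key, &value,
                                 &bytes_read);
      if (!s.ok()) {
        break;  // s carries the corruption details
      }
      pos += bytes_read;  // bytes_read covers both the key and the value
    }
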
diff --git a/src/rocksdb/table/plain_table_reader.h b/src/rocksdb/table/plain_table_reader.h
index b4f68a0..b9d8ceb 100644
--- a/src/rocksdb/table/plain_table_reader.h
+++ b/src/rocksdb/table/plain_table_reader.h
@@ -22,6 +22,7 @@
 #include "table/plain_table_index.h"
 #include "util/arena.h"
 #include "util/dynamic_bloom.h"
+#include "util/file_reader_writer.h"
 
 namespace rocksdb {
 
@@ -43,6 +44,20 @@ using std::unordered_map;
 using std::vector;
 extern const uint32_t kPlainTableVariableLength;
 
+struct PlainTableReaderFileInfo {
+  bool is_mmap_mode;
+  Slice file_data;
+  uint32_t data_end_offset;
+  unique_ptr<RandomAccessFileReader> file;
+
+  PlainTableReaderFileInfo(unique_ptr<RandomAccessFileReader>&& _file,
+                           const EnvOptions& storage_options,
+                           uint32_t _data_size_offset)
+      : is_mmap_mode(storage_options.use_mmap_reads),
+        data_end_offset(_data_size_offset),
+        file(std::move(_file)) {}
+};
+
 // Based on the following output file format shown in plain_table_factory.h.
 // When opening the output file, IndexedTableReader creates a hash table
 // from key prefixes to offsets in the output file. IndexedTable will decide
@@ -56,8 +71,8 @@ class PlainTableReader: public TableReader {
   static Status Open(const ImmutableCFOptions& ioptions,
                      const EnvOptions& env_options,
                      const InternalKeyComparator& internal_comparator,
-                     unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
-                     unique_ptr<TableReader>* table,
+                     unique_ptr<RandomAccessFileReader>&& file,
+                     uint64_t file_size, unique_ptr<TableReader>* table,
                      const int bloom_bits_per_key, double hash_table_ratio,
                      size_t index_sparseness, size_t huge_page_tlb_size,
                      bool full_scan_mode);
@@ -83,7 +98,7 @@ class PlainTableReader: public TableReader {
   }
 
   PlainTableReader(const ImmutableCFOptions& ioptions,
-                   unique_ptr<RandomAccessFile>&& file,
+                   unique_ptr<RandomAccessFileReader>&& file,
                    const EnvOptions& env_options,
                    const InternalKeyComparator& internal_comparator,
                    EncodingType encoding_type, uint64_t file_size,
@@ -107,14 +122,13 @@ class PlainTableReader: public TableReader {
                        double hash_table_ratio, size_t index_sparseness,
                        size_t huge_page_tlb_size);
 
-  Status MmapDataFile();
+  Status MmapDataIfNeeded();
 
  private:
   const InternalKeyComparator internal_comparator_;
   EncodingType encoding_type_;
   // represents plain table's current status.
   Status status_;
-  Slice file_data_;
 
   PlainTableIndex index_;
   bool full_scan_mode_;
@@ -122,7 +136,6 @@ class PlainTableReader: public TableReader {
   // data_start_offset_ and data_end_offset_ define the range of the
   // sst file that stores data.
   const uint32_t data_start_offset_ = 0;
-  const uint32_t data_end_offset_;
   const uint32_t user_key_len_;
   const SliceTransform* prefix_extractor_;
 
@@ -131,10 +144,12 @@ class PlainTableReader: public TableReader {
   // Bloom filter is used to rule out non-existent key
   bool enable_bloom_;
   DynamicBloom bloom_;
+  PlainTableReaderFileInfo file_info_;
   Arena arena_;
+  std::unique_ptr<char[]> index_block_alloc_;
+  std::unique_ptr<char[]> bloom_block_alloc_;
 
   const ImmutableCFOptions& ioptions_;
-  unique_ptr<RandomAccessFile> file_;
   uint64_t file_size_;
   std::shared_ptr<const TableProperties> table_properties_;
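
PlainTableReaderFileInfo, added above, is the small bundle the decoder reads
through: the mode flag is taken from EnvOptions::use_mmap_reads, and
file_data is only populated later by MmapDataIfNeeded(). A construction
sketch mirroring the reader's initializer list (argument names as in the
constructor shown earlier in this patch):

    // How the reader builds its file_info_ member; `file`, `storage_options`
    // and `table_properties` stand in for the constructor arguments.
    PlainTableReaderFileInfo file_info(
        std::move(file), storage_options,
        static_cast<uint32_t>(table_properties->data_size));
    // file_info.is_mmap_mode == storage_options.use_mmap_reads
    // file_info.file_data stays empty until MmapDataIfNeeded() maps the file.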
 
diff --git a/src/rocksdb/table/sst_file_writer.cc b/src/rocksdb/table/sst_file_writer.cc
new file mode 100644
index 0000000..d780f0a
--- /dev/null
+++ b/src/rocksdb/table/sst_file_writer.cc
@@ -0,0 +1,188 @@
+//  Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include "rocksdb/sst_file_writer.h"
+
+#include <vector>
+#include "db/dbformat.h"
+#include "rocksdb/table.h"
+#include "table/block_based_table_builder.h"
+#include "util/file_reader_writer.h"
+#include "util/string_util.h"
+
+namespace rocksdb {
+
+const std::string ExternalSstFilePropertyNames::kVersion =
+    "rocksdb.external_sst_file.version";
+
+// PropertiesCollector used to add properties specific to tables
+// generated by SstFileWriter
+class SstFileWriter::SstFileWriterPropertiesCollector
+    : public IntTblPropCollector {
+ public:
+  explicit SstFileWriterPropertiesCollector(int32_t version)
+      : version_(version) {}
+
+  virtual Status InternalAdd(const Slice& key, const Slice& value,
+                             uint64_t file_size) override {
+    // Intentionally left blank. We have no interest in collecting stats for
+    // individual key/value pairs.
+    return Status::OK();
+  }
+
+  virtual Status Finish(UserCollectedProperties* properties) override {
+    std::string version_val;
+    PutFixed32(&version_val, static_cast<int32_t>(version_));
+    properties->insert({ExternalSstFilePropertyNames::kVersion, version_val});
+    return Status::OK();
+  }
+
+  virtual const char* Name() const override {
+    return "SstFileWriterPropertiesCollector";
+  }
+
+  virtual UserCollectedProperties GetReadableProperties() const override {
+    return {{ExternalSstFilePropertyNames::kVersion, ToString(version_)}};
+  }
+
+ private:
+  int32_t version_;
+};
+
+class SstFileWriter::SstFileWriterPropertiesCollectorFactory
+    : public IntTblPropCollectorFactory {
+ public:
+  explicit SstFileWriterPropertiesCollectorFactory(int32_t version)
+      : version_(version) {}
+
+  virtual IntTblPropCollector* CreateIntTblPropCollector() override {
+    return new SstFileWriterPropertiesCollector(version_);
+  }
+
+  virtual const char* Name() const override {
+    return "SstFileWriterPropertiesCollector";
+  }
+
+ private:
+  int32_t version_;
+};
+
+struct SstFileWriter::Rep {
+  Rep(const EnvOptions& _env_options, const ImmutableCFOptions& _ioptions,
+      const Comparator* _user_comparator)
+      : env_options(_env_options),
+        ioptions(_ioptions),
+        internal_comparator(_user_comparator) {}
+
+  std::unique_ptr<WritableFileWriter> file_writer;
+  std::unique_ptr<TableBuilder> builder;
+  EnvOptions env_options;
+  ImmutableCFOptions ioptions;
+  InternalKeyComparator internal_comparator;
+  ExternalSstFileInfo file_info;
+};
+
+SstFileWriter::SstFileWriter(const EnvOptions& env_options,
+                             const ImmutableCFOptions& ioptions,
+                             const Comparator* user_comparator)
+    : rep_(new Rep(env_options, ioptions, user_comparator)) {}
+
+SstFileWriter::~SstFileWriter() { delete rep_; }
+
+Status SstFileWriter::Open(const std::string& file_path) {
+  Rep* r = rep_;
+  Status s;
+  std::unique_ptr<WritableFile> sst_file;
+  s = r->ioptions.env->NewWritableFile(file_path, &sst_file, r->env_options);
+  if (!s.ok()) {
+    return s;
+  }
+
+  CompressionType compression_type = r->ioptions.compression;
+  if (!r->ioptions.compression_per_level.empty()) {
+    // Use the compression of the last level if we have per-level compression.
+    compression_type = *(r->ioptions.compression_per_level.rbegin());
+  }
+
+  std::vector<std::unique_ptr<IntTblPropCollectorFactory>>
+      int_tbl_prop_collector_factories;
+  int_tbl_prop_collector_factories.emplace_back(
+      new SstFileWriterPropertiesCollectorFactory(1 /* version */));
+
+  TableBuilderOptions table_builder_options(
+      r->ioptions, r->internal_comparator, &int_tbl_prop_collector_factories,
+      compression_type, r->ioptions.compression_opts, false);
+  r->file_writer.reset(
+      new WritableFileWriter(std::move(sst_file), r->env_options));
+  r->builder.reset(r->ioptions.table_factory->NewTableBuilder(
+      table_builder_options, r->file_writer.get()));
+
+  r->file_info.file_path = file_path;
+  r->file_info.file_size = 0;
+  r->file_info.num_entries = 0;
+  r->file_info.sequence_number = 0;
+  r->file_info.version = 1;
+  return s;
+}
+
+Status SstFileWriter::Add(const Slice& user_key, const Slice& value) {
+  Rep* r = rep_;
+  if (!r->builder) {
+    return Status::InvalidArgument("File is not opened");
+  }
+
+  if (r->file_info.num_entries == 0) {
+    r->file_info.smallest_key = user_key.ToString();
+  } else {
+    if (r->internal_comparator.user_comparator()->Compare(
+            user_key, r->file_info.largest_key) <= 0) {
+      // Make sure that keys are added in order
+      return Status::InvalidArgument("Keys must be added in order");
+    }
+  }
+
+  // update file info
+  r->file_info.num_entries++;
+  r->file_info.largest_key = user_key.ToString();
+  r->file_info.file_size = r->builder->FileSize();
+
+  InternalKey ikey(user_key, 0 /* Sequence Number */,
+                   ValueType::kTypeValue /* Put */);
+  r->builder->Add(ikey.Encode(), value);
+
+  return Status::OK();
+}
+
+Status SstFileWriter::Finish(ExternalSstFileInfo* file_info) {
+  Rep* r = rep_;
+  if (!r->builder) {
+    return Status::InvalidArgument("File is not opened");
+  }
+
+  Status s = r->builder->Finish();
+  if (s.ok()) {
+    if (!r->ioptions.disable_data_sync) {
+      s = r->file_writer->Sync(r->ioptions.use_fsync);
+    }
+    if (s.ok()) {
+      s = r->file_writer->Close();
+    }
+  } else {
+    r->builder->Abandon();
+  }
+
+  if (!s.ok()) {
+    r->ioptions.env->DeleteFile(r->file_info.file_path);
+  }
+
+  if (s.ok() && file_info != nullptr) {
+    r->file_info.file_size = r->builder->FileSize();
+    *file_info = r->file_info;
+  }
+
+  r->builder.reset();
+  return s;
+}
+}  // namespace rocksdb
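
The new SstFileWriter is a small build-an-SST-offline API: Open() a path,
Add() keys in comparator order, then Finish(). A usage sketch under assumed
default options (the option plumbing and the file path are illustrative):

    // Write a two-entry external SST file with the new API.
    Options options;                          // assumed defaults
    ImmutableCFOptions ioptions(options);
    SstFileWriter writer(EnvOptions(), ioptions, options.comparator);

    Status s = writer.Open("/tmp/example.sst");  // hypothetical path
    if (s.ok()) s = writer.Add("k1", "v1");
    if (s.ok()) s = writer.Add("k2", "v2");      // keys must be ascending
    ExternalSstFileInfo info;
    if (s.ok()) s = writer.Finish(&info);        // info.num_entries == 2
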
diff --git a/src/rocksdb/table/table_builder.h b/src/rocksdb/table/table_builder.h
index 19da4c2..55a1077 100644
--- a/src/rocksdb/table/table_builder.h
+++ b/src/rocksdb/table/table_builder.h
@@ -9,12 +9,14 @@
 
 #pragma once
 
+#include <stdint.h>
 #include <string>
 #include <utility>
 #include <vector>
 #include "db/table_properties_collector.h"
 #include "rocksdb/options.h"
 #include "rocksdb/table_properties.h"
+#include "util/file_reader_writer.h"
 #include "util/mutable_cf_options.h"
 
 namespace rocksdb {
@@ -22,6 +24,19 @@ namespace rocksdb {
 class Slice;
 class Status;
 
+struct TableReaderOptions {
+  TableReaderOptions(const ImmutableCFOptions& _ioptions,
+                     const EnvOptions& _env_options,
+                     const InternalKeyComparator& _internal_comparator)
+      : ioptions(_ioptions),
+        env_options(_env_options),
+        internal_comparator(_internal_comparator) {}
+
+  const ImmutableCFOptions& ioptions;
+  const EnvOptions& env_options;
+  const InternalKeyComparator& internal_comparator;
+};
+
 struct TableBuilderOptions {
   TableBuilderOptions(
       const ImmutableCFOptions& _ioptions,
@@ -82,6 +97,10 @@ class TableBuilder {
   // Finish() call, returns the size of the final generated file.
   virtual uint64_t FileSize() const = 0;
 
+  // Whether the user-defined table properties collector suggests the file
+  // should be further compacted.
+  virtual bool NeedCompact() const { return false; }
+
   // Returns table properties
   virtual TableProperties GetTableProperties() const = 0;
 };
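
TableReaderOptions groups the references every NewTableReader implementation
needs, shortening the factory signature. The benchmark change later in this
patch shows the new call shape; a condensed sketch (file setup elided, names
assumed):

    // Open a table through the factory with the new TableReaderOptions.
    unique_ptr<RandomAccessFileReader> file_reader(
        new RandomAccessFileReader(std::move(raf)));  // raf: RandomAccessFile
    unique_ptr<TableReader> table_reader;
    Status s = options.table_factory->NewTableReader(
        TableReaderOptions(ioptions, env_options, internal_comparator),
        std::move(file_reader), file_size, &table_reader);
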
diff --git a/src/rocksdb/table/table_properties.cc b/src/rocksdb/table/table_properties.cc
index 1ee34a6..86c0843 100644
--- a/src/rocksdb/table/table_properties.cc
+++ b/src/rocksdb/table/table_properties.cc
@@ -74,6 +74,16 @@ std::string TableProperties::ToString(
   return result;
 }
 
+void TableProperties::Add(const TableProperties& tp) {
+  data_size += tp.data_size;
+  index_size += tp.index_size;
+  filter_size += tp.filter_size;
+  raw_key_size += tp.raw_key_size;
+  raw_value_size += tp.raw_value_size;
+  num_data_blocks += tp.num_data_blocks;
+  num_entries += tp.num_entries;
+}
+
 const std::string TablePropertiesNames::kDataSize  =
     "rocksdb.data.size";
 const std::string TablePropertiesNames::kIndexSize =
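
TableProperties::Add, added above, is a plain field-wise accumulator, handy
for summing statistics over a set of table files. A minimal sketch (the
container of per-file properties is hypothetical):

    // Aggregate properties across several files.
    TableProperties total;
    for (const auto& tp : per_file_properties) {  // hypothetical container
      total.Add(*tp);                             // field-wise sums
    }
    // total.num_entries, total.data_size, ... now hold the totals.
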
diff --git a/src/rocksdb/table/table_reader_bench.cc b/src/rocksdb/table/table_reader_bench.cc
index b4039aa..e3baa29 100644
--- a/src/rocksdb/table/table_reader_bench.cc
+++ b/src/rocksdb/table/table_reader_bench.cc
@@ -22,6 +22,7 @@ int main() {
 #include "table/plain_table_factory.h"
 #include "table/table_builder.h"
 #include "table/get_context.h"
+#include "util/file_reader_writer.h"
 #include "util/histogram.h"
 #include "util/testharness.h"
 #include "util/testutil.h"
@@ -78,23 +79,26 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options,
       + "/rocksdb_table_reader_benchmark";
   std::string dbname = test::TmpDir() + "/rocksdb_table_reader_bench_db";
   WriteOptions wo;
-  unique_ptr<WritableFile> file;
   Env* env = Env::Default();
   TableBuilder* tb = nullptr;
   DB* db = nullptr;
   Status s;
   const ImmutableCFOptions ioptions(opts);
+  unique_ptr<WritableFileWriter> file_writer;
   if (!through_db) {
+    unique_ptr<WritableFile> file;
     env->NewWritableFile(file_name, &file, env_options);
 
     std::vector<std::unique_ptr<IntTblPropCollectorFactory> >
         int_tbl_prop_collector_factories;
 
+    file_writer.reset(new WritableFileWriter(std::move(file), env_options));
+
     tb = opts.table_factory->NewTableBuilder(
         TableBuilderOptions(ioptions, ikc, &int_tbl_prop_collector_factories,
                             CompressionType::kNoCompression,
                             CompressionOptions(), false),
-        file.get());
+        file_writer.get());
   } else {
     s = DB::Open(opts, dbname, &db);
     ASSERT_OK(s);
@@ -113,19 +117,30 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options,
   }
   if (!through_db) {
     tb->Finish();
-    file->Close();
+    file_writer->Close();
   } else {
     db->Flush(FlushOptions());
   }
 
   unique_ptr<TableReader> table_reader;
-  unique_ptr<RandomAccessFile> raf;
   if (!through_db) {
+    unique_ptr<RandomAccessFile> raf;
     s = env->NewRandomAccessFile(file_name, &raf, env_options);
+    if (!s.ok()) {
+      fprintf(stderr, "Create File Error: %s\n", s.ToString().c_str());
+      exit(1);
+    }
     uint64_t file_size;
     env->GetFileSize(file_name, &file_size);
+    unique_ptr<RandomAccessFileReader> file_reader(
+        new RandomAccessFileReader(std::move(raf)));
     s = opts.table_factory->NewTableReader(
-        ioptions, env_options, ikc, std::move(raf), file_size, &table_reader);
+        TableReaderOptions(ioptions, env_options, ikc), std::move(file_reader),
+        file_size, &table_reader);
+    if (!s.ok()) {
+      fprintf(stderr, "Open Table Error: %s\n", s.ToString().c_str());
+      exit(1);
+    }
   }
 
   Random rnd(301);
diff --git a/src/rocksdb/table/table_test.cc b/src/rocksdb/table/table_test.cc
index 6f7b4db..e21503b 100644
--- a/src/rocksdb/table/table_test.cc
+++ b/src/rocksdb/table/table_test.cc
@@ -13,43 +13,39 @@
 #include <algorithm>
 #include <iostream>
 #include <map>
-#include <string>
 #include <memory>
+#include <string>
 #include <vector>
 
 #include "db/dbformat.h"
 #include "db/memtable.h"
 #include "db/write_batch_internal.h"
 #include "db/writebuffer.h"
-
 #include "rocksdb/cache.h"
 #include "rocksdb/db.h"
 #include "rocksdb/env.h"
 #include "rocksdb/iterator.h"
 #include "rocksdb/memtablerep.h"
+#include "rocksdb/perf_context.h"
 #include "rocksdb/slice_transform.h"
 #include "rocksdb/statistics.h"
-
 #include "table/block.h"
 #include "table/block_based_table_builder.h"
 #include "table/block_based_table_factory.h"
 #include "table/block_based_table_reader.h"
 #include "table/block_builder.h"
 #include "table/format.h"
+#include "table/get_context.h"
 #include "table/meta_blocks.h"
 #include "table/plain_table_factory.h"
-#include "table/get_context.h"
-
 #include "util/compression.h"
 #include "util/random.h"
+#include "util/scoped_arena_iterator.h"
 #include "util/statistics.h"
+#include "util/stl_wrappers.h"
 #include "util/string_util.h"
 #include "util/testharness.h"
 #include "util/testutil.h"
-#include "util/scoped_arena_iterator.h"
-
-using std::vector;
-using std::string;
 
 namespace rocksdb {
 
@@ -106,91 +102,14 @@ void Increment(const Comparator* cmp, std::string* key) {
   }
 }
 
-// An STL comparator that uses a Comparator
-struct STLLessThan {
-  const Comparator* cmp;
-
-  STLLessThan() : cmp(BytewiseComparator()) { }
-  explicit STLLessThan(const Comparator* c) : cmp(c) { }
-  bool operator()(const std::string& a, const std::string& b) const {
-    return cmp->Compare(Slice(a), Slice(b)) < 0;
-  }
-};
-
 }  // namespace
 
-class StringSink: public WritableFile {
- public:
-  ~StringSink() { }
-
-  const std::string& contents() const { return contents_; }
-
-  virtual Status Close() override { return Status::OK(); }
-  virtual Status Flush() override { return Status::OK(); }
-  virtual Status Sync() override { return Status::OK(); }
-
-  virtual Status Append(const Slice& data) override {
-    contents_.append(data.data(), data.size());
-    return Status::OK();
-  }
-
- private:
-  std::string contents_;
-};
-
-
-class StringSource: public RandomAccessFile {
- public:
-  StringSource(const Slice& contents, uint64_t uniq_id, bool mmap)
-      : contents_(contents.data(), contents.size()), uniq_id_(uniq_id),
-        mmap_(mmap) {
-  }
-
-  virtual ~StringSource() { }
-
-  uint64_t Size() const { return contents_.size(); }
-
-  virtual Status Read(uint64_t offset, size_t n, Slice* result,
-                      char* scratch) const override {
-    if (offset > contents_.size()) {
-      return Status::InvalidArgument("invalid Read offset");
-    }
-    if (offset + n > contents_.size()) {
-      n = contents_.size() - offset;
-    }
-    if (!mmap_) {
-      memcpy(scratch, &contents_[offset], n);
-      *result = Slice(scratch, n);
-    } else {
-      *result = Slice(&contents_[offset], n);
-    }
-    return Status::OK();
-  }
-
-  virtual size_t GetUniqueId(char* id, size_t max_size) const override {
-    if (max_size < 20) {
-      return 0;
-    }
-
-    char* rid = id;
-    rid = EncodeVarint64(rid, uniq_id_);
-    rid = EncodeVarint64(rid, 0);
-    return static_cast<size_t>(rid-id);
-  }
-
- private:
-  std::string contents_;
-  uint64_t uniq_id_;
-  bool mmap_;
-};
-
-typedef std::map<std::string, std::string, STLLessThan> KVMap;
-
 // Helper class for tests to unify the interface between
 // BlockBuilder/TableBuilder and Block/Table.
 class Constructor {
  public:
-  explicit Constructor(const Comparator* cmp) : data_(STLLessThan(cmp)) {}
+  explicit Constructor(const Comparator* cmp)
+      : data_(stl_wrappers::LessOfComparator(cmp)) {}
   virtual ~Constructor() { }
 
   void Add(const std::string& key, const Slice& value) {
@@ -200,18 +119,15 @@ class Constructor {
   // Finish constructing the data structure with all the keys that have
   // been added so far.  Returns the keys in sorted order in "*keys"
   // and stores the key/value pairs in "*kvmap"
-  void Finish(const Options& options,
-              const ImmutableCFOptions& ioptions,
+  void Finish(const Options& options, const ImmutableCFOptions& ioptions,
               const BlockBasedTableOptions& table_options,
               const InternalKeyComparator& internal_comparator,
-              std::vector<std::string>* keys, KVMap* kvmap) {
+              std::vector<std::string>* keys, stl_wrappers::KVMap* kvmap) {
     last_internal_key_ = &internal_comparator;
     *kvmap = data_;
     keys->clear();
-    for (KVMap::const_iterator it = data_.begin();
-         it != data_.end();
-         ++it) {
-      keys->push_back(it->first);
+    for (const auto& kv : data_) {
+      keys->push_back(kv.first);
     }
     data_.clear();
     Status s = FinishImpl(options, ioptions, table_options,
@@ -224,11 +140,11 @@ class Constructor {
                             const ImmutableCFOptions& ioptions,
                             const BlockBasedTableOptions& table_options,
                             const InternalKeyComparator& internal_comparator,
-                            const KVMap& data) = 0;
+                            const stl_wrappers::KVMap& data) = 0;
 
   virtual Iterator* NewIterator() const = 0;
 
-  virtual const KVMap& data() { return data_; }
+  virtual const stl_wrappers::KVMap& data() { return data_; }
 
   virtual bool IsArenaMode() const { return false; }
 
@@ -240,7 +156,7 @@ class Constructor {
   const InternalKeyComparator* last_internal_key_;
 
  private:
-  KVMap data_;
+  stl_wrappers::KVMap data_;
 };
 
 class BlockConstructor: public Constructor {
@@ -256,7 +172,7 @@ class BlockConstructor: public Constructor {
                             const ImmutableCFOptions& ioptions,
                             const BlockBasedTableOptions& table_options,
                             const InternalKeyComparator& internal_comparator,
-                            const KVMap& kv_map) override {
+                            const stl_wrappers::KVMap& kv_map) override {
     delete block_;
     block_ = nullptr;
     BlockBuilder builder(table_options.block_restart_interval);
@@ -287,7 +203,7 @@ class BlockConstructor: public Constructor {
 // A helper class that converts internal format keys into user keys
 class KeyConvertingIterator: public Iterator {
  public:
-  KeyConvertingIterator(Iterator* iter, bool arena_mode = false)
+  explicit KeyConvertingIterator(Iterator* iter, bool arena_mode = false)
       : iter_(iter), arena_mode_(arena_mode) {}
   virtual ~KeyConvertingIterator() {
     if (arena_mode_) {
@@ -345,9 +261,10 @@ class TableConstructor: public Constructor {
                             const ImmutableCFOptions& ioptions,
                             const BlockBasedTableOptions& table_options,
                             const InternalKeyComparator& internal_comparator,
-                            const KVMap& kv_map) override {
+                            const stl_wrappers::KVMap& kv_map) override {
     Reset();
-    sink_.reset(new StringSink());
+    soptions.use_mmap_reads = ioptions.allow_mmap_reads;
+    file_writer_.reset(test::GetWritableFileWriter(new test::StringSink()));
     unique_ptr<TableBuilder> builder;
     std::vector<std::unique_ptr<IntTblPropCollectorFactory>>
         int_tbl_prop_collector_factories;
@@ -355,7 +272,7 @@ class TableConstructor: public Constructor {
         TableBuilderOptions(ioptions, internal_comparator,
                             &int_tbl_prop_collector_factories,
                             options.compression, CompressionOptions(), false),
-        sink_.get()));
+        file_writer_.get()));
 
     for (const auto kv : kv_map) {
       if (convert_to_internal_key_) {
@@ -369,17 +286,18 @@ class TableConstructor: public Constructor {
       EXPECT_TRUE(builder->status().ok());
     }
     Status s = builder->Finish();
+    file_writer_->Flush();
     EXPECT_TRUE(s.ok()) << s.ToString();
 
-    EXPECT_EQ(sink_->contents().size(), builder->FileSize());
+    EXPECT_EQ(GetSink()->contents().size(), builder->FileSize());
 
     // Open the table
     uniq_id_ = cur_uniq_id_++;
-    source_.reset(new StringSource(sink_->contents(), uniq_id_,
-                                   ioptions.allow_mmap_reads));
+    file_reader_.reset(test::GetRandomAccessFileReader(new test::StringSource(
+        GetSink()->contents(), uniq_id_, ioptions.allow_mmap_reads)));
     return ioptions.table_factory->NewTableReader(
-        ioptions, soptions, internal_comparator, std::move(source_),
-        sink_->contents().size(), &table_reader_);
+        TableReaderOptions(ioptions, soptions, internal_comparator),
+        std::move(file_reader_), GetSink()->contents().size(), &table_reader_);
   }
 
   virtual Iterator* NewIterator() const override {
@@ -397,12 +315,11 @@ class TableConstructor: public Constructor {
   }
 
   virtual Status Reopen(const ImmutableCFOptions& ioptions) {
-    source_.reset(
-        new StringSource(sink_->contents(), uniq_id_,
-                         ioptions.allow_mmap_reads));
+    file_reader_.reset(test::GetRandomAccessFileReader(new test::StringSource(
+        GetSink()->contents(), uniq_id_, ioptions.allow_mmap_reads)));
     return ioptions.table_factory->NewTableReader(
-        ioptions, soptions, *last_internal_key_, std::move(source_),
-        sink_->contents().size(), &table_reader_);
+        TableReaderOptions(ioptions, soptions, *last_internal_key_),
+        std::move(file_reader_), GetSink()->contents().size(), &table_reader_);
   }
 
   virtual TableReader* GetTableReader() {
@@ -417,20 +334,24 @@ class TableConstructor: public Constructor {
   void Reset() {
     uniq_id_ = 0;
     table_reader_.reset();
-    sink_.reset();
-    source_.reset();
+    file_writer_.reset();
+    file_reader_.reset();
+  }
+
+  test::StringSink* GetSink() {
+    return static_cast<test::StringSink*>(file_writer_->writable_file());
   }
 
   uint64_t uniq_id_;
-  unique_ptr<StringSink> sink_;
-  unique_ptr<StringSource> source_;
+  unique_ptr<WritableFileWriter> file_writer_;
+  unique_ptr<RandomAccessFileReader> file_reader_;
   unique_ptr<TableReader> table_reader_;
   bool convert_to_internal_key_;
 
   TableConstructor();
 
   static uint64_t cur_uniq_id_;
-  const EnvOptions soptions;
+  EnvOptions soptions;
 };
 uint64_t TableConstructor::cur_uniq_id_ = 1;
 
@@ -444,7 +365,8 @@ class MemTableConstructor: public Constructor {
     options_.memtable_factory = table_factory_;
     ImmutableCFOptions ioptions(options_);
     memtable_ = new MemTable(internal_comparator_, ioptions,
-                             MutableCFOptions(options_, ioptions), wb);
+                             MutableCFOptions(options_, ioptions), wb,
+                             kMaxSequenceNumber);
     memtable_->Ref();
   }
   ~MemTableConstructor() {
@@ -453,12 +375,12 @@ class MemTableConstructor: public Constructor {
   virtual Status FinishImpl(const Options&, const ImmutableCFOptions& ioptions,
                             const BlockBasedTableOptions& table_options,
                             const InternalKeyComparator& internal_comparator,
-                            const KVMap& kv_map) override {
+                            const stl_wrappers::KVMap& kv_map) override {
     delete memtable_->Unref();
     ImmutableCFOptions mem_ioptions(ioptions);
     memtable_ = new MemTable(internal_comparator_, mem_ioptions,
                              MutableCFOptions(options_, mem_ioptions),
-                             write_buffer_);
+                             write_buffer_, kMaxSequenceNumber);
     memtable_->Ref();
     int seq = 1;
     for (const auto kv : kv_map) {
@@ -500,7 +422,7 @@ class DBConstructor: public Constructor {
                             const ImmutableCFOptions& ioptions,
                             const BlockBasedTableOptions& table_options,
                             const InternalKeyComparator& internal_comparator,
-                            const KVMap& kv_map) override {
+                            const stl_wrappers::KVMap& kv_map) override {
     delete db_;
     db_ = nullptr;
     NewDB();
@@ -539,9 +461,11 @@ class DBConstructor: public Constructor {
 
 enum TestType {
   BLOCK_BASED_TABLE_TEST,
+#ifndef ROCKSDB_LITE
   PLAIN_TABLE_SEMI_FIXED_PREFIX,
   PLAIN_TABLE_FULL_STR_PREFIX,
   PLAIN_TABLE_TOTAL_ORDER,
+#endif  // !ROCKSDB_LITE
   BLOCK_TEST,
   MEMTABLE_TEST,
   DB_TEST
@@ -553,15 +477,20 @@ struct TestArgs {
   int restart_interval;
   CompressionType compression;
   uint32_t format_version;
+  bool use_mmap;
 };
 
 static std::vector<TestArgs> GenerateArgList() {
   std::vector<TestArgs> test_args;
   std::vector<TestType> test_types = {
-      BLOCK_BASED_TABLE_TEST,      PLAIN_TABLE_SEMI_FIXED_PREFIX,
-      PLAIN_TABLE_FULL_STR_PREFIX, PLAIN_TABLE_TOTAL_ORDER,
-      BLOCK_TEST,                  MEMTABLE_TEST,
-      DB_TEST};
+      BLOCK_BASED_TABLE_TEST,
+#ifndef ROCKSDB_LITE
+      PLAIN_TABLE_SEMI_FIXED_PREFIX,
+      PLAIN_TABLE_FULL_STR_PREFIX,
+      PLAIN_TABLE_TOTAL_ORDER,
+#endif  // !ROCKSDB_LITE
+      BLOCK_TEST,
+      MEMTABLE_TEST, DB_TEST};
   std::vector<bool> reverse_compare_types = {false, true};
   std::vector<int> restart_intervals = {16, 1, 1024};
 
@@ -585,20 +514,30 @@ static std::vector<TestArgs> GenerateArgList() {
     compression_types.emplace_back(kLZ4HCCompression, false);
     compression_types.emplace_back(kLZ4HCCompression, true);
   }
+  if (ZSTD_Supported()) {
+    compression_types.emplace_back(kZSTDNotFinalCompression, false);
+    compression_types.emplace_back(kZSTDNotFinalCompression, true);
+  }
 
   for (auto test_type : test_types) {
     for (auto reverse_compare : reverse_compare_types) {
+#ifndef ROCKSDB_LITE
       if (test_type == PLAIN_TABLE_SEMI_FIXED_PREFIX ||
-          test_type == PLAIN_TABLE_FULL_STR_PREFIX) {
+          test_type == PLAIN_TABLE_FULL_STR_PREFIX ||
+          test_type == PLAIN_TABLE_TOTAL_ORDER) {
         // Plain table doesn't use restart index or compression.
         TestArgs one_arg;
         one_arg.type = test_type;
         one_arg.reverse_compare = reverse_compare;
         one_arg.restart_interval = restart_intervals[0];
         one_arg.compression = compression_types[0].first;
+        one_arg.use_mmap = true;
+        test_args.push_back(one_arg);
+        one_arg.use_mmap = false;
         test_args.push_back(one_arg);
         continue;
       }
+#endif  // !ROCKSDB_LITE
 
       for (auto restart_interval : restart_intervals) {
         for (auto compression_type : compression_types) {
@@ -608,6 +547,7 @@ static std::vector<TestArgs> GenerateArgList() {
           one_arg.restart_interval = restart_interval;
           one_arg.compression = compression_type.first;
           one_arg.format_version = compression_type.second ? 2 : 1;
+          one_arg.use_mmap = false;
           test_args.push_back(one_arg);
         }
       }
@@ -669,6 +609,7 @@ class HarnessTest : public testing::Test {
 
     support_prev_ = true;
     only_support_prefix_seek_ = false;
+    options_.allow_mmap_reads = args.use_mmap;
     switch (args.type) {
       case BLOCK_BASED_TABLE_TEST:
         table_options_.flush_block_policy_factory.reset(
@@ -680,11 +621,12 @@ class HarnessTest : public testing::Test {
             new BlockBasedTableFactory(table_options_));
         constructor_ = new TableConstructor(options_.comparator);
         break;
+// Plain table is not supported in ROCKSDB_LITE
+#ifndef ROCKSDB_LITE
       case PLAIN_TABLE_SEMI_FIXED_PREFIX:
         support_prev_ = false;
         only_support_prefix_seek_ = true;
         options_.prefix_extractor.reset(new FixedOrLessPrefixTransform(2));
-        options_.allow_mmap_reads = true;
         options_.table_factory.reset(NewPlainTableFactory());
         constructor_ = new TableConstructor(options_.comparator, true);
         internal_comparator_.reset(
@@ -694,7 +636,6 @@ class HarnessTest : public testing::Test {
         support_prev_ = false;
         only_support_prefix_seek_ = true;
         options_.prefix_extractor.reset(NewNoopTransform());
-        options_.allow_mmap_reads = true;
         options_.table_factory.reset(NewPlainTableFactory());
         constructor_ = new TableConstructor(options_.comparator, true);
         internal_comparator_.reset(
@@ -704,7 +645,6 @@ class HarnessTest : public testing::Test {
         support_prev_ = false;
         only_support_prefix_seek_ = false;
         options_.prefix_extractor = nullptr;
-        options_.allow_mmap_reads = true;
 
         {
           PlainTableOptions plain_table_options;
@@ -719,6 +659,7 @@ class HarnessTest : public testing::Test {
         internal_comparator_.reset(
             new InternalKeyComparator(options_.comparator));
         break;
+#endif  // !ROCKSDB_LITE
       case BLOCK_TEST:
         table_options_.block_size = 256;
         options_.table_factory.reset(
@@ -750,7 +691,7 @@ class HarnessTest : public testing::Test {
 
   void Test(Random* rnd) {
     std::vector<std::string> keys;
-    KVMap data;
+    stl_wrappers::KVMap data;
     constructor_->Finish(options_, ioptions_, table_options_,
                          *internal_comparator_, &keys, &data);
 
@@ -762,13 +703,12 @@ class HarnessTest : public testing::Test {
   }
 
   void TestForwardScan(const std::vector<std::string>& keys,
-                       const KVMap& data) {
+                       const stl_wrappers::KVMap& data) {
     Iterator* iter = constructor_->NewIterator();
     ASSERT_TRUE(!iter->Valid());
     iter->SeekToFirst();
-    for (KVMap::const_iterator model_iter = data.begin();
-         model_iter != data.end();
-         ++model_iter) {
+    for (stl_wrappers::KVMap::const_iterator model_iter = data.begin();
+         model_iter != data.end(); ++model_iter) {
       ASSERT_EQ(ToString(data, model_iter), ToString(iter));
       iter->Next();
     }
@@ -781,13 +721,12 @@ class HarnessTest : public testing::Test {
   }
 
   void TestBackwardScan(const std::vector<std::string>& keys,
-                        const KVMap& data) {
+                        const stl_wrappers::KVMap& data) {
     Iterator* iter = constructor_->NewIterator();
     ASSERT_TRUE(!iter->Valid());
     iter->SeekToLast();
-    for (KVMap::const_reverse_iterator model_iter = data.rbegin();
-         model_iter != data.rend();
-         ++model_iter) {
+    for (stl_wrappers::KVMap::const_reverse_iterator model_iter = data.rbegin();
+         model_iter != data.rend(); ++model_iter) {
       ASSERT_EQ(ToString(data, model_iter), ToString(iter));
       iter->Prev();
     }
@@ -799,13 +738,12 @@ class HarnessTest : public testing::Test {
     }
   }
 
-  void TestRandomAccess(Random* rnd,
-                        const std::vector<std::string>& keys,
-                        const KVMap& data) {
+  void TestRandomAccess(Random* rnd, const std::vector<std::string>& keys,
+                        const stl_wrappers::KVMap& data) {
     static const bool kVerbose = false;
     Iterator* iter = constructor_->NewIterator();
     ASSERT_TRUE(!iter->Valid());
-    KVMap::const_iterator model_iter = data.begin();
+    stl_wrappers::KVMap::const_iterator model_iter = data.begin();
     if (kVerbose) fprintf(stderr, "---\n");
     for (int i = 0; i < 200; i++) {
       const int toss = rnd->Uniform(support_prev_ ? 5 : 3);
@@ -873,7 +811,8 @@ class HarnessTest : public testing::Test {
     }
   }
 
-  std::string ToString(const KVMap& data, const KVMap::const_iterator& it) {
+  std::string ToString(const stl_wrappers::KVMap& data,
+                       const stl_wrappers::KVMap::const_iterator& it) {
     if (it == data.end()) {
       return "END";
     } else {
@@ -881,8 +820,8 @@ class HarnessTest : public testing::Test {
     }
   }
 
-  std::string ToString(const KVMap& data,
-                       const KVMap::const_reverse_iterator& it) {
+  std::string ToString(const stl_wrappers::KVMap& data,
+                       const stl_wrappers::KVMap::const_reverse_iterator& it) {
     if (it == data.rend()) {
       return "END";
     } else {
@@ -1027,7 +966,7 @@ TEST_F(BlockBasedTableTest, BasicBlockBasedTableProperties) {
   c.Add("j9", "val9");
 
   std::vector<std::string> keys;
-  KVMap kvmap;
+  stl_wrappers::KVMap kvmap;
   Options options;
   options.compression = kNoCompression;
   BlockBasedTableOptions table_options;
@@ -1062,7 +1001,7 @@ TEST_F(BlockBasedTableTest, FilterPolicyNameProperties) {
   TableConstructor c(BytewiseComparator(), true);
   c.Add("a1", "val1");
   std::vector<std::string> keys;
-  KVMap kvmap;
+  stl_wrappers::KVMap kvmap;
   BlockBasedTableOptions table_options;
   table_options.filter_policy.reset(NewBloomFilterPolicy(10));
   Options options;
@@ -1079,8 +1018,8 @@ TEST_F(BlockBasedTableTest, FilterPolicyNameProperties) {
 // BlockBasedTableTest::PrefetchTest
 //
 void AssertKeysInCache(BlockBasedTable* table_reader,
-                 const vector<string>& keys_in_cache,
-                 const vector<string>& keys_not_in_cache) {
+                       const std::vector<std::string>& keys_in_cache,
+                       const std::vector<std::string>& keys_not_in_cache) {
   for (auto key : keys_in_cache) {
     ASSERT_TRUE(table_reader->TEST_KeyInCache(ReadOptions(), key));
   }
@@ -1092,10 +1031,10 @@ void AssertKeysInCache(BlockBasedTable* table_reader,
 
 void PrefetchRange(TableConstructor* c, Options* opt,
                    BlockBasedTableOptions* table_options,
-                   const vector<std::string>& keys,
-                   const char* key_begin, const char* key_end,
-                   const vector<string>& keys_in_cache,
-                   const vector<string>& keys_not_in_cache,
+                   const std::vector<std::string>& keys, const char* key_begin,
+                   const char* key_end,
+                   const std::vector<std::string>& keys_in_cache,
+                   const std::vector<std::string>& keys_not_in_cache,
                    const Status expected_status = Status::OK()) {
   // reset the cache and reopen the table
   table_options->block_cache = NewLRUCache(16 * 1024 * 1024);
@@ -1138,7 +1077,7 @@ TEST_F(BlockBasedTableTest, PrefetchTest) {
   c.Add("k06", "hello3");
   c.Add("k07", std::string(100000, 'x'));
   std::vector<std::string> keys;
-  KVMap kvmap;
+  stl_wrappers::KVMap kvmap;
   const ImmutableCFOptions ioptions(opt);
   c.Finish(opt, ioptions, table_options, *ikc, &keys, &kvmap);
 
@@ -1242,7 +1181,7 @@ TEST_F(BlockBasedTableTest, TotalOrderSeekOnHashIndex) {
     c.Add("abbb1", std::string('a', 56));
     c.Add("cccc2", std::string('a', 56));
     std::vector<std::string> keys;
-    KVMap kvmap;
+    stl_wrappers::KVMap kvmap;
     const ImmutableCFOptions ioptions(options);
     c.Finish(options, ioptions, table_options,
              GetPlainInternalComparator(options.comparator), &keys, &kvmap);
@@ -1316,7 +1255,7 @@ TEST_F(TableTest, HashIndexTest) {
   AddInternalKey(&c, "0095");
 
   std::vector<std::string> keys;
-  KVMap kvmap;
+  stl_wrappers::KVMap kvmap;
   Options options;
   options.prefix_extractor.reset(NewFixedPrefixTransform(3));
   BlockBasedTableOptions table_options;
@@ -1430,7 +1369,7 @@ TEST_F(BlockBasedTableTest, IndexSizeStat) {
     }
 
     std::vector<std::string> ks;
-    KVMap kvmap;
+    stl_wrappers::KVMap kvmap;
     Options options;
     options.compression = kNoCompression;
     BlockBasedTableOptions table_options;
@@ -1463,7 +1402,7 @@ TEST_F(BlockBasedTableTest, NumBlockStat) {
   }
 
   std::vector<std::string> ks;
-  KVMap kvmap;
+  stl_wrappers::KVMap kvmap;
   const ImmutableCFOptions ioptions(options);
   c.Finish(options, ioptions, table_options,
            GetPlainInternalComparator(options.comparator), &ks, &kvmap);
@@ -1484,6 +1423,9 @@ class BlockCachePropertiesSnapshot {
     filter_block_cache_miss =
         statistics->getTickerCount(BLOCK_CACHE_FILTER_MISS);
     filter_block_cache_hit = statistics->getTickerCount(BLOCK_CACHE_FILTER_HIT);
+    block_cache_bytes_read = statistics->getTickerCount(BLOCK_CACHE_BYTES_READ);
+    block_cache_bytes_write =
+        statistics->getTickerCount(BLOCK_CACHE_BYTES_WRITE);
   }
 
   void AssertIndexBlockStat(int64_t expected_index_block_cache_miss,
@@ -1514,6 +1456,10 @@ class BlockCachePropertiesSnapshot {
               block_cache_hit);
   }
 
+  int64_t GetCacheBytesRead() { return block_cache_bytes_read; }
+
+  int64_t GetCacheBytesWrite() { return block_cache_bytes_write; }
+
  private:
   int64_t block_cache_miss = 0;
   int64_t block_cache_hit = 0;
@@ -1523,6 +1469,8 @@ class BlockCachePropertiesSnapshot {
   int64_t data_block_cache_hit = 0;
   int64_t filter_block_cache_miss = 0;
   int64_t filter_block_cache_hit = 0;
+  int64_t block_cache_bytes_read = 0;
+  int64_t block_cache_bytes_write = 0;
 };
 
 // Make sure, by default, index/filter blocks were pre-loaded (meaning we won't
@@ -1536,7 +1484,7 @@ TEST_F(BlockBasedTableTest, BlockCacheDisabledTest) {
   table_options.filter_policy.reset(NewBloomFilterPolicy(10));
   options.table_factory.reset(new BlockBasedTableFactory(table_options));
   std::vector<std::string> keys;
-  KVMap kvmap;
+  stl_wrappers::KVMap kvmap;
 
   TableConstructor c(BytewiseComparator(), true);
   c.Add("key", "value");
@@ -1582,7 +1530,7 @@ TEST_F(BlockBasedTableTest, FilterBlockInBlockCache) {
   table_options.cache_index_and_filter_blocks = true;
   options.table_factory.reset(new BlockBasedTableFactory(table_options));
   std::vector<std::string> keys;
-  KVMap kvmap;
+  stl_wrappers::KVMap kvmap;
 
   TableConstructor c(BytewiseComparator());
   c.Add("key", "value");
@@ -1598,12 +1546,17 @@ TEST_F(BlockBasedTableTest, FilterBlockInBlockCache) {
   // Since block_cache is disabled, no cache activities will be involved.
   unique_ptr<Iterator> iter;
 
+  int64_t last_cache_bytes_read = 0;
   // At first, no block will be accessed.
   {
     BlockCachePropertiesSnapshot props(options.statistics.get());
     // index will be added to block cache.
     props.AssertEqual(1,  // index block miss
                       0, 0, 0);
+    ASSERT_EQ(props.GetCacheBytesRead(), 0);
+    ASSERT_EQ(props.GetCacheBytesWrite(),
+              table_options.block_cache->GetUsage());
+    last_cache_bytes_read = props.GetCacheBytesRead();
   }
 
   // Only index block will be accessed
@@ -1615,6 +1568,11 @@ TEST_F(BlockBasedTableTest, FilterBlockInBlockCache) {
     // value; other numbers remain the same.
     props.AssertEqual(1, 0 + 1,  // index block hit
                       0, 0);
+    // Cache hit, bytes read from cache should increase
+    ASSERT_GT(props.GetCacheBytesRead(), last_cache_bytes_read);
+    ASSERT_EQ(props.GetCacheBytesWrite(),
+              table_options.block_cache->GetUsage());
+    last_cache_bytes_read = props.GetCacheBytesRead();
   }
 
   // Only data block will be accessed
@@ -1623,6 +1581,11 @@ TEST_F(BlockBasedTableTest, FilterBlockInBlockCache) {
     BlockCachePropertiesSnapshot props(options.statistics.get());
     props.AssertEqual(1, 1, 0 + 1,  // data block miss
                       0);
+    // Cache miss, Bytes read from cache should not change
+    ASSERT_EQ(props.GetCacheBytesRead(), last_cache_bytes_read);
+    ASSERT_EQ(props.GetCacheBytesWrite(),
+              table_options.block_cache->GetUsage());
+    last_cache_bytes_read = props.GetCacheBytesRead();
   }
 
   // Data block will be in cache
@@ -1632,6 +1595,11 @@ TEST_F(BlockBasedTableTest, FilterBlockInBlockCache) {
     BlockCachePropertiesSnapshot props(options.statistics.get());
     props.AssertEqual(1, 1 + 1, /* index block hit */
                       1, 0 + 1 /* data block hit */);
+    // Cache hit, bytes read from cache should increase
+    ASSERT_GT(props.GetCacheBytesRead(), last_cache_bytes_read);
+    ASSERT_EQ(props.GetCacheBytesWrite(),
+              table_options.block_cache->GetUsage());
+    last_cache_bytes_read = props.GetCacheBytesRead();
   }
   // release the iterator so that the block cache can reset correctly.
   iter.reset();
@@ -1648,6 +1616,8 @@ TEST_F(BlockBasedTableTest, FilterBlockInBlockCache) {
     BlockCachePropertiesSnapshot props(options.statistics.get());
     props.AssertEqual(1,  // index block miss
                       0, 0, 0);
+    // Cache miss, Bytes read from cache should not change
+    ASSERT_EQ(props.GetCacheBytesRead(), 0);
   }
 
   {
@@ -1659,6 +1629,8 @@ TEST_F(BlockBasedTableTest, FilterBlockInBlockCache) {
     props.AssertEqual(1 + 1,  // index block miss
                       0, 0,   // data block miss
                       0);
+    // Cache miss, Bytes read from cache should not change
+    ASSERT_EQ(props.GetCacheBytesRead(), 0);
   }
 
   {
@@ -1668,6 +1640,8 @@ TEST_F(BlockBasedTableTest, FilterBlockInBlockCache) {
     BlockCachePropertiesSnapshot props(options.statistics.get());
     props.AssertEqual(2, 0, 0 + 1,  // data block miss
                       0);
+    // Cache miss, Bytes read from cache should not change
+    ASSERT_EQ(props.GetCacheBytesRead(), 0);
   }
   iter.reset();
 
@@ -1702,6 +1676,79 @@ TEST_F(BlockBasedTableTest, FilterBlockInBlockCache) {
   props.AssertFilterBlockStat(0, 0);
 }
 
+TEST_F(BlockBasedTableTest, BlockReadCountTest) {
+  // bloom_filter_type = 0 -- block-based filter
+  // bloom_filter_type = 1 -- full filter
+  for (int bloom_filter_type = 0; bloom_filter_type < 2; ++bloom_filter_type) {
+    for (int index_and_filter_in_cache = 0; index_and_filter_in_cache < 2;
+         ++index_and_filter_in_cache) {
+      Options options;
+      options.create_if_missing = true;
+
+      BlockBasedTableOptions table_options;
+      table_options.block_cache = NewLRUCache(1, 0);
+      table_options.cache_index_and_filter_blocks = index_and_filter_in_cache;
+      table_options.filter_policy.reset(
+          NewBloomFilterPolicy(10, bloom_filter_type == 0));
+      options.table_factory.reset(new BlockBasedTableFactory(table_options));
+      std::vector<std::string> keys;
+      stl_wrappers::KVMap kvmap;
+
+      TableConstructor c(BytewiseComparator());
+      std::string user_key = "k04";
+      InternalKey internal_key(user_key, 0, kTypeValue);
+      std::string encoded_key = internal_key.Encode().ToString();
+      c.Add(encoded_key, "hello");
+      ImmutableCFOptions ioptions(options);
+      // Generate table with filter policy
+      c.Finish(options, ioptions, table_options,
+               GetPlainInternalComparator(options.comparator), &keys, &kvmap);
+      auto reader = c.GetTableReader();
+      std::string value;
+      GetContext get_context(options.comparator, nullptr, nullptr, nullptr,
+                             GetContext::kNotFound, user_key, &value, nullptr,
+                             nullptr, nullptr);
+      perf_context.Reset();
+      ASSERT_OK(reader->Get(ReadOptions(), encoded_key, &get_context));
+      if (index_and_filter_in_cache) {
+        // data, index and filter block
+        ASSERT_EQ(perf_context.block_read_count, 3);
+      } else {
+        // just the data block
+        ASSERT_EQ(perf_context.block_read_count, 1);
+      }
+      ASSERT_EQ(get_context.State(), GetContext::kFound);
+      ASSERT_EQ(value, "hello");
+
+      // Get non-existing key
+      user_key = "does-not-exist";
+      internal_key = InternalKey(user_key, 0, kTypeValue);
+      encoded_key = internal_key.Encode().ToString();
+
+      get_context = GetContext(options.comparator, nullptr, nullptr, nullptr,
+                               GetContext::kNotFound, user_key, &value, nullptr,
+                               nullptr, nullptr);
+      perf_context.Reset();
+      ASSERT_OK(reader->Get(ReadOptions(), encoded_key, &get_context));
+      ASSERT_EQ(get_context.State(), GetContext::kNotFound);
+
+      if (index_and_filter_in_cache) {
+        if (bloom_filter_type == 0) {
+          // with block-based, we read index and then the filter
+          ASSERT_EQ(perf_context.block_read_count, 2);
+        } else {
+          // with full-filter, we read filter first and then we stop
+          ASSERT_EQ(perf_context.block_read_count, 1);
+        }
+      } else {
+        // filter is already in memory and it figures out that the key doesn't
+        // exist
+        ASSERT_EQ(perf_context.block_read_count, 0);
+      }
+    }
+  }
+}
+
 TEST_F(BlockBasedTableTest, BlockCacheLeak) {
   // Check that when we reopen a table we don't lose access to blocks already
   // in the cache. This test checks whether the Table actually makes use of the
@@ -1726,7 +1773,7 @@ TEST_F(BlockBasedTableTest, BlockCacheLeak) {
   c.Add("k06", "hello3");
   c.Add("k07", std::string(100000, 'x'));
   std::vector<std::string> keys;
-  KVMap kvmap;
+  stl_wrappers::KVMap kvmap;
   const ImmutableCFOptions ioptions(opt);
   c.Finish(opt, ioptions, table_options, *ikc, &keys, &kvmap);
 
@@ -1757,6 +1804,8 @@ TEST_F(BlockBasedTableTest, BlockCacheLeak) {
   }
 }
 
+// Plain table is not supported in ROCKSDB_LITE
+#ifndef ROCKSDB_LITE
 TEST_F(PlainTableTest, BasicPlainTableProperties) {
   PlainTableOptions plain_table_options;
   plain_table_options.user_key_len = 8;
@@ -1764,7 +1813,9 @@ TEST_F(PlainTableTest, BasicPlainTableProperties) {
   plain_table_options.hash_table_ratio = 0;
 
   PlainTableFactory factory(plain_table_options);
-  StringSink sink;
+  test::StringSink sink;
+  unique_ptr<WritableFileWriter> file_writer(
+      test::GetWritableFileWriter(new test::StringSink()));
   Options options;
   const ImmutableCFOptions ioptions(options);
   InternalKeyComparator ikc(options.comparator);
@@ -1773,7 +1824,7 @@ TEST_F(PlainTableTest, BasicPlainTableProperties) {
   std::unique_ptr<TableBuilder> builder(factory.NewTableBuilder(
       TableBuilderOptions(ioptions, ikc, &int_tbl_prop_collector_factories,
                           kNoCompression, CompressionOptions(), false),
-      &sink));
+      file_writer.get()));
 
   for (char c = 'a'; c <= 'z'; ++c) {
     std::string key(8, c);
@@ -1782,11 +1833,16 @@ TEST_F(PlainTableTest, BasicPlainTableProperties) {
     builder->Add(key, value);
   }
   ASSERT_OK(builder->Finish());
+  file_writer->Flush();
 
-  StringSource source(sink.contents(), 72242, true);
+  test::StringSink* ss =
+    static_cast<test::StringSink*>(file_writer->writable_file());
+  unique_ptr<RandomAccessFileReader> file_reader(
+      test::GetRandomAccessFileReader(
+          new test::StringSource(ss->contents(), 72242, true)));
 
   TableProperties* props = nullptr;
-  auto s = ReadTableProperties(&source, sink.contents().size(),
+  auto s = ReadTableProperties(file_reader.get(), ss->contents().size(),
                                kPlainTableMagicNumber, Env::Default(), nullptr,
                                &props);
   std::unique_ptr<TableProperties> props_guard(props);
@@ -1799,6 +1855,7 @@ TEST_F(PlainTableTest, BasicPlainTableProperties) {
   ASSERT_EQ(26ul, props->num_entries);
   ASSERT_EQ(1ul, props->num_data_blocks);
 }
+#endif  // !ROCKSDB_LITE
 
 TEST_F(GeneralTableTest, ApproximateOffsetOfPlain) {
   TableConstructor c(BytewiseComparator());
@@ -1810,7 +1867,7 @@ TEST_F(GeneralTableTest, ApproximateOffsetOfPlain) {
   c.Add("k06", "hello3");
   c.Add("k07", std::string(100000, 'x'));
   std::vector<std::string> keys;
-  KVMap kvmap;
+  stl_wrappers::KVMap kvmap;
   Options options;
   test::PlainInternalKeyComparator internal_comparator(options.comparator);
   options.compression = kNoCompression;
@@ -1842,7 +1899,7 @@ static void DoCompressionTest(CompressionType comp) {
   c.Add("k03", "hello3");
   c.Add("k04", test::CompressibleString(&rnd, 0.25, 10000, &tmp));
   std::vector<std::string> keys;
-  KVMap kvmap;
+  stl_wrappers::KVMap kvmap;
   Options options;
   test::PlainInternalKeyComparator ikc(options.comparator);
   options.compression = comp;
@@ -1949,7 +2006,8 @@ TEST_F(MemTableTest, Simple) {
   ImmutableCFOptions ioptions(options);
   WriteBuffer wb(options.db_write_buffer_size);
   MemTable* memtable =
-      new MemTable(cmp, ioptions, MutableCFOptions(options, ioptions), &wb);
+      new MemTable(cmp, ioptions, MutableCFOptions(options, ioptions), &wb,
+                   kMaxSequenceNumber);
   memtable->Ref();
   WriteBatch batch;
   WriteBatchInternal::SetSequence(&batch, 100);
@@ -2056,6 +2114,8 @@ TEST_F(HarnessTest, FooterTests) {
     ASSERT_EQ(decoded_footer.index_handle().size(), index.size());
     ASSERT_EQ(decoded_footer.version(), 1U);
   }
+// Plain table is not supported in ROCKSDB_LITE
+#ifndef ROCKSDB_LITE
   {
     // upconvert legacy plain table
     std::string encoded;
@@ -2095,6 +2155,7 @@ TEST_F(HarnessTest, FooterTests) {
     ASSERT_EQ(decoded_footer.index_handle().size(), index.size());
     ASSERT_EQ(decoded_footer.version(), 1U);
   }
+#endif  // !ROCKSDB_LITE
   {
     // version == 2
     std::string encoded;
diff --git a/src/rocksdb/table/two_level_iterator.cc b/src/rocksdb/table/two_level_iterator.cc
index 5d3e372..f540d3b 100644
--- a/src/rocksdb/table/two_level_iterator.cc
+++ b/src/rocksdb/table/two_level_iterator.cc
@@ -22,11 +22,17 @@ namespace {
 class TwoLevelIterator: public Iterator {
  public:
   explicit TwoLevelIterator(TwoLevelIteratorState* state,
-      Iterator* first_level_iter);
+                            Iterator* first_level_iter,
+                            bool need_free_iter_and_state);
 
   virtual ~TwoLevelIterator() {
-    first_level_iter_.DeleteIter(false);
+    first_level_iter_.DeleteIter(!need_free_iter_and_state_);
     second_level_iter_.DeleteIter(false);
+    if (need_free_iter_and_state_) {
+      delete state_;
+    } else {
+      state_->~TwoLevelIteratorState();
+    }
   }
 
   virtual void Seek(const Slice& target) override;
@@ -65,9 +71,10 @@ class TwoLevelIterator: public Iterator {
   void SetSecondLevelIterator(Iterator* iter);
   void InitDataBlock();
 
-  std::unique_ptr<TwoLevelIteratorState> state_;
+  TwoLevelIteratorState* state_;
   IteratorWrapper first_level_iter_;
   IteratorWrapper second_level_iter_;  // May be nullptr
+  bool need_free_iter_and_state_;
   Status status_;
   // If second_level_iter is non-nullptr, then "data_block_handle_" holds the
   // "index_value" passed to block_function_ to create the second_level_iter.
@@ -75,8 +82,11 @@ class TwoLevelIterator: public Iterator {
 };
 
 TwoLevelIterator::TwoLevelIterator(TwoLevelIteratorState* state,
-    Iterator* first_level_iter)
-  : state_(state), first_level_iter_(first_level_iter) {}
+                                   Iterator* first_level_iter,
+                                   bool need_free_iter_and_state)
+    : state_(state),
+      first_level_iter_(first_level_iter),
+      need_free_iter_and_state_(need_free_iter_and_state) {}
 
 void TwoLevelIterator::Seek(const Slice& target) {
   if (state_->check_prefix_may_match &&
@@ -186,12 +196,15 @@ void TwoLevelIterator::InitDataBlock() {
 }  // namespace
 
 Iterator* NewTwoLevelIterator(TwoLevelIteratorState* state,
-                              Iterator* first_level_iter, Arena* arena) {
+                              Iterator* first_level_iter, Arena* arena,
+                              bool need_free_iter_and_state) {
   if (arena == nullptr) {
-    return new TwoLevelIterator(state, first_level_iter);
+    return new TwoLevelIterator(state, first_level_iter,
+                                need_free_iter_and_state);
   } else {
     auto mem = arena->AllocateAligned(sizeof(TwoLevelIterator));
-    return new (mem) TwoLevelIterator(state, first_level_iter);
+    return new (mem)
+        TwoLevelIterator(state, first_level_iter, need_free_iter_and_state);
   }
 }
 
diff --git a/src/rocksdb/table/two_level_iterator.h b/src/rocksdb/table/two_level_iterator.h
index 0301935..4c6b48c 100644
--- a/src/rocksdb/table/two_level_iterator.h
+++ b/src/rocksdb/table/two_level_iterator.h
@@ -43,8 +43,11 @@ struct TwoLevelIteratorState {
 // arena: If not null, the arena is used to allocate the Iterator.
 //        When destroying the iterator, the destructor will destroy
 //        all the states but those allocated in arena.
+// need_free_iter_and_state: if true, `state` and `first_level_iter` are
+//                           freed on destruction; otherwise only their
+//                           destructors are called (e.g. for arena allocation).
 extern Iterator* NewTwoLevelIterator(TwoLevelIteratorState* state,
                                      Iterator* first_level_iter,
-                                     Arena* arena = nullptr);
+                                     Arena* arena = nullptr,
+                                     bool need_free_iter_and_state = true);
 
 }  // namespace rocksdb
diff --git a/src/rocksdb/third-party/fbson/COMMIT.md b/src/rocksdb/third-party/fbson/COMMIT.md
index bba88d5..b38b542 100644
--- a/src/rocksdb/third-party/fbson/COMMIT.md
+++ b/src/rocksdb/third-party/fbson/COMMIT.md
@@ -1,2 +1,5 @@
 fbson commit: 
 https://github.com/facebook/mysql-5.6/commit/55ef9ff25c934659a70b4094e9b406c48e9dd43d
+
+# TODO.
+* Had to convert zero-sized arrays to [1]-sized arrays because the MS compiler complains that they are not standard. At some point this change needs to be contributed back to MySQL, where this code was taken from.
diff --git a/src/rocksdb/third-party/fbson/FbsonDocument.h b/src/rocksdb/third-party/fbson/FbsonDocument.h
index 4d7c79a..c70f9ec 100644
--- a/src/rocksdb/third-party/fbson/FbsonDocument.h
+++ b/src/rocksdb/third-party/fbson/FbsonDocument.h
@@ -125,9 +125,12 @@ class FbsonDocument {
     uint8_t ver_;
   } header_;
 
-  char payload_[0];
+  char payload_[1];
 
   FbsonDocument();
+
+  FbsonDocument(const FbsonDocument&) = delete;
+  FbsonDocument& operator=(const FbsonDocument&) = delete;
 };
 
 /*
@@ -449,7 +452,7 @@ class BlobVal : public FbsonValue {
 
  protected:
   uint32_t size_;
-  char payload_[0];
+  char payload_[1];
 
   // set new blob bytes
   bool internalSetVal(const char* blob, uint32_t blobSize) {
@@ -468,6 +471,11 @@ class BlobVal : public FbsonValue {
   }
 
   BlobVal();
+
+ private:
+  // Disable as this class can only be allocated dynamically
+  BlobVal(const BlobVal&) = delete;
+  BlobVal& operator=(const BlobVal&) = delete;
 };
 
 /*
@@ -524,9 +532,12 @@ class ContainerVal : public FbsonValue {
 
  protected:
   uint32_t size_;
-  char payload_[0];
+  char payload_[1];
 
   ContainerVal();
+
+  ContainerVal(const ContainerVal&) = delete;
+  ContainerVal& operator=(const ContainerVal&) = delete;
 };
 
 /*
diff --git a/src/rocksdb/third-party/fbson/FbsonStream.h b/src/rocksdb/third-party/fbson/FbsonStream.h
index 6ac132b..2285124 100644
--- a/src/rocksdb/third-party/fbson/FbsonStream.h
+++ b/src/rocksdb/third-party/fbson/FbsonStream.h
@@ -30,6 +30,10 @@
 #define __STDC_FORMAT_MACROS
 #endif
 
+#if defined OS_WIN && !defined snprintf
+#define snprintf _snprintf
+#endif
+
 #include <inttypes.h>
 #include <iostream>
 
diff --git a/src/rocksdb/third-party/gtest-1.7.0/fused-src/gtest/CMakeLists.txt b/src/rocksdb/third-party/gtest-1.7.0/fused-src/gtest/CMakeLists.txt
new file mode 100644
index 0000000..90cff08
--- /dev/null
+++ b/src/rocksdb/third-party/gtest-1.7.0/fused-src/gtest/CMakeLists.txt
@@ -0,0 +1 @@
+add_library(gtest gtest-all.cc)
diff --git a/src/rocksdb/thirdparty.inc b/src/rocksdb/thirdparty.inc
new file mode 100644
index 0000000..448c49a
--- /dev/null
+++ b/src/rocksdb/thirdparty.inc
@@ -0,0 +1,169 @@
+# Edit definitions below to specify paths to include files and libraries of all 3rd party libraries
+
+#
+# Edit these lines to set defaults for use of external libraries
+#
+set(USE_GFLAGS_DEFAULT 0)        # GFLAGS is disabled by default, enable with -DGFLAGS=1 cmake command line argument
+set(USE_SNAPPY_DEFAULT 0)        # SNAPPY is disabled by default, enable with -DSNAPPY=1 cmake command line argument
+set(USE_LZ4_DEFAULT 0)           # LZ4 is disabled by default, enable with -DLZ4=1 cmake command line argument
+set(USE_ZLIB_DEFAULT 0)          # ZLIB is disabled by default, enable with -DZLIB=1 cmake command line argument
+set(USE_JEMALLOC_DEFAULT 0)      # JEMALLOC is disabled by default, enable with -DJEMALLOC=1 cmake command line argument
+
+#
+# This example assumes all the libraries are located in directories under the THIRDPARTY_HOME environment variable
+# Set environment variable THIRDPARTY_HOME to point to your third party libraries home (Unix style dir separators)
+# or change the paths below to reflect where the libraries actually reside
+#
+set (THIRDPARTY_LIBS "")         # Initialization, don't touch
+
+#
+# Edit these 4 lines to define paths to GFLAGS
+#
+set(GFLAGS_HOME $ENV{THIRDPARTY_HOME}/Gflags.Library)
+set(GFLAGS_INCLUDE ${GFLAGS_HOME}/inc/include)
+set(GFLAGS_LIB_DEBUG ${GFLAGS_HOME}/bin/debug/amd64/gflags.lib)
+set(GFLAGS_LIB_RELEASE ${GFLAGS_HOME}/bin/retail/amd64/gflags.lib)
+
+# ================================================== GFLAGS ==================================================
+#
+# Don't touch these lines
+#
+if (DEFINED GFLAGS)
+  set(USE_GFLAGS ${GFLAGS})
+else ()
+  set(USE_GFLAGS ${USE_GFLAGS_DEFAULT})
+endif ()
+
+if (${USE_GFLAGS} EQUAL 1)
+  message(STATUS "GFLAGS library is enabled")
+  set(GFLAGS_CXX_FLAGS -DGFLAGS=gflags)
+  set(GFLAGS_LIBS debug ${GFLAGS_LIB_DEBUG} optimized ${GFLAGS_LIB_RELEASE})
+
+  add_definitions(${GFLAGS_CXX_FLAGS})
+  include_directories(${GFLAGS_INCLUDE})
+  set (THIRDPARTY_LIBS ${THIRDPARTY_LIBS} ${GFLAGS_LIBS})
+else ()
+  message(STATUS "GFLAGS library is disabled")
+endif ()
+
+# ================================================== SNAPPY ==================================================
+#
+# Edit these 4 lines to define paths to Snappy
+#
+set(SNAPPY_HOME $ENV{THIRDPARTY_HOME}/Snappy.Library)
+set(SNAPPY_INCLUDE ${SNAPPY_HOME}/inc/inc)
+set(SNAPPY_LIB_DEBUG ${SNAPPY_HOME}/bin/debug/amd64/snappy.lib)
+set(SNAPPY_LIB_RELEASE ${SNAPPY_HOME}/bin/retail/amd64/snappy.lib)
+
+#
+# Don't touch these lines
+#
+if (DEFINED SNAPPY)
+  set(USE_SNAPPY ${SNAPPY})
+else ()
+  set(USE_SNAPPY ${USE_SNAPPY_DEFAULT})
+endif ()
+
+if (${USE_SNAPPY} EQUAL 1)
+  message(STATUS "SNAPPY library is enabled")
+  set(SNAPPY_CXX_FLAGS -DSNAPPY)
+  set(SNAPPY_LIBS debug ${SNAPPY_LIB_DEBUG} optimized ${SNAPPY_LIB_RELEASE})
+
+  add_definitions(${SNAPPY_CXX_FLAGS})
+  include_directories(${SNAPPY_INCLUDE})
+  set (THIRDPARTY_LIBS ${THIRDPARTY_LIBS} ${SNAPPY_LIBS})
+else ()
+  message(STATUS "SNAPPY library is disabled")
+endif ()
+
+# ================================================== LZ4 ==================================================
+#
+# Edit these 4 lines to define paths to LZ4
+#
+set(LZ4_HOME $ENV{THIRDPARTY_HOME}/LZ4.Library)
+set(LZ4_INCLUDE ${LZ4_HOME}/inc/include)
+set(LZ4_LIB_DEBUG ${LZ4_HOME}/bin/debug/amd64/lz4.lib)
+set(LZ4_LIB_RELEASE ${LZ4_HOME}/bin/retail/amd64/lz4.lib)
+
+#
+# Don't touch these lines
+#
+if (DEFINED LZ4)
+  set(USE_LZ4 ${LZ4})
+else ()
+  set(USE_LZ4 ${USE_LZ4_DEFAULT})
+endif ()
+
+if (${USE_LZ4} EQUAL 1)
+  message(STATUS "LZ4 library is enabled")
+  set(LZ4_CXX_FLAGS -DLZ4)
+  set(LZ4_LIBS debug ${LZ4_LIB_DEBUG} optimized ${LZ4_LIB_RELEASE})
+
+  add_definitions(${LZ4_CXX_FLAGS})
+  include_directories(${LZ4_INCLUDE})
+  set (THIRDPARTY_LIBS ${THIRDPARTY_LIBS} ${LZ4_LIBS})
+else ()
+  message(STATUS "LZ4 library is disabled")
+endif ()
+
+# ================================================== ZLIB ==================================================
+#
+# Edit these 4 lines to define paths to ZLIB
+#
+set(ZLIB_HOME $ENV{THIRDPARTY_HOME}/ZLIB.Library)
+set(ZLIB_INCLUDE ${ZLIB_HOME}/inc/include)
+set(ZLIB_LIB_DEBUG ${ZLIB_HOME}/bin/debug/amd64/zlib.lib)
+set(ZLIB_LIB_RELEASE ${ZLIB_HOME}/bin/retail/amd64/zlib.lib)
+
+#
+# Don't touch these lines
+#
+if (DEFINED ZLIB)
+  set(USE_ZLIB ${ZLIB})
+else ()
+  set(USE_ZLIB ${USE_ZLIB_DEFAULT})
+endif ()
+
+if (${USE_ZLIB} EQUAL 1)
+  message(STATUS "ZLIB library is enabled")
+  set(ZLIB_CXX_FLAGS -DZLIB)
+  set(ZLIB_LIBS debug ${ZLIB_LIB_DEBUG} optimized ${ZLIB_LIB_RELEASE})
+
+  add_definitions(${ZLIB_CXX_FLAGS})
+  include_directories(${ZLIB_INCLUDE})
+  set (THIRDPARTY_LIBS ${THIRDPARTY_LIBS} ${ZLIB_LIBS})
+else ()
+  message(STATUS "ZLIB library is disabled")
+endif ()
+
+#
+# Edit these 4 lines to define paths to Jemalloc
+#
+set(JEMALLOC_HOME $ENV{THIRDPARTY_HOME}/Jemalloc.Library)
+set(JEMALLOC_INCLUDE ${JEMALLOC_HOME}/inc/include)
+set(JEMALLOC_LIB_DEBUG ${JEMALLOC_HOME}/bin/debug/amd64/jemalloc.lib)
+set(JEMALLOC_LIB_RELEASE ${JEMALLOC_HOME}/bin/retail/amd64/jemalloc.lib)
+
+# ================================================== JEMALLOC ==================================================
+#
+# Don't touch these lines
+#
+if (DEFINED JEMALLOC)
+  set(USE_JEMALLOC ${JEMALLOC})
+else ()
+  set(USE_JEMALLOC ${USE_JEMALLOC_DEFAULT})
+endif ()
+
+if (${USE_JEMALLOC} EQUAL 1)
+  message(STATUS "JEMALLOC library is enabled")
+  set(JEMALLOC_CXX_FLAGS -DJEMALLOC)
+  set(JEMALLOC_LIBS debug ${JEMALLOC_LIB_DEBUG} optimized ${JEMALLOC_LIB_RELEASE})
+
+  add_definitions(${JEMALLOC_CXX_FLAGS})
+  include_directories(${JEMALLOC_INCLUDE})
+  set (THIRDPARTY_LIBS ${THIRDPARTY_LIBS} ${JEMALLOC_LIBS})
+  set (ARTIFACT_SUFFIX "_je")
+else ()
+  set (ARTIFACT_SUFFIX "")
+  message(STATUS "JEMALLOC library is disabled")
+endif ()
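
For context, the switches defined above are consumed from the CMake command line; a
minimal sketch of enabling two of the libraries follows (the THIRDPARTY_HOME path and
the source path are placeholders, not part of this change):

  # Sketch only: point THIRDPARTY_HOME at the root holding Snappy.Library, ZLIB.Library, etc.
  export THIRDPARTY_HOME=/path/to/thirdparty
  # Each -D<NAME>=1 flips the matching USE_<NAME> switch defined in thirdparty.inc.
  cmake -DSNAPPY=1 -DZLIB=1 /path/to/rocksdb
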
diff --git a/src/rocksdb/tools/Dockerfile b/src/rocksdb/tools/Dockerfile
new file mode 100644
index 0000000..1d5ead7
--- /dev/null
+++ b/src/rocksdb/tools/Dockerfile
@@ -0,0 +1,5 @@
+FROM buildpack-deps:wheezy
+
+ADD ./ldb /rocksdb/tools/ldb
+
+CMD /rocksdb/tools/ldb
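
The Dockerfile above only packages a prebuilt ldb binary; a hedged sketch of building
and running the image (the image tag is an assumption, and a static ldb binary must
already sit at src/rocksdb/tools/ldb for the ADD step to find it):

  # Sketch only: build from the tools directory so ./ldb resolves inside the context.
  docker build -t rocksdb-ldb src/rocksdb/tools
  docker run --rm -it rocksdb-ldb   # runs the CMD, i.e. /rocksdb/tools/ldb
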
diff --git a/src/rocksdb/tools/auto_sanity_test.sh b/src/rocksdb/tools/auto_sanity_test.sh
new file mode 100755
index 0000000..bece681
--- /dev/null
+++ b/src/rocksdb/tools/auto_sanity_test.sh
@@ -0,0 +1,91 @@
+TMP_DIR="/tmp/rocksdb-sanity-test"
+
+if [ "$#" -lt 2 ]; then
+  echo "usage: ./auto_sanity_test.sh [new_commit] [old_commit]"
+  echo "Missing either [new_commit] or [old_commit], perform sanity check with the latest and 10th latest commits."
+  recent_commits=`git log | grep -e "^commit [a-z0-9]\+$"| head -n10 | sed -e 's/commit //g'`
+  commit_new=`echo "$recent_commits" | head -n1`
+  commit_old=`echo "$recent_commits" | tail -n1`
+  echo "the most recent commits are:"
+  echo "$recent_commits"
+else
+  commit_new=$1
+  commit_old=$2
+fi
+
+if [ ! -d $TMP_DIR ]; then
+  mkdir $TMP_DIR
+fi
+dir_new="${TMP_DIR}/${commit_new}"
+dir_old="${TMP_DIR}/${commit_old}"
+
+function makestuff() {
+  echo "make clean"
+  make clean > /dev/null
+  echo "make db_sanity_test -j32"
+  make db_sanity_test -j32 > /dev/null
+  if [ $? -ne 0 ]; then
+    echo "[ERROR] Failed to perform 'make db_sanity_test'"
+    exit 1
+  fi
+}
+
+rm -r -f $dir_new
+rm -r -f $dir_old
+
+echo "Running db sanity check with commits $commit_new and $commit_old."
+
+echo "============================================================="
+echo "Making build $commit_new"
+git checkout $commit_new
+if [ $? -ne 0 ]; then
+  echo "[ERROR] Can't checkout $commit_new"
+  exit 1
+fi
+makestuff
+mv db_sanity_test new_db_sanity_test
+echo "Creating db based on the new commit --- $commit_new"
+./new_db_sanity_test $dir_new create
+cp ./tools/db_sanity_test.cc $dir_new
+cp ./tools/auto_sanity_test.sh $dir_new
+
+echo "============================================================="
+echo "Making build $commit_old"
+git checkout $commit_old
+if [ $? -ne 0 ]; then
+  echo "[ERROR] Can't checkout $commit_old"
+  exit 1
+fi
+cp -f $dir_new/db_sanity_test.cc ./tools/.
+cp -f $dir_new/auto_sanity_test.sh ./tools/.
+makestuff
+mv db_sanity_test old_db_sanity_test
+echo "Creating db based on the old commit --- $commit_old"
+./old_db_sanity_test $dir_old create
+
+echo "============================================================="
+echo "[Backward Compability Check]"
+echo "Verifying old db $dir_old using the new commit --- $commit_new"
+./new_db_sanity_test $dir_old verify
+if [ $? -ne 0 ]; then
+  echo "[ERROR] Backward Compability Check fails:"
+  echo "    Verification of $dir_old using commit $commit_new failed."
+  exit 2
+fi
+
+echo "============================================================="
+echo "[Forward Compatibility Check]"
+echo "Verifying new db $dir_new using the old commit --- $commit_old"
+./old_db_sanity_test $dir_new verify
+if [ $? -ne 0 ]; then
+  echo "[ERROR] Forward Compability Check fails:"
+  echo "    $dir_new using commit $commit_old failed."
+  exit 2
+fi
+
+rm old_db_sanity_test
+rm new_db_sanity_test
+rm -rf $dir_new
+rm -rf $dir_old
+
+echo "Auto sanity test passed!"
diff --git a/src/rocksdb/tools/benchmark.sh b/src/rocksdb/tools/benchmark.sh
new file mode 100755
index 0000000..3c862fd
--- /dev/null
+++ b/src/rocksdb/tools/benchmark.sh
@@ -0,0 +1,361 @@
+#!/bin/bash
+# REQUIRE: db_bench binary exists in the current directory
+
+if [ $# -ne 1 ]; then
+  echo -n "./benchmark.sh [bulkload/fillseq/overwrite/filluniquerandom/"
+  echo    "readrandom/readwhilewriting/readwhilemerging/updaterandom/"
+  echo    "mergerandom/randomtransaction]"
+  exit 0
+fi
+
+# size constants
+K=1024
+M=$((1024 * K))
+G=$((1024 * M))
+
+if [ -z $DB_DIR ]; then
+  echo "DB_DIR is not defined"
+  exit 0
+fi
+
+if [ -z $WAL_DIR ]; then
+  echo "WAL_DIR is not defined"
+  exit 0
+fi
+
+output_dir=${OUTPUT_DIR:-/tmp/}
+if [ ! -d $output_dir ]; then
+  mkdir -p $output_dir
+fi
+
+# all multithreaded tests run with sync=1 unless
+# $DB_BENCH_NO_SYNC is defined
+syncval="1"
+if [ ! -z $DB_BENCH_NO_SYNC ]; then
+  echo "Turning sync off for all multithreaded tests"
+  syncval="0";
+fi
+
+num_threads=${NUM_THREADS:-16}
+# Only for *whilewriting, *whilemerging
+writes_per_second=${WRITES_PER_SECOND:-$((10 * K))}
+# Only for tests that do range scans
+num_nexts_per_seek=${NUM_NEXTS_PER_SEEK:-10}
+cache_size=${CACHE_SIZE:-$((1 * G))}
+duration=${DURATION:-0}
+
+num_keys=${NUM_KEYS:-$((1 * G))}
+key_size=20
+value_size=${VALUE_SIZE:-400}
+block_size=${BLOCK_SIZE:-8192}
+
+const_params="
+  --db=$DB_DIR \
+  --wal_dir=$WAL_DIR \
+  --disable_data_sync=0 \
+  \
+  --num=$num_keys \
+  --num_levels=6 \
+  --key_size=$key_size \
+  --value_size=$value_size \
+  --block_size=$block_size \
+  --cache_size=$cache_size \
+  --cache_numshardbits=6 \
+  --compression_type=snappy \
+  --min_level_to_compress=3 \
+  --compression_ratio=0.5 \
+  --level_compaction_dynamic_level_bytes=true \
+  --bytes_per_sync=$((8 * M)) \
+  --cache_index_and_filter_blocks=0 \
+  \
+  --hard_rate_limit=3 \
+  --rate_limit_delay_max_milliseconds=1000000 \
+  --write_buffer_size=$((128 * M)) \
+  --max_write_buffer_number=8 \
+  --target_file_size_base=$((128 * M)) \
+  --max_bytes_for_level_base=$((1 * G)) \
+  \
+  --verify_checksum=1 \
+  --delete_obsolete_files_period_micros=$((60 * M)) \
+  --max_grandparent_overlap_factor=8 \
+  --max_bytes_for_level_multiplier=8 \
+  \
+  --statistics=0 \
+  --stats_per_interval=1 \
+  --stats_interval_seconds=60 \
+  --histogram=1 \
+  \
+  --memtablerep=skip_list \
+  --bloom_bits=10 \
+  --open_files=-1"
+
+l0_config="
+  --level0_file_num_compaction_trigger=4 \
+  --level0_slowdown_writes_trigger=12 \
+  --level0_stop_writes_trigger=20"
+
+if [ $duration -gt 0 ]; then
+  const_params="$const_params --duration=$duration"
+fi
+
+params_w="$const_params $l0_config --max_background_compactions=16 --max_background_flushes=7"
+params_bulkload="$const_params --max_background_compactions=16 --max_background_flushes=7 \
+                 --level0_file_num_compaction_trigger=$((10 * M)) \
+                 --level0_slowdown_writes_trigger=$((10 * M)) \
+                 --level0_stop_writes_trigger=$((10 * M))"
+
+function summarize_result {
+  test_out=$1
+  test_name=$2
+  bench_name=$3
+
+  uptime=$( grep ^Uptime\(secs $test_out | tail -1 | awk '{ printf "%.0f", $2 }' )
+  stall_time=$( grep "^Cumulative stall" $test_out | tail -1  | awk '{  print $3 }' )
+  stall_pct=$( grep "^Cumulative stall" $test_out| tail -1  | awk '{  print $5 }' )
+  ops_sec=$( grep ^${bench_name} $test_out | awk '{ print $5 }' )
+  mb_sec=$( grep ^${bench_name} $test_out | awk '{ print $7 }' )
+  lo_wgb=$( grep "^  L0" $test_out | tail -1 | awk '{ print $8 }' )
+  sum_wgb=$( grep "^ Sum" $test_out | tail -1 | awk '{ print $8 }' )
+  sum_size=$( grep "^ Sum" $test_out | tail -1 | awk '{ printf "%.1f", $3 / 1024.0 }' )
+  wamp=$( echo "scale=1; $sum_wgb / $lo_wgb" | bc )
+  wmb_ps=$( echo "scale=1; ( $sum_wgb * 1024.0 ) / $uptime" | bc )
+  usecs_op=$( grep ^${bench_name} $test_out | awk '{ printf "%.1f", $3 }' )
+  p50=$( grep "^Percentiles:" $test_out | tail -1 | awk '{ printf "%.1f", $3 }' )
+  p75=$( grep "^Percentiles:" $test_out | tail -1 | awk '{ printf "%.1f", $5 }' )
+  p99=$( grep "^Percentiles:" $test_out | tail -1 | awk '{ printf "%.0f", $7 }' )
+  p999=$( grep "^Percentiles:" $test_out | tail -1 | awk '{ printf "%.0f", $9 }' )
+  p9999=$( grep "^Percentiles:" $test_out | tail -1 | awk '{ printf "%.0f", $11 }' )
+  echo -e "$ops_sec\t$mb_sec\t$sum_size\t$lo_wgb\t$sum_wgb\t$wamp\t$wmb_ps\t$usecs_op\t$p50\t$p75\t$p99\t$p999\t$p9999\t$uptime\t$stall_time\t$stall_pct\t$test_name" \
+    >> $output_dir/report.txt
+}
+
+function run_bulkload {
+  # This runs with a vector memtable and the WAL disabled to load faster. It is still crash safe and the
+  # client can discover where to restart a load after a crash. I think this is a good way to load.
+  echo "Bulk loading $num_keys random keys"
+  cmd="./db_bench --benchmarks=fillrandom \
+       --use_existing_db=0 \
+       --disable_auto_compactions=1 \
+       --sync=0 \
+       $params_bulkload \
+       --threads=1 \
+       --memtablerep=vector \
+       --disable_wal=1 \
+       --seed=$( date +%s ) \
+       2>&1 | tee -a $output_dir/benchmark_bulkload_fillrandom.log"
+  echo $cmd | tee $output_dir/benchmark_bulkload_fillrandom.log
+  eval $cmd
+  summarize_result $output_dir/benchmark_bulkload_fillrandom.log bulkload fillrandom
+  echo "Compacting..."
+  cmd="./db_bench --benchmarks=compact \
+       --use_existing_db=1 \
+       --disable_auto_compactions=1 \
+       --sync=0 \
+       $params_w \
+       --threads=1 \
+       2>&1 | tee -a $output_dir/benchmark_bulkload_compact.log"
+  echo $cmd | tee $output_dir/benchmark_bulkload_compact.log
+  eval $cmd
+}
+
+function run_fillseq {
+  # This runs with a vector memtable and the WAL disabled to load faster. It is still crash safe and the
+  # client can discover where to restart a load after a crash. I think this is a good way to load.
+  echo "Loading $num_keys keys sequentially"
+  cmd="./db_bench --benchmarks=fillseq \
+       --use_existing_db=0 \
+       --sync=0 \
+       $params_w \
+       --min_level_to_compress=0 \
+       --threads=1 \
+       --memtablerep=vector \
+       --disable_wal=1 \
+       --seed=$( date +%s ) \
+       2>&1 | tee -a $output_dir/benchmark_fillseq.v${value_size}.log"
+  echo $cmd | tee $output_dir/benchmark_fillseq.v${value_size}.log
+  eval $cmd
+  summarize_result $output_dir/benchmark_fillseq.v${value_size}.log fillseq.v${value_size} fillseq
+}
+
+function run_change {
+  operation=$1
+  echo "Do $num_keys random $operation"
+  out_name="benchmark_${operation}.t${num_threads}.s${syncval}.log"
+  cmd="./db_bench --benchmarks=$operation \
+       --use_existing_db=1 \
+       --sync=$syncval \
+       $params_w \
+       --threads=$num_threads \
+       --merge_operator=\"put\" \
+       --seed=$( date +%s ) \
+       2>&1 | tee -a $output_dir/${out_name}"
+  echo $cmd | tee $output_dir/${out_name}
+  eval $cmd
+  summarize_result $output_dir/${out_name} ${operation}.t${num_threads}.s${syncval} $operation
+}
+
+function run_filluniquerandom {
+  echo "Loading $num_keys unique keys randomly"
+  cmd="./db_bench --benchmarks=filluniquerandom \
+       --use_existing_db=0 \
+       --sync=0 \
+       $params_w \
+       --threads=1 \
+       --seed=$( date +%s ) \
+       2>&1 | tee -a $output_dir/benchmark_filluniquerandom.log"
+  echo $cmd | tee $output_dir/benchmark_filluniquerandom.log
+  eval $cmd
+  summarize_result $output_dir/benchmark_filluniquerandom.log filluniquerandom filluniquerandom
+}
+
+function run_readrandom {
+  echo "Reading $num_keys random keys"
+  out_name="benchmark_readrandom.t${num_threads}.log"
+  cmd="./db_bench --benchmarks=readrandom \
+       --use_existing_db=1 \
+       $params_w \
+       --threads=$num_threads \
+       --seed=$( date +%s ) \
+       2>&1 | tee -a $output_dir/${out_name}"
+  echo $cmd | tee $output_dir/${out_name}
+  eval $cmd
+  summarize_result $output_dir/${out_name} readrandom.t${num_threads} readrandom
+}
+
+function run_readwhile {
+  operation=$1
+  echo "Reading $num_keys random keys while $operation"
+  out_name="benchmark_readwhile${operation}.t${num_threads}.log"
+  cmd="./db_bench --benchmarks=readwhile${operation} \
+       --use_existing_db=1 \
+       --sync=$syncval \
+       $params_w \
+       --threads=$num_threads \
+       --writes_per_second=$writes_per_second \
+       --merge_operator=\"put\" \
+       --seed=$( date +%s ) \
+       2>&1 | tee -a $output_dir/${out_name}"
+  echo $cmd | tee $output_dir/${out_name}
+  eval $cmd
+  summarize_result $output_dir/${out_name} readwhile${operation}.t${num_threads} readwhile${operation}
+}
+
+function run_rangewhile {
+  operation=$1
+  full_name=$2
+  reverse_arg=$3
+  out_name="benchmark_${full_name}.t${num_threads}.log"
+  echo "Range scan $num_keys random keys while ${operation} for reverse_iter=${reverse_arg}"
+  cmd="./db_bench --benchmarks=seekrandomwhile${operation} \
+       --use_existing_db=1 \
+       --sync=$syncval \
+       $params_w \
+       --threads=$num_threads \
+       --writes_per_second=$writes_per_second \
+       --merge_operator=\"put\" \
+       --seek_nexts=$num_nexts_per_seek \
+       --reverse_iterator=$reverse_arg \
+       --seed=$( date +%s ) \
+       2>&1 | tee -a $output_dir/${out_name}"
+  echo $cmd | tee $output_dir/${out_name}
+  eval $cmd
+  summarize_result $output_dir/${out_name} ${full_name}.t${num_threads} seekrandomwhile${operation}
+}
+
+function run_range {
+  full_name=$1
+  reverse_arg=$2
+  out_name="benchmark_${full_name}.t${num_threads}.log"
+  echo "Range scan $num_keys random keys for reverse_iter=${reverse_arg}"
+  cmd="./db_bench --benchmarks=seekrandom \
+       --use_existing_db=1 \
+       $params_w \
+       --threads=$num_threads \
+       --seek_nexts=$num_nexts_per_seek \
+       --reverse_iterator=$reverse_arg \
+       --seed=$( date +%s ) \
+       2>&1 | tee -a $output_dir/${out_name}"
+  echo $cmd | tee $output_dir/${out_name}
+  eval $cmd
+  summarize_result $output_dir/${out_name} ${full_name}.t${num_threads} seekrandom
+}
+
+function run_randomtransaction {
+  echo "..."
+  cmd="./db_bench $params_r --benchmarks=randomtransaction \
+       --num=$num_keys \
+       --transaction_db \
+       --threads=5 \
+       --transaction_sets=5 \
+       2>&1 | tee $output_dir/benchmark_randomtransaction.log"
+  echo $cmd | tee $output_dir/benchmark_randomtransaction.log
+  eval $cmd
+}
+
+function now() {
+  echo `date +"%s"`
+}
+
+report="$output_dir/report.txt"
+schedule="$output_dir/schedule.txt"
+
+echo "===== Benchmark ====="
+
+# Run!!!
+IFS=',' read -a jobs <<< $1
+for job in ${jobs[@]}; do
+
+  if [ $job != debug ]; then
+    echo "Start $job at `date`" | tee -a $schedule
+  fi
+
+  start=$(now)
+  if [ $job = bulkload ]; then
+    run_bulkload
+  elif [ $job = fillseq ]; then
+    run_fillseq
+  elif [ $job = overwrite ]; then
+    run_change overwrite
+  elif [ $job = updaterandom ]; then
+    run_change updaterandom
+  elif [ $job = mergerandom ]; then
+    run_change mergerandom
+  elif [ $job = filluniquerandom ]; then
+    run_filluniquerandom
+  elif [ $job = readrandom ]; then
+    run_readrandom
+  elif [ $job = fwdrange ]; then
+    run_range $job false
+  elif [ $job = revrange ]; then
+    run_range $job true
+  elif [ $job = readwhilewriting ]; then
+    run_readwhile writing
+  elif [ $job = readwhilemerging ]; then
+    run_readwhile merging
+  elif [ $job = fwdrangewhilewriting ]; then
+    run_rangewhile writing $job false
+  elif [ $job = revrangewhilewriting ]; then
+    run_rangewhile writing $job true
+  elif [ $job = fwdrangewhilemerging ]; then
+    run_rangewhile merging $job false
+  elif [ $job = revrangewhilemerging ]; then
+    run_rangewhile merging $job true
+  elif [ $job = randomtransaction ]; then
+    run_randomtransaction
+  elif [ $job = debug ]; then
+    num_keys=1000; # debug
+    echo "Setting num_keys to $num_keys"
+  else
+    echo "unknown job $job"
+    exit
+  fi
+  end=$(now)
+
+  if [ $job != debug ]; then
+    echo "Complete $job in $((end-start)) seconds" | tee -a $schedule
+  fi
+
+  echo -e "ops/sec\tmb/sec\tSize-GB\tL0_MB\tSum_GB\tW-Amp\tW-MB/s\tusec/op\tp50\tp75\tp99\tp99.9\tp99.99\tUptime\tStall-time\tStall%\tTest"
+  tail -1 $output_dir/report.txt
+
+done
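
Because the script exits early unless DB_DIR and WAL_DIR are set, a hedged invocation
sketch follows (all paths and sizes are illustrative):

  # Sketch only: db_bench must already exist in the current directory, as required above.
  export DB_DIR=/data/bench/db WAL_DIR=/data/bench/wal OUTPUT_DIR=/tmp/bench
  NUM_KEYS=$((100 * 1024 * 1024)) ./benchmark.sh bulkload,readrandom   # jobs are comma-separated
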
diff --git a/src/rocksdb/tools/benchmark_leveldb.sh b/src/rocksdb/tools/benchmark_leveldb.sh
new file mode 100755
index 0000000..dce66d4
--- /dev/null
+++ b/src/rocksdb/tools/benchmark_leveldb.sh
@@ -0,0 +1,185 @@
+#!/bin/bash
+# REQUIRE: db_bench binary exists in the current directory
+#
+# This should be used with the LevelDB fork listed here to use additional test options.
+# For more details on the changes see the blog post listed below.
+#   https://github.com/mdcallag/leveldb-1
+#   http://smalldatum.blogspot.com/2015/04/comparing-leveldb-and-rocksdb-take-2.html
+
+if [ $# -ne 1 ]; then
+  echo -n "./benchmark.sh [fillseq/overwrite/readrandom/readwhilewriting]"
+  exit 0
+fi
+
+# size constants
+K=1024
+M=$((1024 * K))
+G=$((1024 * M))
+
+if [ -z $DB_DIR ]; then
+  echo "DB_DIR is not defined"
+  exit 0
+fi
+
+output_dir=${OUTPUT_DIR:-/tmp/}
+if [ ! -d $output_dir ]; then
+  mkdir -p $output_dir
+fi
+
+# all multithreaded tests run with sync=1 unless
+# $DB_BENCH_NO_SYNC is defined
+syncval="1"
+if [ ! -z $DB_BENCH_NO_SYNC ]; then
+  echo "Turning sync off for all multithreaded tests"
+  syncval="0";
+fi
+
+num_threads=${NUM_THREADS:-16}
+# Only for *whilewriting, *whilemerging
+writes_per_second=${WRITES_PER_SECOND:-$((10 * K))}
+cache_size=${CACHE_SIZE:-$((1 * G))}
+
+num_keys=${NUM_KEYS:-$((1 * G))}
+key_size=20
+value_size=${VALUE_SIZE:-400}
+block_size=${BLOCK_SIZE:-4096}
+
+const_params="
+  --db=$DB_DIR \
+  \
+  --num=$num_keys \
+  --value_size=$value_size \
+  --cache_size=$cache_size \
+  --compression_ratio=0.5 \
+  \
+  --write_buffer_size=$((2 * M)) \
+  \
+  --histogram=1 \
+  \
+  --bloom_bits=10 \
+  --open_files=$((20 * K))"
+
+params_w="$const_params "
+
+function summarize_result {
+  test_out=$1
+  test_name=$2
+  bench_name=$3
+  nthr=$4
+
+  usecs_op=$( grep ^${bench_name} $test_out | awk '{ printf "%.1f", $3 }' )
+  mb_sec=$( grep ^${bench_name} $test_out | awk '{ printf "%.1f", $5 }' )
+  ops=$( grep "^Count:" $test_out | awk '{ print $2 }' )
+  ops_sec=$( echo "scale=0; (1000000.0 * $nthr) / $usecs_op" | bc )
+  avg=$( grep "^Count:" $test_out | awk '{ printf "%.1f", $4 }' )
+  p50=$( grep "^Min:" $test_out | awk '{ printf "%.1f", $4 }' )
+  echo -e "$ops_sec\t$mb_sec\t$usecs_op\t$avg\t$p50\t$test_name" \
+    >> $output_dir/report.txt
+}
+
+function run_fillseq {
+  # This loads keys sequentially with sync=0 to load faster. It is still crash
+  # safe and the client can discover where to restart a load after a crash.
+  echo "Loading $num_keys keys sequentially"
+  cmd="./db_bench --benchmarks=fillseq \
+       --use_existing_db=0 \
+       --sync=0 \
+       $params_w \
+       --threads=1 \
+       --seed=$( date +%s ) \
+       2>&1 | tee -a $output_dir/benchmark_fillseq.v${value_size}.log"
+  echo $cmd | tee $output_dir/benchmark_fillseq.v${value_size}.log
+  eval $cmd
+  summarize_result $output_dir/benchmark_fillseq.v${value_size}.log fillseq.v${value_size} fillseq 1
+}
+
+function run_change {
+  operation=$1
+  echo "Do $num_keys random $operation"
+  out_name="benchmark_${operation}.t${num_threads}.s${syncval}.log"
+  cmd="./db_bench --benchmarks=$operation \
+       --use_existing_db=1 \
+       --sync=$syncval \
+       $params_w \
+       --threads=$num_threads \
+       --seed=$( date +%s ) \
+       2>&1 | tee -a $output_dir/${out_name}"
+  echo $cmd | tee $output_dir/${out_name}
+  eval $cmd
+  summarize_result $output_dir/${out_name} ${operation}.t${num_threads}.s${syncval} $operation $num_threads
+}
+
+function run_readrandom {
+  echo "Reading $num_keys random keys"
+  out_name="benchmark_readrandom.t${num_threads}.log"
+  cmd="./db_bench --benchmarks=readrandom \
+       --use_existing_db=1 \
+       $params_w \
+       --threads=$num_threads \
+       --seed=$( date +%s ) \
+       2>&1 | tee -a $output_dir/${out_name}"
+  echo $cmd | tee $output_dir/${out_name}
+  eval $cmd
+  summarize_result $output_dir/${out_name} readrandom.t${num_threads} readrandom $num_threads
+}
+
+function run_readwhile {
+  operation=$1
+  echo "Reading $num_keys random keys while $operation"
+  out_name="benchmark_readwhile${operation}.t${num_threads}.log"
+  cmd="./db_bench --benchmarks=readwhile${operation} \
+       --use_existing_db=1 \
+       --sync=$syncval \
+       $params_w \
+       --threads=$num_threads \
+       --writes_per_second=$writes_per_second \
+       --seed=$( date +%s ) \
+       2>&1 | tee -a $output_dir/${out_name}"
+  echo $cmd | tee $output_dir/${out_name}
+  eval $cmd
+  summarize_result $output_dir/${out_name} readwhile${operation}.t${num_threads} readwhile${operation} $num_threads
+}
+
+function now() {
+  echo `date +"%s"`
+}
+
+report="$output_dir/report.txt"
+schedule="$output_dir/schedule.txt"
+
+echo "===== Benchmark ====="
+
+# Run!!!
+IFS=',' read -a jobs <<< $1
+for job in ${jobs[@]}; do
+
+  if [ $job != debug ]; then
+    echo "Start $job at `date`" | tee -a $schedule
+  fi
+
+  start=$(now)
+  if [ $job = fillseq ]; then
+    run_fillseq
+  elif [ $job = overwrite ]; then
+    run_change overwrite
+  elif [ $job = readrandom ]; then
+    run_readrandom
+  elif [ $job = readwhilewriting ]; then
+    run_readwhile writing
+  elif [ $job = debug ]; then
+    num_keys=1000; # debug
+    echo "Setting num_keys to $num_keys"
+  else
+    echo "unknown job $job"
+    exit
+  fi
+  end=$(now)
+
+  if [ $job != debug ]; then
+    echo "Complete $job in $((end-start)) seconds" | tee -a $schedule
+  fi
+
+  echo -e "ops/sec\tmb/sec\tusec/op\tavg\tp50\tTest"
+  tail -1 $output_dir/report.txt
+
+done
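
A corresponding sketch for this LevelDB variant, which only requires DB_DIR (the paths
are illustrative, and db_bench from the LevelDB fork linked above must sit in the
current directory):

  # Sketch only.
  export DB_DIR=/data/leveldb-bench OUTPUT_DIR=/tmp/bench
  ./benchmark_leveldb.sh fillseq,readwhilewriting
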
diff --git a/src/rocksdb/tools/check_format_compatible.sh b/src/rocksdb/tools/check_format_compatible.sh
new file mode 100755
index 0000000..65bbe0b
--- /dev/null
+++ b/src/rocksdb/tools/check_format_compatible.sh
@@ -0,0 +1,115 @@
+#!/bin/bash
+#
+# A shell script to load some pre-generated data files to a DB using the ldb tool.
+# ./ldb needs to be available to be executed.
+#
+# Usage: <SCRIPT> [checkout]
+# `checkout` can be a tag, commit, or branch name. The script will build it and check that DBs generated by all previous tags can be opened by it.
+# Return value 0 means all regression tests pass. 1 if not pass.
+
+scriptpath=`dirname $BASH_SOURCE`
+test_dir=${TEST_TMPDIR:-"/tmp"}"/format_compatible_check"
+script_copy_dir=$test_dir"/script_copy"
+input_data_path=$test_dir"/test_data_input/"
+
+mkdir $test_dir || true
+mkdir $input_data_path || true
+rm -rf $script_copy_dir
+cp $scriptpath $script_copy_dir -rf
+
+# Generate six random input files.
+for i in {1..6}
+do
+  input_data[$i]=$input_data_path/data$i
+  echo == Generating random input file ${input_data[$i]}
+  python - <<EOF
+import random
+random.seed($i)
+symbols=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
+with open('${input_data[$i]}', 'w') as f:
+  for i in range(1,1024):
+    k = ""
+    for j in range(1, random.randint(1,32)):
+      k=k + symbols[random.randint(0, len(symbols) - 1)]
+    vb = ""
+    for j in range(1, random.randint(0,128)):
+      vb = vb + symbols[random.randint(0, len(symbols) - 1)]
+    v = ""
+    for j in range(1, random.randint(1, 5)):
+      v = v + vb
+    print >> f, k + " ==> " + v
+EOF
+done
+
+# v2.1 or older doesn't pass the debug build but OK with release build
+declare -a need_release_tags=("v1.5.7" "v2.1")
+declare -a tags=("v2.5" "v2.4" "v2.3" "v2.2" "v2.8" "v3.0" "v3.1" "v3.2" "v3.3" "v3.4" "rocksdb-3.5.1" "rocksdb-3.6.2" "rocksdb-3.7" "rocksdb-3.8" "rocksdb-3.9" "v3.10")
+declare -a forward_compatible_tags=("rocksdb-3.8" "rocksdb-3.9" "v3.10")
+
+generate_db()
+{
+    set +e
+    $script_copy_dir/generate_random_db.sh $1 $2
+    if [ $? -ne 0 ]; then
+        echo ==== Error loading data from $2 to $1 ====
+        exit 1
+    fi
+    set -e
+}
+
+compare_db()
+{
+    set +e
+    $script_copy_dir/verify_random_db.sh $1 $2 $3
+    if [ $? -ne 0 ]; then
+        echo ==== Read different content from $1 and $2 or error happened. ====
+        exit 1
+    fi
+    set -e
+}
+
+set -e
+for tag in "${tags[@]}" "${need_release_tags[@]}"
+do
+   echo == Generating DB from "$tag" ...
+   git checkout $tag
+   make clean
+   make ldb -j32
+   generate_db $input_data_path $test_dir/$tag
+done
+
+checkout_flag=${1:-"master"}
+
+echo == Building $checkout_flag debug
+git checkout $checkout_flag
+make clean
+make ldb -j32
+compare_base_db_dir=$test_dir"/base_db_dir"
+echo == Generate compare base DB to $compare_base_db_dir
+generate_db $input_data_path $compare_base_db_dir
+
+for tag in "${tags[@]}"
+do
+   echo == Opening DB from "$tag" using debug build of $checkout_flag ...
+   compare_db $test_dir/$tag $compare_base_db_dir db_dump.txt
+done
+
+echo == Building $checkout_flag release
+git checkout $checkout_flag
+make release
+for tag in "${need_release_tags[@]}"
+do
+   echo == Opening DB generated by "$tag" using release build of $checkout_flag ...
+   compare_db $test_dir/$tag $compare_base_db_dir db_dump.txt
+done
+
+for tag in "${forward_compatible_tags[@]}"
+do
+   echo == Build "$tag" and try to open DB generated using $checkout_flag...
+   git checkout $tag
+   make clean
+   make ldb -j32
+   compare_db $test_dir/$tag $compare_base_db_dir forward_${tag}_dump.txt
+done
+
+echo ==== Compatibility Test PASSED ====
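
Per the usage comment at the top of the script, the optional argument names the build
under test; a minimal sketch (run from the root of a RocksDB checkout):

  # Sketch only: with no argument the script builds and checks master.
  ./tools/check_format_compatible.sh
  ./tools/check_format_compatible.sh v3.10   # or name a tag, branch, or commit
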
diff --git a/src/rocksdb/tools/db_crashtest.py b/src/rocksdb/tools/db_crashtest.py
new file mode 100644
index 0000000..6ef20ff
--- /dev/null
+++ b/src/rocksdb/tools/db_crashtest.py
@@ -0,0 +1,203 @@
+#! /usr/bin/env python
+import os
+import re
+import sys
+import time
+import random
+import getopt
+import logging
+import tempfile
+import subprocess
+import shutil
+
+# This script runs and kills db_stress multiple times. It checks consistency
+# in case of unsafe crashes in RocksDB.
+
+def main(argv):
+    try:
+        opts, args = getopt.getopt(argv, "hsd:t:i:o:b:")
+    except getopt.GetoptError:
+        print("db_crashtest.py -d <duration_test> -t <#threads> "
+              "-i <interval for one run> -o <ops_per_thread> "
+              "-b <write_buffer_size> [-s (simple mode)]\n")
+        sys.exit(2)
+
+    # default values, will be overridden by cmdline args
+    interval = 120  # time for one db_stress instance to run
+    duration = 6000  # total time for this script to test db_stress
+    threads = 32
+    # since we will be killing anyway, use large value for ops_per_thread
+    ops_per_thread = 100000000
+    write_buf_size = 4 * 1024 * 1024
+    simple_mode = False
+    write_buf_size_set = False
+    for opt, arg in opts:
+        if opt == '-h':
+            print("db_crashtest.py -d <duration_test>"
+                  " -t <#threads> -i <interval for one run>"
+                  " -o <ops_per_thread> -b <write_buffer_size>"
+                  " [-s (simple mode)]\n")
+            sys.exit()
+        elif opt == '-s':
+            simple_mode = True
+            if not write_buf_size_set:
+                write_buf_size = 32 * 1024 * 1024
+        elif opt == "-d":
+            duration = int(arg)
+        elif opt == "-t":
+            threads = int(arg)
+        elif opt == "-i":
+            interval = int(arg)
+        elif opt == "-o":
+            ops_per_thread = int(arg)
+        elif opt == "-b":
+            write_buf_size = int(arg)
+            write_buf_size_set = True
+        else:
+            print("db_crashtest.py -d <duration_test>"
+                  " -t <#threads> -i <interval for one run>"
+                  " -o <ops_per_thread> -b <write_buffer_size>\n")
+            sys.exit(2)
+
+    exit_time = time.time() + duration
+
+    print("Running blackbox-crash-test with \ninterval_between_crash="
+          + str(interval) + "\ntotal-duration=" + str(duration)
+          + "\nthreads=" + str(threads) + "\nops_per_thread="
+          + str(ops_per_thread) + "\nwrite_buffer_size="
+          + str(write_buf_size) + "\n")
+
+    test_tmpdir = os.environ.get("TEST_TMPDIR")
+    if test_tmpdir is None or test_tmpdir == "":
+        dbname = tempfile.mkdtemp(prefix='rocksdb_crashtest_')
+    else:
+        dbname = test_tmpdir + "/rocksdb_crashtest"
+        shutil.rmtree(dbname, True)
+
+    while time.time() < exit_time:
+        run_had_errors = False
+        killtime = time.time() + interval
+
+        if simple_mode:
+            cmd = re.sub('\s+', ' ', """
+                ./db_stress
+                --column_families=1
+                --test_batches_snapshots=0
+                --ops_per_thread=%s
+                --threads=%s
+                --write_buffer_size=%s
+                --destroy_db_initially=0
+                --reopen=20
+                --readpercent=50
+                --prefixpercent=0
+                --writepercent=35
+                --delpercent=5
+                --iterpercent=10
+                --db=%s
+                --max_key=100000000
+                --mmap_read=%s
+                --block_size=16384
+                --cache_size=1048576
+                --open_files=-1
+                --verify_checksum=1
+                --sync=0
+                --progress_reports=0
+                --disable_wal=0
+                --disable_data_sync=1
+                --target_file_size_base=16777216
+                --target_file_size_multiplier=1
+                --max_write_buffer_number=3
+                --max_background_compactions=1
+                --max_bytes_for_level_base=67108864
+                --filter_deletes=%s
+                --memtablerep=skip_list
+                --prefix_size=0
+                --set_options_one_in=0
+                """ % (ops_per_thread,
+                       threads,
+                       write_buf_size,
+                       dbname,
+                       random.randint(0, 1),
+                       random.randint(0, 1)))
+        else:
+            cmd = re.sub('\s+', ' ', """
+                ./db_stress
+                --test_batches_snapshots=1
+                --ops_per_thread=%s
+                --threads=%s
+                --write_buffer_size=%s
+                --destroy_db_initially=0
+                --reopen=20
+                --readpercent=45
+                --prefixpercent=5
+                --writepercent=35
+                --delpercent=5
+                --iterpercent=10
+                --db=%s
+                --max_key=100000000
+                --mmap_read=%s
+                --block_size=16384
+                --cache_size=1048576
+                --open_files=500000
+                --verify_checksum=1
+                --sync=0
+                --progress_reports=0
+                --disable_wal=0
+                --disable_data_sync=1
+                --target_file_size_base=2097152
+                --target_file_size_multiplier=2
+                --max_write_buffer_number=3
+                --max_background_compactions=20
+                --max_bytes_for_level_base=10485760
+                --filter_deletes=%s
+                --memtablerep=prefix_hash
+                --prefix_size=7
+                --set_options_one_in=10000
+                """ % (ops_per_thread,
+                       threads,
+                       write_buf_size,
+                       dbname,
+                       random.randint(0, 1),
+                       random.randint(0, 1)))
+
+        child = subprocess.Popen([cmd],
+                                 stderr=subprocess.PIPE, shell=True)
+        print("Running db_stress with pid=%d: %s\n\n"
+              % (child.pid, cmd))
+
+        stop_early = False
+        while time.time() < killtime:
+            if child.poll() is not None:
+                print("WARNING: db_stress ended before kill: exitcode=%d\n"
+                      % child.returncode)
+                stop_early = True
+                break
+            time.sleep(1)
+
+        if not stop_early:
+            if child.poll() is not None:
+                print("WARNING: db_stress ended before kill: exitcode=%d\n"
+                      % child.returncode)
+            else:
+                child.kill()
+                print("KILLED %d\n" % child.pid)
+                time.sleep(1)  # time to stabilize after a kill
+
+        while True:
+            line = child.stderr.readline().strip()
+            if line != '':
+                run_had_errors = True
+                print('***' + line + '^')
+            else:
+                break
+
+        if run_had_errors:
+            sys.exit(2)
+
+        time.sleep(1)  # time to stabilize before the next run
+
+    # we need to clean up after ourselves -- only do this on test success
+    shutil.rmtree(dbname, True)
+
+if __name__ == "__main__":
+    sys.exit(main(sys.argv[1:]))
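
A hedged sketch of driving this blackbox crash test with shorter runs than the defaults
(all values are illustrative; db_stress must already be built in the current directory,
since the script launches ./db_stress):

  # Sketch only: a 10-minute test that kills db_stress every 60 seconds, 16 threads.
  python tools/db_crashtest.py -d 600 -i 60 -t 16
  python tools/db_crashtest.py -s   # simple mode: one column family, larger write buffer
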
diff --git a/src/rocksdb/tools/db_crashtest2.py b/src/rocksdb/tools/db_crashtest2.py
new file mode 100644
index 0000000..a74053e
--- /dev/null
+++ b/src/rocksdb/tools/db_crashtest2.py
@@ -0,0 +1,231 @@
+#! /usr/bin/env python
+import os
+import re
+import sys
+import time
+import random
+import getopt
+import logging
+import tempfile
+import subprocess
+import shutil
+
+# This python script runs db_stress multiple times. Some runs use
+# kill_random_test, which causes rocksdb to crash at various points in the code.
+
+def main(argv):
+    try:
+        opts, args = getopt.getopt(argv, "hsd:t:k:o:b:")
+    except getopt.GetoptError as err:
+        print str(err)
+        print "db_crashtest2.py -d <duration_test> -t <#threads> " \
+              "-k <kills with prob 1/k> -o <ops_per_thread> "\
+              "-b <write_buffer_size> [-s (simple mode)]\n"
+        sys.exit(2)
+
+    # default values, will be overridden by cmdline args
+    kill_random_test = 97  # kill with probability 1/97 by default
+    duration = 10000  # total time for this script to test db_stress
+    threads = 32
+    ops_per_thread = 200000
+    write_buf_size = 4 * 1024 * 1024
+    simple_mode = False
+    write_buf_size_set = False
+
+    for opt, arg in opts:
+        if opt == '-h':
+            print "db_crashtest2.py -d <duration_test> -t <#threads> " \
+                  "-k <kills with prob 1/k> -o <ops_per_thread> " \
+                  "-b <write_buffer_size> [-s (simple mode)]\n"
+            sys.exit()
+        elif opt == '-s':
+            simple_mode = True
+            if not write_buf_size_set:
+                write_buf_size = 32 * 1024 * 1024
+        elif opt == "-d":
+            duration = int(arg)
+        elif opt == "-t":
+            threads = int(arg)
+        elif opt == "-k":
+            kill_random_test = int(arg)
+        elif opt == "-o":
+            ops_per_thread = int(arg)
+        elif opt == "-b":
+            write_buf_size = int(arg)
+            write_buf_size_set = True
+        else:
+            print "unrecognized option " + str(opt) + "\n"
+            print "db_crashtest2.py -d <duration_test> -t <#threads> " \
+                  "-k <kills with prob 1/k> -o <ops_per_thread> " \
+                  "-b <write_buffer_size>\n"
+            sys.exit(2)
+
+    cur_time = time.time()
+    exit_time = cur_time + duration
+    half_time = cur_time + duration / 2
+
+    print "Running whitebox-crash-test with \ntotal-duration=" + str(duration) \
+          + "\nthreads=" + str(threads) + "\nops_per_thread=" \
+          + str(ops_per_thread) + "\nwrite_buffer_size=" \
+          + str(write_buf_size) + "\n"
+
+    total_check_mode = 4
+    check_mode = 0
+
+    test_tmpdir = os.environ.get("TEST_TMPDIR")
+    if test_tmpdir is None or test_tmpdir == "":
+        dbname = tempfile.mkdtemp(prefix='rocksdb_crashtest2_')
+    else:
+        dbname = test_tmpdir + "/rocksdb_crashtest2"
+        shutil.rmtree(dbname, True)
+
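+    # The loop below cycles through four check modes (see the check_mode
+    # handling that follows): 0 = kill test, 1 = universal compaction,
+    # 2 = FIFO compaction, 3 = plain run with default compaction.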
+    while time.time() < exit_time:
+        killoption = ""
+        if check_mode == 0:
+            # run with kill_random_test
+            killoption = " --kill_random_test=" + str(kill_random_test)
+            # use large ops per thread since we will kill it anyway
+            additional_opts = "--ops_per_thread=" + \
+                              str(100 * ops_per_thread) + killoption
+        elif check_mode == 1:
+            # normal run with universal compaction mode
+            additional_opts = "--ops_per_thread=" + str(ops_per_thread) + \
+                              " --compaction_style=1"
+        elif check_mode == 2:
+            # normal run with FIFO compaction mode
+            # ops_per_thread is divided by 5 because FIFO compaction
+            # style is quite a bit slower on reads with a lot of files
+            additional_opts = "--ops_per_thread=" + str(ops_per_thread / 5) + \
+                              " --compaction_style=2"
+        else:
+            # normal run
+            additional_opts = "--ops_per_thread=" + str(ops_per_thread)
+
+        if simple_mode:
+            cmd = re.sub('\s+', ' ', """
+                ./db_stress
+                --column_families=1
+                --threads=%s
+                --write_buffer_size=%s
+                --destroy_db_initially=0
+                --reopen=20
+                --prefixpercent=0
+                --readpercent=50
+                --writepercent=35
+                --delpercent=5
+                --iterpercent=10
+                --db=%s
+                --max_key=100000000
+                --mmap_read=%s
+                --block_size=16384
+                --cache_size=1048576
+                --open_files=500000
+                --verify_checksum=1
+                --sync=0
+                --progress_reports=0
+                --disable_wal=0
+                --disable_data_sync=1
+                --target_file_size_base=16777216
+                --target_file_size_multiplier=1
+                --max_write_buffer_number=3
+                --max_background_compactions=1
+                --max_bytes_for_level_base=67108864
+                --filter_deletes=%s
+                --memtablerep=skip_list
+                --prefix_size=0
+                %s
+                """ % (threads,
+                       write_buf_size,
+                       dbname,
+                       random.randint(0, 1),
+                       random.randint(0, 1),
+                       additional_opts))
+        else:
+            cmd = re.sub('\s+', ' ', """
+                ./db_stress
+                --test_batches_snapshots=%s
+                --threads=%s
+                --write_buffer_size=%s
+                --destroy_db_initially=0
+                --reopen=20
+                --readpercent=45
+                --prefixpercent=5
+                --writepercent=35
+                --delpercent=5
+                --iterpercent=10
+                --db=%s
+                --max_key=100000000
+                --mmap_read=%s
+                --block_size=16384
+                --cache_size=1048576
+                --open_files=500000
+                --verify_checksum=1
+                --sync=0
+                --progress_reports=0
+                --disable_wal=0
+                --disable_data_sync=1
+                --target_file_size_base=2097152
+                --target_file_size_multiplier=2
+                --max_write_buffer_number=3
+                --max_background_compactions=20
+                --max_bytes_for_level_base=10485760
+                --filter_deletes=%s
+                --memtablerep=prefix_hash
+                --prefix_size=7
+                %s
+                """ % (random.randint(0, 1),
+                       threads,
+                       write_buf_size,
+                       dbname,
+                       random.randint(0, 1),
+                       random.randint(0, 1),
+                       additional_opts))
+
+        print "Running:" + cmd + "\n"
+
+        popen = subprocess.Popen([cmd], stdout=subprocess.PIPE,
+                                 stderr=subprocess.STDOUT,
+                                 shell=True)
+        stdoutdata, stderrdata = popen.communicate()
+        retncode = popen.returncode
+        msg = ("check_mode={0}, kill option={1}, exitcode={2}\n".format(
+               check_mode, killoption, retncode))
+        print msg
+        print stdoutdata
+
+        expected = False
+        if (killoption == '') and (retncode == 0):
+            # we expect zero retncode if no kill option
+            expected = True
+        elif killoption != '' and retncode < 0:
+            # we expect negative retncode if kill option was given
+            expected = True
+
+        if not expected:
+            print "TEST FAILED. See kill option and exit code above!!!\n"
+            sys.exit(1)
+
+        stdoutdata = stdoutdata.lower()
+        errorcount = (stdoutdata.count('error') -
+                      stdoutdata.count('got errors 0 times'))
+        print "#times error occurred in output is " + str(errorcount) + "\n"
+
+        if (errorcount > 0):
+            print "TEST FAILED. Output has 'error'!!!\n"
+            sys.exit(2)
+        if (stdoutdata.find('fail') >= 0):
+            print "TEST FAILED. Output has 'fail'!!!\n"
+            sys.exit(2)
+
+        # For the first half of the duration, keep running the kill test.
+        # For the second half, cycle through the other check modes.
+        if time.time() > half_time:
+            # we need to clean up after ourselves -- only do this on test
+            # success
+            shutil.rmtree(dbname, True)
+            check_mode = (check_mode + 1) % total_check_mode
+
+        time.sleep(1)  # time to stabilize after a kill
+
+if __name__ == "__main__":
+    sys.exit(main(sys.argv[1:]))
diff --git a/src/rocksdb/tools/db_repl_stress.cc b/src/rocksdb/tools/db_repl_stress.cc
new file mode 100644
index 0000000..0fca5d5
--- /dev/null
+++ b/src/rocksdb/tools/db_repl_stress.cc
@@ -0,0 +1,158 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef ROCKSDB_LITE
+#ifndef GFLAGS
+#include <cstdio>
+int main() {
+  fprintf(stderr, "Please install gflags to run rocksdb tools\n");
+  return 1;
+}
+#else
+
+#include <cstdio>
+#include <atomic>
+
+#include <gflags/gflags.h>
+
+#include "db/write_batch_internal.h"
+#include "rocksdb/db.h"
+#include "rocksdb/types.h"
+#include "util/testutil.h"
+
+// Runs one thread performing Puts while another thread uses the
+// GetUpdatesSince API to keep fetching the updates.
+// Options:
+// --num_inserts = the number of inserts the first thread should perform.
+// --wal_ttl = the WAL TTL for the run.
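+//
+// A hypothetical invocation (flag values are illustrative; the flags are
+// defined further below):
+//   ./db_repl_stress --num_inserts=100000 --wal_ttl_seconds=1000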
+
+using namespace rocksdb;
+
+using GFLAGS::ParseCommandLineFlags;
+using GFLAGS::SetUsageMessage;
+
+struct DataPumpThread {
+  size_t no_records;
+  DB* db; // Assumption DB is Open'ed already.
+};
+
+static std::string RandomString(Random* rnd, int len) {
+  std::string r;
+  test::RandomString(rnd, len, &r);
+  return r;
+}
+
+static void DataPumpThreadBody(void* arg) {
+  DataPumpThread* t = reinterpret_cast<DataPumpThread*>(arg);
+  DB* db = t->db;
+  Random rnd(301);
+  size_t i = 0;
+  while(i++ < t->no_records) {
+    if(!db->Put(WriteOptions(), Slice(RandomString(&rnd, 500)),
+                Slice(RandomString(&rnd, 500))).ok()) {
+      fprintf(stderr, "Error in put\n");
+      exit(1);
+    }
+  }
+}
+
+struct ReplicationThread {
+  std::atomic<bool> stop;
+  DB* db;
+  volatile size_t no_read;
+};
+
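+// Tails the WAL via GetUpdatesSince and checks that batch sequence
+// numbers are contiguous; any missed sequence number aborts the test.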
+static void ReplicationThreadBody(void* arg) {
+  ReplicationThread* t = reinterpret_cast<ReplicationThread*>(arg);
+  DB* db = t->db;
+  unique_ptr<TransactionLogIterator> iter;
+  SequenceNumber currentSeqNum = 1;
+  while (!t->stop.load(std::memory_order_acquire)) {
+    iter.reset();
+    Status s;
+    while(!db->GetUpdatesSince(currentSeqNum, &iter).ok()) {
+      if (t->stop.load(std::memory_order_acquire)) {
+        return;
+      }
+    }
+    fprintf(stderr, "Refreshing iterator\n");
+    for(;iter->Valid(); iter->Next(), t->no_read++, currentSeqNum++) {
+      BatchResult res = iter->GetBatch();
+      if (res.sequence != currentSeqNum) {
+        fprintf(stderr,
+                "Missed a seq no. b/w %ld and %ld\n",
+                (long)currentSeqNum,
+                (long)res.sequence);
+        exit(1);
+      }
+    }
+  }
+}
+
+DEFINE_uint64(num_inserts, 1000, "the number of inserts the first thread"
+              " should perform.");
+DEFINE_uint64(wal_ttl_seconds, 1000, "the WAL TTL for the run (in seconds)");
+DEFINE_uint64(wal_size_limit_MB, 10, "the WAL size limit for the run"
+              " (in MB)");
+
+int main(int argc, const char** argv) {
+  SetUsageMessage(
+      std::string("\nUSAGE:\n") + std::string(argv[0]) +
+      " --num_inserts=<num_inserts> --wal_ttl_seconds=<WAL_ttl_seconds>" +
+      " --wal_size_limit_MB=<WAL_size_limit_MB>");
+  ParseCommandLineFlags(&argc, const_cast<char***>(&argv), true);
+
+  Env* env = Env::Default();
+  std::string default_db_path;
+  env->GetTestDirectory(&default_db_path);
+  default_db_path += "db_repl_stress";
+  Options options;
+  options.create_if_missing = true;
+  options.WAL_ttl_seconds = FLAGS_wal_ttl_seconds;
+  options.WAL_size_limit_MB = FLAGS_wal_size_limit_MB;
+  DB* db;
+  DestroyDB(default_db_path, options);
+
+  Status s = DB::Open(options, default_db_path, &db);
+
+  if (!s.ok()) {
+    fprintf(stderr, "Could not open DB due to %s\n", s.ToString().c_str());
+    exit(1);
+  }
+
+  DataPumpThread dataPump;
+  dataPump.no_records = FLAGS_num_inserts;
+  dataPump.db = db;
+  env->StartThread(DataPumpThreadBody, &dataPump);
+
+  ReplicationThread replThread;
+  replThread.db = db;
+  replThread.no_read = 0;
+  replThread.stop.store(false, std::memory_order_release);
+
+  env->StartThread(ReplicationThreadBody, &replThread);
+  while (replThread.no_read < FLAGS_num_inserts);
+  replThread.stop.store(true, std::memory_order_release);
+  if (replThread.no_read < dataPump.no_records) {
+    // no. read should be => than inserted.
+    fprintf(stderr,
+            "No. of Record's written and read not same\nRead : %" ROCKSDB_PRIszt
+            " Written : %" ROCKSDB_PRIszt "\n",
+            replThread.no_read, dataPump.no_records);
+    exit(1);
+  }
+  fprintf(stderr, "Successful!\n");
+  exit(0);
+}
+
+#endif  // GFLAGS
+
+#else  // ROCKSDB_LITE
+#include <stdio.h>
+int main(int argc, char** argv) {
+  fprintf(stderr, "Not supported in lite mode.\n");
+  return 1;
+}
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/tools/db_sanity_test.cc b/src/rocksdb/tools/db_sanity_test.cc
new file mode 100644
index 0000000..b7176f4
--- /dev/null
+++ b/src/rocksdb/tools/db_sanity_test.cc
@@ -0,0 +1,294 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include <cstdio>
+#include <cstdlib>
+#include <vector>
+#include <memory>
+
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/env.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/table.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/filter_policy.h"
+#include "port/port.h"
+#include "util/string_util.h"
+
+namespace rocksdb {
+
+class SanityTest {
+ public:
+  explicit SanityTest(const std::string& path)
+      : env_(Env::Default()), path_(path) {
+    env_->CreateDirIfMissing(path);
+  }
+  virtual ~SanityTest() {}
+
+  virtual std::string Name() const = 0;
+  virtual Options GetOptions() const = 0;
+
+  Status Create() {
+    Options options = GetOptions();
+    options.create_if_missing = true;
+    std::string dbname = path_ + Name();
+    DestroyDB(dbname, options);
+    DB* db = nullptr;
+    Status s = DB::Open(options, dbname, &db);
+    std::unique_ptr<DB> db_guard(db);
+    if (!s.ok()) {
+      return s;
+    }
+    for (int i = 0; i < 1000000; ++i) {
+      std::string k = "key" + ToString(i);
+      std::string v = "value" + ToString(i);
+      s = db->Put(WriteOptions(), Slice(k), Slice(v));
+      if (!s.ok()) {
+        return s;
+      }
+    }
+    return db->Flush(FlushOptions());
+  }
+  Status Verify() {
+    DB* db = nullptr;
+    std::string dbname = path_ + Name();
+    Status s = DB::Open(GetOptions(), dbname, &db);
+    std::unique_ptr<DB> db_guard(db);
+    if (!s.ok()) {
+      return s;
+    }
+    for (int i = 0; i < 1000000; ++i) {
+      std::string k = "key" + ToString(i);
+      std::string v = "value" + ToString(i);
+      std::string result;
+      s = db->Get(ReadOptions(), Slice(k), &result);
+      if (!s.ok()) {
+        return s;
+      }
+      if (result != v) {
+        return Status::Corruption("Unexpected value for key " + k);
+      }
+    }
+    return Status::OK();
+  }
+
+ private:
+  Env* env_;
+  std::string const path_;
+};
+
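+// Each subclass below overrides only GetOptions() and Name(); Create()
+// writes one million key/value pairs under those options and Verify()
+// re-opens the same database and checks every pair.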
+class SanityTestBasic : public SanityTest {
+ public:
+  explicit SanityTestBasic(const std::string& path) : SanityTest(path) {}
+  virtual Options GetOptions() const override {
+    Options options;
+    options.create_if_missing = true;
+    return options;
+  }
+  virtual std::string Name() const override { return "Basic"; }
+};
+
+class SanityTestSpecialComparator : public SanityTest {
+ public:
+  explicit SanityTestSpecialComparator(const std::string& path)
+      : SanityTest(path) {
+    options_.comparator = new NewComparator();
+  }
+  ~SanityTestSpecialComparator() { delete options_.comparator; }
+  virtual Options GetOptions() const override { return options_; }
+  virtual std::string Name() const override { return "SpecialComparator"; }
+
+ private:
+  class NewComparator : public Comparator {
+   public:
+    virtual const char* Name() const override {
+      return "rocksdb.NewComparator";
+    }
+    virtual int Compare(const Slice& a, const Slice& b) const override {
+      return BytewiseComparator()->Compare(a, b);
+    }
+    virtual void FindShortestSeparator(std::string* s,
+                                       const Slice& l) const override {
+      BytewiseComparator()->FindShortestSeparator(s, l);
+    }
+    virtual void FindShortSuccessor(std::string* key) const override {
+      BytewiseComparator()->FindShortSuccessor(key);
+    }
+  };
+  Options options_;
+};
+
+class SanityTestZlibCompression : public SanityTest {
+ public:
+  explicit SanityTestZlibCompression(const std::string& path)
+      : SanityTest(path) {
+    options_.compression = kZlibCompression;
+  }
+  virtual Options GetOptions() const override { return options_; }
+  virtual std::string Name() const override { return "ZlibCompression"; }
+
+ private:
+  Options options_;
+};
+
+class SanityTestZlibCompressionVersion2 : public SanityTest {
+ public:
+  explicit SanityTestZlibCompressionVersion2(const std::string& path)
+      : SanityTest(path) {
+    options_.compression = kZlibCompression;
+    BlockBasedTableOptions table_options;
+#if ROCKSDB_MAJOR > 3 || (ROCKSDB_MAJOR == 3 && ROCKSDB_MINOR >= 10)
+    table_options.format_version = 2;
+#endif
+    options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  }
+  virtual Options GetOptions() const override { return options_; }
+  virtual std::string Name() const override {
+    return "ZlibCompressionVersion2";
+  }
+
+ private:
+  Options options_;
+};
+
+class SanityTestLZ4Compression : public SanityTest {
+ public:
+  explicit SanityTestLZ4Compression(const std::string& path)
+      : SanityTest(path) {
+    options_.compression = kLZ4Compression;
+  }
+  virtual Options GetOptions() const override { return options_; }
+  virtual std::string Name() const override { return "LZ4Compression"; }
+
+ private:
+  Options options_;
+};
+
+class SanityTestLZ4HCCompression : public SanityTest {
+ public:
+  explicit SanityTestLZ4HCCompression(const std::string& path)
+      : SanityTest(path) {
+    options_.compression = kLZ4HCCompression;
+  }
+  virtual Options GetOptions() const override { return options_; }
+  virtual std::string Name() const override { return "LZ4HCCompression"; }
+
+ private:
+  Options options_;
+};
+
+class SanityTestZSTDCompression : public SanityTest {
+ public:
+  explicit SanityTestZSTDCompression(const std::string& path)
+      : SanityTest(path) {
+    options_.compression = kZSTDNotFinalCompression;
+  }
+  virtual Options GetOptions() const override { return options_; }
+  virtual std::string Name() const override { return "ZSTDCompression"; }
+
+ private:
+  Options options_;
+};
+
+#ifndef ROCKSDB_LITE
+class SanityTestPlainTableFactory : public SanityTest {
+ public:
+  explicit SanityTestPlainTableFactory(const std::string& path)
+      : SanityTest(path) {
+    options_.table_factory.reset(NewPlainTableFactory());
+    options_.prefix_extractor.reset(NewFixedPrefixTransform(2));
+    options_.allow_mmap_reads = true;
+  }
+  ~SanityTestPlainTableFactory() {}
+  virtual Options GetOptions() const override { return options_; }
+  virtual std::string Name() const override { return "PlainTable"; }
+
+ private:
+  Options options_;
+};
+#endif  // ROCKSDB_LITE
+
+class SanityTestBloomFilter : public SanityTest {
+ public:
+  explicit SanityTestBloomFilter(const std::string& path) : SanityTest(path) {
+    BlockBasedTableOptions table_options;
+    table_options.filter_policy.reset(NewBloomFilterPolicy(10));
+    options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  }
+  ~SanityTestBloomFilter() {}
+  virtual Options GetOptions() const override { return options_; }
+  virtual std::string Name() const override { return "BloomFilter"; }
+
+ private:
+  Options options_;
+};
+
+namespace {
+bool RunSanityTests(const std::string& command, const std::string& path) {
+  std::vector<SanityTest*> sanity_tests = {
+      new SanityTestBasic(path),
+      new SanityTestSpecialComparator(path),
+      new SanityTestZlibCompression(path),
+      new SanityTestZlibCompressionVersion2(path),
+      new SanityTestLZ4Compression(path),
+      new SanityTestLZ4HCCompression(path),
+      new SanityTestZSTDCompression(path),
+#ifndef ROCKSDB_LITE
+      new SanityTestPlainTableFactory(path),
+#endif  // ROCKSDB_LITE
+      new SanityTestBloomFilter(path)};
+
+  if (command == "create") {
+    fprintf(stderr, "Creating...\n");
+  } else {
+    fprintf(stderr, "Verifying...\n");
+  }
+  bool result = true;
+  for (auto sanity_test : sanity_tests) {
+    Status s;
+    fprintf(stderr, "%s -- ", sanity_test->Name().c_str());
+    if (command == "create") {
+      s = sanity_test->Create();
+    } else {
+      assert(command == "verify");
+      s = sanity_test->Verify();
+    }
+    fprintf(stderr, "%s\n", s.ToString().c_str());
+    if (!s.ok()) {
+      fprintf(stderr, "FAIL\n");
+      result = false;
+    }
+
+    delete sanity_test;
+  }
+  return result;
+}
+}  // namespace
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  std::string path, command;
+  bool ok = (argc == 3);
+  if (ok) {
+    path = std::string(argv[1]);
+    command = std::string(argv[2]);
+    ok = (command == "create" || command == "verify");
+  }
+  if (!ok) {
+    fprintf(stderr, "Usage: %s <path> [create|verify] \n", argv[0]);
+    exit(1);
+  }
+  if (path.back() != '/') {
+    path += "/";
+  }
+
+  bool sanity_ok = rocksdb::RunSanityTests(command, path);
+
+  return sanity_ok ? 0 : 1;
+}
diff --git a/src/rocksdb/tools/db_stress.cc b/src/rocksdb/tools/db_stress.cc
new file mode 100644
index 0000000..634045d
--- /dev/null
+++ b/src/rocksdb/tools/db_stress.cc
@@ -0,0 +1,2197 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// The test uses an array to compare against values written to the database.
+// Keys written to the array are in 1:1 correspondence to the actual values in
+// the database according to the formula in the function GenerateValue.
+
+// Space is reserved in the array from 0 to FLAGS_max_key and values are
+// randomly written/deleted/read from those positions. During verification we
+// compare all the positions in the array. To shorten or lengthen the
+// running time, you can change the settings FLAGS_max_key and
+// FLAGS_ops_per_thread (and sometimes also FLAGS_threads).
+//
+// NOTE that if FLAGS_test_batches_snapshots is set, the test will have
+// different behavior. See comment of the flag for details.
+
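+// A hypothetical invocation (values are illustrative; every flag used
+// here is defined below):
+//   ./db_stress --max_key=100000 --threads=8 --ops_per_thread=10000
+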
+#ifndef GFLAGS
+#include <cstdio>
+int main() {
+  fprintf(stderr, "Please install gflags to run rocksdb tools\n");
+  return 1;
+}
+#else
+
+#define __STDC_FORMAT_MACROS
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <chrono>
+#include <exception>
+#include <thread>
+
+#include <gflags/gflags.h>
+#include "db/db_impl.h"
+#include "db/version_set.h"
+#include "hdfs/env_hdfs.h"
+#include "port/port.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/env.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/utilities/db_ttl.h"
+#include "rocksdb/write_batch.h"
+#include "util/coding.h"
+#include "util/compression.h"
+#include "util/crc32c.h"
+#include "util/histogram.h"
+#include "util/logging.h"
+#include "util/mutexlock.h"
+#include "util/random.h"
+#include "util/string_util.h"
+#include "util/testutil.h"
+#include "utilities/merge_operators.h"
+
+using GFLAGS::ParseCommandLineFlags;
+using GFLAGS::RegisterFlagValidator;
+using GFLAGS::SetUsageMessage;
+
+static const long KB = 1024;
+
+static bool ValidateUint32Range(const char* flagname, uint64_t value) {
+  if (value > std::numeric_limits<uint32_t>::max()) {
+    fprintf(stderr,
+            "Invalid value for --%s: %lu, overflow\n",
+            flagname,
+            (unsigned long)value);
+    return false;
+  }
+  return true;
+}
+
+DEFINE_uint64(seed, 2341234, "Seed for PRNG");
+static const bool FLAGS_seed_dummy __attribute__((unused)) =
+    RegisterFlagValidator(&FLAGS_seed, &ValidateUint32Range);
+
+DEFINE_int64(max_key, 1 * KB * KB,
+             "Max number of key/values to place in database");
+
+DEFINE_int32(column_families, 10, "Number of column families");
+
+// TODO(noetzli) Add support for single deletes
+DEFINE_bool(test_batches_snapshots, false,
+            "If set, the test uses MultiGet(), MultiPut() and MultiDelete()"
+            " which read/write/delete multiple keys in a batch. In this mode,"
+            " we do not verify db content by comparing the content with the "
+            "pre-allocated array. Instead, we do partial verification inside"
+            " MultiGet() by checking various values in a batch. Benefit of"
+            " this mode:\n"
+            "\t(a) No need to acquire mutexes during writes (less cache "
+            "flushes in multi-core leading to speed up)\n"
+            "\t(b) No long validation at the end (more speed up)\n"
+            "\t(c) Test snapshot and atomicity of batch writes");
+
+DEFINE_int32(threads, 32, "Number of concurrent threads to run.");
+
+DEFINE_int32(ttl, -1,
+             "Opens the db with this ttl value if this is not -1. "
+             "Carefully specify a large value such that verifications on "
+             "deleted values don't fail");
+
+DEFINE_int32(value_size_mult, 8,
+             "Size of value will be this number times rand_int(1,3) bytes");
+
+DEFINE_bool(verify_before_write, false, "Verify before write");
+
+DEFINE_bool(histogram, false, "Print histogram of operation timings");
+
+DEFINE_bool(destroy_db_initially, true,
+            "Destroys the database dir before start if this is true");
+
+DEFINE_bool(verbose, false, "Verbose");
+
+DEFINE_bool(progress_reports, true,
+            "If true, db_stress will report number of finished operations");
+
+DEFINE_uint64(db_write_buffer_size, rocksdb::Options().db_write_buffer_size,
+              "Number of bytes to buffer in all memtables before compacting");
+
+DEFINE_int32(write_buffer_size,
+             static_cast<int32_t>(rocksdb::Options().write_buffer_size),
+             "Number of bytes to buffer in memtable before compacting");
+
+DEFINE_int32(max_write_buffer_number,
+             rocksdb::Options().max_write_buffer_number,
+             "The number of in-memory memtables. "
+             "Each memtable is of size FLAGS_write_buffer_size.");
+
+DEFINE_int32(min_write_buffer_number_to_merge,
+             rocksdb::Options().min_write_buffer_number_to_merge,
+             "The minimum number of write buffers that will be merged together "
+             "before writing to storage. This is cheap because it is an "
+             "in-memory merge. If this feature is not enabled, then all these "
+             "write buffers are flushed to L0 as separate files and this "
+             "increases read amplification because a get request has to check "
+             "in all of these files. Also, an in-memory merge may result in "
+             "writing less data to storage if there are duplicate records in"
+             " each of these individual write buffers.");
+
+DEFINE_int32(max_write_buffer_number_to_maintain,
+             rocksdb::Options().max_write_buffer_number_to_maintain,
+             "The total maximum number of write buffers to maintain in memory "
+             "including copies of buffers that have already been flushed. "
+             "Unlike max_write_buffer_number, this parameter does not affect "
+             "flushing. This controls the minimum amount of write history "
+             "that will be available in memory for conflict checking when "
+             "Transactions are used. If this value is too low, some "
+             "transactions may fail at commit time due to not being able to "
+             "determine whether there were any write conflicts. Setting this "
+             "value to 0 will cause write buffers to be freed immediately "
+             "after they are flushed.  If this value is set to -1, "
+             "'max_write_buffer_number' will be used.");
+
+DEFINE_int32(open_files, rocksdb::Options().max_open_files,
+             "Maximum number of files to keep open at the same time "
+             "(use default if == 0)");
+
+DEFINE_int64(compressed_cache_size, -1,
+             "Number of bytes to use as a cache of compressed data."
+             " Negative means use default settings.");
+
+DEFINE_int32(compaction_style, rocksdb::Options().compaction_style, "");
+
+DEFINE_int32(level0_file_num_compaction_trigger,
+             rocksdb::Options().level0_file_num_compaction_trigger,
+             "Level0 compaction start trigger");
+
+DEFINE_int32(level0_slowdown_writes_trigger,
+             rocksdb::Options().level0_slowdown_writes_trigger,
+             "Number of files in level-0 that will slow down writes");
+
+DEFINE_int32(level0_stop_writes_trigger,
+             rocksdb::Options().level0_stop_writes_trigger,
+             "Number of files in level-0 that will trigger put stop.");
+
+DEFINE_int32(block_size,
+             static_cast<int32_t>(rocksdb::BlockBasedTableOptions().block_size),
+             "Number of bytes in a block.");
+
+DEFINE_int32(max_background_compactions,
+             rocksdb::Options().max_background_compactions,
+             "The maximum number of concurrent background compactions "
+             "that can occur in parallel.");
+
+DEFINE_int32(compaction_thread_pool_adjust_interval, 0,
+             "The interval (in milliseconds) at which to adjust the "
+             "compaction thread pool size. If 0, the pool size is never "
+             "adjusted.");
+
+DEFINE_int32(compaction_thread_pool_variations, 2,
+             "Range of background thread pool size variations when adjusted "
+             "periodically.");
+
+DEFINE_int32(max_background_flushes, rocksdb::Options().max_background_flushes,
+             "The maximum number of concurrent background flushes "
+             "that can occur in parallel.");
+
+DEFINE_int32(universal_size_ratio, 0, "The ratio of file sizes that trigger"
+             " compaction in universal style");
+
+DEFINE_int32(universal_min_merge_width, 0, "The minimum number of files to "
+             "compact in universal style compaction");
+
+DEFINE_int32(universal_max_merge_width, 0, "The max number of files to compact"
+             " in universal style compaction");
+
+DEFINE_int32(universal_max_size_amplification_percent, 0,
+             "The max size amplification for universal style compaction");
+
+DEFINE_int32(clear_column_family_one_in, 1000000,
+             "With a chance of 1/N, delete a column family and then recreate "
+             "it again. If N == 0, never drop/create column families. "
+             "When test_batches_snapshots is true, this flag has no effect");
+
+DEFINE_int32(set_options_one_in, 0,
+             "With a chance of 1/N, change some random options");
+
+DEFINE_int32(set_in_place_one_in, 0,
+             "With a chance of 1/N, toggle in place support option");
+
+DEFINE_int64(cache_size, 2LL * KB * KB * KB,
+             "Number of bytes to use as a cache of uncompressed data.");
+
+DEFINE_uint64(subcompactions, 1,
+             "Maximum number of subcompactions to divide L0-L1 compactions "
+             "into.");
+static const bool FLAGS_subcompactions_dummy __attribute__((unused)) =
+    RegisterFlagValidator(&FLAGS_subcompactions, &ValidateUint32Range);
+
+static bool ValidateInt32Positive(const char* flagname, int32_t value) {
+  if (value < 0) {
+    fprintf(stderr, "Invalid value for --%s: %d, must be >=0\n",
+            flagname, value);
+    return false;
+  }
+  return true;
+}
+DEFINE_int32(reopen, 10, "Number of times database reopens");
+static const bool FLAGS_reopen_dummy __attribute__((unused)) =
+    RegisterFlagValidator(&FLAGS_reopen, &ValidateInt32Positive);
+
+DEFINE_int32(bloom_bits, 10, "Bloom filter bits per key. "
+             "Negative means use default settings.");
+
+DEFINE_bool(use_block_based_filter, false, "use block based filter "
+            "instead of full filter for block based table");
+
+DEFINE_string(db, "", "Use the db with the following name.");
+
+DEFINE_bool(verify_checksum, false,
+            "Verify checksum for every block read from storage");
+
+DEFINE_bool(mmap_read, rocksdb::EnvOptions().use_mmap_reads,
+            "Allow reads to occur via mmap-ing files");
+
+// Database statistics
+static std::shared_ptr<rocksdb::Statistics> dbstats;
+DEFINE_bool(statistics, false, "Create database statistics");
+
+DEFINE_bool(sync, false, "Sync all writes to disk");
+
+DEFINE_bool(disable_data_sync, false,
+            "If true, do not wait until data is synced to disk.");
+
+DEFINE_bool(use_fsync, false, "If true, issue fsync instead of fdatasync");
+
+DEFINE_int32(kill_random_test, 0,
+             "If non-zero, kill at various points in source code with "
+             "probability 1/this");
+static const bool FLAGS_kill_random_test_dummy __attribute__((unused)) =
+    RegisterFlagValidator(&FLAGS_kill_random_test, &ValidateInt32Positive);
+extern int rocksdb_kill_odds;
+
+DEFINE_bool(disable_wal, false, "If true, do not write WAL for writes.");
+
+DEFINE_int32(target_file_size_base, 64 * KB,
+             "Target level-1 file size for compaction");
+
+DEFINE_int32(target_file_size_multiplier, 1,
+             "A multiplier to compute target level-N file size (N >= 2)");
+
+DEFINE_uint64(max_bytes_for_level_base, 256 * KB, "Max bytes for level-1");
+
+DEFINE_int32(max_bytes_for_level_multiplier, 2,
+             "A multiplier to compute max bytes for level-N (N >= 2)");
+
+static bool ValidateInt32Percent(const char* flagname, int32_t value) {
+  if (value < 0 || value>100) {
+    fprintf(stderr, "Invalid value for --%s: %d, 0<= pct <=100 \n",
+            flagname, value);
+    return false;
+  }
+  return true;
+}
+DEFINE_int32(readpercent, 10,
+             "Ratio of reads to total workload (expressed as a percentage)");
+static const bool FLAGS_readpercent_dummy __attribute__((unused)) =
+    RegisterFlagValidator(&FLAGS_readpercent, &ValidateInt32Percent);
+
+DEFINE_int32(prefixpercent, 20,
+             "Ratio of prefix iterators to total workload (expressed as a"
+             " percentage)");
+static const bool FLAGS_prefixpercent_dummy __attribute__((unused)) =
+    RegisterFlagValidator(&FLAGS_prefixpercent, &ValidateInt32Percent);
+
+DEFINE_int32(writepercent, 45,
+             "Ratio of writes to total workload (expressed as a percentage)");
+static const bool FLAGS_writepercent_dummy __attribute__((unused)) =
+    RegisterFlagValidator(&FLAGS_writepercent, &ValidateInt32Percent);
+
+DEFINE_int32(delpercent, 15,
+             "Ratio of deletes to total workload (expressed as a percentage)");
+static const bool FLAGS_delpercent_dummy __attribute__((unused)) =
+    RegisterFlagValidator(&FLAGS_delpercent, &ValidateInt32Percent);
+
+DEFINE_int32(nooverwritepercent, 60,
+             "Ratio of keys without overwrite to total workload (expressed"
+             " as a percentage)");
+static const bool FLAGS_nooverwritepercent_dummy __attribute__((__unused__)) =
+    RegisterFlagValidator(&FLAGS_nooverwritepercent, &ValidateInt32Percent);
+
+DEFINE_int32(iterpercent, 10, "Ratio of iterations to total workload"
+             " (expressed as a percentage)");
+static const bool FLAGS_iterpercent_dummy __attribute__((unused)) =
+    RegisterFlagValidator(&FLAGS_iterpercent, &ValidateInt32Percent);
+
+DEFINE_uint64(num_iterations, 10, "Number of iterations per MultiIterate run");
+static const bool FLAGS_num_iterations_dummy __attribute__((unused)) =
+    RegisterFlagValidator(&FLAGS_num_iterations, &ValidateUint32Range);
+
+namespace {
+enum rocksdb::CompressionType StringToCompressionType(const char* ctype) {
+  assert(ctype);
+
+  if (!strcasecmp(ctype, "none"))
+    return rocksdb::kNoCompression;
+  else if (!strcasecmp(ctype, "snappy"))
+    return rocksdb::kSnappyCompression;
+  else if (!strcasecmp(ctype, "zlib"))
+    return rocksdb::kZlibCompression;
+  else if (!strcasecmp(ctype, "bzip2"))
+    return rocksdb::kBZip2Compression;
+  else if (!strcasecmp(ctype, "lz4"))
+    return rocksdb::kLZ4Compression;
+  else if (!strcasecmp(ctype, "lz4hc"))
+    return rocksdb::kLZ4HCCompression;
+  else if (!strcasecmp(ctype, "zstd"))
+    return rocksdb::kZSTDNotFinalCompression;
+
+  fprintf(stdout, "Cannot parse compression type '%s'\n", ctype);
+  return rocksdb::kSnappyCompression;  // default value
+}
+}  // namespace
+
+DEFINE_string(compression_type, "snappy",
+              "Algorithm to use to compress the database");
+static enum rocksdb::CompressionType FLAGS_compression_type_e =
+    rocksdb::kSnappyCompression;
+
+DEFINE_string(hdfs, "", "Name of hdfs environment");
+// posix or hdfs environment
+static rocksdb::Env* FLAGS_env = rocksdb::Env::Default();
+
+DEFINE_uint64(ops_per_thread, 1200000, "Number of operations per thread.");
+static const bool FLAGS_ops_per_thread_dummy __attribute__((unused)) =
+    RegisterFlagValidator(&FLAGS_ops_per_thread, &ValidateUint32Range);
+
+DEFINE_uint64(log2_keys_per_lock, 2, "Log2 of number of keys per lock");
+static const bool FLAGS_log2_keys_per_lock_dummy __attribute__((unused)) =
+    RegisterFlagValidator(&FLAGS_log2_keys_per_lock, &ValidateUint32Range);
+
+DEFINE_bool(filter_deletes, false, "On true, deletes use KeyMayExist to drop"
+            " the delete if the key is not present");
+
+DEFINE_bool(in_place_update, false, "On true, does inplace update in memtable");
+
+enum RepFactory {
+  kSkipList,
+  kHashSkipList,
+  kVectorRep
+};
+
+namespace {
+enum RepFactory StringToRepFactory(const char* ctype) {
+  assert(ctype);
+
+  if (!strcasecmp(ctype, "skip_list"))
+    return kSkipList;
+  else if (!strcasecmp(ctype, "prefix_hash"))
+    return kHashSkipList;
+  else if (!strcasecmp(ctype, "vector"))
+    return kVectorRep;
+
+  fprintf(stdout, "Cannot parse memreptable %s\n", ctype);
+  return kSkipList;
+}
+}  // namespace
+
+static enum RepFactory FLAGS_rep_factory;
+DEFINE_string(memtablerep, "prefix_hash", "");
+
+static bool ValidatePrefixSize(const char* flagname, int32_t value) {
+  if (value < 0 || value > 8) {
+    fprintf(stderr, "Invalid value for --%s: %d. 0 <= PrefixSize <= 8\n",
+            flagname, value);
+    return false;
+  }
+  return true;
+}
+DEFINE_int32(prefix_size, 7, "Control the prefix size for HashSkipListRep");
+static const bool FLAGS_prefix_size_dummy __attribute__((unused)) =
+    RegisterFlagValidator(&FLAGS_prefix_size, &ValidatePrefixSize);
+
+DEFINE_bool(use_merge, false, "On true, replaces all writes with a Merge "
+            "that behaves like a Put");
+
+
+namespace rocksdb {
+
+// convert long to a big-endian slice key
+static std::string Key(long val) {
+  std::string little_endian_key;
+  std::string big_endian_key;
+  PutFixed64(&little_endian_key, val);
+  assert(little_endian_key.size() == sizeof(val));
+  big_endian_key.resize(sizeof(val));
+  for (int i=0; i<(int)sizeof(val); i++) {
+    big_endian_key[i] = little_endian_key[sizeof(val) - 1 - i];
+  }
+  return big_endian_key;
+}
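+// Note: PutFixed64 produces a little-endian encoding, so the byte
+// reversal above yields a big-endian key; e.g. Key(1) maps to bytes
+// 00 00 00 00 00 00 00 01, making keys sort numerically under the
+// default bytewise comparator.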
+
+static std::string StringToHex(const std::string& str) {
+  std::string result = "0x";
+  char buf[10];
+  for (size_t i = 0; i < str.length(); i++) {
+    snprintf(buf, 10, "%02X", (unsigned char)str[i]);
+    result += buf;
+  }
+  return result;
+}
+
+
+class StressTest;
+namespace {
+
+class Stats {
+ private:
+  double start_;
+  double finish_;
+  double seconds_;
+  long done_;
+  long gets_;
+  long prefixes_;
+  long writes_;
+  long deletes_;
+  size_t single_deletes_;
+  long iterator_size_sums_;
+  long founds_;
+  long iterations_;
+  long errors_;
+  int next_report_;
+  size_t bytes_;
+  double last_op_finish_;
+  HistogramImpl hist_;
+
+ public:
+  Stats() { }
+
+  void Start() {
+    next_report_ = 100;
+    hist_.Clear();
+    done_ = 0;
+    gets_ = 0;
+    prefixes_ = 0;
+    writes_ = 0;
+    deletes_ = 0;
+    single_deletes_ = 0;
+    iterator_size_sums_ = 0;
+    founds_ = 0;
+    iterations_ = 0;
+    errors_ = 0;
+    bytes_ = 0;
+    seconds_ = 0;
+    start_ = FLAGS_env->NowMicros();
+    last_op_finish_ = start_;
+    finish_ = start_;
+  }
+
+  void Merge(const Stats& other) {
+    hist_.Merge(other.hist_);
+    done_ += other.done_;
+    gets_ += other.gets_;
+    prefixes_ += other.prefixes_;
+    writes_ += other.writes_;
+    deletes_ += other.deletes_;
+    single_deletes_ += other.single_deletes_;
+    iterator_size_sums_ += other.iterator_size_sums_;
+    founds_ += other.founds_;
+    iterations_ += other.iterations_;
+    errors_ += other.errors_;
+    bytes_ += other.bytes_;
+    seconds_ += other.seconds_;
+    if (other.start_ < start_) start_ = other.start_;
+    if (other.finish_ > finish_) finish_ = other.finish_;
+  }
+
+  void Stop() {
+    finish_ = FLAGS_env->NowMicros();
+    seconds_ = (finish_ - start_) * 1e-6;
+  }
+
+  void FinishedSingleOp() {
+    if (FLAGS_histogram) {
+      double now = FLAGS_env->NowMicros();
+      double micros = now - last_op_finish_;
+      hist_.Add(micros);
+      if (micros > 20000) {
+        fprintf(stdout, "long op: %.1f micros%30s\r", micros, "");
+      }
+      last_op_finish_ = now;
+    }
+
+    done_++;
+    if (FLAGS_progress_reports) {
+      if (done_ >= next_report_) {
+        if      (next_report_ < 1000)   next_report_ += 100;
+        else if (next_report_ < 5000)   next_report_ += 500;
+        else if (next_report_ < 10000)  next_report_ += 1000;
+        else if (next_report_ < 50000)  next_report_ += 5000;
+        else if (next_report_ < 100000) next_report_ += 10000;
+        else if (next_report_ < 500000) next_report_ += 50000;
+        else                            next_report_ += 100000;
+        fprintf(stdout, "... finished %ld ops%30s\r", done_, "");
+      }
+    }
+  }
+
+  void AddBytesForWrites(int nwrites, size_t nbytes) {
+    writes_ += nwrites;
+    bytes_ += nbytes;
+  }
+
+  void AddGets(int ngets, int nfounds) {
+    founds_ += nfounds;
+    gets_ += ngets;
+  }
+
+  void AddPrefixes(int nprefixes, int count) {
+    prefixes_ += nprefixes;
+    iterator_size_sums_ += count;
+  }
+
+  void AddIterations(int n) {
+    iterations_ += n;
+  }
+
+  void AddDeletes(int n) {
+    deletes_ += n;
+  }
+
+  void AddSingleDeletes(size_t n) { single_deletes_ += n; }
+
+  void AddErrors(int n) {
+    errors_ += n;
+  }
+
+  void Report(const char* name) {
+    std::string extra;
+    if (bytes_ < 1 || done_ < 1) {
+      fprintf(stderr, "No writes or ops?\n");
+      return;
+    }
+
+    double elapsed = (finish_ - start_) * 1e-6;
+    double bytes_mb = bytes_ / 1048576.0;
+    double rate = bytes_mb / elapsed;
+    double throughput = (double)done_/elapsed;
+
+    fprintf(stdout, "%-12s: ", name);
+    fprintf(stdout, "%.3f micros/op %ld ops/sec\n",
+            seconds_ * 1e6 / done_, (long)throughput);
+    fprintf(stdout, "%-12s: Wrote %.2f MB (%.2f MB/sec) (%ld%% of %ld ops)\n",
+            "", bytes_mb, rate, (100*writes_)/done_, done_);
+    fprintf(stdout, "%-12s: Wrote %ld times\n", "", writes_);
+    fprintf(stdout, "%-12s: Deleted %ld times\n", "", deletes_);
+    fprintf(stdout, "%-12s: Single deleted %ld times\n", "", single_deletes_);
+    fprintf(stdout, "%-12s: %ld read and %ld found the key\n", "",
+            gets_, founds_);
+    fprintf(stdout, "%-12s: Prefix scanned %ld times\n", "", prefixes_);
+    fprintf(stdout, "%-12s: Iterator size sum is %ld\n", "",
+            iterator_size_sums_);
+    fprintf(stdout, "%-12s: Iterated %ld times\n", "", iterations_);
+    fprintf(stdout, "%-12s: Got errors %ld times\n", "", errors_);
+
+    if (FLAGS_histogram) {
+      fprintf(stdout, "Microseconds per op:\n%s\n", hist_.ToString().c_str());
+    }
+    fflush(stdout);
+  }
+};
+
+// State shared by all concurrent executions of the same benchmark.
+class SharedState {
+ public:
+  static const uint32_t SENTINEL;
+
+  explicit SharedState(StressTest* stress_test)
+      : cv_(&mu_),
+        seed_(static_cast<uint32_t>(FLAGS_seed)),
+        max_key_(FLAGS_max_key),
+        log2_keys_per_lock_(static_cast<uint32_t>(FLAGS_log2_keys_per_lock)),
+        num_threads_(FLAGS_threads),
+        num_initialized_(0),
+        num_populated_(0),
+        vote_reopen_(0),
+        num_done_(0),
+        start_(false),
+        start_verify_(false),
+        should_stop_bg_thread_(false),
+        bg_thread_finished_(false),
+        stress_test_(stress_test),
+        verification_failure_(false),
+        no_overwrite_ids_(FLAGS_column_families) {
+    // Pick random keys in each column family that will not experience
+    // overwrite
+
+    printf("Choosing random keys with no overwrite\n");
+    Random rnd(seed_);
+    size_t num_no_overwrite_keys = (max_key_ * FLAGS_nooverwritepercent) / 100;
+    for (auto& cf_ids : no_overwrite_ids_) {
+      for (size_t i = 0; i < num_no_overwrite_keys; i++) {
+        size_t rand_key;
+        do {
+          rand_key = rnd.Next() % max_key_;
+        } while (cf_ids.find(rand_key) != cf_ids.end());
+        cf_ids.insert(rand_key);
+      }
+      assert(cf_ids.size() == num_no_overwrite_keys);
+    }
+
+    if (FLAGS_test_batches_snapshots) {
+      fprintf(stdout, "No lock creation because test_batches_snapshots set\n");
+      return;
+    }
+    values_.resize(FLAGS_column_families);
+
+    for (int i = 0; i < FLAGS_column_families; ++i) {
+      values_[i] = std::vector<uint32_t>(max_key_, SENTINEL);
+    }
+
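+    // Lock striping: one mutex guards a contiguous run of
+    // 2^log2_keys_per_lock_ keys (see GetMutexForKey); the rounding
+    // below adds one lock for any partial final stripe.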
+    long num_locks = (max_key_ >> log2_keys_per_lock_);
+    if (max_key_ & ((1 << log2_keys_per_lock_) - 1)) {
+      num_locks++;
+    }
+    fprintf(stdout, "Creating %ld locks\n", num_locks * FLAGS_column_families);
+    key_locks_.resize(FLAGS_column_families);
+
+    for (int i = 0; i < FLAGS_column_families; ++i) {
+      key_locks_[i].resize(num_locks);
+      for (auto& ptr : key_locks_[i]) {
+        ptr.reset(new port::Mutex);
+      }
+    }
+  }
+
+  ~SharedState() {}
+
+  port::Mutex* GetMutex() {
+    return &mu_;
+  }
+
+  port::CondVar* GetCondVar() {
+    return &cv_;
+  }
+
+  StressTest* GetStressTest() const {
+    return stress_test_;
+  }
+
+  long GetMaxKey() const {
+    return max_key_;
+  }
+
+  uint32_t GetNumThreads() const {
+    return num_threads_;
+  }
+
+  void IncInitialized() {
+    num_initialized_++;
+  }
+
+  void IncOperated() {
+    num_populated_++;
+  }
+
+  void IncDone() {
+    num_done_++;
+  }
+
+  void IncVotedReopen() {
+    vote_reopen_ = (vote_reopen_ + 1) % num_threads_;
+  }
+
+  bool AllInitialized() const {
+    return num_initialized_ >= num_threads_;
+  }
+
+  bool AllOperated() const {
+    return num_populated_ >= num_threads_;
+  }
+
+  bool AllDone() const {
+    return num_done_ >= num_threads_;
+  }
+
+  bool AllVotedReopen() {
+    return (vote_reopen_ == 0);
+  }
+
+  void SetStart() {
+    start_ = true;
+  }
+
+  void SetStartVerify() {
+    start_verify_ = true;
+  }
+
+  bool Started() const {
+    return start_;
+  }
+
+  bool VerifyStarted() const {
+    return start_verify_;
+  }
+
+  void SetVerificationFailure() { verification_failure_.store(true); }
+
+  bool HasVerificationFailedYet() { return verification_failure_.load(); }
+
+  port::Mutex* GetMutexForKey(int cf, long key) {
+    return key_locks_[cf][key >> log2_keys_per_lock_].get();
+  }
+
+  void LockColumnFamily(int cf) {
+    for (auto& mutex : key_locks_[cf]) {
+      mutex->Lock();
+    }
+  }
+
+  void UnlockColumnFamily(int cf) {
+    for (auto& mutex : key_locks_[cf]) {
+      mutex->Unlock();
+    }
+  }
+
+  void ClearColumnFamily(int cf) {
+    std::fill(values_[cf].begin(), values_[cf].end(), SENTINEL);
+  }
+
+  void Put(int cf, long key, uint32_t value_base) {
+    values_[cf][key] = value_base;
+  }
+
+  uint32_t Get(int cf, long key) const { return values_[cf][key]; }
+
+  void Delete(int cf, long key) { values_[cf][key] = SENTINEL; }
+
+  void SingleDelete(int cf, size_t key) { values_[cf][key] = SENTINEL; }
+
+  bool AllowsOverwrite(int cf, size_t key) {
+    return no_overwrite_ids_[cf].find(key) == no_overwrite_ids_[cf].end();
+  }
+
+  bool Exists(int cf, size_t key) { return values_[cf][key] != SENTINEL; }
+
+  uint32_t GetSeed() const { return seed_; }
+
+  void SetShouldStopBgThread() { should_stop_bg_thread_ = true; }
+
+  bool ShoudStopBgThread() { return should_stop_bg_thread_; }
+
+  void SetBgThreadFinish() { bg_thread_finished_ = true; }
+
+  bool BgThreadFinished() const { return bg_thread_finished_; }
+
+ private:
+  port::Mutex mu_;
+  port::CondVar cv_;
+  const uint32_t seed_;
+  const long max_key_;
+  const uint32_t log2_keys_per_lock_;
+  const int num_threads_;
+  long num_initialized_;
+  long num_populated_;
+  long vote_reopen_;
+  long num_done_;
+  bool start_;
+  bool start_verify_;
+  bool should_stop_bg_thread_;
+  bool bg_thread_finished_;
+  StressTest* stress_test_;
+  std::atomic<bool> verification_failure_;
+
+  // Keys that should not be overwritten
+  std::vector<std::set<size_t> > no_overwrite_ids_;
+
+  std::vector<std::vector<uint32_t>> values_;
+  // Each mutex is owned through a unique_ptr because port::Mutex is not
+  // copyable, and storing it directly in the container may require copying
+  // depending on the implementation.
+  std::vector<std::vector<std::unique_ptr<port::Mutex> > > key_locks_;
+};
+
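+// SENTINEL marks "no value" in the per-column-family shadow arrays:
+// Delete()/SingleDelete() store it and Exists() tests against it.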
+const uint32_t SharedState::SENTINEL = 0xffffffff;
+
+// Per-thread state for concurrent executions of the same benchmark.
+struct ThreadState {
+  uint32_t tid; // 0..n-1
+  Random rand;  // Has different seeds for different threads
+  SharedState* shared;
+  Stats stats;
+
+  ThreadState(uint32_t index, SharedState* _shared)
+      : tid(index), rand(1000 + index + _shared->GetSeed()), shared(_shared) {}
+};
+
+class DbStressListener : public EventListener {
+ public:
+  DbStressListener(
+      const std::string& db_name,
+      const std::vector<DbPath>& db_paths) :
+      db_name_(db_name),
+      db_paths_(db_paths),
+      rand_(301) {}
+  virtual ~DbStressListener() {}
+#ifndef ROCKSDB_LITE
+  virtual void OnFlushCompleted(
+      DB* db, const FlushJobInfo& info) override {
+    assert(db);
+    assert(db->GetName() == db_name_);
+    assert(IsValidColumnFamilyName(info.cf_name));
+    VerifyFilePath(info.file_path);
+    // pretend to do some work here
+    std::this_thread::sleep_for(
+        std::chrono::microseconds(rand_.Uniform(5000)));
+  }
+
+  virtual void OnCompactionCompleted(
+      DB *db, const CompactionJobInfo& ci) override {
+    assert(db);
+    assert(db->GetName() == db_name_);
+    assert(IsValidColumnFamilyName(ci.cf_name));
+    assert(ci.input_files.size() + ci.output_files.size() > 0U);
+    for (const auto& file_path : ci.input_files) {
+      VerifyFilePath(file_path);
+    }
+    for (const auto& file_path : ci.output_files) {
+      VerifyFilePath(file_path);
+    }
+    // pretend to do some work here
+    std::this_thread::sleep_for(
+        std::chrono::microseconds(rand_.Uniform(5000)));
+  }
+
+  virtual void OnTableFileCreated(
+      const TableFileCreationInfo& info) override {
+    assert(info.db_name == db_name_);
+    assert(IsValidColumnFamilyName(info.cf_name));
+    VerifyFilePath(info.file_path);
+    assert(info.file_size > 0);
+    assert(info.job_id > 0);
+    assert(info.table_properties.data_size > 0);
+    assert(info.table_properties.raw_key_size > 0);
+    assert(info.table_properties.num_entries > 0);
+  }
+
+ protected:
+  bool IsValidColumnFamilyName(const std::string& cf_name) const {
+    if (cf_name == kDefaultColumnFamilyName) {
+      return true;
+    }
+    // The column family names in the stress tests are numbers.
+    for (size_t i = 0; i < cf_name.size(); ++i) {
+      if (cf_name[i] < '0' || cf_name[i] > '9') {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  void VerifyFileDir(const std::string& file_dir) {
+#ifndef NDEBUG
+    if (db_name_ == file_dir) {
+      return;
+    }
+    for (const auto& db_path : db_paths_) {
+      if (db_path.path == file_dir) {
+        return;
+      }
+    }
+    assert(false);
+#endif  // !NDEBUG
+  }
+
+  void VerifyFileName(const std::string& file_name) {
+#ifndef NDEBUG
+    uint64_t file_number;
+    FileType file_type;
+    bool result = ParseFileName(file_name, &file_number, &file_type);
+    assert(result);
+    assert(file_type == kTableFile);
+#endif  // !NDEBUG
+  }
+
+  void VerifyFilePath(const std::string& file_path) {
+#ifndef NDEBUG
+    size_t pos = file_path.find_last_of("/");
+    if (pos == std::string::npos) {
+      VerifyFileName(file_path);
+    } else {
+      if (pos > 0) {
+        VerifyFileDir(file_path.substr(0, pos));
+      }
+      VerifyFileName(file_path.substr(pos));
+    }
+#endif  // !NDEBUG
+  }
+#endif  // !ROCKSDB_LITE
+
+ private:
+  std::string db_name_;
+  std::vector<DbPath> db_paths_;
+  Random rand_;
+};
+
+}  // namespace
+
+class StressTest {
+ public:
+  StressTest()
+      : cache_(NewLRUCache(FLAGS_cache_size)),
+        compressed_cache_(FLAGS_compressed_cache_size >= 0
+                              ? NewLRUCache(FLAGS_compressed_cache_size)
+                              : nullptr),
+        filter_policy_(FLAGS_bloom_bits >= 0
+                   ? FLAGS_use_block_based_filter
+                     ? NewBloomFilterPolicy(FLAGS_bloom_bits, true)
+                     : NewBloomFilterPolicy(FLAGS_bloom_bits, false)
+                   : nullptr),
+        db_(nullptr),
+        new_column_family_name_(1),
+        num_times_reopened_(0) {
+    if (FLAGS_destroy_db_initially) {
+      std::vector<std::string> files;
+      FLAGS_env->GetChildren(FLAGS_db, &files);
+      for (unsigned int i = 0; i < files.size(); i++) {
+        if (Slice(files[i]).starts_with("heap-")) {
+          FLAGS_env->DeleteFile(FLAGS_db + "/" + files[i]);
+        }
+      }
+      DestroyDB(FLAGS_db, Options());
+    }
+  }
+
+  ~StressTest() {
+    for (auto cf : column_families_) {
+      delete cf;
+    }
+    column_families_.clear();
+    delete db_;
+  }
+
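+  // Builds a table mapping dynamically-settable option names to candidate
+  // values; when --set_options_one_in fires, the test presumably picks a
+  // random entry and applies it (the call site is further below).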
+  bool BuildOptionsTable() {
+    if (FLAGS_set_options_one_in <= 0) {
+      return true;
+    }
+
+    std::unordered_map<std::string, std::vector<std::string> > options_tbl = {
+        {"write_buffer_size",
+         {ToString(FLAGS_write_buffer_size),
+          ToString(FLAGS_write_buffer_size * 2),
+          ToString(FLAGS_write_buffer_size * 4)}},
+        {"max_write_buffer_number",
+         {ToString(FLAGS_max_write_buffer_number),
+          ToString(FLAGS_max_write_buffer_number * 2),
+          ToString(FLAGS_max_write_buffer_number * 4)}},
+        {"arena_block_size",
+         {
+             ToString(Options().arena_block_size),
+             ToString(FLAGS_write_buffer_size / 4),
+             ToString(FLAGS_write_buffer_size / 8),
+         }},
+        {"memtable_prefix_bloom_bits", {"0", "8", "10"}},
+        {"memtable_prefix_bloom_probes", {"4", "5", "6"}},
+        {"memtable_prefix_bloom_huge_page_tlb_size",
+         {"0", ToString(2 * 1024 * 1024)}},
+        {"max_successive_merges", {"0", "2", "4"}},
+        {"filter_deletes", {"0", "1"}},
+        {"inplace_update_num_locks", {"100", "200", "300"}},
+        // TODO(ljin): enable test for this option
+        // {"disable_auto_compactions", {"100", "200", "300"}},
+        {"soft_rate_limit", {"0", "0.5", "0.9"}},
+        {"hard_rate_limit", {"0", "1.1", "2.0"}},
+        {"level0_file_num_compaction_trigger",
+         {
+             ToString(FLAGS_level0_file_num_compaction_trigger),
+             ToString(FLAGS_level0_file_num_compaction_trigger + 2),
+             ToString(FLAGS_level0_file_num_compaction_trigger + 4),
+         }},
+        {"level0_slowdown_writes_trigger",
+         {
+             ToString(FLAGS_level0_slowdown_writes_trigger),
+             ToString(FLAGS_level0_slowdown_writes_trigger + 2),
+             ToString(FLAGS_level0_slowdown_writes_trigger + 4),
+         }},
+        {"level0_stop_writes_trigger",
+         {
+             ToString(FLAGS_level0_stop_writes_trigger),
+             ToString(FLAGS_level0_stop_writes_trigger + 2),
+             ToString(FLAGS_level0_stop_writes_trigger + 4),
+         }},
+        {"max_grandparent_overlap_factor",
+         {
+             ToString(Options().max_grandparent_overlap_factor - 5),
+             ToString(Options().max_grandparent_overlap_factor),
+             ToString(Options().max_grandparent_overlap_factor + 5),
+         }},
+        {"expanded_compaction_factor",
+         {
+             ToString(Options().expanded_compaction_factor - 5),
+             ToString(Options().expanded_compaction_factor),
+             ToString(Options().expanded_compaction_factor + 5),
+         }},
+        {"source_compaction_factor",
+         {
+             ToString(Options().source_compaction_factor),
+             ToString(Options().source_compaction_factor * 2),
+             ToString(Options().source_compaction_factor * 4),
+         }},
+        {"target_file_size_base",
+         {
+             ToString(FLAGS_target_file_size_base),
+             ToString(FLAGS_target_file_size_base * 2),
+             ToString(FLAGS_target_file_size_base * 4),
+         }},
+        {"target_file_size_multiplier",
+         {
+             ToString(FLAGS_target_file_size_multiplier), "1", "2",
+         }},
+        {"max_bytes_for_level_base",
+         {
+             ToString(FLAGS_max_bytes_for_level_base / 2),
+             ToString(FLAGS_max_bytes_for_level_base),
+             ToString(FLAGS_max_bytes_for_level_base * 2),
+         }},
+        {"max_bytes_for_level_multiplier",
+         {
+             ToString(FLAGS_max_bytes_for_level_multiplier), "1", "2",
+         }},
+        {"max_sequential_skip_in_iterations", {"4", "8", "12"}},
+    };
+
+    options_table_ = std::move(options_tbl);
+
+    for (const auto& iter : options_table_) {
+      options_index_.push_back(iter.first);
+    }
+    return true;
+  }
+
+  bool Run() {
+    PrintEnv();
+    BuildOptionsTable();
+    Open();
+    SharedState shared(this);
+    uint32_t n = shared.GetNumThreads();
+
+    std::vector<ThreadState*> threads(n);
+    for (uint32_t i = 0; i < n; i++) {
+      threads[i] = new ThreadState(i, &shared);
+      FLAGS_env->StartThread(ThreadBody, threads[i]);
+    }
+    ThreadState bg_thread(0, &shared);
+    if (FLAGS_compaction_thread_pool_adjust_interval > 0) {
+      FLAGS_env->StartThread(PoolSizeChangeThread, &bg_thread);
+    }
+
+    // Each thread goes through the following states:
+    // initializing -> wait for others to init -> read/populate/depopulate ->
+    // wait for others to operate -> verify -> done
+
+    {
+      MutexLock l(shared.GetMutex());
+      while (!shared.AllInitialized()) {
+        shared.GetCondVar()->Wait();
+      }
+
+      double now = FLAGS_env->NowMicros();
+      fprintf(stdout, "%s Starting database operations\n",
+              FLAGS_env->TimeToString((uint64_t) now/1000000).c_str());
+
+      shared.SetStart();
+      shared.GetCondVar()->SignalAll();
+      while (!shared.AllOperated()) {
+        shared.GetCondVar()->Wait();
+      }
+
+      now = FLAGS_env->NowMicros();
+      if (FLAGS_test_batches_snapshots) {
+        fprintf(stdout, "%s Limited verification already done during gets\n",
+                FLAGS_env->TimeToString((uint64_t) now/1000000).c_str());
+      } else {
+        fprintf(stdout, "%s Starting verification\n",
+                FLAGS_env->TimeToString((uint64_t) now/1000000).c_str());
+      }
+
+      shared.SetStartVerify();
+      shared.GetCondVar()->SignalAll();
+      while (!shared.AllDone()) {
+        shared.GetCondVar()->Wait();
+      }
+    }
+
+    for (unsigned int i = 1; i < n; i++) {
+      threads[0]->stats.Merge(threads[i]->stats);
+    }
+    threads[0]->stats.Report("Stress Test");
+
+    for (unsigned int i = 0; i < n; i++) {
+      delete threads[i];
+      threads[i] = nullptr;
+    }
+    double now = FLAGS_env->NowMicros();
+    if (!FLAGS_test_batches_snapshots) {
+      fprintf(stdout, "%s Verification successful\n",
+              FLAGS_env->TimeToString((uint64_t) now/1000000).c_str());
+    }
+    PrintStatistics();
+
+    if (FLAGS_compaction_thread_pool_adjust_interval > 0) {
+      MutexLock l(shared.GetMutex());
+      shared.SetShouldStopBgThread();
+      while (!shared.BgThreadFinished()) {
+        shared.GetCondVar()->Wait();
+      }
+    }
+
+    if (shared.HasVerificationFailedYet()) {
+      printf("Verification failed :(\n");
+      return false;
+    }
+    return true;
+  }
+
+ private:
+
+  static void ThreadBody(void* v) {
+    ThreadState* thread = reinterpret_cast<ThreadState*>(v);
+    SharedState* shared = thread->shared;
+
+    {
+      MutexLock l(shared->GetMutex());
+      shared->IncInitialized();
+      if (shared->AllInitialized()) {
+        shared->GetCondVar()->SignalAll();
+      }
+      while (!shared->Started()) {
+        shared->GetCondVar()->Wait();
+      }
+    }
+    thread->shared->GetStressTest()->OperateDb(thread);
+
+    {
+      MutexLock l(shared->GetMutex());
+      shared->IncOperated();
+      if (shared->AllOperated()) {
+        shared->GetCondVar()->SignalAll();
+      }
+      while (!shared->VerifyStarted()) {
+        shared->GetCondVar()->Wait();
+      }
+    }
+
+    if (!FLAGS_test_batches_snapshots) {
+      thread->shared->GetStressTest()->VerifyDb(thread);
+    }
+
+    {
+      MutexLock l(shared->GetMutex());
+      shared->IncDone();
+      if (shared->AllDone()) {
+        shared->GetCondVar()->SignalAll();
+      }
+    }
+
+  }
+
+  static void PoolSizeChangeThread(void* v) {
+    assert(FLAGS_compaction_thread_pool_adjust_interval > 0);
+    ThreadState* thread = reinterpret_cast<ThreadState*>(v);
+    SharedState* shared = thread->shared;
+
+    while (true) {
+      {
+        MutexLock l(shared->GetMutex());
+        if (shared->ShoudStopBgThread()) {
+          shared->SetBgThreadFinish();
+          shared->GetCondVar()->SignalAll();
+          return;
+        }
+      }
+
+      auto thread_pool_size_base = FLAGS_max_background_compactions;
+      auto thread_pool_size_var = FLAGS_compaction_thread_pool_variations;
+      int new_thread_pool_size =
+          thread_pool_size_base - thread_pool_size_var +
+          thread->rand.Next() % (thread_pool_size_var * 2 + 1);
+      if (new_thread_pool_size < 1) {
+        new_thread_pool_size = 1;
+      }
+      FLAGS_env->SetBackgroundThreads(new_thread_pool_size);
+      // Sleep up to FLAGS_compaction_thread_pool_adjust_interval milliseconds
+      FLAGS_env->SleepForMicroseconds(
+          thread->rand.Next() % FLAGS_compaction_thread_pool_adjust_interval *
+              1000 +
+          1);
+    }
+  }
+
+  // Given a key K and value V, this puts ("0"+K, "0"+V), ("1"+K, "1"+V), ...
+  // ("9"+K, "9"+V) in the DB atomically, i.e. in a single batch.
+  // See also MultiGet.
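+  // For example, MultiPut(K="foo", V="bar") writes the ten pairs
+  // ("0foo", "0bar"), ("1foo", "1bar"), ..., ("9foo", "9bar").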
+  Status MultiPut(ThreadState* thread, const WriteOptions& writeoptions,
+                  ColumnFamilyHandle* column_family, const Slice& key,
+                  const Slice& value, size_t sz) {
+    std::string keys[10] = {"9", "8", "7", "6", "5",
+                            "4", "3", "2", "1", "0"};
+    std::string values[10] = {"9", "8", "7", "6", "5",
+                              "4", "3", "2", "1", "0"};
+    Slice value_slices[10];
+    WriteBatch batch;
+    Status s;
+    for (int i = 0; i < 10; i++) {
+      keys[i] += key.ToString();
+      values[i] += value.ToString();
+      value_slices[i] = values[i];
+      if (FLAGS_use_merge) {
+        batch.Merge(column_family, keys[i], value_slices[i]);
+      } else {
+        batch.Put(column_family, keys[i], value_slices[i]);
+      }
+    }
+
+    s = db_->Write(writeoptions, &batch);
+    if (!s.ok()) {
+      fprintf(stderr, "multiput error: %s\n", s.ToString().c_str());
+      thread->stats.AddErrors(1);
+    } else {
+      // we did 10 writes each of size sz + 1
+      thread->stats.AddBytesForWrites(10, (sz + 1) * 10);
+    }
+
+    return s;
+  }
+
+  // Given a key K, this deletes ("0"+K), ("1"+K), ... ("9"+K) from the DB
+  // atomically, i.e. in a single batch. See also MultiGet.
+  Status MultiDelete(ThreadState* thread, const WriteOptions& writeoptions,
+                     ColumnFamilyHandle* column_family, const Slice& key) {
+    std::string keys[10] = {"9", "7", "5", "3", "1",
+                            "8", "6", "4", "2", "0"};
+
+    WriteBatch batch;
+    Status s;
+    for (int i = 0; i < 10; i++) {
+      keys[i] += key.ToString();
+      batch.Delete(column_family, keys[i]);
+    }
+
+    s = db_->Write(writeoptions, &batch);
+    if (!s.ok()) {
+      fprintf(stderr, "multidelete error: %s\n", s.ToString().c_str());
+      thread->stats.AddErrors(1);
+    } else {
+      thread->stats.AddDeletes(10);
+    }
+
+    return s;
+  }
+
+  // Given a key K, this gets values for "0"+K, "1"+K,..."9"+K
+  // in the same snapshot, and verifies that all the values are of the form
+  // "0"+V, "1"+V,..."9"+V.
+  // ASSUMES that MultiPut was used to put (K, V) into the DB.
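+  // For example, after MultiPut(K="foo", V="bar"), MultiGet(K="foo") reads
+  // "0foo".."9foo" and, after blanking the leading digit, checks that all
+  // ten values agree.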
+  Status MultiGet(ThreadState* thread, const ReadOptions& readoptions,
+                  ColumnFamilyHandle* column_family, const Slice& key,
+                  std::string* value) {
+    std::string keys[10] = {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9"};
+    Slice key_slices[10];
+    std::string values[10];
+    ReadOptions readoptionscopy = readoptions;
+    readoptionscopy.snapshot = db_->GetSnapshot();
+    Status s;
+    for (int i = 0; i < 10; i++) {
+      keys[i] += key.ToString();
+      key_slices[i] = keys[i];
+      s = db_->Get(readoptionscopy, column_family, key_slices[i], value);
+      if (!s.ok() && !s.IsNotFound()) {
+        fprintf(stderr, "get error: %s\n", s.ToString().c_str());
+        values[i] = "";
+        thread->stats.AddErrors(1);
+        // we continue after error rather than exiting so that we can
+        // find more errors if any
+      } else if (s.IsNotFound()) {
+        values[i] = "";
+        thread->stats.AddGets(1, 0);
+      } else {
+        values[i] = *value;
+
+        char expected_prefix = (keys[i])[0];
+        char actual_prefix = (values[i])[0];
+        if (actual_prefix != expected_prefix) {
+          fprintf(stderr, "error expected prefix = %c actual = %c\n",
+                  expected_prefix, actual_prefix);
+        }
+        (values[i])[0] = ' '; // blank out the differing character
+        thread->stats.AddGets(1, 1);
+      }
+    }
+    db_->ReleaseSnapshot(readoptionscopy.snapshot);
+
+    // Now that we retrieved all values, check that they all match
+    for (int i = 1; i < 10; i++) {
+      if (values[i] != values[0]) {
+        fprintf(stderr, "error : inconsistent values for key %s: %s, %s\n",
+                key.ToString(true).c_str(), StringToHex(values[0]).c_str(),
+                StringToHex(values[i]).c_str());
+        // we continue after error rather than exiting so that we can
+        // find more errors if any
+      }
+    }
+
+    return s;
+  }
+
+  // Given a key, this does prefix scans for "0"+P, "1"+P,..."9"+P
+  // in the same snapshot where P is the first FLAGS_prefix_size - 1 bytes
+  // of the key. Each of these 10 scans returns a series of values;
+  // each series should be the same length, and it is verified for each
+  // index i that all the i'th values are of the form "0"+V, "1"+V,..."9"+V.
+  // ASSUMES that MultiPut was used to put (K, V)
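+  // For illustration, with FLAGS_prefix_size == 4 and a key starting "foo",
+  // the ten scans use prefixes "0foo", "1foo", ..., "9foo" (each truncated
+  // to FLAGS_prefix_size bytes).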
+  Status MultiPrefixScan(ThreadState* thread, const ReadOptions& readoptions,
+                         ColumnFamilyHandle* column_family,
+                         const Slice& key) {
+    std::string prefixes[10] = {"0", "1", "2", "3", "4",
+                                "5", "6", "7", "8", "9"};
+    Slice prefix_slices[10];
+    ReadOptions readoptionscopy[10];
+    const Snapshot* snapshot = db_->GetSnapshot();
+    Iterator* iters[10];
+    Status s = Status::OK();
+    for (int i = 0; i < 10; i++) {
+      prefixes[i] += key.ToString();
+      prefixes[i].resize(FLAGS_prefix_size);
+      prefix_slices[i] = Slice(prefixes[i]);
+      readoptionscopy[i] = readoptions;
+      readoptionscopy[i].snapshot = snapshot;
+      iters[i] = db_->NewIterator(readoptionscopy[i], column_family);
+      iters[i]->Seek(prefix_slices[i]);
+    }
+
+    int count = 0;
+    while (iters[0]->Valid() && iters[0]->key().starts_with(prefix_slices[0])) {
+      count++;
+      std::string values[10];
+      // get list of all values for this iteration
+      for (int i = 0; i < 10; i++) {
+        // no iterator should finish before the first one
+        assert(iters[i]->Valid() &&
+               iters[i]->key().starts_with(prefix_slices[i]));
+        values[i] = iters[i]->value().ToString();
+
+        char expected_first = (prefixes[i])[0];
+        char actual_first = (values[i])[0];
+
+        if (actual_first != expected_first) {
+          fprintf(stderr, "error expected first = %c actual = %c\n",
+                  expected_first, actual_first);
+        }
+        (values[i])[0] = ' '; // blank out the differing character
+      }
+      // make sure all values are equivalent
+      for (int i = 0; i < 10; i++) {
+        if (values[i] != values[0]) {
+          fprintf(stderr, "error : %d, inconsistent values for prefix %s: %s, %s\n",
+                  i, prefixes[i].c_str(), StringToHex(values[0]).c_str(),
+                  StringToHex(values[i]).c_str());
+          // we continue after error rather than exiting so that we can
+          // find more errors if any
+        }
+        iters[i]->Next();
+      }
+    }
+
+    // cleanup iterators and snapshot
+    for (int i = 0; i < 10; i++) {
+      // if the first iterator finished, they should have all finished
+      assert(!iters[i]->Valid() ||
+             !iters[i]->key().starts_with(prefix_slices[i]));
+      assert(iters[i]->status().ok());
+      delete iters[i];
+    }
+    db_->ReleaseSnapshot(snapshot);
+
+    if (s.ok()) {
+      thread->stats.AddPrefixes(1, count);
+    } else {
+      thread->stats.AddErrors(1);
+    }
+
+    return s;
+  }
+
+  // Given a key K, this creates an iterator which seeks to K and then
+  // performs a random sequence of Next/Prev operations.
+  Status MultiIterate(ThreadState* thread, const ReadOptions& readoptions,
+                      ColumnFamilyHandle* column_family, const Slice& key) {
+    Status s;
+    const Snapshot* snapshot = db_->GetSnapshot();
+    ReadOptions readoptionscopy = readoptions;
+    readoptionscopy.snapshot = snapshot;
+    unique_ptr<Iterator> iter(db_->NewIterator(readoptionscopy, column_family));
+
+    iter->Seek(key);
+    for (uint64_t i = 0; i < FLAGS_num_iterations && iter->Valid(); i++) {
+      if (thread->rand.OneIn(2)) {
+        iter->Next();
+      } else {
+        iter->Prev();
+      }
+    }
+
+    if (s.ok()) {
+      thread->stats.AddIterations(1);
+    } else {
+      thread->stats.AddErrors(1);
+    }
+
+    db_->ReleaseSnapshot(snapshot);
+
+    return s;
+  }
+
+  Status SetOptions(ThreadState* thread) {
+    assert(FLAGS_set_options_one_in > 0);
+    std::unordered_map<std::string, std::string> opts;
+    std::string name = options_index_[
+      thread->rand.Next() % options_index_.size()];
+    int value_idx = thread->rand.Next() % options_table_[name].size();
+    if (name == "soft_rate_limit" || name == "hard_rate_limit") {
+      opts["soft_rate_limit"] = options_table_["soft_rate_limit"][value_idx];
+      opts["hard_rate_limit"] = options_table_["hard_rate_limit"][value_idx];
+    } else if (name == "level0_file_num_compaction_trigger" ||
+               name == "level0_slowdown_writes_trigger" ||
+               name == "level0_stop_writes_trigger") {
+      opts["level0_file_num_compaction_trigger"] =
+        options_table_["level0_file_num_compaction_trigger"][value_idx];
+      opts["level0_slowdown_writes_trigger"] =
+        options_table_["level0_slowdown_writes_trigger"][value_idx];
+      opts["level0_stop_writes_trigger"] =
+        options_table_["level0_stop_writes_trigger"][value_idx];
+    } else {
+      opts[name] = options_table_[name][value_idx];
+    }
+
+    int rand_cf_idx = thread->rand.Next() % FLAGS_column_families;
+    auto cfh = column_families_[rand_cf_idx];
+    return db_->SetOptions(cfh, opts);
+  }
+
+  void OperateDb(ThreadState* thread) {
+    ReadOptions read_opts(FLAGS_verify_checksum, true);
+    WriteOptions write_opts;
+    auto shared = thread->shared;
+    char value[100];
+    long max_key = thread->shared->GetMaxKey();
+    std::string from_db;
+    if (FLAGS_sync) {
+      write_opts.sync = true;
+    }
+    write_opts.disableWAL = FLAGS_disable_wal;
+    const int prefixBound = (int)FLAGS_readpercent + (int)FLAGS_prefixpercent;
+    const int writeBound = prefixBound + (int)FLAGS_writepercent;
+    const int delBound = writeBound + (int)FLAGS_delpercent;
+
+    thread->stats.Start();
+    for (uint64_t i = 0; i < FLAGS_ops_per_thread; i++) {
+      if (thread->shared->HasVerificationFailedYet()) {
+        break;
+      }
+      if (i != 0 && (i % (FLAGS_ops_per_thread / (FLAGS_reopen + 1))) == 0) {
+        {
+          thread->stats.FinishedSingleOp();
+          MutexLock l(thread->shared->GetMutex());
+          thread->shared->IncVotedReopen();
+          if (thread->shared->AllVotedReopen()) {
+            thread->shared->GetStressTest()->Reopen();
+            thread->shared->GetCondVar()->SignalAll();
+          }
+          else {
+            thread->shared->GetCondVar()->Wait();
+          }
+          // Commenting this out as we don't want to reset stats on each open.
+          // thread->stats.Start();
+        }
+      }
+
+      // Change Options
+      if (FLAGS_set_options_one_in > 0 &&
+          thread->rand.OneIn(FLAGS_set_options_one_in)) {
+        SetOptions(thread);
+      }
+
+      if (FLAGS_set_in_place_one_in > 0 &&
+          thread->rand.OneIn(FLAGS_set_in_place_one_in)) {
+        options_.inplace_update_support = !options_.inplace_update_support;
+      }
+
+      if (!FLAGS_test_batches_snapshots &&
+          FLAGS_clear_column_family_one_in != 0 && FLAGS_column_families > 1) {
+        if (thread->rand.OneIn(FLAGS_clear_column_family_one_in)) {
+          // drop column family and then create it again (can't drop default)
+          int cf = thread->rand.Next() % (FLAGS_column_families - 1) + 1;
+          std::string new_name =
+              ToString(new_column_family_name_.fetch_add(1));
+          {
+            MutexLock l(thread->shared->GetMutex());
+            fprintf(
+                stdout,
+                "[CF %d] Dropping and recreating column family. new name: %s\n",
+                cf, new_name.c_str());
+          }
+          thread->shared->LockColumnFamily(cf);
+          Status s __attribute__((unused));
+          s = db_->DropColumnFamily(column_families_[cf]);
+          delete column_families_[cf];
+          if (!s.ok()) {
+            fprintf(stderr, "dropping column family error: %s\n",
+                s.ToString().c_str());
+            std::terminate();
+          }
+          s = db_->CreateColumnFamily(ColumnFamilyOptions(options_), new_name,
+                                      &column_families_[cf]);
+          column_family_names_[cf] = new_name;
+          thread->shared->ClearColumnFamily(cf);
+          if (!s.ok()) {
+            fprintf(stderr, "creating column family error: %s\n",
+                s.ToString().c_str());
+            std::terminate();
+          }
+          thread->shared->UnlockColumnFamily(cf);
+        }
+      }
+
+      long rand_key = thread->rand.Next() % max_key;
+      int rand_column_family = thread->rand.Next() % FLAGS_column_families;
+      std::string keystr = Key(rand_key);
+      Slice key = keystr;
+      std::unique_ptr<MutexLock> l;
+      if (!FLAGS_test_batches_snapshots) {
+        l.reset(new MutexLock(
+            shared->GetMutexForKey(rand_column_family, rand_key)));
+      }
+      auto column_family = column_families_[rand_column_family];
+
+      int prob_op = thread->rand.Uniform(100);
+      if (prob_op >= 0 && prob_op < (int)FLAGS_readpercent) {
+        // OPERATION read
+        if (!FLAGS_test_batches_snapshots) {
+          Status s = db_->Get(read_opts, column_family, key, &from_db);
+          if (s.ok()) {
+            // found case
+            thread->stats.AddGets(1, 1);
+          } else if (s.IsNotFound()) {
+            // not found case
+            thread->stats.AddGets(1, 0);
+          } else {
+            // errors case
+            thread->stats.AddErrors(1);
+          }
+        } else {
+          MultiGet(thread, read_opts, column_family, key, &from_db);
+        }
+      } else if ((int)FLAGS_readpercent <= prob_op && prob_op < prefixBound) {
+        // OPERATION prefix scan
+        // keys are 8 bytes long, prefix size is FLAGS_prefix_size. There are
+        // (8 - FLAGS_prefix_size) bytes besides the prefix. So there will
+        // be 2 ^ ((8 - FLAGS_prefix_size) * 8) possible keys with the same
+        // prefix
+        if (!FLAGS_test_batches_snapshots) {
+          Slice prefix = Slice(key.data(), FLAGS_prefix_size);
+          Iterator* iter = db_->NewIterator(read_opts, column_family);
+          int64_t count = 0;
+          for (iter->Seek(prefix);
+               iter->Valid() && iter->key().starts_with(prefix); iter->Next()) {
+            ++count;
+          }
+          assert(count <=
+                 (static_cast<int64_t>(1) << ((8 - FLAGS_prefix_size) * 8)));
+          if (iter->status().ok()) {
+            thread->stats.AddPrefixes(1, static_cast<int>(count));
+          } else {
+            thread->stats.AddErrors(1);
+          }
+          delete iter;
+        } else {
+          MultiPrefixScan(thread, read_opts, column_family, key);
+        }
+      } else if (prefixBound <= prob_op && prob_op < writeBound) {
+        // OPERATION write
+        uint32_t value_base = thread->rand.Next();
+        size_t sz = GenerateValue(value_base, value, sizeof(value));
+        Slice v(value, sz);
+        if (!FLAGS_test_batches_snapshots) {
+          // If the chosen key does not allow overwrite and it already
+          // exists, choose another key.
+          while (!shared->AllowsOverwrite(rand_column_family, rand_key) &&
+                 shared->Exists(rand_column_family, rand_key)) {
+            l.reset();
+            rand_key = thread->rand.Next() % max_key;
+            rand_column_family = thread->rand.Next() % FLAGS_column_families;
+            l.reset(new MutexLock(
+                shared->GetMutexForKey(rand_column_family, rand_key)));
+          }
+
+          keystr = Key(rand_key);
+          key = keystr;
+          column_family = column_families_[rand_column_family];
+
+          if (FLAGS_verify_before_write) {
+            std::string keystr2 = Key(rand_key);
+            Slice k = keystr2;
+            Status s = db_->Get(read_opts, column_family, k, &from_db);
+            if (!VerifyValue(rand_column_family, rand_key, read_opts,
+                             thread->shared, from_db, s, true)) {
+              break;
+            }
+          }
+          shared->Put(rand_column_family, rand_key, value_base);
+          Status s;
+          if (FLAGS_use_merge) {
+            s = db_->Merge(write_opts, column_family, key, v);
+          } else {
+            s = db_->Put(write_opts, column_family, key, v);
+          }
+          if (!s.ok()) {
+            fprintf(stderr, "put or merge error: %s\n", s.ToString().c_str());
+            std::terminate();
+          }
+          thread->stats.AddBytesForWrites(1, sz);
+        } else {
+          MultiPut(thread, write_opts, column_family, key, v, sz);
+        }
+        PrintKeyValue(rand_column_family, static_cast<uint32_t>(rand_key),
+                      value, sz);
+      } else if (writeBound <= prob_op && prob_op < delBound) {
+        // OPERATION delete
+        if (!FLAGS_test_batches_snapshots) {
+          // If the chosen key does not allow overwrite and it does not exist,
+          // choose another key.
+          while (!shared->AllowsOverwrite(rand_column_family, rand_key) &&
+                 !shared->Exists(rand_column_family, rand_key)) {
+            l.reset();
+            rand_key = thread->rand.Next() % max_key;
+            rand_column_family = thread->rand.Next() % FLAGS_column_families;
+            l.reset(new MutexLock(
+                shared->GetMutexForKey(rand_column_family, rand_key)));
+          }
+
+          keystr = Key(rand_key);
+          key = keystr;
+          column_family = column_families_[rand_column_family];
+
+          // Use delete if the key may be overwritten and a single deletion
+          // otherwise.
+          if (shared->AllowsOverwrite(rand_column_family, rand_key)) {
+            shared->Delete(rand_column_family, rand_key);
+            Status s = db_->Delete(write_opts, column_family, key);
+            thread->stats.AddDeletes(1);
+            if (!s.ok()) {
+              fprintf(stderr, "delete error: %s\n", s.ToString().c_str());
+              std::terminate();
+            }
+          } else {
+            shared->SingleDelete(rand_column_family, rand_key);
+            Status s = db_->SingleDelete(write_opts, column_family, key);
+            thread->stats.AddSingleDeletes(1);
+            if (!s.ok()) {
+              fprintf(stderr, "single delete error: %s\n",
+                      s.ToString().c_str());
+              std::terminate();
+            }
+          }
+        } else {
+          MultiDelete(thread, write_opts, column_family, key);
+        }
+      } else {
+        // OPERATION iterate
+        MultiIterate(thread, read_opts, column_family, key);
+      }
+      thread->stats.FinishedSingleOp();
+    }
+
+    thread->stats.Stop();
+  }
+
+  void VerifyDb(ThreadState* thread) const {
+    ReadOptions options(FLAGS_verify_checksum, true);
+    auto shared = thread->shared;
+    const int64_t max_key = shared->GetMaxKey();
+    const int64_t keys_per_thread = max_key / shared->GetNumThreads();
+    int64_t start = keys_per_thread * thread->tid;
+    int64_t end = start + keys_per_thread;
+    if (thread->tid == shared->GetNumThreads() - 1) {
+      end = max_key;
+    }
+    for (size_t cf = 0; cf < column_families_.size(); ++cf) {
+      if (thread->shared->HasVerificationFailedYet()) {
+        break;
+      }
+      if (!thread->rand.OneIn(2)) {
+        // Use iterator to verify this range
+        unique_ptr<Iterator> iter(
+            db_->NewIterator(options, column_families_[cf]));
+        iter->Seek(Key(start));
+        for (long i = start; i < end; i++) {
+          if (thread->shared->HasVerificationFailedYet()) {
+            break;
+          }
+          // TODO(ljin): update "long" to uint64_t
+          // Reseek when the prefix changes
+          if (i % (static_cast<int64_t>(1) << 8 * (8 - FLAGS_prefix_size)) ==
+              0) {
+            iter->Seek(Key(i));
+          }
+          std::string from_db;
+          std::string keystr = Key(i);
+          Slice k = keystr;
+          Status s = iter->status();
+          if (iter->Valid()) {
+            if (iter->key().compare(k) > 0) {
+              s = Status::NotFound(Slice());
+            } else if (iter->key().compare(k) == 0) {
+              from_db = iter->value().ToString();
+              iter->Next();
+            } else if (iter->key().compare(k) < 0) {
+              VerificationAbort(shared, "An out of range key was found",
+                                static_cast<int>(cf), i);
+            }
+          } else {
+            // The iterator found no value for the key in question, so do not
+            // move to the next item in the iterator
+            s = Status::NotFound(Slice());
+          }
+          VerifyValue(static_cast<int>(cf), i, options, shared, from_db, s,
+                      true);
+          if (from_db.length()) {
+            PrintKeyValue(static_cast<int>(cf), static_cast<uint32_t>(i),
+                          from_db.data(), from_db.length());
+          }
+        }
+      } else {
+        // Use Get to verify this range
+        for (long i = start; i < end; i++) {
+          if (thread->shared->HasVerificationFailedYet()) {
+            break;
+          }
+          std::string from_db;
+          std::string keystr = Key(i);
+          Slice k = keystr;
+          Status s = db_->Get(options, column_families_[cf], k, &from_db);
+          VerifyValue(static_cast<int>(cf), i, options, shared, from_db, s,
+                      true);
+          if (from_db.length()) {
+            PrintKeyValue(static_cast<int>(cf), static_cast<uint32_t>(i),
+                          from_db.data(), from_db.length());
+          }
+        }
+      }
+    }
+  }
+
+  void VerificationAbort(SharedState* shared, std::string msg, int cf,
+                         long key) const {
+    printf("Verification failed for column family %d key %ld: %s\n", cf, key,
+           msg.c_str());
+    shared->SetVerificationFailure();
+  }
+
+  bool VerifyValue(int cf, long key, const ReadOptions& opts,
+                   SharedState* shared, const std::string& value_from_db,
+                   Status s, bool strict = false) const {
+    if (shared->HasVerificationFailedYet()) {
+      return false;
+    }
+    // compare value_from_db with the value in the shared state
+    char value[100];
+    uint32_t value_base = shared->Get(cf, key);
+    if (value_base == SharedState::SENTINEL && !strict) {
+      return true;
+    }
+
+    if (s.ok()) {
+      if (value_base == SharedState::SENTINEL) {
+        VerificationAbort(shared, "Unexpected value found", cf, key);
+        return false;
+      }
+      size_t sz = GenerateValue(value_base, value, sizeof(value));
+      if (value_from_db.length() != sz) {
+        VerificationAbort(shared, "Length of value read is not equal", cf, key);
+        return false;
+      }
+      if (memcmp(value_from_db.data(), value, sz) != 0) {
+        VerificationAbort(shared, "Contents of value read don't match", cf,
+                          key);
+        return false;
+      }
+    } else {
+      if (value_base != SharedState::SENTINEL) {
+        VerificationAbort(shared, "Value not found: " + s.ToString(), cf, key);
+        return false;
+      }
+    }
+    return true;
+  }
+
+  static void PrintKeyValue(int cf, uint32_t key, const char* value,
+                            size_t sz) {
+    if (!FLAGS_verbose) {
+      return;
+    }
+    fprintf(stdout, "[CF %d] %u ==> (%u) ", cf, key, (unsigned int)sz);
+    for (size_t i = 0; i < sz; i++) {
+      fprintf(stdout, "%X", value[i]);
+    }
+    fprintf(stdout, "\n");
+  }
+
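+  // Value layout: the first 4 bytes hold the raw seed, the rest is filled
+  // with (seed ^ index) bytes; the total length is 1x-3x
+  // FLAGS_value_size_mult, so VerifyValue() can regenerate and compare.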
+  static size_t GenerateValue(uint32_t rand, char *v, size_t max_sz) {
+    size_t value_sz = ((rand % 3) + 1) * FLAGS_value_size_mult;
+    // value_sz + 1 because a trailing '\0' is written one past value_sz
+    assert(value_sz + 1 <= max_sz && value_sz >= sizeof(uint32_t));
+    *((uint32_t*)v) = rand;
+    for (size_t i = sizeof(uint32_t); i < value_sz; i++) {
+      v[i] = (char)(rand ^ i);
+    }
+    v[value_sz] = '\0';
+    return value_sz;  // the size of the value set
+  }
+
+  void PrintEnv() const {
+    fprintf(stdout, "RocksDB version           : %d.%d\n", kMajorVersion,
+            kMinorVersion);
+    fprintf(stdout, "Column families           : %d\n", FLAGS_column_families);
+    if (!FLAGS_test_batches_snapshots) {
+      fprintf(stdout, "Clear CFs one in          : %d\n",
+              FLAGS_clear_column_family_one_in);
+    }
+    fprintf(stdout, "Number of threads         : %d\n", FLAGS_threads);
+    fprintf(stdout, "Ops per thread            : %lu\n",
+            (unsigned long)FLAGS_ops_per_thread);
+    std::string ttl_state("unused");
+    if (FLAGS_ttl > 0) {
+      ttl_state = NumberToString(FLAGS_ttl);
+    }
+    fprintf(stdout, "Time to live(sec)         : %s\n", ttl_state.c_str());
+    fprintf(stdout, "Read percentage           : %d%%\n", FLAGS_readpercent);
+    fprintf(stdout, "Prefix percentage         : %d%%\n", FLAGS_prefixpercent);
+    fprintf(stdout, "Write percentage          : %d%%\n", FLAGS_writepercent);
+    fprintf(stdout, "Delete percentage         : %d%%\n", FLAGS_delpercent);
+    fprintf(stdout, "No overwrite percentage   : %d%%\n",
+            FLAGS_nooverwritepercent);
+    fprintf(stdout, "Iterate percentage        : %d%%\n", FLAGS_iterpercent);
+    fprintf(stdout, "DB-write-buffer-size      : %" PRIu64 "\n",
+            FLAGS_db_write_buffer_size);
+    fprintf(stdout, "Write-buffer-size         : %d\n",
+            FLAGS_write_buffer_size);
+    fprintf(stdout, "Iterations                : %lu\n",
+            (unsigned long)FLAGS_num_iterations);
+    fprintf(stdout, "Max key                   : %lu\n",
+            (unsigned long)FLAGS_max_key);
+    fprintf(stdout, "Ratio #ops/#keys          : %f\n",
+            (1.0 * FLAGS_ops_per_thread * FLAGS_threads) / FLAGS_max_key);
+    fprintf(stdout, "Num times DB reopens      : %d\n", FLAGS_reopen);
+    fprintf(stdout, "Batches/snapshots         : %d\n",
+            FLAGS_test_batches_snapshots);
+    fprintf(stdout, "Deletes use filter        : %d\n", FLAGS_filter_deletes);
+    fprintf(stdout, "Do update in place        : %d\n", FLAGS_in_place_update);
+    fprintf(stdout, "Num keys per lock         : %d\n",
+            1 << FLAGS_log2_keys_per_lock);
+    std::string compression = CompressionTypeToString(FLAGS_compression_type_e);
+    fprintf(stdout, "Compression               : %s\n", compression.c_str());
+
+    const char* memtablerep = "";
+    switch (FLAGS_rep_factory) {
+      case kSkipList:
+        memtablerep = "skip_list";
+        break;
+      case kHashSkipList:
+        memtablerep = "prefix_hash";
+        break;
+      case kVectorRep:
+        memtablerep = "vector";
+        break;
+    }
+
+    fprintf(stdout, "Memtablerep               : %s\n", memtablerep);
+
+    fprintf(stdout, "------------------------------------------------\n");
+  }
+
+  void Open() {
+    assert(db_ == nullptr);
+    BlockBasedTableOptions block_based_options;
+    block_based_options.block_cache = cache_;
+    block_based_options.block_cache_compressed = compressed_cache_;
+    block_based_options.block_size = FLAGS_block_size;
+    block_based_options.format_version = 2;
+    block_based_options.filter_policy = filter_policy_;
+    options_.table_factory.reset(
+        NewBlockBasedTableFactory(block_based_options));
+    options_.db_write_buffer_size = FLAGS_db_write_buffer_size;
+    options_.write_buffer_size = FLAGS_write_buffer_size;
+    options_.max_write_buffer_number = FLAGS_max_write_buffer_number;
+    options_.min_write_buffer_number_to_merge =
+        FLAGS_min_write_buffer_number_to_merge;
+    options_.max_write_buffer_number_to_maintain =
+        FLAGS_max_write_buffer_number_to_maintain;
+    options_.max_background_compactions = FLAGS_max_background_compactions;
+    options_.max_background_flushes = FLAGS_max_background_flushes;
+    options_.compaction_style =
+        static_cast<rocksdb::CompactionStyle>(FLAGS_compaction_style);
+    options_.prefix_extractor.reset(NewFixedPrefixTransform(FLAGS_prefix_size));
+    options_.max_open_files = FLAGS_open_files;
+    options_.statistics = dbstats;
+    options_.env = FLAGS_env;
+    options_.disableDataSync = FLAGS_disable_data_sync;
+    options_.use_fsync = FLAGS_use_fsync;
+    options_.allow_mmap_reads = FLAGS_mmap_read;
+    rocksdb_kill_odds = FLAGS_kill_random_test;
+    options_.target_file_size_base = FLAGS_target_file_size_base;
+    options_.target_file_size_multiplier = FLAGS_target_file_size_multiplier;
+    options_.max_bytes_for_level_base = FLAGS_max_bytes_for_level_base;
+    options_.max_bytes_for_level_multiplier =
+        FLAGS_max_bytes_for_level_multiplier;
+    options_.level0_stop_writes_trigger = FLAGS_level0_stop_writes_trigger;
+    options_.level0_slowdown_writes_trigger =
+        FLAGS_level0_slowdown_writes_trigger;
+    options_.level0_file_num_compaction_trigger =
+        FLAGS_level0_file_num_compaction_trigger;
+    options_.compression = FLAGS_compression_type_e;
+    options_.create_if_missing = true;
+    options_.max_manifest_file_size = 10 * 1024;
+    options_.filter_deletes = FLAGS_filter_deletes;
+    options_.inplace_update_support = FLAGS_in_place_update;
+    options_.max_subcompactions = static_cast<uint32_t>(FLAGS_subcompactions);
+    if ((FLAGS_prefix_size == 0) == (FLAGS_rep_factory == kHashSkipList)) {
+      fprintf(stderr,
+            "prefix_size should be non-zero iff memtablerep == prefix_hash\n");
+      exit(1);
+    }
+    switch (FLAGS_rep_factory) {
+      case kSkipList:
+        // no need to do anything
+        break;
+#ifndef ROCKSDB_LITE
+      case kHashSkipList:
+        options_.memtable_factory.reset(NewHashSkipListRepFactory(10000));
+        break;
+      case kVectorRep:
+        options_.memtable_factory.reset(new VectorRepFactory());
+        break;
+#else
+      default:
+        fprintf(stderr,
+                "RocksDB Lite only supports the skip list memtable; "
+                "ignoring --rep_factory\n");
+#endif  // ROCKSDB_LITE
+    }
+
+    if (FLAGS_use_merge) {
+      options_.merge_operator = MergeOperators::CreatePutOperator();
+    }
+
+    // set universal style compaction configurations, if applicable
+    if (FLAGS_universal_size_ratio != 0) {
+      options_.compaction_options_universal.size_ratio =
+          FLAGS_universal_size_ratio;
+    }
+    if (FLAGS_universal_min_merge_width != 0) {
+      options_.compaction_options_universal.min_merge_width =
+          FLAGS_universal_min_merge_width;
+    }
+    if (FLAGS_universal_max_merge_width != 0) {
+      options_.compaction_options_universal.max_merge_width =
+          FLAGS_universal_max_merge_width;
+    }
+    if (FLAGS_universal_max_size_amplification_percent != 0) {
+      options_.compaction_options_universal.max_size_amplification_percent =
+          FLAGS_universal_max_size_amplification_percent;
+    }
+
+    fprintf(stdout, "DB path: [%s]\n", FLAGS_db.c_str());
+
+    Status s;
+    if (FLAGS_ttl == -1) {
+      std::vector<std::string> existing_column_families;
+      s = DB::ListColumnFamilies(DBOptions(options_), FLAGS_db,
+                                 &existing_column_families);  // error => DB does not exist yet
+      if (!s.ok()) {
+        // DB doesn't exist
+        assert(existing_column_families.empty());
+        assert(column_family_names_.empty());
+        column_family_names_.push_back(kDefaultColumnFamilyName);
+      } else if (column_family_names_.empty()) {
+        // this is the first call to the function Open()
+        column_family_names_ = existing_column_families;
+      } else {
+        // this is a reopen. just assert that existing column_family_names are
+        // equivalent to what we remember
+        auto sorted_cfn = column_family_names_;
+        sort(sorted_cfn.begin(), sorted_cfn.end());
+        sort(existing_column_families.begin(), existing_column_families.end());
+        if (sorted_cfn != existing_column_families) {
+          fprintf(stderr,
+                  "Expected column families differ from the existing:\n");
+          printf("Expected: {");
+          for (auto cf : sorted_cfn) {
+            printf("%s ", cf.c_str());
+          }
+          printf("}\n");
+          printf("Existing: {");
+          for (auto cf : existing_column_families) {
+            printf("%s ", cf.c_str());
+          }
+          printf("}\n");
+        }
+        assert(sorted_cfn == existing_column_families);
+      }
+      std::vector<ColumnFamilyDescriptor> cf_descriptors;
+      for (auto name : column_family_names_) {
+        if (name != kDefaultColumnFamilyName) {
+          new_column_family_name_ =
+              std::max(new_column_family_name_.load(), std::stoi(name) + 1);
+        }
+        cf_descriptors.emplace_back(name, ColumnFamilyOptions(options_));
+      }
+      while (cf_descriptors.size() < (size_t)FLAGS_column_families) {
+        std::string name = ToString(new_column_family_name_.load());
+        new_column_family_name_++;
+        cf_descriptors.emplace_back(name, ColumnFamilyOptions(options_));
+        column_family_names_.push_back(name);
+      }
+      options_.listeners.clear();
+      options_.listeners.emplace_back(
+          new DbStressListener(FLAGS_db, options_.db_paths));
+      options_.create_missing_column_families = true;
+      s = DB::Open(DBOptions(options_), FLAGS_db, cf_descriptors,
+                   &column_families_, &db_);
+      assert(!s.ok() || column_families_.size() ==
+                            static_cast<size_t>(FLAGS_column_families));
+    } else {
+#ifndef ROCKSDB_LITE
+      DBWithTTL* db_with_ttl;
+      s = DBWithTTL::Open(options_, FLAGS_db, &db_with_ttl, FLAGS_ttl);
+      db_ = db_with_ttl;
+#else
+      fprintf(stderr, "TTL is not supported in RocksDBLite\n");
+      exit(1);
+#endif
+    }
+    if (!s.ok()) {
+      fprintf(stderr, "open error: %s\n", s.ToString().c_str());
+      exit(1);
+    }
+  }
+
+  void Reopen() {
+    for (auto cf : column_families_) {
+      delete cf;
+    }
+    column_families_.clear();
+    delete db_;
+    db_ = nullptr;
+
+    num_times_reopened_++;
+    double now = FLAGS_env->NowMicros();
+    fprintf(stdout, "%s Reopening database for the %dth time\n",
+            FLAGS_env->TimeToString((uint64_t) now/1000000).c_str(),
+            num_times_reopened_);
+    Open();
+  }
+
+  void PrintStatistics() {
+    if (dbstats) {
+      fprintf(stdout, "STATISTICS:\n%s\n", dbstats->ToString().c_str());
+    }
+  }
+
+ private:
+  std::shared_ptr<Cache> cache_;
+  std::shared_ptr<Cache> compressed_cache_;
+  std::shared_ptr<const FilterPolicy> filter_policy_;
+  DB* db_;
+  Options options_;
+  std::vector<ColumnFamilyHandle*> column_families_;
+  std::vector<std::string> column_family_names_;
+  std::atomic<int> new_column_family_name_;
+  int num_times_reopened_;
+  std::unordered_map<std::string, std::vector<std::string>> options_table_;
+  std::vector<std::string> options_index_;
+};
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) +
+                  " [OPTIONS]...");
+  ParseCommandLineFlags(&argc, &argv, true);
+
+  if (FLAGS_statistics) {
+    dbstats = rocksdb::CreateDBStatistics();
+  }
+  FLAGS_compression_type_e =
+    StringToCompressionType(FLAGS_compression_type.c_str());
+  if (!FLAGS_hdfs.empty()) {
+    FLAGS_env  = new rocksdb::HdfsEnv(FLAGS_hdfs);
+  }
+  FLAGS_rep_factory = StringToRepFactory(FLAGS_memtablerep.c_str());
+
+  // The number of background threads should be at least as large as the
+  // maximum number of concurrent compactions.
+  FLAGS_env->SetBackgroundThreads(FLAGS_max_background_compactions);
+
+  if (FLAGS_prefixpercent > 0 && FLAGS_prefix_size <= 0) {
+    fprintf(stderr,
+            "Error: prefixpercent is non-zero while prefix_size is "
+            "not positive!\n");
+    exit(1);
+  }
+  if (FLAGS_test_batches_snapshots && FLAGS_prefix_size <= 0) {
+    fprintf(stderr,
+            "Error: please specify prefix_size for "
+            "test_batches_snapshots test!\n");
+    exit(1);
+  }
+  if ((FLAGS_readpercent + FLAGS_prefixpercent +
+       FLAGS_writepercent + FLAGS_delpercent + FLAGS_iterpercent) != 100) {
+      fprintf(stderr,
+              "Error: Read+Prefix+Write+Delete+Iterate percents != 100!\n");
+      exit(1);
+  }
+  if (FLAGS_disable_wal == 1 && FLAGS_reopen > 0) {
+      fprintf(stderr, "Error: Db cannot reopen safely with disable_wal set!\n");
+      exit(1);
+  }
+  if ((unsigned)FLAGS_reopen >= FLAGS_ops_per_thread) {
+      fprintf(stderr,
+              "Error: #DB-reopens should be < ops_per_thread\n"
+              "Provided reopens = %d and ops_per_thread = %lu\n",
+              FLAGS_reopen,
+              (unsigned long)FLAGS_ops_per_thread);
+      exit(1);
+  }
+
+  // Choose a location for the test database if none given with --db=<path>
+  if (FLAGS_db.empty()) {
+      std::string default_db_path;
+      rocksdb::Env::Default()->GetTestDirectory(&default_db_path);
+      default_db_path += "/dbstress";
+      FLAGS_db = default_db_path;
+  }
+
+  rocksdb::StressTest stress;
+  if (stress.Run()) {
+    return 0;
+  } else {
+    return 1;
+  }
+}
+
+#endif  // GFLAGS
diff --git a/src/rocksdb/tools/dbench_monitor b/src/rocksdb/tools/dbench_monitor
new file mode 100755
index 0000000..10726dc
--- /dev/null
+++ b/src/rocksdb/tools/dbench_monitor
@@ -0,0 +1,102 @@
+#!/bin/bash
+#
+#(c) 2004-present, Facebook Inc. All rights reserved.
+#
+#see LICENSE file for more information on use/redistribution rights.
+#
+
+#
+#dbench_monitor: monitor the db_bench process for excessive memory utilization
+#
+#By default this monitors 'virtual memory size'. See below for the standard
+#options passed to db_bench during this test.
+#
+# See also: ./pflag for the actual monitoring script that does the work
+#
+#NOTE:
+#  You may end up with some /tmp/ files if db_bench OR
+#  this script OR ./pflag was killed unceremoniously
+#
+#  If you see the script taking a long time, sending it a "kill"
+#  will usually make it exit cleanly.
+#
+#
+DIR=`dirname $0`
+LOG=/tmp/`basename $0`.$$
+DB_BENCH="$DIR/../db_bench";
+PFLAG=${DIR}/pflag
+
+usage() {
+    cat <<HELP; exit
+
+Usage: $0  [-h]
+
+-h: prints this help message
+
+This program runs db_bench and monitors its memory usage
+using the 'pflag' program. It launches db_bench with default settings
+for certain arguments. You can change the defaults passed to
+db_bench by setting the following environment
+variables:
+
+  bs [block_size]
+  ztype [compression_type]
+  benches [benchmarks]
+  reads [reads]
+  threads [threads]
+  cs [cache_size]
+  vsize [value_size]
+  comp [compression_ratio]
+  num [num]
+
+See the code for more info
+
+HELP
+
+}
+
+[ ! -x ${DB_BENCH} ] && echo "WARNING: ${DB_BENCH} doesn't exist, abort!" && exit -1;
+
+[ "x$1" = "x-h" ] && usage;
+
+trap 'rm -f ${LOG}; [ -n "${PID}" ] && kill ${PID}; echo "Interrupted, exiting"; exit 1' 1 2 3 15
+
+touch $LOG;
+
+: ${bs:=16384}
+: ${ztype:=zlib}
+: ${benches:=readwhilewriting}
+: ${reads:=$((1*1024*1024))};
+: ${threads:=8}
+: ${vsize:=2000}
+: ${comp:=0.5}
+: ${num:=10000}
+: ${cs:=$((1*1024*1024*1024))};
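+
+#Example (values are illustrative): override any subset of the defaults, e.g.
+#  bs=4096 ztype=snappy threads=16 num=100000 ./dbench_monitor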
+
+DEBUG=1    #Set to empty (DEBUG=) to remove chattiness; the check below tests non-empty
+
+
+if [ "x$DEBUG" != "x" ]; then
+  #
+  #NOTE: under some circumstances, --use_existing_db may leave LOCK files under ${TMPDIR}/rocksdb/*
+  #cleanup the dir and re-run
+  #
+  echo DEBUG: Will run $DB_BENCH --block_size=$bs --compression_type=$ztype --benchmarks="$benches" --reads="$reads" --threads="$threads" --cache_size=$cs  --value_size=$vsize --compression_ratio=$comp --num=$num --use_existing_db 
+
+fi
+
+$DB_BENCH --block_size=$bs --compression_type=$ztype --benchmarks="$benches" --reads="$reads" --threads="$threads" --cache_size=$cs  --value_size=$vsize --compression_ratio=$comp --num=$num --use_existing_db >$LOG 2>&1 &
+
+#After '&', $? only reflects the fork, so instead check that the
+#process is actually alive after a moment
+PID=$!
+sleep 1
+if ! kill -0 ${PID} 2>/dev/null; then
+  echo "WARNING: ${DB_BENCH} did not launch successfully! Abort!";
+  exit 1;
+fi
+
+#
+#Start the monitoring. Default is "vsz" monitoring for up to cache_size ($cs) of virtual mem
+#You could also monitor RSS and CPUTIME (bsdtime). Try 'pflag -h' for how to do this
+#
+${PFLAG} -p $PID -v
+
+rm -f $LOG;
diff --git a/src/rocksdb/tools/dump/db_dump_tool.cc b/src/rocksdb/tools/dump/db_dump_tool.cc
new file mode 100644
index 0000000..389e65d
--- /dev/null
+++ b/src/rocksdb/tools/dump/db_dump_tool.cc
@@ -0,0 +1,261 @@
+//  Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef ROCKSDB_LITE
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include <inttypes.h>
+#include <iostream>
+
+#include "rocksdb/db.h"
+#include "rocksdb/db_dump_tool.h"
+#include "rocksdb/env.h"
+#include "util/coding.h"
+
+namespace rocksdb {
+
+bool DbDumpTool::Run(const DumpOptions& dump_options,
+                     rocksdb::Options options) {
+  rocksdb::DB* dbptr;
+  rocksdb::Status status;
+  std::unique_ptr<rocksdb::WritableFile> dumpfile;
+  char hostname[1024];
+  int64_t timesec;
+  std::string abspath;
+  char json[4096];
+
+  static const char* magicstr = "ROCKDUMP";
+  static const char versionstr[8] = {0, 0, 0, 0, 0, 0, 0, 1};
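+
+  // Dump file layout, as written below:
+  //   "ROCKDUMP" (8 bytes) | version (8 bytes) | Fixed32 info size | info JSON
+  //   then, per entry: Fixed32 key size | key bytes | Fixed32 value size |
+  //   value bytes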
+
+  rocksdb::Env* env = rocksdb::Env::Default();
+
+  // Open the database
+  options.create_if_missing = false;
+  status = rocksdb::DB::OpenForReadOnly(options, dump_options.db_path, &dbptr);
+  if (!status.ok()) {
+    std::cerr << "Unable to open database '" << dump_options.db_path
+              << "' for reading: " << status.ToString() << std::endl;
+    return false;
+  }
+
+  const std::unique_ptr<rocksdb::DB> db(dbptr);
+
+  status = env->NewWritableFile(dump_options.dump_location, &dumpfile,
+                                rocksdb::EnvOptions());
+  if (!status.ok()) {
+    std::cerr << "Unable to open dump file '" << dump_options.dump_location
+              << "' for writing: " << status.ToString() << std::endl;
+    return false;
+  }
+
+  rocksdb::Slice magicslice(magicstr, 8);
+  status = dumpfile->Append(magicslice);
+  if (!status.ok()) {
+    std::cerr << "Append failed: " << status.ToString() << std::endl;
+    return false;
+  }
+
+  rocksdb::Slice versionslice(versionstr, 8);
+  status = dumpfile->Append(versionslice);
+  if (!status.ok()) {
+    std::cerr << "Append failed: " << status.ToString() << std::endl;
+    return false;
+  }
+
+  if (dump_options.anonymous) {
+    snprintf(json, sizeof(json), "{}");
+  } else {
+    status = env->GetHostName(hostname, sizeof(hostname));
+    status = env->GetCurrentTime(&timesec);
+    status = env->GetAbsolutePath(dump_options.db_path, &abspath);
+    snprintf(json, sizeof(json),
+             "{ \"database-path\": \"%s\", \"hostname\": \"%s\", "
+             "\"creation-time\": %" PRIi64 " }",
+             abspath.c_str(), hostname, timesec);
+  }
+
+  rocksdb::Slice infoslice(json, strlen(json));
+  char infosize[4];
+  rocksdb::EncodeFixed32(infosize, (uint32_t)infoslice.size());
+  rocksdb::Slice infosizeslice(infosize, 4);
+  status = dumpfile->Append(infosizeslice);
+  if (!status.ok()) {
+    std::cerr << "Append failed: " << status.ToString() << std::endl;
+    return false;
+  }
+  status = dumpfile->Append(infoslice);
+  if (!status.ok()) {
+    std::cerr << "Append failed: " << status.ToString() << std::endl;
+    return false;
+  }
+
+  const std::unique_ptr<rocksdb::Iterator> it(
+      db->NewIterator(rocksdb::ReadOptions()));
+  for (it->SeekToFirst(); it->Valid(); it->Next()) {
+    char keysize[4];
+    rocksdb::EncodeFixed32(keysize, (uint32_t)it->key().size());
+    rocksdb::Slice keysizeslice(keysize, 4);
+    status = dumpfile->Append(keysizeslice);
+    if (!status.ok()) {
+      std::cerr << "Append failed: " << status.ToString() << std::endl;
+      return false;
+    }
+    status = dumpfile->Append(it->key());
+    if (!status.ok()) {
+      std::cerr << "Append failed: " << status.ToString() << std::endl;
+      return false;
+    }
+
+    char valsize[4];
+    rocksdb::EncodeFixed32(valsize, (uint32_t)it->value().size());
+    rocksdb::Slice valsizeslice(valsize, 4);
+    status = dumpfile->Append(valsizeslice);
+    if (!status.ok()) {
+      std::cerr << "Append failed: " << status.ToString() << std::endl;
+      return false;
+    }
+    status = dumpfile->Append(it->value());
+    if (!status.ok()) {
+      std::cerr << "Append failed: " << status.ToString() << std::endl;
+      return false;
+    }
+  }
+  if (!it->status().ok()) {
+    std::cerr << "Database iteration failed: " << status.ToString()
+              << std::endl;
+    return false;
+  }
+  return true;
+}
+
+bool DbUndumpTool::Run(const UndumpOptions& undump_options,
+                       rocksdb::Options options) {
+  rocksdb::DB* dbptr;
+  rocksdb::Status status;
+  rocksdb::Env* env;
+  std::unique_ptr<rocksdb::SequentialFile> dumpfile;
+  rocksdb::Slice slice;
+  char scratch8[8];
+
+  static const char* magicstr = "ROCKDUMP";
+  static const char versionstr[8] = {0, 0, 0, 0, 0, 0, 0, 1};
+
+  env = rocksdb::Env::Default();
+
+  status = env->NewSequentialFile(undump_options.dump_location, &dumpfile,
+                                  rocksdb::EnvOptions());
+  if (!status.ok()) {
+    std::cerr << "Unable to open dump file '" << undump_options.dump_location
+              << "' for reading: " << status.ToString() << std::endl;
+    return false;
+  }
+
+  status = dumpfile->Read(8, &slice, scratch8);
+  if (!status.ok() || slice.size() != 8 ||
+      memcmp(slice.data(), magicstr, 8) != 0) {
+    std::cerr << "File '" << undump_options.dump_location
+              << "' is not a recognizable dump file." << std::endl;
+    return false;
+  }
+
+  status = dumpfile->Read(8, &slice, scratch8);
+  if (!status.ok() || slice.size() != 8 ||
+      memcmp(slice.data(), versionstr, 8) != 0) {
+    std::cerr << "File '" << undump_options.dump_location
+              << "' version not recognized." << std::endl;
+    return false;
+  }
+
+  status = dumpfile->Read(4, &slice, scratch8);
+  if (!status.ok() || slice.size() != 4) {
+    std::cerr << "Unable to read info blob size." << std::endl;
+    return false;
+  }
+  uint32_t infosize = rocksdb::DecodeFixed32(slice.data());
+  status = dumpfile->Skip(infosize);
+  if (!status.ok()) {
+    std::cerr << "Unable to skip info blob: " << status.ToString() << std::endl;
+    return false;
+  }
+
+  options.create_if_missing = true;
+  status = rocksdb::DB::Open(options, undump_options.db_path, &dbptr);
+  if (!status.ok()) {
+    std::cerr << "Unable to open database '" << undump_options.db_path
+              << "' for writing: " << status.ToString() << std::endl;
+    return false;
+  }
+
+  const std::unique_ptr<rocksdb::DB> db(dbptr);
+
+  uint32_t last_keysize = 64;
+  size_t last_valsize = 1 << 20;
+  std::unique_ptr<char[]> keyscratch(new char[last_keysize]);
+  std::unique_ptr<char[]> valscratch(new char[last_valsize]);
+
+  while (1) {
+    uint32_t keysize, valsize;
+    rocksdb::Slice keyslice;
+    rocksdb::Slice valslice;
+
+    status = dumpfile->Read(4, &slice, scratch8);
+    if (!status.ok() || slice.size() != 4) break;
+    keysize = rocksdb::DecodeFixed32(slice.data());
+    if (keysize > last_keysize) {
+      while (keysize > last_keysize) last_keysize *= 2;
+      keyscratch = std::unique_ptr<char[]>(new char[last_keysize]);
+    }
+
+    status = dumpfile->Read(keysize, &keyslice, keyscratch.get());
+    if (!status.ok() || keyslice.size() != keysize) {
+      std::cerr << "Key read failure: "
+                << (status.ok() ? "insufficient data" : status.ToString())
+                << std::endl;
+      return false;
+    }
+
+    status = dumpfile->Read(4, &slice, scratch8);
+    if (!status.ok() || slice.size() != 4) {
+      std::cerr << "Unable to read value size: "
+                << (status.ok() ? "insufficient data" : status.ToString())
+                << std::endl;
+      return false;
+    }
+    valsize = rocksdb::DecodeFixed32(slice.data());
+    if (valsize > last_valsize) {
+      while (valsize > last_valsize) last_valsize *= 2;
+      valscratch = std::unique_ptr<char[]>(new char[last_valsize]);
+    }
+
+    status = dumpfile->Read(valsize, &valslice, valscratch.get());
+    if (!status.ok() || valslice.size() != valsize) {
+      std::cerr << "Unable to read value: "
+                << (status.ok() ? "insufficient data" : status.ToString())
+                << std::endl;
+      return false;
+    }
+
+    status = db->Put(rocksdb::WriteOptions(), keyslice, valslice);
+    if (!status.ok()) {
+      fprintf(stderr, "Unable to write database entry\n");
+      return false;
+    }
+  }
+
+  if (undump_options.compact_db) {
+    status = db->CompactRange(rocksdb::CompactRangeOptions(), nullptr, nullptr);
+    if (!status.ok()) {
+      fprintf(stderr,
+              "Unable to compact the database after loading the dumped file\n");
+      return false;
+    }
+  }
+  return true;
+}
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/tools/dump/rocksdb_dump.cc b/src/rocksdb/tools/dump/rocksdb_dump.cc
new file mode 100644
index 0000000..2bfc6ce
--- /dev/null
+++ b/src/rocksdb/tools/dump/rocksdb_dump.cc
@@ -0,0 +1,63 @@
+//  Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#if !(defined GFLAGS) || defined(ROCKSDB_LITE)
+
+#include <cstdio>
+int main() {
+#ifndef GFLAGS
+  fprintf(stderr, "Please install gflags to run rocksdb tools\n");
+#endif
+#ifdef ROCKSDB_LITE
+  fprintf(stderr, "DbDumpTool is not supported in ROCKSDB_LITE\n");
+#endif
+  return 1;
+}
+
+#else
+
+#include <gflags/gflags.h>
+#include "rocksdb/convenience.h"
+#include "rocksdb/db_dump_tool.h"
+
+DEFINE_string(db_path, "", "Path to the db that will be dumped");
+DEFINE_string(dump_location, "", "Path to where the dump file will be written");
+DEFINE_bool(anonymous, false,
+            "Remove information like db path, creation time from dumped file");
+DEFINE_string(db_options, "",
+              "Options string used to open the database that will be dumped");
+
+int main(int argc, char** argv) {
+  GFLAGS::ParseCommandLineFlags(&argc, &argv, true);
+
+  if (FLAGS_db_path == "" || FLAGS_dump_location == "") {
+    fprintf(stderr, "Please set --db_path and --dump_location\n");
+    return 1;
+  }
+
+  rocksdb::DumpOptions dump_options;
+  dump_options.db_path = FLAGS_db_path;
+  dump_options.dump_location = FLAGS_dump_location;
+  dump_options.anonymous = FLAGS_anonymous;
+
+  rocksdb::Options db_options;
+  if (FLAGS_db_options != "") {
+    rocksdb::Options parsed_options;
+    rocksdb::Status s = rocksdb::GetOptionsFromString(
+        db_options, FLAGS_db_options, &parsed_options);
+    if (!s.ok()) {
+      fprintf(stderr, "Cannot parse provided db_options\n");
+      return 1;
+    }
+    db_options = parsed_options;
+  }
+
+  rocksdb::DbDumpTool tool;
+  if (!tool.Run(dump_options, db_options)) {
+    return 1;
+  }
+  return 0;
+}
+#endif  // !(defined GFLAGS) || defined(ROCKSDB_LITE)
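
The --db_options string above is handed to rocksdb::GetOptionsFromString,
which takes semicolon-separated name=value pairs naming rocksdb::Options
fields. A short sketch of that call in isolation; the two option names below
are real Options fields, but the values are illustrative only:

#include <iostream>
#include "rocksdb/convenience.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::Options base;    // defaults for anything not named in the string
  rocksdb::Options parsed;
  rocksdb::Status s = rocksdb::GetOptionsFromString(
      base, "write_buffer_size=4194304;max_write_buffer_number=2", &parsed);
  if (!s.ok()) {
    std::cerr << "Cannot parse options string: " << s.ToString() << std::endl;
    return 1;
  }
  std::cout << "write_buffer_size = " << parsed.write_buffer_size << std::endl;
  return 0;
}
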
diff --git a/src/rocksdb/tools/dump/rocksdb_undump.cc b/src/rocksdb/tools/dump/rocksdb_undump.cc
new file mode 100644
index 0000000..81034f0
--- /dev/null
+++ b/src/rocksdb/tools/dump/rocksdb_undump.cc
@@ -0,0 +1,62 @@
+//  Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#if !(defined GFLAGS) || defined(ROCKSDB_LITE)
+
+#include <cstdio>
+int main() {
+#ifndef GFLAGS
+  fprintf(stderr, "Please install gflags to run rocksdb tools\n");
+#endif
+#ifdef ROCKSDB_LITE
+  fprintf(stderr, "DbUndumpTool is not supported in ROCKSDB_LITE\n");
+#endif
+  return 1;
+}
+
+#else
+
+#include <gflags/gflags.h>
+#include "rocksdb/convenience.h"
+#include "rocksdb/db_dump_tool.h"
+
+DEFINE_string(dump_location, "", "Path to the dump file that will be loaded");
+DEFINE_string(db_path, "", "Path to the db that we will undump the file into");
+DEFINE_bool(compact, false, "Compact the db after loading the dumped file");
+DEFINE_string(db_options, "",
+              "Options string used to open the database that will be loaded");
+
+int main(int argc, char **argv) {
+  GFLAGS::ParseCommandLineFlags(&argc, &argv, true);
+
+  if (FLAGS_db_path == "" || FLAGS_dump_location == "") {
+    fprintf(stderr, "Please set --db_path and --dump_location\n");
+    return 1;
+  }
+
+  rocksdb::UndumpOptions undump_options;
+  undump_options.db_path = FLAGS_db_path;
+  undump_options.dump_location = FLAGS_dump_location;
+  undump_options.compact_db = FLAGS_compact;
+
+  rocksdb::Options db_options;
+  if (FLAGS_db_options != "") {
+    rocksdb::Options parsed_options;
+    rocksdb::Status s = rocksdb::GetOptionsFromString(
+        db_options, FLAGS_db_options, &parsed_options);
+    if (!s.ok()) {
+      fprintf(stderr, "Cannot parse provided db_options\n");
+      return 1;
+    }
+    db_options = parsed_options;
+  }
+
+  rocksdb::DbUndumpTool tool;
+  if (!tool.Run(undump_options, db_options)) {
+    return 1;
+  }
+  return 0;
+}
+#endif  // !(defined GFLAGS) || defined(ROCKSDB_LITE)
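
Both binaries are thin wrappers, so the same round trip can be driven
in-process through the rocksdb/db_dump_tool.h API they use. A minimal sketch;
every path below is a placeholder:

#include <cstdio>
#include "rocksdb/db_dump_tool.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::Options db_options;               // defaults are fine for a sketch

  rocksdb::DumpOptions dump_opts;
  dump_opts.db_path = "/tmp/src_db";         // placeholder source DB
  dump_opts.dump_location = "/tmp/db.dump";  // placeholder dump file
  dump_opts.anonymous = true;                // omit db path/creation time
  rocksdb::DbDumpTool dumper;
  if (!dumper.Run(dump_opts, db_options)) {
    fprintf(stderr, "dump failed\n");
    return 1;
  }

  rocksdb::UndumpOptions undump_opts;
  undump_opts.db_path = "/tmp/dst_db";       // placeholder destination DB
  undump_opts.dump_location = "/tmp/db.dump";
  undump_opts.compact_db = true;             // compact after loading
  rocksdb::DbUndumpTool undumper;
  if (!undumper.Run(undump_opts, db_options)) {
    fprintf(stderr, "undump failed\n");
    return 1;
  }
  return 0;
}
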
diff --git a/src/rocksdb/tools/generate_random_db.sh b/src/rocksdb/tools/generate_random_db.sh
new file mode 100755
index 0000000..28bdceb
--- /dev/null
+++ b/src/rocksdb/tools/generate_random_db.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+#
+# A shell script to load pre-generated data files into a DB using the ldb tool.
+# ./ldb needs to be available in the current directory.
+#
+# Usage: <SCRIPT> <input_data_path> <DB Path>
+
+if [ "$#" -lt 2 ]; then
+  echo "usage: $BASH_SOURCE <input_data_path> <DB Path>"
+  exit 1
+fi
+
+input_data_dir=$1
+db_dir=$2
+rm -rf $db_dir
+
+echo == Loading data from $input_data_dir to $db_dir
+
+declare -a compression_opts=("no" "snappy" "zlib" "bzip2")
+
+set -e
+
+n=0
+
+for f in `ls -1 $input_data_dir`
+do
+  echo == Loading $f with compression ${compression_opts[n % 4]}
+  ./ldb load --db=$db_dir --compression_type=${compression_opts[n % 4]} --bloom_bits=10 --auto_compaction=false --create_if_missing < $input_data_dir/$f
+  let "n = n + 1"
+done
diff --git a/src/rocksdb/tools/ldb.cc b/src/rocksdb/tools/ldb.cc
new file mode 100644
index 0000000..cb5ef52
--- /dev/null
+++ b/src/rocksdb/tools/ldb.cc
@@ -0,0 +1,21 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/ldb_tool.h"
+
+int main(int argc, char** argv) {
+  rocksdb::LDBTool tool;
+  tool.Run(argc, argv);
+  return 0;
+}
+#else
+#include <stdio.h>
+int main(int argc, char** argv) {
+  fprintf(stderr, "Not supported in lite mode.\n");
+  return 1;
+}
+#endif  // ROCKSDB_LITE
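
ldb_tool.h also lets an embedding binary preset database options before
dispatching to the ldb commands. A hedged sketch; it assumes the overload of
LDBTool::Run that accepts Options and LDBOptions, as declared in this RocksDB
version's rocksdb/ldb_tool.h:

#include "rocksdb/ldb_tool.h"
#include "rocksdb/options.h"

int main(int argc, char** argv) {
  // Options applied to every DB the embedded ldb opens (assumed overload).
  rocksdb::Options db_options;
  db_options.create_if_missing = false;
  rocksdb::LDBOptions ldb_options;
  rocksdb::LDBTool tool;
  tool.Run(argc, argv, db_options, ldb_options);
  return 0;
}
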
diff --git a/src/rocksdb/tools/ldb_test.py b/src/rocksdb/tools/ldb_test.py
new file mode 100644
index 0000000..bcf3624
--- /dev/null
+++ b/src/rocksdb/tools/ldb_test.py
@@ -0,0 +1,456 @@
+import os
+import glob
+import os.path
+import shutil
+import subprocess
+import time
+import unittest
+import tempfile
+import re
+
+def my_check_output(*popenargs, **kwargs):
+    """
+    On Python 2.7 we could simply use subprocess.check_output;
+    this is a stop-gap solution for Python 2.6.
+    """
+    if 'stdout' in kwargs:
+        raise ValueError('stdout argument not allowed, it will be overridden.')
+    process = subprocess.Popen(stderr=subprocess.PIPE, stdout=subprocess.PIPE,
+                               *popenargs, **kwargs)
+    output, unused_err = process.communicate()
+    retcode = process.poll()
+    if retcode:
+        cmd = kwargs.get("args")
+        if cmd is None:
+            cmd = popenargs[0]
+        raise Exception("Exit code is not 0.  It is %d.  Command: %s" %
+                (retcode, cmd))
+    return output
+
+def run_err_null(cmd):
+    return os.system(cmd + " 2>/dev/null ")
+
+class LDBTestCase(unittest.TestCase):
+    def setUp(self):
+        self.TMP_DIR  = tempfile.mkdtemp(prefix="ldb_test_")
+        self.DB_NAME = "testdb"
+
+    def tearDown(self):
+        assert(self.TMP_DIR.strip() != "/"
+                and self.TMP_DIR.strip() != "/tmp"
+                and self.TMP_DIR.strip() != "/tmp/") #Just some paranoia
+
+        shutil.rmtree(self.TMP_DIR)
+
+    def dbParam(self, dbName):
+        return "--db=%s" % os.path.join(self.TMP_DIR, dbName)
+
+    def assertRunOKFull(self, params, expectedOutput, unexpected=False,
+                        isPattern=False):
+        """
+        All command-line params must be specified.
+        Allows full flexibility in testing; for example: missing db param.
+
+        """
+
+        output = my_check_output("./ldb %s |grep -v \"Created bg thread\"" %
+                            params, shell=True)
+        if not unexpected:
+            if isPattern:
+                self.assertNotEqual(expectedOutput.search(output.strip()),
+                                    None)
+            else:
+                self.assertEqual(output.strip(), expectedOutput.strip())
+        else:
+            if isPattern:
+                self.assertEqual(expectedOutput.search(output.strip()), None)
+            else:
+                self.assertNotEqual(output.strip(), expectedOutput.strip())
+
+    def assertRunFAILFull(self, params):
+        """
+        All command-line params must be specified.
+        Allows full flexibility in testing; for example: missing db param.
+
+        """
+        try:
+
+            my_check_output("./ldb %s >/dev/null 2>&1 |grep -v \"Created bg \
+                thread\"" % params, shell=True)
+        except Exception:
+            return
+        self.fail(
+            "Exception should have been raised for command with params: %s" %
+            params)
+
+    def assertRunOK(self, params, expectedOutput, unexpected=False):
+        """
+        Uses the default test db.
+
+        """
+        self.assertRunOKFull("%s %s" % (self.dbParam(self.DB_NAME), params),
+                             expectedOutput, unexpected)
+
+    def assertRunFAIL(self, params):
+        """
+        Uses the default test db.
+        """
+        self.assertRunFAILFull("%s %s" % (self.dbParam(self.DB_NAME), params))
+
+    def testSimpleStringPutGet(self):
+        print "Running testSimpleStringPutGet..."
+        self.assertRunFAIL("put x1 y1")
+        self.assertRunOK("put --create_if_missing x1 y1", "OK")
+        self.assertRunOK("get x1", "y1")
+        self.assertRunFAIL("get x2")
+
+        self.assertRunOK("put x2 y2", "OK")
+        self.assertRunOK("get x1", "y1")
+        self.assertRunOK("get x2", "y2")
+        self.assertRunFAIL("get x3")
+
+        self.assertRunOK("scan --from=x1 --to=z", "x1 : y1\nx2 : y2")
+        self.assertRunOK("put x3 y3", "OK")
+
+        self.assertRunOK("scan --from=x1 --to=z", "x1 : y1\nx2 : y2\nx3 : y3")
+        self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3")
+        self.assertRunOK("scan --from=x", "x1 : y1\nx2 : y2\nx3 : y3")
+
+        self.assertRunOK("scan --to=x2", "x1 : y1")
+        self.assertRunOK("scan --from=x1 --to=z --max_keys=1", "x1 : y1")
+        self.assertRunOK("scan --from=x1 --to=z --max_keys=2",
+                "x1 : y1\nx2 : y2")
+
+        self.assertRunOK("scan --from=x1 --to=z --max_keys=3",
+                "x1 : y1\nx2 : y2\nx3 : y3")
+        self.assertRunOK("scan --from=x1 --to=z --max_keys=4",
+                "x1 : y1\nx2 : y2\nx3 : y3")
+        self.assertRunOK("scan --from=x1 --to=x2", "x1 : y1")
+        self.assertRunOK("scan --from=x2 --to=x4", "x2 : y2\nx3 : y3")
+        self.assertRunFAIL("scan --from=x4 --to=z") # No results => FAIL
+        self.assertRunFAIL("scan --from=x1 --to=z --max_keys=foo")
+
+        self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3")
+
+        self.assertRunOK("delete x1", "OK")
+        self.assertRunOK("scan", "x2 : y2\nx3 : y3")
+
+        self.assertRunOK("delete NonExistentKey", "OK")
+        # It is weird that GET and SCAN raise exception for
+        # non-existent key, while delete does not
+
+        self.assertRunOK("checkconsistency", "OK")
+
+    def dumpDb(self, params, dumpFile):
+        return 0 == run_err_null("./ldb dump %s > %s" % (params, dumpFile))
+
+    def loadDb(self, params, dumpFile):
+        return 0 == run_err_null("cat %s | ./ldb load %s" % (dumpFile, params))
+
+    def testStringBatchPut(self):
+        print "Running testStringBatchPut..."
+        self.assertRunOK("batchput x1 y1 --create_if_missing", "OK")
+        self.assertRunOK("scan", "x1 : y1")
+        self.assertRunOK("batchput x2 y2 x3 y3 \"x4 abc\" \"y4 xyz\"", "OK")
+        self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 abc : y4 xyz")
+        self.assertRunFAIL("batchput")
+        self.assertRunFAIL("batchput k1")
+        self.assertRunFAIL("batchput k1 v1 k2")
+
+    def testCountDelimDump(self):
+        print "Running testCountDelimDump..."
+        self.assertRunOK("batchput x.1 x1 --create_if_missing", "OK")
+        self.assertRunOK("batchput y.abc abc y.2 2 z.13c pqr", "OK")
+        self.assertRunOK("dump --count_delim", "x => count:1\tsize:5\ny => count:2\tsize:12\nz => count:1\tsize:8")
+        self.assertRunOK("dump --count_delim=\".\"", "x => count:1\tsize:5\ny => count:2\tsize:12\nz => count:1\tsize:8")
+        self.assertRunOK("batchput x,2 x2 x,abc xabc", "OK")
+        self.assertRunOK("dump --count_delim=\",\"", "x => count:2\tsize:14\nx.1 => count:1\tsize:5\ny.2 => count:1\tsize:4\ny.abc => count:1\tsize:8\nz.13c => count:1\tsize:8")
+
+    def testCountDelimIDump(self):
+        print "Running testCountDelimIDump..."
+        self.assertRunOK("batchput x.1 x1 --create_if_missing", "OK")
+        self.assertRunOK("batchput y.abc abc y.2 2 z.13c pqr", "OK")
+        self.assertRunOK("dump --count_delim", "x => count:1\tsize:5\ny => count:2\tsize:12\nz => count:1\tsize:8")
+        self.assertRunOK("dump --count_delim=\".\"", "x => count:1\tsize:5\ny => count:2\tsize:12\nz => count:1\tsize:8")
+        self.assertRunOK("batchput x,2 x2 x,abc xabc", "OK")
+        self.assertRunOK("dump --count_delim=\",\"", "x => count:2\tsize:14\nx.1 => count:1\tsize:5\ny.2 => count:1\tsize:4\ny.abc => count:1\tsize:8\nz.13c => count:1\tsize:8")
+
+    def testHexPutGet(self):
+        print "Running testHexPutGet..."
+        self.assertRunOK("put a1 b1 --create_if_missing", "OK")
+        self.assertRunOK("scan", "a1 : b1")
+        self.assertRunOK("scan --hex", "0x6131 : 0x6231")
+        self.assertRunFAIL("put --hex 6132 6232")
+        self.assertRunOK("put --hex 0x6132 0x6232", "OK")
+        self.assertRunOK("scan --hex", "0x6131 : 0x6231\n0x6132 : 0x6232")
+        self.assertRunOK("scan", "a1 : b1\na2 : b2")
+        self.assertRunOK("get a1", "b1")
+        self.assertRunOK("get --hex 0x6131", "0x6231")
+        self.assertRunOK("get a2", "b2")
+        self.assertRunOK("get --hex 0x6132", "0x6232")
+        self.assertRunOK("get --key_hex 0x6132", "b2")
+        self.assertRunOK("get --key_hex --value_hex 0x6132", "0x6232")
+        self.assertRunOK("get --value_hex a2", "0x6232")
+        self.assertRunOK("scan --key_hex --value_hex",
+                "0x6131 : 0x6231\n0x6132 : 0x6232")
+        self.assertRunOK("scan --hex --from=0x6131 --to=0x6133",
+                "0x6131 : 0x6231\n0x6132 : 0x6232")
+        self.assertRunOK("scan --hex --from=0x6131 --to=0x6132",
+                "0x6131 : 0x6231")
+        self.assertRunOK("scan --key_hex", "0x6131 : b1\n0x6132 : b2")
+        self.assertRunOK("scan --value_hex", "a1 : 0x6231\na2 : 0x6232")
+        self.assertRunOK("batchput --hex 0x6133 0x6233 0x6134 0x6234", "OK")
+        self.assertRunOK("scan", "a1 : b1\na2 : b2\na3 : b3\na4 : b4")
+        self.assertRunOK("delete --hex 0x6133", "OK")
+        self.assertRunOK("scan", "a1 : b1\na2 : b2\na4 : b4")
+        self.assertRunOK("checkconsistency", "OK")
+
+    def testTtlPutGet(self):
+        print "Running testTtlPutGet..."
+        self.assertRunOK("put a1 b1 --ttl --create_if_missing", "OK")
+        self.assertRunOK("scan --hex", "0x6131 : 0x6231", True)
+        self.assertRunOK("dump --ttl ", "a1 ==> b1", True)
+        self.assertRunOK("dump --hex --ttl ",
+                         "0x6131 ==> 0x6231\nKeys in range: 1")
+        self.assertRunOK("scan --hex --ttl", "0x6131 : 0x6231")
+        self.assertRunOK("get --value_hex a1", "0x6231", True)
+        self.assertRunOK("get --ttl a1", "b1")
+        self.assertRunOK("put a3 b3 --create_if_missing", "OK")
+        # fails because the timestamp's length is greater than the value's
+        self.assertRunFAIL("get --ttl a3")
+        self.assertRunOK("checkconsistency", "OK")
+
+    def testInvalidCmdLines(self):
+        print "Running testInvalidCmdLines..."
+        # db not specified
+        self.assertRunFAILFull("put 0x6133 0x6233 --hex --create_if_missing")
+        # No param called he
+        self.assertRunFAIL("put 0x6133 0x6233 --he --create_if_missing")
+        # max_keys is not applicable for put
+        self.assertRunFAIL("put 0x6133 0x6233 --max_keys=1 --create_if_missing")
+        # hex has invalid boolean value
+        self.assertRunFAIL("put 0x6133 0x6233 --hex=Boo --create_if_missing")
+
+    def testDumpLoad(self):
+        print "Running testDumpLoad..."
+        self.assertRunOK("batchput --create_if_missing x1 y1 x2 y2 x3 y3 x4 y4",
+                "OK")
+        self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
+        origDbPath = os.path.join(self.TMP_DIR, self.DB_NAME)
+
+        # Dump and load without any additional params specified
+        dumpFilePath = os.path.join(self.TMP_DIR, "dump1")
+        loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump1")
+        self.assertTrue(self.dumpDb("--db=%s" % origDbPath, dumpFilePath))
+        self.assertTrue(self.loadDb(
+            "--db=%s --create_if_missing" % loadedDbPath, dumpFilePath))
+        self.assertRunOKFull("scan --db=%s" % loadedDbPath,
+                "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
+
+        # Dump and load in hex
+        dumpFilePath = os.path.join(self.TMP_DIR, "dump2")
+        loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump2")
+        self.assertTrue(self.dumpDb("--db=%s --hex" % origDbPath, dumpFilePath))
+        self.assertTrue(self.loadDb(
+            "--db=%s --hex --create_if_missing" % loadedDbPath, dumpFilePath))
+        self.assertRunOKFull("scan --db=%s" % loadedDbPath,
+                "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
+
+        # Dump only a portion of the key range
+        dumpFilePath = os.path.join(self.TMP_DIR, "dump3")
+        loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump3")
+        self.assertTrue(self.dumpDb(
+            "--db=%s --from=x1 --to=x3" % origDbPath, dumpFilePath))
+        self.assertTrue(self.loadDb(
+            "--db=%s --create_if_missing" % loadedDbPath, dumpFilePath))
+        self.assertRunOKFull("scan --db=%s" % loadedDbPath, "x1 : y1\nx2 : y2")
+
+        # Dump up to max_keys rows
+        dumpFilePath = os.path.join(self.TMP_DIR, "dump4")
+        loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump4")
+        self.assertTrue(self.dumpDb(
+            "--db=%s --max_keys=3" % origDbPath, dumpFilePath))
+        self.assertTrue(self.loadDb(
+            "--db=%s --create_if_missing" % loadedDbPath, dumpFilePath))
+        self.assertRunOKFull("scan --db=%s" % loadedDbPath,
+                "x1 : y1\nx2 : y2\nx3 : y3")
+
+        # Load into an existing db, create_if_missing is not specified
+        self.assertTrue(self.dumpDb("--db=%s" % origDbPath, dumpFilePath))
+        self.assertTrue(self.loadDb("--db=%s" % loadedDbPath, dumpFilePath))
+        self.assertRunOKFull("scan --db=%s" % loadedDbPath,
+                "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
+
+        # Dump and load with WAL disabled
+        dumpFilePath = os.path.join(self.TMP_DIR, "dump5")
+        loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump5")
+        self.assertTrue(self.dumpDb("--db=%s" % origDbPath, dumpFilePath))
+        self.assertTrue(self.loadDb(
+            "--db=%s --disable_wal --create_if_missing" % loadedDbPath,
+            dumpFilePath))
+        self.assertRunOKFull("scan --db=%s" % loadedDbPath,
+                "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
+
+        # Dump and load with lots of extra params specified
+        extraParams = " ".join(["--bloom_bits=14", "--block_size=1024",
+                                "--auto_compaction=true",
+                                "--write_buffer_size=4194304",
+                                "--file_size=2097152"])
+        dumpFilePath = os.path.join(self.TMP_DIR, "dump6")
+        loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump6")
+        self.assertTrue(self.dumpDb(
+            "--db=%s %s" % (origDbPath, extraParams), dumpFilePath))
+        self.assertTrue(self.loadDb(
+            "--db=%s %s --create_if_missing" % (loadedDbPath, extraParams),
+            dumpFilePath))
+        self.assertRunOKFull("scan --db=%s" % loadedDbPath,
+                "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
+
+        # Dump with count_only
+        dumpFilePath = os.path.join(self.TMP_DIR, "dump7")
+        loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump7")
+        self.assertTrue(self.dumpDb(
+            "--db=%s --count_only" % origDbPath, dumpFilePath))
+        self.assertTrue(self.loadDb(
+            "--db=%s --create_if_missing" % loadedDbPath, dumpFilePath))
+        # DB should have at least one value for scan to work
+        self.assertRunOKFull("put --db=%s k1 v1" % loadedDbPath, "OK")
+        self.assertRunOKFull("scan --db=%s" % loadedDbPath, "k1 : v1")
+
+        # Dump command fails because of typo in params
+        dumpFilePath = os.path.join(self.TMP_DIR, "dump8")
+        self.assertFalse(self.dumpDb(
+            "--db=%s --create_if_missing" % origDbPath, dumpFilePath))
+
+    def testMiscAdminTask(self):
+        print "Running testMiscAdminTask..."
+        # These tests need to be improved; for example with asserts about
+        # whether compaction or level reduction actually took place.
+        self.assertRunOK("batchput --create_if_missing x1 y1 x2 y2 x3 y3 x4 y4",
+                "OK")
+        self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
+        origDbPath = os.path.join(self.TMP_DIR, self.DB_NAME)
+
+        self.assertTrue(0 == run_err_null(
+            "./ldb compact --db=%s" % origDbPath))
+        self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
+
+        self.assertTrue(0 == run_err_null(
+            "./ldb reduce_levels --db=%s --new_levels=2" % origDbPath))
+        self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
+
+        self.assertTrue(0 == run_err_null(
+            "./ldb reduce_levels --db=%s --new_levels=3" % origDbPath))
+        self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
+
+        self.assertTrue(0 == run_err_null(
+            "./ldb compact --db=%s --from=x1 --to=x3" % origDbPath))
+        self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
+
+        self.assertTrue(0 == run_err_null(
+            "./ldb compact --db=%s --hex --from=0x6131 --to=0x6134"
+            % origDbPath))
+        self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
+
+        # TODO(dilip): Not sure what should be passed to WAL. Currently corrupted.
+        self.assertTrue(0 == run_err_null(
+            "./ldb dump_wal --db=%s --walfile=%s --header" % (
+                origDbPath, os.path.join(origDbPath, "LOG"))))
+        self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4")
+
+    def testCheckConsistency(self):
+        print "Running testCheckConsistency..."
+
+        dbPath = os.path.join(self.TMP_DIR, self.DB_NAME)
+        self.assertRunOK("put x1 y1 --create_if_missing", "OK")
+        self.assertRunOK("put x2 y2", "OK")
+        self.assertRunOK("get x1", "y1")
+        self.assertRunOK("checkconsistency", "OK")
+
+        sstFilePath = my_check_output("ls %s" % os.path.join(dbPath, "*.sst"),
+                                      shell=True)
+
+        # Modify the file
+        my_check_output("echo 'evil' > %s" % sstFilePath, shell=True)
+        self.assertRunFAIL("checkconsistency")
+
+        # Delete the file
+        my_check_output("rm -f %s" % sstFilePath, shell=True)
+        self.assertRunFAIL("checkconsistency")
+
+    def dumpLiveFiles(self, params, dumpFile):
+        return 0 == run_err_null("./ldb dump_live_files %s > %s" % (
+            params, dumpFile))
+
+    def testDumpLiveFiles(self):
+        print "Running testDumpLiveFiles..."
+
+        dbPath = os.path.join(self.TMP_DIR, self.DB_NAME)
+        self.assertRunOK("put x1 y1 --create_if_missing", "OK")
+        self.assertRunOK("put x2 y2", "OK")
+        dumpFilePath = os.path.join(self.TMP_DIR, "dump1")
+        self.assertTrue(self.dumpLiveFiles("--db=%s" % dbPath, dumpFilePath))
+        self.assertRunOK("delete x1", "OK")
+        self.assertRunOK("put x3 y3", "OK")
+        dumpFilePath = os.path.join(self.TMP_DIR, "dump2")
+        self.assertTrue(self.dumpLiveFiles("--db=%s" % dbPath, dumpFilePath))
+
+    def getManifests(self, directory):
+        return glob.glob(directory + "/MANIFEST-*")
+
+    def copyManifests(self, src, dest):
+        return 0 == run_err_null("cp " + src + " " + dest)
+
+    def testManifestDump(self):
+        print "Running testManifestDump..."
+        dbPath = os.path.join(self.TMP_DIR, self.DB_NAME)
+        self.assertRunOK("put 1 1 --create_if_missing", "OK")
+        self.assertRunOK("put 2 2", "OK")
+        self.assertRunOK("put 3 3", "OK")
+        # Pattern to expect from manifest_dump.
+        num = "[0-9]+"
+        st = ".*"
+        subpat = st + " @ " + num + ": " + num
+        regex = num + ":" + num + "\[" + subpat + ".." + subpat + "\]"
+        expected_pattern = re.compile(regex)
+        cmd = "manifest_dump --db=%s"
+        manifest_files = self.getManifests(dbPath)
+        self.assertTrue(len(manifest_files) == 1)
+        # Test with the default manifest file in dbPath.
+        self.assertRunOKFull(cmd % dbPath, expected_pattern,
+                             unexpected=False, isPattern=True)
+        self.copyManifests(manifest_files[0], manifest_files[0] + "1")
+        manifest_files = self.getManifests(dbPath)
+        self.assertTrue(len(manifest_files) == 2)
+        # Test with multiple manifest files in dbPath.
+        self.assertRunFAILFull(cmd % dbPath)
+        # Running it with the copy we just created should pass.
+        self.assertRunOKFull((cmd + " --path=%s")
+                             % (dbPath, manifest_files[1]),
+                             expected_pattern, unexpected=False,
+                             isPattern=True)
+
+    def testListColumnFamilies(self):
+        print "Running testListColumnFamilies..."
+        dbPath = os.path.join(self.TMP_DIR, self.DB_NAME)
+        self.assertRunOK("put x1 y1 --create_if_missing", "OK")
+        cmd = "list_column_families %s | grep -v \"Column families\""
+        # Test on valid dbPath.
+        self.assertRunOKFull(cmd % dbPath, "{default}")
+        # Test on empty path.
+        self.assertRunFAILFull(cmd % "")
+
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/src/rocksdb/tools/pflag b/src/rocksdb/tools/pflag
new file mode 100755
index 0000000..adfac23
--- /dev/null
+++ b/src/rocksdb/tools/pflag
@@ -0,0 +1,217 @@
+#!/bin/bash
+#
+#(c) 2004-present, Facebook, all rights reserved. 
+# See the LICENSE file for usage and distribution rights.
+#
+
+trap 'echo "Caught exception, dying"; exit' 1 2 3 15
+
+ME=`basename $0`
+SERVER=`hostname`
+
+#parameters used
+#
+Dump_Config=0
+DEBUG=
+OS=`/bin/uname -s`
+VMEM=
+RSS=
+CPU=
+VERBOSE=
+VAR=
+LIMIT=
+ACTION=
+N=
+WAIT=
+
+#
+#supported OS: Linux only for now. Easy to add
+#
+oscheck() {
+  case ${OS} in
+    Linux)
+     VMEM=vsz
+     RSS=rss
+     CPU=bsdtime
+     ;;
+    *)
+      die "Unsupported OS ${OS}. Send a bug report with OS you need supported. Thanks."
+      ;;
+  esac
+}
+
+
+verbose() {
+  if [ "x$DEBUG" != "x" ]; then
+    echo "$@" >&2
+  fi
+}
+
+warn() {
+  echo "$@" >&2
+}
+
+die() {
+    echo "ERROR: " "$@" >&2;
+    exit;
+}
+
+dump_config() {
+  cat <<EOCONFIG;
+$ME running on ${HOSTNAME} at `date`
+
+Configuration for this run:
+  PID to monitor     : ${PID}
+  Resource monitored : ${VAR}
+  Resource limit     : ${LIMIT}
+  Check every        : ${WAIT} seconds
+  No. of times run   : ${N}
+  What to do         : ${ACTION}
+EOCONFIG
+
+}
+
+usage() {
+  cat <<USAGE; exit
+$@
+
+Usage ${ME} -p pid [-x {VMEM|RSS|CPU}] -l limit [-a {warn|die|kill}] [-n cycles] [-w wait]
+
+Monitor a process for set of violations. Options:
+
+  -p: PID of process to monitor
+
+  -x: metric to sense. Currently only VMEM/RSS/CPU are supported. Defaults to VMEM
+
+  -l: what is the threshold/limit for the metric that is being sensed.
+    Examples: "-l 100m", "-l 1.5g" (for VMEM/RSS), "-l 5:04" 5:04 in BSDTIME for CPU
+    NOTE: defaults to 1GB
+
+  -a: action. Currently {warn|die|kill} are supported. 
+    The default action is to 'warn'. Here is the behavior:
+
+    warn: complain if usage exceeds threshold, but continue monitoring
+    kill: complain, kill the db_bench process and exit
+    die:  if usage exceeds threshold, die immediately
+
+  -n: number of cycles to monitor. Default is to monitor until PID no longer exists.
+
+  -w: wait time per cycle of monitoring. Default is 5 seconds.
+
+  -v: verbose messaging
+
+USAGE
+
+}
+
+#set default values if none given
+set_defaults_if_noopt_given() {
+
+  : ${VAR:=vsz}
+  : ${LIMIT:=1024000}
+  : ${WAIT:=5}
+  : ${N:=999999}
+  : ${ACTION:=warn}
+}
+
+validate_options() {
+  if [ "x$PID" = "x" -a $Dump_Config -ne 1 ]; then
+    usage "PID is mandatory"
+  fi
+}
+
+###### START
+
+
+  while getopts ":p:x:l:a:n:w:vhd" opt; do
+    case $opt in
+      d)
+          Dump_Config=1
+          ;;
+      h)
+          usage;
+          ;;
+      a)
+        ACTION=${OPTARG};
+        ;;
+      v)
+        DEBUG=1;
+        ;;
+      p)
+        PID=$OPTARG;
+        ;;
+      x)
+        VAR=$OPTARG;
+        ;;
+      l)
+        LIMIT=$OPTARG;
+        ;;
+      w)
+        WAIT=$OPTARG;
+        ;;
+      n)
+        N=$OPTARG;
+        ;;
+      \?) 
+        usage;
+        ;;
+    esac
+  done
+
+oscheck;
+set_defaults_if_noopt_given;
+validate_options;
+
+if [ $Dump_Config -eq 1 ]; then
+    dump_config;
+    exit;
+fi
+
+Done=0
+
+verbose "Trying ${N} times, Waiting ${WAIT} seconds each iteration";
+
+while [ $Done -eq 0 ]; do
+  VAL=`/bin/ps h -p $PID -o ${VAR} | perl -pe 'chomp; s/(.*)m/$1 * 1024/e; s/(.*)g/$1 * 1024 * 1024/e;'`
+  if [ ${VAL:=0} -eq 0 ]; then
+    warn "Process $PID ended without incident."
+    Done=1;
+    break;
+  fi
+
+  if [ $VAL -ge $LIMIT ]; then
+    Done=1;
+  else
+    echo "Value of '${VAR}' (${VAL}) is less than ${LIMIT} for PID ${PID}"
+    sleep $WAIT;
+  fi
+  if [ $Done -eq 1 ]; then
+
+    if [ "$ACTION" = "kill" ]; then
+        kill ${PID} || kill -3 ${PID}
+        exit;
+
+    elif [ "$ACTION" = "warn" ]; then
+
+      # go back to monitoring.
+
+      warn "`date` WARNING: ${VAR} breached threshold ${LIMIT}, actual is ${VAL}"
+      Done=0  #go back to monitoring
+
+    elif [ "$ACTION" = "die" ]; then
+      warn "WARNING: dying without killing process ${PID} on ${SERVER}"
+      warn "The process details are below: "
+      warn "`ps -p ${PID} -o pid,ppid,bsdtime,rss,vsz,cmd,args`"
+      warn ""
+
+      #should we send email/notify someone? TODO... for now, bail.
+
+      exit -1;
+
+    fi
+  else
+      :
+      #warn "INFO: PID $PID, $VAR = $VAL, limit ($LIMIT) not exceeded";
+  fi
+done
+
diff --git a/src/rocksdb/tools/reduce_levels_test.cc b/src/rocksdb/tools/reduce_levels_test.cc
new file mode 100644
index 0000000..f3091ed
--- /dev/null
+++ b/src/rocksdb/tools/reduce_levels_test.cc
@@ -0,0 +1,217 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/db.h"
+#include "db/db_impl.h"
+#include "db/version_set.h"
+#include "util/logging.h"
+#include "util/testutil.h"
+#include "util/testharness.h"
+#include "util/ldb_cmd.h"
+
+namespace rocksdb {
+
+class ReduceLevelTest : public testing::Test {
+public:
+  ReduceLevelTest() {
+    dbname_ = test::TmpDir() + "/db_reduce_levels_test";
+    DestroyDB(dbname_, Options());
+    db_ = nullptr;
+  }
+
+  Status OpenDB(bool create_if_missing, int levels);
+
+  Status Put(const std::string& k, const std::string& v) {
+    return db_->Put(WriteOptions(), k, v);
+  }
+
+  std::string Get(const std::string& k) {
+    ReadOptions options;
+    std::string result;
+    Status s = db_->Get(options, k, &result);
+    if (s.IsNotFound()) {
+      result = "NOT_FOUND";
+    } else if (!s.ok()) {
+      result = s.ToString();
+    }
+    return result;
+  }
+
+  Status Flush() {
+    if (db_ == nullptr) {
+      return Status::InvalidArgument("DB not opened.");
+    }
+    DBImpl* db_impl = reinterpret_cast<DBImpl*>(db_);
+    return db_impl->TEST_FlushMemTable();
+  }
+
+  void MoveL0FileToLevel(int level) {
+    DBImpl* db_impl = reinterpret_cast<DBImpl*>(db_);
+    for (int i = 0; i < level; ++i) {
+      ASSERT_OK(db_impl->TEST_CompactRange(i, nullptr, nullptr));
+    }
+  }
+
+  void CloseDB() {
+    if (db_ != nullptr) {
+      delete db_;
+      db_ = nullptr;
+    }
+  }
+
+  bool ReduceLevels(int target_level);
+
+  int FilesOnLevel(int level) {
+    std::string property;
+    EXPECT_TRUE(db_->GetProperty(
+        "rocksdb.num-files-at-level" + NumberToString(level), &property));
+    return atoi(property.c_str());
+  }
+
+private:
+  std::string dbname_;
+  DB* db_;
+};
+
+Status ReduceLevelTest::OpenDB(bool create_if_missing, int num_levels) {
+  rocksdb::Options opt;
+  opt.num_levels = num_levels;
+  opt.create_if_missing = create_if_missing;
+  rocksdb::Status st = rocksdb::DB::Open(opt, dbname_, &db_);
+  if (!st.ok()) {
+    fprintf(stderr, "Can't open the db:%s\n", st.ToString().c_str());
+  }
+  return st;
+}
+
+bool ReduceLevelTest::ReduceLevels(int target_level) {
+  std::vector<std::string> args = rocksdb::ReduceDBLevelsCommand::PrepareArgs(
+      dbname_, target_level, false);
+  LDBCommand* level_reducer = LDBCommand::InitFromCmdLineArgs(
+      args, Options(), LDBOptions());
+  level_reducer->Run();
+  bool is_succeed = level_reducer->GetExecuteState().IsSucceed();
+  delete level_reducer;
+  return is_succeed;
+}
+
+TEST_F(ReduceLevelTest, Last_Level) {
+  ASSERT_OK(OpenDB(true, 4));
+  ASSERT_OK(Put("aaaa", "11111"));
+  Flush();
+  MoveL0FileToLevel(3);
+  ASSERT_EQ(FilesOnLevel(3), 1);
+  CloseDB();
+
+  ASSERT_TRUE(ReduceLevels(3));
+  ASSERT_OK(OpenDB(true, 3));
+  ASSERT_EQ(FilesOnLevel(2), 1);
+  CloseDB();
+
+  ASSERT_TRUE(ReduceLevels(2));
+  ASSERT_OK(OpenDB(true, 2));
+  ASSERT_EQ(FilesOnLevel(1), 1);
+  CloseDB();
+}
+
+TEST_F(ReduceLevelTest, Top_Level) {
+  ASSERT_OK(OpenDB(true, 5));
+  ASSERT_OK(Put("aaaa", "11111"));
+  Flush();
+  ASSERT_EQ(FilesOnLevel(0), 1);
+  CloseDB();
+
+  ASSERT_TRUE(ReduceLevels(4));
+  ASSERT_OK(OpenDB(true, 4));
+  CloseDB();
+
+  ASSERT_TRUE(ReduceLevels(3));
+  ASSERT_OK(OpenDB(true, 3));
+  CloseDB();
+
+  ASSERT_TRUE(ReduceLevels(2));
+  ASSERT_OK(OpenDB(true, 2));
+  CloseDB();
+}
+
+TEST_F(ReduceLevelTest, All_Levels) {
+  ASSERT_OK(OpenDB(true, 5));
+  ASSERT_OK(Put("a", "a11111"));
+  ASSERT_OK(Flush());
+  MoveL0FileToLevel(4);
+  ASSERT_EQ(FilesOnLevel(4), 1);
+  CloseDB();
+
+  ASSERT_OK(OpenDB(true, 5));
+  ASSERT_OK(Put("b", "b11111"));
+  ASSERT_OK(Flush());
+  MoveL0FileToLevel(3);
+  ASSERT_EQ(FilesOnLevel(3), 1);
+  ASSERT_EQ(FilesOnLevel(4), 1);
+  CloseDB();
+
+  ASSERT_OK(OpenDB(true, 5));
+  ASSERT_OK(Put("c", "c11111"));
+  ASSERT_OK(Flush());
+  MoveL0FileToLevel(2);
+  ASSERT_EQ(FilesOnLevel(2), 1);
+  ASSERT_EQ(FilesOnLevel(3), 1);
+  ASSERT_EQ(FilesOnLevel(4), 1);
+  CloseDB();
+
+  ASSERT_OK(OpenDB(true, 5));
+  ASSERT_OK(Put("d", "d11111"));
+  ASSERT_OK(Flush());
+  MoveL0FileToLevel(1);
+  ASSERT_EQ(FilesOnLevel(1), 1);
+  ASSERT_EQ(FilesOnLevel(2), 1);
+  ASSERT_EQ(FilesOnLevel(3), 1);
+  ASSERT_EQ(FilesOnLevel(4), 1);
+  CloseDB();
+
+  ASSERT_TRUE(ReduceLevels(4));
+  ASSERT_OK(OpenDB(true, 4));
+  ASSERT_EQ("a11111", Get("a"));
+  ASSERT_EQ("b11111", Get("b"));
+  ASSERT_EQ("c11111", Get("c"));
+  ASSERT_EQ("d11111", Get("d"));
+  CloseDB();
+
+  ASSERT_TRUE(ReduceLevels(3));
+  ASSERT_OK(OpenDB(true, 3));
+  ASSERT_EQ("a11111", Get("a"));
+  ASSERT_EQ("b11111", Get("b"));
+  ASSERT_EQ("c11111", Get("c"));
+  ASSERT_EQ("d11111", Get("d"));
+  CloseDB();
+
+  ASSERT_TRUE(ReduceLevels(2));
+  ASSERT_OK(OpenDB(true, 2));
+  ASSERT_EQ("a11111", Get("a"));
+  ASSERT_EQ("b11111", Get("b"));
+  ASSERT_EQ("c11111", Get("c"));
+  ASSERT_EQ("d11111", Get("d"));
+  CloseDB();
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int argc, char** argv) {
+  fprintf(stderr, "SKIPPED as LDBCommand is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/tools/rocksdb_dump_test.sh b/src/rocksdb/tools/rocksdb_dump_test.sh
new file mode 100755
index 0000000..5c8b5c3
--- /dev/null
+++ b/src/rocksdb/tools/rocksdb_dump_test.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+TESTDIR=`mktemp -d /tmp/rocksdb-dump-test.XXXXX`
+DUMPFILE="tools/sample-dump.dmp"
+
+# Verify that the sample dump file is undumpable and then redumpable.
+./rocksdb_undump --dump_location=$DUMPFILE --db_path=$TESTDIR/db
+./rocksdb_dump --anonymous --db_path=$TESTDIR/db --dump_location=$TESTDIR/dump
+cmp $DUMPFILE $TESTDIR/dump
diff --git a/src/rocksdb/tools/run_flash_bench.sh b/src/rocksdb/tools/run_flash_bench.sh
new file mode 100755
index 0000000..b80eee6
--- /dev/null
+++ b/src/rocksdb/tools/run_flash_bench.sh
@@ -0,0 +1,282 @@
+#!/bin/bash
+# REQUIRE: benchmark.sh exists in the current directory
+# After execution of this script, log files are generated in $output_dir.
+# report.txt provides high-level statistics.
+
+# This should be run from the parent of the tools directory. The command line is:
+#   [$env_vars] tools/run_flash_bench.sh [list-of-threads]
+#
+# This runs a sequence of tests in the following sequence:
+#   step 1) load - bulkload, compact, fillseq, overwrite
+#   step 2) read-only for each number of threads
+#   step 3) read-write for each number of threads
+#   step 4) merge for each number of threads
+#
+# The list of threads is optional and when not set is equivalent to "24".
+# If list-of-threads is specified as "1 2 4" then the tests in steps 2, 3 and
+# 4 above are repeated for 1, 2 and 4 threads. The tests in step 1 are only
+# run for 1 thread.
+
+# Test output is written to $OUTPUT_DIR, currently /tmp/output. The performance
+# summary is in $OUTPUT_DIR/report.txt. There is one file in $OUTPUT_DIR per
+# test and the tests are listed below.
+#
+# The environment variables are also optional. The variables are:
+#   NKEYS         - number of key/value pairs to load
+#   NWRITESPERSEC - the writes/second rate limit for the *whilewriting* tests.
+#                   If this is too large then the non-writer threads can get
+#                   starved.
+#   NSECONDS      - number of seconds for which to run each test in steps 2,
+#                   3 and 4. There are currently 15 tests in those steps and
+#                   they are repeated for each entry in list-of-threads so
+#                   this variable lets you control the total duration to
+#                   finish the benchmark.
+#   RANGE_LIMIT   - the number of rows to read per range query for tests that
+#                   do range queries.
+#   VAL_SIZE      - the length of the value in the key/value pairs loaded.
+#                   You can estimate the size of the test database from this,
+#                   NKEYS and the compression rate (--compression_ratio) set
+#                   in tools/benchmark.sh
+#   BLOCK_LENGTH  - value for db_bench --block_size
+#   CACHE_BYTES   - the size of the RocksDB block cache in bytes
+#   DATA_DIR      - directory in which to create database files
+#   LOG_DIR       - directory in which to create WAL files, may be the same
+#                   as DATA_DIR
+#   DO_SETUP      - when set to 0, a backup of the database is copied from
+#                   $DATA_DIR.bak to $DATA_DIR and the load tests from step 1
+#                   are skipped. The WAL directory is also copied from a backup
+#                   if DATA_DIR != LOG_DIR. This allows the tests from steps 2,
+#                   3 and 4 to be repeated faster.
+#   SAVE_SETUP    - saves a copy of the database at the end of step 1 to
+#                   $DATA_DIR.bak. When LOG_DIR != DATA_DIR then it is copied
+#                   to $LOG_DIR.bak.
+
+# Size constants
+K=1024
+M=$((1024 * K))
+G=$((1024 * M))
+
+num_keys=${NKEYS:-$((1 * G))}
+wps=${NWRITESPERSEC:-$((10 * K))}
+duration=${NSECONDS:-$((60 * 60))}
+nps=${RANGE_LIMIT:-10}
+vs=${VAL_SIZE:-400}
+cs=${CACHE_BYTES:-$(( 1 * G ))}
+bs=${BLOCK_LENGTH:-8192}
+
+# If no command line arguments then run for 24 threads.
+if [[ $# -eq 0 ]]; then
+  nthreads=( 24 )
+else
+  nthreads=( "$@" )
+fi
+
+for num_thr in "${nthreads[@]}" ; do
+  echo Will run for $num_thr threads
+done
+
+# Update these parameters before execution !!!
+db_dir=${DATA_DIR:-"/tmp/rocksdb/"}
+wal_dir=${LOG_DIR:-"/tmp/rocksdb/"}
+
+do_setup=${DO_SETUP:-1}
+save_setup=${SAVE_SETUP:-0}
+
+output_dir="/tmp/output"
+
+ARGS="\
+OUTPUT_DIR=$output_dir \
+NUM_KEYS=$num_keys \
+DB_DIR=$db_dir \
+WAL_DIR=$wal_dir \
+VALUE_SIZE=$vs \
+BLOCK_SIZE=$bs \
+CACHE_SIZE=$cs"
+
+mkdir -p $output_dir
+echo -e "ops/sec\tmb/sec\tSize-GB\tL0_GB\tSum_GB\tW-Amp\tW-MB/s\tusec/op\tp50\tp75\tp99\tp99.9\tp99.99\tUptime\tStall-time\tStall%\tTest" \
+  > $output_dir/report.txt
+
+# Notes on test sequence:
+#   step 1) Setup database via sequential fill followed by overwrite to fragment it.
+#           Done without setting DURATION to make sure that overwrite does $num_keys writes
+#   step 2) read-only tests for all levels of concurrency requested
+#   step 3) non read-only tests for all levels of concurrency requested
+#   step 4) merge tests for all levels of concurrency requested. These must come last.
+
+###### Setup the database
+
+if [[ $do_setup != 0 ]]; then
+  echo Doing setup
+
+  # Test 1: bulk load
+  env $ARGS ./tools/benchmark.sh bulkload
+
+  # Test 2a: sequential fill with large values to get peak ingest
+  #          adjust NUM_KEYS given the use of larger values
+  env $ARGS BLOCK_SIZE=$((1 * M)) VALUE_SIZE=$((32 * K)) NUM_KEYS=$(( num_keys / 64 )) \
+       ./tools/benchmark.sh fillseq
+
+  # Test 2b: sequential fill with the configured value size
+  env $ARGS ./tools/benchmark.sh fillseq
+
+  # Test 3: single-threaded overwrite
+  env $ARGS NUM_THREADS=1 DB_BENCH_NO_SYNC=1 ./tools/benchmark.sh overwrite
+
+else
+  echo Restoring from backup
+
+  rm -rf $db_dir
+
+  if [ ! -d ${db_dir}.bak ]; then
+    echo Database backup does not exist at ${db_dir}.bak
+    exit -1
+  fi
+
+  echo Restore database from ${db_dir}.bak
+  cp -p -r ${db_dir}.bak $db_dir
+
+  if [[ $db_dir != $wal_dir ]]; then
+    rm -rf $wal_dir
+
+    if [ ! -d ${wal_dir}.bak ]; then
+      echo WAL backup does not exist at ${wal_dir}.bak
+      exit -1
+    fi
+
+    echo Restore WAL from ${wal_dir}.bak
+    cp -p -r ${wal_dir}.bak $wal_dir
+  fi
+fi
+
+if [[ $save_setup != 0 ]]; then
+  echo Save database to ${db_dir}.bak
+  cp -p -r $db_dir ${db_dir}.bak
+
+  if [[ $db_dir != $wal_dir ]]; then
+    echo Save WAL to ${wal_dir}.bak
+    cp -p -r $wal_dir ${wal_dir}.bak
+  fi
+fi
+
+###### Read-only tests
+
+for num_thr in "${nthreads[@]}" ; do
+  # Test 4: random read
+  env $ARGS DURATION=$duration NUM_THREADS=$num_thr ./tools/benchmark.sh readrandom
+
+  # Test 5: random range scans
+  env $ARGS DURATION=$duration NUM_THREADS=$num_thr NUM_NEXTS_PER_SEEK=$nps \
+    ./tools/benchmark.sh fwdrange
+
+  # Test 6: random reverse range scans
+  env $ARGS DURATION=$duration NUM_THREADS=$num_thr NUM_NEXTS_PER_SEEK=$nps \
+    ./tools/benchmark.sh revrange
+done
+
+###### Non read-only tests
+
+for num_thr in "${nthreads[@]}" ; do
+  # Test 7: overwrite with sync=0
+  env $ARGS DURATION=$duration NUM_THREADS=$num_thr DB_BENCH_NO_SYNC=1 \
+    ./tools/benchmark.sh overwrite
+
+  # Test 8: overwrite with sync=1
+  env $ARGS DURATION=$duration NUM_THREADS=$num_thr ./tools/benchmark.sh overwrite
+
+  # Test 9: random update with sync=0
+  env $ARGS DURATION=$duration NUM_THREADS=$num_thr DB_BENCH_NO_SYNC=1 \
+    ./tools/benchmark.sh updaterandom
+
+  # Test 10: random update with sync=1
+  env $ARGS DURATION=$duration NUM_THREADS=$num_thr ./tools/benchmark.sh updaterandom
+
+  # Test 11: random read while writing
+  env $ARGS DURATION=$duration NUM_THREADS=$num_thr WRITES_PER_SECOND=$wps \
+    DB_BENCH_NO_SYNC=1 ./tools/benchmark.sh readwhilewriting
+
+  # Test 12: range scan while writing
+  env $ARGS DURATION=$duration NUM_THREADS=$num_thr WRITES_PER_SECOND=$wps \
+    DB_BENCH_NO_SYNC=1 NUM_NEXTS_PER_SEEK=$nps ./tools/benchmark.sh fwdrangewhilewriting
+
+  # Test 13: reverse range scan while writing
+  env $ARGS DURATION=$duration NUM_THREADS=$num_thr WRITES_PER_SECOND=$wps \
+    DB_BENCH_NO_SYNC=1 NUM_NEXTS_PER_SEEK=$nps ./tools/benchmark.sh revrangewhilewriting
+done
+
+###### Merge tests
+
+for num_thr in "${nthreads[@]}" ; do
+  # Test 14: random merge with sync=0
+  env $ARGS DURATION=$duration NUM_THREADS=$num_thr DB_BENCH_NO_SYNC=1 \
+    ./tools/benchmark.sh mergerandom
+
+  # Test 15: random merge with sync=1
+  env $ARGS DURATION=$duration NUM_THREADS=$num_thr ./tools/benchmark.sh mergerandom
+
+  # Test 16: random read while merging 
+  env $ARGS DURATION=$duration NUM_THREADS=$num_thr WRITES_PER_SECOND=$wps \
+    DB_BENCH_NO_SYNC=1 ./tools/benchmark.sh readwhilemerging
+
+  # Test 17: range scan while merging 
+  env $ARGS DURATION=$duration NUM_THREADS=$num_thr WRITES_PER_SECOND=$wps \
+    DB_BENCH_NO_SYNC=1 NUM_NEXTS_PER_SEEK=$nps ./tools/benchmark.sh fwdrangewhilemerging
+
+  # Test 18: reverse range scan while merging 
+  env $ARGS DURATION=$duration NUM_THREADS=$num_thr WRITES_PER_SECOND=$wps \
+    DB_BENCH_NO_SYNC=1 NUM_NEXTS_PER_SEEK=$nps ./tools/benchmark.sh revrangewhilemerging
+done
+
+echo bulkload > $output_dir/report2.txt
+head -1 $output_dir/report.txt >> $output_dir/report2.txt
+grep bulkload $output_dir/report.txt >> $output_dir/report2.txt
+echo fillseq >> $output_dir/report2.txt
+head -1 $output_dir/report.txt >> $output_dir/report2.txt
+grep fillseq $output_dir/report.txt >> $output_dir/report2.txt
+echo overwrite sync=0 >> $output_dir/report2.txt
+head -1 $output_dir/report.txt >> $output_dir/report2.txt
+grep overwrite $output_dir/report.txt | grep \.s0  >> $output_dir/report2.txt
+echo overwrite sync=1 >> $output_dir/report2.txt
+head -1 $output_dir/report.txt >> $output_dir/report2.txt
+grep overwrite $output_dir/report.txt | grep \.s1  >> $output_dir/report2.txt
+echo updaterandom sync=0 >> $output_dir/report2.txt
+head -1 $output_dir/report.txt >> $output_dir/report2.txt
+grep updaterandom $output_dir/report.txt | grep \.s0 >> $output_dir/report2.txt
+echo updaterandom sync=1 >> $output_dir/report2.txt
+head -1 $output_dir/report.txt >> $output_dir/report2.txt
+grep updaterandom $output_dir/report.txt | grep \.s1 >> $output_dir/report2.txt
+echo mergerandom sync=0 >> $output_dir/report2.txt
+head -1 $output_dir/report.txt >> $output_dir/report2.txt
+grep mergerandom $output_dir/report.txt | grep \.s0 >> $output_dir/report2.txt
+echo mergerandom sync=1 >> $output_dir/report2.txt
+head -1 $output_dir/report.txt >> $output_dir/report2.txt
+grep mergerandom $output_dir/report.txt | grep \.s1 >> $output_dir/report2.txt
+echo readrandom >> $output_dir/report2.txt
+head -1 $output_dir/report.txt >> $output_dir/report2.txt
+grep readrandom $output_dir/report.txt  >> $output_dir/report2.txt
+echo fwdrange >> $output_dir/report2.txt
+head -1 $output_dir/report.txt >> $output_dir/report2.txt
+grep fwdrange\.t $output_dir/report.txt >> $output_dir/report2.txt
+echo revrange >> $output_dir/report2.txt
+head -1 $output_dir/report.txt >> $output_dir/report2.txt
+grep revrange\.t $output_dir/report.txt >> $output_dir/report2.txt
+echo readwhilewriting >> $output_dir/report2.txt
+head -1 $output_dir/report.txt >> $output_dir/report2.txt
+grep readwhilewriting $output_dir/report.txt >> $output_dir/report2.txt
+echo readwhilemerging >> $output_dir/report2.txt
+head -1 $output_dir/report.txt >> $output_dir/report2.txt
+grep readwhilemerging $output_dir/report.txt >> $output_dir/report2.txt
+echo fwdreadwhilewriting >> $output_dir/report2.txt
+head -1 $output_dir/report.txt >> $output_dir/report2.txt
+grep fwdrangewhilewriting $output_dir/report.txt >> $output_dir/report2.txt
+echo fwdreadwhilemerging >> $output_dir/report2.txt
+head -1 $output_dir/report.txt >> $output_dir/report2.txt
+grep fwdrangewhilemerg $output_dir/report.txt >> $output_dir/report2.txt
+echo revreadwhilewriting >> $output_dir/report2.txt
+head -1 $output_dir/report.txt >> $output_dir/report2.txt
+grep revrangewhilewriting $output_dir/report.txt >> $output_dir/report2.txt
+echo revreadwhilemerging >> $output_dir/report2.txt
+head -1 $output_dir/report.txt >> $output_dir/report2.txt
+grep revrangewhilemerg $output_dir/report.txt >> $output_dir/report2.txt
+
+cat $output_dir/report2.txt
diff --git a/src/rocksdb/tools/run_leveldb.sh b/src/rocksdb/tools/run_leveldb.sh
new file mode 100755
index 0000000..2224013
--- /dev/null
+++ b/src/rocksdb/tools/run_leveldb.sh
@@ -0,0 +1,174 @@
+#!/bin/bash
+# REQUIRE: benchmark_leveldb.sh exists in the current directory
+# After execution of this script, log files are generated in $output_dir.
+# report.txt provides high-level statistics.
+#
+# This should be used with the LevelDB fork listed here to use additional test options.
+# For more details on the changes see the blog post listed below.
+#   https://github.com/mdcallag/leveldb-1
+#   http://smalldatum.blogspot.com/2015/04/comparing-leveldb-and-rocksdb-take-2.html
+#
+# This should be run from the parent of the tools directory. The command line is:
+#   [$env_vars] tools/run_leveldb.sh [list-of-threads]
+#
+# This runs a sequence of tests in the following sequence:
+#   step 1) load - bulkload, compact, fillseq, overwrite
+#   step 2) read-only for each number of threads
+#   step 3) read-write for each number of threads
+#
+# The list of threads is optional and when not set is equivalent to "24".
+# If list-of-threads is specified as "1 2 4" then the tests in steps 2 and 3
+# above are repeated for 1, 2 and 4 threads. The tests in step 1 are only run
+# for 1 thread.
+
+# Test output is written to $OUTPUT_DIR, currently /tmp/output. The performance
+# summary is in $OUTPUT_DIR/report.txt. There is one file in $OUTPUT_DIR per
+# test and the tests are listed below.
+#
+# The environment variables are also optional. The variables are:
+#   NKEYS         - number of key/value pairs to load
+#   NWRITESPERSEC - the writes/second rate limit for the *whilewriting* tests.
+#                   If this is too large then the non-writer threads can get
+#                   starved.
+#   VAL_SIZE      - the length of the value in the key/value pairs loaded.
+#                   You can estimate the size of the test database from this,
+#                   NKEYS and the compression rate (--compression_ratio) set
+#                   in tools/benchmark_leveldb.sh
+#   BLOCK_LENGTH  - value for db_bench --block_size
+#   CACHE_BYTES   - the size of the LevelDB block cache in bytes
+#   DATA_DIR      - directory in which to create database files
+#   DO_SETUP      - when set to 0, a backup of the database is copied from
+#                   $DATA_DIR.bak to $DATA_DIR and the load tests from step 1
+#                   are skipped. This allows the tests from steps 2 and 3 to be
+#                   repeated faster.
+#   SAVE_SETUP    - saves a copy of the database at the end of step 1 to
+#                   $DATA_DIR.bak.
+
+# Size constants
+K=1024
+M=$((1024 * K))
+G=$((1024 * M))
+
+num_keys=${NKEYS:-$((1 * G))}
+wps=${NWRITESPERSEC:-$((10 * K))}
+vs=${VAL_SIZE:-400}
+cs=${CACHE_BYTES:-$(( 1 * G ))}
+bs=${BLOCK_LENGTH:-4096}
+
+# If no command line arguments then run for 24 threads.
+if [[ $# -eq 0 ]]; then
+  nthreads=( 24 )
+else
+  nthreads=( "$@" )
+fi
+
+for num_thr in "${nthreads[@]}" ; do
+  echo Will run for $num_thr threads
+done
+
+# Update these parameters before execution !!!
+db_dir=${DATA_DIR:-"/tmp/rocksdb/"}
+
+do_setup=${DO_SETUP:-1}
+save_setup=${SAVE_SETUP:-0}
+
+output_dir="/tmp/output"
+
+ARGS="\
+OUTPUT_DIR=$output_dir \
+NUM_KEYS=$num_keys \
+DB_DIR=$db_dir \
+VALUE_SIZE=$vs \
+BLOCK_SIZE=$bs \
+CACHE_SIZE=$cs"
+
+mkdir -p $output_dir
+echo -e "ops/sec\tmb/sec\tusec/op\tavg\tp50\tTest" \
+  > $output_dir/report.txt
+
+# Notes on test sequence:
+#   step 1) Setup database via sequential fill followed by overwrite to fragment it.
+#           Done without setting DURATION to make sure that overwrite does $num_keys writes
+#   step 2) read-only tests for all levels of concurrency requested
+#   step 3) non read-only tests for all levels of concurrency requested
+
+###### Setup the database
+
+if [[ $do_setup != 0 ]]; then
+  echo Doing setup
+
+  # Test 2a: sequential fill with large values to get peak ingest
+  #          adjust NUM_KEYS given the use of larger values
+  env $ARGS BLOCK_SIZE=$((1 * M)) VALUE_SIZE=$((32 * K)) NUM_KEYS=$(( num_keys / 64 )) \
+       ./tools/benchmark_leveldb.sh fillseq
+
+  # Test 2b: sequential fill with the configured value size
+  env $ARGS ./tools/benchmark_leveldb.sh fillseq
+
+  # Test 3: single-threaded overwrite
+  env $ARGS NUM_THREADS=1 DB_BENCH_NO_SYNC=1 ./tools/benchmark_leveldb.sh overwrite
+
+else
+  echo Restoring from backup
+
+  rm -rf $db_dir
+
+  if [ ! -d ${db_dir}.bak ]; then
+    echo Database backup does not exist at ${db_dir}.bak
+    exit -1
+  fi
+
+  echo Restore database from ${db_dir}.bak
+  cp -p -r ${db_dir}.bak $db_dir
+fi
+
+if [[ $save_setup != 0 ]]; then
+  echo Save database to ${db_dir}.bak
+  cp -p -r $db_dir ${db_dir}.bak
+fi
+
+###### Read-only tests
+
+for num_thr in "${nthreads[@]}" ; do
+  # Test 4: random read
+  env $ARGS NUM_THREADS=$num_thr ./tools/benchmark_leveldb.sh readrandom
+
+done
+
+###### Non read-only tests
+
+for num_thr in "${nthreads[@]}" ; do
+  # Test 7: overwrite with sync=0
+  env $ARGS NUM_THREADS=$num_thr DB_BENCH_NO_SYNC=1 \
+    ./tools/benchmark_leveldb.sh overwrite
+
+  # Test 8: overwrite with sync=1
+  # Not run for now because LevelDB db_bench doesn't have an option to limit the
+  # test run to X seconds and doing sync-per-commit for --num can take too long.
+  # env $ARGS NUM_THREADS=$num_thr ./tools/benchmark_leveldb.sh overwrite
+
+  # Test 11: random read while writing
+  env $ARGS NUM_THREADS=$num_thr WRITES_PER_SECOND=$wps \
+    ./tools/benchmark_leveldb.sh readwhilewriting
+
+done
+
+echo bulkload > $output_dir/report2.txt
+head -1 $output_dir/report.txt >> $output_dir/report2.txt
+grep bulkload $output_dir/report.txt >> $output_dir/report2.txt
+echo fillseq >> $output_dir/report2.txt
+head -1 $output_dir/report.txt >> $output_dir/report2.txt
+grep fillseq $output_dir/report.txt >> $output_dir/report2.txt
+echo overwrite sync=0 >> $output_dir/report2.txt
+head -1 $output_dir/report.txt >> $output_dir/report2.txt
+grep overwrite $output_dir/report.txt | grep \.s0  >> $output_dir/report2.txt
+echo overwrite sync=1 >> $output_dir/report2.txt
+head -1 $output_dir/report.txt >> $output_dir/report2.txt
+grep overwrite $output_dir/report.txt | grep \.s1  >> $output_dir/report2.txt
+echo readrandom >> $output_dir/report2.txt
+head -1 $output_dir/report.txt >> $output_dir/report2.txt
+grep readrandom $output_dir/report.txt  >> $output_dir/report2.txt
+echo readwhilewriting >> $output_dir/report2.txt
+head -1 $output_dir/report.txt >> $output_dir/report2.txt
+grep readwhilewriting $output_dir/report.txt >> $output_dir/report2.txt
+
+cat $output_dir/report2.txt
diff --git a/src/rocksdb/tools/sample-dump.dmp b/src/rocksdb/tools/sample-dump.dmp
new file mode 100644
index 0000000..4ec3a77
Binary files /dev/null and b/src/rocksdb/tools/sample-dump.dmp differ
diff --git a/src/rocksdb/tools/sst_dump.cc b/src/rocksdb/tools/sst_dump.cc
new file mode 100644
index 0000000..4038937
--- /dev/null
+++ b/src/rocksdb/tools/sst_dump.cc
@@ -0,0 +1,21 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/sst_dump_tool.h"
+
+int main(int argc, char** argv) {
+  rocksdb::SSTDumpTool tool;
+  tool.Run(argc, argv);
+  return 0;
+}
+#else
+#include <stdio.h>
+int main(int argc, char** argv) {
+  fprintf(stderr, "Not supported in lite mode.\n");
+  return 1;
+}
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/tools/verify_random_db.sh b/src/rocksdb/tools/verify_random_db.sh
new file mode 100755
index 0000000..77607b6
--- /dev/null
+++ b/src/rocksdb/tools/verify_random_db.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+#
+# A shell script to verify that a DB generated by generate_random_db.sh can be opened and that it returns the correct data.
+# ./ldb needs to be available to be executed.
+#
+# Usage: <SCRIPT> <db_directory> <compare_base_db_directory> [dump_file_name]
+
+scriptpath=`dirname $BASH_SOURCE`
+if [ "$#" -lt 2 ]; then
+  echo "usage: $BASH_SOURCE <db_directory> <compare_base_db_directory> [dump_file_name]"
+  exit 1
+fi
+
+db_dir=$1
+base_db_dir=$2
+dump_file_name=${3:-"dump_file.txt"}
+db_dump=$db_dir"/"$dump_file_name
+base_db_dump=$base_db_dir"/"$dump_file_name
+
+set -e
+echo == Dumping data from $db_dir to $db_dump
+./ldb dump --db=$db_dir > $db_dump
+
+echo == Dumping data from $base_db_dir to $base_db_dump
+./ldb dump --db=$base_db_dir > $base_db_dump
+
+diff $db_dump $base_db_dump
diff --git a/src/rocksdb/util/aligned_buffer.h b/src/rocksdb/util/aligned_buffer.h
new file mode 100644
index 0000000..2244316
--- /dev/null
+++ b/src/rocksdb/util/aligned_buffer.h
@@ -0,0 +1,154 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <algorithm>
+#include "port/port.h"
+
+namespace rocksdb {
+
+inline size_t TruncateToPageBoundary(size_t page_size, size_t s) {
+  s -= (s & (page_size - 1));
+  assert((s % page_size) == 0);
+  return s;
+}
+
+inline size_t Roundup(size_t x, size_t y) {
+  return ((x + y - 1) / y) * y;
+}
+
+// This class manages an aligned, user-allocated buffer intended for
+// unbuffered (direct) I/O, though it can be used for any other purpose.
+class AlignedBuffer {
+  size_t alignment_;
+  std::unique_ptr<char[]> buf_;
+  size_t capacity_;
+  size_t cursize_;
+  char* bufstart_;
+
+public:
+  AlignedBuffer()
+    : alignment_(),
+      capacity_(0),
+      cursize_(0),
+      bufstart_(nullptr) {
+  }
+
+  AlignedBuffer(AlignedBuffer&& o) ROCKSDB_NOEXCEPT {
+    *this = std::move(o);
+  }
+
+  AlignedBuffer& operator=(AlignedBuffer&& o) ROCKSDB_NOEXCEPT {
+    alignment_ = std::move(o.alignment_);
+    buf_ = std::move(o.buf_);
+    capacity_ = std::move(o.capacity_);
+    cursize_ = std::move(o.cursize_);
+    bufstart_ = std::move(o.bufstart_);
+    return *this;
+  }
+
+  AlignedBuffer(const AlignedBuffer&) = delete;
+
+  AlignedBuffer& operator=(const AlignedBuffer&) = delete;
+
+  size_t Alignment() const {
+    return alignment_;
+  }
+
+  size_t Capacity() const {
+    return capacity_;
+  }
+
+  size_t CurrentSize() const {
+    return cursize_;
+  }
+
+  const char* BufferStart() const {
+    return bufstart_;
+  }
+
+  void Clear() {
+    cursize_ = 0;
+  }
+
+  void Alignment(size_t alignment) {
+    assert(alignment > 0);
+    assert((alignment & (alignment - 1)) == 0);
+    alignment_ = alignment;
+  }
+
+  // Allocates a new buffer and sets bufstart_ to the aligned first byte
+  void AllocateNewBuffer(size_t requestedCapacity) {
+
+    assert(alignment_ > 0);
+    assert((alignment_ & (alignment_ - 1)) == 0);
+
+    size_t size = Roundup(requestedCapacity, alignment_);
+    buf_.reset(new char[size + alignment_]);
+
+    char* p = buf_.get();
+    bufstart_ = reinterpret_cast<char*>(
+      (reinterpret_cast<uintptr_t>(p)+(alignment_ - 1)) &
+      ~static_cast<uintptr_t>(alignment_ - 1));
+    capacity_ = size;
+    cursize_ = 0;
+  }
+  // Used for write
+  // Returns the number of bytes appended
+  size_t Append(const char* src, size_t append_size) {
+    size_t buffer_remaining = capacity_ - cursize_;
+    size_t to_copy = std::min(append_size, buffer_remaining);
+
+    if (to_copy > 0) {
+      memcpy(bufstart_ + cursize_, src, to_copy);
+      cursize_ += to_copy;
+    }
+    return to_copy;
+  }
+
+  size_t Read(char* dest, size_t offset, size_t read_size) const {
+    assert(offset < cursize_);
+    size_t to_read = std::min(cursize_ - offset, read_size);
+    if (to_read > 0) {
+      memcpy(dest, bufstart_ + offset, to_read);
+    }
+    return to_read;
+  }
+
+  /// Pad to alignment
+  void PadToAlignmentWith(int padding) {
+    size_t total_size = Roundup(cursize_, alignment_);
+    size_t pad_size = total_size - cursize_;
+
+    if (pad_size > 0) {
+      assert((pad_size + cursize_) <= capacity_);
+      memset(bufstart_ + cursize_, padding, pad_size);
+      cursize_ += pad_size;
+    }
+  }
+
+  // After a partial flush move the tail to the beginning of the buffer
+  void RefitTail(size_t tail_offset, size_t tail_size) {
+    if (tail_size > 0) {
+      memmove(bufstart_, bufstart_ + tail_offset, tail_size);
+    }
+    cursize_ = tail_size;
+  }
+
+  // Returns place to start writing
+  char* Destination() {
+    return bufstart_ + cursize_;
+  }
+
+  void Size(size_t cursize) {
+    cursize_ = cursize;
+  }
+};
+}
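A minimal usage sketch for the AlignedBuffer class added above; the 4 KiB
alignment and buffer sizes are illustrative assumptions, not values taken
from this patch:

    rocksdb::AlignedBuffer buf;
    buf.Alignment(4096);               // must be a non-zero power of two
    buf.AllocateNewBuffer(10 * 1024);  // capacity rounds up to 12288 bytes
    const char payload[] = "payload";
    buf.Append(payload, sizeof(payload));  // returns the bytes actually copied
    buf.PadToAlignmentWith(0);             // zero-fill to the next boundary
    // BufferStart() is now aligned and CurrentSize() is a multiple of 4096,
    // as unbuffered (direct) I/O writes require.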
diff --git a/src/rocksdb/util/arena.cc b/src/rocksdb/util/arena.cc
index 3f00f08..1fe455a 100644
--- a/src/rocksdb/util/arena.cc
+++ b/src/rocksdb/util/arena.cc
@@ -8,13 +8,23 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
 #include "util/arena.h"
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+#include <malloc.h>
+#endif
+#ifndef OS_WIN
 #include <sys/mman.h>
+#endif
+#include "port/port.h"
 #include <algorithm>
 #include "rocksdb/env.h"
 
 namespace rocksdb {
 
+// MSVC complains that it is already defined since it is static in the header.
+#ifndef OS_WIN
 const size_t Arena::kInlineSize;
+#endif
+
 const size_t Arena::kMinBlockSize = 4096;
 const size_t Arena::kMaxBlockSize = 2 << 30;
 static const int kAlignUnit = sizeof(void*);
@@ -52,12 +62,15 @@ Arena::~Arena() {
   for (const auto& block : blocks_) {
     delete[] block;
   }
+
+#ifdef MAP_HUGETLB
   for (const auto& mmap_info : huge_blocks_) {
     auto ret = munmap(mmap_info.addr_, mmap_info.length_);
     if (ret != 0) {
       // TODO(sdong): Better handling
     }
   }
+#endif
 }
 
 char* Arena::AllocateFallback(size_t bytes, bool aligned) {
@@ -69,12 +82,14 @@ char* Arena::AllocateFallback(size_t bytes, bool aligned) {
   }
 
   // We waste the remaining space in the current block.
-  size_t size;
+  size_t size = 0;
   char* block_head = nullptr;
+#ifdef MAP_HUGETLB
   if (hugetlb_size_) {
     size = hugetlb_size_;
     block_head = AllocateFromHugePage(size);
   }
+#endif
   if (!block_head) {
     size = kBlockSize;
     block_head = AllocateNewBlock(size);
@@ -97,6 +112,11 @@ char* Arena::AllocateFromHugePage(size_t bytes) {
   if (hugetlb_size_ == 0) {
     return nullptr;
   }
+  // already reserve space in huge_blocks_ before calling mmap().
+  // this way the insertion into the vector below will not throw and we
+  // won't leak the mapping in that case. if reserve() throws, we
+  // won't leak either
+  huge_blocks_.reserve(huge_blocks_.size() + 1);
 
   void* addr = mmap(nullptr, bytes, (PROT_READ | PROT_WRITE),
                     (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB), 0, 0);
@@ -104,7 +124,8 @@ char* Arena::AllocateFromHugePage(size_t bytes) {
   if (addr == MAP_FAILED) {
     return nullptr;
   }
-  huge_blocks_.push_back(MmapInfo(addr, bytes));
+  // the following shouldn't throw because of the above reserve()
+  huge_blocks_.emplace_back(MmapInfo(addr, bytes));
   blocks_memory_ += bytes;
   return reinterpret_cast<char*>(addr);
 #else
@@ -154,8 +175,20 @@ char* Arena::AllocateAligned(size_t bytes, size_t huge_page_size,
 }
 
 char* Arena::AllocateNewBlock(size_t block_bytes) {
+  // already reserve space in blocks_ before allocating memory via new.
+  // this way the insertion into the vector below will not throw and we
+  // won't leak the allocated memory in that case. if reserve() throws,
+  // we won't leak either
+  blocks_.reserve(blocks_.size() + 1);
+
   char* block = new char[block_bytes];
+
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+  blocks_memory_ += malloc_usable_size(block);
+#else
   blocks_memory_ += block_bytes;
+#endif  // ROCKSDB_MALLOC_USABLE_SIZE
+  // the following shouldn't throw because of the above reserve()
   blocks_.push_back(block);
   return block;
 }
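The reserve()-before-acquire pattern introduced in both hunks above, shown
in isolation (a sketch, not code from this patch):

    std::vector<char*> blocks;
    // Grow capacity first: if reserve() throws, no resource has been
    // acquired yet, so nothing can leak.
    blocks.reserve(blocks.size() + 1);
    char* block = new char[4096];  // may throw; the vector is unchanged
    // push_back() cannot throw now that capacity is sufficient, so the
    // freshly acquired block cannot be orphaned by a failed insertion.
    blocks.push_back(block);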
diff --git a/src/rocksdb/util/arena.h b/src/rocksdb/util/arena.h
index 1ae50e2..9149498 100644
--- a/src/rocksdb/util/arena.h
+++ b/src/rocksdb/util/arena.h
@@ -12,6 +12,9 @@
 // size, it uses malloc to directly get the requested size.
 
 #pragma once
+#ifndef OS_WIN
+#include <sys/mman.h>
+#endif
 #include <cstddef>
 #include <cerrno>
 #include <vector>
@@ -99,7 +102,9 @@ class Arena : public Allocator {
   // How many bytes left in currently active block?
   size_t alloc_bytes_remaining_ = 0;
 
+#ifdef MAP_HUGETLB
   size_t hugetlb_size_ = 0;
+#endif  // MAP_HUGETLB
   char* AllocateFromHugePage(size_t bytes);
   char* AllocateFallback(size_t bytes, bool aligned);
   char* AllocateNewBlock(size_t block_bytes);
diff --git a/src/rocksdb/util/arena_test.cc b/src/rocksdb/util/arena_test.cc
index a3b96bb..8b74af9 100644
--- a/src/rocksdb/util/arena_test.cc
+++ b/src/rocksdb/util/arena_test.cc
@@ -21,22 +21,31 @@ class ArenaTest : public testing::Test {};
 TEST_F(ArenaTest, Empty) { Arena arena0; }
 
 namespace {
+bool CheckMemoryAllocated(size_t allocated, size_t expected) {
+  // The value returned by Arena::MemoryAllocatedBytes() may be greater than
+  // the requested memory. We choose a somewhat arbitrary upper bound of
+  // max_expected = expected * 1.1 to detect critical overallocation.
+  size_t max_expected = expected * 1.1;
+  return allocated >= expected && allocated <= max_expected;
+}
+
 void MemoryAllocatedBytesTest(size_t huge_page_size) {
   const int N = 17;
   size_t req_sz;  // requested size
-  size_t bsz = 8192;  // block size
+  size_t bsz = 32 * 1024;  // block size
   size_t expected_memory_allocated;
 
   Arena arena(bsz, huge_page_size);
 
   // requested size > quarter of a block:
   //   allocate requested size separately
-  req_sz = 3001;
+  req_sz = 12 * 1024;
   for (int i = 0; i < N; i++) {
     arena.Allocate(req_sz);
   }
   expected_memory_allocated = req_sz * N + Arena::kInlineSize;
-  ASSERT_EQ(arena.MemoryAllocatedBytes(), expected_memory_allocated);
+  ASSERT_PRED2(CheckMemoryAllocated, arena.MemoryAllocatedBytes(),
+               expected_memory_allocated);
 
   arena.Allocate(Arena::kInlineSize - 1);
 
@@ -49,30 +58,27 @@ void MemoryAllocatedBytesTest(size_t huge_page_size) {
     arena.Allocate(req_sz);
   }
   if (huge_page_size) {
-    ASSERT_TRUE(arena.MemoryAllocatedBytes() ==
-                    expected_memory_allocated + bsz ||
-                arena.MemoryAllocatedBytes() ==
-                    expected_memory_allocated + huge_page_size);
+    ASSERT_TRUE(
+        CheckMemoryAllocated(arena.MemoryAllocatedBytes(),
+                             expected_memory_allocated + bsz) ||
+        CheckMemoryAllocated(arena.MemoryAllocatedBytes(),
+                             expected_memory_allocated + huge_page_size));
   } else {
     expected_memory_allocated += bsz;
-    ASSERT_EQ(arena.MemoryAllocatedBytes(), expected_memory_allocated);
+    ASSERT_PRED2(CheckMemoryAllocated, arena.MemoryAllocatedBytes(),
+                 expected_memory_allocated);
   }
 
-  // requested size > quarter of a block:
+  // requested size > size of a block:
   //   allocate requested size separately
-  req_sz = 99999999;
+  expected_memory_allocated = arena.MemoryAllocatedBytes();
+  req_sz = 8 * 1024 * 1024;
   for (int i = 0; i < N; i++) {
     arena.Allocate(req_sz);
   }
   expected_memory_allocated += req_sz * N;
-  if (huge_page_size) {
-    ASSERT_TRUE(arena.MemoryAllocatedBytes() ==
-                    expected_memory_allocated + bsz ||
-                arena.MemoryAllocatedBytes() ==
-                    expected_memory_allocated + huge_page_size);
-  } else {
-    ASSERT_EQ(arena.MemoryAllocatedBytes(), expected_memory_allocated);
-  }
+  ASSERT_PRED2(CheckMemoryAllocated, arena.MemoryAllocatedBytes(),
+               expected_memory_allocated);
 }
 
 // Make sure we didn't count the allocate but not used memory space in
@@ -89,7 +95,8 @@ static void ApproximateMemoryUsageTest(size_t huge_page_size) {
   arena.AllocateAligned(Arena::kInlineSize / 2 - 16);
   arena.AllocateAligned(Arena::kInlineSize / 2);
   ASSERT_EQ(arena.ApproximateMemoryUsage(), Arena::kInlineSize - 8);
-  ASSERT_EQ(arena.MemoryAllocatedBytes(), Arena::kInlineSize);
+  ASSERT_PRED2(CheckMemoryAllocated, arena.MemoryAllocatedBytes(),
+               Arena::kInlineSize);
 
   auto num_blocks = kBlockSize / kEntrySize;
 
@@ -97,10 +104,12 @@ static void ApproximateMemoryUsageTest(size_t huge_page_size) {
   arena.AllocateAligned(kEntrySize);
   auto mem_usage = arena.MemoryAllocatedBytes();
   if (huge_page_size) {
-    ASSERT_TRUE(mem_usage == kBlockSize + Arena::kInlineSize ||
-                mem_usage == huge_page_size + Arena::kInlineSize);
+    ASSERT_TRUE(
+        CheckMemoryAllocated(mem_usage, kBlockSize + Arena::kInlineSize) ||
+        CheckMemoryAllocated(mem_usage, huge_page_size + Arena::kInlineSize));
   } else {
-    ASSERT_EQ(mem_usage, kBlockSize + Arena::kInlineSize);
+    ASSERT_PRED2(CheckMemoryAllocated, mem_usage,
+                 kBlockSize + Arena::kInlineSize);
   }
   auto usage = arena.ApproximateMemoryUsage();
   ASSERT_LT(usage, mem_usage);
diff --git a/src/rocksdb/util/auto_roll_logger.cc b/src/rocksdb/util/auto_roll_logger.cc
index 684abfc..4ea0356 100644
--- a/src/rocksdb/util/auto_roll_logger.cc
+++ b/src/rocksdb/util/auto_roll_logger.cc
@@ -91,7 +91,7 @@ void AutoRollLogger::Logv(const char* format, va_list ap) {
 
 void AutoRollLogger::WriteHeaderInfo() {
   mutex_.AssertHeld();
-  for (auto header : headers_) {
+  for (auto& header : headers_) {
     LogInternal("%s", header.c_str());
   }
 }
diff --git a/src/rocksdb/util/auto_roll_logger.h b/src/rocksdb/util/auto_roll_logger.h
index e8bb596..5b6dff6 100644
--- a/src/rocksdb/util/auto_roll_logger.h
+++ b/src/rocksdb/util/auto_roll_logger.h
@@ -11,7 +11,7 @@
 
 #include "db/filename.h"
 #include "port/port.h"
-#include "util/posix_logger.h"
+#include "port/util_logger.h"
 
 namespace rocksdb {
 
diff --git a/src/rocksdb/util/auto_roll_logger_test.cc b/src/rocksdb/util/auto_roll_logger_test.cc
index 6733a62..138eb6e 100644
--- a/src/rocksdb/util/auto_roll_logger_test.cc
+++ b/src/rocksdb/util/auto_roll_logger_test.cc
@@ -23,7 +23,16 @@ namespace rocksdb {
 class AutoRollLoggerTest : public testing::Test {
  public:
   static void InitTestDb() {
-    string deleteCmd = "rm -rf " + kTestDir;
+#ifdef OS_WIN
+    // Replace all slashes in the path so windows CompSpec does not
+    // become confused
+    std::string testDir(kTestDir);
+    std::replace_if(testDir.begin(), testDir.end(),
+                    [](char ch) { return ch == '/'; }, '\\');
+    std::string deleteCmd = "if exist " + testDir + " rd /s /q " + testDir;
+#else
+    std::string deleteCmd = "rm -rf " + kTestDir;
+#endif
     ASSERT_TRUE(system(deleteCmd.c_str()) == 0);
     Env::Default()->CreateDir(kTestDir);
   }
@@ -123,7 +132,11 @@ uint64_t AutoRollLoggerTest::RollLogFileByTimeTest(
   }
 
   // -- Make the log file expire
+#ifdef OS_WIN
+  Sleep(static_cast<unsigned int>(time) * 1000);
+#else
   sleep(static_cast<unsigned int>(time));
+#endif
   LogMessage(logger, log_message.c_str());
 
   // At this time, the new log file should be created.
@@ -151,9 +164,9 @@ TEST_F(AutoRollLoggerTest, RollLogFileByTime) {
 
     InitTestDb();
     // -- Test the existence of file during the server restart.
-    ASSERT_TRUE(!env->FileExists(kLogFile));
+    ASSERT_EQ(Status::NotFound(), env->FileExists(kLogFile));
     AutoRollLogger logger(Env::Default(), kTestDir, "", log_size, time);
-    ASSERT_TRUE(env->FileExists(kLogFile));
+    ASSERT_OK(env->FileExists(kLogFile));
 
     RollLogFileByTimeTest(&logger, time, kSampleMessage + ":RollLogFileByTime");
 }
@@ -200,6 +213,9 @@ TEST_F(AutoRollLoggerTest, CompositeRollByTimeAndSizeLogger) {
       kSampleMessage + ":CompositeRollByTimeAndSizeLogger");
 }
 
+#ifndef OS_WIN
+// TODO: this does not build on Windows because of the PosixLogger use
+// below; it needs to be ported.
 TEST_F(AutoRollLoggerTest, CreateLoggerFromOptions) {
   DBOptions options;
   shared_ptr<Logger> logger;
@@ -244,6 +260,7 @@ TEST_F(AutoRollLoggerTest, CreateLoggerFromOptions) {
       auto_roll_logger, options.log_file_time_to_roll,
       kSampleMessage + ":CreateLoggerFromOptions - both");
 }
+#endif
 
 TEST_F(AutoRollLoggerTest, InfoLogLevel) {
   InitTestDb();
@@ -254,28 +271,29 @@ TEST_F(AutoRollLoggerTest, InfoLogLevel) {
   // becomes out of scope.
   {
     AutoRollLogger logger(Env::Default(), kTestDir, "", log_size, 0);
-    for (int log_level = InfoLogLevel::FATAL_LEVEL;
+    for (int log_level = InfoLogLevel::HEADER_LEVEL;
          log_level >= InfoLogLevel::DEBUG_LEVEL; log_level--) {
       logger.SetInfoLogLevel((InfoLogLevel)log_level);
       for (int log_type = InfoLogLevel::DEBUG_LEVEL;
-           log_type <= InfoLogLevel::FATAL_LEVEL; log_type++) {
+           log_type <= InfoLogLevel::HEADER_LEVEL; log_type++) {
         // log messages with log level smaller than log_level will not be
         // logged.
         LogMessage((InfoLogLevel)log_type, &logger, kSampleMessage.c_str());
       }
-      log_lines += InfoLogLevel::FATAL_LEVEL - log_level + 1;
+      log_lines += InfoLogLevel::HEADER_LEVEL - log_level + 1;
     }
-    for (int log_level = InfoLogLevel::FATAL_LEVEL;
+    for (int log_level = InfoLogLevel::HEADER_LEVEL;
          log_level >= InfoLogLevel::DEBUG_LEVEL; log_level--) {
       logger.SetInfoLogLevel((InfoLogLevel)log_level);
 
       // again, messages with level smaller than log_level will not be logged.
+      Log(InfoLogLevel::HEADER_LEVEL, &logger, "%s", kSampleMessage.c_str());
       Debug(&logger, "%s", kSampleMessage.c_str());
       Info(&logger, "%s", kSampleMessage.c_str());
       Warn(&logger, "%s", kSampleMessage.c_str());
       Error(&logger, "%s", kSampleMessage.c_str());
       Fatal(&logger, "%s", kSampleMessage.c_str());
-      log_lines += InfoLogLevel::FATAL_LEVEL - log_level + 1;
+      log_lines += InfoLogLevel::HEADER_LEVEL - log_level + 1;
     }
   }
   std::ifstream inFile(AutoRollLoggerTest::kLogFile.c_str());
@@ -287,17 +305,18 @@ TEST_F(AutoRollLoggerTest, InfoLogLevel) {
 
 // Test the logger Header function for roll over logs
 // We expect the new logs creates as roll over to carry the headers specified
-static list<string> GetOldFileNames(const string& path) {
+static std::vector<string> GetOldFileNames(const string& path) {
+  std::vector<string> ret;
+
   const string dirname = path.substr(/*start=*/ 0, path.find_last_of("/"));
   const string fname = path.substr(path.find_last_of("/") + 1);
 
-  vector<string> children;
+  std::vector<string> children;
   Env::Default()->GetChildren(dirname, &children);
 
   // We know that the old log files are named [path]<something>
   // Return all entities that match the pattern
-  list<string> ret;
-  for (auto child : children) {
+  for (auto& child : children) {
     if (fname != child && child.find(fname) == 0) {
       ret.push_back(dirname + "/" + child);
     }
@@ -329,41 +348,54 @@ TEST_F(AutoRollLoggerTest, LogHeaderTest) {
   static const size_t LOG_MAX_SIZE = 1024 * 5;
   static const std::string HEADER_STR = "Log header line";
 
-  InitTestDb();
-
-  AutoRollLogger logger(Env::Default(), kTestDir, /*db_log_dir=*/ "",
-                        LOG_MAX_SIZE, /*log_file_time_to_roll=*/ 0);
+  // test_num == 0 -> standard call to Header()
+  // test_num == 1 -> call to Log() with InfoLogLevel::HEADER_LEVEL
+  for (int test_num = 0; test_num < 2; test_num++) {
 
-  // log some headers
-  for (size_t i = 0; i < MAX_HEADERS; i++) {
-    Header(&logger, "%s %d", HEADER_STR.c_str(), i);
-  }
+    InitTestDb();
 
-  const string& newfname = logger.TEST_log_fname().c_str();
+    AutoRollLogger logger(Env::Default(), kTestDir, /*db_log_dir=*/ "",
+                          LOG_MAX_SIZE, /*log_file_time_to_roll=*/ 0);
 
-  // log enough data to cause a roll over
-  int i = 0;
-  for (size_t iter = 0; iter < 2; iter++) {
-    while (logger.GetLogFileSize() < LOG_MAX_SIZE) {
-      Info(&logger, (kSampleMessage + ":LogHeaderTest line %d").c_str(), i);
-      ++i;
+    if (test_num == 0) {
+      // Log some headers explicitly using Header()
+      for (size_t i = 0; i < MAX_HEADERS; i++) {
+        Header(&logger, "%s %d", HEADER_STR.c_str(), i);
+      }
+    } else if (test_num == 1) {
+      // HEADER_LEVEL should make this behave like calling Header()
+      for (size_t i = 0; i < MAX_HEADERS; i++) {
+        Log(InfoLogLevel::HEADER_LEVEL, &logger, "%s %d",
+            HEADER_STR.c_str(), i);
+      }
     }
 
-    Info(&logger, "Rollover");
-  }
+    const string newfname = logger.TEST_log_fname();
+
+    // Log enough data to cause a roll over
+    int i = 0;
+    for (size_t iter = 0; iter < 2; iter++) {
+      while (logger.GetLogFileSize() < LOG_MAX_SIZE) {
+        Info(&logger, (kSampleMessage + ":LogHeaderTest line %d").c_str(), i);
+        ++i;
+      }
 
-  // Flus the log for the latest file
-  LogFlush(&logger);
+      Info(&logger, "Rollover");
+    }
+
+    // Flush the log for the latest file
+    LogFlush(&logger);
 
-  const list<string> oldfiles = GetOldFileNames(newfname);
+    const auto oldfiles = GetOldFileNames(newfname);
 
-  ASSERT_EQ(oldfiles.size(), (size_t) 2);
+    ASSERT_EQ(oldfiles.size(), (size_t) 2);
 
-  for (auto oldfname : oldfiles) {
-    // verify that the files rolled over
-    ASSERT_NE(oldfname, newfname);
-    // verify that the old log contains all the header logs
-    ASSERT_EQ(GetLinesCount(oldfname, HEADER_STR), MAX_HEADERS);
+    for (auto& oldfname : oldfiles) {
+      // verify that the files rolled over
+      ASSERT_NE(oldfname, newfname);
+      // verify that the old log contains all the header logs
+      ASSERT_EQ(GetLinesCount(oldfname, HEADER_STR), MAX_HEADERS);
+    }
   }
 }
 
@@ -375,7 +407,7 @@ TEST_F(AutoRollLoggerTest, LogFileExistence) {
   options.max_log_file_size = 100 * 1024 * 1024;
   options.create_if_missing = true;
   ASSERT_OK(rocksdb::DB::Open(options, kTestDir, &db));
-  ASSERT_TRUE(env->FileExists(kLogFile));
+  ASSERT_OK(env->FileExists(kLogFile));
   delete db;
 }
 
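After this change the two calls below should be equivalent, with the header
replayed into every rolled-over log file (the format string is illustrative):

    Header(&logger, "db instance: %s", "example");
    Log(InfoLogLevel::HEADER_LEVEL, &logger, "db instance: %s", "example");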
diff --git a/src/rocksdb/util/autovector.h b/src/rocksdb/util/autovector.h
index c9befe9..266a53a 100644
--- a/src/rocksdb/util/autovector.h
+++ b/src/rocksdb/util/autovector.h
@@ -239,7 +239,13 @@ class autovector {
     }
   }
 
-  void push_back(const T& item) { push_back(value_type(item)); }
+  void push_back(const T& item) {
+    if (num_stack_items_ < kSize) {
+      values_[num_stack_items_++] = item;
+    } else {
+      vect_.push_back(item);
+    }
+  }
 
   template <class... Args>
   void emplace_back(Args&&... args) {
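The new push_back() stores the element directly into the in-place array
while it has room and only spills to the heap-backed vector afterwards,
instead of routing every copy through the rvalue overload. A self-contained
sketch of the same small-buffer idea (the names are illustrative, not the
autovector API):

    #include <array>
    #include <cstddef>
    #include <vector>

    template <class T, size_t kSize>
    class small_vec {
      std::array<T, kSize> stack_;  // in-place storage, used first
      size_t num_stack_items_ = 0;
      std::vector<T> vect_;         // heap overflow, used once full
     public:
      void push_back(const T& item) {
        if (num_stack_items_ < kSize) {
          stack_[num_stack_items_++] = item;  // no temporary involved
        } else {
          vect_.push_back(item);
        }
      }
      size_t size() const { return num_stack_items_ + vect_.size(); }
    };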
diff --git a/src/rocksdb/util/autovector_test.cc b/src/rocksdb/util/autovector_test.cc
index c597e36..94e9926 100644
--- a/src/rocksdb/util/autovector_test.cc
+++ b/src/rocksdb/util/autovector_test.cc
@@ -18,8 +18,17 @@ namespace rocksdb {
 using namespace std;
 
 class AutoVectorTest : public testing::Test {};
-
 const unsigned long kSize = 8;
+
+namespace {
+template <class T>
+void AssertAutoVectorOnlyInStack(autovector<T, kSize>* vec, bool result) {
+#ifndef ROCKSDB_LITE
+  ASSERT_EQ(vec->only_in_stack(), result);
+#endif  // !ROCKSDB_LITE
+}
+}  // namespace
+
 TEST_F(AutoVectorTest, PushBackAndPopBack) {
   autovector<size_t, kSize> vec;
   ASSERT_TRUE(vec.empty());
@@ -29,9 +38,9 @@ TEST_F(AutoVectorTest, PushBackAndPopBack) {
     vec.push_back(i);
     ASSERT_TRUE(!vec.empty());
     if (i < kSize) {
-      ASSERT_TRUE(vec.only_in_stack());
+      AssertAutoVectorOnlyInStack(&vec, true);
     } else {
-      ASSERT_TRUE(!vec.only_in_stack());
+      AssertAutoVectorOnlyInStack(&vec, false);
     }
     ASSERT_EQ(i + 1, vec.size());
     ASSERT_EQ(i, vec[i]);
@@ -42,7 +51,7 @@ TEST_F(AutoVectorTest, PushBackAndPopBack) {
   while (size != 0) {
     vec.pop_back();
     // will always be in heap
-    ASSERT_TRUE(!vec.only_in_stack());
+    AssertAutoVectorOnlyInStack(&vec, false);
     ASSERT_EQ(--size, vec.size());
   }
 
@@ -57,9 +66,9 @@ TEST_F(AutoVectorTest, EmplaceBack) {
     vec.emplace_back(i, ToString(i + 123));
     ASSERT_TRUE(!vec.empty());
     if (i < kSize) {
-      ASSERT_TRUE(vec.only_in_stack());
+      AssertAutoVectorOnlyInStack(&vec, true);
     } else {
-      ASSERT_TRUE(!vec.only_in_stack());
+      AssertAutoVectorOnlyInStack(&vec, false);
     }
 
     ASSERT_EQ(i + 1, vec.size());
@@ -69,20 +78,20 @@ TEST_F(AutoVectorTest, EmplaceBack) {
 
   vec.clear();
   ASSERT_TRUE(vec.empty());
-  ASSERT_TRUE(!vec.only_in_stack());
+  AssertAutoVectorOnlyInStack(&vec, false);
 }
 
 TEST_F(AutoVectorTest, Resize) {
   autovector<size_t, kSize> vec;
 
   vec.resize(kSize);
-  ASSERT_TRUE(vec.only_in_stack());
+  AssertAutoVectorOnlyInStack(&vec, true);
   for (size_t i = 0; i < kSize; ++i) {
     vec[i] = i;
   }
 
   vec.resize(kSize * 2);
-  ASSERT_TRUE(!vec.only_in_stack());
+  AssertAutoVectorOnlyInStack(&vec, false);
   for (size_t i = 0; i < kSize; ++i) {
     ASSERT_EQ(vec[i], i);
   }
@@ -99,7 +108,9 @@ void AssertEqual(
     const autovector<size_t, kSize>& a, const autovector<size_t, kSize>& b) {
   ASSERT_EQ(a.size(), b.size());
   ASSERT_EQ(a.empty(), b.empty());
+#ifndef ROCKSDB_LITE
   ASSERT_EQ(a.only_in_stack(), b.only_in_stack());
+#endif  // !ROCKSDB_LITE
   for (size_t i = 0; i < a.size(); ++i) {
     ASSERT_EQ(a[i], b[i]);
   }
diff --git a/src/rocksdb/util/bloom_test.cc b/src/rocksdb/util/bloom_test.cc
index 237bf7d..aac5b39 100644
--- a/src/rocksdb/util/bloom_test.cc
+++ b/src/rocksdb/util/bloom_test.cc
@@ -10,8 +10,8 @@
 #ifndef GFLAGS
 #include <cstdio>
 int main() {
-  fprintf(stderr, "Please install gflags to run rocksdb tools\n");
-  return 1;
+  fprintf(stderr, "Please install gflags to run this test... Skipping...\n");
+  return 0;
 }
 #else
 
diff --git a/src/rocksdb/util/cache.cc b/src/rocksdb/util/cache.cc
index 781e870..e64c01e 100644
--- a/src/rocksdb/util/cache.cc
+++ b/src/rocksdb/util/cache.cc
@@ -203,14 +203,22 @@ class LRUCache {
   Cache::Handle* Lookup(const Slice& key, uint32_t hash);
   void Release(Cache::Handle* handle);
   void Erase(const Slice& key, uint32_t hash);
+
   // Although in some platforms the update of size_t is atomic, to make sure
-  // GetUsage() works correctly under any platforms, we'll protect this
-  // function with mutex.
+  // GetUsage() and GetPinnedUsage() work correctly under any platform, we'll
+  // protect them with mutex_.
+
   size_t GetUsage() const {
     MutexLock l(&mutex_);
     return usage_;
   }
 
+  size_t GetPinnedUsage() const {
+    MutexLock l(&mutex_);
+    assert(usage_ >= lru_usage_);
+    return usage_ - lru_usage_;
+  }
+
   void ApplyToAllCacheEntries(void (*callback)(void*, size_t),
                               bool thread_safe);
 
@@ -231,11 +239,16 @@ class LRUCache {
   // Initialized before use.
   size_t capacity_;
 
+  // Memory size for entries residing in the cache
+  size_t usage_;
+
+  // Memory size for entries residing only in the LRU list
+  size_t lru_usage_;
+
   // mutex_ protects the following state.
   // We don't count mutex_ as the cache's internal state so semantically we
   // don't mind mutex_ invoking the non-const actions.
   mutable port::Mutex mutex_;
-  size_t usage_;
 
   // Dummy head of LRU list.
   // lru.prev is newest entry, lru.next is oldest entry.
@@ -245,8 +258,7 @@ class LRUCache {
   HandleTable table_;
 };
 
-LRUCache::LRUCache()
-    : usage_(0) {
+LRUCache::LRUCache() : usage_(0), lru_usage_(0) {
   // Make empty circular linked list
   lru_.next = &lru_;
   lru_.prev = &lru_;
@@ -281,6 +293,7 @@ void LRUCache::LRU_Remove(LRUHandle* e) {
   e->next->prev = e->prev;
   e->prev->next = e->next;
   e->prev = e->next = nullptr;
+  lru_usage_ -= e->charge;
 }
 
 void LRUCache::LRU_Append(LRUHandle* e) {
@@ -291,6 +304,7 @@ void LRUCache::LRU_Append(LRUHandle* e) {
   e->prev = lru_.prev;
   e->prev->next = e;
   e->next->prev = e;
+  lru_usage_ += e->charge;
 }
 
 void LRUCache::EvictFromLRU(size_t charge,
@@ -519,7 +533,6 @@ class ShardedLRUCache : public Cache {
 
   virtual size_t GetUsage() const override {
     // We will not lock the cache when getting the usage from shards.
-    // for (size_t i = 0; i < num_shard_bits_; ++i)
     int num_shards = 1 << num_shard_bits_;
     size_t usage = 0;
     for (int s = 0; s < num_shards; s++) {
@@ -528,6 +541,20 @@ class ShardedLRUCache : public Cache {
     return usage;
   }
 
+  virtual size_t GetUsage(Handle* handle) const override {
+    return reinterpret_cast<LRUHandle*>(handle)->charge;
+  }
+
+  virtual size_t GetPinnedUsage() const override {
+    // We will not lock the cache when getting the usage from shards.
+    int num_shards = 1 << num_shard_bits_;
+    size_t usage = 0;
+    for (int s = 0; s < num_shards; s++) {
+      usage += shards_[s].GetPinnedUsage();
+    }
+    return usage;
+  }
+
   virtual void DisownData() override { shards_ = nullptr; }
 
   virtual void ApplyToAllCacheEntries(void (*callback)(void*, size_t),
diff --git a/src/rocksdb/util/cache_test.cc b/src/rocksdb/util/cache_test.cc
index 6fba6a7..c8b2de8 100644
--- a/src/rocksdb/util/cache_test.cc
+++ b/src/rocksdb/util/cache_test.cc
@@ -9,6 +9,7 @@
 
 #include "rocksdb/cache.h"
 
+#include <forward_list>
 #include <vector>
 #include <string>
 #include <iostream>
@@ -142,6 +143,56 @@ TEST_F(CacheTest, UsageTest) {
   ASSERT_LT(kCapacity * 0.95, cache->GetUsage());
 }
 
+TEST_F(CacheTest, PinnedUsageTest) {
+  // cache is shared_ptr and will be automatically cleaned up.
+  const uint64_t kCapacity = 100000;
+  auto cache = NewLRUCache(kCapacity, 8);
+
+  size_t pinned_usage = 0;
+  const char* value = "abcdef";
+
+  std::forward_list<Cache::Handle*> unreleased_handles;
+
+  // Add entries. Unpin some of them after insertion. Then, pin some of them
+  // again. Check GetPinnedUsage().
+  for (int i = 1; i < 100; ++i) {
+    std::string key(i, 'a');
+    auto kv_size = key.size() + 5;
+    auto handle = cache->Insert(key, (void*)value, kv_size, dumbDeleter);
+    pinned_usage += kv_size;
+    ASSERT_EQ(pinned_usage, cache->GetPinnedUsage());
+    if (i % 2 == 0) {
+      cache->Release(handle);
+      pinned_usage -= kv_size;
+      ASSERT_EQ(pinned_usage, cache->GetPinnedUsage());
+    } else {
+      unreleased_handles.push_front(handle);
+    }
+    if (i % 3 == 0) {
+      unreleased_handles.push_front(cache->Lookup(key));
+      // If i % 2 == 0, then the entry was unpinned before Lookup, so pinned
+      // usage increased
+      if (i % 2 == 0) {
+        pinned_usage += kv_size;
+      }
+      ASSERT_EQ(pinned_usage, cache->GetPinnedUsage());
+    }
+  }
+
+  // check that overloading the cache does not change the pinned usage
+  for (uint64_t i = 1; i < 2 * kCapacity; ++i) {
+    auto key = ToString(i);
+    cache->Release(
+        cache->Insert(key, (void*)value, key.size() + 5, dumbDeleter));
+  }
+  ASSERT_EQ(pinned_usage, cache->GetPinnedUsage());
+
+  // release handles for pinned entries to prevent memory leaks
+  for (auto handle : unreleased_handles) {
+    cache->Release(handle);
+  }
+}
+
 TEST_F(CacheTest, HitAndMiss) {
   ASSERT_EQ(-1, Lookup(100));
 
diff --git a/src/rocksdb/util/channel.h b/src/rocksdb/util/channel.h
new file mode 100644
index 0000000..a898716
--- /dev/null
+++ b/src/rocksdb/util/channel.h
@@ -0,0 +1,67 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include <condition_variable>
+#include <mutex>
+#include <queue>
+#include <utility>
+
+#pragma once
+
+namespace rocksdb {
+
+template <class T>
+class channel {
+ public:
+  explicit channel() : eof_(false) {}
+
+  channel(const channel&) = delete;
+  void operator=(const channel&) = delete;
+
+  void sendEof() {
+    std::lock_guard<std::mutex> lk(lock_);
+    eof_ = true;
+    cv_.notify_all();
+  }
+
+  bool eof() {
+    std::lock_guard<std::mutex> lk(lock_);
+    return buffer_.empty() && eof_;
+  }
+
+  size_t size() const {
+    std::lock_guard<std::mutex> lk(lock_);
+    return buffer_.size();
+  }
+
+  // writes elem to the queue
+  void write(T&& elem) {
+    std::unique_lock<std::mutex> lk(lock_);
+    buffer_.emplace(std::forward<T>(elem));
+    cv_.notify_one();
+  }
+
+  /// Moves a dequeued element onto elem, blocking until an element
+  /// is available.
+  // returns false if EOF
+  bool read(T& elem) {
+    std::unique_lock<std::mutex> lk(lock_);
+    cv_.wait(lk, [&] { return eof_ || !buffer_.empty(); });
+    if (eof_ && buffer_.empty()) {
+      return false;
+    }
+    elem = std::move(buffer_.front());
+    buffer_.pop();
+    cv_.notify_one();
+    return true;
+  }
+
+ private:
+  std::condition_variable cv_;
+  mutable std::mutex lock_;  // mutable: locked by the const size() accessor
+  std::queue<T> buffer_;
+  bool eof_;
+};
+}  // namespace rocksdb
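A minimal producer/consumer sketch for the channel<T> added above (the
thread setup is illustrative):

    #include <thread>

    void Example() {
      rocksdb::channel<int> ch;
      std::thread producer([&ch] {
        for (int i = 0; i < 3; ++i) {
          ch.write(std::move(i));  // write() takes an rvalue
        }
        ch.sendEof();  // wakes any blocked read() once the queue drains
      });
      int v;
      while (ch.read(v)) {
        // consume v; read() returns false only after EOF with an empty queue
      }
      producer.join();
    }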
diff --git a/src/rocksdb/util/compaction_job_stats_impl.cc b/src/rocksdb/util/compaction_job_stats_impl.cc
new file mode 100644
index 0000000..01f022f
--- /dev/null
+++ b/src/rocksdb/util/compaction_job_stats_impl.cc
@@ -0,0 +1,80 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#include "rocksdb/compaction_job_stats.h"
+
+namespace rocksdb {
+
+#ifndef ROCKSDB_LITE
+
+void CompactionJobStats::Reset() {
+  elapsed_micros = 0;
+
+  num_input_records = 0;
+  num_input_files = 0;
+  num_input_files_at_output_level = 0;
+
+  num_output_records = 0;
+  num_output_files = 0;
+
+  is_manual_compaction = 0;
+
+  total_input_bytes = 0;
+  total_output_bytes = 0;
+
+  num_records_replaced = 0;
+
+  total_input_raw_key_bytes = 0;
+  total_input_raw_value_bytes = 0;
+
+  num_input_deletion_records = 0;
+  num_expired_deletion_records = 0;
+
+  num_corrupt_keys = 0;
+
+  file_write_nanos = 0;
+  file_range_sync_nanos = 0;
+  file_fsync_nanos = 0;
+  file_prepare_write_nanos = 0;
+}
+
+void CompactionJobStats::Add(const CompactionJobStats& stats) {
+  elapsed_micros += stats.elapsed_micros;
+
+  num_input_records += stats.num_input_records;
+  num_input_files += stats.num_input_files;
+  num_input_files_at_output_level += stats.num_input_files_at_output_level;
+
+  num_output_records += stats.num_output_records;
+  num_output_files += stats.num_output_files;
+
+  total_input_bytes += stats.total_input_bytes;
+  total_output_bytes += stats.total_output_bytes;
+
+  num_records_replaced += stats.num_records_replaced;
+
+  total_input_raw_key_bytes += stats.total_input_raw_key_bytes;
+  total_input_raw_value_bytes += stats.total_input_raw_value_bytes;
+
+  num_input_deletion_records += stats.num_input_deletion_records;
+  num_expired_deletion_records += stats.num_expired_deletion_records;
+
+  num_corrupt_keys += stats.num_corrupt_keys;
+
+  file_write_nanos += stats.file_write_nanos;
+  file_range_sync_nanos += stats.file_range_sync_nanos;
+  file_fsync_nanos += stats.file_fsync_nanos;
+  file_prepare_write_nanos += stats.file_prepare_write_nanos;
+}
+
+#else
+
+void CompactionJobStats::Reset() {}
+
+void CompactionJobStats::Add(const CompactionJobStats& stats) {}
+
+#endif  // !ROCKSDB_LITE
+
+}  // namespace rocksdb
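The intended use of Reset()/Add() is plain aggregation, e.g. folding
per-subcompaction stats into a job-wide total (the container name here is
hypothetical):

    rocksdb::CompactionJobStats total;
    total.Reset();
    for (const rocksdb::CompactionJobStats& s : per_subcompaction_stats) {
      total.Add(s);  // field-wise accumulation, as defined above
    }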
diff --git a/src/rocksdb/util/comparator.cc b/src/rocksdb/util/comparator.cc
index e606395..6d7709d 100644
--- a/src/rocksdb/util/comparator.cc
+++ b/src/rocksdb/util/comparator.cc
@@ -8,6 +8,7 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
 #include <algorithm>
+#include <memory>
 #include <stdint.h>
 #include "rocksdb/comparator.h"
 #include "rocksdb/slice.h"
@@ -31,6 +32,10 @@ class BytewiseComparatorImpl : public Comparator {
     return a.compare(b);
   }
 
+  virtual bool Equal(const Slice& a, const Slice& b) const override {
+    return a == b;
+  }
+
   virtual void FindShortestSeparator(std::string* start,
                                      const Slice& limit) const override {
     // Find length of common prefix
@@ -84,23 +89,14 @@ class ReverseBytewiseComparatorImpl : public BytewiseComparatorImpl {
 
 }// namespace
 
-static port::OnceType once = LEVELDB_ONCE_INIT;
-static const Comparator* bytewise;
-static const Comparator* rbytewise;
-
-static void InitModule() {
-  bytewise = new BytewiseComparatorImpl;
-  rbytewise= new ReverseBytewiseComparatorImpl;
-}
-
 const Comparator* BytewiseComparator() {
-  port::InitOnce(&once, InitModule);
-  return bytewise;
+  static BytewiseComparatorImpl bytewise;
+  return &bytewise;
 }
 
 const Comparator* ReverseBytewiseComparator() {
-  port::InitOnce(&once, InitModule);
-  return rbytewise;
+  static ReverseBytewiseComparatorImpl rbytewise;
+  return &rbytewise;
 }
 
 }  // namespace rocksdb
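This hunk swaps the explicit port::InitOnce() machinery for function-local
statics. Since C++11, initialization of a function-local static is
guaranteed to run exactly once even with concurrent callers ("magic
statics"), so no once-flag is needed. The pattern in isolation:

    const Comparator* Singleton() {
      // Initialized on first call; the compiler emits the required
      // synchronization, so racing first calls are safe in C++11.
      static BytewiseComparatorImpl instance;
      return &instance;
    }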
diff --git a/src/rocksdb/util/compression.h b/src/rocksdb/util/compression.h
index 36e36d5..dd4d013 100644
--- a/src/rocksdb/util/compression.h
+++ b/src/rocksdb/util/compression.h
@@ -11,6 +11,7 @@
 
 #include <algorithm>
 #include <limits>
+#include <string>
 
 #include "rocksdb/options.h"
 #include "util/coding.h"
@@ -32,6 +33,10 @@
 #include <lz4hc.h>
 #endif
 
+#if defined(ZSTD)
+#include <zstd.h>
+#endif
+
 namespace rocksdb {
 
 inline bool Snappy_Supported() {
@@ -62,6 +67,57 @@ inline bool LZ4_Supported() {
   return false;
 }
 
+inline bool ZSTD_Supported() {
+#ifdef ZSTD
+  return true;
+#endif
+  return false;
+}
+
+inline bool CompressionTypeSupported(CompressionType compression_type) {
+  switch (compression_type) {
+    case kNoCompression:
+      return true;
+    case kSnappyCompression:
+      return Snappy_Supported();
+    case kZlibCompression:
+      return Zlib_Supported();
+    case kBZip2Compression:
+      return BZip2_Supported();
+    case kLZ4Compression:
+      return LZ4_Supported();
+    case kLZ4HCCompression:
+      return LZ4_Supported();
+    case kZSTDNotFinalCompression:
+      return ZSTD_Supported();
+    default:
+      assert(false);
+      return false;
+  }
+}
+
+inline std::string CompressionTypeToString(CompressionType compression_type) {
+  switch (compression_type) {
+    case kNoCompression:
+      return "NoCompression";
+    case kSnappyCompression:
+      return "Snappy";
+    case kZlibCompression:
+      return "Zlib";
+    case kBZip2Compression:
+      return "BZip2";
+    case kLZ4Compression:
+      return "LZ4";
+    case kLZ4HCCompression:
+      return "LZ4HC";
+    case kZSTDNotFinalCompression:
+      return "ZSTD";
+    default:
+      assert(false);
+      return "";
+  }
+}
+
 // compress_format_version can have two values:
 // 1 -- decompressed sizes for BZip2 and Zlib are not included in the compressed
 // block. Also, decompressed sizes for LZ4 are encoded in platform-dependent
@@ -550,4 +606,47 @@ inline bool LZ4HC_Compress(const CompressionOptions& opts,
   return false;
 }
 
+inline bool ZSTD_Compress(const CompressionOptions& opts, const char* input,
+                          size_t length, ::std::string* output) {
+#ifdef ZSTD
+  if (length > std::numeric_limits<uint32_t>::max()) {
+    // Can't compress more than 4GB
+    return false;
+  }
+
+  size_t output_header_len = compression::PutDecompressedSizeInfo(
+      output, static_cast<uint32_t>(length));
+
+  size_t compressBound = ZSTD_compressBound(length);
+  output->resize(static_cast<size_t>(output_header_len + compressBound));
+  size_t outlen = ZSTD_compress(&(*output)[output_header_len], compressBound,
+                                input, length);
+  if (outlen == 0) {
+    return false;
+  }
+  output->resize(output_header_len + outlen);
+  return true;
+#endif
+  return false;
+}
+
+inline char* ZSTD_Uncompress(const char* input_data, size_t input_length,
+                             int* decompress_size) {
+#ifdef ZSTD
+  uint32_t output_len = 0;
+  if (!compression::GetDecompressedSizeInfo(&input_data, &input_length,
+                                            &output_len)) {
+    return nullptr;
+  }
+
+  char* output = new char[output_len];
+  size_t actual_output_length =
+      ZSTD_decompress(output, output_len, input_data, input_length);
+  assert(actual_output_length == output_len);
+  *decompress_size = static_cast<int>(actual_output_length);
+  return output;
+#endif
+  return nullptr;
+}
+
 }  // namespace rocksdb
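A small sketch of how the new capability helpers compose (the output
wording is illustrative):

    #include <cstdio>

    void PrintCodecSupport() {
      using namespace rocksdb;
      for (CompressionType t :
           {kNoCompression, kSnappyCompression, kZlibCompression,
            kBZip2Compression, kLZ4Compression, kLZ4HCCompression,
            kZSTDNotFinalCompression}) {
        std::printf("%-13s %s\n", CompressionTypeToString(t).c_str(),
                    CompressionTypeSupported(t) ? "supported" : "not built in");
      }
    }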
diff --git a/src/rocksdb/util/crc32c.cc b/src/rocksdb/util/crc32c.cc
index 8f1a09e..b8d281a 100644
--- a/src/rocksdb/util/crc32c.cc
+++ b/src/rocksdb/util/crc32c.cc
@@ -383,6 +383,14 @@ static inline Function Choose_Extend() {
   return isSSE42() ? ExtendImpl<Fast_CRC32> : ExtendImpl<Slow_CRC32>;
 }
 
+bool IsFastCrc32Supported() {
+#ifdef __SSE4_2__
+  return isSSE42();
+#else
+  return false;
+#endif
+}
+
 Function ChosenExtend = Choose_Extend();
 
 uint32_t Extend(uint32_t crc, const char* buf, size_t size) {
diff --git a/src/rocksdb/util/crc32c.h b/src/rocksdb/util/crc32c.h
index e5e6e14..14167c1 100644
--- a/src/rocksdb/util/crc32c.h
+++ b/src/rocksdb/util/crc32c.h
@@ -14,6 +14,8 @@
 namespace rocksdb {
 namespace crc32c {
 
+extern bool IsFastCrc32Supported();
+
 // Return the crc32c of concat(A, data[0,n-1]) where init_crc is the
 // crc32c of some string A.  Extend() is often used to maintain the
 // crc32c of a stream of data.
diff --git a/src/rocksdb/util/db_info_dumper.cc b/src/rocksdb/util/db_info_dumper.cc
index 9c70928..6cb978f 100644
--- a/src/rocksdb/util/db_info_dumper.cc
+++ b/src/rocksdb/util/db_info_dumper.cc
@@ -34,11 +34,11 @@ void DumpDBFileSummary(const DBOptions& options, const std::string& dbname) {
   uint64_t file_size;
   std::string file_info, wal_info;
 
-  Log(InfoLogLevel::INFO_LEVEL, options.info_log, "DB SUMMARY\n");
+  Header(options.info_log, "DB SUMMARY\n");
   // Get files in dbname dir
   if (!env->GetChildren(dbname, &files).ok()) {
-    Log(InfoLogLevel::ERROR_LEVEL,
-        options.info_log, "Error when reading %s dir\n", dbname.c_str());
+    Error(options.info_log,
+          "Error when reading %s dir\n", dbname.c_str());
   }
   std::sort(files.begin(), files.end());
   for (std::string file : files) {
@@ -47,25 +47,22 @@ void DumpDBFileSummary(const DBOptions& options, const std::string& dbname) {
     }
     switch (type) {
       case kCurrentFile:
-        Log(InfoLogLevel::INFO_LEVEL, options.info_log,
-            "CURRENT file:  %s\n", file.c_str());
+        Header(options.info_log, "CURRENT file:  %s\n", file.c_str());
         break;
       case kIdentityFile:
-        Log(InfoLogLevel::INFO_LEVEL, options.info_log,
-            "IDENTITY file:  %s\n", file.c_str());
+        Header(options.info_log, "IDENTITY file:  %s\n", file.c_str());
         break;
       case kDescriptorFile:
         env->GetFileSize(dbname + "/" + file, &file_size);
-        Log(InfoLogLevel::INFO_LEVEL, options.info_log,
-            "MANIFEST file:  %s size: %" PRIu64 " Bytes\n",
-            file.c_str(), file_size);
+        Header(options.info_log, "MANIFEST file:  %s size: %" PRIu64 " Bytes\n",
+               file.c_str(), file_size);
         break;
       case kLogFile:
         env->GetFileSize(dbname + "/" + file, &file_size);
-        char str[8];
+        char str[16];
         snprintf(str, sizeof(str), "%" PRIu64, file_size);
         wal_info.append(file).append(" size: ").
-            append(str, sizeof(str)).append(" ;");
+            append(str).append(" ; ");
         break;
       case kTableFile:
         if (++file_num < 10) {
@@ -81,7 +78,7 @@ void DumpDBFileSummary(const DBOptions& options, const std::string& dbname) {
   for (auto& db_path : options.db_paths) {
     if (dbname.compare(db_path.path) != 0) {
       if (!env->GetChildren(db_path.path, &files).ok()) {
-        Log(InfoLogLevel::ERROR_LEVEL, options.info_log,
+        Error(options.info_log,
             "Error when reading %s dir\n",
             db_path.path.c_str());
         continue;
@@ -95,9 +92,9 @@ void DumpDBFileSummary(const DBOptions& options, const std::string& dbname) {
         }
       }
     }
-    Log(InfoLogLevel::INFO_LEVEL, options.info_log,
-        "SST files in %s dir, Total Num: %" PRIu64 ", files: %s\n",
-        db_path.path.c_str(), file_num, file_info.c_str());
+    Header(options.info_log,
+           "SST files in %s dir, Total Num: %" PRIu64 ", files: %s\n",
+           db_path.path.c_str(), file_num, file_info.c_str());
     file_num = 0;
     file_info.clear();
   }
@@ -105,7 +102,7 @@ void DumpDBFileSummary(const DBOptions& options, const std::string& dbname) {
   // Get wal file in wal_dir
   if (dbname.compare(options.wal_dir) != 0) {
     if (!env->GetChildren(options.wal_dir, &files).ok()) {
-      Log(InfoLogLevel::ERROR_LEVEL, options.info_log,
+      Error(options.info_log,
           "Error when reading %s dir\n",
           options.wal_dir.c_str());
       return;
@@ -115,16 +112,15 @@ void DumpDBFileSummary(const DBOptions& options, const std::string& dbname) {
       if (ParseFileName(file, &number, &type)) {
         if (type == kLogFile) {
           env->GetFileSize(options.wal_dir + "/" + file, &file_size);
-          char str[8];
+          char str[16];
           snprintf(str, sizeof(str), "%" PRIu64, file_size);
           wal_info.append(file).append(" size: ").
-              append(str, sizeof(str)).append(" ;");
+              append(str).append(" ; ");
         }
       }
     }
   }
-  Log(InfoLogLevel::INFO_LEVEL, options.info_log,
-      "Write Ahead Log file in %s: %s\n",
-      options.wal_dir.c_str(), wal_info.c_str());
+  Header(options.info_log, "Write Ahead Log file in %s: %s\n",
+         options.wal_dir.c_str(), wal_info.c_str());
 }
 }  // namespace rocksdb
diff --git a/src/rocksdb/util/db_test_util.cc b/src/rocksdb/util/db_test_util.cc
new file mode 100644
index 0000000..6b494ea
--- /dev/null
+++ b/src/rocksdb/util/db_test_util.cc
@@ -0,0 +1,981 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/db_test_util.h"
+
+namespace rocksdb {
+
+// Special Env used to delay background operations
+
+SpecialEnv::SpecialEnv(Env* base)
+    : EnvWrapper(base),
+      rnd_(301),
+      sleep_counter_(this),
+      addon_time_(0),
+      no_sleep_(false) {
+  delay_sstable_sync_.store(false, std::memory_order_release);
+  drop_writes_.store(false, std::memory_order_release);
+  no_space_.store(false, std::memory_order_release);
+  non_writable_.store(false, std::memory_order_release);
+  count_random_reads_ = false;
+  count_sequential_reads_ = false;
+  manifest_sync_error_.store(false, std::memory_order_release);
+  manifest_write_error_.store(false, std::memory_order_release);
+  log_write_error_.store(false, std::memory_order_release);
+  random_file_open_counter_.store(0, std::memory_order_relaxed);
+  log_write_slowdown_ = 0;
+  bytes_written_ = 0;
+  sync_counter_ = 0;
+  non_writeable_rate_ = 0;
+  new_writable_count_ = 0;
+  non_writable_count_ = 0;
+  table_write_callback_ = nullptr;
+}
+
+
+DBTestBase::DBTestBase(const std::string path) : option_config_(kDefault),
+           mem_env_(!getenv("MEM_ENV") ? nullptr :
+                                         new MockEnv(Env::Default())),
+           env_(new SpecialEnv(mem_env_ ? mem_env_ : Env::Default())) {
+  env_->SetBackgroundThreads(1, Env::LOW);
+  env_->SetBackgroundThreads(1, Env::HIGH);
+  dbname_ = test::TmpDir(env_) + path;
+  alternative_wal_dir_ = dbname_ + "/wal";
+  alternative_db_log_dir_ = dbname_ + "/db_log_dir";
+  auto options = CurrentOptions();
+  auto delete_options = options;
+  delete_options.wal_dir = alternative_wal_dir_;
+  EXPECT_OK(DestroyDB(dbname_, delete_options));
+  // Also destroy with the default options, in case no alternative WAL dir was used.
+  EXPECT_OK(DestroyDB(dbname_, options));
+  db_ = nullptr;
+  Reopen(options);
+}
+
+DBTestBase::~DBTestBase() {
+// SyncPoint is not supported in Released Windows Mode.
+#if !(defined NDEBUG) || !defined(OS_WIN)
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+  rocksdb::SyncPoint::GetInstance()->LoadDependency({});
+  rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks();
+#endif  // !(defined NDEBUG) || !defined(OS_WIN)
+  Close();
+  Options options;
+  options.db_paths.emplace_back(dbname_, 0);
+  options.db_paths.emplace_back(dbname_ + "_2", 0);
+  options.db_paths.emplace_back(dbname_ + "_3", 0);
+  options.db_paths.emplace_back(dbname_ + "_4", 0);
+  EXPECT_OK(DestroyDB(dbname_, options));
+  delete env_;
+}
+
+// Switch to a fresh database with the next option configuration to
+// test.  Return false if there are no more configurations to test.
+bool DBTestBase::ChangeOptions(int skip_mask) {
+  for (option_config_++; option_config_ < kEnd; option_config_++) {
+    if ((skip_mask & kSkipDeletesFilterFirst) &&
+        option_config_ == kDeletesFilterFirst) {
+      continue;
+    }
+    if ((skip_mask & kSkipUniversalCompaction) &&
+        (option_config_ == kUniversalCompaction ||
+         option_config_ == kUniversalCompactionMultiLevel)) {
+      continue;
+    }
+    if ((skip_mask & kSkipMergePut) && option_config_ == kMergePut) {
+      continue;
+    }
+    if ((skip_mask & kSkipNoSeekToLast) &&
+        (option_config_ == kHashLinkList ||
+         option_config_ == kHashSkipList)) {
+      continue;
+    }
+    if ((skip_mask & kSkipPlainTable) &&
+        (option_config_ == kPlainTableAllBytesPrefix ||
+         option_config_ == kPlainTableFirstBytePrefix ||
+         option_config_ == kPlainTableCappedPrefix ||
+         option_config_ == kPlainTableCappedPrefixNonMmap)) {
+      continue;
+    }
+    if ((skip_mask & kSkipHashIndex) &&
+        (option_config_ == kBlockBasedTableWithPrefixHashIndex ||
+         option_config_ == kBlockBasedTableWithWholeKeyHashIndex)) {
+      continue;
+    }
+    if ((skip_mask & kSkipHashCuckoo) && (option_config_ == kHashCuckoo)) {
+      continue;
+    }
+    if ((skip_mask & kSkipFIFOCompaction) &&
+        option_config_ == kFIFOCompaction) {
+      continue;
+    }
+    if ((skip_mask & kSkipMmapReads) &&
+        option_config_ == kWalDirAndMmapReads) {
+      continue;
+    }
+    break;
+  }
+
+  if (option_config_ >= kEnd) {
+    Destroy(last_options_);
+    return false;
+  } else {
+    auto options = CurrentOptions();
+    options.create_if_missing = true;
+    DestroyAndReopen(options);
+    return true;
+  }
+}
+
+// Switch between different compaction styles.
+bool DBTestBase::ChangeCompactOptions() {
+  if (option_config_ == kDefault) {
+    option_config_ = kUniversalCompaction;
+    Destroy(last_options_);
+    auto options = CurrentOptions();
+    options.create_if_missing = true;
+    TryReopen(options);
+    return true;
+  } else if (option_config_ == kUniversalCompaction) {
+    option_config_ = kUniversalCompactionMultiLevel;
+    Destroy(last_options_);
+    auto options = CurrentOptions();
+    options.create_if_missing = true;
+    TryReopen(options);
+    return true;
+  } else if (option_config_ == kUniversalCompactionMultiLevel) {
+    option_config_ = kLevelSubcompactions;
+    Destroy(last_options_);
+    auto options = CurrentOptions();
+    assert(options.max_subcompactions > 1);
+    TryReopen(options);
+    return true;
+  } else if (option_config_ == kLevelSubcompactions) {
+    option_config_ = kUniversalSubcompactions;
+    Destroy(last_options_);
+    auto options = CurrentOptions();
+    assert(options.max_subcompactions > 1);
+    TryReopen(options);
+    return true;
+  } else {
+    return false;
+  }
+}
+
+// Switch between different filter policy
+// Jump from kDefault to kFilter to kFullFilter
+bool DBTestBase::ChangeFilterOptions() {
+  if (option_config_ == kDefault) {
+    option_config_ = kFilter;
+  } else if (option_config_ == kFilter) {
+    option_config_ = kFullFilterWithNewTableReaderForCompactions;
+  } else {
+    return false;
+  }
+  Destroy(last_options_);
+
+  auto options = CurrentOptions();
+  options.create_if_missing = true;
+  TryReopen(options);
+  return true;
+}
+
+// Return the current option configuration.
+Options DBTestBase::CurrentOptions(
+    const anon::OptionsOverride& options_override) {
+  Options options;
+  options.write_buffer_size = 4090 * 4096;
+  return CurrentOptions(options, options_override);
+}
+
+Options DBTestBase::CurrentOptions(
+    const Options& defaultOptions,
+    const anon::OptionsOverride& options_override) {
+  // This redundant copy keeps the code change minimal without tripping a lint error.
+  Options options = defaultOptions;
+  XFUNC_TEST("", "dbtest_options", inplace_options1, GetXFTestOptions,
+             reinterpret_cast<Options*>(&options),
+             options_override.skip_policy);
+  BlockBasedTableOptions table_options;
+  bool set_block_based_table_factory = true;
+  switch (option_config_) {
+    case kHashSkipList:
+      options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+      options.memtable_factory.reset(
+          NewHashSkipListRepFactory(16));
+      break;
+    case kPlainTableFirstBytePrefix:
+      options.table_factory.reset(new PlainTableFactory());
+      options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+      options.allow_mmap_reads = true;
+      options.max_sequential_skip_in_iterations = 999999;
+      set_block_based_table_factory = false;
+      break;
+    case kPlainTableCappedPrefix:
+      options.table_factory.reset(new PlainTableFactory());
+      options.prefix_extractor.reset(NewCappedPrefixTransform(8));
+      options.allow_mmap_reads = true;
+      options.max_sequential_skip_in_iterations = 999999;
+      set_block_based_table_factory = false;
+      break;
+    case kPlainTableCappedPrefixNonMmap:
+      options.table_factory.reset(new PlainTableFactory());
+      options.prefix_extractor.reset(NewCappedPrefixTransform(8));
+      options.allow_mmap_reads = false;
+      options.max_sequential_skip_in_iterations = 999999;
+      set_block_based_table_factory = false;
+      break;
+    case kPlainTableAllBytesPrefix:
+      options.table_factory.reset(new PlainTableFactory());
+      options.prefix_extractor.reset(NewNoopTransform());
+      options.allow_mmap_reads = true;
+      options.max_sequential_skip_in_iterations = 999999;
+      set_block_based_table_factory = false;
+      break;
+    case kMergePut:
+      options.merge_operator = MergeOperators::CreatePutOperator();
+      break;
+    case kFilter:
+      table_options.filter_policy.reset(NewBloomFilterPolicy(10, true));
+      break;
+    case kFullFilterWithNewTableReaderForCompactions:
+      table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+      options.new_table_reader_for_compaction_inputs = true;
+      options.compaction_readahead_size = 10 * 1024 * 1024;
+      break;
+    case kUncompressed:
+      options.compression = kNoCompression;
+      break;
+    case kNumLevel_3:
+      options.num_levels = 3;
+      break;
+    case kDBLogDir:
+      options.db_log_dir = alternative_db_log_dir_;
+      break;
+    case kWalDirAndMmapReads:
+      options.wal_dir = alternative_wal_dir_;
+      // mmap reads should be orthogonal to WalDir setting, so we piggyback to
+      // this option config to test mmap reads as well
+      options.allow_mmap_reads = true;
+      break;
+    case kManifestFileSize:
+      options.max_manifest_file_size = 50;  // 50 bytes
+    case kPerfOptions:
+      options.soft_rate_limit = 2.0;
+      options.delayed_write_rate = 8 * 1024 * 1024;
+      // TODO(3.13) -- test more options
+      break;
+    case kDeletesFilterFirst:
+      options.filter_deletes = true;
+      break;
+    case kVectorRep:
+      options.memtable_factory.reset(new VectorRepFactory(100));
+      break;
+    case kHashLinkList:
+      options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+      options.memtable_factory.reset(
+          NewHashLinkListRepFactory(4, 0, 3, true, 4));
+      break;
+    case kHashCuckoo:
+      options.memtable_factory.reset(
+          NewHashCuckooRepFactory(options.write_buffer_size));
+      break;
+    case kUniversalCompaction:
+      options.compaction_style = kCompactionStyleUniversal;
+      options.num_levels = 1;
+      break;
+    case kUniversalCompactionMultiLevel:
+      options.compaction_style = kCompactionStyleUniversal;
+      options.num_levels = 8;
+      break;
+    case kCompressedBlockCache:
+      options.allow_mmap_writes = true;
+      table_options.block_cache_compressed = NewLRUCache(8*1024*1024);
+      break;
+    case kInfiniteMaxOpenFiles:
+      options.max_open_files = -1;
+      break;
+    case kxxHashChecksum: {
+      table_options.checksum = kxxHash;
+      break;
+    }
+    case kFIFOCompaction: {
+      options.compaction_style = kCompactionStyleFIFO;
+      break;
+    }
+    case kBlockBasedTableWithPrefixHashIndex: {
+      table_options.index_type = BlockBasedTableOptions::kHashSearch;
+      options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+      break;
+    }
+    case kBlockBasedTableWithWholeKeyHashIndex: {
+      table_options.index_type = BlockBasedTableOptions::kHashSearch;
+      options.prefix_extractor.reset(NewNoopTransform());
+      break;
+    }
+    case kOptimizeFiltersForHits: {
+      options.optimize_filters_for_hits = true;
+      set_block_based_table_factory = true;
+      break;
+    }
+    case kRowCache: {
+      options.row_cache = NewLRUCache(1024 * 1024);
+      break;
+    }
+    case kLevelSubcompactions: {
+      options.max_subcompactions = 4;
+      break;
+    }
+    case kUniversalSubcompactions: {
+      options.compaction_style = kCompactionStyleUniversal;
+      options.num_levels = 8;
+      options.max_subcompactions = 4;
+      break;
+    }
+
+    default:
+      break;
+  }
+
+  if (options_override.filter_policy) {
+    table_options.filter_policy = options_override.filter_policy;
+  }
+  if (set_block_based_table_factory) {
+    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  }
+  options.env = env_;
+  options.create_if_missing = true;
+  return options;
+}
+
+void DBTestBase::CreateColumnFamilies(const std::vector<std::string>& cfs,
+                          const Options& options) {
+  ColumnFamilyOptions cf_opts(options);
+  size_t cfi = handles_.size();
+  handles_.resize(cfi + cfs.size());
+  for (auto cf : cfs) {
+    ASSERT_OK(db_->CreateColumnFamily(cf_opts, cf, &handles_[cfi++]));
+  }
+}
+
+void DBTestBase::CreateAndReopenWithCF(const std::vector<std::string>& cfs,
+                           const Options& options) {
+  CreateColumnFamilies(cfs, options);
+  std::vector<std::string> cfs_plus_default = cfs;
+  cfs_plus_default.insert(cfs_plus_default.begin(), kDefaultColumnFamilyName);
+  ReopenWithColumnFamilies(cfs_plus_default, options);
+}
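+
+// Typical usage (an illustrative sketch, not part of the original file; the
+// column family name is made up). Index 0 is the default CF, index 1 the
+// first CF passed in:
+//
+//   CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+//   ASSERT_OK(Put(1, "foo", "v1"));  // writes to "pikachu"
+//   ASSERT_EQ("v1", Get(1, "foo"));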
+
+void DBTestBase::ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+                              const std::vector<Options>& options) {
+  ASSERT_OK(TryReopenWithColumnFamilies(cfs, options));
+}
+
+void DBTestBase::ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+                              const Options& options) {
+  ASSERT_OK(TryReopenWithColumnFamilies(cfs, options));
+}
+
+Status DBTestBase::TryReopenWithColumnFamilies(
+    const std::vector<std::string>& cfs,
+    const std::vector<Options>& options) {
+  Close();
+  EXPECT_EQ(cfs.size(), options.size());
+  std::vector<ColumnFamilyDescriptor> column_families;
+  for (size_t i = 0; i < cfs.size(); ++i) {
+    column_families.push_back(ColumnFamilyDescriptor(cfs[i], options[i]));
+  }
+  DBOptions db_opts = DBOptions(options[0]);
+  return DB::Open(db_opts, dbname_, column_families, &handles_, &db_);
+}
+
+Status DBTestBase::TryReopenWithColumnFamilies(
+    const std::vector<std::string>& cfs,
+    const Options& options) {
+  Close();
+  std::vector<Options> v_opts(cfs.size(), options);
+  return TryReopenWithColumnFamilies(cfs, v_opts);
+}
+
+void DBTestBase::Reopen(const Options& options) {
+  ASSERT_OK(TryReopen(options));
+}
+
+void DBTestBase::Close() {
+  for (auto h : handles_) {
+    delete h;
+  }
+  handles_.clear();
+  delete db_;
+  db_ = nullptr;
+}
+
+void DBTestBase::DestroyAndReopen(const Options& options) {
+  // Destroy using last options
+  Destroy(last_options_);
+  ASSERT_OK(TryReopen(options));
+}
+
+void DBTestBase::Destroy(const Options& options) {
+  Close();
+  ASSERT_OK(DestroyDB(dbname_, options));
+}
+
+Status DBTestBase::ReadOnlyReopen(const Options& options) {
+  return DB::OpenForReadOnly(options, dbname_, &db_);
+}
+
+Status DBTestBase::TryReopen(const Options& options) {
+  Close();
+  last_options_ = options;
+  return DB::Open(options, dbname_, &db_);
+}
+
+Status DBTestBase::Flush(int cf) {
+  if (cf == 0) {
+    return db_->Flush(FlushOptions());
+  } else {
+    return db_->Flush(FlushOptions(), handles_[cf]);
+  }
+}
+
+Status DBTestBase::Put(const Slice& k, const Slice& v, WriteOptions wo) {
+  if (kMergePut == option_config_) {
+    return db_->Merge(wo, k, v);
+  } else {
+    return db_->Put(wo, k, v);
+  }
+}
+
+Status DBTestBase::Put(int cf, const Slice& k, const Slice& v,
+           WriteOptions wo) {
+  if (kMergePut == option_config_) {
+    return db_->Merge(wo, handles_[cf], k, v);
+  } else {
+    return db_->Put(wo, handles_[cf], k, v);
+  }
+}
+
+Status DBTestBase::Delete(const std::string& k) {
+  return db_->Delete(WriteOptions(), k);
+}
+
+Status DBTestBase::Delete(int cf, const std::string& k) {
+  return db_->Delete(WriteOptions(), handles_[cf], k);
+}
+
+Status DBTestBase::SingleDelete(const std::string& k) {
+  return db_->SingleDelete(WriteOptions(), k);
+}
+
+Status DBTestBase::SingleDelete(int cf, const std::string& k) {
+  return db_->SingleDelete(WriteOptions(), handles_[cf], k);
+}
+
+std::string DBTestBase::Get(const std::string& k, const Snapshot* snapshot) {
+  ReadOptions options;
+  options.verify_checksums = true;
+  options.snapshot = snapshot;
+  std::string result;
+  Status s = db_->Get(options, k, &result);
+  if (s.IsNotFound()) {
+    result = "NOT_FOUND";
+  } else if (!s.ok()) {
+    result = s.ToString();
+  }
+  return result;
+}
+
+std::string DBTestBase::Get(int cf, const std::string& k,
+                const Snapshot* snapshot) {
+  ReadOptions options;
+  options.verify_checksums = true;
+  options.snapshot = snapshot;
+  std::string result;
+  Status s = db_->Get(options, handles_[cf], k, &result);
+  if (s.IsNotFound()) {
+    result = "NOT_FOUND";
+  } else if (!s.ok()) {
+    result = s.ToString();
+  }
+  return result;
+}
+
+uint64_t DBTestBase::GetNumSnapshots() {
+  uint64_t int_num;
+  EXPECT_TRUE(dbfull()->GetIntProperty("rocksdb.num-snapshots", &int_num));
+  return int_num;
+}
+
+uint64_t DBTestBase::GetTimeOldestSnapshots() {
+  uint64_t int_num;
+  EXPECT_TRUE(
+      dbfull()->GetIntProperty("rocksdb.oldest-snapshot-time", &int_num));
+  return int_num;
+}
+
+// Return a string that contains all key,value pairs in order,
+// formatted like "(k1->v1)(k2->v2)".
+std::string DBTestBase::Contents(int cf) {
+  std::vector<std::string> forward;
+  std::string result;
+  Iterator* iter = (cf == 0) ? db_->NewIterator(ReadOptions())
+                             : db_->NewIterator(ReadOptions(), handles_[cf]);
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    std::string s = IterStatus(iter);
+    result.push_back('(');
+    result.append(s);
+    result.push_back(')');
+    forward.push_back(s);
+  }
+
+  // Check reverse iteration results are the reverse of forward results
+  unsigned int matched = 0;
+  for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
+    EXPECT_LT(matched, forward.size());
+    EXPECT_EQ(IterStatus(iter), forward[forward.size() - matched - 1]);
+    matched++;
+  }
+  EXPECT_EQ(matched, forward.size());
+
+  delete iter;
+  return result;
+}
+
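+// Returns a string listing every internal entry for user_key, newest first,
+// e.g. "[ v2, DEL, v1 ]"; DEL/SDEL mark deletion/single-deletion markers and
+// CORRUPTED marks an internal key that failed to parse.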
+std::string DBTestBase::AllEntriesFor(const Slice& user_key, int cf) {
+  Arena arena;
+  ScopedArenaIterator iter;
+  if (cf == 0) {
+    iter.set(dbfull()->TEST_NewInternalIterator(&arena));
+  } else {
+    iter.set(dbfull()->TEST_NewInternalIterator(&arena, handles_[cf]));
+  }
+  InternalKey target(user_key, kMaxSequenceNumber, kTypeValue);
+  iter->Seek(target.Encode());
+  std::string result;
+  if (!iter->status().ok()) {
+    result = iter->status().ToString();
+  } else {
+    result = "[ ";
+    bool first = true;
+    while (iter->Valid()) {
+      ParsedInternalKey ikey(Slice(), 0, kTypeValue);
+      if (!ParseInternalKey(iter->key(), &ikey)) {
+        result += "CORRUPTED";
+      } else {
+        if (!last_options_.comparator->Equal(ikey.user_key, user_key)) {
+          break;
+        }
+        if (!first) {
+          result += ", ";
+        }
+        first = false;
+        switch (ikey.type) {
+          case kTypeValue:
+            result += iter->value().ToString();
+            break;
+          case kTypeMerge:
+            // keep it the same as kTypeValue for testing kMergePut
+            result += iter->value().ToString();
+            break;
+          case kTypeDeletion:
+            result += "DEL";
+            break;
+          case kTypeSingleDeletion:
+            result += "SDEL";
+            break;
+          default:
+            assert(false);
+            break;
+        }
+      }
+      iter->Next();
+    }
+    if (!first) {
+      result += " ";
+    }
+    result += "]";
+  }
+  return result;
+}
+
+int DBTestBase::NumSortedRuns(int cf) {
+  ColumnFamilyMetaData cf_meta;
+  if (cf == 0) {
+    db_->GetColumnFamilyMetaData(&cf_meta);
+  } else {
+    db_->GetColumnFamilyMetaData(handles_[cf], &cf_meta);
+  }
+  int num_sr = static_cast<int>(cf_meta.levels[0].files.size());
+  for (size_t i = 1U; i < cf_meta.levels.size(); i++) {
+    if (cf_meta.levels[i].files.size() > 0) {
+      num_sr++;
+    }
+  }
+  return num_sr;
+}
+
+uint64_t DBTestBase::TotalSize(int cf) {
+  ColumnFamilyMetaData cf_meta;
+  if (cf == 0) {
+    db_->GetColumnFamilyMetaData(&cf_meta);
+  } else {
+    db_->GetColumnFamilyMetaData(handles_[cf], &cf_meta);
+  }
+  return cf_meta.size;
+}
+
+int DBTestBase::NumTableFilesAtLevel(int level, int cf) {
+  std::string property;
+  if (cf == 0) {
+    // default column family
+    EXPECT_TRUE(db_->GetProperty(
+        "rocksdb.num-files-at-level" + NumberToString(level), &property));
+  } else {
+    EXPECT_TRUE(db_->GetProperty(
+        handles_[cf], "rocksdb.num-files-at-level" + NumberToString(level),
+        &property));
+  }
+  return atoi(property.c_str());
+}
+
+uint64_t DBTestBase::SizeAtLevel(int level) {
+  std::vector<LiveFileMetaData> metadata;
+  db_->GetLiveFilesMetaData(&metadata);
+  uint64_t sum = 0;
+  for (const auto& m : metadata) {
+    if (m.level == level) {
+      sum += m.size;
+    }
+  }
+  return sum;
+}
+
+int DBTestBase::TotalLiveFiles(int cf) {
+  ColumnFamilyMetaData cf_meta;
+  if (cf == 0) {
+    db_->GetColumnFamilyMetaData(&cf_meta);
+  } else {
+    db_->GetColumnFamilyMetaData(handles_[cf], &cf_meta);
+  }
+  int num_files = 0;
+  for (auto& level : cf_meta.levels) {
+    num_files += level.files.size();
+  }
+  return num_files;
+}
+
+int DBTestBase::TotalTableFiles(int cf, int levels) {
+  if (levels == -1) {
+    levels = CurrentOptions().num_levels;
+  }
+  int result = 0;
+  for (int level = 0; level < levels; level++) {
+    result += NumTableFilesAtLevel(level, cf);
+  }
+  return result;
+}
+
+// Return spread of files per level
+std::string DBTestBase::FilesPerLevel(int cf) {
+  int num_levels =
+      (cf == 0) ? db_->NumberLevels() : db_->NumberLevels(handles_[cf]);
+  std::string result;
+  size_t last_non_zero_offset = 0;
+  for (int level = 0; level < num_levels; level++) {
+    int f = NumTableFilesAtLevel(level, cf);
+    char buf[100];
+    snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
+    result += buf;
+    if (f > 0) {
+      last_non_zero_offset = result.size();
+    }
+  }
+  result.resize(last_non_zero_offset);
+  return result;
+}
+
+size_t DBTestBase::CountFiles() {
+  std::vector<std::string> files;
+  env_->GetChildren(dbname_, &files);
+
+  std::vector<std::string> logfiles;
+  if (dbname_ != last_options_.wal_dir) {
+    env_->GetChildren(last_options_.wal_dir, &logfiles);
+  }
+
+  return files.size() + logfiles.size();
+}
+
+size_t DBTestBase::CountLiveFiles() {
+  std::vector<LiveFileMetaData> metadata;
+  db_->GetLiveFilesMetaData(&metadata);
+  return metadata.size();
+}
+
+uint64_t DBTestBase::Size(const Slice& start, const Slice& limit, int cf) {
+  Range r(start, limit);
+  uint64_t size;
+  if (cf == 0) {
+    db_->GetApproximateSizes(&r, 1, &size);
+  } else {
+    db_->GetApproximateSizes(handles_[cf], &r, 1, &size);
+  }
+  return size;
+}
+
+void DBTestBase::Compact(int cf, const Slice& start, const Slice& limit,
+             uint32_t target_path_id) {
+  CompactRangeOptions compact_options;
+  compact_options.target_path_id = target_path_id;
+  ASSERT_OK(db_->CompactRange(compact_options, handles_[cf], &start, &limit));
+}
+
+void DBTestBase::Compact(int cf, const Slice& start, const Slice& limit) {
+  ASSERT_OK(
+      db_->CompactRange(CompactRangeOptions(), handles_[cf], &start, &limit));
+}
+
+void DBTestBase::Compact(const Slice& start, const Slice& limit) {
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, &limit));
+}
+
+// Do n memtable compactions, each of which produces an sstable
+// covering the range [small,large].
+void DBTestBase::MakeTables(
+    int n, const std::string& small,
+    const std::string& large, int cf) {
+  for (int i = 0; i < n; i++) {
+    ASSERT_OK(Put(cf, small, "begin"));
+    ASSERT_OK(Put(cf, large, "end"));
+    ASSERT_OK(Flush(cf));
+    MoveFilesToLevel(n - i - 1, cf);
+  }
+}
+
+// Prevent pushing of new sstables into deeper levels by adding
+// tables that cover a specified range to all levels.
+void DBTestBase::FillLevels(
+    const std::string& smallest, const std::string& largest, int cf) {
+  MakeTables(db_->NumberLevels(handles_[cf]), smallest, largest, cf);
+}
+
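+// Compact one level at a time so that all files end up on the given level
+// (a helper for setting up a specific LSM shape in a test).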
+void DBTestBase::MoveFilesToLevel(int level, int cf) {
+  for (int l = 0; l < level; ++l) {
+    if (cf > 0) {
+      dbfull()->TEST_CompactRange(l, nullptr, nullptr, handles_[cf]);
+    } else {
+      dbfull()->TEST_CompactRange(l, nullptr, nullptr);
+    }
+  }
+}
+
+void DBTestBase::DumpFileCounts(const char* label) {
+  fprintf(stderr, "---\n%s:\n", label);
+  fprintf(stderr, "maxoverlap: %" PRIu64 "\n",
+      dbfull()->TEST_MaxNextLevelOverlappingBytes());
+  for (int level = 0; level < db_->NumberLevels(); level++) {
+    int num = NumTableFilesAtLevel(level);
+    if (num > 0) {
+      fprintf(stderr, "  level %3d : %d files\n", level, num);
+    }
+  }
+}
+
+std::string DBTestBase::DumpSSTableList() {
+  std::string property;
+  db_->GetProperty("rocksdb.sstables", &property);
+  return property;
+}
+
+void DBTestBase::GetSstFiles(std::string path,
+                             std::vector<std::string>* files) {
+  env_->GetChildren(path, files);
+
+  files->erase(
+      std::remove_if(files->begin(), files->end(), [](std::string name) {
+        uint64_t number;
+        FileType type;
+        return !(ParseFileName(name, &number, &type) && type == kTableFile);
+      }), files->end());
+}
+
+int DBTestBase::GetSstFileCount(std::string path) {
+  std::vector<std::string> files;
+  GetSstFiles(path, &files);
+  return static_cast<int>(files.size());
+}
+
+// this will generate non-overlapping files since it keeps increasing key_idx
+void DBTestBase::GenerateNewFile(int cf, Random* rnd, int* key_idx,
+                                 bool nowait) {
+  for (int i = 0; i < 100; i++) {
+    ASSERT_OK(Put(cf, Key(*key_idx), RandomString(rnd, (i == 99) ? 1 : 990)));
+    (*key_idx)++;
+  }
+  if (!nowait) {
+    dbfull()->TEST_WaitForFlushMemTable();
+    dbfull()->TEST_WaitForCompact();
+  }
+}
+
+// this will generate non-overlapping files since it keeps increasing key_idx
+void DBTestBase::GenerateNewFile(Random* rnd, int* key_idx, bool nowait) {
+  for (int i = 0; i < 100; i++) {
+    ASSERT_OK(Put(Key(*key_idx), RandomString(rnd, (i == 99) ? 1 : 990)));
+    (*key_idx)++;
+  }
+  if (!nowait) {
+    dbfull()->TEST_WaitForFlushMemTable();
+    dbfull()->TEST_WaitForCompact();
+  }
+}
+
+void DBTestBase::GenerateNewRandomFile(Random* rnd, bool nowait) {
+  for (int i = 0; i < 51; i++) {
+    ASSERT_OK(Put("key" + RandomString(rnd, 7), RandomString(rnd, 2000)));
+  }
+  ASSERT_OK(Put("key" + RandomString(rnd, 7), RandomString(rnd, 200)));
+  if (!nowait) {
+    dbfull()->TEST_WaitForFlushMemTable();
+    dbfull()->TEST_WaitForCompact();
+  }
+}
+
+std::string DBTestBase::IterStatus(Iterator* iter) {
+  std::string result;
+  if (iter->Valid()) {
+    result = iter->key().ToString() + "->" + iter->value().ToString();
+  } else {
+    result = "(invalid)";
+  }
+  return result;
+}
+
+Options DBTestBase::OptionsForLogIterTest() {
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  options.WAL_ttl_seconds = 1000;
+  return options;
+}
+
+std::string DBTestBase::DummyString(size_t len, char c) {
+  return std::string(len, c);
+}
+
+void DBTestBase::VerifyIterLast(std::string expected_key, int cf) {
+  Iterator* iter;
+  ReadOptions ro;
+  if (cf == 0) {
+    iter = db_->NewIterator(ro);
+  } else {
+    iter = db_->NewIterator(ro, handles_[cf]);
+  }
+  iter->SeekToLast();
+  ASSERT_EQ(IterStatus(iter), expected_key);
+  delete iter;
+}
+
+// Used to test InplaceUpdate
+
+// If the previous value is nullptr, sets newValue to a string of
+// delta.size() 'c' characters.
+// Otherwise, overwrites the previous value in place with a string of 'b'
+// characters one byte shorter than the previous value.
+UpdateStatus DBTestBase::updateInPlaceSmallerSize(
+    char* prevValue, uint32_t* prevSize,
+    Slice delta, std::string* newValue) {
+  if (prevValue == nullptr) {
+    *newValue = std::string(delta.size(), 'c');
+    return UpdateStatus::UPDATED;
+  } else {
+    *prevSize = *prevSize - 1;
+    std::string str_b = std::string(*prevSize, 'b');
+    memcpy(prevValue, str_b.c_str(), str_b.size());
+    return UpdateStatus::UPDATED_INPLACE;
+  }
+}
+
+UpdateStatus DBTestBase::updateInPlaceSmallerVarintSize(
+    char* prevValue, uint32_t* prevSize,
+    Slice delta, std::string* newValue) {
+  if (prevValue == nullptr) {
+    *newValue = std::string(delta.size(), 'c');
+    return UpdateStatus::UPDATED;
+  } else {
+    *prevSize = 1;
+    std::string str_b = std::string(*prevSize, 'b');
+    memcpy(prevValue, str_b.c_str(), str_b.size());
+    return UpdateStatus::UPDATED_INPLACE;
+  }
+}
+
+UpdateStatus DBTestBase::updateInPlaceLargerSize(
+    char* prevValue, uint32_t* prevSize,
+    Slice delta, std::string* newValue) {
+  *newValue = std::string(delta.size(), 'c');
+  return UpdateStatus::UPDATED;
+}
+
+UpdateStatus DBTestBase::updateInPlaceNoAction(
+    char* prevValue, uint32_t* prevSize,
+    Slice delta, std::string* newValue) {
+  return UpdateStatus::UPDATE_FAILED;
+}
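+
+// These callbacks are meant to be wired into the in-place update options
+// (a sketch, assuming a test sets the usual fields):
+//
+//   Options options = CurrentOptions();
+//   options.inplace_update_support = true;
+//   options.inplace_callback = DBTestBase::updateInPlaceSmallerSize;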
+
+// Utility method to test InplaceUpdate
+void DBTestBase::validateNumberOfEntries(int numValues, int cf) {
+  ScopedArenaIterator iter;
+  Arena arena;
+  if (cf != 0) {
+    iter.set(dbfull()->TEST_NewInternalIterator(&arena, handles_[cf]));
+  } else {
+    iter.set(dbfull()->TEST_NewInternalIterator(&arena));
+  }
+  iter->SeekToFirst();
+  ASSERT_EQ(iter->status().ok(), true);
+  int seq = numValues;
+  while (iter->Valid()) {
+    ParsedInternalKey ikey;
+    ikey.sequence = -1;
+    ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true);
+
+    // checks sequence number for updates
+    ASSERT_EQ(ikey.sequence, (unsigned)seq--);
+    iter->Next();
+  }
+  ASSERT_EQ(0, seq);
+}
+
+void DBTestBase::CopyFile(
+    const std::string& source, const std::string& destination,
+    uint64_t size) {
+  const EnvOptions soptions;
+  unique_ptr<SequentialFile> srcfile;
+  ASSERT_OK(env_->NewSequentialFile(source, &srcfile, soptions));
+  unique_ptr<WritableFile> destfile;
+  ASSERT_OK(env_->NewWritableFile(destination, &destfile, soptions));
+
+  if (size == 0) {
+    // default argument means copy everything
+    ASSERT_OK(env_->GetFileSize(source, &size));
+  }
+
+  char buffer[4096];
+  Slice slice;
+  while (size > 0) {
+    uint64_t one = std::min(uint64_t(sizeof(buffer)), size);
+    ASSERT_OK(srcfile->Read(one, &slice, buffer));
+    ASSERT_OK(destfile->Append(slice));
+    size -= slice.size();
+  }
+  ASSERT_OK(destfile->Close());
+}
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/db_test_util.h b/src/rocksdb/util/db_test_util.h
new file mode 100644
index 0000000..774cce8
--- /dev/null
+++ b/src/rocksdb/util/db_test_util.h
@@ -0,0 +1,669 @@
+// Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include <fcntl.h>
+#include <inttypes.h>
+#ifndef OS_WIN
+#include <unistd.h>
+#endif
+
+#include <algorithm>
+#include <set>
+#include <string>
+#include <thread>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "db/db_impl.h"
+#include "db/dbformat.h"
+#include "db/filename.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/table.h"
+#include "rocksdb/utilities/checkpoint.h"
+#include "table/block_based_table_factory.h"
+#include "table/mock_table.h"
+#include "table/plain_table_factory.h"
+#include "util/compression.h"
+#include "util/db_test_util.h"
+#include "util/hash_linklist_rep.h"
+#include "util/mock_env.h"
+#include "util/mutexlock.h"
+#include "util/scoped_arena_iterator.h"
+#include "util/string_util.h"
+// SyncPoint is not supported in Released Windows Mode.
+#if !(defined NDEBUG) || !defined(OS_WIN)
+#include "util/sync_point.h"
+#endif  // !(defined NDEBUG) || !defined(OS_WIN)
+#include "util/testharness.h"
+#include "util/testutil.h"
+#include "util/xfunc.h"
+#include "utilities/merge_operators.h"
+
+namespace rocksdb {
+
+namespace anon {
+class AtomicCounter {
+ public:
+  explicit AtomicCounter(Env* env = NULL)
+      : env_(env), cond_count_(&mu_), count_(0) {}
+
+  void Increment() {
+    MutexLock l(&mu_);
+    count_++;
+    cond_count_.SignalAll();
+  }
+
+  int Read() {
+    MutexLock l(&mu_);
+    return count_;
+  }
+
+  bool WaitFor(int count) {
+    MutexLock l(&mu_);
+
+    uint64_t start = env_->NowMicros();
+    while (count_ < count) {
+      uint64_t now = env_->NowMicros();
+      cond_count_.TimedWait(now + /*1s*/ 1 * 1000 * 1000);
+      if (env_->NowMicros() - start > /*10s*/ 10 * 1000 * 1000) {
+        return false;
+      }
+      if (count_ < count) {
+        GTEST_LOG_(WARNING) << "WaitFor is taking more time than usual";
+      }
+    }
+
+    return true;
+  }
+
+  void Reset() {
+    MutexLock l(&mu_);
+    count_ = 0;
+    cond_count_.SignalAll();
+  }
+
+ private:
+  Env* env_;
+  port::Mutex mu_;
+  port::CondVar cond_count_;
+  int count_;
+};
+
+struct OptionsOverride {
+  std::shared_ptr<const FilterPolicy> filter_policy = nullptr;
+
+  // Bit mask selecting which XF test points to skip
+  int skip_policy = 0;
+};
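+
+// Example (an illustrative sketch): override the filter policy for whatever
+// configuration is currently active:
+//
+//   anon::OptionsOverride options_override;
+//   options_override.filter_policy.reset(NewBloomFilterPolicy(20));
+//   Options options = CurrentOptions(options_override);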
+
+}  // namespace anon
+
+// Special Env used to delay background operations
+class SpecialEnv : public EnvWrapper {
+ public:
+  explicit SpecialEnv(Env* base);
+
+  Status NewWritableFile(const std::string& f, unique_ptr<WritableFile>* r,
+                         const EnvOptions& soptions) override {
+    class SSTableFile : public WritableFile {
+     private:
+      SpecialEnv* env_;
+      unique_ptr<WritableFile> base_;
+
+     public:
+      SSTableFile(SpecialEnv* env, unique_ptr<WritableFile>&& base)
+          : env_(env),
+            base_(std::move(base)) {
+      }
+      Status Append(const Slice& data) override {
+        if (env_->table_write_callback_) {
+          (*env_->table_write_callback_)();
+        }
+        if (env_->drop_writes_.load(std::memory_order_acquire)) {
+          // Drop writes on the floor
+          return Status::OK();
+        } else if (env_->no_space_.load(std::memory_order_acquire)) {
+          return Status::IOError("No space left on device");
+        } else {
+          env_->bytes_written_ += data.size();
+          return base_->Append(data);
+        }
+      }
+      Status Truncate(uint64_t size) override {
+        return base_->Truncate(size);
+      }
+      Status Close() override {
+// SyncPoint is not supported in Released Windows Mode.
+#if !(defined NDEBUG) || !defined(OS_WIN)
+        // Check the preallocation size; it is never passed down to the
+        // base file.
+        size_t preallocation_size = preallocation_block_size();
+        TEST_SYNC_POINT_CALLBACK("DBTestWritableFile.GetPreallocationStatus",
+                                 &preallocation_size);
+#endif  // !(defined NDEBUG) || !defined(OS_WIN)
+        return base_->Close();
+      }
+      Status Flush() override { return base_->Flush(); }
+      Status Sync() override {
+        ++env_->sync_counter_;
+        while (env_->delay_sstable_sync_.load(std::memory_order_acquire)) {
+          env_->SleepForMicroseconds(100000);
+        }
+        return base_->Sync();
+      }
+      void SetIOPriority(Env::IOPriority pri) override {
+        base_->SetIOPriority(pri);
+      }
+      Env::IOPriority GetIOPriority() override {
+        return base_->GetIOPriority();
+      }
+    };
+    class ManifestFile : public WritableFile {
+     public:
+      ManifestFile(SpecialEnv* env, unique_ptr<WritableFile>&& b)
+          : env_(env), base_(std::move(b)) { }
+      Status Append(const Slice& data) override {
+        if (env_->manifest_write_error_.load(std::memory_order_acquire)) {
+          return Status::IOError("simulated writer error");
+        } else {
+          return base_->Append(data);
+        }
+      }
+      Status Truncate(uint64_t size) override { return base_->Truncate(size); }
+      Status Close() override { return base_->Close(); }
+      Status Flush() override { return base_->Flush(); }
+      Status Sync() override {
+        ++env_->sync_counter_;
+        if (env_->manifest_sync_error_.load(std::memory_order_acquire)) {
+          return Status::IOError("simulated sync error");
+        } else {
+          return base_->Sync();
+        }
+      }
+      uint64_t GetFileSize() override { return base_->GetFileSize(); }
+
+     private:
+      SpecialEnv* env_;
+      unique_ptr<WritableFile> base_;
+    };
+    class WalFile : public WritableFile {
+     public:
+      WalFile(SpecialEnv* env, unique_ptr<WritableFile>&& b)
+          : env_(env), base_(std::move(b)) {}
+      Status Append(const Slice& data) override {
+#if !(defined NDEBUG) || !defined(OS_WIN)
+        TEST_SYNC_POINT("SpecialEnv::WalFile::Append:1");
+#endif
+        Status s;
+        if (env_->log_write_error_.load(std::memory_order_acquire)) {
+          s = Status::IOError("simulated writer error");
+        } else {
+          int slowdown =
+              env_->log_write_slowdown_.load(std::memory_order_acquire);
+          if (slowdown > 0) {
+            env_->SleepForMicroseconds(slowdown);
+          }
+          s = base_->Append(data);
+        }
+#if !(defined NDEBUG) || !defined(OS_WIN)
+        TEST_SYNC_POINT("SpecialEnv::WalFile::Append:2");
+#endif
+        return s;
+      }
+      Status Truncate(uint64_t size) override { return base_->Truncate(size); }
+      Status Close() override { return base_->Close(); }
+      Status Flush() override { return base_->Flush(); }
+      Status Sync() override {
+        ++env_->sync_counter_;
+        return base_->Sync();
+      }
+      bool IsSyncThreadSafe() const override {
+        return env_->is_wal_sync_thread_safe_.load();
+      }
+
+     private:
+      SpecialEnv* env_;
+      unique_ptr<WritableFile> base_;
+    };
+
+    if (non_writeable_rate_.load(std::memory_order_acquire) > 0) {
+      uint32_t random_number;
+      {
+        MutexLock l(&rnd_mutex_);
+        random_number = rnd_.Uniform(100);
+      }
+      if (random_number < non_writeable_rate_.load()) {
+        return Status::IOError("simulated random write error");
+      }
+    }
+
+    new_writable_count_++;
+
+    if (non_writable_count_.load() > 0) {
+      non_writable_count_--;
+      return Status::IOError("simulated write error");
+    }
+
+    Status s = target()->NewWritableFile(f, r, soptions);
+    if (s.ok()) {
+      if (strstr(f.c_str(), ".sst") != nullptr) {
+        r->reset(new SSTableFile(this, std::move(*r)));
+      } else if (strstr(f.c_str(), "MANIFEST") != nullptr) {
+        r->reset(new ManifestFile(this, std::move(*r)));
+      } else if (strstr(f.c_str(), "log") != nullptr) {
+        r->reset(new WalFile(this, std::move(*r)));
+      }
+    }
+    return s;
+  }
+
+  Status NewRandomAccessFile(const std::string& f,
+                             unique_ptr<RandomAccessFile>* r,
+                             const EnvOptions& soptions) override {
+    class CountingFile : public RandomAccessFile {
+     public:
+      CountingFile(unique_ptr<RandomAccessFile>&& target,
+                   anon::AtomicCounter* counter)
+          : target_(std::move(target)), counter_(counter) {
+      }
+      virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                          char* scratch) const override {
+        counter_->Increment();
+        return target_->Read(offset, n, result, scratch);
+      }
+
+     private:
+      unique_ptr<RandomAccessFile> target_;
+      anon::AtomicCounter* counter_;
+    };
+
+    Status s = target()->NewRandomAccessFile(f, r, soptions);
+    random_file_open_counter_++;
+    if (s.ok() && count_random_reads_) {
+      r->reset(new CountingFile(std::move(*r), &random_read_counter_));
+    }
+    return s;
+  }
+
+  Status NewSequentialFile(const std::string& f, unique_ptr<SequentialFile>* r,
+                           const EnvOptions& soptions) override {
+    class CountingFile : public SequentialFile {
+     public:
+      CountingFile(unique_ptr<SequentialFile>&& target,
+                   anon::AtomicCounter* counter)
+          : target_(std::move(target)), counter_(counter) {}
+      virtual Status Read(size_t n, Slice* result, char* scratch) override {
+        counter_->Increment();
+        return target_->Read(n, result, scratch);
+      }
+      virtual Status Skip(uint64_t n) override { return target_->Skip(n); }
+
+     private:
+      unique_ptr<SequentialFile> target_;
+      anon::AtomicCounter* counter_;
+    };
+
+    Status s = target()->NewSequentialFile(f, r, soptions);
+    if (s.ok() && count_sequential_reads_) {
+      r->reset(new CountingFile(std::move(*r), &sequential_read_counter_));
+    }
+    return s;
+  }
+
+  virtual void SleepForMicroseconds(int micros) override {
+    sleep_counter_.Increment();
+    if (no_sleep_) {
+      addon_time_.fetch_add(micros);
+    } else {
+      target()->SleepForMicroseconds(micros);
+    }
+  }
+
+  virtual Status GetCurrentTime(int64_t* unix_time) override {
+    Status s = target()->GetCurrentTime(unix_time);
+    if (s.ok()) {
+      *unix_time += addon_time_.load();
+    }
+    return s;
+  }
+
+  virtual uint64_t NowNanos() override {
+    return target()->NowNanos() + addon_time_.load() * 1000;
+  }
+
+  virtual uint64_t NowMicros() override {
+    return target()->NowMicros() + addon_time_.load();
+  }
+
+  Random rnd_;
+  port::Mutex rnd_mutex_;  // Lock to protect rnd_
+
+  // sstable Sync() calls are blocked while this is true.
+  std::atomic<bool> delay_sstable_sync_;
+
+  // Drop writes on the floor while this is true.
+  std::atomic<bool> drop_writes_;
+
+  // Simulate no-space errors while this is true.
+  std::atomic<bool> no_space_;
+
+  // Simulate a non-writable file system while this is true.
+  std::atomic<bool> non_writable_;
+
+  // Force syncs of manifest files to fail while this is true.
+  std::atomic<bool> manifest_sync_error_;
+
+  // Force writes to manifest files to fail while this is true.
+  std::atomic<bool> manifest_write_error_;
+
+  // Force writes to log files to fail while this is true.
+  std::atomic<bool> log_write_error_;
+
+  // Slow down every log write, in micro-seconds.
+  std::atomic<int> log_write_slowdown_;
+
+  bool count_random_reads_;
+  anon::AtomicCounter random_read_counter_;
+  std::atomic<int> random_file_open_counter_;
+
+  bool count_sequential_reads_;
+  anon::AtomicCounter sequential_read_counter_;
+
+  anon::AtomicCounter sleep_counter_;
+
+  std::atomic<int64_t> bytes_written_;
+
+  std::atomic<int> sync_counter_;
+
+  std::atomic<uint32_t> non_writeable_rate_;
+
+  std::atomic<uint32_t> new_writable_count_;
+
+  std::atomic<uint32_t> non_writable_count_;
+
+  std::function<void()>* table_write_callback_;
+
+  std::atomic<int64_t> addon_time_;
+  bool no_sleep_;
+
+  std::atomic<bool> is_wal_sync_thread_safe_ {true};
+};
+
+class DBTestBase : public testing::Test {
+ protected:
+  // Sequence of option configurations to try
+  enum OptionConfig {
+    kDefault = 0,
+    kBlockBasedTableWithPrefixHashIndex = 1,
+    kBlockBasedTableWithWholeKeyHashIndex = 2,
+    kPlainTableFirstBytePrefix = 3,
+    kPlainTableCappedPrefix = 4,
+    kPlainTableCappedPrefixNonMmap = 5,
+    kPlainTableAllBytesPrefix = 6,
+    kVectorRep = 7,
+    kHashLinkList = 8,
+    kHashCuckoo = 9,
+    kMergePut = 10,
+    kFilter = 11,
+    kFullFilterWithNewTableReaderForCompactions = 12,
+    kUncompressed = 13,
+    kNumLevel_3 = 14,
+    kDBLogDir = 15,
+    kWalDirAndMmapReads = 16,
+    kManifestFileSize = 17,
+    kPerfOptions = 18,
+    kDeletesFilterFirst = 19,
+    kHashSkipList = 20,
+    kUniversalCompaction = 21,
+    kUniversalCompactionMultiLevel = 22,
+    kCompressedBlockCache = 23,
+    kInfiniteMaxOpenFiles = 24,
+    kxxHashChecksum = 25,
+    kFIFOCompaction = 26,
+    kOptimizeFiltersForHits = 27,
+    kRowCache = 28,
+    kLevelSubcompactions = 29,
+    kUniversalSubcompactions = 30,
+    kEnd = 31
+  };
+  int option_config_;
+
+ public:
+  std::string dbname_;
+  std::string alternative_wal_dir_;
+  std::string alternative_db_log_dir_;
+  MockEnv* mem_env_;
+  SpecialEnv* env_;
+  DB* db_;
+  std::vector<ColumnFamilyHandle*> handles_;
+
+  Options last_options_;
+
+  // Skip some options, as they may not be applicable to a specific test.
+  // To add more skip constants, use values 4, 8, 16, etc.
+  enum OptionSkip {
+    kNoSkip = 0,
+    kSkipDeletesFilterFirst = 1,
+    kSkipUniversalCompaction = 2,
+    kSkipMergePut = 4,
+    kSkipPlainTable = 8,
+    kSkipHashIndex = 16,
+    kSkipNoSeekToLast = 32,
+    kSkipHashCuckoo = 64,
+    kSkipFIFOCompaction = 128,
+    kSkipMmapReads = 256,
+  };
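+
+  // Example (sketch): a test that cannot run under plain table or cuckoo
+  // memtables might loop with
+  //   do { ... } while (ChangeOptions(kSkipPlainTable | kSkipHashCuckoo));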
+
+  explicit DBTestBase(const std::string path);
+
+  ~DBTestBase();
+
+  static std::string RandomString(Random* rnd, int len) {
+    std::string r;
+    test::RandomString(rnd, len, &r);
+    return r;
+  }
+
+  static std::string Key(int i) {
+    char buf[100];
+    snprintf(buf, sizeof(buf), "key%06d", i);
+    return std::string(buf);
+  }
+
+  // Switch to a fresh database with the next option configuration to
+  // test.  Return false if there are no more configurations to test.
+  bool ChangeOptions(int skip_mask = kNoSkip);
+
+  // Switch between different compaction styles (we have only 2 now).
+  bool ChangeCompactOptions();
+
+  // Switch between different filter policy
+  // Jump from kDefault to kFilter to kFullFilter
+  bool ChangeFilterOptions();
+
+  // Return the current option configuration.
+  Options CurrentOptions(
+      const anon::OptionsOverride& options_override = anon::OptionsOverride());
+
+  Options CurrentOptions(
+      const Options& defaultOptions,
+      const anon::OptionsOverride& options_override = anon::OptionsOverride());
+
+  DBImpl* dbfull() {
+    return reinterpret_cast<DBImpl*>(db_);
+  }
+
+  void CreateColumnFamilies(const std::vector<std::string>& cfs,
+                            const Options& options);
+
+  void CreateAndReopenWithCF(const std::vector<std::string>& cfs,
+                             const Options& options);
+
+  void ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+                                const std::vector<Options>& options);
+
+  void ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+                                const Options& options);
+
+  Status TryReopenWithColumnFamilies(
+      const std::vector<std::string>& cfs,
+      const std::vector<Options>& options);
+
+  Status TryReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+                                     const Options& options);
+
+  void Reopen(const Options& options);
+
+  void Close();
+
+  void DestroyAndReopen(const Options& options);
+
+  void Destroy(const Options& options);
+
+  Status ReadOnlyReopen(const Options& options);
+
+  Status TryReopen(const Options& options);
+
+  Status Flush(int cf = 0);
+
+  Status Put(const Slice& k, const Slice& v, WriteOptions wo = WriteOptions());
+
+  Status Put(int cf, const Slice& k, const Slice& v,
+             WriteOptions wo = WriteOptions());
+
+  Status Delete(const std::string& k);
+
+  Status Delete(int cf, const std::string& k);
+
+  Status SingleDelete(const std::string& k);
+
+  Status SingleDelete(int cf, const std::string& k);
+
+  std::string Get(const std::string& k, const Snapshot* snapshot = nullptr);
+
+  std::string Get(int cf, const std::string& k,
+                  const Snapshot* snapshot = nullptr);
+
+  uint64_t GetNumSnapshots();
+
+  uint64_t GetTimeOldestSnapshots();
+
+  // Return a string that contains all key,value pairs in order,
+  // formatted like "(k1->v1)(k2->v2)".
+  std::string Contents(int cf = 0);
+
+  std::string AllEntriesFor(const Slice& user_key, int cf = 0);
+
+  int NumSortedRuns(int cf = 0);
+
+  uint64_t TotalSize(int cf = 0);
+
+  int NumTableFilesAtLevel(int level, int cf = 0);
+
+  uint64_t SizeAtLevel(int level);
+
+  int TotalLiveFiles(int cf = 0);
+
+  int TotalTableFiles(int cf = 0, int levels = -1);
+
+  // Return spread of files per level
+  std::string FilesPerLevel(int cf = 0);
+
+  size_t CountFiles();
+
+  size_t CountLiveFiles();
+
+  uint64_t Size(const Slice& start, const Slice& limit, int cf = 0);
+
+  void Compact(int cf, const Slice& start, const Slice& limit,
+               uint32_t target_path_id);
+
+  void Compact(int cf, const Slice& start, const Slice& limit);
+
+  void Compact(const Slice& start, const Slice& limit);
+
+  // Do n memtable compactions, each of which produces an sstable
+  // covering the range [small,large].
+  void MakeTables(int n, const std::string& small, const std::string& large,
+                  int cf = 0);
+
+  // Prevent pushing of new sstables into deeper levels by adding
+  // tables that cover a specified range to all levels.
+  void FillLevels(const std::string& smallest, const std::string& largest,
+                  int cf);
+
+  void MoveFilesToLevel(int level, int cf = 0);
+
+  void DumpFileCounts(const char* label);
+
+  std::string DumpSSTableList();
+
+  void GetSstFiles(std::string path, std::vector<std::string>* files);
+
+  int GetSstFileCount(std::string path);
+
+  // this will generate non-overlapping files since it keeps increasing key_idx
+  void GenerateNewFile(Random* rnd, int* key_idx, bool nowait = false);
+
+  void GenerateNewFile(int cf, Random* rnd, int* key_idx, bool nowait = false);
+
+  void GenerateNewRandomFile(Random* rnd, bool nowait = false);
+
+  std::string IterStatus(Iterator* iter);
+
+  Options OptionsForLogIterTest();
+
+  std::string DummyString(size_t len, char c = 'a');
+
+  void VerifyIterLast(std::string expected_key, int cf = 0);
+
+  // Used to test InplaceUpdate
+
+  // If the previous value is nullptr, sets newValue to a string of
+  // delta.size() 'c' characters.
+  // Otherwise, overwrites the previous value in place with a string of 'b'
+  // characters one byte shorter than the previous value.
+  static UpdateStatus updateInPlaceSmallerSize(
+      char* prevValue, uint32_t* prevSize,
+      Slice delta, std::string* newValue);
+
+  static UpdateStatus updateInPlaceSmallerVarintSize(
+      char* prevValue, uint32_t* prevSize,
+      Slice delta, std::string* newValue);
+
+  static UpdateStatus updateInPlaceLargerSize(
+      char* prevValue, uint32_t* prevSize,
+      Slice delta, std::string* newValue);
+
+  static UpdateStatus updateInPlaceNoAction(
+      char* prevValue, uint32_t* prevSize,
+      Slice delta, std::string* newValue);
+
+  // Utility method to test InplaceUpdate
+  void validateNumberOfEntries(int numValues, int cf = 0);
+
+  void CopyFile(const std::string& source, const std::string& destination,
+                uint64_t size = 0);
+};
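+
+// A minimal fixture sketch (illustrative only; the class name and path are
+// made up):
+//
+//   class MyDBTest : public DBTestBase {
+//    public:
+//     MyDBTest() : DBTestBase("/my_db_test") {}
+//   };
+//
+//   TEST_F(MyDBTest, PutGet) {
+//     do {
+//       ASSERT_OK(Put("foo", "bar"));
+//       ASSERT_EQ("bar", Get("foo"));
+//     } while (ChangeOptions());
+//   }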
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/delete_scheduler_impl.cc b/src/rocksdb/util/delete_scheduler_impl.cc
new file mode 100644
index 0000000..e0f7511
--- /dev/null
+++ b/src/rocksdb/util/delete_scheduler_impl.cc
@@ -0,0 +1,231 @@
+//  Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include "util/delete_scheduler_impl.h"
+
+#include <thread>
+#include <vector>
+
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "util/mutexlock.h"
+#include "util/sync_point.h"
+
+namespace rocksdb {
+
+DeleteSchedulerImpl::DeleteSchedulerImpl(Env* env, const std::string& trash_dir,
+                                         int64_t rate_bytes_per_sec,
+                                         std::shared_ptr<Logger> info_log)
+    : env_(env),
+      trash_dir_(trash_dir),
+      rate_bytes_per_sec_(rate_bytes_per_sec),
+      pending_files_(0),
+      closing_(false),
+      cv_(&mu_),
+      info_log_(info_log) {
+  if (rate_bytes_per_sec_ == 0) {
+    // Rate limiting is disabled
+    bg_thread_.reset();
+  } else {
+    bg_thread_.reset(
+        new std::thread(&DeleteSchedulerImpl::BackgroundEmptyTrash, this));
+  }
+}
+
+DeleteSchedulerImpl::~DeleteSchedulerImpl() {
+  {
+    MutexLock l(&mu_);
+    closing_ = true;
+    cv_.SignalAll();
+  }
+  if (bg_thread_) {
+    bg_thread_->join();
+  }
+}
+
+Status DeleteSchedulerImpl::DeleteFile(const std::string& file_path) {
+  if (rate_bytes_per_sec_ == 0) {
+    // Rate limiting is disabled
+    return env_->DeleteFile(file_path);
+  }
+
+  // Move file to trash
+  std::string path_in_trash;
+  Status s = MoveToTrash(file_path, &path_in_trash);
+  if (!s.ok()) {
+    Log(InfoLogLevel::ERROR_LEVEL, info_log_,
+        "Failed to move %s to trash directory (%s)", file_path.c_str(),
+        trash_dir_.c_str());
+    return env_->DeleteFile(file_path);
+  }
+
+  // Add file to delete queue
+  {
+    MutexLock l(&mu_);
+    queue_.push(path_in_trash);
+    pending_files_++;
+    if (pending_files_ == 1) {
+      cv_.SignalAll();
+    }
+  }
+  return s;
+}
+
+std::map<std::string, Status> DeleteSchedulerImpl::GetBackgroundErrors() {
+  MutexLock l(&mu_);
+  return bg_errors_;
+}
+
+Status DeleteSchedulerImpl::MoveToTrash(const std::string& file_path,
+                                        std::string* path_in_trash) {
+  Status s;
+  // Figure out the name of the file in the trash folder
+  size_t idx = file_path.rfind("/");
+  if (idx == std::string::npos || idx == file_path.size() - 1) {
+    return Status::InvalidArgument("file_path is corrupted");
+  }
+  *path_in_trash = trash_dir_ + file_path.substr(idx);
+  std::string unique_suffix = "";
+
+  if (*path_in_trash == file_path) {
+    // This file is already in trash
+    return s;
+  }
+
+  // TODO(tec) : Implement Env::RenameFileIfNotExist and remove
+  //             file_move_mu mutex.
+  MutexLock l(&file_move_mu_);
+  while (true) {
+    s = env_->FileExists(*path_in_trash + unique_suffix);
+    if (s.IsNotFound()) {
+      // We found a path for our file in trash
+      *path_in_trash += unique_suffix;
+      s = env_->RenameFile(file_path, *path_in_trash);
+      break;
+    } else if (s.ok()) {
+      // Name conflict, generate new random suffix
+      unique_suffix = env_->GenerateUniqueId();
+    } else {
+      // Error during FileExists call, we cannot continue
+      break;
+    }
+  }
+  return s;
+}
+
+void DeleteSchedulerImpl::BackgroundEmptyTrash() {
+  TEST_SYNC_POINT("DeleteSchedulerImpl::BackgroundEmptyTrash");
+
+  while (true) {
+    MutexLock l(&mu_);
+    while (queue_.empty() && !closing_) {
+      cv_.Wait();
+    }
+
+    if (closing_) {
+      return;
+    }
+
+    // Delete all files in queue_
+    uint64_t start_time = env_->NowMicros();
+    uint64_t total_deleted_bytes = 0;
+    while (!queue_.empty() && !closing_) {
+      std::string path_in_trash = queue_.front();
+      queue_.pop();
+
+      // We don't need to hold the lock while deleting the file
+      mu_.Unlock();
+      uint64_t deleted_bytes = 0;
+      // Delete the file from trash and update the total penalty
+      Status s = DeleteTrashFile(path_in_trash, &deleted_bytes);
+      total_deleted_bytes += deleted_bytes;
+      mu_.Lock();
+
+      if (!s.ok()) {
+        bg_errors_[path_in_trash] = s;
+      }
+
+      // Apply a penalty if necessary: sleeping until start_time plus
+      // total_deleted_bytes * kMicrosInSecond / rate_bytes_per_sec_ keeps the
+      // average deletion rate at rate_bytes_per_sec_.
+      uint64_t total_penalty =
+          ((total_deleted_bytes * kMicrosInSecond) / rate_bytes_per_sec_);
+      while (!closing_ && !cv_.TimedWait(start_time + total_penalty)) {}
+      TEST_SYNC_POINT_CALLBACK("DeleteSchedulerImpl::BackgroundEmptyTrash:Wait",
+                               &total_penalty);
+
+      pending_files_--;
+      if (pending_files_ == 0) {
+        // Unblock WaitForEmptyTrash since there are no more files waiting
+        // to be deleted
+        cv_.SignalAll();
+      }
+    }
+  }
+}
+
+Status DeleteSchedulerImpl::DeleteTrashFile(const std::string& path_in_trash,
+                                            uint64_t* deleted_bytes) {
+  uint64_t file_size;
+  Status s = env_->GetFileSize(path_in_trash, &file_size);
+  if (s.ok()) {
+    TEST_SYNC_POINT("DeleteSchedulerImpl::DeleteTrashFile:DeleteFile");
+    s = env_->DeleteFile(path_in_trash);
+  }
+
+  if (!s.ok()) {
+    // Error while getting file size or while deleting
+    Log(InfoLogLevel::ERROR_LEVEL, info_log_,
+        "Failed to delete %s from trash -- %s", path_in_trash.c_str(),
+        s.ToString().c_str());
+    *deleted_bytes = 0;
+  } else {
+    *deleted_bytes = file_size;
+  }
+
+  return s;
+}
+
+void DeleteSchedulerImpl::WaitForEmptyTrash() {
+  MutexLock l(&mu_);
+  while (pending_files_ > 0 && !closing_) {
+    cv_.Wait();
+  }
+}
+
+DeleteScheduler* NewDeleteScheduler(Env* env, const std::string& trash_dir,
+                                    int64_t rate_bytes_per_sec,
+                                    std::shared_ptr<Logger> info_log,
+                                    bool delete_existing_trash,
+                                    Status* status) {
+  DeleteScheduler* res =
+      new DeleteSchedulerImpl(env, trash_dir, rate_bytes_per_sec, info_log);
+
+  Status s;
+  if (trash_dir != "") {
+    s = env->CreateDirIfMissing(trash_dir);
+    if (s.ok() && delete_exisitng_trash) {
+      std::vector<std::string> files_in_trash;
+      s = env->GetChildren(trash_dir, &files_in_trash);
+      if (s.ok()) {
+        for (const std::string& trash_file : files_in_trash) {
+          if (trash_file == "." || trash_file == "..") {
+            continue;
+          }
+          Status file_delete = res->DeleteFile(trash_dir + "/" + trash_file);
+          if (s.ok() && !file_delete.ok()) {
+            s = file_delete;
+          }
+        }
+      }
+    }
+  }
+
+  if (status) {
+    *status = s;
+  }
+
+  return res;
+}
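+
+// Illustrative usage (a sketch; the paths are made up and error handling is
+// elided):
+//
+//   Status s;
+//   std::unique_ptr<DeleteScheduler> scheduler(NewDeleteScheduler(
+//       Env::Default(), "/db/trash", 1024 * 1024 /* 1 MB/s */,
+//       nullptr /* info_log */, false /* delete_existing_trash */, &s));
+//   if (s.ok()) {
+//     s = scheduler->DeleteFile("/db/000123.sst");
+//     scheduler->WaitForEmptyTrash();
+//   }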
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/delete_scheduler_impl.h b/src/rocksdb/util/delete_scheduler_impl.h
new file mode 100644
index 0000000..32ef65f
--- /dev/null
+++ b/src/rocksdb/util/delete_scheduler_impl.h
@@ -0,0 +1,81 @@
+//  Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+
+#include <map>
+#include <queue>
+#include <string>
+#include <thread>
+
+#include "port/port.h"
+
+#include "rocksdb/delete_scheduler.h"
+#include "rocksdb/status.h"
+
+namespace rocksdb {
+
+class Env;
+class Logger;
+
+class DeleteSchedulerImpl : public DeleteScheduler {
+ public:
+  DeleteSchedulerImpl(Env* env, const std::string& trash_dir,
+                      int64_t rate_bytes_per_sec,
+                      std::shared_ptr<Logger> info_log);
+
+  ~DeleteSchedulerImpl();
+
+  // Return delete rate limit in bytes per second
+  int64_t GetRateBytesPerSecond() { return rate_bytes_per_sec_; }
+
+  // Move file to trash directory and schedule its deletion
+  Status DeleteFile(const std::string& fname);
+
+  // Wait for all files being deleted in the background to finish, or for
+  // the destructor to be called.
+  void WaitForEmptyTrash();
+
+  // Return a map containing errors that happened in BackgroundEmptyTrash
+  // file_path => error status
+  std::map<std::string, Status> GetBackgroundErrors();
+
+ private:
+  Status MoveToTrash(const std::string& file_path, std::string* path_in_trash);
+
+  Status DeleteTrashFile(const std::string& path_in_trash,
+                         uint64_t* deleted_bytes);
+
+  void BackgroundEmptyTrash();
+
+  Env* env_;
+  // Path to the trash directory
+  std::string trash_dir_;
+  // Maximum number of bytes that should be deleted per second
+  int64_t rate_bytes_per_sec_;
+  // Mutex to protect queue_, pending_files_, bg_errors_, closing_
+  port::Mutex mu_;
+  // Queue of files in trash that need to be deleted
+  std::queue<std::string> queue_;
+  // Number of files in trash that are waiting to be deleted
+  int32_t pending_files_;
+  // Errors that happened in BackgroundEmptyTrash (file_path => error)
+  std::map<std::string, Status> bg_errors_;
+  // Set to true in ~DeleteSchedulerImpl() to force BackgroundEmptyTrash to stop
+  bool closing_;
+  // Condition variable signaled in these conditions
+  //    - pending_files_ value change from 0 => 1
+  //    - pending_files_ value change from 1 => 0
+  //    - closing_ value is set to true
+  port::CondVar cv_;
+  // Background thread running BackgroundEmptyTrash
+  std::unique_ptr<std::thread> bg_thread_;
+  // Mutex to protect threads from file name conflicts
+  port::Mutex file_move_mu_;
+  std::shared_ptr<Logger> info_log_;
+  static const uint64_t kMicrosInSecond = 1000 * 1000LL;
+};
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/delete_scheduler_test.cc b/src/rocksdb/util/delete_scheduler_test.cc
new file mode 100644
index 0000000..fcd821c
--- /dev/null
+++ b/src/rocksdb/util/delete_scheduler_test.cc
@@ -0,0 +1,469 @@
+//  Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include <inttypes.h>
+#include <atomic>
+#include <thread>
+#include <vector>
+
+#include "rocksdb/delete_scheduler.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "util/string_util.h"
+#include "util/sync_point.h"
+#include "util/testharness.h"
+
+namespace rocksdb {
+
+class DeleteSchedulerTest : public testing::Test {
+ public:
+  DeleteSchedulerTest() : env_(Env::Default()) {
+    dummy_files_dir_ = test::TmpDir(env_) + "/dummy_data_dir";
+    DestroyAndCreateDir(dummy_files_dir_);
+    trash_dir_ = test::TmpDir(env_) + "/trash";
+    DestroyAndCreateDir(trash_dir_);
+  }
+
+  ~DeleteSchedulerTest() {
+    rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+    rocksdb::SyncPoint::GetInstance()->LoadDependency({});
+    rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks();
+    DestroyDir(dummy_files_dir_);
+  }
+
+  void DestroyDir(const std::string& dir) {
+    if (env_->FileExists(dir).IsNotFound()) {
+      return;
+    }
+    std::vector<std::string> files_in_dir;
+    EXPECT_OK(env_->GetChildren(dir, &files_in_dir));
+    for (auto& file_in_dir : files_in_dir) {
+      if (file_in_dir == "." || file_in_dir == "..") {
+        continue;
+      }
+      EXPECT_OK(env_->DeleteFile(dir + "/" + file_in_dir));
+    }
+    EXPECT_OK(env_->DeleteDir(dir));
+  }
+
+  void DestroyAndCreateDir(const std::string& dir) {
+    DestroyDir(dir);
+    EXPECT_OK(env_->CreateDir(dir));
+  }
+
+  int CountFilesInDir(const std::string& dir) {
+    std::vector<std::string> files_in_dir;
+    EXPECT_OK(env_->GetChildren(dir, &files_in_dir));
+    // Ignore "." and ".."
+    return static_cast<int>(files_in_dir.size()) - 2;
+  }
+
+  std::string NewDummyFile(const std::string& file_name, uint64_t size = 1024) {
+    std::string file_path = dummy_files_dir_ + "/" + file_name;
+    std::unique_ptr<WritableFile> f;
+    env_->NewWritableFile(file_path, &f, EnvOptions());
+    std::string data(size, 'A');
+    EXPECT_OK(f->Append(data));
+    EXPECT_OK(f->Close());
+    return file_path;
+  }
+
+  Env* env_;
+  std::string dummy_files_dir_;
+  std::string trash_dir_;
+  int64_t rate_bytes_per_sec_;
+  std::shared_ptr<DeleteScheduler> delete_scheduler_;
+};
+
+// Test the basic functionality of DeleteScheduler (Rate Limiting).
+// 1- Create 100 dummy files
+// 2- Delete the 100 dummy files using DeleteScheduler
+// --- Hold DeleteSchedulerImpl::BackgroundEmptyTrash ---
+// 3- Wait for DeleteScheduler to delete all files in trash
+// 4- Verify that BackgroundEmptyTrash used the correct penalties for the files
+// 5- Make sure that all created files were completely deleted
+TEST_F(DeleteSchedulerTest, BasicRateLimiting) {
+  rocksdb::SyncPoint::GetInstance()->LoadDependency({
+      {"DeleteSchedulerTest::BasicRateLimiting:1",
+       "DeleteSchedulerImpl::BackgroundEmptyTrash"},
+  });
+
+  std::vector<uint64_t> penalties;
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "DeleteSchedulerImpl::BackgroundEmptyTrash:Wait",
+      [&](void* arg) { penalties.push_back(*(static_cast<int*>(arg))); });
+
+  int num_files = 100;  // 100 files
+  uint64_t file_size = 1024;  // every file is 1 kb
+  std::vector<uint64_t> delete_kbs_per_sec = {512, 200, 100, 50, 25};
+
+  for (size_t t = 0; t < delete_kbs_per_sec.size(); t++) {
+    penalties.clear();
+    rocksdb::SyncPoint::GetInstance()->ClearTrace();
+    rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+    DestroyAndCreateDir(dummy_files_dir_);
+    rate_bytes_per_sec_ = delete_kbs_per_sec[t] * 1024;
+    delete_scheduler_.reset(
+        NewDeleteScheduler(env_, trash_dir_, rate_bytes_per_sec_));
+
+    // Create 100 dummy files, every file is 1 Kb
+    std::vector<std::string> generated_files;
+    for (int i = 0; i < num_files; i++) {
+      std::string file_name = "file" + ToString(i) + ".data";
+      generated_files.push_back(NewDummyFile(file_name, file_size));
+    }
+
+    // Delete dummy files and measure time spent to empty trash
+    for (int i = 0; i < num_files; i++) {
+      ASSERT_OK(delete_scheduler_->DeleteFile(generated_files[i]));
+    }
+    ASSERT_EQ(CountFilesInDir(dummy_files_dir_), 0);
+
+    uint64_t delete_start_time = env_->NowMicros();
+    TEST_SYNC_POINT("DeleteSchedulerTest::BasicRateLimiting:1");
+    delete_scheduler_->WaitForEmptyTrash();
+    uint64_t time_spent_deleting = env_->NowMicros() - delete_start_time;
+
+    auto bg_errors = delete_scheduler_->GetBackgroundErrors();
+    ASSERT_EQ(bg_errors.size(), 0);
+
+    uint64_t total_files_size = 0;
+    uint64_t expected_penalty = 0;
+    ASSERT_EQ(penalties.size(), num_files);
+    for (int i = 0; i < num_files; i++) {
+      total_files_size += file_size;
+      expected_penalty = ((total_files_size * 1000000) / rate_bytes_per_sec_);
+      ASSERT_EQ(expected_penalty, penalties[i]);
+    }
+    ASSERT_GT(time_spent_deleting, expected_penalty * 0.9);
+
+    ASSERT_EQ(CountFilesInDir(trash_dir_), 0);
+    rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+  }
+}
+
+// Same as the BasicRateLimiting test but deletes files from multiple threads.
+// 1- Create 100 dummy files
+// 2- Delete the 100 dummy files using DeleteScheduler from 10 threads
+// --- Hold DeleteSchedulerImpl::BackgroundEmptyTrash ---
+// 3- Wait for DeleteScheduler to delete all files in queue
+// 4- Verify that BackgroundEmptyTrash applied the correct penalties to the files
+// 5- Make sure that all created files were completely deleted
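+// (Because BackgroundEmptyTrash is held at the sync point until every file
+// has been queued, the cumulative-size penalty sequence is deterministic
+// even though the DeleteFile() calls race across 10 threads.)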
+TEST_F(DeleteSchedulerTest, RateLimitingMultiThreaded) {
+  rocksdb::SyncPoint::GetInstance()->LoadDependency({
+      {"DeleteSchedulerTest::RateLimitingMultiThreaded:1",
+       "DeleteSchedulerImpl::BackgroundEmptyTrash"},
+  });
+
+  std::vector<uint64_t> penalties;
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "DeleteSchedulerImpl::BackgroundEmptyTrash:Wait",
+      [&](void* arg) { penalties.push_back(*(static_cast<int*>(arg))); });
+
+  int thread_cnt = 10;
+  int num_files = 10;  // 10 files per thread
+  uint64_t file_size = 1024;  // every file is 1 KB
+
+  std::vector<uint64_t> delete_kbs_per_sec = {512, 200, 100, 50, 25};
+  for (size_t t = 0; t < delete_kbs_per_sec.size(); t++) {
+    penalties.clear();
+    rocksdb::SyncPoint::GetInstance()->ClearTrace();
+    rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+    DestroyAndCreateDir(dummy_files_dir_);
+    rate_bytes_per_sec_ = delete_kbs_per_sec[t] * 1024;
+    delete_scheduler_.reset(
+        NewDeleteScheduler(env_, trash_dir_, rate_bytes_per_sec_));
+
+    // Create 100 dummy files, every file is 1 KB
+    std::vector<std::string> generated_files;
+    for (int i = 0; i < num_files * thread_cnt; i++) {
+      std::string file_name = "file" + ToString(i) + ".data";
+      generated_files.push_back(NewDummyFile(file_name, file_size));
+    }
+
+    // Delete dummy files using 10 threads and measure time spent to empty trash
+    std::atomic<int> thread_num(0);
+    std::vector<std::thread> threads;
+    std::function<void()> delete_thread = [&]() {
+      int idx = thread_num.fetch_add(1);
+      int range_start = idx * num_files;
+      int range_end = range_start + num_files;
+      for (int j = range_start; j < range_end; j++) {
+        ASSERT_OK(delete_scheduler_->DeleteFile(generated_files[j]));
+      }
+    };
+
+    for (int i = 0; i < thread_cnt; i++) {
+      threads.emplace_back(delete_thread);
+    }
+
+    for (size_t i = 0; i < threads.size(); i++) {
+      threads[i].join();
+    }
+
+    uint64_t delete_start_time = env_->NowMicros();
+    TEST_SYNC_POINT("DeleteSchedulerTest::RateLimitingMultiThreaded:1");
+    delete_scheduler_->WaitForEmptyTrash();
+    uint64_t time_spent_deleting = env_->NowMicros() - delete_start_time;
+
+    auto bg_errors = delete_scheduler_->GetBackgroundErrors();
+    ASSERT_EQ(bg_errors.size(), 0);
+
+    uint64_t total_files_size = 0;
+    uint64_t expected_penalty = 0;
+    ASSERT_EQ(penalties.size(), num_files * thread_cnt);
+    for (int i = 0; i < num_files * thread_cnt; i++) {
+      total_files_size += file_size;
+      expected_penalty = ((total_files_size * 1000000) / rate_bytes_per_sec_);
+      ASSERT_EQ(expected_penalty, penalties[i]);
+    }
+    ASSERT_GT(time_spent_deleting, expected_penalty * 0.9);
+
+    ASSERT_EQ(CountFilesInDir(dummy_files_dir_), 0);
+    ASSERT_EQ(CountFilesInDir(trash_dir_), 0);
+    rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+  }
+}
+
+// Disable rate limiting by setting rate_bytes_per_sec_ to 0 and make sure
+// that when DeleteScheduler deletes a file it deletes it immediately and
+// doesn't move it to trash
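+// (With a rate of 0 the scheduler is expected to call Env::DeleteFile()
+// directly, which is why the DeleteTrashFile callback counter below must
+// stay at 0.)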
+TEST_F(DeleteSchedulerTest, DisableRateLimiting) {
+  int bg_delete_file = 0;
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "DeleteSchedulerImpl::DeleteTrashFile:DeleteFile",
+      [&](void* arg) { bg_delete_file++; });
+
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  delete_scheduler_.reset(NewDeleteScheduler(env_, "", 0));
+
+  for (int i = 0; i < 10; i++) {
+    // Every file we delete will be deleted immediately
+    std::string dummy_file = NewDummyFile("dummy.data");
+    ASSERT_OK(delete_scheduler_->DeleteFile(dummy_file));
+    ASSERT_TRUE(env_->FileExists(dummy_file).IsNotFound());
+    ASSERT_EQ(CountFilesInDir(dummy_files_dir_), 0);
+    ASSERT_EQ(CountFilesInDir(trash_dir_), 0);
+  }
+
+  ASSERT_EQ(bg_delete_file, 0);
+
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+// Test that moving files with the same name to trash is not a problem
+// 1- Create 10 files with the same name "conflict.data"
+// 2- Delete the 10 files using DeleteScheduler
+// 3- Make sure that the trash directory contains 10 files ("conflict.data" x 10)
+// --- Hold DeleteSchedulerImpl::BackgroundEmptyTrash ---
+// 4- Make sure that files are deleted from trash
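+// (How the scheduler disambiguates identical names inside the trash
+// directory, e.g. by appending a counter, is an implementation detail; the
+// test only asserts that all 10 copies land in trash.)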
+TEST_F(DeleteSchedulerTest, ConflictNames) {
+  rocksdb::SyncPoint::GetInstance()->LoadDependency({
+      {"DeleteSchedulerTest::ConflictNames:1",
+       "DeleteSchedulerImpl::BackgroundEmptyTrash"},
+  });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  rate_bytes_per_sec_ = 1024 * 1024;  // 1 MB/sec
+  delete_scheduler_.reset(
+      NewDeleteScheduler(env_, trash_dir_, rate_bytes_per_sec_));
+
+  // Create "conflict.data" and move it to trash 10 times
+  for (int i = 0; i < 10; i++) {
+    std::string dummy_file = NewDummyFile("conflict.data");
+    ASSERT_OK(delete_scheduler_->DeleteFile(dummy_file));
+  }
+  ASSERT_EQ(CountFilesInDir(dummy_files_dir_), 0);
+  // 10 files ("conflict.data" x 10) in trash
+  ASSERT_EQ(CountFilesInDir(trash_dir_), 10);
+
+  // Hold BackgroundEmptyTrash
+  TEST_SYNC_POINT("DeleteSchedulerTest::ConflictNames:1");
+  delete_scheduler_->WaitForEmptyTrash();
+  ASSERT_EQ(CountFilesInDir(trash_dir_), 0);
+
+  auto bg_errors = delete_scheduler_->GetBackgroundErrors();
+  ASSERT_EQ(bg_errors.size(), 0);
+
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+// 1- Create 10 dummy files
+// 2- Delete the 10 files using DeleteScheduler (move them to trash)
+// 3- Delete the 10 files directly (using env_->DeleteFile)
+// --- Hold DeleteSchedulerImpl::BackgroundEmptyTrash ---
+// 4- Make sure that DeleteScheduler failed to delete the 10 files and
+//    reported 10 background errors
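+// (Failed background deletions are expected to be reported through
+// GetBackgroundErrors() rather than aborting the empty-trash loop.)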
+TEST_F(DeleteSchedulerTest, BackgroundError) {
+  rocksdb::SyncPoint::GetInstance()->LoadDependency({
+      {"DeleteSchedulerTest::BackgroundError:1",
+       "DeleteSchedulerImpl::BackgroundEmptyTrash"},
+  });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  rate_bytes_per_sec_ = 1024 * 1024;  // 1 MB/sec
+  delete_scheduler_.reset(
+      NewDeleteScheduler(env_, trash_dir_, rate_bytes_per_sec_));
+
+  // Generate 10 dummy files and move them to trash
+  for (int i = 0; i < 10; i++) {
+    std::string file_name = "data_" + ToString(i) + ".data";
+    ASSERT_OK(delete_scheduler_->DeleteFile(NewDummyFile(file_name)));
+  }
+  ASSERT_EQ(CountFilesInDir(dummy_files_dir_), 0);
+  ASSERT_EQ(CountFilesInDir(trash_dir_), 10);
+
+  // Delete the 10 files from trash directly; this will cause background
+  // errors in BackgroundEmptyTrash since we have already deleted the files
+  // it was going to delete
+  for (int i = 0; i < 10; i++) {
+    std::string file_name = "data_" + ToString(i) + ".data";
+    ASSERT_OK(env_->DeleteFile(trash_dir_ + "/" + file_name));
+  }
+
+  // Hold BackgroundEmptyTrash
+  TEST_SYNC_POINT("DeleteSchedulerTest::BackgroundError:1");
+  delete_scheduler_->WaitForEmptyTrash();
+  auto bg_errors = delete_scheduler_->GetBackgroundErrors();
+  ASSERT_EQ(bg_errors.size(), 10);
+
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+// 1- Create 10 files in trash
+// 2- Create a DeleteScheduler with delete_exisitng_trash = true
+// 3- Wait for DeleteScheduler to delete all files in queue
+// 4- Make sure that all files in trash directory were deleted
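+// (In the NewDeleteScheduler() call below, the nullptr argument is
+// presumably the optional info log, true enables deleting pre-existing
+// trash, and &s receives the construction status checked by ASSERT_OK.)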
+TEST_F(DeleteSchedulerTest, TrashWithExistingFiles) {
+  std::vector<std::string> dummy_files;
+  for (int i = 0; i < 10; i++) {
+    std::string file_name = "data_" + ToString(i) + ".data";
+    std::string trash_path = trash_dir_ + "/" + file_name;
+    ASSERT_OK(env_->RenameFile(NewDummyFile(file_name), trash_path));
+  }
+  ASSERT_EQ(CountFilesInDir(trash_dir_), 10);
+
+  Status s;
+  rate_bytes_per_sec_ = 1024 * 1024;  // 1 MB/sec
+  delete_scheduler_.reset(NewDeleteScheduler(
+      env_, trash_dir_, rate_bytes_per_sec_, nullptr, true, &s));
+  ASSERT_OK(s);
+
+  delete_scheduler_->WaitForEmptyTrash();
+  ASSERT_EQ(CountFilesInDir(trash_dir_), 0);
+
+  auto bg_errors = delete_scheduler_->GetBackgroundErrors();
+  ASSERT_EQ(bg_errors.size(), 0);
+}
+
+// 1- Create 10 dummy files
+// 2- Delete 10 dummy files using DeleteScheduler
+// 3- Wait for DeleteScheduler to delete all files in queue
+// 4- Make sure all files in trash directory were deleted
+// 5- Repeat previous steps 5 times
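+// (WaitForEmptyTrash() must be safe to call repeatedly: after each batch
+// the background empty-trash work is expected to finish and then start
+// again once new files are queued.)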
+TEST_F(DeleteSchedulerTest, StartBGEmptyTrashMultipleTimes) {
+  int bg_delete_file = 0;
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "DeleteSchedulerImpl::DeleteTrashFile:DeleteFile",
+      [&](void* arg) { bg_delete_file++; });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  rate_bytes_per_sec_ = 1024 * 1024;  // 1 MB / sec
+  delete_scheduler_.reset(
+      NewDeleteScheduler(env_, trash_dir_, rate_bytes_per_sec_));
+
+  // Move files to trash, wait for empty trash, start again
+  for (int run = 1; run <= 5; run++) {
+    // Generate 10 dummy files and move them to trash
+    for (int i = 0; i < 10; i++) {
+      std::string file_name = "data_" + ToString(i) + ".data";
+      ASSERT_OK(delete_scheduler_->DeleteFile(NewDummyFile(file_name)));
+    }
+    ASSERT_EQ(CountFilesInDir(dummy_files_dir_), 0);
+    delete_scheduler_->WaitForEmptyTrash();
+    ASSERT_EQ(bg_delete_file, 10 * run);
+    ASSERT_EQ(CountFilesInDir(trash_dir_), 0);
+
+    auto bg_errors = delete_scheduler_->GetBackgroundErrors();
+    ASSERT_EQ(bg_errors.size(), 0);
+  }
+
+  ASSERT_EQ(bg_delete_file, 50);
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+// 1- Create a DeleteScheduler with very slow rate limit (1 Byte / sec)
+// 2- Delete 100 files using DeleteScheduler
+// 3- Delete the DeleteScheduler (call the destructor while queue is not empty)
+// 4- Make sure that not all files were deleted from trash and that
+//    DeleteScheduler background thread did not delete all files
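+// (At 1 byte/sec, 100 files of 1024 bytes each need 102400 seconds,
+// roughly 28.4 hours, so the queue cannot drain before the destructor
+// runs.)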
+TEST_F(DeleteSchedulerTest, DestructorWithNonEmptyQueue) {
+  int bg_delete_file = 0;
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "DeleteSchedulerImpl::DeleteTrashFile:DeleteFile",
+      [&](void* arg) { bg_delete_file++; });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  rate_bytes_per_sec_ = 1;  // 1 Byte / sec
+  delete_scheduler_.reset(
+      NewDeleteScheduler(env_, trash_dir_, rate_bytes_per_sec_));
+
+  for (int i = 0; i < 100; i++) {
+    std::string file_name = "data_" + ToString(i) + ".data";
+    ASSERT_OK(delete_scheduler_->DeleteFile(NewDummyFile(file_name)));
+  }
+
+  // Deleting the 100 files would need more than 28 hours to complete,
+  // so we delete the DeleteScheduler while the delete queue is not empty
+  delete_scheduler_.reset();
+
+  ASSERT_LT(bg_delete_file, 100);
+  ASSERT_GT(CountFilesInDir(trash_dir_), 0);
+
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+// 1- Delete the trash directory
+// 2- Delete 10 files using DeleteScheduler
+// 3- Make sure that the 10 files were deleted immediately since DeleteScheduler
+//    failed to move them to trash directory
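+// (When the rename into trash fails, DeleteFile() is expected to fall back
+// to deleting the file immediately, so the background-delete counter must
+// stay at 0.)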
+TEST_F(DeleteSchedulerTest, MoveToTrashError) {
+  int bg_delete_file = 0;
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "DeleteSchedulerImpl::DeleteTrashFile:DeleteFile",
+      [&](void* arg) { bg_delete_file++; });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  rate_bytes_per_sec_ = 1024;  // 1 KB/sec
+  delete_scheduler_.reset(
+      NewDeleteScheduler(env_, trash_dir_, rate_bytes_per_sec_));
+
+  // We delete the trash directory, which means that DeleteScheduler won't
+  // be able to move files to trash and will delete them immediately.
+  DestroyDir(trash_dir_);
+  for (int i = 0; i < 10; i++) {
+    std::string file_name = "data_" + ToString(i) + ".data";
+    ASSERT_OK(delete_scheduler_->DeleteFile(NewDummyFile(file_name)));
+  }
+
+  ASSERT_EQ(CountFilesInDir(dummy_files_dir_), 0);
+  ASSERT_EQ(bg_delete_file, 0);
+
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/util/dynamic_bloom.h b/src/rocksdb/util/dynamic_bloom.h
index a6e4d73..e2ac56e 100644
--- a/src/rocksdb/util/dynamic_bloom.h
+++ b/src/rocksdb/util/dynamic_bloom.h
@@ -9,7 +9,7 @@
 
 #include "rocksdb/slice.h"
 
-#include "port/port_posix.h"
+#include "port/port.h"
 
 #include <atomic>
 #include <memory>
diff --git a/src/rocksdb/util/dynamic_bloom_test.cc b/src/rocksdb/util/dynamic_bloom_test.cc
index fb10d09..cb38366 100644
--- a/src/rocksdb/util/dynamic_bloom_test.cc
+++ b/src/rocksdb/util/dynamic_bloom_test.cc
@@ -6,8 +6,8 @@
 #ifndef GFLAGS
 #include <cstdio>
 int main() {
-  fprintf(stderr, "Please install gflags to run rocksdb tools\n");
-  return 1;
+  fprintf(stderr, "Please install gflags to run this test... Skipping...\n");
+  return 0;
 }
 #else
 
diff --git a/src/rocksdb/util/env.cc b/src/rocksdb/util/env.cc
index 0695b55..effa7f5 100644
--- a/src/rocksdb/util/env.cc
+++ b/src/rocksdb/util/env.cc
@@ -9,7 +9,10 @@
 
 #include "rocksdb/env.h"
 
-#include <sys/time.h>
+#include <thread>
+#include "port/port.h"
+#include "port/sys_time.h"
+
 #include "rocksdb/options.h"
 #include "util/arena.h"
 #include "util/autovector.h"
@@ -19,6 +22,11 @@ namespace rocksdb {
 Env::~Env() {
 }
 
+uint64_t Env::GetThreadID() const {
+  std::hash<std::thread::id> hasher;
+  return hasher(std::this_thread::get_id());
+}
+
 SequentialFile::~SequentialFile() {
 }
 
@@ -49,12 +57,42 @@
   }
 }
 
+void Logger::Logv(const InfoLogLevel log_level, const char* format, va_list ap) {
+  static const char* kInfoLogLevelNames[5] = { "DEBUG", "INFO", "WARN",
+    "ERROR", "FATAL" };
+  if (log_level < log_level_) {
+    return;
+  }
+
+  if (log_level == InfoLogLevel::INFO_LEVEL) {
+    // Don't print the log level prefix when it is INFO level. This is to
+    // avoid an unexpected performance regression after adding the log
+    // level feature: all the logs written before the feature existed are
+    // INFO level, and we don't want to add extra cost to that existing
+    // logging.
+    Logv(format, ap);
+  } else {
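+    // e.g. a WARN-level message with format "foo" is emitted as "[WARN] foo".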
+    char new_format[500];
+    snprintf(new_format, sizeof(new_format) - 1, "[%s] %s",
+      kInfoLogLevelNames[log_level], format);
+    Logv(new_format, ap);
+  }
+}
+
+
 void Log(const InfoLogLevel log_level, Logger* info_log, const char* format,
          ...) {
   if (info_log && info_log->GetInfoLogLevel() <= log_level) {
     va_list ap;
     va_start(ap, format);
-    info_log->Logv(log_level, format, ap);
+
+    if (log_level == InfoLogLevel::HEADER_LEVEL) {
+      info_log->LogHeader(format, ap);
+    } else {
+      info_log->Logv(log_level, format, ap);
+    }
+
     va_end(ap);
   }
 }
@@ -245,6 +282,7 @@ void AssignEnvOptions(EnvOptions* env_options, const DBOptions& options) {
   env_options->set_fd_cloexec = options.is_fd_close_on_exec;
   env_options->bytes_per_sync = options.bytes_per_sync;
   env_options->rate_limiter = options.rate_limiter.get();
+  env_options->allow_fallocate = options.allow_fallocate;
 }
 
 }
diff --git a/src/rocksdb/util/env_hdfs.cc b/src/rocksdb/util/env_hdfs.cc
index 298eb48..30e7962 100644
--- a/src/rocksdb/util/env_hdfs.cc
+++ b/src/rocksdb/util/env_hdfs.cc
@@ -3,6 +3,10 @@
 //  LICENSE file in the root directory of this source tree. An additional grant
 //  of patent rights can be found in the PATENTS file in the same directory.
 //
+
+#include "rocksdb/env.h"
+#include "hdfs/env_hdfs.h"
+
 #ifdef USE_HDFS
 #ifndef ROCKSDB_HDFS_FILE_C
 #define ROCKSDB_HDFS_FILE_C
@@ -13,9 +17,7 @@
 #include <time.h>
 #include <iostream>
 #include <sstream>
-#include "rocksdb/env.h"
 #include "rocksdb/status.h"
-#include "hdfs/env_hdfs.h"
 
 #define HDFS_EXISTS 0
 #define HDFS_DOESNT_EXIST -1
@@ -415,12 +417,6 @@ Status HdfsEnv::NewWritableFile(const std::string& fname,
   return Status::OK();
 }
 
-Status HdfsEnv::NewRandomRWFile(const std::string& fname,
-                                unique_ptr<RandomRWFile>* result,
-                                const EnvOptions& options) {
-  return Status::NotSupported("NewRandomRWFile not supported on HdfsEnv");
-}
-
 class HdfsDirectory : public Directory {
  public:
   explicit HdfsDirectory(int fd) : fd_(fd) {}
@@ -448,20 +444,18 @@ Status HdfsEnv::NewDirectory(const std::string& name,
   }
 }
 
-bool HdfsEnv::FileExists(const std::string& fname) {
-
+Status HdfsEnv::FileExists(const std::string& fname) {
   int value = hdfsExists(fileSys_, fname.c_str());
   switch (value) {
     case HDFS_EXISTS:
-    return true;
+      return Status::OK();
     case HDFS_DOESNT_EXIST:
-      return false;
+      return Status::NotFound();
     default:  // anything else should be an error
       Log(InfoLogLevel::FATAL_LEVEL,
           mylog, "FileExists hdfsExists call failed");
-      throw HdfsFatalException("hdfsExists call failed with error " +
-                               ToString(value) + " on path " + fname +
-                               ".\n");
+      return Status::IOError("hdfsExists call failed with error " +
+                             ToString(value) + " on path " + fname + ".\n");
   }
 }
 
@@ -606,8 +600,6 @@ Status HdfsEnv::NewLogger(const std::string& fname,
 #else // USE_HDFS
 
 // dummy placeholders used when HDFS is not available
-#include "rocksdb/env.h"
-#include "hdfs/env_hdfs.h"
 namespace rocksdb {
  Status HdfsEnv::NewSequentialFile(const std::string& fname,
                                    unique_ptr<SequentialFile>* result,
diff --git a/src/rocksdb/util/env_posix.cc b/src/rocksdb/util/env_posix.cc
index 3cdd12b..5c031a7 100644
--- a/src/rocksdb/util/env_posix.cc
+++ b/src/rocksdb/util/env_posix.cc
@@ -40,7 +40,7 @@
 #include "util/posix_logger.h"
 #include "util/random.h"
 #include "util/iostats_context_imp.h"
-#include "util/rate_limiter.h"
+#include "util/string_util.h"
 #include "util/sync_point.h"
 #include "util/thread_status_updater.h"
 #include "util/thread_status_util.h"
@@ -74,9 +74,6 @@
 #define POSIX_FADV_DONTNEED 4 /* [MC1] dont need these pages */
 #endif
 
-// This is only set from db_stress.cc and for testing only.
-// If non-zero, kill at various points in source code with probability 1/this
-int rocksdb_kill_odds = 0;
 
 namespace rocksdb {
 
@@ -104,39 +101,6 @@ static Status IOError(const std::string& context, int err_number) {
   return Status::IOError(context, strerror(err_number));
 }
 
-#ifdef NDEBUG
-// empty in release build
-#define TEST_KILL_RANDOM(rocksdb_kill_odds)
-#else
-
-// Kill the process with probablity 1/odds for testing.
-static void TestKillRandom(int odds, const std::string& srcfile,
-                           int srcline) {
-  time_t curtime = time(nullptr);
-  Random r((uint32_t)curtime);
-
-  assert(odds > 0);
-  bool crash = r.OneIn(odds);
-  if (crash) {
-    fprintf(stdout, "Crashing at %s:%d\n", srcfile.c_str(), srcline);
-    fflush(stdout);
-    kill(getpid(), SIGTERM);
-  }
-}
-
-// To avoid crashing always at some frequently executed codepaths (during
-// kill random test), use this factor to reduce odds
-#define REDUCE_ODDS 2
-#define REDUCE_ODDS2 4
-
-#define TEST_KILL_RANDOM(rocksdb_kill_odds) {   \
-  if (rocksdb_kill_odds > 0) { \
-    TestKillRandom(rocksdb_kill_odds, __FILE__, __LINE__);     \
-  } \
-}
-
-#endif
-
 #if defined(OS_LINUX)
 namespace {
   static size_t GetUniqueIdFromFile(int fd, char* id, size_t max_size) {
@@ -186,13 +150,8 @@ class PosixSequentialFile: public SequentialFile {
     Status s;
     size_t r = 0;
     do {
-#ifndef CYGWIN
       r = fread_unlocked(scratch, 1, n, file_);
-#else
-      r = fread(scratch, 1, n, file_);
-#endif
     } while (r == 0 && ferror(file_) && errno == EINTR);
-    IOSTATS_ADD(bytes_read, r);
     *result = Slice(scratch, r);
     if (r < n) {
       if (feof(file_)) {
@@ -257,6 +216,7 @@ class PosixRandomAccessFile: public RandomAccessFile {
     char* ptr = scratch;
     while (left > 0) {
       r = pread(fd_, ptr, left, static_cast<off_t>(offset));
+
       if (r <= 0) {
         if (errno == EINTR) {
           continue;
@@ -268,7 +228,6 @@ class PosixRandomAccessFile: public RandomAccessFile {
       left -= r;
     }
 
-    IOSTATS_ADD_IF_POSITIVE(bytes_read, n - left);
     *result = Slice(scratch, (r < 0) ? 0 : n - left);
     if (r < 0) {
       // An error: return a non-ok status
@@ -346,7 +305,7 @@ class PosixMmapReadableFile: public RandomAccessFile {
   virtual ~PosixMmapReadableFile() {
     int ret = munmap(mmapped_region_, length_);
     if (ret != 0) {
-      fprintf(stdout, "failed to munmap %p length %zu \n",
+      fprintf(stdout, "failed to munmap %p length %" ROCKSDB_PRIszt " \n",
               mmapped_region_, length_);
     }
   }
@@ -354,12 +313,13 @@ class PosixMmapReadableFile: public RandomAccessFile {
   virtual Status Read(uint64_t offset, size_t n, Slice* result,
                       char* scratch) const override {
     Status s;
-    if (offset + n > length_) {
+    if (offset > length_) {
       *result = Slice();
-      s = IOError(filename_, EINVAL);
-    } else {
-      *result = Slice(reinterpret_cast<char*>(mmapped_region_) + offset, n);
+      return IOError(filename_, EINVAL);
+    } else if (offset + n > length_) {
+      n = length_ - offset;
     }
+    *result = Slice(reinterpret_cast<char*>(mmapped_region_) + offset, n);
     return s;
   }
   virtual Status InvalidateCache(size_t offset, size_t length) override {
@@ -391,9 +351,8 @@ class PosixMmapFile : public WritableFile {
   char* dst_;             // Where to write next  (in range [base_,limit_])
   char* last_sync_;       // Where have we synced up to
   uint64_t file_offset_;  // Offset of base_ in file
-  // Have we done an munmap of unsynced data?
-  bool pending_sync_;
 #ifdef ROCKSDB_FALLOCATE_PRESENT
+  bool allow_fallocate_;  // If false, fallocate calls are bypassed
   bool fallocate_with_keep_size_;
 #endif
 
@@ -411,10 +370,6 @@ class PosixMmapFile : public WritableFile {
   Status UnmapCurrentRegion() {
     TEST_KILL_RANDOM(rocksdb_kill_odds);
     if (base_ != nullptr) {
-      if (last_sync_ < limit_) {
-        // Defer syncing this data until next Sync() call, if any
-        pending_sync_ = true;
-      }
       int munmap_status = munmap(base_, limit_ - base_);
       if (munmap_status != 0) {
         return IOError(filename_, munmap_status);
@@ -439,14 +394,17 @@ class PosixMmapFile : public WritableFile {
 
     TEST_KILL_RANDOM(rocksdb_kill_odds);
     // we can't fallocate with FALLOC_FL_KEEP_SIZE here
-    int alloc_status = fallocate(fd_, 0, file_offset_, map_size_);
-    if (alloc_status != 0) {
-      // fallback to posix_fallocate
-      alloc_status = posix_fallocate(fd_, file_offset_, map_size_);
-    }
-    if (alloc_status != 0) {
-      return Status::IOError("Error allocating space to file : " + filename_ +
-        "Error : " + strerror(alloc_status));
+    if (allow_fallocate_) {
+      IOSTATS_TIMER_GUARD(allocate_nanos);
+      int alloc_status = fallocate(fd_, 0, file_offset_, map_size_);
+      if (alloc_status != 0) {
+        // fallback to posix_fallocate
+        alloc_status = posix_fallocate(fd_, file_offset_, map_size_);
+      }
+      if (alloc_status != 0) {
+        return Status::IOError("Error allocating space to file : " + filename_ +
+          "Error : " + strerror(alloc_status));
+      }
     }
 
     TEST_KILL_RANDOM(rocksdb_kill_odds);
@@ -455,7 +413,6 @@ class PosixMmapFile : public WritableFile {
     if (ptr == MAP_FAILED) {
       return Status::IOError("MMap failed on " + filename_);
     }
-
     TEST_KILL_RANDOM(rocksdb_kill_odds);
 
     base_ = reinterpret_cast<char*>(ptr);
@@ -468,6 +425,25 @@
 #endif
   }
 
+  Status Msync() {
+    if (dst_ == last_sync_) {
+      return Status::OK();
+    }
+    // Find the beginnings of the pages that contain the first and last
+    // bytes to be synced.
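+    // (For example, with 4 KB pages, last_sync_ at offset 5000 and dst_ at
+    // offset 9000 give p1 = 4096 and p2 = 8192, so the msync below covers
+    // bytes [4096, 12288).)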
+    size_t p1 = TruncateToPageBoundary(last_sync_ - base_);
+    size_t p2 = TruncateToPageBoundary(dst_ - base_ - 1);
+    last_sync_ = dst_;
+    TEST_KILL_RANDOM(rocksdb_kill_odds);
+    if (msync(base_ + p1, p2 - p1 + page_size_, MS_SYNC) < 0) {
+      return IOError(filename_, errno);
+    }
+    return Status::OK();
+  }
+
  public:
   PosixMmapFile(const std::string& fname, int fd, size_t page_size,
                 const EnvOptions& options)
@@ -479,9 +452,9 @@ class PosixMmapFile : public WritableFile {
         limit_(nullptr),
         dst_(nullptr),
         last_sync_(nullptr),
-        file_offset_(0),
-        pending_sync_(false) {
+        file_offset_(0) {
 #ifdef ROCKSDB_FALLOCATE_PRESENT
+    allow_fallocate_ = options.allow_fallocate;
     fallocate_with_keep_size_ = options.fallocate_with_keep_size;
 #endif
     assert((page_size & (page_size - 1)) == 0);
@@ -498,8 +471,6 @@ class PosixMmapFile : public WritableFile {
   virtual Status Append(const Slice& data) override {
     const char* src = data.data();
     size_t left = data.size();
-    TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS);
-    PrepareWrite(static_cast<size_t>(GetFileSize()), left);
     while (left > 0) {
       assert(base_ <= dst_);
       assert(dst_ <= limit_);
@@ -518,12 +489,16 @@ class PosixMmapFile : public WritableFile {
 
       size_t n = (left <= avail) ? left : avail;
       memcpy(dst_, src, n);
-      IOSTATS_ADD(bytes_written, n);
       dst_ += n;
       src += n;
       left -= n;
     }
-    TEST_KILL_RANDOM(rocksdb_kill_odds);
+    return Status::OK();
+  }
+
+  // Close() properly takes care of truncation, so Truncate()
+  // needs no additional work here
+  virtual Status Truncate(uint64_t size) override {
     return Status::OK();
   }
 
@@ -531,8 +506,6 @@ class PosixMmapFile : public WritableFile {
     Status s;
     size_t unused = limit_ - dst_;
 
-    TEST_KILL_RANDOM(rocksdb_kill_odds);
-
     s = UnmapCurrentRegion();
     if (!s.ok()) {
       s = IOError(filename_, errno);
@@ -543,8 +516,6 @@ class PosixMmapFile : public WritableFile {
       }
     }
 
-    TEST_KILL_RANDOM(rocksdb_kill_odds);
-
     if (close(fd_) < 0) {
       if (s.ok()) {
         s = IOError(filename_, errno);
@@ -558,55 +529,26 @@ class PosixMmapFile : public WritableFile {
   }
 
   virtual Status Flush() override {
-    TEST_KILL_RANDOM(rocksdb_kill_odds);
     return Status::OK();
   }
 
   virtual Status Sync() override {
-    Status s;
-
-    if (pending_sync_) {
-      // Some unmapped data was not synced
-      TEST_KILL_RANDOM(rocksdb_kill_odds);
-      pending_sync_ = false;
-      if (fdatasync(fd_) < 0) {
-        s = IOError(filename_, errno);
-      }
-      TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS);
-    }
-
-    if (dst_ > last_sync_) {
-      // Find the beginnings of the pages that contain the first and last
-      // bytes to be synced.
-      size_t p1 = TruncateToPageBoundary(last_sync_ - base_);
-      size_t p2 = TruncateToPageBoundary(dst_ - base_ - 1);
-      last_sync_ = dst_;
-      TEST_KILL_RANDOM(rocksdb_kill_odds);
-      if (msync(base_ + p1, p2 - p1 + page_size_, MS_SYNC) < 0) {
-        s = IOError(filename_, errno);
-      }
-      TEST_KILL_RANDOM(rocksdb_kill_odds);
+    if (fdatasync(fd_) < 0) {
+      return IOError(filename_, errno);
     }
 
-    return s;
+    return Msync();
   }
 
   /**
    * Flush data as well as metadata to stable storage.
    */
   virtual Status Fsync() override {
-    if (pending_sync_) {
-      // Some unmapped data was not synced
-      TEST_KILL_RANDOM(rocksdb_kill_odds);
-      pending_sync_ = false;
-      if (fsync(fd_) < 0) {
-        return IOError(filename_, errno);
-      }
-      TEST_KILL_RANDOM(rocksdb_kill_odds);
+    if (fsync(fd_) < 0) {
+      return IOError(filename_, errno);
     }
-    // This invocation to Sync will not issue the call to
-    // fdatasync because pending_sync_ has already been cleared.
-    return Sync();
+
+    return Msync();
   }
 
   /**
@@ -635,8 +577,12 @@ class PosixMmapFile : public WritableFile {
 #ifdef ROCKSDB_FALLOCATE_PRESENT
   virtual Status Allocate(off_t offset, off_t len) override {
     TEST_KILL_RANDOM(rocksdb_kill_odds);
-    int alloc_status = fallocate(
-        fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, offset, len);
+    int alloc_status = 0;
+    if (allow_fallocate_) {
+      alloc_status =
+          fallocate(fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0,
+                    offset, len);
+    }
     if (alloc_status == 0) {
       return Status::OK();
     } else {
@@ -651,34 +597,17 @@ class PosixWritableFile : public WritableFile {
  private:
   const std::string filename_;
   int fd_;
-  size_t cursize_;      // current size of cached data in buf_
-  size_t capacity_;     // max size of buf_
-  unique_ptr<char[]> buf_;           // a buffer to cache writes
   uint64_t filesize_;
-  bool pending_sync_;
-  bool pending_fsync_;
-  uint64_t last_sync_size_;
-  uint64_t bytes_per_sync_;
 #ifdef ROCKSDB_FALLOCATE_PRESENT
+  bool allow_fallocate_;
   bool fallocate_with_keep_size_;
 #endif
-  RateLimiter* rate_limiter_;
 
  public:
-  PosixWritableFile(const std::string& fname, int fd, size_t capacity,
-                    const EnvOptions& options)
-      : filename_(fname),
-        fd_(fd),
-        cursize_(0),
-        capacity_(capacity),
-        buf_(new char[capacity]),
-        filesize_(0),
-        pending_sync_(false),
-        pending_fsync_(false),
-        last_sync_size_(0),
-        bytes_per_sync_(options.bytes_per_sync),
-        rate_limiter_(options.rate_limiter) {
+  PosixWritableFile(const std::string& fname, int fd, const EnvOptions& options)
+      : filename_(fname), fd_(fd), filesize_(0) {
 #ifdef ROCKSDB_FALLOCATE_PRESENT
+    allow_fallocate_ = options.allow_fallocate;
     fallocate_with_keep_size_ = options.fallocate_with_keep_size;
 #endif
     assert(!options.use_mmap_writes);
@@ -693,60 +622,29 @@ class PosixWritableFile : public WritableFile {
   virtual Status Append(const Slice& data) override {
     const char* src = data.data();
     size_t left = data.size();
-    Status s;
-    pending_sync_ = true;
-    pending_fsync_ = true;
-
-    TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS2);
-
-    PrepareWrite(static_cast<size_t>(GetFileSize()), left);
-    // if there is no space in the cache, then flush
-    if (cursize_ + left > capacity_) {
-      s = Flush();
-      if (!s.ok()) {
-        return s;
-      }
-      // Increase the buffer size, but capped at 1MB
-      if (capacity_ < (1<<20)) {
-        capacity_ *= 2;
-        buf_.reset(new char[capacity_]);
-      }
-      assert(cursize_ == 0);
-    }
-
-    // if the write fits into the cache, then write to cache
-    // otherwise do a write() syscall to write to OS buffers.
-    if (cursize_ + left <= capacity_) {
-      memcpy(buf_.get()+cursize_, src, left);
-      cursize_ += left;
-    } else {
-      while (left != 0) {
-        ssize_t done = write(fd_, src, RequestToken(left));
-        if (done < 0) {
-          if (errno == EINTR) {
-            continue;
-          }
-          return IOError(filename_, errno);
+    while (left != 0) {
+      ssize_t done = write(fd_, src, left);
+      if (done < 0) {
+        if (errno == EINTR) {
+          continue;
         }
-        IOSTATS_ADD(bytes_written, done);
-        TEST_KILL_RANDOM(rocksdb_kill_odds);
-
-        left -= done;
-        src += done;
+        return IOError(filename_, errno);
       }
+      left -= done;
+      src += done;
     }
     filesize_ += data.size();
     return Status::OK();
   }
 
+  // Close() properly takes care of truncation, so Truncate()
+  // needs no additional work here
+  virtual Status Truncate(uint64_t size) override {
+    return Status::OK();
+  }
+
   virtual Status Close() override {
     Status s;
-    s = Flush(); // flush cache to OS
-    if (!s.ok()) {
-      return s;
-    }
-
-    TEST_KILL_RANDOM(rocksdb_kill_odds);
 
     size_t block_size;
     size_t last_allocated_block;
@@ -769,8 +667,11 @@ class PosixWritableFile : public WritableFile {
       //   tmpfs (since Linux 3.5)
       // We ignore error since failure of this operation does not affect
       // correctness.
-      fallocate(fd_, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE,
-                filesize_, block_size * last_allocated_block - filesize_);
+      IOSTATS_TIMER_GUARD(allocate_nanos);
+      if (allow_fallocate_) {
+        fallocate(fd_, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, filesize_,
+                  block_size * last_allocated_block - filesize_);
+      }
 #endif
     }
 
@@ -783,66 +684,27 @@ class PosixWritableFile : public WritableFile {
 
   // write out the cached data to the OS cache
   virtual Status Flush() override {
-    TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS2);
-    size_t left = cursize_;
-    char* src = buf_.get();
-    while (left != 0) {
-      ssize_t done = write(fd_, src, RequestToken(left));
-      if (done < 0) {
-        if (errno == EINTR) {
-          continue;
-        }
-        return IOError(filename_, errno);
-      }
-      IOSTATS_ADD(bytes_written, done);
-      TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS2);
-      left -= done;
-      src += done;
-    }
-    cursize_ = 0;
-
-    // sync OS cache to disk for every bytes_per_sync_
-    // TODO: give log file and sst file different options (log
-    // files could be potentially cached in OS for their whole
-    // life time, thus we might not want to flush at all).
-    if (bytes_per_sync_ &&
-        filesize_ - last_sync_size_ >= bytes_per_sync_) {
-      RangeSync(last_sync_size_, filesize_ - last_sync_size_);
-      last_sync_size_ = filesize_;
-    }
-
     return Status::OK();
   }
 
   virtual Status Sync() override {
-    Status s = Flush();
-    if (!s.ok()) {
-      return s;
-    }
-    TEST_KILL_RANDOM(rocksdb_kill_odds);
-    if (pending_sync_ && fdatasync(fd_) < 0) {
+    if (fdatasync(fd_) < 0) {
       return IOError(filename_, errno);
     }
-    TEST_KILL_RANDOM(rocksdb_kill_odds);
-    pending_sync_ = false;
     return Status::OK();
   }
 
   virtual Status Fsync() override {
-    Status s = Flush();
-    if (!s.ok()) {
-      return s;
-    }
-    TEST_KILL_RANDOM(rocksdb_kill_odds);
-    if (pending_fsync_ && fsync(fd_) < 0) {
+    if (fsync(fd_) < 0) {
       return IOError(filename_, errno);
     }
-    TEST_KILL_RANDOM(rocksdb_kill_odds);
-    pending_fsync_ = false;
-    pending_sync_ = false;
     return Status::OK();
   }
 
+  virtual bool IsSyncThreadSafe() const override {
+    return true;
+  }
+
   virtual uint64_t GetFileSize() override { return filesize_; }
 
   virtual Status InvalidateCache(size_t offset, size_t length) override {
@@ -861,8 +723,13 @@ class PosixWritableFile : public WritableFile {
 #ifdef ROCKSDB_FALLOCATE_PRESENT
   virtual Status Allocate(off_t offset, off_t len) override {
     TEST_KILL_RANDOM(rocksdb_kill_odds);
-    int alloc_status = fallocate(
-        fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, offset, len);
+    IOSTATS_TIMER_GUARD(allocate_nanos);
+    int alloc_status = 0;
+    if (allow_fallocate_) {
+      alloc_status =
+          fallocate(fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0,
+                    offset, len);
+    }
     if (alloc_status == 0) {
       return Status::OK();
     } else {
@@ -881,135 +748,6 @@ class PosixWritableFile : public WritableFile {
     return GetUniqueIdFromFile(fd_, id, max_size);
   }
 #endif
-
- private:
-  inline size_t RequestToken(size_t bytes) {
-    if (rate_limiter_ && io_priority_ < Env::IO_TOTAL) {
-      bytes = std::min(bytes,
-          static_cast<size_t>(rate_limiter_->GetSingleBurstBytes()));
-      rate_limiter_->Request(bytes, io_priority_);
-    }
-    return bytes;
-  }
-};
-
-class PosixRandomRWFile : public RandomRWFile {
- private:
-  const std::string filename_;
-  int fd_;
-  bool pending_sync_;
-  bool pending_fsync_;
-#ifdef ROCKSDB_FALLOCATE_PRESENT
-  bool fallocate_with_keep_size_;
-#endif
-
- public:
-  PosixRandomRWFile(const std::string& fname, int fd, const EnvOptions& options)
-      : filename_(fname),
-        fd_(fd),
-        pending_sync_(false),
-        pending_fsync_(false) {
-#ifdef ROCKSDB_FALLOCATE_PRESENT
-    fallocate_with_keep_size_ = options.fallocate_with_keep_size;
-#endif
-    assert(!options.use_mmap_writes && !options.use_mmap_reads);
-  }
-
-  ~PosixRandomRWFile() {
-    if (fd_ >= 0) {
-      Close();
-    }
-  }
-
-  virtual Status Write(uint64_t offset, const Slice& data) override {
-    const char* src = data.data();
-    size_t left = data.size();
-    Status s;
-    pending_sync_ = true;
-    pending_fsync_ = true;
-
-    while (left != 0) {
-      ssize_t done = pwrite(fd_, src, left, offset);
-      if (done < 0) {
-        if (errno == EINTR) {
-          continue;
-        }
-        return IOError(filename_, errno);
-      }
-      IOSTATS_ADD(bytes_written, done);
-
-      left -= done;
-      src += done;
-      offset += done;
-    }
-
-    return Status::OK();
-  }
-
-  virtual Status Read(uint64_t offset, size_t n, Slice* result,
-                      char* scratch) const override {
-    Status s;
-    ssize_t r = -1;
-    size_t left = n;
-    char* ptr = scratch;
-    while (left > 0) {
-      r = pread(fd_, ptr, left, static_cast<off_t>(offset));
-      if (r <= 0) {
-        if (errno == EINTR) {
-          continue;
-        }
-        break;
-      }
-      ptr += r;
-      offset += r;
-      left -= r;
-    }
-    IOSTATS_ADD_IF_POSITIVE(bytes_read, n - left);
-    *result = Slice(scratch, (r < 0) ? 0 : n - left);
-    if (r < 0) {
-      s = IOError(filename_, errno);
-    }
-    return s;
-  }
-
-  virtual Status Close() override {
-    Status s = Status::OK();
-    if (fd_ >= 0 && close(fd_) < 0) {
-      s = IOError(filename_, errno);
-    }
-    fd_ = -1;
-    return s;
-  }
-
-  virtual Status Sync() override {
-    if (pending_sync_ && fdatasync(fd_) < 0) {
-      return IOError(filename_, errno);
-    }
-    pending_sync_ = false;
-    return Status::OK();
-  }
-
-  virtual Status Fsync() override {
-    if (pending_fsync_ && fsync(fd_) < 0) {
-      return IOError(filename_, errno);
-    }
-    pending_fsync_ = false;
-    pending_sync_ = false;
-    return Status::OK();
-  }
-
-#ifdef ROCKSDB_FALLOCATE_PRESENT
-  virtual Status Allocate(off_t offset, off_t len) override {
-    TEST_KILL_RANDOM(rocksdb_kill_odds);
-    int alloc_status = fallocate(
-        fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, offset, len);
-    if (alloc_status == 0) {
-      return Status::OK();
-    } else {
-      return IOError(filename_, errno);
-    }
-  }
-#endif
 };
 
 class PosixDirectory : public Directory {
@@ -1109,6 +847,7 @@ class PosixEnv : public Env {
     result->reset();
     FILE* f = nullptr;
     do {
+      IOSTATS_TIMER_GUARD(open_nanos);
       f = fopen(fname.c_str(), "r");
     } while (f == nullptr && errno == EINTR);
     if (f == nullptr) {
@@ -1127,7 +866,11 @@ class PosixEnv : public Env {
                                      const EnvOptions& options) override {
     result->reset();
     Status s;
-    int fd = open(fname.c_str(), O_RDONLY);
+    int fd;
+    {
+      IOSTATS_TIMER_GUARD(open_nanos);
+      fd = open(fname.c_str(), O_RDONLY);
+    }
     SetFD_CLOEXEC(fd, &options);
     if (fd < 0) {
       s = IOError(fname, errno);
@@ -1160,6 +903,7 @@ class PosixEnv : public Env {
     Status s;
     int fd = -1;
     do {
+      IOSTATS_TIMER_GUARD(open_nanos);
       fd = open(fname.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644);
     } while (fd < 0 && errno == EINTR);
     if (fd < 0) {
@@ -1183,37 +927,20 @@ class PosixEnv : public Env {
         EnvOptions no_mmap_writes_options = options;
         no_mmap_writes_options.use_mmap_writes = false;
 
-        result->reset(
-            new PosixWritableFile(fname, fd, 65536, no_mmap_writes_options)
-        );
+        result->reset(new PosixWritableFile(fname, fd, no_mmap_writes_options));
       }
     }
     return s;
   }
 
-  virtual Status NewRandomRWFile(const std::string& fname,
-                                 unique_ptr<RandomRWFile>* result,
-                                 const EnvOptions& options) override {
-    result->reset();
-    // no support for mmap yet
-    if (options.use_mmap_writes || options.use_mmap_reads) {
-      return Status::NotSupported("No support for mmap read/write yet");
-    }
-    Status s;
-    const int fd = open(fname.c_str(), O_CREAT | O_RDWR, 0644);
-    if (fd < 0) {
-      s = IOError(fname, errno);
-    } else {
-      SetFD_CLOEXEC(fd, &options);
-      result->reset(new PosixRandomRWFile(fname, fd, options));
-    }
-    return s;
-  }
-
   virtual Status NewDirectory(const std::string& name,
                               unique_ptr<Directory>* result) override {
     result->reset();
-    const int fd = open(name.c_str(), 0);
+    int fd;
+    {
+      IOSTATS_TIMER_GUARD(open_nanos);
+      fd = open(name.c_str(), 0);
+    }
     if (fd < 0) {
       return IOError(name, errno);
     } else {
@@ -1222,8 +949,25 @@ class PosixEnv : public Env {
     return Status::OK();
   }
 
-  virtual bool FileExists(const std::string& fname) override {
-    return access(fname.c_str(), F_OK) == 0;
+  virtual Status FileExists(const std::string& fname) override {
+    int result = access(fname.c_str(), F_OK);
+
+    if (result == 0) {
+      return Status::OK();
+    }
+
+    switch (errno) {
+      case EACCES:
+      case ELOOP:
+      case ENAMETOOLONG:
+      case ENOENT:
+      case ENOTDIR:
+        return Status::NotFound();
+      default:
+        assert(errno == EIO || errno == ENOMEM);
+        return Status::IOError("Unexpected error(" + ToString(errno) +
+                               ") accessing file `" + fname + "' ");
+    }
   }
 
   virtual Status GetChildren(const std::string& dir,
@@ -1325,7 +1069,11 @@ class PosixEnv : public Env {
   virtual Status LockFile(const std::string& fname, FileLock** lock) override {
     *lock = nullptr;
     Status result;
-    int fd = open(fname.c_str(), O_RDWR | O_CREAT, 0644);
+    int fd;
+    {
+      IOSTATS_TIMER_GUARD(open_nanos);
+      fd = open(fname.c_str(), O_RDWR | O_CREAT, 0644);
+    }
     if (fd < 0) {
       result = IOError(fname, errno);
     } else if (LockOrUnlock(fname, fd, true) == -1) {
@@ -1394,14 +1142,25 @@ class PosixEnv : public Env {
     return gettid(tid);
   }
 
+  virtual uint64_t GetThreadID() const override {
+    return gettid(pthread_self());
+  }
+
   virtual Status NewLogger(const std::string& fname,
                            shared_ptr<Logger>* result) override {
-    FILE* f = fopen(fname.c_str(), "w");
+    FILE* f;
+    {
+      IOSTATS_TIMER_GUARD(open_nanos);
+      f = fopen(fname.c_str(), "w");
+    }
     if (f == nullptr) {
       result->reset();
       return IOError(fname, errno);
     } else {
       int fd = fileno(f);
+#ifdef ROCKSDB_FALLOCATE_PRESENT
+      fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 4 * 1024);
+#endif
       SetFD_CLOEXEC(fd, nullptr);
       result->reset(new PosixLogger(f, &PosixEnv::gettid, this));
       return Status::OK();
@@ -1518,9 +1277,7 @@ class PosixEnv : public Env {
     // TODO(icanadi) it's faster if fallocate_with_keep_size is false, but it
     // breaks TransactionLogIteratorStallAtLastRecord unit test. Fix the unit
     // test and make this false
-    // CEPH: we don't care about replication and want to avoid updating the
-    // inode... set this to false! [-sage]
-    optimized.fallocate_with_keep_size = false;  //true;
+    optimized.fallocate_with_keep_size = true;
     return optimized;
   }
 
@@ -1719,7 +1476,7 @@ class PosixEnv : public Env {
       ThreadPool* tp = meta->thread_pool_;
 #if ROCKSDB_USING_THREAD_STATUS
       // for thread-status
-      ThreadStatusUtil::SetThreadType(tp->env_,
+      ThreadStatusUtil::RegisterThread(tp->env_,
           (tp->GetThreadPriority() == Env::Priority::HIGH ?
               ThreadStatus::HIGH_PRIORITY :
               ThreadStatus::LOW_PRIORITY));
@@ -1772,7 +1529,8 @@ class PosixEnv : public Env {
 #if defined(_GNU_SOURCE) && defined(__GLIBC_PREREQ)
 #if __GLIBC_PREREQ(2, 12)
         char name_buf[16];
-        snprintf(name_buf, sizeof name_buf, "rocksdb:bg%zu", bgthreads_.size());
+        snprintf(name_buf, sizeof name_buf, "rocksdb:bg%" ROCKSDB_PRIszt,
+                 bgthreads_.size());
         name_buf[sizeof name_buf - 1] = '\0';
         pthread_setname_np(t, name_buf);
 #endif
@@ -1864,10 +1622,11 @@ class PosixEnv : public Env {
 
 };
 
-PosixEnv::PosixEnv() : checkedDiskForMmap_(false),
-                       forceMmapOff(false),
-                       page_size_(getpagesize()),
-                       thread_pools_(Priority::TOTAL) {
+PosixEnv::PosixEnv()
+    : checkedDiskForMmap_(false),
+      forceMmapOff(false),
+      page_size_(getpagesize()),
+      thread_pools_(Priority::TOTAL) {
   PthreadCall("mutex_init", pthread_mutex_init(&mu_, nullptr));
   for (int pool_id = 0; pool_id < Env::Priority::TOTAL; ++pool_id) {
     thread_pools_[pool_id].SetThreadPriority(
@@ -1928,9 +1687,11 @@ void PosixEnv::WaitForJoin() {
 
 std::string Env::GenerateUniqueId() {
   std::string uuid_file = "/proc/sys/kernel/random/uuid";
-  if (FileExists(uuid_file)) {
+
+  Status s = FileExists(uuid_file);
+  if (s.ok()) {
     std::string uuid;
-    Status s = ReadFileToString(this, uuid_file, &uuid);
+    s = ReadFileToString(this, uuid_file, &uuid);
     if (s.ok()) {
       return uuid;
     }
diff --git a/src/rocksdb/util/env_test.cc b/src/rocksdb/util/env_test.cc
index 081a10f..7f5e4b9 100644
--- a/src/rocksdb/util/env_test.cc
+++ b/src/rocksdb/util/env_test.cc
@@ -7,8 +7,10 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
-#include <sys/types.h>
+#ifndef OS_WIN
 #include <sys/ioctl.h>
+#endif
+#include <sys/types.h>
 
 #include <iostream>
 #include <unordered_set>
@@ -34,6 +36,7 @@
 #include "util/mutexlock.h"
 #include "util/string_util.h"
 #include "util/testharness.h"
+#include "util/testutil.h"
 
 namespace rocksdb {
 
@@ -54,46 +57,6 @@ static void SetBool(void* ptr) {
       ->store(true, std::memory_order_relaxed);
 }
 
-class SleepingBackgroundTask {
- public:
-  explicit SleepingBackgroundTask()
-      : bg_cv_(&mutex_), should_sleep_(true), sleeping_(false) {}
-  void DoSleep() {
-    MutexLock l(&mutex_);
-    sleeping_ = true;
-    while (should_sleep_) {
-      bg_cv_.Wait();
-    }
-    sleeping_ = false;
-    bg_cv_.SignalAll();
-  }
-
-  void WakeUp() {
-    MutexLock l(&mutex_);
-    should_sleep_ = false;
-    bg_cv_.SignalAll();
-
-    while (sleeping_) {
-      bg_cv_.Wait();
-    }
-  }
-
-  bool IsSleeping() {
-    MutexLock l(&mutex_);
-    return sleeping_;
-  }
-
-  static void DoSleepTask(void* arg) {
-    reinterpret_cast<SleepingBackgroundTask*>(arg)->DoSleep();
-  }
-
- private:
-  port::Mutex mutex_;
-  port::CondVar bg_cv_;  // Signalled when background work finishes
-  bool should_sleep_;
-  bool sleeping_;
-};
-
 TEST_F(EnvPosixTest, RunImmediately) {
   std::atomic<bool> called(false);
   env_->Schedule(&SetBool, &called);
@@ -106,12 +69,12 @@ TEST_F(EnvPosixTest, UnSchedule) {
   env_->SetBackgroundThreads(1, Env::LOW);
 
   /* Block the low priority queue */
-  SleepingBackgroundTask sleeping_task, sleeping_task1;
-  env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task,
+  test::SleepingBackgroundTask sleeping_task, sleeping_task1;
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task,
                  Env::Priority::LOW);
 
   /* Schedule another task */
-  env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task1,
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task1,
                  Env::Priority::LOW, &sleeping_task1);
 
   /* Remove it with a different tag  */
@@ -319,7 +282,7 @@ TEST_F(EnvPosixTest, TwoPools) {
 }
 
 TEST_F(EnvPosixTest, DecreaseNumBgThreads) {
-  std::vector<SleepingBackgroundTask> tasks(10);
+  std::vector<test::SleepingBackgroundTask> tasks(10);
 
   // Set number of thread to 1 first.
   env_->SetBackgroundThreads(1, Env::Priority::HIGH);
@@ -327,7 +290,7 @@ TEST_F(EnvPosixTest, DecreaseNumBgThreads) {
 
   // Schedule 3 tasks. 0 running; Task 1, 2 waiting.
   for (size_t i = 0; i < 3; i++) {
-    env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &tasks[i],
+    env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &tasks[i],
                    Env::Priority::HIGH);
     Env::Default()->SleepForMicroseconds(kDelayMicros);
   }
@@ -391,7 +354,7 @@ TEST_F(EnvPosixTest, DecreaseNumBgThreads) {
   // Enqueue 5 more tasks. Thread pool size now is 4.
   // Task 0, 3, 4, 5 running;6, 7 waiting.
   for (size_t i = 3; i < 8; i++) {
-    env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &tasks[i],
+    env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &tasks[i],
                    Env::Priority::HIGH);
   }
   Env::Default()->SleepForMicroseconds(kDelayMicros);
@@ -433,9 +396,9 @@ TEST_F(EnvPosixTest, DecreaseNumBgThreads) {
   ASSERT_TRUE(!tasks[7].IsSleeping());
 
   // Enqueue thread 8 and 9. Task 5 running; one of 8, 9 might be running.
-  env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &tasks[8],
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &tasks[8],
                  Env::Priority::HIGH);
-  env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &tasks[9],
+  env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &tasks[9],
                  Env::Priority::HIGH);
   Env::Default()->SleepForMicroseconds(kDelayMicros);
   ASSERT_GT(env_->GetThreadPoolQueueLen(Env::Priority::HIGH), (unsigned int)0);
@@ -662,6 +625,7 @@ TEST_F(EnvPosixTest, AllocateTest) {
   size_t kPageSize = 4096;
   std::string data(1024 * 1024, 'a');
   wfile->SetPreallocationBlockSize(kPreallocateSize);
+  wfile->PrepareWrite(wfile->GetFileSize(), data.size());
   ASSERT_OK(wfile->Append(Slice(data)));
   ASSERT_OK(wfile->Flush());
 
@@ -821,28 +785,6 @@ TEST_F(EnvPosixTest, InvalidateCache) {
 #endif  // not TRAVIS
 #endif  // OS_LINUX
 
-TEST_F(EnvPosixTest, PosixRandomRWFileTest) {
-  EnvOptions soptions;
-  soptions.use_mmap_writes = soptions.use_mmap_reads = false;
-  std::string fname = test::TmpDir() + "/" + "testfile";
-
-  unique_ptr<RandomRWFile> file;
-  ASSERT_OK(env_->NewRandomRWFile(fname, &file, soptions));
-  // If you run the unit test on tmpfs, then tmpfs might not
-  // support fallocate. It is still better to trigger that
-  // code-path instead of eliminating it completely.
-  file.get()->Allocate(0, 10*1024*1024);
-  ASSERT_OK(file.get()->Write(100, Slice("Hello world")));
-  ASSERT_OK(file.get()->Write(105, Slice("Hello world")));
-  ASSERT_OK(file.get()->Sync());
-  ASSERT_OK(file.get()->Fsync());
-  char scratch[100];
-  Slice result;
-  ASSERT_OK(file.get()->Read(100, 16, &result, scratch));
-  ASSERT_EQ(result.compare("HelloHello world"), 0);
-  ASSERT_OK(file.get()->Close());
-}
-
 class TestLogger : public Logger {
  public:
   using Logger::Logv;
@@ -857,6 +799,13 @@ class TestLogger : public Logger {
       int n = vsnprintf(new_format, sizeof(new_format) - 1, format, backup_ap);
       // 48 bytes for extra information + bytes allocated
 
+// When n == -1 there is no terminating zero expected
+#ifdef OS_WIN
+      if (n < 0) {
+        char_0_count++;
+      }
+#endif
+
       if (new_format[0] == '[') {
         // "[DEBUG] "
         ASSERT_TRUE(n <= 56 + (512 - static_cast<int>(sizeof(struct timeval))));
@@ -965,23 +914,109 @@ TEST_F(EnvPosixTest, Preallocation) {
   ASSERT_EQ(last_allocated_block, 0UL);
 
   // Small write should preallocate one block
-  srcfile->Append("test");
+  std::string str = "test";
+  srcfile->PrepareWrite(srcfile->GetFileSize(), str.size());
+  srcfile->Append(str);
   srcfile->GetPreallocationStatus(&block_size, &last_allocated_block);
   ASSERT_EQ(last_allocated_block, 1UL);
 
   // Write an entire preallocation block, make sure we increased by two.
   std::string buf(block_size, ' ');
+  srcfile->PrepareWrite(srcfile->GetFileSize(), buf.size());
   srcfile->Append(buf);
   srcfile->GetPreallocationStatus(&block_size, &last_allocated_block);
   ASSERT_EQ(last_allocated_block, 2UL);
 
   // Write five more blocks at once, ensure we're where we need to be.
   buf = std::string(block_size * 5, ' ');
+  srcfile->PrepareWrite(srcfile->GetFileSize(), buf.size());
   srcfile->Append(buf);
   srcfile->GetPreallocationStatus(&block_size, &last_allocated_block);
   ASSERT_EQ(last_allocated_block, 7UL);
 }
 
+// Test that WritableFileWrapper forwards all calls to WritableFile.
+TEST_F(EnvPosixTest, WritableFileWrapper) {
+  class Base : public WritableFile {
+   public:
+    mutable int *step_;
+
+    void inc(int x) const {
+      EXPECT_EQ(x, (*step_)++);
+    }
+
+    explicit Base(int* step) : step_(step) {
+      inc(0);
+    }
+
+    Status Append(const Slice& data) override { inc(1); return Status::OK(); }
+    Status Truncate(uint64_t size) override { return Status::OK(); }
+    Status Close() override { inc(2); return Status::OK(); }
+    Status Flush() override { inc(3); return Status::OK(); }
+    Status Sync() override { inc(4); return Status::OK(); }
+    Status Fsync() override { inc(5); return Status::OK(); }
+    void SetIOPriority(Env::IOPriority pri) override { inc(6); }
+    uint64_t GetFileSize() override { inc(7); return 0; }
+    void GetPreallocationStatus(size_t* block_size,
+                                size_t* last_allocated_block) override {
+      inc(8);
+    }
+    size_t GetUniqueId(char* id, size_t max_size) const override {
+      inc(9);
+      return 0;
+    }
+    Status InvalidateCache(size_t offset, size_t length) override {
+      inc(10);
+      return Status::OK();
+    }
+
+   protected:
+    Status Allocate(off_t offset, off_t len) override {
+      inc(11);
+      return Status::OK();
+    }
+    Status RangeSync(off_t offset, off_t nbytes) override {
+      inc(12);
+      return Status::OK();
+    }
+
+   public:
+    ~Base() {
+      inc(13);
+    }
+  };
+
+  class Wrapper : public WritableFileWrapper {
+   public:
+    explicit Wrapper(WritableFile* target) : WritableFileWrapper(target) {}
+
+    void CallProtectedMethods() {
+      Allocate(0, 0);
+      RangeSync(0, 0);
+    }
+  };
+
+  int step = 0;
+
+  {
+    Base b(&step);
+    Wrapper w(&b);
+    w.Append(Slice());
+    w.Close();
+    w.Flush();
+    w.Sync();
+    w.Fsync();
+    w.SetIOPriority(Env::IOPriority::IO_HIGH);
+    w.GetFileSize();
+    w.GetPreallocationStatus(nullptr, nullptr);
+    w.GetUniqueId(nullptr, 0);
+    w.InvalidateCache(0, 0);
+    w.CallProtectedMethods();
+  }
+
+  EXPECT_EQ(14, step);
+}
+
 }  // namespace rocksdb
 
 int main(int argc, char** argv) {
diff --git a/src/rocksdb/util/event_logger.cc b/src/rocksdb/util/event_logger.cc
index fdecb8e..92a781c 100644
--- a/src/rocksdb/util/event_logger.cc
+++ b/src/rocksdb/util/event_logger.cc
@@ -18,29 +18,49 @@
 
 namespace rocksdb {
 
-const char* kEventLoggerPrefix = "EVENT_LOG_v1";
 
 EventLoggerStream::EventLoggerStream(Logger* logger)
-    : logger_(logger), log_buffer_(nullptr), json_writter_(nullptr) {}
+    : logger_(logger), log_buffer_(nullptr), json_writer_(nullptr) {}
 
 EventLoggerStream::EventLoggerStream(LogBuffer* log_buffer)
-    : logger_(nullptr), log_buffer_(log_buffer), json_writter_(nullptr) {}
+    : logger_(nullptr), log_buffer_(log_buffer), json_writer_(nullptr) {}
 
 EventLoggerStream::~EventLoggerStream() {
-  if (json_writter_) {
-    json_writter_->EndObject();
+  if (json_writer_) {
+    json_writer_->EndObject();
 #ifdef ROCKSDB_PRINT_EVENTS_TO_STDOUT
-    printf("%s\n", json_writter_->Get().c_str());
+    printf("%s\n", json_writer_->Get().c_str());
 #else
     if (logger_) {
-      Log(logger_, "%s %s", kEventLoggerPrefix, json_writter_->Get().c_str());
+      EventLogger::Log(logger_, *json_writer_);
     } else if (log_buffer_) {
-      LogToBuffer(log_buffer_, "%s %s", kEventLoggerPrefix,
-                  json_writter_->Get().c_str());
+      EventLogger::LogToBuffer(log_buffer_, *json_writer_);
     }
 #endif
-    delete json_writter_;
+    delete json_writer_;
   }
 }
 
+void EventLogger::Log(const JSONWriter& jwriter) {
+  Log(logger_, jwriter);
+}
+
+void EventLogger::Log(Logger* logger, const JSONWriter& jwriter) {
+#ifdef ROCKSDB_PRINT_EVENTS_TO_STDOUT
+  printf("%s\n", jwriter.Get().c_str());
+#else
+  rocksdb::Log(logger, "%s %s", Prefix(), jwriter.Get().c_str());
+#endif
+}
+
+void EventLogger::LogToBuffer(
+    LogBuffer* log_buffer, const JSONWriter& jwriter) {
+#ifdef ROCKSDB_PRINT_EVENTS_TO_STDOUT
+  printf("%s\n", jwriter.Get().c_str());
+#else
+  assert(log_buffer);
+  rocksdb::LogToBuffer(log_buffer, "%s %s", Prefix(), jwriter.Get().c_str());
+#endif
+}
+
 }  // namespace rocksdb
diff --git a/src/rocksdb/util/event_logger.h b/src/rocksdb/util/event_logger.h
index 806b4e5..53a40c2 100644
--- a/src/rocksdb/util/event_logger.h
+++ b/src/rocksdb/util/event_logger.h
@@ -15,11 +15,11 @@
 
 namespace rocksdb {
 
-// JSONWritter doesn't support objects in arrays yet. There wasn't a need for
-// that.
-class JSONWritter {
+class JSONWriter {
  public:
-  JSONWritter() : state_(kExpectKey), first_element_(true) { stream_ << "{"; }
+  JSONWriter() : state_(kExpectKey), first_element_(true), in_array_(false) {
+    stream_ << "{";
+  }
 
   void AddKey(const std::string& key) {
     assert(state_ == kExpectKey);
@@ -59,6 +59,7 @@ class JSONWritter {
   void StartArray() {
     assert(state_ == kExpectValue);
     state_ = kInArray;
+    in_array_ = true;
     stream_ << "[";
     first_element_ = true;
   }
@@ -66,6 +67,7 @@ class JSONWritter {
   void EndArray() {
     assert(state_ == kInArray);
     state_ = kExpectKey;
+    in_array_ = false;
     stream_ << "]";
     first_element_ = false;
   }
@@ -83,9 +85,24 @@ class JSONWritter {
     first_element_ = false;
   }
 
+  void StartArrayedObject() {
+    assert(state_ == kInArray && in_array_);
+    state_ = kExpectValue;
+    if (!first_element_) {
+      stream_ << ", ";
+    }
+    StartObject();
+  }
+
+  void EndArrayedObject() {
+    assert(in_array_);
+    EndObject();
+    state_ = kInArray;
+  }
+
   std::string Get() const { return stream_.str(); }
 
-  JSONWritter& operator<<(const char* val) {
+  JSONWriter& operator<<(const char* val) {
     if (state_ == kExpectKey) {
       AddKey(val);
     } else {
@@ -94,25 +111,27 @@ class JSONWritter {
     return *this;
   }
 
-  JSONWritter& operator<<(const std::string& val) {
+  JSONWriter& operator<<(const std::string& val) {
     return *this << val.c_str();
   }
 
   template <typename T>
-  JSONWritter& operator<<(const T& val) {
+  JSONWriter& operator<<(const T& val) {
     assert(state_ != kExpectKey);
     AddValue(val);
     return *this;
   }
 
  private:
-  enum JSONWritterState {
+  enum JSONWriterState {
     kExpectKey,
     kExpectValue,
     kInArray,
+    kInArrayedObject,
   };
-  JSONWritterState state_;
+  JSONWriterState state_;
   bool first_element_;
+  bool in_array_;
   std::ostringstream stream_;
 };
 
@@ -121,21 +140,21 @@ class EventLoggerStream {
   template <typename T>
   EventLoggerStream& operator<<(const T& val) {
     MakeStream();
-    *json_writter_ << val;
+    *json_writer_ << val;
     return *this;
   }
 
-  void StartArray() { json_writter_->StartArray(); }
-  void EndArray() { json_writter_->EndArray(); }
-  void StartObject() { json_writter_->StartObject(); }
-  void EndObject() { json_writter_->EndObject(); }
+  void StartArray() { json_writer_->StartArray(); }
+  void EndArray() { json_writer_->EndArray(); }
+  void StartObject() { json_writer_->StartObject(); }
+  void EndObject() { json_writer_->EndObject(); }
 
   ~EventLoggerStream();
 
  private:
   void MakeStream() {
-    if (!json_writter_) {
-      json_writter_ = new JSONWritter();
+    if (!json_writer_) {
+      json_writer_ = new JSONWriter();
       *this << "time_micros"
             << std::chrono::duration_cast<std::chrono::microseconds>(
                    std::chrono::system_clock::now().time_since_epoch()).count();
@@ -148,7 +167,7 @@ class EventLoggerStream {
   Logger* const logger_;
   LogBuffer* const log_buffer_;
   // ownership
-  JSONWritter* json_writter_;
+  JSONWriter* json_writer_;
 };
 
 // here is an example of the output that will show up in the LOG:
@@ -157,11 +176,18 @@ class EventLoggerStream {
 // "file_size": 1909699}
 class EventLogger {
  public:
+  static const char* Prefix() {
+    return "EVENT_LOG_v1";
+  }
+
   explicit EventLogger(Logger* logger) : logger_(logger) {}
   EventLoggerStream Log() { return EventLoggerStream(logger_); }
   EventLoggerStream LogToBuffer(LogBuffer* log_buffer) {
     return EventLoggerStream(log_buffer);
   }
+  void Log(const JSONWriter& jwriter);
+  static void Log(Logger* logger, const JSONWriter& jwriter);
+  static void LogToBuffer(LogBuffer* log_buffer, const JSONWriter& jwriter);
 
  private:
   Logger* logger_;
diff --git a/src/rocksdb/util/file_reader_writer.cc b/src/rocksdb/util/file_reader_writer.cc
new file mode 100644
index 0000000..86d70b6
--- /dev/null
+++ b/src/rocksdb/util/file_reader_writer.cc
@@ -0,0 +1,471 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/file_reader_writer.h"
+
+#include <algorithm>
+#include <mutex>
+
+#include "port/port.h"
+#include "util/histogram.h"
+#include "util/iostats_context_imp.h"
+#include "util/random.h"
+#include "util/rate_limiter.h"
+#include "util/sync_point.h"
+
+namespace rocksdb {
+
+namespace {
+  const size_t c_OneMb = (1 << 20);
+}
+
+Status SequentialFileReader::Read(size_t n, Slice* result, char* scratch) {
+  Status s = file_->Read(n, result, scratch);
+  IOSTATS_ADD(bytes_read, result->size());
+  return s;
+}
+
+Status SequentialFileReader::Skip(uint64_t n) { return file_->Skip(n); }
+
+Status RandomAccessFileReader::Read(uint64_t offset, size_t n, Slice* result,
+                                    char* scratch) const {
+  Status s;
+  uint64_t elapsed = 0;
+  {
+    StopWatch sw(env_, stats_, hist_type_,
+                 (stats_ != nullptr) ? &elapsed : nullptr);
+    IOSTATS_TIMER_GUARD(read_nanos);
+    s = file_->Read(offset, n, result, scratch);
+    IOSTATS_ADD_IF_POSITIVE(bytes_read, result->size());
+  }
+  if (stats_ != nullptr && file_read_hist_ != nullptr) {
+    file_read_hist_->Add(elapsed);
+  }
+  return s;
+}
+
+Status WritableFileWriter::Append(const Slice& data) {
+  const char* src = data.data();
+  size_t left = data.size();
+  Status s;
+  pending_sync_ = true;
+  pending_fsync_ = true;
+
+  TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS2);
+
+  {
+    IOSTATS_TIMER_GUARD(prepare_write_nanos);
+    TEST_SYNC_POINT("WritableFileWriter::Append:BeforePrepareWrite");
+    writable_file_->PrepareWrite(static_cast<size_t>(GetFileSize()), left);
+  }
+
+  // Flush only when I/O is buffered
+  if (use_os_buffer_ &&
+    (buf_.Capacity() - buf_.CurrentSize()) < left) {
+    if (buf_.CurrentSize() > 0) {
+      s = Flush();
+      if (!s.ok()) {
+        return s;
+      }
+    }
+
+    if (buf_.Capacity() < c_OneMb) {
+      size_t desiredCapacity = buf_.Capacity() * 2;
+      desiredCapacity = std::min(desiredCapacity, c_OneMb);
+      buf_.AllocateNewBuffer(desiredCapacity);
+    }
+    assert(buf_.CurrentSize() == 0);
+  }
+
+  // We never write directly to disk with unbuffered I/O on; otherwise we
+  // simply use the buffer for its original purpose, to accumulate many
+  // small chunks.
+  if (!use_os_buffer_ || (buf_.Capacity() >= left)) {
+    while (left > 0) {
+      size_t appended = buf_.Append(src, left);
+      left -= appended;
+      src += appended;
+
+      if (left > 0) {
+        s = Flush();
+        if (!s.ok()) {
+          break;
+        }
+
+        // We double the buffer here because Flush calls do not keep up
+        // with the incoming bytes. This is the only place where the buffer
+        // is resized when unbuffered I/O is on.
+        if (buf_.Capacity() < (1 << 20)) {
+          size_t desiredCapacity = buf_.Capacity() * 2;
+          desiredCapacity = std::min(desiredCapacity, c_OneMb);
+          buf_.AllocateNewBuffer(desiredCapacity);
+        }
+      }
+    }
+  } else {
+    // Writing directly to file bypassing the buffer
+    assert(buf_.CurrentSize() == 0);
+    s = WriteBuffered(src, left);
+  }
+
+  TEST_KILL_RANDOM(rocksdb_kill_odds);
+  filesize_ += data.size();
+  return s;
+}
+
+Status WritableFileWriter::Close() {
+
+  // Do not quit immediately on failure; the file MUST be closed.
+  Status s;
+
+  // It is possible to close the file twice now, as we MUST close it in the
+  // destructor; simply flushing is not enough. On Windows, pre-allocating
+  // does not fill with zeros, and with unbuffered access we also need to
+  // set the end of data.
+  if (!writable_file_) {
+    return s;
+  }
+
+  s = Flush();  // flush cache to OS
+
+  // In unbuffered mode we write whole pages so
+  // we need to let the file know where data ends.
+  Status interim = writable_file_->Truncate(filesize_);
+  if (!interim.ok() && s.ok()) {
+    s = interim;
+  }
+
+  TEST_KILL_RANDOM(rocksdb_kill_odds);
+  interim = writable_file_->Close();
+  if (!interim.ok() && s.ok()) {
+    s = interim;
+  }
+
+  writable_file_.reset();
+
+  return s;
+}
+
+
+// write out the cached data to the OS cache
+Status WritableFileWriter::Flush() {
+  Status s;
+  TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS2);
+
+  if (buf_.CurrentSize() > 0) {
+    if (use_os_buffer_) {
+      s = WriteBuffered(buf_.BufferStart(), buf_.CurrentSize());
+    } else {
+      s = WriteUnbuffered();
+    }
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  s = writable_file_->Flush();
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  // Sync the OS cache to disk every bytes_per_sync_ bytes.
+  // TODO: give log files and sst files different options (log
+  // files could potentially be cached in the OS for their whole
+  // life time, thus we might not want to flush at all).
+
+  // We try to avoid syncing the last 1MB of data, for two reasons:
+  // (1) to avoid rewriting the same page that is modified later;
+  // (2) on older versions of the OS, write can block while writing out
+  //     the page.
+  // XFS does neighbor-page flushing outside of the specified ranges, so we
+  // need to make sure the sync range is far from the write offset.
+  if (!direct_io_ && bytes_per_sync_) {
+    const uint64_t kBytesNotSyncRange = 1024 * 1024;  // recent 1MB is not synced.
+    const uint64_t kBytesAlignWhenSync = 4 * 1024;    // Align 4KB.
+    if (filesize_ > kBytesNotSyncRange) {
+      uint64_t offset_sync_to = filesize_ - kBytesNotSyncRange;
+      offset_sync_to -= offset_sync_to % kBytesAlignWhenSync;
+      assert(offset_sync_to >= last_sync_size_);
+      if (offset_sync_to > 0 &&
+          offset_sync_to - last_sync_size_ >= bytes_per_sync_) {
+        s = RangeSync(last_sync_size_, offset_sync_to - last_sync_size_);
+        last_sync_size_ = offset_sync_to;
+      }
+    }
+  }
+
+  return s;
+}
+
+Status WritableFileWriter::Sync(bool use_fsync) {
+  Status s = Flush();
+  if (!s.ok()) {
+    return s;
+  }
+  TEST_KILL_RANDOM(rocksdb_kill_odds);
+  if (!direct_io_ && pending_sync_) {
+    s = SyncInternal(use_fsync);
+    if (!s.ok()) {
+      return s;
+    }
+  }
+  TEST_KILL_RANDOM(rocksdb_kill_odds);
+  pending_sync_ = false;
+  if (use_fsync) {
+    pending_fsync_ = false;
+  }
+  return Status::OK();
+}
+
+Status WritableFileWriter::SyncWithoutFlush(bool use_fsync) {
+  if (!writable_file_->IsSyncThreadSafe()) {
+    return Status::NotSupported(
+      "Can't WritableFileWriter::SyncWithoutFlush() because "
+      "WritableFile::IsSyncThreadSafe() is false");
+  }
+  TEST_SYNC_POINT("WritableFileWriter::SyncWithoutFlush:1");
+  Status s = SyncInternal(use_fsync);
+  TEST_SYNC_POINT("WritableFileWriter::SyncWithoutFlush:2");
+  return s;
+}
+
+Status WritableFileWriter::SyncInternal(bool use_fsync) {
+  Status s;
+  IOSTATS_TIMER_GUARD(fsync_nanos);
+  TEST_SYNC_POINT("WritableFileWriter::SyncInternal:0");
+  if (use_fsync) {
+    s = writable_file_->Fsync();
+  } else {
+    s = writable_file_->Sync();
+  }
+  return s;
+}
+
+Status WritableFileWriter::RangeSync(off_t offset, off_t nbytes) {
+  IOSTATS_TIMER_GUARD(range_sync_nanos);
+  TEST_SYNC_POINT("WritableFileWriter::RangeSync:0");
+  return writable_file_->RangeSync(offset, nbytes);
+}
+
+size_t WritableFileWriter::RequestToken(size_t bytes, bool align) {
+  Env::IOPriority io_priority;
+  if (rate_limiter_ && (io_priority = writable_file_->GetIOPriority()) <
+      Env::IO_TOTAL) {
+    bytes = std::min(
+      bytes, static_cast<size_t>(rate_limiter_->GetSingleBurstBytes()));
+
+    if (align) {
+      // Here we may actually require more than the burst and block,
+      // but we cannot write less than one page at a time with unbuffered
+      // I/O, so we may prefer not to use the rate limiter here.
+      size_t alignment = buf_.Alignment();
+      bytes = std::max(alignment, TruncateToPageBoundary(alignment, bytes));
+    }
+    rate_limiter_->Request(bytes, io_priority);
+  }
+  return bytes;
+}
+
+// This method writes to disk the specified data and makes use of the rate
+// limiter if available
+Status WritableFileWriter::WriteBuffered(const char* data, size_t size) {
+  Status s;
+  assert(use_os_buffer_);
+  const char* src = data;
+  size_t left = size;
+
+  while (left > 0) {
+    size_t allowed = RequestToken(left, false);
+
+    {
+      IOSTATS_TIMER_GUARD(write_nanos);
+      TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend");
+      s = writable_file_->Append(Slice(src, allowed));
+      if (!s.ok()) {
+        return s;
+      }
+    }
+
+    IOSTATS_ADD(bytes_written, allowed);
+    TEST_KILL_RANDOM(rocksdb_kill_odds);
+
+    left -= allowed;
+    src += allowed;
+  }
+  buf_.Size(0);
+  return s;
+}
+
+
+// This flushes the accumulated data in the buffer, padding it with zeros to
+// a whole page if necessary (during automatic flushes padding would not be
+// necessary). We always use the RateLimiter if available. Any buffer bytes
+// left over the whole number of pages are moved (RefitTail) to be written
+// again on the next flush, because we can only write at aligned offsets.
+Status WritableFileWriter::WriteUnbuffered() {
+  Status s;
+
+  assert(!use_os_buffer_);
+  const size_t alignment = buf_.Alignment();
+  assert((next_write_offset_ % alignment) == 0);
+
+  // Calculate whole page final file advance if all writes succeed
+  size_t file_advance =
+    TruncateToPageBoundary(alignment, buf_.CurrentSize());
+
+  // Calculate the leftover tail. We write it here padded with zeros, BUT we
+  // will write it again in the future, either on Close() or when the
+  // current whole page fills out.
+  size_t leftover_tail = buf_.CurrentSize() - file_advance;
+
+  // Round up and pad
+  buf_.PadToAlignmentWith(0);
+
+  const char* src = buf_.BufferStart();
+  uint64_t write_offset = next_write_offset_;
+  size_t left = buf_.CurrentSize();
+
+  while (left > 0) {
+    // Check how much is allowed
+    size_t size = RequestToken(left, true);
+
+    {
+      IOSTATS_TIMER_GUARD(write_nanos);
+      TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend");
+      // Unbuffered writes must be positional
+      s = writable_file_->PositionedAppend(Slice(src, size), write_offset);
+      if (!s.ok()) {
+        buf_.Size(file_advance + leftover_tail);
+        return s;
+      }
+    }
+
+    IOSTATS_ADD(bytes_written, size);
+    left -= size;
+    src += size;
+    write_offset += size;
+    assert((next_write_offset_ % alignment) == 0);
+  }
+
+  if (s.ok()) {
+    // Move the tail to the beginning of the buffer
+    // This never happens during normal Append but rather during
+    // explicit call to Flush()/Sync() or Close()
+    buf_.RefitTail(file_advance, leftover_tail);
+    // This is where we start writing next time, which may or may not be
+    // the actual file size on disk. They match if the buffer size is a
+    // multiple of whole pages; otherwise filesize_ is leftover_tail behind.
+    next_write_offset_ += file_advance;
+  }
+  return s;
+}
+
+
+namespace {
+class ReadaheadRandomAccessFile : public RandomAccessFile {
+ public:
+  ReadaheadRandomAccessFile(std::unique_ptr<RandomAccessFile>&& file,
+                            size_t readahead_size)
+      : file_(std::move(file)),
+        readahead_size_(readahead_size),
+        forward_calls_(file_->ShouldForwardRawRequest()),
+        buffer_(new char[readahead_size_]),
+        buffer_offset_(0),
+        buffer_len_(0) {}
+
+  ReadaheadRandomAccessFile(const ReadaheadRandomAccessFile&) = delete;
+
+  ReadaheadRandomAccessFile& operator=(const ReadaheadRandomAccessFile&) =
+      delete;
+
+  virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                      char* scratch) const override {
+    if (n >= readahead_size_) {
+      return file_->Read(offset, n, result, scratch);
+    }
+
+    // On Windows in unbuffered mode this will lead to double buffering
+    // and double locking, so we avoid that.
+    // In normal mode Windows caches so much data from disk that we do
+    // not need readahead.
+    if (forward_calls_) {
+      return file_->Read(offset, n, result, scratch);
+    }
+
+    std::unique_lock<std::mutex> lk(lock_);
+
+    size_t copied = 0;
+    // if offset is within [buffer_offset_, buffer_offset_ + buffer_len_)
+    if (offset >= buffer_offset_ && offset < buffer_len_ + buffer_offset_) {
+      uint64_t offset_in_buffer = offset - buffer_offset_;
+      copied = std::min(static_cast<uint64_t>(buffer_len_) - offset_in_buffer,
+        static_cast<uint64_t>(n));
+      memcpy(scratch, buffer_.get() + offset_in_buffer, copied);
+      if (copied == n) {
+        // fully cached
+        *result = Slice(scratch, n);
+        return Status::OK();
+      }
+    }
+    Slice readahead_result;
+    Status s = file_->Read(offset + copied, readahead_size_, &readahead_result,
+      buffer_.get());
+    if (!s.ok()) {
+      return s;
+    }
+
+    auto left_to_copy = std::min(readahead_result.size(), n - copied);
+    memcpy(scratch + copied, readahead_result.data(), left_to_copy);
+    *result = Slice(scratch, copied + left_to_copy);
+
+    if (readahead_result.data() == buffer_.get()) {
+      buffer_offset_ = offset + copied;
+      buffer_len_ = readahead_result.size();
+    } else {
+      buffer_len_ = 0;
+    }
+
+    return Status::OK();
+  }
+
+  virtual size_t GetUniqueId(char* id, size_t max_size) const override {
+    return file_->GetUniqueId(id, max_size);
+  }
+
+  virtual void Hint(AccessPattern pattern) override { file_->Hint(pattern); }
+
+  virtual Status InvalidateCache(size_t offset, size_t length) override {
+    return file_->InvalidateCache(offset, length);
+  }
+
+ private:
+  std::unique_ptr<RandomAccessFile> file_;
+  size_t               readahead_size_;
+  const bool           forward_calls_;
+
+  mutable std::mutex   lock_;
+  mutable std::unique_ptr<char[]> buffer_;
+  mutable uint64_t     buffer_offset_;
+  mutable size_t       buffer_len_;
+};
+}  // namespace
+
+std::unique_ptr<RandomAccessFile> NewReadaheadRandomAccessFile(
+    std::unique_ptr<RandomAccessFile>&& file, size_t readahead_size) {
+  std::unique_ptr<RandomAccessFile> result(
+    new ReadaheadRandomAccessFile(std::move(file), readahead_size));
+  return result;
+}
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/file_reader_writer.h b/src/rocksdb/util/file_reader_writer.h
new file mode 100644
index 0000000..4134a0e
--- /dev/null
+++ b/src/rocksdb/util/file_reader_writer.h
@@ -0,0 +1,166 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+#include "rocksdb/env.h"
+#include "util/aligned_buffer.h"
+#include "port/port.h"
+
+namespace rocksdb {
+
+class Statistics;
+class HistogramImpl;
+
+std::unique_ptr<RandomAccessFile> NewReadaheadRandomAccessFile(
+  std::unique_ptr<RandomAccessFile>&& file, size_t readahead_size);
+
+class SequentialFileReader {
+ private:
+  std::unique_ptr<SequentialFile> file_;
+
+ public:
+  explicit SequentialFileReader(std::unique_ptr<SequentialFile>&& _file)
+      : file_(std::move(_file)) {}
+
+  SequentialFileReader(SequentialFileReader&& o) ROCKSDB_NOEXCEPT {
+    *this = std::move(o);
+  }
+
+  SequentialFileReader& operator=(SequentialFileReader&& o) ROCKSDB_NOEXCEPT {
+    file_ = std::move(o.file_);
+    return *this;
+  }
+
+  SequentialFileReader(SequentialFileReader&) = delete;
+  SequentialFileReader& operator=(SequentialFileReader&) = delete;
+
+  Status Read(size_t n, Slice* result, char* scratch);
+
+  Status Skip(uint64_t n);
+
+  SequentialFile* file() { return file_.get(); }
+};
+
+class RandomAccessFileReader {
+ private:
+  std::unique_ptr<RandomAccessFile> file_;
+  Env*            env_;
+  Statistics*     stats_;
+  uint32_t        hist_type_;
+  HistogramImpl*  file_read_hist_;
+
+ public:
+  explicit RandomAccessFileReader(std::unique_ptr<RandomAccessFile>&& raf,
+                                  Env* env = nullptr,
+                                  Statistics* stats = nullptr,
+                                  uint32_t hist_type = 0,
+                                  HistogramImpl* file_read_hist = nullptr)
+      : file_(std::move(raf)),
+        env_(env),
+        stats_(stats),
+        hist_type_(hist_type),
+        file_read_hist_(file_read_hist) {}
+
+  RandomAccessFileReader(RandomAccessFileReader&& o) ROCKSDB_NOEXCEPT {
+    *this = std::move(o);
+  }
+
+  RandomAccessFileReader& operator=(RandomAccessFileReader&& o) ROCKSDB_NOEXCEPT {
+    file_ = std::move(o.file_);
+    env_ = std::move(o.env_);
+    stats_ = std::move(o.stats_);
+    hist_type_ = std::move(o.hist_type_);
+    file_read_hist_ = std::move(o.file_read_hist_);
+    return *this;
+  }
+
+  RandomAccessFileReader(const RandomAccessFileReader&) = delete;
+  RandomAccessFileReader& operator=(const RandomAccessFileReader&) = delete;
+
+  Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const;
+
+  RandomAccessFile* file() { return file_.get(); }
+};
+
+// Use posix write to write data to a file.
+class WritableFileWriter {
+ private:
+  std::unique_ptr<WritableFile> writable_file_;
+  AlignedBuffer           buf_;
+  // Actual written data size, which can be used for truncate;
+  // it does not count padding data.
+  uint64_t                filesize_;
+  // This is necessary when we use unbuffered access: writes must happen
+  // at aligned offsets, so we need to go back and write that page again.
+  uint64_t                next_write_offset_;
+  bool                    pending_sync_;
+  bool                    pending_fsync_;
+  const bool              direct_io_;
+  const bool              use_os_buffer_;
+  uint64_t                last_sync_size_;
+  uint64_t                bytes_per_sync_;
+  RateLimiter*            rate_limiter_;
+
+ public:
+  WritableFileWriter(std::unique_ptr<WritableFile>&& file,
+                     const EnvOptions& options)
+      : writable_file_(std::move(file)),
+        buf_(),
+        filesize_(0),
+        next_write_offset_(0),
+        pending_sync_(false),
+        pending_fsync_(false),
+        direct_io_(writable_file_->UseDirectIO()),
+        use_os_buffer_(writable_file_->UseOSBuffer()),
+        last_sync_size_(0),
+        bytes_per_sync_(options.bytes_per_sync),
+        rate_limiter_(options.rate_limiter) {
+
+    buf_.Alignment(writable_file_->GetRequiredBufferAlignment());
+    buf_.AllocateNewBuffer(65536);
+  }
+
+  WritableFileWriter(const WritableFileWriter&) = delete;
+
+  WritableFileWriter& operator=(const WritableFileWriter&) = delete;
+
+  ~WritableFileWriter() { Close(); }
+
+  Status Append(const Slice& data);
+
+  Status Flush();
+
+  Status Close();
+
+  Status Sync(bool use_fsync);
+
+  // Sync only the data that was already Flush()ed. Safe to call concurrently
+  // with Append() and Flush(). If !writable_file_->IsSyncThreadSafe(),
+  // returns NotSupported status.
+  Status SyncWithoutFlush(bool use_fsync);
+
+  uint64_t GetFileSize() { return filesize_; }
+
+  Status InvalidateCache(size_t offset, size_t length) {
+    return writable_file_->InvalidateCache(offset, length);
+  }
+
+  WritableFile* writable_file() const { return writable_file_.get(); }
+
+ private:
+  // Used when OS buffering is OFF and we are writing via DMA,
+  // such as in Windows unbuffered mode.
+  Status WriteUnbuffered();
+  // Normal write
+  Status WriteBuffered(const char* data, size_t size);
+  Status RangeSync(off_t offset, off_t nbytes);
+  size_t RequestToken(size_t bytes, bool align);
+  Status SyncInternal(bool use_fsync);
+};
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/file_reader_writer_test.cc b/src/rocksdb/util/file_reader_writer_test.cc
new file mode 100644
index 0000000..d1f0dcb
--- /dev/null
+++ b/src/rocksdb/util/file_reader_writer_test.cc
@@ -0,0 +1,92 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#include <vector>
+#include "util/file_reader_writer.h"
+#include "util/random.h"
+#include "util/testharness.h"
+
+namespace rocksdb {
+
+class WritableFileWriterTest : public testing::Test {};
+
+const uint32_t kMb = 1 << 20;
+
+TEST_F(WritableFileWriterTest, RangeSync) {
+  class FakeWF : public WritableFile {
+   public:
+    explicit FakeWF() : size_(0), last_synced_(0) {}
+    ~FakeWF() {}
+
+    Status Append(const Slice& data) override {
+      size_ += data.size();
+      return Status::OK();
+    }
+    virtual Status Truncate(uint64_t size) override {
+      return Status::OK();
+    }
+    Status Close() override {
+      EXPECT_GE(size_, last_synced_ + kMb);
+      EXPECT_LT(size_, last_synced_ + 2 * kMb);
+      // Make sure the random writes generated enough data.
+      EXPECT_GT(size_, 10 * kMb);
+      return Status::OK();
+    }
+    Status Flush() override { return Status::OK(); }
+    Status Sync() override { return Status::OK(); }
+    Status Fsync() override { return Status::OK(); }
+    void SetIOPriority(Env::IOPriority pri) override {}
+    uint64_t GetFileSize() override { return size_; }
+    void GetPreallocationStatus(size_t* block_size,
+                                size_t* last_allocated_block) override {}
+    size_t GetUniqueId(char* id, size_t max_size) const override { return 0; }
+    Status InvalidateCache(size_t offset, size_t length) override {
+      return Status::OK();
+    }
+
+   protected:
+    Status Allocate(off_t offset, off_t len) override { return Status::OK(); }
+    Status RangeSync(off_t offset, off_t nbytes) override {
+      EXPECT_EQ(offset % 4096, 0u);
+      EXPECT_EQ(nbytes % 4096, 0u);
+
+      EXPECT_EQ(offset, last_synced_);
+      last_synced_ = offset + nbytes;
+      EXPECT_GE(size_, last_synced_ + kMb);
+      if (size_ > 2 * kMb) {
+        EXPECT_LT(size_, last_synced_ + 2 * kMb);
+      }
+      return Status::OK();
+    }
+
+    uint64_t size_;
+    uint64_t last_synced_;
+  };
+
+  EnvOptions env_options;
+  env_options.bytes_per_sync = kMb;
+  unique_ptr<FakeWF> wf(new FakeWF);
+  unique_ptr<WritableFileWriter> writer(
+      new WritableFileWriter(std::move(wf), env_options));
+  Random r(301);
+  std::unique_ptr<char[]> large_buf(new char[10 * kMb]);
+  for (int i = 0; i < 1000; i++) {
+    int skew_limit = (i < 700) ? 10 : 15;
+    uint32_t num = r.Skewed(skew_limit) * 100 + r.Uniform(100);
+    writer->Append(Slice(large_buf.get(), num));
+
+    // Flush with a probability of 1/10.
+    if (r.Uniform(10) == 0) {
+      writer->Flush();
+    }
+  }
+  writer->Close();
+}
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/util/file_util.cc b/src/rocksdb/util/file_util.cc
index c75d59c..1bcf3ed 100644
--- a/src/rocksdb/util/file_util.cc
+++ b/src/rocksdb/util/file_util.cc
@@ -3,11 +3,16 @@
 //  LICENSE file in the root directory of this source tree. An additional grant
 //  of patent rights can be found in the PATENTS file in the same directory.
 //
+#include "util/file_util.h"
+
 #include <string>
 #include <algorithm>
-#include "util/file_util.h"
+
+#include "rocksdb/delete_scheduler.h"
 #include "rocksdb/env.h"
+#include "rocksdb/options.h"
 #include "db/filename.h"
+#include "util/file_reader_writer.h"
 
 namespace rocksdb {
 
@@ -15,8 +20,12 @@ namespace rocksdb {
 Status CopyFile(Env* env, const std::string& source,
                 const std::string& destination, uint64_t size) {
   const EnvOptions soptions;
-  unique_ptr<SequentialFile> srcfile;
   Status s;
+  unique_ptr<SequentialFileReader> src_reader;
+  unique_ptr<WritableFileWriter> dest_writer;
+
+  {
+    unique_ptr<SequentialFile> srcfile;
   s = env->NewSequentialFile(source, &srcfile, soptions);
   unique_ptr<WritableFile> destfile;
   if (s.ok()) {
@@ -33,6 +42,9 @@ Status CopyFile(Env* env, const std::string& source,
       return s;
     }
   }
+  src_reader.reset(new SequentialFileReader(std::move(srcfile)));
+  dest_writer.reset(new WritableFileWriter(std::move(destfile), soptions));
+  }
 
   char buffer[4096];
   Slice slice;
@@ -40,13 +52,13 @@ Status CopyFile(Env* env, const std::string& source,
     uint64_t bytes_to_read =
         std::min(static_cast<uint64_t>(sizeof(buffer)), size);
     if (s.ok()) {
-      s = srcfile->Read(bytes_to_read, &slice, buffer);
+      s = src_reader->Read(bytes_to_read, &slice, buffer);
     }
     if (s.ok()) {
       if (slice.size() == 0) {
         return Status::Corruption("file too small");
       }
-      s = destfile->Append(slice);
+      s = dest_writer->Append(slice);
     }
     if (!s.ok()) {
       return s;
@@ -56,4 +68,13 @@ Status CopyFile(Env* env, const std::string& source,
   return Status::OK();
 }
 
+Status DeleteOrMoveToTrash(const DBOptions* db_options,
+                           const std::string& fname) {
+  if (db_options->delete_scheduler == nullptr) {
+    return db_options->env->DeleteFile(fname);
+  } else {
+    return db_options->delete_scheduler->DeleteFile(fname);
+  }
+}
+
 }  // namespace rocksdb
diff --git a/src/rocksdb/util/file_util.h b/src/rocksdb/util/file_util.h
index 84b3734..f3e02fb 100644
--- a/src/rocksdb/util/file_util.h
+++ b/src/rocksdb/util/file_util.h
@@ -3,16 +3,20 @@
 //  LICENSE file in the root directory of this source tree. An additional grant
 //  of patent rights can be found in the PATENTS file in the same directory.
 //
+#pragma once
 #include <string>
 
-#pragma once
 #include "rocksdb/status.h"
 #include "rocksdb/types.h"
 #include "rocksdb/env.h"
+#include "rocksdb/options.h"
 
 namespace rocksdb {
 
 extern Status CopyFile(Env* env, const std::string& source,
                        const std::string& destination, uint64_t size = 0);
 
+extern Status DeleteOrMoveToTrash(const DBOptions* db_options,
+                                  const std::string& fname);
+
 }  // namespace rocksdb
diff --git a/src/rocksdb/util/hash_cuckoo_rep.cc b/src/rocksdb/util/hash_cuckoo_rep.cc
index 3ac5ba7..6e5057a 100644
--- a/src/rocksdb/util/hash_cuckoo_rep.cc
+++ b/src/rocksdb/util/hash_cuckoo_rep.cc
@@ -5,20 +5,21 @@
 //
 
 #ifndef ROCKSDB_LITE
+
 #include "util/hash_cuckoo_rep.h"
 
 #include <algorithm>
 #include <atomic>
 #include <limits>
+#include <memory>
 #include <queue>
 #include <string>
-#include <memory>
 #include <vector>
 
-#include "rocksdb/memtablerep.h"
-#include "util/murmurhash.h"
 #include "db/memtable.h"
 #include "db/skiplist.h"
+#include "rocksdb/memtablerep.h"
+#include "util/murmurhash.h"
 #include "util/stl_wrappers.h"
 
 namespace rocksdb {
@@ -39,8 +40,15 @@ struct CuckooStep {
 
   CuckooStep() : bucket_id_(-1), prev_step_id_(kNullStep), depth_(1) {}
 
-  CuckooStep(CuckooStep&&) = default;
-  CuckooStep& operator=(CuckooStep&&) = default;
+  // MSVC does not support = default yet
+  CuckooStep(CuckooStep&& o) ROCKSDB_NOEXCEPT { *this = std::move(o); }
+
+  CuckooStep& operator=(CuckooStep&& rhs) {
+    bucket_id_ = std::move(rhs.bucket_id_);
+    prev_step_id_ = std::move(rhs.prev_step_id_);
+    depth_ = std::move(rhs.depth_);
+    return *this;
+  }
 
   CuckooStep(const CuckooStep&) = delete;
   CuckooStep& operator=(const CuckooStep&) = delete;
@@ -54,18 +62,20 @@ class HashCuckooRep : public MemTableRep {
   explicit HashCuckooRep(const MemTableRep::KeyComparator& compare,
                          MemTableAllocator* allocator,
                          const size_t bucket_count,
-                         const unsigned int hash_func_count)
+                         const unsigned int hash_func_count,
+                         const size_t approximate_entry_size)
       : MemTableRep(allocator),
         compare_(compare),
         allocator_(allocator),
         bucket_count_(bucket_count),
+        approximate_entry_size_(approximate_entry_size),
         cuckoo_path_max_depth_(kDefaultCuckooPathMaxDepth),
         occupied_count_(0),
         hash_function_count_(hash_func_count),
         backup_table_(nullptr) {
     char* mem = reinterpret_cast<char*>(
         allocator_->Allocate(sizeof(std::atomic<const char*>) * bucket_count_));
-    cuckoo_array_ = new (mem) std::atomic<const char*>[bucket_count_];
+    cuckoo_array_ = new (mem) std::atomic<char*>[bucket_count_];
     for (unsigned int bid = 0; bid < bucket_count_; ++bid) {
       cuckoo_array_[bid].store(nullptr, std::memory_order_relaxed);
     }
@@ -91,15 +101,15 @@ class HashCuckooRep : public MemTableRep {
   // the current mem-table already contains the specified key.
   virtual void Insert(KeyHandle handle) override;
 
-  // This function returns std::numeric_limits<size_t>::max() in the following
-  // three cases to disallow further write operations:
+  // This function returns bucket_count_ * approximate_entry_size_ when any
+  // of the following happens, to disallow further write operations:
   // 1. when the fullness reaches kMaxFullness.
   // 2. when the backup_table_ is used.
   //
   // otherwise, this function will always return 0.
   virtual size_t ApproximateMemoryUsage() override {
     if (is_nearly_full_) {
-      return std::numeric_limits<size_t>::max();
+      return bucket_count_ * approximate_entry_size_;
     }
     return 0;
   }
@@ -110,7 +120,7 @@ class HashCuckooRep : public MemTableRep {
 
   class Iterator : public MemTableRep::Iterator {
     std::shared_ptr<std::vector<const char*>> bucket_;
-    typename std::vector<const char*>::const_iterator mutable cit_;
+    std::vector<const char*>::const_iterator mutable cit_;
     const KeyComparator& compare_;
     std::string tmp_;  // For passing to EncodeKey
     bool mutable sorted_;
@@ -186,6 +196,8 @@ class HashCuckooRep : public MemTableRep {
   MemTableAllocator* const allocator_;
   // the number of hash bucket in the hash table.
   const size_t bucket_count_;
+  // approximate size of each entry
+  const size_t approximate_entry_size_;
   // the maximum depth of the cuckoo path.
   const unsigned int cuckoo_path_max_depth_;
   // the current number of entries in cuckoo_array_ which has been occupied.
@@ -196,7 +208,7 @@ class HashCuckooRep : public MemTableRep {
   // a vacant bucket for inserting the key of a put request.
   std::shared_ptr<MemTableRep> backup_table_;
   // the array to store pointers, pointing to the actual data.
-  std::atomic<const char*>* cuckoo_array_;
+  std::atomic<char*>* cuckoo_array_;
   // a buffer to store cuckoo path
   int* cuckoo_path_;
   // a boolean flag indicating whether the fullness of bucket array
@@ -287,8 +299,8 @@ void HashCuckooRep::Get(const LookupKey& key, void* callback_args,
     const char* bucket =
         cuckoo_array_[GetHash(user_key, hid)].load(std::memory_order_acquire);
     if (bucket != nullptr) {
-      auto bucket_user_key = UserKey(bucket);
-      if (user_key.compare(bucket_user_key) == 0) {
+      Slice bucket_user_key = UserKey(bucket);
+      if (user_key == bucket_user_key) {
         callback_func(callback_args, bucket);
         break;
       }
@@ -400,8 +412,8 @@ bool HashCuckooRep::QuickInsert(const char* internal_key, const Slice& user_key,
   }
 
   if (cuckoo_bucket_id != -1) {
-    cuckoo_array_[cuckoo_bucket_id]
-        .store(internal_key, std::memory_order_release);
+    cuckoo_array_[cuckoo_bucket_id].store(const_cast<char*>(internal_key),
+                                          std::memory_order_release);
     return true;
   }
 
@@ -454,10 +466,10 @@ bool HashCuckooRep::FindCuckooPath(const char* internal_key,
     }
     // again, we can perform no barrier load safely here as the current
     // thread is the only writer.
-    auto bucket_user_key =
+    Slice bucket_user_key =
         UserKey(cuckoo_array_[step.bucket_id_].load(std::memory_order_relaxed));
     if (step.prev_step_id_ != CuckooStep::kNullStep) {
-      if (bucket_user_key.compare(user_key) == 0) {
+      if (bucket_user_key == user_key) {
         // then there is a loop in the current path, stop discovering this path.
         continue;
       }
@@ -622,7 +634,8 @@ MemTableRep* HashCuckooRepFactory::CreateMemTableRep(
     hash_function_count = kMaxHashCount;
   }
   return new HashCuckooRep(compare, allocator, bucket_count,
-                           hash_function_count);
+                           hash_function_count,
+                           (average_data_size_ + pointer_size) / kFullness);
 }
 
 MemTableRepFactory* NewHashCuckooRepFactory(size_t write_buffer_size,
diff --git a/src/rocksdb/util/hash_linklist_rep.cc b/src/rocksdb/util/hash_linklist_rep.cc
index ea4cd99..1e6eadf 100644
--- a/src/rocksdb/util/hash_linklist_rep.cc
+++ b/src/rocksdb/util/hash_linklist_rep.cc
@@ -82,11 +82,18 @@ struct Node {
 
   void NoBarrier_SetNext(Node* x) { next_.store(x, std::memory_order_relaxed); }
 
+  // Needed for the placement new below, which is fine
+  Node() {}
+
  private:
   std::atomic<Node*> next_;
 
+  // Prohibit copying, due to the trailing key array below
+  Node(const Node&) = delete;
+  Node& operator=(const Node&) = delete;
+
  public:
-  char key[0];
+  char key[1];
 };
 
 // Memory structure of the mem table:
@@ -587,9 +594,9 @@ void HashLinkListRep::Insert(KeyHandle handle) {
   if (bucket_entries_logging_threshold_ > 0 &&
       header->GetNumEntries() ==
           static_cast<uint32_t>(bucket_entries_logging_threshold_)) {
-    Info(logger_,
-         "HashLinkedList bucket %zu has more than %d "
-         "entries. Key to insert: %s",
+    Info(logger_, "HashLinkedList bucket %" ROCKSDB_PRIszt
+                  " has more than %d "
+                  "entries. Key to insert: %s",
          GetHash(transformed), header->GetNumEntries(),
          GetLengthPrefixedSlice(x->key).ToString(true).c_str());
   }
diff --git a/src/rocksdb/util/heap.h b/src/rocksdb/util/heap.h
new file mode 100644
index 0000000..7d9e111
--- /dev/null
+++ b/src/rocksdb/util/heap.h
@@ -0,0 +1,140 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+
+#include <algorithm>
+#include <cstdint>
+#include <functional>
+#include "util/autovector.h"
+
+namespace rocksdb {
+
+// Binary heap implementation optimized for use in multi-way merge sort.
+// Comparison to std::priority_queue:
+// - In libstdc++, std::priority_queue::pop() usually performs just over logN
+//   comparisons but never fewer.
+// - std::priority_queue does not have a replace-top operation, requiring a
+//   pop+push.  If the replacement element is the new top, this requires
+//   around 2logN comparisons.
+// - This heap's pop() uses a "schoolbook" downheap which requires up to ~2logN
+//   comparisons.
+// - This heap provides a replace_top() operation which requires [1, 2logN]
+//   comparisons.  When the replacement element is also the new top, this
+//   takes just 1 or 2 comparisons.
+//
+// The last property can yield an order-of-magnitude performance improvement
+// when merge-sorting real-world non-random data.  If the merge operation is
+// likely to take chunks of elements from the same input stream, only 1
+// comparison per element is needed.  In RocksDB-land, this happens when
+// compacting a database where keys are not randomly distributed across L0
+// files but nearby keys are likely to be in the same L0 file.
+//
+// The container uses the same counterintuitive ordering as
+// std::priority_queue: the comparison operator is expected to provide the
+// less-than relation, but top() will return the maximum.
+
+template<typename T, typename Compare = std::less<T>>
+class BinaryHeap {
+ public:
+  BinaryHeap() { }
+  explicit BinaryHeap(Compare cmp) : cmp_(std::move(cmp)) { }
+
+  void push(const T& value) {
+    data_.push_back(value);
+    upheap(data_.size() - 1);
+  }
+
+  void push(T&& value) {
+    data_.push_back(std::move(value));
+    upheap(data_.size() - 1);
+  }
+
+  const T& top() const {
+    assert(!empty());
+    return data_.front();
+  }
+
+  void replace_top(const T& value) {
+    assert(!empty());
+    data_.front() = value;
+    downheap(get_root());
+  }
+
+  void replace_top(T&& value) {
+    assert(!empty());
+    data_.front() = std::move(value);
+    downheap(get_root());
+  }
+
+  void pop() {
+    assert(!empty());
+    data_.front() = std::move(data_.back());
+    data_.pop_back();
+    if (!empty()) {
+      downheap(get_root());
+    }
+  }
+
+  void swap(BinaryHeap &other) {
+    std::swap(cmp_, other.cmp_);
+    data_.swap(other.data_);
+  }
+
+  void clear() {
+    data_.clear();
+  }
+
+  bool empty() const {
+    return data_.empty();
+  }
+
+ private:
+  static inline size_t get_root() { return 0; }
+  static inline size_t get_parent(size_t index) { return (index - 1) / 2; }
+  static inline size_t get_left(size_t index) { return 2 * index + 1; }
+  static inline size_t get_right(size_t index) { return 2 * index + 2; }
+
+  void upheap(size_t index) {
+    T v = std::move(data_[index]);
+    while (index > get_root()) {
+      const size_t parent = get_parent(index);
+      if (!cmp_(data_[parent], v)) {
+        break;
+      }
+      data_[index] = std::move(data_[parent]);
+      index = parent;
+    }
+    data_[index] = std::move(v);
+  }
+
+  void downheap(size_t index) {
+    T v = std::move(data_[index]);
+    while (1) {
+      const size_t left_child = get_left(index);
+      if (left_child >= data_.size()) {
+        break;
+      }
+      const size_t right_child = left_child + 1;
+      assert(right_child == get_right(index));
+      size_t picked_child = left_child;
+      if (right_child < data_.size() &&
+          cmp_(data_[left_child], data_[right_child])) {
+        picked_child = right_child;
+      }
+      if (!cmp_(v, data_[picked_child])) {
+        break;
+      }
+      data_[index] = std::move(data_[picked_child]);
+      index = picked_child;
+    }
+    data_[index] = std::move(v);
+  }
+
+  Compare cmp_;
+  autovector<T> data_;
+};
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/heap_test.cc b/src/rocksdb/util/heap_test.cc
new file mode 100644
index 0000000..dd73e11
--- /dev/null
+++ b/src/rocksdb/util/heap_test.cc
@@ -0,0 +1,139 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#include <gtest/gtest.h>
+
+#include <climits>
+
+#include <queue>
+#include <random>
+#include <utility>
+
+#include "util/heap.h"
+
+#ifndef GFLAGS
+const int64_t FLAGS_iters = 100000;
+#else
+#include <gflags/gflags.h>
+DEFINE_int64(iters, 100000, "number of pseudo-random operations in each test");
+#endif  // GFLAGS
+
+/*
+ * Compares the custom heap implementation in util/heap.h against
+ * std::priority_queue on a pseudo-random sequence of operations.
+ */
+
+namespace rocksdb {
+
+using HeapTestValue = uint64_t;
+using Params = std::tuple<size_t, HeapTestValue, int64_t>;
+
+class HeapTest : public ::testing::TestWithParam<Params> {
+};
+
+TEST_P(HeapTest, Test) {
+  // This test performs the same pseudorandom sequence of operations on a
+  // BinaryHeap and an std::priority_queue, comparing output.  The three
+  // possible operations are insert, replace top and pop.
+  //
+  // Insert is chosen slightly more often than the others so that the size of
+  // the heap slowly grows.  Once the size hits the MAX_HEAP_SIZE limit, we
+  // disallow inserting until the heap becomes empty, testing the "draining"
+  // scenario.
+
+  const auto MAX_HEAP_SIZE = std::get<0>(GetParam());
+  const auto MAX_VALUE = std::get<1>(GetParam());
+  const auto RNG_SEED = std::get<2>(GetParam());
+
+  BinaryHeap<HeapTestValue> heap;
+  std::priority_queue<HeapTestValue> ref;
+
+  std::mt19937 rng(static_cast<unsigned int>(RNG_SEED));
+  std::uniform_int_distribution<HeapTestValue> value_dist(0, MAX_VALUE);
+  int ndrains = 0;
+  bool draining = false;     // hit max size, draining until we empty the heap
+  size_t size = 0;
+  for (int64_t i = 0; i < FLAGS_iters; ++i) {
+    if (size == 0) {
+      draining = false;
+    }
+
+    if (!draining &&
+        (size == 0 || std::bernoulli_distribution(0.4)(rng))) {
+      // insert
+      HeapTestValue val = value_dist(rng);
+      heap.push(val);
+      ref.push(val);
+      ++size;
+      if (size == MAX_HEAP_SIZE) {
+        draining = true;
+        ++ndrains;
+      }
+    } else if (std::bernoulli_distribution(0.5)(rng)) {
+      // replace top
+      HeapTestValue val = value_dist(rng);
+      heap.replace_top(val);
+      ref.pop();
+      ref.push(val);
+    } else {
+      // pop
+      assert(size > 0);
+      heap.pop();
+      ref.pop();
+      --size;
+    }
+
+    // After every operation, check that the public methods give the same
+    // results
+    assert((size == 0) == ref.empty());
+    ASSERT_EQ(size == 0, heap.empty());
+    if (size > 0) {
+      ASSERT_EQ(ref.top(), heap.top());
+    }
+  }
+
+  // Probabilities should be set up to occasionally hit the max heap size and
+  // drain it
+  assert(ndrains > 0);
+
+  heap.clear();
+  ASSERT_TRUE(heap.empty());
+}
+
+// Basic test, MAX_VALUE = 3*MAX_HEAP_SIZE (occasional duplicates)
+INSTANTIATE_TEST_CASE_P(
+  Basic, HeapTest,
+  ::testing::Values(Params(1000, 3000, 0x1b575cf05b708945))
+);
+// Mid-size heap with small values (many duplicates)
+INSTANTIATE_TEST_CASE_P(
+  SmallValues, HeapTest,
+  ::testing::Values(Params(100, 10, 0x5ae213f7bd5dccd0))
+);
+// Small heap, large value range (no duplicates)
+INSTANTIATE_TEST_CASE_P(
+  SmallHeap, HeapTest,
+  ::testing::Values(Params(10, ULLONG_MAX, 0x3e1fa8f4d01707cf))
+);
+// Two-element heap
+INSTANTIATE_TEST_CASE_P(
+  TwoElementHeap, HeapTest,
+  ::testing::Values(Params(2, 5, 0x4b5e13ea988c6abc))
+);
+// One-element heap
+INSTANTIATE_TEST_CASE_P(
+  OneElementHeap, HeapTest,
+  ::testing::Values(Params(1, 3, 0x176a1019ab0b612e))
+);
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+#ifdef GFLAGS
+  GFLAGS::ParseCommandLineFlags(&argc, &argv, true);
+#endif  // GFLAGS
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/util/histogram.h b/src/rocksdb/util/histogram.h
index 77ed9be..5f73bf3 100644
--- a/src/rocksdb/util/histogram.h
+++ b/src/rocksdb/util/histogram.h
@@ -15,6 +15,8 @@
 #include <vector>
 #include <map>
 
+#include <string.h>
+
 namespace rocksdb {
 
 class HistogramBucketMapper {
@@ -52,6 +54,7 @@ class HistogramBucketMapper {
 
 class HistogramImpl {
  public:
+  HistogramImpl() { memset(buckets_, 0, sizeof(buckets_)); }
   virtual void Clear();
   virtual bool Empty();
   virtual void Add(uint64_t value);
@@ -75,7 +78,7 @@ class HistogramImpl {
   double num_ = 0;
   double sum_ = 0;
   double sum_squares_ = 0;
-  uint64_t buckets_[138] = {0};  // this is BucketMapper::BucketCount()
+  uint64_t buckets_[138];  // this is BucketMapper::BucketCount()
 };
 
 }  // namespace rocksdb
diff --git a/src/rocksdb/util/iostats_context.cc b/src/rocksdb/util/iostats_context.cc
index 090813a..50a6e8a 100644
--- a/src/rocksdb/util/iostats_context.cc
+++ b/src/rocksdb/util/iostats_context.cc
@@ -10,22 +10,36 @@
 namespace rocksdb {
 
 #ifndef IOS_CROSS_COMPILE
+# ifdef _WIN32
+__declspec(thread) IOStatsContext iostats_context;
+# else
 __thread IOStatsContext iostats_context;
+# endif
 #endif  // IOS_CROSS_COMPILE
 
 void IOStatsContext::Reset() {
   thread_pool_id = Env::Priority::TOTAL;
   bytes_read = 0;
   bytes_written = 0;
+  open_nanos = 0;
+  allocate_nanos = 0;
+  write_nanos = 0;
+  read_nanos = 0;
+  range_sync_nanos = 0;
+  prepare_write_nanos = 0;
+  fsync_nanos = 0;
+  logger_nanos = 0;
 }
 
 #define OUTPUT(counter) #counter << " = " << counter << ", "
 
 std::string IOStatsContext::ToString() const {
   std::ostringstream ss;
-  ss << OUTPUT(thread_pool_id)
-     << OUTPUT(bytes_read)
-     << OUTPUT(bytes_written);
+  ss << OUTPUT(thread_pool_id) << OUTPUT(bytes_read) << OUTPUT(bytes_written)
+     << OUTPUT(open_nanos) << OUTPUT(allocate_nanos) << OUTPUT(write_nanos)
+     << OUTPUT(read_nanos) << OUTPUT(range_sync_nanos) << OUTPUT(fsync_nanos)
+     << OUTPUT(prepare_write_nanos) << OUTPUT(logger_nanos);
+
   return ss.str();
 }
 
diff --git a/src/rocksdb/util/iostats_context_imp.h b/src/rocksdb/util/iostats_context_imp.h
index b271ddf..4617b41 100644
--- a/src/rocksdb/util/iostats_context_imp.h
+++ b/src/rocksdb/util/iostats_context_imp.h
@@ -5,6 +5,7 @@
 //
 #pragma once
 #include "rocksdb/iostats_context.h"
+#include "util/perf_step_timer.h"
 
 #ifndef IOS_CROSS_COMPILE
 
@@ -33,6 +34,11 @@
 #define IOSTATS(metric)                        \
   (iostats_context.metric)
 
+// Declare and set start time of the timer
+#define IOSTATS_TIMER_GUARD(metric)                                       \
+  PerfStepTimer iostats_step_timer_ ## metric(&(iostats_context.metric));  \
+  iostats_step_timer_ ## metric.Start();
+
 #else  // IOS_CROSS_COMPILE
 
 #define IOSTATS_ADD(metric, value)
@@ -43,4 +49,6 @@
 #define IOSTATS_THREAD_POOL_ID()
 #define IOSTATS(metric) 0
 
+#define IOSTATS_TIMER_GUARD(metric)
+
 #endif  // IOS_CROSS_COMPILE
diff --git a/src/rocksdb/util/ldb_cmd.cc b/src/rocksdb/util/ldb_cmd.cc
index e7b29d2..a441d71 100644
--- a/src/rocksdb/util/ldb_cmd.cc
+++ b/src/rocksdb/util/ldb_cmd.cc
@@ -6,6 +6,12 @@
 #ifndef ROCKSDB_LITE
 #include "util/ldb_cmd.h"
 
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include <inttypes.h>
+
 #include "db/dbformat.h"
 #include "db/db_impl.h"
 #include "db/log_reader.h"
@@ -15,6 +21,7 @@
 #include "rocksdb/write_batch.h"
 #include "rocksdb/cache.h"
 #include "rocksdb/table_properties.h"
+#include "port/dirent.h"
 #include "util/coding.h"
 #include "util/sst_dump_tool_imp.h"
 #include "util/string_util.h"
@@ -23,7 +30,6 @@
 
 #include <cstdlib>
 #include <ctime>
-#include <dirent.h>
 #include <limits>
 #include <sstream>
 #include <string>
@@ -281,6 +287,8 @@ Options LDBCommand::PrepareOptionsForOpenDB() {
       opt.compression = kLZ4Compression;
     } else if (comp == "lz4hc") {
       opt.compression = kLZ4HCCompression;
+    } else if (comp == "zstd") {
+      opt.compression = kZSTDNotFinalCompression;
     } else {
       // Unknown compression.
       exec_state_ =
@@ -441,7 +449,7 @@ void CompactorCommand::DoCommand() {
     end = new Slice(to_);
   }
 
-  db_->CompactRange(begin, end);
+  db_->CompactRange(CompactRangeOptions(), begin, end);
   exec_state_ = LDBCommandExecuteResult::Succeed("");
 
   delete begin;
@@ -519,7 +527,7 @@ void DBLoaderCommand::DoCommand() {
     cout << "Warning: " << bad_lines << " bad lines ignored." << endl;
   }
   if (compact_) {
-    db_->CompactRange(nullptr, nullptr);
+    db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
   }
 }
 
@@ -527,7 +535,7 @@ void DBLoaderCommand::DoCommand() {
 
 namespace {
 
-void DumpManifestFile(std::string file, bool verbose, bool hex) {
+void DumpManifestFile(std::string file, bool verbose, bool hex, bool json) {
   Options options;
   EnvOptions sopt;
   std::string dbname("dummy");
@@ -537,10 +545,11 @@ void DumpManifestFile(std::string file, bool verbose, bool hex) {
   // if VersionSet::DumpManifest() depends on any option done by
   // SanitizeOptions(), we need to initialize it manually.
   options.db_paths.emplace_back("dummy", 0);
-  WriteController wc;
+  options.num_levels = 64;
+  WriteController wc(options.delayed_write_rate);
   WriteBuffer wb(options.db_write_buffer_size);
   VersionSet versions(dbname, &options, sopt, tc.get(), &wb, &wc);
-  Status s = versions.DumpManifest(options, file, verbose, hex);
+  Status s = versions.DumpManifest(options, file, verbose, hex, json);
   if (!s.ok()) {
     printf("Error in processing file %s %s\n", file.c_str(),
            s.ToString().c_str());
@@ -550,12 +559,14 @@ void DumpManifestFile(std::string file, bool verbose, bool hex) {
 }  // namespace
 
 const string ManifestDumpCommand::ARG_VERBOSE = "verbose";
-const string ManifestDumpCommand::ARG_PATH    = "path";
+const string ManifestDumpCommand::ARG_JSON = "json";
+const string ManifestDumpCommand::ARG_PATH = "path";
 
 void ManifestDumpCommand::Help(string& ret) {
   ret.append("  ");
   ret.append(ManifestDumpCommand::Name());
   ret.append(" [--" + ARG_VERBOSE + "]");
+  ret.append(" [--" + ARG_JSON + "]");
   ret.append(" [--" + ARG_PATH + "=<path_to_manifest_file>]");
   ret.append("\n");
 }
@@ -563,11 +574,13 @@ void ManifestDumpCommand::Help(string& ret) {
 ManifestDumpCommand::ManifestDumpCommand(const vector<string>& params,
       const map<string, string>& options, const vector<string>& flags) :
     LDBCommand(options, flags, false,
-               BuildCmdLineOptions({ARG_VERBOSE, ARG_PATH, ARG_HEX})),
+               BuildCmdLineOptions({ARG_VERBOSE, ARG_PATH, ARG_HEX, ARG_JSON})),
     verbose_(false),
+    json_(false),
     path_("")
 {
   verbose_ = IsFlagPresent(flags, ARG_VERBOSE);
+  json_ = IsFlagPresent(flags, ARG_JSON);
 
   map<string, string>::const_iterator itr = options.find(ARG_PATH);
   if (itr != options.end()) {
@@ -588,40 +601,40 @@ void ManifestDumpCommand::DoCommand() {
     bool found = false;
     // We need to find the manifest file by searching the directory
     // containing the db for files of the form MANIFEST_[0-9]+
-    DIR* d = opendir(db_path_.c_str());
+
+    auto CloseDir = [](DIR* p) { closedir(p); };
+    std::unique_ptr<DIR, decltype(CloseDir)> d(opendir(db_path_.c_str()),
+                                               CloseDir);
+
     if (d == nullptr) {
       exec_state_ =
           LDBCommandExecuteResult::Failed(db_path_ + " is not a directory");
       return;
     }
     struct dirent* entry;
-    while ((entry = readdir(d)) != nullptr) {
+    while ((entry = readdir(d.get())) != nullptr) {
       unsigned int match;
-      unsigned long long num;
-      if (sscanf(entry->d_name,
-                 "MANIFEST-%ln%ln",
-                 (unsigned long*)&num,
-                 (unsigned long*)&match)
-          && match == strlen(entry->d_name)) {
+      uint64_t num;
+      if (sscanf(entry->d_name, "MANIFEST-%" SCNu64 "%n", &num, &match) &&
+          match == strlen(entry->d_name)) {
         if (!found) {
           manifestfile = db_path_ + "/" + std::string(entry->d_name);
           found = true;
         } else {
           exec_state_ = LDBCommandExecuteResult::Failed(
               "Multiple MANIFEST files found; use --path to select one");
-          closedir(d);
           return;
         }
       }
     }
-    closedir(d);
   }
 
   if (verbose_) {
     printf("Processing Manifest file %s\n", manifestfile.c_str());
   }
 
-  DumpManifestFile(manifestfile, verbose_, is_key_hex_);
+  DumpManifestFile(manifestfile, verbose_, is_key_hex_, json_);
+
   if (verbose_) {
     printf("Processing Manifest file %s done\n", manifestfile.c_str());
   }
@@ -676,7 +689,9 @@ namespace {
 string ReadableTime(int unixtime) {
   char time_buffer [80];
   time_t rawtime = unixtime;
-  struct tm * timeinfo = localtime(&rawtime);
+  struct tm tInfo;
+  struct tm* timeinfo = localtime_r(&rawtime, &tInfo);
+  assert(timeinfo == &tInfo);
   strftime(time_buffer, 80, "%c", timeinfo);
   return string(time_buffer);
 }
@@ -1136,7 +1151,6 @@ Options ReduceDBLevelsCommand::PrepareOptionsForOpenDB() {
   // Disable size compaction
   opt.max_bytes_for_level_base = 1ULL << 50;
   opt.max_bytes_for_level_multiplier = 1;
-  opt.max_mem_compaction_level = 0;
   return opt;
 }
 
@@ -1146,7 +1160,7 @@ Status ReduceDBLevelsCommand::GetOldNumOfLevels(Options& opt,
   std::shared_ptr<Cache> tc(
       NewLRUCache(opt.max_open_files - 10, opt.table_cache_numshardbits));
   const InternalKeyComparator cmp(opt.comparator);
-  WriteController wc;
+  WriteController wc(opt.delayed_write_rate);
   WriteBuffer wb(opt.db_write_buffer_size);
   VersionSet versions(db_path_, &opt, soptions, tc.get(), &wb, &wc);
   std::vector<ColumnFamilyDescriptor> dummy;
@@ -1204,7 +1218,7 @@ void ReduceDBLevelsCommand::DoCommand() {
   }
   // Compact the whole DB to put all files to the highest level.
   fprintf(stdout, "Compacting the db...\n");
-  db_->CompactRange(nullptr, nullptr);
+  db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
   CloseDB();
 
   EnvOptions soptions;
@@ -1309,9 +1323,10 @@ void ChangeCompactionStyleCommand::DoCommand() {
           files_per_level.c_str());
 
   // manual compact into a single file and move the file to level 0
-  db_->CompactRange(nullptr, nullptr,
-                    true /* reduce level */,
-                    0    /* reduce to level 0 */);
+  CompactRangeOptions compact_options;
+  compact_options.change_level = true;
+  compact_options.target_level = 0;
+  db_->CompactRange(compact_options, nullptr, nullptr);
 
   // verify compaction result
   files_per_level = "";
@@ -1401,10 +1416,18 @@ class InMemoryHandler : public WriteBatch::Handler {
 
 void DumpWalFile(std::string wal_file, bool print_header, bool print_values,
                  LDBCommandExecuteResult* exec_state) {
-  unique_ptr<SequentialFile> file;
   Env* env_ = Env::Default();
   EnvOptions soptions;
-  Status status = env_->NewSequentialFile(wal_file, &file, soptions);
+  unique_ptr<SequentialFileReader> wal_file_reader;
+
+  Status status;
+  {
+    unique_ptr<SequentialFile> file;
+    status = env_->NewSequentialFile(wal_file, &file, soptions);
+    if (status.ok()) {
+      wal_file_reader.reset(new SequentialFileReader(std::move(file)));
+    }
+  }
   if (!status.ok()) {
     if (exec_state) {
       *exec_state = LDBCommandExecuteResult::Failed("Failed to open WAL file " +
@@ -1415,7 +1438,7 @@ void DumpWalFile(std::string wal_file, bool print_header, bool print_values,
     }
   } else {
     StdErrReporter reporter;
-    log::Reader reader(move(file), &reporter, true, 0);
+    log::Reader reader(move(wal_file_reader), &reporter, true, 0);
     string scratch;
     WriteBatch batch;
     Slice record;
@@ -2022,7 +2045,7 @@ void DBFileDumperCommand::DoCommand() {
   manifest_filename.resize(manifest_filename.size() - 1);
   string manifest_filepath = db_->GetName() + "/" + manifest_filename;
   std::cout << manifest_filepath << std::endl;
-  DumpManifestFile(manifest_filepath, false, false);
+  DumpManifestFile(manifest_filepath, false /*verbose*/, false /*hex*/,
+                   false /*json*/);
   std::cout << std::endl;
 
   std::cout << "SST Files" << std::endl;
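
The hunks above migrate every CompactRange() call to the new signature that
takes a CompactRangeOptions struct as its first argument, with the old
trailing bool/int arguments folded into the struct. A minimal sketch of the
new calling convention, assuming an already-open rocksdb::DB* (the caller
below is illustrative, not part of the patch):

    #include "rocksdb/db.h"
    #include "rocksdb/options.h"

    void CompactExamples(rocksdb::DB* db) {
      // Full-range compaction with default options.
      db->CompactRange(rocksdb::CompactRangeOptions(), nullptr, nullptr);

      // Equivalent of the old (begin, end, reduce_level, target_level) form:
      rocksdb::CompactRangeOptions opts;
      opts.change_level = true;  // move the compaction output...
      opts.target_level = 0;     // ...down to level 0
      db->CompactRange(opts, nullptr, nullptr);
    }
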
diff --git a/src/rocksdb/util/ldb_cmd.h b/src/rocksdb/util/ldb_cmd.h
index 50de4de..d48fcf6 100644
--- a/src/rocksdb/util/ldb_cmd.h
+++ b/src/rocksdb/util/ldb_cmd.h
@@ -130,18 +130,38 @@ public:
   }
 
   static string HexToString(const string& str) {
+    std::string::size_type len = str.length();
     string parsed;
-    if (str[0] != '0' || str[1] != 'x') {
+    static const char* const hexas = "0123456789ABCDEF";
+    parsed.reserve(len / 2);
+
+    if (len < 2 || str[0] != '0' || str[1] != 'x') {
       fprintf(stderr, "Invalid hex input %s.  Must start with 0x\n",
               str.c_str());
       throw "Invalid hex input";
     }
 
-    for (unsigned int i = 2; i < str.length();) {
-      int c;
-      sscanf(str.c_str() + i, "%2X", &c);
-      parsed.push_back(c);
-      i += 2;
+    for (unsigned int i = 2; i < len; i += 2) {
+      char a = static_cast<char>(toupper(str[i]));
+      const char* p = std::lower_bound(hexas, hexas + 16, a);
+      if (*p != a) {
+        throw "Invalid hex value";
+      }
+
+      if (i + 1 >= len) {
+        // if odd number of chars then we just hit the end of the string
+        parsed.push_back(p - hexas);
+        break;
+      }
+
+      char b = static_cast<char>(toupper(str[i + 1]));
+      const char* q = std::lower_bound(hexas, hexas + 16, b);
+      if (*q == b) {
+        // pairwise compute decimal value from hex
+        parsed.push_back(((p - hexas) << 4) | (q - hexas));
+      } else {
+        throw "Invalid hex value";
+      }
     }
     return parsed;
   }
@@ -357,7 +377,9 @@ private:
    * Otherwise an exception is thrown.
    */
   bool StringToBool(string val) {
-    std::transform(val.begin(), val.end(), val.begin(), ::tolower);
+    std::transform(val.begin(), val.end(), val.begin(),
+                   [](char ch) -> char { return ::tolower(ch); });
+
     if (val == "true") {
       return true;
     } else if (val == "false") {
@@ -505,9 +527,11 @@ public:
 
 private:
   bool verbose_;
+  bool json_;
   string path_;
 
   static const string ARG_VERBOSE;
+  static const string ARG_JSON;
   static const string ARG_PATH;
 };
 
diff --git a/src/rocksdb/util/ldb_cmd_execute_result.h b/src/rocksdb/util/ldb_cmd_execute_result.h
index 35e9610..29ebfc2 100644
--- a/src/rocksdb/util/ldb_cmd_execute_result.h
+++ b/src/rocksdb/util/ldb_cmd_execute_result.h
@@ -5,6 +5,10 @@
 //
 #pragma once
 
+#ifdef FAILED
+#undef FAILED
+#endif
+
 namespace rocksdb {
 
 class LDBCommandExecuteResult {
diff --git a/src/rocksdb/util/ldb_cmd_test.cc b/src/rocksdb/util/ldb_cmd_test.cc
new file mode 100644
index 0000000..c918cf5
--- /dev/null
+++ b/src/rocksdb/util/ldb_cmd_test.cc
@@ -0,0 +1,44 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#include "util/ldb_cmd.h"
+#include "util/testharness.h"
+
+class LdbCmdTest : public testing::Test {};
+
+TEST_F(LdbCmdTest, HexToString) {
+  // map input to expected outputs.
+  map<string, vector<int>> inputMap = {
+      {"0x7", {7}},         {"0x5050", {80, 80}}, {"0xFF", {-1}},
+      {"0x1234", {18, 52}}, {"0xaa", {-86}}, {"0x123", {18, 3}},
+  };
+
+  for (const auto& inPair : inputMap) {
+    auto actual = rocksdb::LDBCommand::HexToString(inPair.first);
+    auto expected = inPair.second;
+    for (unsigned int i = 0; i < actual.length(); i++) {
+      ASSERT_EQ(expected[i], static_cast<int>(actual[i]));
+    }
+  }
+}
+
+TEST_F(LdbCmdTest, HexToStringBadInputs) {
+  const vector<string> badInputs = {
+      "0xZZ", "123", "0xx5", "0x11G", "Ox12", "0xT", "0x1Q1",
+  };
+  for (const auto& badInput : badInputs) {
+    try {
+      rocksdb::LDBCommand::HexToString(badInput);
+      std::cerr << "Should fail on bad hex value: " << badInput << "\n";
+      FAIL();
+    } catch (...) {
+    }
+  }
+}
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
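
For readers decoding the expected vectors above: each hex pair maps to
(hi << 4) | lo and lands in a signed char, which is why bytes above 0x7F come
back negative. Worked arithmetic for three of the cases (illustration only):

    // "0x1234": ('1','2') -> (1 << 4) | 2 = 18, ('3','4') -> (3 << 4) | 4 = 52
    // "0xFF"  : (15 << 4) | 15 = 255, which a signed char reads back as -1
    // "0x123" : 18 as above, then the lone trailing '3' becomes its own byte, 3
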
diff --git a/src/rocksdb/util/log_buffer.cc b/src/rocksdb/util/log_buffer.cc
index ddddaec..7d15cf2 100644
--- a/src/rocksdb/util/log_buffer.cc
+++ b/src/rocksdb/util/log_buffer.cc
@@ -5,7 +5,8 @@
 
 #include "util/log_buffer.h"
 
-#include <sys/time.h>
+#include "port/sys_time.h"
+#include "port/port.h"
 
 namespace rocksdb {
 
@@ -33,8 +34,15 @@ void LogBuffer::AddLogToBuffer(size_t max_log_size, const char* format,
     va_list backup_ap;
     va_copy(backup_ap, ap);
     auto n = vsnprintf(p, limit - p, format, backup_ap);
+#ifndef OS_WIN
+    // MS reports -1 when the buffer is too short
     assert(n >= 0);
-    p += n;
+#endif
+    if (n > 0) {
+      p += n;
+    } else {
+      p = limit;
+    }
     va_end(backup_ap);
   }
 
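
The AddLogToBuffer() change works around the MSVC vsnprintf behavior noted in
the new comment: it returns a negative value when the buffer is too small,
where C99 returns the would-be length. A standalone sketch of the same
saturating idiom (the helper name and signature are illustrative, not from
the patch):

    #include <cstdarg>
    #include <cstdio>

    // Append a formatted message into [p, limit); on error or truncation,
    // saturate to limit rather than trusting the return value.
    char* AppendSaturating(char* p, char* limit, const char* fmt, ...) {
      va_list ap;
      va_start(ap, fmt);
      int n = vsnprintf(p, limit - p, fmt, ap);
      va_end(ap);
      if (n <= 0 || n >= limit - p) return limit;  // -1 (MSVC) or overflow
      return p + n;
    }
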
diff --git a/src/rocksdb/util/log_buffer.h b/src/rocksdb/util/log_buffer.h
index b5cf1d5..bd842b7 100644
--- a/src/rocksdb/util/log_buffer.h
+++ b/src/rocksdb/util/log_buffer.h
@@ -5,10 +5,10 @@
 
 #pragma once
 
-#include <sys/time.h>
 #include "rocksdb/env.h"
 #include "util/arena.h"
 #include "util/autovector.h"
+#include "port/sys_time.h"
 #include <ctime>
 
 namespace rocksdb {
diff --git a/src/rocksdb/util/manual_compaction_test.cc b/src/rocksdb/util/manual_compaction_test.cc
index 6eedd03..8613b7b 100644
--- a/src/rocksdb/util/manual_compaction_test.cc
+++ b/src/rocksdb/util/manual_compaction_test.cc
@@ -13,6 +13,7 @@
 #include "rocksdb/slice.h"
 #include "rocksdb/write_batch.h"
 #include "util/testharness.h"
+#include "port/port.h"
 
 using namespace rocksdb;
 
@@ -77,7 +78,7 @@ TEST_F(ManualCompactionTest, CompactTouchesAllKeys) {
     db->Put(WriteOptions(), Slice("key4"), Slice("destroy"));
 
     Slice key4("key4");
-    db->CompactRange(nullptr, &key4);
+    db->CompactRange(CompactRangeOptions(), nullptr, &key4);
     Iterator* itr = db->NewIterator(ReadOptions());
     itr->SeekToFirst();
     ASSERT_TRUE(itr->Valid());
@@ -130,7 +131,7 @@ TEST_F(ManualCompactionTest, Test) {
   rocksdb::Slice greatest(end_key.data(), end_key.size());
 
   // commenting out the line below causes the example to work correctly
-  db->CompactRange(&least, &greatest);
+  db->CompactRange(CompactRangeOptions(), &least, &greatest);
 
   // count the keys
   rocksdb::Iterator* iter = db->NewIterator(rocksdb::ReadOptions());
diff --git a/src/rocksdb/util/memenv.cc b/src/rocksdb/util/memenv.cc
index c89411f..5737370 100644
--- a/src/rocksdb/util/memenv.cc
+++ b/src/rocksdb/util/memenv.cc
@@ -232,7 +232,9 @@ class WritableFileImpl : public WritableFile {
   virtual Status Append(const Slice& data) override {
     return file_->Append(data);
   }
-
+  virtual Status Truncate(uint64_t size) override {
+    return Status::OK();
+  }
   virtual Status Close() override { return Status::OK(); }
   virtual Status Flush() override { return Status::OK(); }
   virtual Status Sync() override { return Status::OK(); }
@@ -308,10 +310,14 @@ class InMemoryEnv : public EnvWrapper {
     return Status::OK();
   }
 
-  virtual bool FileExists(const std::string& fname) override {
+  virtual Status FileExists(const std::string& fname) override {
     std::string nfname = NormalizeFileName(fname);
     MutexLock lock(&mutex_);
-    return file_map_.find(nfname) != file_map_.end();
+    if (file_map_.find(nfname) != file_map_.end()) {
+      return Status::OK();
+    } else {
+      return Status::NotFound();
+    }
   }
 
   virtual Status GetChildren(const std::string& dir,
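
Across these Env implementations, FileExists() changes from returning bool to
returning Status, so "not found" is now distinguishable from an I/O error. A
sketch of the new call pattern against any rocksdb::Env (the wrapper below is
hypothetical, for illustration):

    #include <cstdio>
    #include <string>
    #include "rocksdb/env.h"

    bool Exists(rocksdb::Env* env, const std::string& fname) {
      rocksdb::Status s = env->FileExists(fname);
      if (s.ok()) return true;           // present
      if (s.IsNotFound()) return false;  // definitively absent
      // Any other status (e.g. a permission failure) is now visible to the
      // caller instead of being folded into "false".
      fprintf(stderr, "FileExists(%s): %s\n", fname.c_str(),
              s.ToString().c_str());
      return false;
    }
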
diff --git a/src/rocksdb/util/memenv_test.cc b/src/rocksdb/util/memenv_test.cc
index 9222dc6..24190da 100644
--- a/src/rocksdb/util/memenv_test.cc
+++ b/src/rocksdb/util/memenv_test.cc
@@ -2,6 +2,8 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
+#ifndef ROCKSDB_LITE
+
 #include "db/db_impl.h"
 #include "rocksdb/db.h"
 #include "rocksdb/env.h"
@@ -33,7 +35,7 @@ TEST_F(MemEnvTest, Basics) {
   ASSERT_OK(env_->CreateDir("/dir"));
 
   // Check that the directory is empty.
-  ASSERT_TRUE(!env_->FileExists("/dir/non_existent"));
+  ASSERT_EQ(Status::NotFound(), env_->FileExists("/dir/non_existent"));
   ASSERT_TRUE(!env_->GetFileSize("/dir/non_existent", &file_size).ok());
   ASSERT_OK(env_->GetChildren("/dir", &children));
   ASSERT_EQ(0U, children.size());
@@ -43,7 +45,7 @@ TEST_F(MemEnvTest, Basics) {
   writable_file.reset();
 
   // Check that the file exists.
-  ASSERT_TRUE(env_->FileExists("/dir/f"));
+  ASSERT_OK(env_->FileExists("/dir/f"));
   ASSERT_OK(env_->GetFileSize("/dir/f", &file_size));
   ASSERT_EQ(0U, file_size);
   ASSERT_OK(env_->GetChildren("/dir", &children));
@@ -62,8 +64,8 @@ TEST_F(MemEnvTest, Basics) {
   // Check that renaming works.
   ASSERT_TRUE(!env_->RenameFile("/dir/non_existent", "/dir/g").ok());
   ASSERT_OK(env_->RenameFile("/dir/f", "/dir/g"));
-  ASSERT_TRUE(!env_->FileExists("/dir/f"));
-  ASSERT_TRUE(env_->FileExists("/dir/g"));
+  ASSERT_EQ(Status::NotFound(), env_->FileExists("/dir/f"));
+  ASSERT_OK(env_->FileExists("/dir/g"));
   ASSERT_OK(env_->GetFileSize("/dir/g", &file_size));
   ASSERT_EQ(3U, file_size);
 
@@ -80,7 +82,7 @@ TEST_F(MemEnvTest, Basics) {
   // Check that deleting works.
   ASSERT_TRUE(!env_->DeleteFile("/dir/non_existent").ok());
   ASSERT_OK(env_->DeleteFile("/dir/g"));
-  ASSERT_TRUE(!env_->FileExists("/dir/g"));
+  ASSERT_EQ(Status::NotFound(), env_->FileExists("/dir/g"));
   ASSERT_OK(env_->GetChildren("/dir", &children));
   ASSERT_EQ(0U, children.size());
   ASSERT_OK(env_->DeleteDir("/dir"));
@@ -239,3 +241,13 @@ int main(int argc, char** argv) {
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
+
+#else
+#include <stdio.h>
+
+int main(int argc, char** argv) {
+  fprintf(stderr, "SKIPPED as MemEnv is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/util/mock_env.cc b/src/rocksdb/util/mock_env.cc
index 26dffba..409e16e 100644
--- a/src/rocksdb/util/mock_env.cc
+++ b/src/rocksdb/util/mock_env.cc
@@ -8,7 +8,7 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
 #include "util/mock_env.h"
-#include <sys/time.h>
+#include "port/sys_time.h"
 #include <algorithm>
 #include <chrono>
 #include "util/rate_limiter.h"
@@ -250,7 +250,9 @@ class MockWritableFile : public WritableFile {
     }
     return Status::OK();
   }
-
+  virtual Status Truncate(uint64_t size) override {
+    return Status::OK();
+  }
   virtual Status Close() override { return file_->Fsync(); }
 
   virtual Status Flush() override { return Status::OK(); }
@@ -456,24 +458,18 @@ Status MockEnv::NewWritableFile(const std::string& fname,
   return Status::OK();
 }
 
-Status MockEnv::NewRandomRWFile(const std::string& fname,
-                                   unique_ptr<RandomRWFile>* result,
-                                   const EnvOptions& options) {
-  return Status::OK();
-}
-
 Status MockEnv::NewDirectory(const std::string& name,
                                 unique_ptr<Directory>* result) {
   result->reset(new MockEnvDirectory());
   return Status::OK();
 }
 
-bool MockEnv::FileExists(const std::string& fname) {
+Status MockEnv::FileExists(const std::string& fname) {
   auto fn = NormalizePath(fname);
   MutexLock lock(&mutex_);
   if (file_map_.find(fn) != file_map_.end()) {
     // File exists
-    return true;
+    return Status::OK();
   }
   // Now also check if fn exists as a dir
   for (const auto& iter : file_map_) {
@@ -481,10 +477,10 @@ bool MockEnv::FileExists(const std::string& fname) {
     if (filename.size() >= fn.size() + 1 &&
         filename[fn.size()] == '/' &&
         Slice(filename).starts_with(Slice(fn))) {
-      return true;
+      return Status::OK();
     }
   }
-  return false;
+  return Status::NotFound();
 }
 
 Status MockEnv::GetChildren(const std::string& dir,
diff --git a/src/rocksdb/util/mock_env.h b/src/rocksdb/util/mock_env.h
index 55ef24b..bcc74a7 100644
--- a/src/rocksdb/util/mock_env.h
+++ b/src/rocksdb/util/mock_env.h
@@ -39,14 +39,10 @@ class MockEnv : public EnvWrapper {
                                  unique_ptr<WritableFile>* result,
                                  const EnvOptions& env_options) override;
 
-  virtual Status NewRandomRWFile(const std::string& fname,
-                                 unique_ptr<RandomRWFile>* result,
-                                 const EnvOptions& options) override;
-
   virtual Status NewDirectory(const std::string& name,
                               unique_ptr<Directory>* result) override;
 
-  virtual bool FileExists(const std::string& fname) override;
+  virtual Status FileExists(const std::string& fname) override;
 
   virtual Status GetChildren(const std::string& dir,
                              std::vector<std::string>* result) override;
diff --git a/src/rocksdb/util/mock_env_test.cc b/src/rocksdb/util/mock_env_test.cc
index e3d4970..2f50c2a 100644
--- a/src/rocksdb/util/mock_env_test.cc
+++ b/src/rocksdb/util/mock_env_test.cc
@@ -34,7 +34,7 @@ TEST_F(MockEnvTest, Basics) {
   ASSERT_OK(env_->CreateDir("/dir"));
 
   // Check that the directory is empty.
-  ASSERT_TRUE(!env_->FileExists("/dir/non_existent"));
+  ASSERT_EQ(Status::NotFound(), env_->FileExists("/dir/non_existent"));
   ASSERT_TRUE(!env_->GetFileSize("/dir/non_existent", &file_size).ok());
   ASSERT_OK(env_->GetChildren("/dir", &children));
   ASSERT_EQ(0U, children.size());
@@ -44,7 +44,7 @@ TEST_F(MockEnvTest, Basics) {
   writable_file.reset();
 
   // Check that the file exists.
-  ASSERT_TRUE(env_->FileExists("/dir/f"));
+  ASSERT_OK(env_->FileExists("/dir/f"));
   ASSERT_OK(env_->GetFileSize("/dir/f", &file_size));
   ASSERT_EQ(0U, file_size);
   ASSERT_OK(env_->GetChildren("/dir", &children));
@@ -63,8 +63,8 @@ TEST_F(MockEnvTest, Basics) {
   // Check that renaming works.
   ASSERT_TRUE(!env_->RenameFile("/dir/non_existent", "/dir/g").ok());
   ASSERT_OK(env_->RenameFile("/dir/f", "/dir/g"));
-  ASSERT_TRUE(!env_->FileExists("/dir/f"));
-  ASSERT_TRUE(env_->FileExists("/dir/g"));
+  ASSERT_EQ(Status::NotFound(), env_->FileExists("/dir/f"));
+  ASSERT_OK(env_->FileExists("/dir/g"));
   ASSERT_OK(env_->GetFileSize("/dir/g", &file_size));
   ASSERT_EQ(3U, file_size);
 
@@ -81,7 +81,7 @@ TEST_F(MockEnvTest, Basics) {
   // Check that deleting works.
   ASSERT_TRUE(!env_->DeleteFile("/dir/non_existent").ok());
   ASSERT_OK(env_->DeleteFile("/dir/g"));
-  ASSERT_TRUE(!env_->FileExists("/dir/g"));
+  ASSERT_EQ(Status::NotFound(), env_->FileExists("/dir/g"));
   ASSERT_OK(env_->GetChildren("/dir", &children));
   ASSERT_EQ(0U, children.size());
   ASSERT_OK(env_->DeleteDir("/dir"));
@@ -252,6 +252,8 @@ TEST_F(MockEnvTest, DBTest) {
   ASSERT_TRUE(!iterator->Valid());
   delete iterator;
 
+  // TEST_FlushMemTable() is not supported in ROCKSDB_LITE
+  #ifndef ROCKSDB_LITE
   DBImpl* dbi = reinterpret_cast<DBImpl*>(db);
   ASSERT_OK(dbi->TEST_FlushMemTable());
 
@@ -260,6 +262,7 @@ TEST_F(MockEnvTest, DBTest) {
     ASSERT_OK(db->Get(ReadOptions(), keys[i], &res));
     ASSERT_TRUE(res == vals[i]);
   }
+  #endif  // ROCKSDB_LITE
 
   delete db;
 }
diff --git a/src/rocksdb/util/mutable_cf_options.cc b/src/rocksdb/util/mutable_cf_options.cc
index 187a97a..fafd154 100644
--- a/src/rocksdb/util/mutable_cf_options.cc
+++ b/src/rocksdb/util/mutable_cf_options.cc
@@ -3,6 +3,8 @@
 //  LICENSE file in the root directory of this source tree. An additional grant
 //  of patent rights can be found in the PATENTS file in the same directory.
 
+#include "util/mutable_cf_options.h"
+
 #ifndef __STDC_FORMAT_MACROS
 #define __STDC_FORMAT_MACROS
 #endif
@@ -11,10 +13,10 @@
 #include <limits>
 #include <cassert>
 #include <string>
+#include "port/port.h"
 #include "rocksdb/env.h"
 #include "rocksdb/options.h"
 #include "rocksdb/immutable_options.h"
-#include "util/mutable_cf_options.h"
 
 namespace rocksdb {
 
@@ -62,17 +64,19 @@ uint64_t MutableCFOptions::ExpandedCompactionByteSizeLimit(int level) const {
 
 void MutableCFOptions::Dump(Logger* log) const {
   // Memtable related options
-  Log(log, "                        write_buffer_size: %zu", write_buffer_size);
+  Log(log, "                        write_buffer_size: %" ROCKSDB_PRIszt,
+      write_buffer_size);
   Log(log, "                  max_write_buffer_number: %d",
       max_write_buffer_number);
-  Log(log, "                         arena_block_size: %zu", arena_block_size);
+  Log(log, "                         arena_block_size: %" ROCKSDB_PRIszt,
+      arena_block_size);
   Log(log, "               memtable_prefix_bloom_bits: %" PRIu32,
       memtable_prefix_bloom_bits);
   Log(log, "             memtable_prefix_bloom_probes: %" PRIu32,
       memtable_prefix_bloom_probes);
-  Log(log, " memtable_prefix_bloom_huge_page_tlb_size: %zu",
+  Log(log, " memtable_prefix_bloom_huge_page_tlb_size: %" ROCKSDB_PRIszt,
       memtable_prefix_bloom_huge_page_tlb_size);
-  Log(log, "                    max_successive_merges: %zu",
+  Log(log, "                    max_successive_merges: %" ROCKSDB_PRIszt,
       max_successive_merges);
   Log(log, "                           filter_deletes: %d",
       filter_deletes);
@@ -80,8 +84,8 @@ void MutableCFOptions::Dump(Logger* log) const {
       disable_auto_compactions);
   Log(log, "                          soft_rate_limit: %lf",
       soft_rate_limit);
-  Log(log, "                          hard_rate_limit: %lf",
-      hard_rate_limit);
+  Log(log, "      hard_pending_compaction_bytes_limit: %" PRIu64,
+      hard_pending_compaction_bytes_limit);
   Log(log, "       level0_file_num_compaction_trigger: %d",
       level0_file_num_compaction_trigger);
   Log(log, "           level0_slowdown_writes_trigger: %d",
@@ -110,8 +114,6 @@ void MutableCFOptions::Dump(Logger* log) const {
   }
   result.resize(result.size() - 2);
   Log(log, "max_bytes_for_level_multiplier_additional: %s", result.c_str());
-  Log(log, "                 max_mem_compaction_level: %d",
-      max_mem_compaction_level);
   Log(log, "           verify_checksums_in_compaction: %d",
       verify_checksums_in_compaction);
   Log(log, "        max_sequential_skip_in_iterations: %" PRIu64,
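
The Dump() hunks here (and in options.cc below) replace bare %zu / %zd with
the ROCKSDB_PRIszt macro so size_t fields format on both POSIX and MSVC,
whose older CRTs did not accept %zu. The macro lives in the port/ headers;
assuming the usual definitions ("zu" on POSIX, "Iu" on Windows), the pattern
is:

    size_t n = write_buffer_size;  // any size_t-typed option
    Log(log, "write_buffer_size: %" ROCKSDB_PRIszt, n);
    // expands to "...: %zu" on POSIX and "...: %Iu" on MSVC
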
diff --git a/src/rocksdb/util/mutable_cf_options.h b/src/rocksdb/util/mutable_cf_options.h
index 20845d9..94c4019 100644
--- a/src/rocksdb/util/mutable_cf_options.h
+++ b/src/rocksdb/util/mutable_cf_options.h
@@ -13,68 +13,73 @@ namespace rocksdb {
 
 struct MutableCFOptions {
   MutableCFOptions(const Options& options, const ImmutableCFOptions& ioptions)
-    : write_buffer_size(options.write_buffer_size),
-      max_write_buffer_number(options.max_write_buffer_number),
-      arena_block_size(options.arena_block_size),
-      memtable_prefix_bloom_bits(options.memtable_prefix_bloom_bits),
-      memtable_prefix_bloom_probes(options.memtable_prefix_bloom_probes),
-      memtable_prefix_bloom_huge_page_tlb_size(
-          options.memtable_prefix_bloom_huge_page_tlb_size),
-      max_successive_merges(options.max_successive_merges),
-      filter_deletes(options.filter_deletes),
-      inplace_update_num_locks(options.inplace_update_num_locks),
-      disable_auto_compactions(options.disable_auto_compactions),
-      soft_rate_limit(options.soft_rate_limit),
-      hard_rate_limit(options.hard_rate_limit),
-      level0_file_num_compaction_trigger(
-          options.level0_file_num_compaction_trigger),
-      level0_slowdown_writes_trigger(options.level0_slowdown_writes_trigger),
-      level0_stop_writes_trigger(options.level0_stop_writes_trigger),
-      max_grandparent_overlap_factor(options.max_grandparent_overlap_factor),
-      expanded_compaction_factor(options.expanded_compaction_factor),
-      source_compaction_factor(options.source_compaction_factor),
-      target_file_size_base(options.target_file_size_base),
-      target_file_size_multiplier(options.target_file_size_multiplier),
-      max_bytes_for_level_base(options.max_bytes_for_level_base),
-      max_bytes_for_level_multiplier(options.max_bytes_for_level_multiplier),
-      max_bytes_for_level_multiplier_additional(
-          options.max_bytes_for_level_multiplier_additional),
-      max_mem_compaction_level(options.max_mem_compaction_level),
-      verify_checksums_in_compaction(options.verify_checksums_in_compaction),
-      max_sequential_skip_in_iterations(
-          options.max_sequential_skip_in_iterations),
-      paranoid_file_checks(options.paranoid_file_checks)
+      : write_buffer_size(options.write_buffer_size),
+        max_write_buffer_number(options.max_write_buffer_number),
+        arena_block_size(options.arena_block_size),
+        memtable_prefix_bloom_bits(options.memtable_prefix_bloom_bits),
+        memtable_prefix_bloom_probes(options.memtable_prefix_bloom_probes),
+        memtable_prefix_bloom_huge_page_tlb_size(
+            options.memtable_prefix_bloom_huge_page_tlb_size),
+        max_successive_merges(options.max_successive_merges),
+        filter_deletes(options.filter_deletes),
+        inplace_update_num_locks(options.inplace_update_num_locks),
+        disable_auto_compactions(options.disable_auto_compactions),
+        soft_rate_limit(options.soft_rate_limit),
+        hard_pending_compaction_bytes_limit(
+            options.hard_pending_compaction_bytes_limit),
+        level0_file_num_compaction_trigger(
+            options.level0_file_num_compaction_trigger),
+        level0_slowdown_writes_trigger(options.level0_slowdown_writes_trigger),
+        level0_stop_writes_trigger(options.level0_stop_writes_trigger),
+        compaction_pri(options.compaction_pri),
+        max_grandparent_overlap_factor(options.max_grandparent_overlap_factor),
+        expanded_compaction_factor(options.expanded_compaction_factor),
+        source_compaction_factor(options.source_compaction_factor),
+        target_file_size_base(options.target_file_size_base),
+        target_file_size_multiplier(options.target_file_size_multiplier),
+        max_bytes_for_level_base(options.max_bytes_for_level_base),
+        max_bytes_for_level_multiplier(options.max_bytes_for_level_multiplier),
+        max_bytes_for_level_multiplier_additional(
+            options.max_bytes_for_level_multiplier_additional),
+        verify_checksums_in_compaction(options.verify_checksums_in_compaction),
+        max_subcompactions(options.max_subcompactions),
+        max_sequential_skip_in_iterations(
+            options.max_sequential_skip_in_iterations),
+        paranoid_file_checks(options.paranoid_file_checks),
+        compaction_measure_io_stats(options.compaction_measure_io_stats)
+
   {
     RefreshDerivedOptions(ioptions);
   }
   MutableCFOptions()
-    : write_buffer_size(0),
-      max_write_buffer_number(0),
-      arena_block_size(0),
-      memtable_prefix_bloom_bits(0),
-      memtable_prefix_bloom_probes(0),
-      memtable_prefix_bloom_huge_page_tlb_size(0),
-      max_successive_merges(0),
-      filter_deletes(false),
-      inplace_update_num_locks(0),
-      disable_auto_compactions(false),
-      soft_rate_limit(0),
-      hard_rate_limit(0),
-      level0_file_num_compaction_trigger(0),
-      level0_slowdown_writes_trigger(0),
-      level0_stop_writes_trigger(0),
-      max_grandparent_overlap_factor(0),
-      expanded_compaction_factor(0),
-      source_compaction_factor(0),
-      target_file_size_base(0),
-      target_file_size_multiplier(0),
-      max_bytes_for_level_base(0),
-      max_bytes_for_level_multiplier(0),
-      max_mem_compaction_level(0),
-      verify_checksums_in_compaction(false),
-      max_sequential_skip_in_iterations(0),
-      paranoid_file_checks(false)
-  {}
+      : write_buffer_size(0),
+        max_write_buffer_number(0),
+        arena_block_size(0),
+        memtable_prefix_bloom_bits(0),
+        memtable_prefix_bloom_probes(0),
+        memtable_prefix_bloom_huge_page_tlb_size(0),
+        max_successive_merges(0),
+        filter_deletes(false),
+        inplace_update_num_locks(0),
+        disable_auto_compactions(false),
+        soft_rate_limit(0),
+        hard_pending_compaction_bytes_limit(0),
+        level0_file_num_compaction_trigger(0),
+        level0_slowdown_writes_trigger(0),
+        level0_stop_writes_trigger(0),
+        compaction_pri(kCompactionPriByCompensatedSize),
+        max_grandparent_overlap_factor(0),
+        expanded_compaction_factor(0),
+        source_compaction_factor(0),
+        target_file_size_base(0),
+        target_file_size_multiplier(0),
+        max_bytes_for_level_base(0),
+        max_bytes_for_level_multiplier(0),
+        verify_checksums_in_compaction(false),
+        max_subcompactions(1),
+        max_sequential_skip_in_iterations(0),
+        paranoid_file_checks(false),
+        compaction_measure_io_stats(false) {}
 
   // Must be called after any change to MutableCFOptions
   void RefreshDerivedOptions(const ImmutableCFOptions& ioptions);
@@ -110,10 +115,11 @@ struct MutableCFOptions {
   // Compaction related options
   bool disable_auto_compactions;
   double soft_rate_limit;
-  double hard_rate_limit;
+  uint64_t hard_pending_compaction_bytes_limit;
   int level0_file_num_compaction_trigger;
   int level0_slowdown_writes_trigger;
   int level0_stop_writes_trigger;
+  CompactionPri compaction_pri;
   int max_grandparent_overlap_factor;
   int expanded_compaction_factor;
   int source_compaction_factor;
@@ -122,12 +128,13 @@ struct MutableCFOptions {
   uint64_t max_bytes_for_level_base;
   int max_bytes_for_level_multiplier;
   std::vector<int> max_bytes_for_level_multiplier_additional;
-  int max_mem_compaction_level;
   bool verify_checksums_in_compaction;
+  int max_subcompactions;
 
   // Misc options
   uint64_t max_sequential_skip_in_iterations;
   bool paranoid_file_checks;
+  bool compaction_measure_io_stats;
 
   // Derived options
   // Per-level target file size.
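
These hunks retire the double-valued hard_rate_limit in favor of the
byte-denominated hard_pending_compaction_bytes_limit, and promote
compaction_pri, max_subcompactions and compaction_measure_io_stats into the
mutable set. A hedged sketch of configuring the renamed limit (the 64 GiB
figure is an arbitrary example, not a recommendation from the patch):

    #include "rocksdb/options.h"

    rocksdb::Options MakeOptions() {
      rocksdb::Options opt;
      // Stop writes once estimated pending compaction debt passes 64 GiB,
      // replacing the old hard_rate_limit slowdown factor.
      opt.hard_pending_compaction_bytes_limit = 64ULL << 30;
      opt.max_subcompactions = 4;  // allow splitting a compaction job
      return opt;
    }
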
diff --git a/src/rocksdb/util/options.cc b/src/rocksdb/util/options.cc
index 6bb462a..14b69e6 100644
--- a/src/rocksdb/util/options.cc
+++ b/src/rocksdb/util/options.cc
@@ -21,6 +21,7 @@
 #include "rocksdb/cache.h"
 #include "rocksdb/compaction_filter.h"
 #include "rocksdb/comparator.h"
+#include "rocksdb/delete_scheduler.h"
 #include "rocksdb/env.h"
 #include "rocksdb/memtablerep.h"
 #include "rocksdb/merge_operator.h"
@@ -44,7 +45,6 @@ ImmutableCFOptions::ImmutableCFOptions(const Options& options)
       merge_operator(options.merge_operator.get()),
       compaction_filter(options.compaction_filter),
       compaction_filter_factory(options.compaction_filter_factory.get()),
-      compaction_filter_factory_v2(options.compaction_filter_factory_v2.get()),
       inplace_update_support(options.inplace_update_support),
       inplace_callback(options.inplace_callback),
       info_log(options.info_log.get()),
@@ -69,34 +69,29 @@ ImmutableCFOptions::ImmutableCFOptions(const Options& options)
       level_compaction_dynamic_level_bytes(
           options.level_compaction_dynamic_level_bytes),
       access_hint_on_compaction_start(options.access_hint_on_compaction_start),
+      new_table_reader_for_compaction_inputs(
+          options.new_table_reader_for_compaction_inputs),
+      compaction_readahead_size(options.compaction_readahead_size),
       num_levels(options.num_levels),
-      optimize_filters_for_hits(options.optimize_filters_for_hits)
-#ifndef ROCKSDB_LITE
-      ,
-      listeners(options.listeners) {
-}
-#else  // ROCKSDB_LITE
-{
-}
-#endif  // ROCKSDB_LITE
+      optimize_filters_for_hits(options.optimize_filters_for_hits),
+      listeners(options.listeners),
+      row_cache(options.row_cache) {}
 
 ColumnFamilyOptions::ColumnFamilyOptions()
     : comparator(BytewiseComparator()),
       merge_operator(nullptr),
       compaction_filter(nullptr),
-      compaction_filter_factory(std::shared_ptr<CompactionFilterFactory>(
-          new DefaultCompactionFilterFactory())),
-      compaction_filter_factory_v2(new DefaultCompactionFilterFactoryV2()),
+      compaction_filter_factory(nullptr),
       write_buffer_size(4 << 20),
       max_write_buffer_number(2),
       min_write_buffer_number_to_merge(1),
-      compression(kSnappyCompression),
+      max_write_buffer_number_to_maintain(0),
+      compression(Snappy_Supported() ? kSnappyCompression : kNoCompression),
       prefix_extractor(nullptr),
       num_levels(7),
       level0_file_num_compaction_trigger(4),
       level0_slowdown_writes_trigger(20),
       level0_stop_writes_trigger(24),
-      max_mem_compaction_level(2),
       target_file_size_base(2 * 1048576),
       target_file_size_multiplier(1),
       max_bytes_for_level_base(10 * 1048576),
@@ -108,11 +103,13 @@ ColumnFamilyOptions::ColumnFamilyOptions()
       max_grandparent_overlap_factor(10),
       soft_rate_limit(0.0),
       hard_rate_limit(0.0),
+      hard_pending_compaction_bytes_limit(0),
       rate_limit_delay_max_milliseconds(1000),
       arena_block_size(0),
       disable_auto_compactions(false),
       purge_redundant_kvs_while_flush(true),
       compaction_style(kCompactionStyleLevel),
+      compaction_pri(kCompactionPriByCompensatedSize),
       verify_checksums_in_compaction(true),
       filter_deletes(false),
       max_sequential_skip_in_iterations(8),
@@ -129,13 +126,8 @@ ColumnFamilyOptions::ColumnFamilyOptions()
       max_successive_merges(0),
       min_partial_merge_operands(2),
       optimize_filters_for_hits(false),
-      paranoid_file_checks(false)
-#ifndef ROCKSDB_LITE
-      ,
-      listeners() {
-#else  // ROCKSDB_LITE
-{
-#endif  // ROCKSDB_LITE
+      paranoid_file_checks(false),
+      compaction_measure_io_stats(false) {
   assert(memtable_factory.get() != nullptr);
 }
 
@@ -144,11 +136,12 @@ ColumnFamilyOptions::ColumnFamilyOptions(const Options& options)
       merge_operator(options.merge_operator),
       compaction_filter(options.compaction_filter),
       compaction_filter_factory(options.compaction_filter_factory),
-      compaction_filter_factory_v2(options.compaction_filter_factory_v2),
       write_buffer_size(options.write_buffer_size),
       max_write_buffer_number(options.max_write_buffer_number),
       min_write_buffer_number_to_merge(
           options.min_write_buffer_number_to_merge),
+      max_write_buffer_number_to_maintain(
+          options.max_write_buffer_number_to_maintain),
       compression(options.compression),
       compression_per_level(options.compression_per_level),
       compression_opts(options.compression_opts),
@@ -158,7 +151,6 @@ ColumnFamilyOptions::ColumnFamilyOptions(const Options& options)
           options.level0_file_num_compaction_trigger),
       level0_slowdown_writes_trigger(options.level0_slowdown_writes_trigger),
       level0_stop_writes_trigger(options.level0_stop_writes_trigger),
-      max_mem_compaction_level(options.max_mem_compaction_level),
       target_file_size_base(options.target_file_size_base),
       target_file_size_multiplier(options.target_file_size_multiplier),
       max_bytes_for_level_base(options.max_bytes_for_level_base),
@@ -171,13 +163,15 @@ ColumnFamilyOptions::ColumnFamilyOptions(const Options& options)
       source_compaction_factor(options.source_compaction_factor),
       max_grandparent_overlap_factor(options.max_grandparent_overlap_factor),
       soft_rate_limit(options.soft_rate_limit),
-      hard_rate_limit(options.hard_rate_limit),
+      hard_pending_compaction_bytes_limit(
+          options.hard_pending_compaction_bytes_limit),
       rate_limit_delay_max_milliseconds(
           options.rate_limit_delay_max_milliseconds),
       arena_block_size(options.arena_block_size),
       disable_auto_compactions(options.disable_auto_compactions),
       purge_redundant_kvs_while_flush(options.purge_redundant_kvs_while_flush),
       compaction_style(options.compaction_style),
+      compaction_pri(options.compaction_pri),
       verify_checksums_in_compaction(options.verify_checksums_in_compaction),
       compaction_options_universal(options.compaction_options_universal),
       compaction_options_fifo(options.compaction_options_fifo),
@@ -199,13 +193,8 @@ ColumnFamilyOptions::ColumnFamilyOptions(const Options& options)
       max_successive_merges(options.max_successive_merges),
       min_partial_merge_operands(options.min_partial_merge_operands),
       optimize_filters_for_hits(options.optimize_filters_for_hits),
-      paranoid_file_checks(options.paranoid_file_checks)
-#ifndef ROCKSDB_LITE
-      ,
-      listeners(options.listeners) {
-#else   // ROCKSDB_LITE
-{
-#endif  // ROCKSDB_LITE
+      paranoid_file_checks(options.paranoid_file_checks),
+      compaction_measure_io_stats(options.compaction_measure_io_stats) {
   assert(memtable_factory.get() != nullptr);
   if (max_bytes_for_level_multiplier_additional.size() <
       static_cast<unsigned int>(num_levels)) {
@@ -220,6 +209,7 @@ DBOptions::DBOptions()
       paranoid_checks(true),
       env(Env::Default()),
       rate_limiter(nullptr),
+      delete_scheduler(nullptr),
       info_log(nullptr),
 #ifdef NDEBUG
       info_log_level(INFO_LEVEL),
@@ -227,6 +217,7 @@ DBOptions::DBOptions()
       info_log_level(DEBUG_LEVEL),
 #endif  // NDEBUG
       max_open_files(5000),
+      max_file_opening_threads(1),
       max_total_wal_size(0),
       statistics(nullptr),
       disableDataSync(false),
@@ -235,6 +226,7 @@ DBOptions::DBOptions()
       wal_dir(""),
       delete_obsolete_files_period_micros(6 * 60 * 60 * 1000000UL),
       max_background_compactions(1),
+      max_subcompactions(1),
       max_background_flushes(1),
       max_log_file_size(0),
       log_file_time_to_roll(0),
@@ -247,16 +239,23 @@ DBOptions::DBOptions()
       allow_os_buffer(true),
       allow_mmap_reads(false),
       allow_mmap_writes(false),
+      allow_fallocate(true),
       is_fd_close_on_exec(true),
       skip_log_error_on_recovery(false),
-      stats_dump_period_sec(3600),
+      stats_dump_period_sec(600),
       advise_random_on_open(true),
       db_write_buffer_size(0),
       access_hint_on_compaction_start(NORMAL),
+      new_table_reader_for_compaction_inputs(false),
+      compaction_readahead_size(0),
       use_adaptive_mutex(false),
       bytes_per_sync(0),
       wal_bytes_per_sync(0),
-      enable_thread_tracking(false) {
+      listeners(),
+      enable_thread_tracking(false),
+      delayed_write_rate(1024U * 1024U),
+      skip_stats_update_on_db_open(false),
+      wal_recovery_mode(WALRecoveryMode::kTolerateCorruptedTailRecords) {
 }
 
 DBOptions::DBOptions(const Options& options)
@@ -266,9 +265,11 @@ DBOptions::DBOptions(const Options& options)
       paranoid_checks(options.paranoid_checks),
       env(options.env),
       rate_limiter(options.rate_limiter),
+      delete_scheduler(options.delete_scheduler),
       info_log(options.info_log),
       info_log_level(options.info_log_level),
       max_open_files(options.max_open_files),
+      max_file_opening_threads(options.max_file_opening_threads),
       max_total_wal_size(options.max_total_wal_size),
       statistics(options.statistics),
       disableDataSync(options.disableDataSync),
@@ -279,6 +280,7 @@ DBOptions::DBOptions(const Options& options)
       delete_obsolete_files_period_micros(
           options.delete_obsolete_files_period_micros),
       max_background_compactions(options.max_background_compactions),
+      max_subcompactions(options.max_subcompactions),
       max_background_flushes(options.max_background_flushes),
       max_log_file_size(options.max_log_file_size),
       log_file_time_to_roll(options.log_file_time_to_roll),
@@ -291,214 +293,262 @@ DBOptions::DBOptions(const Options& options)
       allow_os_buffer(options.allow_os_buffer),
       allow_mmap_reads(options.allow_mmap_reads),
       allow_mmap_writes(options.allow_mmap_writes),
+      allow_fallocate(options.allow_fallocate),
       is_fd_close_on_exec(options.is_fd_close_on_exec),
       skip_log_error_on_recovery(options.skip_log_error_on_recovery),
       stats_dump_period_sec(options.stats_dump_period_sec),
       advise_random_on_open(options.advise_random_on_open),
       db_write_buffer_size(options.db_write_buffer_size),
       access_hint_on_compaction_start(options.access_hint_on_compaction_start),
+      new_table_reader_for_compaction_inputs(
+          options.new_table_reader_for_compaction_inputs),
+      compaction_readahead_size(options.compaction_readahead_size),
       use_adaptive_mutex(options.use_adaptive_mutex),
       bytes_per_sync(options.bytes_per_sync),
       wal_bytes_per_sync(options.wal_bytes_per_sync),
-      enable_thread_tracking(options.enable_thread_tracking) {}
+      listeners(options.listeners),
+      enable_thread_tracking(options.enable_thread_tracking),
+      delayed_write_rate(options.delayed_write_rate),
+      skip_stats_update_on_db_open(options.skip_stats_update_on_db_open),
+      wal_recovery_mode(options.wal_recovery_mode),
+      row_cache(options.row_cache) {}
 
 static const char* const access_hints[] = {
   "NONE", "NORMAL", "SEQUENTIAL", "WILLNEED"
 };
 
 void DBOptions::Dump(Logger* log) const {
-    Log(log,"         Options.error_if_exists: %d", error_if_exists);
-    Log(log,"       Options.create_if_missing: %d", create_if_missing);
-    Log(log,"         Options.paranoid_checks: %d", paranoid_checks);
-    Log(log,"                     Options.env: %p", env);
-    Log(log,"                Options.info_log: %p", info_log.get());
-    Log(log,"          Options.max_open_files: %d", max_open_files);
-    Log(log,"      Options.max_total_wal_size: %" PRIu64, max_total_wal_size);
-    Log(log, "       Options.disableDataSync: %d", disableDataSync);
-    Log(log, "             Options.use_fsync: %d", use_fsync);
-    Log(log, "     Options.max_log_file_size: %zu", max_log_file_size);
-    Log(log, "Options.max_manifest_file_size: %" PRIu64,
-        max_manifest_file_size);
-    Log(log, "     Options.log_file_time_to_roll: %zu", log_file_time_to_roll);
-    Log(log, "     Options.keep_log_file_num: %zu", keep_log_file_num);
-    Log(log, "       Options.allow_os_buffer: %d", allow_os_buffer);
-    Log(log, "      Options.allow_mmap_reads: %d", allow_mmap_reads);
-    Log(log, "     Options.allow_mmap_writes: %d", allow_mmap_writes);
-    Log(log, "         Options.create_missing_column_families: %d",
+    Header(log, "         Options.error_if_exists: %d", error_if_exists);
+    Header(log, "       Options.create_if_missing: %d", create_if_missing);
+    Header(log, "         Options.paranoid_checks: %d", paranoid_checks);
+    Header(log, "                     Options.env: %p", env);
+    Header(log, "                Options.info_log: %p", info_log.get());
+    Header(log, "          Options.max_open_files: %d", max_open_files);
+    Header(log,
+        "Options.max_file_opening_threads: %d", max_file_opening_threads);
+    Header(log,
+        "      Options.max_total_wal_size: %" PRIu64, max_total_wal_size);
+    Header(log, "       Options.disableDataSync: %d", disableDataSync);
+    Header(log, "             Options.use_fsync: %d", use_fsync);
+    Header(log, "     Options.max_log_file_size: %" ROCKSDB_PRIszt,
+         max_log_file_size);
+    Header(log, "Options.max_manifest_file_size: %" PRIu64,
+         max_manifest_file_size);
+    Header(log, "     Options.log_file_time_to_roll: %" ROCKSDB_PRIszt,
+         log_file_time_to_roll);
+    Header(log, "     Options.keep_log_file_num: %" ROCKSDB_PRIszt,
+         keep_log_file_num);
+    Header(log, "       Options.allow_os_buffer: %d", allow_os_buffer);
+    Header(log, "      Options.allow_mmap_reads: %d", allow_mmap_reads);
+    Header(log, "      Options.allow_fallocate: %d", allow_fallocate);
+    Header(log, "     Options.allow_mmap_writes: %d", allow_mmap_writes);
+    Header(log, "         Options.create_missing_column_families: %d",
         create_missing_column_families);
-    Log(log, "                             Options.db_log_dir: %s",
+    Header(log, "                             Options.db_log_dir: %s",
         db_log_dir.c_str());
-    Log(log, "                                Options.wal_dir: %s",
+    Header(log, "                                Options.wal_dir: %s",
         wal_dir.c_str());
-    Log(log, "               Options.table_cache_numshardbits: %d",
+    Header(log, "               Options.table_cache_numshardbits: %d",
         table_cache_numshardbits);
-    Log(log, "    Options.delete_obsolete_files_period_micros: %" PRIu64,
+    Header(log, "    Options.delete_obsolete_files_period_micros: %" PRIu64,
         delete_obsolete_files_period_micros);
-    Log(log, "             Options.max_background_compactions: %d",
+    Header(log, "             Options.max_background_compactions: %d",
         max_background_compactions);
-    Log(log, "                 Options.max_background_flushes: %d",
+    Header(log, "                     Options.max_subcompactions: %" PRIu32,
+        max_subcompactions);
+    Header(log, "                 Options.max_background_flushes: %d",
         max_background_flushes);
-    Log(log, "                        Options.WAL_ttl_seconds: %" PRIu64,
+    Header(log, "                        Options.WAL_ttl_seconds: %" PRIu64,
         WAL_ttl_seconds);
-    Log(log, "                      Options.WAL_size_limit_MB: %" PRIu64,
+    Header(log, "                      Options.WAL_size_limit_MB: %" PRIu64,
         WAL_size_limit_MB);
-    Log(log, "            Options.manifest_preallocation_size: %zu",
-        manifest_preallocation_size);
-    Log(log, "                         Options.allow_os_buffer: %d",
+    Header(log,
+         "            Options.manifest_preallocation_size: %" ROCKSDB_PRIszt,
+         manifest_preallocation_size);
+    Header(log, "                         Options.allow_os_buffer: %d",
         allow_os_buffer);
-    Log(log, "                        Options.allow_mmap_reads: %d",
+    Header(log, "                        Options.allow_mmap_reads: %d",
         allow_mmap_reads);
-    Log(log, "                       Options.allow_mmap_writes: %d",
+    Header(log, "                       Options.allow_mmap_writes: %d",
         allow_mmap_writes);
-    Log(log, "                     Options.is_fd_close_on_exec: %d",
+    Header(log, "                     Options.is_fd_close_on_exec: %d",
         is_fd_close_on_exec);
-    Log(log, "                   Options.stats_dump_period_sec: %u",
+    Header(log, "                   Options.stats_dump_period_sec: %u",
         stats_dump_period_sec);
-    Log(log, "                   Options.advise_random_on_open: %d",
+    Header(log, "                   Options.advise_random_on_open: %d",
         advise_random_on_open);
-    Log(log, "                    Options.db_write_buffer_size: %zd",
-        db_write_buffer_size);
-    Log(log, "         Options.access_hint_on_compaction_start: %s",
+    Header(log,
+         "                    Options.db_write_buffer_size: %" ROCKSDB_PRIszt,
+         db_write_buffer_size);
+    Header(log, "         Options.access_hint_on_compaction_start: %s",
         access_hints[access_hint_on_compaction_start]);
-    Log(log, "                      Options.use_adaptive_mutex: %d",
+    Header(log, "  Options.new_table_reader_for_compaction_inputs: %d",
+         new_table_reader_for_compaction_inputs);
+    Header(log,
+         "               Options.compaction_readahead_size: %" ROCKSDB_PRIszt,
+         compaction_readahead_size);
+    Header(log, "                      Options.use_adaptive_mutex: %d",
         use_adaptive_mutex);
-    Log(log, "                            Options.rate_limiter: %p",
+    Header(log, "                            Options.rate_limiter: %p",
         rate_limiter.get());
-    Log(log, "                          Options.bytes_per_sync: %" PRIu64,
+    Header(log, "     Options.delete_scheduler.rate_bytes_per_sec: %" PRIi64,
+         delete_scheduler ? delete_scheduler->GetRateBytesPerSecond() : 0);
+    Header(log, "                          Options.bytes_per_sync: %" PRIu64,
         bytes_per_sync);
-    Log(log, "                      Options.wal_bytes_per_sync: %" PRIu64,
+    Header(log, "                      Options.wal_bytes_per_sync: %" PRIu64,
         wal_bytes_per_sync);
-    Log(log, "                  Options.enable_thread_tracking: %d",
+    Header(log, "                       Options.wal_recovery_mode: %d",
+        wal_recovery_mode);
+    Header(log, "                  Options.enable_thread_tracking: %d",
         enable_thread_tracking);
+    if (row_cache) {
+      Header(log, "                               Options.row_cache: %" PRIu64,
+           row_cache->GetCapacity());
+    } else {
+      Header(log, "                               Options.row_cache: None");
+    }
 }  // DBOptions::Dump
 
 void ColumnFamilyOptions::Dump(Logger* log) const {
-  Log(log, "              Options.comparator: %s", comparator->Name());
-  Log(log, "          Options.merge_operator: %s",
+  Header(log, "              Options.comparator: %s", comparator->Name());
+  Header(log, "          Options.merge_operator: %s",
       merge_operator ? merge_operator->Name() : "None");
-  Log(log, "       Options.compaction_filter: %s",
+  Header(log, "       Options.compaction_filter: %s",
       compaction_filter ? compaction_filter->Name() : "None");
-  Log(log, "       Options.compaction_filter_factory: %s",
-      compaction_filter_factory->Name());
-  Log(log, "       Options.compaction_filter_factory_v2: %s",
-      compaction_filter_factory_v2->Name());
-  Log(log, "        Options.memtable_factory: %s", memtable_factory->Name());
-  Log(log, "           Options.table_factory: %s", table_factory->Name());
-  Log(log, "           table_factory options: %s",
+  Header(log, "       Options.compaction_filter_factory: %s",
+      compaction_filter_factory ? compaction_filter_factory->Name() : "None");
+  Header(log, "        Options.memtable_factory: %s", memtable_factory->Name());
+  Header(log, "           Options.table_factory: %s", table_factory->Name());
+  Header(log, "           table_factory options: %s",
       table_factory->GetPrintableTableOptions().c_str());
-  Log(log, "       Options.write_buffer_size: %zd", write_buffer_size);
-  Log(log, " Options.max_write_buffer_number: %d", max_write_buffer_number);
+  Header(log, "       Options.write_buffer_size: %" ROCKSDB_PRIszt,
+       write_buffer_size);
+  Header(log, " Options.max_write_buffer_number: %d", max_write_buffer_number);
     if (!compression_per_level.empty()) {
       for (unsigned int i = 0; i < compression_per_level.size(); i++) {
-        Log(log, "       Options.compression[%d]: %s", i,
-            CompressionTypeToString(compression_per_level[i]));
+        Header(log, "       Options.compression[%d]: %s", i,
+            CompressionTypeToString(compression_per_level[i]).c_str());
       }
     } else {
-      Log(log, "         Options.compression: %s",
-          CompressionTypeToString(compression));
+      Header(log, "         Options.compression: %s",
+          CompressionTypeToString(compression).c_str());
     }
-    Log(log,"      Options.prefix_extractor: %s",
+    Header(log, "      Options.prefix_extractor: %s",
         prefix_extractor == nullptr ? "nullptr" : prefix_extractor->Name());
-    Log(log,"            Options.num_levels: %d", num_levels);
-    Log(log,"       Options.min_write_buffer_number_to_merge: %d",
+    Header(log, "            Options.num_levels: %d", num_levels);
+    Header(log, "       Options.min_write_buffer_number_to_merge: %d",
         min_write_buffer_number_to_merge);
-    Log(log,"        Options.purge_redundant_kvs_while_flush: %d",
-         purge_redundant_kvs_while_flush);
-    Log(log,"           Options.compression_opts.window_bits: %d",
+    Header(log, "    Options.max_write_buffer_number_to_maintain: %d",
+         max_write_buffer_number_to_maintain);
+    Header(log, "           Options.compression_opts.window_bits: %d",
         compression_opts.window_bits);
-    Log(log,"                 Options.compression_opts.level: %d",
+    Header(log, "                 Options.compression_opts.level: %d",
         compression_opts.level);
-    Log(log,"              Options.compression_opts.strategy: %d",
+    Header(log, "              Options.compression_opts.strategy: %d",
         compression_opts.strategy);
-    Log(log,"     Options.level0_file_num_compaction_trigger: %d",
+    Header(log, "     Options.level0_file_num_compaction_trigger: %d",
         level0_file_num_compaction_trigger);
-    Log(log,"         Options.level0_slowdown_writes_trigger: %d",
+    Header(log, "         Options.level0_slowdown_writes_trigger: %d",
         level0_slowdown_writes_trigger);
-    Log(log,"             Options.level0_stop_writes_trigger: %d",
+    Header(log, "             Options.level0_stop_writes_trigger: %d",
         level0_stop_writes_trigger);
-    Log(log,"               Options.max_mem_compaction_level: %d",
-        max_mem_compaction_level);
-    Log(log,"                  Options.target_file_size_base: %" PRIu64,
+    Header(log, "                  Options.target_file_size_base: %" PRIu64,
         target_file_size_base);
-    Log(log,"            Options.target_file_size_multiplier: %d",
+    Header(log, "            Options.target_file_size_multiplier: %d",
         target_file_size_multiplier);
-    Log(log,"               Options.max_bytes_for_level_base: %" PRIu64,
+    Header(log, "               Options.max_bytes_for_level_base: %" PRIu64,
         max_bytes_for_level_base);
-    Log(log, "Options.level_compaction_dynamic_level_bytes: %d",
+    Header(log, "Options.level_compaction_dynamic_level_bytes: %d",
         level_compaction_dynamic_level_bytes);
-    Log(log,"         Options.max_bytes_for_level_multiplier: %d",
+    Header(log, "         Options.max_bytes_for_level_multiplier: %d",
         max_bytes_for_level_multiplier);
     for (size_t i = 0; i < max_bytes_for_level_multiplier_additional.size();
          i++) {
-      Log(log, "Options.max_bytes_for_level_multiplier_addtl[%zu]: %d", i,
-          max_bytes_for_level_multiplier_additional[i]);
+      Header(log,
+          "Options.max_bytes_for_level_multiplier_addtl[%" ROCKSDB_PRIszt
+                "]: %d",
+           i, max_bytes_for_level_multiplier_additional[i]);
     }
-    Log(log,"      Options.max_sequential_skip_in_iterations: %" PRIu64,
+    Header(log, "      Options.max_sequential_skip_in_iterations: %" PRIu64,
         max_sequential_skip_in_iterations);
-    Log(log,"             Options.expanded_compaction_factor: %d",
+    Header(log, "             Options.expanded_compaction_factor: %d",
         expanded_compaction_factor);
-    Log(log,"               Options.source_compaction_factor: %d",
+    Header(log, "               Options.source_compaction_factor: %d",
         source_compaction_factor);
-    Log(log,"         Options.max_grandparent_overlap_factor: %d",
+    Header(log, "         Options.max_grandparent_overlap_factor: %d",
         max_grandparent_overlap_factor);
-    Log(log,"                       Options.arena_block_size: %zu",
-        arena_block_size);
-    Log(log,"                      Options.soft_rate_limit: %.2f",
+
+    Header(log,
+         "                       Options.arena_block_size: %" ROCKSDB_PRIszt,
+         arena_block_size);
+    Header(log, "                      Options.soft_rate_limit: %.2f",
         soft_rate_limit);
-    Log(log,"                      Options.hard_rate_limit: %.2f",
-        hard_rate_limit);
-    Log(log,"      Options.rate_limit_delay_max_milliseconds: %u",
+    Header(log, "  Options.hard_pending_compaction_bytes_limit: %" PRIu64,
+         hard_pending_compaction_bytes_limit);
+    Header(log, "      Options.rate_limit_delay_max_milliseconds: %u",
         rate_limit_delay_max_milliseconds);
-    Log(log,"               Options.disable_auto_compactions: %d",
+    Header(log, "               Options.disable_auto_compactions: %d",
         disable_auto_compactions);
-    Log(log,"         Options.purge_redundant_kvs_while_flush: %d",
-        purge_redundant_kvs_while_flush);
-    Log(log,"                          Options.filter_deletes: %d",
+    Header(log, "                          Options.filter_deletes: %d",
         filter_deletes);
-    Log(log, "          Options.verify_checksums_in_compaction: %d",
+    Header(log, "          Options.verify_checksums_in_compaction: %d",
         verify_checksums_in_compaction);
-    Log(log,"                        Options.compaction_style: %d",
+    Header(log, "                        Options.compaction_style: %d",
         compaction_style);
-    Log(log," Options.compaction_options_universal.size_ratio: %u",
+    Header(log, "                          Options.compaction_pri: %d",
+           compaction_pri);
+    Header(log, " Options.compaction_options_universal.size_ratio: %u",
         compaction_options_universal.size_ratio);
-    Log(log,"Options.compaction_options_universal.min_merge_width: %u",
+    Header(log, "Options.compaction_options_universal.min_merge_width: %u",
         compaction_options_universal.min_merge_width);
-    Log(log,"Options.compaction_options_universal.max_merge_width: %u",
+    Header(log, "Options.compaction_options_universal.max_merge_width: %u",
         compaction_options_universal.max_merge_width);
-    Log(log,"Options.compaction_options_universal."
+    Header(log, "Options.compaction_options_universal."
             "max_size_amplification_percent: %u",
         compaction_options_universal.max_size_amplification_percent);
-    Log(log,
+    Header(log,
         "Options.compaction_options_universal.compression_size_percent: %d",
         compaction_options_universal.compression_size_percent);
-    Log(log, "Options.compaction_options_fifo.max_table_files_size: %" PRIu64,
+    Header(log,
+        "Options.compaction_options_fifo.max_table_files_size: %" PRIu64,
         compaction_options_fifo.max_table_files_size);
     std::string collector_names;
     for (const auto& collector_factory : table_properties_collector_factories) {
       collector_names.append(collector_factory->Name());
       collector_names.append("; ");
     }
-    Log(log, "                  Options.table_properties_collectors: %s",
+    Header(log, "                  Options.table_properties_collectors: %s",
         collector_names.c_str());
-    Log(log, "                  Options.inplace_update_support: %d",
+    Header(log, "                  Options.inplace_update_support: %d",
         inplace_update_support);
-    Log(log, "                Options.inplace_update_num_locks: %zd",
-        inplace_update_num_locks);
-    Log(log, "              Options.min_partial_merge_operands: %u",
+    Header(log,
+         "                Options.inplace_update_num_locks: %" ROCKSDB_PRIszt,
+         inplace_update_num_locks);
+    Header(log, "              Options.min_partial_merge_operands: %u",
         min_partial_merge_operands);
     // TODO: easier config for bloom (maybe based on avg key/value size)
-    Log(log, "              Options.memtable_prefix_bloom_bits: %d",
+    Header(log, "              Options.memtable_prefix_bloom_bits: %d",
         memtable_prefix_bloom_bits);
-    Log(log, "            Options.memtable_prefix_bloom_probes: %d",
+    Header(log, "            Options.memtable_prefix_bloom_probes: %d",
         memtable_prefix_bloom_probes);
-    Log(log, "  Options.memtable_prefix_bloom_huge_page_tlb_size: %zu",
-        memtable_prefix_bloom_huge_page_tlb_size);
-    Log(log, "                          Options.bloom_locality: %d",
+
+    Header(log,
+         "  Options.memtable_prefix_bloom_huge_page_tlb_size: %" ROCKSDB_PRIszt,
+         memtable_prefix_bloom_huge_page_tlb_size);
+    Header(log, "                          Options.bloom_locality: %d",
         bloom_locality);
-    Log(log, "                   Options.max_successive_merges: %zd",
-        max_successive_merges);
-    Log(log, "               Options.optimize_fllters_for_hits: %d",
+
+    Header(log,
+         "                   Options.max_successive_merges: %" ROCKSDB_PRIszt,
+         max_successive_merges);
+    Header(log, "               Options.optimize_fllters_for_hits: %d",
         optimize_filters_for_hits);
+    Header(log, "               Options.paranoid_file_checks: %d",
+         paranoid_file_checks);
+    Header(log, "               Options.compaction_measure_io_stats: %d",
+         compaction_measure_io_stats);
 }  // ColumnFamilyOptions::Dump
 
 void Options::Dump(Logger* log) const {
@@ -506,6 +556,10 @@ void Options::Dump(Logger* log) const {
   ColumnFamilyOptions::Dump(log);
 }   // Options::Dump
 
+void Options::DumpCFOptions(Logger* log) const {
+  ColumnFamilyOptions::Dump(log);
+}  // Options::DumpCFOptions
+
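
A minimal caller-side sketch of the new DumpCFOptions() entry point
(illustrative only; the log path is an assumption, Env::NewLogger is
the standard RocksDB way to obtain a Logger):

    std::shared_ptr<rocksdb::Logger> info_log;
    rocksdb::Env::Default()->NewLogger("/tmp/options.log", &info_log);
    rocksdb::Options opts;
    opts.DumpCFOptions(info_log.get());  // logs only the CF subset
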
 //
 // The goal of this method is to create a configuration that
 // allows an application to write all files into L0 and
@@ -551,46 +605,6 @@ Options::PrepareForBulkLoad()
   return this;
 }
 
-const char* CompressionTypeToString(CompressionType compression_type) {
-  switch (compression_type) {
-    case kNoCompression:
-      return "NoCompression";
-    case kSnappyCompression:
-      return "Snappy";
-    case kZlibCompression:
-      return "Zlib";
-    case kBZip2Compression:
-      return "BZip2";
-    case kLZ4Compression:
-      return "LZ4";
-    case kLZ4HCCompression:
-      return "LZ4HC";
-    default:
-      assert(false);
-      return "";
-  }
-}
-
-bool CompressionTypeSupported(CompressionType compression_type) {
-  switch (compression_type) {
-    case kNoCompression:
-      return true;
-    case kSnappyCompression:
-      return Snappy_Supported();
-    case kZlibCompression:
-      return Zlib_Supported();
-    case kBZip2Compression:
-      return BZip2_Supported();
-    case kLZ4Compression:
-      return LZ4_Supported();
-    case kLZ4HCCompression:
-      return LZ4_Supported();
-    default:
-      assert(false);
-      return false;
-  }
-}
-
 #ifndef ROCKSDB_LITE
 // Optimization functions
 ColumnFamilyOptions* ColumnFamilyOptions::OptimizeForPointLookup(
diff --git a/src/rocksdb/util/options_helper.cc b/src/rocksdb/util/options_helper.cc
index 07fc053..78ae599 100644
--- a/src/rocksdb/util/options_helper.cc
+++ b/src/rocksdb/util/options_helper.cc
@@ -2,44 +2,176 @@
 //  This source code is licensed under the BSD-style license found in the
 //  LICENSE file in the root directory of this source tree. An additional grant
 //  of patent rights can be found in the PATENTS file in the same directory.
+#include "util/options_helper.h"
 
 #include <cassert>
 #include <cctype>
 #include <cstdlib>
 #include <unordered_set>
+#include <vector>
 #include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/convenience.h"
 #include "rocksdb/filter_policy.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/merge_operator.h"
 #include "rocksdb/options.h"
 #include "rocksdb/rate_limiter.h"
 #include "rocksdb/slice_transform.h"
 #include "rocksdb/table.h"
-#include "rocksdb/utilities/convenience.h"
 #include "table/block_based_table_factory.h"
 #include "util/logging.h"
-#include "util/options_helper.h"
+#include "util/string_util.h"
 
 namespace rocksdb {
 
 #ifndef ROCKSDB_LITE
+bool isSpecialChar(const char c) {
+  if (c == '\\' || c == '#' || c == ':' || c == '\r' || c == '\n') {
+    return true;
+  }
+  return false;
+}
+
+char UnescapeChar(const char c) {
+  static const std::unordered_map<char, char> convert_map = {{'r', '\r'},
+                                                             {'n', '\n'}};
+
+  auto iter = convert_map.find(c);
+  if (iter == convert_map.end()) {
+    return c;
+  }
+  return iter->second;
+}
+
+char EscapeChar(const char c) {
+  static const std::unordered_map<char, char> convert_map = {{'\n', 'n'},
+                                                             {'\r', 'r'}};
+
+  auto iter = convert_map.find(c);
+  if (iter == convert_map.end()) {
+    return c;
+  }
+  return iter->second;
+}
+
+std::string EscapeOptionString(const std::string& raw_string) {
+  std::string output;
+  for (auto c : raw_string) {
+    if (isSpecialChar(c)) {
+      output += '\\';
+      output += EscapeChar(c);
+    } else {
+      output += c;
+    }
+  }
+
+  return output;
+}
+
+std::string UnescapeOptionString(const std::string& escaped_string) {
+  bool escaped = false;
+  std::string output;
+
+  for (auto c : escaped_string) {
+    if (escaped) {
+      output += UnescapeChar(c);
+      escaped = false;
+    } else {
+      if (c == '\\') {
+        escaped = true;
+        continue;
+      }
+      output += c;
+    }
+  }
+  return output;
+}
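
A quick round-trip sketch of the escaping contract above (illustrative
only): ':' becomes "\:" and '\n' becomes backslash + 'n', so the
escape/unescape pair is lossless:

    std::string raw = "key:value\n";
    std::string esc = rocksdb::EscapeOptionString(raw);  // "key\\:value\\n"
    assert(rocksdb::UnescapeOptionString(esc) == raw);
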
 
 namespace {
-CompressionType ParseCompressionType(const std::string& type) {
-  if (type == "kNoCompression") {
-    return kNoCompression;
-  } else if (type == "kSnappyCompression") {
-    return kSnappyCompression;
-  } else if (type == "kZlibCompression") {
-    return kZlibCompression;
-  } else if (type == "kBZip2Compression") {
-    return kBZip2Compression;
-  } else if (type == "kLZ4Compression") {
-    return kLZ4Compression;
-  } else if (type == "kLZ4HCCompression") {
-    return kLZ4HCCompression;
+std::string trim(const std::string& str) {
+  if (str.empty()) return std::string();
+  size_t start = 0;
+  size_t end = str.size() - 1;
+  while (isspace(str[start]) != 0 && start <= end) {
+    ++start;
+  }
+  while (isspace(str[end]) != 0 && start <= end) {
+    --end;
+  }
+  if (start <= end) {
+    return str.substr(start, end - start + 1);
+  }
+  return std::string();
+}
+
+bool SerializeCompressionType(const CompressionType& type, std::string* value) {
+  switch (type) {
+    case kNoCompression:
+      *value = "kNoCompression";
+      return true;
+    case kSnappyCompression:
+      *value = "kSnappyCompression";
+      return true;
+    case kZlibCompression:
+      *value = "kZlibCompression";
+      return true;
+    case kBZip2Compression:
+      *value = "kBZip2Compression";
+      return true;
+    case kLZ4Compression:
+      *value = "kLZ4Compression";
+      return true;
+    case kLZ4HCCompression:
+      *value = "kLZ4HCCompression";
+      return true;
+    case kZSTDNotFinalCompression:
+      *value = "kZSTDNotFinalCompression";
+      return true;
+    default:
+      return false;
+  }
+}
+
+bool SerializeVectorCompressionType(const std::vector<CompressionType>& types,
+                                    std::string* value) {
+  std::stringstream ss;
+  bool result;
+  for (size_t i = 0; i < types.size(); ++i) {
+    if (i > 0) {
+      ss << ':';
+    }
+    std::string string_type;
+    result = SerializeCompressionType(types[i], &string_type);
+    if (result == false) {
+      return result;
+    }
+    ss << string_type;
+  }
+  *value = ss.str();
+  return true;
+}
+
+bool ParseCompressionType(const std::string& string_value,
+                          CompressionType* type) {
+  if (string_value == "kNoCompression") {
+    *type = kNoCompression;
+  } else if (string_value == "kSnappyCompression") {
+    *type = kSnappyCompression;
+  } else if (string_value == "kZlibCompression") {
+    *type = kZlibCompression;
+  } else if (string_value == "kBZip2Compression") {
+    *type = kBZip2Compression;
+  } else if (string_value == "kLZ4Compression") {
+    *type = kLZ4Compression;
+  } else if (string_value == "kLZ4HCCompression") {
+    *type = kLZ4HCCompression;
+  } else if (string_value == "kZSTDNotFinalCompression") {
+    *type = kZSTDNotFinalCompression;
   } else {
-    throw std::invalid_argument("Unknown compression type: " + type);
+    return false;
   }
-  return kNoCompression;
+  return true;
 }
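
Illustrative round trip through the two helpers above (they live in the
anonymous namespace, so this sketch would sit in the same file):

    CompressionType type;
    if (ParseCompressionType("kLZ4Compression", &type)) {
      std::string name;
      SerializeCompressionType(type, &name);  // name == "kLZ4Compression"
    }
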
 
 BlockBasedTableOptions::IndexType ParseBlockBasedTableIndexType(
@@ -141,21 +273,253 @@ double ParseDouble(const std::string& value) {
   return std::strtod(value.c_str(), 0);
 #endif
 }
+static const std::unordered_map<char, std::string>
+    compaction_style_to_string_map = {
+        {kCompactionStyleLevel, "kCompactionStyleLevel"},
+        {kCompactionStyleUniversal, "kCompactionStyleUniversal"},
+        {kCompactionStyleFIFO, "kCompactionStyleFIFO"},
+        {kCompactionStyleNone, "kCompactionStyleNone"}};
 
 CompactionStyle ParseCompactionStyle(const std::string& type) {
-  if (type == "kCompactionStyleLevel") {
-    return kCompactionStyleLevel;
-  } else if (type == "kCompactionStyleUniversal") {
-    return kCompactionStyleUniversal;
-  } else if (type == "kCompactionStyleFIFO") {
-    return kCompactionStyleFIFO;
-  } else {
-    throw std::invalid_argument("unknown compaction style: " + type);
+  for (auto const& entry : compaction_style_to_string_map) {
+    if (entry.second == type) {
+      return static_cast<CompactionStyle>(entry.first);
+    }
   }
+  throw std::invalid_argument("unknown compaction style: " + type);
   return kCompactionStyleLevel;
 }
+
+std::string CompactionStyleToString(const CompactionStyle style) {
+  auto iter = compaction_style_to_string_map.find(style);
+  assert(iter != compaction_style_to_string_map.end());
+  return iter->second;
+}
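
Sketch of the string/enum mapping driven by the table above:

    CompactionStyle style = ParseCompactionStyle("kCompactionStyleUniversal");
    std::string name = CompactionStyleToString(style);
    // name == "kCompactionStyleUniversal"
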
+
+bool ParseVectorCompressionType(
+    const std::string& value,
+    std::vector<CompressionType>* compression_per_level) {
+  compression_per_level->clear();
+  size_t start = 0;
+  while (start < value.size()) {
+    size_t end = value.find(':', start);
+    bool is_ok;
+    CompressionType type;
+    if (end == std::string::npos) {
+      is_ok = ParseCompressionType(value.substr(start), &type);
+      if (!is_ok) {
+        return false;
+      }
+      compression_per_level->emplace_back(type);
+      break;
+    } else {
+      is_ok = ParseCompressionType(value.substr(start, end - start), &type);
+      if (!is_ok) {
+        return false;
+      }
+      compression_per_level->emplace_back(type);
+      start = end + 1;
+    }
+  }
+  return true;
+}
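
For example (illustrative), a colon-separated list yields one
CompressionType per level, in order:

    std::vector<CompressionType> per_level;
    bool ok = ParseVectorCompressionType(
        "kNoCompression:kSnappyCompression:kZlibCompression", &per_level);
    // ok == true; per_level.size() == 3
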
+
+bool ParseSliceTransformHelper(
+    const std::string& kFixedPrefixName, const std::string& kCappedPrefixName,
+    const std::string& value,
+    std::shared_ptr<const SliceTransform>* slice_transform) {
+  auto& pe_value = value;
+  if (pe_value.size() > kFixedPrefixName.size() &&
+      pe_value.compare(0, kFixedPrefixName.size(), kFixedPrefixName) == 0) {
+    int prefix_length = ParseInt(trim(value.substr(kFixedPrefixName.size())));
+    slice_transform->reset(NewFixedPrefixTransform(prefix_length));
+  } else if (pe_value.size() > kCappedPrefixName.size() &&
+             pe_value.compare(0, kCappedPrefixName.size(), kCappedPrefixName) ==
+                 0) {
+    int prefix_length =
+        ParseInt(trim(pe_value.substr(kCappedPrefixName.size())));
+    slice_transform->reset(NewCappedPrefixTransform(prefix_length));
+  } else if (value == "nullptr") {
+    slice_transform->reset();
+  } else {
+    return false;
+  }
+
+  return true;
+}
+
+bool ParseSliceTransform(
+    const std::string& value,
+    std::shared_ptr<const SliceTransform>* slice_transform) {
+  // While we normally don't convert the string representation of a
+  // pointer-typed option into its instance, here we do so for backward
+  // compatibility as we allow this action in SetOption().
+
+  // TODO(yhchiang): A possible better place for these serialization /
+  // deserialization is inside the class definition of pointer-typed
+  // option itself, but this requires a bigger change of public API.
+  bool result =
+      ParseSliceTransformHelper("fixed:", "capped:", value, slice_transform);
+  if (result) {
+    return result;
+  }
+  result = ParseSliceTransformHelper(
+      "rocksdb.FixedPrefix.", "rocksdb.CappedPrefix.", value, slice_transform);
+  if (result) {
+    return result;
+  }
+  // TODO(yhchiang): we can further support other default
+  //                 SliceTransforms here.
+  return false;
+}
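
Sketch of the prefix-extractor spellings accepted by the helper above:

    std::shared_ptr<const SliceTransform> st;
    ParseSliceTransform("fixed:8", &st);    // NewFixedPrefixTransform(8)
    ParseSliceTransform("capped:16", &st);  // NewCappedPrefixTransform(16)
    ParseSliceTransform("nullptr", &st);    // clears the transform
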
+
+bool ParseOptionHelper(char* opt_address, const OptionType& opt_type,
+                       const std::string& value) {
+  switch (opt_type) {
+    case OptionType::kBoolean:
+      *reinterpret_cast<bool*>(opt_address) = ParseBoolean("", value);
+      break;
+    case OptionType::kInt:
+      *reinterpret_cast<int*>(opt_address) = ParseInt(value);
+      break;
+    case OptionType::kUInt:
+      *reinterpret_cast<unsigned int*>(opt_address) = ParseUint32(value);
+      break;
+    case OptionType::kUInt32T:
+      *reinterpret_cast<uint32_t*>(opt_address) = ParseUint32(value);
+      break;
+    case OptionType::kUInt64T:
+      *reinterpret_cast<uint64_t*>(opt_address) = ParseUint64(value);
+      break;
+    case OptionType::kSizeT:
+      *reinterpret_cast<size_t*>(opt_address) = ParseSizeT(value);
+      break;
+    case OptionType::kString:
+      *reinterpret_cast<std::string*>(opt_address) = value;
+      break;
+    case OptionType::kDouble:
+      *reinterpret_cast<double*>(opt_address) = ParseDouble(value);
+      break;
+    case OptionType::kCompactionStyle:
+      *reinterpret_cast<CompactionStyle*>(opt_address) =
+          ParseCompactionStyle(value);
+      break;
+    case OptionType::kCompressionType:
+      return ParseCompressionType(
+          value, reinterpret_cast<CompressionType*>(opt_address));
+    case OptionType::kVectorCompressionType:
+      return ParseVectorCompressionType(
+          value, reinterpret_cast<std::vector<CompressionType>*>(opt_address));
+    case OptionType::kSliceTransform:
+      return ParseSliceTransform(
+          value, reinterpret_cast<std::shared_ptr<const SliceTransform>*>(
+                     opt_address));
+    default:
+      return false;
+  }
+  return true;
+}
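
The offset arithmetic above is the heart of the new table-driven
parser; a sketch of the call pattern, using an entry from the
cf_options_type_info map declared in options_helper.h:

    ColumnFamilyOptions cf;
    const OptionTypeInfo& info = cf_options_type_info.at("write_buffer_size");
    ParseOptionHelper(reinterpret_cast<char*>(&cf) + info.offset,
                      info.type, "67108864");
    // cf.write_buffer_size == 64 << 20
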
+
 }  // anonymous namespace
 
+bool SerializeSingleOptionHelper(const char* opt_address,
+                                 const OptionType opt_type,
+                                 std::string* value) {
+  assert(value);
+  switch (opt_type) {
+    case OptionType::kBoolean:
+      *value = *(reinterpret_cast<const bool*>(opt_address)) ? "true" : "false";
+      break;
+    case OptionType::kInt:
+      *value = ToString(*(reinterpret_cast<const int*>(opt_address)));
+      break;
+    case OptionType::kUInt:
+      *value = ToString(*(reinterpret_cast<const unsigned int*>(opt_address)));
+      break;
+    case OptionType::kUInt32T:
+      *value = ToString(*(reinterpret_cast<const uint32_t*>(opt_address)));
+      break;
+    case OptionType::kUInt64T:
+      *value = ToString(*(reinterpret_cast<const uint64_t*>(opt_address)));
+      break;
+    case OptionType::kSizeT:
+      *value = ToString(*(reinterpret_cast<const size_t*>(opt_address)));
+      break;
+    case OptionType::kDouble:
+      *value = ToString(*(reinterpret_cast<const double*>(opt_address)));
+      break;
+    case OptionType::kString:
+      *value = EscapeOptionString(
+          *(reinterpret_cast<const std::string*>(opt_address)));
+      break;
+    case OptionType::kCompactionStyle:
+      *value = CompactionStyleToString(
+          *(reinterpret_cast<const CompactionStyle*>(opt_address)));
+      break;
+    case OptionType::kCompressionType:
+      return SerializeCompressionType(
+          *(reinterpret_cast<const CompressionType*>(opt_address)), value);
+    case OptionType::kVectorCompressionType:
+      return SerializeVectorCompressionType(
+          *(reinterpret_cast<const std::vector<CompressionType>*>(opt_address)),
+          value);
+      break;
+    case OptionType::kSliceTransform: {
+      const auto* slice_transform_ptr =
+          reinterpret_cast<const std::shared_ptr<const SliceTransform>*>(
+              opt_address);
+      *value = slice_transform_ptr->get() ? slice_transform_ptr->get()->Name()
+                                          : "nullptr";
+      break;
+    }
+    case OptionType::kTableFactory: {
+      const auto* table_factory_ptr =
+          reinterpret_cast<const std::shared_ptr<const TableFactory>*>(
+              opt_address);
+      *value = table_factory_ptr->get() ? table_factory_ptr->get()->Name()
+                                        : "nullptr";
+      break;
+    }
+    case OptionType::kComparator: {
+      // it's a const pointer of const Comparator*
+      const auto* ptr = reinterpret_cast<const Comparator* const*>(opt_address);
+      *value = *ptr ? (*ptr)->Name() : "nullptr";
+      break;
+    }
+    case OptionType::kCompactionFilter: {
+      // it's a const pointer of const CompactionFilter*
+      const auto* ptr =
+          reinterpret_cast<const CompactionFilter* const*>(opt_address);
+      *value = *ptr ? (*ptr)->Name() : "nullptr";
+      break;
+    }
+    case OptionType::kCompactionFilterFactory: {
+      const auto* ptr =
+          reinterpret_cast<const std::shared_ptr<CompactionFilterFactory>*>(
+              opt_address);
+      *value = ptr->get() ? ptr->get()->Name() : "nullptr";
+      break;
+    }
+    case OptionType::kMemTableRepFactory: {
+      const auto* ptr =
+          reinterpret_cast<const std::shared_ptr<MemTableRepFactory>*>(
+              opt_address);
+      *value = ptr->get() ? ptr->get()->Name() : "nullptr";
+      break;
+    }
+    case OptionType::kMergeOperator: {
+      const auto* ptr =
+          reinterpret_cast<const std::shared_ptr<MergeOperator>*>(opt_address);
+      *value = ptr->get() ? ptr->get()->Name() : "nullptr";
+      break;
+    }
+    default:
+      return false;
+  }
+  return true;
+}
+
+
 template<typename OptionsType>
 bool ParseMemtableOptions(const std::string& name, const std::string& value,
                           OptionsType* new_options) {
@@ -191,8 +555,11 @@ bool ParseCompactionOptions(const std::string& name, const std::string& value,
     new_options->disable_auto_compactions = ParseBoolean(name, value);
   } else if (name == "soft_rate_limit") {
     new_options->soft_rate_limit = ParseDouble(value);
+  } else if (name == "hard_pending_compaction_bytes_limit") {
+    new_options->hard_pending_compaction_bytes_limit = ParseUint64(value);
   } else if (name == "hard_rate_limit") {
-    new_options->hard_rate_limit = ParseDouble(value);
+    // Deprecated option; kept so that older option strings that still
+    // contain it can be consumed without error.
   } else if (name == "level0_file_num_compaction_trigger") {
     new_options->level0_file_num_compaction_trigger = ParseInt(value);
   } else if (name == "level0_slowdown_writes_trigger") {
@@ -228,8 +595,6 @@ bool ParseCompactionOptions(const std::string& name, const std::string& value,
         start = end + 1;
       }
     }
-  } else if (name == "max_mem_compaction_level") {
-    new_options->max_mem_compaction_level = ParseInt(value);
   } else if (name == "verify_checksums_in_compaction") {
     new_options->verify_checksums_in_compaction = ParseBoolean(name, value);
   } else {
@@ -274,25 +639,6 @@ Status GetMutableOptionsFromStrings(
   return Status::OK();
 }
 
-namespace {
-
-std::string trim(const std::string& str) {
-  size_t start = 0;
-  size_t end = str.size() - 1;
-  while (isspace(str[start]) != 0 && start <= end) {
-    ++start;
-  }
-  while (isspace(str[end]) != 0 && start <= end) {
-    --end;
-  }
-  if (start <= end) {
-    return str.substr(start, end - start + 1);
-  }
-  return std::string();
-}
-
-}  // anonymous namespace
-
 Status StringToMap(const std::string& opts_str,
                    std::unordered_map<std::string, std::string>* opts_map) {
   assert(opts_map);
@@ -369,12 +715,28 @@ Status StringToMap(const std::string& opts_str,
   return Status::OK();
 }
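
Usage sketch (semicolon-separated key=value pairs, as consumed by the
option parsers below):

    std::unordered_map<std::string, std::string> kv;
    Status s = StringToMap("max_open_files=100;create_if_missing=true", &kv);
    // s.ok(); kv["max_open_files"] == "100"
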
 
-bool ParseColumnFamilyOption(const std::string& name, const std::string& value,
-                             ColumnFamilyOptions* new_options) {
+bool ParseColumnFamilyOption(const std::string& name,
+                             const std::string& org_value,
+                             ColumnFamilyOptions* new_options,
+                             bool input_string_escaped = false) {
+  const std::string& value =
+      input_string_escaped ? UnescapeOptionString(org_value) : org_value;
   try {
-    if (ParseMemtableOptions(name, value, new_options)) {
-    } else if (ParseCompactionOptions(name, value, new_options)) {
-    } else if (ParseMiscOptions(name, value, new_options)) {
+    if (name == "max_bytes_for_level_multiplier_additional") {
+      new_options->max_bytes_for_level_multiplier_additional.clear();
+      size_t start = 0;
+      while (true) {
+        size_t end = value.find(':', start);
+        if (end == std::string::npos) {
+          new_options->max_bytes_for_level_multiplier_additional.push_back(
+              ParseInt(value.substr(start)));
+          break;
+        } else {
+          new_options->max_bytes_for_level_multiplier_additional.push_back(
+              ParseInt(value.substr(start, end - start)));
+          start = end + 1;
+        }
+      }
     } else if (name == "block_based_table_factory") {
       // Nested options
       BlockBasedTableOptions table_opt, base_table_options;
@@ -389,25 +751,6 @@ bool ParseColumnFamilyOption(const std::string& name, const std::string& value,
         return false;
       }
       new_options->table_factory.reset(NewBlockBasedTableFactory(table_opt));
-    } else if (name == "min_write_buffer_number_to_merge") {
-      new_options->min_write_buffer_number_to_merge = ParseInt(value);
-    } else if (name == "compression") {
-      new_options->compression = ParseCompressionType(value);
-    } else if (name == "compression_per_level") {
-      new_options->compression_per_level.clear();
-      size_t start = 0;
-      while (true) {
-        size_t end = value.find(':', start);
-        if (end == std::string::npos) {
-          new_options->compression_per_level.push_back(
-              ParseCompressionType(value.substr(start)));
-          break;
-        } else {
-          new_options->compression_per_level.push_back(
-              ParseCompressionType(value.substr(start, end - start)));
-          start = end + 1;
-        }
-      }
     } else if (name == "compression_opts") {
       size_t start = 0;
       size_t end = value.find(':');
@@ -429,139 +772,137 @@ bool ParseColumnFamilyOption(const std::string& name, const std::string& value,
       }
       new_options->compression_opts.strategy =
           ParseInt(value.substr(start, value.size() - start));
-    } else if (name == "num_levels") {
-      new_options->num_levels = ParseInt(value);
-    } else if (name == "level_compaction_dynamic_level_bytes") {
-      new_options->level_compaction_dynamic_level_bytes =
-          ParseBoolean(name, value);
-    } else if (name == "purge_redundant_kvs_while_flush") {
-      new_options->purge_redundant_kvs_while_flush =
-          ParseBoolean(name, value);
-    } else if (name == "compaction_style") {
-      new_options->compaction_style = ParseCompactionStyle(value);
     } else if (name == "compaction_options_universal") {
       // TODO(ljin): add support
       return false;
     } else if (name == "compaction_options_fifo") {
       new_options->compaction_options_fifo.max_table_files_size =
           ParseUint64(value);
-    } else if (name == "bloom_locality") {
-      new_options->bloom_locality = ParseUint32(value);
-    } else if (name == "min_partial_merge_operands") {
-      new_options->min_partial_merge_operands = ParseUint32(value);
-    } else if (name == "inplace_update_support") {
-      new_options->inplace_update_support = ParseBoolean(name, value);
-    } else if (name == "prefix_extractor") {
-      const std::string kFixedPrefixName = "fixed:";
-      const std::string kCappedPrefixName = "capped:";
-      auto& pe_value = value;
-      if (pe_value.size() > kFixedPrefixName.size() &&
-          pe_value.compare(0, kFixedPrefixName.size(), kFixedPrefixName) == 0) {
-        int prefix_length =
-            ParseInt(trim(value.substr(kFixedPrefixName.size())));
-        new_options->prefix_extractor.reset(
-            NewFixedPrefixTransform(prefix_length));
-      } else if (pe_value.size() > kCappedPrefixName.size() &&
-                 pe_value.compare(0, kCappedPrefixName.size(),
-                                  kCappedPrefixName) == 0) {
-        int prefix_length =
-            ParseInt(trim(pe_value.substr(kCappedPrefixName.size())));
-        new_options->prefix_extractor.reset(
-            NewCappedPrefixTransform(prefix_length));
-      } else {
+    } else {
+      auto iter = cf_options_type_info.find(name);
+      if (iter == cf_options_type_info.end()) {
         return false;
       }
-    } else if (name == "optimize_filters_for_hits") {
-      new_options->optimize_filters_for_hits = ParseBoolean(name, value);
-    } else {
-      return false;
+      const auto& opt_info = iter->second;
+      return ParseOptionHelper(
+          reinterpret_cast<char*>(new_options) + opt_info.offset, opt_info.type,
+          value);
     }
-  }
-  catch (std::exception& e) {
+  } catch (std::exception& e) {
     return false;
   }
   return true;
 }
 
-bool ParseDBOption(const std::string& name, const std::string& value,
-                   DBOptions* new_options) {
+bool SerializeSingleDBOption(std::string* opt_string,
+                             const DBOptions& db_options,
+                             const std::string& name,
+                             const std::string& delimiter) {
+  auto iter = db_options_type_info.find(name);
+  if (iter == db_options_type_info.end()) {
+    return false;
+  }
+  auto& opt_info = iter->second;
+  const char* opt_address =
+      reinterpret_cast<const char*>(&db_options) + opt_info.offset;
+  std::string value;
+  bool result = SerializeSingleOptionHelper(opt_address, opt_info.type, &value);
+  if (result) {
+    *opt_string = name + "=" + value + delimiter;
+  }
+  return result;
+}
+
+Status GetStringFromDBOptions(std::string* opt_string,
+                              const DBOptions& db_options,
+                              const std::string& delimiter) {
+  assert(opt_string);
+  opt_string->clear();
+  for (auto iter = db_options_type_info.begin();
+       iter != db_options_type_info.end(); ++iter) {
+    if (iter->second.verification == OptionVerificationType::kDeprecated) {
+      // If the option is no longer used in rocksdb and marked as deprecated,
+      // we skip it in the serialization.
+      continue;
+    }
+    std::string single_output;
+    bool result = SerializeSingleDBOption(&single_output, db_options,
+                                          iter->first, delimiter);
+    assert(result);
+    if (result) {
+      opt_string->append(single_output);
+    }
+  }
+  return Status::OK();
+}
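
Caller-side sketch (the delimiter is caller-chosen; ";" shown here):

    DBOptions db_opts;
    std::string serialized;
    Status s = GetStringFromDBOptions(&serialized, db_opts, ";");
    // serialized is a "name=value;" list covering every non-deprecated
    // entry in db_options_type_info.
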
+
+bool SerializeSingleColumnFamilyOption(std::string* opt_string,
+                                       const ColumnFamilyOptions& cf_options,
+                                       const std::string& name,
+                                       const std::string& delimiter) {
+  auto iter = cf_options_type_info.find(name);
+  if (iter == cf_options_type_info.end()) {
+    return false;
+  }
+  auto& opt_info = iter->second;
+  const char* opt_address =
+      reinterpret_cast<const char*>(&cf_options) + opt_info.offset;
+  std::string value;
+  bool result = SerializeSingleOptionHelper(opt_address, opt_info.type, &value);
+  if (result) {
+    *opt_string = name + "=" + value + delimiter;
+  }
+  return result;
+}
+
+Status GetStringFromColumnFamilyOptions(std::string* opt_string,
+                                        const ColumnFamilyOptions& cf_options,
+                                        const std::string& delimiter) {
+  assert(opt_string);
+  opt_string->clear();
+  for (auto iter = cf_options_type_info.begin();
+       iter != cf_options_type_info.end(); ++iter) {
+    if (iter->second.verification == OptionVerificationType::kDeprecated) {
+      // If the option is no longer used in rocksdb and marked as deprecated,
+      // we skip it in the serialization.
+      continue;
+    }
+    std::string single_output;
+    bool result = SerializeSingleColumnFamilyOption(&single_output, cf_options,
+                                                    iter->first, delimiter);
+    if (result) {
+      opt_string->append(single_output);
+    } else {
+      return Status::InvalidArgument("failed to serialize %s\n",
+                                     iter->first.c_str());
+    }
+    assert(result);
+  }
+  return Status::OK();
+}
+
+bool ParseDBOption(const std::string& name, const std::string& org_value,
+                   DBOptions* new_options, bool input_string_escaped = false) {
+  const std::string& value =
+      input_string_escaped ? UnescapeOptionString(org_value) : org_value;
   try {
-    if (name == "create_if_missing") {
-      new_options->create_if_missing = ParseBoolean(name, value);
-    } else if (name == "create_missing_column_families") {
-      new_options->create_missing_column_families =
-          ParseBoolean(name, value);
-    } else if (name == "error_if_exists") {
-      new_options->error_if_exists = ParseBoolean(name, value);
-    } else if (name == "paranoid_checks") {
-      new_options->paranoid_checks = ParseBoolean(name, value);
-    } else if (name == "rate_limiter_bytes_per_sec") {
+    if (name == "rate_limiter_bytes_per_sec") {
       new_options->rate_limiter.reset(
           NewGenericRateLimiter(static_cast<int64_t>(ParseUint64(value))));
-    } else if (name == "max_open_files") {
-      new_options->max_open_files = ParseInt(value);
-    } else if (name == "max_total_wal_size") {
-      new_options->max_total_wal_size = ParseUint64(value);
-    } else if (name == "disable_data_sync") {
-      new_options->disableDataSync = ParseBoolean(name, value);
-    } else if (name == "use_fsync") {
-      new_options->use_fsync = ParseBoolean(name, value);
-    } else if (name == "db_paths") {
-      // TODO(ljin): add support
-      return false;
-    } else if (name == "db_log_dir") {
-      new_options->db_log_dir = value;
-    } else if (name == "wal_dir") {
-      new_options->wal_dir = value;
-    } else if (name == "delete_obsolete_files_period_micros") {
-      new_options->delete_obsolete_files_period_micros = ParseUint64(value);
-    } else if (name == "max_background_compactions") {
-      new_options->max_background_compactions = ParseInt(value);
-    } else if (name == "max_background_flushes") {
-      new_options->max_background_flushes = ParseInt(value);
-    } else if (name == "max_log_file_size") {
-      new_options->max_log_file_size = ParseSizeT(value);
-    } else if (name == "log_file_time_to_roll") {
-      new_options->log_file_time_to_roll = ParseSizeT(value);
-    } else if (name == "keep_log_file_num") {
-      new_options->keep_log_file_num = ParseSizeT(value);
-    } else if (name == "max_manifest_file_size") {
-      new_options->max_manifest_file_size = ParseUint64(value);
-    } else if (name == "table_cache_numshardbits") {
-      new_options->table_cache_numshardbits = ParseInt(value);
-    } else if (name == "WAL_ttl_seconds") {
-      new_options->WAL_ttl_seconds = ParseUint64(value);
-    } else if (name == "WAL_size_limit_MB") {
-      new_options->WAL_size_limit_MB = ParseUint64(value);
-    } else if (name == "manifest_preallocation_size") {
-      new_options->manifest_preallocation_size = ParseSizeT(value);
-    } else if (name == "allow_os_buffer") {
-      new_options->allow_os_buffer = ParseBoolean(name, value);
-    } else if (name == "allow_mmap_reads") {
-      new_options->allow_mmap_reads = ParseBoolean(name, value);
-    } else if (name == "allow_mmap_writes") {
-      new_options->allow_mmap_writes = ParseBoolean(name, value);
-    } else if (name == "is_fd_close_on_exec") {
-      new_options->is_fd_close_on_exec = ParseBoolean(name, value);
-    } else if (name == "skip_log_error_on_recovery") {
-      new_options->skip_log_error_on_recovery = ParseBoolean(name, value);
-    } else if (name == "stats_dump_period_sec") {
-      new_options->stats_dump_period_sec = ParseUint32(value);
-    } else if (name == "advise_random_on_open") {
-      new_options->advise_random_on_open = ParseBoolean(name, value);
-    } else if (name == "db_write_buffer_size") {
-      new_options->db_write_buffer_size = ParseUint64(value);
-    } else if (name == "use_adaptive_mutex") {
-      new_options->use_adaptive_mutex = ParseBoolean(name, value);
-    } else if (name == "bytes_per_sync") {
-      new_options->bytes_per_sync = ParseUint64(value);
-    } else if (name == "wal_bytes_per_sync") {
-      new_options->wal_bytes_per_sync = ParseUint64(value);
     } else {
-      return false;
+      auto iter = db_options_type_info.find(name);
+      if (iter == db_options_type_info.end()) {
+        return false;
+      }
+      const auto& opt_info = iter->second;
+      if (opt_info.verification != OptionVerificationType::kByName &&
+          opt_info.verification != OptionVerificationType::kDeprecated) {
+        return ParseOptionHelper(
+            reinterpret_cast<char*>(new_options) + opt_info.offset,
+            opt_info.type, value);
+      }
     }
-  }
-  catch (std::exception& e) {
+  } catch (const std::exception& e) {
     return false;
   }
   return true;
@@ -646,15 +987,64 @@ Status GetBlockBasedTableOptionsFromString(
                                           new_table_options);
 }
 
+Status GetPlainTableOptionsFromMap(
+    const PlainTableOptions& table_options,
+    const std::unordered_map<std::string, std::string>& opts_map,
+    PlainTableOptions* new_table_options) {
+  assert(new_table_options);
+  *new_table_options = table_options;
+
+  for (const auto& o : opts_map) {
+    try {
+      if (o.first == "user_key_len") {
+        new_table_options->user_key_len = ParseUint32(o.second);
+      } else if (o.first == "bloom_bits_per_key") {
+        new_table_options->bloom_bits_per_key = ParseInt(o.second);
+      } else if (o.first == "hash_table_ratio") {
+        new_table_options->hash_table_ratio = ParseDouble(o.second);
+      } else if (o.first == "index_sparseness") {
+        new_table_options->index_sparseness = ParseSizeT(o.second);
+      } else if (o.first == "huge_page_tlb_size") {
+        new_table_options->huge_page_tlb_size = ParseSizeT(o.second);
+      } else if (o.first == "encoding_type") {
+        if (o.second == "kPlain") {
+          new_table_options->encoding_type = kPlain;
+        } else if (o.second == "kPrefix") {
+          new_table_options->encoding_type = kPrefix;
+        } else {
+          throw std::invalid_argument("Unknown encoding_type: " + o.second);
+        }
+      } else if (o.first == "full_scan_mode") {
+        new_table_options->full_scan_mode = ParseBoolean(o.first, o.second);
+      } else if (o.first == "store_index_in_file") {
+        new_table_options->store_index_in_file =
+            ParseBoolean(o.first, o.second);
+      } else {
+        return Status::InvalidArgument("Unrecognized option: " + o.first);
+      }
+    } catch (std::exception& e) {
+      return Status::InvalidArgument("error parsing " + o.first + ":" +
+                                     std::string(e.what()));
+    }
+  }
+  return Status::OK();
+}
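
Usage sketch for the new PlainTableOptions map parser (option values
are illustrative):

    std::unordered_map<std::string, std::string> opts = {
        {"user_key_len", "16"}, {"encoding_type", "kPrefix"}};
    PlainTableOptions base, result;
    Status s = GetPlainTableOptionsFromMap(base, opts, &result);
    // s.ok(); result.user_key_len == 16
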
+
 Status GetColumnFamilyOptionsFromMap(
     const ColumnFamilyOptions& base_options,
     const std::unordered_map<std::string, std::string>& opts_map,
-    ColumnFamilyOptions* new_options) {
+    ColumnFamilyOptions* new_options, bool input_strings_escaped) {
   assert(new_options);
   *new_options = base_options;
   for (const auto& o : opts_map) {
-    if (!ParseColumnFamilyOption(o.first, o.second, new_options)) {
-      return Status::InvalidArgument("Can't parse option " + o.first);
+    if (!ParseColumnFamilyOption(o.first, o.second, new_options,
+                                 input_strings_escaped)) {
+      auto iter = cf_options_type_info.find(o.first);
+      if (iter == cf_options_type_info.end() ||
+          (iter->second.verification != OptionVerificationType::kByName &&
+           iter->second.verification != OptionVerificationType::kDeprecated)) {
+        return Status::InvalidArgument("Can't parse option " + o.first);
+      }
     }
   }
   return Status::OK();
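
Caller-side sketch of the map-based API, including the new
input_strings_escaped flag:

    ColumnFamilyOptions base, out;
    std::unordered_map<std::string, std::string> opts = {
        {"write_buffer_size", "131072"},
        {"compression", "kSnappyCompression"}};
    Status s = GetColumnFamilyOptionsFromMap(
        base, opts, &out, false /* input_strings_escaped */);
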
@@ -675,11 +1065,13 @@ Status GetColumnFamilyOptionsFromString(
 Status GetDBOptionsFromMap(
     const DBOptions& base_options,
     const std::unordered_map<std::string, std::string>& opts_map,
-    DBOptions* new_options) {
+    DBOptions* new_options, bool input_strings_escaped) {
   assert(new_options);
   *new_options = base_options;
   for (const auto& o : opts_map) {
-    if (!ParseDBOption(o.first, o.second, new_options)) {
+    if (!ParseDBOption(o.first, o.second, new_options, input_strings_escaped)) {
+      // Note that options with kDeprecated verification will pass
+      // ParseDBOption and so will not reach the statement below.
       return Status::InvalidArgument("Can't parse option " + o.first);
     }
   }
@@ -718,5 +1110,5 @@ Status GetOptionsFromString(const Options& base_options,
   return Status::OK();
 }
 
-#endif  // ROCKSDB_LITE
+#endif  // !ROCKSDB_LITE
 }  // namespace rocksdb
diff --git a/src/rocksdb/util/options_helper.h b/src/rocksdb/util/options_helper.h
index 02c7881..d72a375 100644
--- a/src/rocksdb/util/options_helper.h
+++ b/src/rocksdb/util/options_helper.h
@@ -7,14 +7,400 @@
 
 #include <string>
 #include <stdexcept>
-#include "util/mutable_cf_options.h"
+#include "rocksdb/options.h"
 #include "rocksdb/status.h"
+#include "util/mutable_cf_options.h"
 
+#ifndef ROCKSDB_LITE
 namespace rocksdb {
 
+// Returns true if the input char "c" is considered as a special character
+// that will be escaped when EscapeOptionString() is called.
+//
+// @param c the input char
+// @return true if the input char "c" is considered as a special character.
+// @see EscapeOptionString
+bool isSpecialChar(const char c);
+
+// If the input char is an escaped char, it will return its associated
+// raw char.  Otherwise, the function will simply return the original
+// input char.
+char UnescapeChar(const char c);
+
+// If the input char is a control char, it will return its associated
+// escaped char.  Otherwise, the function will simply return the
+// original input char.
+char EscapeChar(const char c);
+
+// Converts a raw string to an escaped string.  Escaped characters are
+// defined via the isSpecialChar() function.  When a char in the input
+// string "raw_string" is classified as a special character, it will be
+// prefixed by '\' in the output.
+//
+// Its inverse function is UnescapeOptionString().
+// @param raw_string the input string
+// @return the '\' escaped string of the input "raw_string"
+// @see isSpecialChar, UnescapeOptionString
+std::string EscapeOptionString(const std::string& raw_string);
+
+// The inverse function of EscapeOptionString.  It converts
+// an '\' escaped string back to a raw string.
+//
+// @param escaped_string the input '\' escaped string
+// @return the raw string of the input "escaped_string"
+std::string UnescapeOptionString(const std::string& escaped_string);
+
 Status GetMutableOptionsFromStrings(
     const MutableCFOptions& base_options,
     const std::unordered_map<std::string, std::string>& options_map,
     MutableCFOptions* new_options);
 
+enum class OptionType {
+  kBoolean,
+  kInt,
+  kUInt,
+  kUInt32T,
+  kUInt64T,
+  kSizeT,
+  kString,
+  kDouble,
+  kCompactionStyle,
+  kSliceTransform,
+  kCompressionType,
+  kVectorCompressionType,
+  kTableFactory,
+  kComparator,
+  kCompactionFilter,
+  kCompactionFilterFactory,
+  kMergeOperator,
+  kMemTableRepFactory,
+  kUnknown
+};
+
+enum class OptionVerificationType {
+  kNormal,
+  kByName,     // The option is pointer typed so we can only verify
+               // based on its name.
+  kDeprecated  // The option is no longer used in rocksdb. The RocksDB
+               // OptionsParser will still accept this option if it
+               // happens to exist in an Options file.  However, the
+               // parser will not include it in serialization and
+               // verification processes.
+};
+
+// A struct for storing constant option information such as option name,
+// option type, and offset.
+struct OptionTypeInfo {
+  int offset;
+  OptionType type;
+  OptionVerificationType verification;
+};
+
+// A helper function that converts "opt_address" to a std::string
+// based on the specified OptionType.
+bool SerializeSingleOptionHelper(const char* opt_address,
+                                 const OptionType opt_type, std::string* value);
+
+static std::unordered_map<std::string, OptionTypeInfo> db_options_type_info = {
+    /*
+     // not yet supported
+      AccessHint access_hint_on_compaction_start;
+      Env* env;
+      InfoLogLevel info_log_level;
+      WALRecoveryMode wal_recovery_mode;
+      std::shared_ptr<Cache> row_cache;
+      std::shared_ptr<DeleteScheduler> delete_scheduler;
+      std::shared_ptr<Logger> info_log;
+      std::shared_ptr<RateLimiter> rate_limiter;
+      std::shared_ptr<Statistics> statistics;
+      std::vector<DbPath> db_paths;
+      std::vector<std::shared_ptr<EventListener>> listeners;
+     */
+    {"advise_random_on_open",
+     {offsetof(struct DBOptions, advise_random_on_open), OptionType::kBoolean,
+      OptionVerificationType::kNormal}},
+    {"allow_mmap_reads",
+     {offsetof(struct DBOptions, allow_mmap_reads), OptionType::kBoolean,
+      OptionVerificationType::kNormal}},
+    {"allow_fallocate",
+     {offsetof(struct DBOptions, allow_fallocate), OptionType::kBoolean,
+      OptionVerificationType::kNormal}},
+    {"allow_mmap_writes",
+     {offsetof(struct DBOptions, allow_mmap_writes), OptionType::kBoolean,
+      OptionVerificationType::kNormal}},
+    {"allow_os_buffer",
+     {offsetof(struct DBOptions, allow_os_buffer), OptionType::kBoolean,
+      OptionVerificationType::kNormal}},
+    {"create_if_missing",
+     {offsetof(struct DBOptions, create_if_missing), OptionType::kBoolean,
+      OptionVerificationType::kNormal}},
+    {"create_missing_column_families",
+     {offsetof(struct DBOptions, create_missing_column_families),
+      OptionType::kBoolean, OptionVerificationType::kNormal}},
+    {"disableDataSync",
+     {offsetof(struct DBOptions, disableDataSync), OptionType::kBoolean,
+      OptionVerificationType::kNormal}},
+    {"disable_data_sync",  // for compatibility
+     {offsetof(struct DBOptions, disableDataSync), OptionType::kBoolean,
+      OptionVerificationType::kNormal}},
+    {"enable_thread_tracking",
+     {offsetof(struct DBOptions, enable_thread_tracking), OptionType::kBoolean,
+      OptionVerificationType::kNormal}},
+    {"error_if_exists",
+     {offsetof(struct DBOptions, error_if_exists), OptionType::kBoolean,
+      OptionVerificationType::kNormal}},
+    {"is_fd_close_on_exec",
+     {offsetof(struct DBOptions, is_fd_close_on_exec), OptionType::kBoolean,
+      OptionVerificationType::kNormal}},
+    {"paranoid_checks",
+     {offsetof(struct DBOptions, paranoid_checks), OptionType::kBoolean,
+      OptionVerificationType::kNormal}},
+    {"skip_log_error_on_recovery",
+     {offsetof(struct DBOptions, skip_log_error_on_recovery),
+      OptionType::kBoolean, OptionVerificationType::kNormal}},
+    {"skip_stats_update_on_db_open",
+     {offsetof(struct DBOptions, skip_stats_update_on_db_open),
+      OptionType::kBoolean, OptionVerificationType::kNormal}},
+    {"new_table_reader_for_compaction_inputs",
+     {offsetof(struct DBOptions, new_table_reader_for_compaction_inputs),
+      OptionType::kBoolean, OptionVerificationType::kNormal}},
+    {"compaction_readahead_size",
+     {offsetof(struct DBOptions, compaction_readahead_size), OptionType::kSizeT,
+      OptionVerificationType::kNormal}},
+    {"use_adaptive_mutex",
+     {offsetof(struct DBOptions, use_adaptive_mutex), OptionType::kBoolean,
+      OptionVerificationType::kNormal}},
+    {"use_fsync",
+     {offsetof(struct DBOptions, use_fsync), OptionType::kBoolean,
+      OptionVerificationType::kNormal}},
+    {"max_background_compactions",
+     {offsetof(struct DBOptions, max_background_compactions), OptionType::kInt,
+      OptionVerificationType::kNormal}},
+    {"max_background_flushes",
+     {offsetof(struct DBOptions, max_background_flushes), OptionType::kInt,
+      OptionVerificationType::kNormal}},
+    {"max_file_opening_threads",
+     {offsetof(struct DBOptions, max_file_opening_threads), OptionType::kInt,
+      OptionVerificationType::kNormal}},
+    {"max_open_files",
+     {offsetof(struct DBOptions, max_open_files), OptionType::kInt,
+      OptionVerificationType::kNormal}},
+    {"table_cache_numshardbits",
+     {offsetof(struct DBOptions, table_cache_numshardbits), OptionType::kInt,
+      OptionVerificationType::kNormal}},
+    {"db_write_buffer_size",
+     {offsetof(struct DBOptions, db_write_buffer_size), OptionType::kSizeT,
+      OptionVerificationType::kNormal}},
+    {"keep_log_file_num",
+     {offsetof(struct DBOptions, keep_log_file_num), OptionType::kSizeT,
+      OptionVerificationType::kNormal}},
+    {"log_file_time_to_roll",
+     {offsetof(struct DBOptions, log_file_time_to_roll), OptionType::kSizeT,
+      OptionVerificationType::kNormal}},
+    {"manifest_preallocation_size",
+     {offsetof(struct DBOptions, manifest_preallocation_size),
+      OptionType::kSizeT, OptionVerificationType::kNormal}},
+    {"max_log_file_size",
+     {offsetof(struct DBOptions, max_log_file_size), OptionType::kSizeT,
+      OptionVerificationType::kNormal}},
+    {"db_log_dir",
+     {offsetof(struct DBOptions, db_log_dir), OptionType::kString,
+      OptionVerificationType::kNormal}},
+    {"wal_dir",
+     {offsetof(struct DBOptions, wal_dir), OptionType::kString,
+      OptionVerificationType::kNormal}},
+    {"max_subcompactions",
+     {offsetof(struct DBOptions, max_subcompactions), OptionType::kUInt32T,
+      OptionVerificationType::kNormal}},
+    {"WAL_size_limit_MB",
+     {offsetof(struct DBOptions, WAL_size_limit_MB), OptionType::kUInt64T,
+      OptionVerificationType::kNormal}},
+    {"WAL_ttl_seconds",
+     {offsetof(struct DBOptions, WAL_ttl_seconds), OptionType::kUInt64T,
+      OptionVerificationType::kNormal}},
+    {"bytes_per_sync",
+     {offsetof(struct DBOptions, bytes_per_sync), OptionType::kUInt64T,
+      OptionVerificationType::kNormal}},
+    {"delayed_write_rate",
+     {offsetof(struct DBOptions, delayed_write_rate), OptionType::kUInt64T,
+      OptionVerificationType::kNormal}},
+    {"delete_obsolete_files_period_micros",
+     {offsetof(struct DBOptions, delete_obsolete_files_period_micros),
+      OptionType::kUInt64T, OptionVerificationType::kNormal}},
+    {"max_manifest_file_size",
+     {offsetof(struct DBOptions, max_manifest_file_size), OptionType::kUInt64T,
+      OptionVerificationType::kNormal}},
+    {"max_total_wal_size",
+     {offsetof(struct DBOptions, max_total_wal_size), OptionType::kUInt64T,
+      OptionVerificationType::kNormal}},
+    {"wal_bytes_per_sync",
+     {offsetof(struct DBOptions, wal_bytes_per_sync), OptionType::kUInt64T,
+      OptionVerificationType::kNormal}},
+    {"stats_dump_period_sec",
+     {offsetof(struct DBOptions, stats_dump_period_sec), OptionType::kUInt,
+      OptionVerificationType::kNormal}}};
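
With this table, resolving an option name to its storage is a single
hash lookup (sketch):

    auto it = db_options_type_info.find("max_open_files");
    // it->second.offset == offsetof(struct DBOptions, max_open_files)
    // it->second.type   == OptionType::kInt
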
+
+static std::unordered_map<std::string, OptionTypeInfo> cf_options_type_info = {
+    /* not yet supported
+    CompactionOptionsFIFO compaction_options_fifo;
+    CompactionOptionsUniversal compaction_options_universal;
+    CompressionOptions compression_opts;
+    TablePropertiesCollectorFactories table_properties_collector_factories;
+    typedef std::vector<std::shared_ptr<TablePropertiesCollectorFactory>>
+        TablePropertiesCollectorFactories;
+    UpdateStatus (*inplace_callback)(char* existing_value,
+                                     uint32_t* existing_value_size,
+                                     Slice delta_value,
+                                     std::string* merged_value);
+    std::vector<int> max_bytes_for_level_multiplier_additional;
+     */
+    {"compaction_measure_io_stats",
+     {offsetof(struct ColumnFamilyOptions, compaction_measure_io_stats),
+      OptionType::kBoolean, OptionVerificationType::kNormal}},
+    {"disable_auto_compactions",
+     {offsetof(struct ColumnFamilyOptions, disable_auto_compactions),
+      OptionType::kBoolean, OptionVerificationType::kNormal}},
+    {"filter_deletes",
+     {offsetof(struct ColumnFamilyOptions, filter_deletes),
+      OptionType::kBoolean, OptionVerificationType::kNormal}},
+    {"inplace_update_support",
+     {offsetof(struct ColumnFamilyOptions, inplace_update_support),
+      OptionType::kBoolean, OptionVerificationType::kNormal}},
+    {"level_compaction_dynamic_level_bytes",
+     {offsetof(struct ColumnFamilyOptions,
+               level_compaction_dynamic_level_bytes),
+      OptionType::kBoolean, OptionVerificationType::kNormal}},
+    {"optimize_filters_for_hits",
+     {offsetof(struct ColumnFamilyOptions, optimize_filters_for_hits),
+      OptionType::kBoolean, OptionVerificationType::kNormal}},
+    {"paranoid_file_checks",
+     {offsetof(struct ColumnFamilyOptions, paranoid_file_checks),
+      OptionType::kBoolean, OptionVerificationType::kNormal}},
+    {"purge_redundant_kvs_while_flush",
+     {offsetof(struct ColumnFamilyOptions, purge_redundant_kvs_while_flush),
+      OptionType::kBoolean, OptionVerificationType::kNormal}},
+    {"verify_checksums_in_compaction",
+     {offsetof(struct ColumnFamilyOptions, verify_checksums_in_compaction),
+      OptionType::kBoolean, OptionVerificationType::kNormal}},
+    {"hard_pending_compaction_bytes_limit",
+     {offsetof(struct ColumnFamilyOptions, hard_pending_compaction_bytes_limit),
+      OptionType::kUInt64T, OptionVerificationType::kNormal}},
+    {"hard_rate_limit",
+     {offsetof(struct ColumnFamilyOptions, hard_rate_limit),
+      OptionType::kDouble, OptionVerificationType::kDeprecated}},
+    {"soft_rate_limit",
+     {offsetof(struct ColumnFamilyOptions, soft_rate_limit),
+      OptionType::kDouble, OptionVerificationType::kNormal}},
+    {"expanded_compaction_factor",
+     {offsetof(struct ColumnFamilyOptions, expanded_compaction_factor),
+      OptionType::kInt, OptionVerificationType::kNormal}},
+    {"level0_file_num_compaction_trigger",
+     {offsetof(struct ColumnFamilyOptions, level0_file_num_compaction_trigger),
+      OptionType::kInt, OptionVerificationType::kNormal}},
+    {"level0_slowdown_writes_trigger",
+     {offsetof(struct ColumnFamilyOptions, level0_slowdown_writes_trigger),
+      OptionType::kInt, OptionVerificationType::kNormal}},
+    {"level0_stop_writes_trigger",
+     {offsetof(struct ColumnFamilyOptions, level0_stop_writes_trigger),
+      OptionType::kInt, OptionVerificationType::kNormal}},
+    {"max_bytes_for_level_multiplier",
+     {offsetof(struct ColumnFamilyOptions, max_bytes_for_level_multiplier),
+      OptionType::kInt, OptionVerificationType::kNormal}},
+    {"max_grandparent_overlap_factor",
+     {offsetof(struct ColumnFamilyOptions, max_grandparent_overlap_factor),
+      OptionType::kInt, OptionVerificationType::kNormal}},
+    {"max_mem_compaction_level",
+     {offsetof(struct ColumnFamilyOptions, max_mem_compaction_level),
+      OptionType::kInt, OptionVerificationType::kDeprecated}},
+    {"max_write_buffer_number",
+     {offsetof(struct ColumnFamilyOptions, max_write_buffer_number),
+      OptionType::kInt, OptionVerificationType::kNormal}},
+    {"max_write_buffer_number_to_maintain",
+     {offsetof(struct ColumnFamilyOptions, max_write_buffer_number_to_maintain),
+      OptionType::kInt, OptionVerificationType::kNormal}},
+    {"min_write_buffer_number_to_merge",
+     {offsetof(struct ColumnFamilyOptions, min_write_buffer_number_to_merge),
+      OptionType::kInt, OptionVerificationType::kNormal}},
+    {"num_levels",
+     {offsetof(struct ColumnFamilyOptions, num_levels), OptionType::kInt,
+      OptionVerificationType::kNormal}},
+    {"source_compaction_factor",
+     {offsetof(struct ColumnFamilyOptions, source_compaction_factor),
+      OptionType::kInt, OptionVerificationType::kNormal}},
+    {"target_file_size_multiplier",
+     {offsetof(struct ColumnFamilyOptions, target_file_size_multiplier),
+      OptionType::kInt, OptionVerificationType::kNormal}},
+    {"arena_block_size",
+     {offsetof(struct ColumnFamilyOptions, arena_block_size),
+      OptionType::kSizeT, OptionVerificationType::kNormal}},
+    {"inplace_update_num_locks",
+     {offsetof(struct ColumnFamilyOptions, inplace_update_num_locks),
+      OptionType::kSizeT, OptionVerificationType::kNormal}},
+    {"max_successive_merges",
+     {offsetof(struct ColumnFamilyOptions, max_successive_merges),
+      OptionType::kSizeT, OptionVerificationType::kNormal}},
+    {"memtable_prefix_bloom_huge_page_tlb_size",
+     {offsetof(struct ColumnFamilyOptions,
+               memtable_prefix_bloom_huge_page_tlb_size),
+      OptionType::kSizeT, OptionVerificationType::kNormal}},
+    {"write_buffer_size",
+     {offsetof(struct ColumnFamilyOptions, write_buffer_size),
+      OptionType::kSizeT, OptionVerificationType::kNormal}},
+    {"bloom_locality",
+     {offsetof(struct ColumnFamilyOptions, bloom_locality),
+      OptionType::kUInt32T, OptionVerificationType::kNormal}},
+    {"memtable_prefix_bloom_bits",
+     {offsetof(struct ColumnFamilyOptions, memtable_prefix_bloom_bits),
+      OptionType::kUInt32T, OptionVerificationType::kNormal}},
+    {"memtable_prefix_bloom_probes",
+     {offsetof(struct ColumnFamilyOptions, memtable_prefix_bloom_probes),
+      OptionType::kUInt32T, OptionVerificationType::kNormal}},
+    {"min_partial_merge_operands",
+     {offsetof(struct ColumnFamilyOptions, min_partial_merge_operands),
+      OptionType::kUInt32T, OptionVerificationType::kNormal}},
+    {"max_bytes_for_level_base",
+     {offsetof(struct ColumnFamilyOptions, max_bytes_for_level_base),
+      OptionType::kUInt64T, OptionVerificationType::kNormal}},
+    {"max_sequential_skip_in_iterations",
+     {offsetof(struct ColumnFamilyOptions, max_sequential_skip_in_iterations),
+      OptionType::kUInt64T, OptionVerificationType::kNormal}},
+    {"target_file_size_base",
+     {offsetof(struct ColumnFamilyOptions, target_file_size_base),
+      OptionType::kUInt64T, OptionVerificationType::kNormal}},
+    {"rate_limit_delay_max_milliseconds",
+     {offsetof(struct ColumnFamilyOptions, rate_limit_delay_max_milliseconds),
+      OptionType::kUInt, OptionVerificationType::kDeprecated}},
+    {"compression",
+     {offsetof(struct ColumnFamilyOptions, compression),
+      OptionType::kCompressionType, OptionVerificationType::kNormal}},
+    {"compression_per_level",
+     {offsetof(struct ColumnFamilyOptions, compression_per_level),
+      OptionType::kVectorCompressionType, OptionVerificationType::kNormal}},
+    {"comparator",
+     {offsetof(struct ColumnFamilyOptions, comparator), OptionType::kComparator,
+      OptionVerificationType::kByName}},
+    {"prefix_extractor",
+     {offsetof(struct ColumnFamilyOptions, prefix_extractor),
+      OptionType::kSliceTransform, OptionVerificationType::kByName}},
+    {"memtable_factory",
+     {offsetof(struct ColumnFamilyOptions, memtable_factory),
+      OptionType::kMemTableRepFactory, OptionVerificationType::kByName}},
+    {"table_factory",
+     {offsetof(struct ColumnFamilyOptions, table_factory),
+      OptionType::kTableFactory, OptionVerificationType::kByName}},
+    {"compaction_filter",
+     {offsetof(struct ColumnFamilyOptions, compaction_filter),
+      OptionType::kCompactionFilter, OptionVerificationType::kByName}},
+    {"compaction_filter_factory",
+     {offsetof(struct ColumnFamilyOptions, compaction_filter_factory),
+      OptionType::kCompactionFilterFactory, OptionVerificationType::kByName}},
+    {"merge_operator",
+     {offsetof(struct ColumnFamilyOptions, merge_operator),
+      OptionType::kMergeOperator, OptionVerificationType::kByName}},
+    {"compaction_style",
+     {offsetof(struct ColumnFamilyOptions, compaction_style),
+      OptionType::kCompactionStyle, OptionVerificationType::kNormal}}};
+
 }  // namespace rocksdb
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/util/options_parser.cc b/src/rocksdb/util/options_parser.cc
new file mode 100644
index 0000000..d792554
--- /dev/null
+++ b/src/rocksdb/util/options_parser.cc
@@ -0,0 +1,612 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef ROCKSDB_LITE
+
+#include "util/options_parser.h"
+
+#include <cmath>
+#include <map>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "util/options_helper.h"
+#include "util/string_util.h"
+
+#include "port/port.h"
+
+namespace rocksdb {
+
+static const std::string option_file_header =
+    "# This is a RocksDB option file.\n"
+    "#\n"
+    "# For detailed file format spec, please refer to the example file\n"
+    "# in examples/rocksdb_option_file_example.ini\n"
+    "#\n"
+    "\n";
+
+Status PersistRocksDBOptions(const DBOptions& db_opt,
+                             const std::vector<std::string>& cf_names,
+                             const std::vector<ColumnFamilyOptions>& cf_opts,
+                             const std::string& file_name, Env* env) {
+  if (cf_names.size() != cf_opts.size()) {
+    return Status::InvalidArgument(
+        "cf_names.size() and cf_opts.size() must be the same");
+  }
+  std::unique_ptr<WritableFile> writable;
+
+  Status s = env->NewWritableFile(file_name, &writable, EnvOptions());
+  if (!s.ok()) {
+    return s;
+  }
+  std::string options_file_content;
+
+  writable->Append(option_file_header + "[" +
+                   opt_section_titles[kOptionSectionVersion] +
+                   "]\n"
+                   "  rocksdb_version=" +
+                   ToString(ROCKSDB_MAJOR) + "." + ToString(ROCKSDB_MINOR) +
+                   "." + ToString(ROCKSDB_PATCH) + "\n");
+  writable->Append("  options_file_version=" +
+                   ToString(ROCKSDB_OPTION_FILE_MAJOR) + "." +
+                   ToString(ROCKSDB_OPTION_FILE_MINOR) + "\n");
+  writable->Append("\n[" + opt_section_titles[kOptionSectionDBOptions] +
+                   "]\n  ");
+
+  s = GetStringFromDBOptions(&options_file_content, db_opt, "\n  ");
+  if (!s.ok()) {
+    writable->Close();
+    return s;
+  }
+  writable->Append(options_file_content + "\n");
+
+  for (size_t i = 0; i < cf_opts.size(); ++i) {
+    writable->Append("\n[" + opt_section_titles[kOptionSectionCFOptions] +
+                     " \"" + EscapeOptionString(cf_names[i]) + "\"]\n  ");
+    s = GetStringFromColumnFamilyOptions(&options_file_content, cf_opts[i],
+                                         "\n  ");
+    if (!s.ok()) {
+      writable->Close();
+      return s;
+    }
+    writable->Append(options_file_content + "\n");
+  }
+  writable->Flush();
+  writable->Fsync();
+  writable->Close();
+
+  return RocksDBOptionsParser::VerifyRocksDBOptionsFromFile(
+      db_opt, cf_names, cf_opts, file_name, env);
+}
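+
+// A minimal usage sketch of the round trip above (hypothetical file name and
+// values, shown for illustration). Note that PersistRocksDBOptions() already
+// re-parses and verifies the file it wrote before returning OK:
+//
+//   DBOptions db_opt;
+//   std::vector<std::string> cf_names = {"default"};
+//   std::vector<ColumnFamilyOptions> cf_opts = {ColumnFamilyOptions()};
+//   Status s = PersistRocksDBOptions(db_opt, cf_names, cf_opts,
+//                                    "/tmp/rocksdb_options.ini",
+//                                    Env::Default());
+//   assert(s.ok());
+//
+//   RocksDBOptionsParser parser;
+//   s = parser.Parse("/tmp/rocksdb_options.ini", Env::Default());
+//   assert(s.ok());
+//   // parser.db_opt() and parser.GetCFOptions("default") now mirror what
+//   // was persisted.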
+
+RocksDBOptionsParser::RocksDBOptionsParser() { Reset(); }
+
+void RocksDBOptionsParser::Reset() {
+  db_opt_ = DBOptions();
+  db_opt_map_.clear();
+  cf_names_.clear();
+  cf_opts_.clear();
+  cf_opt_maps_.clear();
+  has_version_section_ = false;
+  has_db_options_ = false;
+  has_default_cf_options_ = false;
+  for (int i = 0; i < 3; ++i) {
+    db_version[i] = 0;
+    opt_file_version[i] = 0;
+  }
+}
+
+bool RocksDBOptionsParser::IsSection(const std::string& line) {
+  if (line.size() < 2) {
+    return false;
+  }
+  if (line[0] != '[' || line[line.size() - 1] != ']') {
+    return false;
+  }
+  return true;
+}
+
+Status RocksDBOptionsParser::ParseSection(OptionSection* section,
+                                          std::string* argument,
+                                          const std::string& line,
+                                          const int line_num) {
+  *section = kOptionSectionUnknown;
+  std::string sec_string;
+  // A section is of the form [<SectionName> "<SectionArg>"], where
+  // "<SectionArg>" is optional.
+  size_t arg_start_pos = line.find("\"");
+  size_t arg_end_pos = line.rfind("\"");
+  // The following if-then check tries to identify whether the input
+  // section has the optional section argument.
+  if (arg_start_pos != std::string::npos && arg_start_pos != arg_end_pos) {
+    sec_string = TrimAndRemoveComment(line.substr(1, arg_start_pos - 1), true);
+    *argument = UnescapeOptionString(
+        line.substr(arg_start_pos + 1, arg_end_pos - arg_start_pos - 1));
+  } else {
+    sec_string = TrimAndRemoveComment(line.substr(1, line.size() - 2), true);
+    *argument = "";
+  }
+  for (int i = 0; i < kOptionSectionUnknown; ++i) {
+    if (opt_section_titles[i] == sec_string) {
+      *section = static_cast<OptionSection>(i);
+      return CheckSection(*section, *argument, line_num);
+    }
+  }
+  return Status::InvalidArgument(std::string("Unknown section ") + line);
+}
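+
+// Illustrative inputs for the rules above (hypothetical lines, for clarity):
+//
+//   [Version]              -> section = kOptionSectionVersion,   argument = ""
+//   [CFOptions "default"]  -> section = kOptionSectionCFOptions,
+//                             argument = "default"
+//   [Foo]                  -> Status::InvalidArgument("Unknown section [Foo]")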
+
+Status RocksDBOptionsParser::InvalidArgument(const int line_num,
+                                             const std::string& message) {
+  return Status::InvalidArgument(
+      "[RocksDBOptionsParser Error] ",
+      message + " (at line " + ToString(line_num) + ")");
+}
+
+Status RocksDBOptionsParser::ParseStatement(std::string* name,
+                                            std::string* value,
+                                            const std::string& line,
+                                            const int line_num) {
+  size_t eq_pos = line.find("=");
+  if (eq_pos == std::string::npos) {
+    return InvalidArgument(line_num, "A valid statement must have a '='.");
+  }
+
+  *name = TrimAndRemoveComment(line.substr(0, eq_pos), true);
+  *value =
+      TrimAndRemoveComment(line.substr(eq_pos + 1, line.size() - eq_pos - 1));
+  if (name->empty()) {
+    return InvalidArgument(line_num,
+                           "A valid statement must have a variable name.");
+  }
+  return Status::OK();
+}
+
+namespace {
+bool ReadOneLine(std::istringstream* iss, SequentialFile* seq_file,
+                 std::string* output, bool* has_data, Status* result) {
+  const int kBufferSize = 4096;
+  char buffer[kBufferSize + 1];
+  Slice input_slice;
+
+  std::string line;
+  bool has_complete_line = false;
+  while (!has_complete_line) {
+    if (std::getline(*iss, line)) {
+      has_complete_line = !iss->eof();
+    } else {
+      has_complete_line = false;
+    }
+    if (!has_complete_line) {
+      // If we are not sure whether we have a complete line,
+      // read more data from the file.
+      if (*has_data) {
+        *result = seq_file->Read(kBufferSize, &input_slice, buffer);
+      }
+      if (input_slice.size() == 0) {
+        // meaning we have read all the data
+        *has_data = false;
+        break;
+      } else {
+        iss->str(line + input_slice.ToString());
+        // reset the internal state of iss so that we can keep reading it.
+        iss->clear();
+        *has_data = (input_slice.size() == kBufferSize);
+        continue;
+      }
+    }
+  }
+  *output = line;
+  return *has_data || has_complete_line;
+}
+}  // namespace
+
+Status RocksDBOptionsParser::Parse(const std::string& file_name, Env* env) {
+  Reset();
+
+  std::unique_ptr<SequentialFile> seq_file;
+  Status s = env->NewSequentialFile(file_name, &seq_file, EnvOptions());
+  if (!s.ok()) {
+    return s;
+  }
+
+  OptionSection section = kOptionSectionUnknown;
+  std::string argument;
+  std::unordered_map<std::string, std::string> opt_map;
+  std::istringstream iss;
+  std::string line;
+  bool has_data = true;
+  // We only support single-line statements.
+  for (int line_num = 1;
+       ReadOneLine(&iss, seq_file.get(), &line, &has_data, &s); ++line_num) {
+    if (!s.ok()) {
+      return s;
+    }
+    line = TrimAndRemoveComment(line);
+    if (line.empty()) {
+      continue;
+    }
+    if (IsSection(line)) {
+      s = EndSection(section, argument, opt_map);
+      opt_map.clear();
+      if (!s.ok()) {
+        return s;
+      }
+      s = ParseSection(&section, &argument, line, line_num);
+      if (!s.ok()) {
+        return s;
+      }
+    } else {
+      std::string name;
+      std::string value;
+      s = ParseStatement(&name, &value, line, line_num);
+      if (!s.ok()) {
+        return s;
+      }
+      opt_map.insert({name, value});
+    }
+  }
+
+  s = EndSection(section, argument, opt_map);
+  opt_map.clear();
+  if (!s.ok()) {
+    return s;
+  }
+  return ValidityCheck();
+}
+
+Status RocksDBOptionsParser::CheckSection(const OptionSection section,
+                                          const std::string& section_arg,
+                                          const int line_num) {
+  if (section == kOptionSectionDBOptions) {
+    if (has_db_options_) {
+      return InvalidArgument(
+          line_num,
+          "More than one DBOption section found in the option config file");
+    }
+    has_db_options_ = true;
+  } else if (section == kOptionSectionCFOptions) {
+    bool is_default_cf = (section_arg == kDefaultColumnFamilyName);
+    if (cf_opts_.size() == 0 && !is_default_cf) {
+      return InvalidArgument(
+          line_num,
+          "Default column family must be the first CFOptions section "
+          "in the option config file");
+    } else if (cf_opts_.size() != 0 && is_default_cf) {
+      return InvalidArgument(
+          line_num,
+          "Default column family must be the first CFOptions section "
+          "in the option config file");
+    } else if (GetCFOptions(section_arg) != nullptr) {
+      return InvalidArgument(
+          line_num,
+          "Two identical column families found in option config file");
+    }
+    has_default_cf_options_ |= is_default_cf;
+  } else if (section == kOptionSectionVersion) {
+    if (has_version_section_) {
+      return InvalidArgument(
+          line_num,
+          "More than one Version section found in the option config file.");
+    }
+    has_version_section_ = true;
+  }
+  return Status::OK();
+}
+
+Status RocksDBOptionsParser::ParseVersionNumber(const std::string& ver_name,
+                                                const std::string& ver_string,
+                                                const int max_count,
+                                                int* version) {
+  int version_index = 0;
+  int current_number = 0;
+  int current_digit_count = 0;
+  bool has_dot = false;
+  for (int i = 0; i < max_count; ++i) {
+    version[i] = 0;
+  }
+  const int kBufferSize = 200;
+  char buffer[kBufferSize];
+  for (size_t i = 0; i < ver_string.size(); ++i) {
+    if (ver_string[i] == '.') {
+      if (version_index >= max_count - 1) {
+        snprintf(buffer, sizeof(buffer) - 1,
+                 "A valid %s can only contains at most %d dots.",
+                 ver_name.c_str(), max_count - 1);
+        return Status::InvalidArgument(buffer);
+      }
+      if (current_digit_count == 0) {
+        snprintf(buffer, sizeof(buffer) - 1,
+                 "A valid %s must have at least one digit before each dot.",
+                 ver_name.c_str());
+        return Status::InvalidArgument(buffer);
+      }
+      version[version_index++] = current_number;
+      current_number = 0;
+      current_digit_count = 0;
+      has_dot = true;
+    } else if (isdigit(ver_string[i])) {
+      current_number = current_number * 10 + (ver_string[i] - '0');
+      current_digit_count++;
+    } else {
+      snprintf(buffer, sizeof(buffer) - 1,
+               "A valid %s can only contains dots and numbers.",
+               ver_name.c_str());
+      return Status::InvalidArgument(buffer);
+    }
+  }
+  version[version_index] = current_number;
+  if (has_dot && current_digit_count == 0) {
+    snprintf(buffer, sizeof(buffer) - 1,
+             "A valid %s must have at least one digit after each dot.",
+             ver_name.c_str());
+    return Status::InvalidArgument(buffer);
+  }
+  return Status::OK();
+}
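+
+// Worked examples of the checks above (hypothetical calls, with int v[3]):
+//
+//   ParseVersionNumber("rocksdb_version", "3.14.0", 3, v)
+//       -> OK, v = {3, 14, 0}
+//   ParseVersionNumber("rocksdb_version", "3..0", 3, v)
+//       -> InvalidArgument (a digit is required before each dot)
+//   ParseVersionNumber("rocksdb_version", "3.14.", 3, v)
+//       -> InvalidArgument (a digit is required after each dot)
+//   ParseVersionNumber("options_file_version", "1.2.3", 2, v)
+//       -> InvalidArgument (at most one dot is allowed when max_count is 2)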
+
+Status RocksDBOptionsParser::EndSection(
+    const OptionSection section, const std::string& section_arg,
+    const std::unordered_map<std::string, std::string>& opt_map) {
+  Status s;
+  if (section == kOptionSectionDBOptions) {
+    s = GetDBOptionsFromMap(DBOptions(), opt_map, &db_opt_, true);
+    if (!s.ok()) {
+      return s;
+    }
+    db_opt_map_ = opt_map;
+  } else if (section == kOptionSectionCFOptions) {
+    // ParseSection should already have ensured this condition,
+    // so we only assert on it here.
+    assert(GetCFOptions(section_arg) == nullptr);
+    cf_names_.emplace_back(section_arg);
+    cf_opts_.emplace_back();
+    s = GetColumnFamilyOptionsFromMap(ColumnFamilyOptions(), opt_map,
+                                      &cf_opts_.back(), true);
+    if (!s.ok()) {
+      return s;
+    }
+    // keep the parsed string.
+    cf_opt_maps_.emplace_back(opt_map);
+  } else if (section == kOptionSectionVersion) {
+    for (const auto& pair : opt_map) {
+      if (pair.first == "rocksdb_version") {
+        s = ParseVersionNumber(pair.first, pair.second, 3, db_version);
+        if (!s.ok()) {
+          return s;
+        }
+      } else if (pair.first == "options_file_version") {
+        s = ParseVersionNumber(pair.first, pair.second, 2, opt_file_version);
+        if (!s.ok()) {
+          return s;
+        }
+        if (opt_file_version[0] < 1) {
+          return Status::InvalidArgument(
+              "A valid options_file_version must be at least 1.");
+        }
+      }
+    }
+  }
+  return Status::OK();
+}
+
+Status RocksDBOptionsParser::ValidityCheck() {
+  if (!has_db_options_) {
+    return Status::Corruption(
+        "A RocksDB Option file must have a single DBOptions section");
+  }
+  if (!has_default_cf_options_) {
+    return Status::Corruption(
+        "A RocksDB Option file must have a single CFOptions:default section");
+  }
+
+  return Status::OK();
+}
+
+std::string RocksDBOptionsParser::TrimAndRemoveComment(const std::string& line,
+                                                       bool trim_only) {
+  size_t start = 0;
+  size_t end = line.size();
+
+  // we only support "#" style comment
+  if (!trim_only) {
+    size_t search_pos = 0;
+    while (search_pos < line.size()) {
+      size_t comment_pos = line.find('#', search_pos);
+      if (comment_pos == std::string::npos) {
+        break;
+      }
+      if (comment_pos == 0 || line[comment_pos - 1] != '\\') {
+        end = comment_pos;
+        break;
+      }
+      search_pos = comment_pos + 1;
+    }
+  }
+
+  while (start < end && isspace(line[start]) != 0) {
+    ++start;
+  }
+
+  // start < end implies end > 0.
+  while (start < end && isspace(line[end - 1]) != 0) {
+    --end;
+  }
+
+  if (start < end) {
+    return line.substr(start, end - start);
+  }
+
+  return "";
+}
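+
+// A minimal sketch of the helper above (inputs illustrative):
+//
+//   TrimAndRemoveComment("  max_open_files=12345  # inline comment")
+//       -> "max_open_files=12345"
+//   TrimAndRemoveComment("  [DBOptions]  # kept ", /* trim_only = */ true)
+//       -> "[DBOptions]  # kept"   (comment kept, outer whitespace trimmed)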
+
+namespace {
+bool AreEqualDoubles(const double a, const double b) {
+  return (fabs(a - b) < 0.00001);
+}
+
+bool AreEqualOptions(
+    const char* opt1, const char* opt2, const OptionTypeInfo& type_info,
+    const std::string& opt_name,
+    const std::unordered_map<std::string, std::string>* opt_map) {
+  const char* offset1 = opt1 + type_info.offset;
+  const char* offset2 = opt2 + type_info.offset;
+  switch (type_info.type) {
+    case OptionType::kBoolean:
+      return (*reinterpret_cast<const bool*>(offset1) ==
+              *reinterpret_cast<const bool*>(offset2));
+    case OptionType::kInt:
+      return (*reinterpret_cast<const int*>(offset1) ==
+              *reinterpret_cast<const int*>(offset2));
+    case OptionType::kUInt:
+      return (*reinterpret_cast<const unsigned int*>(offset1) ==
+              *reinterpret_cast<const unsigned int*>(offset2));
+    case OptionType::kUInt32T:
+      return (*reinterpret_cast<const uint32_t*>(offset1) ==
+              *reinterpret_cast<const uint32_t*>(offset2));
+    case OptionType::kUInt64T:
+      return (*reinterpret_cast<const uint64_t*>(offset1) ==
+              *reinterpret_cast<const uint64_t*>(offset2));
+    case OptionType::kSizeT:
+      return (*reinterpret_cast<const size_t*>(offset1) ==
+              *reinterpret_cast<const size_t*>(offset2));
+    case OptionType::kString:
+      return (*reinterpret_cast<const std::string*>(offset1) ==
+              *reinterpret_cast<const std::string*>(offset2));
+    case OptionType::kDouble:
+      return AreEqualDoubles(*reinterpret_cast<const double*>(offset1),
+                             *reinterpret_cast<const double*>(offset2));
+    case OptionType::kCompactionStyle:
+      return (*reinterpret_cast<const CompactionStyle*>(offset1) ==
+              *reinterpret_cast<const CompactionStyle*>(offset2));
+    case OptionType::kCompressionType:
+      return (*reinterpret_cast<const CompressionType*>(offset1) ==
+              *reinterpret_cast<const CompressionType*>(offset2));
+    case OptionType::kVectorCompressionType: {
+      const auto* vec1 =
+          reinterpret_cast<const std::vector<CompressionType>*>(offset1);
+      const auto* vec2 =
+          reinterpret_cast<const std::vector<CompressionType>*>(offset2);
+      return (*vec1 == *vec2);
+    }
+    default:
+      if (type_info.verification == OptionVerificationType::kByName) {
+        std::string value1;
+        bool result =
+            SerializeSingleOptionHelper(offset1, type_info.type, &value1);
+        if (result == false) {
+          return false;
+        }
+        if (opt_map == nullptr) {
+          return true;
+        }
+        auto iter = opt_map->find(opt_name);
+        if (iter == opt_map->end()) {
+          return true;
+        } else {
+          return (value1 == iter->second);
+        }
+      }
+      return false;
+  }
+}
+
+}  // namespace
+
+Status RocksDBOptionsParser::VerifyRocksDBOptionsFromFile(
+    const DBOptions& db_opt, const std::vector<std::string>& cf_names,
+    const std::vector<ColumnFamilyOptions>& cf_opts,
+    const std::string& file_name, Env* env) {
+  RocksDBOptionsParser parser;
+  std::unique_ptr<SequentialFile> seq_file;
+  Status s = parser.Parse(file_name, env);
+  if (!s.ok()) {
+    return s;
+  }
+
+  // Verify DBOptions
+  s = VerifyDBOptions(db_opt, *parser.db_opt(), parser.db_opt_map());
+  if (!s.ok()) {
+    return s;
+  }
+
+  // Verify ColumnFamily Name
+  if (cf_names.size() != parser.cf_names()->size()) {
+    return Status::Corruption(
+        "[RocksDBOptionParser Error] The persisted options does not have"
+        "the same number of column family names as the db instance.");
+  }
+  for (size_t i = 0; i < cf_names.size(); ++i) {
+    if (cf_names[i] != parser.cf_names()->at(i)) {
+      return Status::Corruption(
+          "[RocksDBOptionParser Error] The persisted options and the db"
+          "instance does not have the same name for column family ",
+          ToString(i));
+    }
+  }
+
+  // Verify Column Family Options
+  if (cf_opts.size() != parser.cf_opts()->size()) {
+    return Status::Corruption(
+        "[RocksDBOptionParser Error] The persisted options does not have"
+        "the same number of column families as the db instance.");
+  }
+  for (size_t i = 0; i < cf_opts.size(); ++i) {
+    s = VerifyCFOptions(cf_opts[i], parser.cf_opts()->at(i),
+                        &(parser.cf_opt_maps()->at(i)));
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
+  return Status::OK();
+}
+
+Status RocksDBOptionsParser::VerifyDBOptions(
+    const DBOptions& base_opt, const DBOptions& new_opt,
+    const std::unordered_map<std::string, std::string>* opt_map) {
+  for (auto pair : db_options_type_info) {
+    if (pair.second.verification == OptionVerificationType::kDeprecated) {
+      // We skip checking deprecated variables as they might
+      // contain random values since they might not be initialized
+      continue;
+    }
+    if (!AreEqualOptions(reinterpret_cast<const char*>(&base_opt),
+                         reinterpret_cast<const char*>(&new_opt), pair.second,
+                         pair.first, nullptr)) {
+      return Status::Corruption(
+          "[RocksDBOptionsParser]: "
+          "failed the verification on DBOptions::",
+          pair.first);
+    }
+  }
+  return Status::OK();
+}
+
+Status RocksDBOptionsParser::VerifyCFOptions(
+    const ColumnFamilyOptions& base_opt, const ColumnFamilyOptions& new_opt,
+    const std::unordered_map<std::string, std::string>* new_opt_map) {
+  for (auto& pair : cf_options_type_info) {
+    if (pair.second.verification == OptionVerificationType::kDeprecated) {
+      // We skip checking deprecated variables as they might
+      // contain random values since they might not be initialized
+      continue;
+    }
+    if (!AreEqualOptions(reinterpret_cast<const char*>(&base_opt),
+                         reinterpret_cast<const char*>(&new_opt), pair.second,
+                         pair.first, new_opt_map)) {
+      return Status::Corruption(
+          "[RocksDBOptionsParser]: "
+          "failed the verification on ColumnFamilyOptions::",
+          pair.first);
+    }
+  }
+  return Status::OK();
+}
+}  // namespace rocksdb
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/util/options_parser.h b/src/rocksdb/util/options_parser.h
new file mode 100644
index 0000000..f308fcb
--- /dev/null
+++ b/src/rocksdb/util/options_parser.h
@@ -0,0 +1,124 @@
+// Copyright (c) 2014, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+
+namespace rocksdb {
+
+#ifndef ROCKSDB_LITE
+
+#define ROCKSDB_OPTION_FILE_MAJOR 1
+#define ROCKSDB_OPTION_FILE_MINOR 0
+
+enum OptionSection : char {
+  kOptionSectionVersion = 0,
+  kOptionSectionDBOptions,
+  kOptionSectionCFOptions,
+  kOptionSectionUnknown
+};
+
+static const std::string opt_section_titles[] = {"Version", "DBOptions",
+                                                 "CFOptions", "Unknown"};
+
+Status PersistRocksDBOptions(const DBOptions& db_opt,
+                             const std::vector<std::string>& cf_names,
+                             const std::vector<ColumnFamilyOptions>& cf_opts,
+                             const std::string& file_name, Env* env);
+
+class RocksDBOptionsParser {
+ public:
+  explicit RocksDBOptionsParser();
+  ~RocksDBOptionsParser() {}
+  void Reset();
+
+  Status Parse(const std::string& file_name, Env* env);
+  static std::string TrimAndRemoveComment(const std::string& line,
+                                          const bool trim_only = false);
+
+  const DBOptions* db_opt() const { return &db_opt_; }
+  const std::unordered_map<std::string, std::string>* db_opt_map() const {
+    return &db_opt_map_;
+  }
+  const std::vector<ColumnFamilyOptions>* cf_opts() const { return &cf_opts_; }
+  const std::vector<std::string>* cf_names() const { return &cf_names_; }
+  const std::vector<std::unordered_map<std::string, std::string>>* cf_opt_maps()
+      const {
+    return &cf_opt_maps_;
+  }
+
+  const ColumnFamilyOptions* GetCFOptions(const std::string& name) const {
+    assert(cf_names_.size() == cf_opts_.size());
+    for (size_t i = 0; i < cf_names_.size(); ++i) {
+      if (cf_names_[i] == name) {
+        return &cf_opts_[i];
+      }
+    }
+    return nullptr;
+  }
+  size_t NumColumnFamilies() { return cf_opts_.size(); }
+
+  static Status VerifyRocksDBOptionsFromFile(
+      const DBOptions& db_opt, const std::vector<std::string>& cf_names,
+      const std::vector<ColumnFamilyOptions>& cf_opts,
+      const std::string& file_name, Env* env);
+
+  static Status VerifyDBOptions(
+      const DBOptions& base_opt, const DBOptions& new_opt,
+      const std::unordered_map<std::string, std::string>* new_opt_map =
+          nullptr);
+
+  static Status VerifyCFOptions(
+      const ColumnFamilyOptions& base_opt, const ColumnFamilyOptions& new_opt,
+      const std::unordered_map<std::string, std::string>* new_opt_map =
+          nullptr);
+
+  static Status ExtraParserCheck(const RocksDBOptionsParser& input_parser);
+
+ protected:
+  bool IsSection(const std::string& line);
+  Status ParseSection(OptionSection* section, std::string* argument,
+                      const std::string& line, const int line_num);
+
+  Status CheckSection(const OptionSection section,
+                      const std::string& section_arg, const int line_num);
+
+  Status ParseStatement(std::string* name, std::string* value,
+                        const std::string& line, const int line_num);
+
+  Status EndSection(
+      const OptionSection section, const std::string& section_arg,
+      const std::unordered_map<std::string, std::string>& opt_map);
+
+  Status ValidityCheck();
+
+  Status InvalidArgument(const int line_num, const std::string& message);
+
+  Status ParseVersionNumber(const std::string& ver_name,
+                            const std::string& ver_string, const int max_count,
+                            int* version);
+
+ private:
+  DBOptions db_opt_;
+  std::unordered_map<std::string, std::string> db_opt_map_;
+  std::vector<std::string> cf_names_;
+  std::vector<ColumnFamilyOptions> cf_opts_;
+  std::vector<std::unordered_map<std::string, std::string>> cf_opt_maps_;
+  bool has_version_section_;
+  bool has_db_options_;
+  bool has_default_cf_options_;
+  int db_version[3];
+  int opt_file_version[3];
+};
+
+#endif  // !ROCKSDB_LITE
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/options_test.cc b/src/rocksdb/util/options_test.cc
index 6a3b2d4..ee13540 100644
--- a/src/rocksdb/util/options_test.cc
+++ b/src/rocksdb/util/options_test.cc
@@ -11,17 +11,23 @@
 #define __STDC_FORMAT_MACROS
 #endif
 
+#include <cctype>
 #include <unordered_map>
 #include <inttypes.h>
 
 #include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/merge_operator.h"
 #include "rocksdb/options.h"
 #include "rocksdb/table.h"
-#include "rocksdb/utilities/convenience.h"
 #include "rocksdb/utilities/leveldb_options.h"
 #include "table/block_based_table_factory.h"
+#include "util/options_helper.h"
+#include "util/options_parser.h"
 #include "util/random.h"
 #include "util/testharness.h"
+#include "util/testutil.h"
 
 #ifndef GFLAGS
 bool FLAGS_enable_print = false;
@@ -33,8 +39,6 @@ DEFINE_bool(enable_print, false, "Print options generated to console.");
 
 namespace rocksdb {
 
-class OptionsTest : public testing::Test {};
-
 class StderrLogger : public Logger {
  public:
   using Logger::Logv;
@@ -51,12 +55,12 @@ Options PrintAndGetOptions(size_t total_write_buffer_limit,
   StderrLogger logger;
 
   if (FLAGS_enable_print) {
-    printf(
-        "---- total_write_buffer_limit: %zu "
-        "read_amplification_threshold: %d write_amplification_threshold: %d "
-        "target_db_size %" PRIu64 " ----\n",
-        total_write_buffer_limit, read_amplification_threshold,
-        write_amplification_threshold, target_db_size);
+    printf("---- total_write_buffer_limit: %" ROCKSDB_PRIszt
+           " "
+           "read_amplification_threshold: %d write_amplification_threshold: %d "
+           "target_db_size %" PRIu64 " ----\n",
+           total_write_buffer_limit, read_amplification_threshold,
+           write_amplification_threshold, target_db_size);
   }
 
   Options options =
@@ -69,6 +73,165 @@ Options PrintAndGetOptions(size_t total_write_buffer_limit,
   return options;
 }
 
+class StringEnv : public EnvWrapper {
+ public:
+  class SeqStringSource : public SequentialFile {
+   public:
+    explicit SeqStringSource(const std::string& data)
+        : data_(data), offset_(0) {}
+    ~SeqStringSource() {}
+    Status Read(size_t n, Slice* result, char* scratch) override {
+      std::string output;
+      if (offset_ < data_.size()) {
+        n = std::min(data_.size() - offset_, n);
+        memcpy(scratch, data_.data() + offset_, n);
+        offset_ += n;
+        *result = Slice(scratch, n);
+      } else {
+        return Status::InvalidArgument(
+            "Attemp to read when it already reached eof.");
+      }
+      return Status::OK();
+    }
+    Status Skip(uint64_t n) override {
+      if (offset_ >= data_.size()) {
+        return Status::InvalidArgument(
+            "Attemp to read when it already reached eof.");
+      }
+      // TODO(yhchiang): Currently doesn't handle the overflow case.
+      offset_ += n;
+      return Status::OK();
+    }
+
+   private:
+    std::string data_;
+    size_t offset_;
+  };
+
+  class StringSink : public WritableFile {
+   public:
+    explicit StringSink(std::string* contents)
+        : WritableFile(), contents_(contents) {}
+    virtual Status Truncate(uint64_t size) override {
+      contents_->resize(size);
+      return Status::OK();
+    }
+    virtual Status Close() override { return Status::OK(); }
+    virtual Status Flush() override { return Status::OK(); }
+    virtual Status Sync() override { return Status::OK(); }
+    virtual Status Append(const Slice& slice) override {
+      contents_->append(slice.data(), slice.size());
+      return Status::OK();
+    }
+
+   private:
+    std::string* contents_;
+  };
+
+  explicit StringEnv(Env* t) : EnvWrapper(t) {}
+  virtual ~StringEnv() {}
+
+  const std::string& GetContent(const std::string& f) { return files_[f]; }
+
+  const Status WriteToNewFile(const std::string& file_name,
+                              const std::string& content) {
+    unique_ptr<WritableFile> r;
+    auto s = NewWritableFile(file_name, &r, EnvOptions());
+    if (!s.ok()) {
+      return s;
+    }
+    r->Append(content);
+    r->Flush();
+    r->Close();
+    assert(files_[file_name] == content);
+    return Status::OK();
+  }
+
+  // The following methods either operate on the in-memory files_ map or
+  // return Status::NotSupported().
+  Status NewSequentialFile(const std::string& f, unique_ptr<SequentialFile>* r,
+                           const EnvOptions& options) override {
+    auto iter = files_.find(f);
+    if (iter == files_.end()) {
+      return Status::NotFound("The specified file does not exist", f);
+    }
+    r->reset(new SeqStringSource(iter->second));
+    return Status::OK();
+  }
+  Status NewRandomAccessFile(const std::string& f,
+                             unique_ptr<RandomAccessFile>* r,
+                             const EnvOptions& options) override {
+    return Status::NotSupported();
+  }
+  Status NewWritableFile(const std::string& f, unique_ptr<WritableFile>* r,
+                         const EnvOptions& options) override {
+    auto iter = files_.find(f);
+    if (iter != files_.end()) {
+      return Status::IOError("The specified file already exists", f);
+    }
+    r->reset(new StringSink(&files_[f]));
+    return Status::OK();
+  }
+  virtual Status NewDirectory(const std::string& name,
+                              unique_ptr<Directory>* result) override {
+    return Status::NotSupported();
+  }
+  Status FileExists(const std::string& f) override {
+    if (files_.find(f) == files_.end()) {
+      return Status::NotFound();
+    }
+    return Status::OK();
+  }
+  Status GetChildren(const std::string& dir,
+                     std::vector<std::string>* r) override {
+    return Status::NotSupported();
+  }
+  Status DeleteFile(const std::string& f) override {
+    files_.erase(f);
+    return Status::OK();
+  }
+  Status CreateDir(const std::string& d) override {
+    return Status::NotSupported();
+  }
+  Status CreateDirIfMissing(const std::string& d) override {
+    return Status::NotSupported();
+  }
+  Status DeleteDir(const std::string& d) override {
+    return Status::NotSupported();
+  }
+  Status GetFileSize(const std::string& f, uint64_t* s) override {
+    auto iter = files_.find(f);
+    if (iter == files_.end()) {
+      return Status::NotFound("The specified file does not exist:", f);
+    }
+    *s = iter->second.size();
+    return Status::OK();
+  }
+
+  Status GetFileModificationTime(const std::string& fname,
+                                 uint64_t* file_mtime) override {
+    return Status::NotSupported();
+  }
+
+  Status RenameFile(const std::string& s, const std::string& t) override {
+    return Status::NotSupported();
+  }
+
+  Status LinkFile(const std::string& s, const std::string& t) override {
+    return Status::NotSupported();
+  }
+
+  Status LockFile(const std::string& f, FileLock** l) override {
+    return Status::NotSupported();
+  }
+
+  Status UnlockFile(FileLock* l) override { return Status::NotSupported(); }
+
+ protected:
+  std::unordered_map<std::string, std::string> files_;
+};
+
+class OptionsTest : public testing::Test {};
+
 TEST_F(OptionsTest, LooseCondition) {
   Options options;
   PrintAndGetOptions(static_cast<size_t>(10) * 1024 * 1024 * 1024, 100, 100);
@@ -96,6 +259,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) {
       {"write_buffer_size", "1"},
       {"max_write_buffer_number", "2"},
       {"min_write_buffer_number_to_merge", "3"},
+      {"max_write_buffer_number_to_maintain", "99"},
       {"compression", "kSnappyCompression"},
       {"compression_per_level",
        "kNoCompression:"
@@ -103,13 +267,13 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) {
        "kZlibCompression:"
        "kBZip2Compression:"
        "kLZ4Compression:"
-       "kLZ4HCCompression"},
+       "kLZ4HCCompression:"
+       "kZSTDNotFinalCompression"},
       {"compression_opts", "4:5:6"},
       {"num_levels", "7"},
       {"level0_file_num_compaction_trigger", "8"},
       {"level0_slowdown_writes_trigger", "9"},
       {"level0_stop_writes_trigger", "10"},
-      {"max_mem_compaction_level", "11"},
       {"target_file_size_base", "12"},
       {"target_file_size_multiplier", "13"},
       {"max_bytes_for_level_base", "14"},
@@ -121,15 +285,16 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) {
       {"max_grandparent_overlap_factor", "21"},
       {"soft_rate_limit", "1.1"},
       {"hard_rate_limit", "2.1"},
+      {"hard_pending_compaction_bytes_limit", "211"},
       {"arena_block_size", "22"},
       {"disable_auto_compactions", "true"},
-      {"purge_redundant_kvs_while_flush", "1"},
       {"compaction_style", "kCompactionStyleLevel"},
       {"verify_checksums_in_compaction", "false"},
       {"compaction_options_fifo", "23"},
       {"filter_deletes", "0"},
       {"max_sequential_skip_in_iterations", "24"},
       {"inplace_update_support", "true"},
+      {"compaction_measure_io_stats", "true"},
       {"inplace_update_num_locks", "25"},
       {"memtable_prefix_bloom_bits", "26"},
       {"memtable_prefix_bloom_probes", "27"},
@@ -142,38 +307,39 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) {
   };
 
   std::unordered_map<std::string, std::string> db_options_map = {
-    {"create_if_missing", "false"},
-    {"create_missing_column_families", "true"},
-    {"error_if_exists", "false"},
-    {"paranoid_checks", "true"},
-    {"max_open_files", "32"},
-    {"max_total_wal_size", "33"},
-    {"disable_data_sync", "false"},
-    {"use_fsync", "true"},
-    {"db_log_dir", "/db_log_dir"},
-    {"wal_dir", "/wal_dir"},
-    {"delete_obsolete_files_period_micros", "34"},
-    {"max_background_compactions", "35"},
-    {"max_background_flushes", "36"},
-    {"max_log_file_size", "37"},
-    {"log_file_time_to_roll", "38"},
-    {"keep_log_file_num", "39"},
-    {"max_manifest_file_size", "40"},
-    {"table_cache_numshardbits", "41"},
-    {"WAL_ttl_seconds", "43"},
-    {"WAL_size_limit_MB", "44"},
-    {"manifest_preallocation_size", "45"},
-    {"allow_os_buffer", "false"},
-    {"allow_mmap_reads", "true"},
-    {"allow_mmap_writes", "false"},
-    {"is_fd_close_on_exec", "true"},
-    {"skip_log_error_on_recovery", "false"},
-    {"stats_dump_period_sec", "46"},
-    {"advise_random_on_open", "true"},
-    {"use_adaptive_mutex", "false"},
-    {"bytes_per_sync", "47"},
-    {"wal_bytes_per_sync", "48"},
-  };
+      {"create_if_missing", "false"},
+      {"create_missing_column_families", "true"},
+      {"error_if_exists", "false"},
+      {"paranoid_checks", "true"},
+      {"max_open_files", "32"},
+      {"max_total_wal_size", "33"},
+      {"disable_data_sync", "false"},
+      {"use_fsync", "true"},
+      {"db_log_dir", "/db_log_dir"},
+      {"wal_dir", "/wal_dir"},
+      {"delete_obsolete_files_period_micros", "34"},
+      {"max_background_compactions", "35"},
+      {"max_background_flushes", "36"},
+      {"max_log_file_size", "37"},
+      {"log_file_time_to_roll", "38"},
+      {"keep_log_file_num", "39"},
+      {"max_manifest_file_size", "40"},
+      {"table_cache_numshardbits", "41"},
+      {"WAL_ttl_seconds", "43"},
+      {"WAL_size_limit_MB", "44"},
+      {"manifest_preallocation_size", "45"},
+      {"allow_os_buffer", "false"},
+      {"allow_mmap_reads", "true"},
+      {"allow_mmap_writes", "false"},
+      {"is_fd_close_on_exec", "true"},
+      {"skip_log_error_on_recovery", "false"},
+      {"stats_dump_period_sec", "46"},
+      {"advise_random_on_open", "true"},
+      {"use_adaptive_mutex", "false"},
+      {"new_table_reader_for_compaction_inputs", "true"},
+      {"compaction_readahead_size", "100"},
+      {"bytes_per_sync", "47"},
+      {"wal_bytes_per_sync", "48"}, };
 
   ColumnFamilyOptions base_cf_opt;
   ColumnFamilyOptions new_cf_opt;
@@ -182,14 +348,16 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) {
   ASSERT_EQ(new_cf_opt.write_buffer_size, 1U);
   ASSERT_EQ(new_cf_opt.max_write_buffer_number, 2);
   ASSERT_EQ(new_cf_opt.min_write_buffer_number_to_merge, 3);
+  ASSERT_EQ(new_cf_opt.max_write_buffer_number_to_maintain, 99);
   ASSERT_EQ(new_cf_opt.compression, kSnappyCompression);
-  ASSERT_EQ(new_cf_opt.compression_per_level.size(), 6U);
+  ASSERT_EQ(new_cf_opt.compression_per_level.size(), 7U);
   ASSERT_EQ(new_cf_opt.compression_per_level[0], kNoCompression);
   ASSERT_EQ(new_cf_opt.compression_per_level[1], kSnappyCompression);
   ASSERT_EQ(new_cf_opt.compression_per_level[2], kZlibCompression);
   ASSERT_EQ(new_cf_opt.compression_per_level[3], kBZip2Compression);
   ASSERT_EQ(new_cf_opt.compression_per_level[4], kLZ4Compression);
   ASSERT_EQ(new_cf_opt.compression_per_level[5], kLZ4HCCompression);
+  ASSERT_EQ(new_cf_opt.compression_per_level[6], kZSTDNotFinalCompression);
   ASSERT_EQ(new_cf_opt.compression_opts.window_bits, 4);
   ASSERT_EQ(new_cf_opt.compression_opts.level, 5);
   ASSERT_EQ(new_cf_opt.compression_opts.strategy, 6);
@@ -197,7 +365,6 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) {
   ASSERT_EQ(new_cf_opt.level0_file_num_compaction_trigger, 8);
   ASSERT_EQ(new_cf_opt.level0_slowdown_writes_trigger, 9);
   ASSERT_EQ(new_cf_opt.level0_stop_writes_trigger, 10);
-  ASSERT_EQ(new_cf_opt.max_mem_compaction_level, 11);
   ASSERT_EQ(new_cf_opt.target_file_size_base, static_cast<uint64_t>(12));
   ASSERT_EQ(new_cf_opt.target_file_size_multiplier, 13);
   ASSERT_EQ(new_cf_opt.max_bytes_for_level_base, 14U);
@@ -211,10 +378,9 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) {
   ASSERT_EQ(new_cf_opt.source_compaction_factor, 20);
   ASSERT_EQ(new_cf_opt.max_grandparent_overlap_factor, 21);
   ASSERT_EQ(new_cf_opt.soft_rate_limit, 1.1);
-  ASSERT_EQ(new_cf_opt.hard_rate_limit, 2.1);
+  ASSERT_EQ(new_cf_opt.hard_pending_compaction_bytes_limit, 211);
   ASSERT_EQ(new_cf_opt.arena_block_size, 22U);
   ASSERT_EQ(new_cf_opt.disable_auto_compactions, true);
-  ASSERT_EQ(new_cf_opt.purge_redundant_kvs_while_flush, true);
   ASSERT_EQ(new_cf_opt.compaction_style, kCompactionStyleLevel);
   ASSERT_EQ(new_cf_opt.verify_checksums_in_compaction, false);
   ASSERT_EQ(new_cf_opt.compaction_options_fifo.max_table_files_size,
@@ -278,6 +444,8 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) {
   ASSERT_EQ(new_db_opt.stats_dump_period_sec, 46U);
   ASSERT_EQ(new_db_opt.advise_random_on_open, true);
   ASSERT_EQ(new_db_opt.use_adaptive_mutex, false);
+  ASSERT_EQ(new_db_opt.new_table_reader_for_compaction_inputs, true);
+  ASSERT_EQ(new_db_opt.compaction_readahead_size, 100);
   ASSERT_EQ(new_db_opt.bytes_per_sync, static_cast<uint64_t>(47));
   ASSERT_EQ(new_db_opt.wal_bytes_per_sync, static_cast<uint64_t>(48));
 }
@@ -325,26 +493,33 @@ TEST_F(OptionsTest, GetColumnFamilyOptionsFromStringTest) {
   // Missing option name
   ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt,
              "write_buffer_size=13; =100;", &new_cf_opt));
+
+  const int64_t kilo = 1024UL;
+  const int64_t mega = 1024 * kilo;
+  const int64_t giga = 1024 * mega;
+  const int64_t tera = 1024 * giga;
+
   // Units (k)
   ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt,
             "memtable_prefix_bloom_bits=14k;max_write_buffer_number=-15K",
             &new_cf_opt));
-  ASSERT_EQ(new_cf_opt.memtable_prefix_bloom_bits, 14UL*1024UL);
-  ASSERT_EQ(new_cf_opt.max_write_buffer_number, -15*1024);
+  ASSERT_EQ(new_cf_opt.memtable_prefix_bloom_bits, 14UL * kilo);
+  ASSERT_EQ(new_cf_opt.max_write_buffer_number, -15 * kilo);
   // Units (m)
   ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt,
             "max_write_buffer_number=16m;inplace_update_num_locks=17M",
             &new_cf_opt));
-  ASSERT_EQ(new_cf_opt.max_write_buffer_number, 16*1024*1024);
-  ASSERT_EQ(new_cf_opt.inplace_update_num_locks, 17*1024UL*1024UL);
+  ASSERT_EQ(new_cf_opt.max_write_buffer_number, 16 * mega);
+  ASSERT_EQ(new_cf_opt.inplace_update_num_locks, 17 * mega);
   // Units (g)
   ASSERT_OK(GetColumnFamilyOptionsFromString(
       base_cf_opt,
       "write_buffer_size=18g;prefix_extractor=capped:8;"
       "arena_block_size=19G",
       &new_cf_opt));
-  ASSERT_EQ(new_cf_opt.write_buffer_size, 18*1024UL*1024UL*1024UL);
-  ASSERT_EQ(new_cf_opt.arena_block_size, 19*1024UL*1024UL*1024UL);
+
+  ASSERT_EQ(new_cf_opt.write_buffer_size, 18 * giga);
+  ASSERT_EQ(new_cf_opt.arena_block_size, 19 * giga);
   ASSERT_TRUE(new_cf_opt.prefix_extractor.get() != nullptr);
   std::string prefix_name(new_cf_opt.prefix_extractor->Name());
   ASSERT_EQ(prefix_name, "rocksdb.CappedPrefix.8");
@@ -352,8 +527,8 @@ TEST_F(OptionsTest, GetColumnFamilyOptionsFromStringTest) {
   // Units (t)
   ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt,
             "write_buffer_size=20t;arena_block_size=21T", &new_cf_opt));
-  ASSERT_EQ(new_cf_opt.write_buffer_size, 20*1024UL*1024UL*1024UL*1024UL);
-  ASSERT_EQ(new_cf_opt.arena_block_size, 21*1024UL*1024UL*1024UL*1024UL);
+  ASSERT_EQ(new_cf_opt.write_buffer_size, 20 * tera);
+  ASSERT_EQ(new_cf_opt.arena_block_size, 21 * tera);
 
   // Nested block based table options
   // Empty
@@ -498,6 +673,304 @@ TEST_F(OptionsTest, GetOptionsFromStringTest) {
   ASSERT_EQ(new_options.max_open_files, 1);
   ASSERT_TRUE(new_options.rate_limiter.get() != nullptr);
 }
+
+namespace {
+void RandomInitDBOptions(DBOptions* db_opt, Random* rnd) {
+  // boolean options
+  db_opt->advise_random_on_open = rnd->Uniform(2);
+  db_opt->allow_mmap_reads = rnd->Uniform(2);
+  db_opt->allow_mmap_writes = rnd->Uniform(2);
+  db_opt->allow_os_buffer = rnd->Uniform(2);
+  db_opt->create_if_missing = rnd->Uniform(2);
+  db_opt->create_missing_column_families = rnd->Uniform(2);
+  db_opt->disableDataSync = rnd->Uniform(2);
+  db_opt->enable_thread_tracking = rnd->Uniform(2);
+  db_opt->error_if_exists = rnd->Uniform(2);
+  db_opt->is_fd_close_on_exec = rnd->Uniform(2);
+  db_opt->paranoid_checks = rnd->Uniform(2);
+  db_opt->skip_log_error_on_recovery = rnd->Uniform(2);
+  db_opt->skip_stats_update_on_db_open = rnd->Uniform(2);
+  db_opt->use_adaptive_mutex = rnd->Uniform(2);
+  db_opt->use_fsync = rnd->Uniform(2);
+
+  // int options
+  db_opt->max_background_compactions = rnd->Uniform(100);
+  db_opt->max_background_flushes = rnd->Uniform(100);
+  db_opt->max_file_opening_threads = rnd->Uniform(100);
+  db_opt->max_open_files = rnd->Uniform(100);
+  db_opt->table_cache_numshardbits = rnd->Uniform(100);
+
+  // size_t options
+  db_opt->db_write_buffer_size = rnd->Uniform(10000);
+  db_opt->keep_log_file_num = rnd->Uniform(10000);
+  db_opt->log_file_time_to_roll = rnd->Uniform(10000);
+  db_opt->manifest_preallocation_size = rnd->Uniform(10000);
+  db_opt->max_log_file_size = rnd->Uniform(10000);
+
+  // std::string options
+  db_opt->db_log_dir = "path/to/db_log_dir";
+  db_opt->wal_dir = "path/to/wal_dir";
+
+  // uint32_t options
+  db_opt->max_subcompactions = rnd->Uniform(100000);
+
+  // uint64_t options
+  static const uint64_t uint_max = static_cast<uint64_t>(UINT_MAX);
+  db_opt->WAL_size_limit_MB = uint_max + rnd->Uniform(100000);
+  db_opt->WAL_ttl_seconds = uint_max + rnd->Uniform(100000);
+  db_opt->bytes_per_sync = uint_max + rnd->Uniform(100000);
+  db_opt->delayed_write_rate = uint_max + rnd->Uniform(100000);
+  db_opt->delete_obsolete_files_period_micros = uint_max + rnd->Uniform(100000);
+  db_opt->max_manifest_file_size = uint_max + rnd->Uniform(100000);
+  db_opt->max_total_wal_size = uint_max + rnd->Uniform(100000);
+  db_opt->wal_bytes_per_sync = uint_max + rnd->Uniform(100000);
+
+  // unsigned int options
+  db_opt->stats_dump_period_sec = rnd->Uniform(100000);
+}
+
+}  // namespace
+
+TEST_F(OptionsTest, DBOptionsSerialization) {
+  Options base_options, new_options;
+  Random rnd(301);
+
+  // Phase 1: Make big change in base_options
+  RandomInitDBOptions(&base_options, &rnd);
+
+  // Phase 2: obtain a string from base_option
+  std::string base_options_file_content;
+  ASSERT_OK(GetStringFromDBOptions(&base_options_file_content, base_options));
+
+  // Phase 3: Set new_options from the derived string and expect
+  //          new_options == base_options
+  ASSERT_OK(GetDBOptionsFromString(DBOptions(), base_options_file_content,
+                                   &new_options));
+  ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(base_options, new_options));
+}
+
+namespace {
+CompressionType RandomCompressionType(Random* rnd) {
+  return static_cast<CompressionType>(rnd->Uniform(6));
+}
+
+void RandomCompressionTypeVector(const size_t count,
+                                 std::vector<CompressionType>* types,
+                                 Random* rnd) {
+  types->clear();
+  for (size_t i = 0; i < count; ++i) {
+    types->emplace_back(RandomCompressionType(rnd));
+  }
+}
+
+const SliceTransform* RandomSliceTransform(Random* rnd, int pre_defined = -1) {
+  int random_num = pre_defined >= 0 ? pre_defined : rnd->Uniform(4);
+  switch (random_num) {
+    case 0:
+      return NewFixedPrefixTransform(rnd->Uniform(20) + 1);
+    case 1:
+      return NewCappedPrefixTransform(rnd->Uniform(20) + 1);
+    case 2:
+      return NewNoopTransform();
+    default:
+      return nullptr;
+  }
+}
+
+TableFactory* RandomTableFactory(Random* rnd, int pre_defined = -1) {
+  int random_num = pre_defined >= 0 ? pre_defined : rnd->Uniform(3);
+  switch (random_num) {
+    case 0:
+      return NewPlainTableFactory();
+    case 1:
+      return NewCuckooTableFactory();
+    default:
+      return NewBlockBasedTableFactory();
+  }
+}
+
+std::string RandomString(Random* rnd, const size_t len) {
+  std::stringstream ss;
+  for (size_t i = 0; i < len; ++i) {
+    ss << static_cast<char>(rnd->Uniform(26) + 'a');
+  }
+  return ss.str();
+}
+
+class ChanglingMergeOperator : public MergeOperator {
+ public:
+  explicit ChanglingMergeOperator(const std::string& name)
+      : name_(name + "MergeOperator") {}
+  ~ChanglingMergeOperator() {}
+
+  void SetName(const std::string& name) { name_ = name; }
+
+  virtual bool FullMerge(const Slice& key, const Slice* existing_value,
+                         const std::deque<std::string>& operand_list,
+                         std::string* new_value,
+                         Logger* logger) const override {
+    return false;
+  }
+  virtual bool PartialMergeMulti(const Slice& key,
+                                 const std::deque<Slice>& operand_list,
+                                 std::string* new_value,
+                                 Logger* logger) const override {
+    return false;
+  }
+  virtual const char* Name() const override { return name_.c_str(); }
+
+ protected:
+  std::string name_;
+};
+
+MergeOperator* RandomMergeOperator(Random* rnd) {
+  return new ChanglingMergeOperator(RandomString(rnd, 10));
+}
+
+class ChanglingCompactionFilter : public CompactionFilter {
+ public:
+  explicit ChanglingCompactionFilter(const std::string& name)
+      : name_(name + "CompactionFilter") {}
+  ~ChanglingCompactionFilter() {}
+
+  void SetName(const std::string& name) { name_ = name; }
+
+  bool Filter(int level, const Slice& key, const Slice& existing_value,
+              std::string* new_value, bool* value_changed) const override {
+    return false;
+  }
+
+  const char* Name() const override { return name_.c_str(); }
+
+ private:
+  std::string name_;
+};
+
+CompactionFilter* RandomCompactionFilter(Random* rnd) {
+  return new ChanglingCompactionFilter(RandomString(rnd, 10));
+}
+
+class ChanglingCompactionFilterFactory : public CompactionFilterFactory {
+ public:
+  explicit ChanglingCompactionFilterFactory(const std::string& name)
+      : name_(name + "CompactionFilterFactory") {}
+  ~ChanglingCompactionFilterFactory() {}
+
+  void SetName(const std::string& name) { name_ = name; }
+
+  std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& context) override {
+    return std::unique_ptr<CompactionFilter>();
+  }
+
+  // Returns a name that identifies this compaction filter factory.
+  const char* Name() const override { return name_.c_str(); }
+
+ protected:
+  std::string name_;
+};
+
+CompactionFilterFactory* RandomCompactionFilterFactory(Random* rnd) {
+  return new ChanglingCompactionFilterFactory(RandomString(rnd, 10));
+}
+
+// Note that the caller is responsible for releasing non-null
+// cf_opt->compaction_filter.
+void RandomInitCFOptions(ColumnFamilyOptions* cf_opt, Random* rnd) {
+  cf_opt->compaction_style = (CompactionStyle)(rnd->Uniform(4));
+
+  // boolean options
+  cf_opt->compaction_measure_io_stats = rnd->Uniform(2);
+  cf_opt->disable_auto_compactions = rnd->Uniform(2);
+  cf_opt->filter_deletes = rnd->Uniform(2);
+  cf_opt->inplace_update_support = rnd->Uniform(2);
+  cf_opt->level_compaction_dynamic_level_bytes = rnd->Uniform(2);
+  cf_opt->optimize_filters_for_hits = rnd->Uniform(2);
+  cf_opt->paranoid_file_checks = rnd->Uniform(2);
+  cf_opt->purge_redundant_kvs_while_flush = rnd->Uniform(2);
+  cf_opt->verify_checksums_in_compaction = rnd->Uniform(2);
+
+  // double options
+  cf_opt->hard_rate_limit = static_cast<double>(rnd->Uniform(10000)) / 13;
+  cf_opt->soft_rate_limit = static_cast<double>(rnd->Uniform(10000)) / 13;
+
+  // int options
+  cf_opt->expanded_compaction_factor = rnd->Uniform(100);
+  cf_opt->level0_file_num_compaction_trigger = rnd->Uniform(100);
+  cf_opt->level0_slowdown_writes_trigger = rnd->Uniform(100);
+  cf_opt->level0_stop_writes_trigger = rnd->Uniform(100);
+  cf_opt->max_bytes_for_level_multiplier = rnd->Uniform(100);
+  cf_opt->max_grandparent_overlap_factor = rnd->Uniform(100);
+  cf_opt->max_mem_compaction_level = rnd->Uniform(100);
+  cf_opt->max_write_buffer_number = rnd->Uniform(100);
+  cf_opt->max_write_buffer_number_to_maintain = rnd->Uniform(100);
+  cf_opt->min_write_buffer_number_to_merge = rnd->Uniform(100);
+  cf_opt->num_levels = rnd->Uniform(100);
+  cf_opt->source_compaction_factor = rnd->Uniform(100);
+  cf_opt->target_file_size_multiplier = rnd->Uniform(100);
+
+  // size_t options
+  cf_opt->arena_block_size = rnd->Uniform(10000);
+  cf_opt->inplace_update_num_locks = rnd->Uniform(10000);
+  cf_opt->max_successive_merges = rnd->Uniform(10000);
+  cf_opt->memtable_prefix_bloom_huge_page_tlb_size = rnd->Uniform(10000);
+  cf_opt->write_buffer_size = rnd->Uniform(10000);
+
+  // uint32_t options
+  cf_opt->bloom_locality = rnd->Uniform(10000);
+  cf_opt->memtable_prefix_bloom_bits = rnd->Uniform(10000);
+  cf_opt->memtable_prefix_bloom_probes = rnd->Uniform(10000);
+  cf_opt->min_partial_merge_operands = rnd->Uniform(10000);
+  cf_opt->max_bytes_for_level_base = rnd->Uniform(10000);
+
+  // uint64_t options
+  static const uint64_t uint_max = static_cast<uint64_t>(UINT_MAX);
+  cf_opt->max_sequential_skip_in_iterations = uint_max + rnd->Uniform(10000);
+  cf_opt->target_file_size_base = uint_max + rnd->Uniform(10000);
+
+  // unsigned int options
+  cf_opt->rate_limit_delay_max_milliseconds = rnd->Uniform(10000);
+
+  // pointer typed options
+  cf_opt->prefix_extractor.reset(RandomSliceTransform(rnd));
+  cf_opt->table_factory.reset(RandomTableFactory(rnd));
+  cf_opt->merge_operator.reset(RandomMergeOperator(rnd));
+  if (cf_opt->compaction_filter) {
+    delete cf_opt->compaction_filter;
+  }
+  cf_opt->compaction_filter = RandomCompactionFilter(rnd);
+  cf_opt->compaction_filter_factory.reset(RandomCompactionFilterFactory(rnd));
+
+  // custom typed options
+  cf_opt->compression = RandomCompressionType(rnd);
+  RandomCompressionTypeVector(cf_opt->num_levels,
+                              &cf_opt->compression_per_level, rnd);
+}
+
+}  // namespace
+
+TEST_F(OptionsTest, ColumnFamilyOptionsSerialization) {
+  ColumnFamilyOptions base_opt, new_opt;
+  Random rnd(302);
+  // Phase 1: randomly assign base_opt
+  // custom type options
+  RandomInitCFOptions(&base_opt, &rnd);
+
+  // Phase 2: obtain a string from base_opt
+  std::string base_options_file_content;
+  ASSERT_OK(
+      GetStringFromColumnFamilyOptions(&base_options_file_content, base_opt));
+
+  // Phase 3: Set new_opt from the derived string and expect
+  //          new_opt == base_opt
+  ASSERT_OK(GetColumnFamilyOptionsFromString(
+      ColumnFamilyOptions(), base_options_file_content, &new_opt));
+  ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(base_opt, new_opt));
+  if (base_opt.compaction_filter) {
+    delete base_opt.compaction_filter;
+  }
+}
+
 #endif  // !ROCKSDB_LITE
 
 
@@ -699,6 +1172,476 @@ TEST_F(OptionsTest, ConvertOptionsTest) {
   ASSERT_EQ(table_opt.filter_policy.get(), leveldb_opt.filter_policy);
 }
 
+#ifndef ROCKSDB_LITE
+class OptionsParserTest : public testing::Test {
+ public:
+  OptionsParserTest() { env_.reset(new StringEnv(Env::Default())); }
+
+ protected:
+  std::unique_ptr<StringEnv> env_;
+};
+
+TEST_F(OptionsParserTest, Comment) {
+  DBOptions db_opt;
+  db_opt.max_open_files = 12345;
+  db_opt.max_background_flushes = 301;
+  db_opt.max_total_wal_size = 1024;
+  ColumnFamilyOptions cf_opt;
+
+  std::string options_file_content =
+      "# This is a testing option string.\n"
+      "# Currently we only support \"#\" styled comment.\n"
+      "\n"
+      "[Version]\n"
+      "  rocksdb_version=3.14.0\n"
+      "  options_file_version=1\n"
+      "[ DBOptions ]\n"
+      "  # note that we don't support space around \"=\"\n"
+      "  max_open_files=12345;\n"
+      "  max_background_flushes=301  # comment after a statement is fine\n"
+      "  # max_background_flushes=1000  # this line would be ignored\n"
+      "  # max_background_compactions=2000 # so does this one\n"
+      "  max_total_wal_size=1024  # keep_log_file_num=1000\n"
+      "[CFOptions   \"default\"]  # column family must be specified\n"
+      "                     # in the correct order\n"
+      "  # if a section is blank, we will use the default\n";
+
+  const std::string kTestFileName = "test-rocksdb-options.ini";
+  env_->WriteToNewFile(kTestFileName, options_file_content);
+  RocksDBOptionsParser parser;
+  ASSERT_OK(parser.Parse(kTestFileName, env_.get()));
+
+  ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(*parser.db_opt(), db_opt));
+  ASSERT_EQ(parser.NumColumnFamilies(), 1U);
+  ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(
+      *parser.GetCFOptions("default"), cf_opt));
+}
+
+TEST_F(OptionsParserTest, ExtraSpace) {
+  std::string options_file_content =
+      "# This is a testing option string.\n"
+      "# Currently we only support \"#\" styled comment.\n"
+      "\n"
+      "[      Version   ]\n"
+      "  rocksdb_version     = 3.14.0      \n"
+      "  options_file_version=1   # some comment\n"
+      "[DBOptions  ]  # some comment\n"
+      "max_open_files=12345   \n"
+      "    max_background_flushes   =    301   \n"
+      " max_total_wal_size     =   1024  # keep_log_file_num=1000\n"
+      "        [CFOptions      \"default\"     ]\n"
+      "  # if a section is blank, we will use the default\n";
+
+  const std::string kTestFileName = "test-rocksdb-options.ini";
+  env_->WriteToNewFile(kTestFileName, options_file_content);
+  RocksDBOptionsParser parser;
+  ASSERT_OK(parser.Parse(kTestFileName, env_.get()));
+}
+
+TEST_F(OptionsParserTest, MissingDBOptions) {
+  std::string options_file_content =
+      "# This is a testing option string.\n"
+      "# Currently we only support \"#\" styled comment.\n"
+      "\n"
+      "[Version]\n"
+      "  rocksdb_version=3.14.0\n"
+      "  options_file_version=1\n"
+      "[CFOptions \"default\"]\n"
+      "  # if a section is blank, we will use the default\n";
+
+  const std::string kTestFileName = "test-rocksdb-options.ini";
+  env_->WriteToNewFile(kTestFileName, options_file_content);
+  RocksDBOptionsParser parser;
+  ASSERT_NOK(parser.Parse(kTestFileName, env_.get()));
+}
+
+TEST_F(OptionsParserTest, DoubleDBOptions) {
+  DBOptions db_opt;
+  db_opt.max_open_files = 12345;
+  db_opt.max_background_flushes = 301;
+  db_opt.max_total_wal_size = 1024;
+  ColumnFamilyOptions cf_opt;
+
+  std::string options_file_content =
+      "# This is a testing option string.\n"
+      "# Currently we only support \"#\" styled comment.\n"
+      "\n"
+      "[Version]\n"
+      "  rocksdb_version=3.14.0\n"
+      "  options_file_version=1\n"
+      "[DBOptions]\n"
+      "  max_open_files=12345\n"
+      "  max_background_flushes=301\n"
+      "  max_total_wal_size=1024  # keep_log_file_num=1000\n"
+      "[DBOptions]\n"
+      "[CFOptions \"default\"]\n"
+      "  # if a section is blank, we will use the default\n";
+
+  const std::string kTestFileName = "test-rocksdb-options.ini";
+  env_->WriteToNewFile(kTestFileName, options_file_content);
+  RocksDBOptionsParser parser;
+  ASSERT_NOK(parser.Parse(kTestFileName, env_.get()));
+}
+
+TEST_F(OptionsParserTest, NoDefaultCFOptions) {
+  DBOptions db_opt;
+  db_opt.max_open_files = 12345;
+  db_opt.max_background_flushes = 301;
+  db_opt.max_total_wal_size = 1024;
+  ColumnFamilyOptions cf_opt;
+
+  std::string options_file_content =
+      "# This is a testing option string.\n"
+      "# Currently we only support \"#\" styled comment.\n"
+      "\n"
+      "[Version]\n"
+      "  rocksdb_version=3.14.0\n"
+      "  options_file_version=1\n"
+      "[DBOptions]\n"
+      "  max_open_files=12345\n"
+      "  max_background_flushes=301\n"
+      "  max_total_wal_size=1024  # keep_log_file_num=1000\n"
+      "[CFOptions \"something_else\"]\n"
+      "  # if a section is blank, we will use the default\n";
+
+  const std::string kTestFileName = "test-rocksdb-options.ini";
+  env_->WriteToNewFile(kTestFileName, options_file_content);
+  RocksDBOptionsParser parser;
+  ASSERT_NOK(parser.Parse(kTestFileName, env_.get()));
+}
+
+TEST_F(OptionsParserTest, DefaultCFOptionsMustBeTheFirst) {
+  DBOptions db_opt;
+  db_opt.max_open_files = 12345;
+  db_opt.max_background_flushes = 301;
+  db_opt.max_total_wal_size = 1024;
+  ColumnFamilyOptions cf_opt;
+
+  std::string options_file_content =
+      "# This is a testing option string.\n"
+      "# Currently we only support \"#\" styled comment.\n"
+      "\n"
+      "[Version]\n"
+      "  rocksdb_version=3.14.0\n"
+      "  options_file_version=1\n"
+      "[DBOptions]\n"
+      "  max_open_files=12345\n"
+      "  max_background_flushes=301\n"
+      "  max_total_wal_size=1024  # keep_log_file_num=1000\n"
+      "[CFOptions \"something_else\"]\n"
+      "  # if a section is blank, we will use the default\n"
+      "[CFOptions \"default\"]\n"
+      "  # if a section is blank, we will use the default\n";
+
+  const std::string kTestFileName = "test-rocksdb-options.ini";
+  env_->WriteToNewFile(kTestFileName, options_file_content);
+  RocksDBOptionsParser parser;
+  ASSERT_NOK(parser.Parse(kTestFileName, env_.get()));
+}
+
+TEST_F(OptionsParserTest, DuplicateCFOptions) {
+  DBOptions db_opt;
+  db_opt.max_open_files = 12345;
+  db_opt.max_background_flushes = 301;
+  db_opt.max_total_wal_size = 1024;
+  ColumnFamilyOptions cf_opt;
+
+  std::string options_file_content =
+      "# This is a testing option string.\n"
+      "# Currently we only support \"#\" styled comment.\n"
+      "\n"
+      "[Version]\n"
+      "  rocksdb_version=3.14.0\n"
+      "  options_file_version=1\n"
+      "[DBOptions]\n"
+      "  max_open_files=12345\n"
+      "  max_background_flushes=301\n"
+      "  max_total_wal_size=1024  # keep_log_file_num=1000\n"
+      "[CFOptions \"default\"]\n"
+      "[CFOptions \"something_else\"]\n"
+      "[CFOptions \"something_else\"]\n";
+
+  const std::string kTestFileName = "test-rocksdb-options.ini";
+  env_->WriteToNewFile(kTestFileName, options_file_content);
+  RocksDBOptionsParser parser;
+  ASSERT_NOK(parser.Parse(kTestFileName, env_.get()));
+}
+
+TEST_F(OptionsParserTest, ParseVersion) {
+  DBOptions db_opt;
+  db_opt.max_open_files = 12345;
+  db_opt.max_background_flushes = 301;
+  db_opt.max_total_wal_size = 1024;
+  ColumnFamilyOptions cf_opt;
+
+  std::string file_template =
+      "# This is a testing option string.\n"
+      "# Currently we only support \"#\" styled comment.\n"
+      "\n"
+      "[Version]\n"
+      "  rocksdb_version=3.13.1\n"
+      "  options_file_version=%s\n"
+      "[DBOptions]\n"
+      "[CFOptions \"default\"]\n";
+  const int kLength = 1000;
+  char buffer[kLength];
+  RocksDBOptionsParser parser;
+
+  const std::vector<std::string> invalid_versions = {
+      "a.b.c", "3.2.2b", "3.-12", "3. 1",  // only digits and dots are allowed
+      "1.2.3.4",
+      "1.2.3"  // can only contains at most one dot.
+      "0",     // options_file_version must be at least one
+      "3..2",
+      ".", ".1.2",             // must have at least one digit before each dot
+      "1.2.", "1.", "2.34."};  // must have at least one digit after each dot
+  for (auto iv : invalid_versions) {
+    snprintf(buffer, kLength - 1, file_template.c_str(), iv.c_str());
+
+    parser.Reset();
+    env_->WriteToNewFile(iv, buffer);
+    ASSERT_NOK(parser.Parse(iv, env_.get()));
+  }
+
+  const std::vector<std::string> valid_versions = {
+      "1.232", "100", "3.12", "1", "12.3  ", "  1.25  "};
+  for (auto vv : valid_versions) {
+    snprintf(buffer, kLength - 1, file_template.c_str(), vv.c_str());
+    parser.Reset();
+    env_->WriteToNewFile(vv, buffer);
+    ASSERT_OK(parser.Parse(vv, env_.get()));
+  }
+}
+
+void VerifyCFPointerTypedOptions(
+    ColumnFamilyOptions* base_cf_opt, const ColumnFamilyOptions* new_cf_opt,
+    const std::unordered_map<std::string, std::string>* new_cf_opt_map) {
+  std::string name_buffer;
+  ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(*base_cf_opt, *new_cf_opt,
+                                                  new_cf_opt_map));
+
+  // change the name of merge operator back-and-forth
+  {
+    auto* merge_operator = dynamic_cast<ChanglingMergeOperator*>(
+        base_cf_opt->merge_operator.get());
+    if (merge_operator != nullptr) {
+      name_buffer = merge_operator->Name();
+      // change the name and expect non-ok status
+      merge_operator->SetName("some-other-name");
+      ASSERT_NOK(RocksDBOptionsParser::VerifyCFOptions(
+          *base_cf_opt, *new_cf_opt, new_cf_opt_map));
+      // change the name back and expect ok status
+      merge_operator->SetName(name_buffer);
+      ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(*base_cf_opt, *new_cf_opt,
+                                                      new_cf_opt_map));
+    }
+  }
+
+  // change the name of the compaction filter factory back-and-forth
+  {
+    auto* compaction_filter_factory =
+        dynamic_cast<ChanglingCompactionFilterFactory*>(
+            base_cf_opt->compaction_filter_factory.get());
+    if (compaction_filter_factory != nullptr) {
+      name_buffer = compaction_filter_factory->Name();
+      // change the name and expect non-ok status
+      compaction_filter_factory->SetName("some-other-name");
+      ASSERT_NOK(RocksDBOptionsParser::VerifyCFOptions(
+          *base_cf_opt, *new_cf_opt, new_cf_opt_map));
+      // change the name back and expect ok status
+      compaction_filter_factory->SetName(name_buffer);
+      ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(*base_cf_opt, *new_cf_opt,
+                                                      new_cf_opt_map));
+    }
+  }
+
+  // test by setting compaction_filter to nullptr
+  {
+    auto* tmp_compaction_filter = base_cf_opt->compaction_filter;
+    if (tmp_compaction_filter != nullptr) {
+      base_cf_opt->compaction_filter = nullptr;
+      // set compaction_filter to nullptr and expect non-ok status
+      ASSERT_NOK(RocksDBOptionsParser::VerifyCFOptions(
+          *base_cf_opt, *new_cf_opt, new_cf_opt_map));
+      // set the value back and expect ok status
+      base_cf_opt->compaction_filter = tmp_compaction_filter;
+      ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(*base_cf_opt, *new_cf_opt,
+                                                      new_cf_opt_map));
+    }
+  }
+
+  // test by setting table_factory to nullptr
+  {
+    auto tmp_table_factory = base_cf_opt->table_factory;
+    if (tmp_table_factory != nullptr) {
+      base_cf_opt->table_factory.reset();
+      // set table_factory to nullptr and expect non-ok status
+      ASSERT_NOK(RocksDBOptionsParser::VerifyCFOptions(
+          *base_cf_opt, *new_cf_opt, new_cf_opt_map));
+      // set the value back and expect ok status
+      base_cf_opt->table_factory = tmp_table_factory;
+      ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(*base_cf_opt, *new_cf_opt,
+                                                      new_cf_opt_map));
+    }
+  }
+
+  // test by setting memtable_factory to nullptr
+  {
+    auto tmp_memtable_factory = base_cf_opt->memtable_factory;
+    if (tmp_memtable_factory != nullptr) {
+      base_cf_opt->memtable_factory.reset();
+      // set memtable_factory to nullptr and expect non-ok status
+      ASSERT_NOK(RocksDBOptionsParser::VerifyCFOptions(
+          *base_cf_opt, *new_cf_opt, new_cf_opt_map));
+      // set the value back and expect ok status
+      base_cf_opt->memtable_factory = tmp_memtable_factory;
+      ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(*base_cf_opt, *new_cf_opt,
+                                                      new_cf_opt_map));
+    }
+  }
+}
+
+TEST_F(OptionsParserTest, DumpAndParse) {
+  DBOptions base_db_opt;
+  std::vector<ColumnFamilyOptions> base_cf_opts;
+  std::vector<std::string> cf_names = {"default", "cf1", "cf2", "cf3",
+                                       "c:f:4:4:4",
+                                       "p\\i\\k\\a\\chu\\\\\\",
+                                       "###rocksdb#1-testcf#2###"};
+  const int num_cf = static_cast<int>(cf_names.size());
+  Random rnd(302);
+  RandomInitDBOptions(&base_db_opt, &rnd);
+  base_db_opt.db_log_dir += "/#odd #but #could #happen #path #/\\\\#OMG";
+  for (int c = 0; c < num_cf; ++c) {
+    ColumnFamilyOptions cf_opt;
+    Random cf_rnd(0xFB + c);
+    RandomInitCFOptions(&cf_opt, &cf_rnd);
+    if (c < 4) {
+      cf_opt.prefix_extractor.reset(RandomSliceTransform(&rnd, c));
+    }
+    if (c < 3) {
+      cf_opt.table_factory.reset(RandomTableFactory(&rnd, c));
+    }
+    base_cf_opts.emplace_back(cf_opt);
+  }
+
+  const std::string kOptionsFileName = "test-persisted-options.ini";
+  ASSERT_OK(PersistRocksDBOptions(base_db_opt, cf_names, base_cf_opts,
+                                  kOptionsFileName, env_.get()));
+
+  RocksDBOptionsParser parser;
+  ASSERT_OK(parser.Parse(kOptionsFileName, env_.get()));
+
+  ASSERT_OK(RocksDBOptionsParser::VerifyRocksDBOptionsFromFile(
+      base_db_opt, cf_names, base_cf_opts, kOptionsFileName, env_.get()));
+
+  ASSERT_OK(
+      RocksDBOptionsParser::VerifyDBOptions(*parser.db_opt(), base_db_opt));
+  for (int c = 0; c < num_cf; ++c) {
+    const auto* cf_opt = parser.GetCFOptions(cf_names[c]);
+    ASSERT_NE(cf_opt, nullptr);
+    ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(
+        base_cf_opts[c], *cf_opt, &(parser.cf_opt_maps()->at(c))));
+  }
+
+  // Further verify pointer-typed options
+  for (int c = 0; c < num_cf; ++c) {
+    const auto* cf_opt = parser.GetCFOptions(cf_names[c]);
+    ASSERT_NE(cf_opt, nullptr);
+    VerifyCFPointerTypedOptions(&base_cf_opts[c], cf_opt,
+                                &(parser.cf_opt_maps()->at(c)));
+  }
+
+  ASSERT_EQ(parser.GetCFOptions("does not exist"), nullptr);
+
+  base_db_opt.max_open_files++;
+  ASSERT_NOK(RocksDBOptionsParser::VerifyRocksDBOptionsFromFile(
+      base_db_opt, cf_names, base_cf_opts, kOptionsFileName, env_.get()));
+
+  for (int c = 0; c < num_cf; ++c) {
+    if (base_cf_opts[c].compaction_filter) {
+      delete base_cf_opts[c].compaction_filter;
+    }
+  }
+}
+
+namespace {
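+// Sketch of the escaping rules this helper checks (matching the
+// EscapeOptionString tests below): every special character (':', '#', '\\')
+// must be preceded by '\\', and every '\\' must begin a recognized escape.
+// E.g. "a\\:b" passes, while "a:b" and a trailing lone '\\' fail.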
+bool IsEscapedString(const std::string& str) {
+  for (size_t i = 0; i < str.size(); ++i) {
+    if (str[i] == '\\') {
+      // two consecutive '\'s are handled in the next branch, so a '\'
+      // appearing at the very end of the string cannot form a valid
+      // escape and the string is rejected.
+      if (i == str.size() - 1) {
+        return false;
+      }
+      if (str[i + 1] == '\\') {
+        // if there are two consecutive '\'s, skip the second one.
+        i++;
+        continue;
+      }
+      switch (str[i + 1]) {
+        case ':':
+        case '\\':
+        case '#':
+          continue;
+        default:
+          // if UnescapeChar() maps the char to itself, '\' followed by
+          // str[i + 1] is not a recognized escape sequence.
+          if (UnescapeChar(str[i + 1]) == str[i + 1]) {
+            return false;
+          }
+      }
+    } else if (isSpecialChar(str[i]) && (i == 0 || str[i - 1] != '\\')) {
+      return false;
+    }
+  }
+  return true;
+}
+}  // namespace
+
+TEST_F(OptionsParserTest, EscapeOptionString) {
+  ASSERT_EQ(UnescapeOptionString(
+                "This is a test string with \\# \\: and \\\\ escape chars."),
+            "This is a test string with # : and \\ escape chars.");
+
+  ASSERT_EQ(
+      EscapeOptionString("This is a test string with # : and \\ escape chars."),
+      "This is a test string with \\# \\: and \\\\ escape chars.");
+
+  std::string readible_chars =
+      "A String like this \"1234567890-=_)(*&^%$#@!ertyuiop[]{POIU"
+      "YTREWQasdfghjkl;':LKJHGFDSAzxcvbnm,.?>"
+      "<MNBVCXZ\\\" should be okay to \\#\\\\\\:\\#\\#\\#\\ "
+      "be serialized and deserialized";
+
+  std::string escaped_string = EscapeOptionString(readible_chars);
+  ASSERT_TRUE(IsEscapedString(escaped_string));
+  // These two transformations should cancel out and reproduce
+  // the original input.
+  ASSERT_EQ(UnescapeOptionString(escaped_string), readible_chars);
+
+  std::string all_chars;
+  for (unsigned char c = 0;; ++c) {
+    all_chars += c;
+    if (c == 255) {
+      break;
+    }
+  }
+  escaped_string = EscapeOptionString(all_chars);
+  ASSERT_TRUE(IsEscapedString(escaped_string));
+  ASSERT_EQ(UnescapeOptionString(escaped_string), all_chars);
+
+  ASSERT_EQ(RocksDBOptionsParser::TrimAndRemoveComment(
+                "     A simple statement with a comment.  # like this :)"),
+            "A simple statement with a comment.");
+
+  ASSERT_EQ(RocksDBOptionsParser::TrimAndRemoveComment(
+                "Escape \\# and # comment together   ."),
+            "Escape \\# and");
+}
+
+#endif  // !ROCKSDB_LITE
+
 }  // namespace rocksdb
 
 int main(int argc, char** argv) {
diff --git a/src/rocksdb/util/perf_context.cc b/src/rocksdb/util/perf_context.cc
index 7be9980..2825165 100644
--- a/src/rocksdb/util/perf_context.cc
+++ b/src/rocksdb/util/perf_context.cc
@@ -10,22 +10,13 @@
 namespace rocksdb {
 
 #if defined(NPERF_CONTEXT) || defined(IOS_CROSS_COMPILE)
-PerfLevel perf_level = kEnableCount;
-// This is a dummy variable since some place references it
-PerfContext perf_context;
+  PerfContext perf_context;
+#elif _WIN32
+  __declspec(thread) PerfContext perf_context;
 #else
-__thread PerfLevel perf_level = kEnableCount;
-__thread PerfContext perf_context;
+  __thread PerfContext perf_context;
 #endif
 
-void SetPerfLevel(PerfLevel level) {
-  perf_level = level;
-}
-
-PerfLevel GetPerfLevel() {
-  return perf_level;
-}
-
 void PerfContext::Reset() {
 #if !defined(NPERF_CONTEXT) && !defined(IOS_CROSS_COMPILE)
   user_key_comparison_count = 0;
@@ -53,9 +44,20 @@ void PerfContext::Reset() {
   find_next_user_entry_time = 0;
   write_pre_and_post_process_time = 0;
   write_memtable_time = 0;
+  write_delay_time = 0;
   db_mutex_lock_nanos = 0;
   db_condition_wait_nanos = 0;
   merge_operator_time_nanos = 0;
+  read_index_block_nanos = 0;
+  read_filter_block_nanos = 0;
+  new_table_block_iter_nanos = 0;
+  new_table_iterator_nanos = 0;
+  block_seek_nanos = 0;
+  find_table_nanos = 0;
+  bloom_memtable_hit_count = 0;
+  bloom_memtable_miss_count = 0;
+  bloom_sst_hit_count = 0;
+  bloom_sst_miss_count = 0;
 #endif
 }
 
@@ -79,7 +81,12 @@ std::string PerfContext::ToString() const {
      << OUTPUT(seek_internal_seek_time) << OUTPUT(find_next_user_entry_time)
      << OUTPUT(write_pre_and_post_process_time) << OUTPUT(write_memtable_time)
      << OUTPUT(db_mutex_lock_nanos) << OUTPUT(db_condition_wait_nanos)
-     << OUTPUT(merge_operator_time_nanos);
+     << OUTPUT(merge_operator_time_nanos) << OUTPUT(write_delay_time)
+     << OUTPUT(read_index_block_nanos) << OUTPUT(read_filter_block_nanos)
+     << OUTPUT(new_table_block_iter_nanos) << OUTPUT(new_table_iterator_nanos)
+     << OUTPUT(block_seek_nanos) << OUTPUT(find_table_nanos)
+     << OUTPUT(bloom_memtable_hit_count) << OUTPUT(bloom_memtable_miss_count)
+     << OUTPUT(bloom_sst_hit_count) << OUTPUT(bloom_sst_miss_count);
   return ss.str();
 #endif
 }
diff --git a/src/rocksdb/util/perf_context_imp.h b/src/rocksdb/util/perf_context_imp.h
index e397901..cde7ee3 100644
--- a/src/rocksdb/util/perf_context_imp.h
+++ b/src/rocksdb/util/perf_context_imp.h
@@ -5,6 +5,7 @@
 //
 #pragma once
 #include "rocksdb/perf_context.h"
+#include "util/perf_step_timer.h"
 #include "util/stop_watch.h"
 
 namespace rocksdb {
@@ -19,49 +20,6 @@ namespace rocksdb {
 
 #else
 
-extern __thread PerfLevel perf_level;
-
-class PerfStepTimer {
- public:
-  PerfStepTimer(uint64_t* metric)
-    : enabled_(perf_level >= PerfLevel::kEnableTime),
-      env_(enabled_ ? Env::Default() : nullptr),
-      start_(0),
-      metric_(metric) {
-  }
-
-  ~PerfStepTimer() {
-    Stop();
-  }
-
-  void Start() {
-    if (enabled_) {
-      start_ = env_->NowNanos();
-    }
-  }
-
-  void Measure() {
-    if (start_) {
-      uint64_t now = env_->NowNanos();
-      *metric_ += now - start_;
-      start_ = now;
-    }
-  }
-
-  void Stop() {
-    if (start_) {
-      *metric_ += env_->NowNanos() - start_;
-      start_ = 0;
-    }
-  }
-
- private:
-  const bool enabled_;
-  Env* const env_;
-  uint64_t start_;
-  uint64_t* metric_;
-};
-
 // Stop the timer and update the metric
 #define PERF_TIMER_STOP(metric)          \
   perf_step_timer_ ## metric.Stop();
@@ -70,8 +28,8 @@ class PerfStepTimer {
   perf_step_timer_ ## metric.Start();
 
 // Declare and set start time of the timer
-#define PERF_TIMER_GUARD(metric)           \
-  PerfStepTimer perf_step_timer_ ## metric(&(perf_context.metric));          \
+#define PERF_TIMER_GUARD(metric)                                      \
+  PerfStepTimer perf_step_timer_ ## metric(&(perf_context.metric));   \
   perf_step_timer_ ## metric.Start();
 
 // Update metric with time elapsed since last START. start time is reset
diff --git a/src/rocksdb/util/perf_level.cc b/src/rocksdb/util/perf_level.cc
new file mode 100644
index 0000000..387ff5f
--- /dev/null
+++ b/src/rocksdb/util/perf_level.cc
@@ -0,0 +1,27 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+
+#include <sstream>
+#include "util/perf_level_imp.h"
+#include "port/port.h"
+
+namespace rocksdb {
+
+#if defined(IOS_CROSS_COMPILE)
+PerfLevel perf_level = kEnableCount;
+#else
+__thread PerfLevel perf_level = kEnableCount;
+#endif
+
+void SetPerfLevel(PerfLevel level) {
+  perf_level = level;
+}
+
+PerfLevel GetPerfLevel() {
+  return perf_level;
+}
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/perf_level_imp.h b/src/rocksdb/util/perf_level_imp.h
new file mode 100644
index 0000000..7a83410
--- /dev/null
+++ b/src/rocksdb/util/perf_level_imp.h
@@ -0,0 +1,18 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#pragma once
+#include "rocksdb/perf_level.h"
+#include "port/port.h"
+
+namespace rocksdb {
+
+#if defined(IOS_CROSS_COMPILE)
+extern PerfLevel perf_level;
+#else
+extern __thread PerfLevel perf_level;
+#endif
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/perf_step_timer.h b/src/rocksdb/util/perf_step_timer.h
new file mode 100644
index 0000000..9502583
--- /dev/null
+++ b/src/rocksdb/util/perf_step_timer.h
@@ -0,0 +1,54 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#pragma once
+#include "rocksdb/env.h"
+#include "util/perf_level_imp.h"
+#include "util/stop_watch.h"
+
+namespace rocksdb {
+
+class PerfStepTimer {
+ public:
+  PerfStepTimer(uint64_t* metric)
+    : enabled_(perf_level >= PerfLevel::kEnableTime),
+      env_(enabled_ ? Env::Default() : nullptr),
+      start_(0),
+      metric_(metric) {
+  }
+
+  ~PerfStepTimer() {
+    Stop();
+  }
+
+  void Start() {
+    if (enabled_) {
+      start_ = env_->NowNanos();
+    }
+  }
+
+  void Measure() {
+    if (start_) {
+      uint64_t now = env_->NowNanos();
+      *metric_ += now - start_;
+      start_ = now;
+    }
+  }
+
+  void Stop() {
+    if (start_) {
+      *metric_ += env_->NowNanos() - start_;
+      start_ = 0;
+    }
+  }
+
+ private:
+  const bool enabled_;
+  Env* const env_;
+  uint64_t start_;
+  uint64_t* metric_;
+};
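+
+// Usage sketch: this is what PERF_TIMER_GUARD(metric) in
+// util/perf_context_imp.h expands to, e.g. for block_seek_nanos:
+//
+//   PerfStepTimer perf_step_timer_block_seek_nanos(
+//       &perf_context.block_seek_nanos);
+//   perf_step_timer_block_seek_nanos.Start();
+//   // ... timed work; the destructor Stop()s and accumulates the nanos.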
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/posix_logger.h b/src/rocksdb/util/posix_logger.h
index 213a652..55cb34a 100644
--- a/src/rocksdb/util/posix_logger.h
+++ b/src/rocksdb/util/posix_logger.h
@@ -13,14 +13,14 @@
 #pragma once
 #include <algorithm>
 #include <stdio.h>
-#include <sys/time.h>
+#include "port/sys_time.h"
 #include <time.h>
 #include <fcntl.h>
-#include <unistd.h>
 #ifdef OS_LINUX
 #include <linux/falloc.h>
 #endif
 #include "rocksdb/env.h"
+#include "util/iostats_context_imp.h"
 #include <atomic>
 
 namespace rocksdb {
@@ -61,6 +61,8 @@ class PosixLogger : public Logger {
 
   using Logger::Logv;
   virtual void Logv(const char* format, va_list ap) override {
+    IOSTATS_TIMER_GUARD(logger_nanos);
+
     const uint64_t thread_id = (*gettid_)();
 
     // We try twice: the first time with a fixed-size stack allocated buffer,
diff --git a/src/rocksdb/util/rate_limiter.cc b/src/rocksdb/util/rate_limiter.cc
index 3eff506..188d5f0 100644
--- a/src/rocksdb/util/rate_limiter.cc
+++ b/src/rocksdb/util/rate_limiter.cc
@@ -32,13 +32,13 @@ GenericRateLimiter::GenericRateLimiter(int64_t rate_bytes_per_sec,
       stop_(false),
       exit_cv_(&request_mutex_),
       requests_to_wait_(0),
-      total_requests_{0, 0},
-      total_bytes_through_{0, 0},
       available_bytes_(0),
       next_refill_us_(env_->NowMicros()),
       fairness_(fairness > 100 ? 100 : fairness),
       rnd_((uint32_t)time(nullptr)),
       leader_(nullptr) {
+  total_requests_[0] = 0;
+  total_requests_[1] = 0;
   total_bytes_through_[0] = 0;
   total_bytes_through_[1] = 0;
 }
diff --git a/src/rocksdb/util/rate_limiter.h b/src/rocksdb/util/rate_limiter.h
index 3840c4e..2a54d4c 100644
--- a/src/rocksdb/util/rate_limiter.h
+++ b/src/rocksdb/util/rate_limiter.h
@@ -11,7 +11,7 @@
 
 #include <atomic>
 #include <deque>
-#include "port/port_posix.h"
+#include "port/port.h"
 #include "util/mutexlock.h"
 #include "util/random.h"
 #include "rocksdb/env.h"
@@ -31,7 +31,7 @@ class GenericRateLimiter : public RateLimiter {
 
   // Request for token to write bytes. If this request can not be satisfied,
   // the call is blocked. Caller is responsible to make sure
-  // bytes < GetSingleBurstBytes()
+  // bytes <= GetSingleBurstBytes()
   virtual void Request(const int64_t bytes, const Env::IOPriority pri) override;
 
   virtual int64_t GetSingleBurstBytes() const override {
diff --git a/src/rocksdb/util/skiplistrep.cc b/src/rocksdb/util/skiplistrep.cc
index ee57372..112a7ab 100644
--- a/src/rocksdb/util/skiplistrep.cc
+++ b/src/rocksdb/util/skiplistrep.cc
@@ -52,6 +52,15 @@ public:
     }
   }
 
+  uint64_t ApproximateNumEntries(const Slice& start_ikey,
+                                 const Slice& end_ikey) override {
+    std::string tmp;
+    uint64_t start_count =
+        skip_list_.EstimateCount(EncodeKey(&tmp, start_ikey));
+    uint64_t end_count = skip_list_.EstimateCount(EncodeKey(&tmp, end_ikey));
+    return (end_count >= start_count) ? (end_count - start_count) : 0;
+  }
+
   virtual ~SkipListRep() override { }
 
   // Iteration over the contents of a skip list
diff --git a/src/rocksdb/util/slice.cc b/src/rocksdb/util/slice.cc
index 6484e16..4c50ff9 100644
--- a/src/rocksdb/util/slice.cc
+++ b/src/rocksdb/util/slice.cc
@@ -7,9 +7,11 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
+#include <algorithm>
 #include "rocksdb/slice_transform.h"
 #include "rocksdb/slice.h"
 #include "util/string_util.h"
+#include <stdio.h>
 
 namespace rocksdb {
 
@@ -23,6 +25,11 @@ class FixedPrefixTransform : public SliceTransform {
  public:
   explicit FixedPrefixTransform(size_t prefix_len)
       : prefix_len_(prefix_len),
+        // Note that if any part of the name format changes, it will require
+        // matching changes to options_helper so that RocksDBOptionsParser
+        // keeps working.
+        // TODO(yhchiang): move serialization / deserialization code inside
+        // the class implementation itself.
         name_("rocksdb.FixedPrefix." + ToString(prefix_len_)) {}
 
   virtual const char* Name() const override { return name_.c_str(); }
@@ -53,6 +60,11 @@ class CappedPrefixTransform : public SliceTransform {
  public:
   explicit CappedPrefixTransform(size_t cap_len)
       : cap_len_(cap_len),
+        // Note that if any part of the name format changes, it will require
+        // matching changes to options_helper so that RocksDBOptionsParser
+        // keeps working.
+        // TODO(yhchiang): move serialization / deserialization code inside
+        // the class implementation itself.
         name_("rocksdb.CappedPrefix." + ToString(cap_len_)) {}
 
   virtual const char* Name() const override { return name_.c_str(); }
@@ -92,6 +104,27 @@ class NoopTransform : public SliceTransform {
 
 }
 
+// Avoid including all of port/port.h here for a single define.
+#ifdef OS_WIN
+#define snprintf _snprintf
+#endif
+
+// Return a string that contains the copy of the referenced data.
+std::string Slice::ToString(bool hex) const {
+  std::string result;  // RVO/NRVO/move
+  if (hex) {
+    char buf[10];
+    for (size_t i = 0; i < size_; i++) {
+      snprintf(buf, 10, "%02X", (unsigned char)data_[i]);
+      result += buf;
+    }
+    return result;
+  } else {
+    result.assign(data_, size_);
+    return result;
+  }
+}
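+
+// e.g. Slice("\x01\xab", 2).ToString(true) yields "01AB", while
+// ToString(false) just copies the raw bytes into a std::string.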
+
 const SliceTransform* NewFixedPrefixTransform(size_t prefix_len) {
   return new FixedPrefixTransform(prefix_len);
 }
diff --git a/src/rocksdb/util/sst_dump_test.cc b/src/rocksdb/util/sst_dump_test.cc
index 03d7299..50e9f10 100644
--- a/src/rocksdb/util/sst_dump_test.cc
+++ b/src/rocksdb/util/sst_dump_test.cc
@@ -7,12 +7,15 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
+#ifndef ROCKSDB_LITE
+
 #include <stdint.h>
 #include "rocksdb/sst_dump_tool.h"
 
 #include "rocksdb/filter_policy.h"
 #include "table/block_based_table_factory.h"
 #include "table/table_builder.h"
+#include "util/file_reader_writer.h"
 #include "util/testharness.h"
 #include "util/testutil.h"
 
@@ -53,12 +56,13 @@ void createSST(const std::string& file_name,
   opts.table_factory = tf;
   std::vector<std::unique_ptr<IntTblPropCollectorFactory> >
       int_tbl_prop_collector_factories;
-
+  unique_ptr<WritableFileWriter> file_writer(
+      new WritableFileWriter(std::move(file), EnvOptions()));
   tb.reset(opts.table_factory->NewTableBuilder(
       TableBuilderOptions(imoptions, ikc, &int_tbl_prop_collector_factories,
                           CompressionType::kNoCompression, CompressionOptions(),
                           false),
-      file.get()));
+      file_writer.get()));
 
   // Populate slightly more than 1K keys
   uint32_t num_keys = 1024;
@@ -66,7 +70,7 @@ void createSST(const std::string& file_name,
     tb->Add(MakeKey(i), MakeValue(i));
   }
   tb->Finish();
-  file->Close();
+  file_writer->Close();
 }
 
 void cleanup(const std::string& file_name) {
@@ -174,9 +178,41 @@ TEST_F(SSTDumpToolTest, GetProperties) {
     delete[] usage[i];
   }
 }
+
+TEST_F(SSTDumpToolTest, CompressedSizes) {
+  table_options_.filter_policy.reset(rocksdb::NewBloomFilterPolicy(10, false));
+  std::string file_name = "rocksdb_sst_test.sst";
+  createSST(file_name, table_options_);
+
+  char* usage[3];
+  for (int i = 0; i < 3; i++) {
+    usage[i] = new char[optLength];
+  }
+
+  snprintf(usage[0], optLength, "./sst_dump");
+  snprintf(usage[1], optLength, "--show_compression_sizes");
+  snprintf(usage[2], optLength, "--file=rocksdb_sst_test.sst");
+  rocksdb::SSTDumpTool tool;
+  ASSERT_TRUE(!tool.Run(3, usage));
+
+  cleanup(file_name);
+  for (int i = 0; i < 3; i++) {
+    delete[] usage[i];
+  }
+}
 }  // namespace rocksdb
 
 int main(int argc, char** argv) {
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
+
+#else
+#include <stdio.h>
+
+int main(int argc, char** argv) {
+  fprintf(stderr, "SKIPPED as SSTDumpTool is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/util/sst_dump_tool.cc b/src/rocksdb/util/sst_dump_tool.cc
index 04486da..2e31901 100644
--- a/src/rocksdb/util/sst_dump_tool.cc
+++ b/src/rocksdb/util/sst_dump_tool.cc
@@ -12,6 +12,7 @@
 #endif
 
 #include <inttypes.h>
+#include "port/port.h"
 
 namespace rocksdb {
 
@@ -24,7 +25,6 @@ SstFileReader::SstFileReader(const std::string& file_path,
     output_hex_(output_hex), ioptions_(options_),
     internal_comparator_(BytewiseComparator()) {
   fprintf(stdout, "Process %s\n", file_path.c_str());
-
   init_result_ = GetTableReader(file_name_);
 }
 
@@ -33,6 +33,8 @@ extern const uint64_t kLegacyBlockBasedTableMagicNumber;
 extern const uint64_t kPlainTableMagicNumber;
 extern const uint64_t kLegacyPlainTableMagicNumber;
 
+const char* testFileName = "test_file_name";
+
 Status SstFileReader::GetTableReader(const std::string& file_path) {
   uint64_t magic_number;
 
@@ -41,10 +43,13 @@ Status SstFileReader::GetTableReader(const std::string& file_path) {
 
   unique_ptr<RandomAccessFile> file;
   uint64_t file_size;
-  Status s = options_.env->NewRandomAccessFile(file_path, &file_, soptions_);
+  Status s = options_.env->NewRandomAccessFile(file_path, &file, soptions_);
   if (s.ok()) {
     s = options_.env->GetFileSize(file_path, &file_size);
   }
+
+  file_.reset(new RandomAccessFileReader(std::move(file)));
+
   if (s.ok()) {
     s = ReadFooterFromFile(file_.get(), file_size, &footer);
   }
@@ -56,7 +61,8 @@ Status SstFileReader::GetTableReader(const std::string& file_path) {
     if (magic_number == kPlainTableMagicNumber ||
         magic_number == kLegacyPlainTableMagicNumber) {
       soptions_.use_mmap_reads = true;
-      options_.env->NewRandomAccessFile(file_path, &file_, soptions_);
+      options_.env->NewRandomAccessFile(file_path, &file, soptions_);
+      file_.reset(new RandomAccessFileReader(std::move(file)));
     }
     options_.comparator = &internal_comparator_;
     // For old sst format, ReadTableProperties might fail but file can be read
@@ -68,16 +74,15 @@ Status SstFileReader::GetTableReader(const std::string& file_path) {
   }
 
   if (s.ok()) {
-    s = NewTableReader(ioptions_, soptions_, internal_comparator_,
-                       std::move(file_), file_size, &table_reader_);
+    s = NewTableReader(ioptions_, soptions_, internal_comparator_, file_size,
+                       &table_reader_);
   }
   return s;
 }
 
 Status SstFileReader::NewTableReader(
     const ImmutableCFOptions& ioptions, const EnvOptions& soptions,
-    const InternalKeyComparator& internal_comparator,
-    unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
+    const InternalKeyComparator& internal_comparator, uint64_t file_size,
     unique_ptr<TableReader>* table_reader) {
   // We need to turn off pre-fetching of index and filter nodes for
   // BlockBasedTable
@@ -86,16 +91,16 @@ Status SstFileReader::NewTableReader(
 
   if (block_table_factory) {
     return block_table_factory->NewTableReader(
-        ioptions_, soptions_, internal_comparator_, std::move(file_), file_size,
-        &table_reader_, /*enable_prefetch=*/false);
+        TableReaderOptions(ioptions_, soptions_, internal_comparator_),
+        std::move(file_), file_size, &table_reader_, /*enable_prefetch=*/false);
   }
 
   assert(!block_table_factory);
 
   // For all other factory implementation
   return options_.table_factory->NewTableReader(
-      ioptions_, soptions_, internal_comparator_, std::move(file_), file_size,
-      &table_reader_);
+      TableReaderOptions(ioptions_, soptions_, internal_comparator_),
+      std::move(file_), file_size, &table_reader_);
 }
 
 Status SstFileReader::DumpTable(const std::string& out_filename) {
@@ -107,8 +112,79 @@ Status SstFileReader::DumpTable(const std::string& out_filename) {
   return s;
 }
 
+uint64_t SstFileReader::CalculateCompressedTableSize(
+    const TableBuilderOptions& tb_options, size_t block_size) {
+  unique_ptr<WritableFile> out_file;
+  unique_ptr<Env> env(NewMemEnv(Env::Default()));
+  env->NewWritableFile(testFileName, &out_file, soptions_);
+  unique_ptr<WritableFileWriter> dest_writer;
+  dest_writer.reset(new WritableFileWriter(std::move(out_file), soptions_));
+  BlockBasedTableOptions table_options;
+  table_options.block_size = block_size;
+  BlockBasedTableFactory block_based_tf(table_options);
+  unique_ptr<TableBuilder> table_builder;
+  table_builder.reset(block_based_tf.NewTableBuilder(
+                         tb_options, dest_writer.get()));
+  unique_ptr<Iterator> iter(table_reader_->NewIterator(ReadOptions()));
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    if (!iter->status().ok()) {
+      fputs(iter->status().ToString().c_str(), stderr);
+      exit(1);
+    }
+    table_builder->Add(iter->key(), iter->value());
+  }
+  Status s = table_builder->Finish();
+  if (!s.ok()) {
+    fputs(s.ToString().c_str(), stderr);
+    exit(1);
+  }
+  uint64_t size = table_builder->FileSize();
+  env->DeleteFile(testFileName);
+  return size;
+}
+
+int SstFileReader::ShowAllCompressionSizes(size_t block_size) {
+  ReadOptions read_options;
+  Options opts;
+  const ImmutableCFOptions imoptions(opts);
+  rocksdb::InternalKeyComparator ikc(opts.comparator);
+  std::vector<std::unique_ptr<IntTblPropCollectorFactory> >
+      block_based_table_factories;
+
+  std::map<CompressionType, const char*> compress_type;
+  compress_type.insert(
+      std::make_pair(CompressionType::kNoCompression, "kNoCompression"));
+  compress_type.insert(std::make_pair(CompressionType::kSnappyCompression,
+                                      "kSnappyCompression"));
+  compress_type.insert(
+      std::make_pair(CompressionType::kZlibCompression, "kZlibCompression"));
+  compress_type.insert(
+      std::make_pair(CompressionType::kBZip2Compression, "kBZip2Compression"));
+  compress_type.insert(
+      std::make_pair(CompressionType::kLZ4Compression, "kLZ4Compression"));
+  compress_type.insert(
+      std::make_pair(CompressionType::kLZ4HCCompression, "kLZ4HCCompression"));
+  compress_type.insert(std::make_pair(CompressionType::kZSTDNotFinalCompression,
+                                      "kZSTDNotFinalCompression"));
+
+  fprintf(stdout, "Block Size: %" ROCKSDB_PRIszt "\n", block_size);
+
+  for (CompressionType i = CompressionType::kNoCompression;
+       i <= CompressionType::kZSTDNotFinalCompression;
+       i = (i == kLZ4HCCompression) ? kZSTDNotFinalCompression
+                                    : CompressionType(i + 1)) {
+    CompressionOptions compress_opt;
+    TableBuilderOptions tb_opts(imoptions, ikc, &block_based_table_factories, i,
+                                compress_opt, false);
+    uint64_t file_size = CalculateCompressedTableSize(tb_opts, block_size);
+    fprintf(stdout, "Compression: %s", compress_type.find(i)->second);
+    fprintf(stdout, " Size: %" PRIu64 "\n", file_size);
+  }
+  return 0;
+}
+
 Status SstFileReader::ReadTableProperties(uint64_t table_magic_number,
-                                          RandomAccessFile* file,
+                                          RandomAccessFileReader* file,
                                           uint64_t file_size) {
   TableProperties* table_properties = nullptr;
   Status s = rocksdb::ReadTableProperties(file, file_size, table_magic_number,
@@ -248,24 +324,9 @@ void print_help() {
           " [--from=<user_key>]"
           " [--to=<user_key>]"
           " [--read_num=NUM]"
-          " [--show_properties]\n");
-}
-
-string HexToString(const string& str) {
-  string parsed;
-  if (str[0] != '0' || str[1] != 'x') {
-    fprintf(stderr, "Invalid hex input %s.  Must start with 0x\n",
-            str.c_str());
-    throw "Invalid hex input";
-  }
-
-  for (unsigned int i = 2; i < str.length();) {
-    int c;
-    sscanf(str.c_str() + i, "%2X", &c);
-    parsed.push_back(c);
-    i += 2;
-  }
-  return parsed;
+          " [--show_properties]"
+          " [--show_compression_sizes]"
+          " [--show_compression_sizes [--set_block_size=<block_size>]]\n");
 }
 
 }  // namespace
@@ -283,8 +344,12 @@ int SSTDumpTool::Run(int argc, char** argv) {
   bool has_from = false;
   bool has_to = false;
   bool show_properties = false;
+  bool show_compression_sizes = false;
+  bool set_block_size = false;
   std::string from_key;
   std::string to_key;
+  std::string block_size_str;
+  size_t block_size;
   for (int i = 1; i < argc; i++) {
     if (strncmp(argv[i], "--file=", 7) == 0) {
       dir_or_file = argv[i] + 7;
@@ -308,6 +373,17 @@ int SSTDumpTool::Run(int argc, char** argv) {
       has_to = true;
     } else if (strcmp(argv[i], "--show_properties") == 0) {
       show_properties = true;
+    } else if (strcmp(argv[i], "--show_compression_sizes") == 0) {
+      show_compression_sizes = true;
+    } else if (strncmp(argv[i], "--set_block_size=", 17) == 0) {
+      set_block_size = true;
+      block_size_str = argv[i] + 17;
+      std::istringstream iss(block_size_str);
+      iss >> block_size;
+      if (iss.fail()) {
+        fprintf(stderr, "block size must be numeric\n");
+        exit(1);
+      }
     } else {
       print_help();
       exit(1);
@@ -316,10 +392,10 @@ int SSTDumpTool::Run(int argc, char** argv) {
 
   if (input_key_hex) {
     if (has_from) {
-      from_key = HexToString(from_key);
+      from_key = rocksdb::LDBCommand::HexToString(from_key);
     }
     if (has_to) {
-      to_key = HexToString(to_key);
+      to_key = rocksdb::LDBCommand::HexToString(to_key);
     }
   }
 
@@ -362,6 +438,15 @@ int SSTDumpTool::Run(int argc, char** argv) {
       exit(1);
     }
 
+    if (show_compression_sizes) {
+      if (set_block_size) {
+        reader.ShowAllCompressionSizes(block_size);
+      } else {
+        reader.ShowAllCompressionSizes(16384);
+      }
+      return 0;
+    }
+
     if (command == "raw") {
       std::string out_filename = filename.substr(0, filename.length() - 4);
       out_filename.append("_dump.txt");
diff --git a/src/rocksdb/util/sst_dump_tool_imp.h b/src/rocksdb/util/sst_dump_tool_imp.h
index a5f2267..b7d9e4d 100644
--- a/src/rocksdb/util/sst_dump_tool_imp.h
+++ b/src/rocksdb/util/sst_dump_tool_imp.h
@@ -8,6 +8,7 @@
 #include "rocksdb/sst_dump_tool.h"
 
 #include <map>
+#include <sstream>
 #include <string>
 #include <vector>
 
@@ -20,14 +21,15 @@
 #include "rocksdb/iterator.h"
 #include "rocksdb/slice_transform.h"
 #include "rocksdb/status.h"
-#include "rocksdb/table.h"
 #include "rocksdb/table_properties.h"
 #include "table/block.h"
+#include "table/block_based_table_builder.h"
 #include "table/block_based_table_factory.h"
 #include "table/block_builder.h"
 #include "table/format.h"
 #include "table/meta_blocks.h"
 #include "table/plain_table_factory.h"
+#include "util/file_reader_writer.h"
 #include "util/ldb_cmd.h"
 #include "util/random.h"
 #include "util/testharness.h"
@@ -52,11 +54,17 @@ class SstFileReader {
   Status DumpTable(const std::string& out_filename);
   Status getStatus() { return init_result_; }
 
+  int ShowAllCompressionSizes(size_t block_size);
+
  private:
   // Get the TableReader implementation for the sst file
   Status GetTableReader(const std::string& file_path);
   Status ReadTableProperties(uint64_t table_magic_number,
-                             RandomAccessFile* file, uint64_t file_size);
+                             RandomAccessFileReader* file, uint64_t file_size);
+
+  uint64_t CalculateCompressedTableSize(const TableBuilderOptions& tb_options,
+                                        size_t block_size);
+
   Status SetTableOptionsByMagicNumber(uint64_t table_magic_number);
   Status SetOldTableOptions();
 
@@ -65,7 +73,7 @@ class SstFileReader {
   Status NewTableReader(const ImmutableCFOptions& ioptions,
                         const EnvOptions& soptions,
                         const InternalKeyComparator& internal_comparator,
-                        unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
+                        uint64_t file_size,
                         unique_ptr<TableReader>* table_reader);
 
   std::string file_name_;
@@ -76,7 +84,7 @@ class SstFileReader {
 
   Status init_result_;
   unique_ptr<TableReader> table_reader_;
-  unique_ptr<RandomAccessFile> file_;
+  unique_ptr<RandomAccessFileReader> file_;
   // options_ and internal_comparator_ will also be used in
   // ReadSequential internally (specifically, seek-related operations)
   Options options_;
diff --git a/src/rocksdb/util/statistics.cc b/src/rocksdb/util/statistics.cc
index ba7670b..8a7525c 100644
--- a/src/rocksdb/util/statistics.cc
+++ b/src/rocksdb/util/statistics.cc
@@ -50,13 +50,19 @@ void StatisticsImpl::histogramData(uint32_t histogramType,
   histograms_[histogramType].Data(data);
 }
 
+std::string StatisticsImpl::getHistogramString(uint32_t histogramType) const {
+  assert(enable_internal_stats_ ? histogramType < INTERNAL_HISTOGRAM_ENUM_MAX
+                                : histogramType < HISTOGRAM_ENUM_MAX);
+  return histograms_[histogramType].ToString();
+}
+
 void StatisticsImpl::setTickerCount(uint32_t tickerType, uint64_t count) {
   assert(
     enable_internal_stats_ ?
       tickerType < INTERNAL_TICKER_ENUM_MAX :
       tickerType < TICKER_ENUM_MAX);
   if (tickerType < TICKER_ENUM_MAX || enable_internal_stats_) {
-    tickers_[tickerType].value = count;
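+    // relaxed ordering suffices: the ticker value is a standalone counter
+    // and is not used to publish other memory writes.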
+    tickers_[tickerType].value.store(count, std::memory_order_relaxed);
   }
   if (stats_ && tickerType < TICKER_ENUM_MAX) {
     stats_->setTickerCount(tickerType, count);
@@ -69,7 +75,7 @@ void StatisticsImpl::recordTick(uint32_t tickerType, uint64_t count) {
       tickerType < INTERNAL_TICKER_ENUM_MAX :
       tickerType < TICKER_ENUM_MAX);
   if (tickerType < TICKER_ENUM_MAX || enable_internal_stats_) {
-    tickers_[tickerType].value += count;
+    tickers_[tickerType].value.fetch_add(count, std::memory_order_relaxed);
   }
   if (stats_ && tickerType < TICKER_ENUM_MAX) {
     stats_->recordTick(tickerType, count);
diff --git a/src/rocksdb/util/statistics.h b/src/rocksdb/util/statistics.h
index c56900a..55914f5 100644
--- a/src/rocksdb/util/statistics.h
+++ b/src/rocksdb/util/statistics.h
@@ -37,6 +37,7 @@ class StatisticsImpl : public Statistics {
   virtual uint64_t getTickerCount(uint32_t ticker_type) const override;
   virtual void histogramData(uint32_t histogram_type,
                              HistogramData* const data) const override;
+  std::string getHistogramString(uint32_t histogram_type) const override;
 
   virtual void setTickerCount(uint32_t ticker_type, uint64_t count) override;
   virtual void recordTick(uint32_t ticker_type, uint64_t count) override;
diff --git a/src/rocksdb/util/status.cc b/src/rocksdb/util/status.cc
index f0112d3..6ff5005 100644
--- a/src/rocksdb/util/status.cc
+++ b/src/rocksdb/util/status.cc
@@ -21,7 +21,8 @@ const char* Status::CopyState(const char* state) {
   return result;
 }
 
-Status::Status(Code _code, const Slice& msg, const Slice& msg2) : code_(_code) {
+Status::Status(Code _code, const Slice& msg, const Slice& msg2)
+    : code_(_code), subcode_(kNone) {
   assert(code_ != kOk);
   const uint32_t len1 = static_cast<uint32_t>(msg.size());
   const uint32_t len2 = static_cast<uint32_t>(msg2.size());
@@ -73,6 +74,15 @@ std::string Status::ToString() const {
     case kAborted:
       type = "Operation aborted: ";
       break;
+    case kBusy:
+      type = "Resource busy: ";
+      break;
+    case kExpired:
+      type = "Operation expired: ";
+      break;
+    case kTryAgain:
+      type = "Operation failed. Try again.: ";
+      break;
     default:
       snprintf(tmp, sizeof(tmp), "Unknown code(%d): ",
                static_cast<int>(code()));
@@ -80,6 +90,12 @@ std::string Status::ToString() const {
       break;
   }
   std::string result(type);
+  if (subcode_ != kNone) {
+    uint32_t index = static_cast<uint32_t>(subcode_);
+    assert(index < sizeof(msgs) / sizeof(msgs[0]));
+    result.append(msgs[index]);
+  }
+
   if (state_ != nullptr) {
     uint32_t length;
     memcpy(&length, state_, sizeof(length));
diff --git a/src/rocksdb/util/status_message.cc b/src/rocksdb/util/status_message.cc
new file mode 100644
index 0000000..26ab06d
--- /dev/null
+++ b/src/rocksdb/util/status_message.cc
@@ -0,0 +1,17 @@
+// Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#include "rocksdb/status.h"
+
+namespace rocksdb {
+
+const char* Status::msgs[] = {
+    "",                                                  // kNone
+    "Timeout Acquiring Mutex",                           // kMutexTimeout
+    "Timeout waiting to lock key",                       // kLockTimeout
+    "Failed to acquire lock due to max_num_locks limit"  // kLockLimit
+};
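+
+// Status::ToString() (see util/status.cc in this patch) appends
+// msgs[subcode_] after the code description whenever subcode_ != kNone.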
+
+}  // namespace rocksdb
diff --git a/src/rocksdb/util/stl_wrappers.h b/src/rocksdb/util/stl_wrappers.h
index b4c14b4..15b9bdf 100644
--- a/src/rocksdb/util/stl_wrappers.h
+++ b/src/rocksdb/util/stl_wrappers.h
@@ -2,31 +2,45 @@
 //  This source code is licensed under the BSD-style license found in the
 //  LICENSE file in the root directory of this source tree. An additional grant
 //  of patent rights can be found in the PATENTS file in the same directory.
-//
 #pragma once
 
-#include "util/murmurhash.h"
-#include "util/coding.h"
+#include <map>
+#include <string>
 
+#include "rocksdb/comparator.h"
 #include "rocksdb/memtablerep.h"
 #include "rocksdb/slice.h"
+#include "util/coding.h"
+#include "util/murmurhash.h"
 
 namespace rocksdb {
 namespace stl_wrappers {
-  class Base {
-   protected:
-    const MemTableRep::KeyComparator& compare_;
-    explicit Base(const MemTableRep::KeyComparator& compare)
-      : compare_(compare) { }
-  };
-
-  struct Compare : private Base {
-    explicit Compare(const MemTableRep::KeyComparator& compare)
-      : Base(compare) { }
-    inline bool operator()(const char* a, const char* b) const {
-      return compare_(a, b) < 0;
-    }
-  };
 
+class Base {
+ protected:
+  const MemTableRep::KeyComparator& compare_;
+  explicit Base(const MemTableRep::KeyComparator& compare)
+      : compare_(compare) {}
+};
+
+struct Compare : private Base {
+  explicit Compare(const MemTableRep::KeyComparator& compare) : Base(compare) {}
+  inline bool operator()(const char* a, const char* b) const {
+    return compare_(a, b) < 0;
+  }
+};
+
+struct LessOfComparator {
+  explicit LessOfComparator(const Comparator* c = BytewiseComparator())
+      : cmp(c) {}
+
+  bool operator()(const std::string& a, const std::string& b) const {
+    return cmp->Compare(Slice(a), Slice(b)) < 0;
+  }
+
+  const Comparator* cmp;
+};
+
+typedef std::map<std::string, std::string, LessOfComparator> KVMap;
 }
 }
diff --git a/src/rocksdb/util/stop_watch.h b/src/rocksdb/util/stop_watch.h
index 3637533..86cb265 100644
--- a/src/rocksdb/util/stop_watch.h
+++ b/src/rocksdb/util/stop_watch.h
@@ -67,6 +67,10 @@ class StopWatchNano {
     return elapsed;
   }
 
+  uint64_t ElapsedNanosSafe(bool reset = false) {
+    return (env_ != nullptr) ? ElapsedNanos(reset) : 0U;
+  }
+
  private:
   Env* const env_;
   uint64_t start_;
diff --git a/src/rocksdb/util/string_util.h b/src/rocksdb/util/string_util.h
index dfbe505..c7cc57d 100644
--- a/src/rocksdb/util/string_util.h
+++ b/src/rocksdb/util/string_util.h
@@ -3,11 +3,13 @@
 //  LICENSE file in the root directory of this source tree. An additional grant
 //  of patent rights can be found in the PATENTS file in the same directory.
 //
+
+#pragma once
+
 #include <sstream>
 #include <string>
 #include <vector>
 
-#pragma once
 namespace rocksdb {
 
 extern std::vector<std::string> StringSplit(const std::string& arg, char delim);
diff --git a/src/rocksdb/util/sync_point.cc b/src/rocksdb/util/sync_point.cc
index 3c224bf..7051b51 100644
--- a/src/rocksdb/util/sync_point.cc
+++ b/src/rocksdb/util/sync_point.cc
@@ -4,10 +4,25 @@
 //  of patent rights can be found in the PATENTS file in the same directory.
 
 #include "util/sync_point.h"
+#include "port/port.h"
+#include "util/random.h"
+
+int rocksdb_kill_odds = 0;
 
 #ifndef NDEBUG
 namespace rocksdb {
 
+void TestKillRandom(int odds, const std::string& srcfile, int srcline) {
+  time_t curtime = time(nullptr);
+  Random r((uint32_t)curtime);
+
+  assert(odds > 0);
+  bool crash = r.OneIn(odds);
+  if (crash) {
+    port::Crash(srcfile, srcline);
+  }
+}
+
 SyncPoint* SyncPoint::GetInstance() {
   static SyncPoint sync_point;
   return &sync_point;
diff --git a/src/rocksdb/util/sync_point.h b/src/rocksdb/util/sync_point.h
index 7827d28..6a4629c 100644
--- a/src/rocksdb/util/sync_point.h
+++ b/src/rocksdb/util/sync_point.h
@@ -4,6 +4,7 @@
 //  of patent rights can be found in the PATENTS file in the same directory.
 #pragma once
 
+#include <assert.h>
 #include <condition_variable>
 #include <mutex>
 #include <string>
@@ -11,6 +12,33 @@
 #include <unordered_map>
 #include <vector>
 
+// This is set only from db_stress.cc, and only for testing.
+// If non-zero, the process is killed at various points in the source code
+// with probability 1/rocksdb_kill_odds.
+extern int rocksdb_kill_odds;
+
+#ifdef NDEBUG
+// empty in release build
+#define TEST_KILL_RANDOM(rocksdb_kill_odds)
+#else
+
+namespace rocksdb {
+// Kill the process with probability 1/odds for testing.
+extern void TestKillRandom(int odds, const std::string& srcfile, int srcline);
+
+// To avoid always crashing at frequently executed codepaths (during the
+// kill-random test), use these factors to reduce the odds.
+#define REDUCE_ODDS 2
+#define REDUCE_ODDS2 4
+
+#define TEST_KILL_RANDOM(rocksdb_kill_odds)                  \
+  {                                                          \
+    if (rocksdb_kill_odds > 0) {                             \
+      TestKillRandom(rocksdb_kill_odds, __FILE__, __LINE__); \
+    }                                                        \
+  }
+}  // namespace rocksdb
+#endif
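+
+// Example: TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS2) kills the
+// process with probability 1/(4 * rocksdb_kill_odds) in debug builds and
+// compiles away entirely under NDEBUG.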
+
 #ifdef NDEBUG
 #define TEST_SYNC_POINT(x)
 #define TEST_SYNC_POINT_CALLBACK(x, y)
diff --git a/src/rocksdb/util/testutil.cc b/src/rocksdb/util/testutil.cc
index 20f22c2..5f74221 100644
--- a/src/rocksdb/util/testutil.cc
+++ b/src/rocksdb/util/testutil.cc
@@ -10,6 +10,7 @@
 #include "util/testutil.h"
 
 #include "port/port.h"
+#include "util/file_reader_writer.h"
 #include "util/random.h"
 
 namespace rocksdb {
@@ -107,5 +108,35 @@ const Comparator* Uint64Comparator() {
   return uint64comp;
 }
 
+WritableFileWriter* GetWritableFileWriter(WritableFile* wf) {
+  unique_ptr<WritableFile> file(wf);
+  return new WritableFileWriter(std::move(file), EnvOptions());
+}
+
+RandomAccessFileReader* GetRandomAccessFileReader(RandomAccessFile* raf) {
+  unique_ptr<RandomAccessFile> file(raf);
+  return new RandomAccessFileReader(std::move(file));
+}
+
+SequentialFileReader* GetSequentialFileReader(SequentialFile* se) {
+  unique_ptr<SequentialFile> file(se);
+  return new SequentialFileReader(std::move(file));
+}
+
+void CorruptKeyType(InternalKey* ikey) {
+  std::string keystr = ikey->Encode().ToString();
+  keystr[keystr.size() - 8] = kTypeLogData;
+  ikey->DecodeFrom(Slice(keystr.data(), keystr.size()));
+}
+
+std::string KeyStr(const std::string& user_key, const SequenceNumber& seq,
+                   const ValueType& t, bool corrupt) {
+  InternalKey k(user_key, seq, t);
+  if (corrupt) {
+    CorruptKeyType(&k);
+  }
+  return k.Encode().ToString();
+}
+
 }  // namespace test
 }  // namespace rocksdb
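
One detail worth spelling out (my reading of db/dbformat.h, not stated in the
patch): an encoded InternalKey is the user key followed by a little-endian
fixed64 packing (seq << 8) | type, so the type byte is the first of the
trailing eight bytes. That is why CorruptKeyType above pokes
keystr[keystr.size() - 8]. A sketch under that assumption:

    #include <cstdint>
    #include <string>

    // Builds the 8-byte tail of an internal key: (seq << 8) | type,
    // encoded as a little-endian fixed64.
    std::string PackSeqAndType(uint64_t seq, unsigned char type) {
      uint64_t packed = (seq << 8) | type;
      std::string tail(8, '\0');
      for (int i = 0; i < 8; ++i) {
        tail[i] = static_cast<char>(packed >> (8 * i));
      }
      return tail;  // tail[0] is the type byte, i.e. keystr[size - 8]
    }
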
diff --git a/src/rocksdb/util/testutil.h b/src/rocksdb/util/testutil.h
index 9584838..2980628 100644
--- a/src/rocksdb/util/testutil.h
+++ b/src/rocksdb/util/testutil.h
@@ -8,13 +8,22 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
 #pragma once
+#include <algorithm>
 #include <string>
+#include <vector>
+
 #include "db/dbformat.h"
+#include "rocksdb/compaction_filter.h"
 #include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
 #include "rocksdb/slice.h"
+#include "util/mutexlock.h"
 #include "util/random.h"
 
 namespace rocksdb {
+class SequentialFile;
+class SequentialFileReader;
+
 namespace test {
 
 // Store in *dst a random string of length "len" and return a Slice that
@@ -117,5 +126,254 @@ class SimpleSuffixReverseComparator : public Comparator {
 // endian machines.
 extern const Comparator* Uint64Comparator();
 
+// Iterator over a vector of keys/values
+class VectorIterator : public Iterator {
+ public:
+  explicit VectorIterator(const std::vector<std::string>& keys)
+      : keys_(keys), current_(keys.size()) {
+    std::sort(keys_.begin(), keys_.end());
+    values_.resize(keys.size());
+  }
+
+  VectorIterator(const std::vector<std::string>& keys,
+      const std::vector<std::string>& values)
+    : keys_(keys), values_(values), current_(keys.size()) {
+    assert(keys_.size() == values_.size());
+  }
+
+  virtual bool Valid() const override { return current_ < keys_.size(); }
+
+  virtual void SeekToFirst() override { current_ = 0; }
+  virtual void SeekToLast() override { current_ = keys_.size() - 1; }
+
+  virtual void Seek(const Slice& target) override {
+    current_ = std::lower_bound(keys_.begin(), keys_.end(), target.ToString()) -
+               keys_.begin();
+  }
+
+  virtual void Next() override { current_++; }
+  virtual void Prev() override { current_--; }
+
+  virtual Slice key() const override { return Slice(keys_[current_]); }
+  virtual Slice value() const override { return Slice(values_[current_]); }
+
+  virtual Status status() const override { return Status::OK(); }
+
+ private:
+  std::vector<std::string> keys_;
+  std::vector<std::string> values_;
+  size_t current_;
+};
+extern WritableFileWriter* GetWritableFileWriter(WritableFile* wf);
+
+extern RandomAccessFileReader* GetRandomAccessFileReader(RandomAccessFile* raf);
+
+extern SequentialFileReader* GetSequentialFileReader(SequentialFile* se);
+
+class StringSink: public WritableFile {
+ public:
+  std::string contents_;
+
+  explicit StringSink(Slice* reader_contents = nullptr) :
+      WritableFile(),
+      contents_(""),
+      reader_contents_(reader_contents),
+      last_flush_(0) {
+    if (reader_contents_ != nullptr) {
+      *reader_contents_ = Slice(contents_.data(), 0);
+    }
+  }
+
+  const std::string& contents() const { return contents_; }
+
+  virtual Status Truncate(uint64_t size) override {
+    contents_.resize(size);
+    return Status::OK();
+  }
+  virtual Status Close() override { return Status::OK(); }
+  virtual Status Flush() override {
+    if (reader_contents_ != nullptr) {
+      assert(reader_contents_->size() <= last_flush_);
+      size_t offset = last_flush_ - reader_contents_->size();
+      *reader_contents_ = Slice(
+          contents_.data() + offset,
+          contents_.size() - offset);
+      last_flush_ = contents_.size();
+    }
+
+    return Status::OK();
+  }
+  virtual Status Sync() override { return Status::OK(); }
+  virtual Status Append(const Slice& slice) override {
+    contents_.append(slice.data(), slice.size());
+    return Status::OK();
+  }
+  void Drop(size_t bytes) {
+    if (reader_contents_ != nullptr) {
+      contents_.resize(contents_.size() - bytes);
+      *reader_contents_ = Slice(
+          reader_contents_->data(), reader_contents_->size() - bytes);
+      last_flush_ = contents_.size();
+    }
+  }
+
+ private:
+  Slice* reader_contents_;
+  size_t last_flush_;
+};
+
+class StringSource: public RandomAccessFile {
+ public:
+  explicit StringSource(const Slice& contents, uint64_t uniq_id = 0,
+                        bool mmap = false)
+      : contents_(contents.data(), contents.size()),
+        uniq_id_(uniq_id),
+        mmap_(mmap) {}
+
+  virtual ~StringSource() { }
+
+  uint64_t Size() const { return contents_.size(); }
+
+  virtual Status Read(uint64_t offset, size_t n, Slice* result,
+      char* scratch) const override {
+    if (offset > contents_.size()) {
+      return Status::InvalidArgument("invalid Read offset");
+    }
+    if (offset + n > contents_.size()) {
+      n = contents_.size() - offset;
+    }
+    if (!mmap_) {
+      memcpy(scratch, &contents_[offset], n);
+      *result = Slice(scratch, n);
+    } else {
+      *result = Slice(&contents_[offset], n);
+    }
+    return Status::OK();
+  }
+
+  virtual size_t GetUniqueId(char* id, size_t max_size) const override {
+    if (max_size < 20) {
+      return 0;
+    }
+
+    char* rid = id;
+    rid = EncodeVarint64(rid, uniq_id_);
+    rid = EncodeVarint64(rid, 0);
+    return static_cast<size_t>(rid-id);
+  }
+
+ private:
+  std::string contents_;
+  uint64_t uniq_id_;
+  bool mmap_;
+};
+
+class NullLogger : public Logger {
+ public:
+  using Logger::Logv;
+  virtual void Logv(const char* format, va_list ap) override {}
+  virtual size_t GetLogFileSize() const override { return 0; }
+};
+
+// Corrupts a key by changing its value type.
+extern void CorruptKeyType(InternalKey* ikey);
+
+extern std::string KeyStr(const std::string& user_key,
+                          const SequenceNumber& seq, const ValueType& t,
+                          bool corrupt = false);
+
+class SleepingBackgroundTask {
+ public:
+  SleepingBackgroundTask()
+      : bg_cv_(&mutex_),
+        should_sleep_(true),
+        done_with_sleep_(false),
+        sleeping_(false) {}
+
+  bool IsSleeping() {
+    MutexLock l(&mutex_);
+    return sleeping_;
+  }
+  void DoSleep() {
+    MutexLock l(&mutex_);
+    sleeping_ = true;
+    while (should_sleep_) {
+      bg_cv_.Wait();
+    }
+    sleeping_ = false;
+    done_with_sleep_ = true;
+    bg_cv_.SignalAll();
+  }
+  void WakeUp() {
+    MutexLock l(&mutex_);
+    should_sleep_ = false;
+    bg_cv_.SignalAll();
+  }
+  void WaitUntilDone() {
+    MutexLock l(&mutex_);
+    while (!done_with_sleep_) {
+      bg_cv_.Wait();
+    }
+  }
+  bool WokenUp() {
+    MutexLock l(&mutex_);
+    return should_sleep_ == false;
+  }
+
+  void Reset() {
+    MutexLock l(&mutex_);
+    should_sleep_ = true;
+    done_with_sleep_ = false;
+  }
+
+  static void DoSleepTask(void* arg) {
+    reinterpret_cast<SleepingBackgroundTask*>(arg)->DoSleep();
+  }
+
+ private:
+  port::Mutex mutex_;
+  port::CondVar bg_cv_;  // Signalled when background work finishes
+  bool should_sleep_;
+  bool done_with_sleep_;
+  bool sleeping_;
+};
+
+// Filters merge operands and values that are equal to `num`.
+class FilterNumber : public CompactionFilter {
+ public:
+  explicit FilterNumber(uint64_t num) : num_(num) {}
+
+  std::string last_merge_operand_key() { return last_merge_operand_key_; }
+
+  bool Filter(int level, const rocksdb::Slice& key, const rocksdb::Slice& value,
+              std::string* new_value, bool* value_changed) const override {
+    if (value.size() == sizeof(uint64_t)) {
+      return num_ == DecodeFixed64(value.data());
+    }
+    return true;
+  }
+
+  bool FilterMergeOperand(int level, const rocksdb::Slice& key,
+                          const rocksdb::Slice& value) const override {
+    last_merge_operand_key_ = key.ToString();
+    if (value.size() == sizeof(uint64_t)) {
+      return num_ == DecodeFixed64(value.data());
+    }
+    return true;
+  }
+
+  const char* Name() const override { return "FilterBadMergeOperand"; }
+
+ private:
+  mutable std::string last_merge_operand_key_;
+  uint64_t num_;
+};
+
+inline std::string EncodeInt(uint64_t x) {
+  std::string result;
+  PutFixed64(&result, x);
+  return result;
+}
+
 }  // namespace test
 }  // namespace rocksdb
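
A usage note for SleepingBackgroundTask above (a sketch, not taken from the
patch): DoSleepTask matches the void(*)(void*) callback signature that
rocksdb::Env::Schedule takes, so a test can pin a background thread and then
release it deterministically:

    #include "rocksdb/env.h"
    #include "util/testutil.h"

    // Parks one LOW-priority background thread, then releases it.
    void PinOneBackgroundThread(rocksdb::Env* env) {
      rocksdb::test::SleepingBackgroundTask task;
      env->Schedule(&rocksdb::test::SleepingBackgroundTask::DoSleepTask,
                    &task, rocksdb::Env::Priority::LOW);
      while (!task.IsSleeping()) {
        env->SleepForMicroseconds(1000);  // wait for the thread to park
      }
      task.WakeUp();
      task.WaitUntilDone();
    }
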
diff --git a/src/rocksdb/util/thread_local.cc b/src/rocksdb/util/thread_local.cc
index af0c8e1..21adf4f 100644
--- a/src/rocksdb/util/thread_local.cc
+++ b/src/rocksdb/util/thread_local.cc
@@ -15,10 +15,94 @@
 namespace rocksdb {
 
 port::Mutex ThreadLocalPtr::StaticMeta::mutex_;
-#if !defined(OS_MACOSX)
+#if ROCKSDB_SUPPORT_THREAD_LOCAL
 __thread ThreadLocalPtr::ThreadData* ThreadLocalPtr::StaticMeta::tls_ = nullptr;
 #endif
 
+// Windows doesn't support a per-thread destructor with its
+// TLS primitives.  So, we build it manually by inserting a
+// function to be called on each thread's exit.
+// See http://www.codeproject.com/Articles/8113/Thread-Local-Storage-The-C-Way
+// and http://www.nynaeve.net/?p=183
+//
+// Really we do this to have a clear conscience: using TLS with thread pools
+// is iffy. It is OK within a single request, but otherwise pool threads have
+// no stable identity in modern use, so per-thread state must be torn down on
+// each thread's exit.
+
+// This runs on Windows only, called from the system loader.
+#ifdef OS_WIN
+
+// The Windows cleanup routine is invoked by the system loader with a
+// different signature, so we cannot directly hook up the original
+// OnThreadExit (a private member). Instead, the StaticMeta class shares the
+// address of that function with us so we can invoke it.
+namespace wintlscleanup {
+
+// This is set to OnThreadExit in StaticMeta singleton constructor
+UnrefHandler thread_local_inclass_routine = nullptr;
+pthread_key_t thread_local_key = -1;
+
+// Static callback function to call with each thread termination.
+void NTAPI WinOnThreadExit(PVOID module, DWORD reason, PVOID reserved) {
+  // We decided to punt on PROCESS_EXIT
+  if (DLL_THREAD_DETACH == reason) {
+    if (thread_local_key != -1 && thread_local_inclass_routine != nullptr) {
+      void* tls = pthread_getspecific(thread_local_key);
+      if (tls != nullptr) {
+        thread_local_inclass_routine(tls);
+      }
+    }
+  }
+}
+
+}  // wintlscleanup
+
+#ifdef _WIN64
+
+#pragma comment(linker, "/include:_tls_used")
+#pragma comment(linker, "/include:p_thread_callback_on_exit")
+
+#else  // _WIN64
+
+#pragma comment(linker, "/INCLUDE:__tls_used")
+#pragma comment(linker, "/INCLUDE:_p_thread_callback_on_exit")
+
+#endif  // _WIN64
+
+// extern "C" suppresses C++ name mangling so we know the symbol name for the
+// linker /INCLUDE:symbol pragma above.
+extern "C" {
+
+// The linker must not discard thread_callback_on_exit.  (We force a reference
+// to this variable with a linker /include:symbol pragma to ensure that.) If
+// this variable is discarded, the OnThreadExit function will never be called.
+#ifdef _WIN64
+
+// .CRT section is merged with .rdata on x64 so it must be constant data.
+#pragma const_seg(".CRT$XLB")
+// When defining a const variable, it must have external linkage to be sure the
+// linker doesn't discard it.
+extern const PIMAGE_TLS_CALLBACK p_thread_callback_on_exit;
+const PIMAGE_TLS_CALLBACK p_thread_callback_on_exit =
+    wintlscleanup::WinOnThreadExit;
+// Reset the default section.
+#pragma const_seg()
+
+#else  // _WIN64
+
+#pragma data_seg(".CRT$XLB")
+PIMAGE_TLS_CALLBACK p_thread_callback_on_exit = wintlscleanup::WinOnThreadExit;
+// Reset the default section.
+#pragma data_seg()
+
+#endif  // _WIN64
+
+}  // extern "C"
+
+#endif  // OS_WIN
+
 ThreadLocalPtr::StaticMeta* ThreadLocalPtr::Instance() {
   static ThreadLocalPtr::StaticMeta inst;
   return &inst;
@@ -53,8 +137,41 @@ ThreadLocalPtr::StaticMeta::StaticMeta() : next_instance_id_(0) {
   if (pthread_key_create(&pthread_key_, &OnThreadExit) != 0) {
     abort();
   }
+
+  // OnThreadExit does not get called for the main thread.
+  // Go through the static-destructor mechanism instead to avoid a memory
+  // leak.
+  //
+  // Caveats: ~A() will be invoked _after_ ~StaticMeta for the global
+  // singleton (destructors are invoked in reverse order of constructor
+  // _completion_); the latter must not mutate internal members. This
+  // cleanup mechanism inherently relies on use-after-release of the
+  // StaticMeta, and is brittle with respect to compiler-specific handling
+  // of memory backing destructed statically-scoped objects. Perhaps
+  // registering with atexit(3) would be more robust.
+  //
+  // This is not required on Windows.
+#if !defined(OS_WIN)
+  static struct A {
+    ~A() {
+#if !(ROCKSDB_SUPPORT_THREAD_LOCAL)
+      ThreadData* tls_ =
+        static_cast<ThreadData*>(pthread_getspecific(Instance()->pthread_key_));
+#endif
+      if (tls_) {
+        OnThreadExit(tls_);
+      }
+    }
+  } a;
+#endif  // !defined(OS_WIN)
+
   head_.next = &head_;
   head_.prev = &head_;
+
+#ifdef OS_WIN
+  // Share with Windows its cleanup routine and the key
+  wintlscleanup::thread_local_inclass_routine = OnThreadExit;
+  wintlscleanup::thread_local_key = pthread_key_;
+#endif
 }
 
 void ThreadLocalPtr::StaticMeta::AddThreadData(ThreadLocalPtr::ThreadData* d) {
@@ -74,7 +191,7 @@ void ThreadLocalPtr::StaticMeta::RemoveThreadData(
 }
 
 ThreadLocalPtr::ThreadData* ThreadLocalPtr::StaticMeta::GetThreadLocal() {
-#if defined(OS_MACOSX)
+#if !(ROCKSDB_SUPPORT_THREAD_LOCAL)
   // Make this local variable name look like a member variable so that we
   // can share all the code below
   ThreadData* tls_ =
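
Background for the machinery above (illustration only, not part of the
patch): on POSIX the per-thread destructor comes for free from
pthread_key_create, which is exactly the hook Windows lacks and the .CRT$XLB
callback emulates. Note the destructor also does not fire for main(), hence
the static-destructor fallback added above. A self-contained sketch:

    #include <pthread.h>
    #include <cstdio>

    static pthread_key_t key;

    // Runs at each worker thread's exit -- but not for main().
    static void OnExit(void* p) {
      std::printf("cleanup %p\n", p);
      delete static_cast<int*>(p);
    }

    static void* Worker(void*) {
      pthread_setspecific(key, new int(42));
      return nullptr;  // OnExit fires as this thread dies
    }

    int main() {
      pthread_key_create(&key, &OnExit);
      pthread_t t;
      pthread_create(&t, nullptr, &Worker, nullptr);
      pthread_join(t, nullptr);
      return 0;  // main() exits without OnExit -- the leak the patch plugs
    }
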
diff --git a/src/rocksdb/util/thread_local.h b/src/rocksdb/util/thread_local.h
index 6884ed1..828a737 100644
--- a/src/rocksdb/util/thread_local.h
+++ b/src/rocksdb/util/thread_local.h
@@ -15,8 +15,12 @@
 #include <vector>
 
 #include "util/autovector.h"
-#include "port/port_posix.h"
-#include "util/thread_local.h"
+#include "port/port.h"
+
+#ifndef ROCKSDB_SUPPORT_THREAD_LOCAL
+#define ROCKSDB_SUPPORT_THREAD_LOCAL \
+  !defined(OS_WIN) && !defined(OS_MACOSX) && !defined(IOS_CROSS_COMPILE)
+#endif
 
 namespace rocksdb {
 
@@ -150,10 +154,11 @@ class ThreadLocalPtr {
     // protect inst, next_instance_id_, free_instance_ids_, head_,
     // ThreadData.entries
     static port::Mutex mutex_;
-#if !defined(OS_MACOSX)
+#if ROCKSDB_SUPPORT_THREAD_LOCAL
     // Thread local storage
     static __thread ThreadData* tls_;
 #endif
+
     // Used to make thread exit trigger possible if !defined(OS_MACOSX).
     // Otherwise, used to retrieve thread data.
     pthread_key_t pthread_key_;
diff --git a/src/rocksdb/util/thread_local_test.cc b/src/rocksdb/util/thread_local_test.cc
index 49e7775..a78a849 100644
--- a/src/rocksdb/util/thread_local_test.cc
+++ b/src/rocksdb/util/thread_local_test.cc
@@ -6,7 +6,7 @@
 #include <atomic>
 
 #include "rocksdb/env.h"
-#include "port/port_posix.h"
+#include "port/port.h"
 #include "util/autovector.h"
 #include "util/thread_local.h"
 #include "util/testharness.h"
diff --git a/src/rocksdb/util/thread_operation.h b/src/rocksdb/util/thread_operation.h
index 709e755..e55596c 100644
--- a/src/rocksdb/util/thread_operation.h
+++ b/src/rocksdb/util/thread_operation.h
@@ -13,7 +13,7 @@
 
 #pragma once
 
-#include "include/rocksdb/thread_status.h"
+#include "rocksdb/thread_status.h"
 
 #include <string>
 
@@ -61,8 +61,6 @@ static OperationStageInfo global_op_stage_table[] = {
       "CompactionJob::Run"},
   {ThreadStatus::STAGE_COMPACTION_PROCESS_KV,
       "CompactionJob::ProcessKeyValueCompaction"},
-  {ThreadStatus::STAGE_COMPACTION_FILTER_V2,
-      "CompactionJob::CallCompactionFilterV2"},
   {ThreadStatus::STAGE_COMPACTION_INSTALL,
       "CompactionJob::Install"},
   {ThreadStatus::STAGE_COMPACTION_SYNC_FILE,
diff --git a/src/rocksdb/util/thread_status_impl.cc b/src/rocksdb/util/thread_status_impl.cc
index bd64d44..50cb355 100644
--- a/src/rocksdb/util/thread_status_impl.cc
+++ b/src/rocksdb/util/thread_status_impl.cc
@@ -100,7 +100,7 @@ std::map<std::string, uint64_t>
       property_map.insert(
           {"BaseInputLevel", op_properties[i] >> 32});
       property_map.insert(
-          {"OutputLevel", op_properties[i] % (1ULL << 32)});
+          {"OutputLevel", op_properties[i] % (uint64_t(1) << 32U)});
     } else if (op_type == OP_COMPACTION &&
                i == COMPACTION_PROP_FLAGS) {
       property_map.insert(
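
The modulo rewrite above is behavior-preserving: the compaction property
packs BaseInputLevel into the high 32 bits and OutputLevel into the low 32
bits of one uint64_t, and `% (uint64_t(1) << 32U)` simply masks off the low
half. A worked example (illustration only):

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint64_t base_input_level = 1, output_level = 2;
      const uint64_t prop = (base_input_level << 32) | output_level;
      assert((prop >> 32) == 1);                  // "BaseInputLevel"
      assert(prop % (uint64_t(1) << 32U) == 2);   // "OutputLevel"
      return 0;
    }
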
diff --git a/src/rocksdb/util/thread_status_updater.cc b/src/rocksdb/util/thread_status_updater.cc
index 31845cc..3b93f20 100644
--- a/src/rocksdb/util/thread_status_updater.cc
+++ b/src/rocksdb/util/thread_status_updater.cc
@@ -15,6 +15,19 @@ namespace rocksdb {
 
 __thread ThreadStatusData* ThreadStatusUpdater::thread_status_data_ = nullptr;
 
+void ThreadStatusUpdater::RegisterThread(
+    ThreadStatus::ThreadType ttype, uint64_t thread_id) {
+  if (UNLIKELY(thread_status_data_ == nullptr)) {
+    thread_status_data_ = new ThreadStatusData();
+    thread_status_data_->thread_type = ttype;
+    thread_status_data_->thread_id = thread_id;
+    std::lock_guard<std::mutex> lck(thread_list_mutex_);
+    thread_data_set_.insert(thread_status_data_);
+  }
+
+  ClearThreadOperationProperties();
+}
+
 void ThreadStatusUpdater::UnregisterThread() {
   if (thread_status_data_ != nullptr) {
     std::lock_guard<std::mutex> lck(thread_list_mutex_);
@@ -24,13 +37,6 @@ void ThreadStatusUpdater::UnregisterThread() {
   }
 }
 
-void ThreadStatusUpdater::SetThreadType(
-    ThreadStatus::ThreadType ttype) {
-  auto* data = InitAndGet();
-  data->thread_type.store(ttype, std::memory_order_relaxed);
-  ClearThreadOperationProperties();
-}
-
 void ThreadStatusUpdater::ResetThreadStatus() {
   ClearThreadState();
   ClearThreadOperation();
@@ -39,17 +45,20 @@ void ThreadStatusUpdater::ResetThreadStatus() {
 
 void ThreadStatusUpdater::SetColumnFamilyInfoKey(
     const void* cf_key) {
-  auto* data = InitAndGet();
+  auto* data = Get();
+  if (data == nullptr) {
+    return;
+  }
   // set the tracking flag based on whether cf_key is non-null or not.
   // If enable_thread_tracking is set to false, the input cf_key
   // would be nullptr.
   data->enable_tracking = (cf_key != nullptr);
-  data->cf_key.store(cf_key, std::memory_order_relaxed);
+  data->cf_key.store(const_cast<void*>(cf_key), std::memory_order_relaxed);
 }
 
 const void* ThreadStatusUpdater::GetColumnFamilyInfoKey() {
-  auto* data = InitAndGet();
-  if (data->enable_tracking == false) {
+  auto* data = GetLocalThreadStatus();
+  if (data == nullptr) {
     return nullptr;
   }
   return data->cf_key.load(std::memory_order_relaxed);
@@ -57,9 +66,8 @@ const void* ThreadStatusUpdater::GetColumnFamilyInfoKey() {
 
 void ThreadStatusUpdater::SetThreadOperation(
     const ThreadStatus::OperationType type) {
-  auto* data = InitAndGet();
-  if (!data->enable_tracking) {
-    assert(data->cf_key.load(std::memory_order_relaxed) == nullptr);
+  auto* data = GetLocalThreadStatus();
+  if (data == nullptr) {
     return;
   }
   // NOTE: Our practice here is to set all the thread operation properties
@@ -77,9 +85,8 @@ void ThreadStatusUpdater::SetThreadOperation(
 
 void ThreadStatusUpdater::SetThreadOperationProperty(
     int i, uint64_t value) {
-  auto* data = InitAndGet();
-  if (!data->enable_tracking) {
-    assert(data->cf_key.load(std::memory_order_relaxed) == nullptr);
+  auto* data = GetLocalThreadStatus();
+  if (data == nullptr) {
     return;
   }
   data->op_properties[i].store(value, std::memory_order_relaxed);
@@ -87,27 +94,24 @@ void ThreadStatusUpdater::SetThreadOperationProperty(
 
 void ThreadStatusUpdater::IncreaseThreadOperationProperty(
     int i, uint64_t delta) {
-  auto* data = InitAndGet();
-  if (!data->enable_tracking) {
-    assert(data->cf_key.load(std::memory_order_relaxed) == nullptr);
+  auto* data = GetLocalThreadStatus();
+  if (data == nullptr) {
     return;
   }
   data->op_properties[i].fetch_add(delta, std::memory_order_relaxed);
 }
 
 void ThreadStatusUpdater::SetOperationStartTime(const uint64_t start_time) {
-  auto* data = InitAndGet();
-  if (!data->enable_tracking) {
-    assert(data->cf_key.load(std::memory_order_relaxed) == nullptr);
+  auto* data = GetLocalThreadStatus();
+  if (data == nullptr) {
     return;
   }
   data->op_start_time.store(start_time, std::memory_order_relaxed);
 }
 
 void ThreadStatusUpdater::ClearThreadOperation() {
-  auto* data = InitAndGet();
-  if (!data->enable_tracking) {
-    assert(data->cf_key.load(std::memory_order_relaxed) == nullptr);
+  auto* data = GetLocalThreadStatus();
+  if (data == nullptr) {
     return;
   }
   data->operation_stage.store(ThreadStatus::STAGE_UNKNOWN,
@@ -118,9 +122,8 @@ void ThreadStatusUpdater::ClearThreadOperation() {
 }
 
 void ThreadStatusUpdater::ClearThreadOperationProperties() {
-  auto* data = InitAndGet();
-  if (!data->enable_tracking) {
-    assert(data->cf_key.load(std::memory_order_relaxed) == nullptr);
+  auto* data = GetLocalThreadStatus();
+  if (data == nullptr) {
     return;
   }
   for (int i = 0; i < ThreadStatus::kNumOperationProperties; ++i) {
@@ -130,9 +133,8 @@ void ThreadStatusUpdater::ClearThreadOperationProperties() {
 
 ThreadStatus::OperationStage ThreadStatusUpdater::SetThreadOperationStage(
     ThreadStatus::OperationStage stage) {
-  auto* data = InitAndGet();
-  if (!data->enable_tracking) {
-    assert(data->cf_key.load(std::memory_order_relaxed) == nullptr);
+  auto* data = GetLocalThreadStatus();
+  if (data == nullptr) {
     return ThreadStatus::STAGE_UNKNOWN;
   }
   return data->operation_stage.exchange(
@@ -141,18 +143,16 @@ ThreadStatus::OperationStage ThreadStatusUpdater::SetThreadOperationStage(
 
 void ThreadStatusUpdater::SetThreadState(
     const ThreadStatus::StateType type) {
-  auto* data = InitAndGet();
-  if (!data->enable_tracking) {
-    assert(data->cf_key.load(std::memory_order_relaxed) == nullptr);
+  auto* data = GetLocalThreadStatus();
+  if (data == nullptr) {
     return;
   }
   data->state_type.store(type, std::memory_order_relaxed);
 }
 
 void ThreadStatusUpdater::ClearThreadState() {
-  auto* data = InitAndGet();
-  if (!data->enable_tracking) {
-    assert(data->cf_key.load(std::memory_order_relaxed) == nullptr);
+  auto* data = GetLocalThreadStatus();
+  if (data == nullptr) {
     return;
   }
   data->state_type.store(
@@ -168,6 +168,8 @@ Status ThreadStatusUpdater::GetThreadList(
   std::lock_guard<std::mutex> lck(thread_list_mutex_);
   for (auto* thread_data : thread_data_set_) {
     assert(thread_data);
+    auto thread_id = thread_data->thread_id.load(
+        std::memory_order_relaxed);
     auto thread_type = thread_data->thread_type.load(
         std::memory_order_relaxed);
     // Since any change to cf_info_map requires thread_list_mutex,
@@ -176,7 +178,6 @@ Status ThreadStatusUpdater::GetThreadList(
     auto cf_key = thread_data->cf_key.load(
         std::memory_order_relaxed);
     auto iter = cf_info_map_.find(cf_key);
-    assert(cf_key == 0 || iter != cf_info_map_.end());
     auto* cf_info = iter != cf_info_map_.end() ?
         iter->second.get() : nullptr;
     const std::string* db_name = nullptr;
@@ -206,7 +207,7 @@ Status ThreadStatusUpdater::GetThreadList(
       }
     }
     thread_list->emplace_back(
-        thread_data->thread_id, thread_type,
+        thread_id, thread_type,
         db_name ? *db_name : "",
         cf_name ? *cf_name : "",
         op_type, op_elapsed_micros, op_stage, op_props,
@@ -216,13 +217,14 @@ Status ThreadStatusUpdater::GetThreadList(
   return Status::OK();
 }
 
-ThreadStatusData* ThreadStatusUpdater::InitAndGet() {
-  if (UNLIKELY(thread_status_data_ == nullptr)) {
-    thread_status_data_ = new ThreadStatusData();
-    thread_status_data_->thread_id = reinterpret_cast<uint64_t>(
-        thread_status_data_);
-    std::lock_guard<std::mutex> lck(thread_list_mutex_);
-    thread_data_set_.insert(thread_status_data_);
+ThreadStatusData* ThreadStatusUpdater::GetLocalThreadStatus() {
+  if (thread_status_data_ == nullptr) {
+    return nullptr;
+  }
+  if (!thread_status_data_->enable_tracking) {
+    assert(thread_status_data_->cf_key.load(
+        std::memory_order_relaxed) == nullptr);
+    return nullptr;
   }
   return thread_status_data_;
 }
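
The net effect of this refactoring, as far as one can tell from the hunks
above: a thread is tracked only after an explicit RegisterThread, and every
setter silently no-ops for unregistered threads or when tracking is disabled,
instead of lazily allocating state in InitAndGet. A hedged sketch of the
resulting call sequence (the flow is illustrative, not taken from the patch):

    // Assumes a ThreadStatusUpdater* `updater` and a thread id from Env.
    void TrackFlush(rocksdb::ThreadStatusUpdater* updater, uint64_t tid) {
      updater->RegisterThread(rocksdb::ThreadStatus::HIGH_PRIORITY, tid);
      updater->SetThreadOperation(rocksdb::ThreadStatus::OP_FLUSH);
      // ... perform the flush; without a prior RegisterThread (or with
      // tracking disabled) the call above is a silent no-op ...
      updater->ClearThreadOperation();
      updater->UnregisterThread();
    }
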
@@ -286,14 +288,14 @@ void ThreadStatusUpdater::EraseDatabaseInfo(const void* db_key) {
 
 #else
 
-void ThreadStatusUpdater::UnregisterThread() {
+void ThreadStatusUpdater::RegisterThread(
+    ThreadStatus::ThreadType ttype, uint64_t thread_id) {
 }
 
-void ThreadStatusUpdater::ResetThreadStatus() {
+void ThreadStatusUpdater::UnregisterThread() {
 }
 
-void ThreadStatusUpdater::SetThreadType(
-    ThreadStatus::ThreadType ttype) {
+void ThreadStatusUpdater::ResetThreadStatus() {
 }
 
 void ThreadStatusUpdater::SetColumnFamilyInfoKey(
diff --git a/src/rocksdb/util/thread_status_updater.h b/src/rocksdb/util/thread_status_updater.h
index b511a8d..e7c7007 100644
--- a/src/rocksdb/util/thread_status_updater.h
+++ b/src/rocksdb/util/thread_status_updater.h
@@ -38,7 +38,7 @@
 
 #include "rocksdb/status.h"
 #include "rocksdb/thread_status.h"
-#include "port/port_posix.h"
+#include "port/port.h"
 #include "util/thread_operation.h"
 
 namespace rocksdb {
@@ -64,7 +64,8 @@ struct ConstantColumnFamilyInfo {
 // status of a thread using a set of atomic pointers.
 struct ThreadStatusData {
 #if ROCKSDB_USING_THREAD_STATUS
-  explicit ThreadStatusData() : thread_id(0), enable_tracking(false) {
+  explicit ThreadStatusData() : enable_tracking(false) {
+    thread_id.store(0);
     thread_type.store(ThreadStatus::USER);
     cf_key.store(nullptr);
     operation_type.store(ThreadStatus::OP_UNKNOWN);
@@ -72,8 +73,6 @@ struct ThreadStatusData {
     state_type.store(ThreadStatus::STATE_UNKNOWN);
   }
 
-  uint64_t thread_id;
-
   // A flag to indicate whether the thread tracking is enabled
   // in the current thread.  This value will be updated based on whether
   // the associated Options::enable_thread_tracking is set to true
@@ -83,8 +82,9 @@ struct ThreadStatusData {
   // will be no-op.
   bool enable_tracking;
 
+  std::atomic<uint64_t> thread_id;
   std::atomic<ThreadStatus::ThreadType> thread_type;
-  std::atomic<const void*> cf_key;
+  std::atomic<void*> cf_key;
   std::atomic<ThreadStatus::OperationType> operation_type;
   std::atomic<uint64_t> op_start_time;
   std::atomic<ThreadStatus::OperationStage> operation_stage;
@@ -115,8 +115,11 @@ class ThreadStatusUpdater {
   // ColumnFamilyInfoKey, ThreadOperation, and ThreadState.
   void ResetThreadStatus();
 
-  // Set the thread type of the current thread.
-  void SetThreadType(ThreadStatus::ThreadType ttype);
+  // Set the id of the current thread.
+  void SetThreadID(uint64_t thread_id);
+
+  // Register the current thread for tracking.
+  void RegisterThread(ThreadStatus::ThreadType ttype, uint64_t thread_id);
 
   // Update the column-family info of the current thread by setting
   // its thread-local pointer of ThreadStateInfo to the correct entry.
@@ -195,9 +198,15 @@ class ThreadStatusUpdater {
   // The thread-local variable for storing thread status.
   static __thread ThreadStatusData* thread_status_data_;
 
-  // Obtain the pointer to the thread status data.  It also performs
-  // initialization when necessary.
-  ThreadStatusData* InitAndGet();
+  // Returns the pointer to the thread status data only when the
+  // thread status data is non-null and has enable_tracking == true.
+  ThreadStatusData* GetLocalThreadStatus();
+
+  // Directly returns the pointer to thread_status_data_ without
+  // checking whether enable_tracking is true or not.
+  ThreadStatusData* Get() {
+    return thread_status_data_;
+  }
 
   // The mutex that protects cf_info_map and db_key_map.
   std::mutex thread_list_mutex_;
diff --git a/src/rocksdb/util/thread_status_util.cc b/src/rocksdb/util/thread_status_util.cc
index c498971..e67a8e4 100644
--- a/src/rocksdb/util/thread_status_util.cc
+++ b/src/rocksdb/util/thread_status_util.cc
@@ -15,13 +15,14 @@ __thread ThreadStatusUpdater*
     ThreadStatusUtil::thread_updater_local_cache_ = nullptr;
 __thread bool ThreadStatusUtil::thread_updater_initialized_ = false;
 
-void ThreadStatusUtil::SetThreadType(
+void ThreadStatusUtil::RegisterThread(
     const Env* env, ThreadStatus::ThreadType thread_type) {
   if (!MaybeInitThreadLocalUpdater(env)) {
     return;
   }
   assert(thread_updater_local_cache_);
-  thread_updater_local_cache_->SetThreadType(thread_type);
+  thread_updater_local_cache_->RegisterThread(
+      thread_type, env->GetThreadID());
 }
 
 void ThreadStatusUtil::UnregisterThread() {
diff --git a/src/rocksdb/util/thread_status_util.h b/src/rocksdb/util/thread_status_util.h
index ba0238d..aa13a6c 100644
--- a/src/rocksdb/util/thread_status_util.h
+++ b/src/rocksdb/util/thread_status_util.h
@@ -27,8 +27,8 @@ class ColumnFamilyData;
 // all function calls to ThreadStatusUtil will be no-op.
 class ThreadStatusUtil {
  public:
-  // Set the thread type of the current thread.
-  static void SetThreadType(
+  // Register the current thread for tracking.
+  static void RegisterThread(
       const Env* env, ThreadStatus::ThreadType thread_type);
 
   // Unregister the current thread.
diff --git a/src/rocksdb/util/vectorrep.cc b/src/rocksdb/util/vectorrep.cc
index 4e4827a..017f89f 100644
--- a/src/rocksdb/util/vectorrep.cc
+++ b/src/rocksdb/util/vectorrep.cc
@@ -50,7 +50,7 @@ class VectorRep : public MemTableRep {
   class Iterator : public MemTableRep::Iterator {
     class VectorRep* vrep_;
     std::shared_ptr<std::vector<const char*>> bucket_;
-    typename std::vector<const char*>::const_iterator mutable cit_;
+    std::vector<const char*>::const_iterator mutable cit_;
     const KeyComparator& compare_;
     std::string tmp_;       // For passing to EncodeKey
     bool mutable sorted_;
diff --git a/src/rocksdb/util/xfunc.cc b/src/rocksdb/util/xfunc.cc
index c5d6b5a..98de1c5 100644
--- a/src/rocksdb/util/xfunc.cc
+++ b/src/rocksdb/util/xfunc.cc
@@ -7,7 +7,11 @@
 #include <string>
 #include "db/db_impl.h"
 #include "db/managed_iterator.h"
+#include "db/write_callback.h"
+#include "rocksdb/db.h"
 #include "rocksdb/options.h"
+#include "rocksdb/utilities/optimistic_transaction_db.h"
+#include "rocksdb/write_batch.h"
 #include "util/xfunc.h"
 
 
@@ -64,6 +68,116 @@ void xf_manage_new(DBImpl* db, ReadOptions* read_options,
 
 void xf_manage_create(ManagedIterator* iter) { iter->SetDropOld(false); }
 
+void xf_transaction_set_memtable_history(
+    int32_t* max_write_buffer_number_to_maintain) {
+  *max_write_buffer_number_to_maintain = 10;
+}
+
+void xf_transaction_clear_memtable_history(
+    int32_t* max_write_buffer_number_to_maintain) {
+  *max_write_buffer_number_to_maintain = 0;
+}
+
+class XFTransactionWriteHandler : public WriteBatch::Handler {
+ public:
+  OptimisticTransaction* txn_;
+  DBImpl* db_impl_;
+
+  XFTransactionWriteHandler(OptimisticTransaction* txn, DBImpl* db_impl)
+      : txn_(txn), db_impl_(db_impl) {}
+
+  virtual Status PutCF(uint32_t column_family_id, const Slice& key,
+                       const Slice& value) override {
+    InstrumentedMutexLock l(&db_impl_->mutex_);
+
+    ColumnFamilyHandle* cfh = db_impl_->GetColumnFamilyHandle(column_family_id);
+    if (cfh == nullptr) {
+      return Status::InvalidArgument(
+          "XFUNC test could not find column family "
+          "handle for id ",
+          ToString(column_family_id));
+    }
+
+    txn_->Put(cfh, key, value);
+
+    return Status::OK();
+  }
+
+  virtual Status MergeCF(uint32_t column_family_id, const Slice& key,
+                         const Slice& value) override {
+    InstrumentedMutexLock l(&db_impl_->mutex_);
+
+    ColumnFamilyHandle* cfh = db_impl_->GetColumnFamilyHandle(column_family_id);
+    if (cfh == nullptr) {
+      return Status::InvalidArgument(
+          "XFUNC test could not find column family "
+          "handle for id ",
+          ToString(column_family_id));
+    }
+
+    txn_->Merge(cfh, key, value);
+
+    return Status::OK();
+  }
+
+  virtual Status DeleteCF(uint32_t column_family_id,
+                          const Slice& key) override {
+    InstrumentedMutexLock l(&db_impl_->mutex_);
+
+    ColumnFamilyHandle* cfh = db_impl_->GetColumnFamilyHandle(column_family_id);
+    if (cfh == nullptr) {
+      return Status::InvalidArgument(
+          "XFUNC test could not find column family "
+          "handle for id ",
+          ToString(column_family_id));
+    }
+
+    txn_->Delete(cfh, key);
+
+    return Status::OK();
+  }
+
+  virtual void LogData(const Slice& blob) override { txn_->PutLogData(blob); }
+};
+
+// Whenever DBImpl::Write is called, create a transaction and do the write via
+// the transaction.
+void xf_transaction_write(const WriteOptions& write_options,
+                          const DBOptions& db_options, WriteBatch* my_batch,
+                          WriteCallback* callback, DBImpl* db_impl, Status* s,
+                          bool* write_attempted) {
+  if (callback != nullptr) {
+    // We may already be in a transaction, don't force a transaction
+    *write_attempted = false;
+    return;
+  }
+
+  OptimisticTransactionDB* txn_db = new OptimisticTransactionDB(db_impl);
+  OptimisticTransaction* txn =
+      OptimisticTransaction::BeginTransaction(txn_db, write_options);
+
+  XFTransactionWriteHandler handler(txn, db_impl);
+  *s = my_batch->Iterate(&handler);
+
+  if (!s->ok()) {
+    Log(InfoLogLevel::ERROR_LEVEL, db_options.info_log,
+        "XFUNC test could not iterate batch.  status: %s\n",
+        s->ToString().c_str());
+  }
+
+  *s = txn->Commit();
+
+  if (!s->ok()) {
+    Log(InfoLogLevel::ERROR_LEVEL, db_options.info_log,
+        "XFUNC test could not commit transaction.  status: %s\n",
+        s->ToString().c_str());
+  }
+
+  *write_attempted = true;
+  delete txn;
+  delete txn_db;
+}
+
 }  // namespace rocksdb
 
 #endif  // XFUNC
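
For readers unfamiliar with the pattern above: WriteBatch::Iterate replays
every record of a batch against a WriteBatch::Handler, which is how the XFUNC
hook re-issues each Put/Merge/Delete through a transaction. A stripped-down
counting handler as a sketch (illustration only, not part of the patch):

    #include "rocksdb/write_batch.h"

    // Counts the records in a batch; a minimal version of the pattern
    // XFTransactionWriteHandler uses above.
    class CountingHandler : public rocksdb::WriteBatch::Handler {
     public:
      int puts = 0, deletes = 0;
      rocksdb::Status PutCF(uint32_t /*cf*/, const rocksdb::Slice& /*k*/,
                            const rocksdb::Slice& /*v*/) override {
        ++puts;
        return rocksdb::Status::OK();
      }
      rocksdb::Status DeleteCF(uint32_t /*cf*/,
                               const rocksdb::Slice& /*k*/) override {
        ++deletes;
        return rocksdb::Status::OK();
      }
    };

    rocksdb::Status CountPuts(rocksdb::WriteBatch* batch, int* puts) {
      CountingHandler h;
      rocksdb::Status s = batch->Iterate(&h);
      *puts = h.puts;
      return s;
    }
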
diff --git a/src/rocksdb/util/xfunc.h b/src/rocksdb/util/xfunc.h
index 78004cb..2b3b0e3 100644
--- a/src/rocksdb/util/xfunc.h
+++ b/src/rocksdb/util/xfunc.h
@@ -32,6 +32,7 @@ namespace rocksdb {
 #else
 
 struct Options;
+struct WriteOptions;
 class ManagedIterator;
 class DBImpl;
 void GetXFTestOptions(Options* options, int skip_policy);
@@ -40,6 +41,15 @@ void xf_manage_new(DBImpl* db, ReadOptions* readoptions,
                    bool is_snapshot_supported);
 void xf_manage_create(ManagedIterator* iter);
 void xf_manage_options(ReadOptions* read_options);
+void xf_transaction_set_memtable_history(
+    int32_t* max_write_buffer_number_to_maintain);
+void xf_transaction_clear_memtable_history(
+    int32_t* max_write_buffer_number_to_maintain);
+void xf_transaction_write(const WriteOptions& write_options,
+                          const DBOptions& db_options,
+                          class WriteBatch* my_batch,
+                          class WriteCallback* callback, DBImpl* db_impl,
+                          Status* success, bool* write_attempted);
 
 // This class provides the facility to run custom code to test a specific
 // feature typically with all existing unit tests.
diff --git a/src/rocksdb/utilities/backupable/backupable_db.cc b/src/rocksdb/utilities/backupable/backupable_db.cc
index ab640ed..16f6d52 100644
--- a/src/rocksdb/utilities/backupable/backupable_db.cc
+++ b/src/rocksdb/utilities/backupable/backupable_db.cc
@@ -11,11 +11,15 @@
 
 #include "rocksdb/utilities/backupable_db.h"
 #include "db/filename.h"
+#include "util/channel.h"
 #include "util/coding.h"
 #include "util/crc32c.h"
+#include "util/file_reader_writer.h"
 #include "util/logging.h"
 #include "util/string_util.h"
+#include "rocksdb/rate_limiter.h"
 #include "rocksdb/transaction_log.h"
+#include "port/port.h"
 
 #ifndef __STDC_FORMAT_MACROS
 #define __STDC_FORMAT_MACROS
@@ -26,56 +30,19 @@
 #include <algorithm>
 #include <vector>
 #include <map>
+#include <mutex>
 #include <sstream>
 #include <string>
 #include <limits>
 #include <atomic>
+#include <future>
+#include <thread>
 #include <unordered_map>
+#include <unordered_set>
+#include "port/port.h"
 
-namespace rocksdb {
-
-namespace {
-class BackupRateLimiter {
- public:
-  BackupRateLimiter(Env* env, uint64_t max_bytes_per_second,
-                   uint64_t bytes_per_check)
-      : env_(env),
-        max_bytes_per_second_(max_bytes_per_second),
-        bytes_per_check_(bytes_per_check),
-        micros_start_time_(env->NowMicros()),
-        bytes_since_start_(0) {}
-
-  void ReportAndWait(uint64_t bytes_since_last_call) {
-    bytes_since_start_ += bytes_since_last_call;
-    if (bytes_since_start_ < bytes_per_check_) {
-      // not enough bytes to be rate-limited
-      return;
-    }
-
-    uint64_t now = env_->NowMicros();
-    uint64_t interval = now - micros_start_time_;
-    uint64_t should_take_micros =
-        (bytes_since_start_ * kMicrosInSecond) / max_bytes_per_second_;
 
-    if (should_take_micros > interval) {
-      env_->SleepForMicroseconds(
-          static_cast<int>(should_take_micros - interval));
-      now = env_->NowMicros();
-    }
-    // reset interval
-    micros_start_time_ = now;
-    bytes_since_start_ = 0;
-  }
-
- private:
-  Env* env_;
-  uint64_t max_bytes_per_second_;
-  uint64_t bytes_per_check_;
-  uint64_t micros_start_time_;
-  uint64_t bytes_since_start_;
-  static const uint64_t kMicrosInSecond = 1000 * 1000LL;
-};
-}  // namespace
+namespace rocksdb {
 
 void BackupStatistics::IncrementNumberSuccessBackup() {
   number_success_backup++;
@@ -99,18 +66,21 @@ std::string BackupStatistics::ToString() const {
 }
 
 void BackupableDBOptions::Dump(Logger* logger) const {
-  Log(logger, "        Options.backup_dir: %s", backup_dir.c_str());
-  Log(logger, "        Options.backup_env: %p", backup_env);
-  Log(logger, " Options.share_table_files: %d",
+  Log(logger, "               Options.backup_dir: %s", backup_dir.c_str());
+  Log(logger, "               Options.backup_env: %p", backup_env);
+  Log(logger, "        Options.share_table_files: %d",
       static_cast<int>(share_table_files));
-  Log(logger, "          Options.info_log: %p", info_log);
-  Log(logger, "              Options.sync: %d", static_cast<int>(sync));
-  Log(logger, "  Options.destroy_old_data: %d",
+  Log(logger, "                 Options.info_log: %p", info_log);
+  Log(logger, "                     Options.sync: %d", static_cast<int>(sync));
+  Log(logger, "         Options.destroy_old_data: %d",
       static_cast<int>(destroy_old_data));
-  Log(logger, "  Options.backup_log_files: %d",
+  Log(logger, "         Options.backup_log_files: %d",
       static_cast<int>(backup_log_files));
-  Log(logger, " Options.backup_rate_limit: %" PRIu64, backup_rate_limit);
-  Log(logger, "Options.restore_rate_limit: %" PRIu64, restore_rate_limit);
+  Log(logger, "        Options.backup_rate_limit: %" PRIu64, backup_rate_limit);
+  Log(logger, "       Options.restore_rate_limit: %" PRIu64,
+      restore_rate_limit);
+  Log(logger, "Options.max_background_operations: %d",
+      max_background_operations);
 }
 
 // -------- BackupEngineImpl class ---------
@@ -119,7 +89,9 @@ class BackupEngineImpl : public BackupEngine {
   BackupEngineImpl(Env* db_env, const BackupableDBOptions& options,
                    bool read_only = false);
   ~BackupEngineImpl();
-  Status CreateNewBackup(DB* db, bool flush_before_backup = false) override;
+  Status CreateNewBackup(DB* db, bool flush_before_backup = false,
+                         std::function<void()> progress_callback = []() {
+                         }) override;
   Status PurgeOldBackups(uint32_t num_backups_to_keep) override;
   Status DeleteBackup(BackupID backup_id) override;
   void StopBackup() override {
@@ -139,6 +111,10 @@ class BackupEngineImpl : public BackupEngine {
                                restore_options);
   }
 
+  virtual Status VerifyBackup(BackupID backup_id) override;
+
+  Status Initialize();
+
  private:
   void DeleteChildren(const std::string& dir, uint32_t file_type_filter = 0);
 
@@ -187,7 +163,7 @@ class BackupEngineImpl : public BackupEngine {
 
     Status AddFile(std::shared_ptr<FileInfo> file_info);
 
-    void Delete(bool delete_meta = true);
+    Status Delete(bool delete_meta = true);
 
     bool Empty() {
       return files_.empty();
@@ -291,33 +267,138 @@ class BackupEngineImpl : public BackupEngine {
     return GetBackupMetaDir() + "/" + rocksdb::ToString(backup_id);
   }
 
-  Status GetLatestBackupFileContents(uint32_t* latest_backup);
   Status PutLatestBackupFileContents(uint32_t latest_backup);
   // if size_limit == 0, there is no size limit, copy everything
-  Status CopyFile(const std::string& src,
-                  const std::string& dst,
-                  Env* src_env,
-                  Env* dst_env,
-                  bool sync,
-                  BackupRateLimiter* rate_limiter,
-                  uint64_t* size = nullptr,
-                  uint32_t* checksum_value = nullptr,
-                  uint64_t size_limit = 0);
-  // if size_limit == 0, there is no size limit, copy everything
-  Status BackupFile(BackupID backup_id,
-                    BackupMeta* backup,
-                    bool shared,
-                    const std::string& src_dir,
-                    const std::string& src_fname,  // starts with "/"
-                    BackupRateLimiter* rate_limiter,
-                    uint64_t size_limit = 0,
-                    bool shared_checksum = false);
+  Status CopyFile(const std::string& src, const std::string& dst, Env* src_env,
+                  Env* dst_env, bool sync, RateLimiter* rate_limiter,
+                  uint64_t* size = nullptr, uint32_t* checksum_value = nullptr,
+                  uint64_t size_limit = 0,
+                  std::function<void()> progress_callback = []() {});
 
   Status CalculateChecksum(const std::string& src,
                            Env* src_env,
                            uint64_t size_limit,
                            uint32_t* checksum_value);
 
+  struct CopyResult {
+    uint64_t size;
+    uint32_t checksum_value;
+    Status status;
+  };
+  struct CopyWorkItem {
+    std::string src_path;
+    std::string dst_path;
+    Env* src_env;
+    Env* dst_env;
+    bool sync;
+    RateLimiter* rate_limiter;
+    uint64_t size_limit;
+    std::promise<CopyResult> result;
+    std::function<void()> progress_callback;
+
+    CopyWorkItem() {}
+    CopyWorkItem(const CopyWorkItem&) = delete;
+    CopyWorkItem& operator=(const CopyWorkItem&) = delete;
+
+    CopyWorkItem(CopyWorkItem&& o) ROCKSDB_NOEXCEPT { *this = std::move(o); }
+
+    CopyWorkItem& operator=(CopyWorkItem&& o) ROCKSDB_NOEXCEPT {
+      src_path = std::move(o.src_path);
+      dst_path = std::move(o.dst_path);
+      src_env = o.src_env;
+      dst_env = o.dst_env;
+      sync = o.sync;
+      rate_limiter = o.rate_limiter;
+      size_limit = o.size_limit;
+      result = std::move(o.result);
+      progress_callback = std::move(o.progress_callback);
+      return *this;
+    }
+
+    CopyWorkItem(std::string _src_path, std::string _dst_path, Env* _src_env,
+                 Env* _dst_env, bool _sync, RateLimiter* _rate_limiter,
+                 uint64_t _size_limit,
+                 std::function<void()> _progress_callback = []() {})
+        : src_path(std::move(_src_path)),
+          dst_path(std::move(_dst_path)),
+          src_env(_src_env),
+          dst_env(_dst_env),
+          sync(_sync),
+          rate_limiter(_rate_limiter),
+          size_limit(_size_limit),
+          progress_callback(_progress_callback) {}
+  };
+
+  struct BackupAfterCopyWorkItem {
+    std::future<CopyResult> result;
+    bool shared;
+    bool needed_to_copy;
+    Env* backup_env;
+    std::string dst_path_tmp;
+    std::string dst_path;
+    std::string dst_relative;
+    BackupAfterCopyWorkItem() {}
+
+    BackupAfterCopyWorkItem(BackupAfterCopyWorkItem&& o) ROCKSDB_NOEXCEPT {
+      *this = std::move(o);
+    }
+
+    BackupAfterCopyWorkItem& operator=(BackupAfterCopyWorkItem&& o) ROCKSDB_NOEXCEPT {
+      result = std::move(o.result);
+      shared = o.shared;
+      needed_to_copy = o.needed_to_copy;
+      backup_env = o.backup_env;
+      dst_path_tmp = std::move(o.dst_path_tmp);
+      dst_path = std::move(o.dst_path);
+      dst_relative = std::move(o.dst_relative);
+      return *this;
+    }
+
+    BackupAfterCopyWorkItem(std::future<CopyResult>&& _result, bool _shared,
+                            bool _needed_to_copy, Env* _backup_env,
+                            std::string _dst_path_tmp, std::string _dst_path,
+                            std::string _dst_relative)
+        : result(std::move(_result)),
+          shared(_shared),
+          needed_to_copy(_needed_to_copy),
+          backup_env(_backup_env),
+          dst_path_tmp(std::move(_dst_path_tmp)),
+          dst_path(std::move(_dst_path)),
+          dst_relative(std::move(_dst_relative)) {}
+  };
+
+  struct RestoreAfterCopyWorkItem {
+    std::future<CopyResult> result;
+    uint32_t checksum_value;
+    RestoreAfterCopyWorkItem() {}
+    RestoreAfterCopyWorkItem(std::future<CopyResult>&& _result,
+                             uint32_t _checksum_value)
+        : result(std::move(_result)), checksum_value(_checksum_value) {}
+    RestoreAfterCopyWorkItem(RestoreAfterCopyWorkItem&& o) ROCKSDB_NOEXCEPT {
+      *this = std::move(o);
+    }
+
+    RestoreAfterCopyWorkItem& operator=(RestoreAfterCopyWorkItem&& o) ROCKSDB_NOEXCEPT {
+      result = std::move(o.result);
+      checksum_value = o.checksum_value;
+      return *this;
+    }
+  };
+
+  bool initialized_;
+  std::mutex byte_report_mutex_;
+  channel<CopyWorkItem> files_to_copy_;
+  std::vector<std::thread> threads_;
+
+  Status AddBackupFileWorkItem(
+      std::unordered_set<std::string>& live_dst_paths,
+      std::vector<BackupAfterCopyWorkItem>& backup_items_to_finish,
+      BackupID backup_id, bool shared, const std::string& src_dir,
+      const std::string& src_fname,  // starts with "/"
+      RateLimiter* rate_limiter, uint64_t size_limit = 0,
+      bool shared_checksum = false,
+      std::function<void()> progress_callback = []() {});
+
   // backup state data
   BackupID latest_backup_id_;
   std::map<BackupID, unique_ptr<BackupMeta>> backups_;
@@ -344,57 +425,83 @@ class BackupEngineImpl : public BackupEngine {
   BackupStatistics backup_statistics_;
 };
 
-BackupEngine* BackupEngine::NewBackupEngine(
-    Env* db_env, const BackupableDBOptions& options) {
-  return new BackupEngineImpl(db_env, options);
-}
-
-Status BackupEngine::Open(Env* env,
-                          const BackupableDBOptions& options,
+Status BackupEngine::Open(Env* env, const BackupableDBOptions& options,
                           BackupEngine** backup_engine_ptr) {
-  *backup_engine_ptr = new BackupEngineImpl(env, options);
+  std::unique_ptr<BackupEngineImpl> backup_engine(
+      new BackupEngineImpl(env, options));
+  auto s = backup_engine->Initialize();
+  if (!s.ok()) {
+    *backup_engine_ptr = nullptr;
+    return s;
+  }
+  *backup_engine_ptr = backup_engine.release();
   return Status::OK();
 }
 
 BackupEngineImpl::BackupEngineImpl(Env* db_env,
                                    const BackupableDBOptions& options,
                                    bool read_only)
-    : stop_backup_(false),
+    : initialized_(false),
+      stop_backup_(false),
       options_(options),
       db_env_(db_env),
       backup_env_(options.backup_env != nullptr ? options.backup_env : db_env_),
       copy_file_buffer_size_(kDefaultCopyFileBufferSize),
-      read_only_(read_only) {
+      read_only_(read_only) {}
+
+BackupEngineImpl::~BackupEngineImpl() {
+  files_to_copy_.sendEof();
+  for (auto& t : threads_) {
+    t.join();
+  }
+  LogFlush(options_.info_log);
+}
+
+Status BackupEngineImpl::Initialize() {
+  assert(!initialized_);
+  initialized_ = true;
   if (read_only_) {
     Log(options_.info_log, "Starting read_only backup engine");
   }
   options_.Dump(options_.info_log);
 
   if (!read_only_) {
-    // create all the dirs we need
-    backup_env_->CreateDirIfMissing(GetAbsolutePath());
-    backup_env_->NewDirectory(GetAbsolutePath(), &backup_directory_);
+    // gather the list of directories that we need to create
+    std::vector<std::pair<std::string, std::unique_ptr<Directory>*>>
+        directories;
+    directories.emplace_back(GetAbsolutePath(), &backup_directory_);
     if (options_.share_table_files) {
       if (options_.share_files_with_checksum) {
-        backup_env_->CreateDirIfMissing(GetAbsolutePath(
-            GetSharedFileWithChecksumRel()));
-        backup_env_->NewDirectory(GetAbsolutePath(
-            GetSharedFileWithChecksumRel()), &shared_directory_);
+        directories.emplace_back(
+            GetAbsolutePath(GetSharedFileWithChecksumRel()),
+            &shared_directory_);
       } else {
-        backup_env_->CreateDirIfMissing(GetAbsolutePath(GetSharedFileRel()));
-        backup_env_->NewDirectory(GetAbsolutePath(GetSharedFileRel()),
-                                  &shared_directory_);
+        directories.emplace_back(GetAbsolutePath(GetSharedFileRel()),
+                                 &shared_directory_);
+      }
+    }
+    directories.emplace_back(GetAbsolutePath(GetPrivateDirRel()),
+                             &private_directory_);
+    directories.emplace_back(GetBackupMetaDir(), &meta_directory_);
+    // create all the dirs we need
+    for (const auto& d : directories) {
+      auto s = backup_env_->CreateDirIfMissing(d.first);
+      if (s.ok()) {
+        s = backup_env_->NewDirectory(d.first, d.second);
+      }
+      if (!s.ok()) {
+        return s;
       }
     }
-    backup_env_->CreateDirIfMissing(GetAbsolutePath(GetPrivateDirRel()));
-    backup_env_->NewDirectory(GetAbsolutePath(GetPrivateDirRel()),
-                              &private_directory_);
-    backup_env_->CreateDirIfMissing(GetBackupMetaDir());
-    backup_env_->NewDirectory(GetBackupMetaDir(), &meta_directory_);
   }
 
   std::vector<std::string> backup_meta_files;
-  backup_env_->GetChildren(GetBackupMetaDir(), &backup_meta_files);
+  {
+    auto s = backup_env_->GetChildren(GetBackupMetaDir(), &backup_meta_files);
+    if (!s.ok()) {
+      return s;
+    }
+  }
   // create backups_ structure
   for (auto& file : backup_meta_files) {
     if (file == "." || file == "..") {
@@ -405,10 +512,10 @@ BackupEngineImpl::BackupEngineImpl(Env* db_env,
     sscanf(file.c_str(), "%u", &backup_id);
     if (backup_id == 0 || file != rocksdb::ToString(backup_id)) {
       if (!read_only_) {
-        Log(options_.info_log, "Unrecognized meta file %s, deleting",
-            file.c_str());
         // invalid file name, delete that
-        backup_env_->DeleteFile(GetBackupMetaDir() + "/" + file);
+        auto s = backup_env_->DeleteFile(GetBackupMetaDir() + "/" + file);
+        Log(options_.info_log, "Unrecognized meta file %s, deleting -- %s",
+            file.c_str(), s.ToString().c_str());
       }
       continue;
     }
@@ -419,15 +526,19 @@ BackupEngineImpl::BackupEngineImpl(Env* db_env,
                                       &backuped_file_infos_, backup_env_)))));
   }
 
+  latest_backup_id_ = 0;
   if (options_.destroy_old_data) {  // Destroy old data
     assert(!read_only_);
     Log(options_.info_log,
         "Backup Engine started with destroy_old_data == true, deleting all "
         "backups");
-    PurgeOldBackups(0);
-    (void) GarbageCollect();
-    // start from beginning
-    latest_backup_id_ = 0;
+    auto s = PurgeOldBackups(0);
+    if (s.ok()) {
+      s = GarbageCollect();
+    }
+    if (!s.ok()) {
+      return s;
+    }
   } else {  // Load data from storage
     // load the backups if any
     for (auto& backup : backups_) {
@@ -440,56 +551,48 @@ BackupEngineImpl::BackupEngineImpl(Env* db_env,
       } else {
         Log(options_.info_log, "Loading backup %" PRIu32 " OK:\n%s",
             backup.first, backup.second->GetInfoString().c_str());
+        latest_backup_id_ = std::max(latest_backup_id_, backup.first);
       }
     }
 
     for (const auto& corrupt : corrupt_backups_) {
       backups_.erase(backups_.find(corrupt.first));
     }
-
-    Status s = GetLatestBackupFileContents(&latest_backup_id_);
-
-    // If latest backup file is corrupted or non-existent
-    // set latest backup as the biggest backup we have
-    // or 0 if we have no backups
-    if (!s.ok() ||
-        backups_.find(latest_backup_id_) == backups_.end()) {
-      auto itr = backups_.end();
-      latest_backup_id_ = (itr == backups_.begin()) ? 0 : (--itr)->first;
-    }
   }
 
   Log(options_.info_log, "Latest backup is %u", latest_backup_id_);
 
-  // delete any backups that claim to be later than latest
-  std::vector<BackupID> later_ids;
-  for (auto itr = backups_.lower_bound(latest_backup_id_ + 1);
-       itr != backups_.end(); itr++) {
-    Log(options_.info_log,
-        "Found backup claiming to be later than latest: %" PRIu32, itr->first);
-    later_ids.push_back(itr->first);
-  }
-  for (auto id : later_ids) {
-    if (!read_only_) {
-      DeleteBackup(id);
-    } else {
-      auto backup = backups_.find(id);
-      // We just found it couple of lines earlier!
-      assert(backup != backups_.end());
-      backup->second->Delete(false);
-      backups_.erase(backup);
+  if (!read_only_) {
+    auto s = PutLatestBackupFileContents(latest_backup_id_);
+    if (!s.ok()) {
+      return s;
     }
   }
 
-  if (!read_only_) {
-    PutLatestBackupFileContents(latest_backup_id_);  // Ignore errors
+  // set up threads to perform copies from files_to_copy_ in the background
+  for (int t = 0; t < options_.max_background_operations; t++) {
+    threads_.emplace_back([&]() {
+      CopyWorkItem work_item;
+      while (files_to_copy_.read(work_item)) {
+        CopyResult result;
+        result.status =
+            CopyFile(work_item.src_path, work_item.dst_path, work_item.src_env,
+                     work_item.dst_env, work_item.sync, work_item.rate_limiter,
+                     &result.size, &result.checksum_value, work_item.size_limit,
+                     work_item.progress_callback);
+        work_item.result.set_value(std::move(result));
+      }
+    });
   }
+
   Log(options_.info_log, "Initialized BackupEngine");
-}
 
-BackupEngineImpl::~BackupEngineImpl() { LogFlush(options_.info_log); }
+  return Status::OK();
+}
 
-Status BackupEngineImpl::CreateNewBackup(DB* db, bool flush_before_backup) {
+Status BackupEngineImpl::CreateNewBackup(
+    DB* db, bool flush_before_backup, std::function<void()> progress_callback) {
+  assert(initialized_);
   assert(!read_only_);
   Status s;
   std::vector<std::string> live_files;
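
The worker threads spawned in Initialize above drain files_to_copy_, and the
destructor shuts them down with sendEof() plus join. util/channel.h itself is
not shown in this diff; assuming it is the usual mutex-plus-condvar queue
exposing write/read/sendEof, a minimal sketch of that shape:

    #include <condition_variable>
    #include <mutex>
    #include <queue>

    // Sketch of what util/channel.h is assumed to provide.
    template <class T>
    class channel {
     public:
      void sendEof() {
        std::lock_guard<std::mutex> lk(m_);
        eof_ = true;
        cv_.notify_all();
      }
      void write(T&& v) {
        std::lock_guard<std::mutex> lk(m_);
        q_.push(std::move(v));
        cv_.notify_one();
      }
      bool read(T& out) {  // returns false once drained after EOF
        std::unique_lock<std::mutex> lk(m_);
        cv_.wait(lk, [&] { return eof_ || !q_.empty(); });
        if (q_.empty()) return false;
        out = std::move(q_.front());
        q_.pop();
        return true;
      }
     private:
      std::mutex m_;
      std::condition_variable cv_;
      std::queue<T> q_;
      bool eof_ = false;
    };
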
@@ -532,14 +635,21 @@ Status BackupEngineImpl::CreateNewBackup(DB* db, bool flush_before_backup) {
   s = backup_env_->CreateDir(
       GetAbsolutePath(GetPrivateFileRel(new_backup_id, true)));
 
-  unique_ptr<BackupRateLimiter> rate_limiter;
+  unique_ptr<RateLimiter> rate_limiter;
   if (options_.backup_rate_limit > 0) {
-    copy_file_buffer_size_ = options_.backup_rate_limit / 10;
-    rate_limiter.reset(new BackupRateLimiter(db_env_,
-          options_.backup_rate_limit, copy_file_buffer_size_));
+    rate_limiter.reset(NewGenericRateLimiter(options_.backup_rate_limit));
+    copy_file_buffer_size_ = rate_limiter->GetSingleBurstBytes();
   }
 
-  // copy live_files
+  // A set into which we will insert the dst_paths that are calculated for live
+  // files and live WAL files.
+  // This is used to check whether a live file shares a dst_path with another
+  // live file.
+  std::unordered_set<std::string> live_dst_paths;
+  live_dst_paths.reserve(live_files.size() + live_wal_files.size());
+
+  std::vector<BackupAfterCopyWorkItem> backup_items_to_finish;
+  // Add a CopyWorkItem to the channel for each live file
   for (size_t i = 0; s.ok() && i < live_files.size(); ++i) {
     uint64_t number;
     FileType type;
@@ -555,27 +665,46 @@ Status BackupEngineImpl::CreateNewBackup(DB* db, bool flush_before_backup) {
     // rules:
     // * if it's kTableFile, then it's shared
     // * if it's kDescriptorFile, limit the size to manifest_file_size
-    s = BackupFile(new_backup_id,
-                   new_backup.get(),
-                   options_.share_table_files && type == kTableFile,
-                   db->GetName(),            /* src_dir */
-                   live_files[i],            /* src_fname */
-                   rate_limiter.get(),
-                   (type == kDescriptorFile) ? manifest_file_size : 0,
-                   options_.share_files_with_checksum && type == kTableFile);
-  }
-
-  // copy WAL files
+    s = AddBackupFileWorkItem(
+        live_dst_paths, backup_items_to_finish, new_backup_id,
+        options_.share_table_files && type == kTableFile, db->GetName(),
+        live_files[i], rate_limiter.get(),
+        (type == kDescriptorFile) ? manifest_file_size : 0,
+        options_.share_files_with_checksum && type == kTableFile,
+        progress_callback);
+  }
+  // Add a CopyWorkItem to the channel for each WAL file
   for (size_t i = 0; s.ok() && i < live_wal_files.size(); ++i) {
     if (live_wal_files[i]->Type() == kAliveLogFile) {
       // we only care about live log files
       // copy the file into backup_dir/files/<new backup>/
-      s = BackupFile(new_backup_id,
-                     new_backup.get(),
-                     false, /* not shared */
-                     db->GetOptions().wal_dir,
-                     live_wal_files[i]->PathName(),
-                     rate_limiter.get());
+      s = AddBackupFileWorkItem(live_dst_paths,
+                                backup_items_to_finish,
+                                new_backup_id,
+                                false, /* not shared */
+                                db->GetOptions().wal_dir,
+                                live_wal_files[i]->PathName(),
+                                rate_limiter.get());
+    }
+  }
+
+  Status item_status;
+  for (auto& item : backup_items_to_finish) {
+    item.result.wait();
+    auto result = item.result.get();
+    item_status = result.status;
+    if (item_status.ok() && item.shared && item.needed_to_copy) {
+      item_status = item.backup_env->RenameFile(item.dst_path_tmp,
+                                                item.dst_path);
+    }
+    if (item_status.ok()) {
+      item_status = new_backup.get()->AddFile(
+              std::make_shared<FileInfo>(item.dst_relative,
+                                         result.size,
+                                         result.checksum_value));
+    }
+    if (!item_status.ok()) {
+      s = item_status;
     }
   }
 
@@ -660,6 +789,7 @@ Status BackupEngineImpl::CreateNewBackup(DB* db, bool flush_before_backup) {
 }
 
 Status BackupEngineImpl::PurgeOldBackups(uint32_t num_backups_to_keep) {
+  assert(initialized_);
   assert(!read_only_);
   Log(options_.info_log, "Purging old backups, keeping %u",
       num_backups_to_keep);
@@ -670,24 +800,34 @@ Status BackupEngineImpl::PurgeOldBackups(uint32_t num_backups_to_keep) {
     itr++;
   }
   for (auto backup_id : to_delete) {
-    DeleteBackup(backup_id);
+    auto s = DeleteBackup(backup_id);
+    if (!s.ok()) {
+      return s;
+    }
   }
   return Status::OK();
 }
 
 Status BackupEngineImpl::DeleteBackup(BackupID backup_id) {
+  assert(initialized_);
   assert(!read_only_);
   Log(options_.info_log, "Deleting backup %u", backup_id);
   auto backup = backups_.find(backup_id);
   if (backup != backups_.end()) {
-    backup->second->Delete();
+    auto s = backup->second->Delete();
+    if (!s.ok()) {
+      return s;
+    }
     backups_.erase(backup);
   } else {
     auto corrupt = corrupt_backups_.find(backup_id);
     if (corrupt == corrupt_backups_.end()) {
       return Status::NotFound("Backup not found");
     }
-    corrupt->second.second->Delete();
+    auto s = corrupt->second.second->Delete();
+    if (!s.ok()) {
+      return s;
+    }
     corrupt_backups_.erase(corrupt);
   }
 
@@ -714,6 +854,7 @@ Status BackupEngineImpl::DeleteBackup(BackupID backup_id) {
 }
 
 void BackupEngineImpl::GetBackupInfo(std::vector<BackupInfo>* backup_info) {
+  assert(initialized_);
   backup_info->reserve(backups_.size());
   for (auto& backup : backups_) {
     if (!backup.second->Empty()) {
@@ -728,6 +869,7 @@ void BackupEngineImpl::GetBackupInfo(std::vector<BackupInfo>* backup_info) {
 void
 BackupEngineImpl::GetCorruptedBackups(
     std::vector<BackupID>* corrupt_backup_ids) {
+  assert(initialized_);
   corrupt_backup_ids->reserve(corrupt_backups_.size());
   for (auto& backup : corrupt_backups_) {
     corrupt_backup_ids->push_back(backup.first);
@@ -737,6 +879,7 @@ BackupEngineImpl::GetCorruptedBackups(
 Status BackupEngineImpl::RestoreDBFromBackup(
     BackupID backup_id, const std::string& db_dir, const std::string& wal_dir,
     const RestoreOptions& restore_options) {
+  assert(initialized_);
   auto corrupt_itr = corrupt_backups_.find(backup_id);
   if (corrupt_itr != corrupt_backups_.end()) {
     return corrupt_itr->second.first;
@@ -787,13 +930,13 @@ Status BackupEngineImpl::RestoreDBFromBackup(
     DeleteChildren(db_dir);
   }
 
-  unique_ptr<BackupRateLimiter> rate_limiter;
+  unique_ptr<RateLimiter> rate_limiter;
   if (options_.restore_rate_limit > 0) {
-    copy_file_buffer_size_ = options_.restore_rate_limit / 10;
-    rate_limiter.reset(new BackupRateLimiter(db_env_,
-          options_.restore_rate_limit, copy_file_buffer_size_));
+    rate_limiter.reset(NewGenericRateLimiter(options_.restore_rate_limit));
+    copy_file_buffer_size_ = rate_limiter->GetSingleBurstBytes();
   }
   Status s;
+  std::vector<RestoreAfterCopyWorkItem> restore_items_to_finish;
   for (const auto& file_info : backup->GetFiles()) {
     const std::string &file = file_info->filename;
     std::string dst;
@@ -823,14 +966,30 @@ Status BackupEngineImpl::RestoreDBFromBackup(
       "/" + dst;
 
     Log(options_.info_log, "Restoring %s to %s\n", file.c_str(), dst.c_str());
-    uint32_t checksum_value;
-    s = CopyFile(GetAbsolutePath(file), dst, backup_env_, db_env_, false,
-                 rate_limiter.get(), nullptr /* size */, &checksum_value);
-    if (!s.ok()) {
+    CopyWorkItem copy_work_item(GetAbsolutePath(file),
+                                dst,
+                                backup_env_,
+                                db_env_,
+                                false,
+                                rate_limiter.get(),
+                                0 /* size_limit */);
+    RestoreAfterCopyWorkItem after_copy_work_item(
+            copy_work_item.result.get_future(),
+            file_info->checksum_value);
+    files_to_copy_.write(std::move(copy_work_item));
+    restore_items_to_finish.push_back(std::move(after_copy_work_item));
+  }
+  Status item_status;
+  for (auto& item : restore_items_to_finish) {
+    item.result.wait();
+    auto result = item.result.get();
+    item_status = result.status;
+    // Note: It is possible that both of the following bad-status cases occur
+    // during copying. But we only return one status.
+    if (!item_status.ok()) {
+      s = item_status;
       break;
-    }
-
-    if (file_info->checksum_value != checksum_value) {
+    } else if (item.checksum_value != result.checksum_value) {
       s = Status::Corruption("Checksum check failed");
       break;
     }
@@ -840,29 +999,41 @@ Status BackupEngineImpl::RestoreDBFromBackup(
   return s;
 }
 
-// latest backup id is an ASCII representation of latest backup id
-Status BackupEngineImpl::GetLatestBackupFileContents(uint32_t* latest_backup) {
-  Status s;
-  unique_ptr<SequentialFile> file;
-  s = backup_env_->NewSequentialFile(GetLatestBackupFile(),
-                                     &file,
-                                     EnvOptions());
-  if (!s.ok()) {
-    return s;
+Status BackupEngineImpl::VerifyBackup(BackupID backup_id) {
+  assert(initialized_);
+  auto corrupt_itr = corrupt_backups_.find(backup_id);
+  if (corrupt_itr != corrupt_backups_.end()) {
+    return corrupt_itr->second.first;
   }
 
-  char buf[11];
-  Slice data;
-  s = file->Read(10, &data, buf);
-  if (!s.ok() || data.size() == 0) {
-    return s.ok() ? Status::Corruption("Latest backup file corrupted") : s;
+  auto backup_itr = backups_.find(backup_id);
+  if (backup_itr == backups_.end()) {
+    return Status::NotFound();
   }
-  buf[data.size()] = 0;
 
-  *latest_backup = 0;
-  sscanf(data.data(), "%u", latest_backup);
-  if (backup_env_->FileExists(GetBackupMetaFile(*latest_backup)) == false) {
-    s = Status::Corruption("Latest backup file corrupted");
+  auto& backup = backup_itr->second;
+  if (backup->Empty()) {
+    return Status::NotFound();
+  }
+
+  Log(options_.info_log, "Verifying backup id %u\n", backup_id);
+
+  uint64_t size;
+  Status result;
+  std::string file_path;
+  for (const auto& file_info : backup->GetFiles()) {
+    const std::string& file = file_info->filename;
+    file_path = GetAbsolutePath(file);
+    result = backup_env_->FileExists(file_path);
+    if (!result.ok()) {
+      return result;
+    }
+    result = backup_env_->GetFileSize(file_path, &size);
+    if (!result.ok()) {
+      return result;
+    } else if (size != file_info->size) {
+      return Status::Corruption("File corrupted: " + file);
+    }
   }
   return Status::OK();
 }
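
VerifyBackup is new in this import: it walks a backup's file list and cross-checks each file's existence and size against the recorded metadata (cheaper than a full checksum pass, which it deliberately does not do). The same idea as a standalone POSIX sketch; VerifyFileSize and its return convention are illustrative, not a RocksDB API:

    // Existence + size check, the cheap verification VerifyBackup performs.
    #include <cstdint>
    #include <string>
    #include <sys/stat.h>

    // 0 = ok, -1 = missing (cf. Status::NotFound), -2 = size mismatch
    // (cf. Status::Corruption).
    int VerifyFileSize(const std::string& path, uint64_t expected_size) {
      struct stat st;
      if (stat(path.c_str(), &st) != 0) {
        return -1;
      }
      if (static_cast<uint64_t>(st.st_size) != expected_size) {
        return -2;
      }
      return 0;
    }
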
@@ -885,14 +1056,17 @@ Status BackupEngineImpl::PutLatestBackupFileContents(uint32_t latest_backup) {
     return s;
   }
 
+  unique_ptr<WritableFileWriter> file_writer(
+      new WritableFileWriter(std::move(file), env_options));
   char file_contents[10];
-  int len = sprintf(file_contents, "%u\n", latest_backup);
-  s = file->Append(Slice(file_contents, len));
+  int len =
+      snprintf(file_contents, sizeof(file_contents), "%u\n", latest_backup);
+  s = file_writer->Append(Slice(file_contents, len));
   if (s.ok() && options_.sync) {
-    file->Sync();
+    file_writer->Sync(false);
   }
   if (s.ok()) {
-    s = file->Close();
+    s = file_writer->Close();
   }
   if (s.ok()) {
     // atomically replace real file with new tmp
@@ -902,13 +1076,12 @@ Status BackupEngineImpl::PutLatestBackupFileContents(uint32_t latest_backup) {
   return s;
 }
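
Two things change in PutLatestBackupFileContents: the unbounded sprintf becomes a bounded snprintf, and writes go through the new WritableFileWriter. The surrounding tmp-file-then-rename dance is what makes the LATEST_BACKUP update atomic; a plain stdio sketch of that pattern, with a hypothetical helper name and POSIX rename semantics assumed:

    // Write-to-tmp-then-rename: readers see either the old or the new file,
    // never a partially written one. Illustrative stdio version.
    #include <cstdio>

    bool WriteFileAtomically(const char* path, const char* tmp_path,
                             unsigned value) {
      std::FILE* f = std::fopen(tmp_path, "w");
      if (f == nullptr) return false;
      char buf[16];
      int len = std::snprintf(buf, sizeof(buf), "%u\n", value);  // bounded
      bool ok = len > 0 &&
                std::fwrite(buf, 1, static_cast<size_t>(len), f) ==
                    static_cast<size_t>(len);
      ok = (std::fclose(f) == 0) && ok;
      return ok && std::rename(tmp_path, path) == 0;  // atomic on POSIX
    }
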
 
-Status BackupEngineImpl::CopyFile(
-    const std::string& src,
-    const std::string& dst, Env* src_env,
-    Env* dst_env, bool sync,
-    BackupRateLimiter* rate_limiter, uint64_t* size,
-    uint32_t* checksum_value,
-    uint64_t size_limit) {
+Status BackupEngineImpl::CopyFile(const std::string& src,
+                                  const std::string& dst, Env* src_env,
+                                  Env* dst_env, bool sync,
+                                  RateLimiter* rate_limiter, uint64_t* size,
+                                  uint32_t* checksum_value, uint64_t size_limit,
+                                  std::function<void()> progress_callback) {
   Status s;
   unique_ptr<WritableFile> dst_file;
   unique_ptr<SequentialFile> src_file;
@@ -935,16 +1108,21 @@ Status BackupEngineImpl::CopyFile(
     return s;
   }
 
+  unique_ptr<WritableFileWriter> dest_writer(
+      new WritableFileWriter(std::move(dst_file), env_options));
+  unique_ptr<SequentialFileReader> src_reader(
+      new SequentialFileReader(std::move(src_file)));
   unique_ptr<char[]> buf(new char[copy_file_buffer_size_]);
   Slice data;
 
+  uint64_t processed_buffer_size = 0;
   do {
     if (stop_backup_.load(std::memory_order_acquire)) {
       return Status::Incomplete("Backup stopped");
     }
     size_t buffer_to_read = (copy_file_buffer_size_ < size_limit) ?
       copy_file_buffer_size_ : size_limit;
-    s = src_file->Read(buffer_to_read, &data, buf.get());
+    s = src_reader->Read(buffer_to_read, &data, buf.get());
     size_limit -= data.size();
 
     if (!s.ok()) {
@@ -958,27 +1136,33 @@ Status BackupEngineImpl::CopyFile(
       *checksum_value = crc32c::Extend(*checksum_value, data.data(),
                                        data.size());
     }
-    s = dst_file->Append(data);
+    s = dest_writer->Append(data);
     if (rate_limiter != nullptr) {
-      rate_limiter->ReportAndWait(data.size());
+      rate_limiter->Request(data.size(), Env::IO_LOW);
+    }
+    processed_buffer_size += buffer_to_read;
+    if (processed_buffer_size > options_.callback_trigger_interval_size) {
+      processed_buffer_size -= options_.callback_trigger_interval_size;
+      std::lock_guard<std::mutex> lock(byte_report_mutex_);
+      progress_callback();
     }
   } while (s.ok() && data.size() > 0 && size_limit > 0);
 
   if (s.ok() && sync) {
-    s = dst_file->Sync();
+    s = dest_writer->Sync(false);
   }
 
   return s;
 }
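
The copy loop above now reports progress: it accumulates bytes read and fires the callback once per callback_trigger_interval_size bytes, carrying any remainder into the next interval. That accounting in isolation, with illustrative names:

    // Fire `callback` roughly once every `interval` bytes, independent of
    // the individual read sizes. Sketch of the accounting in CopyFile.
    #include <cstdint>
    #include <functional>

    void AccumulateAndReport(uint64_t bytes_this_read, uint64_t interval,
                             uint64_t* accumulated,
                             const std::function<void()>& callback) {
      *accumulated += bytes_this_read;
      if (*accumulated > interval) {
        *accumulated -= interval;  // remainder counts toward the next report
        if (callback) {
          callback();
        }
      }
    }
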
 
 // src_fname will always start with "/"
-Status BackupEngineImpl::BackupFile(BackupID backup_id, BackupMeta* backup,
-                                    bool shared, const std::string& src_dir,
-                                    const std::string& src_fname,
-                                    BackupRateLimiter* rate_limiter,
-                                    uint64_t size_limit,
-                                    bool shared_checksum) {
-
+Status BackupEngineImpl::AddBackupFileWorkItem(
+    std::unordered_set<std::string>& live_dst_paths,
+    std::vector<BackupAfterCopyWorkItem>& backup_items_to_finish,
+    BackupID backup_id, bool shared, const std::string& src_dir,
+    const std::string& src_fname, RateLimiter* rate_limiter,
+    uint64_t size_limit, bool shared_checksum,
+    std::function<void()> progress_callback) {
   assert(src_fname.size() > 0 && src_fname[0] == '/');
   std::string dst_relative = src_fname.substr(1);
   std::string dst_relative_tmp;
@@ -1012,17 +1196,34 @@ Status BackupEngineImpl::BackupFile(BackupID backup_id, BackupMeta* backup,
   std::string dst_path = GetAbsolutePath(dst_relative);
   std::string dst_path_tmp = GetAbsolutePath(dst_relative_tmp);
 
-  // if it's shared, we also need to check if it exists -- if it does,
-  // no need to copy it again
+  // if it's shared, we also need to check if it exists -- if it does, no need
+  // to copy it again.
   bool need_to_copy = true;
-  if (shared && backup_env_->FileExists(dst_path)) {
+  // true if dst_path is the same path as another live file
+  const bool same_path =
+      live_dst_paths.find(dst_path) != live_dst_paths.end();
+
+  bool file_exists = false;
+  if (shared && !same_path) {
+    Status exist = backup_env_->FileExists(dst_path);
+    if (exist.ok()) {
+      file_exists = true;
+    } else if (exist.IsNotFound()) {
+      file_exists = false;
+    } else {
+      assert(exist.IsIOError());
+      return exist;
+    }
+  }
+
+  if (shared && (same_path || file_exists)) {
     need_to_copy = false;
     if (shared_checksum) {
       Log(options_.info_log,
           "%s already present, with checksum %u and size %" PRIu64,
           src_fname.c_str(), checksum_value, size);
     } else if (backuped_file_infos_.find(dst_relative) ==
-               backuped_file_infos_.end()) {
+               backuped_file_infos_.end() && !same_path) {
       // file already exists, but it's not referenced by any backup. overwrite
       // the file
       Log(options_.info_log,
@@ -1040,25 +1241,40 @@ Status BackupEngineImpl::BackupFile(BackupID backup_id, BackupMeta* backup,
                             &checksum_value);
     }
   }
+  live_dst_paths.insert(dst_path);
+
   if (need_to_copy) {
     Log(options_.info_log, "Copying %s to %s", src_fname.c_str(),
-        dst_path_tmp.c_str());
-    s = CopyFile(src_dir + src_fname,
-                 dst_path_tmp,
-                 db_env_,
-                 backup_env_,
-                 options_.sync,
-                 rate_limiter,
-                 &size,
-                 &checksum_value,
-                 size_limit);
-    if (s.ok() && shared) {
-      s = backup_env_->RenameFile(dst_path_tmp, dst_path);
-    }
-  }
-  if (s.ok()) {
-    s = backup->AddFile(std::make_shared<FileInfo>(
-          dst_relative, size, checksum_value));
+            dst_path_tmp.c_str());
+    CopyWorkItem copy_work_item(src_dir + src_fname, dst_path_tmp, db_env_,
+                                backup_env_, options_.sync, rate_limiter,
+                                size_limit, progress_callback);
+    BackupAfterCopyWorkItem after_copy_work_item(
+            copy_work_item.result.get_future(),
+            shared,
+            need_to_copy,
+            backup_env_,
+            dst_path_tmp,
+            dst_path,
+            dst_relative);
+    files_to_copy_.write(std::move(copy_work_item));
+    backup_items_to_finish.push_back(std::move(after_copy_work_item));
+  } else {
+    std::promise<CopyResult> promise_result;
+    BackupAfterCopyWorkItem after_copy_work_item(
+            promise_result.get_future(),
+            shared,
+            need_to_copy,
+            backup_env_,
+            dst_path_tmp,
+            dst_path,
+            dst_relative);
+    backup_items_to_finish.push_back(std::move(after_copy_work_item));
+    CopyResult result;
+    result.status = s;
+    result.size = size;
+    result.checksum_value = checksum_value;
+    promise_result.set_value(std::move(result));
   }
   return s;
 }
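
Note the else-branch above: when a shared file is already present, no work item is queued, but the caller still receives a future; the promise is simply fulfilled on the spot, so the wait loop in CreateNewBackup handles copied and skipped files identically. The trick in miniature, with an illustrative result type:

    // A future that is ready before anyone waits on it: the "no copy
    // needed" path.
    #include <future>

    struct CopyOutcome {
      bool ok;
    };

    std::future<CopyOutcome> AlreadyDone() {
      std::promise<CopyOutcome> promise;
      std::future<CopyOutcome> fut = promise.get_future();
      promise.set_value(CopyOutcome{true});  // fulfilled immediately
      return fut;
    }
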
@@ -1081,6 +1297,8 @@ Status BackupEngineImpl::CalculateChecksum(const std::string& src, Env* src_env,
     return s;
   }
 
+  unique_ptr<SequentialFileReader> src_reader(
+      new SequentialFileReader(std::move(src_file)));
   std::unique_ptr<char[]> buf(new char[copy_file_buffer_size_]);
   Slice data;
 
@@ -1090,7 +1308,7 @@ Status BackupEngineImpl::CalculateChecksum(const std::string& src, Env* src_env,
     }
     size_t buffer_to_read = (copy_file_buffer_size_ < size_limit) ?
       copy_file_buffer_size_ : size_limit;
-    s = src_file->Read(buffer_to_read, &data, buf.get());
+    s = src_reader->Read(buffer_to_read, &data, buf.get());
 
     if (!s.ok()) {
       return s;
@@ -1126,8 +1344,13 @@ Status BackupEngineImpl::GarbageCollect() {
 
   // delete obsolete shared files
   std::vector<std::string> shared_children;
-  backup_env_->GetChildren(GetAbsolutePath(GetSharedFileRel()),
-                           &shared_children);
+  {
+    auto s = backup_env_->GetChildren(GetAbsolutePath(GetSharedFileRel()),
+                                      &shared_children);
+    if (!s.ok()) {
+      return s;
+    }
+  }
   for (auto& child : shared_children) {
     std::string rel_fname = GetSharedFileRel(child);
     auto child_itr = backuped_file_infos_.find(rel_fname);
@@ -1137,17 +1360,21 @@ Status BackupEngineImpl::GarbageCollect() {
       // this might be a directory, but DeleteFile will just fail in that
       // case, so we're good
       Status s = backup_env_->DeleteFile(GetAbsolutePath(rel_fname));
-      if (s.ok()) {
-        Log(options_.info_log, "Deleted %s", rel_fname.c_str());
-      }
+      Log(options_.info_log, "Deleting %s -- %s", rel_fname.c_str(),
+          s.ToString().c_str());
       backuped_file_infos_.erase(rel_fname);
     }
   }
 
   // delete obsolete private files
   std::vector<std::string> private_children;
-  backup_env_->GetChildren(GetAbsolutePath(GetPrivateDirRel()),
-                           &private_children);
+  {
+    auto s = backup_env_->GetChildren(GetAbsolutePath(GetPrivateDirRel()),
+                                      &private_children);
+    if (!s.ok()) {
+      return s;
+    }
+  }
   for (auto& child : private_children) {
     BackupID backup_id = 0;
     bool tmp_dir = child.find(".tmp") != std::string::npos;
@@ -1164,14 +1391,12 @@ Status BackupEngineImpl::GarbageCollect() {
     backup_env_->GetChildren(full_private_path, &subchildren);
     for (auto& subchild : subchildren) {
       Status s = backup_env_->DeleteFile(full_private_path + subchild);
-      if (s.ok()) {
-        Log(options_.info_log, "Deleted %s",
-            (full_private_path + subchild).c_str());
-      }
+      Log(options_.info_log, "Deleting %s -- %s",
+          (full_private_path + subchild).c_str(), s.ToString().c_str());
     }
     // finally delete the private dir
     Status s = backup_env_->DeleteDir(full_private_path);
-    Log(options_.info_log, "Deleted dir %s -- %s", full_private_path.c_str(),
+    Log(options_.info_log, "Deleting dir %s -- %s", full_private_path.c_str(),
         s.ToString().c_str());
   }
 
@@ -1207,16 +1432,23 @@ Status BackupEngineImpl::BackupMeta::AddFile(
   return Status::OK();
 }
 
-void BackupEngineImpl::BackupMeta::Delete(bool delete_meta) {
+Status BackupEngineImpl::BackupMeta::Delete(bool delete_meta) {
+  Status s;
   for (const auto& file : files_) {
     --file->refs;  // decrease refcount
   }
   files_.clear();
   // delete meta file
   if (delete_meta) {
-    env_->DeleteFile(meta_filename_);
+    s = env_->FileExists(meta_filename_);
+    if (s.ok()) {
+      s = env_->DeleteFile(meta_filename_);
+    } else if (s.IsNotFound()) {
+      s = Status::OK();  // nothing to delete
+    }
   }
   timestamp_ = 0;
+  return s;
 }
 
 // each backup meta file is of the format:
@@ -1236,9 +1468,11 @@ Status BackupEngineImpl::BackupMeta::LoadFromFile(
     return s;
   }
 
+  unique_ptr<SequentialFileReader> backup_meta_reader(
+      new SequentialFileReader(std::move(backup_meta_file)));
   unique_ptr<char[]> buf(new char[max_backup_meta_file_size_ + 1]);
   Slice data;
-  s = backup_meta_file->Read(max_backup_meta_file_size_, &data, buf.get());
+  s = backup_meta_reader->Read(max_backup_meta_file_size_, &data, buf.get());
 
   if (!s.ok() || data.size() == max_backup_meta_file_size_) {
     return s.ok() ? Status::Corruption("File size too big") : s;
@@ -1330,7 +1564,8 @@ Status BackupEngineImpl::BackupMeta::StoreToFile(bool sync) {
   len += snprintf(buf.get(), buf_size, "%" PRId64 "\n", timestamp_);
   len += snprintf(buf.get() + len, buf_size - len, "%" PRIu64 "\n",
                   sequence_number_);
-  len += snprintf(buf.get() + len, buf_size - len, "%zu\n", files_.size());
+  len += snprintf(buf.get() + len, buf_size - len, "%" ROCKSDB_PRIszt "\n",
+                  files_.size());
   for (const auto& file : files_) {
     // use crc32 for now, switch to something else if needed
     len += snprintf(buf.get() + len, buf_size - len, "%s crc32 %u\n",
@@ -1381,58 +1616,79 @@ class BackupEngineReadOnlyImpl : public BackupEngineReadOnly {
                                                      restore_options);
   }
 
+  virtual Status VerifyBackup(BackupID backup_id) override {
+    return backup_engine_->VerifyBackup(backup_id);
+  }
+
+  Status Initialize() { return backup_engine_->Initialize(); }
+
  private:
   std::unique_ptr<BackupEngineImpl> backup_engine_;
 };
 
-BackupEngineReadOnly* BackupEngineReadOnly::NewReadOnlyBackupEngine(
-    Env* db_env, const BackupableDBOptions& options) {
-  if (options.destroy_old_data) {
-    assert(false);
-    return nullptr;
-  }
-  return new BackupEngineReadOnlyImpl(db_env, options);
-}
-
 Status BackupEngineReadOnly::Open(Env* env, const BackupableDBOptions& options,
                                   BackupEngineReadOnly** backup_engine_ptr) {
   if (options.destroy_old_data) {
-    assert(false);
     return Status::InvalidArgument(
         "Can't destroy old data with ReadOnly BackupEngine");
   }
-  *backup_engine_ptr = new BackupEngineReadOnlyImpl(env, options);
+  std::unique_ptr<BackupEngineReadOnlyImpl> backup_engine(
+      new BackupEngineReadOnlyImpl(env, options));
+  auto s = backup_engine->Initialize();
+  if (!s.ok()) {
+    *backup_engine_ptr = nullptr;
+    return s;
+  }
+  *backup_engine_ptr = backup_engine.release();
   return Status::OK();
 }
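
BackupEngineReadOnly::Open now runs Initialize() before handing ownership to the caller and nulls the out-parameter on failure, so no half-initialized engine escapes. The shape of that factory pattern, with illustrative types:

    // Construct into a unique_ptr, initialize, and release() only on success.
    #include <memory>

    struct Engine {
      bool Initialize() { return true; }  // stand-in for fallible setup
    };

    bool OpenEngine(Engine** out) {
      std::unique_ptr<Engine> engine(new Engine());
      if (!engine->Initialize()) {
        *out = nullptr;  // caller never observes a half-initialized object
        return false;
      }
      *out = engine.release();  // transfer ownership
      return true;
    }
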
 
 // --- BackupableDB methods --------
 
 BackupableDB::BackupableDB(DB* db, const BackupableDBOptions& options)
-    : StackableDB(db),
-      backup_engine_(new BackupEngineImpl(db->GetEnv(), options)) {}
+    : StackableDB(db) {
+  auto backup_engine_impl = new BackupEngineImpl(db->GetEnv(), options);
+  status_ = backup_engine_impl->Initialize();
+  backup_engine_ = backup_engine_impl;
+}
 
 BackupableDB::~BackupableDB() {
   delete backup_engine_;
 }
 
 Status BackupableDB::CreateNewBackup(bool flush_before_backup) {
+  if (!status_.ok()) {
+    return status_;
+  }
   return backup_engine_->CreateNewBackup(this, flush_before_backup);
 }
 
 void BackupableDB::GetBackupInfo(std::vector<BackupInfo>* backup_info) {
+  if (!status_.ok()) {
+    return;
+  }
   backup_engine_->GetBackupInfo(backup_info);
 }
 
 void
 BackupableDB::GetCorruptedBackups(std::vector<BackupID>* corrupt_backup_ids) {
+  if (!status_.ok()) {
+    return;
+  }
   backup_engine_->GetCorruptedBackups(corrupt_backup_ids);
 }
 
 Status BackupableDB::PurgeOldBackups(uint32_t num_backups_to_keep) {
+  if (!status_.ok()) {
+    return status_;
+  }
   return backup_engine_->PurgeOldBackups(num_backups_to_keep);
 }
 
 Status BackupableDB::DeleteBackup(BackupID backup_id) {
+  if (!status_.ok()) {
+    return status_;
+  }
   return backup_engine_->DeleteBackup(backup_id);
 }
 
@@ -1441,14 +1697,20 @@ void BackupableDB::StopBackup() {
 }
 
 Status BackupableDB::GarbageCollect() {
+  if (!status_.ok()) {
+    return status_;
+  }
   return backup_engine_->GarbageCollect();
 }
 
 // --- RestoreBackupableDB methods ------
 
 RestoreBackupableDB::RestoreBackupableDB(Env* db_env,
-                                         const BackupableDBOptions& options)
-    : backup_engine_(new BackupEngineImpl(db_env, options)) {}
+                                         const BackupableDBOptions& options) {
+  auto backup_engine_impl = new BackupEngineImpl(db_env, options);
+  status_ = backup_engine_impl->Initialize();
+  backup_engine_ = backup_engine_impl;
+}
 
 RestoreBackupableDB::~RestoreBackupableDB() {
   delete backup_engine_;
@@ -1456,17 +1718,26 @@ RestoreBackupableDB::~RestoreBackupableDB() {
 
 void
 RestoreBackupableDB::GetBackupInfo(std::vector<BackupInfo>* backup_info) {
+  if (!status_.ok()) {
+    return;
+  }
   backup_engine_->GetBackupInfo(backup_info);
 }
 
 void RestoreBackupableDB::GetCorruptedBackups(
     std::vector<BackupID>* corrupt_backup_ids) {
+  if (!status_.ok()) {
+    return;
+  }
   backup_engine_->GetCorruptedBackups(corrupt_backup_ids);
 }
 
 Status RestoreBackupableDB::RestoreDBFromBackup(
     BackupID backup_id, const std::string& db_dir, const std::string& wal_dir,
     const RestoreOptions& restore_options) {
+  if (!status_.ok()) {
+    return status_;
+  }
   return backup_engine_->RestoreDBFromBackup(backup_id, db_dir, wal_dir,
                                              restore_options);
 }
@@ -1474,19 +1745,31 @@ Status RestoreBackupableDB::RestoreDBFromBackup(
 Status RestoreBackupableDB::RestoreDBFromLatestBackup(
     const std::string& db_dir, const std::string& wal_dir,
     const RestoreOptions& restore_options) {
+  if (!status_.ok()) {
+    return status_;
+  }
   return backup_engine_->RestoreDBFromLatestBackup(db_dir, wal_dir,
                                                    restore_options);
 }
 
 Status RestoreBackupableDB::PurgeOldBackups(uint32_t num_backups_to_keep) {
+  if (!status_.ok()) {
+    return status_;
+  }
   return backup_engine_->PurgeOldBackups(num_backups_to_keep);
 }
 
 Status RestoreBackupableDB::DeleteBackup(BackupID backup_id) {
+  if (!status_.ok()) {
+    return status_;
+  }
   return backup_engine_->DeleteBackup(backup_id);
 }
 
 Status RestoreBackupableDB::GarbageCollect() {
+  if (!status_.ok()) {
+    return status_;
+  }
   return backup_engine_->GarbageCollect();
 }
 
diff --git a/src/rocksdb/utilities/backupable/backupable_db_test.cc b/src/rocksdb/utilities/backupable/backupable_db_test.cc
index 1476d9d..5e4d690 100644
--- a/src/rocksdb/utilities/backupable/backupable_db_test.cc
+++ b/src/rocksdb/utilities/backupable/backupable_db_test.cc
@@ -7,19 +7,25 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
+#ifndef ROCKSDB_LITE
+
 #include <string>
 #include <algorithm>
 #include <iostream>
 
 #include "port/port.h"
+#include "port/stack_trace.h"
 #include "rocksdb/types.h"
 #include "rocksdb/transaction_log.h"
 #include "rocksdb/utilities/backupable_db.h"
+#include "util/file_reader_writer.h"
 #include "util/testharness.h"
 #include "util/random.h"
 #include "util/mutexlock.h"
+#include "util/string_util.h"
 #include "util/testutil.h"
 #include "util/auto_roll_logger.h"
+#include "util/mock_env.h"
 
 namespace rocksdb {
 
@@ -215,12 +221,44 @@ class TestEnv : public EnvWrapper {
     dummy_sequential_file_ = dummy_sequential_file;
   }
 
+  void SetGetChildrenFailure(bool fail) { get_children_failure_ = fail; }
+  Status GetChildren(const std::string& dir,
+                     std::vector<std::string>* r) override {
+    if (get_children_failure_) {
+      return Status::IOError("SimulatedFailure");
+    }
+    return EnvWrapper::GetChildren(dir, r);
+  }
+
+  void SetCreateDirIfMissingFailure(bool fail) {
+    create_dir_if_missing_failure_ = fail;
+  }
+  Status CreateDirIfMissing(const std::string& d) override {
+    if (create_dir_if_missing_failure_) {
+      return Status::IOError("SimulatedFailure");
+    }
+    return EnvWrapper::CreateDirIfMissing(d);
+  }
+
+  void SetNewDirectoryFailure(bool fail) { new_directory_failure_ = fail; }
+  virtual Status NewDirectory(const std::string& name,
+                              unique_ptr<Directory>* result) override {
+    if (new_directory_failure_) {
+      return Status::IOError("SimulatedFailure");
+    }
+    return EnvWrapper::NewDirectory(name, result);
+  }
+
  private:
   port::Mutex mutex_;
   bool dummy_sequential_file_ = false;
   std::vector<std::string> written_files_;
   uint64_t limit_written_files_ = 1000000;
   uint64_t limit_delete_files_ = 1000000;
+
+  bool get_children_failure_ = false;
+  bool create_dir_if_missing_failure_ = false;
+  bool new_directory_failure_ = false;
 };  // TestEnv
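
The new TestEnv overrides all follow one pattern: wrap the real Env, and when a test has armed a failure flag, return an IOError instead of delegating. Generalized below with an illustrative interface (not EnvWrapper itself):

    // Failure-injection wrapper: delegate unless a test armed the failure.
    #include <string>

    struct Filesystem {
      virtual ~Filesystem() {}
      virtual bool MakeDir(const std::string& d) = 0;
    };

    class FaultyFilesystem : public Filesystem {
     public:
      explicit FaultyFilesystem(Filesystem* base) : base_(base) {}
      void SetMakeDirFailure(bool fail) { fail_make_dir_ = fail; }
      bool MakeDir(const std::string& d) override {
        if (fail_make_dir_) return false;  // simulated failure
        return base_->MakeDir(d);
      }
     private:
      Filesystem* base_;
      bool fail_make_dir_ = false;
    };
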
 
 class FileManager : public EnvWrapper {
@@ -244,26 +282,41 @@ class FileManager : public EnvWrapper {
     return Status::NotFound("");
   }
 
+  Status AppendToRandomFileInDir(const std::string& dir,
+                                 const std::string& data) {
+    std::vector<std::string> children;
+    GetChildren(dir, &children);
+    if (children.size() <= 2) {
+      return Status::NotFound("");
+    }
+    while (true) {
+      int i = rnd_.Next() % children.size();
+      if (children[i] != "." && children[i] != "..") {
+        return WriteToFile(dir + "/" + children[i], data);
+      }
+    }
+    // should never get here
+    assert(false);
+    return Status::NotFound("");
+  }
+
   Status CorruptFile(const std::string& fname, uint64_t bytes_to_corrupt) {
-    uint64_t size;
-    Status s = GetFileSize(fname, &size);
+    std::string file_contents;
+    Status s = ReadFileToString(this, fname, &file_contents);
     if (!s.ok()) {
       return s;
     }
-    unique_ptr<RandomRWFile> file;
-    EnvOptions env_options;
-    env_options.use_mmap_writes = false;
-    s = NewRandomRWFile(fname, &file, env_options);
+    s = DeleteFile(fname);
     if (!s.ok()) {
       return s;
     }
 
-    for (uint64_t i = 0; s.ok() && i < bytes_to_corrupt; ++i) {
+    for (uint64_t i = 0; i < bytes_to_corrupt; ++i) {
       std::string tmp;
-      // write one random byte to a random position
-      s = file->Write(rnd_.Next() % size, test::RandomString(&rnd_, 1, &tmp));
+      test::RandomString(&rnd_, 1, &tmp);
+      file_contents[rnd_.Next() % file_contents.size()] = tmp[0];
     }
-    return s;
+    return WriteToFile(fname, file_contents);
   }
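
CorruptFile used to poke bytes through a RandomRWFile; it now reads the whole file, mutates it in memory, and rewrites it, which also works on envs without random-write support. The same read-modify-rewrite idea as a self-contained, iostream-based sketch (illustrative only):

    // Load the file, flip a few random bytes, write it back.
    #include <cstddef>
    #include <cstdlib>
    #include <fstream>
    #include <sstream>
    #include <string>

    bool CorruptBytes(const std::string& fname, std::size_t bytes_to_corrupt) {
      std::ifstream in(fname.c_str(), std::ios::binary);
      if (!in) return false;
      std::ostringstream ss;
      ss << in.rdbuf();
      std::string contents = ss.str();
      in.close();
      if (contents.empty()) return false;
      for (std::size_t i = 0; i < bytes_to_corrupt; ++i) {
        contents[static_cast<std::size_t>(std::rand()) % contents.size()] =
            static_cast<char>(std::rand() % 256);  // one random byte
      }
      std::ofstream out(fname.c_str(), std::ios::binary | std::ios::trunc);
      out << contents;
      return static_cast<bool>(out);
    }
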
 
   Status CorruptChecksum(const std::string& fname, bool appear_valid) {
@@ -323,8 +376,8 @@ class FileManager : public EnvWrapper {
 static size_t FillDB(DB* db, int from, int to) {
   size_t bytes_written = 0;
   for (int i = from; i < to; ++i) {
-    std::string key = "testkey" + std::to_string(i);
-    std::string value = "testvalue" + std::to_string(i);
+    std::string key = "testkey" + ToString(i);
+    std::string value = "testvalue" + ToString(i);
     bytes_written += key.size() + value.size();
 
     EXPECT_OK(db->Put(WriteOptions(), Slice(key), Slice(value)));
@@ -334,17 +387,17 @@ static size_t FillDB(DB* db, int from, int to) {
 
 static void AssertExists(DB* db, int from, int to) {
   for (int i = from; i < to; ++i) {
-    std::string key = "testkey" + std::to_string(i);
+    std::string key = "testkey" + ToString(i);
     std::string value;
     Status s = db->Get(ReadOptions(), Slice(key), &value);
-    ASSERT_EQ(value, "testvalue" + std::to_string(i));
+    ASSERT_EQ(value, "testvalue" + ToString(i));
   }
 }
 
 static void AssertEmpty(DB* db, int from, int to) {
   for (int i = from; i < to; ++i) {
-    std::string key = "testkey" + std::to_string(i);
-    std::string value = "testvalue" + std::to_string(i);
+    std::string key = "testkey" + ToString(i);
+    std::string value = "testvalue" + ToString(i);
 
     Status s = db->Get(ReadOptions(), Slice(key), &value);
     ASSERT_TRUE(s.IsNotFound());
@@ -360,6 +413,7 @@ class BackupableDBTest : public testing::Test {
 
     // set up envs
     env_ = Env::Default();
+    mock_env_.reset(new MockEnv(env_));
     test_db_env_.reset(new TestEnv(env_));
     test_backup_env_.reset(new TestEnv(env_));
     file_manager_.reset(new FileManager(env_));
@@ -376,6 +430,9 @@ class BackupableDBTest : public testing::Test {
     backupable_options_.reset(new BackupableDBOptions(
         backupdir_, test_backup_env_.get(), true, logger_.get(), true));
 
+    // most tests will use multi-threaded backups
+    backupable_options_->max_background_operations = 7;
+
     // delete old files in db
     DestroyDB(dbname_, Options());
   }
@@ -386,9 +443,9 @@ class BackupableDBTest : public testing::Test {
     return db;
   }
 
-  void OpenBackupableDB(bool destroy_old_data = false, bool dummy = false,
-                        bool share_table_files = true,
-                        bool share_with_checksums = false) {
+  void OpenDBAndBackupEngine(bool destroy_old_data = false, bool dummy = false,
+                             bool share_table_files = true,
+                             bool share_with_checksums = false) {
     // reset all the defaults
     test_backup_env_->SetLimitWrittenFiles(1000000);
     test_db_env_->SetLimitWrittenFiles(1000000);
@@ -401,25 +458,30 @@ class BackupableDBTest : public testing::Test {
     } else {
       ASSERT_OK(DB::Open(options_, dbname_, &db));
     }
+    db_.reset(db);
     backupable_options_->destroy_old_data = destroy_old_data;
     backupable_options_->share_table_files = share_table_files;
     backupable_options_->share_files_with_checksum = share_with_checksums;
-    db_.reset(new BackupableDB(db, *backupable_options_));
+    BackupEngine* backup_engine;
+    ASSERT_OK(BackupEngine::Open(test_db_env_.get(), *backupable_options_,
+                                 &backup_engine));
+    backup_engine_.reset(backup_engine);
   }
 
-  void CloseBackupableDB() {
-    db_.reset(nullptr);
+  void CloseDBAndBackupEngine() {
+    db_.reset();
+    backup_engine_.reset();
   }
 
-  void OpenRestoreDB() {
+  void OpenBackupEngine() {
     backupable_options_->destroy_old_data = false;
-    restore_db_.reset(
-        new RestoreBackupableDB(test_db_env_.get(), *backupable_options_));
+    BackupEngine* backup_engine;
+    ASSERT_OK(BackupEngine::Open(test_db_env_.get(), *backupable_options_,
+                                 &backup_engine));
+    backup_engine_.reset(backup_engine);
   }
 
-  void CloseRestoreDB() {
-    restore_db_.reset(nullptr);
-  }
+  void CloseBackupEngine() { backup_engine_.reset(nullptr); }
 
   // restores backup backup_id and asserts the existence of
   // [start_exist, end_exist> and not-existence of
@@ -431,17 +493,17 @@ class BackupableDBTest : public testing::Test {
                                uint32_t end_exist, uint32_t end = 0,
                                bool keep_log_files = false) {
     RestoreOptions restore_options(keep_log_files);
-    bool opened_restore = false;
-    if (restore_db_.get() == nullptr) {
-      opened_restore = true;
-      OpenRestoreDB();
+    bool opened_backup_engine = false;
+    if (backup_engine_.get() == nullptr) {
+      opened_backup_engine = true;
+      OpenBackupEngine();
     }
     if (backup_id > 0) {
-      ASSERT_OK(restore_db_->RestoreDBFromBackup(backup_id, dbname_, dbname_,
-                                                 restore_options));
+      ASSERT_OK(backup_engine_->RestoreDBFromBackup(backup_id, dbname_, dbname_,
+                                                    restore_options));
     } else {
-      ASSERT_OK(restore_db_->RestoreDBFromLatestBackup(dbname_, dbname_,
-                                                       restore_options));
+      ASSERT_OK(backup_engine_->RestoreDBFromLatestBackup(dbname_, dbname_,
+                                                          restore_options));
     }
     DB* db = OpenDB();
     AssertExists(db, start_exist, end_exist);
@@ -449,8 +511,8 @@ class BackupableDBTest : public testing::Test {
       AssertEmpty(db, end_exist, end);
     }
     delete db;
-    if (opened_restore) {
-      CloseRestoreDB();
+    if (opened_backup_engine) {
+      CloseBackupEngine();
     }
   }
 
@@ -473,14 +535,15 @@ class BackupableDBTest : public testing::Test {
 
   // envs
   Env* env_;
+  unique_ptr<MockEnv> mock_env_;
   unique_ptr<TestEnv> test_db_env_;
   unique_ptr<TestEnv> test_backup_env_;
   unique_ptr<FileManager> file_manager_;
 
   // all the dbs!
   DummyDB* dummy_db_; // BackupableDB owns dummy_db_
-  unique_ptr<BackupableDB> db_;
-  unique_ptr<RestoreBackupableDB> restore_db_;
+  unique_ptr<DB> db_;
+  unique_ptr<BackupEngine> backup_engine_;
 
   // options
   Options options_;
@@ -496,7 +559,7 @@ void AppendPath(const std::string& path, std::vector<std::string>& v) {
 
 // this will make sure that backup does not copy the same file twice
 TEST_F(BackupableDBTest, NoDoubleCopy) {
-  OpenBackupableDB(true, true);
+  OpenDBAndBackupEngine(true, true);
 
   // should write 5 DB files + LATEST_BACKUP + one meta file
   test_backup_env_->SetLimitWrittenFiles(7);
@@ -505,16 +568,12 @@ TEST_F(BackupableDBTest, NoDoubleCopy) {
   dummy_db_->live_files_ = { "/00010.sst", "/00011.sst",
                              "/CURRENT",   "/MANIFEST-01" };
   dummy_db_->wal_files_ = {{"/00011.log", true}, {"/00012.log", false}};
-  ASSERT_OK(db_->CreateNewBackup(false));
+  ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), false));
   std::vector<std::string> should_have_written = {
-    "/shared/00010.sst.tmp",
-    "/shared/00011.sst.tmp",
-    "/private/1.tmp/CURRENT",
-    "/private/1.tmp/MANIFEST-01",
-    "/private/1.tmp/00011.log",
-    "/meta/1.tmp",
-    "/LATEST_BACKUP.tmp"
-  };
+      "/shared/00010.sst.tmp",    "/shared/00011.sst.tmp",
+      "/private/1.tmp/CURRENT",   "/private/1.tmp/MANIFEST-01",
+      "/private/1.tmp/00011.log", "/meta/1.tmp",
+      "/LATEST_BACKUP.tmp"};
   AppendPath(dbname_ + "_backup", should_have_written);
   test_backup_env_->AssertWrittenFiles(should_have_written);
 
@@ -525,7 +584,7 @@ TEST_F(BackupableDBTest, NoDoubleCopy) {
   dummy_db_->live_files_ = { "/00010.sst", "/00015.sst",
                              "/CURRENT",   "/MANIFEST-01" };
   dummy_db_->wal_files_ = {{"/00011.log", true}, {"/00012.log", false}};
-  ASSERT_OK(db_->CreateNewBackup(false));
+  ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), false));
   // should not open 00010.sst - it's already there
   should_have_written = {
     "/shared/00015.sst.tmp",
@@ -538,11 +597,13 @@ TEST_F(BackupableDBTest, NoDoubleCopy) {
   AppendPath(dbname_ + "_backup", should_have_written);
   test_backup_env_->AssertWrittenFiles(should_have_written);
 
-  ASSERT_OK(db_->DeleteBackup(1));
-  ASSERT_TRUE(test_backup_env_->FileExists(backupdir_ + "/shared/00010.sst"));
+  ASSERT_OK(backup_engine_->DeleteBackup(1));
+  ASSERT_OK(test_backup_env_->FileExists(backupdir_ + "/shared/00010.sst"));
+
   // 00011.sst was only in backup 1, should be deleted
-  ASSERT_FALSE(test_backup_env_->FileExists(backupdir_ + "/shared/00011.sst"));
-  ASSERT_TRUE(test_backup_env_->FileExists(backupdir_ + "/shared/00015.sst"));
+  ASSERT_EQ(Status::NotFound(),
+            test_backup_env_->FileExists(backupdir_ + "/shared/00011.sst"));
+  ASSERT_OK(test_backup_env_->FileExists(backupdir_ + "/shared/00015.sst"));
 
   // MANIFEST file size should be only 100
   uint64_t size;
@@ -551,7 +612,40 @@ TEST_F(BackupableDBTest, NoDoubleCopy) {
   test_backup_env_->GetFileSize(backupdir_ + "/shared/00015.sst", &size);
   ASSERT_EQ(200UL, size);
 
-  CloseBackupableDB();
+  CloseDBAndBackupEngine();
+}
+
+// Verify that backup works when the database environment is not the same as
+// the backup environment
+// TODO(agf): Make all/most tests use different db and backup environments.
+//            This will probably require more implementation of MockEnv.
+//            For example, MockEnv::RenameFile() must be able to rename
+//            directories.
+TEST_F(BackupableDBTest, DifferentEnvs) {
+  test_db_env_.reset(new TestEnv(mock_env_.get()));
+  options_.env = test_db_env_.get();
+
+  OpenDBAndBackupEngine(true, true);
+
+  // should write 5 DB files + LATEST_BACKUP + one meta file
+  test_backup_env_->SetLimitWrittenFiles(7);
+  test_backup_env_->ClearWrittenFiles();
+  test_db_env_->SetLimitWrittenFiles(0);
+  dummy_db_->live_files_ = { "/00010.sst", "/00011.sst",
+                             "/CURRENT",   "/MANIFEST-01" };
+  dummy_db_->wal_files_ = {{"/00011.log", true}, {"/00012.log", false}};
+  ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), false));
+
+  CloseDBAndBackupEngine();
+
+  // try simple backup and verify correctness
+  OpenDBAndBackupEngine(true);
+  FillDB(db_.get(), 0, 100);
+  ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
+  CloseDBAndBackupEngine();
+  DestroyDB(dbname_, Options());
+
+  AssertBackupConsistency(0, 0, 100, 500);
 }
 
 // test various kind of corruptions that may happen:
@@ -568,11 +662,11 @@ TEST_F(BackupableDBTest, CorruptionsTest) {
   Random rnd(6);
   Status s;
 
-  OpenBackupableDB(true);
+  OpenDBAndBackupEngine(true);
   // create five backups
   for (int i = 0; i < 5; ++i) {
     FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1));
-    ASSERT_OK(db_->CreateNewBackup(!!(rnd.Next() % 2)));
+    ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), !!(rnd.Next() % 2)));
   }
 
   // ---------- case 1. - fail a write -----------
@@ -580,11 +674,11 @@ TEST_F(BackupableDBTest, CorruptionsTest) {
   FillDB(db_.get(), keys_iteration * 5, keys_iteration * 6);
   test_backup_env_->SetLimitWrittenFiles(2);
   // should fail
-  s = db_->CreateNewBackup(!!(rnd.Next() % 2));
+  s = backup_engine_->CreateNewBackup(db_.get(), !!(rnd.Next() % 2));
   ASSERT_TRUE(!s.ok());
   test_backup_env_->SetLimitWrittenFiles(1000000);
   // latest backup should have all the keys
-  CloseBackupableDB();
+  CloseDBAndBackupEngine();
   AssertBackupConsistency(0, 0, keys_iteration * 5, keys_iteration * 6);
 
   // ---------- case 2. - corrupt/delete latest backup -----------
@@ -593,30 +687,40 @@ TEST_F(BackupableDBTest, CorruptionsTest) {
   ASSERT_OK(file_manager_->DeleteFile(backupdir_ + "/LATEST_BACKUP"));
   AssertBackupConsistency(0, 0, keys_iteration * 5);
   // create backup 6, point LATEST_BACKUP to 5
-  OpenBackupableDB();
+  // behavior change: this used to delete backup 6. However, now we ignore
+  // LATEST_BACKUP contents, so BackupEngine sets the latest backup to 6.
+  OpenDBAndBackupEngine();
   FillDB(db_.get(), keys_iteration * 5, keys_iteration * 6);
-  ASSERT_OK(db_->CreateNewBackup(false));
-  CloseBackupableDB();
+  ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), false));
+  CloseDBAndBackupEngine();
   ASSERT_OK(file_manager_->WriteToFile(backupdir_ + "/LATEST_BACKUP", "5"));
-  AssertBackupConsistency(0, 0, keys_iteration * 5, keys_iteration * 6);
-  // assert that all 6 data is gone!
-  ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/6") == false);
-  ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/private/6") == false);
+  AssertBackupConsistency(0, 0, keys_iteration * 6);
+  // assert that all of backup 6's data is still here
+  ASSERT_OK(file_manager_->FileExists(backupdir_ + "/meta/6"));
+  ASSERT_OK(file_manager_->FileExists(backupdir_ + "/private/6"));
+  // assert that we wrote 6 to LATEST_BACKUP
+  {
+    std::string latest_backup_contents;
+    ReadFileToString(env_, backupdir_ + "/LATEST_BACKUP",
+                     &latest_backup_contents);
+    ASSERT_EQ(std::atol(latest_backup_contents.c_str()), 6);
+  }
 
   // --------- case 3. corrupted backup meta or missing backuped file ----
   ASSERT_OK(file_manager_->CorruptFile(backupdir_ + "/meta/5", 3));
+  ASSERT_OK(file_manager_->CorruptFile(backupdir_ + "/meta/6", 3));
   // since 5 meta is now corrupted, latest backup should be 4
   AssertBackupConsistency(0, 0, keys_iteration * 4, keys_iteration * 5);
-  OpenRestoreDB();
-  s = restore_db_->RestoreDBFromBackup(5, dbname_, dbname_);
+  OpenBackupEngine();
+  s = backup_engine_->RestoreDBFromBackup(5, dbname_, dbname_);
   ASSERT_TRUE(!s.ok());
-  CloseRestoreDB();
+  CloseBackupEngine();
   ASSERT_OK(file_manager_->DeleteRandomFileInDir(backupdir_ + "/private/4"));
   // 4 is corrupted, 3 is the latest backup now
   AssertBackupConsistency(0, 0, keys_iteration * 3, keys_iteration * 5);
-  OpenRestoreDB();
-  s = restore_db_->RestoreDBFromBackup(4, dbname_, dbname_);
-  CloseRestoreDB();
+  OpenBackupEngine();
+  s = backup_engine_->RestoreDBFromBackup(4, dbname_, dbname_);
+  CloseBackupEngine();
   ASSERT_TRUE(!s.ok());
 
   // --------- case 4. corrupted checksum value ----
@@ -627,50 +731,91 @@ TEST_F(BackupableDBTest, CorruptionsTest) {
   // checksum of the backup 2 appears to be valid, this can cause checksum
   // mismatch and abort restore process
   ASSERT_OK(file_manager_->CorruptChecksum(backupdir_ + "/meta/2", true));
-  ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/2"));
-  OpenRestoreDB();
-  ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/2"));
-  s = restore_db_->RestoreDBFromBackup(2, dbname_, dbname_);
+  ASSERT_OK(file_manager_->FileExists(backupdir_ + "/meta/2"));
+  OpenBackupEngine();
+  ASSERT_OK(file_manager_->FileExists(backupdir_ + "/meta/2"));
+  s = backup_engine_->RestoreDBFromBackup(2, dbname_, dbname_);
   ASSERT_TRUE(!s.ok());
 
   // make sure that no corrupt backups have actually been deleted!
-  ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/1"));
-  ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/2"));
-  ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/3"));
-  ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/4"));
-  ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/5"));
-  ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/private/1"));
-  ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/private/2"));
-  ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/private/3"));
-  ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/private/4"));
-  ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/private/5"));
+  ASSERT_OK(file_manager_->FileExists(backupdir_ + "/meta/1"));
+  ASSERT_OK(file_manager_->FileExists(backupdir_ + "/meta/2"));
+  ASSERT_OK(file_manager_->FileExists(backupdir_ + "/meta/3"));
+  ASSERT_OK(file_manager_->FileExists(backupdir_ + "/meta/4"));
+  ASSERT_OK(file_manager_->FileExists(backupdir_ + "/meta/5"));
+  ASSERT_OK(file_manager_->FileExists(backupdir_ + "/private/1"));
+  ASSERT_OK(file_manager_->FileExists(backupdir_ + "/private/2"));
+  ASSERT_OK(file_manager_->FileExists(backupdir_ + "/private/3"));
+  ASSERT_OK(file_manager_->FileExists(backupdir_ + "/private/4"));
+  ASSERT_OK(file_manager_->FileExists(backupdir_ + "/private/5"));
 
   // delete the corrupt backups and then make sure they're actually deleted
-  ASSERT_OK(restore_db_->DeleteBackup(5));
-  ASSERT_OK(restore_db_->DeleteBackup(4));
-  ASSERT_OK(restore_db_->DeleteBackup(3));
-  ASSERT_OK(restore_db_->DeleteBackup(2));
-  (void) restore_db_->GarbageCollect();
-  ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/5") == false);
-  ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/private/5") == false);
-  ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/4") == false);
-  ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/private/4") == false);
-  ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/3") == false);
-  ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/private/3") == false);
-  ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/2") == false);
-  ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/private/2") == false);
-
-  CloseRestoreDB();
+  ASSERT_OK(backup_engine_->DeleteBackup(5));
+  ASSERT_OK(backup_engine_->DeleteBackup(4));
+  ASSERT_OK(backup_engine_->DeleteBackup(3));
+  ASSERT_OK(backup_engine_->DeleteBackup(2));
+  (void)backup_engine_->GarbageCollect();
+  ASSERT_EQ(Status::NotFound(),
+            file_manager_->FileExists(backupdir_ + "/meta/5"));
+  ASSERT_EQ(Status::NotFound(),
+            file_manager_->FileExists(backupdir_ + "/private/5"));
+  ASSERT_EQ(Status::NotFound(),
+            file_manager_->FileExists(backupdir_ + "/meta/4"));
+  ASSERT_EQ(Status::NotFound(),
+            file_manager_->FileExists(backupdir_ + "/private/4"));
+  ASSERT_EQ(Status::NotFound(),
+            file_manager_->FileExists(backupdir_ + "/meta/3"));
+  ASSERT_EQ(Status::NotFound(),
+            file_manager_->FileExists(backupdir_ + "/private/3"));
+  ASSERT_EQ(Status::NotFound(),
+            file_manager_->FileExists(backupdir_ + "/meta/2"));
+  ASSERT_EQ(Status::NotFound(),
+            file_manager_->FileExists(backupdir_ + "/private/2"));
+
+  CloseBackupEngine();
   AssertBackupConsistency(0, 0, keys_iteration * 1, keys_iteration * 5);
 
   // new backup should be 2!
-  OpenBackupableDB();
+  OpenDBAndBackupEngine();
   FillDB(db_.get(), keys_iteration * 1, keys_iteration * 2);
-  ASSERT_OK(db_->CreateNewBackup(!!(rnd.Next() % 2)));
-  CloseBackupableDB();
+  ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), !!(rnd.Next() % 2)));
+  CloseDBAndBackupEngine();
   AssertBackupConsistency(2, 0, keys_iteration * 2, keys_iteration * 5);
 }
 
+// This test verifies that the VerifyBackup method correctly identifies
+// invalid backups.
+TEST_F(BackupableDBTest, VerifyBackup) {
+  const int keys_iteration = 5000;
+  Random rnd(6);
+  Status s;
+  OpenDBAndBackupEngine(true);
+  // create five backups
+  for (int i = 0; i < 5; ++i) {
+    FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1));
+    ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
+  }
+  CloseDBAndBackupEngine();
+
+  OpenDBAndBackupEngine();
+  // ---------- case 1. - valid backup -----------
+  ASSERT_TRUE(backup_engine_->VerifyBackup(1).ok());
+
+  // ---------- case 2. - delete a file -----------
+  file_manager_->DeleteRandomFileInDir(backupdir_ + "/private/1");
+  ASSERT_TRUE(backup_engine_->VerifyBackup(1).IsNotFound());
+
+  // ---------- case 3. - corrupt a file -----------
+  std::string append_data = "Corrupting a random file";
+  file_manager_->AppendToRandomFileInDir(backupdir_ + "/private/2",
+                                         append_data);
+  ASSERT_TRUE(backup_engine_->VerifyBackup(2).IsCorruption());
+
+  // ---------- case 4. - invalid backup -----------
+  ASSERT_TRUE(backup_engine_->VerifyBackup(6).IsNotFound());
+  CloseDBAndBackupEngine();
+}
+
 // This test verifies we don't delete the latest backup when read-only option is
 // set
 TEST_F(BackupableDBTest, NoDeleteWithReadOnly) {
@@ -678,13 +823,13 @@ TEST_F(BackupableDBTest, NoDeleteWithReadOnly) {
   Random rnd(6);
   Status s;
 
-  OpenBackupableDB(true);
+  OpenDBAndBackupEngine(true);
   // create five backups
   for (int i = 0; i < 5; ++i) {
     FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1));
-    ASSERT_OK(db_->CreateNewBackup(!!(rnd.Next() % 2)));
+    ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), !!(rnd.Next() % 2)));
   }
-  CloseBackupableDB();
+  CloseDBAndBackupEngine();
   ASSERT_OK(file_manager_->WriteToFile(backupdir_ + "/LATEST_BACKUP", "4"));
 
   backupable_options_->destroy_old_data = false;
@@ -694,13 +839,14 @@ TEST_F(BackupableDBTest, NoDeleteWithReadOnly) {
 
   // assert that data from backup 5 is still here (even though LATEST_BACKUP
   // says 4 is latest)
-  ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/5") == true);
-  ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/private/5") == true);
+  ASSERT_OK(file_manager_->FileExists(backupdir_ + "/meta/5"));
+  ASSERT_OK(file_manager_->FileExists(backupdir_ + "/private/5"));
 
-  // even though 5 is here, we should only see 4 backups
+  // Behavior change: We now ignore LATEST_BACKUP contents. This means that
+  // we should have 5 backups, even if LATEST_BACKUP says 4.
   std::vector<BackupInfo> backup_info;
   read_only_backup_engine->GetBackupInfo(&backup_info);
-  ASSERT_EQ(4UL, backup_info.size());
+  ASSERT_EQ(5UL, backup_info.size());
   delete read_only_backup_engine;
 }
 
@@ -725,11 +871,11 @@ TEST_F(BackupableDBTest, OfflineIntegrationTest) {
       // in last iteration, put smaller amount of data,
       int fill_up_to = std::min(keys_iteration * (i + 1), max_key);
       // ---- insert new data and back up ----
-      OpenBackupableDB(destroy_data);
+      OpenDBAndBackupEngine(destroy_data);
       destroy_data = false;
       FillDB(db_.get(), keys_iteration * i, fill_up_to);
-      ASSERT_OK(db_->CreateNewBackup(iter == 0));
-      CloseBackupableDB();
+      ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), iter == 0));
+      CloseDBAndBackupEngine();
       DestroyDB(dbname_, Options());
 
       // ---- make sure it's empty ----
@@ -738,15 +884,15 @@ TEST_F(BackupableDBTest, OfflineIntegrationTest) {
       delete db;
 
       // ---- restore the DB ----
-      OpenRestoreDB();
-      if (i >= 3) { // test purge old backups
+      OpenBackupEngine();
+      if (i >= 3) {  // test purge old backups
         // when i == 4, purge to only 1 backup
         // when i == 3, purge to 2 backups
-        ASSERT_OK(restore_db_->PurgeOldBackups(5 - i));
+        ASSERT_OK(backup_engine_->PurgeOldBackups(5 - i));
       }
       // ---- make sure the data is there ---
       AssertBackupConsistency(0, 0, fill_up_to, max_key);
-      CloseRestoreDB();
+      CloseBackupEngine();
     }
   }
 }
@@ -760,14 +906,12 @@ TEST_F(BackupableDBTest, OnlineIntegrationTest) {
   // delete old data
   DestroyDB(dbname_, Options());
 
-  OpenBackupableDB(true);
+  OpenDBAndBackupEngine(true);
   // write some data, backup, repeat
   for (int i = 0; i < 5; ++i) {
     if (i == 4) {
       // delete backup number 2, online delete!
-      OpenRestoreDB();
-      ASSERT_OK(restore_db_->DeleteBackup(2));
-      CloseRestoreDB();
+      ASSERT_OK(backup_engine_->DeleteBackup(2));
     }
     // in last iteration, put smaller amount of data,
     // so that backups can share sst files
@@ -775,10 +919,10 @@ TEST_F(BackupableDBTest, OnlineIntegrationTest) {
     FillDB(db_.get(), keys_iteration * i, fill_up_to);
     // we should get consistent results with flush_before_backup
     // set to both true and false
-    ASSERT_OK(db_->CreateNewBackup(!!(rnd.Next() % 2)));
+    ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), !!(rnd.Next() % 2)));
   }
   // close and destroy
-  CloseBackupableDB();
+  CloseDBAndBackupEngine();
   DestroyDB(dbname_, Options());
 
   // ---- make sure it's empty ----
@@ -787,11 +931,11 @@ TEST_F(BackupableDBTest, OnlineIntegrationTest) {
   delete db;
 
   // ---- restore every backup and verify all the data is there ----
-  OpenRestoreDB();
+  OpenBackupEngine();
   for (int i = 1; i <= 5; ++i) {
     if (i == 2) {
       // we deleted backup 2
-      Status s = restore_db_->RestoreDBFromBackup(2, dbname_, dbname_);
+      Status s = backup_engine_->RestoreDBFromBackup(2, dbname_, dbname_);
       ASSERT_TRUE(!s.ok());
     } else {
       int fill_up_to = std::min(keys_iteration * i, max_key);
@@ -800,11 +944,11 @@ TEST_F(BackupableDBTest, OnlineIntegrationTest) {
   }
 
   // delete some backups -- this should leave only backups 3 and 5 alive
-  ASSERT_OK(restore_db_->DeleteBackup(4));
-  ASSERT_OK(restore_db_->PurgeOldBackups(2));
+  ASSERT_OK(backup_engine_->DeleteBackup(4));
+  ASSERT_OK(backup_engine_->PurgeOldBackups(2));
 
   std::vector<BackupInfo> backup_info;
-  restore_db_->GetBackupInfo(&backup_info);
+  backup_engine_->GetBackupInfo(&backup_info);
   ASSERT_EQ(2UL, backup_info.size());
 
   // check backup 3
@@ -812,30 +956,32 @@ TEST_F(BackupableDBTest, OnlineIntegrationTest) {
   // check backup 5
   AssertBackupConsistency(5, 0, max_key);
 
-  CloseRestoreDB();
+  CloseBackupEngine();
 }
 
 TEST_F(BackupableDBTest, FailOverwritingBackups) {
   options_.write_buffer_size = 1024 * 1024 * 1024;  // 1GB
+  options_.disable_auto_compactions = true;
+
   // create backups 1, 2, 3, 4, 5
-  OpenBackupableDB(true);
+  OpenDBAndBackupEngine(true);
   for (int i = 0; i < 5; ++i) {
-    CloseBackupableDB();
+    CloseDBAndBackupEngine();
     DeleteLogFiles();
-    OpenBackupableDB(false);
+    OpenDBAndBackupEngine(false);
     FillDB(db_.get(), 100 * i, 100 * (i + 1));
-    ASSERT_OK(db_->CreateNewBackup(true));
+    ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
   }
-  CloseBackupableDB();
+  CloseDBAndBackupEngine();
 
   // restore 3
-  OpenRestoreDB();
-  ASSERT_OK(restore_db_->RestoreDBFromBackup(3, dbname_, dbname_));
-  CloseRestoreDB();
+  OpenBackupEngine();
+  ASSERT_OK(backup_engine_->RestoreDBFromBackup(3, dbname_, dbname_));
+  CloseBackupEngine();
 
-  OpenBackupableDB(false);
+  OpenDBAndBackupEngine(false);
   FillDB(db_.get(), 0, 300);
-  Status s = db_->CreateNewBackup(true);
+  Status s = backup_engine_->CreateNewBackup(db_.get(), true);
   // the new backup fails because new table files
   // clash with old table files from backups 4 and 5
   // (since write_buffer_size is huge, we can be sure that
@@ -843,21 +989,21 @@ TEST_F(BackupableDBTest, FailOverwritingBackups) {
   // a file generated by a new backup is the same as
   // sst file generated by backup 4)
   ASSERT_TRUE(s.IsCorruption());
-  ASSERT_OK(db_->DeleteBackup(4));
-  ASSERT_OK(db_->DeleteBackup(5));
+  ASSERT_OK(backup_engine_->DeleteBackup(4));
+  ASSERT_OK(backup_engine_->DeleteBackup(5));
   // now, the backup can succeed
-  ASSERT_OK(db_->CreateNewBackup(true));
-  CloseBackupableDB();
+  ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
+  CloseDBAndBackupEngine();
 }
 
 TEST_F(BackupableDBTest, NoShareTableFiles) {
   const int keys_iteration = 5000;
-  OpenBackupableDB(true, false, false);
+  OpenDBAndBackupEngine(true, false, false);
   for (int i = 0; i < 5; ++i) {
     FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1));
-    ASSERT_OK(db_->CreateNewBackup(!!(i % 2)));
+    ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), !!(i % 2)));
   }
-  CloseBackupableDB();
+  CloseDBAndBackupEngine();
 
   for (int i = 0; i < 5; ++i) {
     AssertBackupConsistency(i + 1, 0, keys_iteration * (i + 1),
@@ -868,12 +1014,12 @@ TEST_F(BackupableDBTest, NoShareTableFiles) {
 // Verify that you can backup and restore with share_files_with_checksum on
 TEST_F(BackupableDBTest, ShareTableFilesWithChecksums) {
   const int keys_iteration = 5000;
-  OpenBackupableDB(true, false, true, true);
+  OpenDBAndBackupEngine(true, false, true, true);
   for (int i = 0; i < 5; ++i) {
     FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1));
-    ASSERT_OK(db_->CreateNewBackup(!!(i % 2)));
+    ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), !!(i % 2)));
   }
-  CloseBackupableDB();
+  CloseDBAndBackupEngine();
 
   for (int i = 0; i < 5; ++i) {
     AssertBackupConsistency(i + 1, 0, keys_iteration * (i + 1),
@@ -886,12 +1032,12 @@ TEST_F(BackupableDBTest, ShareTableFilesWithChecksums) {
 TEST_F(BackupableDBTest, ShareTableFilesWithChecksumsTransition) {
   const int keys_iteration = 5000;
   // set share_files_with_checksum to false
-  OpenBackupableDB(true, false, true, false);
+  OpenDBAndBackupEngine(true, false, true, false);
   for (int i = 0; i < 5; ++i) {
     FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1));
-    ASSERT_OK(db_->CreateNewBackup(true));
+    ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
   }
-  CloseBackupableDB();
+  CloseDBAndBackupEngine();
 
   for (int i = 0; i < 5; ++i) {
     AssertBackupConsistency(i + 1, 0, keys_iteration * (i + 1),
@@ -899,12 +1045,12 @@ TEST_F(BackupableDBTest, ShareTableFilesWithChecksumsTransition) {
   }
 
   // set share_files_with_checksum to true and do some more backups
-  OpenBackupableDB(true, false, true, true);
+  OpenDBAndBackupEngine(true, false, true, true);
   for (int i = 5; i < 10; ++i) {
     FillDB(db_.get(), keys_iteration * i, keys_iteration * (i + 1));
-    ASSERT_OK(db_->CreateNewBackup(true));
+    ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
   }
-  CloseBackupableDB();
+  CloseDBAndBackupEngine();
 
   for (int i = 0; i < 5; ++i) {
     AssertBackupConsistency(i + 1, 0, keys_iteration * (i + 5 + 1),
@@ -913,92 +1059,97 @@ TEST_F(BackupableDBTest, ShareTableFilesWithChecksumsTransition) {
 }
 
 TEST_F(BackupableDBTest, DeleteTmpFiles) {
-  OpenBackupableDB();
-  CloseBackupableDB();
+  OpenDBAndBackupEngine();
+  CloseDBAndBackupEngine();
   std::string shared_tmp = backupdir_ + "/shared/00006.sst.tmp";
   std::string private_tmp_dir = backupdir_ + "/private/10.tmp";
   std::string private_tmp_file = private_tmp_dir + "/00003.sst";
   file_manager_->WriteToFile(shared_tmp, "tmp");
   file_manager_->CreateDir(private_tmp_dir);
   file_manager_->WriteToFile(private_tmp_file, "tmp");
-  ASSERT_TRUE(file_manager_->FileExists(private_tmp_dir));
-  OpenBackupableDB();
+  ASSERT_OK(file_manager_->FileExists(private_tmp_dir));
+  OpenDBAndBackupEngine();
   // Need to call this explicitly to delete tmp files
-  (void) db_->GarbageCollect();
-  CloseBackupableDB();
-  ASSERT_FALSE(file_manager_->FileExists(shared_tmp));
-  ASSERT_FALSE(file_manager_->FileExists(private_tmp_file));
-  ASSERT_FALSE(file_manager_->FileExists(private_tmp_dir));
+  (void)backup_engine_->GarbageCollect();
+  CloseDBAndBackupEngine();
+  ASSERT_EQ(Status::NotFound(), file_manager_->FileExists(shared_tmp));
+  ASSERT_EQ(Status::NotFound(), file_manager_->FileExists(private_tmp_file));
+  ASSERT_EQ(Status::NotFound(), file_manager_->FileExists(private_tmp_dir));
 }
 
 TEST_F(BackupableDBTest, KeepLogFiles) {
   backupable_options_->backup_log_files = false;
   // basically infinite
   options_.WAL_ttl_seconds = 24 * 60 * 60;
-  OpenBackupableDB(true);
+  OpenDBAndBackupEngine(true);
   FillDB(db_.get(), 0, 100);
   ASSERT_OK(db_->Flush(FlushOptions()));
   FillDB(db_.get(), 100, 200);
-  ASSERT_OK(db_->CreateNewBackup(false));
+  ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), false));
   FillDB(db_.get(), 200, 300);
   ASSERT_OK(db_->Flush(FlushOptions()));
   FillDB(db_.get(), 300, 400);
   ASSERT_OK(db_->Flush(FlushOptions()));
   FillDB(db_.get(), 400, 500);
   ASSERT_OK(db_->Flush(FlushOptions()));
-  CloseBackupableDB();
+  CloseDBAndBackupEngine();
 
   // all data should be there if we call with keep_log_files = true
   AssertBackupConsistency(0, 0, 500, 600, true);
 }
 
 TEST_F(BackupableDBTest, RateLimiting) {
-  uint64_t const KB = 1024 * 1024;
-  size_t const kMicrosPerSec = 1000 * 1000LL;
+  // iter 0 -- single threaded
+  // iter 1 -- multi threaded
+  for (int iter = 0; iter < 2; ++iter) {
+    uint64_t const KB = 1024 * 1024;
+    size_t const kMicrosPerSec = 1000 * 1000LL;
 
-  std::vector<std::pair<uint64_t, uint64_t>> limits(
-      {{KB, 5 * KB}, {2 * KB, 3 * KB}});
+    std::vector<std::pair<uint64_t, uint64_t>> limits(
+        {{KB, 5 * KB}, {2 * KB, 3 * KB}});
 
-  for (const auto& limit : limits) {
-    // destroy old data
-    DestroyDB(dbname_, Options());
+    for (const auto& limit : limits) {
+      // destroy old data
+      DestroyDB(dbname_, Options());
 
-    backupable_options_->backup_rate_limit = limit.first;
-    backupable_options_->restore_rate_limit = limit.second;
-    options_.compression = kNoCompression;
-    OpenBackupableDB(true);
-    size_t bytes_written = FillDB(db_.get(), 0, 100000);
-
-    auto start_backup = env_->NowMicros();
-    ASSERT_OK(db_->CreateNewBackup(false));
-    auto backup_time = env_->NowMicros() - start_backup;
-    auto rate_limited_backup_time = (bytes_written * kMicrosPerSec) /
-                                    backupable_options_->backup_rate_limit;
-    ASSERT_GT(backup_time, 0.8 * rate_limited_backup_time);
-
-    CloseBackupableDB();
-
-    OpenRestoreDB();
-    auto start_restore = env_->NowMicros();
-    ASSERT_OK(restore_db_->RestoreDBFromLatestBackup(dbname_, dbname_));
-    auto restore_time = env_->NowMicros() - start_restore;
-    CloseRestoreDB();
-    auto rate_limited_restore_time = (bytes_written * kMicrosPerSec) /
-                                     backupable_options_->restore_rate_limit;
-    ASSERT_GT(restore_time, 0.8 * rate_limited_restore_time);
-
-    AssertBackupConsistency(0, 0, 100000, 100010);
+      backupable_options_->backup_rate_limit = limit.first;
+      backupable_options_->restore_rate_limit = limit.second;
+      backupable_options_->max_background_operations = (iter == 0) ? 1 : 10;
+      options_.compression = kNoCompression;
+      OpenDBAndBackupEngine(true);
+      size_t bytes_written = FillDB(db_.get(), 0, 100000);
+
+      auto start_backup = env_->NowMicros();
+      ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), false));
+      auto backup_time = env_->NowMicros() - start_backup;
+      auto rate_limited_backup_time = (bytes_written * kMicrosPerSec) /
+                                      backupable_options_->backup_rate_limit;
+      ASSERT_GT(backup_time, 0.8 * rate_limited_backup_time);
+
+      CloseDBAndBackupEngine();
+
+      OpenBackupEngine();
+      auto start_restore = env_->NowMicros();
+      ASSERT_OK(backup_engine_->RestoreDBFromLatestBackup(dbname_, dbname_));
+      auto restore_time = env_->NowMicros() - start_restore;
+      CloseBackupEngine();
+      auto rate_limited_restore_time = (bytes_written * kMicrosPerSec) /
+                                       backupable_options_->restore_rate_limit;
+      ASSERT_GT(restore_time, 0.8 * rate_limited_restore_time);
+
+      AssertBackupConsistency(0, 0, 100000, 100010);
+    }
   }
 }
 
 TEST_F(BackupableDBTest, ReadOnlyBackupEngine) {
   DestroyDB(dbname_, Options());
-  OpenBackupableDB(true);
+  OpenDBAndBackupEngine(true);
   FillDB(db_.get(), 0, 100);
-  ASSERT_OK(db_->CreateNewBackup(true));
+  ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
   FillDB(db_.get(), 100, 200);
-  ASSERT_OK(db_->CreateNewBackup(true));
-  CloseBackupableDB();
+  ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), true));
+  CloseDBAndBackupEngine();
   DestroyDB(dbname_, Options());
 
   backupable_options_->destroy_old_data = false;
@@ -1023,9 +1174,23 @@ TEST_F(BackupableDBTest, ReadOnlyBackupEngine) {
   delete db;
 }
 
+TEST_F(BackupableDBTest, ProgressCallbackDuringBackup) {
+  DestroyDB(dbname_, Options());
+  OpenDBAndBackupEngine(true);
+  FillDB(db_.get(), 0, 100);
+  bool is_callback_invoked = false;
+  ASSERT_OK(backup_engine_->CreateNewBackup(
+      db_.get(), true,
+      [&is_callback_invoked]() { is_callback_invoked = true; }));
+
+  ASSERT_TRUE(is_callback_invoked);
+  CloseDBAndBackupEngine();
+  DestroyDB(dbname_, Options());
+}
+
 TEST_F(BackupableDBTest, GarbageCollectionBeforeBackup) {
   DestroyDB(dbname_, Options());
-  OpenBackupableDB(true);
+  OpenDBAndBackupEngine(true);
 
   env_->CreateDirIfMissing(backupdir_ + "/shared");
   std::string file_five = backupdir_ + "/shared/000005.sst";
@@ -1035,23 +1200,70 @@ TEST_F(BackupableDBTest, GarbageCollectionBeforeBackup) {
 
   FillDB(db_.get(), 0, 100);
   // backup overwrites file 000005.sst
-  ASSERT_TRUE(db_->CreateNewBackup(true).ok());
+  ASSERT_TRUE(backup_engine_->CreateNewBackup(db_.get(), true).ok());
 
   std::string new_file_five_contents;
   ASSERT_OK(ReadFileToString(env_, file_five, &new_file_five_contents));
   // file 000005.sst was overwritten
   ASSERT_TRUE(new_file_five_contents != file_five_contents);
 
-  CloseBackupableDB();
+  CloseDBAndBackupEngine();
 
   AssertBackupConsistency(0, 0, 100);
 }
 
+// Test that we properly propagate Env failures
+TEST_F(BackupableDBTest, EnvFailures) {
+  BackupEngine* backup_engine;
+
+  // get children failure
+  {
+    test_backup_env_->SetGetChildrenFailure(true);
+    ASSERT_NOK(BackupEngine::Open(test_db_env_.get(), *backupable_options_,
+                                  &backup_engine));
+    test_backup_env_->SetGetChildrenFailure(false);
+  }
+
+  // created dir failure
+  {
+    test_backup_env_->SetCreateDirIfMissingFailure(true);
+    ASSERT_NOK(BackupEngine::Open(test_db_env_.get(), *backupable_options_,
+                                  &backup_engine));
+    test_backup_env_->SetCreateDirIfMissingFailure(false);
+  }
+
+  // new directory failure
+  {
+    test_backup_env_->SetNewDirectoryFailure(true);
+    ASSERT_NOK(BackupEngine::Open(test_db_env_.get(), *backupable_options_,
+                                  &backup_engine));
+    test_backup_env_->SetNewDirectoryFailure(false);
+  }
+
+  // no failure
+  {
+    ASSERT_OK(BackupEngine::Open(test_db_env_.get(), *backupable_options_,
+                                 &backup_engine));
+    delete backup_engine;
+  }
+}
+
 }  // anon namespace
 
 } //  namespace rocksdb
 
 int main(int argc, char** argv) {
+  rocksdb::port::InstallStackTraceHandler();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
+
+#else
+#include <stdio.h>
+
+int main(int argc, char** argv) {
+  fprintf(stderr, "SKIPPED as BackupableDB is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // !ROCKSDB_LITE
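
The hunks above migrate backupable_db_test.cc from the old BackupableDB and
RestoreBackupableDB wrappers to the standalone BackupEngine interface: backups
are created with CreateNewBackup(db, flush_before_backup), and DeleteBackup,
PurgeOldBackups, RestoreDBFromBackup, GetBackupInfo and GarbageCollect all hang
off the same engine object. A minimal sketch of the new call pattern, assuming
the usual rocksdb/utilities/backupable_db.h public header (an assumption; the
test's include list is outside these hunks), with error handling trimmed:

    #include <string>
    #include "rocksdb/db.h"
    #include "rocksdb/utilities/backupable_db.h"

    rocksdb::Status BackupSketch(rocksdb::DB* db, const std::string& dir) {
      rocksdb::BackupEngine* backup_engine = nullptr;
      rocksdb::Status s = rocksdb::BackupEngine::Open(
          rocksdb::Env::Default(), rocksdb::BackupableDBOptions(dir),
          &backup_engine);
      if (!s.ok()) return s;
      // The second argument requests a memtable flush before the backup.
      s = backup_engine->CreateNewBackup(db, true /* flush_before_backup */);
      if (s.ok()) s = backup_engine->PurgeOldBackups(2);  // keep the two newest
      delete backup_engine;
      return s;
    }

The new EnvFailures test exercises the same Open() path against a
fault-injecting Env; the Set*Failure toggles it calls belong to a test Env
defined earlier in this file.
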
diff --git a/src/rocksdb/utilities/checkpoint/checkpoint.cc b/src/rocksdb/utilities/checkpoint/checkpoint.cc
index 760a6db..6e6fac0 100644
--- a/src/rocksdb/utilities/checkpoint/checkpoint.cc
+++ b/src/rocksdb/utilities/checkpoint/checkpoint.cc
@@ -19,9 +19,12 @@
 #include <algorithm>
 #include <string>
 #include "db/filename.h"
+#include "db/wal_manager.h"
 #include "rocksdb/db.h"
 #include "rocksdb/env.h"
+#include "rocksdb/transaction_log.h"
 #include "util/file_util.h"
+#include "port/port.h"
 
 namespace rocksdb {
 
@@ -60,9 +63,14 @@ Status CheckpointImpl::CreateCheckpoint(const std::string& checkpoint_dir) {
   uint64_t manifest_file_size = 0;
   uint64_t sequence_number = db_->GetLatestSequenceNumber();
   bool same_fs = true;
+  VectorLogPtr live_wal_files;
 
-  if (db_->GetEnv()->FileExists(checkpoint_dir)) {
+  s = db_->GetEnv()->FileExists(checkpoint_dir);
+  if (s.ok()) {
     return Status::InvalidArgument("Directory exists");
+  } else if (!s.IsNotFound()) {
+    assert(s.IsIOError());
+    return s;
   }
 
   s = db_->DisableFileDeletions();
@@ -70,11 +78,16 @@ Status CheckpointImpl::CreateCheckpoint(const std::string& checkpoint_dir) {
     // this will return live_files prefixed with "/"
     s = db_->GetLiveFiles(live_files, &manifest_file_size, true);
   }
+  // if we have more than one column family, we need to also get WAL files
+  if (s.ok()) {
+    s = db_->GetSortedWalFiles(live_wal_files);
+  }
   if (!s.ok()) {
     db_->EnableFileDeletions(false);
     return s;
   }
 
+  size_t wal_size = live_wal_files.size();
   Log(db_->GetOptions().info_log,
       "Started the snapshot process -- creating snapshot in directory %s",
       checkpoint_dir.c_str());
@@ -119,6 +132,44 @@ Status CheckpointImpl::CreateCheckpoint(const std::string& checkpoint_dir) {
                    (type == kDescriptorFile) ? manifest_file_size : 0);
     }
   }
+  Log(db_->GetOptions().info_log, "Number of log files %" ROCKSDB_PRIszt,
+      live_wal_files.size());
+
+  // Link WAL files. Copy the last one at its exact current size, since it is
+  // the only one that still changes after the last flush.
+  for (size_t i = 0; s.ok() && i < wal_size; ++i) {
+    if ((live_wal_files[i]->Type() == kAliveLogFile) &&
+        (live_wal_files[i]->StartSequence() >= sequence_number)) {
+      if (i + 1 == wal_size) {
+        Log(db_->GetOptions().info_log, "Copying %s",
+            live_wal_files[i]->PathName().c_str());
+        s = CopyFile(db_->GetEnv(),
+                     db_->GetOptions().wal_dir + live_wal_files[i]->PathName(),
+                     full_private_path + live_wal_files[i]->PathName(),
+                     live_wal_files[i]->SizeFileBytes());
+        break;
+      }
+      if (same_fs) {
+        // we only care about live log files
+        Log(db_->GetOptions().info_log, "Hard Linking %s",
+            live_wal_files[i]->PathName().c_str());
+        s = db_->GetEnv()->LinkFile(
+            db_->GetOptions().wal_dir + live_wal_files[i]->PathName(),
+            full_private_path + live_wal_files[i]->PathName());
+        if (s.IsNotSupported()) {
+          same_fs = false;
+          s = Status::OK();
+        }
+      }
+      if (!same_fs) {
+        Log(db_->GetOptions().info_log, "Copying %s",
+            live_wal_files[i]->PathName().c_str());
+        s = CopyFile(db_->GetEnv(),
+                     db_->GetOptions().wal_dir + live_wal_files[i]->PathName(),
+                     full_private_path + live_wal_files[i]->PathName(), 0);
+      }
+    }
+  }
 
   // we copied all the files, enable file deletions
   db_->EnableFileDeletions(false);
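
The WAL-handling loop added above uses a link-then-copy strategy: live log
files are hard-linked into the private checkpoint directory while source and
destination share a filesystem, and the last, still-growing log is copied at
its exact size. A condensed sketch of the fallback logic, reusing the CopyFile
helper from util/file_util.h that this file already includes:

    // Hard-link when possible; fall back to a full copy when LinkFile reports
    // NotSupported (e.g. a cross-filesystem link). Size 0 = copy everything.
    rocksdb::Status LinkOrCopy(rocksdb::Env* env, const std::string& src,
                               const std::string& dst, bool* same_fs) {
      if (*same_fs) {
        rocksdb::Status s = env->LinkFile(src, dst);
        if (!s.IsNotSupported()) return s;  // linked, or a genuine error
        *same_fs = false;
      }
      return rocksdb::CopyFile(env, src, dst, 0 /* copy the whole file */);
    }
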
diff --git a/src/rocksdb/utilities/checkpoint/checkpoint_test.cc b/src/rocksdb/utilities/checkpoint/checkpoint_test.cc
new file mode 100644
index 0000000..5cd72ea
--- /dev/null
+++ b/src/rocksdb/utilities/checkpoint/checkpoint_test.cc
@@ -0,0 +1,373 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// Syncpoint prevents us from building and running tests in release builds
+#ifndef ROCKSDB_LITE
+
+#if !defined(NDEBUG) || !defined(OS_WIN)
+
+#ifndef OS_WIN
+#include <unistd.h>
+#endif
+#include <iostream>
+#include <thread>
+#include <utility>
+#include "db/db_impl.h"
+#include "port/stack_trace.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/utilities/checkpoint.h"
+#include "util/sync_point.h"
+#include "util/testharness.h"
+#include "util/xfunc.h"
+
+namespace rocksdb {
+class DBTest : public testing::Test {
+ protected:
+  // Sequence of option configurations to try
+  enum OptionConfig {
+    kDefault = 0,
+  };
+  int option_config_;
+
+ public:
+  std::string dbname_;
+  std::string alternative_wal_dir_;
+  Env* env_;
+  DB* db_;
+  Options last_options_;
+  std::vector<ColumnFamilyHandle*> handles_;
+
+  DBTest() : env_(Env::Default()) {
+    env_->SetBackgroundThreads(1, Env::LOW);
+    env_->SetBackgroundThreads(1, Env::HIGH);
+    dbname_ = test::TmpDir(env_) + "/db_test";
+    alternative_wal_dir_ = dbname_ + "/wal";
+    auto options = CurrentOptions();
+    auto delete_options = options;
+    delete_options.wal_dir = alternative_wal_dir_;
+    EXPECT_OK(DestroyDB(dbname_, delete_options));
+    // Destroy it again in case the alternative WAL dir was not used.
+    EXPECT_OK(DestroyDB(dbname_, options));
+    db_ = nullptr;
+    Reopen(options);
+  }
+
+  ~DBTest() {
+    rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+    rocksdb::SyncPoint::GetInstance()->LoadDependency({});
+    rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks();
+    Close();
+    Options options;
+    options.db_paths.emplace_back(dbname_, 0);
+    options.db_paths.emplace_back(dbname_ + "_2", 0);
+    options.db_paths.emplace_back(dbname_ + "_3", 0);
+    options.db_paths.emplace_back(dbname_ + "_4", 0);
+    EXPECT_OK(DestroyDB(dbname_, options));
+  }
+
+  // Return the current option configuration.
+  Options CurrentOptions() {
+    Options options;
+    options.env = env_;
+    options.create_if_missing = true;
+    return options;
+  }
+
+  void CreateColumnFamilies(const std::vector<std::string>& cfs,
+                            const Options& options) {
+    ColumnFamilyOptions cf_opts(options);
+    size_t cfi = handles_.size();
+    handles_.resize(cfi + cfs.size());
+    for (auto cf : cfs) {
+      ASSERT_OK(db_->CreateColumnFamily(cf_opts, cf, &handles_[cfi++]));
+    }
+  }
+
+  void CreateAndReopenWithCF(const std::vector<std::string>& cfs,
+                             const Options& options) {
+    CreateColumnFamilies(cfs, options);
+    std::vector<std::string> cfs_plus_default = cfs;
+    cfs_plus_default.insert(cfs_plus_default.begin(), kDefaultColumnFamilyName);
+    ReopenWithColumnFamilies(cfs_plus_default, options);
+  }
+
+  void ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+                                const std::vector<Options>& options) {
+    ASSERT_OK(TryReopenWithColumnFamilies(cfs, options));
+  }
+
+  void ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+                                const Options& options) {
+    ASSERT_OK(TryReopenWithColumnFamilies(cfs, options));
+  }
+
+  Status TryReopenWithColumnFamilies(
+      const std::vector<std::string>& cfs,
+      const std::vector<Options>& options) {
+    Close();
+    EXPECT_EQ(cfs.size(), options.size());
+    std::vector<ColumnFamilyDescriptor> column_families;
+    for (size_t i = 0; i < cfs.size(); ++i) {
+      column_families.push_back(ColumnFamilyDescriptor(cfs[i], options[i]));
+    }
+    DBOptions db_opts = DBOptions(options[0]);
+    return DB::Open(db_opts, dbname_, column_families, &handles_, &db_);
+  }
+
+  Status TryReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+                                     const Options& options) {
+    Close();
+    std::vector<Options> v_opts(cfs.size(), options);
+    return TryReopenWithColumnFamilies(cfs, v_opts);
+  }
+
+  void Reopen(const Options& options) {
+    ASSERT_OK(TryReopen(options));
+  }
+
+  void Close() {
+    for (auto h : handles_) {
+      delete h;
+    }
+    handles_.clear();
+    delete db_;
+    db_ = nullptr;
+  }
+
+  void DestroyAndReopen(const Options& options) {
+    // Destroy using last options
+    Destroy(last_options_);
+    ASSERT_OK(TryReopen(options));
+  }
+
+  void Destroy(const Options& options) {
+    Close();
+    ASSERT_OK(DestroyDB(dbname_, options));
+  }
+
+  Status ReadOnlyReopen(const Options& options) {
+    return DB::OpenForReadOnly(options, dbname_, &db_);
+  }
+
+  Status TryReopen(const Options& options) {
+    Close();
+    last_options_ = options;
+    return DB::Open(options, dbname_, &db_);
+  }
+
+  Status Flush(int cf = 0) {
+    if (cf == 0) {
+      return db_->Flush(FlushOptions());
+    } else {
+      return db_->Flush(FlushOptions(), handles_[cf]);
+    }
+  }
+
+  Status Put(const Slice& k, const Slice& v, WriteOptions wo = WriteOptions()) {
+    return db_->Put(wo, k, v);
+  }
+
+  Status Put(int cf, const Slice& k, const Slice& v,
+             WriteOptions wo = WriteOptions()) {
+    return db_->Put(wo, handles_[cf], k, v);
+  }
+
+  Status Delete(const std::string& k) {
+    return db_->Delete(WriteOptions(), k);
+  }
+
+  Status Delete(int cf, const std::string& k) {
+    return db_->Delete(WriteOptions(), handles_[cf], k);
+  }
+
+  std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) {
+    ReadOptions options;
+    options.verify_checksums = true;
+    options.snapshot = snapshot;
+    std::string result;
+    Status s = db_->Get(options, k, &result);
+    if (s.IsNotFound()) {
+      result = "NOT_FOUND";
+    } else if (!s.ok()) {
+      result = s.ToString();
+    }
+    return result;
+  }
+
+  std::string Get(int cf, const std::string& k,
+                  const Snapshot* snapshot = nullptr) {
+    ReadOptions options;
+    options.verify_checksums = true;
+    options.snapshot = snapshot;
+    std::string result;
+    Status s = db_->Get(options, handles_[cf], k, &result);
+    if (s.IsNotFound()) {
+      result = "NOT_FOUND";
+    } else if (!s.ok()) {
+      result = s.ToString();
+    }
+    return result;
+  }
+};
+
+TEST_F(DBTest, GetSnapshotLink) {
+    Options options;
+    const std::string snapshot_name = test::TmpDir(env_) + "/snapshot";
+    DB* snapshotDB;
+    ReadOptions roptions;
+    std::string result;
+    Checkpoint* checkpoint;
+
+    options = CurrentOptions();
+    delete db_;
+    db_ = nullptr;
+    ASSERT_OK(DestroyDB(dbname_, options));
+    ASSERT_OK(DestroyDB(snapshot_name, options));
+    env_->DeleteDir(snapshot_name);
+
+    // Create a database
+    Status s;
+    options.create_if_missing = true;
+    ASSERT_OK(DB::Open(options, dbname_, &db_));
+    std::string key = std::string("foo");
+    ASSERT_OK(Put(key, "v1"));
+    // Take a snapshot
+    ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+    ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name));
+    ASSERT_OK(Put(key, "v2"));
+    ASSERT_EQ("v2", Get(key));
+    ASSERT_OK(Flush());
+    ASSERT_EQ("v2", Get(key));
+    // Open snapshot and verify contents while DB is running
+    options.create_if_missing = false;
+    ASSERT_OK(DB::Open(options, snapshot_name, &snapshotDB));
+    ASSERT_OK(snapshotDB->Get(roptions, key, &result));
+    ASSERT_EQ("v1", result);
+    delete snapshotDB;
+    snapshotDB = nullptr;
+    delete db_;
+    db_ = nullptr;
+
+    // Destroy original DB
+    ASSERT_OK(DestroyDB(dbname_, options));
+
+    // Open snapshot and verify contents
+    options.create_if_missing = false;
+    dbname_ = snapshot_name;
+    ASSERT_OK(DB::Open(options, dbname_, &db_));
+    ASSERT_EQ("v1", Get(key));
+    delete db_;
+    db_ = nullptr;
+    ASSERT_OK(DestroyDB(dbname_, options));
+    delete checkpoint;
+
+    // Restore DB name
+    dbname_ = test::TmpDir(env_) + "/db_test";
+}
+
+TEST_F(DBTest, CheckpointCF) {
+  Options options = CurrentOptions();
+  CreateAndReopenWithCF({"one", "two", "three", "four", "five"}, options);
+  rocksdb::SyncPoint::GetInstance()->LoadDependency(
+      {{"DBTest::CheckpointCF:2",
+        "DBImpl::GetLiveFiles:2"},
+       {"DBImpl::GetLiveFiles:1",
+        "DBTest::CheckpointCF:1"}});
+
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  ASSERT_OK(Put(0, "Default", "Default"));
+  ASSERT_OK(Put(1, "one", "one"));
+  ASSERT_OK(Put(2, "two", "two"));
+  ASSERT_OK(Put(3, "three", "three"));
+  ASSERT_OK(Put(4, "four", "four"));
+  ASSERT_OK(Put(5, "five", "five"));
+
+  const std::string snapshot_name = test::TmpDir(env_) + "/snapshot";
+  DB* snapshotDB;
+  ReadOptions roptions;
+  std::string result;
+  std::vector<ColumnFamilyHandle*> cphandles;
+
+  ASSERT_OK(DestroyDB(snapshot_name, options));
+  env_->DeleteDir(snapshot_name);
+
+  Status s;
+  // Take a snapshot
+  std::thread t([&]() {
+    Checkpoint* checkpoint;
+    ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+    ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name));
+    delete checkpoint;
+  });
+  TEST_SYNC_POINT("DBTest::CheckpointCF:1");
+  ASSERT_OK(Put(0, "Default", "Default1"));
+  ASSERT_OK(Put(1, "one", "eleven"));
+  ASSERT_OK(Put(2, "two", "twelve"));
+  ASSERT_OK(Put(3, "three", "thirteen"));
+  ASSERT_OK(Put(4, "four", "fourteen"));
+  ASSERT_OK(Put(5, "five", "fifteen"));
+  TEST_SYNC_POINT("DBTest::CheckpointCF:2");
+  t.join();
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+  ASSERT_OK(Put(1, "one", "twentyone"));
+  ASSERT_OK(Put(2, "two", "twentytwo"));
+  ASSERT_OK(Put(3, "three", "twentythree"));
+  ASSERT_OK(Put(4, "four", "twentyfour"));
+  ASSERT_OK(Put(5, "five", "twentyfive"));
+  ASSERT_OK(Flush());
+
+  // Open snapshot and verify contents while DB is running
+  options.create_if_missing = false;
+  std::vector<std::string> cfs;
+  cfs = {kDefaultColumnFamilyName, "one", "two", "three", "four", "five"};
+  std::vector<ColumnFamilyDescriptor> column_families;
+  for (size_t i = 0; i < cfs.size(); ++i) {
+    column_families.push_back(ColumnFamilyDescriptor(cfs[i], options));
+  }
+  ASSERT_OK(DB::Open(options, snapshot_name,
+        column_families, &cphandles, &snapshotDB));
+  ASSERT_OK(snapshotDB->Get(roptions, cphandles[0], "Default", &result));
+  ASSERT_EQ("Default1", result);
+  ASSERT_OK(snapshotDB->Get(roptions, cphandles[1], "one", &result));
+  ASSERT_EQ("eleven", result);
+  ASSERT_OK(snapshotDB->Get(roptions, cphandles[2], "two", &result));
+  for (auto h : cphandles) {
+      delete h;
+  }
+  cphandles.clear();
+  delete snapshotDB;
+  snapshotDB = nullptr;
+  ASSERT_OK(DestroyDB(snapshot_name, options));
+}
+
+}  // namespace rocksdb
+
+#endif
+
+int main(int argc, char** argv) {
+#if !defined(NDEBUG) || !defined(OS_WIN)
+  rocksdb::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+#else
+  return 0;
+#endif
+}
+
+#else
+#include <stdio.h>
+
+int main(int argc, char** argv) {
+  fprintf(stderr, "SKIPPED as Checkpoint is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // !ROCKSDB_LITE
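
checkpoint_test.cc drives the new WAL-aware code through the public Checkpoint
API: GetSnapshotLink verifies that a snapshot survives destruction of the
source DB, and CheckpointCF races a checkpoint against concurrent writes
across six column families using SyncPoint dependencies. The core call
sequence, as a standalone sketch:

    #include <string>
    #include "rocksdb/db.h"
    #include "rocksdb/utilities/checkpoint.h"

    rocksdb::Status TakeCheckpoint(rocksdb::DB* db, const std::string& dir) {
      rocksdb::Checkpoint* checkpoint = nullptr;
      rocksdb::Status s = rocksdb::Checkpoint::Create(db, &checkpoint);
      if (s.ok()) {
        // Fails with InvalidArgument if `dir` already exists (see the
        // FileExists check added to CreateCheckpoint above).
        s = checkpoint->CreateCheckpoint(dir);
      }
      delete checkpoint;
      return s;
    }
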
diff --git a/src/rocksdb/utilities/compacted_db/compacted_db_impl.cc b/src/rocksdb/utilities/compacted_db/compacted_db_impl.cc
deleted file mode 100644
index 55bcbca..0000000
--- a/src/rocksdb/utilities/compacted_db/compacted_db_impl.cc
+++ /dev/null
@@ -1,163 +0,0 @@
-//  Copyright (c) 2014, Facebook, Inc.  All rights reserved.
-//  This source code is licensed under the BSD-style license found in the
-//  LICENSE file in the root directory of this source tree. An additional grant
-//  of patent rights can be found in the PATENTS file in the same directory.
-
-#ifndef ROCKSDB_LITE
-#include "utilities/compacted_db/compacted_db_impl.h"
-#include "db/db_impl.h"
-#include "db/version_set.h"
-#include "table/get_context.h"
-
-namespace rocksdb {
-
-extern void MarkKeyMayExist(void* arg);
-extern bool SaveValue(void* arg, const ParsedInternalKey& parsed_key,
-                      const Slice& v, bool hit_and_return);
-
-CompactedDBImpl::CompactedDBImpl(
-  const DBOptions& options, const std::string& dbname)
-  : DBImpl(options, dbname) {
-}
-
-CompactedDBImpl::~CompactedDBImpl() {
-}
-
-size_t CompactedDBImpl::FindFile(const Slice& key) {
-  size_t left = 0;
-  size_t right = files_.num_files - 1;
-  while (left < right) {
-    size_t mid = (left + right) >> 1;
-    const FdWithKeyRange& f = files_.files[mid];
-    if (user_comparator_->Compare(ExtractUserKey(f.largest_key), key) < 0) {
-      // Key at "mid.largest" is < "target".  Therefore all
-      // files at or before "mid" are uninteresting.
-      left = mid + 1;
-    } else {
-      // Key at "mid.largest" is >= "target".  Therefore all files
-      // after "mid" are uninteresting.
-      right = mid;
-    }
-  }
-  return right;
-}
-
-Status CompactedDBImpl::Get(const ReadOptions& options,
-     ColumnFamilyHandle*, const Slice& key, std::string* value) {
-  GetContext get_context(user_comparator_, nullptr, nullptr, nullptr,
-                         GetContext::kNotFound, key, value, nullptr, nullptr,
-                         nullptr);
-  LookupKey lkey(key, kMaxSequenceNumber);
-  files_.files[FindFile(key)].fd.table_reader->Get(
-      options, lkey.internal_key(), &get_context);
-  if (get_context.State() == GetContext::kFound) {
-    return Status::OK();
-  }
-  return Status::NotFound();
-}
-
-std::vector<Status> CompactedDBImpl::MultiGet(const ReadOptions& options,
-    const std::vector<ColumnFamilyHandle*>&,
-    const std::vector<Slice>& keys, std::vector<std::string>* values) {
-  autovector<TableReader*, 16> reader_list;
-  for (const auto& key : keys) {
-    const FdWithKeyRange& f = files_.files[FindFile(key)];
-    if (user_comparator_->Compare(key, ExtractUserKey(f.smallest_key)) < 0) {
-      reader_list.push_back(nullptr);
-    } else {
-      LookupKey lkey(key, kMaxSequenceNumber);
-      f.fd.table_reader->Prepare(lkey.internal_key());
-      reader_list.push_back(f.fd.table_reader);
-    }
-  }
-  std::vector<Status> statuses(keys.size(), Status::NotFound());
-  values->resize(keys.size());
-  int idx = 0;
-  for (auto* r : reader_list) {
-    if (r != nullptr) {
-      GetContext get_context(user_comparator_, nullptr, nullptr, nullptr,
-                             GetContext::kNotFound, keys[idx], &(*values)[idx],
-                             nullptr, nullptr, nullptr);
-      LookupKey lkey(keys[idx], kMaxSequenceNumber);
-      r->Get(options, lkey.internal_key(), &get_context);
-      if (get_context.State() == GetContext::kFound) {
-        statuses[idx] = Status::OK();
-      }
-    }
-    ++idx;
-  }
-  return statuses;
-}
-
-Status CompactedDBImpl::Init(const Options& options) {
-  mutex_.Lock();
-  ColumnFamilyDescriptor cf(kDefaultColumnFamilyName,
-                            ColumnFamilyOptions(options));
-  Status s = Recover({ cf }, true /* read only */, false);
-  if (s.ok()) {
-    cfd_ = reinterpret_cast<ColumnFamilyHandleImpl*>(
-              DefaultColumnFamily())->cfd();
-    delete cfd_->InstallSuperVersion(new SuperVersion(), &mutex_);
-  }
-  mutex_.Unlock();
-  if (!s.ok()) {
-    return s;
-  }
-  NewThreadStatusCfInfo(cfd_);
-  version_ = cfd_->GetSuperVersion()->current;
-  user_comparator_ = cfd_->user_comparator();
-  auto* vstorage = version_->storage_info();
-  if (vstorage->num_non_empty_levels() == 0) {
-    return Status::NotSupported("no file exists");
-  }
-  const LevelFilesBrief& l0 = vstorage->LevelFilesBrief(0);
-  // L0 should not have files
-  if (l0.num_files > 1) {
-    return Status::NotSupported("L0 contain more than 1 file");
-  }
-  if (l0.num_files == 1) {
-    if (vstorage->num_non_empty_levels() > 1) {
-      return Status::NotSupported("Both L0 and other level contain files");
-    }
-    files_ = l0;
-    return Status::OK();
-  }
-
-  for (int i = 1; i < vstorage->num_non_empty_levels() - 1; ++i) {
-    if (vstorage->LevelFilesBrief(i).num_files > 0) {
-      return Status::NotSupported("Other levels also contain files");
-    }
-  }
-
-  int level = vstorage->num_non_empty_levels() - 1;
-  if (vstorage->LevelFilesBrief(level).num_files > 0) {
-    files_ = vstorage->LevelFilesBrief(level);
-    return Status::OK();
-  }
-  return Status::NotSupported("no file exists");
-}
-
-Status CompactedDBImpl::Open(const Options& options,
-                             const std::string& dbname, DB** dbptr) {
-  *dbptr = nullptr;
-
-  if (options.max_open_files != -1) {
-    return Status::InvalidArgument("require max_open_files = -1");
-  }
-  if (options.merge_operator.get() != nullptr) {
-    return Status::InvalidArgument("merge operator is not supported");
-  }
-  DBOptions db_options(options);
-  std::unique_ptr<CompactedDBImpl> db(new CompactedDBImpl(db_options, dbname));
-  Status s = db->Init(options);
-  if (s.ok()) {
-    Log(INFO_LEVEL, db->db_options_.info_log,
-        "Opened the db as fully compacted mode");
-    LogFlush(db->db_options_.info_log);
-    *dbptr = db.release();
-  }
-  return s;
-}
-
-}   // namespace rocksdb
-#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/compacted_db/compacted_db_impl.h b/src/rocksdb/utilities/compacted_db/compacted_db_impl.h
deleted file mode 100644
index e1ac92d..0000000
--- a/src/rocksdb/utilities/compacted_db/compacted_db_impl.h
+++ /dev/null
@@ -1,96 +0,0 @@
-//  Copyright (c) 2014, Facebook, Inc.  All rights reserved.
-//  This source code is licensed under the BSD-style license found in the
-//  LICENSE file in the root directory of this source tree. An additional grant
-//  of patent rights can be found in the PATENTS file in the same directory.
-
-#pragma once
-#ifndef ROCKSDB_LITE
-#include "db/db_impl.h"
-#include <vector>
-#include <string>
-
-namespace rocksdb {
-
-class CompactedDBImpl : public DBImpl {
- public:
-  CompactedDBImpl(const DBOptions& options, const std::string& dbname);
-  virtual ~CompactedDBImpl();
-
-  static Status Open(const Options& options, const std::string& dbname,
-                     DB** dbptr);
-
-  // Implementations of the DB interface
-  using DB::Get;
-  virtual Status Get(const ReadOptions& options,
-                     ColumnFamilyHandle* column_family, const Slice& key,
-                     std::string* value) override;
-  using DB::MultiGet;
-  virtual std::vector<Status> MultiGet(
-      const ReadOptions& options,
-      const std::vector<ColumnFamilyHandle*>&,
-      const std::vector<Slice>& keys, std::vector<std::string>* values)
-    override;
-
-  using DBImpl::Put;
-  virtual Status Put(const WriteOptions& options,
-                     ColumnFamilyHandle* column_family, const Slice& key,
-                     const Slice& value) override {
-    return Status::NotSupported("Not supported in compacted db mode.");
-  }
-  using DBImpl::Merge;
-  virtual Status Merge(const WriteOptions& options,
-                       ColumnFamilyHandle* column_family, const Slice& key,
-                       const Slice& value) override {
-    return Status::NotSupported("Not supported in compacted db mode.");
-  }
-  using DBImpl::Delete;
-  virtual Status Delete(const WriteOptions& options,
-                        ColumnFamilyHandle* column_family,
-                        const Slice& key) override {
-    return Status::NotSupported("Not supported in compacted db mode.");
-  }
-  virtual Status Write(const WriteOptions& options,
-                       WriteBatch* updates) override {
-    return Status::NotSupported("Not supported in compacted db mode.");
-  }
-  using DBImpl::CompactRange;
-  virtual Status CompactRange(ColumnFamilyHandle* column_family,
-                              const Slice* begin, const Slice* end,
-                              bool reduce_level = false, int target_level = -1,
-                              uint32_t target_path_id = 0) override {
-    return Status::NotSupported("Not supported in compacted db mode.");
-  }
-
-  virtual Status DisableFileDeletions() override {
-    return Status::NotSupported("Not supported in compacted db mode.");
-  }
-  virtual Status EnableFileDeletions(bool force) override {
-    return Status::NotSupported("Not supported in compacted db mode.");
-  }
-  virtual Status GetLiveFiles(std::vector<std::string>&,
-                              uint64_t* manifest_file_size,
-                              bool flush_memtable = true) override {
-    return Status::NotSupported("Not supported in compacted db mode.");
-  }
-  using DBImpl::Flush;
-  virtual Status Flush(const FlushOptions& options,
-                       ColumnFamilyHandle* column_family) override {
-    return Status::NotSupported("Not supported in compacted db mode.");
-  }
-
- private:
-  friend class DB;
-  inline size_t FindFile(const Slice& key);
-  Status Init(const Options& options);
-
-  ColumnFamilyData* cfd_;
-  Version* version_;
-  const Comparator* user_comparator_;
-  LevelFilesBrief files_;
-
-  // No copying allowed
-  CompactedDBImpl(const CompactedDBImpl&);
-  void operator=(const CompactedDBImpl&);
-};
-}
-#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc b/src/rocksdb/utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc
new file mode 100644
index 0000000..4ef4edf
--- /dev/null
+++ b/src/rocksdb/utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc
@@ -0,0 +1,30 @@
+// Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef ROCKSDB_LITE
+
+#include <string>
+
+#include "rocksdb/slice.h"
+#include "utilities/compaction_filters/remove_emptyvalue_compactionfilter.h"
+
+namespace rocksdb {
+
+const char* RemoveEmptyValueCompactionFilter::Name() const {
+  return "RemoveEmptyValueCompactionFilter";
+}
+
+bool RemoveEmptyValueCompactionFilter::Filter(int level,
+    const Slice& key,
+    const Slice& existing_value,
+    std::string* new_value,
+    bool* value_changed) const {
+
+  // remove kv pairs that have empty values
+  return existing_value.empty();
+}
+
+}  // namespace rocksdb
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/compaction_filters/remove_emptyvalue_compactionfilter.h b/src/rocksdb/utilities/compaction_filters/remove_emptyvalue_compactionfilter.h
new file mode 100644
index 0000000..ec9342d
--- /dev/null
+++ b/src/rocksdb/utilities/compaction_filters/remove_emptyvalue_compactionfilter.h
@@ -0,0 +1,27 @@
+// Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef ROCKSDB_LITE
+
+#pragma once
+
+#include <string>
+
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/slice.h"
+
+namespace rocksdb {
+
+class RemoveEmptyValueCompactionFilter : public CompactionFilter {
+ public:
+    const char* Name() const override;
+    bool Filter(int level,
+        const Slice& key,
+        const Slice& existing_value,
+        std::string* new_value,
+        bool* value_changed) const override;
+};
+}  // namespace rocksdb
+#endif  // !ROCKSDB_LITE
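
RemoveEmptyValueCompactionFilter is stateless: Filter() returns true exactly
when the existing value is empty, which tells compaction to drop that
key-value pair. A usage sketch, assuming the standard
Options::compaction_filter hook (the filter must outlive the DB):

    #include <string>
    #include "rocksdb/db.h"
    #include "utilities/compaction_filters/remove_emptyvalue_compactionfilter.h"

    rocksdb::Status OpenWithFilter(const std::string& name, rocksdb::DB** db) {
      static rocksdb::RemoveEmptyValueCompactionFilter filter;  // stateless
      rocksdb::Options options;
      options.create_if_missing = true;
      options.compaction_filter = &filter;  // drop empty values on compaction
      return rocksdb::DB::Open(options, name, db);
    }
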
diff --git a/src/rocksdb/utilities/convenience/convenience.cc b/src/rocksdb/utilities/convenience/convenience.cc
deleted file mode 100644
index b91bc9c..0000000
--- a/src/rocksdb/utilities/convenience/convenience.cc
+++ /dev/null
@@ -1,23 +0,0 @@
-//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
-//  This source code is licensed under the BSD-style license found in the
-//  LICENSE file in the root directory of this source tree. An additional grant
-//  of patent rights can be found in the PATENTS file in the same directory.
-//
-// Copyright (c) 2012 Facebook.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#ifndef ROCKSDB_LITE
-
-#include "rocksdb/utilities/convenience.h"
-
-#include "db/db_impl.h"
-
-namespace rocksdb {
-
-void CancelAllBackgroundWork(DB* db, bool wait) {
-  (dynamic_cast<DBImpl*>(db))->CancelAllBackgroundWork(wait);
-}
-}  // namespace rocksdb
-
-#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/convenience/info_log_finder.cc b/src/rocksdb/utilities/convenience/info_log_finder.cc
new file mode 100644
index 0000000..acdec51
--- /dev/null
+++ b/src/rocksdb/utilities/convenience/info_log_finder.cc
@@ -0,0 +1,48 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2012 Facebook.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "db/filename.h"
+#include "rocksdb/env.h"
+#include "rocksdb/utilities/info_log_finder.h"
+
+namespace rocksdb {
+
+Status GetInfoLogList(DB* db, std::vector<std::string>* info_log_list) {
+  uint64_t number = 0;
+  FileType type;
+  std::string path;
+
+  if (!db) {
+    return Status::InvalidArgument("DB pointer is not valid");
+  }
+
+  const Options& options = db->GetOptions();
+  if (!options.db_log_dir.empty()) {
+    path = options.db_log_dir;
+  } else {
+    path = db->GetName();
+  }
+  InfoLogPrefix info_log_prefix(!options.db_log_dir.empty(), db->GetName());
+  auto* env = options.env;
+  std::vector<std::string> file_names;
+  Status s = env->GetChildren(path, &file_names);
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  for (auto f : file_names) {
+    if (ParseFileName(f, &number, info_log_prefix.prefix, &type) &&
+        (type == kInfoLogFile)) {
+      info_log_list->push_back(f);
+    }
+  }
+  return Status::OK();
+}
+}  // namespace rocksdb
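
GetInfoLogList resolves the info-log directory (db_log_dir when set, otherwise
the DB path), lists it via Env::GetChildren, and keeps only the names that
parse as kInfoLogFile. A short usage sketch:

    #include <cstdio>
    #include <string>
    #include <vector>
    #include "rocksdb/db.h"
    #include "rocksdb/utilities/info_log_finder.h"

    void PrintInfoLogs(rocksdb::DB* db) {
      std::vector<std::string> logs;
      if (rocksdb::GetInfoLogList(db, &logs).ok()) {
        for (const auto& name : logs) {
          std::printf("info log: %s\n", name.c_str());
        }
      }
    }
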
diff --git a/src/rocksdb/utilities/document/document_db_test.cc b/src/rocksdb/utilities/document/document_db_test.cc
index d02b58f..03bebf4 100644
--- a/src/rocksdb/utilities/document/document_db_test.cc
+++ b/src/rocksdb/utilities/document/document_db_test.cc
@@ -3,6 +3,8 @@
 //  LICENSE file in the root directory of this source tree. An additional grant
 //  of patent rights can be found in the PATENTS file in the same directory.
 
+#ifndef ROCKSDB_LITE
+
 #include <algorithm>
 
 #include "rocksdb/utilities/json_document.h"
@@ -322,3 +324,13 @@ int main(int argc, char** argv) {
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
+
+#else
+#include <stdio.h>
+
+int main(int argc, char** argv) {
+  fprintf(stderr, "SKIPPED as DocumentDB is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/document/json_document.cc b/src/rocksdb/utilities/document/json_document.cc
index 213bc53..99376d2 100644
--- a/src/rocksdb/utilities/document/json_document.cc
+++ b/src/rocksdb/utilities/document/json_document.cc
@@ -484,7 +484,7 @@ std::string JSONDocument::DebugString() const {
 
 JSONDocument::ItemsIteratorGenerator JSONDocument::Items() const {
   assert(IsObject());
-  return ItemsIteratorGenerator(*(static_cast<fbson::ObjectVal*>(value_)));
+  return ItemsIteratorGenerator(*(reinterpret_cast<fbson::ObjectVal*>(value_)));
 }
 
 // TODO(icanadi) (perf) allocate objects with arena
diff --git a/src/rocksdb/utilities/document/json_document_test.cc b/src/rocksdb/utilities/document/json_document_test.cc
index d15cd0c..b9d6dcf 100644
--- a/src/rocksdb/utilities/document/json_document_test.cc
+++ b/src/rocksdb/utilities/document/json_document_test.cc
@@ -3,6 +3,8 @@
 //  LICENSE file in the root directory of this source tree. An additional grant
 //  of patent rights can be found in the PATENTS file in the same directory.
 
+#ifndef ROCKSDB_LITE
+
 #include <map>
 #include <set>
 #include <string>
@@ -327,3 +329,13 @@ int main(int argc, char** argv) {
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
+
+#else
+#include <stdio.h>
+
+int main(int argc, char** argv) {
+  fprintf(stderr, "SKIPPED as JSONDocument is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/geodb/geodb_impl.cc b/src/rocksdb/utilities/geodb/geodb_impl.cc
index 6f285fb..afa2f4c 100644
--- a/src/rocksdb/utilities/geodb/geodb_impl.cc
+++ b/src/rocksdb/utilities/geodb/geodb_impl.cc
@@ -35,6 +35,13 @@
 
 namespace rocksdb {
 
+const double GeoDBImpl::PI = 3.141592653589793;
+const double GeoDBImpl::EarthRadius = 6378137;
+const double GeoDBImpl::MinLatitude = -85.05112878;
+const double GeoDBImpl::MaxLatitude = 85.05112878;
+const double GeoDBImpl::MinLongitude = -180;
+const double GeoDBImpl::MaxLongitude = 180;
+
 GeoDBImpl::GeoDBImpl(DB* db, const GeoDBOptions& options) :
   GeoDB(db, options), db_(db), options_(options) {
 }
diff --git a/src/rocksdb/utilities/geodb/geodb_impl.h b/src/rocksdb/utilities/geodb/geodb_impl.h
index 35b7a85..aaf3a25 100644
--- a/src/rocksdb/utilities/geodb/geodb_impl.h
+++ b/src/rocksdb/utilities/geodb/geodb_impl.h
@@ -56,8 +56,9 @@ class GeoDBImpl : public GeoDB {
   const WriteOptions woptions_;
   const ReadOptions roptions_;
 
+  // MSVC requires the definition for this static const to be in the .cc file
   // The value of PI
-  static constexpr double PI = 3.141592653589793;
+  static const double PI;
 
   // convert degrees to radians
   static double radians(double x);
@@ -95,11 +96,12 @@ class GeoDBImpl : public GeoDB {
   // http://www.tuicool.com/articles/NBrE73
   //
   const int Detail = 23;
-  static constexpr double EarthRadius = 6378137;
-  static constexpr double MinLatitude = -85.05112878;
-  static constexpr double MaxLatitude = 85.05112878;
-  static constexpr double MinLongitude = -180;
-  static constexpr double MaxLongitude = 180;
+  // MSVC requires the definition for this static const to be in the .cc file
+  static const double EarthRadius;
+  static const double MinLatitude;
+  static const double MaxLatitude;
+  static const double MinLongitude;
+  static const double MaxLongitude;
 
   // clips a number to the specified minimum and maximum values.
   static double clip(double n, double minValue, double maxValue) {
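
The geodb hunks trade constexpr members for the portable pre-C++17 pattern
noted in the comments: declare the static const in the header and define it
once in the .cc file (presumably because the targeted MSVC did not yet accept
constexpr). In isolation:

    // geodb_impl.h -- declaration only
    class GeoDBImpl {
      static const double PI;
    };

    // geodb_impl.cc -- the single out-of-class definition
    const double GeoDBImpl::PI = 3.141592653589793;
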
diff --git a/src/rocksdb/utilities/geodb/geodb_test.cc b/src/rocksdb/utilities/geodb/geodb_test.cc
index 93fa1e1..503e533 100644
--- a/src/rocksdb/utilities/geodb/geodb_test.cc
+++ b/src/rocksdb/utilities/geodb/geodb_test.cc
@@ -3,7 +3,7 @@
 //  LICENSE file in the root directory of this source tree. An additional grant
 //  of patent rights can be found in the PATENTS file in the same directory.
 //
-//
+#ifndef ROCKSDB_LITE
 #include "utilities/geodb/geodb_impl.h"
 
 #include <cctype>
@@ -35,7 +35,7 @@ class GeoDBTest : public testing::Test {
   }
 };
 
-const std::string GeoDBTest::kDefaultDbName = "/tmp/geodefault";
+const std::string GeoDBTest::kDefaultDbName = test::TmpDir() + "/geodb_test";
 Options GeoDBTest::options = Options();
 
 // Insert, Get and Remove
@@ -122,3 +122,13 @@ int main(int argc, char* argv[]) {
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
+#else
+
+#include <stdio.h>
+
+int main() {
+  fprintf(stderr, "SKIPPED\n");
+  return 0;
+}
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/merge_operators/string_append/stringappend2.cc b/src/rocksdb/utilities/merge_operators/string_append/stringappend2.cc
index b2e0358..1dd8262 100644
--- a/src/rocksdb/utilities/merge_operators/string_append/stringappend2.cc
+++ b/src/rocksdb/utilities/merge_operators/string_append/stringappend2.cc
@@ -33,7 +33,7 @@ bool StringAppendTESTOperator::FullMerge(
   new_value->clear();
 
   // Compute the space needed for the final result.
-  int numBytes = 0;
+  size_t numBytes = 0;
   for(auto it = operands.begin(); it != operands.end(); ++it) {
     numBytes += it->size() + 1;   // Plus 1 for the delimiter
   }
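
The stringappend2.cc fix widens the byte counter from int to size_t:
std::string::size() returns size_t, so accumulating into an int narrows the
value and can overflow once the combined operands pass INT_MAX bytes. The
corrected accumulation in isolation, assuming string operands as in FullMerge:

    #include <cstddef>
    #include <deque>
    #include <string>

    size_t TotalBytes(const std::deque<std::string>& operands) {
      size_t num_bytes = 0;
      for (const std::string& op : operands) {
        num_bytes += op.size() + 1;  // plus 1 for the delimiter
      }
      return num_bytes;
    }
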
diff --git a/src/rocksdb/utilities/merge_operators/string_append/stringappend_test.cc b/src/rocksdb/utilities/merge_operators/string_append/stringappend_test.cc
index a0d137c..a12e130 100644
--- a/src/rocksdb/utilities/merge_operators/string_append/stringappend_test.cc
+++ b/src/rocksdb/utilities/merge_operators/string_append/stringappend_test.cc
@@ -23,7 +23,7 @@ using namespace rocksdb;
 namespace rocksdb {
 
 // Path to the database on file system
-const std::string kDbName = "/tmp/mergetestdb";
+const std::string kDbName = test::TmpDir() + "/stringappend_test";
 
 namespace {
 // OpenDb opens a (possibly new) rocksdb database with a StringAppendOperator
@@ -36,6 +36,7 @@ std::shared_ptr<DB> OpenNormalDb(char delim_char) {
   return std::shared_ptr<DB>(db);
 }
 
+#ifndef ROCKSDB_LITE  // TtlDb is not supported in Lite
 // Open a TtlDB with a non-associative StringAppendTESTOperator
 std::shared_ptr<DB> OpenTtlDb(char delim_char) {
   DBWithTTL* db;
@@ -45,6 +46,7 @@ std::shared_ptr<DB> OpenTtlDb(char delim_char) {
   EXPECT_OK(DBWithTTL::Open(options, kDbName, &db, 123456));
   return std::shared_ptr<DB>(db);
 }
+#endif  // !ROCKSDB_LITE
 }  // namespace
 
 /// StringLists represents a set of string-lists, each with a key-index.
@@ -515,7 +517,7 @@ TEST_F(StringAppendOperatorTest, PersistentFlushAndCompaction) {
     slists.Append("c", "bbnagnagsx");
     slists.Append("a", "sa");
     slists.Append("b", "df");
-    db->CompactRange(nullptr, nullptr);
+    db->CompactRange(CompactRangeOptions(), nullptr, nullptr);
     slists.Get("a", &a);
     slists.Get("b", &b);
     slists.Get("c", &c);
@@ -536,7 +538,7 @@ TEST_F(StringAppendOperatorTest, PersistentFlushAndCompaction) {
     ASSERT_EQ(c, "asdasd\nasdasd\nbbnagnagsx\nrogosh");
 
     // Compact, Get
-    db->CompactRange(nullptr, nullptr);
+    db->CompactRange(CompactRangeOptions(), nullptr, nullptr);
     ASSERT_EQ(a, "x\nt\nr\nsa\ngh\njk");
     ASSERT_EQ(b, "y\n2\nmonkey\ndf\nl;");
     ASSERT_EQ(c, "asdasd\nasdasd\nbbnagnagsx\nrogosh");
@@ -544,7 +546,7 @@ TEST_F(StringAppendOperatorTest, PersistentFlushAndCompaction) {
     // Append, Flush, Compact, Get
     slists.Append("b", "afcg");
     db->Flush(rocksdb::FlushOptions());
-    db->CompactRange(nullptr, nullptr);
+    db->CompactRange(CompactRangeOptions(), nullptr, nullptr);
     slists.Get("b", &b);
     ASSERT_EQ(b, "y\n2\nmonkey\ndf\nl;\nafcg");
   }
@@ -585,12 +587,14 @@ int main(int argc, char** argv) {
     result = RUN_ALL_TESTS();
   }
 
+#ifndef ROCKSDB_LITE  // TtlDb is not supported in Lite
   // Run with TTL
   {
     fprintf(stderr, "Running tests with ttl db and generic operator.\n");
     StringAppendOperatorTest::SetOpenDbFunction(&OpenTtlDb);
     result |= RUN_ALL_TESTS();
   }
+#endif  // !ROCKSDB_LITE
 
   return result;
 }
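
The repeated one-line change in this test tracks an API revision: CompactRange
now takes a CompactRangeOptions struct ahead of the optional begin/end key
bounds. A minimal sketch of the new form:

    #include "rocksdb/db.h"

    rocksdb::Status CompactAll(rocksdb::DB* db) {
      rocksdb::CompactRangeOptions options;  // defaults suffice here
      // nullptr bounds mean "compact the entire key range".
      return db->CompactRange(options, nullptr, nullptr);
    }
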
diff --git a/src/rocksdb/utilities/merge_operators/uint64add.cc b/src/rocksdb/utilities/merge_operators/uint64add.cc
index d5083e3..6024beb 100644
--- a/src/rocksdb/utilities/merge_operators/uint64add.cc
+++ b/src/rocksdb/utilities/merge_operators/uint64add.cc
@@ -4,6 +4,7 @@
 // of patent rights can be found in the PATENTS file in the same directory.
 
 #include <memory>
+
 #include "rocksdb/env.h"
 #include "rocksdb/merge_operator.h"
 #include "rocksdb/slice.h"
@@ -51,7 +52,8 @@ class UInt64AddOperator : public AssociativeMergeOperator {
     } else if (logger != nullptr) {
       // If value is corrupted, treat it as 0
       Log(InfoLogLevel::ERROR_LEVEL, logger,
-          "uint64 value corruption, size: %zu > %zu",
+          "uint64 value corruption, size: %" ROCKSDB_PRIszt
+          " > %" ROCKSDB_PRIszt,
           value.size(), sizeof(uint64_t));
     }
 
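
ROCKSDB_PRIszt replaces the plain %zu conversions so these format strings also
compile and print correctly under MSVC, whose printf historically lacked %zu.
The macro supplies everything after the '%'; its real definition lives in the
port layer, but it is presumably along these lines (an assumption, shown only
for illustration):

    #include <cstdint>
    #include <cstdio>

    #if defined(_MSC_VER)
    #define ROCKSDB_PRIszt "Iu"  // MSVC spelling of the size_t conversion
    #else
    #define ROCKSDB_PRIszt "zu"
    #endif

    int main() {
      std::printf("size: %" ROCKSDB_PRIszt "\n", sizeof(uint64_t));
      return 0;
    }
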
diff --git a/src/rocksdb/utilities/redis/redis_lists_test.cc b/src/rocksdb/utilities/redis/redis_lists_test.cc
index 14ed316..3ef35f7 100644
--- a/src/rocksdb/utilities/redis/redis_lists_test.cc
+++ b/src/rocksdb/utilities/redis/redis_lists_test.cc
@@ -15,6 +15,7 @@
  * @author Deon Nicholas (dnicholas at fb.com)
  */
 
+#ifndef ROCKSDB_LITE
 
 #include <iostream>
 #include <cctype>
@@ -38,7 +39,8 @@ class RedisListsTest : public testing::Test {
   }
 };
 
-const string RedisListsTest::kDefaultDbName = "/tmp/redisdefaultdb/";
+const string RedisListsTest::kDefaultDbName =
+    test::TmpDir() + "/redis_lists_test";
 Options RedisListsTest::options = Options();
 
 // operator== and operator<< are defined below for vectors (lists)
@@ -882,3 +884,12 @@ int main(int argc, char* argv[]) {
   }
 }
 
+#else
+#include <stdio.h>
+
+int main(int argc, char* argv[]) {
+  fprintf(stderr, "SKIPPED as redis is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/spatialdb/spatial_db.cc b/src/rocksdb/utilities/spatialdb/spatial_db.cc
index a901853..36c9ed1 100644
--- a/src/rocksdb/utilities/spatialdb/spatial_db.cc
+++ b/src/rocksdb/utilities/spatialdb/spatial_db.cc
@@ -65,28 +65,56 @@ inline bool GetSpatialIndexName(const std::string& column_family_name,
 
 }  // namespace
 
-Variant::Variant(const Variant& v) : type_(v.type_) {
+void Variant::Init(const Variant& v, Data& d) {
   switch (v.type_) {
     case kNull:
       break;
     case kBool:
-      data_.b = v.data_.b;
+      d.b = v.data_.b;
       break;
     case kInt:
-      data_.i = v.data_.i;
+      d.i = v.data_.i;
       break;
     case kDouble:
-      data_.d = v.data_.d;
+      d.d = v.data_.d;
       break;
     case kString:
-      new (&data_.s) std::string(v.data_.s);
+      new (d.s) std::string(*GetStringPtr(v.data_));
       break;
     default:
       assert(false);
   }
 }
 
-bool Variant::operator==(const Variant& rhs) {
+Variant& Variant::operator=(const Variant& v) {
+  // Construct a temporary first, so that an exception from the string ctor
+  // cannot change this object.
+  Data tmp;
+  Init(v, tmp);
+
+  Type thisType = type_;
+  // Boils down to copying bits so safe
+  std::swap(tmp, data_);
+  type_ = v.type_;
+
+  Destroy(thisType, tmp);
+
+  return *this;
+}
+
+Variant& Variant::operator=(Variant&& rhs) {
+  Destroy(type_, data_);
+  if (rhs.type_ == kString) {
+    new (data_.s) std::string(std::move(*GetStringPtr(rhs.data_)));
+  } else {
+    data_ = rhs.data_;
+  }
+  type_ = rhs.type_;
+  rhs.type_ = kNull;
+  return *this;
+}
+
+bool Variant::operator==(const Variant& rhs) const {
   if (type_ != rhs.type_) {
     return false;
   }
@@ -101,7 +129,7 @@ bool Variant::operator==(const Variant& rhs) {
     case kDouble:
       return data_.d == rhs.data_.d;
     case kString:
-      return data_.s == rhs.data_.s;
+      return *GetStringPtr(data_) == *GetStringPtr(rhs.data_);
     default:
       assert(false);
   }
@@ -109,8 +137,6 @@ bool Variant::operator==(const Variant& rhs) {
   return false;
 }
 
-bool Variant::operator!=(const Variant& rhs) { return !(*this == rhs); }
-
 FeatureSet* FeatureSet::Set(const std::string& key, const Variant& value) {
   map_.insert({key, value});
   return this;
@@ -589,7 +615,7 @@ class SpatialDBImpl : public SpatialDB {
 
           Status t = Flush(FlushOptions(), cfh);
           if (t.ok()) {
-            t = CompactRange(cfh, nullptr, nullptr);
+            t = CompactRange(CompactRangeOptions(), cfh, nullptr, nullptr);
           }
 
           {
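
The new Variant assignment operators above follow the copy-and-swap idiom: the
incoming state is built into a temporary Data first, so a throwing std::string
constructor leaves *this untouched; only then are the raw bytes swapped and
the old state destroyed. The same idea in a generic sketch:

    #include <string>
    #include <utility>

    class Holder {
     public:
      Holder() = default;
      Holder(const Holder&) = default;
      Holder& operator=(const Holder& rhs) {
        Holder tmp(rhs);              // may throw; *this is still intact
        std::swap(data_, tmp.data_);  // nothrow exchange
        return *this;                 // tmp's destructor frees the old state
      }
     private:
      std::string data_;
    };
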
diff --git a/src/rocksdb/utilities/spatialdb/spatial_db_test.cc b/src/rocksdb/utilities/spatialdb/spatial_db_test.cc
index b304664..41f3cd6 100644
--- a/src/rocksdb/utilities/spatialdb/spatial_db_test.cc
+++ b/src/rocksdb/utilities/spatialdb/spatial_db_test.cc
@@ -3,11 +3,14 @@
 //  LICENSE file in the root directory of this source tree. An additional grant
 //  of patent rights can be found in the PATENTS file in the same directory.
 
+#ifndef ROCKSDB_LITE
+
 #include <vector>
 #include <string>
 #include <set>
 
 #include "rocksdb/utilities/spatial_db.h"
+#include "util/compression.h"
 #include "util/testharness.h"
 #include "util/testutil.h"
 #include "util/random.h"
@@ -47,6 +50,9 @@ class SpatialDBTest : public testing::Test {
 };
 
 TEST_F(SpatialDBTest, FeatureSetSerializeTest) {
+  if (!LZ4_Supported()) {
+    return;
+  }
   FeatureSet fs;
 
   fs.Set("a", std::string("b"));
@@ -94,6 +100,9 @@ TEST_F(SpatialDBTest, FeatureSetSerializeTest) {
 }
 
 TEST_F(SpatialDBTest, TestNextID) {
+  if (!LZ4_Supported()) {
+    return;
+  }
   ASSERT_OK(SpatialDB::Create(
       SpatialDBOptions(), dbname_,
       {SpatialIndexOptions("simple", BoundingBox<double>(0, 0, 100, 100), 2)}));
@@ -117,6 +126,9 @@ TEST_F(SpatialDBTest, TestNextID) {
 }
 
 TEST_F(SpatialDBTest, FeatureSetTest) {
+  if (!LZ4_Supported()) {
+    return;
+  }
   ASSERT_OK(SpatialDB::Create(
       SpatialDBOptions(), dbname_,
       {SpatialIndexOptions("simple", BoundingBox<double>(0, 0, 100, 100), 2)}));
@@ -151,6 +163,9 @@ TEST_F(SpatialDBTest, FeatureSetTest) {
 }
 
 TEST_F(SpatialDBTest, SimpleTest) {
+  if (!LZ4_Supported()) {
+    return;
+  }
   // iter 0 -- not read only
   // iter 1 -- read only
   for (int iter = 0; iter < 2; ++iter) {
@@ -227,6 +242,9 @@ BoundingBox<double> ScaleBB(BoundingBox<int> b, double step) {
 }  // namespace
 
 TEST_F(SpatialDBTest, RandomizedTest) {
+  if (!LZ4_Supported()) {
+    return;
+  }
   Random rnd(301);
   std::vector<std::pair<std::string, BoundingBox<int>>> elements;
 
@@ -272,3 +290,13 @@ int main(int argc, char** argv) {
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
+
+#else
+#include <stdio.h>
+
+int main(int argc, char** argv) {
+  fprintf(stderr, "SKIPPED as SpatialDB is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector.cc b/src/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector.cc
new file mode 100644
index 0000000..be0e53a
--- /dev/null
+++ b/src/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector.cc
@@ -0,0 +1,93 @@
+//  Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef ROCKSDB_LITE
+#include <memory>
+
+#include "rocksdb/utilities/table_properties_collectors.h"
+#include "utilities/table_properties_collectors/compact_on_deletion_collector.h"
+
+namespace rocksdb {
+
+CompactOnDeletionCollector::CompactOnDeletionCollector(
+    size_t sliding_window_size,
+    size_t deletion_trigger) {
+  deletion_trigger_ = deletion_trigger;
+
+  // Compute the number of keys in each bucket.
+  bucket_size_ =
+      (sliding_window_size + kNumBuckets - 1) / kNumBuckets;
+  assert(bucket_size_ > 0U);
+
+  Reset();
+}
+
+void CompactOnDeletionCollector::Reset() {
+  for (int i = 0; i < kNumBuckets; ++i) {
+    num_deletions_in_buckets_[i] = 0;
+  }
+  current_bucket_ = 0;
+  num_keys_in_current_bucket_ = 0;
+  num_deletions_in_observation_window_ = 0;
+  need_compaction_ = false;
+}
+
+// AddUserKey() will be called when a new key/value pair is inserted into the
+// table.
+// @param key    the user key that is inserted into the table.
+// @param value  the value that is inserted into the table.
+// @param file_size  file size up to now
+Status CompactOnDeletionCollector::AddUserKey(
+    const Slice& key, const Slice& value,
+    EntryType type, SequenceNumber seq,
+    uint64_t file_size) {
+  if (need_compaction_) {
+    // If the output file already needs to be compacted, skip the check.
+    return Status::OK();
+  }
+
+  if (num_keys_in_current_bucket_ == bucket_size_) {
+    // When the current bucket is full, advance the cursor of the
+    // ring buffer to the next bucket.
+    current_bucket_ = (current_bucket_ + 1) % kNumBuckets;
+
+    // Update the current count of observed deletion keys by excluding
+    // the number of deletion keys in the oldest bucket in the
+    // observation window.
+    assert(num_deletions_in_observation_window_ >=
+        num_deletions_in_buckets_[current_bucket_]);
+    num_deletions_in_observation_window_ -=
+        num_deletions_in_buckets_[current_bucket_];
+    num_deletions_in_buckets_[current_bucket_] = 0;
+    num_keys_in_current_bucket_ = 0;
+  }
+
+  num_keys_in_current_bucket_++;
+  if (type == kEntryDelete) {
+    num_deletions_in_observation_window_++;
+    num_deletions_in_buckets_[current_bucket_]++;
+    if (num_deletions_in_observation_window_ >= deletion_trigger_) {
+      need_compaction_ = true;
+    }
+  }
+  return Status::OK();
+}
+
+TablePropertiesCollector* CompactOnDeletionCollectorFactory::
+    CreateTablePropertiesCollector() {
+  return new CompactOnDeletionCollector(
+      sliding_window_size_, deletion_trigger_);
+}
+
+std::shared_ptr<TablePropertiesCollectorFactory>
+    NewCompactOnDeletionCollectorFactory(
+        size_t sliding_window_size,
+        size_t deletion_trigger) {
+  return std::shared_ptr<TablePropertiesCollectorFactory>(
+      new CompactOnDeletionCollectorFactory(
+          sliding_window_size, deletion_trigger));
+}
+}  // namespace rocksdb
+#endif  // !ROCKSDB_LITE
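
NewCompactOnDeletionCollectorFactory() is the one public entry point added
by this file. A minimal usage sketch, assuming the stock
table_properties_collector_factories hook on ColumnFamilyOptions (that field
is not part of this patch):

    #include "rocksdb/options.h"
    #include "rocksdb/utilities/table_properties_collectors.h"

    rocksdb::Options options;
    // Mark an SST file for compaction once any window of roughly 10000
    // consecutive entries contains at least 1000 deletions.
    options.table_properties_collector_factories.emplace_back(
        rocksdb::NewCompactOnDeletionCollectorFactory(
            10000 /* sliding_window_size */, 1000 /* deletion_trigger */));
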
diff --git a/src/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector.h b/src/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector.h
new file mode 100644
index 0000000..eb01e43
--- /dev/null
+++ b/src/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector.h
@@ -0,0 +1,101 @@
+//  Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+namespace rocksdb {
+
+// A factory of a table property collector that marks an SST
+// file as need-compaction when it observes at least "D" deletion
+// entries in any "N" consecutive entries.
+class CompactOnDeletionCollectorFactory
+    : public TablePropertiesCollectorFactory {
+ public:
+  // A factory of a table property collector that marks an SST
+  // file as need-compaction when it observes at least "D" deletion
+  // entries in any "N" consecutive entries.
+  //
+  // @param sliding_window_size "N"
+  // @param deletion_trigger "D"
+  CompactOnDeletionCollectorFactory(
+      size_t sliding_window_size,
+      size_t deletion_trigger) :
+          sliding_window_size_(sliding_window_size),
+          deletion_trigger_(deletion_trigger) {}
+
+  virtual ~CompactOnDeletionCollectorFactory() {}
+
+  virtual TablePropertiesCollector* CreateTablePropertiesCollector() override;
+
+  virtual const char* Name() const override {
+    return "CompactOnDeletionCollector";
+  }
+
+ private:
+  size_t sliding_window_size_;
+  size_t deletion_trigger_;
+};
+
+class CompactOnDeletionCollector : public TablePropertiesCollector {
+ public:
+  CompactOnDeletionCollector(
+      size_t sliding_window_size,
+      size_t deletion_trigger);
+
+  // AddUserKey() will be called when a new key/value pair is inserted into the
+  // table.
+  // @param key    the user key that is inserted into the table.
+  // @param value  the value that is inserted into the table.
+  // @param file_size  file size up to now
+  virtual Status AddUserKey(const Slice& key, const Slice& value,
+                            EntryType type, SequenceNumber seq,
+                            uint64_t file_size) override;
+
+  // Finish() will be called when a table has already been built and is ready
+  // for writing the properties block.
+  // @param properties  User will add their collected statistics to
+  // `properties`.
+  virtual Status Finish(UserCollectedProperties* properties) override {
+    Reset();
+    return Status::OK();
+  }
+
+  // Return the human-readable properties, where the key is property name and
+  // the value is the human-readable form of value.
+  virtual UserCollectedProperties GetReadableProperties() const override {
+    return UserCollectedProperties();
+  }
+
+  // The name of the properties collector can be used for debugging purpose.
+  virtual const char* Name() const override {
+    return "CompactOnDeletionCollector";
+  }
+
+  // EXPERIMENTAL Return whether the output file should be further compacted
+  virtual bool NeedCompact() const override {
+    return need_compaction_;
+  }
+
+  static const int kNumBuckets = 128;
+
+ private:
+  void Reset();
+
+  // A ring buffer used to count the number of deletion entries for every
+  // "bucket_size_" keys.
+  size_t num_deletions_in_buckets_[kNumBuckets];
+  // the number of keys in a bucket
+  size_t bucket_size_;
+
+  size_t current_bucket_;
+  size_t num_keys_in_current_bucket_;
+  size_t num_deletions_in_observation_window_;
+  size_t deletion_trigger_;
+  // true if the current SST file needs to be compacted.
+  bool need_compaction_;
+};
+}  // namespace rocksdb
+#endif  // !ROCKSDB_LITE
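
For intuition on the bucket arithmetic above: with kNumBuckets fixed at 128,
a sliding_window_size of 1000 gives bucket_size_ = (1000 + 127) / 128 = 8
keys per bucket, so deletions are effectively counted over a padded window
of 128 * 8 = 1024 keys. The test below reproduces exactly this rounding as
kPaddedWindowSize.
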
diff --git a/src/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc b/src/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc
new file mode 100644
index 0000000..12f4e2e
--- /dev/null
+++ b/src/rocksdb/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc
@@ -0,0 +1,177 @@
+//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <stdio.h>
+
+#ifndef ROCKSDB_LITE
+#include <algorithm>
+#include <cmath>
+#include <vector>
+
+#include "rocksdb/table.h"
+#include "rocksdb/utilities/table_properties_collectors.h"
+#include "util/random.h"
+#include "utilities/table_properties_collectors/compact_on_deletion_collector.h"
+
+int main(int argc, char** argv) {
+  const int kWindowSizes[] =
+      {1000, 10000, 10000, 127, 128, 129, 255, 256, 257, 2, 10000};
+  const int kDeletionTriggers[] =
+      {500, 9500, 4323, 47, 61, 128, 250, 250, 250, 2, 2};
+
+  std::vector<int> window_sizes;
+  std::vector<int> deletion_triggers;
+  // deterministic tests
+  for (int test = 0; test < 9; ++test) {
+    window_sizes.emplace_back(kWindowSizes[test]);
+    deletion_triggers.emplace_back(kDeletionTriggers[test]);
+  }
+
+  // randomize tests
+  rocksdb::Random rnd(301);
+  const int kMaxTestSize = 100000l;
+  for (int random_test = 0; random_test < 100; random_test++) {
+    int window_size = rnd.Uniform(kMaxTestSize) + 1;
+    int deletion_trigger = rnd.Uniform(window_size);
+    window_sizes.emplace_back(window_size);
+    deletion_triggers.emplace_back(deletion_trigger);
+  }
+
+  assert(window_sizes.size() == deletion_triggers.size());
+
+  for (size_t test = 0; test < window_sizes.size(); ++test) {
+    const int kBucketSize = 128;
+    const int kWindowSize = window_sizes[test];
+    const int kPaddedWindowSize =
+        kBucketSize * ((window_sizes[test] + kBucketSize - 1) / kBucketSize);
+    const int kNumDeletionTrigger = deletion_triggers[test];
+    const int kBias = (kNumDeletionTrigger + kBucketSize - 1) / kBucketSize;
+    // Simple test
+    {
+      std::unique_ptr<rocksdb::TablePropertiesCollector> collector;
+      auto factory = rocksdb::NewCompactOnDeletionCollectorFactory(
+          kWindowSize, kNumDeletionTrigger);
+      collector.reset(
+          factory->CreateTablePropertiesCollector());
+      const int kSample = 10;
+      for (int delete_rate = 0; delete_rate <= kSample; ++delete_rate) {
+        int deletions = 0;
+        for (int i = 0; i < kPaddedWindowSize; ++i) {
+          if (i % kSample < delete_rate) {
+            collector->AddUserKey("hello", "rocksdb",
+                                  rocksdb::kEntryDelete, 0, 0);
+            deletions++;
+          } else {
+            collector->AddUserKey("hello", "rocksdb",
+                                  rocksdb::kEntryPut, 0, 0);
+          }
+        }
+        if (collector->NeedCompact() !=
+            (deletions >= kNumDeletionTrigger) &&
+            std::abs(deletions - kNumDeletionTrigger) > kBias) {
+          fprintf(stderr, "[Error] collector->NeedCompact() != (%d >= %d)"
+                  " with kWindowSize = %d and kNumDeletionTrigger = %d\n",
+                  deletions, kNumDeletionTrigger,
+                  kWindowSize, kNumDeletionTrigger);
+          assert(false);
+        }
+        collector->Finish(nullptr);
+      }
+    }
+
+    // Only one section of a file satisfies the compaction trigger
+    {
+      std::unique_ptr<rocksdb::TablePropertiesCollector> collector;
+      auto factory = rocksdb::NewCompactOnDeletionCollectorFactory(
+          kWindowSize, kNumDeletionTrigger);
+      collector.reset(
+          factory->CreateTablePropertiesCollector());
+      const int kSample = 10;
+      for (int delete_rate = 0; delete_rate <= kSample; ++delete_rate) {
+        int deletions = 0;
+        for (int section = 0; section < 5; ++section) {
+          int initial_entries = rnd.Uniform(kWindowSize) + kWindowSize;
+          for (int i = 0; i < initial_entries; ++i) {
+            collector->AddUserKey("hello", "rocksdb",
+                                  rocksdb::kEntryPut, 0, 0);
+          }
+        }
+        for (int i = 0; i < kPaddedWindowSize; ++i) {
+          if (i % kSample < delete_rate) {
+            collector->AddUserKey("hello", "rocksdb",
+                                  rocksdb::kEntryDelete, 0, 0);
+            deletions++;
+          } else {
+            collector->AddUserKey("hello", "rocksdb",
+                                  rocksdb::kEntryPut, 0, 0);
+          }
+        }
+        for (int section = 0; section < 5; ++section) {
+          int ending_entries = rnd.Uniform(kWindowSize) + kWindowSize;
+          for (int i = 0; i < ending_entries; ++i) {
+            collector->AddUserKey("hello", "rocksdb",
+                                  rocksdb::kEntryPut, 0, 0);
+          }
+        }
+        if (collector->NeedCompact() != (deletions >= kNumDeletionTrigger) &&
+            std::abs(deletions - kNumDeletionTrigger) > kBias) {
+          fprintf(stderr, "[Error] collector->NeedCompact() %d != (%d >= %d)"
+                  " with kWindowSize = %d, kNumDeletionTrigger = %d\n",
+                  collector->NeedCompact(),
+                  deletions, kNumDeletionTrigger, kWindowSize,
+                  kNumDeletionTrigger);
+          assert(false);
+        }
+        collector->Finish(nullptr);
+      }
+    }
+
+    // TEST 3: Issues a lot of deletes, but their density is not
+    // high enough to trigger compaction.
+    {
+      std::unique_ptr<rocksdb::TablePropertiesCollector> collector;
+      auto factory = rocksdb::NewCompactOnDeletionCollectorFactory(
+          kWindowSize, kNumDeletionTrigger);
+      collector.reset(
+          factory->CreateTablePropertiesCollector());
+      assert(collector->NeedCompact() == false);
+      // Insert "kNumDeletionTrigger * 0.95" deletions for every
+      // "kWindowSize" keys and verify compaction is not needed.
+      const int kDeletionsPerSection = kNumDeletionTrigger * 95 / 100;
+      if (kDeletionsPerSection >= 0) {
+        for (int section = 0; section < 200; ++section) {
+          for (int i = 0; i < kPaddedWindowSize; ++i) {
+            if (i < kDeletionsPerSection) {
+              collector->AddUserKey("hello", "rocksdb",
+                                    rocksdb::kEntryDelete, 0, 0);
+            } else {
+              collector->AddUserKey("hello", "rocksdb",
+                                    rocksdb::kEntryPut, 0, 0);
+            }
+          }
+        }
+        if (collector->NeedCompact() &&
+            std::abs(kDeletionsPerSection - kNumDeletionTrigger) > kBias) {
+          fprintf(stderr, "[Error] collector->NeedCompact() != false"
+                  " with kWindowSize = %d and kNumDeletionTrigger = %d\n",
+                  kWindowSize, kNumDeletionTrigger);
+          assert(false);
+        }
+        collector->Finish(nullptr);
+      }
+    }
+  }
+  fprintf(stderr, "PASSED\n");
+}
+#else
+int main(int argc, char** argv) {
+  fprintf(stderr, "SKIPPED as RocksDBLite does not include utilities.\n");
+  return 0;
+}
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/optimistic_transaction_db_impl.cc b/src/rocksdb/utilities/transactions/optimistic_transaction_db_impl.cc
new file mode 100644
index 0000000..ca98972
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/optimistic_transaction_db_impl.cc
@@ -0,0 +1,80 @@
+//  Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef ROCKSDB_LITE
+
+#include <string>
+#include <vector>
+
+#include "utilities/transactions/optimistic_transaction_db_impl.h"
+
+#include "db/db_impl.h"
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/utilities/optimistic_transaction_db.h"
+#include "utilities/transactions/optimistic_transaction_impl.h"
+
+namespace rocksdb {
+
+Transaction* OptimisticTransactionDBImpl::BeginTransaction(
+    const WriteOptions& write_options,
+    const OptimisticTransactionOptions& txn_options) {
+  Transaction* txn =
+      new OptimisticTransactionImpl(this, write_options, txn_options);
+
+  return txn;
+}
+
+Status OptimisticTransactionDB::Open(const Options& options,
+                                     const std::string& dbname,
+                                     OptimisticTransactionDB** dbptr) {
+  DBOptions db_options(options);
+  ColumnFamilyOptions cf_options(options);
+  std::vector<ColumnFamilyDescriptor> column_families;
+  column_families.push_back(
+      ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
+  std::vector<ColumnFamilyHandle*> handles;
+  Status s = Open(db_options, dbname, column_families, &handles, dbptr);
+  if (s.ok()) {
+    assert(handles.size() == 1);
+    // We can delete the handle since DBImpl always holds a reference to
+    // the default column family.
+    delete handles[0];
+  }
+
+  return s;
+}
+
+Status OptimisticTransactionDB::Open(
+    const DBOptions& db_options, const std::string& dbname,
+    const std::vector<ColumnFamilyDescriptor>& column_families,
+    std::vector<ColumnFamilyHandle*>* handles,
+    OptimisticTransactionDB** dbptr) {
+  Status s;
+  DB* db;
+
+  std::vector<ColumnFamilyDescriptor> column_families_copy = column_families;
+
+  // Enable MemTable History if not already enabled
+  for (auto& column_family : column_families_copy) {
+    ColumnFamilyOptions* options = &column_family.options;
+
+    if (options->max_write_buffer_number_to_maintain == 0) {
+      // Setting this to -1 sizes the history to max_write_buffer_number.
+      options->max_write_buffer_number_to_maintain = -1;
+    }
+  }
+
+  s = DB::Open(db_options, dbname, column_families_copy, handles, &db);
+
+  if (s.ok()) {
+    *dbptr = new OptimisticTransactionDBImpl(db);
+  }
+
+  return s;
+}
+
+}  //  namespace rocksdb
+#endif  // ROCKSDB_LITE
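
Open() enables MemTable history above because commit-time validation has to
look up the recent sequence number of every key a transaction touched. A
caller can also size the history explicitly instead of taking the -1 (match
max_write_buffer_number) default; a sketch, assuming the stock
ColumnFamilyOptions field:

    #include "rocksdb/utilities/optimistic_transaction_db.h"

    rocksdb::Options options;
    options.create_if_missing = true;
    // Keep up to 4 write buffers (active plus flushed) around so conflict
    // checking can see further into the past than the default allows.
    options.max_write_buffer_number_to_maintain = 4;

    rocksdb::OptimisticTransactionDB* txn_db;
    rocksdb::Status s = rocksdb::OptimisticTransactionDB::Open(
        options, "/tmp/txn_db", &txn_db);
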
diff --git a/src/rocksdb/utilities/transactions/optimistic_transaction_db_impl.h b/src/rocksdb/utilities/transactions/optimistic_transaction_db_impl.h
new file mode 100644
index 0000000..ec5b428
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/optimistic_transaction_db_impl.h
@@ -0,0 +1,33 @@
+//  Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/utilities/optimistic_transaction_db.h"
+
+namespace rocksdb {
+
+class OptimisticTransactionDBImpl : public OptimisticTransactionDB {
+ public:
+  explicit OptimisticTransactionDBImpl(DB* db)
+      : OptimisticTransactionDB(db), db_(db) {}
+
+  ~OptimisticTransactionDBImpl() {}
+
+  Transaction* BeginTransaction(
+      const WriteOptions& write_options,
+      const OptimisticTransactionOptions& txn_options) override;
+
+  DB* GetBaseDB() override { return db_.get(); }
+
+ private:
+  std::unique_ptr<DB> db_;
+};
+
+}  //  namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/optimistic_transaction_impl.cc b/src/rocksdb/utilities/transactions/optimistic_transaction_impl.cc
new file mode 100644
index 0000000..4bd262e
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/optimistic_transaction_impl.cc
@@ -0,0 +1,109 @@
+//  Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/transactions/optimistic_transaction_impl.h"
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/db_impl.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/status.h"
+#include "rocksdb/utilities/optimistic_transaction_db.h"
+#include "util/string_util.h"
+#include "utilities/transactions/transaction_util.h"
+
+namespace rocksdb {
+
+struct WriteOptions;
+
+OptimisticTransactionImpl::OptimisticTransactionImpl(
+    OptimisticTransactionDB* txn_db, const WriteOptions& write_options,
+    const OptimisticTransactionOptions& txn_options)
+    : TransactionBaseImpl(txn_db->GetBaseDB(), write_options), txn_db_(txn_db) {
+  if (txn_options.set_snapshot) {
+    SetSnapshot();
+  }
+}
+
+OptimisticTransactionImpl::~OptimisticTransactionImpl() {
+}
+
+void OptimisticTransactionImpl::Clear() {
+  TransactionBaseImpl::Clear();
+}
+
+Status OptimisticTransactionImpl::Commit() {
+  // Set up callback which will call CheckTransactionForConflicts() to
+  // check whether this transaction is safe to be committed.
+  OptimisticTransactionCallback callback(this);
+
+  DBImpl* db_impl = dynamic_cast<DBImpl*>(db_->GetRootDB());
+  if (db_impl == nullptr) {
+    // This should only happen if we support creating transactions from
+    // a StackableDB and someone overrides GetRootDB().
+    return Status::InvalidArgument(
+        "DB::GetRootDB() returned an unexpected DB class");
+  }
+
+  Status s = db_impl->WriteWithCallback(
+      write_options_, write_batch_->GetWriteBatch(), &callback);
+
+  if (s.ok()) {
+    Clear();
+  }
+
+  return s;
+}
+
+void OptimisticTransactionImpl::Rollback() { Clear(); }
+
+// Record this key so that we can check it for conflicts at commit time.
+Status OptimisticTransactionImpl::TryLock(ColumnFamilyHandle* column_family,
+                                          const Slice& key, bool untracked) {
+  if (untracked) {
+    return Status::OK();
+  }
+  uint32_t cfh_id = GetColumnFamilyID(column_family);
+
+  SequenceNumber seq;
+  if (snapshot_) {
+    seq = snapshot_->snapshot()->GetSequenceNumber();
+  } else {
+    seq = db_->GetLatestSequenceNumber();
+  }
+
+  std::string key_str = key.ToString();
+
+  TrackKey(cfh_id, key_str, seq);
+
+  // Always return OK. Conflict checking will happen at commit time.
+  return Status::OK();
+}
+
+// Returns OK if it is safe to commit this transaction.  Returns Status::Busy
+// if there are read or write conflicts that would prevent us from committing OR
+// if we cannot determine whether there would be any such conflicts.
+//
+// Should only be called on the writer thread in order to avoid any race
+// conditions in detecting write conflicts.
+Status OptimisticTransactionImpl::CheckTransactionForConflicts(DB* db) {
+  assert(dynamic_cast<DBImpl*>(db) != nullptr);
+  // static_cast is sufficient once the assertion above has verified the
+  // concrete type; reinterpret_cast would skip any pointer adjustment.
+  auto db_impl = static_cast<DBImpl*>(db);
+
+  return TransactionUtil::CheckKeysForConflicts(db_impl, GetTrackedKeys());
+}
+
+}  // namespace rocksdb
+
+#endif  // ROCKSDB_LITE
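
Tying the pieces above together: TryLock() only records (key, sequence
number) pairs, and OptimisticTransactionCallback runs
CheckTransactionForConflicts() on the writer thread inside
WriteWithCallback(). The user-visible flow, condensed from the tests that
follow:

    rocksdb::Transaction* txn =
        txn_db->BeginTransaction(rocksdb::WriteOptions());

    std::string value;
    // GetForUpdate() tracks the key so that a later external write to it
    // is detected at commit time.
    txn->GetForUpdate(rocksdb::ReadOptions(), "foo", &value);
    txn->Put("foo", "bar2");

    rocksdb::Status s = txn->Commit();
    if (s.IsBusy()) {
      // Someone else wrote "foo" after we read it; nothing was applied.
    }
    delete txn;
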
diff --git a/src/rocksdb/utilities/transactions/optimistic_transaction_impl.h b/src/rocksdb/utilities/transactions/optimistic_transaction_impl.h
new file mode 100644
index 0000000..a18561e
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/optimistic_transaction_impl.h
@@ -0,0 +1,80 @@
+// Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <stack>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "db/write_callback.h"
+#include "rocksdb/db.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/snapshot.h"
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+#include "rocksdb/utilities/transaction.h"
+#include "rocksdb/utilities/optimistic_transaction_db.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "utilities/transactions/transaction_base.h"
+#include "utilities/transactions/transaction_util.h"
+
+namespace rocksdb {
+
+class OptimisticTransactionImpl : public TransactionBaseImpl {
+ public:
+  OptimisticTransactionImpl(OptimisticTransactionDB* db,
+                            const WriteOptions& write_options,
+                            const OptimisticTransactionOptions& txn_options);
+
+  virtual ~OptimisticTransactionImpl();
+
+  Status Commit() override;
+
+  void Rollback() override;
+
+ protected:
+  Status TryLock(ColumnFamilyHandle* column_family, const Slice& key,
+                 bool untracked = false) override;
+
+ private:
+  OptimisticTransactionDB* const txn_db_;
+
+  friend class OptimisticTransactionCallback;
+
+  // Returns OK if it is safe to commit this transaction.  Returns Status::Busy
+  // if there are read or write conflicts that would prevent us from committing
+  // OR if we can not determine whether there would be any such conflicts.
+  //
+  // Should only be called on writer thread.
+  Status CheckTransactionForConflicts(DB* db);
+
+  void Clear() override;
+
+  // No copying allowed
+  OptimisticTransactionImpl(const OptimisticTransactionImpl&);
+  void operator=(const OptimisticTransactionImpl&);
+};
+
+// Used at commit time to trigger transaction validation
+class OptimisticTransactionCallback : public WriteCallback {
+ public:
+  explicit OptimisticTransactionCallback(OptimisticTransactionImpl* txn)
+      : txn_(txn) {}
+
+  Status Callback(DB* db) override {
+    return txn_->CheckTransactionForConflicts(db);
+  }
+
+ private:
+  OptimisticTransactionImpl* txn_;
+};
+
+}  // namespace rocksdb
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/optimistic_transaction_test.cc b/src/rocksdb/utilities/transactions/optimistic_transaction_test.cc
new file mode 100644
index 0000000..6fe7e95
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/optimistic_transaction_test.cc
@@ -0,0 +1,1134 @@
+//  Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef ROCKSDB_LITE
+
+#include <string>
+
+#include "rocksdb/db.h"
+#include "rocksdb/utilities/transaction.h"
+#include "rocksdb/utilities/optimistic_transaction_db.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+
+using std::string;
+
+namespace rocksdb {
+
+class OptimisticTransactionTest : public testing::Test {
+ public:
+  OptimisticTransactionDB* txn_db;
+  DB* db;
+  string dbname;
+  Options options;
+
+  OptimisticTransactionTest() {
+    options.create_if_missing = true;
+    options.max_write_buffer_number = 2;
+    dbname = test::TmpDir() + "/optimistic_transaction_testdb";
+
+    DestroyDB(dbname, options);
+    Status s = OptimisticTransactionDB::Open(options, dbname, &txn_db);
+    assert(s.ok());
+    db = txn_db->GetBaseDB();
+  }
+  ~OptimisticTransactionTest() {
+    delete txn_db;
+    DestroyDB(dbname, options);
+  }
+};
+
+TEST_F(OptimisticTransactionTest, SuccessTest) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  string value;
+  Status s;
+
+  db->Put(write_options, Slice("foo"), Slice("bar"));
+  db->Put(write_options, Slice("foo2"), Slice("bar"));
+
+  Transaction* txn = txn_db->BeginTransaction(write_options);
+  ASSERT_TRUE(txn);
+
+  txn->GetForUpdate(read_options, "foo", &value);
+  ASSERT_EQ(value, "bar");
+
+  txn->Put(Slice("foo"), Slice("bar2"));
+
+  txn->GetForUpdate(read_options, "foo", &value);
+  ASSERT_EQ(value, "bar2");
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+
+  db->Get(read_options, "foo", &value);
+  ASSERT_EQ(value, "bar2");
+
+  delete txn;
+}
+
+TEST_F(OptimisticTransactionTest, WriteConflictTest) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  string value;
+  Status s;
+
+  db->Put(write_options, "foo", "bar");
+  db->Put(write_options, "foo2", "bar");
+
+  Transaction* txn = txn_db->BeginTransaction(write_options);
+  ASSERT_TRUE(txn);
+
+  txn->Put("foo", "bar2");
+
+  // This Put outside of a transaction will conflict with the previous write
+  s = db->Put(write_options, "foo", "barz");
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, "foo", &value);
+  ASSERT_EQ(value, "barz");
+  ASSERT_EQ(1, txn->GetNumKeys());
+
+  s = txn->Commit();
+  ASSERT_TRUE(s.IsBusy());  // Txn should not commit
+
+  // Verify that transaction did not write anything
+  db->Get(read_options, "foo", &value);
+  ASSERT_EQ(value, "barz");
+  db->Get(read_options, "foo2", &value);
+  ASSERT_EQ(value, "bar");
+
+  delete txn;
+}
+
+TEST_F(OptimisticTransactionTest, WriteConflictTest2) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  OptimisticTransactionOptions txn_options;
+  string value;
+  Status s;
+
+  db->Put(write_options, "foo", "bar");
+  db->Put(write_options, "foo2", "bar");
+
+  txn_options.set_snapshot = true;
+  Transaction* txn = txn_db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn);
+
+  // This Put outside of a transaction will conflict with a later write
+  s = db->Put(write_options, "foo", "barz");
+  ASSERT_OK(s);
+
+  txn->Put("foo", "bar2");  // Conflicts with write done after snapshot taken
+
+  s = db->Get(read_options, "foo", &value);
+  ASSERT_EQ(value, "barz");
+
+  s = txn->Commit();
+  ASSERT_TRUE(s.IsBusy());  // Txn should not commit
+
+  // Verify that transaction did not write anything
+  db->Get(read_options, "foo", &value);
+  ASSERT_EQ(value, "barz");
+  db->Get(read_options, "foo2", &value);
+  ASSERT_EQ(value, "bar");
+
+  delete txn;
+}
+
+TEST_F(OptimisticTransactionTest, ReadConflictTest) {
+  WriteOptions write_options;
+  ReadOptions read_options, snapshot_read_options;
+  OptimisticTransactionOptions txn_options;
+  string value;
+  Status s;
+
+  db->Put(write_options, "foo", "bar");
+  db->Put(write_options, "foo2", "bar");
+
+  txn_options.set_snapshot = true;
+  Transaction* txn = txn_db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn);
+
+  txn->SetSnapshot();
+  snapshot_read_options.snapshot = txn->GetSnapshot();
+
+  txn->GetForUpdate(snapshot_read_options, "foo", &value);
+  ASSERT_EQ(value, "bar");
+
+  // This Put outside of a transaction will conflict with the previous read
+  s = db->Put(write_options, "foo", "barz");
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, "foo", &value);
+  ASSERT_EQ(value, "barz");
+
+  s = txn->Commit();
+  ASSERT_TRUE(s.IsBusy());  // Txn should not commit
+
+  // Verify that transaction did not write anything
+  txn->GetForUpdate(read_options, "foo", &value);
+  ASSERT_EQ(value, "barz");
+  txn->GetForUpdate(read_options, "foo2", &value);
+  ASSERT_EQ(value, "bar");
+
+  delete txn;
+}
+
+TEST_F(OptimisticTransactionTest, TxnOnlyTest) {
+  // Test to make sure transactions work when there are no other writes in an
+  // empty db.
+
+  WriteOptions write_options;
+  ReadOptions read_options;
+  string value;
+  Status s;
+
+  Transaction* txn = txn_db->BeginTransaction(write_options);
+  ASSERT_TRUE(txn);
+
+  txn->Put("x", "y");
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+
+  delete txn;
+}
+
+TEST_F(OptimisticTransactionTest, FlushTest) {
+  WriteOptions write_options;
+  ReadOptions read_options, snapshot_read_options;
+  string value;
+  Status s;
+
+  db->Put(write_options, Slice("foo"), Slice("bar"));
+  db->Put(write_options, Slice("foo2"), Slice("bar"));
+
+  Transaction* txn = txn_db->BeginTransaction(write_options);
+  ASSERT_TRUE(txn);
+
+  snapshot_read_options.snapshot = txn->GetSnapshot();
+
+  txn->GetForUpdate(snapshot_read_options, "foo", &value);
+  ASSERT_EQ(value, "bar");
+
+  txn->Put(Slice("foo"), Slice("bar2"));
+
+  txn->GetForUpdate(snapshot_read_options, "foo", &value);
+  ASSERT_EQ(value, "bar2");
+
+  // Put a random key so we have a memtable to flush
+  s = db->Put(write_options, "dummy", "dummy");
+  ASSERT_OK(s);
+
+  // force a memtable flush
+  FlushOptions flush_ops;
+  db->Flush(flush_ops);
+
+  s = txn->Commit();
+  // txn should commit since the flushed table is still in MemtableList History
+  ASSERT_OK(s);
+
+  db->Get(read_options, "foo", &value);
+  ASSERT_EQ(value, "bar2");
+
+  delete txn;
+}
+
+TEST_F(OptimisticTransactionTest, FlushTest2) {
+  WriteOptions write_options;
+  ReadOptions read_options, snapshot_read_options;
+  string value;
+  Status s;
+
+  db->Put(write_options, Slice("foo"), Slice("bar"));
+  db->Put(write_options, Slice("foo2"), Slice("bar"));
+
+  Transaction* txn = txn_db->BeginTransaction(write_options);
+  ASSERT_TRUE(txn);
+
+  snapshot_read_options.snapshot = txn->GetSnapshot();
+
+  txn->GetForUpdate(snapshot_read_options, "foo", &value);
+  ASSERT_EQ(value, "bar");
+
+  txn->Put(Slice("foo"), Slice("bar2"));
+
+  txn->GetForUpdate(snapshot_read_options, "foo", &value);
+  ASSERT_EQ(value, "bar2");
+
+  // Put a random key so we have a MemTable to flush
+  s = db->Put(write_options, "dummy", "dummy");
+  ASSERT_OK(s);
+
+  // force a memtable flush
+  FlushOptions flush_ops;
+  db->Flush(flush_ops);
+
+  // Put a random key so we have a MemTable to flush
+  s = db->Put(write_options, "dummy", "dummy2");
+  ASSERT_OK(s);
+
+  // force a memtable flush
+  db->Flush(flush_ops);
+
+  s = db->Put(write_options, "dummy", "dummy3");
+  ASSERT_OK(s);
+
+  // force a memtable flush
+  // Since our test db has max_write_buffer_number=2, this flush will cause
+  // the first memtable to get purged from the MemtableList history.
+  db->Flush(flush_ops);
+
+  s = txn->Commit();
+  // txn should not commit since MemTableList History is not large enough
+  ASSERT_TRUE(s.IsTryAgain());
+
+  db->Get(read_options, "foo", &value);
+  ASSERT_EQ(value, "bar");
+
+  delete txn;
+}
+
+TEST_F(OptimisticTransactionTest, NoSnapshotTest) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  string value;
+  Status s;
+
+  db->Put(write_options, "AAA", "bar");
+
+  Transaction* txn = txn_db->BeginTransaction(write_options);
+  ASSERT_TRUE(txn);
+
+  // Modify key after transaction start
+  db->Put(write_options, "AAA", "bar1");
+
+  // Read and write without a snapshot
+  txn->GetForUpdate(read_options, "AAA", &value);
+  ASSERT_EQ(value, "bar1");
+  txn->Put("AAA", "bar2");
+
+  // Should commit since read/write was done after data changed
+  s = txn->Commit();
+  ASSERT_OK(s);
+
+  txn->GetForUpdate(read_options, "AAA", &value);
+  ASSERT_EQ(value, "bar2");
+
+  delete txn;
+}
+
+TEST_F(OptimisticTransactionTest, MultipleSnapshotTest) {
+  WriteOptions write_options;
+  ReadOptions read_options, snapshot_read_options;
+  string value;
+  Status s;
+
+  db->Put(write_options, "AAA", "bar");
+  db->Put(write_options, "BBB", "bar");
+  db->Put(write_options, "CCC", "bar");
+
+  Transaction* txn = txn_db->BeginTransaction(write_options);
+  ASSERT_TRUE(txn);
+
+  db->Put(write_options, "AAA", "bar1");
+
+  // Read and write without a snapshot
+  txn->GetForUpdate(read_options, "AAA", &value);
+  ASSERT_EQ(value, "bar1");
+  txn->Put("AAA", "bar2");
+
+  // Modify BBB before snapshot is taken
+  db->Put(write_options, "BBB", "bar1");
+
+  txn->SetSnapshot();
+  snapshot_read_options.snapshot = txn->GetSnapshot();
+
+  // Read and write with snapshot
+  txn->GetForUpdate(snapshot_read_options, "BBB", &value);
+  ASSERT_EQ(value, "bar1");
+  txn->Put("BBB", "bar2");
+
+  db->Put(write_options, "CCC", "bar1");
+
+  // Set a new snapshot
+  txn->SetSnapshot();
+  snapshot_read_options.snapshot = txn->GetSnapshot();
+
+  // Read and write with snapshot
+  txn->GetForUpdate(snapshot_read_options, "CCC", &value);
+  ASSERT_EQ(value, "bar1");
+  txn->Put("CCC", "bar2");
+
+  s = txn->GetForUpdate(read_options, "AAA", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "bar2");
+  s = txn->GetForUpdate(read_options, "BBB", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "bar2");
+  s = txn->GetForUpdate(read_options, "CCC", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "bar2");
+
+  s = db->Get(read_options, "AAA", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "bar1");
+  s = db->Get(read_options, "BBB", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "bar1");
+  s = db->Get(read_options, "CCC", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "bar1");
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, "AAA", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "bar2");
+  s = db->Get(read_options, "BBB", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "bar2");
+  s = db->Get(read_options, "CCC", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "bar2");
+
+  // verify that we track multiple writes to the same key at different snapshots
+  delete txn;
+  txn = txn_db->BeginTransaction(write_options);
+
+  // Potentially conflicting writes
+  db->Put(write_options, "ZZZ", "zzz");
+  db->Put(write_options, "XXX", "xxx");
+
+  txn->SetSnapshot();
+
+  OptimisticTransactionOptions txn_options;
+  txn_options.set_snapshot = true;
+  Transaction* txn2 = txn_db->BeginTransaction(write_options, txn_options);
+  txn2->SetSnapshot();
+
+  // This should not conflict in txn since the snapshot is later than the
+  // previous write (spoiler alert:  it will later conflict with txn2).
+  txn->Put("ZZZ", "zzzz");
+  s = txn->Commit();
+  ASSERT_OK(s);
+
+  delete txn;
+
+  // This will conflict since the snapshot is earlier than another write to ZZZ
+  txn2->Put("ZZZ", "xxxxx");
+
+  s = txn2->Commit();
+  ASSERT_TRUE(s.IsBusy());
+
+  delete txn2;
+}
+
+TEST_F(OptimisticTransactionTest, ColumnFamiliesTest) {
+  WriteOptions write_options;
+  ReadOptions read_options, snapshot_read_options;
+  OptimisticTransactionOptions txn_options;
+  string value;
+  Status s;
+
+  ColumnFamilyHandle *cfa, *cfb;
+  ColumnFamilyOptions cf_options;
+
+  // Create 2 new column families
+  s = db->CreateColumnFamily(cf_options, "CFA", &cfa);
+  ASSERT_OK(s);
+  s = db->CreateColumnFamily(cf_options, "CFB", &cfb);
+  ASSERT_OK(s);
+
+  delete cfa;
+  delete cfb;
+  delete txn_db;
+
+  // open DB with three column families
+  std::vector<ColumnFamilyDescriptor> column_families;
+  // have to open default column family
+  column_families.push_back(
+      ColumnFamilyDescriptor(kDefaultColumnFamilyName, ColumnFamilyOptions()));
+  // open the new column families
+  column_families.push_back(
+      ColumnFamilyDescriptor("CFA", ColumnFamilyOptions()));
+  column_families.push_back(
+      ColumnFamilyDescriptor("CFB", ColumnFamilyOptions()));
+  std::vector<ColumnFamilyHandle*> handles;
+  s = OptimisticTransactionDB::Open(options, dbname, column_families, &handles,
+                                    &txn_db);
+  ASSERT_OK(s);
+  db = txn_db->GetBaseDB();
+
+  Transaction* txn = txn_db->BeginTransaction(write_options);
+  ASSERT_TRUE(txn);
+
+  txn->SetSnapshot();
+  snapshot_read_options.snapshot = txn->GetSnapshot();
+
+  txn_options.set_snapshot = true;
+  Transaction* txn2 = txn_db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn2);
+
+  // Write some data to the db
+  WriteBatch batch;
+  batch.Put("foo", "foo");
+  batch.Put(handles[1], "AAA", "bar");
+  batch.Put(handles[1], "AAAZZZ", "bar");
+  s = db->Write(write_options, &batch);
+  ASSERT_OK(s);
+  db->Delete(write_options, handles[1], "AAAZZZ");
+
+  // These keys do not conflict with existing writes since they're in
+  // different column families
+  txn->Delete("AAA");
+  txn->GetForUpdate(snapshot_read_options, handles[1], "foo", &value);
+  Slice key_slice("AAAZZZ");
+  Slice value_slices[2] = {Slice("bar"), Slice("bar")};
+  txn->Put(handles[2], SliceParts(&key_slice, 1), SliceParts(value_slices, 2));
+
+  ASSERT_EQ(3, txn->GetNumKeys());
+
+  // Txn should commit
+  s = txn->Commit();
+  ASSERT_OK(s);
+  s = db->Get(read_options, "AAA", &value);
+  ASSERT_TRUE(s.IsNotFound());
+  s = db->Get(read_options, handles[2], "AAAZZZ", &value);
+  ASSERT_EQ(value, "barbar");
+
+  Slice key_slices[3] = {Slice("AAA"), Slice("ZZ"), Slice("Z")};
+  Slice value_slice("barbarbar");
+  // This write will cause a conflict with the earlier batch write
+  txn2->Put(handles[1], SliceParts(key_slices, 3), SliceParts(&value_slice, 1));
+
+  txn2->Delete(handles[2], "XXX");
+  txn2->Delete(handles[1], "XXX");
+  s = txn2->GetForUpdate(snapshot_read_options, handles[1], "AAA", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  // Verify txn did not commit
+  s = txn2->Commit();
+  ASSERT_TRUE(s.IsBusy());
+  s = db->Get(read_options, handles[1], "AAAZZZ", &value);
+  ASSERT_EQ(value, "barbar");
+
+  delete txn;
+  delete txn2;
+
+  txn = txn_db->BeginTransaction(write_options, txn_options);
+  snapshot_read_options.snapshot = txn->GetSnapshot();
+
+  txn2 = txn_db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn);
+
+  std::vector<ColumnFamilyHandle*> multiget_cfh = {handles[1], handles[2],
+                                                   handles[0], handles[2]};
+  std::vector<Slice> multiget_keys = {"AAA", "AAAZZZ", "foo", "foo"};
+  std::vector<std::string> values(4);
+
+  std::vector<Status> results = txn->MultiGetForUpdate(
+      snapshot_read_options, multiget_cfh, multiget_keys, &values);
+  ASSERT_OK(results[0]);
+  ASSERT_OK(results[1]);
+  ASSERT_OK(results[2]);
+  ASSERT_TRUE(results[3].IsNotFound());
+  ASSERT_EQ(values[0], "bar");
+  ASSERT_EQ(values[1], "barbar");
+  ASSERT_EQ(values[2], "foo");
+
+  txn->Delete(handles[2], "ZZZ");
+  txn->Put(handles[2], "ZZZ", "YYY");
+  txn->Put(handles[2], "ZZZ", "YYYY");
+  txn->Delete(handles[2], "ZZZ");
+  txn->Put(handles[2], "AAAZZZ", "barbarbar");
+
+  ASSERT_EQ(5, txn->GetNumKeys());
+
+  // Txn should commit
+  s = txn->Commit();
+  ASSERT_OK(s);
+  s = db->Get(read_options, handles[2], "ZZZ", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  // Put a key which will conflict with the next txn using the previous snapshot
+  db->Put(write_options, handles[2], "foo", "000");
+
+  results = txn2->MultiGetForUpdate(snapshot_read_options, multiget_cfh,
+                                    multiget_keys, &values);
+  ASSERT_OK(results[0]);
+  ASSERT_OK(results[1]);
+  ASSERT_OK(results[2]);
+  ASSERT_TRUE(results[3].IsNotFound());
+  ASSERT_EQ(values[0], "bar");
+  ASSERT_EQ(values[1], "barbar");
+  ASSERT_EQ(values[2], "foo");
+
+  // Verify Txn Did not Commit
+  s = txn2->Commit();
+  ASSERT_TRUE(s.IsBusy());
+
+  s = db->DropColumnFamily(handles[1]);
+  ASSERT_OK(s);
+  s = db->DropColumnFamily(handles[2]);
+  ASSERT_OK(s);
+
+  delete txn;
+  delete txn2;
+
+  for (auto handle : handles) {
+    delete handle;
+  }
+}
+
+TEST_F(OptimisticTransactionTest, EmptyTest) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  string value;
+  Status s;
+
+  s = db->Put(write_options, "aaa", "aaa");
+  ASSERT_OK(s);
+
+  Transaction* txn = txn_db->BeginTransaction(write_options);
+  s = txn->Commit();
+  ASSERT_OK(s);
+  delete txn;
+
+  txn = txn_db->BeginTransaction(write_options);
+  txn->Rollback();
+  delete txn;
+
+  txn = txn_db->BeginTransaction(write_options);
+  s = txn->GetForUpdate(read_options, "aaa", &value);
+  ASSERT_EQ(value, "aaa");
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+  delete txn;
+
+  txn = txn_db->BeginTransaction(write_options);
+  txn->SetSnapshot();
+  s = txn->GetForUpdate(read_options, "aaa", &value);
+  ASSERT_EQ(value, "aaa");
+
+  s = db->Put(write_options, "aaa", "xxx");
+  s = txn->Commit();
+  ASSERT_TRUE(s.IsBusy());
+  delete txn;
+}
+
+TEST_F(OptimisticTransactionTest, PredicateManyPreceders) {
+  WriteOptions write_options;
+  ReadOptions read_options1, read_options2;
+  OptimisticTransactionOptions txn_options;
+  string value;
+  Status s;
+
+  txn_options.set_snapshot = true;
+  Transaction* txn1 = txn_db->BeginTransaction(write_options, txn_options);
+  read_options1.snapshot = txn1->GetSnapshot();
+
+  Transaction* txn2 = txn_db->BeginTransaction(write_options);
+  txn2->SetSnapshot();
+  read_options2.snapshot = txn2->GetSnapshot();
+
+  std::vector<Slice> multiget_keys = {"1", "2", "3"};
+  std::vector<std::string> multiget_values;
+
+  std::vector<Status> results =
+      txn1->MultiGetForUpdate(read_options1, multiget_keys, &multiget_values);
+  ASSERT_TRUE(results[1].IsNotFound());
+
+  txn2->Put("2", "x");
+
+  s = txn2->Commit();
+  ASSERT_OK(s);
+
+  multiget_values.clear();
+  results =
+      txn1->MultiGetForUpdate(read_options1, multiget_keys, &multiget_values);
+  ASSERT_TRUE(results[1].IsNotFound());
+
+  // should not commit since txn2 wrote a key that txn1 has read
+  s = txn1->Commit();
+  ASSERT_TRUE(s.IsBusy());
+
+  delete txn1;
+  delete txn2;
+
+  txn1 = txn_db->BeginTransaction(write_options, txn_options);
+  read_options1.snapshot = txn1->GetSnapshot();
+
+  txn2 = txn_db->BeginTransaction(write_options, txn_options);
+  read_options2.snapshot = txn2->GetSnapshot();
+
+  txn1->Put("4", "x");
+
+  txn2->Delete("4");
+
+  // txn1 can commit since txn2's delete hasn't happened yet (it's just batched)
+  s = txn1->Commit();
+  ASSERT_OK(s);
+
+  s = txn2->GetForUpdate(read_options2, "4", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  // txn2 cannot commit since txn1 changed "4"
+  s = txn2->Commit();
+  ASSERT_TRUE(s.IsBusy());
+
+  delete txn1;
+  delete txn2;
+}
+
+TEST_F(OptimisticTransactionTest, LostUpdate) {
+  WriteOptions write_options;
+  ReadOptions read_options, read_options1, read_options2;
+  OptimisticTransactionOptions txn_options;
+  string value;
+  Status s;
+
+  // Test 2 transactions writing to the same key in multiple orders and
+  // with/without snapshots
+
+  Transaction* txn1 = txn_db->BeginTransaction(write_options);
+  Transaction* txn2 = txn_db->BeginTransaction(write_options);
+
+  txn1->Put("1", "1");
+  txn2->Put("1", "2");
+
+  s = txn1->Commit();
+  ASSERT_OK(s);
+
+  s = txn2->Commit();
+  ASSERT_TRUE(s.IsBusy());
+
+  delete txn1;
+  delete txn2;
+
+  txn_options.set_snapshot = true;
+  txn1 = txn_db->BeginTransaction(write_options, txn_options);
+  read_options1.snapshot = txn1->GetSnapshot();
+
+  txn2 = txn_db->BeginTransaction(write_options, txn_options);
+  read_options2.snapshot = txn2->GetSnapshot();
+
+  txn1->Put("1", "3");
+  txn2->Put("1", "4");
+
+  s = txn1->Commit();
+  ASSERT_OK(s);
+
+  s = txn2->Commit();
+  ASSERT_TRUE(s.IsBusy());
+
+  delete txn1;
+  delete txn2;
+
+  txn1 = txn_db->BeginTransaction(write_options, txn_options);
+  read_options1.snapshot = txn1->GetSnapshot();
+
+  txn2 = txn_db->BeginTransaction(write_options, txn_options);
+  read_options2.snapshot = txn2->GetSnapshot();
+
+  txn1->Put("1", "5");
+  s = txn1->Commit();
+  ASSERT_OK(s);
+
+  txn2->Put("1", "6");
+  s = txn2->Commit();
+  ASSERT_TRUE(s.IsBusy());
+
+  delete txn1;
+  delete txn2;
+
+  txn1 = txn_db->BeginTransaction(write_options, txn_options);
+  read_options1.snapshot = txn1->GetSnapshot();
+
+  txn2 = txn_db->BeginTransaction(write_options, txn_options);
+  read_options2.snapshot = txn2->GetSnapshot();
+
+  txn1->Put("1", "5");
+  s = txn1->Commit();
+  ASSERT_OK(s);
+
+  txn2->SetSnapshot();
+  txn2->Put("1", "6");
+  s = txn2->Commit();
+  ASSERT_OK(s);
+
+  delete txn1;
+  delete txn2;
+
+  txn1 = txn_db->BeginTransaction(write_options);
+  txn2 = txn_db->BeginTransaction(write_options);
+
+  txn1->Put("1", "7");
+  s = txn1->Commit();
+  ASSERT_OK(s);
+
+  txn2->Put("1", "8");
+  s = txn2->Commit();
+  ASSERT_OK(s);
+
+  delete txn1;
+  delete txn2;
+
+  s = db->Get(read_options, "1", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "8");
+}
+
+TEST_F(OptimisticTransactionTest, UntrackedWrites) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  string value;
+  Status s;
+
+  // Verify transaction rollback works for untracked keys.
+  Transaction* txn = txn_db->BeginTransaction(write_options);
+  txn->PutUntracked("untracked", "0");
+  txn->Rollback();
+  s = db->Get(read_options, "untracked", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  delete txn;
+  txn = txn_db->BeginTransaction(write_options);
+
+  txn->Put("tracked", "1");
+  txn->PutUntracked("untracked", "1");
+  txn->MergeUntracked("untracked", "2");
+  txn->DeleteUntracked("untracked");
+
+  // Write to the untracked key outside of the transaction and verify
+  // it doesn't prevent the transaction from committing.
+  s = db->Put(write_options, "untracked", "x");
+  ASSERT_OK(s);
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, "untracked", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  delete txn;
+  txn = txn_db->BeginTransaction(write_options);
+
+  txn->Put("tracked", "10");
+  txn->PutUntracked("untracked", "A");
+
+  // Write to tracked key outside of the transaction and verify that the
+  // untracked keys are not written when the commit fails.
+  s = db->Delete(write_options, "tracked");
+
+  s = txn->Commit();
+  ASSERT_TRUE(s.IsBusy());
+
+  s = db->Get(read_options, "untracked", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  delete txn;
+}
+
+TEST_F(OptimisticTransactionTest, IteratorTest) {
+  WriteOptions write_options;
+  ReadOptions read_options, snapshot_read_options;
+  OptimisticTransactionOptions txn_options;
+  string value;
+  Status s;
+
+  // Write some keys to the db
+  s = db->Put(write_options, "A", "a");
+  ASSERT_OK(s);
+
+  s = db->Put(write_options, "G", "g");
+  ASSERT_OK(s);
+
+  s = db->Put(write_options, "F", "f");
+  ASSERT_OK(s);
+
+  s = db->Put(write_options, "C", "c");
+  ASSERT_OK(s);
+
+  s = db->Put(write_options, "D", "d");
+  ASSERT_OK(s);
+
+  Transaction* txn = txn_db->BeginTransaction(write_options);
+  ASSERT_TRUE(txn);
+
+  // Write some keys in a txn
+  s = txn->Put("B", "b");
+  ASSERT_OK(s);
+
+  s = txn->Put("H", "h");
+  ASSERT_OK(s);
+
+  s = txn->Delete("D");
+  ASSERT_OK(s);
+
+  s = txn->Put("E", "e");
+  ASSERT_OK(s);
+
+  txn->SetSnapshot();
+  const Snapshot* snapshot = txn->GetSnapshot();
+
+  // Write some keys to the db after the snapshot
+  s = db->Put(write_options, "BB", "xx");
+  ASSERT_OK(s);
+
+  s = db->Put(write_options, "C", "xx");
+  ASSERT_OK(s);
+
+  read_options.snapshot = snapshot;
+  Iterator* iter = txn->GetIterator(read_options);
+  ASSERT_OK(iter->status());
+  iter->SeekToFirst();
+
+  // Read all keys via iter and lock them all
+  std::string results[] = {"a", "b", "c", "e", "f", "g", "h"};
+  for (int i = 0; i < 7; i++) {
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(results[i], iter->value().ToString());
+
+    s = txn->GetForUpdate(read_options, iter->key(), nullptr);
+    ASSERT_OK(s);
+
+    iter->Next();
+  }
+  ASSERT_FALSE(iter->Valid());
+
+  iter->Seek("G");
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("g", iter->value().ToString());
+
+  iter->Prev();
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("f", iter->value().ToString());
+
+  iter->Seek("D");
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("e", iter->value().ToString());
+
+  iter->Seek("C");
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("c", iter->value().ToString());
+
+  iter->Next();
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("e", iter->value().ToString());
+
+  iter->Seek("");
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("a", iter->value().ToString());
+
+  iter->Seek("X");
+  ASSERT_OK(iter->status());
+  ASSERT_FALSE(iter->Valid());
+
+  iter->SeekToLast();
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("h", iter->value().ToString());
+
+  // key "C" was modified in the db after txn's snapshot.  txn will not commit.
+  s = txn->Commit();
+  ASSERT_TRUE(s.IsBusy());
+
+  delete iter;
+  delete txn;
+}
+
+TEST_F(OptimisticTransactionTest, SavepointTest) {
+  WriteOptions write_options;
+  ReadOptions read_options, snapshot_read_options;
+  OptimisticTransactionOptions txn_options;
+  string value;
+  Status s;
+
+  Transaction* txn = txn_db->BeginTransaction(write_options);
+  ASSERT_TRUE(txn);
+
+  s = txn->RollbackToSavePoint();
+  ASSERT_TRUE(s.IsNotFound());
+
+  txn->SetSavePoint();  // 1
+
+  ASSERT_OK(txn->RollbackToSavePoint());  // Rollback to beginning of txn
+  s = txn->RollbackToSavePoint();
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = txn->Put("B", "b");
+  ASSERT_OK(s);
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, "B", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("b", value);
+
+  delete txn;
+  txn = txn_db->BeginTransaction(write_options);
+  ASSERT_TRUE(txn);
+
+  s = txn->Put("A", "a");
+  ASSERT_OK(s);
+
+  s = txn->Put("B", "bb");
+  ASSERT_OK(s);
+
+  s = txn->Put("C", "c");
+  ASSERT_OK(s);
+
+  txn->SetSavePoint();  // 2
+
+  s = txn->Delete("B");
+  ASSERT_OK(s);
+
+  s = txn->Put("C", "cc");
+  ASSERT_OK(s);
+
+  s = txn->Put("D", "d");
+  ASSERT_OK(s);
+
+  ASSERT_OK(txn->RollbackToSavePoint());  // Rollback to 2
+
+  s = txn->Get(read_options, "A", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("a", value);
+
+  s = txn->Get(read_options, "B", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("bb", value);
+
+  s = txn->Get(read_options, "C", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("c", value);
+
+  s = txn->Get(read_options, "D", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = txn->Put("A", "a");
+  ASSERT_OK(s);
+
+  s = txn->Put("E", "e");
+  ASSERT_OK(s);
+
+  // Rollback to beginning of txn
+  s = txn->RollbackToSavePoint();
+  ASSERT_TRUE(s.IsNotFound());
+  txn->Rollback();
+
+  s = txn->Get(read_options, "A", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = txn->Get(read_options, "B", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("b", value);
+
+  s = txn->Get(read_options, "D", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = txn->Get(read_options, "D", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = txn->Get(read_options, "E", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = txn->Put("A", "aa");
+  ASSERT_OK(s);
+
+  s = txn->Put("F", "f");
+  ASSERT_OK(s);
+
+  txn->SetSavePoint();  // 3
+  txn->SetSavePoint();  // 4
+
+  s = txn->Put("G", "g");
+  ASSERT_OK(s);
+
+  s = txn->Delete("F");
+  ASSERT_OK(s);
+
+  s = txn->Delete("B");
+  ASSERT_OK(s);
+
+  s = txn->Get(read_options, "A", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("aa", value);
+
+  s = txn->Get(read_options, "F", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = txn->Get(read_options, "B", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  ASSERT_OK(txn->RollbackToSavePoint());  // Rollback to 3
+
+  s = txn->Get(read_options, "F", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("f", value);
+
+  s = txn->Get(read_options, "G", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, "F", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("f", value);
+
+  s = db->Get(read_options, "G", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = db->Get(read_options, "A", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("aa", value);
+
+  s = db->Get(read_options, "B", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("b", value);
+
+  s = db->Get(read_options, "C", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = db->Get(read_options, "D", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = db->Get(read_options, "E", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  delete txn;
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int argc, char** argv) {
+  fprintf(
+      stderr,
+      "SKIPPED as optimistic_transaction is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/transaction_base.cc b/src/rocksdb/utilities/transactions/transaction_base.cc
new file mode 100644
index 0000000..dc91677
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/transaction_base.cc
@@ -0,0 +1,385 @@
+// Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/transactions/transaction_base.h"
+
+#include "db/column_family.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/status.h"
+#include "util/string_util.h"
+
+namespace rocksdb {
+
+TransactionBaseImpl::TransactionBaseImpl(DB* db,
+                                         const WriteOptions& write_options)
+    : db_(db),
+      write_options_(write_options),
+      cmp_(GetColumnFamilyUserComparator(db->DefaultColumnFamily())),
+      write_batch_(new WriteBatchWithIndex(cmp_, 0, true)),
+      start_time_(db_->GetEnv()->NowMicros()) {}
+
+TransactionBaseImpl::~TransactionBaseImpl() {}
+
+void TransactionBaseImpl::Clear() {
+  save_points_.reset(nullptr);
+  write_batch_->Clear();
+  tracked_keys_.clear();
+  num_puts_ = 0;
+  num_deletes_ = 0;
+  num_merges_ = 0;
+}
+
+void TransactionBaseImpl::SetSnapshot() {
+  snapshot_.reset(new ManagedSnapshot(db_));
+}
+
+Status TransactionBaseImpl::TryLock(ColumnFamilyHandle* column_family,
+                                    const SliceParts& key, bool untracked) {
+  size_t key_size = 0;
+  for (int i = 0; i < key.num_parts; ++i) {
+    key_size += key.parts[i].size();
+  }
+
+  std::string str;
+  str.reserve(key_size);
+
+  for (int i = 0; i < key.num_parts; ++i) {
+    str.append(key.parts[i].data(), key.parts[i].size());
+  }
+
+  return TryLock(column_family, str, untracked);
+}
+
+void TransactionBaseImpl::SetSavePoint() {
+  if (save_points_ == nullptr) {
+    save_points_.reset(new std::stack<TransactionBaseImpl::SavePoint>());
+  }
+  save_points_->emplace(snapshot_, num_puts_, num_deletes_, num_merges_);
+  write_batch_->SetSavePoint();
+}
+
+Status TransactionBaseImpl::RollbackToSavePoint() {
+  if (save_points_ != nullptr && save_points_->size() > 0) {
+    // Restore saved SavePoint
+    TransactionBaseImpl::SavePoint& save_point = save_points_->top();
+    snapshot_ = save_point.snapshot_;
+    num_puts_ = save_point.num_puts_;
+    num_deletes_ = save_point.num_deletes_;
+    num_merges_ = save_point.num_merges_;
+
+    // Rollback batch
+    Status s = write_batch_->RollbackToSavePoint();
+    assert(s.ok());
+
+    // Rollback any keys that were tracked since the last savepoint
+    const TransactionKeyMap* key_map = GetTrackedKeysSinceSavePoint();
+    assert(key_map);
+    for (auto& key_map_iter : *key_map) {
+      uint32_t column_family_id = key_map_iter.first;
+      auto& keys = key_map_iter.second;
+
+      for (auto& key_iter : keys) {
+        const std::string& key = key_iter.first;
+        tracked_keys_[column_family_id].erase(key);
+      }
+    }
+
+    save_points_->pop();
+
+    return s;
+  } else {
+    assert(write_batch_->RollbackToSavePoint().IsNotFound());
+    return Status::NotFound();
+  }
+}
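+
+// A minimal comment-only sketch of the savepoint flow implemented above,
+// assuming a Transaction* txn obtained from BeginTransaction():
+//
+//   txn->Put("A", "a");
+//   txn->SetSavePoint();
+//   txn->Put("B", "b");
+//   txn->RollbackToSavePoint();  // undoes "B"; "A" is still pending
+//   txn->Commit();               // writes only "A"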
+
+Status TransactionBaseImpl::Get(const ReadOptions& read_options,
+                                ColumnFamilyHandle* column_family,
+                                const Slice& key, std::string* value) {
+  return write_batch_->GetFromBatchAndDB(db_, read_options, column_family, key,
+                                         value);
+}
+
+Status TransactionBaseImpl::GetForUpdate(const ReadOptions& read_options,
+                                         ColumnFamilyHandle* column_family,
+                                         const Slice& key, std::string* value) {
+  Status s = TryLock(column_family, key);
+
+  if (s.ok() && value != nullptr) {
+    s = Get(read_options, column_family, key, value);
+  }
+  return s;
+}
+
+std::vector<Status> TransactionBaseImpl::MultiGet(
+    const ReadOptions& read_options,
+    const std::vector<ColumnFamilyHandle*>& column_family,
+    const std::vector<Slice>& keys, std::vector<std::string>* values) {
+  size_t num_keys = keys.size();
+  values->resize(num_keys);
+
+  std::vector<Status> stat_list(num_keys);
+  for (size_t i = 0; i < num_keys; ++i) {
+    std::string* value = values ? &(*values)[i] : nullptr;
+    stat_list[i] = Get(read_options, column_family[i], keys[i], value);
+  }
+
+  return stat_list;
+}
+
+std::vector<Status> TransactionBaseImpl::MultiGetForUpdate(
+    const ReadOptions& read_options,
+    const std::vector<ColumnFamilyHandle*>& column_family,
+    const std::vector<Slice>& keys, std::vector<std::string>* values) {
+  // Regardless of whether the MultiGet succeeded, track these keys.
+  size_t num_keys = keys.size();
+  values->resize(num_keys);
+
+  // Lock all keys
+  for (size_t i = 0; i < num_keys; ++i) {
+    Status s = TryLock(column_family[i], keys[i]);
+    if (!s.ok()) {
+      // Fail entire multiget if we cannot lock all keys
+      return std::vector<Status>(num_keys, s);
+    }
+  }
+
+  // TODO(agiardullo): optimize multiget?
+  std::vector<Status> stat_list(num_keys);
+  for (size_t i = 0; i < num_keys; ++i) {
+    std::string* value = values ? &(*values)[i] : nullptr;
+    stat_list[i] = Get(read_options, column_family[i], keys[i], value);
+  }
+
+  return stat_list;
+}
+
+Iterator* TransactionBaseImpl::GetIterator(const ReadOptions& read_options) {
+  Iterator* db_iter = db_->NewIterator(read_options);
+  assert(db_iter);
+
+  return write_batch_->NewIteratorWithBase(db_iter);
+}
+
+Iterator* TransactionBaseImpl::GetIterator(const ReadOptions& read_options,
+                                           ColumnFamilyHandle* column_family) {
+  Iterator* db_iter = db_->NewIterator(read_options, column_family);
+  assert(db_iter);
+
+  return write_batch_->NewIteratorWithBase(column_family, db_iter);
+}
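+
+// Note: the iterators returned above merge this transaction's pending batch
+// writes with the base DB via WriteBatchWithIndex, which is what the
+// iterator test in optimistic_transaction_test.cc exercises.  A comment-only
+// sketch:
+//
+//   Iterator* iter = txn->GetIterator(ReadOptions());
+//   for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+//     // Sees both committed keys and this transaction's uncommitted writes.
+//   }
+//   delete iter;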
+
+Status TransactionBaseImpl::Put(ColumnFamilyHandle* column_family,
+                                const Slice& key, const Slice& value) {
+  Status s = TryLock(column_family, key);
+
+  if (s.ok()) {
+    write_batch_->Put(column_family, key, value);
+    num_puts_++;
+  }
+
+  return s;
+}
+
+Status TransactionBaseImpl::Put(ColumnFamilyHandle* column_family,
+                                const SliceParts& key,
+                                const SliceParts& value) {
+  Status s = TryLock(column_family, key);
+
+  if (s.ok()) {
+    write_batch_->Put(column_family, key, value);
+    num_puts_++;
+  }
+
+  return s;
+}
+
+Status TransactionBaseImpl::Merge(ColumnFamilyHandle* column_family,
+                                  const Slice& key, const Slice& value) {
+  Status s = TryLock(column_family, key);
+
+  if (s.ok()) {
+    write_batch_->Merge(column_family, key, value);
+    num_merges_++;
+  }
+
+  return s;
+}
+
+Status TransactionBaseImpl::Delete(ColumnFamilyHandle* column_family,
+                                   const Slice& key) {
+  Status s = TryLock(column_family, key);
+
+  if (s.ok()) {
+    write_batch_->Delete(column_family, key);
+    num_deletes_++;
+  }
+
+  return s;
+}
+
+Status TransactionBaseImpl::Delete(ColumnFamilyHandle* column_family,
+                                   const SliceParts& key) {
+  Status s = TryLock(column_family, key);
+
+  if (s.ok()) {
+    write_batch_->Delete(column_family, key);
+    num_deletes_++;
+  }
+
+  return s;
+}
+
+Status TransactionBaseImpl::SingleDelete(ColumnFamilyHandle* column_family,
+                                         const Slice& key) {
+  Status s = TryLock(column_family, key);
+
+  if (s.ok()) {
+    write_batch_->SingleDelete(column_family, key);
+    num_deletes_++;
+  }
+
+  return s;
+}
+
+Status TransactionBaseImpl::SingleDelete(ColumnFamilyHandle* column_family,
+                                         const SliceParts& key) {
+  Status s = TryLock(column_family, key);
+
+  if (s.ok()) {
+    write_batch_->SingleDelete(column_family, key);
+    num_deletes_++;
+  }
+
+  return s;
+}
+
+Status TransactionBaseImpl::PutUntracked(ColumnFamilyHandle* column_family,
+                                         const Slice& key, const Slice& value) {
+  bool untracked = true;
+  Status s = TryLock(column_family, key, untracked);
+
+  if (s.ok()) {
+    write_batch_->Put(column_family, key, value);
+    num_puts_++;
+  }
+
+  return s;
+}
+
+Status TransactionBaseImpl::PutUntracked(ColumnFamilyHandle* column_family,
+                                         const SliceParts& key,
+                                         const SliceParts& value) {
+  bool untracked = true;
+  Status s = TryLock(column_family, key, untracked);
+
+  if (s.ok()) {
+    write_batch_->Put(column_family, key, value);
+    num_puts_++;
+  }
+
+  return s;
+}
+
+Status TransactionBaseImpl::MergeUntracked(ColumnFamilyHandle* column_family,
+                                           const Slice& key,
+                                           const Slice& value) {
+  bool untracked = true;
+  Status s = TryLock(column_family, key, untracked);
+
+  if (s.ok()) {
+    write_batch_->Merge(column_family, key, value);
+    num_merges_++;
+  }
+
+  return s;
+}
+
+Status TransactionBaseImpl::DeleteUntracked(ColumnFamilyHandle* column_family,
+                                            const Slice& key) {
+  bool untracked = true;
+  Status s = TryLock(column_family, key, untracked);
+
+  if (s.ok()) {
+    write_batch_->Delete(column_family, key);
+    num_deletes_++;
+  }
+
+  return s;
+}
+
+Status TransactionBaseImpl::DeleteUntracked(ColumnFamilyHandle* column_family,
+                                            const SliceParts& key) {
+  bool untracked = true;
+  Status s = TryLock(column_family, key, untracked);
+
+  if (s.ok()) {
+    write_batch_->Delete(column_family, key);
+    num_deletes_++;
+  }
+
+  return s;
+}
+
+void TransactionBaseImpl::PutLogData(const Slice& blob) {
+  write_batch_->PutLogData(blob);
+}
+
+WriteBatchWithIndex* TransactionBaseImpl::GetWriteBatch() {
+  return write_batch_.get();
+}
+
+uint64_t TransactionBaseImpl::GetElapsedTime() const {
+  return (db_->GetEnv()->NowMicros() - start_time_) / 1000;
+}
+
+uint64_t TransactionBaseImpl::GetNumPuts() const { return num_puts_; }
+
+uint64_t TransactionBaseImpl::GetNumDeletes() const { return num_deletes_; }
+
+uint64_t TransactionBaseImpl::GetNumMerges() const { return num_merges_; }
+
+uint64_t TransactionBaseImpl::GetNumKeys() const {
+  uint64_t count = 0;
+
+  // sum up locked keys in all column families
+  for (const auto& key_map_iter : tracked_keys_) {
+    const auto& keys = key_map_iter.second;
+    count += keys.size();
+  }
+
+  return count;
+}
+
+void TransactionBaseImpl::TrackKey(uint32_t cfh_id, const std::string& key,
+                                   SequenceNumber seq) {
+  auto iter = tracked_keys_[cfh_id].find(key);
+  if (iter == tracked_keys_[cfh_id].end()) {
+    tracked_keys_[cfh_id].insert({key, seq});
+
+    if (save_points_ != nullptr && !save_points_->empty()) {
+      // This key wasn't tracked before, so record it in the current savepoint.
+      save_points_->top().new_keys_[cfh_id][key] = seq;
+    }
+  } else if (seq < iter->second) {
+    // Now tracking this key with an earlier sequence number
+    iter->second = seq;
+  }
+}
+
+const TransactionKeyMap* TransactionBaseImpl::GetTrackedKeysSinceSavePoint() {
+  if (save_points_ != nullptr && !save_points_->empty()) {
+    return &save_points_->top().new_keys_;
+  }
+
+  return nullptr;
+}
+
+}  // namespace rocksdb
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/transaction_base.h b/src/rocksdb/utilities/transactions/transaction_base.h
new file mode 100644
index 0000000..54ea567
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/transaction_base.h
@@ -0,0 +1,250 @@
+// Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <stack>
+#include <string>
+#include <vector>
+
+#include "rocksdb/db.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/snapshot.h"
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+#include "rocksdb/utilities/transaction.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "utilities/transactions/transaction_util.h"
+
+namespace rocksdb {
+
+class TransactionBaseImpl : public Transaction {
+ public:
+  TransactionBaseImpl(DB* db, const WriteOptions& write_options);
+
+  virtual ~TransactionBaseImpl();
+
+  // Remove pending operations queued in this transaction.
+  virtual void Clear();
+
+  // Called before executing Put, Merge, Delete, and GetForUpdate.  If TryLock
+  // returns non-OK, the Put/Merge/Delete/GetForUpdate will fail.
+  // untracked will be true if called from PutUntracked, DeleteUntracked, or
+  // MergeUntracked.
+  virtual Status TryLock(ColumnFamilyHandle* column_family, const Slice& key,
+                         bool untracked = false) = 0;
+
+  void SetSavePoint() override;
+
+  Status RollbackToSavePoint() override;
+
+  Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family,
+             const Slice& key, std::string* value) override;
+
+  Status Get(const ReadOptions& options, const Slice& key,
+             std::string* value) override {
+    return Get(options, db_->DefaultColumnFamily(), key, value);
+  }
+
+  Status GetForUpdate(const ReadOptions& options,
+                      ColumnFamilyHandle* column_family, const Slice& key,
+                      std::string* value) override;
+
+  Status GetForUpdate(const ReadOptions& options, const Slice& key,
+                      std::string* value) override {
+    return GetForUpdate(options, db_->DefaultColumnFamily(), key, value);
+  }
+
+  std::vector<Status> MultiGet(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_family,
+      const std::vector<Slice>& keys,
+      std::vector<std::string>* values) override;
+
+  std::vector<Status> MultiGet(const ReadOptions& options,
+                               const std::vector<Slice>& keys,
+                               std::vector<std::string>* values) override {
+    return MultiGet(options, std::vector<ColumnFamilyHandle*>(
+                                 keys.size(), db_->DefaultColumnFamily()),
+                    keys, values);
+  }
+
+  std::vector<Status> MultiGetForUpdate(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_family,
+      const std::vector<Slice>& keys,
+      std::vector<std::string>* values) override;
+
+  std::vector<Status> MultiGetForUpdate(
+      const ReadOptions& options, const std::vector<Slice>& keys,
+      std::vector<std::string>* values) override {
+    return MultiGetForUpdate(options,
+                             std::vector<ColumnFamilyHandle*>(
+                                 keys.size(), db_->DefaultColumnFamily()),
+                             keys, values);
+  }
+
+  Iterator* GetIterator(const ReadOptions& read_options) override;
+  Iterator* GetIterator(const ReadOptions& read_options,
+                        ColumnFamilyHandle* column_family) override;
+
+  Status Put(ColumnFamilyHandle* column_family, const Slice& key,
+             const Slice& value) override;
+  Status Put(const Slice& key, const Slice& value) override {
+    return Put(nullptr, key, value);
+  }
+
+  Status Put(ColumnFamilyHandle* column_family, const SliceParts& key,
+             const SliceParts& value) override;
+  Status Put(const SliceParts& key, const SliceParts& value) override {
+    return Put(nullptr, key, value);
+  }
+
+  Status Merge(ColumnFamilyHandle* column_family, const Slice& key,
+               const Slice& value) override;
+  Status Merge(const Slice& key, const Slice& value) override {
+    return Merge(nullptr, key, value);
+  }
+
+  Status Delete(ColumnFamilyHandle* column_family, const Slice& key) override;
+  Status Delete(const Slice& key) override { return Delete(nullptr, key); }
+  Status Delete(ColumnFamilyHandle* column_family,
+                const SliceParts& key) override;
+  Status Delete(const SliceParts& key) override { return Delete(nullptr, key); }
+
+  Status SingleDelete(ColumnFamilyHandle* column_family,
+                      const Slice& key) override;
+  Status SingleDelete(const Slice& key) override {
+    return SingleDelete(nullptr, key);
+  }
+  Status SingleDelete(ColumnFamilyHandle* column_family,
+                      const SliceParts& key) override;
+  Status SingleDelete(const SliceParts& key) override {
+    return SingleDelete(nullptr, key);
+  }
+
+  Status PutUntracked(ColumnFamilyHandle* column_family, const Slice& key,
+                      const Slice& value) override;
+  Status PutUntracked(const Slice& key, const Slice& value) override {
+    return PutUntracked(nullptr, key, value);
+  }
+
+  Status PutUntracked(ColumnFamilyHandle* column_family, const SliceParts& key,
+                      const SliceParts& value) override;
+  Status PutUntracked(const SliceParts& key, const SliceParts& value) override {
+    return PutUntracked(nullptr, key, value);
+  }
+
+  Status MergeUntracked(ColumnFamilyHandle* column_family, const Slice& key,
+                        const Slice& value) override;
+  Status MergeUntracked(const Slice& key, const Slice& value) override {
+    return MergeUntracked(nullptr, key, value);
+  }
+
+  Status DeleteUntracked(ColumnFamilyHandle* column_family,
+                         const Slice& key) override;
+  Status DeleteUntracked(const Slice& key) override {
+    return DeleteUntracked(nullptr, key);
+  }
+  Status DeleteUntracked(ColumnFamilyHandle* column_family,
+                         const SliceParts& key) override;
+  Status DeleteUntracked(const SliceParts& key) override {
+    return DeleteUntracked(nullptr, key);
+  }
+
+  void PutLogData(const Slice& blob) override;
+
+  WriteBatchWithIndex* GetWriteBatch() override;
+
+  virtual void SetLockTimeout(int64_t timeout) override { /* Do nothing */ }
+
+  const Snapshot* GetSnapshot() const override {
+    return snapshot_ ? snapshot_->snapshot() : nullptr;
+  }
+
+  void SetSnapshot() override;
+
+  uint64_t GetElapsedTime() const override;
+
+  uint64_t GetNumPuts() const override;
+
+  uint64_t GetNumDeletes() const override;
+
+  uint64_t GetNumMerges() const override;
+
+  uint64_t GetNumKeys() const override;
+
+  // Get list of keys in this transaction that must not have any conflicts
+  // with writes in other transactions.
+  const TransactionKeyMap& GetTrackedKeys() const { return tracked_keys_; }
+
+ protected:
+  // Add a key to the list of tracked keys.
+  // seqno is the earliest sequence number at which this key was involved
+  // in this transaction.
+  void TrackKey(uint32_t cfh_id, const std::string& key, SequenceNumber seqno);
+
+  const TransactionKeyMap* GetTrackedKeysSinceSavePoint();
+
+  DB* const db_;
+
+  const WriteOptions write_options_;
+
+  const Comparator* cmp_;
+
+  // Records writes pending in this transaction
+  std::unique_ptr<WriteBatchWithIndex> write_batch_;
+
+  // Stores the time the txn was constructed, in microseconds.
+  const uint64_t start_time_;
+
+  // Stores the current snapshot that was set by SetSnapshot, or null if
+  // no snapshot is currently set.
+  std::shared_ptr<ManagedSnapshot> snapshot_;
+
+  // Count of various operations pending in this transaction
+  uint64_t num_puts_ = 0;
+  uint64_t num_deletes_ = 0;
+  uint64_t num_merges_ = 0;
+
+  struct SavePoint {
+    std::shared_ptr<ManagedSnapshot> snapshot_;
+    uint64_t num_puts_;
+    uint64_t num_deletes_;
+    uint64_t num_merges_;
+
+    // Record all keys tracked since the last savepoint
+    TransactionKeyMap new_keys_;
+
+    SavePoint(std::shared_ptr<ManagedSnapshot> snapshot, uint64_t num_puts,
+              uint64_t num_deletes, uint64_t num_merges)
+        : snapshot_(snapshot),
+          num_puts_(num_puts),
+          num_deletes_(num_deletes),
+          num_merges_(num_merges) {}
+  };
+
+ private:
+  // Stack of the Snapshot saved at each save point.  Saved snapshots may be
+  // nullptr if there was no snapshot at the time SetSavePoint() was called.
+  std::unique_ptr<std::stack<TransactionBaseImpl::SavePoint>> save_points_;
+
+  // Map from column_family_id to map of keys that are involved in this
+  // transaction.
+  // Pessimistic Transactions will do conflict checking before adding a key
+  // by calling TrackKey().
+  // Optimistic Transactions will wait till commit time to do conflict checking.
+  TransactionKeyMap tracked_keys_;
+
+  Status TryLock(ColumnFamilyHandle* column_family, const SliceParts& key,
+                 bool untracked = false);
+};
+
+}  // namespace rocksdb
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/transaction_db_impl.cc b/src/rocksdb/utilities/transactions/transaction_db_impl.cc
new file mode 100644
index 0000000..edf15e3
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/transaction_db_impl.cc
@@ -0,0 +1,260 @@
+//  Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/transactions/transaction_db_impl.h"
+
+#include <string>
+#include <vector>
+
+#include "db/db_impl.h"
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "utilities/transactions/transaction_db_mutex_impl.h"
+#include "utilities/transactions/transaction_impl.h"
+
+namespace rocksdb {
+
+TransactionDBImpl::TransactionDBImpl(DB* db,
+                                     const TransactionDBOptions& txn_db_options)
+    : TransactionDB(db),
+      txn_db_options_(txn_db_options),
+      lock_mgr_(txn_db_options_.num_stripes, txn_db_options.max_num_locks,
+                txn_db_options_.custom_mutex_factory
+                    ? txn_db_options_.custom_mutex_factory
+                    : std::shared_ptr<TransactionDBMutexFactory>(
+                          new TransactionDBMutexFactoryImpl())) {}
+
+Transaction* TransactionDBImpl::BeginTransaction(
+    const WriteOptions& write_options, const TransactionOptions& txn_options) {
+  Transaction* txn = new TransactionImpl(this, write_options, txn_options);
+
+  return txn;
+}
+
+TransactionDBOptions TransactionDBImpl::ValidateTxnDBOptions(
+    const TransactionDBOptions& txn_db_options) {
+  TransactionDBOptions validated = txn_db_options;
+
+  if (txn_db_options.num_stripes == 0) {
+    validated.num_stripes = 1;
+  }
+
+  return validated;
+}
+
+Status TransactionDB::Open(const Options& options,
+                           const TransactionDBOptions& txn_db_options,
+                           const std::string& dbname, TransactionDB** dbptr) {
+  DBOptions db_options(options);
+  ColumnFamilyOptions cf_options(options);
+  std::vector<ColumnFamilyDescriptor> column_families;
+  column_families.push_back(
+      ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
+  std::vector<ColumnFamilyHandle*> handles;
+  Status s = TransactionDB::Open(db_options, txn_db_options, dbname,
+                                 column_families, &handles, dbptr);
+  if (s.ok()) {
+    assert(handles.size() == 1);
+    // We can delete the handle since DBImpl always holds a reference to
+    // the default column family.
+    delete handles[0];
+  }
+
+  return s;
+}
+
+Status TransactionDB::Open(
+    const DBOptions& db_options, const TransactionDBOptions& txn_db_options,
+    const std::string& dbname,
+    const std::vector<ColumnFamilyDescriptor>& column_families,
+    std::vector<ColumnFamilyHandle*>* handles, TransactionDB** dbptr) {
+  Status s;
+  DB* db;
+
+  std::vector<ColumnFamilyDescriptor> column_families_copy = column_families;
+
+  // Enable MemTable History if not already enabled
+  for (auto& column_family : column_families_copy) {
+    ColumnFamilyOptions* options = &column_family.options;
+
+    if (options->max_write_buffer_number_to_maintain == 0) {
+      // Setting to -1 will set the History size to max_write_buffer_number.
+      options->max_write_buffer_number_to_maintain = -1;
+    }
+  }
+
+  s = DB::Open(db_options, dbname, column_families_copy, handles, &db);
+
+  if (s.ok()) {
+    TransactionDBImpl* txn_db = new TransactionDBImpl(
+        db, TransactionDBImpl::ValidateTxnDBOptions(txn_db_options));
+
+    for (auto cf_ptr : *handles) {
+      txn_db->AddColumnFamily(cf_ptr);
+    }
+
+    *dbptr = txn_db;
+  }
+
+  return s;
+}
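+
+// A minimal comment-only sketch of opening and using a TransactionDB with
+// the API above (the path "/tmp/txn_db" is illustrative):
+//
+//   Options options;
+//   options.create_if_missing = true;
+//   TransactionDB* txn_db;
+//   Status s = TransactionDB::Open(options, TransactionDBOptions(),
+//                                  "/tmp/txn_db", &txn_db);
+//   Transaction* txn =
+//       txn_db->BeginTransaction(WriteOptions(), TransactionOptions());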
+
+// Let TransactionLockMgr know that this column family exists so it can
+// allocate a LockMap for it.
+void TransactionDBImpl::AddColumnFamily(const ColumnFamilyHandle* handle) {
+  lock_mgr_.AddColumnFamily(handle->GetID());
+}
+
+Status TransactionDBImpl::CreateColumnFamily(
+    const ColumnFamilyOptions& options, const std::string& column_family_name,
+    ColumnFamilyHandle** handle) {
+  InstrumentedMutexLock l(&column_family_mutex_);
+
+  Status s = db_->CreateColumnFamily(options, column_family_name, handle);
+  if (s.ok()) {
+    lock_mgr_.AddColumnFamily((*handle)->GetID());
+  }
+
+  return s;
+}
+
+// Let TransactionLockMgr know that it can deallocate the LockMap for this
+// column family.
+Status TransactionDBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) {
+  InstrumentedMutexLock l(&column_family_mutex_);
+
+  Status s = db_->DropColumnFamily(column_family);
+  if (s.ok()) {
+    lock_mgr_.RemoveColumnFamily(column_family->GetID());
+  }
+
+  return s;
+}
+
+Status TransactionDBImpl::TryLock(TransactionImpl* txn, uint32_t cfh_id,
+                                  const std::string& key) {
+  return lock_mgr_.TryLock(txn, cfh_id, key, GetEnv());
+}
+
+void TransactionDBImpl::UnLock(TransactionImpl* txn,
+                               const TransactionKeyMap* keys) {
+  lock_mgr_.UnLock(txn, keys, GetEnv());
+}
+
+void TransactionDBImpl::UnLock(TransactionImpl* txn, uint32_t cfh_id,
+                               const std::string& key) {
+  lock_mgr_.UnLock(txn, cfh_id, key, GetEnv());
+}
+
+// Used when wrapping DB write operations in a transaction
+Transaction* TransactionDBImpl::BeginInternalTransaction(
+    const WriteOptions& options) {
+  TransactionOptions txn_options;
+  Transaction* txn = BeginTransaction(options, txn_options);
+
+  assert(dynamic_cast<TransactionImpl*>(txn) != nullptr);
+  auto txn_impl = reinterpret_cast<TransactionImpl*>(txn);
+
+  // Use default timeout for non-transactional writes
+  txn_impl->SetLockTimeout(txn_db_options_.default_lock_timeout);
+
+  return txn;
+}
+
+// All user Put, Merge, Delete, and Write requests must be intercepted to make
+// sure that they lock all keys that they are writing to avoid causing conflicts
+// with any concurrent transactions. The easiest way to do this is to wrap all
+// write operations in a transaction.
+//
+// Put(), Merge(), and Delete() only lock a single key per call.  Write() will
+// sort its keys before locking them.  This guarantees that TransactionDB write
+// methods cannot deadlock with each other (but still could deadlock with a
+// Transaction).
+Status TransactionDBImpl::Put(const WriteOptions& options,
+                              ColumnFamilyHandle* column_family,
+                              const Slice& key, const Slice& val) {
+  Status s;
+
+  Transaction* txn = BeginInternalTransaction(options);
+
+  // Since the client didn't create a transaction, they don't care about
+  // conflict checking for this write.  So we just need to do PutUntracked().
+  s = txn->PutUntracked(column_family, key, val);
+
+  if (s.ok()) {
+    s = txn->Commit();
+  }
+
+  delete txn;
+
+  return s;
+}
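+
+// So a plain, non-transactional write on a TransactionDB, e.g.
+//
+//   txn_db->Put(WriteOptions(), "key", "value");
+//
+// is still wrapped in an internal transaction and takes the same locks,
+// which is what keeps it from racing with concurrent Transactions.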
+
+Status TransactionDBImpl::Delete(const WriteOptions& wopts,
+                                 ColumnFamilyHandle* column_family,
+                                 const Slice& key) {
+  Status s;
+
+  Transaction* txn = BeginInternalTransaction(wopts);
+
+  // Since the client didn't create a transaction, they don't care about
+  // conflict checking for this write.  So we just need to do
+  // DeleteUntracked().
+  s = txn->DeleteUntracked(column_family, key);
+
+  if (s.ok()) {
+    s = txn->Commit();
+  }
+
+  delete txn;
+
+  return s;
+}
+
+Status TransactionDBImpl::Merge(const WriteOptions& options,
+                                ColumnFamilyHandle* column_family,
+                                const Slice& key, const Slice& value) {
+  Status s;
+
+  Transaction* txn = BeginInternalTransaction(options);
+
+  // Since the client didn't create a transaction, they don't care about
+  // conflict checking for this write.  So we just need to do
+  // MergeUntracked().
+  s = txn->MergeUntracked(column_family, key, value);
+
+  if (s.ok()) {
+    s = txn->Commit();
+  }
+
+  delete txn;
+
+  return s;
+}
+
+Status TransactionDBImpl::Write(const WriteOptions& opts, WriteBatch* updates) {
+  // Need to lock all keys in this batch to prevent write conflicts with
+  // concurrent transactions.
+  Transaction* txn = BeginInternalTransaction(opts);
+
+  assert(dynamic_cast<TransactionImpl*>(txn) != nullptr);
+  auto txn_impl = reinterpret_cast<TransactionImpl*>(txn);
+
+  // Since CommitBatch() sorts the keys before locking, concurrent Write()
+  // operations will not cause a deadlock.
+  // In order to avoid a deadlock with a concurrent Transaction, Transactions
+  // should use a lock timeout.
+  Status s = txn_impl->CommitBatch(updates);
+
+  delete txn;
+
+  return s;
+}
+
+}  //  namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/transaction_db_impl.h b/src/rocksdb/utilities/transactions/transaction_db_impl.h
new file mode 100644
index 0000000..5a9d8b4
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/transaction_db_impl.h
@@ -0,0 +1,80 @@
+//  Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <string>
+
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "utilities/transactions/transaction_impl.h"
+#include "utilities/transactions/transaction_lock_mgr.h"
+
+namespace rocksdb {
+
+class TransactionDBImpl : public TransactionDB {
+ public:
+  explicit TransactionDBImpl(DB* db,
+                             const TransactionDBOptions& txn_db_options);
+
+  ~TransactionDBImpl() {}
+
+  Transaction* BeginTransaction(const WriteOptions& write_options,
+                                const TransactionOptions& txn_options) override;
+
+  using StackableDB::Put;
+  virtual Status Put(const WriteOptions& options,
+                     ColumnFamilyHandle* column_family, const Slice& key,
+                     const Slice& val) override;
+
+  using StackableDB::Delete;
+  virtual Status Delete(const WriteOptions& wopts,
+                        ColumnFamilyHandle* column_family,
+                        const Slice& key) override;
+
+  using StackableDB::Merge;
+  virtual Status Merge(const WriteOptions& options,
+                       ColumnFamilyHandle* column_family, const Slice& key,
+                       const Slice& value) override;
+
+  using StackableDB::Write;
+  virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override;
+
+  using StackableDB::CreateColumnFamily;
+  virtual Status CreateColumnFamily(const ColumnFamilyOptions& options,
+                                    const std::string& column_family_name,
+                                    ColumnFamilyHandle** handle) override;
+
+  using StackableDB::DropColumnFamily;
+  virtual Status DropColumnFamily(ColumnFamilyHandle* column_family) override;
+
+  Status TryLock(TransactionImpl* txn, uint32_t cfh_id, const std::string& key);
+
+  void UnLock(TransactionImpl* txn, const TransactionKeyMap* keys);
+  void UnLock(TransactionImpl* txn, uint32_t cfh_id, const std::string& key);
+
+  void AddColumnFamily(const ColumnFamilyHandle* handle);
+
+  static TransactionDBOptions ValidateTxnDBOptions(
+      const TransactionDBOptions& txn_db_options);
+
+  const TransactionDBOptions& GetTxnDBOptions() const {
+    return txn_db_options_;
+  }
+
+ private:
+  const TransactionDBOptions txn_db_options_;
+  TransactionLockMgr lock_mgr_;
+
+  // Must be held when adding/dropping column families.
+  InstrumentedMutex column_family_mutex_;
+  Transaction* BeginInternalTransaction(const WriteOptions& options);
+  Status WriteHelper(WriteBatch* updates, TransactionImpl* txn_impl);
+};
+
+}  //  namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/transaction_db_mutex_impl.cc b/src/rocksdb/utilities/transactions/transaction_db_mutex_impl.cc
new file mode 100644
index 0000000..185f8c7
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/transaction_db_mutex_impl.cc
@@ -0,0 +1,121 @@
+//  Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/transactions/transaction_db_mutex_impl.h"
+
+#include <chrono>
+#include <condition_variable>
+#include <functional>
+#include <mutex>
+
+#include "rocksdb/utilities/transaction_db_mutex.h"
+
+namespace rocksdb {
+
+class TransactionDBMutexImpl : public TransactionDBMutex {
+ public:
+  TransactionDBMutexImpl() {}
+  ~TransactionDBMutexImpl() {}
+
+  Status Lock() override;
+
+  Status TryLockFor(int64_t timeout_time) override;
+
+  void UnLock() override { mutex_.unlock(); }
+
+  friend class TransactionDBCondVarImpl;
+
+ private:
+  std::timed_mutex mutex_;
+};
+
+class TransactionDBCondVarImpl : public TransactionDBCondVar {
+ public:
+  TransactionDBCondVarImpl() {}
+  ~TransactionDBCondVarImpl() {}
+
+  Status Wait(std::shared_ptr<TransactionDBMutex> mutex) override;
+
+  Status WaitFor(std::shared_ptr<TransactionDBMutex> mutex,
+                 int64_t timeout_time) override;
+
+  void Notify() override { cv_.notify_one(); }
+
+  void NotifyAll() override { cv_.notify_all(); }
+
+ private:
+  std::condition_variable_any cv_;
+};
+
+std::shared_ptr<TransactionDBMutex>
+TransactionDBMutexFactoryImpl::AllocateMutex() {
+  return std::shared_ptr<TransactionDBMutex>(new TransactionDBMutexImpl());
+}
+
+std::shared_ptr<TransactionDBCondVar>
+TransactionDBMutexFactoryImpl::AllocateCondVar() {
+  return std::shared_ptr<TransactionDBCondVar>(new TransactionDBCondVarImpl());
+}
+
+Status TransactionDBMutexImpl::Lock() {
+  mutex_.lock();
+  return Status::OK();
+}
+
+Status TransactionDBMutexImpl::TryLockFor(int64_t timeout_time) {
+  bool locked = true;
+
+  if (timeout_time < 0) {
+    // If timeout is negative, we wait indefinitely to acquire the lock
+    mutex_.lock();
+  } else if (timeout_time == 0) {
+    locked = mutex_.try_lock();
+  } else {
+    // Attempt to acquire the lock unless we timeout
+    auto duration = std::chrono::microseconds(timeout_time);
+    locked = mutex_.try_lock_for(duration);
+  }
+
+  if (!locked) {
+    // timeout acquiring mutex
+    return Status::TimedOut(Status::SubCode::kMutexTimeout);
+  }
+
+  return Status::OK();
+}
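+
+// For reference, the timeout semantics implemented above (comment-only):
+//
+//   mutex->TryLockFor(-1);    // negative: block until the lock is acquired
+//   mutex->TryLockFor(0);     // zero: a single try_lock attempt
+//   mutex->TryLockFor(5000);  // positive: wait up to 5000us, else TimedOut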
+
+Status TransactionDBCondVarImpl::Wait(
+    std::shared_ptr<TransactionDBMutex> mutex) {
+  auto mutex_impl = reinterpret_cast<TransactionDBMutexImpl*>(mutex.get());
+  cv_.wait(mutex_impl->mutex_);
+  return Status::OK();
+}
+
+Status TransactionDBCondVarImpl::WaitFor(
+    std::shared_ptr<TransactionDBMutex> mutex, int64_t timeout_time) {
+  auto mutex_impl = reinterpret_cast<TransactionDBMutexImpl*>(mutex.get());
+
+  if (timeout_time < 0) {
+    // If timeout is negative, do not use a timeout
+    cv_.wait(mutex_impl->mutex_);
+  } else {
+    auto duration = std::chrono::microseconds(timeout_time);
+    auto cv_status = cv_.wait_for(mutex_impl->mutex_, duration);
+
+    // Check if the wait stopped due to timing out.
+    if (cv_status == std::cv_status::timeout) {
+      return Status::TimedOut(Status::SubCode::kMutexTimeout);
+    }
+  }
+
+  // CV was signaled, or we spuriously woke up (but didn't time out)
+  return Status::OK();
+}
+
+}  // namespace rocksdb
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/transaction_db_mutex_impl.h b/src/rocksdb/utilities/transactions/transaction_db_mutex_impl.h
new file mode 100644
index 0000000..7c915ca
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/transaction_db_mutex_impl.h
@@ -0,0 +1,26 @@
+//  Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/utilities/transaction_db_mutex.h"
+
+namespace rocksdb {
+
+class TransactionDBMutex;
+class TransactionDBCondVar;
+
+// Default implementation of TransactionDBMutexFactory.  May be overridden
+// by TransactionDBOptions.custom_mutex_factory.
+class TransactionDBMutexFactoryImpl : public TransactionDBMutexFactory {
+ public:
+  std::shared_ptr<TransactionDBMutex> AllocateMutex() override;
+  std::shared_ptr<TransactionDBCondVar> AllocateCondVar() override;
+};
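+
+// A comment-only sketch of supplying a custom factory instead of this
+// default one (MyMutexFactory is a hypothetical subclass of
+// TransactionDBMutexFactory):
+//
+//   TransactionDBOptions txn_db_options;
+//   txn_db_options.custom_mutex_factory = std::make_shared<MyMutexFactory>();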
+
+}  //  namespace rocksdb
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/transaction_impl.cc b/src/rocksdb/utilities/transactions/transaction_impl.cc
new file mode 100644
index 0000000..c2a93cf
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/transaction_impl.cc
@@ -0,0 +1,320 @@
+//  Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/transactions/transaction_impl.h"
+
+#include <map>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/db_impl.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/snapshot.h"
+#include "rocksdb/status.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "util/string_util.h"
+#include "utilities/transactions/transaction_db_impl.h"
+#include "utilities/transactions/transaction_util.h"
+
+namespace rocksdb {
+
+struct WriteOptions;
+
+std::atomic<TransactionID> TransactionImpl::txn_id_counter_(1);
+
+TransactionID TransactionImpl::GenTxnID() {
+  return txn_id_counter_.fetch_add(1);
+}
+
+TransactionImpl::TransactionImpl(TransactionDB* txn_db,
+                                 const WriteOptions& write_options,
+                                 const TransactionOptions& txn_options)
+    : TransactionBaseImpl(txn_db->GetBaseDB(), write_options),
+      txn_db_impl_(nullptr),
+      txn_id_(GenTxnID()),
+      expiration_time_(txn_options.expiration >= 0
+                           ? start_time_ + txn_options.expiration * 1000
+                           : 0),
+      lock_timeout_(txn_options.lock_timeout * 1000) {
+  txn_db_impl_ = dynamic_cast<TransactionDBImpl*>(txn_db);
+  assert(txn_db_impl_);
+
+  if (lock_timeout_ < 0) {
+    // Lock timeout not set, use default
+    lock_timeout_ =
+        txn_db_impl_->GetTxnDBOptions().transaction_lock_timeout * 1000;
+  }
+
+  if (txn_options.set_snapshot) {
+    SetSnapshot();
+  }
+}
+
+TransactionImpl::~TransactionImpl() {
+  txn_db_impl_->UnLock(this, &GetTrackedKeys());
+}
+
+void TransactionImpl::Clear() {
+  txn_db_impl_->UnLock(this, &GetTrackedKeys());
+  TransactionBaseImpl::Clear();
+}
+
+bool TransactionImpl::IsExpired() const {
+  if (expiration_time_ > 0) {
+    if (db_->GetEnv()->NowMicros() >= expiration_time_) {
+      // Transaction is expired.
+      return true;
+    }
+  }
+
+  return false;
+}
+
+Status TransactionImpl::CommitBatch(WriteBatch* batch) {
+  TransactionKeyMap keys_to_unlock;
+
+  Status s = LockBatch(batch, &keys_to_unlock);
+
+  if (s.ok()) {
+    s = DoCommit(batch);
+
+    txn_db_impl_->UnLock(this, &keys_to_unlock);
+  }
+
+  return s;
+}
+
+Status TransactionImpl::Commit() {
+  Status s = DoCommit(write_batch_->GetWriteBatch());
+
+  Clear();
+
+  return s;
+}
+
+Status TransactionImpl::DoCommit(WriteBatch* batch) {
+  Status s;
+
+  if (expiration_time_ > 0) {
+    // We cannot commit a transaction that is expired as its locks might have
+    // been released.
+    // To avoid race conditions, we need to use a WriteCallback to check the
+    // expiration time once we're on the writer thread.
+    TransactionCallback callback(this);
+
+    // Do the write directly on the base db, as TransactionDB::Write() would
+    // attempt conflict checking that we've already done.
+    assert(dynamic_cast<DBImpl*>(db_) != nullptr);
+    auto db_impl = reinterpret_cast<DBImpl*>(db_);
+
+    s = db_impl->WriteWithCallback(write_options_, batch, &callback);
+  } else {
+    s = db_->Write(write_options_, batch);
+  }
+
+  return s;
+}
+
+void TransactionImpl::Rollback() { Clear(); }
+
+Status TransactionImpl::RollbackToSavePoint() {
+  // Unlock any keys locked since the last savepoint
+  const TransactionKeyMap* keys = GetTrackedKeysSinceSavePoint();
+  if (keys) {
+    txn_db_impl_->UnLock(this, keys);
+  }
+
+  return TransactionBaseImpl::RollbackToSavePoint();
+}
+
+// Lock all keys in this batch.
+// On success, caller should unlock keys_to_unlock
+Status TransactionImpl::LockBatch(WriteBatch* batch,
+                                  TransactionKeyMap* keys_to_unlock) {
+  class Handler : public WriteBatch::Handler {
+   public:
+    // Sorted map of column_family_id to sorted set of keys.
+    // Since LockBatch() always locks keys in sorted order, it cannot deadlock
+    // with itself.  We're not using a comparator here since it doesn't matter
+    // what the sorting is as long as it's consistent.
+    std::map<uint32_t, std::set<std::string>> keys_;
+
+    Handler() {}
+
+    void RecordKey(uint32_t column_family_id, const Slice& key) {
+      std::string key_str = key.ToString();
+
+      auto iter = keys_[column_family_id].find(key_str);
+      if (iter == keys_[column_family_id].end()) {
+        // Key not yet seen; store it.
+        keys_[column_family_id].insert({std::move(key_str)});
+      }
+    }
+
+    virtual Status PutCF(uint32_t column_family_id, const Slice& key,
+                         const Slice& value) override {
+      RecordKey(column_family_id, key);
+      return Status::OK();
+    }
+    virtual Status MergeCF(uint32_t column_family_id, const Slice& key,
+                           const Slice& value) override {
+      RecordKey(column_family_id, key);
+      return Status::OK();
+    }
+    virtual Status DeleteCF(uint32_t column_family_id,
+                            const Slice& key) override {
+      RecordKey(column_family_id, key);
+      return Status::OK();
+    }
+  };
+
+  // Iterating the batch with this handler will add all of its keys into keys_
+  Handler handler;
+  batch->Iterate(&handler);
+
+  Status s;
+
+  // Attempt to lock all keys
+  for (const auto& cf_iter : handler.keys_) {
+    uint32_t cfh_id = cf_iter.first;
+    auto& cfh_keys = cf_iter.second;
+
+    for (const auto& key_iter : cfh_keys) {
+      const std::string& key = key_iter;
+
+      s = txn_db_impl_->TryLock(this, cfh_id, key);
+      if (!s.ok()) {
+        break;
+      }
+      (*keys_to_unlock)[cfh_id].insert({std::move(key), kMaxSequenceNumber});
+    }
+
+    if (!s.ok()) {
+      break;
+    }
+  }
+
+  if (!s.ok()) {
+    txn_db_impl_->UnLock(this, keys_to_unlock);
+  }
+
+  return s;
+}
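+
+// Because Handler::keys_ is an ordered map of ordered sets, two concurrent
+// LockBatch() calls over batches {B, A} and {A, B} both lock A before B, so
+// plain Write() calls cannot deadlock with each other.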
+
+// Attempt to lock this key.
+// Returns OK if the key has been successfully locked; non-OK otherwise.
+// If check_snapshot is true and this transaction has a snapshot set,
+// this key will only be locked if there have been no writes to this key since
+// the snapshot time.
+Status TransactionImpl::TryLock(ColumnFamilyHandle* column_family,
+                                const Slice& key, bool untracked) {
+  uint32_t cfh_id = GetColumnFamilyID(column_family);
+  std::string key_str = key.ToString();
+  bool previously_locked;
+  Status s;
+
+  // Even though we do not care about doing conflict checking for this write,
+  // we still need to take a lock to make sure we do not cause a conflict with
+  // some other write.  However, we do not need to check if there have been
+  // any writes since this transaction's snapshot.
+  // TODO(agiardullo): could optimize by supporting shared txn locks in the
+  // future
+  bool check_snapshot = !untracked;
+  SequenceNumber tracked_seqno = kMaxSequenceNumber;
+
+  // Lookup whether this key has already been locked by this transaction
+  const auto& tracked_keys = GetTrackedKeys();
+  const auto tracked_keys_cf = tracked_keys.find(cfh_id);
+  if (tracked_keys_cf == tracked_keys.end()) {
+    previously_locked = false;
+  } else {
+    auto iter = tracked_keys_cf->second.find(key_str);
+    if (iter == tracked_keys_cf->second.end()) {
+      previously_locked = false;
+    } else {
+      previously_locked = true;
+      tracked_seqno = iter->second;
+    }
+  }
+
+  // Lock this key if this transaction hasn't already locked it
+  if (!previously_locked) {
+    s = txn_db_impl_->TryLock(this, cfh_id, key_str);
+  }
+
+  if (s.ok()) {
+    // If a snapshot is set, we need to make sure the key hasn't been modified
+    // since the snapshot.  This must be done after we locked the key.
+    if (!check_snapshot || snapshot_ == nullptr) {
+      // Need to remember the earliest sequence number that we know this key
+      // has not been modified after.  This is useful if this same transaction
+      // later tries to lock this key again.
+      if (tracked_seqno == kMaxSequenceNumber) {
+        // Since we haven't checked a snapshot, we only know this key has not
+        // been modified since after we locked it.
+        tracked_seqno = db_->GetLatestSequenceNumber();
+      }
+    } else {
+      // If the key has been previously validated at a sequence number earlier
+      // than the current snapshot's sequence number, we already know it has not
+      // been modified.
+      SequenceNumber seq = snapshot_->snapshot()->GetSequenceNumber();
+      bool already_validated = tracked_seqno <= seq;
+
+      if (!already_validated) {
+        s = CheckKeySequence(column_family, key);
+
+        if (s.ok()) {
+          // Record that there have been no writes to this key after this
+          // sequence.
+          tracked_seqno = seq;
+        } else {
+          // Failed to validate key
+          if (!previously_locked) {
+            // Unlock key we just locked
+            txn_db_impl_->UnLock(this, cfh_id, key.ToString());
+          }
+        }
+      }
+    }
+  }
+
+  if (s.ok()) {
+    // Let base class know we've conflict checked this key.
+    TrackKey(cfh_id, key_str, tracked_seqno);
+  }
+
+  return s;
+}
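+
+// The net effect: TryLock() is lock-then-validate.  The key is locked first
+// and only then checked against the snapshot, since a conflicting write
+// could otherwise slip in between validation and locking.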
+
+// Return OK() if this key has not been modified more recently than the
+// transaction snapshot_.
+Status TransactionImpl::CheckKeySequence(ColumnFamilyHandle* column_family,
+                                         const Slice& key) {
+  Status result;
+  if (snapshot_ != nullptr) {
+    assert(dynamic_cast<DBImpl*>(db_) != nullptr);
+    auto db_impl = reinterpret_cast<DBImpl*>(db_);
+
+    ColumnFamilyHandle* cfh =
+        column_family ? column_family : db_impl->DefaultColumnFamily();
+
+    result = TransactionUtil::CheckKeyForConflicts(
+        db_impl, cfh, key.ToString(),
+        snapshot_->snapshot()->GetSequenceNumber());
+  }
+
+  return result;
+}
+
+}  // namespace rocksdb
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/transaction_impl.h b/src/rocksdb/utilities/transactions/transaction_impl.h
new file mode 100644
index 0000000..57ceacb
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/transaction_impl.h
@@ -0,0 +1,124 @@
+// Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <atomic>
+#include <stack>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "db/write_callback.h"
+#include "rocksdb/db.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/snapshot.h"
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+#include "rocksdb/utilities/transaction.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "utilities/transactions/transaction_base.h"
+#include "utilities/transactions/transaction_util.h"
+
+namespace rocksdb {
+
+using TransactionID = uint64_t;
+
+class TransactionDBImpl;
+
+class TransactionImpl : public TransactionBaseImpl {
+ public:
+  TransactionImpl(TransactionDB* db, const WriteOptions& write_options,
+                  const TransactionOptions& txn_options);
+
+  virtual ~TransactionImpl();
+
+  Status Commit() override;
+
+  Status CommitBatch(WriteBatch* batch);
+
+  void Rollback() override;
+
+  Status RollbackToSavePoint() override;
+
+  // Generate a new unique transaction identifier
+  static TransactionID GenTxnID();
+
+  TransactionID GetTxnID() const { return txn_id_; }
+
+  // Returns the time (in microseconds according to Env->NowMicros()) at
+  // which this transaction will expire.  Returns 0 if this transaction does
+  // not expire.
+  uint64_t GetExpirationTime() const { return expiration_time_; }
+
+  // returns true if this transaction has an expiration_time and has expired.
+  bool IsExpired() const;
+
+  // Returns the number of microseconds a transaction can wait on acquiring a
+  // lock or -1 if there is no timeout.
+  int64_t GetLockTimeout() const { return lock_timeout_; }
+  void SetLockTimeout(int64_t timeout) override {
+    lock_timeout_ = timeout * 1000;
+  }
+
+ protected:
+  Status TryLock(ColumnFamilyHandle* column_family, const Slice& key,
+                 bool untracked = false) override;
+
+ private:
+  TransactionDBImpl* txn_db_impl_;
+
+  // Used to create unique ids for transactions.
+  static std::atomic<TransactionID> txn_id_counter_;
+
+  // Unique ID for this transaction
+  const TransactionID txn_id_;
+
+  // If non-zero, this transaction should not be committed after this time (in
+  // microseconds according to Env->NowMicros())
+  const uint64_t expiration_time_;
+
+  // Timeout in microseconds when locking a key or -1 if there is no timeout.
+  int64_t lock_timeout_;
+
+  void Clear() override;
+
+  Status CheckKeySequence(ColumnFamilyHandle* column_family, const Slice& key);
+
+  Status LockBatch(WriteBatch* batch, TransactionKeyMap* keys_to_unlock);
+
+  Status DoCommit(WriteBatch* batch);
+
+  void RollbackLastN(size_t num);
+
+  // No copying allowed
+  TransactionImpl(const TransactionImpl&);
+  void operator=(const TransactionImpl&);
+};
+
+// Used at commit time to check whether transaction is committing before its
+// expiration time.
+class TransactionCallback : public WriteCallback {
+ public:
+  explicit TransactionCallback(TransactionImpl* txn) : txn_(txn) {}
+
+  Status Callback(DB* db) override {
+    if (txn_->IsExpired()) {
+      return Status::Expired();
+    } else {
+      return Status::OK();
+    }
+  }
+
+ private:
+  TransactionImpl* txn_;
+};
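+
+// A comment-only sketch tying expiration together: the expiration option is
+// given in milliseconds (note the * 1000 conversion to microseconds in the
+// TransactionImpl constructor), and this callback rejects a late Commit()
+// on the write thread:
+//
+//   TransactionOptions txn_options;
+//   txn_options.expiration = 500;  // txn may not commit after 500ms
+//   Transaction* txn = txn_db->BeginTransaction(write_options, txn_options);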
+
+}  // namespace rocksdb
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/transaction_lock_mgr.cc b/src/rocksdb/utilities/transactions/transaction_lock_mgr.cc
new file mode 100644
index 0000000..80e4fb8
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/transaction_lock_mgr.cc
@@ -0,0 +1,460 @@
+//  Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef ROCKSDB_LITE
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include "utilities/transactions/transaction_lock_mgr.h"
+
+#include <inttypes.h>
+
+#include <algorithm>
+#include <condition_variable>
+#include <functional>
+#include <mutex>
+#include <string>
+#include <vector>
+
+#include "rocksdb/slice.h"
+#include "rocksdb/utilities/transaction_db_mutex.h"
+#include "util/autovector.h"
+#include "util/murmurhash.h"
+#include "util/thread_local.h"
+
+namespace rocksdb {
+
+struct LockInfo {
+  TransactionID txn_id;
+
+  // Transaction locks are not valid after this time, in microseconds
+  uint64_t expiration_time;
+
+  LockInfo(TransactionID id, uint64_t time)
+      : txn_id(id), expiration_time(time) {}
+  LockInfo(const LockInfo& lock_info)
+      : txn_id(lock_info.txn_id), expiration_time(lock_info.expiration_time) {}
+};
+
+struct LockMapStripe {
+  explicit LockMapStripe(std::shared_ptr<TransactionDBMutexFactory> factory) {
+    stripe_mutex = factory->AllocateMutex();
+    stripe_cv = factory->AllocateCondVar();
+    assert(stripe_mutex);
+    assert(stripe_cv);
+  }
+
+  // Mutex must be held before modifying keys map
+  std::shared_ptr<TransactionDBMutex> stripe_mutex;
+
+  // Condition Variable per stripe for waiting on a lock
+  std::shared_ptr<TransactionDBCondVar> stripe_cv;
+
+  // Locked keys mapped to the info about the transactions that locked them.
+  // TODO(agiardullo): Explore performance of other data structures.
+  std::unordered_map<std::string, LockInfo> keys;
+};
+
+// Map of #num_stripes LockMapStripes
+struct LockMap {
+  explicit LockMap(size_t num_stripes,
+                   std::shared_ptr<TransactionDBMutexFactory> factory)
+      : num_stripes_(num_stripes) {
+    lock_map_stripes_.reserve(num_stripes);
+    for (size_t i = 0; i < num_stripes; i++) {
+      LockMapStripe* stripe = new LockMapStripe(factory);
+      lock_map_stripes_.push_back(stripe);
+    }
+  }
+
+  ~LockMap() {
+    for (auto stripe : lock_map_stripes_) {
+      delete stripe;
+    }
+  }
+
+  // Number of separate LockMapStripes to create, each with its own Mutex
+  const size_t num_stripes_;
+
+  // Count of keys that are currently locked in this column family.
+  // (Only maintained if TransactionLockMgr::max_num_locks_ is positive.)
+  std::atomic<int64_t> lock_cnt{0};
+
+  std::vector<LockMapStripe*> lock_map_stripes_;
+
+  size_t GetStripe(const std::string& key) const;
+};
+
+namespace {
+void UnrefLockMapsCache(void* ptr) {
+  // Called when a thread exits or a ThreadLocalPtr gets destroyed.
+  auto lock_maps_cache =
+      static_cast<std::unordered_map<uint32_t, std::shared_ptr<LockMap>>*>(ptr);
+  delete lock_maps_cache;
+}
+}  // anonymous namespace
+
+TransactionLockMgr::TransactionLockMgr(
+    size_t default_num_stripes, int64_t max_num_locks,
+    std::shared_ptr<TransactionDBMutexFactory> mutex_factory)
+    : default_num_stripes_(default_num_stripes),
+      max_num_locks_(max_num_locks),
+      mutex_factory_(mutex_factory),
+      lock_maps_cache_(new ThreadLocalPtr(&UnrefLockMapsCache)) {}
+
+TransactionLockMgr::~TransactionLockMgr() {}
+
+size_t LockMap::GetStripe(const std::string& key) const {
+  assert(num_stripes_ > 0);
+  static murmur_hash hash;
+  size_t stripe = hash(key) % num_stripes_;
+  return stripe;
+}
+
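+// Editor's note: an illustrative sketch of the striping scheme (the names
+// below are hypothetical and not part of this file).  Keys that hash to the
+// same stripe share one mutex/condvar pair:
+//
+//   LockMap map(16, factory);
+//   size_t s1 = map.GetStripe("user:1");  // some value in [0, 16)
+//   size_t s2 = map.GetStripe("user:2");  // may collide with s1, in which
+//                                         // case both keys contend on the
+//                                         // same stripe_mutex
+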
+void TransactionLockMgr::AddColumnFamily(uint32_t column_family_id) {
+  InstrumentedMutexLock l(&lock_map_mutex_);
+
+  if (lock_maps_.find(column_family_id) == lock_maps_.end()) {
+    lock_maps_.emplace(column_family_id,
+                       std::shared_ptr<LockMap>(
+                           new LockMap(default_num_stripes_, mutex_factory_)));
+  } else {
+    // column_family already exists in lock map
+    assert(false);
+  }
+}
+
+void TransactionLockMgr::RemoveColumnFamily(uint32_t column_family_id) {
+  // Remove lock_map for this column family.  Since the lock map is stored
+  // as a shared ptr, concurrent transactions can still keep using it
+  // until they release their reference to it.
+  {
+    InstrumentedMutexLock l(&lock_map_mutex_);
+
+    auto lock_maps_iter = lock_maps_.find(column_family_id);
+    assert(lock_maps_iter != lock_maps_.end());
+
+    lock_maps_.erase(lock_maps_iter);
+  }  // lock_map_mutex_
+
+  // Clear all thread-local caches
+  autovector<void*> local_caches;
+  lock_maps_cache_->Scrape(&local_caches, nullptr);
+  for (auto cache : local_caches) {
+    delete static_cast<LockMaps*>(cache);
+  }
+}
+
+// Look up the LockMap shared_ptr for a given column_family_id.
+// Note:  The LockMap is only valid as long as the caller is still holding on
+//   to the returned shared_ptr.
+std::shared_ptr<LockMap> TransactionLockMgr::GetLockMap(
+    uint32_t column_family_id) {
+  // First check thread-local cache
+  if (lock_maps_cache_->Get() == nullptr) {
+    lock_maps_cache_->Reset(new LockMaps());
+  }
+
+  auto lock_maps_cache = static_cast<LockMaps*>(lock_maps_cache_->Get());
+
+  auto lock_map_iter = lock_maps_cache->find(column_family_id);
+  if (lock_map_iter != lock_maps_cache->end()) {
+    // Found lock map for this column family.
+    return lock_map_iter->second;
+  }
+
+  // Not found in local cache, grab mutex and check shared LockMaps
+  InstrumentedMutexLock l(&lock_map_mutex_);
+
+  lock_map_iter = lock_maps_.find(column_family_id);
+  if (lock_map_iter == lock_maps_.end()) {
+    return std::shared_ptr<LockMap>(nullptr);
+  } else {
+    // Found lock map.  Store in thread-local cache and return.
+    std::shared_ptr<LockMap>& lock_map = lock_map_iter->second;
+    lock_maps_cache->insert({column_family_id, lock_map});
+
+    return lock_map;
+  }
+}
+
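+// Editor's note: GetLockMap() above is a read-mostly fast path: the
+// per-thread cache is consulted without any locking, and lock_map_mutex_ is
+// only taken on a cache miss.  Because the cache stores shared_ptrs, a
+// LockMap obtained here stays valid even if RemoveColumnFamily() drops it
+// from lock_maps_ concurrently.
+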
+// Returns true if this lock has expired and can be acquired by another
+// transaction.
+// If false, sets *expire_time to the absolute expiration time of the lock
+// (as reported by Env::NowMicros()), or 0 if the lock has no expiration.
+bool TransactionLockMgr::IsLockExpired(const LockInfo& lock_info, Env* env,
+                                       uint64_t* expire_time) {
+  auto now = env->NowMicros();
+
+  bool expired =
+      (lock_info.expiration_time > 0 && lock_info.expiration_time <= now);
+
+  if (!expired && lock_info.expiration_time > 0) {
+    // Return the absolute time (in microseconds) at which this lock expires
+    *expire_time = lock_info.expiration_time;
+  } else {
+    *expire_time = 0;
+  }
+
+  return expired;
+}
+
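+// Editor's example of the contract above: a lock stored with
+// expiration_time = NowMicros() + 5000 is reported as live with *expire_time
+// set to that absolute timestamp; once NowMicros() passes it,
+// IsLockExpired() returns true and AcquireLocked() may steal the lock.
+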
+Status TransactionLockMgr::TryLock(const TransactionImpl* txn,
+                                   uint32_t column_family_id,
+                                   const std::string& key, Env* env) {
+  // Lookup lock map for this column family id
+  std::shared_ptr<LockMap> lock_map_ptr = GetLockMap(column_family_id);
+  LockMap* lock_map = lock_map_ptr.get();
+  if (lock_map == nullptr) {
+    char msg[255];
+    snprintf(msg, sizeof(msg), "Column family id not found: %" PRIu32,
+             column_family_id);
+
+    return Status::InvalidArgument(msg);
+  }
+
+  // Need to lock the mutex for the stripe that this key hashes to
+  size_t stripe_num = lock_map->GetStripe(key);
+  assert(lock_map->lock_map_stripes_.size() > stripe_num);
+  LockMapStripe* stripe = lock_map->lock_map_stripes_.at(stripe_num);
+
+  LockInfo lock_info(txn->GetTxnID(), txn->GetExpirationTime());
+  int64_t timeout = txn->GetLockTimeout();
+
+  return AcquireWithTimeout(lock_map, stripe, key, env, timeout, lock_info);
+}
+
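+// Editor's sketch of a typical TryLock()/UnLock() pairing (mgr, txn, cf_id,
+// key and env are hypothetical stand-ins, not defined in this file):
+//
+//   Status s = mgr.TryLock(txn, cf_id, key, env);
+//   if (s.ok()) {
+//     // key is exclusively locked for txn until UnLock() or expiration
+//     mgr.UnLock(txn, cf_id, key, env);
+//   }
+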
+// Helper function for TryLock().
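+// A timeout < 0 waits indefinitely, timeout == 0 makes a single attempt, and
+// timeout > 0 bounds the total wait in microseconds (the wait may end sooner
+// if the current lock holder's lock expires first).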
+Status TransactionLockMgr::AcquireWithTimeout(LockMap* lock_map,
+                                              LockMapStripe* stripe,
+                                              const std::string& key, Env* env,
+                                              int64_t timeout,
+                                              const LockInfo& lock_info) {
+  Status result;
+  uint64_t start_time = 0;
+  uint64_t end_time = 0;
+
+  if (timeout > 0) {
+    start_time = env->NowMicros();
+    end_time = start_time + timeout;
+  }
+
+  if (timeout < 0) {
+    // If timeout is negative, we wait indefinitely to acquire the lock
+    result = stripe->stripe_mutex->Lock();
+  } else {
+    result = stripe->stripe_mutex->TryLockFor(timeout);
+  }
+
+  if (!result.ok()) {
+    // failed to acquire mutex
+    return result;
+  }
+
+  // Acquire lock if we are able to
+  uint64_t expire_time_hint = 0;
+  result =
+      AcquireLocked(lock_map, stripe, key, env, lock_info, &expire_time_hint);
+
+  if (!result.ok() && timeout != 0) {
+    // If we weren't able to acquire the lock, we will keep retrying as long
+    // as the timeout allows.
+    bool timed_out = false;
+    do {
+      // Decide how long to wait
+      int64_t cv_end_time = -1;
+
+      // Check if held lock's expiration time is sooner than our timeout
+      if (expire_time_hint > 0 &&
+          (timeout < 0 || (timeout > 0 && expire_time_hint < end_time))) {
+        // expiration time is sooner than our timeout
+        cv_end_time = expire_time_hint;
+      } else if (timeout >= 0) {
+        cv_end_time = end_time;
+      }
+
+      if (cv_end_time < 0) {
+        // Wait indefinitely
+        result = stripe->stripe_cv->Wait(stripe->stripe_mutex);
+      } else {
+        uint64_t now = env->NowMicros();
+        if (static_cast<uint64_t>(cv_end_time) > now) {
+          result = stripe->stripe_cv->WaitFor(stripe->stripe_mutex,
+                                              cv_end_time - now);
+        }
+      }
+
+      if (result.IsTimedOut()) {
+        timed_out = true;
+        // Even though we timed out, we will still make one more attempt to
+        // acquire the lock below (it is possible the lock expired and we
+        // were never signaled).
+      }
+
+      if (result.ok() || result.IsTimedOut()) {
+        result = AcquireLocked(lock_map, stripe, key, env, lock_info,
+                               &expire_time_hint);
+      }
+    } while (!result.ok() && !timed_out);
+  }
+
+  stripe->stripe_mutex->UnLock();
+
+  return result;
+}
+
+// Try to lock this key after we have acquired the mutex.
+// Sets *expire_time to the expiration time in microseconds
+//  or 0 if no expiration.
+// REQUIRED:  Stripe mutex must be held.
+Status TransactionLockMgr::AcquireLocked(LockMap* lock_map,
+                                         LockMapStripe* stripe,
+                                         const std::string& key, Env* env,
+                                         const LockInfo& txn_lock_info,
+                                         uint64_t* expire_time) {
+  Status result;
+  // Check if this key is already locked
+  if (stripe->keys.find(key) != stripe->keys.end()) {
+    // Lock already held
+
+    LockInfo& lock_info = stripe->keys.at(key);
+    if (lock_info.txn_id != txn_lock_info.txn_id) {
+      // locked by another txn.  Check if it's expired
+      if (IsLockExpired(lock_info, env, expire_time)) {
+        // lock is expired, can steal it
+        lock_info.txn_id = txn_lock_info.txn_id;
+        lock_info.expiration_time = txn_lock_info.expiration_time;
+        // lock_cnt does not change
+      } else {
+        result = Status::TimedOut(Status::SubCode::kLockTimeout);
+      }
+    }
+  } else {  // Lock not held.
+    // Check lock limit
+    if (max_num_locks_ > 0 &&
+        lock_map->lock_cnt.load(std::memory_order_acquire) >= max_num_locks_) {
+      result = Status::Busy(Status::SubCode::kLockLimit);
+    } else {
+      // acquire lock
+      stripe->keys.insert({key, txn_lock_info});
+
+      // Maintain lock count if there is a limit on the number of locks
+      if (max_num_locks_) {
+        lock_map->lock_cnt++;
+      }
+    }
+  }
+
+  return result;
+}
+
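+// Editor's note on lock stealing in AcquireLocked(): when the current
+// holder's lock has expired, the existing LockInfo is rewritten in place to
+// point at the new transaction.  stripe->keys.size() does not change, so
+// lock_cnt and the max_num_locks_ limit are unaffected by a steal.
+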
+void TransactionLockMgr::UnLock(TransactionImpl* txn, uint32_t column_family_id,
+                                const std::string& key, Env* env) {
+  std::shared_ptr<LockMap> lock_map_ptr = GetLockMap(column_family_id);
+  LockMap* lock_map = lock_map_ptr.get();
+  if (lock_map == nullptr) {
+    // Column Family must have been dropped.
+    return;
+  }
+
+  // Lock the mutex for the stripe that this key hashes to
+  size_t stripe_num = lock_map->GetStripe(key);
+  assert(lock_map->lock_map_stripes_.size() > stripe_num);
+  LockMapStripe* stripe = lock_map->lock_map_stripes_.at(stripe_num);
+
+  TransactionID txn_id = txn->GetTxnID();
+
+  stripe->stripe_mutex->Lock();
+
+  const auto& iter = stripe->keys.find(key);
+  if (iter != stripe->keys.end() && iter->second.txn_id == txn_id) {
+    // Found the key we locked.  unlock it.
+    stripe->keys.erase(iter);
+    if (max_num_locks_ > 0) {
+      // Maintain lock count if there is a limit on the number of locks.
+      assert(lock_map->lock_cnt.load(std::memory_order_relaxed) > 0);
+      lock_map->lock_cnt--;
+    }
+  } else {
+    // This key is either not locked or locked by someone else.  This should
+    // only happen if the unlocking transaction has expired.
+    assert(txn->GetExpirationTime() > 0 &&
+           txn->GetExpirationTime() < env->NowMicros());
+  }
+
+  stripe->stripe_mutex->UnLock();
+
+  // Signal waiting threads to retry locking
+  stripe->stripe_cv->NotifyAll();
+}
+
+void TransactionLockMgr::UnLock(const TransactionImpl* txn,
+                                const TransactionKeyMap* key_map, Env* env) {
+  TransactionID txn_id = txn->GetTxnID();
+
+  for (auto& key_map_iter : *key_map) {
+    uint32_t column_family_id = key_map_iter.first;
+    auto& keys = key_map_iter.second;
+
+    std::shared_ptr<LockMap> lock_map_ptr = GetLockMap(column_family_id);
+    LockMap* lock_map = lock_map_ptr.get();
+
+    if (lock_map == nullptr) {
+      // Column family must have been dropped, so its keys no longer need
+      // unlocking.  Other column families in the key map still do, so skip
+      // this entry rather than returning early.
+      continue;
+    }
+
+    // Bucket keys by lock_map_ stripe
+    std::unordered_map<size_t, std::vector<const std::string*>> keys_by_stripe(
+        std::max(keys.size(), lock_map->num_stripes_));
+
+    for (auto& key_iter : keys) {
+      const std::string& key = key_iter.first;
+
+      size_t stripe_num = lock_map->GetStripe(key);
+      keys_by_stripe[stripe_num].push_back(&key);
+    }
+
+    // For each stripe, grab the stripe mutex and unlock all keys in this stripe
+    for (auto& stripe_iter : keys_by_stripe) {
+      size_t stripe_num = stripe_iter.first;
+      auto& stripe_keys = stripe_iter.second;
+
+      assert(lock_map->lock_map_stripes_.size() > stripe_num);
+      LockMapStripe* stripe = lock_map->lock_map_stripes_.at(stripe_num);
+
+      stripe->stripe_mutex->Lock();
+
+      for (const std::string* key : stripe_keys) {
+        const auto& iter = stripe->keys.find(*key);
+        if (iter != stripe->keys.end() && iter->second.txn_id == txn_id) {
+          // Found the key we locked.  unlock it.
+          stripe->keys.erase(iter);
+          if (max_num_locks_ > 0) {
+            // Maintain lock count if there is a limit on the number of locks.
+            assert(lock_map->lock_cnt.load(std::memory_order_relaxed) > 0);
+            lock_map->lock_cnt--;
+          }
+        } else {
+          // This key is either not locked or locked by someone else.  This
+          // should only happen if the unlocking transaction has expired.
+          assert(txn->GetExpirationTime() > 0 &&
+                 txn->GetExpirationTime() < env->NowMicros());
+        }
+      }
+
+      stripe->stripe_mutex->UnLock();
+
+      // Signal waiting threads to retry locking
+      stripe->stripe_cv->NotifyAll();
+    }
+  }
+}
+
+}  //  namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/transaction_lock_mgr.h b/src/rocksdb/utilities/transactions/transaction_lock_mgr.h
new file mode 100644
index 0000000..8f640d4
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/transaction_lock_mgr.h
@@ -0,0 +1,108 @@
+//  Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include <chrono>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "rocksdb/utilities/transaction.h"
+#include "util/instrumented_mutex.h"
+#include "util/thread_local.h"
+#include "utilities/transactions/transaction_impl.h"
+
+namespace rocksdb {
+
+class ColumnFamilyHandle;
+struct LockInfo;
+struct LockMap;
+struct LockMapStripe;
+
+class Slice;
+
+class TransactionLockMgr {
+ public:
+  TransactionLockMgr(size_t default_num_stripes, int64_t max_num_locks,
+                     std::shared_ptr<TransactionDBMutexFactory> factory);
+
+  ~TransactionLockMgr();
+
+  // Creates a new LockMap for this column family.  Caller should guarantee
+  // that this column family does not already exist.
+  void AddColumnFamily(uint32_t column_family_id);
+
+  // Deletes the LockMap for this column family.  Caller should guarantee that
+  // this column family is no longer in use.
+  void RemoveColumnFamily(uint32_t column_family_id);
+
+  // Attempt to lock key.  If OK status is returned, the caller is responsible
+  // for calling UnLock() on this key.
+  Status TryLock(const TransactionImpl* txn, uint32_t column_family_id,
+                 const std::string& key, Env* env);
+
+  // Unlock a key locked by TryLock().  txn must be the same Transaction that
+  // locked this key.
+  void UnLock(const TransactionImpl* txn, const TransactionKeyMap* keys,
+              Env* env);
+  void UnLock(TransactionImpl* txn, uint32_t column_family_id,
+              const std::string& key, Env* env);
+
+ private:
+  // Default number of lock map stripes per column family
+  const size_t default_num_stripes_;
+
+  // Limit on number of keys locked per column family
+  const int64_t max_num_locks_;
+
+  // Used to allocate mutexes/condvars to use when locking keys
+  std::shared_ptr<TransactionDBMutexFactory> mutex_factory_;
+
+  // Must be held when accessing/modifying lock_maps_
+  InstrumentedMutex lock_map_mutex_;
+
+  // Map of ColumnFamilyId to locked key info
+  using LockMaps = std::unordered_map<uint32_t, std::shared_ptr<LockMap>>;
+  LockMaps lock_maps_;
+
+  // Thread-local cache of entries in lock_maps_.  This is an optimization
+  // to avoid acquiring a mutex in order to look up a LockMap
+  std::unique_ptr<ThreadLocalPtr> lock_maps_cache_;
+
+  bool IsLockExpired(const LockInfo& lock_info, Env* env,
+                     uint64_t* expire_time);
+
+  std::shared_ptr<LockMap> GetLockMap(uint32_t column_family_id);
+
+  Status AcquireWithTimeout(LockMap* lock_map, LockMapStripe* stripe,
+                            const std::string& key, Env* env, int64_t timeout,
+                            const LockInfo& lock_info);
+
+  Status AcquireLocked(LockMap* lock_map, LockMapStripe* stripe,
+                       const std::string& key, Env* env,
+                       const LockInfo& lock_info, uint64_t* expire_time);
+
+  // No copying allowed
+  TransactionLockMgr(const TransactionLockMgr&);
+  void operator=(const TransactionLockMgr&);
+};
+
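+// Editor's usage sketch (hypothetical; `factory` is any
+// TransactionDBMutexFactory implementation and `txn` a live
+// TransactionImpl):
+//
+//   TransactionLockMgr mgr(16 /* stripes per CF */, 0 /* no limit */,
+//                          factory);
+//   mgr.AddColumnFamily(cf_id);
+//   if (mgr.TryLock(txn, cf_id, "key", env).ok()) {
+//     // ... read/modify "key" while holding the lock ...
+//     mgr.UnLock(txn, cf_id, "key", env);
+//   }
+//   mgr.RemoveColumnFamily(cf_id);
+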
+}  //  namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/transaction_test.cc b/src/rocksdb/utilities/transactions/transaction_test.cc
new file mode 100644
index 0000000..dedc94c
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/transaction_test.cc
@@ -0,0 +1,1913 @@
+//  Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef ROCKSDB_LITE
+
+#include <string>
+
+#include "rocksdb/db.h"
+#include "rocksdb/utilities/transaction.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+#include "utilities/merge_operators.h"
+#include "utilities/merge_operators/string_append/stringappend.h"
+
+using std::string;
+
+namespace rocksdb {
+
+class TransactionTest : public testing::Test {
+ public:
+  TransactionDB* db;
+  string dbname;
+  Options options;
+
+  TransactionDBOptions txn_db_options;
+
+  TransactionTest() {
+    options.create_if_missing = true;
+    options.max_write_buffer_number = 2;
+    options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+    dbname = test::TmpDir() + "/transaction_testdb";
+
+    DestroyDB(dbname, options);
+    txn_db_options.transaction_lock_timeout = 0;
+    txn_db_options.default_lock_timeout = 0;
+    Status s = TransactionDB::Open(options, txn_db_options, dbname, &db);
+    assert(s.ok());
+  }
+
+  ~TransactionTest() {
+    delete db;
+    DestroyDB(dbname, options);
+  }
+};
+
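+// Editor's note: the fixture sets both lock timeouts to 0, so a conflicting
+// operation in these tests fails immediately with Status::TimedOut() rather
+// than blocking, except where individual tests override the timeouts.
+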
+TEST_F(TransactionTest, SuccessTest) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  string value;
+  Status s;
+
+  db->Put(write_options, Slice("foo"), Slice("bar"));
+  db->Put(write_options, Slice("foo2"), Slice("bar"));
+
+  Transaction* txn = db->BeginTransaction(write_options, TransactionOptions());
+  ASSERT_TRUE(txn);
+
+  ASSERT_EQ(0, txn->GetNumPuts());
+
+  s = txn->GetForUpdate(read_options, "foo", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "bar");
+
+  s = txn->Put(Slice("foo"), Slice("bar2"));
+  ASSERT_OK(s);
+
+  ASSERT_EQ(1, txn->GetNumPuts());
+
+  s = txn->GetForUpdate(read_options, "foo", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "bar2");
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, "foo", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "bar2");
+
+  delete txn;
+}
+
+TEST_F(TransactionTest, WriteConflictTest) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  string value;
+  Status s;
+
+  db->Put(write_options, "foo", "A");
+  db->Put(write_options, "foo2", "B");
+
+  Transaction* txn = db->BeginTransaction(write_options);
+  ASSERT_TRUE(txn);
+
+  s = txn->Put("foo", "A2");
+  ASSERT_OK(s);
+
+  s = txn->Put("foo2", "B2");
+  ASSERT_OK(s);
+
+  // This Put outside of a transaction will conflict with the previous write
+  s = db->Put(write_options, "foo", "xxx");
+  ASSERT_TRUE(s.IsTimedOut());
+
+  s = db->Get(read_options, "foo", &value);
+  ASSERT_EQ(value, "A");
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+
+  db->Get(read_options, "foo", &value);
+  ASSERT_EQ(value, "A2");
+  db->Get(read_options, "foo2", &value);
+  ASSERT_EQ(value, "B2");
+
+  delete txn;
+}
+
+TEST_F(TransactionTest, WriteConflictTest2) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  TransactionOptions txn_options;
+  string value;
+  Status s;
+
+  db->Put(write_options, "foo", "bar");
+
+  txn_options.set_snapshot = true;
+  Transaction* txn = db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn);
+
+  // This Put outside of a transaction will conflict with a later write
+  s = db->Put(write_options, "foo", "barz");
+  ASSERT_OK(s);
+
+  s = txn->Put("foo2", "X");
+  ASSERT_OK(s);
+
+  s = txn->Put("foo",
+               "bar2");  // Conflicts with write done after snapshot taken
+  ASSERT_TRUE(s.IsBusy());
+
+  s = txn->Put("foo3", "Y");
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, "foo", &value);
+  ASSERT_EQ(value, "barz");
+
+  ASSERT_EQ(2, txn->GetNumKeys());
+
+  s = txn->Commit();
+  ASSERT_OK(s);  // Txn should commit, but only write foo2 and foo3
+
+  // Verify that transaction wrote foo2 and foo3 but not foo
+  db->Get(read_options, "foo", &value);
+  ASSERT_EQ(value, "barz");
+
+  db->Get(read_options, "foo2", &value);
+  ASSERT_EQ(value, "X");
+
+  db->Get(read_options, "foo3", &value);
+  ASSERT_EQ(value, "Y");
+
+  delete txn;
+}
+
+TEST_F(TransactionTest, ReadConflictTest) {
+  WriteOptions write_options;
+  ReadOptions read_options, snapshot_read_options;
+  TransactionOptions txn_options;
+  string value;
+  Status s;
+
+  db->Put(write_options, "foo", "bar");
+  db->Put(write_options, "foo2", "bar");
+
+  txn_options.set_snapshot = true;
+  Transaction* txn = db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn);
+
+  txn->SetSnapshot();
+  snapshot_read_options.snapshot = txn->GetSnapshot();
+
+  txn->GetForUpdate(snapshot_read_options, "foo", &value);
+  ASSERT_EQ(value, "bar");
+
+  // This Put outside of a transaction will conflict with the previous read
+  s = db->Put(write_options, "foo", "barz");
+  ASSERT_TRUE(s.IsTimedOut());
+
+  s = db->Get(read_options, "foo", &value);
+  ASSERT_EQ(value, "bar");
+
+  s = txn->Get(read_options, "foo", &value);
+  ASSERT_EQ(value, "bar");
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+
+  delete txn;
+}
+
+TEST_F(TransactionTest, TxnOnlyTest) {
+  // Test to make sure transactions work when there are no other writes in an
+  // empty db.
+
+  WriteOptions write_options;
+  ReadOptions read_options;
+  string value;
+  Status s;
+
+  Transaction* txn = db->BeginTransaction(write_options);
+  ASSERT_TRUE(txn);
+
+  s = txn->Put("x", "y");
+  ASSERT_OK(s);
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+
+  delete txn;
+}
+
+TEST_F(TransactionTest, FlushTest) {
+  WriteOptions write_options;
+  ReadOptions read_options, snapshot_read_options;
+  string value;
+  Status s;
+
+  db->Put(write_options, Slice("foo"), Slice("bar"));
+  db->Put(write_options, Slice("foo2"), Slice("bar"));
+
+  Transaction* txn = db->BeginTransaction(write_options);
+  ASSERT_TRUE(txn);
+
+  snapshot_read_options.snapshot = txn->GetSnapshot();
+
+  txn->GetForUpdate(snapshot_read_options, "foo", &value);
+  ASSERT_EQ(value, "bar");
+
+  s = txn->Put(Slice("foo"), Slice("bar2"));
+  ASSERT_OK(s);
+
+  txn->GetForUpdate(snapshot_read_options, "foo", &value);
+  ASSERT_EQ(value, "bar2");
+
+  // Put a random key so we have a memtable to flush
+  s = db->Put(write_options, "dummy", "dummy");
+  ASSERT_OK(s);
+
+  // force a memtable flush
+  FlushOptions flush_ops;
+  db->Flush(flush_ops);
+
+  s = txn->Commit();
+  // txn should commit since the flushed table is still in MemtableList History
+  ASSERT_OK(s);
+
+  db->Get(read_options, "foo", &value);
+  ASSERT_EQ(value, "bar2");
+
+  delete txn;
+}
+
+TEST_F(TransactionTest, FlushTest2) {
+  WriteOptions write_options;
+  ReadOptions read_options, snapshot_read_options;
+  TransactionOptions txn_options;
+  string value;
+  Status s;
+
+  db->Put(write_options, Slice("foo"), Slice("bar"));
+  db->Put(write_options, Slice("foo2"), Slice("bar"));
+
+  txn_options.set_snapshot = true;
+  Transaction* txn = db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn);
+
+  snapshot_read_options.snapshot = txn->GetSnapshot();
+
+  txn->GetForUpdate(snapshot_read_options, "foo", &value);
+  ASSERT_EQ(value, "bar");
+
+  s = txn->Put(Slice("foo"), Slice("bar2"));
+  ASSERT_OK(s);
+
+  txn->GetForUpdate(snapshot_read_options, "foo", &value);
+  ASSERT_EQ(value, "bar2");
+
+  // Put a random key so we have a MemTable to flush
+  s = db->Put(write_options, "dummy", "dummy");
+  ASSERT_OK(s);
+
+  // force a memtable flush
+  FlushOptions flush_ops;
+  db->Flush(flush_ops);
+
+  // Put a random key so we have a MemTable to flush
+  s = db->Put(write_options, "dummy", "dummy2");
+  ASSERT_OK(s);
+
+  // force a memtable flush
+  db->Flush(flush_ops);
+
+  s = db->Put(write_options, "dummy", "dummy3");
+  ASSERT_OK(s);
+
+  // force a memtable flush
+  // Since our test db has max_write_buffer_number=2, this flush will cause
+  // the first memtable to get purged from the MemtableList history.
+  db->Flush(flush_ops);
+
+  s = txn->Put("X", "Y");
+  // Put should fail since the MemtableList history no longer reaches back
+  // to the txn's snapshot, so the conflict check cannot be performed.
+  ASSERT_TRUE(s.IsTryAgain());
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+
+  // Transaction should only write the keys that succeeded.
+  s = db->Get(read_options, "foo", &value);
+  ASSERT_EQ(value, "bar2");
+
+  s = db->Get(read_options, "X", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  delete txn;
+}
+
+TEST_F(TransactionTest, NoSnapshotTest) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  string value;
+  Status s;
+
+  db->Put(write_options, "AAA", "bar");
+
+  Transaction* txn = db->BeginTransaction(write_options);
+  ASSERT_TRUE(txn);
+
+  // Modify key after transaction start
+  db->Put(write_options, "AAA", "bar1");
+
+  // Read and write without a snapshot
+  txn->GetForUpdate(read_options, "AAA", &value);
+  ASSERT_EQ(value, "bar1");
+  s = txn->Put("AAA", "bar2");
+  ASSERT_OK(s);
+
+  // Should commit since read/write was done after data changed
+  s = txn->Commit();
+  ASSERT_OK(s);
+
+  txn->GetForUpdate(read_options, "AAA", &value);
+  ASSERT_EQ(value, "bar2");
+
+  delete txn;
+}
+
+TEST_F(TransactionTest, MultipleSnapshotTest) {
+  WriteOptions write_options;
+  ReadOptions read_options, snapshot_read_options;
+  string value;
+  Status s;
+
+  db->Put(write_options, "AAA", "bar");
+  db->Put(write_options, "BBB", "bar");
+  db->Put(write_options, "CCC", "bar");
+
+  Transaction* txn = db->BeginTransaction(write_options);
+  ASSERT_TRUE(txn);
+
+  db->Put(write_options, "AAA", "bar1");
+
+  // Read and write without a snapshot
+  txn->GetForUpdate(read_options, "AAA", &value);
+  ASSERT_EQ(value, "bar1");
+  s = txn->Put("AAA", "bar2");
+  ASSERT_OK(s);
+
+  // Modify BBB before snapshot is taken
+  db->Put(write_options, "BBB", "bar1");
+
+  txn->SetSnapshot();
+  snapshot_read_options.snapshot = txn->GetSnapshot();
+
+  // Read and write with snapshot
+  txn->GetForUpdate(snapshot_read_options, "BBB", &value);
+  ASSERT_EQ(value, "bar1");
+  s = txn->Put("BBB", "bar2");
+  ASSERT_OK(s);
+
+  db->Put(write_options, "CCC", "bar1");
+
+  // Set a new snapshot
+  txn->SetSnapshot();
+  snapshot_read_options.snapshot = txn->GetSnapshot();
+
+  // Read and write with snapshot
+  txn->GetForUpdate(snapshot_read_options, "CCC", &value);
+  ASSERT_EQ(value, "bar1");
+  s = txn->Put("CCC", "bar2");
+  ASSERT_OK(s);
+
+  s = txn->GetForUpdate(read_options, "AAA", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "bar2");
+  s = txn->GetForUpdate(read_options, "BBB", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "bar2");
+  s = txn->GetForUpdate(read_options, "CCC", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "bar2");
+
+  s = db->Get(read_options, "AAA", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "bar1");
+  s = db->Get(read_options, "BBB", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "bar1");
+  s = db->Get(read_options, "CCC", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "bar1");
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, "AAA", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "bar2");
+  s = db->Get(read_options, "BBB", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "bar2");
+  s = db->Get(read_options, "CCC", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "bar2");
+
+  // verify that we track multiple writes to the same key at different snapshots
+  delete txn;
+  txn = db->BeginTransaction(write_options);
+
+  // Potentially conflicting writes
+  db->Put(write_options, "ZZZ", "zzz");
+  db->Put(write_options, "XXX", "xxx");
+
+  txn->SetSnapshot();
+
+  TransactionOptions txn_options;
+  txn_options.set_snapshot = true;
+  Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
+  txn2->SetSnapshot();
+
+  // This should not conflict in txn since the snapshot is later than the
+  // previous write (spoiler alert:  it will later conflict with txn2).
+  s = txn->Put("ZZZ", "zzzz");
+  ASSERT_OK(s);
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+
+  delete txn;
+
+  // This will conflict since the snapshot is earlier than another write to ZZZ
+  s = txn2->Put("ZZZ", "xxxxx");
+  ASSERT_TRUE(s.IsBusy());
+
+  s = txn2->Commit();
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, "ZZZ", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "zzzz");
+
+  delete txn2;
+}
+
+TEST_F(TransactionTest, ColumnFamiliesTest) {
+  WriteOptions write_options;
+  ReadOptions read_options, snapshot_read_options;
+  TransactionOptions txn_options;
+  string value;
+  Status s;
+
+  ColumnFamilyHandle *cfa, *cfb;
+  ColumnFamilyOptions cf_options;
+
+  // Create 2 new column families
+  s = db->CreateColumnFamily(cf_options, "CFA", &cfa);
+  ASSERT_OK(s);
+  s = db->CreateColumnFamily(cf_options, "CFB", &cfb);
+  ASSERT_OK(s);
+
+  delete cfa;
+  delete cfb;
+  delete db;
+
+  // open DB with three column families
+  std::vector<ColumnFamilyDescriptor> column_families;
+  // have to open default column family
+  column_families.push_back(
+      ColumnFamilyDescriptor(kDefaultColumnFamilyName, ColumnFamilyOptions()));
+  // open the new column families
+  column_families.push_back(
+      ColumnFamilyDescriptor("CFA", ColumnFamilyOptions()));
+  column_families.push_back(
+      ColumnFamilyDescriptor("CFB", ColumnFamilyOptions()));
+
+  std::vector<ColumnFamilyHandle*> handles;
+
+  s = TransactionDB::Open(options, txn_db_options, dbname, column_families,
+                          &handles, &db);
+  ASSERT_OK(s);
+
+  Transaction* txn = db->BeginTransaction(write_options);
+  ASSERT_TRUE(txn);
+
+  txn->SetSnapshot();
+  snapshot_read_options.snapshot = txn->GetSnapshot();
+
+  txn_options.set_snapshot = true;
+  Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn2);
+
+  // Write some data to the db
+  WriteBatch batch;
+  batch.Put("foo", "foo");
+  batch.Put(handles[1], "AAA", "bar");
+  batch.Put(handles[1], "AAAZZZ", "bar");
+  s = db->Write(write_options, &batch);
+  ASSERT_OK(s);
+  db->Delete(write_options, handles[1], "AAAZZZ");
+
+  // These keys do not conflict with existing writes since they're in
+  // different column families
+  s = txn->Delete("AAA");
+  ASSERT_OK(s);
+  s = txn->GetForUpdate(snapshot_read_options, handles[1], "foo", &value);
+  ASSERT_TRUE(s.IsNotFound());
+  Slice key_slice("AAAZZZ");
+  Slice value_slices[2] = {Slice("bar"), Slice("bar")};
+  s = txn->Put(handles[2], SliceParts(&key_slice, 1),
+               SliceParts(value_slices, 2));
+  ASSERT_OK(s);
+  ASSERT_EQ(3, txn->GetNumKeys());
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+  s = db->Get(read_options, "AAA", &value);
+  ASSERT_TRUE(s.IsNotFound());
+  s = db->Get(read_options, handles[2], "AAAZZZ", &value);
+  ASSERT_EQ(value, "barbar");
+
+  Slice key_slices[3] = {Slice("AAA"), Slice("ZZ"), Slice("Z")};
+  Slice value_slice("barbarbar");
+
+  s = txn2->Delete(handles[2], "XXX");
+  ASSERT_OK(s);
+  s = txn2->Delete(handles[1], "XXX");
+  ASSERT_OK(s);
+
+  // This write will cause a conflict with the earlier batch write
+  s = txn2->Put(handles[1], SliceParts(key_slices, 3),
+                SliceParts(&value_slice, 1));
+  ASSERT_TRUE(s.IsBusy());
+
+  s = txn2->Commit();
+  ASSERT_OK(s);
+  s = db->Get(read_options, handles[1], "AAAZZZ", &value);
+  ASSERT_EQ(value, "barbar");
+
+  delete txn;
+  delete txn2;
+
+  txn = db->BeginTransaction(write_options, txn_options);
+  snapshot_read_options.snapshot = txn->GetSnapshot();
+
+  txn2 = db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn);
+
+  std::vector<ColumnFamilyHandle*> multiget_cfh = {handles[1], handles[2],
+                                                   handles[0], handles[2]};
+  std::vector<Slice> multiget_keys = {"AAA", "AAAZZZ", "foo", "foo"};
+  std::vector<std::string> values(4);
+
+  std::vector<Status> results = txn->MultiGetForUpdate(
+      snapshot_read_options, multiget_cfh, multiget_keys, &values);
+  ASSERT_OK(results[0]);
+  ASSERT_OK(results[1]);
+  ASSERT_OK(results[2]);
+  ASSERT_TRUE(results[3].IsNotFound());
+  ASSERT_EQ(values[0], "bar");
+  ASSERT_EQ(values[1], "barbar");
+  ASSERT_EQ(values[2], "foo");
+
+  s = txn->SingleDelete(handles[2], "ZZZ");
+  ASSERT_OK(s);
+  s = txn->Put(handles[2], "ZZZ", "YYY");
+  ASSERT_OK(s);
+  s = txn->Put(handles[2], "ZZZ", "YYYY");
+  ASSERT_OK(s);
+  s = txn->Delete(handles[2], "ZZZ");
+  ASSERT_OK(s);
+  s = txn->Put(handles[2], "AAAZZZ", "barbarbar");
+  ASSERT_OK(s);
+
+  ASSERT_EQ(5, txn->GetNumKeys());
+
+  // Txn should commit
+  s = txn->Commit();
+  ASSERT_OK(s);
+  s = db->Get(read_options, handles[2], "ZZZ", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  // Put a key which will conflict with the next txn using the previous snapshot
+  db->Put(write_options, handles[2], "foo", "000");
+
+  results = txn2->MultiGetForUpdate(snapshot_read_options, multiget_cfh,
+                                    multiget_keys, &values);
+  // All results should fail since there was a conflict
+  ASSERT_TRUE(results[0].IsBusy());
+  ASSERT_TRUE(results[1].IsBusy());
+  ASSERT_TRUE(results[2].IsBusy());
+  ASSERT_TRUE(results[3].IsBusy());
+
+  s = db->Get(read_options, handles[2], "foo", &value);
+  ASSERT_EQ(value, "000");
+
+  s = txn2->Commit();
+  ASSERT_OK(s);
+
+  s = db->DropColumnFamily(handles[1]);
+  ASSERT_OK(s);
+  s = db->DropColumnFamily(handles[2]);
+  ASSERT_OK(s);
+
+  delete txn;
+  delete txn2;
+
+  for (auto handle : handles) {
+    delete handle;
+  }
+}
+
+TEST_F(TransactionTest, ColumnFamiliesTest2) {
+  WriteOptions write_options;
+  ReadOptions read_options, snapshot_read_options;
+  TransactionOptions txn_options;
+  string value;
+  Status s;
+
+  ColumnFamilyHandle *one, *two;
+  ColumnFamilyOptions cf_options;
+
+  // Create 2 new column families
+  s = db->CreateColumnFamily(cf_options, "ONE", &one);
+  ASSERT_OK(s);
+  s = db->CreateColumnFamily(cf_options, "TWO", &two);
+  ASSERT_OK(s);
+
+  Transaction* txn1 = db->BeginTransaction(write_options);
+  ASSERT_TRUE(txn1);
+  Transaction* txn2 = db->BeginTransaction(write_options);
+  ASSERT_TRUE(txn2);
+
+  s = txn1->Put(one, "X", "1");
+  ASSERT_OK(s);
+  s = txn1->Put(two, "X", "2");
+  ASSERT_OK(s);
+  s = txn1->Put("X", "0");
+  ASSERT_OK(s);
+
+  s = txn2->Put(one, "X", "11");
+  ASSERT_TRUE(s.IsTimedOut());
+
+  s = txn1->Commit();
+  ASSERT_OK(s);
+
+  // Drop first column family
+  s = db->DropColumnFamily(one);
+  ASSERT_OK(s);
+
+  // txn2's only Put timed out and was never tracked, so its commit is an
+  // empty write and succeeds even though the column family was dropped.
+  s = txn2->Commit();
+  ASSERT_OK(s);
+
+  delete txn1;
+  txn1 = db->BeginTransaction(write_options);
+  ASSERT_TRUE(txn1);
+
+  // Should fail since column family was dropped
+  s = txn1->Put(one, "X", "111");
+  ASSERT_TRUE(s.IsInvalidArgument());
+
+  s = txn1->Put(two, "X", "222");
+  ASSERT_OK(s);
+
+  s = txn1->Put("X", "000");
+  ASSERT_OK(s);
+
+  s = txn1->Commit();
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, two, "X", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("222", value);
+
+  s = db->Get(read_options, "X", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("000", value);
+
+  s = db->DropColumnFamily(two);
+  ASSERT_OK(s);
+
+  delete txn1;
+  delete txn2;
+
+  delete one;
+  delete two;
+}
+
+TEST_F(TransactionTest, EmptyTest) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  string value;
+  Status s;
+
+  s = db->Put(write_options, "aaa", "aaa");
+  ASSERT_OK(s);
+
+  Transaction* txn = db->BeginTransaction(write_options);
+  s = txn->Commit();
+  ASSERT_OK(s);
+  delete txn;
+
+  txn = db->BeginTransaction(write_options);
+  txn->Rollback();
+  delete txn;
+
+  txn = db->BeginTransaction(write_options);
+  s = txn->GetForUpdate(read_options, "aaa", &value);
+  ASSERT_EQ(value, "aaa");
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+  delete txn;
+
+  txn = db->BeginTransaction(write_options);
+  txn->SetSnapshot();
+
+  s = txn->GetForUpdate(read_options, "aaa", &value);
+  ASSERT_EQ(value, "aaa");
+
+  // Conflicts with previous GetForUpdate
+  s = db->Put(write_options, "aaa", "xxx");
+  ASSERT_TRUE(s.IsTimedOut());
+
+  // The txn has no expiration set, so it can still commit after the conflict
+  s = txn->Commit();
+  ASSERT_OK(s);
+  delete txn;
+}
+
+TEST_F(TransactionTest, PredicateManyPreceders) {
+  WriteOptions write_options;
+  ReadOptions read_options1, read_options2;
+  TransactionOptions txn_options;
+  string value;
+  Status s;
+
+  txn_options.set_snapshot = true;
+  Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+  read_options1.snapshot = txn1->GetSnapshot();
+
+  Transaction* txn2 = db->BeginTransaction(write_options);
+  txn2->SetSnapshot();
+  read_options2.snapshot = txn2->GetSnapshot();
+
+  std::vector<Slice> multiget_keys = {"1", "2", "3"};
+  std::vector<std::string> multiget_values;
+
+  std::vector<Status> results =
+      txn1->MultiGetForUpdate(read_options1, multiget_keys, &multiget_values);
+  ASSERT_TRUE(results[1].IsNotFound());
+
+  s = txn2->Put("2", "x");  // Conflict's with txn1's MultiGetForUpdate
+  ASSERT_TRUE(s.IsTimedOut());
+
+  txn2->Rollback();
+
+  multiget_values.clear();
+  results =
+      txn1->MultiGetForUpdate(read_options1, multiget_keys, &multiget_values);
+  ASSERT_TRUE(results[1].IsNotFound());
+
+  s = txn1->Commit();
+  ASSERT_OK(s);
+
+  delete txn1;
+  delete txn2;
+
+  txn1 = db->BeginTransaction(write_options, txn_options);
+  read_options1.snapshot = txn1->GetSnapshot();
+
+  txn2 = db->BeginTransaction(write_options, txn_options);
+  read_options2.snapshot = txn2->GetSnapshot();
+
+  s = txn1->Put("4", "x");
+  ASSERT_OK(s);
+
+  s = txn2->Delete("4");  // conflict
+  ASSERT_TRUE(s.IsTimedOut());
+
+  s = txn1->Commit();
+  ASSERT_OK(s);
+
+  s = txn2->GetForUpdate(read_options2, "4", &value);
+  ASSERT_TRUE(s.IsBusy());
+
+  txn2->Rollback();
+
+  delete txn1;
+  delete txn2;
+}
+
+TEST_F(TransactionTest, LostUpdate) {
+  WriteOptions write_options;
+  ReadOptions read_options, read_options1, read_options2;
+  TransactionOptions txn_options;
+  string value;
+  Status s;
+
+  // Test 2 transactions writing to the same key in multiple orders and
+  // with/without snapshots
+
+  Transaction* txn1 = db->BeginTransaction(write_options);
+  Transaction* txn2 = db->BeginTransaction(write_options);
+
+  s = txn1->Put("1", "1");
+  ASSERT_OK(s);
+
+  s = txn2->Put("1", "2");  // conflict
+  ASSERT_TRUE(s.IsTimedOut());
+
+  s = txn2->Commit();
+  ASSERT_OK(s);
+
+  s = txn1->Commit();
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, "1", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("1", value);
+
+  delete txn1;
+  delete txn2;
+
+  txn_options.set_snapshot = true;
+  txn1 = db->BeginTransaction(write_options, txn_options);
+  read_options1.snapshot = txn1->GetSnapshot();
+
+  txn2 = db->BeginTransaction(write_options, txn_options);
+  read_options2.snapshot = txn2->GetSnapshot();
+
+  s = txn1->Put("1", "3");
+  ASSERT_OK(s);
+  s = txn2->Put("1", "4");  // conflict
+  ASSERT_TRUE(s.IsTimedOut());
+
+  s = txn1->Commit();
+  ASSERT_OK(s);
+
+  s = txn2->Commit();
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, "1", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("3", value);
+
+  delete txn1;
+  delete txn2;
+
+  txn1 = db->BeginTransaction(write_options, txn_options);
+  read_options1.snapshot = txn1->GetSnapshot();
+
+  txn2 = db->BeginTransaction(write_options, txn_options);
+  read_options2.snapshot = txn2->GetSnapshot();
+
+  s = txn1->Put("1", "5");
+  ASSERT_OK(s);
+
+  s = txn1->Commit();
+  ASSERT_OK(s);
+
+  s = txn2->Put("1", "6");
+  ASSERT_TRUE(s.IsBusy());
+  s = txn2->Commit();
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, "1", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("5", value);
+
+  delete txn1;
+  delete txn2;
+
+  txn1 = db->BeginTransaction(write_options, txn_options);
+  read_options1.snapshot = txn1->GetSnapshot();
+
+  txn2 = db->BeginTransaction(write_options, txn_options);
+  read_options2.snapshot = txn2->GetSnapshot();
+
+  s = txn1->Put("1", "7");
+  ASSERT_OK(s);
+  s = txn1->Commit();
+  ASSERT_OK(s);
+
+  txn2->SetSnapshot();
+  s = txn2->Put("1", "8");
+  ASSERT_OK(s);
+  s = txn2->Commit();
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, "1", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("8", value);
+
+  delete txn1;
+  delete txn2;
+
+  txn1 = db->BeginTransaction(write_options);
+  txn2 = db->BeginTransaction(write_options);
+
+  s = txn1->Put("1", "9");
+  ASSERT_OK(s);
+  s = txn1->Commit();
+  ASSERT_OK(s);
+
+  s = txn2->Put("1", "10");
+  ASSERT_OK(s);
+  s = txn2->Commit();
+  ASSERT_OK(s);
+
+  delete txn1;
+  delete txn2;
+
+  s = db->Get(read_options, "1", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ(value, "10");
+}
+
+TEST_F(TransactionTest, UntrackedWrites) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  string value;
+  Status s;
+
+  // Verify transaction rollback works for untracked keys.
+  Transaction* txn = db->BeginTransaction(write_options);
+  txn->SetSnapshot();
+
+  s = txn->PutUntracked("untracked", "0");
+  ASSERT_OK(s);
+  txn->Rollback();
+  s = db->Get(read_options, "untracked", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  delete txn;
+  txn = db->BeginTransaction(write_options);
+  txn->SetSnapshot();
+
+  s = db->Put(write_options, "untracked", "x");
+  ASSERT_OK(s);
+
+  // Untracked writes should succeed even though key was written after snapshot
+  s = txn->PutUntracked("untracked", "1");
+  ASSERT_OK(s);
+  s = txn->MergeUntracked("untracked", "2");
+  ASSERT_OK(s);
+  s = txn->DeleteUntracked("untracked");
+  ASSERT_OK(s);
+
+  // Conflict
+  s = txn->Put("untracked", "3");
+  ASSERT_TRUE(s.IsBusy());
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, "untracked", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  delete txn;
+}
+
+TEST_F(TransactionTest, ExpiredTransaction) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  TransactionOptions txn_options;
+  string value;
+  Status s;
+
+  // Set txn expiration timeout to 0 milliseconds (expires instantly)
+  txn_options.expiration = 0;
+  Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+
+  s = txn1->Put("X", "1");
+  ASSERT_OK(s);
+
+  s = txn1->Put("Y", "1");
+  ASSERT_OK(s);
+
+  Transaction* txn2 = db->BeginTransaction(write_options);
+
+  // txn2 should be able to write to X since txn1 has expired
+  s = txn2->Put("X", "2");
+  ASSERT_OK(s);
+
+  s = txn2->Commit();
+  ASSERT_OK(s);
+  s = db->Get(read_options, "X", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("2", value);
+
+  s = txn1->Put("Z", "1");
+  ASSERT_OK(s);
+
+  // txn1 should fail to commit since it is expired
+  s = txn1->Commit();
+  ASSERT_TRUE(s.IsExpired());
+
+  s = db->Get(read_options, "Y", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = db->Get(read_options, "Z", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  delete txn1;
+  delete txn2;
+}
+
+TEST_F(TransactionTest, Rollback) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  TransactionOptions txn_options;
+  string value;
+  Status s;
+
+  Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn1);
+
+  s = txn1->Put("X", "1");
+  ASSERT_OK(s);
+
+  Transaction* txn2 = db->BeginTransaction(write_options);
+
+  // txn2 should not be able to write to X since txn1 has it locked
+  s = txn2->Put("X", "2");
+  ASSERT_TRUE(s.IsTimedOut());
+
+  txn1->Rollback();
+  delete txn1;
+
+  // txn2 should now be able to write to X
+  s = txn2->Put("X", "3");
+  ASSERT_OK(s);
+
+  s = txn2->Commit();
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, "X", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("3", value);
+
+  delete txn2;
+}
+
+TEST_F(TransactionTest, LockLimitTest) {
+  WriteOptions write_options;
+  ReadOptions read_options, snapshot_read_options;
+  TransactionOptions txn_options;
+  string value;
+  Status s;
+
+  delete db;
+
+  // Open DB with a lock limit of 3
+  txn_db_options.max_num_locks = 3;
+  s = TransactionDB::Open(options, txn_db_options, dbname, &db);
+  ASSERT_OK(s);
+
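+  // Editor's note: max_num_locks caps tracked locks per column family; once
+  // the cap is reached, new lock attempts return Status::Busy() while
+  // re-locking an already-held key still succeeds, as exercised below.
+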
+  // Create a txn and verify we can only lock up to 3 keys
+  Transaction* txn = db->BeginTransaction(write_options);
+  ASSERT_TRUE(txn);
+
+  s = txn->Put("X", "x");
+  ASSERT_OK(s);
+
+  s = txn->Put("Y", "y");
+  ASSERT_OK(s);
+
+  s = txn->Put("Z", "z");
+  ASSERT_OK(s);
+
+  // lock limit reached
+  s = txn->Put("W", "w");
+  ASSERT_TRUE(s.IsBusy());
+
+  // re-locking same key shouldn't put us over the limit
+  s = txn->Put("X", "xx");
+  ASSERT_OK(s);
+
+  s = txn->GetForUpdate(read_options, "W", &value);
+  ASSERT_TRUE(s.IsBusy());
+  s = txn->GetForUpdate(read_options, "V", &value);
+  ASSERT_TRUE(s.IsBusy());
+
+  // re-locking same key shouldn't put us over the limit
+  s = txn->GetForUpdate(read_options, "Y", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("y", value);
+
+  s = txn->Get(read_options, "W", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  Transaction* txn2 = db->BeginTransaction(write_options);
+  ASSERT_TRUE(txn2);
+
+  // "X" currently locked
+  s = txn2->Put("X", "x");
+  ASSERT_TRUE(s.IsTimedOut());
+
+  // lock limit reached
+  s = txn2->Put("M", "m");
+  ASSERT_TRUE(s.IsBusy());
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, "X", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("xx", value);
+
+  s = db->Get(read_options, "W", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  // Committing txn should release its locks and allow txn2 to proceed
+  s = txn2->Put("X", "x2");
+  ASSERT_OK(s);
+
+  s = txn2->Delete("X");
+  ASSERT_OK(s);
+
+  s = txn2->Put("M", "m");
+  ASSERT_OK(s);
+
+  s = txn2->Put("Z", "z2");
+  ASSERT_OK(s);
+
+  // lock limit reached
+  s = txn2->Delete("Y");
+  ASSERT_TRUE(s.IsBusy());
+
+  s = txn2->Commit();
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, "Z", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("z2", value);
+
+  s = db->Get(read_options, "Y", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("y", value);
+
+  s = db->Get(read_options, "X", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  delete txn;
+  delete txn2;
+}
+
+TEST_F(TransactionTest, IteratorTest) {
+  WriteOptions write_options;
+  ReadOptions read_options, snapshot_read_options;
+  TransactionOptions txn_options;
+  string value;
+  Status s;
+
+  // Write some keys to the db
+  s = db->Put(write_options, "A", "a");
+  ASSERT_OK(s);
+
+  s = db->Put(write_options, "G", "g");
+  ASSERT_OK(s);
+
+  s = db->Put(write_options, "F", "f");
+  ASSERT_OK(s);
+
+  s = db->Put(write_options, "C", "c");
+  ASSERT_OK(s);
+
+  s = db->Put(write_options, "D", "d");
+  ASSERT_OK(s);
+
+  Transaction* txn = db->BeginTransaction(write_options);
+  ASSERT_TRUE(txn);
+
+  // Write some keys in a txn
+  s = txn->Put("B", "b");
+  ASSERT_OK(s);
+
+  s = txn->Put("H", "h");
+  ASSERT_OK(s);
+
+  s = txn->Delete("D");
+  ASSERT_OK(s);
+
+  s = txn->Put("E", "e");
+  ASSERT_OK(s);
+
+  txn->SetSnapshot();
+  const Snapshot* snapshot = txn->GetSnapshot();
+
+  // Write some keys to the db after the snapshot
+  s = db->Put(write_options, "BB", "xx");
+  ASSERT_OK(s);
+
+  s = db->Put(write_options, "C", "xx");
+  ASSERT_OK(s);
+
+  read_options.snapshot = snapshot;
+  Iterator* iter = txn->GetIterator(read_options);
+  ASSERT_OK(iter->status());
+  iter->SeekToFirst();
+
+  // Read all keys via iter and lock them all
+  std::string results[] = {"a", "b", "c", "e", "f", "g", "h"};
+  for (int i = 0; i < 7; i++) {
+    ASSERT_OK(iter->status());
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(results[i], iter->value().ToString());
+
+    s = txn->GetForUpdate(read_options, iter->key(), nullptr);
+    if (i == 2) {
+      // "C" was modified after txn's snapshot
+      ASSERT_TRUE(s.IsBusy());
+    } else {
+      ASSERT_OK(s);
+    }
+
+    iter->Next();
+  }
+  ASSERT_FALSE(iter->Valid());
+
+  iter->Seek("G");
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("g", iter->value().ToString());
+
+  iter->Prev();
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("f", iter->value().ToString());
+
+  iter->Seek("D");
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("e", iter->value().ToString());
+
+  iter->Seek("C");
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("c", iter->value().ToString());
+
+  iter->Next();
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("e", iter->value().ToString());
+
+  iter->Seek("");
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("a", iter->value().ToString());
+
+  iter->Seek("X");
+  ASSERT_OK(iter->status());
+  ASSERT_FALSE(iter->Valid());
+
+  iter->SeekToLast();
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("h", iter->value().ToString());
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+
+  delete iter;
+  delete txn;
+}
+
+TEST_F(TransactionTest, SavepointTest) {
+  WriteOptions write_options;
+  ReadOptions read_options, snapshot_read_options;
+  TransactionOptions txn_options;
+  string value;
+  Status s;
+
+  Transaction* txn = db->BeginTransaction(write_options);
+  ASSERT_TRUE(txn);
+
+  ASSERT_EQ(0, txn->GetNumPuts());
+
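+  // Editor's note: SetSavePoint() pushes onto a per-txn stack and
+  // RollbackToSavePoint() pops; with an empty stack it returns NotFound,
+  // which the assertions below exercise.
+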
+  s = txn->RollbackToSavePoint();
+  ASSERT_TRUE(s.IsNotFound());
+
+  txn->SetSavePoint();  // 1
+
+  ASSERT_OK(txn->RollbackToSavePoint());  // Rollback to beginning of txn
+  s = txn->RollbackToSavePoint();
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = txn->Put("B", "b");
+  ASSERT_OK(s);
+
+  ASSERT_EQ(1, txn->GetNumPuts());
+  ASSERT_EQ(0, txn->GetNumDeletes());
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, "B", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("b", value);
+
+  delete txn;
+  txn = db->BeginTransaction(write_options);
+  ASSERT_TRUE(txn);
+
+  s = txn->Put("A", "a");
+  ASSERT_OK(s);
+
+  s = txn->Put("B", "bb");
+  ASSERT_OK(s);
+
+  s = txn->Put("C", "c");
+  ASSERT_OK(s);
+
+  txn->SetSavePoint();  // 2
+
+  s = txn->Delete("B");
+  ASSERT_OK(s);
+
+  s = txn->Put("C", "cc");
+  ASSERT_OK(s);
+
+  s = txn->Put("D", "d");
+  ASSERT_OK(s);
+
+  ASSERT_EQ(5, txn->GetNumPuts());
+  ASSERT_EQ(1, txn->GetNumDeletes());
+
+  ASSERT_OK(txn->RollbackToSavePoint());  // Rollback to 2
+
+  ASSERT_EQ(3, txn->GetNumPuts());
+  ASSERT_EQ(0, txn->GetNumDeletes());
+
+  s = txn->Get(read_options, "A", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("a", value);
+
+  s = txn->Get(read_options, "B", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("bb", value);
+
+  s = txn->Get(read_options, "C", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("c", value);
+
+  s = txn->Get(read_options, "D", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = txn->Put("A", "a");
+  ASSERT_OK(s);
+
+  s = txn->Put("E", "e");
+  ASSERT_OK(s);
+
+  ASSERT_EQ(5, txn->GetNumPuts());
+  ASSERT_EQ(0, txn->GetNumDeletes());
+
+  // Rollback to beginning of txn
+  s = txn->RollbackToSavePoint();
+  ASSERT_TRUE(s.IsNotFound());
+  txn->Rollback();
+
+  ASSERT_EQ(0, txn->GetNumPuts());
+  ASSERT_EQ(0, txn->GetNumDeletes());
+
+  s = txn->Get(read_options, "A", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = txn->Get(read_options, "B", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("b", value);
+
+  s = txn->Get(read_options, "D", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = txn->Get(read_options, "E", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = txn->Put("A", "aa");
+  ASSERT_OK(s);
+
+  s = txn->Put("F", "f");
+  ASSERT_OK(s);
+
+  ASSERT_EQ(2, txn->GetNumPuts());
+  ASSERT_EQ(0, txn->GetNumDeletes());
+
+  txn->SetSavePoint();  // 3
+  txn->SetSavePoint();  // 4
+
+  s = txn->Put("G", "g");
+  ASSERT_OK(s);
+
+  s = txn->SingleDelete("F");
+  ASSERT_OK(s);
+
+  s = txn->Delete("B");
+  ASSERT_OK(s);
+
+  s = txn->Get(read_options, "A", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("aa", value);
+
+  s = txn->Get(read_options, "F", &value);
+  // According to db.h, doing a SingleDelete on a key that has been
+  // overwritten will have undefined behavior.  So it is unclear what the
+  // result of fetching "F" should be. The current implementation will
+  // return NotFound in this case.
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = txn->Get(read_options, "B", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  ASSERT_EQ(3, txn->GetNumPuts());
+  ASSERT_EQ(2, txn->GetNumDeletes());
+
+  ASSERT_OK(txn->RollbackToSavePoint());  // Rollback to 3
+
+  ASSERT_EQ(2, txn->GetNumPuts());
+  ASSERT_EQ(0, txn->GetNumDeletes());
+
+  s = txn->Get(read_options, "F", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("f", value);
+
+  s = txn->Get(read_options, "G", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, "F", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("f", value);
+
+  s = db->Get(read_options, "G", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = db->Get(read_options, "A", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("aa", value);
+
+  s = db->Get(read_options, "B", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("b", value);
+
+  s = db->Get(read_options, "C", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = db->Get(read_options, "D", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = db->Get(read_options, "E", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  delete txn;
+}
+
+TEST_F(TransactionTest, SavepointTest2) {
+  WriteOptions write_options;
+  ReadOptions read_options, snapshot_read_options;
+  TransactionOptions txn_options;
+  string value;
+  Status s;
+
+  txn_options.lock_timeout = 1;  // 1 ms
+  Transaction* txn1 = db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn1);
+
+  s = txn1->Put("A", "");
+  ASSERT_OK(s);
+
+  txn1->SetSavePoint();  // 1
+
+  s = txn1->Put("A", "a");
+  ASSERT_OK(s);
+
+  s = txn1->Put("C", "c");
+  ASSERT_OK(s);
+
+  txn1->SetSavePoint();  // 2
+
+  s = txn1->Put("A", "a");
+  ASSERT_OK(s);
+  s = txn1->Put("B", "b");
+  ASSERT_OK(s);
+
+  ASSERT_OK(txn1->RollbackToSavePoint());  // Rollback to 2
+
+  // Verify that "A" and "C" is still locked while "B" is not
+  Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn2);
+
+  s = txn2->Put("A", "a2");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("C", "c2");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("B", "b2");
+  ASSERT_OK(s);
+
+  s = txn1->Put("A", "aa");
+  ASSERT_OK(s);
+  s = txn1->Put("B", "bb");
+  ASSERT_TRUE(s.IsTimedOut());
+
+  s = txn2->Commit();
+  ASSERT_OK(s);
+  delete txn2;
+
+  s = txn1->Put("A", "aaa");
+  ASSERT_OK(s);
+  s = txn1->Put("B", "bbb");
+  ASSERT_OK(s);
+  s = txn1->Put("C", "ccc");
+  ASSERT_OK(s);
+
+  txn1->SetSavePoint();                    // 3
+  ASSERT_OK(txn1->RollbackToSavePoint());  // Rollback to 3
+
+  // Verify that "A", "B", "C" are still locked
+  txn2 = db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn2);
+
+  s = txn2->Put("A", "a2");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("B", "b2");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("C", "c2");
+  ASSERT_TRUE(s.IsTimedOut());
+
+  ASSERT_OK(txn1->RollbackToSavePoint());  // Rollback to 1
+
+  // Verify that only "A" is locked
+  s = txn2->Put("A", "a3");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Put("B", "b3");
+  ASSERT_OK(s);
+  s = txn2->Put("C", "c3po");
+  ASSERT_OK(s);
+
+  s = txn1->Commit();
+  ASSERT_OK(s);
+  delete txn1;
+
+  // Verify "A" "C" "B" are no longer locked
+  s = txn2->Put("A", "a4");
+  ASSERT_OK(s);
+  s = txn2->Put("B", "b4");
+  ASSERT_OK(s);
+  s = txn2->Put("C", "c4");
+  ASSERT_OK(s);
+
+  s = txn2->Commit();
+  ASSERT_OK(s);
+  delete txn2;
+}
+
+TEST_F(TransactionTest, TimeoutTest) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  string value;
+  Status s;
+
+  delete db;
+
+  // Both transaction writes and plain db writes get an infinite lock
+  // timeout here; the transaction's lock timeout is overridden below
+  // when the txn is started.
+  txn_db_options.transaction_lock_timeout = -1;
+  txn_db_options.default_lock_timeout = -1;
+
+  s = TransactionDB::Open(options, txn_db_options, dbname, &db);
+  ASSERT_OK(s);
+
+  s = db->Put(write_options, "aaa", "aaa");
+  ASSERT_OK(s);
+
+  TransactionOptions txn_options0;
+  txn_options0.expiration = 100;  // 100ms
+  txn_options0.lock_timeout = 50;  // txn timeout no longer infinite
+  Transaction* txn1 = db->BeginTransaction(write_options, txn_options0);
+
+  s = txn1->GetForUpdate(read_options, "aaa", nullptr);
+  ASSERT_OK(s);
+
+  // Conflicts with previous GetForUpdate.
+  // Since db writes do not have a timeout, this should eventually succeed when
+  // the transaction expires.
+  s = db->Put(write_options, "aaa", "xxx");
+  ASSERT_OK(s);
+
+  ASSERT_GE(txn1->GetElapsedTime(),
+            static_cast<uint64_t>(txn_options0.expiration));
+
+  s = txn1->Commit();
+  ASSERT_TRUE(s.IsExpired());  // expired!
+
+  s = db->Get(read_options, "aaa", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("xxx", value);
+
+  delete txn1;
+  delete db;
+
+  // transaction writes have a 50ms lock timeout,
+  // db writes have an infinite timeout
+  txn_db_options.transaction_lock_timeout = 50;
+  txn_db_options.default_lock_timeout = -1;
+
+  s = TransactionDB::Open(options, txn_db_options, dbname, &db);
+  ASSERT_OK(s);
+
+  s = db->Put(write_options, "aaa", "aaa");
+  ASSERT_OK(s);
+
+  TransactionOptions txn_options;
+  txn_options.expiration = 100;  // 100ms
+  txn1 = db->BeginTransaction(write_options, txn_options);
+
+  s = txn1->GetForUpdate(read_options, "aaa", nullptr);
+  ASSERT_OK(s);
+
+  // Conflicts with previous GetForUpdate.
+  // Since db writes do not have a timeout, this should eventually succeed when
+  // the transaction expires.
+  s = db->Put(write_options, "aaa", "xxx");
+  ASSERT_OK(s);
+
+  s = txn1->Commit();
+  ASSERT_NOK(s);  // expired!
+
+  s = db->Get(read_options, "aaa", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("xxx", value);
+
+  delete txn1;
+  txn_options.expiration = 6000000;  // 100 minutes
+  txn_options.lock_timeout = 1;      // 1ms
+  txn1 = db->BeginTransaction(write_options, txn_options);
+  txn1->SetLockTimeout(100);
+
+  TransactionOptions txn_options2;
+  txn_options2.expiration = 10;  // 10ms
+  Transaction* txn2 = db->BeginTransaction(write_options, txn_options2);
+  ASSERT_OK(s);
+
+  s = txn2->Put("a", "2");
+  ASSERT_OK(s);
+
+  // txn1 has a lock timeout longer than txn2's expiration, so it will win
+  s = txn1->Delete("a");
+  ASSERT_OK(s);
+
+  s = txn1->Commit();
+  ASSERT_OK(s);
+
+  // txn2 should now be expired, since txn1 waited until txn2's expiration
+  // had passed before taking the lock.
+  s = txn2->Commit();
+  ASSERT_TRUE(s.IsExpired());
+
+  delete txn1;
+  delete txn2;
+  txn_options.expiration = 6000000;  // 100 minutes
+  txn1 = db->BeginTransaction(write_options, txn_options);
+  txn_options2.expiration = 100000000;
+  txn2 = db->BeginTransaction(write_options, txn_options2);
+
+  s = txn1->Delete("asdf");
+  ASSERT_OK(s);
+
+  // txn2 has a smaller lock timeout than txn1's expiration, so it will time out
+  s = txn2->Delete("asdf");
+  ASSERT_TRUE(s.IsTimedOut());
+  ASSERT_EQ(s.ToString(), "Operation timed out: Timeout waiting to lock key");
+
+  s = txn1->Commit();
+  ASSERT_OK(s);
+
+  s = txn2->Put("asdf", "asdf");
+  ASSERT_OK(s);
+
+  s = txn2->Commit();
+  ASSERT_OK(s);
+
+  s = db->Get(read_options, "asdf", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("asdf", value);
+
+  delete txn1;
+  delete txn2;
+}
+
+TEST_F(TransactionTest, SingleDeleteTest) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  string value;
+  Status s;
+
+  Transaction* txn = db->BeginTransaction(write_options);
+  ASSERT_TRUE(txn);
+
+  s = txn->SingleDelete("A");
+  ASSERT_OK(s);
+
+  s = txn->Get(read_options, "A", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+  delete txn;
+
+  txn = db->BeginTransaction(write_options);
+
+  s = txn->SingleDelete("A");
+  ASSERT_OK(s);
+
+  s = txn->Put("A", "a");
+  ASSERT_OK(s);
+
+  s = txn->Get(read_options, "A", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("a", value);
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+  delete txn;
+
+  s = db->Get(read_options, "A", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("a", value);
+
+  txn = db->BeginTransaction(write_options);
+
+  s = txn->SingleDelete("A");
+  ASSERT_OK(s);
+
+  s = txn->Get(read_options, "A", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+  delete txn;
+
+  s = db->Get(read_options, "A", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  txn = db->BeginTransaction(write_options);
+  Transaction* txn2 = db->BeginTransaction(write_options);
+  txn2->SetSnapshot();
+
+  s = txn->Put("A", "a");
+  ASSERT_OK(s);
+
+  s = txn->Put("A", "a2");
+  ASSERT_OK(s);
+
+  s = txn->SingleDelete("A");
+  ASSERT_OK(s);
+
+  s = txn->SingleDelete("B");
+  ASSERT_OK(s);
+
+  // According to db.h, doing a SingleDelete on a key that has been
+  // overwritten will have undefined behavior.  So it is unclear what the
+  // result of fetching "A" should be. The current implementation will
+  // return NotFound in this case.
+  s = txn->Get(read_options, "A", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = txn2->Put("B", "b");
+  ASSERT_TRUE(s.IsTimedOut());
+  s = txn2->Commit();
+  ASSERT_OK(s);
+  delete txn2;
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+  delete txn;
+
+  // According to db.h, doing a SingleDelete on a key that has been
+  // overwritten will have undefined behavior.  So it is unclear what the
+  // result of fetching "A" should be. The current implementation will
+  // return NotFound in this case.
+  s = db->Get(read_options, "A", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  s = db->Get(read_options, "B", &value);
+  ASSERT_TRUE(s.IsNotFound());
+}
+
+TEST_F(TransactionTest, MergeTest) {
+  WriteOptions write_options;
+  ReadOptions read_options;
+  string value;
+  Status s;
+
+  Transaction* txn = db->BeginTransaction(write_options, TransactionOptions());
+  ASSERT_TRUE(txn);
+
+  s = db->Put(write_options, "A", "a0");
+  ASSERT_OK(s);
+
+  s = txn->Merge("A", "1");
+  ASSERT_OK(s);
+
+  s = txn->Merge("A", "2");
+  ASSERT_OK(s);
+
+  s = txn->Get(read_options, "A", &value);
+  ASSERT_TRUE(s.IsMergeInProgress());
+
+  s = txn->Put("A", "a");
+  ASSERT_OK(s);
+
+  s = txn->Get(read_options, "A", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("a", value);
+
+  s = txn->Merge("A", "3");
+  ASSERT_OK(s);
+
+  s = txn->Get(read_options, "A", &value);
+  ASSERT_TRUE(s.IsMergeInProgress());
+
+  TransactionOptions txn_options;
+  txn_options.lock_timeout = 1;  // 1 ms
+  Transaction* txn2 = db->BeginTransaction(write_options, txn_options);
+  ASSERT_TRUE(txn2);
+
+  // verify that txn has "A" locked
+  s = txn2->Merge("A", "4");
+  ASSERT_TRUE(s.IsTimedOut());
+
+  s = txn2->Commit();
+  ASSERT_OK(s);
+  delete txn2;
+
+  s = txn->Commit();
+  ASSERT_OK(s);
+  delete txn;
+
+  s = db->Get(read_options, "A", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("a,3", value);
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int argc, char** argv) {
+  fprintf(stderr,
+          "SKIPPED as Transactions are not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/transaction_util.cc b/src/rocksdb/utilities/transactions/transaction_util.cc
new file mode 100644
index 0000000..413cfbb
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/transaction_util.cc
@@ -0,0 +1,147 @@
+//  Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef ROCKSDB_LITE
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include "utilities/transactions/transaction_util.h"
+
+#include <inttypes.h>
+#include <string>
+#include <vector>
+
+#include "db/db_impl.h"
+#include "rocksdb/status.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "util/string_util.h"
+
+namespace rocksdb {
+
+Status TransactionUtil::CheckKeyForConflicts(DBImpl* db_impl,
+                                             ColumnFamilyHandle* column_family,
+                                             const std::string& key,
+                                             SequenceNumber key_seq) {
+  Status result;
+
+  auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
+  auto cfd = cfh->cfd();
+  SuperVersion* sv = db_impl->GetAndRefSuperVersion(cfd);
+
+  if (sv == nullptr) {
+    result = Status::InvalidArgument("Could not access column family " +
+                                     cfh->GetName());
+  }
+
+  if (result.ok()) {
+    SequenceNumber earliest_seq =
+        db_impl->GetEarliestMemTableSequenceNumber(sv, true);
+
+    result = CheckKey(db_impl, sv, earliest_seq, key_seq, key);
+
+    db_impl->ReturnAndCleanupSuperVersion(cfd, sv);
+  }
+
+  return result;
+}
+
+Status TransactionUtil::CheckKey(DBImpl* db_impl, SuperVersion* sv,
+                                 SequenceNumber earliest_seq,
+                                 SequenceNumber key_seq,
+                                 const std::string& key) {
+  Status result;
+
+  // Since it would be too slow to check the SST files, we will only use
+  // the memtables to check whether there have been any recent writes
+  // to this key after it was accessed in this transaction.  But if the
+  // Memtables do not contain a long enough history, we must fail the
+  // transaction.
+  if (earliest_seq == kMaxSequenceNumber) {
+    // The age of this memtable is unknown.  Cannot rely on it to check
+    // for recent writes.  This error shouldn't happen often in practice as
+    // the MemTable should have a valid earliest sequence number except in
+    // some corner cases (such as error cases during recovery).
+    result = Status::TryAgain(
+        "Transaction could not check for conflicts as the MemTable does not "
+        "contain a long enough history to check write at SequenceNumber: ",
+        ToString(key_seq));
+
+  } else if (key_seq < earliest_seq) {
+    // All data in this memtable is newer than key_seq, so its history is
+    // too short to check for conflicting writes on this key.
+    char msg[255];
+    snprintf(msg, sizeof(msg),
+             "Transaction could not check for conflicts for opearation at "
+             "SequenceNumber %" PRIu64
+             " as the MemTable only contains changes newer than SequenceNumber "
+             "%" PRIu64
+             ".  Increasing the value of the "
+             "max_write_buffer_number_to_maintain option could reduce the "
+             "frequency "
+             "of this error.",
+             key_seq, earliest_seq);
+    result = Status::TryAgain(msg);
+  } else {
+    SequenceNumber seq = kMaxSequenceNumber;
+    Status s = db_impl->GetLatestSequenceForKeyFromMemtable(sv, key, &seq);
+    if (!s.ok()) {
+      result = s;
+    } else if (seq != kMaxSequenceNumber && seq > key_seq) {
+      // Write Conflict
+      result = Status::Busy();
+    }
+  }
+
+  return result;
+}
+
+Status TransactionUtil::CheckKeysForConflicts(
+    DBImpl* db_impl, const TransactionKeyMap& key_map) {
+  Status result;
+
+  for (auto& key_map_iter : key_map) {
+    uint32_t cf_id = key_map_iter.first;
+    const auto& keys = key_map_iter.second;
+
+    SuperVersion* sv = db_impl->GetAndRefSuperVersion(cf_id);
+    if (sv == nullptr) {
+      result = Status::InvalidArgument("Could not access column family " +
+                                       ToString(cf_id));
+      break;
+    }
+
+    SequenceNumber earliest_seq =
+        db_impl->GetEarliestMemTableSequenceNumber(sv, true);
+
+    // For each key in this transaction, check whether someone has written to
+    // it since the sequence number at which the transaction accessed it.
+    for (const auto& key_iter : keys) {
+      const auto& key = key_iter.first;
+      const SequenceNumber key_seq = key_iter.second;
+
+      result = CheckKey(db_impl, sv, earliest_seq, key_seq, key);
+
+      if (!result.ok()) {
+        break;
+      }
+    }
+
+    db_impl->ReturnAndCleanupSuperVersion(cf_id, sv);
+
+    if (!result.ok()) {
+      break;
+    }
+  }
+
+  return result;
+}
+
+
+}  // namespace rocksdb
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/transactions/transaction_util.h b/src/rocksdb/utilities/transactions/transaction_util.h
new file mode 100644
index 0000000..c843b0e
--- /dev/null
+++ b/src/rocksdb/utilities/transactions/transaction_util.h
@@ -0,0 +1,60 @@
+// Copyright (c) 2015, Facebook, Inc.  All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <string>
+#include <unordered_map>
+
+#include "rocksdb/db.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+
+namespace rocksdb {
+
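+// Maps a column family ID to a map from key to the sequence number recorded
+// for that key; consumed by TransactionUtil::CheckKeysForConflicts() below
+// when looking for conflicting writes.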
+using TransactionKeyMap =
+    std::unordered_map<uint32_t,
+                       std::unordered_map<std::string, SequenceNumber>>;
+
+class DBImpl;
+struct SuperVersion;
+class WriteBatchWithIndex;
+
+class TransactionUtil {
+ public:
+  // Verifies there have been no writes to this key in the db since this
+  // sequence number.
+  //
+  // Returns OK on success, BUSY if there is a conflicting write, or other error
+  // status for any unexpected errors.
+  static Status CheckKeyForConflicts(DBImpl* db_impl,
+                                     ColumnFamilyHandle* column_family,
+                                     const std::string& key,
+                                     SequenceNumber key_seq);
+
+  // For each (key, SequenceNumber) pair in the TransactionKeyMap, this
+  // function
+  // will verify there have been no writes to the key in the db since that
+  // sequence number.
+  //
+  // Returns OK on success, BUSY if there is a conflicting write, or other error
+  // status for any unexpected errors.
+  //
+  // REQUIRED: this function should only be called on the write thread or if the
+  // mutex is held.
+  static Status CheckKeysForConflicts(DBImpl* db_impl,
+                                      const TransactionKeyMap& keys);
+
+ private:
+  static Status CheckKey(DBImpl* db_impl, SuperVersion* sv,
+                         SequenceNumber earliest_seq, SequenceNumber key_seq,
+                         const std::string& key);
+};
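+
+// A minimal usage sketch (hypothetical caller, for illustration only):
+//
+//   Status s = TransactionUtil::CheckKeyForConflicts(db_impl, cfh, key, seq);
+//   if (s.IsBusy()) {
+//     // another write touched the key since `seq`; fail or retry the txn
+//   } else if (s.IsTryAgain()) {
+//     // memtable history too short to decide; caller may retry later
+//   }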
+
+}  // namespace rocksdb
+
+#endif  // ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/ttl/db_ttl_impl.cc b/src/rocksdb/utilities/ttl/db_ttl_impl.cc
index f3d9417..b9edb3c 100644
--- a/src/rocksdb/utilities/ttl/db_ttl_impl.cc
+++ b/src/rocksdb/utilities/ttl/db_ttl_impl.cc
@@ -5,13 +5,13 @@
 
 #include "utilities/ttl/db_ttl_impl.h"
 
-#include "rocksdb/utilities/convenience.h"
-#include "rocksdb/utilities/db_ttl.h"
 #include "db/filename.h"
 #include "db/write_batch_internal.h"
-#include "util/coding.h"
+#include "rocksdb/convenience.h"
 #include "rocksdb/env.h"
 #include "rocksdb/iterator.h"
+#include "rocksdb/utilities/db_ttl.h"
+#include "util/coding.h"
 
 namespace rocksdb {
 
diff --git a/src/rocksdb/utilities/ttl/db_ttl_impl.h b/src/rocksdb/utilities/ttl/db_ttl_impl.h
index 9abf6fc..a96123d 100644
--- a/src/rocksdb/utilities/ttl/db_ttl_impl.h
+++ b/src/rocksdb/utilities/ttl/db_ttl_impl.h
@@ -17,6 +17,12 @@
 #include "rocksdb/utilities/db_ttl.h"
 #include "db/db_impl.h"
 
+#ifdef _WIN32
+// Windows API macro interference
+#undef GetCurrentTime
+#endif
+
+
 namespace rocksdb {
 
 class DBWithTTLImpl : public DBWithTTL {
@@ -188,9 +194,15 @@ class TtlCompactionFilterFactory : public CompactionFilterFactory {
 
   virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
       const CompactionFilter::Context& context) override {
+    std::unique_ptr<const CompactionFilter> user_comp_filter_from_factory =
+        nullptr;
+    if (user_comp_filter_factory_) {
+      user_comp_filter_from_factory =
+          user_comp_filter_factory_->CreateCompactionFilter(context);
+    }
+
     return std::unique_ptr<TtlCompactionFilter>(new TtlCompactionFilter(
-        ttl_, env_, nullptr,
-        std::move(user_comp_filter_factory_->CreateCompactionFilter(context))));
+        ttl_, env_, nullptr, std::move(user_comp_filter_from_factory)));
   }
 
   virtual const char* Name() const override {
diff --git a/src/rocksdb/utilities/ttl/ttl_test.cc b/src/rocksdb/utilities/ttl/ttl_test.cc
index c970047..81fad49 100644
--- a/src/rocksdb/utilities/ttl/ttl_test.cc
+++ b/src/rocksdb/utilities/ttl/ttl_test.cc
@@ -2,13 +2,17 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
+#ifndef ROCKSDB_LITE
+
 #include <memory>
 #include "rocksdb/compaction_filter.h"
 #include "rocksdb/utilities/db_ttl.h"
 #include "util/testharness.h"
 #include "util/logging.h"
 #include <map>
+#ifndef OS_WIN
 #include <unistd.h>
+#endif
 
 namespace rocksdb {
 
@@ -16,10 +20,7 @@ namespace {
 
 typedef std::map<std::string, std::string> KVMap;
 
-enum BatchOperation {
-  PUT = 0,
-  DELETE = 1
-};
+enum BatchOperation { OP_PUT = 0, OP_DELETE = 1 };
 }
 
 class SpecialTimeEnv : public EnvWrapper {
@@ -48,7 +49,6 @@ class TtlTest : public testing::Test {
     // ensure that compaction is kicked in to always strip timestamp from kvs
     options_.max_grandparent_overlap_factor = 0;
     // compaction should take place always from level0 for determinism
-    options_.max_mem_compaction_level = 0;
     db_ttl_ = nullptr;
     DestroyDB(dbname_, Options());
   }
@@ -124,10 +124,10 @@ class TtlTest : public testing::Test {
     kv_it_ = kvmap_.begin();
     for (int64_t i = 0; i < num_ops && kv_it_ != kvmap_.end(); i++, ++kv_it_) {
       switch (batch_ops[i]) {
-        case PUT:
+        case OP_PUT:
           batch.Put(kv_it_->first, kv_it_->second);
           break;
-        case DELETE:
+        case OP_DELETE:
           batch.Delete(kv_it_->first);
           break;
         default:
@@ -168,9 +168,9 @@ class TtlTest : public testing::Test {
   // Runs a manual compaction
   void ManualCompact(ColumnFamilyHandle* cf = nullptr) {
     if (cf == nullptr) {
-      db_ttl_->CompactRange(nullptr, nullptr);
+      db_ttl_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
     } else {
-      db_ttl_->CompactRange(cf, nullptr, nullptr);
+      db_ttl_->CompactRange(CompactRangeOptions(), cf, nullptr, nullptr);
     }
   }
 
@@ -305,7 +305,13 @@ class TtlTest : public testing::Test {
       size_t pos = key_string.find_first_of(search_str);
       int num_key_end;
       if (pos != std::string::npos) {
-        num_key_end = stoi(key_string.substr(pos, key.size() - pos));
+        auto key_substr = key_string.substr(pos, key.size() - pos);
+#ifndef CYGWIN
+        num_key_end = std::stoi(key_substr);
+#else
+        num_key_end = std::strtol(key_substr.c_str(), 0, 10);
+#endif
+
       } else {
         return false; // Keep keys not matching the format "key<NUMBER>"
       }
@@ -355,7 +361,7 @@ class TtlTest : public testing::Test {
 
 
   // Choose carefully so that Put, Gets & Compaction complete in 1 second buffer
-  const int64_t kSampleSize_ = 100;
+  static const int64_t kSampleSize_ = 100;
   std::string dbname_;
   DBWithTTL* db_ttl_;
   unique_ptr<SpecialTimeEnv> env_;
@@ -506,13 +512,13 @@ TEST_F(TtlTest, WriteBatchTest) {
   MakeKVMap(kSampleSize_);
   BatchOperation batch_ops[kSampleSize_];
   for (int i = 0; i < kSampleSize_; i++) {
-    batch_ops[i] = PUT;
+    batch_ops[i] = OP_PUT;
   }
 
   OpenTtl(2);
   MakePutWriteBatch(batch_ops, kSampleSize_);
   for (int i = 0; i < kSampleSize_ / 2; i++) {
-    batch_ops[i] = DELETE;
+    batch_ops[i] = OP_DELETE;
   }
   MakePutWriteBatch(batch_ops, kSampleSize_ / 2);
   SleepCompactCheck(0, 0, kSampleSize_ / 2, false);
@@ -627,3 +633,13 @@ int main(int argc, char** argv) {
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
+
+#else
+#include <stdio.h>
+
+int main(int argc, char** argv) {
+  fprintf(stderr, "SKIPPED as DBWithTTL is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index.cc b/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index.cc
index 0c3e02f..ba90ec1 100644
--- a/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index.cc
+++ b/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index.cc
@@ -3,16 +3,19 @@
 //  LICENSE file in the root directory of this source tree. An additional grant
 //  of patent rights can be found in the PATENTS file in the same directory.
 
+#ifndef ROCKSDB_LITE
+
 #include "rocksdb/utilities/write_batch_with_index.h"
 
+#include <limits>
 #include <memory>
 
-#include "rocksdb/comparator.h"
-#include "rocksdb/iterator.h"
 #include "db/column_family.h"
 #include "db/merge_context.h"
 #include "db/merge_helper.h"
 #include "db/skiplist.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/iterator.h"
 #include "util/arena.h"
 #include "utilities/write_batch_with_index/write_batch_with_index_internal.h"
 
@@ -89,7 +92,8 @@ class BaseDeltaIterator : public Iterator {
         AdvanceBase();
       }
       if (DeltaValid() && BaseValid()) {
-        if (Compare() == 0) {
+        if (comparator_->Equal(delta_iterator_->Entry().key,
+                               base_iterator_->key())) {
           equal_keys_ = true;
         }
       }
@@ -123,7 +127,8 @@ class BaseDeltaIterator : public Iterator {
         AdvanceBase();
       }
       if (DeltaValid() && BaseValid()) {
-        if (Compare() == 0) {
+        if (comparator_->Equal(delta_iterator_->Entry().key,
+                               base_iterator_->key())) {
           equal_keys_ = true;
         }
       }
@@ -153,23 +158,6 @@ class BaseDeltaIterator : public Iterator {
   }
 
  private:
-  // -1 -- delta less advanced than base
-  // 0 -- delta == base
-  // 1 -- delta more advanced than base
-  int Compare() const {
-    assert(delta_iterator_->Valid() && base_iterator_->Valid());
-    int cmp = comparator_->Compare(delta_iterator_->Entry().key,
-                                   base_iterator_->key());
-    if (forward_) {
-      return cmp;
-    } else {
-      return -cmp;
-    }
-  }
-  bool IsDeltaDelete() {
-    assert(DeltaValid());
-    return delta_iterator_->Entry().type == kDeleteRecord;
-  }
   void AssertInvariants() {
 #ifndef NDEBUG
     if (!Valid()) {
@@ -239,6 +227,10 @@ class BaseDeltaIterator : public Iterator {
   bool DeltaValid() const { return delta_iterator_->Valid(); }
   void UpdateCurrent() {
     while (true) {
+      WriteEntry delta_entry;
+      if (DeltaValid()) {
+        delta_entry = delta_iterator_->Entry();
+      }
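+      // Cache the decoded delta entry once per pass; the new WBWIIterator
+      // Entry() re-decodes the record from the write batch on each call.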
       equal_keys_ = false;
       if (!BaseValid()) {
         // Base has finished.
@@ -246,7 +238,8 @@ class BaseDeltaIterator : public Iterator {
           // Finished
           return;
         }
-        if (IsDeltaDelete()) {
+        if (delta_entry.type == kDeleteRecord ||
+            delta_entry.type == kSingleDeleteRecord) {
           AdvanceDelta();
         } else {
           current_at_base_ = false;
@@ -257,12 +250,15 @@ class BaseDeltaIterator : public Iterator {
         current_at_base_ = true;
         return;
       } else {
-        int compare = Compare();
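+        // Inline of the old Compare(): negate when iterating backwards so
+        // that compare <= 0 always means delta is at or before base in the
+        // current direction.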
+        int compare =
+            (forward_ ? 1 : -1) *
+            comparator_->Compare(delta_entry.key, base_iterator_->key());
         if (compare <= 0) {  // delta bigger or equal
           if (compare == 0) {
             equal_keys_ = true;
           }
-          if (!IsDeltaDelete()) {
+          if (delta_entry.type != kDeleteRecord &&
+              delta_entry.type != kSingleDeleteRecord) {
             current_at_base_ = false;
             return;
           }
@@ -300,23 +296,26 @@ class WBWIIteratorImpl : public WBWIIterator {
                    const ReadableWriteBatch* write_batch)
       : column_family_id_(column_family_id),
         skip_list_iter_(skip_list),
-        write_batch_(write_batch),
-        valid_(false) {}
+        write_batch_(write_batch) {}
 
   virtual ~WBWIIteratorImpl() {}
 
-  virtual bool Valid() const override { return valid_; }
+  virtual bool Valid() const override {
+    if (!skip_list_iter_.Valid()) {
+      return false;
+    }
+    const WriteBatchIndexEntry* iter_entry = skip_list_iter_.key();
+    return (iter_entry != nullptr &&
+            iter_entry->column_family == column_family_id_);
+  }
 
   virtual void SeekToFirst() override {
-    valid_ = true;
     WriteBatchIndexEntry search_entry(WriteBatchIndexEntry::kFlagMin,
                                       column_family_id_);
     skip_list_iter_.Seek(&search_entry);
-    ReadEntry();
   }
 
   virtual void SeekToLast() override {
-    valid_ = true;
     WriteBatchIndexEntry search_entry(WriteBatchIndexEntry::kFlagMin,
                                       column_family_id_ + 1);
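+    // (Seeking to kFlagMin of column_family_id_ + 1 lands just past the last
+    // entry of this column family; the code below then steps back to it.)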
     skip_list_iter_.Seek(&search_entry);
@@ -325,29 +324,37 @@ class WBWIIteratorImpl : public WBWIIterator {
     } else {
       skip_list_iter_.Prev();
     }
-    ReadEntry();
   }
 
   virtual void Seek(const Slice& key) override {
-    valid_ = true;
     WriteBatchIndexEntry search_entry(&key, column_family_id_);
     skip_list_iter_.Seek(&search_entry);
-    ReadEntry();
   }
 
-  virtual void Next() override {
-    skip_list_iter_.Next();
-    ReadEntry();
-  }
+  virtual void Next() override { skip_list_iter_.Next(); }
 
-  virtual void Prev() override {
-    skip_list_iter_.Prev();
-    ReadEntry();
-  }
+  virtual void Prev() override { skip_list_iter_.Prev(); }
 
-  virtual const WriteEntry& Entry() const override { return current_; }
+  virtual WriteEntry Entry() const override {
+    WriteEntry ret;
+    Slice blob;
+    const WriteBatchIndexEntry* iter_entry = skip_list_iter_.key();
+    // this is guaranteed by Valid()
+    assert(iter_entry != nullptr &&
+           iter_entry->column_family == column_family_id_);
+    auto s = write_batch_->GetEntryFromDataOffset(iter_entry->offset, &ret.type,
+                                                  &ret.key, &ret.value, &blob);
+    assert(s.ok());
+    assert(ret.type == kPutRecord || ret.type == kDeleteRecord ||
+           ret.type == kSingleDeleteRecord || ret.type == kMergeRecord);
+    return ret;
+  }
 
-  virtual Status status() const override { return status_; }
+  virtual Status status() const override {
+    // This is an in-memory data structure, so the only way status can be
+    // non-ok is through memory corruption.
+    return Status::OK();
+  }
 
   const WriteBatchIndexEntry* GetRawEntry() const {
     return skip_list_iter_.key();
@@ -357,33 +364,6 @@ class WBWIIteratorImpl : public WBWIIterator {
   uint32_t column_family_id_;
   WriteBatchEntrySkipList::Iterator skip_list_iter_;
   const ReadableWriteBatch* write_batch_;
-  Status status_;
-  bool valid_;
-  WriteEntry current_;
-
-  void ReadEntry() {
-    if (!status_.ok() || !skip_list_iter_.Valid()) {
-      valid_ = false;
-      return;
-    }
-    const WriteBatchIndexEntry* iter_entry = skip_list_iter_.key();
-    if (iter_entry == nullptr ||
-        iter_entry->column_family != column_family_id_) {
-      valid_ = false;
-      return;
-    }
-    Slice blob;
-    status_ = write_batch_->GetEntryFromDataOffset(
-        iter_entry->offset, &current_.type, &current_.key, &current_.value,
-        &blob);
-    if (!status_.ok()) {
-      valid_ = false;
-    } else if (current_.type != kPutRecord && current_.type != kDeleteRecord &&
-               current_.type != kMergeRecord) {
-      valid_ = false;
-      status_ = Status::Corruption("write batch index is corrupted");
-    }
-  }
 };
 
 struct WriteBatchWithIndex::Rep {
@@ -422,6 +402,11 @@ struct WriteBatchWithIndex::Rep {
 
   // Clear all updates buffered in this batch.
   void Clear();
+  void ClearIndex();
+
+  // Rebuild index by reading all records from the batch.
+  // Returns non-ok status on corruption.
+  Status ReBuildIndex();
 };
 
 bool WriteBatchWithIndex::Rep::UpdateExistingEntry(
@@ -477,13 +462,75 @@ void WriteBatchWithIndex::Rep::AddNewEntry(uint32_t column_family_id) {
 
   void WriteBatchWithIndex::Rep::Clear() {
     write_batch.Clear();
+    ClearIndex();
+  }
+
+  void WriteBatchWithIndex::Rep::ClearIndex() {
+    skip_list.~WriteBatchEntrySkipList();
     arena.~Arena();
     new (&arena) Arena();
-    skip_list.~WriteBatchEntrySkipList();
     new (&skip_list) WriteBatchEntrySkipList(comparator, &arena);
     last_entry_offset = 0;
   }
 
+  Status WriteBatchWithIndex::Rep::ReBuildIndex() {
+    Status s;
+
+    ClearIndex();
+
+    if (write_batch.Count() == 0) {
+      // Nothing to re-index
+      return s;
+    }
+
+    size_t offset = WriteBatchInternal::GetFirstOffset(&write_batch);
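+    // GetFirstOffset() points at the first record, just past the write batch
+    // header, so re-indexing starts there rather than at byte zero.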
+
+    Slice input(write_batch.Data());
+    input.remove_prefix(offset);
+
+    // Loop through all entries in Rep and add each one to the index
+    int found = 0;
+    while (s.ok() && !input.empty()) {
+      Slice key, value, blob;
+      uint32_t column_family_id = 0;  // default
+      char tag = 0;
+
+      // set offset of current entry for call to AddNewEntry()
+      last_entry_offset = input.data() - write_batch.Data().data();
+
+      s = ReadRecordFromWriteBatch(&input, &tag, &column_family_id, &key,
+                                   &value, &blob);
+      if (!s.ok()) {
+        break;
+      }
+
+      switch (tag) {
+        case kTypeColumnFamilyValue:
+        case kTypeValue:
+        case kTypeColumnFamilyDeletion:
+        case kTypeDeletion:
+        case kTypeColumnFamilySingleDeletion:
+        case kTypeSingleDeletion:
+        case kTypeColumnFamilyMerge:
+        case kTypeMerge:
+          found++;
+          if (!UpdateExistingEntryWithCfId(column_family_id, key)) {
+            AddNewEntry(column_family_id);
+          }
+          break;
+        case kTypeLogData:
+          break;
+        default:
+          return Status::Corruption("unknown WriteBatch tag");
+      }
+    }
+
+    if (s.ok() && found != write_batch.Count()) {
+      s = Status::Corruption("WriteBatch has wrong count");
+    }
+
+    return s;
+  }
 
 WriteBatchWithIndex::WriteBatchWithIndex(
     const Comparator* default_index_comparator, size_t reserved_bytes,
@@ -537,36 +584,49 @@ void WriteBatchWithIndex::Put(const Slice& key, const Slice& value) {
   rep->AddOrUpdateIndex(key);
 }
 
-void WriteBatchWithIndex::Merge(ColumnFamilyHandle* column_family,
-                                const Slice& key, const Slice& value) {
+void WriteBatchWithIndex::Delete(ColumnFamilyHandle* column_family,
+                                 const Slice& key) {
   rep->SetLastEntryOffset();
-  rep->write_batch.Merge(column_family, key, value);
+  rep->write_batch.Delete(column_family, key);
   rep->AddOrUpdateIndex(column_family, key);
 }
 
-void WriteBatchWithIndex::Merge(const Slice& key, const Slice& value) {
+void WriteBatchWithIndex::Delete(const Slice& key) {
   rep->SetLastEntryOffset();
-  rep->write_batch.Merge(key, value);
+  rep->write_batch.Delete(key);
   rep->AddOrUpdateIndex(key);
 }
 
-void WriteBatchWithIndex::PutLogData(const Slice& blob) {
-  rep->write_batch.PutLogData(blob);
+void WriteBatchWithIndex::SingleDelete(ColumnFamilyHandle* column_family,
+                                       const Slice& key) {
+  rep->SetLastEntryOffset();
+  rep->write_batch.SingleDelete(column_family, key);
+  rep->AddOrUpdateIndex(column_family, key);
 }
 
-void WriteBatchWithIndex::Delete(ColumnFamilyHandle* column_family,
-                                 const Slice& key) {
+void WriteBatchWithIndex::SingleDelete(const Slice& key) {
   rep->SetLastEntryOffset();
-  rep->write_batch.Delete(column_family, key);
+  rep->write_batch.SingleDelete(key);
+  rep->AddOrUpdateIndex(key);
+}
+
+void WriteBatchWithIndex::Merge(ColumnFamilyHandle* column_family,
+                                const Slice& key, const Slice& value) {
+  rep->SetLastEntryOffset();
+  rep->write_batch.Merge(column_family, key, value);
   rep->AddOrUpdateIndex(column_family, key);
 }
 
-void WriteBatchWithIndex::Delete(const Slice& key) {
+void WriteBatchWithIndex::Merge(const Slice& key, const Slice& value) {
   rep->SetLastEntryOffset();
-  rep->write_batch.Delete(key);
+  rep->write_batch.Merge(key, value);
   rep->AddOrUpdateIndex(key);
 }
 
+void WriteBatchWithIndex::PutLogData(const Slice& blob) {
+  rep->write_batch.PutLogData(blob);
+}
+
 void WriteBatchWithIndex::Clear() { rep->Clear(); }
 
 Status WriteBatchWithIndex::GetFromBatch(ColumnFamilyHandle* column_family,
@@ -576,19 +636,22 @@ Status WriteBatchWithIndex::GetFromBatch(ColumnFamilyHandle* column_family,
   MergeContext merge_context;
 
   WriteBatchWithIndexInternal::Result result =
-      WriteBatchWithIndexInternal::GetFromBatch(options, this, column_family,
-                                                key, &merge_context,
-                                                &rep->comparator, value, &s);
+      WriteBatchWithIndexInternal::GetFromBatch(
+          options, this, column_family, key, &merge_context, &rep->comparator,
+          value, rep->overwrite_key, &s);
 
   switch (result) {
     case WriteBatchWithIndexInternal::Result::kFound:
     case WriteBatchWithIndexInternal::Result::kError:
-      return s;
+      // use returned status
+      break;
     case WriteBatchWithIndexInternal::Result::kDeleted:
     case WriteBatchWithIndexInternal::Result::kNotFound:
-      return Status::NotFound();
+      s = Status::NotFound();
+      break;
     case WriteBatchWithIndexInternal::Result::kMergeInProgress:
-      return Status::MergeInProgress("");
+      s = Status::MergeInProgress();
+      break;
     default:
       assert(false);
   }
@@ -617,7 +680,7 @@ Status WriteBatchWithIndex::GetFromBatchAndDB(DB* db,
   WriteBatchWithIndexInternal::Result result =
       WriteBatchWithIndexInternal::GetFromBatch(
           options, this, column_family, key, &merge_context, &rep->comparator,
-          &batch_value, &s);
+          &batch_value, rep->overwrite_key, &s);
 
   if (result == WriteBatchWithIndexInternal::Result::kFound) {
     value->assign(batch_value.data(), batch_value.size());
@@ -629,6 +692,14 @@ Status WriteBatchWithIndex::GetFromBatchAndDB(DB* db,
   if (result == WriteBatchWithIndexInternal::Result::kError) {
     return s;
   }
+  if (result == WriteBatchWithIndexInternal::Result::kMergeInProgress &&
+      rep->overwrite_key == true) {
+    // Since we've overwritten keys, we do not know what other operations are
+    // in this batch for this key, so we cannot do a Merge to compute the
+    // result.  Instead, we will simply return MergeInProgress.
+    return Status::MergeInProgress();
+  }
+
   assert(result == WriteBatchWithIndexInternal::Result::kMergeInProgress ||
          result == WriteBatchWithIndexInternal::Result::kNotFound);
 
@@ -662,4 +733,17 @@ Status WriteBatchWithIndex::GetFromBatchAndDB(DB* db,
   return s;
 }
 
+void WriteBatchWithIndex::SetSavePoint() { rep->write_batch.SetSavePoint(); }
+
+Status WriteBatchWithIndex::RollbackToSavePoint() {
+  Status s = rep->write_batch.RollbackToSavePoint();
+
+  if (s.ok()) {
+    s = rep->ReBuildIndex();
+  }
+
+  return s;
+}
+
 }  // namespace rocksdb
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.cc b/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.cc
index b9cf644..ba88e67 100644
--- a/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.cc
+++ b/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.cc
@@ -3,6 +3,10 @@
 //  LICENSE file in the root directory of this source tree. An additional grant
 //  of patent rights can be found in the PATENTS file in the same directory.
 
+#ifndef ROCKSDB_LITE
+
+#include "utilities/write_batch_with_index/write_batch_with_index_internal.h"
+
 #include "db/column_family.h"
 #include "db/merge_context.h"
 #include "db/merge_helper.h"
@@ -10,7 +14,7 @@
 #include "rocksdb/db.h"
 #include "rocksdb/utilities/write_batch_with_index.h"
 #include "util/coding.h"
-#include "utilities/write_batch_with_index/write_batch_with_index_internal.h"
+#include "util/string_util.h"
 
 namespace rocksdb {
 
@@ -27,7 +31,12 @@ Status ReadableWriteBatch::GetEntryFromDataOffset(size_t data_offset,
     return Status::InvalidArgument("Output parameters cannot be null");
   }
 
-  if (data_offset >= GetDataSize()) {
+  if (data_offset == GetDataSize()) {
+    // reached end of batch.
+    return Status::NotFound();
+  }
+
+  if (data_offset > GetDataSize()) {
     return Status::InvalidArgument("data offset exceed write batch size");
   }
   Slice input = Slice(rep_.data() + data_offset, rep_.size() - data_offset);
@@ -45,6 +54,10 @@ Status ReadableWriteBatch::GetEntryFromDataOffset(size_t data_offset,
     case kTypeDeletion:
       *type = kDeleteRecord;
       break;
+    case kTypeColumnFamilySingleDeletion:
+    case kTypeSingleDeletion:
+      *type = kSingleDeleteRecord;
+      break;
     case kTypeColumnFamilyMerge:
     case kTypeMerge:
       *type = kMergeRecord;
@@ -124,7 +137,7 @@ WriteBatchWithIndexInternal::Result WriteBatchWithIndexInternal::GetFromBatch(
     const DBOptions& options, WriteBatchWithIndex* batch,
     ColumnFamilyHandle* column_family, const Slice& key,
     MergeContext* merge_context, WriteBatchEntryComparator* cmp,
-    std::string* value, Status* s) {
+    std::string* value, bool overwrite_key, Status* s) {
   uint32_t cf_id = GetColumnFamilyID(column_family);
   *s = Status::OK();
   WriteBatchWithIndexInternal::Result result =
@@ -176,7 +189,8 @@ WriteBatchWithIndexInternal::Result WriteBatchWithIndexInternal::GetFromBatch(
         merge_context->PushOperand(entry.value);
         break;
       }
-      case kDeleteRecord: {
+      case kDeleteRecord:
+      case kSingleDeleteRecord: {
         result = WriteBatchWithIndexInternal::Result::kDeleted;
         break;
       }
@@ -187,7 +201,7 @@ WriteBatchWithIndexInternal::Result WriteBatchWithIndexInternal::GetFromBatch(
       default: {
         result = WriteBatchWithIndexInternal::Result::kError;
         (*s) = Status::Corruption("Unexpected entry in WriteBatchWithIndex:",
-                                  std::to_string(entry.type));
+                                  ToString(entry.type));
         break;
       }
     }
@@ -197,6 +211,13 @@ WriteBatchWithIndexInternal::Result WriteBatchWithIndexInternal::GetFromBatch(
       // We can stop iterating once we find a PUT or DELETE
       break;
     }
+    if (result == WriteBatchWithIndexInternal::Result::kMergeInProgress &&
+        overwrite_key == true) {
+      // Since we've overwritten keys, we do not know what other operations are
+      // in this batch for this key, so we cannot do a Merge to compute the
+      // result.  Instead, we will simply return MergeInProgress.
+      break;
+    }
 
     iter->Prev();
   }
@@ -240,3 +261,5 @@ WriteBatchWithIndexInternal::Result WriteBatchWithIndexInternal::GetFromBatch(
 }
 
 }  // namespace rocksdb
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h b/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h
index a98ddd6..b88cd76 100644
--- a/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h
+++ b/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.h
@@ -2,9 +2,10 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree. An additional grant
 // of patent rights can be found in the PATENTS file in the same directory.
-
 #pragma once
 
+#ifndef ROCKSDB_LITE
+
 #include <limits>
 #include <string>
 #include <unordered_map>
@@ -14,6 +15,7 @@
 #include "rocksdb/slice.h"
 #include "rocksdb/status.h"
 #include "rocksdb/utilities/write_batch_with_index.h"
+#include "port/port.h"
 
 namespace rocksdb {
 
@@ -29,7 +31,7 @@ struct WriteBatchIndexEntry {
 
   // If this flag appears in the offset, it indicates a key that is smaller
   // than any other entry for the same column family
-  static const size_t kFlagMin = std::numeric_limits<size_t>::max();
+  static const size_t kFlagMin = port::kMaxSizet;
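+  // (port::kMaxSizet is used instead of std::numeric_limits<size_t>::max(),
+  // presumably to sidestep the Windows max() macro; see the GetCurrentTime
+  // workaround in db_ttl_impl.h for a similar case.)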
 
   size_t offset;           // offset of an entry in write batch's string buffer.
   uint32_t column_family;  // column family of the entry
@@ -90,7 +92,8 @@ class WriteBatchWithIndexInternal {
       const DBOptions& options, WriteBatchWithIndex* batch,
       ColumnFamilyHandle* column_family, const Slice& key,
       MergeContext* merge_context, WriteBatchEntryComparator* cmp,
-      std::string* value, Status* s);
+      std::string* value, bool overwrite_key, Status* s);
 };
 
 }  // namespace rocksdb
+#endif  // !ROCKSDB_LITE
diff --git a/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc b/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc
index 5e9ff77..da695c4 100644
--- a/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc
+++ b/src/rocksdb/utilities/write_batch_with_index/write_batch_with_index_test.cc
@@ -7,11 +7,14 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
+#ifndef ROCKSDB_LITE
 
 #include <memory>
 #include <map>
 #include "db/column_family.h"
+#include "port/stack_trace.h"
 #include "rocksdb/utilities/write_batch_with_index.h"
+#include "util/string_util.h"
 #include "util/testharness.h"
 #include "utilities/merge_operators.h"
 #include "utilities/merge_operators/string_append/stringappend.h"
@@ -102,7 +105,7 @@ void TestValueAsSecondaryIndexHelper(std::vector<Entry> entries,
       std::unique_ptr<WBWIIterator> iter(batch->NewIterator(&data));
       iter->Seek(e.key);
       ASSERT_OK(iter->status());
-      auto& write_entry = iter->Entry();
+      auto write_entry = iter->Entry();
       ASSERT_EQ(e.key, write_entry.key.ToString());
       ASSERT_EQ(e.value, write_entry.value.ToString());
       batch->Delete(&data, e.key);
@@ -123,7 +126,7 @@ void TestValueAsSecondaryIndexHelper(std::vector<Entry> entries,
         for (auto v : pair.second) {
           ASSERT_OK(iter->status());
           ASSERT_TRUE(iter->Valid());
-          auto& write_entry = iter->Entry();
+          auto write_entry = iter->Entry();
           ASSERT_EQ(pair.first, write_entry.key.ToString());
           ASSERT_EQ(v->type, write_entry.type);
           if (write_entry.type != kDeleteRecord) {
@@ -139,7 +142,7 @@ void TestValueAsSecondaryIndexHelper(std::vector<Entry> entries,
       for (auto v = pair->second.rbegin(); v != pair->second.rend(); v++) {
         ASSERT_OK(iter->status());
         ASSERT_TRUE(iter->Valid());
-        auto& write_entry = iter->Entry();
+        auto write_entry = iter->Entry();
         ASSERT_EQ(pair->first, write_entry.key.ToString());
         ASSERT_EQ((*v)->type, write_entry.type);
         if (write_entry.type != kDeleteRecord) {
@@ -164,7 +167,7 @@ void TestValueAsSecondaryIndexHelper(std::vector<Entry> entries,
         for (auto v : pair.second) {
           ASSERT_OK(iter->status());
           ASSERT_TRUE(iter->Valid());
-          auto& write_entry = iter->Entry();
+          auto write_entry = iter->Entry();
           ASSERT_EQ(pair.first, write_entry.key.ToString());
           if (v->type != kDeleteRecord) {
             ASSERT_EQ(v->key, write_entry.value.ToString());
@@ -181,7 +184,7 @@ void TestValueAsSecondaryIndexHelper(std::vector<Entry> entries,
       for (auto v = pair->second.rbegin(); v != pair->second.rend(); v++) {
         ASSERT_OK(iter->status());
         ASSERT_TRUE(iter->Valid());
-        auto& write_entry = iter->Entry();
+        auto write_entry = iter->Entry();
         ASSERT_EQ(pair->first, write_entry.key.ToString());
         if ((*v)->type != kDeleteRecord) {
           ASSERT_EQ((*v)->key, write_entry.value.ToString());
@@ -203,7 +206,7 @@ void TestValueAsSecondaryIndexHelper(std::vector<Entry> entries,
       ASSERT_OK(iter->status());
       for (auto v : pair->second) {
         ASSERT_TRUE(iter->Valid());
-        auto& write_entry = iter->Entry();
+        auto write_entry = iter->Entry();
         ASSERT_EQ(pair->first, write_entry.key.ToString());
         ASSERT_EQ(v->type, write_entry.type);
         if (write_entry.type != kDeleteRecord) {
@@ -225,7 +228,7 @@ void TestValueAsSecondaryIndexHelper(std::vector<Entry> entries,
       ASSERT_OK(iter->status());
       for (auto v : pair->second) {
         ASSERT_TRUE(iter->Valid());
-        auto& write_entry = iter->Entry();
+        auto write_entry = iter->Entry();
         ASSERT_EQ(pair->first, write_entry.key.ToString());
         ASSERT_EQ(v->value, write_entry.key.ToString());
         if (v->type != kDeleteRecord) {
@@ -968,7 +971,7 @@ TEST_F(WriteBatchWithIndexTest, TestGetFromBatchMerge) {
 
   DestroyDB(dbname, options);
   Status s = DB::Open(options, dbname, &db);
-  assert(s.ok());
+  ASSERT_OK(s);
 
   ColumnFamilyHandle* column_family = db->DefaultColumnFamily();
   WriteBatchWithIndex batch;
@@ -981,11 +984,11 @@ TEST_F(WriteBatchWithIndexTest, TestGetFromBatchMerge) {
   std::string expected = "X";
 
   for (int i = 0; i < 5; i++) {
-    batch.Merge("x", std::to_string(i));
-    expected = expected + "," + std::to_string(i);
+    batch.Merge("x", ToString(i));
+    expected = expected + "," + ToString(i);
 
     if (i % 2 == 0) {
-      batch.Put("y", std::to_string(i / 2));
+      batch.Put("y", ToString(i / 2));
     }
 
     batch.Merge("z", "z");
@@ -996,7 +999,7 @@ TEST_F(WriteBatchWithIndexTest, TestGetFromBatchMerge) {
 
     s = batch.GetFromBatch(column_family, options, "y", &value);
     ASSERT_OK(s);
-    ASSERT_EQ(std::to_string(i / 2), value);
+    ASSERT_EQ(ToString(i / 2), value);
 
     s = batch.GetFromBatch(column_family, options, "z", &value);
     ASSERT_TRUE(s.IsMergeInProgress());
@@ -1006,6 +1009,66 @@ TEST_F(WriteBatchWithIndexTest, TestGetFromBatchMerge) {
   DestroyDB(dbname, options);
 }
 
+TEST_F(WriteBatchWithIndexTest, TestGetFromBatchMerge2) {
+  DB* db;
+  Options options;
+  options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+  options.create_if_missing = true;
+
+  std::string dbname = test::TmpDir() + "/write_batch_with_index_test";
+
+  DestroyDB(dbname, options);
+  Status s = DB::Open(options, dbname, &db);
+  ASSERT_OK(s);
+
+  ColumnFamilyHandle* column_family = db->DefaultColumnFamily();
+
+  // Test batch with overwrite_key=true
+  WriteBatchWithIndex batch(BytewiseComparator(), 0, true);
+  std::string value;
+
+  s = batch.GetFromBatch(column_family, options, "X", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  batch.Put(column_family, "X", "x");
+  s = batch.GetFromBatch(column_family, options, "X", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("x", value);
+
+  batch.Put(column_family, "X", "x2");
+  s = batch.GetFromBatch(column_family, options, "X", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("x2", value);
+
+  batch.Merge(column_family, "X", "aaa");
+  s = batch.GetFromBatch(column_family, options, "X", &value);
+  ASSERT_TRUE(s.IsMergeInProgress());
+
+  batch.Merge(column_family, "X", "bbb");
+  s = batch.GetFromBatch(column_family, options, "X", &value);
+  ASSERT_TRUE(s.IsMergeInProgress());
+
+  batch.Put(column_family, "X", "x3");
+  s = batch.GetFromBatch(column_family, options, "X", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("x3", value);
+
+  batch.Merge(column_family, "X", "ccc");
+  s = batch.GetFromBatch(column_family, options, "X", &value);
+  ASSERT_TRUE(s.IsMergeInProgress());
+
+  batch.Delete(column_family, "X");
+  s = batch.GetFromBatch(column_family, options, "X", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  batch.Merge(column_family, "X", "ddd");
+  s = batch.GetFromBatch(column_family, options, "X", &value);
+  ASSERT_TRUE(s.IsMergeInProgress());
+
+  delete db;
+  DestroyDB(dbname, options);
+}
+
 TEST_F(WriteBatchWithIndexTest, TestGetFromBatchAndDB) {
   DB* db;
   Options options;
@@ -1014,7 +1077,7 @@ TEST_F(WriteBatchWithIndexTest, TestGetFromBatchAndDB) {
 
   DestroyDB(dbname, options);
   Status s = DB::Open(options, dbname, &db);
-  assert(s.ok());
+  ASSERT_OK(s);
 
   WriteBatchWithIndex batch;
   ReadOptions read_options;
@@ -1182,9 +1245,556 @@ TEST_F(WriteBatchWithIndexTest, TestGetFromBatchAndDBMerge) {
   DestroyDB(dbname, options);
 }
 
+TEST_F(WriteBatchWithIndexTest, TestGetFromBatchAndDBMerge2) {
+  DB* db;
+  Options options;
+
+  options.create_if_missing = true;
+  std::string dbname = test::TmpDir() + "/write_batch_with_index_test";
+
+  options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+
+  DestroyDB(dbname, options);
+  Status s = DB::Open(options, dbname, &db);
+  ASSERT_OK(s);
+
+  // Test batch with overwrite_key=true
+  WriteBatchWithIndex batch(BytewiseComparator(), 0, true);
+
+  ReadOptions read_options;
+  WriteOptions write_options;
+  std::string value;
+
+  s = batch.GetFromBatchAndDB(db, read_options, "A", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  batch.Merge("A", "xxx");
+
+  s = batch.GetFromBatchAndDB(db, read_options, "A", &value);
+  ASSERT_TRUE(s.IsMergeInProgress());
+
+  batch.Merge("A", "yyy");
+
+  s = batch.GetFromBatchAndDB(db, read_options, "A", &value);
+  ASSERT_TRUE(s.IsMergeInProgress());
+
+  s = db->Put(write_options, "A", "a0");
+  ASSERT_OK(s);
+
+  s = batch.GetFromBatchAndDB(db, read_options, "A", &value);
+  ASSERT_TRUE(s.IsMergeInProgress());
+
+  batch.Delete("A");
+
+  s = batch.GetFromBatchAndDB(db, read_options, "A", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  delete db;
+  DestroyDB(dbname, options);
+}
+
+void AssertKey(std::string key, WBWIIterator* iter) {
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(key, iter->Entry().key.ToString());
+}
+
+void AssertValue(std::string value, WBWIIterator* iter) {
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(value, iter->Entry().value.ToString());
+}
+
+// Tests that we can write to the WBWI while we iterate (from a single
+// thread); iteration should see the newest writes.
+TEST_F(WriteBatchWithIndexTest, MutateWhileIteratingCorrectnessTest) {
+  WriteBatchWithIndex batch(BytewiseComparator(), 0, true);
+  for (char c = 'a'; c <= 'z'; ++c) {
+    batch.Put(std::string(1, c), std::string(1, c));
+  }
+
+  std::unique_ptr<WBWIIterator> iter(batch.NewIterator());
+  iter->Seek("k");
+  AssertKey("k", iter.get());
+  iter->Next();
+  AssertKey("l", iter.get());
+  batch.Put("ab", "cc");
+  iter->Next();
+  AssertKey("m", iter.get());
+  batch.Put("mm", "kk");
+  iter->Next();
+  AssertKey("mm", iter.get());
+  AssertValue("kk", iter.get());
+  batch.Delete("mm");
+
+  iter->Next();
+  AssertKey("n", iter.get());
+  iter->Prev();
+  AssertKey("mm", iter.get());
+  ASSERT_EQ(kDeleteRecord, iter->Entry().type);
+
+  iter->Seek("ab");
+  AssertKey("ab", iter.get());
+  batch.Delete("x");
+  iter->Seek("x");
+  AssertKey("x", iter.get());
+  ASSERT_EQ(kDeleteRecord, iter->Entry().type);
+  iter->Prev();
+  AssertKey("w", iter.get());
+}
+
+void AssertIterKey(std::string key, Iterator* iter) {
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(key, iter->key().ToString());
+}
+
+void AssertIterValue(std::string value, Iterator* iter) {
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(value, iter->value().ToString());
+}
+
+// Same as above, but testing NewIteratorWithBase.
+TEST_F(WriteBatchWithIndexTest, MutateWhileIteratingBaseCorrectnessTest) {
+  WriteBatchWithIndex batch(BytewiseComparator(), 0, true);
+  for (char c = 'a'; c <= 'z'; ++c) {
+    batch.Put(std::string(1, c), std::string(1, c));
+  }
+
+  KVMap map;
+  map["aa"] = "aa";
+  map["cc"] = "cc";
+  map["ee"] = "ee";
+  map["em"] = "me";
+
+  std::unique_ptr<Iterator> iter(
+      batch.NewIteratorWithBase(new KVIter(&map)));
+  iter->Seek("k");
+  AssertIterKey("k", iter.get());
+  iter->Next();
+  AssertIterKey("l", iter.get());
+  batch.Put("ab", "cc");
+  iter->Next();
+  AssertIterKey("m", iter.get());
+  batch.Put("mm", "kk");
+  iter->Next();
+  AssertIterKey("mm", iter.get());
+  AssertIterValue("kk", iter.get());
+  batch.Delete("mm");
+  iter->Next();
+  AssertIterKey("n", iter.get());
+  iter->Prev();
+  // "mm" is deleted, so we're back at "m"
+  AssertIterKey("m", iter.get());
+
+  iter->Seek("ab");
+  AssertIterKey("ab", iter.get());
+  iter->Prev();
+  AssertIterKey("aa", iter.get());
+  iter->Prev();
+  AssertIterKey("a", iter.get());
+  batch.Delete("aa");
+  iter->Next();
+  AssertIterKey("ab", iter.get());
+  iter->Prev();
+  AssertIterKey("a", iter.get());
+
+  batch.Delete("x");
+  iter->Seek("x");
+  AssertIterKey("y", iter.get());
+  iter->Next();
+  AssertIterKey("z", iter.get());
+  iter->Prev();
+  iter->Prev();
+  AssertIterKey("w", iter.get());
+
+  batch.Delete("e");
+  iter->Seek("e");
+  AssertIterKey("ee", iter.get());
+  AssertIterValue("ee", iter.get());
+  batch.Put("ee", "xx");
+  // still the same value
+  AssertIterValue("ee", iter.get());
+  iter->Next();
+  AssertIterKey("em", iter.get());
+  iter->Prev();
+  // new value
+  AssertIterValue("xx", iter.get());
+}
+
+// Stress-tests mutations while iterating with NewIteratorWithBase.
+TEST_F(WriteBatchWithIndexTest, MutateWhileIteratingBaseStressTest) {
+  WriteBatchWithIndex batch(BytewiseComparator(), 0, true);
+  for (char c = 'a'; c <= 'z'; ++c) {
+    batch.Put(std::string(1, c), std::string(1, c));
+  }
+
+  KVMap map;
+  for (char c = 'a'; c <= 'z'; ++c) {
+    map[std::string(2, c)] = std::string(2, c);
+  }
+
+  std::unique_ptr<Iterator> iter(
+      batch.NewIteratorWithBase(new KVIter(&map)));
+
+  Random rnd(301);
+  for (int i = 0; i < 1000000; ++i) {
+    int random = rnd.Uniform(8);
+    char c = static_cast<char>(rnd.Uniform(26) + 'a');
+    switch (random) {
+      case 0:
+        batch.Put(std::string(1, c), "xxx");
+        break;
+      case 1:
+        batch.Put(std::string(2, c), "xxx");
+        break;
+      case 2:
+        batch.Delete(std::string(1, c));
+        break;
+      case 3:
+        batch.Delete(std::string(2, c));
+        break;
+      case 4:
+        iter->Seek(std::string(1, c));
+        break;
+      case 5:
+        iter->Seek(std::string(2, c));
+        break;
+      case 6:
+        if (iter->Valid()) {
+          iter->Next();
+        }
+        break;
+      case 7:
+        if (iter->Valid()) {
+          iter->Prev();
+        }
+        break;
+      default:
+        assert(false);
+    }
+  }
+}
+
+static std::string PrintContents(WriteBatchWithIndex* batch,
+                                 ColumnFamilyHandle* column_family) {
+  std::string result;
+
+  WBWIIterator* iter;
+  if (column_family == nullptr) {
+    iter = batch->NewIterator();
+  } else {
+    iter = batch->NewIterator(column_family);
+  }
+
+  iter->SeekToFirst();
+  while (iter->Valid()) {
+    WriteEntry e = iter->Entry();
+
+    if (e.type == kPutRecord) {
+      result.append("PUT(");
+      result.append(e.key.ToString());
+      result.append("):");
+      result.append(e.value.ToString());
+    } else if (e.type == kMergeRecord) {
+      result.append("MERGE(");
+      result.append(e.key.ToString());
+      result.append("):");
+      result.append(e.value.ToString());
+    } else if (e.type == kSingleDeleteRecord) {
+      result.append("SINGLE-DEL(");
+      result.append(e.key.ToString());
+      result.append(")");
+    } else {
+      assert(e.type == kDeleteRecord);
+      result.append("DEL(");
+      result.append(e.key.ToString());
+      result.append(")");
+    }
+
+    result.append(",");
+    iter->Next();
+  }
+
+  delete iter;
+  return result;
+}
+
+static std::string PrintContents(WriteBatchWithIndex* batch, KVMap* base_map,
+                                 ColumnFamilyHandle* column_family) {
+  std::string result;
+
+  Iterator* iter;
+  if (column_family == nullptr) {
+    iter = batch->NewIteratorWithBase(new KVIter(base_map));
+  } else {
+    iter = batch->NewIteratorWithBase(column_family, new KVIter(base_map));
+  }
+
+  iter->SeekToFirst();
+  while (iter->Valid()) {
+    assert(iter->status().ok());
+
+    Slice key = iter->key();
+    Slice value = iter->value();
+
+    result.append(key.ToString());
+    result.append(":");
+    result.append(value.ToString());
+    result.append(",");
+
+    iter->Next();
+  }
+
+  delete iter;
+  return result;
+}
+
+TEST_F(WriteBatchWithIndexTest, SavePointTest) {
+  WriteBatchWithIndex batch;
+  ColumnFamilyHandleImplDummy cf1(1, BytewiseComparator());
+  Status s;
+
+  batch.Put("A", "a");
+  batch.Put("B", "b");
+  batch.Put("A", "aa");
+  batch.Put(&cf1, "A", "a1");
+  batch.Delete(&cf1, "B");
+  batch.Put(&cf1, "C", "c1");
+  batch.Put(&cf1, "E", "e1");
+
+  batch.SetSavePoint();  // 1
+
+  batch.Put("C", "cc");
+  batch.Put("B", "bb");
+  batch.Delete("A");
+  batch.Put(&cf1, "B", "b1");
+  batch.Delete(&cf1, "A");
+  batch.SingleDelete(&cf1, "E");
+  batch.SetSavePoint();  // 2
+
+  batch.Put("A", "aaa");
+  batch.Put("A", "xxx");
+  batch.Delete("B");
+  batch.Put(&cf1, "B", "b2");
+  batch.Delete(&cf1, "C");
+  batch.SetSavePoint();  // 3
+  batch.SetSavePoint();  // 4
+  batch.SingleDelete("D");
+  batch.Delete(&cf1, "D");
+  batch.Delete(&cf1, "E");
+
+  ASSERT_EQ(
+      "PUT(A):a,PUT(A):aa,DEL(A),PUT(A):aaa,PUT(A):xxx,PUT(B):b,PUT(B):bb,DEL("
+      "B)"
+      ",PUT(C):cc,SINGLE-DEL(D),",
+      PrintContents(&batch, nullptr));
+
+  ASSERT_EQ(
+      "PUT(A):a1,DEL(A),DEL(B),PUT(B):b1,PUT(B):b2,PUT(C):c1,DEL(C),"
+      "DEL(D),PUT(E):e1,SINGLE-DEL(E),DEL(E),",
+      PrintContents(&batch, &cf1));
+
+  ASSERT_OK(batch.RollbackToSavePoint());  // rollback to 4
+  ASSERT_EQ(
+      "PUT(A):a,PUT(A):aa,DEL(A),PUT(A):aaa,PUT(A):xxx,PUT(B):b,PUT(B):bb,DEL("
+      "B)"
+      ",PUT(C):cc,",
+      PrintContents(&batch, nullptr));
+
+  ASSERT_EQ(
+      "PUT(A):a1,DEL(A),DEL(B),PUT(B):b1,PUT(B):b2,PUT(C):c1,DEL(C),"
+      "PUT(E):e1,SINGLE-DEL(E),",
+      PrintContents(&batch, &cf1));
+
+  ASSERT_OK(batch.RollbackToSavePoint());  // rollback to 3
+  ASSERT_EQ(
+      "PUT(A):a,PUT(A):aa,DEL(A),PUT(A):aaa,PUT(A):xxx,PUT(B):b,PUT(B):bb,DEL("
+      "B)"
+      ",PUT(C):cc,",
+      PrintContents(&batch, nullptr));
+
+  ASSERT_EQ(
+      "PUT(A):a1,DEL(A),DEL(B),PUT(B):b1,PUT(B):b2,PUT(C):c1,DEL(C),"
+      "PUT(E):e1,SINGLE-DEL(E),",
+      PrintContents(&batch, &cf1));
+
+  ASSERT_OK(batch.RollbackToSavePoint());  // rollback to 2
+  ASSERT_EQ("PUT(A):a,PUT(A):aa,DEL(A),PUT(B):b,PUT(B):bb,PUT(C):cc,",
+            PrintContents(&batch, nullptr));
+
+  ASSERT_EQ(
+      "PUT(A):a1,DEL(A),DEL(B),PUT(B):b1,PUT(C):c1,"
+      "PUT(E):e1,SINGLE-DEL(E),",
+      PrintContents(&batch, &cf1));
+
+  batch.SetSavePoint();  // 5
+  batch.Put("X", "x");
+
+  ASSERT_EQ("PUT(A):a,PUT(A):aa,DEL(A),PUT(B):b,PUT(B):bb,PUT(C):cc,PUT(X):x,",
+            PrintContents(&batch, nullptr));
+
+  ASSERT_OK(batch.RollbackToSavePoint());  // rollback to 5
+  ASSERT_EQ("PUT(A):a,PUT(A):aa,DEL(A),PUT(B):b,PUT(B):bb,PUT(C):cc,",
+            PrintContents(&batch, nullptr));
+
+  ASSERT_EQ(
+      "PUT(A):a1,DEL(A),DEL(B),PUT(B):b1,PUT(C):c1,"
+      "PUT(E):e1,SINGLE-DEL(E),",
+      PrintContents(&batch, &cf1));
+
+  ASSERT_OK(batch.RollbackToSavePoint());  // rollback to 1
+  ASSERT_EQ("PUT(A):a,PUT(A):aa,PUT(B):b,", PrintContents(&batch, nullptr));
+
+  ASSERT_EQ("PUT(A):a1,DEL(B),PUT(C):c1,PUT(E):e1,",
+            PrintContents(&batch, &cf1));
+
+  s = batch.RollbackToSavePoint();  // no savepoint found
+  ASSERT_TRUE(s.IsNotFound());
+  ASSERT_EQ("PUT(A):a,PUT(A):aa,PUT(B):b,", PrintContents(&batch, nullptr));
+
+  ASSERT_EQ("PUT(A):a1,DEL(B),PUT(C):c1,PUT(E):e1,",
+            PrintContents(&batch, &cf1));
+
+  batch.SetSavePoint();  // 6
+
+  batch.Clear();
+  ASSERT_EQ("", PrintContents(&batch, nullptr));
+  ASSERT_EQ("", PrintContents(&batch, &cf1));
+
+  s = batch.RollbackToSavePoint();  // rollback to 6
+  ASSERT_TRUE(s.IsNotFound());
+}
+
+TEST_F(WriteBatchWithIndexTest, SingleDeleteTest) {
+  WriteBatchWithIndex batch;
+  Status s;
+  std::string value;
+  DBOptions db_options;
+
+  batch.SingleDelete("A");
+
+  s = batch.GetFromBatch(db_options, "A", &value);
+  ASSERT_TRUE(s.IsNotFound());
+  s = batch.GetFromBatch(db_options, "B", &value);
+  ASSERT_TRUE(s.IsNotFound());
+  value = PrintContents(&batch, nullptr);
+  ASSERT_EQ("SINGLE-DEL(A),", value);
+
+  batch.Clear();
+  batch.Put("A", "a");
+  batch.Put("A", "a2");
+  batch.Put("B", "b");
+  batch.SingleDelete("A");
+
+  s = batch.GetFromBatch(db_options, "A", &value);
+  ASSERT_TRUE(s.IsNotFound());
+  s = batch.GetFromBatch(db_options, "B", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("b", value);
+
+  value = PrintContents(&batch, nullptr);
+  ASSERT_EQ("PUT(A):a,PUT(A):a2,SINGLE-DEL(A),PUT(B):b,", value);
+
+  batch.Put("C", "c");
+  batch.Put("A", "a3");
+  batch.Delete("B");
+  batch.SingleDelete("B");
+  batch.SingleDelete("C");
+
+  s = batch.GetFromBatch(db_options, "A", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("a3", value);
+  s = batch.GetFromBatch(db_options, "B", &value);
+  ASSERT_TRUE(s.IsNotFound());
+  s = batch.GetFromBatch(db_options, "C", &value);
+  ASSERT_TRUE(s.IsNotFound());
+  s = batch.GetFromBatch(db_options, "D", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  value = PrintContents(&batch, nullptr);
+  ASSERT_EQ(
+      "PUT(A):a,PUT(A):a2,SINGLE-DEL(A),PUT(A):a3,PUT(B):b,DEL(B),SINGLE-DEL(B)"
+      ",PUT(C):c,SINGLE-DEL(C),",
+      value);
+
+  batch.Put("B", "b4");
+  batch.Put("C", "c4");
+  batch.Put("D", "d4");
+  batch.SingleDelete("D");
+  batch.SingleDelete("D");
+  batch.Delete("A");
+
+  s = batch.GetFromBatch(db_options, "A", &value);
+  ASSERT_TRUE(s.IsNotFound());
+  s = batch.GetFromBatch(db_options, "B", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("b4", value);
+  s = batch.GetFromBatch(db_options, "C", &value);
+  ASSERT_OK(s);
+  ASSERT_EQ("c4", value);
+  s = batch.GetFromBatch(db_options, "D", &value);
+  ASSERT_TRUE(s.IsNotFound());
+
+  value = PrintContents(&batch, nullptr);
+  ASSERT_EQ(
+      "PUT(A):a,PUT(A):a2,SINGLE-DEL(A),PUT(A):a3,DEL(A),PUT(B):b,DEL(B),"
+      "SINGLE-DEL(B),PUT(B):b4,PUT(C):c,SINGLE-DEL(C),PUT(C):c4,PUT(D):d4,"
+      "SINGLE-DEL(D),SINGLE-DEL(D),",
+      value);
+}
+
+TEST_F(WriteBatchWithIndexTest, SingleDeleteDeltaIterTest) {
+  Status s;
+  std::string value;
+  DBOptions db_options;
+  WriteBatchWithIndex batch(BytewiseComparator(), 20, true /* overwrite_key */);
+  batch.Put("A", "a");
+  batch.Put("A", "a2");
+  batch.Put("B", "b");
+  batch.SingleDelete("A");
+  batch.Delete("B");
+
+  KVMap map;
+  value = PrintContents(&batch, &map, nullptr);
+  ASSERT_EQ("", value);
+
+  map["A"] = "aa";
+  map["C"] = "cc";
+  map["D"] = "dd";
+
+  batch.SingleDelete("B");
+  batch.SingleDelete("C");
+  batch.SingleDelete("Z");
+
+  value = PrintContents(&batch, &map, nullptr);
+  ASSERT_EQ("D:dd,", value);
+
+  batch.Put("A", "a3");
+  batch.Put("B", "b3");
+  batch.SingleDelete("A");
+  batch.SingleDelete("A");
+  batch.SingleDelete("D");
+  batch.SingleDelete("D");
+  batch.Delete("D");
+
+  map["E"] = "ee";
+
+  value = PrintContents(&batch, &map, nullptr);
+  ASSERT_EQ("B:b3,E:ee,", value);
+}
+
 }  // namespace
 
 int main(int argc, char** argv) {
+  rocksdb::port::InstallStackTraceHandler();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
+
+#else
+#include <stdio.h>
+
+int main() {
+  fprintf(stderr, "SKIPPED\n");
+  return 0;
+}
+
+#endif  // !ROCKSDB_LITE
diff --git a/src/test/Makefile-client.am b/src/test/Makefile-client.am
index e1cb98b..8a9ae6e 100644
--- a/src/test/Makefile-client.am
+++ b/src/test/Makefile-client.am
@@ -184,6 +184,13 @@ ceph_test_cls_numops_LDADD = \
 ceph_test_cls_numops_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 bin_DEBUGPROGRAMS += ceph_test_cls_numops
 
+ceph_test_cls_journal_SOURCES = test/cls_journal/test_cls_journal.cc
+ceph_test_cls_journal_LDADD = \
+        libcls_journal_client.la $(LIBRADOS) \
+        $(LIBCOMMON) $(CRYPTO_LIBS) $(UNITTEST_LDADD) $(RADOS_TEST_LDADD) -luuid
+ceph_test_cls_journal_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+bin_DEBUGPROGRAMS += ceph_test_cls_journal
+
 ceph_test_rados_api_cmd_SOURCES = test/librados/cmd.cc
 ceph_test_rados_api_cmd_LDADD = \
 	$(LIBCOMMON) $(LIBRADOS) $(CRYPTO_LIBS) \
@@ -283,6 +290,8 @@ librados_test_stub_la_SOURCES = \
 	test/librados_test_stub/TestWatchNotify.cc
 noinst_HEADERS += \
 	test/librados_test_stub/LibradosTestStub.h \
+	test/librados_test_stub/MockTestMemIoCtxImpl.h \
+	test/librados_test_stub/MockTestMemRadosClient.h \
 	test/librados_test_stub/TestClassHandler.h \
 	test/librados_test_stub/TestRadosClient.h \
 	test/librados_test_stub/TestMemRadosClient.h \
@@ -291,6 +300,24 @@ noinst_HEADERS += \
 	test/librados_test_stub/TestIoCtxImpl.h
 noinst_LTLIBRARIES += librados_test_stub.la
 
+unittest_journal_SOURCES = \
+	test/journal/test_main.cc \
+        test/journal/test_Entry.cc \
+	test/journal/test_FutureImpl.cc \
+	test/journal/test_Journaler.cc \
+	test/journal/test_JournalMetadata.cc \
+	test/journal/test_JournalPlayer.cc \
+	test/journal/test_JournalRecorder.cc \
+	test/journal/test_JournalTrimmer.cc \
+	test/journal/test_ObjectPlayer.cc \
+	test/journal/test_ObjectRecorder.cc \
+	test/journal/RadosTestFixture.cc
+unittest_journal_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+unittest_journal_LDADD = \
+	libjournal.la libcls_journal_client.la \
+	librados_test_stub.la librados_internal.la \
+	$(UNITTEST_LDADD) $(CEPH_GLOBAL) $(RADOS_TEST_LDADD)
+check_TESTPROGRAMS += unittest_journal
 
 if WITH_RBD
 ceph_smalliobenchrbd_SOURCES = \
@@ -317,6 +344,8 @@ librbd_test_la_SOURCES = \
 	test/librbd/test_librbd.cc \
 	test/librbd/test_ImageWatcher.cc \
 	test/librbd/test_internal.cc \
+	test/librbd/test_JournalEntries.cc \
+	test/librbd/test_JournalReplay.cc \
 	test/librbd/test_ObjectMap.cc
 librbd_test_la_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 noinst_LTLIBRARIES += librbd_test.la
@@ -328,6 +357,7 @@ unittest_librbd_CXXFLAGS = $(UNITTEST_CXXFLAGS) -DTEST_LIBRBD_INTERNALS
 unittest_librbd_LDADD = \
 	librbd_test.la librbd_api.la librbd_internal.la $(LIBRBD_TYPES) \
 	libcls_rbd_client.la libcls_lock_client.la \
+	libjournal.la libcls_journal_client.la \
 	librados_test_stub.la librados_internal.la \
 	$(LIBOSDC) $(UNITTEST_LDADD) \
 	$(CEPH_GLOBAL) $(RADOS_TEST_LDADD)
@@ -340,6 +370,7 @@ ceph_test_librbd_CXXFLAGS = $(UNITTEST_CXXFLAGS) -DTEST_LIBRBD_INTERNALS
 ceph_test_librbd_LDADD = \
 	librbd_test.la librbd_api.la librbd_internal.la $(LIBRBD_TYPES) \
 	libcls_rbd_client.la libcls_lock_client.la \
+	libjournal.la libcls_journal_client.la \
 	librados_api.la $(LIBRADOS_DEPS) $(UNITTEST_LDADD) \
 	$(CEPH_GLOBAL) $(RADOS_TEST_LDADD)
 bin_DEBUGPROGRAMS += ceph_test_librbd
@@ -444,12 +475,13 @@ ceph_test_libcephfs_SOURCES = \
 	test/libcephfs/test.cc \
 	test/libcephfs/readdir_r_cb.cc \
 	test/libcephfs/caps.cc \
-	test/libcephfs/multiclient.cc
+	test/libcephfs/multiclient.cc \
+	test/libcephfs/access.cc
 if LINUX
 ceph_test_libcephfs_SOURCES += test/libcephfs/flock.cc
 endif # LINUX
 
-ceph_test_libcephfs_LDADD = $(LIBCEPHFS) $(UNITTEST_LDADD)
+ceph_test_libcephfs_LDADD = $(LIBRADOS) $(LIBCEPHFS) $(LIBCOMMON) $(UNITTEST_LDADD)
 ceph_test_libcephfs_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 bin_DEBUGPROGRAMS += ceph_test_libcephfs
 
diff --git a/src/test/Makefile-server.am b/src/test/Makefile-server.am
index 8bf8cfc..4e4d70f 100644
--- a/src/test/Makefile-server.am
+++ b/src/test/Makefile-server.am
@@ -152,7 +152,7 @@ ceph_test_keys_LDADD = $(LIBMON) $(CEPH_GLOBAL)
 bin_DEBUGPROGRAMS += ceph_test_keys
 
 get_command_descriptions_SOURCES = test/common/get_command_descriptions.cc
-get_command_descriptions_LDADD = $(LIBMON) $(LIBCOMMON) $(CEPH_GLOBAL)
+get_command_descriptions_LDADD = $(LIBMON) $(LIBMON_TYPES) $(LIBOS) $(LIBCOMMON) $(CEPH_GLOBAL)
 noinst_PROGRAMS += get_command_descriptions
 
 unittest_mon_moncap_SOURCES = test/mon/moncap.cc
@@ -214,8 +214,8 @@ endif # WITH_OSD
 
 if WITH_SLIBROCKSDB
 unittest_rocksdb_option_static_SOURCES = test/objectstore/TestRocksdbOptionParse.cc
-unittest_rocksdb_option_static_LDADD = $(LIBOS) $(UNITTEST_LDADD) $(CEPH_GLOBAL) rocksdb/librocksdb.la
-unittest_rocksdb_option_static_CXXFLAGS = $(UNITTEST_CXXFLAGS) ${AM_CXXFLAGS} ${LIBROCKSDB_CFLAGS} -std=gnu++11 -I rocksdb/include
+unittest_rocksdb_option_static_LDADD = $(LIBOS) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+unittest_rocksdb_option_static_CXXFLAGS = $(UNITTEST_CXXFLAGS) ${AM_CXXFLAGS} ${LIBROCKSDB_CFLAGS} -I rocksdb/include
 check_TESTPROGRAMS += unittest_rocksdb_option_static
 endif
 
diff --git a/src/test/Makefile.am b/src/test/Makefile.am
index 89fc7df..3d63535 100644
--- a/src/test/Makefile.am
+++ b/src/test/Makefile.am
@@ -79,8 +79,10 @@ check_SCRIPTS += \
 	test/mon/mkfs.sh \
 	test/mon/mon-scrub.sh \
 	test/osd/osd-scrub-repair.sh \
+	test/osd/osd-scrub-snaps.sh \
 	test/osd/osd-config.sh \
 	test/osd/osd-bench.sh \
+	test/osd/osd-reactivate.sh \
 	test/osd/osd-copy-from.sh \
 	test/mon/mon-handle-forward.sh \
 	test/libradosstriper/rados-striper.sh \
@@ -134,6 +136,11 @@ UNITTEST_LDADD = \
 	$(top_builddir)/src/gmock/gtest/lib/libgtest.la \
 	$(PTHREAD_LIBS)
 
+if SOLARIS
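+# Solaris keeps the BSD socket API in libsocket and libnsl, so the test binaries must link them explicitly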
+UNITTEST_LDADD += \
+	-lsocket -lnsl
+endif
+
 unittest_addrs_SOURCES = test/test_addrs.cc
 unittest_addrs_CXXFLAGS = $(UNITTEST_CXXFLAGS)
 unittest_addrs_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
@@ -466,4 +473,5 @@ noinst_HEADERS += \
 	test/system/st_rados_watch.h \
 	test/system/systest_runnable.h \
 	test/system/systest_settings.h \
-	test/unit.h
+	test/unit.h \
+	test/journal/RadosTestFixture.h
diff --git a/src/test/ObjectMap/KeyValueDBMemory.cc b/src/test/ObjectMap/KeyValueDBMemory.cc
index 8883b95..fc59b7d 100644
--- a/src/test/ObjectMap/KeyValueDBMemory.cc
+++ b/src/test/ObjectMap/KeyValueDBMemory.cc
@@ -139,6 +139,10 @@ public:
     else
       return make_pair("", "");
   }
+  
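+  // true when the current entry's raw-key prefix equals the given prefix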
+  bool raw_key_is_prefixed(const string &prefix) {
+    return prefix == (*it).first.first;
+  }
 
   bufferlist value() {
     if (valid())
diff --git a/src/test/ObjectMap/KeyValueDBMemory.h b/src/test/ObjectMap/KeyValueDBMemory.h
index 77342a0..94e224b 100644
--- a/src/test/ObjectMap/KeyValueDBMemory.h
+++ b/src/test/ObjectMap/KeyValueDBMemory.h
@@ -5,7 +5,7 @@
 #include <string>
 #include "include/memory.h"
 
-#include "os/KeyValueDB.h"
+#include "kv/KeyValueDB.h"
 #include "include/buffer.h"
 #include "include/Context.h"
 
diff --git a/src/test/ObjectMap/test_keyvaluedb_atomicity.cc b/src/test/ObjectMap/test_keyvaluedb_atomicity.cc
index 04cef95..6e7fc8d 100644
--- a/src/test/ObjectMap/test_keyvaluedb_atomicity.cc
+++ b/src/test/ObjectMap/test_keyvaluedb_atomicity.cc
@@ -1,15 +1,19 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 #include <pthread.h>
 #include "include/buffer.h"
-#include "os/LevelDBStore.h"
+#include "kv/KeyValueDB.h"
 #include <sys/types.h>
 #include <dirent.h>
 #include <string>
 #include <vector>
 #include "include/memory.h"
 #include <boost/scoped_ptr.hpp>
+#include <iostream>
 #include <sstream>
 #include "stdlib.h"
+#include "global/global_context.h"
+
+using namespace std;
 
 const string CONTROL_PREFIX = "CONTROL";
 const string PRIMARY_PREFIX = "PREFIX";
@@ -84,7 +88,7 @@ int main() {
   }
   string strpath(path);
   std::cerr << "Using path: " << strpath << std::endl;
-  LevelDBStore *store = new LevelDBStore(NULL, strpath);
+  KeyValueDB *store = KeyValueDB::create(g_ceph_context, "leveldb", strpath);
   assert(!store->create_and_open(std::cerr));
   db.reset(store);
 
diff --git a/src/test/ObjectMap/test_keyvaluedb_iterators.cc b/src/test/ObjectMap/test_keyvaluedb_iterators.cc
index cbe2ab1..e3ab7da 100644
--- a/src/test/ObjectMap/test_keyvaluedb_iterators.cc
+++ b/src/test/ObjectMap/test_keyvaluedb_iterators.cc
@@ -17,8 +17,7 @@
 #include <boost/scoped_ptr.hpp>
 
 #include "test/ObjectMap/KeyValueDBMemory.h"
-#include "os/KeyValueDB.h"
-#include "os/LevelDBStore.h"
+#include "kv/KeyValueDB.h"
 #include <sys/types.h>
 #include "global/global_init.h"
 #include "common/ceph_argparse.h"
@@ -37,7 +36,7 @@ public:
   virtual void SetUp() {
     assert(!store_path.empty());
 
-    LevelDBStore *db_ptr = new LevelDBStore(g_ceph_context, store_path);
+    KeyValueDB *db_ptr = KeyValueDB::create(g_ceph_context, "leveldb", store_path);
     assert(!db_ptr->create_and_open(std::cerr));
     db.reset(db_ptr);
     mock.reset(new KeyValueDBMemory());
@@ -101,6 +100,21 @@ public:
 	      << __func__
 	      << " iterator not valid";
     }
+    
+    if (!it->raw_key_is_prefixed(expected_prefix)) {
+      return ::testing::AssertionFailure()
+	      << __func__
+	      << " expected raw_key_is_prefixed() == TRUE"
+	      << " got FALSE";
+    }
+    
+    if (it->raw_key_is_prefixed("??__SomeUnexpectedValue__??")) {
+      return ::testing::AssertionFailure()
+	      << __func__
+	      << " expected raw_key_is_prefixed() == FALSE"
+	      << " got TRUE";
+    }
+ 
     pair<string,string> key = it->raw_key();
 
     if (expected_prefix != key.first) {
diff --git a/src/test/ObjectMap/test_object_map.cc b/src/test/ObjectMap/test_object_map.cc
index 6ab3f82..6af60cf 100644
--- a/src/test/ObjectMap/test_object_map.cc
+++ b/src/test/ObjectMap/test_object_map.cc
@@ -6,10 +6,9 @@
 
 #include "include/buffer.h"
 #include "test/ObjectMap/KeyValueDBMemory.h"
-#include "os/KeyValueDB.h"
+#include "kv/KeyValueDB.h"
 #include "os/DBObjectMap.h"
 #include "os/HashIndex.h"
-#include "os/LevelDBStore.h"
 #include <sys/types.h>
 #include "global/global_init.h"
 #include "common/ceph_argparse.h"
@@ -534,7 +533,7 @@ public:
     string strpath(path);
 
     cerr << "using path " << strpath << std::endl;
-    LevelDBStore *store = new LevelDBStore(g_ceph_context, strpath);
+    KeyValueDB *store = KeyValueDB::create(g_ceph_context, "leveldb", strpath);
     assert(!store->create_and_open(cerr));
 
     db.reset(new DBObjectMap(store));
diff --git a/src/test/centos-6/Dockerfile.in b/src/test/centos-6/Dockerfile.in
index a2eece8..6caf951 100644
--- a/src/test/centos-6/Dockerfile.in
+++ b/src/test/centos-6/Dockerfile.in
@@ -27,4 +27,4 @@ RUN yum install -y which ; cd /root ; ./install-deps.sh
 # development tools
 # nc is required to run make check on firefly only (giant+ do not use nc)
 RUN yum install -y ccache valgrind gdb git python-virtualenv gdisk kpartx hdparm jq sudo xmlstarlet parted nc
-RUN useradd -M --uid %%user_id%% %%USER%% && echo '%%USER%% ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers
+RUN if test %%USER%% != root ; then useradd -M --uid %%user_id%% %%USER%% && echo '%%USER%% ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers ; fi
diff --git a/src/test/centos-6/ceph.spec.in b/src/test/centos-6/ceph.spec.in
index 8f2a6fc..2939fef 100644
--- a/src/test/centos-6/ceph.spec.in
+++ b/src/test/centos-6/ceph.spec.in
@@ -590,6 +590,11 @@ export RPM_OPT_FLAGS=`echo $RPM_OPT_FLAGS | sed -e 's/i386/i486/'`
 		%{?_with_tcmalloc} \
 		CFLAGS="$RPM_OPT_FLAGS" CXXFLAGS="$RPM_OPT_FLAGS"
 
+%if %{with lowmem_builder}
+%if 0%{?jobs} > 8
+%define _smp_mflags -j8
+%endif
+%endif
 
 make %{?_smp_mflags}
 
@@ -607,8 +612,7 @@ make %{?_smp_mflags} check-local
 make DESTDIR=$RPM_BUILD_ROOT install
 find $RPM_BUILD_ROOT -type f -name "*.la" -exec rm -f {} ';'
 find $RPM_BUILD_ROOT -type f -name "*.a" -exec rm -f {} ';'
-install -D src/rbdmap $RPM_BUILD_ROOT%{_sysconfdir}/ceph/rbdmap
-install -D src/init-rbdmap $RPM_BUILD_ROOT%{_initrddir}/rbdmap
+install -D src/etc-rbdmap $RPM_BUILD_ROOT%{_sysconfdir}/ceph/rbdmap
 %if 0%{?fedora} || 0%{?rhel}
 install -m 0644 -D etc/sysconfig/ceph $RPM_BUILD_ROOT%{_sysconfdir}/sysconfig/ceph
 %endif
@@ -617,6 +621,7 @@ install -m 0644 -D etc/sysconfig/ceph $RPM_BUILD_ROOT%{_localstatedir}/adm/fillu
 %endif
 %if 0%{?_with_systemd}
   install -m 0644 -D systemd/ceph.tmpfiles.d $RPM_BUILD_ROOT%{_tmpfilesdir}/ceph-common.conf
+  install -m 0644 -D systemd/rbdmap.service $RPM_BUILD_ROOT%{_unitdir}/rbdmap.service
   install -m 0644 -D systemd/ceph-osd@.service $RPM_BUILD_ROOT%{_unitdir}/ceph-osd@.service
   install -m 0644 -D systemd/ceph-mon@.service $RPM_BUILD_ROOT%{_unitdir}/ceph-mon@.service
   install -m 0644 -D systemd/ceph-create-keys@.service $RPM_BUILD_ROOT%{_unitdir}/ceph-create-keys@.service
@@ -626,6 +631,7 @@ install -m 0644 -D etc/sysconfig/ceph $RPM_BUILD_ROOT%{_localstatedir}/adm/fillu
   install -m 0644 -D systemd/ceph-disk@.service $RPM_BUILD_ROOT%{_unitdir}/ceph-disk@.service
   install -m 0755 -D systemd/ceph $RPM_BUILD_ROOT%{_sbindir}/rcceph
 %else
+  install -D src/init-rbdmap $RPM_BUILD_ROOT%{_initrddir}/rbdmap
   install -D src/init-ceph $RPM_BUILD_ROOT%{_initrddir}/ceph
   install -D src/init-radosgw $RPM_BUILD_ROOT%{_initrddir}/ceph-radosgw
   ln -sf ../../etc/init.d/ceph %{buildroot}/%{_sbindir}/rcceph
@@ -810,6 +816,7 @@ rm -rf $RPM_BUILD_ROOT
 %{_libdir}/rados-classes/libcls_timeindex.so*
 %{_libdir}/rados-classes/libcls_user.so*
 %{_libdir}/rados-classes/libcls_version.so*
+%{_libdir}/rados-classes/libcls_journal.so*
 %dir %{_libdir}/ceph/erasure-code
 %{_libdir}/ceph/erasure-code/libec_*.so*
 %if 0%{?_with_lttng}
@@ -872,6 +879,7 @@ rm -rf $RPM_BUILD_ROOT
 %{_bindir}/rbd
 %{_bindir}/rbd-replay
 %{_bindir}/rbd-replay-many
+%{_bindir}/rbdmap
 %if 0%{?_with_lttng}
 %{_bindir}/rbd-replay-prep
 %endif
@@ -901,7 +909,11 @@ rm -rf $RPM_BUILD_ROOT
 %config %{_sysconfdir}/bash_completion.d/rados
 %config %{_sysconfdir}/bash_completion.d/rbd
 %config(noreplace) %{_sysconfdir}/ceph/rbdmap
+%if 0%{?_with_systemd}
+%{_unitdir}/rbdmap.service
+%else
 %{_initrddir}/rbdmap
+%endif
 %{python_sitelib}/ceph_argparse.py*
 %{python_sitelib}/ceph_daemon.py*
 %{_udevrulesdir}/50-rbd.rules
@@ -1302,12 +1314,12 @@ exit 0
 %files libs-compat
 # We need an empty %%files list for ceph-libs-compat, to tell rpmbuild to actually
 # build this meta package.
+%endif
 
 #################################################################################
 %files devel-compat
 # We need an empty %%files list for ceph-devel-compat, to tell rpmbuild to
 # actually build this meta package.
-%endif
 
 #################################################################################
 %files -n python-ceph-compat
diff --git a/src/test/centos-6/install-deps.sh b/src/test/centos-6/install-deps.sh
index 1bebf09..8249ea3 100755
--- a/src/test/centos-6/install-deps.sh
+++ b/src/test/centos-6/install-deps.sh
@@ -62,7 +62,7 @@ CentOS|Fedora|RedHatEnterpriseServer)
             CentOS|RedHatEnterpriseServer)
                 $SUDO yum install -y yum-utils
                 MAJOR_VERSION=$(lsb_release -rs | cut -f1 -d.)
-                if test $(lsb_release -si) == RedHatEnterpriseServer ; then
+                if test $(lsb_release -si) = RedHatEnterpriseServer ; then
                     $SUDO yum install subscription-manager
                     $SUDO subscription-manager repos --enable=rhel-$MAJOR_VERSION-server-optional-rpms
                 fi
@@ -70,6 +70,9 @@ CentOS|Fedora|RedHatEnterpriseServer)
                 $SUDO yum install --nogpgcheck -y epel-release
                 $SUDO rpm --import /etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-$MAJOR_VERSION
                 $SUDO rm -f /etc/yum.repos.d/dl.fedoraproject.org*
+                if test $(lsb_release -si) = CentOS -a $MAJOR_VERSION = 7 ; then
+                    $SUDO yum-config-manager --enable cr
+                fi
                 ;;
         esac
         sed -e 's/@//g' < ceph.spec.in > $DIR/ceph.spec
diff --git a/src/test/centos-7/Dockerfile.in b/src/test/centos-7/Dockerfile.in
index 19fe1aa..7562f86 100644
--- a/src/test/centos-7/Dockerfile.in
+++ b/src/test/centos-7/Dockerfile.in
@@ -24,10 +24,10 @@ COPY install-deps.sh /root/
 COPY ceph.spec.in /root/
 # http://jperrin.github.io/centos/2014/09/25/centos-docker-and-systemd/
 RUN yum -y swap -- remove fakesystemd systemd-libs systemd-container -- install systemd systemd-libs && (cd /lib/systemd/system/sysinit.target.wants/; for i in *; do [ $i == systemd-tmpfiles-setup.service ] || rm -f $i; done) && rm -f /lib/systemd/system/multi-user.target.wants/* && rm -f /etc/systemd/system/*.wants/* && rm -f /lib/systemd/system/local-fs.target.wants/* && rm -f /lib/systemd/system/sockets.target.wants/*udev* && rm -f /lib/systemd/system/sockets.target.wants/*initctl* && [...]
-RUN yum install -y yum-utils && yum-config-manager --add-repo https://dl.fedoraproject.org/pub/epel/7/x86_64/ && yum install --nogpgcheck -y epel-release && rpm --import /etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-7 && rm /etc/yum.repos.d/dl.fedoraproject.org*
+RUN yum install -y yum-utils && yum-config-manager --add-repo https://dl.fedoraproject.org/pub/epel/7/x86_64/ && yum install --nogpgcheck -y epel-release && rpm --import /etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-7 && rm /etc/yum.repos.d/dl.fedoraproject.org* && yum-config-manager --enable cr
 # build dependencies
 RUN cd /root ; ./install-deps.sh
 # development tools
 # nc is required to run make check on firefly only (giant+ do not use nc)
 RUN yum install -y ccache valgrind gdb git python-virtualenv gdisk kpartx hdparm jq sudo xmlstarlet parted nc
-RUN useradd -M --uid %%user_id%% %%USER%% && echo '%%USER%% ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers
+RUN if test %%USER%% != root ; then useradd -M --uid %%user_id%% %%USER%% && echo '%%USER%% ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers ; fi
diff --git a/src/test/centos-7/ceph.spec.in b/src/test/centos-7/ceph.spec.in
index 8f2a6fc..2939fef 100644
--- a/src/test/centos-7/ceph.spec.in
+++ b/src/test/centos-7/ceph.spec.in
@@ -590,6 +590,11 @@ export RPM_OPT_FLAGS=`echo $RPM_OPT_FLAGS | sed -e 's/i386/i486/'`
 		%{?_with_tcmalloc} \
 		CFLAGS="$RPM_OPT_FLAGS" CXXFLAGS="$RPM_OPT_FLAGS"
 
+%if %{with lowmem_builder}
+%if 0%{?jobs} > 8
+%define _smp_mflags -j8
+%endif
+%endif
 
 make %{?_smp_mflags}
 
@@ -607,8 +612,7 @@ make %{?_smp_mflags} check-local
 make DESTDIR=$RPM_BUILD_ROOT install
 find $RPM_BUILD_ROOT -type f -name "*.la" -exec rm -f {} ';'
 find $RPM_BUILD_ROOT -type f -name "*.a" -exec rm -f {} ';'
-install -D src/rbdmap $RPM_BUILD_ROOT%{_sysconfdir}/ceph/rbdmap
-install -D src/init-rbdmap $RPM_BUILD_ROOT%{_initrddir}/rbdmap
+install -D src/etc-rbdmap $RPM_BUILD_ROOT%{_sysconfdir}/ceph/rbdmap
 %if 0%{?fedora} || 0%{?rhel}
 install -m 0644 -D etc/sysconfig/ceph $RPM_BUILD_ROOT%{_sysconfdir}/sysconfig/ceph
 %endif
@@ -617,6 +621,7 @@ install -m 0644 -D etc/sysconfig/ceph $RPM_BUILD_ROOT%{_localstatedir}/adm/fillu
 %endif
 %if 0%{?_with_systemd}
   install -m 0644 -D systemd/ceph.tmpfiles.d $RPM_BUILD_ROOT%{_tmpfilesdir}/ceph-common.conf
+  install -m 0644 -D systemd/rbdmap.service $RPM_BUILD_ROOT%{_unitdir}/rbdmap.service
   install -m 0644 -D systemd/ceph-osd@.service $RPM_BUILD_ROOT%{_unitdir}/ceph-osd@.service
   install -m 0644 -D systemd/ceph-mon@.service $RPM_BUILD_ROOT%{_unitdir}/ceph-mon@.service
   install -m 0644 -D systemd/ceph-create-keys@.service $RPM_BUILD_ROOT%{_unitdir}/ceph-create-keys@.service
@@ -626,6 +631,7 @@ install -m 0644 -D etc/sysconfig/ceph $RPM_BUILD_ROOT%{_localstatedir}/adm/fillu
   install -m 0644 -D systemd/ceph-disk@.service $RPM_BUILD_ROOT%{_unitdir}/ceph-disk@.service
   install -m 0755 -D systemd/ceph $RPM_BUILD_ROOT%{_sbindir}/rcceph
 %else
+  install -D src/init-rbdmap $RPM_BUILD_ROOT%{_initrddir}/rbdmap
   install -D src/init-ceph $RPM_BUILD_ROOT%{_initrddir}/ceph
   install -D src/init-radosgw $RPM_BUILD_ROOT%{_initrddir}/ceph-radosgw
   ln -sf ../../etc/init.d/ceph %{buildroot}/%{_sbindir}/rcceph
@@ -810,6 +816,7 @@ rm -rf $RPM_BUILD_ROOT
 %{_libdir}/rados-classes/libcls_timeindex.so*
 %{_libdir}/rados-classes/libcls_user.so*
 %{_libdir}/rados-classes/libcls_version.so*
+%{_libdir}/rados-classes/libcls_journal.so*
 %dir %{_libdir}/ceph/erasure-code
 %{_libdir}/ceph/erasure-code/libec_*.so*
 %if 0%{?_with_lttng}
@@ -872,6 +879,7 @@ rm -rf $RPM_BUILD_ROOT
 %{_bindir}/rbd
 %{_bindir}/rbd-replay
 %{_bindir}/rbd-replay-many
+%{_bindir}/rbdmap
 %if 0%{?_with_lttng}
 %{_bindir}/rbd-replay-prep
 %endif
@@ -901,7 +909,11 @@ rm -rf $RPM_BUILD_ROOT
 %config %{_sysconfdir}/bash_completion.d/rados
 %config %{_sysconfdir}/bash_completion.d/rbd
 %config(noreplace) %{_sysconfdir}/ceph/rbdmap
+%if 0%{?_with_systemd}
+%{_unitdir}/rbdmap.service
+%else
 %{_initrddir}/rbdmap
+%endif
 %{python_sitelib}/ceph_argparse.py*
 %{python_sitelib}/ceph_daemon.py*
 %{_udevrulesdir}/50-rbd.rules
@@ -1302,12 +1314,12 @@ exit 0
 %files libs-compat
 # We need an empty %%files list for ceph-libs-compat, to tell rpmbuild to actually
 # build this meta package.
+%endif
 
 #################################################################################
 %files devel-compat
 # We need an empty %%files list for ceph-devel-compat, to tell rpmbuild to
 # actually build this meta package.
-%endif
 
 #################################################################################
 %files -n python-ceph-compat
diff --git a/src/test/centos-7/install-deps.sh b/src/test/centos-7/install-deps.sh
index 1bebf09..8249ea3 100755
--- a/src/test/centos-7/install-deps.sh
+++ b/src/test/centos-7/install-deps.sh
@@ -62,7 +62,7 @@ CentOS|Fedora|RedHatEnterpriseServer)
             CentOS|RedHatEnterpriseServer)
                 $SUDO yum install -y yum-utils
                 MAJOR_VERSION=$(lsb_release -rs | cut -f1 -d.)
-                if test $(lsb_release -si) == RedHatEnterpriseServer ; then
+                if test $(lsb_release -si) = RedHatEnterpriseServer ; then
                     $SUDO yum install subscription-manager
                     $SUDO subscription-manager repos --enable=rhel-$MAJOR_VERSION-server-optional-rpms
                 fi
@@ -70,6 +70,9 @@ CentOS|Fedora|RedHatEnterpriseServer)
                 $SUDO yum install --nogpgcheck -y epel-release
                 $SUDO rpm --import /etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-$MAJOR_VERSION
                 $SUDO rm -f /etc/yum.repos.d/dl.fedoraproject.org*
+                if test $(lsb_release -si) = CentOS -a $MAJOR_VERSION = 7 ; then
+                    $SUDO yum-config-manager --enable cr
+                fi
                 ;;
         esac
         sed -e 's/@//g' < ceph.spec.in > $DIR/ceph.spec
diff --git a/src/test/ceph_objectstore_tool.py b/src/test/ceph_objectstore_tool.py
index 79230d2..186b0b6 100755
--- a/src/test/ceph_objectstore_tool.py
+++ b/src/test/ceph_objectstore_tool.py
@@ -180,14 +180,19 @@ def get_nspace(num):
     return "ns{num}".format(num=num)
 
 
-def verify(DATADIR, POOL, NAME_PREFIX):
+def verify(DATADIR, POOL, NAME_PREFIX, db):
     TMPFILE = r"/tmp/tmp.{pid}".format(pid=os.getpid())
     nullfd = open(os.devnull, "w")
     ERRORS = 0
-    for nsfile in [f for f in os.listdir(DATADIR) if f.split('-')[1].find(NAME_PREFIX) == 0]:
+    for rawnsfile in [f for f in os.listdir(DATADIR) if f.split('-')[1].find(NAME_PREFIX) == 0]:
+        nsfile = rawnsfile.split("__")[0]
+        clone = rawnsfile.split("__")[1]
         nspace = nsfile.split("-")[0]
         file = nsfile.split("-")[1]
-        path = os.path.join(DATADIR, nsfile)
+        # Skip clones
+        if clone != "head":
+            continue
+        path = os.path.join(DATADIR, rawnsfile)
         try:
             os.unlink(TMPFILE)
         except:
@@ -205,6 +210,55 @@ def verify(DATADIR, POOL, NAME_PREFIX):
             os.unlink(TMPFILE)
         except:
             pass
+        for key, val in db[nspace][file]["xattr"].iteritems():
+            cmd = "./rados -p {pool} -N '{nspace}' getxattr {name} {key}".format(pool=POOL, name=file, key=key, nspace=nspace)
+            logging.debug(cmd)
+            getval = check_output(cmd, shell=True, stderr=nullfd)
+            logging.debug("getxattr {key} {val}".format(key=key, val=getval))
+            if getval != val:
+                logging.error("getxattr of key {key} returned wrong val: {get} instead of {orig}".format(key=key, get=getval, orig=val))
+                ERRORS += 1
+                continue
+        hdr = db[nspace][file].get("omapheader", "")
+        cmd = "./rados -p {pool} -N '{nspace}' getomapheader {name} {file}".format(pool=POOL, name=file, nspace=nspace, file=TMPFILE)
+        logging.debug(cmd)
+        ret = call(cmd, shell=True, stderr=nullfd)
+        if ret != 0:
+            logging.error("rados getomapheader returned {ret}".format(ret=ret))
+            ERRORS += 1
+        else:
+            getlines = get_lines(TMPFILE)
+            assert(len(getlines) == 0 or len(getlines) == 1)
+            if len(getlines) == 0:
+                gethdr = ""
+            else:
+                gethdr = getlines[0]
+            logging.debug("header: {hdr}".format(hdr=gethdr))
+            if gethdr != hdr:
+                logging.error("getomapheader returned wrong val: {get} instead of {orig}".format(get=gethdr, orig=hdr))
+                ERRORS += 1
+        for key, val in db[nspace][file]["omap"].iteritems():
+            cmd = "./rados -p {pool} -N '{nspace}' getomapval {name} {key} {file}".format(pool=POOL, name=file, key=key, nspace=nspace, file=TMPFILE)
+            logging.debug(cmd)
+            ret = call(cmd, shell=True, stderr=nullfd)
+            if ret != 0:
+                logging.error("getomapval returned {ret}".format(ret=ret))
+                ERRORS += 1
+                continue
+            getlines = get_lines(TMPFILE)
+            if len(getlines) != 1:
+                logging.error("Bad data from getomapval {lines}".format(lines=getlines))
+                ERRORS += 1
+                continue
+            getval = getlines[0]
+            logging.debug("getomapval {key} {val}".format(key=key, val=getval))
+            if getval != val:
+                logging.error("getomapval returned wrong val: {get} instead of {orig}".format(get=getval, orig=val))
+                ERRORS += 1
+        try:
+            os.unlink(TMPFILE)
+        except:
+            pass
     return ERRORS
 
 
@@ -324,10 +378,15 @@ def kill_daemons():
 def check_data(DATADIR, TMPFILE, OSDDIR, SPLIT_NAME):
     repcount = 0
     ERRORS = 0
-    for nsfile in [f for f in os.listdir(DATADIR) if f.split('-')[1].find(SPLIT_NAME) == 0]:
+    for rawnsfile in [f for f in os.listdir(DATADIR) if f.split('-')[1].find(SPLIT_NAME) == 0]:
+        nsfile = rawnsfile.split("__")[0]
+        clone = rawnsfile.split("__")[1]
         nspace = nsfile.split("-")[0]
-        file = nsfile.split("-")[1]
-        path = os.path.join(DATADIR, nsfile)
+        file = nsfile.split("-")[1] + "__" + clone
+        # Skip clones
+        if clone != "head":
+            continue
+        path = os.path.join(DATADIR, rawnsfile)
         tmpfd = open(TMPFILE, "w")
         cmd = "find {dir} -name '{file}_*_{nspace}_*'".format(dir=OSDDIR, file=file, nspace=nspace)
         logging.debug(cmd)
@@ -352,7 +411,6 @@ def check_data(DATADIR, TMPFILE, OSDDIR, SPLIT_NAME):
 
 
 def set_osd_weight(CFSD_PREFIX, osd_ids, osd_path, weight):
-    print "Testing get-osdmap and set-osdmap"
     # change the weight of osd.0 to math.pi in the newest osdmap of given osd
     osdmap_file = tempfile.NamedTemporaryFile()
     cmd = (CFSD_PREFIX + "--op get-osdmap --file {osdmap_file}").format(osd=osd_path,
@@ -388,6 +446,13 @@ def set_osd_weight(CFSD_PREFIX, osd_ids, osd_path, weight):
                stderr=subprocess.DEVNULL,
                shell=True)
     assert(ret == 0)
+
+    # Minimal test of --dry-run: exercise the flag without verifying the result
+    cmd = CFSD_PREFIX + "--op set-osdmap --file {osdmap_file} --epoch {epoch} --force --dry-run"
+    cmd = cmd.format(osd=osd_path, osdmap_file=osdmap_file.name, epoch=epoch)
+    ret = call(cmd, stdout=subprocess.DEVNULL, shell=True)
+    assert(ret == 0)
+
     # osdmaptool increases the epoch of the changed osdmap, so we need to force the tool
     # to use a different epoch than the one in osdmap
     cmd = CFSD_PREFIX + "--op set-osdmap --file {osdmap_file} --epoch {epoch} --force"
@@ -467,6 +532,10 @@ def test_get_set_inc_osdmap(CFSD_PREFIX, osd_path):
     cmd = CFSD_PREFIX + "--op set-inc-osdmap --force --epoch {epoch} --file {file}"
     ret = call(cmd.format(osd=osd_path, epoch=epoch, file=file_e2.name), shell=True)
     if ret: return 1
+    # Use dry-run to set back to e1; the change should not actually be applied
+    cmd = CFSD_PREFIX + "--op set-inc-osdmap --dry-run --epoch {epoch} --file {file}"
+    ret = call(cmd.format(osd=osd_path, epoch=epoch, file=file_e1_backup.name), shell=True)
+    if ret: return 1
     # read from e1
     file_e1_read = tempfile.NamedTemporaryFile(delete=False)
     cmd = CFSD_PREFIX + "--op get-inc-osdmap --epoch {epoch} --file {file}"
@@ -568,6 +637,7 @@ def main(argv):
             NAME = REP_NAME + "{num}".format(num=i)
             LNAME = nspace + "-" + NAME
             DDNAME = os.path.join(DATADIR, LNAME)
+            DDNAME += "__head"
 
             cmd = "rm -f " + DDNAME
             logging.debug(cmd)
@@ -634,6 +704,45 @@ def main(argv):
                     logging.critical("setomapval failed with {ret}".format(ret=ret))
                 db[nspace][NAME]["omap"][mykey] = myval
 
+    # Create some clones
+    cmd = "./rados -p {pool} mksnap snap1".format(pool=REP_POOL)
+    logging.debug(cmd)
+    call(cmd, shell=True)
+
+    objects = range(1, NUM_REP_OBJECTS + 1)
+    nspaces = range(NUM_NSPACES)
+    for n in nspaces:
+        nspace = get_nspace(n)
+
+        for i in objects:
+            NAME = REP_NAME + "{num}".format(num=i)
+            LNAME = nspace + "-" + NAME
+            DDNAME = os.path.join(DATADIR, LNAME)
+            # First clone
+            CLONENAME = DDNAME + "__1"
+            DDNAME += "__head"
+
+            cmd = "mv -f " + DDNAME + " " + CLONENAME
+            logging.debug(cmd)
+            call(cmd, shell=True)
+
+            if i == 1:
+                dataline = range(DATALINECOUNT)
+            else:
+                dataline = range(1)
+            fd = open(DDNAME, "w")
+            data = "This is the replicated data after a snapshot for " + LNAME + "\n"
+            for _ in dataline:
+                fd.write(data)
+            fd.close()
+
+            cmd = "./rados -p {pool} -N '{nspace}' put {name} {ddname}".format(pool=REP_POOL, name=NAME, ddname=DDNAME, nspace=nspace)
+            logging.debug(cmd)
+            ret = call(cmd, shell=True, stderr=nullfd)
+            if ret != 0:
+                logging.critical("Rados put command failed with {ret}".format(ret=ret))
+                return 1
+
     print "Creating {objs} objects in erasure coded pool".format(objs=(NUM_EC_OBJECTS*NUM_NSPACES))
 
     objects = range(1, NUM_EC_OBJECTS + 1)
@@ -645,6 +754,7 @@ def main(argv):
             NAME = EC_NAME + "{num}".format(num=i)
             LNAME = nspace + "-" + NAME
             DDNAME = os.path.join(DATADIR, LNAME)
+            DDNAME += "__head"
 
             cmd = "rm -f " + DDNAME
             logging.debug(cmd)
@@ -760,7 +870,13 @@ def main(argv):
 
     os.unlink(OTHERFILE)
     cmd = (CFSD_PREFIX + "--op import --file {FOO}").format(osd=ONEOSD, FOO=OTHERFILE)
-    ERRORS += test_failure(cmd, "open: No such file or directory")
+    ERRORS += test_failure(cmd, "file: {FOO}: No such file or directory".format(FOO=OTHERFILE))
+
+    cmd = "./ceph-objectstore-tool --data-path BAD_DATA_PATH --journal-path " + OSDDIR + "/{osd}.journal --op list".format(osd=ONEOSD)
+    ERRORS += test_failure(cmd, "data-path: BAD_DATA_PATH: No such file or directory")
+
+    cmd = "./ceph-objectstore-tool --journal-path BAD_JOURNAL_PATH --op dump-journal"
+    ERRORS += test_failure(cmd, "journal-path: BAD_JOURNAL_PATH: (2) No such file or directory")
 
     # On import can't use stdin from a terminal
     cmd = (CFSD_PREFIX + "--op import --pgid {pg}").format(osd=ONEOSD, pg=ONEPG)
@@ -805,6 +921,27 @@ def main(argv):
     cmd = (CFSD_PREFIX + "--pgid {pg} '' notacommand").format(osd=ONEOSD, pg=ONEPG)
     ERRORS += test_failure(cmd, "Unknown object command 'notacommand'")
 
+    cmd = (CFSD_PREFIX + "foo list-omap").format(osd=ONEOSD, pg=ONEPG)
+    ERRORS += test_failure(cmd, "No object id 'foo' found or invalid JSON specified")
+
+    cmd = (CFSD_PREFIX + "'{{\"oid\":\"obj4\",\"key\":\"\",\"snapid\":-1,\"hash\":2826278768,\"max\":0,\"pool\":1,\"namespace\":\"\"}}' list-omap").format(osd=ONEOSD, pg=ONEPG)
+    ERRORS += test_failure(cmd, "Without --pgid the object '{\"oid\":\"obj4\",\"key\":\"\",\"snapid\":-1,\"hash\":2826278768,\"max\":0,\"pool\":1,\"namespace\":\"\"}' must be a JSON array")
+
+    cmd = (CFSD_PREFIX + "'[]' list-omap").format(osd=ONEOSD, pg=ONEPG)
+    ERRORS += test_failure(cmd, "Object '[]' must be a JSON array with 2 elements")
+
+    cmd = (CFSD_PREFIX + "'[\"1.0\"]' list-omap").format(osd=ONEOSD, pg=ONEPG)
+    ERRORS += test_failure(cmd, "Object '[\"1.0\"]' must be a JSON array with 2 elements")
+
+    cmd = (CFSD_PREFIX + "'[\"1.0\", 5, 8, 9]' list-omap").format(osd=ONEOSD, pg=ONEPG)
+    ERRORS += test_failure(cmd, "Object '[\"1.0\", 5, 8, 9]' must be a JSON array with 2 elements")
+
+    cmd = (CFSD_PREFIX + "'[1, 2]' list-omap").format(osd=ONEOSD, pg=ONEPG)
+    ERRORS += test_failure(cmd, "Object '[1, 2]' must be a JSON array with the first element a string")
+
+    cmd = (CFSD_PREFIX + "'[\"1.3\",{{\"snapid\":\"not an int\"}}]' list-omap").format(osd=ONEOSD, pg=ONEPG)
+    ERRORS += test_failure(cmd, "Decode object JSON error: value type is 2 not 4")
+
     TMPFILE = r"/tmp/tmp.{pid}".format(pid=pid)
     ALLPGS = OBJREPPGS + OBJECPGS
     OSDS = get_osds(ALLPGS[0], OSDDIR)
@@ -883,6 +1020,9 @@ def main(argv):
     JSONOBJ = sorted(set(lines))
     for JSON in JSONOBJ:
         (pgid, jsondict) = json.loads(JSON)
+        # Skip clones for now
+        if jsondict['snapid'] != -2:
+            continue
         db[jsondict['namespace']][jsondict['oid']]['json'] = json.dumps((pgid, jsondict))
         # print db[jsondict['namespace']][jsondict['oid']]['json']
         if string.find(jsondict['oid'], EC_NAME) == 0 and 'shard_id' not in jsondict:
@@ -893,7 +1033,7 @@ def main(argv):
     print "Test get-bytes and set-bytes"
     for nspace in db.keys():
         for basename in db[nspace].keys():
-            file = os.path.join(DATADIR, nspace + "-" + basename)
+            file = os.path.join(DATADIR, nspace + "-" + basename + "__head")
             JSON = db[nspace][basename]['json']
             GETNAME = "/tmp/getbytes.{pid}".format(pid=pid)
             TESTNAME = "/tmp/testbytes.{pid}".format(pid=pid)
@@ -1012,6 +1152,220 @@ def main(argv):
     except:
         pass
 
+    # Test get-attr, set-attr, rm-attr, get-omaphdr, set-omaphdr, get-omap, set-omap, rm-omap
+    print "Test get-attr, set-attr, rm-attr, get-omaphdr, set-omaphdr, get-omap, set-omap, rm-omap"
+    for nspace in db.keys():
+        for basename in db[nspace].keys():
+            file = os.path.join(DATADIR, nspace + "-" + basename + "__head")
+            JSON = db[nspace][basename]['json']
+            for pg in OBJREPPGS:
+                OSDS = get_osds(pg, OSDDIR)
+                for osd in OSDS:
+                    DIR = os.path.join(OSDDIR, os.path.join(osd, os.path.join("current", "{pg}_head".format(pg=pg))))
+                    fnames = [f for f in os.listdir(DIR) if os.path.isfile(os.path.join(DIR, f))
+                              and f.split("_")[0] == basename and f.split("_")[4] == nspace]
+                    if not fnames:
+                        continue
+                    for key, val in db[nspace][basename]["xattr"].iteritems():
+                        attrkey = "_" + key
+                        cmd = (CFSD_PREFIX + " '{json}' get-attr {key}").format(osd=osd, json=JSON, key=attrkey)
+                        logging.debug(cmd)
+                        getval = check_output(cmd, shell=True)
+                        if getval != val:
+                            logging.error("get-attr of key {key} returned wrong val: {get} instead of {orig}".format(key=attrkey, get=getval, orig=val))
+                            ERRORS += 1
+                            continue
+                        # set-attr to bogus value "foobar"
+                        cmd = ("echo -n foobar | " + CFSD_PREFIX + " --pgid {pg} '{json}' set-attr {key}").format(osd=osd, pg=pg, json=JSON, key=attrkey)
+                        logging.debug(cmd)
+                        ret = call(cmd, shell=True)
+                        if ret != 0:
+                            logging.error("Bad exit status {ret} from set-attr".format(ret=ret))
+                            ERRORS += 1
+                            continue
+                        # Test set-attr with dry-run
+                        cmd = ("echo -n dryrunbroken | " + CFSD_PREFIX + "--dry-run '{json}' set-attr {key}").format(osd=osd, pg=pg, json=JSON, key=attrkey)
+                        logging.debug(cmd)
+                        ret = call(cmd, shell=True, stdout=nullfd)
+                        if ret != 0:
+                            logging.error("Bad exit status {ret} from set-attr".format(ret=ret))
+                            ERRORS += 1
+                            continue
+                        # Check the set-attr
+                        cmd = (CFSD_PREFIX + " --pgid {pg} '{json}' get-attr {key}").format(osd=osd, pg=pg, json=JSON, key=attrkey)
+                        logging.debug(cmd)
+                        # check_output() raises on failure, so catch that instead of re-testing the stale 'ret'
+                        try:
+                            getval = check_output(cmd, shell=True)
+                        except subprocess.CalledProcessError as e:
+                            logging.error("Bad exit status {ret} from get-attr".format(ret=e.returncode))
+                            ERRORS += 1
+                            continue
+                        if getval != "foobar":
+                            logging.error("Check of set-attr failed because we got {val}".format(val=getval))
+                            ERRORS += 1
+                            continue
+                        # Test rm-attr
+                        cmd = (CFSD_PREFIX + "'{json}' rm-attr {key}").format(osd=osd, pg=pg, json=JSON, key=attrkey)
+                        logging.debug(cmd)
+                        ret = call(cmd, shell=True)
+                        if ret != 0:
+                            logging.error("Bad exit status {ret} from rm-attr".format(ret=ret))
+                            ERRORS += 1
+                            continue
+                        # Check rm-attr with dry-run
+                        cmd = (CFSD_PREFIX + "--dry-run '{json}' rm-attr {key}").format(osd=osd, pg=pg, json=JSON, key=attrkey)
+                        logging.debug(cmd)
+                        ret = call(cmd, shell=True, stdout=nullfd)
+                        if ret != 0:
+                            logging.error("Bad exit status {ret} from rm-attr".format(ret=ret))
+                            ERRORS += 1
+                            continue
+                        cmd = (CFSD_PREFIX + "'{json}' get-attr {key}").format(osd=osd, pg=pg, json=JSON, key=attrkey)
+                        logging.debug(cmd)
+                        ret = call(cmd, shell=True, stderr=nullfd, stdout=nullfd)
+                        if ret == 0:
+                            logging.error("For rm-attr expect get-attr to fail, but it succeeded")
+                            ERRORS += 1
+                        # Put back value
+                        cmd = ("echo -n {val} | " + CFSD_PREFIX + " --pgid {pg} '{json}' set-attr {key}").format(osd=osd, pg=pg, json=JSON, key=attrkey, val=val)
+                        logging.debug(cmd)
+                        ret = call(cmd, shell=True)
+                        if ret != 0:
+                            logging.error("Bad exit status {ret} from set-attr".format(ret=ret))
+                            ERRORS += 1
+                            continue
+
+                    hdr = db[nspace][basename].get("omapheader", "")
+                    cmd = (CFSD_PREFIX + "'{json}' get-omaphdr").format(osd=osd, json=JSON)
+                    logging.debug(cmd)
+                    gethdr = check_output(cmd, shell=True)
+                    if gethdr != hdr:
+                        logging.error("get-omaphdr was wrong: {get} instead of {orig}".format(get=gethdr, orig=hdr))
+                        ERRORS += 1
+                        continue
+                    # set-omaphdr to bogus value "foobar"
+                    cmd = ("echo -n foobar | " + CFSD_PREFIX + "'{json}' set-omaphdr").format(osd=osd, pg=pg, json=JSON)
+                    logging.debug(cmd)
+                    ret = call(cmd, shell=True)
+                    if ret != 0:
+                        logging.error("Bad exit status {ret} from set-omaphdr".format(ret=ret))
+                        ERRORS += 1
+                        continue
+                    # Check the set-omaphdr
+                    cmd = (CFSD_PREFIX + "'{json}' get-omaphdr").format(osd=osd, pg=pg, json=JSON)
+                    logging.debug(cmd)
+                    # catch check_output() failure rather than testing the stale 'ret'
+                    try:
+                        gethdr = check_output(cmd, shell=True)
+                    except subprocess.CalledProcessError as e:
+                        logging.error("Bad exit status {ret} from get-omaphdr".format(ret=e.returncode))
+                        ERRORS += 1
+                        continue
+                    if gethdr != "foobar":
+                        logging.error("Check of set-omaphdr failed because we got {val}".format(val=getval))
+                        ERRORS += 1
+                        continue
+                    # Test dry-run with set-omaphdr
+                    cmd = ("echo -n dryrunbroken | " + CFSD_PREFIX + "--dry-run '{json}' set-omaphdr").format(osd=osd, pg=pg, json=JSON)
+                    logging.debug(cmd)
+                    ret = call(cmd, shell=True, stdout=nullfd)
+                    if ret != 0:
+                        logging.error("Bad exit status {ret} from set-omaphdr".format(ret=ret))
+                        ERRORS += 1
+                        continue
+                    # Put back value
+                    cmd = ("echo -n {val} | " + CFSD_PREFIX + "'{json}' set-omaphdr").format(osd=osd, pg=pg, json=JSON, val=hdr)
+                    logging.debug(cmd)
+                    ret = call(cmd, shell=True)
+                    if ret != 0:
+                        logging.error("Bad exit status {ret} from set-omaphdr".format(ret=ret))
+                        ERRORS += 1
+                        continue
+
+                    for omapkey, val in db[nspace][basename]["omap"].iteritems():
+                        cmd = (CFSD_PREFIX + " '{json}' get-omap {key}").format(osd=osd, json=JSON, key=omapkey)
+                        logging.debug(cmd)
+                        getval = check_output(cmd, shell=True)
+                        if getval != val:
+                            logging.error("get-omap of key {key} returned wrong val: {get} instead of {orig}".format(key=omapkey, get=getval, orig=val))
+                            ERRORS += 1
+                            continue
+                        # set-omap to bogus value "foobar"
+                        cmd = ("echo -n foobar | " + CFSD_PREFIX + " --pgid {pg} '{json}' set-omap {key}").format(osd=osd, pg=pg, json=JSON, key=omapkey)
+                        logging.debug(cmd)
+                        ret = call(cmd, shell=True)
+                        if ret != 0:
+                            logging.error("Bad exit status {ret} from set-omap".format(ret=ret))
+                            ERRORS += 1
+                            continue
+                        # Check set-omap with dry-run
+                        cmd = ("echo -n dryrunbroken | " + CFSD_PREFIX + "--dry-run --pgid {pg} '{json}' set-omap {key}").format(osd=osd, pg=pg, json=JSON, key=omapkey)
+                        logging.debug(cmd)
+                        ret = call(cmd, shell=True, stdout=nullfd)
+                        if ret != 0:
+                            logging.error("Bad exit status {ret} from set-omap".format(ret=ret))
+                            ERRORS += 1
+                            continue
+                        # Check the set-omap
+                        cmd = (CFSD_PREFIX + " --pgid {pg} '{json}' get-omap {key}").format(osd=osd, pg=pg, json=JSON, key=omapkey)
+                        logging.debug(cmd)
+                        # catch check_output() failure rather than testing the stale 'ret'
+                        try:
+                            getval = check_output(cmd, shell=True)
+                        except subprocess.CalledProcessError as e:
+                            logging.error("Bad exit status {ret} from get-omap".format(ret=e.returncode))
+                            ERRORS += 1
+                            continue
+                        if getval != "foobar":
+                            logging.error("Check of set-omap failed because we got {val}".format(val=getval))
+                            ERRORS += 1
+                            continue
+                        # Test rm-omap
+                        cmd = (CFSD_PREFIX + "'{json}' rm-omap {key}").format(osd=osd, pg=pg, json=JSON, key=omapkey)
+                        logging.debug(cmd)
+                        ret = call(cmd, shell=True)
+                        if ret != 0:
+                            logging.error("Bad exit status {ret} from rm-omap".format(ret=ret))
+                            ERRORS += 1
+                        # Check rm-omap with dry-run
+                        cmd = (CFSD_PREFIX + "--dry-run '{json}' rm-omap {key}").format(osd=osd, pg=pg, json=JSON, key=omapkey)
+                        logging.debug(cmd)
+                        ret = call(cmd, shell=True, stdout=nullfd)
+                        if ret != 0:
+                            logging.error("Bad exit status {ret} from rm-omap".format(ret=ret))
+                            ERRORS += 1
+                        cmd = (CFSD_PREFIX + "'{json}' get-omap {key}").format(osd=osd, pg=pg, json=JSON, key=omapkey)
+                        logging.debug(cmd)
+                        ret = call(cmd, shell=True, stderr=nullfd, stdout=nullfd)
+                        if ret == 0:
+                            logging.error("For rm-omap expect get-omap to fail, but it succeeded")
+                            ERRORS += 1
+                        # Put back value
+                        cmd = ("echo -n {val} | " + CFSD_PREFIX + " --pgid {pg} '{json}' set-omap {key}").format(osd=osd, pg=pg, json=JSON, key=omapkey, val=val)
+                        logging.debug(cmd)
+                        ret = call(cmd, shell=True)
+                        if ret != 0:
+                            logging.error("Bad exit status {ret} from set-omap".format(ret=ret))
+                            ERRORS += 1
+                            continue
+
+    # Test dump
+    print "Test dump"
+    for nspace in db.keys():
+        for basename in db[nspace].keys():
+            file = os.path.join(DATADIR, nspace + "-" + basename + "__head")
+            JSON = db[nspace][basename]['json']
+            GETNAME = "/tmp/getbytes.{pid}".format(pid=pid)
+            for pg in OBJREPPGS:
+                OSDS = get_osds(pg, OSDDIR)
+                for osd in OSDS:
+                    DIR = os.path.join(OSDDIR, os.path.join(osd, os.path.join("current", "{pg}_head".format(pg=pg))))
+                    fnames = [f for f in os.listdir(DIR) if os.path.isfile(os.path.join(DIR, f))
+                              and f.split("_")[0] == basename and f.split("_")[4] == nspace]
+                    if not fnames:
+                        continue
+                    cmd = (CFSD_PREFIX + " '{json}' dump | grep '\"snap\": 1,' > /dev/null").format(osd=osd, json=JSON)
+                    logging.debug(cmd)
+                    ret = call(cmd, shell=True)
+                    if ret != 0:
+                        logging.error("Invalid dump for {json}".format(json=JSON))
+                        ERRORS += 1
+
     print "Test list-attrs get-attr"
     ATTRFILE = r"/tmp/attrs.{pid}".format(pid=pid)
     VALFILE = r"/tmp/val.{pid}".format(pid=pid)
@@ -1313,7 +1667,10 @@ def main(argv):
 
     if EXP_ERRORS == 0 and RM_ERRORS == 0 and IMP_ERRORS == 0:
         print "Verify erasure coded import data"
-        ERRORS += verify(DATADIR, EC_POOL, EC_NAME)
+        ERRORS += verify(DATADIR, EC_POOL, EC_NAME, db)
+        # Check replicated data/xattr/omap using rados
+        print "Verify replicated import data using rados"
+        ERRORS += verify(DATADIR, REP_POOL, REP_NAME, db)
 
     if EXP_ERRORS == 0:
         NEWPOOL = "rados-import-pool"
@@ -1358,7 +1715,7 @@ def main(argv):
                     logging.error("Rados import --no-overwrite failed from {file} with {ret}".format(file=file, ret=ret))
                     ERRORS += 1
 
-        ERRORS += verify(DATADIR, NEWPOOL, REP_NAME)
+        ERRORS += verify(DATADIR, NEWPOOL, REP_NAME, db)
     else:
         logging.warning("SKIPPING IMPORT-RADOS TESTS DUE TO PREVIOUS FAILURES")
 
@@ -1393,6 +1750,7 @@ def main(argv):
             NAME = SPLIT_NAME + "{num}".format(num=i)
             LNAME = nspace + "-" + NAME
             DDNAME = os.path.join(DATADIR, LNAME)
+            DDNAME += "__head"
 
             cmd = "rm -f " + DDNAME
             logging.debug(cmd)
diff --git a/src/test/cli/crushtool/check-names.empty.t b/src/test/cli/crushtool/check-names.empty.t
index 9e30790..755e931 100644
--- a/src/test/cli/crushtool/check-names.empty.t
+++ b/src/test/cli/crushtool/check-names.empty.t
@@ -1,4 +1,5 @@
   $ crushtool -c "$TESTDIR/check-names.empty.crushmap.txt" -o "$TESTDIR/check-names.empty.crushmap"
-  $ crushtool -i "$TESTDIR/check-names.empty.crushmap" --check-names
+  $ crushtool -i "$TESTDIR/check-names.empty.crushmap" --check 0
   unknown type name: item#0
+  [1]
   $ rm -f "$TESTDIR/check-names.empty.crushmap"
diff --git a/src/test/cli/crushtool/check-names.max-id.t b/src/test/cli/crushtool/check-names.max-id.t
index 18724ff..ee04fdc 100644
--- a/src/test/cli/crushtool/check-names.max-id.t
+++ b/src/test/cli/crushtool/check-names.max-id.t
@@ -4,4 +4,5 @@
   $ crushtool -i check-names.crushmap       --add-item 2 1.0 device2 --loc host host0 --loc cluster cluster0 -o check-names.crushmap > /dev/null
   $ crushtool -i check-names.crushmap --check 2
   item id too large: item#2
+  [1]
   $ crushtool -i check-names.crushmap --check
diff --git a/src/test/cli/crushtool/help.t b/src/test/cli/crushtool/help.t
index 4c21912..b737474 100644
--- a/src/test/cli/crushtool/help.t
+++ b/src/test/cli/crushtool/help.t
@@ -64,7 +64,7 @@
                            show location for given device id
      -i mapfn --test       test a range of inputs on the map
         [--min-x x] [--max-x x] [--x x]
-        [--min-rule r] [--max-rule r] [--rule r]
+        [--min-rule r] [--max-rule r] [--rule r] [--ruleset rs]
         [--num-rep n]
         [--batches b]      split the CRUSH mapping into b > 1 rounds
         [--weight|-w devno weight]
diff --git a/src/test/cli/osdmaptool/pool.t b/src/test/cli/osdmaptool/pool.t
index 5441a34..fc25856 100644
--- a/src/test/cli/osdmaptool/pool.t
+++ b/src/test/cli/osdmaptool/pool.t
@@ -7,6 +7,7 @@
 #
   $ osdmaptool myosdmap --test-map-object foo --pool
   Option --pool requires an argument.
+  
   [1]
 
   $ osdmaptool myosdmap --test-map-object foo --pool bar
@@ -32,6 +33,7 @@
 #
   $ osdmaptool myosdmap --test-map-pgs --pool
   Option --pool requires an argument.
+  
   [1]
 
   $ osdmaptool myosdmap --test-map-pgs --pool baz
diff --git a/src/test/cli/radosgw-admin/help.t b/src/test/cli/radosgw-admin/help.t
index 2ef9573..0b0f900 100644
--- a/src/test/cli/radosgw-admin/help.t
+++ b/src/test/cli/radosgw-admin/help.t
@@ -57,7 +57,8 @@
     metadata rm                remove metadata info
     metadata list              list metadata info
     mdlog list                 list metadata log
-    mdlog trim                 trim metadata log
+    mdlog trim                 trim metadata log (use start-date, end-date or
+                               start-marker, end-marker)
     bilog list                 list bucket index log
     bilog trim                 trim bucket index log (use start-marker, end-marker)
     datalog list               list data log
@@ -75,7 +76,8 @@
      --subuser=<name>          subuser name
      --access-key=<key>        S3 access key
      --email=<email>
-     --secret=<key>            specify secret key
+     --secret/--secret-key=<key>
+                               specify secret key
      --gen-access-key          generate random access key (for S3)
      --gen-secret              generate random secret key
      --key-type=<type>         key type, options are: swift, s3
@@ -125,7 +127,6 @@
      --caps=<caps>             list of caps (e.g., "usage=read, write; user=read"
      --yes-i-really-mean-it    required for certain operations
      --reset-regions           reset regionmap when regionmap update
- 
   <date> := "YYYY-MM-DD[ hh:mm:ss]"
   
   Quota options:
@@ -143,3 +144,4 @@
     --version         show version and quit
   
   [1]
+ 
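
The reworked help advertises date or marker bounds for "mdlog trim"; the
accepted date syntax is the "<date>" grammar shown above. A minimal
invocation sketch, assuming a configured radosgw setup and with placeholder
dates:

    from subprocess import call

    # Flag names and the date format come from the help text above.
    call(["radosgw-admin", "mdlog", "trim",
          "--start-date", "2016-01-01 00:00:00",
          "--end-date", "2016-01-02 00:00:00"])
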
diff --git a/src/test/cli/rbd/help.t b/src/test/cli/rbd/help.t
index ad6a9ee..7d6dde2 100644
--- a/src/test/cli/rbd/help.t
+++ b/src/test/cli/rbd/help.t
@@ -1,119 +1,813 @@
   $ rbd --help
-  usage: rbd [-n <auth user>] [OPTIONS] <cmd> ...
-  where 'pool' is a rados pool name (default is 'rbd') and 'cmd' is one of:
-    (ls | list) [-l | --long ] [pool-name]      list rbd images
-                                                (-l includes snapshots/clones)
-    (du | disk-usage) [<image-spec> | <snap-spec>]
-                                                show disk usage stats for pool,
-                                                image or snapshot
-    info <image-spec> | <snap-spec>             show information about image size,
-                                                striping, etc.
-    create [--order <bits>] [--image-features <features>] [--image-shared]
-           --size <M/G/T> <image-spec>          create an empty image
-    clone [--order <bits>] [--image-features <features>] [--image-shared]
-           <parent-snap-spec> <child-image-spec>
-                                                clone a snapshot into a COW
-                                                child image
-    children <snap-spec>                        display children of snapshot
-    flatten <image-spec>                        fill clone with parent data
-                                                (make it independent)
-    resize --size <M/G/T> <image-spec>          resize (expand or contract) image
-    rm <image-spec>                             delete an image
-    export (<image-spec> | <snap-spec>) [<path>]
-                                                export image to file
-                                                "-" for stdout
-    import [--image-features <features>] [--image-shared]
-           <path> [<image-spec>]                import image from file
-                                                "-" for stdin
-                                                "rbd/$(basename <path>)" is
-                                                assumed for <image-spec> if
-                                                omitted
-    diff [--from-snap <snap-name>] [--whole-object]
-           <image-spec> | <snap-spec>           print extents that differ since
-                                                a previous snap, or image creation
-    export-diff [--from-snap <snap-name>] [--whole-object]
-           (<image-spec> | <snap-spec>) <path>  export an incremental diff to
-                                                path, or "-" for stdout
-    merge-diff <diff1> <diff2> <path>           merge <diff1> and <diff2> into
-                                                <path>, <diff1> could be "-"
-                                                for stdin, and <path> could be "-"
-                                                for stdout
-    import-diff <path> <image-spec>             import an incremental diff from
-                                                path or "-" for stdin
-    (cp | copy) (<src-image-spec> | <src-snap-spec>) <dest-image-spec>
-                                                copy src image to dest
-    (mv | rename) <src-image-spec> <dest-image-spec>
-                                                rename src image to dest
-    image-meta list <image-spec>                image metadata list keys with values
-    image-meta get <image-spec> <key>           image metadata get the value associated with the key
-    image-meta set <image-spec> <key> <value>   image metadata set key with value
-    image-meta remove <image-spec> <key>        image metadata remove the key and value associated
-    object-map rebuild <image-spec> | <snap-spec>
-                                                rebuild an invalid object map
-    snap ls <image-spec>                        dump list of image snapshots
-    snap create <snap-spec>                     create a snapshot
-    snap rollback <snap-spec>                   rollback image to snapshot
-    snap rm <snap-spec>                         deletes a snapshot
-    snap purge <image-spec>                     deletes all snapshots
-    snap protect <snap-spec>                    prevent a snapshot from being deleted
-    snap unprotect <snap-spec>                  allow a snapshot to be deleted
-    watch <image-spec>                          watch events on image
-    status <image-spec>                         show the status of this image
-    map <image-spec> | <snap-spec>              map image to a block device
-                                                using the kernel
-    unmap <image-spec> | <snap-spec> | <device> unmap a rbd device that was
-                                                mapped by the kernel
-    showmapped                                  show the rbd images mapped
-                                                by the kernel
-    feature disable <image-spec> <feature>      disable the specified image feature
-    feature enable <image-spec> <feature>       enable the specified image feature
-    lock list <image-spec>                      show locks held on an image
-    lock add <image-spec> <id> [--shared <tag>] take a lock called id on an image
-    lock remove <image-spec> <id> <locker>      release a lock on an image
-    bench-write <image-spec>                    simple write benchmark
-                 --io-size <size in B/K/M/G/T>    write size
-                 --io-threads <num>               ios in flight
-                 --io-total <size in B/K/M/G/T>   total size to write
-                 --io-pattern <seq|rand>          write pattern
-  
-  <image-spec> is [<pool-name>]/<image-name>,
-  <snap-spec> is [<pool-name>]/<image-name>@<snap-name>,
-  or you may specify individual pieces of names with -p/--pool <pool-name>,
-  --image <image-name> and/or --snap <snap-name>.
-  
-  Other input options:
-    -p, --pool <pool-name>             source pool name
-    --dest-pool <pool-name>            destination pool name
-    --image <image-name>               image name
-    --dest <image-name>                destination image name
-    --snap <snap-name>                 snapshot name
-    --path <path-name>                 path name for import/export
-    -s, --size <size in M/G/T>         size of image for create and resize
-    --order <bits>                     the object size in bits; object size will be
-                                       (1 << order) bytes. Default is 22 (4 MB).
-    --image-format <format-number>     format to use when creating an image
-                                       format 1 is the original format
-                                       format 2 supports cloning (default)
-    --image-feature <feature>          optional format 2 feature to enable.
-                                       use multiple times to enable multiple features
-    --image-shared                     image will be used concurrently (disables
-                                       RBD exclusive lock and dependent features)
-    --stripe-unit <size in B/K/M>      size of a block of data
-    --stripe-count <num>               number of consecutive objects in a stripe
-    --id <username>                    rados user (without 'client.'prefix) to
-                                       authenticate as
-    --keyfile <path>                   file containing secret key for use with cephx
-    --keyring <path>                   file containing keyring for use with cephx
-    --shared <tag>                     take a shared (rather than exclusive) lock
-    --format <output-format>           output format (default: plain, json, xml)
-    --pretty-format                    make json or xml output more readable
-    --no-progress                      do not show progress for long-running commands
-    -o, --options <map-options>        options to use when mapping an image
-    --read-only                        set device readonly when mapping image
-    --allow-shrink                     allow shrinking of an image when resizing
-  
-  Supported image features:
-    layering (+), striping (+), exclusive-lock (*), object-map (*), fast-diff (*), deep-flatten
+  usage: rbd <command> ...
   
+  Command-line interface for managing Ceph RBD images.
+  
+  Positional arguments:
+    <command>
+      bench-write                 Simple write benchmark.
+      children                    Display children of snapshot.
+      clone                       Clone a snapshot into a COW child image.
+      copy (cp)                   Copy src image to dest.
+      create                      Create an empty image.
+      diff                        Print extents that differ since a previous
+                                  snap, or image creation.
+      disk-usage (du)             Show disk usage stats for pool, image or
+                                  snapshot
+      export                      Export image to file.
+      export-diff                 Export incremental diff to file.
+      feature disable             Disable the specified image feature.
+      feature enable              Enable the specified image feature.
+      flatten                     Fill clone with parent data (make it
+                                  independent).
+      image-meta get              Image metadata get the value associated with
+                                  the key.
+      image-meta list             Image metadata list keys with values.
+      image-meta remove           Image metadata remove the key and value
+                                  associated.
+      image-meta set              Image metadata set key with value.
+      import                      Import image from file.
+      import-diff                 Import an incremental diff.
+      info                        Show information about image size, striping,
+                                  etc.
+      list (ls)                   List rbd images.
+      lock add                    Take a lock on an image.
+      lock list (lock ls)         Show locks held on an image.
+      lock remove (lock rm)       Release a lock on an image.
+      map                         Map image to a block device using the kernel.
+      merge-diff                  Merge two diff exports together.
+      object-map rebuild          Rebuild an invalid object map.
+      remove (rm)                 Delete an image.
+      rename (mv)                 Rename image within pool.
+      resize                      Resize (expand or shrink) image.
+      showmapped                  Show the rbd images mapped by the kernel.
+      snap create (snap add)      Create a snapshot.
+      snap list (snap ls)         Dump list of image snapshots.
+      snap protect                Prevent a snapshot from being deleted.
+      snap purge                  Deletes all snapshots.
+      snap remove (snap rm)       Deletes a snapshot.
+      snap rename                 Rename a snapshot.
+      snap rollback (snap revert) Rollback image to snapshot.
+      snap unprotect              Allow a snapshot to be deleted.
+      status                      Show the status of this image.
+      unmap                       Unmap a rbd device that was used by the kernel.
+      watch                       Watch events on image.
+  
+  Optional arguments:
+    -c [ --conf ] arg     path to cluster configuration
+    --cluster arg         cluster name
+    --id arg              client id (without 'client.' prefix)
+    --user arg            client id (without 'client.' prefix)
+    -n [ --name ] arg     client name
+    -m [ --mon_host ] arg monitor host
+    --secret arg          path to secret key (deprecated)
+    -K [ --keyfile ] arg  path to secret key
+    -k [ --keyring ] arg  path to keyring
+  
+  See 'rbd help <command>' for help on a specific command.
+  $ rbd help | grep '^    [a-z]' | sed 's/^    \([a-z -]*[a-z]\).*/\1/g' | while read -r line; do echo rbd help $line ; rbd help $line; done
+  rbd help bench-write
+  usage: rbd bench-write [--pool <pool>] [--image <image>] [--io-size <io-size>] 
+                         [--io-threads <io-threads>] [--io-total <io-total>] 
+                         [--io-pattern <io-pattern>] 
+                         <image-spec> 
+  
+  Simple write benchmark.
+  
+  Positional arguments
+    <image-spec>         image specification
+                         (example: [<pool-name>/]<image-name>)
+  
+  Optional arguments
+    -p [ --pool ] arg    pool name
+    --image arg          image name
+    --io-size arg        write size (in B/K/M/G/T)
+    --io-threads arg     ios in flight
+    --io-total arg       total size to write (in B/K/M/G/T)
+    --io-pattern arg     write pattern (rand or seq)
+  
+  rbd help children
+  usage: rbd children [--pool <pool>] [--image <image>] [--snap <snap>] 
+                      [--format <format>] [--pretty-format] 
+                      <snap-spec> 
+  
+  Display children of snapshot.
+  
+  Positional arguments
+    <snap-spec>          snapshot specification
+                         (example: [<pool-name>/]<image-name>@<snapshot-name>)
+  
+  Optional arguments
+    -p [ --pool ] arg    pool name
+    --image arg          image name
+    --snap arg           snapshot name
+    --format arg         output format [plain, json, or xml]
+    --pretty-format      pretty formatting (json and xml)
+  
+  rbd help clone
+  usage: rbd clone [--pool <pool>] [--image <image>] [--snap <snap>] 
+                   [--dest-pool <dest-pool>] [--dest <dest>] [--order <order>] 
+                   [--image-feature <image-feature>] [--image-shared] 
+                   [--stripe-unit <stripe-unit>] [--stripe-count <stripe-count>] 
+                   <source-snap-spec> <dest-image-spec> 
+  
+  Clone a snapshot into a COW child image.
+  
+  Positional arguments
+    <source-snap-spec>   source snapshot specification
+                         (example: [<pool-name>/]<image-name>@<snapshot-name>)
+    <dest-image-spec>    destination image specification
+                         (example: [<pool-name>/]<image-name>)
+  
+  Optional arguments
+    -p [ --pool ] arg    source pool name
+    --image arg          source image name
+    --snap arg           source snapshot name
+    --dest-pool arg      destination pool name
+    --dest arg           destination image name
+    --order arg          object order [12 <= order <= 25]
+    --image-feature arg  image features
+                         [layering(+), striping(+), exclusive-lock(*),
+                         object-map(*), fast-diff(*), deep-flatten, journaling(*)]
+    --image-shared       shared image
+    --stripe-unit arg    stripe unit
+    --stripe-count arg   stripe count
+  
+  Image Features:
+    (*) supports enabling/disabling on existing images
+    (+) enabled by default for new images if features not specified
+  
+  rbd help copy
+  usage: rbd copy [--pool <pool>] [--image <image>] [--snap <snap>] 
+                  [--dest-pool <dest-pool>] [--dest <dest>] [--no-progress] 
+                  <source-image-or-snap-spec> <dest-image-spec> 
+  
+  Copy src image to dest.
+  
+  Positional arguments
+    <source-image-or-snap-spec>  source image or snapshot specification
+                                 (example:
+                                 [<pool-name>/]<image-name>[@<snap-name>])
+    <dest-image-spec>            destination image specification
+                                 (example: [<pool-name>/]<image-name>)
+  
+  Optional arguments
+    -p [ --pool ] arg            source pool name
+    --image arg                  source image name
+    --snap arg                   source snapshot name
+    --dest-pool arg              destination pool name
+    --dest arg                   destination image name
+    --no-progress                disable progress output
+  
+  rbd help create
+  usage: rbd create [--pool <pool>] [--image <image>] 
+                    [--image-format <image-format>] [--new-format] 
+                    [--order <order>] [--image-feature <image-feature>] 
+                    [--image-shared] [--stripe-unit <stripe-unit>] 
+                    [--stripe-count <stripe-count>] --size <size> 
+                    <image-spec> 
+  
+  Create an empty image.
+  
+  Positional arguments
+    <image-spec>         image specification
+                         (example: [<pool-name>/]<image-name>)
+  
+  Optional arguments
+    -p [ --pool ] arg    pool name
+    --image arg          image name
+    --image-format arg   image format [1 or 2]
+    --new-format         use image format 2
+                         (deprecated)
+    --order arg          object order [12 <= order <= 25]
+    --image-feature arg  image features
+                         [layering(+), striping(+), exclusive-lock(*),
+                         object-map(*), fast-diff(*), deep-flatten, journaling(*)]
+    --image-shared       shared image
+    --stripe-unit arg    stripe unit
+    --stripe-count arg   stripe count
+    -s [ --size ] arg    image size (in M/G/T)
+  
+  Image Features:
+    (*) supports enabling/disabling on existing images
+    (+) enabled by default for new images if features not specified
+  
+  rbd help diff
+  usage: rbd diff [--pool <pool>] [--image <image>] [--snap <snap>] 
+                  [--from-snap <from-snap>] [--whole-object] [--format <format>] 
+                  [--pretty-format] 
+                  <image-or-snap-spec> 
+  
+  Print extents that differ since a previous snap, or image creation.
+  
+  Positional arguments
+    <image-or-snap-spec>  image or snapshot specification
+                          (example: [<pool-name>/]<image-name>[@<snap-name>])
+  
+  Optional arguments
+    -p [ --pool ] arg     pool name
+    --image arg           image name
+    --snap arg            snapshot name
+    --from-snap arg       snapshot starting point
+    --whole-object        compare whole object
+    --format arg          output format [plain, json, or xml]
+    --pretty-format       pretty formatting (json and xml)
+  
+  rbd help disk-usage
+  usage: rbd disk-usage [--pool <pool>] [--image <image>] [--snap <snap>] 
+                        [--format <format>] [--pretty-format] 
+                        <image-or-snap-spec> 
+  
+  Show disk usage stats for pool, image or snapshot
+  
+  Positional arguments
+    <image-or-snap-spec>  image or snapshot specification
+                          (example: [<pool-name>/]<image-name>[@<snap-name>])
+  
+  Optional arguments
+    -p [ --pool ] arg     pool name
+    --image arg           image name
+    --snap arg            snapshot name
+    --format arg          output format [plain, json, or xml]
+    --pretty-format       pretty formatting (json and xml)
+  
+  rbd help export
+  usage: rbd export [--pool <pool>] [--image <image>] [--snap <snap>] 
+                    [--path <path>] [--no-progress] 
+                    <source-image-or-snap-spec> <path-name> 
+  
+  Export image to file.
+  
+  Positional arguments
+    <source-image-or-snap-spec>  source image or snapshot specification
+                                 (example:
+                                 [<pool-name>/]<image-name>[@<snap-name>])
+    <path-name>                  export file (or '-' for stdout)
+  
+  Optional arguments
+    -p [ --pool ] arg            source pool name
+    --image arg                  source image name
+    --snap arg                   source snapshot name
+    --path arg                   export file (or '-' for stdout)
+    --no-progress                disable progress output
+  
+  rbd help export-diff
+  usage: rbd export-diff [--pool <pool>] [--image <image>] [--snap <snap>] 
+                         [--path <path>] [--from-snap <from-snap>] 
+                         [--whole-object] [--no-progress] 
+                         <source-image-or-snap-spec> <path-name> 
+  
+  Export incremental diff to file.
+  
+  Positional arguments
+    <source-image-or-snap-spec>  source image or snapshot specification
+                                 (example:
+                                 [<pool-name>/]<image-name>[@<snap-name>])
+    <path-name>                  export file (or '-' for stdout)
+  
+  Optional arguments
+    -p [ --pool ] arg            source pool name
+    --image arg                  source image name
+    --snap arg                   source snapshot name
+    --path arg                   export file (or '-' for stdout)
+    --from-snap arg              snapshot starting point
+    --whole-object               compare whole object
+    --no-progress                disable progress output
+  
+  rbd help feature disable
+  usage: rbd feature disable [--pool <pool>] [--image <image>] 
+                             <image-spec> <features> [<features> ...]
+  
+  Disable the specified image feature.
+  
+  Positional arguments
+    <image-spec>         image specification
+                         (example: [<pool-name>/]<image-name>)
+    <features>           image features
+                         [layering, striping, exclusive-lock, object-map,
+                         fast-diff, deep-flatten, journaling]
+  
+  Optional arguments
+    -p [ --pool ] arg    pool name
+    --image arg          image name
+  
+  rbd help feature enable
+  usage: rbd feature enable [--pool <pool>] [--image <image>] 
+                            <image-spec> <features> [<features> ...]
+  
+  Enable the specified image feature.
+  
+  Positional arguments
+    <image-spec>         image specification
+                         (example: [<pool-name>/]<image-name>)
+    <features>           image features
+                         [layering, striping, exclusive-lock, object-map,
+                         fast-diff, deep-flatten, journaling]
+  
+  Optional arguments
+    -p [ --pool ] arg    pool name
+    --image arg          image name
+  
+  rbd help flatten
+  usage: rbd flatten [--pool <pool>] [--image <image>] [--no-progress] 
+                     <image-spec> 
+  
+  Fill clone with parent data (make it independent).
+  
+  Positional arguments
+    <image-spec>         image specification
+                         (example: [<pool-name>/]<image-name>)
+  
+  Optional arguments
+    -p [ --pool ] arg    pool name
+    --image arg          image name
+    --no-progress        disable progress output
+  
+  rbd help image-meta get
+  usage: rbd image-meta get [--pool <pool>] [--image <image>] 
+                            <image-spec> <key> 
+  
+  Image metadata get the value associated with the key.
+  
+  Positional arguments
+    <image-spec>         image specification
+                         (example: [<pool-name>/]<image-name>)
+    <key>                image meta key
+  
+  Optional arguments
+    -p [ --pool ] arg    pool name
+    --image arg          image name
+  
+  rbd help image-meta list
+  usage: rbd image-meta list [--pool <pool>] [--image <image>] 
+                             [--format <format>] [--pretty-format] 
+                             <image-spec> 
+  
+  Image metadata list keys with values.
+  
+  Positional arguments
+    <image-spec>         image specification
+                         (example: [<pool-name>/]<image-name>)
+  
+  Optional arguments
+    -p [ --pool ] arg    pool name
+    --image arg          image name
+    --format arg         output format [plain, json, or xml]
+    --pretty-format      pretty formatting (json and xml)
+  
+  rbd help image-meta remove
+  usage: rbd image-meta remove [--pool <pool>] [--image <image>] 
+                               <image-spec> <key> 
+  
+  Image metadata remove the key and value associated.
+  
+  Positional arguments
+    <image-spec>         image specification
+                         (example: [<pool-name>/]<image-name>)
+    <key>                image meta key
+  
+  Optional arguments
+    -p [ --pool ] arg    pool name
+    --image arg          image name
+  
+  rbd help image-meta set
+  usage: rbd image-meta set [--pool <pool>] [--image <image>] 
+                            <image-spec> <key> <value> 
+  
+  Image metadata set key with value.
+  
+  Positional arguments
+    <image-spec>         image specification
+                         (example: [<pool-name>/]<image-name>)
+    <key>                image meta key
+    <value>              image meta value
+  
+  Optional arguments
+    -p [ --pool ] arg    pool name
+    --image arg          image name
+  
+  rbd help import
+  usage: rbd import [--path <path>] [--dest-pool <dest-pool>] [--dest <dest>] 
+                    [--image-format <image-format>] [--new-format] 
+                    [--order <order>] [--image-feature <image-feature>] 
+                    [--image-shared] [--stripe-unit <stripe-unit>] 
+                    [--stripe-count <stripe-count>] [--no-progress] 
+                    [--pool <pool>] [--image <image>] 
+                    <path-name> <dest-image-spec> 
+  
+  Import image from file.
+  
+  Positional arguments
+    <path-name>          import file (or '-' for stdin)
+    <dest-image-spec>    destination image specification
+                         (example: [<pool-name>/]<image-name>)
+  
+  Optional arguments
+    --path arg           import file (or '-' for stdin)
+    --dest-pool arg      destination pool name
+    --dest arg           destination image name
+    --image-format arg   image format [1 or 2]
+    --new-format         use image format 2
+                         (deprecated)
+    --order arg          object order [12 <= order <= 25]
+    --image-feature arg  image features
+                         [layering(+), striping(+), exclusive-lock(*),
+                         object-map(*), fast-diff(*), deep-flatten, journaling(*)]
+    --image-shared       shared image
+    --stripe-unit arg    stripe unit
+    --stripe-count arg   stripe count
+    --no-progress        disable progress output
+    -p [ --pool ] arg    pool name (deprecated)
+    --image arg          image name (deprecated)
+  
+  Image Features:
     (*) supports enabling/disabling on existing images
-    (+) enabled by default for new images if features are not specified
+    (+) enabled by default for new images if features not specified
+  
+  rbd help import-diff
+  usage: rbd import-diff [--path <path>] [--pool <pool>] [--image <image>] 
+                         [--no-progress] 
+                         <path-name> <image-spec> 
+  
+  Import an incremental diff.
+  
+  Positional arguments
+    <path-name>          import file (or '-' for stdin)
+    <image-spec>         image specification
+                         (example: [<pool-name>/]<image-name>)
+  
+  Optional arguments
+    --path arg           import file (or '-' for stdin)
+    -p [ --pool ] arg    pool name
+    --image arg          image name
+    --no-progress        disable progress output
+  
+  rbd help info
+  usage: rbd info [--pool <pool>] [--image <image>] [--snap <snap>] 
+                  [--format <format>] [--pretty-format] 
+                  <image-or-snap-spec> 
+  
+  Show information about image size, striping, etc.
+  
+  Positional arguments
+    <image-or-snap-spec>  image or snapshot specification
+                          (example: [<pool-name>/]<image-name>[@<snap-name>])
+  
+  Optional arguments
+    -p [ --pool ] arg     pool name
+    --image arg           image name
+    --snap arg            snapshot name
+    --format arg          output format [plain, json, or xml]
+    --pretty-format       pretty formatting (json and xml)
+  
+  rbd help list
+  usage: rbd list [--long] [--pool <pool>] [--format <format>] [--pretty-format] 
+                  <pool-name> 
+  
+  List rbd images.
+  
+  Positional arguments
+    <pool-name>          pool name
+  
+  Optional arguments
+    -l [ --long ]        long listing format
+    -p [ --pool ] arg    pool name
+    --format arg         output format [plain, json, or xml]
+    --pretty-format      pretty formatting (json and xml)
+  
+  rbd help lock add
+  usage: rbd lock add [--pool <pool>] [--image <image>] [--shared <shared>] 
+                      <image-spec> <lock-id> 
+  
+  Take a lock on an image.
+  
+  Positional arguments
+    <image-spec>         image specification
+                         (example: [<pool-name>/]<image-name>)
+    <lock-id>            unique lock id
+  
+  Optional arguments
+    -p [ --pool ] arg    pool name
+    --image arg          image name
+    --shared arg         shared lock tag
+  
+  rbd help lock list
+  usage: rbd lock list [--pool <pool>] [--image <image>] [--format <format>] 
+                       [--pretty-format] 
+                       <image-spec> 
+  
+  Show locks held on an image.
+  
+  Positional arguments
+    <image-spec>         image specification
+                         (example: [<pool-name>/]<image-name>)
+  
+  Optional arguments
+    -p [ --pool ] arg    pool name
+    --image arg          image name
+    --format arg         output format [plain, json, or xml]
+    --pretty-format      pretty formatting (json and xml)
+  
+  rbd help lock remove
+  usage: rbd lock remove [--pool <pool>] [--image <image>] 
+                         <image-spec> <lock-id> <locker> 
+  
+  Release a lock on an image.
+  
+  Positional arguments
+    <image-spec>         image specification
+                         (example: [<pool-name>/]<image-name>)
+    <lock-id>            unique lock id
+    <locker>             locker client
+  
+  Optional arguments
+    -p [ --pool ] arg    pool name
+    --image arg          image name
+  
+  rbd help map
+  usage: rbd map [--pool <pool>] [--image <image>] [--snap <snap>] 
+                 [--options <options>] [--read-only] 
+                 <image-or-snap-spec> 
+  
+  Map image to a block device using the kernel.
+  
+  Positional arguments
+    <image-or-snap-spec>  image or snapshot specification
+                          (example: [<pool-name>/]<image-name>[@<snap-name>])
+  
+  Optional arguments
+    -p [ --pool ] arg     pool name
+    --image arg           image name
+    --snap arg            snapshot name
+    -o [ --options ] arg  mapping options
+    --read-only           mount read-only
+  
+  rbd help merge-diff
+  usage: rbd merge-diff [--path <path>] [--no-progress] 
+                        <diff1-path> <diff2-path> <path-name> 
+  
+  Merge two diff exports together.
+  
+  Positional arguments
+    <diff1-path>         path to first diff (or '-' for stdin)
+    <diff2-path>         path to second diff
+    <path-name>          path to merged diff (or '-' for stdout)
+  
+  Optional arguments
+    --path arg           path to merged diff (or '-' for stdout)
+    --no-progress        disable progress output
+  
+  rbd help object-map rebuild
+  usage: rbd object-map rebuild [--pool <pool>] [--image <image>] 
+                                [--snap <snap>] [--no-progress] 
+                                <image-or-snap-spec> 
+  
+  Rebuild an invalid object map.
+  
+  Positional arguments
+    <image-or-snap-spec>  image or snapshot specification
+                          (example: [<pool-name>/]<image-name>[@<snap-name>])
+  
+  Optional arguments
+    -p [ --pool ] arg     pool name
+    --image arg           image name
+    --snap arg            snapshot name
+    --no-progress         disable progress output
+  
+  rbd help remove
+  usage: rbd remove [--pool <pool>] [--image <image>] [--no-progress] 
+                    <image-spec> 
+  
+  Delete an image.
+  
+  Positional arguments
+    <image-spec>         image specification
+                         (example: [<pool-name>/]<image-name>)
+  
+  Optional arguments
+    -p [ --pool ] arg    pool name
+    --image arg          image name
+    --no-progress        disable progress output
+  
+  rbd help rename
+  usage: rbd rename [--pool <pool>] [--image <image>] [--dest-pool <dest-pool>] 
+                    [--dest <dest>] 
+                    <source-image-spec> <dest-image-spec> 
+  
+  Rename image within pool.
+  
+  Positional arguments
+    <source-image-spec>  source image specification
+                         (example: [<pool-name>/]<image-name>)
+    <dest-image-spec>    destination image specification
+                         (example: [<pool-name>/]<image-name>)
+  
+  Optional arguments
+    -p [ --pool ] arg    source pool name
+    --image arg          source image name
+    --dest-pool arg      destination pool name
+    --dest arg           destination image name
+  
+  rbd help resize
+  usage: rbd resize [--pool <pool>] [--image <image>] --size <size> 
+                    [--allow-shrink] [--no-progress] 
+                    <image-spec> 
+  
+  Resize (expand or shrink) image.
+  
+  Positional arguments
+    <image-spec>         image specification
+                         (example: [<pool-name>/]<image-name>)
+  
+  Optional arguments
+    -p [ --pool ] arg    pool name
+    --image arg          image name
+    -s [ --size ] arg    image size (in M/G/T)
+    --allow-shrink       permit shrinking
+    --no-progress        disable progress output
+  
+  rbd help showmapped
+  usage: rbd showmapped [--format <format>] [--pretty-format] 
+  
+  Show the rbd images mapped by the kernel.
+  
+  Optional arguments
+    --format arg         output format [plain, json, or xml]
+    --pretty-format      pretty formatting (json and xml)
+  
+  rbd help snap create
+  usage: rbd snap create [--pool <pool>] [--image <image>] [--snap <snap>] 
+                         <snap-spec> 
+  
+  Create a snapshot.
+  
+  Positional arguments
+    <snap-spec>          snapshot specification
+                         (example: [<pool-name>/]<image-name>@<snapshot-name>)
+  
+  Optional arguments
+    -p [ --pool ] arg    pool name
+    --image arg          image name
+    --snap arg           snapshot name
+  
+  rbd help snap list
+  usage: rbd snap list [--pool <pool>] [--image <image>] [--format <format>] 
+                       [--pretty-format] 
+                       <image-spec> 
+  
+  Dump list of image snapshots.
+  
+  Positional arguments
+    <image-spec>         image specification
+                         (example: [<pool-name>/]<image-name>)
+  
+  Optional arguments
+    -p [ --pool ] arg    pool name
+    --image arg          image name
+    --format arg         output format [plain, json, or xml]
+    --pretty-format      pretty formatting (json and xml)
+  
+  rbd help snap protect
+  usage: rbd snap protect [--pool <pool>] [--image <image>] [--snap <snap>] 
+                          <snap-spec> 
+  
+  Prevent a snapshot from being deleted.
+  
+  Positional arguments
+    <snap-spec>          snapshot specification
+                         (example: [<pool-name>/]<image-name>@<snapshot-name>)
+  
+  Optional arguments
+    -p [ --pool ] arg    pool name
+    --image arg          image name
+    --snap arg           snapshot name
+  
+  rbd help snap purge
+  usage: rbd snap purge [--pool <pool>] [--image <image>] [--no-progress] 
+                        <image-spec> 
+  
+  Deletes all snapshots.
+  
+  Positional arguments
+    <image-spec>         image specification
+                         (example: [<pool-name>/]<image-name>)
+  
+  Optional arguments
+    -p [ --pool ] arg    pool name
+    --image arg          image name
+    --no-progress        disable progress output
+  
+  rbd help snap remove
+  usage: rbd snap remove [--pool <pool>] [--image <image>] [--snap <snap>] 
+                         <snap-spec> 
+  
+  Deletes a snapshot.
+  
+  Positional arguments
+    <snap-spec>          snapshot specification
+                         (example: [<pool-name>/]<image-name>@<snapshot-name>)
+  
+  Optional arguments
+    -p [ --pool ] arg    pool name
+    --image arg          image name
+    --snap arg           snapshot name
+  
+  rbd help snap rename
+  usage: rbd snap rename [--pool <pool>] [--image <image>] [--snap <snap>] 
+                         [--dest-pool <dest-pool>] [--dest <dest>] 
+                         <source-snap-spec> <dest-snap-spec> 
+  
+  Rename a snapshot.
+  
+  Positional arguments
+    <source-snap-spec>   source snapshot specification
+                         (example: [<pool-name>/]<image-name>@<snapshot-name>)
+    <dest-snap-spec>     destination snapshot specification
+                         (example: [<pool-name>/]<image-name>@<snapshot-name>)
+  
+  Optional arguments
+    -p [ --pool ] arg    source pool name
+    --image arg          source image name
+    --snap arg           source snapshot name
+    --dest-pool arg      destination pool name
+    --dest arg           destination image name
+  
+  rbd help snap rollback
+  usage: rbd snap rollback [--pool <pool>] [--image <image>] [--snap <snap>] 
+                           [--no-progress] 
+                           <snap-spec> 
+  
+  Rollback image to snapshot.
+  
+  Positional arguments
+    <snap-spec>          snapshot specification
+                         (example: [<pool-name>/]<image-name>@<snapshot-name>)
+  
+  Optional arguments
+    -p [ --pool ] arg    pool name
+    --image arg          image name
+    --snap arg           snapshot name
+    --no-progress        disable progress output
+  
+  rbd help snap unprotect
+  usage: rbd snap unprotect [--pool <pool>] [--image <image>] [--snap <snap>] 
+                            <snap-spec> 
+  
+  Allow a snapshot to be deleted.
+  
+  Positional arguments
+    <snap-spec>          snapshot specification
+                         (example: [<pool-name>/]<image-name>@<snapshot-name>)
+  
+  Optional arguments
+    -p [ --pool ] arg    pool name
+    --image arg          image name
+    --snap arg           snapshot name
+  
+  rbd help status
+  usage: rbd status [--pool <pool>] [--image <image>] [--format <format>] 
+                    [--pretty-format] 
+                    <image-spec> 
+  
+  Show the status of this image.
+  
+  Positional arguments
+    <image-spec>         image specification
+                         (example: [<pool-name>/]<image-name>)
+  
+  Optional arguments
+    -p [ --pool ] arg    pool name
+    --image arg          image name
+    --format arg         output format [plain, json, or xml]
+    --pretty-format      pretty formatting (json and xml)
+  
+  rbd help unmap
+  usage: rbd unmap [--pool <pool>] [--image <image>] [--snap <snap>] 
+                   <image-or-snap-or-device-spec> 
+  
+  Unmap a rbd device that was used by the kernel.
+  
+  Positional arguments
+    <image-or-snap-or-device-spec>  image, snapshot, or device specification
+                                    [<pool-name>/]<image-name>[@<snapshot-name>]
+                                    or <device-path>
+  
+  Optional arguments
+    -p [ --pool ] arg               pool name
+    --image arg                     image name
+    --snap arg                      snapshot name
+  
+  rbd help watch
+  usage: rbd watch [--pool <pool>] [--image <image>] 
+                   <image-spec> 
+  
+  Watch events on image.
+  
+  Positional arguments
+    <image-spec>         image specification
+                         (example: [<pool-name>/]<image-name>)
+  
+  Optional arguments
+    -p [ --pool ] arg    pool name
+    --image arg          image name
+  
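
The shell pipeline near the top of this hunk scrapes the command names out
of "rbd help" and then requests each command's own help page. A rough
Python equivalent of that grep/sed loop, assuming rbd is on PATH:

    import re
    from subprocess import check_output

    # Mirrors: rbd help | grep '^    [a-z]' | sed 's/^    \([a-z -]*[a-z]\).*/\1/g'
    for line in check_output(["rbd", "help"]).splitlines():
        m = re.match(r"^    ([a-z][a-z -]*[a-z])", line)
        if m:
            print "rbd help " + m.group(1)
            print check_output(["rbd", "help"] + m.group(1).split())
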
diff --git a/src/test/cli/rbd/invalid-snap-usage.t b/src/test/cli/rbd/invalid-snap-usage.t
index 6735dc4..9b8c594 100644
--- a/src/test/cli/rbd/invalid-snap-usage.t
+++ b/src/test/cli/rbd/invalid-snap-usage.t
@@ -1,109 +1,109 @@
  $ rbd create foo@snap
   rbd: snapname specified for a command that doesn't use it
-  [1]
+  [22]
  $ rbd flatten foo@snap
   rbd: snapname specified for a command that doesn't use it
-  [1]
+  [22]
  $ rbd resize foo@snap
   rbd: snapname specified for a command that doesn't use it
-  [1]
+  [22]
  $ rbd rm foo@snap
   rbd: snapname specified for a command that doesn't use it
-  [1]
+  [22]
  $ rbd import-diff /tmp/diff foo@snap
   rbd: snapname specified for a command that doesn't use it
-  [1]
+  [22]
  $ rbd mv foo@snap
   rbd: snapname specified for a command that doesn't use it
-  [1]
+  [22]
  $ rbd mv foo@snap bar
   rbd: snapname specified for a command that doesn't use it
-  [1]
+  [22]
  $ rbd mv foo@snap bar@snap
   rbd: snapname specified for a command that doesn't use it
-  [1]
+  [22]
  $ rbd image-meta list foo@snap
   rbd: snapname specified for a command that doesn't use it
-  [1]
+  [22]
  $ rbd image-meta get foo@snap
   rbd: snapname specified for a command that doesn't use it
-  [1]
+  [22]
  $ rbd image-meta get foo@snap key
   rbd: snapname specified for a command that doesn't use it
-  [1]
+  [22]
  $ rbd image-meta set foo@snap
   rbd: snapname specified for a command that doesn't use it
-  [1]
+  [22]
  $ rbd image-meta set foo@snap key
   rbd: snapname specified for a command that doesn't use it
-  [1]
+  [22]
  $ rbd image-meta set foo@snap key val
   rbd: snapname specified for a command that doesn't use it
-  [1]
+  [22]
  $ rbd image-meta remove foo@snap
   rbd: snapname specified for a command that doesn't use it
-  [1]
+  [22]
  $ rbd image-meta remove foo@snap key
   rbd: snapname specified for a command that doesn't use it
-  [1]
+  [22]
  $ rbd snap ls foo@snap
   rbd: snapname specified for a command that doesn't use it
-  [1]
+  [22]
  $ rbd snap purge foo@snap
   rbd: snapname specified for a command that doesn't use it
-  [1]
+  [22]
  $ rbd watch foo@snap
   rbd: snapname specified for a command that doesn't use it
-  [1]
+  [22]
  $ rbd status foo@snap
   rbd: snapname specified for a command that doesn't use it
-  [1]
+  [22]
  $ rbd feature disable foo@snap
   rbd: snapname specified for a command that doesn't use it
-  [1]
+  [22]
  $ rbd feature disable foo@snap layering
   rbd: snapname specified for a command that doesn't use it
-  [1]
+  [22]
  $ rbd feature enable foo@snap
   rbd: snapname specified for a command that doesn't use it
-  [1]
+  [22]
  $ rbd feature enable foo@snap layering
   rbd: snapname specified for a command that doesn't use it
-  [1]
+  [22]
  $ rbd lock list foo@snap
   rbd: snapname specified for a command that doesn't use it
-  [1]
+  [22]
  $ rbd lock add foo@snap
   rbd: snapname specified for a command that doesn't use it
-  [1]
+  [22]
  $ rbd lock add foo@snap id
   rbd: snapname specified for a command that doesn't use it
-  [1]
+  [22]
  $ rbd lock remove foo@snap
   rbd: snapname specified for a command that doesn't use it
-  [1]
+  [22]
  $ rbd lock remove foo@snap id
   rbd: snapname specified for a command that doesn't use it
-  [1]
+  [22]
  $ rbd lock remove foo@snap id client.1234
   rbd: snapname specified for a command that doesn't use it
-  [1]
+  [22]
  $ rbd bench-write foo@snap
   rbd: snapname specified for a command that doesn't use it
-  [1]
+  [22]
 
  $ rbd clone foo@snap bar@snap
   rbd: destination snapname specified for a command that doesn't use it
-  [1]
+  [22]
  $ rbd import /bin/ls ls@snap
   rbd: destination snapname specified for a command that doesn't use it
-  [1]
+  [22]
  $ rbd cp foo bar@snap
   rbd: destination snapname specified for a command that doesn't use it
-  [1]
+  [22]
  $ rbd cp foo@snap bar@snap
   rbd: destination snapname specified for a command that doesn't use it
-  [1]
+  [22]
  $ rbd mv foo bar@snap
   rbd: destination snapname specified for a command that doesn't use it
-  [1]
+  [22]
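
Every expected exit status in invalid-snap-usage.t moves from 1 to 22,
which is errno EINVAL on Linux: the reworked CLI reports invalid-argument
failures with an errno-style status instead of a generic 1. The mapping can
be confirmed from the standard library:

    import errno, os

    assert errno.EINVAL == 22
    print os.strerror(errno.EINVAL)  # "Invalid argument"
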
diff --git a/src/test/cli/rbd/not-enough-args.t b/src/test/cli/rbd/not-enough-args.t
index df4fbd7..10283a3 100644
--- a/src/test/cli/rbd/not-enough-args.t
+++ b/src/test/cli/rbd/not-enough-args.t
@@ -1,192 +1,192 @@
   $ rbd info
   rbd: image name was not specified
-  [1]
+  [22]
   $ rbd create
   rbd: image name was not specified
-  [1]
+  [22]
   $ rbd clone
   rbd: image name was not specified
-  [1]
+  [22]
   $ rbd clone foo
   rbd: snap name was not specified
-  [1]
+  [22]
  $ rbd clone foo@snap
   rbd: destination image name was not specified
-  [1]
+  [22]
   $ rbd clone foo bar
   rbd: snap name was not specified
-  [1]
+  [22]
  $ rbd clone foo bar@snap
   rbd: snap name was not specified
-  [1]
+  [22]
   $ rbd children
   rbd: image name was not specified
-  [1]
+  [22]
   $ rbd children foo
   rbd: snap name was not specified
-  [1]
+  [22]
   $ rbd flatten
   rbd: image name was not specified
-  [1]
+  [22]
   $ rbd resize
   rbd: image name was not specified
-  [1]
+  [22]
   $ rbd rm
   rbd: image name was not specified
-  [1]
+  [22]
   $ rbd export
   rbd: image name was not specified
-  [1]
+  [22]
   $ rbd import
   rbd: path was not specified
-  [1]
+  [22]
   $ rbd diff
   rbd: image name was not specified
-  [1]
+  [22]
   $ rbd export-diff
   rbd: image name was not specified
-  [1]
+  [22]
   $ rbd export-diff foo
   rbd: path was not specified
-  [1]
+  [22]
  $ rbd export-diff foo@snap
   rbd: path was not specified
-  [1]
+  [22]
   $ rbd merge-diff
   rbd: first diff was not specified
-  [1]
+  [22]
   $ rbd merge-diff /tmp/diff1
   rbd: second diff was not specified
-  [1]
+  [22]
   $ rbd merge-diff /tmp/diff1 /tmp/diff2
   rbd: path was not specified
-  [1]
+  [22]
   $ rbd import-diff
   rbd: path was not specified
-  [1]
+  [22]
   $ rbd import-diff /tmp/diff
   rbd: image name was not specified
-  [1]
+  [22]
   $ rbd cp
   rbd: image name was not specified
-  [1]
+  [22]
   $ rbd cp foo
   rbd: destination image name was not specified
-  [1]
+  [22]
  $ rbd cp foo@snap
   rbd: destination image name was not specified
-  [1]
+  [22]
   $ rbd mv
   rbd: image name was not specified
-  [1]
+  [22]
   $ rbd mv foo
   rbd: destination image name was not specified
-  [1]
+  [22]
   $ rbd image-meta list
   rbd: image name was not specified
-  [1]
+  [22]
   $ rbd image-meta get
   rbd: image name was not specified
-  [1]
+  [22]
   $ rbd image-meta get foo
   rbd: metadata key was not specified
-  [1]
+  [22]
   $ rbd image-meta set
   rbd: image name was not specified
-  [1]
+  [22]
   $ rbd image-meta set foo
   rbd: metadata key was not specified
-  [1]
+  [22]
   $ rbd image-meta set foo key
   rbd: metadata value was not specified
-  [1]
+  [22]
   $ rbd image-meta remove
   rbd: image name was not specified
-  [1]
+  [22]
   $ rbd image-meta remove foo
   rbd: metadata key was not specified
-  [1]
+  [22]
   $ rbd object-map rebuild
   rbd: image name was not specified
-  [1]
+  [22]
   $ rbd snap ls
   rbd: image name was not specified
-  [1]
+  [22]
   $ rbd snap create
   rbd: image name was not specified
-  [1]
+  [22]
   $ rbd snap create foo
   rbd: snap name was not specified
-  [1]
+  [22]
   $ rbd snap rollback
   rbd: image name was not specified
-  [1]
+  [22]
   $ rbd snap rollback foo
   rbd: snap name was not specified
-  [1]
+  [22]
   $ rbd snap rm
   rbd: image name was not specified
-  [1]
+  [22]
   $ rbd snap rm foo
   rbd: snap name was not specified
-  [1]
+  [22]
   $ rbd snap purge
   rbd: image name was not specified
-  [1]
+  [22]
   $ rbd snap protect
   rbd: image name was not specified
-  [1]
+  [22]
   $ rbd snap protect foo
   rbd: snap name was not specified
-  [1]
+  [22]
   $ rbd snap unprotect
   rbd: image name was not specified
-  [1]
+  [22]
   $ rbd snap unprotect foo
   rbd: snap name was not specified
-  [1]
+  [22]
   $ rbd watch
   rbd: image name was not specified
-  [1]
+  [22]
   $ rbd status
   rbd: image name was not specified
-  [1]
+  [22]
   $ rbd map
   rbd: image name was not specified
-  [1]
+  [22]
   $ rbd unmap
   rbd: unmap requires either image name or device path
-  [1]
+  [22]
   $ rbd feature disable
   rbd: image name was not specified
-  [1]
+  [22]
   $ rbd feature disable foo
   rbd: at least one feature name must be specified
-  [1]
+  [22]
   $ rbd feature enable
   rbd: image name was not specified
-  [1]
+  [22]
   $ rbd feature enable foo
   rbd: at least one feature name must be specified
-  [1]
+  [22]
   $ rbd lock list
   rbd: image name was not specified
-  [1]
+  [22]
   $ rbd lock add
   rbd: image name was not specified
-  [1]
+  [22]
   $ rbd lock add foo
   rbd: lock id was not specified
-  [1]
+  [22]
   $ rbd lock remove
   rbd: image name was not specified
-  [1]
+  [22]
   $ rbd lock remove foo
   rbd: lock id was not specified
-  [1]
+  [22]
   $ rbd lock remove foo id
   rbd: locker was not specified
-  [1]
+  [22]
   $ rbd bench-write
   rbd: image name was not specified
-  [1]
+  [22]
diff --git a/src/test/cli/rbd/too-many-args.t b/src/test/cli/rbd/too-many-args.t
new file mode 100644
index 0000000..957845c
--- /dev/null
+++ b/src/test/cli/rbd/too-many-args.t
@@ -0,0 +1,33 @@
+A command taking no args:
+
+  $ rbd showmapped junk
+  rbd: too many arguments
+  [1]
+
+A command taking one arg:
+
+  $ rbd info img1 junk
+  rbd: too many arguments
+  [1]
+
+A command taking two args:
+
+  $ rbd copy img1 img2 junk
+  rbd: too many arguments
+  [1]
+
+A command taking three args:
+
+  $ rbd lock remove img1 lock1 locker1 junk
+  rbd: too many arguments
+  [1]
+
+A command taking unlimited args:
+
+  $ rbd feature enable img1 layering striping exclusive-lock object-map fast-diff deep-flatten journaling junk
+  rbd: the argument for option is invalid
+  [1]
+
+  $ rbd feature disable img1 layering striping exclusive-lock object-map fast-diff deep-flatten journaling junk
+  rbd: the argument for option is invalid
+  [1]
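
The new too-many-args.t pins the opposite boundary: surplus positional
arguments produce "rbd: too many arguments" (or "rbd: the argument for
option is invalid" for the unlimited-argument feature commands) with a
plain exit status 1, not the errno-style 22 seen above. A minimal
cram-style check of one of these cases, assuming rbd is on PATH:

    from subprocess import call

    # Expected output above ends with "[1]", i.e. exit status 1.
    ret = call("rbd showmapped junk", shell=True)
    assert ret == 1
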
diff --git a/src/test/cls_journal/test_cls_journal.cc b/src/test/cls_journal/test_cls_journal.cc
new file mode 100644
index 0000000..b6405ff
--- /dev/null
+++ b/src/test/cls_journal/test_cls_journal.cc
@@ -0,0 +1,380 @@
+// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "cls/journal/cls_journal_client.h"
+#include "include/stringify.h"
+#include "common/Cond.h"
+#include "test/librados/test.h"
+#include "gtest/gtest.h"
+#include <errno.h>
+#include <set>
+#include <string>
+
+using namespace cls::journal;
+
+class TestClsJournal : public ::testing::Test {
+public:
+
+  static void SetUpTestCase() {
+    _pool_name = get_temp_pool_name();
+    ASSERT_EQ("", create_one_pool_pp(_pool_name, _rados));
+  }
+
+  static void TearDownTestCase() {
+    ASSERT_EQ(0, destroy_one_pool_pp(_pool_name, _rados));
+  }
+
+  std::string get_temp_image_name() {
+    ++_image_number;
+    return "image" + stringify(_image_number);
+  }
+
+  static std::string _pool_name;
+  static librados::Rados _rados;
+  static uint64_t _image_number;
+
+};
+
+std::string TestClsJournal::_pool_name;
+librados::Rados TestClsJournal::_rados;
+uint64_t TestClsJournal::_image_number = 0;
+
+TEST_F(TestClsJournal, Create) {
+  librados::IoCtx ioctx;
+  ASSERT_EQ(0, _rados.ioctx_create(_pool_name.c_str(), ioctx));
+
+  std::string oid = get_temp_image_name();
+
+  uint8_t order = 1;
+  uint8_t splay_width = 2;
+  int64_t pool_id = ioctx.get_id();
+  ASSERT_EQ(0, client::create(ioctx, oid, order, splay_width, pool_id));
+
+  uint8_t read_order;
+  uint8_t read_splay_width;
+  int64_t read_pool_id;
+  C_SaferCond cond;
+  client::get_immutable_metadata(ioctx, oid, &read_order, &read_splay_width,
+                                 &read_pool_id, &cond);
+  ASSERT_EQ(0, cond.wait());
+  ASSERT_EQ(order, read_order);
+  ASSERT_EQ(splay_width, read_splay_width);
+  ASSERT_EQ(pool_id, read_pool_id);
+}
+
+TEST_F(TestClsJournal, MinimumSet) {
+  librados::IoCtx ioctx;
+  ASSERT_EQ(0, _rados.ioctx_create(_pool_name.c_str(), ioctx));
+
+  std::string oid = get_temp_image_name();
+
+  ASSERT_EQ(0, client::create(ioctx, oid, 2, 4, ioctx.get_id()));
+
+  librados::ObjectWriteOperation op1;
+  client::set_active_set(&op1, 300);
+  ASSERT_EQ(0, ioctx.operate(oid, &op1));
+
+  uint64_t minimum_set = 123;
+  librados::ObjectWriteOperation op2;
+  client::set_minimum_set(&op2, minimum_set);
+  ASSERT_EQ(0, ioctx.operate(oid, &op2));
+
+  C_SaferCond cond;
+  uint64_t read_minimum_set;
+  uint64_t read_active_set;
+  std::set<cls::journal::Client> read_clients;
+  client::get_mutable_metadata(ioctx, oid, &read_minimum_set, &read_active_set,
+                               &read_clients, &cond);
+  ASSERT_EQ(0, cond.wait());
+  ASSERT_EQ(minimum_set, read_minimum_set);
+}
+
+TEST_F(TestClsJournal, MinimumSetStale) {
+  librados::IoCtx ioctx;
+  ASSERT_EQ(0, _rados.ioctx_create(_pool_name.c_str(), ioctx));
+
+  std::string oid = get_temp_image_name();
+
+  ASSERT_EQ(0, client::create(ioctx, oid, 2, 4, ioctx.get_id()));
+
+  librados::ObjectWriteOperation op1;
+  client::set_active_set(&op1, 300);
+  ASSERT_EQ(0, ioctx.operate(oid, &op1));
+
+  uint64_t minimum_set = 123;
+  librados::ObjectWriteOperation op2;
+  client::set_minimum_set(&op2, minimum_set);
+  ASSERT_EQ(0, ioctx.operate(oid, &op2));
+
+  librados::ObjectWriteOperation op3;
+  client::set_minimum_set(&op3, 1);
+  ASSERT_EQ(-ESTALE, ioctx.operate(oid, &op3));
+
+  C_SaferCond cond;
+  uint64_t read_minimum_set;
+  uint64_t read_active_set;
+  std::set<cls::journal::Client> read_clients;
+  client::get_mutable_metadata(ioctx, oid, &read_minimum_set, &read_active_set,
+                               &read_clients, &cond);
+  ASSERT_EQ(0, cond.wait());
+  ASSERT_EQ(minimum_set, read_minimum_set);
+}
+
+TEST_F(TestClsJournal, MinimumSetOrderConstraint) {
+  librados::IoCtx ioctx;
+  ASSERT_EQ(0, _rados.ioctx_create(_pool_name.c_str(), ioctx));
+
+  std::string oid = get_temp_image_name();
+
+  ASSERT_EQ(0, client::create(ioctx, oid, 2, 4, ioctx.get_id()));
+
+  librados::ObjectWriteOperation op1;
+  client::set_minimum_set(&op1, 123);
+  ASSERT_EQ(-EINVAL, ioctx.operate(oid, &op1));
+
+  C_SaferCond cond;
+  uint64_t read_minimum_set;
+  uint64_t read_active_set;
+  std::set<cls::journal::Client> read_clients;
+  client::get_mutable_metadata(ioctx, oid, &read_minimum_set, &read_active_set,
+                               &read_clients, &cond);
+  ASSERT_EQ(0, cond.wait());
+  ASSERT_EQ(0U, read_minimum_set);
+}
+
+TEST_F(TestClsJournal, ActiveSet) {
+  librados::IoCtx ioctx;
+  ASSERT_EQ(0, _rados.ioctx_create(_pool_name.c_str(), ioctx));
+
+  std::string oid = get_temp_image_name();
+
+  ASSERT_EQ(0, client::create(ioctx, oid, 2, 4, ioctx.get_id()));
+
+  uint64_t active_set = 234;
+  librados::ObjectWriteOperation op1;
+  client::set_active_set(&op1, active_set);
+  ASSERT_EQ(0, ioctx.operate(oid, &op1));
+
+  C_SaferCond cond;
+  uint64_t read_minimum_set;
+  uint64_t read_active_set;
+  std::set<cls::journal::Client> read_clients;
+  client::get_mutable_metadata(ioctx, oid, &read_minimum_set, &read_active_set,
+                               &read_clients, &cond);
+  ASSERT_EQ(0, cond.wait());
+  ASSERT_EQ(active_set, read_active_set);
+}
+
+TEST_F(TestClsJournal, ActiveSetStale) {
+  librados::IoCtx ioctx;
+  ASSERT_EQ(0, _rados.ioctx_create(_pool_name.c_str(), ioctx));
+
+  std::string oid = get_temp_image_name();
+
+  ASSERT_EQ(0, client::create(ioctx, oid, 2, 4, ioctx.get_id()));
+
+  librados::ObjectWriteOperation op1;
+  client::set_active_set(&op1, 345);
+  ASSERT_EQ(0, ioctx.operate(oid, &op1));
+
+  librados::ObjectWriteOperation op2;
+  client::set_active_set(&op2, 3);
+  ASSERT_EQ(-ESTALE, ioctx.operate(oid, &op2));
+}
+
+TEST_F(TestClsJournal, CreateDuplicate) {
+  librados::IoCtx ioctx;
+  ASSERT_EQ(0, _rados.ioctx_create(_pool_name.c_str(), ioctx));
+
+  std::string oid = get_temp_image_name();
+
+  ASSERT_EQ(0, client::create(ioctx, oid, 2, 4, ioctx.get_id()));
+  ASSERT_EQ(-EEXIST, client::create(ioctx, oid, 3, 5, ioctx.get_id()));
+}
+
+TEST_F(TestClsJournal, ClientRegister) {
+  librados::IoCtx ioctx;
+  ASSERT_EQ(0, _rados.ioctx_create(_pool_name.c_str(), ioctx));
+
+  std::string oid = get_temp_image_name();
+
+  ASSERT_EQ(0, client::client_register(ioctx, oid, "id1", "desc1"));
+
+  std::set<Client> clients;
+  ASSERT_EQ(0, client::client_list(ioctx, oid, &clients));
+
+  std::set<Client> expected_clients = {Client("id1", "desc1")};
+  ASSERT_EQ(expected_clients, clients);
+}
+
+TEST_F(TestClsJournal, ClientRegisterDuplicate) {
+  librados::IoCtx ioctx;
+  ASSERT_EQ(0, _rados.ioctx_create(_pool_name.c_str(), ioctx));
+
+  std::string oid = get_temp_image_name();
+
+  ASSERT_EQ(0, client::client_register(ioctx, oid, "id1", "desc1"));
+  ASSERT_EQ(-EEXIST, client::client_register(ioctx, oid, "id1", "desc2"));
+}
+
+TEST_F(TestClsJournal, ClientUnregister) {
+  librados::IoCtx ioctx;
+  ASSERT_EQ(0, _rados.ioctx_create(_pool_name.c_str(), ioctx));
+
+  std::string oid = get_temp_image_name();
+
+  ASSERT_EQ(0, client::client_register(ioctx, oid, "id1", "desc1"));
+  ASSERT_EQ(0, client::client_unregister(ioctx, oid, "id1"));
+}
+
+TEST_F(TestClsJournal, ClientUnregisterDNE) {
+  librados::IoCtx ioctx;
+  ASSERT_EQ(0, _rados.ioctx_create(_pool_name.c_str(), ioctx));
+
+  std::string oid = get_temp_image_name();
+
+  ASSERT_EQ(0, client::client_register(ioctx, oid, "id1", "desc1"));
+  ASSERT_EQ(0, client::client_unregister(ioctx, oid, "id1"));
+  ASSERT_EQ(-ENOENT, client::client_unregister(ioctx, oid, "id1"));
+}
+
+TEST_F(TestClsJournal, ClientCommit) {
+  librados::IoCtx ioctx;
+  ASSERT_EQ(0, _rados.ioctx_create(_pool_name.c_str(), ioctx));
+
+  std::string oid = get_temp_image_name();
+
+  ASSERT_EQ(0, client::create(ioctx, oid, 2, 2, ioctx.get_id()));
+  ASSERT_EQ(0, client::client_register(ioctx, oid, "id1", "desc1"));
+
+  cls::journal::EntryPositions entry_positions;
+  entry_positions = {
+    cls::journal::EntryPosition("tag1", 120),
+    cls::journal::EntryPosition("tag2", 121)};
+  cls::journal::ObjectSetPosition object_set_position(
+    1, entry_positions);
+
+  librados::ObjectWriteOperation op2;
+  client::client_commit(&op2, "id1", object_set_position);
+  ASSERT_EQ(0, ioctx.operate(oid, &op2));
+
+  std::set<Client> clients;
+  ASSERT_EQ(0, client::client_list(ioctx, oid, &clients));
+
+  std::set<Client> expected_clients = {
+    Client("id1", "desc1", object_set_position)};
+  ASSERT_EQ(expected_clients, clients);
+}
+
+TEST_F(TestClsJournal, ClientCommitInvalid) {
+  librados::IoCtx ioctx;
+  ASSERT_EQ(0, _rados.ioctx_create(_pool_name.c_str(), ioctx));
+
+  std::string oid = get_temp_image_name();
+
+  ASSERT_EQ(0, client::create(ioctx, oid, 2, 2, ioctx.get_id()));
+  ASSERT_EQ(0, client::client_register(ioctx, oid, "id1", "desc1"));
+
+  cls::journal::EntryPositions entry_positions;
+  entry_positions = {
+    cls::journal::EntryPosition("tag1", 120),
+    cls::journal::EntryPosition("tag1", 121),
+    cls::journal::EntryPosition("tag2", 121)};
+  cls::journal::ObjectSetPosition object_set_position(
+    1, entry_positions);
+
+  librados::ObjectWriteOperation op2;
+  client::client_commit(&op2, "id1", object_set_position);
+  ASSERT_EQ(-EINVAL, ioctx.operate(oid, &op2));
+}
+
+TEST_F(TestClsJournal, ClientCommitDNE) {
+  librados::IoCtx ioctx;
+  ASSERT_EQ(0, _rados.ioctx_create(_pool_name.c_str(), ioctx));
+
+  std::string oid = get_temp_image_name();
+
+  cls::journal::ObjectSetPosition object_set_position;
+
+  librados::ObjectWriteOperation op1;
+  client::client_commit(&op1, "id1", object_set_position);
+  ASSERT_EQ(-ENOENT, ioctx.operate(oid, &op1));
+}
+
+TEST_F(TestClsJournal, ClientList) {
+  librados::IoCtx ioctx;
+  ASSERT_EQ(0, _rados.ioctx_create(_pool_name.c_str(), ioctx));
+
+  std::string oid = get_temp_image_name();
+
+  ASSERT_EQ(0, client::create(ioctx, oid, 12, 5, ioctx.get_id()));
+
+  std::set<Client> expected_clients;
+  librados::ObjectWriteOperation op1;
+  for (uint32_t i = 0; i < 512; ++i) {
+    std::string id =  "id" + stringify(i + 1);
+    expected_clients.insert(Client(id, ""));
+    client::client_register(&op1, id, "");
+  }
+  ASSERT_EQ(0, ioctx.operate(oid, &op1));
+
+  std::set<Client> clients;
+  ASSERT_EQ(0, client::client_list(ioctx, oid, &clients));
+  ASSERT_EQ(expected_clients, clients);
+
+  C_SaferCond cond;
+  uint64_t read_minimum_set;
+  uint64_t read_active_set;
+  std::set<cls::journal::Client> read_clients;
+  client::get_mutable_metadata(ioctx, oid, &read_minimum_set, &read_active_set,
+                               &read_clients, &cond);
+  ASSERT_EQ(0, cond.wait());
+  ASSERT_EQ(expected_clients, read_clients);
+}
+
+TEST_F(TestClsJournal, GuardAppend) {
+  librados::IoCtx ioctx;
+  ASSERT_EQ(0, _rados.ioctx_create(_pool_name.c_str(), ioctx));
+
+  std::string oid = get_temp_image_name();
+
+  bufferlist bl;
+  bl.append("journal entry!");
+
+  librados::ObjectWriteOperation op1;
+  op1.append(bl);
+  ASSERT_EQ(0, ioctx.operate(oid, &op1));
+
+  librados::ObjectWriteOperation op2;
+  client::guard_append(&op2, 1024);
+  ASSERT_EQ(0, ioctx.operate(oid, &op2));
+}
+
+TEST_F(TestClsJournal, GuardAppendDNE) {
+  librados::IoCtx ioctx;
+  ASSERT_EQ(0, _rados.ioctx_create(_pool_name.c_str(), ioctx));
+
+  std::string oid = get_temp_image_name();
+
+  librados::ObjectWriteOperation op2;
+  client::guard_append(&op2, 1024);
+  ASSERT_EQ(0, ioctx.operate(oid, &op2));
+}
+
+TEST_F(TestClsJournal, GuardAppendOverflow) {
+  librados::IoCtx ioctx;
+  ASSERT_EQ(0, _rados.ioctx_create(_pool_name.c_str(), ioctx));
+
+  std::string oid = get_temp_image_name();
+
+  bufferlist bl;
+  bl.append("journal entry!");
+
+  librados::ObjectWriteOperation op1;
+  op1.append(bl);
+  ASSERT_EQ(0, ioctx.operate(oid, &op1));
+
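+  // The object already holds 14 bytes, exceeding the 1-byte guard limit.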
+  librados::ObjectWriteOperation op2;
+  client::guard_append(&op2, 1);
+  ASSERT_EQ(-EOVERFLOW, ioctx.operate(oid, &op2));
+}
diff --git a/src/test/cls_rbd/test_cls_rbd.cc b/src/test/cls_rbd/test_cls_rbd.cc
index eb9c19c..b53cfa8 100644
--- a/src/test/cls_rbd/test_cls_rbd.cc
+++ b/src/test/cls_rbd/test_cls_rbd.cc
@@ -63,6 +63,7 @@ using ::librbd::cls_client::metadata_set;
 using ::librbd::cls_client::metadata_remove;
 using ::librbd::cls_client::metadata_list;
 using ::librbd::cls_client::metadata_get;
+using ::librbd::cls_client::snapshot_rename;
 
 static char *random_buf(size_t len)
 {
@@ -781,6 +782,14 @@ TEST_F(TestClsRbd, snapshots)
   ASSERT_EQ("snap1", snap_names[1]);
   ASSERT_EQ(10u, snap_sizes[1]);
 
+  ASSERT_EQ(0, snapshot_rename(&ioctx, oid, 0, "snap1-rename"));
+  ASSERT_EQ(0, snapshot_list(&ioctx, oid, snapc.snaps, &snap_names,
+			     &snap_sizes, &parents, &protection_status));
+  ASSERT_EQ(2u, snap_names.size());
+  ASSERT_EQ("snap2", snap_names[0]);
+  ASSERT_EQ(10u, snap_sizes[0]);
+  ASSERT_EQ("snap1-rename", snap_names[1]);
+  ASSERT_EQ(10u, snap_sizes[1]);
   ASSERT_EQ(0, snapshot_remove(&ioctx, oid, 0));
   ASSERT_EQ(0, get_snapcontext(&ioctx, oid, &snapc));
   ASSERT_EQ(1u, snapc.snaps.size());
diff --git a/src/test/debian-jessie/Dockerfile.in b/src/test/debian-jessie/Dockerfile.in
index c9287bd..59f9072 100644
--- a/src/test/debian-jessie/Dockerfile.in
+++ b/src/test/debian-jessie/Dockerfile.in
@@ -28,4 +28,4 @@ RUN apt-get update
 RUN cd /root ; ./install-deps.sh
 # development tools
 RUN apt-get install -y ccache valgrind gdb python-virtualenv gdisk kpartx hdparm jq xmlstarlet
-RUN useradd -M --uid %%user_id%% %%USER%% && echo '%%USER%% ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers
+RUN if test %%USER%% != root ; then useradd -M --uid %%user_id%% %%USER%% && echo '%%USER%% ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers ; fi
diff --git a/src/test/debian-jessie/install-deps.sh b/src/test/debian-jessie/install-deps.sh
index 1bebf09..8249ea3 100755
--- a/src/test/debian-jessie/install-deps.sh
+++ b/src/test/debian-jessie/install-deps.sh
@@ -62,7 +62,7 @@ CentOS|Fedora|RedHatEnterpriseServer)
             CentOS|RedHatEnterpriseServer)
                 $SUDO yum install -y yum-utils
                 MAJOR_VERSION=$(lsb_release -rs | cut -f1 -d.)
-                if test $(lsb_release -si) == RedHatEnterpriseServer ; then
+                if test $(lsb_release -si) = RedHatEnterpriseServer ; then
                     $SUDO yum install subscription-manager
                     $SUDO subscription-manager repos --enable=rhel-$MAJOR_VERSION-server-optional-rpms
                 fi
@@ -70,6 +70,9 @@ CentOS|Fedora|RedHatEnterpriseServer)
                 $SUDO yum install --nogpgcheck -y epel-release
                 $SUDO rpm --import /etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-$MAJOR_VERSION
                 $SUDO rm -f /etc/yum.repos.d/dl.fedoraproject.org*
+                if test $(lsb_release -si) = CentOS -a $MAJOR_VERSION = 7 ; then
+                    $SUDO yum-config-manager --enable cr
+                fi
                 ;;
         esac
         sed -e 's/@//g' < ceph.spec.in > $DIR/ceph.spec
diff --git a/src/test/encoding/types.h b/src/test/encoding/types.h
index 34b0696..a9dbc9b 100644
--- a/src/test/encoding/types.h
+++ b/src/test/encoding/types.h
@@ -226,6 +226,8 @@ TYPE(ETableServer)
 #include "mds/events/EUpdate.h"
 TYPE(EUpdate)
 
+#include "librbd/JournalTypes.h"
+TYPE(librbd::journal::EventEntry)
 #include "librbd/WatchNotifyTypes.h"
 TYPE(librbd::WatchNotify::NotifyMessage)
 TYPE(librbd::WatchNotify::ResponseMessage)
@@ -311,6 +313,11 @@ TYPE(cls_user_get_header_op)
 TYPE(cls_user_get_header_ret)
 TYPE(cls_user_complete_stats_sync_op)
 
+#include "cls/journal/cls_journal_types.h"
+TYPE(cls::journal::EntryPosition)
+TYPE(cls::journal::ObjectSetPosition)
+TYPE(cls::journal::Client)
+
 #include "rgw/rgw_common.h"
 TYPE(RGWAccessKey)
 TYPE(RGWSubUser)
@@ -360,6 +367,8 @@ TYPE(cls_refcount_read_op)
 TYPE(cls_refcount_read_ret)
 TYPE(cls_refcount_set_op)
 
+#include "journal/Entry.h"
+TYPE(journal::Entry)
 
 // --- messages ---
 #include "messages/MAuth.h"
diff --git a/src/test/erasure-code/test-erasure-code.sh b/src/test/erasure-code/test-erasure-code.sh
index 2840553..1328766 100755
--- a/src/test/erasure-code/test-erasure-code.sh
+++ b/src/test/erasure-code/test-erasure-code.sh
@@ -22,7 +22,7 @@ function run() {
     local dir=$1
     shift
 
-    export CEPH_MON="127.0.0.1:7101"
+    export CEPH_MON="127.0.0.1:7101" # git grep '\<7101\>' : there must be only one
     export CEPH_ARGS
     CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
     CEPH_ARGS+="--mon-host=$CEPH_MON "
diff --git a/src/test/erasure-code/test-erasure-eio.sh b/src/test/erasure-code/test-erasure-eio.sh
index 129d09b..32a6e17 100755
--- a/src/test/erasure-code/test-erasure-eio.sh
+++ b/src/test/erasure-code/test-erasure-eio.sh
@@ -22,7 +22,7 @@ function run() {
     local dir=$1
     shift
 
-    export CEPH_MON="127.0.0.1:7112"
+    export CEPH_MON="127.0.0.1:7112" # git grep '\<7112\>' : there must be only one
     export CEPH_ARGS
     CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
     CEPH_ARGS+="--mon-host=$CEPH_MON "
diff --git a/src/test/fedora-21/Dockerfile.in b/src/test/fedora-21/Dockerfile.in
index 9b606ea..7e01fab 100644
--- a/src/test/fedora-21/Dockerfile.in
+++ b/src/test/fedora-21/Dockerfile.in
@@ -26,4 +26,4 @@ RUN yum install -y which ; cd /root ; ./install-deps.sh
 # development tools
 # nc is required to run make check on firefly only (giant+ do not use nc)
 RUN yum install -y ccache valgrind gdb git python-virtualenv gdisk kpartx hdparm jq sudo xmlstarlet parted nc
-RUN useradd -M --uid %%user_id%% %%USER%% && echo '%%USER%% ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers
+RUN if test %%USER%% != root ; then useradd -M --uid %%user_id%% %%USER%% && echo '%%USER%% ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers ; fi
diff --git a/src/test/fedora-21/ceph.spec.in b/src/test/fedora-21/ceph.spec.in
index 8f2a6fc..2939fef 100644
--- a/src/test/fedora-21/ceph.spec.in
+++ b/src/test/fedora-21/ceph.spec.in
@@ -590,6 +590,11 @@ export RPM_OPT_FLAGS=`echo $RPM_OPT_FLAGS | sed -e 's/i386/i486/'`
 		%{?_with_tcmalloc} \
 		CFLAGS="$RPM_OPT_FLAGS" CXXFLAGS="$RPM_OPT_FLAGS"
 
+%if %{with lowmem_builder}
+%if 0%{?jobs} > 8
+%define _smp_mflags -j8
+%endif
+%endif
 
 make %{?_smp_mflags}
 
@@ -607,8 +612,7 @@ make %{?_smp_mflags} check-local
 make DESTDIR=$RPM_BUILD_ROOT install
 find $RPM_BUILD_ROOT -type f -name "*.la" -exec rm -f {} ';'
 find $RPM_BUILD_ROOT -type f -name "*.a" -exec rm -f {} ';'
-install -D src/rbdmap $RPM_BUILD_ROOT%{_sysconfdir}/ceph/rbdmap
-install -D src/init-rbdmap $RPM_BUILD_ROOT%{_initrddir}/rbdmap
+install -D src/etc-rbdmap $RPM_BUILD_ROOT%{_sysconfdir}/ceph/rbdmap
 %if 0%{?fedora} || 0%{?rhel}
 install -m 0644 -D etc/sysconfig/ceph $RPM_BUILD_ROOT%{_sysconfdir}/sysconfig/ceph
 %endif
@@ -617,6 +621,7 @@ install -m 0644 -D etc/sysconfig/ceph $RPM_BUILD_ROOT%{_localstatedir}/adm/fillu
 %endif
 %if 0%{?_with_systemd}
   install -m 0644 -D systemd/ceph.tmpfiles.d $RPM_BUILD_ROOT%{_tmpfilesdir}/ceph-common.conf
+  install -m 0644 -D systemd/rbdmap.service $RPM_BUILD_ROOT%{_unitdir}/rbdmap.service
   install -m 0644 -D systemd/ceph-osd at .service $RPM_BUILD_ROOT%{_unitdir}/ceph-osd at .service
   install -m 0644 -D systemd/ceph-mon at .service $RPM_BUILD_ROOT%{_unitdir}/ceph-mon at .service
   install -m 0644 -D systemd/ceph-create-keys at .service $RPM_BUILD_ROOT%{_unitdir}/ceph-create-keys at .service
@@ -626,6 +631,7 @@ install -m 0644 -D etc/sysconfig/ceph $RPM_BUILD_ROOT%{_localstatedir}/adm/fillu
   install -m 0644 -D systemd/ceph-disk at .service $RPM_BUILD_ROOT%{_unitdir}/ceph-disk at .service
   install -m 0755 -D systemd/ceph $RPM_BUILD_ROOT%{_sbindir}/rcceph
 %else
+  install -D src/init-rbdmap $RPM_BUILD_ROOT%{_initrddir}/rbdmap
   install -D src/init-ceph $RPM_BUILD_ROOT%{_initrddir}/ceph
   install -D src/init-radosgw $RPM_BUILD_ROOT%{_initrddir}/ceph-radosgw
   ln -sf ../../etc/init.d/ceph %{buildroot}/%{_sbindir}/rcceph
@@ -810,6 +816,7 @@ rm -rf $RPM_BUILD_ROOT
 %{_libdir}/rados-classes/libcls_timeindex.so*
 %{_libdir}/rados-classes/libcls_user.so*
 %{_libdir}/rados-classes/libcls_version.so*
+%{_libdir}/rados-classes/libcls_journal.so*
 %dir %{_libdir}/ceph/erasure-code
 %{_libdir}/ceph/erasure-code/libec_*.so*
 %if 0%{?_with_lttng}
@@ -872,6 +879,7 @@ rm -rf $RPM_BUILD_ROOT
 %{_bindir}/rbd
 %{_bindir}/rbd-replay
 %{_bindir}/rbd-replay-many
+%{_bindir}/rbdmap
 %if 0%{?_with_lttng}
 %{_bindir}/rbd-replay-prep
 %endif
@@ -901,7 +909,11 @@ rm -rf $RPM_BUILD_ROOT
 %config %{_sysconfdir}/bash_completion.d/rados
 %config %{_sysconfdir}/bash_completion.d/rbd
 %config(noreplace) %{_sysconfdir}/ceph/rbdmap
+%if 0%{?_with_systemd}
+%{_unitdir}/rbdmap.service
+%else
 %{_initrddir}/rbdmap
+%endif
 %{python_sitelib}/ceph_argparse.py*
 %{python_sitelib}/ceph_daemon.py*
 %{_udevrulesdir}/50-rbd.rules
@@ -1302,12 +1314,12 @@ exit 0
 %files libs-compat
 # We need an empty %%files list for ceph-libs-compat, to tell rpmbuild to actually
 # build this meta package.
+%endif
 
 #################################################################################
 %files devel-compat
 # We need an empty %%files list for ceph-devel-compat, to tell rpmbuild to
 # actually build this meta package.
-%endif
 
 #################################################################################
 %files -n python-ceph-compat
diff --git a/src/test/fedora-21/install-deps.sh b/src/test/fedora-21/install-deps.sh
index 1bebf09..8249ea3 100755
--- a/src/test/fedora-21/install-deps.sh
+++ b/src/test/fedora-21/install-deps.sh
@@ -62,7 +62,7 @@ CentOS|Fedora|RedHatEnterpriseServer)
             CentOS|RedHatEnterpriseServer)
                 $SUDO yum install -y yum-utils
                 MAJOR_VERSION=$(lsb_release -rs | cut -f1 -d.)
-                if test $(lsb_release -si) == RedHatEnterpriseServer ; then
+                if test $(lsb_release -si) = RedHatEnterpriseServer ; then
                     $SUDO yum install subscription-manager
                     $SUDO subscription-manager repos --enable=rhel-$MAJOR_VERSION-server-optional-rpms
                 fi
@@ -70,6 +70,9 @@ CentOS|Fedora|RedHatEnterpriseServer)
                 $SUDO yum install --nogpgcheck -y epel-release
                 $SUDO rpm --import /etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-$MAJOR_VERSION
                 $SUDO rm -f /etc/yum.repos.d/dl.fedoraproject.org*
+                if test $(lsb_release -si) = CentOS -a $MAJOR_VERSION = 7 ; then
+                    $SUDO yum-config-manager --enable cr
+                fi
                 ;;
         esac
         sed -e 's/@//g' < ceph.spec.in > $DIR/ceph.spec
diff --git a/src/test/journal/RadosTestFixture.cc b/src/test/journal/RadosTestFixture.cc
new file mode 100644
index 0000000..d5f1b32
--- /dev/null
+++ b/src/test/journal/RadosTestFixture.cc
@@ -0,0 +1,95 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "test/journal/RadosTestFixture.h"
+#include "cls/journal/cls_journal_client.h"
+#include "include/stringify.h"
+
+RadosTestFixture::RadosTestFixture()
+  : m_timer_lock("m_timer_lock"), m_timer(NULL), m_listener(this) {
+}
+
+void RadosTestFixture::SetUpTestCase() {
+  _pool_name = get_temp_pool_name();
+  ASSERT_EQ("", create_one_pool_pp(_pool_name, _rados));
+}
+
+void RadosTestFixture::TearDownTestCase() {
+  ASSERT_EQ(0, destroy_one_pool_pp(_pool_name, _rados));
+}
+
+std::string RadosTestFixture::get_temp_oid() {
+  ++_oid_number;
+  return "oid" + stringify(_oid_number);
+}
+
+void RadosTestFixture::SetUp() {
+  ASSERT_EQ(0, _rados.ioctx_create(_pool_name.c_str(), m_ioctx));
+  m_timer = new SafeTimer(reinterpret_cast<CephContext*>(m_ioctx.cct()),
+                          m_timer_lock, true);
+  m_timer->init();
+}
+
+void RadosTestFixture::TearDown() {
+  {
+    Mutex::Locker locker(m_timer_lock);
+    m_timer->shutdown();
+  }
+  delete m_timer;
+}
+
+int RadosTestFixture::create(const std::string &oid, uint8_t order,
+                             uint8_t splay_width) {
+  return cls::journal::client::create(m_ioctx, oid, order, splay_width, -1);
+}
+
+int RadosTestFixture::append(const std::string &oid, const bufferlist &bl) {
+  librados::ObjectWriteOperation op;
+  op.append(bl);
+  return m_ioctx.operate(oid, &op);
+}
+
+int RadosTestFixture::client_register(const std::string &oid,
+                                      const std::string &id,
+                                      const std::string &description) {
+  return cls::journal::client::client_register(m_ioctx, oid, id, description);
+}
+
+int RadosTestFixture::client_commit(const std::string &oid,
+                                    const std::string &id,
+                                    const cls::journal::ObjectSetPosition &commit_position) {
+  librados::ObjectWriteOperation op;
+  cls::journal::client::client_commit(&op, id, commit_position);
+  return m_ioctx.operate(oid, &op);
+}
+
+bufferlist RadosTestFixture::create_payload(const std::string &payload) {
+  bufferlist bl;
+  bl.append(payload);
+  return bl;
+}
+
+int RadosTestFixture::init_metadata(journal::JournalMetadataPtr metadata) {
+  C_SaferCond cond;
+  metadata->init(&cond);
+  return cond.wait();
+}
+
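+// Blocks until the listener has observed a metadata update for this journal
+// (or times out after 10 seconds) and consumes one recorded update.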
+bool RadosTestFixture::wait_for_update(journal::JournalMetadataPtr metadata) {
+  Mutex::Locker locker(m_listener.mutex);
+  while (m_listener.updates[metadata.get()] == 0) {
+    if (m_listener.cond.WaitInterval(
+          reinterpret_cast<CephContext*>(m_ioctx.cct()),
+          m_listener.mutex, utime_t(10, 0)) != 0) {
+      return false;
+    }
+  }
+  --m_listener.updates[metadata.get()];
+  return true;
+}
+
+std::string RadosTestFixture::_pool_name;
+librados::Rados RadosTestFixture::_rados;
+uint64_t RadosTestFixture::_oid_number = 0;
diff --git a/src/test/journal/RadosTestFixture.h b/src/test/journal/RadosTestFixture.h
new file mode 100644
index 0000000..3b05a70
--- /dev/null
+++ b/src/test/journal/RadosTestFixture.h
@@ -0,0 +1,64 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "test/librados/test.h"
+#include "common/Mutex.h"
+#include "common/Timer.h"
+#include "journal/JournalMetadata.h"
+#include "cls/journal/cls_journal_types.h"
+#include "gtest/gtest.h"
+
+class RadosTestFixture : public ::testing::Test {
+public:
+  static void SetUpTestCase();
+  static void TearDownTestCase();
+
+  static std::string get_temp_oid();
+
+  RadosTestFixture();
+  virtual void SetUp();
+  virtual void TearDown();
+
+  int create(const std::string &oid, uint8_t order, uint8_t splay_width);
+  int append(const std::string &oid, const bufferlist &bl);
+
+  int client_register(const std::string &oid, const std::string &id,
+                      const std::string &description);
+  int client_commit(const std::string &oid, const std::string &id,
+                    const cls::journal::ObjectSetPosition &commit_position);
+
+  bufferlist create_payload(const std::string &payload);
+
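+  // Counts metadata update notifications per journal so tests can block
+  // until a given metadata handle has observed a change.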
+  struct Listener : public journal::JournalMetadata::Listener {
+    RadosTestFixture *test_fixture;
+    Mutex mutex;
+    Cond cond;
+    std::map<journal::JournalMetadata*, uint32_t> updates;
+
+    Listener(RadosTestFixture *_test_fixture)
+      : test_fixture(_test_fixture), mutex("mutex") {}
+
+    virtual void handle_update(journal::JournalMetadata *metadata) {
+      Mutex::Locker locker(mutex);
+      ++updates[metadata];
+      cond.Signal();
+    }
+  };
+
+  int init_metadata(journal::JournalMetadataPtr metadata);
+
+  bool wait_for_update(journal::JournalMetadataPtr metadata);
+
+  static std::string _pool_name;
+  static librados::Rados _rados;
+  static uint64_t _oid_number;
+
+  librados::IoCtx m_ioctx;
+
+  Mutex m_timer_lock;
+  SafeTimer *m_timer;
+
+  Listener m_listener;
+};
diff --git a/src/test/journal/test_Entry.cc b/src/test/journal/test_Entry.cc
new file mode 100644
index 0000000..e042978
--- /dev/null
+++ b/src/test/journal/test_Entry.cc
@@ -0,0 +1,95 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "journal/Entry.h"
+#include "gtest/gtest.h"
+
+class TestEntry : public ::testing::Test {
+};
+
+TEST_F(TestEntry, DefaultConstructor) {
+  journal::Entry entry;
+  ASSERT_EQ(0U, entry.get_tid());
+  ASSERT_EQ("", entry.get_tag());
+
+  bufferlist data(entry.get_data());
+  bufferlist expected_data;
+  ASSERT_TRUE(data.contents_equal(expected_data));
+}
+
+TEST_F(TestEntry, Constructor) {
+  bufferlist data;
+  data.append("data");
+  journal::Entry entry("tag", 123, data);
+
+  data.clear();
+  data = entry.get_data();
+
+  bufferlist expected_data;
+  expected_data.append("data");
+
+  ASSERT_EQ(123U, entry.get_tid());
+  ASSERT_EQ("tag", entry.get_tag());
+  ASSERT_TRUE(data.contents_equal(expected_data));
+}
+
+TEST_F(TestEntry, IsReadable) {
+  bufferlist data;
+  data.append("data");
+  journal::Entry entry("tag", 123, data);
+
+  bufferlist full_bl;
+  ::encode(entry, full_bl);
+
+  uint32_t bytes_needed;
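+  // Every strict prefix of the encoded entry must be reported unreadable,
+  // along with a positive hint of how many more bytes are needed.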
+  for (size_t i = 0; i < full_bl.length() - 1; ++i) {
+    bufferlist partial_bl;
+    if (i > 0) {
+      partial_bl.substr_of(full_bl, 0, i);
+    }
+    ASSERT_FALSE(journal::Entry::is_readable(partial_bl.begin(),
+                                             &bytes_needed));
+    ASSERT_GT(bytes_needed, 0U);
+  }
+  ASSERT_TRUE(journal::Entry::is_readable(full_bl.begin(), &bytes_needed));
+  ASSERT_EQ(0U, bytes_needed);
+}
+
+TEST_F(TestEntry, IsReadableBadPreamble) {
+  bufferlist data;
+  data.append("data");
+  journal::Entry entry("tag", 123, data);
+
+  uint64_t stray_bytes = 0x1122334455667788;
+  bufferlist full_bl;
+  ::encode(stray_bytes, full_bl);
+  ::encode(entry, full_bl);
+
+  uint32_t bytes_needed;
+  bufferlist::iterator it = full_bl.begin();
+  ASSERT_FALSE(journal::Entry::is_readable(it, &bytes_needed));
+  ASSERT_EQ(0U, bytes_needed);
+
+  it.advance(sizeof(stray_bytes));
+  ASSERT_TRUE(journal::Entry::is_readable(it, &bytes_needed));
+  ASSERT_EQ(0U, bytes_needed);
+}
+
+TEST_F(TestEntry, IsReadableBadCRC) {
+  bufferlist data;
+  data.append("data");
+  journal::Entry entry("tag", 123, data);
+
+  bufferlist full_bl;
+  ::encode(entry, full_bl);
+
+  bufferlist bad_bl;
+  bad_bl.substr_of(full_bl, 0, full_bl.length() - 4);
+  ::encode(full_bl.crc32c(1), bad_bl);
+
+  uint32_t bytes_needed;
+  ASSERT_FALSE(journal::Entry::is_readable(bad_bl.begin(), &bytes_needed));
+  ASSERT_EQ(0U, bytes_needed);
+}
diff --git a/src/test/journal/test_FutureImpl.cc b/src/test/journal/test_FutureImpl.cc
new file mode 100644
index 0000000..5d5bb04
--- /dev/null
+++ b/src/test/journal/test_FutureImpl.cc
@@ -0,0 +1,208 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "journal/FutureImpl.h"
+#include "common/Cond.h"
+#include "common/Finisher.h"
+#include "common/Mutex.h"
+#include "gtest/gtest.h"
+#include "test/journal/RadosTestFixture.h"
+
+class TestFutureImpl : public RadosTestFixture {
+public:
+
+  TestFutureImpl() : m_finisher(NULL) {
+  }
+  ~TestFutureImpl() {
+    m_finisher->stop();
+    delete m_finisher;
+  }
+
+  struct FlushHandler : public journal::FutureImpl::FlushHandler {
+    uint64_t refs;
+    uint64_t flushes;
+    FlushHandler() : refs(0), flushes(0) {}
+    virtual void get() {
+      ++refs;
+    }
+    virtual void put() {
+      assert(refs > 0);
+      --refs;
+    }
+    virtual void flush(const journal::FutureImplPtr &future) {
+      ++flushes;
+    }
+  };
+
+  void SetUp() {
+    RadosTestFixture::SetUp();
+    m_finisher = new Finisher(reinterpret_cast<CephContext*>(m_ioctx.cct()));
+    m_finisher->start();
+  }
+
+  journal::FutureImplPtr create_future(const std::string &tag, uint64_t tid,
+                                       uint64_t commit_tid,
+                                       const journal::FutureImplPtr &prev =
+                                         journal::FutureImplPtr()) {
+    journal::FutureImplPtr future(new journal::FutureImpl(*m_finisher,
+                                                          tag, tid,
+                                                          commit_tid));
+    future->init(prev);
+    return future;
+  }
+
+  void flush(const journal::FutureImplPtr &future) {
+  }
+
+  Finisher *m_finisher;
+
+  FlushHandler m_flush_handler;
+};
+
+TEST_F(TestFutureImpl, Getters) {
+  journal::FutureImplPtr future = create_future("tag", 123, 456);
+  ASSERT_EQ("tag", future->get_tag());
+  ASSERT_EQ(123U, future->get_tid());
+  ASSERT_EQ(456U, future->get_commit_tid());
+}
+
+TEST_F(TestFutureImpl, Attach) {
+  journal::FutureImplPtr future = create_future("tag", 123, 456);
+  ASSERT_FALSE(future->attach(&m_flush_handler));
+  ASSERT_EQ(1U, m_flush_handler.refs);
+}
+
+TEST_F(TestFutureImpl, AttachWithPendingFlush) {
+  journal::FutureImplPtr future = create_future("tag", 123, 456);
+  future->flush(NULL);
+
+  ASSERT_TRUE(future->attach(&m_flush_handler));
+  ASSERT_EQ(1U, m_flush_handler.refs);
+}
+
+TEST_F(TestFutureImpl, Detach) {
+  journal::FutureImplPtr future = create_future("tag", 123, 456);
+  ASSERT_FALSE(future->attach(&m_flush_handler));
+  future->detach();
+  ASSERT_EQ(0U, m_flush_handler.refs);
+}
+
+TEST_F(TestFutureImpl, DetachImplicit) {
+  journal::FutureImplPtr future = create_future("tag", 123, 456);
+  ASSERT_FALSE(future->attach(&m_flush_handler));
+  future.reset();
+  ASSERT_EQ(0U, m_flush_handler.refs);
+}
+
+TEST_F(TestFutureImpl, Flush) {
+  journal::FutureImplPtr future = create_future("tag", 123, 456);
+  ASSERT_FALSE(future->attach(&m_flush_handler));
+
+  C_SaferCond cond;
+  future->flush(&cond);
+
+  ASSERT_EQ(1U, m_flush_handler.flushes);
+  future->safe(-EIO);
+  ASSERT_EQ(-EIO, cond.wait());
+}
+
+TEST_F(TestFutureImpl, FlushWithoutContext) {
+  journal::FutureImplPtr future = create_future("tag", 123, 456);
+  ASSERT_FALSE(future->attach(&m_flush_handler));
+
+  future->flush(NULL);
+  ASSERT_EQ(1U, m_flush_handler.flushes);
+  future->safe(-EIO);
+  ASSERT_TRUE(future->is_complete());
+  ASSERT_EQ(-EIO, future->get_return_value());
+}
+
+TEST_F(TestFutureImpl, FlushChain) {
+  journal::FutureImplPtr future1 = create_future("tag1", 123, 456);
+  journal::FutureImplPtr future2 = create_future("tag1", 124, 457, future1);
+  journal::FutureImplPtr future3 = create_future("tag2", 1, 458, future2);
+  ASSERT_FALSE(future1->attach(&m_flush_handler));
+  ASSERT_FALSE(future2->attach(&m_flush_handler));
+  ASSERT_FALSE(future3->attach(&m_flush_handler));
+
+  C_SaferCond cond;
+  future3->flush(&cond);
+
+  ASSERT_EQ(3U, m_flush_handler.flushes);
+
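+  // A future only completes once all of its predecessors are safe; the
+  // -EIO from future2 propagates forward to future3's result.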
+  future3->safe(0);
+  ASSERT_FALSE(future3->is_complete());
+
+  future1->safe(0);
+  ASSERT_FALSE(future3->is_complete());
+
+  future2->safe(-EIO);
+  ASSERT_TRUE(future3->is_complete());
+  ASSERT_EQ(-EIO, future3->get_return_value());
+  ASSERT_EQ(-EIO, cond.wait());
+  ASSERT_EQ(0, future1->get_return_value());
+}
+
+TEST_F(TestFutureImpl, FlushInProgress) {
+  journal::FutureImplPtr future1 = create_future("tag1", 123, 456);
+  journal::FutureImplPtr future2 = create_future("tag1", 124, 457, future1);
+  ASSERT_FALSE(future1->attach(&m_flush_handler));
+  ASSERT_FALSE(future2->attach(&m_flush_handler));
+
+  future1->set_flush_in_progress();
+  ASSERT_TRUE(future1->is_flush_in_progress());
+
+  future1->flush(NULL);
+  ASSERT_EQ(0U, m_flush_handler.flushes);
+
+  future1->safe(0);
+}
+
+TEST_F(TestFutureImpl, FlushAlreadyComplete) {
+  journal::FutureImplPtr future = create_future("tag1", 123, 456);
+  future->safe(-EIO);
+
+  C_SaferCond cond;
+  future->flush(&cond);
+  ASSERT_EQ(-EIO, cond.wait());
+}
+
+TEST_F(TestFutureImpl, Wait) {
+  journal::FutureImplPtr future = create_future("tag", 1, 456);
+
+  C_SaferCond cond;
+  future->wait(&cond);
+  future->safe(-EEXIST);
+  ASSERT_EQ(-EEXIST, cond.wait());
+}
+
+TEST_F(TestFutureImpl, WaitAlreadyComplete) {
+  journal::FutureImplPtr future = create_future("tag", 1, 456);
+  future->safe(-EEXIST);
+
+  C_SaferCond cond;
+  future->wait(&cond);
+  ASSERT_EQ(-EEXIST, cond.wait());
+}
+
+TEST_F(TestFutureImpl, SafePreservesError) {
+  journal::FutureImplPtr future1 = create_future("tag1", 123, 456);
+  journal::FutureImplPtr future2 = create_future("tag1", 124, 457, future1);
+
+  future1->safe(-EIO);
+  future2->safe(-EEXIST);
+  ASSERT_TRUE(future2->is_complete());
+  ASSERT_EQ(-EIO, future2->get_return_value());
+}
+
+TEST_F(TestFutureImpl, ConsistentPreservesError) {
+  journal::FutureImplPtr future1 = create_future("tag1", 123, 456);
+  journal::FutureImplPtr future2 = create_future("tag1", 124, 457, future1);
+
+  future2->safe(-EEXIST);
+  future1->safe(-EIO);
+  ASSERT_TRUE(future2->is_complete());
+  ASSERT_EQ(-EEXIST, future2->get_return_value());
+}
diff --git a/src/test/journal/test_JournalMetadata.cc b/src/test/journal/test_JournalMetadata.cc
new file mode 100644
index 0000000..e0bd918
--- /dev/null
+++ b/src/test/journal/test_JournalMetadata.cc
@@ -0,0 +1,102 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "journal/JournalMetadata.h"
+#include "test/journal/RadosTestFixture.h"
+#include "common/Cond.h"
+#include "common/Mutex.h"
+#include <map>
+
+class TestJournalMetadata : public RadosTestFixture {
+public:
+  virtual void TearDown() {
+    for (MetadataList::iterator it = m_metadata_list.begin();
+         it != m_metadata_list.end(); ++it) {
+      (*it)->remove_listener(&m_listener);
+    }
+    RadosTestFixture::TearDown();
+  }
+
+  journal::JournalMetadataPtr create_metadata(const std::string &oid,
+                                              const std::string &client_id) {
+    journal::JournalMetadataPtr metadata(new journal::JournalMetadata(
+      m_ioctx, oid, client_id, 0.1));
+    m_metadata_list.push_back(metadata);
+    metadata->add_listener(&m_listener);
+    return metadata;
+  }
+
+  typedef std::list<journal::JournalMetadataPtr> MetadataList;
+  MetadataList m_metadata_list;
+};
+
+TEST_F(TestJournalMetadata, JournalDNE) {
+  std::string oid = get_temp_oid();
+
+  journal::JournalMetadataPtr metadata1 = create_metadata(oid, "client1");
+  ASSERT_EQ(-ENOENT, init_metadata(metadata1));
+}
+
+TEST_F(TestJournalMetadata, ClientDNE) {
+  std::string oid = get_temp_oid();
+
+  ASSERT_EQ(0, create(oid, 14, 2));
+  ASSERT_EQ(0, client_register(oid, "client1", ""));
+
+  journal::JournalMetadataPtr metadata1 = create_metadata(oid, "client1");
+  ASSERT_EQ(0, init_metadata(metadata1));
+
+  journal::JournalMetadataPtr metadata2 = create_metadata(oid, "client2");
+  ASSERT_EQ(-ENOENT, init_metadata(metadata2));
+}
+
+TEST_F(TestJournalMetadata, SetCommitPositions) {
+  std::string oid = get_temp_oid();
+
+  ASSERT_EQ(0, create(oid, 14, 2));
+  ASSERT_EQ(0, client_register(oid, "client1", ""));
+
+  journal::JournalMetadataPtr metadata1 = create_metadata(oid, "client1");
+  ASSERT_EQ(0, init_metadata(metadata1));
+
+  journal::JournalMetadataPtr metadata2 = create_metadata(oid, "client1");
+  ASSERT_EQ(0, init_metadata(metadata2));
+  ASSERT_TRUE(wait_for_update(metadata2));
+
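+  // Before any commit is recorded, the stored position matches a default one.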
+  journal::JournalMetadata::ObjectSetPosition commit_position;
+  journal::JournalMetadata::ObjectSetPosition read_commit_position;
+  metadata1->get_commit_position(&read_commit_position);
+  ASSERT_EQ(commit_position, read_commit_position);
+
+  journal::JournalMetadata::EntryPositions entry_positions;
+  entry_positions = {
+    cls::journal::EntryPosition("tag1", 122)};
+  commit_position = journal::JournalMetadata::ObjectSetPosition(1, entry_positions);
+
+  C_SaferCond cond;
+  metadata1->set_commit_position(commit_position, &cond);
+  ASSERT_EQ(0, cond.wait());
+  ASSERT_TRUE(wait_for_update(metadata2));
+
+  metadata2->get_commit_position(&read_commit_position);
+  ASSERT_EQ(commit_position, read_commit_position);
+}
+
+TEST_F(TestJournalMetadata, UpdateActiveObject) {
+  std::string oid = get_temp_oid();
+
+  ASSERT_EQ(0, create(oid, 14, 2));
+  ASSERT_EQ(0, client_register(oid, "client1", ""));
+
+  journal::JournalMetadataPtr metadata1 = create_metadata(oid, "client1");
+  ASSERT_EQ(0, init_metadata(metadata1));
+  ASSERT_TRUE(wait_for_update(metadata1));
+
+  ASSERT_EQ(0U, metadata1->get_active_set());
+
+  metadata1->set_active_set(123);
+  ASSERT_TRUE(wait_for_update(metadata1));
+
+  ASSERT_EQ(123U, metadata1->get_active_set());
+}
diff --git a/src/test/journal/test_JournalPlayer.cc b/src/test/journal/test_JournalPlayer.cc
new file mode 100644
index 0000000..adc445a
--- /dev/null
+++ b/src/test/journal/test_JournalPlayer.cc
@@ -0,0 +1,356 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "journal/JournalPlayer.h"
+#include "journal/Entry.h"
+#include "journal/JournalMetadata.h"
+#include "journal/ReplayHandler.h"
+#include "include/stringify.h"
+#include "common/Cond.h"
+#include "common/Mutex.h"
+#include "gtest/gtest.h"
+#include "test/journal/RadosTestFixture.h"
+#include <list>
+
+class TestJournalPlayer : public RadosTestFixture {
+public:
+  typedef std::list<journal::JournalPlayer *> JournalPlayers;
+  typedef std::list<journal::Entry> Entries;
+
+  struct ReplayHandler : public journal::ReplayHandler {
+    Mutex lock;
+    Cond cond;
+    bool entries_available;
+    bool complete;
+    int complete_result;
+
+    ReplayHandler()
+      : lock("lock"), entries_available(false), complete(false),
+        complete_result(0) {}
+
+    virtual void get() {}
+    virtual void put() {}
+
+    virtual bool filter_entry(const std::string &tag) {
+      return false;
+    }
+
+    virtual void handle_entries_available() {
+      Mutex::Locker locker(lock);
+      entries_available = true;
+      cond.Signal();
+    }
+
+    virtual void handle_complete(int r) {
+      Mutex::Locker locker(lock);
+      complete = true;
+      complete_result = r;
+      cond.Signal();
+    }
+  };
+
+  virtual void TearDown() {
+    for (JournalPlayers::iterator it = m_players.begin();
+         it != m_players.end(); ++it) {
+      delete *it;
+    }
+    RadosTestFixture::TearDown();
+  }
+
+  int create(const std::string &oid) {
+    return RadosTestFixture::create(oid, 14, 2);
+  }
+
+  int client_register(const std::string &oid) {
+    return RadosTestFixture::client_register(oid, "client", "");
+  }
+
+  int client_commit(const std::string &oid,
+                    journal::JournalPlayer::ObjectSetPosition position) {
+    return RadosTestFixture::client_commit(oid, "client", position);
+  }
+
+  journal::Entry create_entry(const std::string &tag, uint64_t tid) {
+    bufferlist payload_bl;
+    payload_bl.append("playload");
+    return journal::Entry(tag, tid, payload_bl);
+  }
+
+  journal::JournalMetadataPtr create_metadata(const std::string &oid) {
+    journal::JournalMetadataPtr metadata(new journal::JournalMetadata(
+      m_ioctx, oid, "client", 0.1));
+    return metadata;
+  }
+
+  journal::JournalPlayer *create_player(const std::string &oid,
+                                          const journal::JournalMetadataPtr &metadata) {
+    journal::JournalPlayer *player(new journal::JournalPlayer(
+      m_ioctx, oid + ".", metadata, &m_replay_hander));
+    m_players.push_back(player);
+    return player;
+  }
+
+  bool wait_for_entries(journal::JournalPlayer *player, uint32_t count,
+                        Entries *entries) {
+    entries->clear();
+    while (entries->size() < count) {
+      journal::Entry entry;
+      uint64_t commit_tid;
+      while (entries->size() < count &&
+             player->try_pop_front(&entry, &commit_tid)) {
+        entries->push_back(entry);
+      }
+      if (entries->size() == count) {
+        break;
+      }
+
+      Mutex::Locker locker(m_replay_handler.lock);
+      if (m_replay_handler.entries_available) {
+        m_replay_handler.entries_available = false;
+      } else if (m_replay_handler.cond.WaitInterval(
+          reinterpret_cast<CephContext*>(m_ioctx.cct()),
+          m_replay_handler.lock, utime_t(10, 0)) != 0) {
+        break;
+      }
+    }
+    return entries->size() == count;
+  }
+
+  bool wait_for_complete(journal::JournalPlayer *player) {
+    journal::Entry entry;
+    uint64_t commit_tid;
+    player->try_pop_front(&entry, &commit_tid);
+
+    Mutex::Locker locker(m_replay_handler.lock);
+    while (!m_replay_handler.complete) {
+      if (m_replay_handler.cond.WaitInterval(
+            reinterpret_cast<CephContext*>(m_ioctx.cct()),
+            m_replay_handler.lock, utime_t(10, 0)) != 0) {
+        return false;
+      }
+    }
+    m_replay_handler.complete = false;
+    return true;
+  }
+
+  int write_entry(const std::string &oid, uint64_t object_num,
+                  const std::string &tag, uint64_t tid) {
+    bufferlist bl;
+    ::encode(create_entry(tag, tid), bl);
+    return append(oid + "." + stringify(object_num), bl);
+  }
+
+  JournalPlayers m_players;
+  ReplayHandler m_replay_handler;
+};
+
+TEST_F(TestJournalPlayer, Prefetch) {
+  std::string oid = get_temp_oid();
+
+  journal::JournalPlayer::EntryPositions positions;
+  positions = {
+    cls::journal::EntryPosition("tag1", 122) };
+  cls::journal::ObjectSetPosition commit_position(0, positions);
+
+  ASSERT_EQ(0, create(oid));
+  ASSERT_EQ(0, client_register(oid));
+  ASSERT_EQ(0, client_commit(oid, commit_position));
+
+  journal::JournalMetadataPtr metadata = create_metadata(oid);
+  ASSERT_EQ(0, init_metadata(metadata));
+
+  journal::JournalPlayer *player = create_player(oid, metadata);
+
+  ASSERT_EQ(0, write_entry(oid, 0, "tag1", 122));
+  ASSERT_EQ(0, write_entry(oid, 1, "tag1", 123));
+  ASSERT_EQ(0, write_entry(oid, 0, "tag1", 124));
+  ASSERT_EQ(0, write_entry(oid, 1, "tag1", 125));
+
+  player->prefetch();
+
+  Entries entries;
+  ASSERT_TRUE(wait_for_entries(player, 3, &entries));
+  ASSERT_TRUE(wait_for_complete(player));
+
+  Entries expected_entries;
+  expected_entries = {
+    create_entry("tag1", 123),
+    create_entry("tag1", 124),
+    create_entry("tag1", 125)};
+  ASSERT_EQ(expected_entries, entries);
+
+  uint64_t last_tid;
+  ASSERT_TRUE(metadata->get_last_allocated_tid("tag1", &last_tid));
+  ASSERT_EQ(125U, last_tid);
+}
+
+TEST_F(TestJournalPlayer, PrefetchSkip) {
+  std::string oid = get_temp_oid();
+
+  journal::JournalPlayer::EntryPositions positions;
+  positions = {
+    cls::journal::EntryPosition("tag1", 125) };
+  cls::journal::ObjectSetPosition commit_position(0, positions);
+
+  ASSERT_EQ(0, create(oid));
+  ASSERT_EQ(0, client_register(oid));
+  ASSERT_EQ(0, client_commit(oid, commit_position));
+
+  journal::JournalMetadataPtr metadata = create_metadata(oid);
+  ASSERT_EQ(0, init_metadata(metadata));
+
+  journal::JournalPlayer *player = create_player(oid, metadata);
+
+  ASSERT_EQ(0, write_entry(oid, 0, "tag1", 122));
+  ASSERT_EQ(0, write_entry(oid, 1, "tag1", 123));
+  ASSERT_EQ(0, write_entry(oid, 0, "tag1", 124));
+  ASSERT_EQ(0, write_entry(oid, 1, "tag1", 125));
+
+  player->prefetch();
+
+  Entries entries;
+  ASSERT_TRUE(wait_for_entries(player, 0, &entries));
+  ASSERT_TRUE(wait_for_complete(player));
+
+  uint64_t last_tid;
+  ASSERT_TRUE(metadata->get_last_allocated_tid("tag1", &last_tid));
+  ASSERT_EQ(125U, last_tid);
+}
+
+TEST_F(TestJournalPlayer, PrefetchWithoutCommit) {
+  std::string oid = get_temp_oid();
+
+  cls::journal::ObjectSetPosition commit_position;
+
+  ASSERT_EQ(0, create(oid));
+  ASSERT_EQ(0, client_register(oid));
+  ASSERT_EQ(0, client_commit(oid, commit_position));
+
+  journal::JournalMetadataPtr metadata = create_metadata(oid);
+  ASSERT_EQ(0, init_metadata(metadata));
+
+  journal::JournalPlayer *player = create_player(oid, metadata);
+
+  ASSERT_EQ(0, write_entry(oid, 0, "tag1", 122));
+  ASSERT_EQ(0, write_entry(oid, 1, "tag1", 123));
+
+  player->prefetch();
+
+  Entries entries;
+  ASSERT_TRUE(wait_for_entries(player, 2, &entries));
+  ASSERT_TRUE(wait_for_complete(player));
+
+  Entries expected_entries;
+  expected_entries = {
+    create_entry("tag1", 122),
+    create_entry("tag1", 123)};
+  ASSERT_EQ(expected_entries, entries);
+}
+
+TEST_F(TestJournalPlayer, PrefetchMultipleTags) {
+  std::string oid = get_temp_oid();
+
+  journal::JournalPlayer::EntryPositions positions;
+  positions = {
+    cls::journal::EntryPosition("tag1", 122),
+    cls::journal::EntryPosition("tag2", 1)};
+  cls::journal::ObjectSetPosition commit_position(0, positions);
+
+  ASSERT_EQ(0, create(oid));
+  ASSERT_EQ(0, client_register(oid));
+  ASSERT_EQ(0, client_commit(oid, commit_position));
+
+  journal::JournalMetadataPtr metadata = create_metadata(oid);
+  ASSERT_EQ(0, init_metadata(metadata));
+
+  journal::JournalPlayer *player = create_player(oid, metadata);
+
+  ASSERT_EQ(0, write_entry(oid, 0, "tag1", 120));
+  ASSERT_EQ(0, write_entry(oid, 0, "tag2", 0));
+  ASSERT_EQ(0, write_entry(oid, 1, "tag1", 121));
+  ASSERT_EQ(0, write_entry(oid, 1, "tag2", 1));
+  ASSERT_EQ(0, write_entry(oid, 0, "tag1", 122));
+  ASSERT_EQ(0, write_entry(oid, 1, "tag1", 123));
+  ASSERT_EQ(0, write_entry(oid, 0, "tag1", 124));
+  ASSERT_EQ(0, write_entry(oid, 0, "tag2", 2));
+
+  player->prefetch();
+
+  Entries entries;
+  ASSERT_TRUE(wait_for_entries(player, 3, &entries));
+  ASSERT_TRUE(wait_for_complete(player));
+
+  uint64_t last_tid;
+  ASSERT_TRUE(metadata->get_last_allocated_tid("tag1", &last_tid));
+  ASSERT_EQ(124U, last_tid);
+  ASSERT_TRUE(metadata->get_last_allocated_tid("tag2", &last_tid));
+  ASSERT_EQ(2U, last_tid);
+}
+
+TEST_F(TestJournalPlayer, PrefetchCorruptSequence) {
+  std::string oid = get_temp_oid();
+
+  cls::journal::ObjectSetPosition commit_position;
+
+  ASSERT_EQ(0, create(oid));
+  ASSERT_EQ(0, client_register(oid));
+  ASSERT_EQ(0, client_commit(oid, commit_position));
+
+  journal::JournalMetadataPtr metadata = create_metadata(oid);
+  ASSERT_EQ(0, init_metadata(metadata));
+
+  journal::JournalPlayer *player = create_player(oid, metadata);
+
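+  // The jump from tid 121 to 124 leaves a gap, so replay should deliver the
+  // three in-sequence entries and then fail with a corruption error.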
+  ASSERT_EQ(0, write_entry(oid, 0, "tag1", 120));
+  ASSERT_EQ(0, write_entry(oid, 0, "tag2", 0));
+  ASSERT_EQ(0, write_entry(oid, 1, "tag1", 121));
+  ASSERT_EQ(0, write_entry(oid, 0, "tag1", 124));
+
+  player->prefetch();
+  Entries entries;
+  ASSERT_TRUE(wait_for_entries(player, 3, &entries));
+
+  journal::Entry entry;
+  uint64_t commit_tid;
+  ASSERT_FALSE(player->try_pop_front(&entry, &commit_tid));
+  ASSERT_TRUE(wait_for_complete(player));
+  ASSERT_NE(0, m_replay_handler.complete_result);
+}
+
+TEST_F(TestJournalPlayer, PrefetchAndWatch) {
+  std::string oid = get_temp_oid();
+
+  journal::JournalPlayer::EntryPositions positions;
+  positions = {
+    cls::journal::EntryPosition("tag1", 122)};
+  cls::journal::ObjectSetPosition commit_position(0, positions);
+
+  ASSERT_EQ(0, create(oid));
+  ASSERT_EQ(0, client_register(oid));
+  ASSERT_EQ(0, client_commit(oid, commit_position));
+
+  journal::JournalMetadataPtr metadata = create_metadata(oid);
+  ASSERT_EQ(0, init_metadata(metadata));
+
+  journal::JournalPlayer *player = create_player(oid, metadata);
+
+  ASSERT_EQ(0, write_entry(oid, 0, "tag1", 122));
+
+  player->prefetch_and_watch(0.25);
+
+  Entries entries;
+  ASSERT_EQ(0, write_entry(oid, 1, "tag1", 123));
+  ASSERT_TRUE(wait_for_entries(player, 1, &entries));
+
+  Entries expected_entries;
+  expected_entries = {create_entry("tag1", 123)};
+  ASSERT_EQ(expected_entries, entries);
+
+  ASSERT_EQ(0, write_entry(oid, 0, "tag1", 124));
+  ASSERT_TRUE(wait_for_entries(player, 1, &entries));
+
+  expected_entries = {create_entry("tag1", 124)};
+  ASSERT_EQ(expected_entries, entries);
+}
diff --git a/src/test/journal/test_JournalRecorder.cc b/src/test/journal/test_JournalRecorder.cc
new file mode 100644
index 0000000..73b3f35
--- /dev/null
+++ b/src/test/journal/test_JournalRecorder.cc
@@ -0,0 +1,150 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "journal/JournalRecorder.h"
+#include "journal/JournalMetadata.h"
+#include "test/journal/RadosTestFixture.h"
+#include <limits>
+#include <list>
+
+class TestJournalRecorder : public RadosTestFixture {
+public:
+
+  virtual void TearDown() {
+    for (std::list<journal::JournalRecorder*>::iterator it = m_recorders.begin();
+         it != m_recorders.end(); ++it) {
+      delete *it;
+    }
+    RadosTestFixture::TearDown();
+  }
+
+  int client_register(const std::string &oid) {
+    return RadosTestFixture::client_register(oid, "client", "");
+  }
+
+  journal::JournalMetadataPtr create_metadata(const std::string &oid) {
+    journal::JournalMetadataPtr metadata(new journal::JournalMetadata(
+      m_ioctx, oid, "client", 0.1));
+    return metadata;
+  }
+
+  journal::JournalRecorder *create_recorder(const std::string &oid,
+                                            const journal::JournalMetadataPtr &metadata) {
+    journal::JournalRecorder *recorder(new journal::JournalRecorder(
+      m_ioctx, oid + ".", metadata, 0, std::numeric_limits<uint32_t>::max(), 0));
+    m_recorders.push_back(recorder);
+    return recorder;
+  }
+
+  std::list<journal::JournalRecorder*> m_recorders;
+
+};
+
+TEST_F(TestJournalRecorder, Append) {
+  std::string oid = get_temp_oid();
+  ASSERT_EQ(0, create(oid, 12, 2));
+  ASSERT_EQ(0, client_register(oid));
+
+  journal::JournalMetadataPtr metadata = create_metadata(oid);
+  ASSERT_EQ(0, init_metadata(metadata));
+
+  journal::JournalRecorder *recorder = create_recorder(oid, metadata);
+
+  journal::Future future1 = recorder->append("tag1", create_payload("payload"));
+
+  C_SaferCond cond;
+  future1.flush(&cond);
+  ASSERT_EQ(0, cond.wait());
+}
+
+TEST_F(TestJournalRecorder, AppendKnownOverflow) {
+  std::string oid = get_temp_oid();
+  ASSERT_EQ(0, create(oid, 12, 2));
+  ASSERT_EQ(0, client_register(oid));
+
+  journal::JournalMetadataPtr metadata = create_metadata(oid);
+  ASSERT_EQ(0, init_metadata(metadata));
+  ASSERT_EQ(0U, metadata->get_active_set());
+
+  journal::JournalRecorder *recorder = create_recorder(oid, metadata);
+
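+  // The 2^12-byte payload exhausts an order-12 object, so the follow-up
+  // append must advance the active set to the next object set.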
+  recorder->append("tag1", create_payload(std::string(1 << 12, '1')));
+  journal::Future future2 = recorder->append("tag1", create_payload(std::string(1, '2')));
+
+  C_SaferCond cond;
+  future2.flush(&cond);
+  ASSERT_EQ(0, cond.wait());
+
+  ASSERT_EQ(1U, metadata->get_active_set());
+}
+
+TEST_F(TestJournalRecorder, AppendDelayedOverflow) {
+  std::string oid = get_temp_oid();
+  ASSERT_EQ(0, create(oid, 12, 2));
+  ASSERT_EQ(0, client_register(oid));
+
+  journal::JournalMetadataPtr metadata = create_metadata(oid);
+  ASSERT_EQ(0, init_metadata(metadata));
+  ASSERT_EQ(0U, metadata->get_active_set());
+
+  journal::JournalRecorder *recorder1 = create_recorder(oid, metadata);
+  journal::JournalRecorder *recorder2 = create_recorder(oid, metadata);
+
+  recorder1->append("tag1", create_payload(std::string(1, '1')));
+  recorder2->append("tag2", create_payload(std::string(1 << 12, '2')));
+
+  journal::Future future = recorder2->append("tag1", create_payload(std::string(1, '3')));
+
+  C_SaferCond cond;
+  future.flush(&cond);
+  ASSERT_EQ(0, cond.wait());
+
+  ASSERT_EQ(1U, metadata->get_active_set());
+}
+
+TEST_F(TestJournalRecorder, FutureFlush) {
+  std::string oid = get_temp_oid();
+  ASSERT_EQ(0, create(oid, 12, 2));
+  ASSERT_EQ(0, client_register(oid));
+
+  journal::JournalMetadataPtr metadata = create_metadata(oid);
+  ASSERT_EQ(0, init_metadata(metadata));
+
+  journal::JournalRecorder *recorder = create_recorder(oid, metadata);
+
+  journal::Future future1 = recorder->append("tag1", create_payload("payload1"));
+  journal::Future future2 = recorder->append("tag1", create_payload("payload2"));
+
+  C_SaferCond cond;
+  future2.flush(&cond);
+  ASSERT_EQ(0, cond.wait());
+  ASSERT_TRUE(future1.is_complete());
+  ASSERT_TRUE(future2.is_complete());
+}
+
+TEST_F(TestJournalRecorder, Flush) {
+  std::string oid = get_temp_oid();
+  ASSERT_EQ(0, create(oid, 12, 2));
+  ASSERT_EQ(0, client_register(oid));
+
+  journal::JournalMetadataPtr metadata = create_metadata(oid);
+  ASSERT_EQ(0, init_metadata(metadata));
+
+  journal::JournalRecorder *recorder = create_recorder(oid, metadata);
+
+  journal::Future future1 = recorder->append("tag1", create_payload("payload1"));
+  journal::Future future2 = recorder->append("tag1", create_payload("payload2"));
+
+  C_SaferCond cond1;
+  recorder->flush(&cond1);
+  ASSERT_EQ(0, cond1.wait());
+
+  C_SaferCond cond2;
+  future2.wait(&cond2);
+  ASSERT_EQ(0, cond2.wait());
+  ASSERT_TRUE(future1.is_complete());
+  ASSERT_TRUE(future2.is_complete());
+}
+
diff --git a/src/test/journal/test_JournalTrimmer.cc b/src/test/journal/test_JournalTrimmer.cc
new file mode 100644
index 0000000..ff1f6b8
--- /dev/null
+++ b/src/test/journal/test_JournalTrimmer.cc
@@ -0,0 +1,190 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "journal/JournalTrimmer.h"
+#include "journal/JournalMetadata.h"
+#include "include/stringify.h"
+#include "test/journal/RadosTestFixture.h"
+#include <limits>
+#include <list>
+
+class TestJournalTrimmer : public RadosTestFixture {
+public:
+
+  virtual void TearDown() {
+    for (MetadataList::iterator it = m_metadata_list.begin();
+         it != m_metadata_list.end(); ++it) {
+      (*it)->remove_listener(&m_listener);
+    }
+    for (std::list<journal::JournalTrimmer*>::iterator it = m_trimmers.begin();
+         it != m_trimmers.end(); ++it) {
+      delete *it;
+    }
+    RadosTestFixture::TearDown();
+  }
+
+  int append_payload(journal::JournalMetadataPtr metadata,
+                     const std::string &oid, uint64_t object_num,
+                     const std::string &payload, uint64_t *commit_tid) {
+    int r = append(oid + "." + stringify(object_num), create_payload(payload));
+    uint64_t tid = metadata->allocate_commit_tid(object_num, "tag", 123);
+    if (commit_tid != NULL) {
+      *commit_tid = tid;
+    }
+    return r;
+  }
+
+  using RadosTestFixture::client_register;
+  int client_register(const std::string &oid) {
+    return RadosTestFixture::client_register(oid, "client", "");
+  }
+
+  journal::JournalMetadataPtr create_metadata(const std::string &oid) {
+    journal::JournalMetadataPtr metadata(new journal::JournalMetadata(
+      m_ioctx, oid, "client", 0.1));
+    m_metadata_list.push_back(metadata);
+    metadata->add_listener(&m_listener);
+    return metadata;
+  }
+
+  journal::JournalTrimmer *create_trimmer(const std::string &oid,
+                                            const journal::JournalMetadataPtr &metadata) {
+    journal::JournalTrimmer *trimmer(new journal::JournalTrimmer(
+      m_ioctx, oid + ".", metadata));
+    m_trimmers.push_back(trimmer);
+    return trimmer;
+  }
+
+  int assert_exists(const std::string &oid) {
+    librados::ObjectWriteOperation op;
+    op.assert_exists();
+    return m_ioctx.operate(oid, &op);
+  }
+
+  typedef std::list<journal::JournalMetadataPtr> MetadataList;
+  MetadataList m_metadata_list;
+  std::list<journal::JournalTrimmer*> m_trimmers;
+};
+
+TEST_F(TestJournalTrimmer, Committed) {
+  std::string oid = get_temp_oid();
+  ASSERT_EQ(0, create(oid, 12, 2));
+  ASSERT_EQ(0, client_register(oid));
+
+  journal::JournalMetadataPtr metadata = create_metadata(oid);
+  ASSERT_EQ(0, init_metadata(metadata));
+  ASSERT_TRUE(wait_for_update(metadata));
+
+  metadata->set_active_set(10);
+  ASSERT_TRUE(wait_for_update(metadata));
+
+  uint64_t commit_tid1;
+  uint64_t commit_tid2;
+  uint64_t commit_tid3;
+  uint64_t commit_tid4;
+  uint64_t commit_tid5;
+  uint64_t commit_tid6;
+  ASSERT_EQ(0, append_payload(metadata, oid, 0, "payload", &commit_tid1));
+  ASSERT_EQ(0, append_payload(metadata, oid, 2, "payload", &commit_tid2));
+  ASSERT_EQ(0, append_payload(metadata, oid, 5, "payload", &commit_tid3));
+  ASSERT_EQ(0, append_payload(metadata, oid, 0, "payload", &commit_tid4));
+  ASSERT_EQ(0, append_payload(metadata, oid, 2, "payload", &commit_tid5));
+  ASSERT_EQ(0, append_payload(metadata, oid, 5, "payload", &commit_tid6));
+
+  journal::JournalTrimmer *trimmer = create_trimmer(oid, metadata);
+
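+  // Commits are acknowledged out of order; the trimmer should still advance
+  // the minimum set and remove the fully committed object sets.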
+  trimmer->committed(commit_tid4);
+  trimmer->committed(commit_tid6);
+  trimmer->committed(commit_tid2);
+  trimmer->committed(commit_tid5);
+  trimmer->committed(commit_tid3);
+  trimmer->committed(commit_tid1);
+  while (metadata->get_minimum_set() != 2U) {
+    ASSERT_TRUE(wait_for_update(metadata));
+  }
+
+  ASSERT_EQ(-ENOENT, assert_exists(oid + ".0"));
+  ASSERT_EQ(-ENOENT, assert_exists(oid + ".2"));
+  ASSERT_EQ(0, assert_exists(oid + ".5"));
+}
+
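+// A second registered client has committed nothing, so the trimmer must not
+// prune any objects even after this client commits all of its entries.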
+TEST_F(TestJournalTrimmer, CommittedWithOtherClient) {
+  std::string oid = get_temp_oid();
+  ASSERT_EQ(0, create(oid, 12, 2));
+  ASSERT_EQ(0, client_register(oid));
+  ASSERT_EQ(0, client_register(oid, "client2", "slow client"));
+
+  journal::JournalMetadataPtr metadata = create_metadata(oid);
+  ASSERT_EQ(0, init_metadata(metadata));
+  ASSERT_TRUE(wait_for_update(metadata));
+
+  metadata->set_active_set(10);
+  ASSERT_TRUE(wait_for_update(metadata));
+
+  uint64_t commit_tid1;
+  uint64_t commit_tid2;
+  uint64_t commit_tid3;
+  uint64_t commit_tid4;
+  ASSERT_EQ(0, append_payload(metadata, oid, 0, "payload", &commit_tid1));
+  ASSERT_EQ(0, append_payload(metadata, oid, 2, "payload", &commit_tid2));
+  ASSERT_EQ(0, append_payload(metadata, oid, 3, "payload", &commit_tid3));
+  ASSERT_EQ(0, append_payload(metadata, oid, 5, "payload", &commit_tid4));
+
+  journal::JournalTrimmer *trimmer = create_trimmer(oid, metadata);
+
+  trimmer->committed(commit_tid1);
+  trimmer->committed(commit_tid2);
+  trimmer->committed(commit_tid3);
+  trimmer->committed(commit_tid4);
+  ASSERT_TRUE(wait_for_update(metadata));
+
+  ASSERT_EQ(0, assert_exists(oid + ".0"));
+  ASSERT_EQ(0, assert_exists(oid + ".2"));
+  ASSERT_EQ(0, assert_exists(oid + ".3"));
+  ASSERT_EQ(0, assert_exists(oid + ".5"));
+}
+
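+// remove_objects() should delete every journal object when this is the only
+// registered client.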
+TEST_F(TestJournalTrimmer, RemoveObjects) {
+  std::string oid = get_temp_oid();
+  ASSERT_EQ(0, create(oid, 12, 2));
+  ASSERT_EQ(0, client_register(oid));
+
+  journal::JournalMetadataPtr metadata = create_metadata(oid);
+  ASSERT_EQ(0, init_metadata(metadata));
+  ASSERT_TRUE(wait_for_update(metadata));
+
+  metadata->set_active_set(10);
+  ASSERT_TRUE(wait_for_update(metadata));
+
+  ASSERT_EQ(0, append(oid + ".0", create_payload("payload")));
+  ASSERT_EQ(0, append(oid + ".2", create_payload("payload")));
+  ASSERT_EQ(0, append(oid + ".3", create_payload("payload")));
+  ASSERT_EQ(0, append(oid + ".5", create_payload("payload")));
+
+  journal::JournalTrimmer *trimmer = create_trimmer(oid, metadata);
+
+  ASSERT_EQ(0, trimmer->remove_objects());
+  ASSERT_TRUE(wait_for_update(metadata));
+
+  ASSERT_EQ(-ENOENT, assert_exists(oid + ".0"));
+  ASSERT_EQ(-ENOENT, assert_exists(oid + ".2"));
+  ASSERT_EQ(-ENOENT, assert_exists(oid + ".3"));
+  ASSERT_EQ(-ENOENT, assert_exists(oid + ".5"));
+}
+
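+// remove_objects() must refuse with -EBUSY while another client is registered.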
+TEST_F(TestJournalTrimmer, RemoveObjectsWithOtherClient) {
+  std::string oid = get_temp_oid();
+  ASSERT_EQ(0, create(oid, 12, 2));
+  ASSERT_EQ(0, client_register(oid));
+  ASSERT_EQ(0, client_register(oid, "client2", "other client"));
+
+  journal::JournalMetadataPtr metadata = create_metadata(oid);
+  ASSERT_EQ(0, init_metadata(metadata));
+  ASSERT_TRUE(wait_for_update(metadata));
+
+  journal::JournalTrimmer *trimmer = create_trimmer(oid, metadata);
+  ASSERT_EQ(-EBUSY, trimmer->remove_objects());
+}
+
diff --git a/src/test/journal/test_Journaler.cc b/src/test/journal/test_Journaler.cc
new file mode 100644
index 0000000..5a19910
--- /dev/null
+++ b/src/test/journal/test_Journaler.cc
@@ -0,0 +1,84 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "journal/Journaler.h"
+#include "include/stringify.h"
+#include "gtest/gtest.h"
+#include "test/librados/test.h"
+#include "test/journal/RadosTestFixture.h"
+#include "include/stringify.h"
+
+class TestJournaler : public RadosTestFixture {
+public:
+
+  static const std::string CLIENT_ID;
+
+  static std::string get_temp_journal_id() {
+    return stringify(++_journal_id);
+  }
+
+  virtual void SetUp() {
+    RadosTestFixture::SetUp();
+    m_journal_id = get_temp_journal_id();
+    m_journaler = new journal::Journaler(m_ioctx, m_journal_id, CLIENT_ID, 5);
+  }
+
+  virtual void TearDown() {
+    delete m_journaler;
+    RadosTestFixture::TearDown();
+  }
+
+  int create_journal(uint8_t order, uint8_t splay_width) {
+    return m_journaler->create(order, splay_width, -1);
+  }
+
+  int init_journaler() {
+    C_SaferCond cond;
+    m_journaler->init(&cond);
+    return cond.wait();
+  }
+
+  int register_client(const std::string &client_id, const std::string &desc) {
+    journal::Journaler journaler(m_ioctx, m_journal_id, client_id, 5);
+    return journaler.register_client(desc);
+  }
+
+  static uint64_t _journal_id;
+
+  std::string m_journal_id;
+  journal::Journaler *m_journaler;
+};
+
+const std::string TestJournaler::CLIENT_ID = "client1";
+uint64_t TestJournaler::_journal_id = 0;
+
+TEST_F(TestJournaler, Create) {
+  ASSERT_EQ(0, create_journal(12, 8));
+}
+
+TEST_F(TestJournaler, CreateDuplicate) {
+  ASSERT_EQ(0, create_journal(12, 8));
+  ASSERT_EQ(-EEXIST, create_journal(12, 8));
+}
+
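+// Orders outside the supported range are rejected with -EDOM; a zero splay
+// width is rejected with -EINVAL.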
+TEST_F(TestJournaler, CreateInvalidParams) {
+  ASSERT_EQ(-EDOM, create_journal(1, 8));
+  ASSERT_EQ(-EDOM, create_journal(123, 8));
+  ASSERT_EQ(-EINVAL, create_journal(12, 0));
+}
+
+TEST_F(TestJournaler, Init) {
+  ASSERT_EQ(0, create_journal(12, 8));
+  ASSERT_EQ(0, register_client(CLIENT_ID, "foo"));
+  ASSERT_EQ(0, init_journaler());
+}
+
+TEST_F(TestJournaler, InitDNE) {
+  ASSERT_EQ(-ENOENT, init_journaler());
+}
+
+TEST_F(TestJournaler, RegisterClientDuplicate) {
+  ASSERT_EQ(0, register_client(CLIENT_ID, "foo"));
+  ASSERT_EQ(-EEXIST, register_client(CLIENT_ID, "foo2"));
+}
+
diff --git a/src/test/journal/test_ObjectPlayer.cc b/src/test/journal/test_ObjectPlayer.cc
new file mode 100644
index 0000000..3e9a2af
--- /dev/null
+++ b/src/test/journal/test_ObjectPlayer.cc
@@ -0,0 +1,275 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "journal/ObjectPlayer.h"
+#include "journal/Entry.h"
+#include "include/stringify.h"
+#include "common/Mutex.h"
+#include "common/Timer.h"
+#include "gtest/gtest.h"
+#include "test/librados/test.h"
+#include "test/journal/RadosTestFixture.h"
+
+class TestObjectPlayer : public RadosTestFixture {
+public:
+  journal::ObjectPlayerPtr create_object(const std::string &oid,
+                                         uint8_t order) {
+    journal::ObjectPlayerPtr object(new journal::ObjectPlayer(
+      m_ioctx, oid + ".", 0, *m_timer, m_timer_lock, order));
+    return object;
+  }
+
+  std::string get_object_name(const std::string &oid) {
+    return oid + ".0";
+  }
+};
+
+TEST_F(TestObjectPlayer, Fetch) {
+  std::string oid = get_temp_oid();
+
+  journal::Entry entry1("tag1", 123, create_payload(std::string(24, '1')));
+  journal::Entry entry2("tag1", 124, create_payload(std::string(24, '1')));
+
+  bufferlist bl;
+  ::encode(entry1, bl);
+  ::encode(entry2, bl);
+  ASSERT_EQ(0, append(get_object_name(oid), bl));
+
+  journal::ObjectPlayerPtr object = create_object(oid, 14);
+
+  C_SaferCond cond;
+  object->fetch(&cond);
+  ASSERT_LE(0, cond.wait());
+
+  journal::ObjectPlayer::Entries entries;
+  object->get_entries(&entries);
+  ASSERT_EQ(2U, entries.size());
+
+  journal::ObjectPlayer::Entries expected_entries = {entry1, entry2};
+  ASSERT_EQ(expected_entries, entries);
+}
+
+TEST_F(TestObjectPlayer, FetchLarge) {
+  std::string oid = get_temp_oid();
+
+  journal::Entry entry1("tag1", 123,
+                        create_payload(std::string(8192 - 33, '1')));
+  journal::Entry entry2("tag1", 124, create_payload(""));
+
+  bufferlist bl;
+  ::encode(entry1, bl);
+  ::encode(entry2, bl);
+  ASSERT_EQ(0, append(get_object_name(oid), bl));
+
+  journal::ObjectPlayerPtr object = create_object(oid, 12);
+
+  C_SaferCond cond;
+  object->fetch(&cond);
+  ASSERT_LE(0, cond.wait());
+
+  journal::ObjectPlayer::Entries entries;
+  object->get_entries(&entries);
+  ASSERT_EQ(1U, entries.size());
+
+  journal::ObjectPlayer::Entries expected_entries = {entry1};
+  ASSERT_EQ(expected_entries, entries);
+}
+
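+// Two entries sharing the same tag and tid: the later duplicate should win
+// during replay.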
+TEST_F(TestObjectPlayer, FetchDeDup) {
+  std::string oid = get_temp_oid();
+
+  journal::Entry entry1("tag1", 123, create_payload(std::string(24, '1')));
+  journal::Entry entry2("tag1", 123, create_payload(std::string(24, '2')));
+
+  bufferlist bl;
+  ::encode(entry1, bl);
+  ::encode(entry2, bl);
+  ASSERT_EQ(0, append(get_object_name(oid), bl));
+
+  journal::ObjectPlayerPtr object = create_object(oid, 14);
+
+  C_SaferCond cond;
+  object->fetch(&cond);
+  ASSERT_LE(0, cond.wait());
+
+  journal::ObjectPlayer::Entries entries;
+  object->get_entries(&entries);
+  ASSERT_EQ(1U, entries.size());
+
+  journal::ObjectPlayer::Entries expected_entries = {entry2};
+  ASSERT_EQ(expected_entries, entries);
+}
+
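+// Fetching an object with no decodable entries reports -ENOENT and leaves
+// the player empty.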
+TEST_F(TestObjectPlayer, FetchEmpty) {
+  std::string oid = get_temp_oid();
+
+  bufferlist bl;
+  ASSERT_EQ(0, append(get_object_name(oid), bl));
+
+  journal::ObjectPlayerPtr object = create_object(oid, 14);
+
+  C_SaferCond cond;
+  object->fetch(&cond);
+  ASSERT_EQ(-ENOENT, cond.wait());
+  ASSERT_TRUE(object->empty());
+}
+
+TEST_F(TestObjectPlayer, FetchError) {
+  std::string oid = get_temp_oid();
+
+  journal::ObjectPlayerPtr object = create_object(oid, 14);
+
+  C_SaferCond cond;
+  object->fetch(&cond);
+  ASSERT_EQ(-ENOENT, cond.wait());
+  ASSERT_TRUE(object->empty());
+}
+
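+// A corrupt record mid-object surfaces -EINVAL, but the valid entries on
+// either side of it should still be decoded.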
+TEST_F(TestObjectPlayer, FetchCorrupt) {
+  std::string oid = get_temp_oid();
+
+  journal::Entry entry1("tag1", 123, create_payload(std::string(24, '1')));
+  journal::Entry entry2("tag1", 124, create_payload(std::string(24, '2')));
+
+  bufferlist bl;
+  ::encode(entry1, bl);
+  ::encode(create_payload("corruption"), bl);
+  ::encode(entry2, bl);
+  ASSERT_EQ(0, append(get_object_name(oid), bl));
+
+  journal::ObjectPlayerPtr object = create_object(oid, 14);
+
+  C_SaferCond cond;
+  object->fetch(&cond);
+  ASSERT_EQ(-EINVAL, cond.wait());
+
+  journal::ObjectPlayer::Entries entries;
+  object->get_entries(&entries);
+  ASSERT_EQ(2U, entries.size());
+
+  journal::ObjectPlayer::Entries expected_entries = {entry1, entry2};
+  ASSERT_EQ(expected_entries, entries);
+}
+
+TEST_F(TestObjectPlayer, FetchAppend) {
+  std::string oid = get_temp_oid();
+
+  journal::Entry entry1("tag1", 123, create_payload(std::string(24, '1')));
+  journal::Entry entry2("tag1", 124, create_payload(std::string(24, '2')));
+
+  bufferlist bl;
+  ::encode(entry1, bl);
+  ASSERT_EQ(0, append(get_object_name(oid), bl));
+
+  journal::ObjectPlayerPtr object = create_object(oid, 14);
+
+  C_SaferCond cond1;
+  object->fetch(&cond1);
+  ASSERT_LE(0, cond1.wait());
+
+  journal::ObjectPlayer::Entries entries;
+  object->get_entries(&entries);
+  ASSERT_EQ(1U, entries.size());
+
+  journal::ObjectPlayer::Entries expected_entries = {entry1};
+  ASSERT_EQ(expected_entries, entries);
+
+  bl.clear();
+  ::encode(entry2, bl);
+  ASSERT_EQ(0, append(get_object_name(oid), bl));
+
+  C_SaferCond cond2;
+  object->fetch(&cond2);
+  ASSERT_LE(0, cond2.wait());
+
+  object->get_entries(&entries);
+  ASSERT_EQ(2U, entries.size());
+
+  expected_entries = {entry1, entry2};
+  ASSERT_EQ(expected_entries, entries);
+}
+
+TEST_F(TestObjectPlayer, PopEntry) {
+  std::string oid = get_temp_oid();
+
+  journal::Entry entry1("tag1", 123, create_payload(std::string(24, '1')));
+  journal::Entry entry2("tag1", 124, create_payload(std::string(24, '1')));
+
+  bufferlist bl;
+  ::encode(entry1, bl);
+  ::encode(entry2, bl);
+  ASSERT_EQ(0, append(get_object_name(oid), bl));
+
+  journal::ObjectPlayerPtr object = create_object(oid, 14);
+
+  C_SaferCond cond;
+  object->fetch(&cond);
+  ASSERT_LE(0, cond.wait());
+
+  journal::ObjectPlayer::Entries entries;
+  object->get_entries(&entries);
+  ASSERT_EQ(2U, entries.size());
+
+  journal::Entry entry;
+  object->front(&entry);
+  object->pop_front();
+  ASSERT_EQ(entry1, entry);
+  object->front(&entry);
+  object->pop_front();
+  ASSERT_EQ(entry2, entry);
+  ASSERT_TRUE(object->empty());
+}
+
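+// watch() should invoke its callback each time new entries land in the
+// backing object.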
+TEST_F(TestObjectPlayer, Watch) {
+  std::string oid = get_temp_oid();
+  journal::ObjectPlayerPtr object = create_object(oid, 14);
+
+  C_SaferCond cond1;
+  object->watch(&cond1, 0.1);
+
+  journal::Entry entry1("tag1", 123, create_payload(std::string(24, '1')));
+  journal::Entry entry2("tag1", 124, create_payload(std::string(24, '1')));
+
+  bufferlist bl;
+  ::encode(entry1, bl);
+  ASSERT_EQ(0, append(get_object_name(oid), bl));
+  ASSERT_LE(0, cond1.wait());
+
+  journal::ObjectPlayer::Entries entries;
+  object->get_entries(&entries);
+  ASSERT_EQ(1U, entries.size());
+
+  journal::ObjectPlayer::Entries expected_entries;
+  expected_entries = {entry1};
+  ASSERT_EQ(expected_entries, entries);
+
+  C_SaferCond cond2;
+  object->watch(&cond2, 0.1);
+
+  bl.clear();
+  ::encode(entry2, bl);
+  ASSERT_EQ(0, append(get_object_name(oid), bl));
+  ASSERT_LE(0, cond2.wait());
+
+  object->get_entries(&entries);
+  ASSERT_EQ(2U, entries.size());
+
+  expected_entries = {entry1, entry2};
+  ASSERT_EQ(expected_entries, entries);
+}
+
+TEST_F(TestObjectPlayer, Unwatch) {
+  std::string oid = get_temp_oid();
+  journal::ObjectPlayerPtr object = create_object(oid, 14);
+
+  Mutex mutex("lock");
+  Cond cond;
+  bool done = false;
+  int rval = 0;
+  C_SafeCond *ctx = new C_SafeCond(&mutex, &cond, &done, &rval);
+  object->watch(ctx, 0.1);
+
+  usleep(200000);
+  ASSERT_FALSE(done);
+  object->unwatch();
+}
diff --git a/src/test/journal/test_ObjectRecorder.cc b/src/test/journal/test_ObjectRecorder.cc
new file mode 100644
index 0000000..8c27430
--- /dev/null
+++ b/src/test/journal/test_ObjectRecorder.cc
@@ -0,0 +1,329 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "journal/ObjectRecorder.h"
+#include "common/Cond.h"
+#include "common/Finisher.h"
+#include "common/Mutex.h"
+#include "common/Timer.h"
+#include "gtest/gtest.h"
+#include "test/librados/test.h"
+#include "test/journal/RadosTestFixture.h"
+#include <limits>
+
+class TestObjectRecorder : public RadosTestFixture {
+public:
+  TestObjectRecorder()
+    : m_flush_interval(std::numeric_limits<uint32_t>::max()),
+      m_flush_bytes(std::numeric_limits<uint64_t>::max()),
+      m_flush_age(600),
+      m_finisher(NULL)
+  {
+  }
+
+  ~TestObjectRecorder() {
+    if (m_finisher != NULL) {
+      m_finisher->stop();
+      delete m_finisher;
+    }
+  }
+
+  struct OverflowHandler : public journal::ObjectRecorder::OverflowHandler {
+    Mutex lock;
+    Cond cond;
+    uint32_t overflows;
+
+    OverflowHandler() : lock("lock"), overflows(0) {}
+
+    virtual void overflow(journal::ObjectRecorder *object_recorder) {
+      Mutex::Locker locker(lock);
+      journal::AppendBuffers append_buffers;
+      object_recorder->claim_append_buffers(&append_buffers);
+
+      ++overflows;
+      cond.Signal();
+    }
+  };
+
+  typedef std::list<journal::ObjectRecorderPtr> ObjectRecorders;
+
+  ObjectRecorders m_object_recorders;
+
+  uint32_t m_flush_interval;
+  uint64_t m_flush_bytes;
+  double m_flush_age;
+  OverflowHandler m_overflow_handler;
+
+  Finisher *m_finisher;
+
+  void SetUp() {
+    RadosTestFixture::SetUp();
+    m_finisher = new Finisher(reinterpret_cast<CephContext*>(m_ioctx.cct()));
+    m_finisher->start();
+  }
+
+  void TearDown() {
+    for (ObjectRecorders::iterator it = m_object_recorders.begin();
+         it != m_object_recorders.end(); ++it) {
+      C_SaferCond cond;
+      (*it)->flush(&cond);
+      cond.wait();
+    }
+    RadosTestFixture::TearDown();
+  }
+
+  inline void set_flush_interval(uint32_t i) {
+    m_flush_interval = i;
+  }
+  inline void set_flush_bytes(uint64_t i) {
+    m_flush_bytes = i;
+  }
+  inline void set_flush_age(double i) {
+    m_flush_age = i;
+  }
+
+  journal::AppendBuffer create_append_buffer(const std::string &tag,
+                                             uint64_t tid,
+                                             const std::string &payload) {
+    journal::FutureImplPtr future(new journal::FutureImpl(*m_finisher,
+                                                          tag, tid, 456));
+    future->init(journal::FutureImplPtr());
+
+    bufferlist bl;
+    bl.append(payload);
+    return std::make_pair(future, bl);
+  }
+
+  journal::ObjectRecorderPtr create_object(const std::string &oid,
+                                           uint8_t order) {
+    journal::ObjectRecorderPtr object(new journal::ObjectRecorder(
+      m_ioctx, oid, 0, *m_timer, m_timer_lock, &m_overflow_handler, order,
+      m_flush_interval, m_flush_bytes, m_flush_age));
+    m_object_recorders.push_back(object);
+    return object;
+  }
+};
+
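+// With no flush thresholds configured, appends stay pending until an
+// explicit flush.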
+TEST_F(TestObjectRecorder, Append) {
+  std::string oid = get_temp_oid();
+
+  journal::ObjectRecorderPtr object = create_object(oid, 24);
+
+  journal::AppendBuffer append_buffer1 = create_append_buffer("tag", 123,
+                                                             "payload");
+  journal::AppendBuffers append_buffers;
+  append_buffers = {append_buffer1};
+  ASSERT_FALSE(object->append(append_buffers));
+  ASSERT_EQ(1U, object->get_pending_appends());
+
+  journal::AppendBuffer append_buffer2 = create_append_buffer("tag", 124,
+                                                             "payload");
+  append_buffers = {append_buffer2};
+  ASSERT_FALSE(object->append(append_buffers));
+  ASSERT_EQ(2U, object->get_pending_appends());
+
+  C_SaferCond cond;
+  append_buffer2.first->flush(&cond);
+  ASSERT_EQ(0, cond.wait());
+  ASSERT_EQ(0U, object->get_pending_appends());
+}
+
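+// flush_interval=2: the second append reaches the count threshold and both
+// buffers flush immediately.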
+TEST_F(TestObjectRecorder, AppendFlushByCount) {
+  std::string oid = get_temp_oid();
+
+  set_flush_interval(2);
+  journal::ObjectRecorderPtr object = create_object(oid, 24);
+
+  journal::AppendBuffer append_buffer1 = create_append_buffer("tag", 123,
+                                                             "payload");
+  journal::AppendBuffers append_buffers;
+  append_buffers = {append_buffer1};
+  ASSERT_FALSE(object->append(append_buffers));
+  ASSERT_EQ(1U, object->get_pending_appends());
+
+  journal::AppendBuffer append_buffer2 = create_append_buffer("tag", 124,
+                                                             "payload");
+  append_buffers = {append_buffer2};
+  ASSERT_FALSE(object->append(append_buffers));
+  ASSERT_EQ(0U, object->get_pending_appends());
+
+  C_SaferCond cond;
+  append_buffer2.first->wait(&cond);
+  ASSERT_EQ(0, cond.wait());
+}
+
+TEST_F(TestObjectRecorder, AppendFlushByBytes) {
+  std::string oid = get_temp_oid();
+
+  set_flush_bytes(10);
+  journal::ObjectRecorderPtr object = create_object(oid, 24);
+
+  journal::AppendBuffer append_buffer1 = create_append_buffer("tag", 123,
+                                                             "payload");
+  journal::AppendBuffers append_buffers;
+  append_buffers = {append_buffer1};
+  ASSERT_FALSE(object->append(append_buffers));
+  ASSERT_EQ(1U, object->get_pending_appends());
+
+  journal::AppendBuffer append_buffer2 = create_append_buffer("tag", 124,
+                                                             "payload");
+  append_buffers = {append_buffer2};
+  ASSERT_FALSE(object->append(append_buffers));
+  ASSERT_EQ(0U, object->get_pending_appends());
+
+  C_SaferCond cond;
+  append_buffer2.first->wait(&cond);
+  ASSERT_EQ(0, cond.wait());
+}
+
+TEST_F(TestObjectRecorder, AppendFlushByAge) {
+  std::string oid = get_temp_oid();
+
+  set_flush_age(0.1);
+  journal::ObjectRecorderPtr object = create_object(oid, 24);
+
+  journal::AppendBuffer append_buffer1 = create_append_buffer("tag", 123,
+                                                             "payload");
+  journal::AppendBuffers append_buffers;
+  append_buffers = {append_buffer1};
+  ASSERT_FALSE(object->append(append_buffers));
+
+  journal::AppendBuffer append_buffer2 = create_append_buffer("tag", 124,
+                                                             "payload");
+  append_buffers = {append_buffer2};
+  ASSERT_FALSE(object->append(append_buffers));
+
+  C_SaferCond cond;
+  append_buffer2.first->wait(&cond);
+  ASSERT_EQ(0, cond.wait());
+  ASSERT_EQ(0U, object->get_pending_appends());
+}
+
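+// With a 4096-byte object (order 12), the second 2KB append overflows the
+// object, so append() reports it as full by returning true.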
+TEST_F(TestObjectRecorder, AppendFilledObject) {
+  std::string oid = get_temp_oid();
+
+  journal::ObjectRecorderPtr object = create_object(oid, 12);
+
+  std::string payload(2048, '1');
+  journal::AppendBuffer append_buffer1 = create_append_buffer("tag", 123,
+                                                              payload);
+  journal::AppendBuffers append_buffers;
+  append_buffers = {append_buffer1};
+  ASSERT_FALSE(object->append(append_buffers));
+
+  journal::AppendBuffer append_buffer2 = create_append_buffer("tag", 124,
+                                                              payload);
+  append_buffers = {append_buffer2};
+  ASSERT_TRUE(object->append(append_buffers));
+
+  C_SaferCond cond;
+  append_buffer2.first->wait(&cond);
+  ASSERT_EQ(0, cond.wait());
+  ASSERT_EQ(0U, object->get_pending_appends());
+}
+
+TEST_F(TestObjectRecorder, Flush) {
+  std::string oid = get_temp_oid();
+
+  journal::ObjectRecorderPtr object = create_object(oid, 24);
+
+  journal::AppendBuffer append_buffer1 = create_append_buffer("tag", 123,
+                                                             "payload");
+  journal::AppendBuffers append_buffers;
+  append_buffers = {append_buffer1};
+  ASSERT_FALSE(object->append(append_buffers));
+  ASSERT_EQ(1U, object->get_pending_appends());
+
+  C_SaferCond cond1;
+  object->flush(&cond1);
+  ASSERT_EQ(0, cond1.wait());
+
+  C_SaferCond cond2;
+  append_buffer1.first->wait(&cond2);
+  ASSERT_EQ(0, cond2.wait());
+  ASSERT_EQ(0U, object->get_pending_appends());
+}
+
+TEST_F(TestObjectRecorder, FlushFuture) {
+  std::string oid = get_temp_oid();
+
+  journal::ObjectRecorderPtr object = create_object(oid, 24);
+
+  journal::AppendBuffer append_buffer = create_append_buffer("tag", 123,
+                                                             "payload");
+  journal::AppendBuffers append_buffers;
+  append_buffers = {append_buffer};
+  ASSERT_FALSE(object->append(append_buffers));
+  ASSERT_EQ(1U, object->get_pending_appends());
+
+  C_SaferCond cond;
+  append_buffer.first->wait(&cond);
+  object->flush(append_buffer.first);
+  ASSERT_TRUE(append_buffer.first->is_flush_in_progress() ||
+              append_buffer.first->is_complete());
+  ASSERT_EQ(0, cond.wait());
+}
+
+TEST_F(TestObjectRecorder, FlushDetachedFuture) {
+  std::string oid = get_temp_oid();
+
+  journal::ObjectRecorderPtr object = create_object(oid, 24);
+
+  journal::AppendBuffer append_buffer = create_append_buffer("tag", 123,
+                                                             "payload");
+
+  journal::AppendBuffers append_buffers;
+  append_buffers = {append_buffer};
+
+  object->flush(append_buffer.first);
+  ASSERT_FALSE(append_buffer.first->is_flush_in_progress());
+  ASSERT_FALSE(object->append(append_buffers));
+
+  // should automatically flush once it's attached to the object
+  C_SaferCond cond;
+  append_buffer.first->wait(&cond);
+  ASSERT_EQ(0, cond.wait());
+}
+
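+// Appending past an object's capacity should eventually fire the registered
+// overflow handler.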
+TEST_F(TestObjectRecorder, Overflow) {
+  std::string oid = get_temp_oid();
+
+  journal::ObjectRecorderPtr object1 = create_object(oid, 12);
+  journal::ObjectRecorderPtr object2 = create_object(oid, 12);
+
+  std::string payload(2048, '1');
+  journal::AppendBuffer append_buffer1 = create_append_buffer("tag", 123,
+                                                              payload);
+  journal::AppendBuffer append_buffer2 = create_append_buffer("tag", 124,
+                                                              payload);
+  journal::AppendBuffers append_buffers;
+  append_buffers = {append_buffer1, append_buffer2};
+  ASSERT_TRUE(object1->append(append_buffers));
+
+  C_SaferCond cond;
+  append_buffer2.first->wait(&cond);
+  ASSERT_EQ(0, cond.wait());
+  ASSERT_EQ(0U, object1->get_pending_appends());
+
+  journal::AppendBuffer append_buffer3 = create_append_buffer("bar", 123,
+                                                              payload);
+  append_buffers = {append_buffer3};
+
+  ASSERT_FALSE(object2->append(append_buffers));
+  append_buffer3.first->flush(NULL);
+
+  bool overflowed = false;
+  {
+    Mutex::Locker locker(m_overflow_handler.lock);
+    while (m_overflow_handler.overflows == 0) {
+      if (m_overflow_handler.cond.WaitInterval(
+            reinterpret_cast<CephContext*>(m_ioctx.cct()),
+            m_overflow_handler.lock, utime_t(10, 0)) != 0) {
+        break;
+      }
+    }
+    if (m_overflow_handler.overflows != 0) {
+      overflowed = true;
+    }
+  }
+
+  ASSERT_TRUE(overflowed);
+}
diff --git a/src/test/journal/test_main.cc b/src/test/journal/test_main.cc
new file mode 100644
index 0000000..f8c7d3e
--- /dev/null
+++ b/src/test/journal/test_main.cc
@@ -0,0 +1,26 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "gtest/gtest.h"
+#include "common/ceph_argparse.h"
+#include "common/ceph_crypto.h"
+#include "common/config.h"
+#include "global/global_context.h"
+#include "global/global_init.h"
+#include <vector>
+
+int main(int argc, char **argv)
+{
+  ::testing::InitGoogleTest(&argc, argv);
+
+  std::vector<const char*> args;
+  argv_to_vec(argc, (const char **)argv, args);
+
+  global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, 0);
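+  // lockdep catches lock-ordering violations in the threaded journal code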
+  g_conf->set_val("lockdep", "true");
+  common_init_finish(g_ceph_context);
+
+  int r = RUN_ALL_TESTS();
+  g_ceph_context->put();
+  return r;
+}
diff --git a/src/test/libcephfs/access.cc b/src/test/libcephfs/access.cc
new file mode 100644
index 0000000..ac8cbd1
--- /dev/null
+++ b/src/test/libcephfs/access.cc
@@ -0,0 +1,358 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "gtest/gtest.h"
+#include "include/buffer.h"
+#include "include/cephfs/libcephfs.h"
+#include "include/rados/librados.h"
+#include <errno.h>
+#include <sys/fcntl.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <dirent.h>
+#include <sys/xattr.h>
+#include <sys/uio.h>
+#include <iostream>
+#include <vector>
+#include "common/ceph_argparse.h"
+#include "include/stringify.h"
+#include "json_spirit/json_spirit.h"
+
+#ifdef __linux__
+#include <limits.h>
+#endif
+
+
+rados_t cluster;
+
+int do_mon_command(string s, string *key)
+{
+  char *outs, *outbuf;
+  size_t outs_len, outbuf_len;
+  const char *ss = s.c_str();
+  int r = rados_mon_command(cluster, (const char **)&ss, 1,
+			    0, 0,
+			    &outbuf, &outbuf_len,
+			    &outs, &outs_len);
+  if (outs_len) {
+    string status(outs, outs_len);
+    std::cout << "outs: " << status << std::endl;
+    free(outs);
+  }
+  if (!outbuf_len) {
+    // no JSON reply to extract the key from
+    return -EINVAL;
+  }
+  string out(outbuf, outbuf_len);
+  std::cout << "out: " << out << std::endl;
+
+  // parse the generated key out of the JSON reply
+  json_spirit::mValue v, k;
+  json_spirit::read_or_throw(out, v);
+  k = v.get_array()[0].get_obj().find("key")->second;
+  *key = k.get_str();
+  std::cout << "key: " << *key << std::endl;
+  free(outbuf);
+  return r;
+}
+
+string get_unique_dir()
+{
+  return string("/ceph_test_libcephfs.") + stringify(rand());
+}
+
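+// Round trip: create a cephx user with broad caps, mount with the generated
+// key, and verify the mount succeeds.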
+TEST(AccessTest, Foo) {
+  string dir = get_unique_dir();
+  string user = "libcephfs_foo_test." + stringify(rand());
+  // admin mount to set up test
+  struct ceph_mount_info *admin;
+  ASSERT_EQ(0, ceph_create(&admin, NULL));
+  ASSERT_EQ(0, ceph_conf_read_file(admin, NULL));
+  ASSERT_EQ(0, ceph_conf_parse_env(admin, NULL));
+  ASSERT_EQ(0, ceph_mount(admin, "/"));
+  ASSERT_EQ(0, ceph_mkdir(admin, dir.c_str(), 0755));
+
+  // create access key
+  string key;
+  ASSERT_EQ(0, do_mon_command(
+      "{\"prefix\": \"auth get-or-create\", \"entity\": \"client." + user + "\", "
+      "\"caps\": [\"mon\", \"allow *\", \"osd\", \"allow rw\", "
+      "\"mds\", \"allow rw\""
+      "], \"format\": \"json\"}", &key));
+
+  struct ceph_mount_info *cmount;
+  ASSERT_EQ(0, ceph_create(&cmount, user.c_str()));
+  ASSERT_EQ(0, ceph_conf_read_file(cmount, NULL));
+  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
+  ASSERT_EQ(0, ceph_conf_set(cmount, "key", key.c_str()));
+  ASSERT_EQ(0, ceph_mount(cmount, "/"));
+
+  ceph_shutdown(cmount);
+
+  // clean up
+  ASSERT_EQ(0, ceph_rmdir(admin, dir.c_str()));
+  ceph_shutdown(admin);
+}
+
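+// "allow r, allow rw path=<dir>" caps: writes succeed only under the
+// permitted directory; reads are allowed everywhere.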
+TEST(AccessTest, Path) {
+  string good = get_unique_dir();
+  string bad = get_unique_dir();
+  string user = "libcephfs_path_test." + stringify(rand());
+  struct ceph_mount_info *admin;
+  ASSERT_EQ(0, ceph_create(&admin, NULL));
+  ASSERT_EQ(0, ceph_conf_read_file(admin, NULL));
+  ASSERT_EQ(0, ceph_conf_parse_env(admin, NULL));
+  ASSERT_EQ(0, ceph_mount(admin, "/"));
+  ASSERT_EQ(0, ceph_mkdir(admin, good.c_str(), 0755));
+  ASSERT_EQ(0, ceph_mkdir(admin, string(good + "/p").c_str(), 0755));
+  ASSERT_EQ(0, ceph_mkdir(admin, bad.c_str(), 0755));
+  ASSERT_EQ(0, ceph_mkdir(admin, string(bad + "/p").c_str(), 0755));
+  int fd = ceph_open(admin, string(good + "/q").c_str(), O_CREAT|O_WRONLY, 0755);
+  ceph_close(admin, fd);
+  fd = ceph_open(admin, string(bad + "/q").c_str(), O_CREAT|O_WRONLY, 0755);
+  ceph_close(admin, fd);
+  fd = ceph_open(admin, string(bad + "/z").c_str(), O_CREAT|O_WRONLY, 0755);
+  ceph_write(admin, fd, "TEST FAILED", 11, 0);
+  ceph_close(admin, fd);
+
+  string key;
+  ASSERT_EQ(0, do_mon_command(
+      "{\"prefix\": \"auth get-or-create\", \"entity\": \"client." + user + "\", "
+      "\"caps\": [\"mon\", \"allow r\", \"osd\", \"allow rwx\", "
+      "\"mds\", \"allow r, allow rw path=" + good + "\""
+      "], \"format\": \"json\"}", &key));
+
+  struct ceph_mount_info *cmount;
+  ASSERT_EQ(0, ceph_create(&cmount, user.c_str()));
+  ASSERT_EQ(0, ceph_conf_read_file(cmount, NULL));
+  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
+  ASSERT_EQ(0, ceph_conf_set(cmount, "key", key.c_str()));
+  ASSERT_EQ(0, ceph_mount(cmount, "/"));
+
+  // allowed
+  ASSERT_GE(ceph_mkdir(cmount, string(good + "/x").c_str(), 0755), 0);
+  ASSERT_GE(ceph_rmdir(cmount, string(good + "/p").c_str()), 0);
+  ASSERT_GE(ceph_unlink(cmount, string(good + "/q").c_str()), 0);
+  fd = ceph_open(cmount, string(good + "/y").c_str(), O_CREAT|O_WRONLY, 0755);
+  ASSERT_GE(fd, 0);
+  ceph_write(cmount, fd, "bar", 3, 0);
+  ceph_close(cmount, fd);
+  ASSERT_GE(ceph_unlink(cmount, string(good + "/y").c_str()), 0);
+  ASSERT_GE(ceph_rmdir(cmount, string(good + "/x").c_str()), 0);
+
+  fd = ceph_open(cmount, string(bad + "/z").c_str(), O_RDONLY, 0644);
+  ASSERT_GE(fd, 0);
+  ceph_close(cmount, fd);
+
+  // not allowed
+  ASSERT_LT(ceph_mkdir(cmount, string(bad + "/x").c_str(), 0755), 0);
+  ASSERT_LT(ceph_rmdir(cmount, string(bad + "/p").c_str()), 0);
+  ASSERT_LT(ceph_unlink(cmount, string(bad + "/q").c_str()), 0);
+  fd = ceph_open(cmount, string(bad + "/y").c_str(), O_CREAT|O_WRONLY, 0755);
+  ASSERT_LT(fd, 0);
+
+  // unlink open file
+  fd = ceph_open(cmount, string(good + "/unlinkme").c_str(), O_CREAT|O_WRONLY, 0755);
+  ceph_unlink(cmount, string(good + "/unlinkme").c_str());
+  ASSERT_GE(ceph_write(cmount, fd, "foo", 3, 0), 0);
+  ASSERT_GE(ceph_fchmod(cmount, fd, 0777), 0);
+  ASSERT_GE(ceph_ftruncate(cmount, fd, 0), 0);
+  ASSERT_GE(ceph_fsetxattr(cmount, fd, "user.any", "bar", 3, 0), 0);
+  ceph_close(cmount, fd);
+
+  // rename open file
+  fd = ceph_open(cmount, string(good + "/renameme").c_str(), O_CREAT|O_WRONLY, 0755);
+  ASSERT_EQ(ceph_rename(admin, string(good + "/renameme").c_str(),
+			string(bad + "/asdf").c_str()), 0);
+  ASSERT_GE(ceph_write(cmount, fd, "foo", 3, 0), 0);
+  ASSERT_GE(ceph_fchmod(cmount, fd, 0777), -EACCES);
+  ASSERT_GE(ceph_ftruncate(cmount, fd, 0), -EACCES);
+  ASSERT_GE(ceph_fsetxattr(cmount, fd, "user.any", "bar", 3, 0), -EACCES);
+  ceph_close(cmount, fd);
+
+  ceph_shutdown(cmount);
+  ASSERT_EQ(0, ceph_unlink(admin, string(bad + "/q").c_str()));
+  ASSERT_EQ(0, ceph_unlink(admin, string(bad + "/z").c_str()));
+  ASSERT_EQ(0, ceph_rmdir(admin, string(bad + "/p").c_str()));
+  ASSERT_EQ(0, ceph_unlink(admin, string(bad + "/asdf").c_str()));
+  ASSERT_EQ(0, ceph_rmdir(admin, good.c_str()));
+  ASSERT_EQ(0, ceph_rmdir(admin, bad.c_str()));
+  ceph_shutdown(admin);
+}
+
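+// An "mds allow r" cap permits reads but rejects file creation and mkdir.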
+TEST(AccessTest, ReadOnly) {
+  string dir = get_unique_dir();
+  string dir2 = get_unique_dir();
+  string user = "libcephfs_readonly_test." + stringify(rand());
+  struct ceph_mount_info *admin;
+  ASSERT_EQ(0, ceph_create(&admin, NULL));
+  ASSERT_EQ(0, ceph_conf_read_file(admin, NULL));
+  ASSERT_EQ(0, ceph_conf_parse_env(admin, NULL));
+  ASSERT_EQ(0, ceph_mount(admin, "/"));
+  ASSERT_EQ(0, ceph_mkdir(admin, dir.c_str(), 0755));
+  int fd = ceph_open(admin, string(dir + "/out").c_str(), O_CREAT|O_WRONLY, 0755);
+  ceph_write(admin, fd, "foo", 3, 0);
+  ceph_close(admin,fd);
+
+  string key;
+  ASSERT_EQ(0, do_mon_command(
+      "{\"prefix\": \"auth get-or-create\", \"entity\": \"client." + user + "\", "
+      "\"caps\": [\"mon\", \"allow r\", \"osd\", \"allow rw\", "
+      "\"mds\", \"allow r\""
+      "], \"format\": \"json\"}", &key));
+
+  struct ceph_mount_info *cmount;
+  ASSERT_EQ(0, ceph_create(&cmount, user.c_str()));
+  ASSERT_EQ(0, ceph_conf_read_file(cmount, NULL));
+  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
+  ASSERT_EQ(0, ceph_conf_set(cmount, "key", key.c_str()));
+  ASSERT_EQ(0, ceph_mount(cmount, "/"));
+
+  // allowed
+  fd = ceph_open(cmount, string(dir + "/out").c_str(), O_RDONLY, 0644);
+  ASSERT_GE(fd, 0);
+  ceph_close(cmount,fd);
+
+  // not allowed
+  fd = ceph_open(cmount, string(dir + "/bar").c_str(), O_CREAT|O_WRONLY, 0755);
+  ASSERT_LT(fd, 0);
+  ASSERT_LT(ceph_mkdir(cmount, dir2.c_str(), 0755), 0);
+
+  ceph_shutdown(cmount);
+  ASSERT_EQ(0, ceph_unlink(admin, string(dir + "/out").c_str()));
+  ASSERT_EQ(0, ceph_rmdir(admin, dir.c_str()));
+  ceph_shutdown(admin);
+}
+
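+// "allow rw uid=123 gids=456,789" caps: the mount is refused until the
+// client identifies as uid 123, and POSIX permission checks are then
+// enforced against that uid and those gids.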
+TEST(AccessTest, User) {
+  string dir = get_unique_dir();
+  string user = "libcephfs_user_test." + stringify(rand());
+
+  // admin mount to set up test
+  struct ceph_mount_info *admin;
+  ASSERT_EQ(0, ceph_create(&admin, NULL));
+  ASSERT_EQ(0, ceph_conf_read_file(admin, NULL));
+  ASSERT_EQ(0, ceph_conf_parse_env(admin, NULL));
+  ASSERT_EQ(0, ceph_mount(admin, "/"));
+  ASSERT_EQ(0, ceph_mkdir(admin, dir.c_str(), 0755));
+
+  // create access key
+  string key;
+  ASSERT_EQ(0, do_mon_command(
+      "{\"prefix\": \"auth get-or-create\", \"entity\": \"client." + user + "\", "
+      "\"caps\": [\"mon\", \"allow *\", \"osd\", \"allow rw\", "
+      "\"mds\", \"allow rw uid=123 gids=456,789\""
+      "], \"format\": \"json\"}", &key));
+
+  struct ceph_mount_info *cmount;
+  ASSERT_EQ(0, ceph_create(&cmount, user.c_str()));
+  ASSERT_EQ(0, ceph_conf_read_file(cmount, NULL));
+  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
+  ASSERT_EQ(0, ceph_conf_set(cmount, "key", key.c_str()));
+  ASSERT_EQ(-EACCES, ceph_mount(cmount, "/"));
+  ASSERT_EQ(0, ceph_conf_set(cmount, "client_mount_uid", "123"));
+  ASSERT_EQ(0, ceph_conf_set(cmount, "client_mount_gid", "456"));
+  ASSERT_EQ(0, ceph_mount(cmount, "/"));
+
+  // user bits
+  ASSERT_EQ(0, ceph_chmod(admin, dir.c_str(), 0700));
+  ASSERT_EQ(0, ceph_chown(admin, dir.c_str(), 123, 456));
+  ASSERT_EQ(0, ceph_mkdir(cmount, string(dir + "/u1").c_str(), 0755));
+  ASSERT_EQ(0, ceph_chown(admin, dir.c_str(), 1, 456));
+  ASSERT_EQ(-EACCES, ceph_mkdir(cmount, string(dir + "/no").c_str(), 0755));
+
+  // group bits
+  ASSERT_EQ(0, ceph_chmod(admin, dir.c_str(), 0770));
+  ASSERT_EQ(0, ceph_chown(admin, dir.c_str(), 1, 456));
+  ASSERT_EQ(0, ceph_mkdir(cmount, string(dir + "/u2").c_str(), 0755));
+  ASSERT_EQ(0, ceph_chown(admin, dir.c_str(), 1, 2));
+  ASSERT_EQ(-EACCES, ceph_mkdir(cmount, string(dir + "/no").c_str(), 0755));
+
+  // user overrides group
+  ASSERT_EQ(0, ceph_chmod(admin, dir.c_str(), 0470));
+  ASSERT_EQ(0, ceph_chown(admin, dir.c_str(), 123, 456));
+  ASSERT_EQ(-EACCES, ceph_mkdir(cmount, string(dir + "/no").c_str(), 0755));
+
+  // other
+  ASSERT_EQ(0, ceph_chmod(admin, dir.c_str(), 0777));
+  ASSERT_EQ(0, ceph_chown(admin, dir.c_str(), 1, 1));
+  ASSERT_EQ(0, ceph_mkdir(cmount, string(dir + "/u3").c_str(), 0755));
+  ASSERT_EQ(0, ceph_chmod(admin, dir.c_str(), 0770));
+  ASSERT_EQ(-EACCES, ceph_mkdir(cmount, string(dir + "/no").c_str(), 0755));
+
+  // user and group overrides other
+  ASSERT_EQ(0, ceph_chmod(admin, dir.c_str(), 07));
+  ASSERT_EQ(0, ceph_chown(admin, dir.c_str(), 1, 456));
+  ASSERT_EQ(-EACCES, ceph_mkdir(cmount, string(dir + "/no").c_str(), 0755));
+  ASSERT_EQ(0, ceph_chown(admin, dir.c_str(), 123, 1));
+  ASSERT_EQ(-EACCES, ceph_mkdir(cmount, string(dir + "/no").c_str(), 0755));
+  ASSERT_EQ(0, ceph_chown(admin, dir.c_str(), 123, 456));
+  ASSERT_EQ(-EACCES, ceph_mkdir(cmount, string(dir + "/no").c_str(), 0755));
+
+  // chown and chgrp
+  ASSERT_EQ(0, ceph_chmod(admin, dir.c_str(), 0700));
+  ASSERT_EQ(0, ceph_chown(admin, dir.c_str(), 123, 456));
+  ASSERT_EQ(0, ceph_chown(cmount, dir.c_str(), 123, 789));
+  ASSERT_EQ(0, ceph_chown(cmount, dir.c_str(), 123, 456));
+  ASSERT_EQ(0, ceph_chown(cmount, dir.c_str(), -1, 789));
+  ASSERT_EQ(0, ceph_chown(cmount, dir.c_str(), -1, 456));
+  ASSERT_EQ(-EACCES, ceph_chown(cmount, dir.c_str(), 123, 1));
+  ASSERT_EQ(-EACCES, ceph_chown(cmount, dir.c_str(), 1, 456));
+
+  ASSERT_EQ(0, ceph_chown(admin, dir.c_str(), 1, 1));
+  ASSERT_EQ(-EACCES, ceph_chown(cmount, dir.c_str(), 123, 456));
+  ASSERT_EQ(-EACCES, ceph_chown(cmount, dir.c_str(), 123, -1));
+  ASSERT_EQ(-EACCES, ceph_chown(cmount, dir.c_str(), -1, 456));
+
+  ASSERT_EQ(0, ceph_chown(admin, dir.c_str(), 1, 456));
+  ASSERT_EQ(-EACCES, ceph_chown(cmount, dir.c_str(), 123, 456));
+  ASSERT_EQ(-EACCES, ceph_chown(cmount, dir.c_str(), 123, -1));
+  ASSERT_EQ(-EACCES, ceph_chown(cmount, dir.c_str(), -1, 456));
+
+  ASSERT_EQ(0, ceph_chown(admin, dir.c_str(), 123, 1));
+  ASSERT_EQ(0, ceph_chown(cmount, dir.c_str(), -1, 456));
+  ASSERT_EQ(0, ceph_chown(cmount, dir.c_str(), 123, 789));
+
+  ceph_shutdown(cmount);
+
+  // clean up
+  ASSERT_EQ(0, ceph_rmdir(admin, string(dir + "/u1").c_str()));
+  ASSERT_EQ(0, ceph_rmdir(admin, string(dir + "/u2").c_str()));
+  ASSERT_EQ(0, ceph_rmdir(admin, string(dir + "/u3").c_str()));
+  ASSERT_EQ(0, ceph_rmdir(admin, dir.c_str()));
+  ceph_shutdown(admin);
+}
+
+int main(int argc, char **argv)
+{
+  ::testing::InitGoogleTest(&argc, argv);
+
+  srand(getpid());
+
+  rados_create(&cluster, NULL);
+  rados_conf_read_file(cluster, NULL);
+  rados_conf_parse_env(cluster, NULL);
+  int r = rados_connect(cluster);
+  if (r < 0)
+    exit(1);
+
+  r = RUN_ALL_TESTS();
+
+  rados_shutdown(cluster);
+
+  return r;
+}
diff --git a/src/test/libcephfs/flock.cc b/src/test/libcephfs/flock.cc
index 2bea91f..0b3f0d7 100644
--- a/src/test/libcephfs/flock.cc
+++ b/src/test/libcephfs/flock.cc
@@ -22,6 +22,7 @@
 #include <errno.h>
 #include <sys/fcntl.h>
 #include <unistd.h>
+#include <sys/file.h>
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <dirent.h>
@@ -39,8 +40,8 @@
 // Startup common: create and mount ceph fs
 #define STARTUP_CEPH() do {				\
     ASSERT_EQ(0, ceph_create(&cmount, NULL));		\
-    ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));	\
     ASSERT_EQ(0, ceph_conf_read_file(cmount, NULL));	\
+    ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));	\
     ASSERT_EQ(0, ceph_mount(cmount, NULL));		\
   } while(0)
 
diff --git a/src/test/libcephfs/multiclient.cc b/src/test/libcephfs/multiclient.cc
index 78fdb9a..636e740 100644
--- a/src/test/libcephfs/multiclient.cc
+++ b/src/test/libcephfs/multiclient.cc
@@ -25,13 +25,13 @@
 TEST(LibCephFS, MulticlientSimple) {
   struct ceph_mount_info *ca, *cb;
   ASSERT_EQ(ceph_create(&ca, NULL), 0);
-  ASSERT_EQ(0, ceph_conf_parse_env(ca, NULL));
   ASSERT_EQ(ceph_conf_read_file(ca, NULL), 0);
+  ASSERT_EQ(0, ceph_conf_parse_env(ca, NULL));
   ASSERT_EQ(ceph_mount(ca, NULL), 0);
 
   ASSERT_EQ(ceph_create(&cb, NULL), 0);
-  ASSERT_EQ(0, ceph_conf_parse_env(cb, NULL));
   ASSERT_EQ(ceph_conf_read_file(cb, NULL), 0);
+  ASSERT_EQ(0, ceph_conf_parse_env(cb, NULL));
   ASSERT_EQ(ceph_mount(cb, NULL), 0);
 
   char name[20];
@@ -65,13 +65,13 @@ TEST(LibCephFS, MulticlientSimple) {
 TEST(LibCephFS, MulticlientHoleEOF) {
   struct ceph_mount_info *ca, *cb;
   ASSERT_EQ(ceph_create(&ca, NULL), 0);
-  ASSERT_EQ(0, ceph_conf_parse_env(ca, NULL));
   ASSERT_EQ(ceph_conf_read_file(ca, NULL), 0);
+  ASSERT_EQ(0, ceph_conf_parse_env(ca, NULL));
   ASSERT_EQ(ceph_mount(ca, NULL), 0);
 
   ASSERT_EQ(ceph_create(&cb, NULL), 0);
-  ASSERT_EQ(0, ceph_conf_parse_env(cb, NULL));
   ASSERT_EQ(ceph_conf_read_file(cb, NULL), 0);
+  ASSERT_EQ(0, ceph_conf_parse_env(cb, NULL));
   ASSERT_EQ(ceph_mount(cb, NULL), 0);
 
   char name[20];
diff --git a/src/test/libcephfs/test.cc b/src/test/libcephfs/test.cc
index 5f8e343..1c203ec 100644
--- a/src/test/libcephfs/test.cc
+++ b/src/test/libcephfs/test.cc
@@ -32,8 +32,8 @@ TEST(LibCephFS, OpenEmptyComponent) {
   pid_t mypid = getpid();
   struct ceph_mount_info *cmount;
   ASSERT_EQ(0, ceph_create(&cmount, NULL));
-  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(0, ceph_conf_read_file(cmount, NULL));
+  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(0, ceph_mount(cmount, "/"));
 
   char c_dir[1024];
@@ -54,8 +54,8 @@ TEST(LibCephFS, OpenEmptyComponent) {
   ceph_shutdown(cmount);
 
   ASSERT_EQ(0, ceph_create(&cmount, NULL));
-  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(0, ceph_conf_read_file(cmount, NULL));
+  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
 
   ASSERT_EQ(0, ceph_mount(cmount, "/"));
 
@@ -71,8 +71,8 @@ TEST(LibCephFS, MountNonExist) {
   struct ceph_mount_info *cmount;
 
   ASSERT_EQ(0, ceph_create(&cmount, NULL));
-  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(0, ceph_conf_read_file(cmount, NULL));
+  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_NE(0, ceph_mount(cmount, "/non-exist"));
   ceph_shutdown(cmount);
 }
@@ -82,8 +82,8 @@ TEST(LibCephFS, MountDouble) {
   struct ceph_mount_info *cmount;
 
   ASSERT_EQ(0, ceph_create(&cmount, NULL));
-  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(0, ceph_conf_read_file(cmount, NULL));
+  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(0, ceph_mount(cmount, "/"));
   ASSERT_EQ(-EISCONN, ceph_mount(cmount, "/"));
   ceph_shutdown(cmount);
@@ -94,8 +94,8 @@ TEST(LibCephFS, MountRemount) {
   struct ceph_mount_info *cmount;
 
   ASSERT_EQ(0, ceph_create(&cmount, NULL));
-  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(0, ceph_conf_read_file(cmount, NULL));
+  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
 
   CephContext *cct = ceph_get_mount_context(cmount);
   ASSERT_EQ(0, ceph_mount(cmount, "/"));
@@ -112,8 +112,8 @@ TEST(LibCephFS, UnmountUnmounted) {
   struct ceph_mount_info *cmount;
 
   ASSERT_EQ(0, ceph_create(&cmount, NULL));
-  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(0, ceph_conf_read_file(cmount, NULL));
+  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(-ENOTCONN, ceph_unmount(cmount));
   ceph_shutdown(cmount);
 }
@@ -123,8 +123,8 @@ TEST(LibCephFS, ReleaseUnmounted) {
   struct ceph_mount_info *cmount;
 
   ASSERT_EQ(0, ceph_create(&cmount, NULL));
-  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(0, ceph_conf_read_file(cmount, NULL));
+  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(0, ceph_release(cmount));
 }
 
@@ -133,8 +133,8 @@ TEST(LibCephFS, ReleaseMounted) {
   struct ceph_mount_info *cmount;
 
   ASSERT_EQ(0, ceph_create(&cmount, NULL));
-  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(0, ceph_conf_read_file(cmount, NULL));
+  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(0, ceph_mount(cmount, "/"));
   ASSERT_EQ(-EISCONN, ceph_release(cmount));
   ASSERT_EQ(0, ceph_unmount(cmount));
@@ -146,8 +146,8 @@ TEST(LibCephFS, UnmountRelease) {
   struct ceph_mount_info *cmount;
 
   ASSERT_EQ(0, ceph_create(&cmount, NULL));
-  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(0, ceph_conf_read_file(cmount, NULL));
+  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(0, ceph_mount(cmount, "/"));
   ASSERT_EQ(0, ceph_unmount(cmount));
   ASSERT_EQ(0, ceph_release(cmount));
@@ -156,14 +156,14 @@ TEST(LibCephFS, UnmountRelease) {
 TEST(LibCephFS, Mount) {
   struct ceph_mount_info *cmount;
   ASSERT_EQ(ceph_create(&cmount, NULL), 0);
-  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0);
+  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(ceph_mount(cmount, NULL), 0);
   ceph_shutdown(cmount);
 
   ASSERT_EQ(ceph_create(&cmount, NULL), 0);
-  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0);
+  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(ceph_mount(cmount, NULL), 0);
   ceph_shutdown(cmount);
 }
@@ -171,8 +171,8 @@ TEST(LibCephFS, Mount) {
 TEST(LibCephFS, OpenLayout) {
   struct ceph_mount_info *cmount;
   ASSERT_EQ(ceph_create(&cmount, NULL), 0);
-  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0);
+  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(ceph_mount(cmount, NULL), 0);
 
   /* valid layout */
@@ -182,8 +182,7 @@ TEST(LibCephFS, OpenLayout) {
   ASSERT_GT(fd, 0);
   char poolname[80];
   ASSERT_LT(0, ceph_get_file_pool_name(cmount, fd, poolname, sizeof(poolname)));
-  ASSERT_EQ(4, ceph_get_file_pool_name(cmount, fd, poolname, 0));
-  ASSERT_EQ(0, strcmp("data", poolname));
+  ASSERT_LT(0, ceph_get_file_pool_name(cmount, fd, poolname, 0));
 
   /* on already-written file (ENOTEMPTY) */
   ceph_write(cmount, fd, "hello world", 11, 0);
@@ -202,10 +201,8 @@ TEST(LibCephFS, OpenLayout) {
 
   /* with data pool */
   sprintf(test_layout_file, "test_layout_%d_d", getpid());
-  fd = ceph_open_layout(cmount, test_layout_file, O_CREAT, 0666, (1<<20), 7, (1<<20), "data");
+  fd = ceph_open_layout(cmount, test_layout_file, O_CREAT, 0666, (1<<20), 7, (1<<20), poolname);
   ASSERT_GT(fd, 0);
-  ASSERT_EQ(4, ceph_get_file_pool_name(cmount, fd, poolname, sizeof(poolname)));
-  ASSERT_EQ(0, strcmp("data", poolname));
   ceph_close(cmount, fd);
 
   /* with metadata pool (invalid) */
@@ -227,8 +224,8 @@ TEST(LibCephFS, DirLs) {
 
   struct ceph_mount_info *cmount;
   ASSERT_EQ(ceph_create(&cmount, NULL), 0);
-  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0);
+  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(ceph_mount(cmount, "/"), 0);
 
   struct ceph_dir_result *ls_dir = NULL;
@@ -386,8 +383,8 @@ TEST(LibCephFS, DirLs) {
 TEST(LibCephFS, ManyNestedDirs) {
   struct ceph_mount_info *cmount;
   ASSERT_EQ(ceph_create(&cmount, NULL), 0);
-  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0);
+  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(ceph_mount(cmount, NULL), 0);
 
   const char *many_path = "a/a/a/a/a/a/a/a/a/a/a/a/a/a/a/a/a/a/a/a/a/a/a/a/a/a/a/a/a/a/a/a/a/a/a/a/a/a/a/a/a/a";
@@ -431,8 +428,8 @@ TEST(LibCephFS, ManyNestedDirs) {
 TEST(LibCephFS, Xattrs) {
   struct ceph_mount_info *cmount;
   ASSERT_EQ(ceph_create(&cmount, NULL), 0);
-  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0);
+  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(ceph_mount(cmount, NULL), 0);
 
   char test_xattr_file[256];
@@ -494,8 +491,8 @@ TEST(LibCephFS, Xattrs) {
 TEST(LibCephFS, Xattrs_ll) {
   struct ceph_mount_info *cmount;
   ASSERT_EQ(ceph_create(&cmount, NULL), 0);
-  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0);
+  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(ceph_mount(cmount, NULL), 0);
 
   char test_xattr_file[256];
@@ -532,8 +529,8 @@ TEST(LibCephFS, Xattrs_ll) {
 TEST(LibCephFS, LstatSlashdot) {
   struct ceph_mount_info *cmount;
   ASSERT_EQ(ceph_create(&cmount, NULL), 0);
-  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0);
+  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(ceph_mount(cmount, NULL), 0);
 
   struct stat stbuf;
@@ -547,8 +544,8 @@ TEST(LibCephFS, DoubleChmod) {
 
   struct ceph_mount_info *cmount;
   ASSERT_EQ(ceph_create(&cmount, NULL), 0);
-  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0);
+  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(ceph_mount(cmount, NULL), 0);
 
   char test_file[256];
@@ -602,8 +599,8 @@ TEST(LibCephFS, DoubleChmod) {
 TEST(LibCephFS, Fchmod) {
   struct ceph_mount_info *cmount;
   ASSERT_EQ(ceph_create(&cmount, NULL), 0);
-  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0);
+  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(ceph_mount(cmount, NULL), 0);
 
   char test_file[256];
@@ -646,8 +643,8 @@ TEST(LibCephFS, Fchmod) {
 TEST(LibCephFS, Fchown) {
   struct ceph_mount_info *cmount;
   ASSERT_EQ(ceph_create(&cmount, NULL), 0);
-  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0);
+  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(ceph_mount(cmount, NULL), 0);
 
   char test_file[256];
@@ -675,8 +672,8 @@ TEST(LibCephFS, FlagO_PATH) {
   struct ceph_mount_info *cmount;
 
   ASSERT_EQ(0, ceph_create(&cmount, NULL));
-  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(0, ceph_conf_read_file(cmount, NULL));
+  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(0, ceph_mount(cmount, NULL));
 
   char test_file[PATH_MAX];
@@ -717,8 +714,8 @@ TEST(LibCephFS, FlagO_PATH) {
 TEST(LibCephFS, Symlinks) {
   struct ceph_mount_info *cmount;
   ASSERT_EQ(ceph_create(&cmount, NULL), 0);
-  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0);
+  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(ceph_mount(cmount, NULL), 0);
 
   char test_file[256];
@@ -776,8 +773,8 @@ TEST(LibCephFS, Symlinks) {
 TEST(LibCephFS, DirSyms) {
   struct ceph_mount_info *cmount;
   ASSERT_EQ(ceph_create(&cmount, NULL), 0);
-  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0);
+  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(ceph_mount(cmount, NULL), 0);
 
   char test_dir1[256];
@@ -808,8 +805,8 @@ TEST(LibCephFS, DirSyms) {
 TEST(LibCephFS, LoopSyms) {
   struct ceph_mount_info *cmount;
   ASSERT_EQ(ceph_create(&cmount, NULL), 0);
-  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0);
+  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(ceph_mount(cmount, NULL), 0);
 
   char test_dir1[256];
@@ -852,8 +849,8 @@ TEST(LibCephFS, HardlinkNoOriginal) {
 
   struct ceph_mount_info *cmount;
   ASSERT_EQ(ceph_create(&cmount, NULL), 0);
-  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0);
+  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(ceph_mount(cmount, NULL), 0);
 
   char dir[256];
@@ -877,8 +874,8 @@ TEST(LibCephFS, HardlinkNoOriginal) {
 
   // now cleanup
   ASSERT_EQ(ceph_create(&cmount, NULL), 0);
-  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0);
+  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(ceph_mount(cmount, NULL), 0);
   ASSERT_EQ(ceph_chdir(cmount, dir), 0);
   ASSERT_EQ(ceph_unlink(cmount, "hardl1"), 0);
@@ -890,8 +887,8 @@ TEST(LibCephFS, HardlinkNoOriginal) {
 TEST(LibCephFS, BadArgument) {
   struct ceph_mount_info *cmount;
   ASSERT_EQ(ceph_create(&cmount, NULL), 0);
-  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0);
+  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(ceph_mount(cmount, NULL), 0);
 
   int fd = ceph_open(cmount, "test_file", O_CREAT|O_RDWR, 0666);
@@ -908,8 +905,8 @@ TEST(LibCephFS, BadArgument) {
 TEST(LibCephFS, BadFileDesc) {
   struct ceph_mount_info *cmount;
   ASSERT_EQ(ceph_create(&cmount, NULL), 0);
-  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0);
+  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(ceph_mount(cmount, NULL), 0);
 
   ASSERT_EQ(ceph_fchmod(cmount, -1, 0655), -EBADF);
@@ -939,8 +936,8 @@ TEST(LibCephFS, BadFileDesc) {
 TEST(LibCephFS, ReadEmptyFile) {
   struct ceph_mount_info *cmount;
   ASSERT_EQ(ceph_create(&cmount, NULL), 0);
-  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0);
+  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(ceph_mount(cmount, NULL), 0);
 
   // test the read_sync path in the client for zero files
@@ -968,8 +965,8 @@ TEST(LibCephFS, ReadEmptyFile) {
 TEST(LibCephFS, PreadvPwritev) {
   struct ceph_mount_info *cmount;
   ASSERT_EQ(ceph_create(&cmount, NULL), 0);
-  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0);
+  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(ceph_mount(cmount, NULL), 0);
 
   int mypid = getpid();
@@ -1006,8 +1003,8 @@ TEST(LibCephFS, PreadvPwritev) {
 TEST(LibCephFS, StripeUnitGran) {
   struct ceph_mount_info *cmount;
   ASSERT_EQ(ceph_create(&cmount, NULL), 0);
-  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0);
+  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(ceph_mount(cmount, NULL), 0);
   ASSERT_GT(ceph_get_stripe_unit_granularity(cmount), 0);
   ceph_shutdown(cmount);
@@ -1016,8 +1013,8 @@ TEST(LibCephFS, StripeUnitGran) {
 TEST(LibCephFS, Rename) {
   struct ceph_mount_info *cmount;
   ASSERT_EQ(ceph_create(&cmount, NULL), 0);
-  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0);
+  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(ceph_mount(cmount, NULL), 0);
 
   int mypid = getpid();
@@ -1051,8 +1048,8 @@ TEST(LibCephFS, Rename) {
 TEST(LibCephFS, UseUnmounted) {
   struct ceph_mount_info *cmount;
   ASSERT_EQ(ceph_create(&cmount, NULL), 0);
-  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0);
+  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
 
   struct statvfs stvfs;
   EXPECT_EQ(-ENOTCONN, ceph_statfs(cmount, "/", &stvfs));
@@ -1132,12 +1129,14 @@ TEST(LibCephFS, UseUnmounted) {
 TEST(LibCephFS, GetPoolId) {
   struct ceph_mount_info *cmount;
   ASSERT_EQ(ceph_create(&cmount, NULL), 0);
-  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0);
+  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(ceph_mount(cmount, NULL), 0);
 
-  ASSERT_GE(ceph_get_pool_id(cmount, "data"), 0);
-  ASSERT_GE(ceph_get_pool_id(cmount, "metadata"), 0);
+  char name[80];
+  memset(name, 0, sizeof(name));
+  ASSERT_LE(0, ceph_get_path_pool_name(cmount, "/", name, sizeof(name)));
+  ASSERT_GE(ceph_get_pool_id(cmount, name), 0);
   ASSERT_EQ(ceph_get_pool_id(cmount, "weflkjwelfjwlkejf"), -ENOENT);
 
   ceph_shutdown(cmount);
@@ -1146,15 +1145,18 @@ TEST(LibCephFS, GetPoolId) {
 TEST(LibCephFS, GetPoolReplication) {
   struct ceph_mount_info *cmount;
   ASSERT_EQ(ceph_create(&cmount, NULL), 0);
-  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0);
+  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(ceph_mount(cmount, NULL), 0);
 
   /* negative pools */
   ASSERT_EQ(ceph_get_pool_replication(cmount, -10), -ENOENT);
 
   /* valid pool */
-  int pool_id = ceph_get_pool_id(cmount, "data");
+  int pool_id;
+  int stripe_unit, stripe_count, object_size;
+  ASSERT_EQ(0, ceph_get_path_layout(cmount, "/", &stripe_unit, &stripe_count,
+				    &object_size, &pool_id));
   ASSERT_GE(pool_id, 0);
   ASSERT_GT(ceph_get_pool_replication(cmount, pool_id), 0);
 
@@ -1164,11 +1166,11 @@ TEST(LibCephFS, GetPoolReplication) {
 TEST(LibCephFS, GetExtentOsds) {
   struct ceph_mount_info *cmount;
   ASSERT_EQ(ceph_create(&cmount, NULL), 0);
-  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
 
   EXPECT_EQ(-ENOTCONN, ceph_get_file_extent_osds(cmount, 0, 0, NULL, NULL, 0));
 
   ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0);
+  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(ceph_mount(cmount, NULL), 0);
 
   int stripe_unit = (1<<18);
@@ -1215,11 +1217,11 @@ TEST(LibCephFS, GetExtentOsds) {
 TEST(LibCephFS, GetOsdCrushLocation) {
   struct ceph_mount_info *cmount;
   ASSERT_EQ(ceph_create(&cmount, NULL), 0);
-  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
 
   EXPECT_EQ(-ENOTCONN, ceph_get_osd_crush_location(cmount, 0, NULL, 0));
 
   ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0);
+  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(ceph_mount(cmount, NULL), 0);
 
   ASSERT_EQ(ceph_get_osd_crush_location(cmount, 0, NULL, 1), -EINVAL);
@@ -1266,11 +1268,11 @@ TEST(LibCephFS, GetOsdCrushLocation) {
 TEST(LibCephFS, GetOsdAddr) {
   struct ceph_mount_info *cmount;
   ASSERT_EQ(ceph_create(&cmount, NULL), 0);
-  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
 
   EXPECT_EQ(-ENOTCONN, ceph_get_osd_addr(cmount, 0, NULL));
 
   ASSERT_EQ(ceph_conf_read_file(cmount, NULL), 0);
+  ASSERT_EQ(0, ceph_conf_parse_env(cmount, NULL));
   ASSERT_EQ(ceph_mount(cmount, NULL), 0);
 
   ASSERT_EQ(-EINVAL, ceph_get_osd_addr(cmount, 0, NULL));
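
The reordering repeated across these hunks is deliberate: later configuration sources override earlier ones, so parsing CEPH_ARGS from the environment after reading ceph.conf lets a test run override the file instead of being clobbered by it. The pool-name changes drop the assumption that pools named "data" and "metadata" exist (newer clusters no longer create them by default) and resolve the root directory's actual data pool instead. A minimal sketch of the resulting initialization sequence, assuming a reachable cluster and default config paths (error handling trimmed):

    #include <cephfs/libcephfs.h>
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
      struct ceph_mount_info *cmount;
      if (ceph_create(&cmount, NULL) != 0)
        return 1;
      ceph_conf_read_file(cmount, NULL);   /* defaults from ceph.conf first */
      ceph_conf_parse_env(cmount, NULL);   /* then CEPH_ARGS, so env wins   */
      if (ceph_mount(cmount, NULL) != 0)
        return 1;

      /* resolve the real data pool instead of hard-coding "data" */
      char name[80];
      memset(name, 0, sizeof(name));
      if (ceph_get_path_pool_name(cmount, "/", name, sizeof(name)) >= 0)
        printf("root data pool: %s (id %lld)\n", name,
               (long long)ceph_get_pool_id(cmount, name));

      ceph_shutdown(cmount);
      return 0;
    }
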
diff --git a/src/test/librados/aio.cc b/src/test/librados/aio.cc
index 6971715..cbae877 100644
--- a/src/test/librados/aio.cc
+++ b/src/test/librados/aio.cc
@@ -41,7 +41,7 @@ public:
   std::string init()
   {
     int ret;
-    if (SEM_FAILED == (m_sem = sem_open("test_aio_sem", O_CREAT, 0644, 0))) {
+    if (SEM_FAILED == (m_sem = sem_open("/test_aio_sem", O_CREAT, 0644, 0))) {
       int err = errno;
       ostringstream oss;
       oss << "sem_open failed: " << cpp_strerror(err);
@@ -98,7 +98,7 @@ public:
   std::string init()
   {
     int ret;
-    if (SEM_FAILED == (m_sem = sem_open("test_aio_sem", O_CREAT, 0644, 0))) {
+    if (SEM_FAILED == (m_sem = sem_open("/test_aio_sem", O_CREAT, 0644, 0))) {
       int err = errno;
       ostringstream oss;
       oss << "sem_open failed: " << cpp_strerror(err);
@@ -1755,7 +1755,7 @@ public:
   std::string init()
   {
     int ret;
-    if (SEM_FAILED == (m_sem = sem_open("test_aio_sem", O_CREAT, 0644, 0))) {
+    if (SEM_FAILED == (m_sem = sem_open("/test_aio_sem", O_CREAT, 0644, 0))) {
       int err = errno;
       ostringstream oss;
       oss << "sem_open failed: " << cpp_strerror(err);
@@ -1812,7 +1812,7 @@ public:
   std::string init()
   {
     int ret;
-    if (SEM_FAILED == (m_sem = sem_open("test_aio_sem", O_CREAT, 0644, 0))) {
+    if (SEM_FAILED == (m_sem = sem_open("/test_aio_sem", O_CREAT, 0644, 0))) {
       int err = errno;
       ostringstream oss;
       oss << "sem_open failed: " << cpp_strerror(err);
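
The sem_open() changes are a portability fix, not a rename: POSIX requires a portable named-semaphore name to begin with a slash (and contain no others), and some platforms reject names without one. A standalone sketch of the corrected pattern; the name "/demo_sem" is illustrative:

    #include <semaphore.h>
    #include <fcntl.h>
    #include <stdio.h>

    int main(void)
    {
      /* leading '/' makes the name portable; no other slashes allowed */
      sem_t *sem = sem_open("/demo_sem", O_CREAT, 0644, 0);
      if (sem == SEM_FAILED) {
        perror("sem_open");
        return 1;
      }
      sem_post(sem);
      sem_wait(sem);
      sem_close(sem);
      sem_unlink("/demo_sem");  /* remove the name once all users close it */
      return 0;
    }
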
diff --git a/src/test/librados/test.cc b/src/test/librados/test.cc
index aac053a..48fbf96 100644
--- a/src/test/librados/test.cc
+++ b/src/test/librados/test.cc
@@ -45,25 +45,42 @@ std::string create_one_pool(const std::string &pool_name, rados_t *cluster)
   return "";
 }
 
+int destroy_ec_profile(rados_t *cluster)
+{
+    char *cmd[2];
+    cmd[0] = (char *)"{\"prefix\": \"osd erasure-code-profile rm\", \"name\": \"testprofile\"}";
+    cmd[1] = NULL;
+    return rados_mon_command(*cluster, (const char **)cmd, 1, "", 0, NULL, 0, NULL, 0);
+}
+
 std::string create_one_ec_pool(const std::string &pool_name, rados_t *cluster)
 {
   std::string err = connect_cluster(cluster);
   if (err.length())
     return err;
 
+  int ret = destroy_ec_profile(cluster);
+  if (ret) {
+    rados_shutdown(*cluster);
+    std::ostringstream oss;
+    oss << "rados_mon_command osd erasure-code-profile rm name:testprofile failed with error " << ret;
+    return oss.str();
+  }
+
   char *cmd[2];
-
   cmd[1] = NULL;
 
-  cmd[0] = (char *)"{\"prefix\": \"osd erasure-code-profile set\", \"name\": \"testprofile\", \"profile\": [ \"k=2\", \"m=1\", \"ruleset-failure-domain=osd\"]}";
-  int ret = rados_mon_command(*cluster, (const char **)cmd, 1, "", 0, NULL, NULL, NULL, NULL);
+  std::string profile_create = "{\"prefix\": \"osd erasure-code-profile set\", \"name\": \"testprofile\", \"profile\": [ \"k=2\", \"m=1\", \"ruleset-failure-domain=osd\"]}";
+  cmd[0] = (char *)profile_create.c_str();
+  ret = rados_mon_command(*cluster, (const char **)cmd, 1, "", 0, NULL, 0, NULL, 0);
   if (ret) {
-    rados_shutdown(*cluster);
     std::ostringstream oss;
+
+    rados_shutdown(*cluster);
     oss << "rados_mon_command erasure-code-profile set name:testprofile failed with error " << ret;
     return oss.str();
   }
-    
+
   std::string cmdstr = "{\"prefix\": \"osd pool create\", \"pool\": \"" +
      pool_name + "\", \"pool_type\":\"erasure\", \"pg_num\":8, \"pgp_num\":8, \"erasure_code_profile\":\"testprofile\"}";
   cmd[0] = (char *)cmdstr.c_str();
@@ -71,13 +88,12 @@ std::string create_one_ec_pool(const std::string &pool_name, rados_t *cluster)
   if (ret) {
     std::ostringstream oss;
 
-    cmd[0] = (char *)"{\"prefix\": \"osd erasure-code-profile rm\", \"name\": \"testprofile\"}";
-    int ret2 = rados_mon_command(*cluster, (const char **)cmd, 1, "", 0, NULL, 0, NULL, 0);
+    int ret2 = destroy_ec_profile(cluster);
     if (ret2)
       oss << "rados_mon_command osd erasure-code-profile rm name:testprofile failed with error " << ret2 << std::endl;
 
     rados_shutdown(*cluster);
-    oss << "rados_mon_command erasure-code-profile set name:testprofile failed with error " << ret;
+    oss << "rados_mon_command osd pool create failed with error " << ret;
     return oss.str();
   }
 
@@ -100,14 +116,29 @@ std::string create_one_pool_pp(const std::string &pool_name, Rados &cluster)
   return "";
 }
 
+int destroy_ec_profile_pp(Rados &cluster)
+{
+  bufferlist inbl;
+  return cluster.mon_command("{\"prefix\": \"osd erasure-code-profile rm\", \"name\": \"testprofile\"}",
+                             inbl, NULL, NULL);
+}
+
 std::string create_one_ec_pool_pp(const std::string &pool_name, Rados &cluster)
 {
   std::string err = connect_cluster_pp(cluster);
   if (err.length())
     return err;
 
+  int ret = destroy_ec_profile_pp(cluster);
+  if (ret) {
+    cluster.shutdown();
+    std::ostringstream oss;
+    oss << "mon_command osd erasure-code-profile rm name:testprofile failed with error " << ret;
+    return oss.str();
+  }
+
   bufferlist inbl;
-  int ret = cluster.mon_command(
+  ret = cluster.mon_command(
     "{\"prefix\": \"osd erasure-code-profile set\", \"name\": \"testprofile\", \"profile\": [ \"k=2\", \"m=1\", \"ruleset-failure-domain=osd\"]}",
     inbl, NULL, NULL);
   if (ret) {
@@ -123,9 +154,7 @@ std::string create_one_ec_pool_pp(const std::string &pool_name, Rados &cluster)
   if (ret) {
     std::ostringstream oss;
     bufferlist inbl;
-    int ret2 = cluster.mon_command(
-      "{\"prefix\": \"osd erasure-code-profile rm\", \"name\": \"testprofile\"}",
-      inbl, NULL, NULL);
+    int ret2 = destroy_ec_profile_pp(cluster);
     if (ret2)
       oss << "mon_command osd erasure-code-profile rm name:testprofile failed with error " << ret2 << std::endl;
 
@@ -213,12 +242,7 @@ int destroy_one_ec_pool(const std::string &pool_name, rados_t *cluster)
 {
   int ret = rados_pool_delete(*cluster, pool_name.c_str());
   if (ret == 0) {
-    char *cmd[2];
-
-    cmd[1] = NULL;
-
-    cmd[0] = (char *)"{\"prefix\": \"osd erasure-code-profile rm\", \"name\": \"testprofile\"}";
-    int ret2 = rados_mon_command(*cluster, (const char **)cmd, 1, "", 0, NULL, 0, NULL, 0);
+    int ret2 = destroy_ec_profile(cluster);
     if (ret2) {
       rados_shutdown(*cluster);
       return ret2;
@@ -245,9 +269,7 @@ int destroy_one_ec_pool_pp(const std::string &pool_name, Rados &cluster)
   int ret = cluster.pool_delete(pool_name.c_str());
   bufferlist inbl;
   if (ret == 0) {
-    int ret2 = cluster.mon_command(
-      "{\"prefix\": \"osd erasure-code-profile rm\", \"name\": \"testprofile\"}",
-      inbl, NULL, NULL);
+    int ret2 = destroy_ec_profile_pp(cluster);
     if (ret2) {
       cluster.shutdown();
       return ret2;
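
Beyond deduplicating the profile-removal JSON into destroy_ec_profile()/destroy_ec_profile_pp(), setup now removes any stale "testprofile" left behind by a crashed run before creating a fresh one, so pool creation cannot fail on a leftover profile. Monitor commands in both APIs are JSON objects: a "prefix" naming the command plus named arguments. A minimal sketch through the C++ API (connection details assumed to come from the usual config sources):

    #include <rados/librados.hpp>
    #include <iostream>

    int main()
    {
      librados::Rados cluster;
      cluster.init(NULL);            // client id from defaults
      cluster.conf_read_file(NULL);
      cluster.conf_parse_env(NULL);
      if (cluster.connect() < 0)
        return 1;

      // every mon command is JSON: {"prefix": <command>, <named args>...}
      librados::bufferlist inbl, outbl;
      std::string outs;
      int r = cluster.mon_command(
        "{\"prefix\": \"osd erasure-code-profile rm\", \"name\": \"testprofile\"}",
        inbl, &outbl, &outs);
      if (r < 0)
        std::cerr << "mon_command failed: " << r << " " << outs << std::endl;

      cluster.shutdown();
      return 0;
    }
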
diff --git a/src/test/librados/tier.cc b/src/test/librados/tier.cc
index eb2db6b..12ccfc2 100644
--- a/src/test/librados/tier.cc
+++ b/src/test/librados/tier.cc
@@ -769,6 +769,65 @@ TEST_F(LibRadosTwoPoolsPP, Evict) {
     ASSERT_TRUE(it == cache_ioctx.nobjects_end());
   }
 
+  // pin
+  {
+    ObjectWriteOperation op;
+    op.cache_pin();
+    librados::AioCompletion *completion = cluster.aio_create_completion();
+    ASSERT_EQ(0, cache_ioctx.aio_operate("foo", completion, &op));
+    completion->wait_for_safe();
+    ASSERT_EQ(0, completion->get_return_value());
+    completion->release();
+  }
+
+  // evicting the pinned object must fail with -EPERM
+  {
+    ObjectReadOperation op;
+    op.cache_evict();
+    librados::AioCompletion *completion = cluster.aio_create_completion();
+    ASSERT_EQ(0, cache_ioctx.aio_operate("foo", completion, &op,
+					 librados::OPERATION_IGNORE_CACHE,
+					 NULL));
+    completion->wait_for_safe();
+    ASSERT_EQ(-EPERM, completion->get_return_value());
+    completion->release();
+  }
+
+  // unpin
+  {
+    ObjectWriteOperation op;
+    op.cache_unpin();
+    librados::AioCompletion *completion = cluster.aio_create_completion();
+    ASSERT_EQ(0, cache_ioctx.aio_operate("foo", completion, &op));
+    completion->wait_for_safe();
+    ASSERT_EQ(0, completion->get_return_value());
+    completion->release();
+  }
+
+  // flush
+  {
+    ObjectReadOperation op;
+    op.cache_flush();
+    librados::AioCompletion *completion = cluster.aio_create_completion();
+    ASSERT_EQ(0, cache_ioctx.aio_operate(
+      "foo", completion, &op,
+      librados::OPERATION_IGNORE_OVERLAY, NULL));
+    completion->wait_for_safe();
+    ASSERT_EQ(0, completion->get_return_value());
+    completion->release();
+  }
+
+  // verify clean
+  {
+    bool dirty = false;
+    int r = -1;
+    ObjectReadOperation op;
+    op.is_dirty(&dirty, &r);
+    ASSERT_EQ(0, cache_ioctx.operate("foo", &op, NULL));
+    ASSERT_FALSE(dirty);
+    ASSERT_EQ(0, r);
+  }
+
   // evict
   {
     ObjectReadOperation op;
@@ -1174,6 +1233,42 @@ TEST_F(LibRadosTwoPoolsPP, TryFlush) {
     ASSERT_EQ(0, r);
   }
 
+  // pin
+  {
+    ObjectWriteOperation op;
+    op.cache_pin();
+    librados::AioCompletion *completion = cluster.aio_create_completion();
+    ASSERT_EQ(0, cache_ioctx.aio_operate("foo", completion, &op));
+    completion->wait_for_safe();
+    ASSERT_EQ(0, completion->get_return_value());
+    completion->release();
+  }
+
+  // trying to flush the pinned object must fail with -EPERM
+  {
+    ObjectReadOperation op;
+    op.cache_try_flush();
+    librados::AioCompletion *completion = cluster.aio_create_completion();
+    ASSERT_EQ(0, cache_ioctx.aio_operate(
+      "foo", completion, &op,
+      librados::OPERATION_IGNORE_OVERLAY |
+      librados::OPERATION_SKIPRWLOCKS, NULL));
+    completion->wait_for_safe();
+    ASSERT_EQ(-EPERM, completion->get_return_value());
+    completion->release();
+  }
+
+  // unpin
+  {
+    ObjectWriteOperation op;
+    op.cache_unpin();
+    librados::AioCompletion *completion = cluster.aio_create_completion();
+    ASSERT_EQ(0, cache_ioctx.aio_operate("foo", completion, &op));
+    completion->wait_for_safe();
+    ASSERT_EQ(0, completion->get_return_value());
+    completion->release();
+  }
+
   // flush
   {
     ObjectReadOperation op;
@@ -1285,6 +1380,42 @@ TEST_F(LibRadosTwoPoolsPP, Flush) {
     user_version = cache_ioctx.get_last_version();
   }
 
+  // pin
+  {
+    ObjectWriteOperation op;
+    op.cache_pin();
+    librados::AioCompletion *completion = cluster.aio_create_completion();
+    ASSERT_EQ(0, cache_ioctx.aio_operate("foo", completion, &op));
+    completion->wait_for_safe();
+    ASSERT_EQ(0, completion->get_return_value());
+    completion->release();
+  }
+
+  // trying to flush the pinned object must fail with -EPERM
+  {
+    ObjectReadOperation op;
+    op.cache_try_flush();
+    librados::AioCompletion *completion = cluster.aio_create_completion();
+    ASSERT_EQ(0, cache_ioctx.aio_operate(
+      "foo", completion, &op,
+      librados::OPERATION_IGNORE_OVERLAY |
+      librados::OPERATION_SKIPRWLOCKS, NULL));
+    completion->wait_for_safe();
+    ASSERT_EQ(-EPERM, completion->get_return_value());
+    completion->release();
+  }
+
+  // unpin
+  {
+    ObjectWriteOperation op;
+    op.cache_unpin();
+    librados::AioCompletion *completion = cluster.aio_create_completion();
+    ASSERT_EQ(0, cache_ioctx.aio_operate("foo", completion, &op));
+    completion->wait_for_safe();
+    ASSERT_EQ(0, completion->get_return_value());
+    completion->release();
+  }
+
   // flush
   {
     ObjectReadOperation op;
@@ -2275,6 +2406,12 @@ TEST_F(LibRadosTwoPoolsPP, PromoteOn2ndRead) {
   ASSERT_EQ(0, cluster.mon_command(
     set_pool_str(cache_pool_name, "min_read_recency_for_promote", 1),
     inbl, NULL, NULL));
+  ASSERT_EQ(0, cluster.mon_command(
+    set_pool_str(cache_pool_name, "hit_set_grade_decay_rate", 20),
+    inbl, NULL, NULL));
+  ASSERT_EQ(0, cluster.mon_command(
+    set_pool_str(cache_pool_name, "hit_set_search_last_n", 1),
+    inbl, NULL, NULL));
 
   // wait for maps to settle
   cluster.wait_for_latest_osdmap();
@@ -2412,6 +2549,157 @@ TEST_F(LibRadosTwoPoolsPP, ProxyRead) {
   cluster.wait_for_latest_osdmap();
 }
 
+TEST_F(LibRadosTwoPoolsPP, CachePin) {
+  // create object
+  {
+    bufferlist bl;
+    bl.append("hi there");
+    ObjectWriteOperation op;
+    op.write_full(bl);
+    ASSERT_EQ(0, ioctx.operate("foo", &op));
+  }
+  {
+    bufferlist bl;
+    bl.append("hi there");
+    ObjectWriteOperation op;
+    op.write_full(bl);
+    ASSERT_EQ(0, ioctx.operate("bar", &op));
+  }
+  {
+    bufferlist bl;
+    bl.append("hi there");
+    ObjectWriteOperation op;
+    op.write_full(bl);
+    ASSERT_EQ(0, ioctx.operate("baz", &op));
+  }
+  {
+    bufferlist bl;
+    bl.append("hi there");
+    ObjectWriteOperation op;
+    op.write_full(bl);
+    ASSERT_EQ(0, ioctx.operate("bam", &op));
+  }
+
+  // configure cache
+  bufferlist inbl;
+  ASSERT_EQ(0, cluster.mon_command(
+    "{\"prefix\": \"osd tier add\", \"pool\": \"" + pool_name +
+    "\", \"tierpool\": \"" + cache_pool_name +
+    "\", \"force_nonempty\": \"--force-nonempty\" }",
+    inbl, NULL, NULL));
+  ASSERT_EQ(0, cluster.mon_command(
+    "{\"prefix\": \"osd tier set-overlay\", \"pool\": \"" + pool_name +
+    "\", \"overlaypool\": \"" + cache_pool_name + "\"}",
+    inbl, NULL, NULL));
+  ASSERT_EQ(0, cluster.mon_command(
+    "{\"prefix\": \"osd tier cache-mode\", \"pool\": \"" + cache_pool_name +
+    "\", \"mode\": \"writeback\"}",
+    inbl, NULL, NULL));
+
+  // wait for maps to settle
+  cluster.wait_for_latest_osdmap();
+
+  // read, trigger promote
+  {
+    bufferlist bl;
+    ASSERT_EQ(1, ioctx.read("foo", bl, 1, 0));
+    ASSERT_EQ(1, ioctx.read("bar", bl, 1, 0));
+    ASSERT_EQ(1, ioctx.read("baz", bl, 1, 0));
+    ASSERT_EQ(1, ioctx.read("bam", bl, 1, 0));
+  }
+
+  // verify the objects are present in the cache tier
+  {
+    NObjectIterator it = cache_ioctx.nobjects_begin();
+    ASSERT_TRUE(it != cache_ioctx.nobjects_end());
+    for (uint32_t i = 0; i < 4; i++) {
+      ASSERT_TRUE(it->get_oid() == string("foo") ||
+                  it->get_oid() == string("bar") ||
+                  it->get_oid() == string("baz") ||
+                  it->get_oid() == string("bam"));
+      ++it;
+    }
+    ASSERT_TRUE(it == cache_ioctx.nobjects_end());
+  }
+
+  // pin objects
+  {
+    ObjectWriteOperation op;
+    op.cache_pin();
+    librados::AioCompletion *completion = cluster.aio_create_completion();
+    ASSERT_EQ(0, cache_ioctx.aio_operate("foo", completion, &op));
+    completion->wait_for_safe();
+    ASSERT_EQ(0, completion->get_return_value());
+    completion->release();
+  }
+  {
+    ObjectWriteOperation op;
+    op.cache_pin();
+    librados::AioCompletion *completion = cluster.aio_create_completion();
+    ASSERT_EQ(0, cache_ioctx.aio_operate("baz", completion, &op));
+    completion->wait_for_safe();
+    ASSERT_EQ(0, completion->get_return_value());
+    completion->release();
+  }
+
+  // enable agent
+  ASSERT_EQ(0, cluster.mon_command(
+    set_pool_str(cache_pool_name, "hit_set_count", 2),
+    inbl, NULL, NULL));
+  ASSERT_EQ(0, cluster.mon_command(
+    set_pool_str(cache_pool_name, "hit_set_period", 600),
+    inbl, NULL, NULL));
+  ASSERT_EQ(0, cluster.mon_command(
+    set_pool_str(cache_pool_name, "hit_set_type", "bloom"),
+    inbl, NULL, NULL));
+  ASSERT_EQ(0, cluster.mon_command(
+    set_pool_str(cache_pool_name, "min_read_recency_for_promote", 1),
+    inbl, NULL, NULL));
+  ASSERT_EQ(0, cluster.mon_command(
+    set_pool_str(cache_pool_name, "target_max_objects", 1),
+    inbl, NULL, NULL));
+
+  sleep(10);
+
+  // Verify the pinned objects 'foo' and 'baz' are not flushed/evicted
+  uint32_t count = 0;
+  while (true) {
+    bufferlist bl;
+    ASSERT_EQ(1, ioctx.read("baz", bl, 1, 0));
+
+    count = 0;
+    NObjectIterator it = cache_ioctx.nobjects_begin();
+    while (it != cache_ioctx.nobjects_end()) {
+      ASSERT_TRUE(it->get_oid() == string("foo") ||
+                  it->get_oid() == string("bar") ||
+                  it->get_oid() == string("baz") ||
+                  it->get_oid() == string("bam"));
+      ++count;
+      ++it;
+    }
+    if (count == 2) {
+      // 'it' is nobjects_end() here and must not be dereferenced;
+      // re-scan to confirm only the pinned objects remain
+      for (NObjectIterator it2 = cache_ioctx.nobjects_begin();
+           it2 != cache_ioctx.nobjects_end(); ++it2) {
+        ASSERT_TRUE(it2->get_oid() == string("foo") ||
+                    it2->get_oid() == string("baz"));
+      }
+      break;
+    }
+
+    sleep(1);
+  }
+
+  // tear down tiers
+  ASSERT_EQ(0, cluster.mon_command(
+    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
+    "\"}",
+    inbl, NULL, NULL));
+  ASSERT_EQ(0, cluster.mon_command(
+    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
+    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+    inbl, NULL, NULL));
+
+  // wait for maps to settle before next test
+  cluster.wait_for_latest_osdmap();
+}
+
 class LibRadosTwoPoolsECPP : public RadosTestECPP
 {
 public:
@@ -2993,7 +3281,18 @@ TEST_F(LibRadosTwoPoolsECPP, Evict) {
     ASSERT_TRUE(it == cache_ioctx.nobjects_end());
   }
 
-  // evict
+  // pin
+  {
+    ObjectWriteOperation op;
+    op.cache_pin();
+    librados::AioCompletion *completion = cluster.aio_create_completion();
+    ASSERT_EQ(0, cache_ioctx.aio_operate("foo", completion, &op));
+    completion->wait_for_safe();
+    ASSERT_EQ(0, completion->get_return_value());
+    completion->release();
+  }
+
+  // evicting the pinned object must fail with -EPERM
   {
     ObjectReadOperation op;
     op.cache_evict();
@@ -3002,23 +3301,71 @@ TEST_F(LibRadosTwoPoolsECPP, Evict) {
 					 librados::OPERATION_IGNORE_CACHE,
 					 NULL));
     completion->wait_for_safe();
-    ASSERT_EQ(0, completion->get_return_value());
+    ASSERT_EQ(-EPERM, completion->get_return_value());
     completion->release();
   }
+
+  // unpin
   {
-    ObjectReadOperation op;
-    op.cache_evict();
+    ObjectWriteOperation op;
+    op.cache_unpin();
     librados::AioCompletion *completion = cluster.aio_create_completion();
-    ASSERT_EQ(0, cache_ioctx.aio_operate(
-      "foo", completion, &op,
-      librados::OPERATION_IGNORE_CACHE, NULL));
+    ASSERT_EQ(0, cache_ioctx.aio_operate("foo", completion, &op));
     completion->wait_for_safe();
     ASSERT_EQ(0, completion->get_return_value());
     completion->release();
   }
+
+  // flush
   {
     ObjectReadOperation op;
-    op.cache_evict();
+    op.cache_flush();
+    librados::AioCompletion *completion = cluster.aio_create_completion();
+    ASSERT_EQ(0, cache_ioctx.aio_operate(
+      "foo", completion, &op,
+      librados::OPERATION_IGNORE_OVERLAY, NULL));
+    completion->wait_for_safe();
+    ASSERT_EQ(0, completion->get_return_value());
+    completion->release();
+  }
+
+  // verify clean
+  {
+    bool dirty = false;
+    int r = -1;
+    ObjectReadOperation op;
+    op.is_dirty(&dirty, &r);
+    ASSERT_EQ(0, cache_ioctx.operate("foo", &op, NULL));
+    ASSERT_FALSE(dirty);
+    ASSERT_EQ(0, r);
+  }
+
+  // evict
+  {
+    ObjectReadOperation op;
+    op.cache_evict();
+    librados::AioCompletion *completion = cluster.aio_create_completion();
+    ASSERT_EQ(0, cache_ioctx.aio_operate("foo", completion, &op,
+					 librados::OPERATION_IGNORE_CACHE,
+					 NULL));
+    completion->wait_for_safe();
+    ASSERT_EQ(0, completion->get_return_value());
+    completion->release();
+  }
+  {
+    ObjectReadOperation op;
+    op.cache_evict();
+    librados::AioCompletion *completion = cluster.aio_create_completion();
+    ASSERT_EQ(0, cache_ioctx.aio_operate(
+      "foo", completion, &op,
+      librados::OPERATION_IGNORE_CACHE, NULL));
+    completion->wait_for_safe();
+    ASSERT_EQ(0, completion->get_return_value());
+    completion->release();
+  }
+  {
+    ObjectReadOperation op;
+    op.cache_evict();
     librados::AioCompletion *completion = cluster.aio_create_completion();
     ASSERT_EQ(0, cache_ioctx.aio_operate(
       "bar", completion, &op,
@@ -3322,6 +3669,42 @@ TEST_F(LibRadosTwoPoolsECPP, TryFlush) {
     ASSERT_EQ(0, r);
   }
 
+  // pin
+  {
+    ObjectWriteOperation op;
+    op.cache_pin();
+    librados::AioCompletion *completion = cluster.aio_create_completion();
+    ASSERT_EQ(0, cache_ioctx.aio_operate("foo", completion, &op));
+    completion->wait_for_safe();
+    ASSERT_EQ(0, completion->get_return_value());
+    completion->release();
+  }
+
+  // trying to flush the pinned object must fail with -EPERM
+  {
+    ObjectReadOperation op;
+    op.cache_try_flush();
+    librados::AioCompletion *completion = cluster.aio_create_completion();
+    ASSERT_EQ(0, cache_ioctx.aio_operate(
+      "foo", completion, &op,
+      librados::OPERATION_IGNORE_OVERLAY |
+      librados::OPERATION_SKIPRWLOCKS, NULL));
+    completion->wait_for_safe();
+    ASSERT_EQ(-EPERM, completion->get_return_value());
+    completion->release();
+  }
+
+  // unpin
+  {
+    ObjectWriteOperation op;
+    op.cache_unpin();
+    librados::AioCompletion *completion = cluster.aio_create_completion();
+    ASSERT_EQ(0, cache_ioctx.aio_operate("foo", completion, &op));
+    completion->wait_for_safe();
+    ASSERT_EQ(0, completion->get_return_value());
+    completion->release();
+  }
+
   // flush
   {
     ObjectReadOperation op;
@@ -3433,6 +3816,42 @@ TEST_F(LibRadosTwoPoolsECPP, Flush) {
     user_version = cache_ioctx.get_last_version();
   }
 
+  // pin
+  {
+    ObjectWriteOperation op;
+    op.cache_pin();
+    librados::AioCompletion *completion = cluster.aio_create_completion();
+    ASSERT_EQ(0, cache_ioctx.aio_operate("foo", completion, &op));
+    completion->wait_for_safe();
+    ASSERT_EQ(0, completion->get_return_value());
+    completion->release();
+  }
+
+  // trying to flush the pinned object must fail with -EPERM
+  {
+    ObjectReadOperation op;
+    op.cache_try_flush();
+    librados::AioCompletion *completion = cluster.aio_create_completion();
+    ASSERT_EQ(0, cache_ioctx.aio_operate(
+      "foo", completion, &op,
+      librados::OPERATION_IGNORE_OVERLAY |
+      librados::OPERATION_SKIPRWLOCKS, NULL));
+    completion->wait_for_safe();
+    ASSERT_EQ(-EPERM, completion->get_return_value());
+    completion->release();
+  }
+
+  // unpin
+  {
+    ObjectWriteOperation op;
+    op.cache_unpin();
+    librados::AioCompletion *completion = cluster.aio_create_completion();
+    ASSERT_EQ(0, cache_ioctx.aio_operate("foo", completion, &op));
+    completion->wait_for_safe();
+    ASSERT_EQ(0, completion->get_return_value());
+    completion->release();
+  }
+
   // flush
   {
     ObjectReadOperation op;
@@ -4309,12 +4728,12 @@ TEST_F(LibRadosTwoPoolsECPP, HitSetTrim) {
 
 TEST_F(LibRadosTwoPoolsECPP, PromoteOn2ndRead) {
   // create object
-  {
+  for (int i=0; i<20; ++i) {
     bufferlist bl;
     bl.append("hi there");
     ObjectWriteOperation op;
     op.write_full(bl);
-    ASSERT_EQ(0, ioctx.operate("foo", &op));
+    ASSERT_EQ(0, ioctx.operate("foo" + stringify(i), &op));
   }
 
   // configure cache
@@ -4346,44 +4765,73 @@ TEST_F(LibRadosTwoPoolsECPP, PromoteOn2ndRead) {
   ASSERT_EQ(0, cluster.mon_command(
     set_pool_str(cache_pool_name, "min_read_recency_for_promote", 1),
     inbl, NULL, NULL));
+  ASSERT_EQ(0, cluster.mon_command(
+    set_pool_str(cache_pool_name, "hit_set_grade_decay_rate", 20),
+    inbl, NULL, NULL));
+  ASSERT_EQ(0, cluster.mon_command(
+    set_pool_str(cache_pool_name, "hit_set_search_last_n", 1),
+    inbl, NULL, NULL));
 
   // wait for maps to settle
   cluster.wait_for_latest_osdmap();
 
-  // 1st read, don't trigger a promote
-  utime_t start = ceph_clock_now(NULL);
-  {
-    bufferlist bl;
-    ASSERT_EQ(1, ioctx.read("foo", bl, 1, 0));
-  }
-  utime_t end = ceph_clock_now(NULL);
-  float dur = end - start;
-  cout << "duration " << dur << std::endl;
+  int fake = 0;  // set this to non-zero to test spurious promotion,
+		 // e.g. from thrashing
+  int attempt = 0;
+  string obj;
+  while (true) {
+    // 1st read, don't trigger a promote
+    obj = "foo" + stringify(attempt);
+    cout << obj << std::endl;
+    {
+      bufferlist bl;
+      ASSERT_EQ(1, ioctx.read(obj.c_str(), bl, 1, 0));
+      if (--fake >= 0) {
+	sleep(1);
+	ASSERT_EQ(1, ioctx.read(obj.c_str(), bl, 1, 0));
+	sleep(1);
+      }
+    }
 
-  // verify the object is NOT present in the cache tier
-  {
-    NObjectIterator it = cache_ioctx.nobjects_begin();
-    if (it != cache_ioctx.nobjects_end()) {
-      if (dur > 1.0) {
-	cout << " object got promoted, but read was slow, ignoring" << std::endl;
-      } else {
-	ASSERT_TRUE(it == cache_ioctx.nobjects_end());
+    // verify the object is NOT present in the cache tier
+    {
+      bool found = false;
+      NObjectIterator it = cache_ioctx.nobjects_begin();
+      while (it != cache_ioctx.nobjects_end()) {
+	cout << " see " << it->get_oid() << std::endl;
+	if (it->get_oid() == string(obj.c_str())) {
+	  found = true;
+	  break;
+	}
+	++it;
       }
+      if (!found)
+	break;
     }
+
+    ++attempt;
+    ASSERT_LE(attempt, 20);
+    cout << "hrm, object is present in cache on attempt " << attempt
+	 << ", retrying" << std::endl;
   }
 
   // Read until the object is present in the cache tier
+  cout << "verifying " << obj << " is eventually promoted" << std::endl;
   while (true) {
     bufferlist bl;
-    ASSERT_EQ(1, ioctx.read("foo", bl, 1, 0));
+    ASSERT_EQ(1, ioctx.read(obj.c_str(), bl, 1, 0));
 
+    bool there = false;
     NObjectIterator it = cache_ioctx.nobjects_begin();
-    if (it != cache_ioctx.nobjects_end()) {
-      ASSERT_TRUE(it->get_oid() == string("foo"));
+    while (it != cache_ioctx.nobjects_end()) {
+      if (it->get_oid() == string(obj.c_str())) {
+	there = true;
+	break;
+      }
       ++it;
-      ASSERT_TRUE(it == cache_ioctx.nobjects_end());
-      break;
     }
+    if (there)
+      break;
 
     sleep(1);
   }
@@ -4460,6 +4908,157 @@ TEST_F(LibRadosTwoPoolsECPP, ProxyRead) {
   cluster.wait_for_latest_osdmap();
 }
 
+TEST_F(LibRadosTwoPoolsECPP, CachePin) {
+  // create object
+  {
+    bufferlist bl;
+    bl.append("hi there");
+    ObjectWriteOperation op;
+    op.write_full(bl);
+    ASSERT_EQ(0, ioctx.operate("foo", &op));
+  }
+  {
+    bufferlist bl;
+    bl.append("hi there");
+    ObjectWriteOperation op;
+    op.write_full(bl);
+    ASSERT_EQ(0, ioctx.operate("bar", &op));
+  }
+  {
+    bufferlist bl;
+    bl.append("hi there");
+    ObjectWriteOperation op;
+    op.write_full(bl);
+    ASSERT_EQ(0, ioctx.operate("baz", &op));
+  }
+  {
+    bufferlist bl;
+    bl.append("hi there");
+    ObjectWriteOperation op;
+    op.write_full(bl);
+    ASSERT_EQ(0, ioctx.operate("bam", &op));
+  }
+
+  // configure cache
+  bufferlist inbl;
+  ASSERT_EQ(0, cluster.mon_command(
+    "{\"prefix\": \"osd tier add\", \"pool\": \"" + pool_name +
+    "\", \"tierpool\": \"" + cache_pool_name +
+    "\", \"force_nonempty\": \"--force-nonempty\" }",
+    inbl, NULL, NULL));
+  ASSERT_EQ(0, cluster.mon_command(
+    "{\"prefix\": \"osd tier set-overlay\", \"pool\": \"" + pool_name +
+    "\", \"overlaypool\": \"" + cache_pool_name + "\"}",
+    inbl, NULL, NULL));
+  ASSERT_EQ(0, cluster.mon_command(
+    "{\"prefix\": \"osd tier cache-mode\", \"pool\": \"" + cache_pool_name +
+    "\", \"mode\": \"writeback\"}",
+    inbl, NULL, NULL));
+
+  // wait for maps to settle
+  cluster.wait_for_latest_osdmap();
+
+  // read, trigger promote
+  {
+    bufferlist bl;
+    ASSERT_EQ(1, ioctx.read("foo", bl, 1, 0));
+    ASSERT_EQ(1, ioctx.read("bar", bl, 1, 0));
+    ASSERT_EQ(1, ioctx.read("baz", bl, 1, 0));
+    ASSERT_EQ(1, ioctx.read("bam", bl, 1, 0));
+  }
+
+  // verify the objects are present in the cache tier
+  {
+    NObjectIterator it = cache_ioctx.nobjects_begin();
+    ASSERT_TRUE(it != cache_ioctx.nobjects_end());
+    for (uint32_t i = 0; i < 4; i++) {
+      ASSERT_TRUE(it->get_oid() == string("foo") ||
+                  it->get_oid() == string("bar") ||
+                  it->get_oid() == string("baz") ||
+                  it->get_oid() == string("bam"));
+      ++it;
+    }
+    ASSERT_TRUE(it == cache_ioctx.nobjects_end());
+  }
+
+  // pin objects
+  {
+    ObjectWriteOperation op;
+    op.cache_pin();
+    librados::AioCompletion *completion = cluster.aio_create_completion();
+    ASSERT_EQ(0, cache_ioctx.aio_operate("foo", completion, &op));
+    completion->wait_for_safe();
+    ASSERT_EQ(0, completion->get_return_value());
+    completion->release();
+  }
+  {
+    ObjectWriteOperation op;
+    op.cache_pin();
+    librados::AioCompletion *completion = cluster.aio_create_completion();
+    ASSERT_EQ(0, cache_ioctx.aio_operate("baz", completion, &op));
+    completion->wait_for_safe();
+    ASSERT_EQ(0, completion->get_return_value());
+    completion->release();
+  }
+
+  // enable agent
+  ASSERT_EQ(0, cluster.mon_command(
+    set_pool_str(cache_pool_name, "hit_set_count", 2),
+    inbl, NULL, NULL));
+  ASSERT_EQ(0, cluster.mon_command(
+    set_pool_str(cache_pool_name, "hit_set_period", 600),
+    inbl, NULL, NULL));
+  ASSERT_EQ(0, cluster.mon_command(
+    set_pool_str(cache_pool_name, "hit_set_type", "bloom"),
+    inbl, NULL, NULL));
+  ASSERT_EQ(0, cluster.mon_command(
+    set_pool_str(cache_pool_name, "min_read_recency_for_promote", 1),
+    inbl, NULL, NULL));
+  ASSERT_EQ(0, cluster.mon_command(
+    set_pool_str(cache_pool_name, "target_max_objects", 1),
+    inbl, NULL, NULL));
+
+  sleep(10);
+
+  // Verify the pinned objects 'foo' and 'baz' are not flushed/evicted
+  uint32_t count = 0;
+  while (true) {
+    bufferlist bl;
+    ASSERT_EQ(1, ioctx.read("baz", bl, 1, 0));
+
+    count = 0;
+    NObjectIterator it = cache_ioctx.nobjects_begin();
+    while (it != cache_ioctx.nobjects_end()) {
+      ASSERT_TRUE(it->get_oid() == string("foo") ||
+                  it->get_oid() == string("bar") ||
+                  it->get_oid() == string("baz") ||
+                  it->get_oid() == string("bam"));
+      ++count;
+      ++it;
+    }
+    if (count == 2) {
+      // 'it' is nobjects_end() here and must not be dereferenced;
+      // re-scan to confirm only the pinned objects remain
+      for (NObjectIterator it2 = cache_ioctx.nobjects_begin();
+           it2 != cache_ioctx.nobjects_end(); ++it2) {
+        ASSERT_TRUE(it2->get_oid() == string("foo") ||
+                    it2->get_oid() == string("baz"));
+      }
+      break;
+    }
+
+    sleep(1);
+  }
+
+  // tear down tiers
+  ASSERT_EQ(0, cluster.mon_command(
+    "{\"prefix\": \"osd tier remove-overlay\", \"pool\": \"" + pool_name +
+    "\"}",
+    inbl, NULL, NULL));
+  ASSERT_EQ(0, cluster.mon_command(
+    "{\"prefix\": \"osd tier remove\", \"pool\": \"" + pool_name +
+    "\", \"tierpool\": \"" + cache_pool_name + "\"}",
+    inbl, NULL, NULL));
+
+  // wait for maps to settle before next test
+  cluster.wait_for_latest_osdmap();
+}
+
 int main(int argc, char **argv)
 {
   ::testing::InitGoogleTest(&argc, argv);
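
The CachePin tests above exercise the new cache-tier pinning operations: while an object is pinned, cache_flush/cache_try_flush and cache_evict return -EPERM, which is what keeps it resident while the tiering agent (target_max_objects = 1) drains everything else. They also repeat the librados completion lifecycle worth calling out: create, operate, wait, check, release. A condensed sketch of one pin, assuming a connected cluster and an open IoCtx on the cache pool holding an object "foo":

    #include <rados/librados.hpp>

    // Pin one object in a cache tier; 'cluster' is connected and
    // 'cache_ioctx' is open on the cache pool (both assumed).
    int pin_object(librados::Rados &cluster, librados::IoCtx &cache_ioctx)
    {
      librados::ObjectWriteOperation op;
      op.cache_pin();

      librados::AioCompletion *completion = cluster.aio_create_completion();
      int r = cache_ioctx.aio_operate("foo", completion, &op);
      if (r == 0) {
        completion->wait_for_safe();         // block until durable
        r = completion->get_return_value();  // the operation's own result
      }
      completion->release();                 // completions are refcounted;
                                             // release them, never delete
      return r;
    }
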
diff --git a/src/test/librados/watch_notify.cc b/src/test/librados/watch_notify.cc
index c424fd8..c191a38 100644
--- a/src/test/librados/watch_notify.cc
+++ b/src/test/librados/watch_notify.cc
@@ -104,7 +104,7 @@ public:
 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
 
 TEST_F(LibRadosWatchNotify, WatchNotify) {
-  ASSERT_NE(SEM_FAILED, (sem = sem_open("test_watch_notify_sem", O_CREAT, 0644, 0)));
+  ASSERT_NE(SEM_FAILED, (sem = sem_open("/test_watch_notify_sem", O_CREAT, 0644, 0)));
   char buf[128];
   memset(buf, 0xcc, sizeof(buf));
   ASSERT_EQ(0, rados_write(ioctx, "foo", buf, sizeof(buf), 0));
@@ -124,7 +124,7 @@ TEST_F(LibRadosWatchNotify, WatchNotify) {
 }
 
 TEST_P(LibRadosWatchNotifyPP, WatchNotify) {
-  ASSERT_NE(SEM_FAILED, (sem = sem_open("test_watch_notify_sem", O_CREAT, 0644, 0)));
+  ASSERT_NE(SEM_FAILED, (sem = sem_open("/test_watch_notify_sem", O_CREAT, 0644, 0)));
   char buf[128];
   memset(buf, 0xcc, sizeof(buf));
   bufferlist bl1;
@@ -145,7 +145,7 @@ TEST_P(LibRadosWatchNotifyPP, WatchNotify) {
 }
 
 TEST_F(LibRadosWatchNotifyEC, WatchNotify) {
-  ASSERT_NE(SEM_FAILED, (sem = sem_open("test_watch_notify_sem", O_CREAT, 0644, 0)));
+  ASSERT_NE(SEM_FAILED, (sem = sem_open("/test_watch_notify_sem", O_CREAT, 0644, 0)));
   char buf[128];
   memset(buf, 0xcc, sizeof(buf));
   ASSERT_EQ(0, rados_write(ioctx, "foo", buf, sizeof(buf), 0));
@@ -160,7 +160,7 @@ TEST_F(LibRadosWatchNotifyEC, WatchNotify) {
 }
 
 TEST_F(LibRadosWatchNotifyECPP, WatchNotify) {
-  ASSERT_NE(SEM_FAILED, (sem = sem_open("test_watch_notify_sem", O_CREAT, 0644, 0)));
+  ASSERT_NE(SEM_FAILED, (sem = sem_open("/test_watch_notify_sem", O_CREAT, 0644, 0)));
   char buf[128];
   memset(buf, 0xcc, sizeof(buf));
   bufferlist bl1;
@@ -183,7 +183,7 @@ TEST_F(LibRadosWatchNotifyECPP, WatchNotify) {
 // --
 
 TEST_P(LibRadosWatchNotifyPP, WatchNotifyTimeout) {
-  ASSERT_NE(SEM_FAILED, (sem = sem_open("test_watch_notify_sem", O_CREAT, 0644, 0)));
+  ASSERT_NE(SEM_FAILED, (sem = sem_open("/test_watch_notify_sem", O_CREAT, 0644, 0)));
   ioctx.set_notify_timeout(1);
   uint64_t handle;
   WatchNotifyTestCtx ctx;
@@ -200,7 +200,7 @@ TEST_P(LibRadosWatchNotifyPP, WatchNotifyTimeout) {
 }
 
 TEST_F(LibRadosWatchNotifyECPP, WatchNotifyTimeout) {
-  ASSERT_NE(SEM_FAILED, (sem = sem_open("test_watch_notify_sem", O_CREAT, 0644, 0)));
+  ASSERT_NE(SEM_FAILED, (sem = sem_open("/test_watch_notify_sem", O_CREAT, 0644, 0)));
   ioctx.set_notify_timeout(1);
   uint64_t handle;
   WatchNotifyTestCtx ctx;
@@ -378,6 +378,59 @@ TEST_F(LibRadosWatchNotify, WatchNotify2) {
   rados_unwatch2(ioctx, handle);
 }
 
+TEST_F(LibRadosWatchNotify, AioNotify) {
+  notify_io = ioctx;
+  notify_oid = "foo";
+  notify_cookies.clear();
+  char buf[128];
+  memset(buf, 0xcc, sizeof(buf));
+  ASSERT_EQ(0, rados_write(ioctx, notify_oid, buf, sizeof(buf), 0));
+  uint64_t handle;
+  ASSERT_EQ(0,
+      rados_watch2(ioctx, notify_oid, &handle,
+		   watch_notify2_test_cb,
+		   watch_notify2_test_errcb, NULL));
+  ASSERT_GT(rados_watch_check(ioctx, handle), 0);
+  char *reply_buf = 0;
+  size_t reply_buf_len;
+  rados_completion_t comp;
+  ASSERT_EQ(0, rados_aio_create_completion(NULL, NULL, NULL, &comp));
+  ASSERT_EQ(0, rados_aio_notify(ioctx, "foo", comp, "notify", 6, 300000,
+                                &reply_buf, &reply_buf_len));
+  ASSERT_EQ(0, rados_aio_wait_for_complete(comp));
+  ASSERT_EQ(0, rados_aio_get_return_value(comp));
+  rados_aio_release(comp);
+
+  bufferlist reply;
+  reply.append(reply_buf, reply_buf_len);
+  std::map<std::pair<uint64_t,uint64_t>, bufferlist> reply_map;
+  std::set<std::pair<uint64_t,uint64_t> > missed_map;
+  bufferlist::iterator reply_p = reply.begin();
+  ::decode(reply_map, reply_p);
+  ::decode(missed_map, reply_p);
+  ASSERT_EQ(1u, reply_map.size());
+  ASSERT_EQ(0u, missed_map.size());
+  ASSERT_EQ(1u, notify_cookies.size());
+  ASSERT_EQ(1u, notify_cookies.count(handle));
+  ASSERT_EQ(5u, reply_map.begin()->second.length());
+  ASSERT_EQ(0, strncmp("reply", reply_map.begin()->second.c_str(), 5));
+  ASSERT_GT(rados_watch_check(ioctx, handle), 0);
+  rados_buffer_free(reply_buf);
+
+  // try it on a non-existent object ... our buffer pointers
+  // should get zeroed.
+  ASSERT_EQ(0, rados_aio_create_completion(NULL, NULL, NULL, &comp));
+  ASSERT_EQ(0, rados_aio_notify(ioctx, "doesnotexist", comp, "notify", 6,
+                                300000, &reply_buf, &reply_buf_len));
+  ASSERT_EQ(0, rados_aio_wait_for_complete(comp));
+  ASSERT_EQ(-ENOENT, rados_aio_get_return_value(comp));
+  rados_aio_release(comp);
+  ASSERT_EQ((char*)0, reply_buf);
+  ASSERT_EQ(0u, reply_buf_len);
+
+  rados_unwatch2(ioctx, handle);
+}
+
 TEST_P(LibRadosWatchNotifyPP, WatchNotify2) {
   notify_oid = "foo";
   notify_ioctx = &ioctx;
@@ -411,6 +464,43 @@ TEST_P(LibRadosWatchNotifyPP, WatchNotify2) {
   ioctx.unwatch2(handle);
 }
 
+TEST_P(LibRadosWatchNotifyPP, AioNotify) {
+  notify_oid = "foo";
+  notify_ioctx = &ioctx;
+  notify_cookies.clear();
+  char buf[128];
+  memset(buf, 0xcc, sizeof(buf));
+  bufferlist bl1;
+  bl1.append(buf, sizeof(buf));
+  ASSERT_EQ(0, ioctx.write(notify_oid, bl1, sizeof(buf), 0));
+  uint64_t handle;
+  WatchNotifyTestCtx2 ctx;
+  ASSERT_EQ(0, ioctx.watch2(notify_oid, &handle, &ctx));
+  ASSERT_GT(ioctx.watch_check(handle), 0);
+  std::list<obj_watch_t> watches;
+  ASSERT_EQ(0, ioctx.list_watchers(notify_oid, &watches));
+  ASSERT_EQ(watches.size(), 1u);
+  bufferlist bl2, bl_reply;
+  librados::AioCompletion *comp = cluster.aio_create_completion();
+  ASSERT_EQ(0, ioctx.aio_notify(notify_oid, comp, bl2, 300000, &bl_reply));
+  ASSERT_EQ(0, comp->wait_for_complete());
+  ASSERT_EQ(0, comp->get_return_value());
+  comp->release();
+  bufferlist::iterator p = bl_reply.begin();
+  std::map<std::pair<uint64_t,uint64_t>,bufferlist> reply_map;
+  std::set<std::pair<uint64_t,uint64_t> > missed_map;
+  ::decode(reply_map, p);
+  ::decode(missed_map, p);
+  ASSERT_EQ(1u, notify_cookies.size());
+  ASSERT_EQ(1u, notify_cookies.count(handle));
+  ASSERT_EQ(1u, reply_map.size());
+  ASSERT_EQ(5u, reply_map.begin()->second.length());
+  ASSERT_EQ(0, strncmp("reply", reply_map.begin()->second.c_str(), 5));
+  ASSERT_EQ(0u, missed_map.size());
+  ASSERT_GT(ioctx.watch_check(handle), 0);
+  ioctx.unwatch2(handle);
+}
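
Both AioNotify tests follow the same shape: the notify payload goes out through an AioCompletion instead of blocking, and the reply bufferlist decodes into two maps — the acks from reachable watchers and the watchers that timed out. Note also the contract checked at the end of the C test: on error, the caller's reply buffer pointer and length are zeroed. A reduced sketch of the C++ call path, assuming a watcher is already registered on "foo":

    #include <rados/librados.hpp>

    // Send a notify without blocking the caller until the explicit wait.
    int notify_watchers(librados::Rados &cluster, librados::IoCtx &ioctx)
    {
      librados::bufferlist payload, reply;
      librados::AioCompletion *comp = cluster.aio_create_completion();

      // 300000 ms is the notify timeout used throughout these tests
      int r = ioctx.aio_notify("foo", comp, payload, 300000, &reply);
      if (r == 0) {
        comp->wait_for_complete();     // notify acks arrive on 'complete'
        r = comp->get_return_value();
      }
      comp->release();
      return r;   // 'reply' now holds the encoded ack/missed maps
    }
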
+
 // --
 
 TEST_F(LibRadosWatchNotify, WatchNotify2Multi) {
diff --git a/src/test/librados_test_stub/LibradosTestStub.cc b/src/test/librados_test_stub/LibradosTestStub.cc
index 2a0006e..75f7fde 100644
--- a/src/test/librados_test_stub/LibradosTestStub.cc
+++ b/src/test/librados_test_stub/LibradosTestStub.cc
@@ -90,6 +90,9 @@ TestRadosClientPtr get_rados_client() {
     cct->_conf->apply_changes(NULL);
     client->reset(new librados::TestMemRadosClient(cct),
                   &librados::TestRadosClient::Deallocate);
+    if (g_ceph_context == NULL) {
+      g_ceph_context = cct;
+    }
     cct->put();
   }
   (*client)->get();
@@ -354,6 +357,13 @@ int IoCtx::aio_flush_async(AioCompletion *c) {
   return 0;
 }
 
+int IoCtx::aio_notify(const std::string& oid, AioCompletion *c, bufferlist& bl,
+                      uint64_t timeout_ms, bufferlist *pbl) {
+  TestIoCtxImpl *ctx = reinterpret_cast<TestIoCtxImpl*>(io_ctx_impl);
+  ctx->aio_notify(oid, c->pc, bl, timeout_ms, pbl);
+  return 0;
+}
+
 int IoCtx::aio_operate(const std::string& oid, AioCompletion *c,
                        ObjectReadOperation *op, bufferlist *pbl) {
   return aio_operate(oid, c, op, 0, pbl);
@@ -688,6 +698,11 @@ void ObjectReadOperation::sparse_read(uint64_t off, uint64_t len,
   o->ops.push_back(op);
 }
 
+void ObjectWriteOperation::append(const bufferlist &bl) {
+  TestObjectOperationImpl *o = reinterpret_cast<TestObjectOperationImpl*>(impl);
+  o->ops.push_back(boost::bind(&TestIoCtxImpl::append, _1, _2, bl, _4));
+}
+
 void ObjectWriteOperation::create(bool exclusive) {
   TestObjectOperationImpl *o = reinterpret_cast<TestObjectOperationImpl*>(impl);
   o->ops.push_back(boost::bind(&TestIoCtxImpl::create, _1, _2, exclusive));
diff --git a/src/test/librados_test_stub/MockTestMemIoCtxImpl.h b/src/test/librados_test_stub/MockTestMemIoCtxImpl.h
new file mode 100644
index 0000000..198db6c
--- /dev/null
+++ b/src/test/librados_test_stub/MockTestMemIoCtxImpl.h
@@ -0,0 +1,101 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef LIBRADOS_TEST_STUB_MOCK_TEST_MEM_IO_CTX_IMPL_H
+#define LIBRADOS_TEST_STUB_MOCK_TEST_MEM_IO_CTX_IMPL_H
+
+#include "test/librados_test_stub/TestMemIoCtxImpl.h"
+#include "gmock/gmock.h"
+
+namespace librados {
+
+class MockTestMemRadosClient;
+
+class MockTestMemIoCtxImpl : public TestMemIoCtxImpl {
+public:
+  MockTestMemIoCtxImpl(MockTestMemRadosClient *mock_client,
+                       TestMemRadosClient *client, int64_t pool_id,
+                       const std::string& pool_name,
+                       TestMemRadosClient::Pool *pool)
+    : TestMemIoCtxImpl(client, pool_id, pool_name, pool),
+      m_mock_client(mock_client), m_client(client) {
+    default_to_parent();
+  }
+
+  MockTestMemRadosClient *get_mock_rados_client() {
+    return m_mock_client;
+  }
+
+  virtual TestIoCtxImpl *clone() {
+    TestIoCtxImpl *io_ctx_impl = new ::testing::NiceMock<MockTestMemIoCtxImpl>(
+      m_mock_client, m_client, get_pool_id(), get_pool_name(), get_pool());
+    io_ctx_impl->set_snap_read(get_snap_read());
+    io_ctx_impl->set_snap_context(get_snap_context());
+    return io_ctx_impl;
+  }
+
+  MOCK_METHOD7(exec, int(const std::string& oid,
+                         TestClassHandler *handler,
+                         const char *cls,
+                         const char *method,
+                         bufferlist& inbl,
+                         bufferlist* outbl,
+                         const SnapContext &snapc));
+  int do_exec(const std::string& oid, TestClassHandler *handler,
+              const char *cls, const char *method, bufferlist& inbl,
+              bufferlist* outbl, const SnapContext &snapc) {
+    return TestMemIoCtxImpl::exec(oid, handler, cls, method, inbl, outbl,
+                                  snapc);
+  }
+
+  MOCK_METHOD4(read, int(const std::string& oid,
+                         size_t len,
+                         uint64_t off,
+                         bufferlist *bl));
+  int do_read(const std::string& oid, size_t len, uint64_t off,
+              bufferlist *bl) {
+    return TestMemIoCtxImpl::read(oid, len, off, bl);
+  }
+
+  MOCK_METHOD1(remove, int(const std::string& oid));
+  int do_remove(const std::string& oid) {
+    return TestMemIoCtxImpl::remove(oid);
+  }
+
+  MOCK_METHOD1(selfmanaged_snap_create, int(uint64_t *snap_id));
+  int do_selfmanaged_snap_create(uint64_t *snap_id) {
+    return TestMemIoCtxImpl::selfmanaged_snap_create(snap_id);
+  }
+
+  MOCK_METHOD1(selfmanaged_snap_remove, int(uint64_t snap_id));
+  int do_selfmanaged_snap_remove(uint64_t snap_id) {
+    return TestMemIoCtxImpl::selfmanaged_snap_remove(snap_id);
+  }
+
+  MOCK_METHOD3(write_full, int(const std::string& oid,
+                               bufferlist& bl,
+                               const SnapContext &snapc));
+  int do_write_full(const std::string& oid, bufferlist& bl,
+                    const SnapContext &snapc) {
+    return TestMemIoCtxImpl::write_full(oid, bl, snapc);
+  }
+
+  void default_to_parent() {
+    using namespace ::testing;
+
+    ON_CALL(*this, exec(_, _, _, _, _, _, _)).WillByDefault(Invoke(this, &MockTestMemIoCtxImpl::do_exec));
+    ON_CALL(*this, read(_, _, _, _)).WillByDefault(Invoke(this, &MockTestMemIoCtxImpl::do_read));
+    ON_CALL(*this, remove(_)).WillByDefault(Invoke(this, &MockTestMemIoCtxImpl::do_remove));
+    ON_CALL(*this, selfmanaged_snap_create(_)).WillByDefault(Invoke(this, &MockTestMemIoCtxImpl::do_selfmanaged_snap_create));
+    ON_CALL(*this, selfmanaged_snap_remove(_)).WillByDefault(Invoke(this, &MockTestMemIoCtxImpl::do_selfmanaged_snap_remove));
+    ON_CALL(*this, write_full(_, _, _)).WillByDefault(Invoke(this, &MockTestMemIoCtxImpl::do_write_full));
+  }
+
+private:
+  MockTestMemRadosClient *m_mock_client;
+  TestMemRadosClient *m_client;
+};
+
+} // namespace librados
+
+#endif // LIBRADOS_TEST_STUB_MOCK_TEST_MEM_IO_CTX_IMPL_H
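
MockTestMemIoCtxImpl is a textbook gmock "delegate to the real object" mock: every MOCK_METHODn has a do_* twin that calls the base implementation, and default_to_parent() wires them together with ON_CALL/Invoke, so a test overrides only the calls it cares about while everything else behaves like the in-memory store. The same idiom in isolation (Store/MockStore are invented for this sketch):

    #include "gmock/gmock.h"

    // Real implementation being wrapped.
    class Store {
    public:
      virtual ~Store() {}
      virtual int read(int key) { return key * 2; }
    };

    // Mock whose default behaviour is the real one.
    class MockStore : public Store {
    public:
      MockStore() {
        using namespace ::testing;
        ON_CALL(*this, read(_))
          .WillByDefault(Invoke(this, &MockStore::do_read));
      }

      MOCK_METHOD1(read, int(int key));
      int do_read(int key) { return Store::read(key); }  // delegate to parent
    };

    TEST(MockStoreTest, DefaultsToParent) {
      ::testing::NiceMock<MockStore> store;  // NiceMock: defaulted calls
                                             // raise no warnings
      EXPECT_EQ(84, store.read(42));         // real behaviour via ON_CALL

      EXPECT_CALL(store, read(7)).WillOnce(::testing::Return(-1));
      EXPECT_EQ(-1, store.read(7));          // selectively overridden
    }

    int main(int argc, char **argv) {
      ::testing::InitGoogleMock(&argc, argv);
      return RUN_ALL_TESTS();
    }
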
diff --git a/src/test/librados_test_stub/MockTestMemRadosClient.h b/src/test/librados_test_stub/MockTestMemRadosClient.h
new file mode 100644
index 0000000..1d0b994
--- /dev/null
+++ b/src/test/librados_test_stub/MockTestMemRadosClient.h
@@ -0,0 +1,36 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef LIBRADOS_TEST_STUB_MOCK_TEST_MEM_RADOS_CLIENT_H
+#define LIBRADOS_TEST_STUB_MOCK_TEST_MEM_RADOS_CLIENT_H
+
+#include "test/librados_test_stub/TestMemRadosClient.h"
+#include "test/librados_test_stub/MockTestMemIoCtxImpl.h"
+#include "gmock/gmock.h"
+
+namespace librados {
+
+class MockTestMemRadosClient : public TestMemRadosClient {
+public:
+  MockTestMemRadosClient(CephContext *cct) : TestMemRadosClient(cct) {
+    default_to_dispatch();
+  }
+
+  MOCK_METHOD2(create_ioctx, TestIoCtxImpl *(int64_t pool_id,
+                                             const std::string &pool_name));
+  TestIoCtxImpl *do_create_ioctx(int64_t pool_id,
+                                 const std::string &pool_name) {
+    return new ::testing::NiceMock<MockTestMemIoCtxImpl>(
+      this, this, pool_id, pool_name, get_pool(pool_name));
+  }
+
+  void default_to_dispatch() {
+    using namespace ::testing;
+
+    ON_CALL(*this, create_ioctx(_, _)).WillByDefault(Invoke(this, &MockTestMemRadosClient::do_create_ioctx));
+  }
+};
+
+} // namespace librados
+
+#endif // LIBRADOS_TEST_STUB_MOCK_TEST_MEM_RADOS_CLIENT_H
diff --git a/src/test/librados_test_stub/TestIoCtxImpl.cc b/src/test/librados_test_stub/TestIoCtxImpl.cc
index 30c1e13..4bde5c9 100644
--- a/src/test/librados_test_stub/TestIoCtxImpl.cc
+++ b/src/test/librados_test_stub/TestIoCtxImpl.cc
@@ -38,6 +38,7 @@ TestIoCtxImpl::TestIoCtxImpl(const TestIoCtxImpl& rhs)
 }
 
 TestIoCtxImpl::~TestIoCtxImpl() {
+  assert(m_pending_ops.read() == 0);
 }
 
 void TestObjectOperationImpl::get() {
@@ -90,11 +91,21 @@ void TestIoCtxImpl::aio_flush_async(AioCompletionImpl *c) {
   m_client->flush_aio_operations(c);
 }
 
+void TestIoCtxImpl::aio_notify(const std::string& oid, AioCompletionImpl *c,
+                               bufferlist& bl, uint64_t timeout_ms,
+                               bufferlist *pbl) {
+  m_pending_ops.inc();
+  c->get();
+  C_AioNotify *ctx = new C_AioNotify(this, c);
+  m_client->get_watch_notify().aio_notify(oid, bl, timeout_ms, pbl, ctx);
+}
+
 int TestIoCtxImpl::aio_operate(const std::string& oid, TestObjectOperationImpl &ops,
                                AioCompletionImpl *c, SnapContext *snap_context,
                                int flags) {
   // TODO flags for now
   ops.get();
+  m_pending_ops.inc();
   m_client->add_aio_operation(oid, true, boost::bind(
     &TestIoCtxImpl::execute_aio_operations, this, oid, &ops,
     reinterpret_cast<bufferlist*>(0),
@@ -108,6 +119,7 @@ int TestIoCtxImpl::aio_operate_read(const std::string& oid,
                                     bufferlist *pbl) {
   // TODO ignoring flags for now
   ops.get();
+  m_pending_ops.inc();
   m_client->add_aio_operation(oid, true, boost::bind(
     &TestIoCtxImpl::execute_aio_operations, this, oid, &ops, pbl, m_snapc), c);
   return 0;
@@ -146,6 +158,7 @@ int TestIoCtxImpl::operate(const std::string& oid, TestObjectOperationImpl &ops)
   AioCompletionImpl *comp = new AioCompletionImpl();
 
   ops.get();
+  m_pending_ops.inc();
   m_client->add_aio_operation(oid, false, boost::bind(
     &TestIoCtxImpl::execute_aio_operations, this, oid, &ops,
     reinterpret_cast<bufferlist*>(0), m_snapc), comp);
@@ -161,6 +174,7 @@ int TestIoCtxImpl::operate_read(const std::string& oid, TestObjectOperationImpl
   AioCompletionImpl *comp = new AioCompletionImpl();
 
   ops.get();
+  m_pending_ops.inc();
   m_client->add_aio_operation(oid, false, boost::bind(
     &TestIoCtxImpl::execute_aio_operations, this, oid, &ops, pbl,
     m_snapc), comp);
@@ -270,8 +284,15 @@ int TestIoCtxImpl::execute_aio_operations(const std::string& oid,
       break;
     }
   }
+  m_pending_ops.dec();
   ops->put();
   return ret;
 }
 
+void TestIoCtxImpl::handle_aio_notify_complete(AioCompletionImpl *c, int r) {
+  m_pending_ops.dec();
+
+  m_client->finish_aio_completion(c, r);
+}
+
 } // namespace librados
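
The new m_pending_ops counter is bumped on every async submission and dropped on every completion path, and the destructor asserts it is zero, so a test that destroys an IoCtx with aio still in flight fails immediately rather than corrupting memory later. The same discipline sketched with std::atomic (the tree of this era uses Ceph's own atomic_t wrapper):

    #include <atomic>
    #include <cassert>

    class AsyncContext {
    public:
      // refuse to die with work still in flight
      ~AsyncContext() { assert(m_pending_ops.load() == 0); }

      void submit()   { m_pending_ops.fetch_add(1); /* queue the op...   */ }
      void complete() { /* ...deliver the result */ m_pending_ops.fetch_sub(1); }

    private:
      std::atomic<int> m_pending_ops{0};
    };
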
diff --git a/src/test/librados_test_stub/TestIoCtxImpl.h b/src/test/librados_test_stub/TestIoCtxImpl.h
index 450ee59..c38e90e 100644
--- a/src/test/librados_test_stub/TestIoCtxImpl.h
+++ b/src/test/librados_test_stub/TestIoCtxImpl.h
@@ -6,6 +6,7 @@
 
 #include "include/rados/librados.hpp"
 #include "include/atomic.h"
+#include "include/Context.h"
 #include "common/snap_types.h"
 #include <boost/function.hpp>
 #include <list>
@@ -71,6 +72,8 @@ public:
 
   virtual int aio_flush();
   virtual void aio_flush_async(AioCompletionImpl *c);
+  virtual void aio_notify(const std::string& oid, AioCompletionImpl *c,
+                          bufferlist& bl, uint64_t timeout_ms, bufferlist *pbl);
   virtual int aio_operate(const std::string& oid, TestObjectOperationImpl &ops,
                           AioCompletionImpl *c, SnapContext *snap_context,
                           int flags);
@@ -79,6 +82,8 @@ public:
                                bufferlist *pbl);
   virtual int aio_remove(const std::string& oid, AioCompletionImpl *c) = 0;
 
+  virtual int append(const std::string& oid, const bufferlist &bl,
+                     const SnapContext &snapc) = 0;
   virtual int assert_exists(const std::string &oid) = 0;
 
   virtual int create(const std::string& oid, bool exclusive) = 0;
@@ -150,6 +155,16 @@ protected:
                              bufferlist *pbl, const SnapContext &snapc);
 
 private:
+  struct C_AioNotify : public Context {
+    TestIoCtxImpl *io_ctx;
+    AioCompletionImpl *aio_comp;
+    C_AioNotify(TestIoCtxImpl *_io_ctx, AioCompletionImpl *_aio_comp)
+      : io_ctx(_io_ctx), aio_comp(_aio_comp) {
+    }
+    virtual void finish(int r) {
+      io_ctx->handle_aio_notify_complete(aio_comp, r);
+    }
+  };
 
   TestRadosClient *m_client;
   int64_t m_pool_id;
@@ -157,7 +172,9 @@ private:
   snap_t m_snap_seq;
   SnapContext m_snapc;
   atomic_t m_refcount;
+  atomic_t m_pending_ops;
 
+  void handle_aio_notify_complete(AioCompletionImpl *aio_comp, int r);
 };
 
 } // namespace librados
diff --git a/src/test/librados_test_stub/TestMemIoCtxImpl.cc b/src/test/librados_test_stub/TestMemIoCtxImpl.cc
index 6b124ed..9b95c50 100644
--- a/src/test/librados_test_stub/TestMemIoCtxImpl.cc
+++ b/src/test/librados_test_stub/TestMemIoCtxImpl.cc
@@ -51,6 +51,23 @@ int TestMemIoCtxImpl::aio_remove(const std::string& oid, AioCompletionImpl *c) {
   return 0;
 }
 
+int TestMemIoCtxImpl::append(const std::string& oid, const bufferlist &bl,
+                             const SnapContext &snapc) {
+  if (get_snap_read() != CEPH_NOSNAP) {
+    return -EROFS;
+  }
+
+  TestMemRadosClient::SharedFile file;
+  {
+    RWLock::WLocker l(m_pool->file_lock);
+    file = get_file(oid, true, snapc);
+  }
+
+  RWLock::WLocker l(file->lock);
+  file->data.append(bl);
+  return 0;
+}
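
append() follows the stub's two-level locking discipline: the pool-wide file_lock is held only long enough to look up or create the file entry, then the per-file lock guards the data mutation, and writes through a snapshot read context are rejected with -EROFS up front. The same discipline sketched with std::shared_mutex (C++17, for brevity; the stub itself uses Ceph's RWLock):

    #include <map>
    #include <memory>
    #include <mutex>
    #include <shared_mutex>
    #include <string>

    struct File {
      std::shared_mutex lock;
      std::string data;
    };

    class Pool {
    public:
      int append(const std::string &oid, const std::string &bl) {
        std::shared_ptr<File> file;
        {
          // pool-wide lock only while resolving/creating the entry
          std::unique_lock<std::shared_mutex> l(file_lock);
          std::shared_ptr<File> &slot = files[oid];
          if (!slot)
            slot = std::make_shared<File>();
          file = slot;
        }
        // per-file lock for the actual data mutation
        std::unique_lock<std::shared_mutex> l(file->lock);
        file->data.append(bl);
        return 0;
      }

    private:
      std::shared_mutex file_lock;
      std::map<std::string, std::shared_ptr<File>> files;
    };
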
+
 int TestMemIoCtxImpl::assert_exists(const std::string &oid) {
   RWLock::RLocker l(m_pool->file_lock);
   TestMemRadosClient::SharedFile file = get_file(oid, false,
diff --git a/src/test/librados_test_stub/TestMemIoCtxImpl.h b/src/test/librados_test_stub/TestMemIoCtxImpl.h
index aa65415..6556c9c 100644
--- a/src/test/librados_test_stub/TestMemIoCtxImpl.h
+++ b/src/test/librados_test_stub/TestMemIoCtxImpl.h
@@ -21,6 +21,8 @@ public:
 
   virtual int aio_remove(const std::string& oid, AioCompletionImpl *c);
 
+  virtual int append(const std::string& oid, const bufferlist &bl,
+                     const SnapContext &snapc);
   virtual int assert_exists(const std::string &oid);
 
   virtual int create(const std::string& oid, bool exclusive);
diff --git a/src/test/librados_test_stub/TestRadosClient.cc b/src/test/librados_test_stub/TestRadosClient.cc
index 46437ac..1a9792c 100644
--- a/src/test/librados_test_stub/TestRadosClient.cc
+++ b/src/test/librados_test_stub/TestRadosClient.cc
@@ -221,6 +221,10 @@ void TestRadosClient::flush_aio_operations(AioCompletionImpl *c) {
   }
 }
 
+void TestRadosClient::finish_aio_completion(AioCompletionImpl *c, int r) {
+  librados::finish_aio_completion(c, r);
+}
+
 Finisher *TestRadosClient::get_finisher(const std::string &oid) {
   std::size_t h = m_hash(oid);
   return m_finishers[h % m_finishers.size()];
diff --git a/src/test/librados_test_stub/TestRadosClient.h b/src/test/librados_test_stub/TestRadosClient.h
index ad0cf67..d3c2034 100644
--- a/src/test/librados_test_stub/TestRadosClient.h
+++ b/src/test/librados_test_stub/TestRadosClient.h
@@ -99,6 +99,8 @@ public:
   void flush_aio_operations();
   void flush_aio_operations(AioCompletionImpl *c);
 
+  void finish_aio_completion(AioCompletionImpl *c, int r);
+
 protected:
   virtual ~TestRadosClient();
 
diff --git a/src/test/librados_test_stub/TestWatchNotify.cc b/src/test/librados_test_stub/TestWatchNotify.cc
index 14a43bc..ef8f537 100644
--- a/src/test/librados_test_stub/TestWatchNotify.cc
+++ b/src/test/librados_test_stub/TestWatchNotify.cc
@@ -58,38 +58,31 @@ int TestWatchNotify::list_watchers(const std::string& o,
   return 0;
 }
 
+void TestWatchNotify::aio_notify(const std::string& oid, bufferlist& bl,
+                                 uint64_t timeout_ms, bufferlist *pbl,
+                                 Context *on_notify) {
+  SharedWatcher watcher = get_watcher(oid);
+  RWLock::WLocker watcher_locker(watcher->lock);
+  Mutex::Locker file_watcher_lock(m_file_watcher_lock);
+  ++m_pending_notifies;
+  uint64_t notify_id = ++m_notify_id;
+
+  SharedNotifyHandle notify_handle(new NotifyHandle());
+  notify_handle->pbl = pbl;
+
+  watcher->notify_handles[notify_id] = notify_handle;
+
+  FunctionContext *ctx = new FunctionContext(
+      boost::bind(&TestWatchNotify::execute_notify, this,
+                  oid, bl, notify_id, on_notify));
+  m_finisher->queue(ctx);
+}
+
 int TestWatchNotify::notify(const std::string& oid, bufferlist& bl,
                             uint64_t timeout_ms, bufferlist *pbl) {
-  Mutex lock("TestRadosClient::watcher_notify::lock");
-  Cond cond;
-  bool done = false;
-
-  {
-    SharedWatcher watcher = get_watcher(oid);
-    RWLock::WLocker l(watcher->lock);
-    {
-      Mutex::Locker l2(m_file_watcher_lock);
-      ++m_pending_notifies;
-      uint64_t notify_id = ++m_notify_id;
-
-      SharedNotifyHandle notify_handle(new NotifyHandle());
-      notify_handle->pbl = pbl;
-
-      watcher->notify_handles[notify_id] = notify_handle;
-
-      FunctionContext *ctx = new FunctionContext(
-          boost::bind(&TestWatchNotify::execute_notify, this,
-                      oid, bl, notify_id, &lock, &cond, &done));
-      m_finisher->queue(ctx);
-    }
-  }
-
-  lock.Lock();
-  while (!done) {
-    cond.Wait(lock);
-  }
-  lock.Unlock();
-  return 0;
+  C_SaferCond cond;
+  aio_notify(oid, bl, timeout_ms, pbl, &cond);
+  return cond.wait();
 }
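
Rewriting notify() as aio_notify() plus a wait is the standard sync-over-async reduction: C_SaferCond is a Context that records the completion code and wakes the waiter, replacing the hand-rolled Mutex/Cond/done triple above. The shape of the reduction with only the standard library (a sketch, not the stub's actual code):

    #include <functional>
    #include <future>
    #include <thread>

    // Stand-in asynchronous primitive: completes on another thread.
    static void aio_do_thing(std::function<void(int)> done)
    {
      std::thread([done] { done(0); }).detach();
    }

    // Synchronous wrapper: the same shape as notify() above, now just
    // aio_notify() plus a wait on a completion context (C_SaferCond).
    static int do_thing()
    {
      std::promise<int> p;
      std::future<int> f = p.get_future();
      aio_do_thing([&p](int r) { p.set_value(r); });
      return f.get();   // block until the callback fires
    }

    int main() { return do_thing(); }
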
 
 void TestWatchNotify::notify_ack(const std::string& o, uint64_t notify_id,
@@ -169,8 +162,7 @@ TestWatchNotify::SharedWatcher TestWatchNotify::_get_watcher(
 
 void TestWatchNotify::execute_notify(const std::string &oid,
                                      bufferlist &bl, uint64_t notify_id,
-                                     Mutex *lock, Cond *cond,
-                                     bool *done) {
+                                     Context *on_notify) {
   WatchHandles watch_handles;
   SharedNotifyHandle notify_handle;
 
@@ -218,15 +210,11 @@ void TestWatchNotify::execute_notify(const std::string &oid,
     }
   }
 
-  Mutex::Locker l3(*lock);
-  *done = true;
-  cond->Signal();
+  on_notify->complete(0);
 
-  {
-    Mutex::Locker file_watcher_locker(m_file_watcher_lock);
-    if (--m_pending_notifies == 0) {
-      m_file_watcher_cond.Signal();
-    }
+  Mutex::Locker file_watcher_locker(m_file_watcher_lock);
+  if (--m_pending_notifies == 0) {
+    m_file_watcher_cond.Signal();
   }
 }
 
diff --git a/src/test/librados_test_stub/TestWatchNotify.h b/src/test/librados_test_stub/TestWatchNotify.h
index 1761302..6f99704 100644
--- a/src/test/librados_test_stub/TestWatchNotify.h
+++ b/src/test/librados_test_stub/TestWatchNotify.h
@@ -57,6 +57,8 @@ public:
   void flush();
   int list_watchers(const std::string& o,
                     std::list<obj_watch_t> *out_watchers);
+  void aio_notify(const std::string& oid, bufferlist& bl, uint64_t timeout_ms,
+                  bufferlist *pbl, Context *on_notify);
   int notify(const std::string& o, bufferlist& bl,
              uint64_t timeout_ms, bufferlist *pbl);
   void notify_ack(const std::string& o, uint64_t notify_id,
@@ -84,7 +86,7 @@ private:
   SharedWatcher get_watcher(const std::string& oid);
   SharedWatcher _get_watcher(const std::string& oid);
   void execute_notify(const std::string &oid, bufferlist &bl,
-                      uint64_t notify_id, Mutex *lock, Cond *cond, bool *done);
+                      uint64_t notify_id, Context *on_notify);
 
 };
 
diff --git a/src/test/libradosstriper/rados-striper.sh b/src/test/libradosstriper/rados-striper.sh
index df0a837..ed0e892 100755
--- a/src/test/libradosstriper/rados-striper.sh
+++ b/src/test/libradosstriper/rados-striper.sh
@@ -20,7 +20,7 @@ function run() {
     local dir=$1
     shift
 
-    export CEPH_MON="127.0.0.1:7116"
+    export CEPH_MON="127.0.0.1:7116" # git grep '\<7116\>' : there must be only one
     export CEPH_ARGS
     CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
     CEPH_ARGS+="--mon-host=$CEPH_MON "
diff --git a/src/test/librbd/fsx.cc b/src/test/librbd/fsx.cc
index de40625..e7bfcf1 100644
--- a/src/test/librbd/fsx.cc
+++ b/src/test/librbd/fsx.cc
@@ -550,7 +550,7 @@ krbd_open(const char *name, struct rbd_ctx *ctx)
 	if (ret < 0)
 		return ret;
 
-	ret = krbd_map(krbd, pool, name, NULL, NULL, &devnode);
+	ret = krbd_map(krbd, pool, name, "", "", &devnode);
 	if (ret < 0) {
 		prt("krbd_map(%s) failed\n", name);
 		return ret;
diff --git a/src/test/librbd/test_ImageWatcher.cc b/src/test/librbd/test_ImageWatcher.cc
index 8813152..0f137fe 100644
--- a/src/test/librbd/test_ImageWatcher.cc
+++ b/src/test/librbd/test_ImageWatcher.cc
@@ -13,6 +13,7 @@
 #include "cls/lock/cls_lock_client.h"
 #include "cls/lock/cls_lock_types.h"
 #include "librbd/AioCompletion.h"
+#include "librbd/AioImageRequestWQ.h"
 #include "librbd/internal.h"
 #include "librbd/ImageCtx.h"
 #include "librbd/ImageWatcher.h"
@@ -40,12 +41,48 @@ void register_test_image_watcher() {
 class TestImageWatcher : public TestFixture {
 public:
 
-  TestImageWatcher() : m_watch_ctx(NULL), m_aio_completion_restarts(0),
-		       m_expected_aio_restarts(0),
-		       m_callback_lock("m_callback_lock")
+  TestImageWatcher() : m_watch_ctx(NULL), m_callback_lock("m_callback_lock")
   {
   }
 
+  struct LockListener : public librbd::ImageWatcher::Listener {
+    Mutex lock;
+    Cond cond;
+    size_t releasing_lock_count;
+    size_t lock_updated_count;
+    bool lock_owner;
+
+    LockListener()
+      : lock("lock"), releasing_lock_count(0), lock_updated_count(0),
+        lock_owner(false) {
+    }
+
+    virtual bool handle_requested_lock() {
+      return true;
+    }
+    virtual void handle_lock_updated(
+        librbd::ImageWatcher::LockUpdateState state) {
+      Mutex::Locker locker(lock);
+      ++lock_updated_count;
+
+      switch (state) {
+      case librbd::ImageWatcher::LOCK_UPDATE_STATE_NOT_SUPPORTED:
+      case librbd::ImageWatcher::LOCK_UPDATE_STATE_UNLOCKED:
+      case librbd::ImageWatcher::LOCK_UPDATE_STATE_NOTIFICATION:
+        lock_owner = false;
+        break;
+      case librbd::ImageWatcher::LOCK_UPDATE_STATE_RELEASING:
+        lock_owner = false;
+        ++releasing_lock_count;
+        break;
+      case librbd::ImageWatcher::LOCK_UPDATE_STATE_LOCKED:
+        lock_owner = true;
+        break;
+      }
+      cond.Signal();
+    }
+  };
+
   class WatchCtx : public librados::WatchCtx2 {
   public:
     WatchCtx(TestImageWatcher &parent) : m_parent(parent), m_handle(0) {}
@@ -73,8 +110,10 @@ public:
 	DECODE_FINISH(iter);
 
         NotifyOp notify_op = static_cast<NotifyOp>(op);
+        /*
 	std::cout << "NOTIFY: " << notify_op << ", " << notify_id
 		  << ", " << cookie << ", " << notifier_id << std::endl;
+        */
 
 	Mutex::Locker l(m_parent.m_callback_lock);
         m_parent.m_notify_payloads[notify_op] = payload;
@@ -126,80 +165,49 @@ public:
     return 0;
   }
 
+  void register_lock_listener(librbd::ImageCtx &ictx) {
+    ictx.image_watcher->register_listener(&m_lock_listener);
+  }
+
   int register_image_watch(librbd::ImageCtx &ictx) {
     m_watch_ctx = new WatchCtx(*this);
     return m_watch_ctx->watch(ictx);
   }
 
-  bool wait_for_notifies(librbd::ImageCtx &ictx) {
-    Mutex::Locker l(m_callback_lock);
-    while (m_notifies.size() < m_notify_acks.size()) {
-      int r = m_callback_cond.WaitInterval(ictx.cct, m_callback_lock,
-					 utime_t(10, 0));
-      if (r != 0) {
-	break;
+  bool wait_for_releasing_lock(librbd::ImageCtx &ictx) {
+    Mutex::Locker locker(m_lock_listener.lock);
+    while (m_lock_listener.releasing_lock_count == 0) {
+      if (m_lock_listener.cond.WaitInterval(ictx.cct, m_lock_listener.lock,
+                                            utime_t(10, 0)) != 0) {
+        return false;
       }
     }
-    return (m_notifies.size() == m_notify_acks.size());
+    m_lock_listener.releasing_lock_count = 0;
+    return true;
   }
 
-
-  librbd::AioCompletion *create_aio_completion(librbd::ImageCtx &ictx) {
-    librbd::AioCompletion *aio_completion = new librbd::AioCompletion();
-    aio_completion->complete_cb = &handle_aio_completion;
-    aio_completion->complete_arg = this;
-
-    aio_completion->init_time(&ictx, librbd::AIO_TYPE_NONE);
-    m_aio_completions.insert(aio_completion);
-    return aio_completion;
-  }
-
-  static void handle_aio_completion(void *arg1, void *arg2) {
-    TestImageWatcher *test_image_watcher =
-      reinterpret_cast<TestImageWatcher *>(arg2);
-    assert(test_image_watcher->m_callback_lock.is_locked());
-    test_image_watcher->m_callback_cond.Signal();
-  }
-
-  int handle_restart_aio(librbd::ImageCtx *ictx,
-			 librbd::AioCompletion *aio_completion) {
-    Mutex::Locker callback_locker(m_callback_lock);
-    ++m_aio_completion_restarts;
-
-    RWLock::RLocker owner_locker(ictx->owner_lock);
-    if (!ictx->image_watcher->is_lock_owner() &&
-        (m_expected_aio_restarts == 0 ||
-	 m_aio_completion_restarts < m_expected_aio_restarts)) {
-      ictx->image_watcher->request_lock(
-        boost::bind(&TestImageWatcher::handle_restart_aio, this, ictx, _1),
-	aio_completion);
-    } else {
-      {
-	Mutex::Locker completion_locker(aio_completion->lock);
-	aio_completion->complete(ictx->cct);
+  bool wait_for_lock_updated(librbd::ImageCtx &ictx) {
+    Mutex::Locker locker(m_lock_listener.lock);
+    while (m_lock_listener.lock_updated_count == 0) {
+      if (m_lock_listener.cond.WaitInterval(ictx.cct, m_lock_listener.lock,
+                                            utime_t(10, 0)) != 0) {
+        return false;
       }
-
-      m_aio_completions.erase(aio_completion);
-      aio_completion->release();
     }
-
-    m_callback_cond.Signal();
-    return 0;
+    m_lock_listener.lock_updated_count = 0;
+    return true;
   }
 
-  bool wait_for_aio_completions(librbd::ImageCtx &ictx) {
+  bool wait_for_notifies(librbd::ImageCtx &ictx) {
     Mutex::Locker l(m_callback_lock);
-    int r = 0;
-    while (!m_aio_completions.empty() &&
-           (m_expected_aio_restarts == 0 ||
-	    m_aio_completion_restarts < m_expected_aio_restarts)) {
-      r = m_callback_cond.WaitInterval(ictx.cct, m_callback_lock,
-				       utime_t(10, 0));
+    while (m_notifies.size() < m_notify_acks.size()) {
+      int r = m_callback_cond.WaitInterval(ictx.cct, m_callback_lock,
+					 utime_t(10, 0));
       if (r != 0) {
-        break;
+	break;
       }
     }
-    return (r == 0);
+    return (m_notifies.size() == m_notify_acks.size());
   }
 
   bufferlist create_response_message(int r) {
@@ -263,16 +271,14 @@ public:
 
   WatchCtx *m_watch_ctx;
 
+  LockListener m_lock_listener;
+
   NotifyOps m_notifies;
   NotifyOpPayloads m_notify_payloads;
   NotifyOpPayloads m_notify_acks;
 
   AsyncRequestId m_async_request_id;
 
-  std::set<librbd::AioCompletion *> m_aio_completions;
-  uint32_t m_aio_completion_restarts;
-  uint32_t m_expected_aio_restarts;
-
   Mutex m_callback_lock;
   Cond m_callback_cond;
 
@@ -476,17 +482,17 @@ TEST_F(TestImageWatcher, TryLockWithUserSharedLocked) {
   ASSERT_TRUE(ictx->image_watcher->is_lock_owner());
 }
 
-TEST_F(TestImageWatcher, UnlockNotLocked) {
+TEST_F(TestImageWatcher, ReleaseLockNotLocked) {
   REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
 
   librbd::ImageCtx *ictx;
   ASSERT_EQ(0, open_image(m_image_name, &ictx));
 
   RWLock::WLocker l(ictx->owner_lock);
-  ASSERT_EQ(0, ictx->image_watcher->unlock());
+  ASSERT_EQ(0, ictx->image_watcher->release_lock());
 }
 
-TEST_F(TestImageWatcher, UnlockNotifyReleaseLock) {
+TEST_F(TestImageWatcher, ReleaseLockNotifies) {
   REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
 
   librbd::ImageCtx *ictx;
@@ -504,7 +510,7 @@ TEST_F(TestImageWatcher, UnlockNotifyReleaseLock) {
   m_notify_acks += std::make_pair(NOTIFY_OP_RELEASED_LOCK, bufferlist());
   {
     RWLock::WLocker l(ictx->owner_lock);
-    ASSERT_EQ(0, ictx->image_watcher->unlock());
+    ASSERT_EQ(0, ictx->image_watcher->release_lock());
   }
   ASSERT_TRUE(wait_for_notifies(*ictx));
 
@@ -513,7 +519,7 @@ TEST_F(TestImageWatcher, UnlockNotifyReleaseLock) {
   ASSERT_EQ(expected_notify_ops, m_notifies);
 }
 
-TEST_F(TestImageWatcher, UnlockBrokenLock) {
+TEST_F(TestImageWatcher, ReleaseLockBrokenLock) {
   REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
 
   librbd::ImageCtx *ictx;
@@ -534,7 +540,7 @@ TEST_F(TestImageWatcher, UnlockBrokenLock) {
 					    lockers.begin()->first.cookie,
 					    lockers.begin()->first.locker));
 
-  ASSERT_EQ(0, ictx->image_watcher->unlock());
+  ASSERT_EQ(0, ictx->image_watcher->release_lock());
 }
 
 TEST_F(TestImageWatcher, RequestLock) {
@@ -543,19 +549,41 @@ TEST_F(TestImageWatcher, RequestLock) {
   librbd::ImageCtx *ictx;
   ASSERT_EQ(0, open_image(m_image_name, &ictx));
   ASSERT_EQ(0, register_image_watch(*ictx));
+
+  register_lock_listener(*ictx);
+  m_notify_acks = {{NOTIFY_OP_ACQUIRED_LOCK, {}}};
+
+  {
+    RWLock::RLocker owner_locker(ictx->owner_lock);
+    ictx->image_watcher->request_lock();
+  }
+
+  ASSERT_TRUE(wait_for_notifies(*ictx));
+  NotifyOps expected_notify_ops;
+  expected_notify_ops += NOTIFY_OP_ACQUIRED_LOCK;
+  ASSERT_EQ(expected_notify_ops, m_notifies);
+
+  {
+    RWLock::RLocker owner_locker(ictx->owner_lock);
+    ASSERT_TRUE(ictx->image_watcher->is_lock_owner());
+  }
+}
+
+TEST_F(TestImageWatcher, RequestLockFromPeer) {
+  REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
+
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+  ASSERT_EQ(0, register_image_watch(*ictx));
   ASSERT_EQ(0, lock_image(*ictx, LOCK_EXCLUSIVE,
 			  "auto " + stringify(m_watch_ctx->get_handle())));
 
+  register_lock_listener(*ictx);
   m_notify_acks = {{NOTIFY_OP_REQUEST_LOCK, create_response_message(0)}};
 
   {
-    RWLock::WLocker l(ictx->owner_lock);
-    ictx->image_watcher->request_lock(
-      boost::bind(&TestImageWatcher::handle_restart_aio, this, ictx, _1),
-      create_aio_completion(*ictx));
-    ictx->image_watcher->request_lock(
-      boost::bind(&TestImageWatcher::handle_restart_aio, this, ictx, _1),
-      create_aio_completion(*ictx));
+    RWLock::RLocker owner_locker(ictx->owner_lock);
+    ictx->image_watcher->request_lock();
   }
 
   ASSERT_TRUE(wait_for_notifies(*ictx));
@@ -565,8 +593,11 @@ TEST_F(TestImageWatcher, RequestLock) {
 
   ASSERT_EQ(0, unlock_image());
 
-  m_notifies.clear();
-  m_notify_acks = {{NOTIFY_OP_RELEASED_LOCK,{}}, {NOTIFY_OP_ACQUIRED_LOCK,{}}};
+  {
+    Mutex::Locker l(m_callback_lock);
+    m_notifies.clear();
+    m_notify_acks = {{NOTIFY_OP_RELEASED_LOCK,{}}};
+  }
 
   bufferlist bl;
   {
@@ -575,13 +606,28 @@ TEST_F(TestImageWatcher, RequestLock) {
     ENCODE_FINISH(bl);
   }
   ASSERT_EQ(0, m_ioctx.notify2(ictx->header_oid, bl, 5000, NULL));
+  ASSERT_TRUE(wait_for_lock_updated(*ictx));
+
+  {
+    Mutex::Locker l(m_callback_lock);
+    m_notifies.clear();
+    m_notify_acks = {{NOTIFY_OP_ACQUIRED_LOCK, {}}};
+  }
+
+  {
+    RWLock::RLocker owner_lock(ictx->owner_lock);
+    ictx->image_watcher->request_lock();
+  }
 
   ASSERT_TRUE(wait_for_notifies(*ictx));
   expected_notify_ops.clear();
-  expected_notify_ops += NOTIFY_OP_RELEASED_LOCK, NOTIFY_OP_ACQUIRED_LOCK;
+  expected_notify_ops += NOTIFY_OP_ACQUIRED_LOCK;
   ASSERT_EQ(expected_notify_ops, m_notifies);
 
-  ASSERT_TRUE(wait_for_aio_completions(*ictx));
+  {
+    RWLock::RLocker owner_locker(ictx->owner_lock);
+    ASSERT_TRUE(ictx->image_watcher->is_lock_owner());
+  }
 }
 
 TEST_F(TestImageWatcher, RequestLockTimedOut) {
@@ -593,14 +639,12 @@ TEST_F(TestImageWatcher, RequestLockTimedOut) {
   ASSERT_EQ(0, lock_image(*ictx, LOCK_EXCLUSIVE,
 			  "auto " + stringify(m_watch_ctx->get_handle())));
 
+  register_lock_listener(*ictx);
   m_notify_acks = {{NOTIFY_OP_REQUEST_LOCK, {}}};
 
-  m_expected_aio_restarts = 1;
   {
-    RWLock::WLocker l(ictx->owner_lock);
-    ictx->image_watcher->request_lock(
-      boost::bind(&TestImageWatcher::handle_restart_aio, this, ictx, _1),
-      create_aio_completion(*ictx));
+    RWLock::RLocker owner_locker(ictx->owner_lock);
+    ictx->image_watcher->request_lock();
   }
 
   ASSERT_TRUE(wait_for_notifies(*ictx));
@@ -608,7 +652,27 @@ TEST_F(TestImageWatcher, RequestLockTimedOut) {
   expected_notify_ops += NOTIFY_OP_REQUEST_LOCK;
   ASSERT_EQ(expected_notify_ops, m_notifies);
 
-  ASSERT_TRUE(wait_for_aio_completions(*ictx));
+  // should resend when empty ack returned
+  {
+    Mutex::Locker l(m_callback_lock);
+    m_notifies.clear();
+  }
+  ASSERT_TRUE(wait_for_notifies(*ictx));
+
+  {
+    Mutex::Locker l(m_callback_lock);
+    ASSERT_EQ(0, unlock_image());
+    m_notifies.clear();
+    m_notify_acks = {{NOTIFY_OP_ACQUIRED_LOCK, {}}};
+  }
+
+  ASSERT_TRUE(wait_for_notifies(*ictx));
+  ASSERT_TRUE(wait_for_lock_updated(*ictx));
+
+  {
+    RWLock::RLocker owner_locker(ictx->owner_lock);
+    ASSERT_TRUE(ictx->image_watcher->is_lock_owner());
+  }
 }
 
 TEST_F(TestImageWatcher, RequestLockIgnored) {
@@ -620,6 +684,7 @@ TEST_F(TestImageWatcher, RequestLockIgnored) {
   ASSERT_EQ(0, lock_image(*ictx, LOCK_EXCLUSIVE,
 			  "auto " + stringify(m_watch_ctx->get_handle())));
 
+  register_lock_listener(*ictx);
   m_notify_acks = {{NOTIFY_OP_REQUEST_LOCK, create_response_message(0)}};
 
   int orig_notify_timeout = ictx->cct->_conf->client_notify_timeout;
@@ -630,10 +695,8 @@ TEST_F(TestImageWatcher, RequestLockIgnored) {
   } BOOST_SCOPE_EXIT_END;
 
   {
-    RWLock::WLocker l(ictx->owner_lock);
-    ictx->image_watcher->request_lock(
-      boost::bind(&TestImageWatcher::handle_restart_aio, this, ictx, _1),
-      create_aio_completion(*ictx));
+    RWLock::RLocker owner_locker(ictx->owner_lock);
+    ictx->image_watcher->request_lock();
   }
 
   ASSERT_TRUE(wait_for_notifies(*ictx));
@@ -642,11 +705,27 @@ TEST_F(TestImageWatcher, RequestLockIgnored) {
   ASSERT_EQ(expected_notify_ops, m_notifies);
 
   // after the request times out -- it will be resent
+  {
+    Mutex::Locker l(m_callback_lock);
+    m_notifies.clear();
+  }
   ASSERT_TRUE(wait_for_notifies(*ictx));
   ASSERT_EQ(expected_notify_ops, m_notifies);
 
-  ASSERT_EQ(0, unlock_image());
-  ASSERT_TRUE(wait_for_aio_completions(*ictx));
+  {
+    Mutex::Locker l(m_callback_lock);
+    ASSERT_EQ(0, unlock_image());
+    m_notifies.clear();
+    m_notify_acks = {{NOTIFY_OP_ACQUIRED_LOCK, {}}};
+  }
+
+  ASSERT_TRUE(wait_for_notifies(*ictx));
+  ASSERT_TRUE(wait_for_lock_updated(*ictx));
+
+  {
+    RWLock::RLocker owner_locker(ictx->owner_lock);
+    ASSERT_TRUE(ictx->image_watcher->is_lock_owner());
+  }
 }
 
 TEST_F(TestImageWatcher, RequestLockTryLockRace) {
@@ -658,14 +737,12 @@ TEST_F(TestImageWatcher, RequestLockTryLockRace) {
   ASSERT_EQ(0, lock_image(*ictx, LOCK_EXCLUSIVE,
                           "auto " + stringify(m_watch_ctx->get_handle())));
 
+  register_lock_listener(*ictx);
   m_notify_acks = {{NOTIFY_OP_REQUEST_LOCK, create_response_message(0)}};
 
-  m_expected_aio_restarts = 1;
   {
-    RWLock::WLocker l(ictx->owner_lock);
-    ictx->image_watcher->request_lock(
-      boost::bind(&TestImageWatcher::handle_restart_aio, this, ictx, _1),
-      create_aio_completion(*ictx));
+    RWLock::RLocker owner_locker(ictx->owner_lock);
+    ictx->image_watcher->request_lock();
   }
 
   ASSERT_TRUE(wait_for_notifies(*ictx));
@@ -673,8 +750,11 @@ TEST_F(TestImageWatcher, RequestLockTryLockRace) {
   expected_notify_ops += NOTIFY_OP_REQUEST_LOCK;
   ASSERT_EQ(expected_notify_ops, m_notifies);
 
-  m_notifies.clear();
-  m_notify_acks = {{NOTIFY_OP_RELEASED_LOCK, {}}};
+  {
+    Mutex::Locker l(m_callback_lock);
+    m_notifies.clear();
+    m_notify_acks = {{NOTIFY_OP_RELEASED_LOCK, {}}};
+  }
 
   bufferlist bl;
   {
@@ -683,45 +763,59 @@ TEST_F(TestImageWatcher, RequestLockTryLockRace) {
     ENCODE_FINISH(bl);
   }
   ASSERT_EQ(0, m_ioctx.notify2(ictx->header_oid, bl, 5000, NULL));
-  ASSERT_TRUE(wait_for_aio_completions(*ictx));
-  RWLock::RLocker l(ictx->owner_lock);
-  ASSERT_FALSE(ictx->image_watcher->is_lock_owner());
-}
 
-TEST_F(TestImageWatcher, RequestLockPreTryLockFailed) {
-  REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
+  // after losing race -- it will re-request
+  ASSERT_TRUE(wait_for_notifies(*ictx));
 
-  librbd::ImageCtx *ictx;
-  ASSERT_EQ(0, open_image(m_image_name, &ictx));
-  ASSERT_EQ(0, lock_image(*ictx, LOCK_SHARED, "manually 1234"));
+  {
+    RWLock::RLocker owner_locker(ictx->owner_lock);
+    ASSERT_FALSE(ictx->image_watcher->is_lock_owner());
+  }
 
-  m_expected_aio_restarts = 1;
   {
-    RWLock::WLocker l(ictx->owner_lock);
-    ictx->image_watcher->request_lock(
-      boost::bind(&TestImageWatcher::handle_restart_aio, this, ictx, _1),
-      create_aio_completion(*ictx));
+    Mutex::Locker l(m_callback_lock);
+    ASSERT_EQ(0, unlock_image());
+    m_notifies.clear();
+    m_notify_acks = {{NOTIFY_OP_RELEASED_LOCK, {}}};
+  }
+
+  ASSERT_EQ(0, m_ioctx.notify2(ictx->header_oid, bl, 5000, NULL));
+  ASSERT_TRUE(wait_for_lock_updated(*ictx));
+
+  {
+    Mutex::Locker l(m_callback_lock);
+    m_notifies.clear();
+    m_notify_acks = {{NOTIFY_OP_ACQUIRED_LOCK, {}}};
+  }
+
+  {
+    RWLock::RLocker owner_lock(ictx->owner_lock);
+    ictx->image_watcher->request_lock();
+  }
+
+  ASSERT_TRUE(wait_for_lock_updated(*ictx));
+  ASSERT_TRUE(wait_for_notifies(*ictx));
+
+  {
+    RWLock::RLocker owner_locker(ictx->owner_lock);
+    ASSERT_TRUE(ictx->image_watcher->is_lock_owner());
   }
-  ASSERT_TRUE(wait_for_aio_completions(*ictx));
 }
 
-TEST_F(TestImageWatcher, RequestLockPostTryLockFailed) {
+TEST_F(TestImageWatcher, RequestLockTryLockFailed) {
   REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
 
   librbd::ImageCtx *ictx;
   ASSERT_EQ(0, open_image(m_image_name, &ictx));
   ASSERT_EQ(0, register_image_watch(*ictx));
-  ASSERT_EQ(0, lock_image(*ictx, LOCK_EXCLUSIVE,
-                          "auto " + stringify(m_watch_ctx->get_handle())));
+  ASSERT_EQ(0, lock_image(*ictx, LOCK_SHARED, "manually 1234"));
 
-  m_notify_acks = {{NOTIFY_OP_REQUEST_LOCK, create_response_message(0)}};
+  register_lock_listener(*ictx);
+  m_notify_acks = {{NOTIFY_OP_REQUEST_LOCK, {}}};
 
-  m_expected_aio_restarts = 1;
   {
-    RWLock::WLocker l(ictx->owner_lock);
-    ictx->image_watcher->request_lock(
-      boost::bind(&TestImageWatcher::handle_restart_aio, this, ictx, _1),
-      create_aio_completion(*ictx));
+    RWLock::RLocker owner_locker(ictx->owner_lock);
+    ictx->image_watcher->request_lock();
   }
 
   ASSERT_TRUE(wait_for_notifies(*ictx));
@@ -729,20 +823,21 @@ TEST_F(TestImageWatcher, RequestLockPostTryLockFailed) {
   expected_notify_ops += NOTIFY_OP_REQUEST_LOCK;
   ASSERT_EQ(expected_notify_ops, m_notifies);
 
-  ASSERT_EQ(0, unlock_image());
-  ASSERT_EQ(0, lock_image(*ictx, LOCK_SHARED, "manually 1234"));
-
-  m_notifies.clear();
-  m_notify_acks = {{NOTIFY_OP_RELEASED_LOCK, bufferlist()}};
+  // should resend when error encountered
+  {
+    Mutex::Locker l(m_callback_lock);
+    m_notifies.clear();
+  }
+  ASSERT_TRUE(wait_for_notifies(*ictx));
 
-  bufferlist bl;
   {
-    ENCODE_START(1, 1, bl);
-    ::encode(NOTIFY_OP_RELEASED_LOCK, bl);
-    ENCODE_FINISH(bl);
+    Mutex::Locker l(m_callback_lock);
+    ASSERT_EQ(0, unlock_image());
+    m_notifies.clear();
+    m_notify_acks = {{NOTIFY_OP_ACQUIRED_LOCK, {}}};
   }
-  ASSERT_EQ(0, m_ioctx.notify2(ictx->header_oid, bl, 5000, NULL));
-  ASSERT_TRUE(wait_for_aio_completions(*ictx));
+
+  ASSERT_TRUE(wait_for_notifies(*ictx));
 }
 
 TEST_F(TestImageWatcher, NotifyHeaderUpdate) {
@@ -906,6 +1001,46 @@ TEST_F(TestImageWatcher, NotifySnapCreateError) {
   ASSERT_EQ(expected_notify_ops, m_notifies);
 }
 
+TEST_F(TestImageWatcher, NotifySnapRename) {
+  REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
+
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  ASSERT_EQ(0, register_image_watch(*ictx));
+  ASSERT_EQ(0, lock_image(*ictx, LOCK_EXCLUSIVE,
+        "auto " + stringify(m_watch_ctx->get_handle())));
+
+  m_notify_acks = {{NOTIFY_OP_SNAP_RENAME, create_response_message(0)}};
+
+  RWLock::RLocker l(ictx->owner_lock);
+  ASSERT_EQ(0, ictx->image_watcher->notify_snap_rename(1, "snap-rename"));
+
+  NotifyOps expected_notify_ops;
+  expected_notify_ops += NOTIFY_OP_SNAP_RENAME;
+  ASSERT_EQ(expected_notify_ops, m_notifies);
+}
+
+TEST_F(TestImageWatcher, NotifySnapRenameError) {
+  REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
+
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  ASSERT_EQ(0, register_image_watch(*ictx));
+  ASSERT_EQ(0, lock_image(*ictx, LOCK_EXCLUSIVE,
+        "auto " + stringify(m_watch_ctx->get_handle())));
+
+  m_notify_acks = {{NOTIFY_OP_SNAP_RENAME, create_response_message(-EEXIST)}};
+
+  RWLock::RLocker l(ictx->owner_lock);
+  ASSERT_EQ(-EEXIST, ictx->image_watcher->notify_snap_rename(1, "snap-rename"));
+
+  NotifyOps expected_notify_ops;
+  expected_notify_ops += NOTIFY_OP_SNAP_RENAME;
+  ASSERT_EQ(expected_notify_ops, m_notifies);
+}
+
 TEST_F(TestImageWatcher, NotifySnapRemove) {
   REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
 
@@ -1020,3 +1155,48 @@ TEST_F(TestImageWatcher, NotifyAsyncRequestTimedOut) {
   ASSERT_TRUE(thread.timed_join(boost::posix_time::seconds(10)));
   ASSERT_EQ(-ERESTART, flatten_task.result);
 }
+
+TEST_F(TestImageWatcher, PeerRequestsLock) {
+  REQUIRE_FEATURE(RBD_FEATURE_EXCLUSIVE_LOCK);
+
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+  ASSERT_EQ(0, register_image_watch(*ictx));
+
+  register_lock_listener(*ictx);
+  m_notify_acks = {{NOTIFY_OP_ACQUIRED_LOCK, {}}};
+
+  {
+    RWLock::RLocker owner_locker(ictx->owner_lock);
+    ictx->image_watcher->request_lock();
+  }
+
+  ASSERT_TRUE(wait_for_notifies(*ictx));
+
+  {
+    RWLock::RLocker owner_locker(ictx->owner_lock);
+    ASSERT_TRUE(ictx->image_watcher->is_lock_owner());
+  }
+
+  // if journaling is enabled, ensure we wait for it to replay since
+  // it will block our peer request
+  std::string buffer(256, '1');
+  ictx->aio_work_queue->write(0, buffer.size(), buffer.c_str(), 0);
+
+  {
+    Mutex::Locker l(m_callback_lock);
+    m_notifies.clear();
+    m_notify_acks = {{NOTIFY_OP_RELEASED_LOCK, {}}};
+  }
+
+  bufferlist bl;
+  {
+    ENCODE_START(1, 1, bl);
+    ::encode(NOTIFY_OP_REQUEST_LOCK, bl);
+    ENCODE_FINISH(bl);
+  }
+  ASSERT_EQ(0, m_ioctx.notify2(ictx->header_oid, bl, 5000, NULL));
+
+  ASSERT_TRUE(wait_for_releasing_lock(*ictx));
+  ASSERT_TRUE(wait_for_notifies(*ictx));
+}
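
Throughout the rewritten tests, waiting on shared state follows one shape: hold a Mutex, loop on the predicate, and bound each Cond wait with WaitInterval so a lost notification fails the test after ten seconds instead of hanging the run. The pattern, extracted for clarity (illustrative sketch matching the helpers above):

    bool wait_for_flag(CephContext *cct, Mutex &lock, Cond &cond, bool &flag) {
      Mutex::Locker locker(lock);
      while (!flag) {
        // 10-second bound: a missed Signal() fails the wait instead of wedging
        if (cond.WaitInterval(cct, lock, utime_t(10, 0)) != 0) {
          return false;
        }
      }
      flag = false;  // consume the event for the next waiter
      return true;
    }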
diff --git a/src/test/librbd/test_JournalEntries.cc b/src/test/librbd/test_JournalEntries.cc
new file mode 100644
index 0000000..7dea547
--- /dev/null
+++ b/src/test/librbd/test_JournalEntries.cc
@@ -0,0 +1,217 @@
+// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "test/librbd/test_fixture.h"
+#include "test/librbd/test_support.h"
+#include "librbd/AioCompletion.h"
+#include "librbd/AioImageRequestWQ.h"
+#include "librbd/internal.h"
+#include "librbd/Journal.h"
+#include "librbd/JournalTypes.h"
+#include "journal/Journaler.h"
+#include "journal/ReplayEntry.h"
+#include "journal/ReplayHandler.h"
+#include <list>
+#include <boost/variant.hpp>
+
+void register_test_journal_entries() {
+}
+
+class TestJournalEntries : public TestFixture {
+public:
+  typedef std::list<journal::Journaler *> Journalers;
+
+  struct ReplayHandler : public journal::ReplayHandler {
+    Mutex lock;
+    Cond cond;
+    bool entries_available;
+    bool complete;
+
+    ReplayHandler()
+      : lock("ReplayHandler::lock"), entries_available(false), complete(false) {
+    }
+
+    virtual void get() {
+    }
+    virtual void put() {
+    }
+
+    virtual void handle_entries_available()  {
+      Mutex::Locker locker(lock);
+      entries_available = true;
+      cond.Signal();
+    }
+
+    virtual void handle_complete(int r) {
+      Mutex::Locker locker(lock);
+      complete = true;
+      cond.Signal();
+    }
+  };
+
+  ReplayHandler m_replay_handler;
+  Journalers m_journalers;
+
+  virtual void TearDown() {
+    for (Journalers::iterator it = m_journalers.begin();
+         it != m_journalers.end(); ++it) {
+      journal::Journaler *journaler = *it;
+      journaler->stop_replay();
+      delete journaler;
+    }
+
+    TestFixture::TearDown();
+  }
+
+  journal::Journaler *create_journaler(librbd::ImageCtx *ictx) {
+    journal::Journaler *journaler = new journal::Journaler(
+      ictx->md_ctx, ictx->id, "dummy client", 1);
+
+    int r = journaler->register_client("unit test client");
+    if (r < 0) {
+      ADD_FAILURE() << "failed to register journal client";
+      delete journaler;
+      return NULL;
+    }
+
+    C_SaferCond cond;
+    journaler->init(&cond);
+    r = cond.wait();
+    if (r < 0) {
+      ADD_FAILURE() << "failed to initialize journal client";
+      delete journaler;
+      return NULL;
+    }
+
+    journaler->start_live_replay(&m_replay_handler, 0.1);
+    m_journalers.push_back(journaler);
+    return journaler;
+  }
+
+  bool wait_for_entries_available(librbd::ImageCtx *ictx) {
+    Mutex::Locker locker(m_replay_handler.lock);
+    while (!m_replay_handler.entries_available) {
+      if (m_replay_handler.cond.WaitInterval(ictx->cct, m_replay_handler.lock,
+                                             utime_t(10, 0)) != 0) {
+        return false;
+      }
+    }
+    m_replay_handler.entries_available = false;
+    return true;
+  }
+
+  bool get_event_entry(const journal::ReplayEntry &replay_entry,
+                       librbd::journal::EventEntry *event_entry) {
+    try {
+      bufferlist data_bl = replay_entry.get_data();
+      bufferlist::iterator it = data_bl.begin();
+      ::decode(*event_entry, it);
+    } catch (const buffer::error &err) {
+      return false;
+    }
+    return true;
+  }
+
+};
+
+TEST_F(TestJournalEntries, AioWrite) {
+  REQUIRE_FEATURE(RBD_FEATURE_JOURNALING);
+
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  journal::Journaler *journaler = create_journaler(ictx);
+  ASSERT_TRUE(journaler != NULL);
+
+  std::string buffer(512, '1');
+  C_SaferCond cond_ctx;
+  librbd::AioCompletion *c =
+    librbd::aio_create_completion_internal(&cond_ctx, librbd::rbd_ctx_cb);
+  c->get();
+  ictx->aio_work_queue->aio_write(c, 123, buffer.size(), buffer.c_str(), 0);
+  ASSERT_EQ(0, c->wait_for_complete());
+  c->put();
+
+  ASSERT_TRUE(wait_for_entries_available(ictx));
+
+  journal::ReplayEntry replay_entry;
+  ASSERT_TRUE(journaler->try_pop_front(&replay_entry));
+
+  librbd::journal::EventEntry event_entry;
+  ASSERT_TRUE(get_event_entry(replay_entry, &event_entry));
+
+  ASSERT_EQ(librbd::journal::EVENT_TYPE_AIO_WRITE,
+            event_entry.get_event_type());
+
+  librbd::journal::AioWriteEvent aio_write_event =
+    boost::get<librbd::journal::AioWriteEvent>(event_entry.event);
+  ASSERT_EQ(123U, aio_write_event.offset);
+  ASSERT_EQ(buffer.size(), aio_write_event.length);
+
+  bufferlist buffer_bl;
+  buffer_bl.append(buffer);
+  ASSERT_TRUE(aio_write_event.data.contents_equal(buffer_bl));
+}
+
+TEST_F(TestJournalEntries, AioDiscard) {
+  REQUIRE_FEATURE(RBD_FEATURE_JOURNALING);
+
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  journal::Journaler *journaler = create_journaler(ictx);
+  ASSERT_TRUE(journaler != NULL);
+
+  C_SaferCond cond_ctx;
+  librbd::AioCompletion *c =
+    librbd::aio_create_completion_internal(&cond_ctx, librbd::rbd_ctx_cb);
+  c->get();
+  ictx->aio_work_queue->aio_discard(c, 123, 234);
+  ASSERT_EQ(0, c->wait_for_complete());
+  c->put();
+
+  ASSERT_TRUE(wait_for_entries_available(ictx));
+
+  journal::ReplayEntry replay_entry;
+  ASSERT_TRUE(journaler->try_pop_front(&replay_entry));
+
+  librbd::journal::EventEntry event_entry;
+  ASSERT_TRUE(get_event_entry(replay_entry, &event_entry));
+
+  ASSERT_EQ(librbd::journal::EVENT_TYPE_AIO_DISCARD,
+            event_entry.get_event_type());
+
+  librbd::journal::AioDiscardEvent aio_discard_event =
+    boost::get<librbd::journal::AioDiscardEvent>(event_entry.event);
+  ASSERT_EQ(123U, aio_discard_event.offset);
+  ASSERT_EQ(234U, aio_discard_event.length);
+}
+
+TEST_F(TestJournalEntries, AioFlush) {
+  REQUIRE_FEATURE(RBD_FEATURE_JOURNALING);
+
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+
+  journal::Journaler *journaler = create_journaler(ictx);
+  ASSERT_TRUE(journaler != NULL);
+
+  C_SaferCond cond_ctx;
+  librbd::AioCompletion *c =
+    librbd::aio_create_completion_internal(&cond_ctx, librbd::rbd_ctx_cb);
+  c->get();
+  ictx->aio_work_queue->aio_flush(c);
+  ASSERT_EQ(0, c->wait_for_complete());
+  c->put();
+
+  ASSERT_TRUE(wait_for_entries_available(ictx));
+
+  journal::ReplayEntry replay_entry;
+  ASSERT_TRUE(journaler->try_pop_front(&replay_entry));
+
+  librbd::journal::EventEntry event_entry;
+  ASSERT_TRUE(get_event_entry(replay_entry, &event_entry));
+
+  ASSERT_EQ(librbd::journal::EVENT_TYPE_AIO_FLUSH,
+            event_entry.get_event_type());
+}
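
get_event_entry() above is the standard bufferlist decode idiom: iterate over the payload and let ::decode() throw buffer::error on truncated or malformed input. A generalized sketch for any decodable type T (assumption: T provides the usual Ceph encode/decode pair):

    template <typename T>
    bool try_decode(bufferlist &bl, T *out) {
      try {
        bufferlist::iterator it = bl.begin();
        ::decode(*out, it);              // throws buffer::error on bad input
      } catch (const buffer::error &) {
        return false;
      }
      return true;
    }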
diff --git a/src/test/librbd/test_JournalReplay.cc b/src/test/librbd/test_JournalReplay.cc
new file mode 100644
index 0000000..c6d6347
--- /dev/null
+++ b/src/test/librbd/test_JournalReplay.cc
@@ -0,0 +1,209 @@
+// -*- mode:C; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "test/librbd/test_fixture.h"
+#include "test/librbd/test_support.h"
+#include "librbd/AioCompletion.h"
+#include "librbd/AioImageRequest.h"
+#include "librbd/AioImageRequestWQ.h"
+#include "librbd/ImageCtx.h"
+#include "librbd/ImageWatcher.h"
+#include "librbd/Journal.h"
+#include "librbd/JournalTypes.h"
+
+void register_test_journal_replay() {
+}
+
+class TestJournalReplay : public TestFixture {
+public:
+
+  struct Listener : public librbd::ImageWatcher::Listener {
+    Mutex lock;
+    Cond cond;
+
+    Listener() : lock("TestJournalReplay::Listener::lock") {
+    }
+    virtual bool handle_requested_lock() {
+      return true;
+    }
+    virtual void handle_releasing_lock() {
+    }
+    virtual void handle_lock_updated(
+        librbd::ImageWatcher::LockUpdateState state) {
+      Mutex::Locker locker(lock);
+      cond.Signal();
+    }
+  };
+
+  void wait_for_lock_owner(librbd::ImageCtx *ictx) {
+    Listener listener;
+    ictx->image_watcher->register_listener(&listener);
+    {
+      Mutex::Locker listener_locker(listener.lock);
+      RWLock::RLocker owner_locker(ictx->owner_lock);
+      while (!ictx->image_watcher->is_lock_owner()) {
+        ictx->owner_lock.put_read();
+        listener.cond.Wait(listener.lock);
+        ictx->owner_lock.get_read();
+      }
+    }
+    ictx->image_watcher->unregister_listener(&listener);
+  }
+};
+
+TEST_F(TestJournalReplay, AioDiscardEvent) {
+  REQUIRE_FEATURE(RBD_FEATURE_JOURNALING);
+
+  // write to the image w/o using the journal
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+  ictx->features &= ~RBD_FEATURE_JOURNALING;
+  ASSERT_EQ(0, ictx->close_journal(true));
+
+  std::string payload(4096, '1');
+  librbd::AioCompletion *aio_comp = new librbd::AioCompletion();
+  ictx->aio_work_queue->aio_write(aio_comp, 0, payload.size(), payload.c_str(),
+                                  0);
+  ASSERT_EQ(0, aio_comp->wait_for_complete());
+  aio_comp->release();
+
+  aio_comp = new librbd::AioCompletion();
+  ictx->aio_work_queue->aio_flush(aio_comp);
+  ASSERT_EQ(0, aio_comp->wait_for_complete());
+  aio_comp->release();
+
+  std::string read_payload(4096, '\0');
+  aio_comp = new librbd::AioCompletion();
+  ictx->aio_work_queue->aio_read(aio_comp, 0, read_payload.size(),
+                                 &read_payload[0], NULL, 0);
+  ASSERT_EQ(0, aio_comp->wait_for_complete());
+  aio_comp->release();
+  ASSERT_EQ(payload, read_payload);
+  close_image(ictx);
+
+  // inject a discard operation into the journal
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+  {
+    RWLock::WLocker owner_locker(ictx->owner_lock);
+    ictx->image_watcher->request_lock();
+  }
+  wait_for_lock_owner(ictx);
+
+  ictx->journal->open();
+  ASSERT_TRUE(ictx->journal->wait_for_journal_ready());
+
+  librbd::journal::EventEntry event_entry(
+    librbd::journal::AioDiscardEvent(0, payload.size()));
+  librbd::Journal::AioObjectRequests requests;
+  {
+    RWLock::RLocker owner_locker(ictx->owner_lock);
+    ictx->journal->append_event(NULL, event_entry, requests, 0, 0, true);
+  }
+  ASSERT_EQ(0, ictx->journal->close());
+
+  // re-open the journal so that it replays the new entry
+  ictx->journal->open();
+  ASSERT_TRUE(ictx->journal->wait_for_journal_ready());
+
+  aio_comp = new librbd::AioCompletion();
+  ictx->aio_work_queue->aio_read(aio_comp, 0, read_payload.size(),
+                                 &read_payload[0], NULL, 0);
+  ASSERT_EQ(0, aio_comp->wait_for_complete());
+  aio_comp->release();
+  ASSERT_EQ(std::string(read_payload.size(), '\0'), read_payload);
+}
+
+TEST_F(TestJournalReplay, AioWriteEvent) {
+  REQUIRE_FEATURE(RBD_FEATURE_JOURNALING);
+
+  // inject a write operation into the journal
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+  {
+    RWLock::WLocker owner_locker(ictx->owner_lock);
+    ictx->image_watcher->request_lock();
+  }
+  wait_for_lock_owner(ictx);
+
+  ictx->journal->open();
+  ASSERT_TRUE(ictx->journal->wait_for_journal_ready());
+
+  std::string payload(4096, '1');
+  bufferlist payload_bl;
+  payload_bl.append(payload);
+  librbd::journal::EventEntry event_entry(
+    librbd::journal::AioWriteEvent(0, payload.size(), payload_bl));
+  librbd::Journal::AioObjectRequests requests;
+  {
+    RWLock::RLocker owner_locker(ictx->owner_lock);
+    ictx->journal->append_event(NULL, event_entry, requests, 0, 0, true);
+  }
+  ASSERT_EQ(0, ictx->journal->close());
+
+  // re-open the journal so that it replays the new entry
+  ictx->journal->open();
+  ASSERT_TRUE(ictx->journal->wait_for_journal_ready());
+
+  std::string read_payload(4096, '\0');
+  librbd::AioCompletion *aio_comp = new librbd::AioCompletion();
+  ictx->aio_work_queue->aio_read(aio_comp, 0, read_payload.size(),
+                                 &read_payload[0], NULL, 0);
+  ASSERT_EQ(0, aio_comp->wait_for_complete());
+  aio_comp->release();
+  ASSERT_EQ(payload, read_payload);
+}
+
+TEST_F(TestJournalReplay, AioFlushEvent) {
+  REQUIRE_FEATURE(RBD_FEATURE_JOURNALING);
+
+  // inject a flush operation into the journal
+  librbd::ImageCtx *ictx;
+  ASSERT_EQ(0, open_image(m_image_name, &ictx));
+  {
+    RWLock::WLocker owner_locker(ictx->owner_lock);
+    ictx->image_watcher->request_lock();
+  }
+  wait_for_lock_owner(ictx);
+
+  ictx->journal->open();
+  ASSERT_TRUE(ictx->journal->wait_for_journal_ready());
+
+  librbd::journal::AioFlushEvent aio_flush_event;
+  librbd::journal::EventEntry event_entry(aio_flush_event);
+  librbd::Journal::AioObjectRequests requests;
+  {
+    RWLock::RLocker owner_locker(ictx->owner_lock);
+    ictx->journal->append_event(NULL, event_entry, requests, 0, 0, true);
+  }
+  ASSERT_EQ(0, ictx->journal->close());
+
+  // start an AIO write op
+  librbd::Journal *journal = ictx->journal;
+  ictx->journal = NULL;
+
+  std::string payload(m_image_size, '1');
+  librbd::AioCompletion *aio_comp = new librbd::AioCompletion();
+  {
+    RWLock::RLocker owner_lock(ictx->owner_lock);
+    librbd::AioImageRequest::aio_write(ictx, aio_comp, 0, payload.size(),
+                                       payload.c_str(), 0);
+  }
+  ictx->journal = journal;
+
+  // re-open the journal so that it replays the new entry
+  ictx->journal->open();
+  ASSERT_TRUE(ictx->journal->wait_for_journal_ready());
+
+  ASSERT_TRUE(aio_comp->is_complete());
+  ASSERT_EQ(0, aio_comp->wait_for_complete());
+  aio_comp->release();
+
+  std::string read_payload(m_image_size, '\0');
+  aio_comp = new librbd::AioCompletion();
+  ictx->aio_work_queue->aio_read(aio_comp, 0, read_payload.size(),
+                                 &read_payload[0], NULL, 0);
+  ASSERT_EQ(0, aio_comp->wait_for_complete());
+  aio_comp->release();
+  ASSERT_EQ(payload, read_payload);
+}
+
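Each replay test drives the same inject-and-replay cycle: acquire the exclusive lock, open the journal, append a synthetic event, close to flush it, then re-open so the entry is replayed against the image. Condensed from the calls above (error handling elided):

    librbd::journal::EventEntry entry(librbd::journal::AioFlushEvent());
    librbd::Journal::AioObjectRequests no_requests;
    {
      RWLock::RLocker owner_locker(ictx->owner_lock);
      ictx->journal->append_event(NULL, entry, no_requests, 0, 0, true);
    }
    ictx->journal->close();                    // flush the appended entry
    ictx->journal->open();                     // re-opening triggers replay
    ictx->journal->wait_for_journal_ready();   // blocks until replay finishes
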
diff --git a/src/test/librbd/test_internal.cc b/src/test/librbd/test_internal.cc
index 805a86c..e35e90f 100644
--- a/src/test/librbd/test_internal.cc
+++ b/src/test/librbd/test_internal.cc
@@ -3,6 +3,8 @@
 #include "test/librbd/test_fixture.h"
 #include "test/librbd/test_support.h"
 #include "librbd/AioCompletion.h"
+#include "librbd/AioImageRequest.h"
+#include "librbd/AioImageRequestWQ.h"
 #include "librbd/ImageWatcher.h"
 #include "librbd/internal.h"
 #include "librbd/ObjectMap.h"
@@ -254,7 +256,7 @@ TEST_F(TestInternal, AioWriteRequestsLock) {
   librbd::AioCompletion *c =
     librbd::aio_create_completion_internal(ctx, librbd::rbd_ctx_cb);
   c->get();
-  aio_write(ictx, 0, buffer.size(), buffer.c_str(), c, 0);
+  ictx->aio_work_queue->aio_write(c, 0, buffer.size(), buffer.c_str(), 0);
 
   bool is_owner;
   ASSERT_EQ(0, librbd::is_exclusive_lock_owner(ictx, &is_owner));
@@ -277,7 +279,7 @@ TEST_F(TestInternal, AioDiscardRequestsLock) {
   librbd::AioCompletion *c =
     librbd::aio_create_completion_internal(ctx, librbd::rbd_ctx_cb);
   c->get();
-  aio_discard(ictx, 0, 256, c);
+  ictx->aio_work_queue->aio_discard(c, 0, 256);
 
   bool is_owner;
   ASSERT_EQ(0, librbd::is_exclusive_lock_owner(ictx, &is_owner));
@@ -465,7 +467,7 @@ TEST_F(TestInternal, SnapshotCopyup)
 
   bufferlist bl;
   bl.append(std::string(256, '1'));
-  ASSERT_EQ(256, librbd::write(ictx, 0, bl.length(), bl.c_str(), 0));
+  ASSERT_EQ(256, ictx->aio_work_queue->write(0, bl.length(), bl.c_str(), 0));
 
   ASSERT_EQ(0, librbd::snap_create(ictx, "snap1"));
   ASSERT_EQ(0, librbd::snap_protect(ictx, "snap1"));
@@ -484,7 +486,7 @@ TEST_F(TestInternal, SnapshotCopyup)
   ASSERT_EQ(0, librbd::snap_create(ictx2, "snap1"));
   ASSERT_EQ(0, librbd::snap_create(ictx2, "snap2"));
 
-  ASSERT_EQ(256, librbd::write(ictx2, 256, bl.length(), bl.c_str(), 0));
+  ASSERT_EQ(256, ictx2->aio_work_queue->write(256, bl.length(), bl.c_str(), 0));
 
   librados::IoCtx snap_ctx;
   snap_ctx.dup(m_ioctx);
@@ -514,10 +516,10 @@ TEST_F(TestInternal, SnapshotCopyup)
     const char *snap_name = it->empty() ? NULL : it->c_str();
     ASSERT_EQ(0, librbd::snap_set(ictx2, snap_name));
 
-    ASSERT_EQ(256, librbd::read(ictx2, 0, 256, read_bl.c_str(), 0));
+    ASSERT_EQ(256, ictx2->aio_work_queue->read(0, 256, read_bl.c_str(), 0));
     ASSERT_TRUE(bl.contents_equal(read_bl));
 
-    ASSERT_EQ(256, librbd::read(ictx2, 256, 256, read_bl.c_str(), 0));
+    ASSERT_EQ(256, ictx2->aio_work_queue->read(256, 256, read_bl.c_str(), 0));
     if (snap_name == NULL) {
       ASSERT_TRUE(bl.contents_equal(read_bl));
     } else {
@@ -556,7 +558,8 @@ TEST_F(TestInternal, ResizeCopyup)
   bufferlist bl;
   bl.append(std::string(4096, '1'));
   for (size_t i = 0; i < m_image_size; i += bl.length()) {
-    ASSERT_EQ(bl.length(), librbd::write(ictx, i, bl.length(), bl.c_str(), 0));
+    ASSERT_EQ(bl.length(), ictx->aio_work_queue->write(i, bl.length(),
+                                                       bl.c_str(), 0));
   }
 
   ASSERT_EQ(0, librbd::snap_create(ictx, "snap1"));
@@ -587,8 +590,8 @@ TEST_F(TestInternal, ResizeCopyup)
   }
 
   for (size_t i = 2 << order; i < m_image_size; i += bl.length()) {
-    ASSERT_EQ(bl.length(), librbd::read(ictx2, i, bl.length(), read_bl.c_str(),
-                                        0));
+    ASSERT_EQ(bl.length(), ictx2->aio_work_queue->read(i, bl.length(),
+                                                       read_bl.c_str(), 0));
     ASSERT_TRUE(bl.contents_equal(read_bl));
   }
 }
@@ -612,7 +615,8 @@ TEST_F(TestInternal, DiscardCopyup)
   bufferlist bl;
   bl.append(std::string(4096, '1'));
   for (size_t i = 0; i < m_image_size; i += bl.length()) {
-    ASSERT_EQ(bl.length(), librbd::write(ictx, i, bl.length(), bl.c_str(), 0));
+    ASSERT_EQ(bl.length(), ictx->aio_work_queue->write(i, bl.length(),
+                                                       bl.c_str(), 0));
   }
 
   ASSERT_EQ(0, librbd::snap_create(ictx, "snap1"));
@@ -632,7 +636,7 @@ TEST_F(TestInternal, DiscardCopyup)
   read_bl.push_back(read_ptr);
 
   ASSERT_EQ(static_cast<int>(m_image_size - 64),
-            librbd::discard(ictx2, 32, m_image_size - 64));
+            ictx2->aio_work_queue->discard(32, m_image_size - 64));
   ASSERT_EQ(0, librbd::snap_set(ictx2, "snap1"));
 
   {
@@ -642,8 +646,8 @@ TEST_F(TestInternal, DiscardCopyup)
   }
 
   for (size_t i = 0; i < m_image_size; i += bl.length()) {
-    ASSERT_EQ(bl.length(), librbd::read(ictx2, i, bl.length(), read_bl.c_str(),
-                                        0));
+    ASSERT_EQ(bl.length(), ictx2->aio_work_queue->read(i, bl.length(),
+                                                       read_bl.c_str(), 0));
     ASSERT_TRUE(bl.contents_equal(read_bl));
   }
 }
@@ -652,17 +656,16 @@ TEST_F(TestInternal, ShrinkFlushesCache) {
   librbd::ImageCtx *ictx;
   ASSERT_EQ(0, open_image(m_image_name, &ictx));
 
-  {
-    RWLock::WLocker owner_locker(ictx->owner_lock);
-    ASSERT_EQ(0, ictx->image_watcher->try_lock());
-  }
-
   std::string buffer(4096, '1');
+
+  // ensure write-path is initialized
+  ictx->aio_work_queue->write(0, buffer.size(), buffer.c_str(), 0);
+
   C_SaferCond cond_ctx;
   librbd::AioCompletion *c =
     librbd::aio_create_completion_internal(&cond_ctx, librbd::rbd_ctx_cb);
   c->get();
-  aio_write(ictx, 0, buffer.size(), buffer.c_str(), c, 0);
+  ictx->aio_work_queue->aio_write(c, 0, buffer.size(), buffer.c_str(), 0);
 
   librbd::NoOpProgressContext no_op;
   ASSERT_EQ(0, librbd::resize(ictx, m_image_size >> 1, no_op));
@@ -672,3 +675,70 @@ TEST_F(TestInternal, ShrinkFlushesCache) {
   ASSERT_EQ(0, cond_ctx.wait());
   c->put();
 }
+
+TEST_F(TestInternal, ImageOptions) {
+  rbd_image_options_t opts1 = NULL, opts2 = NULL;
+  uint64_t uint64_val1 = 10, uint64_val2 = 0;
+  std::string string_val1;
+
+  librbd::image_options_create(&opts1);
+  ASSERT_NE((rbd_image_options_t)NULL, opts1);
+  ASSERT_TRUE(librbd::image_options_is_empty(opts1));
+
+  ASSERT_EQ(-EINVAL, librbd::image_options_get(opts1, RBD_IMAGE_OPTION_FEATURES,
+	  &string_val1));
+  ASSERT_EQ(-ENOENT, librbd::image_options_get(opts1, RBD_IMAGE_OPTION_FEATURES,
+	  &uint64_val1));
+
+  ASSERT_EQ(-EINVAL, librbd::image_options_set(opts1, RBD_IMAGE_OPTION_FEATURES,
+	  string_val1));
+
+  ASSERT_EQ(0, librbd::image_options_set(opts1, RBD_IMAGE_OPTION_FEATURES,
+	  uint64_val1));
+  ASSERT_FALSE(librbd::image_options_is_empty(opts1));
+  ASSERT_EQ(0, librbd::image_options_get(opts1, RBD_IMAGE_OPTION_FEATURES,
+	  &uint64_val2));
+  ASSERT_EQ(uint64_val1, uint64_val2);
+
+  librbd::image_options_create_ref(&opts2, opts1);
+  ASSERT_NE((rbd_image_options_t)NULL, opts2);
+  ASSERT_FALSE(librbd::image_options_is_empty(opts2));
+
+  uint64_val2 = 0;
+  ASSERT_NE(uint64_val1, uint64_val2);
+  ASSERT_EQ(0, librbd::image_options_get(opts2, RBD_IMAGE_OPTION_FEATURES,
+	  &uint64_val2));
+  ASSERT_EQ(uint64_val1, uint64_val2);
+
+  uint64_val2++;
+  ASSERT_NE(uint64_val1, uint64_val2);
+  ASSERT_EQ(-ENOENT, librbd::image_options_get(opts1, RBD_IMAGE_OPTION_ORDER,
+	  &uint64_val1));
+  ASSERT_EQ(-ENOENT, librbd::image_options_get(opts2, RBD_IMAGE_OPTION_ORDER,
+	  &uint64_val2));
+  ASSERT_EQ(0, librbd::image_options_set(opts2, RBD_IMAGE_OPTION_ORDER,
+	  uint64_val2));
+  ASSERT_EQ(0, librbd::image_options_get(opts1, RBD_IMAGE_OPTION_ORDER,
+	  &uint64_val1));
+  ASSERT_EQ(0, librbd::image_options_get(opts2, RBD_IMAGE_OPTION_ORDER,
+	  &uint64_val2));
+  ASSERT_EQ(uint64_val1, uint64_val2);
+
+  librbd::image_options_destroy(opts1);
+
+  uint64_val2++;
+  ASSERT_NE(uint64_val1, uint64_val2);
+  ASSERT_EQ(0, librbd::image_options_get(opts2, RBD_IMAGE_OPTION_ORDER,
+	  &uint64_val2));
+  ASSERT_EQ(uint64_val1, uint64_val2);
+
+  ASSERT_EQ(0, librbd::image_options_unset(opts2, RBD_IMAGE_OPTION_ORDER));
+  ASSERT_EQ(-ENOENT, librbd::image_options_unset(opts2, RBD_IMAGE_OPTION_ORDER));
+
+  librbd::image_options_clear(opts2);
+  ASSERT_EQ(-ENOENT, librbd::image_options_get(opts2, RBD_IMAGE_OPTION_FEATURES,
+	  &uint64_val2));
+  ASSERT_TRUE(librbd::image_options_is_empty(opts2));
+
+  librbd::image_options_destroy(opts2);
+}
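
The mechanical change through test_internal.cc is routing I/O through the per-image AioImageRequestWQ rather than the old librbd free functions. The mapping, as exercised above:

    // before                                  after
    librbd::write(ictx, off, len, buf, 0);     ictx->aio_work_queue->write(off, len, buf, 0);
    librbd::read(ictx, off, len, buf, 0);      ictx->aio_work_queue->read(off, len, buf, 0);
    librbd::discard(ictx, off, len);           ictx->aio_work_queue->discard(off, len);
    aio_write(ictx, off, len, buf, c, 0);      ictx->aio_work_queue->aio_write(c, off, len, buf, 0);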
diff --git a/src/test/librbd/test_librbd.cc b/src/test/librbd/test_librbd.cc
index a6bf251..70d8391 100644
--- a/src/test/librbd/test_librbd.cc
+++ b/src/test/librbd/test_librbd.cc
@@ -667,6 +667,47 @@ TEST_F(TestLibRBD, TestCreateLsDeleteSnapPP)
   ioctx.close();
 }
 
+TEST_F(TestLibRBD, TestCreateLsRenameSnapPP)
+{
+  librados::IoCtx ioctx;
+  ASSERT_EQ(0, _rados.ioctx_create(m_pool_name.c_str(), ioctx));
+
+  {
+    librbd::RBD rbd;
+    librbd::Image image;
+    int order = 0;
+    std::string name = get_temp_image_name();
+    uint64_t size = 2 << 20;
+    uint64_t size2 = 4 << 20;
+
+    ASSERT_EQ(0, create_image_pp(rbd, ioctx, name.c_str(), size, &order));
+    ASSERT_EQ(0, rbd.open(ioctx, image, name.c_str(), NULL));
+
+    ASSERT_FALSE(image.snap_exists("snap1"));
+    ASSERT_EQ(0, image.snap_create("snap1"));
+    ASSERT_TRUE(image.snap_exists("snap1"));
+    ASSERT_EQ(1, test_ls_snaps(image, 1, "snap1", size));
+    ASSERT_EQ(0, image.resize(size2));
+    ASSERT_FALSE(image.snap_exists("snap2"));
+    ASSERT_EQ(0, image.snap_create("snap2"));
+    ASSERT_TRUE(image.snap_exists("snap2"));
+    ASSERT_EQ(2, test_ls_snaps(image, 2, "snap1", size, "snap2", size2));
+    ASSERT_EQ(0, image.snap_rename("snap1","snap1-rename"));
+    ASSERT_EQ(2, test_ls_snaps(image, 2, "snap1-rename", size, "snap2", size2));
+    ASSERT_FALSE(image.snap_exists("snap1"));
+    ASSERT_TRUE(image.snap_exists("snap1-rename"));
+    ASSERT_EQ(0, image.snap_remove("snap1-rename"));
+    ASSERT_EQ(0, image.snap_rename("snap2","snap2-rename"));
+    ASSERT_EQ(1, test_ls_snaps(image, 1, "snap2-rename", size2));
+    ASSERT_FALSE(image.snap_exists("snap2"));
+    ASSERT_TRUE(image.snap_exists("snap2-rename"));
+    ASSERT_EQ(0, image.snap_remove("snap2-rename"));
+    ASSERT_EQ(0, test_ls_snaps(image, 0));
+  }
+
+  ioctx.close();
+}
+
 
 
 #define TEST_IO_SIZE 512
@@ -2975,17 +3016,20 @@ TEST_F(TestLibRBD, UpdateFeatures)
   ASSERT_EQ(-EINVAL, image.update_features(0, true));
 
   ASSERT_EQ(0, image.update_features(RBD_FEATURE_EXCLUSIVE_LOCK |
-                                       RBD_FEATURE_OBJECT_MAP |
-                                       RBD_FEATURE_FAST_DIFF, false));
+                                     RBD_FEATURE_OBJECT_MAP |
+                                     RBD_FEATURE_FAST_DIFF |
+                                     RBD_FEATURE_JOURNALING, false));
 
-  // cannot enable object map w/o exclusive lock
+  // cannot enable object map or journaling w/o exclusive lock
   ASSERT_EQ(-EINVAL, image.update_features(RBD_FEATURE_OBJECT_MAP, true));
+  ASSERT_EQ(-EINVAL, image.update_features(RBD_FEATURE_JOURNALING, true));
   ASSERT_EQ(0, image.update_features(RBD_FEATURE_EXCLUSIVE_LOCK, true));
 
   // cannot enable fast diff w/o object map
   ASSERT_EQ(-EINVAL, image.update_features(RBD_FEATURE_FAST_DIFF, true));
   ASSERT_EQ(0, image.update_features(RBD_FEATURE_OBJECT_MAP |
-                                       RBD_FEATURE_FAST_DIFF, true));
+                                     RBD_FEATURE_FAST_DIFF |
+                                     RBD_FEATURE_JOURNALING, true));
 
   uint64_t expected_flags = RBD_FLAG_OBJECT_MAP_INVALID |
                             RBD_FLAG_FAST_DIFF_INVALID;
@@ -3005,6 +3049,10 @@ TEST_F(TestLibRBD, UpdateFeatures)
   ASSERT_EQ(-EINVAL, image.update_features(RBD_FEATURE_EXCLUSIVE_LOCK, false));
   ASSERT_EQ(0, image.update_features(RBD_FEATURE_OBJECT_MAP, false));
 
+  // cannot disable exclusive lock w/ journaling
+  ASSERT_EQ(-EINVAL, image.update_features(RBD_FEATURE_EXCLUSIVE_LOCK, false));
+  ASSERT_EQ(0, image.update_features(RBD_FEATURE_JOURNALING, false));
+
   ASSERT_EQ(0, image.get_flags(&flags));
   ASSERT_EQ(0U, flags);
 
@@ -3116,14 +3164,21 @@ TEST_F(TestLibRBD, BlockingAIO)
   int order = 18;
   ASSERT_EQ(0, create_image_pp(rbd, ioctx, name.c_str(), size, &order));
 
+  std::string non_blocking_aio;
+  ASSERT_EQ(0, _rados.conf_get("rbd_non_blocking_aio", non_blocking_aio));
   ASSERT_EQ(0, _rados.conf_set("rbd_non_blocking_aio", "0"));
+  BOOST_SCOPE_EXIT( (non_blocking_aio) ) {
+    ASSERT_EQ(0, _rados.conf_set("rbd_non_blocking_aio",
+                                 non_blocking_aio.c_str()));
+  } BOOST_SCOPE_EXIT_END;
 
   librbd::Image image;
   ASSERT_EQ(0, rbd.open(ioctx, image, name.c_str(), NULL));
 
   bufferlist bl;
-  bl.append(std::string(256, '1'));
+  ASSERT_EQ(0, image.write(0, bl.length(), bl));
 
+  bl.append(std::string(256, '1'));
   librbd::RBD::AioCompletion *write_comp =
     new librbd::RBD::AioCompletion(NULL, NULL);
   ASSERT_EQ(0, image.aio_write(0, bl.length(), bl, write_comp));
@@ -3265,3 +3320,158 @@ TEST_F(TestLibRBD, CacheMayCopyOnWrite) {
   ASSERT_EQ(1024, clone.read(offset + 2048, 1024, read_bl));
   ASSERT_TRUE(expect_bl.contents_equal(read_bl));
 }
+
+TEST_F(TestLibRBD, FlushEmptyOpsOnExternalSnapshot) {
+  std::string cache_enabled;
+  ASSERT_EQ(0, _rados.conf_get("rbd_cache", cache_enabled));
+  ASSERT_EQ(0, _rados.conf_set("rbd_cache", "false"));
+  BOOST_SCOPE_EXIT( (cache_enabled) ) {
+    ASSERT_EQ(0, _rados.conf_set("rbd_cache", cache_enabled.c_str()));
+  } BOOST_SCOPE_EXIT_END;
+
+  librados::IoCtx ioctx;
+  ASSERT_EQ(0, _rados.ioctx_create(m_pool_name.c_str(), ioctx));
+
+  librbd::RBD rbd;
+  std::string name = get_temp_image_name();
+  uint64_t size = 1 << 18;
+  int order = 0;
+  ASSERT_EQ(0, create_image_pp(rbd, ioctx, name.c_str(), size, &order));
+
+  librbd::Image image1;
+  librbd::Image image2;
+  ASSERT_EQ(0, rbd.open(ioctx, image1, name.c_str(), NULL));
+  ASSERT_EQ(0, rbd.open(ioctx, image2, name.c_str(), NULL));
+  ASSERT_EQ(0, image1.snap_create("snap1"));
+
+  librbd::RBD::AioCompletion *read_comp =
+    new librbd::RBD::AioCompletion(NULL, NULL);
+  bufferlist read_bl;
+  image2.aio_read(0, 1024, read_bl, read_comp);
+  ASSERT_EQ(0, read_comp->wait_for_complete());
+  read_comp->release();
+}
+
+TEST_F(TestLibRBD, TestImageOptions)
+{
+  rados_ioctx_t ioctx;
+  rados_ioctx_create(_cluster, m_pool_name.c_str(), &ioctx);
+
+  // make create image options
+  uint64_t features = RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2;
+  uint64_t order = 0;
+  uint64_t stripe_unit = 65536;
+  uint64_t stripe_count = 16;
+  rbd_image_options_t opts;
+  rbd_image_options_create(&opts);
+  ASSERT_EQ(0, rbd_image_options_set_uint64(opts, RBD_IMAGE_OPTION_FORMAT,
+	  2));
+  ASSERT_EQ(0, rbd_image_options_set_uint64(opts, RBD_IMAGE_OPTION_FEATURES,
+	  features));
+  ASSERT_EQ(0, rbd_image_options_set_uint64(opts, RBD_IMAGE_OPTION_ORDER,
+	  order));
+  ASSERT_EQ(0, rbd_image_options_set_uint64(opts, RBD_IMAGE_OPTION_STRIPE_UNIT,
+	  stripe_unit));
+  ASSERT_EQ(0, rbd_image_options_set_uint64(opts, RBD_IMAGE_OPTION_STRIPE_COUNT,
+	  stripe_count));
+
+  std::string parent_name = get_temp_image_name();
+
+  // make parent
+  ASSERT_EQ(0, rbd_create4(ioctx, parent_name.c_str(), 4<<20, opts));
+
+  // check order is returned in opts
+  ASSERT_EQ(0, rbd_image_options_get_uint64(opts, RBD_IMAGE_OPTION_ORDER,
+	  &order));
+  ASSERT_NE((uint64_t)0, order);
+
+  // write some data to parent
+  rbd_image_t parent;
+  ASSERT_EQ(0, rbd_open(ioctx, parent_name.c_str(), &parent, NULL));
+  char *data = (char *)"testdata";
+  ASSERT_EQ((ssize_t)strlen(data), rbd_write(parent, 0, strlen(data), data));
+  ASSERT_EQ((ssize_t)strlen(data), rbd_write(parent, 12, strlen(data), data));
+
+  // create a snapshot, reopen as the parent we're interested in
+  ASSERT_EQ(0, rbd_snap_create(parent, "parent_snap"));
+  ASSERT_EQ(0, rbd_close(parent));
+  ASSERT_EQ(0, rbd_open(ioctx, parent_name.c_str(), &parent, "parent_snap"));
+
+  // clone
+  std::string child_name = get_temp_image_name();
+  ASSERT_EQ(0, rbd_snap_protect(parent, "parent_snap"));
+  ASSERT_EQ(0, rbd_clone3(ioctx, parent_name.c_str(), "parent_snap", ioctx,
+	  child_name.c_str(), opts));
+
+  // copy
+  std::string copy1_name = get_temp_image_name();
+  ASSERT_EQ(0, rbd_copy3(parent, ioctx, copy1_name.c_str(), opts));
+  std::string copy2_name = get_temp_image_name();
+  ASSERT_EQ(0, rbd_copy_with_progress3(parent, ioctx, copy2_name.c_str(), opts,
+	  print_progress_percent, NULL));
+
+  ASSERT_EQ(0, rbd_close(parent));
+
+  rbd_image_options_destroy(opts);
+
+  rados_ioctx_destroy(ioctx);
+}
+
+TEST_F(TestLibRBD, TestImageOptionsPP)
+{
+  librados::IoCtx ioctx;
+  ASSERT_EQ(0, _rados.ioctx_create(m_pool_name.c_str(), ioctx));
+
+  // make create image options
+  uint64_t features = RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2;
+  uint64_t order = 0;
+  uint64_t stripe_unit = 65536;
+  uint64_t stripe_count = 16;
+  librbd::ImageOptions opts;
+  ASSERT_EQ(0, opts.set(RBD_IMAGE_OPTION_FORMAT, static_cast<uint64_t>(2)));
+  ASSERT_EQ(0, opts.set(RBD_IMAGE_OPTION_FEATURES, features));
+  ASSERT_EQ(0, opts.set(RBD_IMAGE_OPTION_ORDER, order));
+  ASSERT_EQ(0, opts.set(RBD_IMAGE_OPTION_STRIPE_UNIT, stripe_unit));
+  ASSERT_EQ(0, opts.set(RBD_IMAGE_OPTION_STRIPE_COUNT, stripe_count));
+
+  librbd::RBD rbd;
+  std::string parent_name = get_temp_image_name();
+
+  // make parent
+  ASSERT_EQ(0, rbd.create4(ioctx, parent_name.c_str(), 4<<20, opts));
+
+  // check order is returned in opts
+  ASSERT_EQ(0, opts.get(RBD_IMAGE_OPTION_ORDER, &order));
+  ASSERT_NE((uint64_t)0, order);
+
+  // write some data to parent
+  librbd::Image parent;
+  ASSERT_EQ(0, rbd.open(ioctx, parent, parent_name.c_str(), NULL));
+
+  ssize_t len = 1024;
+  bufferlist bl;
+  bl.append(buffer::create(len));
+  bl.zero();
+  ASSERT_EQ(len, parent.write(0, len, bl));
+  ASSERT_EQ(len, parent.write(len, len, bl));
+
+  // create a snapshot, reopen as the parent we're interested in
+  ASSERT_EQ(0, parent.snap_create("parent_snap"));
+  ASSERT_EQ(0, parent.close());
+  ASSERT_EQ(0, rbd.open(ioctx, parent, parent_name.c_str(), "parent_snap"));
+
+  // clone
+  std::string child_name = get_temp_image_name();
+  ASSERT_EQ(0, parent.snap_protect("parent_snap"));
+  ASSERT_EQ(0, rbd.clone3(ioctx, parent_name.c_str(), "parent_snap", ioctx,
+	  child_name.c_str(), opts));
+
+  // copy
+  std::string copy1_name = get_temp_image_name();
+  ASSERT_EQ(0, parent.copy3(ioctx, copy1_name.c_str(), opts));
+  std::string copy2_name = get_temp_image_name();
+  PrintProgress pp;
+  ASSERT_EQ(0, parent.copy_with_progress3(ioctx, copy2_name.c_str(), opts, pp));
+
+  ASSERT_EQ(0, parent.close());
+}
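
Both tests exercise the new image-options handle the same way: set typed key/value options, pass the handle to create/clone/copy, then read back values the operation filled in (here the order). Minimal C++ usage with the keys from above:

    librbd::ImageOptions opts;
    opts.set(RBD_IMAGE_OPTION_FORMAT, static_cast<uint64_t>(2));
    opts.set(RBD_IMAGE_OPTION_FEATURES, static_cast<uint64_t>(RBD_FEATURE_LAYERING));
    rbd.create4(ioctx, "img", 4 << 20, opts);   // consumes the options
    uint64_t order = 0;
    opts.get(RBD_IMAGE_OPTION_ORDER, &order);   // create4 filled in the order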
diff --git a/src/test/librbd/test_main.cc b/src/test/librbd/test_main.cc
index e71a5af..7451195 100644
--- a/src/test/librbd/test_main.cc
+++ b/src/test/librbd/test_main.cc
@@ -12,6 +12,8 @@ extern void register_test_librbd();
 #ifdef TEST_LIBRBD_INTERNALS
 extern void register_test_image_watcher();
 extern void register_test_internal();
+extern void register_test_journal_entries();
+extern void register_test_journal_replay();
 extern void register_test_object_map();
 #endif // TEST_LIBRBD_INTERNALS
 
@@ -21,6 +23,8 @@ int main(int argc, char **argv)
 #ifdef TEST_LIBRBD_INTERNALS
   register_test_image_watcher();
   register_test_internal();
+  register_test_journal_entries();
+  register_test_journal_replay();
   register_test_object_map();
 #endif // TEST_LIBRBD_INTERNALS
 
diff --git a/src/test/mds/TestMDSAuthCaps.cc b/src/test/mds/TestMDSAuthCaps.cc
index 74cec8a..0c821e8 100644
--- a/src/test/mds/TestMDSAuthCaps.cc
+++ b/src/test/mds/TestMDSAuthCaps.cc
@@ -16,6 +16,9 @@
 
 #include "include/stringify.h"
 #include "mds/MDSAuthCaps.h"
+#include "common/ceph_argparse.h"
+#include "common/common_init.h"
+#include "global/global_init.h"
 
 #include "gtest/gtest.h"
 
@@ -23,6 +26,7 @@ using std::string;
 using std::cout;
 
 const char *parse_good[] = {
+  "allow rw uid=1 gids=1",
   "allow * path=\"/foo\"",
   "allow * path=/foo",
   "allow * path=\"/foo bar/baz\"",
@@ -31,6 +35,15 @@ const char *parse_good[] = {
   "allow *",
   "allow r",
   "allow rw",
+  "allow rw uid=1 gids=1,2,3",
+  "allow rw path=/foo uid=1 gids=1,2,3",
+  "allow r, allow rw path=/foo",
+  "allow r, allow * uid=1",
+  "allow r ,allow * uid=1",
+  "allow r ;allow * uid=1",
+  "allow r ; allow * uid=1",
+  "allow r ; allow * uid=1",
+  "allow r uid=1 gids=1,2,3, allow * uid=2",
   0
 };
 
@@ -39,7 +52,7 @@ TEST(MDSAuthCaps, ParseGood) {
     string str = parse_good[i];
     MDSAuthCaps cap;
     std::cout << "Testing good input: '" << str << "'" << std::endl;
-    ASSERT_TRUE(cap.parse(str, &cout));
+    ASSERT_TRUE(cap.parse(g_ceph_context, str, &cout));
   }
 }
 
@@ -66,6 +79,10 @@ const char *parse_bad[] = {
   "allow namespace=foo",
   "allow rwx auid 123 namespace asdf",
   "allow wwx pool ''",
+  "allow rw gids=1",
+  "allow rw gids=1,2,3",
+  "allow rw uid=123 gids=asdf",
+  "allow rw uid=123 gids=1,2,asdf",
   0
 };
 
@@ -74,7 +91,7 @@ TEST(MDSAuthCaps, ParseBad) {
     string str = parse_bad[i];
     MDSAuthCaps cap;
     std::cout << "Testing bad input: '" << str << "'" << std::endl;
-    ASSERT_FALSE(cap.parse(str, &cout));
+    ASSERT_FALSE(cap.parse(g_ceph_context, str, &cout));
   }
 }
 
@@ -82,39 +99,88 @@ TEST(MDSAuthCaps, AllowAll) {
   MDSAuthCaps cap;
   ASSERT_FALSE(cap.allow_all());
 
-  ASSERT_TRUE(cap.parse("allow r", NULL));
+  ASSERT_TRUE(cap.parse(g_ceph_context, "allow r", NULL));
   ASSERT_FALSE(cap.allow_all());
   cap = MDSAuthCaps();
 
-  ASSERT_TRUE(cap.parse("allow rw", NULL));
+  ASSERT_TRUE(cap.parse(g_ceph_context, "allow rw", NULL));
   ASSERT_FALSE(cap.allow_all());
   cap = MDSAuthCaps();
 
-  ASSERT_TRUE(cap.parse("allow", NULL));
+  ASSERT_TRUE(cap.parse(g_ceph_context, "allow", NULL));
   ASSERT_FALSE(cap.allow_all());
   cap = MDSAuthCaps();
 
-  ASSERT_TRUE(cap.parse("allow *", NULL));
+  ASSERT_TRUE(cap.parse(g_ceph_context, "allow *", NULL));
   ASSERT_TRUE(cap.allow_all());
-  ASSERT_TRUE(cap.is_capable("/foo/bar", 0, true, true));
+  ASSERT_TRUE(cap.is_capable("foo/bar", 0, 0, 0777, 0, 0, MAY_READ | MAY_WRITE, 0, 0));
 }
 
 TEST(MDSAuthCaps, AllowUid) {
-  MDSAuthCaps cap;
-  ASSERT_TRUE(cap.parse("allow * uid=10", NULL));
+  MDSAuthCaps cap(g_ceph_context);
+  ASSERT_TRUE(cap.parse(g_ceph_context, "allow * uid=10 gids=10,11; allow * uid=12 gids=12", NULL));
   ASSERT_FALSE(cap.allow_all());
-  ASSERT_TRUE(cap.is_capable("/foo", 10, true, true));
-  ASSERT_FALSE(cap.is_capable("/foo", -1, true, true));
-  ASSERT_FALSE(cap.is_capable("/foo", 0, true, true));
+
+  // uid/gid must be valid
+  ASSERT_FALSE(cap.is_capable("foo", 0, 0, 0777, 0, 0, MAY_READ, 0, 0));
+  ASSERT_FALSE(cap.is_capable("foo", 0, 0, 0777, 10, 0, MAY_READ, 0, 0));
+  ASSERT_FALSE(cap.is_capable("foo", 0, 0, 0777, 9, 10, MAY_READ, 0, 0));
+  ASSERT_TRUE(cap.is_capable("foo", 0, 0, 0777, 10, 10, MAY_READ, 0, 0));
+  ASSERT_TRUE(cap.is_capable("foo", 0, 0, 0777, 12, 12, MAY_READ, 0, 0));
+  ASSERT_FALSE(cap.is_capable("foo", 0, 0, 0777, 10, 12, MAY_READ, 0, 0));
+
+  // user
+  ASSERT_TRUE(cap.is_capable("foo", 10, 10, 0500, 10, 11, MAY_READ, 0, 0));
+  ASSERT_FALSE(cap.is_capable("foo", 10, 10, 0500, 10, 11, MAY_WRITE, 0, 0));
+  ASSERT_FALSE(cap.is_capable("foo", 10, 10, 0500, 10, 11, MAY_READ | MAY_WRITE, 0, 0));
+  ASSERT_TRUE(cap.is_capable("foo", 10, 10, 0700, 10, 11, MAY_READ, 0, 0));
+  ASSERT_TRUE(cap.is_capable("foo", 10, 10, 0700, 10, 11, MAY_WRITE, 0, 0));
+  ASSERT_TRUE(cap.is_capable("foo", 10, 10, 0700, 10, 10, MAY_READ | MAY_WRITE, 0, 0));
+  ASSERT_TRUE(cap.is_capable("foo", 10, 0, 0700, 10, 10, MAY_READ | MAY_WRITE, 0, 0));
+  ASSERT_FALSE(cap.is_capable("foo", 12, 0, 0700, 10, 10, MAY_READ | MAY_WRITE, 0, 0));
+  ASSERT_TRUE(cap.is_capable("foo", 12, 0, 0700, 12, 12, MAY_READ | MAY_WRITE, 0, 0));
+  ASSERT_FALSE(cap.is_capable("foo", 0, 0, 0700, 10, 10, MAY_READ | MAY_WRITE, 0, 0));
+
+  // group
+  ASSERT_TRUE(cap.is_capable("foo", 0, 10, 0750, 10, 10, MAY_READ, 0, 0));
+  ASSERT_FALSE(cap.is_capable("foo", 0, 10, 0750, 10, 10, MAY_WRITE, 0, 0));
+  ASSERT_TRUE(cap.is_capable("foo", 0, 10, 0770, 10, 10, MAY_READ | MAY_WRITE, 0, 0));
+  ASSERT_TRUE(cap.is_capable("foo", 0, 10, 0770, 10, 11, MAY_READ | MAY_WRITE, 0, 0));
+  ASSERT_TRUE(cap.is_capable("foo", 0, 11, 0770, 10, 10, MAY_READ | MAY_WRITE, 0, 0));
+  ASSERT_TRUE(cap.is_capable("foo", 0, 11, 0770, 10, 11, MAY_READ | MAY_WRITE, 0, 0));
+  ASSERT_TRUE(cap.is_capable("foo", 0, 12, 0770, 12, 12, MAY_READ | MAY_WRITE, 0, 0));
+  ASSERT_FALSE(cap.is_capable("foo", 0, 10, 0770, 12, 12, MAY_READ | MAY_WRITE, 0, 0));
+  ASSERT_FALSE(cap.is_capable("foo", 0, 12, 0770, 10, 10, MAY_READ | MAY_WRITE, 0, 0));
+
+  // user > group
+  ASSERT_TRUE(cap.is_capable("foo", 10, 10, 0570, 10, 10, MAY_READ, 0, 0));
+  ASSERT_FALSE(cap.is_capable("foo", 10, 10, 0570, 10, 10, MAY_WRITE, 0, 0));
+
+  // other
+  ASSERT_TRUE(cap.is_capable("foo", 0, 0, 0775, 10, 10, MAY_READ, 0, 0));
+  ASSERT_FALSE(cap.is_capable("foo", 0, 0, 0770, 10, 10, MAY_READ, 0, 0));
+  ASSERT_FALSE(cap.is_capable("foo", 0, 0, 0775, 10, 10, MAY_WRITE, 0, 0));
+  ASSERT_FALSE(cap.is_capable("foo", 0, 0, 0775, 10, 10, MAY_READ | MAY_WRITE, 0, 0));
+  ASSERT_TRUE(cap.is_capable("foo", 0, 0, 0777, 10, 10, MAY_READ | MAY_WRITE, 0, 0));
+  ASSERT_FALSE(cap.is_capable("foo", 0, 0, 0773, 10, 10, MAY_READ, 0, 0));
+
+  // group > other
+  ASSERT_TRUE(cap.is_capable("foo", 0, 0, 0557, 10, 10, MAY_READ, 0, 0));
+  ASSERT_FALSE(cap.is_capable("foo", 0, 10, 0557, 10, 10, MAY_WRITE, 0, 0));
+
+  // user > other
+  ASSERT_TRUE(cap.is_capable("foo", 0, 0, 0557, 10, 10, MAY_READ, 0, 0));
+  ASSERT_FALSE(cap.is_capable("foo", 10, 0, 0557, 10, 10, MAY_WRITE, 0, 0));
 }
 
 TEST(MDSAuthCaps, AllowPath) {
   MDSAuthCaps cap;
-  ASSERT_TRUE(cap.parse("allow * path=/sandbox", NULL));
+  ASSERT_TRUE(cap.parse(g_ceph_context, "allow * path=/sandbox", NULL));
   ASSERT_FALSE(cap.allow_all());
-  ASSERT_TRUE(cap.is_capable("/sandbox/foo", 0, true, true));
-  ASSERT_TRUE(cap.is_capable("/sandbox", 0, true, true));
-  ASSERT_FALSE(cap.is_capable("/foo", 0, true, true));
+  ASSERT_TRUE(cap.is_capable("sandbox/foo", 0, 0, 0777, 0, 0, MAY_READ | MAY_WRITE, 0, 0));
+  ASSERT_TRUE(cap.is_capable("sandbox", 0, 0, 0777, 0, 0, MAY_READ | MAY_WRITE, 0, 0));
+  ASSERT_FALSE(cap.is_capable("sandboxed", 0, 0, 0777, 0, 0, MAY_READ | MAY_WRITE, 0, 0));
+  ASSERT_FALSE(cap.is_capable("foo", 0, 0, 0777, 0, 0, MAY_READ | MAY_WRITE, 0, 0));
 }
 
 TEST(MDSAuthCaps, OutputParsed) {
@@ -133,19 +199,40 @@ TEST(MDSAuthCaps, OutputParsed) {
      "MDSAuthCaps[allow rw]"},
     {"allow * uid=1",
      "MDSAuthCaps[allow * uid=1]"},
+    {"allow * uid=1 gids=1",
+     "MDSAuthCaps[allow * uid=1 gids=1]"},
+    {"allow * uid=1 gids=1,2,3",
+     "MDSAuthCaps[allow * uid=1 gids=1,2,3]"},
     {"allow * path=/foo",
      "MDSAuthCaps[allow * path=\"/foo\"]"},
     {"allow * path=\"/foo\"",
      "MDSAuthCaps[allow * path=\"/foo\"]"},
     {"allow * path=\"/foo\" uid=1",
      "MDSAuthCaps[allow * path=\"/foo\" uid=1]"},
+    {"allow * path=\"/foo\" uid=1 gids=1,2,3",
+     "MDSAuthCaps[allow * path=\"/foo\" uid=1 gids=1,2,3]"},
+    {"allow r uid=1 gids=1,2,3, allow * uid=2",
+     "MDSAuthCaps[allow r uid=1 gids=1,2,3, allow * uid=2]"},
   };
   size_t num_tests = sizeof(test_values) / sizeof(*test_values);
   for (size_t i = 0; i < num_tests; ++i) {
     MDSAuthCaps cap;
     std::cout << "Testing input '" << test_values[i].input << "'" << std::endl;
-    ASSERT_TRUE(cap.parse(test_values[i].input, &cout));
+    ASSERT_TRUE(cap.parse(g_ceph_context, test_values[i].input, &cout));
     ASSERT_EQ(test_values[i].output, stringify(cap));
   }
 }
 
+int main(int argc, char **argv)
+{
+  ::testing::InitGoogleTest(&argc, argv);
+
+  vector<const char*> args;
+  argv_to_vec(argc, (const char **)argv, args);
+  env_to_vec(args, NULL);
+
+  global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, 0);
+  common_init_finish(g_ceph_context);
+
+  return RUN_ALL_TESTS();
+}
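
The MDSAuthCaps changes above thread a CephContext through parse() and widen is_capable() from a (path, uid, read, write) check to full POSIX-style semantics: inode owner, group, and mode plus caller uid/gid and an access mask, with caps able to pin a uid and a gids list. A minimal sketch of the new calling convention, with can_write and its parameters purely illustrative; the argument order follows the assertions in the test:

    #include "mds/MDSAuthCaps.h"
    #include "global/global_context.h"

    // Does cap_str let a caller with (uid, gid) write to a file it owns
    // with mode 0700? Parsing now needs a CephContext, and the capability
    // check consults the inode's ownership and mode bits.
    bool can_write(const std::string& cap_str, const std::string& path,
                   uid_t uid, gid_t gid) {
      MDSAuthCaps caps(g_ceph_context);
      if (!caps.parse(g_ceph_context, cap_str, NULL))
        return false;
      return caps.is_capable(path, uid, gid, 0700,  // inode uid/gid/mode
                             uid, gid,              // caller credentials
                             MAY_READ | MAY_WRITE, 0, 0);
    }
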
diff --git a/src/test/mon/misc.sh b/src/test/mon/misc.sh
index d2e5dbd..c11c0eb 100755
--- a/src/test/mon/misc.sh
+++ b/src/test/mon/misc.sh
@@ -21,7 +21,7 @@ function run() {
     local dir=$1
     shift
 
-    export CEPH_MON="127.0.0.1:7102"
+    export CEPH_MON="127.0.0.1:7102" # git grep '\<7102\>' : there must be only one
     export CEPH_ARGS
     CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
     CEPH_ARGS+="--mon-host=$CEPH_MON "
@@ -89,8 +89,8 @@ function TEST_mon_add_to_single_mon() {
     local dir=$1
 
     fsid=$(uuidgen)
-    MONA=127.0.0.1:7117
-    MONB=127.0.0.1:7118
+    MONA=127.0.0.1:7117 # git grep '\<7117\>' : there must be only one
+    MONB=127.0.0.1:7118 # git grep '\<7118\>' : there must be only one
     CEPH_ARGS_orig=$CEPH_ARGS
     CEPH_ARGS="--fsid=$fsid --auth-supported=none "
     CEPH_ARGS+="--mon-initial-members=a "
diff --git a/src/test/mon/mkfs.sh b/src/test/mon/mkfs.sh
index 68208b2..57a0301 100755
--- a/src/test/mon/mkfs.sh
+++ b/src/test/mon/mkfs.sh
@@ -23,7 +23,7 @@ export CEPH_CONF=/dev/null
 unset CEPH_ARGS
 MON_ID=a
 MON_DIR=$DIR/$MON_ID
-CEPH_MON=127.0.0.1:7110
+CEPH_MON=127.0.0.1:7110 # git grep '\<7110\>' : there must be only one
 TIMEOUT=360
 
 function setup() {
diff --git a/src/test/mon/mon-ping.sh b/src/test/mon/mon-ping.sh
index e3f7395..c27dc7b 100755
--- a/src/test/mon/mon-ping.sh
+++ b/src/test/mon/mon-ping.sh
@@ -18,7 +18,7 @@ function run() {
     local dir=$1
     shift
 
-    export CEPH_MON="127.0.0.1:7119"
+    export CEPH_MON="127.0.0.1:7119" # git grep '\<7119\>' : there must be only one
     export CEPH_ARGS
     CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
     CEPH_ARGS+="--mon-host=$CEPH_MON "
diff --git a/src/test/mon/mon-scrub.sh b/src/test/mon/mon-scrub.sh
index eb33bbc..b869839 100755
--- a/src/test/mon/mon-scrub.sh
+++ b/src/test/mon/mon-scrub.sh
@@ -21,7 +21,7 @@ function run() {
     local dir=$1
     shift
 
-    export CEPH_MON="127.0.0.1:7120"
+    export CEPH_MON="127.0.0.1:7120" # git grep '\<7120\>' : there must be only one
     export CEPH_ARGS
     CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
     CEPH_ARGS+="--mon-host=$CEPH_MON "
diff --git a/src/test/mon/osd-crush.sh b/src/test/mon/osd-crush.sh
index 1c2adff..4dbbd04 100755
--- a/src/test/mon/osd-crush.sh
+++ b/src/test/mon/osd-crush.sh
@@ -21,7 +21,7 @@ function run() {
     local dir=$1
     shift
 
-    export CEPH_MON="127.0.0.1:7104"
+    export CEPH_MON="127.0.0.1:7104" # git grep '\<7104\>' : there must be only one
     export CEPH_ARGS
     CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
     CEPH_ARGS+="--mon-host=$CEPH_MON "
@@ -124,7 +124,7 @@ function TEST_crush_rule_create_erasure() {
     # it will prevent the creation of a pool.
     #
     local crushtool_path_old=`ceph-conf --show-config-value crushtool`
-    ceph tell mon.* injectargs --crushtool "false"
+    ceph tell mon.\* injectargs --crushtool "false"
 
     expect_failure $dir "Error EINVAL" \
         ./ceph osd pool create mypool 1 1 erasure || return 1
@@ -241,9 +241,9 @@ function TEST_crush_tree() {
 function TEST_crush_repair_faulty_crushmap() {
     local dir=$1
     fsid=$(uuidgen)
-    MONA=127.0.0.1:7113
-    MONB=127.0.0.1:7114
-    MONC=127.0.0.1:7115
+    MONA=127.0.0.1:7113 # git grep '\<7113\>' : there must be only one
+    MONB=127.0.0.1:7114 # git grep '\<7114\>' : there must be only one
+    MONC=127.0.0.1:7115 # git grep '\<7115\>' : there must be only one
     CEPH_ARGS_orig=$CEPH_ARGS
     CEPH_ARGS="--fsid=$fsid --auth-supported=none "
     CEPH_ARGS+="--mon-initial-members=a,b,c "
@@ -257,7 +257,7 @@ function TEST_crush_repair_faulty_crushmap() {
     ./crushtool -c $empty_map.txt -o $empty_map.map || return 1
 
     local crushtool_path_old=`ceph-conf --show-config-value crushtool`
-    ceph tell mon.* injectargs --crushtool "true"
+    ceph tell mon.\* injectargs --crushtool "true"
 
     ceph osd setcrushmap -i $empty_map.map || return 1
     # should be an empty crush map without any buckets
diff --git a/src/test/mon/osd-erasure-code-profile.sh b/src/test/mon/osd-erasure-code-profile.sh
index 27be346..bc4e30f 100755
--- a/src/test/mon/osd-erasure-code-profile.sh
+++ b/src/test/mon/osd-erasure-code-profile.sh
@@ -21,7 +21,7 @@ function run() {
     local dir=$1
     shift
 
-    export CEPH_MON="127.0.0.1:7108"
+    export CEPH_MON="127.0.0.1:7108" # git grep '\<7108\>' : there must be only one
     export CEPH_ARGS
     CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
     CEPH_ARGS+="--mon-host=$CEPH_MON "
diff --git a/src/test/mon/osd-pool-create.sh b/src/test/mon/osd-pool-create.sh
index e7d1be6..0d11a23 100755
--- a/src/test/mon/osd-pool-create.sh
+++ b/src/test/mon/osd-pool-create.sh
@@ -21,7 +21,7 @@ function run() {
     local dir=$1
     shift
 
-    export CEPH_MON="127.0.0.1:7105"
+    export CEPH_MON="127.0.0.1:7105" # git grep '\<7105\>' : there must be only one
     export CEPH_ARGS
     CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
     CEPH_ARGS+="--mon-host=$CEPH_MON "
diff --git a/src/test/msgr/test_msgr.cc b/src/test/msgr/test_msgr.cc
index eddef22..8829c1e 100644
--- a/src/test/msgr/test_msgr.cc
+++ b/src/test/msgr/test_msgr.cc
@@ -552,7 +552,9 @@ TEST_P(MessengerTest, ClientStandbyTest) {
   usleep(300*1000);
   // client should be standby, so we use original connection
   {
-    conn->send_keepalive();
+    // Try sending a message to verify we get the remote reset callback
+    m = new MPing();
+    ASSERT_EQ(conn->send_message(m), 0);
     {
       Mutex::Locker l(cli_dispatcher.lock);
       while (!cli_dispatcher.got_remote_reset)
diff --git a/src/test/objectstore/FileStoreTracker.h b/src/test/objectstore/FileStoreTracker.h
index 11033a6..e350c80 100644
--- a/src/test/objectstore/FileStoreTracker.h
+++ b/src/test/objectstore/FileStoreTracker.h
@@ -4,7 +4,7 @@
 #define FILESTORE_TRACKER_H
 #include "test/common/ObjectContents.h"
 #include "os/FileStore.h"
-#include "os/KeyValueDB.h"
+#include "kv/KeyValueDB.h"
 #include <boost/scoped_ptr.hpp>
 #include <list>
 #include <map>
diff --git a/src/test/objectstore/TestRocksdbOptionParse.cc b/src/test/objectstore/TestRocksdbOptionParse.cc
index cdbbfa9..eaccfa1 100644
--- a/src/test/objectstore/TestRocksdbOptionParse.cc
+++ b/src/test/objectstore/TestRocksdbOptionParse.cc
@@ -5,7 +5,7 @@
 #include "rocksdb/db.h"
 #include "rocksdb/env.h"
 #include "rocksdb/thread_status.h"
-#include "os/RocksDBStore.h"
+#include "kv/RocksDBStore.h"
 #include <iostream>
 using namespace std;
 
diff --git a/src/test/objectstore/test_idempotent.cc b/src/test/objectstore/test_idempotent.cc
index 098bc81..d52f7db 100644
--- a/src/test/objectstore/test_idempotent.cc
+++ b/src/test/objectstore/test_idempotent.cc
@@ -21,8 +21,7 @@
 #include "common/debug.h"
 #include "test/common/ObjectContents.h"
 #include "FileStoreTracker.h"
-#include "os/LevelDBStore.h"
-#include "os/KeyValueDB.h"
+#include "kv/KeyValueDB.h"
 #include "os/ObjectStore.h"
 
 void usage(const string &name) {
@@ -63,7 +62,7 @@ int main(int argc, char **argv) {
   bool start_new = false;
   if (string(args[0]) == string("new")) start_new = true;
 
-  LevelDBStore *_db = new LevelDBStore(g_ceph_context, db_path);
+  KeyValueDB *_db = KeyValueDB::create(g_ceph_context, "leveldb", db_path);
   assert(!_db->create_and_open(std::cerr));
   boost::scoped_ptr<KeyValueDB> db(_db);
   boost::scoped_ptr<ObjectStore> store(new FileStore(store_path, store_dev));
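
With the key/value backends moved under kv/, callers no longer construct a concrete store such as LevelDBStore; they ask the KeyValueDB::create() factory for a backend by name, exactly as the hunk above does. A minimal sketch, with open_kv illustrative and the "leveldb" backend name taken from the change:

    #include "kv/KeyValueDB.h"
    #include "global/global_context.h"
    #include <iostream>

    // Open (creating if necessary) a key/value store by backend name.
    // Switching backends is now a string change, not a different class.
    KeyValueDB *open_kv(const std::string& path) {
      KeyValueDB *db = KeyValueDB::create(g_ceph_context, "leveldb", path);
      if (!db)
        return NULL;
      if (db->create_and_open(std::cerr) != 0) {
        delete db;
        return NULL;   // backend refused to open; nothing to keep
      }
      return db;
    }
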
diff --git a/src/test/objectstore/test_kv.cc b/src/test/objectstore/test_kv.cc
index df3805b..b561650 100644
--- a/src/test/objectstore/test_kv.cc
+++ b/src/test/objectstore/test_kv.cc
@@ -17,7 +17,7 @@
 #include <iostream>
 #include <time.h>
 #include <sys/mount.h>
-#include "os/KeyValueDB.h"
+#include "kv/KeyValueDB.h"
 #include "include/Context.h"
 #include "common/ceph_argparse.h"
 #include "global/global_init.h"
diff --git a/src/test/opensuse-13.2/Dockerfile.in b/src/test/opensuse-13.2/Dockerfile.in
index 85d91a2..20eb181 100644
--- a/src/test/opensuse-13.2/Dockerfile.in
+++ b/src/test/opensuse-13.2/Dockerfile.in
@@ -27,4 +27,4 @@ RUN cd /root ; ./install-deps.sh
 # development tools
 # nc (ncat) is required to run make check on firefly only (giant+ do not use nc)
 RUN zypper --non-interactive install ccache valgrind gdb git python-virtualenv gdisk kpartx hdparm ncat sudo xmlstarlet parted
-RUN useradd -M --uid %%user_id%% %%USER%% && echo '%%USER%% ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers
+RUN if test %%USER%% != root ; then useradd -M --uid %%user_id%% %%USER%% && echo '%%USER%% ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers ; fi
diff --git a/src/test/opensuse-13.2/ceph.spec.in b/src/test/opensuse-13.2/ceph.spec.in
index 8f2a6fc..2939fef 100644
--- a/src/test/opensuse-13.2/ceph.spec.in
+++ b/src/test/opensuse-13.2/ceph.spec.in
@@ -590,6 +590,11 @@ export RPM_OPT_FLAGS=`echo $RPM_OPT_FLAGS | sed -e 's/i386/i486/'`
 		%{?_with_tcmalloc} \
 		CFLAGS="$RPM_OPT_FLAGS" CXXFLAGS="$RPM_OPT_FLAGS"
 
+%if %{with lowmem_builder}
+%if 0%{?jobs} > 8
+%define _smp_mflags -j8
+%endif
+%endif
 
 make %{?_smp_mflags}
 
@@ -607,8 +612,7 @@ make %{?_smp_mflags} check-local
 make DESTDIR=$RPM_BUILD_ROOT install
 find $RPM_BUILD_ROOT -type f -name "*.la" -exec rm -f {} ';'
 find $RPM_BUILD_ROOT -type f -name "*.a" -exec rm -f {} ';'
-install -D src/rbdmap $RPM_BUILD_ROOT%{_sysconfdir}/ceph/rbdmap
-install -D src/init-rbdmap $RPM_BUILD_ROOT%{_initrddir}/rbdmap
+install -D src/etc-rbdmap $RPM_BUILD_ROOT%{_sysconfdir}/ceph/rbdmap
 %if 0%{?fedora} || 0%{?rhel}
 install -m 0644 -D etc/sysconfig/ceph $RPM_BUILD_ROOT%{_sysconfdir}/sysconfig/ceph
 %endif
@@ -617,6 +621,7 @@ install -m 0644 -D etc/sysconfig/ceph $RPM_BUILD_ROOT%{_localstatedir}/adm/fillu
 %endif
 %if 0%{?_with_systemd}
   install -m 0644 -D systemd/ceph.tmpfiles.d $RPM_BUILD_ROOT%{_tmpfilesdir}/ceph-common.conf
+  install -m 0644 -D systemd/rbdmap.service $RPM_BUILD_ROOT%{_unitdir}/rbdmap.service
  install -m 0644 -D systemd/ceph-osd@.service $RPM_BUILD_ROOT%{_unitdir}/ceph-osd@.service
  install -m 0644 -D systemd/ceph-mon@.service $RPM_BUILD_ROOT%{_unitdir}/ceph-mon@.service
  install -m 0644 -D systemd/ceph-create-keys@.service $RPM_BUILD_ROOT%{_unitdir}/ceph-create-keys@.service
@@ -626,6 +631,7 @@ install -m 0644 -D etc/sysconfig/ceph $RPM_BUILD_ROOT%{_localstatedir}/adm/fillu
  install -m 0644 -D systemd/ceph-disk@.service $RPM_BUILD_ROOT%{_unitdir}/ceph-disk@.service
   install -m 0755 -D systemd/ceph $RPM_BUILD_ROOT%{_sbindir}/rcceph
 %else
+  install -D src/init-rbdmap $RPM_BUILD_ROOT%{_initrddir}/rbdmap
   install -D src/init-ceph $RPM_BUILD_ROOT%{_initrddir}/ceph
   install -D src/init-radosgw $RPM_BUILD_ROOT%{_initrddir}/ceph-radosgw
   ln -sf ../../etc/init.d/ceph %{buildroot}/%{_sbindir}/rcceph
@@ -810,6 +816,7 @@ rm -rf $RPM_BUILD_ROOT
 %{_libdir}/rados-classes/libcls_timeindex.so*
 %{_libdir}/rados-classes/libcls_user.so*
 %{_libdir}/rados-classes/libcls_version.so*
+%{_libdir}/rados-classes/libcls_journal.so*
 %dir %{_libdir}/ceph/erasure-code
 %{_libdir}/ceph/erasure-code/libec_*.so*
 %if 0%{?_with_lttng}
@@ -872,6 +879,7 @@ rm -rf $RPM_BUILD_ROOT
 %{_bindir}/rbd
 %{_bindir}/rbd-replay
 %{_bindir}/rbd-replay-many
+%{_bindir}/rbdmap
 %if 0%{?_with_lttng}
 %{_bindir}/rbd-replay-prep
 %endif
@@ -901,7 +909,11 @@ rm -rf $RPM_BUILD_ROOT
 %config %{_sysconfdir}/bash_completion.d/rados
 %config %{_sysconfdir}/bash_completion.d/rbd
 %config(noreplace) %{_sysconfdir}/ceph/rbdmap
+%if 0%{?_with_systemd}
+%{_unitdir}/rbdmap.service
+%else
 %{_initrddir}/rbdmap
+%endif
 %{python_sitelib}/ceph_argparse.py*
 %{python_sitelib}/ceph_daemon.py*
 %{_udevrulesdir}/50-rbd.rules
@@ -1302,12 +1314,12 @@ exit 0
 %files libs-compat
 # We need an empty %%files list for ceph-libs-compat, to tell rpmbuild to actually
 # build this meta package.
+%endif
 
 #################################################################################
 %files devel-compat
 # We need an empty %%files list for ceph-devel-compat, to tell rpmbuild to
 # actually build this meta package.
-%endif
 
 #################################################################################
 %files -n python-ceph-compat
diff --git a/src/test/opensuse-13.2/install-deps.sh b/src/test/opensuse-13.2/install-deps.sh
index 1bebf09..8249ea3 100755
--- a/src/test/opensuse-13.2/install-deps.sh
+++ b/src/test/opensuse-13.2/install-deps.sh
@@ -62,7 +62,7 @@ CentOS|Fedora|RedHatEnterpriseServer)
             CentOS|RedHatEnterpriseServer)
                 $SUDO yum install -y yum-utils
                 MAJOR_VERSION=$(lsb_release -rs | cut -f1 -d.)
-                if test $(lsb_release -si) == RedHatEnterpriseServer ; then
+                if test $(lsb_release -si) = RedHatEnterpriseServer ; then
                     $SUDO yum install subscription-manager
                     $SUDO subscription-manager repos --enable=rhel-$MAJOR_VERSION-server-optional-rpms
                 fi
@@ -70,6 +70,9 @@ CentOS|Fedora|RedHatEnterpriseServer)
                 $SUDO yum install --nogpgcheck -y epel-release
                 $SUDO rpm --import /etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-$MAJOR_VERSION
                 $SUDO rm -f /etc/yum.repos.d/dl.fedoraproject.org*
+                if test $(lsb_release -si) = CentOS -a $MAJOR_VERSION = 7 ; then
+                    $SUDO yum-config-manager --enable cr
+                fi
                 ;;
         esac
         sed -e 's/@//g' < ceph.spec.in > $DIR/ceph.spec
diff --git a/src/test/osd/osd-bench.sh b/src/test/osd/osd-bench.sh
index fd466f8..0fb5ab8 100755
--- a/src/test/osd/osd-bench.sh
+++ b/src/test/osd/osd-bench.sh
@@ -22,7 +22,7 @@ function run() {
     local dir=$1
     shift
 
-    export CEPH_MON="127.0.0.1:7106"
+    export CEPH_MON="127.0.0.1:7106" # git grep '\<7106\>' : there must be only one
     export CEPH_ARGS
     CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
     CEPH_ARGS+="--mon-host=$CEPH_MON "
diff --git a/src/test/osd/osd-config.sh b/src/test/osd/osd-config.sh
index 1f73485..3cb7fa6 100755
--- a/src/test/osd/osd-config.sh
+++ b/src/test/osd/osd-config.sh
@@ -22,7 +22,7 @@ function run() {
     local dir=$1
     shift
 
-    export CEPH_MON="127.0.0.1:7100"
+    export CEPH_MON="127.0.0.1:7100" # git grep '\<7100\>' : there must be only one
     export CEPH_ARGS
     CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
     CEPH_ARGS+="--mon-host=$CEPH_MON "
diff --git a/src/test/osd/osd-copy-from.sh b/src/test/osd/osd-copy-from.sh
index 6f1e037..375ad44 100755
--- a/src/test/osd/osd-copy-from.sh
+++ b/src/test/osd/osd-copy-from.sh
@@ -23,7 +23,7 @@ function run() {
     local dir=$1
     shift
 
-    export CEPH_MON="127.0.0.1:7111"
+    export CEPH_MON="127.0.0.1:7111" # git grep '\<7111\>' : there must be only one
     export CEPH_ARGS
     CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
     CEPH_ARGS+="--mon-host=$CEPH_MON "
diff --git a/src/test/osd/osd-reactivate.sh b/src/test/osd/osd-reactivate.sh
new file mode 100755
index 0000000..9bc2933
--- /dev/null
+++ b/src/test/osd/osd-reactivate.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+#
+# Author: Vicente Cheng <freeze.bilsted at gmail.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Library Public License for more details.
+#
+
+source ../qa/workunits/ceph-helpers.sh
+
+function run() {
+    local dir=$1
+    shift
+
+    export CEPH_MON="127.0.0.1:7122" # git grep '\<7122\>' : there must be only one
+    export CEPH_ARGS
+    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
+    CEPH_ARGS+="--mon-host=$CEPH_MON "
+
+    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
+    for func in $funcs ; do
+        setup $dir || return 1
+        $func $dir || return 1
+        teardown $dir || return 1
+    done
+}
+
+function TEST_reactivate() {
+    local dir=$1
+
+    run_mon $dir a || return 1
+    run_osd $dir 0 || return 1
+
+    kill_daemons $dir TERM osd || return 1
+
+    ready_path=$dir"/0/ready"
+    activate_path=$dir"/0/active"
+    # trigger mkfs again
+    rm -rf $ready_path $activate_path
+    activate_osd $dir 0 || return 1
+
+}
+
+main osd-reactivate "$@"
+
+# Local Variables:
+# compile-command: "cd ../.. ; make -j4 && test/osd/osd-reactivate.sh"
+# End:
diff --git a/src/test/osd/osd-scrub-repair.sh b/src/test/osd/osd-scrub-repair.sh
index 13fac7c..ca32997 100755
--- a/src/test/osd/osd-scrub-repair.sh
+++ b/src/test/osd/osd-scrub-repair.sh
@@ -20,7 +20,7 @@ function run() {
     local dir=$1
     shift
 
-    export CEPH_MON="127.0.0.1:7107"
+    export CEPH_MON="127.0.0.1:7107" # git grep '\<7107\>' : there must be only one
     export CEPH_ARGS
     CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
     CEPH_ARGS+="--mon-host=$CEPH_MON "
@@ -144,6 +144,49 @@ function corrupt_and_repair_erasure_coded() {
 
 }
 
+function TEST_auto_repair_erasure_coded() {
+    local dir=$1
+    local poolname=ecpool
+
+    # Launch a cluster with a 5-second scrub interval
+    setup $dir || return 1
+    run_mon $dir a || return 1
+    for id in $(seq 0 2) ; do
+        run_osd $dir $id \
+            --osd-scrub-auto-repair=true \
+            --osd-deep-scrub-interval=5 \
+            --osd-scrub-max-interval=5 \
+            --osd-scrub-min-interval=5 \
+            --osd-scrub-interval-randomize-ratio=0
+    done
+    wait_for_clean || return 1
+
+    # Create an EC pool
+    ceph osd erasure-code-profile set myprofile \
+        k=2 m=1 ruleset-failure-domain=osd || return 1
+    ceph osd pool create $poolname 8 8 erasure myprofile || return 1
+    wait_for_clean || return 1
+
+    # Put an object
+    local payload=ABCDEF
+    echo $payload > $dir/ORIGINAL
+    rados --pool $poolname put SOMETHING $dir/ORIGINAL || return 1
+
+    # Remove the object from one shard physically
+    objectstore_tool $dir $(get_not_primary $poolname SOMETHING) SOMETHING remove || return 1
+
+    # Give some time for auto repair
+    sleep 20
+
+    # Verify - the file should be back
+    objectstore_tool $dir $(get_not_primary $poolname SOMETHING) SOMETHING list-attrs || return 1
+    rados --pool $poolname get SOMETHING $dir/COPY || return 1
+    diff $dir/ORIGINAL $dir/COPY || return 1
+
+    # Tear down
+    teardown $dir || return 1
+}
+
 function TEST_corrupt_and_repair_jerasure() {
     local dir=$1
     local poolname=ecpool
diff --git a/src/test/osd/osd-scrub-snaps.sh b/src/test/osd/osd-scrub-snaps.sh
new file mode 100755
index 0000000..9819c04
--- /dev/null
+++ b/src/test/osd/osd-scrub-snaps.sh
@@ -0,0 +1,227 @@
+#! /bin/bash
+#
+# Copyright (C) 2015 Red Hat <contact at redhat.com>
+#
+# Author: David Zafman <dzafman at redhat.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Library Public License for more details.
+#
+source ../qa/workunits/ceph-helpers.sh
+
+function run() {
+    local dir=$1
+    shift
+
+    export CEPH_MON="127.0.0.1:7121" # git grep '\<7121\>' : there must be only one
+    export CEPH_ARGS
+    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
+    CEPH_ARGS+="--mon-host=$CEPH_MON "
+
+    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
+    for func in $funcs ; do
+        $func $dir || return 1
+    done
+}
+
+function TEST_scrub_snaps() {
+    local dir=$1
+    local poolname=test
+
+    TESTDATA="testdata.$$"
+
+    setup $dir || return 1
+    run_mon $dir a --osd_pool_default_size=1 || return 1
+    run_osd $dir 0 || return 1
+
+    wait_for_clean || return 1
+
+    # Create a pool with a single pg
+    ceph osd pool create $poolname 1 1
+    poolid=$(ceph osd dump | grep "^pool.*[']test[']" | awk '{ print $2 }')
+
+    dd if=/dev/urandom of=$TESTDATA bs=1032 count=1
+    for i in `seq 1 14`
+    do
+        rados -p $poolname put obj${i} $TESTDATA
+    done
+
+    SNAP=1
+    rados -p $poolname mksnap snap${SNAP}
+    dd if=/dev/urandom of=$TESTDATA bs=256 count=${SNAP}
+    rados -p $poolname put obj1 $TESTDATA
+    rados -p $poolname put obj5 $TESTDATA
+    rados -p $poolname put obj3 $TESTDATA
+    for i in `seq 6 14`
+     do rados -p $poolname put obj${i} $TESTDATA
+    done
+
+    SNAP=2
+    rados -p $poolname mksnap snap${SNAP}
+    dd if=/dev/urandom of=$TESTDATA bs=256 count=${SNAP}
+    rados -p $poolname put obj5 $TESTDATA
+
+    SNAP=3
+    rados -p $poolname mksnap snap${SNAP}
+    dd if=/dev/urandom of=$TESTDATA bs=256 count=${SNAP}
+    rados -p $poolname put obj3 $TESTDATA
+
+    SNAP=4
+    rados -p $poolname mksnap snap${SNAP}
+    dd if=/dev/urandom of=$TESTDATA bs=256 count=${SNAP}
+    rados -p $poolname put obj5 $TESTDATA
+    rados -p $poolname put obj2 $TESTDATA
+
+    SNAP=5
+    rados -p $poolname mksnap snap${SNAP}
+    SNAP=6
+    rados -p $poolname mksnap snap${SNAP}
+    dd if=/dev/urandom of=$TESTDATA bs=256 count=${SNAP}
+    rados -p $poolname put obj5 $TESTDATA
+
+    SNAP=7
+    rados -p $poolname mksnap snap${SNAP}
+
+    rados -p $poolname rm obj4
+    rados -p $poolname rm obj2
+
+    kill_daemons $dir KILL osd || return 1
+    sleep 5
+
+    # No need for the ceph_objectstore_tool helper function because the osd is stopped
+
+    JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj1 | grep \"snapid\":-2)"
+    ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" remove
+
+    JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj5 | grep \"snapid\":2)"
+    ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" remove
+
+    JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj5 | grep \"snapid\":1)"
+    OBJ5SAVE="$JSON"
+    ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" remove
+
+    JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj5 | grep \"snapid\":4)"
+    dd if=/dev/urandom of=$TESTDATA bs=256 count=18
+    ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" set-bytes $TESTDATA
+
+    JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj3 | grep \"snapid\":-2)"
+    dd if=/dev/urandom of=$TESTDATA bs=256 count=15
+    ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" set-bytes $TESTDATA
+
+    JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj4 | grep \"snapid\":7)"
+    ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" remove
+
+    JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj2 | grep \"snapid\":-1)"
+    ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" rm-attr snapset
+
+    # Create a clone which isn't in snapset and doesn't have object info
+    JSON="$(echo "$OBJ5SAVE" | sed s/snapid\":1/snapid\":7/)"
+    dd if=/dev/urandom of=$TESTDATA bs=256 count=7
+    ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" set-bytes $TESTDATA
+
+    rm -f $TESTDATA
+
+    JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj6 | grep \"snapid\":-2)"
+    ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" clear-snapset
+    JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj7 | grep \"snapid\":-2)"
+    ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" clear-snapset corrupt
+    JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj8 | grep \"snapid\":-2)"
+    ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" clear-snapset seq
+    JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj9 | grep \"snapid\":-2)"
+    ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" clear-snapset clone_size
+    JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj10 | grep \"snapid\":-2)"
+    ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" clear-snapset clone_overlap
+    JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj11 | grep \"snapid\":-2)"
+    ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" clear-snapset clones
+    JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj12 | grep \"snapid\":-2)"
+    ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" clear-snapset head
+    JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj13 | grep \"snapid\":-2)"
+    ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" clear-snapset snaps
+    JSON="$(ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal --op list obj14 | grep \"snapid\":-2)"
+    ceph-objectstore-tool --data-path $dir/0 --journal-path $dir/0/journal "$JSON" clear-snapset size
+
+    run_osd $dir 0 || return 1
+    wait_for_clean || return 1
+
+    sleep 5
+    ceph pg scrub ${poolid}.0
+    timeout 30 ceph -w
+
+    for i in `seq 1 7`
+    do
+        rados -p $poolname rmsnap snap$i
+    done
+
+    sleep 10
+
+    ERRORS=0
+
+    pidfile=$(find $dir 2>/dev/null | grep $name_prefix'[^/]*\.pid')
+    pid=$(cat $pidfile)
+    if ! kill -0 $pid
+    then
+        echo "OSD crash occurred"
+        tail -100 $dir/osd.0.log
+        ERRORS=$(expr $ERRORS + 1)
+    fi
+
+    kill_daemons $dir || return 1
+
+    declare -a err_strings
+    err_strings[0]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 [0-9]*/2acecc8b/obj10/1 is missing in clone_overlap"
+    err_strings[1]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 [0-9]*/666934a3/obj5/7 no '_' attr"
+    err_strings[2]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 [0-9]*/666934a3/obj5/7 is an unexpected clone"
+    err_strings[3]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 [0-9]*/666934a3/obj5/4 on disk size [(]4608[)] does not match object info size [(]512[)] adjusted for ondisk to [(]512[)]"
+    err_strings[4]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 [0-9]*/666934a3/obj5/head expected clone [0-9]*/666934a3/obj5/2"
+    err_strings[5]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 [0-9]*/666934a3/obj5/head expected clone [0-9]*/666934a3/obj5/1"
+    err_strings[6]="log_channel[(]cluster[)] log [[]INF[]] : scrub [0-9]*[.]0 [0-9]*/666934a3/obj5/head 2 missing clone[(]s[)]"
+    err_strings[7]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 [0-9]*/d3a9faf5/obj12/head snapset.head_exists=false, but head exists"
+    err_strings[8]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 [0-9]*/8df7eaa5/obj8/head snaps.seq not set"
+    err_strings[9]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 [0-9]*/5c889059/obj7/head snapset.head_exists=false, but head exists"
+    err_strings[10]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 [0-9]*/5c889059/obj7/1 is an unexpected clone"
+    err_strings[11]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 [0-9]*/61f68bb1/obj3/head on disk size [(]3840[)] does not match object info size [(]768[)] adjusted for ondisk to [(]768[)]"
+    err_strings[12]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 [0-9]*/83425cc4/obj6/1 is an unexpected clone"
+    err_strings[13]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 [0-9]*/3f1ee208/obj2/snapdir no 'snapset' attr"
+    err_strings[14]="log_channel[(]cluster[)] log [[]INF[]] : scrub [0-9]*[.]0 [0-9]*/3f1ee208/obj2/7 clone ignored due to missing snapset"
+    err_strings[15]="log_channel[(]cluster[)] log [[]INF[]] : scrub [0-9]*[.]0 [0-9]*/3f1ee208/obj2/4 clone ignored due to missing snapset"
+    err_strings[16]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 [0-9]*/a8759770/obj4/snapdir expected clone [0-9]*/a8759770/obj4/7"
+    err_strings[17]="log_channel[(]cluster[)] log [[]INF[]] : scrub [0-9]*[.]0 [0-9]*/a8759770/obj4/snapdir 1 missing clone[(]s[)]"
+    err_strings[18]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 [0-9]*/6cf8deff/obj1/1 is an unexpected clone"
+    err_strings[19]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 [0-9]*/e478ac7f/obj9/1 is missing in clone_size"
+    err_strings[20]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 [0-9]*/29547577/obj11/1 is an unexpected clone"
+    err_strings[21]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 [0-9]*/94122507/obj14/1 size 1032 != clone_size 1033"
+    err_strings[22]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 scrub 19 errors"
+
+    for i in `seq 0 $(expr ${#err_strings[@]} - 1)`
+    do
+        if ! grep "${err_strings[$i]}" $dir/osd.0.log > /dev/null;
+        then
+            echo "Missing log message '${err_strings[$i]}'"
+            ERRORS=$(expr $ERRORS + 1)
+        fi
+    done
+
+    teardown $dir || return 1
+
+    if [ $ERRORS != "0" ];
+    then
+        echo "TEST FAILED WITH $ERRORS ERRORS"
+        return 1
+    fi
+
+    echo "TEST PASSED"
+    return 0
+}
+
+main osd-scrub-snaps "$@"
+
+# Local Variables:
+# compile-command: "cd ../.. ; make -j4 && \
+#    test/osd/osd-scrub-snaps.sh"
diff --git a/src/test/osdc/FakeWriteback.cc b/src/test/osdc/FakeWriteback.cc
index 69f3299..32ae4f5 100644
--- a/src/test/osdc/FakeWriteback.cc
+++ b/src/test/osdc/FakeWriteback.cc
@@ -74,7 +74,7 @@ ceph_tid_t FakeWriteback::write(const object_t& oid,
 			   const SnapContext& snapc,
 			   const bufferlist &bl, utime_t mtime,
 			   uint64_t trunc_size, __u32 trunc_seq,
-			   Context *oncommit)
+			   ceph_tid_t journal_tid, Context *oncommit)
 {
   C_Delay *wrapper = new C_Delay(m_cct, oncommit, m_lock, off, NULL, m_delay_ns);
   m_finisher->queue(wrapper, 0);
diff --git a/src/test/osdc/FakeWriteback.h b/src/test/osdc/FakeWriteback.h
index 9b9598e..351d521 100644
--- a/src/test/osdc/FakeWriteback.h
+++ b/src/test/osdc/FakeWriteback.h
@@ -26,7 +26,8 @@ public:
 		           uint64_t off, uint64_t len,
 			   const SnapContext& snapc, const bufferlist &bl,
 			   utime_t mtime, uint64_t trunc_size,
-			   __u32 trunc_seq, Context *oncommit);
+			   __u32 trunc_seq, ceph_tid_t journal_tid,
+                           Context *oncommit);
 
   virtual bool may_copy_on_write(const object_t&, uint64_t, uint64_t, snapid_t);
 private:
diff --git a/src/test/osdc/object_cacher_stress.cc b/src/test/osdc/object_cacher_stress.cc
index 39aabfd..7b04d4a 100644
--- a/src/test/osdc/object_cacher_stress.cc
+++ b/src/test/osdc/object_cacher_stress.cc
@@ -72,6 +72,7 @@ int stress_test(uint64_t num_ops, uint64_t num_objs,
   SnapContext snapc;
   ceph::buffer::ptr bp(max_op_len);
   ceph::bufferlist bl;
+  uint64_t journal_tid = 0;
   bp.zero();
   bl.append(bp);
 
@@ -109,7 +110,8 @@ int stress_test(uint64_t num_ops, uint64_t num_objs,
       else
 	assert(r == 0);
     } else {
-      ObjectCacher::OSDWrite *wr = obc.prepare_write(snapc, bl, utime_t(), 0);
+      ObjectCacher::OSDWrite *wr = obc.prepare_write(snapc, bl, utime_t(), 0,
+                                                     ++journal_tid);
       wr->extents.push_back(op->extent);
       lock.Lock();
       obc.writex(wr, &object_set, NULL);
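
Both hunks above reflect the same interface change: the writeback path now carries a journal transaction id, so WritebackHandler::write() and ObjectCacher::prepare_write() each gained a ceph_tid_t journal_tid parameter. A minimal sketch of the new call site, assuming a caller that already owns the cache, object set, and lock (queue_write itself is illustrative):

    #include "osdc/ObjectCacher.h"
    #include "common/Mutex.h"

    // Queue one buffered write, tagging it with a fresh journal tid (the
    // new fifth argument to prepare_write) so flushes can be correlated
    // with journal entries.
    void queue_write(ObjectCacher& obc, ObjectCacher::ObjectSet& oset,
                     Mutex& lock, const SnapContext& snapc,
                     const ObjectExtent& extent, bufferlist& bl,
                     ceph_tid_t& journal_tid) {
      ObjectCacher::OSDWrite *wr =
          obc.prepare_write(snapc, bl, utime_t(), 0, ++journal_tid);
      wr->extents.push_back(extent);
      lock.Lock();
      obc.writex(wr, &oset, NULL);
      lock.Unlock();
    }
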
diff --git a/src/test/pybind/test_ceph_argparse.py b/src/test/pybind/test_ceph_argparse.py
index b0f608d..fd44977 100755
--- a/src/test/pybind/test_ceph_argparse.py
+++ b/src/test/pybind/test_ceph_argparse.py
@@ -103,9 +103,6 @@ class TestPG(TestArgparse):
     def test_getmap(self):
         self.assert_valid_command(['pg', 'getmap'])
 
-    def test_send_pg_creates(self):
-        self.assert_valid_command(['pg', 'send_pg_creates'])
-
     def test_dump(self):
         self.assert_valid_command(['pg', 'dump'])
         self.assert_valid_command(['pg', 'dump',
diff --git a/src/test/python/ceph-disk/tests/test_ceph_disk.py b/src/test/python/ceph-disk/tests/test_ceph_disk.py
index a150dd3..b0c818c 100644
--- a/src/test/python/ceph-disk/tests/test_ceph_disk.py
+++ b/src/test/python/ceph-disk/tests/test_ceph_disk.py
@@ -1,4 +1,8 @@
 from mock import patch, DEFAULT, Mock
+import os
+import io
+import subprocess
+import unittest
 import argparse
 import pytest
 import ceph_disk
@@ -136,7 +140,11 @@ class TestCephDisk(object):
                     'ptype': ptype,
                     'state': 'prepared',
                 }
-                out = ceph_disk.list_format_dev_plain(dev, devices)
+                with patch.multiple(
+                        ceph_disk,
+                        list_devices=lambda path: devices,
+                        ):
+                    out = ceph_disk.list_format_dev_plain(dev, devices)
                 assert 'data' in out
                 assert 'dmcrypt' in out
                 assert type in out
@@ -638,3 +646,758 @@ class TestCephDisk(object):
                        'ptype': 'unknown',
                        'type': 'other'}]
             assert expect == ceph_disk.list_devices(args)
+
+class TestCephDiskDeactivateAndDestroy(unittest.TestCase):
+
+    def setup_class(self):
+        ceph_disk.setup_logging(verbose=True, log_stdout=False)
+
+    @patch('__builtin__.open')
+    def test_main_deactivate(self, mock_open):
+        DMCRYPT_OSD_UUID = '4fbd7e29-9d25-41b8-afd0-5ec00ceff05d'
+        DMCRYPT_LUKS_OSD_UUID = '4fbd7e29-9d25-41b8-afd0-35865ceff05d'
+        part_uuid = '0ce28a16-6d5d-11e5-aec3-fa163e5c167b'
+        disk = 'sdX'
+        cluster = 'ceph'
+        #
+        # Cannot find a matching device by osd-id
+        #
+        args = ceph_disk.parse_args(['deactivate', \
+                                     '--cluster', 'ceph', \
+                                     '--deactivate-by-id', '5566'])
+        fake_device = [{'path': '/dev/' + disk,
+                          'partitions': [{
+                                  'path': '/dev/sdX1',
+                                  'whoami': '-1',
+                                  }]}]
+        with patch.multiple(
+                ceph_disk,
+                list_devices=lambda dev:fake_device,
+                ):
+            self.assertRaises(Exception, ceph_disk.main_deactivate, args)
+
+        #
+        # find matching device by osd-id, status: OSD_STATUS_IN_DOWN
+        # with --mark-out option
+        #
+        args = ceph_disk.parse_args(['deactivate', \
+                                     '--cluster', 'ceph', \
+                                     '--deactivate-by-id', '5566', \
+                                     '--mark-out'])
+        fake_device = [{'path': '/dev/' + disk,
+                          'partitions': [{
+                                  'ptype': DMCRYPT_LUKS_OSD_UUID,
+                                  'path': '/dev/sdX1',
+                                  'whoami': '5566',
+                                  'mount': '/var/lib/ceph/osd/ceph-5566/',
+                                  'uuid': part_uuid,
+                                  }]}]
+        with patch.multiple(
+                ceph_disk,
+                list_devices=lambda dev:fake_device,
+                _check_osd_status=lambda cluster, osd_id: 2,
+                _mark_osd_out=lambda cluster, osd_id: True
+                ):
+            ceph_disk.main_deactivate(args)
+
+        #
+        # find matching device by device partition, status: OSD_STATUS_IN_DOWN
+        #
+        args = ceph_disk.parse_args(['deactivate', \
+                                     '--cluster', 'ceph', \
+                                     '/dev/sdX1'])
+        fake_device = [{'path': '/dev/' + disk,
+                          'partitions': [{
+                                  'ptype': DMCRYPT_LUKS_OSD_UUID,
+                                  'path': '/dev/sdX1',
+                                  'whoami': '5566',
+                                  'mount': '/var/lib/ceph/osd/ceph-5566/',
+                                  'uuid': part_uuid,
+                                  }]}]
+        with patch.multiple(
+                ceph_disk,
+                list_devices=lambda dev:fake_device,
+                _check_osd_status=lambda cluster, osd_id: 0,
+                ):
+            ceph_disk.main_deactivate(args)
+
+        #
+        # find matching device by device partition, status: OSD_STATUS_IN_UP
+        # with --mark-out option
+        #
+        args = ceph_disk.parse_args(['deactivate', \
+                                     '--cluster', 'ceph', \
+                                     '/dev/sdX1', \
+                                     '--mark-out'])
+        fake_device = [{'path': '/dev/' + disk,
+                          'partitions': [{
+                                  'ptype': DMCRYPT_LUKS_OSD_UUID,
+                                  'path': '/dev/sdX1',
+                                  'whoami': '5566',
+                                  'mount': '/var/lib/ceph/osd/ceph-5566/',
+                                  'uuid': part_uuid,
+                                  }]}]
+
+        # mock the file open.
+        file_opened = io.StringIO()
+        file_opened.write(u'deactive')
+        mock_open.return_value = file_opened
+
+        with patch.multiple(
+                ceph_disk,
+                mock_open,
+                list_devices=lambda dev:fake_device,
+                _check_osd_status=lambda cluster, osd_id: 3,
+                _mark_osd_out=lambda cluster, osd_id: True,
+                stop_daemon=lambda cluster, osd_id: True,
+                _remove_osd_directory_files=lambda path, cluster: True,
+                path_set_context=lambda path: True,
+                unmount=lambda path: True,
+                dmcrypt_unmap=lambda part_uuid: True,
+                ):
+            ceph_disk.main_deactivate(args)
+
+        #
+        # find matching device by osd-id, status: OSD_STATUS_OUT_UP
+        #
+        args = ceph_disk.parse_args(['deactivate', \
+                                     '--cluster', 'ceph', \
+                                     '--deactivate-by-id', '5566'])
+        fake_device = [{'path': '/dev/' + disk,
+                          'partitions': [{
+                                  'ptype': DMCRYPT_LUKS_OSD_UUID,
+                                  'path': '/dev/sdX1',
+                                  'whoami': '5566',
+                                  'mount': '/var/lib/ceph/osd/ceph-5566/',
+                                  'uuid': part_uuid,
+                                  }]}]
+
+        # mock the file open.
+        file_opened = io.StringIO()
+        file_opened.write(u'deactive')
+        mock_open.return_value = file_opened
+
+        with patch.multiple(
+                ceph_disk,
+                mock_open,
+                list_devices=lambda dev:fake_device,
+                _check_osd_status=lambda cluster, osd_id: 1,
+                _mark_osd_out=lambda cluster, osd_id: True,
+                stop_daemon=lambda cluster, osd_id: True,
+                _remove_osd_directory_files=lambda path, cluster: True,
+                path_set_context=lambda path: True,
+                unmount=lambda path: True,
+                dmcrypt_unmap=lambda part_uuid: True,
+                ):
+            ceph_disk.main_deactivate(args)
+
+    def test_mark_out_out(self):
+        dev = {
+            'cluster': 'ceph',
+            'osd_id': '5566',
+        }
+
+        def mark_osd_out_fail(osd_id):
+            raise ceph_disk.Error('Could not find osd.%s, is it a valid/existing osd id?' % osd_id)
+
+        with patch.multiple(
+                ceph_disk,
+                command=mark_osd_out_fail,
+                ):
+            self.assertRaises(Exception, ceph_disk._mark_osd_out, 'ceph', '5566')
+
+    def test_check_osd_status(self):
+        #
+        # command failure
+        #
+        with patch.multiple(
+                ceph_disk,
+                command=raise_command_error,
+                ):
+            self.assertRaises(Exception, ceph_disk._check_osd_status, 'ceph', '5566')
+
+        #
+        # osd not found
+        #
+
+        fake_data = '{"osds":[{"osd":0,"up":1,"in":1},{"osd":1,"up":1,"in":1}]}'
+
+        def return_fake_value(cmd):
+            return fake_data, 0
+
+        with patch.multiple(
+                ceph_disk,
+                command=return_fake_value,
+                ):
+            self.assertRaises(Exception, ceph_disk._check_osd_status, 'ceph', '5566')
+
+        #
+        # successfully
+        #
+
+        fake_data = '{"osds":[{"osd":0,"up":1,"in":1},{"osd":5566,"up":1,"in":1}]}'
+
+        def return_fake_value(cmd):
+            return fake_data, 0
+
+        with patch.multiple(
+                ceph_disk,
+                command=return_fake_value,
+                ):
+            ceph_disk._check_osd_status('ceph', '5566')
+
+    def test_stop_daemon(self):
+        STATEDIR = '/var/lib/ceph'
+        cluster = 'ceph'
+        osd_id = '5566'
+
+        def stop_daemon_fail(cmd):
+            raise Exception('ceph osd stop failed')
+
+        #
+        # fail on init type
+        #
+        with patch('os.path.exists', return_value=False):
+            self.assertRaises(Exception, ceph_disk.stop_daemon, 'ceph', '5566')
+
+        #
+        # fail on os path
+        #
+        with patch('os.path.exists', return_value=Exception):
+            self.assertRaises(Exception, ceph_disk.stop_daemon, 'ceph', '5566')
+
+        #
+        # upstart failure
+        #
+        fake_path = (STATEDIR + '/osd/{cluster}-{osd_id}/upstart').format(
+                    cluster=cluster, osd_id=osd_id)
+
+        def path_exist(check_path):
+            if check_path == fake_path:
+                return True
+            else:
+                return False
+
+        patcher = patch('os.path.exists')
+        check_path = patcher.start()
+        check_path.side_effect = path_exist
+        with patch.multiple(
+                ceph_disk,
+                check_path,
+                command_check_call=stop_daemon_fail,
+                ):
+            self.assertRaises(Exception, ceph_disk.stop_daemon, 'ceph', '5566')
+
+        #
+        # sysvinit failure
+        #
+        fake_path = (STATEDIR + '/osd/{cluster}-{osd_id}/sysvinit').format(
+                    cluster=cluster, osd_id=osd_id)
+
+        def path_exist(check_path):
+            if check_path == fake_path:
+                return True
+            else:
+                return False
+
+        patcher = patch('os.path.exists')
+        check_path = patcher.start()
+        check_path.side_effect = path_exist
+        with patch.multiple(
+                ceph_disk,
+                check_path,
+                which=lambda name: True,
+                command_check_call=stop_daemon_fail,
+                ):
+            self.assertRaises(Exception, ceph_disk.stop_daemon, 'ceph', '5566')
+
+        #
+        # systemd failure
+        #
+        fake_path = (STATEDIR + '/osd/{cluster}-{osd_id}/systemd').format(
+                    cluster=cluster, osd_id=osd_id)
+
+        def path_exist(check_path):
+            if check_path == fake_path:
+                return True
+            else:
+                return False
+
+        def stop_daemon_fail(cmd):
+            if 'stop' in cmd:
+                raise Exception('ceph osd stop failed')
+            else:
+                return True
+
+        patcher = patch('os.path.exists')
+        check_path = patcher.start()
+        check_path.side_effect = path_exist
+        with patch.multiple(
+                ceph_disk,
+                check_path,
+                command_check_call=stop_daemon_fail,
+                ):
+            self.assertRaises(Exception, ceph_disk.stop_daemon, 'ceph', '5566')
+
+    def test_remove_osd_directory_files(self):
+        cluster = 'ceph'
+        mounted_path = 'somewhere'
+        fake_path_2 = None
+        fake_path_remove_2 = None
+        fake_path_remove_init = None
+
+        def handle_path_exist(check_path):
+            if check_path == fake_path:
+                return True
+            elif fake_path_2 and check_path == fake_path_2:
+                return True
+            else:
+                return False
+
+        def handle_path_remove(remove_path):
+            if remove_path == fake_path_remove:
+                return True
+            elif fake_path_remove_2 and remove_path == fake_path_remove_2:
+                return True
+            elif fake_path_remove_init and remove_path == fake_path_remove_init:
+                return True
+            else:
+                raise OSError
+
+        #
+        # remove ready file failure
+        #
+        fake_path = os.path.join(mounted_path, 'ready')
+        fake_path_remove = os.path.join(mounted_path, 'no_ready')
+
+        patcher_exist = patch('os.path.exists')
+        patcher_remove = patch('os.remove')
+        path_exist = patcher_exist.start()
+        path_remove = patcher_remove.start()
+        path_exist.side_effect = handle_path_exist
+        path_remove.side_effect = handle_path_remove
+        with patch.multiple(
+                ceph_disk,
+                path_exist,
+                path_remove,
+                get_conf=lambda cluster, **kwargs: True,
+                ):
+            self.assertRaises(Exception, ceph_disk._remove_osd_directory_files, 'somewhere', cluster)
+
+        #
+        # remove active file failure
+        #
+        fake_path = os.path.join(mounted_path, 'ready')
+        fake_path_2 = os.path.join(mounted_path, 'active')
+        fake_path_remove = os.path.join(mounted_path, 'ready')
+        fake_path_remove_2 = os.path.join(mounted_path, 'no_active')
+
+        patcher_exist = patch('os.path.exists')
+        patcher_remove = patch('os.remove')
+        path_exist = patcher_exist.start()
+        path_remove = patcher_remove.start()
+        path_exist.side_effect = handle_path_exist
+        path_remove.side_effect = handle_path_remove
+        with patch.multiple(
+                ceph_disk,
+                path_exist,
+                path_remove,
+                get_conf=lambda cluster, **kwargs: True,
+                ):
+            self.assertRaises(Exception, ceph_disk._remove_osd_directory_files, 'somewhere', cluster)
+
+        #
+        # conf_val is None and remove init file failure
+        #
+        fake_path = os.path.join(mounted_path, 'ready')
+        fake_path_2 = os.path.join(mounted_path, 'active')
+        fake_path_remove = os.path.join(mounted_path, 'ready')
+        fake_path_remove_2 = os.path.join(mounted_path, 'active')
+        fake_path_remove_init = os.path.join(mounted_path, 'init_failure')
+
+        patcher_exist = patch('os.path.exists')
+        patcher_remove = patch('os.remove')
+        path_exist = patcher_exist.start()
+        path_remove = patcher_remove.start()
+        path_exist.side_effect = handle_path_exist
+        path_remove.side_effect = handle_path_remove
+        with patch.multiple(
+                ceph_disk,
+                path_exist,
+                path_remove,
+                get_conf=lambda cluster, **kwargs: None,
+                init_get=lambda: 'upstart',
+                ):
+            self.assertRaises(Exception, ceph_disk._remove_osd_directory_files, 'somewhere', cluster)
+
+        #
+        # `ready` and `active` already removed; remove the init file successfully
+        #
+        fake_path = os.path.join(mounted_path, 'no_ready')
+        fake_path_2 = os.path.join(mounted_path, 'no_active')
+        fake_path_remove = os.path.join(mounted_path, 'upstart')
+
+        patcher_exist = patch('os.path.exists')
+        patcher_remove = patch('os.remove')
+        path_exist = patcher_exist.start()
+        path_remove = patcher_remove.start()
+        path_exist.side_effect = handle_path_exist
+        path_remove.side_effect = handle_path_remove
+        with patch.multiple(
+                ceph_disk,
+                path_exist,
+                path_remove,
+                get_conf=lambda cluster, **kwargs: 'upstart',
+                ):
+            ceph_disk._remove_osd_directory_files('somewhere', cluster)
+
+    def test_path_set_context(self):
+        path = '/somewhere'
+        with patch.multiple(
+                ceph_disk,
+                get_ceph_user=lambda **kwargs: 'ceph',
+                ):
+            ceph_disk.path_set_context(path)
+
+    def test_mount(self):
+        #
+        # dev is None, nothing to mount
+        #
+        dev = None
+        fs_type = 'ext4'
+        option = ''
+        self.assertRaises(Exception, ceph_disk.mount, dev, fs_type, option)
+
+        #
+        # fs_type undefined
+        #
+        dev = '/dev/Xda1'
+        fs_type = None
+        option = ''
+        self.assertRaises(Exception, ceph_disk.mount, dev, fs_type, option)
+
+        #
+        # mount failure
+        #
+        dev = '/dev/Xda1'
+        fstype = 'ext4'
+        options = ''
+        with patch('tempfile.mkdtemp', return_value='/mnt'):
+            self.assertRaises(Exception, ceph_disk.mount, dev, fstype, options)
+
+        #
+        # mount successfully
+        #
+        def create_temp_directory(*args, **kwargs):
+            return '/mnt'
+
+        dev = '/dev/Xda1'
+        fstype = 'ext4'
+        options = ''
+        patcher = patch('tempfile.mkdtemp')
+        create_tmpdir = patcher.start()
+        create_tmpdir.side_effect = create_temp_directory
+        with patch.multiple(
+                ceph_disk,
+                command_check_call=lambda cmd: True,
+                ):
+            ceph_disk.mount(dev, fstype, options)
+
+    def test_umount(self):
+        #
+        # umount failure
+        #
+        path = '/somewhere'
+        self.assertRaises(Exception, ceph_disk.unmount, path)
+
+        #
+        # umount successfully
+        #
+        def remove_directory_successfully(path):
+            return True
+
+        path = '/somewhere'
+        patcher = patch('os.rmdir')
+        rm_directory = patcher.start()
+        rm_directory.side_effect = remove_directory_successfully
+        with patch.multiple(
+                ceph_disk,
+                command_check_call=lambda cmd: True,
+                ):
+            ceph_disk.unmount(path)
+
+    def test_main_destroy(self):
+        DMCRYPT_OSD_UUID = '4fbd7e29-9d25-41b8-afd0-5ec00ceff05d'
+        DMCRYPT_LUKS_OSD_UUID = '4fbd7e29-9d25-41b8-afd0-35865ceff05d'
+        OSD_UUID = '4fbd7e29-9d25-41b8-afd0-062c0ceff05d'
+        MPATH_OSD_UUID = '4fbd7e29-8ae0-4982-bf9d-5a8d867af560'
+        part_uuid = '0ce28a16-6d5d-11e5-aec3-fa163e5c167b'
+        journal_uuid = "7ad5e65a-0ca5-40e4-a896-62a74ca61c55"
+        cluster = 'ceph'
+
+        fake_devices_normal = [{'path': '/dev/sdY',
+                                'partitions': [{
+                                    'dmcrypt': {},
+                                    'ptype': OSD_UUID,
+                                    'path': '/dev/sdY1',
+                                    'whoami': '5566',
+                                    'mount': '/var/lib/ceph/osd/ceph-5566/',
+                                    'uuid': part_uuid,
+                                    'journal_uuid': journal_uuid}]},
+                               {'path': '/dev/sdX',
+                                'partitions': [{
+                                    'dmcrypt': {},
+                                    'ptype': MPATH_OSD_UUID,
+                                    'path': '/dev/sdX1',
+                                    'whoami': '7788',
+                                    'mount': '/var/lib/ceph/osd/ceph-7788/',
+                                    'uuid': part_uuid,
+                                    'journal_uuid': journal_uuid}]}
+                               ]
+        fake_devices_dmcrypt_unmap = [{'path': '/dev/sdY',
+                                        'partitions': [{
+                                        'dmcrypt': {
+                                            'holders': '',
+                                            'type': type,
+                                        },
+                                        'ptype': DMCRYPT_OSD_UUID,
+                                        'path': '/dev/sdX1',
+                                        'whoami': '5566',
+                                        'mount': '/var/lib/ceph/osd/ceph-5566/',
+                                        'uuid': part_uuid,
+                                        'journal_uuid': journal_uuid}]}]
+        fake_devices_dmcrypt_luks_unmap = [{'path': '/dev/sdY',
+                                            'partitions': [{
+                                                'dmcrypt': {
+                                                    'holders': '',
+                                                    'type': type,
+                                                },
+                                                'ptype': DMCRYPT_LUKS_OSD_UUID,
+                                                'path': '/dev/sdX1',
+                                                'whoami': '5566',
+                                                'mount': '/var/lib/ceph/osd/ceph-5566/',
+                                                'uuid': part_uuid,
+                                                'journal_uuid': journal_uuid}]}]
+        fake_devices_dmcrypt_unknown = [{'path': '/dev/sdY',
+                                         'partitions': [{
+                                             'dmcrypt': {
+                                                 'holders': '',
+                                                 'type': type,
+                                             },
+                                             'ptype': '00000000-0000-0000-0000-000000000000',
+                                             'path': '/dev/sdX1',
+                                             'whoami': '5566',
+                                             'mount': '/var/lib/ceph/osd/ceph-5566/',
+                                             'uuid': part_uuid,
+                                             'journal_uuid': journal_uuid}]}]
+        fake_devices_dmcrypt_map = [{'dmcrypt': {
+                                        'holders': 'dm_0',
+                                        'type': type,
+                                        },
+                                     'ptype': DMCRYPT_OSD_UUID,
+                                     'path': '/dev/sdX1',
+                                     'whoami': '5566',
+                                     'mount': '/var/lib/ceph/osd/ceph-5566/',
+                                     'uuid': part_uuid,
+                                     'journal_uuid': journal_uuid}]
+
+        def list_devices_return(dev):
+            if dev == []:
+                return fake_devices_normal
+
+        #
+        # input device is not a partition
+        #
+        args = ceph_disk.parse_args(['destroy',
+                                     '--cluster', 'ceph',
+                                     '/dev/sdX'])
+        with patch.multiple(
+                ceph_disk,
+                is_partition=lambda path: False,
+                ):
+            self.assertRaises(Exception, ceph_disk.main_destroy, args)
+
+        #
+        # skip the redundant devices; not found by dev
+        #
+        args = ceph_disk.parse_args(['destroy',
+                                     '--cluster', 'ceph',
+                                     '/dev/sdZ1'])
+        with patch.multiple(
+                ceph_disk,
+                is_partition=lambda path: True,
+                list_devices=list_devices_return,
+                ):
+            self.assertRaises(Exception, ceph_disk.main_destroy, args)
+
+        #
+        # skip the redundant devices; not found by osd-id
+        #
+        args = ceph_disk.parse_args(['destroy',
+                                     '--cluster', 'ceph',
+                                     '--destroy-by-id', '1234'])
+        with patch.multiple(
+                ceph_disk,
+                is_partition=lambda path: True,
+                list_devices=list_devices_return,
+                ):
+            self.assertRaises(Exception, ceph_disk.main_destroy, args)
+
+        #
+        # skip the redundant devices; found by dev
+        #
+        args = ceph_disk.parse_args(['destroy',
+                                     '--cluster', 'ceph',
+                                     '/dev/sdY1', '--zap'])
+        with patch.multiple(
+                ceph_disk,
+                is_partition=lambda path: True,
+                list_devices=list_devices_return,
+                get_partition_base=lambda dev_path: '/dev/sdY',
+                _check_osd_status=lambda cluster, osd_id: 0,
+                _remove_from_crush_map=lambda cluster, osd_id: True,
+                _delete_osd_auth_key=lambda cluster, osd_id: True,
+                _deallocate_osd_id=lambda cluster, osd_id: True,
+                zap=lambda dev: True
+                ):
+            ceph_disk.main_destroy(args)
+
+        #
+        # skip the redundant devices; found by osd-id
+        # with active status and MPATH_OSD
+        #
+        args = ceph_disk.parse_args(['destroy',
+                                     '--cluster', 'ceph',
+                                     '--destroy-by-id', '7788'])
+        with patch.multiple(
+                ceph_disk,
+                is_partition=lambda path: True,
+                list_devices=list_devices_return,
+                get_partition_base_mpath=lambda dev_path: '/dev/sdX',
+                _check_osd_status=lambda cluster, osd_id: 1,
+                ):
+            self.assertRaises(Exception, ceph_disk.main_destroy, args)
+
+        #
+        # skip the redundant devices; found by dev
+        # with dmcrypt (plain)
+        #
+        args = ceph_disk.parse_args(['destroy',
+                                     '--cluster', 'ceph',
+                                     '/dev/sdX1', '--zap'])
+        def list_devices_return(dev):
+            if dev == []:
+                return fake_devices_dmcrypt_unmap
+            elif dev == ['/dev/sdX1']:
+                return fake_devices_dmcrypt_map
+
+        with patch.multiple(
+                ceph_disk,
+                is_partition=lambda path: True,
+                list_devices=list_devices_return,
+                get_dmcrypt_key_path=lambda part_uuid, dmcrypt_key_dir, luks: True,
+                dmcrypt_map=lambda rawdev, keypath, _uuid,
+                                   cryptsetup_parameters, luks, format_dev: True,
+                dmcrypt_unmap=lambda part_uuid: True,
+                get_partition_base=lambda dev_path: '/dev/sdX',
+                _check_osd_status=lambda cluster, osd_id: 0,
+                _remove_from_crush_map=lambda cluster, osd_id: True,
+                _delete_osd_auth_key=lambda cluster, osd_id: True,
+                _deallocate_osd_id=lambda cluster, osd_id: True,
+                zap=lambda dev: True
+                ):
+            ceph_disk.main_destroy(args)
+
+        #
+        # skip the redundant devices; found by osd-id
+        # with dmcrypt (LUKS) and status: active
+        #
+        args = ceph_disk.parse_args(['destroy',
+                                     '--cluster', 'ceph',
+                                     '--destroy-by-id', '5566'])
+        def list_devices_return(dev):
+            if dev == []:
+                return fake_devices_dmcrypt_luks_unmap
+            elif dev == ['/dev/sdX1']:
+                return fake_devices_dmcrypt_map
+
+        with patch.multiple(
+                ceph_disk,
+                is_partition=lambda path: True,
+                list_devices=list_devices_return,
+                get_dmcrypt_key_path=lambda part_uuid, dmcrypt_key_dir, luks: True,
+                dmcrypt_map=lambda rawdev, keypath, _uuid,
+                                   cryptsetup_parameters, luks, format_dev: True,
+                dmcrypt_unmap=lambda part_uuid: True,
+                get_partition_base=lambda dev_path: '/dev/sdX',
+                _check_osd_status=lambda cluster, osd_id: 1,
+                ):
+            self.assertRaises(Exception, ceph_disk.main_destroy, args)
+
+        #
+        # skip the redundant devices; found by osd-id
+        # with unknown dmcrypt type
+        #
+        args = ceph_disk.parse_args(['destroy',
+                                     '--cluster', 'ceph',
+                                     '--destroy-by-id', '5566'])
+        def list_devices_return(dev):
+            if dev == []:
+                return fake_devices_dmcrypt_unknown
+
+        with patch.multiple(
+                ceph_disk,
+                is_partition=lambda path: True,
+                list_devices=list_devices_return,
+                ):
+            self.assertRaises(Exception, ceph_disk.main_destroy, args)
+
+    def test_remove_from_crush_map_fail(self):
+        cluster = 'ceph'
+        osd_id = '5566'
+        with patch.multiple(
+                ceph_disk,
+                command=raise_command_error
+                ):
+            self.assertRaises(Exception, ceph_disk._remove_from_crush_map, cluster, osd_id)
+
+    def test_delete_osd_auth_key_fail(self):
+        cluster = 'ceph'
+        osd_id = '5566'
+        with patch.multiple(
+                ceph_disk,
+                command=raise_command_error
+                ):
+            self.assertRaises(Exception, ceph_disk._delete_osd_auth_key, cluster, osd_id)
+
+    def test_deallocate_osd_id_fail(self):
+        cluster = 'ceph'
+        osd_id = '5566'
+        with patch.multiple(
+                ceph_disk,
+                command=raise_command_error
+                ):
+            self.assertRaises(Exception, ceph_disk._deallocate_osd_id, cluster, osd_id)
+
+
+##### Helper functions #####
+
+def raise_command_error(*args):
+    e = subprocess.CalledProcessError('aaa', 'bbb', 'ccc')
+    raise e
+
+def path_exists(target_paths=None):
+    """
+    A quick helper that fakes a check for the existence of a path. Since we
+    are dealing with fakes, we allow passing in a list of paths for which the
+    check returns True; any other path returns False.
+    """
+    target_paths = target_paths or []
+
+    def exists(path):
+        return path in target_paths
+    return exists
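
Note on the tests above: mock's patch.multiple accepts the attributes to
replace only as keyword arguments on the target module, and any object
(such as a lambda) can serve as the replacement. A minimal sketch of the
pattern, assuming the same `mock` library and `ceph_disk` import these
tests use:

    from mock import patch

    import ceph_disk

    # Replace attributes of ceph_disk for the duration of the with-block;
    # each keyword names an attribute, each value is its stand-in.
    with patch.multiple(
            ceph_disk,
            get_conf=lambda cluster, **kwargs: True,
            init_get=lambda: 'upstart',
            ):
        pass  # code under test sees the patched attributes here
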
diff --git a/src/test/run-rbd-unit-tests.sh b/src/test/run-rbd-unit-tests.sh
index 09edb41..7bd27ef 100755
--- a/src/test/run-rbd-unit-tests.sh
+++ b/src/test/run-rbd-unit-tests.sh
@@ -7,7 +7,7 @@ export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$CEPH_SRC/.libs"
 PATH="$CEPH_SRC:$PATH"
 
 unittest_librbd
-for i in 0 1 5 29 45
+for i in 0 1 5 29 109
 do
     RBD_FEATURES=$i unittest_librbd
 done
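
The loop values are librbd feature bitmasks. A rough decoding of the change,
assuming the feature-bit constants from librbd's public header (layering=1,
exclusive-lock=4, object-map=8, deep-flatten=32, journaling=64):

    # Hypothetical names; the numeric values are what matter.
    LAYERING = 1
    EXCLUSIVE_LOCK = 4
    OBJECT_MAP = 8
    DEEP_FLATTEN = 32
    JOURNALING = 64

    old = LAYERING | EXCLUSIVE_LOCK | OBJECT_MAP | DEEP_FLATTEN  # 45
    new = old | JOURNALING                                       # 109
    assert (old, new) == (45, 109)

So the update reruns unittest_librbd with journaling enabled on top of the
previously tested feature combination.
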
diff --git a/src/test/test_filejournal.cc b/src/test/test_filejournal.cc
index aaf6487..bd0ff52 100644
--- a/src/test/test_filejournal.cc
+++ b/src/test/test_filejournal.cc
@@ -12,6 +12,7 @@
 #include "include/Context.h"
 #include "common/Mutex.h"
 #include "common/safe_io.h"
+#include "os/JournalingObjectStore.h"
 
 Finisher *finisher;
 Cond sync_cond;
@@ -137,9 +138,11 @@ TEST(TestFileJournal, WriteSmall) {
     ASSERT_EQ(0, j.create());
     j.make_writeable();
 
+    list<ObjectStore::Transaction*> tls;
     bufferlist bl;
     bl.append("small");
-    j.submit_entry(1, bl, 0, new C_SafeCond(&wait_lock, &cond, &done));
+    int orig_len = j.prepare_entry(tls, &bl);
+    j.submit_entry(1, bl, orig_len, new C_SafeCond(&wait_lock, &cond, &done));
     wait();
 
     j.close();
@@ -165,9 +168,10 @@ TEST(TestFileJournal, WriteBig) {
       memset(foo, 1, sizeof(foo));
       bl.append(foo, sizeof(foo));
     }
-    j.submit_entry(1, bl, 0, new C_SafeCond(&wait_lock, &cond, &done));
+    list<ObjectStore::Transaction*> tls;
+    int orig_len = j.prepare_entry(tls, &bl);
+    j.submit_entry(1, bl, orig_len, new C_SafeCond(&wait_lock, &cond, &done));
     wait();
-
     j.close();
   }
 }
@@ -187,14 +191,15 @@ TEST(TestFileJournal, WriteMany) {
 
     C_GatherBuilder gb(g_ceph_context, new C_SafeCond(&wait_lock, &cond, &done));
 
+    list<ObjectStore::Transaction*> tls;
     bufferlist bl;
     bl.append("small");
     uint64_t seq = 1;
     for (int i=0; i<100; i++) {
       bl.append("small");
-      j.submit_entry(seq++, bl, 0, gb.new_sub());
+      int orig_len = j.prepare_entry(tls, &bl);
+      j.submit_entry(seq++, bl, orig_len, gb.new_sub());
     }
-
     gb.activate();
 
     wait();
@@ -220,7 +225,9 @@ TEST(TestFileJournal, WriteManyVecs) {
 
     bufferlist first;
     first.append("small");
-    j.submit_entry(1, first, 0, gb.new_sub());
+    list<ObjectStore::Transaction*> tls;
+    int orig_len = j.prepare_entry(tls, &first);
+    j.submit_entry(1, first, orig_len, gb.new_sub());
 
     bufferlist bl;
     for (int i=0; i<IOV_MAX * 2; i++) {
@@ -229,7 +236,8 @@ TEST(TestFileJournal, WriteManyVecs) {
       bl.append(bp);
     }
     bufferlist origbl = bl;
-    j.submit_entry(2, bl, 0, gb.new_sub());
+    orig_len = j.prepare_entry(tls, &bl);
+    j.submit_entry(2, bl, orig_len, gb.new_sub());
     gb.activate();
     wait();
 
@@ -253,6 +261,8 @@ TEST(TestFileJournal, ReplaySmall) {
   g_ceph_context->_conf->set_val("journal_write_header_frequency", "0");
   g_ceph_context->_conf->apply_changes(NULL);
 
+  list<ObjectStore::Transaction*> tls;
+
   for (unsigned i = 0 ; i < 3; ++i) {
     SCOPED_TRACE(subtests[i].description);
     fsid.generate_random();
@@ -265,11 +275,14 @@ TEST(TestFileJournal, ReplaySmall) {
 
     bufferlist bl;
     bl.append("small");
-    j.submit_entry(1, bl, 0, gb.new_sub());
+    int orig_len = j.prepare_entry(tls, &bl);
+    j.submit_entry(1, bl, orig_len, gb.new_sub());
     bl.append("small");
-    j.submit_entry(2, bl, 0, gb.new_sub());
+    orig_len = j.prepare_entry(tls, &bl);
+    j.submit_entry(2, bl, orig_len, gb.new_sub());
     bl.append("small");
-    j.submit_entry(3, bl, 0, gb.new_sub());
+    orig_len = j.prepare_entry(tls, &bl);
+    j.submit_entry(3, bl, orig_len, gb.new_sub());
     gb.activate();
     wait();
 
@@ -306,6 +319,7 @@ TEST(TestFileJournal, ReplayCorrupt) {
   g_ceph_context->_conf->set_val("journal_write_header_frequency", "0");
   g_ceph_context->_conf->apply_changes(NULL);
 
+  list<ObjectStore::Transaction*> tls;
   for (unsigned i = 0 ; i < 3; ++i) {
     SCOPED_TRACE(subtests[i].description);
     fsid.generate_random();
@@ -320,13 +334,17 @@ TEST(TestFileJournal, ReplayCorrupt) {
     const char *newneedle = "in a haystack";
     bufferlist bl;
     bl.append(needle);
-    j.submit_entry(1, bl, 0, gb.new_sub());
+    int orig_len = j.prepare_entry(tls, &bl);
+    j.submit_entry(1, bl, orig_len, gb.new_sub());
     bl.append(needle);
-    j.submit_entry(2, bl, 0, gb.new_sub());
+    orig_len = j.prepare_entry(tls, &bl);
+    j.submit_entry(2, bl, orig_len, gb.new_sub());
     bl.append(needle);
-    j.submit_entry(3, bl, 0, gb.new_sub());
+    orig_len = j.prepare_entry(tls, &bl);
+    j.submit_entry(3, bl, orig_len, gb.new_sub());
     bl.append(needle);
-    j.submit_entry(4, bl, 0, gb.new_sub());
+    orig_len = j.prepare_entry(tls, &bl);
+    j.submit_entry(4, bl, orig_len, gb.new_sub());
     gb.activate();
     wait();
 
@@ -342,10 +360,10 @@ TEST(TestFileJournal, ReplayCorrupt) {
     for (unsigned o=0; o < sizeof(buf) - strlen(needle); o++) {
       if (memcmp(buf+o, needle, strlen(needle)) == 0) {
         if (n >= 2) {
-	cout << "replacing at offset " << o << std::endl;
-	memcpy(buf+o, newneedle, strlen(newneedle));
+	  cout << "replacing at offset " << o << std::endl;
+	  memcpy(buf+o, newneedle, strlen(newneedle));
         } else {
-	cout << "leaving at offset " << o << std::endl;
+	  cout << "leaving at offset " << o << std::endl;
         }
         n++;
       }
@@ -398,13 +416,15 @@ TEST(TestFileJournal, WriteTrim) {
     memset(foo, 1, sizeof(foo));
 
     uint64_t seq = 1, committed = 0;
+    list<ObjectStore::Transaction*> tls;
 
     for (unsigned i=0; i<size_mb*2; i++) {
       bl.clear();
       bl.push_back(buffer::copy(foo, sizeof(foo)));
       bl.zero();
       ls.push_back(new C_Sync);
-      j.submit_entry(seq++, bl, 0, ls.back()->c);
+      int orig_len = j.prepare_entry(tls, &bl);
+      j.submit_entry(seq++, bl, orig_len, ls.back()->c);
 
       while (ls.size() > size_mb/2) {
         delete ls.front();
@@ -430,6 +450,7 @@ TEST(TestFileJournal, WriteTrimSmall) {
   g_ceph_context->_conf->set_val("journal_ignore_corruption", "false");
   g_ceph_context->_conf->set_val("journal_write_header_frequency", "0");
   g_ceph_context->_conf->apply_changes(NULL);
+  list<ObjectStore::Transaction*> tls;
 
   for (unsigned i = 0 ; i < 3; ++i) {
     SCOPED_TRACE(subtests[i].description);
@@ -453,7 +474,8 @@ TEST(TestFileJournal, WriteTrimSmall) {
         bl.push_back(buffer::copy(foo, sizeof(foo) / 128));
       bl.zero();
       ls.push_back(new C_Sync);
-      j.submit_entry(seq++, bl, 0, ls.back()->c);
+      int orig_len = j.prepare_entry(tls, &bl);
+      j.submit_entry(seq++, bl, orig_len, ls.back()->c);
 
       while (ls.size() > size_mb/2) {
         delete ls.front();
@@ -478,6 +500,7 @@ TEST(TestFileJournal, ReplayDetectCorruptFooterMagic) {
   g_ceph_context->_conf->set_val("journal_write_header_frequency", "1");
   g_ceph_context->_conf->apply_changes(NULL);
 
+  list<ObjectStore::Transaction*> tls;
   for (unsigned i = 0 ; i < 3; ++i) {
     SCOPED_TRACE(subtests[i].description);
     fsid.generate_random();
@@ -492,14 +515,16 @@ TEST(TestFileJournal, ReplayDetectCorruptFooterMagic) {
     for (unsigned i = 1; i <= 4; ++i) {
       bufferlist bl;
       bl.append(needle);
-      j.submit_entry(i, bl, 0, gb.new_sub());
+      int orig_len = j.prepare_entry(tls, &bl);
+      j.submit_entry(i, bl, orig_len, gb.new_sub());
     }
     gb.activate();
     wait();
 
     bufferlist bl;
     bl.append("needle");
-    j.submit_entry(5, bl, 0, new C_SafeCond(&wait_lock, &cond, &done));
+    int orig_len = j.prepare_entry(tls, &bl);
+    j.submit_entry(5, bl, orig_len, new C_SafeCond(&wait_lock, &cond, &done));
     wait();
 
     j.close();
@@ -532,6 +557,7 @@ TEST(TestFileJournal, ReplayDetectCorruptPayload) {
   g_ceph_context->_conf->set_val("journal_write_header_frequency", "1");
   g_ceph_context->_conf->apply_changes(NULL);
 
+  list<ObjectStore::Transaction*> tls;
   for (unsigned i = 0 ; i < 3; ++i) {
     SCOPED_TRACE(subtests[i].description);
     fsid.generate_random();
@@ -546,14 +572,16 @@ TEST(TestFileJournal, ReplayDetectCorruptPayload) {
     for (unsigned i = 1; i <= 4; ++i) {
       bufferlist bl;
       bl.append(needle);
-      j.submit_entry(i, bl, 0, gb.new_sub());
+      int orig_len = j.prepare_entry(tls, &bl);
+      j.submit_entry(i, bl, orig_len, gb.new_sub());
     }
     gb.activate();
     wait();
 
     bufferlist bl;
     bl.append("needle");
-    j.submit_entry(5, bl, 0, new C_SafeCond(&wait_lock, &cond, &done));
+    int orig_len = j.prepare_entry(tls, &bl);
+    j.submit_entry(5, bl, orig_len, new C_SafeCond(&wait_lock, &cond, &done));
     wait();
 
     j.close();
@@ -586,6 +614,7 @@ TEST(TestFileJournal, ReplayDetectCorruptHeader) {
   g_ceph_context->_conf->set_val("journal_write_header_frequency", "1");
   g_ceph_context->_conf->apply_changes(NULL);
 
+  list<ObjectStore::Transaction*> tls;
   for (unsigned i = 0 ; i < 3; ++i) {
     SCOPED_TRACE(subtests[i].description);
     fsid.generate_random();
@@ -600,14 +629,16 @@ TEST(TestFileJournal, ReplayDetectCorruptHeader) {
     for (unsigned i = 1; i <= 4; ++i) {
       bufferlist bl;
       bl.append(needle);
-      j.submit_entry(i, bl, 0, gb.new_sub());
+      int orig_len = j.prepare_entry(tls, &bl);
+      j.submit_entry(i, bl, orig_len, gb.new_sub());
     }
     gb.activate();
     wait();
 
     bufferlist bl;
     bl.append("needle");
-    j.submit_entry(5, bl, 0, new C_SafeCond(&wait_lock, &cond, &done));
+    int orig_len = j.prepare_entry(tls, &bl);
+    j.submit_entry(5, bl, orig_len, new C_SafeCond(&wait_lock, &cond, &done));
     wait();
 
     j.close();
diff --git a/src/test/test_rgw_admin_log.cc b/src/test/test_rgw_admin_log.cc
index 46b69eb..f460251 100644
--- a/src/test/test_rgw_admin_log.cc
+++ b/src/test/test_rgw_admin_log.cc
@@ -327,9 +327,7 @@ int run_rgw_admin(string& cmd, string& resp) {
       argv[loop++] = (char *)(*it).c_str();
     }
     argv[loop] = NULL;
-    close(1);
-    stdout = fopen(RGW_ADMIN_RESP_PATH, "w+");
-    if (!stdout) {
+    if (!freopen(RGW_ADMIN_RESP_PATH, "w+", stdout)) {
       cout << "Unable to open stdout file" << std::endl;
     }
     execv((g_test->get_rgw_admin_path()).c_str(), argv); 
diff --git a/src/test/test_rgw_admin_meta.cc b/src/test/test_rgw_admin_meta.cc
index 5b0d6a6..063b9ea 100644
--- a/src/test/test_rgw_admin_meta.cc
+++ b/src/test/test_rgw_admin_meta.cc
@@ -321,9 +321,7 @@ int run_rgw_admin(string& cmd, string& resp) {
       argv[loop++] = (char *)(*it).c_str();
     }
     argv[loop] = NULL;
-    close(1);
-    stdout = fopen(RGW_ADMIN_RESP_PATH, "w+");
-    if (!stdout) {
+    if (!freopen(RGW_ADMIN_RESP_PATH, "w+", stdout)) {
       cout << "Unable to open stdout file" << std::endl;
     }
     execv((g_test->get_rgw_admin_path()).c_str(), argv); 
diff --git a/src/test/test_rgw_admin_opstate.cc b/src/test/test_rgw_admin_opstate.cc
index a9a65f5..2656857 100644
--- a/src/test/test_rgw_admin_opstate.cc
+++ b/src/test/test_rgw_admin_opstate.cc
@@ -325,9 +325,7 @@ int run_rgw_admin(string& cmd, string& resp) {
       argv[loop++] = (char *)(*it).c_str();
     }
     argv[loop] = NULL;
-    close(1);
-    stdout = fopen(RGW_ADMIN_RESP_PATH, "w+");
-    if (!stdout) {
+    if (!freopen(RGW_ADMIN_RESP_PATH, "w+", stdout)) {
       cout << "Unable to open stdout file" << std::endl;
     }
     execv((g_test->get_rgw_admin_path()).c_str(), argv); 
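
The three test_rgw_admin_* fixes replace `close(1); stdout = fopen(...)` with
freopen: assigning to stdout is not portable (it need not be an lvalue), and
closing fd 1 first leaves a window where the descriptor could be reused.
freopen reassociates the existing stream with the file in one step. The same
redirect-then-exec idiom, sketched in Python for comparison (path and command
are placeholders):

    import os

    # Open the response file and atomically repoint fd 1 (stdout) at it,
    # much as freopen(path, "w+", stdout) does for the C stream.
    fd = os.open('/tmp/rgw_admin_resp', os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o644)
    os.dup2(fd, 1)
    os.close(fd)
    os.execv('/usr/bin/radosgw-admin', ['radosgw-admin', 'user', 'info'])
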
diff --git a/src/test/test_subprocess.cc b/src/test/test_subprocess.cc
index c07538b..725d2a6 100644
--- a/src/test/test_subprocess.cc
+++ b/src/test/test_subprocess.cc
@@ -54,7 +54,7 @@ TEST(SubProcess, NotFound)
   SubProcess p("NOTEXISTENTBINARY", false, false, true);
   ASSERT_EQ(p.spawn(), 0);
   std::string buf;
-  ASSERT_TRUE(read_from_fd(p.stderr(), buf));
+  ASSERT_TRUE(read_from_fd(p.get_stderr(), buf));
   std::cerr << "stderr: " << buf;
   ASSERT_EQ(p.join(), 1);
   std::cerr << "err: " << p.err() << std::endl;
@@ -68,7 +68,7 @@ TEST(SubProcess, Echo)
 
   ASSERT_EQ(echo.spawn(), 0);
   std::string buf;
-  ASSERT_TRUE(read_from_fd(echo.stdout(), buf));
+  ASSERT_TRUE(read_from_fd(echo.get_stdout(), buf));
   std::cerr << "stdout: " << buf;
   ASSERT_EQ(buf, "1 2 3\n");
   ASSERT_EQ(echo.join(), 0);
@@ -81,14 +81,14 @@ TEST(SubProcess, Cat)
 
   ASSERT_EQ(cat.spawn(), 0);
   std::string msg("to my, trociny!");
-  int n = write(cat.stdin(), msg.c_str(), msg.size());
+  int n = write(cat.get_stdin(), msg.c_str(), msg.size());
   ASSERT_EQ(n, (int)msg.size());
   cat.close_stdin();
   std::string buf;
-  ASSERT_TRUE(read_from_fd(cat.stdout(), buf));
+  ASSERT_TRUE(read_from_fd(cat.get_stdout(), buf));
   std::cerr << "stdout: " << buf << std::endl;
   ASSERT_EQ(buf, msg);
-  ASSERT_TRUE(read_from_fd(cat.stderr(), buf));
+  ASSERT_TRUE(read_from_fd(cat.get_stderr(), buf));
   ASSERT_EQ(buf, "");
   ASSERT_EQ(cat.join(), 0);
   ASSERT_TRUE(cat.err()[0] == '\0');
@@ -101,9 +101,9 @@ TEST(SubProcess, CatDevNull)
 
   ASSERT_EQ(cat.spawn(), 0);
   std::string buf;
-  ASSERT_TRUE(read_from_fd(cat.stdout(), buf));
+  ASSERT_TRUE(read_from_fd(cat.get_stdout(), buf));
   ASSERT_EQ(buf, "");
-  ASSERT_TRUE(read_from_fd(cat.stderr(), buf));
+  ASSERT_TRUE(read_from_fd(cat.get_stderr(), buf));
   ASSERT_EQ(buf, "");
   ASSERT_EQ(cat.join(), 0);
   ASSERT_TRUE(cat.err()[0] == '\0');
@@ -127,14 +127,14 @@ TEST(SubProcess, CatWithArgs)
 
   ASSERT_EQ(cat.spawn(), 0);
   std::string msg("Hello, Word!");
-  int n = write(cat.stdin(), msg.c_str(), msg.size());
+  int n = write(cat.get_stdin(), msg.c_str(), msg.size());
   ASSERT_EQ(n, (int)msg.size());
   cat.close_stdin();
   std::string buf;
-  ASSERT_TRUE(read_from_fd(cat.stdout(), buf));
+  ASSERT_TRUE(read_from_fd(cat.get_stdout(), buf));
   std::cerr << "stdout: " << buf << std::endl;
   ASSERT_EQ(buf, msg);
-  ASSERT_TRUE(read_from_fd(cat.stderr(), buf));
+  ASSERT_TRUE(read_from_fd(cat.get_stderr(), buf));
   std::cerr << "stderr: " << buf;
   ASSERT_FALSE(buf.empty());
   ASSERT_EQ(cat.join(), 1);
@@ -152,14 +152,14 @@ TEST(SubProcess, Subshell)
       "/bin/sh -c 'exit 13'", NULL);
   ASSERT_EQ(sh.spawn(), 0);
   std::string msg("hello via subshell");
-  int n = write(sh.stdin(), msg.c_str(), msg.size());
+  int n = write(sh.get_stdin(), msg.c_str(), msg.size());
   ASSERT_EQ(n, (int)msg.size());
   sh.close_stdin();
   std::string buf;
-  ASSERT_TRUE(read_from_fd(sh.stdout(), buf));
+  ASSERT_TRUE(read_from_fd(sh.get_stdout(), buf));
   std::cerr << "stdout: " << buf << std::endl;
   ASSERT_EQ(buf, msg);
-  ASSERT_TRUE(read_from_fd(sh.stderr(), buf));
+  ASSERT_TRUE(read_from_fd(sh.get_stderr(), buf));
   std::cerr << "stderr: " << buf;
   ASSERT_EQ(buf, "error from subshell\n");
   ASSERT_EQ(sh.join(), 13);
@@ -192,9 +192,9 @@ TEST(SubProcessTimed, Killed)
   ASSERT_EQ(cat.spawn(), 0);
   cat.kill();
   std::string buf;
-  ASSERT_TRUE(read_from_fd(cat.stdout(), buf));
+  ASSERT_TRUE(read_from_fd(cat.get_stdout(), buf));
   ASSERT_TRUE(buf.empty());
-  ASSERT_TRUE(read_from_fd(cat.stderr(), buf));
+  ASSERT_TRUE(read_from_fd(cat.get_stderr(), buf));
   ASSERT_TRUE(buf.empty());
   ASSERT_EQ(cat.join(), 128 + SIGTERM);
   std::cerr << "err: " << cat.err() << std::endl;
@@ -208,7 +208,7 @@ TEST(SubProcessTimed, SleepTimedout)
 
   ASSERT_EQ(sleep.spawn(), 0);
   std::string buf;
-  ASSERT_TRUE(read_from_fd(sleep.stderr(), buf));
+  ASSERT_TRUE(read_from_fd(sleep.get_stderr(), buf));
   std::cerr << "stderr: " << buf;
   ASSERT_FALSE(buf.empty());
   ASSERT_EQ(sleep.join(), 128 + SIGKILL);
@@ -222,14 +222,14 @@ TEST(SubProcessTimed, SubshellNoTimeout)
   sh.add_cmd_args("-c", "cat >&2", NULL);
   ASSERT_EQ(sh.spawn(), 0);
   std::string msg("the quick brown fox jumps over the lazy dog");
-  int n = write(sh.stdin(), msg.c_str(), msg.size());
+  int n = write(sh.get_stdin(), msg.c_str(), msg.size());
   ASSERT_EQ(n, (int)msg.size());
   sh.close_stdin();
   std::string buf;
-  ASSERT_TRUE(read_from_fd(sh.stdout(), buf));
+  ASSERT_TRUE(read_from_fd(sh.get_stdout(), buf));
   std::cerr << "stdout: " << buf << std::endl;
   ASSERT_TRUE(buf.empty());
-  ASSERT_TRUE(read_from_fd(sh.stderr(), buf));
+  ASSERT_TRUE(read_from_fd(sh.get_stderr(), buf));
   std::cerr << "stderr: " << buf << std::endl;
   ASSERT_EQ(buf, msg);
   ASSERT_EQ(sh.join(), 0);
@@ -242,11 +242,11 @@ TEST(SubProcessTimed, SubshellKilled)
   sh.add_cmd_args("-c", "sh -c cat", NULL);
   ASSERT_EQ(sh.spawn(), 0);
   std::string msg("etaoin shrdlu");
-  int n = write(sh.stdin(), msg.c_str(), msg.size());
+  int n = write(sh.get_stdin(), msg.c_str(), msg.size());
   ASSERT_EQ(n, (int)msg.size());
   sh.kill();
   std::string buf;
-  ASSERT_TRUE(read_from_fd(sh.stderr(), buf));
+  ASSERT_TRUE(read_from_fd(sh.get_stderr(), buf));
   ASSERT_TRUE(buf.empty());
   ASSERT_EQ(sh.join(), 128 + SIGTERM);
   std::cerr << "err: " << sh.err() << std::endl;
@@ -259,7 +259,7 @@ TEST(SubProcessTimed, SubshellTimedout)
   sh.add_cmd_args("-c", "sleep 1000& cat; NEVER REACHED", NULL);
   ASSERT_EQ(sh.spawn(), 0);
   std::string buf;
-  ASSERT_TRUE(read_from_fd(sh.stderr(), buf));
+  ASSERT_TRUE(read_from_fd(sh.get_stderr(), buf));
   std::cerr << "stderr: " << buf;
   ASSERT_FALSE(buf.empty());
   ASSERT_EQ(sh.join(), 128 + SIGTERM);
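
The SubProcess accessors are renamed from stdin()/stdout()/stderr() to
get_stdin()/get_stdout()/get_stderr(), avoiding a clash with the stdin,
stdout, and stderr identifiers from <cstdio>, which may be macros. The
behavior exercised here — feed the child's stdin, then read its stdout and
stderr — follows the same pattern as Python's subprocess module, shown for
comparison:

    import subprocess

    # Spawn `cat`, write to its stdin, and read back stdout/stderr.
    p = subprocess.Popen(['cat'],
                         stdin=subprocess.PIPE,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    out, err = p.communicate(b'to my, trociny!')
    assert out == b'to my, trociny!'
    assert err == b''
    assert p.returncode == 0
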
diff --git a/src/test/ubuntu-12.04/Dockerfile.in b/src/test/ubuntu-12.04/Dockerfile.in
index ed55bad..5a1c6a1 100644
--- a/src/test/ubuntu-12.04/Dockerfile.in
+++ b/src/test/ubuntu-12.04/Dockerfile.in
@@ -28,4 +28,4 @@ RUN apt-get update
 RUN cd /root ; ./install-deps.sh
 # development tools
 RUN apt-get install -y sudo ccache valgrind gdb python-virtualenv gdisk kpartx hdparm xmlstarlet
-RUN useradd -M --uid %%user_id%% %%USER%% && echo '%%USER%% ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers
+RUN if test %%USER%% != root ; then useradd -M --uid %%user_id%% %%USER%% && echo '%%USER%% ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers ; fi
diff --git a/src/test/ubuntu-12.04/install-deps.sh b/src/test/ubuntu-12.04/install-deps.sh
index 1bebf09..8249ea3 100755
--- a/src/test/ubuntu-12.04/install-deps.sh
+++ b/src/test/ubuntu-12.04/install-deps.sh
@@ -62,7 +62,7 @@ CentOS|Fedora|RedHatEnterpriseServer)
             CentOS|RedHatEnterpriseServer)
                 $SUDO yum install -y yum-utils
                 MAJOR_VERSION=$(lsb_release -rs | cut -f1 -d.)
-                if test $(lsb_release -si) == RedHatEnterpriseServer ; then
+                if test $(lsb_release -si) = RedHatEnterpriseServer ; then
                     $SUDO yum install subscription-manager
                     $SUDO subscription-manager repos --enable=rhel-$MAJOR_VERSION-server-optional-rpms
                 fi
@@ -70,6 +70,9 @@ CentOS|Fedora|RedHatEnterpriseServer)
                 $SUDO yum install --nogpgcheck -y epel-release
                 $SUDO rpm --import /etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-$MAJOR_VERSION
                 $SUDO rm -f /etc/yum.repos.d/dl.fedoraproject.org*
+                if test $(lsb_release -si) = CentOS -a $MAJOR_VERSION = 7 ; then
+                    $SUDO yum-config-manager --enable cr
+                fi
                 ;;
         esac
         sed -e 's/@//g' < ceph.spec.in > $DIR/ceph.spec
diff --git a/src/test/ubuntu-14.04/Dockerfile.in b/src/test/ubuntu-14.04/Dockerfile.in
index fddf929..51b47ad 100644
--- a/src/test/ubuntu-14.04/Dockerfile.in
+++ b/src/test/ubuntu-14.04/Dockerfile.in
@@ -28,4 +28,4 @@ RUN apt-get update
 RUN cd /root ; ./install-deps.sh
 # development tools
 RUN apt-get install -y ccache valgrind gdb python-virtualenv gdisk kpartx hdparm jq xmlstarlet
-RUN useradd -M --uid %%user_id%% %%USER%% && echo '%%USER%% ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers
+RUN if test %%USER%% != root ; then useradd -M --uid %%user_id%% %%USER%% && echo '%%USER%% ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers ; fi
diff --git a/src/test/ubuntu-14.04/install-deps.sh b/src/test/ubuntu-14.04/install-deps.sh
index 1bebf09..8249ea3 100755
--- a/src/test/ubuntu-14.04/install-deps.sh
+++ b/src/test/ubuntu-14.04/install-deps.sh
@@ -62,7 +62,7 @@ CentOS|Fedora|RedHatEnterpriseServer)
             CentOS|RedHatEnterpriseServer)
                 $SUDO yum install -y yum-utils
                 MAJOR_VERSION=$(lsb_release -rs | cut -f1 -d.)
-                if test $(lsb_release -si) == RedHatEnterpriseServer ; then
+                if test $(lsb_release -si) = RedHatEnterpriseServer ; then
                     $SUDO yum install subscription-manager
                     $SUDO subscription-manager repos --enable=rhel-$MAJOR_VERSION-server-optional-rpms
                 fi
@@ -70,6 +70,9 @@ CentOS|Fedora|RedHatEnterpriseServer)
                 $SUDO yum install --nogpgcheck -y epel-release
                 $SUDO rpm --import /etc/pki/rpm-gpg/RPM-GPG-KEY-EPEL-$MAJOR_VERSION
                 $SUDO rm -f /etc/yum.repos.d/dl.fedoraproject.org*
+                if test $(lsb_release -si) = CentOS -a $MAJOR_VERSION = 7 ; then
+                    $SUDO yum-config-manager --enable cr
+                fi
                 ;;
         esac
         sed -e 's/@//g' < ceph.spec.in > $DIR/ceph.spec
diff --git a/src/tools/Makefile-client.am b/src/tools/Makefile-client.am
index 4cbfd5d..1764eac 100644
--- a/src/tools/Makefile-client.am
+++ b/src/tools/Makefile-client.am
@@ -21,6 +21,55 @@ rados_SOURCES += common/obj_bencher.cc # needs cleanup so it can go in libcommon
 rados_LDADD = libcls_lock_client.la $(LIBRADOS) $(LIBRADOSSTRIPER) $(CEPH_GLOBAL)
 bin_PROGRAMS += rados
 
+if WITH_RBD
+
+rbd_SOURCES = \
+	tools/rbd/rbd.cc \
+	tools/rbd/ArgumentTypes.cc \
+	tools/rbd/IndentStream.cc \
+	tools/rbd/OptionPrinter.cc \
+	tools/rbd/Shell.cc \
+	tools/rbd/Utils.cc \
+	tools/rbd/action/BenchWrite.cc \
+	tools/rbd/action/Children.cc \
+	tools/rbd/action/Clone.cc \
+	tools/rbd/action/Copy.cc \
+	tools/rbd/action/Create.cc \
+	tools/rbd/action/Diff.cc \
+	tools/rbd/action/DiskUsage.cc \
+	tools/rbd/action/Export.cc \
+	tools/rbd/action/ExportDiff.cc \
+	tools/rbd/action/Feature.cc \
+	tools/rbd/action/Flatten.cc \
+	tools/rbd/action/ImageMeta.cc \
+	tools/rbd/action/Import.cc \
+	tools/rbd/action/ImportDiff.cc \
+	tools/rbd/action/Info.cc \
+	tools/rbd/action/Kernel.cc \
+	tools/rbd/action/List.cc \
+	tools/rbd/action/Lock.cc \
+	tools/rbd/action/MergeDiff.cc \
+	tools/rbd/action/ObjectMap.cc \
+	tools/rbd/action/Remove.cc \
+	tools/rbd/action/Rename.cc \
+	tools/rbd/action/Resize.cc \
+	tools/rbd/action/Snap.cc \
+	tools/rbd/action/Status.cc \
+	tools/rbd/action/Watch.cc
+noinst_HEADERS += \
+	tools/rbd/ArgumentTypes.h \
+	tools/rbd/IndentStream.h \
+	tools/rbd/OptionPrinter.h \
+	tools/rbd/Shell.h \
+	tools/rbd/Utils.h
+rbd_LDADD = \
+	$(LIBKRBD) $(LIBRBD) $(LIBRADOS) $(CEPH_GLOBAL) \
+	$(BOOST_REGEX_LIBS) $(BOOST_PROGRAM_OPTIONS_LIBS)
+if LINUX
+bin_PROGRAMS += rbd
+endif # LINUX
+
+endif # WITH_RBD
 
 if WITH_CEPHFS
 
diff --git a/src/tools/Makefile.am b/src/tools/Makefile.am
index a6661eb..e14f3f8 100644
--- a/src/tools/Makefile.am
+++ b/src/tools/Makefile.am
@@ -7,7 +7,7 @@ include tools/Makefile-server.am
 endif
 
 monmaptool_SOURCES = tools/monmaptool.cc
-monmaptool_LDADD = $(CEPH_GLOBAL) $(LIBCOMMON)
+monmaptool_LDADD = $(CEPH_GLOBAL)
 bin_PROGRAMS += monmaptool
 
 crushtool_SOURCES = tools/crushtool.cc
@@ -23,11 +23,11 @@ ceph_psim_LDADD = $(CEPH_GLOBAL)
 bin_DEBUGPROGRAMS += ceph_psim
 
 ceph_conf_SOURCES = tools/ceph_conf.cc
-ceph_conf_LDADD = $(CEPH_GLOBAL) $(LIBCOMMON)
+ceph_conf_LDADD = $(CEPH_GLOBAL)
 bin_PROGRAMS += ceph-conf
 
 ceph_authtool_SOURCES = tools/ceph_authtool.cc
-ceph_authtool_LDADD = $(CEPH_GLOBAL) $(LIBCOMMON)
+ceph_authtool_LDADD = $(CEPH_GLOBAL)
 bin_PROGRAMS += ceph-authtool
 
 noinst_HEADERS += \
diff --git a/src/tools/ceph-monstore-update-crush.sh b/src/tools/ceph-monstore-update-crush.sh
index dc6a6d7..7fc41e0 100755
--- a/src/tools/ceph-monstore-update-crush.sh
+++ b/src/tools/ceph-monstore-update-crush.sh
@@ -54,7 +54,7 @@ function test_crush() {
                        -v $epoch -o $osdmap > /dev/null
     osdmaptool --export-crush $crush $osdmap &> /dev/null
 
-    if crushtool --check $max_osd -i $crush > /dev/null; then
+    if crushtool --test --check $max_osd -i $crush > /dev/null; then
         good=true
     else
         good=false
@@ -63,17 +63,6 @@ function test_crush() {
     $good || return 1
 }
 
-function get_crush()  {
-    local store_path=$1
-    local osdmap_epoch=$2
-    local osdmap_path=`mktemp`
-    local crush_path=`mktemp`
-
-    ceph-monstore-tool $store_path get osdmap -- \
-                       -v $osdmap_epoch -o $osdmap_path
-    osdmaptool --export-crush $crush $osdmap_path 2>&1 > /dev/null
-}
-
 function die() {
     local retval=$?
     echo "$@" >&2
diff --git a/src/tools/ceph_kvstore_tool.cc b/src/tools/ceph_kvstore_tool.cc
index a234213..202c5df 100644
--- a/src/tools/ceph_kvstore_tool.cc
+++ b/src/tools/ceph_kvstore_tool.cc
@@ -20,10 +20,12 @@
 #include "common/config.h"
 #include "common/errno.h"
 #include "common/strtol.h"
-
+#include "global/global_context.h"
 #include "global/global_init.h"
 #include "include/stringify.h"
-#include "os/KeyValueDB.h"
+#include "include/utime.h"
+#include "common/Clock.h"
+#include "kv/KeyValueDB.h"
 
 using namespace std;
 
diff --git a/src/tools/ceph_objectstore_tool.cc b/src/tools/ceph_objectstore_tool.cc
index b978a6c..31ea7d4 100644
--- a/src/tools/ceph_objectstore_tool.cc
+++ b/src/tools/ceph_objectstore_tool.cc
@@ -105,7 +105,7 @@ int _action_on_all_objects_in_pg(ObjectStore *store, coll_t coll, action_on_obje
         if (r < 0) {
 	  cerr << "Error getting attr on : " << make_pair(coll, *obj) << ", "
 	       << cpp_strerror(r) << std::endl;
-	  return r;
+	  continue;
         }
         bufferlist::iterator bp = attr.begin();
         try {
@@ -114,7 +114,7 @@ int _action_on_all_objects_in_pg(ObjectStore *store, coll_t coll, action_on_obje
 	  r = -EINVAL;
 	  cerr << "Error getting attr on : " << make_pair(coll, *obj) << ", "
 	       << cpp_strerror(r) << std::endl;
-	  return r;
+	  continue;
         }
       }
       r = action.call(store, coll, *obj, oi);
@@ -147,7 +147,7 @@ int action_on_all_objects_in_pg(ObjectStore *store, string pgidstr, action_on_ob
       continue;
 
     // If an exact match or treat no shard as any shard
-    if (cand_pgid == pgid || 
+    if (cand_pgid == pgid ||
         (pgid.is_no_shard() && pgid.pgid == cand_pgid.pgid)) {
       colls_to_check.push_back(*i);
     }
@@ -254,10 +254,14 @@ struct pgid_object_list {
 struct lookup_ghobject : public action_on_object_t {
   pgid_object_list _objects;
   const string _name;
+  bool _need_snapset;
 
-  lookup_ghobject(const string& name) : _name(name) { }
+  lookup_ghobject(const string& name, bool need_snapset = false) : _name(name),
+		  _need_snapset(need_snapset) { }
 
   virtual int call(ObjectStore *store, coll_t coll, ghobject_t &ghobj, object_info_t &oi) {
+    if (_need_snapset && !ghobj.hobj.has_snapset())
+      return 0;
     if (_name.length() == 0 || ghobj.hobj.oid.name == _name)
       _objects.insert(coll, ghobj);
     return 0;
@@ -392,55 +396,6 @@ void dump_log(Formatter *formatter, ostream &out, pg_log_t &log,
   formatter->flush(out);
 }
 
-//Based on RemoveWQ::_process()
-void remove_coll(ObjectStore *store, const coll_t &coll,
-		 ObjectStore::Sequencer &osr)
-{
-  spg_t pg;
-  coll.is_pg_prefix(&pg);
-  OSDriver driver(
-    store,
-    coll_t(),
-    OSD::make_snapmapper_oid());
-  SnapMapper mapper(&driver, 0, 0, 0, pg.shard);
-
-  ghobject_t next;
-  int r = 0;
-  int64_t num = 0;
-  ObjectStore::Transaction *t = new ObjectStore::Transaction;
-  cout << "remove_coll " << coll << std::endl;
-  while (!next.is_max()) {
-    vector<ghobject_t> objects;
-    r = store->collection_list(coll, next, ghobject_t::get_max(), true, 300,
-      &objects, &next);
-    if (r < 0)
-      goto out;
-    for (vector<ghobject_t>::iterator i = objects.begin();
-	 i != objects.end();
-	 ++i, ++num) {
-
-      OSDriver::OSTransaction _t(driver.get_transaction(t));
-      cout << "remove " << *i << std::endl;
-      int r = mapper.remove_oid(i->hobj, &_t);
-      if (r != 0 && r != -ENOENT) {
-        assert(0);
-      }
-
-      t->remove(coll, *i);
-      if (num >= 30) {
-        store->apply_transaction(&osr, *t);
-        delete t;
-        t = new ObjectStore::Transaction;
-        num = 0;
-      }
-    }
-  }
-  t->remove_collection(coll);
-  store->apply_transaction(&osr, *t);
-out:
-  delete t;
-}
-
 //Based on part of OSD::load_pgs()
 int finish_remove_pgs(ObjectStore *store)
 {
@@ -526,13 +481,12 @@ int initiate_new_remove_pg(ObjectStore *store, spg_t r_pgid,
   cout << " marking collection for removal" << std::endl;
   if (dry_run)
     return 0;
-  ObjectStore::Transaction *rmt = new ObjectStore::Transaction;
-  int r = mark_pg_for_removal(store, r_pgid, rmt);
+  ObjectStore::Transaction rmt;
+  int r = mark_pg_for_removal(store, r_pgid, &rmt);
   if (r < 0) {
-    delete rmt;
     return r;
   }
-  store->apply_transaction(&osr, *rmt);
+  store->apply_transaction(&osr, rmt);
   finish_remove_pgs(store);
   return r;
 }
@@ -746,6 +700,8 @@ int set_inc_osdmap(ObjectStore *store, epoch_t e, bufferlist& bl, bool force,
     }
     cout << "Creating a new epoch." << std::endl;
   }
+  if (dry_run)
+    return 0;
   ObjectStore::Transaction t;
   t.write(coll_t::meta(), inc_oid, 0, bl.length(), bl);
   t.truncate(coll_t::meta(), inc_oid, bl.length());
@@ -791,6 +747,8 @@ int set_osdmap(ObjectStore *store, epoch_t e, bufferlist& bl, bool force,
     }
     cout << "Creating a new epoch." << std::endl;
   }
+  if (dry_run)
+    return 0;
   ObjectStore::Transaction t;
   t.write(coll_t::meta(), full_oid, 0, bl.length(), bl);
   t.truncate(coll_t::meta(), full_oid, bl.length());
@@ -977,18 +935,18 @@ int ObjectStoreTool::get_object(ObjectStore *store, coll_t coll,
     object_locator_t loc(ob.hoid.hobj);
     pg_t raw_pgid = curmap.object_locator_to_pg(oid, loc);
     pg_t pgid = curmap.raw_pg_to_pg(raw_pgid);
-  
+
     spg_t coll_pgid;
     if (coll.is_pg(&coll_pgid) == false) {
       cerr << "INTERNAL ERROR: Bad collection during import" << std::endl;
       return -EFAULT;
     }
     if (coll_pgid.shard != ob.hoid.shard_id) {
-      cerr << "INTERNAL ERROR: Importing shard " << coll_pgid.shard 
+      cerr << "INTERNAL ERROR: Importing shard " << coll_pgid.shard
         << " but object shard is " << ob.hoid.shard_id << std::endl;
       return -EFAULT;
     }
-     
+
     if (coll_pgid.pgid != pgid) {
       cerr << "Skipping object '" << ob.hoid << "' which belongs in pg " << pgid << std::endl;
       *skipped_objects = true;
@@ -1350,18 +1308,17 @@ int ObjectStoreTool::do_import(ObjectStore *store, OSDSuperblock& sb,
   }
 
   if (!dry_run) {
-    ObjectStore::Transaction *t = new ObjectStore::Transaction;
-    PG::_create(*t, pgid,
+    ObjectStore::Transaction t;
+    PG::_create(t, pgid,
 		pgid.get_split_bits(curmap.get_pg_pool(pgid.pool())->get_pg_num()));
-    PG::_init(*t, pgid, NULL);
+    PG::_init(t, pgid, NULL);
 
     // mark this coll for removal until we're done
     map<string,bufferlist> values;
     ::encode((char)1, values["_remove"]);
-    t->omap_setkeys(coll, pgid.make_pgmeta_oid(), values);
+    t.omap_setkeys(coll, pgid.make_pgmeta_oid(), values);
 
-    store->apply_transaction(&osr, *t);
-    delete t;
+    store->apply_transaction(&osr, t);
   }
 
   cout << "Importing pgid " << pgid;
@@ -1463,10 +1420,11 @@ int ObjectStoreTool::do_import(ObjectStore *store, OSDSuperblock& sb,
   return 0;
 }
 
-int do_list(ObjectStore *store, string pgidstr, string object, Formatter *formatter, bool debug, bool human_readable)
+int do_list(ObjectStore *store, string pgidstr, string object,
+	    Formatter *formatter, bool debug, bool human_readable, bool head)
 {
   int r;
-  lookup_ghobject lookup(object);
+  lookup_ghobject lookup(object, head);
   if (pgidstr.length() > 0) {
     r = action_on_all_objects_in_pg(store, pgidstr, lookup, debug);
   } else {
@@ -1513,18 +1471,17 @@ int do_remove_object(ObjectStore *store, coll_t coll,
   cout << "remove " << ghobj << std::endl;
   if (dry_run)
     return 0;
-  ObjectStore::Transaction *t = new ObjectStore::Transaction;
-  OSDriver::OSTransaction _t(driver.get_transaction(t));
+  ObjectStore::Transaction t;
+  OSDriver::OSTransaction _t(driver.get_transaction(&t));
   r = mapper.remove_oid(ghobj.hobj, &_t);
   if (r < 0 && r != -ENOENT) {
     cerr << "remove_oid returned " << cpp_strerror(r) << std::endl;
     return r;
   }
 
-  t->remove(coll, ghobj);
+  t.remove(coll, ghobj);
 
-  store->apply_transaction(&osr, *t);
-  delete t;
+  store->apply_transaction(&osr, t);
   return 0;
 }
 
@@ -1690,6 +1647,9 @@ int do_set_attr(ObjectStore *store, coll_t coll,
   if (ret < 0)
     return ret;
 
+  if (dry_run)
+    return 0;
+
   t->touch(coll, ghobj);
 
   t->setattr(coll, ghobj, key,  bl);
@@ -1708,6 +1668,9 @@ int do_rm_attr(ObjectStore *store, coll_t coll,
   if (debug)
     cerr << "Rmattr " << ghobj << std::endl;
 
+  if (dry_run)
+    return 0;
+
   t->rmattr(coll, ghobj, key);
 
   store->apply_transaction(&osr, *t);
@@ -1763,6 +1726,9 @@ int do_set_omap(ObjectStore *store, coll_t coll,
 
   attrset.insert(pair<string, bufferlist>(key, valbl));
 
+  if (dry_run)
+    return 0;
+
   t->touch(coll, ghobj);
 
   t->omap_setkeys(coll, ghobj, attrset);
@@ -1784,6 +1750,9 @@ int do_rm_omap(ObjectStore *store, coll_t coll,
   if (debug)
     cerr << "Rm_omap " << ghobj << std::endl;
 
+  if (dry_run)
+    return 0;
+
   t->omap_rmkeys(coll, ghobj, keys);
 
   store->apply_transaction(&osr, *t);
@@ -1825,6 +1794,9 @@ int do_set_omaphdr(ObjectStore *store, coll_t coll,
   if (ret)
     return ret;
 
+  if (dry_run)
+    return 0;
+
   t->touch(coll, ghobj);
 
   t->omap_setheader(coll, ghobj, hdrbl);
@@ -1864,8 +1836,95 @@ struct do_fix_lost : public action_on_object_t {
   }
 };
 
+int get_snapset(ObjectStore *store, coll_t coll, ghobject_t &ghobj, SnapSet &ss, bool silent = false)
+{
+  bufferlist attr;
+  int r = store->getattr(coll, ghobj, SS_ATTR, attr);
+  if (r < 0) {
+    if (!silent)
+      cerr << "Error getting snapset on : " << make_pair(coll, ghobj) << ", "
+	   << cpp_strerror(r) << std::endl;
+    return r;
+  }
+  bufferlist::iterator bp = attr.begin();
+  try {
+    ::decode(ss, bp);
+  } catch (...) {
+    r = -EINVAL;
+    cerr << "Error decoding snapset on : " << make_pair(coll, ghobj) << ", "
+         << cpp_strerror(r) << std::endl;
+    return r;
+  }
+  return 0;
+}
+
 int print_obj_info(ObjectStore *store, coll_t coll, ghobject_t &ghobj, Formatter* formatter)
 {
+  int r = 0;
+  formatter->open_object_section("obj");
+  formatter->open_object_section("id");
+  ghobj.dump(formatter);
+  formatter->close_section();
+
+  bufferlist attr;
+  int gr = store->getattr(coll, ghobj, OI_ATTR, attr);
+  if (gr < 0) {
+    r = gr;
+    cerr << "Error getting attr on : " << make_pair(coll, ghobj) << ", "
+       << cpp_strerror(r) << std::endl;
+  } else {
+    object_info_t oi;
+    bufferlist::iterator bp = attr.begin();
+    try {
+      ::decode(oi, bp);
+      formatter->open_object_section("info");
+      oi.dump(formatter);
+      formatter->close_section();
+    } catch (...) {
+      r = -EINVAL;
+      cerr << "Error decoding attr on : " << make_pair(coll, ghobj) << ", "
+           << cpp_strerror(r) << std::endl;
+    }
+  }
+  struct stat st;
+  int sr =  store->stat(coll, ghobj, &st, true);
+  if (sr < 0) {
+    r = sr;
+    cerr << "Error stat on : " << make_pair(coll, ghobj) << ", "
+         << cpp_strerror(r) << std::endl;
+  } else {
+    formatter->open_object_section("stat");
+    formatter->dump_int("size", st.st_size);
+    formatter->dump_int("blksize", st.st_blksize);
+    formatter->dump_int("blocks", st.st_blocks);
+    formatter->dump_int("nlink", st.st_nlink);
+    formatter->close_section();
+  }
+
+  if (ghobj.hobj.has_snapset()) {
+    SnapSet ss;
+    int snr = get_snapset(store, coll, ghobj, ss);
+    if (snr < 0) {
+      r = snr;
+    } else {
+      formatter->open_object_section("SnapSet");
+      ss.dump(formatter);
+      formatter->close_section();
+    }
+  }
+  formatter->close_section();
+  formatter->flush(cout);
+  cout << std::endl;
+  return r;
+}
+
+int set_size(ObjectStore *store, coll_t coll, ghobject_t &ghobj, uint64_t setsize, Formatter* formatter,
+	     ObjectStore::Sequencer &osr)
+{
+  if (ghobj.hobj.is_snapdir()) {
+    cerr << "Can't set the size of a snapdir" << std::endl;
+    return -EINVAL;
+  }
   bufferlist attr;
   int r = store->getattr(coll, ghobj, OI_ATTR, attr);
   if (r < 0) {
@@ -1883,11 +1942,227 @@ int print_obj_info(ObjectStore *store, coll_t coll, ghobject_t &ghobj, Formatter
          << cpp_strerror(r) << std::endl;
     return r;
   }
-  formatter->open_object_section("info");
-  oi.dump(formatter);
-  formatter->close_section();
-  formatter->flush(cout);
+  struct stat st;
+  r =  store->stat(coll, ghobj, &st, true);
+  if (r < 0) {
+    cerr << "Error stat on : " << make_pair(coll, ghobj) << ", "
+         << cpp_strerror(r) << std::endl;
+  }
+  ghobject_t head(ghobj);
+  SnapSet ss;
+  bool found_head = true;
+  map<snapid_t, uint64_t>::iterator csi;
+  bool is_snap = ghobj.hobj.is_snap();
+  if (is_snap) {
+    head.hobj = head.hobj.get_head();
+    r = get_snapset(store, coll, head, ss, true);
+    if (r < 0 && r != -ENOENT) {
+      // Requested get_snapset() silent, so if not -ENOENT show error
+      cerr << "Error getting snapset on : " << make_pair(coll, head) << ", "
+	   << cpp_strerror(r) << std::endl;
+      return r;
+    }
+    if (r == -ENOENT) {
+      head.hobj = head.hobj.get_snapdir();
+      r = get_snapset(store, coll, head, ss);
+      if (r < 0)
+        return r;
+      found_head = false;
+    } else {
+      found_head = true;
+    }
+    csi = ss.clone_size.find(ghobj.hobj.snap);
+    if (csi == ss.clone_size.end()) {
+      cerr << "SnapSet is missing clone_size for snap " << ghobj.hobj.snap << std::endl;
+      return -EINVAL;
+    }
+  }
+  if ((uint64_t)st.st_size == setsize && oi.size == setsize
+       && (!is_snap || csi->second == setsize)) {
+    cout << "Size of object is already " << setsize << std::endl;
+    return 0;
+  }
+  cout << "Setting size to " << setsize << ", stat size " << st.st_size
+       << ", obj info size " << oi.size;
+  if (is_snap) {
+    cout << ", " << (found_head ? "head" : "snapdir")
+	 << " clone_size " << csi->second;
+    csi->second = setsize;
+  }
   cout << std::endl;
+  if (!dry_run) {
+    attr.clear();
+    oi.size = setsize;
+    ::encode(oi, attr);
+    ObjectStore::Transaction t;
+    t.setattr(coll, ghobj, OI_ATTR, attr);
+    t.truncate(coll, ghobj, setsize);
+    if (is_snap) {
+      bufferlist snapattr;
+      snapattr.clear();
+      ::encode(ss, snapattr);
+      t.setattr(coll, head, SS_ATTR, snapattr);
+    }
+    r = store->apply_transaction(&osr, t);
+    if (r < 0) {
+      cerr << "Error writing object info: " << make_pair(coll, ghobj) << ", "
+         << cpp_strerror(r) << std::endl;
+      return r;
+    }
+  }
+  return 0;
+}
+
+int clear_snapset(ObjectStore *store, coll_t coll, ghobject_t &ghobj,
+                  string arg, ObjectStore::Sequencer &osr)
+{
+  SnapSet ss;
+  int ret = get_snapset(store, coll, ghobj, ss);
+  if (ret < 0)
+    return ret;
+
+  // Use "head" to set head_exists incorrectly
+  if (arg == "corrupt" || arg == "head")
+    ss.head_exists = !ghobj.hobj.is_head();
+  else if (ss.head_exists != ghobj.hobj.is_head()) {
+    cerr << "Correcting head_exists, set to "
+         << (ghobj.hobj.is_head() ? "true" : "false") << std::endl;
+    ss.head_exists = ghobj.hobj.is_head();
+  }
+  // Use "corrupt" to clear entire SnapSet
+  // Use "seq" to just corrupt SnapSet.seq
+  if (arg == "corrupt" || arg == "seq")
+    ss.seq = 0;
+  // Use "snaps" to just clear SnapSet.snaps
+  if (arg == "corrupt" || arg == "snaps")
+    ss.snaps.clear();
+  // By default just clear clone, clone_overlap and clone_size
+  if (arg == "corrupt")
+    arg = "";
+  if (arg == "" || arg == "clones")
+    ss.clones.clear();
+  if (arg == "" || arg == "clone_overlap")
+    ss.clone_overlap.clear();
+  if (arg == "" || arg == "clone_size")
+    ss.clone_size.clear();
+  // Break all clone sizes by adding 1
+  if (arg == "size") {
+    for (map<snapid_t, uint64_t>::iterator i = ss.clone_size.begin();
+         i != ss.clone_size.end(); ++i)
+      ++(i->second);
+  }
+
+  if (!dry_run) {
+    bufferlist bl;
+    ::encode(ss, bl);
+    ObjectStore::Transaction t;
+    t.setattr(coll, ghobj, SS_ATTR, bl);
+    int r = store->apply_transaction(&osr, t);
+    if (r < 0) {
+      cerr << "Error setting snapset on : " << make_pair(coll, ghobj) << ", "
+	   << cpp_strerror(r) << std::endl;
+      return r;
+    }
+  }
+  return 0;
+}
+
+vector<snapid_t>::iterator find(vector<snapid_t> &v, snapid_t clid)
+{
+  return std::find(v.begin(), v.end(), clid);
+}
+
+map<snapid_t, interval_set<uint64_t> >::iterator
+find(map<snapid_t, interval_set<uint64_t> > &m, snapid_t clid)
+{
+  return m.find(clid);
+}
+
+map<snapid_t, uint64_t>::iterator find(map<snapid_t, uint64_t> &m,
+				       snapid_t clid)
+{
+  return m.find(clid);
+}
+
+template<class T>
+int remove_from(T &mv, string name, snapid_t cloneid, bool force)
+{
+  typename T::iterator i = find(mv, cloneid);
+  if (i != mv.end()) {
+    mv.erase(i);
+  } else {
+    cerr << "Clone " << cloneid << " doesn't exist in " << name;
+    if (force) {
+      cerr << " (ignored)" << std::endl;
+      return 0;
+    }
+    cerr << std::endl;
+    return -EINVAL;
+  }
+  return 0;
+}
+
+int remove_clone(ObjectStore *store, coll_t coll, ghobject_t &ghobj, snapid_t cloneid, bool force,
+		     ObjectStore::Sequencer &osr)
+{
+  // XXX: Don't allow this if in a cache tier or former cache tier
+  // bool allow_incomplete_clones() const {
+  // 	return cache_mode != CACHEMODE_NONE || has_flag(FLAG_INCOMPLETE_CLONES);
+
+  SnapSet snapset;
+  int ret = get_snapset(store, coll, ghobj, snapset);
+  if (ret < 0)
+    return ret;
+
+  // Derived from trim_object()
+  // ...from snapset
+  vector<snapid_t>::iterator p;
+  for (p = snapset.clones.begin(); p != snapset.clones.end(); ++p)
+    if (*p == cloneid)
+      break;
+  if (p == snapset.clones.end()) {
+    cerr << "Clone " << cloneid << " not present";
+    return -ENOENT;
+  }
+  if (p != snapset.clones.begin()) {
+    // not the oldest... merge overlap into next older clone
+    vector<snapid_t>::iterator n = p - 1;
+    hobject_t prev_coid = ghobj.hobj;
+    prev_coid.snap = *n;
+    //bool adjust_prev_bytes = is_present_clone(prev_coid);
+
+    //if (adjust_prev_bytes)
+    //  ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(*n);
+
+    snapset.clone_overlap[*n].intersection_of(
+	snapset.clone_overlap[*p]);
+
+    //if (adjust_prev_bytes)
+    //  ctx->delta_stats.num_bytes += snapset.get_clone_bytes(*n);
+  }
+
+  ret = remove_from(snapset.clones, "clones", cloneid, force);
+  if (ret) return ret;
+  ret = remove_from(snapset.clone_overlap, "clone_overlap", cloneid, force);
+  if (ret) return ret;
+  ret = remove_from(snapset.clone_size, "clone_size", cloneid, force);
+  if (ret) return ret;
+
+  if (dry_run)
+    return 0;
+
+  bufferlist bl;
+  ::encode(snapset, bl);
+  ObjectStore::Transaction t;
+  t.setattr(coll, ghobj, SS_ATTR, bl);
+  int r = store->apply_transaction(&osr, t);
+  if (r < 0) {
+    cerr << "Error setting snapset on : " << make_pair(coll, ghobj) << ", "
+	 << cpp_strerror(r) << std::endl;
+    return r;
+  }
+  cout << "Removal of clone " << cloneid << " complete" << std::endl;
+  cout << "Use pg repair after OSD restarted to correct stat information" << std::endl;
   return 0;
 }
 
@@ -1906,7 +2181,9 @@ void usage(po::options_description &desc)
     cerr << "ceph-objectstore-tool ... <object> list-attrs" << std::endl;
     cerr << "ceph-objectstore-tool ... <object> list-omap" << std::endl;
     cerr << "ceph-objectstore-tool ... <object> remove" << std::endl;
-    cerr << "ceph-objectstore-tool ... <object> dump-info" << std::endl;
+    cerr << "ceph-objectstore-tool ... <object> dump" << std::endl;
+    cerr << "ceph-objectstore-tool ... <object> set-size" << std::endl;
+    cerr << "ceph-objectstore-tool ... <object> remove-clone-metadata <cloneid>" << std::endl;
     cerr << std::endl;
     cerr << "<object> can be a JSON object description as displayed" << std::endl;
     cerr << "by --op list." << std::endl;
@@ -1947,6 +2224,7 @@ int main(int argc, char **argv)
   bool human_readable;
   bool force;
   Formatter *formatter;
+  bool head;
 
   po::options_description desc("Allowed options");
   desc.add_options()
@@ -1972,6 +2250,7 @@ int main(int argc, char **argv)
     ("force", "Ignore some types of errors and proceed with operation - USE WITH CAUTION: CORRUPTION POSSIBLE NOW OR IN THE FUTURE")
     ("skip-journal-replay", "Disable journal replay")
     ("skip-mount-omap", "Disable mounting of omap")
+    ("head", "Find head/snapdir when searching for objects by name")
     ("dry-run", "Don't modify the objectstore")
     ;
 
@@ -2029,6 +2308,8 @@ int main(int argc, char **argv)
   if (vm.count("skip-mount-omap"))
     flags |= SKIP_MOUNT_OMAP;
 
+  head = (vm.count("head") > 0);
+
   vector<const char *> ceph_options;
   env_to_vec(ceph_options);
   ceph_options.reserve(ceph_options.size() + ceph_option_strings.size());
@@ -2101,7 +2382,8 @@ int main(int argc, char **argv)
   }
 
   if (file_fd != fd_none && file_fd < 0) {
-    perror("open");
+    string err = string("file: ") + file;
+    perror(err.c_str());
     myexit(1);
   }
 
@@ -2134,15 +2416,21 @@ int main(int argc, char **argv)
   // Special handling for filestore journal, so we can dump it without mounting
   if (op == "dump-journal" && type == "filestore") {
     int ret = mydump_journal(formatter, jpath, g_conf->journal_dio);
+    if (ret < 0) {
+      cerr << "journal-path: " << jpath << ": "
+	   << cpp_strerror(ret) << std::endl;
+      myexit(1);
+    }
     formatter->flush(cout);
-    myexit(ret != 0);
+    myexit(0);
   }
 
   //Verify that data-path really exists
   struct stat st;
   if (::stat(dpath.c_str(), &st) == -1) {
-     perror("data-path");
-     myexit(1);
+    string err = string("data-path: ") + dpath;
+    perror(err.c_str());
+    myexit(1);
   }
  //Verify that data-path really is a filestore
   if (type == "filestore") {
@@ -2250,14 +2538,17 @@ int main(int argc, char **argv)
     json_spirit::Value v;
     try {
       if (!json_spirit::read(object, v)) {
-	lookup_ghobject lookup(object);
+        // Special: Need head/snapdir so set even if user didn't specify
+        if (vm.count("objcmd") && (objcmd == "remove-clone-metadata"))
+	  head = true;
+	lookup_ghobject lookup(object, head);
 	if (action_on_all_objects(fs, lookup, debug)) {
 	  throw std::runtime_error("Internal error");
 	} else {
 	  if (lookup.size() != 1) {
 	    stringstream ss;
 	    if (lookup.size() == 0)
-	      ss << "No object id '" << object << "' found";
+	      ss << "No object id '" << object << "' found or invalid JSON specified";
 	    else
 	      ss << "Found " << lookup.size() << " objects with id '" << object
 		 << "', please use a JSON spec from --op list instead";
@@ -2271,18 +2562,22 @@ int main(int argc, char **argv)
       } else {
 	stringstream ss;
 	if (pgidstr.length() == 0 && v.type() != json_spirit::array_type) {
-	  ss << "object '" << object
-	     << "' must be a JSON array but is of type "
-	     << v.type() << " instead";
+	  ss << "Without --pgid the object '" << object
+	     << "' must be a JSON array";
 	  throw std::runtime_error(ss.str());
 	}
 	if (v.type() == json_spirit::array_type) {
 	  json_spirit::Array array = v.get_array();
+	  if (array.size() != 2) {
+	    ss << "Object '" << object
+	       << "' must be a JSON array with 2 elements";
+	    throw std::runtime_error(ss.str());
+	  }
 	  vector<json_spirit::Value>::iterator i = array.begin();
+	  //if (i == array.end() || i->type() != json_spirit::str_type) {
 	  if (i->type() != json_spirit::str_type) {
-	    ss << "object '" << object
-	       << "' must be a JSON array with the first element a string but "
-	       << "found type " << v.type() << " instead";
+	    ss << "Object '" << object
+	       << "' must be a JSON array with the first element a string";
 	    throw std::runtime_error(ss.str());
 	  }
 	  string object_pgidstr = i->get_str();
@@ -2309,7 +2604,7 @@ int main(int argc, char **argv)
 	try {
 	  ghobj.decode(v);
 	} catch (std::runtime_error& e) {
-	  ss << "Decode object json error: " << e.what();
+	  ss << "Decode object JSON error: " << e.what();
 	  throw std::runtime_error(ss.str());
 	}
         if (pgidstr != "meta" && (uint64_t)pgid.pgid.m_pool != (uint64_t)ghobj.hobj.pool) {
@@ -2537,7 +2832,7 @@ int main(int argc, char **argv)
   }
 
   if (op == "list") {
-    ret = do_list(fs, pgidstr, object, formatter, debug, human_readable);
+    ret = do_list(fs, pgidstr, object, formatter, debug, human_readable, head);
     if (ret < 0) {
       cerr << "do_list failed: " << cpp_strerror(ret) << std::endl;
     }
@@ -2794,9 +3089,60 @@ int main(int argc, char **argv)
 	if (fd != STDIN_FILENO)
 	  close(fd);
         goto out;
-      } else if (objcmd == "dump-info") {
+      } else if (objcmd == "dump") {
+	// There should not be any other arguments
+	if (vm.count("arg1") || vm.count("arg2")) {
+	  usage(desc);
+	  ret = 1;
+	  goto out;
+	}
 	ret = print_obj_info(fs, coll, ghobj, formatter);
 	goto out;
+      } else if (objcmd == "set-size") {
+        // Extra arg
+	if (vm.count("arg1") == 0 || vm.count("arg2")) {
+	  usage(desc);
+          ret = 1;
+          goto out;
+        }
+        if (arg1.length() == 0 || !isdigit(arg1.c_str()[0])) {
+	  cerr << "Invalid size '" << arg1 << "' specified" << std::endl;
+	  ret = 1;
+	  goto out;
+	}
+	uint64_t size = atoll(arg1.c_str());
+	ret = set_size(fs, coll, ghobj, size, formatter, *osr);
+	goto out;
+      } else if (objcmd == "clear-snapset") {
+        // UNDOCUMENTED: test-only command that zaps the SnapSet
+        // IGNORE extra args since not in usage anyway
+	if (!ghobj.hobj.has_snapset()) {
+	  cerr << "'" << objcmd << "' requires a head or snapdir object" << std::endl;
+	  ret = 1;
+	  goto out;
+	}
+        ret = clear_snapset(fs, coll, ghobj, arg1, *osr);
+        goto out;
+      } else if (objcmd == "remove-clone-metadata") {
+        // Extra arg
+	if (vm.count("arg1") == 0 || vm.count("arg2")) {
+	  usage(desc);
+          ret = 1;
+          goto out;
+        }
+	if (!ghobj.hobj.has_snapset()) {
+	  cerr << "'" << objcmd << "' requires a head or snapdir object" << std::endl;
+	  ret = 1;
+	  goto out;
+	}
+        if (arg1.length() == 0 || !isdigit(arg1.c_str()[0])) {
+	  cerr << "Invalid cloneid '" << arg1 << "' specified" << std::endl;
+	  ret = 1;
+	  goto out;
+	}
+        snapid_t cloneid = atoi(arg1.c_str());
+	ret = remove_clone(fs, coll, ghobj, cloneid, force, *osr);
+	goto out;
       }
       cerr << "Unknown object command '" << objcmd << "'" << std::endl;
       usage(desc);
@@ -2864,6 +3210,10 @@ int main(int argc, char **argv)
       cout << "Remove past-intervals " << past_intervals << std::endl;
 
       past_intervals.clear();
+      if (dry_run) {
+        ret = 0;
+        goto out;
+      }
       ret = write_info(*t, map_epoch, info, past_intervals);
 
       if (ret == 0) {
@@ -2891,11 +3241,13 @@ int main(int argc, char **argv)
       info.history.last_epoch_clean = superblock.current_epoch;
       past_intervals.clear();
 
-      ret = write_info(*t, map_epoch, info, past_intervals);
-      if (ret == 0) {
+      if (!dry_run) {
+	ret = write_info(*t, map_epoch, info, past_intervals);
+	if (ret != 0)
+	  goto out;
 	fs->apply_transaction(osr, *t);
-	cout << "Marking complete succeeded" << std::endl;
       }
+      cout << "Marking complete succeeded" << std::endl;
     } else {
       assert(!"Should have already checked for valid --op");
     }
diff --git a/src/tools/ceph_osdomap_tool.cc b/src/tools/ceph_osdomap_tool.cc
index fb950fa..465ffda 100644
--- a/src/tools/ceph_osdomap_tool.cc
+++ b/src/tools/ceph_osdomap_tool.cc
@@ -20,7 +20,7 @@
 #include "global/global_init.h"
 
 #include "os/DBObjectMap.h"
-#include "os/LevelDBStore.h"
+#include "kv/KeyValueDB.h"
 
 namespace po = boost::program_options;
 using namespace std;
@@ -86,11 +86,11 @@ int main(int argc, char **argv) {
     return 1;
   }
 
-  LevelDBStore* store(new LevelDBStore(g_ceph_context, store_path));
-  if (vm.count("paranoid")) {
+  KeyValueDB* store(KeyValueDB::create(g_ceph_context, "leveldb", store_path));
+  /*if (vm.count("paranoid")) {
     std::cerr << "Enabling paranoid checks" << std::endl;
     store->options.paranoid_checks = true;
-  }
+    }*/
   DBObjectMap omap(store);
   stringstream out;
   int r = store->open(out);
diff --git a/src/tools/cephfs/Dumper.cc b/src/tools/cephfs/Dumper.cc
index 0603661..f97029e 100644
--- a/src/tools/cephfs/Dumper.cc
+++ b/src/tools/cephfs/Dumper.cc
@@ -29,6 +29,7 @@
 
 #define dout_subsys ceph_subsys_mds
 
+#define HEADER_LEN 4096
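+// HEADER_LEN bounds both the snprintf() in dump() and the safe_read() in
+// undump(), so writer and reader agree on the size of the text header.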
 
 int Dumper::init(int rank_)
 {
@@ -90,9 +91,9 @@ int Dumper::dump(const char *dump_file)
   int fd = ::open(dump_file, O_WRONLY|O_CREAT|O_TRUNC, 0644);
   if (fd >= 0) {
     // include an informative header
-    char buf[200];
+    char buf[HEADER_LEN];
     memset(buf, 0, sizeof(buf));
-    sprintf(buf, "Ceph mds%d journal dump\n start offset %llu (0x%llx)\n       length %llu (0x%llx)\n    write_pos %llu (0x%llx)\n    format %llu\n    trimmed_pos %llu (0x%llx)\n%c",
+    snprintf(buf, HEADER_LEN, "Ceph mds%d journal dump\n start offset %llu (0x%llx)\n       length %llu (0x%llx)\n    write_pos %llu (0x%llx)\n    format %llu\n    trimmed_pos %llu (0x%llx)\n%c",
 	    rank, 
 	    (unsigned long long)start, (unsigned long long)start,
 	    (unsigned long long)len, (unsigned long long)len,
@@ -185,7 +186,7 @@ int Dumper::undump(const char *dump_file)
   //  start offset 232401996 (0xdda2c4c)
   //        length 1097504 (0x10bf20)
 
-  char buf[200];
+  char buf[HEADER_LEN];
   r = safe_read(fd, buf, sizeof(buf));
   if (r < 0) {
     VOID_TEMP_FAILURE_RETRY(::close(fd));
diff --git a/src/tools/rados/RadosImport.cc b/src/tools/rados/RadosImport.cc
index 1f74af2..b4b397b 100644
--- a/src/tools/rados/RadosImport.cc
+++ b/src/tools/rados/RadosImport.cc
@@ -317,7 +317,9 @@ int RadosImport::get_object_rados(librados::IoCtx &ioctx, bufferlist &bl, bool n
         break;
       for (std::map<string,bufferlist>::iterator i = as.data.begin();
           i != as.data.end(); ++i) {
-        if (i->first == "_" || i->first == "snapset")
+        // The user xattrs we want all begin with "_" and are longer than 1;
+        // drop the bare "_" key and any attribute not starting with '_'.
+        if (i->first == "_" || i->first[0] != '_')
           continue;
         ret = ioctx.setxattr(ob.hoid.hobj.oid.name, i->first.substr(1).c_str(), i->second);
         if (ret) {
diff --git a/src/tools/rados/rados.cc b/src/tools/rados/rados.cc
index 4f4b086..6f87c68 100644
--- a/src/tools/rados/rados.cc
+++ b/src/tools/rados/rados.cc
@@ -191,6 +191,12 @@ void usage(ostream& out)
 "        prefix output with date/time\n"
 "   --no-verify\n"
 "        do not verify contents of read objects\n"
+"   --write-object\n"
+"        write contents to the objects\n"
+"   --write-omap\n"
+"        write contents to the omap\n"
+"   --write-xattr\n"
+"        write contents to the extended attributes\n"
 "\n"
 "LOAD GEN OPTIONS:\n"
 "   --num-objects                    total number of objects\n"
@@ -822,6 +828,11 @@ void LoadGen::cleanup()
   }
 }
 
+enum OpWriteDest {
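+  // Bit flags for bench write destinations; they may be OR'd together.
+  // Note the values are 2 << n (2, 4, 8), still distinct single bits.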
+  OP_WRITE_DEST_OBJ = 2 << 0,
+  OP_WRITE_DEST_OMAP = 2 << 1,
+  OP_WRITE_DEST_XATTR = 2 << 2,
+};
 
 class RadosBencher : public ObjBencher {
   librados::AioCompletion **completions;
@@ -829,6 +840,8 @@ class RadosBencher : public ObjBencher {
   librados::IoCtx& io_ctx;
   librados::NObjectIterator oi;
   bool iterator_valid;
+  OpWriteDest write_destination;
+
 protected:
   int completions_init(int concurrentios) {
     completions = new librados::AioCompletion *[concurrentios];
@@ -856,7 +869,23 @@ protected:
   }
 
   int aio_write(const std::string& oid, int slot, bufferlist& bl, size_t len) {
-    return io_ctx.aio_write(oid, completions[slot], bl, len, 0);
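+    // Build one compound ObjectWriteOperation so a single AIO submission
+    // can hit object data, omap and xattr as selected by write_destination.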
+    librados::ObjectWriteOperation op;
+
+    if (write_destination & OP_WRITE_DEST_OBJ) {
+      op.write(0, bl);
+    }
+
+    if (write_destination & OP_WRITE_DEST_OMAP) {
+      std::map<std::string, librados::bufferlist> omap;
+      omap["bench-omap-key"] = bl;
+      op.omap_set(omap);
+    }
+
+    if (write_destination & OP_WRITE_DEST_XATTR) {
+      op.setxattr("bench-xattr-key", bl);
+    }
+
+    return io_ctx.aio_operate(oid, completions[slot], &op);
   }
 
   int aio_remove(const std::string& oid, int slot) {
@@ -916,8 +945,12 @@ protected:
 
 public:
   RadosBencher(CephContext *cct_, librados::Rados& _r, librados::IoCtx& _i)
-    : ObjBencher(cct_), completions(NULL), rados(_r), io_ctx(_i), iterator_valid(false) {}
+    : ObjBencher(cct_), completions(NULL), rados(_r), io_ctx(_i), iterator_valid(false), write_destination(OP_WRITE_DEST_OBJ) {}
   ~RadosBencher() { }
+
+  void set_write_destination(OpWriteDest dest) {
+    write_destination = dest;
+  }
 };
 
 static int do_lock_cmd(std::vector<const char*> &nargs,
@@ -1182,6 +1215,7 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
   int concurrent_ios = 16;
   unsigned op_size = default_op_size;
   bool block_size_specified = false;
+  int bench_write_dest = 0;
   bool cleanup = true;
   bool no_verify = false;
   bool use_striper = false;
@@ -1365,7 +1399,18 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
   if (i != opts.end()) {
     output = i->second.c_str();
   }
-
+  i = opts.find("write-dest-obj");
+  if (i != opts.end()) {
+    bench_write_dest |= static_cast<int>(OP_WRITE_DEST_OBJ);
+  }
+  i = opts.find("write-dest-omap");
+  if (i != opts.end()) {
+    bench_write_dest |= static_cast<int>(OP_WRITE_DEST_OMAP);
+  }
+  i = opts.find("write-dest-xattr");
+  if (i != opts.end()) {
+    bench_write_dest |= static_cast<int>(OP_WRITE_DEST_XATTR);
+  }
 
   // open rados
   ret = rados.init_with_context(g_ceph_context);
@@ -1405,14 +1450,31 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
       goto out;
     }
 
-    // align op_size
-    if (io_ctx.pool_requires_alignment()) {
-      const uint64_t align = io_ctx.pool_required_alignment();
-      const uint64_t prev_op_size = op_size;
-      op_size = uint64_t((op_size + align - 1) / align) * align;
-      // Warn: if user specified and it was rounded
-      if (prev_op_size != default_op_size && prev_op_size != op_size)
-	cerr << "INFO: op_size has been rounded to " << op_size << std::endl;
+    // align op_size
+    {
+      bool requires;
+      ret = io_ctx.pool_requires_alignment2(&requires);
+      if (ret < 0) {
+        cerr << "error checking pool alignment requirement"
+          << cpp_strerror(ret) << std::endl;
+        goto out;	
+      }
+
+      if (requires) {
+        uint64_t align = 0;
+        ret = io_ctx.pool_required_alignment2(&align);
+        if (ret < 0) {
+          cerr << "error getting pool alignment"
+            << cpp_strerror(ret) << std::endl;
+          goto out;	
+        }
+
+        const uint64_t prev_op_size = op_size;
+        op_size = uint64_t((op_size + align - 1) / align) * align;
+        // Warn: if user specified and it was rounded
+        if (prev_op_size != default_op_size && prev_op_size != op_size)
+          cerr << "INFO: op_size has been rounded to " << op_size << std::endl;
+      }
     }
 
     // create striper interface
@@ -1930,11 +1992,11 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
     }
 
     if (values.size() && values.begin()->first == key) {
-      cout << " (length " << values.begin()->second.length() << ") : ";
       if (!outfile.empty()) {
 	cerr << "Writing to " << outfile << std::endl;
 	dump_data(outfile, values.begin()->second);
       } else {
+        cout << "value (" << values.begin()->second.length() << " bytes) :\n";
 	values.begin()->second.hexdump(cout);
 	cout << std::endl;
       }
@@ -1984,7 +2046,7 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
 	// dump key in hex if it contains nonprintable characters
 	if (std::count_if(it->first.begin(), it->first.end(),
 	    (int (*)(int))isprint) < (int)it->first.length()) {
-	  cout << "key: (" << it->first.length() << " bytes):\n";
+	  cout << "key (" << it->first.length() << " bytes):\n";
 	  bufferlist keybl;
 	  keybl.append(it->first);
 	  keybl.hexdump(cout);
@@ -1992,7 +2054,7 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
 	  cout << it->first;
 	}
 	cout << std::endl;
-	cout << "value: (" << it->second.length() << " bytes) :\n";
+	cout << "value (" << it->second.length() << " bytes):\n";
 	it->second.hexdump(cout);
 	cout << std::endl;
       }
@@ -2405,12 +2467,25 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
       operation = OP_RAND_READ;
     else
       usage_exit();
-    if (block_size_specified && (operation != OP_WRITE)){
-      cerr << "-b|--block_size option can be used only with `write' bench test"
-           << std::endl;
-      ret = -EINVAL;
-      goto out;
+    if (operation != OP_WRITE) {
+      if (block_size_specified) {
+        cerr << "-b|--block_size option can be used only with `write' bench test"
+             << std::endl;
+        ret = -EINVAL;
+        goto out;
+      }
+      if (bench_write_dest != 0) {
+        cerr << "--write-object, --write-omap and --write-xattr options can "
+                "only be used with the 'write' bench test"
+             << std::endl;
+        ret = -EINVAL;
+        goto out;
+      }
+    } else if (bench_write_dest == 0) {
+      bench_write_dest = OP_WRITE_DEST_OBJ;
     }
+
     if (!formatter && output) {
       cerr << "-o|--output option can be used only with '--format' option"
            << std::endl;
@@ -2419,6 +2494,8 @@ static int rados_tool_common(const std::map < std::string, std::string > &opts,
     }
     RadosBencher bencher(g_ceph_context, rados, io_ctx);
     bencher.set_show_time(show_time);
+    bencher.set_write_destination(static_cast<OpWriteDest>(bench_write_dest));
+
     ostream *outstream = NULL;
     if (formatter) {
       bencher.set_formatter(formatter);
@@ -2950,6 +3027,12 @@ int main(int argc, const char **argv)
       opts["default"] = "true";
     } else if (ceph_argparse_witharg(args, i, &val, "-o", "--output", (char*)NULL)) {
       opts["output"] = val;
+    } else if (ceph_argparse_flag(args, i, "--write-omap", (char*)NULL)) {
+      opts["write-dest-omap"] = "true";
+    } else if (ceph_argparse_flag(args, i, "--write-object", (char*)NULL)) {
+      opts["write-dest-obj"] = "true";
+    } else if (ceph_argparse_flag(args, i, "--write-xattr", (char*)NULL)) {
+      opts["write-dest-xattr"] = "true";
     } else {
       if (val[0] == '-')
         usage_exit();
diff --git a/src/tools/rbd/ArgumentTypes.cc b/src/tools/rbd/ArgumentTypes.cc
new file mode 100644
index 0000000..f18e88d
--- /dev/null
+++ b/src/tools/rbd/ArgumentTypes.cc
@@ -0,0 +1,342 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "include/rbd/features.h"
+#include "common/config.h"
+#include "common/strtol.h"
+#include "common/Formatter.h"
+#include "global/global_context.h"
+#include <iostream>
+#include <boost/tokenizer.hpp>
+
+namespace rbd {
+namespace argument_types {
+
+namespace po = boost::program_options;
+
+const std::map<uint64_t, std::string> ImageFeatures::FEATURE_MAPPING = {
+  {RBD_FEATURE_LAYERING, "layering"},
+  {RBD_FEATURE_STRIPINGV2, "striping"},
+  {RBD_FEATURE_EXCLUSIVE_LOCK, "exclusive-lock"},
+  {RBD_FEATURE_OBJECT_MAP, "object-map"},
+  {RBD_FEATURE_FAST_DIFF, "fast-diff"},
+  {RBD_FEATURE_DEEP_FLATTEN, "deep-flatten"},
+  {RBD_FEATURE_JOURNALING, "journaling"}};
+
+Format::Formatter Format::create_formatter(bool pretty) const {
+  if (value == "json") {
+    return Formatter(new JSONFormatter(pretty));
+  } else if (value == "xml") {
+    return Formatter(new XMLFormatter(pretty));
+  }
+  return Formatter();
+}
+
+std::string get_name_prefix(ArgumentModifier modifier) {
+  switch (modifier) {
+  case ARGUMENT_MODIFIER_SOURCE:
+    return SOURCE_PREFIX;
+  case ARGUMENT_MODIFIER_DEST:
+    return DEST_PREFIX;
+  default:
+    return "";
+  }
+}
+
+std::string get_description_prefix(ArgumentModifier modifier) {
+  switch (modifier) {
+  case ARGUMENT_MODIFIER_SOURCE:
+    return "source ";
+  case ARGUMENT_MODIFIER_DEST:
+    return "destination ";
+  default:
+    return "";
+  }
+}
+
+void add_pool_option(po::options_description *opt,
+                     ArgumentModifier modifier,
+                     const std::string &desc_suffix) {
+  std::string name = POOL_NAME + ",p";
+  std::string description = "pool name";
+  switch (modifier) {
+  case ARGUMENT_MODIFIER_NONE:
+    break;
+  case ARGUMENT_MODIFIER_SOURCE:
+    description = "source " + description;
+    break;
+  case ARGUMENT_MODIFIER_DEST:
+    name = DEST_POOL_NAME;
+    description = "destination " + description;
+    break;
+  }
+  description += desc_suffix;
+
+  // TODO add validator
+  opt->add_options()
+    (name.c_str(), po::value<std::string>(), description.c_str());
+}
+
+void add_image_option(po::options_description *opt,
+                      ArgumentModifier modifier,
+                      const std::string &desc_suffix) {
+  std::string name = IMAGE_NAME;
+  std::string description = "image name";
+  switch (modifier) {
+  case ARGUMENT_MODIFIER_NONE:
+    break;
+  case ARGUMENT_MODIFIER_SOURCE:
+    description = "source " + description;
+    break;
+  case ARGUMENT_MODIFIER_DEST:
+    name = DEST_IMAGE_NAME;
+    description = "destination " + description;
+    break;
+  }
+  description += desc_suffix;
+
+  // TODO add validator
+  opt->add_options()
+    (name.c_str(), po::value<std::string>(), description.c_str());
+}
+
+void add_snap_option(po::options_description *opt,
+                      ArgumentModifier modifier) {
+  if (modifier == ARGUMENT_MODIFIER_DEST) {
+    return;
+  }
+
+  std::string name = SNAPSHOT_NAME;
+  std::string description = "snapshot name";
+  switch (modifier) {
+  case ARGUMENT_MODIFIER_NONE:
+  case ARGUMENT_MODIFIER_DEST:
+    break;
+  case ARGUMENT_MODIFIER_SOURCE:
+    description = "source " + description;
+    break;
+  }
+
+  // TODO add validator
+  opt->add_options()
+    (name.c_str(), po::value<std::string>(), description.c_str());
+}
+
+void add_image_spec_options(po::options_description *pos,
+                            po::options_description *opt,
+                            ArgumentModifier modifier) {
+  pos->add_options()
+    ((get_name_prefix(modifier) + IMAGE_SPEC).c_str(),
+     (get_description_prefix(modifier) + "image specification\n" +
+      "(example: [<pool-name>/]<image-name>)").c_str());
+  add_pool_option(opt, modifier);
+  add_image_option(opt, modifier);
+}
+
+void add_snap_spec_options(po::options_description *pos,
+                           po::options_description *opt,
+                           ArgumentModifier modifier) {
+  pos->add_options()
+    ((get_name_prefix(modifier) + SNAPSHOT_SPEC).c_str(),
+     (get_description_prefix(modifier) + "snapshot specification\n" +
+      "(example: [<pool-name>/]<image-name>@<snapshot-name>)").c_str());
+  add_pool_option(opt, modifier);
+  add_image_option(opt, modifier);
+  add_snap_option(opt, modifier);
+}
+
+void add_image_or_snap_spec_options(po::options_description *pos,
+                                    po::options_description *opt,
+                                    ArgumentModifier modifier) {
+  pos->add_options()
+    ((get_name_prefix(modifier) + IMAGE_OR_SNAPSHOT_SPEC).c_str(),
+     (get_description_prefix(modifier) + "image or snapshot specification\n" +
+      "(example: [<pool-name>/]<image-name>[@<snap-name>])").c_str());
+  add_pool_option(opt, modifier);
+  add_image_option(opt, modifier);
+  add_snap_option(opt, modifier);
+}
+
+void add_create_image_options(po::options_description *opt,
+                              bool include_format) {
+  // TODO get default image format from conf
+  if (include_format) {
+    opt->add_options()
+      (IMAGE_FORMAT.c_str(), po::value<ImageFormat>(), "image format [1 or 2]")
+      (IMAGE_NEW_FORMAT.c_str(),
+       po::value<ImageNewFormat>()->zero_tokens(),
+       "use image format 2\n(deprecated)");
+  }
+
+  opt->add_options()
+    (IMAGE_ORDER.c_str(), po::value<ImageOrder>(),
+     "object order [12 <= order <= 25]")
+    (IMAGE_FEATURES.c_str(), po::value<ImageFeatures>()->composing(),
+     ("image features\n" + get_short_features_help(true)).c_str())
+    (IMAGE_SHARED.c_str(), po::bool_switch(), "shared image")
+    (IMAGE_STRIPE_UNIT.c_str(), po::value<uint32_t>(), "stripe unit")
+    (IMAGE_STRIPE_COUNT.c_str(), po::value<uint32_t>(), "stripe count");
+}
+
+void add_size_option(boost::program_options::options_description *opt) {
+  opt->add_options()
+    ((IMAGE_SIZE + ",s").c_str(), po::value<ImageSize>()->required(),
+     "image size (in M/G/T)");
+}
+
+void add_path_options(boost::program_options::options_description *pos,
+                      boost::program_options::options_description *opt,
+                      const std::string &description) {
+  pos->add_options()
+    (PATH_NAME.c_str(), po::value<std::string>(), description.c_str());
+  opt->add_options()
+    (PATH.c_str(), po::value<std::string>(), description.c_str());
+}
+
+void add_no_progress_option(boost::program_options::options_description *opt) {
+  opt->add_options()
+    (NO_PROGRESS.c_str(), po::bool_switch(), "disable progress output");
+}
+
+void add_format_options(boost::program_options::options_description *opt) {
+  opt->add_options()
+    (FORMAT.c_str(), po::value<Format>(), "output format [plain, json, or xml]")
+    (PRETTY_FORMAT.c_str(), po::bool_switch(),
+     "pretty formatting (json and xml)");
+}
+
+std::string get_short_features_help(bool append_suffix) {
+  std::ostringstream oss;
+  bool first_feature = true;
+  oss << "[";
+  for (auto &pair : ImageFeatures::FEATURE_MAPPING) {
+    if (!first_feature) {
+      oss << ", ";
+    }
+    first_feature = false;
+
+    std::string suffix;
+    if (append_suffix) {
+      if ((pair.first & RBD_FEATURES_MUTABLE) != 0) {
+        suffix += "*";
+      }
+      if ((pair.first & g_conf->rbd_default_features) != 0) {
+        suffix += "+";
+      }
+      if (!suffix.empty()) {
+        suffix = "(" + suffix + ")";
+      }
+    }
+    oss << pair.second << suffix;
+  }
+  oss << "]";
+  return oss.str();
+}
+
+std::string get_long_features_help() {
+  std::ostringstream oss;
+  oss << "Image Features:" << std::endl
+      << "  (*) supports enabling/disabling on existing images" << std::endl
+      << "  (+) enabled by default for new images if features not specified"
+      << std::endl;
+  return oss.str();
+}
+
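+// Custom option parsers: boost::program_options locates these validate()
+// overloads via argument-dependent lookup when parsing the typed values
+// declared in ArgumentTypes.h.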
+void validate(boost::any& v, const std::vector<std::string>& values,
+              ImageSize *target_type, int) {
+  po::validators::check_first_occurrence(v);
+  const std::string &s = po::validators::get_single_string(values);
+
+  std::string parse_error;
+  uint64_t size = strict_sistrtoll(s.c_str(), &parse_error);
+  if (!parse_error.empty()) {
+    throw po::validation_error(po::validation_error::invalid_option_value);
+  }
+
+  // NOTE: The three lines below can be removed once all applications that
+  // use this CLI pass an explicit B/K/M/G/T/P/E suffix with the size value.
+  if (isdigit(*s.rbegin())) {
+    size = size << 20;   // Default MB to Bytes
+  }
+  v = boost::any(size);
+}
+
+void validate(boost::any& v, const std::vector<std::string>& values,
+              ImageOrder *target_type, int dummy) {
+  po::validators::check_first_occurrence(v);
+  const std::string &s = po::validators::get_single_string(values);
+  try {
+    uint32_t order = boost::lexical_cast<uint32_t>(s);
+    if (order >= 12 && order <= 25) {
+      v = boost::any(order);
+      return;
+    }
+  } catch (const boost::bad_lexical_cast &) {
+  }
+  throw po::validation_error(po::validation_error::invalid_option_value);
+}
+
+void validate(boost::any& v, const std::vector<std::string>& values,
+              ImageFormat *target_type, int dummy) {
+  po::validators::check_first_occurrence(v);
+  const std::string &s = po::validators::get_single_string(values);
+  try {
+    uint32_t format = boost::lexical_cast<uint32_t>(s);
+    if (format == 1 || format == 2) {
+      v = boost::any(format);
+      return;
+    }
+  } catch (const boost::bad_lexical_cast &) {
+  }
+  throw po::validation_error(po::validation_error::invalid_option_value);
+}
+
+void validate(boost::any& v, const std::vector<std::string>& values,
+              ImageNewFormat *target_type, int dummy) {
+  std::cout << "rbd: --new-format is deprecated, use --image-format"
+            << std::endl;
+  v = boost::any(true);
+}
+
+void validate(boost::any& v, const std::vector<std::string>& values,
+              ImageFeatures *target_type, int) {
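+  // --image-feature is declared composing(): each occurrence may hold a
+  // comma-separated list, and every occurrence ORs into one feature mask.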
+  if (v.empty()) {
+    v = boost::any(static_cast<uint64_t>(0));
+  }
+
+  uint64_t &features = boost::any_cast<uint64_t &>(v);
+  for (auto &value : values) {
+    boost::char_separator<char> sep(",");
+    boost::tokenizer<boost::char_separator<char> > tok(value, sep);
+    for (auto &token : tok) {
+      bool matched = false;
+      for (auto &it : ImageFeatures::FEATURE_MAPPING) {
+        if (token == it.second) {
+          features |= it.first;
+          matched = true;
+          break;
+        }
+      }
+
+      if (!matched) {
+        throw po::validation_error(po::validation_error::invalid_option_value);
+      }
+    }
+  }
+}
+
+void validate(boost::any& v, const std::vector<std::string>& values,
+              Format *target_type, int) {
+  po::validators::check_first_occurrence(v);
+  const std::string &s = po::validators::get_single_string(values);
+  if (s == "plain" || s == "json" || s == "xml") {
+    v = boost::any(Format(s));
+  } else {
+    throw po::validation_error(po::validation_error::invalid_option_value);
+  }
+}
+
+} // namespace argument_types
+} // namespace rbd
diff --git a/src/tools/rbd/ArgumentTypes.h b/src/tools/rbd/ArgumentTypes.h
new file mode 100644
index 0000000..47ad55f
--- /dev/null
+++ b/src/tools/rbd/ArgumentTypes.h
@@ -0,0 +1,157 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_ARGUMENT_TYPES_H
+#define CEPH_RBD_ARGUMENT_TYPES_H
+
+#include "include/int_types.h"
+#include <set>
+#include <string>
+#include <vector>
+#include <boost/any.hpp>
+#include <boost/program_options.hpp>
+#include <boost/shared_ptr.hpp>
+
+namespace ceph { class Formatter; }
+
+namespace rbd {
+namespace argument_types {
+
+enum ArgumentModifier {
+  ARGUMENT_MODIFIER_NONE,
+  ARGUMENT_MODIFIER_SOURCE,
+  ARGUMENT_MODIFIER_DEST
+};
+
+enum SpecFormat {
+  SPEC_FORMAT_IMAGE,
+  SPEC_FORMAT_SNAPSHOT,
+  SPEC_FORMAT_IMAGE_OR_SNAPSHOT
+};
+
+static const std::string DEFAULT_POOL_NAME("rbd");
+
+static const std::string SOURCE_PREFIX("source-");
+static const std::string DEST_PREFIX("dest-");
+
+// positional arguments
+static const std::string POSITIONAL_COMMAND_SPEC("positional-command-spec");
+static const std::string POSITIONAL_ARGUMENTS("positional-arguments");
+static const std::string IMAGE_SPEC("image-spec");
+static const std::string SNAPSHOT_SPEC("snap-spec");
+static const std::string IMAGE_OR_SNAPSHOT_SPEC("image-or-snap-spec");
+static const std::string PATH_NAME("path-name");
+
+// optional arguments
+static const std::string POOL_NAME("pool");
+static const std::string DEST_POOL_NAME("dest-pool");
+static const std::string IMAGE_NAME("image");
+static const std::string DEST_IMAGE_NAME("dest");
+static const std::string SNAPSHOT_NAME("snap");
+static const std::string PATH("path");
+static const std::string FROM_SNAPSHOT_NAME("from-snap");
+static const std::string WHOLE_OBJECT("whole-object");
+
+static const std::string IMAGE_FORMAT("image-format");
+static const std::string IMAGE_NEW_FORMAT("new-format");
+static const std::string IMAGE_ORDER("order");
+static const std::string IMAGE_FEATURES("image-feature");
+static const std::string IMAGE_SHARED("image-shared");
+static const std::string IMAGE_SIZE("size");
+static const std::string IMAGE_STRIPE_UNIT("stripe-unit");
+static const std::string IMAGE_STRIPE_COUNT("stripe-count");
+
+static const std::string NO_PROGRESS("no-progress");
+static const std::string FORMAT("format");
+static const std::string PRETTY_FORMAT("pretty-format");
+
+static const std::set<std::string> SWITCH_ARGUMENTS = {
+  WHOLE_OBJECT, NO_PROGRESS, PRETTY_FORMAT};
+
+struct ImageSize {};
+struct ImageOrder {};
+struct ImageFormat {};
+struct ImageNewFormat {};
+
+struct ImageFeatures {
+  static const std::map<uint64_t, std::string> FEATURE_MAPPING;
+
+  uint64_t features;
+};
+
+template <typename T>
+struct TypedValue {
+  T value;
+  TypedValue(const T& t) : value(t) {}
+};
+
+struct Format : public TypedValue<std::string> {
+  typedef boost::shared_ptr<ceph::Formatter> Formatter;
+
+  Format(const std::string &format) : TypedValue<std::string>(format) {}
+
+  Formatter create_formatter(bool pretty) const;
+};
+
+std::string get_name_prefix(ArgumentModifier modifier);
+std::string get_description_prefix(ArgumentModifier modifier);
+
+void add_pool_option(boost::program_options::options_description *opt,
+                     ArgumentModifier modifier,
+                     const std::string &desc_suffix = "");
+
+void add_image_option(boost::program_options::options_description *opt,
+                      ArgumentModifier modifier,
+                      const std::string &desc_suffix = "");
+
+void add_snap_option(boost::program_options::options_description *opt,
+                     ArgumentModifier modifier);
+
+void add_image_spec_options(boost::program_options::options_description *pos,
+                            boost::program_options::options_description *opt,
+                            ArgumentModifier modifier);
+
+void add_snap_spec_options(boost::program_options::options_description *pos,
+                           boost::program_options::options_description *opt,
+                           ArgumentModifier modifier);
+
+void add_image_or_snap_spec_options(
+  boost::program_options::options_description *pos,
+  boost::program_options::options_description *opt,
+  ArgumentModifier modifier);
+
+void add_create_image_options(boost::program_options::options_description *opt,
+                              bool include_format);
+
+void add_size_option(boost::program_options::options_description *opt);
+
+void add_path_options(boost::program_options::options_description *pos,
+                      boost::program_options::options_description *opt,
+                      const std::string &description);
+
+void add_no_progress_option(boost::program_options::options_description *opt);
+
+void add_format_options(boost::program_options::options_description *opt);
+
+std::string get_short_features_help(bool append_suffix);
+std::string get_long_features_help();
+
+void validate(boost::any& v, const std::vector<std::string>& values,
+              ImageSize *target_type, int);
+void validate(boost::any& v, const std::vector<std::string>& values,
+              ImageOrder *target_type, int);
+void validate(boost::any& v, const std::vector<std::string>& values,
+              ImageFormat *target_type, int);
+void validate(boost::any& v, const std::vector<std::string>& values,
+              ImageNewFormat *target_type, int);
+void validate(boost::any& v, const std::vector<std::string>& values,
+              ImageFeatures *target_type, int);
+void validate(boost::any& v, const std::vector<std::string>& values,
+              Format *target_type, int);
+
+std::ostream &operator<<(std::ostream &os, const ImageFeatures &features);
+
+} // namespace argument_types
+} // namespace rbd
+
+#endif // CEPH_RBD_ARGUMENT_TYPES_H
diff --git a/src/tools/rbd/IndentStream.cc b/src/tools/rbd/IndentStream.cc
new file mode 100644
index 0000000..83591a8
--- /dev/null
+++ b/src/tools/rbd/IndentStream.cc
@@ -0,0 +1,59 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/IndentStream.h"
+
+namespace rbd {
+
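+// streambuf overflow() fires for every character since no output buffer is
+// installed; we accumulate a line, wrap it at m_line_length on the delimiter,
+// and let flush_line() emit the indent prefix at the start of each line.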
+int IndentBuffer::overflow (int c) {
+  if (traits_type::eq_int_type(traits_type::eof(), c)) {
+    return traits_type::not_eof(c);
+  }
+
+  int r;
+  switch (c) {
+  case '\n':
+    m_buffer += c;
+    flush_line();
+    r = m_streambuf->sputn(m_buffer.c_str(), m_buffer.size());
+    m_buffer.clear();
+    return r;
+  case '\t':
+    // convert tab to single space and fall-through
+    c = ' ';
+  default:
+    if (m_indent + m_buffer.size() >= m_line_length) {
+      size_t word_offset = m_buffer.find_last_of(m_delim);
+      bool space_delim = (m_delim == " ");
+      if (word_offset == std::string::npos && !space_delim) {
+        word_offset = m_buffer.find_last_of(" ");
+      }
+
+      if (word_offset != std::string::npos) {
+        flush_line();
+        m_streambuf->sputn(m_buffer.c_str(), word_offset);
+        m_buffer = std::string(m_buffer,
+                               word_offset + (space_delim ? 1 : 0));
+      } else {
+        flush_line();
+        m_streambuf->sputn(m_buffer.c_str(), m_buffer.size());
+        m_buffer.clear();
+      }
+      m_streambuf->sputc('\n');
+    }
+    m_buffer += c;
+    return c;
+  }
+}
+
+void IndentBuffer::flush_line() {
+  if (m_initial_offset >= m_indent) {
+    m_initial_offset = 0;
+    m_streambuf->sputc('\n');
+  }
+
+  m_streambuf->sputn(m_indent_prefix.c_str(), m_indent - m_initial_offset);
+  m_initial_offset = 0;
+}
+
+} // namespace rbd
diff --git a/src/tools/rbd/IndentStream.h b/src/tools/rbd/IndentStream.h
new file mode 100644
index 0000000..ba7d90b
--- /dev/null
+++ b/src/tools/rbd/IndentStream.h
@@ -0,0 +1,60 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_INDENT_STREAM_H
+#define CEPH_RBD_INDENT_STREAM_H
+
+#include "include/int_types.h"
+#include <iostream>
+#include <streambuf>
+#include <iomanip>
+
+namespace rbd {
+
+class IndentBuffer : public std::streambuf {
+public:
+  IndentBuffer(size_t indent, size_t initial_offset, size_t line_length,
+               std::streambuf *streambuf)
+    : m_indent(indent), m_initial_offset(initial_offset),
+      m_line_length(line_length), m_streambuf(streambuf),
+      m_delim(" "), m_indent_prefix(m_indent, ' ') {
+  }
+
+  void set_delimiter(const std::string &delim) {
+    m_delim = delim;
+  }
+
+protected:
+  virtual int overflow (int c);
+
+private:
+  size_t m_indent;
+  size_t m_initial_offset;
+  size_t m_line_length;
+  std::streambuf *m_streambuf;
+
+  std::string m_delim;
+  std::string m_indent_prefix;
+  std::string m_buffer;
+
+  void flush_line();
+};
+
+class IndentStream : public std::ostream {
+public:
+  IndentStream(size_t indent, size_t initial_offset, size_t line_length,
+               std::ostream &os)
+    : std::ostream(&m_indent_buffer),
+      m_indent_buffer(indent, initial_offset, line_length, os.rdbuf()) {
+  }
+
+  void set_delimiter(const std::string &delim) {
+    m_indent_buffer.set_delimiter(delim);
+  }
+private:
+  IndentBuffer m_indent_buffer;
+};
+
+} // namespace rbd
+
+#endif // CEPH_RBD_INDENT_STREAM_H
diff --git a/src/tools/rbd/OptionPrinter.cc b/src/tools/rbd/OptionPrinter.cc
new file mode 100644
index 0000000..1033b53
--- /dev/null
+++ b/src/tools/rbd/OptionPrinter.cc
@@ -0,0 +1,107 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/OptionPrinter.h"
+#include "tools/rbd/IndentStream.h"
+
+namespace rbd {
+
+namespace po = boost::program_options;
+
+const std::string OptionPrinter::POSITIONAL_ARGUMENTS("Positional arguments");
+const std::string OptionPrinter::OPTIONAL_ARGUMENTS("Optional arguments");
+
+const size_t OptionPrinter::MAX_DESCRIPTION_OFFSET;
+
+OptionPrinter::OptionPrinter(const OptionsDescription &positional,
+                             const OptionsDescription &optional)
+  : m_positional(positional), m_optional(optional) {
+}
+
+void OptionPrinter::print_short(std::ostream &os, size_t initial_offset) {
+  size_t name_width = std::min(initial_offset, MAX_DESCRIPTION_OFFSET) + 1;
+
+  IndentStream indent_stream(name_width, initial_offset, LINE_WIDTH, os);
+  indent_stream.set_delimiter("[");
+  for (size_t i = 0; i < m_optional.options().size(); ++i) {
+    bool required = m_optional.options()[i]->semantic()->is_required();
+    if (!required) {
+      indent_stream << "[";
+    }
+    indent_stream << "--" << m_optional.options()[i]->long_name();
+    if (m_optional.options()[i]->semantic()->max_tokens() != 0) {
+      indent_stream << " <" << m_optional.options()[i]->long_name() << ">";
+    }
+    if (!required) {
+      indent_stream << "]";
+    }
+    indent_stream << " ";
+  }
+  indent_stream << std::endl;
+
+  if (m_positional.options().size() > 0) {
+    indent_stream.set_delimiter(" ");
+    for (size_t i = 0; i < m_positional.options().size(); ++i) {
+      indent_stream << "<" << m_positional.options()[i]->long_name() << "> ";
+      if (m_positional.options()[i]->semantic()->max_tokens() > 1) {
+        indent_stream << "[<" << m_positional.options()[i]->long_name()
+                      << "> ...]";
+        break;
+      }
+    }
+    indent_stream << std::endl;
+  }
+}
+
+void OptionPrinter::print_detailed(std::ostream &os) {
+  std::string indent_prefix(2, ' ');
+  size_t name_width = compute_name_width(indent_prefix.size());
+
+  if (m_positional.options().size() > 0) {
+    std::cout << POSITIONAL_ARGUMENTS << std::endl;
+    for (size_t i = 0; i < m_positional.options().size(); ++i) {
+      std::stringstream ss;
+      ss << indent_prefix << "<" << m_positional.options()[i]->long_name()
+         << ">";
+
+      std::cout << ss.str();
+      IndentStream indent_stream(name_width, ss.str().size(), LINE_WIDTH, os);
+      indent_stream << m_positional.options()[i]->description() << std::endl;
+    }
+    std::cout << std::endl;
+  }
+
+  if (m_optional.options().size() > 0) {
+    std::cout << OPTIONAL_ARGUMENTS << std::endl;
+    for (size_t i = 0; i < m_optional.options().size(); ++i) {
+      std::stringstream ss;
+      ss << indent_prefix
+         << m_optional.options()[i]->format_name() << " "
+         << m_optional.options()[i]->format_parameter();
+
+      std::cout << ss.str();
+      IndentStream indent_stream(name_width, ss.str().size(), LINE_WIDTH, os);
+      indent_stream << m_optional.options()[i]->description() << std::endl;
+    }
+    std::cout << std::endl;
+  }
+}
+
+size_t OptionPrinter::compute_name_width(size_t indent) {
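+  // Take the widest option name/parameter pair across both groups, then
+  // clamp so descriptions never begin past MAX_DESCRIPTION_OFFSET.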
+  size_t width = MIN_NAME_WIDTH;
+  std::vector<OptionsDescription> descs = {m_positional, m_optional};
+  for (size_t desc_idx = 0; desc_idx < descs.size(); ++desc_idx) {
+    const OptionsDescription &desc = descs[desc_idx];
+    for (size_t opt_idx = 0; opt_idx < desc.options().size(); ++opt_idx) {
+      size_t name_width = desc.options()[opt_idx]->format_name().size() +
+                          desc.options()[opt_idx]->format_parameter().size()
+                          + 1;
+      width = std::max(width, name_width);
+    }
+  }
+  width += indent;
+  width = std::min(width, MAX_DESCRIPTION_OFFSET) + 1;
+  return width;
+}
+
+} // namespace rbd
diff --git a/src/tools/rbd/OptionPrinter.h b/src/tools/rbd/OptionPrinter.h
new file mode 100644
index 0000000..e18a5f8
--- /dev/null
+++ b/src/tools/rbd/OptionPrinter.h
@@ -0,0 +1,40 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_OPTION_PRINTER_H
+#define CEPH_RBD_OPTION_PRINTER_H
+
+#include "include/int_types.h"
+#include <string>
+#include <vector>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+
+class OptionPrinter {
+public:
+  typedef boost::program_options::options_description OptionsDescription;
+
+  static const std::string POSITIONAL_ARGUMENTS;
+  static const std::string OPTIONAL_ARGUMENTS;
+
+  static const size_t LINE_WIDTH = 80;
+  static const size_t MIN_NAME_WIDTH = 20;
+  static const size_t MAX_DESCRIPTION_OFFSET = LINE_WIDTH / 2;
+
+  OptionPrinter(const OptionsDescription &positional,
+                const OptionsDescription &optional);
+
+  void print_short(std::ostream &os, size_t initial_offset);
+  void print_detailed(std::ostream &os);
+
+private:
+  const OptionsDescription &m_positional;
+  const OptionsDescription &m_optional;
+
+  size_t compute_name_width(size_t indent);
+};
+
+} // namespace rbd
+
+#endif // CEPH_RBD_OPTION_PRINTER_H
diff --git a/src/tools/rbd/Shell.cc b/src/tools/rbd/Shell.cc
new file mode 100644
index 0000000..3e2987b
--- /dev/null
+++ b/src/tools/rbd/Shell.cc
@@ -0,0 +1,401 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/IndentStream.h"
+#include "tools/rbd/OptionPrinter.h"
+#include "common/config.h"
+#include "global/global_context.h"
+#include "include/stringify.h"
+#include <algorithm>
+#include <iostream>
+#include <set>
+
+namespace rbd {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+namespace {
+
+static const std::string APP_NAME("rbd");
+static const std::string HELP_SPEC("help");
+static const std::string BASH_COMPLETION_SPEC("bash-completion");
+
+struct Secret {};
+
+void validate(boost::any& v, const std::vector<std::string>& values,
+              Secret *target_type, int) {
+  std::cerr << "rbd: --secret is deprecated, use --keyfile" << std::endl;
+
+  po::validators::check_first_occurrence(v);
+  const std::string &s = po::validators::get_single_string(values);
+  g_conf->set_val_or_die("keyfile", s.c_str());
+  v = boost::any(s);
+}
+
+std::string format_command_spec(const Shell::CommandSpec &spec) {
+  return joinify<std::string>(spec.begin(), spec.end(), " ");
+}
+
+std::string format_command_name(const Shell::CommandSpec &spec,
+                                const Shell::CommandSpec &alias_spec) {
+  std::string name = format_command_spec(spec);
+  if (!alias_spec.empty()) {
+    name += " (" + format_command_spec(alias_spec) + ")";
+  }
+  return name;
+}
+
+std::string format_option_suffix(
+    const boost::shared_ptr<po::option_description> &option) {
+  std::string suffix;
+  if (option->semantic()->max_tokens() != 0) {
+    if (option->description().find("path") != std::string::npos ||
+        option->description().find("file") != std::string::npos) {
+      suffix += " path";
+    } else if (option->description().find("host") != std::string::npos) {
+      suffix += " host";
+    } else {
+      suffix += " arg";
+    }
+  }
+  return suffix;
+}
+
+} // anonymous namespace
+
+std::vector<Shell::Action *> Shell::s_actions;
+std::set<std::string> Shell::s_switch_arguments;
+
+int Shell::execute(int arg_count, const char **arg_values) {
+
+  std::vector<std::string> arguments;
+  prune_command_line_arguments(arg_count, arg_values, &arguments);
+
+  std::vector<std::string> command_spec;
+  get_command_spec(arguments, &command_spec);
+
+  if (command_spec.empty() || command_spec == CommandSpec({"help"})) {
+    // list all available actions
+    print_help();
+    return 0;
+  } else if (command_spec[0] == HELP_SPEC) {
+    // list help for specific action
+    command_spec.erase(command_spec.begin());
+    Action *action = find_action(command_spec, NULL);
+    if (action == NULL) {
+      print_unknown_action(command_spec);
+      return EXIT_FAILURE;
+    } else {
+      print_action_help(action);
+      return 0;
+    }
+  } else if (command_spec[0] == BASH_COMPLETION_SPEC) {
+    command_spec.erase(command_spec.begin());
+    print_bash_completion(command_spec);
+    return 0;
+  }
+
+  CommandSpec *matching_spec;
+  Action *action = find_action(command_spec, &matching_spec);
+  if (action == NULL) {
+    print_unknown_action(command_spec);
+    return EXIT_FAILURE;
+  }
+
+  po::variables_map vm;
+  try {
+    po::options_description positional_opts;
+    po::options_description command_opts;
+    (*action->get_arguments)(&positional_opts, &command_opts);
+
+    // dynamically allocate options for our command (e.g. snap list) and
+    // its associated positional arguments
+    po::options_description argument_opts;
+    argument_opts.add_options()
+      (at::POSITIONAL_COMMAND_SPEC.c_str(),
+       po::value<std::vector<std::string> >()->required(), "")
+      (at::POSITIONAL_ARGUMENTS.c_str(),
+       po::value<std::vector<std::string> >(), "");
+
+    po::positional_options_description positional_options;
+    positional_options.add(at::POSITIONAL_COMMAND_SPEC.c_str(),
+                           matching_spec->size());
+    if (!positional_opts.options().empty()) {
+      int max_count = positional_opts.options().size();
+      if (positional_opts.options().back()->semantic()->max_tokens() > 1)
+        max_count = -1;
+      positional_options.add(at::POSITIONAL_ARGUMENTS.c_str(), max_count);
+    }
+
+    po::options_description global_opts;
+    get_global_options(&global_opts);
+
+    po::options_description group_opts;
+    group_opts.add(command_opts)
+              .add(argument_opts)
+              .add(global_opts);
+
+    po::store(po::command_line_parser(arguments)
+      .style(po::command_line_style::default_style &
+        ~po::command_line_style::allow_guessing)
+      .options(group_opts)
+      .positional(positional_options)
+      .run(), vm);
+
+    if (vm[at::POSITIONAL_COMMAND_SPEC].as<std::vector<std::string> >() !=
+          *matching_spec) {
+      std::cerr << "rbd: failed to parse command" << std::endl;
+      return EXIT_FAILURE;
+    }
+
+    int r = (*action->execute)(vm);
+    if (r != 0) {
+      return std::abs(r);
+    }
+  } catch (po::required_option& e) {
+    std::cerr << "rbd: " << e.what() << std::endl;
+    return EXIT_FAILURE;
+  } catch (po::too_many_positional_options_error& e) {
+    std::cerr << "rbd: too many arguments" << std::endl;
+    return EXIT_FAILURE;
+  } catch (po::error& e) {
+    std::cerr << "rbd: " << e.what() << std::endl;
+    return EXIT_FAILURE;
+  }
+
+  return 0;
+}
+
+void Shell::get_command_spec(const std::vector<std::string> &arguments,
+                             std::vector<std::string> *command_spec) {
+  for (size_t i = 0; i < arguments.size(); ++i) {
+    std::string arg(arguments[i]);
+    if (arg == "-h" || arg == "--help") {
+      *command_spec = {HELP_SPEC};
+      return;
+    } else if (arg == "--") {
+      // all arguments after a double-dash are positional
+      if (i + 1 < arguments.size()) {
+        command_spec->insert(command_spec->end(),
+                             arguments.data() + i + 1,
+                             arguments.data() + arguments.size());
+      }
+      return;
+    } else if (arg[0] == '-') {
+      // if the option is not a switch, skip its value
+      if (arg.size() >= 2 &&
+          (arg[1] == '-' || s_switch_arguments.count(arg.substr(1, 1)) == 0) &&
+          (arg[1] != '-' ||
+             s_switch_arguments.count(arg.substr(2, std::string::npos)) == 0) &&
+          at::SWITCH_ARGUMENTS.count(arg.substr(2, std::string::npos)) == 0 &&
+          arg.find('=') == std::string::npos) {
+        ++i;
+      }
+    } else {
+      command_spec->push_back(arg);
+    }
+  }
+}
+
+Shell::Action *Shell::find_action(const CommandSpec &command_spec,
+                                  CommandSpec **matching_spec) {
+  for (size_t i = 0; i < s_actions.size(); ++i) {
+    Action *action = s_actions[i];
+    if (action->command_spec.size() <= command_spec.size()) {
+      if (std::includes(action->command_spec.begin(),
+                        action->command_spec.end(),
+                        command_spec.begin(),
+                        command_spec.begin() + action->command_spec.size())) {
+        if (matching_spec != NULL) {
+          *matching_spec = &action->command_spec;
+        }
+        return action;
+      }
+    }
+    if (!action->alias_command_spec.empty() &&
+        action->alias_command_spec.size() <= command_spec.size()) {
+      if (std::includes(action->alias_command_spec.begin(),
+                        action->alias_command_spec.end(),
+                        command_spec.begin(),
+                        command_spec.begin() +
+                          action->alias_command_spec.size())) {
+        if (matching_spec != NULL) {
+          *matching_spec = &action->alias_command_spec;
+        }
+        return action;
+      }
+    }
+  }
+  return NULL;
+}
+
+void Shell::get_global_options(po::options_description *opts) {
+  opts->add_options()
+    ("conf,c", po::value<std::string>(), "path to cluster configuration")
+    ("cluster", po::value<std::string>(), "cluster name")
+    ("id", po::value<std::string>(), "client id (without 'client.' prefix)")
+    ("user", po::value<std::string>(), "client id (without 'client.' prefix)")
+    ("name,n", po::value<std::string>(), "client name")
+    ("mon_host,m", po::value<std::string>(), "monitor host")
+    ("secret", po::value<Secret>(), "path to secret key (deprecated)")
+    ("keyfile,K", po::value<std::string>(), "path to secret key")
+    ("keyring,k", po::value<std::string>(), "path to keyring");
+}
+
+void Shell::prune_command_line_arguments(int arg_count, const char **arg_values,
+                                         std::vector<std::string> *args) {
+
+  std::vector<std::string> config_keys;
+  g_conf->get_all_keys(&config_keys);
+  std::set<std::string> config_key_set(config_keys.begin(), config_keys.end());
+
+  args->reserve(arg_count);
+  for (int i = 1; i < arg_count; ++i) {
+    std::string arg(arg_values[i]);
+    if (arg.size() > 2 && arg.substr(0, 2) == "--") {
+      std::string option_name(arg.substr(2));
+      std::string alt_option_name(option_name);
+      std::replace(alt_option_name.begin(), alt_option_name.end(), '-', '_');
+      if (config_key_set.count(option_name) ||
+          config_key_set.count(alt_option_name)) {
+        // Ceph config override -- skip since it's handled by CephContext
+        ++i;
+        continue;
+      }
+    }
+
+    args->push_back(arg);
+  }
+}
+
+void Shell::print_help() {
+  std::cout << "usage: " << APP_NAME << " <command> ..."
+            << std::endl << std::endl
+            << "Command-line interface for managing Ceph RBD images."
+            << std::endl << std::endl;
+
+  std::vector<Action *> actions(s_actions);
+  std::sort(actions.begin(), actions.end(),
+            [](Action *lhs, Action *rhs) { return lhs->command_spec <
+                                                    rhs->command_spec; });
+
+  std::cout << OptionPrinter::POSITIONAL_ARGUMENTS << ":" << std::endl
+            << "  <command>" << std::endl;
+
+  // since the commands have spaces, we have to build our own formatter
+  std::string indent(4, ' ');
+  size_t name_width = OptionPrinter::MIN_NAME_WIDTH;
+  for (size_t i = 0; i < actions.size(); ++i) {
+    Action *action = actions[i];
+    std::string name = format_command_name(action->command_spec,
+                                           action->alias_command_spec);
+    name_width = std::max(name_width, name.size());
+  }
+  name_width += indent.size();
+  name_width = std::min(name_width, OptionPrinter::MAX_DESCRIPTION_OFFSET) + 1;
+
+  for (size_t i = 0; i < actions.size(); ++i) {
+    Action *action = actions[i];
+    std::stringstream ss;
+    ss << indent
+       << format_command_name(action->command_spec, action->alias_command_spec);
+
+    std::cout << ss.str();
+    if (!action->description.empty()) {
+      IndentStream indent_stream(name_width, ss.str().size(),
+                                 OptionPrinter::LINE_WIDTH,
+                                 std::cout);
+      indent_stream << action->description << std::endl;
+    } else {
+      std::cout << std::endl;
+    }
+  }
+
+  po::options_description global_opts(OptionPrinter::OPTIONAL_ARGUMENTS);
+  get_global_options(&global_opts);
+  std::cout << std::endl << global_opts << std::endl
+            << "See '" << APP_NAME << " help <command>' for help on a specific "
+            << "command." << std::endl;
+}
+
+void Shell::print_action_help(Action *action) {
+
+  std::stringstream ss;
+  ss << "usage: " << APP_NAME << " "
+     << format_command_spec(action->command_spec);
+  std::cout << ss.str();
+
+  po::options_description positional;
+  po::options_description options;
+  (*action->get_arguments)(&positional, &options);
+
+  OptionPrinter option_printer(positional, options);
+  option_printer.print_short(std::cout, ss.str().size());
+
+  if (!action->description.empty()) {
+    std::cout << std::endl << action->description << std::endl;
+  }
+
+  std::cout << std::endl;
+  option_printer.print_detailed(std::cout);
+
+  if (!action->help.empty()) {
+    std::cout << action->help << std::endl;
+  }
+}
+
+void Shell::print_unknown_action(const std::vector<std::string> &command_spec) {
+  std::cerr << "error: unknown option '"
+            << joinify<std::string>(command_spec.begin(),
+                                    command_spec.end(), " ") << "'"
+            << std::endl << std::endl;
+  print_help();
+}
+
+void Shell::print_bash_completion(const CommandSpec &command_spec) {
+  Action *action = find_action(command_spec, NULL);
+  po::options_description global_opts;
+  get_global_options(&global_opts);
+  print_bash_completion_options(global_opts);
+
+  if (action != nullptr) {
+    po::options_description positional_opts;
+    po::options_description command_opts;
+    (*action->get_arguments)(&positional_opts, &command_opts);
+    print_bash_completion_options(command_opts);
+  } else {
+    std::cout << "|help";
+    for (size_t i = 0; i < s_actions.size(); ++i) {
+      Action *action = s_actions[i];
+      std::cout << "|"
+                << joinify<std::string>(action->command_spec.begin(),
+                                        action->command_spec.end(), " ");
+      if (!action->alias_command_spec.empty()) {
+        std::cout << "|"
+                   << joinify<std::string>(action->alias_command_spec.begin(),
+                                          action->alias_command_spec.end(),
+                                          " ");
+      }
+    }
+  }
+  std::cout << "|" << std::endl;
+}
+
+void Shell::print_bash_completion_options(const po::options_description &ops) {
+  for (size_t i = 0; i < ops.options().size(); ++i) {
+    auto option = ops.options()[i];
+    std::string long_name(option->canonical_display_name(0));
+    std::string short_name(option->canonical_display_name(
+      po::command_line_style::allow_dash_for_short));
+
+    std::cout << "|--" << long_name << format_option_suffix(option);
+    if (long_name != short_name) {
+      std::cout << "|" << short_name << format_option_suffix(option);
+    }
+  }
+}
+
+} // namespace rbd
diff --git a/src/tools/rbd/Shell.h b/src/tools/rbd/Shell.h
new file mode 100644
index 0000000..4a21325
--- /dev/null
+++ b/src/tools/rbd/Shell.h
@@ -0,0 +1,76 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_SHELL_H
+#define CEPH_RBD_SHELL_H
+
+#include "include/int_types.h"
+#include <set>
+#include <string>
+#include <vector>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+
+class Shell {
+public:
+  typedef std::vector<std::string> CommandSpec;
+
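+  // Actions self-register: the Action constructor below appends `this`
+  // to Shell::s_actions, so defining a file-scope Action in an
+  // action/*.cc translation unit is enough to plug in a new command.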
+  struct Action {
+    typedef void (*GetArguments)(boost::program_options::options_description *,
+                                 boost::program_options::options_description *);
+    typedef int (*Execute)(const boost::program_options::variables_map &);
+
+    CommandSpec command_spec;
+    CommandSpec alias_command_spec;
+    const std::string description;
+    const std::string help;
+    GetArguments get_arguments;
+    Execute execute;
+
+    template <typename Args, typename Execute>
+    Action(const std::initializer_list<std::string> &command_spec,
+           const std::initializer_list<std::string> &alias_command_spec,
+           const std::string &description, const std::string &help,
+           Args args, Execute execute)
+        : command_spec(command_spec), alias_command_spec(alias_command_spec),
+          description(description), help(help), get_arguments(args),
+          execute(execute) {
+      Shell::s_actions.push_back(this);
+    }
+
+  };
+
+  struct SwitchArguments {
+    SwitchArguments(const std::initializer_list<std::string> &arguments) {
+      Shell::s_switch_arguments.insert(arguments.begin(), arguments.end());
+    }
+  };
+
+  int execute(int arg_count, const char **arg_values);
+
+private:
+  static std::vector<Action *> s_actions;
+  static std::set<std::string> s_switch_arguments;
+
+  void get_command_spec(const std::vector<std::string> &arguments,
+                        std::vector<std::string> *command_spec);
+  Action *find_action(const CommandSpec &command_spec,
+                      CommandSpec **matching_spec);
+
+  void get_global_options(boost::program_options::options_description *opts);
+  void prune_command_line_arguments(int arg_count, const char **arg_values,
+                                    std::vector<std::string> *args);
+
+  void print_help();
+  void print_action_help(Action *action);
+  void print_unknown_action(const CommandSpec &command_spec);
+
+  void print_bash_completion(const CommandSpec &command_spec);
+  void print_bash_completion_options(
+    const boost::program_options::options_description &ops);
+};
+
+} // namespace rbd
+
+#endif // CEPH_RBD_SHELL_H
diff --git a/src/tools/rbd/Utils.cc b/src/tools/rbd/Utils.cc
new file mode 100644
index 0000000..02af9ef
--- /dev/null
+++ b/src/tools/rbd/Utils.cc
@@ -0,0 +1,431 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/Utils.h"
+#include "include/assert.h"
+#include "include/Context.h"
+#include "include/encoding.h"
+#include "common/common_init.h"
+#include "include/stringify.h"
+#include "include/rbd/features.h"
+#include "common/config.h"
+#include "common/errno.h"
+#include "common/safe_io.h"
+#include "global/global_context.h"
+#include <iostream>
+#include <boost/regex.hpp>
+
+namespace rbd {
+namespace utils {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+int ProgressContext::update_progress(uint64_t offset, uint64_t total) {
+  if (progress) {
+    int pc = total ? (offset * 100ull / total) : 0;
+    if (pc != last_pc) {
+      cerr << "\r" << operation << ": "
+           << pc << "% complete...";
+      cerr.flush();
+      last_pc = pc;
+    }
+  }
+  return 0;
+}
+
+void ProgressContext::finish() {
+  if (progress) {
+    cerr << "\r" << operation << ": 100% complete...done." << std::endl;
+  }
+}
+
+void ProgressContext::fail() {
+  if (progress) {
+    cerr << "\r" << operation << ": " << last_pc << "% complete...failed."
+         << std::endl;
+  }
+}
+
+void aio_context_callback(librbd::completion_t completion, void *arg)
+{
+  librbd::RBD::AioCompletion *aio_completion =
+    reinterpret_cast<librbd::RBD::AioCompletion*>(completion);
+  Context *context = reinterpret_cast<Context *>(arg);
+  context->complete(aio_completion->get_return_value());
+  aio_completion->release();
+}
+
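+// Reads a length-prefixed string: a 4-byte encoded length followed by
+// that many bytes of payload; lengths above 'max' yield -EINVAL.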
+int read_string(int fd, unsigned max, std::string *out) {
+  char buf[4];
+
+  int r = safe_read_exact(fd, buf, 4);
+  if (r < 0)
+    return r;
+
+  bufferlist bl;
+  bl.append(buf, 4);
+  bufferlist::iterator p = bl.begin();
+  uint32_t len;
+  ::decode(len, p);
+  if (len > max)
+    return -EINVAL;
+
+  char sbuf[len];
+  r = safe_read_exact(fd, sbuf, len);
+  if (r < 0)
+    return r;
+  out->assign(sbuf, len);
+  return len;
+}
+
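+// Splits an image spec of the form [pool/]image[@snap]. For example
+// (per the regex below), "rbd/foo@s1" yields pool "rbd", image "foo"
+// and snapshot "s1"; a bare "foo" leaves pool and snapshot untouched.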
+int extract_spec(const std::string &spec, std::string *pool_name,
+                 std::string *image_name, std::string *snap_name) {
+  boost::regex pattern("^(?:([^/@]+)/)?([^/@]+)(?:@([^/@]+))?$");
+  boost::smatch match;
+  if (!boost::regex_match(spec, match, pattern)) {
+    std::cerr << "rbd: invalid spec '" << spec << "'" << std::endl;
+    return -EINVAL;
+  }
+
+  if (pool_name != nullptr && match[1].matched) {
+    *pool_name = match[1];
+  }
+  if (image_name != nullptr) {
+    *image_name = match[2];
+  }
+  if (snap_name != nullptr && match[3].matched) {
+    *snap_name = match[3];
+  }
+  return 0;
+}
+
+std::string get_positional_argument(const po::variables_map &vm, size_t index) {
+  if (vm.count(at::POSITIONAL_ARGUMENTS) == 0) {
+    return "";
+  }
+
+  const std::vector<std::string> &args =
+    boost::any_cast<std::vector<std::string> >(
+      vm[at::POSITIONAL_ARGUMENTS].value());
+  if (index < args.size()) {
+    return args[index];
+  }
+  return "";
+}
+
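+// Resolves pool/image/snapshot from explicit options, from an
+// "[pool/]image[@snap]" spec embedded in the image option, or from the
+// next positional argument, then validates the snapshot name against
+// the requested SnapshotPresence.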
+int get_pool_image_snapshot_names(const po::variables_map &vm,
+                                  at::ArgumentModifier mod,
+                                  size_t *spec_arg_index,
+                                  std::string *pool_name,
+                                  std::string *image_name,
+                                  std::string *snap_name,
+                                  SnapshotPresence snapshot_presence,
+                                  bool image_required) {
+  std::string pool_key = (mod == at::ARGUMENT_MODIFIER_DEST ?
+    at::DEST_POOL_NAME : at::POOL_NAME);
+  std::string image_key = (mod == at::ARGUMENT_MODIFIER_DEST ?
+    at::DEST_IMAGE_NAME : at::IMAGE_NAME);
+
+  if (vm.count(pool_key) && pool_name != nullptr) {
+    *pool_name = vm[pool_key].as<std::string>();
+  }
+  if (vm.count(image_key) && image_name != nullptr) {
+    *image_name = vm[image_key].as<std::string>();
+  }
+  if (vm.count(at::SNAPSHOT_NAME) && snap_name != nullptr &&
+      mod != at::ARGUMENT_MODIFIER_DEST) {
+    *snap_name = vm[at::SNAPSHOT_NAME].as<std::string>();
+  }
+
+  if (image_name != nullptr && !image_name->empty()) {
+    // despite the separate pool and snapshot name options,
+    // we can also specify them via the image option
+    std::string image_name_copy(*image_name);
+    extract_spec(image_name_copy, pool_name, image_name, snap_name);
+  }
+
+  int r;
+  if (image_name != nullptr && spec_arg_index != nullptr &&
+      image_name->empty()) {
+    std::string spec = get_positional_argument(vm, (*spec_arg_index)++);
+    if (!spec.empty()) {
+      r = extract_spec(spec, pool_name, image_name, snap_name);
+      if (r < 0) {
+        return r;
+      }
+    }
+  }
+
+  if (pool_name->empty()) {
+    *pool_name = at::DEFAULT_POOL_NAME;
+  }
+
+  if (image_name != nullptr && image_required && image_name->empty()) {
+    std::string prefix = at::get_description_prefix(mod);
+    std::cerr << "rbd: "
+              << (mod == at::ARGUMENT_MODIFIER_DEST ? prefix : std::string())
+              << "image name was not specified" << std::endl;
+    return -EINVAL;
+  }
+
+  if (snap_name != nullptr) {
+    r = validate_snapshot_name(mod, *snap_name, snapshot_presence);
+    if (r < 0) {
+      return r;
+    }
+  }
+  return 0;
+}
+
+int validate_snapshot_name(at::ArgumentModifier mod,
+                           const std::string &snap_name,
+                           SnapshotPresence snapshot_presence) {
+  std::string prefix = at::get_description_prefix(mod);
+  switch (snapshot_presence) {
+  case SNAPSHOT_PRESENCE_PERMITTED:
+    break;
+  case SNAPSHOT_PRESENCE_NONE:
+    if (!snap_name.empty()) {
+      std::cerr << "rbd: "
+                << (mod == at::ARGUMENT_MODIFIER_DEST ? prefix : std::string())
+                << "snapname specified for a command that doesn't use it"
+                << std::endl;
+      return -EINVAL;
+    }
+    break;
+  case SNAPSHOT_PRESENCE_REQUIRED:
+    if (snap_name.empty()) {
+      std::cerr << "rbd: "
+                << (mod == at::ARGUMENT_MODIFIER_DEST ? prefix : std::string())
+                << "snap name was not specified" << std::endl;
+      return -EINVAL;
+    }
+    break;
+  }
+  return 0;
+}
+
+int get_image_options(const boost::program_options::variables_map &vm,
+                      int *order, uint32_t *format, uint64_t *features,
+                      uint32_t *stripe_unit, uint32_t *stripe_count) {
+  if (vm.count(at::IMAGE_ORDER)) {
+    *order = vm[at::IMAGE_ORDER].as<uint32_t>();
+  } else {
+    *order = 22;
+  }
+
+  bool features_specified = false;
+  if (vm.count(at::IMAGE_FEATURES)) {
+    *features = vm[at::IMAGE_FEATURES].as<uint64_t>();
+    features_specified = true;
+  } else {
+    *features = g_conf->rbd_default_features;
+  }
+
+  if (vm.count(at::IMAGE_STRIPE_UNIT)) {
+    *stripe_unit = vm[at::IMAGE_STRIPE_UNIT].as<uint32_t>();
+  } else {
+    *stripe_unit = g_conf->rbd_default_stripe_unit;
+  }
+
+  if (vm.count(at::IMAGE_STRIPE_COUNT)) {
+    *stripe_count = vm[at::IMAGE_STRIPE_COUNT].as<uint32_t>();
+  } else {
+    *stripe_count = g_conf->rbd_default_stripe_count;
+  }
+
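+  // Non-default striping (stripe unit != object size or count != 1)
+  // requires the STRIPINGV2 feature bit; default striping must not
+  // carry it, so the bit is recomputed from the effective values.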
+  if ((*stripe_unit != 0 && *stripe_count == 0) ||
+      (*stripe_unit == 0 && *stripe_count != 0)) {
+    std::cerr << "must specify both (or neither) of stripe-unit and stripe-count"
+              << std::endl;
+    return -EINVAL;
+  } else if ((*stripe_unit || *stripe_count) &&
+             (*stripe_unit != (1ull << *order) && *stripe_count != 1)) {
+    *features |= RBD_FEATURE_STRIPINGV2;
+  } else {
+    *features &= ~RBD_FEATURE_STRIPINGV2;
+  }
+
+  if (vm.count(at::IMAGE_SHARED) && vm[at::IMAGE_SHARED].as<bool>()) {
+    *features &= ~RBD_FEATURES_SINGLE_CLIENT;
+  }
+
+  if (format != nullptr) {
+    bool format_specified = false;
+    if (vm.count(at::IMAGE_NEW_FORMAT)) {
+      *format = 2;
+      format_specified = true;
+    } else if (vm.count(at::IMAGE_FORMAT)) {
+      *format = vm[at::IMAGE_FORMAT].as<uint32_t>();
+      format_specified = true;
+    } else {
+      *format = g_conf->rbd_default_format;
+    }
+
+    if (features_specified && *features != 0) {
+      if (format_specified && *format == 1) {
+        std::cerr << "rbd: features not allowed with format 1; "
+                  << "use --image-format 2" << std::endl;
+        return -EINVAL;
+      } else {
+        *format = 2;
+        format_specified = true;
+      }
+    }
+
+    if ((*stripe_unit || *stripe_count) &&
+        (*stripe_unit != (1ull << *order) && *stripe_count != 1)) {
+      if (format_specified && *format == 1) {
+        std::cerr << "rbd: non-default striping not allowed with format 1; "
+                  << "use --image-format 2" << std::endl;
+        return -EINVAL;
+      } else {
+        *format = 2;
+        format_specified = true;
+      }
+    }
+
+    if (format_specified) {
+      int r = g_conf->set_val("rbd_default_format", stringify(*format));
+      assert(r == 0);
+    }
+  }
+
+  return 0;
+}
+
+int get_image_size(const boost::program_options::variables_map &vm,
+                   uint64_t *size) {
+  if (vm.count(at::IMAGE_SIZE) == 0) {
+    std::cerr << "rbd: must specify --size <M/G/T>" << std::endl;
+    return -EINVAL;
+  }
+
+  *size = vm[at::IMAGE_SIZE].as<uint64_t>();
+  return 0;
+}
+
+int get_path(const boost::program_options::variables_map &vm,
+             const std::string &positional_path, std::string *path) {
+  if (!positional_path.empty()) {
+    *path = positional_path;
+  } else if (vm.count(at::PATH)) {
+    *path = vm[at::PATH].as<std::string>();
+  }
+
+  if (path->empty()) {
+    std::cerr << "rbd: path was not specified" << std::endl;
+    return -EINVAL;
+  }
+  return 0;
+}
+
+int get_formatter(const po::variables_map &vm,
+                  at::Format::Formatter *formatter) {
+  if (vm.count(at::FORMAT)) {
+    bool pretty = vm[at::PRETTY_FORMAT].as<bool>();
+    *formatter = vm[at::FORMAT].as<at::Format>().create_formatter(pretty);
+    if (*formatter == nullptr && pretty) {
+      std::cerr << "rbd: --pretty-format only works when --format "
+                << "is json or xml" << std::endl;
+      return -EINVAL;
+    }
+  }
+  return 0;
+}
+
+void init_context() {
+  g_conf->set_val_or_die("rbd_cache_writethrough_until_flush", "false");
+  g_conf->apply_changes(NULL);
+  common_init_finish(g_ceph_context);
+}
+
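+// Standard client bootstrap: finish CephContext initialization,
+// connect a librados::Rados handle, then open an IoCtx on the pool.
+// Actions typically go through init_and_open_image() below instead.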
+int init(const std::string &pool_name, librados::Rados *rados,
+         librados::IoCtx *io_ctx) {
+  init_context();
+
+  int r = rados->init_with_context(g_ceph_context);
+  if (r < 0) {
+    std::cerr << "rbd: couldn't initialize rados!" << std::endl;
+    return r;
+  }
+
+  r = rados->connect();
+  if (r < 0) {
+    std::cerr << "rbd: couldn't connect to the cluster!" << std::endl;
+    return r;
+  }
+
+  r = init_io_ctx(*rados, pool_name, io_ctx);
+  if (r < 0) {
+    return r;
+  }
+  return 0;
+}
+
+int init_io_ctx(librados::Rados &rados, const std::string &pool_name,
+                librados::IoCtx *io_ctx) {
+  int r = rados.ioctx_create(pool_name.c_str(), *io_ctx);
+  if (r < 0) {
+    std::cerr << "rbd: error opening pool " << pool_name << ": "
+              << cpp_strerror(r) << std::endl;
+    return r;
+  }
+  return 0;
+}
+
+int open_image(librados::IoCtx &io_ctx, const std::string &image_name,
+               bool read_only, librbd::Image *image) {
+  int r;
+  librbd::RBD rbd;
+  if (read_only) {
+    r = rbd.open_read_only(io_ctx, *image, image_name.c_str(), NULL);
+  } else {
+    r = rbd.open(io_ctx, *image, image_name.c_str());
+  }
+
+  if (r < 0) {
+    std::cerr << "rbd: error opening image " << image_name << ": "
+              << cpp_strerror(r) << std::endl;
+    return r;
+  }
+  return 0;
+}
+
+int init_and_open_image(const std::string &pool_name,
+                        const std::string &image_name,
+                        const std::string &snap_name, bool read_only,
+                        librados::Rados *rados, librados::IoCtx *io_ctx,
+                        librbd::Image *image) {
+  int r = init(pool_name, rados, io_ctx);
+  if (r < 0) {
+    return r;
+  }
+
+  r = open_image(*io_ctx, image_name, read_only, image);
+  if (r < 0) {
+    return r;
+  }
+
+  if (!snap_name.empty()) {
+    r = snap_set(*image, snap_name);
+    if (r < 0) {
+      return r;
+    }
+  }
+  return 0;
+}
+
+int snap_set(librbd::Image &image, const std::string snap_name) {
+  int r = image.snap_set(snap_name.c_str());
+  if (r < 0) {
+    std::cerr << "error setting snapshot context: " << cpp_strerror(r)
+              << std::endl;
+    return r;
+  }
+  return 0;
+}
+
+} // namespace utils
+} // namespace rbd
diff --git a/src/tools/rbd/Utils.h b/src/tools/rbd/Utils.h
new file mode 100644
index 0000000..0b7794e
--- /dev/null
+++ b/src/tools/rbd/Utils.h
@@ -0,0 +1,94 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef CEPH_RBD_UTILS_H
+#define CEPH_RBD_UTILS_H
+
+#include "include/int_types.h"
+#include "include/rados/librados.hpp"
+#include "include/rbd/librbd.hpp"
+#include "tools/rbd/ArgumentTypes.h"
+#include <string>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace utils {
+
+static const std::string RBD_DIFF_BANNER ("rbd diff v1\n");
+
+enum SnapshotPresence {
+  SNAPSHOT_PRESENCE_NONE,
+  SNAPSHOT_PRESENCE_PERMITTED,
+  SNAPSHOT_PRESENCE_REQUIRED
+};
+
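+// Simple textual progress reporter: call update_progress() while the
+// operation runs, then finish() on success or fail() on error (see
+// do_copy() in action/Copy.cc for typical usage).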
+struct ProgressContext : public librbd::ProgressContext {
+  const char *operation;
+  bool progress;
+  int last_pc;
+
+  ProgressContext(const char *o, bool no_progress)
+    : operation(o), progress(!no_progress), last_pc(0) {
+  }
+
+  int update_progress(uint64_t offset, uint64_t total);
+  void finish();
+  void fail();
+};
+
+void aio_context_callback(librbd::completion_t completion, void *arg);
+
+int read_string(int fd, unsigned max, std::string *out);
+
+int extract_spec(const std::string &spec, std::string *pool_name,
+                 std::string *image_name, std::string *snap_name);
+
+std::string get_positional_argument(
+    const boost::program_options::variables_map &vm, size_t index);
+
+int get_pool_image_snapshot_names(
+    const boost::program_options::variables_map &vm,
+    argument_types::ArgumentModifier mod, size_t *spec_arg_index,
+    std::string *pool_name, std::string *image_name, std::string *snap_name,
+    SnapshotPresence snapshot_presence, bool image_required = true);
+
+int validate_snapshot_name(argument_types::ArgumentModifier mod,
+                           const std::string &snap_name,
+                           SnapshotPresence snapshot_presence);
+
+int get_image_options(const boost::program_options::variables_map &vm,
+                      int *order, uint32_t *format, uint64_t *features,
+                      uint32_t *stripe_unit, uint32_t *stripe_count);
+
+int get_image_size(const boost::program_options::variables_map &vm,
+                   uint64_t *size);
+
+int get_path(const boost::program_options::variables_map &vm,
+             const std::string &positional_path, std::string *path);
+
+int get_formatter(const boost::program_options::variables_map &vm,
+                  argument_types::Format::Formatter *formatter);
+
+void init_context();
+
+int init(const std::string &pool_name, librados::Rados *rados,
+         librados::IoCtx *io_ctx);
+
+int init_io_ctx(librados::Rados &rados, const std::string &pool_name,
+                librados::IoCtx *io_ctx);
+
+int open_image(librados::IoCtx &io_ctx, const std::string &image_name,
+               bool read_only, librbd::Image *image);
+
+int init_and_open_image(const std::string &pool_name,
+                        const std::string &image_name,
+                        const std::string &snap_name, bool read_only,
+                        librados::Rados *rados, librados::IoCtx *io_ctx,
+                        librbd::Image *image);
+
+int snap_set(librbd::Image &image, const std::string snap_name);
+
+} // namespace utils
+} // namespace rbd
+
+#endif // CEPH_RBD_UTILS_H
diff --git a/src/tools/rbd/action/BenchWrite.cc b/src/tools/rbd/action/BenchWrite.cc
new file mode 100644
index 0000000..d767c08
--- /dev/null
+++ b/src/tools/rbd/action/BenchWrite.cc
@@ -0,0 +1,310 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "common/errno.h"
+#include "common/strtol.h"
+#include "common/Cond.h"
+#include "common/Mutex.h"
+#include <iostream>
+#include <boost/accumulators/accumulators.hpp>
+#include <boost/accumulators/statistics/stats.hpp>
+#include <boost/accumulators/statistics/rolling_sum.hpp>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace bench_write {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+namespace {
+
+struct Size {};
+struct IOPattern {};
+
+void validate(boost::any& v, const std::vector<std::string>& values,
+              Size *target_type, int) {
+  po::validators::check_first_occurrence(v);
+  const std::string &s = po::validators::get_single_string(values);
+
+  std::string parse_error;
+  uint64_t size = strict_sistrtoll(s.c_str(), &parse_error);
+  if (!parse_error.empty()) {
+    throw po::validation_error(po::validation_error::invalid_option_value);
+  }
+  v = boost::any(size);
+}
+
+void validate(boost::any& v, const std::vector<std::string>& values,
+              IOPattern *target_type, int) {
+  po::validators::check_first_occurrence(v);
+  const std::string &s = po::validators::get_single_string(values);
+  if (s == "rand") {
+    v = boost::any(true);
+  } else if (s == "seq") {
+    v = boost::any(false);
+  } else {
+    throw po::validation_error(po::validation_error::invalid_option_value);
+  }
+}
+
+} // anonymous namespace
+
+static void rbd_bencher_completion(void *c, void *pc);
+struct rbd_bencher;
+
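+// Bounded-queue write driver: start_write() refuses work once 'max'
+// requests are in flight, the AIO completion callback decrements the
+// counter and signals, and wait_for() blocks until the backlog drains.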
+struct rbd_bencher {
+  librbd::Image *image;
+  Mutex lock;
+  Cond cond;
+  int in_flight;
+
+  rbd_bencher(librbd::Image *i)
+    : image(i),
+      lock("rbd_bencher::lock"),
+      in_flight(0)
+  { }
+
+  bool start_write(int max, uint64_t off, uint64_t len, bufferlist& bl,
+                   int op_flags)
+  {
+    {
+      Mutex::Locker l(lock);
+      if (in_flight >= max)
+        return false;
+      in_flight++;
+    }
+    librbd::RBD::AioCompletion *c =
+      new librbd::RBD::AioCompletion((void *)this, rbd_bencher_completion);
+    image->aio_write2(off, len, bl, c, op_flags);
+    //cout << "start " << c << " at " << off << "~" << len << std::endl;
+    return true;
+  }
+
+  void wait_for(int max) {
+    Mutex::Locker l(lock);
+    while (in_flight > max) {
+      utime_t dur;
+      dur.set_from_double(.2);
+      cond.WaitInterval(g_ceph_context, lock, dur);
+    }
+  }
+
+};
+
+void rbd_bencher_completion(void *vc, void *pc)
+{
+  librbd::RBD::AioCompletion *c = (librbd::RBD::AioCompletion *)vc;
+  rbd_bencher *b = static_cast<rbd_bencher *>(pc);
+  //cout << "complete " << c << std::endl;
+  int ret = c->get_return_value();
+  if (ret != 0) {
+    cout << "write error: " << cpp_strerror(ret) << std::endl;
+    assert(0 == ret);
+  }
+  b->lock.Lock();
+  b->in_flight--;
+  b->cond.Signal();
+  b->lock.Unlock();
+  c->release();
+}
+
+int do_bench_write(librbd::Image& image, uint64_t io_size,
+                   uint64_t io_threads, uint64_t io_bytes,
+                   bool random)
+{
+  rbd_bencher b(&image);
+
+  std::cout << "bench-write "
+       << " io_size " << io_size
+       << " io_threads " << io_threads
+       << " bytes " << io_bytes
+       << " pattern " << (random ? "random" : "sequential")
+       << std::endl;
+
+  srand(time(NULL) % (unsigned long) -1);
+
+  bufferptr bp(io_size);
+  memset(bp.c_str(), rand() & 0xff, io_size);
+  bufferlist bl;
+  bl.push_back(bp);
+
+  utime_t start = ceph_clock_now(NULL);
+  utime_t last;
+  unsigned ios = 0;
+
+  uint64_t size = 0;
+  image.size(&size);
+
+  std::vector<uint64_t> thread_offset;
+  uint64_t i;
+  uint64_t start_pos;
+
+  // disturb all thread's offset, used by seq write
+  for (i = 0; i < io_threads; i++) {
+    start_pos = (rand() % (size / io_size)) * io_size;
+    thread_offset.push_back(start_pos);
+  }
+
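+  // Throughput is reported over a rolling window of the last
+  // WINDOW_SIZE roughly one-second samples: per-interval deltas of
+  // elapsed time, ops and bytes are accumulated, and the printed
+  // OPS/SEC and BYTES/SEC are ratios of their rolling sums.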
+  const int WINDOW_SIZE = 5;
+  typedef boost::accumulators::accumulator_set<
+    double, boost::accumulators::stats<
+      boost::accumulators::tag::rolling_sum> > RollingSum;
+
+  RollingSum time_acc(
+    boost::accumulators::tag::rolling_window::window_size = WINDOW_SIZE);
+  RollingSum ios_acc(
+    boost::accumulators::tag::rolling_window::window_size = WINDOW_SIZE);
+  RollingSum off_acc(
+    boost::accumulators::tag::rolling_window::window_size = WINDOW_SIZE);
+  uint64_t cur_ios = 0;
+  uint64_t cur_off = 0;
+
+  int op_flags;
+  if (random) {
+    op_flags = LIBRADOS_OP_FLAG_FADVISE_RANDOM;
+  } else {
+    op_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL;
+  }
+
+  printf("  SEC       OPS   OPS/SEC   BYTES/SEC\n");
+  uint64_t off;
+  for (off = 0; off < io_bytes; ) {
+    b.wait_for(io_threads - 1);
+    i = 0;
+    while (i < io_threads && off < io_bytes) {
+      if (random) {
+        thread_offset[i] = (rand() % (size / io_size)) * io_size;
+      } else {
+        thread_offset[i] += io_size;
+        if (thread_offset[i] + io_size > size)
+          thread_offset[i] = 0;
+      }
+
+      if (!b.start_write(io_threads, thread_offset[i], io_size, bl, op_flags))
+        break;
+
+      ++i;
+      ++ios;
+      off += io_size;
+
+      ++cur_ios;
+      cur_off += io_size;
+    }
+
+    utime_t now = ceph_clock_now(NULL);
+    utime_t elapsed = now - start;
+    if (last.is_zero()) {
+      last = elapsed;
+    } else if (elapsed.sec() != last.sec()) {
+      time_acc(elapsed - last);
+      ios_acc(static_cast<double>(cur_ios));
+      off_acc(static_cast<double>(cur_off));
+      cur_ios = 0;
+      cur_off = 0;
+
+      double time_sum = boost::accumulators::rolling_sum(time_acc);
+      printf("%5d  %8d  %8.2lf  %8.2lf\n",
+             (int)elapsed,
+             (int)(ios - io_threads),
+             boost::accumulators::rolling_sum(ios_acc) / time_sum,
+             boost::accumulators::rolling_sum(off_acc) / time_sum);
+      last = elapsed;
+    }
+  }
+  b.wait_for(0);
+  int r = image.flush();
+  if (r < 0) {
+    std::cerr << "Error flushing data at the end: " << cpp_strerror(r)
+              << std::endl;
+  }
+
+  utime_t now = ceph_clock_now(NULL);
+  double elapsed = now - start;
+
+  printf("elapsed: %5d  ops: %8d  ops/sec: %8.2lf  bytes/sec: %8.2lf\n",
+         (int)elapsed, ios, (double)ios / elapsed, (double)off / elapsed);
+
+  return 0;
+}
+
+void get_arguments(po::options_description *positional,
+                   po::options_description *options) {
+  at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+  // TODO
+  options->add_options()
+    ("io-size", po::value<Size>(), "write size (in B/K/M/G/T)")
+    ("io-threads", po::value<uint32_t>(), "ios in flight")
+    ("io-total", po::value<Size>(), "total size to write (in B/K/M/G/T)")
+    ("io-pattern", po::value<IOPattern>(), "write pattern (rand or seq)");
+}
+
+int execute(const po::variables_map &vm) {
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string image_name;
+  std::string snap_name;
+  int r = utils::get_pool_image_snapshot_names(
+    vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &image_name,
+    &snap_name, utils::SNAPSHOT_PRESENCE_NONE);
+  if (r < 0) {
+    return r;
+  }
+
+  uint64_t bench_io_size;
+  if (vm.count("io-size")) {
+    bench_io_size = vm["io-size"].as<uint64_t>();
+  } else {
+    bench_io_size = 4096;
+  }
+
+  uint32_t bench_io_threads;
+  if (vm.count("io-threads")) {
+    bench_io_threads = vm["io-threads"].as<uint32_t>();
+  } else {
+    bench_io_threads = 16;
+  }
+
+  uint64_t bench_bytes;
+  if (vm.count("io-total")) {
+    bench_bytes = vm["io-total"].as<uint64_t>();
+  } else {
+    bench_bytes = 1 << 30;
+  }
+
+  bool bench_random;
+  if (vm.count("io-pattern")) {
+    bench_random = vm["io-pattern"].as<bool>();
+  } else {
+    bench_random = false;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  librbd::Image image;
+  r = utils::init_and_open_image(pool_name, image_name, "", false, &rados,
+                                 &io_ctx, &image);
+  if (r < 0) {
+    return r;
+  }
+
+  r = do_bench_write(image, bench_io_size, bench_io_threads, bench_bytes,
+                     bench_random);
+  if (r < 0) {
+    std::cerr << "bench-write failed: " << cpp_strerror(r) << std::endl;
+    return r;
+  }
+  return 0;
+}
+
+Shell::Action action(
+  {"bench-write"}, {}, "Simple write benchmark.", "", &get_arguments, &execute);
+
+} // namespace bench_write
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Children.cc b/src/tools/rbd/action/Children.cc
new file mode 100644
index 0000000..b930eb9
--- /dev/null
+++ b/src/tools/rbd/action/Children.cc
@@ -0,0 +1,98 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace children {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+int do_list_children(librbd::Image &image, Formatter *f)
+{
+  std::set<std::pair<std::string, std::string> > children;
+  int r;
+
+  r = image.list_children(&children);
+  if (r < 0)
+    return r;
+
+  if (f)
+    f->open_array_section("children");
+
+  for (auto &child_it : children) {
+    if (f) {
+      f->open_object_section("child");
+      f->dump_string("pool", child_it.first);
+      f->dump_string("image", child_it.second);
+      f->close_section();
+    } else {
+      std::cout << child_it.first << "/" << child_it.second << std::endl;
+    }
+  }
+
+  if (f) {
+    f->close_section();
+    f->flush(std::cout);
+  }
+
+  return 0;
+}
+
+void get_arguments(po::options_description *positional,
+                   po::options_description *options) {
+  at::add_snap_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+  at::add_format_options(options);
+}
+
+int execute(const po::variables_map &vm) {
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string image_name;
+  std::string snap_name;
+  int r = utils::get_pool_image_snapshot_names(
+    vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &image_name,
+    &snap_name, utils::SNAPSHOT_PRESENCE_REQUIRED);
+  if (r < 0) {
+    return r;
+  }
+
+  at::Format::Formatter formatter;
+  r = utils::get_formatter(vm, &formatter);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  librbd::Image image;
+  r = utils::init_and_open_image(pool_name, image_name, snap_name, true,
+                                 &rados, &io_ctx, &image);
+  if (r < 0) {
+    return r;
+  }
+
+  r = do_list_children(image, formatter.get());
+  if (r < 0) {
+    std::cerr << "rbd: listing children failed: " << cpp_strerror(r)
+              << std::endl;
+    return r;
+  }
+  return 0;
+}
+
+Shell::Action action(
+  {"children"}, {}, "Display children of snapshot.", "", &get_arguments,
+  &execute);
+
+} // namespace children
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Clone.cc b/src/tools/rbd/action/Clone.cc
new file mode 100644
index 0000000..6c98433
--- /dev/null
+++ b/src/tools/rbd/action/Clone.cc
@@ -0,0 +1,100 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "common/errno.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace clone {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
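+// clone2() can only produce a COW child when the layering feature is
+// enabled, so feature masks without RBD_FEATURE_LAYERING are rejected
+// up front.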
+int do_clone(librbd::RBD &rbd, librados::IoCtx &p_ioctx,
+             const char *p_name, const char *p_snapname,
+             librados::IoCtx &c_ioctx, const char *c_name,
+             uint64_t features, int *c_order,
+             uint64_t stripe_unit, uint64_t stripe_count) {
+  if ((features & RBD_FEATURE_LAYERING) != RBD_FEATURE_LAYERING) {
+    return -EINVAL;
+  }
+
+  return rbd.clone2(p_ioctx, p_name, p_snapname, c_ioctx, c_name, features,
+                    c_order, stripe_unit, stripe_count);
+}
+
+void get_arguments(po::options_description *positional,
+                   po::options_description *options) {
+  at::add_snap_spec_options(positional, options, at::ARGUMENT_MODIFIER_SOURCE);
+  at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_DEST);
+  at::add_create_image_options(options, false);
+}
+
+int execute(const po::variables_map &vm) {
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string image_name;
+  std::string snap_name;
+  int r = utils::get_pool_image_snapshot_names(
+    vm, at::ARGUMENT_MODIFIER_SOURCE, &arg_index, &pool_name, &image_name,
+    &snap_name, utils::SNAPSHOT_PRESENCE_REQUIRED);
+  if (r < 0) {
+    return r;
+  }
+
+  std::string dst_pool_name;
+  std::string dst_image_name;
+  std::string dst_snap_name;
+  r = utils::get_pool_image_snapshot_names(
+    vm, at::ARGUMENT_MODIFIER_DEST, &arg_index, &dst_pool_name, &dst_image_name,
+    &dst_snap_name, utils::SNAPSHOT_PRESENCE_NONE);
+  if (r < 0) {
+    return r;
+  }
+
+  int order;
+  uint64_t features;
+  uint32_t stripe_unit;
+  uint32_t stripe_count;
+  r = utils::get_image_options(vm, &order, nullptr, &features, &stripe_unit,
+                               &stripe_count);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  r = utils::init(pool_name, &rados, &io_ctx);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::IoCtx dst_io_ctx;
+  r = utils::init_io_ctx(rados, dst_pool_name, &dst_io_ctx);
+  if (r < 0) {
+    return r;
+  }
+
+  librbd::RBD rbd;
+  r = do_clone(rbd, io_ctx, image_name.c_str(), snap_name.c_str(), dst_io_ctx,
+               dst_image_name.c_str(), features, &order, stripe_unit,
+               stripe_count);
+  if (r < 0) {
+    std::cerr << "rbd: clone error: " << cpp_strerror(r) << std::endl;
+    return r;
+  }
+  return 0;
+}
+
+Shell::Action action(
+  {"clone"}, {}, "Clone a snapshot into a COW child image.",
+  at::get_long_features_help(), &get_arguments, &execute);
+
+} // namespace clone
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Copy.cc b/src/tools/rbd/action/Copy.cc
new file mode 100644
index 0000000..9275e4b
--- /dev/null
+++ b/src/tools/rbd/action/Copy.cc
@@ -0,0 +1,90 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "common/errno.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace copy {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+static int do_copy(librbd::Image &src, librados::IoCtx& dest_pp,
+                   const char *destname, bool no_progress)
+{
+  utils::ProgressContext pc("Image copy", no_progress);
+  int r = src.copy_with_progress(dest_pp, destname, pc);
+  if (r < 0){
+    pc.fail();
+    return r;
+  }
+  pc.finish();
+  return 0;
+}
+
+void get_arguments(po::options_description *positional,
+                   po::options_description *options) {
+  at::add_image_or_snap_spec_options(positional, options,
+                                     at::ARGUMENT_MODIFIER_SOURCE);
+  at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_DEST);
+  at::add_no_progress_option(options);
+}
+
+int execute(const po::variables_map &vm) {
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string image_name;
+  std::string snap_name;
+  int r = utils::get_pool_image_snapshot_names(
+    vm, at::ARGUMENT_MODIFIER_SOURCE, &arg_index, &pool_name, &image_name,
+    &snap_name, utils::SNAPSHOT_PRESENCE_PERMITTED);
+  if (r < 0) {
+    return r;
+  }
+
+  std::string dst_pool_name;
+  std::string dst_image_name;
+  std::string dst_snap_name;
+  r = utils::get_pool_image_snapshot_names(
+    vm, at::ARGUMENT_MODIFIER_DEST, &arg_index, &dst_pool_name, &dst_image_name,
+    &dst_snap_name, utils::SNAPSHOT_PRESENCE_NONE);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  librbd::Image image;
+  r = utils::init_and_open_image(pool_name, image_name, snap_name, true,
+                                 &rados, &io_ctx, &image);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::IoCtx dst_io_ctx;
+  r = utils::init_io_ctx(rados, dst_pool_name, &dst_io_ctx);
+  if (r < 0) {
+    return r;
+  }
+
+  r = do_copy(image, dst_io_ctx, dst_image_name.c_str(),
+              vm[at::NO_PROGRESS].as<bool>());
+  if (r < 0) {
+    std::cerr << "rbd: copy failed: " << cpp_strerror(r) << std::endl;
+    return r;
+  }
+  return 0;
+}
+
+Shell::Action action(
+  {"copy"}, {"cp"}, "Copy src image to dest.", "", &get_arguments, &execute);
+
+} // namespace copy
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Create.cc b/src/tools/rbd/action/Create.cc
new file mode 100644
index 0000000..49eedb6
--- /dev/null
+++ b/src/tools/rbd/action/Create.cc
@@ -0,0 +1,94 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "common/errno.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace create {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+static int do_create(librbd::RBD &rbd, librados::IoCtx& io_ctx,
+                     const char *imgname, uint64_t size, int *order,
+                     int format, uint64_t features,
+                     uint64_t stripe_unit, uint64_t stripe_count) {
+  int r;
+  if (format == 1) {
+    r = rbd.create(io_ctx, imgname, size, order);
+  } else {
+    r = rbd.create3(io_ctx, imgname, size, features, order,
+                    stripe_unit, stripe_count);
+  }
+  if (r < 0) {
+    return r;
+  }
+  return 0;
+}
+
+void get_arguments(po::options_description *positional,
+                   po::options_description *options) {
+  at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+  at::add_create_image_options(options, true);
+  at::add_size_option(options);
+}
+
+int execute(const po::variables_map &vm) {
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string image_name;
+  std::string snap_name;
+  int r = utils::get_pool_image_snapshot_names(
+    vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &image_name,
+    &snap_name, utils::SNAPSHOT_PRESENCE_NONE);
+  if (r < 0) {
+    return r;
+  }
+
+  int order;
+  uint32_t format;
+  uint64_t features;
+  uint32_t stripe_unit;
+  uint32_t stripe_count;
+  r = utils::get_image_options(vm, &order, &format, &features, &stripe_unit,
+                               &stripe_count);
+  if (r < 0) {
+    return r;
+  }
+
+  uint64_t size;
+  r = utils::get_image_size(vm, &size);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  r = utils::init(pool_name, &rados, &io_ctx);
+  if (r < 0) {
+    return r;
+  }
+
+  librbd::RBD rbd;
+  r = do_create(rbd, io_ctx, image_name.c_str(), size, &order, format, features,
+                stripe_unit, stripe_count);
+  if (r < 0) {
+    std::cerr << "rbd: create error: " << cpp_strerror(r) << std::endl;
+    return r;
+  }
+  return 0;
+}
+
+Shell::Action action(
+  {"create"}, {}, "Create an empty image.", at::get_long_features_help(),
+  &get_arguments, &execute);
+
+} // namespace create
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Diff.cc b/src/tools/rbd/action/Diff.cc
new file mode 100644
index 0000000..cd0aeb2
--- /dev/null
+++ b/src/tools/rbd/action/Diff.cc
@@ -0,0 +1,140 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include "common/TextTable.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace diff {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+struct output_method {
+  output_method() : f(NULL), t(NULL), empty(true) {}
+  Formatter *f;
+  TextTable *t;
+  bool empty;
+};
+
+static int diff_cb(uint64_t ofs, size_t len, int exists, void *arg)
+{
+  output_method *om = static_cast<output_method *>(arg);
+  om->empty = false;
+  if (om->f) {
+    om->f->open_object_section("extent");
+    om->f->dump_unsigned("offset", ofs);
+    om->f->dump_unsigned("length", len);
+    om->f->dump_string("exists", exists ? "true" : "false");
+    om->f->close_section();
+  } else {
+    assert(om->t);
+    *(om->t) << ofs << len << (exists ? "data" : "zero") << TextTable::endrow;
+  }
+  return 0;
+}
+
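+// Iterates the extents that changed since 'fromsnapname' (or since
+// creation when it is NULL) and emits them either through a Formatter
+// as an "extents" array or as an Offset/Length/Type text table.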
+static int do_diff(librbd::Image& image, const char *fromsnapname,
+                   bool whole_object, Formatter *f)
+{
+  int r;
+  librbd::image_info_t info;
+
+  r = image.stat(info, sizeof(info));
+  if (r < 0)
+    return r;
+
+  output_method om;
+  if (f) {
+    om.f = f;
+    f->open_array_section("extents");
+  } else {
+    om.t = new TextTable();
+    om.t->define_column("Offset", TextTable::LEFT, TextTable::LEFT);
+    om.t->define_column("Length", TextTable::LEFT, TextTable::LEFT);
+    om.t->define_column("Type", TextTable::LEFT, TextTable::LEFT);
+  }
+
+  r = image.diff_iterate2(fromsnapname, 0, info.size, true, whole_object,
+                          diff_cb, &om);
+  if (f) {
+    f->close_section();
+    f->flush(std::cout);
+  } else {
+    if (!om.empty)
+      std::cout << *om.t;
+    delete om.t;
+  }
+  return r;
+}
+
+void get_arguments(po::options_description *positional,
+                   po::options_description *options) {
+  at::add_image_or_snap_spec_options(positional, options,
+                                     at::ARGUMENT_MODIFIER_NONE);
+  options->add_options()
+    (at::FROM_SNAPSHOT_NAME.c_str(), po::value<std::string>(),
+     "snapshot starting point")
+    (at::WHOLE_OBJECT.c_str(), po::bool_switch(), "compare whole object");
+  at::add_format_options(options);
+}
+
+int execute(const po::variables_map &vm) {
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string image_name;
+  std::string snap_name;
+  int r = utils::get_pool_image_snapshot_names(
+    vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &image_name,
+    &snap_name, utils::SNAPSHOT_PRESENCE_PERMITTED);
+  if (r < 0) {
+    return r;
+  }
+
+  std::string from_snap_name;
+  if (vm.count(at::FROM_SNAPSHOT_NAME)) {
+    from_snap_name = vm[at::FROM_SNAPSHOT_NAME].as<std::string>();
+  }
+
+  bool diff_whole_object = vm[at::WHOLE_OBJECT].as<bool>();
+
+  at::Format::Formatter formatter;
+  r = utils::get_formatter(vm, &formatter);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  librbd::Image image;
+  r = utils::init_and_open_image(pool_name, image_name, snap_name, true,
+                                 &rados, &io_ctx, &image);
+  if (r < 0) {
+    return r;
+  }
+
+  r = do_diff(image, from_snap_name.empty() ? nullptr : from_snap_name.c_str(),
+              diff_whole_object, formatter.get());
+  if (r < 0) {
+    std::cerr << "rbd: diff error: " << cpp_strerror(r) << std::endl;
+    return -r;
+  }
+  return 0;
+}
+
+Shell::SwitchArguments switched_arguments({at::WHOLE_OBJECT});
+Shell::Action action(
+  {"diff"}, {},
+  "Print extents that differ since a previous snap, or image creation.", "",
+  &get_arguments, &execute);
+
+} // namespace diff
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/DiskUsage.cc b/src/tools/rbd/action/DiskUsage.cc
new file mode 100644
index 0000000..8e59ffe
--- /dev/null
+++ b/src/tools/rbd/action/DiskUsage.cc
@@ -0,0 +1,268 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "include/types.h"
+#include "include/stringify.h"
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include "common/TextTable.h"
+#include <algorithm>
+#include <iostream>
+#include <boost/bind.hpp>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace disk_usage {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+static int disk_usage_callback(uint64_t offset, size_t len, int exists,
+                               void *arg) {
+  uint64_t *used_size = reinterpret_cast<uint64_t *>(arg);
+  if (exists) {
+    (*used_size) += len;
+  }
+  return 0;
+}
+
+static int compute_image_disk_usage(const std::string& name,
+                                    const std::string& snap_name,
+                                    const std::string& from_snap_name,
+                                    librbd::Image &image, uint64_t size,
+                                    TextTable& tbl, Formatter *f,
+                                    uint64_t *used_size) {
+  const char* from = NULL;
+  if (!from_snap_name.empty()) {
+    from = from_snap_name.c_str();
+  }
+
+  uint64_t flags;
+  int r = image.get_flags(&flags);
+  if (r < 0) {
+    std::cerr << "rbd: failed to retrieve image flags: " << cpp_strerror(r)
+         << std::endl;
+    return r;
+  }
+  if ((flags & RBD_FLAG_FAST_DIFF_INVALID) != 0) {
+    std::cerr << "warning: fast-diff map is invalid for " << name
+         << (snap_name.empty() ? "" : "@" + snap_name) << ". "
+         << "operation may be slow." << std::endl;
+  }
+
+  *used_size = 0;
+  r = image.diff_iterate2(from, 0, size, false, true,
+                          &disk_usage_callback, used_size);
+  if (r < 0) {
+    std::cerr << "rbd: failed to iterate diffs: " << cpp_strerror(r)
+              << std::endl;
+    return r;
+  }
+
+  if (f) {
+    f->open_object_section("image");
+    f->dump_string("name", name);
+    if (!snap_name.empty()) {
+      f->dump_string("snapshot", snap_name);
+    }
+    f->dump_unsigned("provisioned_size", size);
+    f->dump_unsigned("used_size" , *used_size);
+    f->close_section();
+  } else {
+    std::string full_name = name;
+    if (!snap_name.empty()) {
+      full_name += "@" + snap_name;
+    }
+    tbl << full_name
+        << stringify(si_t(size))
+        << stringify(si_t(*used_size))
+        << TextTable::endrow;
+  }
+  return 0;
+}
+
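+// Usage is computed incrementally: each snapshot is diffed against the
+// previous snapshot and the image head against its newest snapshot, so
+// the USED column shows space added since the previous entry rather
+// than a running total.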
+static int do_disk_usage(librbd::RBD &rbd, librados::IoCtx &io_ctx,
+                        const char *imgname, const char *snapname,
+                        Formatter *f) {
+  std::vector<std::string> names;
+  int r = rbd.list(io_ctx, names);
+  if (r == -ENOENT) {
+    r = 0;
+  } else if (r < 0) {
+    return r;
+  }
+
+  TextTable tbl;
+  if (f) {
+    f->open_object_section("stats");
+    f->open_array_section("images");
+  } else {
+    tbl.define_column("NAME", TextTable::LEFT, TextTable::LEFT);
+    tbl.define_column("PROVISIONED", TextTable::RIGHT, TextTable::RIGHT);
+    tbl.define_column("USED", TextTable::RIGHT, TextTable::RIGHT);
+  }
+
+  uint64_t used_size = 0;
+  uint64_t total_prov = 0;
+  uint64_t total_used = 0;
+  std::sort(names.begin(), names.end());
+  for (std::vector<std::string>::const_iterator name = names.begin();
+       name != names.end(); ++name) {
+    if (imgname != NULL && *name != imgname) {
+      continue;
+    }
+
+    librbd::Image image;
+    r = rbd.open_read_only(io_ctx, image, name->c_str(), NULL);
+    if (r < 0) {
+      if (r != -ENOENT) {
+        std::cerr << "rbd: error opening " << *name << ": " << cpp_strerror(r)
+                  << std::endl;
+      }
+      continue;
+    }
+
+    uint64_t features;
+    r = image.features(&features);
+    if (r < 0) {
+      std::cerr << "rbd: failed to retrieve image features: " << cpp_strerror(r)
+                << std::endl;
+      return r;
+    }
+    if ((features & RBD_FEATURE_FAST_DIFF) == 0) {
+      std::cerr << "warning: fast-diff map is not enabled for " << *name << ". "
+                << "operation may be slow." << std::endl;
+    }
+
+    librbd::image_info_t info;
+    if (image.stat(info, sizeof(info)) < 0) {
+      return -EINVAL;
+    }
+
+    std::vector<librbd::snap_info_t> snap_list;
+    r = image.snap_list(snap_list);
+    if (r < 0) {
+      std::cerr << "rbd: error opening " << *name << " snapshots: "
+                << cpp_strerror(r) << std::endl;
+      continue;
+    }
+
+    std::string last_snap_name;
+    std::sort(snap_list.begin(), snap_list.end(),
+              boost::bind(&librbd::snap_info_t::id, _1) <
+                boost::bind(&librbd::snap_info_t::id, _2));
+    for (std::vector<librbd::snap_info_t>::const_iterator snap =
+         snap_list.begin(); snap != snap_list.end(); ++snap) {
+      librbd::Image snap_image;
+      r = rbd.open_read_only(io_ctx, snap_image, name->c_str(),
+                             snap->name.c_str());
+      if (r < 0) {
+        std::cerr << "rbd: error opening snapshot " << *name << "@"
+                  << snap->name << ": " << cpp_strerror(r) << std::endl;
+        return r;
+      }
+
+      if (imgname == NULL || (snapname != NULL && snap->name == snapname)) {
+        r = compute_image_disk_usage(*name, snap->name, last_snap_name,
+                                     snap_image, snap->size, tbl, f,
+                                     &used_size);
+        if (r < 0) {
+          return r;
+        }
+
+        if (snapname != NULL) {
+          total_prov += snap->size;
+        }
+        total_used += used_size;
+      }
+      last_snap_name = snap->name;
+    }
+
+    if (snapname == NULL) {
+      r = compute_image_disk_usage(*name, "", last_snap_name, image, info.size,
+                                   tbl, f, &used_size);
+      if (r < 0) {
+        return r;
+      }
+      total_prov += info.size;
+      total_used += used_size;
+    }
+  }
+
+  if (f) {
+    f->close_section();
+    if (imgname == NULL) {
+      f->dump_unsigned("total_provisioned_size", total_prov);
+      f->dump_unsigned("total_used_size", total_used);
+    }
+    f->close_section();
+    f->flush(std::cout);
+  } else {
+    if (imgname == NULL) {
+      tbl << "<TOTAL>"
+          << stringify(si_t(total_prov))
+          << stringify(si_t(total_used))
+          << TextTable::endrow;
+    }
+    std::cout << tbl;
+  }
+
+  return 0;
+}
+
+void get_arguments(po::options_description *positional,
+                   po::options_description *options) {
+  at::add_image_or_snap_spec_options(positional, options,
+                                     at::ARGUMENT_MODIFIER_NONE);
+  at::add_format_options(options);
+}
+
+int execute(const po::variables_map &vm) {
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string image_name;
+  std::string snap_name;
+  int r = utils::get_pool_image_snapshot_names(
+    vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &image_name,
+    &snap_name, utils::SNAPSHOT_PRESENCE_PERMITTED,
+    false);
+  if (r < 0) {
+    return r;
+  }
+
+  at::Format::Formatter formatter;
+  r = utils::get_formatter(vm, &formatter);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  r = utils::init(pool_name, &rados, &io_ctx);
+  if (r < 0) {
+    return r;
+  }
+
+  librbd::RBD rbd;
+  r = do_disk_usage(rbd, io_ctx,
+                    image_name.empty() ? nullptr : image_name.c_str(),
+                    snap_name.empty() ? nullptr : snap_name.c_str(),
+                    formatter.get());
+  if (r < 0) {
+    std::cerr << "du failed: " << cpp_strerror(r) << std::endl;
+    return r;
+  }
+  return 0;
+}
+
+Shell::Action action(
+  {"disk-usage"}, {"du"}, "Show disk usage stats for pool, image or snapshot",
+  "", &get_arguments, &execute);
+
+} // namespace disk_usage
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Export.cc b/src/tools/rbd/action/Export.cc
new file mode 100644
index 0000000..324a4b3
--- /dev/null
+++ b/src/tools/rbd/action/Export.cc
@@ -0,0 +1,196 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "include/Context.h"
+#include "common/errno.h"
+#include "common/Throttle.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+#include <boost/scope_exit.hpp>
+
+namespace rbd {
+namespace action {
+namespace export_full {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
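+// One C_Export issues a single aio_read2() for its [offset, length)
+// extent and, on completion, writes the data to the output fd at the
+// same offset (skipping all-zero extents when not writing to stdout);
+// SimpleThrottle bounds how many extents are in flight at once.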
+class C_Export : public Context
+{
+public:
+  C_Export(SimpleThrottle &simple_throttle, librbd::Image &image,
+                   uint64_t offset, uint64_t length, int fd)
+    : m_aio_completion(
+        new librbd::RBD::AioCompletion(this, &utils::aio_context_callback)),
+      m_throttle(simple_throttle), m_image(image), m_offset(offset),
+      m_length(length), m_fd(fd)
+  {
+  }
+
+  void send()
+  {
+    m_throttle.start_op();
+
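+    // advise the cluster that this is a one-pass sequential read whose
+    // data does not need to stay cached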
+    int op_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL |
+                   LIBRADOS_OP_FLAG_FADVISE_NOCACHE;
+    int r = m_image.aio_read2(m_offset, m_length, m_bufferlist,
+                              m_aio_completion, op_flags);
+    if (r < 0) {
+      cerr << "rbd: error requesting read from source image" << std::endl;
+      m_aio_completion->release();
+      m_throttle.end_op(r);
+    }
+  }
+
+  virtual void finish(int r)
+  {
+    BOOST_SCOPE_EXIT((&m_throttle) (&r))
+    {
+      m_throttle.end_op(r);
+    } BOOST_SCOPE_EXIT_END
+
+    if (r < 0) {
+      cerr << "rbd: error reading from source image at offset "
+           << m_offset << ": " << cpp_strerror(r) << std::endl;
+      return;
+    }
+
+    assert(m_bufferlist.length() == static_cast<size_t>(r));
+    if (m_fd != STDOUT_FILENO) {
+      if (m_bufferlist.is_zero()) {
+        return;
+      }
+
+      off64_t chkret = lseek64(m_fd, m_offset, SEEK_SET);
+      if (chkret < 0) {
+        r = -errno;  // capture errno before the stream output can clobber it
+        std::cerr << "rbd: error seeking destination image to offset "
+                  << m_offset << std::endl;
+        return;
+      }
+    }
+
+    r = m_bufferlist.write_fd(m_fd);
+    if (r < 0) {
+      cerr << "rbd: error writing to destination image at offset "
+           << m_offset << std::endl;
+    }
+  }
+
+private:
+  librbd::RBD::AioCompletion *m_aio_completion;
+  SimpleThrottle &m_throttle;
+  librbd::Image &m_image;
+  bufferlist m_bufferlist;
+  uint64_t m_offset;
+  uint64_t m_length;
+  int m_fd;
+};
+
+static int do_export(librbd::Image& image, const char *path, bool no_progress)
+{
+  librbd::image_info_t info;
+  int64_t r = image.stat(info, sizeof(info));
+  if (r < 0)
+    return r;
+
+  int fd;
+  int max_concurrent_ops;
+  bool to_stdout = (strcmp(path, "-") == 0);
+  if (to_stdout) {
+    fd = STDOUT_FILENO;
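+    // stdout cannot seek, so completions must land in offset order;
+    // a single in-flight op guarantees that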
+    max_concurrent_ops = 1;
+  } else {
+    max_concurrent_ops = std::max(g_conf->rbd_concurrent_management_ops, 1);
+    fd = open(path, O_WRONLY | O_CREAT | O_EXCL, 0644);
+    if (fd < 0) {
+      return -errno;
+    }
+    posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL);
+  }
+
+  utils::ProgressContext pc("Exporting image", no_progress);
+
+  SimpleThrottle throttle(max_concurrent_ops, false);
+  uint64_t period = image.get_stripe_count() * (1ull << info.order);
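+  // issue reads one stripe period at a time (stripe_count objects of
+  // 2^order bytes each) so every AIO covers whole objects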
+  for (uint64_t offset = 0; offset < info.size; offset += period) {
+    if (throttle.pending_error()) {
+      break;
+    }
+
+    uint64_t length = std::min(period, info.size - offset);
+    C_Export *ctx = new C_Export(throttle, image, offset, length, fd);
+    ctx->send();
+
+    pc.update_progress(offset, info.size);
+  }
+
+  r = throttle.wait_for_ret();
+  if (!to_stdout) {
+    if (r >= 0) {
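+      // zeroed regions were skipped by the writer; extend the sparse
+      // file to the full image size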
+      r = ftruncate(fd, info.size);
+    }
+    close(fd);
+  }
+
+  if (r < 0) {
+    pc.fail();
+  } else {
+    pc.finish();
+  }
+  return r;
+}
+
+void get_arguments(po::options_description *positional,
+                   po::options_description *options) {
+  at::add_image_or_snap_spec_options(positional, options,
+                                     at::ARGUMENT_MODIFIER_SOURCE);
+  at::add_path_options(positional, options,
+                       "export file (or '-' for stdout)");
+  at::add_no_progress_option(options);
+}
+
+int execute(const po::variables_map &vm) {
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string image_name;
+  std::string snap_name;
+  int r = utils::get_pool_image_snapshot_names(
+    vm, at::ARGUMENT_MODIFIER_SOURCE, &arg_index, &pool_name, &image_name,
+    &snap_name, utils::SNAPSHOT_PRESENCE_PERMITTED);
+  if (r < 0) {
+    return r;
+  }
+
+  std::string path;
+  r = utils::get_path(vm, utils::get_positional_argument(vm, 1), &path);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  librbd::Image image;
+  r = utils::init_and_open_image(pool_name, image_name, snap_name, true,
+                                 &rados, &io_ctx, &image);
+  if (r < 0) {
+    return r;
+  }
+
+  r = do_export(image, path.c_str(), vm[at::NO_PROGRESS].as<bool>());
+  if (r < 0) {
+    std::cerr << "rbd: export error: " << cpp_strerror(r) << std::endl;
+    return r;
+  }
+  return 0;
+}
+
+Shell::Action action(
+  {"export"}, {}, "Export image to file.", "", &get_arguments, &execute);
+
+} // namespace export_full
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/ExportDiff.cc b/src/tools/rbd/action/ExportDiff.cc
new file mode 100644
index 0000000..245bbf3
--- /dev/null
+++ b/src/tools/rbd/action/ExportDiff.cc
@@ -0,0 +1,260 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "include/encoding.h"
+#include "common/errno.h"
+#include "common/Throttle.h"
+#include <fcntl.h>
+#include <iostream>
+#include <stdlib.h>
+#include <boost/program_options.hpp>
+#include <boost/scope_exit.hpp>
+
+namespace rbd {
+namespace action {
+namespace export_diff {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+struct ExportDiffContext {
+  librbd::Image *image;
+  int fd;
+  uint64_t totalsize;
+  utils::ProgressContext pc;
+  OrderedThrottle throttle;
+
+  ExportDiffContext(librbd::Image *i, int f, uint64_t t, int max_ops,
+                    bool no_progress) :
+    image(i), fd(f), totalsize(t), pc("Exporting image", no_progress),
+    throttle(max_ops, true) {
+  }
+};
+
+class C_ExportDiff : public Context {
+public:
+  C_ExportDiff(ExportDiffContext *edc, uint64_t offset, uint64_t length,
+               bool exists)
+    : m_export_diff_context(edc), m_offset(offset), m_length(length),
+      m_exists(exists) {
+  }
+
+  int send() {
+    if (m_export_diff_context->throttle.pending_error()) {
+      return m_export_diff_context->throttle.wait_for_ret();
+    }
+
+    C_OrderedThrottle *ctx = m_export_diff_context->throttle.start_op(this);
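+    // extents that exist are read back asynchronously; holes complete
+    // immediately and are recorded as zero ('z') extents in finish()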
+    if (m_exists) {
+      librbd::RBD::AioCompletion *aio_completion =
+        new librbd::RBD::AioCompletion(ctx, &utils::aio_context_callback);
+
+      int op_flags = LIBRADOS_OP_FLAG_FADVISE_NOCACHE;
+      int r = m_export_diff_context->image->aio_read2(
+        m_offset, m_length, m_read_data, aio_completion, op_flags);
+      if (r < 0) {
+        aio_completion->release();
+        ctx->complete(r);
+      }
+    } else {
+      ctx->complete(0);
+    }
+    return 0;
+  }
+
+  static int export_diff_cb(uint64_t offset, size_t length, int exists,
+                            void *arg) {
+    ExportDiffContext *edc = reinterpret_cast<ExportDiffContext *>(arg);
+
+    C_ExportDiff *context = new C_ExportDiff(edc, offset, length, exists);
+    return context->send();
+  }
+
+protected:
+  virtual void finish(int r) {
+    if (r >= 0) {
+      if (m_exists) {
+        m_exists = !m_read_data.is_zero();
+      }
+      r = write_extent(m_export_diff_context, m_offset, m_length, m_exists);
+      if (r == 0 && m_exists) {
+        r = m_read_data.write_fd(m_export_diff_context->fd);
+      }
+    }
+    m_export_diff_context->throttle.end_op(r);
+  }
+
+private:
+  ExportDiffContext *m_export_diff_context;
+  uint64_t m_offset;
+  uint64_t m_length;
+  bool m_exists;
+  bufferlist m_read_data;
+
+  static int write_extent(ExportDiffContext *edc, uint64_t offset,
+                          uint64_t length, bool exists) {
+    // extent
+    bufferlist bl;
+    __u8 tag = exists ? 'w' : 'z';
+    ::encode(tag, bl);
+    ::encode(offset, bl);
+    ::encode(length, bl);
+    int r = bl.write_fd(edc->fd);
+
+    edc->pc.update_progress(offset, edc->totalsize);
+    return r;
+  }
+};
+
+static int do_export_diff(librbd::Image& image, const char *fromsnapname,
+                          const char *endsnapname, bool whole_object,
+                          const char *path, bool no_progress)
+{
+  int r;
+  librbd::image_info_t info;
+  int fd;
+
+  r = image.stat(info, sizeof(info));
+  if (r < 0)
+    return r;
+
+  if (strcmp(path, "-") == 0)
+    fd = 1;
+  else
+    fd = open(path, O_WRONLY | O_CREAT | O_EXCL, 0644);
+  if (fd < 0)
+    return -errno;
+
+  BOOST_SCOPE_EXIT((&r) (&fd) (&path)) {
+    close(fd);
+    if (r < 0 && fd != 1) {
+      remove(path);
+    }
+  } BOOST_SCOPE_EXIT_END
+
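+  // diff stream layout: banner, then optional 'f' (from snap) and 't'
+  // (to snap) records, an 's' (image size) record, a series of 'w'
+  // (data) / 'z' (zero) extent records, and a final 'e' end tag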
+  {
+    // header
+    bufferlist bl;
+    bl.append(utils::RBD_DIFF_BANNER);
+
+    __u8 tag;
+    if (fromsnapname) {
+      tag = 'f';
+      ::encode(tag, bl);
+      std::string from(fromsnapname);
+      ::encode(from, bl);
+    }
+
+    if (endsnapname) {
+      tag = 't';
+      ::encode(tag, bl);
+      std::string to(endsnapname);
+      ::encode(to, bl);
+    }
+
+    tag = 's';
+    ::encode(tag, bl);
+    uint64_t endsize = info.size;
+    ::encode(endsize, bl);
+
+    r = bl.write_fd(fd);
+    if (r < 0) {
+      return r;
+    }
+  }
+  ExportDiffContext edc(&image, fd, info.size,
+                        g_conf->rbd_concurrent_management_ops, no_progress);
+  r = image.diff_iterate2(fromsnapname, 0, info.size, true, whole_object,
+                          &C_ExportDiff::export_diff_cb, (void *)&edc);
+  if (r < 0) {
+    goto out;
+  }
+
+  r = edc.throttle.wait_for_ret();
+  if (r < 0) {
+    goto out;
+  }
+
+  {
+    __u8 tag = 'e';
+    bufferlist bl;
+    ::encode(tag, bl);
+    r = bl.write_fd(fd);
+  }
+
+ out:
+  if (r < 0)
+    edc.pc.fail();
+  else
+    edc.pc.finish();
+  return r;
+}
+
+void get_arguments(po::options_description *positional,
+                   po::options_description *options) {
+  at::add_image_or_snap_spec_options(positional, options,
+                                     at::ARGUMENT_MODIFIER_SOURCE);
+  at::add_path_options(positional, options,
+                       "export file (or '-' for stdout)");
+  options->add_options()
+    (at::FROM_SNAPSHOT_NAME.c_str(), po::value<std::string>(),
+     "snapshot starting point")
+    (at::WHOLE_OBJECT.c_str(), po::bool_switch(), "compare whole object");
+  at::add_no_progress_option(options);
+}
+
+int execute(const po::variables_map &vm) {
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string image_name;
+  std::string snap_name;
+  int r = utils::get_pool_image_snapshot_names(
+    vm, at::ARGUMENT_MODIFIER_SOURCE, &arg_index, &pool_name, &image_name,
+    &snap_name, utils::SNAPSHOT_PRESENCE_PERMITTED);
+  if (r < 0) {
+    return r;
+  }
+
+  std::string path;
+  r = utils::get_path(vm, utils::get_positional_argument(vm, 1), &path);
+  if (r < 0) {
+    return r;
+  }
+
+  std::string from_snap_name;
+  if (vm.count(at::FROM_SNAPSHOT_NAME)) {
+    from_snap_name = vm[at::FROM_SNAPSHOT_NAME].as<std::string>();
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  librbd::Image image;
+  r = utils::init_and_open_image(pool_name, image_name, snap_name, true,
+                                 &rados, &io_ctx, &image);
+  if (r < 0) {
+    return r;
+  }
+
+  r = do_export_diff(image,
+                     from_snap_name.empty() ? nullptr : from_snap_name.c_str(),
+                     snap_name.empty() ? nullptr : snap_name.c_str(),
+                     vm[at::WHOLE_OBJECT].as<bool>(), path.c_str(),
+                     vm[at::NO_PROGRESS].as<bool>());
+  if (r < 0) {
+    std::cerr << "rbd: export-diff error: " << cpp_strerror(r) << std::endl;
+    return r;
+  }
+  return 0;
+}
+
+Shell::SwitchArguments switched_arguments({at::WHOLE_OBJECT});
+Shell::Action action(
+  {"export-diff"}, {}, "Export incremental diff to file.", "",
+  &get_arguments, &execute);
+
+} // namespace export_diff
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Feature.cc b/src/tools/rbd/action/Feature.cc
new file mode 100644
index 0000000..4bd61a6
--- /dev/null
+++ b/src/tools/rbd/action/Feature.cc
@@ -0,0 +1,86 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "common/errno.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace feature {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+void get_arguments(po::options_description *positional,
+                   po::options_description *options) {
+  at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+  positional->add_options()
+    ("features", po::value<at::ImageFeatures>()->multitoken(),
+     ("image features\n" + at::get_short_features_help(false)).c_str());
+}
+
+int execute(const po::variables_map &vm, bool enabled) {
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string image_name;
+  std::string snap_name;
+  int r = utils::get_pool_image_snapshot_names(
+    vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &image_name,
+    &snap_name, utils::SNAPSHOT_PRESENCE_NONE);
+  if (r < 0) {
+    return r;
+  }
+
+  const std::vector<std::string> &args = vm[at::POSITIONAL_ARGUMENTS]
+    .as<std::vector<std::string> >();
+  std::vector<std::string> feature_names(args.begin() + 1, args.end());
+  if (feature_names.empty()) {
+    std::cerr << "rbd: at least one feature name must be specified"
+              << std::endl;
+    return -EINVAL;
+  }
+
+  boost::any features_any(static_cast<uint64_t>(0));
+  at::ImageFeatures image_features;
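+  // at::validate() parses the feature names and stores the combined
+  // feature bitmask back into features_any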
+  at::validate(features_any, feature_names, &image_features, 0);
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  librbd::Image image;
+  r = utils::init_and_open_image(pool_name, image_name, "", false,
+                                 &rados, &io_ctx, &image);
+  if (r < 0) {
+    return r;
+  }
+
+  r = image.update_features(boost::any_cast<uint64_t>(features_any), enabled);
+  if (r < 0) {
+    std::cerr << "rbd: failed to update image features: " << cpp_strerror(r)
+              << std::endl;
+    return r;
+  }
+  return 0;
+}
+
+int execute_disable(const po::variables_map &vm) {
+  return execute(vm, false);
+}
+
+int execute_enable(const po::variables_map &vm) {
+  return execute(vm, true);
+}
+
+Shell::Action action_disable(
+  {"feature", "disable"}, {}, "Disable the specified image feature.", "",
+  &get_arguments, &execute_disable);
+Shell::Action action_enable(
+  {"feature", "enable"}, {}, "Enable the specified image feature.", "",
+  &get_arguments, &execute_enable);
+
+} // namespace feature
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Flatten.cc b/src/tools/rbd/action/Flatten.cc
new file mode 100644
index 0000000..5122543
--- /dev/null
+++ b/src/tools/rbd/action/Flatten.cc
@@ -0,0 +1,71 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "common/errno.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace flatten {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+static int do_flatten(librbd::Image& image, bool no_progress)
+{
+  utils::ProgressContext pc("Image flatten", no_progress);
+  int r = image.flatten_with_progress(pc);
+  if (r < 0) {
+    pc.fail();
+    return r;
+  }
+  pc.finish();
+  return 0;
+}
+
+void get_arguments(po::options_description *positional,
+                   po::options_description *options) {
+  at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+  at::add_no_progress_option(options);
+}
+
+int execute(const po::variables_map &vm) {
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string image_name;
+  std::string snap_name;
+  int r = utils::get_pool_image_snapshot_names(
+    vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &image_name,
+    &snap_name, utils::SNAPSHOT_PRESENCE_NONE);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  librbd::Image image;
+  r = utils::init_and_open_image(pool_name, image_name, "", false,
+                                 &rados, &io_ctx, &image);
+  if (r < 0) {
+    return r;
+  }
+
+  r = do_flatten(image, vm[at::NO_PROGRESS].as<bool>());
+  if (r < 0) {
+    std::cerr << "rbd: flatten error: " << cpp_strerror(r) << std::endl;
+    return r;
+  }
+  return 0;
+}
+
+Shell::Action action(
+  {"flatten"}, {}, "Fill clone with parent data (make it independent).", "",
+  &get_arguments, &execute);
+
+} // namespace flatten
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/ImageMeta.cc b/src/tools/rbd/action/ImageMeta.cc
new file mode 100644
index 0000000..2cf1a25
--- /dev/null
+++ b/src/tools/rbd/action/ImageMeta.cc
@@ -0,0 +1,313 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include "common/TextTable.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace image_meta {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+namespace {
+
+void add_key_option(po::options_description *positional) {
+  positional->add_options()
+    ("key", "image meta key");
+}
+
+int get_key(const po::variables_map &vm, std::string *key) {
+  *key = utils::get_positional_argument(vm, 1);
+  if (key->empty()) {
+    std::cerr << "rbd: metadata key was not specified" << std::endl;
+    return -EINVAL;
+  }
+  return 0;
+}
+
+} // anonymous namespace
+
+static int do_metadata_list(librbd::Image& image, Formatter *f)
+{
+  std::map<std::string, bufferlist> pairs;
+  int r;
+  TextTable tbl;
+
+  r = image.metadata_list("", 0, &pairs);
+  if (r < 0) {
+    std::cerr << "failed to list metadata of image : " << cpp_strerror(r)
+              << std::endl;
+    return r;
+  }
+
+  if (f) {
+    f->open_object_section("metadatas");
+  } else {
+    tbl.define_column("Key", TextTable::LEFT, TextTable::LEFT);
+    tbl.define_column("Value", TextTable::LEFT, TextTable::LEFT);
+  }
+
+  if (!pairs.empty()) {
+    bool one = (pairs.size() == 1);
+
+    if (!f) {
+      std::cout << "There " << (one ? "is " : "are ") << pairs.size()
+           << " metadata" << (one ? "" : "s") << " on this image.\n";
+    }
+
+    for (std::map<std::string, bufferlist>::iterator it = pairs.begin();
+         it != pairs.end(); ++it) {
+      std::string val(it->second.c_str(), it->second.length());
+      if (f) {
+        f->dump_string(it->first.c_str(), val.c_str());
+      } else {
+        tbl << it->first << val.c_str() << TextTable::endrow;
+      }
+    }
+    if (!f)
+      std::cout << tbl;
+  }
+
+  if (f) {
+    f->close_section();
+    f->flush(std::cout);
+  }
+  return 0;
+}
+
+static int do_metadata_set(librbd::Image& image, const char *key,
+                           const char *value)
+{
+  int r = image.metadata_set(key, value);
+  if (r < 0) {
+    std::cerr << "failed to set metadata " << key << " of image : "
+              << cpp_strerror(r) << std::endl;
+  }
+  return r;
+}
+
+static int do_metadata_remove(librbd::Image& image, const char *key)
+{
+  int r = image.metadata_remove(key);
+  if (r < 0) {
+    std::cerr << "failed to remove metadata " << key << " of image : "
+              << cpp_strerror(r) << std::endl;
+  }
+  return r;
+}
+
+static int do_metadata_get(librbd::Image& image, const char *key)
+{
+  std::string s;
+  int r = image.metadata_get(key, &s);
+  if (r < 0) {
+    std::cerr << "failed to get metadata " << key << " of image : "
+              << cpp_strerror(r) << std::endl;
+    return r;
+  }
+  std::cout << s << std::endl;
+  return r;
+}
+
+void get_list_arguments(po::options_description *positional,
+                        po::options_description *options) {
+  at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+  at::add_format_options(options);
+}
+
+int execute_list(const po::variables_map &vm) {
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string image_name;
+  std::string snap_name;
+  int r = utils::get_pool_image_snapshot_names(
+    vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &image_name,
+    &snap_name, utils::SNAPSHOT_PRESENCE_NONE);
+  if (r < 0) {
+    return r;
+  }
+
+  at::Format::Formatter formatter;
+  r = utils::get_formatter(vm, &formatter);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  librbd::Image image;
+  r = utils::init_and_open_image(pool_name, image_name, "", false,
+                                 &rados, &io_ctx, &image);
+  if (r < 0) {
+    return r;
+  }
+
+  r = do_metadata_list(image, formatter.get());
+  if (r < 0) {
+    std::cerr << "rbd: listing metadata failed: " << cpp_strerror(r)
+              << std::endl;
+    return r;
+  }
+  return 0;
+}
+
+void get_get_arguments(po::options_description *positional,
+                       po::options_description *options) {
+  at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+  add_key_option(positional);
+}
+
+int execute_get(const po::variables_map &vm) {
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string image_name;
+  std::string snap_name;
+  int r = utils::get_pool_image_snapshot_names(
+    vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &image_name,
+    &snap_name, utils::SNAPSHOT_PRESENCE_NONE);
+  if (r < 0) {
+    return r;
+  }
+
+  std::string key;
+  r = get_key(vm, &key);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  librbd::Image image;
+  r = utils::init_and_open_image(pool_name, image_name, "", false,
+                                 &rados, &io_ctx, &image);
+  if (r < 0) {
+    return r;
+  }
+
+  r = do_metadata_get(image, key.c_str());
+  if (r < 0) {
+    std::cerr << "rbd: getting metadata failed: " << cpp_strerror(r)
+              << std::endl;
+    return r;
+  }
+  return 0;
+}
+
+void get_set_arguments(po::options_description *positional,
+                       po::options_description *options) {
+  at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+  add_key_option(positional);
+  positional->add_options()
+    ("value", "image meta value");
+}
+
+int execute_set(const po::variables_map &vm) {
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string image_name;
+  std::string snap_name;
+  int r = utils::get_pool_image_snapshot_names(
+    vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &image_name,
+    &snap_name, utils::SNAPSHOT_PRESENCE_NONE);
+  if (r < 0) {
+    return r;
+  }
+
+  std::string key;
+  r = get_key(vm, &key);
+  if (r < 0) {
+    return r;
+  }
+
+  std::string value = utils::get_positional_argument(vm, 2);
+  if (value.empty()) {
+    std::cerr << "rbd: metadata value was not specified" << std::endl;
+    return -EINVAL;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  librbd::Image image;
+  r = utils::init_and_open_image(pool_name, image_name, "", false,
+                                 &rados, &io_ctx, &image);
+  if (r < 0) {
+    return r;
+  }
+
+  r = do_metadata_set(image, key.c_str(), value.c_str());
+  if (r < 0) {
+    std::cerr << "rbd: setting metadata failed: " << cpp_strerror(r)
+              << std::endl;
+    return r;
+  }
+  return 0;
+}
+
+void get_remove_arguments(po::options_description *positional,
+                          po::options_description *options) {
+  at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+  add_key_option(positional);
+}
+
+int execute_remove(const po::variables_map &vm) {
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string image_name;
+  std::string snap_name;
+  int r = utils::get_pool_image_snapshot_names(
+    vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &image_name,
+    &snap_name, utils::SNAPSHOT_PRESENCE_NONE);
+  if (r < 0) {
+    return r;
+  }
+
+  std::string key;
+  r = get_key(vm, &key);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  librbd::Image image;
+  r = utils::init_and_open_image(pool_name, image_name, "", false,
+                                 &rados, &io_ctx, &image);
+  if (r < 0) {
+    return r;
+  }
+
+  r = do_metadata_remove(image, key.c_str());
+  if (r < 0) {
+    std::cerr << "rbd: removing metadata failed: " << cpp_strerror(r)
+              << std::endl;
+    return r;
+  }
+  return 0;
+}
+
+Shell::Action action_list(
+  {"image-meta", "list"}, {}, "List image metadata key/value pairs.", "",
+  &get_list_arguments, &execute_list);
+Shell::Action action_get(
+  {"image-meta", "get"}, {},
+  "Get the value associated with a metadata key.", "",
+  &get_get_arguments, &execute_get);
+Shell::Action action_set(
+  {"image-meta", "set"}, {}, "Set an image metadata key to a value.", "",
+  &get_set_arguments, &execute_set);
+Shell::Action action_remove(
+  {"image-meta", "remove"}, {},
+  "Remove an image metadata key and its value.", "",
+  &get_remove_arguments, &execute_remove);
+
+} // namespace image_meta
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Import.cc b/src/tools/rbd/action/Import.cc
new file mode 100644
index 0000000..bb7cb7d
--- /dev/null
+++ b/src/tools/rbd/action/Import.cc
@@ -0,0 +1,319 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "include/Context.h"
+#include "common/blkdev.h"
+#include "common/errno.h"
+#include "common/Throttle.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+#include <boost/scoped_ptr.hpp>
+
+namespace rbd {
+namespace action {
+namespace import {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+class C_Import : public Context {
+public:
+  C_Import(SimpleThrottle &simple_throttle, librbd::Image &image,
+           bufferlist &bl, uint64_t offset)
+    : m_throttle(simple_throttle), m_image(image),
+      m_aio_completion(
+        new librbd::RBD::AioCompletion(this, &utils::aio_context_callback)),
+      m_bufferlist(bl), m_offset(offset)
+  {
+  }
+
+  void send()
+  {
+    m_throttle.start_op();
+
+    int op_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL |
+                   LIBRADOS_OP_FLAG_FADVISE_NOCACHE;
+    int r = m_image.aio_write2(m_offset, m_bufferlist.length(), m_bufferlist,
+                               m_aio_completion, op_flags);
+    if (r < 0) {
+      std::cerr << "rbd: error requesting write to destination image"
+                << std::endl;
+      m_aio_completion->release();
+      m_throttle.end_op(r);
+    }
+  }
+
+  virtual void finish(int r)
+  {
+    if (r < 0) {
+      std::cerr << "rbd: error writing to destination image at offset "
+                << m_offset << ": " << cpp_strerror(r) << std::endl;
+    }
+    m_throttle.end_op(r);
+  }
+
+private:
+  SimpleThrottle &m_throttle;
+  librbd::Image &m_image;
+  librbd::RBD::AioCompletion *m_aio_completion;
+  bufferlist m_bufferlist;
+  uint64_t m_offset;
+};
+
+static int do_import(librbd::RBD &rbd, librados::IoCtx& io_ctx,
+                     const char *imgname, int *order, const char *path,
+                     int format, uint64_t features,
+                     uint64_t stripe_unit, uint64_t stripe_count,
+                     bool no_progress)
+{
+  int fd, r;
+  struct stat stat_buf;
+  utils::ProgressContext pc("Importing image", no_progress);
+
+  assert(imgname);
+
+  // default order as usual
+  if (*order == 0)
+    *order = 22;
+
+  // try to fill whole imgblklen blocks for sparsification
+  uint64_t image_pos = 0;
+  size_t imgblklen = 1 << *order;
+  char *p = new char[imgblklen];
+  size_t reqlen = imgblklen;    // amount requested from read
+  ssize_t readlen;              // amount received from one read
+  size_t blklen = 0;            // amount accumulated from reads to fill blk
+  librbd::Image image;
+  uint64_t size = 0;
+
+  boost::scoped_ptr<SimpleThrottle> throttle;
+  bool from_stdin = !strcmp(path, "-");
+  if (from_stdin) {
+    throttle.reset(new SimpleThrottle(1, false));
+    fd = 0;
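+    // stdin has no known length; start at one object and double the
+    // image size as needed while reading (see the resize below)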
+    size = 1ULL << *order;
+  } else {
+    throttle.reset(new SimpleThrottle(
+      std::max(g_conf->rbd_concurrent_management_ops, 1), false));
+    if ((fd = open(path, O_RDONLY)) < 0) {
+      r = -errno;
+      std::cerr << "rbd: error opening " << path << std::endl;
+      goto done2;
+    }
+
+    if ((fstat(fd, &stat_buf)) < 0) {
+      r = -errno;
+      std::cerr << "rbd: stat error " << path << std::endl;
+      goto done;
+    }
+    if (S_ISDIR(stat_buf.st_mode)) {
+      r = -EISDIR;
+      std::cerr << "rbd: cannot import a directory" << std::endl;
+      goto done;
+    }
+    if (stat_buf.st_size)
+      size = (uint64_t)stat_buf.st_size;
+
+    if (!size) {
+      int64_t bdev_size = 0;
+      r = get_block_device_size(fd, &bdev_size);
+      if (r < 0) {
+        std::cerr << "rbd: unable to get size of file/block device"
+                  << std::endl;
+        goto done;
+      }
+      assert(bdev_size >= 0);
+      size = (uint64_t) bdev_size;
+    }
+
+    posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL);
+  }
+
+  if (format == 1) {
+    // weird striping not allowed with format 1!
+    if ((stripe_unit || stripe_count) &&
+        (stripe_unit != (1ull << *order) && stripe_count != 1)) {
+      std::cerr << "non-default striping not allowed with format 1; "
+                << "use --image-format 2" << std::endl;
+      r = -EINVAL;
+      goto done;  // close the input fd and free the block buffer
+    }
+    r = rbd.create(io_ctx, imgname, size, order);
+  } else {
+    r = rbd.create3(io_ctx, imgname, size, features, order,
+                    stripe_unit, stripe_count);
+  }
+  if (r < 0) {
+    std::cerr << "rbd: image creation failed" << std::endl;
+    goto done;
+  }
+
+  r = rbd.open(io_ctx, image, imgname);
+  if (r < 0) {
+    std::cerr << "rbd: failed to open image" << std::endl;
+    goto done;
+  }
+
+  // loop body handles 0 return, as we may have a block to flush
+  while ((readlen = ::read(fd, p + blklen, reqlen)) >= 0) {
+    if (throttle->pending_error()) {
+      break;
+    }
+
+    blklen += readlen;
+    // if read was short, try again to fill the block before writing
+    if (readlen && ((size_t)readlen < reqlen)) {
+      reqlen -= readlen;
+      continue;
+    }
+    if (!from_stdin)
+      pc.update_progress(image_pos, size);
+
+    bufferlist bl(blklen);
+    bl.append(p, blklen);
+    // resize output image by binary expansion as we go for stdin
+    if (from_stdin && (image_pos + (size_t)blklen) > size) {
+      size *= 2;
+      r = image.resize(size);
+      if (r < 0) {
+        std::cerr << "rbd: can't resize image during import" << std::endl;
+        goto done;
+      }
+    }
+
+    // write as much as we got; perhaps less than imgblklen
+    // but skip writing zeros to create sparse images
+    if (!bl.is_zero()) {
+      C_Import *ctx = new C_Import(*throttle, image, bl, image_pos);
+      ctx->send();
+    }
+
+    // done with whole block, whether written or not
+    image_pos += blklen;
+    // if read had returned 0, we're at EOF and should quit
+    if (readlen == 0)
+      break;
+    blklen = 0;
+    reqlen = imgblklen;
+  }
+  r = throttle->wait_for_ret();
+  if (r < 0) {
+    goto done;
+  }
+
+  if (from_stdin) {
+    r = image.resize(image_pos);
+    if (r < 0) {
+      std::cerr << "rbd: final image resize failed" << std::endl;
+      goto done;
+    }
+  }
+
+  r = image.close();
+
+ done:
+  if (!from_stdin) {
+    if (r < 0)
+      pc.fail();
+    else
+      pc.finish();
+    close(fd);
+  }
+ done2:
+  delete[] p;
+  return r;
+}
+
+void get_arguments(po::options_description *positional,
+                   po::options_description *options) {
+  at::add_path_options(positional, options,
+                       "import file (or '-' for stdin)");
+  at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_DEST);
+  at::add_create_image_options(options, true);
+  at::add_no_progress_option(options);
+
+  // TODO legacy rbd allowed import to accept both 'image'/'dest' and
+  //      'pool'/'dest-pool'
+  at::add_pool_option(options, at::ARGUMENT_MODIFIER_NONE, " (deprecated)");
+  at::add_image_option(options, at::ARGUMENT_MODIFIER_NONE, " (deprecated)");
+}
+
+int execute(const po::variables_map &vm) {
+  std::string path;
+  int r = utils::get_path(vm, utils::get_positional_argument(vm, 0), &path);
+  if (r < 0) {
+    return r;
+  }
+
+  // odd check to support legacy / deprecated behavior of import
+  std::string deprecated_pool_name;
+  if (vm.count(at::POOL_NAME)) {
+    deprecated_pool_name = vm[at::POOL_NAME].as<std::string>();
+    std::cerr << "rbd: --pool is deprecated for import, use --dest-pool"
+              << std::endl;
+  }
+
+  std::string deprecated_image_name;
+  if (vm.count(at::IMAGE_NAME)) {
+    utils::extract_spec(vm[at::IMAGE_NAME].as<std::string>(),
+                        &deprecated_pool_name, &deprecated_image_name, nullptr);
+    std::cerr << "rbd: --image is deprecated for import, use --dest"
+              << std::endl;
+  } else {
+    deprecated_image_name = path.substr(path.find_last_of("/") + 1);
+  }
+
+  size_t arg_index = 1;
+  std::string pool_name = deprecated_pool_name;
+  std::string image_name;
+  std::string snap_name;
+  r = utils::get_pool_image_snapshot_names(
+    vm, at::ARGUMENT_MODIFIER_DEST, &arg_index, &pool_name, &image_name,
+    &snap_name, utils::SNAPSHOT_PRESENCE_NONE, false);
+  if (r < 0) {
+    return r;
+  }
+
+  if (image_name.empty()) {
+    image_name = deprecated_image_name;
+  }
+
+  int order;
+  uint32_t format;
+  uint64_t features;
+  uint32_t stripe_unit;
+  uint32_t stripe_count;
+  r = utils::get_image_options(vm, &order, &format, &features, &stripe_unit,
+                               &stripe_count);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  r = utils::init(pool_name, &rados, &io_ctx);
+  if (r < 0) {
+    return r;
+  }
+
+  librbd::RBD rbd;
+  r = do_import(rbd, io_ctx, image_name.c_str(), &order, path.c_str(),
+                format, features, stripe_unit, stripe_count,
+                vm[at::NO_PROGRESS].as<bool>());
+  if (r < 0) {
+    std::cerr << "rbd: import failed: " << cpp_strerror(r) << std::endl;
+    return r;
+  }
+
+  return 0;
+}
+
+Shell::Action action(
+  {"import"}, {}, "Import image from file.", at::get_long_features_help(),
+  &get_arguments, &execute);
+
+} // namespace import
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/ImportDiff.cc b/src/tools/rbd/action/ImportDiff.cc
new file mode 100644
index 0000000..9f600f8
--- /dev/null
+++ b/src/tools/rbd/action/ImportDiff.cc
@@ -0,0 +1,223 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "include/encoding.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/safe_io.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+
+namespace rbd {
+namespace action {
+namespace import_diff {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+static int do_import_diff(librbd::Image &image, const char *path,
+                          bool no_progress)
+{
+  int fd, r;
+  struct stat stat_buf;
+  utils::ProgressContext pc("Importing image diff", no_progress);
+  uint64_t size = 0;
+  uint64_t off = 0;
+  std::string from, to;
+  char buf[utils::RBD_DIFF_BANNER.size() + 1];
+
+  bool from_stdin = !strcmp(path, "-");
+  if (from_stdin) {
+    fd = 0;
+  } else {
+    fd = open(path, O_RDONLY);
+    if (fd < 0) {
+      r = -errno;
+      std::cerr << "rbd: error opening " << path << std::endl;
+      return r;
+    }
+    r = ::fstat(fd, &stat_buf);
+    if (r < 0)
+      goto done;
+    size = (uint64_t)stat_buf.st_size;
+  }
+
+  r = safe_read_exact(fd, buf, utils::RBD_DIFF_BANNER.size());
+  if (r < 0)
+    goto done;
+  buf[utils::RBD_DIFF_BANNER.size()] = '\0';
+  if (strcmp(buf, utils::RBD_DIFF_BANNER.c_str())) {
+    std::cerr << "invalid banner '" << buf << "', expected '"
+              << utils::RBD_DIFF_BANNER << "'" << std::endl;
+    r = -EINVAL;
+    goto done;
+  }
+
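+  // the body after the banner is a sequence of one-byte tags, each
+  // followed by a tag-specific payload (written by export-diff)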
+  while (true) {
+    __u8 tag;
+    r = safe_read_exact(fd, &tag, 1);
+    if (r < 0) {
+      goto done;
+    }
+
+    if (tag == 'e') {
+      dout(2) << " end diff" << dendl;
+      break;
+    } else if (tag == 'f') {
+      r = utils::read_string(fd, 4096, &from);   // 4k limit to make sure we don't get a garbage string
+      if (r < 0)
+        goto done;
+      dout(2) << " from snap " << from << dendl;
+
+      if (!image.snap_exists(from.c_str())) {
+        std::cerr << "start snapshot '" << from
+                  << "' does not exist in the image, aborting" << std::endl;
+        r = -EINVAL;
+        goto done;
+      }
+    }
+    else if (tag == 't') {
+      r = utils::read_string(fd, 4096, &to);   // 4k limit to make sure we don't get a garbage string
+      if (r < 0)
+        goto done;
+      dout(2) << "   to snap " << to << dendl;
+
+      // verify this snap isn't already present
+      if (image.snap_exists(to.c_str())) {
+        std::cerr << "end snapshot '" << to
+                  << "' already exists, aborting" << std::endl;
+        r = -EEXIST;
+        goto done;
+      }
+    } else if (tag == 's') {
+      uint64_t end_size;
+      char buf[8];
+      r = safe_read_exact(fd, buf, 8);
+      if (r < 0)
+        goto done;
+      bufferlist bl;
+      bl.append(buf, 8);
+      bufferlist::iterator p = bl.begin();
+      ::decode(end_size, p);
+      uint64_t cur_size;
+      image.size(&cur_size);
+      if (cur_size != end_size) {
+        dout(2) << "resize " << cur_size << " -> " << end_size << dendl;
+        image.resize(end_size);
+      } else {
+        dout(2) << "size " << end_size << " (no change)" << dendl;
+      }
+      if (from_stdin)
+        size = end_size;
+    } else if (tag == 'w' || tag == 'z') {
+      uint64_t len;
+      char buf[16];
+      r = safe_read_exact(fd, buf, 16);
+      if (r < 0)
+        goto done;
+      bufferlist bl;
+      bl.append(buf, 16);
+      bufferlist::iterator p = bl.begin();
+      ::decode(off, p);
+      ::decode(len, p);
+
+      if (tag == 'w') {
+        bufferptr bp = buffer::create(len);
+        r = safe_read_exact(fd, bp.c_str(), len);
+        if (r < 0)
+          goto done;
+        bufferlist data;
+        data.append(bp);
+        dout(2) << " write " << off << "~" << len << dendl;
+        image.write2(off, len, data, LIBRADOS_OP_FLAG_FADVISE_NOCACHE);
+      } else {
+        dout(2) << " zero " << off << "~" << len << dendl;
+        image.discard(off, len);
+      }
+    } else {
+      std::cerr << "unrecognized tag byte " << (int)tag
+                << " in stream; aborting" << std::endl;
+      r = -EINVAL;
+      goto done;
+    }
+    if (!from_stdin) {
+      // progress through input
+      uint64_t off = lseek64(fd, 0, SEEK_CUR);
+      pc.update_progress(off, size);
+    } else if (size) {
+      // progress through image offsets.  this may jitter if blocks
+      // aren't in order, but it is better than nothing.
+      pc.update_progress(off, size);
+    }
+  }
+  // take final snap
+  if (to.length()) {
+    dout(2) << " create end snap " << to << dendl;
+    r = image.snap_create(to.c_str());
+  }
+
+ done:
+  if (r < 0)
+    pc.fail();
+  else
+    pc.finish();
+  if (!from_stdin)
+    close(fd);
+  return r;
+}
+
+void get_arguments(po::options_description *positional,
+                   po::options_description *options) {
+  at::add_path_options(positional, options,
+                       "import file (or '-' for stdin)");
+  at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+  at::add_no_progress_option(options);
+}
+
+int execute(const po::variables_map &vm) {
+  std::string path;
+  int r = utils::get_path(vm, utils::get_positional_argument(vm, 0), &path);
+  if (r < 0) {
+    return r;
+  }
+
+  size_t arg_index = 1;
+  std::string pool_name;
+  std::string image_name;
+  std::string snap_name;
+  r = utils::get_pool_image_snapshot_names(
+    vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &image_name,
+    &snap_name, utils::SNAPSHOT_PRESENCE_NONE);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  librbd::Image image;
+  r = utils::init_and_open_image(pool_name, image_name, "", false,
+                                 &rados, &io_ctx, &image);
+  if (r < 0) {
+    return r;
+  }
+
+  r = do_import_diff(image, path.c_str(), vm[at::NO_PROGRESS].as<bool>());
+  if (r < 0) {
+    cerr << "rbd: import-diff failed: " << cpp_strerror(r) << std::endl;
+    return r;
+  }
+  return 0;
+}
+
+Shell::Action action(
+  {"import-diff"}, {}, "Import an incremental diff.", "", &get_arguments,
+  &execute);
+
+} // namespace import_diff
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Info.cc b/src/tools/rbd/action/Info.cc
new file mode 100644
index 0000000..76e3940
--- /dev/null
+++ b/src/tools/rbd/action/Info.cc
@@ -0,0 +1,232 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "include/types.h"
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace info {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+static void format_bitmask(Formatter *f, const std::string &name,
+                           const std::map<uint64_t, std::string>& mapping,
+                           uint64_t bitmask)
+{
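+  // print every mapping entry whose bit is set in bitmask, either as a
+  // comma-separated list or as a formatter array section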
+  int count = 0;
+  std::string group_name(name + "s");
+  if (f == NULL) {
+    std::cout << "\t" << group_name << ": ";
+  } else {
+    f->open_array_section(group_name.c_str());
+  }
+  for (std::map<uint64_t, std::string>::const_iterator it = mapping.begin();
+       it != mapping.end(); ++it) {
+    if ((it->first & bitmask) == 0) {
+      continue;
+    }
+
+    if (f == NULL) {
+      if (count++ > 0) {
+        std::cout << ", ";
+      }
+      std::cout << it->second;
+    } else {
+      f->dump_string(name.c_str(), it->second);
+    }
+  }
+  if (f == NULL) {
+    std::cout << std::endl;
+  } else {
+    f->close_section();
+  }
+}
+
+static void format_features(Formatter *f, uint64_t features)
+{
+  format_bitmask(f, "feature", at::ImageFeatures::FEATURE_MAPPING, features);
+}
+
+static void format_flags(Formatter *f, uint64_t flags)
+{
+  std::map<uint64_t, std::string> mapping = {
+    {RBD_FLAG_OBJECT_MAP_INVALID, "object map invalid"},
+    {RBD_FLAG_FAST_DIFF_INVALID, "fast diff invalid"}};
+  format_bitmask(f, "flag", mapping, flags);
+}
+
+static int do_show_info(const char *imgname, librbd::Image& image,
+                        const char *snapname, Formatter *f)
+{
+  librbd::image_info_t info;
+  std::string parent_pool, parent_name, parent_snapname;
+  uint8_t old_format;
+  uint64_t overlap, features, flags;
+  bool snap_protected = false;
+  int r;
+
+  r = image.stat(info, sizeof(info));
+  if (r < 0)
+    return r;
+
+  r = image.old_format(&old_format);
+  if (r < 0)
+    return r;
+
+  r = image.overlap(&overlap);
+  if (r < 0)
+    return r;
+
+  r = image.features(&features);
+  if (r < 0)
+    return r;
+
+  r = image.get_flags(&flags);
+  if (r < 0) {
+    return r;
+  }
+
+  if (snapname) {
+    r = image.snap_is_protected(snapname, &snap_protected);
+    if (r < 0)
+      return r;
+  }
+
+  char prefix[RBD_MAX_BLOCK_NAME_SIZE + 1];
+  strncpy(prefix, info.block_name_prefix, RBD_MAX_BLOCK_NAME_SIZE);
+  prefix[RBD_MAX_BLOCK_NAME_SIZE] = '\0';
+
+  if (f) {
+    f->open_object_section("image");
+    f->dump_string("name", imgname);
+    f->dump_unsigned("size", info.size);
+    f->dump_unsigned("objects", info.num_objs);
+    f->dump_int("order", info.order);
+    f->dump_unsigned("object_size", info.obj_size);
+    f->dump_string("block_name_prefix", prefix);
+    f->dump_int("format", (old_format ? 1 : 2));
+  } else {
+    std::cout << "rbd image '" << imgname << "':\n"
+              << "\tsize " << prettybyte_t(info.size) << " in "
+              << info.num_objs << " objects"
+              << std::endl
+              << "\torder " << info.order
+              << " (" << prettybyte_t(info.obj_size) << " objects)"
+              << std::endl
+              << "\tblock_name_prefix: " << prefix
+              << std::endl
+              << "\tformat: " << (old_format ? "1" : "2")
+              << std::endl;
+  }
+
+  if (!old_format) {
+    format_features(f, features);
+    format_flags(f, flags);
+  }
+
+  // snapshot info, if present
+  if (snapname) {
+    if (f) {
+      f->dump_string("protected", snap_protected ? "true" : "false");
+    } else {
+      std::cout << "\tprotected: " << (snap_protected ? "True" : "False")
+                << std::endl;
+    }
+  }
+
+  // parent info, if present
+  if ((image.parent_info(&parent_pool, &parent_name, &parent_snapname) == 0) &&
+      parent_name.length() > 0) {
+    if (f) {
+      f->open_object_section("parent");
+      f->dump_string("pool", parent_pool);
+      f->dump_string("image", parent_name);
+      f->dump_string("snapshot", parent_snapname);
+      f->dump_unsigned("overlap", overlap);
+      f->close_section();
+    } else {
+      std::cout << "\tparent: " << parent_pool << "/" << parent_name
+                << "@" << parent_snapname << std::endl;
+      std::cout << "\toverlap: " << prettybyte_t(overlap) << std::endl;
+    }
+  }
+
+  // striping info, if feature is set
+  if (features & RBD_FEATURE_STRIPINGV2) {
+    if (f) {
+      f->dump_unsigned("stripe_unit", image.get_stripe_unit());
+      f->dump_unsigned("stripe_count", image.get_stripe_count());
+    } else {
+      std::cout << "\tstripe unit: " << prettybyte_t(image.get_stripe_unit())
+                << std::endl
+                << "\tstripe count: " << image.get_stripe_count() << std::endl;
+    }
+  }
+
+  if (f) {
+    f->close_section();
+    f->flush(std::cout);
+  }
+
+  return 0;
+}
+
+void get_arguments(po::options_description *positional,
+                   po::options_description *options) {
+  at::add_image_or_snap_spec_options(positional, options,
+                                     at::ARGUMENT_MODIFIER_NONE);
+  at::add_format_options(options);
+}
+
+int execute(const po::variables_map &vm) {
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string image_name;
+  std::string snap_name;
+  int r = utils::get_pool_image_snapshot_names(
+    vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &image_name,
+    &snap_name, utils::SNAPSHOT_PRESENCE_PERMITTED);
+  if (r < 0) {
+    return r;
+  }
+
+  at::Format::Formatter formatter;
+  r = utils::get_formatter(vm, &formatter);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  librbd::Image image;
+  r = utils::init_and_open_image(pool_name, image_name, snap_name, true,
+                                 &rados, &io_ctx, &image);
+  if (r < 0) {
+    return r;
+  }
+
+  r = do_show_info(image_name.c_str(), image,
+                   snap_name.empty() ? nullptr : snap_name.c_str(),
+                   formatter.get());
+  if (r < 0) {
+    std::cerr << "rbd: info: " << cpp_strerror(r) << std::endl;
+    return r;
+  }
+  return 0;
+}
+
+Shell::Action action(
+  {"info"}, {}, "Show information about image size, striping, etc.", "",
+  &get_arguments, &execute);
+
+} // namespace info
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Kernel.cc b/src/tools/rbd/action/Kernel.cc
new file mode 100644
index 0000000..541da95
--- /dev/null
+++ b/src/tools/rbd/action/Kernel.cc
@@ -0,0 +1,360 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "include/krbd.h"
+#include "include/stringify.h"
+#include "include/uuid.h"
+#include "common/config.h"
+#include "common/errno.h"
+#include "common/strtol.h"
+#include "common/Formatter.h"
+#include "msg/msg_types.h"
+#include "global/global_context.h"
+#include <iostream>
+#include <boost/algorithm/string/predicate.hpp>
+#include <boost/scope_exit.hpp>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace kernel {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+namespace {
+
+std::map<std::string, std::string> map_options;
+
+} // anonymous namespace
+
+static std::string map_option_uuid_cb(const char *value_char)
+{
+  uuid_d u;
+  if (!u.parse(value_char))
+    return "";
+
+  return stringify(u);
+}
+
+static std::string map_option_ip_cb(const char *value_char)
+{
+  entity_addr_t a;
+  const char *endptr;
+  if (!a.parse(value_char, &endptr) ||
+      endptr != value_char + strlen(value_char)) {
+    return "";
+  }
+
+  return stringify(a.addr);
+}
+
+static std::string map_option_int_cb(const char *value_char)
+{
+  std::string err;
+  int d = strict_strtol(value_char, 10, &err);
+  if (!err.empty() || d < 0)
+    return "";
+
+  return stringify(d);
+}
+
+static void put_map_option(const std::string &key, const std::string &val)
+{
+  map_options[key] = val;
+}
+
+static int put_map_option_value(const std::string &opt, const char *value_char,
+                                std::string (*parse_cb)(const char *))
+{
+  if (!value_char || *value_char == '\0') {
+    std::cerr << "rbd: " << opt << " option requires a value" << std::endl;
+    return -EINVAL;
+  }
+
+  std::string value = parse_cb(value_char);
+  if (value.empty()) {
+    std::cerr << "rbd: invalid " << opt << " value '" << value_char << "'"
+              << std::endl;
+    return -EINVAL;
+  }
+
+  put_map_option(opt, opt + "=" + value);
+  return 0;
+}
+
+static int parse_map_options(char *options)
+{
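+  // options is a comma-separated key[=value] list; strtok() edits the
+  // string in place, so callers pass a strdup()'d copy (see execute_map)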
+  for (char *this_char = strtok(options, ", ");
+       this_char != NULL;
+       this_char = strtok(NULL, ", ")) {
+    char *value_char;
+
+    if ((value_char = strchr(this_char, '=')) != NULL)
+      *value_char++ = '\0';
+
+    if (!strcmp(this_char, "fsid")) {
+      if (put_map_option_value("fsid", value_char, map_option_uuid_cb))
+        return -EINVAL;
+    } else if (!strcmp(this_char, "ip")) {
+      if (put_map_option_value("ip", value_char, map_option_ip_cb))
+        return -EINVAL;
+    } else if (!strcmp(this_char, "share") || !strcmp(this_char, "noshare")) {
+      put_map_option("share", this_char);
+    } else if (!strcmp(this_char, "crc") || !strcmp(this_char, "nocrc")) {
+      put_map_option("crc", this_char);
+    } else if (!strcmp(this_char, "cephx_require_signatures") ||
+               !strcmp(this_char, "nocephx_require_signatures")) {
+      put_map_option("cephx_require_signatures", this_char);
+    } else if (!strcmp(this_char, "tcp_nodelay") ||
+               !strcmp(this_char, "notcp_nodelay")) {
+      put_map_option("tcp_nodelay", this_char);
+    } else if (!strcmp(this_char, "cephx_sign_messages") ||
+               !strcmp(this_char, "nocephx_sign_messages")) {
+      put_map_option("cephx_sign_messages", this_char);
+    } else if (!strcmp(this_char, "mount_timeout")) {
+      if (put_map_option_value("mount_timeout", value_char, map_option_int_cb))
+        return -EINVAL;
+    } else if (!strcmp(this_char, "osdkeepalive")) {
+      if (put_map_option_value("osdkeepalive", value_char, map_option_int_cb))
+        return -EINVAL;
+    } else if (!strcmp(this_char, "osd_idle_ttl")) {
+      if (put_map_option_value("osd_idle_ttl", value_char, map_option_int_cb))
+        return -EINVAL;
+    } else if (!strcmp(this_char, "rw") || !strcmp(this_char, "ro")) {
+      put_map_option("rw", this_char);
+    } else if (!strcmp(this_char, "queue_depth")) {
+      if (put_map_option_value("queue_depth", value_char, map_option_int_cb))
+        return -EINVAL;
+    } else {
+      std::cerr << "rbd: unknown map option '" << this_char << "'" << std::endl;
+      return -EINVAL;
+    }
+  }
+
+  return 0;
+}
+
+static int do_kernel_showmapped(Formatter *f)
+{
+  struct krbd_ctx *krbd;
+  int r;
+
+  r = krbd_create_from_context(g_ceph_context, &krbd);
+  if (r < 0)
+    return r;
+
+  r = krbd_showmapped(krbd, f);
+
+  krbd_destroy(krbd);
+  return r;
+}
+
+static int do_kernel_map(const char *poolname, const char *imgname,
+                         const char *snapname)
+{
+  struct krbd_ctx *krbd;
+  std::ostringstream oss;
+  char *devnode;
+  int r;
+
+  r = krbd_create_from_context(g_ceph_context, &krbd);
+  if (r < 0)
+    return r;
+
+  for (std::map<std::string, std::string>::iterator it = map_options.begin();
+       it != map_options.end(); ) {
+    // for compatibility with < 3.7 kernels, assume that rw is on by
+    // default and omit it even if it was specified by the user
+    // (see ceph.git commit fb0f1986449b)
+    if (it->first == "rw" && it->second == "rw") {
+      it = map_options.erase(it);  // erase() returns the next valid iterator
+    } else {
+      if (it != map_options.begin())
+        oss << ",";
+      oss << it->second;
+      ++it;
+    }
+  }
+
+  r = krbd_map(krbd, poolname, imgname, snapname, oss.str().c_str(), &devnode);
+  if (r < 0)
+    goto out;
+
+  std::cout << devnode << std::endl;
+
+  free(devnode);
+out:
+  krbd_destroy(krbd);
+  return r;
+}
+
+static int do_kernel_unmap(const char *dev, const char *poolname,
+                           const char *imgname, const char *snapname)
+{
+  struct krbd_ctx *krbd;
+  int r;
+
+  r = krbd_create_from_context(g_ceph_context, &krbd);
+  if (r < 0)
+    return r;
+
+  if (dev)
+    r = krbd_unmap(krbd, dev);
+  else
+    r = krbd_unmap_by_spec(krbd, poolname, imgname, snapname);
+
+  krbd_destroy(krbd);
+  return r;
+}
+
+void get_show_arguments(po::options_description *positional,
+                        po::options_description *options) {
+  at::add_format_options(options);
+}
+
+int execute_show(const po::variables_map &vm) {
+  at::Format::Formatter formatter;
+  int r = utils::get_formatter(vm, &formatter);
+  if (r < 0) {
+    return r;
+  }
+
+  utils::init_context();
+
+  r = do_kernel_showmapped(formatter.get());
+  if (r < 0) {
+    std::cerr << "rbd: showmapped failed: " << cpp_strerror(r) << std::endl;
+    return r;
+  }
+  return 0;
+}
+
+void get_map_arguments(po::options_description *positional,
+                       po::options_description *options) {
+  at::add_image_or_snap_spec_options(positional, options,
+                                     at::ARGUMENT_MODIFIER_NONE);
+  options->add_options()
+    ("options,o", po::value<std::string>(), "mapping options")
+    ("read-only", po::bool_switch(), "mount read-only");
+}
+
+int execute_map(const po::variables_map &vm) {
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string image_name;
+  std::string snap_name;
+  int r = utils::get_pool_image_snapshot_names(
+    vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &image_name,
+    &snap_name, utils::SNAPSHOT_PRESENCE_PERMITTED);
+  if (r < 0) {
+    return r;
+  }
+
+  if (vm["read-only"].as<bool>()) {
+    put_map_option("rw", "ro");
+  }
+
+  // parse default options first so they can be overwritten by cli options
+  char *default_map_options = strdup(g_conf->rbd_default_map_options.c_str());
+  BOOST_SCOPE_EXIT( (default_map_options) ) {
+    free(default_map_options);
+  } BOOST_SCOPE_EXIT_END;
+
+  if (parse_map_options(default_map_options)) {
+    std::cerr << "rbd: couldn't parse default map options" << std::endl;
+    return -EINVAL;
+  }
+
+  if (vm.count("options")) {
+    char *cli_map_options = strdup(vm["options"].as<std::string>().c_str());
+    BOOST_SCOPE_EXIT( (cli_map_options) ) {
+      free(cli_map_options);
+    } BOOST_SCOPE_EXIT_END;
+
+    if (parse_map_options(cli_map_options)) {
+      std::cerr << "rbd: couldn't parse map options" << std::endl;
+      return -EINVAL;
+    }
+  }
+
+  utils::init_context();
+
+  r = do_kernel_map(pool_name.c_str(), image_name.c_str(), snap_name.c_str());
+  if (r < 0) {
+    std::cerr << "rbd: map failed: " << cpp_strerror(r) << std::endl;
+    return r;
+  }
+
+  return 0;
+}
+
+void get_unmap_arguments(po::options_description *positional,
+                         po::options_description *options) {
+  positional->add_options()
+    ("image-or-snap-or-device-spec",
+     "image, snapshot, or device specification\n"
+     "[<pool-name>/]<image-name>[@<snapshot-name>] or <device-path>");
+  at::add_pool_option(options, at::ARGUMENT_MODIFIER_NONE);
+  at::add_image_option(options, at::ARGUMENT_MODIFIER_NONE);
+  at::add_snap_option(options, at::ARGUMENT_MODIFIER_NONE);
+}
+
+int execute_unmap(const po::variables_map &vm) {
+  std::string device_name = utils::get_positional_argument(vm, 0);
+  if (!boost::starts_with(device_name, "/dev/")) {
+    device_name.clear();
+  }
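+  // only paths under /dev/ are treated as device nodes: e.g. (hypothetical
+  // names) "rbd unmap /dev/rbd0" unmaps by device node, while
+  // "rbd unmap mypool/myimage" falls through to the spec-based lookup below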
+
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string image_name;
+  std::string snap_name;
+  int r;
+  if (device_name.empty()) {
+    r = utils::get_pool_image_snapshot_names(
+      vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &image_name,
+      &snap_name, utils::SNAPSHOT_PRESENCE_PERMITTED,
+      false);
+    if (r < 0) {
+      return r;
+    }
+  }
+
+  if (device_name.empty() && image_name.empty()) {
+    std::cerr << "rbd: unmap requires either image name or device path"
+              << std::endl;
+    return -EINVAL;
+  }
+
+  utils::init_context();
+
+  r = do_kernel_unmap(device_name.empty() ? nullptr : device_name.c_str(),
+                      pool_name.c_str(), image_name.c_str(),
+                      snap_name.empty() ? nullptr : snap_name.c_str());
+  if (r < 0) {
+    std::cerr << "rbd: unmap failed: " << cpp_strerror(r) << std::endl;
+    return r;
+  }
+  return 0;
+}
+
+Shell::SwitchArguments switched_arguments({"read-only"});
+Shell::Action action_show(
+  {"showmapped"}, {}, "Show the rbd images mapped by the kernel.", "",
+  &get_show_arguments, &execute_show);
+
+Shell::Action action_map(
+  {"map"}, {}, "Map image to a block device using the kernel.", "",
+  &get_map_arguments, &execute_map);
+
+Shell::Action action_unmap(
+  {"unmap"}, {}, "Unmap a rbd device that was used by the kernel.", "",
+  &get_unmap_arguments, &execute_unmap);
+
+} // namespace kernel
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/List.cc b/src/tools/rbd/action/List.cc
new file mode 100644
index 0000000..6b2041a
--- /dev/null
+++ b/src/tools/rbd/action/List.cc
@@ -0,0 +1,235 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "include/stringify.h"
+#include "include/types.h"
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include "common/TextTable.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace list {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+int do_list(librbd::RBD &rbd, librados::IoCtx& io_ctx, bool lflag,
+            Formatter *f) {
+  std::vector<std::string> names;
+  int r = rbd.list(io_ctx, names);
+  if (r == -ENOENT)
+    r = 0;
+  if (r < 0)
+    return r;
+
+  if (!lflag) {
+    if (f)
+      f->open_array_section("images");
+    for (std::vector<std::string>::const_iterator i = names.begin();
+         i != names.end(); ++i) {
+      if (f)
+        f->dump_string("name", *i);
+      else
+        std::cout << *i << std::endl;
+    }
+    if (f) {
+      f->close_section();
+      f->flush(std::cout);
+    }
+    return 0;
+  }
+
+  TextTable tbl;
+
+  if (f) {
+    f->open_array_section("images");
+  } else {
+    tbl.define_column("NAME", TextTable::LEFT, TextTable::LEFT);
+    tbl.define_column("SIZE", TextTable::RIGHT, TextTable::RIGHT);
+    tbl.define_column("PARENT", TextTable::LEFT, TextTable::LEFT);
+    tbl.define_column("FMT", TextTable::RIGHT, TextTable::RIGHT);
+    tbl.define_column("PROT", TextTable::LEFT, TextTable::LEFT);
+    tbl.define_column("LOCK", TextTable::LEFT, TextTable::LEFT);
+  }
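+  // illustrative long-format row (names and sizes are made up):
+  //   NAME  SIZE PARENT           FMT PROT LOCK
+  //   img1  100M pool1/base@snap1   2      excl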
+
+  std::string pool, image, snap, parent;
+
+  for (std::vector<std::string>::const_iterator i = names.begin();
+       i != names.end(); ++i) {
+    librbd::image_info_t info;
+    librbd::Image im;
+
+    r = rbd.open_read_only(io_ctx, im, i->c_str(), NULL);
+    // image might disappear between rbd.list() and rbd.open(); ignore
+    // that, warn about other possible errors (EPERM, say, for opening
+    // an old-format image, because you need execute permission for the
+    // class method)
+    if (r < 0) {
+      if (r != -ENOENT) {
+        std::cerr << "rbd: error opening " << *i << ": " << cpp_strerror(r)
+                  << std::endl;
+      }
+      // in any event, continue to next image
+      continue;
+    }
+
+    // reset per-image state that may be left over from the previous
+    // iteration
+    parent.clear();
+    r = im.parent_info(&pool, &image, &snap);
+    if (r < 0 && r != -ENOENT)
+      return r;
+    bool has_parent = false;
+    if (r != -ENOENT) {
+      parent = pool + "/" + image + "@" + snap;
+      has_parent = true;
+    }
+
+    if (im.stat(info, sizeof(info)) < 0)
+      return -EINVAL;
+
+    uint8_t old_format;
+    im.old_format(&old_format);
+
+    std::list<librbd::locker_t> lockers;
+    bool exclusive;
+    r = im.list_lockers(&lockers, &exclusive, NULL);
+    if (r < 0)
+      return r;
+    std::string lockstr;
+    if (!lockers.empty()) {
+      lockstr = (exclusive) ? "excl" : "shr";
+    }
+
+    if (f) {
+      f->open_object_section("image");
+      f->dump_string("image", *i);
+      f->dump_unsigned("size", info.size);
+      if (has_parent) {
+        f->open_object_section("parent");
+        f->dump_string("pool", pool);
+        f->dump_string("image", image);
+        f->dump_string("snapshot", snap);
+        f->close_section();
+      }
+      f->dump_int("format", old_format ? 1 : 2);
+      if (!lockers.empty())
+        f->dump_string("lock_type", exclusive ? "exclusive" : "shared");
+      f->close_section();
+    } else {
+      tbl << *i
+          << stringify(si_t(info.size))
+          << parent
+          << ((old_format) ? '1' : '2')
+          << ""                         // protect doesn't apply to images
+          << lockstr
+          << TextTable::endrow;
+    }
+
+    std::vector<librbd::snap_info_t> snaplist;
+    if (im.snap_list(snaplist) >= 0 && !snaplist.empty()) {
+      for (std::vector<librbd::snap_info_t>::iterator s = snaplist.begin();
+           s != snaplist.end(); ++s) {
+        bool is_protected;
+        bool has_parent = false;
+        parent.clear();
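+        // point the image at this snapshot so the parent_info() call
+        // below reports the snapshot's parent rather than the head's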
+        im.snap_set(s->name.c_str());
+        r = im.snap_is_protected(s->name.c_str(), &is_protected);
+        if (r < 0)
+          return r;
+        if (im.parent_info(&pool, &image, &snap) >= 0) {
+          parent = pool + "/" + image + "@" + snap;
+          has_parent = true;
+        }
+        if (f) {
+          f->open_object_section("snapshot");
+          f->dump_string("image", *i);
+          f->dump_string("snapshot", s->name);
+          f->dump_unsigned("size", s->size);
+          if (has_parent) {
+            f->open_object_section("parent");
+            f->dump_string("pool", pool);
+            f->dump_string("image", image);
+            f->dump_string("snapshot", snap);
+            f->close_section();
+          }
+          f->dump_int("format", old_format ? 1 : 2);
+          f->dump_string("protected", is_protected ? "true" : "false");
+          f->close_section();
+        } else {
+          tbl << *i + "@" + s->name
+              << stringify(si_t(s->size))
+              << parent
+              << ((old_format) ? '1' : '2')
+              << (is_protected ? "yes" : "")
+              << ""                     // locks don't apply to snaps
+              << TextTable::endrow;
+        }
+      }
+    }
+  }
+  if (f) {
+    f->close_section();
+    f->flush(std::cout);
+  } else if (!names.empty()) {
+    std::cout << tbl;
+  }
+
+  return 0;
+}
+
+void get_arguments(po::options_description *positional,
+                   po::options_description *options) {
+  positional->add_options()
+    ("pool-name", "pool name");
+  options->add_options()
+    ("long,l", po::bool_switch(), "long listing format")
+    ("pool,p", po::value<std::string>(), "pool name");
+  at::add_format_options(options);
+}
+
+int execute(const po::variables_map &vm) {
+  std::string pool_name = utils::get_positional_argument(vm, 0);
+  if (pool_name.empty() && vm.count("pool")) {
+    pool_name = vm["pool"].as<std::string>();
+  }
+
+  if (pool_name.empty()) {
+    pool_name = at::DEFAULT_POOL_NAME;
+  }
+
+  at::Format::Formatter formatter;
+  int r = utils::get_formatter(vm, &formatter);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  r = utils::init(pool_name, &rados, &io_ctx);
+  if (r < 0) {
+    return r;
+  }
+
+  librbd::RBD rbd;
+  r = do_list(rbd, io_ctx, vm["long"].as<bool>(), formatter.get());
+  if (r < 0) {
+    std::cerr << "rbd: list: " << cpp_strerror(r) << std::endl;
+    return r;
+  }
+
+  return 0;
+}
+
+Shell::SwitchArguments switched_arguments({"long", "l"});
+Shell::Action action(
+  {"list"}, {"ls"}, "List rbd images.", "", &get_arguments, &execute);
+
+} // namespace list
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Lock.cc b/src/tools/rbd/action/Lock.cc
new file mode 100644
index 0000000..c39a4c5
--- /dev/null
+++ b/src/tools/rbd/action/Lock.cc
@@ -0,0 +1,266 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include "common/TextTable.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace lock {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+namespace {
+
+void add_id_option(po::options_description *positional) {
+  positional->add_options()
+    ("lock-id", "unique lock id");
+}
+
+int get_id(const po::variables_map &vm, std::string *id) {
+  *id = utils::get_positional_argument(vm, 1);
+  if (id->empty()) {
+    std::cerr << "rbd: lock id was not specified" << std::endl;
+    return -EINVAL;
+  }
+  return 0;
+}
+
+} // anonymous namespace
+
+static int do_lock_list(librbd::Image& image, Formatter *f)
+{
+  std::list<librbd::locker_t> lockers;
+  bool exclusive;
+  std::string tag;
+  TextTable tbl;
+  int r;
+
+  r = image.list_lockers(&lockers, &exclusive, &tag);
+  if (r < 0)
+    return r;
+
+  if (f) {
+    f->open_object_section("locks");
+  } else {
+    tbl.define_column("Locker", TextTable::LEFT, TextTable::LEFT);
+    tbl.define_column("ID", TextTable::LEFT, TextTable::LEFT);
+    tbl.define_column("Address", TextTable::LEFT, TextTable::LEFT);
+  }
+
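+  // illustrative plain-text output (locker, id, and address are made up):
+  //   There is 1 exclusive lock on this image.
+  //   Locker      ID     Address
+  //   client.4234 mylock 192.168.0.10:0/1234567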
+  if (lockers.size()) {
+    bool one = (lockers.size() == 1);
+
+    if (!f) {
+      std::cout << "There " << (one ? "is " : "are ") << lockers.size()
+           << (exclusive ? " exclusive" : " shared")
+           << " lock" << (one ? "" : "s") << " on this image.\n";
+      if (!exclusive)
+        std::cout << "Lock tag: " << tag << "\n";
+    }
+
+    for (std::list<librbd::locker_t>::const_iterator it = lockers.begin();
+         it != lockers.end(); ++it) {
+      if (f) {
+        f->open_object_section(it->cookie.c_str());
+        f->dump_string("locker", it->client);
+        f->dump_string("address", it->address);
+        f->close_section();
+      } else {
+        tbl << it->client << it->cookie << it->address << TextTable::endrow;
+      }
+    }
+    if (!f)
+      std::cout << tbl;
+  }
+
+  if (f) {
+    f->close_section();
+    f->flush(std::cout);
+  }
+  return 0;
+}
+
+static int do_lock_add(librbd::Image& image, const char *cookie,
+                       const char *tag)
+{
+  if (tag)
+    return image.lock_shared(cookie, tag);
+  else
+    return image.lock_exclusive(cookie);
+}
+
+static int do_lock_remove(librbd::Image& image, const char *client,
+                          const char *cookie)
+{
+  return image.break_lock(client, cookie);
+}
+
+void get_list_arguments(po::options_description *positional,
+                        po::options_description *options) {
+  at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+  at::add_format_options(options);
+}
+
+int execute_list(const po::variables_map &vm) {
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string image_name;
+  std::string snap_name;
+  int r = utils::get_pool_image_snapshot_names(
+    vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &image_name,
+    &snap_name, utils::SNAPSHOT_PRESENCE_NONE);
+  if (r < 0) {
+    return r;
+  }
+
+  at::Format::Formatter formatter;
+  r = utils::get_formatter(vm, &formatter);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  librbd::Image image;
+  r = utils::init_and_open_image(pool_name, image_name, "", true,
+                                 &rados, &io_ctx, &image);
+  if (r < 0) {
+    return r;
+  }
+
+  r = do_lock_list(image, formatter.get());
+  if (r < 0) {
+    std::cerr << "rbd: listing locks failed: " << cpp_strerror(r) << std::endl;
+    return r;
+  }
+  return 0;
+}
+
+void get_add_arguments(po::options_description *positional,
+                       po::options_description *options) {
+  at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+  add_id_option(positional);
+  options->add_options()
+    ("shared", po::value<std::string>(), "shared lock tag");
+}
+
+int execute_add(const po::variables_map &vm) {
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string image_name;
+  std::string snap_name;
+  int r = utils::get_pool_image_snapshot_names(
+    vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &image_name,
+    &snap_name, utils::SNAPSHOT_PRESENCE_NONE);
+  if (r < 0) {
+    return r;
+  }
+
+  std::string lock_cookie;
+  r = get_id(vm, &lock_cookie);
+  if (r < 0) {
+    return r;
+  }
+
+  std::string lock_tag;
+  if (vm.count("shared")) {
+    lock_tag = vm["shared"].as<std::string>();
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  librbd::Image image;
+  r = utils::init_and_open_image(pool_name, image_name, "", false,
+                                 &rados, &io_ctx, &image);
+  if (r < 0) {
+    return r;
+  }
+
+  r = do_lock_add(image, lock_cookie.c_str(),
+                  lock_tag.empty() ? nullptr : lock_tag.c_str());
+  if (r < 0) {
+    if (r == -EBUSY || r == -EEXIST) {
+      if (!lock_tag.empty()) {
+        std::cerr << "rbd: lock is alrady held by someone else"
+                  << " with a different tag" << std::endl;
+      } else {
+        std::cerr << "rbd: lock is already held by someone else" << std::endl;
+      }
+    } else {
+      std::cerr << "rbd: taking lock failed: " << cpp_strerror(r) << std::endl;
+    }
+    return r;
+  }
+  return 0;
+}
+
+void get_remove_arguments(po::options_description *positional,
+                          po::options_description *options) {
+  at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+  add_id_option(positional);
+  positional->add_options()
+    ("locker", "locker client");
+}
+
+int execute_remove(const po::variables_map &vm) {
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string image_name;
+  std::string snap_name;
+  int r = utils::get_pool_image_snapshot_names(
+    vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &image_name,
+    &snap_name, utils::SNAPSHOT_PRESENCE_NONE);
+  if (r < 0) {
+    return r;
+  }
+
+  std::string lock_cookie;
+  r = get_id(vm, &lock_cookie);
+  if (r < 0) {
+    return r;
+  }
+
+  std::string lock_client = utils::get_positional_argument(vm, 2);
+  if (lock_client.empty()) {
+    std::cerr << "rbd: locker was not specified" << std::endl;
+    return -EINVAL;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  librbd::Image image;
+  r = utils::init_and_open_image(pool_name, image_name, "", false,
+                                 &rados, &io_ctx, &image);
+  if (r < 0) {
+    return r;
+  }
+
+  r = do_lock_remove(image, lock_client.c_str(), lock_cookie.c_str());
+  if (r < 0) {
+    std::cerr << "rbd: releasing lock failed: " << cpp_strerror(r) << std::endl;
+    return r;
+  }
+  return 0;
+}
+
+Shell::Action action_list(
+  {"lock", "list"}, {"lock", "ls"}, "Show locks held on an image.", "",
+  &get_list_arguments, &execute_list);
+Shell::Action action_add(
+  {"lock", "add"}, {}, "Take a lock on an image.", "",
+  &get_add_arguments, &execute_add);
+Shell::Action action_remove(
+  {"lock", "remove"}, {"lock", "rm"}, "Release a lock on an image.", "",
+  &get_remove_arguments, &execute_remove);
+
+} // namespace lock
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/MergeDiff.cc b/src/tools/rbd/action/MergeDiff.cc
new file mode 100644
index 0000000..9e08a37
--- /dev/null
+++ b/src/tools/rbd/action/MergeDiff.cc
@@ -0,0 +1,436 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "common/safe_io.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+
+#define dout_subsys ceph_subsys_rbd
+
+namespace rbd {
+namespace action {
+namespace merge_diff {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+static int parse_diff_header(int fd, __u8 *tag, std::string *from,
+                             std::string *to, uint64_t *size)
+{
+  int r;
+
+  {//header
+    char buf[utils::RBD_DIFF_BANNER.size() + 1];
+    r = safe_read_exact(fd, buf, utils::RBD_DIFF_BANNER.size());
+    if (r < 0)
+      return r;
+
+    buf[utils::RBD_DIFF_BANNER.size()] = '\0';
+    if (strcmp(buf, utils::RBD_DIFF_BANNER.c_str())) {
+      std::cerr << "invalid banner '" << buf << "', expected '"
+                << utils::RBD_DIFF_BANNER << "'" << std::endl;
+      return -EINVAL;
+    }
+  }
+
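+  // metadata records follow the banner, one tag byte each: 'f' = from
+  // snap name, 't' = to snap name, 's' = 8-byte little-endian size
+  // (decoded below); the first other tag ends the header and is handed
+  // back to the caller via *tag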
+  while (true) {
+    r = safe_read_exact(fd, tag, 1);
+    if (r < 0)
+      return r;
+
+    if (*tag == 'f') {
+      r = utils::read_string(fd, 4096, from);   // 4k limit to make sure we don't get a garbage string
+      if (r < 0)
+        return r;
+      dout(2) << " from snap " << *from << dendl;
+    } else if (*tag == 't') {
+      r = utils::read_string(fd, 4096, to);   // 4k limit to make sure we don't get a garbage string
+      if (r < 0)
+        return r;
+      dout(2) << " to snap " << *to << dendl;
+    } else if (*tag == 's') {
+      char buf[8];
+      r = safe_read_exact(fd, buf, 8);
+      if (r < 0)
+        return r;
+
+      bufferlist bl;
+      bl.append(buf, 8);
+      bufferlist::iterator p = bl.begin();
+      ::decode(*size, p);
+    } else {
+      break;
+    }
+  }
+
+  return 0;
+}
+
+static int parse_diff_body(int fd, __u8 *tag, uint64_t *offset, uint64_t *length)
+{
+  int r;
+
+  if (!(*tag)) {
+    r = safe_read_exact(fd, tag, 1);
+    if (r < 0)
+      return r;
+  }
+
+  if (*tag == 'e') {
+    *offset = 0;
+    *length = 0;
+    return 0;
+  }
+
+  if (*tag != 'w' && *tag != 'z')
+    return -ENOTSUP;
+
+  char buf[16];
+  r = safe_read_exact(fd, buf, 16);
+  if (r < 0)
+    return r;
+
+  bufferlist bl;
+  bl.append(buf, 16);
+  bufferlist::iterator p = bl.begin();
+  ::decode(*offset, p);
+  ::decode(*length, p);
+
+  if (!(*length))
+    return -ENOTSUP;
+
+  return 0;
+}
+
+/*
+ * fd: the diff file to read from
+ * pd: the diff file to be written into
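+ *
+ * Each forwarded record is encoded as: tag byte, le64 offset, le64
+ * length, then 'length' bytes of payload for 'w' (data) records only;
+ * 'z' (zero) records carry no payload, and 'e' (end) records are
+ * swallowed by the early return below (inferred from the code).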
+ */
+static int accept_diff_body(int fd, int pd, __u8 tag, uint64_t offset, uint64_t length)
+{
+  if (tag == 'e')
+    return 0;
+
+  bufferlist bl;
+  ::encode(tag, bl);
+  ::encode(offset, bl);
+  ::encode(length, bl);
+  int r;
+  r = bl.write_fd(pd);
+  if (r < 0)
+    return r;
+
+  if (tag == 'w') {
+    bufferptr bp = buffer::create(length);
+    r = safe_read_exact(fd, bp.c_str(), length);
+    if (r < 0)
+      return r;
+    bufferlist data;
+    data.append(bp);
+    r = data.write_fd(pd);
+    if (r < 0)
+      return r;
+  }
+
+  return 0;
+}
+
+/*
+ * Merge two diff files into a single file.
+ * Note: the merge is not attempted if either of the
+ * source diff files is stripped, since that would
+ * complicate the process and is rarely needed.
+ */
+static int do_merge_diff(const char *first, const char *second,
+                         const char *path, bool no_progress)
+{
+  utils::ProgressContext pc("Merging image diff", no_progress);
+  int fd = -1, sd = -1, pd = -1, r;
+
+  std::string f_from, f_to;
+  std::string s_from, s_to;
+  uint64_t f_size, s_size, pc_size;
+
+  __u8 f_tag = 0, s_tag = 0;
+  uint64_t f_off = 0, f_len = 0;
+  uint64_t s_off = 0, s_len = 0;
+  bool f_end = false, s_end = false;
+
+  bool first_stdin = !strcmp(first, "-");
+  if (first_stdin) {
+    fd = 0;
+  } else {
+    fd = open(first, O_RDONLY);
+    if (fd < 0) {
+      r = -errno;
+      std::cerr << "rbd: error opening " << first << std::endl;
+      goto done;
+    }
+  }
+
+  sd = open(second, O_RDONLY);
+  if (sd < 0) {
+    r = -errno;
+    std::cerr << "rbd: error opening " << second << std::endl;
+    goto done;
+  }
+
+  if (strcmp(path, "-") == 0) {
+    pd = 1;
+  } else {
+    pd = open(path, O_WRONLY | O_CREAT | O_EXCL, 0644);
+    if (pd < 0) {
+      r = -errno;
+      std::cerr << "rbd: error create " << path << std::endl;
+      goto done;
+    }
+  }
+
+  // We only handle the layout 'banner, [f tag], [t tag], s tag,
+  // [w/z tag]*, e tag', and the (offset, length) pairs in the w/z
+  // records must be in ascending order.
+
+  r = parse_diff_header(fd, &f_tag, &f_from, &f_to, &f_size);
+  if (r < 0) {
+    std::cerr << "rbd: failed to parse first diff header" << std::endl;
+    goto done;
+  }
+
+  r = parse_diff_header(sd, &s_tag, &s_from, &s_to, &s_size);
+  if (r < 0) {
+    std::cerr << "rbd: failed to parse second diff header" << std::endl;
+    goto done;
+  }
+
+  if (f_to != s_from) {
+    r = -EINVAL;
+    std::cerr << "The first TO snapshot must be equal with the second FROM "
+              << "snapshot, aborting" << std::endl;
+    goto done;
+  }
+
+  {
+    // header
+    bufferlist bl;
+    bl.append(utils::RBD_DIFF_BANNER);
+
+    __u8 tag;
+    if (f_from.size()) {
+      tag = 'f';
+      ::encode(tag, bl);
+      ::encode(f_from, bl);
+    }
+
+    if (s_to.size()) {
+      tag = 't';
+      ::encode(tag, bl);
+      ::encode(s_to, bl);
+    }
+
+    tag = 's';
+    ::encode(tag, bl);
+    ::encode(s_size, bl);
+
+    r = bl.write_fd(pd);
+    if (r < 0) {
+      std::cerr << "rbd: failed to write merged diff header" << std::endl;
+      goto done;
+    }
+  }
+  if (f_size > s_size)
+    pc_size = f_size << 1;
+  else
+    pc_size = s_size << 1;
+
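+  // pc_size is twice the larger input size so that the combined progress
+  // f_off + s_off (each bounded by its own input's size) maps into
+  // [0, pc_size]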
+  //data block
+  while (!f_end || !s_end) {
+    // progress through input
+    pc.update_progress(f_off + s_off, pc_size);
+
+    if (!f_end && !f_len) {
+      uint64_t last_off = f_off;
+
+      r = parse_diff_body(fd, &f_tag, &f_off, &f_len);
+      dout(2) << "first diff data chunk: tag=" << f_tag << ", "
+              << "off=" << f_off << ", "
+              << "len=" << f_len << dendl;
+      if (r < 0) {
+        std::cerr << "rbd: failed to read first diff data chunk header"
+                  << std::endl;
+        goto done;
+      }
+
+      if (f_tag == 'e') {
+        f_end = true;
+        f_tag = 'z';
+        f_off = f_size;
+        if (f_size < s_size)
+          f_len = s_size - f_size;
+        else
+          f_len = 0;
+      }
+
+      if (last_off > f_off) {
+        r = -ENOTSUP;
+        std::cerr << "rbd: out-of-order offset from first diff ("
+             << last_off << " > " << f_off << ")" << std::endl;
+        goto done;
+      }
+    }
+
+    if (!s_end && !s_len) {
+      uint64_t last_off = s_off;
+
+      r = parse_diff_body(sd, &s_tag, &s_off, &s_len);
+      dout(2) << "second diff data chunk: tag=" << f_tag << ", "
+              << "off=" << f_off << ", "
+              << "len=" << f_len << dendl;
+      if (r < 0) {
+        std::cerr << "rbd: failed to read second diff data chunk header"
+                  << std::endl;
+        goto done;
+      }
+
+      if (s_tag == 'e') {
+        s_end = true;
+        s_off = s_size;
+        if (s_size < f_size)
+          s_len = f_size - s_size;
+        else
+          s_len = 0;
+      }
+
+      if (last_off > s_off) {
+        r = -ENOTSUP;
+        std::cerr << "rbd: out-of-order offset from second diff ("
+                  << last_off << " > " << s_off << ")" << std::endl;
+        goto done;
+      }
+    }
+
+    if (f_off < s_off && f_len) {
+      uint64_t delta = s_off - f_off;
+      if (delta > f_len)
+        delta = f_len;
+      r = accept_diff_body(fd, pd, f_tag, f_off, delta);
+      if (r < 0)
+        goto done;
+      f_off += delta;
+      f_len -= delta;
+
+      if (!f_len) {
+        f_tag = 0;
+        continue;
+      }
+    }
+    assert(f_off >= s_off);
+
+    if (f_off < s_off + s_len && f_len) {
+      uint64_t delta = s_off + s_len - f_off;
+      if (delta > f_len)
+        delta = f_len;
+      if (f_tag == 'w') {
+        if (first_stdin) {
+          bufferptr bp = buffer::create(delta);
+          r = safe_read_exact(fd, bp.c_str(), delta);
+        } else {
+          r = lseek(fd, delta, SEEK_CUR);
+        }
+        if (r < 0) {
+          std::cerr << "rbd: failed to skip first diff data" << std::endl;
+          goto done;
+        }
+      }
+      f_off += delta;
+      f_len -= delta;
+
+      if (!f_len) {
+        f_tag = 0;
+        continue;
+      }
+    }
+    assert(f_off >= s_off + s_len);
+    if (s_len) {
+      r = accept_diff_body(sd, pd, s_tag, s_off, s_len);
+      if (r < 0)
+        goto done;
+      s_off += s_len;
+      s_len = 0;
+      s_tag = 0;
+    } else
+      assert(f_end && s_end);
+    continue;
+  }
+
+  {//tail
+    __u8 tag = 'e';
+    bufferlist bl;
+    ::encode(tag, bl);
+    r = bl.write_fd(pd);
+  }
+
+done:
+  if (pd > 2)
+    close(pd);
+  if (sd > 2)
+    close(sd);
+  if (fd > 2)
+    close(fd);
+
+  if(r < 0) {
+    pc.fail();
+    if (pd > 2)
+      unlink(path);
+  } else
+    pc.finish();
+
+  return r;
+}
+
+void get_arguments(po::options_description *positional,
+                   po::options_description *options) {
+  positional->add_options()
+    ("diff1-path", "path to first diff (or '-' for stdin)")
+    ("diff2-path", "path to second diff");
+  at::add_path_options(positional, options,
+                       "path to merged diff (or '-' for stdout)");
+  at::add_no_progress_option(options);
+}
+
+int execute(const po::variables_map &vm) {
+  std::string first_diff = utils::get_positional_argument(vm, 0);
+  if (first_diff.empty()) {
+    std::cerr << "rbd: first diff was not specified" << std::endl;
+    return -EINVAL;
+  }
+
+  std::string second_diff = utils::get_positional_argument(vm, 1);
+  if (second_diff.empty()) {
+    std::cerr << "rbd: second diff was not specified" << std::endl;
+    return -EINVAL;
+  }
+
+  std::string path;
+  int r = utils::get_path(vm, utils::get_positional_argument(vm, 2),
+                          &path);
+  if (r < 0) {
+    return r;
+  }
+
+  r = do_merge_diff(first_diff.c_str(), second_diff.c_str(), path.c_str(),
+                    vm[at::NO_PROGRESS].as<bool>());
+  if (r < 0) {
+    cerr << "rbd: merge-diff error" << std::endl;
+    return -r;
+  }
+
+  return 0;
+}
+
+Shell::Action action(
+  {"merge-diff"}, {}, "Merge two diff exports together.", "",
+  &get_arguments, &execute);
+
+} // namespace merge_diff
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/ObjectMap.cc b/src/tools/rbd/action/ObjectMap.cc
new file mode 100644
index 0000000..b14bc72
--- /dev/null
+++ b/src/tools/rbd/action/ObjectMap.cc
@@ -0,0 +1,73 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "common/errno.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace object_map {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+static int do_object_map_rebuild(librbd::Image &image, bool no_progress)
+{
+  utils::ProgressContext pc("Object Map Rebuild", no_progress);
+  int r = image.rebuild_object_map(pc);
+  if (r < 0) {
+    pc.fail();
+    return r;
+  }
+  pc.finish();
+  return 0;
+}
+
+void get_arguments(po::options_description *positional,
+                   po::options_description *options) {
+  at::add_image_or_snap_spec_options(positional, options,
+                                     at::ARGUMENT_MODIFIER_NONE);
+  at::add_no_progress_option(options);
+}
+
+int execute(const po::variables_map &vm) {
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string image_name;
+  std::string snap_name;
+  int r = utils::get_pool_image_snapshot_names(
+    vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &image_name,
+    &snap_name, utils::SNAPSHOT_PRESENCE_PERMITTED);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  librbd::Image image;
+  r = utils::init_and_open_image(pool_name, image_name, snap_name, false,
+                                 &rados, &io_ctx, &image);
+  if (r < 0) {
+    return r;
+  }
+
+  r = do_object_map_rebuild(image, vm[at::NO_PROGRESS].as<bool>());
+  if (r < 0) {
+    std::cerr << "rbd: rebuilding object map failed: " << cpp_strerror(r)
+              << std::endl;
+    return r;
+  }
+  return 0;
+}
+
+Shell::Action action(
+  {"object-map", "rebuild"}, {}, "Rebuild an invalid object map.", "",
+  &get_arguments, &execute);
+
+} // namespace object_map
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Remove.cc b/src/tools/rbd/action/Remove.cc
new file mode 100644
index 0000000..6c2d2c3
--- /dev/null
+++ b/src/tools/rbd/action/Remove.cc
@@ -0,0 +1,84 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "common/errno.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace remove {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+static int do_delete(librbd::RBD &rbd, librados::IoCtx& io_ctx,
+                     const char *imgname, bool no_progress)
+{
+  utils::ProgressContext pc("Removing image", no_progress);
+  int r = rbd.remove_with_progress(io_ctx, imgname, pc);
+  if (r < 0) {
+    pc.fail();
+    return r;
+  }
+  pc.finish();
+  return 0;
+}
+
+void get_arguments(po::options_description *positional,
+                   po::options_description *options) {
+  at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+  at::add_no_progress_option(options);
+}
+
+int execute(const po::variables_map &vm) {
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string image_name;
+  std::string snap_name;
+  int r = utils::get_pool_image_snapshot_names(
+    vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &image_name,
+    &snap_name, utils::SNAPSHOT_PRESENCE_NONE);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  r = utils::init(pool_name, &rados, &io_ctx);
+  if (r < 0) {
+    return r;
+  }
+
+  librbd::RBD rbd;
+  r = do_delete(rbd, io_ctx, image_name.c_str(),
+                vm[at::NO_PROGRESS].as<bool>());
+  if (r < 0) {
+    if (r == -ENOTEMPTY) {
+      std::cerr << "rbd: image has snapshots - these must be deleted"
+                << " with 'rbd snap purge' before the image can be removed."
+                << std::endl;
+    } else if (r == -EBUSY) {
+      std::cerr << "rbd: error: image still has watchers"
+                << std::endl
+                << "This means the image is still open or the client using "
+                << "it crashed. Try again after closing/unmapping it or "
+                << "waiting 30s for the crashed client to timeout."
+                << std::endl;
+    } else {
+      std::cerr << "rbd: delete error: " << cpp_strerror(r) << std::endl;
+    }
+    return r;
+  }
+  return 0;
+}
+
+Shell::Action action(
+  {"remove"}, {"rm"}, "Delete an image.", "", &get_arguments, &execute);
+
+} // namespace remove
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Rename.cc b/src/tools/rbd/action/Rename.cc
new file mode 100644
index 0000000..c076111
--- /dev/null
+++ b/src/tools/rbd/action/Rename.cc
@@ -0,0 +1,84 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "common/errno.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace rename {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+static int do_rename(librbd::RBD &rbd, librados::IoCtx& io_ctx,
+                     const char *imgname, const char *destname)
+{
+  int r = rbd.rename(io_ctx, imgname, destname);
+  if (r < 0)
+    return r;
+  return 0;
+}
+
+void get_arguments(po::options_description *positional,
+                   po::options_description *options) {
+  at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_SOURCE);
+  at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_DEST);
+}
+
+int execute(const po::variables_map &vm) {
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string image_name;
+  std::string snap_name;
+  int r = utils::get_pool_image_snapshot_names(
+    vm, at::ARGUMENT_MODIFIER_SOURCE, &arg_index, &pool_name, &image_name,
+    &snap_name, utils::SNAPSHOT_PRESENCE_NONE);
+  if (r < 0) {
+    return r;
+  }
+
+  std::string dst_pool_name;
+  std::string dst_image_name;
+  std::string dst_snap_name;
+  r = utils::get_pool_image_snapshot_names(
+    vm, at::ARGUMENT_MODIFIER_DEST, &arg_index, &dst_pool_name, &dst_image_name,
+    &dst_snap_name, utils::SNAPSHOT_PRESENCE_NONE);
+  if (r < 0) {
+    return r;
+  }
+
+  if (pool_name != dst_pool_name) {
+    std::cerr << "rbd: mv/rename across pools not supported" << std::endl
+              << "source pool: " << pool_name<< " dest pool: " << dst_pool_name
+              << std::endl;
+    return -EINVAL;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  r = utils::init(pool_name, &rados, &io_ctx);
+  if (r < 0) {
+    return r;
+  }
+
+  librbd::RBD rbd;
+  r = do_rename(rbd, io_ctx, image_name.c_str(), dst_image_name.c_str());
+  if (r < 0) {
+    std::cerr << "rbd: rename error: " << cpp_strerror(r) << std::endl;
+    return r;
+  }
+  return 0;
+}
+
+Shell::Action action(
+  {"rename"}, {"mv"}, "Rename image within pool.", "", &get_arguments,
+  &execute);
+
+} // namespace rename
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Resize.cc b/src/tools/rbd/action/Resize.cc
new file mode 100644
index 0000000..aa7a390
--- /dev/null
+++ b/src/tools/rbd/action/Resize.cc
@@ -0,0 +1,94 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "common/errno.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace resize {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+static int do_resize(librbd::Image& image, uint64_t size, bool no_progress)
+{
+  utils::ProgressContext pc("Resizing image", no_progress);
+  int r = image.resize_with_progress(size, pc);
+  if (r < 0) {
+    pc.fail();
+    return r;
+  }
+  pc.finish();
+  return 0;
+}
+
+void get_arguments(po::options_description *positional,
+                   po::options_description *options) {
+  at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+  at::add_size_option(options);
+  options->add_options()
+    ("allow-shrink", po::bool_switch(), "permit shrinking");
+  at::add_no_progress_option(options);
+}
+
+int execute(const po::variables_map &vm) {
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string image_name;
+  std::string snap_name;
+  int r = utils::get_pool_image_snapshot_names(
+    vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &image_name,
+    &snap_name, utils::SNAPSHOT_PRESENCE_NONE);
+  if (r < 0) {
+    return r;
+  }
+
+  uint64_t size;
+  r = utils::get_image_size(vm, &size);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  librbd::Image image;
+  r = utils::init_and_open_image(pool_name, image_name, snap_name, false,
+                                 &rados, &io_ctx, &image);
+  if (r < 0) {
+    return r;
+  }
+
+  librbd::image_info_t info;
+  r = image.stat(info, sizeof(info));
+  if (r < 0) {
+    std::cerr << "rbd: resize error: " << cpp_strerror(r) << std::endl;
+    return r;
+  }
+
+  if (info.size > size && !vm["allow-shrink"].as<bool>()) {
+    std::cerr << "rbd: shrinking an image is only allowed with the "
+              << "--allow-shrink flag" << std::endl;
+    return -EINVAL;
+  }
+
+  r = do_resize(image, size, vm[at::NO_PROGRESS].as<bool>());
+  if (r < 0) {
+    std::cerr << "rbd: resize error: " << cpp_strerror(r) << std::endl;
+    return r;
+  }
+  return 0;
+}
+
+Shell::SwitchArguments switched_arguments({"allow-shrink"});
+Shell::Action action(
+  {"resize"}, {}, "Resize (expand or shrink) image.", "", &get_arguments,
+  &execute);
+
+} // namespace resize
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Snap.cc b/src/tools/rbd/action/Snap.cc
new file mode 100644
index 0000000..e20e878
--- /dev/null
+++ b/src/tools/rbd/action/Snap.cc
@@ -0,0 +1,495 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "include/types.h"
+#include "include/stringify.h"
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include "common/TextTable.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace snap {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+int do_list_snaps(librbd::Image& image, Formatter *f)
+{
+  std::vector<librbd::snap_info_t> snaps;
+  TextTable t;
+  int r;
+
+  r = image.snap_list(snaps);
+  if (r < 0)
+    return r;
+
+  if (f) {
+    f->open_array_section("snapshots");
+  } else {
+    t.define_column("SNAPID", TextTable::RIGHT, TextTable::RIGHT);
+    t.define_column("NAME", TextTable::LEFT, TextTable::LEFT);
+    t.define_column("SIZE", TextTable::RIGHT, TextTable::RIGHT);
+  }
+
+  for (std::vector<librbd::snap_info_t>::iterator s = snaps.begin();
+       s != snaps.end(); ++s) {
+    if (f) {
+      f->open_object_section("snapshot");
+      f->dump_unsigned("id", s->id);
+      f->dump_string("name", s->name);
+      f->dump_unsigned("size", s->size);
+      f->close_section();
+    } else {
+      t << s->id << s->name << stringify(prettybyte_t(s->size))
+        << TextTable::endrow;
+    }
+  }
+
+  if (f) {
+    f->close_section();
+    f->flush(std::cout);
+  } else if (snaps.size()) {
+    std::cout << t;
+  }
+
+  return 0;
+}
+
+int do_add_snap(librbd::Image& image, const char *snapname)
+{
+  int r = image.snap_create(snapname);
+  if (r < 0)
+    return r;
+
+  return 0;
+}
+
+int do_remove_snap(librbd::Image& image, const char *snapname)
+{
+  int r = image.snap_remove(snapname);
+  if (r < 0)
+    return r;
+
+  return 0;
+}
+
+int do_rollback_snap(librbd::Image& image, const char *snapname,
+                     bool no_progress)
+{
+  utils::ProgressContext pc("Rolling back to snapshot", no_progress);
+  int r = image.snap_rollback_with_progress(snapname, pc);
+  if (r < 0) {
+    pc.fail();
+    return r;
+  }
+  pc.finish();
+  return 0;
+}
+
+int do_purge_snaps(librbd::Image& image, bool no_progress)
+{
+  utils::ProgressContext pc("Removing all snapshots", no_progress);
+  std::vector<librbd::snap_info_t> snaps;
+  bool is_protected = false;
+  int r = image.snap_list(snaps);
+  if (r < 0) {
+    pc.fail();
+    return r;
+  } else if (0 == snaps.size()) {
+    return 0;
+  } else {
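+    // first pass: refuse to purge if any snapshot is protected, so the
+    // purge never removes a partial subset before failing with -EBUSY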
+    for (size_t i = 0; i < snaps.size(); ++i) {
+      r = image.snap_is_protected(snaps[i].name.c_str(), &is_protected);
+      if (r < 0) {
+        pc.fail();
+        return r;
+      } else if (is_protected) {
+        pc.fail();
+        std::cerr << "\r" << "rbd: snapshot '" << snaps[i].name.c_str()
+                  << "' is protected from removal." << std::endl;
+        return -EBUSY;
+      }
+    }
+    for (size_t i = 0; i < snaps.size(); ++i) {
+      r = image.snap_remove(snaps[i].name.c_str());
+      if (r < 0) {
+        pc.fail();
+        return r;
+      }
+      pc.update_progress(i + 1, snaps.size());
+    }
+
+    pc.finish();
+    return 0;
+  }
+}
+
+int do_protect_snap(librbd::Image& image, const char *snapname)
+{
+  int r = image.snap_protect(snapname);
+  if (r < 0)
+    return r;
+
+  return 0;
+}
+
+int do_unprotect_snap(librbd::Image& image, const char *snapname)
+{
+  int r = image.snap_unprotect(snapname);
+  if (r < 0)
+    return r;
+
+  return 0;
+}
+
+void get_list_arguments(po::options_description *positional,
+                        po::options_description *options) {
+  at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+  at::add_format_options(options);
+}
+
+int execute_list(const po::variables_map &vm) {
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string image_name;
+  std::string snap_name;
+  int r = utils::get_pool_image_snapshot_names(
+    vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &image_name,
+    &snap_name, utils::SNAPSHOT_PRESENCE_NONE);
+  if (r < 0) {
+    return r;
+  }
+
+  at::Format::Formatter formatter;
+  r = utils::get_formatter(vm, &formatter);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  librbd::Image image;
+  r = utils::init_and_open_image(pool_name, image_name, "", false, &rados,
+                                 &io_ctx, &image);
+  if (r < 0) {
+    return r;
+  }
+
+  r = do_list_snaps(image, formatter.get());
+  if (r < 0) {
+    cerr << "rbd: failed to list snapshots: " << cpp_strerror(r)
+         << std::endl;
+    return r;
+  }
+  return 0;
+}
+
+void get_create_arguments(po::options_description *positional,
+                          po::options_description *options) {
+  at::add_snap_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+}
+
+int execute_create(const po::variables_map &vm) {
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string image_name;
+  std::string snap_name;
+  int r = utils::get_pool_image_snapshot_names(
+    vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &image_name,
+    &snap_name, utils::SNAPSHOT_PRESENCE_REQUIRED);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  librbd::Image image;
+  r = utils::init_and_open_image(pool_name, image_name, "", false, &rados,
+                                 &io_ctx, &image);
+  if (r < 0) {
+    return r;
+  }
+
+  r = do_add_snap(image, snap_name.c_str());
+  if (r < 0) {
+    cerr << "rbd: failed to create snapshot: " << cpp_strerror(r)
+         << std::endl;
+    return r;
+  }
+  return 0;
+}
+
+void get_remove_arguments(po::options_description *positional,
+                          po::options_description *options) {
+  at::add_snap_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+}
+
+int execute_remove(const po::variables_map &vm) {
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string image_name;
+  std::string snap_name;
+  int r = utils::get_pool_image_snapshot_names(
+    vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &image_name,
+    &snap_name, utils::SNAPSHOT_PRESENCE_REQUIRED);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  librbd::Image image;
+  r = utils::init_and_open_image(pool_name, image_name, "", false, &rados,
+                                 &io_ctx, &image);
+  if (r < 0) {
+    return r;
+  }
+
+  r = do_remove_snap(image, snap_name.c_str());
+  if (r < 0) {
+    if (r == -EBUSY) {
+      std::cerr << "rbd: snapshot '" << snap_name << "' "
+                << "is protected from removal." << std::endl;
+    } else {
+      std::cerr << "rbd: failed to remove snapshot: " << cpp_strerror(r)
+                << std::endl;
+    }
+    return r;
+  }
+  return 0;
+}
+
+void get_purge_arguments(po::options_description *positional,
+                         po::options_description *options) {
+  at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+  at::add_no_progress_option(options);
+}
+
+int execute_purge(const po::variables_map &vm) {
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string image_name;
+  std::string snap_name;
+  int r = utils::get_pool_image_snapshot_names(
+    vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &image_name,
+    &snap_name, utils::SNAPSHOT_PRESENCE_NONE);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  librbd::Image image;
+  r = utils::init_and_open_image(pool_name, image_name, "", false, &rados,
+                                 &io_ctx, &image);
+  if (r < 0) {
+    return r;
+  }
+
+  r = do_purge_snaps(image, vm[at::NO_PROGRESS].as<bool>());
+  if (r < 0) {
+    if (r != -EBUSY) {
+      std::cerr << "rbd: removing snaps failed: " << cpp_strerror(r)
+                << std::endl;
+    }
+    return r;
+  }
+  return 0;
+}
+
+void get_rollback_arguments(po::options_description *positional,
+                            po::options_description *options) {
+  at::add_snap_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+  at::add_no_progress_option(options);
+}
+
+int execute_rollback(const po::variables_map &vm) {
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string image_name;
+  std::string snap_name;
+  int r = utils::get_pool_image_snapshot_names(
+    vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &image_name,
+    &snap_name, utils::SNAPSHOT_PRESENCE_REQUIRED);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  librbd::Image image;
+  r = utils::init_and_open_image(pool_name, image_name, "", false, &rados,
+                                 &io_ctx, &image);
+  if (r < 0) {
+    return r;
+  }
+
+  r = do_rollback_snap(image, snap_name.c_str(),
+                       vm[at::NO_PROGRESS].as<bool>());
+  if (r < 0) {
+    std::cerr << "rbd: rollback failed: " << cpp_strerror(r) << std::endl;
+    return r;
+  }
+  return 0;
+}
+
+void get_protect_arguments(po::options_description *positional,
+                           po::options_description *options) {
+  at::add_snap_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+}
+
+int execute_protect(const po::variables_map &vm) {
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string image_name;
+  std::string snap_name;
+  int r = utils::get_pool_image_snapshot_names(
+    vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &image_name,
+    &snap_name, utils::SNAPSHOT_PRESENCE_REQUIRED);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  librbd::Image image;
+  r = utils::init_and_open_image(pool_name, image_name, "", false, &rados,
+                                 &io_ctx, &image);
+  if (r < 0) {
+    return r;
+  }
+
+  r = do_protect_snap(image, snap_name.c_str());
+  if (r < 0) {
+    std::cerr << "rbd: protecting snap failed: " << cpp_strerror(r)
+              << std::endl;
+    return r;
+  }
+  return 0;
+}
+
+void get_unprotect_arguments(po::options_description *positional,
+                             po::options_description *options) {
+  at::add_snap_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+}
+
+int execute_unprotect(const po::variables_map &vm) {
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string image_name;
+  std::string snap_name;
+  int r = utils::get_pool_image_snapshot_names(
+    vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &image_name,
+    &snap_name, utils::SNAPSHOT_PRESENCE_REQUIRED);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  librbd::Image image;
+  r = utils::init_and_open_image(pool_name, image_name, "", false, &rados,
+                                 &io_ctx, &image);
+  if (r < 0) {
+    return r;
+  }
+
+  r = do_unprotect_snap(image, snap_name.c_str());
+  if (r < 0) {
+    std::cerr << "rbd: unprotecting snap failed: " << cpp_strerror(r)
+              << std::endl;
+    return r;
+  }
+  return 0;
+}
+
+void get_rename_arguments(po::options_description *positional,
+                          po::options_description *options) {
+  at::add_snap_spec_options(positional, options, at::ARGUMENT_MODIFIER_SOURCE);
+  at::add_snap_spec_options(positional, options, at::ARGUMENT_MODIFIER_DEST);
+}
+
+int execute_rename(const po::variables_map &vm) {
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string image_name;
+  std::string src_snap_name;
+  int r = utils::get_pool_image_snapshot_names(
+    vm, at::ARGUMENT_MODIFIER_SOURCE, &arg_index, &pool_name, &image_name,
+    &src_snap_name, utils::SNAPSHOT_PRESENCE_REQUIRED);
+  if (r < 0) {
+    return r;
+  }
+
+  std::string dest_pool_name(pool_name);
+  std::string dest_image_name;
+  std::string dest_snap_name;
+  r = utils::get_pool_image_snapshot_names(
+    vm, at::ARGUMENT_MODIFIER_DEST, &arg_index, &dest_pool_name,
+    &dest_image_name, &dest_snap_name, utils::SNAPSHOT_PRESENCE_REQUIRED);
+  if (r < 0) {
+    return r;
+  }
+
+  if (pool_name != dest_pool_name) {
+    std::cerr << "rbd: source and destination pool must be the same"
+              << std::endl;
+    return -EINVAL;
+  } else if (image_name != dest_image_name) {
+    std::cerr << "rbd: source and destination image name must be the same"
+              << std::endl;
+    return -EINVAL;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  librbd::Image image;
+  r = utils::init_and_open_image(pool_name, image_name, "", false, &rados,
+                                 &io_ctx, &image);
+  if (r < 0) {
+    return r;
+  }
+
+  r = image.snap_rename(src_snap_name.c_str(), dest_snap_name.c_str());
+  if (r < 0) {
+    std::cerr << "rbd: renaming snap failed: " << cpp_strerror(r)
+              << std::endl;
+    return r;
+  }
+  return 0;
+}
+
+Shell::Action action_list(
+  {"snap", "list"}, {"snap", "ls"}, "Dump list of image snapshots.", "",
+  &get_list_arguments, &execute_list);
+Shell::Action action_create(
+  {"snap", "create"}, {"snap", "add"}, "Create a snapshot.", "",
+  &get_create_arguments, &execute_create);
+Shell::Action action_remove(
+  {"snap", "remove"}, {"snap", "rm"}, "Deletes a snapshot.", "",
+  &get_remove_arguments, &execute_remove);
+Shell::Action action_purge(
+  {"snap", "purge"}, {}, "Deletes all snapshots.", "",
+  &get_purge_arguments, &execute_purge);
+Shell::Action action_rollback(
+  {"snap", "rollback"}, {"snap", "revert"}, "Rollback image to snapshot.", "",
+  &get_rollback_arguments, &execute_rollback);
+Shell::Action action_protect(
+  {"snap", "protect"}, {}, "Prevent a snapshot from being deleted.", "",
+  &get_protect_arguments, &execute_protect);
+Shell::Action action_unprotect(
+  {"snap", "unprotect"}, {}, "Allow a snapshot to be deleted.", "",
+  &get_unprotect_arguments, &execute_unprotect);
+Shell::Action action_rename(
+  {"snap", "rename"}, {}, "Rename a snapshot.", "",
+  &get_rename_arguments, &execute_rename);
+
+} // namespace snap
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Status.cc b/src/tools/rbd/action/Status.cc
new file mode 100644
index 0000000..da8fe97
--- /dev/null
+++ b/src/tools/rbd/action/Status.cc
@@ -0,0 +1,133 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "include/rbd_types.h"
+#include "common/errno.h"
+#include "common/Formatter.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace status {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+static int do_show_status(librados::IoCtx &io_ctx, librbd::Image &image,
+                          const char *imgname, Formatter *f)
+{
+  librbd::image_info_t info;
+  uint8_t old_format;
+  int r;
+  std::string header_oid;
+  std::list<obj_watch_t> watchers;
+
+  r = image.old_format(&old_format);
+  if (r < 0)
+    return r;
+
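+  // the header object name depends on the image format; illustrative,
+  // assuming the usual prefixes from rbd_types.h: old-format image "foo"
+  // -> "foo.rbd", new-format image with block_name_prefix "rbd_data.<id>"
+  // -> "rbd_header.<id>"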
+  if (old_format) {
+    header_oid = imgname;
+    header_oid += RBD_SUFFIX;
+  } else {
+    r = image.stat(info, sizeof(info));
+    if (r < 0)
+      return r;
+
+    char prefix[RBD_MAX_BLOCK_NAME_SIZE + 1];
+    strncpy(prefix, info.block_name_prefix, RBD_MAX_BLOCK_NAME_SIZE);
+    prefix[RBD_MAX_BLOCK_NAME_SIZE] = '\0';
+
+    header_oid = RBD_HEADER_PREFIX;
+    header_oid.append(prefix + strlen(RBD_DATA_PREFIX));
+  }
+
+  r = io_ctx.list_watchers(header_oid, &watchers);
+  if (r < 0)
+    return r;
+
+  if (f)
+    f->open_object_section("status");
+
+  if (f) {
+    f->open_object_section("watchers");
+    for (std::list<obj_watch_t>::iterator i = watchers.begin(); i != watchers.end(); ++i) {
+      f->open_object_section("watcher");
+      f->dump_string("address", i->addr);
+      f->dump_unsigned("client", i->watcher_id);
+      f->dump_unsigned("cookie", i->cookie);
+      f->close_section();
+    }
+    f->close_section();
+  } else {
+    if (watchers.size()) {
+      std::cout << "Watchers:" << std::endl;
+      for (std::list<obj_watch_t>::iterator i = watchers.begin();
+           i != watchers.end(); ++i) {
+        std::cout << "\twatcher=" << i->addr << " client." << i->watcher_id
+                  << " cookie=" << i->cookie << std::endl;
+      }
+    } else {
+      std::cout << "Watchers: none" << std::endl;
+    }
+  }
+  if (f) {
+    f->close_section();
+    f->flush(std::cout);
+  }
+
+  return 0;
+}
+
+void get_arguments(po::options_description *positional,
+                   po::options_description *options) {
+  at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+  at::add_format_options(options);
+}
+
+int execute(const po::variables_map &vm) {
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string image_name;
+  std::string snap_name;
+  int r = utils::get_pool_image_snapshot_names(
+    vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &image_name,
+    &snap_name, utils::SNAPSHOT_PRESENCE_NONE);
+  if (r < 0) {
+    return r;
+  }
+
+  at::Format::Formatter formatter;
+  r = utils::get_formatter(vm, &formatter);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  librbd::Image image;
+  r = utils::init_and_open_image(pool_name, image_name, "", true, &rados,
+                                 &io_ctx, &image);
+  if (r < 0) {
+    return r;
+  }
+
+  r = do_show_status(io_ctx, image, image_name.c_str(), formatter.get());
+  if (r < 0) {
+    std::cerr << "rbd: show status failed: " << cpp_strerror(r) << std::endl;
+    return r;
+  }
+  return 0;
+}
+
+Shell::Action action(
+  {"status"}, {}, "Show the status of this image.", "", &get_arguments,
+  &execute);
+
+} // namespace status
+} // namespace action
+} // namespace rbd
diff --git a/src/tools/rbd/action/Watch.cc b/src/tools/rbd/action/Watch.cc
new file mode 100644
index 0000000..3e53255
--- /dev/null
+++ b/src/tools/rbd/action/Watch.cc
@@ -0,0 +1,137 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/ArgumentTypes.h"
+#include "tools/rbd/Shell.h"
+#include "tools/rbd/Utils.h"
+#include "include/rbd_types.h"
+#include "common/errno.h"
+#include <iostream>
+#include <boost/program_options.hpp>
+
+namespace rbd {
+namespace action {
+namespace watch {
+
+namespace at = argument_types;
+namespace po = boost::program_options;
+
+class RbdWatchCtx : public librados::WatchCtx2 {
+public:
+  RbdWatchCtx(librados::IoCtx& io_ctx, const char *image_name,
+              std::string header_oid)
+    : m_io_ctx(io_ctx), m_image_name(image_name), m_header_oid(header_oid)
+  {
+  }
+
+  virtual ~RbdWatchCtx() {}
+
+  virtual void handle_notify(uint64_t notify_id,
+                             uint64_t cookie,
+                             uint64_t notifier_id,
+                             bufferlist& bl) {
+    std::cout << m_image_name << " received notification: notify_id="
+              << notify_id << ", cookie=" << cookie << ", notifier_id="
+              << notifier_id << ", bl.length=" << bl.length() << std::endl;
+    bufferlist reply;
+    m_io_ctx.notify_ack(m_header_oid, notify_id, cookie, reply);
+  }
+
+  virtual void handle_error(uint64_t cookie, int err) {
+    std::cerr << m_image_name << " received error: cookie=" << cookie << ", "
+              << "err=" << cpp_strerror(err) << std::endl;
+  }
+private:
+  librados::IoCtx m_io_ctx;
+  const char *m_image_name;
+  std::string m_header_oid;
+};
+
+static int do_watch(librados::IoCtx& pp, librbd::Image &image,
+                    const char *imgname)
+{
+  uint8_t old_format;
+  int r = image.old_format(&old_format);
+  if (r < 0) {
+    std::cerr << "failed to query format" << std::endl;
+    return r;
+  }
+
+  std::string header_oid;
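+  // Old-format (v1) images keep their header in "<name>.rbd"; new-format
+  // (v2) images use "rbd_header.<id>", recovering the id from the
+  // "rbd_data.<id>" block-name prefix.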
+  if (old_format != 0) {
+    header_oid = std::string(imgname) + RBD_SUFFIX;
+  } else {
+    librbd::image_info_t info;
+    r = image.stat(info, sizeof(info));
+    if (r < 0) {
+      std::cerr << "failed to stat image" << std::endl;
+      return r;
+    }
+
+    char prefix[RBD_MAX_BLOCK_NAME_SIZE + 1];
+    strncpy(prefix, info.block_name_prefix, RBD_MAX_BLOCK_NAME_SIZE);
+    prefix[RBD_MAX_BLOCK_NAME_SIZE] = '\0';
+
+    std::string image_id(prefix + strlen(RBD_DATA_PREFIX));
+    header_oid = RBD_HEADER_PREFIX + image_id;
+  }
+
+  uint64_t cookie;
+  RbdWatchCtx ctx(pp, imgname, header_oid);
+  r = pp.watch2(header_oid, &cookie, &ctx);
+  if (r < 0) {
+    std::cerr << "rbd: watch failed" << std::endl;
+    return r;
+  }
+
+  std::cout << "press enter to exit..." << std::endl;
+  getchar();
+
+  r = pp.unwatch2(cookie);
+  if (r < 0) {
+    std::cerr << "rbd: unwatch failed" << std::endl;
+    return r;
+  }
+  return 0;
+}
+
+void get_arguments(po::options_description *positional,
+                   po::options_description *options) {
+  at::add_image_spec_options(positional, options, at::ARGUMENT_MODIFIER_NONE);
+}
+
+int execute(const po::variables_map &vm) {
+  size_t arg_index = 0;
+  std::string pool_name;
+  std::string image_name;
+  std::string snap_name;
+  int r = utils::get_pool_image_snapshot_names(
+    vm, at::ARGUMENT_MODIFIER_NONE, &arg_index, &pool_name, &image_name,
+    &snap_name, utils::SNAPSHOT_PRESENCE_NONE);
+  if (r < 0) {
+    return r;
+  }
+
+  librados::Rados rados;
+  librados::IoCtx io_ctx;
+  librbd::Image image;
+  r = utils::init_and_open_image(pool_name, image_name, "", true, &rados,
+                                 &io_ctx, &image);
+  if (r < 0) {
+    return r;
+  }
+
+  r = do_watch(io_ctx, image, image_name.c_str());
+  if (r < 0) {
+    std::cerr << "rbd: watch failed: " << cpp_strerror(r) << std::endl;
+    return r;
+  }
+  return 0;
+}
+
+Shell::Action action(
+  {"watch"}, {}, "Watch events on image.", "", &get_arguments, &execute);
+
+} // namespace watch
+} // namespace action
+} // namespace rbd
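
The watcher registered above is driven by notifications against the header
object. A hedged sketch of the notify side, assuming the librados C++
API's IoCtx::notify2() (the same call path the rados_notify2 and new
rados_aio_notify tracepoints below instrument); the header object name is
a placeholder:

    // Wake every watcher of the given header object; each watcher's
    // handle_notify() fires with a fresh notify_id plus its own cookie.
    int notify_image_header(librados::IoCtx &io_ctx,
                            const std::string &header_oid) {
      librados::bufferlist bl, reply;   // empty payload; aggregated acks
      return io_ctx.notify2(header_oid, bl, 5000 /* timeout ms */, &reply);
    }
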
diff --git a/src/tools/rbd/rbd.cc b/src/tools/rbd/rbd.cc
new file mode 100644
index 0000000..a83db24
--- /dev/null
+++ b/src/tools/rbd/rbd.cc
@@ -0,0 +1,20 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "tools/rbd/Shell.h"
+#include "include/int_types.h"
+#include "common/ceph_argparse.h"
+#include "global/global_init.h"
+#include <vector>
+
+int main(int argc, const char **argv)
+{
+  std::vector<const char*> args;
+  argv_to_vec(argc, argv, args);
+  env_to_vec(args);
+
+  global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, 0);
+
+  rbd::Shell shell;
+  return shell.execute(argc, argv);
+}
diff --git a/src/tracing/librados.tp b/src/tracing/librados.tp
index 5bb6ee7..7171787 100644
--- a/src/tracing/librados.tp
+++ b/src/tracing/librados.tp
@@ -1091,6 +1091,24 @@ TRACEPOINT_EVENT(librados, rados_ioctx_pool_requires_alignment_exit,
     )
 )
 
+TRACEPOINT_EVENT(librados, rados_ioctx_pool_requires_alignment_enter2,
+    TP_ARGS(
+        rados_ioctx_t, ioctx),
+    TP_FIELDS(
+        ctf_integer_hex(rados_ioctx_t, ioctx, ioctx)
+    )
+)
+
+TRACEPOINT_EVENT(librados, rados_ioctx_pool_requires_alignment_exit2,
+    TP_ARGS(
+        int, retval,
+        int, requires),
+    TP_FIELDS(
+        ctf_integer(int, retval, retval)
+        ctf_integer(int, requires, requires)
+    )
+)
+
 TRACEPOINT_EVENT(librados, rados_ioctx_pool_required_alignment_enter,
     TP_ARGS(
         rados_ioctx_t, ioctx),
@@ -1107,6 +1125,24 @@ TRACEPOINT_EVENT(librados, rados_ioctx_pool_required_alignment_exit,
     )
 )
 
+TRACEPOINT_EVENT(librados, rados_ioctx_pool_required_alignment_enter2,
+    TP_ARGS(
+        rados_ioctx_t, ioctx),
+    TP_FIELDS(
+        ctf_integer_hex(rados_ioctx_t, ioctx, ioctx)
+    )
+)
+
+TRACEPOINT_EVENT(librados, rados_ioctx_pool_required_alignment_exit2,
+    TP_ARGS(
+        int, retval,
+        uint64_t, alignment),
+    TP_FIELDS(
+        ctf_integer(int, retval, retval)
+        ctf_integer(uint64_t, alignment, alignment)
+    )
+)
+
 TRACEPOINT_EVENT(librados, rados_ioctx_locator_set_key_enter,
     TP_ARGS(
         rados_ioctx_t, ioctx,
@@ -1591,7 +1627,7 @@ TRACEPOINT_EVENT(librados, rados_tmap_get_exit,
         int, len),
     TP_FIELDS(
         ctf_integer(int, retval, retval)
-        ctf_sequence(unsigned char, buf, buf, uint32_t, CEPH_MIN(len, CEPH_TRACE_BUF_TRUNC_LEN))
+        ctf_sequence(unsigned char, buf, buf, uint32_t, CEPH_MIN((size_t)len, CEPH_TRACE_BUF_TRUNC_LEN))
         ctf_integer(int, len, len)
     )
 )
@@ -2339,6 +2375,31 @@ TRACEPOINT_EVENT(librados, rados_notify2_exit,
     )
 )
 
+TRACEPOINT_EVENT(librados, rados_aio_notify_enter,
+    TP_ARGS(
+        rados_ioctx_t, ioctx,
+        const char*, oid,
+        rados_completion_t, completion,
+        const char*, buf,
+        int, buf_len,
+        uint64_t, timeout_ms),
+    TP_FIELDS(
+        ctf_integer_hex(rados_ioctx_t, ioctx, ioctx)
+        ctf_string(oid, oid)
+        ctf_integer_hex(rados_completion_t, completion, completion)
+        ceph_ctf_sequence(unsigned char, buf, buf, size_t, buf_len)
+        ctf_integer(uint64_t, timeout_ms, timeout_ms)
+    )
+)
+
+TRACEPOINT_EVENT(librados, rados_aio_notify_exit,
+    TP_ARGS(
+        int, retval),
+    TP_FIELDS(
+        ctf_integer(int, retval, retval)
+    )
+)
+
 TRACEPOINT_EVENT(librados, rados_notify_ack_enter,
     TP_ARGS(
         rados_ioctx_t, ioctx,
@@ -3337,3 +3398,39 @@ TRACEPOINT_EVENT(librados, rados_aio_read_op_operate_exit,
         ctf_integer(int, retval, retval)
     )
 )
+
+TRACEPOINT_EVENT(librados, rados_cache_pin_enter,
+    TP_ARGS(
+        rados_ioctx_t, io,
+        const char*, o),
+    TP_FIELDS(
+        ctf_integer_hex(rados_ioctx_t, io, io)
+        ceph_ctf_string(o, o)
+    )
+)
+
+TRACEPOINT_EVENT(librados, rados_cache_pin_exit,
+    TP_ARGS(
+        int, retval),
+    TP_FIELDS(
+        ctf_integer(int, retval, retval)
+    )
+)
+
+TRACEPOINT_EVENT(librados, rados_cache_unpin_enter,
+    TP_ARGS(
+        rados_ioctx_t, io,
+        const char*, o),
+    TP_FIELDS(
+        ctf_integer_hex(rados_ioctx_t, io, io)
+        ceph_ctf_string(o, o)
+    )
+)
+
+TRACEPOINT_EVENT(librados, rados_cache_unpin_exit,
+    TP_ARGS(
+        int, retval),
+    TP_FIELDS(
+        ctf_integer(int, retval, retval)
+    )
+)
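
The new rados_cache_pin/rados_cache_unpin events bracket the cache-tier
pinning operations added in this release. A hedged sketch, assuming the
matching librados C++ methods IoCtx::cache_pin()/cache_unpin() and a pool
fronted by a cache tier:

    // Pin an object so the cache tier will not flush or evict it,
    // then release the pin again.
    int pin_then_unpin(librados::IoCtx &io_ctx, const std::string &oid) {
      int r = io_ctx.cache_pin(oid);
      if (r < 0)
        return r;
      return io_ctx.cache_unpin(oid);
    }
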
diff --git a/src/tracing/librbd.tp b/src/tracing/librbd.tp
index f9101bd..f6e1cb1 100644
--- a/src/tracing/librbd.tp
+++ b/src/tracing/librbd.tp
@@ -305,6 +305,30 @@
     )
 )
 
+TRACEPOINT_EVENT(librbd, create4_enter,
+    TP_ARGS(
+        const char*, pool_name,
+        int64_t, id,
+        const char*, imgname,
+        uint64_t, size,
+        void*, opts),
+    TP_FIELDS(
+        ctf_string(pool_name, pool_name)
+        ctf_integer(int64_t, id, id)
+        ctf_string(imgname, imgname)
+        ctf_integer(uint64_t, size, size)
+        ctf_integer_hex(void*, opts, opts)
+    )
+)
+
+TRACEPOINT_EVENT(librbd, create4_exit,
+    TP_ARGS(
+        int, retval),
+    TP_FIELDS(
+        ctf_integer(int, retval, retval)
+    )
+)
+
 TRACEPOINT_EVENT(librbd, remove_enter,
     TP_ARGS(
         const char*, pool_name,
@@ -671,6 +694,36 @@ TRACEPOINT_EVENT(librbd, copy2_exit,
     )
 )
 
+TRACEPOINT_EVENT(librbd, copy3_enter,
+    TP_ARGS(
+        void*, src_imagectx,
+        const char*, src_name,
+        const char*, src_snap_name,
+        char, src_read_only,
+        const char*, dst_pool_name,
+        uint64_t, dst_id,
+        const char*, dst_name,
+        void*, opts),
+    TP_FIELDS(
+        ctf_integer_hex(void*, src_imagectx, src_imagectx)
+        ctf_string(src_name, src_name)
+        ctf_string(src_snap_name, src_snap_name)
+        ctf_integer(char, src_read_only, src_read_only)
+        ctf_string(dst_pool_name, dst_pool_name)
+        ctf_integer(uint64_t, dst_id, dst_id)
+        ctf_string(dst_name, dst_name)
+        ctf_integer_hex(void*, opts, opts)
+    )
+)
+
+TRACEPOINT_EVENT(librbd, copy3_exit,
+    TP_ARGS(
+        int, retval),
+    TP_FIELDS(
+        ctf_integer(int, retval, retval)
+    )
+)
+
 TRACEPOINT_EVENT(librbd, resize_enter,
     TP_ARGS(
         void*, imagectx,
@@ -874,6 +927,36 @@ TRACEPOINT_EVENT(librbd, clone2_exit,
     )
 )
 
+TRACEPOINT_EVENT(librbd, clone3_enter,
+    TP_ARGS(
+        const char*, parent_pool_name,
+        uint64_t, parent_id,
+        const char*, parent_name,
+        const char*, parent_snap_name,
+        const char*, child_pool_name,
+        uint64_t, child_id,
+        const char*, child_name,
+        void*, opts),
+    TP_FIELDS(
+        ctf_string(parent_pool_name, parent_pool_name)
+        ctf_integer(uint64_t, parent_id, parent_id)
+        ctf_string(parent_name, parent_name)
+        ctf_string(parent_snap_name, parent_snap_name)
+        ctf_string(child_pool_name, child_pool_name)
+        ctf_integer(uint64_t, child_id, child_id)
+        ctf_string(child_name, child_name)
+        ctf_integer_hex(void*, opts, opts)
+    )
+)
+
+TRACEPOINT_EVENT(librbd, clone3_exit,
+    TP_ARGS(
+        int, retval),
+    TP_FIELDS(
+        ctf_integer(int, retval, retval)
+    )
+)
+
 TRACEPOINT_EVENT(librbd, flatten_enter,
     TP_ARGS(
         void*, imagectx,
@@ -966,6 +1049,32 @@ TRACEPOINT_EVENT(librbd, snap_rollback_exit,
     )
 )
 
+TRACEPOINT_EVENT(librbd, snap_rename_enter,
+    TP_ARGS(
+        void*, imagectx,
+        const char*, name,
+        const char*, snap_name,
+        char, read_only,
+        const char*, src_snap_name,
+        const char*, dst_snap_name),
+    TP_FIELDS(
+        ctf_integer_hex(void*, imagectx, imagectx)
+        ctf_string(name, name)
+        ctf_string(snap_name, snap_name)
+        ctf_integer(char, read_only, read_only)
+        ctf_string(src_snap_name, src_snap_name)
+        ctf_string(dst_snap_name, dst_snap_name)
+    )
+)
+
+TRACEPOINT_EVENT(librbd, snap_rename_exit,
+    TP_ARGS(
+        int, retval),
+    TP_FIELDS(
+        ctf_integer(int, retval, retval)
+    )
+)
+
 TRACEPOINT_EVENT(librbd, snap_list_enter,
     TP_ARGS(
         void*, imagectx,
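
The create4/copy3/clone3 and snap_rename events instrument the new
ImageOptions-based and snapshot-rename entry points in librbd. A hedged
sketch of the create path, assuming the librbd C++ API's RBD::create4()
and the RBD_IMAGE_OPTION_* keys (pool, name, and size are illustrative):

    #include "include/rbd/librbd.hpp"

    // Create a format-2 image with 4 MiB objects via ImageOptions,
    // the call bracketed by create4_enter/create4_exit.
    int create_image(librados::IoCtx &io_ctx, const char *name,
                     uint64_t size) {
      librbd::RBD rbd;
      librbd::ImageOptions opts;
      opts.set(RBD_IMAGE_OPTION_FORMAT, uint64_t(2));
      opts.set(RBD_IMAGE_OPTION_ORDER, uint64_t(22));  // 2^22 = 4 MiB
      return rbd.create4(io_ctx, name, size, opts);
    }
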
diff --git a/src/tracing/osd.tp b/src/tracing/osd.tp
index ba0fd32..7a2ffd9 100644
--- a/src/tracing/osd.tp
+++ b/src/tracing/osd.tp
@@ -477,6 +477,26 @@ TRACEPOINT_EVENT(osd, do_osd_op_pre_watch,
     )
 )
 
+TRACEPOINT_EVENT(osd, do_osd_op_pre_cache_pin,
+    TP_ARGS(
+        const char*, oid,
+        uint64_t, snap),
+    TP_FIELDS(
+        ctf_string(oid, oid)
+        ctf_integer(uint64_t, snap, snap)
+    )
+)
+
+TRACEPOINT_EVENT(osd, do_osd_op_pre_cache_unpin,
+    TP_ARGS(
+        const char*, oid,
+        uint64_t, snap),
+    TP_FIELDS(
+        ctf_string(oid, oid)
+        ctf_integer(uint64_t, snap, snap)
+    )
+)
+
 TRACEPOINT_EVENT(osd, do_osd_op_pre_setxattr,
     TP_ARGS(
         const char*, oid,
@@ -640,12 +660,10 @@ TRACEPOINT_EVENT(osd, do_osd_op_pre_omap_cmp,
 TRACEPOINT_EVENT(osd, do_osd_op_pre_omapsetvals,
     TP_ARGS(
         const char*, oid,
-        uint64_t, snap,
-        const char*, keys),
+        uint64_t, snap),
     TP_FIELDS(
         ctf_string(oid, oid)
         ctf_integer(uint64_t, snap, snap)
-        ctf_string(keys, keys)
     )
 )
 
@@ -672,12 +690,10 @@ TRACEPOINT_EVENT(osd, do_osd_op_pre_omapclear,
 TRACEPOINT_EVENT(osd, do_osd_op_pre_omaprmkeys,
     TP_ARGS(
         const char*, oid,
-        uint64_t, snap,
-        const char*, keys),
+        uint64_t, snap),
     TP_FIELDS(
         ctf_string(oid, oid)
         ctf_integer(uint64_t, snap, snap)
-        ctf_string(keys, keys)
     )
 )
 
diff --git a/src/tracing/tracing-common.h b/src/tracing/tracing-common.h
index 2ab1b76..aed4122 100644
--- a/src/tracing/tracing-common.h
+++ b/src/tracing/tracing-common.h
@@ -4,7 +4,7 @@
 // Amount of buffer data to dump when using ceph_ctf_sequence or ceph_ctf_sequencep.
 // If 0, then *_data field is omitted entirely.
 #if !defined(CEPH_TRACE_BUF_TRUNC_LEN)
-#define CEPH_TRACE_BUF_TRUNC_LEN 0
+#define CEPH_TRACE_BUF_TRUNC_LEN 0u
 #endif
 
 // TODO: This is GCC-specific.  Replace CEPH_MAX and CEPH_MIN with standard macros, if possible.
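
With the unsigned default above, a build that wants payload bytes in its
traces can override the macro at compile time instead of editing the
header (the #if !defined guard allows this); the 0u literal keeps the
CEPH_MIN comparison against a (size_t) length free of signed/unsigned
warnings. A hypothetical override:

    // Capture up to 64 bytes of each traced buffer instead of none.
    #define CEPH_TRACE_BUF_TRUNC_LEN 64u
    #include "tracing-common.h"
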
diff --git a/src/vstart.sh b/src/vstart.sh
index 679d33a..7e32047 100755
--- a/src/vstart.sh
+++ b/src/vstart.sh
@@ -450,6 +450,8 @@ $CMDSDEBUG
         mds debug auth pins = true
         mds debug subtrees = true
         mds data = $CEPH_DEV_DIR/mds.\$id
+        mds root ino uid = `id -u`
+        mds root ino gid = `id -g`
 $extra_conf
 [osd]
 $DAEMONOPTS
@@ -724,7 +726,7 @@ do_rgw()
 
     # Create Swift user
     echo "setting up user tester"
-    $CEPH_BIN/radosgw-admin user create -c $conf_fn --subuser=test:tester --display-name=Tester-Subuser --key-type=swift --secret=testing > /dev/null
+    $CEPH_BIN/radosgw-admin user create -c $conf_fn --subuser=test:tester --display-name=Tester-Subuser --key-type=swift --secret=testing --access=full > /dev/null
 
     echo ""
     echo "S3 User Info:"
diff --git a/systemd/Makefile.am b/systemd/Makefile.am
index b7fde38..1a43415 100644
--- a/systemd/Makefile.am
+++ b/systemd/Makefile.am
@@ -5,7 +5,8 @@ unitfiles = \
 	ceph-create-keys@.service \
 	ceph-osd@.service \
 	ceph-radosgw@.service \
-	ceph-disk@.service
+	ceph-disk@.service \
+	rbdmap.service
 
 unitdir = $(systemd_unit_dir)
 
diff --git a/systemd/Makefile.in b/systemd/Makefile.in
index f678e1d..9d26136 100644
--- a/systemd/Makefile.in
+++ b/systemd/Makefile.in
@@ -167,6 +167,7 @@ AUTOMAKE = @AUTOMAKE@
 AWK = @AWK@
 BOOST_PROGRAM_OPTIONS_LIBS = @BOOST_PROGRAM_OPTIONS_LIBS@
 BOOST_RANDOM_LIBS = @BOOST_RANDOM_LIBS@
+BOOST_REGEX_LIBS = @BOOST_REGEX_LIBS@
 BOOST_THREAD_LIBS = @BOOST_THREAD_LIBS@
 CC = @CC@
 CCAS = @CCAS@
@@ -358,7 +359,8 @@ unitfiles = \
 	ceph-create-keys@.service \
 	ceph-osd@.service \
 	ceph-radosgw@.service \
-	ceph-disk@.service
+	ceph-disk@.service \
+	rbdmap.service
 
 unitdir = $(systemd_unit_dir)
 unit_DATA = $(unitfiles)
diff --git a/systemd/ceph b/systemd/ceph
index 1657779..6a69271 100644
--- a/systemd/ceph
+++ b/systemd/ceph
@@ -56,8 +56,8 @@ case $action in start | stop | status | enable | disable | mask | unmask | resta
     rc_check
 ;;
 *)
-    echo "Invalid paramter : $action"
-    echo "Valid paramters  : start | stop | status | enable | disable | mask | unmask | restart | is-active | is-failed | show | kill | reset-failed"
+    echo "Invalid parameter : $action"
+    echo "Valid parameters  : start | stop | status | enable | disable | mask | unmask | restart | is-active | is-failed | show | kill | reset-failed"
 ;;
 esac
 
diff --git a/systemd/rbdmap.service b/systemd/rbdmap.service
new file mode 100644
index 0000000..23d8fdb
--- /dev/null
+++ b/systemd/rbdmap.service
@@ -0,0 +1,14 @@
+[Unit]
+Description=Map RBD devices
+
+After=network-online.target local-fs.target
+Wants=network-online.target local-fs.target
+
+[Service]
+Type=oneshot
+RemainAfterExit=yes
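+# oneshot with RemainAfterExit keeps the unit counted as active after
+# rbdmap has run, so ExecStop can unmap the devices again at shutdown.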
+ExecStart=/usr/bin/rbdmap map
+ExecReload=/usr/bin/rbdmap map
+ExecStop=/usr/bin/rbdmap unmap
